diff --git "a/trainer_state.json" "b/trainer_state.json" --- "a/trainer_state.json" +++ "b/trainer_state.json" @@ -1,21 +1,22 @@ { + "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, - "global_step": 712, + "global_step": 1428, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { - "epoch": 0.0014044943820224719, - "grad_norm": 0.9453608393669128, - "learning_rate": 6.9444444444444435e-09, - "logits/chosen": -3.205078125, - "logits/rejected": -3.185546875, - "logps/chosen": -43.59375, - "logps/rejected": -42.640625, + "epoch": 0.0007002801120448179, + "grad_norm": 101.72705078125, + "learning_rate": 0.0, + "logits/chosen": -4.4921875, + "logits/rejected": -4.50390625, + "logps/chosen": -367.25, + "logps/rejected": -333.0, "loss": 0.6914, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, @@ -24,1082 +25,4292 @@ "step": 1 }, { - "epoch": 0.014044943820224719, - "grad_norm": 1.151463508605957, - "learning_rate": 6.944444444444444e-08, - "logits/chosen": -3.24609375, - "logits/rejected": -3.196831703186035, - "logps/chosen": -42.70138931274414, - "logps/rejected": -41.57638931274414, - "loss": 0.6923, - "rewards/accuracies": 0.2465277761220932, - "rewards/chosen": 9.75396906142123e-05, - "rewards/margins": -0.00013128916907589883, - "rewards/rejected": 0.00022856394934933633, + "epoch": 0.0035014005602240898, + "grad_norm": 33.1419792175293, + "learning_rate": 2.797202797202797e-08, + "logits/chosen": -4.5009765625, + "logits/rejected": -4.5283203125, + "logps/chosen": -337.3125, + "logps/rejected": -297.53125, + "loss": 0.6915, + "rewards/accuracies": 0.109375, + "rewards/chosen": -0.0002932548522949219, + "rewards/margins": 0.0015611648559570312, + "rewards/rejected": -0.0018558502197265625, + "step": 5 + }, + { + "epoch": 0.0070028011204481795, + "grad_norm": 76.48152923583984, + "learning_rate": 6.293706293706294e-08, + "logits/chosen": -4.491406440734863, + "logits/rejected": -4.510937690734863, + "logps/chosen": -303.79998779296875, + "logps/rejected": -276.4125061035156, + "loss": 0.692, + "rewards/accuracies": 0.16875000298023224, + "rewards/chosen": -0.01181716937571764, + "rewards/margins": 0.002262115478515625, + "rewards/rejected": -0.01406936626881361, "step": 10 }, { - "epoch": 0.028089887640449437, - "grad_norm": 0.5042410492897034, - "learning_rate": 1.3888888888888888e-07, - "logits/chosen": -3.255859375, - "logits/rejected": -3.21484375, - "logps/chosen": -42.06718826293945, - "logps/rejected": -40.62968826293945, - "loss": 0.6911, - "rewards/accuracies": 0.30937498807907104, - "rewards/chosen": 0.006805038545280695, - "rewards/margins": 0.0023165703751146793, - "rewards/rejected": 0.004488563630729914, + "epoch": 0.01050420168067227, + "grad_norm": 90.04021453857422, + "learning_rate": 9.79020979020979e-08, + "logits/chosen": -4.504687309265137, + "logits/rejected": -4.514843940734863, + "logps/chosen": -339.0249938964844, + "logps/rejected": -315.04998779296875, + "loss": 0.6924, + "rewards/accuracies": 0.125, + "rewards/chosen": -0.009069060906767845, + "rewards/margins": 0.00015182494826149195, + "rewards/rejected": -0.00923309288918972, + "step": 15 + }, + { + "epoch": 0.014005602240896359, + "grad_norm": 72.611328125, + "learning_rate": 1.3286713286713285e-07, + "logits/chosen": -4.508593559265137, + "logits/rejected": -4.518750190734863, + "logps/chosen": -296.70001220703125, + "logps/rejected": -268.76251220703125, + "loss": 0.6978, + "rewards/accuracies": 0.10000000149011612, + "rewards/chosen": -0.014070892706513405, + "rewards/margins": -0.010562133975327015, + "rewards/rejected": -0.00351715087890625, "step": 20 }, { - "epoch": 0.042134831460674156, - "grad_norm": 0.6343653202056885, - "learning_rate": 2.0833333333333333e-07, - "logits/chosen": -3.262500047683716, - "logits/rejected": -3.232617139816284, - "logps/chosen": -41.40625, - "logps/rejected": -40.610939025878906, - "loss": 0.6869, - "rewards/accuracies": 0.4468750059604645, - "rewards/chosen": 0.027604103088378906, - "rewards/margins": 0.012437248602509499, - "rewards/rejected": 0.015170956030488014, + "epoch": 0.01750700280112045, + "grad_norm": 70.4524154663086, + "learning_rate": 1.6783216783216782e-07, + "logits/chosen": -4.490624904632568, + "logits/rejected": -4.521874904632568, + "logps/chosen": -312.20001220703125, + "logps/rejected": -287.0, + "loss": 0.6873, + "rewards/accuracies": 0.20624999701976776, + "rewards/chosen": -0.008679961785674095, + "rewards/margins": 0.011355971917510033, + "rewards/rejected": -0.020025253295898438, + "step": 25 + }, + { + "epoch": 0.02100840336134454, + "grad_norm": 181.83139038085938, + "learning_rate": 2.0279720279720277e-07, + "logits/chosen": -4.515625, + "logits/rejected": -4.525000095367432, + "logps/chosen": -310.3500061035156, + "logps/rejected": -285.75, + "loss": 0.6951, + "rewards/accuracies": 0.26249998807907104, + "rewards/chosen": -0.05675353854894638, + "rewards/margins": -0.0030269622802734375, + "rewards/rejected": -0.05369110032916069, "step": 30 }, { - "epoch": 0.056179775280898875, - "grad_norm": 0.7174843549728394, - "learning_rate": 2.7777777777777776e-07, - "logits/chosen": -3.279296875, - "logits/rejected": -3.2447266578674316, - "logps/chosen": -40.29218673706055, - "logps/rejected": -39.842185974121094, - "loss": 0.68, - "rewards/accuracies": 0.5078125, - "rewards/chosen": 0.03814506530761719, - "rewards/margins": 0.027390670031309128, - "rewards/rejected": 0.010743332095444202, + "epoch": 0.024509803921568627, + "grad_norm": 40.653568267822266, + "learning_rate": 2.3776223776223774e-07, + "logits/chosen": -4.502343654632568, + "logits/rejected": -4.51953125, + "logps/chosen": -326.46875, + "logps/rejected": -295.5625, + "loss": 0.6799, + "rewards/accuracies": 0.3687500059604645, + "rewards/chosen": -0.1183624267578125, + "rewards/margins": 0.030280303210020065, + "rewards/rejected": -0.14861373603343964, + "step": 35 + }, + { + "epoch": 0.028011204481792718, + "grad_norm": 46.50446319580078, + "learning_rate": 2.727272727272727e-07, + "logits/chosen": -4.517968654632568, + "logits/rejected": -4.546875, + "logps/chosen": -309.45001220703125, + "logps/rejected": -290.3125, + "loss": 0.6777, + "rewards/accuracies": 0.36250001192092896, + "rewards/chosen": -0.12889710068702698, + "rewards/margins": 0.03403778001666069, + "rewards/rejected": -0.16286087036132812, "step": 40 }, { - "epoch": 0.0702247191011236, - "grad_norm": 0.5838291645050049, - "learning_rate": 3.472222222222222e-07, - "logits/chosen": -3.2763671875, - "logits/rejected": -3.230664014816284, - "logps/chosen": -40.84687423706055, - "logps/rejected": -40.31562423706055, - "loss": 0.6619, - "rewards/accuracies": 0.609375, - "rewards/chosen": 0.041875459253787994, - "rewards/margins": 0.06587791442871094, - "rewards/rejected": -0.023966407403349876, + "epoch": 0.031512605042016806, + "grad_norm": 14.87103271484375, + "learning_rate": 3.076923076923077e-07, + "logits/chosen": -4.514062404632568, + "logits/rejected": -4.534375190734863, + "logps/chosen": -309.04998779296875, + "logps/rejected": -279.04998779296875, + "loss": 0.6592, + "rewards/accuracies": 0.53125, + "rewards/chosen": -0.171966552734375, + "rewards/margins": 0.07354736328125, + "rewards/rejected": -0.24545899033546448, + "step": 45 + }, + { + "epoch": 0.0350140056022409, + "grad_norm": 47.90538787841797, + "learning_rate": 3.4265734265734264e-07, + "logits/chosen": -4.51953125, + "logits/rejected": -4.529687404632568, + "logps/chosen": -346.1499938964844, + "logps/rejected": -314.5249938964844, + "loss": 0.6184, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.2255096435546875, + "rewards/margins": 0.16802291572093964, + "rewards/rejected": -0.3935233950614929, "step": 50 }, { - "epoch": 0.08426966292134831, - "grad_norm": 0.8917225003242493, - "learning_rate": 4.1666666666666667e-07, - "logits/chosen": -3.2505860328674316, - "logits/rejected": -3.1898436546325684, - "logps/chosen": -41.443748474121094, - "logps/rejected": -43.939064025878906, - "loss": 0.6155, - "rewards/accuracies": 0.6890624761581421, - "rewards/chosen": 0.011277198791503906, - "rewards/margins": 0.17209243774414062, - "rewards/rejected": -0.1607826203107834, + "epoch": 0.03851540616246499, + "grad_norm": 57.09840393066406, + "learning_rate": 3.776223776223776e-07, + "logits/chosen": -4.512499809265137, + "logits/rejected": -4.538281440734863, + "logps/chosen": -336.625, + "logps/rejected": -307.9375, + "loss": 0.6265, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.2501220703125, + "rewards/margins": 0.1536407470703125, + "rewards/rejected": -0.40345460176467896, + "step": 55 + }, + { + "epoch": 0.04201680672268908, + "grad_norm": 54.38421630859375, + "learning_rate": 4.125874125874126e-07, + "logits/chosen": -4.521093845367432, + "logits/rejected": -4.532812595367432, + "logps/chosen": -333.17498779296875, + "logps/rejected": -307.3374938964844, + "loss": 0.5915, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.29718321561813354, + "rewards/margins": 0.2501785159111023, + "rewards/rejected": -0.5469970703125, "step": 60 }, { - "epoch": 0.09831460674157304, - "grad_norm": 1.416306734085083, - "learning_rate": 4.861111111111111e-07, - "logits/chosen": -3.2109375, - "logits/rejected": -3.1263670921325684, - "logps/chosen": -46.9375, - "logps/rejected": -53.76874923706055, - "loss": 0.5336, - "rewards/accuracies": 0.7015625238418579, - "rewards/chosen": -0.21380920708179474, - "rewards/margins": 0.407858282327652, - "rewards/rejected": -0.6218963861465454, + "epoch": 0.04551820728291316, + "grad_norm": 46.930301666259766, + "learning_rate": 4.4755244755244753e-07, + "logits/chosen": -4.560937404632568, + "logits/rejected": -4.564843654632568, + "logps/chosen": -313.95001220703125, + "logps/rejected": -288.82501220703125, + "loss": 0.5812, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.456216424703598, + "rewards/margins": 0.2818649411201477, + "rewards/rejected": -0.738232433795929, + "step": 65 + }, + { + "epoch": 0.049019607843137254, + "grad_norm": 16.46210289001465, + "learning_rate": 4.825174825174824e-07, + "logits/chosen": -4.526562690734863, + "logits/rejected": -4.547656059265137, + "logps/chosen": -353.45001220703125, + "logps/rejected": -322.0249938964844, + "loss": 0.5431, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6093078851699829, + "rewards/margins": 0.3897949159145355, + "rewards/rejected": -0.998486340045929, "step": 70 }, { - "epoch": 0.11235955056179775, - "grad_norm": 0.6724388599395752, - "learning_rate": 4.998072590601808e-07, - "logits/chosen": -3.2593750953674316, - "logits/rejected": -3.081835985183716, - "logps/chosen": -44.157814025878906, - "logps/rejected": -63.279685974121094, - "loss": 0.405, - "rewards/accuracies": 0.7203124761581421, - "rewards/chosen": -0.117925263941288, - "rewards/margins": 1.0123169422149658, - "rewards/rejected": -1.1299316883087158, + "epoch": 0.052521008403361345, + "grad_norm": 83.04356384277344, + "learning_rate": 5.174825174825175e-07, + "logits/chosen": -4.528124809265137, + "logits/rejected": -4.564843654632568, + "logps/chosen": -292.9937438964844, + "logps/rejected": -269.10626220703125, + "loss": 0.551, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -0.2913818359375, + "rewards/margins": 0.39362794160842896, + "rewards/rejected": -0.6848999261856079, + "step": 75 + }, + { + "epoch": 0.056022408963585436, + "grad_norm": 123.90628051757812, + "learning_rate": 5.524475524475523e-07, + "logits/chosen": -4.520312309265137, + "logits/rejected": -4.55078125, + "logps/chosen": -322.54998779296875, + "logps/rejected": -302.98748779296875, + "loss": 0.4953, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -0.559588611125946, + "rewards/margins": 0.5720565915107727, + "rewards/rejected": -1.1309814453125, "step": 80 }, { - "epoch": 0.12640449438202248, - "grad_norm": 0.8954005837440491, - "learning_rate": 4.990247583129217e-07, - "logits/chosen": -3.230273485183716, - "logits/rejected": -2.9839844703674316, - "logps/chosen": -45.54218673706055, - "logps/rejected": -78.46562194824219, - "loss": 0.3109, - "rewards/accuracies": 0.7515624761581421, - "rewards/chosen": -0.15528163313865662, - "rewards/margins": 1.7075684070587158, - "rewards/rejected": -1.863037109375, + "epoch": 0.05952380952380952, + "grad_norm": 26.677682876586914, + "learning_rate": 5.874125874125873e-07, + "logits/chosen": -4.529687404632568, + "logits/rejected": -4.536718845367432, + "logps/chosen": -340.875, + "logps/rejected": -318.42498779296875, + "loss": 0.4645, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.4416137635707855, + "rewards/margins": 0.73309326171875, + "rewards/rejected": -1.17474365234375, + "step": 85 + }, + { + "epoch": 0.06302521008403361, + "grad_norm": 34.868038177490234, + "learning_rate": 6.223776223776223e-07, + "logits/chosen": -4.514062404632568, + "logits/rejected": -4.541406154632568, + "logps/chosen": -324.17498779296875, + "logps/rejected": -306.07501220703125, + "loss": 0.4807, + "rewards/accuracies": 0.856249988079071, + "rewards/chosen": -0.4429565370082855, + "rewards/margins": 0.7114837765693665, + "rewards/rejected": -1.1531493663787842, "step": 90 }, { - "epoch": 0.1404494382022472, - "grad_norm": 0.5155877470970154, - "learning_rate": 4.976423351108942e-07, - "logits/chosen": -3.2529296875, - "logits/rejected": NaN, - "logps/chosen": -48.826560974121094, - "logps/rejected": -96.4312515258789, - "loss": 0.2579, - "rewards/accuracies": 0.7562500238418579, - "rewards/chosen": -0.3163391053676605, - "rewards/margins": 2.467578172683716, - "rewards/rejected": -2.783496141433716, + "epoch": 0.0665266106442577, + "grad_norm": 111.43302917480469, + "learning_rate": 6.573426573426572e-07, + "logits/chosen": -4.5234375, + "logits/rejected": -4.536718845367432, + "logps/chosen": -304.7749938964844, + "logps/rejected": -286.75, + "loss": 0.4656, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.11838226020336151, + "rewards/margins": 0.7753235101699829, + "rewards/rejected": -0.893786609172821, + "step": 95 + }, + { + "epoch": 0.0700280112044818, + "grad_norm": 7.1454339027404785, + "learning_rate": 6.923076923076922e-07, + "logits/chosen": -4.491406440734863, + "logits/rejected": -4.510937690734863, + "logps/chosen": -318.8500061035156, + "logps/rejected": -301.1000061035156, + "loss": 0.4015, + "rewards/accuracies": 0.8687499761581421, + "rewards/chosen": -0.0882415771484375, + "rewards/margins": 1.081781029701233, + "rewards/rejected": -1.1699950695037842, "step": 100 }, { - "epoch": 0.1544943820224719, - "grad_norm": 0.9068161249160767, - "learning_rate": 4.95663319832678e-07, - "logits/chosen": -3.236328125, - "logits/rejected": -2.843554735183716, - "logps/chosen": -45.657814025878906, - "logps/rejected": -107.41874694824219, - "loss": 0.2278, - "rewards/accuracies": 0.753125011920929, - "rewards/chosen": -0.1908990889787674, - "rewards/margins": 3.155956983566284, - "rewards/rejected": -3.34814453125, + "epoch": 0.07352941176470588, + "grad_norm": 56.80259323120117, + "learning_rate": 7.272727272727272e-07, + "logits/chosen": -4.460156440734863, + "logits/rejected": -4.479687690734863, + "logps/chosen": -303.9375, + "logps/rejected": -283.2124938964844, + "loss": 0.445, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": 0.15115967392921448, + "rewards/margins": 1.0308959484100342, + "rewards/rejected": -0.881054699420929, + "step": 105 + }, + { + "epoch": 0.07703081232492998, + "grad_norm": 210.96456909179688, + "learning_rate": 7.622377622377621e-07, + "logits/chosen": -4.453906059265137, + "logits/rejected": -4.459374904632568, + "logps/chosen": -301.26251220703125, + "logps/rejected": -282.79376220703125, + "loss": 0.4522, + "rewards/accuracies": 0.84375, + "rewards/chosen": 0.21079406142234802, + "rewards/margins": 1.071386694908142, + "rewards/rejected": -0.86181640625, "step": 110 }, { - "epoch": 0.16853932584269662, - "grad_norm": 0.5081749558448792, - "learning_rate": 4.930924800994191e-07, - "logits/chosen": -3.252734422683716, - "logits/rejected": -2.847460985183716, - "logps/chosen": -46.87812423706055, - "logps/rejected": -118.3499984741211, - "loss": 0.2113, - "rewards/accuracies": 0.776562511920929, - "rewards/chosen": -0.22010573744773865, - "rewards/margins": 3.623046875, - "rewards/rejected": -3.8433594703674316, + "epoch": 0.08053221288515407, + "grad_norm": 11.06592845916748, + "learning_rate": 7.972027972027971e-07, + "logits/chosen": -4.440625190734863, + "logits/rejected": -4.447656154632568, + "logps/chosen": -330.0249938964844, + "logps/rejected": -319.625, + "loss": 0.3711, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": 0.9296020269393921, + "rewards/margins": 1.491845726966858, + "rewards/rejected": -0.562487781047821, + "step": 115 + }, + { + "epoch": 0.08403361344537816, + "grad_norm": 105.50016021728516, + "learning_rate": 8.321678321678321e-07, + "logits/chosen": -4.4296875, + "logits/rejected": -4.431250095367432, + "logps/chosen": -315.7875061035156, + "logps/rejected": -306.2562561035156, + "loss": 0.3411, + "rewards/accuracies": 0.918749988079071, + "rewards/chosen": 0.582104504108429, + "rewards/margins": 1.7448608875274658, + "rewards/rejected": -1.1632782220840454, "step": 120 }, { - "epoch": 0.18258426966292135, - "grad_norm": 0.6637352108955383, - "learning_rate": 4.899360092892144e-07, - "logits/chosen": -3.2525391578674316, - "logits/rejected": -2.763671875, - "logps/chosen": -47.01093673706055, - "logps/rejected": -124.7437515258789, - "loss": 0.2106, - "rewards/accuracies": 0.7671874761581421, - "rewards/chosen": -0.23588410019874573, - "rewards/margins": 3.9537110328674316, - "rewards/rejected": -4.188086032867432, + "epoch": 0.08753501400560224, + "grad_norm": 168.21710205078125, + "learning_rate": 8.67132867132867e-07, + "logits/chosen": -4.440625190734863, + "logits/rejected": -4.448437690734863, + "logps/chosen": -295.2749938964844, + "logps/rejected": -295.04998779296875, + "loss": 0.406, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.5245513916015625, + "rewards/margins": 1.5556640625, + "rewards/rejected": -1.031103491783142, + "step": 125 + }, + { + "epoch": 0.09103641456582633, + "grad_norm": 10.8776273727417, + "learning_rate": 9.02097902097902e-07, + "logits/chosen": -4.44140625, + "logits/rejected": -4.44140625, + "logps/chosen": -325.4624938964844, + "logps/rejected": -317.51251220703125, + "loss": 0.4175, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -0.46117860078811646, + "rewards/margins": 1.69061279296875, + "rewards/rejected": -2.151049852371216, "step": 130 }, { - "epoch": 0.19662921348314608, - "grad_norm": 0.3586008548736572, - "learning_rate": 4.862015116167195e-07, - "logits/chosen": -3.271484375, - "logits/rejected": -2.750781297683716, - "logps/chosen": -42.41093826293945, - "logps/rejected": -132.1125030517578, - "loss": 0.182, - "rewards/accuracies": 0.785937488079071, - "rewards/chosen": 0.008197021670639515, - "rewards/margins": 4.537890434265137, - "rewards/rejected": -4.529687404632568, + "epoch": 0.09453781512605042, + "grad_norm": 7.193403720855713, + "learning_rate": 9.37062937062937e-07, + "logits/chosen": -4.432031154632568, + "logits/rejected": -4.435937404632568, + "logps/chosen": -355.67498779296875, + "logps/rejected": -344.32501220703125, + "loss": 0.306, + "rewards/accuracies": 0.9312499761581421, + "rewards/chosen": -2.3448729515075684, + "rewards/margins": 1.7757079601287842, + "rewards/rejected": -4.119833469390869, + "step": 135 + }, + { + "epoch": 0.09803921568627451, + "grad_norm": 9.44430160522461, + "learning_rate": 9.72027972027972e-07, + "logits/chosen": -4.416406154632568, + "logits/rejected": -4.400781154632568, + "logps/chosen": -320.125, + "logps/rejected": -314.9750061035156, + "loss": 0.3724, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -1.6501343250274658, + "rewards/margins": 1.635766625404358, + "rewards/rejected": -3.285888671875, "step": 140 }, { - "epoch": 0.21067415730337077, - "grad_norm": 0.7338384985923767, - "learning_rate": 4.81897983813931e-07, - "logits/chosen": -3.2681641578674316, - "logits/rejected": NaN, - "logps/chosen": -39.69062423706055, - "logps/rejected": -129.3562469482422, - "loss": 0.1964, - "rewards/accuracies": 0.7593749761581421, - "rewards/chosen": 0.10731048882007599, - "rewards/margins": 4.583203315734863, - "rewards/rejected": -4.477246284484863, + "epoch": 0.1015406162464986, + "grad_norm": 104.80125427246094, + "learning_rate": 9.999985057155316e-07, + "logits/chosen": -4.379687309265137, + "logits/rejected": -4.364843845367432, + "logps/chosen": -315.95001220703125, + "logps/rejected": -321.0249938964844, + "loss": 0.2703, + "rewards/accuracies": 0.9312499761581421, + "rewards/chosen": -0.275909423828125, + "rewards/margins": 2.2874755859375, + "rewards/rejected": -2.5631346702575684, + "step": 145 + }, + { + "epoch": 0.10504201680672269, + "grad_norm": 8.149246215820312, + "learning_rate": 9.999462066969451e-07, + "logits/chosen": -4.30859375, + "logits/rejected": -4.300000190734863, + "logps/chosen": -317.2875061035156, + "logps/rejected": -309.7250061035156, + "loss": 0.2573, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": -0.4537109434604645, + "rewards/margins": 2.4706053733825684, + "rewards/rejected": -2.9227538108825684, "step": 150 }, { - "epoch": 0.2247191011235955, - "grad_norm": 0.7118776440620422, - "learning_rate": 4.770357934562704e-07, - "logits/chosen": -3.279296875, - "logits/rejected": -2.6996092796325684, - "logps/chosen": -44.157814025878906, - "logps/rejected": -138.63125610351562, - "loss": 0.1941, - "rewards/accuracies": 0.7828124761581421, - "rewards/chosen": -0.10118408501148224, - "rewards/margins": 4.762890815734863, - "rewards/rejected": -4.8642578125, + "epoch": 0.10854341736694678, + "grad_norm": 10.2169828414917, + "learning_rate": 9.998192023862448e-07, + "logits/chosen": -4.253125190734863, + "logits/rejected": -4.235156059265137, + "logps/chosen": -298.79998779296875, + "logps/rejected": -311.4375, + "loss": 0.3356, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 0.627392590045929, + "rewards/margins": 2.5416016578674316, + "rewards/rejected": -1.9140136241912842, + "step": 155 + }, + { + "epoch": 0.11204481792717087, + "grad_norm": 6.131630897521973, + "learning_rate": 9.996175117612607e-07, + "logits/chosen": -4.217187404632568, + "logits/rejected": -4.154687404632568, + "logps/chosen": -319.70001220703125, + "logps/rejected": -337.875, + "loss": 0.2292, + "rewards/accuracies": 0.90625, + "rewards/chosen": 0.4034667909145355, + "rewards/margins": 3.26318359375, + "rewards/rejected": -2.8581299781799316, "step": 160 }, { - "epoch": 0.23876404494382023, - "grad_norm": 0.20937258005142212, - "learning_rate": 4.716266539861866e-07, - "logits/chosen": -3.1851563453674316, - "logits/rejected": -2.635546922683716, - "logps/chosen": -51.939064025878906, - "logps/rejected": -140.15625, - "loss": 0.1944, - "rewards/accuracies": 0.7749999761581421, - "rewards/chosen": -0.5132009387016296, - "rewards/margins": 4.476171970367432, - "rewards/rejected": -4.986914157867432, + "epoch": 0.11554621848739496, + "grad_norm": 26.314456939697266, + "learning_rate": 9.993411649599492e-07, + "logits/chosen": -4.126562595367432, + "logits/rejected": -4.089062690734863, + "logps/chosen": -338.70001220703125, + "logps/rejected": -337.17498779296875, + "loss": 0.3577, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -0.941699206829071, + "rewards/margins": 2.522167921066284, + "rewards/rejected": -3.4608397483825684, + "step": 165 + }, + { + "epoch": 0.11904761904761904, + "grad_norm": 297.8920593261719, + "learning_rate": 9.989902032758903e-07, + "logits/chosen": -4.151953220367432, + "logits/rejected": -4.126172065734863, + "logps/chosen": -325.40625, + "logps/rejected": -334.625, + "loss": 0.2709, + "rewards/accuracies": 0.90625, + "rewards/chosen": -1.824884057044983, + "rewards/margins": 2.510986328125, + "rewards/rejected": -4.337841987609863, "step": 170 }, { - "epoch": 0.25280898876404495, - "grad_norm": 1.1107814311981201, - "learning_rate": 4.6568359649444796e-07, - "logits/chosen": -3.2699217796325684, - "logits/rejected": -2.663281202316284, - "logps/chosen": -37.44843673706055, - "logps/rejected": -139.5437469482422, - "loss": 0.1877, - "rewards/accuracies": 0.768750011920929, - "rewards/chosen": 0.19300994277000427, - "rewards/margins": 5.172265529632568, - "rewards/rejected": -4.980273246765137, + "epoch": 0.12254901960784313, + "grad_norm": 38.77955627441406, + "learning_rate": 9.985646791521163e-07, + "logits/chosen": -4.196875095367432, + "logits/rejected": -4.171875, + "logps/chosen": -307.32501220703125, + "logps/rejected": -299.70001220703125, + "loss": 0.2857, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -0.518969714641571, + "rewards/margins": 2.327831983566284, + "rewards/rejected": -2.846923828125, + "step": 175 + }, + { + "epoch": 0.12605042016806722, + "grad_norm": 9.513249397277832, + "learning_rate": 9.980646561732758e-07, + "logits/chosen": -4.165625095367432, + "logits/rejected": -4.1328125, + "logps/chosen": -342.9624938964844, + "logps/rejected": -336.9624938964844, + "loss": 0.279, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -0.4823059141635895, + "rewards/margins": 2.8280272483825684, + "rewards/rejected": -3.3097167015075684, "step": 180 }, { - "epoch": 0.26685393258426965, - "grad_norm": 0.1760079711675644, - "learning_rate": 4.592209383271023e-07, - "logits/chosen": -3.15234375, - "logits/rejected": NaN, - "logps/chosen": -50.7109375, - "logps/rejected": -142.33749389648438, - "loss": 0.2011, - "rewards/accuracies": 0.753125011920929, - "rewards/chosen": -0.4693801999092102, - "rewards/margins": 4.672265529632568, - "rewards/rejected": -5.140820503234863, + "epoch": 0.12955182072829133, + "grad_norm": 8.950021743774414, + "learning_rate": 9.97490209056133e-07, + "logits/chosen": -4.09765625, + "logits/rejected": -4.087500095367432, + "logps/chosen": -324.20001220703125, + "logps/rejected": -332.0, + "loss": 0.2007, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 0.9912109375, + "rewards/margins": 3.622851610183716, + "rewards/rejected": -2.6298828125, + "step": 185 + }, + { + "epoch": 0.1330532212885154, + "grad_norm": 95.99122619628906, + "learning_rate": 9.968414236384021e-07, + "logits/chosen": -4.088671684265137, + "logits/rejected": -4.05859375, + "logps/chosen": -323.01873779296875, + "logps/rejected": -342.1875, + "loss": 0.2168, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": 1.2775146961212158, + "rewards/margins": 4.678320407867432, + "rewards/rejected": -3.3967041969299316, "step": 190 }, { - "epoch": 0.2808988764044944, - "grad_norm": 0.3540495038032532, - "learning_rate": 4.5225424859373684e-07, - "logits/chosen": -3.2144532203674316, - "logits/rejected": -2.646484375, - "logps/chosen": -44.23749923706055, - "logps/rejected": -146.5, - "loss": 0.1823, - "rewards/accuracies": 0.7749999761581421, - "rewards/chosen": -0.061879731714725494, - "rewards/margins": 5.176171779632568, - "rewards/rejected": -5.240038871765137, + "epoch": 0.13655462184873948, + "grad_norm": 23.21297264099121, + "learning_rate": 9.961183968659215e-07, + "logits/chosen": -4.132031440734863, + "logits/rejected": -4.114843845367432, + "logps/chosen": -315.6312561035156, + "logps/rejected": -333.41876220703125, + "loss": 0.2552, + "rewards/accuracies": 0.90625, + "rewards/chosen": -1.0270264148712158, + "rewards/margins": 3.6083006858825684, + "rewards/rejected": -4.63623046875, + "step": 195 + }, + { + "epoch": 0.1400560224089636, + "grad_norm": 11.13497543334961, + "learning_rate": 9.953212367781675e-07, + "logits/chosen": -4.210156440734863, + "logits/rejected": -4.153906345367432, + "logps/chosen": -329.70001220703125, + "logps/rejected": -343.25, + "loss": 0.2498, + "rewards/accuracies": 0.90625, + "rewards/chosen": -2.5098876953125, + "rewards/margins": 3.6513671875, + "rewards/rejected": -6.163769721984863, "step": 200 }, { - "epoch": 0.2949438202247191, - "grad_norm": 0.4451713263988495, - "learning_rate": 4.448003106601291e-07, - "logits/chosen": -3.2412109375, - "logits/rejected": -2.6431641578674316, - "logps/chosen": -39.69843673706055, - "logps/rejected": -141.1062469482422, - "loss": 0.2101, - "rewards/accuracies": 0.7671874761581421, - "rewards/chosen": 0.11627502739429474, - "rewards/margins": 5.123827934265137, - "rewards/rejected": -5.008593559265137, + "epoch": 0.14355742296918766, + "grad_norm": 9.842167854309082, + "learning_rate": 9.944500624921093e-07, + "logits/chosen": -4.247656345367432, + "logits/rejected": -4.170312404632568, + "logps/chosen": -308.3999938964844, + "logps/rejected": -328.42498779296875, + "loss": 0.1851, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.574206531047821, + "rewards/margins": 4.539453029632568, + "rewards/rejected": -5.109606742858887, + "step": 205 + }, + { + "epoch": 0.14705882352941177, + "grad_norm": 11.150321960449219, + "learning_rate": 9.93505004184412e-07, + "logits/chosen": -4.275781154632568, + "logits/rejected": -4.194531440734863, + "logps/chosen": -294.6000061035156, + "logps/rejected": -299.57501220703125, + "loss": 0.2046, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 3.0366578102111816, + "rewards/margins": 3.9019532203674316, + "rewards/rejected": -0.867999255657196, "step": 210 }, { - "epoch": 0.3089887640449438, - "grad_norm": 1.5930163860321045, - "learning_rate": 4.3687708171564917e-07, - "logits/chosen": -3.2291016578674316, - "logits/rejected": -2.604296922683716, - "logps/chosen": -42.740623474121094, - "logps/rejected": -145.375, - "loss": 0.2042, - "rewards/accuracies": 0.7578125, - "rewards/chosen": -0.070429228246212, - "rewards/margins": 5.188672065734863, - "rewards/rejected": -5.260156154632568, + "epoch": 0.15056022408963585, + "grad_norm": 11.823946952819824, + "learning_rate": 9.92486203071982e-07, + "logits/chosen": -4.1875, + "logits/rejected": -4.145312309265137, + "logps/chosen": -262.70623779296875, + "logps/rejected": -274.20001220703125, + "loss": 0.2582, + "rewards/accuracies": 0.8812500238418579, + "rewards/chosen": 3.4903321266174316, + "rewards/margins": 3.1060547828674316, + "rewards/rejected": 0.3824706971645355, + "step": 215 + }, + { + "epoch": 0.15406162464985995, + "grad_norm": 4.554815769195557, + "learning_rate": 9.913938113908674e-07, + "logits/chosen": -4.1796875, + "logits/rejected": -4.120312690734863, + "logps/chosen": -282.3374938964844, + "logps/rejected": -290.6187438964844, + "loss": 0.2069, + "rewards/accuracies": 0.9375, + "rewards/chosen": 1.898474097251892, + "rewards/margins": 3.626757860183716, + "rewards/rejected": -1.730224609375, "step": 220 }, { - "epoch": 0.32303370786516855, - "grad_norm": 0.27790066599845886, - "learning_rate": 4.2850364951281705e-07, - "logits/chosen": -3.2845702171325684, - "logits/rejected": -2.6654295921325684, - "logps/chosen": -39.85625076293945, - "logps/rejected": -149.9250030517578, - "loss": 0.1742, - "rewards/accuracies": 0.778124988079071, - "rewards/chosen": 0.12183837592601776, - "rewards/margins": 5.572656154632568, + "epoch": 0.15756302521008403, + "grad_norm": 9.618704795837402, + "learning_rate": 9.902279923735093e-07, + "logits/chosen": -4.1328125, + "logits/rejected": -4.083203315734863, + "logps/chosen": -307.48126220703125, + "logps/rejected": -318.1875, + "loss": 0.1849, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.22895507514476776, + "rewards/margins": 3.7847657203674316, + "rewards/rejected": -4.01904296875, + "step": 225 + }, + { + "epoch": 0.16106442577030813, + "grad_norm": 14.65978717803955, + "learning_rate": 9.8898892022435e-07, + "logits/chosen": -4.105859279632568, + "logits/rejected": -4.065625190734863, + "logps/chosen": -344.84375, + "logps/rejected": -358.01251220703125, + "loss": 0.212, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.134960889816284, + "rewards/margins": 3.317578077316284, "rewards/rejected": -5.451562404632568, "step": 230 }, { - "epoch": 0.33707865168539325, - "grad_norm": 0.32155928015708923, - "learning_rate": 4.1970018638323547e-07, - "logits/chosen": -3.26953125, - "logits/rejected": -2.6566405296325684, - "logps/chosen": -41.8671875, - "logps/rejected": -152.64999389648438, - "loss": 0.174, - "rewards/accuracies": 0.7875000238418579, - "rewards/chosen": 0.051790811121463776, - "rewards/margins": 5.608593940734863, - "rewards/rejected": -5.559179782867432, + "epoch": 0.1645658263305322, + "grad_norm": 14.46033000946045, + "learning_rate": 9.876767800938031e-07, + "logits/chosen": -4.133984565734863, + "logits/rejected": -4.099218845367432, + "logps/chosen": -331.66876220703125, + "logps/rejected": -340.7124938964844, + "loss": 0.1346, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -2.249926805496216, + "rewards/margins": 3.540234327316284, + "rewards/rejected": -5.790820121765137, + "step": 235 + }, + { + "epoch": 0.16806722689075632, + "grad_norm": 41.85720443725586, + "learning_rate": 9.862917680505863e-07, + "logits/chosen": -4.169531345367432, + "logits/rejected": -4.102343559265137, + "logps/chosen": -294.1499938964844, + "logps/rejected": -309.17498779296875, + "loss": 0.1258, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": -0.0751953125, + "rewards/margins": 4.217187404632568, + "rewards/rejected": -4.292138576507568, "step": 240 }, { - "epoch": 0.351123595505618, - "grad_norm": 0.2504253685474396, - "learning_rate": 4.1048790064067573e-07, - "logits/chosen": -3.254687547683716, - "logits/rejected": -2.630078077316284, - "logps/chosen": -40.02812576293945, - "logps/rejected": -154.9250030517578, - "loss": 0.1722, - "rewards/accuracies": 0.785937488079071, - "rewards/chosen": 0.126708984375, - "rewards/margins": 5.812109470367432, - "rewards/rejected": -5.681640625, + "epoch": 0.1715686274509804, + "grad_norm": 42.87565612792969, + "learning_rate": 9.848340910524241e-07, + "logits/chosen": -4.21484375, + "logits/rejected": -4.102343559265137, + "logps/chosen": -311.0, + "logps/rejected": -335.1499938964844, + "loss": 0.1509, + "rewards/accuracies": 0.9375, + "rewards/chosen": 1.9837768077850342, + "rewards/margins": 4.403222560882568, + "rewards/rejected": -2.420605421066284, + "step": 245 + }, + { + "epoch": 0.17507002801120447, + "grad_norm": 10.537090301513672, + "learning_rate": 9.833039669151225e-07, + "logits/chosen": -4.223437309265137, + "logits/rejected": -4.108202934265137, + "logps/chosen": -293.51251220703125, + "logps/rejected": -308.17498779296875, + "loss": 0.2425, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": 2.3416991233825684, + "rewards/margins": 4.234765529632568, + "rewards/rejected": -1.894873023033142, "step": 250 }, { - "epoch": 0.3651685393258427, - "grad_norm": 1.5456242561340332, - "learning_rate": 4.0088898548839285e-07, - "logits/chosen": -3.2398438453674316, - "logits/rejected": -2.6322264671325684, - "logps/chosen": -42.142189025878906, - "logps/rejected": -149.5749969482422, - "loss": 0.1996, - "rewards/accuracies": 0.7562500238418579, - "rewards/chosen": -0.08585663139820099, - "rewards/margins": 5.431640625, - "rewards/rejected": -5.516211032867432, + "epoch": 0.17857142857142858, + "grad_norm": 2.457155466079712, + "learning_rate": 9.817016242800215e-07, + "logits/chosen": -4.33984375, + "logits/rejected": -4.192187309265137, + "logps/chosen": -278.23126220703125, + "logps/rejected": -295.4375, + "loss": 0.1635, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": 2.135058641433716, + "rewards/margins": 4.299609184265137, + "rewards/rejected": -2.1654295921325684, + "step": 255 + }, + { + "epoch": 0.18207282913165265, + "grad_norm": 14.015923500061035, + "learning_rate": 9.8002730257983e-07, + "logits/chosen": -4.373437404632568, + "logits/rejected": -4.24609375, + "logps/chosen": -283.6812438964844, + "logps/rejected": -301.4624938964844, + "loss": 0.239, + "rewards/accuracies": 0.893750011920929, + "rewards/chosen": 2.7750000953674316, + "rewards/margins": 3.8597655296325684, + "rewards/rejected": -1.085168480873108, "step": 260 }, { - "epoch": 0.3792134831460674, - "grad_norm": 0.1563502699136734, - "learning_rate": 3.9092656555375414e-07, - "logits/chosen": -3.268359422683716, - "logits/rejected": -2.6371092796325684, - "logps/chosen": -41.34375, - "logps/rejected": -160.625, - "loss": 0.159, - "rewards/accuracies": 0.796875, - "rewards/chosen": 0.07463989406824112, - "rewards/margins": 6.014843940734863, - "rewards/rejected": -5.940234184265137, + "epoch": 0.18557422969187676, + "grad_norm": 6.20159912109375, + "learning_rate": 9.782812520028486e-07, + "logits/chosen": -4.380468845367432, + "logits/rejected": -4.271093845367432, + "logps/chosen": -313.67498779296875, + "logps/rejected": -328.0249938964844, + "loss": 0.173, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 2.6710448265075684, + "rewards/margins": 4.1572265625, + "rewards/rejected": -1.4860107898712158, + "step": 265 + }, + { + "epoch": 0.18907563025210083, + "grad_norm": 8.501385688781738, + "learning_rate": 9.764637334555838e-07, + "logits/chosen": -4.25, + "logits/rejected": -4.155859470367432, + "logps/chosen": -323.3500061035156, + "logps/rejected": -346.7749938964844, + "loss": 0.1794, + "rewards/accuracies": 0.9312499761581421, + "rewards/chosen": 0.511059582233429, + "rewards/margins": 4.514257907867432, + "rewards/rejected": -4.004687309265137, "step": 270 }, { - "epoch": 0.39325842696629215, - "grad_norm": 0.414302796125412, - "learning_rate": 3.806246411789872e-07, - "logits/chosen": -3.282421827316284, - "logits/rejected": NaN, - "logps/chosen": -39.970314025878906, - "logps/rejected": -151.84375, - "loss": 0.1905, - "rewards/accuracies": 0.7640625238418579, - "rewards/chosen": 0.1959686279296875, - "rewards/margins": 5.774609565734863, - "rewards/rejected": -5.579492092132568, + "epoch": 0.19257703081232494, + "grad_norm": 43.63261413574219, + "learning_rate": 9.74575018523763e-07, + "logits/chosen": -4.196093559265137, + "logits/rejected": -4.1171875, + "logps/chosen": -347.2749938964844, + "logps/rejected": -358.70001220703125, + "loss": 0.1583, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -0.863964855670929, + "rewards/margins": 4.191601753234863, + "rewards/rejected": -5.051171779632568, + "step": 275 + }, + { + "epoch": 0.19607843137254902, + "grad_norm": 3.3350625038146973, + "learning_rate": 9.726153894317505e-07, + "logits/chosen": -4.134375095367432, + "logits/rejected": -4.066015720367432, + "logps/chosen": -347.7250061035156, + "logps/rejected": -371.6499938964844, + "loss": 0.1537, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -1.461084008216858, + "rewards/margins": 4.367383003234863, + "rewards/rejected": -5.829492092132568, "step": 280 }, { - "epoch": 0.40730337078651685, - "grad_norm": 0.11994462460279465, - "learning_rate": 3.700080306022528e-07, - "logits/chosen": -3.199023485183716, - "logits/rejected": -2.582226514816284, - "logps/chosen": -51.41093826293945, - "logps/rejected": -164.3125, - "loss": 0.1673, - "rewards/accuracies": 0.7890625, - "rewards/chosen": -0.46795958280563354, - "rewards/margins": 5.684179782867432, - "rewards/rejected": -6.149609565734863, + "epoch": 0.19957983193277312, + "grad_norm": 8.582822799682617, + "learning_rate": 9.705851390003783e-07, + "logits/chosen": -4.100781440734863, + "logits/rejected": -4.035937309265137, + "logps/chosen": -333.8500061035156, + "logps/rejected": -351.57501220703125, + "loss": 0.1489, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.152880907058716, + "rewards/margins": 4.238379001617432, + "rewards/rejected": -6.391992092132568, + "step": 285 + }, + { + "epoch": 0.2030812324929972, + "grad_norm": 6.626759052276611, + "learning_rate": 9.684845706031877e-07, + "logits/chosen": -4.0234375, + "logits/rejected": -3.9957032203674316, + "logps/chosen": -337.75, + "logps/rejected": -357.95001220703125, + "loss": 0.1771, + "rewards/accuracies": 0.9312499761581421, + "rewards/chosen": -2.648266553878784, + "rewards/margins": 4.245703220367432, + "rewards/rejected": -6.89453125, "step": 290 }, { - "epoch": 0.42134831460674155, - "grad_norm": 0.3142966330051422, - "learning_rate": 3.5910231016833546e-07, - "logits/chosen": -3.1810545921325684, - "logits/rejected": -2.5703125, - "logps/chosen": -50.90625, - "logps/rejected": -161.7375030517578, - "loss": 0.1775, - "rewards/accuracies": 0.7796875238418579, - "rewards/chosen": -0.4352920651435852, - "rewards/margins": 5.612890720367432, - "rewards/rejected": -6.050000190734863, + "epoch": 0.20658263305322128, + "grad_norm": 8.172663688659668, + "learning_rate": 9.663139981210998e-07, + "logits/chosen": -4.048047065734863, + "logits/rejected": -4.009765625, + "logps/chosen": -322.5, + "logps/rejected": -356.04998779296875, + "loss": 0.0956, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -1.319116234779358, + "rewards/margins": 4.673632621765137, + "rewards/rejected": -5.993750095367432, + "step": 295 + }, + { + "epoch": 0.21008403361344538, + "grad_norm": 7.659239292144775, + "learning_rate": 9.640737458955118e-07, + "logits/chosen": -3.9984374046325684, + "logits/rejected": -3.946093797683716, + "logps/chosen": -307.3687438964844, + "logps/rejected": -328.86248779296875, + "loss": 0.1718, + "rewards/accuracies": 0.9312499761581421, + "rewards/chosen": -0.5625640749931335, + "rewards/margins": 4.830859184265137, + "rewards/rejected": -5.391699314117432, "step": 300 }, { - "epoch": 0.4353932584269663, - "grad_norm": 0.5336220264434814, - "learning_rate": 3.4793375271298895e-07, - "logits/chosen": -3.252148389816284, - "logits/rejected": NaN, - "logps/chosen": -41.959373474121094, - "logps/rejected": -157.24374389648438, - "loss": 0.1855, - "rewards/accuracies": 0.768750011920929, - "rewards/chosen": 0.02679443359375, - "rewards/margins": 5.8720703125, - "rewards/rejected": -5.84375, + "epoch": 0.21358543417366946, + "grad_norm": 15.881430625915527, + "learning_rate": 9.61764148679833e-07, + "logits/chosen": -4.079687595367432, + "logits/rejected": -3.9722657203674316, + "logps/chosen": -293.5249938964844, + "logps/rejected": -323.17498779296875, + "loss": 0.1267, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.255767822265625, + "rewards/margins": 5.384765625, + "rewards/rejected": -5.129296779632568, + "step": 305 + }, + { + "epoch": 0.21708683473389356, + "grad_norm": 3.689927577972412, + "learning_rate": 9.59385551589462e-07, + "logits/chosen": -4.171093940734863, + "logits/rejected": -4.001172065734863, + "logps/chosen": -309.2250061035156, + "logps/rejected": -344.17498779296875, + "loss": 0.1433, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.39042967557907104, + "rewards/margins": 5.873437404632568, + "rewards/rejected": -6.268750190734863, "step": 310 }, { - "epoch": 0.449438202247191, - "grad_norm": 0.11426942050457001, - "learning_rate": 3.3652926426937325e-07, - "logits/chosen": -3.280078172683716, - "logits/rejected": -2.6470704078674316, - "logps/chosen": -40.0546875, - "logps/rejected": -156.4812469482422, - "loss": 0.1744, - "rewards/accuracies": 0.770312488079071, - "rewards/chosen": 0.06229095533490181, - "rewards/margins": 5.884179592132568, - "rewards/rejected": -5.8212890625, + "epoch": 0.22058823529411764, + "grad_norm": 3.1176366806030273, + "learning_rate": 9.56938310050219e-07, + "logits/chosen": -4.214453220367432, + "logits/rejected": -4.013671875, + "logps/chosen": -304.75, + "logps/rejected": -351.7875061035156, + "loss": 0.0886, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.46104735136032104, + "rewards/margins": 6.175000190734863, + "rewards/rejected": -6.635546684265137, + "step": 315 + }, + { + "epoch": 0.22408963585434175, + "grad_norm": 10.508804321289062, + "learning_rate": 9.544227897452347e-07, + "logits/chosen": -4.287890434265137, + "logits/rejected": -4.046093940734863, + "logps/chosen": -321.46875, + "logps/rejected": -362.0249938964844, + "loss": 0.2045, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.84326171875, + "rewards/margins": 6.593945503234863, + "rewards/rejected": -5.749804496765137, "step": 320 }, { - "epoch": 0.46348314606741575, - "grad_norm": 0.12852512300014496, - "learning_rate": 3.249163192490642e-07, - "logits/chosen": -3.245898485183716, - "logits/rejected": -2.5863280296325684, - "logps/chosen": -43.52031326293945, - "logps/rejected": -163.125, - "loss": 0.1654, - "rewards/accuracies": 0.785937488079071, - "rewards/chosen": -0.09057464450597763, - "rewards/margins": 6.042578220367432, - "rewards/rejected": -6.135937690734863, + "epoch": 0.22759103641456582, + "grad_norm": 8.315167427062988, + "learning_rate": 9.518393665603082e-07, + "logits/chosen": -4.189453125, + "logits/rejected": -4.031640529632568, + "logps/chosen": -314.29376220703125, + "logps/rejected": -360.70001220703125, + "loss": 0.1578, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": -1.0939452648162842, + "rewards/margins": 6.074999809265137, + "rewards/rejected": -7.168359279632568, + "step": 325 + }, + { + "epoch": 0.23109243697478993, + "grad_norm": 7.695666313171387, + "learning_rate": 9.491884265277382e-07, + "logits/chosen": -4.165625095367432, + "logits/rejected": -3.9828124046325684, + "logps/chosen": -356.6000061035156, + "logps/rejected": -392.125, + "loss": 0.1062, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": -3.137158155441284, + "rewards/margins": 5.859667778015137, + "rewards/rejected": -8.993749618530273, "step": 330 }, { - "epoch": 0.47752808988764045, - "grad_norm": 1.0368801355361938, - "learning_rate": 3.1312289425378944e-07, - "logits/chosen": -3.237109422683716, - "logits/rejected": -2.5757813453674316, - "logps/chosen": -45.65156173706055, - "logps/rejected": -166.43124389648438, - "loss": 0.1851, - "rewards/accuracies": 0.7718750238418579, - "rewards/chosen": -0.196772962808609, - "rewards/margins": 6.102734565734863, - "rewards/rejected": -6.302343845367432, + "epoch": 0.234593837535014, + "grad_norm": 29.099950790405273, + "learning_rate": 9.46470365768641e-07, + "logits/chosen": -4.122656345367432, + "logits/rejected": -3.9632811546325684, + "logps/chosen": -353.7124938964844, + "logps/rejected": -384.63751220703125, + "loss": 0.1972, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": -4.113037109375, + "rewards/margins": 5.382031440734863, + "rewards/rejected": -9.497655868530273, + "step": 335 + }, + { + "epoch": 0.23809523809523808, + "grad_norm": 8.61391830444336, + "learning_rate": 9.436855904337594e-07, + "logits/chosen": -4.144140720367432, + "logits/rejected": -3.9921875, + "logps/chosen": -359.67498779296875, + "logps/rejected": -398.4750061035156, + "loss": 0.1049, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": -3.8545289039611816, + "rewards/margins": 5.109375, + "rewards/rejected": -8.96484375, "step": 340 }, { - "epoch": 0.49157303370786515, - "grad_norm": 0.2768089473247528, - "learning_rate": 3.011774006773449e-07, - "logits/chosen": -3.1851563453674316, - "logits/rejected": -2.553515672683716, - "logps/chosen": -50.12968826293945, - "logps/rejected": -172.3125, - "loss": 0.1616, - "rewards/accuracies": 0.801562488079071, - "rewards/chosen": -0.33214110136032104, - "rewards/margins": 6.178515434265137, - "rewards/rejected": -6.510156154632568, + "epoch": 0.2415966386554622, + "grad_norm": 14.49962329864502, + "learning_rate": 9.408345166427718e-07, + "logits/chosen": -4.134375095367432, + "logits/rejected": -3.9613280296325684, + "logps/chosen": -325.67498779296875, + "logps/rejected": -355.8125, + "loss": 0.2243, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -2.363330125808716, + "rewards/margins": 5.4833984375, + "rewards/rejected": -7.849609375, + "step": 345 + }, + { + "epoch": 0.24509803921568626, + "grad_norm": 8.390877723693848, + "learning_rate": 9.379175704221138e-07, + "logits/chosen": -4.162109375, + "logits/rejected": -3.991015672683716, + "logps/chosen": -287.53125, + "logps/rejected": -323.08123779296875, + "loss": 0.1182, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.650317370891571, + "rewards/margins": 5.601953029632568, + "rewards/rejected": -6.25146484375, "step": 350 }, { - "epoch": 0.5056179775280899, - "grad_norm": 0.5607307553291321, - "learning_rate": 2.8910861626005773e-07, - "logits/chosen": -3.2269530296325684, - "logits/rejected": -2.5611329078674316, - "logps/chosen": -45.09375, - "logps/rejected": -164.16250610351562, - "loss": 0.1745, - "rewards/accuracies": 0.7718750238418579, - "rewards/chosen": -0.21467895805835724, - "rewards/margins": 6.003515720367432, - "rewards/rejected": -6.216406345367432, + "epoch": 0.24859943977591037, + "grad_norm": 12.179805755615234, + "learning_rate": 9.349351876413181e-07, + "logits/chosen": -4.13671875, + "logits/rejected": -3.953906297683716, + "logps/chosen": -319.70001220703125, + "logps/rejected": -360.36248779296875, + "loss": 0.1587, + "rewards/accuracies": 0.9312499761581421, + "rewards/chosen": -1.6496093273162842, + "rewards/margins": 5.811327934265137, + "rewards/rejected": -7.461718559265137, + "step": 355 + }, + { + "epoch": 0.25210084033613445, + "grad_norm": 11.038627624511719, + "learning_rate": 9.318878139478839e-07, + "logits/chosen": -4.061718940734863, + "logits/rejected": -3.8882813453674316, + "logps/chosen": -318.4375, + "logps/rejected": -355.6875, + "loss": 0.1205, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": -1.5353882312774658, + "rewards/margins": 5.686181545257568, + "rewards/rejected": -7.225390434265137, "step": 360 }, { - "epoch": 0.5196629213483146, - "grad_norm": 0.45973560214042664, - "learning_rate": 2.7694561576068983e-07, - "logits/chosen": -3.203906297683716, - "logits/rejected": -2.5244140625, - "logps/chosen": -46.931251525878906, - "logps/rejected": -171.2375030517578, - "loss": 0.1627, - "rewards/accuracies": 0.796875, - "rewards/chosen": -0.20502586662769318, - "rewards/margins": 6.284570217132568, - "rewards/rejected": -6.489062309265137, + "epoch": 0.2556022408963585, + "grad_norm": 12.718750953674316, + "learning_rate": 9.287759047006859e-07, + "logits/chosen": -4.049609184265137, + "logits/rejected": -3.8804688453674316, + "logps/chosen": -322.20001220703125, + "logps/rejected": -350.9750061035156, + "loss": 0.1692, + "rewards/accuracies": 0.918749988079071, + "rewards/chosen": -0.41533201932907104, + "rewards/margins": 6.026074409484863, + "rewards/rejected": -6.441210746765137, + "step": 365 + }, + { + "epoch": 0.25910364145658266, + "grad_norm": 6.676199436187744, + "learning_rate": 9.255999249019307e-07, + "logits/chosen": -3.9546875953674316, + "logits/rejected": -3.853515625, + "logps/chosen": -289.6875, + "logps/rejected": -326.875, + "loss": 0.1589, + "rewards/accuracies": 0.9375, + "rewards/chosen": 1.5786621570587158, + "rewards/margins": 6.227734565734863, + "rewards/rejected": -4.654510498046875, "step": 370 }, { - "epoch": 0.5337078651685393, - "grad_norm": 0.33313286304473877, - "learning_rate": 2.647177009127972e-07, - "logits/chosen": -3.1996092796325684, - "logits/rejected": -2.5083985328674316, - "logps/chosen": -46.064064025878906, - "logps/rejected": -171.14999389648438, - "loss": 0.1645, - "rewards/accuracies": 0.7984374761581421, - "rewards/chosen": -0.1926528960466385, - "rewards/margins": 6.312890529632568, - "rewards/rejected": -6.504101753234863, + "epoch": 0.26260504201680673, + "grad_norm": 2.6601064205169678, + "learning_rate": 9.223603491276733e-07, + "logits/chosen": -3.9593749046325684, + "logits/rejected": -3.8558592796325684, + "logps/chosen": -318.0, + "logps/rejected": -365.1000061035156, + "loss": 0.1186, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 1.2627441883087158, + "rewards/margins": 7.131640434265137, + "rewards/rejected": -5.866894721984863, + "step": 375 + }, + { + "epoch": 0.2661064425770308, + "grad_norm": 10.011348724365234, + "learning_rate": 9.190576614569034e-07, + "logits/chosen": -4.017968654632568, + "logits/rejected": -3.8695311546325684, + "logps/chosen": -323.7250061035156, + "logps/rejected": -368.2250061035156, + "loss": 0.1091, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -0.39854127168655396, + "rewards/margins": 7.003125190734863, + "rewards/rejected": -7.404687404632568, "step": 380 }, { - "epoch": 0.547752808988764, - "grad_norm": 0.3406722843647003, - "learning_rate": 2.524543298342874e-07, - "logits/chosen": -3.216992139816284, - "logits/rejected": -2.5326170921325684, - "logps/chosen": -43.662498474121094, - "logps/rejected": -168.7937469482422, - "loss": 0.1589, - "rewards/accuracies": 0.792187511920929, - "rewards/chosen": -0.07064209133386612, - "rewards/margins": 6.324023246765137, - "rewards/rejected": -6.3935546875, + "epoch": 0.2696078431372549, + "grad_norm": 39.08020782470703, + "learning_rate": 9.156923553992106e-07, + "logits/chosen": -4.040625095367432, + "logits/rejected": -3.8871092796325684, + "logps/chosen": -312.4750061035156, + "logps/rejected": -364.0874938964844, + "loss": 0.132, + "rewards/accuracies": 0.9375, + "rewards/chosen": 1.1413085460662842, + "rewards/margins": 7.5625, + "rewards/rejected": -6.409375190734863, + "step": 385 + }, + { + "epoch": 0.27310924369747897, + "grad_norm": 2.079184055328369, + "learning_rate": 9.122649338210406e-07, + "logits/chosen": -4.075390815734863, + "logits/rejected": -3.9078125953674316, + "logps/chosen": -310.21875, + "logps/rejected": -352.7250061035156, + "loss": 0.084, + "rewards/accuracies": 0.96875, + "rewards/chosen": 0.4845214784145355, + "rewards/margins": 7.5390625, + "rewards/rejected": -7.056250095367432, "step": 390 }, { - "epoch": 0.5617977528089888, - "grad_norm": 0.2776348292827606, - "learning_rate": 2.401850460602329e-07, - "logits/chosen": -3.2457032203674316, - "logits/rejected": -2.5503907203674316, - "logps/chosen": -41.400001525878906, - "logps/rejected": -166.88125610351562, - "loss": 0.1747, - "rewards/accuracies": 0.7828124761581421, - "rewards/chosen": 0.02814788743853569, - "rewards/margins": 6.317187309265137, - "rewards/rejected": -6.288281440734863, + "epoch": 0.2766106442577031, + "grad_norm": 12.645612716674805, + "learning_rate": 9.08775908870554e-07, + "logits/chosen": -4.032422065734863, + "logits/rejected": -3.889843702316284, + "logps/chosen": -321.88751220703125, + "logps/rejected": -366.32501220703125, + "loss": 0.1464, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": 0.23764649033546448, + "rewards/margins": 7.212109565734863, + "rewards/rejected": -6.974804878234863, + "step": 395 + }, + { + "epoch": 0.2801120448179272, + "grad_norm": 13.877970695495605, + "learning_rate": 9.052258019010979e-07, + "logits/chosen": -4.080078125, + "logits/rejected": -3.911328077316284, + "logps/chosen": -324.6499938964844, + "logps/rejected": -367.5249938964844, + "loss": 0.1189, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": 0.22805175185203552, + "rewards/margins": 6.795117378234863, + "rewards/rejected": -6.571093559265137, "step": 400 }, { - "epoch": 0.5758426966292135, - "grad_norm": 1.0503939390182495, - "learning_rate": 2.2793940736990766e-07, - "logits/chosen": -3.241992235183716, - "logits/rejected": -2.546093702316284, - "logps/chosen": -40.43437576293945, - "logps/rejected": -163.3874969482422, - "loss": 0.1881, - "rewards/accuracies": 0.7718750238418579, - "rewards/chosen": 0.005574035458266735, - "rewards/margins": 6.197851657867432, - "rewards/rejected": -6.190625190734863, + "epoch": 0.28361344537815125, + "grad_norm": 8.056689262390137, + "learning_rate": 9.01615143393301e-07, + "logits/chosen": -4.070703029632568, + "logits/rejected": -3.923828125, + "logps/chosen": -303.82501220703125, + "logps/rejected": -351.625, + "loss": 0.0839, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -0.4512085020542145, + "rewards/margins": 6.696875095367432, + "rewards/rejected": -7.1455078125, + "step": 405 + }, + { + "epoch": 0.28711484593837533, + "grad_norm": 6.637094497680664, + "learning_rate": 8.979444728758065e-07, + "logits/chosen": -4.039453029632568, + "logits/rejected": -3.895312547683716, + "logps/chosen": -315.0, + "logps/rejected": -365.5, + "loss": 0.1436, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.4109863340854645, + "rewards/margins": 6.589453220367432, + "rewards/rejected": -6.17919921875, "step": 410 }, { - "epoch": 0.5898876404494382, - "grad_norm": 1.4323946237564087, - "learning_rate": 2.1574691457950803e-07, - "logits/chosen": -3.208203077316284, - "logits/rejected": NaN, - "logps/chosen": -43.95781326293945, - "logps/rejected": -171.10000610351562, - "loss": 0.1664, - "rewards/accuracies": 0.7890625, - "rewards/chosen": -0.07436218112707138, - "rewards/margins": 6.463086128234863, - "rewards/rejected": -6.539453029632568, + "epoch": 0.29061624649859946, + "grad_norm": 1.8166749477386475, + "learning_rate": 8.942143388446521e-07, + "logits/chosen": -4.03125, + "logits/rejected": -3.833203077316284, + "logps/chosen": -353.0, + "logps/rejected": -384.3999938964844, + "loss": 0.1075, + "rewards/accuracies": 0.96875, + "rewards/chosen": -0.49619752168655396, + "rewards/margins": 7.074999809265137, + "rewards/rejected": -7.573828220367432, + "step": 415 + }, + { + "epoch": 0.29411764705882354, + "grad_norm": 5.399685382843018, + "learning_rate": 8.90425298681309e-07, + "logits/chosen": -3.992968797683716, + "logits/rejected": -3.8003907203674316, + "logps/chosen": -317.8999938964844, + "logps/rejected": -366.23748779296875, + "loss": 0.1127, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": 0.502575695514679, + "rewards/margins": 7.261523246765137, + "rewards/rejected": -6.760644435882568, "step": 420 }, { - "epoch": 0.6039325842696629, - "grad_norm": 0.678728461265564, - "learning_rate": 2.036369404721023e-07, - "logits/chosen": -3.1888670921325684, - "logits/rejected": NaN, - "logps/chosen": -47.20624923706055, - "logps/rejected": -169.1062469482422, - "loss": 0.166, - "rewards/accuracies": 0.778124988079071, - "rewards/chosen": -0.290365606546402, - "rewards/margins": 6.1806640625, - "rewards/rejected": -6.469531059265137, + "epoch": 0.2976190476190476, + "grad_norm": 49.38500213623047, + "learning_rate": 8.865779185693957e-07, + "logits/chosen": -3.990234375, + "logits/rejected": -3.8460936546325684, + "logps/chosen": -314.8999938964844, + "logps/rejected": -356.23748779296875, + "loss": 0.2774, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": 0.49787598848342896, + "rewards/margins": 6.515234470367432, + "rewards/rejected": -6.016992092132568, + "step": 425 + }, + { + "epoch": 0.3011204481792717, + "grad_norm": 6.173446178436279, + "learning_rate": 8.826727734100741e-07, + "logits/chosen": -3.994921922683716, + "logits/rejected": -3.867968797683716, + "logps/chosen": -317.0562438964844, + "logps/rejected": -348.3125, + "loss": 0.1749, + "rewards/accuracies": 0.9312499761581421, + "rewards/chosen": -0.03021240234375, + "rewards/margins": 6.094140529632568, + "rewards/rejected": -6.126562595367432, "step": 430 }, { - "epoch": 0.6179775280898876, - "grad_norm": 0.20119936764240265, - "learning_rate": 1.9163865903602372e-07, - "logits/chosen": -3.2255859375, - "logits/rejected": -2.520703077316284, - "logps/chosen": -45.240623474121094, - "logps/rejected": -167.2624969482422, - "loss": 0.1816, - "rewards/accuracies": 0.7671874761581421, - "rewards/chosen": -0.21324768662452698, - "rewards/margins": 6.177538871765137, - "rewards/rejected": -6.390625, + "epoch": 0.30462184873949577, + "grad_norm": 2.559511423110962, + "learning_rate": 8.787104467361441e-07, + "logits/chosen": -3.9769530296325684, + "logits/rejected": -3.879687547683716, + "logps/chosen": -313.11248779296875, + "logps/rejected": -349.92498779296875, + "loss": 0.0598, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -0.4130493104457855, + "rewards/margins": 6.033593654632568, + "rewards/rejected": -6.443163871765137, + "step": 435 + }, + { + "epoch": 0.3081232492997199, + "grad_norm": 14.759501457214355, + "learning_rate": 8.746915306248485e-07, + "logits/chosen": -3.9320311546325684, + "logits/rejected": -3.7914061546325684, + "logps/chosen": -285.2749938964844, + "logps/rejected": -328.20001220703125, + "loss": 0.1598, + "rewards/accuracies": 0.918749988079071, + "rewards/chosen": -0.0230560302734375, + "rewards/margins": 5.8984375, + "rewards/rejected": -5.926953315734863, "step": 440 }, { - "epoch": 0.6320224719101124, - "grad_norm": 0.7942313551902771, - "learning_rate": 1.7978097518217702e-07, - "logits/chosen": -3.1763672828674316, - "logits/rejected": -2.5126953125, - "logps/chosen": -49.80937576293945, - "logps/rejected": -172.6374969482422, - "loss": 0.1685, - "rewards/accuracies": 0.796875, - "rewards/chosen": -0.3480590879917145, - "rewards/margins": 6.218359470367432, - "rewards/rejected": -6.568554878234863, + "epoch": 0.311624649859944, + "grad_norm": 12.394731521606445, + "learning_rate": 8.706166256094012e-07, + "logits/chosen": -3.882031202316284, + "logits/rejected": -3.7718749046325684, + "logps/chosen": -299.70623779296875, + "logps/rejected": -334.76251220703125, + "loss": 0.2544, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.664074718952179, + "rewards/margins": 5.223828315734863, + "rewards/rejected": -4.555859565734863, + "step": 445 + }, + { + "epoch": 0.31512605042016806, + "grad_norm": 9.317229270935059, + "learning_rate": 8.664863405892504e-07, + "logits/chosen": -3.80859375, + "logits/rejected": -3.715625047683716, + "logps/chosen": -285.53125, + "logps/rejected": -320.8999938964844, + "loss": 0.1249, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": 0.667736828327179, + "rewards/margins": 5.558789253234863, + "rewards/rejected": -4.88818359375, "step": 450 }, { - "epoch": 0.6460674157303371, - "grad_norm": 0.9307264685630798, - "learning_rate": 1.6809245510957666e-07, - "logits/chosen": -3.2232422828674316, - "logits/rejected": -2.4925780296325684, - "logps/chosen": -44.875, - "logps/rejected": -171.28750610351562, - "loss": 0.1721, - "rewards/accuracies": 0.7796875238418579, - "rewards/chosen": -0.14088821411132812, - "rewards/margins": 6.381640434265137, - "rewards/rejected": -6.520312309265137, + "epoch": 0.31862745098039214, + "grad_norm": 4.753352642059326, + "learning_rate": 8.623012927390934e-07, + "logits/chosen": -3.755859375, + "logits/rejected": -3.635937452316284, + "logps/chosen": -316.8500061035156, + "logps/rejected": -344.42498779296875, + "loss": 0.1457, + "rewards/accuracies": 0.9312499761581421, + "rewards/chosen": 1.5716552734375, + "rewards/margins": 5.596875190734863, + "rewards/rejected": -4.024365425109863, + "step": 455 + }, + { + "epoch": 0.32212885154061627, + "grad_norm": 5.382900238037109, + "learning_rate": 8.580621074166552e-07, + "logits/chosen": -3.954296827316284, + "logits/rejected": -3.7894530296325684, + "logps/chosen": -315.0062561035156, + "logps/rejected": -343.9375, + "loss": 0.1202, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": 1.381860375404358, + "rewards/margins": 6.178906440734863, + "rewards/rejected": -4.7978515625, "step": 460 }, { - "epoch": 0.6601123595505618, - "grad_norm": 1.6537760496139526, - "learning_rate": 1.5660125748687093e-07, - "logits/chosen": -3.2333984375, - "logits/rejected": -2.518749952316284, - "logps/chosen": -43.334373474121094, - "logps/rejected": -167.8125, - "loss": 0.1794, - "rewards/accuracies": 0.770312488079071, - "rewards/chosen": -0.06363830715417862, - "rewards/margins": 6.300976753234863, - "rewards/rejected": -6.363671779632568, + "epoch": 0.32563025210084034, + "grad_norm": 9.907479286193848, + "learning_rate": 8.537694180692416e-07, + "logits/chosen": -3.998828172683716, + "logits/rejected": -3.8160157203674316, + "logps/chosen": -307.23126220703125, + "logps/rejected": -352.95001220703125, + "loss": 0.1391, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.946850597858429, + "rewards/margins": 6.853125095367432, + "rewards/rejected": -5.913769721984863, + "step": 465 + }, + { + "epoch": 0.3291316526610644, + "grad_norm": 6.668959617614746, + "learning_rate": 8.494238661390864e-07, + "logits/chosen": -4.036328315734863, + "logits/rejected": -3.852734327316284, + "logps/chosen": -341.2749938964844, + "logps/rejected": -377.70001220703125, + "loss": 0.1029, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": 0.559814453125, + "rewards/margins": 6.66796875, + "rewards/rejected": -6.106640815734863, "step": 470 }, { - "epoch": 0.6741573033707865, - "grad_norm": 0.6378122568130493, - "learning_rate": 1.4533506561564305e-07, - "logits/chosen": -3.2416014671325684, - "logits/rejected": -2.546093702316284, - "logps/chosen": -39.32500076293945, - "logps/rejected": -166.10000610351562, - "loss": 0.1712, - "rewards/accuracies": 0.7890625, - "rewards/chosen": 0.16349944472312927, - "rewards/margins": 6.383008003234863, - "rewards/rejected": -6.220312595367432, + "epoch": 0.3326330532212885, + "grad_norm": 4.653944492340088, + "learning_rate": 8.450261009675029e-07, + "logits/chosen": -4.055078029632568, + "logits/rejected": -3.8324217796325684, + "logps/chosen": -332.875, + "logps/rejected": -368.5249938964844, + "loss": 0.1133, + "rewards/accuracies": 0.96875, + "rewards/chosen": -0.8369506597518921, + "rewards/margins": 6.491796970367432, + "rewards/rejected": -7.325781345367432, + "step": 475 + }, + { + "epoch": 0.33613445378151263, + "grad_norm": 4.796830177307129, + "learning_rate": 8.405767796978544e-07, + "logits/chosen": -4.044921875, + "logits/rejected": -3.811328172683716, + "logps/chosen": -332.76251220703125, + "logps/rejected": -384.2250061035156, + "loss": 0.1567, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": -0.9822021722793579, + "rewards/margins": 6.829297065734863, + "rewards/rejected": -7.808203220367432, "step": 480 }, { - "epoch": 0.6882022471910112, - "grad_norm": 0.21734459698200226, - "learning_rate": 1.343210207389125e-07, - "logits/chosen": -3.216015577316284, - "logits/rejected": NaN, - "logps/chosen": -39.485939025878906, - "logps/rejected": -162.99374389648438, - "loss": 0.1864, - "rewards/accuracies": 0.7640625238418579, - "rewards/chosen": 0.08720092475414276, - "rewards/margins": 6.254492282867432, - "rewards/rejected": -6.165625095367432, + "epoch": 0.3396358543417367, + "grad_norm": 3.359666585922241, + "learning_rate": 8.360765671773602e-07, + "logits/chosen": -3.9605469703674316, + "logits/rejected": -3.8082032203674316, + "logps/chosen": -288.1499938964844, + "logps/rejected": -325.57501220703125, + "loss": 0.0853, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": -2.07318115234375, + "rewards/margins": 6.422265529632568, + "rewards/rejected": -8.494531631469727, + "step": 485 + }, + { + "epoch": 0.3431372549019608, + "grad_norm": 6.423489093780518, + "learning_rate": 8.315261358577484e-07, + "logits/chosen": -3.9214844703674316, + "logits/rejected": -3.7406249046325684, + "logps/chosen": -318.4312438964844, + "logps/rejected": -365.95001220703125, + "loss": 0.1519, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": -2.554150342941284, + "rewards/margins": 6.6875, + "rewards/rejected": -9.240819931030273, "step": 490 }, { - "epoch": 0.702247191011236, - "grad_norm": 0.3136639893054962, - "learning_rate": 1.2358565665550387e-07, - "logits/chosen": -3.216015577316284, - "logits/rejected": NaN, - "logps/chosen": -42.5078125, - "logps/rejected": -166.36874389648438, - "loss": 0.1841, - "rewards/accuracies": 0.7749999761581421, - "rewards/chosen": 0.017351532354950905, - "rewards/margins": 6.283398628234863, - "rewards/rejected": -6.266992092132568, + "epoch": 0.34663865546218486, + "grad_norm": 2.9223127365112305, + "learning_rate": 8.269261656947755e-07, + "logits/chosen": -3.9156250953674316, + "logits/rejected": -3.7308592796325684, + "logps/chosen": -343.4375, + "logps/rejected": -387.5, + "loss": 0.0976, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -2.646899461746216, + "rewards/margins": 6.696875095367432, + "rewards/rejected": -9.341015815734863, + "step": 495 + }, + { + "epoch": 0.35014005602240894, + "grad_norm": 6.421755790710449, + "learning_rate": 8.22277344046621e-07, + "logits/chosen": -3.90625, + "logits/rejected": -3.739453077316284, + "logps/chosen": -348.51251220703125, + "logps/rejected": -373.5249938964844, + "loss": 0.149, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.7789063453674316, + "rewards/margins": 5.763085842132568, + "rewards/rejected": -8.544921875, "step": 500 }, { - "epoch": 0.7162921348314607, - "grad_norm": 0.07860163599252701, - "learning_rate": 1.1315483579780094e-07, - "logits/chosen": -3.210742235183716, - "logits/rejected": -2.499218702316284, - "logps/chosen": -41.615623474121094, - "logps/rejected": -166.30624389648438, - "loss": 0.1831, - "rewards/accuracies": 0.7828124761581421, - "rewards/chosen": -0.01008453406393528, - "rewards/margins": 6.287304878234863, - "rewards/rejected": -6.297070503234863, + "epoch": 0.3536414565826331, + "grad_norm": 5.0626912117004395, + "learning_rate": 8.175803655711799e-07, + "logits/chosen": -3.9496092796325684, + "logits/rejected": -3.78125, + "logps/chosen": -323.8687438964844, + "logps/rejected": -360.26251220703125, + "loss": 0.1334, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": -1.853271484375, + "rewards/margins": 6.216015815734863, + "rewards/rejected": -8.064453125, + "step": 505 + }, + { + "epoch": 0.35714285714285715, + "grad_norm": 19.834314346313477, + "learning_rate": 8.1283593212226e-07, + "logits/chosen": -4.016406059265137, + "logits/rejected": -3.833203077316284, + "logps/chosen": -342.1000061035156, + "logps/rejected": -373.04998779296875, + "loss": 0.2158, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.6996338367462158, + "rewards/margins": 5.773046970367432, + "rewards/rejected": -7.474609375, "step": 510 }, { - "epoch": 0.7303370786516854, - "grad_norm": 0.173310786485672, - "learning_rate": 1.0305368692688174e-07, - "logits/chosen": -3.195507764816284, - "logits/rejected": -2.504101514816284, - "logps/chosen": -45.17656326293945, - "logps/rejected": -168.6062469482422, - "loss": 0.1724, - "rewards/accuracies": 0.7828124761581421, - "rewards/chosen": -0.14428405463695526, - "rewards/margins": 6.204297065734863, - "rewards/rejected": -6.347070217132568, + "epoch": 0.36064425770308123, + "grad_norm": 6.62537145614624, + "learning_rate": 8.080447526447078e-07, + "logits/chosen": -4.09765625, + "logits/rejected": -3.869921922683716, + "logps/chosen": -325.82501220703125, + "logps/rejected": -370.8999938964844, + "loss": 0.2546, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": -0.572802722454071, + "rewards/margins": 6.523633003234863, + "rewards/rejected": -7.099609375, + "step": 515 + }, + { + "epoch": 0.3641456582633053, + "grad_norm": 6.350283622741699, + "learning_rate": 8.032075430684724e-07, + "logits/chosen": -4.087500095367432, + "logits/rejected": -3.8460936546325684, + "logps/chosen": -323.48748779296875, + "logps/rejected": -367.32501220703125, + "loss": 0.133, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": -0.2872070372104645, + "rewards/margins": 7.222460746765137, + "rewards/rejected": -7.507421970367432, "step": 520 }, { - "epoch": 0.7443820224719101, - "grad_norm": 0.3090410828590393, - "learning_rate": 9.330654459513266e-08, - "logits/chosen": -3.169726610183716, - "logits/rejected": -2.4736328125, - "logps/chosen": -46.984375, - "logps/rejected": -162.36874389648438, - "loss": 0.2013, - "rewards/accuracies": 0.7515624761581421, - "rewards/chosen": -0.3089355528354645, - "rewards/margins": 5.836718559265137, - "rewards/rejected": -6.145312309265137, + "epoch": 0.36764705882352944, + "grad_norm": 3.5338668823242188, + "learning_rate": 7.983250262016275e-07, + "logits/chosen": -4.032812595367432, + "logits/rejected": -3.787890672683716, + "logps/chosen": -334.2250061035156, + "logps/rejected": -380.45001220703125, + "loss": 0.0534, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": -0.2791503965854645, + "rewards/margins": 8.095507621765137, + "rewards/rejected": -8.372265815734863, + "step": 525 + }, + { + "epoch": 0.3711484593837535, + "grad_norm": 3.9917259216308594, + "learning_rate": 7.933979316223631e-07, + "logits/chosen": -3.942187547683716, + "logits/rejected": -3.7660155296325684, + "logps/chosen": -311.01251220703125, + "logps/rejected": -357.2250061035156, + "loss": 0.1091, + "rewards/accuracies": 0.96875, + "rewards/chosen": -0.9442993402481079, + "rewards/margins": 7.354687690734863, + "rewards/rejected": -8.307812690734863, "step": 530 }, { - "epoch": 0.7584269662921348, - "grad_norm": 0.249381884932518, - "learning_rate": 8.393689052217964e-08, - "logits/chosen": -3.1357421875, - "logits/rejected": -2.484179735183716, - "logps/chosen": -49.49687576293945, - "logps/rejected": -167.35000610351562, - "loss": 0.1759, - "rewards/accuracies": 0.770312488079071, - "rewards/chosen": -0.393869012594223, - "rewards/margins": 5.953320503234863, - "rewards/rejected": -6.347460746765137, + "epoch": 0.3746498599439776, + "grad_norm": 9.341059684753418, + "learning_rate": 7.884269955699687e-07, + "logits/chosen": -3.94921875, + "logits/rejected": -3.740234375, + "logps/chosen": -311.42498779296875, + "logps/rejected": -348.1499938964844, + "loss": 0.1043, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -0.904492199420929, + "rewards/margins": 6.862109184265137, + "rewards/rejected": -7.762499809265137, + "step": 535 + }, + { + "epoch": 0.37815126050420167, + "grad_norm": 13.346263885498047, + "learning_rate": 7.834129608348181e-07, + "logits/chosen": -3.950000047683716, + "logits/rejected": -3.7269530296325684, + "logps/chosen": -337.54998779296875, + "logps/rejected": -391.3500061035156, + "loss": 0.1693, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": -0.5969909429550171, + "rewards/margins": 7.047656059265137, + "rewards/rejected": -7.643945217132568, "step": 540 }, { - "epoch": 0.7724719101123596, - "grad_norm": 0.2429146021604538, - "learning_rate": 7.49672970253691e-08, - "logits/chosen": -3.1318359375, - "logits/rejected": -2.4886717796325684, - "logps/chosen": -52.857810974121094, - "logps/rejected": -171.19375610351562, - "loss": 0.1722, - "rewards/accuracies": 0.7953125238418579, - "rewards/chosen": -0.4801391661167145, - "rewards/margins": 5.990429878234863, - "rewards/rejected": -6.468359470367432, + "epoch": 0.38165266106442575, + "grad_norm": 6.5441107749938965, + "learning_rate": 7.783565766473776e-07, + "logits/chosen": -3.940624952316284, + "logits/rejected": -3.7562499046325684, + "logps/chosen": -308.6312561035156, + "logps/rejected": -345.5375061035156, + "loss": 0.0949, + "rewards/accuracies": 0.96875, + "rewards/chosen": -1.02398681640625, + "rewards/margins": 6.360937595367432, + "rewards/rejected": -7.38427734375, + "step": 545 + }, + { + "epoch": 0.3851540616246499, + "grad_norm": 2.8823161125183105, + "learning_rate": 7.732585985662509e-07, + "logits/chosen": -4.045702934265137, + "logits/rejected": -3.8296875953674316, + "logps/chosen": -317.6875, + "logps/rejected": -366.54998779296875, + "loss": 0.1415, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.16395263373851776, + "rewards/margins": 6.898633003234863, + "rewards/rejected": -6.735644340515137, "step": 550 }, { - "epoch": 0.7865168539325843, - "grad_norm": 0.5875958204269409, - "learning_rate": 6.641937264107867e-08, - "logits/chosen": -3.171093702316284, - "logits/rejected": -2.4808592796325684, - "logps/chosen": -47.681251525878906, - "logps/rejected": -174.71875, - "loss": 0.1605, - "rewards/accuracies": 0.7984374761581421, - "rewards/chosen": -0.26392096281051636, - "rewards/margins": 6.382421970367432, - "rewards/rejected": -6.64453125, + "epoch": 0.38865546218487396, + "grad_norm": 9.686930656433105, + "learning_rate": 7.681197883652779e-07, + "logits/chosen": -4.041406154632568, + "logits/rejected": -3.8304686546325684, + "logps/chosen": -309.0375061035156, + "logps/rejected": -351.04998779296875, + "loss": 0.0839, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.4409423768520355, + "rewards/margins": 6.984765529632568, + "rewards/rejected": -6.541894435882568, + "step": 555 + }, + { + "epoch": 0.39215686274509803, + "grad_norm": 10.684289932250977, + "learning_rate": 7.629409139197062e-07, + "logits/chosen": -3.9789061546325684, + "logits/rejected": -3.748828172683716, + "logps/chosen": -335.8500061035156, + "logps/rejected": -377.6499938964844, + "loss": 0.0957, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.0236084461212158, + "rewards/margins": 7.436327934265137, + "rewards/rejected": -8.459375381469727, "step": 560 }, { - "epoch": 0.800561797752809, - "grad_norm": 0.37553030252456665, - "learning_rate": 5.831371006785962e-08, - "logits/chosen": -3.1826171875, - "logits/rejected": -2.4847655296325684, - "logps/chosen": -48.498435974121094, - "logps/rejected": -169.1125030517578, - "loss": 0.1773, - "rewards/accuracies": 0.7749999761581421, - "rewards/chosen": -0.3075141906738281, - "rewards/margins": 6.108984470367432, - "rewards/rejected": -6.417578220367432, + "epoch": 0.3956582633053221, + "grad_norm": 40.065433502197266, + "learning_rate": 7.577227490914494e-07, + "logits/chosen": -4.010546684265137, + "logits/rejected": -3.7425780296325684, + "logps/chosen": -329.25, + "logps/rejected": -394.92498779296875, + "loss": 0.1063, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -0.4480224549770355, + "rewards/margins": 8.193359375, + "rewards/rejected": -8.646484375, + "step": 565 + }, + { + "epoch": 0.39915966386554624, + "grad_norm": 9.474442481994629, + "learning_rate": 7.52466073613452e-07, + "logits/chosen": -3.9765625, + "logits/rejected": -3.7738280296325684, + "logps/chosen": -309.57501220703125, + "logps/rejected": -359.4750061035156, + "loss": 0.0688, + "rewards/accuracies": 0.96875, + "rewards/chosen": 0.10063476860523224, + "rewards/margins": 7.375390529632568, + "rewards/rejected": -7.2734375, "step": 570 }, { - "epoch": 0.8146067415730337, - "grad_norm": 0.31017985939979553, - "learning_rate": 5.066983655682325e-08, - "logits/chosen": -3.141796827316284, - "logits/rejected": -2.505859375, - "logps/chosen": -50.5703125, - "logps/rejected": -166.83749389648438, - "loss": 0.1899, - "rewards/accuracies": 0.7671874761581421, - "rewards/chosen": -0.4396209716796875, - "rewards/margins": 5.888671875, - "rewards/rejected": -6.325585842132568, + "epoch": 0.4026610644257703, + "grad_norm": 10.86375904083252, + "learning_rate": 7.471716729731763e-07, + "logits/chosen": -3.9957032203674316, + "logits/rejected": -3.723828077316284, + "logps/chosen": -303.45001220703125, + "logps/rejected": -363.125, + "loss": 0.0727, + "rewards/accuracies": 0.96875, + "rewards/chosen": 1.292626976966858, + "rewards/margins": 9.460546493530273, + "rewards/rejected": -8.175000190734863, + "step": 575 + }, + { + "epoch": 0.4061624649859944, + "grad_norm": 10.98038101196289, + "learning_rate": 7.418403382952292e-07, + "logits/chosen": -4.019140720367432, + "logits/rejected": -3.759765625, + "logps/chosen": -297.83123779296875, + "logps/rejected": -359.3500061035156, + "loss": 0.1873, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": 1.3828125, + "rewards/margins": 8.557812690734863, + "rewards/rejected": -7.180371284484863, "step": 580 }, { - "epoch": 0.8286516853932584, - "grad_norm": 0.8791071772575378, - "learning_rate": 4.3506166868781755e-08, - "logits/chosen": -3.1845703125, - "logits/rejected": -2.4937500953674316, - "logps/chosen": -48.103126525878906, - "logps/rejected": -170.8625030517578, - "loss": 0.1698, - "rewards/accuracies": 0.785937488079071, - "rewards/chosen": -0.27438658475875854, - "rewards/margins": 6.2060546875, - "rewards/rejected": -6.482421875, + "epoch": 0.4096638655462185, + "grad_norm": 7.4733500480651855, + "learning_rate": 7.364728662231483e-07, + "logits/chosen": -4.013281345367432, + "logits/rejected": -3.744921922683716, + "logps/chosen": -317.73748779296875, + "logps/rejected": -381.51251220703125, + "loss": 0.0816, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.47197264432907104, + "rewards/margins": 9.052343368530273, + "rewards/rejected": -8.585156440734863, + "step": 585 + }, + { + "epoch": 0.41316526610644255, + "grad_norm": 6.493350028991699, + "learning_rate": 7.310700588003605e-07, + "logits/chosen": -4.008203029632568, + "logits/rejected": -3.751171827316284, + "logps/chosen": -319.67498779296875, + "logps/rejected": -392.48748779296875, + "loss": 0.0859, + "rewards/accuracies": 0.96875, + "rewards/chosen": 0.6444946527481079, + "rewards/margins": 8.985156059265137, + "rewards/rejected": -8.343066215515137, "step": 590 }, { - "epoch": 0.8426966292134831, - "grad_norm": 0.6104283928871155, - "learning_rate": 3.683995891147695e-08, - "logits/chosen": -3.176953077316284, - "logits/rejected": NaN, - "logps/chosen": -45.896873474121094, - "logps/rejected": -168.60000610351562, - "loss": 0.1721, - "rewards/accuracies": 0.784375011920929, - "rewards/chosen": -0.21481475234031677, - "rewards/margins": 6.201952934265137, - "rewards/rejected": -6.416015625, + "epoch": 0.4166666666666667, + "grad_norm": 5.3325514793396, + "learning_rate": 7.256327233503364e-07, + "logits/chosen": -4.022656440734863, + "logits/rejected": -3.778125047683716, + "logps/chosen": -324.8687438964844, + "logps/rejected": -389.88751220703125, + "loss": 0.0995, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -1.2916259765625, + "rewards/margins": 8.142187118530273, + "rewards/rejected": -9.428418159484863, + "step": 595 + }, + { + "epoch": 0.42016806722689076, + "grad_norm": 9.191666603088379, + "learning_rate": 7.201616723559547e-07, + "logits/chosen": -4.094531059265137, + "logits/rejected": -3.783203125, + "logps/chosen": -308.5625, + "logps/rejected": -366.61248779296875, + "loss": 0.0918, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.4765869081020355, + "rewards/margins": 8.8955078125, + "rewards/rejected": -8.419530868530273, "step": 600 }, { - "epoch": 0.8567415730337079, - "grad_norm": 0.31073251366615295, - "learning_rate": 3.0687272163768986e-08, - "logits/chosen": -3.1650390625, - "logits/rejected": -2.513671875, - "logps/chosen": -46.66093826293945, - "logps/rejected": -170.8625030517578, - "loss": 0.1649, - "rewards/accuracies": 0.7875000238418579, - "rewards/chosen": -0.24439087510108948, - "rewards/margins": 6.258593559265137, - "rewards/rejected": -6.502148628234863, + "epoch": 0.42366946778711484, + "grad_norm": 6.363277435302734, + "learning_rate": 7.146577233380952e-07, + "logits/chosen": -4.1796875, + "logits/rejected": -3.88671875, + "logps/chosen": -320.67498779296875, + "logps/rejected": -369.1499938964844, + "loss": 0.1508, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": 0.5018554925918579, + "rewards/margins": 7.989453315734863, + "rewards/rejected": -7.481738090515137, + "step": 605 + }, + { + "epoch": 0.4271708683473389, + "grad_norm": 8.64117431640625, + "learning_rate": 7.091216987334791e-07, + "logits/chosen": -4.275000095367432, + "logits/rejected": -3.9605469703674316, + "logps/chosen": -324.36248779296875, + "logps/rejected": -378.2124938964844, + "loss": 0.1249, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 0.914660632610321, + "rewards/margins": 8.302734375, + "rewards/rejected": -7.39453125, "step": 610 }, { - "epoch": 0.8707865168539326, - "grad_norm": 0.2421959638595581, - "learning_rate": 2.5062928986944676e-08, - "logits/chosen": -3.170117139816284, - "logits/rejected": -2.5238280296325684, - "logps/chosen": -46.318748474121094, - "logps/rejected": -171.50625610351562, - "loss": 0.1595, - "rewards/accuracies": 0.7953125238418579, - "rewards/chosen": -0.15370789170265198, - "rewards/margins": 6.348242282867432, - "rewards/rejected": -6.501953125, + "epoch": 0.43067226890756305, + "grad_norm": 5.129154682159424, + "learning_rate": 7.035544257717761e-07, + "logits/chosen": -4.269921779632568, + "logits/rejected": -4.025781154632568, + "logps/chosen": -281.7250061035156, + "logps/rejected": -344.3500061035156, + "loss": 0.111, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.6723266839981079, + "rewards/margins": 7.028906345367432, + "rewards/rejected": -6.359179496765137, + "step": 615 + }, + { + "epoch": 0.4341736694677871, + "grad_norm": 4.3600969314575195, + "learning_rate": 6.979567363519926e-07, + "logits/chosen": -4.233593940734863, + "logits/rejected": -4.010937690734863, + "logps/chosen": -288.03125, + "logps/rejected": -341.3500061035156, + "loss": 0.0781, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.39274901151657104, + "rewards/margins": 6.916796684265137, + "rewards/rejected": -6.522265434265137, "step": 620 }, { - "epoch": 0.8848314606741573, - "grad_norm": 0.292059987783432, - "learning_rate": 1.9980478916351296e-08, - "logits/chosen": -3.185742139816284, - "logits/rejected": -2.491992235183716, - "logps/chosen": -46.803123474121094, - "logps/rejected": -171.0437469482422, - "loss": 0.1893, - "rewards/accuracies": 0.778124988079071, - "rewards/chosen": -0.1725509613752365, - "rewards/margins": 6.320703029632568, - "rewards/rejected": -6.491796970367432, + "epoch": 0.4376750700280112, + "grad_norm": 10.305990219116211, + "learning_rate": 6.923294669181659e-07, + "logits/chosen": -4.236328125, + "logits/rejected": -3.9925780296325684, + "logps/chosen": -301.6875, + "logps/rejected": -345.95001220703125, + "loss": 0.1325, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 1.3890869617462158, + "rewards/margins": 7.2578125, + "rewards/rejected": -5.867968559265137, + "step": 625 + }, + { + "epoch": 0.4411764705882353, + "grad_norm": 1.9760684967041016, + "learning_rate": 6.866734583343751e-07, + "logits/chosen": -4.197656154632568, + "logits/rejected": -3.9476561546325684, + "logps/chosen": -322.34375, + "logps/rejected": -374.51251220703125, + "loss": 0.073, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.05327148362994194, + "rewards/margins": 7.111328125, + "rewards/rejected": -7.168749809265137, "step": 630 }, { - "epoch": 0.898876404494382, - "grad_norm": 0.6732786297798157, - "learning_rate": 1.5452166019378987e-08, - "logits/chosen": -3.2027344703674316, - "logits/rejected": -2.5123047828674316, - "logps/chosen": -46.25, - "logps/rejected": -172.4250030517578, - "loss": 0.1712, - "rewards/accuracies": 0.793749988079071, - "rewards/chosen": -0.16961669921875, - "rewards/margins": 6.3662109375, - "rewards/rejected": -6.537890434265137, + "epoch": 0.44467787114845936, + "grad_norm": 5.447768688201904, + "learning_rate": 6.809895557590949e-07, + "logits/chosen": -4.140234470367432, + "logits/rejected": -3.901562452316284, + "logps/chosen": -305.3999938964844, + "logps/rejected": -357.5, + "loss": 0.1295, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": 0.04921875149011612, + "rewards/margins": 7.282812595367432, + "rewards/rejected": -7.23388671875, + "step": 635 + }, + { + "epoch": 0.4481792717086835, + "grad_norm": 11.747628211975098, + "learning_rate": 6.752786085189059e-07, + "logits/chosen": -4.054296970367432, + "logits/rejected": -3.833203077316284, + "logps/chosen": -325.125, + "logps/rejected": -365.7749938964844, + "loss": 0.143, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.105444312095642, + "rewards/margins": 7.122265815734863, + "rewards/rejected": -8.227734565734863, "step": 640 }, { - "epoch": 0.9129213483146067, - "grad_norm": 0.25237613916397095, - "learning_rate": 1.1488899398429896e-08, - "logits/chosen": -3.2007813453674316, - "logits/rejected": -2.5160155296325684, - "logps/chosen": -43.58906173706055, - "logps/rejected": -170.09375, - "loss": 0.1787, - "rewards/accuracies": 0.7749999761581421, - "rewards/chosen": -0.10042724758386612, - "rewards/margins": 6.383008003234863, - "rewards/rejected": -6.482421875, + "epoch": 0.45168067226890757, + "grad_norm": 9.866388320922852, + "learning_rate": 6.695414699815826e-07, + "logits/chosen": -4.037499904632568, + "logits/rejected": -3.8316407203674316, + "logps/chosen": -269.61248779296875, + "logps/rejected": -324.20001220703125, + "loss": 0.0917, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.7059570550918579, + "rewards/margins": 7.22265625, + "rewards/rejected": -7.929296970367432, + "step": 645 + }, + { + "epoch": 0.45518207282913165, + "grad_norm": 3.863745927810669, + "learning_rate": 6.637789974285779e-07, + "logits/chosen": -4.09375, + "logits/rejected": -3.8394532203674316, + "logps/chosen": -295.75, + "logps/rejected": -346.57501220703125, + "loss": 0.0851, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 2.2662353515625, + "rewards/margins": 7.594531059265137, + "rewards/rejected": -5.324511528015137, "step": 650 }, { - "epoch": 0.9269662921348315, - "grad_norm": 0.09695342183113098, - "learning_rate": 8.100226909935059e-09, - "logits/chosen": -3.2056641578674316, - "logits/rejected": -2.5228514671325684, - "logps/chosen": -43.27656173706055, - "logps/rejected": -166.16250610351562, - "loss": 0.1876, - "rewards/accuracies": 0.7718750238418579, - "rewards/chosen": -0.07465209811925888, - "rewards/margins": 6.197656154632568, - "rewards/rejected": -6.272265434265137, + "epoch": 0.4586834733893557, + "grad_norm": 6.13060998916626, + "learning_rate": 6.579920519269218e-07, + "logits/chosen": -4.114843845367432, + "logits/rejected": -3.8179688453674316, + "logps/chosen": -297.4624938964844, + "logps/rejected": -366.67498779296875, + "loss": 0.1536, + "rewards/accuracies": 0.9312499761581421, + "rewards/chosen": 1.949853539466858, + "rewards/margins": 8.067968368530273, + "rewards/rejected": -6.114843845367432, + "step": 655 + }, + { + "epoch": 0.46218487394957986, + "grad_norm": 6.907744884490967, + "learning_rate": 6.521814982005552e-07, + "logits/chosen": -4.055468559265137, + "logits/rejected": -3.8070311546325684, + "logps/chosen": -276.32501220703125, + "logps/rejected": -336.20001220703125, + "loss": 0.1828, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": 1.5094726085662842, + "rewards/margins": 7.813281059265137, + "rewards/rejected": -6.301660060882568, "step": 660 }, { - "epoch": 0.9410112359550562, - "grad_norm": 0.15045014023780823, - "learning_rate": 5.2943121627319346e-09, - "logits/chosen": -3.207812547683716, - "logits/rejected": -2.5152344703674316, - "logps/chosen": -43.06562423706055, - "logps/rejected": -166.84375, - "loss": 0.1818, - "rewards/accuracies": 0.770312488079071, - "rewards/chosen": -0.07712707668542862, - "rewards/margins": 6.268164157867432, - "rewards/rejected": -6.342382907867432, + "epoch": 0.46568627450980393, + "grad_norm": 6.405279636383057, + "learning_rate": 6.463482045011171e-07, + "logits/chosen": -3.948437452316284, + "logits/rejected": -3.7085938453674316, + "logps/chosen": -301.625, + "logps/rejected": -353.82501220703125, + "loss": 0.0832, + "rewards/accuracies": 0.96875, + "rewards/chosen": 1.435766577720642, + "rewards/margins": 7.907031059265137, + "rewards/rejected": -6.471289157867432, + "step": 665 + }, + { + "epoch": 0.469187675070028, + "grad_norm": 5.61862850189209, + "learning_rate": 6.404930424782052e-07, + "logits/chosen": -3.858203172683716, + "logits/rejected": -3.6597657203674316, + "logps/chosen": -322.6187438964844, + "logps/rejected": -367.32501220703125, + "loss": 0.2208, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": -1.5012023448944092, + "rewards/margins": 7.526757717132568, + "rewards/rejected": -9.029296875, "step": 670 }, { - "epoch": 0.9550561797752809, - "grad_norm": 0.31066492199897766, - "learning_rate": 3.077914851215585e-09, - "logits/chosen": -3.2230467796325684, - "logits/rejected": -2.5365233421325684, - "logps/chosen": -44.46562576293945, - "logps/rejected": -170.8625030517578, - "loss": 0.1733, - "rewards/accuracies": 0.7875000238418579, - "rewards/chosen": -0.07046356052160263, - "rewards/margins": 6.385156154632568, - "rewards/rejected": -6.457812309265137, + "epoch": 0.4726890756302521, + "grad_norm": 10.052380561828613, + "learning_rate": 6.346168870491273e-07, + "logits/chosen": -3.8499999046325684, + "logits/rejected": -3.655468702316284, + "logps/chosen": -309.7124938964844, + "logps/rejected": -365.9750061035156, + "loss": 0.1371, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": 0.228515625, + "rewards/margins": 7.833203315734863, + "rewards/rejected": -7.608105659484863, + "step": 675 + }, + { + "epoch": 0.47619047619047616, + "grad_norm": 7.156270980834961, + "learning_rate": 6.287206162681662e-07, + "logits/chosen": -3.8675780296325684, + "logits/rejected": -3.66015625, + "logps/chosen": -318.625, + "logps/rejected": -382.82501220703125, + "loss": 0.1899, + "rewards/accuracies": 0.9312499761581421, + "rewards/chosen": -0.8764892816543579, + "rewards/margins": 8.061426162719727, + "rewards/rejected": -8.932031631469727, "step": 680 }, { - "epoch": 0.9691011235955056, - "grad_norm": 0.11322323232889175, - "learning_rate": 1.4563744706429514e-09, - "logits/chosen": -3.216015577316284, - "logits/rejected": -2.50390625, - "logps/chosen": -41.93281173706055, - "logps/rejected": -168.16250610351562, - "loss": 0.1693, - "rewards/accuracies": 0.778124988079071, - "rewards/chosen": -0.01323547400534153, - "rewards/margins": 6.383593559265137, - "rewards/rejected": -6.3974609375, + "epoch": 0.4796918767507003, + "grad_norm": 6.979720115661621, + "learning_rate": 6.228051111953742e-07, + "logits/chosen": -3.839062452316284, + "logits/rejected": -3.701953172683716, + "logps/chosen": -318.3999938964844, + "logps/rejected": -366.8500061035156, + "loss": 0.163, + "rewards/accuracies": 0.9312499761581421, + "rewards/chosen": -0.12661132216453552, + "rewards/margins": 7.118750095367432, + "rewards/rejected": -7.241406440734863, + "step": 685 + }, + { + "epoch": 0.4831932773109244, + "grad_norm": 11.366744041442871, + "learning_rate": 6.168712557649193e-07, + "logits/chosen": -3.895312547683716, + "logits/rejected": -3.709765672683716, + "logps/chosen": -302.20001220703125, + "logps/rejected": -355.625, + "loss": 0.1296, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.614208996295929, + "rewards/margins": 8.010937690734863, + "rewards/rejected": -7.394921779632568, "step": 690 }, { - "epoch": 0.9831460674157303, - "grad_norm": 0.28659555315971375, - "learning_rate": 4.3359745382104405e-10, - "logits/chosen": -3.21484375, - "logits/rejected": NaN, - "logps/chosen": -42.96406173706055, - "logps/rejected": -169.0, - "loss": 0.1708, - "rewards/accuracies": 0.7749999761581421, - "rewards/chosen": -0.045589447021484375, - "rewards/margins": 6.397265434265137, - "rewards/rejected": -6.444140434265137, + "epoch": 0.48669467787114845, + "grad_norm": 8.56261920928955, + "learning_rate": 6.109199366530035e-07, + "logits/chosen": -3.893749952316284, + "logits/rejected": -3.7281250953674316, + "logps/chosen": -321.76251220703125, + "logps/rejected": -376.7250061035156, + "loss": 0.0851, + "rewards/accuracies": 0.96875, + "rewards/chosen": 0.7763427495956421, + "rewards/margins": 7.8515625, + "rewards/rejected": -7.073632717132568, + "step": 695 + }, + { + "epoch": 0.49019607843137253, + "grad_norm": 5.708214282989502, + "learning_rate": 6.049520431453666e-07, + "logits/chosen": -3.9175782203674316, + "logits/rejected": -3.755078077316284, + "logps/chosen": -321.2250061035156, + "logps/rejected": -367.1000061035156, + "loss": 0.0858, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.5287841558456421, + "rewards/margins": 7.557812690734863, + "rewards/rejected": -7.0302734375, "step": 700 }, { - "epoch": 0.9971910112359551, - "grad_norm": 0.5906934142112732, - "learning_rate": 1.2047760167999133e-11, - "logits/chosen": -3.2037110328674316, - "logits/rejected": -2.529101610183716, - "logps/chosen": -44.228126525878906, - "logps/rejected": -170.83749389648438, - "loss": 0.1657, - "rewards/accuracies": 0.7984374761581421, - "rewards/chosen": -0.09162139892578125, - "rewards/margins": 6.3896484375, - "rewards/rejected": -6.482421875, + "epoch": 0.49369747899159666, + "grad_norm": 11.397648811340332, + "learning_rate": 5.989684670044059e-07, + "logits/chosen": -3.911328077316284, + "logits/rejected": -3.764453172683716, + "logps/chosen": -308.01251220703125, + "logps/rejected": -362.76251220703125, + "loss": 0.1002, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": 0.741455078125, + "rewards/margins": 7.514062404632568, + "rewards/rejected": -6.768750190734863, + "step": 705 + }, + { + "epoch": 0.49719887955182074, + "grad_norm": 6.1462860107421875, + "learning_rate": 5.929701023359229e-07, + "logits/chosen": -3.9410157203674316, + "logits/rejected": -3.764843702316284, + "logps/chosen": -292.5375061035156, + "logps/rejected": -352.04998779296875, + "loss": 0.1366, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.510815441608429, + "rewards/margins": 7.702343940734863, + "rewards/rejected": -7.189648628234863, "step": 710 }, + { + "epoch": 0.5007002801120448, + "grad_norm": 5.117457866668701, + "learning_rate": 5.86957845455518e-07, + "logits/chosen": -3.962890625, + "logits/rejected": -3.7679686546325684, + "logps/chosen": -310.33123779296875, + "logps/rejected": -368.1625061035156, + "loss": 0.1361, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": 0.5436767339706421, + "rewards/margins": 7.6328125, + "rewards/rejected": -7.092187404632568, + "step": 715 + }, + { + "epoch": 0.5042016806722689, + "grad_norm": 7.32170295715332, + "learning_rate": 5.809325947546595e-07, + "logits/chosen": -4.034765720367432, + "logits/rejected": -3.8238282203674316, + "logps/chosen": -320.0249938964844, + "logps/rejected": -375.1499938964844, + "loss": 0.1057, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.6477905511856079, + "rewards/margins": 7.616796970367432, + "rewards/rejected": -6.969922065734863, + "step": 720 + }, + { + "epoch": 0.507703081232493, + "grad_norm": 1.7897793054580688, + "learning_rate": 5.748952505664384e-07, + "logits/chosen": -4.041406154632568, + "logits/rejected": -3.8167967796325684, + "logps/chosen": -287.5625, + "logps/rejected": -352.625, + "loss": 0.0551, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": 0.556073009967804, + "rewards/margins": 8.061718940734863, + "rewards/rejected": -7.5107421875, + "step": 725 + }, + { + "epoch": 0.511204481792717, + "grad_norm": 6.413627624511719, + "learning_rate": 5.688467150310352e-07, + "logits/chosen": -4.096484184265137, + "logits/rejected": -3.844921827316284, + "logps/chosen": -315.6187438964844, + "logps/rejected": -367.70001220703125, + "loss": 0.1556, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": 0.9060913324356079, + "rewards/margins": 7.874218940734863, + "rewards/rejected": -6.9638671875, + "step": 730 + }, + { + "epoch": 0.5147058823529411, + "grad_norm": 6.02368688583374, + "learning_rate": 5.627878919609162e-07, + "logits/chosen": -4.142187595367432, + "logits/rejected": -3.893749952316284, + "logps/chosen": -342.5249938964844, + "logps/rejected": -383.4750061035156, + "loss": 0.0987, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 0.24305419623851776, + "rewards/margins": 7.592187404632568, + "rewards/rejected": -7.352246284484863, + "step": 735 + }, + { + "epoch": 0.5182072829131653, + "grad_norm": 9.392160415649414, + "learning_rate": 5.567196867057792e-07, + "logits/chosen": -4.110547065734863, + "logits/rejected": -3.856640577316284, + "logps/chosen": -327.32501220703125, + "logps/rejected": -368.6000061035156, + "loss": 0.0851, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": -1.1859862804412842, + "rewards/margins": 7.625781059265137, + "rewards/rejected": -8.806836128234863, + "step": 740 + }, + { + "epoch": 0.5217086834733894, + "grad_norm": 7.507225513458252, + "learning_rate": 5.506430060172713e-07, + "logits/chosen": -4.096875190734863, + "logits/rejected": -3.839062452316284, + "logps/chosen": -352.70001220703125, + "logps/rejected": -391.20001220703125, + "loss": 0.1142, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.350988745689392, + "rewards/margins": 7.507421970367432, + "rewards/rejected": -8.854223251342773, + "step": 745 + }, + { + "epoch": 0.5252100840336135, + "grad_norm": 6.702646255493164, + "learning_rate": 5.445587579134949e-07, + "logits/chosen": -4.087109565734863, + "logits/rejected": -3.849609375, + "logps/chosen": -300.8062438964844, + "logps/rejected": -347.36248779296875, + "loss": 0.1031, + "rewards/accuracies": 0.96875, + "rewards/chosen": -0.7536376714706421, + "rewards/margins": 7.333203315734863, + "rewards/rejected": -8.081640243530273, + "step": 750 + }, + { + "epoch": 0.5287114845938375, + "grad_norm": 2.128941774368286, + "learning_rate": 5.38467851543326e-07, + "logits/chosen": -4.013671875, + "logits/rejected": -3.795703172683716, + "logps/chosen": -322.1499938964844, + "logps/rejected": -377.54998779296875, + "loss": 0.0809, + "rewards/accuracies": 0.96875, + "rewards/chosen": -0.7710937261581421, + "rewards/margins": 7.187890529632568, + "rewards/rejected": -7.957421779632568, + "step": 755 + }, + { + "epoch": 0.5322128851540616, + "grad_norm": 6.914889335632324, + "learning_rate": 5.323711970505627e-07, + "logits/chosen": -3.9925780296325684, + "logits/rejected": -3.772265672683716, + "logps/chosen": -298.3125, + "logps/rejected": -349.9375, + "loss": 0.1101, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -0.8685058355331421, + "rewards/margins": 7.3203125, + "rewards/rejected": -8.191991806030273, + "step": 760 + }, + { + "epoch": 0.5357142857142857, + "grad_norm": 6.70955753326416, + "learning_rate": 5.262697054379268e-07, + "logits/chosen": -3.979687452316284, + "logits/rejected": -3.748046875, + "logps/chosen": -283.38751220703125, + "logps/rejected": -351.4750061035156, + "loss": 0.095, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": 1.0792236328125, + "rewards/margins": 7.942968845367432, + "rewards/rejected": -6.858788967132568, + "step": 765 + }, + { + "epoch": 0.5392156862745098, + "grad_norm": 36.12495422363281, + "learning_rate": 5.201642884309341e-07, + "logits/chosen": -3.9632811546325684, + "logits/rejected": -3.719921827316284, + "logps/chosen": -304.48748779296875, + "logps/rejected": -369.1499938964844, + "loss": 0.157, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": 1.238244652748108, + "rewards/margins": 8.524023056030273, + "rewards/rejected": -7.287499904632568, + "step": 770 + }, + { + "epoch": 0.5427170868347339, + "grad_norm": 14.315747261047363, + "learning_rate": 5.140558583416589e-07, + "logits/chosen": -3.999218702316284, + "logits/rejected": -3.7222657203674316, + "logps/chosen": -316.3999938964844, + "logps/rejected": -378.625, + "loss": 0.1734, + "rewards/accuracies": 0.96875, + "rewards/chosen": -0.00820312462747097, + "rewards/margins": 8.339062690734863, + "rewards/rejected": -8.352734565734863, + "step": 775 + }, + { + "epoch": 0.5462184873949579, + "grad_norm": 1.9824923276901245, + "learning_rate": 5.079453279324109e-07, + "logits/chosen": -3.9703125953674316, + "logits/rejected": -3.725781202316284, + "logps/chosen": -296.875, + "logps/rejected": -363.82501220703125, + "loss": 0.1622, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.08635254204273224, + "rewards/margins": 8.248437881469727, + "rewards/rejected": -8.337890625, + "step": 780 + }, + { + "epoch": 0.5497198879551821, + "grad_norm": 7.089279651641846, + "learning_rate": 5.018336102793433e-07, + "logits/chosen": -3.955859422683716, + "logits/rejected": -3.7679686546325684, + "logps/chosen": -265.8062438964844, + "logps/rejected": -320.20001220703125, + "loss": 0.1163, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": 1.5814697742462158, + "rewards/margins": 7.352734565734863, + "rewards/rejected": -5.773046970367432, + "step": 785 + }, + { + "epoch": 0.5532212885154062, + "grad_norm": 2.578310251235962, + "learning_rate": 4.957216186360146e-07, + "logits/chosen": -3.994140625, + "logits/rejected": -3.713671922683716, + "logps/chosen": -300.375, + "logps/rejected": -361.125, + "loss": 0.0588, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": 2.330676317214966, + "rewards/margins": 8.728124618530273, + "rewards/rejected": -6.393945217132568, + "step": 790 + }, + { + "epoch": 0.5567226890756303, + "grad_norm": 8.720524787902832, + "learning_rate": 4.896102662969258e-07, + "logits/chosen": -3.991406202316284, + "logits/rejected": -3.7593750953674316, + "logps/chosen": -297.4375, + "logps/rejected": -344.1875, + "loss": 0.0957, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.1497802734375, + "rewards/margins": 7.647656440734863, + "rewards/rejected": -5.4951171875, + "step": 795 + }, + { + "epoch": 0.5602240896358543, + "grad_norm": 6.386475086212158, + "learning_rate": 4.835004664610481e-07, + "logits/chosen": -3.94140625, + "logits/rejected": -3.7203125953674316, + "logps/chosen": -287.70001220703125, + "logps/rejected": -344.70001220703125, + "loss": 0.1196, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.8922485113143921, + "rewards/margins": 7.640234470367432, + "rewards/rejected": -6.748437404632568, + "step": 800 + }, + { + "epoch": 0.5637254901960784, + "grad_norm": 3.925546646118164, + "learning_rate": 4.773931320953675e-07, + "logits/chosen": -3.856250047683716, + "logits/rejected": -3.660937547683716, + "logps/chosen": -325.95623779296875, + "logps/rejected": -381.20001220703125, + "loss": 0.0525, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -0.652050793170929, + "rewards/margins": 7.946484565734863, + "rewards/rejected": -8.599609375, + "step": 805 + }, + { + "epoch": 0.5672268907563025, + "grad_norm": 10.991551399230957, + "learning_rate": 4.7128917579846287e-07, + "logits/chosen": -3.854687452316284, + "logits/rejected": -3.5933594703674316, + "logps/chosen": -343.26873779296875, + "logps/rejected": -402.2749938964844, + "loss": 0.0996, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -0.73876953125, + "rewards/margins": 8.411328315734863, + "rewards/rejected": -9.155468940734863, + "step": 810 + }, + { + "epoch": 0.5707282913165266, + "grad_norm": 4.291808605194092, + "learning_rate": 4.6518950966414013e-07, + "logits/chosen": -3.806640625, + "logits/rejected": -3.583984375, + "logps/chosen": -301.67498779296875, + "logps/rejected": -367.25, + "loss": 0.0675, + "rewards/accuracies": 0.96875, + "rewards/chosen": -0.19460448622703552, + "rewards/margins": 8.220312118530273, + "rewards/rejected": -8.418749809265137, + "step": 815 + }, + { + "epoch": 0.5742296918767507, + "grad_norm": 7.3030009269714355, + "learning_rate": 4.590950451451397e-07, + "logits/chosen": -3.8453125953674316, + "logits/rejected": -3.600781202316284, + "logps/chosen": -327.45001220703125, + "logps/rejected": -386.07501220703125, + "loss": 0.1054, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.3786132335662842, + "rewards/margins": 8.593358993530273, + "rewards/rejected": -9.979687690734863, + "step": 820 + }, + { + "epoch": 0.5777310924369747, + "grad_norm": 3.5457348823547363, + "learning_rate": 4.530066929169427e-07, + "logits/chosen": -3.8628907203674316, + "logits/rejected": -3.5679688453674316, + "logps/chosen": -346.4750061035156, + "logps/rejected": -403.0249938964844, + "loss": 0.0685, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -0.4722656309604645, + "rewards/margins": 8.35546875, + "rewards/rejected": -8.8291015625, + "step": 825 + }, + { + "epoch": 0.5812324929971989, + "grad_norm": 2.543821096420288, + "learning_rate": 4.469253627416905e-07, + "logits/chosen": -3.877734422683716, + "logits/rejected": -3.6019530296325684, + "logps/chosen": -312.64373779296875, + "logps/rejected": -373.82501220703125, + "loss": 0.0816, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": -0.34624022245407104, + "rewards/margins": 9.057031631469727, + "rewards/rejected": -9.40576171875, + "step": 830 + }, + { + "epoch": 0.584733893557423, + "grad_norm": 8.854287147521973, + "learning_rate": 4.4085196333224296e-07, + "logits/chosen": -3.8851561546325684, + "logits/rejected": -3.546875, + "logps/chosen": -316.26873779296875, + "logps/rejected": -381.32501220703125, + "loss": 0.0958, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -0.4364257752895355, + "rewards/margins": 8.853906631469727, + "rewards/rejected": -9.285547256469727, + "step": 835 + }, + { + "epoch": 0.5882352941176471, + "grad_norm": 8.832906723022461, + "learning_rate": 4.347874022163919e-07, + "logits/chosen": -3.852734327316284, + "logits/rejected": -3.579296827316284, + "logps/chosen": -299.01873779296875, + "logps/rejected": -366.8500061035156, + "loss": 0.0845, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -0.5170654058456421, + "rewards/margins": 8.686914443969727, + "rewards/rejected": -9.208398818969727, + "step": 840 + }, + { + "epoch": 0.5917366946778712, + "grad_norm": 15.137822151184082, + "learning_rate": 4.2873258560125237e-07, + "logits/chosen": -3.861328125, + "logits/rejected": -3.583984375, + "logps/chosen": -326.2749938964844, + "logps/rejected": -385.9750061035156, + "loss": 0.1595, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": -0.552478015422821, + "rewards/margins": 8.507421493530273, + "rewards/rejected": -9.059374809265137, + "step": 845 + }, + { + "epoch": 0.5952380952380952, + "grad_norm": 6.563608169555664, + "learning_rate": 4.2268841823785126e-07, + "logits/chosen": -3.8773436546325684, + "logits/rejected": -3.607421875, + "logps/chosen": -312.45623779296875, + "logps/rejected": -370.125, + "loss": 0.101, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": -0.7504059076309204, + "rewards/margins": 8.3125, + "rewards/rejected": -9.064062118530273, + "step": 850 + }, + { + "epoch": 0.5987394957983193, + "grad_norm": 13.671486854553223, + "learning_rate": 4.166558032859338e-07, + "logits/chosen": -3.848437547683716, + "logits/rejected": -3.592968702316284, + "logps/chosen": -313.1187438964844, + "logps/rejected": -370.92498779296875, + "loss": 0.178, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -1.0292236804962158, + "rewards/margins": 8.178515434265137, + "rewards/rejected": -9.208593368530273, + "step": 855 + }, + { + "epoch": 0.6022408963585434, + "grad_norm": 24.43784523010254, + "learning_rate": 4.1063564217900617e-07, + "logits/chosen": -3.866406202316284, + "logits/rejected": -3.6640625, + "logps/chosen": -322.48126220703125, + "logps/rejected": -370.375, + "loss": 0.1452, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.087890625, + "rewards/margins": 7.253515720367432, + "rewards/rejected": -8.346094131469727, + "step": 860 + }, + { + "epoch": 0.6057422969187675, + "grad_norm": 4.205888748168945, + "learning_rate": 4.0462883448963867e-07, + "logits/chosen": -3.9609375, + "logits/rejected": -3.7269530296325684, + "logps/chosen": -310.83123779296875, + "logps/rejected": -362.4375, + "loss": 0.096, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": -0.8477843999862671, + "rewards/margins": 7.521874904632568, + "rewards/rejected": -8.370312690734863, + "step": 865 + }, + { + "epoch": 0.6092436974789915, + "grad_norm": 5.241097927093506, + "learning_rate": 3.9863627779504473e-07, + "logits/chosen": -3.971874952316284, + "logits/rejected": -3.764843702316284, + "logps/chosen": -308.04998779296875, + "logps/rejected": -353.75, + "loss": 0.0758, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -0.34832763671875, + "rewards/margins": 7.138671875, + "rewards/rejected": -7.488476753234863, + "step": 870 + }, + { + "epoch": 0.6127450980392157, + "grad_norm": 5.45101261138916, + "learning_rate": 3.9265886754295907e-07, + "logits/chosen": -3.916015625, + "logits/rejected": -3.7066407203674316, + "logps/chosen": -304.71875, + "logps/rejected": -361.8062438964844, + "loss": 0.1716, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": 0.31901854276657104, + "rewards/margins": 6.958300590515137, + "rewards/rejected": -6.638476371765137, + "step": 875 + }, + { + "epoch": 0.6162464985994398, + "grad_norm": 1.5118215084075928, + "learning_rate": 3.866974969178347e-07, + "logits/chosen": -3.9496092796325684, + "logits/rejected": -3.727343797683716, + "logps/chosen": -293.2124938964844, + "logps/rejected": -342.0249938964844, + "loss": 0.0315, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 1.1063232421875, + "rewards/margins": 7.92578125, + "rewards/rejected": -6.819726467132568, + "step": 880 + }, + { + "epoch": 0.6197478991596639, + "grad_norm": 2.3815932273864746, + "learning_rate": 3.80753056707376e-07, + "logits/chosen": -3.957812547683716, + "logits/rejected": -3.724609375, + "logps/chosen": -316.8999938964844, + "logps/rejected": -375.25, + "loss": 0.0819, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 1.2796142101287842, + "rewards/margins": 8.724218368530273, + "rewards/rejected": -7.444921970367432, + "step": 885 + }, + { + "epoch": 0.623249299719888, + "grad_norm": 3.3432934284210205, + "learning_rate": 3.7482643516943233e-07, + "logits/chosen": -3.9781250953674316, + "logits/rejected": -3.7242188453674316, + "logps/chosen": -305.8062438964844, + "logps/rejected": -366.07501220703125, + "loss": 0.1337, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 2.043957471847534, + "rewards/margins": 8.4375, + "rewards/rejected": -6.396679878234863, + "step": 890 + }, + { + "epoch": 0.626750700280112, + "grad_norm": 84.51541137695312, + "learning_rate": 3.6891851789926885e-07, + "logits/chosen": -3.908203125, + "logits/rejected": -3.701171875, + "logps/chosen": -296.64373779296875, + "logps/rejected": -356.2250061035156, + "loss": 0.1413, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.6126953363418579, + "rewards/margins": 7.667187690734863, + "rewards/rejected": -7.053906440734863, + "step": 895 + }, + { + "epoch": 0.6302521008403361, + "grad_norm": 1.273039698600769, + "learning_rate": 3.6303018769723367e-07, + "logits/chosen": -3.9496092796325684, + "logits/rejected": -3.669921875, + "logps/chosen": -341.0249938964844, + "logps/rejected": -400.8999938964844, + "loss": 0.0739, + "rewards/accuracies": 0.96875, + "rewards/chosen": 0.22305908799171448, + "rewards/margins": 8.662500381469727, + "rewards/rejected": -8.442187309265137, + "step": 900 + }, + { + "epoch": 0.6337535014005602, + "grad_norm": 5.358231544494629, + "learning_rate": 3.571623244368448e-07, + "logits/chosen": -3.91015625, + "logits/rejected": -3.6714844703674316, + "logps/chosen": -323.13751220703125, + "logps/rejected": -372.6000061035156, + "loss": 0.0644, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.17644043266773224, + "rewards/margins": 7.969531059265137, + "rewards/rejected": -8.137890815734863, + "step": 905 + }, + { + "epoch": 0.6372549019607843, + "grad_norm": 5.974215984344482, + "learning_rate": 3.51315804933314e-07, + "logits/chosen": -3.9496092796325684, + "logits/rejected": -3.6480469703674316, + "logps/chosen": -346.0249938964844, + "logps/rejected": -419.6000061035156, + "loss": 0.0908, + "rewards/accuracies": 0.96875, + "rewards/chosen": 0.17644043266773224, + "rewards/margins": 8.888280868530273, + "rewards/rejected": -8.713549613952637, + "step": 910 + }, + { + "epoch": 0.6407563025210085, + "grad_norm": 2.7069153785705566, + "learning_rate": 3.454915028125263e-07, + "logits/chosen": -3.954296827316284, + "logits/rejected": -3.6468749046325684, + "logps/chosen": -333.4750061035156, + "logps/rejected": -401.4750061035156, + "loss": 0.0515, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": 0.6935790777206421, + "rewards/margins": 9.053906440734863, + "rewards/rejected": -8.358495712280273, + "step": 915 + }, + { + "epoch": 0.6442577030812325, + "grad_norm": 44.74658203125, + "learning_rate": 3.396902883804976e-07, + "logits/chosen": -3.8921875953674316, + "logits/rejected": -3.6019530296325684, + "logps/chosen": -312.125, + "logps/rejected": -377.32501220703125, + "loss": 0.0647, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.424072265625, + "rewards/margins": 9.225000381469727, + "rewards/rejected": -8.80078125, + "step": 920 + }, + { + "epoch": 0.6477591036414566, + "grad_norm": 7.319411277770996, + "learning_rate": 3.3391302849332753e-07, + "logits/chosen": -3.856640577316284, + "logits/rejected": -3.626171827316284, + "logps/chosen": -313.0874938964844, + "logps/rejected": -377.23748779296875, + "loss": 0.0942, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 0.7853729128837585, + "rewards/margins": 9.032812118530273, + "rewards/rejected": -8.244531631469727, + "step": 925 + }, + { + "epoch": 0.6512605042016807, + "grad_norm": 11.135540008544922, + "learning_rate": 3.28160586427668e-07, + "logits/chosen": -3.885937452316284, + "logits/rejected": -3.6246094703674316, + "logps/chosen": -288.7437438964844, + "logps/rejected": -351.0249938964844, + "loss": 0.142, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 0.77581787109375, + "rewards/margins": 9.06640625, + "rewards/rejected": -8.296093940734863, + "step": 930 + }, + { + "epoch": 0.6547619047619048, + "grad_norm": 4.436044692993164, + "learning_rate": 3.2243382175172686e-07, + "logits/chosen": -3.89453125, + "logits/rejected": -3.627734422683716, + "logps/chosen": -311.86248779296875, + "logps/rejected": -375.1000061035156, + "loss": 0.1683, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.195068359375, + "rewards/margins": 8.670312881469727, + "rewards/rejected": -8.473437309265137, + "step": 935 + }, + { + "epoch": 0.6582633053221288, + "grad_norm": 3.3467283248901367, + "learning_rate": 3.167335901968253e-07, + "logits/chosen": -3.8296875953674316, + "logits/rejected": -3.5863280296325684, + "logps/chosen": -284.07501220703125, + "logps/rejected": -342.4125061035156, + "loss": 0.0814, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": 0.4918579161167145, + "rewards/margins": 7.952343940734863, + "rewards/rejected": -7.465429782867432, + "step": 940 + }, + { + "epoch": 0.6617647058823529, + "grad_norm": 12.231038093566895, + "learning_rate": 3.1106074352952887e-07, + "logits/chosen": -3.879687547683716, + "logits/rejected": -3.5738282203674316, + "logps/chosen": -346.2250061035156, + "logps/rejected": -409.0, + "loss": 0.108, + "rewards/accuracies": 0.96875, + "rewards/chosen": 0.16412964463233948, + "rewards/margins": 8.977343559265137, + "rewards/rejected": -8.80859375, + "step": 945 + }, + { + "epoch": 0.665266106442577, + "grad_norm": 3.6774492263793945, + "learning_rate": 3.054161294243709e-07, + "logits/chosen": -3.850390672683716, + "logits/rejected": -3.6050782203674316, + "logps/chosen": -337.42498779296875, + "logps/rejected": -405.875, + "loss": 0.1166, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -0.9543212652206421, + "rewards/margins": 8.693359375, + "rewards/rejected": -9.644067764282227, + "step": 950 + }, + { + "epoch": 0.6687675070028011, + "grad_norm": 0.6831060647964478, + "learning_rate": 2.998005913371868e-07, + "logits/chosen": -3.8199219703674316, + "logits/rejected": -3.5648436546325684, + "logps/chosen": -302.6000061035156, + "logps/rejected": -353.5249938964844, + "loss": 0.0515, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": 0.94317626953125, + "rewards/margins": 8.862500190734863, + "rewards/rejected": -7.921533107757568, + "step": 955 + }, + { + "epoch": 0.6722689075630253, + "grad_norm": 5.337077617645264, + "learning_rate": 2.9421496837908034e-07, + "logits/chosen": -3.8570313453674316, + "logits/rejected": -3.573046922683716, + "logps/chosen": -304.38751220703125, + "logps/rejected": -363.8999938964844, + "loss": 0.0926, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 0.72064208984375, + "rewards/margins": 8.595703125, + "rewards/rejected": -7.8828125, + "step": 960 + }, + { + "epoch": 0.6757703081232493, + "grad_norm": 4.56184196472168, + "learning_rate": 2.88660095191037e-07, + "logits/chosen": -3.887890577316284, + "logits/rejected": -3.602734327316284, + "logps/chosen": -315.4750061035156, + "logps/rejected": -383.875, + "loss": 0.1173, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": -0.33234864473342896, + "rewards/margins": 8.784375190734863, + "rewards/rejected": -9.114501953125, + "step": 965 + }, + { + "epoch": 0.6792717086834734, + "grad_norm": 0.696527361869812, + "learning_rate": 2.831368018192071e-07, + "logits/chosen": -3.913281202316284, + "logits/rejected": -3.602343797683716, + "logps/chosen": -332.07501220703125, + "logps/rejected": -395.6000061035156, + "loss": 0.079, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.6913818120956421, + "rewards/margins": 8.940038681030273, + "rewards/rejected": -8.248046875, + "step": 970 + }, + { + "epoch": 0.6827731092436975, + "grad_norm": 2.46315598487854, + "learning_rate": 2.7764591359087414e-07, + "logits/chosen": -3.893359422683716, + "logits/rejected": -3.5999999046325684, + "logps/chosen": -312.73748779296875, + "logps/rejected": -367.2875061035156, + "loss": 0.0655, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": 0.8863891363143921, + "rewards/margins": 8.484375, + "rewards/rejected": -7.603515625, + "step": 975 + }, + { + "epoch": 0.6862745098039216, + "grad_norm": 2.042008876800537, + "learning_rate": 2.721882509911296e-07, + "logits/chosen": -3.9203124046325684, + "logits/rejected": -3.582812547683716, + "logps/chosen": -347.5874938964844, + "logps/rejected": -419.20001220703125, + "loss": 0.1541, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": 0.3082641661167145, + "rewards/margins": 9.401562690734863, + "rewards/rejected": -9.099413871765137, + "step": 980 + }, + { + "epoch": 0.6897759103641457, + "grad_norm": 1.3312026262283325, + "learning_rate": 2.6676462954027033e-07, + "logits/chosen": -3.867968797683716, + "logits/rejected": -3.5484375953674316, + "logps/chosen": -304.625, + "logps/rejected": -371.625, + "loss": 0.0338, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.755627453327179, + "rewards/margins": 9.58203125, + "rewards/rejected": -8.826171875, + "step": 985 + }, + { + "epoch": 0.6932773109243697, + "grad_norm": 2.3855769634246826, + "learning_rate": 2.6137585967193725e-07, + "logits/chosen": -3.8617186546325684, + "logits/rejected": -3.551953077316284, + "logps/chosen": -303.61248779296875, + "logps/rejected": -385.3374938964844, + "loss": 0.0761, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 0.10163573920726776, + "rewards/margins": 9.792577743530273, + "rewards/rejected": -9.689062118530273, + "step": 990 + }, + { + "epoch": 0.6967787114845938, + "grad_norm": 11.995887756347656, + "learning_rate": 2.560227466120164e-07, + "logits/chosen": -3.828125, + "logits/rejected": -3.534374952316284, + "logps/chosen": -302.66876220703125, + "logps/rejected": -365.88751220703125, + "loss": 0.1035, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -0.35798341035842896, + "rewards/margins": 8.803906440734863, + "rewards/rejected": -9.166015625, + "step": 995 + }, + { + "epoch": 0.7002801120448179, + "grad_norm": 15.732640266418457, + "learning_rate": 2.5070609025831604e-07, + "logits/chosen": -3.81640625, + "logits/rejected": -3.471484422683716, + "logps/chosen": -325.4624938964844, + "logps/rejected": -398.7250061035156, + "loss": 0.1369, + "rewards/accuracies": 0.9312499761581421, + "rewards/chosen": 0.690185546875, + "rewards/margins": 9.998437881469727, + "rewards/rejected": -9.308496475219727, + "step": 1000 + }, + { + "epoch": 0.7037815126050421, + "grad_norm": 11.053906440734863, + "learning_rate": 2.454266850610398e-07, + "logits/chosen": -3.829296827316284, + "logits/rejected": -3.542187452316284, + "logps/chosen": -310.21875, + "logps/rejected": -370.125, + "loss": 0.1538, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.14753417670726776, + "rewards/margins": 9.098437309265137, + "rewards/rejected": -8.958593368530273, + "step": 1005 + }, + { + "epoch": 0.7072829131652661, + "grad_norm": 12.015442848205566, + "learning_rate": 2.4018531990407595e-07, + "logits/chosen": -3.794140577316284, + "logits/rejected": -3.5640625953674316, + "logps/chosen": -264.40625, + "logps/rejected": -331.7124938964844, + "loss": 0.0833, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.13149413466453552, + "rewards/margins": 8.956250190734863, + "rewards/rejected": -8.8291015625, + "step": 1010 + }, + { + "epoch": 0.7107843137254902, + "grad_norm": 1.9385806322097778, + "learning_rate": 2.3498277798711723e-07, + "logits/chosen": -3.854296922683716, + "logits/rejected": -3.48828125, + "logps/chosen": -366.0249938964844, + "logps/rejected": -437.6000061035156, + "loss": 0.1108, + "rewards/accuracies": 0.96875, + "rewards/chosen": 0.49018555879592896, + "rewards/margins": 10.124218940734863, + "rewards/rejected": -9.637890815734863, + "step": 1015 + }, + { + "epoch": 0.7142857142857143, + "grad_norm": 1.8205724954605103, + "learning_rate": 2.298198367086279e-07, + "logits/chosen": -3.859375, + "logits/rejected": -3.534374952316284, + "logps/chosen": -300.01251220703125, + "logps/rejected": -355.3500061035156, + "loss": 0.1203, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 0.8221191167831421, + "rewards/margins": 8.935937881469727, + "rewards/rejected": -8.110547065734863, + "step": 1020 + }, + { + "epoch": 0.7177871148459384, + "grad_norm": 6.349449634552002, + "learning_rate": 2.2469726754968204e-07, + "logits/chosen": -3.837890625, + "logits/rejected": -3.553515672683716, + "logps/chosen": -317.36248779296875, + "logps/rejected": -380.54998779296875, + "loss": 0.0981, + "rewards/accuracies": 0.96875, + "rewards/chosen": 0.7828369140625, + "rewards/margins": 8.682812690734863, + "rewards/rejected": -7.900390625, + "step": 1025 + }, + { + "epoch": 0.7212885154061625, + "grad_norm": 9.13239574432373, + "learning_rate": 2.196158359586825e-07, + "logits/chosen": -3.882031202316284, + "logits/rejected": -3.592578172683716, + "logps/chosen": -326.0874938964844, + "logps/rejected": -394.54998779296875, + "loss": 0.0875, + "rewards/accuracies": 0.96875, + "rewards/chosen": 1.2667236328125, + "rewards/margins": 8.850000381469727, + "rewards/rejected": -7.580956935882568, + "step": 1030 + }, + { + "epoch": 0.7247899159663865, + "grad_norm": 4.96783971786499, + "learning_rate": 2.1457630123698233e-07, + "logits/chosen": -3.916015625, + "logits/rejected": -3.5726561546325684, + "logps/chosen": -297.82501220703125, + "logps/rejected": -364.7250061035156, + "loss": 0.0886, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 0.98199462890625, + "rewards/margins": 9.078125, + "rewards/rejected": -8.098437309265137, + "step": 1035 + }, + { + "epoch": 0.7282913165266106, + "grad_norm": 1.335874080657959, + "learning_rate": 2.0957941642542587e-07, + "logits/chosen": -3.8882813453674316, + "logits/rejected": -3.5667967796325684, + "logps/chosen": -314.42498779296875, + "logps/rejected": -383.2250061035156, + "loss": 0.0919, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 1.3975830078125, + "rewards/margins": 8.814844131469727, + "rewards/rejected": -7.421679496765137, + "step": 1040 + }, + { + "epoch": 0.7317927170868347, + "grad_norm": 10.026313781738281, + "learning_rate": 2.0462592819182374e-07, + "logits/chosen": -3.897656202316284, + "logits/rejected": -3.575390577316284, + "logps/chosen": -292.70623779296875, + "logps/rejected": -351.1625061035156, + "loss": 0.1123, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 1.2625000476837158, + "rewards/margins": 8.888671875, + "rewards/rejected": -7.621874809265137, + "step": 1045 + }, + { + "epoch": 0.7352941176470589, + "grad_norm": 6.501437187194824, + "learning_rate": 1.997165767193801e-07, + "logits/chosen": -3.8714842796325684, + "logits/rejected": -3.5679688453674316, + "logps/chosen": -291.0874938964844, + "logps/rejected": -349.07501220703125, + "loss": 0.0678, + "rewards/accuracies": 0.96875, + "rewards/chosen": 1.906518578529358, + "rewards/margins": 9.068750381469727, + "rewards/rejected": -7.152929782867432, + "step": 1050 + }, + { + "epoch": 0.738795518207283, + "grad_norm": 11.565328598022461, + "learning_rate": 1.9485209559609145e-07, + "logits/chosen": -3.869140625, + "logits/rejected": -3.5992188453674316, + "logps/chosen": -321.7124938964844, + "logps/rejected": -376.25, + "loss": 0.0903, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": 0.876354992389679, + "rewards/margins": 8.184374809265137, + "rewards/rejected": -7.3046875, + "step": 1055 + }, + { + "epoch": 0.742296918767507, + "grad_norm": 23.18073844909668, + "learning_rate": 1.9003321170512726e-07, + "logits/chosen": -3.868359327316284, + "logits/rejected": -3.578125, + "logps/chosen": -319.9750061035156, + "logps/rejected": -368.95001220703125, + "loss": 0.1726, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 0.9228760004043579, + "rewards/margins": 8.095703125, + "rewards/rejected": -7.174609184265137, + "step": 1060 + }, + { + "epoch": 0.7457983193277311, + "grad_norm": 8.127881050109863, + "learning_rate": 1.8526064511621452e-07, + "logits/chosen": -3.8570313453674316, + "logits/rejected": -3.584765672683716, + "logps/chosen": -304.26251220703125, + "logps/rejected": -375.3999938964844, + "loss": 0.0682, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.971264660358429, + "rewards/margins": 8.897656440734863, + "rewards/rejected": -7.934179782867432, + "step": 1065 + }, + { + "epoch": 0.7492997198879552, + "grad_norm": 2.701737880706787, + "learning_rate": 1.8053510897804103e-07, + "logits/chosen": -3.811328172683716, + "logits/rejected": -3.5132813453674316, + "logps/chosen": -292.0249938964844, + "logps/rejected": -358.1499938964844, + "loss": 0.0499, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": 1.67572021484375, + "rewards/margins": 9.517578125, + "rewards/rejected": -7.842577934265137, + "step": 1070 + }, + { + "epoch": 0.7528011204481793, + "grad_norm": 0.31558746099472046, + "learning_rate": 1.7585730941169101e-07, + "logits/chosen": -3.7890625, + "logits/rejected": -3.532031297683716, + "logps/chosen": -290.4750061035156, + "logps/rejected": -358.0, + "loss": 0.0462, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": 0.8272033929824829, + "rewards/margins": 8.696874618530273, + "rewards/rejected": -7.870898246765137, + "step": 1075 + }, + { + "epoch": 0.7563025210084033, + "grad_norm": 15.150861740112305, + "learning_rate": 1.7122794540513264e-07, + "logits/chosen": -3.811718702316284, + "logits/rejected": -3.522656202316284, + "logps/chosen": -306.1499938964844, + "logps/rejected": -368.875, + "loss": 0.0674, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.4841064512729645, + "rewards/margins": 9.037500381469727, + "rewards/rejected": -8.553125381469727, + "step": 1080 + }, + { + "epoch": 0.7598039215686274, + "grad_norm": 5.7700300216674805, + "learning_rate": 1.6664770870876937e-07, + "logits/chosen": -3.821484327316284, + "logits/rejected": -3.4632811546325684, + "logps/chosen": -322.67498779296875, + "logps/rejected": -388.625, + "loss": 0.0482, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.8797363042831421, + "rewards/margins": 9.785937309265137, + "rewards/rejected": -8.902783393859863, + "step": 1085 + }, + { + "epoch": 0.7633053221288515, + "grad_norm": 10.431517601013184, + "learning_rate": 1.621172837320754e-07, + "logits/chosen": -3.78125, + "logits/rejected": -3.494921922683716, + "logps/chosen": -304.45001220703125, + "logps/rejected": -369.67498779296875, + "loss": 0.1005, + "rewards/accuracies": 0.96875, + "rewards/chosen": 0.6063598394393921, + "rewards/margins": 9.159765243530273, + "rewards/rejected": -8.555468559265137, + "step": 1090 + }, + { + "epoch": 0.7668067226890757, + "grad_norm": 1.8601133823394775, + "learning_rate": 1.5763734744132583e-07, + "logits/chosen": -3.7445311546325684, + "logits/rejected": -3.474609375, + "logps/chosen": -312.1187438964844, + "logps/rejected": -380.4750061035156, + "loss": 0.1414, + "rewards/accuracies": 0.96875, + "rewards/chosen": 0.4251464903354645, + "rewards/margins": 8.892382621765137, + "rewards/rejected": -8.470703125, + "step": 1095 + }, + { + "epoch": 0.7703081232492998, + "grad_norm": 3.562940835952759, + "learning_rate": 1.5320856925843995e-07, + "logits/chosen": -3.7476563453674316, + "logits/rejected": -3.4683594703674316, + "logps/chosen": -313.1875, + "logps/rejected": -368.2250061035156, + "loss": 0.1008, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 1.0226318836212158, + "rewards/margins": 9.282031059265137, + "rewards/rejected": -8.258398056030273, + "step": 1100 + }, + { + "epoch": 0.7738095238095238, + "grad_norm": 17.6439208984375, + "learning_rate": 1.4883161096095187e-07, + "logits/chosen": -3.764843702316284, + "logits/rejected": -3.450390577316284, + "logps/chosen": -313.8374938964844, + "logps/rejected": -387.875, + "loss": 0.0673, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.898022472858429, + "rewards/margins": 9.743359565734863, + "rewards/rejected": -8.846094131469727, + "step": 1105 + }, + { + "epoch": 0.7773109243697479, + "grad_norm": 0.7597159147262573, + "learning_rate": 1.4450712658312352e-07, + "logits/chosen": -3.704296827316284, + "logits/rejected": -3.463671922683716, + "logps/chosen": -313.67498779296875, + "logps/rejected": -374.79998779296875, + "loss": 0.0523, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.42988282442092896, + "rewards/margins": 8.662500381469727, + "rewards/rejected": -9.093358993530273, + "step": 1110 + }, + { + "epoch": 0.780812324929972, + "grad_norm": 9.668035507202148, + "learning_rate": 1.402357623182136e-07, + "logits/chosen": -3.7015624046325684, + "logits/rejected": -3.457812547683716, + "logps/chosen": -331.70001220703125, + "logps/rejected": -381.2749938964844, + "loss": 0.1341, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.75238037109375, + "rewards/margins": 8.151562690734863, + "rewards/rejected": -8.9033203125, + "step": 1115 + }, + { + "epoch": 0.7843137254901961, + "grad_norm": 4.6653594970703125, + "learning_rate": 1.3601815642192038e-07, + "logits/chosen": -3.733593702316284, + "logits/rejected": -3.3960938453674316, + "logps/chosen": -368.875, + "logps/rejected": -434.7749938964844, + "loss": 0.0877, + "rewards/accuracies": 0.96875, + "rewards/chosen": -0.5340576171875, + "rewards/margins": 9.623437881469727, + "rewards/rejected": -10.157812118530273, + "step": 1120 + }, + { + "epoch": 0.7878151260504201, + "grad_norm": 6.899483680725098, + "learning_rate": 1.3185493911700852e-07, + "logits/chosen": -3.666015625, + "logits/rejected": -3.408203125, + "logps/chosen": -314.8125, + "logps/rejected": -382.1000061035156, + "loss": 0.1256, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.03854980319738388, + "rewards/margins": 9.040624618530273, + "rewards/rejected": -9.077733993530273, + "step": 1125 + }, + { + "epoch": 0.7913165266106442, + "grad_norm": 6.99959659576416, + "learning_rate": 1.2774673249913652e-07, + "logits/chosen": -3.6636719703674316, + "logits/rejected": -3.4175782203674316, + "logps/chosen": -327.48748779296875, + "logps/rejected": -384.9750061035156, + "loss": 0.1503, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -0.278564453125, + "rewards/margins": 8.178906440734863, + "rewards/rejected": -8.453125, + "step": 1130 + }, + { + "epoch": 0.7948179271708683, + "grad_norm": 20.7562255859375, + "learning_rate": 1.2369415044390052e-07, + "logits/chosen": -3.712109327316284, + "logits/rejected": -3.444531202316284, + "logps/chosen": -311.625, + "logps/rejected": -366.32501220703125, + "loss": 0.1526, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": 0.01655273512005806, + "rewards/margins": 8.888280868530273, + "rewards/rejected": -8.871874809265137, + "step": 1135 + }, + { + "epoch": 0.7983193277310925, + "grad_norm": 7.541940689086914, + "learning_rate": 1.1969779851510358e-07, + "logits/chosen": -3.732421875, + "logits/rejected": -3.474609375, + "logps/chosen": -328.7749938964844, + "logps/rejected": -392.95001220703125, + "loss": 0.0885, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": 0.560778796672821, + "rewards/margins": 8.80078125, + "rewards/rejected": -8.240625381469727, + "step": 1140 + }, + { + "epoch": 0.8018207282913166, + "grad_norm": 2.2544028759002686, + "learning_rate": 1.1575827387426845e-07, + "logits/chosen": -3.7105469703674316, + "logits/rejected": -3.483203172683716, + "logps/chosen": -296.51251220703125, + "logps/rejected": -351.07501220703125, + "loss": 0.0749, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.36418455839157104, + "rewards/margins": 9.017578125, + "rewards/rejected": -8.65625, + "step": 1145 + }, + { + "epoch": 0.8053221288515406, + "grad_norm": 3.940847873687744, + "learning_rate": 1.1187616519140646e-07, + "logits/chosen": -3.719921827316284, + "logits/rejected": -3.448437452316284, + "logps/chosen": -311.1875, + "logps/rejected": -391.17498779296875, + "loss": 0.0505, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.2625488340854645, + "rewards/margins": 9.794530868530273, + "rewards/rejected": -9.527734756469727, + "step": 1150 + }, + { + "epoch": 0.8088235294117647, + "grad_norm": 1.4131155014038086, + "learning_rate": 1.0805205255705402e-07, + "logits/chosen": -3.710156202316284, + "logits/rejected": -3.4437499046325684, + "logps/chosen": -314.32501220703125, + "logps/rejected": -379.8500061035156, + "loss": 0.1, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.4536376893520355, + "rewards/margins": 9.346484184265137, + "rewards/rejected": -8.892969131469727, + "step": 1155 + }, + { + "epoch": 0.8123249299719888, + "grad_norm": 1.5823792219161987, + "learning_rate": 1.0428650739559136e-07, + "logits/chosen": -3.708203077316284, + "logits/rejected": -3.455078125, + "logps/chosen": -335.92498779296875, + "logps/rejected": -400.67498779296875, + "loss": 0.0985, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": 0.008251952938735485, + "rewards/margins": 9.153515815734863, + "rewards/rejected": -9.148046493530273, + "step": 1160 + }, + { + "epoch": 0.8158263305322129, + "grad_norm": 2.751337766647339, + "learning_rate": 1.005800923798572e-07, + "logits/chosen": -3.744140625, + "logits/rejected": -3.426953077316284, + "logps/chosen": -336.29998779296875, + "logps/rejected": -408.32501220703125, + "loss": 0.0771, + "rewards/accuracies": 0.96875, + "rewards/chosen": 0.4516967833042145, + "rewards/margins": 9.059374809265137, + "rewards/rejected": -8.602343559265137, + "step": 1165 + }, + { + "epoch": 0.819327731092437, + "grad_norm": 5.285534858703613, + "learning_rate": 9.693336134706987e-08, + "logits/chosen": -3.731640577316284, + "logits/rejected": -3.482421875, + "logps/chosen": -306.42498779296875, + "logps/rejected": -367.1875, + "loss": 0.0841, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 0.2859741151332855, + "rewards/margins": 8.418749809265137, + "rewards/rejected": -8.137499809265137, + "step": 1170 + }, + { + "epoch": 0.822829131652661, + "grad_norm": 3.021791934967041, + "learning_rate": 9.334685921606944e-08, + "logits/chosen": -3.725781202316284, + "logits/rejected": -3.463671922683716, + "logps/chosen": -314.86248779296875, + "logps/rejected": -374.95001220703125, + "loss": 0.0813, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": 0.4312988221645355, + "rewards/margins": 8.670312881469727, + "rewards/rejected": -8.241796493530273, + "step": 1175 + }, + { + "epoch": 0.8263305322128851, + "grad_norm": 4.565569877624512, + "learning_rate": 8.982112190589236e-08, + "logits/chosen": -3.7367186546325684, + "logits/rejected": -3.482421875, + "logps/chosen": -335.38751220703125, + "logps/rejected": -400.5249938964844, + "loss": 0.0808, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 0.4556640684604645, + "rewards/margins": 9.122655868530273, + "rewards/rejected": -8.664648056030273, + "step": 1180 + }, + { + "epoch": 0.8298319327731093, + "grad_norm": 11.402658462524414, + "learning_rate": 8.635667625569099e-08, + "logits/chosen": -3.740234375, + "logits/rejected": -3.428906202316284, + "logps/chosen": -341.1499938964844, + "logps/rejected": -405.70001220703125, + "loss": 0.1468, + "rewards/accuracies": 0.96875, + "rewards/chosen": 0.505859375, + "rewards/margins": 8.94140625, + "rewards/rejected": -8.4326171875, + "step": 1185 + }, + { + "epoch": 0.8333333333333334, + "grad_norm": 4.893910884857178, + "learning_rate": 8.29540399460092e-08, + "logits/chosen": -3.73828125, + "logits/rejected": -3.440234422683716, + "logps/chosen": -322.57501220703125, + "logps/rejected": -388.2749938964844, + "loss": 0.0675, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.34471434354782104, + "rewards/margins": 9.078516006469727, + "rewards/rejected": -9.425390243530273, + "step": 1190 + }, + { + "epoch": 0.8368347338935574, + "grad_norm": 3.020184278488159, + "learning_rate": 7.961372142142775e-08, + "logits/chosen": -3.7210936546325684, + "logits/rejected": -3.4769530296325684, + "logps/chosen": -288.4624938964844, + "logps/rejected": -358.875, + "loss": 0.1032, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.40043944120407104, + "rewards/margins": 9.099609375, + "rewards/rejected": -9.497655868530273, + "step": 1195 + }, + { + "epoch": 0.8403361344537815, + "grad_norm": 9.903548240661621, + "learning_rate": 7.633621981458915e-08, + "logits/chosen": -3.751171827316284, + "logits/rejected": -3.470703125, + "logps/chosen": -328.67498779296875, + "logps/rejected": -400.6499938964844, + "loss": 0.0628, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": -0.7047363519668579, + "rewards/margins": 9.202539443969727, + "rewards/rejected": -9.906641006469727, + "step": 1200 + }, + { + "epoch": 0.8438375350140056, + "grad_norm": 2.419182300567627, + "learning_rate": 7.312202487161317e-08, + "logits/chosen": -3.7496094703674316, + "logits/rejected": -3.477343797683716, + "logps/chosen": -319.375, + "logps/rejected": -385.17498779296875, + "loss": 0.0502, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.12807616591453552, + "rewards/margins": 9.272656440734863, + "rewards/rejected": -9.147363662719727, + "step": 1205 + }, + { + "epoch": 0.8473389355742297, + "grad_norm": 8.510985374450684, + "learning_rate": 6.997161687891634e-08, + "logits/chosen": -3.705078125, + "logits/rejected": -3.462890625, + "logps/chosen": -292.8374938964844, + "logps/rejected": -358.4125061035156, + "loss": 0.0975, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": 0.4731079041957855, + "rewards/margins": 9.19921875, + "rewards/rejected": -8.723437309265137, + "step": 1210 + }, + { + "epoch": 0.8508403361344538, + "grad_norm": 5.150385856628418, + "learning_rate": 6.688546659144478e-08, + "logits/chosen": -3.7718749046325684, + "logits/rejected": -3.48828125, + "logps/chosen": -313.0625, + "logps/rejected": -386.0249938964844, + "loss": 0.0476, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": 0.7706543207168579, + "rewards/margins": 9.772656440734863, + "rewards/rejected": -8.990625381469727, + "step": 1215 + }, + { + "epoch": 0.8543417366946778, + "grad_norm": 8.4049711227417, + "learning_rate": 6.386403516232946e-08, + "logits/chosen": -3.7289061546325684, + "logits/rejected": -3.469921827316284, + "logps/chosen": -324.8999938964844, + "logps/rejected": -391.75, + "loss": 0.0917, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -1.3090636730194092, + "rewards/margins": 8.778124809265137, + "rewards/rejected": -10.088281631469727, + "step": 1220 + }, + { + "epoch": 0.8578431372549019, + "grad_norm": 53.30424499511719, + "learning_rate": 6.090777407397902e-08, + "logits/chosen": -3.7113280296325684, + "logits/rejected": -3.458984375, + "logps/chosen": -315.6875, + "logps/rejected": -378.07501220703125, + "loss": 0.1183, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.10551147162914276, + "rewards/margins": 9.064648628234863, + "rewards/rejected": -8.956250190734863, + "step": 1225 + }, + { + "epoch": 0.8613445378151261, + "grad_norm": 12.400679588317871, + "learning_rate": 5.801712507061563e-08, + "logits/chosen": -3.72265625, + "logits/rejected": -3.459765672683716, + "logps/chosen": -300.88751220703125, + "logps/rejected": -367.8500061035156, + "loss": 0.0682, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": -0.11720886081457138, + "rewards/margins": 8.788281440734863, + "rewards/rejected": -8.904687881469727, + "step": 1230 + }, + { + "epoch": 0.8648459383753502, + "grad_norm": 2.6440250873565674, + "learning_rate": 5.519252009226638e-08, + "logits/chosen": -3.7691407203674316, + "logits/rejected": -3.46875, + "logps/chosen": -336.29998779296875, + "logps/rejected": -396.2749938964844, + "loss": 0.0641, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.09663085639476776, + "rewards/margins": 9.427343368530273, + "rewards/rejected": -9.334375381469727, + "step": 1235 + }, + { + "epoch": 0.8683473389355743, + "grad_norm": 9.605412483215332, + "learning_rate": 5.243438121022076e-08, + "logits/chosen": -3.72265625, + "logits/rejected": -3.462890625, + "logps/chosen": -330.0062561035156, + "logps/rejected": -404.17498779296875, + "loss": 0.095, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": -0.3220153748989105, + "rewards/margins": 9.167187690734863, + "rewards/rejected": -9.490234375, + "step": 1240 + }, + { + "epoch": 0.8718487394957983, + "grad_norm": 7.742379188537598, + "learning_rate": 4.974312056396113e-08, + "logits/chosen": -3.69921875, + "logits/rejected": -3.451953172683716, + "logps/chosen": -312.7437438964844, + "logps/rejected": -382.5625, + "loss": 0.0875, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.741992175579071, + "rewards/margins": 9.014843940734863, + "rewards/rejected": -9.757031440734863, + "step": 1245 + }, + { + "epoch": 0.8753501400560224, + "grad_norm": 2.646317720413208, + "learning_rate": 4.711914029957842e-08, + "logits/chosen": -3.721874952316284, + "logits/rejected": -3.4429688453674316, + "logps/chosen": -329.1000061035156, + "logps/rejected": -392.3500061035156, + "loss": 0.137, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": -0.7165282964706421, + "rewards/margins": 9.112890243530273, + "rewards/rejected": -9.827343940734863, + "step": 1250 + }, + { + "epoch": 0.8788515406162465, + "grad_norm": 7.2253499031066895, + "learning_rate": 4.456283250968096e-08, + "logits/chosen": -3.707812547683716, + "logits/rejected": -3.44140625, + "logps/chosen": -304.82501220703125, + "logps/rejected": -373.38751220703125, + "loss": 0.0425, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": 0.49871522188186646, + "rewards/margins": 9.774999618530273, + "rewards/rejected": -9.2763671875, + "step": 1255 + }, + { + "epoch": 0.8823529411764706, + "grad_norm": 2.6227643489837646, + "learning_rate": 4.2074579174805167e-08, + "logits/chosen": -3.7425780296325684, + "logits/rejected": -3.4820313453674316, + "logps/chosen": -293.14373779296875, + "logps/rejected": -363.45001220703125, + "loss": 0.0338, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": 0.5131057500839233, + "rewards/margins": 9.817968368530273, + "rewards/rejected": -9.3017578125, + "step": 1260 + }, + { + "epoch": 0.8858543417366946, + "grad_norm": 1.3551222085952759, + "learning_rate": 3.965475210633717e-08, + "logits/chosen": -3.721874952316284, + "logits/rejected": -3.423046827316284, + "logps/chosen": -343.7250061035156, + "logps/rejected": -410.67498779296875, + "loss": 0.1, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.5851074457168579, + "rewards/margins": 9.858593940734863, + "rewards/rejected": -10.449999809265137, + "step": 1265 + }, + { + "epoch": 0.8893557422969187, + "grad_norm": 8.488232612609863, + "learning_rate": 3.7303712890955075e-08, + "logits/chosen": -3.7113280296325684, + "logits/rejected": -3.458203077316284, + "logps/chosen": -322.875, + "logps/rejected": -390.95001220703125, + "loss": 0.1156, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": -0.894335925579071, + "rewards/margins": 9.289843559265137, + "rewards/rejected": -10.184374809265137, + "step": 1270 + }, + { + "epoch": 0.8928571428571429, + "grad_norm": 4.776467800140381, + "learning_rate": 3.5021812836597555e-08, + "logits/chosen": -3.710156202316284, + "logits/rejected": -3.4351563453674316, + "logps/chosen": -322.61248779296875, + "logps/rejected": -382.51251220703125, + "loss": 0.0723, + "rewards/accuracies": 0.96875, + "rewards/chosen": -0.9281371831893921, + "rewards/margins": 9.189844131469727, + "rewards/rejected": -10.128515243530273, + "step": 1275 + }, + { + "epoch": 0.896358543417367, + "grad_norm": 14.904967308044434, + "learning_rate": 3.2809392919969484e-08, + "logits/chosen": -3.70703125, + "logits/rejected": -3.430859327316284, + "logps/chosen": -324.42498779296875, + "logps/rejected": -399.8999938964844, + "loss": 0.1605, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": -0.23081055283546448, + "rewards/margins": 9.43359375, + "rewards/rejected": -9.666406631469727, + "step": 1280 + }, + { + "epoch": 0.8998599439775911, + "grad_norm": 9.847896575927734, + "learning_rate": 3.0666783735590615e-08, + "logits/chosen": -3.75, + "logits/rejected": -3.4605469703674316, + "logps/chosen": -315.07501220703125, + "logps/rejected": -394.17498779296875, + "loss": 0.0653, + "rewards/accuracies": 0.96875, + "rewards/chosen": 0.1302490234375, + "rewards/margins": 10.15234375, + "rewards/rejected": -10.021875381469727, + "step": 1285 + }, + { + "epoch": 0.9033613445378151, + "grad_norm": 7.819700241088867, + "learning_rate": 2.859430544639624e-08, + "logits/chosen": -3.7027344703674316, + "logits/rejected": -3.426953077316284, + "logps/chosen": -342.17498779296875, + "logps/rejected": -412.9750061035156, + "loss": 0.0678, + "rewards/accuracies": 0.96875, + "rewards/chosen": -0.7393798828125, + "rewards/margins": 9.6171875, + "rewards/rejected": -10.364062309265137, + "step": 1290 + }, + { + "epoch": 0.9068627450980392, + "grad_norm": 4.363787651062012, + "learning_rate": 2.6592267735896067e-08, + "logits/chosen": -3.690234422683716, + "logits/rejected": -3.446093797683716, + "logps/chosen": -334.76251220703125, + "logps/rejected": -398.6499938964844, + "loss": 0.0916, + "rewards/accuracies": 0.96875, + "rewards/chosen": -0.8033202886581421, + "rewards/margins": 8.637890815734863, + "rewards/rejected": -9.435937881469727, + "step": 1295 + }, + { + "epoch": 0.9103641456582633, + "grad_norm": 2.6290194988250732, + "learning_rate": 2.4660969761899576e-08, + "logits/chosen": -3.700390577316284, + "logits/rejected": -3.458203077316284, + "logps/chosen": -301.0249938964844, + "logps/rejected": -365.0, + "loss": 0.164, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": -0.2901855409145355, + "rewards/margins": 8.687891006469727, + "rewards/rejected": -8.9794921875, + "step": 1300 + }, + { + "epoch": 0.9138655462184874, + "grad_norm": 1.2787213325500488, + "learning_rate": 2.2800700111813455e-08, + "logits/chosen": -3.696484327316284, + "logits/rejected": -3.4273438453674316, + "logps/chosen": -322.29998779296875, + "logps/rejected": -381.2250061035156, + "loss": 0.0519, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.4840148985385895, + "rewards/margins": 9.659375190734863, + "rewards/rejected": -9.176953315734863, + "step": 1305 + }, + { + "epoch": 0.9173669467787114, + "grad_norm": 83.05997467041016, + "learning_rate": 2.101173675951928e-08, + "logits/chosen": -3.6820311546325684, + "logits/rejected": -3.4390625953674316, + "logps/chosen": -308.6812438964844, + "logps/rejected": -380.0, + "loss": 0.101, + "rewards/accuracies": 0.96875, + "rewards/chosen": -0.42204588651657104, + "rewards/margins": 9.311718940734863, + "rewards/rejected": -9.731249809265137, + "step": 1310 + }, + { + "epoch": 0.9208683473389355, + "grad_norm": 2.9914722442626953, + "learning_rate": 1.9294347023836475e-08, + "logits/chosen": -3.720703125, + "logits/rejected": -3.44921875, + "logps/chosen": -323.2250061035156, + "logps/rejected": -402.42498779296875, + "loss": 0.0593, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -0.349386602640152, + "rewards/margins": 9.157812118530273, + "rewards/rejected": -9.510156631469727, + "step": 1315 + }, + { + "epoch": 0.9243697478991597, + "grad_norm": 3.304603338241577, + "learning_rate": 1.7648787528578126e-08, + "logits/chosen": -3.708984375, + "logits/rejected": -3.4117188453674316, + "logps/chosen": -324.1875, + "logps/rejected": -398.29998779296875, + "loss": 0.0767, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": -0.3753418028354645, + "rewards/margins": 9.782031059265137, + "rewards/rejected": -10.1640625, + "step": 1320 + }, + { + "epoch": 0.9278711484593838, + "grad_norm": 2.7856390476226807, + "learning_rate": 1.6075304164204385e-08, + "logits/chosen": -3.7066407203674316, + "logits/rejected": -3.455859422683716, + "logps/chosen": -317.7562561035156, + "logps/rejected": -396.13751220703125, + "loss": 0.0949, + "rewards/accuracies": 0.96875, + "rewards/chosen": -0.7166748046875, + "rewards/margins": 8.772656440734863, + "rewards/rejected": -9.496874809265137, + "step": 1325 + }, + { + "epoch": 0.9313725490196079, + "grad_norm": 6.607837200164795, + "learning_rate": 1.4574132051079658e-08, + "logits/chosen": -3.723828077316284, + "logits/rejected": -3.4527344703674316, + "logps/chosen": -336.09375, + "logps/rejected": -404.6000061035156, + "loss": 0.0862, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.12067870795726776, + "rewards/margins": 9.612500190734863, + "rewards/rejected": -9.724218368530273, + "step": 1330 + }, + { + "epoch": 0.9348739495798319, + "grad_norm": 5.29033088684082, + "learning_rate": 1.3145495504339855e-08, + "logits/chosen": -3.685546875, + "logits/rejected": -3.4644532203674316, + "logps/chosen": -315.57501220703125, + "logps/rejected": -370.45001220703125, + "loss": 0.1172, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 0.13958740234375, + "rewards/margins": 8.61328125, + "rewards/rejected": -8.471094131469727, + "step": 1335 + }, + { + "epoch": 0.938375350140056, + "grad_norm": 8.475128173828125, + "learning_rate": 1.1789608000373208e-08, + "logits/chosen": -3.7242188453674316, + "logits/rejected": -3.438281297683716, + "logps/chosen": -313.14373779296875, + "logps/rejected": -389.1000061035156, + "loss": 0.0504, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": -0.16085204482078552, + "rewards/margins": 9.942187309265137, + "rewards/rejected": -10.09765625, + "step": 1340 + }, + { + "epoch": 0.9418767507002801, + "grad_norm": 7.167829990386963, + "learning_rate": 1.0506672144921513e-08, + "logits/chosen": -3.749218702316284, + "logits/rejected": -3.4898438453674316, + "logps/chosen": -304.2875061035156, + "logps/rejected": -383.45001220703125, + "loss": 0.0727, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": 0.04436035081744194, + "rewards/margins": 9.248437881469727, + "rewards/rejected": -9.2109375, + "step": 1345 + }, + { + "epoch": 0.9453781512605042, + "grad_norm": 3.9453182220458984, + "learning_rate": 9.296879642805288e-09, + "logits/chosen": -3.69140625, + "logits/rejected": -3.4085936546325684, + "logps/chosen": -304.625, + "logps/rejected": -367.1625061035156, + "loss": 0.0397, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -0.43353271484375, + "rewards/margins": 9.653905868530273, + "rewards/rejected": -10.091796875, + "step": 1350 + }, + { + "epoch": 0.9488795518207283, + "grad_norm": 9.613517761230469, + "learning_rate": 8.160411269278077e-09, + "logits/chosen": -3.7249999046325684, + "logits/rejected": -3.4898438453674316, + "logps/chosen": -306.33123779296875, + "logps/rejected": -374.20001220703125, + "loss": 0.0415, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": -0.807202160358429, + "rewards/margins": 9.182812690734863, + "rewards/rejected": -9.989062309265137, + "step": 1355 + }, + { + "epoch": 0.9523809523809523, + "grad_norm": 5.10851526260376, + "learning_rate": 7.097436843013782e-09, + "logits/chosen": -3.7222657203674316, + "logits/rejected": -3.462890625, + "logps/chosen": -296.70001220703125, + "logps/rejected": -363.45001220703125, + "loss": 0.056, + "rewards/accuracies": 0.96875, + "rewards/chosen": -0.08205566555261612, + "rewards/margins": 8.882031440734863, + "rewards/rejected": -8.967187881469727, + "step": 1360 + }, + { + "epoch": 0.9558823529411765, + "grad_norm": 2.2889809608459473, + "learning_rate": 6.1081152007310675e-09, + "logits/chosen": -3.716015577316284, + "logits/rejected": -3.470703125, + "logps/chosen": -325.7250061035156, + "logps/rejected": -377.2250061035156, + "loss": 0.1124, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": -0.568286120891571, + "rewards/margins": 9.180078506469727, + "rewards/rejected": -9.743359565734863, + "step": 1365 + }, + { + "epoch": 0.9593837535014006, + "grad_norm": 62.453704833984375, + "learning_rate": 5.192594173459242e-09, + "logits/chosen": -3.706249952316284, + "logits/rejected": -3.4453125, + "logps/chosen": -325.83123779296875, + "logps/rejected": -394.23748779296875, + "loss": 0.1212, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.4400390684604645, + "rewards/margins": 8.687891006469727, + "rewards/rejected": -9.122949600219727, + "step": 1370 + }, + { + "epoch": 0.9628851540616247, + "grad_norm": 1.9323933124542236, + "learning_rate": 4.351010564447976e-09, + "logits/chosen": -3.7621092796325684, + "logits/rejected": -3.457812547683716, + "logps/chosen": -333.2250061035156, + "logps/rejected": -392.4750061035156, + "loss": 0.0452, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": 0.42540282011032104, + "rewards/margins": 9.553125381469727, + "rewards/rejected": -9.125781059265137, + "step": 1375 + }, + { + "epoch": 0.9663865546218487, + "grad_norm": 3.083484411239624, + "learning_rate": 3.5834901287255524e-09, + "logits/chosen": -3.705859422683716, + "logits/rejected": -3.457812547683716, + "logps/chosen": -300.53125, + "logps/rejected": -367.7250061035156, + "loss": 0.0819, + "rewards/accuracies": 0.96875, + "rewards/chosen": 0.20588378608226776, + "rewards/margins": 9.201952934265137, + "rewards/rejected": -9.0, + "step": 1380 + }, + { + "epoch": 0.9698879551820728, + "grad_norm": 7.1853437423706055, + "learning_rate": 2.8901475543076647e-09, + "logits/chosen": -3.735156297683716, + "logits/rejected": -3.4710936546325684, + "logps/chosen": -304.48748779296875, + "logps/rejected": -368.8999938964844, + "loss": 0.1296, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": -0.914019763469696, + "rewards/margins": 9.189453125, + "rewards/rejected": -10.100000381469727, + "step": 1385 + }, + { + "epoch": 0.9733893557422969, + "grad_norm": 11.565399169921875, + "learning_rate": 2.2710864450596336e-09, + "logits/chosen": -3.682812452316284, + "logits/rejected": -3.4730467796325684, + "logps/chosen": -283.4624938964844, + "logps/rejected": -344.75, + "loss": 0.0776, + "rewards/accuracies": 0.96875, + "rewards/chosen": -0.631591796875, + "rewards/margins": 8.482812881469727, + "rewards/rejected": -9.115625381469727, + "step": 1390 + }, + { + "epoch": 0.976890756302521, + "grad_norm": 38.52793502807617, + "learning_rate": 1.7263993052157867e-09, + "logits/chosen": -3.762890577316284, + "logits/rejected": -3.452343702316284, + "logps/chosen": -338.5, + "logps/rejected": -406.45001220703125, + "loss": 0.0462, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.16367188096046448, + "rewards/margins": 9.822656631469727, + "rewards/rejected": -9.652929306030273, + "step": 1395 + }, + { + "epoch": 0.9803921568627451, + "grad_norm": 14.1930570602417, + "learning_rate": 1.2561675255564618e-09, + "logits/chosen": -3.7249999046325684, + "logits/rejected": -3.4789061546325684, + "logps/chosen": -318.2875061035156, + "logps/rejected": -389.04998779296875, + "loss": 0.1392, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.48853760957717896, + "rewards/margins": 8.848437309265137, + "rewards/rejected": -9.339062690734863, + "step": 1400 + }, + { + "epoch": 0.9838935574229691, + "grad_norm": 5.662397861480713, + "learning_rate": 8.60461371246235e-10, + "logits/chosen": -3.7425780296325684, + "logits/rejected": -3.438671827316284, + "logps/chosen": -344.48748779296875, + "logps/rejected": -407.6000061035156, + "loss": 0.1257, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": -0.18222656846046448, + "rewards/margins": 9.806249618530273, + "rewards/rejected": -9.988672256469727, + "step": 1405 + }, + { + "epoch": 0.9873949579831933, + "grad_norm": 2.6685423851013184, + "learning_rate": 5.393399713341517e-10, + "logits/chosen": -3.7054686546325684, + "logits/rejected": -3.4292969703674316, + "logps/chosen": -313.25, + "logps/rejected": -392.75, + "loss": 0.0421, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": -0.519946277141571, + "rewards/margins": 9.720312118530273, + "rewards/rejected": -10.240625381469727, + "step": 1410 + }, + { + "epoch": 0.9908963585434174, + "grad_norm": 5.167869567871094, + "learning_rate": 2.928513099187402e-10, + "logits/chosen": -3.7132811546325684, + "logits/rejected": -3.440624952316284, + "logps/chosen": -312.0, + "logps/rejected": -371.2250061035156, + "loss": 0.078, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 0.597119152545929, + "rewards/margins": 9.114843368530273, + "rewards/rejected": -8.518359184265137, + "step": 1415 + }, + { + "epoch": 0.9943977591036415, + "grad_norm": 5.322803020477295, + "learning_rate": 1.2103221897746818e-10, + "logits/chosen": -3.6968750953674316, + "logits/rejected": -3.438671827316284, + "logps/chosen": -316.2250061035156, + "logps/rejected": -386.54998779296875, + "loss": 0.0977, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.326605200767517, + "rewards/margins": 9.290234565734863, + "rewards/rejected": -10.621484756469727, + "step": 1420 + }, + { + "epoch": 0.9978991596638656, + "grad_norm": 3.4535648822784424, + "learning_rate": 2.3908372863368222e-11, + "logits/chosen": -3.70703125, + "logits/rejected": -3.448046922683716, + "logps/chosen": -308.6000061035156, + "logps/rejected": -380.0249938964844, + "loss": 0.0616, + "rewards/accuracies": 0.96875, + "rewards/chosen": -0.05886230617761612, + "rewards/margins": 9.678125381469727, + "rewards/rejected": -9.73828125, + "step": 1425 + }, { "epoch": 1.0, - "step": 712, + "step": 1428, "total_flos": 0.0, - "train_loss": 0.23233672156092827, - "train_runtime": 2773.067, - "train_samples_per_second": 16.43, - "train_steps_per_second": 0.257 + "train_loss": 0.16149422216962198, + "train_runtime": 9781.4907, + "train_samples_per_second": 4.672, + "train_steps_per_second": 0.146 } ], - "logging_steps": 10, - "max_steps": 712, + "logging_steps": 5, + "max_steps": 1428, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, @@ -1109,14 +4320,14 @@ "should_epoch_stop": false, "should_evaluate": false, "should_log": false, - "should_save": false, - "should_training_stop": false + "should_save": true, + "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, - "train_batch_size": 2, + "train_batch_size": 1, "trial_name": null, "trial_params": null }