{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 4479, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.033489618218352314, "grad_norm": 78.33113098144531, "learning_rate": 2.1875e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -175.2264862060547, "logps/rejected": -218.9656982421875, "loss": 0.6923, "rewards/accuracies": 0.4137499928474426, "rewards/chosen": 0.0005424434202723205, "rewards/margins": 0.0029623538721352816, "rewards/rejected": -0.0024199108593165874, "step": 50 }, { "epoch": 0.06697923643670463, "grad_norm": 106.17163848876953, "learning_rate": 4.419642857142857e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -179.5259246826172, "logps/rejected": -224.7578887939453, "loss": 0.6907, "rewards/accuracies": 0.42124998569488525, "rewards/chosen": -0.005094751715660095, "rewards/margins": 0.00641661649569869, "rewards/rejected": -0.011511369608342648, "step": 100 }, { "epoch": 0.10046885465505694, "grad_norm": 86.04861450195312, "learning_rate": 6.651785714285713e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -165.04095458984375, "logps/rejected": -219.6518096923828, "loss": 0.6756, "rewards/accuracies": 0.5112500190734863, "rewards/chosen": -0.026584235951304436, "rewards/margins": 0.03996539115905762, "rewards/rejected": -0.0665496289730072, "step": 150 }, { "epoch": 0.13395847287340926, "grad_norm": 82.77224731445312, "learning_rate": 8.88392857142857e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -177.75872802734375, "logps/rejected": -223.51528930664062, "loss": 0.6591, "rewards/accuracies": 0.5099999904632568, "rewards/chosen": -0.11609632521867752, "rewards/margins": 0.10106377303600311, "rewards/rejected": -0.21716010570526123, "step": 200 }, { "epoch": 0.16744809109176156, "grad_norm": 135.95346069335938, "learning_rate": 1.1116071428571427e-06, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -180.67254638671875, "logps/rejected": -226.42140197753906, "loss": 0.6295, "rewards/accuracies": 0.5099999904632568, "rewards/chosen": -0.18336135149002075, "rewards/margins": 0.20662552118301392, "rewards/rejected": -0.38998690247535706, "step": 250 }, { "epoch": 0.20093770931011387, "grad_norm": 89.77359771728516, "learning_rate": 1.3348214285714285e-06, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -178.88687133789062, "logps/rejected": -226.58355712890625, "loss": 0.602, "rewards/accuracies": 0.5637500286102295, "rewards/chosen": -0.29084426164627075, "rewards/margins": 0.32938891649246216, "rewards/rejected": -0.6202332377433777, "step": 300 }, { "epoch": 0.23442732752846618, "grad_norm": 89.93605041503906, "learning_rate": 1.558035714285714e-06, "logits/chosen": NaN, "logits/rejected": -1.608971118927002, "logps/chosen": -176.1905059814453, "logps/rejected": -231.0211944580078, "loss": 0.5782, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -0.4454282522201538, "rewards/margins": 0.5162708163261414, "rewards/rejected": -0.9616988897323608, "step": 350 }, { "epoch": 0.2679169457468185, "grad_norm": 113.58289337158203, "learning_rate": 1.7812499999999999e-06, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -176.52401733398438, "logps/rejected": -236.76588439941406, "loss": 0.5478, "rewards/accuracies": 0.6150000095367432, "rewards/chosen": -0.5549299120903015, "rewards/margins": 0.8102107048034668, "rewards/rejected": -1.3651405572891235, "step": 400 }, { "epoch": 0.3014065639651708, "grad_norm": 100.28213500976562, "learning_rate": 1.999999696300462e-06, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -183.1260223388672, "logps/rejected": -235.15631103515625, "loss": 0.5635, "rewards/accuracies": 0.5799999833106995, "rewards/chosen": -0.48344433307647705, "rewards/margins": 0.770007848739624, "rewards/rejected": -1.253452181816101, "step": 450 }, { "epoch": 0.33489618218352313, "grad_norm": 90.32833099365234, "learning_rate": 1.999210181452139e-06, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -180.36907958984375, "logps/rejected": -232.14285278320312, "loss": 0.5376, "rewards/accuracies": 0.6087499856948853, "rewards/chosen": -0.5261387825012207, "rewards/margins": 0.8372372984886169, "rewards/rejected": -1.3633761405944824, "step": 500 }, { "epoch": 0.3683858004018754, "grad_norm": 72.57466125488281, "learning_rate": 1.996903560165487e-06, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -175.88233947753906, "logps/rejected": -242.15728759765625, "loss": 0.5083, "rewards/accuracies": 0.6225000023841858, "rewards/chosen": -0.5493210554122925, "rewards/margins": 1.0930429697036743, "rewards/rejected": -1.6423640251159668, "step": 550 }, { "epoch": 0.40187541862022774, "grad_norm": 47.55934143066406, "learning_rate": 1.993083334596579e-06, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -184.1678924560547, "logps/rejected": -251.43661499023438, "loss": 0.5193, "rewards/accuracies": 0.6225000023841858, "rewards/chosen": -0.7250985503196716, "rewards/margins": 1.2086968421936035, "rewards/rejected": -1.9337953329086304, "step": 600 }, { "epoch": 0.43536503683858, "grad_norm": 90.7481460571289, "learning_rate": 1.987755305015383e-06, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -196.693359375, "logps/rejected": -247.3010711669922, "loss": 0.516, "rewards/accuracies": 0.6137499809265137, "rewards/chosen": -0.6984607577323914, "rewards/margins": 1.173628807067871, "rewards/rejected": -1.8720895051956177, "step": 650 }, { "epoch": 0.46885465505693236, "grad_norm": 86.08389282226562, "learning_rate": 1.980927560999178e-06, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -186.29693603515625, "logps/rejected": -245.04824829101562, "loss": 0.5057, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.6868166327476501, "rewards/margins": 1.367271900177002, "rewards/rejected": -2.0540883541107178, "step": 700 }, { "epoch": 0.5023442732752846, "grad_norm": 40.12553405761719, "learning_rate": 1.9726104691501045e-06, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -179.41378784179688, "logps/rejected": -240.62547302246094, "loss": 0.5132, "rewards/accuracies": 0.5975000262260437, "rewards/chosen": -0.5570769309997559, "rewards/margins": 1.2463946342468262, "rewards/rejected": -1.803471326828003, "step": 750 }, { "epoch": 0.535833891493637, "grad_norm": 36.09309005737305, "learning_rate": 1.9628166573554945e-06, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -170.22169494628906, "logps/rejected": -239.9406280517578, "loss": 0.4553, "rewards/accuracies": 0.6449999809265137, "rewards/chosen": -0.5568282604217529, "rewards/margins": 1.5600597858428955, "rewards/rejected": -2.1168878078460693, "step": 800 }, { "epoch": 0.5693235097119893, "grad_norm": 88.8606185913086, "learning_rate": 1.951560995614879e-06, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -175.4136199951172, "logps/rejected": -241.44386291503906, "loss": 0.4912, "rewards/accuracies": 0.6175000071525574, "rewards/chosen": -0.6789398193359375, "rewards/margins": 1.448940634727478, "rewards/rejected": -2.127880573272705, "step": 850 }, { "epoch": 0.6028131279303416, "grad_norm": 37.501346588134766, "learning_rate": 1.9388605734627843e-06, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -183.4543914794922, "logps/rejected": -241.45433044433594, "loss": 0.505, "rewards/accuracies": 0.6212499737739563, "rewards/chosen": -0.719947338104248, "rewards/margins": 1.5332283973693848, "rewards/rejected": -2.253175735473633, "step": 900 }, { "epoch": 0.6363027461486939, "grad_norm": 58.78173065185547, "learning_rate": 1.9247346740215936e-06, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -182.4608612060547, "logps/rejected": -236.8692169189453, "loss": 0.4756, "rewards/accuracies": 0.6274999976158142, "rewards/chosen": -0.5931037068367004, "rewards/margins": 1.6174336671829224, "rewards/rejected": -2.2105374336242676, "step": 950 }, { "epoch": 0.6697923643670463, "grad_norm": 53.627410888671875, "learning_rate": 1.909204744723877e-06, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -169.64356994628906, "logps/rejected": -238.07931518554688, "loss": 0.4699, "rewards/accuracies": 0.625, "rewards/chosen": -0.5164381265640259, "rewards/margins": 1.6023368835449219, "rewards/rejected": -2.1187753677368164, "step": 1000 }, { "epoch": 0.7032819825853985, "grad_norm": 47.64691162109375, "learning_rate": 1.8922943647486314e-06, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -174.08212280273438, "logps/rejected": -251.6885223388672, "loss": 0.4309, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.560505211353302, "rewards/margins": 1.9433872699737549, "rewards/rejected": -2.503892421722412, "step": 1050 }, { "epoch": 0.7367716008037508, "grad_norm": 58.94224166870117, "learning_rate": 1.8740292092208816e-06, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -162.09487915039062, "logps/rejected": -236.79824829101562, "loss": 0.4293, "rewards/accuracies": 0.6524999737739563, "rewards/chosen": -0.6041057705879211, "rewards/margins": 2.0014426708221436, "rewards/rejected": -2.60554838180542, "step": 1100 }, { "epoch": 0.7702612190221031, "grad_norm": 41.707763671875, "learning_rate": 1.8544370102289943e-06, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -177.0761260986328, "logps/rejected": -240.7725067138672, "loss": 0.4419, "rewards/accuracies": 0.6612499952316284, "rewards/chosen": -0.6522895097732544, "rewards/margins": 1.7689578533172607, "rewards/rejected": -2.4212474822998047, "step": 1150 }, { "epoch": 0.8037508372404555, "grad_norm": 45.48369216918945, "learning_rate": 1.83354751471889e-06, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -184.2169952392578, "logps/rejected": -264.9205322265625, "loss": 0.4503, "rewards/accuracies": 0.6549999713897705, "rewards/chosen": -0.49645543098449707, "rewards/margins": 2.04986572265625, "rewards/rejected": -2.546321392059326, "step": 1200 }, { "epoch": 0.8372404554588078, "grad_norm": 51.16058349609375, "learning_rate": 1.8113924393290904e-06, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -182.03074645996094, "logps/rejected": -249.8163604736328, "loss": 0.4319, "rewards/accuracies": 0.6612499952316284, "rewards/chosen": -0.6471911072731018, "rewards/margins": 2.1099319458007812, "rewards/rejected": -2.7571229934692383, "step": 1250 }, { "epoch": 0.87073007367716, "grad_norm": 64.02259063720703, "learning_rate": 1.7880054222351658e-06, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -178.18972778320312, "logps/rejected": -237.3641815185547, "loss": 0.4155, "rewards/accuracies": 0.6725000143051147, "rewards/chosen": -0.38780125975608826, "rewards/margins": 1.9852185249328613, "rewards/rejected": -2.3730199337005615, "step": 1300 }, { "epoch": 0.9042196918955124, "grad_norm": 35.12641525268555, "learning_rate": 1.763421972076705e-06, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -175.52285766601562, "logps/rejected": -247.11244201660156, "loss": 0.4359, "rewards/accuracies": 0.6512500047683716, "rewards/chosen": -0.493091344833374, "rewards/margins": 1.8931076526641846, "rewards/rejected": -2.3861987590789795, "step": 1350 }, { "epoch": 0.9377093101138647, "grad_norm": 64.41110229492188, "learning_rate": 1.7376794140443474e-06, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -178.29629516601562, "logps/rejected": -234.5249481201172, "loss": 0.4512, "rewards/accuracies": 0.6549999713897705, "rewards/chosen": -0.4724200367927551, "rewards/margins": 1.9340243339538574, "rewards/rejected": -2.4064440727233887, "step": 1400 }, { "epoch": 0.971198928332217, "grad_norm": 26.93653106689453, "learning_rate": 1.7108168332087366e-06, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -175.42259216308594, "logps/rejected": -243.82032775878906, "loss": 0.4343, "rewards/accuracies": 0.6512500047683716, "rewards/chosen": -0.3961036205291748, "rewards/margins": 1.8803616762161255, "rewards/rejected": -2.27646541595459, "step": 1450 }, { "epoch": 1.0046885465505693, "grad_norm": 74.74053955078125, "learning_rate": 1.682875015177438e-06, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -174.56732177734375, "logps/rejected": -246.36451721191406, "loss": 0.3957, "rewards/accuracies": 0.6800000071525574, "rewards/chosen": -0.34164169430732727, "rewards/margins": 2.248396635055542, "rewards/rejected": -2.590038537979126, "step": 1500 }, { "epoch": 1.0381781647689217, "grad_norm": 58.65504455566406, "learning_rate": 1.6538963841699207e-06, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -176.5469207763672, "logps/rejected": -258.92706298828125, "loss": 0.2861, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -0.2739707827568054, "rewards/margins": 3.0113985538482666, "rewards/rejected": -3.2853691577911377, "step": 1550 }, { "epoch": 1.0716677829872738, "grad_norm": 59.74324417114258, "learning_rate": 1.6239249386046274e-06, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -177.00692749023438, "logps/rejected": -255.23556518554688, "loss": 0.2914, "rewards/accuracies": 0.7549999952316284, "rewards/chosen": -0.4652925729751587, "rewards/margins": 3.098710298538208, "rewards/rejected": -3.564002752304077, "step": 1600 }, { "epoch": 1.1051574012056262, "grad_norm": 37.80025863647461, "learning_rate": 1.593006184295927e-06, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -185.12716674804688, "logps/rejected": -254.19509887695312, "loss": 0.2798, "rewards/accuracies": 0.7524999976158142, "rewards/chosen": -0.28863173723220825, "rewards/margins": 3.227825880050659, "rewards/rejected": -3.516458034515381, "step": 1650 }, { "epoch": 1.1386470194239786, "grad_norm": 40.97309875488281, "learning_rate": 1.5611870653623825e-06, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -182.1793975830078, "logps/rejected": -245.0845184326172, "loss": 0.2778, "rewards/accuracies": 0.7450000047683716, "rewards/chosen": -0.3949226438999176, "rewards/margins": 3.3151471614837646, "rewards/rejected": -3.7100696563720703, "step": 1700 }, { "epoch": 1.1721366376423308, "grad_norm": 61.272247314453125, "learning_rate": 1.5285158929512291e-06, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -174.18487548828125, "logps/rejected": -247.96957397460938, "loss": 0.3048, "rewards/accuracies": 0.71875, "rewards/chosen": -0.4471362233161926, "rewards/margins": 3.481740951538086, "rewards/rejected": -3.928877830505371, "step": 1750 }, { "epoch": 1.2056262558606832, "grad_norm": 20.384906768798828, "learning_rate": 1.4950422718872916e-06, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -177.91143798828125, "logps/rejected": -264.8081970214844, "loss": 0.2738, "rewards/accuracies": 0.7574999928474426, "rewards/chosen": -0.4734611213207245, "rewards/margins": 3.4893076419830322, "rewards/rejected": -3.962768793106079, "step": 1800 }, { "epoch": 1.2391158740790356, "grad_norm": 46.84432601928711, "learning_rate": 1.4608170253576945e-06, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -171.02236938476562, "logps/rejected": -259.7280578613281, "loss": 0.2928, "rewards/accuracies": 0.7262499928474426, "rewards/chosen": -0.6498711109161377, "rewards/margins": 3.556124210357666, "rewards/rejected": -4.205995082855225, "step": 1850 }, { "epoch": 1.2726054922973877, "grad_norm": 40.36602020263672, "learning_rate": 1.4258921177467371e-06, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -176.40257263183594, "logps/rejected": -251.6402130126953, "loss": 0.301, "rewards/accuracies": 0.7325000166893005, "rewards/chosen": -0.7374945878982544, "rewards/margins": 3.618178606033325, "rewards/rejected": -4.355673789978027, "step": 1900 }, { "epoch": 1.3060951105157401, "grad_norm": 33.35322952270508, "learning_rate": 1.3903205757380715e-06, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -177.98854064941406, "logps/rejected": -259.6983337402344, "loss": 0.2985, "rewards/accuracies": 0.7275000214576721, "rewards/chosen": -0.7513535022735596, "rewards/margins": 3.433237314224243, "rewards/rejected": -4.184591293334961, "step": 1950 }, { "epoch": 1.3395847287340925, "grad_norm": 31.858760833740234, "learning_rate": 1.3541564078039942e-06, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -176.97511291503906, "logps/rejected": -267.1122131347656, "loss": 0.307, "rewards/accuracies": 0.7174999713897705, "rewards/chosen": -0.6912581920623779, "rewards/margins": 3.4836156368255615, "rewards/rejected": -4.1748738288879395, "step": 2000 }, { "epoch": 1.3730743469524447, "grad_norm": 40.272186279296875, "learning_rate": 1.3174545222040757e-06, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -181.2541046142578, "logps/rejected": -267.8948974609375, "loss": 0.2764, "rewards/accuracies": 0.7612500190734863, "rewards/chosen": -0.5613307356834412, "rewards/margins": 3.6199841499328613, "rewards/rejected": -4.181314468383789, "step": 2050 }, { "epoch": 1.406563965170797, "grad_norm": 20.189088821411133, "learning_rate": 1.2802706436176447e-06, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -186.3399658203125, "logps/rejected": -275.252685546875, "loss": 0.2673, "rewards/accuracies": 0.7512500286102295, "rewards/chosen": -0.49821099638938904, "rewards/margins": 3.6726813316345215, "rewards/rejected": -4.170892238616943, "step": 2100 }, { "epoch": 1.4400535833891492, "grad_norm": 28.09309196472168, "learning_rate": 1.2426612285366904e-06, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -180.54571533203125, "logps/rejected": -272.14337158203125, "loss": 0.2833, "rewards/accuracies": 0.7649999856948853, "rewards/chosen": -0.5274211168289185, "rewards/margins": 3.785543203353882, "rewards/rejected": -4.31296443939209, "step": 2150 }, { "epoch": 1.4735432016075016, "grad_norm": 5.396151542663574, "learning_rate": 1.2046833795476566e-06, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -178.48960876464844, "logps/rejected": -268.61944580078125, "loss": 0.2594, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -0.3929290771484375, "rewards/margins": 3.8942084312438965, "rewards/rejected": -4.287137508392334, "step": 2200 }, { "epoch": 1.507032819825854, "grad_norm": 26.636991500854492, "learning_rate": 1.16639475863226e-06, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -183.34547424316406, "logps/rejected": -259.8311462402344, "loss": 0.3026, "rewards/accuracies": 0.7200000286102295, "rewards/chosen": -0.5500699281692505, "rewards/margins": 3.565783739089966, "rewards/rejected": -4.115853786468506, "step": 2250 }, { "epoch": 1.5405224380442064, "grad_norm": 14.03653335571289, "learning_rate": 1.1278534996189831e-06, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -182.8995361328125, "logps/rejected": -273.84112548828125, "loss": 0.2603, "rewards/accuracies": 0.7487499713897705, "rewards/chosen": -0.5162584185600281, "rewards/margins": 4.0679030418396, "rewards/rejected": -4.584161758422852, "step": 2300 }, { "epoch": 1.5740120562625586, "grad_norm": 67.45540618896484, "learning_rate": 1.0891181199181518e-06, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -176.06849670410156, "logps/rejected": -265.9678649902344, "loss": 0.272, "rewards/accuracies": 0.7475000023841858, "rewards/chosen": -0.5778465867042542, "rewards/margins": 3.9320404529571533, "rewards/rejected": -4.509886264801025, "step": 2350 }, { "epoch": 1.607501674480911, "grad_norm": 21.127580642700195, "learning_rate": 1.0502474316746242e-06, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -178.6305694580078, "logps/rejected": -265.5202331542969, "loss": 0.2839, "rewards/accuracies": 0.7462499737739563, "rewards/chosen": -0.5587973594665527, "rewards/margins": 3.9246935844421387, "rewards/rejected": -4.48349142074585, "step": 2400 }, { "epoch": 1.6409912926992631, "grad_norm": 47.24773025512695, "learning_rate": 1.0113004524729797e-06, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -196.45948791503906, "logps/rejected": -272.1256408691406, "loss": 0.2791, "rewards/accuracies": 0.7587500214576721, "rewards/chosen": -0.5817875862121582, "rewards/margins": 3.766108989715576, "rewards/rejected": -4.347896099090576, "step": 2450 }, { "epoch": 1.6744809109176155, "grad_norm": 20.178668975830078, "learning_rate": 9.723363157307888e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -183.7681427001953, "logps/rejected": -268.5182800292969, "loss": 0.2744, "rewards/accuracies": 0.7475000023841858, "rewards/chosen": -0.5075680017471313, "rewards/margins": 3.9134867191314697, "rewards/rejected": -4.421054840087891, "step": 2500 }, { "epoch": 1.707970529135968, "grad_norm": 31.073015213012695, "learning_rate": 9.334141809160118e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -178.35658264160156, "logps/rejected": -265.6587829589844, "loss": 0.2405, "rewards/accuracies": 0.7712500095367432, "rewards/chosen": -0.6600850820541382, "rewards/margins": 4.134018421173096, "rewards/rejected": -4.794103622436523, "step": 2550 }, { "epoch": 1.7414601473543203, "grad_norm": 36.3228759765625, "learning_rate": 8.945931437248468e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -178.47000122070312, "logps/rejected": -270.1788635253906, "loss": 0.2674, "rewards/accuracies": 0.7524999976158142, "rewards/chosen": -0.6616349816322327, "rewards/margins": 4.066000461578369, "rewards/rejected": -4.727634906768799, "step": 2600 }, { "epoch": 1.7749497655726725, "grad_norm": 27.108051300048828, "learning_rate": 8.559321463564014e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -175.67808532714844, "logps/rejected": -261.2061767578125, "loss": 0.2494, "rewards/accuracies": 0.7549999952316284, "rewards/chosen": -0.5604009032249451, "rewards/margins": 4.31578254699707, "rewards/rejected": -4.876183032989502, "step": 2650 }, { "epoch": 1.8084393837910246, "grad_norm": 54.821876525878906, "learning_rate": 8.174898880204195e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -178.19236755371094, "logps/rejected": -269.1416015625, "loss": 0.2817, "rewards/accuracies": 0.7400000095367432, "rewards/chosen": -0.5425779223442078, "rewards/margins": 3.9950203895568848, "rewards/rejected": -4.537598133087158, "step": 2700 }, { "epoch": 1.841929002009377, "grad_norm": 36.13364791870117, "learning_rate": 7.793247358139428e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -179.92677307128906, "logps/rejected": -266.75799560546875, "loss": 0.2885, "rewards/accuracies": 0.7387499809265137, "rewards/chosen": -0.5648588538169861, "rewards/margins": 3.864666700363159, "rewards/rejected": -4.429525852203369, "step": 2750 }, { "epoch": 1.8754186202277294, "grad_norm": 24.641510009765625, "learning_rate": 7.414946361022179e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -171.00909423828125, "logps/rejected": -273.5279541015625, "loss": 0.2695, "rewards/accuracies": 0.7400000095367432, "rewards/chosen": -0.4850202798843384, "rewards/margins": 4.063894271850586, "rewards/rejected": -4.548914432525635, "step": 2800 }, { "epoch": 1.9089082384460818, "grad_norm": 25.44546127319336, "learning_rate": 7.040570265384029e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -186.17147827148438, "logps/rejected": -272.64111328125, "loss": 0.2881, "rewards/accuracies": 0.7512500286102295, "rewards/chosen": -0.5362930297851562, "rewards/margins": 4.026025295257568, "rewards/rejected": -4.562318325042725, "step": 2850 }, { "epoch": 1.942397856664434, "grad_norm": 62.34092330932617, "learning_rate": 6.670687488556586e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -188.8939208984375, "logps/rejected": -270.8504943847656, "loss": 0.2685, "rewards/accuracies": 0.7337499856948853, "rewards/chosen": -0.3625078499317169, "rewards/margins": 4.072076797485352, "rewards/rejected": -4.434584617614746, "step": 2900 }, { "epoch": 1.9758874748827864, "grad_norm": 16.188819885253906, "learning_rate": 6.305859625640224e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -177.49630737304688, "logps/rejected": -280.4139404296875, "loss": 0.2755, "rewards/accuracies": 0.7475000023841858, "rewards/chosen": -0.6155076026916504, "rewards/margins": 4.242664337158203, "rewards/rejected": -4.8581719398498535, "step": 2950 }, { "epoch": 2.0093770931011385, "grad_norm": 35.435707092285156, "learning_rate": 5.946640596831101e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -166.32289123535156, "logps/rejected": -263.216552734375, "loss": 0.2391, "rewards/accuracies": 0.7712500095367432, "rewards/chosen": -0.6572730541229248, "rewards/margins": 4.339555740356445, "rewards/rejected": -4.996828556060791, "step": 3000 }, { "epoch": 2.042866711319491, "grad_norm": 42.23343276977539, "learning_rate": 5.59357580640101e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -179.9312744140625, "logps/rejected": -277.5908508300781, "loss": 0.213, "rewards/accuracies": 0.7850000262260437, "rewards/chosen": -0.35315731167793274, "rewards/margins": 4.545411586761475, "rewards/rejected": -4.898569107055664, "step": 3050 }, { "epoch": 2.0763563295378433, "grad_norm": 2.853132486343384, "learning_rate": 5.247201314606984e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -180.59486389160156, "logps/rejected": -276.373291015625, "loss": 0.2047, "rewards/accuracies": 0.7950000166893005, "rewards/chosen": -0.3648325800895691, "rewards/margins": 4.745596885681152, "rewards/rejected": -5.110429763793945, "step": 3100 }, { "epoch": 2.1098459477561957, "grad_norm": 22.07088851928711, "learning_rate": 4.90804302378802e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -177.32708740234375, "logps/rejected": -260.5697021484375, "loss": 0.2054, "rewards/accuracies": 0.7925000190734863, "rewards/chosen": -0.48022788763046265, "rewards/margins": 4.517958641052246, "rewards/rejected": -4.998186111450195, "step": 3150 }, { "epoch": 2.1433355659745477, "grad_norm": 50.728519439697266, "learning_rate": 4.57661587988459e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -177.0932159423828, "logps/rejected": -270.6129150390625, "loss": 0.236, "rewards/accuracies": 0.7574999928474426, "rewards/chosen": -0.4882276654243469, "rewards/margins": 4.606672286987305, "rewards/rejected": -5.094900131225586, "step": 3200 }, { "epoch": 2.1768251841929, "grad_norm": 19.410276412963867, "learning_rate": 4.253423090593318e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -185.2410125732422, "logps/rejected": -282.7039794921875, "loss": 0.2242, "rewards/accuracies": 0.7612500190734863, "rewards/chosen": -0.5257070064544678, "rewards/margins": 4.692570209503174, "rewards/rejected": -5.218277454376221, "step": 3250 }, { "epoch": 2.2103148024112524, "grad_norm": 45.68756103515625, "learning_rate": 3.938955361343912e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -175.8925018310547, "logps/rejected": -284.1990966796875, "loss": 0.2259, "rewards/accuracies": 0.7699999809265137, "rewards/chosen": -0.605311930179596, "rewards/margins": 4.8395843505859375, "rewards/rejected": -5.444896221160889, "step": 3300 }, { "epoch": 2.243804420629605, "grad_norm": 51.53227996826172, "learning_rate": 3.6336901502583364e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -177.85601806640625, "logps/rejected": -275.8158874511719, "loss": 0.2048, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -0.6794506907463074, "rewards/margins": 4.734764575958252, "rewards/rejected": -5.414215087890625, "step": 3350 }, { "epoch": 2.2772940388479572, "grad_norm": 3.569408893585205, "learning_rate": 3.3380909432234807e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -182.00836181640625, "logps/rejected": -280.286376953125, "loss": 0.1999, "rewards/accuracies": 0.7950000166893005, "rewards/chosen": -0.6098263263702393, "rewards/margins": 4.961060047149658, "rewards/rejected": -5.570886611938477, "step": 3400 }, { "epoch": 2.3107836570663096, "grad_norm": 27.362163543701172, "learning_rate": 3.0526065501779184e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -172.97593688964844, "logps/rejected": -275.5477600097656, "loss": 0.2184, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.6930285096168518, "rewards/margins": 4.821885585784912, "rewards/rejected": -5.514913558959961, "step": 3450 }, { "epoch": 2.3442732752846616, "grad_norm": 28.243000030517578, "learning_rate": 2.7776704236812454e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -182.44705200195312, "logps/rejected": -277.888427734375, "loss": 0.2128, "rewards/accuracies": 0.7649999856948853, "rewards/chosen": -0.6010170578956604, "rewards/margins": 5.026294708251953, "rewards/rejected": -5.6273112297058105, "step": 3500 }, { "epoch": 2.377762893503014, "grad_norm": 14.03532886505127, "learning_rate": 2.5137000008006437e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -182.77134704589844, "logps/rejected": -279.57769775390625, "loss": 0.21, "rewards/accuracies": 0.7799999713897705, "rewards/chosen": -0.7788973450660706, "rewards/margins": 5.022655010223389, "rewards/rejected": -5.801552772521973, "step": 3550 }, { "epoch": 2.4112525117213663, "grad_norm": 35.019554138183594, "learning_rate": 2.261096069313816e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -187.45738220214844, "logps/rejected": -281.279541015625, "loss": 0.1887, "rewards/accuracies": 0.8075000047683716, "rewards/chosen": -0.7265406847000122, "rewards/margins": 5.097284317016602, "rewards/rejected": -5.823824882507324, "step": 3600 }, { "epoch": 2.4447421299397187, "grad_norm": 25.041046142578125, "learning_rate": 2.020242159190646e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -176.86915588378906, "logps/rejected": -277.746826171875, "loss": 0.2311, "rewards/accuracies": 0.7587500214576721, "rewards/chosen": -0.786669135093689, "rewards/margins": 4.789151191711426, "rewards/rejected": -5.575820446014404, "step": 3650 }, { "epoch": 2.478231748158071, "grad_norm": 20.99360466003418, "learning_rate": 1.7915039602775062e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -182.3199462890625, "logps/rejected": -273.0755920410156, "loss": 0.2429, "rewards/accuracies": 0.7737500071525574, "rewards/chosen": -0.8147923946380615, "rewards/margins": 4.847590446472168, "rewards/rejected": -5.66238260269165, "step": 3700 }, { "epoch": 2.511721366376423, "grad_norm": 18.44826889038086, "learning_rate": 1.5752287670682861e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -170.71795654296875, "logps/rejected": -276.1592102050781, "loss": 0.2043, "rewards/accuracies": 0.7862499952316284, "rewards/chosen": -0.638399064540863, "rewards/margins": 5.212125301361084, "rewards/rejected": -5.850524425506592, "step": 3750 }, { "epoch": 2.5452109845947755, "grad_norm": 40.779659271240234, "learning_rate": 1.3717449514052314e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -180.7264404296875, "logps/rejected": -284.6885986328125, "loss": 0.2033, "rewards/accuracies": 0.7962499856948853, "rewards/chosen": -0.882935106754303, "rewards/margins": 5.128498554229736, "rewards/rejected": -6.0114336013793945, "step": 3800 }, { "epoch": 2.578700602813128, "grad_norm": 44.556678771972656, "learning_rate": 1.1813614639101088e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -183.99533081054688, "logps/rejected": -275.25518798828125, "loss": 0.2274, "rewards/accuracies": 0.7774999737739563, "rewards/chosen": -0.703125, "rewards/margins": 5.014428615570068, "rewards/rejected": -5.717553615570068, "step": 3850 }, { "epoch": 2.6121902210314802, "grad_norm": 61.39085388183594, "learning_rate": 1.0043673649027517e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -178.3540802001953, "logps/rejected": -282.1649475097656, "loss": 0.2097, "rewards/accuracies": 0.7662500143051147, "rewards/chosen": -0.683403730392456, "rewards/margins": 5.063638687133789, "rewards/rejected": -5.747043609619141, "step": 3900 }, { "epoch": 2.6456798392498326, "grad_norm": 58.0173454284668, "learning_rate": 8.410313855191464e-08, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -178.94400024414062, "logps/rejected": -286.5594177246094, "loss": 0.2042, "rewards/accuracies": 0.7862499952316284, "rewards/chosen": -0.8088821172714233, "rewards/margins": 5.067000865936279, "rewards/rejected": -5.875882625579834, "step": 3950 }, { "epoch": 2.679169457468185, "grad_norm": 16.31562042236328, "learning_rate": 6.916015196954383e-08, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -185.46673583984375, "logps/rejected": -288.2527770996094, "loss": 0.217, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -0.7252050638198853, "rewards/margins": 5.204960823059082, "rewards/rejected": -5.930166244506836, "step": 4000 }, { "epoch": 2.7126590756865374, "grad_norm": 20.799222946166992, "learning_rate": 5.5630464763733787e-08, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -188.50820922851562, "logps/rejected": -288.9837646484375, "loss": 0.2258, "rewards/accuracies": 0.7724999785423279, "rewards/chosen": -0.7981621026992798, "rewards/margins": 5.062735557556152, "rewards/rejected": -5.860898017883301, "step": 4050 }, { "epoch": 2.7461486939048894, "grad_norm": 18.682947158813477, "learning_rate": 4.353461913466405e-08, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -178.44317626953125, "logps/rejected": -266.35333251953125, "loss": 0.2426, "rewards/accuracies": 0.7524999976158142, "rewards/chosen": -0.6803594827651978, "rewards/margins": 4.8590497970581055, "rewards/rejected": -5.539409160614014, "step": 4100 }, { "epoch": 2.7796383121232418, "grad_norm": 54.06953048706055, "learning_rate": 3.2890980272783255e-08, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -180.65658569335938, "logps/rejected": -280.3162536621094, "loss": 0.2086, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -0.812857449054718, "rewards/margins": 5.271449565887451, "rewards/rejected": -6.0843071937561035, "step": 4150 }, { "epoch": 2.813127930341594, "grad_norm": 12.436116218566895, "learning_rate": 2.371570847483839e-08, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -180.7625732421875, "logps/rejected": -277.9272766113281, "loss": 0.2046, "rewards/accuracies": 0.78125, "rewards/chosen": -0.6954517364501953, "rewards/margins": 5.145771026611328, "rewards/rejected": -5.841222763061523, "step": 4200 }, { "epoch": 2.8466175485599465, "grad_norm": 66.9225845336914, "learning_rate": 1.6022734607604393e-08, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -187.79019165039062, "logps/rejected": -282.13323974609375, "loss": 0.2096, "rewards/accuracies": 0.7925000190734863, "rewards/chosen": -0.8357629179954529, "rewards/margins": 5.103863716125488, "rewards/rejected": -5.939626693725586, "step": 4250 }, { "epoch": 2.8801071667782985, "grad_norm": 15.983145713806152, "learning_rate": 9.823738956571182e-09, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -191.03807067871094, "logps/rejected": -292.3773193359375, "loss": 0.206, "rewards/accuracies": 0.7912499904632568, "rewards/chosen": -0.6932557821273804, "rewards/margins": 5.146268367767334, "rewards/rejected": -5.839523792266846, "step": 4300 }, { "epoch": 2.913596784996651, "grad_norm": 33.383487701416016, "learning_rate": 5.128133491700715e-09, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -186.7404327392578, "logps/rejected": -289.3056945800781, "loss": 0.1936, "rewards/accuracies": 0.7975000143051147, "rewards/chosen": -0.7487243413925171, "rewards/margins": 5.300227642059326, "rewards/rejected": -6.048952579498291, "step": 4350 }, { "epoch": 2.9470864032150033, "grad_norm": 3.542743682861328, "learning_rate": 1.9430475771796684e-09, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -190.31752014160156, "logps/rejected": -268.015380859375, "loss": 0.2124, "rewards/accuracies": 0.7862499952316284, "rewards/chosen": -0.6255255937576294, "rewards/margins": 4.9648332595825195, "rewards/rejected": -5.590358257293701, "step": 4400 }, { "epoch": 2.9805760214333556, "grad_norm": 19.205642700195312, "learning_rate": 2.733171468656259e-10, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -177.03684997558594, "logps/rejected": -277.01495361328125, "loss": 0.2059, "rewards/accuracies": 0.7837499976158142, "rewards/chosen": -0.7730162739753723, "rewards/margins": 5.190572738647461, "rewards/rejected": -5.963588714599609, "step": 4450 } ], "logging_steps": 50, "max_steps": 4479, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 200, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }