{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.5714285714285714, "eval_steps": 500, "global_step": 500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "completion_length": 2571.2083587646484, "epoch": 0.001142857142857143, "grad_norm": 0.19757072627544403, "kl": 0.0, "learning_rate": 2e-08, "loss": -0.0, "reward": 0.17825700528919697, "reward_std": 0.804851658642292, "rewards/cosine_scaled_reward": -0.015534311532974243, "rewards/format_reward": 0.5208333488553762, "step": 1 }, { "completion_length": 2804.395881652832, "epoch": 0.002285714285714286, "grad_norm": 0.18212556838989258, "kl": 0.0, "learning_rate": 4e-08, "loss": -0.0, "reward": -0.1223274078220129, "reward_std": 0.46937728114426136, "rewards/cosine_scaled_reward": -0.04980122856795788, "rewards/format_reward": 0.37500000558793545, "step": 2 }, { "completion_length": 3303.9583435058594, "epoch": 0.0034285714285714284, "grad_norm": 0.16496071219444275, "kl": 4.756450653076172e-05, "learning_rate": 6e-08, "loss": 0.0, "reward": -0.5349157964810729, "reward_std": 0.4061080813407898, "rewards/cosine_scaled_reward": -0.2544318288564682, "rewards/format_reward": 0.1458333395421505, "step": 3 }, { "completion_length": 2260.6875228881836, "epoch": 0.004571428571428572, "grad_norm": 0.26921820640563965, "kl": 3.6716461181640625e-05, "learning_rate": 8e-08, "loss": 0.0, "reward": 0.1224252681422513, "reward_std": 0.812014251947403, "rewards/cosine_scaled_reward": -0.09193882904946804, "rewards/format_reward": 0.6458333414047956, "step": 4 }, { "completion_length": 3346.6041870117188, "epoch": 0.005714285714285714, "grad_norm": 0.1722181737422943, "kl": 4.376843571662903e-05, "learning_rate": 1e-07, "loss": 0.0, "reward": -0.38172444701194763, "reward_std": 0.5492917411029339, "rewards/cosine_scaled_reward": -0.22456051781773567, "rewards/format_reward": 0.25000000558793545, "step": 5 }, { "completion_length": 3113.7500610351562, "epoch": 0.006857142857142857, "grad_norm": 0.21919280290603638, "kl": 4.5668333768844604e-05, "learning_rate": 1.2e-07, "loss": 0.0, "reward": -0.2863283231854439, "reward_std": 0.5716646872460842, "rewards/cosine_scaled_reward": -0.19011332368245348, "rewards/format_reward": 0.29166667349636555, "step": 6 }, { "completion_length": 3158.8333740234375, "epoch": 0.008, "grad_norm": 0.1657346487045288, "kl": 2.4143606424331665e-05, "learning_rate": 1.4e-07, "loss": 0.0, "reward": -0.0321456715464592, "reward_std": 0.6449971524998546, "rewards/cosine_scaled_reward": -0.1226729229092598, "rewards/format_reward": 0.4791666828095913, "step": 7 }, { "completion_length": 2815.1250610351562, "epoch": 0.009142857142857144, "grad_norm": 0.15989142656326294, "kl": 2.526119351387024e-05, "learning_rate": 1.6e-07, "loss": 0.0, "reward": 0.3240444455295801, "reward_std": 0.883681982755661, "rewards/cosine_scaled_reward": 0.1182668274268508, "rewards/format_reward": 0.5000000111758709, "step": 8 }, { "completion_length": 3149.0625915527344, "epoch": 0.010285714285714285, "grad_norm": 0.21666041016578674, "kl": 4.5686960220336914e-05, "learning_rate": 1.8e-07, "loss": 0.0, "reward": -0.18796737492084503, "reward_std": 0.7943232320249081, "rewards/cosine_scaled_reward": -0.13237779098562896, "rewards/format_reward": 0.27083334140479565, "step": 9 }, { "completion_length": 2782.3750228881836, "epoch": 0.011428571428571429, "grad_norm": 0.18924005329608917, "kl": 3.37064266204834e-05, "learning_rate": 2e-07, "loss": 0.0, "reward": 0.11666052648797631, "reward_std": 0.879204161465168, "rewards/cosine_scaled_reward": -0.012820702977478504, "rewards/format_reward": 0.41666667349636555, "step": 10 }, { "completion_length": 3473.062530517578, "epoch": 0.012571428571428572, "grad_norm": 0.20001095533370972, "kl": 4.3779611587524414e-05, "learning_rate": 2.1999999999999998e-07, "loss": 0.0, "reward": -0.6018680967390537, "reward_std": 0.4545674379914999, "rewards/cosine_scaled_reward": -0.28993429616093636, "rewards/format_reward": 0.1041666679084301, "step": 11 }, { "completion_length": 2469.6667098999023, "epoch": 0.013714285714285714, "grad_norm": 0.24237319827079773, "kl": 4.6290457248687744e-05, "learning_rate": 2.4e-07, "loss": 0.0, "reward": 0.08407351560890675, "reward_std": 0.751841738820076, "rewards/cosine_scaled_reward": -0.1303967982530594, "rewards/format_reward": 0.6458333414047956, "step": 12 }, { "completion_length": 2778.1458587646484, "epoch": 0.014857142857142857, "grad_norm": 0.18375596404075623, "kl": 4.48375940322876e-05, "learning_rate": 2.6e-07, "loss": 0.0, "reward": 0.09226825274527073, "reward_std": 0.6979338899254799, "rewards/cosine_scaled_reward": -0.015000073239207268, "rewards/format_reward": 0.4166666865348816, "step": 13 }, { "completion_length": 2874.750045776367, "epoch": 0.016, "grad_norm": 0.1823539286851883, "kl": 2.8234906494617462e-05, "learning_rate": 2.8e-07, "loss": 0.0, "reward": -0.11271460726857185, "reward_std": 0.7010148204863071, "rewards/cosine_scaled_reward": -0.14169861702248454, "rewards/format_reward": 0.39583333395421505, "step": 14 }, { "completion_length": 2797.395854949951, "epoch": 0.017142857142857144, "grad_norm": 0.20054183900356293, "kl": 2.563674934208393e-05, "learning_rate": 3e-07, "loss": 0.0, "reward": 0.018786390544846654, "reward_std": 0.6112043932080269, "rewards/cosine_scaled_reward": 0.03721225541085005, "rewards/format_reward": 0.39583333767950535, "step": 15 }, { "completion_length": 3453.1458435058594, "epoch": 0.018285714285714287, "grad_norm": 0.18542714416980743, "kl": 4.1812658309936523e-05, "learning_rate": 3.2e-07, "loss": 0.0, "reward": -0.5245386594906449, "reward_std": 0.5483577623963356, "rewards/cosine_scaled_reward": -0.2334668217226863, "rewards/format_reward": 0.06250000186264515, "step": 16 }, { "completion_length": 2326.8750610351562, "epoch": 0.019428571428571427, "grad_norm": 0.2586069405078888, "kl": 3.917887806892395e-05, "learning_rate": 3.4000000000000003e-07, "loss": 0.0, "reward": 0.29090012004598975, "reward_std": 0.7025135271251202, "rewards/cosine_scaled_reward": 0.04808543558465317, "rewards/format_reward": 0.6666666716337204, "step": 17 }, { "completion_length": 2884.708366394043, "epoch": 0.02057142857142857, "grad_norm": 0.17255671322345734, "kl": 2.2798776626586914e-05, "learning_rate": 3.6e-07, "loss": 0.0, "reward": -0.06478883884847164, "reward_std": 0.6950070075690746, "rewards/cosine_scaled_reward": -0.09088864922523499, "rewards/format_reward": 0.43750000558793545, "step": 18 }, { "completion_length": 2841.604202270508, "epoch": 0.021714285714285714, "grad_norm": 0.19047367572784424, "kl": 3.0055642127990723e-05, "learning_rate": 3.7999999999999996e-07, "loss": 0.0, "reward": 0.2590886615216732, "reward_std": 0.9022833462804556, "rewards/cosine_scaled_reward": 0.051947877276688814, "rewards/format_reward": 0.4375000111758709, "step": 19 }, { "completion_length": 2353.1042289733887, "epoch": 0.022857142857142857, "grad_norm": 0.194220632314682, "kl": 1.3179145753383636e-05, "learning_rate": 4e-07, "loss": 0.0, "reward": 0.2806552555412054, "reward_std": 0.8725750483572483, "rewards/cosine_scaled_reward": 0.004965861327946186, "rewards/format_reward": 0.6666666809469461, "step": 20 }, { "completion_length": 2684.687545776367, "epoch": 0.024, "grad_norm": 0.19551798701286316, "kl": 4.254281520843506e-05, "learning_rate": 4.1999999999999995e-07, "loss": 0.0, "reward": 0.09347914904356003, "reward_std": 0.8260357603430748, "rewards/cosine_scaled_reward": -0.07159630116075277, "rewards/format_reward": 0.5000000074505806, "step": 21 }, { "completion_length": 1796.4375381469727, "epoch": 0.025142857142857144, "grad_norm": 0.3671523332595825, "kl": 4.005804657936096e-05, "learning_rate": 4.3999999999999997e-07, "loss": 0.0, "reward": 0.19362880755215883, "reward_std": 0.8532019667327404, "rewards/cosine_scaled_reward": -0.06471679511014372, "rewards/format_reward": 0.7291666697710752, "step": 22 }, { "completion_length": 2500.062530517578, "epoch": 0.026285714285714287, "grad_norm": 0.22526901960372925, "kl": 3.0137598514556885e-05, "learning_rate": 4.6e-07, "loss": 0.0, "reward": 0.041429003700613976, "reward_std": 0.7314254455268383, "rewards/cosine_scaled_reward": -0.07520224852487445, "rewards/format_reward": 0.5208333469927311, "step": 23 }, { "completion_length": 2648.291717529297, "epoch": 0.027428571428571427, "grad_norm": 0.21498121321201324, "kl": 1.6693025827407837e-05, "learning_rate": 4.8e-07, "loss": 0.0, "reward": 0.21239036042243242, "reward_std": 0.6812088377773762, "rewards/cosine_scaled_reward": 0.027016831561923027, "rewards/format_reward": 0.5833333469927311, "step": 24 }, { "completion_length": 2769.2291946411133, "epoch": 0.02857142857142857, "grad_norm": 0.23084591329097748, "kl": 3.578886389732361e-05, "learning_rate": 5e-07, "loss": 0.0, "reward": 0.04653824120759964, "reward_std": 0.7504720520228148, "rewards/cosine_scaled_reward": -0.0376815393101424, "rewards/format_reward": 0.3958333432674408, "step": 25 }, { "completion_length": 2962.562530517578, "epoch": 0.029714285714285714, "grad_norm": 0.16293752193450928, "kl": 3.400072455406189e-05, "learning_rate": 5.2e-07, "loss": 0.0, "reward": 0.010799127630889416, "reward_std": 0.5875276923179626, "rewards/cosine_scaled_reward": -0.043792182579636574, "rewards/format_reward": 0.4791666716337204, "step": 26 }, { "completion_length": 2952.166717529297, "epoch": 0.030857142857142857, "grad_norm": 0.2215096801519394, "kl": 3.3194024581462145e-05, "learning_rate": 5.4e-07, "loss": 0.0, "reward": 0.11158058885484934, "reward_std": 0.8504065573215485, "rewards/cosine_scaled_reward": -0.05920940637588501, "rewards/format_reward": 0.5208333469927311, "step": 27 }, { "completion_length": 2872.0625, "epoch": 0.032, "grad_norm": 0.18577344715595245, "kl": 3.726780414581299e-05, "learning_rate": 5.6e-07, "loss": 0.0, "reward": 0.13817322719842196, "reward_std": 0.7248105835169554, "rewards/cosine_scaled_reward": 0.027827581390738487, "rewards/format_reward": 0.47916666977107525, "step": 28 }, { "completion_length": 3446.5833435058594, "epoch": 0.03314285714285714, "grad_norm": 0.21212224662303925, "kl": 2.4769455194473267e-05, "learning_rate": 5.8e-07, "loss": 0.0, "reward": -0.3688106779009104, "reward_std": 0.6171325668692589, "rewards/cosine_scaled_reward": -0.17023007571697235, "rewards/format_reward": 0.14583333767950535, "step": 29 }, { "completion_length": 2783.3333587646484, "epoch": 0.03428571428571429, "grad_norm": 0.17819644510746002, "kl": 2.261658664792776e-05, "learning_rate": 6e-07, "loss": 0.0, "reward": 0.2445693917106837, "reward_std": 0.9377000778913498, "rewards/cosine_scaled_reward": 0.03347432427108288, "rewards/format_reward": 0.4791666753590107, "step": 30 }, { "completion_length": 3050.2291870117188, "epoch": 0.03542857142857143, "grad_norm": 0.16932828724384308, "kl": 2.38027423620224e-05, "learning_rate": 6.2e-07, "loss": 0.0, "reward": -0.16898724623024464, "reward_std": 0.648616686463356, "rewards/cosine_scaled_reward": -0.11072872229851782, "rewards/format_reward": 0.2708333395421505, "step": 31 }, { "completion_length": 3094.1250610351562, "epoch": 0.036571428571428574, "grad_norm": 0.18312151730060577, "kl": 2.9257498681545258e-05, "learning_rate": 6.4e-07, "loss": 0.0, "reward": 0.004837207496166229, "reward_std": 0.6736676879227161, "rewards/cosine_scaled_reward": -0.017333179406705312, "rewards/format_reward": 0.3333333395421505, "step": 32 }, { "completion_length": 3368.562530517578, "epoch": 0.037714285714285714, "grad_norm": 0.15040728449821472, "kl": 2.4586915969848633e-05, "learning_rate": 6.6e-07, "loss": 0.0, "reward": -0.28359657526016235, "reward_std": 0.6239799037575722, "rewards/cosine_scaled_reward": -0.11568338703364134, "rewards/format_reward": 0.1458333395421505, "step": 33 }, { "completion_length": 2505.1875610351562, "epoch": 0.038857142857142854, "grad_norm": 0.30060505867004395, "kl": 2.2239633835852146e-05, "learning_rate": 6.800000000000001e-07, "loss": 0.0, "reward": 0.27598637342453003, "reward_std": 0.8686297200620174, "rewards/cosine_scaled_reward": 0.011391445528715849, "rewards/format_reward": 0.5416666716337204, "step": 34 }, { "completion_length": 3063.2708740234375, "epoch": 0.04, "grad_norm": 0.24084553122520447, "kl": 4.2811036109924316e-05, "learning_rate": 7e-07, "loss": 0.0, "reward": -0.03728431276977062, "reward_std": 0.9247638881206512, "rewards/cosine_scaled_reward": -0.1015674127265811, "rewards/format_reward": 0.3541666716337204, "step": 35 }, { "completion_length": 3358.7291870117188, "epoch": 0.04114285714285714, "grad_norm": 0.17074225842952728, "kl": 2.5674700736999512e-05, "learning_rate": 7.2e-07, "loss": 0.0, "reward": -0.4219683278352022, "reward_std": 0.516293577849865, "rewards/cosine_scaled_reward": -0.20600515604019165, "rewards/format_reward": 0.1875000074505806, "step": 36 }, { "completion_length": 3300.791717529297, "epoch": 0.04228571428571429, "grad_norm": 0.1499692052602768, "kl": 1.6324222087860107e-05, "learning_rate": 7.4e-07, "loss": 0.0, "reward": -0.42433968995464966, "reward_std": 0.5183847993612289, "rewards/cosine_scaled_reward": -0.21130944415926933, "rewards/format_reward": 0.1875000074505806, "step": 37 }, { "completion_length": 3274.9166870117188, "epoch": 0.04342857142857143, "grad_norm": 0.18995541334152222, "kl": 2.5459565222263336e-05, "learning_rate": 7.599999999999999e-07, "loss": 0.0, "reward": -0.42816436290740967, "reward_std": 0.3850083723664284, "rewards/cosine_scaled_reward": -0.16101331263780594, "rewards/format_reward": 0.14583333395421505, "step": 38 }, { "completion_length": 2845.81254196167, "epoch": 0.044571428571428574, "grad_norm": 0.16534960269927979, "kl": 1.8164515495300293e-05, "learning_rate": 7.799999999999999e-07, "loss": 0.0, "reward": -0.08653704356402159, "reward_std": 0.5173388682305813, "rewards/cosine_scaled_reward": -0.039690399542450905, "rewards/format_reward": 0.3750000111758709, "step": 39 }, { "completion_length": 2536.583381652832, "epoch": 0.045714285714285714, "grad_norm": 0.25925570726394653, "kl": 2.086721360683441e-05, "learning_rate": 8e-07, "loss": 0.0, "reward": -0.0936742543708533, "reward_std": 0.5705901933833957, "rewards/cosine_scaled_reward": -0.09443798521533608, "rewards/format_reward": 0.5000000093132257, "step": 40 }, { "completion_length": 2954.979217529297, "epoch": 0.046857142857142854, "grad_norm": 0.17119692265987396, "kl": 5.511566996574402e-06, "learning_rate": 8.199999999999999e-07, "loss": 0.0, "reward": -0.2362358495593071, "reward_std": 0.6107039824128151, "rewards/cosine_scaled_reward": -0.21764612046536058, "rewards/format_reward": 0.41666667349636555, "step": 41 }, { "completion_length": 2852.250020980835, "epoch": 0.048, "grad_norm": 0.2615634500980377, "kl": 3.533065319061279e-05, "learning_rate": 8.399999999999999e-07, "loss": 0.0, "reward": -0.5097283767536283, "reward_std": 0.4514521397650242, "rewards/cosine_scaled_reward": -0.2993845697492361, "rewards/format_reward": 0.27083333395421505, "step": 42 }, { "completion_length": 3095.4583435058594, "epoch": 0.04914285714285714, "grad_norm": 0.16259542107582092, "kl": 2.100318670272827e-05, "learning_rate": 8.599999999999999e-07, "loss": 0.0, "reward": -0.09295706450939178, "reward_std": 0.6536407507956028, "rewards/cosine_scaled_reward": -0.05758994724601507, "rewards/format_reward": 0.25, "step": 43 }, { "completion_length": 2561.1875381469727, "epoch": 0.05028571428571429, "grad_norm": 0.2526821494102478, "kl": 8.388608694076538e-05, "learning_rate": 8.799999999999999e-07, "loss": 0.0, "reward": 0.15549907088279724, "reward_std": 0.8106728717684746, "rewards/cosine_scaled_reward": -0.022572665475308895, "rewards/format_reward": 0.5000000111758709, "step": 44 }, { "completion_length": 3464.729217529297, "epoch": 0.05142857142857143, "grad_norm": 0.14498470723628998, "kl": 2.6337802410125732e-05, "learning_rate": 9e-07, "loss": 0.0, "reward": -0.25844255089759827, "reward_std": 0.6470912098884583, "rewards/cosine_scaled_reward": -0.11992522329092026, "rewards/format_reward": 0.1666666716337204, "step": 45 }, { "completion_length": 3175.437530517578, "epoch": 0.052571428571428575, "grad_norm": 0.18001262843608856, "kl": 4.996359348297119e-05, "learning_rate": 9.2e-07, "loss": 0.0, "reward": -0.4702068418264389, "reward_std": 0.43945666775107384, "rewards/cosine_scaled_reward": -0.22439202293753624, "rewards/format_reward": 0.18750000186264515, "step": 46 }, { "completion_length": 2691.541763305664, "epoch": 0.053714285714285714, "grad_norm": 0.24902203679084778, "kl": 2.5863759219646454e-05, "learning_rate": 9.399999999999999e-07, "loss": 0.0, "reward": 0.3736280268058181, "reward_std": 0.9081602022051811, "rewards/cosine_scaled_reward": 0.09569812752306461, "rewards/format_reward": 0.5416666809469461, "step": 47 }, { "completion_length": 2735.2292251586914, "epoch": 0.054857142857142854, "grad_norm": 0.2746092677116394, "kl": 0.0001163184642791748, "learning_rate": 9.6e-07, "loss": 0.0, "reward": 0.02801407827064395, "reward_std": 0.8248592298477888, "rewards/cosine_scaled_reward": -0.05238310806453228, "rewards/format_reward": 0.3958333358168602, "step": 48 }, { "completion_length": 2386.5416984558105, "epoch": 0.056, "grad_norm": 0.2179473340511322, "kl": 5.142390727996826e-05, "learning_rate": 9.8e-07, "loss": 0.0, "reward": 0.3018168299458921, "reward_std": 0.9894729033112526, "rewards/cosine_scaled_reward": 0.020154984667897224, "rewards/format_reward": 0.5833333376795053, "step": 49 }, { "completion_length": 2974.583354949951, "epoch": 0.05714285714285714, "grad_norm": 0.19161240756511688, "kl": 8.691102266311646e-05, "learning_rate": 1e-06, "loss": 0.0, "reward": -0.04128398001194, "reward_std": 0.6844424605369568, "rewards/cosine_scaled_reward": -0.014122288441285491, "rewards/format_reward": 0.3333333358168602, "step": 50 }, { "completion_length": 2252.375045776367, "epoch": 0.05828571428571429, "grad_norm": 0.22708459198474884, "kl": 0.0002549290657043457, "learning_rate": 9.999890338174275e-07, "loss": 0.0, "reward": 0.13623822387307882, "reward_std": 0.7222620993852615, "rewards/cosine_scaled_reward": -0.04494801629334688, "rewards/format_reward": 0.5416666772216558, "step": 51 }, { "completion_length": 2870.2291870117188, "epoch": 0.05942857142857143, "grad_norm": 0.21368886530399323, "kl": 0.0001485683023929596, "learning_rate": 9.999561358041868e-07, "loss": 0.0, "reward": 0.015072201727889478, "reward_std": 0.9205258414149284, "rewards/cosine_scaled_reward": -0.06388110015541315, "rewards/format_reward": 0.37500000186264515, "step": 52 }, { "completion_length": 2698.0000762939453, "epoch": 0.060571428571428575, "grad_norm": 0.2288151979446411, "kl": 0.00016783177852630615, "learning_rate": 9.999013075636804e-07, "loss": 0.0, "reward": 0.10895399999571964, "reward_std": 0.8917515203356743, "rewards/cosine_scaled_reward": -0.04874301888048649, "rewards/format_reward": 0.5208333395421505, "step": 53 }, { "completion_length": 2874.979248046875, "epoch": 0.061714285714285715, "grad_norm": 0.16582155227661133, "kl": 4.6576838940382004e-05, "learning_rate": 9.998245517681593e-07, "loss": 0.0, "reward": 0.31593899679137394, "reward_std": 0.9704541265964508, "rewards/cosine_scaled_reward": 0.09179376903921366, "rewards/format_reward": 0.5000000149011612, "step": 54 }, { "completion_length": 3011.875045776367, "epoch": 0.06285714285714286, "grad_norm": 0.16874325275421143, "kl": 0.0001902114599943161, "learning_rate": 9.997258721585931e-07, "loss": 0.0, "reward": 0.1326095201075077, "reward_std": 0.8541341703385115, "rewards/cosine_scaled_reward": 0.022567307110875845, "rewards/format_reward": 0.45833333767950535, "step": 55 }, { "completion_length": 2914.1458892822266, "epoch": 0.064, "grad_norm": 0.17759209871292114, "kl": 8.147954940795898e-05, "learning_rate": 9.996052735444862e-07, "loss": 0.0, "reward": 0.021647373214364052, "reward_std": 0.5984874460846186, "rewards/cosine_scaled_reward": -0.05477431882172823, "rewards/format_reward": 0.4375000111758709, "step": 56 }, { "completion_length": 3319.916717529297, "epoch": 0.06514285714285714, "grad_norm": 0.13513678312301636, "kl": 6.243959069252014e-05, "learning_rate": 9.994627618036452e-07, "loss": 0.0, "reward": -0.10464489553123713, "reward_std": 0.9139657057821751, "rewards/cosine_scaled_reward": -0.12102228635922074, "rewards/format_reward": 0.3333333395421505, "step": 57 }, { "completion_length": 2223.3750534057617, "epoch": 0.06628571428571428, "grad_norm": 0.19678883254528046, "kl": 0.0008740425109863281, "learning_rate": 9.992983438818915e-07, "loss": 0.0, "reward": 0.3077916601905599, "reward_std": 0.7720399703830481, "rewards/cosine_scaled_reward": 0.0032341796904802322, "rewards/format_reward": 0.6875000111758709, "step": 58 }, { "completion_length": 2857.1041870117188, "epoch": 0.06742857142857143, "grad_norm": 0.16988952457904816, "kl": 7.66068696975708e-05, "learning_rate": 9.991120277927223e-07, "loss": 0.0, "reward": -0.12906377390027046, "reward_std": 0.5678401403129101, "rewards/cosine_scaled_reward": -0.09338383004069328, "rewards/format_reward": 0.33333333395421505, "step": 59 }, { "completion_length": 3053.0208740234375, "epoch": 0.06857142857142857, "grad_norm": 0.19087883830070496, "kl": 0.00012252479791641235, "learning_rate": 9.989038226169207e-07, "loss": 0.0, "reward": -0.33987870812416077, "reward_std": 0.4983965791761875, "rewards/cosine_scaled_reward": -0.22157840942963958, "rewards/format_reward": 0.33333334140479565, "step": 60 }, { "completion_length": 2953.0625762939453, "epoch": 0.06971428571428571, "grad_norm": 0.1577925682067871, "kl": 0.0002728104591369629, "learning_rate": 9.98673738502114e-07, "loss": 0.0, "reward": 0.10354057513177395, "reward_std": 0.8005717396736145, "rewards/cosine_scaled_reward": -0.0372257842682302, "rewards/format_reward": 0.4791666753590107, "step": 61 }, { "completion_length": 2714.104217529297, "epoch": 0.07085714285714285, "grad_norm": 0.1799556165933609, "kl": 0.0007392987608909607, "learning_rate": 9.98421786662277e-07, "loss": 0.0, "reward": 0.32593785908829886, "reward_std": 0.9482499547302723, "rewards/cosine_scaled_reward": 0.08002315112389624, "rewards/format_reward": 0.562500013038516, "step": 62 }, { "completion_length": 2463.5625762939453, "epoch": 0.072, "grad_norm": 0.19159279763698578, "kl": 0.0007972661405801773, "learning_rate": 9.981479793771866e-07, "loss": 0.0, "reward": 0.4637246737256646, "reward_std": 1.0199245177209377, "rewards/cosine_scaled_reward": 0.1016120407730341, "rewards/format_reward": 0.6666666716337204, "step": 63 }, { "completion_length": 2957.2709045410156, "epoch": 0.07314285714285715, "grad_norm": 0.19695256650447845, "kl": 0.0005731135606765747, "learning_rate": 9.97852329991824e-07, "loss": 0.0, "reward": -0.09675957635045052, "reward_std": 0.7647779621183872, "rewards/cosine_scaled_reward": -0.11780054681003094, "rewards/format_reward": 0.35416667722165585, "step": 64 }, { "completion_length": 2748.2083473205566, "epoch": 0.07428571428571429, "grad_norm": 0.19534841179847717, "kl": 0.0003612041473388672, "learning_rate": 9.975348529157229e-07, "loss": 0.0, "reward": -0.20825218525715172, "reward_std": 0.6059501096606255, "rewards/cosine_scaled_reward": -0.18784814700484276, "rewards/format_reward": 0.4166666679084301, "step": 65 }, { "completion_length": 2097.4375076293945, "epoch": 0.07542857142857143, "grad_norm": 0.250161349773407, "kl": 0.0006571710109710693, "learning_rate": 9.971955636222684e-07, "loss": 0.0, "reward": 0.0786643698811531, "reward_std": 0.6597852855920792, "rewards/cosine_scaled_reward": -0.006518872454762459, "rewards/format_reward": 0.5, "step": 66 }, { "completion_length": 3430.4791870117188, "epoch": 0.07657142857142857, "grad_norm": 0.14664356410503387, "kl": 0.0005048699676990509, "learning_rate": 9.968344786479415e-07, "loss": 0.0, "reward": -0.5055637508630753, "reward_std": 0.4412064775824547, "rewards/cosine_scaled_reward": -0.2427280293777585, "rewards/format_reward": 0.12500000558793545, "step": 67 }, { "completion_length": 2271.250045776367, "epoch": 0.07771428571428571, "grad_norm": 0.27284932136535645, "kl": 0.0013322830200195312, "learning_rate": 9.964516155915151e-07, "loss": 0.0001, "reward": 0.2981271520256996, "reward_std": 0.9507227130234241, "rewards/cosine_scaled_reward": 0.0009464251343160868, "rewards/format_reward": 0.6041666716337204, "step": 68 }, { "completion_length": 2530.2708587646484, "epoch": 0.07885714285714286, "grad_norm": 0.22157438099384308, "kl": 0.0010530054569244385, "learning_rate": 9.960469931131936e-07, "loss": 0.0, "reward": -0.22882681945338845, "reward_std": 0.6600410342216492, "rewards/cosine_scaled_reward": -0.20334339328110218, "rewards/format_reward": 0.41666667722165585, "step": 69 }, { "completion_length": 3004.7708587646484, "epoch": 0.08, "grad_norm": 0.21289457380771637, "kl": 0.001432761549949646, "learning_rate": 9.956206309337066e-07, "loss": 0.0001, "reward": -0.3504838487133384, "reward_std": 0.5108464825898409, "rewards/cosine_scaled_reward": -0.24259378435090184, "rewards/format_reward": 0.3541666716337204, "step": 70 }, { "completion_length": 2526.937515258789, "epoch": 0.08114285714285714, "grad_norm": 0.21181254088878632, "kl": 0.0007251240313053131, "learning_rate": 9.951725498333448e-07, "loss": 0.0, "reward": -0.016672035679221153, "reward_std": 0.5613718032836914, "rewards/cosine_scaled_reward": -0.044779783114790916, "rewards/format_reward": 0.4166666716337204, "step": 71 }, { "completion_length": 3029.979217529297, "epoch": 0.08228571428571428, "grad_norm": 0.20094312727451324, "kl": 0.000997304916381836, "learning_rate": 9.947027716509488e-07, "loss": 0.0, "reward": -0.21065808949060738, "reward_std": 0.6206581741571426, "rewards/cosine_scaled_reward": -0.12715522898361087, "rewards/format_reward": 0.29166666977107525, "step": 72 }, { "completion_length": 3455.625, "epoch": 0.08342857142857144, "grad_norm": 0.13940556347370148, "kl": 0.00026963651180267334, "learning_rate": 9.942113192828444e-07, "loss": 0.0, "reward": -0.3948047012090683, "reward_std": 0.596671599894762, "rewards/cosine_scaled_reward": -0.1924935569986701, "rewards/format_reward": 0.1458333395421505, "step": 73 }, { "completion_length": 3041.666717529297, "epoch": 0.08457142857142858, "grad_norm": 0.2121056467294693, "kl": 0.0018717050552368164, "learning_rate": 9.93698216681727e-07, "loss": 0.0001, "reward": -0.10725430864840746, "reward_std": 0.786587443202734, "rewards/cosine_scaled_reward": -0.08628973411396146, "rewards/format_reward": 0.3333333358168602, "step": 74 }, { "completion_length": 3010.2708740234375, "epoch": 0.08571428571428572, "grad_norm": 0.17574620246887207, "kl": 0.0008899793028831482, "learning_rate": 9.931634888554935e-07, "loss": 0.0, "reward": 0.17169499211013317, "reward_std": 0.7550366073846817, "rewards/cosine_scaled_reward": 0.06438030861318111, "rewards/format_reward": 0.41666666977107525, "step": 75 }, { "completion_length": 3000.0208587646484, "epoch": 0.08685714285714285, "grad_norm": 0.1713106483221054, "kl": 0.0001423284411430359, "learning_rate": 9.926071618660237e-07, "loss": 0.0, "reward": -0.2900051809847355, "reward_std": 0.5428481921553612, "rewards/cosine_scaled_reward": -0.17083199694752693, "rewards/format_reward": 0.3333333358168602, "step": 76 }, { "completion_length": 3030.854232788086, "epoch": 0.088, "grad_norm": 0.15809102356433868, "kl": 0.00039126724004745483, "learning_rate": 9.9202926282791e-07, "loss": 0.0, "reward": -0.12223898246884346, "reward_std": 0.5839939434081316, "rewards/cosine_scaled_reward": -0.12459814921021461, "rewards/format_reward": 0.39583334513008595, "step": 77 }, { "completion_length": 3080.5208740234375, "epoch": 0.08914285714285715, "grad_norm": 0.1908382624387741, "kl": 0.00018369778990745544, "learning_rate": 9.91429819907136e-07, "loss": 0.0, "reward": 0.24677438661456108, "reward_std": 0.8652092255651951, "rewards/cosine_scaled_reward": 0.060313327237963676, "rewards/format_reward": 0.45833334140479565, "step": 78 }, { "completion_length": 2289.7916870117188, "epoch": 0.09028571428571429, "grad_norm": 0.2242053598165512, "kl": 0.000986546277999878, "learning_rate": 9.908088623197048e-07, "loss": 0.0, "reward": 0.043350703082978725, "reward_std": 0.7450110893696547, "rewards/cosine_scaled_reward": -0.10156810469925404, "rewards/format_reward": 0.5833333414047956, "step": 79 }, { "completion_length": 3217.6458740234375, "epoch": 0.09142857142857143, "grad_norm": 0.16875122487545013, "kl": 0.0006685960106551647, "learning_rate": 9.901664203302124e-07, "loss": 0.0, "reward": -0.21108814515173435, "reward_std": 0.6316388584673405, "rewards/cosine_scaled_reward": -0.141633331310004, "rewards/format_reward": 0.2916666716337204, "step": 80 }, { "completion_length": 3053.0417098999023, "epoch": 0.09257142857142857, "grad_norm": 0.21555641293525696, "kl": 0.001847386360168457, "learning_rate": 9.895025252503755e-07, "loss": 0.0001, "reward": -0.08620672597317025, "reward_std": 0.6757525354623795, "rewards/cosine_scaled_reward": -0.08836011588573456, "rewards/format_reward": 0.37500000186264515, "step": 81 }, { "completion_length": 2864.375030517578, "epoch": 0.09371428571428571, "grad_norm": 0.18830879032611847, "kl": 0.0011097192764282227, "learning_rate": 9.888172094375033e-07, "loss": 0.0, "reward": 0.05295779928565025, "reward_std": 0.6998694110661745, "rewards/cosine_scaled_reward": -0.003920115530490875, "rewards/format_reward": 0.3333333358168602, "step": 82 }, { "completion_length": 2812.312515258789, "epoch": 0.09485714285714286, "grad_norm": 0.22218850255012512, "kl": 0.0006138831377029419, "learning_rate": 9.881105062929221e-07, "loss": 0.0, "reward": -0.2050288449972868, "reward_std": 0.6761703044176102, "rewards/cosine_scaled_reward": -0.15050649549812078, "rewards/format_reward": 0.33333333395421505, "step": 83 }, { "completion_length": 3117.25, "epoch": 0.096, "grad_norm": 0.16899633407592773, "kl": 0.00044608116149902344, "learning_rate": 9.873824502603459e-07, "loss": 0.0, "reward": 0.03697575815021992, "reward_std": 0.8468833193182945, "rewards/cosine_scaled_reward": -0.044630819000303745, "rewards/format_reward": 0.35416667349636555, "step": 84 }, { "completion_length": 3147.687530517578, "epoch": 0.09714285714285714, "grad_norm": 0.14954747259616852, "kl": 0.00039239227771759033, "learning_rate": 9.866330768241983e-07, "loss": 0.0, "reward": -0.10085274185985327, "reward_std": 0.7591249234974384, "rewards/cosine_scaled_reward": -0.12476183846592903, "rewards/format_reward": 0.37500000931322575, "step": 85 }, { "completion_length": 2752.958351135254, "epoch": 0.09828571428571428, "grad_norm": 0.1987195760011673, "kl": 0.0008227825164794922, "learning_rate": 9.85862422507884e-07, "loss": 0.0, "reward": 0.022698190063238144, "reward_std": 0.6212068926542997, "rewards/cosine_scaled_reward": -0.04935761634260416, "rewards/format_reward": 0.4166666679084301, "step": 86 }, { "completion_length": 2598.500030517578, "epoch": 0.09942857142857142, "grad_norm": 0.196201354265213, "kl": 0.0009197257459163666, "learning_rate": 9.850705248720068e-07, "loss": 0.0, "reward": 0.1782613815739751, "reward_std": 0.8598120957612991, "rewards/cosine_scaled_reward": -0.03767361585050821, "rewards/format_reward": 0.5416666809469461, "step": 87 }, { "completion_length": 2744.875045776367, "epoch": 0.10057142857142858, "grad_norm": 0.1994200497865677, "kl": 0.0016593635082244873, "learning_rate": 9.8425742251254e-07, "loss": 0.0001, "reward": 0.38837052757298807, "reward_std": 1.1653981991112232, "rewards/cosine_scaled_reward": 0.04098894074559212, "rewards/format_reward": 0.5416666865348816, "step": 88 }, { "completion_length": 2835.041732788086, "epoch": 0.10171428571428572, "grad_norm": 0.17567898333072662, "kl": 0.0010609626770019531, "learning_rate": 9.83423155058946e-07, "loss": 0.0, "reward": -0.0007169158197939396, "reward_std": 0.7810570430010557, "rewards/cosine_scaled_reward": -0.0879359629470855, "rewards/format_reward": 0.4583333432674408, "step": 89 }, { "completion_length": 2648.2916870117188, "epoch": 0.10285714285714286, "grad_norm": 0.32330945134162903, "kl": 0.0009630322456359863, "learning_rate": 9.825677631722435e-07, "loss": 0.0, "reward": -0.35989359859377146, "reward_std": 0.5252104215323925, "rewards/cosine_scaled_reward": -0.2538211215287447, "rewards/format_reward": 0.3750000037252903, "step": 90 }, { "completion_length": 2993.0208740234375, "epoch": 0.104, "grad_norm": 0.19204914569854736, "kl": 0.0005503743886947632, "learning_rate": 9.816912885430258e-07, "loss": 0.0, "reward": 0.10479497350752354, "reward_std": 0.8132709451019764, "rewards/cosine_scaled_reward": 0.023820115253329277, "rewards/format_reward": 0.4166666716337204, "step": 91 }, { "completion_length": 2592.6875228881836, "epoch": 0.10514285714285715, "grad_norm": 0.22138415277004242, "kl": 0.0012896955013275146, "learning_rate": 9.807937738894303e-07, "loss": 0.0001, "reward": -0.160959305241704, "reward_std": 0.616182116791606, "rewards/cosine_scaled_reward": -0.17551955580711365, "rewards/format_reward": 0.4583333469927311, "step": 92 }, { "completion_length": 3457.7083435058594, "epoch": 0.10628571428571429, "grad_norm": 0.21576857566833496, "kl": 0.0012897849082946777, "learning_rate": 9.798752629550546e-07, "loss": 0.0001, "reward": -0.5375550724565983, "reward_std": 0.46420392021536827, "rewards/cosine_scaled_reward": -0.24901489913463593, "rewards/format_reward": 0.10416666977107525, "step": 93 }, { "completion_length": 3101.8750228881836, "epoch": 0.10742857142857143, "grad_norm": 0.18881510198116302, "kl": 0.0011924207210540771, "learning_rate": 9.78935800506826e-07, "loss": 0.0, "reward": -0.3899065591394901, "reward_std": 0.47088516876101494, "rewards/cosine_scaled_reward": -0.1563735492527485, "rewards/format_reward": 0.2083333358168602, "step": 94 }, { "completion_length": 3354.0833435058594, "epoch": 0.10857142857142857, "grad_norm": 0.1480981707572937, "kl": 0.0006138160824775696, "learning_rate": 9.779754323328192e-07, "loss": 0.0, "reward": -0.16396450996398926, "reward_std": 0.8173685595393181, "rewards/cosine_scaled_reward": -0.13610992138274014, "rewards/format_reward": 0.2916666679084301, "step": 95 }, { "completion_length": 2596.770896911621, "epoch": 0.10971428571428571, "grad_norm": 0.2216062992811203, "kl": 0.0016668587923049927, "learning_rate": 9.769942052400235e-07, "loss": 0.0001, "reward": -0.06530294334515929, "reward_std": 0.6829965673387051, "rewards/cosine_scaled_reward": -0.08926075106137432, "rewards/format_reward": 0.43750000186264515, "step": 96 }, { "completion_length": 3140.104217529297, "epoch": 0.11085714285714286, "grad_norm": 0.20360398292541504, "kl": 0.0009580925107002258, "learning_rate": 9.759921670520634e-07, "loss": 0.0, "reward": -0.06862857192754745, "reward_std": 0.7287838291376829, "rewards/cosine_scaled_reward": -0.07030826597474515, "rewards/format_reward": 0.2916666753590107, "step": 97 }, { "completion_length": 2493.2291870117188, "epoch": 0.112, "grad_norm": 0.2129816859960556, "kl": 0.0004929900169372559, "learning_rate": 9.749693666068663e-07, "loss": 0.0, "reward": -0.014337641187012196, "reward_std": 0.6026619412004948, "rewards/cosine_scaled_reward": -0.1173677199985832, "rewards/format_reward": 0.5416666846722364, "step": 98 }, { "completion_length": 2910.2708435058594, "epoch": 0.11314285714285714, "grad_norm": 0.2219778299331665, "kl": 0.0009305477142333984, "learning_rate": 9.739258537542835e-07, "loss": 0.0, "reward": -0.1526417959248647, "reward_std": 0.6645006220787764, "rewards/cosine_scaled_reward": -0.11239583557471633, "rewards/format_reward": 0.31250000186264515, "step": 99 }, { "completion_length": 2669.1875228881836, "epoch": 0.11428571428571428, "grad_norm": 0.18922168016433716, "kl": 0.0009844303131103516, "learning_rate": 9.728616793536587e-07, "loss": 0.0, "reward": 0.36958627274725586, "reward_std": 0.8694315142929554, "rewards/cosine_scaled_reward": 0.11309660819824785, "rewards/format_reward": 0.5000000111758709, "step": 100 }, { "completion_length": 2726.729202270508, "epoch": 0.11542857142857142, "grad_norm": 0.17301318049430847, "kl": 0.0010145902633666992, "learning_rate": 9.717768952713511e-07, "loss": 0.0, "reward": -0.10664987377822399, "reward_std": 0.550342533737421, "rewards/cosine_scaled_reward": -0.07740492327138782, "rewards/format_reward": 0.3958333358168602, "step": 101 }, { "completion_length": 2206.812530517578, "epoch": 0.11657142857142858, "grad_norm": 0.3242272138595581, "kl": 0.001733243465423584, "learning_rate": 9.706715543782064e-07, "loss": 0.0001, "reward": -0.04840894846711308, "reward_std": 0.5605612080544233, "rewards/cosine_scaled_reward": -0.1783417221158743, "rewards/format_reward": 0.666666679084301, "step": 102 }, { "completion_length": 2822.145927429199, "epoch": 0.11771428571428572, "grad_norm": 0.20194919407367706, "kl": 0.0008286237716674805, "learning_rate": 9.695457105469804e-07, "loss": 0.0, "reward": 0.10302554164081812, "reward_std": 0.7938342466950417, "rewards/cosine_scaled_reward": -0.058811694383621216, "rewards/format_reward": 0.5000000111758709, "step": 103 }, { "completion_length": 2794.2291870117188, "epoch": 0.11885714285714286, "grad_norm": 0.21426032483577728, "kl": 0.0017331838607788086, "learning_rate": 9.683994186497132e-07, "loss": 0.0001, "reward": -0.15357419941574335, "reward_std": 0.7048191353678703, "rewards/cosine_scaled_reward": -0.14375380612909794, "rewards/format_reward": 0.3750000037252903, "step": 104 }, { "completion_length": 2595.7500381469727, "epoch": 0.12, "grad_norm": 0.1870584338903427, "kl": 0.0007082223892211914, "learning_rate": 9.672327345550543e-07, "loss": 0.0, "reward": 0.0526086570462212, "reward_std": 1.013668704777956, "rewards/cosine_scaled_reward": -0.10153029090724885, "rewards/format_reward": 0.47916667349636555, "step": 105 }, { "completion_length": 2181.2291717529297, "epoch": 0.12114285714285715, "grad_norm": 0.193389892578125, "kl": 0.0015556812286376953, "learning_rate": 9.66045715125541e-07, "loss": 0.0001, "reward": 0.20659764064475894, "reward_std": 0.7706484608352184, "rewards/cosine_scaled_reward": 0.0427999310195446, "rewards/format_reward": 0.5625, "step": 106 }, { "completion_length": 2629.104217529297, "epoch": 0.12228571428571429, "grad_norm": 0.2650865614414215, "kl": 0.001280069351196289, "learning_rate": 9.648384182148252e-07, "loss": 0.0001, "reward": 0.047421048395335674, "reward_std": 0.8096052818000317, "rewards/cosine_scaled_reward": -0.08784432336688042, "rewards/format_reward": 0.5208333469927311, "step": 107 }, { "completion_length": 2760.3125610351562, "epoch": 0.12342857142857143, "grad_norm": 0.22029048204421997, "kl": 0.0016175508499145508, "learning_rate": 9.636109026648554e-07, "loss": 0.0001, "reward": 0.20211763679981232, "reward_std": 0.9046534672379494, "rewards/cosine_scaled_reward": 0.006914107128977776, "rewards/format_reward": 0.4791666828095913, "step": 108 }, { "completion_length": 3071.2083587646484, "epoch": 0.12457142857142857, "grad_norm": 0.16153673827648163, "kl": 0.00049591064453125, "learning_rate": 9.623632283030077e-07, "loss": 0.0, "reward": -0.13390080258250237, "reward_std": 0.6916838400065899, "rewards/cosine_scaled_reward": -0.10345052601769567, "rewards/format_reward": 0.3958333358168602, "step": 109 }, { "completion_length": 2738.8125534057617, "epoch": 0.12571428571428572, "grad_norm": 0.20852850377559662, "kl": 0.0008258223533630371, "learning_rate": 9.610954559391704e-07, "loss": 0.0, "reward": -0.033169424161314964, "reward_std": 0.9115067571401596, "rewards/cosine_scaled_reward": -0.14516241010278463, "rewards/format_reward": 0.5000000055879354, "step": 110 }, { "completion_length": 2979.416702270508, "epoch": 0.12685714285714286, "grad_norm": 0.20923613011837006, "kl": 0.0016465187072753906, "learning_rate": 9.598076473627796e-07, "loss": 0.0001, "reward": -0.1917986012995243, "reward_std": 0.6587283834815025, "rewards/cosine_scaled_reward": -0.1627930011600256, "rewards/format_reward": 0.35416667349636555, "step": 111 }, { "completion_length": 3034.604202270508, "epoch": 0.128, "grad_norm": 0.1812065690755844, "kl": 0.0010587647557258606, "learning_rate": 9.58499865339809e-07, "loss": 0.0, "reward": 0.042024691589176655, "reward_std": 0.7155229933559895, "rewards/cosine_scaled_reward": -0.007911409251391888, "rewards/format_reward": 0.3958333395421505, "step": 112 }, { "completion_length": 2602.041702270508, "epoch": 0.12914285714285714, "grad_norm": 0.23084139823913574, "kl": 0.005176067352294922, "learning_rate": 9.571721736097088e-07, "loss": 0.0002, "reward": 0.09520017961040139, "reward_std": 0.7830863744020462, "rewards/cosine_scaled_reward": -0.06248902215156704, "rewards/format_reward": 0.5000000149011612, "step": 113 }, { "completion_length": 2630.812526702881, "epoch": 0.13028571428571428, "grad_norm": 0.22400474548339844, "kl": 0.002625703811645508, "learning_rate": 9.55824636882301e-07, "loss": 0.0001, "reward": -0.18021708587184548, "reward_std": 0.5372883807867765, "rewards/cosine_scaled_reward": -0.19902520813047886, "rewards/format_reward": 0.5208333432674408, "step": 114 }, { "completion_length": 2778.3333587646484, "epoch": 0.13142857142857142, "grad_norm": 0.2168537676334381, "kl": 0.0024030208587646484, "learning_rate": 9.54457320834625e-07, "loss": 0.0001, "reward": -0.23857925506308675, "reward_std": 0.4751722030341625, "rewards/cosine_scaled_reward": -0.12388885580003262, "rewards/format_reward": 0.33333333395421505, "step": 115 }, { "completion_length": 3442.6458740234375, "epoch": 0.13257142857142856, "grad_norm": 0.16627182066440582, "kl": 0.0018346309661865234, "learning_rate": 9.530702921077358e-07, "loss": 0.0001, "reward": -0.1470964252948761, "reward_std": 0.609551090747118, "rewards/cosine_scaled_reward": -0.060838012024760246, "rewards/format_reward": 0.2083333395421505, "step": 116 }, { "completion_length": 3086.3541870117188, "epoch": 0.1337142857142857, "grad_norm": 0.1960468739271164, "kl": 0.0021970272064208984, "learning_rate": 9.516636183034564e-07, "loss": 0.0001, "reward": -0.35040562483482063, "reward_std": 0.566862914711237, "rewards/cosine_scaled_reward": -0.23000845714705065, "rewards/format_reward": 0.3125000037252903, "step": 117 }, { "completion_length": 2862.125045776367, "epoch": 0.13485714285714287, "grad_norm": 0.19369491934776306, "kl": 0.0014324188232421875, "learning_rate": 9.502373679810839e-07, "loss": 0.0001, "reward": 0.23708336800336838, "reward_std": 0.8551704436540604, "rewards/cosine_scaled_reward": 0.059158104471862316, "rewards/format_reward": 0.47916666977107525, "step": 118 }, { "completion_length": 2329.145881652832, "epoch": 0.136, "grad_norm": 0.2726350724697113, "kl": 0.006384849548339844, "learning_rate": 9.487916106540465e-07, "loss": 0.0003, "reward": 0.06812019762583077, "reward_std": 0.6704130079597235, "rewards/cosine_scaled_reward": -0.1190024558454752, "rewards/format_reward": 0.6250000223517418, "step": 119 }, { "completion_length": 2332.5208740234375, "epoch": 0.13714285714285715, "grad_norm": 0.2544117867946625, "kl": 0.002226591110229492, "learning_rate": 9.473264167865171e-07, "loss": 0.0001, "reward": 0.13008875958621502, "reward_std": 0.7615821734070778, "rewards/cosine_scaled_reward": -0.029063436202704906, "rewards/format_reward": 0.5833333469927311, "step": 120 }, { "completion_length": 1837.375015258789, "epoch": 0.1382857142857143, "grad_norm": 0.25729382038116455, "kl": 0.007863402366638184, "learning_rate": 9.458418577899774e-07, "loss": 0.0003, "reward": 0.3155789945740253, "reward_std": 0.7411515153944492, "rewards/cosine_scaled_reward": -0.02028810605406761, "rewards/format_reward": 0.791666679084301, "step": 121 }, { "completion_length": 2958.145866394043, "epoch": 0.13942857142857143, "grad_norm": 0.20917245745658875, "kl": 0.001485586166381836, "learning_rate": 9.443380060197385e-07, "loss": 0.0001, "reward": 0.05837233364582062, "reward_std": 0.964107995852828, "rewards/cosine_scaled_reward": -0.043140748515725136, "rewards/format_reward": 0.37500000558793545, "step": 122 }, { "completion_length": 2625.7083892822266, "epoch": 0.14057142857142857, "grad_norm": 0.1879587173461914, "kl": 0.0017619132995605469, "learning_rate": 9.428149347714143e-07, "loss": 0.0001, "reward": 0.08997525461018085, "reward_std": 0.7339164912700653, "rewards/cosine_scaled_reward": -0.0870713610202074, "rewards/format_reward": 0.5625000093132257, "step": 123 }, { "completion_length": 2327.4792404174805, "epoch": 0.1417142857142857, "grad_norm": 0.19856718182563782, "kl": 0.002035379409790039, "learning_rate": 9.412727182773486e-07, "loss": 0.0001, "reward": 0.33785169292241335, "reward_std": 0.8534054830670357, "rewards/cosine_scaled_reward": 0.03106366191059351, "rewards/format_reward": 0.6458333432674408, "step": 124 }, { "completion_length": 2897.2083587646484, "epoch": 0.14285714285714285, "grad_norm": 0.1509743332862854, "kl": 0.0013265609741210938, "learning_rate": 9.397114317029974e-07, "loss": 0.0001, "reward": -0.1488229539245367, "reward_std": 0.5740789603441954, "rewards/cosine_scaled_reward": -0.06105211656540632, "rewards/format_reward": 0.27083333395421505, "step": 125 }, { "completion_length": 2956.0208740234375, "epoch": 0.144, "grad_norm": 0.18172477185726166, "kl": 0.0009171962738037109, "learning_rate": 9.381311511432658e-07, "loss": 0.0, "reward": -0.04263794468715787, "reward_std": 0.8349475301802158, "rewards/cosine_scaled_reward": -0.10641535092145205, "rewards/format_reward": 0.41666666977107525, "step": 126 }, { "completion_length": 2777.229202270508, "epoch": 0.14514285714285713, "grad_norm": 0.2586840093135834, "kl": 0.0015873908996582031, "learning_rate": 9.36531953618799e-07, "loss": 0.0001, "reward": -0.25305760465562344, "reward_std": 0.6229905411601067, "rewards/cosine_scaled_reward": -0.22235668450593948, "rewards/format_reward": 0.41666667722165585, "step": 127 }, { "completion_length": 2836.7708892822266, "epoch": 0.1462857142857143, "grad_norm": 0.21033713221549988, "kl": 0.0029465854167938232, "learning_rate": 9.34913917072228e-07, "loss": 0.0001, "reward": 0.31079221796244383, "reward_std": 0.9646506570279598, "rewards/cosine_scaled_reward": 0.121150006307289, "rewards/format_reward": 0.43750000186264515, "step": 128 }, { "completion_length": 3317.375030517578, "epoch": 0.14742857142857144, "grad_norm": 0.19819480180740356, "kl": 0.0024137496948242188, "learning_rate": 9.332771203643714e-07, "loss": 0.0001, "reward": -0.364932868629694, "reward_std": 0.47471798583865166, "rewards/cosine_scaled_reward": -0.18228841945528984, "rewards/format_reward": 0.2083333395421505, "step": 129 }, { "completion_length": 2750.1041717529297, "epoch": 0.14857142857142858, "grad_norm": 0.19781967997550964, "kl": 0.0014467239379882812, "learning_rate": 9.316216432703916e-07, "loss": 0.0001, "reward": -0.2264688154682517, "reward_std": 0.6045875921845436, "rewards/cosine_scaled_reward": -0.16815861780196428, "rewards/format_reward": 0.33333333395421505, "step": 130 }, { "completion_length": 2835.3333740234375, "epoch": 0.14971428571428572, "grad_norm": 0.23097459971904755, "kl": 0.0031147003173828125, "learning_rate": 9.299475664759068e-07, "loss": 0.0001, "reward": 0.24509642273187637, "reward_std": 0.6866675093770027, "rewards/cosine_scaled_reward": 0.16801240853965282, "rewards/format_reward": 0.4583333358168602, "step": 131 }, { "completion_length": 2526.6041984558105, "epoch": 0.15085714285714286, "grad_norm": 0.19051432609558105, "kl": 0.0015637874603271484, "learning_rate": 9.282549715730579e-07, "loss": 0.0001, "reward": 0.17219684056180995, "reward_std": 0.8952484987676144, "rewards/cosine_scaled_reward": -0.009550546063110232, "rewards/format_reward": 0.4791666679084301, "step": 132 }, { "completion_length": 3378.1041870117188, "epoch": 0.152, "grad_norm": 0.22686553001403809, "kl": 0.0022031068801879883, "learning_rate": 9.265439410565328e-07, "loss": 0.0001, "reward": -0.4034382812678814, "reward_std": 0.49673712253570557, "rewards/cosine_scaled_reward": -0.20925658009946346, "rewards/format_reward": 0.18750000186264515, "step": 133 }, { "completion_length": 2480.2083892822266, "epoch": 0.15314285714285714, "grad_norm": 0.2461298406124115, "kl": 0.0034208297729492188, "learning_rate": 9.248145583195447e-07, "loss": 0.0001, "reward": -0.010700544342398643, "reward_std": 0.6441561691462994, "rewards/cosine_scaled_reward": -0.10502434149384499, "rewards/format_reward": 0.5625000055879354, "step": 134 }, { "completion_length": 1805.4583473205566, "epoch": 0.15428571428571428, "grad_norm": 0.26353919506073, "kl": 0.0027773380279541016, "learning_rate": 9.230669076497687e-07, "loss": 0.0001, "reward": 0.5292202904820442, "reward_std": 0.849658913910389, "rewards/cosine_scaled_reward": 0.17550375685095787, "rewards/format_reward": 0.708333333954215, "step": 135 }, { "completion_length": 2799.7291870117188, "epoch": 0.15542857142857142, "grad_norm": 0.20877666771411896, "kl": 0.0029191970825195312, "learning_rate": 9.213010742252327e-07, "loss": 0.0001, "reward": 0.07902248203754425, "reward_std": 0.8653755076229572, "rewards/cosine_scaled_reward": 0.008886766619980335, "rewards/format_reward": 0.37500000931322575, "step": 136 }, { "completion_length": 2720.2083587646484, "epoch": 0.15657142857142858, "grad_norm": 0.1975402683019638, "kl": 0.002421855926513672, "learning_rate": 9.195171441101668e-07, "loss": 0.0001, "reward": -0.21935221180319786, "reward_std": 0.518220279365778, "rewards/cosine_scaled_reward": -0.16422798670828342, "rewards/format_reward": 0.41666666977107525, "step": 137 }, { "completion_length": 2631.229217529297, "epoch": 0.15771428571428572, "grad_norm": 0.22529752552509308, "kl": 0.0021669864654541016, "learning_rate": 9.177152042508077e-07, "loss": 0.0001, "reward": -0.1003083037212491, "reward_std": 0.5785807222127914, "rewards/cosine_scaled_reward": -0.13203439861536026, "rewards/format_reward": 0.5208333376795053, "step": 138 }, { "completion_length": 3252.3125610351562, "epoch": 0.15885714285714286, "grad_norm": 0.16783365607261658, "kl": 0.0036573410034179688, "learning_rate": 9.158953424711624e-07, "loss": 0.0001, "reward": -0.16235242877155542, "reward_std": 0.6854680478572845, "rewards/cosine_scaled_reward": -0.1317966803908348, "rewards/format_reward": 0.3541666716337204, "step": 139 }, { "completion_length": 2871.6875610351562, "epoch": 0.16, "grad_norm": 0.23042802512645721, "kl": 0.004063129425048828, "learning_rate": 9.140576474687263e-07, "loss": 0.0002, "reward": -0.0632519107311964, "reward_std": 0.7613547593355179, "rewards/cosine_scaled_reward": -0.076241385191679, "rewards/format_reward": 0.37500000931322575, "step": 140 }, { "completion_length": 2480.291702270508, "epoch": 0.16114285714285714, "grad_norm": 0.2392415851354599, "kl": 0.003994464874267578, "learning_rate": 9.122022088101613e-07, "loss": 0.0002, "reward": 0.05139289842918515, "reward_std": 1.0145743787288666, "rewards/cosine_scaled_reward": -0.13529899902641773, "rewards/format_reward": 0.5625000074505806, "step": 141 }, { "completion_length": 2873.666717529297, "epoch": 0.16228571428571428, "grad_norm": 0.17095746099948883, "kl": 0.0033861398696899414, "learning_rate": 9.103291169269299e-07, "loss": 0.0001, "reward": 0.11978689953684807, "reward_std": 0.6609016172587872, "rewards/cosine_scaled_reward": -0.06103468872606754, "rewards/format_reward": 0.5833333414047956, "step": 142 }, { "completion_length": 2451.541702270508, "epoch": 0.16342857142857142, "grad_norm": 0.2901478111743927, "kl": 0.0036554336547851562, "learning_rate": 9.084384631108882e-07, "loss": 0.0001, "reward": -0.13543805526569486, "reward_std": 0.5446794554591179, "rewards/cosine_scaled_reward": -0.18383409455418587, "rewards/format_reward": 0.5208333469927311, "step": 143 }, { "completion_length": 2912.7083740234375, "epoch": 0.16457142857142856, "grad_norm": 0.21996816992759705, "kl": 0.004309415817260742, "learning_rate": 9.065303395098358e-07, "loss": 0.0002, "reward": 0.015518264845013618, "reward_std": 0.9661316499114037, "rewards/cosine_scaled_reward": -0.05092207749839872, "rewards/format_reward": 0.3333333358168602, "step": 144 }, { "completion_length": 2094.041706085205, "epoch": 0.1657142857142857, "grad_norm": 0.29601243138313293, "kl": 0.003927946090698242, "learning_rate": 9.046048391230247e-07, "loss": 0.0002, "reward": 0.2274817731231451, "reward_std": 0.6735235303640366, "rewards/cosine_scaled_reward": 0.019177459180355072, "rewards/format_reward": 0.6250000055879354, "step": 145 }, { "completion_length": 2216.541717529297, "epoch": 0.16685714285714287, "grad_norm": 0.25898125767707825, "kl": 0.002621889114379883, "learning_rate": 9.026620557966279e-07, "loss": 0.0001, "reward": -0.13925540121272206, "reward_std": 0.5450553633272648, "rewards/cosine_scaled_reward": -0.2378202360123396, "rewards/format_reward": 0.6875000186264515, "step": 146 }, { "completion_length": 2532.7916870117188, "epoch": 0.168, "grad_norm": 0.41119325160980225, "kl": 0.0048122406005859375, "learning_rate": 9.007020842191634e-07, "loss": 0.0002, "reward": 0.028958545066416264, "reward_std": 0.9969764724373817, "rewards/cosine_scaled_reward": -0.12233338970690966, "rewards/format_reward": 0.4583333507180214, "step": 147 }, { "completion_length": 1958.9792175292969, "epoch": 0.16914285714285715, "grad_norm": 0.21329711377620697, "kl": 0.0035009384155273438, "learning_rate": 8.987250199168808e-07, "loss": 0.0001, "reward": 0.06930056714918464, "reward_std": 0.7063222527503967, "rewards/cosine_scaled_reward": -0.13013077899813652, "rewards/format_reward": 0.6875000111758709, "step": 148 }, { "completion_length": 2652.1875610351562, "epoch": 0.1702857142857143, "grad_norm": 0.1904420405626297, "kl": 0.0030150413513183594, "learning_rate": 8.967309592491052e-07, "loss": 0.0001, "reward": 0.0019146008417010307, "reward_std": 0.813957192003727, "rewards/cosine_scaled_reward": -0.10279210843145847, "rewards/format_reward": 0.47916666977107525, "step": 149 }, { "completion_length": 2578.625030517578, "epoch": 0.17142857142857143, "grad_norm": 0.1853615641593933, "kl": 0.005157470703125, "learning_rate": 8.9471999940354e-07, "loss": 0.0002, "reward": 0.13735826686024666, "reward_std": 0.8738452345132828, "rewards/cosine_scaled_reward": -0.027063111774623394, "rewards/format_reward": 0.4583333432674408, "step": 150 }, { "completion_length": 2566.541732788086, "epoch": 0.17257142857142857, "grad_norm": 0.23767390847206116, "kl": 0.0045833587646484375, "learning_rate": 8.926922383915315e-07, "loss": 0.0002, "reward": 0.3640142543008551, "reward_std": 0.9924535490572453, "rewards/cosine_scaled_reward": 0.056208414025604725, "rewards/format_reward": 0.6041666809469461, "step": 151 }, { "completion_length": 2766.5208702087402, "epoch": 0.1737142857142857, "grad_norm": 0.24616549909114838, "kl": 0.0033342838287353516, "learning_rate": 8.906477750432903e-07, "loss": 0.0001, "reward": -0.30499533005058765, "reward_std": 0.5299477484077215, "rewards/cosine_scaled_reward": -0.2434069886803627, "rewards/format_reward": 0.41666667349636555, "step": 152 }, { "completion_length": 2895.229202270508, "epoch": 0.17485714285714285, "grad_norm": 0.2707502543926239, "kl": 0.007465362548828125, "learning_rate": 8.88586709003076e-07, "loss": 0.0003, "reward": -0.28449683357030153, "reward_std": 0.5924058370292187, "rewards/cosine_scaled_reward": -0.19268111791461706, "rewards/format_reward": 0.3333333395421505, "step": 153 }, { "completion_length": 3004.7709045410156, "epoch": 0.176, "grad_norm": 0.1523827612400055, "kl": 0.00289154052734375, "learning_rate": 8.865091407243394e-07, "loss": 0.0001, "reward": 0.22956039011478424, "reward_std": 0.9176076352596283, "rewards/cosine_scaled_reward": 0.006945975736016408, "rewards/format_reward": 0.479166679084301, "step": 154 }, { "completion_length": 2524.5000762939453, "epoch": 0.17714285714285713, "grad_norm": 0.24581067264080048, "kl": 0.004633903503417969, "learning_rate": 8.844151714648274e-07, "loss": 0.0002, "reward": 0.3532795161008835, "reward_std": 0.9012942314147949, "rewards/cosine_scaled_reward": 0.0729524465277791, "rewards/format_reward": 0.5416666679084301, "step": 155 }, { "completion_length": 2804.3958587646484, "epoch": 0.1782857142857143, "grad_norm": 0.17449386417865753, "kl": 0.0034241676330566406, "learning_rate": 8.823049032816478e-07, "loss": 0.0001, "reward": 0.0325262644328177, "reward_std": 0.7823463976383209, "rewards/cosine_scaled_reward": -0.03533525764942169, "rewards/format_reward": 0.3333333395421505, "step": 156 }, { "completion_length": 2587.6458587646484, "epoch": 0.17942857142857144, "grad_norm": 0.25060343742370605, "kl": 0.004924774169921875, "learning_rate": 8.801784390262943e-07, "loss": 0.0002, "reward": -0.024557745084166527, "reward_std": 0.6769015416502953, "rewards/cosine_scaled_reward": -0.1357441581785679, "rewards/format_reward": 0.5208333488553762, "step": 157 }, { "completion_length": 2703.291732788086, "epoch": 0.18057142857142858, "grad_norm": 0.22450025379657745, "kl": 0.0042743682861328125, "learning_rate": 8.780358823396352e-07, "loss": 0.0002, "reward": 0.35095504857599735, "reward_std": 0.8320975676178932, "rewards/cosine_scaled_reward": 0.09731243550777435, "rewards/format_reward": 0.5625000074505806, "step": 158 }, { "completion_length": 2376.3750610351562, "epoch": 0.18171428571428572, "grad_norm": 0.21200759708881378, "kl": 0.0042476654052734375, "learning_rate": 8.758773376468604e-07, "loss": 0.0002, "reward": -0.21574918151600286, "reward_std": 0.5446000955998898, "rewards/cosine_scaled_reward": -0.2437344677746296, "rewards/format_reward": 0.5416666716337204, "step": 159 }, { "completion_length": 2424.9583892822266, "epoch": 0.18285714285714286, "grad_norm": 0.2798510491847992, "kl": 0.0068912506103515625, "learning_rate": 8.737029101523929e-07, "loss": 0.0003, "reward": 0.04046674119308591, "reward_std": 0.7019340619444847, "rewards/cosine_scaled_reward": -0.06891408376395702, "rewards/format_reward": 0.5000000093132257, "step": 160 }, { "completion_length": 2377.0625610351562, "epoch": 0.184, "grad_norm": 0.23946921527385712, "kl": 0.004667758941650391, "learning_rate": 8.715127058347614e-07, "loss": 0.0002, "reward": 0.18844679649919271, "reward_std": 0.7960058376193047, "rewards/cosine_scaled_reward": 0.006343178451061249, "rewards/format_reward": 0.5833333414047956, "step": 161 }, { "completion_length": 2592.708366394043, "epoch": 0.18514285714285714, "grad_norm": 0.27762702107429504, "kl": 0.0064907073974609375, "learning_rate": 8.693068314414344e-07, "loss": 0.0003, "reward": -0.033426298294216394, "reward_std": 0.6928768754005432, "rewards/cosine_scaled_reward": -0.1188662868225947, "rewards/format_reward": 0.47916666977107525, "step": 162 }, { "completion_length": 2438.3541717529297, "epoch": 0.18628571428571428, "grad_norm": 0.28610387444496155, "kl": 0.005778312683105469, "learning_rate": 8.670853944836176e-07, "loss": 0.0002, "reward": 0.2196616232395172, "reward_std": 0.7138870656490326, "rewards/cosine_scaled_reward": 0.04680558480322361, "rewards/format_reward": 0.5625000074505806, "step": 163 }, { "completion_length": 1862.2292022705078, "epoch": 0.18742857142857142, "grad_norm": 0.22276760637760162, "kl": 0.00450897216796875, "learning_rate": 8.648485032310144e-07, "loss": 0.0002, "reward": 0.2688090084120631, "reward_std": 0.662966214120388, "rewards/cosine_scaled_reward": 0.05209773499518633, "rewards/format_reward": 0.7083333432674408, "step": 164 }, { "completion_length": 2642.979202270508, "epoch": 0.18857142857142858, "grad_norm": 0.2338155210018158, "kl": 0.006450653076171875, "learning_rate": 8.625962667065487e-07, "loss": 0.0003, "reward": -0.11441950500011444, "reward_std": 0.6914314143359661, "rewards/cosine_scaled_reward": -0.15340343955904245, "rewards/format_reward": 0.45833334140479565, "step": 165 }, { "completion_length": 2228.5208892822266, "epoch": 0.18971428571428572, "grad_norm": 0.19290319085121155, "kl": 0.0036525726318359375, "learning_rate": 8.603287946810513e-07, "loss": 0.0001, "reward": 0.039148006588220596, "reward_std": 0.5984033793210983, "rewards/cosine_scaled_reward": -0.10414127632975578, "rewards/format_reward": 0.6250000093132257, "step": 166 }, { "completion_length": 2005.8125610351562, "epoch": 0.19085714285714286, "grad_norm": 0.19015978276729584, "kl": 0.0037364959716796875, "learning_rate": 8.580461976679099e-07, "loss": 0.0001, "reward": 0.278128509176895, "reward_std": 0.851395096629858, "rewards/cosine_scaled_reward": -0.09441124647855759, "rewards/format_reward": 0.854166679084301, "step": 167 }, { "completion_length": 2624.979217529297, "epoch": 0.192, "grad_norm": 0.18921926617622375, "kl": 0.004267692565917969, "learning_rate": 8.557485869176825e-07, "loss": 0.0002, "reward": 0.18450849317014217, "reward_std": 0.9270573705434799, "rewards/cosine_scaled_reward": -0.05149332107976079, "rewards/format_reward": 0.5833333469927311, "step": 168 }, { "completion_length": 1482.1042022705078, "epoch": 0.19314285714285714, "grad_norm": 0.23450377583503723, "kl": 0.004532814025878906, "learning_rate": 8.534360744126753e-07, "loss": 0.0002, "reward": 0.7876127276103944, "reward_std": 0.8578404039144516, "rewards/cosine_scaled_reward": 0.2795695327222347, "rewards/format_reward": 0.916666679084301, "step": 169 }, { "completion_length": 2186.9167251586914, "epoch": 0.19428571428571428, "grad_norm": 0.30296990275382996, "kl": 0.0044994354248046875, "learning_rate": 8.511087728614862e-07, "loss": 0.0002, "reward": 0.056005215272307396, "reward_std": 0.6679836474359035, "rewards/cosine_scaled_reward": -0.021252445876598358, "rewards/format_reward": 0.5625000018626451, "step": 170 }, { "completion_length": 2248.312530517578, "epoch": 0.19542857142857142, "grad_norm": 0.20839498937129974, "kl": 0.0036230087280273438, "learning_rate": 8.487667956935087e-07, "loss": 0.0001, "reward": 0.16125285997986794, "reward_std": 0.7742661274969578, "rewards/cosine_scaled_reward": -0.013224839232861996, "rewards/format_reward": 0.5208333395421505, "step": 171 }, { "completion_length": 2516.833351135254, "epoch": 0.19657142857142856, "grad_norm": 0.21489036083221436, "kl": 0.007071495056152344, "learning_rate": 8.464102570534061e-07, "loss": 0.0003, "reward": 0.3356585130095482, "reward_std": 0.7922957856208086, "rewards/cosine_scaled_reward": 0.13084825314581394, "rewards/format_reward": 0.5000000055879354, "step": 172 }, { "completion_length": 1844.708381652832, "epoch": 0.1977142857142857, "grad_norm": 0.23371165990829468, "kl": 0.00569915771484375, "learning_rate": 8.440392717955475e-07, "loss": 0.0002, "reward": 0.0007524143438786268, "reward_std": 0.7251704446971416, "rewards/cosine_scaled_reward": -0.1511296879616566, "rewards/format_reward": 0.6458333395421505, "step": 173 }, { "completion_length": 1865.3958740234375, "epoch": 0.19885714285714284, "grad_norm": 0.24605944752693176, "kl": 0.0073699951171875, "learning_rate": 8.416539554784089e-07, "loss": 0.0003, "reward": 0.40005480125546455, "reward_std": 0.8749304339289665, "rewards/cosine_scaled_reward": 0.05308325891382992, "rewards/format_reward": 0.7916666753590107, "step": 174 }, { "completion_length": 2525.270896911621, "epoch": 0.2, "grad_norm": 0.20796766877174377, "kl": 0.005358695983886719, "learning_rate": 8.392544243589427e-07, "loss": 0.0002, "reward": 0.15992992464452982, "reward_std": 0.6595781818032265, "rewards/cosine_scaled_reward": 0.004710111767053604, "rewards/format_reward": 0.5208333376795053, "step": 175 }, { "completion_length": 2078.895896911621, "epoch": 0.20114285714285715, "grad_norm": 0.28462517261505127, "kl": 0.005040168762207031, "learning_rate": 8.368407953869103e-07, "loss": 0.0002, "reward": 0.25200897455215454, "reward_std": 0.9856929145753384, "rewards/cosine_scaled_reward": -0.05302844103425741, "rewards/format_reward": 0.6666666716337204, "step": 176 }, { "completion_length": 2306.2083740234375, "epoch": 0.2022857142857143, "grad_norm": 0.24023154377937317, "kl": 0.005183219909667969, "learning_rate": 8.344131861991828e-07, "loss": 0.0002, "reward": 0.28276925068348646, "reward_std": 0.6319316327571869, "rewards/cosine_scaled_reward": 0.06355313770473003, "rewards/format_reward": 0.6458333432674408, "step": 177 }, { "completion_length": 2121.791717529297, "epoch": 0.20342857142857143, "grad_norm": 0.2661464214324951, "kl": 0.008258819580078125, "learning_rate": 8.319717151140072e-07, "loss": 0.0003, "reward": 0.013979046139866114, "reward_std": 0.7351427339017391, "rewards/cosine_scaled_reward": -0.14257426233962178, "rewards/format_reward": 0.6250000055879354, "step": 178 }, { "completion_length": 2433.708351135254, "epoch": 0.20457142857142857, "grad_norm": 0.20986828207969666, "kl": 0.004935264587402344, "learning_rate": 8.295165011252396e-07, "loss": 0.0002, "reward": -0.25348505191504955, "reward_std": 0.48908819630742073, "rewards/cosine_scaled_reward": -0.21158038638532162, "rewards/format_reward": 0.4583333358168602, "step": 179 }, { "completion_length": 1877.1458892822266, "epoch": 0.2057142857142857, "grad_norm": 0.3029525876045227, "kl": 0.007785797119140625, "learning_rate": 8.270476638965461e-07, "loss": 0.0003, "reward": 0.3592034715693444, "reward_std": 1.002089962363243, "rewards/cosine_scaled_reward": 0.08408734039403498, "rewards/format_reward": 0.666666679084301, "step": 180 }, { "completion_length": 2741.729248046875, "epoch": 0.20685714285714285, "grad_norm": 0.2602676451206207, "kl": 0.00711822509765625, "learning_rate": 8.245653237555705e-07, "loss": 0.0003, "reward": -0.08426681905984879, "reward_std": 0.5513291470706463, "rewards/cosine_scaled_reward": -0.07528127636760473, "rewards/format_reward": 0.4583333358168602, "step": 181 }, { "completion_length": 1988.2292175292969, "epoch": 0.208, "grad_norm": 0.17226605117321014, "kl": 0.0025339126586914062, "learning_rate": 8.220696016880687e-07, "loss": 0.0001, "reward": 0.23337539401836693, "reward_std": 0.824427492916584, "rewards/cosine_scaled_reward": -0.0713941128924489, "rewards/format_reward": 0.7083333432674408, "step": 182 }, { "completion_length": 1507.3958892822266, "epoch": 0.20914285714285713, "grad_norm": 0.2777194082736969, "kl": 0.006999969482421875, "learning_rate": 8.195606193320136e-07, "loss": 0.0003, "reward": 0.5963715696707368, "reward_std": 0.8888205997645855, "rewards/cosine_scaled_reward": 0.07703271105128806, "rewards/format_reward": 0.8958333432674408, "step": 183 }, { "completion_length": 2085.979202270508, "epoch": 0.2102857142857143, "grad_norm": 0.2736140489578247, "kl": 0.0066013336181640625, "learning_rate": 8.170384989716657e-07, "loss": 0.0003, "reward": -0.11966793239116669, "reward_std": 0.4983285814523697, "rewards/cosine_scaled_reward": -0.20256702601909637, "rewards/format_reward": 0.6458333395421505, "step": 184 }, { "completion_length": 2048.5208854675293, "epoch": 0.21142857142857144, "grad_norm": 0.24372698366641998, "kl": 0.0058803558349609375, "learning_rate": 8.145033635316128e-07, "loss": 0.0002, "reward": 0.06693027447909117, "reward_std": 0.6862058416008949, "rewards/cosine_scaled_reward": -0.10436064226087183, "rewards/format_reward": 0.6250000093132257, "step": 185 }, { "completion_length": 2159.791702270508, "epoch": 0.21257142857142858, "grad_norm": 0.20706257224082947, "kl": 0.006072998046875, "learning_rate": 8.119553365707802e-07, "loss": 0.0002, "reward": 0.0768028711900115, "reward_std": 0.5925656575709581, "rewards/cosine_scaled_reward": -0.05905535398051143, "rewards/format_reward": 0.5625, "step": 186 }, { "completion_length": 1804.8542022705078, "epoch": 0.21371428571428572, "grad_norm": 0.2685582637786865, "kl": 0.0061969757080078125, "learning_rate": 8.093945422764069e-07, "loss": 0.0002, "reward": 0.12330089835450053, "reward_std": 0.606924245133996, "rewards/cosine_scaled_reward": -0.09539542999118567, "rewards/format_reward": 0.7500000111758709, "step": 187 }, { "completion_length": 2442.166702270508, "epoch": 0.21485714285714286, "grad_norm": 0.2364640235900879, "kl": 0.0075626373291015625, "learning_rate": 8.068211054579943e-07, "loss": 0.0003, "reward": -0.09115996491163969, "reward_std": 0.6131051359698176, "rewards/cosine_scaled_reward": -0.1099636135622859, "rewards/format_reward": 0.4791666753590107, "step": 188 }, { "completion_length": 1640.333366394043, "epoch": 0.216, "grad_norm": 0.6100494861602783, "kl": 0.006476402282714844, "learning_rate": 8.04235151541222e-07, "loss": 0.0003, "reward": 0.12896017776802182, "reward_std": 0.6158286519348621, "rewards/cosine_scaled_reward": -0.10562538355588913, "rewards/format_reward": 0.7291666828095913, "step": 189 }, { "completion_length": 1453.6042022705078, "epoch": 0.21714285714285714, "grad_norm": 0.2732960283756256, "kl": 0.006473541259765625, "learning_rate": 8.01636806561836e-07, "loss": 0.0003, "reward": 0.2690247750142589, "reward_std": 0.7748389020562172, "rewards/cosine_scaled_reward": -0.04900714522227645, "rewards/format_reward": 0.8333333432674408, "step": 190 }, { "completion_length": 1478.0000305175781, "epoch": 0.21828571428571428, "grad_norm": 0.25885748863220215, "kl": 0.0072765350341796875, "learning_rate": 7.990261971595048e-07, "loss": 0.0003, "reward": 0.5228147888556123, "reward_std": 0.9636592417955399, "rewards/cosine_scaled_reward": 0.07218710612505674, "rewards/format_reward": 0.812500013038516, "step": 191 }, { "completion_length": 1954.8542251586914, "epoch": 0.21942857142857142, "grad_norm": 0.22047950327396393, "kl": 0.0056095123291015625, "learning_rate": 7.964034505716476e-07, "loss": 0.0002, "reward": 0.0916488622315228, "reward_std": 0.666417833417654, "rewards/cosine_scaled_reward": -0.13260145671665668, "rewards/format_reward": 0.7083333488553762, "step": 192 }, { "completion_length": 2805.520866394043, "epoch": 0.22057142857142858, "grad_norm": 0.22538112103939056, "kl": 0.0061187744140625, "learning_rate": 7.93768694627233e-07, "loss": 0.0002, "reward": -0.33769641164690256, "reward_std": 0.507177896797657, "rewards/cosine_scaled_reward": -0.24244121788069606, "rewards/format_reward": 0.39583333767950535, "step": 193 }, { "completion_length": 2417.354202270508, "epoch": 0.22171428571428572, "grad_norm": 0.2549934983253479, "kl": 0.007659912109375, "learning_rate": 7.911220577405484e-07, "loss": 0.0003, "reward": 0.4917243723757565, "reward_std": 1.1182497814297676, "rewards/cosine_scaled_reward": 0.08968745917081833, "rewards/format_reward": 0.645833345130086, "step": 194 }, { "completion_length": 1545.1667175292969, "epoch": 0.22285714285714286, "grad_norm": 0.2932279706001282, "kl": 0.0074100494384765625, "learning_rate": 7.884636689049422e-07, "loss": 0.0003, "reward": 0.40460192365571856, "reward_std": 0.8929797559976578, "rewards/cosine_scaled_reward": -0.016500022262334824, "rewards/format_reward": 0.8541666716337204, "step": 195 }, { "completion_length": 2750.791717529297, "epoch": 0.224, "grad_norm": 0.2122720181941986, "kl": 0.007846832275390625, "learning_rate": 7.857936576865356e-07, "loss": 0.0003, "reward": -0.0444787316955626, "reward_std": 0.6230567768216133, "rewards/cosine_scaled_reward": -0.09410630911588669, "rewards/format_reward": 0.5208333358168602, "step": 196 }, { "completion_length": 1153.0000381469727, "epoch": 0.22514285714285714, "grad_norm": 0.30193769931793213, "kl": 0.00811767578125, "learning_rate": 7.831121542179086e-07, "loss": 0.0003, "reward": 0.5208917018026114, "reward_std": 0.8946940749883652, "rewards/cosine_scaled_reward": 0.04951087199151516, "rewards/format_reward": 0.8958333395421505, "step": 197 }, { "completion_length": 1570.083396911621, "epoch": 0.22628571428571428, "grad_norm": 0.2520270347595215, "kl": 0.008256912231445312, "learning_rate": 7.804192891917571e-07, "loss": 0.0003, "reward": 0.43910311779472977, "reward_std": 0.9849786385893822, "rewards/cosine_scaled_reward": -0.005096456501632929, "rewards/format_reward": 0.8125000149011612, "step": 198 }, { "completion_length": 1278.4375305175781, "epoch": 0.22742857142857142, "grad_norm": 0.265682190656662, "kl": 0.00727081298828125, "learning_rate": 7.777151938545235e-07, "loss": 0.0003, "reward": 0.16469779529143125, "reward_std": 0.5715500190854073, "rewards/cosine_scaled_reward": -0.1407376565039158, "rewards/format_reward": 0.9375, "step": 199 }, { "completion_length": 1395.8541946411133, "epoch": 0.22857142857142856, "grad_norm": 0.24266065657138824, "kl": 0.0067348480224609375, "learning_rate": 7.75e-07, "loss": 0.0003, "reward": 0.3167417113436386, "reward_std": 0.7663153670728207, "rewards/cosine_scaled_reward": -0.044573438179213554, "rewards/format_reward": 0.8958333432674408, "step": 200 }, { "completion_length": 1824.4375457763672, "epoch": 0.2297142857142857, "grad_norm": 0.23948884010314941, "kl": 0.00595855712890625, "learning_rate": 7.72273839962904e-07, "loss": 0.0002, "reward": 0.7574443845078349, "reward_std": 0.8649395480751991, "rewards/cosine_scaled_reward": 0.28773305006325245, "rewards/format_reward": 0.8333333432674408, "step": 201 }, { "completion_length": 1388.5833740234375, "epoch": 0.23085714285714284, "grad_norm": 0.2889217734336853, "kl": 0.008331298828125, "learning_rate": 7.695368466124296e-07, "loss": 0.0003, "reward": 0.46887177898315713, "reward_std": 0.6449649855494499, "rewards/cosine_scaled_reward": 0.19245607405900955, "rewards/format_reward": 0.8541666716337204, "step": 202 }, { "completion_length": 1545.083396911621, "epoch": 0.232, "grad_norm": 0.24110884964466095, "kl": 0.0080413818359375, "learning_rate": 7.667891533457718e-07, "loss": 0.0003, "reward": 0.37680933251976967, "reward_std": 0.8439403660595417, "rewards/cosine_scaled_reward": -0.034176651388406754, "rewards/format_reward": 0.8750000149011612, "step": 203 }, { "completion_length": 1558.5833740234375, "epoch": 0.23314285714285715, "grad_norm": 0.3269757628440857, "kl": 0.0079345703125, "learning_rate": 7.640308940816239e-07, "loss": 0.0003, "reward": 0.4802742376923561, "reward_std": 0.7629036977887154, "rewards/cosine_scaled_reward": 0.10827891901135445, "rewards/format_reward": 0.8333333432674408, "step": 204 }, { "completion_length": 1542.5000305175781, "epoch": 0.2342857142857143, "grad_norm": 0.2582697570323944, "kl": 0.0057811737060546875, "learning_rate": 7.612622032536507e-07, "loss": 0.0002, "reward": 0.7918389849364758, "reward_std": 0.8735720105469227, "rewards/cosine_scaled_reward": 0.29279233887791634, "rewards/format_reward": 0.8750000074505806, "step": 205 }, { "completion_length": 1852.4167098999023, "epoch": 0.23542857142857143, "grad_norm": 0.30814215540885925, "kl": 0.007049560546875, "learning_rate": 7.584832158039378e-07, "loss": 0.0003, "reward": -0.12371287122368813, "reward_std": 0.5305570438504219, "rewards/cosine_scaled_reward": -0.24367139115929604, "rewards/format_reward": 0.7083333544433117, "step": 206 }, { "completion_length": 1706.8125610351562, "epoch": 0.23657142857142857, "grad_norm": 0.2674766182899475, "kl": 0.009944915771484375, "learning_rate": 7.556940671764124e-07, "loss": 0.0004, "reward": 0.018216492608189583, "reward_std": 0.8229854069650173, "rewards/cosine_scaled_reward": -0.2149236612021923, "rewards/format_reward": 0.7500000074505806, "step": 207 }, { "completion_length": 1185.0833587646484, "epoch": 0.2377142857142857, "grad_norm": 0.24083174765110016, "kl": 0.0078887939453125, "learning_rate": 7.528948933102438e-07, "loss": 0.0003, "reward": 0.2574625021661632, "reward_std": 0.5847832001745701, "rewards/cosine_scaled_reward": -0.02713889814913273, "rewards/format_reward": 0.9375000074505806, "step": 208 }, { "completion_length": 1211.3333587646484, "epoch": 0.23885714285714285, "grad_norm": 0.28845900297164917, "kl": 0.0077152252197265625, "learning_rate": 7.500858306332172e-07, "loss": 0.0003, "reward": 0.5239271614700556, "reward_std": 0.8414975665509701, "rewards/cosine_scaled_reward": 0.07757960073649883, "rewards/format_reward": 0.916666679084301, "step": 209 }, { "completion_length": 1571.8333892822266, "epoch": 0.24, "grad_norm": 0.26938140392303467, "kl": 0.006866455078125, "learning_rate": 7.472670160550848e-07, "loss": 0.0003, "reward": 0.39476348645985126, "reward_std": 0.8261179774999619, "rewards/cosine_scaled_reward": -0.00616603484377265, "rewards/format_reward": 0.8541666753590107, "step": 210 }, { "completion_length": 1672.083366394043, "epoch": 0.24114285714285713, "grad_norm": 0.358892560005188, "kl": 0.0106353759765625, "learning_rate": 7.444385869608921e-07, "loss": 0.0004, "reward": 0.3219002881087363, "reward_std": 0.750870831310749, "rewards/cosine_scaled_reward": 0.013120350427925587, "rewards/format_reward": 0.7916666679084301, "step": 211 }, { "completion_length": 1112.6250534057617, "epoch": 0.2422857142857143, "grad_norm": 0.27129727602005005, "kl": 0.008617401123046875, "learning_rate": 7.416006812042827e-07, "loss": 0.0003, "reward": 0.5328430655645207, "reward_std": 0.671292532235384, "rewards/cosine_scaled_reward": 0.12215181812644005, "rewards/format_reward": 0.9375, "step": 212 }, { "completion_length": 1226.1666870117188, "epoch": 0.24342857142857144, "grad_norm": 0.36483535170555115, "kl": 0.01009368896484375, "learning_rate": 7.387534371007797e-07, "loss": 0.0004, "reward": 0.47296964284032583, "reward_std": 0.7435989566147327, "rewards/cosine_scaled_reward": 0.06554012396372855, "rewards/format_reward": 0.8958333432674408, "step": 213 }, { "completion_length": 1548.1875228881836, "epoch": 0.24457142857142858, "grad_norm": 0.3107360303401947, "kl": 0.008228302001953125, "learning_rate": 7.358969934210438e-07, "loss": 0.0003, "reward": 0.5165067554917186, "reward_std": 0.7588581070303917, "rewards/cosine_scaled_reward": 0.10956975258886814, "rewards/format_reward": 0.8541666716337204, "step": 214 }, { "completion_length": 1461.6875305175781, "epoch": 0.24571428571428572, "grad_norm": 0.22486445307731628, "kl": 0.0059413909912109375, "learning_rate": 7.330314893841101e-07, "loss": 0.0002, "reward": 0.07686804980039597, "reward_std": 0.5298155099153519, "rewards/cosine_scaled_reward": -0.14177834056317806, "rewards/format_reward": 0.8333333358168602, "step": 215 }, { "completion_length": 1340.5208702087402, "epoch": 0.24685714285714286, "grad_norm": 0.2673039436340332, "kl": 0.008434295654296875, "learning_rate": 7.301570646506027e-07, "loss": 0.0003, "reward": 0.38506741262972355, "reward_std": 0.6937872804701328, "rewards/cosine_scaled_reward": 0.08268431574106216, "rewards/format_reward": 0.8125000074505806, "step": 216 }, { "completion_length": 1469.6250534057617, "epoch": 0.248, "grad_norm": 0.3538609445095062, "kl": 0.008731842041015625, "learning_rate": 7.27273859315928e-07, "loss": 0.0003, "reward": 0.6034841773507651, "reward_std": 1.0110028125345707, "rewards/cosine_scaled_reward": 0.10976234765257686, "rewards/format_reward": 0.8125000037252903, "step": 217 }, { "completion_length": 1511.458381652832, "epoch": 0.24914285714285714, "grad_norm": 0.2224741131067276, "kl": 0.007978439331054688, "learning_rate": 7.243820139034464e-07, "loss": 0.0003, "reward": 0.21123092295601964, "reward_std": 0.7925957031548023, "rewards/cosine_scaled_reward": -0.1498157843016088, "rewards/format_reward": 0.8958333432674408, "step": 218 }, { "completion_length": 1340.4791870117188, "epoch": 0.2502857142857143, "grad_norm": 0.24136604368686676, "kl": 0.009267807006835938, "learning_rate": 7.214816693576234e-07, "loss": 0.0004, "reward": 0.3911650243680924, "reward_std": 0.6955340765416622, "rewards/cosine_scaled_reward": 0.048877415247261524, "rewards/format_reward": 0.875, "step": 219 }, { "completion_length": 1524.354175567627, "epoch": 0.25142857142857145, "grad_norm": 0.2906893491744995, "kl": 0.011249542236328125, "learning_rate": 7.185729670371604e-07, "loss": 0.0004, "reward": -0.04361694771796465, "reward_std": 0.5702285468578339, "rewards/cosine_scaled_reward": -0.2380666360259056, "rewards/format_reward": 0.8125000074505806, "step": 220 }, { "completion_length": 1496.2916870117188, "epoch": 0.25257142857142856, "grad_norm": 0.24429333209991455, "kl": 0.0070476531982421875, "learning_rate": 7.156560487081051e-07, "loss": 0.0003, "reward": 0.4626695259066764, "reward_std": 0.7840724922716618, "rewards/cosine_scaled_reward": 0.06611506012268364, "rewards/format_reward": 0.8541666716337204, "step": 221 }, { "completion_length": 1576.9167022705078, "epoch": 0.2537142857142857, "grad_norm": 0.26980236172676086, "kl": 0.009477615356445312, "learning_rate": 7.127310565369415e-07, "loss": 0.0004, "reward": 0.2927159178070724, "reward_std": 0.6861963458359241, "rewards/cosine_scaled_reward": -0.02613269304856658, "rewards/format_reward": 0.7916666679084301, "step": 222 }, { "completion_length": 1766.4583587646484, "epoch": 0.25485714285714284, "grad_norm": 0.3431569039821625, "kl": 0.009063720703125, "learning_rate": 7.097981330836616e-07, "loss": 0.0004, "reward": 0.24513752292841673, "reward_std": 0.6672438345849514, "rewards/cosine_scaled_reward": 0.0011468753218650818, "rewards/format_reward": 0.7083333432674408, "step": 223 }, { "completion_length": 1734.6458587646484, "epoch": 0.256, "grad_norm": 0.2113533616065979, "kl": 0.008029937744140625, "learning_rate": 7.068574212948169e-07, "loss": 0.0003, "reward": 0.6102093638037331, "reward_std": 1.0698180794715881, "rewards/cosine_scaled_reward": 0.07039764476940036, "rewards/format_reward": 0.8541666753590107, "step": 224 }, { "completion_length": 1987.958396911621, "epoch": 0.2571428571428571, "grad_norm": 0.3268744945526123, "kl": 0.012989044189453125, "learning_rate": 7.039090644965509e-07, "loss": 0.0005, "reward": 0.24530102079734206, "reward_std": 0.9006591737270355, "rewards/cosine_scaled_reward": -0.0955775510519743, "rewards/format_reward": 0.7708333507180214, "step": 225 }, { "completion_length": 1425.6041946411133, "epoch": 0.2582857142857143, "grad_norm": 0.24840526282787323, "kl": 0.00763702392578125, "learning_rate": 7.009532063876148e-07, "loss": 0.0003, "reward": 0.6913954578340054, "reward_std": 0.8804083652794361, "rewards/cosine_scaled_reward": 0.15895199915394187, "rewards/format_reward": 0.9375000074505806, "step": 226 }, { "completion_length": 1124.0416946411133, "epoch": 0.25942857142857145, "grad_norm": 0.31861069798469543, "kl": 0.011066436767578125, "learning_rate": 6.979899910323624e-07, "loss": 0.0004, "reward": 0.3930067252367735, "reward_std": 0.8200523294508457, "rewards/cosine_scaled_reward": -0.05645215045660734, "rewards/format_reward": 0.9583333358168602, "step": 227 }, { "completion_length": 1212.2291984558105, "epoch": 0.26057142857142856, "grad_norm": 0.27715247869491577, "kl": 0.008419036865234375, "learning_rate": 6.950195628537299e-07, "loss": 0.0003, "reward": 0.5142936524935067, "reward_std": 0.8248747400939465, "rewards/cosine_scaled_reward": 0.05838426947593689, "rewards/format_reward": 0.895833333954215, "step": 228 }, { "completion_length": 1521.9167175292969, "epoch": 0.26171428571428573, "grad_norm": 0.3403978645801544, "kl": 0.012042999267578125, "learning_rate": 6.920420666261961e-07, "loss": 0.0005, "reward": 0.19939115084707737, "reward_std": 0.5617837458848953, "rewards/cosine_scaled_reward": -0.031215182272717357, "rewards/format_reward": 0.8125000149011612, "step": 229 }, { "completion_length": 1612.5000305175781, "epoch": 0.26285714285714284, "grad_norm": 0.27715978026390076, "kl": 0.010951995849609375, "learning_rate": 6.890576474687263e-07, "loss": 0.0004, "reward": 0.025840092916041613, "reward_std": 0.5985405147075653, "rewards/cosine_scaled_reward": -0.1870412821881473, "rewards/format_reward": 0.791666679084301, "step": 230 }, { "completion_length": 1249.7500305175781, "epoch": 0.264, "grad_norm": 0.2477809637784958, "kl": 0.00991058349609375, "learning_rate": 6.860664508377001e-07, "loss": 0.0004, "reward": 0.45136169949546456, "reward_std": 0.7211090363562107, "rewards/cosine_scaled_reward": 0.07621757127344608, "rewards/format_reward": 0.8541666697710752, "step": 231 }, { "completion_length": 1592.9791946411133, "epoch": 0.2651428571428571, "grad_norm": 0.30748802423477173, "kl": 0.012454986572265625, "learning_rate": 6.83068622519821e-07, "loss": 0.0005, "reward": 0.012111502306652255, "reward_std": 0.6053478866815567, "rewards/cosine_scaled_reward": -0.21021990105509758, "rewards/format_reward": 0.8125000018626451, "step": 232 }, { "completion_length": 1202.4375495910645, "epoch": 0.2662857142857143, "grad_norm": 0.2600504755973816, "kl": 0.011119842529296875, "learning_rate": 6.800643086250121e-07, "loss": 0.0004, "reward": 0.3125277090584859, "reward_std": 0.7523941993713379, "rewards/cosine_scaled_reward": -0.07805835455656052, "rewards/format_reward": 0.9375000149011612, "step": 233 }, { "completion_length": 1664.25004196167, "epoch": 0.2674285714285714, "grad_norm": 0.28175073862075806, "kl": 0.013034820556640625, "learning_rate": 6.770536555792944e-07, "loss": 0.0005, "reward": 0.2759629947831854, "reward_std": 0.7744503617286682, "rewards/cosine_scaled_reward": -0.00288372952491045, "rewards/format_reward": 0.729166679084301, "step": 234 }, { "completion_length": 1031.729206085205, "epoch": 0.26857142857142857, "grad_norm": 0.41683852672576904, "kl": 0.009319305419921875, "learning_rate": 6.740368101176495e-07, "loss": 0.0004, "reward": 0.536057305522263, "reward_std": 0.8349857330322266, "rewards/cosine_scaled_reward": 0.1025073304772377, "rewards/format_reward": 0.9375000074505806, "step": 235 }, { "completion_length": 1922.7708587646484, "epoch": 0.26971428571428574, "grad_norm": 0.2624496519565582, "kl": 0.010082244873046875, "learning_rate": 6.710139192768694e-07, "loss": 0.0004, "reward": 0.5450696041807532, "reward_std": 1.0304477401077747, "rewards/cosine_scaled_reward": 0.09541857382282615, "rewards/format_reward": 0.7500000055879354, "step": 236 }, { "completion_length": 1460.6250305175781, "epoch": 0.27085714285714285, "grad_norm": 0.2655409872531891, "kl": 0.0093231201171875, "learning_rate": 6.679851303883891e-07, "loss": 0.0004, "reward": 0.4284868792165071, "reward_std": 0.6683508493006229, "rewards/cosine_scaled_reward": 0.04793188441544771, "rewards/format_reward": 0.875, "step": 237 }, { "completion_length": 1020.8125305175781, "epoch": 0.272, "grad_norm": 0.27945178747177124, "kl": 0.010372161865234375, "learning_rate": 6.649505910711058e-07, "loss": 0.0004, "reward": 0.6645625443197787, "reward_std": 0.9196898862719536, "rewards/cosine_scaled_reward": 0.1358748753555119, "rewards/format_reward": 0.9166666679084301, "step": 238 }, { "completion_length": 1431.6041870117188, "epoch": 0.27314285714285713, "grad_norm": 0.25624558329582214, "kl": 0.00914764404296875, "learning_rate": 6.619104492241847e-07, "loss": 0.0004, "reward": 0.584113098680973, "reward_std": 0.6673776432871819, "rewards/cosine_scaled_reward": 0.31309779919683933, "rewards/format_reward": 0.7708333395421505, "step": 239 }, { "completion_length": 1502.979232788086, "epoch": 0.2742857142857143, "grad_norm": 0.35446247458457947, "kl": 0.02126312255859375, "learning_rate": 6.588648530198504e-07, "loss": 0.0009, "reward": 0.07019667001441121, "reward_std": 0.6685795933008194, "rewards/cosine_scaled_reward": -0.22959061339497566, "rewards/format_reward": 0.916666679084301, "step": 240 }, { "completion_length": 1754.9375534057617, "epoch": 0.2754285714285714, "grad_norm": 0.29134202003479004, "kl": 0.0203399658203125, "learning_rate": 6.558139508961654e-07, "loss": 0.0008, "reward": -0.14658209728077054, "reward_std": 0.5397600717842579, "rewards/cosine_scaled_reward": -0.26472287997603416, "rewards/format_reward": 0.7500000093132257, "step": 241 }, { "completion_length": 1179.6041984558105, "epoch": 0.2765714285714286, "grad_norm": 0.3573894500732422, "kl": 0.019369125366210938, "learning_rate": 6.527578915497951e-07, "loss": 0.0008, "reward": 0.17393979895859957, "reward_std": 0.6096976324915886, "rewards/cosine_scaled_reward": -0.09758513886481524, "rewards/format_reward": 0.8958333395421505, "step": 242 }, { "completion_length": 1776.333351135254, "epoch": 0.2777142857142857, "grad_norm": 0.26269933581352234, "kl": 0.016147613525390625, "learning_rate": 6.496968239287603e-07, "loss": 0.0006, "reward": 0.18704321165569127, "reward_std": 0.727390356361866, "rewards/cosine_scaled_reward": -0.07570383511483669, "rewards/format_reward": 0.7500000055879354, "step": 243 }, { "completion_length": 1609.1041870117188, "epoch": 0.27885714285714286, "grad_norm": 0.42587101459503174, "kl": 0.016376495361328125, "learning_rate": 6.466308972251785e-07, "loss": 0.0007, "reward": 0.5847336421720684, "reward_std": 0.9021250456571579, "rewards/cosine_scaled_reward": 0.07377137243747711, "rewards/format_reward": 0.8750000149011612, "step": 244 }, { "completion_length": 2179.520896911621, "epoch": 0.28, "grad_norm": 0.5839371085166931, "kl": 0.019596099853515625, "learning_rate": 6.435602608679916e-07, "loss": 0.0008, "reward": 0.45881569012999535, "reward_std": 1.0688990727066994, "rewards/cosine_scaled_reward": 0.05305776512250304, "rewards/format_reward": 0.6666666828095913, "step": 245 }, { "completion_length": 1406.6667175292969, "epoch": 0.28114285714285714, "grad_norm": 0.28245845437049866, "kl": 0.012783050537109375, "learning_rate": 6.404850645156841e-07, "loss": 0.0005, "reward": 0.353425451554358, "reward_std": 0.7201703079044819, "rewards/cosine_scaled_reward": -0.04003934998763725, "rewards/format_reward": 0.8958333395421505, "step": 246 }, { "completion_length": 2307.7500610351562, "epoch": 0.2822857142857143, "grad_norm": 0.48197513818740845, "kl": 0.0279693603515625, "learning_rate": 6.374054580489873e-07, "loss": 0.0011, "reward": -0.030684850877150893, "reward_std": 0.7912298962473869, "rewards/cosine_scaled_reward": -0.17165578715503216, "rewards/format_reward": 0.5833333488553762, "step": 247 }, { "completion_length": 1537.020866394043, "epoch": 0.2834285714285714, "grad_norm": 0.5387913584709167, "kl": 0.020456314086914062, "learning_rate": 6.343215915635761e-07, "loss": 0.0008, "reward": 0.5385480709373951, "reward_std": 0.7778996899724007, "rewards/cosine_scaled_reward": 0.18887367472052574, "rewards/format_reward": 0.7916666679084301, "step": 248 }, { "completion_length": 1561.6042098999023, "epoch": 0.2845714285714286, "grad_norm": 0.3947852551937103, "kl": 0.021512985229492188, "learning_rate": 6.31233615362752e-07, "loss": 0.0009, "reward": 0.4948454611003399, "reward_std": 0.8307998143136501, "rewards/cosine_scaled_reward": 0.12714037066325545, "rewards/format_reward": 0.7291666734963655, "step": 249 }, { "completion_length": 1415.1875457763672, "epoch": 0.2857142857142857, "grad_norm": 0.48303961753845215, "kl": 0.01863861083984375, "learning_rate": 6.281416799501187e-07, "loss": 0.0007, "reward": 0.28884256578749046, "reward_std": 0.7405095249414444, "rewards/cosine_scaled_reward": -0.05281085259048268, "rewards/format_reward": 0.8750000074505806, "step": 250 }, { "completion_length": 1362.2292022705078, "epoch": 0.28685714285714287, "grad_norm": 0.4910474717617035, "kl": 0.02812957763671875, "learning_rate": 6.25045936022246e-07, "loss": 0.0011, "reward": 0.24397201603278518, "reward_std": 0.8808831572532654, "rewards/cosine_scaled_reward": -0.09501386666670442, "rewards/format_reward": 0.7916666734963655, "step": 251 }, { "completion_length": 1879.4791946411133, "epoch": 0.288, "grad_norm": 0.5786982774734497, "kl": 0.03542327880859375, "learning_rate": 6.219465344613258e-07, "loss": 0.0014, "reward": 0.003026331774890423, "reward_std": 0.5837202109396458, "rewards/cosine_scaled_reward": -0.1362874787300825, "rewards/format_reward": 0.729166679084301, "step": 252 }, { "completion_length": 1624.1250381469727, "epoch": 0.28914285714285715, "grad_norm": 0.6477235555648804, "kl": 0.037837982177734375, "learning_rate": 6.188436263278172e-07, "loss": 0.0015, "reward": 0.24255571886897087, "reward_std": 0.933814812451601, "rewards/cosine_scaled_reward": -0.09086994710378349, "rewards/format_reward": 0.7291666697710752, "step": 253 }, { "completion_length": 1799.5209007263184, "epoch": 0.29028571428571426, "grad_norm": 0.4789735972881317, "kl": 0.0286407470703125, "learning_rate": 6.157373628530852e-07, "loss": 0.0011, "reward": 0.2414424503222108, "reward_std": 0.7898707538843155, "rewards/cosine_scaled_reward": -0.08134954981505871, "rewards/format_reward": 0.7916666865348816, "step": 254 }, { "completion_length": 2204.27091217041, "epoch": 0.2914285714285714, "grad_norm": 0.45766016840934753, "kl": 0.03482818603515625, "learning_rate": 6.126278954320294e-07, "loss": 0.0014, "reward": 0.07502584741450846, "reward_std": 0.9448065534234047, "rewards/cosine_scaled_reward": -0.15562699240399525, "rewards/format_reward": 0.6666666753590107, "step": 255 }, { "completion_length": 1663.729232788086, "epoch": 0.2925714285714286, "grad_norm": 0.5182092785835266, "kl": 0.0309906005859375, "learning_rate": 6.095153756157051e-07, "loss": 0.0012, "reward": 0.2998895291239023, "reward_std": 0.7498802877962589, "rewards/cosine_scaled_reward": -0.001917465589940548, "rewards/format_reward": 0.7708333395421505, "step": 256 }, { "completion_length": 2637.0625762939453, "epoch": 0.2937142857142857, "grad_norm": 0.5543068647384644, "kl": 0.0585174560546875, "learning_rate": 6.06399955103937e-07, "loss": 0.0023, "reward": 0.48659578152000904, "reward_std": 1.1908883340656757, "rewards/cosine_scaled_reward": 0.1260754211107269, "rewards/format_reward": 0.604166679084301, "step": 257 }, { "completion_length": 2023.2708892822266, "epoch": 0.2948571428571429, "grad_norm": 0.5329146385192871, "kl": 0.03546142578125, "learning_rate": 6.032817857379256e-07, "loss": 0.0014, "reward": 0.459288542624563, "reward_std": 0.8846051767468452, "rewards/cosine_scaled_reward": 0.051098582334816456, "rewards/format_reward": 0.7708333544433117, "step": 258 }, { "completion_length": 1453.2708625793457, "epoch": 0.296, "grad_norm": 0.43237462639808655, "kl": 0.02968597412109375, "learning_rate": 6.001610194928464e-07, "loss": 0.0012, "reward": 0.3637466989457607, "reward_std": 0.8164427168667316, "rewards/cosine_scaled_reward": 0.06898940447717905, "rewards/format_reward": 0.7708333414047956, "step": 259 }, { "completion_length": 944.4791851043701, "epoch": 0.29714285714285715, "grad_norm": 0.6447161436080933, "kl": 0.018672943115234375, "learning_rate": 5.97037808470444e-07, "loss": 0.0007, "reward": 0.5366823731455952, "reward_std": 0.7269707396626472, "rewards/cosine_scaled_reward": 0.11251152493059635, "rewards/format_reward": 0.916666679084301, "step": 260 }, { "completion_length": 1973.9583740234375, "epoch": 0.29828571428571427, "grad_norm": 0.37675243616104126, "kl": 0.03509521484375, "learning_rate": 5.939123048916173e-07, "loss": 0.0014, "reward": 0.13344457978382707, "reward_std": 0.8264882601797581, "rewards/cosine_scaled_reward": -0.09806014783680439, "rewards/format_reward": 0.6666666734963655, "step": 261 }, { "completion_length": 1516.6875228881836, "epoch": 0.29942857142857143, "grad_norm": 0.5359493494033813, "kl": 0.03139495849609375, "learning_rate": 5.907846610890011e-07, "loss": 0.0013, "reward": -0.0372560010291636, "reward_std": 0.6091209948062897, "rewards/cosine_scaled_reward": -0.212333626113832, "rewards/format_reward": 0.729166679084301, "step": 262 }, { "completion_length": 1346.3125228881836, "epoch": 0.30057142857142854, "grad_norm": 0.3749285638332367, "kl": 0.0174713134765625, "learning_rate": 5.87655029499542e-07, "loss": 0.0007, "reward": 0.17701542098075151, "reward_std": 0.7833218686282635, "rewards/cosine_scaled_reward": -0.15012040082365274, "rewards/format_reward": 0.8541666772216558, "step": 263 }, { "completion_length": 1433.2708740234375, "epoch": 0.3017142857142857, "grad_norm": 0.4387812614440918, "kl": 0.03086090087890625, "learning_rate": 5.845235626570683e-07, "loss": 0.0012, "reward": 0.2682348359376192, "reward_std": 0.736188217997551, "rewards/cosine_scaled_reward": -0.10726138763129711, "rewards/format_reward": 0.8958333432674408, "step": 264 }, { "completion_length": 1286.4583587646484, "epoch": 0.3028571428571429, "grad_norm": 0.5124613642692566, "kl": 0.029205322265625, "learning_rate": 5.813904131848564e-07, "loss": 0.0012, "reward": 0.47514245874481276, "reward_std": 0.7719651460647583, "rewards/cosine_scaled_reward": 0.03089581150561571, "rewards/format_reward": 0.8958333507180214, "step": 265 }, { "completion_length": 1764.3958892822266, "epoch": 0.304, "grad_norm": 0.5820686221122742, "kl": 0.04436492919921875, "learning_rate": 5.78255733788191e-07, "loss": 0.0018, "reward": 0.060105842188932, "reward_std": 0.6623933054506779, "rewards/cosine_scaled_reward": -0.15320268645882607, "rewards/format_reward": 0.7500000074505806, "step": 266 }, { "completion_length": 1691.2917022705078, "epoch": 0.30514285714285716, "grad_norm": 0.3804147243499756, "kl": 0.05069732666015625, "learning_rate": 5.751196772469237e-07, "loss": 0.002, "reward": 0.010469182627275586, "reward_std": 0.6885622590780258, "rewards/cosine_scaled_reward": -0.203377990052104, "rewards/format_reward": 0.7500000149011612, "step": 267 }, { "completion_length": 1172.3125305175781, "epoch": 0.3062857142857143, "grad_norm": 0.5410240888595581, "kl": 0.036266326904296875, "learning_rate": 5.71982396408026e-07, "loss": 0.0015, "reward": 0.10452710837125778, "reward_std": 0.667732447385788, "rewards/cosine_scaled_reward": -0.19316846132278442, "rewards/format_reward": 0.895833333954215, "step": 268 }, { "completion_length": 1373.541706085205, "epoch": 0.30742857142857144, "grad_norm": 0.5952326655387878, "kl": 0.033191680908203125, "learning_rate": 5.688440441781398e-07, "loss": 0.0013, "reward": 0.33500073617324233, "reward_std": 0.7064723633229733, "rewards/cosine_scaled_reward": -0.04047585092484951, "rewards/format_reward": 0.8958333432674408, "step": 269 }, { "completion_length": 1461.0000228881836, "epoch": 0.30857142857142855, "grad_norm": 0.3846686780452728, "kl": 0.047119140625, "learning_rate": 5.657047735161255e-07, "loss": 0.0019, "reward": 0.48025781381875277, "reward_std": 0.8026834316551685, "rewards/cosine_scaled_reward": 0.06730105105089024, "rewards/format_reward": 0.8750000055879354, "step": 270 }, { "completion_length": 1104.7500190734863, "epoch": 0.3097142857142857, "grad_norm": 0.3378467559814453, "kl": 0.02167510986328125, "learning_rate": 5.625647374256061e-07, "loss": 0.0009, "reward": 0.512184641789645, "reward_std": 0.6890225373208523, "rewards/cosine_scaled_reward": 0.11897583678364754, "rewards/format_reward": 0.9583333432674408, "step": 271 }, { "completion_length": 1595.5625610351562, "epoch": 0.31085714285714283, "grad_norm": 0.9565722346305847, "kl": 0.0341033935546875, "learning_rate": 5.594240889475106e-07, "loss": 0.0014, "reward": 0.2559305219911039, "reward_std": 0.7282034941017628, "rewards/cosine_scaled_reward": -0.0902782422490418, "rewards/format_reward": 0.8750000055879354, "step": 272 }, { "completion_length": 1564.2500534057617, "epoch": 0.312, "grad_norm": 0.6573230624198914, "kl": 0.06351470947265625, "learning_rate": 5.562829811526154e-07, "loss": 0.0025, "reward": 0.33316121553070843, "reward_std": 0.7983997203409672, "rewards/cosine_scaled_reward": 0.00879891961812973, "rewards/format_reward": 0.7916666753590107, "step": 273 }, { "completion_length": 908.3750190734863, "epoch": 0.31314285714285717, "grad_norm": 0.3975994288921356, "kl": 0.020572662353515625, "learning_rate": 5.531415671340826e-07, "loss": 0.0008, "reward": 0.490811045630835, "reward_std": 0.7964842580258846, "rewards/cosine_scaled_reward": 0.027651330456137657, "rewards/format_reward": 0.9791666716337204, "step": 274 }, { "completion_length": 1273.416706085205, "epoch": 0.3142857142857143, "grad_norm": 0.5143330693244934, "kl": 0.0457763671875, "learning_rate": 5.5e-07, "loss": 0.0018, "reward": 0.5196955967694521, "reward_std": 0.8949318751692772, "rewards/cosine_scaled_reward": 0.0634711142629385, "rewards/format_reward": 0.9166666865348816, "step": 275 }, { "completion_length": 1216.2917175292969, "epoch": 0.31542857142857145, "grad_norm": 0.5758770704269409, "kl": 0.04427337646484375, "learning_rate": 5.468584328659172e-07, "loss": 0.0018, "reward": 0.45866466453298926, "reward_std": 0.7997590340673923, "rewards/cosine_scaled_reward": 0.004157306393608451, "rewards/format_reward": 0.9166666716337204, "step": 276 }, { "completion_length": 1405.8542022705078, "epoch": 0.31657142857142856, "grad_norm": 0.7709969878196716, "kl": 0.07879257202148438, "learning_rate": 5.437170188473847e-07, "loss": 0.0032, "reward": 0.3928624112159014, "reward_std": 0.8682146407663822, "rewards/cosine_scaled_reward": -0.0433911276049912, "rewards/format_reward": 0.8958333358168602, "step": 277 }, { "completion_length": 1251.5834045410156, "epoch": 0.3177142857142857, "grad_norm": 0.4815487265586853, "kl": 0.028911590576171875, "learning_rate": 5.405759110524894e-07, "loss": 0.0012, "reward": 0.602864139713347, "reward_std": 0.75666194409132, "rewards/cosine_scaled_reward": 0.1303448430262506, "rewards/format_reward": 0.9583333432674408, "step": 278 }, { "completion_length": 1584.2292098999023, "epoch": 0.31885714285714284, "grad_norm": 0.676331102848053, "kl": 0.04915618896484375, "learning_rate": 5.37435262574394e-07, "loss": 0.002, "reward": 0.1920575883705169, "reward_std": 0.5915602222084999, "rewards/cosine_scaled_reward": -0.08263814821839333, "rewards/format_reward": 0.8541666716337204, "step": 279 }, { "completion_length": 1887.104232788086, "epoch": 0.32, "grad_norm": 0.5340350270271301, "kl": 0.09076309204101562, "learning_rate": 5.342952264838747e-07, "loss": 0.0036, "reward": 0.554422979708761, "reward_std": 1.0056494362652302, "rewards/cosine_scaled_reward": 0.14068329893052578, "rewards/format_reward": 0.7291666753590107, "step": 280 }, { "completion_length": 2119.9583892822266, "epoch": 0.3211428571428571, "grad_norm": 0.901451587677002, "kl": 0.10712432861328125, "learning_rate": 5.311559558218603e-07, "loss": 0.0043, "reward": -0.04842189947521547, "reward_std": 0.735725924372673, "rewards/cosine_scaled_reward": -0.18730824999511242, "rewards/format_reward": 0.6041666772216558, "step": 281 }, { "completion_length": 1546.7708625793457, "epoch": 0.3222857142857143, "grad_norm": 0.5774481892585754, "kl": 0.08298110961914062, "learning_rate": 5.28017603591974e-07, "loss": 0.0033, "reward": 0.4459398053586483, "reward_std": 0.8270345889031887, "rewards/cosine_scaled_reward": 0.038561356253921986, "rewards/format_reward": 0.875, "step": 282 }, { "completion_length": 1432.2292175292969, "epoch": 0.32342857142857145, "grad_norm": 0.6603187322616577, "kl": 0.06803131103515625, "learning_rate": 5.248803227530763e-07, "loss": 0.0027, "reward": 0.3603776376694441, "reward_std": 0.8584562204778194, "rewards/cosine_scaled_reward": -0.058407315984368324, "rewards/format_reward": 0.8541666828095913, "step": 283 }, { "completion_length": 1322.5625228881836, "epoch": 0.32457142857142857, "grad_norm": 0.5359531044960022, "kl": 0.06329727172851562, "learning_rate": 5.21744266211809e-07, "loss": 0.0025, "reward": 0.09239194821566343, "reward_std": 0.6924001406878233, "rewards/cosine_scaled_reward": -0.19309467636048794, "rewards/format_reward": 0.8541666716337204, "step": 284 }, { "completion_length": 829.2917022705078, "epoch": 0.32571428571428573, "grad_norm": 0.4555800259113312, "kl": 0.037982940673828125, "learning_rate": 5.186095868151436e-07, "loss": 0.0015, "reward": 0.4122108933515847, "reward_std": 0.7844538278877735, "rewards/cosine_scaled_reward": 0.023661921732127666, "rewards/format_reward": 0.9375000074505806, "step": 285 }, { "completion_length": 1359.5417098999023, "epoch": 0.32685714285714285, "grad_norm": 0.5877644419670105, "kl": 0.05910491943359375, "learning_rate": 5.154764373429315e-07, "loss": 0.0024, "reward": 0.2864837823435664, "reward_std": 0.8134515210986137, "rewards/cosine_scaled_reward": -0.10266671486897394, "rewards/format_reward": 0.8750000223517418, "step": 286 }, { "completion_length": 1420.1250343322754, "epoch": 0.328, "grad_norm": 1.0718910694122314, "kl": 0.14565277099609375, "learning_rate": 5.123449705004581e-07, "loss": 0.0058, "reward": 0.23910232353955507, "reward_std": 0.6846515089273453, "rewards/cosine_scaled_reward": -0.03220707958098501, "rewards/format_reward": 0.812500013038516, "step": 287 }, { "completion_length": 1311.1666793823242, "epoch": 0.3291428571428571, "grad_norm": 0.7865566611289978, "kl": 0.07602310180664062, "learning_rate": 5.09215338910999e-07, "loss": 0.003, "reward": 0.4210634557530284, "reward_std": 0.8144961148500443, "rewards/cosine_scaled_reward": -0.004126264713704586, "rewards/format_reward": 0.9583333432674408, "step": 288 }, { "completion_length": 1409.8125495910645, "epoch": 0.3302857142857143, "grad_norm": 1.233174204826355, "kl": 0.0773468017578125, "learning_rate": 5.060876951083828e-07, "loss": 0.0031, "reward": 0.2632339745759964, "reward_std": 0.6024043373763561, "rewards/cosine_scaled_reward": 0.0020552128553390503, "rewards/format_reward": 0.8541666679084301, "step": 289 }, { "completion_length": 1338.3125381469727, "epoch": 0.3314285714285714, "grad_norm": 0.5929808616638184, "kl": 0.119293212890625, "learning_rate": 5.02962191529556e-07, "loss": 0.0048, "reward": 0.48768392187776044, "reward_std": 0.8819490298628807, "rewards/cosine_scaled_reward": -0.019131449982523918, "rewards/format_reward": 0.9583333432674408, "step": 290 }, { "completion_length": 1393.5208892822266, "epoch": 0.3325714285714286, "grad_norm": 0.6517693996429443, "kl": 0.109130859375, "learning_rate": 4.998389805071536e-07, "loss": 0.0044, "reward": 0.44597443053498864, "reward_std": 0.852834016084671, "rewards/cosine_scaled_reward": -0.036098250187933445, "rewards/format_reward": 0.9375000074505806, "step": 291 }, { "completion_length": 1563.4375228881836, "epoch": 0.33371428571428574, "grad_norm": 0.813820481300354, "kl": 0.12435150146484375, "learning_rate": 4.967182142620745e-07, "loss": 0.005, "reward": 0.1114344063680619, "reward_std": 0.6936771422624588, "rewards/cosine_scaled_reward": -0.18143241526558995, "rewards/format_reward": 0.8541666716337204, "step": 292 }, { "completion_length": 1062.9166870117188, "epoch": 0.33485714285714285, "grad_norm": 1.081771969795227, "kl": 0.08338165283203125, "learning_rate": 4.93600044896063e-07, "loss": 0.0033, "reward": 0.35100130061618984, "reward_std": 0.6911906227469444, "rewards/cosine_scaled_reward": -0.036540206521749496, "rewards/format_reward": 0.916666679084301, "step": 293 }, { "completion_length": 1598.583396911621, "epoch": 0.336, "grad_norm": 0.9857237935066223, "kl": 0.11139678955078125, "learning_rate": 4.904846243842949e-07, "loss": 0.0045, "reward": 0.2396585661917925, "reward_std": 0.7594392895698547, "rewards/cosine_scaled_reward": -0.07188234385102987, "rewards/format_reward": 0.8125000149011612, "step": 294 }, { "completion_length": 1559.4167251586914, "epoch": 0.33714285714285713, "grad_norm": 0.7534098625183105, "kl": 0.13663482666015625, "learning_rate": 4.873721045679706e-07, "loss": 0.0055, "reward": 0.35895493626594543, "reward_std": 0.9453681632876396, "rewards/cosine_scaled_reward": -0.029410481452941895, "rewards/format_reward": 0.8333333507180214, "step": 295 }, { "completion_length": 1997.6042175292969, "epoch": 0.3382857142857143, "grad_norm": 1.1301641464233398, "kl": 0.2256317138671875, "learning_rate": 4.842626371469149e-07, "loss": 0.009, "reward": 0.21912480238825083, "reward_std": 0.8136032223701477, "rewards/cosine_scaled_reward": -0.07540793996304274, "rewards/format_reward": 0.7500000074505806, "step": 296 }, { "completion_length": 2203.666717529297, "epoch": 0.3394285714285714, "grad_norm": 2.0461599826812744, "kl": 0.2470703125, "learning_rate": 4.811563736721829e-07, "loss": 0.0099, "reward": 0.0711959432810545, "reward_std": 0.7955613285303116, "rewards/cosine_scaled_reward": -0.13813064247369766, "rewards/format_reward": 0.625000013038516, "step": 297 }, { "completion_length": 1511.0417098999023, "epoch": 0.3405714285714286, "grad_norm": 2.8553810119628906, "kl": 0.27417755126953125, "learning_rate": 4.780534655386743e-07, "loss": 0.011, "reward": 0.22325835039373487, "reward_std": 0.7070137523114681, "rewards/cosine_scaled_reward": -0.10693333297967911, "rewards/format_reward": 0.8750000223517418, "step": 298 }, { "completion_length": 1423.8750381469727, "epoch": 0.3417142857142857, "grad_norm": 0.9705626368522644, "kl": 0.1470947265625, "learning_rate": 4.749540639777539e-07, "loss": 0.0059, "reward": 0.20597740169614553, "reward_std": 0.7050324305891991, "rewards/cosine_scaled_reward": -0.06768750678747892, "rewards/format_reward": 0.7916666846722364, "step": 299 }, { "completion_length": 1679.583381652832, "epoch": 0.34285714285714286, "grad_norm": 1.6013585329055786, "kl": 0.2515869140625, "learning_rate": 4.7185832004988133e-07, "loss": 0.0101, "reward": 0.16855086106806993, "reward_std": 0.7662245742976665, "rewards/cosine_scaled_reward": -0.11777076427824795, "rewards/format_reward": 0.7291666902601719, "step": 300 }, { "completion_length": 1467.8333740234375, "epoch": 0.344, "grad_norm": 1.4137245416641235, "kl": 0.14666748046875, "learning_rate": 4.68766384637248e-07, "loss": 0.0059, "reward": 0.34917816892266273, "reward_std": 0.826218631118536, "rewards/cosine_scaled_reward": -0.0683151277480647, "rewards/format_reward": 0.916666679084301, "step": 301 }, { "completion_length": 1356.4167175292969, "epoch": 0.34514285714285714, "grad_norm": 1.1879962682724, "kl": 0.149932861328125, "learning_rate": 4.656784084364238e-07, "loss": 0.006, "reward": 0.36293663922697306, "reward_std": 0.8348271325230598, "rewards/cosine_scaled_reward": 0.008066533133387566, "rewards/format_reward": 0.8125000111758709, "step": 302 }, { "completion_length": 1341.3541870117188, "epoch": 0.3462857142857143, "grad_norm": 1.171494483947754, "kl": 0.18246078491210938, "learning_rate": 4.6259454195101267e-07, "loss": 0.0073, "reward": 0.2431361076887697, "reward_std": 0.7933157682418823, "rewards/cosine_scaled_reward": -0.1022418315988034, "rewards/format_reward": 0.854166679084301, "step": 303 }, { "completion_length": 1238.6667175292969, "epoch": 0.3474285714285714, "grad_norm": 0.9185214042663574, "kl": 0.08075714111328125, "learning_rate": 4.59514935484316e-07, "loss": 0.0032, "reward": 0.26590197812765837, "reward_std": 0.6553149335086346, "rewards/cosine_scaled_reward": -0.07919650059193373, "rewards/format_reward": 0.8958333358168602, "step": 304 }, { "completion_length": 1323.9791946411133, "epoch": 0.3485714285714286, "grad_norm": 1.0339250564575195, "kl": 0.17450332641601562, "learning_rate": 4.5643973913200837e-07, "loss": 0.007, "reward": 0.13226244208635762, "reward_std": 0.6713650859892368, "rewards/cosine_scaled_reward": -0.16428566398099065, "rewards/format_reward": 0.8541666865348816, "step": 305 }, { "completion_length": 1336.8959045410156, "epoch": 0.3497142857142857, "grad_norm": 1.3460803031921387, "kl": 0.196563720703125, "learning_rate": 4.5336910277482155e-07, "loss": 0.0079, "reward": 0.6428653690963984, "reward_std": 1.0474185049533844, "rewards/cosine_scaled_reward": 0.14976803492754698, "rewards/format_reward": 0.8541666716337204, "step": 306 }, { "completion_length": 1476.3542098999023, "epoch": 0.35085714285714287, "grad_norm": 2.0747716426849365, "kl": 0.23199462890625, "learning_rate": 4.503031760712397e-07, "loss": 0.0093, "reward": 0.3243725663051009, "reward_std": 0.7664116658270359, "rewards/cosine_scaled_reward": -0.016992317512631416, "rewards/format_reward": 0.8541666716337204, "step": 307 }, { "completion_length": 2029.166732788086, "epoch": 0.352, "grad_norm": 1.3981530666351318, "kl": 0.3675079345703125, "learning_rate": 4.4724210845020494e-07, "loss": 0.0147, "reward": 0.14206439611734822, "reward_std": 0.7617513313889503, "rewards/cosine_scaled_reward": -0.12085138214752078, "rewards/format_reward": 0.7500000111758709, "step": 308 }, { "completion_length": 1717.7292175292969, "epoch": 0.35314285714285715, "grad_norm": 1.3776211738586426, "kl": 0.27923583984375, "learning_rate": 4.441860491038345e-07, "loss": 0.0112, "reward": 0.26929385494440794, "reward_std": 0.8059471026062965, "rewards/cosine_scaled_reward": -0.06185213173739612, "rewards/format_reward": 0.7500000223517418, "step": 309 }, { "completion_length": 1247.7917098999023, "epoch": 0.35428571428571426, "grad_norm": 1.195373773574829, "kl": 0.13482666015625, "learning_rate": 4.4113514698014953e-07, "loss": 0.0054, "reward": 0.20025933103170246, "reward_std": 0.6624983102083206, "rewards/cosine_scaled_reward": -0.13693573721684515, "rewards/format_reward": 0.8958333432674408, "step": 310 }, { "completion_length": 1230.8333702087402, "epoch": 0.3554285714285714, "grad_norm": 1.7214908599853516, "kl": 0.16667938232421875, "learning_rate": 4.3808955077581546e-07, "loss": 0.0067, "reward": 0.48247593361884356, "reward_std": 0.8122721910476685, "rewards/cosine_scaled_reward": 0.02081843838095665, "rewards/format_reward": 0.8958333432674408, "step": 311 }, { "completion_length": 1083.6875228881836, "epoch": 0.3565714285714286, "grad_norm": 1.3621957302093506, "kl": 0.15818023681640625, "learning_rate": 4.350494089288943e-07, "loss": 0.0063, "reward": 0.5721160881221294, "reward_std": 0.6696533262729645, "rewards/cosine_scaled_reward": 0.23700525425374508, "rewards/format_reward": 0.8750000111758709, "step": 312 }, { "completion_length": 1498.3750228881836, "epoch": 0.3577142857142857, "grad_norm": 1.5137181282043457, "kl": 0.31603240966796875, "learning_rate": 4.3201486961161093e-07, "loss": 0.0126, "reward": 0.4493098706007004, "reward_std": 0.8899626843631268, "rewards/cosine_scaled_reward": 0.060429781675338745, "rewards/format_reward": 0.7916666734963655, "step": 313 }, { "completion_length": 1152.7083702087402, "epoch": 0.3588571428571429, "grad_norm": 1.3737326860427856, "kl": 0.178466796875, "learning_rate": 4.2898608072313045e-07, "loss": 0.0071, "reward": 0.42835053242743015, "reward_std": 0.7351183108985424, "rewards/cosine_scaled_reward": 0.03823569389896875, "rewards/format_reward": 0.8750000223517418, "step": 314 }, { "completion_length": 1325.7709045410156, "epoch": 0.36, "grad_norm": 4.047604560852051, "kl": 0.39301300048828125, "learning_rate": 4.2596318988235037e-07, "loss": 0.0157, "reward": 0.33133709616959095, "reward_std": 0.82825917750597, "rewards/cosine_scaled_reward": 0.003983840346336365, "rewards/format_reward": 0.7708333507180214, "step": 315 }, { "completion_length": 1725.6042137145996, "epoch": 0.36114285714285715, "grad_norm": 2.519155979156494, "kl": 0.63311767578125, "learning_rate": 4.2294634442070553e-07, "loss": 0.0253, "reward": 0.09543975442647934, "reward_std": 0.8387586250901222, "rewards/cosine_scaled_reward": -0.15274116187356412, "rewards/format_reward": 0.7083333507180214, "step": 316 }, { "completion_length": 1519.8333740234375, "epoch": 0.36228571428571427, "grad_norm": 2.768705368041992, "kl": 0.483062744140625, "learning_rate": 4.1993569137498776e-07, "loss": 0.0193, "reward": 0.35648738220334053, "reward_std": 0.8188385404646397, "rewards/cosine_scaled_reward": -0.0025814222171902657, "rewards/format_reward": 0.7708333469927311, "step": 317 }, { "completion_length": 1004.2500343322754, "epoch": 0.36342857142857143, "grad_norm": 0.982016921043396, "kl": 0.1404876708984375, "learning_rate": 4.1693137748017915e-07, "loss": 0.0056, "reward": 0.17383845895528793, "reward_std": 0.699553694576025, "rewards/cosine_scaled_reward": -0.15935352514497936, "rewards/format_reward": 0.916666679084301, "step": 318 }, { "completion_length": 1440.8750305175781, "epoch": 0.36457142857142855, "grad_norm": 2.472060203552246, "kl": 0.6479034423828125, "learning_rate": 4.1393354916230005e-07, "loss": 0.0259, "reward": 0.10745102865621448, "reward_std": 0.8851590566337109, "rewards/cosine_scaled_reward": -0.18445847602561116, "rewards/format_reward": 0.7500000186264515, "step": 319 }, { "completion_length": 872.4791946411133, "epoch": 0.3657142857142857, "grad_norm": 1.969278335571289, "kl": 0.3460540771484375, "learning_rate": 4.1094235253127374e-07, "loss": 0.0139, "reward": 0.4431753905955702, "reward_std": 0.7728302180767059, "rewards/cosine_scaled_reward": 0.03887226711958647, "rewards/format_reward": 0.8958333507180214, "step": 320 }, { "completion_length": 1001.3541946411133, "epoch": 0.3668571428571429, "grad_norm": 1.5442945957183838, "kl": 0.4463653564453125, "learning_rate": 4.079579333738039e-07, "loss": 0.0179, "reward": 0.5132068395614624, "reward_std": 0.9250943809747696, "rewards/cosine_scaled_reward": 0.06521461345255375, "rewards/format_reward": 0.8750000074505806, "step": 321 }, { "completion_length": 1230.1666946411133, "epoch": 0.368, "grad_norm": 4.6210222244262695, "kl": 0.856231689453125, "learning_rate": 4.0498043714627006e-07, "loss": 0.0342, "reward": 0.21194136049598455, "reward_std": 0.8229392170906067, "rewards/cosine_scaled_reward": -0.1281059831380844, "rewards/format_reward": 0.8125000298023224, "step": 322 }, { "completion_length": 1046.5833740234375, "epoch": 0.36914285714285716, "grad_norm": 2.3179831504821777, "kl": 0.600982666015625, "learning_rate": 4.020100089676376e-07, "loss": 0.0241, "reward": 0.17017683573067188, "reward_std": 0.6333070918917656, "rewards/cosine_scaled_reward": -0.09792062174528837, "rewards/format_reward": 0.8333333507180214, "step": 323 }, { "completion_length": 1352.1666946411133, "epoch": 0.3702857142857143, "grad_norm": 4.481067657470703, "kl": 1.1396102905273438, "learning_rate": 3.9904679361238526e-07, "loss": 0.0455, "reward": 0.1608973522670567, "reward_std": 0.7919485196471214, "rewards/cosine_scaled_reward": -0.13527273340150714, "rewards/format_reward": 0.770833358168602, "step": 324 }, { "completion_length": 1022.145866394043, "epoch": 0.37142857142857144, "grad_norm": 3.168888807296753, "kl": 0.7562255859375, "learning_rate": 3.9609093550344907e-07, "loss": 0.0303, "reward": 0.255460548796691, "reward_std": 0.714014045894146, "rewards/cosine_scaled_reward": -0.0987097217439441, "rewards/format_reward": 0.8750000149011612, "step": 325 }, { "completion_length": 1067.8541946411133, "epoch": 0.37257142857142855, "grad_norm": 6.944228172302246, "kl": 0.607666015625, "learning_rate": 3.931425787051832e-07, "loss": 0.0243, "reward": 0.34114629309624434, "reward_std": 0.7636858969926834, "rewards/cosine_scaled_reward": -0.050329115241765976, "rewards/format_reward": 0.8958333507180214, "step": 326 }, { "completion_length": 1227.4375610351562, "epoch": 0.3737142857142857, "grad_norm": 1.7565664052963257, "kl": 0.2918243408203125, "learning_rate": 3.902018669163384e-07, "loss": 0.0117, "reward": 0.4599468493834138, "reward_std": 0.6929560974240303, "rewards/cosine_scaled_reward": 0.0536233875900507, "rewards/format_reward": 0.8958333507180214, "step": 327 }, { "completion_length": 1539.4792556762695, "epoch": 0.37485714285714283, "grad_norm": 7.016115188598633, "kl": 1.123626708984375, "learning_rate": 3.872689434630585e-07, "loss": 0.045, "reward": 0.051082022255286574, "reward_std": 0.7887684628367424, "rewards/cosine_scaled_reward": -0.199990039691329, "rewards/format_reward": 0.7500000186264515, "step": 328 }, { "completion_length": 960.0208778381348, "epoch": 0.376, "grad_norm": 2.32981014251709, "kl": 0.53167724609375, "learning_rate": 3.843439512918949e-07, "loss": 0.0212, "reward": 0.3326158430427313, "reward_std": 0.6615541912615299, "rewards/cosine_scaled_reward": 0.0014219898730516434, "rewards/format_reward": 0.8541666716337204, "step": 329 }, { "completion_length": 1015.9583740234375, "epoch": 0.37714285714285717, "grad_norm": 2.246696949005127, "kl": 0.37122344970703125, "learning_rate": 3.8142703296283953e-07, "loss": 0.0149, "reward": 0.3060060928110033, "reward_std": 0.7870642617344856, "rewards/cosine_scaled_reward": -0.09269801783375442, "rewards/format_reward": 0.9375000074505806, "step": 330 }, { "completion_length": 1435.8333587646484, "epoch": 0.3782857142857143, "grad_norm": 2.061875104904175, "kl": 1.0668792724609375, "learning_rate": 3.785183306423767e-07, "loss": 0.0428, "reward": 0.12969208881258965, "reward_std": 0.6653396300971508, "rewards/cosine_scaled_reward": -0.09254590002819896, "rewards/format_reward": 0.7708333432674408, "step": 331 }, { "completion_length": 1591.333366394043, "epoch": 0.37942857142857145, "grad_norm": 3.716142177581787, "kl": 0.729095458984375, "learning_rate": 3.7561798609655373e-07, "loss": 0.0291, "reward": 0.08031724044121802, "reward_std": 0.6911140829324722, "rewards/cosine_scaled_reward": -0.18430401291698217, "rewards/format_reward": 0.7916666939854622, "step": 332 }, { "completion_length": 1069.1875343322754, "epoch": 0.38057142857142856, "grad_norm": 4.158648490905762, "kl": 0.4372711181640625, "learning_rate": 3.72726140684072e-07, "loss": 0.0175, "reward": 0.406515009701252, "reward_std": 0.9121326096355915, "rewards/cosine_scaled_reward": -0.03547548362985253, "rewards/format_reward": 0.8541666865348816, "step": 333 }, { "completion_length": 1548.5625305175781, "epoch": 0.38171428571428573, "grad_norm": 2.0228302478790283, "kl": 1.1025390625, "learning_rate": 3.6984293534939737e-07, "loss": 0.0441, "reward": 0.0005341523792594671, "reward_std": 0.6353342607617378, "rewards/cosine_scaled_reward": -0.2071850085631013, "rewards/format_reward": 0.770833358168602, "step": 334 }, { "completion_length": 1218.1458740234375, "epoch": 0.38285714285714284, "grad_norm": 1.6079083681106567, "kl": 0.4420585632324219, "learning_rate": 3.6696851061588994e-07, "loss": 0.0177, "reward": 0.5085492568905465, "reward_std": 0.9739002659916878, "rewards/cosine_scaled_reward": 0.027598066721111536, "rewards/format_reward": 0.8750000223517418, "step": 335 }, { "completion_length": 1167.2917175292969, "epoch": 0.384, "grad_norm": 2.6033425331115723, "kl": 0.45883941650390625, "learning_rate": 3.641030065789562e-07, "loss": 0.0184, "reward": 0.43186818808317184, "reward_std": 0.8531835786998272, "rewards/cosine_scaled_reward": 0.05448053032159805, "rewards/format_reward": 0.8125000111758709, "step": 336 }, { "completion_length": 1327.1042022705078, "epoch": 0.3851428571428571, "grad_norm": 3.292928695678711, "kl": 0.3748321533203125, "learning_rate": 3.612465628992203e-07, "loss": 0.015, "reward": 0.2253723087196704, "reward_std": 0.7758054882287979, "rewards/cosine_scaled_reward": -0.13347763079218566, "rewards/format_reward": 0.895833358168602, "step": 337 }, { "completion_length": 1171.1875190734863, "epoch": 0.3862857142857143, "grad_norm": 6.154543876647949, "kl": 1.41033935546875, "learning_rate": 3.5839931879571725e-07, "loss": 0.0567, "reward": 0.21605271194130182, "reward_std": 0.6188852116465569, "rewards/cosine_scaled_reward": -0.02803803514689207, "rewards/format_reward": 0.7708333507180214, "step": 338 }, { "completion_length": 1353.3750381469727, "epoch": 0.38742857142857146, "grad_norm": 4.424708843231201, "kl": 0.518829345703125, "learning_rate": 3.555614130391079e-07, "loss": 0.0208, "reward": 0.09727495489642024, "reward_std": 0.6290086731314659, "rewards/cosine_scaled_reward": -0.1305898940190673, "rewards/format_reward": 0.7291666772216558, "step": 339 }, { "completion_length": 1321.708381652832, "epoch": 0.38857142857142857, "grad_norm": 1.7365025281906128, "kl": 1.028076171875, "learning_rate": 3.5273298394491515e-07, "loss": 0.0411, "reward": 0.2937823599204421, "reward_std": 0.857489574700594, "rewards/cosine_scaled_reward": -0.07948380801826715, "rewards/format_reward": 0.8333333432674408, "step": 340 }, { "completion_length": 1008.5208740234375, "epoch": 0.38971428571428574, "grad_norm": 2.895829677581787, "kl": 0.6253280639648438, "learning_rate": 3.4991416936678276e-07, "loss": 0.025, "reward": 0.5628593787550926, "reward_std": 0.7620139196515083, "rewards/cosine_scaled_reward": 0.1886910004541278, "rewards/format_reward": 0.7500000074505806, "step": 341 }, { "completion_length": 1325.9792175292969, "epoch": 0.39085714285714285, "grad_norm": 3.778715133666992, "kl": 1.060302734375, "learning_rate": 3.471051066897562e-07, "loss": 0.0424, "reward": 0.49639496579766273, "reward_std": 0.9629452601075172, "rewards/cosine_scaled_reward": 0.022836442454718053, "rewards/format_reward": 0.8333333432674408, "step": 342 }, { "completion_length": 1301.9792098999023, "epoch": 0.392, "grad_norm": 4.3889546394348145, "kl": 0.91851806640625, "learning_rate": 3.4430593282358777e-07, "loss": 0.0368, "reward": 0.5952606732025743, "reward_std": 0.8778381422162056, "rewards/cosine_scaled_reward": 0.0880544763058424, "rewards/format_reward": 0.8750000223517418, "step": 343 }, { "completion_length": 1580.520866394043, "epoch": 0.3931428571428571, "grad_norm": 2.815673589706421, "kl": 1.52044677734375, "learning_rate": 3.4151678419606233e-07, "loss": 0.0607, "reward": 0.5273411017842591, "reward_std": 0.7953666783869267, "rewards/cosine_scaled_reward": 0.1287881238386035, "rewards/format_reward": 0.7500000149011612, "step": 344 }, { "completion_length": 1205.958381652832, "epoch": 0.3942857142857143, "grad_norm": 4.648168563842773, "kl": 1.01983642578125, "learning_rate": 3.387377967463493e-07, "loss": 0.0409, "reward": 0.3727841805666685, "reward_std": 0.866691779345274, "rewards/cosine_scaled_reward": -0.026965959696099162, "rewards/format_reward": 0.8125000223517418, "step": 345 }, { "completion_length": 1131.4375381469727, "epoch": 0.3954285714285714, "grad_norm": 1.843777060508728, "kl": 0.547515869140625, "learning_rate": 3.359691059183761e-07, "loss": 0.0219, "reward": 0.3929372038692236, "reward_std": 0.8282114900648594, "rewards/cosine_scaled_reward": -0.039126552641391754, "rewards/format_reward": 0.8958333432674408, "step": 346 }, { "completion_length": 1396.1250381469727, "epoch": 0.3965714285714286, "grad_norm": 2.772951126098633, "kl": 0.83648681640625, "learning_rate": 3.3321084665422803e-07, "loss": 0.0334, "reward": 0.04932975070551038, "reward_std": 0.7110629379749298, "rewards/cosine_scaled_reward": -0.2151939356699586, "rewards/format_reward": 0.833333358168602, "step": 347 }, { "completion_length": 1094.2500381469727, "epoch": 0.3977142857142857, "grad_norm": 3.0713284015655518, "kl": 0.55780029296875, "learning_rate": 3.3046315338757026e-07, "loss": 0.0223, "reward": 0.3814494190737605, "reward_std": 0.7161810956895351, "rewards/cosine_scaled_reward": -0.050093160942196846, "rewards/format_reward": 0.9583333432674408, "step": 348 }, { "completion_length": 1494.020866394043, "epoch": 0.39885714285714285, "grad_norm": 7.005665302276611, "kl": 1.7324066162109375, "learning_rate": 3.2772616003709616e-07, "loss": 0.0694, "reward": 0.3296292170416564, "reward_std": 0.8451166786253452, "rewards/cosine_scaled_reward": -0.04683339223265648, "rewards/format_reward": 0.8125000260770321, "step": 349 }, { "completion_length": 1154.4167022705078, "epoch": 0.4, "grad_norm": 1.4422372579574585, "kl": 0.5752105712890625, "learning_rate": 3.250000000000001e-07, "loss": 0.023, "reward": 0.2720920806750655, "reward_std": 0.9464740082621574, "rewards/cosine_scaled_reward": -0.13292736560106277, "rewards/format_reward": 0.854166679084301, "step": 350 }, { "completion_length": 1267.2708892822266, "epoch": 0.40114285714285713, "grad_norm": 2.480191946029663, "kl": 0.645751953125, "learning_rate": 3.222848061454764e-07, "loss": 0.0258, "reward": 0.17013419978320599, "reward_std": 0.7632426917552948, "rewards/cosine_scaled_reward": -0.08551971893757582, "rewards/format_reward": 0.7083333544433117, "step": 351 }, { "completion_length": 1286.5000228881836, "epoch": 0.4022857142857143, "grad_norm": 2.153872489929199, "kl": 0.580810546875, "learning_rate": 3.195807108082429e-07, "loss": 0.0232, "reward": 0.12328234064625576, "reward_std": 0.8743564449250698, "rewards/cosine_scaled_reward": -0.14927133545279503, "rewards/format_reward": 0.7083333507180214, "step": 352 }, { "completion_length": 1413.1042251586914, "epoch": 0.4034285714285714, "grad_norm": 2.458132266998291, "kl": 0.6356201171875, "learning_rate": 3.168878457820915e-07, "loss": 0.0255, "reward": 0.47827470442280173, "reward_std": 0.9372801259160042, "rewards/cosine_scaled_reward": 0.06890620663762093, "rewards/format_reward": 0.7291666865348816, "step": 353 }, { "completion_length": 1377.5000457763672, "epoch": 0.4045714285714286, "grad_norm": 1.8596700429916382, "kl": 0.601806640625, "learning_rate": 3.142063423134644e-07, "loss": 0.0241, "reward": 0.3115939125418663, "reward_std": 0.9930433630943298, "rewards/cosine_scaled_reward": -0.01708403415977955, "rewards/format_reward": 0.6458333544433117, "step": 354 }, { "completion_length": 1212.645881652832, "epoch": 0.4057142857142857, "grad_norm": 1.9313182830810547, "kl": 0.514892578125, "learning_rate": 3.115363310950578e-07, "loss": 0.0206, "reward": 0.17402092670090497, "reward_std": 0.8653297163546085, "rewards/cosine_scaled_reward": -0.06491286307573318, "rewards/format_reward": 0.6250000149011612, "step": 355 }, { "completion_length": 1633.4584197998047, "epoch": 0.40685714285714286, "grad_norm": 1.906638503074646, "kl": 0.5458984375, "learning_rate": 3.0887794225945143e-07, "loss": 0.0218, "reward": 0.2535629828926176, "reward_std": 0.8886833339929581, "rewards/cosine_scaled_reward": -0.06057508382946253, "rewards/format_reward": 0.7083333469927311, "step": 356 }, { "completion_length": 1881.8751029968262, "epoch": 0.408, "grad_norm": 2.3647892475128174, "kl": 0.819091796875, "learning_rate": 3.062313053727671e-07, "loss": 0.0328, "reward": -0.11886521428823471, "reward_std": 0.6593221351504326, "rewards/cosine_scaled_reward": -0.2341746687889099, "rewards/format_reward": 0.6250000186264515, "step": 357 }, { "completion_length": 1829.6250305175781, "epoch": 0.40914285714285714, "grad_norm": 1.856693148612976, "kl": 0.70489501953125, "learning_rate": 3.0359654942835247e-07, "loss": 0.0282, "reward": 0.1773190926760435, "reward_std": 0.8244731090962887, "rewards/cosine_scaled_reward": -0.05620474135503173, "rewards/format_reward": 0.5833333544433117, "step": 358 }, { "completion_length": 1053.9375305175781, "epoch": 0.4102857142857143, "grad_norm": 3.442803382873535, "kl": 0.8038330078125, "learning_rate": 3.0097380284049523e-07, "loss": 0.0322, "reward": 0.16565631795674562, "reward_std": 0.8398986533284187, "rewards/cosine_scaled_reward": -0.12338497827295214, "rewards/format_reward": 0.7291666939854622, "step": 359 }, { "completion_length": 1288.1875305175781, "epoch": 0.4114285714285714, "grad_norm": 1.7091789245605469, "kl": 0.526123046875, "learning_rate": 2.9836319343816397e-07, "loss": 0.0211, "reward": 0.2598306848667562, "reward_std": 0.8984776921570301, "rewards/cosine_scaled_reward": -0.05199693702161312, "rewards/format_reward": 0.6875000223517418, "step": 360 }, { "completion_length": 1407.9583435058594, "epoch": 0.4125714285714286, "grad_norm": 3.6575090885162354, "kl": 0.920166015625, "learning_rate": 2.9576484845877793e-07, "loss": 0.0368, "reward": 0.022162297973409295, "reward_std": 0.6297386400401592, "rewards/cosine_scaled_reward": -0.17034196108579636, "rewards/format_reward": 0.7083333432674408, "step": 361 }, { "completion_length": 1246.812572479248, "epoch": 0.4137142857142857, "grad_norm": 3.3759262561798096, "kl": 0.9285125732421875, "learning_rate": 2.931788945420058e-07, "loss": 0.0371, "reward": 0.17949877493083477, "reward_std": 0.7008433230221272, "rewards/cosine_scaled_reward": -0.06406344473361969, "rewards/format_reward": 0.6875000204890966, "step": 362 }, { "completion_length": 935.9166870117188, "epoch": 0.41485714285714287, "grad_norm": 2.3449277877807617, "kl": 0.5015869140625, "learning_rate": 2.9060545772359305e-07, "loss": 0.02, "reward": 0.462602804065682, "reward_std": 0.8693399466574192, "rewards/cosine_scaled_reward": 0.1225482877343893, "rewards/format_reward": 0.687500013038516, "step": 363 }, { "completion_length": 1301.6042098999023, "epoch": 0.416, "grad_norm": 2.7286217212677, "kl": 0.5523681640625, "learning_rate": 2.8804466342921987e-07, "loss": 0.0221, "reward": -0.18497540964744985, "reward_std": 0.596361830830574, "rewards/cosine_scaled_reward": -0.2775970436632633, "rewards/format_reward": 0.6458333507180214, "step": 364 }, { "completion_length": 1725.3958587646484, "epoch": 0.41714285714285715, "grad_norm": 2.718860387802124, "kl": 0.6279296875, "learning_rate": 2.854966364683872e-07, "loss": 0.0252, "reward": 0.07833639718592167, "reward_std": 0.783052921295166, "rewards/cosine_scaled_reward": -0.14490601420402527, "rewards/format_reward": 0.6458333432674408, "step": 365 }, { "completion_length": 1542.166690826416, "epoch": 0.41828571428571426, "grad_norm": 2.4428579807281494, "kl": 0.68994140625, "learning_rate": 2.829615010283344e-07, "loss": 0.0276, "reward": 0.2222783851902932, "reward_std": 0.9212566986680031, "rewards/cosine_scaled_reward": 0.022268712986260653, "rewards/format_reward": 0.5416666753590107, "step": 366 }, { "completion_length": 1529.7708854675293, "epoch": 0.41942857142857143, "grad_norm": 1.7189098596572876, "kl": 0.618408203125, "learning_rate": 2.8043938066798645e-07, "loss": 0.0247, "reward": 0.13789566839113832, "reward_std": 0.7518769651651382, "rewards/cosine_scaled_reward": -0.08000693749636412, "rewards/format_reward": 0.6458333525806665, "step": 367 }, { "completion_length": 1874.6250610351562, "epoch": 0.4205714285714286, "grad_norm": 2.6109297275543213, "kl": 0.5501708984375, "learning_rate": 2.7793039831193133e-07, "loss": 0.022, "reward": 0.1437191739678383, "reward_std": 0.809885136783123, "rewards/cosine_scaled_reward": -0.07925083208829165, "rewards/format_reward": 0.604166679084301, "step": 368 }, { "completion_length": 1595.6458892822266, "epoch": 0.4217142857142857, "grad_norm": 2.485452890396118, "kl": 0.75439453125, "learning_rate": 2.7543467624442956e-07, "loss": 0.0302, "reward": 0.23247148096561432, "reward_std": 0.9586904980242252, "rewards/cosine_scaled_reward": -0.044586887350305915, "rewards/format_reward": 0.6041666809469461, "step": 369 }, { "completion_length": 1477.1458740234375, "epoch": 0.4228571428571429, "grad_norm": 2.785557508468628, "kl": 0.76904296875, "learning_rate": 2.729523361034538e-07, "loss": 0.0308, "reward": 0.12985735037364066, "reward_std": 0.6629677005112171, "rewards/cosine_scaled_reward": -0.04906688700430095, "rewards/format_reward": 0.6250000149011612, "step": 370 }, { "completion_length": 1081.5417079925537, "epoch": 0.424, "grad_norm": 2.7538247108459473, "kl": 0.563232421875, "learning_rate": 2.7048349887476037e-07, "loss": 0.0225, "reward": 0.592208469286561, "reward_std": 0.7628613486886024, "rewards/cosine_scaled_reward": 0.19106208952143788, "rewards/format_reward": 0.8125000186264515, "step": 371 }, { "completion_length": 1689.6458892822266, "epoch": 0.42514285714285716, "grad_norm": 2.427741050720215, "kl": 0.419677734375, "learning_rate": 2.6802828488599294e-07, "loss": 0.0168, "reward": 0.2929907846264541, "reward_std": 0.8767264820635319, "rewards/cosine_scaled_reward": -0.03612378612160683, "rewards/format_reward": 0.7083333507180214, "step": 372 }, { "completion_length": 1054.0833702087402, "epoch": 0.42628571428571427, "grad_norm": 3.8023006916046143, "kl": 0.587646484375, "learning_rate": 2.655868138008171e-07, "loss": 0.0235, "reward": 0.05784207722172141, "reward_std": 0.7628821134567261, "rewards/cosine_scaled_reward": -0.17036252235993743, "rewards/format_reward": 0.6875000260770321, "step": 373 }, { "completion_length": 1296.9375305175781, "epoch": 0.42742857142857144, "grad_norm": 3.30541729927063, "kl": 0.861572265625, "learning_rate": 2.631592046130896e-07, "loss": 0.0345, "reward": 0.2668555803829804, "reward_std": 0.8589187040925026, "rewards/cosine_scaled_reward": -0.026118143810890615, "rewards/format_reward": 0.6666666753590107, "step": 374 }, { "completion_length": 1779.8958549499512, "epoch": 0.42857142857142855, "grad_norm": 3.2860774993896484, "kl": 0.71484375, "learning_rate": 2.6074557564105724e-07, "loss": 0.0286, "reward": 0.2315208874642849, "reward_std": 0.9205419048666954, "rewards/cosine_scaled_reward": -0.08181109488941729, "rewards/format_reward": 0.6875000260770321, "step": 375 }, { "completion_length": 1341.812515258789, "epoch": 0.4297142857142857, "grad_norm": 1.9312739372253418, "kl": 0.6658935546875, "learning_rate": 2.583460445215911e-07, "loss": 0.0266, "reward": 0.11426342278718948, "reward_std": 0.7130968421697617, "rewards/cosine_scaled_reward": -0.13826850580517203, "rewards/format_reward": 0.7708333507180214, "step": 376 }, { "completion_length": 1541.6042098999023, "epoch": 0.4308571428571429, "grad_norm": 1.676692008972168, "kl": 0.5164794921875, "learning_rate": 2.5596072820445254e-07, "loss": 0.0206, "reward": 0.11265636514872313, "reward_std": 0.6777131371200085, "rewards/cosine_scaled_reward": -0.12696546595543623, "rewards/format_reward": 0.7291666939854622, "step": 377 }, { "completion_length": 1453.2917251586914, "epoch": 0.432, "grad_norm": 2.0128226280212402, "kl": 0.51177978515625, "learning_rate": 2.5358974294659373e-07, "loss": 0.0205, "reward": 0.31238897051662207, "reward_std": 1.0008442774415016, "rewards/cosine_scaled_reward": -0.07301739603281021, "rewards/format_reward": 0.7500000223517418, "step": 378 }, { "completion_length": 1567.0000457763672, "epoch": 0.43314285714285716, "grad_norm": 2.777541160583496, "kl": 0.682373046875, "learning_rate": 2.512332043064913e-07, "loss": 0.0273, "reward": 0.006588555872440338, "reward_std": 0.7485722936689854, "rewards/cosine_scaled_reward": -0.18877192959189415, "rewards/format_reward": 0.6666666846722364, "step": 379 }, { "completion_length": 1324.5208854675293, "epoch": 0.4342857142857143, "grad_norm": 3.0375523567199707, "kl": 0.670166015625, "learning_rate": 2.488912271385139e-07, "loss": 0.0268, "reward": 0.2578076588688418, "reward_std": 0.9591740146279335, "rewards/cosine_scaled_reward": -0.012080159038305283, "rewards/format_reward": 0.5416666846722364, "step": 380 }, { "completion_length": 1730.3125762939453, "epoch": 0.43542857142857144, "grad_norm": 1.9336341619491577, "kl": 0.7171630859375, "learning_rate": 2.465639255873246e-07, "loss": 0.0287, "reward": -0.1746607469394803, "reward_std": 0.5531054511666298, "rewards/cosine_scaled_reward": -0.2578210327774286, "rewards/format_reward": 0.6458333469927311, "step": 381 }, { "completion_length": 1355.208366394043, "epoch": 0.43657142857142855, "grad_norm": 2.0603504180908203, "kl": 0.6038818359375, "learning_rate": 2.4425141308231765e-07, "loss": 0.0242, "reward": 0.16119742300361395, "reward_std": 0.8973702676594257, "rewards/cosine_scaled_reward": -0.13088632840663195, "rewards/format_reward": 0.7083333469927311, "step": 382 }, { "completion_length": 1379.7292098999023, "epoch": 0.4377142857142857, "grad_norm": 1.975783348083496, "kl": 0.513427734375, "learning_rate": 2.4195380233209006e-07, "loss": 0.0206, "reward": 0.5386483520269394, "reward_std": 0.9303784146904945, "rewards/cosine_scaled_reward": 0.09457700047641993, "rewards/format_reward": 0.7500000111758709, "step": 383 }, { "completion_length": 1441.4375305175781, "epoch": 0.43885714285714283, "grad_norm": 2.0518715381622314, "kl": 0.517333984375, "learning_rate": 2.3967120531894857e-07, "loss": 0.0207, "reward": 0.5782037973403931, "reward_std": 1.07766717299819, "rewards/cosine_scaled_reward": 0.11610157322138548, "rewards/format_reward": 0.6875000260770321, "step": 384 }, { "completion_length": 1418.8958740234375, "epoch": 0.44, "grad_norm": 2.584573745727539, "kl": 0.51806640625, "learning_rate": 2.374037332934512e-07, "loss": 0.0207, "reward": 0.05193836707621813, "reward_std": 0.8466444984078407, "rewards/cosine_scaled_reward": -0.1831620568409562, "rewards/format_reward": 0.6875000149011612, "step": 385 }, { "completion_length": 1452.9166946411133, "epoch": 0.44114285714285717, "grad_norm": 2.1661787033081055, "kl": 0.517578125, "learning_rate": 2.3515149676898552e-07, "loss": 0.0207, "reward": 0.14932426065206528, "reward_std": 0.8432199582457542, "rewards/cosine_scaled_reward": -0.10539527935907245, "rewards/format_reward": 0.6458333544433117, "step": 386 }, { "completion_length": 1647.145866394043, "epoch": 0.4422857142857143, "grad_norm": 1.8875645399093628, "kl": 0.56982421875, "learning_rate": 2.3291460551638237e-07, "loss": 0.0228, "reward": 0.01697136198345106, "reward_std": 0.7178105190396309, "rewards/cosine_scaled_reward": -0.11740265972912312, "rewards/format_reward": 0.604166679084301, "step": 387 }, { "completion_length": 1262.020866394043, "epoch": 0.44342857142857145, "grad_norm": 4.029469966888428, "kl": 0.7529296875, "learning_rate": 2.306931685585657e-07, "loss": 0.0301, "reward": 0.3093376103788614, "reward_std": 0.9565748050808907, "rewards/cosine_scaled_reward": -0.05382521077990532, "rewards/format_reward": 0.7291666865348816, "step": 388 }, { "completion_length": 1679.1667404174805, "epoch": 0.44457142857142856, "grad_norm": 1.9148039817810059, "kl": 0.68115234375, "learning_rate": 2.2848729416523859e-07, "loss": 0.0272, "reward": 0.03406955860555172, "reward_std": 0.8620793931186199, "rewards/cosine_scaled_reward": -0.1516010407358408, "rewards/format_reward": 0.5833333469927311, "step": 389 }, { "completion_length": 1607.0625305175781, "epoch": 0.44571428571428573, "grad_norm": 2.3997583389282227, "kl": 0.6044921875, "learning_rate": 2.2629708984760706e-07, "loss": 0.0242, "reward": -0.03216620907187462, "reward_std": 0.7698984518647194, "rewards/cosine_scaled_reward": -0.18137890007346869, "rewards/format_reward": 0.6041666865348816, "step": 390 }, { "completion_length": 1295.2916870117188, "epoch": 0.44685714285714284, "grad_norm": 2.192011833190918, "kl": 0.499267578125, "learning_rate": 2.2412266235313973e-07, "loss": 0.02, "reward": 0.3369876742362976, "reward_std": 0.9241867884993553, "rewards/cosine_scaled_reward": -0.05256127379834652, "rewards/format_reward": 0.7500000223517418, "step": 391 }, { "completion_length": 1388.0625305175781, "epoch": 0.448, "grad_norm": 1.6250756978988647, "kl": 0.4912109375, "learning_rate": 2.2196411766036487e-07, "loss": 0.0196, "reward": 0.3453047815710306, "reward_std": 0.924341045320034, "rewards/cosine_scaled_reward": -0.037261209450662136, "rewards/format_reward": 0.7708333432674408, "step": 392 }, { "completion_length": 1362.3333587646484, "epoch": 0.4491428571428571, "grad_norm": 2.632045269012451, "kl": 0.560302734375, "learning_rate": 2.1982156097370557e-07, "loss": 0.0224, "reward": 0.3152189594693482, "reward_std": 0.8564588874578476, "rewards/cosine_scaled_reward": -0.05929289385676384, "rewards/format_reward": 0.7916666865348816, "step": 393 }, { "completion_length": 1506.1666870117188, "epoch": 0.4502857142857143, "grad_norm": 2.0749998092651367, "kl": 0.65478515625, "learning_rate": 2.1769509671835223e-07, "loss": 0.0262, "reward": -0.0516182022402063, "reward_std": 0.6332247443497181, "rewards/cosine_scaled_reward": -0.17348778434097767, "rewards/format_reward": 0.6041666865348816, "step": 394 }, { "completion_length": 1105.4583587646484, "epoch": 0.4514285714285714, "grad_norm": 3.7922682762145996, "kl": 0.60986328125, "learning_rate": 2.1558482853517253e-07, "loss": 0.0244, "reward": 0.3151353672146797, "reward_std": 0.7997763678431511, "rewards/cosine_scaled_reward": 0.014608239755034447, "rewards/format_reward": 0.6666666828095913, "step": 395 }, { "completion_length": 1559.6875762939453, "epoch": 0.45257142857142857, "grad_norm": 3.7929108142852783, "kl": 0.51507568359375, "learning_rate": 2.134908592756607e-07, "loss": 0.0206, "reward": 0.1322829071432352, "reward_std": 0.8961255550384521, "rewards/cosine_scaled_reward": -0.12251237966120243, "rewards/format_reward": 0.6458333507180214, "step": 396 }, { "completion_length": 1392.083381652832, "epoch": 0.45371428571428574, "grad_norm": 1.7970480918884277, "kl": 0.5240478515625, "learning_rate": 2.1141329099692406e-07, "loss": 0.0209, "reward": 0.24729618662968278, "reward_std": 0.7044042460620403, "rewards/cosine_scaled_reward": -0.03716139169409871, "rewards/format_reward": 0.7291666902601719, "step": 397 }, { "completion_length": 1375.2500228881836, "epoch": 0.45485714285714285, "grad_norm": 2.7973999977111816, "kl": 0.5516357421875, "learning_rate": 2.0935222495670968e-07, "loss": 0.0221, "reward": 0.05064563453197479, "reward_std": 0.7481582798063755, "rewards/cosine_scaled_reward": -0.16217901837080717, "rewards/format_reward": 0.6666666828095913, "step": 398 }, { "completion_length": 1417.9375381469727, "epoch": 0.456, "grad_norm": 3.365849733352661, "kl": 0.652587890625, "learning_rate": 2.0730776160846853e-07, "loss": 0.0261, "reward": 0.24118162877857685, "reward_std": 0.9246044531464577, "rewards/cosine_scaled_reward": -0.05636600544676185, "rewards/format_reward": 0.6666666865348816, "step": 399 }, { "completion_length": 835.3958549499512, "epoch": 0.45714285714285713, "grad_norm": 2.1656346321105957, "kl": 0.446533203125, "learning_rate": 2.0528000059645995e-07, "loss": 0.0178, "reward": 0.66196015663445, "reward_std": 1.0285310856997967, "rewards/cosine_scaled_reward": 0.17172618986387533, "rewards/format_reward": 0.8541666939854622, "step": 400 }, { "completion_length": 1314.083396911621, "epoch": 0.4582857142857143, "grad_norm": 1.6506296396255493, "kl": 0.44915771484375, "learning_rate": 2.032690407508949e-07, "loss": 0.018, "reward": 0.157606887165457, "reward_std": 0.7822442874312401, "rewards/cosine_scaled_reward": -0.11768818949349225, "rewards/format_reward": 0.7291666865348816, "step": 401 }, { "completion_length": 1331.9583892822266, "epoch": 0.4594285714285714, "grad_norm": 3.1099698543548584, "kl": 0.589599609375, "learning_rate": 2.0127498008311922e-07, "loss": 0.0236, "reward": 0.2424939200282097, "reward_std": 0.7927302867174149, "rewards/cosine_scaled_reward": -0.0522354356944561, "rewards/format_reward": 0.6875000260770321, "step": 402 }, { "completion_length": 1337.020881652832, "epoch": 0.4605714285714286, "grad_norm": 2.6694633960723877, "kl": 0.537109375, "learning_rate": 1.9929791578083655e-07, "loss": 0.0215, "reward": 0.2716318762395531, "reward_std": 0.8227218054234982, "rewards/cosine_scaled_reward": -0.02993021416477859, "rewards/format_reward": 0.6666666902601719, "step": 403 }, { "completion_length": 1489.7708740234375, "epoch": 0.4617142857142857, "grad_norm": 2.208977460861206, "kl": 0.66632080078125, "learning_rate": 1.9733794420337213e-07, "loss": 0.0267, "reward": 0.28225214779376984, "reward_std": 0.8597016483545303, "rewards/cosine_scaled_reward": -0.004551528720185161, "rewards/format_reward": 0.7083333395421505, "step": 404 }, { "completion_length": 1259.8333587646484, "epoch": 0.46285714285714286, "grad_norm": 4.63955545425415, "kl": 0.81591796875, "learning_rate": 1.9539516087697517e-07, "loss": 0.0327, "reward": 0.35530247224960476, "reward_std": 0.8299554735422134, "rewards/cosine_scaled_reward": 0.037398045882582664, "rewards/format_reward": 0.7291666865348816, "step": 405 }, { "completion_length": 1513.1250610351562, "epoch": 0.464, "grad_norm": 2.6123645305633545, "kl": 0.635986328125, "learning_rate": 1.934696604901642e-07, "loss": 0.0254, "reward": 0.06231710687279701, "reward_std": 0.8518500626087189, "rewards/cosine_scaled_reward": -0.1575743369758129, "rewards/format_reward": 0.6458333507180214, "step": 406 }, { "completion_length": 1208.5000228881836, "epoch": 0.46514285714285714, "grad_norm": 3.125258445739746, "kl": 0.557861328125, "learning_rate": 1.915615368891117e-07, "loss": 0.0224, "reward": 0.28017069818452, "reward_std": 0.7779465243220329, "rewards/cosine_scaled_reward": -0.040886467322707176, "rewards/format_reward": 0.7291666939854622, "step": 407 }, { "completion_length": 1217.5208587646484, "epoch": 0.4662857142857143, "grad_norm": 2.7784037590026855, "kl": 0.427734375, "learning_rate": 1.8967088307307e-07, "loss": 0.0171, "reward": 0.3413910511881113, "reward_std": 0.6852421574294567, "rewards/cosine_scaled_reward": -0.017510855570435524, "rewards/format_reward": 0.8958333432674408, "step": 408 }, { "completion_length": 1725.041748046875, "epoch": 0.4674285714285714, "grad_norm": 1.8275662660598755, "kl": 0.676513671875, "learning_rate": 1.8779779118983867e-07, "loss": 0.0271, "reward": 0.21645420044660568, "reward_std": 0.8697847276926041, "rewards/cosine_scaled_reward": -0.08236874872818589, "rewards/format_reward": 0.7083333544433117, "step": 409 }, { "completion_length": 1496.8333892822266, "epoch": 0.4685714285714286, "grad_norm": 3.2171754837036133, "kl": 0.934356689453125, "learning_rate": 1.8594235253127372e-07, "loss": 0.0374, "reward": 0.17047240026295185, "reward_std": 0.7880502119660378, "rewards/cosine_scaled_reward": -0.04659413266927004, "rewards/format_reward": 0.583333345130086, "step": 410 }, { "completion_length": 1500.4792098999023, "epoch": 0.4697142857142857, "grad_norm": 1.7640888690948486, "kl": 0.608154296875, "learning_rate": 1.8410465752883758e-07, "loss": 0.0243, "reward": 0.3746693143621087, "reward_std": 1.0237346589565277, "rewards/cosine_scaled_reward": -0.025670517061371356, "rewards/format_reward": 0.7500000149011612, "step": 411 }, { "completion_length": 1243.1042098999023, "epoch": 0.47085714285714286, "grad_norm": 3.035127878189087, "kl": 0.6143798828125, "learning_rate": 1.822847957491922e-07, "loss": 0.0246, "reward": 0.37103684339672327, "reward_std": 0.9396983981132507, "rewards/cosine_scaled_reward": -0.03727317973971367, "rewards/format_reward": 0.791666679084301, "step": 412 }, { "completion_length": 1486.4166946411133, "epoch": 0.472, "grad_norm": 2.925623655319214, "kl": 0.7296142578125, "learning_rate": 1.804828558898332e-07, "loss": 0.0292, "reward": 0.19466528482735157, "reward_std": 0.7697472274303436, "rewards/cosine_scaled_reward": -0.08936256961897016, "rewards/format_reward": 0.6875000149011612, "step": 413 }, { "completion_length": 1481.3125381469727, "epoch": 0.47314285714285714, "grad_norm": 2.3652820587158203, "kl": 0.717041015625, "learning_rate": 1.7869892577476722e-07, "loss": 0.0286, "reward": 0.06787084229290485, "reward_std": 0.8079018853604794, "rewards/cosine_scaled_reward": -0.15581453032791615, "rewards/format_reward": 0.6666666865348816, "step": 414 }, { "completion_length": 1480.1875610351562, "epoch": 0.4742857142857143, "grad_norm": 1.5522794723510742, "kl": 0.653564453125, "learning_rate": 1.7693309235023127e-07, "loss": 0.0261, "reward": 0.2685818700119853, "reward_std": 0.9294859580695629, "rewards/cosine_scaled_reward": -0.06137389224022627, "rewards/format_reward": 0.6875000186264515, "step": 415 }, { "completion_length": 1191.3125534057617, "epoch": 0.4754285714285714, "grad_norm": 2.506166696548462, "kl": 0.4866943359375, "learning_rate": 1.7518544168045524e-07, "loss": 0.0195, "reward": 0.540099716745317, "reward_std": 0.9752370566129684, "rewards/cosine_scaled_reward": 0.078727146377787, "rewards/format_reward": 0.833333358168602, "step": 416 }, { "completion_length": 1577.3750686645508, "epoch": 0.4765714285714286, "grad_norm": 2.541682004928589, "kl": 0.90380859375, "learning_rate": 1.7345605894346726e-07, "loss": 0.0361, "reward": 0.19305734895169735, "reward_std": 0.8487963303923607, "rewards/cosine_scaled_reward": -0.08683300111442804, "rewards/format_reward": 0.6666666902601719, "step": 417 }, { "completion_length": 1273.8958473205566, "epoch": 0.4777142857142857, "grad_norm": 2.7441353797912598, "kl": 0.631591796875, "learning_rate": 1.7174502842694212e-07, "loss": 0.0253, "reward": 0.5500797647982836, "reward_std": 0.9426254630088806, "rewards/cosine_scaled_reward": 0.08637175487820059, "rewards/format_reward": 0.8333333507180214, "step": 418 }, { "completion_length": 1354.8542098999023, "epoch": 0.47885714285714287, "grad_norm": 2.3867642879486084, "kl": 0.687744140625, "learning_rate": 1.7005243352409333e-07, "loss": 0.0275, "reward": 0.3600006675114855, "reward_std": 0.9771365597844124, "rewards/cosine_scaled_reward": -0.004668326582759619, "rewards/format_reward": 0.7083333469927311, "step": 419 }, { "completion_length": 1028.6250381469727, "epoch": 0.48, "grad_norm": 10.020596504211426, "kl": 0.8775634765625, "learning_rate": 1.6837835672960831e-07, "loss": 0.0351, "reward": 0.07369395159184933, "reward_std": 0.7157322019338608, "rewards/cosine_scaled_reward": -0.15065189078450203, "rewards/format_reward": 0.7083333544433117, "step": 420 }, { "completion_length": 1321.0833587646484, "epoch": 0.48114285714285715, "grad_norm": 3.3951587677001953, "kl": 0.703125, "learning_rate": 1.6672287963562852e-07, "loss": 0.0281, "reward": -0.07291397266089916, "reward_std": 0.6696721352636814, "rewards/cosine_scaled_reward": -0.20483622467145324, "rewards/format_reward": 0.6458333469927311, "step": 421 }, { "completion_length": 1193.3541946411133, "epoch": 0.48228571428571426, "grad_norm": 2.8352770805358887, "kl": 0.587646484375, "learning_rate": 1.6508608292777203e-07, "loss": 0.0235, "reward": 0.2525853253901005, "reward_std": 0.8423508293926716, "rewards/cosine_scaled_reward": -0.07235960848629475, "rewards/format_reward": 0.7500000149011612, "step": 422 }, { "completion_length": 1172.8333740234375, "epoch": 0.48342857142857143, "grad_norm": 2.012563467025757, "kl": 0.49560546875, "learning_rate": 1.6346804638120098e-07, "loss": 0.0198, "reward": 0.09193800436332822, "reward_std": 0.7620077319443226, "rewards/cosine_scaled_reward": -0.18725344724953175, "rewards/format_reward": 0.7916666939854622, "step": 423 }, { "completion_length": 1772.6875534057617, "epoch": 0.4845714285714286, "grad_norm": 1.4459457397460938, "kl": 0.5943603515625, "learning_rate": 1.6186884885673413e-07, "loss": 0.0238, "reward": 0.2721530678682029, "reward_std": 0.9363379552960396, "rewards/cosine_scaled_reward": -0.06505887769162655, "rewards/format_reward": 0.7083333544433117, "step": 424 }, { "completion_length": 1682.208366394043, "epoch": 0.4857142857142857, "grad_norm": 2.2990784645080566, "kl": 0.73602294921875, "learning_rate": 1.6028856829700258e-07, "loss": 0.0294, "reward": 0.5255491202697158, "reward_std": 1.0602496266365051, "rewards/cosine_scaled_reward": 0.10401645209640265, "rewards/format_reward": 0.6666666772216558, "step": 425 }, { "completion_length": 1195.916690826416, "epoch": 0.4868571428571429, "grad_norm": 2.467822790145874, "kl": 0.5208740234375, "learning_rate": 1.5872728172265146e-07, "loss": 0.0208, "reward": 0.11827224772423506, "reward_std": 0.6336887441575527, "rewards/cosine_scaled_reward": -0.1465412126854062, "rewards/format_reward": 0.7916666939854622, "step": 426 }, { "completion_length": 1604.0208587646484, "epoch": 0.488, "grad_norm": 2.8577768802642822, "kl": 0.614990234375, "learning_rate": 1.5718506522858572e-07, "loss": 0.0246, "reward": 0.27578693721443415, "reward_std": 1.0030243545770645, "rewards/cosine_scaled_reward": -0.04830464324913919, "rewards/format_reward": 0.6666666865348816, "step": 427 }, { "completion_length": 1337.7916946411133, "epoch": 0.48914285714285716, "grad_norm": 1.5771373510360718, "kl": 0.516357421875, "learning_rate": 1.5566199398026147e-07, "loss": 0.0206, "reward": 0.31163547467440367, "reward_std": 0.8827511817216873, "rewards/cosine_scaled_reward": -0.05672437600696867, "rewards/format_reward": 0.791666679084301, "step": 428 }, { "completion_length": 1092.6458854675293, "epoch": 0.49028571428571427, "grad_norm": 7.517470836639404, "kl": 0.735107421875, "learning_rate": 1.5415814221002265e-07, "loss": 0.0294, "reward": -0.04257449973374605, "reward_std": 0.6279182583093643, "rewards/cosine_scaled_reward": -0.2530031790956855, "rewards/format_reward": 0.7916666865348816, "step": 429 }, { "completion_length": 1677.7500686645508, "epoch": 0.49142857142857144, "grad_norm": 2.1630637645721436, "kl": 0.6884765625, "learning_rate": 1.5267358321348285e-07, "loss": 0.0276, "reward": 0.20026828069239855, "reward_std": 0.8897528201341629, "rewards/cosine_scaled_reward": -0.10384058998897672, "rewards/format_reward": 0.7291666865348816, "step": 430 }, { "completion_length": 1051.2500381469727, "epoch": 0.49257142857142855, "grad_norm": 2.162562847137451, "kl": 0.677490234375, "learning_rate": 1.5120838934595337e-07, "loss": 0.0271, "reward": 0.0934929153881967, "reward_std": 0.6808763556182384, "rewards/cosine_scaled_reward": -0.1685788333415985, "rewards/format_reward": 0.791666679084301, "step": 431 }, { "completion_length": 1581.7917098999023, "epoch": 0.4937142857142857, "grad_norm": 3.096701145172119, "kl": 0.9864044189453125, "learning_rate": 1.4976263201891613e-07, "loss": 0.0394, "reward": 0.003629262908361852, "reward_std": 0.6885860674083233, "rewards/cosine_scaled_reward": -0.15299149602651596, "rewards/format_reward": 0.6250000074505806, "step": 432 }, { "completion_length": 1580.2708740234375, "epoch": 0.4948571428571429, "grad_norm": 2.7364211082458496, "kl": 0.75, "learning_rate": 1.483363816965435e-07, "loss": 0.03, "reward": 0.15776942297816277, "reward_std": 0.8655783608555794, "rewards/cosine_scaled_reward": -0.08985630236566067, "rewards/format_reward": 0.6041666828095913, "step": 433 }, { "completion_length": 1532.2291984558105, "epoch": 0.496, "grad_norm": 1.8213260173797607, "kl": 0.5916748046875, "learning_rate": 1.469297078922642e-07, "loss": 0.0237, "reward": -0.09767143800854683, "reward_std": 0.6334349103271961, "rewards/cosine_scaled_reward": -0.22124753845855594, "rewards/format_reward": 0.6458333544433117, "step": 434 }, { "completion_length": 942.9166793823242, "epoch": 0.49714285714285716, "grad_norm": 2.527085542678833, "kl": 0.7471923828125, "learning_rate": 1.4554267916537495e-07, "loss": 0.0299, "reward": -0.003471766598522663, "reward_std": 0.5359638221561909, "rewards/cosine_scaled_reward": -0.20710261538624763, "rewards/format_reward": 0.8125000223517418, "step": 435 }, { "completion_length": 1480.1250381469727, "epoch": 0.4982857142857143, "grad_norm": 2.621030569076538, "kl": 0.82220458984375, "learning_rate": 1.4417536311769885e-07, "loss": 0.0329, "reward": 0.3342234673909843, "reward_std": 0.8837791383266449, "rewards/cosine_scaled_reward": 0.03901571640744805, "rewards/format_reward": 0.6250000149011612, "step": 436 }, { "completion_length": 1359.333381652832, "epoch": 0.49942857142857144, "grad_norm": 2.2266130447387695, "kl": 0.56341552734375, "learning_rate": 1.4282782639029128e-07, "loss": 0.0225, "reward": 0.17527570901438594, "reward_std": 0.7361086085438728, "rewards/cosine_scaled_reward": -0.09591093473136425, "rewards/format_reward": 0.7083333618938923, "step": 437 }, { "completion_length": 1493.1875457763672, "epoch": 0.5005714285714286, "grad_norm": 1.7907952070236206, "kl": 0.62255859375, "learning_rate": 1.4150013466019114e-07, "loss": 0.0249, "reward": 0.025391742354258895, "reward_std": 0.8453281559050083, "rewards/cosine_scaled_reward": -0.22053607925772667, "rewards/format_reward": 0.7500000260770321, "step": 438 }, { "completion_length": 1641.4167175292969, "epoch": 0.5017142857142857, "grad_norm": 2.076477527618408, "kl": 0.87103271484375, "learning_rate": 1.4019235263722034e-07, "loss": 0.0349, "reward": -0.12565073231235147, "reward_std": 0.695248618721962, "rewards/cosine_scaled_reward": -0.2208885379950516, "rewards/format_reward": 0.583333345130086, "step": 439 }, { "completion_length": 1336.645866394043, "epoch": 0.5028571428571429, "grad_norm": 3.641127586364746, "kl": 0.525146484375, "learning_rate": 1.3890454406082956e-07, "loss": 0.021, "reward": 0.20326408464461565, "reward_std": 0.780558280646801, "rewards/cosine_scaled_reward": -0.13858992606401443, "rewards/format_reward": 0.854166679084301, "step": 440 }, { "completion_length": 1554.1458740234375, "epoch": 0.504, "grad_norm": 2.14461612701416, "kl": 0.649658203125, "learning_rate": 1.3763677169699217e-07, "loss": 0.026, "reward": 0.04901134385727346, "reward_std": 0.6753638684749603, "rewards/cosine_scaled_reward": -0.1266509434208274, "rewards/format_reward": 0.625000013038516, "step": 441 }, { "completion_length": 1134.083366394043, "epoch": 0.5051428571428571, "grad_norm": 2.213015079498291, "kl": 0.38922119140625, "learning_rate": 1.3638909733514452e-07, "loss": 0.0156, "reward": 0.6072201561182737, "reward_std": 0.8717315904796124, "rewards/cosine_scaled_reward": 0.12222992815077305, "rewards/format_reward": 0.833333358168602, "step": 442 }, { "completion_length": 1613.5625381469727, "epoch": 0.5062857142857143, "grad_norm": 1.7041293382644653, "kl": 0.726806640625, "learning_rate": 1.351615817851748e-07, "loss": 0.0291, "reward": 0.07468948839232326, "reward_std": 0.6469858847558498, "rewards/cosine_scaled_reward": -0.17274294421076775, "rewards/format_reward": 0.7708333507180214, "step": 443 }, { "completion_length": 1118.9792137145996, "epoch": 0.5074285714285715, "grad_norm": 2.203859567642212, "kl": 0.5146484375, "learning_rate": 1.3395428487445914e-07, "loss": 0.0205, "reward": 0.109013965819031, "reward_std": 0.700001485645771, "rewards/cosine_scaled_reward": -0.17907634377479553, "rewards/format_reward": 0.833333358168602, "step": 444 }, { "completion_length": 1328.2500381469727, "epoch": 0.5085714285714286, "grad_norm": 2.1055400371551514, "kl": 0.700439453125, "learning_rate": 1.3276726544494571e-07, "loss": 0.0281, "reward": 0.003907807171344757, "reward_std": 0.6304442547261715, "rewards/cosine_scaled_reward": -0.1728717922233045, "rewards/format_reward": 0.7083333507180214, "step": 445 }, { "completion_length": 1408.5417098999023, "epoch": 0.5097142857142857, "grad_norm": 3.701373338699341, "kl": 1.005859375, "learning_rate": 1.316005813502869e-07, "loss": 0.0403, "reward": 0.18983037257567048, "reward_std": 0.7102062851190567, "rewards/cosine_scaled_reward": -0.07104396633803844, "rewards/format_reward": 0.6875000167638063, "step": 446 }, { "completion_length": 1182.68754196167, "epoch": 0.5108571428571429, "grad_norm": 2.484187126159668, "kl": 0.940673828125, "learning_rate": 1.3045428945301953e-07, "loss": 0.0376, "reward": 0.17063724854961038, "reward_std": 0.6657870672643185, "rewards/cosine_scaled_reward": -0.09112012386322021, "rewards/format_reward": 0.729166679084301, "step": 447 }, { "completion_length": 1155.3750228881836, "epoch": 0.512, "grad_norm": 9.17601490020752, "kl": 0.546142578125, "learning_rate": 1.2932844562179352e-07, "loss": 0.0218, "reward": 0.30222914123442024, "reward_std": 0.7583190239965916, "rewards/cosine_scaled_reward": -0.05191066488623619, "rewards/format_reward": 0.7916666865348816, "step": 448 }, { "completion_length": 1077.8541984558105, "epoch": 0.5131428571428571, "grad_norm": 2.589733839035034, "kl": 0.573486328125, "learning_rate": 1.2822310472864885e-07, "loss": 0.0229, "reward": 0.17757831397466362, "reward_std": 0.7745413295924664, "rewards/cosine_scaled_reward": -0.1571638728491962, "rewards/format_reward": 0.8541666865348816, "step": 449 }, { "completion_length": 1738.6667098999023, "epoch": 0.5142857142857142, "grad_norm": 3.1829066276550293, "kl": 1.14892578125, "learning_rate": 1.2713832064634125e-07, "loss": 0.0459, "reward": 0.03298312705010176, "reward_std": 0.7640566490590572, "rewards/cosine_scaled_reward": -0.14894709549844265, "rewards/format_reward": 0.6458333469927311, "step": 450 }, { "completion_length": 1184.0416870117188, "epoch": 0.5154285714285715, "grad_norm": 3.6981794834136963, "kl": 0.68731689453125, "learning_rate": 1.260741462457165e-07, "loss": 0.0275, "reward": 0.326223655953072, "reward_std": 0.9320776239037514, "rewards/cosine_scaled_reward": -0.049362530931830406, "rewards/format_reward": 0.7708333507180214, "step": 451 }, { "completion_length": 1356.0000228881836, "epoch": 0.5165714285714286, "grad_norm": 2.8852858543395996, "kl": 0.6221923828125, "learning_rate": 1.2503063339313356e-07, "loss": 0.0249, "reward": 0.23957047518342733, "reward_std": 0.6933315098285675, "rewards/cosine_scaled_reward": -0.05927852354943752, "rewards/format_reward": 0.7500000223517418, "step": 452 }, { "completion_length": 1344.395866394043, "epoch": 0.5177142857142857, "grad_norm": 2.0090019702911377, "kl": 0.6807861328125, "learning_rate": 1.2400783294793668e-07, "loss": 0.0272, "reward": 0.2872171855997294, "reward_std": 0.7287779673933983, "rewards/cosine_scaled_reward": -0.033939655870199203, "rewards/format_reward": 0.770833358168602, "step": 453 }, { "completion_length": 1270.8125762939453, "epoch": 0.5188571428571429, "grad_norm": 3.2378604412078857, "kl": 0.65252685546875, "learning_rate": 1.2300579475997657e-07, "loss": 0.0261, "reward": 0.11756114871241152, "reward_std": 0.7418475337326527, "rewards/cosine_scaled_reward": -0.15102737117558718, "rewards/format_reward": 0.770833358168602, "step": 454 }, { "completion_length": 1492.208351135254, "epoch": 0.52, "grad_norm": 4.2774338722229, "kl": 0.79248046875, "learning_rate": 1.220245676671809e-07, "loss": 0.0317, "reward": -0.018868495360948145, "reward_std": 0.6951228454709053, "rewards/cosine_scaled_reward": -0.20306831784546375, "rewards/format_reward": 0.6875000223517418, "step": 455 }, { "completion_length": 1657.2291870117188, "epoch": 0.5211428571428571, "grad_norm": 2.7355401515960693, "kl": 0.7105712890625, "learning_rate": 1.2106419949317388e-07, "loss": 0.0284, "reward": 0.18439527601003647, "reward_std": 0.7989875040948391, "rewards/cosine_scaled_reward": -0.12901971023529768, "rewards/format_reward": 0.7916666865348816, "step": 456 }, { "completion_length": 1063.9166946411133, "epoch": 0.5222857142857142, "grad_norm": 2.7030558586120605, "kl": 0.5423583984375, "learning_rate": 1.2012473704494537e-07, "loss": 0.0217, "reward": 0.31176955718547106, "reward_std": 0.8158461526036263, "rewards/cosine_scaled_reward": -0.0352974534034729, "rewards/format_reward": 0.7500000149011612, "step": 457 }, { "completion_length": 1281.9167175292969, "epoch": 0.5234285714285715, "grad_norm": 2.028135299682617, "kl": 0.7296142578125, "learning_rate": 1.1920622611056974e-07, "loss": 0.0292, "reward": 0.2823291067034006, "reward_std": 0.8681261576712132, "rewards/cosine_scaled_reward": -0.06675804499536753, "rewards/format_reward": 0.7708333488553762, "step": 458 }, { "completion_length": 1336.5208892822266, "epoch": 0.5245714285714286, "grad_norm": 3.601792573928833, "kl": 0.614990234375, "learning_rate": 1.1830871145697412e-07, "loss": 0.0246, "reward": 0.2878631625790149, "reward_std": 0.8964981138706207, "rewards/cosine_scaled_reward": -0.07753422670066357, "rewards/format_reward": 0.770833358168602, "step": 459 }, { "completion_length": 1851.0833740234375, "epoch": 0.5257142857142857, "grad_norm": 3.7353336811065674, "kl": 0.7938232421875, "learning_rate": 1.1743223682775649e-07, "loss": 0.0317, "reward": 0.22537417088460643, "reward_std": 0.8646272122859955, "rewards/cosine_scaled_reward": -0.08393661957234144, "rewards/format_reward": 0.7291666865348816, "step": 460 }, { "completion_length": 1549.5000534057617, "epoch": 0.5268571428571428, "grad_norm": 3.5757029056549072, "kl": 0.853271484375, "learning_rate": 1.1657684494105386e-07, "loss": 0.0342, "reward": 0.11143558099865913, "reward_std": 0.7968284860253334, "rewards/cosine_scaled_reward": -0.10749776661396027, "rewards/format_reward": 0.6250000186264515, "step": 461 }, { "completion_length": 1378.3542022705078, "epoch": 0.528, "grad_norm": 3.368990421295166, "kl": 0.8125, "learning_rate": 1.1574257748745986e-07, "loss": 0.0325, "reward": -0.07270677981432527, "reward_std": 0.5974620878696442, "rewards/cosine_scaled_reward": -0.23459232598543167, "rewards/format_reward": 0.708333358168602, "step": 462 }, { "completion_length": 1423.1250457763672, "epoch": 0.5291428571428571, "grad_norm": 3.6016039848327637, "kl": 0.7799072265625, "learning_rate": 1.1492947512799328e-07, "loss": 0.0312, "reward": 0.2612933642594726, "reward_std": 0.7466553151607513, "rewards/cosine_scaled_reward": -0.047830826602876186, "rewards/format_reward": 0.7500000223517418, "step": 463 }, { "completion_length": 1017.833366394043, "epoch": 0.5302857142857142, "grad_norm": 2.5817713737487793, "kl": 0.462646484375, "learning_rate": 1.1413757749211602e-07, "loss": 0.0185, "reward": 0.4776353507768363, "reward_std": 0.6480004005134106, "rewards/cosine_scaled_reward": 0.10391846485435963, "rewards/format_reward": 0.8958333395421505, "step": 464 }, { "completion_length": 1335.3542098999023, "epoch": 0.5314285714285715, "grad_norm": 2.435389995574951, "kl": 0.8916015625, "learning_rate": 1.1336692317580158e-07, "loss": 0.0357, "reward": -0.06275751371867955, "reward_std": 0.6158505566418171, "rewards/cosine_scaled_reward": -0.23374686716124415, "rewards/format_reward": 0.729166679084301, "step": 465 }, { "completion_length": 1214.8333587646484, "epoch": 0.5325714285714286, "grad_norm": 2.2548060417175293, "kl": 0.5836181640625, "learning_rate": 1.1261754973965422e-07, "loss": 0.0233, "reward": 0.37778762076050043, "reward_std": 0.9365783184766769, "rewards/cosine_scaled_reward": -0.0053373780101537704, "rewards/format_reward": 0.7708333488553762, "step": 466 }, { "completion_length": 1402.68754196167, "epoch": 0.5337142857142857, "grad_norm": 2.9286229610443115, "kl": 0.724578857421875, "learning_rate": 1.1188949370707787e-07, "loss": 0.029, "reward": 0.11761695193126798, "reward_std": 0.7316232472658157, "rewards/cosine_scaled_reward": -0.15633787866681814, "rewards/format_reward": 0.791666679084301, "step": 467 }, { "completion_length": 1510.9791946411133, "epoch": 0.5348571428571428, "grad_norm": 14.900471687316895, "kl": 2.21337890625, "learning_rate": 1.1118279056249653e-07, "loss": 0.0885, "reward": 0.21817315090447664, "reward_std": 0.9960628747940063, "rewards/cosine_scaled_reward": -0.07175269955769181, "rewards/format_reward": 0.6250000223517418, "step": 468 }, { "completion_length": 1016.2917098999023, "epoch": 0.536, "grad_norm": 6.350338459014893, "kl": 0.953369140625, "learning_rate": 1.1049747474962444e-07, "loss": 0.0382, "reward": 0.2713460554368794, "reward_std": 0.7287629023194313, "rewards/cosine_scaled_reward": -0.04988390300422907, "rewards/format_reward": 0.770833358168602, "step": 469 }, { "completion_length": 1466.6042251586914, "epoch": 0.5371428571428571, "grad_norm": 7.370168209075928, "kl": 1.435546875, "learning_rate": 1.0983357966978745e-07, "loss": 0.0574, "reward": 0.01497307000681758, "reward_std": 0.733777578920126, "rewards/cosine_scaled_reward": -0.16290199384093285, "rewards/format_reward": 0.6250000149011612, "step": 470 }, { "completion_length": 1592.7083740234375, "epoch": 0.5382857142857143, "grad_norm": 7.640194892883301, "kl": 1.49639892578125, "learning_rate": 1.0919113768029517e-07, "loss": 0.0598, "reward": 0.3828916675411165, "reward_std": 0.9707400016486645, "rewards/cosine_scaled_reward": 0.03202553070150316, "rewards/format_reward": 0.666666679084301, "step": 471 }, { "completion_length": 1145.1041946411133, "epoch": 0.5394285714285715, "grad_norm": 2.8886444568634033, "kl": 0.802978515625, "learning_rate": 1.0857018009286381e-07, "loss": 0.0321, "reward": 0.28387897345237434, "reward_std": 0.7857476621866226, "rewards/cosine_scaled_reward": -0.0928277347702533, "rewards/format_reward": 0.895833358168602, "step": 472 }, { "completion_length": 1445.7708740234375, "epoch": 0.5405714285714286, "grad_norm": 3.867896318435669, "kl": 1.0830078125, "learning_rate": 1.0797073717209013e-07, "loss": 0.0433, "reward": 0.11159952421439812, "reward_std": 0.7589086703956127, "rewards/cosine_scaled_reward": -0.1572155966423452, "rewards/format_reward": 0.770833358168602, "step": 473 }, { "completion_length": 1613.3750686645508, "epoch": 0.5417142857142857, "grad_norm": 4.905575275421143, "kl": 1.672607421875, "learning_rate": 1.0739283813397639e-07, "loss": 0.0669, "reward": 0.428771385923028, "reward_std": 1.020697444677353, "rewards/cosine_scaled_reward": 0.044653447810560465, "rewards/format_reward": 0.6458333563059568, "step": 474 }, { "completion_length": 1337.0417022705078, "epoch": 0.5428571428571428, "grad_norm": 4.179660320281982, "kl": 0.6060791015625, "learning_rate": 1.068365111445064e-07, "loss": 0.0242, "reward": 0.23191617615520954, "reward_std": 0.8730170913040638, "rewards/cosine_scaled_reward": -0.11908163363114, "rewards/format_reward": 0.8125000223517418, "step": 475 }, { "completion_length": 1265.6667098999023, "epoch": 0.544, "grad_norm": 6.457125186920166, "kl": 1.1802978515625, "learning_rate": 1.063017833182728e-07, "loss": 0.0473, "reward": 0.32982578047085553, "reward_std": 0.9863774701952934, "rewards/cosine_scaled_reward": -0.03677630145102739, "rewards/format_reward": 0.7291666865348816, "step": 476 }, { "completion_length": 998.0000381469727, "epoch": 0.5451428571428572, "grad_norm": 3.380866050720215, "kl": 0.7977294921875, "learning_rate": 1.0578868071715544e-07, "loss": 0.0319, "reward": 0.4656647043302655, "reward_std": 0.9266739711165428, "rewards/cosine_scaled_reward": 0.003877062350511551, "rewards/format_reward": 0.833333358168602, "step": 477 }, { "completion_length": 1333.4583892822266, "epoch": 0.5462857142857143, "grad_norm": 25.78436851501465, "kl": 2.43603515625, "learning_rate": 1.0529722834905125e-07, "loss": 0.0974, "reward": 0.3743471228517592, "reward_std": 0.8705001547932625, "rewards/cosine_scaled_reward": 0.015704313293099403, "rewards/format_reward": 0.7708333507180214, "step": 478 }, { "completion_length": 1580.6666946411133, "epoch": 0.5474285714285714, "grad_norm": 5.661951541900635, "kl": 1.4453125, "learning_rate": 1.0482745016665526e-07, "loss": 0.0579, "reward": 0.039567636558786035, "reward_std": 0.7153622172772884, "rewards/cosine_scaled_reward": -0.17540637124329805, "rewards/format_reward": 0.7083333507180214, "step": 479 }, { "completion_length": 1532.4792175292969, "epoch": 0.5485714285714286, "grad_norm": 10.698458671569824, "kl": 1.906494140625, "learning_rate": 1.0437936906629334e-07, "loss": 0.0764, "reward": 0.12518761213868856, "reward_std": 0.7352484799921513, "rewards/cosine_scaled_reward": -0.1592358397319913, "rewards/format_reward": 0.7708333507180214, "step": 480 }, { "completion_length": 1801.8959045410156, "epoch": 0.5497142857142857, "grad_norm": 7.208052158355713, "kl": 2.16015625, "learning_rate": 1.0395300688680625e-07, "loss": 0.0865, "reward": -0.068937080912292, "reward_std": 0.678013302385807, "rewards/cosine_scaled_reward": -0.21540158614516258, "rewards/format_reward": 0.6666666865348816, "step": 481 }, { "completion_length": 1408.2083587646484, "epoch": 0.5508571428571428, "grad_norm": 28.815509796142578, "kl": 2.3358154296875, "learning_rate": 1.0354838440848501e-07, "loss": 0.0934, "reward": 0.2200728515163064, "reward_std": 0.8331051766872406, "rewards/cosine_scaled_reward": -0.07284185755997896, "rewards/format_reward": 0.7083333544433117, "step": 482 }, { "completion_length": 1403.6250457763672, "epoch": 0.552, "grad_norm": 4.524230480194092, "kl": 0.8890380859375, "learning_rate": 1.0316552135205837e-07, "loss": 0.0355, "reward": 0.26419115875614807, "reward_std": 0.8404405452311039, "rewards/cosine_scaled_reward": -0.07794593391008675, "rewards/format_reward": 0.7500000298023224, "step": 483 }, { "completion_length": 1195.2916870117188, "epoch": 0.5531428571428572, "grad_norm": 3.5205888748168945, "kl": 1.357666015625, "learning_rate": 1.0280443637773163e-07, "loss": 0.0543, "reward": 0.28255582111887634, "reward_std": 0.9850383549928665, "rewards/cosine_scaled_reward": -0.051978057832457125, "rewards/format_reward": 0.6875000260770321, "step": 484 }, { "completion_length": 1259.208366394043, "epoch": 0.5542857142857143, "grad_norm": 2.4940898418426514, "kl": 0.69293212890625, "learning_rate": 1.0246514708427701e-07, "loss": 0.0277, "reward": 0.11278902753838338, "reward_std": 0.8309934213757515, "rewards/cosine_scaled_reward": -0.1679881983436644, "rewards/format_reward": 0.7708333432674408, "step": 485 }, { "completion_length": 972.7291946411133, "epoch": 0.5554285714285714, "grad_norm": 5.1607160568237305, "kl": 1.142333984375, "learning_rate": 1.0214767000817596e-07, "loss": 0.0456, "reward": 0.25730944075621665, "reward_std": 0.7034793458878994, "rewards/cosine_scaled_reward": -0.04670877754688263, "rewards/format_reward": 0.8125000223517418, "step": 486 }, { "completion_length": 1239.770881652832, "epoch": 0.5565714285714286, "grad_norm": 4.1862897872924805, "kl": 0.94287109375, "learning_rate": 1.0185202062281336e-07, "loss": 0.0377, "reward": 0.5008484733989462, "reward_std": 0.818550631403923, "rewards/cosine_scaled_reward": 0.08279422484338284, "rewards/format_reward": 0.8541666865348816, "step": 487 }, { "completion_length": 1284.854206085205, "epoch": 0.5577142857142857, "grad_norm": 3.6226584911346436, "kl": 0.8634033203125, "learning_rate": 1.0157821333772304e-07, "loss": 0.0346, "reward": 0.09361092420294881, "reward_std": 0.6568006910383701, "rewards/cosine_scaled_reward": -0.13768697017803788, "rewards/format_reward": 0.7708333488553762, "step": 488 }, { "completion_length": 1505.7708930969238, "epoch": 0.5588571428571428, "grad_norm": 3.779109477996826, "kl": 1.6417236328125, "learning_rate": 1.013262614978859e-07, "loss": 0.0656, "reward": -0.12699575908482075, "reward_std": 0.6261985003948212, "rewards/cosine_scaled_reward": -0.29026195663027465, "rewards/format_reward": 0.7291666865348816, "step": 489 }, { "completion_length": 1053.770866394043, "epoch": 0.56, "grad_norm": 1.6185153722763062, "kl": 0.48480224609375, "learning_rate": 1.0109617738307911e-07, "loss": 0.0194, "reward": 0.23615788342431188, "reward_std": 0.704241368919611, "rewards/cosine_scaled_reward": -0.062381197698414326, "rewards/format_reward": 0.833333358168602, "step": 490 }, { "completion_length": 1363.4167022705078, "epoch": 0.5611428571428572, "grad_norm": 7.597695350646973, "kl": 1.367431640625, "learning_rate": 1.0088797220727779e-07, "loss": 0.0547, "reward": 0.5214339741505682, "reward_std": 1.070147231221199, "rewards/cosine_scaled_reward": 0.063262770883739, "rewards/format_reward": 0.7500000149011612, "step": 491 }, { "completion_length": 1276.8542022705078, "epoch": 0.5622857142857143, "grad_norm": 3.7099130153656006, "kl": 0.9229736328125, "learning_rate": 1.0070165611810855e-07, "loss": 0.037, "reward": 0.25723157986067235, "reward_std": 0.7029771581292152, "rewards/cosine_scaled_reward": -0.017836466431617737, "rewards/format_reward": 0.7291666939854622, "step": 492 }, { "completion_length": 1266.0625534057617, "epoch": 0.5634285714285714, "grad_norm": 3.5932085514068604, "kl": 1.254150390625, "learning_rate": 1.005372381963547e-07, "loss": 0.0501, "reward": 0.2146676443517208, "reward_std": 0.8730643317103386, "rewards/cosine_scaled_reward": -0.10545299621298909, "rewards/format_reward": 0.7500000223517418, "step": 493 }, { "completion_length": 1115.4792022705078, "epoch": 0.5645714285714286, "grad_norm": 2.4263534545898438, "kl": 0.810546875, "learning_rate": 1.0039472645551372e-07, "loss": 0.0324, "reward": 0.29131725314073265, "reward_std": 0.9221536330878735, "rewards/cosine_scaled_reward": -0.07916020415723324, "rewards/format_reward": 0.7708333507180214, "step": 494 }, { "completion_length": 1731.3750686645508, "epoch": 0.5657142857142857, "grad_norm": 15.834768295288086, "kl": 2.26806640625, "learning_rate": 1.002741278414069e-07, "loss": 0.0906, "reward": 0.2820148948812857, "reward_std": 0.9001934975385666, "rewards/cosine_scaled_reward": -0.013229399919509888, "rewards/format_reward": 0.6458333507180214, "step": 495 }, { "completion_length": 1282.9583702087402, "epoch": 0.5668571428571428, "grad_norm": 3.1927053928375244, "kl": 1.07080078125, "learning_rate": 1.0017544823184055e-07, "loss": 0.0428, "reward": 0.2954734539380297, "reward_std": 0.7768198624253273, "rewards/cosine_scaled_reward": -0.05057825893163681, "rewards/format_reward": 0.7708333656191826, "step": 496 }, { "completion_length": 1330.3541946411133, "epoch": 0.568, "grad_norm": 3.622467279434204, "kl": 1.56591796875, "learning_rate": 1.0009869243631952e-07, "loss": 0.0626, "reward": 0.2870326414704323, "reward_std": 0.9923329427838326, "rewards/cosine_scaled_reward": -0.05224178615026176, "rewards/format_reward": 0.666666692122817, "step": 497 }, { "completion_length": 1888.2083435058594, "epoch": 0.5691428571428572, "grad_norm": 8.798296928405762, "kl": 2.22607421875, "learning_rate": 1.000438641958131e-07, "loss": 0.089, "reward": 0.09313779044896364, "reward_std": 0.8005350790917873, "rewards/cosine_scaled_reward": -0.1090406347066164, "rewards/format_reward": 0.5833333488553762, "step": 498 }, { "completion_length": 1403.8958587646484, "epoch": 0.5702857142857143, "grad_norm": 3.579638957977295, "kl": 1.40771484375, "learning_rate": 1.0001096618257236e-07, "loss": 0.0563, "reward": 0.31425497168675065, "reward_std": 0.9142299555242062, "rewards/cosine_scaled_reward": -0.03276859223842621, "rewards/format_reward": 0.7291666902601719, "step": 499 }, { "completion_length": 1698.9792022705078, "epoch": 0.5714285714285714, "grad_norm": 178.20928955078125, "kl": 7.2745361328125, "learning_rate": 1e-07, "loss": 0.2911, "reward": 0.14990665763616562, "reward_std": 0.7996302992105484, "rewards/cosine_scaled_reward": -0.12167917937040329, "rewards/format_reward": 0.7083333563059568, "step": 500 }, { "epoch": 0.5714285714285714, "step": 500, "total_flos": 0.0, "train_loss": 0.01018991470336914, "train_runtime": 16191.746, "train_samples_per_second": 1.482, "train_steps_per_second": 0.031 } ], "logging_steps": 1, "max_steps": 500, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 50, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 6, "trial_name": null, "trial_params": null }