diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,6542 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.5714285714285714, + "eval_steps": 500, + "global_step": 500, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "completion_length": 2571.2083587646484, + "epoch": 0.001142857142857143, + "grad_norm": 0.19757072627544403, + "kl": 0.0, + "learning_rate": 2e-08, + "loss": -0.0, + "reward": 0.17825700528919697, + "reward_std": 0.804851658642292, + "rewards/cosine_scaled_reward": -0.015534311532974243, + "rewards/format_reward": 0.5208333488553762, + "step": 1 + }, + { + "completion_length": 2804.395881652832, + "epoch": 0.002285714285714286, + "grad_norm": 0.18212556838989258, + "kl": 0.0, + "learning_rate": 4e-08, + "loss": -0.0, + "reward": -0.1223274078220129, + "reward_std": 0.46937728114426136, + "rewards/cosine_scaled_reward": -0.04980122856795788, + "rewards/format_reward": 0.37500000558793545, + "step": 2 + }, + { + "completion_length": 3303.9583435058594, + "epoch": 0.0034285714285714284, + "grad_norm": 0.16496071219444275, + "kl": 4.756450653076172e-05, + "learning_rate": 6e-08, + "loss": 0.0, + "reward": -0.5349157964810729, + "reward_std": 0.4061080813407898, + "rewards/cosine_scaled_reward": -0.2544318288564682, + "rewards/format_reward": 0.1458333395421505, + "step": 3 + }, + { + "completion_length": 2260.6875228881836, + "epoch": 0.004571428571428572, + "grad_norm": 0.26921820640563965, + "kl": 3.6716461181640625e-05, + "learning_rate": 8e-08, + "loss": 0.0, + "reward": 0.1224252681422513, + "reward_std": 0.812014251947403, + "rewards/cosine_scaled_reward": -0.09193882904946804, + "rewards/format_reward": 0.6458333414047956, + "step": 4 + }, + { + "completion_length": 3346.6041870117188, + "epoch": 0.005714285714285714, + "grad_norm": 0.1722181737422943, + "kl": 4.376843571662903e-05, + "learning_rate": 1e-07, + "loss": 0.0, + "reward": -0.38172444701194763, + "reward_std": 0.5492917411029339, + "rewards/cosine_scaled_reward": -0.22456051781773567, + "rewards/format_reward": 0.25000000558793545, + "step": 5 + }, + { + "completion_length": 3113.7500610351562, + "epoch": 0.006857142857142857, + "grad_norm": 0.21919280290603638, + "kl": 4.5668333768844604e-05, + "learning_rate": 1.2e-07, + "loss": 0.0, + "reward": -0.2863283231854439, + "reward_std": 0.5716646872460842, + "rewards/cosine_scaled_reward": -0.19011332368245348, + "rewards/format_reward": 0.29166667349636555, + "step": 6 + }, + { + "completion_length": 3158.8333740234375, + "epoch": 0.008, + "grad_norm": 0.1657346487045288, + "kl": 2.4143606424331665e-05, + "learning_rate": 1.4e-07, + "loss": 0.0, + "reward": -0.0321456715464592, + "reward_std": 0.6449971524998546, + "rewards/cosine_scaled_reward": -0.1226729229092598, + "rewards/format_reward": 0.4791666828095913, + "step": 7 + }, + { + "completion_length": 2815.1250610351562, + "epoch": 0.009142857142857144, + "grad_norm": 0.15989142656326294, + "kl": 2.526119351387024e-05, + "learning_rate": 1.6e-07, + "loss": 0.0, + "reward": 0.3240444455295801, + "reward_std": 0.883681982755661, + "rewards/cosine_scaled_reward": 0.1182668274268508, + "rewards/format_reward": 0.5000000111758709, + "step": 8 + }, + { + "completion_length": 3149.0625915527344, + "epoch": 0.010285714285714285, + "grad_norm": 0.21666041016578674, + "kl": 4.5686960220336914e-05, + "learning_rate": 1.8e-07, + "loss": 0.0, + "reward": -0.18796737492084503, + "reward_std": 0.7943232320249081, + "rewards/cosine_scaled_reward": -0.13237779098562896, + "rewards/format_reward": 0.27083334140479565, + "step": 9 + }, + { + "completion_length": 2782.3750228881836, + "epoch": 0.011428571428571429, + "grad_norm": 0.18924005329608917, + "kl": 3.37064266204834e-05, + "learning_rate": 2e-07, + "loss": 0.0, + "reward": 0.11666052648797631, + "reward_std": 0.879204161465168, + "rewards/cosine_scaled_reward": -0.012820702977478504, + "rewards/format_reward": 0.41666667349636555, + "step": 10 + }, + { + "completion_length": 3473.062530517578, + "epoch": 0.012571428571428572, + "grad_norm": 0.20001095533370972, + "kl": 4.3779611587524414e-05, + "learning_rate": 2.1999999999999998e-07, + "loss": 0.0, + "reward": -0.6018680967390537, + "reward_std": 0.4545674379914999, + "rewards/cosine_scaled_reward": -0.28993429616093636, + "rewards/format_reward": 0.1041666679084301, + "step": 11 + }, + { + "completion_length": 2469.6667098999023, + "epoch": 0.013714285714285714, + "grad_norm": 0.24237319827079773, + "kl": 4.6290457248687744e-05, + "learning_rate": 2.4e-07, + "loss": 0.0, + "reward": 0.08407351560890675, + "reward_std": 0.751841738820076, + "rewards/cosine_scaled_reward": -0.1303967982530594, + "rewards/format_reward": 0.6458333414047956, + "step": 12 + }, + { + "completion_length": 2778.1458587646484, + "epoch": 0.014857142857142857, + "grad_norm": 0.18375596404075623, + "kl": 4.48375940322876e-05, + "learning_rate": 2.6e-07, + "loss": 0.0, + "reward": 0.09226825274527073, + "reward_std": 0.6979338899254799, + "rewards/cosine_scaled_reward": -0.015000073239207268, + "rewards/format_reward": 0.4166666865348816, + "step": 13 + }, + { + "completion_length": 2874.750045776367, + "epoch": 0.016, + "grad_norm": 0.1823539286851883, + "kl": 2.8234906494617462e-05, + "learning_rate": 2.8e-07, + "loss": 0.0, + "reward": -0.11271460726857185, + "reward_std": 0.7010148204863071, + "rewards/cosine_scaled_reward": -0.14169861702248454, + "rewards/format_reward": 0.39583333395421505, + "step": 14 + }, + { + "completion_length": 2797.395854949951, + "epoch": 0.017142857142857144, + "grad_norm": 0.20054183900356293, + "kl": 2.563674934208393e-05, + "learning_rate": 3e-07, + "loss": 0.0, + "reward": 0.018786390544846654, + "reward_std": 0.6112043932080269, + "rewards/cosine_scaled_reward": 0.03721225541085005, + "rewards/format_reward": 0.39583333767950535, + "step": 15 + }, + { + "completion_length": 3453.1458435058594, + "epoch": 0.018285714285714287, + "grad_norm": 0.18542714416980743, + "kl": 4.1812658309936523e-05, + "learning_rate": 3.2e-07, + "loss": 0.0, + "reward": -0.5245386594906449, + "reward_std": 0.5483577623963356, + "rewards/cosine_scaled_reward": -0.2334668217226863, + "rewards/format_reward": 0.06250000186264515, + "step": 16 + }, + { + "completion_length": 2326.8750610351562, + "epoch": 0.019428571428571427, + "grad_norm": 0.2586069405078888, + "kl": 3.917887806892395e-05, + "learning_rate": 3.4000000000000003e-07, + "loss": 0.0, + "reward": 0.29090012004598975, + "reward_std": 0.7025135271251202, + "rewards/cosine_scaled_reward": 0.04808543558465317, + "rewards/format_reward": 0.6666666716337204, + "step": 17 + }, + { + "completion_length": 2884.708366394043, + "epoch": 0.02057142857142857, + "grad_norm": 0.17255671322345734, + "kl": 2.2798776626586914e-05, + "learning_rate": 3.6e-07, + "loss": 0.0, + "reward": -0.06478883884847164, + "reward_std": 0.6950070075690746, + "rewards/cosine_scaled_reward": -0.09088864922523499, + "rewards/format_reward": 0.43750000558793545, + "step": 18 + }, + { + "completion_length": 2841.604202270508, + "epoch": 0.021714285714285714, + "grad_norm": 0.19047367572784424, + "kl": 3.0055642127990723e-05, + "learning_rate": 3.7999999999999996e-07, + "loss": 0.0, + "reward": 0.2590886615216732, + "reward_std": 0.9022833462804556, + "rewards/cosine_scaled_reward": 0.051947877276688814, + "rewards/format_reward": 0.4375000111758709, + "step": 19 + }, + { + "completion_length": 2353.1042289733887, + "epoch": 0.022857142857142857, + "grad_norm": 0.194220632314682, + "kl": 1.3179145753383636e-05, + "learning_rate": 4e-07, + "loss": 0.0, + "reward": 0.2806552555412054, + "reward_std": 0.8725750483572483, + "rewards/cosine_scaled_reward": 0.004965861327946186, + "rewards/format_reward": 0.6666666809469461, + "step": 20 + }, + { + "completion_length": 2684.687545776367, + "epoch": 0.024, + "grad_norm": 0.19551798701286316, + "kl": 4.254281520843506e-05, + "learning_rate": 4.1999999999999995e-07, + "loss": 0.0, + "reward": 0.09347914904356003, + "reward_std": 0.8260357603430748, + "rewards/cosine_scaled_reward": -0.07159630116075277, + "rewards/format_reward": 0.5000000074505806, + "step": 21 + }, + { + "completion_length": 1796.4375381469727, + "epoch": 0.025142857142857144, + "grad_norm": 0.3671523332595825, + "kl": 4.005804657936096e-05, + "learning_rate": 4.3999999999999997e-07, + "loss": 0.0, + "reward": 0.19362880755215883, + "reward_std": 0.8532019667327404, + "rewards/cosine_scaled_reward": -0.06471679511014372, + "rewards/format_reward": 0.7291666697710752, + "step": 22 + }, + { + "completion_length": 2500.062530517578, + "epoch": 0.026285714285714287, + "grad_norm": 0.22526901960372925, + "kl": 3.0137598514556885e-05, + "learning_rate": 4.6e-07, + "loss": 0.0, + "reward": 0.041429003700613976, + "reward_std": 0.7314254455268383, + "rewards/cosine_scaled_reward": -0.07520224852487445, + "rewards/format_reward": 0.5208333469927311, + "step": 23 + }, + { + "completion_length": 2648.291717529297, + "epoch": 0.027428571428571427, + "grad_norm": 0.21498121321201324, + "kl": 1.6693025827407837e-05, + "learning_rate": 4.8e-07, + "loss": 0.0, + "reward": 0.21239036042243242, + "reward_std": 0.6812088377773762, + "rewards/cosine_scaled_reward": 0.027016831561923027, + "rewards/format_reward": 0.5833333469927311, + "step": 24 + }, + { + "completion_length": 2769.2291946411133, + "epoch": 0.02857142857142857, + "grad_norm": 0.23084591329097748, + "kl": 3.578886389732361e-05, + "learning_rate": 5e-07, + "loss": 0.0, + "reward": 0.04653824120759964, + "reward_std": 0.7504720520228148, + "rewards/cosine_scaled_reward": -0.0376815393101424, + "rewards/format_reward": 0.3958333432674408, + "step": 25 + }, + { + "completion_length": 2962.562530517578, + "epoch": 0.029714285714285714, + "grad_norm": 0.16293752193450928, + "kl": 3.400072455406189e-05, + "learning_rate": 5.2e-07, + "loss": 0.0, + "reward": 0.010799127630889416, + "reward_std": 0.5875276923179626, + "rewards/cosine_scaled_reward": -0.043792182579636574, + "rewards/format_reward": 0.4791666716337204, + "step": 26 + }, + { + "completion_length": 2952.166717529297, + "epoch": 0.030857142857142857, + "grad_norm": 0.2215096801519394, + "kl": 3.3194024581462145e-05, + "learning_rate": 5.4e-07, + "loss": 0.0, + "reward": 0.11158058885484934, + "reward_std": 0.8504065573215485, + "rewards/cosine_scaled_reward": -0.05920940637588501, + "rewards/format_reward": 0.5208333469927311, + "step": 27 + }, + { + "completion_length": 2872.0625, + "epoch": 0.032, + "grad_norm": 0.18577344715595245, + "kl": 3.726780414581299e-05, + "learning_rate": 5.6e-07, + "loss": 0.0, + "reward": 0.13817322719842196, + "reward_std": 0.7248105835169554, + "rewards/cosine_scaled_reward": 0.027827581390738487, + "rewards/format_reward": 0.47916666977107525, + "step": 28 + }, + { + "completion_length": 3446.5833435058594, + "epoch": 0.03314285714285714, + "grad_norm": 0.21212224662303925, + "kl": 2.4769455194473267e-05, + "learning_rate": 5.8e-07, + "loss": 0.0, + "reward": -0.3688106779009104, + "reward_std": 0.6171325668692589, + "rewards/cosine_scaled_reward": -0.17023007571697235, + "rewards/format_reward": 0.14583333767950535, + "step": 29 + }, + { + "completion_length": 2783.3333587646484, + "epoch": 0.03428571428571429, + "grad_norm": 0.17819644510746002, + "kl": 2.261658664792776e-05, + "learning_rate": 6e-07, + "loss": 0.0, + "reward": 0.2445693917106837, + "reward_std": 0.9377000778913498, + "rewards/cosine_scaled_reward": 0.03347432427108288, + "rewards/format_reward": 0.4791666753590107, + "step": 30 + }, + { + "completion_length": 3050.2291870117188, + "epoch": 0.03542857142857143, + "grad_norm": 0.16932828724384308, + "kl": 2.38027423620224e-05, + "learning_rate": 6.2e-07, + "loss": 0.0, + "reward": -0.16898724623024464, + "reward_std": 0.648616686463356, + "rewards/cosine_scaled_reward": -0.11072872229851782, + "rewards/format_reward": 0.2708333395421505, + "step": 31 + }, + { + "completion_length": 3094.1250610351562, + "epoch": 0.036571428571428574, + "grad_norm": 0.18312151730060577, + "kl": 2.9257498681545258e-05, + "learning_rate": 6.4e-07, + "loss": 0.0, + "reward": 0.004837207496166229, + "reward_std": 0.6736676879227161, + "rewards/cosine_scaled_reward": -0.017333179406705312, + "rewards/format_reward": 0.3333333395421505, + "step": 32 + }, + { + "completion_length": 3368.562530517578, + "epoch": 0.037714285714285714, + "grad_norm": 0.15040728449821472, + "kl": 2.4586915969848633e-05, + "learning_rate": 6.6e-07, + "loss": 0.0, + "reward": -0.28359657526016235, + "reward_std": 0.6239799037575722, + "rewards/cosine_scaled_reward": -0.11568338703364134, + "rewards/format_reward": 0.1458333395421505, + "step": 33 + }, + { + "completion_length": 2505.1875610351562, + "epoch": 0.038857142857142854, + "grad_norm": 0.30060505867004395, + "kl": 2.2239633835852146e-05, + "learning_rate": 6.800000000000001e-07, + "loss": 0.0, + "reward": 0.27598637342453003, + "reward_std": 0.8686297200620174, + "rewards/cosine_scaled_reward": 0.011391445528715849, + "rewards/format_reward": 0.5416666716337204, + "step": 34 + }, + { + "completion_length": 3063.2708740234375, + "epoch": 0.04, + "grad_norm": 0.24084553122520447, + "kl": 4.2811036109924316e-05, + "learning_rate": 7e-07, + "loss": 0.0, + "reward": -0.03728431276977062, + "reward_std": 0.9247638881206512, + "rewards/cosine_scaled_reward": -0.1015674127265811, + "rewards/format_reward": 0.3541666716337204, + "step": 35 + }, + { + "completion_length": 3358.7291870117188, + "epoch": 0.04114285714285714, + "grad_norm": 0.17074225842952728, + "kl": 2.5674700736999512e-05, + "learning_rate": 7.2e-07, + "loss": 0.0, + "reward": -0.4219683278352022, + "reward_std": 0.516293577849865, + "rewards/cosine_scaled_reward": -0.20600515604019165, + "rewards/format_reward": 0.1875000074505806, + "step": 36 + }, + { + "completion_length": 3300.791717529297, + "epoch": 0.04228571428571429, + "grad_norm": 0.1499692052602768, + "kl": 1.6324222087860107e-05, + "learning_rate": 7.4e-07, + "loss": 0.0, + "reward": -0.42433968995464966, + "reward_std": 0.5183847993612289, + "rewards/cosine_scaled_reward": -0.21130944415926933, + "rewards/format_reward": 0.1875000074505806, + "step": 37 + }, + { + "completion_length": 3274.9166870117188, + "epoch": 0.04342857142857143, + "grad_norm": 0.18995541334152222, + "kl": 2.5459565222263336e-05, + "learning_rate": 7.599999999999999e-07, + "loss": 0.0, + "reward": -0.42816436290740967, + "reward_std": 0.3850083723664284, + "rewards/cosine_scaled_reward": -0.16101331263780594, + "rewards/format_reward": 0.14583333395421505, + "step": 38 + }, + { + "completion_length": 2845.81254196167, + "epoch": 0.044571428571428574, + "grad_norm": 0.16534960269927979, + "kl": 1.8164515495300293e-05, + "learning_rate": 7.799999999999999e-07, + "loss": 0.0, + "reward": -0.08653704356402159, + "reward_std": 0.5173388682305813, + "rewards/cosine_scaled_reward": -0.039690399542450905, + "rewards/format_reward": 0.3750000111758709, + "step": 39 + }, + { + "completion_length": 2536.583381652832, + "epoch": 0.045714285714285714, + "grad_norm": 0.25925570726394653, + "kl": 2.086721360683441e-05, + "learning_rate": 8e-07, + "loss": 0.0, + "reward": -0.0936742543708533, + "reward_std": 0.5705901933833957, + "rewards/cosine_scaled_reward": -0.09443798521533608, + "rewards/format_reward": 0.5000000093132257, + "step": 40 + }, + { + "completion_length": 2954.979217529297, + "epoch": 0.046857142857142854, + "grad_norm": 0.17119692265987396, + "kl": 5.511566996574402e-06, + "learning_rate": 8.199999999999999e-07, + "loss": 0.0, + "reward": -0.2362358495593071, + "reward_std": 0.6107039824128151, + "rewards/cosine_scaled_reward": -0.21764612046536058, + "rewards/format_reward": 0.41666667349636555, + "step": 41 + }, + { + "completion_length": 2852.250020980835, + "epoch": 0.048, + "grad_norm": 0.2615634500980377, + "kl": 3.533065319061279e-05, + "learning_rate": 8.399999999999999e-07, + "loss": 0.0, + "reward": -0.5097283767536283, + "reward_std": 0.4514521397650242, + "rewards/cosine_scaled_reward": -0.2993845697492361, + "rewards/format_reward": 0.27083333395421505, + "step": 42 + }, + { + "completion_length": 3095.4583435058594, + "epoch": 0.04914285714285714, + "grad_norm": 0.16259542107582092, + "kl": 2.100318670272827e-05, + "learning_rate": 8.599999999999999e-07, + "loss": 0.0, + "reward": -0.09295706450939178, + "reward_std": 0.6536407507956028, + "rewards/cosine_scaled_reward": -0.05758994724601507, + "rewards/format_reward": 0.25, + "step": 43 + }, + { + "completion_length": 2561.1875381469727, + "epoch": 0.05028571428571429, + "grad_norm": 0.2526821494102478, + "kl": 8.388608694076538e-05, + "learning_rate": 8.799999999999999e-07, + "loss": 0.0, + "reward": 0.15549907088279724, + "reward_std": 0.8106728717684746, + "rewards/cosine_scaled_reward": -0.022572665475308895, + "rewards/format_reward": 0.5000000111758709, + "step": 44 + }, + { + "completion_length": 3464.729217529297, + "epoch": 0.05142857142857143, + "grad_norm": 0.14498470723628998, + "kl": 2.6337802410125732e-05, + "learning_rate": 9e-07, + "loss": 0.0, + "reward": -0.25844255089759827, + "reward_std": 0.6470912098884583, + "rewards/cosine_scaled_reward": -0.11992522329092026, + "rewards/format_reward": 0.1666666716337204, + "step": 45 + }, + { + "completion_length": 3175.437530517578, + "epoch": 0.052571428571428575, + "grad_norm": 0.18001262843608856, + "kl": 4.996359348297119e-05, + "learning_rate": 9.2e-07, + "loss": 0.0, + "reward": -0.4702068418264389, + "reward_std": 0.43945666775107384, + "rewards/cosine_scaled_reward": -0.22439202293753624, + "rewards/format_reward": 0.18750000186264515, + "step": 46 + }, + { + "completion_length": 2691.541763305664, + "epoch": 0.053714285714285714, + "grad_norm": 0.24902203679084778, + "kl": 2.5863759219646454e-05, + "learning_rate": 9.399999999999999e-07, + "loss": 0.0, + "reward": 0.3736280268058181, + "reward_std": 0.9081602022051811, + "rewards/cosine_scaled_reward": 0.09569812752306461, + "rewards/format_reward": 0.5416666809469461, + "step": 47 + }, + { + "completion_length": 2735.2292251586914, + "epoch": 0.054857142857142854, + "grad_norm": 0.2746092677116394, + "kl": 0.0001163184642791748, + "learning_rate": 9.6e-07, + "loss": 0.0, + "reward": 0.02801407827064395, + "reward_std": 0.8248592298477888, + "rewards/cosine_scaled_reward": -0.05238310806453228, + "rewards/format_reward": 0.3958333358168602, + "step": 48 + }, + { + "completion_length": 2386.5416984558105, + "epoch": 0.056, + "grad_norm": 0.2179473340511322, + "kl": 5.142390727996826e-05, + "learning_rate": 9.8e-07, + "loss": 0.0, + "reward": 0.3018168299458921, + "reward_std": 0.9894729033112526, + "rewards/cosine_scaled_reward": 0.020154984667897224, + "rewards/format_reward": 0.5833333376795053, + "step": 49 + }, + { + "completion_length": 2974.583354949951, + "epoch": 0.05714285714285714, + "grad_norm": 0.19161240756511688, + "kl": 8.691102266311646e-05, + "learning_rate": 1e-06, + "loss": 0.0, + "reward": -0.04128398001194, + "reward_std": 0.6844424605369568, + "rewards/cosine_scaled_reward": -0.014122288441285491, + "rewards/format_reward": 0.3333333358168602, + "step": 50 + }, + { + "completion_length": 2252.375045776367, + "epoch": 0.05828571428571429, + "grad_norm": 0.22708459198474884, + "kl": 0.0002549290657043457, + "learning_rate": 9.999890338174275e-07, + "loss": 0.0, + "reward": 0.13623822387307882, + "reward_std": 0.7222620993852615, + "rewards/cosine_scaled_reward": -0.04494801629334688, + "rewards/format_reward": 0.5416666772216558, + "step": 51 + }, + { + "completion_length": 2870.2291870117188, + "epoch": 0.05942857142857143, + "grad_norm": 0.21368886530399323, + "kl": 0.0001485683023929596, + "learning_rate": 9.999561358041868e-07, + "loss": 0.0, + "reward": 0.015072201727889478, + "reward_std": 0.9205258414149284, + "rewards/cosine_scaled_reward": -0.06388110015541315, + "rewards/format_reward": 0.37500000186264515, + "step": 52 + }, + { + "completion_length": 2698.0000762939453, + "epoch": 0.060571428571428575, + "grad_norm": 0.2288151979446411, + "kl": 0.00016783177852630615, + "learning_rate": 9.999013075636804e-07, + "loss": 0.0, + "reward": 0.10895399999571964, + "reward_std": 0.8917515203356743, + "rewards/cosine_scaled_reward": -0.04874301888048649, + "rewards/format_reward": 0.5208333395421505, + "step": 53 + }, + { + "completion_length": 2874.979248046875, + "epoch": 0.061714285714285715, + "grad_norm": 0.16582155227661133, + "kl": 4.6576838940382004e-05, + "learning_rate": 9.998245517681593e-07, + "loss": 0.0, + "reward": 0.31593899679137394, + "reward_std": 0.9704541265964508, + "rewards/cosine_scaled_reward": 0.09179376903921366, + "rewards/format_reward": 0.5000000149011612, + "step": 54 + }, + { + "completion_length": 3011.875045776367, + "epoch": 0.06285714285714286, + "grad_norm": 0.16874325275421143, + "kl": 0.0001902114599943161, + "learning_rate": 9.997258721585931e-07, + "loss": 0.0, + "reward": 0.1326095201075077, + "reward_std": 0.8541341703385115, + "rewards/cosine_scaled_reward": 0.022567307110875845, + "rewards/format_reward": 0.45833333767950535, + "step": 55 + }, + { + "completion_length": 2914.1458892822266, + "epoch": 0.064, + "grad_norm": 0.17759209871292114, + "kl": 8.147954940795898e-05, + "learning_rate": 9.996052735444862e-07, + "loss": 0.0, + "reward": 0.021647373214364052, + "reward_std": 0.5984874460846186, + "rewards/cosine_scaled_reward": -0.05477431882172823, + "rewards/format_reward": 0.4375000111758709, + "step": 56 + }, + { + "completion_length": 3319.916717529297, + "epoch": 0.06514285714285714, + "grad_norm": 0.13513678312301636, + "kl": 6.243959069252014e-05, + "learning_rate": 9.994627618036452e-07, + "loss": 0.0, + "reward": -0.10464489553123713, + "reward_std": 0.9139657057821751, + "rewards/cosine_scaled_reward": -0.12102228635922074, + "rewards/format_reward": 0.3333333395421505, + "step": 57 + }, + { + "completion_length": 2223.3750534057617, + "epoch": 0.06628571428571428, + "grad_norm": 0.19678883254528046, + "kl": 0.0008740425109863281, + "learning_rate": 9.992983438818915e-07, + "loss": 0.0, + "reward": 0.3077916601905599, + "reward_std": 0.7720399703830481, + "rewards/cosine_scaled_reward": 0.0032341796904802322, + "rewards/format_reward": 0.6875000111758709, + "step": 58 + }, + { + "completion_length": 2857.1041870117188, + "epoch": 0.06742857142857143, + "grad_norm": 0.16988952457904816, + "kl": 7.66068696975708e-05, + "learning_rate": 9.991120277927223e-07, + "loss": 0.0, + "reward": -0.12906377390027046, + "reward_std": 0.5678401403129101, + "rewards/cosine_scaled_reward": -0.09338383004069328, + "rewards/format_reward": 0.33333333395421505, + "step": 59 + }, + { + "completion_length": 3053.0208740234375, + "epoch": 0.06857142857142857, + "grad_norm": 0.19087883830070496, + "kl": 0.00012252479791641235, + "learning_rate": 9.989038226169207e-07, + "loss": 0.0, + "reward": -0.33987870812416077, + "reward_std": 0.4983965791761875, + "rewards/cosine_scaled_reward": -0.22157840942963958, + "rewards/format_reward": 0.33333334140479565, + "step": 60 + }, + { + "completion_length": 2953.0625762939453, + "epoch": 0.06971428571428571, + "grad_norm": 0.1577925682067871, + "kl": 0.0002728104591369629, + "learning_rate": 9.98673738502114e-07, + "loss": 0.0, + "reward": 0.10354057513177395, + "reward_std": 0.8005717396736145, + "rewards/cosine_scaled_reward": -0.0372257842682302, + "rewards/format_reward": 0.4791666753590107, + "step": 61 + }, + { + "completion_length": 2714.104217529297, + "epoch": 0.07085714285714285, + "grad_norm": 0.1799556165933609, + "kl": 0.0007392987608909607, + "learning_rate": 9.98421786662277e-07, + "loss": 0.0, + "reward": 0.32593785908829886, + "reward_std": 0.9482499547302723, + "rewards/cosine_scaled_reward": 0.08002315112389624, + "rewards/format_reward": 0.562500013038516, + "step": 62 + }, + { + "completion_length": 2463.5625762939453, + "epoch": 0.072, + "grad_norm": 0.19159279763698578, + "kl": 0.0007972661405801773, + "learning_rate": 9.981479793771866e-07, + "loss": 0.0, + "reward": 0.4637246737256646, + "reward_std": 1.0199245177209377, + "rewards/cosine_scaled_reward": 0.1016120407730341, + "rewards/format_reward": 0.6666666716337204, + "step": 63 + }, + { + "completion_length": 2957.2709045410156, + "epoch": 0.07314285714285715, + "grad_norm": 0.19695256650447845, + "kl": 0.0005731135606765747, + "learning_rate": 9.97852329991824e-07, + "loss": 0.0, + "reward": -0.09675957635045052, + "reward_std": 0.7647779621183872, + "rewards/cosine_scaled_reward": -0.11780054681003094, + "rewards/format_reward": 0.35416667722165585, + "step": 64 + }, + { + "completion_length": 2748.2083473205566, + "epoch": 0.07428571428571429, + "grad_norm": 0.19534841179847717, + "kl": 0.0003612041473388672, + "learning_rate": 9.975348529157229e-07, + "loss": 0.0, + "reward": -0.20825218525715172, + "reward_std": 0.6059501096606255, + "rewards/cosine_scaled_reward": -0.18784814700484276, + "rewards/format_reward": 0.4166666679084301, + "step": 65 + }, + { + "completion_length": 2097.4375076293945, + "epoch": 0.07542857142857143, + "grad_norm": 0.250161349773407, + "kl": 0.0006571710109710693, + "learning_rate": 9.971955636222684e-07, + "loss": 0.0, + "reward": 0.0786643698811531, + "reward_std": 0.6597852855920792, + "rewards/cosine_scaled_reward": -0.006518872454762459, + "rewards/format_reward": 0.5, + "step": 66 + }, + { + "completion_length": 3430.4791870117188, + "epoch": 0.07657142857142857, + "grad_norm": 0.14664356410503387, + "kl": 0.0005048699676990509, + "learning_rate": 9.968344786479415e-07, + "loss": 0.0, + "reward": -0.5055637508630753, + "reward_std": 0.4412064775824547, + "rewards/cosine_scaled_reward": -0.2427280293777585, + "rewards/format_reward": 0.12500000558793545, + "step": 67 + }, + { + "completion_length": 2271.250045776367, + "epoch": 0.07771428571428571, + "grad_norm": 0.27284932136535645, + "kl": 0.0013322830200195312, + "learning_rate": 9.964516155915151e-07, + "loss": 0.0001, + "reward": 0.2981271520256996, + "reward_std": 0.9507227130234241, + "rewards/cosine_scaled_reward": 0.0009464251343160868, + "rewards/format_reward": 0.6041666716337204, + "step": 68 + }, + { + "completion_length": 2530.2708587646484, + "epoch": 0.07885714285714286, + "grad_norm": 0.22157438099384308, + "kl": 0.0010530054569244385, + "learning_rate": 9.960469931131936e-07, + "loss": 0.0, + "reward": -0.22882681945338845, + "reward_std": 0.6600410342216492, + "rewards/cosine_scaled_reward": -0.20334339328110218, + "rewards/format_reward": 0.41666667722165585, + "step": 69 + }, + { + "completion_length": 3004.7708587646484, + "epoch": 0.08, + "grad_norm": 0.21289457380771637, + "kl": 0.001432761549949646, + "learning_rate": 9.956206309337066e-07, + "loss": 0.0001, + "reward": -0.3504838487133384, + "reward_std": 0.5108464825898409, + "rewards/cosine_scaled_reward": -0.24259378435090184, + "rewards/format_reward": 0.3541666716337204, + "step": 70 + }, + { + "completion_length": 2526.937515258789, + "epoch": 0.08114285714285714, + "grad_norm": 0.21181254088878632, + "kl": 0.0007251240313053131, + "learning_rate": 9.951725498333448e-07, + "loss": 0.0, + "reward": -0.016672035679221153, + "reward_std": 0.5613718032836914, + "rewards/cosine_scaled_reward": -0.044779783114790916, + "rewards/format_reward": 0.4166666716337204, + "step": 71 + }, + { + "completion_length": 3029.979217529297, + "epoch": 0.08228571428571428, + "grad_norm": 0.20094312727451324, + "kl": 0.000997304916381836, + "learning_rate": 9.947027716509488e-07, + "loss": 0.0, + "reward": -0.21065808949060738, + "reward_std": 0.6206581741571426, + "rewards/cosine_scaled_reward": -0.12715522898361087, + "rewards/format_reward": 0.29166666977107525, + "step": 72 + }, + { + "completion_length": 3455.625, + "epoch": 0.08342857142857144, + "grad_norm": 0.13940556347370148, + "kl": 0.00026963651180267334, + "learning_rate": 9.942113192828444e-07, + "loss": 0.0, + "reward": -0.3948047012090683, + "reward_std": 0.596671599894762, + "rewards/cosine_scaled_reward": -0.1924935569986701, + "rewards/format_reward": 0.1458333395421505, + "step": 73 + }, + { + "completion_length": 3041.666717529297, + "epoch": 0.08457142857142858, + "grad_norm": 0.2121056467294693, + "kl": 0.0018717050552368164, + "learning_rate": 9.93698216681727e-07, + "loss": 0.0001, + "reward": -0.10725430864840746, + "reward_std": 0.786587443202734, + "rewards/cosine_scaled_reward": -0.08628973411396146, + "rewards/format_reward": 0.3333333358168602, + "step": 74 + }, + { + "completion_length": 3010.2708740234375, + "epoch": 0.08571428571428572, + "grad_norm": 0.17574620246887207, + "kl": 0.0008899793028831482, + "learning_rate": 9.931634888554935e-07, + "loss": 0.0, + "reward": 0.17169499211013317, + "reward_std": 0.7550366073846817, + "rewards/cosine_scaled_reward": 0.06438030861318111, + "rewards/format_reward": 0.41666666977107525, + "step": 75 + }, + { + "completion_length": 3000.0208587646484, + "epoch": 0.08685714285714285, + "grad_norm": 0.1713106483221054, + "kl": 0.0001423284411430359, + "learning_rate": 9.926071618660237e-07, + "loss": 0.0, + "reward": -0.2900051809847355, + "reward_std": 0.5428481921553612, + "rewards/cosine_scaled_reward": -0.17083199694752693, + "rewards/format_reward": 0.3333333358168602, + "step": 76 + }, + { + "completion_length": 3030.854232788086, + "epoch": 0.088, + "grad_norm": 0.15809102356433868, + "kl": 0.00039126724004745483, + "learning_rate": 9.9202926282791e-07, + "loss": 0.0, + "reward": -0.12223898246884346, + "reward_std": 0.5839939434081316, + "rewards/cosine_scaled_reward": -0.12459814921021461, + "rewards/format_reward": 0.39583334513008595, + "step": 77 + }, + { + "completion_length": 3080.5208740234375, + "epoch": 0.08914285714285715, + "grad_norm": 0.1908382624387741, + "kl": 0.00018369778990745544, + "learning_rate": 9.91429819907136e-07, + "loss": 0.0, + "reward": 0.24677438661456108, + "reward_std": 0.8652092255651951, + "rewards/cosine_scaled_reward": 0.060313327237963676, + "rewards/format_reward": 0.45833334140479565, + "step": 78 + }, + { + "completion_length": 2289.7916870117188, + "epoch": 0.09028571428571429, + "grad_norm": 0.2242053598165512, + "kl": 0.000986546277999878, + "learning_rate": 9.908088623197048e-07, + "loss": 0.0, + "reward": 0.043350703082978725, + "reward_std": 0.7450110893696547, + "rewards/cosine_scaled_reward": -0.10156810469925404, + "rewards/format_reward": 0.5833333414047956, + "step": 79 + }, + { + "completion_length": 3217.6458740234375, + "epoch": 0.09142857142857143, + "grad_norm": 0.16875122487545013, + "kl": 0.0006685960106551647, + "learning_rate": 9.901664203302124e-07, + "loss": 0.0, + "reward": -0.21108814515173435, + "reward_std": 0.6316388584673405, + "rewards/cosine_scaled_reward": -0.141633331310004, + "rewards/format_reward": 0.2916666716337204, + "step": 80 + }, + { + "completion_length": 3053.0417098999023, + "epoch": 0.09257142857142857, + "grad_norm": 0.21555641293525696, + "kl": 0.001847386360168457, + "learning_rate": 9.895025252503755e-07, + "loss": 0.0001, + "reward": -0.08620672597317025, + "reward_std": 0.6757525354623795, + "rewards/cosine_scaled_reward": -0.08836011588573456, + "rewards/format_reward": 0.37500000186264515, + "step": 81 + }, + { + "completion_length": 2864.375030517578, + "epoch": 0.09371428571428571, + "grad_norm": 0.18830879032611847, + "kl": 0.0011097192764282227, + "learning_rate": 9.888172094375033e-07, + "loss": 0.0, + "reward": 0.05295779928565025, + "reward_std": 0.6998694110661745, + "rewards/cosine_scaled_reward": -0.003920115530490875, + "rewards/format_reward": 0.3333333358168602, + "step": 82 + }, + { + "completion_length": 2812.312515258789, + "epoch": 0.09485714285714286, + "grad_norm": 0.22218850255012512, + "kl": 0.0006138831377029419, + "learning_rate": 9.881105062929221e-07, + "loss": 0.0, + "reward": -0.2050288449972868, + "reward_std": 0.6761703044176102, + "rewards/cosine_scaled_reward": -0.15050649549812078, + "rewards/format_reward": 0.33333333395421505, + "step": 83 + }, + { + "completion_length": 3117.25, + "epoch": 0.096, + "grad_norm": 0.16899633407592773, + "kl": 0.00044608116149902344, + "learning_rate": 9.873824502603459e-07, + "loss": 0.0, + "reward": 0.03697575815021992, + "reward_std": 0.8468833193182945, + "rewards/cosine_scaled_reward": -0.044630819000303745, + "rewards/format_reward": 0.35416667349636555, + "step": 84 + }, + { + "completion_length": 3147.687530517578, + "epoch": 0.09714285714285714, + "grad_norm": 0.14954747259616852, + "kl": 0.00039239227771759033, + "learning_rate": 9.866330768241983e-07, + "loss": 0.0, + "reward": -0.10085274185985327, + "reward_std": 0.7591249234974384, + "rewards/cosine_scaled_reward": -0.12476183846592903, + "rewards/format_reward": 0.37500000931322575, + "step": 85 + }, + { + "completion_length": 2752.958351135254, + "epoch": 0.09828571428571428, + "grad_norm": 0.1987195760011673, + "kl": 0.0008227825164794922, + "learning_rate": 9.85862422507884e-07, + "loss": 0.0, + "reward": 0.022698190063238144, + "reward_std": 0.6212068926542997, + "rewards/cosine_scaled_reward": -0.04935761634260416, + "rewards/format_reward": 0.4166666679084301, + "step": 86 + }, + { + "completion_length": 2598.500030517578, + "epoch": 0.09942857142857142, + "grad_norm": 0.196201354265213, + "kl": 0.0009197257459163666, + "learning_rate": 9.850705248720068e-07, + "loss": 0.0, + "reward": 0.1782613815739751, + "reward_std": 0.8598120957612991, + "rewards/cosine_scaled_reward": -0.03767361585050821, + "rewards/format_reward": 0.5416666809469461, + "step": 87 + }, + { + "completion_length": 2744.875045776367, + "epoch": 0.10057142857142858, + "grad_norm": 0.1994200497865677, + "kl": 0.0016593635082244873, + "learning_rate": 9.8425742251254e-07, + "loss": 0.0001, + "reward": 0.38837052757298807, + "reward_std": 1.1653981991112232, + "rewards/cosine_scaled_reward": 0.04098894074559212, + "rewards/format_reward": 0.5416666865348816, + "step": 88 + }, + { + "completion_length": 2835.041732788086, + "epoch": 0.10171428571428572, + "grad_norm": 0.17567898333072662, + "kl": 0.0010609626770019531, + "learning_rate": 9.83423155058946e-07, + "loss": 0.0, + "reward": -0.0007169158197939396, + "reward_std": 0.7810570430010557, + "rewards/cosine_scaled_reward": -0.0879359629470855, + "rewards/format_reward": 0.4583333432674408, + "step": 89 + }, + { + "completion_length": 2648.2916870117188, + "epoch": 0.10285714285714286, + "grad_norm": 0.32330945134162903, + "kl": 0.0009630322456359863, + "learning_rate": 9.825677631722435e-07, + "loss": 0.0, + "reward": -0.35989359859377146, + "reward_std": 0.5252104215323925, + "rewards/cosine_scaled_reward": -0.2538211215287447, + "rewards/format_reward": 0.3750000037252903, + "step": 90 + }, + { + "completion_length": 2993.0208740234375, + "epoch": 0.104, + "grad_norm": 0.19204914569854736, + "kl": 0.0005503743886947632, + "learning_rate": 9.816912885430258e-07, + "loss": 0.0, + "reward": 0.10479497350752354, + "reward_std": 0.8132709451019764, + "rewards/cosine_scaled_reward": 0.023820115253329277, + "rewards/format_reward": 0.4166666716337204, + "step": 91 + }, + { + "completion_length": 2592.6875228881836, + "epoch": 0.10514285714285715, + "grad_norm": 0.22138415277004242, + "kl": 0.0012896955013275146, + "learning_rate": 9.807937738894303e-07, + "loss": 0.0001, + "reward": -0.160959305241704, + "reward_std": 0.616182116791606, + "rewards/cosine_scaled_reward": -0.17551955580711365, + "rewards/format_reward": 0.4583333469927311, + "step": 92 + }, + { + "completion_length": 3457.7083435058594, + "epoch": 0.10628571428571429, + "grad_norm": 0.21576857566833496, + "kl": 0.0012897849082946777, + "learning_rate": 9.798752629550546e-07, + "loss": 0.0001, + "reward": -0.5375550724565983, + "reward_std": 0.46420392021536827, + "rewards/cosine_scaled_reward": -0.24901489913463593, + "rewards/format_reward": 0.10416666977107525, + "step": 93 + }, + { + "completion_length": 3101.8750228881836, + "epoch": 0.10742857142857143, + "grad_norm": 0.18881510198116302, + "kl": 0.0011924207210540771, + "learning_rate": 9.78935800506826e-07, + "loss": 0.0, + "reward": -0.3899065591394901, + "reward_std": 0.47088516876101494, + "rewards/cosine_scaled_reward": -0.1563735492527485, + "rewards/format_reward": 0.2083333358168602, + "step": 94 + }, + { + "completion_length": 3354.0833435058594, + "epoch": 0.10857142857142857, + "grad_norm": 0.1480981707572937, + "kl": 0.0006138160824775696, + "learning_rate": 9.779754323328192e-07, + "loss": 0.0, + "reward": -0.16396450996398926, + "reward_std": 0.8173685595393181, + "rewards/cosine_scaled_reward": -0.13610992138274014, + "rewards/format_reward": 0.2916666679084301, + "step": 95 + }, + { + "completion_length": 2596.770896911621, + "epoch": 0.10971428571428571, + "grad_norm": 0.2216062992811203, + "kl": 0.0016668587923049927, + "learning_rate": 9.769942052400235e-07, + "loss": 0.0001, + "reward": -0.06530294334515929, + "reward_std": 0.6829965673387051, + "rewards/cosine_scaled_reward": -0.08926075106137432, + "rewards/format_reward": 0.43750000186264515, + "step": 96 + }, + { + "completion_length": 3140.104217529297, + "epoch": 0.11085714285714286, + "grad_norm": 0.20360398292541504, + "kl": 0.0009580925107002258, + "learning_rate": 9.759921670520634e-07, + "loss": 0.0, + "reward": -0.06862857192754745, + "reward_std": 0.7287838291376829, + "rewards/cosine_scaled_reward": -0.07030826597474515, + "rewards/format_reward": 0.2916666753590107, + "step": 97 + }, + { + "completion_length": 2493.2291870117188, + "epoch": 0.112, + "grad_norm": 0.2129816859960556, + "kl": 0.0004929900169372559, + "learning_rate": 9.749693666068663e-07, + "loss": 0.0, + "reward": -0.014337641187012196, + "reward_std": 0.6026619412004948, + "rewards/cosine_scaled_reward": -0.1173677199985832, + "rewards/format_reward": 0.5416666846722364, + "step": 98 + }, + { + "completion_length": 2910.2708435058594, + "epoch": 0.11314285714285714, + "grad_norm": 0.2219778299331665, + "kl": 0.0009305477142333984, + "learning_rate": 9.739258537542835e-07, + "loss": 0.0, + "reward": -0.1526417959248647, + "reward_std": 0.6645006220787764, + "rewards/cosine_scaled_reward": -0.11239583557471633, + "rewards/format_reward": 0.31250000186264515, + "step": 99 + }, + { + "completion_length": 2669.1875228881836, + "epoch": 0.11428571428571428, + "grad_norm": 0.18922168016433716, + "kl": 0.0009844303131103516, + "learning_rate": 9.728616793536587e-07, + "loss": 0.0, + "reward": 0.36958627274725586, + "reward_std": 0.8694315142929554, + "rewards/cosine_scaled_reward": 0.11309660819824785, + "rewards/format_reward": 0.5000000111758709, + "step": 100 + }, + { + "completion_length": 2726.729202270508, + "epoch": 0.11542857142857142, + "grad_norm": 0.17301318049430847, + "kl": 0.0010145902633666992, + "learning_rate": 9.717768952713511e-07, + "loss": 0.0, + "reward": -0.10664987377822399, + "reward_std": 0.550342533737421, + "rewards/cosine_scaled_reward": -0.07740492327138782, + "rewards/format_reward": 0.3958333358168602, + "step": 101 + }, + { + "completion_length": 2206.812530517578, + "epoch": 0.11657142857142858, + "grad_norm": 0.3242272138595581, + "kl": 0.001733243465423584, + "learning_rate": 9.706715543782064e-07, + "loss": 0.0001, + "reward": -0.04840894846711308, + "reward_std": 0.5605612080544233, + "rewards/cosine_scaled_reward": -0.1783417221158743, + "rewards/format_reward": 0.666666679084301, + "step": 102 + }, + { + "completion_length": 2822.145927429199, + "epoch": 0.11771428571428572, + "grad_norm": 0.20194919407367706, + "kl": 0.0008286237716674805, + "learning_rate": 9.695457105469804e-07, + "loss": 0.0, + "reward": 0.10302554164081812, + "reward_std": 0.7938342466950417, + "rewards/cosine_scaled_reward": -0.058811694383621216, + "rewards/format_reward": 0.5000000111758709, + "step": 103 + }, + { + "completion_length": 2794.2291870117188, + "epoch": 0.11885714285714286, + "grad_norm": 0.21426032483577728, + "kl": 0.0017331838607788086, + "learning_rate": 9.683994186497132e-07, + "loss": 0.0001, + "reward": -0.15357419941574335, + "reward_std": 0.7048191353678703, + "rewards/cosine_scaled_reward": -0.14375380612909794, + "rewards/format_reward": 0.3750000037252903, + "step": 104 + }, + { + "completion_length": 2595.7500381469727, + "epoch": 0.12, + "grad_norm": 0.1870584338903427, + "kl": 0.0007082223892211914, + "learning_rate": 9.672327345550543e-07, + "loss": 0.0, + "reward": 0.0526086570462212, + "reward_std": 1.013668704777956, + "rewards/cosine_scaled_reward": -0.10153029090724885, + "rewards/format_reward": 0.47916667349636555, + "step": 105 + }, + { + "completion_length": 2181.2291717529297, + "epoch": 0.12114285714285715, + "grad_norm": 0.193389892578125, + "kl": 0.0015556812286376953, + "learning_rate": 9.66045715125541e-07, + "loss": 0.0001, + "reward": 0.20659764064475894, + "reward_std": 0.7706484608352184, + "rewards/cosine_scaled_reward": 0.0427999310195446, + "rewards/format_reward": 0.5625, + "step": 106 + }, + { + "completion_length": 2629.104217529297, + "epoch": 0.12228571428571429, + "grad_norm": 0.2650865614414215, + "kl": 0.001280069351196289, + "learning_rate": 9.648384182148252e-07, + "loss": 0.0001, + "reward": 0.047421048395335674, + "reward_std": 0.8096052818000317, + "rewards/cosine_scaled_reward": -0.08784432336688042, + "rewards/format_reward": 0.5208333469927311, + "step": 107 + }, + { + "completion_length": 2760.3125610351562, + "epoch": 0.12342857142857143, + "grad_norm": 0.22029048204421997, + "kl": 0.0016175508499145508, + "learning_rate": 9.636109026648554e-07, + "loss": 0.0001, + "reward": 0.20211763679981232, + "reward_std": 0.9046534672379494, + "rewards/cosine_scaled_reward": 0.006914107128977776, + "rewards/format_reward": 0.4791666828095913, + "step": 108 + }, + { + "completion_length": 3071.2083587646484, + "epoch": 0.12457142857142857, + "grad_norm": 0.16153673827648163, + "kl": 0.00049591064453125, + "learning_rate": 9.623632283030077e-07, + "loss": 0.0, + "reward": -0.13390080258250237, + "reward_std": 0.6916838400065899, + "rewards/cosine_scaled_reward": -0.10345052601769567, + "rewards/format_reward": 0.3958333358168602, + "step": 109 + }, + { + "completion_length": 2738.8125534057617, + "epoch": 0.12571428571428572, + "grad_norm": 0.20852850377559662, + "kl": 0.0008258223533630371, + "learning_rate": 9.610954559391704e-07, + "loss": 0.0, + "reward": -0.033169424161314964, + "reward_std": 0.9115067571401596, + "rewards/cosine_scaled_reward": -0.14516241010278463, + "rewards/format_reward": 0.5000000055879354, + "step": 110 + }, + { + "completion_length": 2979.416702270508, + "epoch": 0.12685714285714286, + "grad_norm": 0.20923613011837006, + "kl": 0.0016465187072753906, + "learning_rate": 9.598076473627796e-07, + "loss": 0.0001, + "reward": -0.1917986012995243, + "reward_std": 0.6587283834815025, + "rewards/cosine_scaled_reward": -0.1627930011600256, + "rewards/format_reward": 0.35416667349636555, + "step": 111 + }, + { + "completion_length": 3034.604202270508, + "epoch": 0.128, + "grad_norm": 0.1812065690755844, + "kl": 0.0010587647557258606, + "learning_rate": 9.58499865339809e-07, + "loss": 0.0, + "reward": 0.042024691589176655, + "reward_std": 0.7155229933559895, + "rewards/cosine_scaled_reward": -0.007911409251391888, + "rewards/format_reward": 0.3958333395421505, + "step": 112 + }, + { + "completion_length": 2602.041702270508, + "epoch": 0.12914285714285714, + "grad_norm": 0.23084139823913574, + "kl": 0.005176067352294922, + "learning_rate": 9.571721736097088e-07, + "loss": 0.0002, + "reward": 0.09520017961040139, + "reward_std": 0.7830863744020462, + "rewards/cosine_scaled_reward": -0.06248902215156704, + "rewards/format_reward": 0.5000000149011612, + "step": 113 + }, + { + "completion_length": 2630.812526702881, + "epoch": 0.13028571428571428, + "grad_norm": 0.22400474548339844, + "kl": 0.002625703811645508, + "learning_rate": 9.55824636882301e-07, + "loss": 0.0001, + "reward": -0.18021708587184548, + "reward_std": 0.5372883807867765, + "rewards/cosine_scaled_reward": -0.19902520813047886, + "rewards/format_reward": 0.5208333432674408, + "step": 114 + }, + { + "completion_length": 2778.3333587646484, + "epoch": 0.13142857142857142, + "grad_norm": 0.2168537676334381, + "kl": 0.0024030208587646484, + "learning_rate": 9.54457320834625e-07, + "loss": 0.0001, + "reward": -0.23857925506308675, + "reward_std": 0.4751722030341625, + "rewards/cosine_scaled_reward": -0.12388885580003262, + "rewards/format_reward": 0.33333333395421505, + "step": 115 + }, + { + "completion_length": 3442.6458740234375, + "epoch": 0.13257142857142856, + "grad_norm": 0.16627182066440582, + "kl": 0.0018346309661865234, + "learning_rate": 9.530702921077358e-07, + "loss": 0.0001, + "reward": -0.1470964252948761, + "reward_std": 0.609551090747118, + "rewards/cosine_scaled_reward": -0.060838012024760246, + "rewards/format_reward": 0.2083333395421505, + "step": 116 + }, + { + "completion_length": 3086.3541870117188, + "epoch": 0.1337142857142857, + "grad_norm": 0.1960468739271164, + "kl": 0.0021970272064208984, + "learning_rate": 9.516636183034564e-07, + "loss": 0.0001, + "reward": -0.35040562483482063, + "reward_std": 0.566862914711237, + "rewards/cosine_scaled_reward": -0.23000845714705065, + "rewards/format_reward": 0.3125000037252903, + "step": 117 + }, + { + "completion_length": 2862.125045776367, + "epoch": 0.13485714285714287, + "grad_norm": 0.19369491934776306, + "kl": 0.0014324188232421875, + "learning_rate": 9.502373679810839e-07, + "loss": 0.0001, + "reward": 0.23708336800336838, + "reward_std": 0.8551704436540604, + "rewards/cosine_scaled_reward": 0.059158104471862316, + "rewards/format_reward": 0.47916666977107525, + "step": 118 + }, + { + "completion_length": 2329.145881652832, + "epoch": 0.136, + "grad_norm": 0.2726350724697113, + "kl": 0.006384849548339844, + "learning_rate": 9.487916106540465e-07, + "loss": 0.0003, + "reward": 0.06812019762583077, + "reward_std": 0.6704130079597235, + "rewards/cosine_scaled_reward": -0.1190024558454752, + "rewards/format_reward": 0.6250000223517418, + "step": 119 + }, + { + "completion_length": 2332.5208740234375, + "epoch": 0.13714285714285715, + "grad_norm": 0.2544117867946625, + "kl": 0.002226591110229492, + "learning_rate": 9.473264167865171e-07, + "loss": 0.0001, + "reward": 0.13008875958621502, + "reward_std": 0.7615821734070778, + "rewards/cosine_scaled_reward": -0.029063436202704906, + "rewards/format_reward": 0.5833333469927311, + "step": 120 + }, + { + "completion_length": 1837.375015258789, + "epoch": 0.1382857142857143, + "grad_norm": 0.25729382038116455, + "kl": 0.007863402366638184, + "learning_rate": 9.458418577899774e-07, + "loss": 0.0003, + "reward": 0.3155789945740253, + "reward_std": 0.7411515153944492, + "rewards/cosine_scaled_reward": -0.02028810605406761, + "rewards/format_reward": 0.791666679084301, + "step": 121 + }, + { + "completion_length": 2958.145866394043, + "epoch": 0.13942857142857143, + "grad_norm": 0.20917245745658875, + "kl": 0.001485586166381836, + "learning_rate": 9.443380060197385e-07, + "loss": 0.0001, + "reward": 0.05837233364582062, + "reward_std": 0.964107995852828, + "rewards/cosine_scaled_reward": -0.043140748515725136, + "rewards/format_reward": 0.37500000558793545, + "step": 122 + }, + { + "completion_length": 2625.7083892822266, + "epoch": 0.14057142857142857, + "grad_norm": 0.1879587173461914, + "kl": 0.0017619132995605469, + "learning_rate": 9.428149347714143e-07, + "loss": 0.0001, + "reward": 0.08997525461018085, + "reward_std": 0.7339164912700653, + "rewards/cosine_scaled_reward": -0.0870713610202074, + "rewards/format_reward": 0.5625000093132257, + "step": 123 + }, + { + "completion_length": 2327.4792404174805, + "epoch": 0.1417142857142857, + "grad_norm": 0.19856718182563782, + "kl": 0.002035379409790039, + "learning_rate": 9.412727182773486e-07, + "loss": 0.0001, + "reward": 0.33785169292241335, + "reward_std": 0.8534054830670357, + "rewards/cosine_scaled_reward": 0.03106366191059351, + "rewards/format_reward": 0.6458333432674408, + "step": 124 + }, + { + "completion_length": 2897.2083587646484, + "epoch": 0.14285714285714285, + "grad_norm": 0.1509743332862854, + "kl": 0.0013265609741210938, + "learning_rate": 9.397114317029974e-07, + "loss": 0.0001, + "reward": -0.1488229539245367, + "reward_std": 0.5740789603441954, + "rewards/cosine_scaled_reward": -0.06105211656540632, + "rewards/format_reward": 0.27083333395421505, + "step": 125 + }, + { + "completion_length": 2956.0208740234375, + "epoch": 0.144, + "grad_norm": 0.18172477185726166, + "kl": 0.0009171962738037109, + "learning_rate": 9.381311511432658e-07, + "loss": 0.0, + "reward": -0.04263794468715787, + "reward_std": 0.8349475301802158, + "rewards/cosine_scaled_reward": -0.10641535092145205, + "rewards/format_reward": 0.41666666977107525, + "step": 126 + }, + { + "completion_length": 2777.229202270508, + "epoch": 0.14514285714285713, + "grad_norm": 0.2586840093135834, + "kl": 0.0015873908996582031, + "learning_rate": 9.36531953618799e-07, + "loss": 0.0001, + "reward": -0.25305760465562344, + "reward_std": 0.6229905411601067, + "rewards/cosine_scaled_reward": -0.22235668450593948, + "rewards/format_reward": 0.41666667722165585, + "step": 127 + }, + { + "completion_length": 2836.7708892822266, + "epoch": 0.1462857142857143, + "grad_norm": 0.21033713221549988, + "kl": 0.0029465854167938232, + "learning_rate": 9.34913917072228e-07, + "loss": 0.0001, + "reward": 0.31079221796244383, + "reward_std": 0.9646506570279598, + "rewards/cosine_scaled_reward": 0.121150006307289, + "rewards/format_reward": 0.43750000186264515, + "step": 128 + }, + { + "completion_length": 3317.375030517578, + "epoch": 0.14742857142857144, + "grad_norm": 0.19819480180740356, + "kl": 0.0024137496948242188, + "learning_rate": 9.332771203643714e-07, + "loss": 0.0001, + "reward": -0.364932868629694, + "reward_std": 0.47471798583865166, + "rewards/cosine_scaled_reward": -0.18228841945528984, + "rewards/format_reward": 0.2083333395421505, + "step": 129 + }, + { + "completion_length": 2750.1041717529297, + "epoch": 0.14857142857142858, + "grad_norm": 0.19781967997550964, + "kl": 0.0014467239379882812, + "learning_rate": 9.316216432703916e-07, + "loss": 0.0001, + "reward": -0.2264688154682517, + "reward_std": 0.6045875921845436, + "rewards/cosine_scaled_reward": -0.16815861780196428, + "rewards/format_reward": 0.33333333395421505, + "step": 130 + }, + { + "completion_length": 2835.3333740234375, + "epoch": 0.14971428571428572, + "grad_norm": 0.23097459971904755, + "kl": 0.0031147003173828125, + "learning_rate": 9.299475664759068e-07, + "loss": 0.0001, + "reward": 0.24509642273187637, + "reward_std": 0.6866675093770027, + "rewards/cosine_scaled_reward": 0.16801240853965282, + "rewards/format_reward": 0.4583333358168602, + "step": 131 + }, + { + "completion_length": 2526.6041984558105, + "epoch": 0.15085714285714286, + "grad_norm": 0.19051432609558105, + "kl": 0.0015637874603271484, + "learning_rate": 9.282549715730579e-07, + "loss": 0.0001, + "reward": 0.17219684056180995, + "reward_std": 0.8952484987676144, + "rewards/cosine_scaled_reward": -0.009550546063110232, + "rewards/format_reward": 0.4791666679084301, + "step": 132 + }, + { + "completion_length": 3378.1041870117188, + "epoch": 0.152, + "grad_norm": 0.22686553001403809, + "kl": 0.0022031068801879883, + "learning_rate": 9.265439410565328e-07, + "loss": 0.0001, + "reward": -0.4034382812678814, + "reward_std": 0.49673712253570557, + "rewards/cosine_scaled_reward": -0.20925658009946346, + "rewards/format_reward": 0.18750000186264515, + "step": 133 + }, + { + "completion_length": 2480.2083892822266, + "epoch": 0.15314285714285714, + "grad_norm": 0.2461298406124115, + "kl": 0.0034208297729492188, + "learning_rate": 9.248145583195447e-07, + "loss": 0.0001, + "reward": -0.010700544342398643, + "reward_std": 0.6441561691462994, + "rewards/cosine_scaled_reward": -0.10502434149384499, + "rewards/format_reward": 0.5625000055879354, + "step": 134 + }, + { + "completion_length": 1805.4583473205566, + "epoch": 0.15428571428571428, + "grad_norm": 0.26353919506073, + "kl": 0.0027773380279541016, + "learning_rate": 9.230669076497687e-07, + "loss": 0.0001, + "reward": 0.5292202904820442, + "reward_std": 0.849658913910389, + "rewards/cosine_scaled_reward": 0.17550375685095787, + "rewards/format_reward": 0.708333333954215, + "step": 135 + }, + { + "completion_length": 2799.7291870117188, + "epoch": 0.15542857142857142, + "grad_norm": 0.20877666771411896, + "kl": 0.0029191970825195312, + "learning_rate": 9.213010742252327e-07, + "loss": 0.0001, + "reward": 0.07902248203754425, + "reward_std": 0.8653755076229572, + "rewards/cosine_scaled_reward": 0.008886766619980335, + "rewards/format_reward": 0.37500000931322575, + "step": 136 + }, + { + "completion_length": 2720.2083587646484, + "epoch": 0.15657142857142858, + "grad_norm": 0.1975402683019638, + "kl": 0.002421855926513672, + "learning_rate": 9.195171441101668e-07, + "loss": 0.0001, + "reward": -0.21935221180319786, + "reward_std": 0.518220279365778, + "rewards/cosine_scaled_reward": -0.16422798670828342, + "rewards/format_reward": 0.41666666977107525, + "step": 137 + }, + { + "completion_length": 2631.229217529297, + "epoch": 0.15771428571428572, + "grad_norm": 0.22529752552509308, + "kl": 0.0021669864654541016, + "learning_rate": 9.177152042508077e-07, + "loss": 0.0001, + "reward": -0.1003083037212491, + "reward_std": 0.5785807222127914, + "rewards/cosine_scaled_reward": -0.13203439861536026, + "rewards/format_reward": 0.5208333376795053, + "step": 138 + }, + { + "completion_length": 3252.3125610351562, + "epoch": 0.15885714285714286, + "grad_norm": 0.16783365607261658, + "kl": 0.0036573410034179688, + "learning_rate": 9.158953424711624e-07, + "loss": 0.0001, + "reward": -0.16235242877155542, + "reward_std": 0.6854680478572845, + "rewards/cosine_scaled_reward": -0.1317966803908348, + "rewards/format_reward": 0.3541666716337204, + "step": 139 + }, + { + "completion_length": 2871.6875610351562, + "epoch": 0.16, + "grad_norm": 0.23042802512645721, + "kl": 0.004063129425048828, + "learning_rate": 9.140576474687263e-07, + "loss": 0.0002, + "reward": -0.0632519107311964, + "reward_std": 0.7613547593355179, + "rewards/cosine_scaled_reward": -0.076241385191679, + "rewards/format_reward": 0.37500000931322575, + "step": 140 + }, + { + "completion_length": 2480.291702270508, + "epoch": 0.16114285714285714, + "grad_norm": 0.2392415851354599, + "kl": 0.003994464874267578, + "learning_rate": 9.122022088101613e-07, + "loss": 0.0002, + "reward": 0.05139289842918515, + "reward_std": 1.0145743787288666, + "rewards/cosine_scaled_reward": -0.13529899902641773, + "rewards/format_reward": 0.5625000074505806, + "step": 141 + }, + { + "completion_length": 2873.666717529297, + "epoch": 0.16228571428571428, + "grad_norm": 0.17095746099948883, + "kl": 0.0033861398696899414, + "learning_rate": 9.103291169269299e-07, + "loss": 0.0001, + "reward": 0.11978689953684807, + "reward_std": 0.6609016172587872, + "rewards/cosine_scaled_reward": -0.06103468872606754, + "rewards/format_reward": 0.5833333414047956, + "step": 142 + }, + { + "completion_length": 2451.541702270508, + "epoch": 0.16342857142857142, + "grad_norm": 0.2901478111743927, + "kl": 0.0036554336547851562, + "learning_rate": 9.084384631108882e-07, + "loss": 0.0001, + "reward": -0.13543805526569486, + "reward_std": 0.5446794554591179, + "rewards/cosine_scaled_reward": -0.18383409455418587, + "rewards/format_reward": 0.5208333469927311, + "step": 143 + }, + { + "completion_length": 2912.7083740234375, + "epoch": 0.16457142857142856, + "grad_norm": 0.21996816992759705, + "kl": 0.004309415817260742, + "learning_rate": 9.065303395098358e-07, + "loss": 0.0002, + "reward": 0.015518264845013618, + "reward_std": 0.9661316499114037, + "rewards/cosine_scaled_reward": -0.05092207749839872, + "rewards/format_reward": 0.3333333358168602, + "step": 144 + }, + { + "completion_length": 2094.041706085205, + "epoch": 0.1657142857142857, + "grad_norm": 0.29601243138313293, + "kl": 0.003927946090698242, + "learning_rate": 9.046048391230247e-07, + "loss": 0.0002, + "reward": 0.2274817731231451, + "reward_std": 0.6735235303640366, + "rewards/cosine_scaled_reward": 0.019177459180355072, + "rewards/format_reward": 0.6250000055879354, + "step": 145 + }, + { + "completion_length": 2216.541717529297, + "epoch": 0.16685714285714287, + "grad_norm": 0.25898125767707825, + "kl": 0.002621889114379883, + "learning_rate": 9.026620557966279e-07, + "loss": 0.0001, + "reward": -0.13925540121272206, + "reward_std": 0.5450553633272648, + "rewards/cosine_scaled_reward": -0.2378202360123396, + "rewards/format_reward": 0.6875000186264515, + "step": 146 + }, + { + "completion_length": 2532.7916870117188, + "epoch": 0.168, + "grad_norm": 0.41119325160980225, + "kl": 0.0048122406005859375, + "learning_rate": 9.007020842191634e-07, + "loss": 0.0002, + "reward": 0.028958545066416264, + "reward_std": 0.9969764724373817, + "rewards/cosine_scaled_reward": -0.12233338970690966, + "rewards/format_reward": 0.4583333507180214, + "step": 147 + }, + { + "completion_length": 1958.9792175292969, + "epoch": 0.16914285714285715, + "grad_norm": 0.21329711377620697, + "kl": 0.0035009384155273438, + "learning_rate": 8.987250199168808e-07, + "loss": 0.0001, + "reward": 0.06930056714918464, + "reward_std": 0.7063222527503967, + "rewards/cosine_scaled_reward": -0.13013077899813652, + "rewards/format_reward": 0.6875000111758709, + "step": 148 + }, + { + "completion_length": 2652.1875610351562, + "epoch": 0.1702857142857143, + "grad_norm": 0.1904420405626297, + "kl": 0.0030150413513183594, + "learning_rate": 8.967309592491052e-07, + "loss": 0.0001, + "reward": 0.0019146008417010307, + "reward_std": 0.813957192003727, + "rewards/cosine_scaled_reward": -0.10279210843145847, + "rewards/format_reward": 0.47916666977107525, + "step": 149 + }, + { + "completion_length": 2578.625030517578, + "epoch": 0.17142857142857143, + "grad_norm": 0.1853615641593933, + "kl": 0.005157470703125, + "learning_rate": 8.9471999940354e-07, + "loss": 0.0002, + "reward": 0.13735826686024666, + "reward_std": 0.8738452345132828, + "rewards/cosine_scaled_reward": -0.027063111774623394, + "rewards/format_reward": 0.4583333432674408, + "step": 150 + }, + { + "completion_length": 2566.541732788086, + "epoch": 0.17257142857142857, + "grad_norm": 0.23767390847206116, + "kl": 0.0045833587646484375, + "learning_rate": 8.926922383915315e-07, + "loss": 0.0002, + "reward": 0.3640142543008551, + "reward_std": 0.9924535490572453, + "rewards/cosine_scaled_reward": 0.056208414025604725, + "rewards/format_reward": 0.6041666809469461, + "step": 151 + }, + { + "completion_length": 2766.5208702087402, + "epoch": 0.1737142857142857, + "grad_norm": 0.24616549909114838, + "kl": 0.0033342838287353516, + "learning_rate": 8.906477750432903e-07, + "loss": 0.0001, + "reward": -0.30499533005058765, + "reward_std": 0.5299477484077215, + "rewards/cosine_scaled_reward": -0.2434069886803627, + "rewards/format_reward": 0.41666667349636555, + "step": 152 + }, + { + "completion_length": 2895.229202270508, + "epoch": 0.17485714285714285, + "grad_norm": 0.2707502543926239, + "kl": 0.007465362548828125, + "learning_rate": 8.88586709003076e-07, + "loss": 0.0003, + "reward": -0.28449683357030153, + "reward_std": 0.5924058370292187, + "rewards/cosine_scaled_reward": -0.19268111791461706, + "rewards/format_reward": 0.3333333395421505, + "step": 153 + }, + { + "completion_length": 3004.7709045410156, + "epoch": 0.176, + "grad_norm": 0.1523827612400055, + "kl": 0.00289154052734375, + "learning_rate": 8.865091407243394e-07, + "loss": 0.0001, + "reward": 0.22956039011478424, + "reward_std": 0.9176076352596283, + "rewards/cosine_scaled_reward": 0.006945975736016408, + "rewards/format_reward": 0.479166679084301, + "step": 154 + }, + { + "completion_length": 2524.5000762939453, + "epoch": 0.17714285714285713, + "grad_norm": 0.24581067264080048, + "kl": 0.004633903503417969, + "learning_rate": 8.844151714648274e-07, + "loss": 0.0002, + "reward": 0.3532795161008835, + "reward_std": 0.9012942314147949, + "rewards/cosine_scaled_reward": 0.0729524465277791, + "rewards/format_reward": 0.5416666679084301, + "step": 155 + }, + { + "completion_length": 2804.3958587646484, + "epoch": 0.1782857142857143, + "grad_norm": 0.17449386417865753, + "kl": 0.0034241676330566406, + "learning_rate": 8.823049032816478e-07, + "loss": 0.0001, + "reward": 0.0325262644328177, + "reward_std": 0.7823463976383209, + "rewards/cosine_scaled_reward": -0.03533525764942169, + "rewards/format_reward": 0.3333333395421505, + "step": 156 + }, + { + "completion_length": 2587.6458587646484, + "epoch": 0.17942857142857144, + "grad_norm": 0.25060343742370605, + "kl": 0.004924774169921875, + "learning_rate": 8.801784390262943e-07, + "loss": 0.0002, + "reward": -0.024557745084166527, + "reward_std": 0.6769015416502953, + "rewards/cosine_scaled_reward": -0.1357441581785679, + "rewards/format_reward": 0.5208333488553762, + "step": 157 + }, + { + "completion_length": 2703.291732788086, + "epoch": 0.18057142857142858, + "grad_norm": 0.22450025379657745, + "kl": 0.0042743682861328125, + "learning_rate": 8.780358823396352e-07, + "loss": 0.0002, + "reward": 0.35095504857599735, + "reward_std": 0.8320975676178932, + "rewards/cosine_scaled_reward": 0.09731243550777435, + "rewards/format_reward": 0.5625000074505806, + "step": 158 + }, + { + "completion_length": 2376.3750610351562, + "epoch": 0.18171428571428572, + "grad_norm": 0.21200759708881378, + "kl": 0.0042476654052734375, + "learning_rate": 8.758773376468604e-07, + "loss": 0.0002, + "reward": -0.21574918151600286, + "reward_std": 0.5446000955998898, + "rewards/cosine_scaled_reward": -0.2437344677746296, + "rewards/format_reward": 0.5416666716337204, + "step": 159 + }, + { + "completion_length": 2424.9583892822266, + "epoch": 0.18285714285714286, + "grad_norm": 0.2798510491847992, + "kl": 0.0068912506103515625, + "learning_rate": 8.737029101523929e-07, + "loss": 0.0003, + "reward": 0.04046674119308591, + "reward_std": 0.7019340619444847, + "rewards/cosine_scaled_reward": -0.06891408376395702, + "rewards/format_reward": 0.5000000093132257, + "step": 160 + }, + { + "completion_length": 2377.0625610351562, + "epoch": 0.184, + "grad_norm": 0.23946921527385712, + "kl": 0.004667758941650391, + "learning_rate": 8.715127058347614e-07, + "loss": 0.0002, + "reward": 0.18844679649919271, + "reward_std": 0.7960058376193047, + "rewards/cosine_scaled_reward": 0.006343178451061249, + "rewards/format_reward": 0.5833333414047956, + "step": 161 + }, + { + "completion_length": 2592.708366394043, + "epoch": 0.18514285714285714, + "grad_norm": 0.27762702107429504, + "kl": 0.0064907073974609375, + "learning_rate": 8.693068314414344e-07, + "loss": 0.0003, + "reward": -0.033426298294216394, + "reward_std": 0.6928768754005432, + "rewards/cosine_scaled_reward": -0.1188662868225947, + "rewards/format_reward": 0.47916666977107525, + "step": 162 + }, + { + "completion_length": 2438.3541717529297, + "epoch": 0.18628571428571428, + "grad_norm": 0.28610387444496155, + "kl": 0.005778312683105469, + "learning_rate": 8.670853944836176e-07, + "loss": 0.0002, + "reward": 0.2196616232395172, + "reward_std": 0.7138870656490326, + "rewards/cosine_scaled_reward": 0.04680558480322361, + "rewards/format_reward": 0.5625000074505806, + "step": 163 + }, + { + "completion_length": 1862.2292022705078, + "epoch": 0.18742857142857142, + "grad_norm": 0.22276760637760162, + "kl": 0.00450897216796875, + "learning_rate": 8.648485032310144e-07, + "loss": 0.0002, + "reward": 0.2688090084120631, + "reward_std": 0.662966214120388, + "rewards/cosine_scaled_reward": 0.05209773499518633, + "rewards/format_reward": 0.7083333432674408, + "step": 164 + }, + { + "completion_length": 2642.979202270508, + "epoch": 0.18857142857142858, + "grad_norm": 0.2338155210018158, + "kl": 0.006450653076171875, + "learning_rate": 8.625962667065487e-07, + "loss": 0.0003, + "reward": -0.11441950500011444, + "reward_std": 0.6914314143359661, + "rewards/cosine_scaled_reward": -0.15340343955904245, + "rewards/format_reward": 0.45833334140479565, + "step": 165 + }, + { + "completion_length": 2228.5208892822266, + "epoch": 0.18971428571428572, + "grad_norm": 0.19290319085121155, + "kl": 0.0036525726318359375, + "learning_rate": 8.603287946810513e-07, + "loss": 0.0001, + "reward": 0.039148006588220596, + "reward_std": 0.5984033793210983, + "rewards/cosine_scaled_reward": -0.10414127632975578, + "rewards/format_reward": 0.6250000093132257, + "step": 166 + }, + { + "completion_length": 2005.8125610351562, + "epoch": 0.19085714285714286, + "grad_norm": 0.19015978276729584, + "kl": 0.0037364959716796875, + "learning_rate": 8.580461976679099e-07, + "loss": 0.0001, + "reward": 0.278128509176895, + "reward_std": 0.851395096629858, + "rewards/cosine_scaled_reward": -0.09441124647855759, + "rewards/format_reward": 0.854166679084301, + "step": 167 + }, + { + "completion_length": 2624.979217529297, + "epoch": 0.192, + "grad_norm": 0.18921926617622375, + "kl": 0.004267692565917969, + "learning_rate": 8.557485869176825e-07, + "loss": 0.0002, + "reward": 0.18450849317014217, + "reward_std": 0.9270573705434799, + "rewards/cosine_scaled_reward": -0.05149332107976079, + "rewards/format_reward": 0.5833333469927311, + "step": 168 + }, + { + "completion_length": 1482.1042022705078, + "epoch": 0.19314285714285714, + "grad_norm": 0.23450377583503723, + "kl": 0.004532814025878906, + "learning_rate": 8.534360744126753e-07, + "loss": 0.0002, + "reward": 0.7876127276103944, + "reward_std": 0.8578404039144516, + "rewards/cosine_scaled_reward": 0.2795695327222347, + "rewards/format_reward": 0.916666679084301, + "step": 169 + }, + { + "completion_length": 2186.9167251586914, + "epoch": 0.19428571428571428, + "grad_norm": 0.30296990275382996, + "kl": 0.0044994354248046875, + "learning_rate": 8.511087728614862e-07, + "loss": 0.0002, + "reward": 0.056005215272307396, + "reward_std": 0.6679836474359035, + "rewards/cosine_scaled_reward": -0.021252445876598358, + "rewards/format_reward": 0.5625000018626451, + "step": 170 + }, + { + "completion_length": 2248.312530517578, + "epoch": 0.19542857142857142, + "grad_norm": 0.20839498937129974, + "kl": 0.0036230087280273438, + "learning_rate": 8.487667956935087e-07, + "loss": 0.0001, + "reward": 0.16125285997986794, + "reward_std": 0.7742661274969578, + "rewards/cosine_scaled_reward": -0.013224839232861996, + "rewards/format_reward": 0.5208333395421505, + "step": 171 + }, + { + "completion_length": 2516.833351135254, + "epoch": 0.19657142857142856, + "grad_norm": 0.21489036083221436, + "kl": 0.007071495056152344, + "learning_rate": 8.464102570534061e-07, + "loss": 0.0003, + "reward": 0.3356585130095482, + "reward_std": 0.7922957856208086, + "rewards/cosine_scaled_reward": 0.13084825314581394, + "rewards/format_reward": 0.5000000055879354, + "step": 172 + }, + { + "completion_length": 1844.708381652832, + "epoch": 0.1977142857142857, + "grad_norm": 0.23371165990829468, + "kl": 0.00569915771484375, + "learning_rate": 8.440392717955475e-07, + "loss": 0.0002, + "reward": 0.0007524143438786268, + "reward_std": 0.7251704446971416, + "rewards/cosine_scaled_reward": -0.1511296879616566, + "rewards/format_reward": 0.6458333395421505, + "step": 173 + }, + { + "completion_length": 1865.3958740234375, + "epoch": 0.19885714285714284, + "grad_norm": 0.24605944752693176, + "kl": 0.0073699951171875, + "learning_rate": 8.416539554784089e-07, + "loss": 0.0003, + "reward": 0.40005480125546455, + "reward_std": 0.8749304339289665, + "rewards/cosine_scaled_reward": 0.05308325891382992, + "rewards/format_reward": 0.7916666753590107, + "step": 174 + }, + { + "completion_length": 2525.270896911621, + "epoch": 0.2, + "grad_norm": 0.20796766877174377, + "kl": 0.005358695983886719, + "learning_rate": 8.392544243589427e-07, + "loss": 0.0002, + "reward": 0.15992992464452982, + "reward_std": 0.6595781818032265, + "rewards/cosine_scaled_reward": 0.004710111767053604, + "rewards/format_reward": 0.5208333376795053, + "step": 175 + }, + { + "completion_length": 2078.895896911621, + "epoch": 0.20114285714285715, + "grad_norm": 0.28462517261505127, + "kl": 0.005040168762207031, + "learning_rate": 8.368407953869103e-07, + "loss": 0.0002, + "reward": 0.25200897455215454, + "reward_std": 0.9856929145753384, + "rewards/cosine_scaled_reward": -0.05302844103425741, + "rewards/format_reward": 0.6666666716337204, + "step": 176 + }, + { + "completion_length": 2306.2083740234375, + "epoch": 0.2022857142857143, + "grad_norm": 0.24023154377937317, + "kl": 0.005183219909667969, + "learning_rate": 8.344131861991828e-07, + "loss": 0.0002, + "reward": 0.28276925068348646, + "reward_std": 0.6319316327571869, + "rewards/cosine_scaled_reward": 0.06355313770473003, + "rewards/format_reward": 0.6458333432674408, + "step": 177 + }, + { + "completion_length": 2121.791717529297, + "epoch": 0.20342857142857143, + "grad_norm": 0.2661464214324951, + "kl": 0.008258819580078125, + "learning_rate": 8.319717151140072e-07, + "loss": 0.0003, + "reward": 0.013979046139866114, + "reward_std": 0.7351427339017391, + "rewards/cosine_scaled_reward": -0.14257426233962178, + "rewards/format_reward": 0.6250000055879354, + "step": 178 + }, + { + "completion_length": 2433.708351135254, + "epoch": 0.20457142857142857, + "grad_norm": 0.20986828207969666, + "kl": 0.004935264587402344, + "learning_rate": 8.295165011252396e-07, + "loss": 0.0002, + "reward": -0.25348505191504955, + "reward_std": 0.48908819630742073, + "rewards/cosine_scaled_reward": -0.21158038638532162, + "rewards/format_reward": 0.4583333358168602, + "step": 179 + }, + { + "completion_length": 1877.1458892822266, + "epoch": 0.2057142857142857, + "grad_norm": 0.3029525876045227, + "kl": 0.007785797119140625, + "learning_rate": 8.270476638965461e-07, + "loss": 0.0003, + "reward": 0.3592034715693444, + "reward_std": 1.002089962363243, + "rewards/cosine_scaled_reward": 0.08408734039403498, + "rewards/format_reward": 0.666666679084301, + "step": 180 + }, + { + "completion_length": 2741.729248046875, + "epoch": 0.20685714285714285, + "grad_norm": 0.2602676451206207, + "kl": 0.00711822509765625, + "learning_rate": 8.245653237555705e-07, + "loss": 0.0003, + "reward": -0.08426681905984879, + "reward_std": 0.5513291470706463, + "rewards/cosine_scaled_reward": -0.07528127636760473, + "rewards/format_reward": 0.4583333358168602, + "step": 181 + }, + { + "completion_length": 1988.2292175292969, + "epoch": 0.208, + "grad_norm": 0.17226605117321014, + "kl": 0.0025339126586914062, + "learning_rate": 8.220696016880687e-07, + "loss": 0.0001, + "reward": 0.23337539401836693, + "reward_std": 0.824427492916584, + "rewards/cosine_scaled_reward": -0.0713941128924489, + "rewards/format_reward": 0.7083333432674408, + "step": 182 + }, + { + "completion_length": 1507.3958892822266, + "epoch": 0.20914285714285713, + "grad_norm": 0.2777194082736969, + "kl": 0.006999969482421875, + "learning_rate": 8.195606193320136e-07, + "loss": 0.0003, + "reward": 0.5963715696707368, + "reward_std": 0.8888205997645855, + "rewards/cosine_scaled_reward": 0.07703271105128806, + "rewards/format_reward": 0.8958333432674408, + "step": 183 + }, + { + "completion_length": 2085.979202270508, + "epoch": 0.2102857142857143, + "grad_norm": 0.2736140489578247, + "kl": 0.0066013336181640625, + "learning_rate": 8.170384989716657e-07, + "loss": 0.0003, + "reward": -0.11966793239116669, + "reward_std": 0.4983285814523697, + "rewards/cosine_scaled_reward": -0.20256702601909637, + "rewards/format_reward": 0.6458333395421505, + "step": 184 + }, + { + "completion_length": 2048.5208854675293, + "epoch": 0.21142857142857144, + "grad_norm": 0.24372698366641998, + "kl": 0.0058803558349609375, + "learning_rate": 8.145033635316128e-07, + "loss": 0.0002, + "reward": 0.06693027447909117, + "reward_std": 0.6862058416008949, + "rewards/cosine_scaled_reward": -0.10436064226087183, + "rewards/format_reward": 0.6250000093132257, + "step": 185 + }, + { + "completion_length": 2159.791702270508, + "epoch": 0.21257142857142858, + "grad_norm": 0.20706257224082947, + "kl": 0.006072998046875, + "learning_rate": 8.119553365707802e-07, + "loss": 0.0002, + "reward": 0.0768028711900115, + "reward_std": 0.5925656575709581, + "rewards/cosine_scaled_reward": -0.05905535398051143, + "rewards/format_reward": 0.5625, + "step": 186 + }, + { + "completion_length": 1804.8542022705078, + "epoch": 0.21371428571428572, + "grad_norm": 0.2685582637786865, + "kl": 0.0061969757080078125, + "learning_rate": 8.093945422764069e-07, + "loss": 0.0002, + "reward": 0.12330089835450053, + "reward_std": 0.606924245133996, + "rewards/cosine_scaled_reward": -0.09539542999118567, + "rewards/format_reward": 0.7500000111758709, + "step": 187 + }, + { + "completion_length": 2442.166702270508, + "epoch": 0.21485714285714286, + "grad_norm": 0.2364640235900879, + "kl": 0.0075626373291015625, + "learning_rate": 8.068211054579943e-07, + "loss": 0.0003, + "reward": -0.09115996491163969, + "reward_std": 0.6131051359698176, + "rewards/cosine_scaled_reward": -0.1099636135622859, + "rewards/format_reward": 0.4791666753590107, + "step": 188 + }, + { + "completion_length": 1640.333366394043, + "epoch": 0.216, + "grad_norm": 0.6100494861602783, + "kl": 0.006476402282714844, + "learning_rate": 8.04235151541222e-07, + "loss": 0.0003, + "reward": 0.12896017776802182, + "reward_std": 0.6158286519348621, + "rewards/cosine_scaled_reward": -0.10562538355588913, + "rewards/format_reward": 0.7291666828095913, + "step": 189 + }, + { + "completion_length": 1453.6042022705078, + "epoch": 0.21714285714285714, + "grad_norm": 0.2732960283756256, + "kl": 0.006473541259765625, + "learning_rate": 8.01636806561836e-07, + "loss": 0.0003, + "reward": 0.2690247750142589, + "reward_std": 0.7748389020562172, + "rewards/cosine_scaled_reward": -0.04900714522227645, + "rewards/format_reward": 0.8333333432674408, + "step": 190 + }, + { + "completion_length": 1478.0000305175781, + "epoch": 0.21828571428571428, + "grad_norm": 0.25885748863220215, + "kl": 0.0072765350341796875, + "learning_rate": 7.990261971595048e-07, + "loss": 0.0003, + "reward": 0.5228147888556123, + "reward_std": 0.9636592417955399, + "rewards/cosine_scaled_reward": 0.07218710612505674, + "rewards/format_reward": 0.812500013038516, + "step": 191 + }, + { + "completion_length": 1954.8542251586914, + "epoch": 0.21942857142857142, + "grad_norm": 0.22047950327396393, + "kl": 0.0056095123291015625, + "learning_rate": 7.964034505716476e-07, + "loss": 0.0002, + "reward": 0.0916488622315228, + "reward_std": 0.666417833417654, + "rewards/cosine_scaled_reward": -0.13260145671665668, + "rewards/format_reward": 0.7083333488553762, + "step": 192 + }, + { + "completion_length": 2805.520866394043, + "epoch": 0.22057142857142858, + "grad_norm": 0.22538112103939056, + "kl": 0.0061187744140625, + "learning_rate": 7.93768694627233e-07, + "loss": 0.0002, + "reward": -0.33769641164690256, + "reward_std": 0.507177896797657, + "rewards/cosine_scaled_reward": -0.24244121788069606, + "rewards/format_reward": 0.39583333767950535, + "step": 193 + }, + { + "completion_length": 2417.354202270508, + "epoch": 0.22171428571428572, + "grad_norm": 0.2549934983253479, + "kl": 0.007659912109375, + "learning_rate": 7.911220577405484e-07, + "loss": 0.0003, + "reward": 0.4917243723757565, + "reward_std": 1.1182497814297676, + "rewards/cosine_scaled_reward": 0.08968745917081833, + "rewards/format_reward": 0.645833345130086, + "step": 194 + }, + { + "completion_length": 1545.1667175292969, + "epoch": 0.22285714285714286, + "grad_norm": 0.2932279706001282, + "kl": 0.0074100494384765625, + "learning_rate": 7.884636689049422e-07, + "loss": 0.0003, + "reward": 0.40460192365571856, + "reward_std": 0.8929797559976578, + "rewards/cosine_scaled_reward": -0.016500022262334824, + "rewards/format_reward": 0.8541666716337204, + "step": 195 + }, + { + "completion_length": 2750.791717529297, + "epoch": 0.224, + "grad_norm": 0.2122720181941986, + "kl": 0.007846832275390625, + "learning_rate": 7.857936576865356e-07, + "loss": 0.0003, + "reward": -0.0444787316955626, + "reward_std": 0.6230567768216133, + "rewards/cosine_scaled_reward": -0.09410630911588669, + "rewards/format_reward": 0.5208333358168602, + "step": 196 + }, + { + "completion_length": 1153.0000381469727, + "epoch": 0.22514285714285714, + "grad_norm": 0.30193769931793213, + "kl": 0.00811767578125, + "learning_rate": 7.831121542179086e-07, + "loss": 0.0003, + "reward": 0.5208917018026114, + "reward_std": 0.8946940749883652, + "rewards/cosine_scaled_reward": 0.04951087199151516, + "rewards/format_reward": 0.8958333395421505, + "step": 197 + }, + { + "completion_length": 1570.083396911621, + "epoch": 0.22628571428571428, + "grad_norm": 0.2520270347595215, + "kl": 0.008256912231445312, + "learning_rate": 7.804192891917571e-07, + "loss": 0.0003, + "reward": 0.43910311779472977, + "reward_std": 0.9849786385893822, + "rewards/cosine_scaled_reward": -0.005096456501632929, + "rewards/format_reward": 0.8125000149011612, + "step": 198 + }, + { + "completion_length": 1278.4375305175781, + "epoch": 0.22742857142857142, + "grad_norm": 0.265682190656662, + "kl": 0.00727081298828125, + "learning_rate": 7.777151938545235e-07, + "loss": 0.0003, + "reward": 0.16469779529143125, + "reward_std": 0.5715500190854073, + "rewards/cosine_scaled_reward": -0.1407376565039158, + "rewards/format_reward": 0.9375, + "step": 199 + }, + { + "completion_length": 1395.8541946411133, + "epoch": 0.22857142857142856, + "grad_norm": 0.24266065657138824, + "kl": 0.0067348480224609375, + "learning_rate": 7.75e-07, + "loss": 0.0003, + "reward": 0.3167417113436386, + "reward_std": 0.7663153670728207, + "rewards/cosine_scaled_reward": -0.044573438179213554, + "rewards/format_reward": 0.8958333432674408, + "step": 200 + }, + { + "completion_length": 1824.4375457763672, + "epoch": 0.2297142857142857, + "grad_norm": 0.23948884010314941, + "kl": 0.00595855712890625, + "learning_rate": 7.72273839962904e-07, + "loss": 0.0002, + "reward": 0.7574443845078349, + "reward_std": 0.8649395480751991, + "rewards/cosine_scaled_reward": 0.28773305006325245, + "rewards/format_reward": 0.8333333432674408, + "step": 201 + }, + { + "completion_length": 1388.5833740234375, + "epoch": 0.23085714285714284, + "grad_norm": 0.2889217734336853, + "kl": 0.008331298828125, + "learning_rate": 7.695368466124296e-07, + "loss": 0.0003, + "reward": 0.46887177898315713, + "reward_std": 0.6449649855494499, + "rewards/cosine_scaled_reward": 0.19245607405900955, + "rewards/format_reward": 0.8541666716337204, + "step": 202 + }, + { + "completion_length": 1545.083396911621, + "epoch": 0.232, + "grad_norm": 0.24110884964466095, + "kl": 0.0080413818359375, + "learning_rate": 7.667891533457718e-07, + "loss": 0.0003, + "reward": 0.37680933251976967, + "reward_std": 0.8439403660595417, + "rewards/cosine_scaled_reward": -0.034176651388406754, + "rewards/format_reward": 0.8750000149011612, + "step": 203 + }, + { + "completion_length": 1558.5833740234375, + "epoch": 0.23314285714285715, + "grad_norm": 0.3269757628440857, + "kl": 0.0079345703125, + "learning_rate": 7.640308940816239e-07, + "loss": 0.0003, + "reward": 0.4802742376923561, + "reward_std": 0.7629036977887154, + "rewards/cosine_scaled_reward": 0.10827891901135445, + "rewards/format_reward": 0.8333333432674408, + "step": 204 + }, + { + "completion_length": 1542.5000305175781, + "epoch": 0.2342857142857143, + "grad_norm": 0.2582697570323944, + "kl": 0.0057811737060546875, + "learning_rate": 7.612622032536507e-07, + "loss": 0.0002, + "reward": 0.7918389849364758, + "reward_std": 0.8735720105469227, + "rewards/cosine_scaled_reward": 0.29279233887791634, + "rewards/format_reward": 0.8750000074505806, + "step": 205 + }, + { + "completion_length": 1852.4167098999023, + "epoch": 0.23542857142857143, + "grad_norm": 0.30814215540885925, + "kl": 0.007049560546875, + "learning_rate": 7.584832158039378e-07, + "loss": 0.0003, + "reward": -0.12371287122368813, + "reward_std": 0.5305570438504219, + "rewards/cosine_scaled_reward": -0.24367139115929604, + "rewards/format_reward": 0.7083333544433117, + "step": 206 + }, + { + "completion_length": 1706.8125610351562, + "epoch": 0.23657142857142857, + "grad_norm": 0.2674766182899475, + "kl": 0.009944915771484375, + "learning_rate": 7.556940671764124e-07, + "loss": 0.0004, + "reward": 0.018216492608189583, + "reward_std": 0.8229854069650173, + "rewards/cosine_scaled_reward": -0.2149236612021923, + "rewards/format_reward": 0.7500000074505806, + "step": 207 + }, + { + "completion_length": 1185.0833587646484, + "epoch": 0.2377142857142857, + "grad_norm": 0.24083174765110016, + "kl": 0.0078887939453125, + "learning_rate": 7.528948933102438e-07, + "loss": 0.0003, + "reward": 0.2574625021661632, + "reward_std": 0.5847832001745701, + "rewards/cosine_scaled_reward": -0.02713889814913273, + "rewards/format_reward": 0.9375000074505806, + "step": 208 + }, + { + "completion_length": 1211.3333587646484, + "epoch": 0.23885714285714285, + "grad_norm": 0.28845900297164917, + "kl": 0.0077152252197265625, + "learning_rate": 7.500858306332172e-07, + "loss": 0.0003, + "reward": 0.5239271614700556, + "reward_std": 0.8414975665509701, + "rewards/cosine_scaled_reward": 0.07757960073649883, + "rewards/format_reward": 0.916666679084301, + "step": 209 + }, + { + "completion_length": 1571.8333892822266, + "epoch": 0.24, + "grad_norm": 0.26938140392303467, + "kl": 0.006866455078125, + "learning_rate": 7.472670160550848e-07, + "loss": 0.0003, + "reward": 0.39476348645985126, + "reward_std": 0.8261179774999619, + "rewards/cosine_scaled_reward": -0.00616603484377265, + "rewards/format_reward": 0.8541666753590107, + "step": 210 + }, + { + "completion_length": 1672.083366394043, + "epoch": 0.24114285714285713, + "grad_norm": 0.358892560005188, + "kl": 0.0106353759765625, + "learning_rate": 7.444385869608921e-07, + "loss": 0.0004, + "reward": 0.3219002881087363, + "reward_std": 0.750870831310749, + "rewards/cosine_scaled_reward": 0.013120350427925587, + "rewards/format_reward": 0.7916666679084301, + "step": 211 + }, + { + "completion_length": 1112.6250534057617, + "epoch": 0.2422857142857143, + "grad_norm": 0.27129727602005005, + "kl": 0.008617401123046875, + "learning_rate": 7.416006812042827e-07, + "loss": 0.0003, + "reward": 0.5328430655645207, + "reward_std": 0.671292532235384, + "rewards/cosine_scaled_reward": 0.12215181812644005, + "rewards/format_reward": 0.9375, + "step": 212 + }, + { + "completion_length": 1226.1666870117188, + "epoch": 0.24342857142857144, + "grad_norm": 0.36483535170555115, + "kl": 0.01009368896484375, + "learning_rate": 7.387534371007797e-07, + "loss": 0.0004, + "reward": 0.47296964284032583, + "reward_std": 0.7435989566147327, + "rewards/cosine_scaled_reward": 0.06554012396372855, + "rewards/format_reward": 0.8958333432674408, + "step": 213 + }, + { + "completion_length": 1548.1875228881836, + "epoch": 0.24457142857142858, + "grad_norm": 0.3107360303401947, + "kl": 0.008228302001953125, + "learning_rate": 7.358969934210438e-07, + "loss": 0.0003, + "reward": 0.5165067554917186, + "reward_std": 0.7588581070303917, + "rewards/cosine_scaled_reward": 0.10956975258886814, + "rewards/format_reward": 0.8541666716337204, + "step": 214 + }, + { + "completion_length": 1461.6875305175781, + "epoch": 0.24571428571428572, + "grad_norm": 0.22486445307731628, + "kl": 0.0059413909912109375, + "learning_rate": 7.330314893841101e-07, + "loss": 0.0002, + "reward": 0.07686804980039597, + "reward_std": 0.5298155099153519, + "rewards/cosine_scaled_reward": -0.14177834056317806, + "rewards/format_reward": 0.8333333358168602, + "step": 215 + }, + { + "completion_length": 1340.5208702087402, + "epoch": 0.24685714285714286, + "grad_norm": 0.2673039436340332, + "kl": 0.008434295654296875, + "learning_rate": 7.301570646506027e-07, + "loss": 0.0003, + "reward": 0.38506741262972355, + "reward_std": 0.6937872804701328, + "rewards/cosine_scaled_reward": 0.08268431574106216, + "rewards/format_reward": 0.8125000074505806, + "step": 216 + }, + { + "completion_length": 1469.6250534057617, + "epoch": 0.248, + "grad_norm": 0.3538609445095062, + "kl": 0.008731842041015625, + "learning_rate": 7.27273859315928e-07, + "loss": 0.0003, + "reward": 0.6034841773507651, + "reward_std": 1.0110028125345707, + "rewards/cosine_scaled_reward": 0.10976234765257686, + "rewards/format_reward": 0.8125000037252903, + "step": 217 + }, + { + "completion_length": 1511.458381652832, + "epoch": 0.24914285714285714, + "grad_norm": 0.2224741131067276, + "kl": 0.007978439331054688, + "learning_rate": 7.243820139034464e-07, + "loss": 0.0003, + "reward": 0.21123092295601964, + "reward_std": 0.7925957031548023, + "rewards/cosine_scaled_reward": -0.1498157843016088, + "rewards/format_reward": 0.8958333432674408, + "step": 218 + }, + { + "completion_length": 1340.4791870117188, + "epoch": 0.2502857142857143, + "grad_norm": 0.24136604368686676, + "kl": 0.009267807006835938, + "learning_rate": 7.214816693576234e-07, + "loss": 0.0004, + "reward": 0.3911650243680924, + "reward_std": 0.6955340765416622, + "rewards/cosine_scaled_reward": 0.048877415247261524, + "rewards/format_reward": 0.875, + "step": 219 + }, + { + "completion_length": 1524.354175567627, + "epoch": 0.25142857142857145, + "grad_norm": 0.2906893491744995, + "kl": 0.011249542236328125, + "learning_rate": 7.185729670371604e-07, + "loss": 0.0004, + "reward": -0.04361694771796465, + "reward_std": 0.5702285468578339, + "rewards/cosine_scaled_reward": -0.2380666360259056, + "rewards/format_reward": 0.8125000074505806, + "step": 220 + }, + { + "completion_length": 1496.2916870117188, + "epoch": 0.25257142857142856, + "grad_norm": 0.24429333209991455, + "kl": 0.0070476531982421875, + "learning_rate": 7.156560487081051e-07, + "loss": 0.0003, + "reward": 0.4626695259066764, + "reward_std": 0.7840724922716618, + "rewards/cosine_scaled_reward": 0.06611506012268364, + "rewards/format_reward": 0.8541666716337204, + "step": 221 + }, + { + "completion_length": 1576.9167022705078, + "epoch": 0.2537142857142857, + "grad_norm": 0.26980236172676086, + "kl": 0.009477615356445312, + "learning_rate": 7.127310565369415e-07, + "loss": 0.0004, + "reward": 0.2927159178070724, + "reward_std": 0.6861963458359241, + "rewards/cosine_scaled_reward": -0.02613269304856658, + "rewards/format_reward": 0.7916666679084301, + "step": 222 + }, + { + "completion_length": 1766.4583587646484, + "epoch": 0.25485714285714284, + "grad_norm": 0.3431569039821625, + "kl": 0.009063720703125, + "learning_rate": 7.097981330836616e-07, + "loss": 0.0004, + "reward": 0.24513752292841673, + "reward_std": 0.6672438345849514, + "rewards/cosine_scaled_reward": 0.0011468753218650818, + "rewards/format_reward": 0.7083333432674408, + "step": 223 + }, + { + "completion_length": 1734.6458587646484, + "epoch": 0.256, + "grad_norm": 0.2113533616065979, + "kl": 0.008029937744140625, + "learning_rate": 7.068574212948169e-07, + "loss": 0.0003, + "reward": 0.6102093638037331, + "reward_std": 1.0698180794715881, + "rewards/cosine_scaled_reward": 0.07039764476940036, + "rewards/format_reward": 0.8541666753590107, + "step": 224 + }, + { + "completion_length": 1987.958396911621, + "epoch": 0.2571428571428571, + "grad_norm": 0.3268744945526123, + "kl": 0.012989044189453125, + "learning_rate": 7.039090644965509e-07, + "loss": 0.0005, + "reward": 0.24530102079734206, + "reward_std": 0.9006591737270355, + "rewards/cosine_scaled_reward": -0.0955775510519743, + "rewards/format_reward": 0.7708333507180214, + "step": 225 + }, + { + "completion_length": 1425.6041946411133, + "epoch": 0.2582857142857143, + "grad_norm": 0.24840526282787323, + "kl": 0.00763702392578125, + "learning_rate": 7.009532063876148e-07, + "loss": 0.0003, + "reward": 0.6913954578340054, + "reward_std": 0.8804083652794361, + "rewards/cosine_scaled_reward": 0.15895199915394187, + "rewards/format_reward": 0.9375000074505806, + "step": 226 + }, + { + "completion_length": 1124.0416946411133, + "epoch": 0.25942857142857145, + "grad_norm": 0.31861069798469543, + "kl": 0.011066436767578125, + "learning_rate": 6.979899910323624e-07, + "loss": 0.0004, + "reward": 0.3930067252367735, + "reward_std": 0.8200523294508457, + "rewards/cosine_scaled_reward": -0.05645215045660734, + "rewards/format_reward": 0.9583333358168602, + "step": 227 + }, + { + "completion_length": 1212.2291984558105, + "epoch": 0.26057142857142856, + "grad_norm": 0.27715247869491577, + "kl": 0.008419036865234375, + "learning_rate": 6.950195628537299e-07, + "loss": 0.0003, + "reward": 0.5142936524935067, + "reward_std": 0.8248747400939465, + "rewards/cosine_scaled_reward": 0.05838426947593689, + "rewards/format_reward": 0.895833333954215, + "step": 228 + }, + { + "completion_length": 1521.9167175292969, + "epoch": 0.26171428571428573, + "grad_norm": 0.3403978645801544, + "kl": 0.012042999267578125, + "learning_rate": 6.920420666261961e-07, + "loss": 0.0005, + "reward": 0.19939115084707737, + "reward_std": 0.5617837458848953, + "rewards/cosine_scaled_reward": -0.031215182272717357, + "rewards/format_reward": 0.8125000149011612, + "step": 229 + }, + { + "completion_length": 1612.5000305175781, + "epoch": 0.26285714285714284, + "grad_norm": 0.27715978026390076, + "kl": 0.010951995849609375, + "learning_rate": 6.890576474687263e-07, + "loss": 0.0004, + "reward": 0.025840092916041613, + "reward_std": 0.5985405147075653, + "rewards/cosine_scaled_reward": -0.1870412821881473, + "rewards/format_reward": 0.791666679084301, + "step": 230 + }, + { + "completion_length": 1249.7500305175781, + "epoch": 0.264, + "grad_norm": 0.2477809637784958, + "kl": 0.00991058349609375, + "learning_rate": 6.860664508377001e-07, + "loss": 0.0004, + "reward": 0.45136169949546456, + "reward_std": 0.7211090363562107, + "rewards/cosine_scaled_reward": 0.07621757127344608, + "rewards/format_reward": 0.8541666697710752, + "step": 231 + }, + { + "completion_length": 1592.9791946411133, + "epoch": 0.2651428571428571, + "grad_norm": 0.30748802423477173, + "kl": 0.012454986572265625, + "learning_rate": 6.83068622519821e-07, + "loss": 0.0005, + "reward": 0.012111502306652255, + "reward_std": 0.6053478866815567, + "rewards/cosine_scaled_reward": -0.21021990105509758, + "rewards/format_reward": 0.8125000018626451, + "step": 232 + }, + { + "completion_length": 1202.4375495910645, + "epoch": 0.2662857142857143, + "grad_norm": 0.2600504755973816, + "kl": 0.011119842529296875, + "learning_rate": 6.800643086250121e-07, + "loss": 0.0004, + "reward": 0.3125277090584859, + "reward_std": 0.7523941993713379, + "rewards/cosine_scaled_reward": -0.07805835455656052, + "rewards/format_reward": 0.9375000149011612, + "step": 233 + }, + { + "completion_length": 1664.25004196167, + "epoch": 0.2674285714285714, + "grad_norm": 0.28175073862075806, + "kl": 0.013034820556640625, + "learning_rate": 6.770536555792944e-07, + "loss": 0.0005, + "reward": 0.2759629947831854, + "reward_std": 0.7744503617286682, + "rewards/cosine_scaled_reward": -0.00288372952491045, + "rewards/format_reward": 0.729166679084301, + "step": 234 + }, + { + "completion_length": 1031.729206085205, + "epoch": 0.26857142857142857, + "grad_norm": 0.41683852672576904, + "kl": 0.009319305419921875, + "learning_rate": 6.740368101176495e-07, + "loss": 0.0004, + "reward": 0.536057305522263, + "reward_std": 0.8349857330322266, + "rewards/cosine_scaled_reward": 0.1025073304772377, + "rewards/format_reward": 0.9375000074505806, + "step": 235 + }, + { + "completion_length": 1922.7708587646484, + "epoch": 0.26971428571428574, + "grad_norm": 0.2624496519565582, + "kl": 0.010082244873046875, + "learning_rate": 6.710139192768694e-07, + "loss": 0.0004, + "reward": 0.5450696041807532, + "reward_std": 1.0304477401077747, + "rewards/cosine_scaled_reward": 0.09541857382282615, + "rewards/format_reward": 0.7500000055879354, + "step": 236 + }, + { + "completion_length": 1460.6250305175781, + "epoch": 0.27085714285714285, + "grad_norm": 0.2655409872531891, + "kl": 0.0093231201171875, + "learning_rate": 6.679851303883891e-07, + "loss": 0.0004, + "reward": 0.4284868792165071, + "reward_std": 0.6683508493006229, + "rewards/cosine_scaled_reward": 0.04793188441544771, + "rewards/format_reward": 0.875, + "step": 237 + }, + { + "completion_length": 1020.8125305175781, + "epoch": 0.272, + "grad_norm": 0.27945178747177124, + "kl": 0.010372161865234375, + "learning_rate": 6.649505910711058e-07, + "loss": 0.0004, + "reward": 0.6645625443197787, + "reward_std": 0.9196898862719536, + "rewards/cosine_scaled_reward": 0.1358748753555119, + "rewards/format_reward": 0.9166666679084301, + "step": 238 + }, + { + "completion_length": 1431.6041870117188, + "epoch": 0.27314285714285713, + "grad_norm": 0.25624558329582214, + "kl": 0.00914764404296875, + "learning_rate": 6.619104492241847e-07, + "loss": 0.0004, + "reward": 0.584113098680973, + "reward_std": 0.6673776432871819, + "rewards/cosine_scaled_reward": 0.31309779919683933, + "rewards/format_reward": 0.7708333395421505, + "step": 239 + }, + { + "completion_length": 1502.979232788086, + "epoch": 0.2742857142857143, + "grad_norm": 0.35446247458457947, + "kl": 0.02126312255859375, + "learning_rate": 6.588648530198504e-07, + "loss": 0.0009, + "reward": 0.07019667001441121, + "reward_std": 0.6685795933008194, + "rewards/cosine_scaled_reward": -0.22959061339497566, + "rewards/format_reward": 0.916666679084301, + "step": 240 + }, + { + "completion_length": 1754.9375534057617, + "epoch": 0.2754285714285714, + "grad_norm": 0.29134202003479004, + "kl": 0.0203399658203125, + "learning_rate": 6.558139508961654e-07, + "loss": 0.0008, + "reward": -0.14658209728077054, + "reward_std": 0.5397600717842579, + "rewards/cosine_scaled_reward": -0.26472287997603416, + "rewards/format_reward": 0.7500000093132257, + "step": 241 + }, + { + "completion_length": 1179.6041984558105, + "epoch": 0.2765714285714286, + "grad_norm": 0.3573894500732422, + "kl": 0.019369125366210938, + "learning_rate": 6.527578915497951e-07, + "loss": 0.0008, + "reward": 0.17393979895859957, + "reward_std": 0.6096976324915886, + "rewards/cosine_scaled_reward": -0.09758513886481524, + "rewards/format_reward": 0.8958333395421505, + "step": 242 + }, + { + "completion_length": 1776.333351135254, + "epoch": 0.2777142857142857, + "grad_norm": 0.26269933581352234, + "kl": 0.016147613525390625, + "learning_rate": 6.496968239287603e-07, + "loss": 0.0006, + "reward": 0.18704321165569127, + "reward_std": 0.727390356361866, + "rewards/cosine_scaled_reward": -0.07570383511483669, + "rewards/format_reward": 0.7500000055879354, + "step": 243 + }, + { + "completion_length": 1609.1041870117188, + "epoch": 0.27885714285714286, + "grad_norm": 0.42587101459503174, + "kl": 0.016376495361328125, + "learning_rate": 6.466308972251785e-07, + "loss": 0.0007, + "reward": 0.5847336421720684, + "reward_std": 0.9021250456571579, + "rewards/cosine_scaled_reward": 0.07377137243747711, + "rewards/format_reward": 0.8750000149011612, + "step": 244 + }, + { + "completion_length": 2179.520896911621, + "epoch": 0.28, + "grad_norm": 0.5839371085166931, + "kl": 0.019596099853515625, + "learning_rate": 6.435602608679916e-07, + "loss": 0.0008, + "reward": 0.45881569012999535, + "reward_std": 1.0688990727066994, + "rewards/cosine_scaled_reward": 0.05305776512250304, + "rewards/format_reward": 0.6666666828095913, + "step": 245 + }, + { + "completion_length": 1406.6667175292969, + "epoch": 0.28114285714285714, + "grad_norm": 0.28245845437049866, + "kl": 0.012783050537109375, + "learning_rate": 6.404850645156841e-07, + "loss": 0.0005, + "reward": 0.353425451554358, + "reward_std": 0.7201703079044819, + "rewards/cosine_scaled_reward": -0.04003934998763725, + "rewards/format_reward": 0.8958333395421505, + "step": 246 + }, + { + "completion_length": 2307.7500610351562, + "epoch": 0.2822857142857143, + "grad_norm": 0.48197513818740845, + "kl": 0.0279693603515625, + "learning_rate": 6.374054580489873e-07, + "loss": 0.0011, + "reward": -0.030684850877150893, + "reward_std": 0.7912298962473869, + "rewards/cosine_scaled_reward": -0.17165578715503216, + "rewards/format_reward": 0.5833333488553762, + "step": 247 + }, + { + "completion_length": 1537.020866394043, + "epoch": 0.2834285714285714, + "grad_norm": 0.5387913584709167, + "kl": 0.020456314086914062, + "learning_rate": 6.343215915635761e-07, + "loss": 0.0008, + "reward": 0.5385480709373951, + "reward_std": 0.7778996899724007, + "rewards/cosine_scaled_reward": 0.18887367472052574, + "rewards/format_reward": 0.7916666679084301, + "step": 248 + }, + { + "completion_length": 1561.6042098999023, + "epoch": 0.2845714285714286, + "grad_norm": 0.3947852551937103, + "kl": 0.021512985229492188, + "learning_rate": 6.31233615362752e-07, + "loss": 0.0009, + "reward": 0.4948454611003399, + "reward_std": 0.8307998143136501, + "rewards/cosine_scaled_reward": 0.12714037066325545, + "rewards/format_reward": 0.7291666734963655, + "step": 249 + }, + { + "completion_length": 1415.1875457763672, + "epoch": 0.2857142857142857, + "grad_norm": 0.48303961753845215, + "kl": 0.01863861083984375, + "learning_rate": 6.281416799501187e-07, + "loss": 0.0007, + "reward": 0.28884256578749046, + "reward_std": 0.7405095249414444, + "rewards/cosine_scaled_reward": -0.05281085259048268, + "rewards/format_reward": 0.8750000074505806, + "step": 250 + }, + { + "completion_length": 1362.2292022705078, + "epoch": 0.28685714285714287, + "grad_norm": 0.4910474717617035, + "kl": 0.02812957763671875, + "learning_rate": 6.25045936022246e-07, + "loss": 0.0011, + "reward": 0.24397201603278518, + "reward_std": 0.8808831572532654, + "rewards/cosine_scaled_reward": -0.09501386666670442, + "rewards/format_reward": 0.7916666734963655, + "step": 251 + }, + { + "completion_length": 1879.4791946411133, + "epoch": 0.288, + "grad_norm": 0.5786982774734497, + "kl": 0.03542327880859375, + "learning_rate": 6.219465344613258e-07, + "loss": 0.0014, + "reward": 0.003026331774890423, + "reward_std": 0.5837202109396458, + "rewards/cosine_scaled_reward": -0.1362874787300825, + "rewards/format_reward": 0.729166679084301, + "step": 252 + }, + { + "completion_length": 1624.1250381469727, + "epoch": 0.28914285714285715, + "grad_norm": 0.6477235555648804, + "kl": 0.037837982177734375, + "learning_rate": 6.188436263278172e-07, + "loss": 0.0015, + "reward": 0.24255571886897087, + "reward_std": 0.933814812451601, + "rewards/cosine_scaled_reward": -0.09086994710378349, + "rewards/format_reward": 0.7291666697710752, + "step": 253 + }, + { + "completion_length": 1799.5209007263184, + "epoch": 0.29028571428571426, + "grad_norm": 0.4789735972881317, + "kl": 0.0286407470703125, + "learning_rate": 6.157373628530852e-07, + "loss": 0.0011, + "reward": 0.2414424503222108, + "reward_std": 0.7898707538843155, + "rewards/cosine_scaled_reward": -0.08134954981505871, + "rewards/format_reward": 0.7916666865348816, + "step": 254 + }, + { + "completion_length": 2204.27091217041, + "epoch": 0.2914285714285714, + "grad_norm": 0.45766016840934753, + "kl": 0.03482818603515625, + "learning_rate": 6.126278954320294e-07, + "loss": 0.0014, + "reward": 0.07502584741450846, + "reward_std": 0.9448065534234047, + "rewards/cosine_scaled_reward": -0.15562699240399525, + "rewards/format_reward": 0.6666666753590107, + "step": 255 + }, + { + "completion_length": 1663.729232788086, + "epoch": 0.2925714285714286, + "grad_norm": 0.5182092785835266, + "kl": 0.0309906005859375, + "learning_rate": 6.095153756157051e-07, + "loss": 0.0012, + "reward": 0.2998895291239023, + "reward_std": 0.7498802877962589, + "rewards/cosine_scaled_reward": -0.001917465589940548, + "rewards/format_reward": 0.7708333395421505, + "step": 256 + }, + { + "completion_length": 2637.0625762939453, + "epoch": 0.2937142857142857, + "grad_norm": 0.5543068647384644, + "kl": 0.0585174560546875, + "learning_rate": 6.06399955103937e-07, + "loss": 0.0023, + "reward": 0.48659578152000904, + "reward_std": 1.1908883340656757, + "rewards/cosine_scaled_reward": 0.1260754211107269, + "rewards/format_reward": 0.604166679084301, + "step": 257 + }, + { + "completion_length": 2023.2708892822266, + "epoch": 0.2948571428571429, + "grad_norm": 0.5329146385192871, + "kl": 0.03546142578125, + "learning_rate": 6.032817857379256e-07, + "loss": 0.0014, + "reward": 0.459288542624563, + "reward_std": 0.8846051767468452, + "rewards/cosine_scaled_reward": 0.051098582334816456, + "rewards/format_reward": 0.7708333544433117, + "step": 258 + }, + { + "completion_length": 1453.2708625793457, + "epoch": 0.296, + "grad_norm": 0.43237462639808655, + "kl": 0.02968597412109375, + "learning_rate": 6.001610194928464e-07, + "loss": 0.0012, + "reward": 0.3637466989457607, + "reward_std": 0.8164427168667316, + "rewards/cosine_scaled_reward": 0.06898940447717905, + "rewards/format_reward": 0.7708333414047956, + "step": 259 + }, + { + "completion_length": 944.4791851043701, + "epoch": 0.29714285714285715, + "grad_norm": 0.6447161436080933, + "kl": 0.018672943115234375, + "learning_rate": 5.97037808470444e-07, + "loss": 0.0007, + "reward": 0.5366823731455952, + "reward_std": 0.7269707396626472, + "rewards/cosine_scaled_reward": 0.11251152493059635, + "rewards/format_reward": 0.916666679084301, + "step": 260 + }, + { + "completion_length": 1973.9583740234375, + "epoch": 0.29828571428571427, + "grad_norm": 0.37675243616104126, + "kl": 0.03509521484375, + "learning_rate": 5.939123048916173e-07, + "loss": 0.0014, + "reward": 0.13344457978382707, + "reward_std": 0.8264882601797581, + "rewards/cosine_scaled_reward": -0.09806014783680439, + "rewards/format_reward": 0.6666666734963655, + "step": 261 + }, + { + "completion_length": 1516.6875228881836, + "epoch": 0.29942857142857143, + "grad_norm": 0.5359493494033813, + "kl": 0.03139495849609375, + "learning_rate": 5.907846610890011e-07, + "loss": 0.0013, + "reward": -0.0372560010291636, + "reward_std": 0.6091209948062897, + "rewards/cosine_scaled_reward": -0.212333626113832, + "rewards/format_reward": 0.729166679084301, + "step": 262 + }, + { + "completion_length": 1346.3125228881836, + "epoch": 0.30057142857142854, + "grad_norm": 0.3749285638332367, + "kl": 0.0174713134765625, + "learning_rate": 5.87655029499542e-07, + "loss": 0.0007, + "reward": 0.17701542098075151, + "reward_std": 0.7833218686282635, + "rewards/cosine_scaled_reward": -0.15012040082365274, + "rewards/format_reward": 0.8541666772216558, + "step": 263 + }, + { + "completion_length": 1433.2708740234375, + "epoch": 0.3017142857142857, + "grad_norm": 0.4387812614440918, + "kl": 0.03086090087890625, + "learning_rate": 5.845235626570683e-07, + "loss": 0.0012, + "reward": 0.2682348359376192, + "reward_std": 0.736188217997551, + "rewards/cosine_scaled_reward": -0.10726138763129711, + "rewards/format_reward": 0.8958333432674408, + "step": 264 + }, + { + "completion_length": 1286.4583587646484, + "epoch": 0.3028571428571429, + "grad_norm": 0.5124613642692566, + "kl": 0.029205322265625, + "learning_rate": 5.813904131848564e-07, + "loss": 0.0012, + "reward": 0.47514245874481276, + "reward_std": 0.7719651460647583, + "rewards/cosine_scaled_reward": 0.03089581150561571, + "rewards/format_reward": 0.8958333507180214, + "step": 265 + }, + { + "completion_length": 1764.3958892822266, + "epoch": 0.304, + "grad_norm": 0.5820686221122742, + "kl": 0.04436492919921875, + "learning_rate": 5.78255733788191e-07, + "loss": 0.0018, + "reward": 0.060105842188932, + "reward_std": 0.6623933054506779, + "rewards/cosine_scaled_reward": -0.15320268645882607, + "rewards/format_reward": 0.7500000074505806, + "step": 266 + }, + { + "completion_length": 1691.2917022705078, + "epoch": 0.30514285714285716, + "grad_norm": 0.3804147243499756, + "kl": 0.05069732666015625, + "learning_rate": 5.751196772469237e-07, + "loss": 0.002, + "reward": 0.010469182627275586, + "reward_std": 0.6885622590780258, + "rewards/cosine_scaled_reward": -0.203377990052104, + "rewards/format_reward": 0.7500000149011612, + "step": 267 + }, + { + "completion_length": 1172.3125305175781, + "epoch": 0.3062857142857143, + "grad_norm": 0.5410240888595581, + "kl": 0.036266326904296875, + "learning_rate": 5.71982396408026e-07, + "loss": 0.0015, + "reward": 0.10452710837125778, + "reward_std": 0.667732447385788, + "rewards/cosine_scaled_reward": -0.19316846132278442, + "rewards/format_reward": 0.895833333954215, + "step": 268 + }, + { + "completion_length": 1373.541706085205, + "epoch": 0.30742857142857144, + "grad_norm": 0.5952326655387878, + "kl": 0.033191680908203125, + "learning_rate": 5.688440441781398e-07, + "loss": 0.0013, + "reward": 0.33500073617324233, + "reward_std": 0.7064723633229733, + "rewards/cosine_scaled_reward": -0.04047585092484951, + "rewards/format_reward": 0.8958333432674408, + "step": 269 + }, + { + "completion_length": 1461.0000228881836, + "epoch": 0.30857142857142855, + "grad_norm": 0.3846686780452728, + "kl": 0.047119140625, + "learning_rate": 5.657047735161255e-07, + "loss": 0.0019, + "reward": 0.48025781381875277, + "reward_std": 0.8026834316551685, + "rewards/cosine_scaled_reward": 0.06730105105089024, + "rewards/format_reward": 0.8750000055879354, + "step": 270 + }, + { + "completion_length": 1104.7500190734863, + "epoch": 0.3097142857142857, + "grad_norm": 0.3378467559814453, + "kl": 0.02167510986328125, + "learning_rate": 5.625647374256061e-07, + "loss": 0.0009, + "reward": 0.512184641789645, + "reward_std": 0.6890225373208523, + "rewards/cosine_scaled_reward": 0.11897583678364754, + "rewards/format_reward": 0.9583333432674408, + "step": 271 + }, + { + "completion_length": 1595.5625610351562, + "epoch": 0.31085714285714283, + "grad_norm": 0.9565722346305847, + "kl": 0.0341033935546875, + "learning_rate": 5.594240889475106e-07, + "loss": 0.0014, + "reward": 0.2559305219911039, + "reward_std": 0.7282034941017628, + "rewards/cosine_scaled_reward": -0.0902782422490418, + "rewards/format_reward": 0.8750000055879354, + "step": 272 + }, + { + "completion_length": 1564.2500534057617, + "epoch": 0.312, + "grad_norm": 0.6573230624198914, + "kl": 0.06351470947265625, + "learning_rate": 5.562829811526154e-07, + "loss": 0.0025, + "reward": 0.33316121553070843, + "reward_std": 0.7983997203409672, + "rewards/cosine_scaled_reward": 0.00879891961812973, + "rewards/format_reward": 0.7916666753590107, + "step": 273 + }, + { + "completion_length": 908.3750190734863, + "epoch": 0.31314285714285717, + "grad_norm": 0.3975994288921356, + "kl": 0.020572662353515625, + "learning_rate": 5.531415671340826e-07, + "loss": 0.0008, + "reward": 0.490811045630835, + "reward_std": 0.7964842580258846, + "rewards/cosine_scaled_reward": 0.027651330456137657, + "rewards/format_reward": 0.9791666716337204, + "step": 274 + }, + { + "completion_length": 1273.416706085205, + "epoch": 0.3142857142857143, + "grad_norm": 0.5143330693244934, + "kl": 0.0457763671875, + "learning_rate": 5.5e-07, + "loss": 0.0018, + "reward": 0.5196955967694521, + "reward_std": 0.8949318751692772, + "rewards/cosine_scaled_reward": 0.0634711142629385, + "rewards/format_reward": 0.9166666865348816, + "step": 275 + }, + { + "completion_length": 1216.2917175292969, + "epoch": 0.31542857142857145, + "grad_norm": 0.5758770704269409, + "kl": 0.04427337646484375, + "learning_rate": 5.468584328659172e-07, + "loss": 0.0018, + "reward": 0.45866466453298926, + "reward_std": 0.7997590340673923, + "rewards/cosine_scaled_reward": 0.004157306393608451, + "rewards/format_reward": 0.9166666716337204, + "step": 276 + }, + { + "completion_length": 1405.8542022705078, + "epoch": 0.31657142857142856, + "grad_norm": 0.7709969878196716, + "kl": 0.07879257202148438, + "learning_rate": 5.437170188473847e-07, + "loss": 0.0032, + "reward": 0.3928624112159014, + "reward_std": 0.8682146407663822, + "rewards/cosine_scaled_reward": -0.0433911276049912, + "rewards/format_reward": 0.8958333358168602, + "step": 277 + }, + { + "completion_length": 1251.5834045410156, + "epoch": 0.3177142857142857, + "grad_norm": 0.4815487265586853, + "kl": 0.028911590576171875, + "learning_rate": 5.405759110524894e-07, + "loss": 0.0012, + "reward": 0.602864139713347, + "reward_std": 0.75666194409132, + "rewards/cosine_scaled_reward": 0.1303448430262506, + "rewards/format_reward": 0.9583333432674408, + "step": 278 + }, + { + "completion_length": 1584.2292098999023, + "epoch": 0.31885714285714284, + "grad_norm": 0.676331102848053, + "kl": 0.04915618896484375, + "learning_rate": 5.37435262574394e-07, + "loss": 0.002, + "reward": 0.1920575883705169, + "reward_std": 0.5915602222084999, + "rewards/cosine_scaled_reward": -0.08263814821839333, + "rewards/format_reward": 0.8541666716337204, + "step": 279 + }, + { + "completion_length": 1887.104232788086, + "epoch": 0.32, + "grad_norm": 0.5340350270271301, + "kl": 0.09076309204101562, + "learning_rate": 5.342952264838747e-07, + "loss": 0.0036, + "reward": 0.554422979708761, + "reward_std": 1.0056494362652302, + "rewards/cosine_scaled_reward": 0.14068329893052578, + "rewards/format_reward": 0.7291666753590107, + "step": 280 + }, + { + "completion_length": 2119.9583892822266, + "epoch": 0.3211428571428571, + "grad_norm": 0.901451587677002, + "kl": 0.10712432861328125, + "learning_rate": 5.311559558218603e-07, + "loss": 0.0043, + "reward": -0.04842189947521547, + "reward_std": 0.735725924372673, + "rewards/cosine_scaled_reward": -0.18730824999511242, + "rewards/format_reward": 0.6041666772216558, + "step": 281 + }, + { + "completion_length": 1546.7708625793457, + "epoch": 0.3222857142857143, + "grad_norm": 0.5774481892585754, + "kl": 0.08298110961914062, + "learning_rate": 5.28017603591974e-07, + "loss": 0.0033, + "reward": 0.4459398053586483, + "reward_std": 0.8270345889031887, + "rewards/cosine_scaled_reward": 0.038561356253921986, + "rewards/format_reward": 0.875, + "step": 282 + }, + { + "completion_length": 1432.2292175292969, + "epoch": 0.32342857142857145, + "grad_norm": 0.6603187322616577, + "kl": 0.06803131103515625, + "learning_rate": 5.248803227530763e-07, + "loss": 0.0027, + "reward": 0.3603776376694441, + "reward_std": 0.8584562204778194, + "rewards/cosine_scaled_reward": -0.058407315984368324, + "rewards/format_reward": 0.8541666828095913, + "step": 283 + }, + { + "completion_length": 1322.5625228881836, + "epoch": 0.32457142857142857, + "grad_norm": 0.5359531044960022, + "kl": 0.06329727172851562, + "learning_rate": 5.21744266211809e-07, + "loss": 0.0025, + "reward": 0.09239194821566343, + "reward_std": 0.6924001406878233, + "rewards/cosine_scaled_reward": -0.19309467636048794, + "rewards/format_reward": 0.8541666716337204, + "step": 284 + }, + { + "completion_length": 829.2917022705078, + "epoch": 0.32571428571428573, + "grad_norm": 0.4555800259113312, + "kl": 0.037982940673828125, + "learning_rate": 5.186095868151436e-07, + "loss": 0.0015, + "reward": 0.4122108933515847, + "reward_std": 0.7844538278877735, + "rewards/cosine_scaled_reward": 0.023661921732127666, + "rewards/format_reward": 0.9375000074505806, + "step": 285 + }, + { + "completion_length": 1359.5417098999023, + "epoch": 0.32685714285714285, + "grad_norm": 0.5877644419670105, + "kl": 0.05910491943359375, + "learning_rate": 5.154764373429315e-07, + "loss": 0.0024, + "reward": 0.2864837823435664, + "reward_std": 0.8134515210986137, + "rewards/cosine_scaled_reward": -0.10266671486897394, + "rewards/format_reward": 0.8750000223517418, + "step": 286 + }, + { + "completion_length": 1420.1250343322754, + "epoch": 0.328, + "grad_norm": 1.0718910694122314, + "kl": 0.14565277099609375, + "learning_rate": 5.123449705004581e-07, + "loss": 0.0058, + "reward": 0.23910232353955507, + "reward_std": 0.6846515089273453, + "rewards/cosine_scaled_reward": -0.03220707958098501, + "rewards/format_reward": 0.812500013038516, + "step": 287 + }, + { + "completion_length": 1311.1666793823242, + "epoch": 0.3291428571428571, + "grad_norm": 0.7865566611289978, + "kl": 0.07602310180664062, + "learning_rate": 5.09215338910999e-07, + "loss": 0.003, + "reward": 0.4210634557530284, + "reward_std": 0.8144961148500443, + "rewards/cosine_scaled_reward": -0.004126264713704586, + "rewards/format_reward": 0.9583333432674408, + "step": 288 + }, + { + "completion_length": 1409.8125495910645, + "epoch": 0.3302857142857143, + "grad_norm": 1.233174204826355, + "kl": 0.0773468017578125, + "learning_rate": 5.060876951083828e-07, + "loss": 0.0031, + "reward": 0.2632339745759964, + "reward_std": 0.6024043373763561, + "rewards/cosine_scaled_reward": 0.0020552128553390503, + "rewards/format_reward": 0.8541666679084301, + "step": 289 + }, + { + "completion_length": 1338.3125381469727, + "epoch": 0.3314285714285714, + "grad_norm": 0.5929808616638184, + "kl": 0.119293212890625, + "learning_rate": 5.02962191529556e-07, + "loss": 0.0048, + "reward": 0.48768392187776044, + "reward_std": 0.8819490298628807, + "rewards/cosine_scaled_reward": -0.019131449982523918, + "rewards/format_reward": 0.9583333432674408, + "step": 290 + }, + { + "completion_length": 1393.5208892822266, + "epoch": 0.3325714285714286, + "grad_norm": 0.6517693996429443, + "kl": 0.109130859375, + "learning_rate": 4.998389805071536e-07, + "loss": 0.0044, + "reward": 0.44597443053498864, + "reward_std": 0.852834016084671, + "rewards/cosine_scaled_reward": -0.036098250187933445, + "rewards/format_reward": 0.9375000074505806, + "step": 291 + }, + { + "completion_length": 1563.4375228881836, + "epoch": 0.33371428571428574, + "grad_norm": 0.813820481300354, + "kl": 0.12435150146484375, + "learning_rate": 4.967182142620745e-07, + "loss": 0.005, + "reward": 0.1114344063680619, + "reward_std": 0.6936771422624588, + "rewards/cosine_scaled_reward": -0.18143241526558995, + "rewards/format_reward": 0.8541666716337204, + "step": 292 + }, + { + "completion_length": 1062.9166870117188, + "epoch": 0.33485714285714285, + "grad_norm": 1.081771969795227, + "kl": 0.08338165283203125, + "learning_rate": 4.93600044896063e-07, + "loss": 0.0033, + "reward": 0.35100130061618984, + "reward_std": 0.6911906227469444, + "rewards/cosine_scaled_reward": -0.036540206521749496, + "rewards/format_reward": 0.916666679084301, + "step": 293 + }, + { + "completion_length": 1598.583396911621, + "epoch": 0.336, + "grad_norm": 0.9857237935066223, + "kl": 0.11139678955078125, + "learning_rate": 4.904846243842949e-07, + "loss": 0.0045, + "reward": 0.2396585661917925, + "reward_std": 0.7594392895698547, + "rewards/cosine_scaled_reward": -0.07188234385102987, + "rewards/format_reward": 0.8125000149011612, + "step": 294 + }, + { + "completion_length": 1559.4167251586914, + "epoch": 0.33714285714285713, + "grad_norm": 0.7534098625183105, + "kl": 0.13663482666015625, + "learning_rate": 4.873721045679706e-07, + "loss": 0.0055, + "reward": 0.35895493626594543, + "reward_std": 0.9453681632876396, + "rewards/cosine_scaled_reward": -0.029410481452941895, + "rewards/format_reward": 0.8333333507180214, + "step": 295 + }, + { + "completion_length": 1997.6042175292969, + "epoch": 0.3382857142857143, + "grad_norm": 1.1301641464233398, + "kl": 0.2256317138671875, + "learning_rate": 4.842626371469149e-07, + "loss": 0.009, + "reward": 0.21912480238825083, + "reward_std": 0.8136032223701477, + "rewards/cosine_scaled_reward": -0.07540793996304274, + "rewards/format_reward": 0.7500000074505806, + "step": 296 + }, + { + "completion_length": 2203.666717529297, + "epoch": 0.3394285714285714, + "grad_norm": 2.0461599826812744, + "kl": 0.2470703125, + "learning_rate": 4.811563736721829e-07, + "loss": 0.0099, + "reward": 0.0711959432810545, + "reward_std": 0.7955613285303116, + "rewards/cosine_scaled_reward": -0.13813064247369766, + "rewards/format_reward": 0.625000013038516, + "step": 297 + }, + { + "completion_length": 1511.0417098999023, + "epoch": 0.3405714285714286, + "grad_norm": 2.8553810119628906, + "kl": 0.27417755126953125, + "learning_rate": 4.780534655386743e-07, + "loss": 0.011, + "reward": 0.22325835039373487, + "reward_std": 0.7070137523114681, + "rewards/cosine_scaled_reward": -0.10693333297967911, + "rewards/format_reward": 0.8750000223517418, + "step": 298 + }, + { + "completion_length": 1423.8750381469727, + "epoch": 0.3417142857142857, + "grad_norm": 0.9705626368522644, + "kl": 0.1470947265625, + "learning_rate": 4.749540639777539e-07, + "loss": 0.0059, + "reward": 0.20597740169614553, + "reward_std": 0.7050324305891991, + "rewards/cosine_scaled_reward": -0.06768750678747892, + "rewards/format_reward": 0.7916666846722364, + "step": 299 + }, + { + "completion_length": 1679.583381652832, + "epoch": 0.34285714285714286, + "grad_norm": 1.6013585329055786, + "kl": 0.2515869140625, + "learning_rate": 4.7185832004988133e-07, + "loss": 0.0101, + "reward": 0.16855086106806993, + "reward_std": 0.7662245742976665, + "rewards/cosine_scaled_reward": -0.11777076427824795, + "rewards/format_reward": 0.7291666902601719, + "step": 300 + }, + { + "completion_length": 1467.8333740234375, + "epoch": 0.344, + "grad_norm": 1.4137245416641235, + "kl": 0.14666748046875, + "learning_rate": 4.68766384637248e-07, + "loss": 0.0059, + "reward": 0.34917816892266273, + "reward_std": 0.826218631118536, + "rewards/cosine_scaled_reward": -0.0683151277480647, + "rewards/format_reward": 0.916666679084301, + "step": 301 + }, + { + "completion_length": 1356.4167175292969, + "epoch": 0.34514285714285714, + "grad_norm": 1.1879962682724, + "kl": 0.149932861328125, + "learning_rate": 4.656784084364238e-07, + "loss": 0.006, + "reward": 0.36293663922697306, + "reward_std": 0.8348271325230598, + "rewards/cosine_scaled_reward": 0.008066533133387566, + "rewards/format_reward": 0.8125000111758709, + "step": 302 + }, + { + "completion_length": 1341.3541870117188, + "epoch": 0.3462857142857143, + "grad_norm": 1.171494483947754, + "kl": 0.18246078491210938, + "learning_rate": 4.6259454195101267e-07, + "loss": 0.0073, + "reward": 0.2431361076887697, + "reward_std": 0.7933157682418823, + "rewards/cosine_scaled_reward": -0.1022418315988034, + "rewards/format_reward": 0.854166679084301, + "step": 303 + }, + { + "completion_length": 1238.6667175292969, + "epoch": 0.3474285714285714, + "grad_norm": 0.9185214042663574, + "kl": 0.08075714111328125, + "learning_rate": 4.59514935484316e-07, + "loss": 0.0032, + "reward": 0.26590197812765837, + "reward_std": 0.6553149335086346, + "rewards/cosine_scaled_reward": -0.07919650059193373, + "rewards/format_reward": 0.8958333358168602, + "step": 304 + }, + { + "completion_length": 1323.9791946411133, + "epoch": 0.3485714285714286, + "grad_norm": 1.0339250564575195, + "kl": 0.17450332641601562, + "learning_rate": 4.5643973913200837e-07, + "loss": 0.007, + "reward": 0.13226244208635762, + "reward_std": 0.6713650859892368, + "rewards/cosine_scaled_reward": -0.16428566398099065, + "rewards/format_reward": 0.8541666865348816, + "step": 305 + }, + { + "completion_length": 1336.8959045410156, + "epoch": 0.3497142857142857, + "grad_norm": 1.3460803031921387, + "kl": 0.196563720703125, + "learning_rate": 4.5336910277482155e-07, + "loss": 0.0079, + "reward": 0.6428653690963984, + "reward_std": 1.0474185049533844, + "rewards/cosine_scaled_reward": 0.14976803492754698, + "rewards/format_reward": 0.8541666716337204, + "step": 306 + }, + { + "completion_length": 1476.3542098999023, + "epoch": 0.35085714285714287, + "grad_norm": 2.0747716426849365, + "kl": 0.23199462890625, + "learning_rate": 4.503031760712397e-07, + "loss": 0.0093, + "reward": 0.3243725663051009, + "reward_std": 0.7664116658270359, + "rewards/cosine_scaled_reward": -0.016992317512631416, + "rewards/format_reward": 0.8541666716337204, + "step": 307 + }, + { + "completion_length": 2029.166732788086, + "epoch": 0.352, + "grad_norm": 1.3981530666351318, + "kl": 0.3675079345703125, + "learning_rate": 4.4724210845020494e-07, + "loss": 0.0147, + "reward": 0.14206439611734822, + "reward_std": 0.7617513313889503, + "rewards/cosine_scaled_reward": -0.12085138214752078, + "rewards/format_reward": 0.7500000111758709, + "step": 308 + }, + { + "completion_length": 1717.7292175292969, + "epoch": 0.35314285714285715, + "grad_norm": 1.3776211738586426, + "kl": 0.27923583984375, + "learning_rate": 4.441860491038345e-07, + "loss": 0.0112, + "reward": 0.26929385494440794, + "reward_std": 0.8059471026062965, + "rewards/cosine_scaled_reward": -0.06185213173739612, + "rewards/format_reward": 0.7500000223517418, + "step": 309 + }, + { + "completion_length": 1247.7917098999023, + "epoch": 0.35428571428571426, + "grad_norm": 1.195373773574829, + "kl": 0.13482666015625, + "learning_rate": 4.4113514698014953e-07, + "loss": 0.0054, + "reward": 0.20025933103170246, + "reward_std": 0.6624983102083206, + "rewards/cosine_scaled_reward": -0.13693573721684515, + "rewards/format_reward": 0.8958333432674408, + "step": 310 + }, + { + "completion_length": 1230.8333702087402, + "epoch": 0.3554285714285714, + "grad_norm": 1.7214908599853516, + "kl": 0.16667938232421875, + "learning_rate": 4.3808955077581546e-07, + "loss": 0.0067, + "reward": 0.48247593361884356, + "reward_std": 0.8122721910476685, + "rewards/cosine_scaled_reward": 0.02081843838095665, + "rewards/format_reward": 0.8958333432674408, + "step": 311 + }, + { + "completion_length": 1083.6875228881836, + "epoch": 0.3565714285714286, + "grad_norm": 1.3621957302093506, + "kl": 0.15818023681640625, + "learning_rate": 4.350494089288943e-07, + "loss": 0.0063, + "reward": 0.5721160881221294, + "reward_std": 0.6696533262729645, + "rewards/cosine_scaled_reward": 0.23700525425374508, + "rewards/format_reward": 0.8750000111758709, + "step": 312 + }, + { + "completion_length": 1498.3750228881836, + "epoch": 0.3577142857142857, + "grad_norm": 1.5137181282043457, + "kl": 0.31603240966796875, + "learning_rate": 4.3201486961161093e-07, + "loss": 0.0126, + "reward": 0.4493098706007004, + "reward_std": 0.8899626843631268, + "rewards/cosine_scaled_reward": 0.060429781675338745, + "rewards/format_reward": 0.7916666734963655, + "step": 313 + }, + { + "completion_length": 1152.7083702087402, + "epoch": 0.3588571428571429, + "grad_norm": 1.3737326860427856, + "kl": 0.178466796875, + "learning_rate": 4.2898608072313045e-07, + "loss": 0.0071, + "reward": 0.42835053242743015, + "reward_std": 0.7351183108985424, + "rewards/cosine_scaled_reward": 0.03823569389896875, + "rewards/format_reward": 0.8750000223517418, + "step": 314 + }, + { + "completion_length": 1325.7709045410156, + "epoch": 0.36, + "grad_norm": 4.047604560852051, + "kl": 0.39301300048828125, + "learning_rate": 4.2596318988235037e-07, + "loss": 0.0157, + "reward": 0.33133709616959095, + "reward_std": 0.82825917750597, + "rewards/cosine_scaled_reward": 0.003983840346336365, + "rewards/format_reward": 0.7708333507180214, + "step": 315 + }, + { + "completion_length": 1725.6042137145996, + "epoch": 0.36114285714285715, + "grad_norm": 2.519155979156494, + "kl": 0.63311767578125, + "learning_rate": 4.2294634442070553e-07, + "loss": 0.0253, + "reward": 0.09543975442647934, + "reward_std": 0.8387586250901222, + "rewards/cosine_scaled_reward": -0.15274116187356412, + "rewards/format_reward": 0.7083333507180214, + "step": 316 + }, + { + "completion_length": 1519.8333740234375, + "epoch": 0.36228571428571427, + "grad_norm": 2.768705368041992, + "kl": 0.483062744140625, + "learning_rate": 4.1993569137498776e-07, + "loss": 0.0193, + "reward": 0.35648738220334053, + "reward_std": 0.8188385404646397, + "rewards/cosine_scaled_reward": -0.0025814222171902657, + "rewards/format_reward": 0.7708333469927311, + "step": 317 + }, + { + "completion_length": 1004.2500343322754, + "epoch": 0.36342857142857143, + "grad_norm": 0.982016921043396, + "kl": 0.1404876708984375, + "learning_rate": 4.1693137748017915e-07, + "loss": 0.0056, + "reward": 0.17383845895528793, + "reward_std": 0.699553694576025, + "rewards/cosine_scaled_reward": -0.15935352514497936, + "rewards/format_reward": 0.916666679084301, + "step": 318 + }, + { + "completion_length": 1440.8750305175781, + "epoch": 0.36457142857142855, + "grad_norm": 2.472060203552246, + "kl": 0.6479034423828125, + "learning_rate": 4.1393354916230005e-07, + "loss": 0.0259, + "reward": 0.10745102865621448, + "reward_std": 0.8851590566337109, + "rewards/cosine_scaled_reward": -0.18445847602561116, + "rewards/format_reward": 0.7500000186264515, + "step": 319 + }, + { + "completion_length": 872.4791946411133, + "epoch": 0.3657142857142857, + "grad_norm": 1.969278335571289, + "kl": 0.3460540771484375, + "learning_rate": 4.1094235253127374e-07, + "loss": 0.0139, + "reward": 0.4431753905955702, + "reward_std": 0.7728302180767059, + "rewards/cosine_scaled_reward": 0.03887226711958647, + "rewards/format_reward": 0.8958333507180214, + "step": 320 + }, + { + "completion_length": 1001.3541946411133, + "epoch": 0.3668571428571429, + "grad_norm": 1.5442945957183838, + "kl": 0.4463653564453125, + "learning_rate": 4.079579333738039e-07, + "loss": 0.0179, + "reward": 0.5132068395614624, + "reward_std": 0.9250943809747696, + "rewards/cosine_scaled_reward": 0.06521461345255375, + "rewards/format_reward": 0.8750000074505806, + "step": 321 + }, + { + "completion_length": 1230.1666946411133, + "epoch": 0.368, + "grad_norm": 4.6210222244262695, + "kl": 0.856231689453125, + "learning_rate": 4.0498043714627006e-07, + "loss": 0.0342, + "reward": 0.21194136049598455, + "reward_std": 0.8229392170906067, + "rewards/cosine_scaled_reward": -0.1281059831380844, + "rewards/format_reward": 0.8125000298023224, + "step": 322 + }, + { + "completion_length": 1046.5833740234375, + "epoch": 0.36914285714285716, + "grad_norm": 2.3179831504821777, + "kl": 0.600982666015625, + "learning_rate": 4.020100089676376e-07, + "loss": 0.0241, + "reward": 0.17017683573067188, + "reward_std": 0.6333070918917656, + "rewards/cosine_scaled_reward": -0.09792062174528837, + "rewards/format_reward": 0.8333333507180214, + "step": 323 + }, + { + "completion_length": 1352.1666946411133, + "epoch": 0.3702857142857143, + "grad_norm": 4.481067657470703, + "kl": 1.1396102905273438, + "learning_rate": 3.9904679361238526e-07, + "loss": 0.0455, + "reward": 0.1608973522670567, + "reward_std": 0.7919485196471214, + "rewards/cosine_scaled_reward": -0.13527273340150714, + "rewards/format_reward": 0.770833358168602, + "step": 324 + }, + { + "completion_length": 1022.145866394043, + "epoch": 0.37142857142857144, + "grad_norm": 3.168888807296753, + "kl": 0.7562255859375, + "learning_rate": 3.9609093550344907e-07, + "loss": 0.0303, + "reward": 0.255460548796691, + "reward_std": 0.714014045894146, + "rewards/cosine_scaled_reward": -0.0987097217439441, + "rewards/format_reward": 0.8750000149011612, + "step": 325 + }, + { + "completion_length": 1067.8541946411133, + "epoch": 0.37257142857142855, + "grad_norm": 6.944228172302246, + "kl": 0.607666015625, + "learning_rate": 3.931425787051832e-07, + "loss": 0.0243, + "reward": 0.34114629309624434, + "reward_std": 0.7636858969926834, + "rewards/cosine_scaled_reward": -0.050329115241765976, + "rewards/format_reward": 0.8958333507180214, + "step": 326 + }, + { + "completion_length": 1227.4375610351562, + "epoch": 0.3737142857142857, + "grad_norm": 1.7565664052963257, + "kl": 0.2918243408203125, + "learning_rate": 3.902018669163384e-07, + "loss": 0.0117, + "reward": 0.4599468493834138, + "reward_std": 0.6929560974240303, + "rewards/cosine_scaled_reward": 0.0536233875900507, + "rewards/format_reward": 0.8958333507180214, + "step": 327 + }, + { + "completion_length": 1539.4792556762695, + "epoch": 0.37485714285714283, + "grad_norm": 7.016115188598633, + "kl": 1.123626708984375, + "learning_rate": 3.872689434630585e-07, + "loss": 0.045, + "reward": 0.051082022255286574, + "reward_std": 0.7887684628367424, + "rewards/cosine_scaled_reward": -0.199990039691329, + "rewards/format_reward": 0.7500000186264515, + "step": 328 + }, + { + "completion_length": 960.0208778381348, + "epoch": 0.376, + "grad_norm": 2.32981014251709, + "kl": 0.53167724609375, + "learning_rate": 3.843439512918949e-07, + "loss": 0.0212, + "reward": 0.3326158430427313, + "reward_std": 0.6615541912615299, + "rewards/cosine_scaled_reward": 0.0014219898730516434, + "rewards/format_reward": 0.8541666716337204, + "step": 329 + }, + { + "completion_length": 1015.9583740234375, + "epoch": 0.37714285714285717, + "grad_norm": 2.246696949005127, + "kl": 0.37122344970703125, + "learning_rate": 3.8142703296283953e-07, + "loss": 0.0149, + "reward": 0.3060060928110033, + "reward_std": 0.7870642617344856, + "rewards/cosine_scaled_reward": -0.09269801783375442, + "rewards/format_reward": 0.9375000074505806, + "step": 330 + }, + { + "completion_length": 1435.8333587646484, + "epoch": 0.3782857142857143, + "grad_norm": 2.061875104904175, + "kl": 1.0668792724609375, + "learning_rate": 3.785183306423767e-07, + "loss": 0.0428, + "reward": 0.12969208881258965, + "reward_std": 0.6653396300971508, + "rewards/cosine_scaled_reward": -0.09254590002819896, + "rewards/format_reward": 0.7708333432674408, + "step": 331 + }, + { + "completion_length": 1591.333366394043, + "epoch": 0.37942857142857145, + "grad_norm": 3.716142177581787, + "kl": 0.729095458984375, + "learning_rate": 3.7561798609655373e-07, + "loss": 0.0291, + "reward": 0.08031724044121802, + "reward_std": 0.6911140829324722, + "rewards/cosine_scaled_reward": -0.18430401291698217, + "rewards/format_reward": 0.7916666939854622, + "step": 332 + }, + { + "completion_length": 1069.1875343322754, + "epoch": 0.38057142857142856, + "grad_norm": 4.158648490905762, + "kl": 0.4372711181640625, + "learning_rate": 3.72726140684072e-07, + "loss": 0.0175, + "reward": 0.406515009701252, + "reward_std": 0.9121326096355915, + "rewards/cosine_scaled_reward": -0.03547548362985253, + "rewards/format_reward": 0.8541666865348816, + "step": 333 + }, + { + "completion_length": 1548.5625305175781, + "epoch": 0.38171428571428573, + "grad_norm": 2.0228302478790283, + "kl": 1.1025390625, + "learning_rate": 3.6984293534939737e-07, + "loss": 0.0441, + "reward": 0.0005341523792594671, + "reward_std": 0.6353342607617378, + "rewards/cosine_scaled_reward": -0.2071850085631013, + "rewards/format_reward": 0.770833358168602, + "step": 334 + }, + { + "completion_length": 1218.1458740234375, + "epoch": 0.38285714285714284, + "grad_norm": 1.6079083681106567, + "kl": 0.4420585632324219, + "learning_rate": 3.6696851061588994e-07, + "loss": 0.0177, + "reward": 0.5085492568905465, + "reward_std": 0.9739002659916878, + "rewards/cosine_scaled_reward": 0.027598066721111536, + "rewards/format_reward": 0.8750000223517418, + "step": 335 + }, + { + "completion_length": 1167.2917175292969, + "epoch": 0.384, + "grad_norm": 2.6033425331115723, + "kl": 0.45883941650390625, + "learning_rate": 3.641030065789562e-07, + "loss": 0.0184, + "reward": 0.43186818808317184, + "reward_std": 0.8531835786998272, + "rewards/cosine_scaled_reward": 0.05448053032159805, + "rewards/format_reward": 0.8125000111758709, + "step": 336 + }, + { + "completion_length": 1327.1042022705078, + "epoch": 0.3851428571428571, + "grad_norm": 3.292928695678711, + "kl": 0.3748321533203125, + "learning_rate": 3.612465628992203e-07, + "loss": 0.015, + "reward": 0.2253723087196704, + "reward_std": 0.7758054882287979, + "rewards/cosine_scaled_reward": -0.13347763079218566, + "rewards/format_reward": 0.895833358168602, + "step": 337 + }, + { + "completion_length": 1171.1875190734863, + "epoch": 0.3862857142857143, + "grad_norm": 6.154543876647949, + "kl": 1.41033935546875, + "learning_rate": 3.5839931879571725e-07, + "loss": 0.0567, + "reward": 0.21605271194130182, + "reward_std": 0.6188852116465569, + "rewards/cosine_scaled_reward": -0.02803803514689207, + "rewards/format_reward": 0.7708333507180214, + "step": 338 + }, + { + "completion_length": 1353.3750381469727, + "epoch": 0.38742857142857146, + "grad_norm": 4.424708843231201, + "kl": 0.518829345703125, + "learning_rate": 3.555614130391079e-07, + "loss": 0.0208, + "reward": 0.09727495489642024, + "reward_std": 0.6290086731314659, + "rewards/cosine_scaled_reward": -0.1305898940190673, + "rewards/format_reward": 0.7291666772216558, + "step": 339 + }, + { + "completion_length": 1321.708381652832, + "epoch": 0.38857142857142857, + "grad_norm": 1.7365025281906128, + "kl": 1.028076171875, + "learning_rate": 3.5273298394491515e-07, + "loss": 0.0411, + "reward": 0.2937823599204421, + "reward_std": 0.857489574700594, + "rewards/cosine_scaled_reward": -0.07948380801826715, + "rewards/format_reward": 0.8333333432674408, + "step": 340 + }, + { + "completion_length": 1008.5208740234375, + "epoch": 0.38971428571428574, + "grad_norm": 2.895829677581787, + "kl": 0.6253280639648438, + "learning_rate": 3.4991416936678276e-07, + "loss": 0.025, + "reward": 0.5628593787550926, + "reward_std": 0.7620139196515083, + "rewards/cosine_scaled_reward": 0.1886910004541278, + "rewards/format_reward": 0.7500000074505806, + "step": 341 + }, + { + "completion_length": 1325.9792175292969, + "epoch": 0.39085714285714285, + "grad_norm": 3.778715133666992, + "kl": 1.060302734375, + "learning_rate": 3.471051066897562e-07, + "loss": 0.0424, + "reward": 0.49639496579766273, + "reward_std": 0.9629452601075172, + "rewards/cosine_scaled_reward": 0.022836442454718053, + "rewards/format_reward": 0.8333333432674408, + "step": 342 + }, + { + "completion_length": 1301.9792098999023, + "epoch": 0.392, + "grad_norm": 4.3889546394348145, + "kl": 0.91851806640625, + "learning_rate": 3.4430593282358777e-07, + "loss": 0.0368, + "reward": 0.5952606732025743, + "reward_std": 0.8778381422162056, + "rewards/cosine_scaled_reward": 0.0880544763058424, + "rewards/format_reward": 0.8750000223517418, + "step": 343 + }, + { + "completion_length": 1580.520866394043, + "epoch": 0.3931428571428571, + "grad_norm": 2.815673589706421, + "kl": 1.52044677734375, + "learning_rate": 3.4151678419606233e-07, + "loss": 0.0607, + "reward": 0.5273411017842591, + "reward_std": 0.7953666783869267, + "rewards/cosine_scaled_reward": 0.1287881238386035, + "rewards/format_reward": 0.7500000149011612, + "step": 344 + }, + { + "completion_length": 1205.958381652832, + "epoch": 0.3942857142857143, + "grad_norm": 4.648168563842773, + "kl": 1.01983642578125, + "learning_rate": 3.387377967463493e-07, + "loss": 0.0409, + "reward": 0.3727841805666685, + "reward_std": 0.866691779345274, + "rewards/cosine_scaled_reward": -0.026965959696099162, + "rewards/format_reward": 0.8125000223517418, + "step": 345 + }, + { + "completion_length": 1131.4375381469727, + "epoch": 0.3954285714285714, + "grad_norm": 1.843777060508728, + "kl": 0.547515869140625, + "learning_rate": 3.359691059183761e-07, + "loss": 0.0219, + "reward": 0.3929372038692236, + "reward_std": 0.8282114900648594, + "rewards/cosine_scaled_reward": -0.039126552641391754, + "rewards/format_reward": 0.8958333432674408, + "step": 346 + }, + { + "completion_length": 1396.1250381469727, + "epoch": 0.3965714285714286, + "grad_norm": 2.772951126098633, + "kl": 0.83648681640625, + "learning_rate": 3.3321084665422803e-07, + "loss": 0.0334, + "reward": 0.04932975070551038, + "reward_std": 0.7110629379749298, + "rewards/cosine_scaled_reward": -0.2151939356699586, + "rewards/format_reward": 0.833333358168602, + "step": 347 + }, + { + "completion_length": 1094.2500381469727, + "epoch": 0.3977142857142857, + "grad_norm": 3.0713284015655518, + "kl": 0.55780029296875, + "learning_rate": 3.3046315338757026e-07, + "loss": 0.0223, + "reward": 0.3814494190737605, + "reward_std": 0.7161810956895351, + "rewards/cosine_scaled_reward": -0.050093160942196846, + "rewards/format_reward": 0.9583333432674408, + "step": 348 + }, + { + "completion_length": 1494.020866394043, + "epoch": 0.39885714285714285, + "grad_norm": 7.005665302276611, + "kl": 1.7324066162109375, + "learning_rate": 3.2772616003709616e-07, + "loss": 0.0694, + "reward": 0.3296292170416564, + "reward_std": 0.8451166786253452, + "rewards/cosine_scaled_reward": -0.04683339223265648, + "rewards/format_reward": 0.8125000260770321, + "step": 349 + }, + { + "completion_length": 1154.4167022705078, + "epoch": 0.4, + "grad_norm": 1.4422372579574585, + "kl": 0.5752105712890625, + "learning_rate": 3.250000000000001e-07, + "loss": 0.023, + "reward": 0.2720920806750655, + "reward_std": 0.9464740082621574, + "rewards/cosine_scaled_reward": -0.13292736560106277, + "rewards/format_reward": 0.854166679084301, + "step": 350 + }, + { + "completion_length": 1267.2708892822266, + "epoch": 0.40114285714285713, + "grad_norm": 2.480191946029663, + "kl": 0.645751953125, + "learning_rate": 3.222848061454764e-07, + "loss": 0.0258, + "reward": 0.17013419978320599, + "reward_std": 0.7632426917552948, + "rewards/cosine_scaled_reward": -0.08551971893757582, + "rewards/format_reward": 0.7083333544433117, + "step": 351 + }, + { + "completion_length": 1286.5000228881836, + "epoch": 0.4022857142857143, + "grad_norm": 2.153872489929199, + "kl": 0.580810546875, + "learning_rate": 3.195807108082429e-07, + "loss": 0.0232, + "reward": 0.12328234064625576, + "reward_std": 0.8743564449250698, + "rewards/cosine_scaled_reward": -0.14927133545279503, + "rewards/format_reward": 0.7083333507180214, + "step": 352 + }, + { + "completion_length": 1413.1042251586914, + "epoch": 0.4034285714285714, + "grad_norm": 2.458132266998291, + "kl": 0.6356201171875, + "learning_rate": 3.168878457820915e-07, + "loss": 0.0255, + "reward": 0.47827470442280173, + "reward_std": 0.9372801259160042, + "rewards/cosine_scaled_reward": 0.06890620663762093, + "rewards/format_reward": 0.7291666865348816, + "step": 353 + }, + { + "completion_length": 1377.5000457763672, + "epoch": 0.4045714285714286, + "grad_norm": 1.8596700429916382, + "kl": 0.601806640625, + "learning_rate": 3.142063423134644e-07, + "loss": 0.0241, + "reward": 0.3115939125418663, + "reward_std": 0.9930433630943298, + "rewards/cosine_scaled_reward": -0.01708403415977955, + "rewards/format_reward": 0.6458333544433117, + "step": 354 + }, + { + "completion_length": 1212.645881652832, + "epoch": 0.4057142857142857, + "grad_norm": 1.9313182830810547, + "kl": 0.514892578125, + "learning_rate": 3.115363310950578e-07, + "loss": 0.0206, + "reward": 0.17402092670090497, + "reward_std": 0.8653297163546085, + "rewards/cosine_scaled_reward": -0.06491286307573318, + "rewards/format_reward": 0.6250000149011612, + "step": 355 + }, + { + "completion_length": 1633.4584197998047, + "epoch": 0.40685714285714286, + "grad_norm": 1.906638503074646, + "kl": 0.5458984375, + "learning_rate": 3.0887794225945143e-07, + "loss": 0.0218, + "reward": 0.2535629828926176, + "reward_std": 0.8886833339929581, + "rewards/cosine_scaled_reward": -0.06057508382946253, + "rewards/format_reward": 0.7083333469927311, + "step": 356 + }, + { + "completion_length": 1881.8751029968262, + "epoch": 0.408, + "grad_norm": 2.3647892475128174, + "kl": 0.819091796875, + "learning_rate": 3.062313053727671e-07, + "loss": 0.0328, + "reward": -0.11886521428823471, + "reward_std": 0.6593221351504326, + "rewards/cosine_scaled_reward": -0.2341746687889099, + "rewards/format_reward": 0.6250000186264515, + "step": 357 + }, + { + "completion_length": 1829.6250305175781, + "epoch": 0.40914285714285714, + "grad_norm": 1.856693148612976, + "kl": 0.70489501953125, + "learning_rate": 3.0359654942835247e-07, + "loss": 0.0282, + "reward": 0.1773190926760435, + "reward_std": 0.8244731090962887, + "rewards/cosine_scaled_reward": -0.05620474135503173, + "rewards/format_reward": 0.5833333544433117, + "step": 358 + }, + { + "completion_length": 1053.9375305175781, + "epoch": 0.4102857142857143, + "grad_norm": 3.442803382873535, + "kl": 0.8038330078125, + "learning_rate": 3.0097380284049523e-07, + "loss": 0.0322, + "reward": 0.16565631795674562, + "reward_std": 0.8398986533284187, + "rewards/cosine_scaled_reward": -0.12338497827295214, + "rewards/format_reward": 0.7291666939854622, + "step": 359 + }, + { + "completion_length": 1288.1875305175781, + "epoch": 0.4114285714285714, + "grad_norm": 1.7091789245605469, + "kl": 0.526123046875, + "learning_rate": 2.9836319343816397e-07, + "loss": 0.0211, + "reward": 0.2598306848667562, + "reward_std": 0.8984776921570301, + "rewards/cosine_scaled_reward": -0.05199693702161312, + "rewards/format_reward": 0.6875000223517418, + "step": 360 + }, + { + "completion_length": 1407.9583435058594, + "epoch": 0.4125714285714286, + "grad_norm": 3.6575090885162354, + "kl": 0.920166015625, + "learning_rate": 2.9576484845877793e-07, + "loss": 0.0368, + "reward": 0.022162297973409295, + "reward_std": 0.6297386400401592, + "rewards/cosine_scaled_reward": -0.17034196108579636, + "rewards/format_reward": 0.7083333432674408, + "step": 361 + }, + { + "completion_length": 1246.812572479248, + "epoch": 0.4137142857142857, + "grad_norm": 3.3759262561798096, + "kl": 0.9285125732421875, + "learning_rate": 2.931788945420058e-07, + "loss": 0.0371, + "reward": 0.17949877493083477, + "reward_std": 0.7008433230221272, + "rewards/cosine_scaled_reward": -0.06406344473361969, + "rewards/format_reward": 0.6875000204890966, + "step": 362 + }, + { + "completion_length": 935.9166870117188, + "epoch": 0.41485714285714287, + "grad_norm": 2.3449277877807617, + "kl": 0.5015869140625, + "learning_rate": 2.9060545772359305e-07, + "loss": 0.02, + "reward": 0.462602804065682, + "reward_std": 0.8693399466574192, + "rewards/cosine_scaled_reward": 0.1225482877343893, + "rewards/format_reward": 0.687500013038516, + "step": 363 + }, + { + "completion_length": 1301.6042098999023, + "epoch": 0.416, + "grad_norm": 2.7286217212677, + "kl": 0.5523681640625, + "learning_rate": 2.8804466342921987e-07, + "loss": 0.0221, + "reward": -0.18497540964744985, + "reward_std": 0.596361830830574, + "rewards/cosine_scaled_reward": -0.2775970436632633, + "rewards/format_reward": 0.6458333507180214, + "step": 364 + }, + { + "completion_length": 1725.3958587646484, + "epoch": 0.41714285714285715, + "grad_norm": 2.718860387802124, + "kl": 0.6279296875, + "learning_rate": 2.854966364683872e-07, + "loss": 0.0252, + "reward": 0.07833639718592167, + "reward_std": 0.783052921295166, + "rewards/cosine_scaled_reward": -0.14490601420402527, + "rewards/format_reward": 0.6458333432674408, + "step": 365 + }, + { + "completion_length": 1542.166690826416, + "epoch": 0.41828571428571426, + "grad_norm": 2.4428579807281494, + "kl": 0.68994140625, + "learning_rate": 2.829615010283344e-07, + "loss": 0.0276, + "reward": 0.2222783851902932, + "reward_std": 0.9212566986680031, + "rewards/cosine_scaled_reward": 0.022268712986260653, + "rewards/format_reward": 0.5416666753590107, + "step": 366 + }, + { + "completion_length": 1529.7708854675293, + "epoch": 0.41942857142857143, + "grad_norm": 1.7189098596572876, + "kl": 0.618408203125, + "learning_rate": 2.8043938066798645e-07, + "loss": 0.0247, + "reward": 0.13789566839113832, + "reward_std": 0.7518769651651382, + "rewards/cosine_scaled_reward": -0.08000693749636412, + "rewards/format_reward": 0.6458333525806665, + "step": 367 + }, + { + "completion_length": 1874.6250610351562, + "epoch": 0.4205714285714286, + "grad_norm": 2.6109297275543213, + "kl": 0.5501708984375, + "learning_rate": 2.7793039831193133e-07, + "loss": 0.022, + "reward": 0.1437191739678383, + "reward_std": 0.809885136783123, + "rewards/cosine_scaled_reward": -0.07925083208829165, + "rewards/format_reward": 0.604166679084301, + "step": 368 + }, + { + "completion_length": 1595.6458892822266, + "epoch": 0.4217142857142857, + "grad_norm": 2.485452890396118, + "kl": 0.75439453125, + "learning_rate": 2.7543467624442956e-07, + "loss": 0.0302, + "reward": 0.23247148096561432, + "reward_std": 0.9586904980242252, + "rewards/cosine_scaled_reward": -0.044586887350305915, + "rewards/format_reward": 0.6041666809469461, + "step": 369 + }, + { + "completion_length": 1477.1458740234375, + "epoch": 0.4228571428571429, + "grad_norm": 2.785557508468628, + "kl": 0.76904296875, + "learning_rate": 2.729523361034538e-07, + "loss": 0.0308, + "reward": 0.12985735037364066, + "reward_std": 0.6629677005112171, + "rewards/cosine_scaled_reward": -0.04906688700430095, + "rewards/format_reward": 0.6250000149011612, + "step": 370 + }, + { + "completion_length": 1081.5417079925537, + "epoch": 0.424, + "grad_norm": 2.7538247108459473, + "kl": 0.563232421875, + "learning_rate": 2.7048349887476037e-07, + "loss": 0.0225, + "reward": 0.592208469286561, + "reward_std": 0.7628613486886024, + "rewards/cosine_scaled_reward": 0.19106208952143788, + "rewards/format_reward": 0.8125000186264515, + "step": 371 + }, + { + "completion_length": 1689.6458892822266, + "epoch": 0.42514285714285716, + "grad_norm": 2.427741050720215, + "kl": 0.419677734375, + "learning_rate": 2.6802828488599294e-07, + "loss": 0.0168, + "reward": 0.2929907846264541, + "reward_std": 0.8767264820635319, + "rewards/cosine_scaled_reward": -0.03612378612160683, + "rewards/format_reward": 0.7083333507180214, + "step": 372 + }, + { + "completion_length": 1054.0833702087402, + "epoch": 0.42628571428571427, + "grad_norm": 3.8023006916046143, + "kl": 0.587646484375, + "learning_rate": 2.655868138008171e-07, + "loss": 0.0235, + "reward": 0.05784207722172141, + "reward_std": 0.7628821134567261, + "rewards/cosine_scaled_reward": -0.17036252235993743, + "rewards/format_reward": 0.6875000260770321, + "step": 373 + }, + { + "completion_length": 1296.9375305175781, + "epoch": 0.42742857142857144, + "grad_norm": 3.30541729927063, + "kl": 0.861572265625, + "learning_rate": 2.631592046130896e-07, + "loss": 0.0345, + "reward": 0.2668555803829804, + "reward_std": 0.8589187040925026, + "rewards/cosine_scaled_reward": -0.026118143810890615, + "rewards/format_reward": 0.6666666753590107, + "step": 374 + }, + { + "completion_length": 1779.8958549499512, + "epoch": 0.42857142857142855, + "grad_norm": 3.2860774993896484, + "kl": 0.71484375, + "learning_rate": 2.6074557564105724e-07, + "loss": 0.0286, + "reward": 0.2315208874642849, + "reward_std": 0.9205419048666954, + "rewards/cosine_scaled_reward": -0.08181109488941729, + "rewards/format_reward": 0.6875000260770321, + "step": 375 + }, + { + "completion_length": 1341.812515258789, + "epoch": 0.4297142857142857, + "grad_norm": 1.9312739372253418, + "kl": 0.6658935546875, + "learning_rate": 2.583460445215911e-07, + "loss": 0.0266, + "reward": 0.11426342278718948, + "reward_std": 0.7130968421697617, + "rewards/cosine_scaled_reward": -0.13826850580517203, + "rewards/format_reward": 0.7708333507180214, + "step": 376 + }, + { + "completion_length": 1541.6042098999023, + "epoch": 0.4308571428571429, + "grad_norm": 1.676692008972168, + "kl": 0.5164794921875, + "learning_rate": 2.5596072820445254e-07, + "loss": 0.0206, + "reward": 0.11265636514872313, + "reward_std": 0.6777131371200085, + "rewards/cosine_scaled_reward": -0.12696546595543623, + "rewards/format_reward": 0.7291666939854622, + "step": 377 + }, + { + "completion_length": 1453.2917251586914, + "epoch": 0.432, + "grad_norm": 2.0128226280212402, + "kl": 0.51177978515625, + "learning_rate": 2.5358974294659373e-07, + "loss": 0.0205, + "reward": 0.31238897051662207, + "reward_std": 1.0008442774415016, + "rewards/cosine_scaled_reward": -0.07301739603281021, + "rewards/format_reward": 0.7500000223517418, + "step": 378 + }, + { + "completion_length": 1567.0000457763672, + "epoch": 0.43314285714285716, + "grad_norm": 2.777541160583496, + "kl": 0.682373046875, + "learning_rate": 2.512332043064913e-07, + "loss": 0.0273, + "reward": 0.006588555872440338, + "reward_std": 0.7485722936689854, + "rewards/cosine_scaled_reward": -0.18877192959189415, + "rewards/format_reward": 0.6666666846722364, + "step": 379 + }, + { + "completion_length": 1324.5208854675293, + "epoch": 0.4342857142857143, + "grad_norm": 3.0375523567199707, + "kl": 0.670166015625, + "learning_rate": 2.488912271385139e-07, + "loss": 0.0268, + "reward": 0.2578076588688418, + "reward_std": 0.9591740146279335, + "rewards/cosine_scaled_reward": -0.012080159038305283, + "rewards/format_reward": 0.5416666846722364, + "step": 380 + }, + { + "completion_length": 1730.3125762939453, + "epoch": 0.43542857142857144, + "grad_norm": 1.9336341619491577, + "kl": 0.7171630859375, + "learning_rate": 2.465639255873246e-07, + "loss": 0.0287, + "reward": -0.1746607469394803, + "reward_std": 0.5531054511666298, + "rewards/cosine_scaled_reward": -0.2578210327774286, + "rewards/format_reward": 0.6458333469927311, + "step": 381 + }, + { + "completion_length": 1355.208366394043, + "epoch": 0.43657142857142855, + "grad_norm": 2.0603504180908203, + "kl": 0.6038818359375, + "learning_rate": 2.4425141308231765e-07, + "loss": 0.0242, + "reward": 0.16119742300361395, + "reward_std": 0.8973702676594257, + "rewards/cosine_scaled_reward": -0.13088632840663195, + "rewards/format_reward": 0.7083333469927311, + "step": 382 + }, + { + "completion_length": 1379.7292098999023, + "epoch": 0.4377142857142857, + "grad_norm": 1.975783348083496, + "kl": 0.513427734375, + "learning_rate": 2.4195380233209006e-07, + "loss": 0.0206, + "reward": 0.5386483520269394, + "reward_std": 0.9303784146904945, + "rewards/cosine_scaled_reward": 0.09457700047641993, + "rewards/format_reward": 0.7500000111758709, + "step": 383 + }, + { + "completion_length": 1441.4375305175781, + "epoch": 0.43885714285714283, + "grad_norm": 2.0518715381622314, + "kl": 0.517333984375, + "learning_rate": 2.3967120531894857e-07, + "loss": 0.0207, + "reward": 0.5782037973403931, + "reward_std": 1.07766717299819, + "rewards/cosine_scaled_reward": 0.11610157322138548, + "rewards/format_reward": 0.6875000260770321, + "step": 384 + }, + { + "completion_length": 1418.8958740234375, + "epoch": 0.44, + "grad_norm": 2.584573745727539, + "kl": 0.51806640625, + "learning_rate": 2.374037332934512e-07, + "loss": 0.0207, + "reward": 0.05193836707621813, + "reward_std": 0.8466444984078407, + "rewards/cosine_scaled_reward": -0.1831620568409562, + "rewards/format_reward": 0.6875000149011612, + "step": 385 + }, + { + "completion_length": 1452.9166946411133, + "epoch": 0.44114285714285717, + "grad_norm": 2.1661787033081055, + "kl": 0.517578125, + "learning_rate": 2.3515149676898552e-07, + "loss": 0.0207, + "reward": 0.14932426065206528, + "reward_std": 0.8432199582457542, + "rewards/cosine_scaled_reward": -0.10539527935907245, + "rewards/format_reward": 0.6458333544433117, + "step": 386 + }, + { + "completion_length": 1647.145866394043, + "epoch": 0.4422857142857143, + "grad_norm": 1.8875645399093628, + "kl": 0.56982421875, + "learning_rate": 2.3291460551638237e-07, + "loss": 0.0228, + "reward": 0.01697136198345106, + "reward_std": 0.7178105190396309, + "rewards/cosine_scaled_reward": -0.11740265972912312, + "rewards/format_reward": 0.604166679084301, + "step": 387 + }, + { + "completion_length": 1262.020866394043, + "epoch": 0.44342857142857145, + "grad_norm": 4.029469966888428, + "kl": 0.7529296875, + "learning_rate": 2.306931685585657e-07, + "loss": 0.0301, + "reward": 0.3093376103788614, + "reward_std": 0.9565748050808907, + "rewards/cosine_scaled_reward": -0.05382521077990532, + "rewards/format_reward": 0.7291666865348816, + "step": 388 + }, + { + "completion_length": 1679.1667404174805, + "epoch": 0.44457142857142856, + "grad_norm": 1.9148039817810059, + "kl": 0.68115234375, + "learning_rate": 2.2848729416523859e-07, + "loss": 0.0272, + "reward": 0.03406955860555172, + "reward_std": 0.8620793931186199, + "rewards/cosine_scaled_reward": -0.1516010407358408, + "rewards/format_reward": 0.5833333469927311, + "step": 389 + }, + { + "completion_length": 1607.0625305175781, + "epoch": 0.44571428571428573, + "grad_norm": 2.3997583389282227, + "kl": 0.6044921875, + "learning_rate": 2.2629708984760706e-07, + "loss": 0.0242, + "reward": -0.03216620907187462, + "reward_std": 0.7698984518647194, + "rewards/cosine_scaled_reward": -0.18137890007346869, + "rewards/format_reward": 0.6041666865348816, + "step": 390 + }, + { + "completion_length": 1295.2916870117188, + "epoch": 0.44685714285714284, + "grad_norm": 2.192011833190918, + "kl": 0.499267578125, + "learning_rate": 2.2412266235313973e-07, + "loss": 0.02, + "reward": 0.3369876742362976, + "reward_std": 0.9241867884993553, + "rewards/cosine_scaled_reward": -0.05256127379834652, + "rewards/format_reward": 0.7500000223517418, + "step": 391 + }, + { + "completion_length": 1388.0625305175781, + "epoch": 0.448, + "grad_norm": 1.6250756978988647, + "kl": 0.4912109375, + "learning_rate": 2.2196411766036487e-07, + "loss": 0.0196, + "reward": 0.3453047815710306, + "reward_std": 0.924341045320034, + "rewards/cosine_scaled_reward": -0.037261209450662136, + "rewards/format_reward": 0.7708333432674408, + "step": 392 + }, + { + "completion_length": 1362.3333587646484, + "epoch": 0.4491428571428571, + "grad_norm": 2.632045269012451, + "kl": 0.560302734375, + "learning_rate": 2.1982156097370557e-07, + "loss": 0.0224, + "reward": 0.3152189594693482, + "reward_std": 0.8564588874578476, + "rewards/cosine_scaled_reward": -0.05929289385676384, + "rewards/format_reward": 0.7916666865348816, + "step": 393 + }, + { + "completion_length": 1506.1666870117188, + "epoch": 0.4502857142857143, + "grad_norm": 2.0749998092651367, + "kl": 0.65478515625, + "learning_rate": 2.1769509671835223e-07, + "loss": 0.0262, + "reward": -0.0516182022402063, + "reward_std": 0.6332247443497181, + "rewards/cosine_scaled_reward": -0.17348778434097767, + "rewards/format_reward": 0.6041666865348816, + "step": 394 + }, + { + "completion_length": 1105.4583587646484, + "epoch": 0.4514285714285714, + "grad_norm": 3.7922682762145996, + "kl": 0.60986328125, + "learning_rate": 2.1558482853517253e-07, + "loss": 0.0244, + "reward": 0.3151353672146797, + "reward_std": 0.7997763678431511, + "rewards/cosine_scaled_reward": 0.014608239755034447, + "rewards/format_reward": 0.6666666828095913, + "step": 395 + }, + { + "completion_length": 1559.6875762939453, + "epoch": 0.45257142857142857, + "grad_norm": 3.7929108142852783, + "kl": 0.51507568359375, + "learning_rate": 2.134908592756607e-07, + "loss": 0.0206, + "reward": 0.1322829071432352, + "reward_std": 0.8961255550384521, + "rewards/cosine_scaled_reward": -0.12251237966120243, + "rewards/format_reward": 0.6458333507180214, + "step": 396 + }, + { + "completion_length": 1392.083381652832, + "epoch": 0.45371428571428574, + "grad_norm": 1.7970480918884277, + "kl": 0.5240478515625, + "learning_rate": 2.1141329099692406e-07, + "loss": 0.0209, + "reward": 0.24729618662968278, + "reward_std": 0.7044042460620403, + "rewards/cosine_scaled_reward": -0.03716139169409871, + "rewards/format_reward": 0.7291666902601719, + "step": 397 + }, + { + "completion_length": 1375.2500228881836, + "epoch": 0.45485714285714285, + "grad_norm": 2.7973999977111816, + "kl": 0.5516357421875, + "learning_rate": 2.0935222495670968e-07, + "loss": 0.0221, + "reward": 0.05064563453197479, + "reward_std": 0.7481582798063755, + "rewards/cosine_scaled_reward": -0.16217901837080717, + "rewards/format_reward": 0.6666666828095913, + "step": 398 + }, + { + "completion_length": 1417.9375381469727, + "epoch": 0.456, + "grad_norm": 3.365849733352661, + "kl": 0.652587890625, + "learning_rate": 2.0730776160846853e-07, + "loss": 0.0261, + "reward": 0.24118162877857685, + "reward_std": 0.9246044531464577, + "rewards/cosine_scaled_reward": -0.05636600544676185, + "rewards/format_reward": 0.6666666865348816, + "step": 399 + }, + { + "completion_length": 835.3958549499512, + "epoch": 0.45714285714285713, + "grad_norm": 2.1656346321105957, + "kl": 0.446533203125, + "learning_rate": 2.0528000059645995e-07, + "loss": 0.0178, + "reward": 0.66196015663445, + "reward_std": 1.0285310856997967, + "rewards/cosine_scaled_reward": 0.17172618986387533, + "rewards/format_reward": 0.8541666939854622, + "step": 400 + }, + { + "completion_length": 1314.083396911621, + "epoch": 0.4582857142857143, + "grad_norm": 1.6506296396255493, + "kl": 0.44915771484375, + "learning_rate": 2.032690407508949e-07, + "loss": 0.018, + "reward": 0.157606887165457, + "reward_std": 0.7822442874312401, + "rewards/cosine_scaled_reward": -0.11768818949349225, + "rewards/format_reward": 0.7291666865348816, + "step": 401 + }, + { + "completion_length": 1331.9583892822266, + "epoch": 0.4594285714285714, + "grad_norm": 3.1099698543548584, + "kl": 0.589599609375, + "learning_rate": 2.0127498008311922e-07, + "loss": 0.0236, + "reward": 0.2424939200282097, + "reward_std": 0.7927302867174149, + "rewards/cosine_scaled_reward": -0.0522354356944561, + "rewards/format_reward": 0.6875000260770321, + "step": 402 + }, + { + "completion_length": 1337.020881652832, + "epoch": 0.4605714285714286, + "grad_norm": 2.6694633960723877, + "kl": 0.537109375, + "learning_rate": 1.9929791578083655e-07, + "loss": 0.0215, + "reward": 0.2716318762395531, + "reward_std": 0.8227218054234982, + "rewards/cosine_scaled_reward": -0.02993021416477859, + "rewards/format_reward": 0.6666666902601719, + "step": 403 + }, + { + "completion_length": 1489.7708740234375, + "epoch": 0.4617142857142857, + "grad_norm": 2.208977460861206, + "kl": 0.66632080078125, + "learning_rate": 1.9733794420337213e-07, + "loss": 0.0267, + "reward": 0.28225214779376984, + "reward_std": 0.8597016483545303, + "rewards/cosine_scaled_reward": -0.004551528720185161, + "rewards/format_reward": 0.7083333395421505, + "step": 404 + }, + { + "completion_length": 1259.8333587646484, + "epoch": 0.46285714285714286, + "grad_norm": 4.63955545425415, + "kl": 0.81591796875, + "learning_rate": 1.9539516087697517e-07, + "loss": 0.0327, + "reward": 0.35530247224960476, + "reward_std": 0.8299554735422134, + "rewards/cosine_scaled_reward": 0.037398045882582664, + "rewards/format_reward": 0.7291666865348816, + "step": 405 + }, + { + "completion_length": 1513.1250610351562, + "epoch": 0.464, + "grad_norm": 2.6123645305633545, + "kl": 0.635986328125, + "learning_rate": 1.934696604901642e-07, + "loss": 0.0254, + "reward": 0.06231710687279701, + "reward_std": 0.8518500626087189, + "rewards/cosine_scaled_reward": -0.1575743369758129, + "rewards/format_reward": 0.6458333507180214, + "step": 406 + }, + { + "completion_length": 1208.5000228881836, + "epoch": 0.46514285714285714, + "grad_norm": 3.125258445739746, + "kl": 0.557861328125, + "learning_rate": 1.915615368891117e-07, + "loss": 0.0224, + "reward": 0.28017069818452, + "reward_std": 0.7779465243220329, + "rewards/cosine_scaled_reward": -0.040886467322707176, + "rewards/format_reward": 0.7291666939854622, + "step": 407 + }, + { + "completion_length": 1217.5208587646484, + "epoch": 0.4662857142857143, + "grad_norm": 2.7784037590026855, + "kl": 0.427734375, + "learning_rate": 1.8967088307307e-07, + "loss": 0.0171, + "reward": 0.3413910511881113, + "reward_std": 0.6852421574294567, + "rewards/cosine_scaled_reward": -0.017510855570435524, + "rewards/format_reward": 0.8958333432674408, + "step": 408 + }, + { + "completion_length": 1725.041748046875, + "epoch": 0.4674285714285714, + "grad_norm": 1.8275662660598755, + "kl": 0.676513671875, + "learning_rate": 1.8779779118983867e-07, + "loss": 0.0271, + "reward": 0.21645420044660568, + "reward_std": 0.8697847276926041, + "rewards/cosine_scaled_reward": -0.08236874872818589, + "rewards/format_reward": 0.7083333544433117, + "step": 409 + }, + { + "completion_length": 1496.8333892822266, + "epoch": 0.4685714285714286, + "grad_norm": 3.2171754837036133, + "kl": 0.934356689453125, + "learning_rate": 1.8594235253127372e-07, + "loss": 0.0374, + "reward": 0.17047240026295185, + "reward_std": 0.7880502119660378, + "rewards/cosine_scaled_reward": -0.04659413266927004, + "rewards/format_reward": 0.583333345130086, + "step": 410 + }, + { + "completion_length": 1500.4792098999023, + "epoch": 0.4697142857142857, + "grad_norm": 1.7640888690948486, + "kl": 0.608154296875, + "learning_rate": 1.8410465752883758e-07, + "loss": 0.0243, + "reward": 0.3746693143621087, + "reward_std": 1.0237346589565277, + "rewards/cosine_scaled_reward": -0.025670517061371356, + "rewards/format_reward": 0.7500000149011612, + "step": 411 + }, + { + "completion_length": 1243.1042098999023, + "epoch": 0.47085714285714286, + "grad_norm": 3.035127878189087, + "kl": 0.6143798828125, + "learning_rate": 1.822847957491922e-07, + "loss": 0.0246, + "reward": 0.37103684339672327, + "reward_std": 0.9396983981132507, + "rewards/cosine_scaled_reward": -0.03727317973971367, + "rewards/format_reward": 0.791666679084301, + "step": 412 + }, + { + "completion_length": 1486.4166946411133, + "epoch": 0.472, + "grad_norm": 2.925623655319214, + "kl": 0.7296142578125, + "learning_rate": 1.804828558898332e-07, + "loss": 0.0292, + "reward": 0.19466528482735157, + "reward_std": 0.7697472274303436, + "rewards/cosine_scaled_reward": -0.08936256961897016, + "rewards/format_reward": 0.6875000149011612, + "step": 413 + }, + { + "completion_length": 1481.3125381469727, + "epoch": 0.47314285714285714, + "grad_norm": 2.3652820587158203, + "kl": 0.717041015625, + "learning_rate": 1.7869892577476722e-07, + "loss": 0.0286, + "reward": 0.06787084229290485, + "reward_std": 0.8079018853604794, + "rewards/cosine_scaled_reward": -0.15581453032791615, + "rewards/format_reward": 0.6666666865348816, + "step": 414 + }, + { + "completion_length": 1480.1875610351562, + "epoch": 0.4742857142857143, + "grad_norm": 1.5522794723510742, + "kl": 0.653564453125, + "learning_rate": 1.7693309235023127e-07, + "loss": 0.0261, + "reward": 0.2685818700119853, + "reward_std": 0.9294859580695629, + "rewards/cosine_scaled_reward": -0.06137389224022627, + "rewards/format_reward": 0.6875000186264515, + "step": 415 + }, + { + "completion_length": 1191.3125534057617, + "epoch": 0.4754285714285714, + "grad_norm": 2.506166696548462, + "kl": 0.4866943359375, + "learning_rate": 1.7518544168045524e-07, + "loss": 0.0195, + "reward": 0.540099716745317, + "reward_std": 0.9752370566129684, + "rewards/cosine_scaled_reward": 0.078727146377787, + "rewards/format_reward": 0.833333358168602, + "step": 416 + }, + { + "completion_length": 1577.3750686645508, + "epoch": 0.4765714285714286, + "grad_norm": 2.541682004928589, + "kl": 0.90380859375, + "learning_rate": 1.7345605894346726e-07, + "loss": 0.0361, + "reward": 0.19305734895169735, + "reward_std": 0.8487963303923607, + "rewards/cosine_scaled_reward": -0.08683300111442804, + "rewards/format_reward": 0.6666666902601719, + "step": 417 + }, + { + "completion_length": 1273.8958473205566, + "epoch": 0.4777142857142857, + "grad_norm": 2.7441353797912598, + "kl": 0.631591796875, + "learning_rate": 1.7174502842694212e-07, + "loss": 0.0253, + "reward": 0.5500797647982836, + "reward_std": 0.9426254630088806, + "rewards/cosine_scaled_reward": 0.08637175487820059, + "rewards/format_reward": 0.8333333507180214, + "step": 418 + }, + { + "completion_length": 1354.8542098999023, + "epoch": 0.47885714285714287, + "grad_norm": 2.3867642879486084, + "kl": 0.687744140625, + "learning_rate": 1.7005243352409333e-07, + "loss": 0.0275, + "reward": 0.3600006675114855, + "reward_std": 0.9771365597844124, + "rewards/cosine_scaled_reward": -0.004668326582759619, + "rewards/format_reward": 0.7083333469927311, + "step": 419 + }, + { + "completion_length": 1028.6250381469727, + "epoch": 0.48, + "grad_norm": 10.020596504211426, + "kl": 0.8775634765625, + "learning_rate": 1.6837835672960831e-07, + "loss": 0.0351, + "reward": 0.07369395159184933, + "reward_std": 0.7157322019338608, + "rewards/cosine_scaled_reward": -0.15065189078450203, + "rewards/format_reward": 0.7083333544433117, + "step": 420 + }, + { + "completion_length": 1321.0833587646484, + "epoch": 0.48114285714285715, + "grad_norm": 3.3951587677001953, + "kl": 0.703125, + "learning_rate": 1.6672287963562852e-07, + "loss": 0.0281, + "reward": -0.07291397266089916, + "reward_std": 0.6696721352636814, + "rewards/cosine_scaled_reward": -0.20483622467145324, + "rewards/format_reward": 0.6458333469927311, + "step": 421 + }, + { + "completion_length": 1193.3541946411133, + "epoch": 0.48228571428571426, + "grad_norm": 2.8352770805358887, + "kl": 0.587646484375, + "learning_rate": 1.6508608292777203e-07, + "loss": 0.0235, + "reward": 0.2525853253901005, + "reward_std": 0.8423508293926716, + "rewards/cosine_scaled_reward": -0.07235960848629475, + "rewards/format_reward": 0.7500000149011612, + "step": 422 + }, + { + "completion_length": 1172.8333740234375, + "epoch": 0.48342857142857143, + "grad_norm": 2.012563467025757, + "kl": 0.49560546875, + "learning_rate": 1.6346804638120098e-07, + "loss": 0.0198, + "reward": 0.09193800436332822, + "reward_std": 0.7620077319443226, + "rewards/cosine_scaled_reward": -0.18725344724953175, + "rewards/format_reward": 0.7916666939854622, + "step": 423 + }, + { + "completion_length": 1772.6875534057617, + "epoch": 0.4845714285714286, + "grad_norm": 1.4459457397460938, + "kl": 0.5943603515625, + "learning_rate": 1.6186884885673413e-07, + "loss": 0.0238, + "reward": 0.2721530678682029, + "reward_std": 0.9363379552960396, + "rewards/cosine_scaled_reward": -0.06505887769162655, + "rewards/format_reward": 0.7083333544433117, + "step": 424 + }, + { + "completion_length": 1682.208366394043, + "epoch": 0.4857142857142857, + "grad_norm": 2.2990784645080566, + "kl": 0.73602294921875, + "learning_rate": 1.6028856829700258e-07, + "loss": 0.0294, + "reward": 0.5255491202697158, + "reward_std": 1.0602496266365051, + "rewards/cosine_scaled_reward": 0.10401645209640265, + "rewards/format_reward": 0.6666666772216558, + "step": 425 + }, + { + "completion_length": 1195.916690826416, + "epoch": 0.4868571428571429, + "grad_norm": 2.467822790145874, + "kl": 0.5208740234375, + "learning_rate": 1.5872728172265146e-07, + "loss": 0.0208, + "reward": 0.11827224772423506, + "reward_std": 0.6336887441575527, + "rewards/cosine_scaled_reward": -0.1465412126854062, + "rewards/format_reward": 0.7916666939854622, + "step": 426 + }, + { + "completion_length": 1604.0208587646484, + "epoch": 0.488, + "grad_norm": 2.8577768802642822, + "kl": 0.614990234375, + "learning_rate": 1.5718506522858572e-07, + "loss": 0.0246, + "reward": 0.27578693721443415, + "reward_std": 1.0030243545770645, + "rewards/cosine_scaled_reward": -0.04830464324913919, + "rewards/format_reward": 0.6666666865348816, + "step": 427 + }, + { + "completion_length": 1337.7916946411133, + "epoch": 0.48914285714285716, + "grad_norm": 1.5771373510360718, + "kl": 0.516357421875, + "learning_rate": 1.5566199398026147e-07, + "loss": 0.0206, + "reward": 0.31163547467440367, + "reward_std": 0.8827511817216873, + "rewards/cosine_scaled_reward": -0.05672437600696867, + "rewards/format_reward": 0.791666679084301, + "step": 428 + }, + { + "completion_length": 1092.6458854675293, + "epoch": 0.49028571428571427, + "grad_norm": 7.517470836639404, + "kl": 0.735107421875, + "learning_rate": 1.5415814221002265e-07, + "loss": 0.0294, + "reward": -0.04257449973374605, + "reward_std": 0.6279182583093643, + "rewards/cosine_scaled_reward": -0.2530031790956855, + "rewards/format_reward": 0.7916666865348816, + "step": 429 + }, + { + "completion_length": 1677.7500686645508, + "epoch": 0.49142857142857144, + "grad_norm": 2.1630637645721436, + "kl": 0.6884765625, + "learning_rate": 1.5267358321348285e-07, + "loss": 0.0276, + "reward": 0.20026828069239855, + "reward_std": 0.8897528201341629, + "rewards/cosine_scaled_reward": -0.10384058998897672, + "rewards/format_reward": 0.7291666865348816, + "step": 430 + }, + { + "completion_length": 1051.2500381469727, + "epoch": 0.49257142857142855, + "grad_norm": 2.162562847137451, + "kl": 0.677490234375, + "learning_rate": 1.5120838934595337e-07, + "loss": 0.0271, + "reward": 0.0934929153881967, + "reward_std": 0.6808763556182384, + "rewards/cosine_scaled_reward": -0.1685788333415985, + "rewards/format_reward": 0.791666679084301, + "step": 431 + }, + { + "completion_length": 1581.7917098999023, + "epoch": 0.4937142857142857, + "grad_norm": 3.096701145172119, + "kl": 0.9864044189453125, + "learning_rate": 1.4976263201891613e-07, + "loss": 0.0394, + "reward": 0.003629262908361852, + "reward_std": 0.6885860674083233, + "rewards/cosine_scaled_reward": -0.15299149602651596, + "rewards/format_reward": 0.6250000074505806, + "step": 432 + }, + { + "completion_length": 1580.2708740234375, + "epoch": 0.4948571428571429, + "grad_norm": 2.7364211082458496, + "kl": 0.75, + "learning_rate": 1.483363816965435e-07, + "loss": 0.03, + "reward": 0.15776942297816277, + "reward_std": 0.8655783608555794, + "rewards/cosine_scaled_reward": -0.08985630236566067, + "rewards/format_reward": 0.6041666828095913, + "step": 433 + }, + { + "completion_length": 1532.2291984558105, + "epoch": 0.496, + "grad_norm": 1.8213260173797607, + "kl": 0.5916748046875, + "learning_rate": 1.469297078922642e-07, + "loss": 0.0237, + "reward": -0.09767143800854683, + "reward_std": 0.6334349103271961, + "rewards/cosine_scaled_reward": -0.22124753845855594, + "rewards/format_reward": 0.6458333544433117, + "step": 434 + }, + { + "completion_length": 942.9166793823242, + "epoch": 0.49714285714285716, + "grad_norm": 2.527085542678833, + "kl": 0.7471923828125, + "learning_rate": 1.4554267916537495e-07, + "loss": 0.0299, + "reward": -0.003471766598522663, + "reward_std": 0.5359638221561909, + "rewards/cosine_scaled_reward": -0.20710261538624763, + "rewards/format_reward": 0.8125000223517418, + "step": 435 + }, + { + "completion_length": 1480.1250381469727, + "epoch": 0.4982857142857143, + "grad_norm": 2.621030569076538, + "kl": 0.82220458984375, + "learning_rate": 1.4417536311769885e-07, + "loss": 0.0329, + "reward": 0.3342234673909843, + "reward_std": 0.8837791383266449, + "rewards/cosine_scaled_reward": 0.03901571640744805, + "rewards/format_reward": 0.6250000149011612, + "step": 436 + }, + { + "completion_length": 1359.333381652832, + "epoch": 0.49942857142857144, + "grad_norm": 2.2266130447387695, + "kl": 0.56341552734375, + "learning_rate": 1.4282782639029128e-07, + "loss": 0.0225, + "reward": 0.17527570901438594, + "reward_std": 0.7361086085438728, + "rewards/cosine_scaled_reward": -0.09591093473136425, + "rewards/format_reward": 0.7083333618938923, + "step": 437 + }, + { + "completion_length": 1493.1875457763672, + "epoch": 0.5005714285714286, + "grad_norm": 1.7907952070236206, + "kl": 0.62255859375, + "learning_rate": 1.4150013466019114e-07, + "loss": 0.0249, + "reward": 0.025391742354258895, + "reward_std": 0.8453281559050083, + "rewards/cosine_scaled_reward": -0.22053607925772667, + "rewards/format_reward": 0.7500000260770321, + "step": 438 + }, + { + "completion_length": 1641.4167175292969, + "epoch": 0.5017142857142857, + "grad_norm": 2.076477527618408, + "kl": 0.87103271484375, + "learning_rate": 1.4019235263722034e-07, + "loss": 0.0349, + "reward": -0.12565073231235147, + "reward_std": 0.695248618721962, + "rewards/cosine_scaled_reward": -0.2208885379950516, + "rewards/format_reward": 0.583333345130086, + "step": 439 + }, + { + "completion_length": 1336.645866394043, + "epoch": 0.5028571428571429, + "grad_norm": 3.641127586364746, + "kl": 0.525146484375, + "learning_rate": 1.3890454406082956e-07, + "loss": 0.021, + "reward": 0.20326408464461565, + "reward_std": 0.780558280646801, + "rewards/cosine_scaled_reward": -0.13858992606401443, + "rewards/format_reward": 0.854166679084301, + "step": 440 + }, + { + "completion_length": 1554.1458740234375, + "epoch": 0.504, + "grad_norm": 2.14461612701416, + "kl": 0.649658203125, + "learning_rate": 1.3763677169699217e-07, + "loss": 0.026, + "reward": 0.04901134385727346, + "reward_std": 0.6753638684749603, + "rewards/cosine_scaled_reward": -0.1266509434208274, + "rewards/format_reward": 0.625000013038516, + "step": 441 + }, + { + "completion_length": 1134.083366394043, + "epoch": 0.5051428571428571, + "grad_norm": 2.213015079498291, + "kl": 0.38922119140625, + "learning_rate": 1.3638909733514452e-07, + "loss": 0.0156, + "reward": 0.6072201561182737, + "reward_std": 0.8717315904796124, + "rewards/cosine_scaled_reward": 0.12222992815077305, + "rewards/format_reward": 0.833333358168602, + "step": 442 + }, + { + "completion_length": 1613.5625381469727, + "epoch": 0.5062857142857143, + "grad_norm": 1.7041293382644653, + "kl": 0.726806640625, + "learning_rate": 1.351615817851748e-07, + "loss": 0.0291, + "reward": 0.07468948839232326, + "reward_std": 0.6469858847558498, + "rewards/cosine_scaled_reward": -0.17274294421076775, + "rewards/format_reward": 0.7708333507180214, + "step": 443 + }, + { + "completion_length": 1118.9792137145996, + "epoch": 0.5074285714285715, + "grad_norm": 2.203859567642212, + "kl": 0.5146484375, + "learning_rate": 1.3395428487445914e-07, + "loss": 0.0205, + "reward": 0.109013965819031, + "reward_std": 0.700001485645771, + "rewards/cosine_scaled_reward": -0.17907634377479553, + "rewards/format_reward": 0.833333358168602, + "step": 444 + }, + { + "completion_length": 1328.2500381469727, + "epoch": 0.5085714285714286, + "grad_norm": 2.1055400371551514, + "kl": 0.700439453125, + "learning_rate": 1.3276726544494571e-07, + "loss": 0.0281, + "reward": 0.003907807171344757, + "reward_std": 0.6304442547261715, + "rewards/cosine_scaled_reward": -0.1728717922233045, + "rewards/format_reward": 0.7083333507180214, + "step": 445 + }, + { + "completion_length": 1408.5417098999023, + "epoch": 0.5097142857142857, + "grad_norm": 3.701373338699341, + "kl": 1.005859375, + "learning_rate": 1.316005813502869e-07, + "loss": 0.0403, + "reward": 0.18983037257567048, + "reward_std": 0.7102062851190567, + "rewards/cosine_scaled_reward": -0.07104396633803844, + "rewards/format_reward": 0.6875000167638063, + "step": 446 + }, + { + "completion_length": 1182.68754196167, + "epoch": 0.5108571428571429, + "grad_norm": 2.484187126159668, + "kl": 0.940673828125, + "learning_rate": 1.3045428945301953e-07, + "loss": 0.0376, + "reward": 0.17063724854961038, + "reward_std": 0.6657870672643185, + "rewards/cosine_scaled_reward": -0.09112012386322021, + "rewards/format_reward": 0.729166679084301, + "step": 447 + }, + { + "completion_length": 1155.3750228881836, + "epoch": 0.512, + "grad_norm": 9.17601490020752, + "kl": 0.546142578125, + "learning_rate": 1.2932844562179352e-07, + "loss": 0.0218, + "reward": 0.30222914123442024, + "reward_std": 0.7583190239965916, + "rewards/cosine_scaled_reward": -0.05191066488623619, + "rewards/format_reward": 0.7916666865348816, + "step": 448 + }, + { + "completion_length": 1077.8541984558105, + "epoch": 0.5131428571428571, + "grad_norm": 2.589733839035034, + "kl": 0.573486328125, + "learning_rate": 1.2822310472864885e-07, + "loss": 0.0229, + "reward": 0.17757831397466362, + "reward_std": 0.7745413295924664, + "rewards/cosine_scaled_reward": -0.1571638728491962, + "rewards/format_reward": 0.8541666865348816, + "step": 449 + }, + { + "completion_length": 1738.6667098999023, + "epoch": 0.5142857142857142, + "grad_norm": 3.1829066276550293, + "kl": 1.14892578125, + "learning_rate": 1.2713832064634125e-07, + "loss": 0.0459, + "reward": 0.03298312705010176, + "reward_std": 0.7640566490590572, + "rewards/cosine_scaled_reward": -0.14894709549844265, + "rewards/format_reward": 0.6458333469927311, + "step": 450 + }, + { + "completion_length": 1184.0416870117188, + "epoch": 0.5154285714285715, + "grad_norm": 3.6981794834136963, + "kl": 0.68731689453125, + "learning_rate": 1.260741462457165e-07, + "loss": 0.0275, + "reward": 0.326223655953072, + "reward_std": 0.9320776239037514, + "rewards/cosine_scaled_reward": -0.049362530931830406, + "rewards/format_reward": 0.7708333507180214, + "step": 451 + }, + { + "completion_length": 1356.0000228881836, + "epoch": 0.5165714285714286, + "grad_norm": 2.8852858543395996, + "kl": 0.6221923828125, + "learning_rate": 1.2503063339313356e-07, + "loss": 0.0249, + "reward": 0.23957047518342733, + "reward_std": 0.6933315098285675, + "rewards/cosine_scaled_reward": -0.05927852354943752, + "rewards/format_reward": 0.7500000223517418, + "step": 452 + }, + { + "completion_length": 1344.395866394043, + "epoch": 0.5177142857142857, + "grad_norm": 2.0090019702911377, + "kl": 0.6807861328125, + "learning_rate": 1.2400783294793668e-07, + "loss": 0.0272, + "reward": 0.2872171855997294, + "reward_std": 0.7287779673933983, + "rewards/cosine_scaled_reward": -0.033939655870199203, + "rewards/format_reward": 0.770833358168602, + "step": 453 + }, + { + "completion_length": 1270.8125762939453, + "epoch": 0.5188571428571429, + "grad_norm": 3.2378604412078857, + "kl": 0.65252685546875, + "learning_rate": 1.2300579475997657e-07, + "loss": 0.0261, + "reward": 0.11756114871241152, + "reward_std": 0.7418475337326527, + "rewards/cosine_scaled_reward": -0.15102737117558718, + "rewards/format_reward": 0.770833358168602, + "step": 454 + }, + { + "completion_length": 1492.208351135254, + "epoch": 0.52, + "grad_norm": 4.2774338722229, + "kl": 0.79248046875, + "learning_rate": 1.220245676671809e-07, + "loss": 0.0317, + "reward": -0.018868495360948145, + "reward_std": 0.6951228454709053, + "rewards/cosine_scaled_reward": -0.20306831784546375, + "rewards/format_reward": 0.6875000223517418, + "step": 455 + }, + { + "completion_length": 1657.2291870117188, + "epoch": 0.5211428571428571, + "grad_norm": 2.7355401515960693, + "kl": 0.7105712890625, + "learning_rate": 1.2106419949317388e-07, + "loss": 0.0284, + "reward": 0.18439527601003647, + "reward_std": 0.7989875040948391, + "rewards/cosine_scaled_reward": -0.12901971023529768, + "rewards/format_reward": 0.7916666865348816, + "step": 456 + }, + { + "completion_length": 1063.9166946411133, + "epoch": 0.5222857142857142, + "grad_norm": 2.7030558586120605, + "kl": 0.5423583984375, + "learning_rate": 1.2012473704494537e-07, + "loss": 0.0217, + "reward": 0.31176955718547106, + "reward_std": 0.8158461526036263, + "rewards/cosine_scaled_reward": -0.0352974534034729, + "rewards/format_reward": 0.7500000149011612, + "step": 457 + }, + { + "completion_length": 1281.9167175292969, + "epoch": 0.5234285714285715, + "grad_norm": 2.028135299682617, + "kl": 0.7296142578125, + "learning_rate": 1.1920622611056974e-07, + "loss": 0.0292, + "reward": 0.2823291067034006, + "reward_std": 0.8681261576712132, + "rewards/cosine_scaled_reward": -0.06675804499536753, + "rewards/format_reward": 0.7708333488553762, + "step": 458 + }, + { + "completion_length": 1336.5208892822266, + "epoch": 0.5245714285714286, + "grad_norm": 3.601792573928833, + "kl": 0.614990234375, + "learning_rate": 1.1830871145697412e-07, + "loss": 0.0246, + "reward": 0.2878631625790149, + "reward_std": 0.8964981138706207, + "rewards/cosine_scaled_reward": -0.07753422670066357, + "rewards/format_reward": 0.770833358168602, + "step": 459 + }, + { + "completion_length": 1851.0833740234375, + "epoch": 0.5257142857142857, + "grad_norm": 3.7353336811065674, + "kl": 0.7938232421875, + "learning_rate": 1.1743223682775649e-07, + "loss": 0.0317, + "reward": 0.22537417088460643, + "reward_std": 0.8646272122859955, + "rewards/cosine_scaled_reward": -0.08393661957234144, + "rewards/format_reward": 0.7291666865348816, + "step": 460 + }, + { + "completion_length": 1549.5000534057617, + "epoch": 0.5268571428571428, + "grad_norm": 3.5757029056549072, + "kl": 0.853271484375, + "learning_rate": 1.1657684494105386e-07, + "loss": 0.0342, + "reward": 0.11143558099865913, + "reward_std": 0.7968284860253334, + "rewards/cosine_scaled_reward": -0.10749776661396027, + "rewards/format_reward": 0.6250000186264515, + "step": 461 + }, + { + "completion_length": 1378.3542022705078, + "epoch": 0.528, + "grad_norm": 3.368990421295166, + "kl": 0.8125, + "learning_rate": 1.1574257748745986e-07, + "loss": 0.0325, + "reward": -0.07270677981432527, + "reward_std": 0.5974620878696442, + "rewards/cosine_scaled_reward": -0.23459232598543167, + "rewards/format_reward": 0.708333358168602, + "step": 462 + }, + { + "completion_length": 1423.1250457763672, + "epoch": 0.5291428571428571, + "grad_norm": 3.6016039848327637, + "kl": 0.7799072265625, + "learning_rate": 1.1492947512799328e-07, + "loss": 0.0312, + "reward": 0.2612933642594726, + "reward_std": 0.7466553151607513, + "rewards/cosine_scaled_reward": -0.047830826602876186, + "rewards/format_reward": 0.7500000223517418, + "step": 463 + }, + { + "completion_length": 1017.833366394043, + "epoch": 0.5302857142857142, + "grad_norm": 2.5817713737487793, + "kl": 0.462646484375, + "learning_rate": 1.1413757749211602e-07, + "loss": 0.0185, + "reward": 0.4776353507768363, + "reward_std": 0.6480004005134106, + "rewards/cosine_scaled_reward": 0.10391846485435963, + "rewards/format_reward": 0.8958333395421505, + "step": 464 + }, + { + "completion_length": 1335.3542098999023, + "epoch": 0.5314285714285715, + "grad_norm": 2.435389995574951, + "kl": 0.8916015625, + "learning_rate": 1.1336692317580158e-07, + "loss": 0.0357, + "reward": -0.06275751371867955, + "reward_std": 0.6158505566418171, + "rewards/cosine_scaled_reward": -0.23374686716124415, + "rewards/format_reward": 0.729166679084301, + "step": 465 + }, + { + "completion_length": 1214.8333587646484, + "epoch": 0.5325714285714286, + "grad_norm": 2.2548060417175293, + "kl": 0.5836181640625, + "learning_rate": 1.1261754973965422e-07, + "loss": 0.0233, + "reward": 0.37778762076050043, + "reward_std": 0.9365783184766769, + "rewards/cosine_scaled_reward": -0.0053373780101537704, + "rewards/format_reward": 0.7708333488553762, + "step": 466 + }, + { + "completion_length": 1402.68754196167, + "epoch": 0.5337142857142857, + "grad_norm": 2.9286229610443115, + "kl": 0.724578857421875, + "learning_rate": 1.1188949370707787e-07, + "loss": 0.029, + "reward": 0.11761695193126798, + "reward_std": 0.7316232472658157, + "rewards/cosine_scaled_reward": -0.15633787866681814, + "rewards/format_reward": 0.791666679084301, + "step": 467 + }, + { + "completion_length": 1510.9791946411133, + "epoch": 0.5348571428571428, + "grad_norm": 14.900471687316895, + "kl": 2.21337890625, + "learning_rate": 1.1118279056249653e-07, + "loss": 0.0885, + "reward": 0.21817315090447664, + "reward_std": 0.9960628747940063, + "rewards/cosine_scaled_reward": -0.07175269955769181, + "rewards/format_reward": 0.6250000223517418, + "step": 468 + }, + { + "completion_length": 1016.2917098999023, + "epoch": 0.536, + "grad_norm": 6.350338459014893, + "kl": 0.953369140625, + "learning_rate": 1.1049747474962444e-07, + "loss": 0.0382, + "reward": 0.2713460554368794, + "reward_std": 0.7287629023194313, + "rewards/cosine_scaled_reward": -0.04988390300422907, + "rewards/format_reward": 0.770833358168602, + "step": 469 + }, + { + "completion_length": 1466.6042251586914, + "epoch": 0.5371428571428571, + "grad_norm": 7.370168209075928, + "kl": 1.435546875, + "learning_rate": 1.0983357966978745e-07, + "loss": 0.0574, + "reward": 0.01497307000681758, + "reward_std": 0.733777578920126, + "rewards/cosine_scaled_reward": -0.16290199384093285, + "rewards/format_reward": 0.6250000149011612, + "step": 470 + }, + { + "completion_length": 1592.7083740234375, + "epoch": 0.5382857142857143, + "grad_norm": 7.640194892883301, + "kl": 1.49639892578125, + "learning_rate": 1.0919113768029517e-07, + "loss": 0.0598, + "reward": 0.3828916675411165, + "reward_std": 0.9707400016486645, + "rewards/cosine_scaled_reward": 0.03202553070150316, + "rewards/format_reward": 0.666666679084301, + "step": 471 + }, + { + "completion_length": 1145.1041946411133, + "epoch": 0.5394285714285715, + "grad_norm": 2.8886444568634033, + "kl": 0.802978515625, + "learning_rate": 1.0857018009286381e-07, + "loss": 0.0321, + "reward": 0.28387897345237434, + "reward_std": 0.7857476621866226, + "rewards/cosine_scaled_reward": -0.0928277347702533, + "rewards/format_reward": 0.895833358168602, + "step": 472 + }, + { + "completion_length": 1445.7708740234375, + "epoch": 0.5405714285714286, + "grad_norm": 3.867896318435669, + "kl": 1.0830078125, + "learning_rate": 1.0797073717209013e-07, + "loss": 0.0433, + "reward": 0.11159952421439812, + "reward_std": 0.7589086703956127, + "rewards/cosine_scaled_reward": -0.1572155966423452, + "rewards/format_reward": 0.770833358168602, + "step": 473 + }, + { + "completion_length": 1613.3750686645508, + "epoch": 0.5417142857142857, + "grad_norm": 4.905575275421143, + "kl": 1.672607421875, + "learning_rate": 1.0739283813397639e-07, + "loss": 0.0669, + "reward": 0.428771385923028, + "reward_std": 1.020697444677353, + "rewards/cosine_scaled_reward": 0.044653447810560465, + "rewards/format_reward": 0.6458333563059568, + "step": 474 + }, + { + "completion_length": 1337.0417022705078, + "epoch": 0.5428571428571428, + "grad_norm": 4.179660320281982, + "kl": 0.6060791015625, + "learning_rate": 1.068365111445064e-07, + "loss": 0.0242, + "reward": 0.23191617615520954, + "reward_std": 0.8730170913040638, + "rewards/cosine_scaled_reward": -0.11908163363114, + "rewards/format_reward": 0.8125000223517418, + "step": 475 + }, + { + "completion_length": 1265.6667098999023, + "epoch": 0.544, + "grad_norm": 6.457125186920166, + "kl": 1.1802978515625, + "learning_rate": 1.063017833182728e-07, + "loss": 0.0473, + "reward": 0.32982578047085553, + "reward_std": 0.9863774701952934, + "rewards/cosine_scaled_reward": -0.03677630145102739, + "rewards/format_reward": 0.7291666865348816, + "step": 476 + }, + { + "completion_length": 998.0000381469727, + "epoch": 0.5451428571428572, + "grad_norm": 3.380866050720215, + "kl": 0.7977294921875, + "learning_rate": 1.0578868071715544e-07, + "loss": 0.0319, + "reward": 0.4656647043302655, + "reward_std": 0.9266739711165428, + "rewards/cosine_scaled_reward": 0.003877062350511551, + "rewards/format_reward": 0.833333358168602, + "step": 477 + }, + { + "completion_length": 1333.4583892822266, + "epoch": 0.5462857142857143, + "grad_norm": 25.78436851501465, + "kl": 2.43603515625, + "learning_rate": 1.0529722834905125e-07, + "loss": 0.0974, + "reward": 0.3743471228517592, + "reward_std": 0.8705001547932625, + "rewards/cosine_scaled_reward": 0.015704313293099403, + "rewards/format_reward": 0.7708333507180214, + "step": 478 + }, + { + "completion_length": 1580.6666946411133, + "epoch": 0.5474285714285714, + "grad_norm": 5.661951541900635, + "kl": 1.4453125, + "learning_rate": 1.0482745016665526e-07, + "loss": 0.0579, + "reward": 0.039567636558786035, + "reward_std": 0.7153622172772884, + "rewards/cosine_scaled_reward": -0.17540637124329805, + "rewards/format_reward": 0.7083333507180214, + "step": 479 + }, + { + "completion_length": 1532.4792175292969, + "epoch": 0.5485714285714286, + "grad_norm": 10.698458671569824, + "kl": 1.906494140625, + "learning_rate": 1.0437936906629334e-07, + "loss": 0.0764, + "reward": 0.12518761213868856, + "reward_std": 0.7352484799921513, + "rewards/cosine_scaled_reward": -0.1592358397319913, + "rewards/format_reward": 0.7708333507180214, + "step": 480 + }, + { + "completion_length": 1801.8959045410156, + "epoch": 0.5497142857142857, + "grad_norm": 7.208052158355713, + "kl": 2.16015625, + "learning_rate": 1.0395300688680625e-07, + "loss": 0.0865, + "reward": -0.068937080912292, + "reward_std": 0.678013302385807, + "rewards/cosine_scaled_reward": -0.21540158614516258, + "rewards/format_reward": 0.6666666865348816, + "step": 481 + }, + { + "completion_length": 1408.2083587646484, + "epoch": 0.5508571428571428, + "grad_norm": 28.815509796142578, + "kl": 2.3358154296875, + "learning_rate": 1.0354838440848501e-07, + "loss": 0.0934, + "reward": 0.2200728515163064, + "reward_std": 0.8331051766872406, + "rewards/cosine_scaled_reward": -0.07284185755997896, + "rewards/format_reward": 0.7083333544433117, + "step": 482 + }, + { + "completion_length": 1403.6250457763672, + "epoch": 0.552, + "grad_norm": 4.524230480194092, + "kl": 0.8890380859375, + "learning_rate": 1.0316552135205837e-07, + "loss": 0.0355, + "reward": 0.26419115875614807, + "reward_std": 0.8404405452311039, + "rewards/cosine_scaled_reward": -0.07794593391008675, + "rewards/format_reward": 0.7500000298023224, + "step": 483 + }, + { + "completion_length": 1195.2916870117188, + "epoch": 0.5531428571428572, + "grad_norm": 3.5205888748168945, + "kl": 1.357666015625, + "learning_rate": 1.0280443637773163e-07, + "loss": 0.0543, + "reward": 0.28255582111887634, + "reward_std": 0.9850383549928665, + "rewards/cosine_scaled_reward": -0.051978057832457125, + "rewards/format_reward": 0.6875000260770321, + "step": 484 + }, + { + "completion_length": 1259.208366394043, + "epoch": 0.5542857142857143, + "grad_norm": 2.4940898418426514, + "kl": 0.69293212890625, + "learning_rate": 1.0246514708427701e-07, + "loss": 0.0277, + "reward": 0.11278902753838338, + "reward_std": 0.8309934213757515, + "rewards/cosine_scaled_reward": -0.1679881983436644, + "rewards/format_reward": 0.7708333432674408, + "step": 485 + }, + { + "completion_length": 972.7291946411133, + "epoch": 0.5554285714285714, + "grad_norm": 5.1607160568237305, + "kl": 1.142333984375, + "learning_rate": 1.0214767000817596e-07, + "loss": 0.0456, + "reward": 0.25730944075621665, + "reward_std": 0.7034793458878994, + "rewards/cosine_scaled_reward": -0.04670877754688263, + "rewards/format_reward": 0.8125000223517418, + "step": 486 + }, + { + "completion_length": 1239.770881652832, + "epoch": 0.5565714285714286, + "grad_norm": 4.1862897872924805, + "kl": 0.94287109375, + "learning_rate": 1.0185202062281336e-07, + "loss": 0.0377, + "reward": 0.5008484733989462, + "reward_std": 0.818550631403923, + "rewards/cosine_scaled_reward": 0.08279422484338284, + "rewards/format_reward": 0.8541666865348816, + "step": 487 + }, + { + "completion_length": 1284.854206085205, + "epoch": 0.5577142857142857, + "grad_norm": 3.6226584911346436, + "kl": 0.8634033203125, + "learning_rate": 1.0157821333772304e-07, + "loss": 0.0346, + "reward": 0.09361092420294881, + "reward_std": 0.6568006910383701, + "rewards/cosine_scaled_reward": -0.13768697017803788, + "rewards/format_reward": 0.7708333488553762, + "step": 488 + }, + { + "completion_length": 1505.7708930969238, + "epoch": 0.5588571428571428, + "grad_norm": 3.779109477996826, + "kl": 1.6417236328125, + "learning_rate": 1.013262614978859e-07, + "loss": 0.0656, + "reward": -0.12699575908482075, + "reward_std": 0.6261985003948212, + "rewards/cosine_scaled_reward": -0.29026195663027465, + "rewards/format_reward": 0.7291666865348816, + "step": 489 + }, + { + "completion_length": 1053.770866394043, + "epoch": 0.56, + "grad_norm": 1.6185153722763062, + "kl": 0.48480224609375, + "learning_rate": 1.0109617738307911e-07, + "loss": 0.0194, + "reward": 0.23615788342431188, + "reward_std": 0.704241368919611, + "rewards/cosine_scaled_reward": -0.062381197698414326, + "rewards/format_reward": 0.833333358168602, + "step": 490 + }, + { + "completion_length": 1363.4167022705078, + "epoch": 0.5611428571428572, + "grad_norm": 7.597695350646973, + "kl": 1.367431640625, + "learning_rate": 1.0088797220727779e-07, + "loss": 0.0547, + "reward": 0.5214339741505682, + "reward_std": 1.070147231221199, + "rewards/cosine_scaled_reward": 0.063262770883739, + "rewards/format_reward": 0.7500000149011612, + "step": 491 + }, + { + "completion_length": 1276.8542022705078, + "epoch": 0.5622857142857143, + "grad_norm": 3.7099130153656006, + "kl": 0.9229736328125, + "learning_rate": 1.0070165611810855e-07, + "loss": 0.037, + "reward": 0.25723157986067235, + "reward_std": 0.7029771581292152, + "rewards/cosine_scaled_reward": -0.017836466431617737, + "rewards/format_reward": 0.7291666939854622, + "step": 492 + }, + { + "completion_length": 1266.0625534057617, + "epoch": 0.5634285714285714, + "grad_norm": 3.5932085514068604, + "kl": 1.254150390625, + "learning_rate": 1.005372381963547e-07, + "loss": 0.0501, + "reward": 0.2146676443517208, + "reward_std": 0.8730643317103386, + "rewards/cosine_scaled_reward": -0.10545299621298909, + "rewards/format_reward": 0.7500000223517418, + "step": 493 + }, + { + "completion_length": 1115.4792022705078, + "epoch": 0.5645714285714286, + "grad_norm": 2.4263534545898438, + "kl": 0.810546875, + "learning_rate": 1.0039472645551372e-07, + "loss": 0.0324, + "reward": 0.29131725314073265, + "reward_std": 0.9221536330878735, + "rewards/cosine_scaled_reward": -0.07916020415723324, + "rewards/format_reward": 0.7708333507180214, + "step": 494 + }, + { + "completion_length": 1731.3750686645508, + "epoch": 0.5657142857142857, + "grad_norm": 15.834768295288086, + "kl": 2.26806640625, + "learning_rate": 1.002741278414069e-07, + "loss": 0.0906, + "reward": 0.2820148948812857, + "reward_std": 0.9001934975385666, + "rewards/cosine_scaled_reward": -0.013229399919509888, + "rewards/format_reward": 0.6458333507180214, + "step": 495 + }, + { + "completion_length": 1282.9583702087402, + "epoch": 0.5668571428571428, + "grad_norm": 3.1927053928375244, + "kl": 1.07080078125, + "learning_rate": 1.0017544823184055e-07, + "loss": 0.0428, + "reward": 0.2954734539380297, + "reward_std": 0.7768198624253273, + "rewards/cosine_scaled_reward": -0.05057825893163681, + "rewards/format_reward": 0.7708333656191826, + "step": 496 + }, + { + "completion_length": 1330.3541946411133, + "epoch": 0.568, + "grad_norm": 3.622467279434204, + "kl": 1.56591796875, + "learning_rate": 1.0009869243631952e-07, + "loss": 0.0626, + "reward": 0.2870326414704323, + "reward_std": 0.9923329427838326, + "rewards/cosine_scaled_reward": -0.05224178615026176, + "rewards/format_reward": 0.666666692122817, + "step": 497 + }, + { + "completion_length": 1888.2083435058594, + "epoch": 0.5691428571428572, + "grad_norm": 8.798296928405762, + "kl": 2.22607421875, + "learning_rate": 1.000438641958131e-07, + "loss": 0.089, + "reward": 0.09313779044896364, + "reward_std": 0.8005350790917873, + "rewards/cosine_scaled_reward": -0.1090406347066164, + "rewards/format_reward": 0.5833333488553762, + "step": 498 + }, + { + "completion_length": 1403.8958587646484, + "epoch": 0.5702857142857143, + "grad_norm": 3.579638957977295, + "kl": 1.40771484375, + "learning_rate": 1.0001096618257236e-07, + "loss": 0.0563, + "reward": 0.31425497168675065, + "reward_std": 0.9142299555242062, + "rewards/cosine_scaled_reward": -0.03276859223842621, + "rewards/format_reward": 0.7291666902601719, + "step": 499 + }, + { + "completion_length": 1698.9792022705078, + "epoch": 0.5714285714285714, + "grad_norm": 178.20928955078125, + "kl": 7.2745361328125, + "learning_rate": 1e-07, + "loss": 0.2911, + "reward": 0.14990665763616562, + "reward_std": 0.7996302992105484, + "rewards/cosine_scaled_reward": -0.12167917937040329, + "rewards/format_reward": 0.7083333563059568, + "step": 500 + }, + { + "epoch": 0.5714285714285714, + "step": 500, + "total_flos": 0.0, + "train_loss": 0.01018991470336914, + "train_runtime": 16191.746, + "train_samples_per_second": 1.482, + "train_steps_per_second": 0.031 + } + ], + "logging_steps": 1, + "max_steps": 500, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 50, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 6, + "trial_name": null, + "trial_params": null +}