{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.5714285714285714, "eval_steps": 500, "global_step": 500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "completion_length": 2593.500030517578, "epoch": 0.001142857142857143, "grad_norm": 0.2531319856643677, "kl": 0.0, "learning_rate": 0.0, "loss": 0.0, "reward": 0.1798910442739725, "reward_std": 0.2546093426644802, "rewards/cosine_scaled_reward": 0.03244469128549099, "rewards/format_reward": 0.4791666828095913, "step": 1 }, { "completion_length": 2896.7916870117188, "epoch": 0.002285714285714286, "grad_norm": 0.18851928412914276, "kl": 0.0, "learning_rate": 2e-08, "loss": 0.0, "reward": 0.09316219575703144, "reward_std": 0.29461798816919327, "rewards/cosine_scaled_reward": -0.013163809664547443, "rewards/format_reward": 0.37500001303851604, "step": 2 }, { "completion_length": 3354.7708740234375, "epoch": 0.0034285714285714284, "grad_norm": 0.16173841059207916, "kl": 4.8100948333740234e-05, "learning_rate": 4e-08, "loss": 0.0, "reward": -0.11184105090796947, "reward_std": 0.18034866452217102, "rewards/cosine_scaled_reward": -0.21646979451179504, "rewards/format_reward": 0.12500000558793545, "step": 3 }, { "completion_length": 2534.791717529297, "epoch": 0.004571428571428572, "grad_norm": 0.2748425304889679, "kl": 3.293156623840332e-05, "learning_rate": 6e-08, "loss": 0.0, "reward": 0.09380067978054285, "reward_std": 0.3090171590447426, "rewards/cosine_scaled_reward": -0.14579539687838405, "rewards/format_reward": 0.5208333432674408, "step": 4 }, { "completion_length": 3140.5208740234375, "epoch": 0.005714285714285714, "grad_norm": 0.24623969197273254, "kl": 4.279613494873047e-05, "learning_rate": 8e-08, "loss": 0.0, "reward": -0.09575780294835567, "reward_std": 0.17200364544987679, "rewards/cosine_scaled_reward": -0.2587662376463413, "rewards/format_reward": 0.22916667349636555, "step": 5 }, { "completion_length": 3190.7291870117188, "epoch": 0.006857142857142857, "grad_norm": 0.19608470797538757, "kl": 4.5180320739746094e-05, "learning_rate": 1e-07, "loss": 0.0, "reward": -0.058510275557637215, "reward_std": 0.23042680323123932, "rewards/cosine_scaled_reward": -0.1778582688421011, "rewards/format_reward": 0.22916667722165585, "step": 6 }, { "completion_length": 3066.5416870117188, "epoch": 0.008, "grad_norm": 0.15382979810237885, "kl": 2.86102294921875e-05, "learning_rate": 1.2e-07, "loss": 0.0, "reward": 0.08077196776866913, "reward_std": 0.25011931732296944, "rewards/cosine_scaled_reward": -0.11747794598340988, "rewards/format_reward": 0.5208333432674408, "step": 7 }, { "completion_length": 2712.3125610351562, "epoch": 0.009142857142857144, "grad_norm": 0.1719086617231369, "kl": 1.9162893295288086e-05, "learning_rate": 1.4e-07, "loss": 0.0, "reward": 0.22242407873272896, "reward_std": 0.2544141337275505, "rewards/cosine_scaled_reward": 0.08669950067996979, "rewards/format_reward": 0.4583333432674408, "step": 8 }, { "completion_length": 3236.0834350585938, "epoch": 0.010285714285714285, "grad_norm": 0.18805697560310364, "kl": 3.9696693420410156e-05, "learning_rate": 1.6e-07, "loss": 0.0, "reward": 0.0005422467365860939, "reward_std": 0.22025279328227043, "rewards/cosine_scaled_reward": -0.1387481726706028, "rewards/format_reward": 0.2916666753590107, "step": 9 }, { "completion_length": 2817.2291870117188, "epoch": 0.011428571428571429, "grad_norm": 0.17756684124469757, "kl": 3.081560134887695e-05, "learning_rate": 1.8e-07, "loss": 0.0, "reward": 0.06626716488972306, "reward_std": 0.24357087537646294, "rewards/cosine_scaled_reward": -0.08263782970607281, "rewards/format_reward": 0.3958333358168602, "step": 10 }, { "completion_length": 3375.3958740234375, "epoch": 0.012571428571428572, "grad_norm": 0.1558818519115448, "kl": 3.355741500854492e-05, "learning_rate": 2e-07, "loss": 0.0, "reward": -0.06830349378287792, "reward_std": 0.2444549147039652, "rewards/cosine_scaled_reward": -0.1914975270628929, "rewards/format_reward": 0.14583333395421505, "step": 11 }, { "completion_length": 2640.3959045410156, "epoch": 0.013714285714285714, "grad_norm": 0.19693666696548462, "kl": 3.769993782043457e-05, "learning_rate": 2.1999999999999998e-07, "loss": 0.0, "reward": 0.20038466714322567, "reward_std": 0.2972068637609482, "rewards/cosine_scaled_reward": -0.002930758520960808, "rewards/format_reward": 0.583333358168602, "step": 12 }, { "completion_length": 2929.041748046875, "epoch": 0.014857142857142857, "grad_norm": 0.20476509630680084, "kl": 3.999471664428711e-05, "learning_rate": 2.4e-07, "loss": 0.0, "reward": 0.1786028677597642, "reward_std": 0.2050685416907072, "rewards/cosine_scaled_reward": 0.020189031958580017, "rewards/format_reward": 0.4583333432674408, "step": 13 }, { "completion_length": 2892.0209350585938, "epoch": 0.016, "grad_norm": 0.24246934056282043, "kl": 3.141164779663086e-05, "learning_rate": 2.6e-07, "loss": 0.0, "reward": 0.030550841242074966, "reward_std": 0.2482227310538292, "rewards/cosine_scaled_reward": -0.12857902504038066, "rewards/format_reward": 0.37500000558793545, "step": 14 }, { "completion_length": 2702.1875915527344, "epoch": 0.017142857142857144, "grad_norm": 0.20641961693763733, "kl": 2.3245811462402344e-05, "learning_rate": 2.8e-07, "loss": 0.0, "reward": 0.14474756829440594, "reward_std": 0.23251333087682724, "rewards/cosine_scaled_reward": 0.005298769101500511, "rewards/format_reward": 0.4166666753590107, "step": 15 }, { "completion_length": 3581.0208740234375, "epoch": 0.018285714285714287, "grad_norm": 0.1642305999994278, "kl": 4.464387893676758e-05, "learning_rate": 3e-07, "loss": 0.0, "reward": -0.14038190618157387, "reward_std": 0.2387940175831318, "rewards/cosine_scaled_reward": -0.23054109513759613, "rewards/format_reward": 0.0416666679084301, "step": 16 }, { "completion_length": 2264.666748046875, "epoch": 0.019428571428571427, "grad_norm": 0.2635192275047302, "kl": 4.088878631591797e-05, "learning_rate": 3.2e-07, "loss": 0.0, "reward": 0.1540674790740013, "reward_std": 0.2818757649511099, "rewards/cosine_scaled_reward": -0.022916601970791817, "rewards/format_reward": 0.5416666679084301, "step": 17 }, { "completion_length": 3018.3334350585938, "epoch": 0.02057142857142857, "grad_norm": 0.17216412723064423, "kl": 2.47955322265625e-05, "learning_rate": 3.4000000000000003e-07, "loss": 0.0, "reward": 0.030305630527436733, "reward_std": 0.20705362781882286, "rewards/cosine_scaled_reward": -0.14827953279018402, "rewards/format_reward": 0.354166679084301, "step": 18 }, { "completion_length": 2884.0001220703125, "epoch": 0.021714285714285714, "grad_norm": 0.19174857437610626, "kl": 3.063678741455078e-05, "learning_rate": 3.6e-07, "loss": 0.0, "reward": 0.20790010876953602, "reward_std": 0.29342253506183624, "rewards/cosine_scaled_reward": 0.10294647887349129, "rewards/format_reward": 0.4375000074505806, "step": 19 }, { "completion_length": 2425.541748046875, "epoch": 0.022857142857142857, "grad_norm": 0.2391517460346222, "kl": 2.168118953704834e-05, "learning_rate": 3.7999999999999996e-07, "loss": 0.0, "reward": 0.14567521400749683, "reward_std": 0.2973865978419781, "rewards/cosine_scaled_reward": -0.06746451498474926, "rewards/format_reward": 0.5833333507180214, "step": 20 }, { "completion_length": 2701.6875, "epoch": 0.024, "grad_norm": 0.24159380793571472, "kl": 4.054605960845947e-05, "learning_rate": 4e-07, "loss": 0.0, "reward": 0.05628635361790657, "reward_std": 0.24320747144520283, "rewards/cosine_scaled_reward": -0.12232345715165138, "rewards/format_reward": 0.3958333358168602, "step": 21 }, { "completion_length": 1856.3333740234375, "epoch": 0.025142857142857144, "grad_norm": 0.50559401512146, "kl": 4.360079765319824e-05, "learning_rate": 4.1999999999999995e-07, "loss": 0.0, "reward": 0.23862483352422714, "reward_std": 0.2502210922539234, "rewards/cosine_scaled_reward": -0.017226822674274445, "rewards/format_reward": 0.7500000149011612, "step": 22 }, { "completion_length": 2594.541748046875, "epoch": 0.026285714285714287, "grad_norm": 0.2060898244380951, "kl": 3.0606985092163086e-05, "learning_rate": 4.3999999999999997e-07, "loss": 0.0, "reward": 0.16113443858921528, "reward_std": 0.33911262452602386, "rewards/cosine_scaled_reward": -0.00336562842130661, "rewards/format_reward": 0.5416666865348816, "step": 23 }, { "completion_length": 2759.0833740234375, "epoch": 0.027428571428571427, "grad_norm": 0.21166135370731354, "kl": 3.281235694885254e-05, "learning_rate": 4.6e-07, "loss": 0.0, "reward": 0.15770690888166428, "reward_std": 0.24454708211123943, "rewards/cosine_scaled_reward": -0.006449127569794655, "rewards/format_reward": 0.5000000149011612, "step": 24 }, { "completion_length": 2638.7500610351562, "epoch": 0.02857142857142857, "grad_norm": 0.2307935506105423, "kl": 3.364682197570801e-05, "learning_rate": 4.8e-07, "loss": 0.0, "reward": 0.08012701012194157, "reward_std": 0.2043658159673214, "rewards/cosine_scaled_reward": -0.10021369205787778, "rewards/format_reward": 0.416666679084301, "step": 25 }, { "completion_length": 2893.3333740234375, "epoch": 0.029714285714285714, "grad_norm": 0.16613639891147614, "kl": 3.281235694885254e-05, "learning_rate": 5e-07, "loss": 0.0, "reward": 0.15319424215704203, "reward_std": 0.2377900332212448, "rewards/cosine_scaled_reward": -0.021871407516300678, "rewards/format_reward": 0.5000000149011612, "step": 26 }, { "completion_length": 2979.6043090820312, "epoch": 0.030857142857142857, "grad_norm": 0.22748583555221558, "kl": 3.71783971786499e-05, "learning_rate": 5.2e-07, "loss": 0.0, "reward": 0.20988648012280464, "reward_std": 0.3224359378218651, "rewards/cosine_scaled_reward": 0.012707266956567764, "rewards/format_reward": 0.520833358168602, "step": 27 }, { "completion_length": 2814.7500610351562, "epoch": 0.032, "grad_norm": 0.19229423999786377, "kl": 3.766268491744995e-05, "learning_rate": 5.4e-07, "loss": 0.0, "reward": 0.19077781355008483, "reward_std": 0.2345624901354313, "rewards/cosine_scaled_reward": 0.07308735512197018, "rewards/format_reward": 0.479166679084301, "step": 28 }, { "completion_length": 3052.1458740234375, "epoch": 0.03314285714285714, "grad_norm": 0.20518264174461365, "kl": 2.2858381271362305e-05, "learning_rate": 5.6e-07, "loss": 0.0, "reward": -0.1032419636612758, "reward_std": 0.18674146011471748, "rewards/cosine_scaled_reward": -0.3117845207452774, "rewards/format_reward": 0.3125000111758709, "step": 29 }, { "completion_length": 2897.1458740234375, "epoch": 0.03428571428571429, "grad_norm": 0.18484562635421753, "kl": 2.422928810119629e-05, "learning_rate": 5.8e-07, "loss": 0.0, "reward": 0.15586171858012676, "reward_std": 0.28348754346370697, "rewards/cosine_scaled_reward": -0.0056856535375118256, "rewards/format_reward": 0.47916667722165585, "step": 30 }, { "completion_length": 3157.104248046875, "epoch": 0.03542857142857143, "grad_norm": 0.17953482270240784, "kl": 2.7835369110107422e-05, "learning_rate": 6e-07, "loss": 0.0, "reward": 0.027457450167275965, "reward_std": 0.2997221350669861, "rewards/cosine_scaled_reward": -0.10362934321165085, "rewards/format_reward": 0.27083333767950535, "step": 31 }, { "completion_length": 3132.604248046875, "epoch": 0.036571428571428574, "grad_norm": 0.20453964173793793, "kl": 3.3348798751831055e-05, "learning_rate": 6.2e-07, "loss": 0.0, "reward": 0.12347688060253859, "reward_std": 0.32773852348327637, "rewards/cosine_scaled_reward": -0.04991224408149719, "rewards/format_reward": 0.41666667722165585, "step": 32 }, { "completion_length": 3322.9583740234375, "epoch": 0.037714285714285714, "grad_norm": 0.1382211297750473, "kl": 3.045797348022461e-05, "learning_rate": 6.4e-07, "loss": 0.0, "reward": 0.028675228357315063, "reward_std": 0.24782302975654602, "rewards/cosine_scaled_reward": -0.08641303982585669, "rewards/format_reward": 0.3125000149011612, "step": 33 }, { "completion_length": 2470.854217529297, "epoch": 0.038857142857142854, "grad_norm": 0.30319151282310486, "kl": 3.30507755279541e-05, "learning_rate": 6.6e-07, "loss": 0.0, "reward": 0.201703529804945, "reward_std": 0.31444599851965904, "rewards/cosine_scaled_reward": 0.04628665745258331, "rewards/format_reward": 0.5416666716337204, "step": 34 }, { "completion_length": 3163.2084045410156, "epoch": 0.04, "grad_norm": 0.20104609429836273, "kl": 3.534555435180664e-05, "learning_rate": 6.800000000000001e-07, "loss": 0.0, "reward": -0.018313759297598153, "reward_std": 0.27370789274573326, "rewards/cosine_scaled_reward": -0.1344178761355579, "rewards/format_reward": 0.2083333358168602, "step": 35 }, { "completion_length": 3377.8333740234375, "epoch": 0.04114285714285714, "grad_norm": 0.16919943690299988, "kl": 3.9130449295043945e-05, "learning_rate": 7e-07, "loss": 0.0, "reward": -0.07359358388930559, "reward_std": 0.21812193095684052, "rewards/cosine_scaled_reward": -0.1993401860818267, "rewards/format_reward": 0.2083333395421505, "step": 36 }, { "completion_length": 3403.8958740234375, "epoch": 0.04228571428571429, "grad_norm": 0.1572796255350113, "kl": 1.5166588127613068e-05, "learning_rate": 7.2e-07, "loss": 0.0, "reward": -0.11656345473602414, "reward_std": 0.16832329705357552, "rewards/cosine_scaled_reward": -0.23412149026989937, "rewards/format_reward": 0.14583333395421505, "step": 37 }, { "completion_length": 3141.3125610351562, "epoch": 0.04342857142857143, "grad_norm": 0.16345535218715668, "kl": 2.950429916381836e-05, "learning_rate": 7.4e-07, "loss": 0.0, "reward": -0.01369121391326189, "reward_std": 0.20268694125115871, "rewards/cosine_scaled_reward": -0.1278916783630848, "rewards/format_reward": 0.2083333358168602, "step": 38 }, { "completion_length": 2718.7709045410156, "epoch": 0.044571428571428574, "grad_norm": 0.18454480171203613, "kl": 1.6637146472930908e-05, "learning_rate": 7.599999999999999e-07, "loss": 0.0, "reward": 0.1349094829056412, "reward_std": 0.19345630332827568, "rewards/cosine_scaled_reward": -0.005877137184143066, "rewards/format_reward": 0.4583333432674408, "step": 39 }, { "completion_length": 2670.5209350585938, "epoch": 0.045714285714285714, "grad_norm": 0.18595482409000397, "kl": 2.847611904144287e-05, "learning_rate": 7.799999999999999e-07, "loss": 0.0, "reward": 0.12064463802380487, "reward_std": 0.23050246760249138, "rewards/cosine_scaled_reward": -0.08602435514330864, "rewards/format_reward": 0.5208333544433117, "step": 40 }, { "completion_length": 2916.104248046875, "epoch": 0.046857142857142854, "grad_norm": 0.17687222361564636, "kl": 3.0338764190673828e-05, "learning_rate": 8e-07, "loss": 0.0, "reward": 0.03498362717800774, "reward_std": 0.2998371869325638, "rewards/cosine_scaled_reward": -0.14938516542315483, "rewards/format_reward": 0.4166666716337204, "step": 41 }, { "completion_length": 2854.104217529297, "epoch": 0.048, "grad_norm": 0.2547663450241089, "kl": 4.5239925384521484e-05, "learning_rate": 8.199999999999999e-07, "loss": 0.0, "reward": -0.06212767260149121, "reward_std": 0.1429902408272028, "rewards/cosine_scaled_reward": -0.2606445848941803, "rewards/format_reward": 0.35416666977107525, "step": 42 }, { "completion_length": 3066.854248046875, "epoch": 0.04914285714285714, "grad_norm": 0.1688769906759262, "kl": 2.8118491172790527e-05, "learning_rate": 8.399999999999999e-07, "loss": 0.0, "reward": 0.06209855154156685, "reward_std": 0.3040098361670971, "rewards/cosine_scaled_reward": -0.0826764814555645, "rewards/format_reward": 0.2916666716337204, "step": 43 }, { "completion_length": 2718.4791717529297, "epoch": 0.05028571428571429, "grad_norm": 0.2759998142719269, "kl": 0.00011104345321655273, "learning_rate": 8.599999999999999e-07, "loss": 0.0, "reward": 0.16620675288140774, "reward_std": 0.2648898549377918, "rewards/cosine_scaled_reward": 0.023219330236315727, "rewards/format_reward": 0.5, "step": 44 }, { "completion_length": 3411.875, "epoch": 0.05142857142857143, "grad_norm": 0.15706008672714233, "kl": 5.768239498138428e-05, "learning_rate": 8.799999999999999e-07, "loss": 0.0, "reward": -0.010180938057601452, "reward_std": 0.23901794850826263, "rewards/cosine_scaled_reward": -0.10855040326714516, "rewards/format_reward": 0.20833334513008595, "step": 45 }, { "completion_length": 3205.1666870117188, "epoch": 0.052571428571428575, "grad_norm": 0.18602944910526276, "kl": 5.361437797546387e-05, "learning_rate": 9e-07, "loss": 0.0, "reward": -0.04720673710107803, "reward_std": 0.14394842460751534, "rewards/cosine_scaled_reward": -0.16787217557430267, "rewards/format_reward": 0.20833334513008595, "step": 46 }, { "completion_length": 2893.166748046875, "epoch": 0.053714285714285714, "grad_norm": 0.2058715671300888, "kl": 3.2939016819000244e-05, "learning_rate": 9.2e-07, "loss": 0.0, "reward": 0.2286459095776081, "reward_std": 0.28589488565921783, "rewards/cosine_scaled_reward": 0.07320494949817657, "rewards/format_reward": 0.5208333488553762, "step": 47 }, { "completion_length": 2904.6875610351562, "epoch": 0.054857142857142854, "grad_norm": 0.22265109419822693, "kl": 0.0001233220100402832, "learning_rate": 9.399999999999999e-07, "loss": 0.0, "reward": 0.007764505222439766, "reward_std": 0.16852838546037674, "rewards/cosine_scaled_reward": -0.16211793571710587, "rewards/format_reward": 0.3125000074505806, "step": 48 }, { "completion_length": 2427.479248046875, "epoch": 0.056, "grad_norm": 0.2190365344285965, "kl": 0.00010353326797485352, "learning_rate": 9.6e-07, "loss": 0.0, "reward": 0.1893621925264597, "reward_std": 0.31022848933935165, "rewards/cosine_scaled_reward": 0.017491597682237625, "rewards/format_reward": 0.5625000298023224, "step": 49 }, { "completion_length": 2934.3541870117188, "epoch": 0.05714285714285714, "grad_norm": 0.18039213120937347, "kl": 7.003545761108398e-05, "learning_rate": 9.8e-07, "loss": 0.0, "reward": 0.11967798322439194, "reward_std": 0.19052543118596077, "rewards/cosine_scaled_reward": 0.040288787335157394, "rewards/format_reward": 0.31250000558793545, "step": 50 }, { "completion_length": 2329.4584350585938, "epoch": 0.05828571428571429, "grad_norm": 0.23464825749397278, "kl": 0.0002582073211669922, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.12857681699097157, "reward_std": 0.20647091418504715, "rewards/cosine_scaled_reward": -0.0722556822001934, "rewards/format_reward": 0.5833333432674408, "step": 51 }, { "completion_length": 2854.500030517578, "epoch": 0.05942857142857143, "grad_norm": 0.21867266297340393, "kl": 0.00016963481903076172, "learning_rate": 9.999890338174275e-07, "loss": 0.0, "reward": 0.14125719666481018, "reward_std": 0.32246551662683487, "rewards/cosine_scaled_reward": 0.0029166871681809425, "rewards/format_reward": 0.3958333432674408, "step": 52 }, { "completion_length": 2820.2500915527344, "epoch": 0.060571428571428575, "grad_norm": 0.19331412017345428, "kl": 0.00015205144882202148, "learning_rate": 9.999561358041868e-07, "loss": 0.0, "reward": 0.1258083715365501, "reward_std": 0.32310857623815536, "rewards/cosine_scaled_reward": -0.020295356400310993, "rewards/format_reward": 0.4791666716337204, "step": 53 }, { "completion_length": 2856.0833740234375, "epoch": 0.061714285714285715, "grad_norm": 0.1580573320388794, "kl": 3.808736801147461e-05, "learning_rate": 9.999013075636804e-07, "loss": 0.0, "reward": 0.2725381199270487, "reward_std": 0.36154576390981674, "rewards/cosine_scaled_reward": 0.1424376405775547, "rewards/format_reward": 0.5208333488553762, "step": 54 }, { "completion_length": 2909.1250610351562, "epoch": 0.06285714285714286, "grad_norm": 0.18711332976818085, "kl": 0.00010773539543151855, "learning_rate": 9.998245517681593e-07, "loss": 0.0, "reward": 0.13717407977674156, "reward_std": 0.3047039993107319, "rewards/cosine_scaled_reward": -0.015960073098540306, "rewards/format_reward": 0.4166666716337204, "step": 55 }, { "completion_length": 2983.4583740234375, "epoch": 0.064, "grad_norm": 0.172135129570961, "kl": 5.167722702026367e-05, "learning_rate": 9.997258721585931e-07, "loss": 0.0, "reward": 0.06429924443364143, "reward_std": 0.23643197491765022, "rewards/cosine_scaled_reward": -0.09967034682631493, "rewards/format_reward": 0.375, "step": 56 }, { "completion_length": 3349.291748046875, "epoch": 0.06514285714285714, "grad_norm": 0.13240715861320496, "kl": 3.217160701751709e-05, "learning_rate": 9.996052735444862e-07, "loss": 0.0, "reward": 0.08298390917479992, "reward_std": 0.2719106115400791, "rewards/cosine_scaled_reward": -0.061886819545179605, "rewards/format_reward": 0.3333333432674408, "step": 57 }, { "completion_length": 2203.2291870117188, "epoch": 0.06628571428571428, "grad_norm": 0.22709013521671295, "kl": 0.00039386749267578125, "learning_rate": 9.994627618036452e-07, "loss": 0.0, "reward": 0.2722779009491205, "reward_std": 0.2938956469297409, "rewards/cosine_scaled_reward": 0.08483442291617393, "rewards/format_reward": 0.666666692122817, "step": 58 }, { "completion_length": 2840.187530517578, "epoch": 0.06742857142857143, "grad_norm": 0.17239369451999664, "kl": 1.9103288650512695e-05, "learning_rate": 9.992983438818915e-07, "loss": 0.0, "reward": 0.0854027196764946, "reward_std": 0.22641704231500626, "rewards/cosine_scaled_reward": -0.06355120055377483, "rewards/format_reward": 0.37500000558793545, "step": 59 }, { "completion_length": 3012.1250610351562, "epoch": 0.06857142857142857, "grad_norm": 0.1650194674730301, "kl": 7.164478302001953e-05, "learning_rate": 9.991120277927223e-07, "loss": 0.0, "reward": 0.012519015290308744, "reward_std": 0.22390243411064148, "rewards/cosine_scaled_reward": -0.14655769802629948, "rewards/format_reward": 0.375, "step": 60 }, { "completion_length": 3101.8126220703125, "epoch": 0.06971428571428571, "grad_norm": 0.17062878608703613, "kl": 0.00014269817620515823, "learning_rate": 9.989038226169207e-07, "loss": 0.0, "reward": 0.05342123447917402, "reward_std": 0.25336652249097824, "rewards/cosine_scaled_reward": -0.15571192651987076, "rewards/format_reward": 0.4791666716337204, "step": 61 }, { "completion_length": 2601.5833435058594, "epoch": 0.07085714285714285, "grad_norm": 0.22512151300907135, "kl": 0.0006105899810791016, "learning_rate": 9.98673738502114e-07, "loss": 0.0, "reward": 0.17650287225842476, "reward_std": 0.3199679031968117, "rewards/cosine_scaled_reward": -0.06858954066410661, "rewards/format_reward": 0.5625000149011612, "step": 62 }, { "completion_length": 2216.9166870117188, "epoch": 0.072, "grad_norm": 0.23623046278953552, "kl": 0.0003886222839355469, "learning_rate": 9.98421786662277e-07, "loss": 0.0, "reward": 0.2345086196437478, "reward_std": 0.364711195230484, "rewards/cosine_scaled_reward": -0.0016382848843932152, "rewards/format_reward": 0.7083333358168602, "step": 63 }, { "completion_length": 2802.8125915527344, "epoch": 0.07314285714285715, "grad_norm": 0.17913182079792023, "kl": 0.00014469027519226074, "learning_rate": 9.981479793771866e-07, "loss": 0.0, "reward": 0.13276350032538176, "reward_std": 0.3188353106379509, "rewards/cosine_scaled_reward": -0.014707108959555626, "rewards/format_reward": 0.4166666828095913, "step": 64 }, { "completion_length": 2648.791717529297, "epoch": 0.07428571428571429, "grad_norm": 0.19293726980686188, "kl": 0.00011703372001647949, "learning_rate": 9.97852329991824e-07, "loss": 0.0, "reward": 0.02339506382122636, "reward_std": 0.16691064462065697, "rewards/cosine_scaled_reward": -0.16883965581655502, "rewards/format_reward": 0.4166666865348816, "step": 65 }, { "completion_length": 2108.4375228881836, "epoch": 0.07542857142857143, "grad_norm": 0.2505956292152405, "kl": 0.00021564960479736328, "learning_rate": 9.975348529157229e-07, "loss": 0.0, "reward": 0.18803048506379128, "reward_std": 0.23812443763017654, "rewards/cosine_scaled_reward": 0.02932748757302761, "rewards/format_reward": 0.5, "step": 66 }, { "completion_length": 3409.4583740234375, "epoch": 0.07657142857142857, "grad_norm": 0.15418125689029694, "kl": 0.00023257732391357422, "learning_rate": 9.971955636222684e-07, "loss": 0.0, "reward": -0.13601324148476124, "reward_std": 0.14229955151677132, "rewards/cosine_scaled_reward": -0.28745780140161514, "rewards/format_reward": 0.1666666716337204, "step": 67 }, { "completion_length": 1989.791748046875, "epoch": 0.07771428571428571, "grad_norm": 0.2515312433242798, "kl": 0.0010142326354980469, "learning_rate": 9.968344786479415e-07, "loss": 0.0, "reward": 0.22917652688920498, "reward_std": 0.29839884862303734, "rewards/cosine_scaled_reward": -0.01369619369506836, "rewards/format_reward": 0.6666666716337204, "step": 68 }, { "completion_length": 2491.041748046875, "epoch": 0.07885714285714286, "grad_norm": 0.26017820835113525, "kl": 0.0007710456848144531, "learning_rate": 9.964516155915151e-07, "loss": 0.0, "reward": 0.049685924575896934, "reward_std": 0.2717648334801197, "rewards/cosine_scaled_reward": -0.1978147281333804, "rewards/format_reward": 0.5000000149011612, "step": 69 }, { "completion_length": 3060.7083740234375, "epoch": 0.08, "grad_norm": 0.16957440972328186, "kl": 0.0008012652397155762, "learning_rate": 9.960469931131936e-07, "loss": 0.0, "reward": -0.042351190000772476, "reward_std": 0.1818229742348194, "rewards/cosine_scaled_reward": -0.21522082015872002, "rewards/format_reward": 0.3333333432674408, "step": 70 }, { "completion_length": 2694.312545776367, "epoch": 0.08114285714285714, "grad_norm": 0.1995006948709488, "kl": 0.0007072687149047852, "learning_rate": 9.956206309337066e-07, "loss": 0.0, "reward": 0.07359633408486843, "reward_std": 0.22721958719193935, "rewards/cosine_scaled_reward": -0.06798692792654037, "rewards/format_reward": 0.3541666716337204, "step": 71 }, { "completion_length": 2656.0625, "epoch": 0.08228571428571428, "grad_norm": 0.21664902567863464, "kl": 0.0008908510208129883, "learning_rate": 9.951725498333448e-07, "loss": 0.0, "reward": 0.04184230323880911, "reward_std": 0.22974644601345062, "rewards/cosine_scaled_reward": -0.15995099861174822, "rewards/format_reward": 0.47916667722165585, "step": 72 }, { "completion_length": 3503.125, "epoch": 0.08342857142857144, "grad_norm": 0.17171922326087952, "kl": 0.00013053417205810547, "learning_rate": 9.947027716509488e-07, "loss": 0.0, "reward": -0.05220331740565598, "reward_std": 0.246462631970644, "rewards/cosine_scaled_reward": -0.1583964079618454, "rewards/format_reward": 0.1875000074505806, "step": 73 }, { "completion_length": 3180.4791870117188, "epoch": 0.08457142857142858, "grad_norm": 0.19165758788585663, "kl": 0.00042688846588134766, "learning_rate": 9.942113192828444e-07, "loss": 0.0, "reward": 0.03528491756878793, "reward_std": 0.24782484769821167, "rewards/cosine_scaled_reward": -0.04785974510014057, "rewards/format_reward": 0.2916666716337204, "step": 74 }, { "completion_length": 3037.416748046875, "epoch": 0.08571428571428572, "grad_norm": 0.15819406509399414, "kl": 0.0006095767021179199, "learning_rate": 9.93698216681727e-07, "loss": 0.0, "reward": 0.0823002946563065, "reward_std": 0.19139036908745766, "rewards/cosine_scaled_reward": -0.05699274316430092, "rewards/format_reward": 0.37500001676380634, "step": 75 }, { "completion_length": 2817.7709350585938, "epoch": 0.08685714285714285, "grad_norm": 0.1885841190814972, "kl": 0.0001068115234375, "learning_rate": 9.931634888554935e-07, "loss": 0.0, "reward": 0.03719830792397261, "reward_std": 0.24271035939455032, "rewards/cosine_scaled_reward": -0.178798396140337, "rewards/format_reward": 0.4166666753590107, "step": 76 }, { "completion_length": 3224.6458740234375, "epoch": 0.088, "grad_norm": 0.1698525995016098, "kl": 0.00024068355560302734, "learning_rate": 9.926071618660237e-07, "loss": 0.0, "reward": 0.005038566887378693, "reward_std": 0.20738781057298183, "rewards/cosine_scaled_reward": -0.15254707634449005, "rewards/format_reward": 0.2708333432674408, "step": 77 }, { "completion_length": 3077.8543090820312, "epoch": 0.08914285714285715, "grad_norm": 0.20861662924289703, "kl": 0.0001271367073059082, "learning_rate": 9.9202926282791e-07, "loss": 0.0, "reward": 0.10431988351047039, "reward_std": 0.31425832584500313, "rewards/cosine_scaled_reward": -0.03074691817164421, "rewards/format_reward": 0.33333333395421505, "step": 78 }, { "completion_length": 2268.041717529297, "epoch": 0.09028571428571429, "grad_norm": 0.22589290142059326, "kl": 0.000714719295501709, "learning_rate": 9.91429819907136e-07, "loss": 0.0, "reward": 0.10906978440470994, "reward_std": 0.18880136497318745, "rewards/cosine_scaled_reward": -0.09606132353655994, "rewards/format_reward": 0.5, "step": 79 }, { "completion_length": 3246.6250610351562, "epoch": 0.09142857142857143, "grad_norm": 0.18828435242176056, "kl": 0.0005083680152893066, "learning_rate": 9.908088623197048e-07, "loss": 0.0, "reward": 0.06318249693140388, "reward_std": 0.31002140790224075, "rewards/cosine_scaled_reward": -0.07233269140124321, "rewards/format_reward": 0.2916666679084301, "step": 80 }, { "completion_length": 3215.3333740234375, "epoch": 0.09257142857142857, "grad_norm": 0.27181336283683777, "kl": 0.0016459226608276367, "learning_rate": 9.901664203302124e-07, "loss": 0.0001, "reward": -0.06593337655067444, "reward_std": 0.228203646838665, "rewards/cosine_scaled_reward": -0.1975144650787115, "rewards/format_reward": 0.1875, "step": 81 }, { "completion_length": 2783.4375610351562, "epoch": 0.09371428571428571, "grad_norm": 0.17714105546474457, "kl": 0.0010538101196289062, "learning_rate": 9.895025252503755e-07, "loss": 0.0, "reward": 0.10739830927923322, "reward_std": 0.2685987316071987, "rewards/cosine_scaled_reward": -0.03886566497385502, "rewards/format_reward": 0.4166666716337204, "step": 82 }, { "completion_length": 2704.0833740234375, "epoch": 0.09485714285714286, "grad_norm": 0.2489301860332489, "kl": 0.0006620883941650391, "learning_rate": 9.888172094375033e-07, "loss": 0.0, "reward": 0.051296481397002935, "reward_std": 0.22126955911517143, "rewards/cosine_scaled_reward": -0.10262950323522091, "rewards/format_reward": 0.33333333395421505, "step": 83 }, { "completion_length": 2919.479217529297, "epoch": 0.096, "grad_norm": 0.16122053563594818, "kl": 0.0003643035888671875, "learning_rate": 9.881105062929221e-07, "loss": 0.0, "reward": 0.14531802013516426, "reward_std": 0.30427272617816925, "rewards/cosine_scaled_reward": 0.02680143341422081, "rewards/format_reward": 0.3958333358168602, "step": 84 }, { "completion_length": 3221.625, "epoch": 0.09714285714285714, "grad_norm": 0.13713707029819489, "kl": 0.00022965669631958008, "learning_rate": 9.873824502603459e-07, "loss": 0.0, "reward": 0.045454833656549454, "reward_std": 0.32109829783439636, "rewards/cosine_scaled_reward": -0.0938787111081183, "rewards/format_reward": 0.33333334513008595, "step": 85 }, { "completion_length": 3070.1666870117188, "epoch": 0.09828571428571428, "grad_norm": 0.1718609482049942, "kl": 0.0007703304290771484, "learning_rate": 9.866330768241983e-07, "loss": 0.0, "reward": 0.020616615191102028, "reward_std": 0.20714357122778893, "rewards/cosine_scaled_reward": -0.12587381899356842, "rewards/format_reward": 0.3125000074505806, "step": 86 }, { "completion_length": 2553.8126220703125, "epoch": 0.09942857142857142, "grad_norm": 0.23934882879257202, "kl": 0.0006546974182128906, "learning_rate": 9.85862422507884e-07, "loss": 0.0, "reward": 0.16441121231764555, "reward_std": 0.27248556166887283, "rewards/cosine_scaled_reward": -0.07301743514835835, "rewards/format_reward": 0.6041666865348816, "step": 87 }, { "completion_length": 2649.6666870117188, "epoch": 0.10057142857142858, "grad_norm": 0.24427059292793274, "kl": 0.0014219284057617188, "learning_rate": 9.850705248720068e-07, "loss": 0.0001, "reward": 0.30650845542550087, "reward_std": 0.442048154771328, "rewards/cosine_scaled_reward": 0.09996251187112648, "rewards/format_reward": 0.645833358168602, "step": 88 }, { "completion_length": 3074.9793090820312, "epoch": 0.10171428571428572, "grad_norm": 0.20860984921455383, "kl": 0.0009670257568359375, "learning_rate": 9.8425742251254e-07, "loss": 0.0, "reward": 0.06839994341135025, "reward_std": 0.3136523775756359, "rewards/cosine_scaled_reward": -0.0889416765421629, "rewards/format_reward": 0.3333333469927311, "step": 89 }, { "completion_length": 2521.3959045410156, "epoch": 0.10285714285714286, "grad_norm": 0.26103153824806213, "kl": 0.001010894775390625, "learning_rate": 9.83423155058946e-07, "loss": 0.0, "reward": -0.015254740603268147, "reward_std": 0.12981478869915009, "rewards/cosine_scaled_reward": -0.2523176036775112, "rewards/format_reward": 0.47916667722165585, "step": 90 }, { "completion_length": 3130.4166870117188, "epoch": 0.104, "grad_norm": 0.1804915815591812, "kl": 0.0006823539733886719, "learning_rate": 9.825677631722435e-07, "loss": 0.0, "reward": 0.024226047098636627, "reward_std": 0.24618754535913467, "rewards/cosine_scaled_reward": -0.13014899473637342, "rewards/format_reward": 0.33333333395421505, "step": 91 }, { "completion_length": 2594.5000915527344, "epoch": 0.10514285714285715, "grad_norm": 0.25217053294181824, "kl": 0.0017712712287902832, "learning_rate": 9.816912885430258e-07, "loss": 0.0001, "reward": 0.06422214396297932, "reward_std": 0.20822307094931602, "rewards/cosine_scaled_reward": -0.12269663874758407, "rewards/format_reward": 0.47916666977107525, "step": 92 }, { "completion_length": 3538.125, "epoch": 0.10628571428571429, "grad_norm": 0.16923604905605316, "kl": 0.0010764598846435547, "learning_rate": 9.807937738894303e-07, "loss": 0.0, "reward": -0.16369806230068207, "reward_std": 0.1373737584799528, "rewards/cosine_scaled_reward": -0.2538382336497307, "rewards/format_reward": 0.02083333395421505, "step": 93 }, { "completion_length": 3163.4583740234375, "epoch": 0.10742857142857143, "grad_norm": 0.1778799593448639, "kl": 0.001428365707397461, "learning_rate": 9.798752629550546e-07, "loss": 0.0001, "reward": 0.014819767326116562, "reward_std": 0.23536204546689987, "rewards/cosine_scaled_reward": -0.11371379345655441, "rewards/format_reward": 0.2083333358168602, "step": 94 }, { "completion_length": 3426.7083740234375, "epoch": 0.10857142857142857, "grad_norm": 0.17324331402778625, "kl": 0.0004151463508605957, "learning_rate": 9.78935800506826e-07, "loss": 0.0, "reward": -0.035191090777516365, "reward_std": 0.27483338490128517, "rewards/cosine_scaled_reward": -0.1303609658498317, "rewards/format_reward": 0.1875000074505806, "step": 95 }, { "completion_length": 2760.25, "epoch": 0.10971428571428571, "grad_norm": 0.17773400247097015, "kl": 0.0018305778503417969, "learning_rate": 9.779754323328192e-07, "loss": 0.0001, "reward": 0.08919629082083702, "reward_std": 0.22743511945009232, "rewards/cosine_scaled_reward": -0.016248881816864014, "rewards/format_reward": 0.35416666977107525, "step": 96 }, { "completion_length": 3314.3959350585938, "epoch": 0.11085714285714286, "grad_norm": 0.16482950747013092, "kl": 0.0007615089416503906, "learning_rate": 9.769942052400235e-07, "loss": 0.0, "reward": -0.014882845804095268, "reward_std": 0.28871480002999306, "rewards/cosine_scaled_reward": -0.17778804525732994, "rewards/format_reward": 0.2291666679084301, "step": 97 }, { "completion_length": 2971.729248046875, "epoch": 0.112, "grad_norm": 0.15072181820869446, "kl": 0.0003001689910888672, "learning_rate": 9.759921670520634e-07, "loss": 0.0, "reward": 0.03349009482190013, "reward_std": 0.18237757682800293, "rewards/cosine_scaled_reward": -0.1615230068564415, "rewards/format_reward": 0.3958333432674408, "step": 98 }, { "completion_length": 2852.7708740234375, "epoch": 0.11314285714285714, "grad_norm": 0.19570456445217133, "kl": 0.0006053447723388672, "learning_rate": 9.749693666068663e-07, "loss": 0.0, "reward": 0.06496170163154602, "reward_std": 0.21407892182469368, "rewards/cosine_scaled_reward": -0.0664414819329977, "rewards/format_reward": 0.3125, "step": 99 }, { "completion_length": 2670.4168090820312, "epoch": 0.11428571428571428, "grad_norm": 0.17085237801074982, "kl": 0.0008068084716796875, "learning_rate": 9.739258537542835e-07, "loss": 0.0, "reward": 0.15346461767330766, "reward_std": 0.30039672553539276, "rewards/cosine_scaled_reward": -0.029875319451093674, "rewards/format_reward": 0.4791666865348816, "step": 100 }, { "completion_length": 3090.6875610351562, "epoch": 0.11542857142857142, "grad_norm": 0.17470602691173553, "kl": 0.0010237693786621094, "learning_rate": 9.728616793536587e-07, "loss": 0.0, "reward": 0.04014189355075359, "reward_std": 0.22363552451133728, "rewards/cosine_scaled_reward": -0.09809151291847229, "rewards/format_reward": 0.3125000074505806, "step": 101 }, { "completion_length": 2311.3750610351562, "epoch": 0.11657142857142858, "grad_norm": 0.2883223593235016, "kl": 0.002286195755004883, "learning_rate": 9.717768952713511e-07, "loss": 0.0001, "reward": 0.1678666821680963, "reward_std": 0.21508530527353287, "rewards/cosine_scaled_reward": -0.10981736332178116, "rewards/format_reward": 0.6875000149011612, "step": 102 }, { "completion_length": 3020.3125915527344, "epoch": 0.11771428571428572, "grad_norm": 0.22340644896030426, "kl": 0.0008773207664489746, "learning_rate": 9.706715543782064e-07, "loss": 0.0, "reward": 0.09143933840095997, "reward_std": 0.27459392696619034, "rewards/cosine_scaled_reward": -0.08100417070090771, "rewards/format_reward": 0.3958333395421505, "step": 103 }, { "completion_length": 2666.041748046875, "epoch": 0.11885714285714286, "grad_norm": 0.2013712376356125, "kl": 0.001735687255859375, "learning_rate": 9.695457105469804e-07, "loss": 0.0001, "reward": 0.04090608523983974, "reward_std": 0.13189667649567127, "rewards/cosine_scaled_reward": -0.14965718239545822, "rewards/format_reward": 0.4166666679084301, "step": 104 }, { "completion_length": 2776.3125, "epoch": 0.12, "grad_norm": 0.21267718076705933, "kl": 0.0008919239044189453, "learning_rate": 9.683994186497132e-07, "loss": 0.0, "reward": 0.08371754828840494, "reward_std": 0.24455546587705612, "rewards/cosine_scaled_reward": -0.024641111493110657, "rewards/format_reward": 0.33333333395421505, "step": 105 }, { "completion_length": 2168.854217529297, "epoch": 0.12114285714285715, "grad_norm": 0.194187730550766, "kl": 0.0012993812561035156, "learning_rate": 9.672327345550543e-07, "loss": 0.0001, "reward": 0.37329915910959244, "reward_std": 0.23653614707291126, "rewards/cosine_scaled_reward": 0.2502317950129509, "rewards/format_reward": 0.6666666772216558, "step": 106 }, { "completion_length": 3031.2916870117188, "epoch": 0.12228571428571429, "grad_norm": 0.22123286128044128, "kl": 0.0007143020629882812, "learning_rate": 9.66045715125541e-07, "loss": 0.0, "reward": 0.06332904286682606, "reward_std": 0.2138124220073223, "rewards/cosine_scaled_reward": -0.10465502738952637, "rewards/format_reward": 0.3750000111758709, "step": 107 }, { "completion_length": 2653.354248046875, "epoch": 0.12342857142857143, "grad_norm": 0.19954904913902283, "kl": 0.001068115234375, "learning_rate": 9.648384182148252e-07, "loss": 0.0, "reward": 0.1708927322179079, "reward_std": 0.2717986926436424, "rewards/cosine_scaled_reward": 0.013383063487708569, "rewards/format_reward": 0.479166679084301, "step": 108 }, { "completion_length": 3006.3333740234375, "epoch": 0.12457142857142857, "grad_norm": 0.15602044761180878, "kl": 0.0003781318664550781, "learning_rate": 9.636109026648554e-07, "loss": 0.0, "reward": -0.01096257008612156, "reward_std": 0.15808527171611786, "rewards/cosine_scaled_reward": -0.19856059784069657, "rewards/format_reward": 0.375, "step": 109 }, { "completion_length": 2700.2501220703125, "epoch": 0.12571428571428572, "grad_norm": 0.23633067309856415, "kl": 0.0006241798400878906, "learning_rate": 9.623632283030077e-07, "loss": 0.0, "reward": 0.14015202317386866, "reward_std": 0.3137088418006897, "rewards/cosine_scaled_reward": -0.07339744362980127, "rewards/format_reward": 0.5, "step": 110 }, { "completion_length": 2980.729248046875, "epoch": 0.12685714285714286, "grad_norm": 0.20043744146823883, "kl": 0.0012969970703125, "learning_rate": 9.610954559391704e-07, "loss": 0.0001, "reward": 0.12000765651464462, "reward_std": 0.3265395238995552, "rewards/cosine_scaled_reward": -0.05010761972516775, "rewards/format_reward": 0.354166679084301, "step": 111 }, { "completion_length": 3150.8333740234375, "epoch": 0.128, "grad_norm": 0.16131086647510529, "kl": 0.0005326271057128906, "learning_rate": 9.598076473627796e-07, "loss": 0.0, "reward": 0.12711239233613014, "reward_std": 0.37858303636312485, "rewards/cosine_scaled_reward": -0.00730159692466259, "rewards/format_reward": 0.39583334885537624, "step": 112 }, { "completion_length": 2972.2083435058594, "epoch": 0.12914285714285714, "grad_norm": 0.1954687088727951, "kl": 0.0014047622680664062, "learning_rate": 9.58499865339809e-07, "loss": 0.0001, "reward": 0.07627532246988267, "reward_std": 0.291649978607893, "rewards/cosine_scaled_reward": -0.07863758876919746, "rewards/format_reward": 0.3750000037252903, "step": 113 }, { "completion_length": 2534.3334045410156, "epoch": 0.13028571428571428, "grad_norm": 0.19298915565013885, "kl": 0.0018050670623779297, "learning_rate": 9.571721736097088e-07, "loss": 0.0001, "reward": 0.10349475312978029, "reward_std": 0.2791562117636204, "rewards/cosine_scaled_reward": -0.17762713879346848, "rewards/format_reward": 0.5833333507180214, "step": 114 }, { "completion_length": 2881.6459350585938, "epoch": 0.13142857142857142, "grad_norm": 0.20512622594833374, "kl": 0.002078533172607422, "learning_rate": 9.55824636882301e-07, "loss": 0.0001, "reward": 0.061600953340530396, "reward_std": 0.26807834208011627, "rewards/cosine_scaled_reward": -0.0658574104309082, "rewards/format_reward": 0.33333334885537624, "step": 115 }, { "completion_length": 3314.8959350585938, "epoch": 0.13257142857142856, "grad_norm": 0.18601059913635254, "kl": 0.0011138916015625, "learning_rate": 9.54457320834625e-07, "loss": 0.0, "reward": -0.046209411695599556, "reward_std": 0.27809127047657967, "rewards/cosine_scaled_reward": -0.19332470558583736, "rewards/format_reward": 0.22916667349636555, "step": 116 }, { "completion_length": 3279.4166870117188, "epoch": 0.1337142857142857, "grad_norm": 0.1657201051712036, "kl": 0.001247406005859375, "learning_rate": 9.530702921077358e-07, "loss": 0.0, "reward": -0.013633315684273839, "reward_std": 0.25296393781900406, "rewards/cosine_scaled_reward": -0.1371129583567381, "rewards/format_reward": 0.25000000186264515, "step": 117 }, { "completion_length": 3042.2709350585938, "epoch": 0.13485714285714287, "grad_norm": 0.15822374820709229, "kl": 0.0009875297546386719, "learning_rate": 9.516636183034564e-07, "loss": 0.0, "reward": 0.2582199349999428, "reward_std": 0.44860056787729263, "rewards/cosine_scaled_reward": 0.1183284455910325, "rewards/format_reward": 0.4375000074505806, "step": 118 }, { "completion_length": 2344.666748046875, "epoch": 0.136, "grad_norm": 0.23042984306812286, "kl": 0.0028333663940429688, "learning_rate": 9.502373679810839e-07, "loss": 0.0001, "reward": 0.19421357102692127, "reward_std": 0.2767038568854332, "rewards/cosine_scaled_reward": -0.02037617191672325, "rewards/format_reward": 0.6250000149011612, "step": 119 }, { "completion_length": 2476.979217529297, "epoch": 0.13714285714285715, "grad_norm": 0.20887106657028198, "kl": 0.0014963150024414062, "learning_rate": 9.487916106540465e-07, "loss": 0.0001, "reward": 0.04804867971688509, "reward_std": 0.2182541899383068, "rewards/cosine_scaled_reward": -0.19375143572688103, "rewards/format_reward": 0.5000000074505806, "step": 120 }, { "completion_length": 1820.5000915527344, "epoch": 0.1382857142857143, "grad_norm": 0.5606526732444763, "kl": 0.017534255981445312, "learning_rate": 9.473264167865171e-07, "loss": 0.0007, "reward": 0.20858957897871733, "reward_std": 0.2296605035662651, "rewards/cosine_scaled_reward": -0.03449038416147232, "rewards/format_reward": 0.7500000149011612, "step": 121 }, { "completion_length": 3009.3125610351562, "epoch": 0.13942857142857143, "grad_norm": 0.18202351033687592, "kl": 0.0010209083557128906, "learning_rate": 9.458418577899774e-07, "loss": 0.0, "reward": 0.17906883545219898, "reward_std": 0.37603622674942017, "rewards/cosine_scaled_reward": 0.019067944958806038, "rewards/format_reward": 0.4166666753590107, "step": 122 }, { "completion_length": 2798.5001220703125, "epoch": 0.14057142857142857, "grad_norm": 0.18842831254005432, "kl": 0.001125335693359375, "learning_rate": 9.443380060197385e-07, "loss": 0.0, "reward": 0.09704925492405891, "reward_std": 0.29832683503627777, "rewards/cosine_scaled_reward": -0.1078597791492939, "rewards/format_reward": 0.541666679084301, "step": 123 }, { "completion_length": 2319.5001220703125, "epoch": 0.1417142857142857, "grad_norm": 0.258215069770813, "kl": 0.005318641662597656, "learning_rate": 9.428149347714143e-07, "loss": 0.0002, "reward": 0.14139796933159232, "reward_std": 0.2474342044442892, "rewards/cosine_scaled_reward": -0.08365245535969734, "rewards/format_reward": 0.5416666865348816, "step": 124 }, { "completion_length": 2868.7083740234375, "epoch": 0.14285714285714285, "grad_norm": 0.18052953481674194, "kl": 0.0010223388671875, "learning_rate": 9.412727182773486e-07, "loss": 0.0, "reward": 0.0953734740614891, "reward_std": 0.2038922980427742, "rewards/cosine_scaled_reward": -0.021539516746997833, "rewards/format_reward": 0.3125000111758709, "step": 125 }, { "completion_length": 2733.9375610351562, "epoch": 0.144, "grad_norm": 0.1686813235282898, "kl": 0.0012359619140625, "learning_rate": 9.397114317029974e-07, "loss": 0.0, "reward": 0.1783338594250381, "reward_std": 0.22713003307580948, "rewards/cosine_scaled_reward": -0.019597443286329508, "rewards/format_reward": 0.5416666716337204, "step": 126 }, { "completion_length": 2906.4584045410156, "epoch": 0.14514285714285713, "grad_norm": 0.16627976298332214, "kl": 0.000850677490234375, "learning_rate": 9.381311511432658e-07, "loss": 0.0, "reward": 0.039196121506392956, "reward_std": 0.17883937060832977, "rewards/cosine_scaled_reward": -0.15023103915154934, "rewards/format_reward": 0.3958333432674408, "step": 127 }, { "completion_length": 2877.7916870117188, "epoch": 0.1462857142857143, "grad_norm": 0.19161036610603333, "kl": 0.0016307830810546875, "learning_rate": 9.36531953618799e-07, "loss": 0.0001, "reward": 0.16749756410717964, "reward_std": 0.32903048396110535, "rewards/cosine_scaled_reward": 0.06750535871833563, "rewards/format_reward": 0.3958333432674408, "step": 128 }, { "completion_length": 3303.9791870117188, "epoch": 0.14742857142857144, "grad_norm": 0.18518763780593872, "kl": 0.0013399124145507812, "learning_rate": 9.34913917072228e-07, "loss": 0.0001, "reward": -0.04355821633362211, "reward_std": 0.256184009835124, "rewards/cosine_scaled_reward": -0.1530312355607748, "rewards/format_reward": 0.20833334140479565, "step": 129 }, { "completion_length": 2958.9376220703125, "epoch": 0.14857142857142858, "grad_norm": 0.17572814226150513, "kl": 0.0014519691467285156, "learning_rate": 9.332771203643714e-07, "loss": 0.0001, "reward": -0.018080759793519974, "reward_std": 0.15979976579546928, "rewards/cosine_scaled_reward": -0.1620011143386364, "rewards/format_reward": 0.27083333767950535, "step": 130 }, { "completion_length": 2775.5625610351562, "epoch": 0.14971428571428572, "grad_norm": 0.25634628534317017, "kl": 0.0019540786743164062, "learning_rate": 9.316216432703916e-07, "loss": 0.0001, "reward": 0.16852700896561146, "reward_std": 0.32166776061058044, "rewards/cosine_scaled_reward": 0.031354669481515884, "rewards/format_reward": 0.41666667722165585, "step": 131 }, { "completion_length": 2601.541748046875, "epoch": 0.15085714285714286, "grad_norm": 0.17196296155452728, "kl": 0.0007419586181640625, "learning_rate": 9.299475664759068e-07, "loss": 0.0, "reward": 0.17289095558226109, "reward_std": 0.24414288625121117, "rewards/cosine_scaled_reward": 0.04381055012345314, "rewards/format_reward": 0.4583333432674408, "step": 132 }, { "completion_length": 3346.354248046875, "epoch": 0.152, "grad_norm": 0.17542175948619843, "kl": 0.0012054443359375, "learning_rate": 9.282549715730579e-07, "loss": 0.0, "reward": -0.061608402989804745, "reward_std": 0.1897377409040928, "rewards/cosine_scaled_reward": -0.17360613122582436, "rewards/format_reward": 0.18750000186264515, "step": 133 }, { "completion_length": 2794.9375610351562, "epoch": 0.15314285714285714, "grad_norm": 0.19970323145389557, "kl": 0.0014543533325195312, "learning_rate": 9.265439410565328e-07, "loss": 0.0001, "reward": 0.11259202286601067, "reward_std": 0.320014838129282, "rewards/cosine_scaled_reward": -0.044173166155815125, "rewards/format_reward": 0.4375000149011612, "step": 134 }, { "completion_length": 1900.5000457763672, "epoch": 0.15428571428571428, "grad_norm": 0.23359762132167816, "kl": 0.001819610595703125, "learning_rate": 9.248145583195447e-07, "loss": 0.0001, "reward": 0.2668313942849636, "reward_std": 0.28612423315644264, "rewards/cosine_scaled_reward": 0.11550819734111428, "rewards/format_reward": 0.625, "step": 135 }, { "completion_length": 2652.4584350585938, "epoch": 0.15542857142857142, "grad_norm": 0.18795403838157654, "kl": 0.0008897781372070312, "learning_rate": 9.230669076497687e-07, "loss": 0.0, "reward": 0.21900932490825653, "reward_std": 0.30797392688691616, "rewards/cosine_scaled_reward": 0.07143217464908957, "rewards/format_reward": 0.5208333432674408, "step": 136 }, { "completion_length": 2986.6041870117188, "epoch": 0.15657142857142858, "grad_norm": 0.18147197365760803, "kl": 0.00103759765625, "learning_rate": 9.213010742252327e-07, "loss": 0.0, "reward": -0.018022691132500768, "reward_std": 0.23599210940301418, "rewards/cosine_scaled_reward": -0.1875150203704834, "rewards/format_reward": 0.3750000149011612, "step": 137 }, { "completion_length": 2430.2500610351562, "epoch": 0.15771428571428572, "grad_norm": 0.1941184550523758, "kl": 0.0010514259338378906, "learning_rate": 9.195171441101668e-07, "loss": 0.0, "reward": 0.16211648099124432, "reward_std": 0.2540963143110275, "rewards/cosine_scaled_reward": -0.08601415157318115, "rewards/format_reward": 0.645833358168602, "step": 138 }, { "completion_length": 2956.3334350585938, "epoch": 0.15885714285714286, "grad_norm": 0.19715362787246704, "kl": 0.001739501953125, "learning_rate": 9.177152042508077e-07, "loss": 0.0001, "reward": 0.098558459430933, "reward_std": 0.29911189526319504, "rewards/cosine_scaled_reward": -0.12457533576525748, "rewards/format_reward": 0.4791666753590107, "step": 139 }, { "completion_length": 2963.979248046875, "epoch": 0.16, "grad_norm": 0.3068901002407074, "kl": 0.00263214111328125, "learning_rate": 9.158953424711624e-07, "loss": 0.0001, "reward": 0.05242377519607544, "reward_std": 0.2736871726810932, "rewards/cosine_scaled_reward": -0.06892253458499908, "rewards/format_reward": 0.2708333395421505, "step": 140 }, { "completion_length": 2732.1875915527344, "epoch": 0.16114285714285714, "grad_norm": 0.20098887383937836, "kl": 0.0020694732666015625, "learning_rate": 9.140576474687263e-07, "loss": 0.0001, "reward": 0.09720400208607316, "reward_std": 0.3277590870857239, "rewards/cosine_scaled_reward": -0.12496923531580251, "rewards/format_reward": 0.5000000149011612, "step": 141 }, { "completion_length": 2843.6875915527344, "epoch": 0.16228571428571428, "grad_norm": 0.17572563886642456, "kl": 0.0017600059509277344, "learning_rate": 9.122022088101613e-07, "loss": 0.0001, "reward": 0.16560933645814657, "reward_std": 0.31519390642642975, "rewards/cosine_scaled_reward": -0.04391762427985668, "rewards/format_reward": 0.5416666716337204, "step": 142 }, { "completion_length": 2583.8751220703125, "epoch": 0.16342857142857142, "grad_norm": 0.2704511284828186, "kl": 0.00214385986328125, "learning_rate": 9.103291169269299e-07, "loss": 0.0001, "reward": 0.08706286805681884, "reward_std": 0.27615340799093246, "rewards/cosine_scaled_reward": -0.11747677624225616, "rewards/format_reward": 0.4583333507180214, "step": 143 }, { "completion_length": 2858.166748046875, "epoch": 0.16457142857142856, "grad_norm": 0.24892909824848175, "kl": 0.0025987625122070312, "learning_rate": 9.084384631108882e-07, "loss": 0.0001, "reward": 0.0692533003166318, "reward_std": 0.26655495166778564, "rewards/cosine_scaled_reward": -0.0823652264662087, "rewards/format_reward": 0.3541666753590107, "step": 144 }, { "completion_length": 2059.541748046875, "epoch": 0.1657142857142857, "grad_norm": 0.24266168475151062, "kl": 0.004192352294921875, "learning_rate": 9.065303395098358e-07, "loss": 0.0002, "reward": 0.25175508856773376, "reward_std": 0.22561134956777096, "rewards/cosine_scaled_reward": 0.03231249749660492, "rewards/format_reward": 0.6666666828095913, "step": 145 }, { "completion_length": 2468.1876220703125, "epoch": 0.16685714285714287, "grad_norm": 0.19397643208503723, "kl": 0.0013666152954101562, "learning_rate": 9.046048391230247e-07, "loss": 0.0001, "reward": 0.06067563686519861, "reward_std": 0.24036981165409088, "rewards/cosine_scaled_reward": -0.20845083706080914, "rewards/format_reward": 0.5625000149011612, "step": 146 }, { "completion_length": 2808.6459350585938, "epoch": 0.168, "grad_norm": 0.20159922540187836, "kl": 0.0025682449340820312, "learning_rate": 9.026620557966279e-07, "loss": 0.0001, "reward": 0.11753404047340155, "reward_std": 0.2573778033256531, "rewards/cosine_scaled_reward": -0.04415273433551192, "rewards/format_reward": 0.4583333544433117, "step": 147 }, { "completion_length": 2150.9791870117188, "epoch": 0.16914285714285715, "grad_norm": 0.21388837695121765, "kl": 0.0014133453369140625, "learning_rate": 9.007020842191634e-07, "loss": 0.0001, "reward": 0.1763187162578106, "reward_std": 0.22804216668009758, "rewards/cosine_scaled_reward": -0.048267703503370285, "rewards/format_reward": 0.6458333432674408, "step": 148 }, { "completion_length": 2555.4584350585938, "epoch": 0.1702857142857143, "grad_norm": 0.17537924647331238, "kl": 0.0022039413452148438, "learning_rate": 8.987250199168808e-07, "loss": 0.0001, "reward": 0.2855038889683783, "reward_std": 0.28325819969177246, "rewards/cosine_scaled_reward": 0.10880730301141739, "rewards/format_reward": 0.6458333432674408, "step": 149 }, { "completion_length": 2453.854248046875, "epoch": 0.17142857142857143, "grad_norm": 0.276807576417923, "kl": 0.0028324127197265625, "learning_rate": 8.967309592491052e-07, "loss": 0.0001, "reward": 0.18719099462032318, "reward_std": 0.3572011888027191, "rewards/cosine_scaled_reward": 0.03765287483111024, "rewards/format_reward": 0.5000000111758709, "step": 150 }, { "completion_length": 2566.5209350585938, "epoch": 0.17257142857142857, "grad_norm": 0.23010526597499847, "kl": 0.003368377685546875, "learning_rate": 8.9471999940354e-07, "loss": 0.0001, "reward": 0.18735797423869371, "reward_std": 0.3481326140463352, "rewards/cosine_scaled_reward": 0.039755554869771004, "rewards/format_reward": 0.5000000149011612, "step": 151 }, { "completion_length": 2584.2500610351562, "epoch": 0.1737142857142857, "grad_norm": 0.2767723500728607, "kl": 0.0019273757934570312, "learning_rate": 8.926922383915315e-07, "loss": 0.0001, "reward": 0.08376874192617834, "reward_std": 0.29247722774744034, "rewards/cosine_scaled_reward": -0.13108721375465393, "rewards/format_reward": 0.4375000111758709, "step": 152 }, { "completion_length": 2822.8959350585938, "epoch": 0.17485714285714285, "grad_norm": 0.2643832266330719, "kl": 0.0040264129638671875, "learning_rate": 8.906477750432903e-07, "loss": 0.0002, "reward": -0.010647638468071818, "reward_std": 0.2258121222257614, "rewards/cosine_scaled_reward": -0.21952056884765625, "rewards/format_reward": 0.4166666753590107, "step": 153 }, { "completion_length": 3242.0625610351562, "epoch": 0.176, "grad_norm": 0.2574949264526367, "kl": 0.0021734237670898438, "learning_rate": 8.88586709003076e-07, "loss": 0.0001, "reward": 0.11165983291903103, "reward_std": 0.27420539781451225, "rewards/cosine_scaled_reward": -0.013817982282489538, "rewards/format_reward": 0.35416667722165585, "step": 154 }, { "completion_length": 2522.2084350585938, "epoch": 0.17714285714285713, "grad_norm": 0.21373054385185242, "kl": 0.0026683807373046875, "learning_rate": 8.865091407243394e-07, "loss": 0.0001, "reward": 0.2718268558382988, "reward_std": 0.32829324156045914, "rewards/cosine_scaled_reward": 0.16317287646234035, "rewards/format_reward": 0.47916667722165585, "step": 155 }, { "completion_length": 2815.104217529297, "epoch": 0.1782857142857143, "grad_norm": 0.1889781653881073, "kl": 0.00186920166015625, "learning_rate": 8.844151714648274e-07, "loss": 0.0001, "reward": 0.10164620354771614, "reward_std": 0.23263414576649666, "rewards/cosine_scaled_reward": -0.04785778373479843, "rewards/format_reward": 0.37500000558793545, "step": 156 }, { "completion_length": 2579.416717529297, "epoch": 0.17942857142857144, "grad_norm": 0.21700051426887512, "kl": 0.00357818603515625, "learning_rate": 8.823049032816478e-07, "loss": 0.0001, "reward": 0.0624239519238472, "reward_std": 0.19695715978741646, "rewards/cosine_scaled_reward": -0.1928333044052124, "rewards/format_reward": 0.5208333432674408, "step": 157 }, { "completion_length": 2961.1875, "epoch": 0.18057142857142858, "grad_norm": 0.19288885593414307, "kl": 0.0033111572265625, "learning_rate": 8.801784390262943e-07, "loss": 0.0001, "reward": 0.19672009721398354, "reward_std": 0.34154168516397476, "rewards/cosine_scaled_reward": 0.055924009531736374, "rewards/format_reward": 0.4166666865348816, "step": 158 }, { "completion_length": 2796.3959350585938, "epoch": 0.18171428571428572, "grad_norm": 0.21284666657447815, "kl": 0.003772735595703125, "learning_rate": 8.780358823396352e-07, "loss": 0.0002, "reward": -0.014586111530661583, "reward_std": 0.22132756188511848, "rewards/cosine_scaled_reward": -0.20993047207593918, "rewards/format_reward": 0.3541666828095913, "step": 159 }, { "completion_length": 2681.583465576172, "epoch": 0.18285714285714286, "grad_norm": 0.19720213115215302, "kl": 0.00548553466796875, "learning_rate": 8.758773376468604e-07, "loss": 0.0002, "reward": 0.14375573489814997, "reward_std": 0.28298795223236084, "rewards/cosine_scaled_reward": -0.02817438170313835, "rewards/format_reward": 0.4583333544433117, "step": 160 }, { "completion_length": 2425.5000610351562, "epoch": 0.184, "grad_norm": 0.24934129416942596, "kl": 0.0036067962646484375, "learning_rate": 8.737029101523929e-07, "loss": 0.0001, "reward": 0.23228778317570686, "reward_std": 0.30961449444293976, "rewards/cosine_scaled_reward": 0.004279725253582001, "rewards/format_reward": 0.6041666716337204, "step": 161 }, { "completion_length": 2979.0625610351562, "epoch": 0.18514285714285714, "grad_norm": 0.2740453779697418, "kl": 0.006992340087890625, "learning_rate": 8.715127058347614e-07, "loss": 0.0003, "reward": 0.021589869633316994, "reward_std": 0.21139143407344818, "rewards/cosine_scaled_reward": -0.11643525585532188, "rewards/format_reward": 0.3125000074505806, "step": 162 }, { "completion_length": 2308.1043090820312, "epoch": 0.18628571428571428, "grad_norm": 0.19304688274860382, "kl": 0.0030364990234375, "learning_rate": 8.693068314414344e-07, "loss": 0.0001, "reward": 0.3190014772117138, "reward_std": 0.23985935747623444, "rewards/cosine_scaled_reward": 0.12445038510486484, "rewards/format_reward": 0.625, "step": 163 }, { "completion_length": 2485.5834045410156, "epoch": 0.18742857142857142, "grad_norm": 0.20876161754131317, "kl": 0.004638671875, "learning_rate": 8.670853944836176e-07, "loss": 0.0002, "reward": 0.1634826324880123, "reward_std": 0.27906645834445953, "rewards/cosine_scaled_reward": -0.0056002295459620655, "rewards/format_reward": 0.5416666716337204, "step": 164 }, { "completion_length": 2515.916748046875, "epoch": 0.18857142857142858, "grad_norm": 0.2549312114715576, "kl": 0.00457000732421875, "learning_rate": 8.648485032310144e-07, "loss": 0.0002, "reward": 0.07781760673969984, "reward_std": 0.27216625958681107, "rewards/cosine_scaled_reward": -0.13273300230503082, "rewards/format_reward": 0.5000000186264515, "step": 165 }, { "completion_length": 2622.8333740234375, "epoch": 0.18971428571428572, "grad_norm": 0.4316038191318512, "kl": 0.0029125213623046875, "learning_rate": 8.625962667065487e-07, "loss": 0.0001, "reward": 0.05336676351726055, "reward_std": 0.2708708755671978, "rewards/cosine_scaled_reward": -0.1293769534677267, "rewards/format_reward": 0.45833334140479565, "step": 166 }, { "completion_length": 2269.7500610351562, "epoch": 0.19085714285714286, "grad_norm": 0.26302385330200195, "kl": 0.003326416015625, "learning_rate": 8.603287946810513e-07, "loss": 0.0001, "reward": 0.19086750224232674, "reward_std": 0.29077377915382385, "rewards/cosine_scaled_reward": -0.04062679596245289, "rewards/format_reward": 0.6666667014360428, "step": 167 }, { "completion_length": 2815.8959350585938, "epoch": 0.192, "grad_norm": 0.18967334926128387, "kl": 0.004009246826171875, "learning_rate": 8.580461976679099e-07, "loss": 0.0002, "reward": 0.18181406240910292, "reward_std": 0.3442248087376356, "rewards/cosine_scaled_reward": -0.018053213134407997, "rewards/format_reward": 0.4583333432674408, "step": 168 }, { "completion_length": 1723.3125305175781, "epoch": 0.19314285714285714, "grad_norm": 0.20175939798355103, "kl": 0.003383636474609375, "learning_rate": 8.557485869176825e-07, "loss": 0.0001, "reward": 0.5135928094387054, "reward_std": 0.2722737267613411, "rewards/cosine_scaled_reward": 0.3224016949534416, "rewards/format_reward": 0.875, "step": 169 }, { "completion_length": 2422.3125915527344, "epoch": 0.19428571428571428, "grad_norm": 0.20338761806488037, "kl": 0.003936767578125, "learning_rate": 8.534360744126753e-07, "loss": 0.0002, "reward": 0.10243179090321064, "reward_std": 0.17919140681624413, "rewards/cosine_scaled_reward": -0.0379374660551548, "rewards/format_reward": 0.41666667349636555, "step": 170 }, { "completion_length": 2499.041748046875, "epoch": 0.19542857142857142, "grad_norm": 0.2039409875869751, "kl": 0.0029468536376953125, "learning_rate": 8.511087728614862e-07, "loss": 0.0001, "reward": 0.26387462578713894, "reward_std": 0.2428448162972927, "rewards/cosine_scaled_reward": 0.146632214426063, "rewards/format_reward": 0.5208333432674408, "step": 171 }, { "completion_length": 2817.1458740234375, "epoch": 0.19657142857142856, "grad_norm": 0.23230940103530884, "kl": 0.00603485107421875, "learning_rate": 8.487667956935087e-07, "loss": 0.0002, "reward": 0.18164719082415104, "reward_std": 0.2874234914779663, "rewards/cosine_scaled_reward": 0.005057951435446739, "rewards/format_reward": 0.3958333358168602, "step": 172 }, { "completion_length": 1697.0626068115234, "epoch": 0.1977142857142857, "grad_norm": 0.25889357924461365, "kl": 0.004001617431640625, "learning_rate": 8.464102570534061e-07, "loss": 0.0002, "reward": 0.16589125618338585, "reward_std": 0.21044109761714935, "rewards/cosine_scaled_reward": -0.07962662167847157, "rewards/format_reward": 0.7291666716337204, "step": 173 }, { "completion_length": 2253.041778564453, "epoch": 0.19885714285714284, "grad_norm": 0.22761958837509155, "kl": 0.0071430206298828125, "learning_rate": 8.440392717955475e-07, "loss": 0.0003, "reward": 0.2043942455202341, "reward_std": 0.3179479092359543, "rewards/cosine_scaled_reward": -0.06055827997624874, "rewards/format_reward": 0.6875000149011612, "step": 174 }, { "completion_length": 2658.3750610351562, "epoch": 0.2, "grad_norm": 0.18005910515785217, "kl": 0.004535675048828125, "learning_rate": 8.416539554784089e-07, "loss": 0.0002, "reward": 0.2105781240388751, "reward_std": 0.23263323679566383, "rewards/cosine_scaled_reward": 0.08791563287377357, "rewards/format_reward": 0.5000000149011612, "step": 175 }, { "completion_length": 2122.7500610351562, "epoch": 0.20114285714285715, "grad_norm": 0.23283635079860687, "kl": 0.0038604736328125, "learning_rate": 8.392544243589427e-07, "loss": 0.0002, "reward": 0.15146937035024166, "reward_std": 0.35297995805740356, "rewards/cosine_scaled_reward": -0.09888035990297794, "rewards/format_reward": 0.6458333432674408, "step": 176 }, { "completion_length": 2699.166748046875, "epoch": 0.2022857142857143, "grad_norm": 0.2360665202140808, "kl": 0.006439208984375, "learning_rate": 8.368407953869103e-07, "loss": 0.0003, "reward": 0.12413773685693741, "reward_std": 0.28092398308217525, "rewards/cosine_scaled_reward": -0.0860077440738678, "rewards/format_reward": 0.5208333488553762, "step": 177 }, { "completion_length": 2637.9375610351562, "epoch": 0.20342857142857143, "grad_norm": 0.2292148619890213, "kl": 0.00957489013671875, "learning_rate": 8.344131861991828e-07, "loss": 0.0004, "reward": 0.15966340154409409, "reward_std": 0.31500688195228577, "rewards/cosine_scaled_reward": -0.0060564366285689175, "rewards/format_reward": 0.4583333544433117, "step": 178 }, { "completion_length": 2525.2916870117188, "epoch": 0.20457142857142857, "grad_norm": 0.3491784930229187, "kl": 0.005161285400390625, "learning_rate": 8.319717151140072e-07, "loss": 0.0002, "reward": 0.09528316929936409, "reward_std": 0.19896456971764565, "rewards/cosine_scaled_reward": -0.09036011155694723, "rewards/format_reward": 0.4583333432674408, "step": 179 }, { "completion_length": 2001.7292175292969, "epoch": 0.2057142857142857, "grad_norm": 0.24371346831321716, "kl": 0.006465911865234375, "learning_rate": 8.295165011252396e-07, "loss": 0.0003, "reward": 0.25116462633013725, "reward_std": 0.2804570086300373, "rewards/cosine_scaled_reward": 0.06461600167676806, "rewards/format_reward": 0.6458333432674408, "step": 180 }, { "completion_length": 2886.729248046875, "epoch": 0.20685714285714285, "grad_norm": 0.18016843497753143, "kl": 0.006114959716796875, "learning_rate": 8.270476638965461e-07, "loss": 0.0002, "reward": 0.060142465867102146, "reward_std": 0.20902230963110924, "rewards/cosine_scaled_reward": -0.09921930730342865, "rewards/format_reward": 0.354166679084301, "step": 181 }, { "completion_length": 2052.562530517578, "epoch": 0.208, "grad_norm": 0.1674763262271881, "kl": 0.0023097991943359375, "learning_rate": 8.245653237555705e-07, "loss": 0.0001, "reward": 0.2553917448967695, "reward_std": 0.3303040415048599, "rewards/cosine_scaled_reward": 0.018131352961063385, "rewards/format_reward": 0.6875000149011612, "step": 182 }, { "completion_length": 1718.7709045410156, "epoch": 0.20914285714285713, "grad_norm": 0.22614581882953644, "kl": 0.006946563720703125, "learning_rate": 8.220696016880687e-07, "loss": 0.0003, "reward": 0.3155193105340004, "reward_std": 0.32469745725393295, "rewards/cosine_scaled_reward": 0.015594778582453728, "rewards/format_reward": 0.8750000149011612, "step": 183 }, { "completion_length": 2556.541748046875, "epoch": 0.2102857142857143, "grad_norm": 0.22962883114814758, "kl": 0.005680084228515625, "learning_rate": 8.195606193320136e-07, "loss": 0.0002, "reward": 0.05491369962692261, "reward_std": 0.23178402706980705, "rewards/cosine_scaled_reward": -0.12312540411949158, "rewards/format_reward": 0.43750001303851604, "step": 184 }, { "completion_length": 2175.2916717529297, "epoch": 0.21142857142857144, "grad_norm": 0.21504995226860046, "kl": 0.00449371337890625, "learning_rate": 8.170384989716657e-07, "loss": 0.0002, "reward": 0.02204909175634384, "reward_std": 0.1608840376138687, "rewards/cosine_scaled_reward": -0.22503361850976944, "rewards/format_reward": 0.5416666772216558, "step": 185 }, { "completion_length": 2807.125, "epoch": 0.21257142857142858, "grad_norm": 0.17259298264980316, "kl": 0.00559234619140625, "learning_rate": 8.145033635316128e-07, "loss": 0.0002, "reward": 0.17079078778624535, "reward_std": 0.22075738571584225, "rewards/cosine_scaled_reward": 0.029174381867051125, "rewards/format_reward": 0.45833333395421505, "step": 186 }, { "completion_length": 1926.4375915527344, "epoch": 0.21371428571428572, "grad_norm": 0.2804771363735199, "kl": 0.00753021240234375, "learning_rate": 8.119553365707802e-07, "loss": 0.0003, "reward": 0.21946877613663673, "reward_std": 0.25319264084100723, "rewards/cosine_scaled_reward": -0.04631359688937664, "rewards/format_reward": 0.708333358168602, "step": 187 }, { "completion_length": 2948.3959045410156, "epoch": 0.21485714285714286, "grad_norm": 0.18056756258010864, "kl": 0.005832672119140625, "learning_rate": 8.093945422764069e-07, "loss": 0.0002, "reward": 0.04070337675511837, "reward_std": 0.2970491647720337, "rewards/cosine_scaled_reward": -0.12244150042533875, "rewards/format_reward": 0.3333333358168602, "step": 188 }, { "completion_length": 2054.541717529297, "epoch": 0.216, "grad_norm": 0.27006274461746216, "kl": 0.004894256591796875, "learning_rate": 8.068211054579943e-07, "loss": 0.0002, "reward": 0.13675427064299583, "reward_std": 0.22908888384699821, "rewards/cosine_scaled_reward": -0.11930672498419881, "rewards/format_reward": 0.6250000149011612, "step": 189 }, { "completion_length": 1816.8125305175781, "epoch": 0.21714285714285714, "grad_norm": 0.2942986488342285, "kl": 0.004177093505859375, "learning_rate": 8.04235151541222e-07, "loss": 0.0002, "reward": 0.2299617975950241, "reward_std": 0.2523540575057268, "rewards/cosine_scaled_reward": -0.02879966050386429, "rewards/format_reward": 0.6875000260770321, "step": 190 }, { "completion_length": 1893.4584045410156, "epoch": 0.21828571428571428, "grad_norm": 0.24997583031654358, "kl": 0.005523681640625, "learning_rate": 8.01636806561836e-07, "loss": 0.0002, "reward": 0.21511027216911316, "reward_std": 0.3751527927815914, "rewards/cosine_scaled_reward": -0.03335425350815058, "rewards/format_reward": 0.6250000298023224, "step": 191 }, { "completion_length": 2557.3959045410156, "epoch": 0.21942857142857142, "grad_norm": 0.18185892701148987, "kl": 0.0045013427734375, "learning_rate": 7.990261971595048e-07, "loss": 0.0002, "reward": 0.1383908847346902, "reward_std": 0.2589973732829094, "rewards/cosine_scaled_reward": -0.13022802397608757, "rewards/format_reward": 0.6041666716337204, "step": 192 }, { "completion_length": 3001.375, "epoch": 0.22057142857142858, "grad_norm": 0.18063190579414368, "kl": 0.005649566650390625, "learning_rate": 7.964034505716476e-07, "loss": 0.0002, "reward": -0.041762998211197555, "reward_std": 0.20236865058541298, "rewards/cosine_scaled_reward": -0.23694830760359764, "rewards/format_reward": 0.3541666679084301, "step": 193 }, { "completion_length": 2589.5626220703125, "epoch": 0.22171428571428572, "grad_norm": 0.21290159225463867, "kl": 0.0049457550048828125, "learning_rate": 7.93768694627233e-07, "loss": 0.0002, "reward": 0.3359889704734087, "reward_std": 0.3363325707614422, "rewards/cosine_scaled_reward": 0.17966394126415253, "rewards/format_reward": 0.6458333432674408, "step": 194 }, { "completion_length": 2308.8333740234375, "epoch": 0.22285714285714286, "grad_norm": 0.3455241620540619, "kl": 0.004795074462890625, "learning_rate": 7.911220577405484e-07, "loss": 0.0002, "reward": 0.15757257863879204, "reward_std": 0.34646955132484436, "rewards/cosine_scaled_reward": -0.1573782730847597, "rewards/format_reward": 0.645833358168602, "step": 195 }, { "completion_length": 3186.5000610351562, "epoch": 0.224, "grad_norm": 0.1627224236726761, "kl": 0.00597381591796875, "learning_rate": 7.884636689049422e-07, "loss": 0.0002, "reward": 0.07601241301745176, "reward_std": 0.24550564214587212, "rewards/cosine_scaled_reward": -0.06591853499412537, "rewards/format_reward": 0.3750000111758709, "step": 196 }, { "completion_length": 1489.6250610351562, "epoch": 0.22514285714285714, "grad_norm": 0.25338006019592285, "kl": 0.00522613525390625, "learning_rate": 7.857936576865356e-07, "loss": 0.0002, "reward": 0.2503656707704067, "reward_std": 0.33918196335434914, "rewards/cosine_scaled_reward": -0.03431052714586258, "rewards/format_reward": 0.8125000149011612, "step": 197 }, { "completion_length": 2260.979217529297, "epoch": 0.22628571428571428, "grad_norm": 0.21298959851264954, "kl": 0.0067291259765625, "learning_rate": 7.831121542179086e-07, "loss": 0.0003, "reward": 0.13585240580141544, "reward_std": 0.25526952743530273, "rewards/cosine_scaled_reward": -0.08993709087371826, "rewards/format_reward": 0.6458333358168602, "step": 198 }, { "completion_length": 2225.2084350585938, "epoch": 0.22742857142857142, "grad_norm": 0.2402978390455246, "kl": 0.007724761962890625, "learning_rate": 7.804192891917571e-07, "loss": 0.0003, "reward": 0.14825211465358734, "reward_std": 0.25423312187194824, "rewards/cosine_scaled_reward": -0.15464808233082294, "rewards/format_reward": 0.7291666865348816, "step": 199 }, { "completion_length": 1518.4375610351562, "epoch": 0.22857142857142856, "grad_norm": 0.2037964165210724, "kl": 0.0044708251953125, "learning_rate": 7.777151938545235e-07, "loss": 0.0002, "reward": 0.30662217177450657, "reward_std": 0.2782711200416088, "rewards/cosine_scaled_reward": 0.023611009120941162, "rewards/format_reward": 0.8541666716337204, "step": 200 }, { "completion_length": 2185.104217529297, "epoch": 0.2297142857142857, "grad_norm": 0.22064223885536194, "kl": 0.00380706787109375, "learning_rate": 7.75e-07, "loss": 0.0002, "reward": 0.4745171591639519, "reward_std": 0.3512828201055527, "rewards/cosine_scaled_reward": 0.32128068804740906, "rewards/format_reward": 0.770833358168602, "step": 201 }, { "completion_length": 1733.2291870117188, "epoch": 0.23085714285714284, "grad_norm": 0.21100802719593048, "kl": 0.004169464111328125, "learning_rate": 7.72273839962904e-07, "loss": 0.0002, "reward": 0.40203397534787655, "reward_std": 0.21288743987679482, "rewards/cosine_scaled_reward": 0.26086020842194557, "rewards/format_reward": 0.7083333432674408, "step": 202 }, { "completion_length": 2548.0000915527344, "epoch": 0.232, "grad_norm": 0.2812572717666626, "kl": 0.0077362060546875, "learning_rate": 7.695368466124296e-07, "loss": 0.0003, "reward": 0.15158599987626076, "reward_std": 0.3328489065170288, "rewards/cosine_scaled_reward": -0.013430323451757431, "rewards/format_reward": 0.4583333469927311, "step": 203 }, { "completion_length": 1831.7291870117188, "epoch": 0.23314285714285715, "grad_norm": 0.2870136499404907, "kl": 0.008026123046875, "learning_rate": 7.667891533457718e-07, "loss": 0.0003, "reward": 0.2725534439086914, "reward_std": 0.2783195786178112, "rewards/cosine_scaled_reward": -0.009869151283055544, "rewards/format_reward": 0.770833358168602, "step": 204 }, { "completion_length": 2287.916717529297, "epoch": 0.2342857142857143, "grad_norm": 0.2195574939250946, "kl": 0.004608154296875, "learning_rate": 7.640308940816239e-07, "loss": 0.0002, "reward": 0.2794661708176136, "reward_std": 0.3664914593100548, "rewards/cosine_scaled_reward": 0.09487088554305956, "rewards/format_reward": 0.6458333507180214, "step": 205 }, { "completion_length": 2537.729217529297, "epoch": 0.23542857142857143, "grad_norm": 0.21473117172718048, "kl": 0.0049896240234375, "learning_rate": 7.612622032536507e-07, "loss": 0.0002, "reward": 0.1754872469464317, "reward_std": 0.31655465066432953, "rewards/cosine_scaled_reward": -0.07037313468754292, "rewards/format_reward": 0.6250000149011612, "step": 206 }, { "completion_length": 2232.0834045410156, "epoch": 0.23657142857142857, "grad_norm": 0.22935722768306732, "kl": 0.00576019287109375, "learning_rate": 7.584832158039378e-07, "loss": 0.0002, "reward": 0.1333406250923872, "reward_std": 0.23974663391709328, "rewards/cosine_scaled_reward": -0.1676623560488224, "rewards/format_reward": 0.6875, "step": 207 }, { "completion_length": 1856.9375610351562, "epoch": 0.2377142857142857, "grad_norm": 0.2221345752477646, "kl": 0.005878448486328125, "learning_rate": 7.556940671764124e-07, "loss": 0.0002, "reward": 0.26369363255798817, "reward_std": 0.1700501013547182, "rewards/cosine_scaled_reward": 0.0034250058233737946, "rewards/format_reward": 0.7500000149011612, "step": 208 }, { "completion_length": 2069.8750915527344, "epoch": 0.23885714285714285, "grad_norm": 0.29865333437919617, "kl": 0.0078582763671875, "learning_rate": 7.528948933102438e-07, "loss": 0.0003, "reward": 0.2509584181243554, "reward_std": 0.2393549047410488, "rewards/cosine_scaled_reward": 0.04618738777935505, "rewards/format_reward": 0.6666666865348816, "step": 209 }, { "completion_length": 2130.5000610351562, "epoch": 0.24, "grad_norm": 0.19106724858283997, "kl": 0.006458282470703125, "learning_rate": 7.500858306332172e-07, "loss": 0.0003, "reward": 0.14285951387137175, "reward_std": 0.19960445538163185, "rewards/cosine_scaled_reward": -0.08981587737798691, "rewards/format_reward": 0.6250000204890966, "step": 210 }, { "completion_length": 1910.729248046875, "epoch": 0.24114285714285713, "grad_norm": 0.2566603720188141, "kl": 0.00563812255859375, "learning_rate": 7.472670160550848e-07, "loss": 0.0002, "reward": 0.24128811853006482, "reward_std": 0.23205993883311749, "rewards/cosine_scaled_reward": -0.012406919151544571, "rewards/format_reward": 0.6875000149011612, "step": 211 }, { "completion_length": 1509.9375457763672, "epoch": 0.2422857142857143, "grad_norm": 0.2559695243835449, "kl": 0.005527496337890625, "learning_rate": 7.444385869608921e-07, "loss": 0.0002, "reward": 0.29675021627917886, "reward_std": 0.21607037633657455, "rewards/cosine_scaled_reward": 0.03729934897273779, "rewards/format_reward": 0.8125000149011612, "step": 212 }, { "completion_length": 1916.1459350585938, "epoch": 0.24342857142857144, "grad_norm": 0.24653296172618866, "kl": 0.00902557373046875, "learning_rate": 7.416006812042827e-07, "loss": 0.0004, "reward": 0.3602223386988044, "reward_std": 0.2784761004149914, "rewards/cosine_scaled_reward": 0.17531066434457898, "rewards/format_reward": 0.6875000149011612, "step": 213 }, { "completion_length": 2526.1876220703125, "epoch": 0.24457142857142858, "grad_norm": 0.3414965569972992, "kl": 0.00806427001953125, "learning_rate": 7.387534371007797e-07, "loss": 0.0003, "reward": 0.2717064246535301, "reward_std": 0.3078125827014446, "rewards/cosine_scaled_reward": 0.1282469742000103, "rewards/format_reward": 0.6041666716337204, "step": 214 }, { "completion_length": 1703.4792175292969, "epoch": 0.24571428571428572, "grad_norm": 0.2487371265888214, "kl": 0.00457000732421875, "learning_rate": 7.358969934210438e-07, "loss": 0.0002, "reward": 0.22754944674670696, "reward_std": 0.170820539817214, "rewards/cosine_scaled_reward": -0.09126808494329453, "rewards/format_reward": 0.8333333432674408, "step": 215 }, { "completion_length": 1111.8958892822266, "epoch": 0.24685714285714286, "grad_norm": 0.24535112082958221, "kl": 0.0048828125, "learning_rate": 7.330314893841101e-07, "loss": 0.0002, "reward": 0.439119428396225, "reward_std": 0.3081901855766773, "rewards/cosine_scaled_reward": 0.18471297062933445, "rewards/format_reward": 0.8958333432674408, "step": 216 }, { "completion_length": 1762.0208435058594, "epoch": 0.248, "grad_norm": 0.21277444064617157, "kl": 0.0063629150390625, "learning_rate": 7.301570646506027e-07, "loss": 0.0003, "reward": 0.23682209104299545, "reward_std": 0.3436572030186653, "rewards/cosine_scaled_reward": 0.0008598812855780125, "rewards/format_reward": 0.7083333432674408, "step": 217 }, { "completion_length": 1882.2083435058594, "epoch": 0.24914285714285714, "grad_norm": 0.22767505049705505, "kl": 0.00641632080078125, "learning_rate": 7.27273859315928e-07, "loss": 0.0003, "reward": 0.13319060020148754, "reward_std": 0.29052822291851044, "rewards/cosine_scaled_reward": -0.18922634795308113, "rewards/format_reward": 0.7291666865348816, "step": 218 }, { "completion_length": 1539.0625305175781, "epoch": 0.2502857142857143, "grad_norm": 0.2611101567745209, "kl": 0.007419586181640625, "learning_rate": 7.243820139034464e-07, "loss": 0.0003, "reward": 0.24942272529006004, "reward_std": 0.24504001811146736, "rewards/cosine_scaled_reward": 0.018378445878624916, "rewards/format_reward": 0.7708333432674408, "step": 219 }, { "completion_length": 1554.8542022705078, "epoch": 0.25142857142857145, "grad_norm": 0.2599616050720215, "kl": 0.0065155029296875, "learning_rate": 7.214816693576234e-07, "loss": 0.0003, "reward": 0.07774155330844223, "reward_std": 0.14831382408738136, "rewards/cosine_scaled_reward": -0.29498909786343575, "rewards/format_reward": 0.8125000149011612, "step": 220 }, { "completion_length": 1505.1666870117188, "epoch": 0.25257142857142856, "grad_norm": 0.2269742339849472, "kl": 0.0047454833984375, "learning_rate": 7.185729670371604e-07, "loss": 0.0002, "reward": 0.32571933791041374, "reward_std": 0.24275402911007404, "rewards/cosine_scaled_reward": 0.11157770827412605, "rewards/format_reward": 0.7916666865348816, "step": 221 }, { "completion_length": 1946.916748046875, "epoch": 0.2537142857142857, "grad_norm": 0.23065422475337982, "kl": 0.00635528564453125, "learning_rate": 7.156560487081051e-07, "loss": 0.0003, "reward": 0.3265978842973709, "reward_std": 0.29620324075222015, "rewards/cosine_scaled_reward": 0.11483582854270935, "rewards/format_reward": 0.6666666865348816, "step": 222 }, { "completion_length": 1822.6458740234375, "epoch": 0.25485714285714284, "grad_norm": 0.23844553530216217, "kl": 0.0056915283203125, "learning_rate": 7.127310565369415e-07, "loss": 0.0002, "reward": 0.25359345600008965, "reward_std": 0.305300273001194, "rewards/cosine_scaled_reward": -0.011000402271747589, "rewards/format_reward": 0.7291666865348816, "step": 223 }, { "completion_length": 2066.5000610351562, "epoch": 0.256, "grad_norm": 0.2057129144668579, "kl": 0.006046295166015625, "learning_rate": 7.097981330836616e-07, "loss": 0.0002, "reward": 0.1604653261601925, "reward_std": 0.2231849767267704, "rewards/cosine_scaled_reward": -0.12973792850971222, "rewards/format_reward": 0.7083333395421505, "step": 224 }, { "completion_length": 2385.4375610351562, "epoch": 0.2571428571428571, "grad_norm": 0.2630828022956848, "kl": 0.00998687744140625, "learning_rate": 7.068574212948169e-07, "loss": 0.0004, "reward": 0.1808540727943182, "reward_std": 0.29596592485904694, "rewards/cosine_scaled_reward": -0.056875346694141626, "rewards/format_reward": 0.6666666865348816, "step": 225 }, { "completion_length": 1941.7708740234375, "epoch": 0.2582857142857143, "grad_norm": 0.19778260588645935, "kl": 0.00537872314453125, "learning_rate": 7.039090644965509e-07, "loss": 0.0002, "reward": 0.4080950105562806, "reward_std": 0.3161883242428303, "rewards/cosine_scaled_reward": 0.1740732565522194, "rewards/format_reward": 0.7916666716337204, "step": 226 }, { "completion_length": 1382.3125305175781, "epoch": 0.25942857142857145, "grad_norm": 0.23452956974506378, "kl": 0.0117950439453125, "learning_rate": 7.009532063876148e-07, "loss": 0.0005, "reward": 0.2297086864709854, "reward_std": 0.2339212652295828, "rewards/cosine_scaled_reward": -0.12544679269194603, "rewards/format_reward": 0.895833358168602, "step": 227 }, { "completion_length": 1569.5209197998047, "epoch": 0.26057142857142856, "grad_norm": 0.2557767629623413, "kl": 0.005340576171875, "learning_rate": 6.979899910323624e-07, "loss": 0.0002, "reward": 0.35539381578564644, "reward_std": 0.28502682596445084, "rewards/cosine_scaled_reward": 0.15302862459793687, "rewards/format_reward": 0.7500000149011612, "step": 228 }, { "completion_length": 1640.5626220703125, "epoch": 0.26171428571428573, "grad_norm": 0.216407909989357, "kl": 0.007568359375, "learning_rate": 6.950195628537299e-07, "loss": 0.0003, "reward": 0.17562153795734048, "reward_std": 0.13211852312088013, "rewards/cosine_scaled_reward": -0.12780890613794327, "rewards/format_reward": 0.7916666865348816, "step": 229 }, { "completion_length": 1904.0208892822266, "epoch": 0.26285714285714284, "grad_norm": 0.2001759558916092, "kl": 0.00653076171875, "learning_rate": 6.920420666261961e-07, "loss": 0.0003, "reward": 0.12062056954891887, "reward_std": 0.19564786739647388, "rewards/cosine_scaled_reward": -0.21936549118254334, "rewards/format_reward": 0.7708333432674408, "step": 230 }, { "completion_length": 1837.7500305175781, "epoch": 0.264, "grad_norm": 0.3313891589641571, "kl": 0.0077056884765625, "learning_rate": 6.890576474687263e-07, "loss": 0.0003, "reward": 0.3438783623278141, "reward_std": 0.3303004875779152, "rewards/cosine_scaled_reward": 0.0843821857124567, "rewards/format_reward": 0.7916667014360428, "step": 231 }, { "completion_length": 2066.916778564453, "epoch": 0.2651428571428571, "grad_norm": 0.24516141414642334, "kl": 0.0080718994140625, "learning_rate": 6.860664508377001e-07, "loss": 0.0003, "reward": 0.13200945453718305, "reward_std": 0.20449103228747845, "rewards/cosine_scaled_reward": -0.0955875813961029, "rewards/format_reward": 0.6250000149011612, "step": 232 }, { "completion_length": 1328.9584045410156, "epoch": 0.2662857142857143, "grad_norm": 0.21824268996715546, "kl": 0.00582122802734375, "learning_rate": 6.83068622519821e-07, "loss": 0.0002, "reward": 0.20045446418225765, "reward_std": 0.25751522183418274, "rewards/cosine_scaled_reward": -0.17755268700420856, "rewards/format_reward": 0.9166666716337204, "step": 233 }, { "completion_length": 2047.9375839233398, "epoch": 0.2674285714285714, "grad_norm": 0.2818503975868225, "kl": 0.0115814208984375, "learning_rate": 6.800643086250121e-07, "loss": 0.0005, "reward": 0.18364327400922775, "reward_std": 0.27602763287723064, "rewards/cosine_scaled_reward": -0.0731953289359808, "rewards/format_reward": 0.6250000149011612, "step": 234 }, { "completion_length": 1446.4166870117188, "epoch": 0.26857142857142857, "grad_norm": 0.26381435990333557, "kl": 0.00649261474609375, "learning_rate": 6.770536555792944e-07, "loss": 0.0003, "reward": 0.42406467348337173, "reward_std": 0.3336157165467739, "rewards/cosine_scaled_reward": 0.15425015799701214, "rewards/format_reward": 0.8125000149011612, "step": 235 }, { "completion_length": 1943.5416870117188, "epoch": 0.26971428571428574, "grad_norm": 0.22560541331768036, "kl": 0.006351470947265625, "learning_rate": 6.740368101176495e-07, "loss": 0.0003, "reward": 0.1980036310851574, "reward_std": 0.2674345225095749, "rewards/cosine_scaled_reward": -0.08064225409179926, "rewards/format_reward": 0.75, "step": 236 }, { "completion_length": 1437.1459045410156, "epoch": 0.27085714285714285, "grad_norm": 0.2287108153104782, "kl": 0.00568389892578125, "learning_rate": 6.710139192768694e-07, "loss": 0.0002, "reward": 0.3367458498105407, "reward_std": 0.19615472108125687, "rewards/cosine_scaled_reward": 0.052811697125434875, "rewards/format_reward": 0.875, "step": 237 }, { "completion_length": 1172.1250305175781, "epoch": 0.272, "grad_norm": 0.24761365354061127, "kl": 0.00733184814453125, "learning_rate": 6.679851303883891e-07, "loss": 0.0003, "reward": 0.32783316634595394, "reward_std": 0.2573069650679827, "rewards/cosine_scaled_reward": -0.023007137060631067, "rewards/format_reward": 0.9791666716337204, "step": 238 }, { "completion_length": 1534.1250762939453, "epoch": 0.27314285714285713, "grad_norm": 0.2047697901725769, "kl": 0.00616455078125, "learning_rate": 6.649505910711058e-07, "loss": 0.0002, "reward": 0.44937574677169323, "reward_std": 0.25652869790792465, "rewards/cosine_scaled_reward": 0.2903861254453659, "rewards/format_reward": 0.75, "step": 239 }, { "completion_length": 2051.354248046875, "epoch": 0.2742857142857143, "grad_norm": 0.2782205045223236, "kl": 0.010284423828125, "learning_rate": 6.619104492241847e-07, "loss": 0.0004, "reward": 0.08665064908564091, "reward_std": 0.14281608164310455, "rewards/cosine_scaled_reward": -0.21663008630275726, "rewards/format_reward": 0.6875000149011612, "step": 240 }, { "completion_length": 1735.2500610351562, "epoch": 0.2754285714285714, "grad_norm": 0.2723366618156433, "kl": 0.00836944580078125, "learning_rate": 6.588648530198504e-07, "loss": 0.0003, "reward": 0.07045929972082376, "reward_std": 0.1520705670118332, "rewards/cosine_scaled_reward": -0.27597323805093765, "rewards/format_reward": 0.7916666865348816, "step": 241 }, { "completion_length": 1264.6667022705078, "epoch": 0.2765714285714286, "grad_norm": 0.2971493601799011, "kl": 0.0120086669921875, "learning_rate": 6.558139508961654e-07, "loss": 0.0005, "reward": 0.2805042117834091, "reward_std": 0.2464975118637085, "rewards/cosine_scaled_reward": -0.06483453698456287, "rewards/format_reward": 0.9166666865348816, "step": 242 }, { "completion_length": 1633.6458740234375, "epoch": 0.2777142857142857, "grad_norm": 0.1975041925907135, "kl": 0.00750732421875, "learning_rate": 6.527578915497951e-07, "loss": 0.0003, "reward": 0.3133881650865078, "reward_std": 0.28336846828460693, "rewards/cosine_scaled_reward": 0.05694121681153774, "rewards/format_reward": 0.8333333432674408, "step": 243 }, { "completion_length": 1654.354248046875, "epoch": 0.27885714285714286, "grad_norm": 0.2421889454126358, "kl": 0.00768280029296875, "learning_rate": 6.496968239287603e-07, "loss": 0.0003, "reward": 0.3425067327916622, "reward_std": 0.2821289487183094, "rewards/cosine_scaled_reward": 0.11226867651566863, "rewards/format_reward": 0.7708333432674408, "step": 244 }, { "completion_length": 1824.3750915527344, "epoch": 0.28, "grad_norm": 0.20805487036705017, "kl": 0.006072998046875, "learning_rate": 6.466308972251785e-07, "loss": 0.0002, "reward": 0.34558769315481186, "reward_std": 0.33702201396226883, "rewards/cosine_scaled_reward": 0.10386714269407094, "rewards/format_reward": 0.8125000149011612, "step": 245 }, { "completion_length": 1482.729248046875, "epoch": 0.28114285714285714, "grad_norm": 0.23012664914131165, "kl": 0.00945281982421875, "learning_rate": 6.435602608679916e-07, "loss": 0.0004, "reward": 0.20973372273147106, "reward_std": 0.22376436367630959, "rewards/cosine_scaled_reward": -0.12794288620352745, "rewards/format_reward": 0.8958333432674408, "step": 246 }, { "completion_length": 2047.9791870117188, "epoch": 0.2822857142857143, "grad_norm": 0.23625898361206055, "kl": 0.00788116455078125, "learning_rate": 6.404850645156841e-07, "loss": 0.0003, "reward": 0.10961971618235111, "reward_std": 0.25065623596310616, "rewards/cosine_scaled_reward": -0.15552188456058502, "rewards/format_reward": 0.666666679084301, "step": 247 }, { "completion_length": 1475.6041870117188, "epoch": 0.2834285714285714, "grad_norm": 0.31174013018608093, "kl": 0.007293701171875, "learning_rate": 6.374054580489873e-07, "loss": 0.0003, "reward": 0.3890385031700134, "reward_std": 0.2624141275882721, "rewards/cosine_scaled_reward": 0.20213650539517403, "rewards/format_reward": 0.7291666828095913, "step": 248 }, { "completion_length": 1318.8333740234375, "epoch": 0.2845714285714286, "grad_norm": 0.21641266345977783, "kl": 0.008941650390625, "learning_rate": 6.343215915635761e-07, "loss": 0.0004, "reward": 0.4674979895353317, "reward_std": 0.25317949429154396, "rewards/cosine_scaled_reward": 0.20079964771866798, "rewards/format_reward": 0.9375000149011612, "step": 249 }, { "completion_length": 1108.8125610351562, "epoch": 0.2857142857142857, "grad_norm": 0.30281519889831543, "kl": 0.00795745849609375, "learning_rate": 6.31233615362752e-07, "loss": 0.0003, "reward": 0.29210687801241875, "reward_std": 0.1993991807103157, "rewards/cosine_scaled_reward": -0.07733290828764439, "rewards/format_reward": 1.0, "step": 250 }, { "completion_length": 1167.2292022705078, "epoch": 0.28685714285714287, "grad_norm": 0.28529420495033264, "kl": 0.00995635986328125, "learning_rate": 6.281416799501187e-07, "loss": 0.0004, "reward": 0.22866389155387878, "reward_std": 0.25506168603897095, "rewards/cosine_scaled_reward": -0.08852448500692844, "rewards/format_reward": 0.875, "step": 251 }, { "completion_length": 1595.7917175292969, "epoch": 0.288, "grad_norm": 0.23463518917560577, "kl": 0.0086212158203125, "learning_rate": 6.25045936022246e-07, "loss": 0.0003, "reward": 0.18430738151073456, "reward_std": 0.15956872701644897, "rewards/cosine_scaled_reward": -0.11572365462779999, "rewards/format_reward": 0.8333333432674408, "step": 252 }, { "completion_length": 1727.3958740234375, "epoch": 0.28914285714285715, "grad_norm": 0.28016501665115356, "kl": 0.01375579833984375, "learning_rate": 6.219465344613258e-07, "loss": 0.0005, "reward": 0.23585626482963562, "reward_std": 0.23895636573433876, "rewards/cosine_scaled_reward": -0.06529825925827026, "rewards/format_reward": 0.8125000298023224, "step": 253 }, { "completion_length": 1843.6250762939453, "epoch": 0.29028571428571426, "grad_norm": 0.2972549498081207, "kl": 0.009918212890625, "learning_rate": 6.188436263278172e-07, "loss": 0.0004, "reward": 0.19810109585523605, "reward_std": 0.3144396096467972, "rewards/cosine_scaled_reward": -0.06495114602148533, "rewards/format_reward": 0.7291666865348816, "step": 254 }, { "completion_length": 2136.791717529297, "epoch": 0.2914285714285714, "grad_norm": 0.25467684864997864, "kl": 0.00911712646484375, "learning_rate": 6.157373628530852e-07, "loss": 0.0004, "reward": 0.14784921891987324, "reward_std": 0.22354254499077797, "rewards/cosine_scaled_reward": -0.15361732488963753, "rewards/format_reward": 0.7291666716337204, "step": 255 }, { "completion_length": 1582.2292175292969, "epoch": 0.2925714285714286, "grad_norm": 0.23993702232837677, "kl": 0.0102081298828125, "learning_rate": 6.126278954320294e-07, "loss": 0.0004, "reward": 0.21227549016475677, "reward_std": 0.23894662968814373, "rewards/cosine_scaled_reward": -0.07508064294233918, "rewards/format_reward": 0.8333333432674408, "step": 256 }, { "completion_length": 2020.2084045410156, "epoch": 0.2937142857142857, "grad_norm": 0.23374323546886444, "kl": 0.00890350341796875, "learning_rate": 6.095153756157051e-07, "loss": 0.0004, "reward": 0.41816296614706516, "reward_std": 0.3155359774827957, "rewards/cosine_scaled_reward": 0.2016722597181797, "rewards/format_reward": 0.7500000149011612, "step": 257 }, { "completion_length": 1815.1042175292969, "epoch": 0.2948571428571429, "grad_norm": 0.22643603384494781, "kl": 0.007904052734375, "learning_rate": 6.06399955103937e-07, "loss": 0.0003, "reward": 0.3056030236184597, "reward_std": 0.3313448801636696, "rewards/cosine_scaled_reward": 0.019550755620002747, "rewards/format_reward": 0.8333333432674408, "step": 258 }, { "completion_length": 1405.7292175292969, "epoch": 0.296, "grad_norm": 0.26583632826805115, "kl": 0.010009765625, "learning_rate": 6.032817857379256e-07, "loss": 0.0004, "reward": 0.3662095330655575, "reward_std": 0.29809001833200455, "rewards/cosine_scaled_reward": 0.05342107731848955, "rewards/format_reward": 0.8750000149011612, "step": 259 }, { "completion_length": 1058.0208435058594, "epoch": 0.29714285714285715, "grad_norm": 0.27586984634399414, "kl": 0.007415771484375, "learning_rate": 6.001610194928464e-07, "loss": 0.0003, "reward": 0.42465347796678543, "reward_std": 0.28675223514437675, "rewards/cosine_scaled_reward": 0.14612944051623344, "rewards/format_reward": 0.9375000149011612, "step": 260 }, { "completion_length": 2196.8750610351562, "epoch": 0.29828571428571427, "grad_norm": 0.20076176524162292, "kl": 0.0092315673828125, "learning_rate": 5.97037808470444e-07, "loss": 0.0004, "reward": 0.10841672308743, "reward_std": 0.1914910487830639, "rewards/cosine_scaled_reward": -0.1518568762112409, "rewards/format_reward": 0.6041666716337204, "step": 261 }, { "completion_length": 1699.2292175292969, "epoch": 0.29942857142857143, "grad_norm": 0.26689934730529785, "kl": 0.0111541748046875, "learning_rate": 5.939123048916173e-07, "loss": 0.0004, "reward": 0.09261467261239886, "reward_std": 0.16253572702407837, "rewards/cosine_scaled_reward": -0.20514269173145294, "rewards/format_reward": 0.7083333432674408, "step": 262 }, { "completion_length": 1472.5000305175781, "epoch": 0.30057142857142854, "grad_norm": 0.23336315155029297, "kl": 0.00583648681640625, "learning_rate": 5.907846610890011e-07, "loss": 0.0002, "reward": 0.1648530475795269, "reward_std": 0.18808570504188538, "rewards/cosine_scaled_reward": -0.2227061167359352, "rewards/format_reward": 0.9166666865348816, "step": 263 }, { "completion_length": 1523.7708740234375, "epoch": 0.3017142857142857, "grad_norm": 0.22092439234256744, "kl": 0.0074310302734375, "learning_rate": 5.87655029499542e-07, "loss": 0.0003, "reward": 0.20358567498624325, "reward_std": 0.27659233286976814, "rewards/cosine_scaled_reward": -0.1430983915925026, "rewards/format_reward": 0.8541666865348816, "step": 264 }, { "completion_length": 1479.7708892822266, "epoch": 0.3028571428571429, "grad_norm": 0.29656776785850525, "kl": 0.0095367431640625, "learning_rate": 5.845235626570683e-07, "loss": 0.0004, "reward": 0.2208748161792755, "reward_std": 0.23769052140414715, "rewards/cosine_scaled_reward": -0.09724835399538279, "rewards/format_reward": 0.8541666716337204, "step": 265 }, { "completion_length": 1845.1250610351562, "epoch": 0.304, "grad_norm": 0.24222609400749207, "kl": 0.0095062255859375, "learning_rate": 5.813904131848564e-07, "loss": 0.0004, "reward": 0.24290394503623247, "reward_std": 0.2597558721899986, "rewards/cosine_scaled_reward": -0.017342038452625275, "rewards/format_reward": 0.75, "step": 266 }, { "completion_length": 2344.312530517578, "epoch": 0.30514285714285716, "grad_norm": 0.21543891727924347, "kl": 0.0122833251953125, "learning_rate": 5.78255733788191e-07, "loss": 0.0005, "reward": 0.025223158299922943, "reward_std": 0.18688056617975235, "rewards/cosine_scaled_reward": -0.2162624504417181, "rewards/format_reward": 0.5000000149011612, "step": 267 }, { "completion_length": 1327.8958587646484, "epoch": 0.3062857142857143, "grad_norm": 0.3169882893562317, "kl": 0.01444244384765625, "learning_rate": 5.751196772469237e-07, "loss": 0.0006, "reward": 0.2750858571380377, "reward_std": 0.3029320724308491, "rewards/cosine_scaled_reward": -0.0884361332282424, "rewards/format_reward": 0.8958333432674408, "step": 268 }, { "completion_length": 1640.0416870117188, "epoch": 0.30742857142857144, "grad_norm": 0.24088799953460693, "kl": 0.00873565673828125, "learning_rate": 5.71982396408026e-07, "loss": 0.0003, "reward": 0.18298603361472487, "reward_std": 0.21892964094877243, "rewards/cosine_scaled_reward": -0.13406258076429367, "rewards/format_reward": 0.8333333432674408, "step": 269 }, { "completion_length": 1622.2500610351562, "epoch": 0.30857142857142855, "grad_norm": 0.23051612079143524, "kl": 0.00720977783203125, "learning_rate": 5.688440441781398e-07, "loss": 0.0003, "reward": 0.37890365347266197, "reward_std": 0.32260435819625854, "rewards/cosine_scaled_reward": 0.06904735416173935, "rewards/format_reward": 0.9166666865348816, "step": 270 }, { "completion_length": 1450.0417175292969, "epoch": 0.3097142857142857, "grad_norm": 0.28786543011665344, "kl": 0.00946044921875, "learning_rate": 5.657047735161255e-07, "loss": 0.0004, "reward": 0.5036385664716363, "reward_std": 0.36333882436156273, "rewards/cosine_scaled_reward": 0.2697860337793827, "rewards/format_reward": 0.8333333432674408, "step": 271 }, { "completion_length": 1877.8541870117188, "epoch": 0.31085714285714283, "grad_norm": 0.20036853849887848, "kl": 0.00983428955078125, "learning_rate": 5.625647374256061e-07, "loss": 0.0004, "reward": 0.22807104885578156, "reward_std": 0.24351342767477036, "rewards/cosine_scaled_reward": -0.0636335639283061, "rewards/format_reward": 0.8333333432674408, "step": 272 }, { "completion_length": 1397.3333740234375, "epoch": 0.312, "grad_norm": 0.26651531457901, "kl": 0.0090484619140625, "learning_rate": 5.594240889475106e-07, "loss": 0.0004, "reward": 0.2839905247092247, "reward_std": 0.2825750559568405, "rewards/cosine_scaled_reward": -0.004876431077718735, "rewards/format_reward": 0.8750000298023224, "step": 273 }, { "completion_length": 1043.9167175292969, "epoch": 0.31314285714285717, "grad_norm": 0.3097701072692871, "kl": 0.0113067626953125, "learning_rate": 5.562829811526154e-07, "loss": 0.0005, "reward": 0.331375852227211, "reward_std": 0.2427959106862545, "rewards/cosine_scaled_reward": 0.009415101259946823, "rewards/format_reward": 0.9583333432674408, "step": 274 }, { "completion_length": 1657.0833587646484, "epoch": 0.3142857142857143, "grad_norm": 0.2341931164264679, "kl": 0.0104217529296875, "learning_rate": 5.531415671340826e-07, "loss": 0.0004, "reward": 0.3595724329352379, "reward_std": 0.3247465565800667, "rewards/cosine_scaled_reward": 0.1323377527296543, "rewards/format_reward": 0.8125000149011612, "step": 275 }, { "completion_length": 1292.0000610351562, "epoch": 0.31542857142857145, "grad_norm": 0.3187240660190582, "kl": 0.01602935791015625, "learning_rate": 5.5e-07, "loss": 0.0006, "reward": 0.34184306114912033, "reward_std": 0.3191189467906952, "rewards/cosine_scaled_reward": 0.022058267146348953, "rewards/format_reward": 0.8958333432674408, "step": 276 }, { "completion_length": 1618.7500305175781, "epoch": 0.31657142857142856, "grad_norm": 0.3473435938358307, "kl": 0.01397705078125, "learning_rate": 5.468584328659172e-07, "loss": 0.0006, "reward": 0.3007097691297531, "reward_std": 0.20629184320569038, "rewards/cosine_scaled_reward": 0.02854561060667038, "rewards/format_reward": 0.8125000149011612, "step": 277 }, { "completion_length": 1575.3959045410156, "epoch": 0.3177142857142857, "grad_norm": 0.295357882976532, "kl": 0.00948333740234375, "learning_rate": 5.437170188473847e-07, "loss": 0.0004, "reward": 0.34411953017115593, "reward_std": 0.25512586534023285, "rewards/cosine_scaled_reward": 0.08236894011497498, "rewards/format_reward": 0.8125000149011612, "step": 278 }, { "completion_length": 1402.7292175292969, "epoch": 0.31885714285714284, "grad_norm": 0.26129284501075745, "kl": 0.01031494140625, "learning_rate": 5.405759110524894e-07, "loss": 0.0004, "reward": 0.25433837436139584, "reward_std": 0.21544880792498589, "rewards/cosine_scaled_reward": -0.10006260499358177, "rewards/format_reward": 0.9166666865348816, "step": 279 }, { "completion_length": 1718.5625610351562, "epoch": 0.32, "grad_norm": 0.2867210805416107, "kl": 0.0108489990234375, "learning_rate": 5.37435262574394e-07, "loss": 0.0004, "reward": 0.40966310165822506, "reward_std": 0.34037622064352036, "rewards/cosine_scaled_reward": 0.18308642879128456, "rewards/format_reward": 0.8125, "step": 280 }, { "completion_length": 2452.1875610351562, "epoch": 0.3211428571428571, "grad_norm": 0.21252016723155975, "kl": 0.01434326171875, "learning_rate": 5.342952264838747e-07, "loss": 0.0006, "reward": 0.06680710799992085, "reward_std": 0.2203577384352684, "rewards/cosine_scaled_reward": -0.1721898689866066, "rewards/format_reward": 0.5416666828095913, "step": 281 }, { "completion_length": 1353.3750457763672, "epoch": 0.3222857142857143, "grad_norm": 0.2406478375196457, "kl": 0.008941650390625, "learning_rate": 5.311559558218603e-07, "loss": 0.0004, "reward": 0.3876841962337494, "reward_std": 0.2718113847076893, "rewards/cosine_scaled_reward": 0.08670558547601104, "rewards/format_reward": 0.9166666716337204, "step": 282 }, { "completion_length": 2312.291717529297, "epoch": 0.32342857142857145, "grad_norm": 0.18672321736812592, "kl": 0.01123046875, "learning_rate": 5.28017603591974e-07, "loss": 0.0004, "reward": 0.33478916296735406, "reward_std": 0.3283892571926117, "rewards/cosine_scaled_reward": 0.18481532111763954, "rewards/format_reward": 0.708333358168602, "step": 283 }, { "completion_length": 1362.9167175292969, "epoch": 0.32457142857142857, "grad_norm": 0.43515703082084656, "kl": 0.01312255859375, "learning_rate": 5.248803227530763e-07, "loss": 0.0005, "reward": 0.24751500971615314, "reward_std": 0.21904924511909485, "rewards/cosine_scaled_reward": -0.10085805598646402, "rewards/format_reward": 0.9166666716337204, "step": 284 }, { "completion_length": 914.6666870117188, "epoch": 0.32571428571428573, "grad_norm": 0.2816125154495239, "kl": 0.00911712646484375, "learning_rate": 5.21744266211809e-07, "loss": 0.0004, "reward": 0.3308372348546982, "reward_std": 0.22466129437088966, "rewards/cosine_scaled_reward": 0.007770329713821411, "rewards/format_reward": 1.0, "step": 285 }, { "completion_length": 1364.2709045410156, "epoch": 0.32685714285714285, "grad_norm": 0.24092473089694977, "kl": 0.00989532470703125, "learning_rate": 5.186095868151436e-07, "loss": 0.0004, "reward": 0.3854522183537483, "reward_std": 0.3654041290283203, "rewards/cosine_scaled_reward": 0.0710476387757808, "rewards/format_reward": 0.9375000149011612, "step": 286 }, { "completion_length": 1273.1042175292969, "epoch": 0.328, "grad_norm": 0.30035242438316345, "kl": 0.01116180419921875, "learning_rate": 5.154764373429315e-07, "loss": 0.0004, "reward": 0.19016006495803595, "reward_std": 0.1666824333369732, "rewards/cosine_scaled_reward": -0.10087527148425579, "rewards/format_reward": 0.7916666716337204, "step": 287 }, { "completion_length": 1498.5625305175781, "epoch": 0.3291428571428571, "grad_norm": 0.2478376030921936, "kl": 0.010406494140625, "learning_rate": 5.123449705004581e-07, "loss": 0.0004, "reward": 0.23150785267353058, "reward_std": 0.17795532755553722, "rewards/cosine_scaled_reward": -0.09038132801651955, "rewards/format_reward": 0.875, "step": 288 }, { "completion_length": 1331.5833892822266, "epoch": 0.3302857142857143, "grad_norm": 0.338457852602005, "kl": 0.015472412109375, "learning_rate": 5.09215338910999e-07, "loss": 0.0006, "reward": 0.2746650446206331, "reward_std": 0.23780345171689987, "rewards/cosine_scaled_reward": -0.03811670187860727, "rewards/format_reward": 0.8541666865348816, "step": 289 }, { "completion_length": 1039.1250610351562, "epoch": 0.3314285714285714, "grad_norm": 0.37133654952049255, "kl": 0.01009368896484375, "learning_rate": 5.060876951083828e-07, "loss": 0.0004, "reward": 0.3331058695912361, "reward_std": 0.26539327949285507, "rewards/cosine_scaled_reward": 0.03625666256994009, "rewards/format_reward": 0.9166666865348816, "step": 290 }, { "completion_length": 1281.6875610351562, "epoch": 0.3325714285714286, "grad_norm": 0.25397613644599915, "kl": 0.0101470947265625, "learning_rate": 5.02962191529556e-07, "loss": 0.0004, "reward": 0.28346237912774086, "reward_std": 0.3078480400145054, "rewards/cosine_scaled_reward": -0.04864836111664772, "rewards/format_reward": 0.9375000149011612, "step": 291 }, { "completion_length": 1514.6042175292969, "epoch": 0.33371428571428574, "grad_norm": 0.24802722036838531, "kl": 0.0099945068359375, "learning_rate": 4.998389805071536e-07, "loss": 0.0004, "reward": 0.22861065715551376, "reward_std": 0.19359621033072472, "rewards/cosine_scaled_reward": -0.1172735309228301, "rewards/format_reward": 0.875, "step": 292 }, { "completion_length": 1185.8958740234375, "epoch": 0.33485714285714285, "grad_norm": 0.2437037080526352, "kl": 0.0122222900390625, "learning_rate": 4.967182142620745e-07, "loss": 0.0005, "reward": 0.29054274410009384, "reward_std": 0.22883088141679764, "rewards/cosine_scaled_reward": -0.07128089666366577, "rewards/format_reward": 0.9583333432674408, "step": 293 }, { "completion_length": 1941.1458740234375, "epoch": 0.336, "grad_norm": 0.35831886529922485, "kl": 0.0147552490234375, "learning_rate": 4.93600044896063e-07, "loss": 0.0006, "reward": 0.2816973514854908, "reward_std": 0.3041081838309765, "rewards/cosine_scaled_reward": 0.06637562066316605, "rewards/format_reward": 0.7083333432674408, "step": 294 }, { "completion_length": 1546.1041870117188, "epoch": 0.33714285714285713, "grad_norm": 0.2846806049346924, "kl": 0.01165771484375, "learning_rate": 4.904846243842949e-07, "loss": 0.0005, "reward": 0.45683666691184044, "reward_std": 0.36336134001612663, "rewards/cosine_scaled_reward": 0.17266745120286942, "rewards/format_reward": 0.895833358168602, "step": 295 }, { "completion_length": 1614.5208435058594, "epoch": 0.3382857142857143, "grad_norm": 0.2929825782775879, "kl": 0.01120758056640625, "learning_rate": 4.873721045679706e-07, "loss": 0.0004, "reward": 0.1770874448120594, "reward_std": 0.2070347797125578, "rewards/cosine_scaled_reward": -0.15072383964434266, "rewards/format_reward": 0.8750000298023224, "step": 296 }, { "completion_length": 2259.0834045410156, "epoch": 0.3394285714285714, "grad_norm": 0.2630348205566406, "kl": 0.01708984375, "learning_rate": 4.842626371469149e-07, "loss": 0.0007, "reward": 0.2537407707422972, "reward_std": 0.27694452553987503, "rewards/cosine_scaled_reward": 0.010442063212394714, "rewards/format_reward": 0.7291666865348816, "step": 297 }, { "completion_length": 1325.3542022705078, "epoch": 0.3405714285714286, "grad_norm": 0.2507433593273163, "kl": 0.01007080078125, "learning_rate": 4.811563736721829e-07, "loss": 0.0004, "reward": 0.21724335104227066, "reward_std": 0.18220657296478748, "rewards/cosine_scaled_reward": -0.12216328456997871, "rewards/format_reward": 0.8958333432674408, "step": 298 }, { "completion_length": 1462.4791870117188, "epoch": 0.3417142857142857, "grad_norm": 0.3154860734939575, "kl": 0.0118408203125, "learning_rate": 4.780534655386743e-07, "loss": 0.0005, "reward": 0.25573862344026566, "reward_std": 0.19661388732492924, "rewards/cosine_scaled_reward": -0.05486198514699936, "rewards/format_reward": 0.8958333432674408, "step": 299 }, { "completion_length": 1630.166748046875, "epoch": 0.34285714285714286, "grad_norm": 0.9737704396247864, "kl": 0.019989013671875, "learning_rate": 4.749540639777539e-07, "loss": 0.0008, "reward": 0.23839148692786694, "reward_std": 0.23841408640146255, "rewards/cosine_scaled_reward": -0.037340753711760044, "rewards/format_reward": 0.770833358168602, "step": 300 }, { "completion_length": 1643.6041870117188, "epoch": 0.344, "grad_norm": 0.39158645272254944, "kl": 0.0224151611328125, "learning_rate": 4.7185832004988133e-07, "loss": 0.0009, "reward": 0.22077538957819343, "reward_std": 0.27438678219914436, "rewards/cosine_scaled_reward": -0.11595524847507477, "rewards/format_reward": 0.833333358168602, "step": 301 }, { "completion_length": 1667.6250457763672, "epoch": 0.34514285714285714, "grad_norm": 0.32535773515701294, "kl": 0.016754150390625, "learning_rate": 4.68766384637248e-07, "loss": 0.0007, "reward": 0.2695366069674492, "reward_std": 0.27404558658599854, "rewards/cosine_scaled_reward": 0.03671133052557707, "rewards/format_reward": 0.7500000149011612, "step": 302 }, { "completion_length": 1149.1250305175781, "epoch": 0.3462857142857143, "grad_norm": 0.3706236481666565, "kl": 0.01305389404296875, "learning_rate": 4.656784084364238e-07, "loss": 0.0005, "reward": 0.2421702779829502, "reward_std": 0.28027326986193657, "rewards/cosine_scaled_reward": -0.12036034030097653, "rewards/format_reward": 0.9375, "step": 303 }, { "completion_length": 1612.8125305175781, "epoch": 0.3474285714285714, "grad_norm": 0.2613138258457184, "kl": 0.0163421630859375, "learning_rate": 4.6259454195101267e-07, "loss": 0.0007, "reward": 0.3241605297662318, "reward_std": 0.23523182421922684, "rewards/cosine_scaled_reward": 0.048464858322404325, "rewards/format_reward": 0.8333333432674408, "step": 304 }, { "completion_length": 1475.8750305175781, "epoch": 0.3485714285714286, "grad_norm": 0.3334023058414459, "kl": 0.0114898681640625, "learning_rate": 4.59514935484316e-07, "loss": 0.0005, "reward": 0.16058492846786976, "reward_std": 0.24794265627861023, "rewards/cosine_scaled_reward": -0.1861222069710493, "rewards/format_reward": 0.8541666865348816, "step": 305 }, { "completion_length": 1062.2292175292969, "epoch": 0.3497142857142857, "grad_norm": 0.29249686002731323, "kl": 0.00969696044921875, "learning_rate": 4.5643973913200837e-07, "loss": 0.0004, "reward": 0.4680076576769352, "reward_std": 0.32026200741529465, "rewards/cosine_scaled_reward": 0.16786080971360207, "rewards/format_reward": 0.9583333432674408, "step": 306 }, { "completion_length": 1272.9583892822266, "epoch": 0.35085714285714287, "grad_norm": 0.31508970260620117, "kl": 0.00922393798828125, "learning_rate": 4.5336910277482155e-07, "loss": 0.0004, "reward": 0.2798173949122429, "reward_std": 0.35323888808488846, "rewards/cosine_scaled_reward": -0.01756212580949068, "rewards/format_reward": 0.8541666865348816, "step": 307 }, { "completion_length": 2330.9584350585938, "epoch": 0.352, "grad_norm": 0.2136361300945282, "kl": 0.0195465087890625, "learning_rate": 4.503031760712397e-07, "loss": 0.0008, "reward": 0.19128491915762424, "reward_std": 0.25959878973662853, "rewards/cosine_scaled_reward": -0.0526702341157943, "rewards/format_reward": 0.6666666716337204, "step": 308 }, { "completion_length": 1883.3333740234375, "epoch": 0.35314285714285715, "grad_norm": 0.20839928090572357, "kl": 0.0121612548828125, "learning_rate": 4.4724210845020494e-07, "loss": 0.0005, "reward": 0.2328551933169365, "reward_std": 0.2755979187786579, "rewards/cosine_scaled_reward": -0.09418771299533546, "rewards/format_reward": 0.8750000149011612, "step": 309 }, { "completion_length": 1376.1875610351562, "epoch": 0.35428571428571426, "grad_norm": 0.28804856538772583, "kl": 0.0186920166015625, "learning_rate": 4.441860491038345e-07, "loss": 0.0007, "reward": 0.2578071504831314, "reward_std": 0.24786734953522682, "rewards/cosine_scaled_reward": -0.14047110336832702, "rewards/format_reward": 0.9791666716337204, "step": 310 }, { "completion_length": 1313.750015258789, "epoch": 0.3554285714285714, "grad_norm": 0.3734670579433441, "kl": 0.0127410888671875, "learning_rate": 4.4113514698014953e-07, "loss": 0.0005, "reward": 0.29336644522845745, "reward_std": 0.2504027783870697, "rewards/cosine_scaled_reward": -0.012938316911458969, "rewards/format_reward": 0.9375000149011612, "step": 311 }, { "completion_length": 1223.416732788086, "epoch": 0.3565714285714286, "grad_norm": 0.3231605887413025, "kl": 0.01352691650390625, "learning_rate": 4.3808955077581546e-07, "loss": 0.0005, "reward": 0.49592292681336403, "reward_std": 0.2532188519835472, "rewards/cosine_scaled_reward": 0.2624667380005121, "rewards/format_reward": 0.9166666716337204, "step": 312 }, { "completion_length": 1883.1875610351562, "epoch": 0.3577142857142857, "grad_norm": 0.40404918789863586, "kl": 0.021453857421875, "learning_rate": 4.350494089288943e-07, "loss": 0.0009, "reward": 0.21103960182517767, "reward_std": 0.2670624628663063, "rewards/cosine_scaled_reward": -0.03821822814643383, "rewards/format_reward": 0.6875000149011612, "step": 313 }, { "completion_length": 1640.104263305664, "epoch": 0.3588571428571429, "grad_norm": 0.26183629035949707, "kl": 0.02033233642578125, "learning_rate": 4.3201486961161093e-07, "loss": 0.0008, "reward": 0.2432878129184246, "reward_std": 0.19171066209673882, "rewards/cosine_scaled_reward": -0.0031162824016064405, "rewards/format_reward": 0.75, "step": 314 }, { "completion_length": 1864.2084045410156, "epoch": 0.36, "grad_norm": 0.6592049598693848, "kl": 0.046630859375, "learning_rate": 4.2898608072313045e-07, "loss": 0.0019, "reward": 0.2404745277017355, "reward_std": 0.22513627633452415, "rewards/cosine_scaled_reward": -0.006075944751501083, "rewards/format_reward": 0.708333358168602, "step": 315 }, { "completion_length": 2024.6042175292969, "epoch": 0.36114285714285715, "grad_norm": 0.41091492772102356, "kl": 0.02691650390625, "learning_rate": 4.2596318988235037e-07, "loss": 0.0011, "reward": 0.04616490565240383, "reward_std": 0.15632366575300694, "rewards/cosine_scaled_reward": -0.2781721465289593, "rewards/format_reward": 0.6875000149011612, "step": 316 }, { "completion_length": 1890.6667175292969, "epoch": 0.36228571428571427, "grad_norm": 0.395164430141449, "kl": 0.0240631103515625, "learning_rate": 4.2294634442070553e-07, "loss": 0.001, "reward": 0.1601184867322445, "reward_std": 0.23606964573264122, "rewards/cosine_scaled_reward": -0.0570518858730793, "rewards/format_reward": 0.6041666716337204, "step": 317 }, { "completion_length": 1231.2916870117188, "epoch": 0.36342857142857143, "grad_norm": 0.36525601148605347, "kl": 0.022247314453125, "learning_rate": 4.1993569137498776e-07, "loss": 0.0009, "reward": 0.2613836098462343, "reward_std": 0.18278861418366432, "rewards/cosine_scaled_reward": -0.08080821360636037, "rewards/format_reward": 0.9375, "step": 318 }, { "completion_length": 1301.6041870117188, "epoch": 0.36457142857142855, "grad_norm": 0.360119104385376, "kl": 0.012481689453125, "learning_rate": 4.1693137748017915e-07, "loss": 0.0005, "reward": 0.2703345976769924, "reward_std": 0.27354278787970543, "rewards/cosine_scaled_reward": -0.05605246126651764, "rewards/format_reward": 0.8958333432674408, "step": 319 }, { "completion_length": 1155.2083587646484, "epoch": 0.3657142857142857, "grad_norm": 0.40306153893470764, "kl": 0.021759033203125, "learning_rate": 4.1393354916230005e-07, "loss": 0.0009, "reward": 0.3281657323241234, "reward_std": 0.3362472988665104, "rewards/cosine_scaled_reward": -0.012413738295435905, "rewards/format_reward": 0.9166666716337204, "step": 320 }, { "completion_length": 952.2708740234375, "epoch": 0.3668571428571429, "grad_norm": 0.29503145813941956, "kl": 0.00998687744140625, "learning_rate": 4.1094235253127374e-07, "loss": 0.0004, "reward": 0.44769028574228287, "reward_std": 0.2670172415673733, "rewards/cosine_scaled_reward": 0.1519340705126524, "rewards/format_reward": 1.0, "step": 321 }, { "completion_length": 1964.6667175292969, "epoch": 0.368, "grad_norm": 0.6683375239372253, "kl": 0.048004150390625, "learning_rate": 4.079579333738039e-07, "loss": 0.0019, "reward": 0.10212668823078275, "reward_std": 0.1831696219742298, "rewards/cosine_scaled_reward": -0.211124025285244, "rewards/format_reward": 0.6875000298023224, "step": 322 }, { "completion_length": 1723.979232788086, "epoch": 0.36914285714285716, "grad_norm": 0.38685184717178345, "kl": 0.03357696533203125, "learning_rate": 4.0498043714627006e-07, "loss": 0.0013, "reward": 0.19741436280310154, "reward_std": 0.20172200351953506, "rewards/cosine_scaled_reward": -0.08652417734265327, "rewards/format_reward": 0.7500000149011612, "step": 323 }, { "completion_length": 1568.0833740234375, "epoch": 0.3702857142857143, "grad_norm": 1.1231465339660645, "kl": 0.02925872802734375, "learning_rate": 4.020100089676376e-07, "loss": 0.0012, "reward": 0.2117295628413558, "reward_std": 0.28987129777669907, "rewards/cosine_scaled_reward": -0.07313152588903904, "rewards/format_reward": 0.7916666716337204, "step": 324 }, { "completion_length": 1840.2708892822266, "epoch": 0.37142857142857144, "grad_norm": 0.2682565152645111, "kl": 0.0226287841796875, "learning_rate": 3.9904679361238526e-07, "loss": 0.0009, "reward": 0.23154988093301654, "reward_std": 0.19991927221417427, "rewards/cosine_scaled_reward": -0.014693088829517365, "rewards/format_reward": 0.75, "step": 325 }, { "completion_length": 1522.4583435058594, "epoch": 0.37257142857142855, "grad_norm": 0.44652143120765686, "kl": 0.02679443359375, "learning_rate": 3.9609093550344907e-07, "loss": 0.0011, "reward": 0.2407391034066677, "reward_std": 0.22661786526441574, "rewards/cosine_scaled_reward": -0.04047023877501488, "rewards/format_reward": 0.8541666865348816, "step": 326 }, { "completion_length": 1716.7708435058594, "epoch": 0.3737142857142857, "grad_norm": 0.30573394894599915, "kl": 0.0191192626953125, "learning_rate": 3.931425787051832e-07, "loss": 0.0008, "reward": 0.3224290758371353, "reward_std": 0.2919512912631035, "rewards/cosine_scaled_reward": 0.08114048466086388, "rewards/format_reward": 0.7916666865348816, "step": 327 }, { "completion_length": 1643.166748046875, "epoch": 0.37485714285714283, "grad_norm": 0.46766549348831177, "kl": 0.0225830078125, "learning_rate": 3.902018669163384e-07, "loss": 0.0009, "reward": 0.18003581184893847, "reward_std": 0.19023212790489197, "rewards/cosine_scaled_reward": -0.14118107501417398, "rewards/format_reward": 0.8125000149011612, "step": 328 }, { "completion_length": 990.7083587646484, "epoch": 0.376, "grad_norm": 0.37604936957359314, "kl": 0.01605224609375, "learning_rate": 3.872689434630585e-07, "loss": 0.0006, "reward": 0.47424641251564026, "reward_std": 0.3888479918241501, "rewards/cosine_scaled_reward": 0.12837250716984272, "rewards/format_reward": 0.9583333432674408, "step": 329 }, { "completion_length": 1277.1666870117188, "epoch": 0.37714285714285717, "grad_norm": 0.6129769086837769, "kl": 0.0253753662109375, "learning_rate": 3.843439512918949e-07, "loss": 0.001, "reward": 0.2782764509320259, "reward_std": 0.3325960785150528, "rewards/cosine_scaled_reward": -0.05062708631157875, "rewards/format_reward": 0.8541667014360428, "step": 330 }, { "completion_length": 1592.5833740234375, "epoch": 0.3782857142857143, "grad_norm": 0.4822657108306885, "kl": 0.02129364013671875, "learning_rate": 3.8142703296283953e-07, "loss": 0.0009, "reward": 0.11315785581246018, "reward_std": 0.1640291679650545, "rewards/cosine_scaled_reward": -0.17293575033545494, "rewards/format_reward": 0.7291666716337204, "step": 331 }, { "completion_length": 1577.5834045410156, "epoch": 0.37942857142857145, "grad_norm": 1.590923547744751, "kl": 0.02500152587890625, "learning_rate": 3.785183306423767e-07, "loss": 0.001, "reward": 0.2356187179684639, "reward_std": 0.3015174902975559, "rewards/cosine_scaled_reward": -0.12084164097905159, "rewards/format_reward": 0.8541666865348816, "step": 332 }, { "completion_length": 1291.8542175292969, "epoch": 0.38057142857142856, "grad_norm": 0.25067755579948425, "kl": 0.0163116455078125, "learning_rate": 3.7561798609655373e-07, "loss": 0.0007, "reward": 0.41119640320539474, "reward_std": 0.2953006289899349, "rewards/cosine_scaled_reward": 0.05285493656992912, "rewards/format_reward": 1.0, "step": 333 }, { "completion_length": 2088.0000915527344, "epoch": 0.38171428571428573, "grad_norm": 0.3662320375442505, "kl": 0.0378875732421875, "learning_rate": 3.72726140684072e-07, "loss": 0.0015, "reward": 0.10603704676032066, "reward_std": 0.27144619822502136, "rewards/cosine_scaled_reward": -0.17696780152618885, "rewards/format_reward": 0.6666666865348816, "step": 334 }, { "completion_length": 1203.166732788086, "epoch": 0.38285714285714284, "grad_norm": 0.2919592559337616, "kl": 0.01375579833984375, "learning_rate": 3.6984293534939737e-07, "loss": 0.0006, "reward": 0.39064921438694, "reward_std": 0.27864502742886543, "rewards/cosine_scaled_reward": 0.0996411181986332, "rewards/format_reward": 0.9375000149011612, "step": 335 }, { "completion_length": 1457.9375305175781, "epoch": 0.384, "grad_norm": 0.5630917549133301, "kl": 0.02484130859375, "learning_rate": 3.6696851061588994e-07, "loss": 0.001, "reward": 0.3291323632001877, "reward_std": 0.3386707752943039, "rewards/cosine_scaled_reward": 0.032098641619086266, "rewards/format_reward": 0.7916667014360428, "step": 336 }, { "completion_length": 1539.9792175292969, "epoch": 0.3851428571428571, "grad_norm": 0.3664340674877167, "kl": 0.02039337158203125, "learning_rate": 3.641030065789562e-07, "loss": 0.0008, "reward": 0.2679777964949608, "reward_std": 0.2982511632144451, "rewards/cosine_scaled_reward": -0.08566620387136936, "rewards/format_reward": 0.9375000149011612, "step": 337 }, { "completion_length": 1319.6875305175781, "epoch": 0.3862857142857143, "grad_norm": 0.2860366404056549, "kl": 0.014190673828125, "learning_rate": 3.612465628992203e-07, "loss": 0.0006, "reward": 0.3431988013908267, "reward_std": 0.2697646599262953, "rewards/cosine_scaled_reward": 0.04516521096229553, "rewards/format_reward": 0.875, "step": 338 }, { "completion_length": 1608.1459045410156, "epoch": 0.38742857142857146, "grad_norm": 0.5042271018028259, "kl": 0.0297698974609375, "learning_rate": 3.5839931879571725e-07, "loss": 0.0012, "reward": 0.22079084441065788, "reward_std": 0.27200285717844963, "rewards/cosine_scaled_reward": -0.15526098851114511, "rewards/format_reward": 0.8958333432674408, "step": 339 }, { "completion_length": 1437.4583740234375, "epoch": 0.38857142857142857, "grad_norm": 0.3569318652153015, "kl": 0.01732635498046875, "learning_rate": 3.555614130391079e-07, "loss": 0.0007, "reward": 0.2634928924962878, "reward_std": 0.27312322705984116, "rewards/cosine_scaled_reward": -0.05826007016003132, "rewards/format_reward": 0.8750000149011612, "step": 340 }, { "completion_length": 1169.6042175292969, "epoch": 0.38971428571428574, "grad_norm": 0.2896157205104828, "kl": 0.0129241943359375, "learning_rate": 3.5273298394491515e-07, "loss": 0.0005, "reward": 0.46248795837163925, "reward_std": 0.31046775355935097, "rewards/cosine_scaled_reward": 0.1391837690025568, "rewards/format_reward": 0.9375000149011612, "step": 341 }, { "completion_length": 1596.3333587646484, "epoch": 0.39085714285714285, "grad_norm": 0.5315576791763306, "kl": 0.03363037109375, "learning_rate": 3.4991416936678276e-07, "loss": 0.0013, "reward": 0.3130345083773136, "reward_std": 0.3867286182940006, "rewards/cosine_scaled_reward": -0.06444146143621765, "rewards/format_reward": 0.8750000298023224, "step": 342 }, { "completion_length": 1510.6666870117188, "epoch": 0.392, "grad_norm": 0.4027750492095947, "kl": 0.02393341064453125, "learning_rate": 3.471051066897562e-07, "loss": 0.001, "reward": 0.4770284369587898, "reward_std": 0.4011606350541115, "rewards/cosine_scaled_reward": 0.1810836885124445, "rewards/format_reward": 0.9375000149011612, "step": 343 }, { "completion_length": 1328.3750305175781, "epoch": 0.3931428571428571, "grad_norm": 0.31406036019325256, "kl": 0.021820068359375, "learning_rate": 3.4430593282358777e-07, "loss": 0.0009, "reward": 0.4601411782205105, "reward_std": 0.26154783368110657, "rewards/cosine_scaled_reward": 0.20328249409794807, "rewards/format_reward": 0.9375000149011612, "step": 344 }, { "completion_length": 1693.1041870117188, "epoch": 0.3942857142857143, "grad_norm": 0.5955045819282532, "kl": 0.03485107421875, "learning_rate": 3.4151678419606233e-07, "loss": 0.0014, "reward": 0.2936294376850128, "reward_std": 0.25769656151533127, "rewards/cosine_scaled_reward": -0.0028788005001842976, "rewards/format_reward": 0.833333358168602, "step": 345 }, { "completion_length": 1577.75, "epoch": 0.3954285714285714, "grad_norm": 0.29605719447135925, "kl": 0.02605438232421875, "learning_rate": 3.387377967463493e-07, "loss": 0.001, "reward": 0.27892691642045975, "reward_std": 0.23167649656534195, "rewards/cosine_scaled_reward": -0.08691584412008524, "rewards/format_reward": 0.9375, "step": 346 }, { "completion_length": 1622.8958740234375, "epoch": 0.3965714285714286, "grad_norm": 0.27287817001342773, "kl": 0.01595306396484375, "learning_rate": 3.359691059183761e-07, "loss": 0.0006, "reward": 0.15320170670747757, "reward_std": 0.1787218227982521, "rewards/cosine_scaled_reward": -0.21337689459323883, "rewards/format_reward": 0.9166666716337204, "step": 347 }, { "completion_length": 1563.6875610351562, "epoch": 0.3977142857142857, "grad_norm": 0.32775041460990906, "kl": 0.03545379638671875, "learning_rate": 3.3321084665422803e-07, "loss": 0.0014, "reward": 0.36891554296016693, "reward_std": 0.24630171805620193, "rewards/cosine_scaled_reward": 0.08824050053954124, "rewards/format_reward": 0.875, "step": 348 }, { "completion_length": 1382.541748046875, "epoch": 0.39885714285714285, "grad_norm": 0.33139002323150635, "kl": 0.0301971435546875, "learning_rate": 3.3046315338757026e-07, "loss": 0.0012, "reward": 0.37559112161397934, "reward_std": 0.28325023502111435, "rewards/cosine_scaled_reward": 0.02644458832219243, "rewards/format_reward": 0.9375, "step": 349 }, { "completion_length": 900.2916717529297, "epoch": 0.4, "grad_norm": 0.2705373764038086, "kl": 0.008544921875, "learning_rate": 3.2772616003709616e-07, "loss": 0.0003, "reward": 0.2913980260491371, "reward_std": 0.30835580080747604, "rewards/cosine_scaled_reward": -0.0918974825181067, "rewards/format_reward": 0.9791666716337204, "step": 350 }, { "completion_length": 1303.0000305175781, "epoch": 0.40114285714285713, "grad_norm": 0.3872186839580536, "kl": 0.023712158203125, "learning_rate": 3.250000000000001e-07, "loss": 0.0009, "reward": 0.2837943397462368, "reward_std": 0.25149884819984436, "rewards/cosine_scaled_reward": -0.0790461078286171, "rewards/format_reward": 0.9375, "step": 351 }, { "completion_length": 1608.1250305175781, "epoch": 0.4022857142857143, "grad_norm": 0.6398208141326904, "kl": 0.02740478515625, "learning_rate": 3.222848061454764e-07, "loss": 0.0011, "reward": 0.28259705752134323, "reward_std": 0.2690298482775688, "rewards/cosine_scaled_reward": 0.0077111730352044106, "rewards/format_reward": 0.8125000149011612, "step": 352 }, { "completion_length": 1021.6666717529297, "epoch": 0.4034285714285714, "grad_norm": 0.41845765709877014, "kl": 0.01645660400390625, "learning_rate": 3.195807108082429e-07, "loss": 0.0007, "reward": 0.38043496757745743, "reward_std": 0.21722519025206566, "rewards/cosine_scaled_reward": 0.07915054634213448, "rewards/format_reward": 0.9791666716337204, "step": 353 }, { "completion_length": 979.8541870117188, "epoch": 0.4045714285714286, "grad_norm": 0.28274133801460266, "kl": 0.010162353515625, "learning_rate": 3.168878457820915e-07, "loss": 0.0004, "reward": 0.4308842793107033, "reward_std": 0.24077448807656765, "rewards/cosine_scaled_reward": 0.08124570176005363, "rewards/format_reward": 1.0, "step": 354 }, { "completion_length": 1101.3958587646484, "epoch": 0.4057142857142857, "grad_norm": 0.38370591402053833, "kl": 0.01351165771484375, "learning_rate": 3.142063423134644e-07, "loss": 0.0005, "reward": 0.3669841568917036, "reward_std": 0.2703222706913948, "rewards/cosine_scaled_reward": -0.0021127446088939905, "rewards/format_reward": 0.9791666716337204, "step": 355 }, { "completion_length": 1464.9791870117188, "epoch": 0.40685714285714286, "grad_norm": 0.3078323006629944, "kl": 0.032012939453125, "learning_rate": 3.115363310950578e-07, "loss": 0.0013, "reward": 0.3374742101877928, "reward_std": 0.32919860631227493, "rewards/cosine_scaled_reward": 0.003608912229537964, "rewards/format_reward": 0.8958333432674408, "step": 356 }, { "completion_length": 1756.8334045410156, "epoch": 0.408, "grad_norm": 0.5091083645820618, "kl": 0.0364532470703125, "learning_rate": 3.0887794225945143e-07, "loss": 0.0015, "reward": 0.14945390354841948, "reward_std": 0.2556047923862934, "rewards/cosine_scaled_reward": -0.2234173621982336, "rewards/format_reward": 0.8541666865348816, "step": 357 }, { "completion_length": 1431.8333740234375, "epoch": 0.40914285714285714, "grad_norm": 0.27200624346733093, "kl": 0.0190277099609375, "learning_rate": 3.062313053727671e-07, "loss": 0.0008, "reward": 0.39337392151355743, "reward_std": 0.3704180307686329, "rewards/cosine_scaled_reward": 0.07256992720067501, "rewards/format_reward": 0.9166666716337204, "step": 358 }, { "completion_length": 1123.437515258789, "epoch": 0.4102857142857143, "grad_norm": 0.6349442601203918, "kl": 0.029815673828125, "learning_rate": 3.0359654942835247e-07, "loss": 0.0012, "reward": 0.23123174533247948, "reward_std": 0.25142205134034157, "rewards/cosine_scaled_reward": -0.14995449781417847, "rewards/format_reward": 0.8958333432674408, "step": 359 }, { "completion_length": 1300.9791870117188, "epoch": 0.4114285714285714, "grad_norm": 0.38422876596450806, "kl": 0.031402587890625, "learning_rate": 3.0097380284049523e-07, "loss": 0.0013, "reward": 0.3842775635421276, "reward_std": 0.33736754953861237, "rewards/cosine_scaled_reward": 0.07002734206616879, "rewards/format_reward": 0.9583333432674408, "step": 360 }, { "completion_length": 1139.0625305175781, "epoch": 0.4125714285714286, "grad_norm": 0.28237876296043396, "kl": 0.0174102783203125, "learning_rate": 2.9836319343816397e-07, "loss": 0.0007, "reward": 0.2165006436407566, "reward_std": 0.19712563790380955, "rewards/cosine_scaled_reward": -0.1817290298640728, "rewards/format_reward": 0.9791666716337204, "step": 361 }, { "completion_length": 1061.4583587646484, "epoch": 0.4137142857142857, "grad_norm": 0.35058391094207764, "kl": 0.0254669189453125, "learning_rate": 2.9576484845877793e-07, "loss": 0.001, "reward": 0.3845072239637375, "reward_std": 0.2359091378748417, "rewards/cosine_scaled_reward": 0.058078229427337646, "rewards/format_reward": 0.9791666716337204, "step": 362 }, { "completion_length": 951.1042022705078, "epoch": 0.41485714285714287, "grad_norm": 0.4236137568950653, "kl": 0.02153778076171875, "learning_rate": 2.931788945420058e-07, "loss": 0.0009, "reward": 0.561962466686964, "reward_std": 0.26705051213502884, "rewards/cosine_scaled_reward": 0.26302190124988556, "rewards/format_reward": 0.9791666716337204, "step": 363 }, { "completion_length": 1268.2917175292969, "epoch": 0.416, "grad_norm": 0.3417412340641022, "kl": 0.01409912109375, "learning_rate": 2.9060545772359305e-07, "loss": 0.0006, "reward": 0.1475709266960621, "reward_std": 0.17424429766833782, "rewards/cosine_scaled_reward": -0.26906686276197433, "rewards/format_reward": 0.9375000149011612, "step": 364 }, { "completion_length": 1956.9375610351562, "epoch": 0.41714285714285715, "grad_norm": 0.4032626748085022, "kl": 0.0582122802734375, "learning_rate": 2.8804466342921987e-07, "loss": 0.0023, "reward": 0.13641461171209812, "reward_std": 0.26020678877830505, "rewards/cosine_scaled_reward": -0.15251348353922367, "rewards/format_reward": 0.666666679084301, "step": 365 }, { "completion_length": 1301.8958892822266, "epoch": 0.41828571428571426, "grad_norm": 0.40691182017326355, "kl": 0.02590179443359375, "learning_rate": 2.854966364683872e-07, "loss": 0.001, "reward": 0.31418493390083313, "reward_std": 0.2587681859731674, "rewards/cosine_scaled_reward": 0.01146254688501358, "rewards/format_reward": 0.875, "step": 366 }, { "completion_length": 1587.8125610351562, "epoch": 0.41942857142857143, "grad_norm": 0.45763275027275085, "kl": 0.027862548828125, "learning_rate": 2.829615010283344e-07, "loss": 0.0011, "reward": 0.3541104570031166, "reward_std": 0.2708537131547928, "rewards/cosine_scaled_reward": -0.004807896912097931, "rewards/format_reward": 0.9166666716337204, "step": 367 }, { "completion_length": 1991.2500305175781, "epoch": 0.4205714285714286, "grad_norm": 0.932867169380188, "kl": 0.042816162109375, "learning_rate": 2.8043938066798645e-07, "loss": 0.0017, "reward": 0.27406515926122665, "reward_std": 0.4030579626560211, "rewards/cosine_scaled_reward": -0.029212753055617213, "rewards/format_reward": 0.770833358168602, "step": 368 }, { "completion_length": 1489.1875305175781, "epoch": 0.4217142857142857, "grad_norm": 0.5538850426673889, "kl": 0.037109375, "learning_rate": 2.7793039831193133e-07, "loss": 0.0015, "reward": 0.3791775330901146, "reward_std": 0.42510559409856796, "rewards/cosine_scaled_reward": 0.015287954360246658, "rewards/format_reward": 0.8958333432674408, "step": 369 }, { "completion_length": 1329.6042175292969, "epoch": 0.4228571428571429, "grad_norm": 0.4791981875896454, "kl": 0.041656494140625, "learning_rate": 2.7543467624442956e-07, "loss": 0.0017, "reward": 0.27333252876996994, "reward_std": 0.22211980447173119, "rewards/cosine_scaled_reward": -0.06442724168300629, "rewards/format_reward": 0.9166666716337204, "step": 370 }, { "completion_length": 690.0833511352539, "epoch": 0.424, "grad_norm": 0.3764467239379883, "kl": 0.009735107421875, "learning_rate": 2.729523361034538e-07, "loss": 0.0004, "reward": 0.3839330803602934, "reward_std": 0.1987195983529091, "rewards/cosine_scaled_reward": 0.08161145448684692, "rewards/format_reward": 1.0, "step": 371 }, { "completion_length": 1491.0208435058594, "epoch": 0.42514285714285716, "grad_norm": 0.46033698320388794, "kl": 0.0261383056640625, "learning_rate": 2.7048349887476037e-07, "loss": 0.001, "reward": 0.3577600382268429, "reward_std": 0.221741683781147, "rewards/cosine_scaled_reward": 0.05540268123149872, "rewards/format_reward": 0.9166666865348816, "step": 372 }, { "completion_length": 973.4375305175781, "epoch": 0.42628571428571427, "grad_norm": 0.5783974528312683, "kl": 0.024383544921875, "learning_rate": 2.6802828488599294e-07, "loss": 0.001, "reward": 0.2985563389956951, "reward_std": 0.24250118806958199, "rewards/cosine_scaled_reward": -0.07883045147173107, "rewards/format_reward": 0.9375000149011612, "step": 373 }, { "completion_length": 1205.8542175292969, "epoch": 0.42742857142857144, "grad_norm": 0.34894105792045593, "kl": 0.01763153076171875, "learning_rate": 2.655868138008171e-07, "loss": 0.0007, "reward": 0.35072382912039757, "reward_std": 0.20784537121653557, "rewards/cosine_scaled_reward": 0.04581686854362488, "rewards/format_reward": 0.9583333432674408, "step": 374 }, { "completion_length": 1723.7708740234375, "epoch": 0.42857142857142855, "grad_norm": 0.3739396929740906, "kl": 0.072723388671875, "learning_rate": 2.631592046130896e-07, "loss": 0.0029, "reward": 0.44560275599360466, "reward_std": 0.2987182140350342, "rewards/cosine_scaled_reward": 0.17832038179039955, "rewards/format_reward": 0.7916666679084301, "step": 375 }, { "completion_length": 1434.4583587646484, "epoch": 0.4297142857142857, "grad_norm": 0.6302974224090576, "kl": 0.0382080078125, "learning_rate": 2.6074557564105724e-07, "loss": 0.0015, "reward": 0.20585713908076286, "reward_std": 0.22607924416661263, "rewards/cosine_scaled_reward": -0.1532294088974595, "rewards/format_reward": 0.895833358168602, "step": 376 }, { "completion_length": 1910.9583740234375, "epoch": 0.4308571428571429, "grad_norm": 0.4583210051059723, "kl": 0.06280517578125, "learning_rate": 2.583460445215911e-07, "loss": 0.0025, "reward": 0.2012559212744236, "reward_std": 0.2751467525959015, "rewards/cosine_scaled_reward": -0.11389604769647121, "rewards/format_reward": 0.833333358168602, "step": 377 }, { "completion_length": 1376.5416870117188, "epoch": 0.432, "grad_norm": 0.4945673644542694, "kl": 0.036407470703125, "learning_rate": 2.5596072820445254e-07, "loss": 0.0015, "reward": 0.35591157153248787, "reward_std": 0.3600316420197487, "rewards/cosine_scaled_reward": 0.007506262511014938, "rewards/format_reward": 0.9375000149011612, "step": 378 }, { "completion_length": 1669.3542175292969, "epoch": 0.43314285714285716, "grad_norm": 0.7727935314178467, "kl": 0.0548858642578125, "learning_rate": 2.5358974294659373e-07, "loss": 0.0022, "reward": 0.22004763688892126, "reward_std": 0.252176720649004, "rewards/cosine_scaled_reward": -0.09906591847538948, "rewards/format_reward": 0.8333333432674408, "step": 379 }, { "completion_length": 1446.5209045410156, "epoch": 0.4342857142857143, "grad_norm": 0.3336326479911804, "kl": 0.0492706298828125, "learning_rate": 2.512332043064913e-07, "loss": 0.002, "reward": 0.28320768661797047, "reward_std": 0.20618024468421936, "rewards/cosine_scaled_reward": 0.01299813762307167, "rewards/format_reward": 0.8541666716337204, "step": 380 }, { "completion_length": 1881.1042175292969, "epoch": 0.43542857142857144, "grad_norm": 0.7163864970207214, "kl": 0.068115234375, "learning_rate": 2.488912271385139e-07, "loss": 0.0027, "reward": 0.16360357124358416, "reward_std": 0.25915900990366936, "rewards/cosine_scaled_reward": -0.16741767711937428, "rewards/format_reward": 0.8125000298023224, "step": 381 }, { "completion_length": 1083.3750305175781, "epoch": 0.43657142857142855, "grad_norm": 0.3177710175514221, "kl": 0.0191650390625, "learning_rate": 2.465639255873246e-07, "loss": 0.0008, "reward": 0.19142531976103783, "reward_std": 0.2180217020213604, "rewards/cosine_scaled_reward": -0.1876898668706417, "rewards/format_reward": 0.9375, "step": 382 }, { "completion_length": 1691.6459045410156, "epoch": 0.4377142857142857, "grad_norm": 0.8062416315078735, "kl": 0.09381103515625, "learning_rate": 2.4425141308231765e-07, "loss": 0.0038, "reward": 0.43192901834845543, "reward_std": 0.34921496361494064, "rewards/cosine_scaled_reward": 0.20125210843980312, "rewards/format_reward": 0.8750000298023224, "step": 383 }, { "completion_length": 1159.312515258789, "epoch": 0.43885714285714283, "grad_norm": 0.50071781873703, "kl": 0.02691650390625, "learning_rate": 2.4195380233209006e-07, "loss": 0.0011, "reward": 0.5177538767457008, "reward_std": 0.33252015709877014, "rewards/cosine_scaled_reward": 0.27795255556702614, "rewards/format_reward": 0.9166666865348816, "step": 384 }, { "completion_length": 1704.5209045410156, "epoch": 0.44, "grad_norm": 0.9335633516311646, "kl": 0.0668487548828125, "learning_rate": 2.3967120531894857e-07, "loss": 0.0027, "reward": 0.20765825361013412, "reward_std": 0.21825537830591202, "rewards/cosine_scaled_reward": -0.10266052093356848, "rewards/format_reward": 0.8333333432674408, "step": 385 }, { "completion_length": 1376.8958435058594, "epoch": 0.44114285714285717, "grad_norm": 0.4630853235721588, "kl": 0.03307342529296875, "learning_rate": 2.374037332934512e-07, "loss": 0.0013, "reward": 0.4098484069108963, "reward_std": 0.2593814432621002, "rewards/cosine_scaled_reward": 0.07745273411273956, "rewards/format_reward": 0.9375, "step": 386 }, { "completion_length": 1727.9167175292969, "epoch": 0.4422857142857143, "grad_norm": 0.7240560054779053, "kl": 0.0662994384765625, "learning_rate": 2.3515149676898552e-07, "loss": 0.0026, "reward": 0.29854518361389637, "reward_std": 0.27315741032361984, "rewards/cosine_scaled_reward": 0.015177648514509201, "rewards/format_reward": 0.8125000149011612, "step": 387 }, { "completion_length": 1099.3333740234375, "epoch": 0.44342857142857145, "grad_norm": 0.6079539060592651, "kl": 0.023193359375, "learning_rate": 2.3291460551638237e-07, "loss": 0.0009, "reward": 0.38250723481178284, "reward_std": 0.2334286943078041, "rewards/cosine_scaled_reward": 0.054990146309137344, "rewards/format_reward": 1.0, "step": 388 }, { "completion_length": 1335.6041870117188, "epoch": 0.44457142857142856, "grad_norm": 0.38633546233177185, "kl": 0.05527496337890625, "learning_rate": 2.306931685585657e-07, "loss": 0.0022, "reward": 0.24971096962690353, "reward_std": 0.23452140390872955, "rewards/cosine_scaled_reward": -0.11643039900809526, "rewards/format_reward": 0.9375000149011612, "step": 389 }, { "completion_length": 1625.8125457763672, "epoch": 0.44571428571428573, "grad_norm": 0.5874533653259277, "kl": 0.045379638671875, "learning_rate": 2.2848729416523859e-07, "loss": 0.0018, "reward": 0.14043784514069557, "reward_std": 0.2116746250540018, "rewards/cosine_scaled_reward": -0.16961687617003918, "rewards/format_reward": 0.7708333432674408, "step": 390 }, { "completion_length": 1688.0833587646484, "epoch": 0.44685714285714284, "grad_norm": 0.8851686716079712, "kl": 0.11041259765625, "learning_rate": 2.2629708984760706e-07, "loss": 0.0044, "reward": 0.31278832722455263, "reward_std": 0.4149634316563606, "rewards/cosine_scaled_reward": 0.03951522649731487, "rewards/format_reward": 0.7500000074505806, "step": 391 }, { "completion_length": 1464.4791870117188, "epoch": 0.448, "grad_norm": 0.7757493853569031, "kl": 0.05254364013671875, "learning_rate": 2.2412266235313973e-07, "loss": 0.0021, "reward": 0.23795920982956886, "reward_std": 0.305756276473403, "rewards/cosine_scaled_reward": -0.10466890409588814, "rewards/format_reward": 0.8958333432674408, "step": 392 }, { "completion_length": 1721.9583740234375, "epoch": 0.4491428571428571, "grad_norm": 0.5244545936584473, "kl": 0.07568359375, "learning_rate": 2.2196411766036487e-07, "loss": 0.003, "reward": 0.274740107357502, "reward_std": 0.3141351081430912, "rewards/cosine_scaled_reward": -0.02863520081155002, "rewards/format_reward": 0.833333358168602, "step": 393 }, { "completion_length": 1661.3542022705078, "epoch": 0.4502857142857143, "grad_norm": 0.8934553861618042, "kl": 0.077362060546875, "learning_rate": 2.1982156097370557e-07, "loss": 0.0031, "reward": 0.15594827197492123, "reward_std": 0.2428222820162773, "rewards/cosine_scaled_reward": -0.17488732561469078, "rewards/format_reward": 0.7916666865348816, "step": 394 }, { "completion_length": 1452.1458740234375, "epoch": 0.4514285714285714, "grad_norm": 0.6420386433601379, "kl": 0.0724945068359375, "learning_rate": 2.1769509671835223e-07, "loss": 0.0029, "reward": 0.3845909982919693, "reward_std": 0.3638080097734928, "rewards/cosine_scaled_reward": 0.08904469013214111, "rewards/format_reward": 0.7916666865348816, "step": 395 }, { "completion_length": 1154.7292022705078, "epoch": 0.45257142857142857, "grad_norm": 0.5331507325172424, "kl": 0.02313232421875, "learning_rate": 2.1558482853517253e-07, "loss": 0.0009, "reward": 0.386741541326046, "reward_std": 0.21999208815395832, "rewards/cosine_scaled_reward": 0.00984945148229599, "rewards/format_reward": 0.9791666716337204, "step": 396 }, { "completion_length": 1427.3125610351562, "epoch": 0.45371428571428574, "grad_norm": 0.3781166970729828, "kl": 0.0625, "learning_rate": 2.134908592756607e-07, "loss": 0.0025, "reward": 0.20402754470705986, "reward_std": 0.2863064855337143, "rewards/cosine_scaled_reward": -0.1487132391630439, "rewards/format_reward": 0.8125, "step": 397 }, { "completion_length": 1177.8333587646484, "epoch": 0.45485714285714285, "grad_norm": 0.4327796697616577, "kl": 0.054107666015625, "learning_rate": 2.1141329099692406e-07, "loss": 0.0022, "reward": 0.2399006700143218, "reward_std": 0.22659046947956085, "rewards/cosine_scaled_reward": -0.09948757383972406, "rewards/format_reward": 0.8750000149011612, "step": 398 }, { "completion_length": 1008.2708740234375, "epoch": 0.456, "grad_norm": 0.26638633012771606, "kl": 0.016021728515625, "learning_rate": 2.0935222495670968e-07, "loss": 0.0006, "reward": 0.4025176055729389, "reward_std": 0.2404554933309555, "rewards/cosine_scaled_reward": 0.07033197954297066, "rewards/format_reward": 1.0, "step": 399 }, { "completion_length": 911.3750305175781, "epoch": 0.45714285714285713, "grad_norm": 0.6145984530448914, "kl": 0.01869964599609375, "learning_rate": 2.0730776160846853e-07, "loss": 0.0007, "reward": 0.4475930854678154, "reward_std": 0.18855594843626022, "rewards/cosine_scaled_reward": 0.1654730625450611, "rewards/format_reward": 0.9791666716337204, "step": 400 }, { "completion_length": 1711.6458740234375, "epoch": 0.4582857142857143, "grad_norm": 0.7000367641448975, "kl": 0.11761474609375, "learning_rate": 2.0528000059645995e-07, "loss": 0.0047, "reward": 0.2637552545638755, "reward_std": 0.2834421545267105, "rewards/cosine_scaled_reward": -0.017568758921697736, "rewards/format_reward": 0.7500000298023224, "step": 401 }, { "completion_length": 1553.1875610351562, "epoch": 0.4594285714285714, "grad_norm": 0.9715263247489929, "kl": 0.084716796875, "learning_rate": 2.032690407508949e-07, "loss": 0.0034, "reward": 0.3285421133041382, "reward_std": 0.29113895259797573, "rewards/cosine_scaled_reward": -0.012742497026920319, "rewards/format_reward": 0.895833358168602, "step": 402 }, { "completion_length": 1059.2916870117188, "epoch": 0.4605714285714286, "grad_norm": 0.7187147736549377, "kl": 0.046661376953125, "learning_rate": 2.0127498008311922e-07, "loss": 0.0019, "reward": 0.33717523515224457, "reward_std": 0.24185105971992016, "rewards/cosine_scaled_reward": 0.04978985991328955, "rewards/format_reward": 0.8750000149011612, "step": 403 }, { "completion_length": 1179.8333435058594, "epoch": 0.4617142857142857, "grad_norm": 0.6305052638053894, "kl": 0.03697967529296875, "learning_rate": 1.9929791578083655e-07, "loss": 0.0015, "reward": 0.29722677916288376, "reward_std": 0.2047160156071186, "rewards/cosine_scaled_reward": -0.04974408820271492, "rewards/format_reward": 0.9375000149011612, "step": 404 }, { "completion_length": 1272.2500305175781, "epoch": 0.46285714285714286, "grad_norm": 1.0514519214630127, "kl": 0.03973388671875, "learning_rate": 1.9733794420337213e-07, "loss": 0.0016, "reward": 0.4752641096711159, "reward_std": 0.3702741339802742, "rewards/cosine_scaled_reward": 0.18676720187067986, "rewards/format_reward": 0.9375000149011612, "step": 405 }, { "completion_length": 1381.7917022705078, "epoch": 0.464, "grad_norm": 0.4949979782104492, "kl": 0.05523681640625, "learning_rate": 1.9539516087697517e-07, "loss": 0.0022, "reward": 0.3171195648610592, "reward_std": 0.24485689774155617, "rewards/cosine_scaled_reward": 0.00026333145797252655, "rewards/format_reward": 0.9583333432674408, "step": 406 }, { "completion_length": 1613.0833435058594, "epoch": 0.46514285714285714, "grad_norm": 0.48335549235343933, "kl": 0.07067108154296875, "learning_rate": 1.934696604901642e-07, "loss": 0.0028, "reward": 0.3383835516870022, "reward_std": 0.1939285695552826, "rewards/cosine_scaled_reward": 0.005908636376261711, "rewards/format_reward": 0.9375000149011612, "step": 407 }, { "completion_length": 1326.1458587646484, "epoch": 0.4662857142857143, "grad_norm": 0.42962008714675903, "kl": 0.04166412353515625, "learning_rate": 1.915615368891117e-07, "loss": 0.0017, "reward": 0.5063566863536835, "reward_std": 0.39465366303920746, "rewards/cosine_scaled_reward": 0.18573056161403656, "rewards/format_reward": 0.9583333432674408, "step": 408 }, { "completion_length": 1747.8958740234375, "epoch": 0.4674285714285714, "grad_norm": 0.5078721642494202, "kl": 0.1215057373046875, "learning_rate": 1.8967088307307e-07, "loss": 0.0049, "reward": 0.20226813852787018, "reward_std": 0.23090328555554152, "rewards/cosine_scaled_reward": -0.08720803633332253, "rewards/format_reward": 0.7916666865348816, "step": 409 }, { "completion_length": 1563.3958892822266, "epoch": 0.4685714285714286, "grad_norm": 0.6820676922798157, "kl": 0.083648681640625, "learning_rate": 1.8779779118983867e-07, "loss": 0.0033, "reward": 0.4776548147201538, "reward_std": 0.29904134944081306, "rewards/cosine_scaled_reward": 0.15387392230331898, "rewards/format_reward": 0.9791666716337204, "step": 410 }, { "completion_length": 1896.6458740234375, "epoch": 0.4697142857142857, "grad_norm": 0.7664035558700562, "kl": 0.136627197265625, "learning_rate": 1.8594235253127372e-07, "loss": 0.0055, "reward": 0.17960253171622753, "reward_std": 0.28863129764795303, "rewards/cosine_scaled_reward": -0.1354135131696239, "rewards/format_reward": 0.7500000149011612, "step": 411 }, { "completion_length": 1207.9166870117188, "epoch": 0.47085714285714286, "grad_norm": 0.7434352040290833, "kl": 0.0690765380859375, "learning_rate": 1.8410465752883758e-07, "loss": 0.0028, "reward": 0.31478671357035637, "reward_std": 0.2499629482626915, "rewards/cosine_scaled_reward": -0.020952284336090088, "rewards/format_reward": 0.9375000149011612, "step": 412 }, { "completion_length": 1497.1875305175781, "epoch": 0.472, "grad_norm": 0.6059801578521729, "kl": 0.094818115234375, "learning_rate": 1.822847957491922e-07, "loss": 0.0038, "reward": 0.25864014215767384, "reward_std": 0.3024759627878666, "rewards/cosine_scaled_reward": -0.019603880122303963, "rewards/format_reward": 0.8541666716337204, "step": 413 }, { "completion_length": 2146.4583740234375, "epoch": 0.47314285714285714, "grad_norm": 0.8143852353096008, "kl": 0.1427001953125, "learning_rate": 1.804828558898332e-07, "loss": 0.0057, "reward": 0.12906305491924286, "reward_std": 0.23890957981348038, "rewards/cosine_scaled_reward": -0.22878951579332352, "rewards/format_reward": 0.7916666865348816, "step": 414 }, { "completion_length": 1836.2084350585938, "epoch": 0.4742857142857143, "grad_norm": 0.836365818977356, "kl": 0.17041015625, "learning_rate": 1.7869892577476722e-07, "loss": 0.0068, "reward": 0.29722545109689236, "reward_std": 0.26087959110736847, "rewards/cosine_scaled_reward": -0.022759397514164448, "rewards/format_reward": 0.8125000149011612, "step": 415 }, { "completion_length": 1202.0625305175781, "epoch": 0.4754285714285714, "grad_norm": 0.6389414668083191, "kl": 0.0267486572265625, "learning_rate": 1.7693309235023127e-07, "loss": 0.0011, "reward": 0.5062696859240532, "reward_std": 0.36872031539678574, "rewards/cosine_scaled_reward": 0.207824494689703, "rewards/format_reward": 1.0, "step": 416 }, { "completion_length": 1793.291748046875, "epoch": 0.4765714285714286, "grad_norm": 1.9947246313095093, "kl": 0.144775390625, "learning_rate": 1.7518544168045524e-07, "loss": 0.0058, "reward": 0.22884063888341188, "reward_std": 0.2757262773811817, "rewards/cosine_scaled_reward": -0.11375176906585693, "rewards/format_reward": 0.8125000298023224, "step": 417 }, { "completion_length": 1436.4792098999023, "epoch": 0.4777142857142857, "grad_norm": 1.2780954837799072, "kl": 0.1436767578125, "learning_rate": 1.7345605894346726e-07, "loss": 0.0058, "reward": 0.3646044321358204, "reward_std": 0.25644519180059433, "rewards/cosine_scaled_reward": 0.11428587138652802, "rewards/format_reward": 0.833333358168602, "step": 418 }, { "completion_length": 1709.5, "epoch": 0.47885714285714287, "grad_norm": 1.3950793743133545, "kl": 0.15716552734375, "learning_rate": 1.7174502842694212e-07, "loss": 0.0063, "reward": 0.2546742111444473, "reward_std": 0.2845570184290409, "rewards/cosine_scaled_reward": -0.027704435400664806, "rewards/format_reward": 0.7916666716337204, "step": 419 }, { "completion_length": 1101.9583587646484, "epoch": 0.48, "grad_norm": 1.140668511390686, "kl": 0.080596923828125, "learning_rate": 1.7005243352409333e-07, "loss": 0.0032, "reward": 0.2978057861328125, "reward_std": 0.23859090358018875, "rewards/cosine_scaled_reward": -0.10128935193642974, "rewards/format_reward": 0.9375000149011612, "step": 420 }, { "completion_length": 1453.6250305175781, "epoch": 0.48114285714285715, "grad_norm": 0.6903990507125854, "kl": 0.143310546875, "learning_rate": 1.6837835672960831e-07, "loss": 0.0057, "reward": 0.24449162557721138, "reward_std": 0.25894002988934517, "rewards/cosine_scaled_reward": -0.07152132876217365, "rewards/format_reward": 0.8750000149011612, "step": 421 }, { "completion_length": 1786.6875305175781, "epoch": 0.48228571428571426, "grad_norm": 0.7118463516235352, "kl": 0.1838836669921875, "learning_rate": 1.6672287963562852e-07, "loss": 0.0073, "reward": 0.26678669825196266, "reward_std": 0.22027145139873028, "rewards/cosine_scaled_reward": 0.008928820490837097, "rewards/format_reward": 0.7708333432674408, "step": 422 }, { "completion_length": 1891.7500305175781, "epoch": 0.48342857142857143, "grad_norm": 1.0444345474243164, "kl": 0.233154296875, "learning_rate": 1.6508608292777203e-07, "loss": 0.0093, "reward": 0.16685185953974724, "reward_std": 0.24080579355359077, "rewards/cosine_scaled_reward": -0.11634586472064257, "rewards/format_reward": 0.770833358168602, "step": 423 }, { "completion_length": 1682.8333740234375, "epoch": 0.4845714285714286, "grad_norm": 1.0426344871520996, "kl": 0.1017913818359375, "learning_rate": 1.6346804638120098e-07, "loss": 0.0041, "reward": 0.17576977238059044, "reward_std": 0.25679684802889824, "rewards/cosine_scaled_reward": -0.17408692091703415, "rewards/format_reward": 0.8333333432674408, "step": 424 }, { "completion_length": 1349.2708892822266, "epoch": 0.4857142857142857, "grad_norm": 0.44903820753097534, "kl": 0.058441162109375, "learning_rate": 1.6186884885673413e-07, "loss": 0.0023, "reward": 0.5212084054946899, "reward_std": 0.31782156974077225, "rewards/cosine_scaled_reward": 0.31852289102971554, "rewards/format_reward": 0.9375000149011612, "step": 425 }, { "completion_length": 1496.7500610351562, "epoch": 0.4868571428571429, "grad_norm": 1.0811808109283447, "kl": 0.2073974609375, "learning_rate": 1.6028856829700258e-07, "loss": 0.0083, "reward": 0.26587871834635735, "reward_std": 0.25409030355513096, "rewards/cosine_scaled_reward": -0.07828386966139078, "rewards/format_reward": 0.8541666716337204, "step": 426 }, { "completion_length": 1663.2709045410156, "epoch": 0.488, "grad_norm": 1.3951669931411743, "kl": 0.111724853515625, "learning_rate": 1.5872728172265146e-07, "loss": 0.0045, "reward": 0.34958234429359436, "reward_std": 0.4163043648004532, "rewards/cosine_scaled_reward": 0.035510750487446785, "rewards/format_reward": 0.8333333432674408, "step": 427 }, { "completion_length": 1454.4792175292969, "epoch": 0.48914285714285716, "grad_norm": 0.8079267740249634, "kl": 0.0900115966796875, "learning_rate": 1.5718506522858572e-07, "loss": 0.0036, "reward": 0.2652810290455818, "reward_std": 0.2516926135867834, "rewards/cosine_scaled_reward": -0.06710008531808853, "rewards/format_reward": 0.8958333432674408, "step": 428 }, { "completion_length": 1153.0625457763672, "epoch": 0.49028571428571427, "grad_norm": 0.755164384841919, "kl": 0.112701416015625, "learning_rate": 1.5566199398026147e-07, "loss": 0.0045, "reward": 0.28123652189970016, "reward_std": 0.25917378067970276, "rewards/cosine_scaled_reward": -0.06613434542668983, "rewards/format_reward": 0.9375000149011612, "step": 429 }, { "completion_length": 1428.2708587646484, "epoch": 0.49142857142857144, "grad_norm": 1.3430911302566528, "kl": 0.1293487548828125, "learning_rate": 1.5415814221002265e-07, "loss": 0.0052, "reward": 0.254832336679101, "reward_std": 0.23446263745427132, "rewards/cosine_scaled_reward": -0.033917545806616545, "rewards/format_reward": 0.8541666865348816, "step": 430 }, { "completion_length": 1351.3958892822266, "epoch": 0.49257142857142855, "grad_norm": 1.1112327575683594, "kl": 0.208984375, "learning_rate": 1.5267358321348285e-07, "loss": 0.0084, "reward": 0.18969909101724625, "reward_std": 0.23189299926161766, "rewards/cosine_scaled_reward": -0.14616328151896596, "rewards/format_reward": 0.833333358168602, "step": 431 }, { "completion_length": 1790.1250610351562, "epoch": 0.4937142857142857, "grad_norm": 0.6371591091156006, "kl": 0.1598358154296875, "learning_rate": 1.5120838934595337e-07, "loss": 0.0064, "reward": 0.24663135036826134, "reward_std": 0.2685924358665943, "rewards/cosine_scaled_reward": -0.03793436847627163, "rewards/format_reward": 0.770833358168602, "step": 432 }, { "completion_length": 1450.0000610351562, "epoch": 0.4948571428571429, "grad_norm": 0.8176037073135376, "kl": 0.0924072265625, "learning_rate": 1.4976263201891613e-07, "loss": 0.0037, "reward": 0.33402128890156746, "reward_std": 0.24811824969947338, "rewards/cosine_scaled_reward": 0.017201400361955166, "rewards/format_reward": 0.9791666716337204, "step": 433 }, { "completion_length": 1634.3750305175781, "epoch": 0.496, "grad_norm": 1.4213171005249023, "kl": 0.212249755859375, "learning_rate": 1.483363816965435e-07, "loss": 0.0085, "reward": 0.11890215799212456, "reward_std": 0.19112374633550644, "rewards/cosine_scaled_reward": -0.23629990592598915, "rewards/format_reward": 0.8333333432674408, "step": 434 }, { "completion_length": 1083.0416870117188, "epoch": 0.49714285714285716, "grad_norm": 0.7486536502838135, "kl": 0.1158599853515625, "learning_rate": 1.469297078922642e-07, "loss": 0.0046, "reward": 0.17990897595882416, "reward_std": 0.18072829023003578, "rewards/cosine_scaled_reward": -0.22289768233895302, "rewards/format_reward": 0.9166666716337204, "step": 435 }, { "completion_length": 1389.3334045410156, "epoch": 0.4982857142857143, "grad_norm": 0.9094387888908386, "kl": 0.17256927490234375, "learning_rate": 1.4554267916537495e-07, "loss": 0.0069, "reward": 0.32354634441435337, "reward_std": 0.22079703584313393, "rewards/cosine_scaled_reward": 0.08843789249658585, "rewards/format_reward": 0.8125000149011612, "step": 436 }, { "completion_length": 1371.8958587646484, "epoch": 0.49942857142857144, "grad_norm": 1.0350995063781738, "kl": 0.067138671875, "learning_rate": 1.4417536311769885e-07, "loss": 0.0027, "reward": 0.2465380048379302, "reward_std": 0.24149386584758759, "rewards/cosine_scaled_reward": -0.09340556943789124, "rewards/format_reward": 0.9166666716337204, "step": 437 }, { "completion_length": 1961.9792175292969, "epoch": 0.5005714285714286, "grad_norm": 0.695432186126709, "kl": 0.27734375, "learning_rate": 1.4282782639029128e-07, "loss": 0.0111, "reward": 0.23410780914127827, "reward_std": 0.26633862406015396, "rewards/cosine_scaled_reward": -0.07006174232810736, "rewards/format_reward": 0.770833358168602, "step": 438 }, { "completion_length": 1414.1875457763672, "epoch": 0.5017142857142857, "grad_norm": 1.3709079027175903, "kl": 0.0963897705078125, "learning_rate": 1.4150013466019114e-07, "loss": 0.0039, "reward": 0.17976359650492668, "reward_std": 0.27022555842995644, "rewards/cosine_scaled_reward": -0.16386261460138485, "rewards/format_reward": 0.8750000149011612, "step": 439 }, { "completion_length": 1298.375015258789, "epoch": 0.5028571428571429, "grad_norm": 0.7073745131492615, "kl": 0.10003662109375, "learning_rate": 1.4019235263722034e-07, "loss": 0.004, "reward": 0.1858317069709301, "reward_std": 0.15907006710767746, "rewards/cosine_scaled_reward": -0.1895090565085411, "rewards/format_reward": 0.9166666865348816, "step": 440 }, { "completion_length": 1580.8333435058594, "epoch": 0.504, "grad_norm": 1.3497065305709839, "kl": 0.17523193359375, "learning_rate": 1.3890454406082956e-07, "loss": 0.007, "reward": 0.28470361296785995, "reward_std": 0.2480045147240162, "rewards/cosine_scaled_reward": 0.0019478872418403625, "rewards/format_reward": 0.7916666716337204, "step": 441 }, { "completion_length": 1075.4375305175781, "epoch": 0.5051428571428571, "grad_norm": 0.725144624710083, "kl": 0.048858642578125, "learning_rate": 1.3763677169699217e-07, "loss": 0.002, "reward": 0.3077830299735069, "reward_std": 0.2904723510146141, "rewards/cosine_scaled_reward": -0.05075856437906623, "rewards/format_reward": 0.9791666716337204, "step": 442 }, { "completion_length": 1847.7916870117188, "epoch": 0.5062857142857143, "grad_norm": 0.9789474010467529, "kl": 0.283935546875, "learning_rate": 1.3638909733514452e-07, "loss": 0.0113, "reward": 0.1660569068044424, "reward_std": 0.20301354117691517, "rewards/cosine_scaled_reward": -0.12026797235012054, "rewards/format_reward": 0.7500000298023224, "step": 443 }, { "completion_length": 1602.9584045410156, "epoch": 0.5074285714285715, "grad_norm": 1.3816633224487305, "kl": 0.2508544921875, "learning_rate": 1.351615817851748e-07, "loss": 0.01, "reward": 0.2705380953848362, "reward_std": 0.3167138807475567, "rewards/cosine_scaled_reward": -0.0010249577462673187, "rewards/format_reward": 0.7500000149011612, "step": 444 }, { "completion_length": 1616.854248046875, "epoch": 0.5085714285714286, "grad_norm": 2.4442567825317383, "kl": 0.381591796875, "learning_rate": 1.3395428487445914e-07, "loss": 0.0152, "reward": 0.29816973581910133, "reward_std": 0.3240351192653179, "rewards/cosine_scaled_reward": -0.05487822741270065, "rewards/format_reward": 0.8541666716337204, "step": 445 }, { "completion_length": 1421.8333740234375, "epoch": 0.5097142857142857, "grad_norm": 1.0425515174865723, "kl": 0.075347900390625, "learning_rate": 1.3276726544494571e-07, "loss": 0.003, "reward": 0.22422216087579727, "reward_std": 0.24329788237810135, "rewards/cosine_scaled_reward": -0.02643941156566143, "rewards/format_reward": 0.7500000298023224, "step": 446 }, { "completion_length": 1299.6458740234375, "epoch": 0.5108571428571429, "grad_norm": 0.9955561757087708, "kl": 0.13824462890625, "learning_rate": 1.316005813502869e-07, "loss": 0.0055, "reward": 0.2557426951825619, "reward_std": 0.2716595195233822, "rewards/cosine_scaled_reward": -0.07632811553776264, "rewards/format_reward": 0.9166666716337204, "step": 447 }, { "completion_length": 1150.0417175292969, "epoch": 0.512, "grad_norm": 1.2563894987106323, "kl": 0.17791748046875, "learning_rate": 1.3045428945301953e-07, "loss": 0.0071, "reward": 0.32831166312098503, "reward_std": 0.2525380663573742, "rewards/cosine_scaled_reward": -0.023525401949882507, "rewards/format_reward": 0.9375000149011612, "step": 448 }, { "completion_length": 1260.6250305175781, "epoch": 0.5131428571428571, "grad_norm": 1.812902808189392, "kl": 0.26861572265625, "learning_rate": 1.2932844562179352e-07, "loss": 0.0107, "reward": 0.16285967081785202, "reward_std": 0.2142799235880375, "rewards/cosine_scaled_reward": -0.21595630422234535, "rewards/format_reward": 0.8750000149011612, "step": 449 }, { "completion_length": 1231.6667175292969, "epoch": 0.5142857142857142, "grad_norm": 1.7580350637435913, "kl": 0.19732666015625, "learning_rate": 1.2822310472864885e-07, "loss": 0.0079, "reward": 0.25604721903800964, "reward_std": 0.2637137100100517, "rewards/cosine_scaled_reward": -0.0012245629914104939, "rewards/format_reward": 0.8125000149011612, "step": 450 }, { "completion_length": 1126.6667022705078, "epoch": 0.5154285714285715, "grad_norm": 8.124907493591309, "kl": 0.2808837890625, "learning_rate": 1.2713832064634125e-07, "loss": 0.0112, "reward": 0.32462166622281075, "reward_std": 0.24854115769267082, "rewards/cosine_scaled_reward": -0.025696704164147377, "rewards/format_reward": 0.9166666865348816, "step": 451 }, { "completion_length": 1795.479232788086, "epoch": 0.5165714285714286, "grad_norm": 7.079920768737793, "kl": 0.427490234375, "learning_rate": 1.260741462457165e-07, "loss": 0.0171, "reward": 0.2079998729750514, "reward_std": 0.32888830825686455, "rewards/cosine_scaled_reward": -0.05745353177189827, "rewards/format_reward": 0.6875000298023224, "step": 452 }, { "completion_length": 1771.3750305175781, "epoch": 0.5177142857142857, "grad_norm": 0.9983645677566528, "kl": 0.3150634765625, "learning_rate": 1.2503063339313356e-07, "loss": 0.0126, "reward": 0.30452151922509074, "reward_std": 0.3506147041916847, "rewards/cosine_scaled_reward": -0.0257783941924572, "rewards/format_reward": 0.833333358168602, "step": 453 }, { "completion_length": 1406.2292175292969, "epoch": 0.5188571428571429, "grad_norm": 0.7196060419082642, "kl": 0.197479248046875, "learning_rate": 1.2400783294793668e-07, "loss": 0.0079, "reward": 0.23121106252074242, "reward_std": 0.20694708451628685, "rewards/cosine_scaled_reward": -0.07145766541361809, "rewards/format_reward": 0.8750000149011612, "step": 454 }, { "completion_length": 1343.8958740234375, "epoch": 0.52, "grad_norm": 2.1558892726898193, "kl": 0.191925048828125, "learning_rate": 1.2300579475997657e-07, "loss": 0.0077, "reward": 0.14940793626010418, "reward_std": 0.17657526955008507, "rewards/cosine_scaled_reward": -0.21375716105103493, "rewards/format_reward": 0.833333358168602, "step": 455 }, { "completion_length": 1670.0625610351562, "epoch": 0.5211428571428571, "grad_norm": 1.342545986175537, "kl": 0.29425048828125, "learning_rate": 1.220245676671809e-07, "loss": 0.0118, "reward": 0.22154957614839077, "reward_std": 0.30688488483428955, "rewards/cosine_scaled_reward": -0.1299609588459134, "rewards/format_reward": 0.833333358168602, "step": 456 }, { "completion_length": 1772.541748046875, "epoch": 0.5222857142857142, "grad_norm": 0.9106640219688416, "kl": 0.37689208984375, "learning_rate": 1.2106419949317388e-07, "loss": 0.0151, "reward": 0.27197565883398056, "reward_std": 0.26648762077093124, "rewards/cosine_scaled_reward": -0.009499620646238327, "rewards/format_reward": 0.7291666865348816, "step": 457 }, { "completion_length": 1358.4166870117188, "epoch": 0.5234285714285715, "grad_norm": 1.6191344261169434, "kl": 0.20147705078125, "learning_rate": 1.2012473704494537e-07, "loss": 0.0081, "reward": 0.17686885967850685, "reward_std": 0.1800503470003605, "rewards/cosine_scaled_reward": -0.150895812548697, "rewards/format_reward": 0.833333358168602, "step": 458 }, { "completion_length": 1272.2292022705078, "epoch": 0.5245714285714286, "grad_norm": 1.0406947135925293, "kl": 0.173828125, "learning_rate": 1.1920622611056974e-07, "loss": 0.0069, "reward": 0.3333798721432686, "reward_std": 0.29825500398874283, "rewards/cosine_scaled_reward": 0.05970238149166107, "rewards/format_reward": 0.895833358168602, "step": 459 }, { "completion_length": 2041.4167175292969, "epoch": 0.5257142857142857, "grad_norm": 2.132143259048462, "kl": 0.349609375, "learning_rate": 1.1830871145697412e-07, "loss": 0.014, "reward": 0.21742988645564765, "reward_std": 0.21657241694629192, "rewards/cosine_scaled_reward": -0.04676482267677784, "rewards/format_reward": 0.6666667014360428, "step": 460 }, { "completion_length": 1672.3125915527344, "epoch": 0.5268571428571428, "grad_norm": 1.4400893449783325, "kl": 0.27490234375, "learning_rate": 1.1743223682775649e-07, "loss": 0.011, "reward": 0.24742556363344193, "reward_std": 0.2879389263689518, "rewards/cosine_scaled_reward": -0.019029099494218826, "rewards/format_reward": 0.7500000298023224, "step": 461 }, { "completion_length": 1469.8958740234375, "epoch": 0.528, "grad_norm": 1.8993326425552368, "kl": 0.332977294921875, "learning_rate": 1.1657684494105386e-07, "loss": 0.0133, "reward": 0.17773004883201793, "reward_std": 0.26185616478323936, "rewards/cosine_scaled_reward": -0.20606819819658995, "rewards/format_reward": 0.8750000149011612, "step": 462 }, { "completion_length": 1753.7084045410156, "epoch": 0.5291428571428571, "grad_norm": 1.1649374961853027, "kl": 0.165557861328125, "learning_rate": 1.1574257748745986e-07, "loss": 0.0066, "reward": 0.2517554350197315, "reward_std": 0.19878023117780685, "rewards/cosine_scaled_reward": -0.03619728982448578, "rewards/format_reward": 0.8125000149011612, "step": 463 }, { "completion_length": 1010.4583435058594, "epoch": 0.5302857142857142, "grad_norm": 0.5831181406974792, "kl": 0.1368255615234375, "learning_rate": 1.1492947512799328e-07, "loss": 0.0055, "reward": 0.45407935231924057, "reward_std": 0.2512675076723099, "rewards/cosine_scaled_reward": 0.1662245448678732, "rewards/format_reward": 0.9375000149011612, "step": 464 }, { "completion_length": 1744.3125610351562, "epoch": 0.5314285714285715, "grad_norm": 1.3955875635147095, "kl": 0.37939453125, "learning_rate": 1.1413757749211602e-07, "loss": 0.0152, "reward": 0.2800963893532753, "reward_std": 0.3290780335664749, "rewards/cosine_scaled_reward": 0.019320473540574312, "rewards/format_reward": 0.7708333432674408, "step": 465 }, { "completion_length": 1541.3125610351562, "epoch": 0.5325714285714286, "grad_norm": 1.1654468774795532, "kl": 0.2691650390625, "learning_rate": 1.1336692317580158e-07, "loss": 0.0107, "reward": 0.3624018207192421, "reward_std": 0.29893627390265465, "rewards/cosine_scaled_reward": 0.09198451042175293, "rewards/format_reward": 0.7916666865348816, "step": 466 }, { "completion_length": 1635.2500610351562, "epoch": 0.5337142857142857, "grad_norm": 1.6793878078460693, "kl": 0.2359619140625, "learning_rate": 1.1261754973965422e-07, "loss": 0.0094, "reward": 0.2201872505247593, "reward_std": 0.23867875151336193, "rewards/cosine_scaled_reward": -0.14597262814641, "rewards/format_reward": 0.833333358168602, "step": 467 }, { "completion_length": 1782.3958435058594, "epoch": 0.5348571428571428, "grad_norm": 1.4512677192687988, "kl": 0.32489013671875, "learning_rate": 1.1188949370707787e-07, "loss": 0.013, "reward": 0.18340039625763893, "reward_std": 0.30998512730002403, "rewards/cosine_scaled_reward": -0.10351480543613434, "rewards/format_reward": 0.7500000149011612, "step": 468 }, { "completion_length": 1680.6250305175781, "epoch": 0.536, "grad_norm": 1.8575129508972168, "kl": 0.4377593994140625, "learning_rate": 1.1118279056249653e-07, "loss": 0.0175, "reward": 0.22907501395093277, "reward_std": 0.26676100492477417, "rewards/cosine_scaled_reward": -0.031132690608501434, "rewards/format_reward": 0.6875000298023224, "step": 469 }, { "completion_length": 1948.354248046875, "epoch": 0.5371428571428571, "grad_norm": 1.7162368297576904, "kl": 0.47454833984375, "learning_rate": 1.1049747474962444e-07, "loss": 0.019, "reward": 0.22342915716581047, "reward_std": 0.25090836361050606, "rewards/cosine_scaled_reward": -0.08544790372252464, "rewards/format_reward": 0.7500000149011612, "step": 470 }, { "completion_length": 1845.5625305175781, "epoch": 0.5382857142857143, "grad_norm": 1.5333935022354126, "kl": 0.4609375, "learning_rate": 1.0983357966978745e-07, "loss": 0.0184, "reward": 0.25642314786091447, "reward_std": 0.21491710096597672, "rewards/cosine_scaled_reward": 0.010931313037872314, "rewards/format_reward": 0.7291666865348816, "step": 471 }, { "completion_length": 1809.1250762939453, "epoch": 0.5394285714285715, "grad_norm": 1.18997061252594, "kl": 0.345947265625, "learning_rate": 1.0919113768029517e-07, "loss": 0.0138, "reward": 0.15577425621449947, "reward_std": 0.2659985013306141, "rewards/cosine_scaled_reward": -0.18055069167166948, "rewards/format_reward": 0.7500000149011612, "step": 472 }, { "completion_length": 2019.2292175292969, "epoch": 0.5405714285714286, "grad_norm": 1.1716952323913574, "kl": 0.402801513671875, "learning_rate": 1.0857018009286381e-07, "loss": 0.0161, "reward": 0.15595489647239447, "reward_std": 0.23244059830904007, "rewards/cosine_scaled_reward": -0.10964090749621391, "rewards/format_reward": 0.6875, "step": 473 }, { "completion_length": 1756.9584045410156, "epoch": 0.5417142857142857, "grad_norm": 1.262105941772461, "kl": 0.4705352783203125, "learning_rate": 1.0797073717209013e-07, "loss": 0.0189, "reward": 0.5162371173501015, "reward_std": 0.34698261320590973, "rewards/cosine_scaled_reward": 0.3371627281885594, "rewards/format_reward": 0.8125000149011612, "step": 474 }, { "completion_length": 1377.0416870117188, "epoch": 0.5428571428571428, "grad_norm": 0.5952227115631104, "kl": 0.145538330078125, "learning_rate": 1.0739283813397639e-07, "loss": 0.0058, "reward": 0.20968987792730331, "reward_std": 0.2660527192056179, "rewards/cosine_scaled_reward": -0.15221368055790663, "rewards/format_reward": 0.8541666865348816, "step": 475 }, { "completion_length": 1574.7083740234375, "epoch": 0.544, "grad_norm": 0.8631353974342346, "kl": 0.19976806640625, "learning_rate": 1.068365111445064e-07, "loss": 0.008, "reward": 0.26954812556505203, "reward_std": 0.29395921155810356, "rewards/cosine_scaled_reward": -0.04235384240746498, "rewards/format_reward": 0.8125000149011612, "step": 476 }, { "completion_length": 1422.0625610351562, "epoch": 0.5451428571428572, "grad_norm": 1.0784908533096313, "kl": 0.252349853515625, "learning_rate": 1.063017833182728e-07, "loss": 0.0101, "reward": 0.41880106925964355, "reward_std": 0.392875611782074, "rewards/cosine_scaled_reward": 0.1550253469031304, "rewards/format_reward": 0.8541666865348816, "step": 477 }, { "completion_length": 1763.166748046875, "epoch": 0.5462857142857143, "grad_norm": 1.126824140548706, "kl": 0.34912109375, "learning_rate": 1.0578868071715544e-07, "loss": 0.014, "reward": 0.14933402370661497, "reward_std": 0.29011841118335724, "rewards/cosine_scaled_reward": -0.12388145178556442, "rewards/format_reward": 0.6666666865348816, "step": 478 }, { "completion_length": 1986.5209350585938, "epoch": 0.5474285714285714, "grad_norm": 1.0951849222183228, "kl": 0.548828125, "learning_rate": 1.0529722834905125e-07, "loss": 0.022, "reward": 0.2293107956647873, "reward_std": 0.2566649541258812, "rewards/cosine_scaled_reward": -0.02941977046430111, "rewards/format_reward": 0.7291666865348816, "step": 479 }, { "completion_length": 1394.8958740234375, "epoch": 0.5485714285714286, "grad_norm": 1.8387150764465332, "kl": 0.277618408203125, "learning_rate": 1.0482745016665526e-07, "loss": 0.0111, "reward": 0.22987965494394302, "reward_std": 0.30185453593730927, "rewards/cosine_scaled_reward": -0.10149852558970451, "rewards/format_reward": 0.8125000149011612, "step": 480 }, { "completion_length": 1966.3541870117188, "epoch": 0.5497142857142857, "grad_norm": 2.787348508834839, "kl": 0.376220703125, "learning_rate": 1.0437936906629334e-07, "loss": 0.015, "reward": 0.12472720723599195, "reward_std": 0.21501713246107101, "rewards/cosine_scaled_reward": -0.2091291006654501, "rewards/format_reward": 0.7500000149011612, "step": 481 }, { "completion_length": 1356.5625305175781, "epoch": 0.5508571428571428, "grad_norm": 1.1721312999725342, "kl": 0.26988983154296875, "learning_rate": 1.0395300688680625e-07, "loss": 0.0108, "reward": 0.34367292933166027, "reward_std": 0.2708738148212433, "rewards/cosine_scaled_reward": 0.07783332094550133, "rewards/format_reward": 0.8750000149011612, "step": 482 }, { "completion_length": 1949.3750915527344, "epoch": 0.552, "grad_norm": 1.2884186506271362, "kl": 0.4071044921875, "learning_rate": 1.0354838440848501e-07, "loss": 0.0163, "reward": 0.19273016415536404, "reward_std": 0.3338697701692581, "rewards/cosine_scaled_reward": -0.11160976439714432, "rewards/format_reward": 0.708333358168602, "step": 483 }, { "completion_length": 1149.8333740234375, "epoch": 0.5531428571428572, "grad_norm": 1.1346054077148438, "kl": 0.19384765625, "learning_rate": 1.0316552135205837e-07, "loss": 0.0078, "reward": 0.27724506333470345, "reward_std": 0.27399446070194244, "rewards/cosine_scaled_reward": -0.07647367380559444, "rewards/format_reward": 0.9166666865348816, "step": 484 }, { "completion_length": 1416.6458435058594, "epoch": 0.5542857142857143, "grad_norm": 1.929132103919983, "kl": 0.243896484375, "learning_rate": 1.0280443637773163e-07, "loss": 0.0098, "reward": 0.2756602857261896, "reward_std": 0.26644935086369514, "rewards/cosine_scaled_reward": -0.017928674817085266, "rewards/format_reward": 0.8541667014360428, "step": 485 }, { "completion_length": 841.1666870117188, "epoch": 0.5554285714285714, "grad_norm": 1.185613751411438, "kl": 0.1144866943359375, "learning_rate": 1.0246514708427701e-07, "loss": 0.0046, "reward": 0.28294001519680023, "reward_std": 0.21592207811772823, "rewards/cosine_scaled_reward": -0.02465655282139778, "rewards/format_reward": 0.8750000149011612, "step": 486 }, { "completion_length": 1128.0625305175781, "epoch": 0.5565714285714286, "grad_norm": 0.5899915099143982, "kl": 0.0730743408203125, "learning_rate": 1.0214767000817596e-07, "loss": 0.0029, "reward": 0.4133741296827793, "reward_std": 0.2118063010275364, "rewards/cosine_scaled_reward": 0.17440680041909218, "rewards/format_reward": 0.8958333432674408, "step": 487 }, { "completion_length": 1287.9583740234375, "epoch": 0.5577142857142857, "grad_norm": 1.2799737453460693, "kl": 0.3245849609375, "learning_rate": 1.0185202062281336e-07, "loss": 0.013, "reward": 0.22669554874300957, "reward_std": 0.2413083128631115, "rewards/cosine_scaled_reward": -0.11942423251457512, "rewards/format_reward": 0.8958333432674408, "step": 488 }, { "completion_length": 1572.4375610351562, "epoch": 0.5588571428571428, "grad_norm": 1.0668160915374756, "kl": 0.39013671875, "learning_rate": 1.0157821333772304e-07, "loss": 0.0156, "reward": 0.06757297122385353, "reward_std": 0.17149998247623444, "rewards/cosine_scaled_reward": -0.2858339995145798, "rewards/format_reward": 0.8125000149011612, "step": 489 }, { "completion_length": 1397.5000305175781, "epoch": 0.56, "grad_norm": 1.396504521369934, "kl": 0.2890625, "learning_rate": 1.013262614978859e-07, "loss": 0.0116, "reward": 0.25191737711429596, "reward_std": 0.2361450344324112, "rewards/cosine_scaled_reward": -0.06171676144003868, "rewards/format_reward": 0.8333333432674408, "step": 490 }, { "completion_length": 1577.6459045410156, "epoch": 0.5611428571428572, "grad_norm": 1.347195029258728, "kl": 0.22802734375, "learning_rate": 1.0109617738307911e-07, "loss": 0.0091, "reward": 0.38772477209568024, "reward_std": 0.3787771388888359, "rewards/cosine_scaled_reward": 0.11236833222210407, "rewards/format_reward": 0.8125000149011612, "step": 491 }, { "completion_length": 1666.7708740234375, "epoch": 0.5622857142857143, "grad_norm": 1.133479356765747, "kl": 0.37799072265625, "learning_rate": 1.0088797220727779e-07, "loss": 0.0151, "reward": 0.2728476896882057, "reward_std": 0.2604874260723591, "rewards/cosine_scaled_reward": -0.008132921531796455, "rewards/format_reward": 0.7500000149011612, "step": 492 }, { "completion_length": 1406.8333435058594, "epoch": 0.5634285714285714, "grad_norm": 1.091591477394104, "kl": 0.2840576171875, "learning_rate": 1.0070165611810855e-07, "loss": 0.0114, "reward": 0.349657341837883, "reward_std": 0.35755816102027893, "rewards/cosine_scaled_reward": 0.09285960160195827, "rewards/format_reward": 0.8541666865348816, "step": 493 }, { "completion_length": 1206.7083740234375, "epoch": 0.5645714285714286, "grad_norm": 1.3146690130233765, "kl": 0.19622802734375, "learning_rate": 1.005372381963547e-07, "loss": 0.0078, "reward": 0.36272867769002914, "reward_std": 0.3516298234462738, "rewards/cosine_scaled_reward": -0.005188416689634323, "rewards/format_reward": 0.9583333432674408, "step": 494 }, { "completion_length": 1572.8542175292969, "epoch": 0.5657142857142857, "grad_norm": 3.402564525604248, "kl": 0.4200439453125, "learning_rate": 1.0039472645551372e-07, "loss": 0.0168, "reward": 0.21359156537801027, "reward_std": 0.29135290533304214, "rewards/cosine_scaled_reward": -0.05910599837079644, "rewards/format_reward": 0.729166679084301, "step": 495 }, { "completion_length": 1534.1250457763672, "epoch": 0.5668571428571428, "grad_norm": 0.9494636654853821, "kl": 0.3223876953125, "learning_rate": 1.002741278414069e-07, "loss": 0.0129, "reward": 0.26828407123684883, "reward_std": 0.22558922320604324, "rewards/cosine_scaled_reward": -0.002284809947013855, "rewards/format_reward": 0.7708333358168602, "step": 496 }, { "completion_length": 1434.4792175292969, "epoch": 0.568, "grad_norm": 1.0619434118270874, "kl": 0.4782257080078125, "learning_rate": 1.0017544823184055e-07, "loss": 0.0191, "reward": 0.3955768905580044, "reward_std": 0.2971377484500408, "rewards/cosine_scaled_reward": 0.10769826103933156, "rewards/format_reward": 0.8958333432674408, "step": 497 }, { "completion_length": 1431.750015258789, "epoch": 0.5691428571428572, "grad_norm": 0.4931652843952179, "kl": 0.1463775634765625, "learning_rate": 1.0009869243631952e-07, "loss": 0.0059, "reward": 0.24166107922792435, "reward_std": 0.2782045044004917, "rewards/cosine_scaled_reward": -0.050205936655402184, "rewards/format_reward": 0.7500000149011612, "step": 498 }, { "completion_length": 1504.541732788086, "epoch": 0.5702857142857143, "grad_norm": 0.9951570630073547, "kl": 0.208251953125, "learning_rate": 1.000438641958131e-07, "loss": 0.0083, "reward": 0.2595203034579754, "reward_std": 0.27923375740647316, "rewards/cosine_scaled_reward": -0.056234169751405716, "rewards/format_reward": 0.8750000298023224, "step": 499 }, { "completion_length": 1520.0625610351562, "epoch": 0.5714285714285714, "grad_norm": 2.099856376647949, "kl": 0.295166015625, "learning_rate": 1.0001096618257236e-07, "loss": 0.0118, "reward": 0.25935105979442596, "reward_std": 0.26287253201007843, "rewards/cosine_scaled_reward": -0.07230615895241499, "rewards/format_reward": 0.8750000149011612, "step": 500 }, { "epoch": 0.5714285714285714, "step": 500, "total_flos": 0.0, "train_loss": 0.0020821976251731195, "train_runtime": 63377.2164, "train_samples_per_second": 0.379, "train_steps_per_second": 0.008 } ], "logging_steps": 1, "max_steps": 500, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 50, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }