| { | |
| "best_global_step": null, | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 0.5714285714285714, | |
| "eval_steps": 500, | |
| "global_step": 500, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "completion_length": 2593.500030517578, | |
| "epoch": 0.001142857142857143, | |
| "grad_norm": 0.2531319856643677, | |
| "kl": 0.0, | |
| "learning_rate": 0.0, | |
| "loss": 0.0, | |
| "reward": 0.1798910442739725, | |
| "reward_std": 0.2546093426644802, | |
| "rewards/cosine_scaled_reward": 0.03244469128549099, | |
| "rewards/format_reward": 0.4791666828095913, | |
| "step": 1 | |
| }, | |
| { | |
| "completion_length": 2896.7916870117188, | |
| "epoch": 0.002285714285714286, | |
| "grad_norm": 0.18851928412914276, | |
| "kl": 0.0, | |
| "learning_rate": 2e-08, | |
| "loss": 0.0, | |
| "reward": 0.09316219575703144, | |
| "reward_std": 0.29461798816919327, | |
| "rewards/cosine_scaled_reward": -0.013163809664547443, | |
| "rewards/format_reward": 0.37500001303851604, | |
| "step": 2 | |
| }, | |
| { | |
| "completion_length": 3354.7708740234375, | |
| "epoch": 0.0034285714285714284, | |
| "grad_norm": 0.16173841059207916, | |
| "kl": 4.8100948333740234e-05, | |
| "learning_rate": 4e-08, | |
| "loss": 0.0, | |
| "reward": -0.11184105090796947, | |
| "reward_std": 0.18034866452217102, | |
| "rewards/cosine_scaled_reward": -0.21646979451179504, | |
| "rewards/format_reward": 0.12500000558793545, | |
| "step": 3 | |
| }, | |
| { | |
| "completion_length": 2534.791717529297, | |
| "epoch": 0.004571428571428572, | |
| "grad_norm": 0.2748425304889679, | |
| "kl": 3.293156623840332e-05, | |
| "learning_rate": 6e-08, | |
| "loss": 0.0, | |
| "reward": 0.09380067978054285, | |
| "reward_std": 0.3090171590447426, | |
| "rewards/cosine_scaled_reward": -0.14579539687838405, | |
| "rewards/format_reward": 0.5208333432674408, | |
| "step": 4 | |
| }, | |
| { | |
| "completion_length": 3140.5208740234375, | |
| "epoch": 0.005714285714285714, | |
| "grad_norm": 0.24623969197273254, | |
| "kl": 4.279613494873047e-05, | |
| "learning_rate": 8e-08, | |
| "loss": 0.0, | |
| "reward": -0.09575780294835567, | |
| "reward_std": 0.17200364544987679, | |
| "rewards/cosine_scaled_reward": -0.2587662376463413, | |
| "rewards/format_reward": 0.22916667349636555, | |
| "step": 5 | |
| }, | |
| { | |
| "completion_length": 3190.7291870117188, | |
| "epoch": 0.006857142857142857, | |
| "grad_norm": 0.19608470797538757, | |
| "kl": 4.5180320739746094e-05, | |
| "learning_rate": 1e-07, | |
| "loss": 0.0, | |
| "reward": -0.058510275557637215, | |
| "reward_std": 0.23042680323123932, | |
| "rewards/cosine_scaled_reward": -0.1778582688421011, | |
| "rewards/format_reward": 0.22916667722165585, | |
| "step": 6 | |
| }, | |
| { | |
| "completion_length": 3066.5416870117188, | |
| "epoch": 0.008, | |
| "grad_norm": 0.15382979810237885, | |
| "kl": 2.86102294921875e-05, | |
| "learning_rate": 1.2e-07, | |
| "loss": 0.0, | |
| "reward": 0.08077196776866913, | |
| "reward_std": 0.25011931732296944, | |
| "rewards/cosine_scaled_reward": -0.11747794598340988, | |
| "rewards/format_reward": 0.5208333432674408, | |
| "step": 7 | |
| }, | |
| { | |
| "completion_length": 2712.3125610351562, | |
| "epoch": 0.009142857142857144, | |
| "grad_norm": 0.1719086617231369, | |
| "kl": 1.9162893295288086e-05, | |
| "learning_rate": 1.4e-07, | |
| "loss": 0.0, | |
| "reward": 0.22242407873272896, | |
| "reward_std": 0.2544141337275505, | |
| "rewards/cosine_scaled_reward": 0.08669950067996979, | |
| "rewards/format_reward": 0.4583333432674408, | |
| "step": 8 | |
| }, | |
| { | |
| "completion_length": 3236.0834350585938, | |
| "epoch": 0.010285714285714285, | |
| "grad_norm": 0.18805697560310364, | |
| "kl": 3.9696693420410156e-05, | |
| "learning_rate": 1.6e-07, | |
| "loss": 0.0, | |
| "reward": 0.0005422467365860939, | |
| "reward_std": 0.22025279328227043, | |
| "rewards/cosine_scaled_reward": -0.1387481726706028, | |
| "rewards/format_reward": 0.2916666753590107, | |
| "step": 9 | |
| }, | |
| { | |
| "completion_length": 2817.2291870117188, | |
| "epoch": 0.011428571428571429, | |
| "grad_norm": 0.17756684124469757, | |
| "kl": 3.081560134887695e-05, | |
| "learning_rate": 1.8e-07, | |
| "loss": 0.0, | |
| "reward": 0.06626716488972306, | |
| "reward_std": 0.24357087537646294, | |
| "rewards/cosine_scaled_reward": -0.08263782970607281, | |
| "rewards/format_reward": 0.3958333358168602, | |
| "step": 10 | |
| }, | |
| { | |
| "completion_length": 3375.3958740234375, | |
| "epoch": 0.012571428571428572, | |
| "grad_norm": 0.1558818519115448, | |
| "kl": 3.355741500854492e-05, | |
| "learning_rate": 2e-07, | |
| "loss": 0.0, | |
| "reward": -0.06830349378287792, | |
| "reward_std": 0.2444549147039652, | |
| "rewards/cosine_scaled_reward": -0.1914975270628929, | |
| "rewards/format_reward": 0.14583333395421505, | |
| "step": 11 | |
| }, | |
| { | |
| "completion_length": 2640.3959045410156, | |
| "epoch": 0.013714285714285714, | |
| "grad_norm": 0.19693666696548462, | |
| "kl": 3.769993782043457e-05, | |
| "learning_rate": 2.1999999999999998e-07, | |
| "loss": 0.0, | |
| "reward": 0.20038466714322567, | |
| "reward_std": 0.2972068637609482, | |
| "rewards/cosine_scaled_reward": -0.002930758520960808, | |
| "rewards/format_reward": 0.583333358168602, | |
| "step": 12 | |
| }, | |
| { | |
| "completion_length": 2929.041748046875, | |
| "epoch": 0.014857142857142857, | |
| "grad_norm": 0.20476509630680084, | |
| "kl": 3.999471664428711e-05, | |
| "learning_rate": 2.4e-07, | |
| "loss": 0.0, | |
| "reward": 0.1786028677597642, | |
| "reward_std": 0.2050685416907072, | |
| "rewards/cosine_scaled_reward": 0.020189031958580017, | |
| "rewards/format_reward": 0.4583333432674408, | |
| "step": 13 | |
| }, | |
| { | |
| "completion_length": 2892.0209350585938, | |
| "epoch": 0.016, | |
| "grad_norm": 0.24246934056282043, | |
| "kl": 3.141164779663086e-05, | |
| "learning_rate": 2.6e-07, | |
| "loss": 0.0, | |
| "reward": 0.030550841242074966, | |
| "reward_std": 0.2482227310538292, | |
| "rewards/cosine_scaled_reward": -0.12857902504038066, | |
| "rewards/format_reward": 0.37500000558793545, | |
| "step": 14 | |
| }, | |
| { | |
| "completion_length": 2702.1875915527344, | |
| "epoch": 0.017142857142857144, | |
| "grad_norm": 0.20641961693763733, | |
| "kl": 2.3245811462402344e-05, | |
| "learning_rate": 2.8e-07, | |
| "loss": 0.0, | |
| "reward": 0.14474756829440594, | |
| "reward_std": 0.23251333087682724, | |
| "rewards/cosine_scaled_reward": 0.005298769101500511, | |
| "rewards/format_reward": 0.4166666753590107, | |
| "step": 15 | |
| }, | |
| { | |
| "completion_length": 3581.0208740234375, | |
| "epoch": 0.018285714285714287, | |
| "grad_norm": 0.1642305999994278, | |
| "kl": 4.464387893676758e-05, | |
| "learning_rate": 3e-07, | |
| "loss": 0.0, | |
| "reward": -0.14038190618157387, | |
| "reward_std": 0.2387940175831318, | |
| "rewards/cosine_scaled_reward": -0.23054109513759613, | |
| "rewards/format_reward": 0.0416666679084301, | |
| "step": 16 | |
| }, | |
| { | |
| "completion_length": 2264.666748046875, | |
| "epoch": 0.019428571428571427, | |
| "grad_norm": 0.2635192275047302, | |
| "kl": 4.088878631591797e-05, | |
| "learning_rate": 3.2e-07, | |
| "loss": 0.0, | |
| "reward": 0.1540674790740013, | |
| "reward_std": 0.2818757649511099, | |
| "rewards/cosine_scaled_reward": -0.022916601970791817, | |
| "rewards/format_reward": 0.5416666679084301, | |
| "step": 17 | |
| }, | |
| { | |
| "completion_length": 3018.3334350585938, | |
| "epoch": 0.02057142857142857, | |
| "grad_norm": 0.17216412723064423, | |
| "kl": 2.47955322265625e-05, | |
| "learning_rate": 3.4000000000000003e-07, | |
| "loss": 0.0, | |
| "reward": 0.030305630527436733, | |
| "reward_std": 0.20705362781882286, | |
| "rewards/cosine_scaled_reward": -0.14827953279018402, | |
| "rewards/format_reward": 0.354166679084301, | |
| "step": 18 | |
| }, | |
| { | |
| "completion_length": 2884.0001220703125, | |
| "epoch": 0.021714285714285714, | |
| "grad_norm": 0.19174857437610626, | |
| "kl": 3.063678741455078e-05, | |
| "learning_rate": 3.6e-07, | |
| "loss": 0.0, | |
| "reward": 0.20790010876953602, | |
| "reward_std": 0.29342253506183624, | |
| "rewards/cosine_scaled_reward": 0.10294647887349129, | |
| "rewards/format_reward": 0.4375000074505806, | |
| "step": 19 | |
| }, | |
| { | |
| "completion_length": 2425.541748046875, | |
| "epoch": 0.022857142857142857, | |
| "grad_norm": 0.2391517460346222, | |
| "kl": 2.168118953704834e-05, | |
| "learning_rate": 3.7999999999999996e-07, | |
| "loss": 0.0, | |
| "reward": 0.14567521400749683, | |
| "reward_std": 0.2973865978419781, | |
| "rewards/cosine_scaled_reward": -0.06746451498474926, | |
| "rewards/format_reward": 0.5833333507180214, | |
| "step": 20 | |
| }, | |
| { | |
| "completion_length": 2701.6875, | |
| "epoch": 0.024, | |
| "grad_norm": 0.24159380793571472, | |
| "kl": 4.054605960845947e-05, | |
| "learning_rate": 4e-07, | |
| "loss": 0.0, | |
| "reward": 0.05628635361790657, | |
| "reward_std": 0.24320747144520283, | |
| "rewards/cosine_scaled_reward": -0.12232345715165138, | |
| "rewards/format_reward": 0.3958333358168602, | |
| "step": 21 | |
| }, | |
| { | |
| "completion_length": 1856.3333740234375, | |
| "epoch": 0.025142857142857144, | |
| "grad_norm": 0.50559401512146, | |
| "kl": 4.360079765319824e-05, | |
| "learning_rate": 4.1999999999999995e-07, | |
| "loss": 0.0, | |
| "reward": 0.23862483352422714, | |
| "reward_std": 0.2502210922539234, | |
| "rewards/cosine_scaled_reward": -0.017226822674274445, | |
| "rewards/format_reward": 0.7500000149011612, | |
| "step": 22 | |
| }, | |
| { | |
| "completion_length": 2594.541748046875, | |
| "epoch": 0.026285714285714287, | |
| "grad_norm": 0.2060898244380951, | |
| "kl": 3.0606985092163086e-05, | |
| "learning_rate": 4.3999999999999997e-07, | |
| "loss": 0.0, | |
| "reward": 0.16113443858921528, | |
| "reward_std": 0.33911262452602386, | |
| "rewards/cosine_scaled_reward": -0.00336562842130661, | |
| "rewards/format_reward": 0.5416666865348816, | |
| "step": 23 | |
| }, | |
| { | |
| "completion_length": 2759.0833740234375, | |
| "epoch": 0.027428571428571427, | |
| "grad_norm": 0.21166135370731354, | |
| "kl": 3.281235694885254e-05, | |
| "learning_rate": 4.6e-07, | |
| "loss": 0.0, | |
| "reward": 0.15770690888166428, | |
| "reward_std": 0.24454708211123943, | |
| "rewards/cosine_scaled_reward": -0.006449127569794655, | |
| "rewards/format_reward": 0.5000000149011612, | |
| "step": 24 | |
| }, | |
| { | |
| "completion_length": 2638.7500610351562, | |
| "epoch": 0.02857142857142857, | |
| "grad_norm": 0.2307935506105423, | |
| "kl": 3.364682197570801e-05, | |
| "learning_rate": 4.8e-07, | |
| "loss": 0.0, | |
| "reward": 0.08012701012194157, | |
| "reward_std": 0.2043658159673214, | |
| "rewards/cosine_scaled_reward": -0.10021369205787778, | |
| "rewards/format_reward": 0.416666679084301, | |
| "step": 25 | |
| }, | |
| { | |
| "completion_length": 2893.3333740234375, | |
| "epoch": 0.029714285714285714, | |
| "grad_norm": 0.16613639891147614, | |
| "kl": 3.281235694885254e-05, | |
| "learning_rate": 5e-07, | |
| "loss": 0.0, | |
| "reward": 0.15319424215704203, | |
| "reward_std": 0.2377900332212448, | |
| "rewards/cosine_scaled_reward": -0.021871407516300678, | |
| "rewards/format_reward": 0.5000000149011612, | |
| "step": 26 | |
| }, | |
| { | |
| "completion_length": 2979.6043090820312, | |
| "epoch": 0.030857142857142857, | |
| "grad_norm": 0.22748583555221558, | |
| "kl": 3.71783971786499e-05, | |
| "learning_rate": 5.2e-07, | |
| "loss": 0.0, | |
| "reward": 0.20988648012280464, | |
| "reward_std": 0.3224359378218651, | |
| "rewards/cosine_scaled_reward": 0.012707266956567764, | |
| "rewards/format_reward": 0.520833358168602, | |
| "step": 27 | |
| }, | |
| { | |
| "completion_length": 2814.7500610351562, | |
| "epoch": 0.032, | |
| "grad_norm": 0.19229423999786377, | |
| "kl": 3.766268491744995e-05, | |
| "learning_rate": 5.4e-07, | |
| "loss": 0.0, | |
| "reward": 0.19077781355008483, | |
| "reward_std": 0.2345624901354313, | |
| "rewards/cosine_scaled_reward": 0.07308735512197018, | |
| "rewards/format_reward": 0.479166679084301, | |
| "step": 28 | |
| }, | |
| { | |
| "completion_length": 3052.1458740234375, | |
| "epoch": 0.03314285714285714, | |
| "grad_norm": 0.20518264174461365, | |
| "kl": 2.2858381271362305e-05, | |
| "learning_rate": 5.6e-07, | |
| "loss": 0.0, | |
| "reward": -0.1032419636612758, | |
| "reward_std": 0.18674146011471748, | |
| "rewards/cosine_scaled_reward": -0.3117845207452774, | |
| "rewards/format_reward": 0.3125000111758709, | |
| "step": 29 | |
| }, | |
| { | |
| "completion_length": 2897.1458740234375, | |
| "epoch": 0.03428571428571429, | |
| "grad_norm": 0.18484562635421753, | |
| "kl": 2.422928810119629e-05, | |
| "learning_rate": 5.8e-07, | |
| "loss": 0.0, | |
| "reward": 0.15586171858012676, | |
| "reward_std": 0.28348754346370697, | |
| "rewards/cosine_scaled_reward": -0.0056856535375118256, | |
| "rewards/format_reward": 0.47916667722165585, | |
| "step": 30 | |
| }, | |
| { | |
| "completion_length": 3157.104248046875, | |
| "epoch": 0.03542857142857143, | |
| "grad_norm": 0.17953482270240784, | |
| "kl": 2.7835369110107422e-05, | |
| "learning_rate": 6e-07, | |
| "loss": 0.0, | |
| "reward": 0.027457450167275965, | |
| "reward_std": 0.2997221350669861, | |
| "rewards/cosine_scaled_reward": -0.10362934321165085, | |
| "rewards/format_reward": 0.27083333767950535, | |
| "step": 31 | |
| }, | |
| { | |
| "completion_length": 3132.604248046875, | |
| "epoch": 0.036571428571428574, | |
| "grad_norm": 0.20453964173793793, | |
| "kl": 3.3348798751831055e-05, | |
| "learning_rate": 6.2e-07, | |
| "loss": 0.0, | |
| "reward": 0.12347688060253859, | |
| "reward_std": 0.32773852348327637, | |
| "rewards/cosine_scaled_reward": -0.04991224408149719, | |
| "rewards/format_reward": 0.41666667722165585, | |
| "step": 32 | |
| }, | |
| { | |
| "completion_length": 3322.9583740234375, | |
| "epoch": 0.037714285714285714, | |
| "grad_norm": 0.1382211297750473, | |
| "kl": 3.045797348022461e-05, | |
| "learning_rate": 6.4e-07, | |
| "loss": 0.0, | |
| "reward": 0.028675228357315063, | |
| "reward_std": 0.24782302975654602, | |
| "rewards/cosine_scaled_reward": -0.08641303982585669, | |
| "rewards/format_reward": 0.3125000149011612, | |
| "step": 33 | |
| }, | |
| { | |
| "completion_length": 2470.854217529297, | |
| "epoch": 0.038857142857142854, | |
| "grad_norm": 0.30319151282310486, | |
| "kl": 3.30507755279541e-05, | |
| "learning_rate": 6.6e-07, | |
| "loss": 0.0, | |
| "reward": 0.201703529804945, | |
| "reward_std": 0.31444599851965904, | |
| "rewards/cosine_scaled_reward": 0.04628665745258331, | |
| "rewards/format_reward": 0.5416666716337204, | |
| "step": 34 | |
| }, | |
| { | |
| "completion_length": 3163.2084045410156, | |
| "epoch": 0.04, | |
| "grad_norm": 0.20104609429836273, | |
| "kl": 3.534555435180664e-05, | |
| "learning_rate": 6.800000000000001e-07, | |
| "loss": 0.0, | |
| "reward": -0.018313759297598153, | |
| "reward_std": 0.27370789274573326, | |
| "rewards/cosine_scaled_reward": -0.1344178761355579, | |
| "rewards/format_reward": 0.2083333358168602, | |
| "step": 35 | |
| }, | |
| { | |
| "completion_length": 3377.8333740234375, | |
| "epoch": 0.04114285714285714, | |
| "grad_norm": 0.16919943690299988, | |
| "kl": 3.9130449295043945e-05, | |
| "learning_rate": 7e-07, | |
| "loss": 0.0, | |
| "reward": -0.07359358388930559, | |
| "reward_std": 0.21812193095684052, | |
| "rewards/cosine_scaled_reward": -0.1993401860818267, | |
| "rewards/format_reward": 0.2083333395421505, | |
| "step": 36 | |
| }, | |
| { | |
| "completion_length": 3403.8958740234375, | |
| "epoch": 0.04228571428571429, | |
| "grad_norm": 0.1572796255350113, | |
| "kl": 1.5166588127613068e-05, | |
| "learning_rate": 7.2e-07, | |
| "loss": 0.0, | |
| "reward": -0.11656345473602414, | |
| "reward_std": 0.16832329705357552, | |
| "rewards/cosine_scaled_reward": -0.23412149026989937, | |
| "rewards/format_reward": 0.14583333395421505, | |
| "step": 37 | |
| }, | |
| { | |
| "completion_length": 3141.3125610351562, | |
| "epoch": 0.04342857142857143, | |
| "grad_norm": 0.16345535218715668, | |
| "kl": 2.950429916381836e-05, | |
| "learning_rate": 7.4e-07, | |
| "loss": 0.0, | |
| "reward": -0.01369121391326189, | |
| "reward_std": 0.20268694125115871, | |
| "rewards/cosine_scaled_reward": -0.1278916783630848, | |
| "rewards/format_reward": 0.2083333358168602, | |
| "step": 38 | |
| }, | |
| { | |
| "completion_length": 2718.7709045410156, | |
| "epoch": 0.044571428571428574, | |
| "grad_norm": 0.18454480171203613, | |
| "kl": 1.6637146472930908e-05, | |
| "learning_rate": 7.599999999999999e-07, | |
| "loss": 0.0, | |
| "reward": 0.1349094829056412, | |
| "reward_std": 0.19345630332827568, | |
| "rewards/cosine_scaled_reward": -0.005877137184143066, | |
| "rewards/format_reward": 0.4583333432674408, | |
| "step": 39 | |
| }, | |
| { | |
| "completion_length": 2670.5209350585938, | |
| "epoch": 0.045714285714285714, | |
| "grad_norm": 0.18595482409000397, | |
| "kl": 2.847611904144287e-05, | |
| "learning_rate": 7.799999999999999e-07, | |
| "loss": 0.0, | |
| "reward": 0.12064463802380487, | |
| "reward_std": 0.23050246760249138, | |
| "rewards/cosine_scaled_reward": -0.08602435514330864, | |
| "rewards/format_reward": 0.5208333544433117, | |
| "step": 40 | |
| }, | |
| { | |
| "completion_length": 2916.104248046875, | |
| "epoch": 0.046857142857142854, | |
| "grad_norm": 0.17687222361564636, | |
| "kl": 3.0338764190673828e-05, | |
| "learning_rate": 8e-07, | |
| "loss": 0.0, | |
| "reward": 0.03498362717800774, | |
| "reward_std": 0.2998371869325638, | |
| "rewards/cosine_scaled_reward": -0.14938516542315483, | |
| "rewards/format_reward": 0.4166666716337204, | |
| "step": 41 | |
| }, | |
| { | |
| "completion_length": 2854.104217529297, | |
| "epoch": 0.048, | |
| "grad_norm": 0.2547663450241089, | |
| "kl": 4.5239925384521484e-05, | |
| "learning_rate": 8.199999999999999e-07, | |
| "loss": 0.0, | |
| "reward": -0.06212767260149121, | |
| "reward_std": 0.1429902408272028, | |
| "rewards/cosine_scaled_reward": -0.2606445848941803, | |
| "rewards/format_reward": 0.35416666977107525, | |
| "step": 42 | |
| }, | |
| { | |
| "completion_length": 3066.854248046875, | |
| "epoch": 0.04914285714285714, | |
| "grad_norm": 0.1688769906759262, | |
| "kl": 2.8118491172790527e-05, | |
| "learning_rate": 8.399999999999999e-07, | |
| "loss": 0.0, | |
| "reward": 0.06209855154156685, | |
| "reward_std": 0.3040098361670971, | |
| "rewards/cosine_scaled_reward": -0.0826764814555645, | |
| "rewards/format_reward": 0.2916666716337204, | |
| "step": 43 | |
| }, | |
| { | |
| "completion_length": 2718.4791717529297, | |
| "epoch": 0.05028571428571429, | |
| "grad_norm": 0.2759998142719269, | |
| "kl": 0.00011104345321655273, | |
| "learning_rate": 8.599999999999999e-07, | |
| "loss": 0.0, | |
| "reward": 0.16620675288140774, | |
| "reward_std": 0.2648898549377918, | |
| "rewards/cosine_scaled_reward": 0.023219330236315727, | |
| "rewards/format_reward": 0.5, | |
| "step": 44 | |
| }, | |
| { | |
| "completion_length": 3411.875, | |
| "epoch": 0.05142857142857143, | |
| "grad_norm": 0.15706008672714233, | |
| "kl": 5.768239498138428e-05, | |
| "learning_rate": 8.799999999999999e-07, | |
| "loss": 0.0, | |
| "reward": -0.010180938057601452, | |
| "reward_std": 0.23901794850826263, | |
| "rewards/cosine_scaled_reward": -0.10855040326714516, | |
| "rewards/format_reward": 0.20833334513008595, | |
| "step": 45 | |
| }, | |
| { | |
| "completion_length": 3205.1666870117188, | |
| "epoch": 0.052571428571428575, | |
| "grad_norm": 0.18602944910526276, | |
| "kl": 5.361437797546387e-05, | |
| "learning_rate": 9e-07, | |
| "loss": 0.0, | |
| "reward": -0.04720673710107803, | |
| "reward_std": 0.14394842460751534, | |
| "rewards/cosine_scaled_reward": -0.16787217557430267, | |
| "rewards/format_reward": 0.20833334513008595, | |
| "step": 46 | |
| }, | |
| { | |
| "completion_length": 2893.166748046875, | |
| "epoch": 0.053714285714285714, | |
| "grad_norm": 0.2058715671300888, | |
| "kl": 3.2939016819000244e-05, | |
| "learning_rate": 9.2e-07, | |
| "loss": 0.0, | |
| "reward": 0.2286459095776081, | |
| "reward_std": 0.28589488565921783, | |
| "rewards/cosine_scaled_reward": 0.07320494949817657, | |
| "rewards/format_reward": 0.5208333488553762, | |
| "step": 47 | |
| }, | |
| { | |
| "completion_length": 2904.6875610351562, | |
| "epoch": 0.054857142857142854, | |
| "grad_norm": 0.22265109419822693, | |
| "kl": 0.0001233220100402832, | |
| "learning_rate": 9.399999999999999e-07, | |
| "loss": 0.0, | |
| "reward": 0.007764505222439766, | |
| "reward_std": 0.16852838546037674, | |
| "rewards/cosine_scaled_reward": -0.16211793571710587, | |
| "rewards/format_reward": 0.3125000074505806, | |
| "step": 48 | |
| }, | |
| { | |
| "completion_length": 2427.479248046875, | |
| "epoch": 0.056, | |
| "grad_norm": 0.2190365344285965, | |
| "kl": 0.00010353326797485352, | |
| "learning_rate": 9.6e-07, | |
| "loss": 0.0, | |
| "reward": 0.1893621925264597, | |
| "reward_std": 0.31022848933935165, | |
| "rewards/cosine_scaled_reward": 0.017491597682237625, | |
| "rewards/format_reward": 0.5625000298023224, | |
| "step": 49 | |
| }, | |
| { | |
| "completion_length": 2934.3541870117188, | |
| "epoch": 0.05714285714285714, | |
| "grad_norm": 0.18039213120937347, | |
| "kl": 7.003545761108398e-05, | |
| "learning_rate": 9.8e-07, | |
| "loss": 0.0, | |
| "reward": 0.11967798322439194, | |
| "reward_std": 0.19052543118596077, | |
| "rewards/cosine_scaled_reward": 0.040288787335157394, | |
| "rewards/format_reward": 0.31250000558793545, | |
| "step": 50 | |
| }, | |
| { | |
| "completion_length": 2329.4584350585938, | |
| "epoch": 0.05828571428571429, | |
| "grad_norm": 0.23464825749397278, | |
| "kl": 0.0002582073211669922, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0, | |
| "reward": 0.12857681699097157, | |
| "reward_std": 0.20647091418504715, | |
| "rewards/cosine_scaled_reward": -0.0722556822001934, | |
| "rewards/format_reward": 0.5833333432674408, | |
| "step": 51 | |
| }, | |
| { | |
| "completion_length": 2854.500030517578, | |
| "epoch": 0.05942857142857143, | |
| "grad_norm": 0.21867266297340393, | |
| "kl": 0.00016963481903076172, | |
| "learning_rate": 9.999890338174275e-07, | |
| "loss": 0.0, | |
| "reward": 0.14125719666481018, | |
| "reward_std": 0.32246551662683487, | |
| "rewards/cosine_scaled_reward": 0.0029166871681809425, | |
| "rewards/format_reward": 0.3958333432674408, | |
| "step": 52 | |
| }, | |
| { | |
| "completion_length": 2820.2500915527344, | |
| "epoch": 0.060571428571428575, | |
| "grad_norm": 0.19331412017345428, | |
| "kl": 0.00015205144882202148, | |
| "learning_rate": 9.999561358041868e-07, | |
| "loss": 0.0, | |
| "reward": 0.1258083715365501, | |
| "reward_std": 0.32310857623815536, | |
| "rewards/cosine_scaled_reward": -0.020295356400310993, | |
| "rewards/format_reward": 0.4791666716337204, | |
| "step": 53 | |
| }, | |
| { | |
| "completion_length": 2856.0833740234375, | |
| "epoch": 0.061714285714285715, | |
| "grad_norm": 0.1580573320388794, | |
| "kl": 3.808736801147461e-05, | |
| "learning_rate": 9.999013075636804e-07, | |
| "loss": 0.0, | |
| "reward": 0.2725381199270487, | |
| "reward_std": 0.36154576390981674, | |
| "rewards/cosine_scaled_reward": 0.1424376405775547, | |
| "rewards/format_reward": 0.5208333488553762, | |
| "step": 54 | |
| }, | |
| { | |
| "completion_length": 2909.1250610351562, | |
| "epoch": 0.06285714285714286, | |
| "grad_norm": 0.18711332976818085, | |
| "kl": 0.00010773539543151855, | |
| "learning_rate": 9.998245517681593e-07, | |
| "loss": 0.0, | |
| "reward": 0.13717407977674156, | |
| "reward_std": 0.3047039993107319, | |
| "rewards/cosine_scaled_reward": -0.015960073098540306, | |
| "rewards/format_reward": 0.4166666716337204, | |
| "step": 55 | |
| }, | |
| { | |
| "completion_length": 2983.4583740234375, | |
| "epoch": 0.064, | |
| "grad_norm": 0.172135129570961, | |
| "kl": 5.167722702026367e-05, | |
| "learning_rate": 9.997258721585931e-07, | |
| "loss": 0.0, | |
| "reward": 0.06429924443364143, | |
| "reward_std": 0.23643197491765022, | |
| "rewards/cosine_scaled_reward": -0.09967034682631493, | |
| "rewards/format_reward": 0.375, | |
| "step": 56 | |
| }, | |
| { | |
| "completion_length": 3349.291748046875, | |
| "epoch": 0.06514285714285714, | |
| "grad_norm": 0.13240715861320496, | |
| "kl": 3.217160701751709e-05, | |
| "learning_rate": 9.996052735444862e-07, | |
| "loss": 0.0, | |
| "reward": 0.08298390917479992, | |
| "reward_std": 0.2719106115400791, | |
| "rewards/cosine_scaled_reward": -0.061886819545179605, | |
| "rewards/format_reward": 0.3333333432674408, | |
| "step": 57 | |
| }, | |
| { | |
| "completion_length": 2203.2291870117188, | |
| "epoch": 0.06628571428571428, | |
| "grad_norm": 0.22709013521671295, | |
| "kl": 0.00039386749267578125, | |
| "learning_rate": 9.994627618036452e-07, | |
| "loss": 0.0, | |
| "reward": 0.2722779009491205, | |
| "reward_std": 0.2938956469297409, | |
| "rewards/cosine_scaled_reward": 0.08483442291617393, | |
| "rewards/format_reward": 0.666666692122817, | |
| "step": 58 | |
| }, | |
| { | |
| "completion_length": 2840.187530517578, | |
| "epoch": 0.06742857142857143, | |
| "grad_norm": 0.17239369451999664, | |
| "kl": 1.9103288650512695e-05, | |
| "learning_rate": 9.992983438818915e-07, | |
| "loss": 0.0, | |
| "reward": 0.0854027196764946, | |
| "reward_std": 0.22641704231500626, | |
| "rewards/cosine_scaled_reward": -0.06355120055377483, | |
| "rewards/format_reward": 0.37500000558793545, | |
| "step": 59 | |
| }, | |
| { | |
| "completion_length": 3012.1250610351562, | |
| "epoch": 0.06857142857142857, | |
| "grad_norm": 0.1650194674730301, | |
| "kl": 7.164478302001953e-05, | |
| "learning_rate": 9.991120277927223e-07, | |
| "loss": 0.0, | |
| "reward": 0.012519015290308744, | |
| "reward_std": 0.22390243411064148, | |
| "rewards/cosine_scaled_reward": -0.14655769802629948, | |
| "rewards/format_reward": 0.375, | |
| "step": 60 | |
| }, | |
| { | |
| "completion_length": 3101.8126220703125, | |
| "epoch": 0.06971428571428571, | |
| "grad_norm": 0.17062878608703613, | |
| "kl": 0.00014269817620515823, | |
| "learning_rate": 9.989038226169207e-07, | |
| "loss": 0.0, | |
| "reward": 0.05342123447917402, | |
| "reward_std": 0.25336652249097824, | |
| "rewards/cosine_scaled_reward": -0.15571192651987076, | |
| "rewards/format_reward": 0.4791666716337204, | |
| "step": 61 | |
| }, | |
| { | |
| "completion_length": 2601.5833435058594, | |
| "epoch": 0.07085714285714285, | |
| "grad_norm": 0.22512151300907135, | |
| "kl": 0.0006105899810791016, | |
| "learning_rate": 9.98673738502114e-07, | |
| "loss": 0.0, | |
| "reward": 0.17650287225842476, | |
| "reward_std": 0.3199679031968117, | |
| "rewards/cosine_scaled_reward": -0.06858954066410661, | |
| "rewards/format_reward": 0.5625000149011612, | |
| "step": 62 | |
| }, | |
| { | |
| "completion_length": 2216.9166870117188, | |
| "epoch": 0.072, | |
| "grad_norm": 0.23623046278953552, | |
| "kl": 0.0003886222839355469, | |
| "learning_rate": 9.98421786662277e-07, | |
| "loss": 0.0, | |
| "reward": 0.2345086196437478, | |
| "reward_std": 0.364711195230484, | |
| "rewards/cosine_scaled_reward": -0.0016382848843932152, | |
| "rewards/format_reward": 0.7083333358168602, | |
| "step": 63 | |
| }, | |
| { | |
| "completion_length": 2802.8125915527344, | |
| "epoch": 0.07314285714285715, | |
| "grad_norm": 0.17913182079792023, | |
| "kl": 0.00014469027519226074, | |
| "learning_rate": 9.981479793771866e-07, | |
| "loss": 0.0, | |
| "reward": 0.13276350032538176, | |
| "reward_std": 0.3188353106379509, | |
| "rewards/cosine_scaled_reward": -0.014707108959555626, | |
| "rewards/format_reward": 0.4166666828095913, | |
| "step": 64 | |
| }, | |
| { | |
| "completion_length": 2648.791717529297, | |
| "epoch": 0.07428571428571429, | |
| "grad_norm": 0.19293726980686188, | |
| "kl": 0.00011703372001647949, | |
| "learning_rate": 9.97852329991824e-07, | |
| "loss": 0.0, | |
| "reward": 0.02339506382122636, | |
| "reward_std": 0.16691064462065697, | |
| "rewards/cosine_scaled_reward": -0.16883965581655502, | |
| "rewards/format_reward": 0.4166666865348816, | |
| "step": 65 | |
| }, | |
| { | |
| "completion_length": 2108.4375228881836, | |
| "epoch": 0.07542857142857143, | |
| "grad_norm": 0.2505956292152405, | |
| "kl": 0.00021564960479736328, | |
| "learning_rate": 9.975348529157229e-07, | |
| "loss": 0.0, | |
| "reward": 0.18803048506379128, | |
| "reward_std": 0.23812443763017654, | |
| "rewards/cosine_scaled_reward": 0.02932748757302761, | |
| "rewards/format_reward": 0.5, | |
| "step": 66 | |
| }, | |
| { | |
| "completion_length": 3409.4583740234375, | |
| "epoch": 0.07657142857142857, | |
| "grad_norm": 0.15418125689029694, | |
| "kl": 0.00023257732391357422, | |
| "learning_rate": 9.971955636222684e-07, | |
| "loss": 0.0, | |
| "reward": -0.13601324148476124, | |
| "reward_std": 0.14229955151677132, | |
| "rewards/cosine_scaled_reward": -0.28745780140161514, | |
| "rewards/format_reward": 0.1666666716337204, | |
| "step": 67 | |
| }, | |
| { | |
| "completion_length": 1989.791748046875, | |
| "epoch": 0.07771428571428571, | |
| "grad_norm": 0.2515312433242798, | |
| "kl": 0.0010142326354980469, | |
| "learning_rate": 9.968344786479415e-07, | |
| "loss": 0.0, | |
| "reward": 0.22917652688920498, | |
| "reward_std": 0.29839884862303734, | |
| "rewards/cosine_scaled_reward": -0.01369619369506836, | |
| "rewards/format_reward": 0.6666666716337204, | |
| "step": 68 | |
| }, | |
| { | |
| "completion_length": 2491.041748046875, | |
| "epoch": 0.07885714285714286, | |
| "grad_norm": 0.26017820835113525, | |
| "kl": 0.0007710456848144531, | |
| "learning_rate": 9.964516155915151e-07, | |
| "loss": 0.0, | |
| "reward": 0.049685924575896934, | |
| "reward_std": 0.2717648334801197, | |
| "rewards/cosine_scaled_reward": -0.1978147281333804, | |
| "rewards/format_reward": 0.5000000149011612, | |
| "step": 69 | |
| }, | |
| { | |
| "completion_length": 3060.7083740234375, | |
| "epoch": 0.08, | |
| "grad_norm": 0.16957440972328186, | |
| "kl": 0.0008012652397155762, | |
| "learning_rate": 9.960469931131936e-07, | |
| "loss": 0.0, | |
| "reward": -0.042351190000772476, | |
| "reward_std": 0.1818229742348194, | |
| "rewards/cosine_scaled_reward": -0.21522082015872002, | |
| "rewards/format_reward": 0.3333333432674408, | |
| "step": 70 | |
| }, | |
| { | |
| "completion_length": 2694.312545776367, | |
| "epoch": 0.08114285714285714, | |
| "grad_norm": 0.1995006948709488, | |
| "kl": 0.0007072687149047852, | |
| "learning_rate": 9.956206309337066e-07, | |
| "loss": 0.0, | |
| "reward": 0.07359633408486843, | |
| "reward_std": 0.22721958719193935, | |
| "rewards/cosine_scaled_reward": -0.06798692792654037, | |
| "rewards/format_reward": 0.3541666716337204, | |
| "step": 71 | |
| }, | |
| { | |
| "completion_length": 2656.0625, | |
| "epoch": 0.08228571428571428, | |
| "grad_norm": 0.21664902567863464, | |
| "kl": 0.0008908510208129883, | |
| "learning_rate": 9.951725498333448e-07, | |
| "loss": 0.0, | |
| "reward": 0.04184230323880911, | |
| "reward_std": 0.22974644601345062, | |
| "rewards/cosine_scaled_reward": -0.15995099861174822, | |
| "rewards/format_reward": 0.47916667722165585, | |
| "step": 72 | |
| }, | |
| { | |
| "completion_length": 3503.125, | |
| "epoch": 0.08342857142857144, | |
| "grad_norm": 0.17171922326087952, | |
| "kl": 0.00013053417205810547, | |
| "learning_rate": 9.947027716509488e-07, | |
| "loss": 0.0, | |
| "reward": -0.05220331740565598, | |
| "reward_std": 0.246462631970644, | |
| "rewards/cosine_scaled_reward": -0.1583964079618454, | |
| "rewards/format_reward": 0.1875000074505806, | |
| "step": 73 | |
| }, | |
| { | |
| "completion_length": 3180.4791870117188, | |
| "epoch": 0.08457142857142858, | |
| "grad_norm": 0.19165758788585663, | |
| "kl": 0.00042688846588134766, | |
| "learning_rate": 9.942113192828444e-07, | |
| "loss": 0.0, | |
| "reward": 0.03528491756878793, | |
| "reward_std": 0.24782484769821167, | |
| "rewards/cosine_scaled_reward": -0.04785974510014057, | |
| "rewards/format_reward": 0.2916666716337204, | |
| "step": 74 | |
| }, | |
| { | |
| "completion_length": 3037.416748046875, | |
| "epoch": 0.08571428571428572, | |
| "grad_norm": 0.15819406509399414, | |
| "kl": 0.0006095767021179199, | |
| "learning_rate": 9.93698216681727e-07, | |
| "loss": 0.0, | |
| "reward": 0.0823002946563065, | |
| "reward_std": 0.19139036908745766, | |
| "rewards/cosine_scaled_reward": -0.05699274316430092, | |
| "rewards/format_reward": 0.37500001676380634, | |
| "step": 75 | |
| }, | |
| { | |
| "completion_length": 2817.7709350585938, | |
| "epoch": 0.08685714285714285, | |
| "grad_norm": 0.1885841190814972, | |
| "kl": 0.0001068115234375, | |
| "learning_rate": 9.931634888554935e-07, | |
| "loss": 0.0, | |
| "reward": 0.03719830792397261, | |
| "reward_std": 0.24271035939455032, | |
| "rewards/cosine_scaled_reward": -0.178798396140337, | |
| "rewards/format_reward": 0.4166666753590107, | |
| "step": 76 | |
| }, | |
| { | |
| "completion_length": 3224.6458740234375, | |
| "epoch": 0.088, | |
| "grad_norm": 0.1698525995016098, | |
| "kl": 0.00024068355560302734, | |
| "learning_rate": 9.926071618660237e-07, | |
| "loss": 0.0, | |
| "reward": 0.005038566887378693, | |
| "reward_std": 0.20738781057298183, | |
| "rewards/cosine_scaled_reward": -0.15254707634449005, | |
| "rewards/format_reward": 0.2708333432674408, | |
| "step": 77 | |
| }, | |
| { | |
| "completion_length": 3077.8543090820312, | |
| "epoch": 0.08914285714285715, | |
| "grad_norm": 0.20861662924289703, | |
| "kl": 0.0001271367073059082, | |
| "learning_rate": 9.9202926282791e-07, | |
| "loss": 0.0, | |
| "reward": 0.10431988351047039, | |
| "reward_std": 0.31425832584500313, | |
| "rewards/cosine_scaled_reward": -0.03074691817164421, | |
| "rewards/format_reward": 0.33333333395421505, | |
| "step": 78 | |
| }, | |
| { | |
| "completion_length": 2268.041717529297, | |
| "epoch": 0.09028571428571429, | |
| "grad_norm": 0.22589290142059326, | |
| "kl": 0.000714719295501709, | |
| "learning_rate": 9.91429819907136e-07, | |
| "loss": 0.0, | |
| "reward": 0.10906978440470994, | |
| "reward_std": 0.18880136497318745, | |
| "rewards/cosine_scaled_reward": -0.09606132353655994, | |
| "rewards/format_reward": 0.5, | |
| "step": 79 | |
| }, | |
| { | |
| "completion_length": 3246.6250610351562, | |
| "epoch": 0.09142857142857143, | |
| "grad_norm": 0.18828435242176056, | |
| "kl": 0.0005083680152893066, | |
| "learning_rate": 9.908088623197048e-07, | |
| "loss": 0.0, | |
| "reward": 0.06318249693140388, | |
| "reward_std": 0.31002140790224075, | |
| "rewards/cosine_scaled_reward": -0.07233269140124321, | |
| "rewards/format_reward": 0.2916666679084301, | |
| "step": 80 | |
| }, | |
| { | |
| "completion_length": 3215.3333740234375, | |
| "epoch": 0.09257142857142857, | |
| "grad_norm": 0.27181336283683777, | |
| "kl": 0.0016459226608276367, | |
| "learning_rate": 9.901664203302124e-07, | |
| "loss": 0.0001, | |
| "reward": -0.06593337655067444, | |
| "reward_std": 0.228203646838665, | |
| "rewards/cosine_scaled_reward": -0.1975144650787115, | |
| "rewards/format_reward": 0.1875, | |
| "step": 81 | |
| }, | |
| { | |
| "completion_length": 2783.4375610351562, | |
| "epoch": 0.09371428571428571, | |
| "grad_norm": 0.17714105546474457, | |
| "kl": 0.0010538101196289062, | |
| "learning_rate": 9.895025252503755e-07, | |
| "loss": 0.0, | |
| "reward": 0.10739830927923322, | |
| "reward_std": 0.2685987316071987, | |
| "rewards/cosine_scaled_reward": -0.03886566497385502, | |
| "rewards/format_reward": 0.4166666716337204, | |
| "step": 82 | |
| }, | |
| { | |
| "completion_length": 2704.0833740234375, | |
| "epoch": 0.09485714285714286, | |
| "grad_norm": 0.2489301860332489, | |
| "kl": 0.0006620883941650391, | |
| "learning_rate": 9.888172094375033e-07, | |
| "loss": 0.0, | |
| "reward": 0.051296481397002935, | |
| "reward_std": 0.22126955911517143, | |
| "rewards/cosine_scaled_reward": -0.10262950323522091, | |
| "rewards/format_reward": 0.33333333395421505, | |
| "step": 83 | |
| }, | |
| { | |
| "completion_length": 2919.479217529297, | |
| "epoch": 0.096, | |
| "grad_norm": 0.16122053563594818, | |
| "kl": 0.0003643035888671875, | |
| "learning_rate": 9.881105062929221e-07, | |
| "loss": 0.0, | |
| "reward": 0.14531802013516426, | |
| "reward_std": 0.30427272617816925, | |
| "rewards/cosine_scaled_reward": 0.02680143341422081, | |
| "rewards/format_reward": 0.3958333358168602, | |
| "step": 84 | |
| }, | |
| { | |
| "completion_length": 3221.625, | |
| "epoch": 0.09714285714285714, | |
| "grad_norm": 0.13713707029819489, | |
| "kl": 0.00022965669631958008, | |
| "learning_rate": 9.873824502603459e-07, | |
| "loss": 0.0, | |
| "reward": 0.045454833656549454, | |
| "reward_std": 0.32109829783439636, | |
| "rewards/cosine_scaled_reward": -0.0938787111081183, | |
| "rewards/format_reward": 0.33333334513008595, | |
| "step": 85 | |
| }, | |
| { | |
| "completion_length": 3070.1666870117188, | |
| "epoch": 0.09828571428571428, | |
| "grad_norm": 0.1718609482049942, | |
| "kl": 0.0007703304290771484, | |
| "learning_rate": 9.866330768241983e-07, | |
| "loss": 0.0, | |
| "reward": 0.020616615191102028, | |
| "reward_std": 0.20714357122778893, | |
| "rewards/cosine_scaled_reward": -0.12587381899356842, | |
| "rewards/format_reward": 0.3125000074505806, | |
| "step": 86 | |
| }, | |
| { | |
| "completion_length": 2553.8126220703125, | |
| "epoch": 0.09942857142857142, | |
| "grad_norm": 0.23934882879257202, | |
| "kl": 0.0006546974182128906, | |
| "learning_rate": 9.85862422507884e-07, | |
| "loss": 0.0, | |
| "reward": 0.16441121231764555, | |
| "reward_std": 0.27248556166887283, | |
| "rewards/cosine_scaled_reward": -0.07301743514835835, | |
| "rewards/format_reward": 0.6041666865348816, | |
| "step": 87 | |
| }, | |
| { | |
| "completion_length": 2649.6666870117188, | |
| "epoch": 0.10057142857142858, | |
| "grad_norm": 0.24427059292793274, | |
| "kl": 0.0014219284057617188, | |
| "learning_rate": 9.850705248720068e-07, | |
| "loss": 0.0001, | |
| "reward": 0.30650845542550087, | |
| "reward_std": 0.442048154771328, | |
| "rewards/cosine_scaled_reward": 0.09996251187112648, | |
| "rewards/format_reward": 0.645833358168602, | |
| "step": 88 | |
| }, | |
| { | |
| "completion_length": 3074.9793090820312, | |
| "epoch": 0.10171428571428572, | |
| "grad_norm": 0.20860984921455383, | |
| "kl": 0.0009670257568359375, | |
| "learning_rate": 9.8425742251254e-07, | |
| "loss": 0.0, | |
| "reward": 0.06839994341135025, | |
| "reward_std": 0.3136523775756359, | |
| "rewards/cosine_scaled_reward": -0.0889416765421629, | |
| "rewards/format_reward": 0.3333333469927311, | |
| "step": 89 | |
| }, | |
| { | |
| "completion_length": 2521.3959045410156, | |
| "epoch": 0.10285714285714286, | |
| "grad_norm": 0.26103153824806213, | |
| "kl": 0.001010894775390625, | |
| "learning_rate": 9.83423155058946e-07, | |
| "loss": 0.0, | |
| "reward": -0.015254740603268147, | |
| "reward_std": 0.12981478869915009, | |
| "rewards/cosine_scaled_reward": -0.2523176036775112, | |
| "rewards/format_reward": 0.47916667722165585, | |
| "step": 90 | |
| }, | |
| { | |
| "completion_length": 3130.4166870117188, | |
| "epoch": 0.104, | |
| "grad_norm": 0.1804915815591812, | |
| "kl": 0.0006823539733886719, | |
| "learning_rate": 9.825677631722435e-07, | |
| "loss": 0.0, | |
| "reward": 0.024226047098636627, | |
| "reward_std": 0.24618754535913467, | |
| "rewards/cosine_scaled_reward": -0.13014899473637342, | |
| "rewards/format_reward": 0.33333333395421505, | |
| "step": 91 | |
| }, | |
| { | |
| "completion_length": 2594.5000915527344, | |
| "epoch": 0.10514285714285715, | |
| "grad_norm": 0.25217053294181824, | |
| "kl": 0.0017712712287902832, | |
| "learning_rate": 9.816912885430258e-07, | |
| "loss": 0.0001, | |
| "reward": 0.06422214396297932, | |
| "reward_std": 0.20822307094931602, | |
| "rewards/cosine_scaled_reward": -0.12269663874758407, | |
| "rewards/format_reward": 0.47916666977107525, | |
| "step": 92 | |
| }, | |
| { | |
| "completion_length": 3538.125, | |
| "epoch": 0.10628571428571429, | |
| "grad_norm": 0.16923604905605316, | |
| "kl": 0.0010764598846435547, | |
| "learning_rate": 9.807937738894303e-07, | |
| "loss": 0.0, | |
| "reward": -0.16369806230068207, | |
| "reward_std": 0.1373737584799528, | |
| "rewards/cosine_scaled_reward": -0.2538382336497307, | |
| "rewards/format_reward": 0.02083333395421505, | |
| "step": 93 | |
| }, | |
| { | |
| "completion_length": 3163.4583740234375, | |
| "epoch": 0.10742857142857143, | |
| "grad_norm": 0.1778799593448639, | |
| "kl": 0.001428365707397461, | |
| "learning_rate": 9.798752629550546e-07, | |
| "loss": 0.0001, | |
| "reward": 0.014819767326116562, | |
| "reward_std": 0.23536204546689987, | |
| "rewards/cosine_scaled_reward": -0.11371379345655441, | |
| "rewards/format_reward": 0.2083333358168602, | |
| "step": 94 | |
| }, | |
| { | |
| "completion_length": 3426.7083740234375, | |
| "epoch": 0.10857142857142857, | |
| "grad_norm": 0.17324331402778625, | |
| "kl": 0.0004151463508605957, | |
| "learning_rate": 9.78935800506826e-07, | |
| "loss": 0.0, | |
| "reward": -0.035191090777516365, | |
| "reward_std": 0.27483338490128517, | |
| "rewards/cosine_scaled_reward": -0.1303609658498317, | |
| "rewards/format_reward": 0.1875000074505806, | |
| "step": 95 | |
| }, | |
| { | |
| "completion_length": 2760.25, | |
| "epoch": 0.10971428571428571, | |
| "grad_norm": 0.17773400247097015, | |
| "kl": 0.0018305778503417969, | |
| "learning_rate": 9.779754323328192e-07, | |
| "loss": 0.0001, | |
| "reward": 0.08919629082083702, | |
| "reward_std": 0.22743511945009232, | |
| "rewards/cosine_scaled_reward": -0.016248881816864014, | |
| "rewards/format_reward": 0.35416666977107525, | |
| "step": 96 | |
| }, | |
| { | |
| "completion_length": 3314.3959350585938, | |
| "epoch": 0.11085714285714286, | |
| "grad_norm": 0.16482950747013092, | |
| "kl": 0.0007615089416503906, | |
| "learning_rate": 9.769942052400235e-07, | |
| "loss": 0.0, | |
| "reward": -0.014882845804095268, | |
| "reward_std": 0.28871480002999306, | |
| "rewards/cosine_scaled_reward": -0.17778804525732994, | |
| "rewards/format_reward": 0.2291666679084301, | |
| "step": 97 | |
| }, | |
| { | |
| "completion_length": 2971.729248046875, | |
| "epoch": 0.112, | |
| "grad_norm": 0.15072181820869446, | |
| "kl": 0.0003001689910888672, | |
| "learning_rate": 9.759921670520634e-07, | |
| "loss": 0.0, | |
| "reward": 0.03349009482190013, | |
| "reward_std": 0.18237757682800293, | |
| "rewards/cosine_scaled_reward": -0.1615230068564415, | |
| "rewards/format_reward": 0.3958333432674408, | |
| "step": 98 | |
| }, | |
| { | |
| "completion_length": 2852.7708740234375, | |
| "epoch": 0.11314285714285714, | |
| "grad_norm": 0.19570456445217133, | |
| "kl": 0.0006053447723388672, | |
| "learning_rate": 9.749693666068663e-07, | |
| "loss": 0.0, | |
| "reward": 0.06496170163154602, | |
| "reward_std": 0.21407892182469368, | |
| "rewards/cosine_scaled_reward": -0.0664414819329977, | |
| "rewards/format_reward": 0.3125, | |
| "step": 99 | |
| }, | |
| { | |
| "completion_length": 2670.4168090820312, | |
| "epoch": 0.11428571428571428, | |
| "grad_norm": 0.17085237801074982, | |
| "kl": 0.0008068084716796875, | |
| "learning_rate": 9.739258537542835e-07, | |
| "loss": 0.0, | |
| "reward": 0.15346461767330766, | |
| "reward_std": 0.30039672553539276, | |
| "rewards/cosine_scaled_reward": -0.029875319451093674, | |
| "rewards/format_reward": 0.4791666865348816, | |
| "step": 100 | |
| }, | |
| { | |
| "completion_length": 3090.6875610351562, | |
| "epoch": 0.11542857142857142, | |
| "grad_norm": 0.17470602691173553, | |
| "kl": 0.0010237693786621094, | |
| "learning_rate": 9.728616793536587e-07, | |
| "loss": 0.0, | |
| "reward": 0.04014189355075359, | |
| "reward_std": 0.22363552451133728, | |
| "rewards/cosine_scaled_reward": -0.09809151291847229, | |
| "rewards/format_reward": 0.3125000074505806, | |
| "step": 101 | |
| }, | |
| { | |
| "completion_length": 2311.3750610351562, | |
| "epoch": 0.11657142857142858, | |
| "grad_norm": 0.2883223593235016, | |
| "kl": 0.002286195755004883, | |
| "learning_rate": 9.717768952713511e-07, | |
| "loss": 0.0001, | |
| "reward": 0.1678666821680963, | |
| "reward_std": 0.21508530527353287, | |
| "rewards/cosine_scaled_reward": -0.10981736332178116, | |
| "rewards/format_reward": 0.6875000149011612, | |
| "step": 102 | |
| }, | |
| { | |
| "completion_length": 3020.3125915527344, | |
| "epoch": 0.11771428571428572, | |
| "grad_norm": 0.22340644896030426, | |
| "kl": 0.0008773207664489746, | |
| "learning_rate": 9.706715543782064e-07, | |
| "loss": 0.0, | |
| "reward": 0.09143933840095997, | |
| "reward_std": 0.27459392696619034, | |
| "rewards/cosine_scaled_reward": -0.08100417070090771, | |
| "rewards/format_reward": 0.3958333395421505, | |
| "step": 103 | |
| }, | |
| { | |
| "completion_length": 2666.041748046875, | |
| "epoch": 0.11885714285714286, | |
| "grad_norm": 0.2013712376356125, | |
| "kl": 0.001735687255859375, | |
| "learning_rate": 9.695457105469804e-07, | |
| "loss": 0.0001, | |
| "reward": 0.04090608523983974, | |
| "reward_std": 0.13189667649567127, | |
| "rewards/cosine_scaled_reward": -0.14965718239545822, | |
| "rewards/format_reward": 0.4166666679084301, | |
| "step": 104 | |
| }, | |
| { | |
| "completion_length": 2776.3125, | |
| "epoch": 0.12, | |
| "grad_norm": 0.21267718076705933, | |
| "kl": 0.0008919239044189453, | |
| "learning_rate": 9.683994186497132e-07, | |
| "loss": 0.0, | |
| "reward": 0.08371754828840494, | |
| "reward_std": 0.24455546587705612, | |
| "rewards/cosine_scaled_reward": -0.024641111493110657, | |
| "rewards/format_reward": 0.33333333395421505, | |
| "step": 105 | |
| }, | |
| { | |
| "completion_length": 2168.854217529297, | |
| "epoch": 0.12114285714285715, | |
| "grad_norm": 0.194187730550766, | |
| "kl": 0.0012993812561035156, | |
| "learning_rate": 9.672327345550543e-07, | |
| "loss": 0.0001, | |
| "reward": 0.37329915910959244, | |
| "reward_std": 0.23653614707291126, | |
| "rewards/cosine_scaled_reward": 0.2502317950129509, | |
| "rewards/format_reward": 0.6666666772216558, | |
| "step": 106 | |
| }, | |
| { | |
| "completion_length": 3031.2916870117188, | |
| "epoch": 0.12228571428571429, | |
| "grad_norm": 0.22123286128044128, | |
| "kl": 0.0007143020629882812, | |
| "learning_rate": 9.66045715125541e-07, | |
| "loss": 0.0, | |
| "reward": 0.06332904286682606, | |
| "reward_std": 0.2138124220073223, | |
| "rewards/cosine_scaled_reward": -0.10465502738952637, | |
| "rewards/format_reward": 0.3750000111758709, | |
| "step": 107 | |
| }, | |
| { | |
| "completion_length": 2653.354248046875, | |
| "epoch": 0.12342857142857143, | |
| "grad_norm": 0.19954904913902283, | |
| "kl": 0.001068115234375, | |
| "learning_rate": 9.648384182148252e-07, | |
| "loss": 0.0, | |
| "reward": 0.1708927322179079, | |
| "reward_std": 0.2717986926436424, | |
| "rewards/cosine_scaled_reward": 0.013383063487708569, | |
| "rewards/format_reward": 0.479166679084301, | |
| "step": 108 | |
| }, | |
| { | |
| "completion_length": 3006.3333740234375, | |
| "epoch": 0.12457142857142857, | |
| "grad_norm": 0.15602044761180878, | |
| "kl": 0.0003781318664550781, | |
| "learning_rate": 9.636109026648554e-07, | |
| "loss": 0.0, | |
| "reward": -0.01096257008612156, | |
| "reward_std": 0.15808527171611786, | |
| "rewards/cosine_scaled_reward": -0.19856059784069657, | |
| "rewards/format_reward": 0.375, | |
| "step": 109 | |
| }, | |
| { | |
| "completion_length": 2700.2501220703125, | |
| "epoch": 0.12571428571428572, | |
| "grad_norm": 0.23633067309856415, | |
| "kl": 0.0006241798400878906, | |
| "learning_rate": 9.623632283030077e-07, | |
| "loss": 0.0, | |
| "reward": 0.14015202317386866, | |
| "reward_std": 0.3137088418006897, | |
| "rewards/cosine_scaled_reward": -0.07339744362980127, | |
| "rewards/format_reward": 0.5, | |
| "step": 110 | |
| }, | |
| { | |
| "completion_length": 2980.729248046875, | |
| "epoch": 0.12685714285714286, | |
| "grad_norm": 0.20043744146823883, | |
| "kl": 0.0012969970703125, | |
| "learning_rate": 9.610954559391704e-07, | |
| "loss": 0.0001, | |
| "reward": 0.12000765651464462, | |
| "reward_std": 0.3265395238995552, | |
| "rewards/cosine_scaled_reward": -0.05010761972516775, | |
| "rewards/format_reward": 0.354166679084301, | |
| "step": 111 | |
| }, | |
| { | |
| "completion_length": 3150.8333740234375, | |
| "epoch": 0.128, | |
| "grad_norm": 0.16131086647510529, | |
| "kl": 0.0005326271057128906, | |
| "learning_rate": 9.598076473627796e-07, | |
| "loss": 0.0, | |
| "reward": 0.12711239233613014, | |
| "reward_std": 0.37858303636312485, | |
| "rewards/cosine_scaled_reward": -0.00730159692466259, | |
| "rewards/format_reward": 0.39583334885537624, | |
| "step": 112 | |
| }, | |
| { | |
| "completion_length": 2972.2083435058594, | |
| "epoch": 0.12914285714285714, | |
| "grad_norm": 0.1954687088727951, | |
| "kl": 0.0014047622680664062, | |
| "learning_rate": 9.58499865339809e-07, | |
| "loss": 0.0001, | |
| "reward": 0.07627532246988267, | |
| "reward_std": 0.291649978607893, | |
| "rewards/cosine_scaled_reward": -0.07863758876919746, | |
| "rewards/format_reward": 0.3750000037252903, | |
| "step": 113 | |
| }, | |
| { | |
| "completion_length": 2534.3334045410156, | |
| "epoch": 0.13028571428571428, | |
| "grad_norm": 0.19298915565013885, | |
| "kl": 0.0018050670623779297, | |
| "learning_rate": 9.571721736097088e-07, | |
| "loss": 0.0001, | |
| "reward": 0.10349475312978029, | |
| "reward_std": 0.2791562117636204, | |
| "rewards/cosine_scaled_reward": -0.17762713879346848, | |
| "rewards/format_reward": 0.5833333507180214, | |
| "step": 114 | |
| }, | |
| { | |
| "completion_length": 2881.6459350585938, | |
| "epoch": 0.13142857142857142, | |
| "grad_norm": 0.20512622594833374, | |
| "kl": 0.002078533172607422, | |
| "learning_rate": 9.55824636882301e-07, | |
| "loss": 0.0001, | |
| "reward": 0.061600953340530396, | |
| "reward_std": 0.26807834208011627, | |
| "rewards/cosine_scaled_reward": -0.0658574104309082, | |
| "rewards/format_reward": 0.33333334885537624, | |
| "step": 115 | |
| }, | |
| { | |
| "completion_length": 3314.8959350585938, | |
| "epoch": 0.13257142857142856, | |
| "grad_norm": 0.18601059913635254, | |
| "kl": 0.0011138916015625, | |
| "learning_rate": 9.54457320834625e-07, | |
| "loss": 0.0, | |
| "reward": -0.046209411695599556, | |
| "reward_std": 0.27809127047657967, | |
| "rewards/cosine_scaled_reward": -0.19332470558583736, | |
| "rewards/format_reward": 0.22916667349636555, | |
| "step": 116 | |
| }, | |
| { | |
| "completion_length": 3279.4166870117188, | |
| "epoch": 0.1337142857142857, | |
| "grad_norm": 0.1657201051712036, | |
| "kl": 0.001247406005859375, | |
| "learning_rate": 9.530702921077358e-07, | |
| "loss": 0.0, | |
| "reward": -0.013633315684273839, | |
| "reward_std": 0.25296393781900406, | |
| "rewards/cosine_scaled_reward": -0.1371129583567381, | |
| "rewards/format_reward": 0.25000000186264515, | |
| "step": 117 | |
| }, | |
| { | |
| "completion_length": 3042.2709350585938, | |
| "epoch": 0.13485714285714287, | |
| "grad_norm": 0.15822374820709229, | |
| "kl": 0.0009875297546386719, | |
| "learning_rate": 9.516636183034564e-07, | |
| "loss": 0.0, | |
| "reward": 0.2582199349999428, | |
| "reward_std": 0.44860056787729263, | |
| "rewards/cosine_scaled_reward": 0.1183284455910325, | |
| "rewards/format_reward": 0.4375000074505806, | |
| "step": 118 | |
| }, | |
| { | |
| "completion_length": 2344.666748046875, | |
| "epoch": 0.136, | |
| "grad_norm": 0.23042984306812286, | |
| "kl": 0.0028333663940429688, | |
| "learning_rate": 9.502373679810839e-07, | |
| "loss": 0.0001, | |
| "reward": 0.19421357102692127, | |
| "reward_std": 0.2767038568854332, | |
| "rewards/cosine_scaled_reward": -0.02037617191672325, | |
| "rewards/format_reward": 0.6250000149011612, | |
| "step": 119 | |
| }, | |
| { | |
| "completion_length": 2476.979217529297, | |
| "epoch": 0.13714285714285715, | |
| "grad_norm": 0.20887106657028198, | |
| "kl": 0.0014963150024414062, | |
| "learning_rate": 9.487916106540465e-07, | |
| "loss": 0.0001, | |
| "reward": 0.04804867971688509, | |
| "reward_std": 0.2182541899383068, | |
| "rewards/cosine_scaled_reward": -0.19375143572688103, | |
| "rewards/format_reward": 0.5000000074505806, | |
| "step": 120 | |
| }, | |
| { | |
| "completion_length": 1820.5000915527344, | |
| "epoch": 0.1382857142857143, | |
| "grad_norm": 0.5606526732444763, | |
| "kl": 0.017534255981445312, | |
| "learning_rate": 9.473264167865171e-07, | |
| "loss": 0.0007, | |
| "reward": 0.20858957897871733, | |
| "reward_std": 0.2296605035662651, | |
| "rewards/cosine_scaled_reward": -0.03449038416147232, | |
| "rewards/format_reward": 0.7500000149011612, | |
| "step": 121 | |
| }, | |
| { | |
| "completion_length": 3009.3125610351562, | |
| "epoch": 0.13942857142857143, | |
| "grad_norm": 0.18202351033687592, | |
| "kl": 0.0010209083557128906, | |
| "learning_rate": 9.458418577899774e-07, | |
| "loss": 0.0, | |
| "reward": 0.17906883545219898, | |
| "reward_std": 0.37603622674942017, | |
| "rewards/cosine_scaled_reward": 0.019067944958806038, | |
| "rewards/format_reward": 0.4166666753590107, | |
| "step": 122 | |
| }, | |
| { | |
| "completion_length": 2798.5001220703125, | |
| "epoch": 0.14057142857142857, | |
| "grad_norm": 0.18842831254005432, | |
| "kl": 0.001125335693359375, | |
| "learning_rate": 9.443380060197385e-07, | |
| "loss": 0.0, | |
| "reward": 0.09704925492405891, | |
| "reward_std": 0.29832683503627777, | |
| "rewards/cosine_scaled_reward": -0.1078597791492939, | |
| "rewards/format_reward": 0.541666679084301, | |
| "step": 123 | |
| }, | |
| { | |
| "completion_length": 2319.5001220703125, | |
| "epoch": 0.1417142857142857, | |
| "grad_norm": 0.258215069770813, | |
| "kl": 0.005318641662597656, | |
| "learning_rate": 9.428149347714143e-07, | |
| "loss": 0.0002, | |
| "reward": 0.14139796933159232, | |
| "reward_std": 0.2474342044442892, | |
| "rewards/cosine_scaled_reward": -0.08365245535969734, | |
| "rewards/format_reward": 0.5416666865348816, | |
| "step": 124 | |
| }, | |
| { | |
| "completion_length": 2868.7083740234375, | |
| "epoch": 0.14285714285714285, | |
| "grad_norm": 0.18052953481674194, | |
| "kl": 0.0010223388671875, | |
| "learning_rate": 9.412727182773486e-07, | |
| "loss": 0.0, | |
| "reward": 0.0953734740614891, | |
| "reward_std": 0.2038922980427742, | |
| "rewards/cosine_scaled_reward": -0.021539516746997833, | |
| "rewards/format_reward": 0.3125000111758709, | |
| "step": 125 | |
| }, | |
| { | |
| "completion_length": 2733.9375610351562, | |
| "epoch": 0.144, | |
| "grad_norm": 0.1686813235282898, | |
| "kl": 0.0012359619140625, | |
| "learning_rate": 9.397114317029974e-07, | |
| "loss": 0.0, | |
| "reward": 0.1783338594250381, | |
| "reward_std": 0.22713003307580948, | |
| "rewards/cosine_scaled_reward": -0.019597443286329508, | |
| "rewards/format_reward": 0.5416666716337204, | |
| "step": 126 | |
| }, | |
| { | |
| "completion_length": 2906.4584045410156, | |
| "epoch": 0.14514285714285713, | |
| "grad_norm": 0.16627976298332214, | |
| "kl": 0.000850677490234375, | |
| "learning_rate": 9.381311511432658e-07, | |
| "loss": 0.0, | |
| "reward": 0.039196121506392956, | |
| "reward_std": 0.17883937060832977, | |
| "rewards/cosine_scaled_reward": -0.15023103915154934, | |
| "rewards/format_reward": 0.3958333432674408, | |
| "step": 127 | |
| }, | |
| { | |
| "completion_length": 2877.7916870117188, | |
| "epoch": 0.1462857142857143, | |
| "grad_norm": 0.19161036610603333, | |
| "kl": 0.0016307830810546875, | |
| "learning_rate": 9.36531953618799e-07, | |
| "loss": 0.0001, | |
| "reward": 0.16749756410717964, | |
| "reward_std": 0.32903048396110535, | |
| "rewards/cosine_scaled_reward": 0.06750535871833563, | |
| "rewards/format_reward": 0.3958333432674408, | |
| "step": 128 | |
| }, | |
| { | |
| "completion_length": 3303.9791870117188, | |
| "epoch": 0.14742857142857144, | |
| "grad_norm": 0.18518763780593872, | |
| "kl": 0.0013399124145507812, | |
| "learning_rate": 9.34913917072228e-07, | |
| "loss": 0.0001, | |
| "reward": -0.04355821633362211, | |
| "reward_std": 0.256184009835124, | |
| "rewards/cosine_scaled_reward": -0.1530312355607748, | |
| "rewards/format_reward": 0.20833334140479565, | |
| "step": 129 | |
| }, | |
| { | |
| "completion_length": 2958.9376220703125, | |
| "epoch": 0.14857142857142858, | |
| "grad_norm": 0.17572814226150513, | |
| "kl": 0.0014519691467285156, | |
| "learning_rate": 9.332771203643714e-07, | |
| "loss": 0.0001, | |
| "reward": -0.018080759793519974, | |
| "reward_std": 0.15979976579546928, | |
| "rewards/cosine_scaled_reward": -0.1620011143386364, | |
| "rewards/format_reward": 0.27083333767950535, | |
| "step": 130 | |
| }, | |
| { | |
| "completion_length": 2775.5625610351562, | |
| "epoch": 0.14971428571428572, | |
| "grad_norm": 0.25634628534317017, | |
| "kl": 0.0019540786743164062, | |
| "learning_rate": 9.316216432703916e-07, | |
| "loss": 0.0001, | |
| "reward": 0.16852700896561146, | |
| "reward_std": 0.32166776061058044, | |
| "rewards/cosine_scaled_reward": 0.031354669481515884, | |
| "rewards/format_reward": 0.41666667722165585, | |
| "step": 131 | |
| }, | |
| { | |
| "completion_length": 2601.541748046875, | |
| "epoch": 0.15085714285714286, | |
| "grad_norm": 0.17196296155452728, | |
| "kl": 0.0007419586181640625, | |
| "learning_rate": 9.299475664759068e-07, | |
| "loss": 0.0, | |
| "reward": 0.17289095558226109, | |
| "reward_std": 0.24414288625121117, | |
| "rewards/cosine_scaled_reward": 0.04381055012345314, | |
| "rewards/format_reward": 0.4583333432674408, | |
| "step": 132 | |
| }, | |
| { | |
| "completion_length": 3346.354248046875, | |
| "epoch": 0.152, | |
| "grad_norm": 0.17542175948619843, | |
| "kl": 0.0012054443359375, | |
| "learning_rate": 9.282549715730579e-07, | |
| "loss": 0.0, | |
| "reward": -0.061608402989804745, | |
| "reward_std": 0.1897377409040928, | |
| "rewards/cosine_scaled_reward": -0.17360613122582436, | |
| "rewards/format_reward": 0.18750000186264515, | |
| "step": 133 | |
| }, | |
| { | |
| "completion_length": 2794.9375610351562, | |
| "epoch": 0.15314285714285714, | |
| "grad_norm": 0.19970323145389557, | |
| "kl": 0.0014543533325195312, | |
| "learning_rate": 9.265439410565328e-07, | |
| "loss": 0.0001, | |
| "reward": 0.11259202286601067, | |
| "reward_std": 0.320014838129282, | |
| "rewards/cosine_scaled_reward": -0.044173166155815125, | |
| "rewards/format_reward": 0.4375000149011612, | |
| "step": 134 | |
| }, | |
| { | |
| "completion_length": 1900.5000457763672, | |
| "epoch": 0.15428571428571428, | |
| "grad_norm": 0.23359762132167816, | |
| "kl": 0.001819610595703125, | |
| "learning_rate": 9.248145583195447e-07, | |
| "loss": 0.0001, | |
| "reward": 0.2668313942849636, | |
| "reward_std": 0.28612423315644264, | |
| "rewards/cosine_scaled_reward": 0.11550819734111428, | |
| "rewards/format_reward": 0.625, | |
| "step": 135 | |
| }, | |
| { | |
| "completion_length": 2652.4584350585938, | |
| "epoch": 0.15542857142857142, | |
| "grad_norm": 0.18795403838157654, | |
| "kl": 0.0008897781372070312, | |
| "learning_rate": 9.230669076497687e-07, | |
| "loss": 0.0, | |
| "reward": 0.21900932490825653, | |
| "reward_std": 0.30797392688691616, | |
| "rewards/cosine_scaled_reward": 0.07143217464908957, | |
| "rewards/format_reward": 0.5208333432674408, | |
| "step": 136 | |
| }, | |
| { | |
| "completion_length": 2986.6041870117188, | |
| "epoch": 0.15657142857142858, | |
| "grad_norm": 0.18147197365760803, | |
| "kl": 0.00103759765625, | |
| "learning_rate": 9.213010742252327e-07, | |
| "loss": 0.0, | |
| "reward": -0.018022691132500768, | |
| "reward_std": 0.23599210940301418, | |
| "rewards/cosine_scaled_reward": -0.1875150203704834, | |
| "rewards/format_reward": 0.3750000149011612, | |
| "step": 137 | |
| }, | |
| { | |
| "completion_length": 2430.2500610351562, | |
| "epoch": 0.15771428571428572, | |
| "grad_norm": 0.1941184550523758, | |
| "kl": 0.0010514259338378906, | |
| "learning_rate": 9.195171441101668e-07, | |
| "loss": 0.0, | |
| "reward": 0.16211648099124432, | |
| "reward_std": 0.2540963143110275, | |
| "rewards/cosine_scaled_reward": -0.08601415157318115, | |
| "rewards/format_reward": 0.645833358168602, | |
| "step": 138 | |
| }, | |
| { | |
| "completion_length": 2956.3334350585938, | |
| "epoch": 0.15885714285714286, | |
| "grad_norm": 0.19715362787246704, | |
| "kl": 0.001739501953125, | |
| "learning_rate": 9.177152042508077e-07, | |
| "loss": 0.0001, | |
| "reward": 0.098558459430933, | |
| "reward_std": 0.29911189526319504, | |
| "rewards/cosine_scaled_reward": -0.12457533576525748, | |
| "rewards/format_reward": 0.4791666753590107, | |
| "step": 139 | |
| }, | |
| { | |
| "completion_length": 2963.979248046875, | |
| "epoch": 0.16, | |
| "grad_norm": 0.3068901002407074, | |
| "kl": 0.00263214111328125, | |
| "learning_rate": 9.158953424711624e-07, | |
| "loss": 0.0001, | |
| "reward": 0.05242377519607544, | |
| "reward_std": 0.2736871726810932, | |
| "rewards/cosine_scaled_reward": -0.06892253458499908, | |
| "rewards/format_reward": 0.2708333395421505, | |
| "step": 140 | |
| }, | |
| { | |
| "completion_length": 2732.1875915527344, | |
| "epoch": 0.16114285714285714, | |
| "grad_norm": 0.20098887383937836, | |
| "kl": 0.0020694732666015625, | |
| "learning_rate": 9.140576474687263e-07, | |
| "loss": 0.0001, | |
| "reward": 0.09720400208607316, | |
| "reward_std": 0.3277590870857239, | |
| "rewards/cosine_scaled_reward": -0.12496923531580251, | |
| "rewards/format_reward": 0.5000000149011612, | |
| "step": 141 | |
| }, | |
| { | |
| "completion_length": 2843.6875915527344, | |
| "epoch": 0.16228571428571428, | |
| "grad_norm": 0.17572563886642456, | |
| "kl": 0.0017600059509277344, | |
| "learning_rate": 9.122022088101613e-07, | |
| "loss": 0.0001, | |
| "reward": 0.16560933645814657, | |
| "reward_std": 0.31519390642642975, | |
| "rewards/cosine_scaled_reward": -0.04391762427985668, | |
| "rewards/format_reward": 0.5416666716337204, | |
| "step": 142 | |
| }, | |
| { | |
| "completion_length": 2583.8751220703125, | |
| "epoch": 0.16342857142857142, | |
| "grad_norm": 0.2704511284828186, | |
| "kl": 0.00214385986328125, | |
| "learning_rate": 9.103291169269299e-07, | |
| "loss": 0.0001, | |
| "reward": 0.08706286805681884, | |
| "reward_std": 0.27615340799093246, | |
| "rewards/cosine_scaled_reward": -0.11747677624225616, | |
| "rewards/format_reward": 0.4583333507180214, | |
| "step": 143 | |
| }, | |
| { | |
| "completion_length": 2858.166748046875, | |
| "epoch": 0.16457142857142856, | |
| "grad_norm": 0.24892909824848175, | |
| "kl": 0.0025987625122070312, | |
| "learning_rate": 9.084384631108882e-07, | |
| "loss": 0.0001, | |
| "reward": 0.0692533003166318, | |
| "reward_std": 0.26655495166778564, | |
| "rewards/cosine_scaled_reward": -0.0823652264662087, | |
| "rewards/format_reward": 0.3541666753590107, | |
| "step": 144 | |
| }, | |
| { | |
| "completion_length": 2059.541748046875, | |
| "epoch": 0.1657142857142857, | |
| "grad_norm": 0.24266168475151062, | |
| "kl": 0.004192352294921875, | |
| "learning_rate": 9.065303395098358e-07, | |
| "loss": 0.0002, | |
| "reward": 0.25175508856773376, | |
| "reward_std": 0.22561134956777096, | |
| "rewards/cosine_scaled_reward": 0.03231249749660492, | |
| "rewards/format_reward": 0.6666666828095913, | |
| "step": 145 | |
| }, | |
| { | |
| "completion_length": 2468.1876220703125, | |
| "epoch": 0.16685714285714287, | |
| "grad_norm": 0.19397643208503723, | |
| "kl": 0.0013666152954101562, | |
| "learning_rate": 9.046048391230247e-07, | |
| "loss": 0.0001, | |
| "reward": 0.06067563686519861, | |
| "reward_std": 0.24036981165409088, | |
| "rewards/cosine_scaled_reward": -0.20845083706080914, | |
| "rewards/format_reward": 0.5625000149011612, | |
| "step": 146 | |
| }, | |
| { | |
| "completion_length": 2808.6459350585938, | |
| "epoch": 0.168, | |
| "grad_norm": 0.20159922540187836, | |
| "kl": 0.0025682449340820312, | |
| "learning_rate": 9.026620557966279e-07, | |
| "loss": 0.0001, | |
| "reward": 0.11753404047340155, | |
| "reward_std": 0.2573778033256531, | |
| "rewards/cosine_scaled_reward": -0.04415273433551192, | |
| "rewards/format_reward": 0.4583333544433117, | |
| "step": 147 | |
| }, | |
| { | |
| "completion_length": 2150.9791870117188, | |
| "epoch": 0.16914285714285715, | |
| "grad_norm": 0.21388837695121765, | |
| "kl": 0.0014133453369140625, | |
| "learning_rate": 9.007020842191634e-07, | |
| "loss": 0.0001, | |
| "reward": 0.1763187162578106, | |
| "reward_std": 0.22804216668009758, | |
| "rewards/cosine_scaled_reward": -0.048267703503370285, | |
| "rewards/format_reward": 0.6458333432674408, | |
| "step": 148 | |
| }, | |
| { | |
| "completion_length": 2555.4584350585938, | |
| "epoch": 0.1702857142857143, | |
| "grad_norm": 0.17537924647331238, | |
| "kl": 0.0022039413452148438, | |
| "learning_rate": 8.987250199168808e-07, | |
| "loss": 0.0001, | |
| "reward": 0.2855038889683783, | |
| "reward_std": 0.28325819969177246, | |
| "rewards/cosine_scaled_reward": 0.10880730301141739, | |
| "rewards/format_reward": 0.6458333432674408, | |
| "step": 149 | |
| }, | |
| { | |
| "completion_length": 2453.854248046875, | |
| "epoch": 0.17142857142857143, | |
| "grad_norm": 0.276807576417923, | |
| "kl": 0.0028324127197265625, | |
| "learning_rate": 8.967309592491052e-07, | |
| "loss": 0.0001, | |
| "reward": 0.18719099462032318, | |
| "reward_std": 0.3572011888027191, | |
| "rewards/cosine_scaled_reward": 0.03765287483111024, | |
| "rewards/format_reward": 0.5000000111758709, | |
| "step": 150 | |
| }, | |
| { | |
| "completion_length": 2566.5209350585938, | |
| "epoch": 0.17257142857142857, | |
| "grad_norm": 0.23010526597499847, | |
| "kl": 0.003368377685546875, | |
| "learning_rate": 8.9471999940354e-07, | |
| "loss": 0.0001, | |
| "reward": 0.18735797423869371, | |
| "reward_std": 0.3481326140463352, | |
| "rewards/cosine_scaled_reward": 0.039755554869771004, | |
| "rewards/format_reward": 0.5000000149011612, | |
| "step": 151 | |
| }, | |
| { | |
| "completion_length": 2584.2500610351562, | |
| "epoch": 0.1737142857142857, | |
| "grad_norm": 0.2767723500728607, | |
| "kl": 0.0019273757934570312, | |
| "learning_rate": 8.926922383915315e-07, | |
| "loss": 0.0001, | |
| "reward": 0.08376874192617834, | |
| "reward_std": 0.29247722774744034, | |
| "rewards/cosine_scaled_reward": -0.13108721375465393, | |
| "rewards/format_reward": 0.4375000111758709, | |
| "step": 152 | |
| }, | |
| { | |
| "completion_length": 2822.8959350585938, | |
| "epoch": 0.17485714285714285, | |
| "grad_norm": 0.2643832266330719, | |
| "kl": 0.0040264129638671875, | |
| "learning_rate": 8.906477750432903e-07, | |
| "loss": 0.0002, | |
| "reward": -0.010647638468071818, | |
| "reward_std": 0.2258121222257614, | |
| "rewards/cosine_scaled_reward": -0.21952056884765625, | |
| "rewards/format_reward": 0.4166666753590107, | |
| "step": 153 | |
| }, | |
| { | |
| "completion_length": 3242.0625610351562, | |
| "epoch": 0.176, | |
| "grad_norm": 0.2574949264526367, | |
| "kl": 0.0021734237670898438, | |
| "learning_rate": 8.88586709003076e-07, | |
| "loss": 0.0001, | |
| "reward": 0.11165983291903103, | |
| "reward_std": 0.27420539781451225, | |
| "rewards/cosine_scaled_reward": -0.013817982282489538, | |
| "rewards/format_reward": 0.35416667722165585, | |
| "step": 154 | |
| }, | |
| { | |
| "completion_length": 2522.2084350585938, | |
| "epoch": 0.17714285714285713, | |
| "grad_norm": 0.21373054385185242, | |
| "kl": 0.0026683807373046875, | |
| "learning_rate": 8.865091407243394e-07, | |
| "loss": 0.0001, | |
| "reward": 0.2718268558382988, | |
| "reward_std": 0.32829324156045914, | |
| "rewards/cosine_scaled_reward": 0.16317287646234035, | |
| "rewards/format_reward": 0.47916667722165585, | |
| "step": 155 | |
| }, | |
| { | |
| "completion_length": 2815.104217529297, | |
| "epoch": 0.1782857142857143, | |
| "grad_norm": 0.1889781653881073, | |
| "kl": 0.00186920166015625, | |
| "learning_rate": 8.844151714648274e-07, | |
| "loss": 0.0001, | |
| "reward": 0.10164620354771614, | |
| "reward_std": 0.23263414576649666, | |
| "rewards/cosine_scaled_reward": -0.04785778373479843, | |
| "rewards/format_reward": 0.37500000558793545, | |
| "step": 156 | |
| }, | |
| { | |
| "completion_length": 2579.416717529297, | |
| "epoch": 0.17942857142857144, | |
| "grad_norm": 0.21700051426887512, | |
| "kl": 0.00357818603515625, | |
| "learning_rate": 8.823049032816478e-07, | |
| "loss": 0.0001, | |
| "reward": 0.0624239519238472, | |
| "reward_std": 0.19695715978741646, | |
| "rewards/cosine_scaled_reward": -0.1928333044052124, | |
| "rewards/format_reward": 0.5208333432674408, | |
| "step": 157 | |
| }, | |
| { | |
| "completion_length": 2961.1875, | |
| "epoch": 0.18057142857142858, | |
| "grad_norm": 0.19288885593414307, | |
| "kl": 0.0033111572265625, | |
| "learning_rate": 8.801784390262943e-07, | |
| "loss": 0.0001, | |
| "reward": 0.19672009721398354, | |
| "reward_std": 0.34154168516397476, | |
| "rewards/cosine_scaled_reward": 0.055924009531736374, | |
| "rewards/format_reward": 0.4166666865348816, | |
| "step": 158 | |
| }, | |
| { | |
| "completion_length": 2796.3959350585938, | |
| "epoch": 0.18171428571428572, | |
| "grad_norm": 0.21284666657447815, | |
| "kl": 0.003772735595703125, | |
| "learning_rate": 8.780358823396352e-07, | |
| "loss": 0.0002, | |
| "reward": -0.014586111530661583, | |
| "reward_std": 0.22132756188511848, | |
| "rewards/cosine_scaled_reward": -0.20993047207593918, | |
| "rewards/format_reward": 0.3541666828095913, | |
| "step": 159 | |
| }, | |
| { | |
| "completion_length": 2681.583465576172, | |
| "epoch": 0.18285714285714286, | |
| "grad_norm": 0.19720213115215302, | |
| "kl": 0.00548553466796875, | |
| "learning_rate": 8.758773376468604e-07, | |
| "loss": 0.0002, | |
| "reward": 0.14375573489814997, | |
| "reward_std": 0.28298795223236084, | |
| "rewards/cosine_scaled_reward": -0.02817438170313835, | |
| "rewards/format_reward": 0.4583333544433117, | |
| "step": 160 | |
| }, | |
| { | |
| "completion_length": 2425.5000610351562, | |
| "epoch": 0.184, | |
| "grad_norm": 0.24934129416942596, | |
| "kl": 0.0036067962646484375, | |
| "learning_rate": 8.737029101523929e-07, | |
| "loss": 0.0001, | |
| "reward": 0.23228778317570686, | |
| "reward_std": 0.30961449444293976, | |
| "rewards/cosine_scaled_reward": 0.004279725253582001, | |
| "rewards/format_reward": 0.6041666716337204, | |
| "step": 161 | |
| }, | |
| { | |
| "completion_length": 2979.0625610351562, | |
| "epoch": 0.18514285714285714, | |
| "grad_norm": 0.2740453779697418, | |
| "kl": 0.006992340087890625, | |
| "learning_rate": 8.715127058347614e-07, | |
| "loss": 0.0003, | |
| "reward": 0.021589869633316994, | |
| "reward_std": 0.21139143407344818, | |
| "rewards/cosine_scaled_reward": -0.11643525585532188, | |
| "rewards/format_reward": 0.3125000074505806, | |
| "step": 162 | |
| }, | |
| { | |
| "completion_length": 2308.1043090820312, | |
| "epoch": 0.18628571428571428, | |
| "grad_norm": 0.19304688274860382, | |
| "kl": 0.0030364990234375, | |
| "learning_rate": 8.693068314414344e-07, | |
| "loss": 0.0001, | |
| "reward": 0.3190014772117138, | |
| "reward_std": 0.23985935747623444, | |
| "rewards/cosine_scaled_reward": 0.12445038510486484, | |
| "rewards/format_reward": 0.625, | |
| "step": 163 | |
| }, | |
| { | |
| "completion_length": 2485.5834045410156, | |
| "epoch": 0.18742857142857142, | |
| "grad_norm": 0.20876161754131317, | |
| "kl": 0.004638671875, | |
| "learning_rate": 8.670853944836176e-07, | |
| "loss": 0.0002, | |
| "reward": 0.1634826324880123, | |
| "reward_std": 0.27906645834445953, | |
| "rewards/cosine_scaled_reward": -0.0056002295459620655, | |
| "rewards/format_reward": 0.5416666716337204, | |
| "step": 164 | |
| }, | |
| { | |
| "completion_length": 2515.916748046875, | |
| "epoch": 0.18857142857142858, | |
| "grad_norm": 0.2549312114715576, | |
| "kl": 0.00457000732421875, | |
| "learning_rate": 8.648485032310144e-07, | |
| "loss": 0.0002, | |
| "reward": 0.07781760673969984, | |
| "reward_std": 0.27216625958681107, | |
| "rewards/cosine_scaled_reward": -0.13273300230503082, | |
| "rewards/format_reward": 0.5000000186264515, | |
| "step": 165 | |
| }, | |
| { | |
| "completion_length": 2622.8333740234375, | |
| "epoch": 0.18971428571428572, | |
| "grad_norm": 0.4316038191318512, | |
| "kl": 0.0029125213623046875, | |
| "learning_rate": 8.625962667065487e-07, | |
| "loss": 0.0001, | |
| "reward": 0.05336676351726055, | |
| "reward_std": 0.2708708755671978, | |
| "rewards/cosine_scaled_reward": -0.1293769534677267, | |
| "rewards/format_reward": 0.45833334140479565, | |
| "step": 166 | |
| }, | |
| { | |
| "completion_length": 2269.7500610351562, | |
| "epoch": 0.19085714285714286, | |
| "grad_norm": 0.26302385330200195, | |
| "kl": 0.003326416015625, | |
| "learning_rate": 8.603287946810513e-07, | |
| "loss": 0.0001, | |
| "reward": 0.19086750224232674, | |
| "reward_std": 0.29077377915382385, | |
| "rewards/cosine_scaled_reward": -0.04062679596245289, | |
| "rewards/format_reward": 0.6666667014360428, | |
| "step": 167 | |
| }, | |
| { | |
| "completion_length": 2815.8959350585938, | |
| "epoch": 0.192, | |
| "grad_norm": 0.18967334926128387, | |
| "kl": 0.004009246826171875, | |
| "learning_rate": 8.580461976679099e-07, | |
| "loss": 0.0002, | |
| "reward": 0.18181406240910292, | |
| "reward_std": 0.3442248087376356, | |
| "rewards/cosine_scaled_reward": -0.018053213134407997, | |
| "rewards/format_reward": 0.4583333432674408, | |
| "step": 168 | |
| }, | |
| { | |
| "completion_length": 1723.3125305175781, | |
| "epoch": 0.19314285714285714, | |
| "grad_norm": 0.20175939798355103, | |
| "kl": 0.003383636474609375, | |
| "learning_rate": 8.557485869176825e-07, | |
| "loss": 0.0001, | |
| "reward": 0.5135928094387054, | |
| "reward_std": 0.2722737267613411, | |
| "rewards/cosine_scaled_reward": 0.3224016949534416, | |
| "rewards/format_reward": 0.875, | |
| "step": 169 | |
| }, | |
| { | |
| "completion_length": 2422.3125915527344, | |
| "epoch": 0.19428571428571428, | |
| "grad_norm": 0.20338761806488037, | |
| "kl": 0.003936767578125, | |
| "learning_rate": 8.534360744126753e-07, | |
| "loss": 0.0002, | |
| "reward": 0.10243179090321064, | |
| "reward_std": 0.17919140681624413, | |
| "rewards/cosine_scaled_reward": -0.0379374660551548, | |
| "rewards/format_reward": 0.41666667349636555, | |
| "step": 170 | |
| }, | |
| { | |
| "completion_length": 2499.041748046875, | |
| "epoch": 0.19542857142857142, | |
| "grad_norm": 0.2039409875869751, | |
| "kl": 0.0029468536376953125, | |
| "learning_rate": 8.511087728614862e-07, | |
| "loss": 0.0001, | |
| "reward": 0.26387462578713894, | |
| "reward_std": 0.2428448162972927, | |
| "rewards/cosine_scaled_reward": 0.146632214426063, | |
| "rewards/format_reward": 0.5208333432674408, | |
| "step": 171 | |
| }, | |
| { | |
| "completion_length": 2817.1458740234375, | |
| "epoch": 0.19657142857142856, | |
| "grad_norm": 0.23230940103530884, | |
| "kl": 0.00603485107421875, | |
| "learning_rate": 8.487667956935087e-07, | |
| "loss": 0.0002, | |
| "reward": 0.18164719082415104, | |
| "reward_std": 0.2874234914779663, | |
| "rewards/cosine_scaled_reward": 0.005057951435446739, | |
| "rewards/format_reward": 0.3958333358168602, | |
| "step": 172 | |
| }, | |
| { | |
| "completion_length": 1697.0626068115234, | |
| "epoch": 0.1977142857142857, | |
| "grad_norm": 0.25889357924461365, | |
| "kl": 0.004001617431640625, | |
| "learning_rate": 8.464102570534061e-07, | |
| "loss": 0.0002, | |
| "reward": 0.16589125618338585, | |
| "reward_std": 0.21044109761714935, | |
| "rewards/cosine_scaled_reward": -0.07962662167847157, | |
| "rewards/format_reward": 0.7291666716337204, | |
| "step": 173 | |
| }, | |
| { | |
| "completion_length": 2253.041778564453, | |
| "epoch": 0.19885714285714284, | |
| "grad_norm": 0.22761958837509155, | |
| "kl": 0.0071430206298828125, | |
| "learning_rate": 8.440392717955475e-07, | |
| "loss": 0.0003, | |
| "reward": 0.2043942455202341, | |
| "reward_std": 0.3179479092359543, | |
| "rewards/cosine_scaled_reward": -0.06055827997624874, | |
| "rewards/format_reward": 0.6875000149011612, | |
| "step": 174 | |
| }, | |
| { | |
| "completion_length": 2658.3750610351562, | |
| "epoch": 0.2, | |
| "grad_norm": 0.18005910515785217, | |
| "kl": 0.004535675048828125, | |
| "learning_rate": 8.416539554784089e-07, | |
| "loss": 0.0002, | |
| "reward": 0.2105781240388751, | |
| "reward_std": 0.23263323679566383, | |
| "rewards/cosine_scaled_reward": 0.08791563287377357, | |
| "rewards/format_reward": 0.5000000149011612, | |
| "step": 175 | |
| }, | |
| { | |
| "completion_length": 2122.7500610351562, | |
| "epoch": 0.20114285714285715, | |
| "grad_norm": 0.23283635079860687, | |
| "kl": 0.0038604736328125, | |
| "learning_rate": 8.392544243589427e-07, | |
| "loss": 0.0002, | |
| "reward": 0.15146937035024166, | |
| "reward_std": 0.35297995805740356, | |
| "rewards/cosine_scaled_reward": -0.09888035990297794, | |
| "rewards/format_reward": 0.6458333432674408, | |
| "step": 176 | |
| }, | |
| { | |
| "completion_length": 2699.166748046875, | |
| "epoch": 0.2022857142857143, | |
| "grad_norm": 0.2360665202140808, | |
| "kl": 0.006439208984375, | |
| "learning_rate": 8.368407953869103e-07, | |
| "loss": 0.0003, | |
| "reward": 0.12413773685693741, | |
| "reward_std": 0.28092398308217525, | |
| "rewards/cosine_scaled_reward": -0.0860077440738678, | |
| "rewards/format_reward": 0.5208333488553762, | |
| "step": 177 | |
| }, | |
| { | |
| "completion_length": 2637.9375610351562, | |
| "epoch": 0.20342857142857143, | |
| "grad_norm": 0.2292148619890213, | |
| "kl": 0.00957489013671875, | |
| "learning_rate": 8.344131861991828e-07, | |
| "loss": 0.0004, | |
| "reward": 0.15966340154409409, | |
| "reward_std": 0.31500688195228577, | |
| "rewards/cosine_scaled_reward": -0.0060564366285689175, | |
| "rewards/format_reward": 0.4583333544433117, | |
| "step": 178 | |
| }, | |
| { | |
| "completion_length": 2525.2916870117188, | |
| "epoch": 0.20457142857142857, | |
| "grad_norm": 0.3491784930229187, | |
| "kl": 0.005161285400390625, | |
| "learning_rate": 8.319717151140072e-07, | |
| "loss": 0.0002, | |
| "reward": 0.09528316929936409, | |
| "reward_std": 0.19896456971764565, | |
| "rewards/cosine_scaled_reward": -0.09036011155694723, | |
| "rewards/format_reward": 0.4583333432674408, | |
| "step": 179 | |
| }, | |
| { | |
| "completion_length": 2001.7292175292969, | |
| "epoch": 0.2057142857142857, | |
| "grad_norm": 0.24371346831321716, | |
| "kl": 0.006465911865234375, | |
| "learning_rate": 8.295165011252396e-07, | |
| "loss": 0.0003, | |
| "reward": 0.25116462633013725, | |
| "reward_std": 0.2804570086300373, | |
| "rewards/cosine_scaled_reward": 0.06461600167676806, | |
| "rewards/format_reward": 0.6458333432674408, | |
| "step": 180 | |
| }, | |
| { | |
| "completion_length": 2886.729248046875, | |
| "epoch": 0.20685714285714285, | |
| "grad_norm": 0.18016843497753143, | |
| "kl": 0.006114959716796875, | |
| "learning_rate": 8.270476638965461e-07, | |
| "loss": 0.0002, | |
| "reward": 0.060142465867102146, | |
| "reward_std": 0.20902230963110924, | |
| "rewards/cosine_scaled_reward": -0.09921930730342865, | |
| "rewards/format_reward": 0.354166679084301, | |
| "step": 181 | |
| }, | |
| { | |
| "completion_length": 2052.562530517578, | |
| "epoch": 0.208, | |
| "grad_norm": 0.1674763262271881, | |
| "kl": 0.0023097991943359375, | |
| "learning_rate": 8.245653237555705e-07, | |
| "loss": 0.0001, | |
| "reward": 0.2553917448967695, | |
| "reward_std": 0.3303040415048599, | |
| "rewards/cosine_scaled_reward": 0.018131352961063385, | |
| "rewards/format_reward": 0.6875000149011612, | |
| "step": 182 | |
| }, | |
| { | |
| "completion_length": 1718.7709045410156, | |
| "epoch": 0.20914285714285713, | |
| "grad_norm": 0.22614581882953644, | |
| "kl": 0.006946563720703125, | |
| "learning_rate": 8.220696016880687e-07, | |
| "loss": 0.0003, | |
| "reward": 0.3155193105340004, | |
| "reward_std": 0.32469745725393295, | |
| "rewards/cosine_scaled_reward": 0.015594778582453728, | |
| "rewards/format_reward": 0.8750000149011612, | |
| "step": 183 | |
| }, | |
| { | |
| "completion_length": 2556.541748046875, | |
| "epoch": 0.2102857142857143, | |
| "grad_norm": 0.22962883114814758, | |
| "kl": 0.005680084228515625, | |
| "learning_rate": 8.195606193320136e-07, | |
| "loss": 0.0002, | |
| "reward": 0.05491369962692261, | |
| "reward_std": 0.23178402706980705, | |
| "rewards/cosine_scaled_reward": -0.12312540411949158, | |
| "rewards/format_reward": 0.43750001303851604, | |
| "step": 184 | |
| }, | |
| { | |
| "completion_length": 2175.2916717529297, | |
| "epoch": 0.21142857142857144, | |
| "grad_norm": 0.21504995226860046, | |
| "kl": 0.00449371337890625, | |
| "learning_rate": 8.170384989716657e-07, | |
| "loss": 0.0002, | |
| "reward": 0.02204909175634384, | |
| "reward_std": 0.1608840376138687, | |
| "rewards/cosine_scaled_reward": -0.22503361850976944, | |
| "rewards/format_reward": 0.5416666772216558, | |
| "step": 185 | |
| }, | |
| { | |
| "completion_length": 2807.125, | |
| "epoch": 0.21257142857142858, | |
| "grad_norm": 0.17259298264980316, | |
| "kl": 0.00559234619140625, | |
| "learning_rate": 8.145033635316128e-07, | |
| "loss": 0.0002, | |
| "reward": 0.17079078778624535, | |
| "reward_std": 0.22075738571584225, | |
| "rewards/cosine_scaled_reward": 0.029174381867051125, | |
| "rewards/format_reward": 0.45833333395421505, | |
| "step": 186 | |
| }, | |
| { | |
| "completion_length": 1926.4375915527344, | |
| "epoch": 0.21371428571428572, | |
| "grad_norm": 0.2804771363735199, | |
| "kl": 0.00753021240234375, | |
| "learning_rate": 8.119553365707802e-07, | |
| "loss": 0.0003, | |
| "reward": 0.21946877613663673, | |
| "reward_std": 0.25319264084100723, | |
| "rewards/cosine_scaled_reward": -0.04631359688937664, | |
| "rewards/format_reward": 0.708333358168602, | |
| "step": 187 | |
| }, | |
| { | |
| "completion_length": 2948.3959045410156, | |
| "epoch": 0.21485714285714286, | |
| "grad_norm": 0.18056756258010864, | |
| "kl": 0.005832672119140625, | |
| "learning_rate": 8.093945422764069e-07, | |
| "loss": 0.0002, | |
| "reward": 0.04070337675511837, | |
| "reward_std": 0.2970491647720337, | |
| "rewards/cosine_scaled_reward": -0.12244150042533875, | |
| "rewards/format_reward": 0.3333333358168602, | |
| "step": 188 | |
| }, | |
| { | |
| "completion_length": 2054.541717529297, | |
| "epoch": 0.216, | |
| "grad_norm": 0.27006274461746216, | |
| "kl": 0.004894256591796875, | |
| "learning_rate": 8.068211054579943e-07, | |
| "loss": 0.0002, | |
| "reward": 0.13675427064299583, | |
| "reward_std": 0.22908888384699821, | |
| "rewards/cosine_scaled_reward": -0.11930672498419881, | |
| "rewards/format_reward": 0.6250000149011612, | |
| "step": 189 | |
| }, | |
| { | |
| "completion_length": 1816.8125305175781, | |
| "epoch": 0.21714285714285714, | |
| "grad_norm": 0.2942986488342285, | |
| "kl": 0.004177093505859375, | |
| "learning_rate": 8.04235151541222e-07, | |
| "loss": 0.0002, | |
| "reward": 0.2299617975950241, | |
| "reward_std": 0.2523540575057268, | |
| "rewards/cosine_scaled_reward": -0.02879966050386429, | |
| "rewards/format_reward": 0.6875000260770321, | |
| "step": 190 | |
| }, | |
| { | |
| "completion_length": 1893.4584045410156, | |
| "epoch": 0.21828571428571428, | |
| "grad_norm": 0.24997583031654358, | |
| "kl": 0.005523681640625, | |
| "learning_rate": 8.01636806561836e-07, | |
| "loss": 0.0002, | |
| "reward": 0.21511027216911316, | |
| "reward_std": 0.3751527927815914, | |
| "rewards/cosine_scaled_reward": -0.03335425350815058, | |
| "rewards/format_reward": 0.6250000298023224, | |
| "step": 191 | |
| }, | |
| { | |
| "completion_length": 2557.3959045410156, | |
| "epoch": 0.21942857142857142, | |
| "grad_norm": 0.18185892701148987, | |
| "kl": 0.0045013427734375, | |
| "learning_rate": 7.990261971595048e-07, | |
| "loss": 0.0002, | |
| "reward": 0.1383908847346902, | |
| "reward_std": 0.2589973732829094, | |
| "rewards/cosine_scaled_reward": -0.13022802397608757, | |
| "rewards/format_reward": 0.6041666716337204, | |
| "step": 192 | |
| }, | |
| { | |
| "completion_length": 3001.375, | |
| "epoch": 0.22057142857142858, | |
| "grad_norm": 0.18063190579414368, | |
| "kl": 0.005649566650390625, | |
| "learning_rate": 7.964034505716476e-07, | |
| "loss": 0.0002, | |
| "reward": -0.041762998211197555, | |
| "reward_std": 0.20236865058541298, | |
| "rewards/cosine_scaled_reward": -0.23694830760359764, | |
| "rewards/format_reward": 0.3541666679084301, | |
| "step": 193 | |
| }, | |
| { | |
| "completion_length": 2589.5626220703125, | |
| "epoch": 0.22171428571428572, | |
| "grad_norm": 0.21290159225463867, | |
| "kl": 0.0049457550048828125, | |
| "learning_rate": 7.93768694627233e-07, | |
| "loss": 0.0002, | |
| "reward": 0.3359889704734087, | |
| "reward_std": 0.3363325707614422, | |
| "rewards/cosine_scaled_reward": 0.17966394126415253, | |
| "rewards/format_reward": 0.6458333432674408, | |
| "step": 194 | |
| }, | |
| { | |
| "completion_length": 2308.8333740234375, | |
| "epoch": 0.22285714285714286, | |
| "grad_norm": 0.3455241620540619, | |
| "kl": 0.004795074462890625, | |
| "learning_rate": 7.911220577405484e-07, | |
| "loss": 0.0002, | |
| "reward": 0.15757257863879204, | |
| "reward_std": 0.34646955132484436, | |
| "rewards/cosine_scaled_reward": -0.1573782730847597, | |
| "rewards/format_reward": 0.645833358168602, | |
| "step": 195 | |
| }, | |
| { | |
| "completion_length": 3186.5000610351562, | |
| "epoch": 0.224, | |
| "grad_norm": 0.1627224236726761, | |
| "kl": 0.00597381591796875, | |
| "learning_rate": 7.884636689049422e-07, | |
| "loss": 0.0002, | |
| "reward": 0.07601241301745176, | |
| "reward_std": 0.24550564214587212, | |
| "rewards/cosine_scaled_reward": -0.06591853499412537, | |
| "rewards/format_reward": 0.3750000111758709, | |
| "step": 196 | |
| }, | |
| { | |
| "completion_length": 1489.6250610351562, | |
| "epoch": 0.22514285714285714, | |
| "grad_norm": 0.25338006019592285, | |
| "kl": 0.00522613525390625, | |
| "learning_rate": 7.857936576865356e-07, | |
| "loss": 0.0002, | |
| "reward": 0.2503656707704067, | |
| "reward_std": 0.33918196335434914, | |
| "rewards/cosine_scaled_reward": -0.03431052714586258, | |
| "rewards/format_reward": 0.8125000149011612, | |
| "step": 197 | |
| }, | |
| { | |
| "completion_length": 2260.979217529297, | |
| "epoch": 0.22628571428571428, | |
| "grad_norm": 0.21298959851264954, | |
| "kl": 0.0067291259765625, | |
| "learning_rate": 7.831121542179086e-07, | |
| "loss": 0.0003, | |
| "reward": 0.13585240580141544, | |
| "reward_std": 0.25526952743530273, | |
| "rewards/cosine_scaled_reward": -0.08993709087371826, | |
| "rewards/format_reward": 0.6458333358168602, | |
| "step": 198 | |
| }, | |
| { | |
| "completion_length": 2225.2084350585938, | |
| "epoch": 0.22742857142857142, | |
| "grad_norm": 0.2402978390455246, | |
| "kl": 0.007724761962890625, | |
| "learning_rate": 7.804192891917571e-07, | |
| "loss": 0.0003, | |
| "reward": 0.14825211465358734, | |
| "reward_std": 0.25423312187194824, | |
| "rewards/cosine_scaled_reward": -0.15464808233082294, | |
| "rewards/format_reward": 0.7291666865348816, | |
| "step": 199 | |
| }, | |
| { | |
| "completion_length": 1518.4375610351562, | |
| "epoch": 0.22857142857142856, | |
| "grad_norm": 0.2037964165210724, | |
| "kl": 0.0044708251953125, | |
| "learning_rate": 7.777151938545235e-07, | |
| "loss": 0.0002, | |
| "reward": 0.30662217177450657, | |
| "reward_std": 0.2782711200416088, | |
| "rewards/cosine_scaled_reward": 0.023611009120941162, | |
| "rewards/format_reward": 0.8541666716337204, | |
| "step": 200 | |
| }, | |
| { | |
| "completion_length": 2185.104217529297, | |
| "epoch": 0.2297142857142857, | |
| "grad_norm": 0.22064223885536194, | |
| "kl": 0.00380706787109375, | |
| "learning_rate": 7.75e-07, | |
| "loss": 0.0002, | |
| "reward": 0.4745171591639519, | |
| "reward_std": 0.3512828201055527, | |
| "rewards/cosine_scaled_reward": 0.32128068804740906, | |
| "rewards/format_reward": 0.770833358168602, | |
| "step": 201 | |
| }, | |
| { | |
| "completion_length": 1733.2291870117188, | |
| "epoch": 0.23085714285714284, | |
| "grad_norm": 0.21100802719593048, | |
| "kl": 0.004169464111328125, | |
| "learning_rate": 7.72273839962904e-07, | |
| "loss": 0.0002, | |
| "reward": 0.40203397534787655, | |
| "reward_std": 0.21288743987679482, | |
| "rewards/cosine_scaled_reward": 0.26086020842194557, | |
| "rewards/format_reward": 0.7083333432674408, | |
| "step": 202 | |
| }, | |
| { | |
| "completion_length": 2548.0000915527344, | |
| "epoch": 0.232, | |
| "grad_norm": 0.2812572717666626, | |
| "kl": 0.0077362060546875, | |
| "learning_rate": 7.695368466124296e-07, | |
| "loss": 0.0003, | |
| "reward": 0.15158599987626076, | |
| "reward_std": 0.3328489065170288, | |
| "rewards/cosine_scaled_reward": -0.013430323451757431, | |
| "rewards/format_reward": 0.4583333469927311, | |
| "step": 203 | |
| }, | |
| { | |
| "completion_length": 1831.7291870117188, | |
| "epoch": 0.23314285714285715, | |
| "grad_norm": 0.2870136499404907, | |
| "kl": 0.008026123046875, | |
| "learning_rate": 7.667891533457718e-07, | |
| "loss": 0.0003, | |
| "reward": 0.2725534439086914, | |
| "reward_std": 0.2783195786178112, | |
| "rewards/cosine_scaled_reward": -0.009869151283055544, | |
| "rewards/format_reward": 0.770833358168602, | |
| "step": 204 | |
| }, | |
| { | |
| "completion_length": 2287.916717529297, | |
| "epoch": 0.2342857142857143, | |
| "grad_norm": 0.2195574939250946, | |
| "kl": 0.004608154296875, | |
| "learning_rate": 7.640308940816239e-07, | |
| "loss": 0.0002, | |
| "reward": 0.2794661708176136, | |
| "reward_std": 0.3664914593100548, | |
| "rewards/cosine_scaled_reward": 0.09487088554305956, | |
| "rewards/format_reward": 0.6458333507180214, | |
| "step": 205 | |
| }, | |
| { | |
| "completion_length": 2537.729217529297, | |
| "epoch": 0.23542857142857143, | |
| "grad_norm": 0.21473117172718048, | |
| "kl": 0.0049896240234375, | |
| "learning_rate": 7.612622032536507e-07, | |
| "loss": 0.0002, | |
| "reward": 0.1754872469464317, | |
| "reward_std": 0.31655465066432953, | |
| "rewards/cosine_scaled_reward": -0.07037313468754292, | |
| "rewards/format_reward": 0.6250000149011612, | |
| "step": 206 | |
| }, | |
| { | |
| "completion_length": 2232.0834045410156, | |
| "epoch": 0.23657142857142857, | |
| "grad_norm": 0.22935722768306732, | |
| "kl": 0.00576019287109375, | |
| "learning_rate": 7.584832158039378e-07, | |
| "loss": 0.0002, | |
| "reward": 0.1333406250923872, | |
| "reward_std": 0.23974663391709328, | |
| "rewards/cosine_scaled_reward": -0.1676623560488224, | |
| "rewards/format_reward": 0.6875, | |
| "step": 207 | |
| }, | |
| { | |
| "completion_length": 1856.9375610351562, | |
| "epoch": 0.2377142857142857, | |
| "grad_norm": 0.2221345752477646, | |
| "kl": 0.005878448486328125, | |
| "learning_rate": 7.556940671764124e-07, | |
| "loss": 0.0002, | |
| "reward": 0.26369363255798817, | |
| "reward_std": 0.1700501013547182, | |
| "rewards/cosine_scaled_reward": 0.0034250058233737946, | |
| "rewards/format_reward": 0.7500000149011612, | |
| "step": 208 | |
| }, | |
| { | |
| "completion_length": 2069.8750915527344, | |
| "epoch": 0.23885714285714285, | |
| "grad_norm": 0.29865333437919617, | |
| "kl": 0.0078582763671875, | |
| "learning_rate": 7.528948933102438e-07, | |
| "loss": 0.0003, | |
| "reward": 0.2509584181243554, | |
| "reward_std": 0.2393549047410488, | |
| "rewards/cosine_scaled_reward": 0.04618738777935505, | |
| "rewards/format_reward": 0.6666666865348816, | |
| "step": 209 | |
| }, | |
| { | |
| "completion_length": 2130.5000610351562, | |
| "epoch": 0.24, | |
| "grad_norm": 0.19106724858283997, | |
| "kl": 0.006458282470703125, | |
| "learning_rate": 7.500858306332172e-07, | |
| "loss": 0.0003, | |
| "reward": 0.14285951387137175, | |
| "reward_std": 0.19960445538163185, | |
| "rewards/cosine_scaled_reward": -0.08981587737798691, | |
| "rewards/format_reward": 0.6250000204890966, | |
| "step": 210 | |
| }, | |
| { | |
| "completion_length": 1910.729248046875, | |
| "epoch": 0.24114285714285713, | |
| "grad_norm": 0.2566603720188141, | |
| "kl": 0.00563812255859375, | |
| "learning_rate": 7.472670160550848e-07, | |
| "loss": 0.0002, | |
| "reward": 0.24128811853006482, | |
| "reward_std": 0.23205993883311749, | |
| "rewards/cosine_scaled_reward": -0.012406919151544571, | |
| "rewards/format_reward": 0.6875000149011612, | |
| "step": 211 | |
| }, | |
| { | |
| "completion_length": 1509.9375457763672, | |
| "epoch": 0.2422857142857143, | |
| "grad_norm": 0.2559695243835449, | |
| "kl": 0.005527496337890625, | |
| "learning_rate": 7.444385869608921e-07, | |
| "loss": 0.0002, | |
| "reward": 0.29675021627917886, | |
| "reward_std": 0.21607037633657455, | |
| "rewards/cosine_scaled_reward": 0.03729934897273779, | |
| "rewards/format_reward": 0.8125000149011612, | |
| "step": 212 | |
| }, | |
| { | |
| "completion_length": 1916.1459350585938, | |
| "epoch": 0.24342857142857144, | |
| "grad_norm": 0.24653296172618866, | |
| "kl": 0.00902557373046875, | |
| "learning_rate": 7.416006812042827e-07, | |
| "loss": 0.0004, | |
| "reward": 0.3602223386988044, | |
| "reward_std": 0.2784761004149914, | |
| "rewards/cosine_scaled_reward": 0.17531066434457898, | |
| "rewards/format_reward": 0.6875000149011612, | |
| "step": 213 | |
| }, | |
| { | |
| "completion_length": 2526.1876220703125, | |
| "epoch": 0.24457142857142858, | |
| "grad_norm": 0.3414965569972992, | |
| "kl": 0.00806427001953125, | |
| "learning_rate": 7.387534371007797e-07, | |
| "loss": 0.0003, | |
| "reward": 0.2717064246535301, | |
| "reward_std": 0.3078125827014446, | |
| "rewards/cosine_scaled_reward": 0.1282469742000103, | |
| "rewards/format_reward": 0.6041666716337204, | |
| "step": 214 | |
| }, | |
| { | |
| "completion_length": 1703.4792175292969, | |
| "epoch": 0.24571428571428572, | |
| "grad_norm": 0.2487371265888214, | |
| "kl": 0.00457000732421875, | |
| "learning_rate": 7.358969934210438e-07, | |
| "loss": 0.0002, | |
| "reward": 0.22754944674670696, | |
| "reward_std": 0.170820539817214, | |
| "rewards/cosine_scaled_reward": -0.09126808494329453, | |
| "rewards/format_reward": 0.8333333432674408, | |
| "step": 215 | |
| }, | |
| { | |
| "completion_length": 1111.8958892822266, | |
| "epoch": 0.24685714285714286, | |
| "grad_norm": 0.24535112082958221, | |
| "kl": 0.0048828125, | |
| "learning_rate": 7.330314893841101e-07, | |
| "loss": 0.0002, | |
| "reward": 0.439119428396225, | |
| "reward_std": 0.3081901855766773, | |
| "rewards/cosine_scaled_reward": 0.18471297062933445, | |
| "rewards/format_reward": 0.8958333432674408, | |
| "step": 216 | |
| }, | |
| { | |
| "completion_length": 1762.0208435058594, | |
| "epoch": 0.248, | |
| "grad_norm": 0.21277444064617157, | |
| "kl": 0.0063629150390625, | |
| "learning_rate": 7.301570646506027e-07, | |
| "loss": 0.0003, | |
| "reward": 0.23682209104299545, | |
| "reward_std": 0.3436572030186653, | |
| "rewards/cosine_scaled_reward": 0.0008598812855780125, | |
| "rewards/format_reward": 0.7083333432674408, | |
| "step": 217 | |
| }, | |
| { | |
| "completion_length": 1882.2083435058594, | |
| "epoch": 0.24914285714285714, | |
| "grad_norm": 0.22767505049705505, | |
| "kl": 0.00641632080078125, | |
| "learning_rate": 7.27273859315928e-07, | |
| "loss": 0.0003, | |
| "reward": 0.13319060020148754, | |
| "reward_std": 0.29052822291851044, | |
| "rewards/cosine_scaled_reward": -0.18922634795308113, | |
| "rewards/format_reward": 0.7291666865348816, | |
| "step": 218 | |
| }, | |
| { | |
| "completion_length": 1539.0625305175781, | |
| "epoch": 0.2502857142857143, | |
| "grad_norm": 0.2611101567745209, | |
| "kl": 0.007419586181640625, | |
| "learning_rate": 7.243820139034464e-07, | |
| "loss": 0.0003, | |
| "reward": 0.24942272529006004, | |
| "reward_std": 0.24504001811146736, | |
| "rewards/cosine_scaled_reward": 0.018378445878624916, | |
| "rewards/format_reward": 0.7708333432674408, | |
| "step": 219 | |
| }, | |
| { | |
| "completion_length": 1554.8542022705078, | |
| "epoch": 0.25142857142857145, | |
| "grad_norm": 0.2599616050720215, | |
| "kl": 0.0065155029296875, | |
| "learning_rate": 7.214816693576234e-07, | |
| "loss": 0.0003, | |
| "reward": 0.07774155330844223, | |
| "reward_std": 0.14831382408738136, | |
| "rewards/cosine_scaled_reward": -0.29498909786343575, | |
| "rewards/format_reward": 0.8125000149011612, | |
| "step": 220 | |
| }, | |
| { | |
| "completion_length": 1505.1666870117188, | |
| "epoch": 0.25257142857142856, | |
| "grad_norm": 0.2269742339849472, | |
| "kl": 0.0047454833984375, | |
| "learning_rate": 7.185729670371604e-07, | |
| "loss": 0.0002, | |
| "reward": 0.32571933791041374, | |
| "reward_std": 0.24275402911007404, | |
| "rewards/cosine_scaled_reward": 0.11157770827412605, | |
| "rewards/format_reward": 0.7916666865348816, | |
| "step": 221 | |
| }, | |
| { | |
| "completion_length": 1946.916748046875, | |
| "epoch": 0.2537142857142857, | |
| "grad_norm": 0.23065422475337982, | |
| "kl": 0.00635528564453125, | |
| "learning_rate": 7.156560487081051e-07, | |
| "loss": 0.0003, | |
| "reward": 0.3265978842973709, | |
| "reward_std": 0.29620324075222015, | |
| "rewards/cosine_scaled_reward": 0.11483582854270935, | |
| "rewards/format_reward": 0.6666666865348816, | |
| "step": 222 | |
| }, | |
| { | |
| "completion_length": 1822.6458740234375, | |
| "epoch": 0.25485714285714284, | |
| "grad_norm": 0.23844553530216217, | |
| "kl": 0.0056915283203125, | |
| "learning_rate": 7.127310565369415e-07, | |
| "loss": 0.0002, | |
| "reward": 0.25359345600008965, | |
| "reward_std": 0.305300273001194, | |
| "rewards/cosine_scaled_reward": -0.011000402271747589, | |
| "rewards/format_reward": 0.7291666865348816, | |
| "step": 223 | |
| }, | |
| { | |
| "completion_length": 2066.5000610351562, | |
| "epoch": 0.256, | |
| "grad_norm": 0.2057129144668579, | |
| "kl": 0.006046295166015625, | |
| "learning_rate": 7.097981330836616e-07, | |
| "loss": 0.0002, | |
| "reward": 0.1604653261601925, | |
| "reward_std": 0.2231849767267704, | |
| "rewards/cosine_scaled_reward": -0.12973792850971222, | |
| "rewards/format_reward": 0.7083333395421505, | |
| "step": 224 | |
| }, | |
| { | |
| "completion_length": 2385.4375610351562, | |
| "epoch": 0.2571428571428571, | |
| "grad_norm": 0.2630828022956848, | |
| "kl": 0.00998687744140625, | |
| "learning_rate": 7.068574212948169e-07, | |
| "loss": 0.0004, | |
| "reward": 0.1808540727943182, | |
| "reward_std": 0.29596592485904694, | |
| "rewards/cosine_scaled_reward": -0.056875346694141626, | |
| "rewards/format_reward": 0.6666666865348816, | |
| "step": 225 | |
| }, | |
| { | |
| "completion_length": 1941.7708740234375, | |
| "epoch": 0.2582857142857143, | |
| "grad_norm": 0.19778260588645935, | |
| "kl": 0.00537872314453125, | |
| "learning_rate": 7.039090644965509e-07, | |
| "loss": 0.0002, | |
| "reward": 0.4080950105562806, | |
| "reward_std": 0.3161883242428303, | |
| "rewards/cosine_scaled_reward": 0.1740732565522194, | |
| "rewards/format_reward": 0.7916666716337204, | |
| "step": 226 | |
| }, | |
| { | |
| "completion_length": 1382.3125305175781, | |
| "epoch": 0.25942857142857145, | |
| "grad_norm": 0.23452956974506378, | |
| "kl": 0.0117950439453125, | |
| "learning_rate": 7.009532063876148e-07, | |
| "loss": 0.0005, | |
| "reward": 0.2297086864709854, | |
| "reward_std": 0.2339212652295828, | |
| "rewards/cosine_scaled_reward": -0.12544679269194603, | |
| "rewards/format_reward": 0.895833358168602, | |
| "step": 227 | |
| }, | |
| { | |
| "completion_length": 1569.5209197998047, | |
| "epoch": 0.26057142857142856, | |
| "grad_norm": 0.2557767629623413, | |
| "kl": 0.005340576171875, | |
| "learning_rate": 6.979899910323624e-07, | |
| "loss": 0.0002, | |
| "reward": 0.35539381578564644, | |
| "reward_std": 0.28502682596445084, | |
| "rewards/cosine_scaled_reward": 0.15302862459793687, | |
| "rewards/format_reward": 0.7500000149011612, | |
| "step": 228 | |
| }, | |
| { | |
| "completion_length": 1640.5626220703125, | |
| "epoch": 0.26171428571428573, | |
| "grad_norm": 0.216407909989357, | |
| "kl": 0.007568359375, | |
| "learning_rate": 6.950195628537299e-07, | |
| "loss": 0.0003, | |
| "reward": 0.17562153795734048, | |
| "reward_std": 0.13211852312088013, | |
| "rewards/cosine_scaled_reward": -0.12780890613794327, | |
| "rewards/format_reward": 0.7916666865348816, | |
| "step": 229 | |
| }, | |
| { | |
| "completion_length": 1904.0208892822266, | |
| "epoch": 0.26285714285714284, | |
| "grad_norm": 0.2001759558916092, | |
| "kl": 0.00653076171875, | |
| "learning_rate": 6.920420666261961e-07, | |
| "loss": 0.0003, | |
| "reward": 0.12062056954891887, | |
| "reward_std": 0.19564786739647388, | |
| "rewards/cosine_scaled_reward": -0.21936549118254334, | |
| "rewards/format_reward": 0.7708333432674408, | |
| "step": 230 | |
| }, | |
| { | |
| "completion_length": 1837.7500305175781, | |
| "epoch": 0.264, | |
| "grad_norm": 0.3313891589641571, | |
| "kl": 0.0077056884765625, | |
| "learning_rate": 6.890576474687263e-07, | |
| "loss": 0.0003, | |
| "reward": 0.3438783623278141, | |
| "reward_std": 0.3303004875779152, | |
| "rewards/cosine_scaled_reward": 0.0843821857124567, | |
| "rewards/format_reward": 0.7916667014360428, | |
| "step": 231 | |
| }, | |
| { | |
| "completion_length": 2066.916778564453, | |
| "epoch": 0.2651428571428571, | |
| "grad_norm": 0.24516141414642334, | |
| "kl": 0.0080718994140625, | |
| "learning_rate": 6.860664508377001e-07, | |
| "loss": 0.0003, | |
| "reward": 0.13200945453718305, | |
| "reward_std": 0.20449103228747845, | |
| "rewards/cosine_scaled_reward": -0.0955875813961029, | |
| "rewards/format_reward": 0.6250000149011612, | |
| "step": 232 | |
| }, | |
| { | |
| "completion_length": 1328.9584045410156, | |
| "epoch": 0.2662857142857143, | |
| "grad_norm": 0.21824268996715546, | |
| "kl": 0.00582122802734375, | |
| "learning_rate": 6.83068622519821e-07, | |
| "loss": 0.0002, | |
| "reward": 0.20045446418225765, | |
| "reward_std": 0.25751522183418274, | |
| "rewards/cosine_scaled_reward": -0.17755268700420856, | |
| "rewards/format_reward": 0.9166666716337204, | |
| "step": 233 | |
| }, | |
| { | |
| "completion_length": 2047.9375839233398, | |
| "epoch": 0.2674285714285714, | |
| "grad_norm": 0.2818503975868225, | |
| "kl": 0.0115814208984375, | |
| "learning_rate": 6.800643086250121e-07, | |
| "loss": 0.0005, | |
| "reward": 0.18364327400922775, | |
| "reward_std": 0.27602763287723064, | |
| "rewards/cosine_scaled_reward": -0.0731953289359808, | |
| "rewards/format_reward": 0.6250000149011612, | |
| "step": 234 | |
| }, | |
| { | |
| "completion_length": 1446.4166870117188, | |
| "epoch": 0.26857142857142857, | |
| "grad_norm": 0.26381435990333557, | |
| "kl": 0.00649261474609375, | |
| "learning_rate": 6.770536555792944e-07, | |
| "loss": 0.0003, | |
| "reward": 0.42406467348337173, | |
| "reward_std": 0.3336157165467739, | |
| "rewards/cosine_scaled_reward": 0.15425015799701214, | |
| "rewards/format_reward": 0.8125000149011612, | |
| "step": 235 | |
| }, | |
| { | |
| "completion_length": 1943.5416870117188, | |
| "epoch": 0.26971428571428574, | |
| "grad_norm": 0.22560541331768036, | |
| "kl": 0.006351470947265625, | |
| "learning_rate": 6.740368101176495e-07, | |
| "loss": 0.0003, | |
| "reward": 0.1980036310851574, | |
| "reward_std": 0.2674345225095749, | |
| "rewards/cosine_scaled_reward": -0.08064225409179926, | |
| "rewards/format_reward": 0.75, | |
| "step": 236 | |
| }, | |
| { | |
| "completion_length": 1437.1459045410156, | |
| "epoch": 0.27085714285714285, | |
| "grad_norm": 0.2287108153104782, | |
| "kl": 0.00568389892578125, | |
| "learning_rate": 6.710139192768694e-07, | |
| "loss": 0.0002, | |
| "reward": 0.3367458498105407, | |
| "reward_std": 0.19615472108125687, | |
| "rewards/cosine_scaled_reward": 0.052811697125434875, | |
| "rewards/format_reward": 0.875, | |
| "step": 237 | |
| }, | |
| { | |
| "completion_length": 1172.1250305175781, | |
| "epoch": 0.272, | |
| "grad_norm": 0.24761365354061127, | |
| "kl": 0.00733184814453125, | |
| "learning_rate": 6.679851303883891e-07, | |
| "loss": 0.0003, | |
| "reward": 0.32783316634595394, | |
| "reward_std": 0.2573069650679827, | |
| "rewards/cosine_scaled_reward": -0.023007137060631067, | |
| "rewards/format_reward": 0.9791666716337204, | |
| "step": 238 | |
| }, | |
| { | |
| "completion_length": 1534.1250762939453, | |
| "epoch": 0.27314285714285713, | |
| "grad_norm": 0.2047697901725769, | |
| "kl": 0.00616455078125, | |
| "learning_rate": 6.649505910711058e-07, | |
| "loss": 0.0002, | |
| "reward": 0.44937574677169323, | |
| "reward_std": 0.25652869790792465, | |
| "rewards/cosine_scaled_reward": 0.2903861254453659, | |
| "rewards/format_reward": 0.75, | |
| "step": 239 | |
| }, | |
| { | |
| "completion_length": 2051.354248046875, | |
| "epoch": 0.2742857142857143, | |
| "grad_norm": 0.2782205045223236, | |
| "kl": 0.010284423828125, | |
| "learning_rate": 6.619104492241847e-07, | |
| "loss": 0.0004, | |
| "reward": 0.08665064908564091, | |
| "reward_std": 0.14281608164310455, | |
| "rewards/cosine_scaled_reward": -0.21663008630275726, | |
| "rewards/format_reward": 0.6875000149011612, | |
| "step": 240 | |
| }, | |
| { | |
| "completion_length": 1735.2500610351562, | |
| "epoch": 0.2754285714285714, | |
| "grad_norm": 0.2723366618156433, | |
| "kl": 0.00836944580078125, | |
| "learning_rate": 6.588648530198504e-07, | |
| "loss": 0.0003, | |
| "reward": 0.07045929972082376, | |
| "reward_std": 0.1520705670118332, | |
| "rewards/cosine_scaled_reward": -0.27597323805093765, | |
| "rewards/format_reward": 0.7916666865348816, | |
| "step": 241 | |
| }, | |
| { | |
| "completion_length": 1264.6667022705078, | |
| "epoch": 0.2765714285714286, | |
| "grad_norm": 0.2971493601799011, | |
| "kl": 0.0120086669921875, | |
| "learning_rate": 6.558139508961654e-07, | |
| "loss": 0.0005, | |
| "reward": 0.2805042117834091, | |
| "reward_std": 0.2464975118637085, | |
| "rewards/cosine_scaled_reward": -0.06483453698456287, | |
| "rewards/format_reward": 0.9166666865348816, | |
| "step": 242 | |
| }, | |
| { | |
| "completion_length": 1633.6458740234375, | |
| "epoch": 0.2777142857142857, | |
| "grad_norm": 0.1975041925907135, | |
| "kl": 0.00750732421875, | |
| "learning_rate": 6.527578915497951e-07, | |
| "loss": 0.0003, | |
| "reward": 0.3133881650865078, | |
| "reward_std": 0.28336846828460693, | |
| "rewards/cosine_scaled_reward": 0.05694121681153774, | |
| "rewards/format_reward": 0.8333333432674408, | |
| "step": 243 | |
| }, | |
| { | |
| "completion_length": 1654.354248046875, | |
| "epoch": 0.27885714285714286, | |
| "grad_norm": 0.2421889454126358, | |
| "kl": 0.00768280029296875, | |
| "learning_rate": 6.496968239287603e-07, | |
| "loss": 0.0003, | |
| "reward": 0.3425067327916622, | |
| "reward_std": 0.2821289487183094, | |
| "rewards/cosine_scaled_reward": 0.11226867651566863, | |
| "rewards/format_reward": 0.7708333432674408, | |
| "step": 244 | |
| }, | |
| { | |
| "completion_length": 1824.3750915527344, | |
| "epoch": 0.28, | |
| "grad_norm": 0.20805487036705017, | |
| "kl": 0.006072998046875, | |
| "learning_rate": 6.466308972251785e-07, | |
| "loss": 0.0002, | |
| "reward": 0.34558769315481186, | |
| "reward_std": 0.33702201396226883, | |
| "rewards/cosine_scaled_reward": 0.10386714269407094, | |
| "rewards/format_reward": 0.8125000149011612, | |
| "step": 245 | |
| }, | |
| { | |
| "completion_length": 1482.729248046875, | |
| "epoch": 0.28114285714285714, | |
| "grad_norm": 0.23012664914131165, | |
| "kl": 0.00945281982421875, | |
| "learning_rate": 6.435602608679916e-07, | |
| "loss": 0.0004, | |
| "reward": 0.20973372273147106, | |
| "reward_std": 0.22376436367630959, | |
| "rewards/cosine_scaled_reward": -0.12794288620352745, | |
| "rewards/format_reward": 0.8958333432674408, | |
| "step": 246 | |
| }, | |
| { | |
| "completion_length": 2047.9791870117188, | |
| "epoch": 0.2822857142857143, | |
| "grad_norm": 0.23625898361206055, | |
| "kl": 0.00788116455078125, | |
| "learning_rate": 6.404850645156841e-07, | |
| "loss": 0.0003, | |
| "reward": 0.10961971618235111, | |
| "reward_std": 0.25065623596310616, | |
| "rewards/cosine_scaled_reward": -0.15552188456058502, | |
| "rewards/format_reward": 0.666666679084301, | |
| "step": 247 | |
| }, | |
| { | |
| "completion_length": 1475.6041870117188, | |
| "epoch": 0.2834285714285714, | |
| "grad_norm": 0.31174013018608093, | |
| "kl": 0.007293701171875, | |
| "learning_rate": 6.374054580489873e-07, | |
| "loss": 0.0003, | |
| "reward": 0.3890385031700134, | |
| "reward_std": 0.2624141275882721, | |
| "rewards/cosine_scaled_reward": 0.20213650539517403, | |
| "rewards/format_reward": 0.7291666828095913, | |
| "step": 248 | |
| }, | |
| { | |
| "completion_length": 1318.8333740234375, | |
| "epoch": 0.2845714285714286, | |
| "grad_norm": 0.21641266345977783, | |
| "kl": 0.008941650390625, | |
| "learning_rate": 6.343215915635761e-07, | |
| "loss": 0.0004, | |
| "reward": 0.4674979895353317, | |
| "reward_std": 0.25317949429154396, | |
| "rewards/cosine_scaled_reward": 0.20079964771866798, | |
| "rewards/format_reward": 0.9375000149011612, | |
| "step": 249 | |
| }, | |
| { | |
| "completion_length": 1108.8125610351562, | |
| "epoch": 0.2857142857142857, | |
| "grad_norm": 0.30281519889831543, | |
| "kl": 0.00795745849609375, | |
| "learning_rate": 6.31233615362752e-07, | |
| "loss": 0.0003, | |
| "reward": 0.29210687801241875, | |
| "reward_std": 0.1993991807103157, | |
| "rewards/cosine_scaled_reward": -0.07733290828764439, | |
| "rewards/format_reward": 1.0, | |
| "step": 250 | |
| }, | |
| { | |
| "completion_length": 1167.2292022705078, | |
| "epoch": 0.28685714285714287, | |
| "grad_norm": 0.28529420495033264, | |
| "kl": 0.00995635986328125, | |
| "learning_rate": 6.281416799501187e-07, | |
| "loss": 0.0004, | |
| "reward": 0.22866389155387878, | |
| "reward_std": 0.25506168603897095, | |
| "rewards/cosine_scaled_reward": -0.08852448500692844, | |
| "rewards/format_reward": 0.875, | |
| "step": 251 | |
| }, | |
| { | |
| "completion_length": 1595.7917175292969, | |
| "epoch": 0.288, | |
| "grad_norm": 0.23463518917560577, | |
| "kl": 0.0086212158203125, | |
| "learning_rate": 6.25045936022246e-07, | |
| "loss": 0.0003, | |
| "reward": 0.18430738151073456, | |
| "reward_std": 0.15956872701644897, | |
| "rewards/cosine_scaled_reward": -0.11572365462779999, | |
| "rewards/format_reward": 0.8333333432674408, | |
| "step": 252 | |
| }, | |
| { | |
| "completion_length": 1727.3958740234375, | |
| "epoch": 0.28914285714285715, | |
| "grad_norm": 0.28016501665115356, | |
| "kl": 0.01375579833984375, | |
| "learning_rate": 6.219465344613258e-07, | |
| "loss": 0.0005, | |
| "reward": 0.23585626482963562, | |
| "reward_std": 0.23895636573433876, | |
| "rewards/cosine_scaled_reward": -0.06529825925827026, | |
| "rewards/format_reward": 0.8125000298023224, | |
| "step": 253 | |
| }, | |
| { | |
| "completion_length": 1843.6250762939453, | |
| "epoch": 0.29028571428571426, | |
| "grad_norm": 0.2972549498081207, | |
| "kl": 0.009918212890625, | |
| "learning_rate": 6.188436263278172e-07, | |
| "loss": 0.0004, | |
| "reward": 0.19810109585523605, | |
| "reward_std": 0.3144396096467972, | |
| "rewards/cosine_scaled_reward": -0.06495114602148533, | |
| "rewards/format_reward": 0.7291666865348816, | |
| "step": 254 | |
| }, | |
| { | |
| "completion_length": 2136.791717529297, | |
| "epoch": 0.2914285714285714, | |
| "grad_norm": 0.25467684864997864, | |
| "kl": 0.00911712646484375, | |
| "learning_rate": 6.157373628530852e-07, | |
| "loss": 0.0004, | |
| "reward": 0.14784921891987324, | |
| "reward_std": 0.22354254499077797, | |
| "rewards/cosine_scaled_reward": -0.15361732488963753, | |
| "rewards/format_reward": 0.7291666716337204, | |
| "step": 255 | |
| }, | |
| { | |
| "completion_length": 1582.2292175292969, | |
| "epoch": 0.2925714285714286, | |
| "grad_norm": 0.23993702232837677, | |
| "kl": 0.0102081298828125, | |
| "learning_rate": 6.126278954320294e-07, | |
| "loss": 0.0004, | |
| "reward": 0.21227549016475677, | |
| "reward_std": 0.23894662968814373, | |
| "rewards/cosine_scaled_reward": -0.07508064294233918, | |
| "rewards/format_reward": 0.8333333432674408, | |
| "step": 256 | |
| }, | |
| { | |
| "completion_length": 2020.2084045410156, | |
| "epoch": 0.2937142857142857, | |
| "grad_norm": 0.23374323546886444, | |
| "kl": 0.00890350341796875, | |
| "learning_rate": 6.095153756157051e-07, | |
| "loss": 0.0004, | |
| "reward": 0.41816296614706516, | |
| "reward_std": 0.3155359774827957, | |
| "rewards/cosine_scaled_reward": 0.2016722597181797, | |
| "rewards/format_reward": 0.7500000149011612, | |
| "step": 257 | |
| }, | |
| { | |
| "completion_length": 1815.1042175292969, | |
| "epoch": 0.2948571428571429, | |
| "grad_norm": 0.22643603384494781, | |
| "kl": 0.007904052734375, | |
| "learning_rate": 6.06399955103937e-07, | |
| "loss": 0.0003, | |
| "reward": 0.3056030236184597, | |
| "reward_std": 0.3313448801636696, | |
| "rewards/cosine_scaled_reward": 0.019550755620002747, | |
| "rewards/format_reward": 0.8333333432674408, | |
| "step": 258 | |
| }, | |
| { | |
| "completion_length": 1405.7292175292969, | |
| "epoch": 0.296, | |
| "grad_norm": 0.26583632826805115, | |
| "kl": 0.010009765625, | |
| "learning_rate": 6.032817857379256e-07, | |
| "loss": 0.0004, | |
| "reward": 0.3662095330655575, | |
| "reward_std": 0.29809001833200455, | |
| "rewards/cosine_scaled_reward": 0.05342107731848955, | |
| "rewards/format_reward": 0.8750000149011612, | |
| "step": 259 | |
| }, | |
| { | |
| "completion_length": 1058.0208435058594, | |
| "epoch": 0.29714285714285715, | |
| "grad_norm": 0.27586984634399414, | |
| "kl": 0.007415771484375, | |
| "learning_rate": 6.001610194928464e-07, | |
| "loss": 0.0003, | |
| "reward": 0.42465347796678543, | |
| "reward_std": 0.28675223514437675, | |
| "rewards/cosine_scaled_reward": 0.14612944051623344, | |
| "rewards/format_reward": 0.9375000149011612, | |
| "step": 260 | |
| }, | |
| { | |
| "completion_length": 2196.8750610351562, | |
| "epoch": 0.29828571428571427, | |
| "grad_norm": 0.20076176524162292, | |
| "kl": 0.0092315673828125, | |
| "learning_rate": 5.97037808470444e-07, | |
| "loss": 0.0004, | |
| "reward": 0.10841672308743, | |
| "reward_std": 0.1914910487830639, | |
| "rewards/cosine_scaled_reward": -0.1518568762112409, | |
| "rewards/format_reward": 0.6041666716337204, | |
| "step": 261 | |
| }, | |
| { | |
| "completion_length": 1699.2292175292969, | |
| "epoch": 0.29942857142857143, | |
| "grad_norm": 0.26689934730529785, | |
| "kl": 0.0111541748046875, | |
| "learning_rate": 5.939123048916173e-07, | |
| "loss": 0.0004, | |
| "reward": 0.09261467261239886, | |
| "reward_std": 0.16253572702407837, | |
| "rewards/cosine_scaled_reward": -0.20514269173145294, | |
| "rewards/format_reward": 0.7083333432674408, | |
| "step": 262 | |
| }, | |
| { | |
| "completion_length": 1472.5000305175781, | |
| "epoch": 0.30057142857142854, | |
| "grad_norm": 0.23336315155029297, | |
| "kl": 0.00583648681640625, | |
| "learning_rate": 5.907846610890011e-07, | |
| "loss": 0.0002, | |
| "reward": 0.1648530475795269, | |
| "reward_std": 0.18808570504188538, | |
| "rewards/cosine_scaled_reward": -0.2227061167359352, | |
| "rewards/format_reward": 0.9166666865348816, | |
| "step": 263 | |
| }, | |
| { | |
| "completion_length": 1523.7708740234375, | |
| "epoch": 0.3017142857142857, | |
| "grad_norm": 0.22092439234256744, | |
| "kl": 0.0074310302734375, | |
| "learning_rate": 5.87655029499542e-07, | |
| "loss": 0.0003, | |
| "reward": 0.20358567498624325, | |
| "reward_std": 0.27659233286976814, | |
| "rewards/cosine_scaled_reward": -0.1430983915925026, | |
| "rewards/format_reward": 0.8541666865348816, | |
| "step": 264 | |
| }, | |
| { | |
| "completion_length": 1479.7708892822266, | |
| "epoch": 0.3028571428571429, | |
| "grad_norm": 0.29656776785850525, | |
| "kl": 0.0095367431640625, | |
| "learning_rate": 5.845235626570683e-07, | |
| "loss": 0.0004, | |
| "reward": 0.2208748161792755, | |
| "reward_std": 0.23769052140414715, | |
| "rewards/cosine_scaled_reward": -0.09724835399538279, | |
| "rewards/format_reward": 0.8541666716337204, | |
| "step": 265 | |
| }, | |
| { | |
| "completion_length": 1845.1250610351562, | |
| "epoch": 0.304, | |
| "grad_norm": 0.24222609400749207, | |
| "kl": 0.0095062255859375, | |
| "learning_rate": 5.813904131848564e-07, | |
| "loss": 0.0004, | |
| "reward": 0.24290394503623247, | |
| "reward_std": 0.2597558721899986, | |
| "rewards/cosine_scaled_reward": -0.017342038452625275, | |
| "rewards/format_reward": 0.75, | |
| "step": 266 | |
| }, | |
| { | |
| "completion_length": 2344.312530517578, | |
| "epoch": 0.30514285714285716, | |
| "grad_norm": 0.21543891727924347, | |
| "kl": 0.0122833251953125, | |
| "learning_rate": 5.78255733788191e-07, | |
| "loss": 0.0005, | |
| "reward": 0.025223158299922943, | |
| "reward_std": 0.18688056617975235, | |
| "rewards/cosine_scaled_reward": -0.2162624504417181, | |
| "rewards/format_reward": 0.5000000149011612, | |
| "step": 267 | |
| }, | |
| { | |
| "completion_length": 1327.8958587646484, | |
| "epoch": 0.3062857142857143, | |
| "grad_norm": 0.3169882893562317, | |
| "kl": 0.01444244384765625, | |
| "learning_rate": 5.751196772469237e-07, | |
| "loss": 0.0006, | |
| "reward": 0.2750858571380377, | |
| "reward_std": 0.3029320724308491, | |
| "rewards/cosine_scaled_reward": -0.0884361332282424, | |
| "rewards/format_reward": 0.8958333432674408, | |
| "step": 268 | |
| }, | |
| { | |
| "completion_length": 1640.0416870117188, | |
| "epoch": 0.30742857142857144, | |
| "grad_norm": 0.24088799953460693, | |
| "kl": 0.00873565673828125, | |
| "learning_rate": 5.71982396408026e-07, | |
| "loss": 0.0003, | |
| "reward": 0.18298603361472487, | |
| "reward_std": 0.21892964094877243, | |
| "rewards/cosine_scaled_reward": -0.13406258076429367, | |
| "rewards/format_reward": 0.8333333432674408, | |
| "step": 269 | |
| }, | |
| { | |
| "completion_length": 1622.2500610351562, | |
| "epoch": 0.30857142857142855, | |
| "grad_norm": 0.23051612079143524, | |
| "kl": 0.00720977783203125, | |
| "learning_rate": 5.688440441781398e-07, | |
| "loss": 0.0003, | |
| "reward": 0.37890365347266197, | |
| "reward_std": 0.32260435819625854, | |
| "rewards/cosine_scaled_reward": 0.06904735416173935, | |
| "rewards/format_reward": 0.9166666865348816, | |
| "step": 270 | |
| }, | |
| { | |
| "completion_length": 1450.0417175292969, | |
| "epoch": 0.3097142857142857, | |
| "grad_norm": 0.28786543011665344, | |
| "kl": 0.00946044921875, | |
| "learning_rate": 5.657047735161255e-07, | |
| "loss": 0.0004, | |
| "reward": 0.5036385664716363, | |
| "reward_std": 0.36333882436156273, | |
| "rewards/cosine_scaled_reward": 0.2697860337793827, | |
| "rewards/format_reward": 0.8333333432674408, | |
| "step": 271 | |
| }, | |
| { | |
| "completion_length": 1877.8541870117188, | |
| "epoch": 0.31085714285714283, | |
| "grad_norm": 0.20036853849887848, | |
| "kl": 0.00983428955078125, | |
| "learning_rate": 5.625647374256061e-07, | |
| "loss": 0.0004, | |
| "reward": 0.22807104885578156, | |
| "reward_std": 0.24351342767477036, | |
| "rewards/cosine_scaled_reward": -0.0636335639283061, | |
| "rewards/format_reward": 0.8333333432674408, | |
| "step": 272 | |
| }, | |
| { | |
| "completion_length": 1397.3333740234375, | |
| "epoch": 0.312, | |
| "grad_norm": 0.26651531457901, | |
| "kl": 0.0090484619140625, | |
| "learning_rate": 5.594240889475106e-07, | |
| "loss": 0.0004, | |
| "reward": 0.2839905247092247, | |
| "reward_std": 0.2825750559568405, | |
| "rewards/cosine_scaled_reward": -0.004876431077718735, | |
| "rewards/format_reward": 0.8750000298023224, | |
| "step": 273 | |
| }, | |
| { | |
| "completion_length": 1043.9167175292969, | |
| "epoch": 0.31314285714285717, | |
| "grad_norm": 0.3097701072692871, | |
| "kl": 0.0113067626953125, | |
| "learning_rate": 5.562829811526154e-07, | |
| "loss": 0.0005, | |
| "reward": 0.331375852227211, | |
| "reward_std": 0.2427959106862545, | |
| "rewards/cosine_scaled_reward": 0.009415101259946823, | |
| "rewards/format_reward": 0.9583333432674408, | |
| "step": 274 | |
| }, | |
| { | |
| "completion_length": 1657.0833587646484, | |
| "epoch": 0.3142857142857143, | |
| "grad_norm": 0.2341931164264679, | |
| "kl": 0.0104217529296875, | |
| "learning_rate": 5.531415671340826e-07, | |
| "loss": 0.0004, | |
| "reward": 0.3595724329352379, | |
| "reward_std": 0.3247465565800667, | |
| "rewards/cosine_scaled_reward": 0.1323377527296543, | |
| "rewards/format_reward": 0.8125000149011612, | |
| "step": 275 | |
| }, | |
| { | |
| "completion_length": 1292.0000610351562, | |
| "epoch": 0.31542857142857145, | |
| "grad_norm": 0.3187240660190582, | |
| "kl": 0.01602935791015625, | |
| "learning_rate": 5.5e-07, | |
| "loss": 0.0006, | |
| "reward": 0.34184306114912033, | |
| "reward_std": 0.3191189467906952, | |
| "rewards/cosine_scaled_reward": 0.022058267146348953, | |
| "rewards/format_reward": 0.8958333432674408, | |
| "step": 276 | |
| }, | |
| { | |
| "completion_length": 1618.7500305175781, | |
| "epoch": 0.31657142857142856, | |
| "grad_norm": 0.3473435938358307, | |
| "kl": 0.01397705078125, | |
| "learning_rate": 5.468584328659172e-07, | |
| "loss": 0.0006, | |
| "reward": 0.3007097691297531, | |
| "reward_std": 0.20629184320569038, | |
| "rewards/cosine_scaled_reward": 0.02854561060667038, | |
| "rewards/format_reward": 0.8125000149011612, | |
| "step": 277 | |
| }, | |
| { | |
| "completion_length": 1575.3959045410156, | |
| "epoch": 0.3177142857142857, | |
| "grad_norm": 0.295357882976532, | |
| "kl": 0.00948333740234375, | |
| "learning_rate": 5.437170188473847e-07, | |
| "loss": 0.0004, | |
| "reward": 0.34411953017115593, | |
| "reward_std": 0.25512586534023285, | |
| "rewards/cosine_scaled_reward": 0.08236894011497498, | |
| "rewards/format_reward": 0.8125000149011612, | |
| "step": 278 | |
| }, | |
| { | |
| "completion_length": 1402.7292175292969, | |
| "epoch": 0.31885714285714284, | |
| "grad_norm": 0.26129284501075745, | |
| "kl": 0.01031494140625, | |
| "learning_rate": 5.405759110524894e-07, | |
| "loss": 0.0004, | |
| "reward": 0.25433837436139584, | |
| "reward_std": 0.21544880792498589, | |
| "rewards/cosine_scaled_reward": -0.10006260499358177, | |
| "rewards/format_reward": 0.9166666865348816, | |
| "step": 279 | |
| }, | |
| { | |
| "completion_length": 1718.5625610351562, | |
| "epoch": 0.32, | |
| "grad_norm": 0.2867210805416107, | |
| "kl": 0.0108489990234375, | |
| "learning_rate": 5.37435262574394e-07, | |
| "loss": 0.0004, | |
| "reward": 0.40966310165822506, | |
| "reward_std": 0.34037622064352036, | |
| "rewards/cosine_scaled_reward": 0.18308642879128456, | |
| "rewards/format_reward": 0.8125, | |
| "step": 280 | |
| }, | |
| { | |
| "completion_length": 2452.1875610351562, | |
| "epoch": 0.3211428571428571, | |
| "grad_norm": 0.21252016723155975, | |
| "kl": 0.01434326171875, | |
| "learning_rate": 5.342952264838747e-07, | |
| "loss": 0.0006, | |
| "reward": 0.06680710799992085, | |
| "reward_std": 0.2203577384352684, | |
| "rewards/cosine_scaled_reward": -0.1721898689866066, | |
| "rewards/format_reward": 0.5416666828095913, | |
| "step": 281 | |
| }, | |
| { | |
| "completion_length": 1353.3750457763672, | |
| "epoch": 0.3222857142857143, | |
| "grad_norm": 0.2406478375196457, | |
| "kl": 0.008941650390625, | |
| "learning_rate": 5.311559558218603e-07, | |
| "loss": 0.0004, | |
| "reward": 0.3876841962337494, | |
| "reward_std": 0.2718113847076893, | |
| "rewards/cosine_scaled_reward": 0.08670558547601104, | |
| "rewards/format_reward": 0.9166666716337204, | |
| "step": 282 | |
| }, | |
| { | |
| "completion_length": 2312.291717529297, | |
| "epoch": 0.32342857142857145, | |
| "grad_norm": 0.18672321736812592, | |
| "kl": 0.01123046875, | |
| "learning_rate": 5.28017603591974e-07, | |
| "loss": 0.0004, | |
| "reward": 0.33478916296735406, | |
| "reward_std": 0.3283892571926117, | |
| "rewards/cosine_scaled_reward": 0.18481532111763954, | |
| "rewards/format_reward": 0.708333358168602, | |
| "step": 283 | |
| }, | |
| { | |
| "completion_length": 1362.9167175292969, | |
| "epoch": 0.32457142857142857, | |
| "grad_norm": 0.43515703082084656, | |
| "kl": 0.01312255859375, | |
| "learning_rate": 5.248803227530763e-07, | |
| "loss": 0.0005, | |
| "reward": 0.24751500971615314, | |
| "reward_std": 0.21904924511909485, | |
| "rewards/cosine_scaled_reward": -0.10085805598646402, | |
| "rewards/format_reward": 0.9166666716337204, | |
| "step": 284 | |
| }, | |
| { | |
| "completion_length": 914.6666870117188, | |
| "epoch": 0.32571428571428573, | |
| "grad_norm": 0.2816125154495239, | |
| "kl": 0.00911712646484375, | |
| "learning_rate": 5.21744266211809e-07, | |
| "loss": 0.0004, | |
| "reward": 0.3308372348546982, | |
| "reward_std": 0.22466129437088966, | |
| "rewards/cosine_scaled_reward": 0.007770329713821411, | |
| "rewards/format_reward": 1.0, | |
| "step": 285 | |
| }, | |
| { | |
| "completion_length": 1364.2709045410156, | |
| "epoch": 0.32685714285714285, | |
| "grad_norm": 0.24092473089694977, | |
| "kl": 0.00989532470703125, | |
| "learning_rate": 5.186095868151436e-07, | |
| "loss": 0.0004, | |
| "reward": 0.3854522183537483, | |
| "reward_std": 0.3654041290283203, | |
| "rewards/cosine_scaled_reward": 0.0710476387757808, | |
| "rewards/format_reward": 0.9375000149011612, | |
| "step": 286 | |
| }, | |
| { | |
| "completion_length": 1273.1042175292969, | |
| "epoch": 0.328, | |
| "grad_norm": 0.30035242438316345, | |
| "kl": 0.01116180419921875, | |
| "learning_rate": 5.154764373429315e-07, | |
| "loss": 0.0004, | |
| "reward": 0.19016006495803595, | |
| "reward_std": 0.1666824333369732, | |
| "rewards/cosine_scaled_reward": -0.10087527148425579, | |
| "rewards/format_reward": 0.7916666716337204, | |
| "step": 287 | |
| }, | |
| { | |
| "completion_length": 1498.5625305175781, | |
| "epoch": 0.3291428571428571, | |
| "grad_norm": 0.2478376030921936, | |
| "kl": 0.010406494140625, | |
| "learning_rate": 5.123449705004581e-07, | |
| "loss": 0.0004, | |
| "reward": 0.23150785267353058, | |
| "reward_std": 0.17795532755553722, | |
| "rewards/cosine_scaled_reward": -0.09038132801651955, | |
| "rewards/format_reward": 0.875, | |
| "step": 288 | |
| }, | |
| { | |
| "completion_length": 1331.5833892822266, | |
| "epoch": 0.3302857142857143, | |
| "grad_norm": 0.338457852602005, | |
| "kl": 0.015472412109375, | |
| "learning_rate": 5.09215338910999e-07, | |
| "loss": 0.0006, | |
| "reward": 0.2746650446206331, | |
| "reward_std": 0.23780345171689987, | |
| "rewards/cosine_scaled_reward": -0.03811670187860727, | |
| "rewards/format_reward": 0.8541666865348816, | |
| "step": 289 | |
| }, | |
| { | |
| "completion_length": 1039.1250610351562, | |
| "epoch": 0.3314285714285714, | |
| "grad_norm": 0.37133654952049255, | |
| "kl": 0.01009368896484375, | |
| "learning_rate": 5.060876951083828e-07, | |
| "loss": 0.0004, | |
| "reward": 0.3331058695912361, | |
| "reward_std": 0.26539327949285507, | |
| "rewards/cosine_scaled_reward": 0.03625666256994009, | |
| "rewards/format_reward": 0.9166666865348816, | |
| "step": 290 | |
| }, | |
| { | |
| "completion_length": 1281.6875610351562, | |
| "epoch": 0.3325714285714286, | |
| "grad_norm": 0.25397613644599915, | |
| "kl": 0.0101470947265625, | |
| "learning_rate": 5.02962191529556e-07, | |
| "loss": 0.0004, | |
| "reward": 0.28346237912774086, | |
| "reward_std": 0.3078480400145054, | |
| "rewards/cosine_scaled_reward": -0.04864836111664772, | |
| "rewards/format_reward": 0.9375000149011612, | |
| "step": 291 | |
| }, | |
| { | |
| "completion_length": 1514.6042175292969, | |
| "epoch": 0.33371428571428574, | |
| "grad_norm": 0.24802722036838531, | |
| "kl": 0.0099945068359375, | |
| "learning_rate": 4.998389805071536e-07, | |
| "loss": 0.0004, | |
| "reward": 0.22861065715551376, | |
| "reward_std": 0.19359621033072472, | |
| "rewards/cosine_scaled_reward": -0.1172735309228301, | |
| "rewards/format_reward": 0.875, | |
| "step": 292 | |
| }, | |
| { | |
| "completion_length": 1185.8958740234375, | |
| "epoch": 0.33485714285714285, | |
| "grad_norm": 0.2437037080526352, | |
| "kl": 0.0122222900390625, | |
| "learning_rate": 4.967182142620745e-07, | |
| "loss": 0.0005, | |
| "reward": 0.29054274410009384, | |
| "reward_std": 0.22883088141679764, | |
| "rewards/cosine_scaled_reward": -0.07128089666366577, | |
| "rewards/format_reward": 0.9583333432674408, | |
| "step": 293 | |
| }, | |
| { | |
| "completion_length": 1941.1458740234375, | |
| "epoch": 0.336, | |
| "grad_norm": 0.35831886529922485, | |
| "kl": 0.0147552490234375, | |
| "learning_rate": 4.93600044896063e-07, | |
| "loss": 0.0006, | |
| "reward": 0.2816973514854908, | |
| "reward_std": 0.3041081838309765, | |
| "rewards/cosine_scaled_reward": 0.06637562066316605, | |
| "rewards/format_reward": 0.7083333432674408, | |
| "step": 294 | |
| }, | |
| { | |
| "completion_length": 1546.1041870117188, | |
| "epoch": 0.33714285714285713, | |
| "grad_norm": 0.2846806049346924, | |
| "kl": 0.01165771484375, | |
| "learning_rate": 4.904846243842949e-07, | |
| "loss": 0.0005, | |
| "reward": 0.45683666691184044, | |
| "reward_std": 0.36336134001612663, | |
| "rewards/cosine_scaled_reward": 0.17266745120286942, | |
| "rewards/format_reward": 0.895833358168602, | |
| "step": 295 | |
| }, | |
| { | |
| "completion_length": 1614.5208435058594, | |
| "epoch": 0.3382857142857143, | |
| "grad_norm": 0.2929825782775879, | |
| "kl": 0.01120758056640625, | |
| "learning_rate": 4.873721045679706e-07, | |
| "loss": 0.0004, | |
| "reward": 0.1770874448120594, | |
| "reward_std": 0.2070347797125578, | |
| "rewards/cosine_scaled_reward": -0.15072383964434266, | |
| "rewards/format_reward": 0.8750000298023224, | |
| "step": 296 | |
| }, | |
| { | |
| "completion_length": 2259.0834045410156, | |
| "epoch": 0.3394285714285714, | |
| "grad_norm": 0.2630348205566406, | |
| "kl": 0.01708984375, | |
| "learning_rate": 4.842626371469149e-07, | |
| "loss": 0.0007, | |
| "reward": 0.2537407707422972, | |
| "reward_std": 0.27694452553987503, | |
| "rewards/cosine_scaled_reward": 0.010442063212394714, | |
| "rewards/format_reward": 0.7291666865348816, | |
| "step": 297 | |
| }, | |
| { | |
| "completion_length": 1325.3542022705078, | |
| "epoch": 0.3405714285714286, | |
| "grad_norm": 0.2507433593273163, | |
| "kl": 0.01007080078125, | |
| "learning_rate": 4.811563736721829e-07, | |
| "loss": 0.0004, | |
| "reward": 0.21724335104227066, | |
| "reward_std": 0.18220657296478748, | |
| "rewards/cosine_scaled_reward": -0.12216328456997871, | |
| "rewards/format_reward": 0.8958333432674408, | |
| "step": 298 | |
| }, | |
| { | |
| "completion_length": 1462.4791870117188, | |
| "epoch": 0.3417142857142857, | |
| "grad_norm": 0.3154860734939575, | |
| "kl": 0.0118408203125, | |
| "learning_rate": 4.780534655386743e-07, | |
| "loss": 0.0005, | |
| "reward": 0.25573862344026566, | |
| "reward_std": 0.19661388732492924, | |
| "rewards/cosine_scaled_reward": -0.05486198514699936, | |
| "rewards/format_reward": 0.8958333432674408, | |
| "step": 299 | |
| }, | |
| { | |
| "completion_length": 1630.166748046875, | |
| "epoch": 0.34285714285714286, | |
| "grad_norm": 0.9737704396247864, | |
| "kl": 0.019989013671875, | |
| "learning_rate": 4.749540639777539e-07, | |
| "loss": 0.0008, | |
| "reward": 0.23839148692786694, | |
| "reward_std": 0.23841408640146255, | |
| "rewards/cosine_scaled_reward": -0.037340753711760044, | |
| "rewards/format_reward": 0.770833358168602, | |
| "step": 300 | |
| }, | |
| { | |
| "completion_length": 1643.6041870117188, | |
| "epoch": 0.344, | |
| "grad_norm": 0.39158645272254944, | |
| "kl": 0.0224151611328125, | |
| "learning_rate": 4.7185832004988133e-07, | |
| "loss": 0.0009, | |
| "reward": 0.22077538957819343, | |
| "reward_std": 0.27438678219914436, | |
| "rewards/cosine_scaled_reward": -0.11595524847507477, | |
| "rewards/format_reward": 0.833333358168602, | |
| "step": 301 | |
| }, | |
| { | |
| "completion_length": 1667.6250457763672, | |
| "epoch": 0.34514285714285714, | |
| "grad_norm": 0.32535773515701294, | |
| "kl": 0.016754150390625, | |
| "learning_rate": 4.68766384637248e-07, | |
| "loss": 0.0007, | |
| "reward": 0.2695366069674492, | |
| "reward_std": 0.27404558658599854, | |
| "rewards/cosine_scaled_reward": 0.03671133052557707, | |
| "rewards/format_reward": 0.7500000149011612, | |
| "step": 302 | |
| }, | |
| { | |
| "completion_length": 1149.1250305175781, | |
| "epoch": 0.3462857142857143, | |
| "grad_norm": 0.3706236481666565, | |
| "kl": 0.01305389404296875, | |
| "learning_rate": 4.656784084364238e-07, | |
| "loss": 0.0005, | |
| "reward": 0.2421702779829502, | |
| "reward_std": 0.28027326986193657, | |
| "rewards/cosine_scaled_reward": -0.12036034030097653, | |
| "rewards/format_reward": 0.9375, | |
| "step": 303 | |
| }, | |
| { | |
| "completion_length": 1612.8125305175781, | |
| "epoch": 0.3474285714285714, | |
| "grad_norm": 0.2613138258457184, | |
| "kl": 0.0163421630859375, | |
| "learning_rate": 4.6259454195101267e-07, | |
| "loss": 0.0007, | |
| "reward": 0.3241605297662318, | |
| "reward_std": 0.23523182421922684, | |
| "rewards/cosine_scaled_reward": 0.048464858322404325, | |
| "rewards/format_reward": 0.8333333432674408, | |
| "step": 304 | |
| }, | |
| { | |
| "completion_length": 1475.8750305175781, | |
| "epoch": 0.3485714285714286, | |
| "grad_norm": 0.3334023058414459, | |
| "kl": 0.0114898681640625, | |
| "learning_rate": 4.59514935484316e-07, | |
| "loss": 0.0005, | |
| "reward": 0.16058492846786976, | |
| "reward_std": 0.24794265627861023, | |
| "rewards/cosine_scaled_reward": -0.1861222069710493, | |
| "rewards/format_reward": 0.8541666865348816, | |
| "step": 305 | |
| }, | |
| { | |
| "completion_length": 1062.2292175292969, | |
| "epoch": 0.3497142857142857, | |
| "grad_norm": 0.29249686002731323, | |
| "kl": 0.00969696044921875, | |
| "learning_rate": 4.5643973913200837e-07, | |
| "loss": 0.0004, | |
| "reward": 0.4680076576769352, | |
| "reward_std": 0.32026200741529465, | |
| "rewards/cosine_scaled_reward": 0.16786080971360207, | |
| "rewards/format_reward": 0.9583333432674408, | |
| "step": 306 | |
| }, | |
| { | |
| "completion_length": 1272.9583892822266, | |
| "epoch": 0.35085714285714287, | |
| "grad_norm": 0.31508970260620117, | |
| "kl": 0.00922393798828125, | |
| "learning_rate": 4.5336910277482155e-07, | |
| "loss": 0.0004, | |
| "reward": 0.2798173949122429, | |
| "reward_std": 0.35323888808488846, | |
| "rewards/cosine_scaled_reward": -0.01756212580949068, | |
| "rewards/format_reward": 0.8541666865348816, | |
| "step": 307 | |
| }, | |
| { | |
| "completion_length": 2330.9584350585938, | |
| "epoch": 0.352, | |
| "grad_norm": 0.2136361300945282, | |
| "kl": 0.0195465087890625, | |
| "learning_rate": 4.503031760712397e-07, | |
| "loss": 0.0008, | |
| "reward": 0.19128491915762424, | |
| "reward_std": 0.25959878973662853, | |
| "rewards/cosine_scaled_reward": -0.0526702341157943, | |
| "rewards/format_reward": 0.6666666716337204, | |
| "step": 308 | |
| }, | |
| { | |
| "completion_length": 1883.3333740234375, | |
| "epoch": 0.35314285714285715, | |
| "grad_norm": 0.20839928090572357, | |
| "kl": 0.0121612548828125, | |
| "learning_rate": 4.4724210845020494e-07, | |
| "loss": 0.0005, | |
| "reward": 0.2328551933169365, | |
| "reward_std": 0.2755979187786579, | |
| "rewards/cosine_scaled_reward": -0.09418771299533546, | |
| "rewards/format_reward": 0.8750000149011612, | |
| "step": 309 | |
| }, | |
| { | |
| "completion_length": 1376.1875610351562, | |
| "epoch": 0.35428571428571426, | |
| "grad_norm": 0.28804856538772583, | |
| "kl": 0.0186920166015625, | |
| "learning_rate": 4.441860491038345e-07, | |
| "loss": 0.0007, | |
| "reward": 0.2578071504831314, | |
| "reward_std": 0.24786734953522682, | |
| "rewards/cosine_scaled_reward": -0.14047110336832702, | |
| "rewards/format_reward": 0.9791666716337204, | |
| "step": 310 | |
| }, | |
| { | |
| "completion_length": 1313.750015258789, | |
| "epoch": 0.3554285714285714, | |
| "grad_norm": 0.3734670579433441, | |
| "kl": 0.0127410888671875, | |
| "learning_rate": 4.4113514698014953e-07, | |
| "loss": 0.0005, | |
| "reward": 0.29336644522845745, | |
| "reward_std": 0.2504027783870697, | |
| "rewards/cosine_scaled_reward": -0.012938316911458969, | |
| "rewards/format_reward": 0.9375000149011612, | |
| "step": 311 | |
| }, | |
| { | |
| "completion_length": 1223.416732788086, | |
| "epoch": 0.3565714285714286, | |
| "grad_norm": 0.3231605887413025, | |
| "kl": 0.01352691650390625, | |
| "learning_rate": 4.3808955077581546e-07, | |
| "loss": 0.0005, | |
| "reward": 0.49592292681336403, | |
| "reward_std": 0.2532188519835472, | |
| "rewards/cosine_scaled_reward": 0.2624667380005121, | |
| "rewards/format_reward": 0.9166666716337204, | |
| "step": 312 | |
| }, | |
| { | |
| "completion_length": 1883.1875610351562, | |
| "epoch": 0.3577142857142857, | |
| "grad_norm": 0.40404918789863586, | |
| "kl": 0.021453857421875, | |
| "learning_rate": 4.350494089288943e-07, | |
| "loss": 0.0009, | |
| "reward": 0.21103960182517767, | |
| "reward_std": 0.2670624628663063, | |
| "rewards/cosine_scaled_reward": -0.03821822814643383, | |
| "rewards/format_reward": 0.6875000149011612, | |
| "step": 313 | |
| }, | |
| { | |
| "completion_length": 1640.104263305664, | |
| "epoch": 0.3588571428571429, | |
| "grad_norm": 0.26183629035949707, | |
| "kl": 0.02033233642578125, | |
| "learning_rate": 4.3201486961161093e-07, | |
| "loss": 0.0008, | |
| "reward": 0.2432878129184246, | |
| "reward_std": 0.19171066209673882, | |
| "rewards/cosine_scaled_reward": -0.0031162824016064405, | |
| "rewards/format_reward": 0.75, | |
| "step": 314 | |
| }, | |
| { | |
| "completion_length": 1864.2084045410156, | |
| "epoch": 0.36, | |
| "grad_norm": 0.6592049598693848, | |
| "kl": 0.046630859375, | |
| "learning_rate": 4.2898608072313045e-07, | |
| "loss": 0.0019, | |
| "reward": 0.2404745277017355, | |
| "reward_std": 0.22513627633452415, | |
| "rewards/cosine_scaled_reward": -0.006075944751501083, | |
| "rewards/format_reward": 0.708333358168602, | |
| "step": 315 | |
| }, | |
| { | |
| "completion_length": 2024.6042175292969, | |
| "epoch": 0.36114285714285715, | |
| "grad_norm": 0.41091492772102356, | |
| "kl": 0.02691650390625, | |
| "learning_rate": 4.2596318988235037e-07, | |
| "loss": 0.0011, | |
| "reward": 0.04616490565240383, | |
| "reward_std": 0.15632366575300694, | |
| "rewards/cosine_scaled_reward": -0.2781721465289593, | |
| "rewards/format_reward": 0.6875000149011612, | |
| "step": 316 | |
| }, | |
| { | |
| "completion_length": 1890.6667175292969, | |
| "epoch": 0.36228571428571427, | |
| "grad_norm": 0.395164430141449, | |
| "kl": 0.0240631103515625, | |
| "learning_rate": 4.2294634442070553e-07, | |
| "loss": 0.001, | |
| "reward": 0.1601184867322445, | |
| "reward_std": 0.23606964573264122, | |
| "rewards/cosine_scaled_reward": -0.0570518858730793, | |
| "rewards/format_reward": 0.6041666716337204, | |
| "step": 317 | |
| }, | |
| { | |
| "completion_length": 1231.2916870117188, | |
| "epoch": 0.36342857142857143, | |
| "grad_norm": 0.36525601148605347, | |
| "kl": 0.022247314453125, | |
| "learning_rate": 4.1993569137498776e-07, | |
| "loss": 0.0009, | |
| "reward": 0.2613836098462343, | |
| "reward_std": 0.18278861418366432, | |
| "rewards/cosine_scaled_reward": -0.08080821360636037, | |
| "rewards/format_reward": 0.9375, | |
| "step": 318 | |
| }, | |
| { | |
| "completion_length": 1301.6041870117188, | |
| "epoch": 0.36457142857142855, | |
| "grad_norm": 0.360119104385376, | |
| "kl": 0.012481689453125, | |
| "learning_rate": 4.1693137748017915e-07, | |
| "loss": 0.0005, | |
| "reward": 0.2703345976769924, | |
| "reward_std": 0.27354278787970543, | |
| "rewards/cosine_scaled_reward": -0.05605246126651764, | |
| "rewards/format_reward": 0.8958333432674408, | |
| "step": 319 | |
| }, | |
| { | |
| "completion_length": 1155.2083587646484, | |
| "epoch": 0.3657142857142857, | |
| "grad_norm": 0.40306153893470764, | |
| "kl": 0.021759033203125, | |
| "learning_rate": 4.1393354916230005e-07, | |
| "loss": 0.0009, | |
| "reward": 0.3281657323241234, | |
| "reward_std": 0.3362472988665104, | |
| "rewards/cosine_scaled_reward": -0.012413738295435905, | |
| "rewards/format_reward": 0.9166666716337204, | |
| "step": 320 | |
| }, | |
| { | |
| "completion_length": 952.2708740234375, | |
| "epoch": 0.3668571428571429, | |
| "grad_norm": 0.29503145813941956, | |
| "kl": 0.00998687744140625, | |
| "learning_rate": 4.1094235253127374e-07, | |
| "loss": 0.0004, | |
| "reward": 0.44769028574228287, | |
| "reward_std": 0.2670172415673733, | |
| "rewards/cosine_scaled_reward": 0.1519340705126524, | |
| "rewards/format_reward": 1.0, | |
| "step": 321 | |
| }, | |
| { | |
| "completion_length": 1964.6667175292969, | |
| "epoch": 0.368, | |
| "grad_norm": 0.6683375239372253, | |
| "kl": 0.048004150390625, | |
| "learning_rate": 4.079579333738039e-07, | |
| "loss": 0.0019, | |
| "reward": 0.10212668823078275, | |
| "reward_std": 0.1831696219742298, | |
| "rewards/cosine_scaled_reward": -0.211124025285244, | |
| "rewards/format_reward": 0.6875000298023224, | |
| "step": 322 | |
| }, | |
| { | |
| "completion_length": 1723.979232788086, | |
| "epoch": 0.36914285714285716, | |
| "grad_norm": 0.38685184717178345, | |
| "kl": 0.03357696533203125, | |
| "learning_rate": 4.0498043714627006e-07, | |
| "loss": 0.0013, | |
| "reward": 0.19741436280310154, | |
| "reward_std": 0.20172200351953506, | |
| "rewards/cosine_scaled_reward": -0.08652417734265327, | |
| "rewards/format_reward": 0.7500000149011612, | |
| "step": 323 | |
| }, | |
| { | |
| "completion_length": 1568.0833740234375, | |
| "epoch": 0.3702857142857143, | |
| "grad_norm": 1.1231465339660645, | |
| "kl": 0.02925872802734375, | |
| "learning_rate": 4.020100089676376e-07, | |
| "loss": 0.0012, | |
| "reward": 0.2117295628413558, | |
| "reward_std": 0.28987129777669907, | |
| "rewards/cosine_scaled_reward": -0.07313152588903904, | |
| "rewards/format_reward": 0.7916666716337204, | |
| "step": 324 | |
| }, | |
| { | |
| "completion_length": 1840.2708892822266, | |
| "epoch": 0.37142857142857144, | |
| "grad_norm": 0.2682565152645111, | |
| "kl": 0.0226287841796875, | |
| "learning_rate": 3.9904679361238526e-07, | |
| "loss": 0.0009, | |
| "reward": 0.23154988093301654, | |
| "reward_std": 0.19991927221417427, | |
| "rewards/cosine_scaled_reward": -0.014693088829517365, | |
| "rewards/format_reward": 0.75, | |
| "step": 325 | |
| }, | |
| { | |
| "completion_length": 1522.4583435058594, | |
| "epoch": 0.37257142857142855, | |
| "grad_norm": 0.44652143120765686, | |
| "kl": 0.02679443359375, | |
| "learning_rate": 3.9609093550344907e-07, | |
| "loss": 0.0011, | |
| "reward": 0.2407391034066677, | |
| "reward_std": 0.22661786526441574, | |
| "rewards/cosine_scaled_reward": -0.04047023877501488, | |
| "rewards/format_reward": 0.8541666865348816, | |
| "step": 326 | |
| }, | |
| { | |
| "completion_length": 1716.7708435058594, | |
| "epoch": 0.3737142857142857, | |
| "grad_norm": 0.30573394894599915, | |
| "kl": 0.0191192626953125, | |
| "learning_rate": 3.931425787051832e-07, | |
| "loss": 0.0008, | |
| "reward": 0.3224290758371353, | |
| "reward_std": 0.2919512912631035, | |
| "rewards/cosine_scaled_reward": 0.08114048466086388, | |
| "rewards/format_reward": 0.7916666865348816, | |
| "step": 327 | |
| }, | |
| { | |
| "completion_length": 1643.166748046875, | |
| "epoch": 0.37485714285714283, | |
| "grad_norm": 0.46766549348831177, | |
| "kl": 0.0225830078125, | |
| "learning_rate": 3.902018669163384e-07, | |
| "loss": 0.0009, | |
| "reward": 0.18003581184893847, | |
| "reward_std": 0.19023212790489197, | |
| "rewards/cosine_scaled_reward": -0.14118107501417398, | |
| "rewards/format_reward": 0.8125000149011612, | |
| "step": 328 | |
| }, | |
| { | |
| "completion_length": 990.7083587646484, | |
| "epoch": 0.376, | |
| "grad_norm": 0.37604936957359314, | |
| "kl": 0.01605224609375, | |
| "learning_rate": 3.872689434630585e-07, | |
| "loss": 0.0006, | |
| "reward": 0.47424641251564026, | |
| "reward_std": 0.3888479918241501, | |
| "rewards/cosine_scaled_reward": 0.12837250716984272, | |
| "rewards/format_reward": 0.9583333432674408, | |
| "step": 329 | |
| }, | |
| { | |
| "completion_length": 1277.1666870117188, | |
| "epoch": 0.37714285714285717, | |
| "grad_norm": 0.6129769086837769, | |
| "kl": 0.0253753662109375, | |
| "learning_rate": 3.843439512918949e-07, | |
| "loss": 0.001, | |
| "reward": 0.2782764509320259, | |
| "reward_std": 0.3325960785150528, | |
| "rewards/cosine_scaled_reward": -0.05062708631157875, | |
| "rewards/format_reward": 0.8541667014360428, | |
| "step": 330 | |
| }, | |
| { | |
| "completion_length": 1592.5833740234375, | |
| "epoch": 0.3782857142857143, | |
| "grad_norm": 0.4822657108306885, | |
| "kl": 0.02129364013671875, | |
| "learning_rate": 3.8142703296283953e-07, | |
| "loss": 0.0009, | |
| "reward": 0.11315785581246018, | |
| "reward_std": 0.1640291679650545, | |
| "rewards/cosine_scaled_reward": -0.17293575033545494, | |
| "rewards/format_reward": 0.7291666716337204, | |
| "step": 331 | |
| }, | |
| { | |
| "completion_length": 1577.5834045410156, | |
| "epoch": 0.37942857142857145, | |
| "grad_norm": 1.590923547744751, | |
| "kl": 0.02500152587890625, | |
| "learning_rate": 3.785183306423767e-07, | |
| "loss": 0.001, | |
| "reward": 0.2356187179684639, | |
| "reward_std": 0.3015174902975559, | |
| "rewards/cosine_scaled_reward": -0.12084164097905159, | |
| "rewards/format_reward": 0.8541666865348816, | |
| "step": 332 | |
| }, | |
| { | |
| "completion_length": 1291.8542175292969, | |
| "epoch": 0.38057142857142856, | |
| "grad_norm": 0.25067755579948425, | |
| "kl": 0.0163116455078125, | |
| "learning_rate": 3.7561798609655373e-07, | |
| "loss": 0.0007, | |
| "reward": 0.41119640320539474, | |
| "reward_std": 0.2953006289899349, | |
| "rewards/cosine_scaled_reward": 0.05285493656992912, | |
| "rewards/format_reward": 1.0, | |
| "step": 333 | |
| }, | |
| { | |
| "completion_length": 2088.0000915527344, | |
| "epoch": 0.38171428571428573, | |
| "grad_norm": 0.3662320375442505, | |
| "kl": 0.0378875732421875, | |
| "learning_rate": 3.72726140684072e-07, | |
| "loss": 0.0015, | |
| "reward": 0.10603704676032066, | |
| "reward_std": 0.27144619822502136, | |
| "rewards/cosine_scaled_reward": -0.17696780152618885, | |
| "rewards/format_reward": 0.6666666865348816, | |
| "step": 334 | |
| }, | |
| { | |
| "completion_length": 1203.166732788086, | |
| "epoch": 0.38285714285714284, | |
| "grad_norm": 0.2919592559337616, | |
| "kl": 0.01375579833984375, | |
| "learning_rate": 3.6984293534939737e-07, | |
| "loss": 0.0006, | |
| "reward": 0.39064921438694, | |
| "reward_std": 0.27864502742886543, | |
| "rewards/cosine_scaled_reward": 0.0996411181986332, | |
| "rewards/format_reward": 0.9375000149011612, | |
| "step": 335 | |
| }, | |
| { | |
| "completion_length": 1457.9375305175781, | |
| "epoch": 0.384, | |
| "grad_norm": 0.5630917549133301, | |
| "kl": 0.02484130859375, | |
| "learning_rate": 3.6696851061588994e-07, | |
| "loss": 0.001, | |
| "reward": 0.3291323632001877, | |
| "reward_std": 0.3386707752943039, | |
| "rewards/cosine_scaled_reward": 0.032098641619086266, | |
| "rewards/format_reward": 0.7916667014360428, | |
| "step": 336 | |
| }, | |
| { | |
| "completion_length": 1539.9792175292969, | |
| "epoch": 0.3851428571428571, | |
| "grad_norm": 0.3664340674877167, | |
| "kl": 0.02039337158203125, | |
| "learning_rate": 3.641030065789562e-07, | |
| "loss": 0.0008, | |
| "reward": 0.2679777964949608, | |
| "reward_std": 0.2982511632144451, | |
| "rewards/cosine_scaled_reward": -0.08566620387136936, | |
| "rewards/format_reward": 0.9375000149011612, | |
| "step": 337 | |
| }, | |
| { | |
| "completion_length": 1319.6875305175781, | |
| "epoch": 0.3862857142857143, | |
| "grad_norm": 0.2860366404056549, | |
| "kl": 0.014190673828125, | |
| "learning_rate": 3.612465628992203e-07, | |
| "loss": 0.0006, | |
| "reward": 0.3431988013908267, | |
| "reward_std": 0.2697646599262953, | |
| "rewards/cosine_scaled_reward": 0.04516521096229553, | |
| "rewards/format_reward": 0.875, | |
| "step": 338 | |
| }, | |
| { | |
| "completion_length": 1608.1459045410156, | |
| "epoch": 0.38742857142857146, | |
| "grad_norm": 0.5042271018028259, | |
| "kl": 0.0297698974609375, | |
| "learning_rate": 3.5839931879571725e-07, | |
| "loss": 0.0012, | |
| "reward": 0.22079084441065788, | |
| "reward_std": 0.27200285717844963, | |
| "rewards/cosine_scaled_reward": -0.15526098851114511, | |
| "rewards/format_reward": 0.8958333432674408, | |
| "step": 339 | |
| }, | |
| { | |
| "completion_length": 1437.4583740234375, | |
| "epoch": 0.38857142857142857, | |
| "grad_norm": 0.3569318652153015, | |
| "kl": 0.01732635498046875, | |
| "learning_rate": 3.555614130391079e-07, | |
| "loss": 0.0007, | |
| "reward": 0.2634928924962878, | |
| "reward_std": 0.27312322705984116, | |
| "rewards/cosine_scaled_reward": -0.05826007016003132, | |
| "rewards/format_reward": 0.8750000149011612, | |
| "step": 340 | |
| }, | |
| { | |
| "completion_length": 1169.6042175292969, | |
| "epoch": 0.38971428571428574, | |
| "grad_norm": 0.2896157205104828, | |
| "kl": 0.0129241943359375, | |
| "learning_rate": 3.5273298394491515e-07, | |
| "loss": 0.0005, | |
| "reward": 0.46248795837163925, | |
| "reward_std": 0.31046775355935097, | |
| "rewards/cosine_scaled_reward": 0.1391837690025568, | |
| "rewards/format_reward": 0.9375000149011612, | |
| "step": 341 | |
| }, | |
| { | |
| "completion_length": 1596.3333587646484, | |
| "epoch": 0.39085714285714285, | |
| "grad_norm": 0.5315576791763306, | |
| "kl": 0.03363037109375, | |
| "learning_rate": 3.4991416936678276e-07, | |
| "loss": 0.0013, | |
| "reward": 0.3130345083773136, | |
| "reward_std": 0.3867286182940006, | |
| "rewards/cosine_scaled_reward": -0.06444146143621765, | |
| "rewards/format_reward": 0.8750000298023224, | |
| "step": 342 | |
| }, | |
| { | |
| "completion_length": 1510.6666870117188, | |
| "epoch": 0.392, | |
| "grad_norm": 0.4027750492095947, | |
| "kl": 0.02393341064453125, | |
| "learning_rate": 3.471051066897562e-07, | |
| "loss": 0.001, | |
| "reward": 0.4770284369587898, | |
| "reward_std": 0.4011606350541115, | |
| "rewards/cosine_scaled_reward": 0.1810836885124445, | |
| "rewards/format_reward": 0.9375000149011612, | |
| "step": 343 | |
| }, | |
| { | |
| "completion_length": 1328.3750305175781, | |
| "epoch": 0.3931428571428571, | |
| "grad_norm": 0.31406036019325256, | |
| "kl": 0.021820068359375, | |
| "learning_rate": 3.4430593282358777e-07, | |
| "loss": 0.0009, | |
| "reward": 0.4601411782205105, | |
| "reward_std": 0.26154783368110657, | |
| "rewards/cosine_scaled_reward": 0.20328249409794807, | |
| "rewards/format_reward": 0.9375000149011612, | |
| "step": 344 | |
| }, | |
| { | |
| "completion_length": 1693.1041870117188, | |
| "epoch": 0.3942857142857143, | |
| "grad_norm": 0.5955045819282532, | |
| "kl": 0.03485107421875, | |
| "learning_rate": 3.4151678419606233e-07, | |
| "loss": 0.0014, | |
| "reward": 0.2936294376850128, | |
| "reward_std": 0.25769656151533127, | |
| "rewards/cosine_scaled_reward": -0.0028788005001842976, | |
| "rewards/format_reward": 0.833333358168602, | |
| "step": 345 | |
| }, | |
| { | |
| "completion_length": 1577.75, | |
| "epoch": 0.3954285714285714, | |
| "grad_norm": 0.29605719447135925, | |
| "kl": 0.02605438232421875, | |
| "learning_rate": 3.387377967463493e-07, | |
| "loss": 0.001, | |
| "reward": 0.27892691642045975, | |
| "reward_std": 0.23167649656534195, | |
| "rewards/cosine_scaled_reward": -0.08691584412008524, | |
| "rewards/format_reward": 0.9375, | |
| "step": 346 | |
| }, | |
| { | |
| "completion_length": 1622.8958740234375, | |
| "epoch": 0.3965714285714286, | |
| "grad_norm": 0.27287817001342773, | |
| "kl": 0.01595306396484375, | |
| "learning_rate": 3.359691059183761e-07, | |
| "loss": 0.0006, | |
| "reward": 0.15320170670747757, | |
| "reward_std": 0.1787218227982521, | |
| "rewards/cosine_scaled_reward": -0.21337689459323883, | |
| "rewards/format_reward": 0.9166666716337204, | |
| "step": 347 | |
| }, | |
| { | |
| "completion_length": 1563.6875610351562, | |
| "epoch": 0.3977142857142857, | |
| "grad_norm": 0.32775041460990906, | |
| "kl": 0.03545379638671875, | |
| "learning_rate": 3.3321084665422803e-07, | |
| "loss": 0.0014, | |
| "reward": 0.36891554296016693, | |
| "reward_std": 0.24630171805620193, | |
| "rewards/cosine_scaled_reward": 0.08824050053954124, | |
| "rewards/format_reward": 0.875, | |
| "step": 348 | |
| }, | |
| { | |
| "completion_length": 1382.541748046875, | |
| "epoch": 0.39885714285714285, | |
| "grad_norm": 0.33139002323150635, | |
| "kl": 0.0301971435546875, | |
| "learning_rate": 3.3046315338757026e-07, | |
| "loss": 0.0012, | |
| "reward": 0.37559112161397934, | |
| "reward_std": 0.28325023502111435, | |
| "rewards/cosine_scaled_reward": 0.02644458832219243, | |
| "rewards/format_reward": 0.9375, | |
| "step": 349 | |
| }, | |
| { | |
| "completion_length": 900.2916717529297, | |
| "epoch": 0.4, | |
| "grad_norm": 0.2705373764038086, | |
| "kl": 0.008544921875, | |
| "learning_rate": 3.2772616003709616e-07, | |
| "loss": 0.0003, | |
| "reward": 0.2913980260491371, | |
| "reward_std": 0.30835580080747604, | |
| "rewards/cosine_scaled_reward": -0.0918974825181067, | |
| "rewards/format_reward": 0.9791666716337204, | |
| "step": 350 | |
| }, | |
| { | |
| "completion_length": 1303.0000305175781, | |
| "epoch": 0.40114285714285713, | |
| "grad_norm": 0.3872186839580536, | |
| "kl": 0.023712158203125, | |
| "learning_rate": 3.250000000000001e-07, | |
| "loss": 0.0009, | |
| "reward": 0.2837943397462368, | |
| "reward_std": 0.25149884819984436, | |
| "rewards/cosine_scaled_reward": -0.0790461078286171, | |
| "rewards/format_reward": 0.9375, | |
| "step": 351 | |
| }, | |
| { | |
| "completion_length": 1608.1250305175781, | |
| "epoch": 0.4022857142857143, | |
| "grad_norm": 0.6398208141326904, | |
| "kl": 0.02740478515625, | |
| "learning_rate": 3.222848061454764e-07, | |
| "loss": 0.0011, | |
| "reward": 0.28259705752134323, | |
| "reward_std": 0.2690298482775688, | |
| "rewards/cosine_scaled_reward": 0.0077111730352044106, | |
| "rewards/format_reward": 0.8125000149011612, | |
| "step": 352 | |
| }, | |
| { | |
| "completion_length": 1021.6666717529297, | |
| "epoch": 0.4034285714285714, | |
| "grad_norm": 0.41845765709877014, | |
| "kl": 0.01645660400390625, | |
| "learning_rate": 3.195807108082429e-07, | |
| "loss": 0.0007, | |
| "reward": 0.38043496757745743, | |
| "reward_std": 0.21722519025206566, | |
| "rewards/cosine_scaled_reward": 0.07915054634213448, | |
| "rewards/format_reward": 0.9791666716337204, | |
| "step": 353 | |
| }, | |
| { | |
| "completion_length": 979.8541870117188, | |
| "epoch": 0.4045714285714286, | |
| "grad_norm": 0.28274133801460266, | |
| "kl": 0.010162353515625, | |
| "learning_rate": 3.168878457820915e-07, | |
| "loss": 0.0004, | |
| "reward": 0.4308842793107033, | |
| "reward_std": 0.24077448807656765, | |
| "rewards/cosine_scaled_reward": 0.08124570176005363, | |
| "rewards/format_reward": 1.0, | |
| "step": 354 | |
| }, | |
| { | |
| "completion_length": 1101.3958587646484, | |
| "epoch": 0.4057142857142857, | |
| "grad_norm": 0.38370591402053833, | |
| "kl": 0.01351165771484375, | |
| "learning_rate": 3.142063423134644e-07, | |
| "loss": 0.0005, | |
| "reward": 0.3669841568917036, | |
| "reward_std": 0.2703222706913948, | |
| "rewards/cosine_scaled_reward": -0.0021127446088939905, | |
| "rewards/format_reward": 0.9791666716337204, | |
| "step": 355 | |
| }, | |
| { | |
| "completion_length": 1464.9791870117188, | |
| "epoch": 0.40685714285714286, | |
| "grad_norm": 0.3078323006629944, | |
| "kl": 0.032012939453125, | |
| "learning_rate": 3.115363310950578e-07, | |
| "loss": 0.0013, | |
| "reward": 0.3374742101877928, | |
| "reward_std": 0.32919860631227493, | |
| "rewards/cosine_scaled_reward": 0.003608912229537964, | |
| "rewards/format_reward": 0.8958333432674408, | |
| "step": 356 | |
| }, | |
| { | |
| "completion_length": 1756.8334045410156, | |
| "epoch": 0.408, | |
| "grad_norm": 0.5091083645820618, | |
| "kl": 0.0364532470703125, | |
| "learning_rate": 3.0887794225945143e-07, | |
| "loss": 0.0015, | |
| "reward": 0.14945390354841948, | |
| "reward_std": 0.2556047923862934, | |
| "rewards/cosine_scaled_reward": -0.2234173621982336, | |
| "rewards/format_reward": 0.8541666865348816, | |
| "step": 357 | |
| }, | |
| { | |
| "completion_length": 1431.8333740234375, | |
| "epoch": 0.40914285714285714, | |
| "grad_norm": 0.27200624346733093, | |
| "kl": 0.0190277099609375, | |
| "learning_rate": 3.062313053727671e-07, | |
| "loss": 0.0008, | |
| "reward": 0.39337392151355743, | |
| "reward_std": 0.3704180307686329, | |
| "rewards/cosine_scaled_reward": 0.07256992720067501, | |
| "rewards/format_reward": 0.9166666716337204, | |
| "step": 358 | |
| }, | |
| { | |
| "completion_length": 1123.437515258789, | |
| "epoch": 0.4102857142857143, | |
| "grad_norm": 0.6349442601203918, | |
| "kl": 0.029815673828125, | |
| "learning_rate": 3.0359654942835247e-07, | |
| "loss": 0.0012, | |
| "reward": 0.23123174533247948, | |
| "reward_std": 0.25142205134034157, | |
| "rewards/cosine_scaled_reward": -0.14995449781417847, | |
| "rewards/format_reward": 0.8958333432674408, | |
| "step": 359 | |
| }, | |
| { | |
| "completion_length": 1300.9791870117188, | |
| "epoch": 0.4114285714285714, | |
| "grad_norm": 0.38422876596450806, | |
| "kl": 0.031402587890625, | |
| "learning_rate": 3.0097380284049523e-07, | |
| "loss": 0.0013, | |
| "reward": 0.3842775635421276, | |
| "reward_std": 0.33736754953861237, | |
| "rewards/cosine_scaled_reward": 0.07002734206616879, | |
| "rewards/format_reward": 0.9583333432674408, | |
| "step": 360 | |
| }, | |
| { | |
| "completion_length": 1139.0625305175781, | |
| "epoch": 0.4125714285714286, | |
| "grad_norm": 0.28237876296043396, | |
| "kl": 0.0174102783203125, | |
| "learning_rate": 2.9836319343816397e-07, | |
| "loss": 0.0007, | |
| "reward": 0.2165006436407566, | |
| "reward_std": 0.19712563790380955, | |
| "rewards/cosine_scaled_reward": -0.1817290298640728, | |
| "rewards/format_reward": 0.9791666716337204, | |
| "step": 361 | |
| }, | |
| { | |
| "completion_length": 1061.4583587646484, | |
| "epoch": 0.4137142857142857, | |
| "grad_norm": 0.35058391094207764, | |
| "kl": 0.0254669189453125, | |
| "learning_rate": 2.9576484845877793e-07, | |
| "loss": 0.001, | |
| "reward": 0.3845072239637375, | |
| "reward_std": 0.2359091378748417, | |
| "rewards/cosine_scaled_reward": 0.058078229427337646, | |
| "rewards/format_reward": 0.9791666716337204, | |
| "step": 362 | |
| }, | |
| { | |
| "completion_length": 951.1042022705078, | |
| "epoch": 0.41485714285714287, | |
| "grad_norm": 0.4236137568950653, | |
| "kl": 0.02153778076171875, | |
| "learning_rate": 2.931788945420058e-07, | |
| "loss": 0.0009, | |
| "reward": 0.561962466686964, | |
| "reward_std": 0.26705051213502884, | |
| "rewards/cosine_scaled_reward": 0.26302190124988556, | |
| "rewards/format_reward": 0.9791666716337204, | |
| "step": 363 | |
| }, | |
| { | |
| "completion_length": 1268.2917175292969, | |
| "epoch": 0.416, | |
| "grad_norm": 0.3417412340641022, | |
| "kl": 0.01409912109375, | |
| "learning_rate": 2.9060545772359305e-07, | |
| "loss": 0.0006, | |
| "reward": 0.1475709266960621, | |
| "reward_std": 0.17424429766833782, | |
| "rewards/cosine_scaled_reward": -0.26906686276197433, | |
| "rewards/format_reward": 0.9375000149011612, | |
| "step": 364 | |
| }, | |
| { | |
| "completion_length": 1956.9375610351562, | |
| "epoch": 0.41714285714285715, | |
| "grad_norm": 0.4032626748085022, | |
| "kl": 0.0582122802734375, | |
| "learning_rate": 2.8804466342921987e-07, | |
| "loss": 0.0023, | |
| "reward": 0.13641461171209812, | |
| "reward_std": 0.26020678877830505, | |
| "rewards/cosine_scaled_reward": -0.15251348353922367, | |
| "rewards/format_reward": 0.666666679084301, | |
| "step": 365 | |
| }, | |
| { | |
| "completion_length": 1301.8958892822266, | |
| "epoch": 0.41828571428571426, | |
| "grad_norm": 0.40691182017326355, | |
| "kl": 0.02590179443359375, | |
| "learning_rate": 2.854966364683872e-07, | |
| "loss": 0.001, | |
| "reward": 0.31418493390083313, | |
| "reward_std": 0.2587681859731674, | |
| "rewards/cosine_scaled_reward": 0.01146254688501358, | |
| "rewards/format_reward": 0.875, | |
| "step": 366 | |
| }, | |
| { | |
| "completion_length": 1587.8125610351562, | |
| "epoch": 0.41942857142857143, | |
| "grad_norm": 0.45763275027275085, | |
| "kl": 0.027862548828125, | |
| "learning_rate": 2.829615010283344e-07, | |
| "loss": 0.0011, | |
| "reward": 0.3541104570031166, | |
| "reward_std": 0.2708537131547928, | |
| "rewards/cosine_scaled_reward": -0.004807896912097931, | |
| "rewards/format_reward": 0.9166666716337204, | |
| "step": 367 | |
| }, | |
| { | |
| "completion_length": 1991.2500305175781, | |
| "epoch": 0.4205714285714286, | |
| "grad_norm": 0.932867169380188, | |
| "kl": 0.042816162109375, | |
| "learning_rate": 2.8043938066798645e-07, | |
| "loss": 0.0017, | |
| "reward": 0.27406515926122665, | |
| "reward_std": 0.4030579626560211, | |
| "rewards/cosine_scaled_reward": -0.029212753055617213, | |
| "rewards/format_reward": 0.770833358168602, | |
| "step": 368 | |
| }, | |
| { | |
| "completion_length": 1489.1875305175781, | |
| "epoch": 0.4217142857142857, | |
| "grad_norm": 0.5538850426673889, | |
| "kl": 0.037109375, | |
| "learning_rate": 2.7793039831193133e-07, | |
| "loss": 0.0015, | |
| "reward": 0.3791775330901146, | |
| "reward_std": 0.42510559409856796, | |
| "rewards/cosine_scaled_reward": 0.015287954360246658, | |
| "rewards/format_reward": 0.8958333432674408, | |
| "step": 369 | |
| }, | |
| { | |
| "completion_length": 1329.6042175292969, | |
| "epoch": 0.4228571428571429, | |
| "grad_norm": 0.4791981875896454, | |
| "kl": 0.041656494140625, | |
| "learning_rate": 2.7543467624442956e-07, | |
| "loss": 0.0017, | |
| "reward": 0.27333252876996994, | |
| "reward_std": 0.22211980447173119, | |
| "rewards/cosine_scaled_reward": -0.06442724168300629, | |
| "rewards/format_reward": 0.9166666716337204, | |
| "step": 370 | |
| }, | |
| { | |
| "completion_length": 690.0833511352539, | |
| "epoch": 0.424, | |
| "grad_norm": 0.3764467239379883, | |
| "kl": 0.009735107421875, | |
| "learning_rate": 2.729523361034538e-07, | |
| "loss": 0.0004, | |
| "reward": 0.3839330803602934, | |
| "reward_std": 0.1987195983529091, | |
| "rewards/cosine_scaled_reward": 0.08161145448684692, | |
| "rewards/format_reward": 1.0, | |
| "step": 371 | |
| }, | |
| { | |
| "completion_length": 1491.0208435058594, | |
| "epoch": 0.42514285714285716, | |
| "grad_norm": 0.46033698320388794, | |
| "kl": 0.0261383056640625, | |
| "learning_rate": 2.7048349887476037e-07, | |
| "loss": 0.001, | |
| "reward": 0.3577600382268429, | |
| "reward_std": 0.221741683781147, | |
| "rewards/cosine_scaled_reward": 0.05540268123149872, | |
| "rewards/format_reward": 0.9166666865348816, | |
| "step": 372 | |
| }, | |
| { | |
| "completion_length": 973.4375305175781, | |
| "epoch": 0.42628571428571427, | |
| "grad_norm": 0.5783974528312683, | |
| "kl": 0.024383544921875, | |
| "learning_rate": 2.6802828488599294e-07, | |
| "loss": 0.001, | |
| "reward": 0.2985563389956951, | |
| "reward_std": 0.24250118806958199, | |
| "rewards/cosine_scaled_reward": -0.07883045147173107, | |
| "rewards/format_reward": 0.9375000149011612, | |
| "step": 373 | |
| }, | |
| { | |
| "completion_length": 1205.8542175292969, | |
| "epoch": 0.42742857142857144, | |
| "grad_norm": 0.34894105792045593, | |
| "kl": 0.01763153076171875, | |
| "learning_rate": 2.655868138008171e-07, | |
| "loss": 0.0007, | |
| "reward": 0.35072382912039757, | |
| "reward_std": 0.20784537121653557, | |
| "rewards/cosine_scaled_reward": 0.04581686854362488, | |
| "rewards/format_reward": 0.9583333432674408, | |
| "step": 374 | |
| }, | |
| { | |
| "completion_length": 1723.7708740234375, | |
| "epoch": 0.42857142857142855, | |
| "grad_norm": 0.3739396929740906, | |
| "kl": 0.072723388671875, | |
| "learning_rate": 2.631592046130896e-07, | |
| "loss": 0.0029, | |
| "reward": 0.44560275599360466, | |
| "reward_std": 0.2987182140350342, | |
| "rewards/cosine_scaled_reward": 0.17832038179039955, | |
| "rewards/format_reward": 0.7916666679084301, | |
| "step": 375 | |
| }, | |
| { | |
| "completion_length": 1434.4583587646484, | |
| "epoch": 0.4297142857142857, | |
| "grad_norm": 0.6302974224090576, | |
| "kl": 0.0382080078125, | |
| "learning_rate": 2.6074557564105724e-07, | |
| "loss": 0.0015, | |
| "reward": 0.20585713908076286, | |
| "reward_std": 0.22607924416661263, | |
| "rewards/cosine_scaled_reward": -0.1532294088974595, | |
| "rewards/format_reward": 0.895833358168602, | |
| "step": 376 | |
| }, | |
| { | |
| "completion_length": 1910.9583740234375, | |
| "epoch": 0.4308571428571429, | |
| "grad_norm": 0.4583210051059723, | |
| "kl": 0.06280517578125, | |
| "learning_rate": 2.583460445215911e-07, | |
| "loss": 0.0025, | |
| "reward": 0.2012559212744236, | |
| "reward_std": 0.2751467525959015, | |
| "rewards/cosine_scaled_reward": -0.11389604769647121, | |
| "rewards/format_reward": 0.833333358168602, | |
| "step": 377 | |
| }, | |
| { | |
| "completion_length": 1376.5416870117188, | |
| "epoch": 0.432, | |
| "grad_norm": 0.4945673644542694, | |
| "kl": 0.036407470703125, | |
| "learning_rate": 2.5596072820445254e-07, | |
| "loss": 0.0015, | |
| "reward": 0.35591157153248787, | |
| "reward_std": 0.3600316420197487, | |
| "rewards/cosine_scaled_reward": 0.007506262511014938, | |
| "rewards/format_reward": 0.9375000149011612, | |
| "step": 378 | |
| }, | |
| { | |
| "completion_length": 1669.3542175292969, | |
| "epoch": 0.43314285714285716, | |
| "grad_norm": 0.7727935314178467, | |
| "kl": 0.0548858642578125, | |
| "learning_rate": 2.5358974294659373e-07, | |
| "loss": 0.0022, | |
| "reward": 0.22004763688892126, | |
| "reward_std": 0.252176720649004, | |
| "rewards/cosine_scaled_reward": -0.09906591847538948, | |
| "rewards/format_reward": 0.8333333432674408, | |
| "step": 379 | |
| }, | |
| { | |
| "completion_length": 1446.5209045410156, | |
| "epoch": 0.4342857142857143, | |
| "grad_norm": 0.3336326479911804, | |
| "kl": 0.0492706298828125, | |
| "learning_rate": 2.512332043064913e-07, | |
| "loss": 0.002, | |
| "reward": 0.28320768661797047, | |
| "reward_std": 0.20618024468421936, | |
| "rewards/cosine_scaled_reward": 0.01299813762307167, | |
| "rewards/format_reward": 0.8541666716337204, | |
| "step": 380 | |
| }, | |
| { | |
| "completion_length": 1881.1042175292969, | |
| "epoch": 0.43542857142857144, | |
| "grad_norm": 0.7163864970207214, | |
| "kl": 0.068115234375, | |
| "learning_rate": 2.488912271385139e-07, | |
| "loss": 0.0027, | |
| "reward": 0.16360357124358416, | |
| "reward_std": 0.25915900990366936, | |
| "rewards/cosine_scaled_reward": -0.16741767711937428, | |
| "rewards/format_reward": 0.8125000298023224, | |
| "step": 381 | |
| }, | |
| { | |
| "completion_length": 1083.3750305175781, | |
| "epoch": 0.43657142857142855, | |
| "grad_norm": 0.3177710175514221, | |
| "kl": 0.0191650390625, | |
| "learning_rate": 2.465639255873246e-07, | |
| "loss": 0.0008, | |
| "reward": 0.19142531976103783, | |
| "reward_std": 0.2180217020213604, | |
| "rewards/cosine_scaled_reward": -0.1876898668706417, | |
| "rewards/format_reward": 0.9375, | |
| "step": 382 | |
| }, | |
| { | |
| "completion_length": 1691.6459045410156, | |
| "epoch": 0.4377142857142857, | |
| "grad_norm": 0.8062416315078735, | |
| "kl": 0.09381103515625, | |
| "learning_rate": 2.4425141308231765e-07, | |
| "loss": 0.0038, | |
| "reward": 0.43192901834845543, | |
| "reward_std": 0.34921496361494064, | |
| "rewards/cosine_scaled_reward": 0.20125210843980312, | |
| "rewards/format_reward": 0.8750000298023224, | |
| "step": 383 | |
| }, | |
| { | |
| "completion_length": 1159.312515258789, | |
| "epoch": 0.43885714285714283, | |
| "grad_norm": 0.50071781873703, | |
| "kl": 0.02691650390625, | |
| "learning_rate": 2.4195380233209006e-07, | |
| "loss": 0.0011, | |
| "reward": 0.5177538767457008, | |
| "reward_std": 0.33252015709877014, | |
| "rewards/cosine_scaled_reward": 0.27795255556702614, | |
| "rewards/format_reward": 0.9166666865348816, | |
| "step": 384 | |
| }, | |
| { | |
| "completion_length": 1704.5209045410156, | |
| "epoch": 0.44, | |
| "grad_norm": 0.9335633516311646, | |
| "kl": 0.0668487548828125, | |
| "learning_rate": 2.3967120531894857e-07, | |
| "loss": 0.0027, | |
| "reward": 0.20765825361013412, | |
| "reward_std": 0.21825537830591202, | |
| "rewards/cosine_scaled_reward": -0.10266052093356848, | |
| "rewards/format_reward": 0.8333333432674408, | |
| "step": 385 | |
| }, | |
| { | |
| "completion_length": 1376.8958435058594, | |
| "epoch": 0.44114285714285717, | |
| "grad_norm": 0.4630853235721588, | |
| "kl": 0.03307342529296875, | |
| "learning_rate": 2.374037332934512e-07, | |
| "loss": 0.0013, | |
| "reward": 0.4098484069108963, | |
| "reward_std": 0.2593814432621002, | |
| "rewards/cosine_scaled_reward": 0.07745273411273956, | |
| "rewards/format_reward": 0.9375, | |
| "step": 386 | |
| }, | |
| { | |
| "completion_length": 1727.9167175292969, | |
| "epoch": 0.4422857142857143, | |
| "grad_norm": 0.7240560054779053, | |
| "kl": 0.0662994384765625, | |
| "learning_rate": 2.3515149676898552e-07, | |
| "loss": 0.0026, | |
| "reward": 0.29854518361389637, | |
| "reward_std": 0.27315741032361984, | |
| "rewards/cosine_scaled_reward": 0.015177648514509201, | |
| "rewards/format_reward": 0.8125000149011612, | |
| "step": 387 | |
| }, | |
| { | |
| "completion_length": 1099.3333740234375, | |
| "epoch": 0.44342857142857145, | |
| "grad_norm": 0.6079539060592651, | |
| "kl": 0.023193359375, | |
| "learning_rate": 2.3291460551638237e-07, | |
| "loss": 0.0009, | |
| "reward": 0.38250723481178284, | |
| "reward_std": 0.2334286943078041, | |
| "rewards/cosine_scaled_reward": 0.054990146309137344, | |
| "rewards/format_reward": 1.0, | |
| "step": 388 | |
| }, | |
| { | |
| "completion_length": 1335.6041870117188, | |
| "epoch": 0.44457142857142856, | |
| "grad_norm": 0.38633546233177185, | |
| "kl": 0.05527496337890625, | |
| "learning_rate": 2.306931685585657e-07, | |
| "loss": 0.0022, | |
| "reward": 0.24971096962690353, | |
| "reward_std": 0.23452140390872955, | |
| "rewards/cosine_scaled_reward": -0.11643039900809526, | |
| "rewards/format_reward": 0.9375000149011612, | |
| "step": 389 | |
| }, | |
| { | |
| "completion_length": 1625.8125457763672, | |
| "epoch": 0.44571428571428573, | |
| "grad_norm": 0.5874533653259277, | |
| "kl": 0.045379638671875, | |
| "learning_rate": 2.2848729416523859e-07, | |
| "loss": 0.0018, | |
| "reward": 0.14043784514069557, | |
| "reward_std": 0.2116746250540018, | |
| "rewards/cosine_scaled_reward": -0.16961687617003918, | |
| "rewards/format_reward": 0.7708333432674408, | |
| "step": 390 | |
| }, | |
| { | |
| "completion_length": 1688.0833587646484, | |
| "epoch": 0.44685714285714284, | |
| "grad_norm": 0.8851686716079712, | |
| "kl": 0.11041259765625, | |
| "learning_rate": 2.2629708984760706e-07, | |
| "loss": 0.0044, | |
| "reward": 0.31278832722455263, | |
| "reward_std": 0.4149634316563606, | |
| "rewards/cosine_scaled_reward": 0.03951522649731487, | |
| "rewards/format_reward": 0.7500000074505806, | |
| "step": 391 | |
| }, | |
| { | |
| "completion_length": 1464.4791870117188, | |
| "epoch": 0.448, | |
| "grad_norm": 0.7757493853569031, | |
| "kl": 0.05254364013671875, | |
| "learning_rate": 2.2412266235313973e-07, | |
| "loss": 0.0021, | |
| "reward": 0.23795920982956886, | |
| "reward_std": 0.305756276473403, | |
| "rewards/cosine_scaled_reward": -0.10466890409588814, | |
| "rewards/format_reward": 0.8958333432674408, | |
| "step": 392 | |
| }, | |
| { | |
| "completion_length": 1721.9583740234375, | |
| "epoch": 0.4491428571428571, | |
| "grad_norm": 0.5244545936584473, | |
| "kl": 0.07568359375, | |
| "learning_rate": 2.2196411766036487e-07, | |
| "loss": 0.003, | |
| "reward": 0.274740107357502, | |
| "reward_std": 0.3141351081430912, | |
| "rewards/cosine_scaled_reward": -0.02863520081155002, | |
| "rewards/format_reward": 0.833333358168602, | |
| "step": 393 | |
| }, | |
| { | |
| "completion_length": 1661.3542022705078, | |
| "epoch": 0.4502857142857143, | |
| "grad_norm": 0.8934553861618042, | |
| "kl": 0.077362060546875, | |
| "learning_rate": 2.1982156097370557e-07, | |
| "loss": 0.0031, | |
| "reward": 0.15594827197492123, | |
| "reward_std": 0.2428222820162773, | |
| "rewards/cosine_scaled_reward": -0.17488732561469078, | |
| "rewards/format_reward": 0.7916666865348816, | |
| "step": 394 | |
| }, | |
| { | |
| "completion_length": 1452.1458740234375, | |
| "epoch": 0.4514285714285714, | |
| "grad_norm": 0.6420386433601379, | |
| "kl": 0.0724945068359375, | |
| "learning_rate": 2.1769509671835223e-07, | |
| "loss": 0.0029, | |
| "reward": 0.3845909982919693, | |
| "reward_std": 0.3638080097734928, | |
| "rewards/cosine_scaled_reward": 0.08904469013214111, | |
| "rewards/format_reward": 0.7916666865348816, | |
| "step": 395 | |
| }, | |
| { | |
| "completion_length": 1154.7292022705078, | |
| "epoch": 0.45257142857142857, | |
| "grad_norm": 0.5331507325172424, | |
| "kl": 0.02313232421875, | |
| "learning_rate": 2.1558482853517253e-07, | |
| "loss": 0.0009, | |
| "reward": 0.386741541326046, | |
| "reward_std": 0.21999208815395832, | |
| "rewards/cosine_scaled_reward": 0.00984945148229599, | |
| "rewards/format_reward": 0.9791666716337204, | |
| "step": 396 | |
| }, | |
| { | |
| "completion_length": 1427.3125610351562, | |
| "epoch": 0.45371428571428574, | |
| "grad_norm": 0.3781166970729828, | |
| "kl": 0.0625, | |
| "learning_rate": 2.134908592756607e-07, | |
| "loss": 0.0025, | |
| "reward": 0.20402754470705986, | |
| "reward_std": 0.2863064855337143, | |
| "rewards/cosine_scaled_reward": -0.1487132391630439, | |
| "rewards/format_reward": 0.8125, | |
| "step": 397 | |
| }, | |
| { | |
| "completion_length": 1177.8333587646484, | |
| "epoch": 0.45485714285714285, | |
| "grad_norm": 0.4327796697616577, | |
| "kl": 0.054107666015625, | |
| "learning_rate": 2.1141329099692406e-07, | |
| "loss": 0.0022, | |
| "reward": 0.2399006700143218, | |
| "reward_std": 0.22659046947956085, | |
| "rewards/cosine_scaled_reward": -0.09948757383972406, | |
| "rewards/format_reward": 0.8750000149011612, | |
| "step": 398 | |
| }, | |
| { | |
| "completion_length": 1008.2708740234375, | |
| "epoch": 0.456, | |
| "grad_norm": 0.26638633012771606, | |
| "kl": 0.016021728515625, | |
| "learning_rate": 2.0935222495670968e-07, | |
| "loss": 0.0006, | |
| "reward": 0.4025176055729389, | |
| "reward_std": 0.2404554933309555, | |
| "rewards/cosine_scaled_reward": 0.07033197954297066, | |
| "rewards/format_reward": 1.0, | |
| "step": 399 | |
| }, | |
| { | |
| "completion_length": 911.3750305175781, | |
| "epoch": 0.45714285714285713, | |
| "grad_norm": 0.6145984530448914, | |
| "kl": 0.01869964599609375, | |
| "learning_rate": 2.0730776160846853e-07, | |
| "loss": 0.0007, | |
| "reward": 0.4475930854678154, | |
| "reward_std": 0.18855594843626022, | |
| "rewards/cosine_scaled_reward": 0.1654730625450611, | |
| "rewards/format_reward": 0.9791666716337204, | |
| "step": 400 | |
| }, | |
| { | |
| "completion_length": 1711.6458740234375, | |
| "epoch": 0.4582857142857143, | |
| "grad_norm": 0.7000367641448975, | |
| "kl": 0.11761474609375, | |
| "learning_rate": 2.0528000059645995e-07, | |
| "loss": 0.0047, | |
| "reward": 0.2637552545638755, | |
| "reward_std": 0.2834421545267105, | |
| "rewards/cosine_scaled_reward": -0.017568758921697736, | |
| "rewards/format_reward": 0.7500000298023224, | |
| "step": 401 | |
| }, | |
| { | |
| "completion_length": 1553.1875610351562, | |
| "epoch": 0.4594285714285714, | |
| "grad_norm": 0.9715263247489929, | |
| "kl": 0.084716796875, | |
| "learning_rate": 2.032690407508949e-07, | |
| "loss": 0.0034, | |
| "reward": 0.3285421133041382, | |
| "reward_std": 0.29113895259797573, | |
| "rewards/cosine_scaled_reward": -0.012742497026920319, | |
| "rewards/format_reward": 0.895833358168602, | |
| "step": 402 | |
| }, | |
| { | |
| "completion_length": 1059.2916870117188, | |
| "epoch": 0.4605714285714286, | |
| "grad_norm": 0.7187147736549377, | |
| "kl": 0.046661376953125, | |
| "learning_rate": 2.0127498008311922e-07, | |
| "loss": 0.0019, | |
| "reward": 0.33717523515224457, | |
| "reward_std": 0.24185105971992016, | |
| "rewards/cosine_scaled_reward": 0.04978985991328955, | |
| "rewards/format_reward": 0.8750000149011612, | |
| "step": 403 | |
| }, | |
| { | |
| "completion_length": 1179.8333435058594, | |
| "epoch": 0.4617142857142857, | |
| "grad_norm": 0.6305052638053894, | |
| "kl": 0.03697967529296875, | |
| "learning_rate": 1.9929791578083655e-07, | |
| "loss": 0.0015, | |
| "reward": 0.29722677916288376, | |
| "reward_std": 0.2047160156071186, | |
| "rewards/cosine_scaled_reward": -0.04974408820271492, | |
| "rewards/format_reward": 0.9375000149011612, | |
| "step": 404 | |
| }, | |
| { | |
| "completion_length": 1272.2500305175781, | |
| "epoch": 0.46285714285714286, | |
| "grad_norm": 1.0514519214630127, | |
| "kl": 0.03973388671875, | |
| "learning_rate": 1.9733794420337213e-07, | |
| "loss": 0.0016, | |
| "reward": 0.4752641096711159, | |
| "reward_std": 0.3702741339802742, | |
| "rewards/cosine_scaled_reward": 0.18676720187067986, | |
| "rewards/format_reward": 0.9375000149011612, | |
| "step": 405 | |
| }, | |
| { | |
| "completion_length": 1381.7917022705078, | |
| "epoch": 0.464, | |
| "grad_norm": 0.4949979782104492, | |
| "kl": 0.05523681640625, | |
| "learning_rate": 1.9539516087697517e-07, | |
| "loss": 0.0022, | |
| "reward": 0.3171195648610592, | |
| "reward_std": 0.24485689774155617, | |
| "rewards/cosine_scaled_reward": 0.00026333145797252655, | |
| "rewards/format_reward": 0.9583333432674408, | |
| "step": 406 | |
| }, | |
| { | |
| "completion_length": 1613.0833435058594, | |
| "epoch": 0.46514285714285714, | |
| "grad_norm": 0.48335549235343933, | |
| "kl": 0.07067108154296875, | |
| "learning_rate": 1.934696604901642e-07, | |
| "loss": 0.0028, | |
| "reward": 0.3383835516870022, | |
| "reward_std": 0.1939285695552826, | |
| "rewards/cosine_scaled_reward": 0.005908636376261711, | |
| "rewards/format_reward": 0.9375000149011612, | |
| "step": 407 | |
| }, | |
| { | |
| "completion_length": 1326.1458587646484, | |
| "epoch": 0.4662857142857143, | |
| "grad_norm": 0.42962008714675903, | |
| "kl": 0.04166412353515625, | |
| "learning_rate": 1.915615368891117e-07, | |
| "loss": 0.0017, | |
| "reward": 0.5063566863536835, | |
| "reward_std": 0.39465366303920746, | |
| "rewards/cosine_scaled_reward": 0.18573056161403656, | |
| "rewards/format_reward": 0.9583333432674408, | |
| "step": 408 | |
| }, | |
| { | |
| "completion_length": 1747.8958740234375, | |
| "epoch": 0.4674285714285714, | |
| "grad_norm": 0.5078721642494202, | |
| "kl": 0.1215057373046875, | |
| "learning_rate": 1.8967088307307e-07, | |
| "loss": 0.0049, | |
| "reward": 0.20226813852787018, | |
| "reward_std": 0.23090328555554152, | |
| "rewards/cosine_scaled_reward": -0.08720803633332253, | |
| "rewards/format_reward": 0.7916666865348816, | |
| "step": 409 | |
| }, | |
| { | |
| "completion_length": 1563.3958892822266, | |
| "epoch": 0.4685714285714286, | |
| "grad_norm": 0.6820676922798157, | |
| "kl": 0.083648681640625, | |
| "learning_rate": 1.8779779118983867e-07, | |
| "loss": 0.0033, | |
| "reward": 0.4776548147201538, | |
| "reward_std": 0.29904134944081306, | |
| "rewards/cosine_scaled_reward": 0.15387392230331898, | |
| "rewards/format_reward": 0.9791666716337204, | |
| "step": 410 | |
| }, | |
| { | |
| "completion_length": 1896.6458740234375, | |
| "epoch": 0.4697142857142857, | |
| "grad_norm": 0.7664035558700562, | |
| "kl": 0.136627197265625, | |
| "learning_rate": 1.8594235253127372e-07, | |
| "loss": 0.0055, | |
| "reward": 0.17960253171622753, | |
| "reward_std": 0.28863129764795303, | |
| "rewards/cosine_scaled_reward": -0.1354135131696239, | |
| "rewards/format_reward": 0.7500000149011612, | |
| "step": 411 | |
| }, | |
| { | |
| "completion_length": 1207.9166870117188, | |
| "epoch": 0.47085714285714286, | |
| "grad_norm": 0.7434352040290833, | |
| "kl": 0.0690765380859375, | |
| "learning_rate": 1.8410465752883758e-07, | |
| "loss": 0.0028, | |
| "reward": 0.31478671357035637, | |
| "reward_std": 0.2499629482626915, | |
| "rewards/cosine_scaled_reward": -0.020952284336090088, | |
| "rewards/format_reward": 0.9375000149011612, | |
| "step": 412 | |
| }, | |
| { | |
| "completion_length": 1497.1875305175781, | |
| "epoch": 0.472, | |
| "grad_norm": 0.6059801578521729, | |
| "kl": 0.094818115234375, | |
| "learning_rate": 1.822847957491922e-07, | |
| "loss": 0.0038, | |
| "reward": 0.25864014215767384, | |
| "reward_std": 0.3024759627878666, | |
| "rewards/cosine_scaled_reward": -0.019603880122303963, | |
| "rewards/format_reward": 0.8541666716337204, | |
| "step": 413 | |
| }, | |
| { | |
| "completion_length": 2146.4583740234375, | |
| "epoch": 0.47314285714285714, | |
| "grad_norm": 0.8143852353096008, | |
| "kl": 0.1427001953125, | |
| "learning_rate": 1.804828558898332e-07, | |
| "loss": 0.0057, | |
| "reward": 0.12906305491924286, | |
| "reward_std": 0.23890957981348038, | |
| "rewards/cosine_scaled_reward": -0.22878951579332352, | |
| "rewards/format_reward": 0.7916666865348816, | |
| "step": 414 | |
| }, | |
| { | |
| "completion_length": 1836.2084350585938, | |
| "epoch": 0.4742857142857143, | |
| "grad_norm": 0.836365818977356, | |
| "kl": 0.17041015625, | |
| "learning_rate": 1.7869892577476722e-07, | |
| "loss": 0.0068, | |
| "reward": 0.29722545109689236, | |
| "reward_std": 0.26087959110736847, | |
| "rewards/cosine_scaled_reward": -0.022759397514164448, | |
| "rewards/format_reward": 0.8125000149011612, | |
| "step": 415 | |
| }, | |
| { | |
| "completion_length": 1202.0625305175781, | |
| "epoch": 0.4754285714285714, | |
| "grad_norm": 0.6389414668083191, | |
| "kl": 0.0267486572265625, | |
| "learning_rate": 1.7693309235023127e-07, | |
| "loss": 0.0011, | |
| "reward": 0.5062696859240532, | |
| "reward_std": 0.36872031539678574, | |
| "rewards/cosine_scaled_reward": 0.207824494689703, | |
| "rewards/format_reward": 1.0, | |
| "step": 416 | |
| }, | |
| { | |
| "completion_length": 1793.291748046875, | |
| "epoch": 0.4765714285714286, | |
| "grad_norm": 1.9947246313095093, | |
| "kl": 0.144775390625, | |
| "learning_rate": 1.7518544168045524e-07, | |
| "loss": 0.0058, | |
| "reward": 0.22884063888341188, | |
| "reward_std": 0.2757262773811817, | |
| "rewards/cosine_scaled_reward": -0.11375176906585693, | |
| "rewards/format_reward": 0.8125000298023224, | |
| "step": 417 | |
| }, | |
| { | |
| "completion_length": 1436.4792098999023, | |
| "epoch": 0.4777142857142857, | |
| "grad_norm": 1.2780954837799072, | |
| "kl": 0.1436767578125, | |
| "learning_rate": 1.7345605894346726e-07, | |
| "loss": 0.0058, | |
| "reward": 0.3646044321358204, | |
| "reward_std": 0.25644519180059433, | |
| "rewards/cosine_scaled_reward": 0.11428587138652802, | |
| "rewards/format_reward": 0.833333358168602, | |
| "step": 418 | |
| }, | |
| { | |
| "completion_length": 1709.5, | |
| "epoch": 0.47885714285714287, | |
| "grad_norm": 1.3950793743133545, | |
| "kl": 0.15716552734375, | |
| "learning_rate": 1.7174502842694212e-07, | |
| "loss": 0.0063, | |
| "reward": 0.2546742111444473, | |
| "reward_std": 0.2845570184290409, | |
| "rewards/cosine_scaled_reward": -0.027704435400664806, | |
| "rewards/format_reward": 0.7916666716337204, | |
| "step": 419 | |
| }, | |
| { | |
| "completion_length": 1101.9583587646484, | |
| "epoch": 0.48, | |
| "grad_norm": 1.140668511390686, | |
| "kl": 0.080596923828125, | |
| "learning_rate": 1.7005243352409333e-07, | |
| "loss": 0.0032, | |
| "reward": 0.2978057861328125, | |
| "reward_std": 0.23859090358018875, | |
| "rewards/cosine_scaled_reward": -0.10128935193642974, | |
| "rewards/format_reward": 0.9375000149011612, | |
| "step": 420 | |
| }, | |
| { | |
| "completion_length": 1453.6250305175781, | |
| "epoch": 0.48114285714285715, | |
| "grad_norm": 0.6903990507125854, | |
| "kl": 0.143310546875, | |
| "learning_rate": 1.6837835672960831e-07, | |
| "loss": 0.0057, | |
| "reward": 0.24449162557721138, | |
| "reward_std": 0.25894002988934517, | |
| "rewards/cosine_scaled_reward": -0.07152132876217365, | |
| "rewards/format_reward": 0.8750000149011612, | |
| "step": 421 | |
| }, | |
| { | |
| "completion_length": 1786.6875305175781, | |
| "epoch": 0.48228571428571426, | |
| "grad_norm": 0.7118463516235352, | |
| "kl": 0.1838836669921875, | |
| "learning_rate": 1.6672287963562852e-07, | |
| "loss": 0.0073, | |
| "reward": 0.26678669825196266, | |
| "reward_std": 0.22027145139873028, | |
| "rewards/cosine_scaled_reward": 0.008928820490837097, | |
| "rewards/format_reward": 0.7708333432674408, | |
| "step": 422 | |
| }, | |
| { | |
| "completion_length": 1891.7500305175781, | |
| "epoch": 0.48342857142857143, | |
| "grad_norm": 1.0444345474243164, | |
| "kl": 0.233154296875, | |
| "learning_rate": 1.6508608292777203e-07, | |
| "loss": 0.0093, | |
| "reward": 0.16685185953974724, | |
| "reward_std": 0.24080579355359077, | |
| "rewards/cosine_scaled_reward": -0.11634586472064257, | |
| "rewards/format_reward": 0.770833358168602, | |
| "step": 423 | |
| }, | |
| { | |
| "completion_length": 1682.8333740234375, | |
| "epoch": 0.4845714285714286, | |
| "grad_norm": 1.0426344871520996, | |
| "kl": 0.1017913818359375, | |
| "learning_rate": 1.6346804638120098e-07, | |
| "loss": 0.0041, | |
| "reward": 0.17576977238059044, | |
| "reward_std": 0.25679684802889824, | |
| "rewards/cosine_scaled_reward": -0.17408692091703415, | |
| "rewards/format_reward": 0.8333333432674408, | |
| "step": 424 | |
| }, | |
| { | |
| "completion_length": 1349.2708892822266, | |
| "epoch": 0.4857142857142857, | |
| "grad_norm": 0.44903820753097534, | |
| "kl": 0.058441162109375, | |
| "learning_rate": 1.6186884885673413e-07, | |
| "loss": 0.0023, | |
| "reward": 0.5212084054946899, | |
| "reward_std": 0.31782156974077225, | |
| "rewards/cosine_scaled_reward": 0.31852289102971554, | |
| "rewards/format_reward": 0.9375000149011612, | |
| "step": 425 | |
| }, | |
| { | |
| "completion_length": 1496.7500610351562, | |
| "epoch": 0.4868571428571429, | |
| "grad_norm": 1.0811808109283447, | |
| "kl": 0.2073974609375, | |
| "learning_rate": 1.6028856829700258e-07, | |
| "loss": 0.0083, | |
| "reward": 0.26587871834635735, | |
| "reward_std": 0.25409030355513096, | |
| "rewards/cosine_scaled_reward": -0.07828386966139078, | |
| "rewards/format_reward": 0.8541666716337204, | |
| "step": 426 | |
| }, | |
| { | |
| "completion_length": 1663.2709045410156, | |
| "epoch": 0.488, | |
| "grad_norm": 1.3951669931411743, | |
| "kl": 0.111724853515625, | |
| "learning_rate": 1.5872728172265146e-07, | |
| "loss": 0.0045, | |
| "reward": 0.34958234429359436, | |
| "reward_std": 0.4163043648004532, | |
| "rewards/cosine_scaled_reward": 0.035510750487446785, | |
| "rewards/format_reward": 0.8333333432674408, | |
| "step": 427 | |
| }, | |
| { | |
| "completion_length": 1454.4792175292969, | |
| "epoch": 0.48914285714285716, | |
| "grad_norm": 0.8079267740249634, | |
| "kl": 0.0900115966796875, | |
| "learning_rate": 1.5718506522858572e-07, | |
| "loss": 0.0036, | |
| "reward": 0.2652810290455818, | |
| "reward_std": 0.2516926135867834, | |
| "rewards/cosine_scaled_reward": -0.06710008531808853, | |
| "rewards/format_reward": 0.8958333432674408, | |
| "step": 428 | |
| }, | |
| { | |
| "completion_length": 1153.0625457763672, | |
| "epoch": 0.49028571428571427, | |
| "grad_norm": 0.755164384841919, | |
| "kl": 0.112701416015625, | |
| "learning_rate": 1.5566199398026147e-07, | |
| "loss": 0.0045, | |
| "reward": 0.28123652189970016, | |
| "reward_std": 0.25917378067970276, | |
| "rewards/cosine_scaled_reward": -0.06613434542668983, | |
| "rewards/format_reward": 0.9375000149011612, | |
| "step": 429 | |
| }, | |
| { | |
| "completion_length": 1428.2708587646484, | |
| "epoch": 0.49142857142857144, | |
| "grad_norm": 1.3430911302566528, | |
| "kl": 0.1293487548828125, | |
| "learning_rate": 1.5415814221002265e-07, | |
| "loss": 0.0052, | |
| "reward": 0.254832336679101, | |
| "reward_std": 0.23446263745427132, | |
| "rewards/cosine_scaled_reward": -0.033917545806616545, | |
| "rewards/format_reward": 0.8541666865348816, | |
| "step": 430 | |
| }, | |
| { | |
| "completion_length": 1351.3958892822266, | |
| "epoch": 0.49257142857142855, | |
| "grad_norm": 1.1112327575683594, | |
| "kl": 0.208984375, | |
| "learning_rate": 1.5267358321348285e-07, | |
| "loss": 0.0084, | |
| "reward": 0.18969909101724625, | |
| "reward_std": 0.23189299926161766, | |
| "rewards/cosine_scaled_reward": -0.14616328151896596, | |
| "rewards/format_reward": 0.833333358168602, | |
| "step": 431 | |
| }, | |
| { | |
| "completion_length": 1790.1250610351562, | |
| "epoch": 0.4937142857142857, | |
| "grad_norm": 0.6371591091156006, | |
| "kl": 0.1598358154296875, | |
| "learning_rate": 1.5120838934595337e-07, | |
| "loss": 0.0064, | |
| "reward": 0.24663135036826134, | |
| "reward_std": 0.2685924358665943, | |
| "rewards/cosine_scaled_reward": -0.03793436847627163, | |
| "rewards/format_reward": 0.770833358168602, | |
| "step": 432 | |
| }, | |
| { | |
| "completion_length": 1450.0000610351562, | |
| "epoch": 0.4948571428571429, | |
| "grad_norm": 0.8176037073135376, | |
| "kl": 0.0924072265625, | |
| "learning_rate": 1.4976263201891613e-07, | |
| "loss": 0.0037, | |
| "reward": 0.33402128890156746, | |
| "reward_std": 0.24811824969947338, | |
| "rewards/cosine_scaled_reward": 0.017201400361955166, | |
| "rewards/format_reward": 0.9791666716337204, | |
| "step": 433 | |
| }, | |
| { | |
| "completion_length": 1634.3750305175781, | |
| "epoch": 0.496, | |
| "grad_norm": 1.4213171005249023, | |
| "kl": 0.212249755859375, | |
| "learning_rate": 1.483363816965435e-07, | |
| "loss": 0.0085, | |
| "reward": 0.11890215799212456, | |
| "reward_std": 0.19112374633550644, | |
| "rewards/cosine_scaled_reward": -0.23629990592598915, | |
| "rewards/format_reward": 0.8333333432674408, | |
| "step": 434 | |
| }, | |
| { | |
| "completion_length": 1083.0416870117188, | |
| "epoch": 0.49714285714285716, | |
| "grad_norm": 0.7486536502838135, | |
| "kl": 0.1158599853515625, | |
| "learning_rate": 1.469297078922642e-07, | |
| "loss": 0.0046, | |
| "reward": 0.17990897595882416, | |
| "reward_std": 0.18072829023003578, | |
| "rewards/cosine_scaled_reward": -0.22289768233895302, | |
| "rewards/format_reward": 0.9166666716337204, | |
| "step": 435 | |
| }, | |
| { | |
| "completion_length": 1389.3334045410156, | |
| "epoch": 0.4982857142857143, | |
| "grad_norm": 0.9094387888908386, | |
| "kl": 0.17256927490234375, | |
| "learning_rate": 1.4554267916537495e-07, | |
| "loss": 0.0069, | |
| "reward": 0.32354634441435337, | |
| "reward_std": 0.22079703584313393, | |
| "rewards/cosine_scaled_reward": 0.08843789249658585, | |
| "rewards/format_reward": 0.8125000149011612, | |
| "step": 436 | |
| }, | |
| { | |
| "completion_length": 1371.8958587646484, | |
| "epoch": 0.49942857142857144, | |
| "grad_norm": 1.0350995063781738, | |
| "kl": 0.067138671875, | |
| "learning_rate": 1.4417536311769885e-07, | |
| "loss": 0.0027, | |
| "reward": 0.2465380048379302, | |
| "reward_std": 0.24149386584758759, | |
| "rewards/cosine_scaled_reward": -0.09340556943789124, | |
| "rewards/format_reward": 0.9166666716337204, | |
| "step": 437 | |
| }, | |
| { | |
| "completion_length": 1961.9792175292969, | |
| "epoch": 0.5005714285714286, | |
| "grad_norm": 0.695432186126709, | |
| "kl": 0.27734375, | |
| "learning_rate": 1.4282782639029128e-07, | |
| "loss": 0.0111, | |
| "reward": 0.23410780914127827, | |
| "reward_std": 0.26633862406015396, | |
| "rewards/cosine_scaled_reward": -0.07006174232810736, | |
| "rewards/format_reward": 0.770833358168602, | |
| "step": 438 | |
| }, | |
| { | |
| "completion_length": 1414.1875457763672, | |
| "epoch": 0.5017142857142857, | |
| "grad_norm": 1.3709079027175903, | |
| "kl": 0.0963897705078125, | |
| "learning_rate": 1.4150013466019114e-07, | |
| "loss": 0.0039, | |
| "reward": 0.17976359650492668, | |
| "reward_std": 0.27022555842995644, | |
| "rewards/cosine_scaled_reward": -0.16386261460138485, | |
| "rewards/format_reward": 0.8750000149011612, | |
| "step": 439 | |
| }, | |
| { | |
| "completion_length": 1298.375015258789, | |
| "epoch": 0.5028571428571429, | |
| "grad_norm": 0.7073745131492615, | |
| "kl": 0.10003662109375, | |
| "learning_rate": 1.4019235263722034e-07, | |
| "loss": 0.004, | |
| "reward": 0.1858317069709301, | |
| "reward_std": 0.15907006710767746, | |
| "rewards/cosine_scaled_reward": -0.1895090565085411, | |
| "rewards/format_reward": 0.9166666865348816, | |
| "step": 440 | |
| }, | |
| { | |
| "completion_length": 1580.8333435058594, | |
| "epoch": 0.504, | |
| "grad_norm": 1.3497065305709839, | |
| "kl": 0.17523193359375, | |
| "learning_rate": 1.3890454406082956e-07, | |
| "loss": 0.007, | |
| "reward": 0.28470361296785995, | |
| "reward_std": 0.2480045147240162, | |
| "rewards/cosine_scaled_reward": 0.0019478872418403625, | |
| "rewards/format_reward": 0.7916666716337204, | |
| "step": 441 | |
| }, | |
| { | |
| "completion_length": 1075.4375305175781, | |
| "epoch": 0.5051428571428571, | |
| "grad_norm": 0.725144624710083, | |
| "kl": 0.048858642578125, | |
| "learning_rate": 1.3763677169699217e-07, | |
| "loss": 0.002, | |
| "reward": 0.3077830299735069, | |
| "reward_std": 0.2904723510146141, | |
| "rewards/cosine_scaled_reward": -0.05075856437906623, | |
| "rewards/format_reward": 0.9791666716337204, | |
| "step": 442 | |
| }, | |
| { | |
| "completion_length": 1847.7916870117188, | |
| "epoch": 0.5062857142857143, | |
| "grad_norm": 0.9789474010467529, | |
| "kl": 0.283935546875, | |
| "learning_rate": 1.3638909733514452e-07, | |
| "loss": 0.0113, | |
| "reward": 0.1660569068044424, | |
| "reward_std": 0.20301354117691517, | |
| "rewards/cosine_scaled_reward": -0.12026797235012054, | |
| "rewards/format_reward": 0.7500000298023224, | |
| "step": 443 | |
| }, | |
| { | |
| "completion_length": 1602.9584045410156, | |
| "epoch": 0.5074285714285715, | |
| "grad_norm": 1.3816633224487305, | |
| "kl": 0.2508544921875, | |
| "learning_rate": 1.351615817851748e-07, | |
| "loss": 0.01, | |
| "reward": 0.2705380953848362, | |
| "reward_std": 0.3167138807475567, | |
| "rewards/cosine_scaled_reward": -0.0010249577462673187, | |
| "rewards/format_reward": 0.7500000149011612, | |
| "step": 444 | |
| }, | |
| { | |
| "completion_length": 1616.854248046875, | |
| "epoch": 0.5085714285714286, | |
| "grad_norm": 2.4442567825317383, | |
| "kl": 0.381591796875, | |
| "learning_rate": 1.3395428487445914e-07, | |
| "loss": 0.0152, | |
| "reward": 0.29816973581910133, | |
| "reward_std": 0.3240351192653179, | |
| "rewards/cosine_scaled_reward": -0.05487822741270065, | |
| "rewards/format_reward": 0.8541666716337204, | |
| "step": 445 | |
| }, | |
| { | |
| "completion_length": 1421.8333740234375, | |
| "epoch": 0.5097142857142857, | |
| "grad_norm": 1.0425515174865723, | |
| "kl": 0.075347900390625, | |
| "learning_rate": 1.3276726544494571e-07, | |
| "loss": 0.003, | |
| "reward": 0.22422216087579727, | |
| "reward_std": 0.24329788237810135, | |
| "rewards/cosine_scaled_reward": -0.02643941156566143, | |
| "rewards/format_reward": 0.7500000298023224, | |
| "step": 446 | |
| }, | |
| { | |
| "completion_length": 1299.6458740234375, | |
| "epoch": 0.5108571428571429, | |
| "grad_norm": 0.9955561757087708, | |
| "kl": 0.13824462890625, | |
| "learning_rate": 1.316005813502869e-07, | |
| "loss": 0.0055, | |
| "reward": 0.2557426951825619, | |
| "reward_std": 0.2716595195233822, | |
| "rewards/cosine_scaled_reward": -0.07632811553776264, | |
| "rewards/format_reward": 0.9166666716337204, | |
| "step": 447 | |
| }, | |
| { | |
| "completion_length": 1150.0417175292969, | |
| "epoch": 0.512, | |
| "grad_norm": 1.2563894987106323, | |
| "kl": 0.17791748046875, | |
| "learning_rate": 1.3045428945301953e-07, | |
| "loss": 0.0071, | |
| "reward": 0.32831166312098503, | |
| "reward_std": 0.2525380663573742, | |
| "rewards/cosine_scaled_reward": -0.023525401949882507, | |
| "rewards/format_reward": 0.9375000149011612, | |
| "step": 448 | |
| }, | |
| { | |
| "completion_length": 1260.6250305175781, | |
| "epoch": 0.5131428571428571, | |
| "grad_norm": 1.812902808189392, | |
| "kl": 0.26861572265625, | |
| "learning_rate": 1.2932844562179352e-07, | |
| "loss": 0.0107, | |
| "reward": 0.16285967081785202, | |
| "reward_std": 0.2142799235880375, | |
| "rewards/cosine_scaled_reward": -0.21595630422234535, | |
| "rewards/format_reward": 0.8750000149011612, | |
| "step": 449 | |
| }, | |
| { | |
| "completion_length": 1231.6667175292969, | |
| "epoch": 0.5142857142857142, | |
| "grad_norm": 1.7580350637435913, | |
| "kl": 0.19732666015625, | |
| "learning_rate": 1.2822310472864885e-07, | |
| "loss": 0.0079, | |
| "reward": 0.25604721903800964, | |
| "reward_std": 0.2637137100100517, | |
| "rewards/cosine_scaled_reward": -0.0012245629914104939, | |
| "rewards/format_reward": 0.8125000149011612, | |
| "step": 450 | |
| }, | |
| { | |
| "completion_length": 1126.6667022705078, | |
| "epoch": 0.5154285714285715, | |
| "grad_norm": 8.124907493591309, | |
| "kl": 0.2808837890625, | |
| "learning_rate": 1.2713832064634125e-07, | |
| "loss": 0.0112, | |
| "reward": 0.32462166622281075, | |
| "reward_std": 0.24854115769267082, | |
| "rewards/cosine_scaled_reward": -0.025696704164147377, | |
| "rewards/format_reward": 0.9166666865348816, | |
| "step": 451 | |
| }, | |
| { | |
| "completion_length": 1795.479232788086, | |
| "epoch": 0.5165714285714286, | |
| "grad_norm": 7.079920768737793, | |
| "kl": 0.427490234375, | |
| "learning_rate": 1.260741462457165e-07, | |
| "loss": 0.0171, | |
| "reward": 0.2079998729750514, | |
| "reward_std": 0.32888830825686455, | |
| "rewards/cosine_scaled_reward": -0.05745353177189827, | |
| "rewards/format_reward": 0.6875000298023224, | |
| "step": 452 | |
| }, | |
| { | |
| "completion_length": 1771.3750305175781, | |
| "epoch": 0.5177142857142857, | |
| "grad_norm": 0.9983645677566528, | |
| "kl": 0.3150634765625, | |
| "learning_rate": 1.2503063339313356e-07, | |
| "loss": 0.0126, | |
| "reward": 0.30452151922509074, | |
| "reward_std": 0.3506147041916847, | |
| "rewards/cosine_scaled_reward": -0.0257783941924572, | |
| "rewards/format_reward": 0.833333358168602, | |
| "step": 453 | |
| }, | |
| { | |
| "completion_length": 1406.2292175292969, | |
| "epoch": 0.5188571428571429, | |
| "grad_norm": 0.7196060419082642, | |
| "kl": 0.197479248046875, | |
| "learning_rate": 1.2400783294793668e-07, | |
| "loss": 0.0079, | |
| "reward": 0.23121106252074242, | |
| "reward_std": 0.20694708451628685, | |
| "rewards/cosine_scaled_reward": -0.07145766541361809, | |
| "rewards/format_reward": 0.8750000149011612, | |
| "step": 454 | |
| }, | |
| { | |
| "completion_length": 1343.8958740234375, | |
| "epoch": 0.52, | |
| "grad_norm": 2.1558892726898193, | |
| "kl": 0.191925048828125, | |
| "learning_rate": 1.2300579475997657e-07, | |
| "loss": 0.0077, | |
| "reward": 0.14940793626010418, | |
| "reward_std": 0.17657526955008507, | |
| "rewards/cosine_scaled_reward": -0.21375716105103493, | |
| "rewards/format_reward": 0.833333358168602, | |
| "step": 455 | |
| }, | |
| { | |
| "completion_length": 1670.0625610351562, | |
| "epoch": 0.5211428571428571, | |
| "grad_norm": 1.342545986175537, | |
| "kl": 0.29425048828125, | |
| "learning_rate": 1.220245676671809e-07, | |
| "loss": 0.0118, | |
| "reward": 0.22154957614839077, | |
| "reward_std": 0.30688488483428955, | |
| "rewards/cosine_scaled_reward": -0.1299609588459134, | |
| "rewards/format_reward": 0.833333358168602, | |
| "step": 456 | |
| }, | |
| { | |
| "completion_length": 1772.541748046875, | |
| "epoch": 0.5222857142857142, | |
| "grad_norm": 0.9106640219688416, | |
| "kl": 0.37689208984375, | |
| "learning_rate": 1.2106419949317388e-07, | |
| "loss": 0.0151, | |
| "reward": 0.27197565883398056, | |
| "reward_std": 0.26648762077093124, | |
| "rewards/cosine_scaled_reward": -0.009499620646238327, | |
| "rewards/format_reward": 0.7291666865348816, | |
| "step": 457 | |
| }, | |
| { | |
| "completion_length": 1358.4166870117188, | |
| "epoch": 0.5234285714285715, | |
| "grad_norm": 1.6191344261169434, | |
| "kl": 0.20147705078125, | |
| "learning_rate": 1.2012473704494537e-07, | |
| "loss": 0.0081, | |
| "reward": 0.17686885967850685, | |
| "reward_std": 0.1800503470003605, | |
| "rewards/cosine_scaled_reward": -0.150895812548697, | |
| "rewards/format_reward": 0.833333358168602, | |
| "step": 458 | |
| }, | |
| { | |
| "completion_length": 1272.2292022705078, | |
| "epoch": 0.5245714285714286, | |
| "grad_norm": 1.0406947135925293, | |
| "kl": 0.173828125, | |
| "learning_rate": 1.1920622611056974e-07, | |
| "loss": 0.0069, | |
| "reward": 0.3333798721432686, | |
| "reward_std": 0.29825500398874283, | |
| "rewards/cosine_scaled_reward": 0.05970238149166107, | |
| "rewards/format_reward": 0.895833358168602, | |
| "step": 459 | |
| }, | |
| { | |
| "completion_length": 2041.4167175292969, | |
| "epoch": 0.5257142857142857, | |
| "grad_norm": 2.132143259048462, | |
| "kl": 0.349609375, | |
| "learning_rate": 1.1830871145697412e-07, | |
| "loss": 0.014, | |
| "reward": 0.21742988645564765, | |
| "reward_std": 0.21657241694629192, | |
| "rewards/cosine_scaled_reward": -0.04676482267677784, | |
| "rewards/format_reward": 0.6666667014360428, | |
| "step": 460 | |
| }, | |
| { | |
| "completion_length": 1672.3125915527344, | |
| "epoch": 0.5268571428571428, | |
| "grad_norm": 1.4400893449783325, | |
| "kl": 0.27490234375, | |
| "learning_rate": 1.1743223682775649e-07, | |
| "loss": 0.011, | |
| "reward": 0.24742556363344193, | |
| "reward_std": 0.2879389263689518, | |
| "rewards/cosine_scaled_reward": -0.019029099494218826, | |
| "rewards/format_reward": 0.7500000298023224, | |
| "step": 461 | |
| }, | |
| { | |
| "completion_length": 1469.8958740234375, | |
| "epoch": 0.528, | |
| "grad_norm": 1.8993326425552368, | |
| "kl": 0.332977294921875, | |
| "learning_rate": 1.1657684494105386e-07, | |
| "loss": 0.0133, | |
| "reward": 0.17773004883201793, | |
| "reward_std": 0.26185616478323936, | |
| "rewards/cosine_scaled_reward": -0.20606819819658995, | |
| "rewards/format_reward": 0.8750000149011612, | |
| "step": 462 | |
| }, | |
| { | |
| "completion_length": 1753.7084045410156, | |
| "epoch": 0.5291428571428571, | |
| "grad_norm": 1.1649374961853027, | |
| "kl": 0.165557861328125, | |
| "learning_rate": 1.1574257748745986e-07, | |
| "loss": 0.0066, | |
| "reward": 0.2517554350197315, | |
| "reward_std": 0.19878023117780685, | |
| "rewards/cosine_scaled_reward": -0.03619728982448578, | |
| "rewards/format_reward": 0.8125000149011612, | |
| "step": 463 | |
| }, | |
| { | |
| "completion_length": 1010.4583435058594, | |
| "epoch": 0.5302857142857142, | |
| "grad_norm": 0.5831181406974792, | |
| "kl": 0.1368255615234375, | |
| "learning_rate": 1.1492947512799328e-07, | |
| "loss": 0.0055, | |
| "reward": 0.45407935231924057, | |
| "reward_std": 0.2512675076723099, | |
| "rewards/cosine_scaled_reward": 0.1662245448678732, | |
| "rewards/format_reward": 0.9375000149011612, | |
| "step": 464 | |
| }, | |
| { | |
| "completion_length": 1744.3125610351562, | |
| "epoch": 0.5314285714285715, | |
| "grad_norm": 1.3955875635147095, | |
| "kl": 0.37939453125, | |
| "learning_rate": 1.1413757749211602e-07, | |
| "loss": 0.0152, | |
| "reward": 0.2800963893532753, | |
| "reward_std": 0.3290780335664749, | |
| "rewards/cosine_scaled_reward": 0.019320473540574312, | |
| "rewards/format_reward": 0.7708333432674408, | |
| "step": 465 | |
| }, | |
| { | |
| "completion_length": 1541.3125610351562, | |
| "epoch": 0.5325714285714286, | |
| "grad_norm": 1.1654468774795532, | |
| "kl": 0.2691650390625, | |
| "learning_rate": 1.1336692317580158e-07, | |
| "loss": 0.0107, | |
| "reward": 0.3624018207192421, | |
| "reward_std": 0.29893627390265465, | |
| "rewards/cosine_scaled_reward": 0.09198451042175293, | |
| "rewards/format_reward": 0.7916666865348816, | |
| "step": 466 | |
| }, | |
| { | |
| "completion_length": 1635.2500610351562, | |
| "epoch": 0.5337142857142857, | |
| "grad_norm": 1.6793878078460693, | |
| "kl": 0.2359619140625, | |
| "learning_rate": 1.1261754973965422e-07, | |
| "loss": 0.0094, | |
| "reward": 0.2201872505247593, | |
| "reward_std": 0.23867875151336193, | |
| "rewards/cosine_scaled_reward": -0.14597262814641, | |
| "rewards/format_reward": 0.833333358168602, | |
| "step": 467 | |
| }, | |
| { | |
| "completion_length": 1782.3958435058594, | |
| "epoch": 0.5348571428571428, | |
| "grad_norm": 1.4512677192687988, | |
| "kl": 0.32489013671875, | |
| "learning_rate": 1.1188949370707787e-07, | |
| "loss": 0.013, | |
| "reward": 0.18340039625763893, | |
| "reward_std": 0.30998512730002403, | |
| "rewards/cosine_scaled_reward": -0.10351480543613434, | |
| "rewards/format_reward": 0.7500000149011612, | |
| "step": 468 | |
| }, | |
| { | |
| "completion_length": 1680.6250305175781, | |
| "epoch": 0.536, | |
| "grad_norm": 1.8575129508972168, | |
| "kl": 0.4377593994140625, | |
| "learning_rate": 1.1118279056249653e-07, | |
| "loss": 0.0175, | |
| "reward": 0.22907501395093277, | |
| "reward_std": 0.26676100492477417, | |
| "rewards/cosine_scaled_reward": -0.031132690608501434, | |
| "rewards/format_reward": 0.6875000298023224, | |
| "step": 469 | |
| }, | |
| { | |
| "completion_length": 1948.354248046875, | |
| "epoch": 0.5371428571428571, | |
| "grad_norm": 1.7162368297576904, | |
| "kl": 0.47454833984375, | |
| "learning_rate": 1.1049747474962444e-07, | |
| "loss": 0.019, | |
| "reward": 0.22342915716581047, | |
| "reward_std": 0.25090836361050606, | |
| "rewards/cosine_scaled_reward": -0.08544790372252464, | |
| "rewards/format_reward": 0.7500000149011612, | |
| "step": 470 | |
| }, | |
| { | |
| "completion_length": 1845.5625305175781, | |
| "epoch": 0.5382857142857143, | |
| "grad_norm": 1.5333935022354126, | |
| "kl": 0.4609375, | |
| "learning_rate": 1.0983357966978745e-07, | |
| "loss": 0.0184, | |
| "reward": 0.25642314786091447, | |
| "reward_std": 0.21491710096597672, | |
| "rewards/cosine_scaled_reward": 0.010931313037872314, | |
| "rewards/format_reward": 0.7291666865348816, | |
| "step": 471 | |
| }, | |
| { | |
| "completion_length": 1809.1250762939453, | |
| "epoch": 0.5394285714285715, | |
| "grad_norm": 1.18997061252594, | |
| "kl": 0.345947265625, | |
| "learning_rate": 1.0919113768029517e-07, | |
| "loss": 0.0138, | |
| "reward": 0.15577425621449947, | |
| "reward_std": 0.2659985013306141, | |
| "rewards/cosine_scaled_reward": -0.18055069167166948, | |
| "rewards/format_reward": 0.7500000149011612, | |
| "step": 472 | |
| }, | |
| { | |
| "completion_length": 2019.2292175292969, | |
| "epoch": 0.5405714285714286, | |
| "grad_norm": 1.1716952323913574, | |
| "kl": 0.402801513671875, | |
| "learning_rate": 1.0857018009286381e-07, | |
| "loss": 0.0161, | |
| "reward": 0.15595489647239447, | |
| "reward_std": 0.23244059830904007, | |
| "rewards/cosine_scaled_reward": -0.10964090749621391, | |
| "rewards/format_reward": 0.6875, | |
| "step": 473 | |
| }, | |
| { | |
| "completion_length": 1756.9584045410156, | |
| "epoch": 0.5417142857142857, | |
| "grad_norm": 1.262105941772461, | |
| "kl": 0.4705352783203125, | |
| "learning_rate": 1.0797073717209013e-07, | |
| "loss": 0.0189, | |
| "reward": 0.5162371173501015, | |
| "reward_std": 0.34698261320590973, | |
| "rewards/cosine_scaled_reward": 0.3371627281885594, | |
| "rewards/format_reward": 0.8125000149011612, | |
| "step": 474 | |
| }, | |
| { | |
| "completion_length": 1377.0416870117188, | |
| "epoch": 0.5428571428571428, | |
| "grad_norm": 0.5952227115631104, | |
| "kl": 0.145538330078125, | |
| "learning_rate": 1.0739283813397639e-07, | |
| "loss": 0.0058, | |
| "reward": 0.20968987792730331, | |
| "reward_std": 0.2660527192056179, | |
| "rewards/cosine_scaled_reward": -0.15221368055790663, | |
| "rewards/format_reward": 0.8541666865348816, | |
| "step": 475 | |
| }, | |
| { | |
| "completion_length": 1574.7083740234375, | |
| "epoch": 0.544, | |
| "grad_norm": 0.8631353974342346, | |
| "kl": 0.19976806640625, | |
| "learning_rate": 1.068365111445064e-07, | |
| "loss": 0.008, | |
| "reward": 0.26954812556505203, | |
| "reward_std": 0.29395921155810356, | |
| "rewards/cosine_scaled_reward": -0.04235384240746498, | |
| "rewards/format_reward": 0.8125000149011612, | |
| "step": 476 | |
| }, | |
| { | |
| "completion_length": 1422.0625610351562, | |
| "epoch": 0.5451428571428572, | |
| "grad_norm": 1.0784908533096313, | |
| "kl": 0.252349853515625, | |
| "learning_rate": 1.063017833182728e-07, | |
| "loss": 0.0101, | |
| "reward": 0.41880106925964355, | |
| "reward_std": 0.392875611782074, | |
| "rewards/cosine_scaled_reward": 0.1550253469031304, | |
| "rewards/format_reward": 0.8541666865348816, | |
| "step": 477 | |
| }, | |
| { | |
| "completion_length": 1763.166748046875, | |
| "epoch": 0.5462857142857143, | |
| "grad_norm": 1.126824140548706, | |
| "kl": 0.34912109375, | |
| "learning_rate": 1.0578868071715544e-07, | |
| "loss": 0.014, | |
| "reward": 0.14933402370661497, | |
| "reward_std": 0.29011841118335724, | |
| "rewards/cosine_scaled_reward": -0.12388145178556442, | |
| "rewards/format_reward": 0.6666666865348816, | |
| "step": 478 | |
| }, | |
| { | |
| "completion_length": 1986.5209350585938, | |
| "epoch": 0.5474285714285714, | |
| "grad_norm": 1.0951849222183228, | |
| "kl": 0.548828125, | |
| "learning_rate": 1.0529722834905125e-07, | |
| "loss": 0.022, | |
| "reward": 0.2293107956647873, | |
| "reward_std": 0.2566649541258812, | |
| "rewards/cosine_scaled_reward": -0.02941977046430111, | |
| "rewards/format_reward": 0.7291666865348816, | |
| "step": 479 | |
| }, | |
| { | |
| "completion_length": 1394.8958740234375, | |
| "epoch": 0.5485714285714286, | |
| "grad_norm": 1.8387150764465332, | |
| "kl": 0.277618408203125, | |
| "learning_rate": 1.0482745016665526e-07, | |
| "loss": 0.0111, | |
| "reward": 0.22987965494394302, | |
| "reward_std": 0.30185453593730927, | |
| "rewards/cosine_scaled_reward": -0.10149852558970451, | |
| "rewards/format_reward": 0.8125000149011612, | |
| "step": 480 | |
| }, | |
| { | |
| "completion_length": 1966.3541870117188, | |
| "epoch": 0.5497142857142857, | |
| "grad_norm": 2.787348508834839, | |
| "kl": 0.376220703125, | |
| "learning_rate": 1.0437936906629334e-07, | |
| "loss": 0.015, | |
| "reward": 0.12472720723599195, | |
| "reward_std": 0.21501713246107101, | |
| "rewards/cosine_scaled_reward": -0.2091291006654501, | |
| "rewards/format_reward": 0.7500000149011612, | |
| "step": 481 | |
| }, | |
| { | |
| "completion_length": 1356.5625305175781, | |
| "epoch": 0.5508571428571428, | |
| "grad_norm": 1.1721312999725342, | |
| "kl": 0.26988983154296875, | |
| "learning_rate": 1.0395300688680625e-07, | |
| "loss": 0.0108, | |
| "reward": 0.34367292933166027, | |
| "reward_std": 0.2708738148212433, | |
| "rewards/cosine_scaled_reward": 0.07783332094550133, | |
| "rewards/format_reward": 0.8750000149011612, | |
| "step": 482 | |
| }, | |
| { | |
| "completion_length": 1949.3750915527344, | |
| "epoch": 0.552, | |
| "grad_norm": 1.2884186506271362, | |
| "kl": 0.4071044921875, | |
| "learning_rate": 1.0354838440848501e-07, | |
| "loss": 0.0163, | |
| "reward": 0.19273016415536404, | |
| "reward_std": 0.3338697701692581, | |
| "rewards/cosine_scaled_reward": -0.11160976439714432, | |
| "rewards/format_reward": 0.708333358168602, | |
| "step": 483 | |
| }, | |
| { | |
| "completion_length": 1149.8333740234375, | |
| "epoch": 0.5531428571428572, | |
| "grad_norm": 1.1346054077148438, | |
| "kl": 0.19384765625, | |
| "learning_rate": 1.0316552135205837e-07, | |
| "loss": 0.0078, | |
| "reward": 0.27724506333470345, | |
| "reward_std": 0.27399446070194244, | |
| "rewards/cosine_scaled_reward": -0.07647367380559444, | |
| "rewards/format_reward": 0.9166666865348816, | |
| "step": 484 | |
| }, | |
| { | |
| "completion_length": 1416.6458435058594, | |
| "epoch": 0.5542857142857143, | |
| "grad_norm": 1.929132103919983, | |
| "kl": 0.243896484375, | |
| "learning_rate": 1.0280443637773163e-07, | |
| "loss": 0.0098, | |
| "reward": 0.2756602857261896, | |
| "reward_std": 0.26644935086369514, | |
| "rewards/cosine_scaled_reward": -0.017928674817085266, | |
| "rewards/format_reward": 0.8541667014360428, | |
| "step": 485 | |
| }, | |
| { | |
| "completion_length": 841.1666870117188, | |
| "epoch": 0.5554285714285714, | |
| "grad_norm": 1.185613751411438, | |
| "kl": 0.1144866943359375, | |
| "learning_rate": 1.0246514708427701e-07, | |
| "loss": 0.0046, | |
| "reward": 0.28294001519680023, | |
| "reward_std": 0.21592207811772823, | |
| "rewards/cosine_scaled_reward": -0.02465655282139778, | |
| "rewards/format_reward": 0.8750000149011612, | |
| "step": 486 | |
| }, | |
| { | |
| "completion_length": 1128.0625305175781, | |
| "epoch": 0.5565714285714286, | |
| "grad_norm": 0.5899915099143982, | |
| "kl": 0.0730743408203125, | |
| "learning_rate": 1.0214767000817596e-07, | |
| "loss": 0.0029, | |
| "reward": 0.4133741296827793, | |
| "reward_std": 0.2118063010275364, | |
| "rewards/cosine_scaled_reward": 0.17440680041909218, | |
| "rewards/format_reward": 0.8958333432674408, | |
| "step": 487 | |
| }, | |
| { | |
| "completion_length": 1287.9583740234375, | |
| "epoch": 0.5577142857142857, | |
| "grad_norm": 1.2799737453460693, | |
| "kl": 0.3245849609375, | |
| "learning_rate": 1.0185202062281336e-07, | |
| "loss": 0.013, | |
| "reward": 0.22669554874300957, | |
| "reward_std": 0.2413083128631115, | |
| "rewards/cosine_scaled_reward": -0.11942423251457512, | |
| "rewards/format_reward": 0.8958333432674408, | |
| "step": 488 | |
| }, | |
| { | |
| "completion_length": 1572.4375610351562, | |
| "epoch": 0.5588571428571428, | |
| "grad_norm": 1.0668160915374756, | |
| "kl": 0.39013671875, | |
| "learning_rate": 1.0157821333772304e-07, | |
| "loss": 0.0156, | |
| "reward": 0.06757297122385353, | |
| "reward_std": 0.17149998247623444, | |
| "rewards/cosine_scaled_reward": -0.2858339995145798, | |
| "rewards/format_reward": 0.8125000149011612, | |
| "step": 489 | |
| }, | |
| { | |
| "completion_length": 1397.5000305175781, | |
| "epoch": 0.56, | |
| "grad_norm": 1.396504521369934, | |
| "kl": 0.2890625, | |
| "learning_rate": 1.013262614978859e-07, | |
| "loss": 0.0116, | |
| "reward": 0.25191737711429596, | |
| "reward_std": 0.2361450344324112, | |
| "rewards/cosine_scaled_reward": -0.06171676144003868, | |
| "rewards/format_reward": 0.8333333432674408, | |
| "step": 490 | |
| }, | |
| { | |
| "completion_length": 1577.6459045410156, | |
| "epoch": 0.5611428571428572, | |
| "grad_norm": 1.347195029258728, | |
| "kl": 0.22802734375, | |
| "learning_rate": 1.0109617738307911e-07, | |
| "loss": 0.0091, | |
| "reward": 0.38772477209568024, | |
| "reward_std": 0.3787771388888359, | |
| "rewards/cosine_scaled_reward": 0.11236833222210407, | |
| "rewards/format_reward": 0.8125000149011612, | |
| "step": 491 | |
| }, | |
| { | |
| "completion_length": 1666.7708740234375, | |
| "epoch": 0.5622857142857143, | |
| "grad_norm": 1.133479356765747, | |
| "kl": 0.37799072265625, | |
| "learning_rate": 1.0088797220727779e-07, | |
| "loss": 0.0151, | |
| "reward": 0.2728476896882057, | |
| "reward_std": 0.2604874260723591, | |
| "rewards/cosine_scaled_reward": -0.008132921531796455, | |
| "rewards/format_reward": 0.7500000149011612, | |
| "step": 492 | |
| }, | |
| { | |
| "completion_length": 1406.8333435058594, | |
| "epoch": 0.5634285714285714, | |
| "grad_norm": 1.091591477394104, | |
| "kl": 0.2840576171875, | |
| "learning_rate": 1.0070165611810855e-07, | |
| "loss": 0.0114, | |
| "reward": 0.349657341837883, | |
| "reward_std": 0.35755816102027893, | |
| "rewards/cosine_scaled_reward": 0.09285960160195827, | |
| "rewards/format_reward": 0.8541666865348816, | |
| "step": 493 | |
| }, | |
| { | |
| "completion_length": 1206.7083740234375, | |
| "epoch": 0.5645714285714286, | |
| "grad_norm": 1.3146690130233765, | |
| "kl": 0.19622802734375, | |
| "learning_rate": 1.005372381963547e-07, | |
| "loss": 0.0078, | |
| "reward": 0.36272867769002914, | |
| "reward_std": 0.3516298234462738, | |
| "rewards/cosine_scaled_reward": -0.005188416689634323, | |
| "rewards/format_reward": 0.9583333432674408, | |
| "step": 494 | |
| }, | |
| { | |
| "completion_length": 1572.8542175292969, | |
| "epoch": 0.5657142857142857, | |
| "grad_norm": 3.402564525604248, | |
| "kl": 0.4200439453125, | |
| "learning_rate": 1.0039472645551372e-07, | |
| "loss": 0.0168, | |
| "reward": 0.21359156537801027, | |
| "reward_std": 0.29135290533304214, | |
| "rewards/cosine_scaled_reward": -0.05910599837079644, | |
| "rewards/format_reward": 0.729166679084301, | |
| "step": 495 | |
| }, | |
| { | |
| "completion_length": 1534.1250457763672, | |
| "epoch": 0.5668571428571428, | |
| "grad_norm": 0.9494636654853821, | |
| "kl": 0.3223876953125, | |
| "learning_rate": 1.002741278414069e-07, | |
| "loss": 0.0129, | |
| "reward": 0.26828407123684883, | |
| "reward_std": 0.22558922320604324, | |
| "rewards/cosine_scaled_reward": -0.002284809947013855, | |
| "rewards/format_reward": 0.7708333358168602, | |
| "step": 496 | |
| }, | |
| { | |
| "completion_length": 1434.4792175292969, | |
| "epoch": 0.568, | |
| "grad_norm": 1.0619434118270874, | |
| "kl": 0.4782257080078125, | |
| "learning_rate": 1.0017544823184055e-07, | |
| "loss": 0.0191, | |
| "reward": 0.3955768905580044, | |
| "reward_std": 0.2971377484500408, | |
| "rewards/cosine_scaled_reward": 0.10769826103933156, | |
| "rewards/format_reward": 0.8958333432674408, | |
| "step": 497 | |
| }, | |
| { | |
| "completion_length": 1431.750015258789, | |
| "epoch": 0.5691428571428572, | |
| "grad_norm": 0.4931652843952179, | |
| "kl": 0.1463775634765625, | |
| "learning_rate": 1.0009869243631952e-07, | |
| "loss": 0.0059, | |
| "reward": 0.24166107922792435, | |
| "reward_std": 0.2782045044004917, | |
| "rewards/cosine_scaled_reward": -0.050205936655402184, | |
| "rewards/format_reward": 0.7500000149011612, | |
| "step": 498 | |
| }, | |
| { | |
| "completion_length": 1504.541732788086, | |
| "epoch": 0.5702857142857143, | |
| "grad_norm": 0.9951570630073547, | |
| "kl": 0.208251953125, | |
| "learning_rate": 1.000438641958131e-07, | |
| "loss": 0.0083, | |
| "reward": 0.2595203034579754, | |
| "reward_std": 0.27923375740647316, | |
| "rewards/cosine_scaled_reward": -0.056234169751405716, | |
| "rewards/format_reward": 0.8750000298023224, | |
| "step": 499 | |
| }, | |
| { | |
| "completion_length": 1520.0625610351562, | |
| "epoch": 0.5714285714285714, | |
| "grad_norm": 2.099856376647949, | |
| "kl": 0.295166015625, | |
| "learning_rate": 1.0001096618257236e-07, | |
| "loss": 0.0118, | |
| "reward": 0.25935105979442596, | |
| "reward_std": 0.26287253201007843, | |
| "rewards/cosine_scaled_reward": -0.07230615895241499, | |
| "rewards/format_reward": 0.8750000149011612, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.5714285714285714, | |
| "step": 500, | |
| "total_flos": 0.0, | |
| "train_loss": 0.0020821976251731195, | |
| "train_runtime": 63377.2164, | |
| "train_samples_per_second": 0.379, | |
| "train_steps_per_second": 0.008 | |
| } | |
| ], | |
| "logging_steps": 1, | |
| "max_steps": 500, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 1, | |
| "save_steps": 50, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 0.0, | |
| "train_batch_size": 4, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |