diff --git "a/trainer_state.json" "b/trainer_state.json" --- "a/trainer_state.json" +++ "b/trainer_state.json" @@ -1,6534 +1,6521 @@ { "best_metric": null, "best_model_checkpoint": null, - "epoch": 0.2862857142857143, + "epoch": 0.5714285714285714, "eval_steps": 500, - "global_step": 501, + "global_step": 500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { - "completion_length": 2706.0833740234375, - "epoch": 0.0005714285714285715, - "grad_norm": 0.2501683533191681, + "completion_length": 2571.2083587646484, + "epoch": 0.001142857142857143, + "grad_norm": 0.19501760601997375, "kl": 0.0, "learning_rate": 2e-08, "loss": -0.0, - "reward": 0.11776293255388737, - "reward_std": 0.15003874525427818, - "rewards/cosine_scaled_reward": 0.11671920120716095, - "rewards/format_reward": 0.4583333544433117, + "reward": 0.08349451050162315, + "reward_std": 0.14101681299507618, + "rewards/cosine_scaled_reward": -0.015534311532974243, + "rewards/format_reward": 0.5208333488553762, "step": 1 }, { - "completion_length": 2436.3333435058594, - "epoch": 0.001142857142857143, - "grad_norm": 0.2955506443977356, + "completion_length": 2804.395881652832, + "epoch": 0.002285714285714286, + "grad_norm": 0.18392972648143768, "kl": 0.0, "learning_rate": 4e-08, "loss": -0.0, - "reward": 0.04922608844935894, - "reward_std": 0.13199488073587418, - "rewards/cosine_scaled_reward": -0.14778782427310944, - "rewards/format_reward": 0.5833333432674408, + "reward": 0.04647743375971913, + "reward_std": 0.071280462667346, + "rewards/cosine_scaled_reward": -0.04980122856795788, + "rewards/format_reward": 0.37500000558793545, "step": 2 }, { - "completion_length": 3366.9166870117188, - "epoch": 0.0017142857142857142, - "grad_norm": 0.2227727621793747, - "kl": 3.55839729309082e-05, + "completion_length": 3326.5208435058594, + "epoch": 0.0034285714285714284, + "grad_norm": 0.16814576089382172, + "kl": 4.73707914352417e-05, "learning_rate": 6e-08, "loss": 0.0, - "reward": -0.028425224125385284, - "reward_std": 0.12027337681502104, - "rewards/cosine_scaled_reward": -0.14689341001212597, - "rewards/format_reward": 0.125, + "reward": -0.05517918919213116, + "reward_std": 0.06846281955949962, + "rewards/cosine_scaled_reward": -0.23461355827748775, + "rewards/format_reward": 0.1458333395421505, "step": 3 }, { - "completion_length": 2430.8750610351562, - "epoch": 0.002285714285714286, - "grad_norm": 0.2626025974750519, - "kl": 2.8995797038078308e-05, + "completion_length": 2271.854202270508, + "epoch": 0.004571428571428572, + "grad_norm": 0.20840850472450256, + "kl": 3.003329038619995e-05, "learning_rate": 8e-08, "loss": 0.0, - "reward": 0.13804559502750635, - "reward_std": 0.11173753812909126, - "rewards/cosine_scaled_reward": 0.13660547882318497, - "rewards/format_reward": 0.541666679084301, + "reward": 0.104565495159477, + "reward_std": 0.1621289630420506, + "rewards/cosine_scaled_reward": -0.016989090479910374, + "rewards/format_reward": 0.6458333358168602, "step": 4 }, { - "completion_length": 3094.916748046875, - "epoch": 0.002857142857142857, - "grad_norm": 0.35939615964889526, - "kl": 3.844499588012695e-05, + "completion_length": 3269.250030517578, + "epoch": 0.005714285714285714, + "grad_norm": 0.20661523938179016, + "kl": 3.783777356147766e-05, "learning_rate": 1e-07, "loss": 0.0, - "reward": 0.0022379541769623756, - "reward_std": 0.11127193830907345, - "rewards/cosine_scaled_reward": -0.09858288243412971, - "rewards/format_reward": 0.2083333395421505, + "reward": -0.027549213060410693, + "reward_std": 0.11831692652776837, + "rewards/cosine_scaled_reward": -0.22735669882968068, + "rewards/format_reward": 0.29166667349636555, "step": 5 }, { - "completion_length": 3572.4583740234375, - "epoch": 0.0034285714285714284, - "grad_norm": 0.21475401520729065, - "kl": 4.488229751586914e-05, + "completion_length": 3207.125, + "epoch": 0.006857142857142857, + "grad_norm": 0.25092315673828125, + "kl": 4.430115222930908e-05, "learning_rate": 1.2e-07, "loss": 0.0, - "reward": -0.08444137871265411, - "reward_std": 0.054994030855596066, - "rewards/cosine_scaled_reward": -0.2687242180109024, - "rewards/format_reward": 0.0416666679084301, + "reward": 0.0003576022572815418, + "reward_std": 0.13864743057638407, + "rewards/cosine_scaled_reward": -0.12470872118137777, + "rewards/format_reward": 0.2500000074505806, "step": 6 }, { - "completion_length": 1544.166732788086, - "epoch": 0.004, - "grad_norm": 0.3717062473297119, - "kl": 4.8160552978515625e-05, + "completion_length": 3157.375045776367, + "epoch": 0.008, + "grad_norm": 0.17134779691696167, + "kl": 2.024322748184204e-05, "learning_rate": 1.4e-07, "loss": 0.0, - "reward": 0.09388712793588638, - "reward_std": 0.11137167830020189, - "rewards/cosine_scaled_reward": -0.16197674837894738, - "rewards/format_reward": 0.875, + "reward": 0.024647740297950804, + "reward_std": 0.12343645561486483, + "rewards/cosine_scaled_reward": -0.10497256461530924, + "rewards/format_reward": 0.354166679084301, "step": 7 }, { - "completion_length": 2584.5834350585938, - "epoch": 0.004571428571428572, - "grad_norm": 0.2368609756231308, - "kl": 2.600252628326416e-05, + "completion_length": 2640.6875610351562, + "epoch": 0.009142857142857144, + "grad_norm": 0.1863798052072525, + "kl": 1.722201704978943e-05, "learning_rate": 1.6e-07, "loss": 0.0, - "reward": 0.12684859707951546, - "reward_std": 0.18000701442360878, - "rewards/cosine_scaled_reward": 0.07870174164418131, - "rewards/format_reward": 0.583333358168602, + "reward": 0.11913361493498087, + "reward_std": 0.13311758637428284, + "rewards/cosine_scaled_reward": 0.08812191832112148, + "rewards/format_reward": 0.520833345130086, "step": 8 }, { - "completion_length": 3519.9583740234375, - "epoch": 0.005142857142857143, - "grad_norm": 0.22889883816242218, - "kl": 4.903972148895264e-05, + "completion_length": 3171.4166870117188, + "epoch": 0.010285714285714285, + "grad_norm": 0.20052321255207062, + "kl": 4.750490188598633e-05, "learning_rate": 1.8e-07, "loss": 0.0, - "reward": -0.08069122396409512, - "reward_std": 0.07182692922651768, - "rewards/cosine_scaled_reward": -0.2808060795068741, - "rewards/format_reward": 0.0833333358168602, + "reward": 0.04684965265914798, + "reward_std": 0.10927984630689025, + "rewards/cosine_scaled_reward": -0.030095582733338233, + "rewards/format_reward": 0.3333333469927311, "step": 9 }, { - "completion_length": 3400.6250610351562, - "epoch": 0.005714285714285714, - "grad_norm": 0.22983324527740479, - "kl": 3.796815872192383e-05, + "completion_length": 2619.479217529297, + "epoch": 0.011428571428571429, + "grad_norm": 0.1900119185447693, + "kl": 2.4830922484397888e-05, "learning_rate": 2e-07, "loss": 0.0, - "reward": 0.01905585126951337, - "reward_std": 0.192475326359272, - "rewards/cosine_scaled_reward": -0.1310137260297779, - "rewards/format_reward": 0.3750000037252903, + "reward": 0.0658606368524488, + "reward_std": 0.13385961623862386, + "rewards/cosine_scaled_reward": -0.01448088325560093, + "rewards/format_reward": 0.4166666716337204, "step": 10 }, { - "completion_length": 2809.8333740234375, - "epoch": 0.006285714285714286, - "grad_norm": 0.28179633617401123, - "kl": 3.349035978317261e-05, + "completion_length": 3247.4583740234375, + "epoch": 0.012571428571428572, + "grad_norm": 0.15901124477386475, + "kl": 2.709031105041504e-05, "learning_rate": 2.1999999999999998e-07, "loss": 0.0, - "reward": -0.010781999677419662, - "reward_std": 0.07108947075903416, - "rewards/cosine_scaled_reward": -0.19876819103956223, - "rewards/format_reward": 0.3333333358168602, + "reward": -0.04511611349880695, + "reward_std": 0.10903473664075136, + "rewards/cosine_scaled_reward": -0.21695117373019457, + "rewards/format_reward": 0.16666667349636555, "step": 11 }, { - "completion_length": 3279.0416870117188, - "epoch": 0.006857142857142857, - "grad_norm": 0.23209799826145172, - "kl": 3.993511199951172e-05, + "completion_length": 2527.062515258789, + "epoch": 0.013714285714285714, + "grad_norm": 0.21673277020454407, + "kl": 4.7400593757629395e-05, "learning_rate": 2.4e-07, "loss": 0.0, - "reward": -0.025608792901039124, - "reward_std": 0.12979500833898783, - "rewards/cosine_scaled_reward": -0.18058418296277523, - "rewards/format_reward": 0.2083333358168602, + "reward": 0.07390611409209669, + "reward_std": 0.13868055026978254, + "rewards/cosine_scaled_reward": -0.0959229115396738, + "rewards/format_reward": 0.6250000074505806, "step": 12 }, { - "completion_length": 3470.4166870117188, - "epoch": 0.0074285714285714285, - "grad_norm": 0.21848788857460022, - "kl": 2.7291476726531982e-05, + "completion_length": 2923.7083740234375, + "epoch": 0.014857142857142857, + "grad_norm": 0.18947269022464752, + "kl": 3.428012132644653e-05, "learning_rate": 2.6e-07, "loss": 0.0, - "reward": 0.029045815274002962, - "reward_std": 0.2295490764081478, - "rewards/cosine_scaled_reward": -0.08041818533092737, - "rewards/format_reward": 0.3333333395421505, + "reward": 0.035209502559155226, + "reward_std": 0.11472400068305433, + "rewards/cosine_scaled_reward": -0.0937328040599823, + "rewards/format_reward": 0.39583334140479565, "step": 13 }, { - "completion_length": 2837.8334350585938, - "epoch": 0.008, - "grad_norm": 0.22295141220092773, - "kl": 2.562999725341797e-05, + "completion_length": 2934.9166870117188, + "epoch": 0.016, + "grad_norm": 0.24321378767490387, + "kl": 3.0156224966049194e-05, "learning_rate": 2.8e-07, "loss": 0.0, - "reward": 0.10421890311408788, - "reward_std": 0.07840518932789564, - "rewards/cosine_scaled_reward": -0.06545893847942352, - "rewards/format_reward": 0.7500000149011612, + "reward": 0.022672508843243122, + "reward_std": 0.12115806993097067, + "rewards/cosine_scaled_reward": -0.1290947226807475, + "rewards/format_reward": 0.39583333767950535, "step": 14 }, { - "completion_length": 2813.166717529297, - "epoch": 0.008571428571428572, - "grad_norm": 0.26672378182411194, - "kl": 1.5015248209238052e-05, + "completion_length": 2780.0833587646484, + "epoch": 0.017142857142857144, + "grad_norm": 0.2025020718574524, + "kl": 2.3249536752700806e-05, "learning_rate": 3e-07, "loss": 0.0, - "reward": 0.1173621149500832, - "reward_std": 0.12413903884589672, - "rewards/cosine_scaled_reward": 0.14990346878767014, - "rewards/format_reward": 0.3750000037252903, + "reward": 0.07990049698855728, + "reward_std": 0.07795312092639506, + "rewards/cosine_scaled_reward": 0.005295965820550919, + "rewards/format_reward": 0.45833334140479565, "step": 15 }, { - "completion_length": 2549.7500610351562, - "epoch": 0.009142857142857144, - "grad_norm": 0.22926995158195496, - "kl": 2.8446316719055176e-05, + "completion_length": 3492.312530517578, + "epoch": 0.018285714285714287, + "grad_norm": 0.18354374170303345, + "kl": 4.486739635467529e-05, "learning_rate": 3.2e-07, "loss": 0.0, - "reward": 0.19004815444350243, - "reward_std": 0.1421443521976471, - "rewards/cosine_scaled_reward": 0.2294003665447235, - "rewards/format_reward": 0.6666666865348816, + "reward": -0.06231466308236122, + "reward_std": 0.08755358215421438, + "rewards/cosine_scaled_reward": -0.22571247071027756, + "rewards/format_reward": 0.0833333358168602, "step": 16 }, { - "completion_length": 3186.625, - "epoch": 0.009714285714285713, - "grad_norm": 0.24431762099266052, - "kl": 4.646182060241699e-05, + "completion_length": 2405.56254196167, + "epoch": 0.019428571428571427, + "grad_norm": 0.2691311836242676, + "kl": 3.308616578578949e-05, "learning_rate": 3.4000000000000003e-07, "loss": 0.0, - "reward": 0.026559581980109215, - "reward_std": 0.1105972295626998, - "rewards/cosine_scaled_reward": -0.10945820808410645, - "rewards/format_reward": 0.375, + "reward": 0.07129185972735286, + "reward_std": 0.13615732779726386, + "rewards/cosine_scaled_reward": -0.07220900629181415, + "rewards/format_reward": 0.562500013038516, "step": 17 }, { - "completion_length": 2963.0001220703125, - "epoch": 0.010285714285714285, - "grad_norm": 0.33800262212753296, - "kl": 3.93986701965332e-05, + "completion_length": 2926.854232788086, + "epoch": 0.02057142857142857, + "grad_norm": 0.16645942628383636, + "kl": 2.5579705834388733e-05, "learning_rate": 3.6e-07, "loss": 0.0, - "reward": 0.05916056130081415, - "reward_std": 0.16077539697289467, - "rewards/cosine_scaled_reward": -0.05458628945052624, - "rewards/format_reward": 0.4583333395421505, + "reward": 0.018077057087793946, + "reward_std": 0.09778019832447171, + "rewards/cosine_scaled_reward": -0.12368878477718681, + "rewards/format_reward": 0.35416667349636555, "step": 18 }, { - "completion_length": 3570.0416870117188, - "epoch": 0.010857142857142857, - "grad_norm": 0.227882519364357, - "kl": 3.0517578125e-05, + "completion_length": 2740.2083740234375, + "epoch": 0.021714285714285714, + "grad_norm": 0.36254823207855225, + "kl": 2.2433698177337646e-05, "learning_rate": 3.7999999999999996e-07, "loss": 0.0, - "reward": 0.0035021062940359116, - "reward_std": 0.14238406158983707, - "rewards/cosine_scaled_reward": -0.09337483113631606, - "rewards/format_reward": 0.2083333358168602, + "reward": 0.11815963860135525, + "reward_std": 0.17526093125343323, + "rewards/cosine_scaled_reward": 0.10508107638452202, + "rewards/format_reward": 0.4791666753590107, "step": 19 }, { - "completion_length": 1829.3333435058594, - "epoch": 0.011428571428571429, - "grad_norm": 0.3233315944671631, - "kl": 1.4620833098888397e-05, + "completion_length": 2277.77091217041, + "epoch": 0.022857142857142857, + "grad_norm": 0.19969557225704193, + "kl": 1.2811273336410522e-05, "learning_rate": 4e-07, "loss": 0.0, - "reward": 0.09831320657394826, - "reward_std": 0.14866896159946918, - "rewards/cosine_scaled_reward": -0.02309577353298664, - "rewards/format_reward": 0.625, + "reward": 0.12608362920582294, + "reward_std": 0.16070688236504793, + "rewards/cosine_scaled_reward": 0.003691190853714943, + "rewards/format_reward": 0.7291666865348816, "step": 20 }, { - "completion_length": 3162.1250610351562, - "epoch": 0.012, - "grad_norm": 0.24830858409404755, - "kl": 2.905726432800293e-05, + "completion_length": 2652.770896911621, + "epoch": 0.024, + "grad_norm": 0.28585541248321533, + "kl": 4.544854164123535e-05, "learning_rate": 4.1999999999999995e-07, "loss": 0.0, - "reward": -0.04330470971763134, - "reward_std": 0.1002501891925931, - "rewards/cosine_scaled_reward": -0.23210086300969124, - "rewards/format_reward": 0.2083333395421505, + "reward": 0.02955322596244514, + "reward_std": 0.12703021708875895, + "rewards/cosine_scaled_reward": -0.10260925628244877, + "rewards/format_reward": 0.3750000037252903, "step": 21 }, { - "completion_length": 3584.0, - "epoch": 0.012571428571428572, - "grad_norm": 0.19824904203414917, - "kl": 3.5375356674194336e-05, + "completion_length": 1915.7292022705078, + "epoch": 0.025142857142857144, + "grad_norm": 0.2661941349506378, + "kl": 2.355128526687622e-05, "learning_rate": 4.3999999999999997e-07, "loss": 0.0, - "reward": -0.07378904055804014, - "reward_std": 0.04473175760358572, - "rewards/cosine_scaled_reward": -0.21756086684763432, - "rewards/format_reward": 0.0, + "reward": 0.11611313896719366, + "reward_std": 0.1277361772954464, + "rewards/cosine_scaled_reward": -0.013740219175815582, + "rewards/format_reward": 0.7083333469927311, "step": 22 }, { - "completion_length": 3017.5, - "epoch": 0.013142857142857144, - "grad_norm": 0.29069581627845764, - "kl": 5.033612251281738e-05, + "completion_length": 2549.1458892822266, + "epoch": 0.026285714285714287, + "grad_norm": 0.2124091237783432, + "kl": 3.099720925092697e-05, "learning_rate": 4.6e-07, "loss": 0.0, - "reward": -0.05077078752219677, - "reward_std": 0.06933087762445211, - "rewards/cosine_scaled_reward": -0.3174169808626175, - "rewards/format_reward": 0.3333333358168602, + "reward": 0.05105366797943134, + "reward_std": 0.1019338914193213, + "rewards/cosine_scaled_reward": -0.10208869446069002, + "rewards/format_reward": 0.500000013038516, "step": 23 }, { - "completion_length": 2011.2500610351562, - "epoch": 0.013714285714285714, - "grad_norm": 0.35230743885040283, - "kl": 3.3915042877197266e-05, + "completion_length": 2841.8333740234375, + "epoch": 0.027428571428571427, + "grad_norm": 0.23832456767559052, + "kl": 2.9023736715316772e-05, "learning_rate": 4.8e-07, "loss": 0.0, - "reward": 0.1924394704401493, - "reward_std": 0.1356711070984602, - "rewards/cosine_scaled_reward": 0.12988251447677612, - "rewards/format_reward": 0.875, + "reward": 0.055271712597459555, + "reward_std": 0.15443201549351215, + "rewards/cosine_scaled_reward": -0.05593502405099571, + "rewards/format_reward": 0.43750000931322575, "step": 24 }, { - "completion_length": 2952.8334350585938, - "epoch": 0.014285714285714285, - "grad_norm": 0.29196441173553467, - "kl": 2.822279930114746e-05, + "completion_length": 2734.6250228881836, + "epoch": 0.02857142857142857, + "grad_norm": 0.20576398074626923, + "kl": 4.264712333679199e-05, "learning_rate": 5e-07, "loss": 0.0, - "reward": 0.060000598430633545, - "reward_std": 0.1326651219278574, - "rewards/cosine_scaled_reward": -0.009722378104925156, - "rewards/format_reward": 0.3750000111758709, + "reward": 0.03582445718348026, + "reward_std": 0.14606032520532608, + "rewards/cosine_scaled_reward": -0.0837564684334211, + "rewards/format_reward": 0.3750000074505806, "step": 25 }, { - "completion_length": 2867.625, - "epoch": 0.014857142857142857, - "grad_norm": 0.2785350978374481, - "kl": 2.4410896003246307e-05, + "completion_length": 2954.3958740234375, + "epoch": 0.029714285714285714, + "grad_norm": 0.16195496916770935, + "kl": 3.225356340408325e-05, "learning_rate": 5.2e-07, "loss": 0.0, - "reward": 0.09439184702932835, - "reward_std": 0.1144513338804245, - "rewards/cosine_scaled_reward": 0.04433799907565117, - "rewards/format_reward": 0.4583333432674408, + "reward": 0.06916803470812738, + "reward_std": 0.08843644242733717, + "rewards/cosine_scaled_reward": -0.023365769535303116, + "rewards/format_reward": 0.45833334140479565, "step": 26 }, { - "completion_length": 3351.666748046875, - "epoch": 0.015428571428571429, - "grad_norm": 0.2856082320213318, - "kl": 2.9489398002624512e-05, + "completion_length": 2985.125045776367, + "epoch": 0.030857142857142857, + "grad_norm": 0.18544963002204895, + "kl": 2.6202760636806488e-05, "learning_rate": 5.4e-07, "loss": 0.0, - "reward": -0.050767247565090656, - "reward_std": 0.12167283333837986, - "rewards/cosine_scaled_reward": -0.25269323866814375, - "rewards/format_reward": 0.2083333395421505, + "reward": 0.0820373228052631, + "reward_std": 0.15796185052022338, + "rewards/cosine_scaled_reward": -0.017732856795191765, + "rewards/format_reward": 0.5208333432674408, "step": 27 }, { - "completion_length": 2461.3333740234375, - "epoch": 0.016, - "grad_norm": 0.26561662554740906, - "kl": 3.0346214771270752e-05, + "completion_length": 2774.625015258789, + "epoch": 0.032, + "grad_norm": 0.1868346929550171, + "kl": 3.4227967262268066e-05, "learning_rate": 5.6e-07, "loss": 0.0, - "reward": 0.14572910498827696, - "reward_std": 0.1390552008524537, - "rewards/cosine_scaled_reward": 0.09711413830518723, - "rewards/format_reward": 0.6666666716337204, + "reward": 0.09084612363949418, + "reward_std": 0.08862769743427634, + "rewards/cosine_scaled_reward": 0.03848753869533539, + "rewards/format_reward": 0.4583333432674408, "step": 28 }, { - "completion_length": 2944.375, - "epoch": 0.01657142857142857, - "grad_norm": 0.30562201142311096, - "kl": 2.7602538466453552e-05, + "completion_length": 3335.4791870117188, + "epoch": 0.03314285714285714, + "grad_norm": 0.1792682260274887, + "kl": 2.7485191822052002e-05, "learning_rate": 5.8e-07, "loss": 0.0, - "reward": 0.07183475513011217, - "reward_std": 0.13248160295188427, - "rewards/cosine_scaled_reward": 0.06457573268562555, - "rewards/format_reward": 0.2916666679084301, + "reward": -0.04747861542273313, + "reward_std": 0.10171156749129295, + "rewards/cosine_scaled_reward": -0.22336439974606037, + "rewards/format_reward": 0.16666666977107525, "step": 29 }, { - "completion_length": 2580.0000610351562, - "epoch": 0.017142857142857144, - "grad_norm": 0.24796323478221893, - "kl": 1.662224531173706e-05, + "completion_length": 3182.291717529297, + "epoch": 0.03428571428571429, + "grad_norm": 0.20437775552272797, + "kl": 2.625095658004284e-05, "learning_rate": 6e-07, "loss": 0.0, - "reward": 0.0457475371658802, - "reward_std": 0.10252010356634855, - "rewards/cosine_scaled_reward": -0.15621890127658844, - "rewards/format_reward": 0.5833333358168602, + "reward": 0.04219546925742179, + "reward_std": 0.16496449895203114, + "rewards/cosine_scaled_reward": -0.04208953632041812, + "rewards/format_reward": 0.33333334140479565, "step": 30 }, { - "completion_length": 3464.6250610351562, - "epoch": 0.017714285714285714, - "grad_norm": 0.26567551493644714, - "kl": 4.1037797927856445e-05, + "completion_length": 3098.729202270508, + "epoch": 0.03542857142857143, + "grad_norm": 0.22590124607086182, + "kl": 3.269314765930176e-05, "learning_rate": 6.2e-07, "loss": 0.0, - "reward": -0.042508176527917385, - "reward_std": 0.11092632170766592, - "rewards/cosine_scaled_reward": -0.16741123114479706, - "rewards/format_reward": 0.0833333358168602, + "reward": 0.005523839652596507, + "reward_std": 0.14406571350991726, + "rewards/cosine_scaled_reward": -0.10929703339934349, + "rewards/format_reward": 0.2500000111758709, "step": 31 }, { - "completion_length": 3584.0, - "epoch": 0.018285714285714287, - "grad_norm": 0.23417529463768005, - "kl": 3.102421760559082e-05, + "completion_length": 3180.729248046875, + "epoch": 0.036571428571428574, + "grad_norm": 0.1668621450662613, + "kl": 2.413243055343628e-05, "learning_rate": 6.4e-07, "loss": 0.0, - "reward": -0.08364807441830635, - "reward_std": 0.06065140198916197, - "rewards/cosine_scaled_reward": -0.24549809098243713, - "rewards/format_reward": 0.0, + "reward": 0.02685558469966054, + "reward_std": 0.11427591601386666, + "rewards/cosine_scaled_reward": -0.07639202522113919, + "rewards/format_reward": 0.31250000186264515, "step": 32 }, { - "completion_length": 1886.0000534057617, - "epoch": 0.018857142857142857, - "grad_norm": 0.41026684641838074, - "kl": 1.4901161193847656e-05, + "completion_length": 3349.291748046875, + "epoch": 0.037714285714285714, + "grad_norm": 0.14419840276241302, + "kl": 3.168731927871704e-05, "learning_rate": 6.6e-07, "loss": 0.0, - "reward": 0.13917259220033884, - "reward_std": 0.1789782401174307, - "rewards/cosine_scaled_reward": 0.07767122983932495, - "rewards/format_reward": 0.6666666865348816, + "reward": 0.039358544163405895, + "reward_std": 0.16947886534035206, + "rewards/cosine_scaled_reward": -0.06123522081179544, + "rewards/format_reward": 0.3541666753590107, "step": 33 }, { - "completion_length": 2840.000030517578, - "epoch": 0.019428571428571427, - "grad_norm": 0.3186735510826111, - "kl": 3.826618194580078e-05, + "completion_length": 2380.9167098999023, + "epoch": 0.038857142857142854, + "grad_norm": 0.22925761342048645, + "kl": 2.7898699045181274e-05, "learning_rate": 6.800000000000001e-07, "loss": 0.0, - "reward": 0.06006689742207527, - "reward_std": 0.13148816861212254, - "rewards/cosine_scaled_reward": -0.05165791395120323, - "rewards/format_reward": 0.4583333395421505, + "reward": 0.10769698303192854, + "reward_std": 0.1343193519860506, + "rewards/cosine_scaled_reward": 0.024825414642691612, + "rewards/format_reward": 0.5833333358168602, "step": 34 }, { - "completion_length": 3322.166748046875, - "epoch": 0.02, - "grad_norm": 0.20957322418689728, - "kl": 1.1128373444080353e-05, + "completion_length": 2984.8959197998047, + "epoch": 0.04, + "grad_norm": 0.22632969915866852, + "kl": 3.349222242832184e-05, "learning_rate": 7e-07, "loss": 0.0, - "reward": 0.039599085692316294, - "reward_std": 0.17049845680594444, - "rewards/cosine_scaled_reward": -0.02847505733370781, - "rewards/format_reward": 0.2916666753590107, + "reward": 0.008388399612158537, + "reward_std": 0.10677651688456535, + "rewards/cosine_scaled_reward": -0.172616648953408, + "rewards/format_reward": 0.3958333395421505, "step": 35 }, { - "completion_length": 2600.250030517578, - "epoch": 0.02057142857142857, - "grad_norm": 0.2403794527053833, - "kl": 2.065300941467285e-05, + "completion_length": 3379.3333740234375, + "epoch": 0.04114285714285714, + "grad_norm": 0.1803259551525116, + "kl": 3.5781413316726685e-05, "learning_rate": 7.2e-07, "loss": 0.0, - "reward": 0.019864825531840324, - "reward_std": 0.08492440357804298, - "rewards/cosine_scaled_reward": -0.17020654119551182, - "rewards/format_reward": 0.4583333432674408, + "reward": -0.04249414807418361, + "reward_std": 0.11586784245446324, + "rewards/cosine_scaled_reward": -0.21885607368312776, + "rewards/format_reward": 0.18750000186264515, "step": 36 }, { - "completion_length": 2663.2500915527344, - "epoch": 0.021142857142857144, - "grad_norm": 0.2849242091178894, - "kl": 2.4974346160888672e-05, + "completion_length": 3416.3541870117188, + "epoch": 0.04228571428571429, + "grad_norm": 0.14628717303276062, + "kl": 1.6780570149421692e-05, "learning_rate": 7.4e-07, "loss": 0.0, - "reward": 0.16150063159875572, - "reward_std": 0.20827658474445343, - "rewards/cosine_scaled_reward": 0.20511333644390106, - "rewards/format_reward": 0.541666679084301, + "reward": -0.050946102710440755, + "reward_std": 0.07268051384016871, + "rewards/cosine_scaled_reward": -0.22305289562791586, + "rewards/format_reward": 0.1458333395421505, "step": 37 }, { - "completion_length": 3248.0833740234375, - "epoch": 0.021714285714285714, - "grad_norm": 0.19711735844612122, - "kl": 2.995133399963379e-05, + "completion_length": 3168.875015258789, + "epoch": 0.04342857142857143, + "grad_norm": 0.15368561446666718, + "kl": 2.38809734582901e-05, "learning_rate": 7.599999999999999e-07, "loss": 0.0, - "reward": 0.026037941686809063, - "reward_std": 0.20191513188183308, - "rewards/cosine_scaled_reward": -0.048709994181990623, - "rewards/format_reward": 0.2500000037252903, + "reward": -0.01665067719295621, + "reward_std": 0.06848278688266873, + "rewards/cosine_scaled_reward": -0.14153539016842842, + "rewards/format_reward": 0.1875, "step": 38 }, { - "completion_length": 2381.6667098999023, - "epoch": 0.022285714285714287, - "grad_norm": 0.2262616902589798, - "kl": 7.659196853637695e-06, + "completion_length": 2897.9583740234375, + "epoch": 0.044571428571428574, + "grad_norm": 0.17705340683460236, + "kl": 1.7940998077392578e-05, "learning_rate": 7.799999999999999e-07, - "loss": -0.0, - "reward": 0.1581143403891474, - "reward_std": 0.1152333621867001, - "rewards/cosine_scaled_reward": 0.06927675288170576, - "rewards/format_reward": 0.7916666716337204, + "loss": 0.0, + "reward": 0.05753035913221538, + "reward_std": 0.07578674505930394, + "rewards/cosine_scaled_reward": -0.04679988697171211, + "rewards/format_reward": 0.4375000037252903, "step": 39 }, { - "completion_length": 2882.500030517578, - "epoch": 0.022857142857142857, - "grad_norm": 0.21805818378925323, - "kl": 3.94284725189209e-05, + "completion_length": 2554.7708892822266, + "epoch": 0.045714285714285714, + "grad_norm": 0.22783087193965912, + "kl": 2.1063722670078278e-05, "learning_rate": 8e-07, "loss": 0.0, - "reward": 0.004913114244118333, - "reward_std": 0.15220978297293186, - "rewards/cosine_scaled_reward": -0.2152528576552868, - "rewards/format_reward": 0.4583333395421505, + "reward": 0.04392236044805031, + "reward_std": 0.11113758757710457, + "rewards/cosine_scaled_reward": -0.11080039292573929, + "rewards/format_reward": 0.4791666753590107, "step": 40 }, { - "completion_length": 3445.541748046875, - "epoch": 0.023428571428571427, - "grad_norm": 0.2037835419178009, - "kl": 1.8320046365261078e-05, + "completion_length": 2975.8958740234375, + "epoch": 0.046857142857142854, + "grad_norm": 0.22396467626094818, + "kl": 1.806439831852913e-05, "learning_rate": 8.199999999999999e-07, "loss": 0.0, - "reward": -0.01321748155169189, - "reward_std": 0.15236316993832588, - "rewards/cosine_scaled_reward": -0.12253267783671618, - "rewards/format_reward": 0.1666666716337204, + "reward": -0.02207160711986944, + "reward_std": 0.12583239562809467, + "rewards/cosine_scaled_reward": -0.24189986009150743, + "rewards/format_reward": 0.3541666753590107, "step": 41 }, { - "completion_length": 2160.166717529297, - "epoch": 0.024, - "grad_norm": 0.32533666491508484, - "kl": 0.00010207295417785645, + "completion_length": 2725.9166984558105, + "epoch": 0.048, + "grad_norm": 0.27289333939552307, + "kl": 3.649294376373291e-05, "learning_rate": 8.399999999999999e-07, "loss": 0.0, - "reward": 0.11436812300235033, - "reward_std": 0.10017563961446285, - "rewards/cosine_scaled_reward": 0.0031651929020881653, - "rewards/format_reward": 0.6666666865348816, + "reward": -0.031094663951080292, + "reward_std": 0.07978959055617452, + "rewards/cosine_scaled_reward": -0.28885440342128277, + "rewards/format_reward": 0.3958333395421505, "step": 42 }, { - "completion_length": 1653.2083435058594, - "epoch": 0.02457142857142857, - "grad_norm": 0.3985632359981537, - "kl": 0.00018486008048057556, + "completion_length": 2959.916702270508, + "epoch": 0.04914285714285714, + "grad_norm": 0.20727042853832245, + "kl": 2.0106323063373566e-05, "learning_rate": 8.599999999999999e-07, "loss": 0.0, - "reward": 0.09936422016471624, - "reward_std": 0.11649142764508724, - "rewards/cosine_scaled_reward": -0.14586824923753738, - "rewards/format_reward": 0.875, + "reward": 0.02459817798808217, + "reward_std": 0.1277715265750885, + "rewards/cosine_scaled_reward": -0.08350535575300455, + "rewards/format_reward": 0.3125000037252903, "step": 43 }, { - "completion_length": 2075.0416717529297, - "epoch": 0.025142857142857144, - "grad_norm": 0.39254382252693176, - "kl": 5.186349153518677e-05, + "completion_length": 2700.6458740234375, + "epoch": 0.05028571428571429, + "grad_norm": 0.2838585376739502, + "kl": 5.9839338064193726e-05, "learning_rate": 8.799999999999999e-07, "loss": 0.0, - "reward": 0.1214053895091638, - "reward_std": 0.12070498056709766, - "rewards/cosine_scaled_reward": 0.04414374195039272, - "rewards/format_reward": 0.625, + "reward": 0.07408008817583323, + "reward_std": 0.1297681350260973, + "rewards/cosine_scaled_reward": -0.011715584434568882, + "rewards/format_reward": 0.45833334140479565, "step": 44 }, { - "completion_length": 2018.1250305175781, - "epoch": 0.025714285714285714, - "grad_norm": 0.2977658808231354, - "kl": 5.125999450683594e-05, + "completion_length": 3354.8125610351562, + "epoch": 0.05142857142857143, + "grad_norm": 0.1483648717403412, + "kl": 1.9919127225875854e-05, "learning_rate": 9e-07, "loss": 0.0, - "reward": 0.08466422068886459, - "reward_std": 0.10737069696187973, - "rewards/cosine_scaled_reward": -0.08546392060816288, - "rewards/format_reward": 0.6666666865348816, + "reward": 0.042736097471788526, + "reward_std": 0.13064279220998287, + "rewards/cosine_scaled_reward": -0.017792840400943533, + "rewards/format_reward": 0.2916666716337204, "step": 45 }, { - "completion_length": 3075.4583435058594, - "epoch": 0.026285714285714287, - "grad_norm": 0.2357693314552307, - "kl": 4.1171908378601074e-05, + "completion_length": 3156.3959045410156, + "epoch": 0.052571428571428575, + "grad_norm": 0.18322643637657166, + "kl": 2.984795719385147e-05, "learning_rate": 9.2e-07, "loss": 0.0, - "reward": -0.03193952515721321, - "reward_std": 0.07977707567624748, - "rewards/cosine_scaled_reward": -0.24097227677702904, - "rewards/format_reward": 0.2916666753590107, + "reward": -0.0388933519134298, + "reward_std": 0.09174182126298547, + "rewards/cosine_scaled_reward": -0.21921097254380584, + "rewards/format_reward": 0.2083333358168602, "step": 46 }, { - "completion_length": 2144.4166870117188, - "epoch": 0.026857142857142857, - "grad_norm": 0.4162147045135498, - "kl": 7.081031799316406e-05, + "completion_length": 2675.1042098999023, + "epoch": 0.053714285714285714, + "grad_norm": 0.22902333736419678, + "kl": 3.225822001695633e-05, "learning_rate": 9.399999999999999e-07, "loss": 0.0, - "reward": 0.08159872889518738, - "reward_std": 0.14609019458293915, - "rewards/cosine_scaled_reward": -0.05107011832296848, - "rewards/format_reward": 0.5833333432674408, + "reward": 0.11461905902251601, + "reward_std": 0.14914221363142133, + "rewards/cosine_scaled_reward": 0.06888513453304768, + "rewards/format_reward": 0.5416666809469461, "step": 47 }, { - "completion_length": 3097.291748046875, - "epoch": 0.027428571428571427, - "grad_norm": 0.18213987350463867, - "kl": 2.221018075942993e-05, + "completion_length": 2940.250015258789, + "epoch": 0.054857142857142854, + "grad_norm": 0.1693173050880432, + "kl": 9.981123730540276e-05, "learning_rate": 9.6e-07, "loss": 0.0, - "reward": 0.09545105075812899, - "reward_std": 0.17484111711382866, - "rewards/cosine_scaled_reward": 0.05273487728845794, - "rewards/format_reward": 0.4583333395421505, + "reward": 0.029834913242666516, + "reward_std": 0.1260078912600875, + "rewards/cosine_scaled_reward": -0.07712742034345865, + "rewards/format_reward": 0.3333333358168602, "step": 48 }, { - "completion_length": 2946.500030517578, - "epoch": 0.028, - "grad_norm": 0.22182704508304596, - "kl": 7.066130638122559e-05, + "completion_length": 2305.0000381469727, + "epoch": 0.056, + "grad_norm": 0.2164977639913559, + "kl": 7.113814353942871e-05, "learning_rate": 9.8e-07, "loss": 0.0, - "reward": 0.057773349806666374, - "reward_std": 0.11813096515834332, - "rewards/cosine_scaled_reward": -0.05903153121471405, - "rewards/format_reward": 0.4583333432674408, + "reward": 0.06258401460945606, + "reward_std": 0.12052545137703419, + "rewards/cosine_scaled_reward": -0.0762196818832308, + "rewards/format_reward": 0.5208333376795053, "step": 49 }, { - "completion_length": 2420.9166870117188, - "epoch": 0.02857142857142857, - "grad_norm": 0.29158666729927063, - "kl": 8.463859558105469e-05, + "completion_length": 2923.770866394043, + "epoch": 0.05714285714285714, + "grad_norm": 0.165934756398201, + "kl": 5.564838647842407e-05, "learning_rate": 1e-06, "loss": 0.0, - "reward": 0.08006853051483631, - "reward_std": 0.11879745870828629, - "rewards/cosine_scaled_reward": 0.002561185508966446, - "rewards/format_reward": 0.4583333395421505, + "reward": 0.02972496929578483, + "reward_std": 0.1184019073843956, + "rewards/cosine_scaled_reward": -0.07874834412359633, + "rewards/format_reward": 0.3333333358168602, "step": 50 }, { - "completion_length": 2958.6666870117188, - "epoch": 0.029142857142857144, - "grad_norm": 0.21123450994491577, - "kl": 4.0978193283081055e-08, + "completion_length": 2164.1250076293945, + "epoch": 0.05828571428571429, + "grad_norm": 0.28015246987342834, + "kl": 0.00023446721024811268, "learning_rate": 9.999890338174275e-07, "loss": 0.0, - "reward": 0.08579499274492264, - "reward_std": 0.08441803604364395, - "rewards/cosine_scaled_reward": 0.007927987724542618, - "rewards/format_reward": 0.5, + "reward": 0.06518604746088386, + "reward_std": 0.08450535265728831, + "rewards/cosine_scaled_reward": -0.08784076571464539, + "rewards/format_reward": 0.5625000074505806, "step": 51 }, { - "completion_length": 2888.750030517578, - "epoch": 0.029714285714285714, - "grad_norm": 0.2578684687614441, - "kl": 3.127753734588623e-05, + "completion_length": 2815.583396911621, + "epoch": 0.05942857142857143, + "grad_norm": 0.22428154945373535, + "kl": 0.00012213829904794693, "learning_rate": 9.999561358041868e-07, "loss": 0.0, - "reward": 0.04564014449715614, - "reward_std": 0.11141092889010906, - "rewards/cosine_scaled_reward": -0.07328417152166367, - "rewards/format_reward": 0.4166666865348816, + "reward": 0.08840094119659625, + "reward_std": 0.17651066416874528, + "rewards/cosine_scaled_reward": 0.02174525521695614, + "rewards/format_reward": 0.47916667722165585, "step": 52 }, { - "completion_length": 3506.5000610351562, - "epoch": 0.030285714285714287, - "grad_norm": 0.19908225536346436, - "kl": 4.537403583526611e-06, + "completion_length": 2765.291717529297, + "epoch": 0.060571428571428575, + "grad_norm": 0.20454645156860352, + "kl": 0.00013817846775054932, "learning_rate": 9.999013075636804e-07, "loss": 0.0, - "reward": 0.007065474521368742, - "reward_std": 0.13948188349604607, - "rewards/cosine_scaled_reward": -0.12396979331970215, - "rewards/format_reward": 0.2916666679084301, + "reward": 0.13577738002641127, + "reward_std": 0.15497551951557398, + "rewards/cosine_scaled_reward": 0.10726968757808208, + "rewards/format_reward": 0.5833333376795053, "step": 53 }, { - "completion_length": 2591.3750610351562, - "epoch": 0.030857142857142857, - "grad_norm": 0.3687554895877838, - "kl": 7.659196853637695e-05, + "completion_length": 2602.104217529297, + "epoch": 0.061714285714285715, + "grad_norm": 0.17332378029823303, + "kl": 6.633996963500977e-05, "learning_rate": 9.998245517681593e-07, "loss": 0.0, - "reward": 0.03492234647274017, - "reward_std": 0.16333030443638563, - "rewards/cosine_scaled_reward": -0.10544288344681263, - "rewards/format_reward": 0.4166666716337204, + "reward": 0.17019405495375395, + "reward_std": 0.14906789222732186, + "rewards/cosine_scaled_reward": 0.18696625716984272, + "rewards/format_reward": 0.6250000167638063, "step": 54 }, { - "completion_length": 2961.9583740234375, - "epoch": 0.03142857142857143, - "grad_norm": 0.24653460085391998, - "kl": 4.8294663429260254e-05, + "completion_length": 2931.2708587646484, + "epoch": 0.06285714285714286, + "grad_norm": 0.1711331158876419, + "kl": 8.093938231468201e-05, "learning_rate": 9.997258721585931e-07, "loss": 0.0, - "reward": 0.06728992238640785, - "reward_std": 0.12527610547840595, - "rewards/cosine_scaled_reward": -0.00823098886758089, - "rewards/format_reward": 0.4166666865348816, + "reward": 0.06622584909200668, + "reward_std": 0.12079534726217389, + "rewards/cosine_scaled_reward": -0.012890823185443878, + "rewards/format_reward": 0.41666666977107525, "step": 55 }, { - "completion_length": 2705.375030517578, - "epoch": 0.032, - "grad_norm": 0.34186646342277527, - "kl": 0.0003177367616444826, + "completion_length": 2785.833396911621, + "epoch": 0.064, + "grad_norm": 0.18859043717384338, + "kl": 3.4786760807037354e-05, "learning_rate": 9.996052735444862e-07, "loss": 0.0, - "reward": 0.08239484508521855, - "reward_std": 0.1340354452840984, - "rewards/cosine_scaled_reward": 0.03559903847053647, - "rewards/format_reward": 0.4166666865348816, + "reward": 0.058590125758200884, + "reward_std": 0.0908779576420784, + "rewards/cosine_scaled_reward": -0.045839957892894745, + "rewards/format_reward": 0.43750001303851604, "step": 56 }, { - "completion_length": 3360.8333740234375, - "epoch": 0.03257142857142857, - "grad_norm": 0.2354213446378708, - "kl": 0.00018787384033203125, + "completion_length": 3269.0208740234375, + "epoch": 0.06514285714285714, + "grad_norm": 0.12594451010227203, + "kl": 1.267390325665474e-05, "learning_rate": 9.994627618036452e-07, "loss": 0.0, - "reward": -0.036895571276545525, - "reward_std": 0.09581980761140585, - "rewards/cosine_scaled_reward": -0.19129813089966774, - "rewards/format_reward": 0.1666666716337204, + "reward": 0.023180216550827026, + "reward_std": 0.12568830093368888, + "rewards/cosine_scaled_reward": -0.09830697299912572, + "rewards/format_reward": 0.3333333358168602, "step": 57 }, { - "completion_length": 3243.8333740234375, - "epoch": 0.03314285714285714, - "grad_norm": 0.3167147636413574, - "kl": 0.0002677738666534424, + "completion_length": 2111.645866394043, + "epoch": 0.06628571428571428, + "grad_norm": 0.2272089719772339, + "kl": 0.0004666820168495178, "learning_rate": 9.992983438818915e-07, "loss": 0.0, - "reward": -0.001234954223036766, - "reward_std": 0.14268147014081478, - "rewards/cosine_scaled_reward": -0.1278473660349846, - "rewards/format_reward": 0.2500000037252903, + "reward": 0.1490978323854506, + "reward_std": 0.143410362303257, + "rewards/cosine_scaled_reward": 0.06337156053632498, + "rewards/format_reward": 0.7500000149011612, "step": 58 }, { - "completion_length": 2542.333465576172, - "epoch": 0.03371428571428572, - "grad_norm": 0.26090723276138306, - "kl": 0.0002754628658294678, + "completion_length": 2783.1250228881836, + "epoch": 0.06742857142857143, + "grad_norm": 0.17396898567676544, + "kl": 2.6125460863113403e-05, "learning_rate": 9.991120277927223e-07, "loss": 0.0, - "reward": 0.0860880445688963, - "reward_std": 0.20923775807023048, - "rewards/cosine_scaled_reward": -0.07773629995062947, - "rewards/format_reward": 0.6666666716337204, + "reward": 0.01869871746748686, + "reward_std": 0.12852068059146404, + "rewards/cosine_scaled_reward": -0.1118423049338162, + "rewards/format_reward": 0.3333333358168602, "step": 59 }, { - "completion_length": 3372.6250610351562, - "epoch": 0.03428571428571429, - "grad_norm": 0.2036096453666687, - "kl": 0.00041681528091430664, + "completion_length": 2860.500045776367, + "epoch": 0.06857142857142857, + "grad_norm": 0.1613863706588745, + "kl": 6.277300417423248e-05, "learning_rate": 9.989038226169207e-07, "loss": 0.0, - "reward": 0.041515701450407505, - "reward_std": 0.18584254710003734, - "rewards/cosine_scaled_reward": -0.04242793470621109, - "rewards/format_reward": 0.3333333395421505, + "reward": 0.036287183640524745, + "reward_std": 0.12081348802894354, + "rewards/cosine_scaled_reward": -0.10168062068987638, + "rewards/format_reward": 0.41666667349636555, "step": 60 }, { - "completion_length": 3389.8333740234375, - "epoch": 0.03485714285714286, - "grad_norm": 0.21293561160564423, - "kl": 0.00014474987983703613, + "completion_length": 2981.479217529297, + "epoch": 0.06971428571428571, + "grad_norm": 0.16875192523002625, + "kl": 0.00013221928384155035, "learning_rate": 9.98673738502114e-07, "loss": 0.0, - "reward": 0.010577604174613953, - "reward_std": 0.11215718183666468, - "rewards/cosine_scaled_reward": -0.051535483449697495, - "rewards/format_reward": 0.1666666679084301, + "reward": 0.050741570768877864, + "reward_std": 0.14027578197419643, + "rewards/cosine_scaled_reward": -0.10068160435184836, + "rewards/format_reward": 0.5000000055879354, "step": 61 }, { - "completion_length": 2807.416717529297, - "epoch": 0.03542857142857143, - "grad_norm": 0.29103437066078186, - "kl": 0.00022557377815246582, + "completion_length": 2615.1042251586914, + "epoch": 0.07085714285714285, + "grad_norm": 0.19548830389976501, + "kl": 0.0005001500248908997, "learning_rate": 9.98421786662277e-07, "loss": 0.0, - "reward": 0.028865497559309006, - "reward_std": 0.14694130700081587, - "rewards/cosine_scaled_reward": -0.10447319713421166, - "rewards/format_reward": 0.3750000037252903, + "reward": 0.09628129447810352, + "reward_std": 0.09381117532029748, + "rewards/cosine_scaled_reward": -0.006604377180337906, + "rewards/format_reward": 0.583333333954215, "step": 62 }, { - "completion_length": 3206.9584350585938, - "epoch": 0.036, - "grad_norm": 0.22266943752765656, - "kl": 0.00011670589447021484, + "completion_length": 2180.791763305664, + "epoch": 0.072, + "grad_norm": 0.2137078046798706, + "kl": 0.0017590373754501343, "learning_rate": 9.981479793771866e-07, - "loss": 0.0, - "reward": 0.11690415907651186, - "reward_std": 0.12782080098986626, - "rewards/cosine_scaled_reward": 0.13852720707654953, - "rewards/format_reward": 0.4166666679084301, + "loss": 0.0001, + "reward": 0.18858059402555227, + "reward_std": 0.1375174829736352, + "rewards/cosine_scaled_reward": 0.18239261582493782, + "rewards/format_reward": 0.7500000074505806, "step": 63 }, { - "completion_length": 3269.8333740234375, - "epoch": 0.036571428571428574, - "grad_norm": 0.245357483625412, - "kl": 0.0009914040565490723, + "completion_length": 2905.479217529297, + "epoch": 0.07314285714285715, + "grad_norm": 0.17932051420211792, + "kl": 0.0002357116900384426, "learning_rate": 9.97852329991824e-07, "loss": 0.0, - "reward": -0.03344050049781799, - "reward_std": 0.10190400667488575, - "rewards/cosine_scaled_reward": -0.20260387379676104, - "rewards/format_reward": 0.2083333395421505, + "reward": 0.041001017816597596, + "reward_std": 0.1474486207589507, + "rewards/cosine_scaled_reward": -0.05582154542207718, + "rewards/format_reward": 0.35416667722165585, "step": 64 }, { - "completion_length": 3419.25, - "epoch": 0.037142857142857144, - "grad_norm": 0.2120564877986908, - "kl": 0.0005999132990837097, + "completion_length": 2707.2917137145996, + "epoch": 0.07428571428571429, + "grad_norm": 0.18318000435829163, + "kl": 0.0001243998558493331, "learning_rate": 9.975348529157229e-07, "loss": 0.0, - "reward": -0.07127432338893414, - "reward_std": 0.09128645434975624, - "rewards/cosine_scaled_reward": -0.2945564016699791, - "rewards/format_reward": 0.1666666716337204, + "reward": 0.05281507736071944, + "reward_std": 0.10035080322995782, + "rewards/cosine_scaled_reward": -0.08306887093931437, + "rewards/format_reward": 0.47916666977107525, "step": 65 }, { - "completion_length": 3205.2916870117188, - "epoch": 0.037714285714285714, - "grad_norm": 0.21348921954631805, - "kl": 0.00023609399795532227, + "completion_length": 2048.8333435058594, + "epoch": 0.07542857142857143, + "grad_norm": 0.2723003029823303, + "kl": 0.0007075890898704529, "learning_rate": 9.971955636222684e-07, "loss": 0.0, - "reward": 0.07358985611062963, - "reward_std": 0.19247190468013287, - "rewards/cosine_scaled_reward": 0.029258128255605698, - "rewards/format_reward": 0.3750000149011612, + "reward": 0.08446738217025995, + "reward_std": 0.12608648044988513, + "rewards/cosine_scaled_reward": -0.031810659915208817, + "rewards/format_reward": 0.5625000018626451, "step": 66 }, { - "completion_length": 1525.6250610351562, - "epoch": 0.038285714285714284, - "grad_norm": 0.317193865776062, - "kl": 0.0008597373962402344, + "completion_length": 3272.250015258789, + "epoch": 0.07657142857142857, + "grad_norm": 0.14235161244869232, + "kl": 0.0003021508455276489, "learning_rate": 9.968344786479415e-07, "loss": 0.0, - "reward": 0.3304795026779175, - "reward_std": 0.2038802020251751, - "rewards/cosine_scaled_reward": 0.49172529578208923, - "rewards/format_reward": 0.9583333432674408, + "reward": -0.09321667347103357, + "reward_std": 0.06564409867860377, + "rewards/cosine_scaled_reward": -0.358583465218544, + "rewards/format_reward": 0.16666667349636555, "step": 67 }, { - "completion_length": 3565.4166870117188, - "epoch": 0.038857142857142854, - "grad_norm": 0.23730017244815826, - "kl": 0.00024116039276123047, + "completion_length": 1971.250015258789, + "epoch": 0.07771428571428571, + "grad_norm": 0.25673460960388184, + "kl": 0.001732461154460907, "learning_rate": 9.964516155915151e-07, - "loss": 0.0, - "reward": -0.025502284057438374, - "reward_std": 0.1045564329251647, - "rewards/cosine_scaled_reward": -0.11634521931409836, - "rewards/format_reward": 0.0833333358168602, + "loss": 0.0001, + "reward": 0.10094248503446579, + "reward_std": 0.14019617764279246, + "rewards/cosine_scaled_reward": -0.018839816562831402, + "rewards/format_reward": 0.6250000074505806, "step": 68 }, { - "completion_length": 2520.3334197998047, - "epoch": 0.03942857142857143, - "grad_norm": 0.31635159254074097, - "kl": 0.0004544258117675781, + "completion_length": 2486.1666870117188, + "epoch": 0.07885714285714286, + "grad_norm": 0.24227525293827057, + "kl": 0.0012736618518829346, "learning_rate": 9.960469931131936e-07, - "loss": 0.0, - "reward": 0.1283328365534544, - "reward_std": 0.19457278959453106, - "rewards/cosine_scaled_reward": 0.1481811236590147, - "rewards/format_reward": 0.4583333395421505, + "loss": 0.0001, + "reward": 0.002093482413329184, + "reward_std": 0.1131673906929791, + "rewards/cosine_scaled_reward": -0.22560235299170017, + "rewards/format_reward": 0.45833334513008595, "step": 69 }, { - "completion_length": 3340.5416870117188, - "epoch": 0.04, - "grad_norm": 0.24175065755844116, - "kl": 0.0011235177516937256, + "completion_length": 3097.7708740234375, + "epoch": 0.08, + "grad_norm": 0.16386379301548004, + "kl": 0.0010862918570637703, "learning_rate": 9.956206309337066e-07, "loss": 0.0, - "reward": -0.022892503649927676, - "reward_std": 0.16851711831986904, - "rewards/cosine_scaled_reward": -0.17294882237911224, - "rewards/format_reward": 0.2083333358168602, + "reward": 0.003557512885890901, + "reward_std": 0.07532872771844268, + "rewards/cosine_scaled_reward": -0.1663934402167797, + "rewards/format_reward": 0.35416666977107525, "step": 70 }, { - "completion_length": 3584.0, - "epoch": 0.04057142857142857, - "grad_norm": 0.2198539823293686, - "kl": 0.00041710957884788513, + "completion_length": 2676.9166946411133, + "epoch": 0.08114285714285714, + "grad_norm": 0.3169231712818146, + "kl": 0.0008103922009468079, "learning_rate": 9.951725498333448e-07, "loss": 0.0, - "reward": -0.09060919843614101, - "reward_std": 0.07114507164806128, - "rewards/cosine_scaled_reward": -0.28651032224297523, - "rewards/format_reward": 0.0416666679084301, + "reward": 0.057090925984084606, + "reward_std": 0.12368696788325906, + "rewards/cosine_scaled_reward": -0.009727515280246735, + "rewards/format_reward": 0.35416666977107525, "step": 71 }, { - "completion_length": 3307.7916870117188, - "epoch": 0.04114285714285714, - "grad_norm": 0.2961927354335785, - "kl": 0.0011830031871795654, + "completion_length": 2502.604202270508, + "epoch": 0.08228571428571428, + "grad_norm": 0.22013245522975922, + "kl": 0.0007383376359939575, "learning_rate": 9.947027716509488e-07, "loss": 0.0, - "reward": -0.004574490711092949, - "reward_std": 0.13881276827305555, - "rewards/cosine_scaled_reward": -0.20059602707624435, - "rewards/format_reward": 0.3750000074505806, + "reward": 0.034887210465967655, + "reward_std": 0.1007420509122312, + "rewards/cosine_scaled_reward": -0.14844494126737118, + "rewards/format_reward": 0.5000000149011612, "step": 72 }, { - "completion_length": 2856.6666870117188, - "epoch": 0.04171428571428572, - "grad_norm": 0.283258855342865, - "kl": 0.0008726119995117188, + "completion_length": 3457.125, + "epoch": 0.08342857142857144, + "grad_norm": 0.14216576516628265, + "kl": 0.00022399425506591797, "learning_rate": 9.942113192828444e-07, "loss": 0.0, - "reward": -0.008917616680264473, - "reward_std": 0.08053276687860489, - "rewards/cosine_scaled_reward": -0.23503995686769485, - "rewards/format_reward": 0.4166666716337204, + "reward": 0.0063769330736249685, + "reward_std": 0.0942028573481366, + "rewards/cosine_scaled_reward": -0.07475030946079642, + "rewards/format_reward": 0.1875000074505806, "step": 73 }, { - "completion_length": 3495.875, - "epoch": 0.04228571428571429, - "grad_norm": 0.2070513665676117, - "kl": 6.085634231567383e-05, + "completion_length": 3151.3125610351562, + "epoch": 0.08457142857142858, + "grad_norm": 0.16055600345134735, + "kl": 0.0008836947381496429, "learning_rate": 9.93698216681727e-07, "loss": 0.0, - "reward": -0.07934569753706455, - "reward_std": 0.07326019834727049, - "rewards/cosine_scaled_reward": -0.2762797996401787, - "rewards/format_reward": 0.0833333358168602, + "reward": 0.08290100377053022, + "reward_std": 0.16621626261621714, + "rewards/cosine_scaled_reward": 0.06829957757145166, + "rewards/format_reward": 0.35416666977107525, "step": 74 }, { - "completion_length": 3584.0, - "epoch": 0.04285714285714286, - "grad_norm": 0.2158546894788742, - "kl": 7.456541061401367e-05, + "completion_length": 2834.812530517578, + "epoch": 0.08571428571428572, + "grad_norm": 0.17003442347049713, + "kl": 0.0006369650363922119, "learning_rate": 9.931634888554935e-07, "loss": 0.0, - "reward": -0.09801437985152006, - "reward_std": 0.053719223476946354, - "rewards/cosine_scaled_reward": -0.2883354425430298, - "rewards/format_reward": 0.0, + "reward": 0.05851969541981816, + "reward_std": 0.09551909612491727, + "rewards/cosine_scaled_reward": -0.021192173473536968, + "rewards/format_reward": 0.37500000186264515, "step": 75 }, { - "completion_length": 3284.7916870117188, - "epoch": 0.04342857142857143, - "grad_norm": 0.21909596025943756, - "kl": 0.0005540847778320312, + "completion_length": 2987.1458435058594, + "epoch": 0.08685714285714285, + "grad_norm": 0.17524327337741852, + "kl": 0.0001463182270526886, "learning_rate": 9.926071618660237e-07, "loss": 0.0, - "reward": -0.014737647026777267, - "reward_std": 0.11156850960105658, - "rewards/cosine_scaled_reward": -0.12538111954927444, - "rewards/format_reward": 0.1666666716337204, + "reward": -0.009090241976082325, + "reward_std": 0.06974720861762762, + "rewards/cosine_scaled_reward": -0.21435680365539156, + "rewards/format_reward": 0.3750000037252903, "step": 76 }, { - "completion_length": 2972.6666870117188, - "epoch": 0.044, - "grad_norm": 0.25531890988349915, - "kl": 0.0018314719200134277, + "completion_length": 3154.2708740234375, + "epoch": 0.088, + "grad_norm": 0.1655508130788803, + "kl": 0.0002579297870397568, "learning_rate": 9.9202926282791e-07, - "loss": 0.0001, - "reward": 0.010560308117419481, - "reward_std": 0.09020634181797504, - "rewards/cosine_scaled_reward": -0.24031861126422882, - "rewards/format_reward": 0.5416666865348816, + "loss": 0.0, + "reward": -0.01178191090002656, + "reward_std": 0.10129856411367655, + "rewards/cosine_scaled_reward": -0.1495479578152299, + "rewards/format_reward": 0.2291666753590107, "step": 77 }, { - "completion_length": 2636.500045776367, - "epoch": 0.044571428571428574, - "grad_norm": 0.24530038237571716, - "kl": 0.00012546777725219727, + "completion_length": 3120.6041870117188, + "epoch": 0.08914285714285715, + "grad_norm": 0.1709078699350357, + "kl": 0.0009910427033901215, "learning_rate": 9.91429819907136e-07, "loss": 0.0, - "reward": 0.13959729950875044, - "reward_std": 0.08109728526324034, - "rewards/cosine_scaled_reward": 0.18548674881458282, - "rewards/format_reward": 0.4583333432674408, + "reward": 0.03289509087335318, + "reward_std": 0.09486977197229862, + "rewards/cosine_scaled_reward": -0.06091844476759434, + "rewards/format_reward": 0.3125000074505806, "step": 78 }, { - "completion_length": 2578.7083435058594, - "epoch": 0.045142857142857144, - "grad_norm": 0.2788633406162262, - "kl": 0.0013951659202575684, + "completion_length": 2322.7708702087402, + "epoch": 0.09028571428571429, + "grad_norm": 0.2289884388446808, + "kl": 0.001061469316482544, "learning_rate": 9.908088623197048e-07, - "loss": 0.0001, - "reward": 0.07973003596998751, - "reward_std": 0.07567325606942177, - "rewards/cosine_scaled_reward": 0.023972127586603165, - "rewards/format_reward": 0.4166666716337204, + "loss": 0.0, + "reward": 0.08358410373330116, + "reward_std": 0.09495366807095706, + "rewards/cosine_scaled_reward": -0.00746832974255085, + "rewards/format_reward": 0.5000000055879354, "step": 79 }, { - "completion_length": 2795.791717529297, - "epoch": 0.045714285714285714, - "grad_norm": 0.32540103793144226, - "kl": 0.003176093101501465, + "completion_length": 3293.2083740234375, + "epoch": 0.09142857142857143, + "grad_norm": 0.18912553787231445, + "kl": 0.0006625503301620483, "learning_rate": 9.901664203302124e-07, - "loss": 0.0001, - "reward": 0.016130131669342518, - "reward_std": 0.09331217408180237, - "rewards/cosine_scaled_reward": -0.24449174664914608, - "rewards/format_reward": 0.5833333432674408, + "loss": 0.0, + "reward": -0.006389252142980695, + "reward_std": 0.13851621747016907, + "rewards/cosine_scaled_reward": -0.1551999393850565, + "rewards/format_reward": 0.2708333358168602, "step": 80 }, { - "completion_length": 2876.5833740234375, - "epoch": 0.046285714285714284, - "grad_norm": 0.23254291713237762, - "kl": 0.00019277632236480713, + "completion_length": 3064.145866394043, + "epoch": 0.09257142857142857, + "grad_norm": 0.22587524354457855, + "kl": 0.002162039279937744, "learning_rate": 9.895025252503755e-07, - "loss": 0.0, - "reward": 0.010751697234809399, - "reward_std": 0.14756072871387005, - "rewards/cosine_scaled_reward": -0.15722812339663506, - "rewards/format_reward": 0.3750000149011612, + "loss": 0.0001, + "reward": -0.021476033609360456, + "reward_std": 0.09152220841497183, + "rewards/cosine_scaled_reward": -0.19870448019355536, + "rewards/format_reward": 0.27083333767950535, "step": 81 }, { - "completion_length": 3112.4166870117188, - "epoch": 0.046857142857142854, - "grad_norm": 0.2425730675458908, - "kl": 0.0008943080902099609, + "completion_length": 2900.9791717529297, + "epoch": 0.09371428571428571, + "grad_norm": 0.19733589887619019, + "kl": 0.000873371958732605, "learning_rate": 9.888172094375033e-07, "loss": 0.0, - "reward": 0.10021019820123911, - "reward_std": 0.1867539957165718, - "rewards/cosine_scaled_reward": 0.044586583971977234, - "rewards/format_reward": 0.5000000149011612, + "reward": 0.03715384565293789, + "reward_std": 0.1097968677058816, + "rewards/cosine_scaled_reward": -0.05757363699376583, + "rewards/format_reward": 0.33333333395421505, "step": 82 }, { - "completion_length": 2751.5, - "epoch": 0.04742857142857143, - "grad_norm": 0.4409594237804413, - "kl": 0.0004534721374511719, + "completion_length": 2747.500045776367, + "epoch": 0.09485714285714286, + "grad_norm": 0.23122522234916687, + "kl": 0.0008766204118728638, "learning_rate": 9.881105062929221e-07, "loss": 0.0, - "reward": -0.04191046301275492, - "reward_std": 0.09563638782128692, - "rewards/cosine_scaled_reward": -0.26906227320432663, - "rewards/format_reward": 0.2916666679084301, + "reward": 0.021435680333524942, + "reward_std": 0.12779827043414116, + "rewards/cosine_scaled_reward": -0.11562954680994153, + "rewards/format_reward": 0.35416666977107525, "step": 83 }, { - "completion_length": 2676.166717529297, - "epoch": 0.048, - "grad_norm": 0.33962568640708923, - "kl": 0.00030422210693359375, + "completion_length": 3090.750045776367, + "epoch": 0.096, + "grad_norm": 0.1742715984582901, + "kl": 0.00035139918327331543, "learning_rate": 9.873824502603459e-07, "loss": 0.0, - "reward": -0.041442704387009144, - "reward_std": 0.0723197115585208, - "rewards/cosine_scaled_reward": -0.3300611712038517, - "rewards/format_reward": 0.4166666716337204, + "reward": 0.061643086373806, + "reward_std": 0.16731480974704027, + "rewards/cosine_scaled_reward": -0.016695552330929786, + "rewards/format_reward": 0.3958333432674408, "step": 84 }, { - "completion_length": 2908.416717529297, - "epoch": 0.04857142857142857, - "grad_norm": 0.31520113348960876, - "kl": 0.001495361328125, + "completion_length": 2862.5000762939453, + "epoch": 0.09714285714285714, + "grad_norm": 0.15780915319919586, + "kl": 0.00032967329025268555, "learning_rate": 9.866330768241983e-07, - "loss": 0.0001, - "reward": 0.049455616157501936, - "reward_std": 0.06858202628791332, - "rewards/cosine_scaled_reward": 0.0031317323446273804, - "rewards/format_reward": 0.2916666679084301, + "loss": 0.0, + "reward": 0.059395642252638936, + "reward_std": 0.18405951745808125, + "rewards/cosine_scaled_reward": -0.07413570675998926, + "rewards/format_reward": 0.5000000093132257, "step": 85 }, { - "completion_length": 2992.5001220703125, - "epoch": 0.04914285714285714, - "grad_norm": 0.2631990313529968, - "kl": 0.0005331635475158691, + "completion_length": 2822.458366394043, + "epoch": 0.09828571428571428, + "grad_norm": 0.18948958814144135, + "kl": 0.0012461543083190918, "learning_rate": 9.85862422507884e-07, "loss": 0.0, - "reward": -0.011616203933954239, - "reward_std": 0.14434752985835075, - "rewards/cosine_scaled_reward": -0.20175681728869677, - "rewards/format_reward": 0.3333333358168602, + "reward": 0.058563592843711376, + "reward_std": 0.13514422718435526, + "rewards/cosine_scaled_reward": -0.023547479882836342, + "rewards/format_reward": 0.3958333432674408, "step": 86 }, { - "completion_length": 2167.875015258789, - "epoch": 0.04971428571428571, - "grad_norm": 0.33546218276023865, - "kl": 0.0012390613555908203, + "completion_length": 2582.0208892822266, + "epoch": 0.09942857142857142, + "grad_norm": 0.24197638034820557, + "kl": 0.001571571920067072, "learning_rate": 9.850705248720068e-07, - "loss": 0.0, - "reward": 0.11383942142128944, - "reward_std": 0.1343392226845026, - "rewards/cosine_scaled_reward": 0.06459504179656506, + "loss": 0.0001, + "reward": 0.06321019981987774, + "reward_std": 0.1301463134586811, + "rewards/cosine_scaled_reward": -0.0854906840249896, "rewards/format_reward": 0.5416666679084301, "step": 87 }, { - "completion_length": 3190.541748046875, - "epoch": 0.05028571428571429, - "grad_norm": 0.21457454562187195, - "kl": 0.000608295202255249, + "completion_length": 2717.666717529297, + "epoch": 0.10057142857142858, + "grad_norm": 0.2206323742866516, + "kl": 0.0015416741371154785, "learning_rate": 9.8425742251254e-07, - "loss": 0.0, - "reward": 0.006447264924645424, - "reward_std": 0.13064131513237953, - "rewards/cosine_scaled_reward": -0.10738314688205719, - "rewards/format_reward": 0.2500000074505806, + "loss": 0.0001, + "reward": 0.10002893407363445, + "reward_std": 0.15120992343872786, + "rewards/cosine_scaled_reward": 0.02375661302357912, + "rewards/format_reward": 0.5416666902601719, "step": 88 }, { - "completion_length": 3171.2916870117188, - "epoch": 0.05085714285714286, - "grad_norm": 0.2724606990814209, - "kl": 0.005704402923583984, + "completion_length": 3077.7708587646484, + "epoch": 0.10171428571428572, + "grad_norm": 0.18466134369373322, + "kl": 0.0015412569046020508, "learning_rate": 9.83423155058946e-07, - "loss": 0.0002, - "reward": 0.10516921058297157, - "reward_std": 0.13658037036657333, - "rewards/cosine_scaled_reward": 0.1440199911594391, - "rewards/format_reward": 0.3333333432674408, + "loss": 0.0001, + "reward": 0.019450924504781142, + "reward_std": 0.1520949569530785, + "rewards/cosine_scaled_reward": -0.10057051916373894, + "rewards/format_reward": 0.31250000931322575, "step": 89 }, { - "completion_length": 3537.1666870117188, - "epoch": 0.05142857142857143, - "grad_norm": 0.23336061835289001, - "kl": 0.00017547607421875, + "completion_length": 2275.2708587646484, + "epoch": 0.10285714285714286, + "grad_norm": 0.317700058221817, + "kl": 0.0030652284622192383, "learning_rate": 9.825677631722435e-07, - "loss": 0.0, - "reward": -0.038981794379651546, - "reward_std": 0.10609682742506266, - "rewards/cosine_scaled_reward": -0.17710164934396744, - "rewards/format_reward": 0.1250000037252903, + "loss": 0.0001, + "reward": 0.020127289928495884, + "reward_std": 0.09609310049563646, + "rewards/cosine_scaled_reward": -0.2033098302781582, + "rewards/format_reward": 0.5208333395421505, "step": 90 }, { - "completion_length": 3584.0, - "epoch": 0.052, - "grad_norm": 0.23519191145896912, - "kl": 0.0010558366775512695, + "completion_length": 3104.2291870117188, + "epoch": 0.104, + "grad_norm": 0.16374754905700684, + "kl": 0.0012220889329910278, "learning_rate": 9.816912885430258e-07, "loss": 0.0, - "reward": -0.08323674835264683, - "reward_std": 0.04371932055801153, - "rewards/cosine_scaled_reward": -0.24652044475078583, - "rewards/format_reward": 0.0, + "reward": 0.02835351601243019, + "reward_std": 0.0958831796888262, + "rewards/cosine_scaled_reward": -0.08190420269966125, + "rewards/format_reward": 0.3333333432674408, "step": 91 }, { - "completion_length": 2922.7083435058594, - "epoch": 0.052571428571428575, - "grad_norm": 0.28320083022117615, - "kl": 0.0011293888092041016, + "completion_length": 2564.9583892822266, + "epoch": 0.10514285714285715, + "grad_norm": 0.23695100843906403, + "kl": 0.0031346678733825684, "learning_rate": 9.807937738894303e-07, - "loss": 0.0, - "reward": -0.025928404182195663, - "reward_std": 0.07635276950895786, - "rewards/cosine_scaled_reward": -0.20148934796452522, - "rewards/format_reward": 0.25, + "loss": 0.0001, + "reward": 0.031697872560471296, + "reward_std": 0.1256707413122058, + "rewards/cosine_scaled_reward": -0.1565206847153604, + "rewards/format_reward": 0.5000000055879354, "step": 92 }, { - "completion_length": 3191.3750610351562, - "epoch": 0.053142857142857144, - "grad_norm": 0.22535181045532227, - "kl": 0.00041413307189941406, + "completion_length": 3429.2916870117188, + "epoch": 0.10628571428571429, + "grad_norm": 0.18266721069812775, + "kl": 0.0016658008098602295, "learning_rate": 9.798752629550546e-07, - "loss": 0.0, - "reward": 0.04429396986961365, - "reward_std": 0.13803782314062119, - "rewards/cosine_scaled_reward": -0.014629889279603958, - "rewards/format_reward": 0.2916666716337204, + "loss": 0.0001, + "reward": -0.06380185973830521, + "reward_std": 0.07419165363535285, + "rewards/cosine_scaled_reward": -0.21879931539297104, + "rewards/format_reward": 0.06250000186264515, "step": 93 }, { - "completion_length": 2802.4583740234375, - "epoch": 0.053714285714285714, - "grad_norm": 0.31209224462509155, - "kl": 0.000571906566619873, + "completion_length": 2926.041717529297, + "epoch": 0.10742857142857143, + "grad_norm": 0.183439701795578, + "kl": 0.0018378198146820068, "learning_rate": 9.78935800506826e-07, - "loss": 0.0, - "reward": 0.110066844150424, - "reward_std": 0.17873639985919, - "rewards/cosine_scaled_reward": 0.05385873094201088, - "rewards/format_reward": 0.5416666716337204, + "loss": 0.0001, + "reward": 0.02929869778745342, + "reward_std": 0.0936040470842272, + "rewards/cosine_scaled_reward": -0.0937141003087163, + "rewards/format_reward": 0.3541666716337204, "step": 94 }, { - "completion_length": 2327.0416870117188, - "epoch": 0.054285714285714284, - "grad_norm": 0.33819082379341125, - "kl": 0.004376888275146484, + "completion_length": 3466.7708740234375, + "epoch": 0.10857142857142857, + "grad_norm": 0.1366991549730301, + "kl": 0.0007553547620773315, "learning_rate": 9.779754323328192e-07, - "loss": 0.0002, - "reward": 0.1542066391557455, - "reward_std": 0.16038192249834538, - "rewards/cosine_scaled_reward": 0.1625713836401701, - "rewards/format_reward": 0.5833333358168602, + "loss": 0.0, + "reward": -0.011318721110001206, + "reward_std": 0.11944798147305846, + "rewards/cosine_scaled_reward": -0.1375442687422037, + "rewards/format_reward": 0.20833333767950535, "step": 95 }, { - "completion_length": 3068.291717529297, - "epoch": 0.054857142857142854, - "grad_norm": 0.2672428488731384, - "kl": 0.0011248588562011719, + "completion_length": 2708.729202270508, + "epoch": 0.10971428571428571, + "grad_norm": 0.17744414508342743, + "kl": 0.002089708112180233, "learning_rate": 9.769942052400235e-07, - "loss": 0.0, - "reward": -0.024494814686477184, - "reward_std": 0.1325745964422822, - "rewards/cosine_scaled_reward": -0.21936501190066338, - "rewards/format_reward": 0.291666679084301, + "loss": 0.0001, + "reward": 0.08605838101357222, + "reward_std": 0.11997871845960617, + "rewards/cosine_scaled_reward": 0.024741460452787578, + "rewards/format_reward": 0.45833334140479565, "step": 96 }, { - "completion_length": 2348.000045776367, - "epoch": 0.05542857142857143, - "grad_norm": 0.41059279441833496, - "kl": 0.007290244102478027, + "completion_length": 3173.854248046875, + "epoch": 0.11085714285714286, + "grad_norm": 0.18968307971954346, + "kl": 0.0014801472425460815, "learning_rate": 9.759921670520634e-07, - "loss": 0.0003, - "reward": 0.06665459275245667, - "reward_std": 0.14610515628010035, - "rewards/cosine_scaled_reward": -0.09449225105345249, - "rewards/format_reward": 0.5833333432674408, + "loss": 0.0001, + "reward": 0.031303239753469825, + "reward_std": 0.11792595777660608, + "rewards/cosine_scaled_reward": -0.06253299303352833, + "rewards/format_reward": 0.31250000931322575, "step": 97 }, { - "completion_length": 2388.041717529297, - "epoch": 0.056, - "grad_norm": 0.2683107554912567, - "kl": 0.0004184246063232422, + "completion_length": 2743.937545776367, + "epoch": 0.112, + "grad_norm": 0.17238333821296692, + "kl": 0.0005875229835510254, "learning_rate": 9.749693666068663e-07, "loss": 0.0, - "reward": 0.0730120544321835, - "reward_std": 0.11645183898508549, - "rewards/cosine_scaled_reward": -0.0543170552700758, - "rewards/format_reward": 0.5416666679084301, + "reward": 0.03780742874369025, + "reward_std": 0.09086114913225174, + "rewards/cosine_scaled_reward": -0.14788446575403214, + "rewards/format_reward": 0.5208333432674408, "step": 98 }, { - "completion_length": 2799.0416717529297, - "epoch": 0.05657142857142857, - "grad_norm": 0.24571600556373596, - "kl": 0.0010900497436523438, + "completion_length": 2848.7083587646484, + "epoch": 0.11314285714285714, + "grad_norm": 0.24863041937351227, + "kl": 0.0012496709823608398, "learning_rate": 9.739258537542835e-07, - "loss": 0.0, - "reward": 0.03587005753070116, - "reward_std": 0.05272865202277899, - "rewards/cosine_scaled_reward": -0.04149058274924755, - "rewards/format_reward": 0.2916666679084301, + "loss": 0.0001, + "reward": 0.007481225358787924, + "reward_std": 0.09957948396913707, + "rewards/cosine_scaled_reward": -0.11191552877426147, + "rewards/format_reward": 0.2708333395421505, "step": 99 }, { - "completion_length": 2953.875, - "epoch": 0.05714285714285714, - "grad_norm": 0.26749980449676514, - "kl": 0.0006313323974609375, + "completion_length": 2628.6458740234375, + "epoch": 0.11428571428571428, + "grad_norm": 0.2551541328430176, + "kl": 0.007792949676513672, "learning_rate": 9.728616793536587e-07, - "loss": 0.0, - "reward": 0.10422471165657043, - "reward_std": 0.12896847166121006, - "rewards/cosine_scaled_reward": 0.09673600643873215, - "rewards/format_reward": 0.4166666716337204, + "loss": 0.0003, + "reward": 0.11142967082560062, + "reward_std": 0.1601211791858077, + "rewards/cosine_scaled_reward": 0.0880345068871975, + "rewards/format_reward": 0.4791666828095913, "step": 100 }, { - "completion_length": 2678.166717529297, - "epoch": 0.05771428571428571, - "grad_norm": 0.32678940892219543, - "kl": 0.005286693572998047, + "completion_length": 2490.229202270508, + "epoch": 0.11542857142857142, + "grad_norm": 0.21603932976722717, + "kl": 0.0017938017845153809, "learning_rate": 9.717768952713511e-07, - "loss": 0.0002, - "reward": 0.059486206620931625, - "reward_std": 0.1262753512710333, - "rewards/cosine_scaled_reward": -0.053003497421741486, - "rewards/format_reward": 0.4583333395421505, + "loss": 0.0001, + "reward": 0.05852094758301973, + "reward_std": 0.09475950035266578, + "rewards/cosine_scaled_reward": -0.0666491650044918, + "rewards/format_reward": 0.47916666977107525, "step": 101 }, { - "completion_length": 2141.0833435058594, - "epoch": 0.05828571428571429, - "grad_norm": 0.3860064744949341, - "kl": 0.0025103092193603516, + "completion_length": 2286.395866394043, + "epoch": 0.11657142857142858, + "grad_norm": 0.22003820538520813, + "kl": 0.0035685300827026367, "learning_rate": 9.706715543782064e-07, "loss": 0.0001, - "reward": 0.058345479890704155, - "reward_std": 0.09447939228266478, - "rewards/cosine_scaled_reward": -0.07782930880784988, - "rewards/format_reward": 0.5, + "reward": 0.06951136235147715, + "reward_std": 0.11986843310296535, + "rewards/cosine_scaled_reward": -0.09702310990542173, + "rewards/format_reward": 0.6041666716337204, "step": 102 }, { - "completion_length": 2441.0833587646484, - "epoch": 0.05885714285714286, - "grad_norm": 0.4264659285545349, - "kl": 0.0046890974044799805, + "completion_length": 2706.5833854675293, + "epoch": 0.11771428571428572, + "grad_norm": 0.24211692810058594, + "kl": 0.0033190250396728516, "learning_rate": 9.695457105469804e-07, - "loss": 0.0002, - "reward": 0.19378599524497986, - "reward_std": 0.09719427116215229, - "rewards/cosine_scaled_reward": 0.2184842936694622, - "rewards/format_reward": 0.7083333432674408, + "loss": 0.0001, + "reward": 0.02106085862033069, + "reward_std": 0.13761026575230062, + "rewards/cosine_scaled_reward": -0.1569200656376779, + "rewards/format_reward": 0.43750000931322575, "step": 103 }, { - "completion_length": 3437.8750610351562, - "epoch": 0.05942857142857143, - "grad_norm": 0.21489928662776947, - "kl": 0.0004730224609375, + "completion_length": 2745.125011444092, + "epoch": 0.11885714285714286, + "grad_norm": 0.2112489640712738, + "kl": 0.0039566755294799805, "learning_rate": 9.683994186497132e-07, - "loss": 0.0, - "reward": -0.03252293914556503, - "reward_std": 0.15201162360608578, - "rewards/cosine_scaled_reward": -0.1787819191813469, - "rewards/format_reward": 0.1666666716337204, + "loss": 0.0002, + "reward": 0.013099167263135314, + "reward_std": 0.1101553007028997, + "rewards/cosine_scaled_reward": -0.14944082498550415, + "rewards/format_reward": 0.3750000037252903, "step": 104 }, { - "completion_length": 2828.5833435058594, - "epoch": 0.06, - "grad_norm": 0.28643998503685, - "kl": 0.001087963581085205, + "completion_length": 2631.812545776367, + "epoch": 0.12, + "grad_norm": 0.19800451397895813, + "kl": 0.002551555633544922, "learning_rate": 9.672327345550543e-07, - "loss": 0.0, - "reward": 0.1313789002597332, - "reward_std": 0.15932872332632542, - "rewards/cosine_scaled_reward": 0.09692827612161636, - "rewards/format_reward": 0.5833333432674408, + "loss": 0.0001, + "reward": 0.08774281479418278, + "reward_std": 0.15658214688301086, + "rewards/cosine_scaled_reward": 0.02480911184102297, + "rewards/format_reward": 0.4583333395421505, "step": 105 }, { - "completion_length": 2742.0834045410156, - "epoch": 0.060571428571428575, - "grad_norm": 0.30696070194244385, - "kl": 0.00095367431640625, + "completion_length": 2240.000026702881, + "epoch": 0.12114285714285715, + "grad_norm": 0.1994587928056717, + "kl": 0.0023127198219299316, "learning_rate": 9.66045715125541e-07, - "loss": 0.0, - "reward": 0.13310191221535206, - "reward_std": 0.09642031975090504, - "rewards/cosine_scaled_reward": 0.038958167657256126, - "rewards/format_reward": 0.7083333358168602, + "loss": 0.0001, + "reward": 0.1655233004130423, + "reward_std": 0.13684139621909708, + "rewards/cosine_scaled_reward": 0.17548487707972527, + "rewards/format_reward": 0.6250000111758709, "step": 106 }, { - "completion_length": 3108.1250610351562, - "epoch": 0.061142857142857145, - "grad_norm": 0.21947838366031647, - "kl": 0.00020295381546020508, + "completion_length": 2713.1250915527344, + "epoch": 0.12228571428571429, + "grad_norm": 0.23116803169250488, + "kl": 0.0034139156341552734, "learning_rate": 9.648384182148252e-07, - "loss": 0.0, - "reward": 0.10749095072969794, - "reward_std": 0.11216552276164293, - "rewards/cosine_scaled_reward": 0.12838193029165268, - "rewards/format_reward": 0.3750000037252903, + "loss": 0.0001, + "reward": 0.07083619991317391, + "reward_std": 0.12138741742819548, + "rewards/cosine_scaled_reward": -0.05151521973311901, + "rewards/format_reward": 0.5208333488553762, "step": 107 }, { - "completion_length": 2327.3334045410156, - "epoch": 0.061714285714285715, - "grad_norm": 0.33659592270851135, - "kl": 0.008703947067260742, + "completion_length": 2540.8125762939453, + "epoch": 0.12342857142857143, + "grad_norm": 0.20976760983467102, + "kl": 0.0018079280853271484, "learning_rate": 9.636109026648554e-07, - "loss": 0.0003, - "reward": 0.259239349514246, - "reward_std": 0.1686703823506832, - "rewards/cosine_scaled_reward": 0.3435509689152241, - "rewards/format_reward": 0.8333333432674408, + "loss": 0.0001, + "reward": 0.07085073599591851, + "reward_std": 0.15451500099152327, + "rewards/cosine_scaled_reward": -0.041776820085942745, + "rewards/format_reward": 0.5000000074505806, "step": 108 }, { - "completion_length": 3051.5416870117188, - "epoch": 0.062285714285714285, - "grad_norm": 0.2260669320821762, - "kl": 0.0009126663208007812, + "completion_length": 3043.500045776367, + "epoch": 0.12457142857142857, + "grad_norm": 0.16290034353733063, + "kl": 0.0009417533874511719, "learning_rate": 9.623632283030077e-07, "loss": 0.0, - "reward": -0.01571727846749127, - "reward_std": 0.1399233676493168, - "rewards/cosine_scaled_reward": -0.2135260934010148, - "rewards/format_reward": 0.3333333469927311, + "reward": 0.03415968408808112, + "reward_std": 0.09731752565130591, + "rewards/cosine_scaled_reward": -0.08652829378843307, + "rewards/format_reward": 0.37500000931322575, "step": 109 }, { - "completion_length": 2840.4584350585938, - "epoch": 0.06285714285714286, - "grad_norm": 0.23950429260730743, - "kl": 0.003201007843017578, + "completion_length": 2716.9583587646484, + "epoch": 0.12571428571428572, + "grad_norm": 0.202264204621315, + "kl": 0.0016897767782211304, "learning_rate": 9.610954559391704e-07, "loss": 0.0001, - "reward": 0.12386356201022863, - "reward_std": 0.1479373024776578, - "rewards/cosine_scaled_reward": 0.13685176149010658, - "rewards/format_reward": 0.4583333395421505, + "reward": 0.030952309258282185, + "reward_std": 0.13009351352229714, + "rewards/cosine_scaled_reward": -0.11608889419585466, + "rewards/format_reward": 0.41666666977107525, "step": 110 }, { - "completion_length": 3189.8750610351562, - "epoch": 0.06342857142857143, - "grad_norm": 0.2137785404920578, - "kl": 0.0004210472106933594, + "completion_length": 3076.3959045410156, + "epoch": 0.12685714285714286, + "grad_norm": 0.1983112394809723, + "kl": 0.002537250518798828, "learning_rate": 9.598076473627796e-07, - "loss": 0.0, - "reward": 0.013948600739240646, - "reward_std": 0.08112779445946217, - "rewards/cosine_scaled_reward": -0.16665012761950493, - "rewards/format_reward": 0.4166666865348816, + "loss": 0.0001, + "reward": 0.059007523115724325, + "reward_std": 0.16636842489242554, + "rewards/cosine_scaled_reward": -0.0232586320489645, + "rewards/format_reward": 0.39583334140479565, "step": 111 }, { - "completion_length": 2671.9583587646484, - "epoch": 0.064, - "grad_norm": 0.3268754482269287, - "kl": 0.0006527900695800781, + "completion_length": 3080.041717529297, + "epoch": 0.128, + "grad_norm": 0.15135328471660614, + "kl": 0.0012042894959449768, "learning_rate": 9.58499865339809e-07, "loss": 0.0, - "reward": 0.0882517546415329, - "reward_std": 0.1352424994111061, - "rewards/cosine_scaled_reward": 0.054025545716285706, - "rewards/format_reward": 0.4166666716337204, + "reward": 0.06036388734355569, + "reward_std": 0.11590251373127103, + "rewards/cosine_scaled_reward": -0.008873747196048498, + "rewards/format_reward": 0.37500000931322575, "step": 112 }, { - "completion_length": 3551.8333740234375, - "epoch": 0.06457142857142857, - "grad_norm": 0.1638534665107727, - "kl": 0.00015148520469665527, + "completion_length": 2676.9583435058594, + "epoch": 0.12914285714285714, + "grad_norm": 0.2907523810863495, + "kl": 0.0033299922943115234, "learning_rate": 9.571721736097088e-07, - "loss": 0.0, - "reward": -0.054237596690654755, - "reward_std": 0.10698490496724844, - "rewards/cosine_scaled_reward": -0.22286773473024368, - "rewards/format_reward": 0.1250000037252903, + "loss": 0.0001, + "reward": 0.021284373477101326, + "reward_std": 0.12445678655058146, + "rewards/cosine_scaled_reward": -0.12851847242563963, + "rewards/format_reward": 0.37500001303851604, "step": 113 }, { - "completion_length": 3049.791748046875, - "epoch": 0.06514285714285714, - "grad_norm": 0.19514170289039612, - "kl": 0.00044786930084228516, + "completion_length": 2599.645866394043, + "epoch": 0.13028571428571428, + "grad_norm": 0.21558967232704163, + "kl": 0.0046939849853515625, "learning_rate": 9.55824636882301e-07, - "loss": 0.0, - "reward": 0.02969010453671217, - "reward_std": 0.10824764519929886, - "rewards/cosine_scaled_reward": -0.14071331918239594, - "rewards/format_reward": 0.4583333358168602, + "loss": 0.0002, + "reward": 0.030540801119059324, + "reward_std": 0.09902108740061522, + "rewards/cosine_scaled_reward": -0.19062614813446999, + "rewards/format_reward": 0.562500013038516, "step": 114 }, { - "completion_length": 2535.3750610351562, - "epoch": 0.06571428571428571, - "grad_norm": 0.22467546164989471, - "kl": 0.0037784576416015625, + "completion_length": 2822.729179382324, + "epoch": 0.13142857142857142, + "grad_norm": 0.20229722559452057, + "kl": 0.0031037330627441406, "learning_rate": 9.54457320834625e-07, - "loss": 0.0002, - "reward": 0.06513393670320511, - "reward_std": 0.11163874063640833, - "rewards/cosine_scaled_reward": -0.07820340245962143, - "rewards/format_reward": 0.541666679084301, + "loss": 0.0001, + "reward": 0.016621847171336412, + "reward_std": 0.1235029874369502, + "rewards/cosine_scaled_reward": -0.14894464937970042, + "rewards/format_reward": 0.3958333432674408, "step": 115 }, { - "completion_length": 1770.1250457763672, - "epoch": 0.06628571428571428, - "grad_norm": 0.317340612411499, - "kl": 0.00151824951171875, + "completion_length": 3348.0833435058594, + "epoch": 0.13257142857142856, + "grad_norm": 0.17120520770549774, + "kl": 0.0019791126251220703, "learning_rate": 9.530702921077358e-07, "loss": 0.0001, - "reward": 0.15827554278075695, - "reward_std": 0.18450350128114223, - "rewards/cosine_scaled_reward": 0.06737165962113068, - "rewards/format_reward": 0.791666679084301, + "reward": -0.046343902591615915, + "reward_std": 0.09335399139672518, + "rewards/cosine_scaled_reward": -0.2216574940830469, + "rewards/format_reward": 0.16666667349636555, "step": 116 }, { - "completion_length": 3126.9583740234375, - "epoch": 0.06685714285714285, - "grad_norm": 0.24544592201709747, - "kl": 0.0006279945373535156, + "completion_length": 2949.8958740234375, + "epoch": 0.1337142857142857, + "grad_norm": 0.27388995885849, + "kl": 0.0031032562255859375, "learning_rate": 9.516636183034564e-07, - "loss": 0.0, - "reward": -0.024326413869857788, - "reward_std": 0.11831062566488981, - "rewards/cosine_scaled_reward": -0.23888603504747152, - "rewards/format_reward": 0.3333333358168602, + "loss": 0.0001, + "reward": -0.03218040708452463, + "reward_std": 0.08237923681735992, + "rewards/cosine_scaled_reward": -0.22971701715141535, + "rewards/format_reward": 0.27083334140479565, "step": 117 }, { - "completion_length": 3022.1666870117188, - "epoch": 0.06742857142857143, - "grad_norm": 0.21933986246585846, - "kl": 0.0003420114517211914, + "completion_length": 2783.3542098999023, + "epoch": 0.13485714285714287, + "grad_norm": 0.17567184567451477, + "kl": 0.001497507095336914, "learning_rate": 9.502373679810839e-07, - "loss": 0.0, - "reward": 0.042915768921375275, - "reward_std": 0.057759009301662445, - "rewards/cosine_scaled_reward": 0.0017555169761180878, - "rewards/format_reward": 0.25, + "loss": 0.0001, + "reward": 0.12685386650264263, + "reward_std": 0.13957502879202366, + "rewards/cosine_scaled_reward": 0.1258874498307705, + "rewards/format_reward": 0.5000000018626451, "step": 118 }, { - "completion_length": 2548.5834350585938, - "epoch": 0.068, - "grad_norm": 0.2473248690366745, - "kl": 0.0012044906616210938, + "completion_length": 2406.875030517578, + "epoch": 0.136, + "grad_norm": 0.2555752098560333, + "kl": 0.004542350769042969, "learning_rate": 9.487916106540465e-07, - "loss": 0.0, - "reward": 0.06746315537020564, - "reward_std": 0.14464781805872917, - "rewards/cosine_scaled_reward": -0.11485214158892632, - "rewards/format_reward": 0.6250000037252903, + "loss": 0.0002, + "reward": 0.094550846144557, + "reward_std": 0.12869207374751568, + "rewards/cosine_scaled_reward": -0.0038661109283566475, + "rewards/format_reward": 0.562500013038516, "step": 119 }, { - "completion_length": 3584.0, - "epoch": 0.06857142857142857, - "grad_norm": 0.23051899671554565, - "kl": 0.0005028247833251953, + "completion_length": 2285.6250762939453, + "epoch": 0.13714285714285715, + "grad_norm": 0.23725992441177368, + "kl": 0.002517223358154297, "learning_rate": 9.473264167865171e-07, - "loss": 0.0, - "reward": -0.09011241607367992, - "reward_std": 0.06575908605009317, - "rewards/cosine_scaled_reward": -0.26638515666127205, - "rewards/format_reward": 0.0, + "loss": 0.0001, + "reward": 0.08064356981776655, + "reward_std": 0.12271030526608229, + "rewards/cosine_scaled_reward": -0.0756605202332139, + "rewards/format_reward": 0.6250000074505806, "step": 120 }, { - "completion_length": 3125.2083740234375, - "epoch": 0.06914285714285714, - "grad_norm": 0.23945416510105133, - "kl": 0.0013582706451416016, + "completion_length": 1854.8542022705078, + "epoch": 0.1382857142857143, + "grad_norm": 0.2627831697463989, + "kl": 0.003121614456176758, "learning_rate": 9.458418577899774e-07, "loss": 0.0001, - "reward": 0.04295490635558963, - "reward_std": 0.12206923216581345, - "rewards/cosine_scaled_reward": -0.039602309465408325, - "rewards/format_reward": 0.3333333432674408, + "reward": 0.11522941256407648, + "reward_std": 0.12531911802943796, + "rewards/cosine_scaled_reward": -0.03698302572593093, + "rewards/format_reward": 0.750000013038516, "step": 121 }, { - "completion_length": 3119.7500610351562, - "epoch": 0.06971428571428571, - "grad_norm": 0.1996261477470398, - "kl": 0.0002168416976928711, + "completion_length": 2925.1458892822266, + "epoch": 0.13942857142857143, + "grad_norm": 0.18577584624290466, + "kl": 0.0021719932556152344, "learning_rate": 9.443380060197385e-07, - "loss": 0.0, - "reward": 0.0352822788991034, - "reward_std": 0.13951773941516876, - "rewards/cosine_scaled_reward": -0.14444508403539658, - "rewards/format_reward": 0.5000000149011612, + "loss": 0.0001, + "reward": 0.07248353259637952, + "reward_std": 0.15761876897886395, + "rewards/cosine_scaled_reward": 0.006229955703020096, + "rewards/format_reward": 0.4166666753590107, "step": 122 }, { - "completion_length": 2701.541748046875, - "epoch": 0.07028571428571428, - "grad_norm": 0.18627424538135529, - "kl": 0.001249760389328003, + "completion_length": 2714.7708740234375, + "epoch": 0.14057142857142857, + "grad_norm": 0.17778825759887695, + "kl": 0.002131819725036621, "learning_rate": 9.428149347714143e-07, "loss": 0.0001, - "reward": 0.16536866128444672, - "reward_std": 0.13200602121651173, - "rewards/cosine_scaled_reward": 0.11369390785694122, - "rewards/format_reward": 0.7500000223517418, + "reward": 0.04195330070797354, + "reward_std": 0.11853250442072749, + "rewards/cosine_scaled_reward": -0.11678100191056728, + "rewards/format_reward": 0.47916667349636555, "step": 123 }, { - "completion_length": 2187.625030517578, - "epoch": 0.07085714285714285, - "grad_norm": 0.9284882545471191, - "kl": 0.03720283508300781, + "completion_length": 2183.958381652832, + "epoch": 0.1417142857142857, + "grad_norm": 0.2191576361656189, + "kl": 0.005345821380615234, "learning_rate": 9.412727182773486e-07, - "loss": 0.0015, - "reward": 0.1057466440834105, - "reward_std": 0.16962256282567978, - "rewards/cosine_scaled_reward": -0.025201511569321156, - "rewards/format_reward": 0.6666666828095913, + "loss": 0.0002, + "reward": 0.0987518366164295, + "reward_std": 0.1706788558512926, + "rewards/cosine_scaled_reward": -0.011822802014648914, + "rewards/format_reward": 0.6041666753590107, "step": 124 }, { - "completion_length": 2560.8333435058594, - "epoch": 0.07142857142857142, - "grad_norm": 0.23313874006271362, - "kl": 0.0019648075103759766, + "completion_length": 2888.354202270508, + "epoch": 0.14285714285714285, + "grad_norm": 0.16155032813549042, + "kl": 0.0020842552185058594, "learning_rate": 9.397114317029974e-07, "loss": 0.0001, - "reward": 0.10835672356188297, - "reward_std": 0.09912110771983862, - "rewards/cosine_scaled_reward": 0.010563732124865055, - "rewards/format_reward": 0.625, + "reward": 0.04314285283908248, + "reward_std": 0.1018235448282212, + "rewards/cosine_scaled_reward": -0.029325059265829623, + "rewards/format_reward": 0.31250000186264515, "step": 125 }, { - "completion_length": 2155.541748046875, - "epoch": 0.072, - "grad_norm": 0.29316407442092896, - "kl": 0.0007872581481933594, + "completion_length": 2848.166702270508, + "epoch": 0.144, + "grad_norm": 0.1632193773984909, + "kl": 0.0012958049774169922, "learning_rate": 9.381311511432658e-07, - "loss": 0.0, - "reward": 0.16190030612051487, - "reward_std": 0.16819189861416817, - "rewards/cosine_scaled_reward": 0.08260180242359638, - "rewards/format_reward": 0.7916666716337204, + "loss": 0.0001, + "reward": 0.05145277862902731, + "reward_std": 0.1218188302591443, + "rewards/cosine_scaled_reward": -0.06772069446742535, + "rewards/format_reward": 0.43750000558793545, "step": 126 }, { - "completion_length": 1872.1250305175781, - "epoch": 0.07257142857142856, - "grad_norm": 0.3099227249622345, - "kl": 0.0006561279296875, + "completion_length": 2989.2708892822266, + "epoch": 0.14514285714285713, + "grad_norm": 0.1904735267162323, + "kl": 0.0023894309997558594, "learning_rate": 9.36531953618799e-07, - "loss": 0.0, - "reward": 0.1794773992151022, - "reward_std": 0.12134084850549698, - "rewards/cosine_scaled_reward": 0.1718052290380001, - "rewards/format_reward": 0.7083333432674408, + "loss": 0.0001, + "reward": 0.010273065650835633, + "reward_std": 0.11918974481523037, + "rewards/cosine_scaled_reward": -0.1677860009167489, + "rewards/format_reward": 0.39583334513008595, "step": 127 }, { - "completion_length": 3306.416748046875, - "epoch": 0.07314285714285715, - "grad_norm": 0.2559962272644043, - "kl": 0.0005383491516113281, + "completion_length": 2901.5625228881836, + "epoch": 0.1462857142857143, + "grad_norm": 0.18506519496440887, + "kl": 0.003477931022644043, "learning_rate": 9.34913917072228e-07, - "loss": 0.0, - "reward": 0.020507143228314817, - "reward_std": 0.09752212930470705, - "rewards/cosine_scaled_reward": -0.10445040091872215, - "rewards/format_reward": 0.3333333469927311, + "loss": 0.0001, + "reward": 0.08515205327421427, + "reward_std": 0.14246997330337763, + "rewards/cosine_scaled_reward": 0.07529417611658573, + "rewards/format_reward": 0.3541666753590107, "step": 128 }, { - "completion_length": 2649.9583587646484, - "epoch": 0.07371428571428572, - "grad_norm": 0.25425705313682556, - "kl": 0.0005583763122558594, + "completion_length": 3311.187530517578, + "epoch": 0.14742857142857144, + "grad_norm": 0.22474369406700134, + "kl": 0.0030045509338378906, "learning_rate": 9.332771203643714e-07, - "loss": 0.0, - "reward": 0.08644429221749306, - "reward_std": 0.10645583271980286, - "rewards/cosine_scaled_reward": 0.003808148205280304, - "rewards/format_reward": 0.5, + "loss": 0.0001, + "reward": -0.026094807864865288, + "reward_std": 0.12371798837557435, + "rewards/cosine_scaled_reward": -0.19261731766164303, + "rewards/format_reward": 0.22916667349636555, "step": 129 }, { - "completion_length": 2776.125, - "epoch": 0.07428571428571429, - "grad_norm": 0.3987893760204315, - "kl": 0.0007352828979492188, + "completion_length": 2830.0833587646484, + "epoch": 0.14857142857142858, + "grad_norm": 0.18044407665729523, + "kl": 0.0024716854095458984, "learning_rate": 9.316216432703916e-07, - "loss": 0.0, - "reward": -0.06129014492034912, - "reward_std": 0.05513091990724206, - "rewards/cosine_scaled_reward": -0.36728058755397797, - "rewards/format_reward": 0.375, + "loss": 0.0001, + "reward": -0.006927699316293001, + "reward_std": 0.08703188924118876, + "rewards/cosine_scaled_reward": -0.1649078167974949, + "rewards/format_reward": 0.29166667349636555, "step": 130 }, { - "completion_length": 2755.9583740234375, - "epoch": 0.07485714285714286, - "grad_norm": 0.2685925364494324, - "kl": 0.0005114078521728516, + "completion_length": 2923.0000228881836, + "epoch": 0.14971428571428572, + "grad_norm": 0.22359904646873474, + "kl": 0.0043888092041015625, "learning_rate": 9.299475664759068e-07, - "loss": 0.0, - "reward": -0.015627777203917503, - "reward_std": 0.12313933670520782, - "rewards/cosine_scaled_reward": -0.191717523150146, - "rewards/format_reward": 0.2916666679084301, + "loss": 0.0002, + "reward": 0.04467150871641934, + "reward_std": 0.11750007548835129, + "rewards/cosine_scaled_reward": -0.02377407788299024, + "rewards/format_reward": 0.31250000558793545, "step": 131 }, { - "completion_length": 1364.4166870117188, - "epoch": 0.07542857142857143, - "grad_norm": 0.3648519814014435, - "kl": 0.001399993896484375, + "completion_length": 2468.4583587646484, + "epoch": 0.15085714285714286, + "grad_norm": 0.20087915658950806, + "kl": 0.0021352767944335938, "learning_rate": 9.282549715730579e-07, "loss": 0.0001, - "reward": 0.21200655028223991, - "reward_std": 0.06416944274678826, - "rewards/cosine_scaled_reward": 0.25286095403134823, - "rewards/format_reward": 0.75, + "reward": 0.04917083401232958, + "reward_std": 0.10673188930377364, + "rewards/cosine_scaled_reward": -0.07255440950393677, + "rewards/format_reward": 0.43750000186264515, "step": 132 }, { - "completion_length": 3164.0833435058594, - "epoch": 0.076, - "grad_norm": 0.20253612101078033, - "kl": 0.0011391639709472656, + "completion_length": 2899.750030517578, + "epoch": 0.152, + "grad_norm": 0.2198459506034851, + "kl": 0.0033211708068847656, "learning_rate": 9.265439410565328e-07, - "loss": 0.0, - "reward": -0.009751026052981615, - "reward_std": 0.09113957174122334, - "rewards/cosine_scaled_reward": -0.15398671105504036, - "rewards/format_reward": 0.2500000111758709, + "loss": 0.0001, + "reward": 0.007418630411848426, + "reward_std": 0.09727612743154168, + "rewards/cosine_scaled_reward": -0.16582289477810264, + "rewards/format_reward": 0.3750000111758709, "step": 133 }, { - "completion_length": 3584.0, - "epoch": 0.07657142857142857, - "grad_norm": 0.17280180752277374, - "kl": 0.0002849102020263672, + "completion_length": 2317.9375228881836, + "epoch": 0.15314285714285714, + "grad_norm": 0.19853614270687103, + "kl": 0.004344940185546875, "learning_rate": 9.248145583195447e-07, - "loss": 0.0, - "reward": -0.09615898318588734, - "reward_std": 0.06122902221977711, - "rewards/cosine_scaled_reward": -0.2841451019048691, - "rewards/format_reward": 0.0, + "loss": 0.0002, + "reward": 0.09111709147691727, + "reward_std": 0.11203813040629029, + "rewards/cosine_scaled_reward": -0.02176826912909746, + "rewards/format_reward": 0.5833333432674408, "step": 134 }, { - "completion_length": 1193.8750610351562, - "epoch": 0.07714285714285714, - "grad_norm": 0.3936041593551636, - "kl": 0.001529693603515625, + "completion_length": 1802.4791946411133, + "epoch": 0.15428571428571428, + "grad_norm": 0.26218482851982117, + "kl": 0.0051059722900390625, "learning_rate": 9.230669076497687e-07, - "loss": 0.0001, - "reward": 0.16709956154227257, - "reward_std": 0.19497814774513245, - "rewards/cosine_scaled_reward": 0.03372149355709553, - "rewards/format_reward": 0.9166666716337204, + "loss": 0.0002, + "reward": 0.19630146119743586, + "reward_std": 0.15289967821445316, + "rewards/cosine_scaled_reward": 0.23608196713030338, + "rewards/format_reward": 0.6875000111758709, "step": 135 }, { - "completion_length": 2334.2916870117188, - "epoch": 0.07771428571428571, - "grad_norm": 0.30911606550216675, - "kl": 0.0007023811340332031, + "completion_length": 2570.854248046875, + "epoch": 0.15542857142857142, + "grad_norm": 0.20145438611507416, + "kl": 0.0037708282470703125, "learning_rate": 9.213010742252327e-07, - "loss": 0.0, - "reward": 0.07801801711320877, - "reward_std": 0.058914353139698505, - "rewards/cosine_scaled_reward": -0.02124110981822014, - "rewards/format_reward": 0.5, + "loss": 0.0002, + "reward": 0.0829686057404615, + "reward_std": 0.1752019147388637, + "rewards/cosine_scaled_reward": -0.026120582595467567, + "rewards/format_reward": 0.5416666753590107, "step": 136 }, { - "completion_length": 2367.875030517578, - "epoch": 0.07828571428571429, - "grad_norm": 0.3404524028301239, - "kl": 0.0021572113037109375, + "completion_length": 2661.2500381469727, + "epoch": 0.15657142857142858, + "grad_norm": 0.19357463717460632, + "kl": 0.0031414031982421875, "learning_rate": 9.195171441101668e-07, "loss": 0.0001, - "reward": 0.0090090436860919, - "reward_std": 0.15126726776361465, - "rewards/cosine_scaled_reward": -0.20420951768755913, - "rewards/format_reward": 0.4583333507180214, + "reward": 0.0038152660708874464, + "reward_std": 0.10811280179768801, + "rewards/cosine_scaled_reward": -0.21752969082444906, + "rewards/format_reward": 0.45833334513008595, "step": 137 }, { - "completion_length": 2165.4584045410156, - "epoch": 0.07885714285714286, - "grad_norm": 0.3800869286060333, - "kl": 0.001201629638671875, + "completion_length": 2141.166702270508, + "epoch": 0.15771428571428572, + "grad_norm": 0.2089914232492447, + "kl": 0.002676725387573242, "learning_rate": 9.177152042508077e-07, - "loss": 0.0, - "reward": 0.028105018951464444, - "reward_std": 0.14362438395619392, - "rewards/cosine_scaled_reward": -0.21166812255978584, - "rewards/format_reward": 0.5833333469927311, + "loss": 0.0001, + "reward": 0.14542343048378825, + "reward_std": 0.15844334475696087, + "rewards/cosine_scaled_reward": 0.04235106392297894, + "rewards/format_reward": 0.7708333618938923, "step": 138 }, { - "completion_length": 2959.4583740234375, - "epoch": 0.07942857142857143, - "grad_norm": 0.18971167504787445, - "kl": 0.000293731689453125, + "completion_length": 2820.8750610351562, + "epoch": 0.15885714285714286, + "grad_norm": 0.18096813559532166, + "kl": 0.0042743682861328125, "learning_rate": 9.158953424711624e-07, - "loss": 0.0, - "reward": 0.03813161235302687, - "reward_std": 0.1281577367335558, - "rewards/cosine_scaled_reward": -0.17832310870289803, - "rewards/format_reward": 0.5833333432674408, + "loss": 0.0002, + "reward": 0.06168772419914603, + "reward_std": 0.14358124835416675, + "rewards/cosine_scaled_reward": -0.07845509238541126, + "rewards/format_reward": 0.520833345130086, "step": 139 }, { - "completion_length": 2643.125, - "epoch": 0.08, - "grad_norm": 0.30835434794425964, - "kl": 0.0021344423294067383, + "completion_length": 2436.354217529297, + "epoch": 0.16, + "grad_norm": 0.27698883414268494, + "kl": 0.0058422088623046875, "learning_rate": 9.140576474687263e-07, - "loss": 0.0001, - "reward": -0.020217259414494038, - "reward_std": 0.10152997449040413, - "rewards/cosine_scaled_reward": -0.24589134380221367, - "rewards/format_reward": 0.3750000037252903, + "loss": 0.0002, + "reward": 0.044038140535121784, + "reward_std": 0.07816965272650123, + "rewards/cosine_scaled_reward": -0.11026089265942574, + "rewards/format_reward": 0.4791666716337204, "step": 140 }, { - "completion_length": 3135.1666870117188, - "epoch": 0.08057142857142857, - "grad_norm": 0.2558992803096771, - "kl": 0.000596463680267334, + "completion_length": 2359.375045776367, + "epoch": 0.16114285714285714, + "grad_norm": 0.1908435970544815, + "kl": 0.0043811798095703125, "learning_rate": 9.122022088101613e-07, - "loss": 0.0, - "reward": -0.014769007684662938, - "reward_std": 0.096078310161829, - "rewards/cosine_scaled_reward": -0.16845327895134687, - "rewards/format_reward": 0.2500000074505806, + "loss": 0.0002, + "reward": 0.03223781171254814, + "reward_std": 0.11182020884007215, + "rewards/cosine_scaled_reward": -0.1976433489471674, + "rewards/format_reward": 0.5833333414047956, "step": 141 }, { - "completion_length": 2130.7916870117188, - "epoch": 0.08114285714285714, - "grad_norm": 0.5362604856491089, - "kl": 0.0020551681518554688, + "completion_length": 2460.187530517578, + "epoch": 0.16228571428571428, + "grad_norm": 0.1981201469898224, + "kl": 0.0038270950317382812, "learning_rate": 9.103291169269299e-07, - "loss": 0.0001, - "reward": 0.09697438403964043, - "reward_std": 0.1245157066732645, - "rewards/cosine_scaled_reward": 0.03538942337036133, - "rewards/format_reward": 0.5, + "loss": 0.0002, + "reward": 0.08847181824967265, + "reward_std": 0.13877713168039918, + "rewards/cosine_scaled_reward": -0.06368941674008965, + "rewards/format_reward": 0.645833345130086, "step": 142 }, { - "completion_length": 1713.8750762939453, - "epoch": 0.08171428571428571, - "grad_norm": 0.4555683135986328, - "kl": 0.0018873214721679688, + "completion_length": 2437.791702270508, + "epoch": 0.16342857142857142, + "grad_norm": 0.3880097270011902, + "kl": 0.023929595947265625, "learning_rate": 9.084384631108882e-07, - "loss": 0.0001, - "reward": 0.07669693045318127, - "reward_std": 0.1109575666487217, - "rewards/cosine_scaled_reward": -0.17139715957455337, - "rewards/format_reward": 0.7916666716337204, + "loss": 0.001, + "reward": 0.016975378792267293, + "reward_std": 0.12177514331415296, + "rewards/cosine_scaled_reward": -0.20094774826429784, + "rewards/format_reward": 0.5000000111758709, "step": 143 }, { - "completion_length": 3281.4583740234375, - "epoch": 0.08228571428571428, - "grad_norm": 0.19648481905460358, - "kl": 0.0004582405090332031, + "completion_length": 2708.916732788086, + "epoch": 0.16457142857142856, + "grad_norm": 0.21965523064136505, + "kl": 0.00417327880859375, "learning_rate": 9.065303395098358e-07, - "loss": 0.0, - "reward": -0.014060030691325665, - "reward_std": 0.11811933852732182, - "rewards/cosine_scaled_reward": -0.1666712760925293, - "rewards/format_reward": 0.2500000074505806, + "loss": 0.0002, + "reward": 0.030230441665480612, + "reward_std": 0.17415813449770212, + "rewards/cosine_scaled_reward": -0.11869808053597808, + "rewards/format_reward": 0.4166666753590107, "step": 144 }, { - "completion_length": 3426.625, - "epoch": 0.08285714285714285, - "grad_norm": 0.19925186038017273, - "kl": 0.0004405975341796875, + "completion_length": 1839.4375228881836, + "epoch": 0.1657142857142857, + "grad_norm": 0.2578926384449005, + "kl": 0.004568576812744141, "learning_rate": 9.046048391230247e-07, - "loss": 0.0, - "reward": -0.018054714426398277, - "reward_std": 0.09174860920757055, - "rewards/cosine_scaled_reward": -0.1567653939127922, - "rewards/format_reward": 0.2083333432674408, + "loss": 0.0002, + "reward": 0.15131515043321997, + "reward_std": 0.15678277891129255, + "rewards/cosine_scaled_reward": 0.0914019983028993, + "rewards/format_reward": 0.7083333358168602, "step": 145 }, { - "completion_length": 3485.9583740234375, - "epoch": 0.08342857142857144, - "grad_norm": 0.20348073542118073, - "kl": 0.000531315803527832, + "completion_length": 1990.5833892822266, + "epoch": 0.16685714285714287, + "grad_norm": 0.19752271473407745, + "kl": 0.003320455551147461, "learning_rate": 9.026620557966279e-07, - "loss": 0.0, - "reward": 0.040838935412466526, - "reward_std": 0.17878105863928795, - "rewards/cosine_scaled_reward": -0.0029604285955429077, - "rewards/format_reward": 0.2500000037252903, + "loss": 0.0001, + "reward": 0.09106699889525771, + "reward_std": 0.10312592587433755, + "rewards/cosine_scaled_reward": -0.1385479016462341, + "rewards/format_reward": 0.8125000149011612, "step": 146 }, { - "completion_length": 2924.7083740234375, - "epoch": 0.084, - "grad_norm": 0.19717860221862793, - "kl": 0.0002467632293701172, + "completion_length": 1966.604232788086, + "epoch": 0.168, + "grad_norm": 0.26733097434043884, + "kl": 0.006213188171386719, "learning_rate": 9.007020842191634e-07, - "loss": 0.0, - "reward": 0.1250794786028564, - "reward_std": 0.16694453824311495, - "rewards/cosine_scaled_reward": 0.11983045563101768, - "rewards/format_reward": 0.5000000074505806, + "loss": 0.0002, + "reward": 0.12648878013715148, + "reward_std": 0.1677848151884973, + "rewards/cosine_scaled_reward": 0.016043312381953, + "rewards/format_reward": 0.7083333469927311, "step": 147 }, { - "completion_length": 3223.4166870117188, - "epoch": 0.08457142857142858, - "grad_norm": 0.2469419240951538, - "kl": 0.0021734237670898438, + "completion_length": 1654.6250457763672, + "epoch": 0.16914285714285715, + "grad_norm": 0.2192990928888321, + "kl": 0.004794120788574219, "learning_rate": 8.987250199168808e-07, - "loss": 0.0001, - "reward": 0.01485387422144413, - "reward_std": 0.14883059356361628, - "rewards/cosine_scaled_reward": -0.060995956882834435, - "rewards/format_reward": 0.2083333358168602, + "loss": 0.0002, + "reward": 0.09204956935718656, + "reward_std": 0.08866735780611634, + "rewards/cosine_scaled_reward": -0.13534213416278362, + "rewards/format_reward": 0.8125000149011612, "step": 148 }, { - "completion_length": 3118.625030517578, - "epoch": 0.08514285714285715, - "grad_norm": 0.20025363564491272, - "kl": 0.0003981590270996094, + "completion_length": 2345.729232788086, + "epoch": 0.1702857142857143, + "grad_norm": 0.2061564177274704, + "kl": 0.0041484832763671875, "learning_rate": 8.967309592491052e-07, - "loss": 0.0, - "reward": -0.027475359849631786, - "reward_std": 0.0655772490426898, - "rewards/cosine_scaled_reward": -0.24801118299365044, - "rewards/format_reward": 0.3333333358168602, + "loss": 0.0002, + "reward": 0.08000461710616946, + "reward_std": 0.12683053640648723, + "rewards/cosine_scaled_reward": -0.06624547764658928, + "rewards/format_reward": 0.6041666828095913, "step": 149 }, { - "completion_length": 2811.8333740234375, - "epoch": 0.08571428571428572, - "grad_norm": 0.24798090755939484, - "kl": 0.001667022705078125, + "completion_length": 2006.6250381469727, + "epoch": 0.17142857142857143, + "grad_norm": 0.24100010097026825, + "kl": 0.00368499755859375, "learning_rate": 8.9471999940354e-07, "loss": 0.0001, - "reward": 0.15302221104502678, - "reward_std": 0.1286456696689129, - "rewards/cosine_scaled_reward": 0.2414417085237801, - "rewards/format_reward": 0.4166666716337204, + "reward": 0.08540961390826851, + "reward_std": 0.15422609634697437, + "rewards/cosine_scaled_reward": -0.0530257155187428, + "rewards/format_reward": 0.6041666734963655, "step": 150 }, { - "completion_length": 2862.8333435058594, - "epoch": 0.08628571428571429, - "grad_norm": 0.3297010064125061, - "kl": 0.00044167041778564453, + "completion_length": 1996.520881652832, + "epoch": 0.17257142857142857, + "grad_norm": 0.28235289454460144, + "kl": 0.005840301513671875, "learning_rate": 8.926922383915315e-07, - "loss": 0.0, - "reward": -0.023194299079477787, - "reward_std": 0.08936419151723385, - "rewards/cosine_scaled_reward": -0.29692845046520233, - "rewards/format_reward": 0.4583333358168602, + "loss": 0.0002, + "reward": 0.12831606157124043, + "reward_std": 0.131237086141482, + "rewards/cosine_scaled_reward": 0.031276384368538857, + "rewards/format_reward": 0.6875000204890966, "step": 151 }, { - "completion_length": 2644.875030517578, - "epoch": 0.08685714285714285, - "grad_norm": 0.27902644872665405, - "kl": 0.0008206367492675781, + "completion_length": 2301.0208435058594, + "epoch": 0.1737142857142857, + "grad_norm": 0.3784143626689911, + "kl": 0.0074005126953125, "learning_rate": 8.906477750432903e-07, - "loss": 0.0, - "reward": 0.015308692585676908, - "reward_std": 0.11771413777023554, - "rewards/cosine_scaled_reward": -0.18592653423547745, - "rewards/format_reward": 0.4583333395421505, + "loss": 0.0003, + "reward": 0.025953251402825117, + "reward_std": 0.11646852549165487, + "rewards/cosine_scaled_reward": -0.18527186242863536, + "rewards/format_reward": 0.5208333488553762, "step": 152 }, { - "completion_length": 2970.2916870117188, - "epoch": 0.08742857142857142, - "grad_norm": 0.23400025069713593, - "kl": 0.0007061958312988281, + "completion_length": 2342.0834045410156, + "epoch": 0.17485714285714285, + "grad_norm": 0.20556333661079407, + "kl": 0.0073909759521484375, "learning_rate": 8.88586709003076e-07, - "loss": 0.0, - "reward": -0.005328923463821411, - "reward_std": 0.07167630735784769, - "rewards/cosine_scaled_reward": -0.2441950924694538, - "rewards/format_reward": 0.4583333432674408, + "loss": 0.0003, + "reward": 0.03766886703670025, + "reward_std": 0.08948473795317113, + "rewards/cosine_scaled_reward": -0.19193847686983645, + "rewards/format_reward": 0.6041666753590107, "step": 153 }, { - "completion_length": 3180.5, - "epoch": 0.088, - "grad_norm": 0.20705805718898773, - "kl": 0.0009854435920715332, + "completion_length": 3022.5833740234375, + "epoch": 0.176, + "grad_norm": 0.22437125444412231, + "kl": 0.00362396240234375, "learning_rate": 8.865091407243394e-07, - "loss": 0.0, - "reward": 0.010325465351343155, - "reward_std": 0.11453245859593153, - "rewards/cosine_scaled_reward": -0.0966395572759211, - "rewards/format_reward": 0.2500000111758709, + "loss": 0.0001, + "reward": 0.06817820528522134, + "reward_std": 0.16511888336390257, + "rewards/cosine_scaled_reward": -0.018517197109758854, + "rewards/format_reward": 0.4375000074505806, "step": 154 }, { - "completion_length": 2892.2500610351562, - "epoch": 0.08857142857142856, - "grad_norm": 0.23993003368377686, - "kl": 0.0006253719329833984, + "completion_length": 2435.0625610351562, + "epoch": 0.17714285714285713, + "grad_norm": 0.21803762018680573, + "kl": 0.004116058349609375, "learning_rate": 8.844151714648274e-07, - "loss": 0.0, - "reward": -0.02529401145875454, - "reward_std": 0.08164496999233961, - "rewards/cosine_scaled_reward": -0.2222640160471201, - "rewards/format_reward": 0.2916666679084301, + "loss": 0.0002, + "reward": 0.11520516406744719, + "reward_std": 0.13615657854825258, + "rewards/cosine_scaled_reward": 0.03556065930752084, + "rewards/format_reward": 0.6041666734963655, "step": 155 }, { - "completion_length": 2833.0833740234375, - "epoch": 0.08914285714285715, - "grad_norm": 0.26325613260269165, - "kl": 0.0006287097930908203, + "completion_length": 2320.583351135254, + "epoch": 0.1782857142857143, + "grad_norm": 0.1894351989030838, + "kl": 0.00403594970703125, "learning_rate": 8.823049032816478e-07, - "loss": 0.0, - "reward": 0.03351680841296911, - "reward_std": 0.1468300074338913, - "rewards/cosine_scaled_reward": -0.1301112249493599, - "rewards/format_reward": 0.4583333544433117, + "loss": 0.0002, + "reward": 0.09686407796107233, + "reward_std": 0.13217018358409405, + "rewards/cosine_scaled_reward": 0.03549210913479328, + "rewards/format_reward": 0.5000000055879354, "step": 156 }, { - "completion_length": 1936.4166870117188, - "epoch": 0.08971428571428572, - "grad_norm": 0.34370705485343933, - "kl": 0.0022745132446289062, + "completion_length": 2527.5834197998047, + "epoch": 0.17942857142857144, + "grad_norm": 0.2570294737815857, + "kl": 0.005597114562988281, "learning_rate": 8.801784390262943e-07, - "loss": 0.0001, - "reward": 0.14279305189847946, - "reward_std": 0.09327758988365531, - "rewards/cosine_scaled_reward": 0.08474906906485558, - "rewards/format_reward": 0.6666666716337204, + "loss": 0.0002, + "reward": 0.046291103353723884, + "reward_std": 0.09627518011257052, + "rewards/cosine_scaled_reward": -0.15388192608952522, + "rewards/format_reward": 0.5833333395421505, "step": 157 }, { - "completion_length": 2459.1666870117188, - "epoch": 0.09028571428571429, - "grad_norm": 0.27400168776512146, - "kl": 0.0007686614990234375, + "completion_length": 2241.729248046875, + "epoch": 0.18057142857142858, + "grad_norm": 0.21536891162395477, + "kl": 0.0056514739990234375, "learning_rate": 8.780358823396352e-07, - "loss": 0.0, - "reward": -0.007594363763928413, - "reward_std": 0.08547988906502724, - "rewards/cosine_scaled_reward": -0.27388815581798553, - "rewards/format_reward": 0.5, + "loss": 0.0002, + "reward": 0.15251322067342699, + "reward_std": 0.13267278019338846, + "rewards/cosine_scaled_reward": 0.06854456290602684, + "rewards/format_reward": 0.7500000186264515, "step": 158 }, { - "completion_length": 3369.6666870117188, - "epoch": 0.09085714285714286, - "grad_norm": 0.2419930398464203, - "kl": 0.0025382041931152344, + "completion_length": 2231.7708587646484, + "epoch": 0.18171428571428572, + "grad_norm": 0.1829807013273239, + "kl": 0.004848480224609375, "learning_rate": 8.758773376468604e-07, - "loss": 0.0001, - "reward": 0.08435911685228348, - "reward_std": 0.21178193762898445, - "rewards/cosine_scaled_reward": 0.08296507969498634, - "rewards/format_reward": 0.3333333395421505, + "loss": 0.0002, + "reward": 0.044310242868959904, + "reward_std": 0.11434727860614657, + "rewards/cosine_scaled_reward": -0.16176462545990944, + "rewards/format_reward": 0.5833333432674408, "step": 159 }, { - "completion_length": 3119.125030517578, - "epoch": 0.09142857142857143, - "grad_norm": 0.2259458303451538, - "kl": 0.0006680488586425781, + "completion_length": 2064.312530517578, + "epoch": 0.18285714285714286, + "grad_norm": 0.23337005078792572, + "kl": 0.006267547607421875, "learning_rate": 8.737029101523929e-07, - "loss": 0.0, - "reward": -0.029232572065666318, - "reward_std": 0.08499846514314413, - "rewards/cosine_scaled_reward": -0.23120503779500723, - "rewards/format_reward": 0.2916666679084301, + "loss": 0.0003, + "reward": 0.10007608711021021, + "reward_std": 0.1771828606724739, + "rewards/cosine_scaled_reward": -0.04113523324485868, + "rewards/format_reward": 0.6666666828095913, "step": 160 }, { - "completion_length": 3030.8750610351562, - "epoch": 0.092, - "grad_norm": 0.2455476075410843, - "kl": 0.0012464523315429688, + "completion_length": 2270.937545776367, + "epoch": 0.184, + "grad_norm": 0.3289494812488556, + "kl": 0.007640838623046875, "learning_rate": 8.715127058347614e-07, - "loss": 0.0, - "reward": 0.07169072097167373, - "reward_std": 0.09119516797363758, - "rewards/cosine_scaled_reward": 0.007221970707178116, - "rewards/format_reward": 0.4166666679084301, + "loss": 0.0003, + "reward": 0.053364482591859996, + "reward_std": 0.14938203245401382, + "rewards/cosine_scaled_reward": -0.11477282957639545, + "rewards/format_reward": 0.541666679084301, "step": 161 }, { - "completion_length": 2479.4166717529297, - "epoch": 0.09257142857142857, - "grad_norm": 0.4288926422595978, - "kl": 0.0049152374267578125, + "completion_length": 2312.416702270508, + "epoch": 0.18514285714285714, + "grad_norm": 0.26157301664352417, + "kl": 0.008253097534179688, "learning_rate": 8.693068314414344e-07, - "loss": 0.0002, - "reward": -0.02276976825669408, - "reward_std": 0.07757014129310846, - "rewards/cosine_scaled_reward": -0.256338007748127, - "rewards/format_reward": 0.375, + "loss": 0.0003, + "reward": 0.04589501162990928, + "reward_std": 0.12189697381108999, + "rewards/cosine_scaled_reward": -0.13589882757514715, + "rewards/format_reward": 0.5416666753590107, "step": 162 }, { - "completion_length": 2956.9583740234375, - "epoch": 0.09314285714285714, - "grad_norm": 0.2355862259864807, - "kl": 0.0015063285827636719, + "completion_length": 2191.4583740234375, + "epoch": 0.18628571428571428, + "grad_norm": 0.2897321283817291, + "kl": 0.00688934326171875, "learning_rate": 8.670853944836176e-07, - "loss": 0.0001, - "reward": 0.10418934188783169, - "reward_std": 0.11982499714940786, - "rewards/cosine_scaled_reward": 0.10006946325302124, - "rewards/format_reward": 0.4166666716337204, + "loss": 0.0003, + "reward": 0.13259745202958584, + "reward_std": 0.06983533198945224, + "rewards/cosine_scaled_reward": 0.07808320969343185, + "rewards/format_reward": 0.625, "step": 163 }, { - "completion_length": 2112.2083435058594, - "epoch": 0.09371428571428571, - "grad_norm": 0.3043869435787201, - "kl": 0.0017682313919067383, + "completion_length": 1521.0625610351562, + "epoch": 0.18742857142857142, + "grad_norm": 0.2747092843055725, + "kl": 0.009218215942382812, "learning_rate": 8.648485032310144e-07, - "loss": 0.0001, - "reward": 0.2011046763509512, - "reward_std": 0.10349223669618368, - "rewards/cosine_scaled_reward": 0.3217890188097954, - "rewards/format_reward": 0.5416666679084301, + "loss": 0.0004, + "reward": 0.12581492541357875, + "reward_std": 0.09997158916667104, + "rewards/cosine_scaled_reward": -0.03640655893832445, + "rewards/format_reward": 0.8125000055879354, "step": 164 }, { - "completion_length": 3509.2083740234375, - "epoch": 0.09428571428571429, - "grad_norm": 0.21398882567882538, - "kl": 0.001675724983215332, + "completion_length": 1848.2500228881836, + "epoch": 0.18857142857142858, + "grad_norm": 0.275541216135025, + "kl": 0.006542205810546875, "learning_rate": 8.625962667065487e-07, - "loss": 0.0001, - "reward": -0.03561374684795737, - "reward_std": 0.07662838604301214, - "rewards/cosine_scaled_reward": -0.18927406147122383, - "rewards/format_reward": 0.1666666679084301, + "loss": 0.0003, + "reward": 0.0523092825897038, + "reward_std": 0.12433975096791983, + "rewards/cosine_scaled_reward": -0.1790934158489108, + "rewards/format_reward": 0.6666666716337204, "step": 165 }, { - "completion_length": 2245.2083435058594, - "epoch": 0.09485714285714286, - "grad_norm": 0.3329755663871765, - "kl": 0.0022058486938476562, + "completion_length": 2049.0417251586914, + "epoch": 0.18971428571428572, + "grad_norm": 0.1916612833738327, + "kl": 0.0053234100341796875, "learning_rate": 8.603287946810513e-07, - "loss": 0.0001, - "reward": 0.07745502702891827, - "reward_std": 0.12439612857997417, - "rewards/cosine_scaled_reward": -0.06281759589910507, - "rewards/format_reward": 0.5833333358168602, + "loss": 0.0002, + "reward": 0.09825841523706913, + "reward_std": 0.15324107883498073, + "rewards/cosine_scaled_reward": -0.0768615622073412, + "rewards/format_reward": 0.7291666734963655, "step": 166 }, { - "completion_length": 2427.7084045410156, - "epoch": 0.09542857142857143, - "grad_norm": 0.23882003128528595, - "kl": 0.0007524490356445312, + "completion_length": 1554.1666946411133, + "epoch": 0.19085714285714286, + "grad_norm": 0.22473838925361633, + "kl": 0.0043048858642578125, "learning_rate": 8.580461976679099e-07, - "loss": 0.0, - "reward": 0.1526787169277668, - "reward_std": 0.08664220478385687, - "rewards/cosine_scaled_reward": 0.05513417907059193, - "rewards/format_reward": 0.7916666679084301, + "loss": 0.0002, + "reward": 0.12430966179817915, + "reward_std": 0.12945860624313354, + "rewards/cosine_scaled_reward": -0.07466088375076652, + "rewards/format_reward": 0.8750000111758709, "step": 167 }, { - "completion_length": 3174.916748046875, - "epoch": 0.096, - "grad_norm": 0.2548929750919342, - "kl": 0.0009465217590332031, + "completion_length": 2357.5209045410156, + "epoch": 0.192, + "grad_norm": 0.218026265501976, + "kl": 0.0063018798828125, "learning_rate": 8.557485869176825e-07, - "loss": 0.0, - "reward": 0.04437680635601282, - "reward_std": 0.12012367881834507, - "rewards/cosine_scaled_reward": -0.03799363225698471, - "rewards/format_reward": 0.3333333432674408, + "loss": 0.0003, + "reward": 0.0786146642640233, + "reward_std": 0.14164651185274124, + "rewards/cosine_scaled_reward": -0.08221817389130592, + "rewards/format_reward": 0.6250000223517418, "step": 168 }, { - "completion_length": 2719.416748046875, - "epoch": 0.09657142857142857, - "grad_norm": 0.20099206268787384, - "kl": 0.0010776519775390625, + "completion_length": 1244.0625381469727, + "epoch": 0.19314285714285714, + "grad_norm": 0.2136813998222351, + "kl": 0.005023002624511719, "learning_rate": 8.534360744126753e-07, - "loss": 0.0, - "reward": 0.09815767779946327, - "reward_std": 0.22022154554724693, - "rewards/cosine_scaled_reward": -0.0014746334636583924, - "rewards/format_reward": 0.5833333507180214, + "loss": 0.0002, + "reward": 0.2644330468028784, + "reward_std": 0.10949333664029837, + "rewards/cosine_scaled_reward": 0.2890866380184889, + "rewards/format_reward": 0.9791666716337204, "step": 169 }, { - "completion_length": 2693.5416870117188, - "epoch": 0.09714285714285714, - "grad_norm": 0.2715960741043091, - "kl": 0.0010104179382324219, + "completion_length": 2086.333366394043, + "epoch": 0.19428571428571428, + "grad_norm": 0.23367033898830414, + "kl": 0.0049877166748046875, "learning_rate": 8.511087728614862e-07, - "loss": 0.0, - "reward": 0.06597982347011566, - "reward_std": 0.1477346494793892, - "rewards/cosine_scaled_reward": -0.11717931320890784, - "rewards/format_reward": 0.625, + "loss": 0.0002, + "reward": 0.13441222603432834, + "reward_std": 0.12571046565426514, + "rewards/cosine_scaled_reward": 0.07233863137662411, + "rewards/format_reward": 0.6458333395421505, "step": 170 }, { - "completion_length": 3412.1666870117188, - "epoch": 0.09771428571428571, - "grad_norm": 0.2158529907464981, - "kl": 0.002594470977783203, + "completion_length": 2040.7708587646484, + "epoch": 0.19542857142857142, + "grad_norm": 0.2201111614704132, + "kl": 0.004878044128417969, "learning_rate": 8.487667956935087e-07, - "loss": 0.0001, - "reward": 0.011013628914952278, - "reward_std": 0.090276881121099, - "rewards/cosine_scaled_reward": -0.09002431482076645, - "rewards/format_reward": 0.25, + "loss": 0.0002, + "reward": 0.07799222506582737, + "reward_std": 0.1219726437702775, + "rewards/cosine_scaled_reward": -0.09505355032160878, + "rewards/format_reward": 0.6458333395421505, "step": 171 }, { - "completion_length": 1863.5417022705078, - "epoch": 0.09828571428571428, - "grad_norm": 0.29856592416763306, - "kl": 0.0009441375732421875, + "completion_length": 2090.562557220459, + "epoch": 0.19657142857142856, + "grad_norm": 0.24928785860538483, + "kl": 0.008022308349609375, "learning_rate": 8.464102570534061e-07, - "loss": 0.0, - "reward": 0.09323265589773655, - "reward_std": 0.10657317750155926, - "rewards/cosine_scaled_reward": -0.09766584308817983, - "rewards/format_reward": 0.75, + "loss": 0.0003, + "reward": 0.1555600226856768, + "reward_std": 0.12234124867245555, + "rewards/cosine_scaled_reward": 0.14208754245191813, + "rewards/format_reward": 0.6250000149011612, "step": 172 }, { - "completion_length": 3000.8750610351562, - "epoch": 0.09885714285714285, - "grad_norm": 0.26156315207481384, - "kl": 0.0020602941513061523, + "completion_length": 1355.5000228881836, + "epoch": 0.1977142857142857, + "grad_norm": 0.2922389805316925, + "kl": 0.00618743896484375, "learning_rate": 8.440392717955475e-07, - "loss": 0.0001, - "reward": 0.059873973950743675, - "reward_std": 0.1590843703597784, - "rewards/cosine_scaled_reward": 0.009898446500301361, - "rewards/format_reward": 0.3333333432674408, + "loss": 0.0002, + "reward": 0.07720496540423483, + "reward_std": 0.08944753208197653, + "rewards/cosine_scaled_reward": -0.16825812682509422, + "rewards/format_reward": 0.7916666679084301, "step": 173 }, { - "completion_length": 2169.7500915527344, - "epoch": 0.09942857142857142, - "grad_norm": 0.25845739245414734, - "kl": 0.0028324127197265625, + "completion_length": 1297.6042098999023, + "epoch": 0.19885714285714284, + "grad_norm": 0.21333926916122437, + "kl": 0.008691787719726562, "learning_rate": 8.416539554784089e-07, - "loss": 0.0001, - "reward": 0.12606202624738216, - "reward_std": 0.10567605309188366, - "rewards/cosine_scaled_reward": -0.06765587627887726, - "rewards/format_reward": 0.8750000298023224, + "loss": 0.0003, + "reward": 0.1351238526403904, + "reward_std": 0.09852686384692788, + "rewards/cosine_scaled_reward": -0.0807552793994546, + "rewards/format_reward": 0.9583333432674408, "step": 174 }, { - "completion_length": 2109.3333740234375, - "epoch": 0.1, - "grad_norm": 0.30087336897850037, - "kl": 0.0016384124755859375, + "completion_length": 1959.2708892822266, + "epoch": 0.2, + "grad_norm": 0.22018226981163025, + "kl": 0.006298065185546875, "learning_rate": 8.392544243589427e-07, - "loss": 0.0001, - "reward": 0.13839344680309296, - "reward_std": 0.13256806321442127, - "rewards/cosine_scaled_reward": 0.07393108680844307, - "rewards/format_reward": 0.6666666865348816, + "loss": 0.0003, + "reward": 0.14479633374139667, + "reward_std": 0.0969647653400898, + "rewards/cosine_scaled_reward": 0.06122639961540699, + "rewards/format_reward": 0.7291666753590107, "step": 175 }, { - "completion_length": 2961.166748046875, - "epoch": 0.10057142857142858, - "grad_norm": 0.2737109065055847, - "kl": 0.0030095577239990234, + "completion_length": 1673.6875381469727, + "epoch": 0.20114285714285715, + "grad_norm": 0.2576465308666229, + "kl": 0.0056438446044921875, "learning_rate": 8.368407953869103e-07, - "loss": 0.0001, - "reward": 0.10537720960564911, - "reward_std": 0.23410072550177574, - "rewards/cosine_scaled_reward": 0.0412262286990881, - "rewards/format_reward": 0.5416666902601719, + "loss": 0.0002, + "reward": 0.0873019965365529, + "reward_std": 0.11411390919238329, + "rewards/cosine_scaled_reward": -0.12017827155068517, + "rewards/format_reward": 0.7500000111758709, "step": 176 }, { - "completion_length": 2338.0000915527344, - "epoch": 0.10114285714285715, - "grad_norm": 0.3184531033039093, - "kl": 0.00208282470703125, + "completion_length": 2214.270881652832, + "epoch": 0.2022857142857143, + "grad_norm": 0.28319552540779114, + "kl": 0.0062694549560546875, "learning_rate": 8.344131861991828e-07, - "loss": 0.0001, - "reward": 0.05052237864583731, - "reward_std": 0.17906413041055202, - "rewards/cosine_scaled_reward": -0.12304041348397732, - "rewards/format_reward": 0.5416666679084301, + "loss": 0.0003, + "reward": 0.08295768650714308, + "reward_std": 0.10326328268274665, + "rewards/cosine_scaled_reward": -0.09064689744263887, + "rewards/format_reward": 0.6666666753590107, "step": 177 }, { - "completion_length": 3078.4583740234375, - "epoch": 0.10171428571428572, - "grad_norm": 0.24096979200839996, - "kl": 0.0026640892028808594, + "completion_length": 1769.8542022705078, + "epoch": 0.20342857142857143, + "grad_norm": 0.28282982110977173, + "kl": 0.00852203369140625, "learning_rate": 8.319717151140072e-07, - "loss": 0.0001, - "reward": -0.007279876619577408, - "reward_std": 0.1127309650182724, - "rewards/cosine_scaled_reward": -0.2085082083940506, - "rewards/format_reward": 0.3750000149011612, + "loss": 0.0003, + "reward": 0.1297212722711265, + "reward_std": 0.15833280980587006, + "rewards/cosine_scaled_reward": 0.026437816210091114, + "rewards/format_reward": 0.7083333432674408, "step": 178 }, { - "completion_length": 1732.9583435058594, - "epoch": 0.10228571428571429, - "grad_norm": 0.4574584364891052, - "kl": 0.00363922119140625, + "completion_length": 2106.7291717529297, + "epoch": 0.20457142857142857, + "grad_norm": 0.21418659389019012, + "kl": 0.00591278076171875, "learning_rate": 8.295165011252396e-07, - "loss": 0.0001, - "reward": 0.060525269247591496, - "reward_std": 0.054098937660455704, - "rewards/cosine_scaled_reward": -0.19929899834096432, - "rewards/format_reward": 0.75, + "loss": 0.0002, + "reward": 0.051371646230109036, + "reward_std": 0.10319430893287063, + "rewards/cosine_scaled_reward": -0.15093043667729944, + "rewards/format_reward": 0.6041666697710752, "step": 179 }, { - "completion_length": 2443.7500610351562, - "epoch": 0.10285714285714286, - "grad_norm": 0.3083910346031189, - "kl": 0.003475189208984375, + "completion_length": 1345.2083778381348, + "epoch": 0.2057142857142857, + "grad_norm": 0.28628435730934143, + "kl": 0.00799560546875, "learning_rate": 8.270476638965461e-07, - "loss": 0.0001, - "reward": 0.037189796566963196, - "reward_std": 0.09739750809967518, - "rewards/cosine_scaled_reward": -0.16163260862231255, - "rewards/format_reward": 0.5416666679084301, + "loss": 0.0003, + "reward": 0.2407036293298006, + "reward_std": 0.17751038947608322, + "rewards/cosine_scaled_reward": 0.25933289900422096, + "rewards/format_reward": 0.8958333432674408, "step": 180 }, { - "completion_length": 2516.1250610351562, - "epoch": 0.10342857142857143, - "grad_norm": 0.26153579354286194, - "kl": 0.0025212764739990234, + "completion_length": 1969.208381652832, + "epoch": 0.20685714285714285, + "grad_norm": 0.24428333342075348, + "kl": 0.00662994384765625, "learning_rate": 8.245653237555705e-07, - "loss": 0.0001, - "reward": 0.11242004483938217, - "reward_std": 0.10370426625013351, - "rewards/cosine_scaled_reward": 0.019959699362516403, - "rewards/format_reward": 0.625, + "loss": 0.0003, + "reward": 0.09334775037132204, + "reward_std": 0.12076092883944511, + "rewards/cosine_scaled_reward": -0.08935475163161755, + "rewards/format_reward": 0.7291666828095913, "step": 181 }, { - "completion_length": 2725.0, - "epoch": 0.104, - "grad_norm": 0.255856990814209, - "kl": 0.0017554759979248047, + "completion_length": 1827.2291946411133, + "epoch": 0.208, + "grad_norm": 0.19231781363487244, + "kl": 0.0035953521728515625, "learning_rate": 8.220696016880687e-07, "loss": 0.0001, - "reward": 0.04441619198769331, - "reward_std": 0.09660841524600983, - "rewards/cosine_scaled_reward": -0.07804835960268974, - "rewards/format_reward": 0.4166666716337204, + "reward": 0.10479269758798182, + "reward_std": 0.11822887184098363, + "rewards/cosine_scaled_reward": -0.06516919657588005, + "rewards/format_reward": 0.7500000055879354, "step": 182 }, { - "completion_length": 2273.5833740234375, - "epoch": 0.10457142857142857, - "grad_norm": 0.40616485476493835, - "kl": 0.0037136077880859375, + "completion_length": 1392.8125457763672, + "epoch": 0.20914285714285713, + "grad_norm": 0.264604389667511, + "kl": 0.009449005126953125, "learning_rate": 8.195606193320136e-07, - "loss": 0.0001, - "reward": 0.014469819143414497, - "reward_std": 0.11339808069169521, - "rewards/cosine_scaled_reward": -0.24939279817044735, - "rewards/format_reward": 0.5833333432674408, + "loss": 0.0004, + "reward": 0.1556489015929401, + "reward_std": 0.10147193586453795, + "rewards/cosine_scaled_reward": 0.019236549735069275, + "rewards/format_reward": 0.8750000074505806, "step": 183 }, { - "completion_length": 2731.916732788086, - "epoch": 0.10514285714285715, - "grad_norm": 0.2949007451534271, - "kl": 0.0030760765075683594, + "completion_length": 1635.062551498413, + "epoch": 0.2102857142857143, + "grad_norm": 0.2975788414478302, + "kl": 0.00707244873046875, "learning_rate": 8.170384989716657e-07, - "loss": 0.0001, - "reward": 0.05573008651845157, - "reward_std": 0.12555470131337643, - "rewards/cosine_scaled_reward": -0.0629606805741787, - "rewards/format_reward": 0.4583333358168602, + "loss": 0.0003, + "reward": 0.0682187182828784, + "reward_std": 0.0552776656113565, + "rewards/cosine_scaled_reward": -0.20538464561104774, + "rewards/format_reward": 0.8125000018626451, "step": 184 }, { - "completion_length": 3537.25, - "epoch": 0.10571428571428572, - "grad_norm": 0.22492392361164093, - "kl": 0.0022058486938476562, + "completion_length": 1627.1250686645508, + "epoch": 0.21142857142857144, + "grad_norm": 0.26065441966056824, + "kl": 0.0062122344970703125, "learning_rate": 8.145033635316128e-07, - "loss": 0.0001, - "reward": -0.07737948512658477, - "reward_std": 0.062332406640052795, - "rewards/cosine_scaled_reward": -0.24878150783479214, - "rewards/format_reward": 0.0416666679084301, + "loss": 0.0002, + "reward": 0.06265506497584283, + "reward_std": 0.11735227680765092, + "rewards/cosine_scaled_reward": -0.19095464050769806, + "rewards/format_reward": 0.7500000093132257, "step": 185 }, { - "completion_length": 3350.8333740234375, - "epoch": 0.10628571428571429, - "grad_norm": 0.25699582695961, - "kl": 0.0046672821044921875, + "completion_length": 1840.1458587646484, + "epoch": 0.21257142857142858, + "grad_norm": 0.2078036367893219, + "kl": 0.007350921630859375, "learning_rate": 8.119553365707802e-07, - "loss": 0.0002, - "reward": -0.05990290082991123, - "reward_std": 0.07678448967635632, - "rewards/cosine_scaled_reward": -0.25926706194877625, - "rewards/format_reward": 0.1666666679084301, + "loss": 0.0003, + "reward": 0.09769681794568896, + "reward_std": 0.09951704926788807, + "rewards/cosine_scaled_reward": -0.06353580858558416, + "rewards/format_reward": 0.7083333414047956, "step": 186 }, { - "completion_length": 2271.250030517578, - "epoch": 0.10685714285714286, - "grad_norm": 0.3348846137523651, - "kl": 0.0040645599365234375, + "completion_length": 1557.5416870117188, + "epoch": 0.21371428571428572, + "grad_norm": 0.44674429297447205, + "kl": 0.008470535278320312, "learning_rate": 8.093945422764069e-07, - "loss": 0.0002, - "reward": 0.12475723121315241, - "reward_std": 0.07640931662172079, - "rewards/cosine_scaled_reward": 0.09590934868901968, - "rewards/format_reward": 0.5416666679084301, + "loss": 0.0003, + "reward": 0.09892327198758721, + "reward_std": 0.0855756844393909, + "rewards/cosine_scaled_reward": -0.1240589041262865, + "rewards/format_reward": 0.8333333414047956, "step": 187 }, { - "completion_length": 3262.25, - "epoch": 0.10742857142857143, - "grad_norm": 0.2265806943178177, - "kl": 0.002201080322265625, + "completion_length": 2297.3333587646484, + "epoch": 0.21485714285714286, + "grad_norm": 0.2045762985944748, + "kl": 0.008823394775390625, "learning_rate": 8.068211054579943e-07, - "loss": 0.0001, - "reward": -0.030809858813881874, - "reward_std": 0.11213317047804594, - "rewards/cosine_scaled_reward": -0.1948933992534876, - "rewards/format_reward": 0.2083333358168602, + "loss": 0.0004, + "reward": 0.05021746223792434, + "reward_std": 0.06788925174623728, + "rewards/cosine_scaled_reward": -0.15488196723163128, + "rewards/format_reward": 0.6041666697710752, "step": 188 }, { - "completion_length": 3561.9583740234375, - "epoch": 0.108, - "grad_norm": 0.1893041729927063, - "kl": 0.001382589340209961, + "completion_length": 1355.6875381469727, + "epoch": 0.216, + "grad_norm": 0.29116666316986084, + "kl": 0.0076541900634765625, "learning_rate": 8.04235151541222e-07, - "loss": 0.0001, - "reward": -0.053201296366751194, - "reward_std": 0.1335222413763404, - "rewards/cosine_scaled_reward": -0.21994520723819733, - "rewards/format_reward": 0.1250000037252903, + "loss": 0.0003, + "reward": 0.07565085194073617, + "reward_std": 0.08206533431075513, + "rewards/cosine_scaled_reward": -0.19497415097430348, + "rewards/format_reward": 0.8333333358168602, "step": 189 }, { - "completion_length": 3211.6250610351562, - "epoch": 0.10857142857142857, - "grad_norm": 0.22965195775032043, - "kl": 0.001880645751953125, + "completion_length": 1135.4167022705078, + "epoch": 0.21714285714285714, + "grad_norm": 0.2219790816307068, + "kl": 0.006549835205078125, "learning_rate": 8.01636806561836e-07, - "loss": 0.0001, - "reward": 0.03332811780273914, - "reward_std": 0.12492356635630131, - "rewards/cosine_scaled_reward": -0.12997949868440628, - "rewards/format_reward": 0.4583333395421505, + "loss": 0.0003, + "reward": 0.15915191872045398, + "reward_std": 0.08802533126436174, + "rewards/cosine_scaled_reward": -0.022108266479335725, + "rewards/format_reward": 0.9791666716337204, "step": 190 }, { - "completion_length": 2315.541748046875, - "epoch": 0.10914285714285714, - "grad_norm": 0.3455600440502167, - "kl": 0.004306793212890625, + "completion_length": 1071.7708587646484, + "epoch": 0.21828571428571428, + "grad_norm": 0.26136839389801025, + "kl": 0.006053924560546875, "learning_rate": 7.990261971595048e-07, "loss": 0.0002, - "reward": 0.10726934857666492, - "reward_std": 0.1528959609568119, - "rewards/cosine_scaled_reward": 0.025072216987609863, - "rewards/format_reward": 0.5833333358168602, + "reward": 0.20304076466709375, + "reward_std": 0.12852966412901878, + "rewards/cosine_scaled_reward": 0.10831178847001866, + "rewards/format_reward": 0.9791666716337204, "step": 191 }, { - "completion_length": 2843.1666717529297, - "epoch": 0.10971428571428571, - "grad_norm": 0.30234038829803467, - "kl": 0.0014281272888183594, + "completion_length": 1712.4583740234375, + "epoch": 0.21942857142857142, + "grad_norm": 0.20474177598953247, + "kl": 0.006473541259765625, "learning_rate": 7.964034505716476e-07, - "loss": 0.0001, - "reward": -0.00438886322081089, - "reward_std": 0.12094544153660536, - "rewards/cosine_scaled_reward": -0.15985378623008728, - "rewards/format_reward": 0.2916666679084301, + "loss": 0.0003, + "reward": 0.09229559730738401, + "reward_std": 0.1016106829047203, + "rewards/cosine_scaled_reward": -0.1470730509608984, + "rewards/format_reward": 0.8333333432674408, "step": 192 }, { - "completion_length": 2507.666748046875, - "epoch": 0.11028571428571429, - "grad_norm": 0.2999958097934723, - "kl": 0.0013928413391113281, + "completion_length": 2210.395851135254, + "epoch": 0.22057142857142858, + "grad_norm": 1.7929117679595947, + "kl": 0.055267333984375, "learning_rate": 7.93768694627233e-07, - "loss": 0.0001, - "reward": 0.12146176910027862, - "reward_std": 0.18834388069808483, - "rewards/cosine_scaled_reward": 0.027391068637371063, - "rewards/format_reward": 0.6666666865348816, + "loss": 0.0022, + "reward": 0.03142698993906379, + "reward_std": 0.12175118830054998, + "rewards/cosine_scaled_reward": -0.18109365366399288, + "rewards/format_reward": 0.5416666679084301, "step": 193 }, { - "completion_length": 3312.1666870117188, - "epoch": 0.11085714285714286, - "grad_norm": 0.23863805830478668, - "kl": 0.0029125213623046875, + "completion_length": 2227.291717529297, + "epoch": 0.22171428571428572, + "grad_norm": 0.21808066964149475, + "kl": 0.00843048095703125, "learning_rate": 7.911220577405484e-07, - "loss": 0.0001, - "reward": -0.053993106354027987, - "reward_std": 0.12258123233914375, - "rewards/cosine_scaled_reward": -0.24138313718140125, - "rewards/format_reward": 0.1666666716337204, + "loss": 0.0003, + "reward": 0.1474433816038072, + "reward_std": 0.17017615539953113, + "rewards/cosine_scaled_reward": 0.07256746315397322, + "rewards/format_reward": 0.7291666716337204, "step": 194 }, { - "completion_length": 2579.5833740234375, - "epoch": 0.11142857142857143, - "grad_norm": 0.19390937685966492, - "kl": 0.0013315677642822266, + "completion_length": 1330.1458435058594, + "epoch": 0.22285714285714286, + "grad_norm": 0.2461828738451004, + "kl": 0.008304595947265625, "learning_rate": 7.884636689049422e-07, - "loss": 0.0001, - "reward": 0.0982305034995079, - "reward_std": 0.0711896289139986, - "rewards/cosine_scaled_reward": 0.00020765885710716248, - "rewards/format_reward": 0.5833333358168602, + "loss": 0.0003, + "reward": 0.13772209081798792, + "reward_std": 0.13301934953778982, + "rewards/cosine_scaled_reward": -0.04500130284577608, + "rewards/format_reward": 0.8958333507180214, "step": 195 }, { - "completion_length": 2706.6250610351562, - "epoch": 0.112, - "grad_norm": 0.39363712072372437, - "kl": 0.0028934478759765625, + "completion_length": 2083.5000762939453, + "epoch": 0.224, + "grad_norm": 0.2871836721897125, + "kl": 0.0097198486328125, "learning_rate": 7.857936576865356e-07, - "loss": 0.0001, - "reward": 0.007551372342277318, - "reward_std": 0.08434354141354561, - "rewards/cosine_scaled_reward": -0.20768819749355316, - "rewards/format_reward": 0.4583333358168602, + "loss": 0.0004, + "reward": 0.11253911699168384, + "reward_std": 0.1284322296269238, + "rewards/cosine_scaled_reward": -0.02314686682075262, + "rewards/format_reward": 0.7083333544433117, "step": 196 }, { - "completion_length": 2017.8333587646484, - "epoch": 0.11257142857142857, - "grad_norm": 0.38249877095222473, - "kl": 0.002765655517578125, + "completion_length": 1072.708366394043, + "epoch": 0.22514285714285714, + "grad_norm": 0.28244540095329285, + "kl": 0.006664276123046875, "learning_rate": 7.831121542179086e-07, - "loss": 0.0001, - "reward": 0.11480033956468105, - "reward_std": 0.05338160693645477, - "rewards/cosine_scaled_reward": 0.06876572966575623, - "rewards/format_reward": 0.5416666679084301, + "loss": 0.0003, + "reward": 0.14220814127475023, + "reward_std": 0.1526435911655426, + "rewards/cosine_scaled_reward": -0.0398729182779789, + "rewards/format_reward": 0.9166666679084301, "step": 197 }, { - "completion_length": 3338.1666870117188, - "epoch": 0.11314285714285714, - "grad_norm": 0.22670181095600128, - "kl": 0.0019083023071289062, + "completion_length": 1305.8750305175781, + "epoch": 0.22628571428571428, + "grad_norm": 0.3013690710067749, + "kl": 0.009075164794921875, "learning_rate": 7.804192891917571e-07, - "loss": 0.0001, - "reward": -0.04976028436794877, - "reward_std": 0.12509359791874886, - "rewards/cosine_scaled_reward": -0.2500083018094301, - "rewards/format_reward": 0.2083333358168602, + "loss": 0.0004, + "reward": 0.1608109144726768, + "reward_std": 0.13874499686062336, + "rewards/cosine_scaled_reward": 0.03152369521558285, + "rewards/format_reward": 0.8750000149011612, "step": 198 }, { - "completion_length": 2759.3750610351562, - "epoch": 0.11371428571428571, - "grad_norm": 0.324608713388443, - "kl": 0.00327301025390625, + "completion_length": 1484.0000457763672, + "epoch": 0.22742857142857142, + "grad_norm": 0.21533454954624176, + "kl": 0.0067596435546875, "learning_rate": 7.777151938545235e-07, - "loss": 0.0001, - "reward": 0.05167090427130461, - "reward_std": 0.0683326879516244, - "rewards/cosine_scaled_reward": -0.09669571369886398, - "rewards/format_reward": 0.5, + "loss": 0.0003, + "reward": 0.11915545212104917, + "reward_std": 0.10307862563058734, + "rewards/cosine_scaled_reward": -0.1427280263742432, + "rewards/format_reward": 0.9791666716337204, "step": 199 }, { - "completion_length": 2173.791717529297, - "epoch": 0.11428571428571428, - "grad_norm": 0.2554347515106201, - "kl": 0.002532958984375, + "completion_length": 1291.8750228881836, + "epoch": 0.22857142857142856, + "grad_norm": 0.2203797996044159, + "kl": 0.008068084716796875, "learning_rate": 7.75e-07, - "loss": 0.0001, - "reward": 0.1663350909948349, - "reward_std": 0.15231452882289886, - "rewards/cosine_scaled_reward": 0.17955515533685684, - "rewards/format_reward": 0.6250000037252903, + "loss": 0.0003, + "reward": 0.1579569444875233, + "reward_std": 0.1443558344617486, + "rewards/cosine_scaled_reward": -0.0025847081560641527, + "rewards/format_reward": 0.9375, "step": 200 }, { - "completion_length": 2982.7500610351562, - "epoch": 0.11485714285714285, - "grad_norm": 0.26406627893447876, - "kl": 0.002414703369140625, + "completion_length": 1646.3958892822266, + "epoch": 0.2297142857142857, + "grad_norm": 0.24083541333675385, + "kl": 0.00879669189453125, "learning_rate": 7.72273839962904e-07, - "loss": 0.0001, - "reward": -0.011516915983520448, - "reward_std": 0.10508911218494177, - "rewards/cosine_scaled_reward": -0.1807081662118435, - "rewards/format_reward": 0.291666679084301, + "loss": 0.0004, + "reward": 0.21994818467646837, + "reward_std": 0.11868779285578057, + "rewards/cosine_scaled_reward": 0.23895522952079773, + "rewards/format_reward": 0.791666679084301, "step": 201 }, { - "completion_length": 2030.0833587646484, - "epoch": 0.11542857142857142, - "grad_norm": 0.333197683095932, - "kl": 0.0029327869415283203, + "completion_length": 1287.6041946411133, + "epoch": 0.23085714285714284, + "grad_norm": 0.3120401203632355, + "kl": 0.016357421875, "learning_rate": 7.695368466124296e-07, - "loss": 0.0001, - "reward": 0.16738482657819986, - "reward_std": 0.15324399899691343, - "rewards/cosine_scaled_reward": 0.11996408179402351, - "rewards/format_reward": 0.7500000074505806, + "loss": 0.0007, + "reward": 0.21438546478748322, + "reward_std": 0.06259978096932173, + "rewards/cosine_scaled_reward": 0.1961576696485281, + "rewards/format_reward": 0.875, "step": 202 }, { - "completion_length": 2086.2916717529297, - "epoch": 0.116, - "grad_norm": 0.2623634934425354, - "kl": 0.002384185791015625, + "completion_length": 1295.270896911621, + "epoch": 0.232, + "grad_norm": 0.25308915972709656, + "kl": 0.008411407470703125, "learning_rate": 7.667891533457718e-07, - "loss": 0.0001, - "reward": 0.167269978672266, - "reward_std": 0.14336112327873707, - "rewards/cosine_scaled_reward": 0.11821148172020912, - "rewards/format_reward": 0.75, + "loss": 0.0003, + "reward": 0.16257827286608517, + "reward_std": 0.13092664163559675, + "rewards/cosine_scaled_reward": 0.02888377010822296, + "rewards/format_reward": 0.8958333432674408, "step": 203 }, { - "completion_length": 1473.291732788086, - "epoch": 0.11657142857142858, - "grad_norm": 0.3735639452934265, - "kl": 0.00466156005859375, + "completion_length": 1253.0417022705078, + "epoch": 0.23314285714285715, + "grad_norm": 0.36675336956977844, + "kl": 0.01125335693359375, "learning_rate": 7.640308940816239e-07, - "loss": 0.0002, - "reward": 0.1115006972104311, - "reward_std": 0.1495855338871479, - "rewards/cosine_scaled_reward": -0.11061681807041168, - "rewards/format_reward": 0.8750000149011612, + "loss": 0.0005, + "reward": 0.19385905005037785, + "reward_std": 0.12034193379804492, + "rewards/cosine_scaled_reward": 0.07674635015428066, + "rewards/format_reward": 0.9583333358168602, "step": 204 }, { - "completion_length": 2656.5000610351562, - "epoch": 0.11714285714285715, - "grad_norm": 0.3730281889438629, - "kl": 0.006900787353515625, + "completion_length": 1501.5417251586914, + "epoch": 0.2342857142857143, + "grad_norm": 0.2521401345729828, + "kl": 0.00830841064453125, "learning_rate": 7.612622032536507e-07, "loss": 0.0003, - "reward": -0.014895959524437785, - "reward_std": 0.08633499965071678, - "rewards/cosine_scaled_reward": -0.27377867698669434, - "rewards/format_reward": 0.4583333507180214, + "reward": 0.24430920276790857, + "reward_std": 0.17064828611910343, + "rewards/cosine_scaled_reward": 0.26697138883173466, + "rewards/format_reward": 0.8958333432674408, "step": 205 }, { - "completion_length": 2586.0833740234375, - "epoch": 0.11771428571428572, - "grad_norm": 0.3290543258190155, - "kl": 0.0014281272888183594, + "completion_length": 1941.3958740234375, + "epoch": 0.23542857142857143, + "grad_norm": 0.30331704020500183, + "kl": 0.00870513916015625, "learning_rate": 7.584832158039378e-07, - "loss": 0.0001, - "reward": 0.16872386634349823, - "reward_std": 0.2004525102674961, - "rewards/cosine_scaled_reward": 0.2048325203359127, - "rewards/format_reward": 0.5833333469927311, + "loss": 0.0003, + "reward": 0.045394688844680786, + "reward_std": 0.09498168341815472, + "rewards/cosine_scaled_reward": -0.24192016012966633, + "rewards/format_reward": 0.7500000149011612, "step": 206 }, { - "completion_length": 2900.000030517578, - "epoch": 0.11828571428571429, - "grad_norm": 0.30826064944267273, - "kl": 0.003490447998046875, + "completion_length": 1438.8333587646484, + "epoch": 0.23657142857142857, + "grad_norm": 0.31282100081443787, + "kl": 0.0115203857421875, "learning_rate": 7.556940671764124e-07, - "loss": 0.0001, - "reward": -0.027414992451667786, - "reward_std": 0.08677189517766237, - "rewards/cosine_scaled_reward": -0.22649349225685, - "rewards/format_reward": 0.2916666679084301, + "loss": 0.0005, + "reward": 0.11734095495194197, + "reward_std": 0.1474492819979787, + "rewards/cosine_scaled_reward": -0.08522756304591894, + "rewards/format_reward": 0.8541666939854622, "step": 207 }, { - "completion_length": 2383.0833740234375, - "epoch": 0.11885714285714286, - "grad_norm": 0.2495100349187851, - "kl": 0.003902435302734375, + "completion_length": 1113.7500228881836, + "epoch": 0.2377142857142857, + "grad_norm": 0.2624795734882355, + "kl": 0.008823394775390625, "learning_rate": 7.528948933102438e-07, - "loss": 0.0002, - "reward": 0.07607341930270195, - "reward_std": 0.11710657738149166, - "rewards/cosine_scaled_reward": -0.09222951903939247, - "rewards/format_reward": 0.6250000037252903, + "loss": 0.0004, + "reward": 0.17374667339026928, + "reward_std": 0.10973423393443227, + "rewards/cosine_scaled_reward": 0.023771056905388832, + "rewards/format_reward": 0.9583333432674408, "step": 208 }, { - "completion_length": 2373.1666870117188, - "epoch": 0.11942857142857143, - "grad_norm": 0.2503434717655182, - "kl": 0.00215911865234375, + "completion_length": 1030.7708625793457, + "epoch": 0.23885714285714285, + "grad_norm": 0.32483094930648804, + "kl": 0.00803375244140625, "learning_rate": 7.500858306332172e-07, - "loss": 0.0001, - "reward": 0.01991327479481697, - "reward_std": 0.07142207399010658, - "rewards/cosine_scaled_reward": -0.19286122359335423, - "rewards/format_reward": 0.5, + "loss": 0.0003, + "reward": 0.18928107433021069, + "reward_std": 0.10979128838516772, + "rewards/cosine_scaled_reward": 0.07668027561157942, + "rewards/format_reward": 0.9583333432674408, "step": 209 }, { - "completion_length": 2423.5833587646484, - "epoch": 0.12, - "grad_norm": 0.31848689913749695, - "kl": 0.0035247802734375, + "completion_length": 1972.9167022705078, + "epoch": 0.24, + "grad_norm": 0.17738643288612366, + "kl": 0.008514404296875, "learning_rate": 7.472670160550848e-07, - "loss": 0.0001, - "reward": 0.07869179422414163, - "reward_std": 0.14648755080997944, - "rewards/cosine_scaled_reward": 0.0026129893958568573, - "rewards/format_reward": 0.4583333395421505, + "loss": 0.0003, + "reward": 0.14470667950809002, + "reward_std": 0.13229582412168384, + "rewards/cosine_scaled_reward": 0.037202537059783936, + "rewards/format_reward": 0.7708333395421505, "step": 210 }, { - "completion_length": 3017.0834045410156, - "epoch": 0.12057142857142857, - "grad_norm": 0.20636920630931854, - "kl": 0.0016193389892578125, + "completion_length": 1598.7500228881836, + "epoch": 0.24114285714285713, + "grad_norm": 0.27847760915756226, + "kl": 0.011180877685546875, "learning_rate": 7.444385869608921e-07, - "loss": 0.0001, - "reward": 0.11636412516236305, - "reward_std": 0.12861133087426424, - "rewards/cosine_scaled_reward": 0.1375791933387518, - "rewards/format_reward": 0.4166666679084301, + "loss": 0.0004, + "reward": 0.14864719624165446, + "reward_std": 0.11434817017288879, + "rewards/cosine_scaled_reward": 0.04204285331070423, + "rewards/format_reward": 0.7916666772216558, "step": 211 }, { - "completion_length": 1285.4167175292969, - "epoch": 0.12114285714285715, - "grad_norm": 0.3454136848449707, - "kl": 0.0033330917358398438, + "completion_length": 1177.6875190734863, + "epoch": 0.2422857142857143, + "grad_norm": 0.28754156827926636, + "kl": 0.01055145263671875, "learning_rate": 7.416006812042827e-07, - "loss": 0.0001, - "reward": 0.2367185316979885, - "reward_std": 0.11896193400025368, - "rewards/cosine_scaled_reward": 0.21828349493443966, - "rewards/format_reward": 0.9583333432674408, + "loss": 0.0004, + "reward": 0.17801955621689558, + "reward_std": 0.10441821068525314, + "rewards/cosine_scaled_reward": 0.1050565280020237, + "rewards/format_reward": 0.8333333414047956, "step": 212 }, { - "completion_length": 2373.9584045410156, - "epoch": 0.12171428571428572, - "grad_norm": 0.31905391812324524, - "kl": 0.0040836334228515625, + "completion_length": 1114.354190826416, + "epoch": 0.24342857142857144, + "grad_norm": 0.4188780188560486, + "kl": 0.011600494384765625, "learning_rate": 7.387534371007797e-07, - "loss": 0.0002, - "reward": 0.15462911687791348, - "reward_std": 0.14377483539283276, - "rewards/cosine_scaled_reward": 0.08280523493885994, - "rewards/format_reward": 0.7500000111758709, + "loss": 0.0005, + "reward": 0.17088149162009358, + "reward_std": 0.11700731026940048, + "rewards/cosine_scaled_reward": 0.030954405665397644, + "rewards/format_reward": 0.9375000074505806, "step": 213 }, { - "completion_length": 2538.7084045410156, - "epoch": 0.12228571428571429, - "grad_norm": 0.2996288239955902, - "kl": 0.002288818359375, + "completion_length": 1921.0833587646484, + "epoch": 0.24457142857142858, + "grad_norm": 0.22250115871429443, + "kl": 0.010768890380859375, "learning_rate": 7.358969934210438e-07, - "loss": 0.0001, - "reward": 0.018887239741161466, - "reward_std": 0.09132399410009384, - "rewards/cosine_scaled_reward": -0.2364298179745674, - "rewards/format_reward": 0.5833333432674408, + "loss": 0.0004, + "reward": 0.165664148516953, + "reward_std": 0.15507009578868747, + "rewards/cosine_scaled_reward": 0.08362742932513356, + "rewards/format_reward": 0.791666679084301, "step": 214 }, { - "completion_length": 2491.7916870117188, - "epoch": 0.12285714285714286, - "grad_norm": 0.27040696144104004, - "kl": 0.004131317138671875, + "completion_length": 1409.3333587646484, + "epoch": 0.24571428571428572, + "grad_norm": 0.26721227169036865, + "kl": 0.007991790771484375, "learning_rate": 7.330314893841101e-07, - "loss": 0.0002, - "reward": 0.0644271457567811, - "reward_std": 0.1355307325720787, - "rewards/cosine_scaled_reward": -0.04077669233083725, - "rewards/format_reward": 0.4583333432674408, + "loss": 0.0003, + "reward": 0.11459713708609343, + "reward_std": 0.10593406949192286, + "rewards/cosine_scaled_reward": -0.08912589284591377, + "rewards/format_reward": 0.8333333432674408, "step": 215 }, { - "completion_length": 2357.5833435058594, - "epoch": 0.12342857142857143, - "grad_norm": 0.33591803908348083, - "kl": 0.0028328895568847656, + "completion_length": 1280.6042175292969, + "epoch": 0.24685714285714286, + "grad_norm": 0.31878677010536194, + "kl": 0.010250091552734375, "learning_rate": 7.301570646506027e-07, - "loss": 0.0001, - "reward": 0.04419836588203907, - "reward_std": 0.1420129630714655, - "rewards/cosine_scaled_reward": -0.17904941737651825, - "rewards/format_reward": 0.6250000223517418, + "loss": 0.0004, + "reward": 0.19101236946880817, + "reward_std": 0.11184050468727946, + "rewards/cosine_scaled_reward": 0.12221446633338928, + "rewards/format_reward": 0.8750000111758709, "step": 216 }, { - "completion_length": 3153.2916870117188, - "epoch": 0.124, - "grad_norm": 0.32349398732185364, - "kl": 0.0020656585693359375, + "completion_length": 1485.3541870117188, + "epoch": 0.248, + "grad_norm": 0.24682621657848358, + "kl": 0.00894927978515625, "learning_rate": 7.27273859315928e-07, - "loss": 0.0001, - "reward": -0.0373761048540473, - "reward_std": 0.08199001383036375, - "rewards/cosine_scaled_reward": -0.2557108663022518, - "rewards/format_reward": 0.2916666716337204, + "loss": 0.0004, + "reward": 0.1820166790857911, + "reward_std": 0.16682014428079128, + "rewards/cosine_scaled_reward": 0.11264388589188457, + "rewards/format_reward": 0.8333333395421505, "step": 217 }, { - "completion_length": 2428.6666870117188, - "epoch": 0.12457142857142857, - "grad_norm": 0.2583353817462921, - "kl": 0.003322601318359375, + "completion_length": 1584.9583740234375, + "epoch": 0.24914285714285714, + "grad_norm": 0.28223466873168945, + "kl": 0.010005950927734375, "learning_rate": 7.243820139034464e-07, - "loss": 0.0001, - "reward": 0.12931736442260444, - "reward_std": 0.08473479002714157, - "rewards/cosine_scaled_reward": 0.1125111160799861, - "rewards/format_reward": 0.5416666679084301, + "loss": 0.0004, + "reward": 0.11426169364131056, + "reward_std": 0.1477383803576231, + "rewards/cosine_scaled_reward": -0.10390518826898187, + "rewards/format_reward": 0.8750000149011612, "step": 218 }, { - "completion_length": 2420.416732788086, - "epoch": 0.12514285714285714, - "grad_norm": 0.29344987869262695, - "kl": 0.0013303756713867188, + "completion_length": 1366.0625305175781, + "epoch": 0.2502857142857143, + "grad_norm": 0.3087660074234009, + "kl": 0.0101165771484375, "learning_rate": 7.214816693576234e-07, - "loss": 0.0001, - "reward": 0.0856350027024746, - "reward_std": 0.15545816719532013, - "rewards/cosine_scaled_reward": -0.08107624389231205, - "rewards/format_reward": 0.6666666828095913, + "loss": 0.0004, + "reward": 0.162561041302979, + "reward_std": 0.15994944656267762, + "rewards/cosine_scaled_reward": 0.03411710192449391, + "rewards/format_reward": 0.8750000055879354, "step": 219 }, { - "completion_length": 2632.541748046875, - "epoch": 0.12571428571428572, - "grad_norm": 0.3831244111061096, - "kl": 0.003871917724609375, + "completion_length": 1414.1666717529297, + "epoch": 0.25142857142857145, + "grad_norm": 0.30027467012405396, + "kl": 0.007183074951171875, "learning_rate": 7.185729670371604e-07, - "loss": 0.0002, - "reward": 0.022275029681622982, - "reward_std": 0.1458304082043469, - "rewards/cosine_scaled_reward": -0.14381001330912113, - "rewards/format_reward": 0.4166666679084301, + "loss": 0.0003, + "reward": 0.04776344425044954, + "reward_std": 0.06931147351861, + "rewards/cosine_scaled_reward": -0.2924546115100384, + "rewards/format_reward": 0.8541666772216558, "step": 220 }, { - "completion_length": 2673.125, - "epoch": 0.12628571428571428, - "grad_norm": 0.2231934517621994, - "kl": 0.0038356781005859375, + "completion_length": 1363.2292022705078, + "epoch": 0.25257142857142856, + "grad_norm": 0.32602623105049133, + "kl": 0.00916290283203125, "learning_rate": 7.156560487081051e-07, - "loss": 0.0002, - "reward": 0.05967811681330204, - "reward_std": 0.11407985910773277, - "rewards/cosine_scaled_reward": -0.05345373600721359, - "rewards/format_reward": 0.4583333432674408, + "loss": 0.0004, + "reward": 0.17115566816937644, + "reward_std": 0.10846680961549282, + "rewards/cosine_scaled_reward": 0.0650689210742712, + "rewards/format_reward": 0.8750000149011612, "step": 221 }, { - "completion_length": 2702.0833435058594, - "epoch": 0.12685714285714286, - "grad_norm": 0.33175790309906006, - "kl": 0.003498077392578125, + "completion_length": 1461.8542022705078, + "epoch": 0.2537142857142857, + "grad_norm": 0.2890350818634033, + "kl": 0.009174346923828125, "learning_rate": 7.127310565369415e-07, - "loss": 0.0001, - "reward": 0.05622336361557245, - "reward_std": 0.14574621617794037, - "rewards/cosine_scaled_reward": -0.06559618934988976, - "rewards/format_reward": 0.4583333544433117, + "loss": 0.0004, + "reward": 0.1370222427067347, + "reward_std": 0.13589494908228517, + "rewards/cosine_scaled_reward": -0.022166259586811066, + "rewards/format_reward": 0.8333333395421505, "step": 222 }, { - "completion_length": 3103.75, - "epoch": 0.12742857142857142, - "grad_norm": 0.25623002648353577, - "kl": 0.0033712387084960938, + "completion_length": 1474.7500381469727, + "epoch": 0.25485714285714284, + "grad_norm": 0.22688108682632446, + "kl": 0.008312225341796875, "learning_rate": 7.097981330836616e-07, - "loss": 0.0001, - "reward": 0.08619383163750172, - "reward_std": 0.16002377308905125, - "rewards/cosine_scaled_reward": 0.08019665628671646, - "rewards/format_reward": 0.3333333432674408, + "loss": 0.0003, + "reward": 0.1485723494552076, + "reward_std": 0.10952452756464481, + "rewards/cosine_scaled_reward": 0.029805000871419907, + "rewards/format_reward": 0.8125000074505806, "step": 223 }, { - "completion_length": 2388.3333740234375, - "epoch": 0.128, - "grad_norm": 0.253112256526947, - "kl": 0.0017728805541992188, + "completion_length": 1974.1458892822266, + "epoch": 0.256, + "grad_norm": 0.24594879150390625, + "kl": 0.008819580078125, "learning_rate": 7.068574212948169e-07, - "loss": 0.0001, - "reward": 0.17507698480039835, - "reward_std": 0.1411991035565734, - "rewards/cosine_scaled_reward": 0.12073229486122727, - "rewards/format_reward": 0.7916666716337204, + "loss": 0.0004, + "reward": 0.11885308753699064, + "reward_std": 0.1231303745880723, + "rewards/cosine_scaled_reward": -0.07305323181208223, + "rewards/format_reward": 0.8333333432674408, "step": 224 }, { - "completion_length": 1735.2500915527344, - "epoch": 0.12857142857142856, - "grad_norm": 0.3771184980869293, - "kl": 0.008134841918945312, + "completion_length": 1713.7500305175781, + "epoch": 0.2571428571428571, + "grad_norm": 0.2284773290157318, + "kl": 0.014232635498046875, "learning_rate": 7.039090644965509e-07, - "loss": 0.0003, - "reward": 0.16182245966047049, - "reward_std": 0.17920539155602455, - "rewards/cosine_scaled_reward": 0.09676531393779442, - "rewards/format_reward": 0.7500000074505806, + "loss": 0.0006, + "reward": 0.10374407912604511, + "reward_std": 0.14682644978165627, + "rewards/cosine_scaled_reward": -0.10095677326899022, + "rewards/format_reward": 0.8125000111758709, "step": 225 }, { - "completion_length": 2294.6250610351562, - "epoch": 0.12914285714285714, - "grad_norm": 0.3015359938144684, - "kl": 0.003520965576171875, + "completion_length": 1503.7708740234375, + "epoch": 0.2582857142857143, + "grad_norm": 0.23106862604618073, + "kl": 0.00830841064453125, "learning_rate": 7.009532063876148e-07, - "loss": 0.0001, - "reward": 0.057341309264302254, - "reward_std": 0.09985095728188753, - "rewards/cosine_scaled_reward": -0.16445063799619675, - "rewards/format_reward": 0.6666666716337204, + "loss": 0.0003, + "reward": 0.18740362441167235, + "reward_std": 0.08802895061671734, + "rewards/cosine_scaled_reward": 0.10540169104933739, + "rewards/format_reward": 0.8958333395421505, "step": 226 }, { - "completion_length": 1856.8333587646484, - "epoch": 0.12971428571428573, - "grad_norm": 0.31405168771743774, - "kl": 0.003047943115234375, + "completion_length": 1244.1875534057617, + "epoch": 0.25942857142857145, + "grad_norm": 0.2707884907722473, + "kl": 0.013683319091796875, "learning_rate": 6.979899910323624e-07, - "loss": 0.0001, - "reward": 0.017376150004565716, - "reward_std": 0.11509128450416028, - "rewards/cosine_scaled_reward": -0.32319026812911034, - "rewards/format_reward": 0.7500000149011612, + "loss": 0.0005, + "reward": 0.15233041066676378, + "reward_std": 0.11700486252084374, + "rewards/cosine_scaled_reward": -0.04158254712820053, + "rewards/format_reward": 0.9583333432674408, "step": 227 }, { - "completion_length": 2065.541748046875, - "epoch": 0.13028571428571428, - "grad_norm": 0.3149060308933258, - "kl": 0.004154205322265625, + "completion_length": 1151.6666870117188, + "epoch": 0.26057142857142856, + "grad_norm": 0.31404733657836914, + "kl": 0.00904083251953125, "learning_rate": 6.950195628537299e-07, - "loss": 0.0002, - "reward": 0.12278923206031322, - "reward_std": 0.1152509143576026, - "rewards/cosine_scaled_reward": -0.012125710025429726, - "rewards/format_reward": 0.7500000111758709, + "loss": 0.0004, + "reward": 0.1969418814405799, + "reward_std": 0.11795947467908263, + "rewards/cosine_scaled_reward": 0.13468213769374415, + "rewards/format_reward": 0.8958333395421505, "step": 228 }, { - "completion_length": 2920.75, - "epoch": 0.13085714285714287, - "grad_norm": 0.20424042642116547, - "kl": 0.0026836395263671875, + "completion_length": 1607.1875610351562, + "epoch": 0.26171428571428573, + "grad_norm": 0.3030518889427185, + "kl": 0.011165618896484375, "learning_rate": 6.920420666261961e-07, - "loss": 0.0001, - "reward": 0.009747498668730259, - "reward_std": 0.10922604519873857, - "rewards/cosine_scaled_reward": -0.15981731563806534, - "rewards/format_reward": 0.3750000149011612, + "loss": 0.0004, + "reward": 0.11387707642279565, + "reward_std": 0.0794507262762636, + "rewards/cosine_scaled_reward": -0.06163225881755352, + "rewards/format_reward": 0.7916666828095913, "step": 229 }, { - "completion_length": 2789.4583587646484, - "epoch": 0.13142857142857142, - "grad_norm": 0.36339035630226135, - "kl": 0.00818634033203125, + "completion_length": 1865.7292022705078, + "epoch": 0.26285714285714284, + "grad_norm": 0.26728665828704834, + "kl": 0.009771347045898438, "learning_rate": 6.890576474687263e-07, - "loss": 0.0003, - "reward": 0.0005879290401935577, - "reward_std": 0.1172481756657362, - "rewards/cosine_scaled_reward": -0.16576159978285432, - "rewards/format_reward": 0.3333333358168602, + "loss": 0.0004, + "reward": 0.06328508502338082, + "reward_std": 0.11280694883316755, + "rewards/cosine_scaled_reward": -0.1895492672920227, + "rewards/format_reward": 0.7500000074505806, "step": 230 }, { - "completion_length": 2930.5000610351562, - "epoch": 0.132, - "grad_norm": 0.304671972990036, - "kl": 0.0041217803955078125, + "completion_length": 1559.5625610351562, + "epoch": 0.264, + "grad_norm": 0.25874829292297363, + "kl": 0.009349822998046875, "learning_rate": 6.860664508377001e-07, - "loss": 0.0002, - "reward": 0.05537473565345863, - "reward_std": 0.14710533432662487, - "rewards/cosine_scaled_reward": -0.006951111798116472, - "rewards/format_reward": 0.3333333358168602, + "loss": 0.0004, + "reward": 0.22241820394992828, + "reward_std": 0.11675288947299123, + "rewards/cosine_scaled_reward": 0.19621967896819115, + "rewards/format_reward": 0.9166666716337204, "step": 231 }, { - "completion_length": 3523.25, - "epoch": 0.13257142857142856, - "grad_norm": 0.22350047528743744, - "kl": 0.00376129150390625, - "learning_rate": 6.83068622519821e-07, - "loss": 0.0002, - "reward": -0.055352203315123916, - "reward_std": 0.059430775698274374, - "rewards/cosine_scaled_reward": -0.20401961356401443, - "rewards/format_reward": 0.0833333358168602, + "completion_length": 1832.2291870117188, + "epoch": 0.2651428571428571, + "grad_norm": 0.2535684406757355, + "kl": 0.011371612548828125, + "learning_rate": 6.83068622519821e-07, + "loss": 0.0005, + "reward": 0.04174727539066225, + "reward_std": 0.08498770324513316, + "rewards/cosine_scaled_reward": -0.2521397266536951, + "rewards/format_reward": 0.7500000111758709, "step": 232 }, { - "completion_length": 2124.0833740234375, - "epoch": 0.13314285714285715, - "grad_norm": 0.24072830379009247, - "kl": 0.004871368408203125, + "completion_length": 1139.8958587646484, + "epoch": 0.2662857142857143, + "grad_norm": 0.24799145758152008, + "kl": 0.00756072998046875, "learning_rate": 6.800643086250121e-07, - "loss": 0.0002, - "reward": 0.02883557928726077, - "reward_std": 0.11566138081252575, - "rewards/cosine_scaled_reward": -0.227960211224854, - "rewards/format_reward": 0.6250000149011612, + "loss": 0.0003, + "reward": 0.10831367457285523, + "reward_std": 0.10008358396589756, + "rewards/cosine_scaled_reward": -0.16862575709819794, + "rewards/format_reward": 0.9583333432674408, "step": 233 }, { - "completion_length": 3160.25, - "epoch": 0.1337142857142857, - "grad_norm": 0.2963768541812897, - "kl": 0.0071258544921875, + "completion_length": 1544.5000305175781, + "epoch": 0.2674285714285714, + "grad_norm": 0.2703922986984253, + "kl": 0.0122222900390625, "learning_rate": 6.770536555792944e-07, - "loss": 0.0003, - "reward": -0.031203433871269226, - "reward_std": 0.1089600445702672, - "rewards/cosine_scaled_reward": -0.1961350440979004, - "rewards/format_reward": 0.2083333358168602, + "loss": 0.0005, + "reward": 0.13072135020047426, + "reward_std": 0.12703408766537905, + "rewards/cosine_scaled_reward": -0.010831212624907494, + "rewards/format_reward": 0.791666679084301, "step": 234 }, { - "completion_length": 2500.2083740234375, - "epoch": 0.13428571428571429, - "grad_norm": 0.2251562625169754, - "kl": 0.00217437744140625, + "completion_length": 1198.8125267028809, + "epoch": 0.26857142857142857, + "grad_norm": 0.33897149562835693, + "kl": 0.01023101806640625, "learning_rate": 6.740368101176495e-07, - "loss": 0.0001, - "reward": 0.14199365861713886, - "reward_std": 0.13348014745861292, - "rewards/cosine_scaled_reward": 0.10780146927572787, - "rewards/format_reward": 0.6250000149011612, + "loss": 0.0004, + "reward": 0.22266705462243408, + "reward_std": 0.13369846408022568, + "rewards/cosine_scaled_reward": 0.17654232122004032, + "rewards/format_reward": 0.9375000074505806, "step": 235 }, { - "completion_length": 2707.0833740234375, - "epoch": 0.13485714285714287, - "grad_norm": 0.28649571537971497, - "kl": 0.0032176971435546875, + "completion_length": 1820.2500457763672, + "epoch": 0.26971428571428574, + "grad_norm": 0.22755910456180573, + "kl": 0.008855819702148438, "learning_rate": 6.710139192768694e-07, - "loss": 0.0001, - "reward": 0.07454727217555046, - "reward_std": 0.13660876639187336, - "rewards/cosine_scaled_reward": -0.008956663310527802, - "rewards/format_reward": 0.4583333432674408, + "loss": 0.0004, + "reward": 0.10096835857257247, + "reward_std": 0.12764764530584216, + "rewards/cosine_scaled_reward": -0.10865432699210942, + "rewards/format_reward": 0.8125000055879354, "step": 236 }, { - "completion_length": 1691.8333587646484, - "epoch": 0.13542857142857143, - "grad_norm": 0.39492595195770264, - "kl": 0.0026493072509765625, + "completion_length": 1567.0208587646484, + "epoch": 0.27085714285714285, + "grad_norm": 0.23869018256664276, + "kl": 0.010875701904296875, "learning_rate": 6.679851303883891e-07, - "loss": 0.0001, - "reward": 0.11160031147301197, - "reward_std": 0.11124805174767971, - "rewards/cosine_scaled_reward": -0.026990113779902458, - "rewards/format_reward": 0.7083333432674408, + "loss": 0.0004, + "reward": 0.1583275799639523, + "reward_std": 0.07935558445751667, + "rewards/cosine_scaled_reward": 0.06267662812024355, + "rewards/format_reward": 0.8125000074505806, "step": 237 }, { - "completion_length": 2021.6666717529297, - "epoch": 0.136, - "grad_norm": 0.356982946395874, - "kl": 0.006900787353515625, + "completion_length": 1344.5625305175781, + "epoch": 0.272, + "grad_norm": 0.23950256407260895, + "kl": 0.01043701171875, "learning_rate": 6.649505910711058e-07, - "loss": 0.0003, - "reward": 0.12163960468024015, - "reward_std": 0.15431670285761356, - "rewards/cosine_scaled_reward": 0.025143351405858994, - "rewards/format_reward": 0.666666679084301, + "loss": 0.0004, + "reward": 0.2100483477115631, + "reward_std": 0.10991842532530427, + "rewards/cosine_scaled_reward": 0.15677512856200337, + "rewards/format_reward": 0.9166666716337204, "step": 238 }, { - "completion_length": 1721.4167098999023, - "epoch": 0.13657142857142857, - "grad_norm": 0.4040653705596924, - "kl": 0.006313323974609375, + "completion_length": 1426.041690826416, + "epoch": 0.27314285714285713, + "grad_norm": 0.2463754415512085, + "kl": 0.0074310302734375, "learning_rate": 6.619104492241847e-07, "loss": 0.0003, - "reward": 0.13764589373022318, - "reward_std": 0.15449939854443073, - "rewards/cosine_scaled_reward": 0.049103467259556055, - "rewards/format_reward": 0.7083333358168602, + "reward": 0.21384657500311732, + "reward_std": 0.08286083268467337, + "rewards/cosine_scaled_reward": 0.23679617792367935, + "rewards/format_reward": 0.770833333954215, "step": 239 }, { - "completion_length": 1866.1250457763672, - "epoch": 0.13714285714285715, - "grad_norm": 0.3743992745876312, - "kl": 0.004802703857421875, + "completion_length": 1488.8958587646484, + "epoch": 0.2742857142857143, + "grad_norm": 0.4307219088077545, + "kl": 0.0166473388671875, "learning_rate": 6.588648530198504e-07, - "loss": 0.0002, - "reward": 0.09157293569296598, - "reward_std": 0.1369806006550789, - "rewards/cosine_scaled_reward": -0.12631518207490444, - "rewards/format_reward": 0.7916666865348816, + "loss": 0.0007, + "reward": 0.043208114642766304, + "reward_std": 0.10311023378744721, + "rewards/cosine_scaled_reward": -0.24894601851701736, + "rewards/format_reward": 0.7500000149011612, "step": 240 }, { - "completion_length": 1125.2917175292969, - "epoch": 0.1377142857142857, - "grad_norm": 0.3624768555164337, - "kl": 0.004398345947265625, + "completion_length": 1405.2916870117188, + "epoch": 0.2754285714285714, + "grad_norm": 0.42230215668678284, + "kl": 0.013263702392578125, "learning_rate": 6.558139508961654e-07, - "loss": 0.0002, - "reward": 0.15821411460638046, - "reward_std": 0.14234024193137884, - "rewards/cosine_scaled_reward": -0.01562540978193283, - "rewards/format_reward": 0.9583333432674408, + "loss": 0.0005, + "reward": 0.05663964038831182, + "reward_std": 0.10991012584418058, + "rewards/cosine_scaled_reward": -0.2536990698426962, + "rewards/format_reward": 0.8333333432674408, "step": 241 }, { - "completion_length": 1014.5416870117188, - "epoch": 0.1382857142857143, - "grad_norm": 0.3342161476612091, - "kl": 0.0055389404296875, + "completion_length": 1111.4375457763672, + "epoch": 0.2765714285714286, + "grad_norm": 0.365119606256485, + "kl": 0.018003463745117188, "learning_rate": 6.527578915497951e-07, - "loss": 0.0002, - "reward": 0.21047868020832539, - "reward_std": 0.09484192356467247, - "rewards/cosine_scaled_reward": 0.14085940271615982, - "rewards/format_reward": 0.9583333432674408, + "loss": 0.0007, + "reward": 0.12744820569059812, + "reward_std": 0.1108874985948205, + "rewards/cosine_scaled_reward": -0.08098506298847497, + "rewards/format_reward": 0.8958333432674408, "step": 242 }, { - "completion_length": 2909.2916870117188, - "epoch": 0.13885714285714285, - "grad_norm": 0.2539648711681366, - "kl": 0.004573822021484375, + "completion_length": 1557.8541870117188, + "epoch": 0.2777142857142857, + "grad_norm": 0.22312189638614655, + "kl": 0.0106353759765625, "learning_rate": 6.496968239287603e-07, - "loss": 0.0002, - "reward": 0.0831678556278348, - "reward_std": 0.18316610343754292, - "rewards/cosine_scaled_reward": 0.015971510350937024, - "rewards/format_reward": 0.4583333395421505, + "loss": 0.0004, + "reward": 0.1432976769283414, + "reward_std": 0.11755505437031388, + "rewards/cosine_scaled_reward": 0.002848621690645814, + "rewards/format_reward": 0.8333333432674408, "step": 243 }, { - "completion_length": 2062.5834045410156, - "epoch": 0.13942857142857143, - "grad_norm": 0.31980466842651367, - "kl": 0.00295257568359375, + "completion_length": 1817.9167022705078, + "epoch": 0.27885714285714286, + "grad_norm": 0.284006804227829, + "kl": 0.01305389404296875, "learning_rate": 6.466308972251785e-07, - "loss": 0.0001, - "reward": 0.12564383377321064, - "reward_std": 0.07869264017790556, - "rewards/cosine_scaled_reward": -0.00325128436088562, - "rewards/format_reward": 0.7500000074505806, + "loss": 0.0005, + "reward": 0.16071391198784113, + "reward_std": 0.11166157713159919, + "rewards/cosine_scaled_reward": 0.0663528572767973, + "rewards/format_reward": 0.8125000074505806, "step": 244 }, { - "completion_length": 2921.6666870117188, - "epoch": 0.14, - "grad_norm": 0.2742232382297516, - "kl": 0.0054779052734375, + "completion_length": 1842.479232788086, + "epoch": 0.28, + "grad_norm": 0.26421064138412476, + "kl": 0.01108551025390625, "learning_rate": 6.435602608679916e-07, - "loss": 0.0002, - "reward": -0.03441611060407013, - "reward_std": 0.09494781494140625, - "rewards/cosine_scaled_reward": -0.2879330441355705, - "rewards/format_reward": 0.3750000111758709, + "loss": 0.0004, + "reward": 0.15495485439896584, + "reward_std": 0.14859340619295835, + "rewards/cosine_scaled_reward": 0.060377851128578186, + "rewards/format_reward": 0.7916666772216558, "step": 245 }, { - "completion_length": 1766.2083892822266, - "epoch": 0.14057142857142857, - "grad_norm": 0.22949433326721191, - "kl": 0.0014743804931640625, + "completion_length": 1367.7292098999023, + "epoch": 0.28114285714285714, + "grad_norm": 0.257500022649765, + "kl": 0.01171112060546875, "learning_rate": 6.404850645156841e-07, - "loss": 0.0001, - "reward": 0.19001246895641088, - "reward_std": 0.0961861927062273, - "rewards/cosine_scaled_reward": 0.09637190401554108, - "rewards/format_reward": 0.9166666716337204, + "loss": 0.0005, + "reward": 0.1178839597851038, + "reward_std": 0.11170466430485249, + "rewards/cosine_scaled_reward": -0.12124981544911861, + "rewards/format_reward": 0.9375, "step": 246 }, { - "completion_length": 1791.0416870117188, - "epoch": 0.14114285714285715, - "grad_norm": 0.31283846497535706, - "kl": 0.0024776458740234375, + "completion_length": 2025.4167175292969, + "epoch": 0.2822857142857143, + "grad_norm": 0.2877947986125946, + "kl": 0.014371871948242188, "learning_rate": 6.374054580489873e-07, - "loss": 0.0001, - "reward": 0.08852069079875946, - "reward_std": 0.08228143397718668, - "rewards/cosine_scaled_reward": -0.095562394708395, - "rewards/format_reward": 0.7083333432674408, + "loss": 0.0006, + "reward": 0.041426120849791914, + "reward_std": 0.10642113536596298, + "rewards/cosine_scaled_reward": -0.20261837355792522, + "rewards/format_reward": 0.6458333432674408, "step": 247 }, { - "completion_length": 2097.7083892822266, - "epoch": 0.1417142857142857, - "grad_norm": 0.33567875623703003, - "kl": 0.015350341796875, + "completion_length": 1317.4167022705078, + "epoch": 0.2834285714285714, + "grad_norm": 0.3394499123096466, + "kl": 0.012571334838867188, "learning_rate": 6.343215915635761e-07, - "loss": 0.0006, - "reward": 0.08294937666505575, - "reward_std": 0.14238599315285683, - "rewards/cosine_scaled_reward": -0.04499213583767414, - "rewards/format_reward": 0.5833333469927311, + "loss": 0.0005, + "reward": 0.16258022841066122, + "reward_std": 0.09610571060329676, + "rewards/cosine_scaled_reward": 0.07010693056508899, + "rewards/format_reward": 0.8125000074505806, "step": 248 }, { - "completion_length": 2499.416717529297, - "epoch": 0.1422857142857143, - "grad_norm": 0.27392181754112244, - "kl": 0.00414276123046875, + "completion_length": 1280.3125305175781, + "epoch": 0.2845714285714286, + "grad_norm": 0.27582064270973206, + "kl": 0.011753082275390625, "learning_rate": 6.31233615362752e-07, - "loss": 0.0002, - "reward": 0.07490747206611559, - "reward_std": 0.1725021181628108, - "rewards/cosine_scaled_reward": -0.010441349819302559, - "rewards/format_reward": 0.4583333358168602, + "loss": 0.0005, + "reward": 0.21597624802961946, + "reward_std": 0.11335421586409211, + "rewards/cosine_scaled_reward": 0.18638860061764717, + "rewards/format_reward": 0.8958333358168602, "step": 249 }, { - "completion_length": 2913.3333435058594, - "epoch": 0.14285714285714285, - "grad_norm": 0.23789697885513306, - "kl": 0.0028171539306640625, + "completion_length": 1112.1875381469727, + "epoch": 0.2857142857142857, + "grad_norm": 0.3718562424182892, + "kl": 0.01335906982421875, "learning_rate": 6.281416799501187e-07, - "loss": 0.0001, - "reward": 0.051127828657627106, - "reward_std": 0.06356200855225325, - "rewards/cosine_scaled_reward": 0.02932456135749817, - "rewards/format_reward": 0.25, + "loss": 0.0005, + "reward": 0.1376856635324657, + "reward_std": 0.06630115583539009, + "rewards/cosine_scaled_reward": -0.09413989027962089, + "rewards/format_reward": 0.9791666716337204, "step": 250 }, { - "completion_length": 1972.5000915527344, - "epoch": 0.14342857142857143, - "grad_norm": 0.2969943583011627, - "kl": 0.0026111602783203125, + "completion_length": 969.0625152587891, + "epoch": 0.28685714285714287, + "grad_norm": 0.4498876631259918, + "kl": 0.01354217529296875, "learning_rate": 6.25045936022246e-07, - "loss": 0.0001, - "reward": 0.1337272571399808, - "reward_std": 0.08672486431896687, - "rewards/cosine_scaled_reward": 0.040651775896549225, - "rewards/format_reward": 0.7083333395421505, + "loss": 0.0005, + "reward": 0.13784294662764296, + "reward_std": 0.11408048821613193, + "rewards/cosine_scaled_reward": -0.029557042755186558, + "rewards/format_reward": 0.8541666828095913, "step": 251 }, { - "completion_length": 2679.166748046875, - "epoch": 0.144, - "grad_norm": 0.20708324015140533, - "kl": 0.002941131591796875, + "completion_length": 1556.2500534057617, + "epoch": 0.288, + "grad_norm": 0.35213375091552734, + "kl": 0.01563262939453125, "learning_rate": 6.219465344613258e-07, - "loss": 0.0001, - "reward": 0.11838137917220592, - "reward_std": 0.13586335629224777, - "rewards/cosine_scaled_reward": 0.059326328337192535, - "rewards/format_reward": 0.5833333432674408, + "loss": 0.0006, + "reward": 0.10068062460049987, + "reward_std": 0.08584798872470856, + "rewards/cosine_scaled_reward": -0.10943282302469015, + "rewards/format_reward": 0.812500013038516, "step": 252 }, { - "completion_length": 2705.2916870117188, - "epoch": 0.14457142857142857, - "grad_norm": 0.24477000534534454, - "kl": 0.00506591796875, + "completion_length": 1594.7292175292969, + "epoch": 0.28914285714285715, + "grad_norm": 0.36279869079589844, + "kl": 0.019073486328125, "learning_rate": 6.188436263278172e-07, - "loss": 0.0002, - "reward": -0.005199562467169017, - "reward_std": 0.0900608729571104, - "rewards/cosine_scaled_reward": -0.3072587475180626, - "rewards/format_reward": 0.583333358168602, + "loss": 0.0008, + "reward": 0.10339808277785778, + "reward_std": 0.14174744859337807, + "rewards/cosine_scaled_reward": -0.09498709812760353, + "rewards/format_reward": 0.7916666734963655, "step": 253 }, { - "completion_length": 2214.3333435058594, - "epoch": 0.14514285714285713, - "grad_norm": 0.2783680260181427, - "kl": 0.004276275634765625, + "completion_length": 1617.0417022705078, + "epoch": 0.29028571428571426, + "grad_norm": 0.37625786662101746, + "kl": 0.016139984130859375, "learning_rate": 6.157373628530852e-07, - "loss": 0.0002, - "reward": -0.004566615447402, - "reward_std": 0.07381200324743986, - "rewards/cosine_scaled_reward": -0.2845504656434059, - "rewards/format_reward": 0.541666679084301, + "loss": 0.0006, + "reward": 0.06562280771322548, + "reward_std": 0.09647063678130507, + "rewards/cosine_scaled_reward": -0.20810320507735014, + "rewards/format_reward": 0.791666679084301, "step": 254 }, { - "completion_length": 1775.166732788086, - "epoch": 0.1457142857142857, - "grad_norm": 0.2997996509075165, - "kl": 0.0033359527587890625, + "completion_length": 1775.500015258789, + "epoch": 0.2914285714285714, + "grad_norm": 0.30141302943229675, + "kl": 0.012451171875, "learning_rate": 6.126278954320294e-07, - "loss": 0.0001, - "reward": 0.2675487082451582, - "reward_std": 0.13101814314723015, - "rewards/cosine_scaled_reward": 0.34729035571217537, - "rewards/format_reward": 0.8750000149011612, + "loss": 0.0005, + "reward": 0.03216266352683306, + "reward_std": 0.09485536953434348, + "rewards/cosine_scaled_reward": -0.2940612696111202, + "rewards/format_reward": 0.7708333432674408, "step": 255 }, { - "completion_length": 3150.4583740234375, - "epoch": 0.1462857142857143, - "grad_norm": 0.3275359272956848, - "kl": 0.007221221923828125, + "completion_length": 1419.5833740234375, + "epoch": 0.2925714285714286, + "grad_norm": 0.3471844494342804, + "kl": 0.0146484375, "learning_rate": 6.095153756157051e-07, - "loss": 0.0003, - "reward": 0.024978789500892162, - "reward_std": 0.09368282463401556, - "rewards/cosine_scaled_reward": -0.07320776581764221, - "rewards/format_reward": 0.291666679084301, + "loss": 0.0006, + "reward": 0.15079816803336143, + "reward_std": 0.10951651586219668, + "rewards/cosine_scaled_reward": -0.006679622456431389, + "rewards/format_reward": 0.8958333432674408, "step": 256 }, { - "completion_length": 2939.9166717529297, - "epoch": 0.14685714285714285, - "grad_norm": 0.27961456775665283, - "kl": 0.0040187835693359375, + "completion_length": 1932.9583892822266, + "epoch": 0.2937142857142857, + "grad_norm": 0.22311758995056152, + "kl": 0.01297760009765625, "learning_rate": 6.06399955103937e-07, - "loss": 0.0002, - "reward": 0.011663436889648438, - "reward_std": 0.11398309096693993, - "rewards/cosine_scaled_reward": -0.11544614285230637, - "rewards/format_reward": 0.2916666679084301, + "loss": 0.0005, + "reward": 0.20792145188897848, + "reward_std": 0.12192836869508028, + "rewards/cosine_scaled_reward": 0.19739503040909767, + "rewards/format_reward": 0.8333333395421505, "step": 257 }, { - "completion_length": 3163.291748046875, - "epoch": 0.14742857142857144, - "grad_norm": 0.25813010334968567, - "kl": 0.005168914794921875, + "completion_length": 1696.458366394043, + "epoch": 0.2948571428571429, + "grad_norm": 0.3503570556640625, + "kl": 0.012058258056640625, "learning_rate": 6.032817857379256e-07, - "loss": 0.0002, - "reward": 0.05509801208972931, - "reward_std": 0.18171382322907448, - "rewards/cosine_scaled_reward": -0.005882609635591507, - "rewards/format_reward": 0.3333333469927311, + "loss": 0.0005, + "reward": 0.111656179651618, + "reward_std": 0.14000094681978226, + "rewards/cosine_scaled_reward": -0.06675757747143507, + "rewards/format_reward": 0.7916666772216558, "step": 258 }, { - "completion_length": 2398.2916717529297, - "epoch": 0.148, - "grad_norm": 0.21977297961711884, - "kl": 0.0030517578125, + "completion_length": 1482.2292022705078, + "epoch": 0.296, + "grad_norm": 0.38044440746307373, + "kl": 0.016082763671875, "learning_rate": 6.001610194928464e-07, - "loss": 0.0001, - "reward": 0.10405881115002558, - "reward_std": 0.08769672736525536, - "rewards/cosine_scaled_reward": 0.05970558896660805, - "rewards/format_reward": 0.5000000111758709, + "loss": 0.0006, + "reward": 0.14132701233029366, + "reward_std": 0.09021098469384015, + "rewards/cosine_scaled_reward": 0.016738089732825756, + "rewards/format_reward": 0.791666679084301, "step": 259 }, { - "completion_length": 2677.8750610351562, - "epoch": 0.14857142857142858, - "grad_norm": 0.26745128631591797, - "kl": 0.004352569580078125, + "completion_length": 1308.8958930969238, + "epoch": 0.29714285714285715, + "grad_norm": 0.3778381943702698, + "kl": 0.0111083984375, "learning_rate": 5.97037808470444e-07, - "loss": 0.0002, - "reward": -0.005887473002076149, - "reward_std": 0.09334081970155239, - "rewards/cosine_scaled_reward": -0.2667917311191559, - "rewards/format_reward": 0.5000000111758709, + "loss": 0.0004, + "reward": 0.20748187974095345, + "reward_std": 0.1426714597037062, + "rewards/cosine_scaled_reward": 0.14869027212262154, + "rewards/format_reward": 0.916666679084301, "step": 260 }, { - "completion_length": 2825.041748046875, - "epoch": 0.14914285714285713, - "grad_norm": 0.39294514060020447, - "kl": 0.00946807861328125, + "completion_length": 2185.395866394043, + "epoch": 0.29828571428571427, + "grad_norm": 0.29314514994621277, + "kl": 0.019428253173828125, "learning_rate": 5.939123048916173e-07, - "loss": 0.0004, - "reward": 0.050011758925393224, - "reward_std": 0.1369151584804058, - "rewards/cosine_scaled_reward": -0.08272550255060196, - "rewards/format_reward": 0.4583333469927311, + "loss": 0.0008, + "reward": 0.05902714841067791, + "reward_std": 0.09425672655925155, + "rewards/cosine_scaled_reward": -0.1403571031987667, + "rewards/format_reward": 0.6250000111758709, "step": 261 }, { - "completion_length": 2165.3750610351562, - "epoch": 0.14971428571428572, - "grad_norm": 0.3451504111289978, - "kl": 0.003437042236328125, + "completion_length": 1879.7917175292969, + "epoch": 0.29942857142857143, + "grad_norm": 0.46697157621383667, + "kl": 0.020751953125, "learning_rate": 5.907846610890011e-07, - "loss": 0.0001, - "reward": 0.19721811264753342, - "reward_std": 0.12206157064065337, - "rewards/cosine_scaled_reward": 0.2494414784014225, + "loss": 0.0008, + "reward": 0.04076826642267406, + "reward_std": 0.10802473686635494, + "rewards/cosine_scaled_reward": -0.2183469645678997, "rewards/format_reward": 0.6666666828095913, "step": 262 }, { - "completion_length": 2892.625, - "epoch": 0.15028571428571427, - "grad_norm": 0.3263697028160095, - "kl": 0.00276947021484375, + "completion_length": 1427.145881652832, + "epoch": 0.30057142857142854, + "grad_norm": 0.2928243577480316, + "kl": 0.0118255615234375, "learning_rate": 5.87655029499542e-07, - "loss": 0.0001, - "reward": -0.0039011603221297264, - "reward_std": 0.1250161398202181, - "rewards/cosine_scaled_reward": -0.1561775620211847, - "rewards/format_reward": 0.2916666679084301, + "loss": 0.0005, + "reward": 0.05615242966450751, + "reward_std": 0.06438650703057647, + "rewards/cosine_scaled_reward": -0.2520607812330127, + "rewards/format_reward": 0.8333333488553762, "step": 263 }, { - "completion_length": 1756.0833740234375, - "epoch": 0.15085714285714286, - "grad_norm": 0.29304105043411255, - "kl": 0.00423431396484375, + "completion_length": 1357.7292251586914, + "epoch": 0.3017142857142857, + "grad_norm": 0.3987843096256256, + "kl": 0.012592315673828125, "learning_rate": 5.845235626570683e-07, - "loss": 0.0002, - "reward": 0.06624670885503292, - "reward_std": 0.11948579177260399, - "rewards/cosine_scaled_reward": -0.15875966474413872, - "rewards/format_reward": 0.7083333358168602, + "loss": 0.0005, + "reward": 0.11286137904971838, + "reward_std": 0.13379723951220512, + "rewards/cosine_scaled_reward": -0.10757828690111637, + "rewards/format_reward": 0.8750000223517418, "step": 264 }, { - "completion_length": 2940.8333740234375, - "epoch": 0.15142857142857144, - "grad_norm": 0.29393821954727173, - "kl": 0.0046672821044921875, + "completion_length": 1381.5000381469727, + "epoch": 0.3028571428571429, + "grad_norm": 0.3415836989879608, + "kl": 0.0178070068359375, "learning_rate": 5.813904131848564e-07, - "loss": 0.0002, - "reward": -0.0046788230538368225, - "reward_std": 0.13691922556608915, - "rewards/cosine_scaled_reward": -0.18014120450243354, - "rewards/format_reward": 0.3333333358168602, + "loss": 0.0007, + "reward": 0.1456056940369308, + "reward_std": 0.14075131434947252, + "rewards/cosine_scaled_reward": -0.009397267829626799, + "rewards/format_reward": 0.8541666716337204, "step": 265 }, { - "completion_length": 2493.6250610351562, - "epoch": 0.152, - "grad_norm": 0.3958794176578522, - "kl": 0.006023406982421875, + "completion_length": 1806.1250610351562, + "epoch": 0.304, + "grad_norm": 0.38759687542915344, + "kl": 0.01422119140625, "learning_rate": 5.78255733788191e-07, - "loss": 0.0002, - "reward": 0.024231276474893093, - "reward_std": 0.07274394854903221, - "rewards/cosine_scaled_reward": -0.22116203233599663, - "rewards/format_reward": 0.5833333432674408, + "loss": 0.0006, + "reward": 0.1108898997772485, + "reward_std": 0.11440710537135601, + "rewards/cosine_scaled_reward": -0.08643979020416737, + "rewards/format_reward": 0.8125000074505806, "step": 266 }, { - "completion_length": 2224.625, - "epoch": 0.15257142857142858, - "grad_norm": 0.2776709198951721, - "kl": 0.006488800048828125, + "completion_length": 2056.729202270508, + "epoch": 0.30514285714285716, + "grad_norm": 0.42212945222854614, + "kl": 0.026676177978515625, "learning_rate": 5.751196772469237e-07, - "loss": 0.0003, - "reward": 0.16999819688498974, - "reward_std": 0.16079052351415157, - "rewards/cosine_scaled_reward": 0.1879310579970479, - "rewards/format_reward": 0.6250000149011612, + "loss": 0.0011, + "reward": 0.022361958224792033, + "reward_std": 0.08537128940224648, + "rewards/cosine_scaled_reward": -0.24759646970778704, + "rewards/format_reward": 0.6250000167638063, "step": 267 }, { - "completion_length": 2274.0834045410156, - "epoch": 0.15314285714285714, - "grad_norm": 0.4002082347869873, - "kl": 0.00811767578125, + "completion_length": 1231.9166870117188, + "epoch": 0.3062857142857143, + "grad_norm": 0.5157231092453003, + "kl": 0.02033233642578125, "learning_rate": 5.71982396408026e-07, - "loss": 0.0003, - "reward": 0.004429344087839127, - "reward_std": 0.09090007236227393, - "rewards/cosine_scaled_reward": -0.25958751142024994, - "rewards/format_reward": 0.5416666679084301, + "loss": 0.0008, + "reward": 0.0841333303033025, + "reward_std": 0.10015194956213236, + "rewards/cosine_scaled_reward": -0.18771476298570633, + "rewards/format_reward": 0.854166679084301, "step": 268 }, { - "completion_length": 1051.833366394043, - "epoch": 0.15371428571428572, - "grad_norm": 0.37623241543769836, - "kl": 0.004245758056640625, + "completion_length": 1421.8541870117188, + "epoch": 0.30742857142857144, + "grad_norm": 0.3175782859325409, + "kl": 0.015338897705078125, "learning_rate": 5.688440441781398e-07, - "loss": 0.0002, - "reward": 0.28792200167663395, - "reward_std": 0.06009115173947066, - "rewards/cosine_scaled_reward": 0.43378106132149696, - "rewards/format_reward": 0.8333333358168602, + "loss": 0.0006, + "reward": 0.0848088227212429, + "reward_std": 0.10533672664314508, + "rewards/cosine_scaled_reward": -0.1579919238574803, + "rewards/format_reward": 0.8125, "step": 269 }, { - "completion_length": 1943.7500610351562, - "epoch": 0.15428571428571428, - "grad_norm": 0.3560849130153656, - "kl": 0.006954193115234375, + "completion_length": 1852.9792251586914, + "epoch": 0.30857142857142855, + "grad_norm": 0.31283366680145264, + "kl": 0.020725250244140625, "learning_rate": 5.657047735161255e-07, - "loss": 0.0003, - "reward": 0.14212690759450197, - "reward_std": 0.1882859766483307, - "rewards/cosine_scaled_reward": 0.04390180786140263, - "rewards/format_reward": 0.7500000074505806, + "loss": 0.0008, + "reward": 0.1707673908676952, + "reward_std": 0.14836188638582826, + "rewards/cosine_scaled_reward": 0.11553415982052684, + "rewards/format_reward": 0.7708333432674408, "step": 270 }, { - "completion_length": 1407.3333435058594, - "epoch": 0.15485714285714286, - "grad_norm": 0.27751579880714417, - "kl": 0.003631591796875, + "completion_length": 1270.8333625793457, + "epoch": 0.3097142857142857, + "grad_norm": 0.4045168459415436, + "kl": 0.020198822021484375, "learning_rate": 5.625647374256061e-07, - "loss": 0.0001, - "reward": 0.14959589950740337, - "reward_std": 0.1417488120496273, - "rewards/cosine_scaled_reward": -0.019944459199905396, - "rewards/format_reward": 0.9166666716337204, + "loss": 0.0008, + "reward": 0.2056138776242733, + "reward_std": 0.1318775275722146, + "rewards/cosine_scaled_reward": 0.1646625578578096, + "rewards/format_reward": 0.8750000111758709, "step": 271 }, { - "completion_length": 1726.8333740234375, - "epoch": 0.15542857142857142, - "grad_norm": 0.30496859550476074, - "kl": 0.006397247314453125, + "completion_length": 1756.8542098999023, + "epoch": 0.31085714285714283, + "grad_norm": 0.34620898962020874, + "kl": 0.02004241943359375, "learning_rate": 5.594240889475106e-07, - "loss": 0.0003, - "reward": 0.1322023249231279, - "reward_std": 0.12009184807538986, - "rewards/cosine_scaled_reward": 0.014062527567148209, - "rewards/format_reward": 0.7500000111758709, + "loss": 0.0008, + "reward": 0.12157785186354886, + "reward_std": 0.1287962575443089, + "rewards/cosine_scaled_reward": -0.04085011733695865, + "rewards/format_reward": 0.791666679084301, "step": 272 }, { - "completion_length": 2897.2083435058594, - "epoch": 0.156, - "grad_norm": 0.265347957611084, - "kl": 0.005428314208984375, + "completion_length": 1281.5000381469727, + "epoch": 0.312, + "grad_norm": 0.4477139115333557, + "kl": 0.0211029052734375, "learning_rate": 5.562829811526154e-07, - "loss": 0.0002, - "reward": -0.014817139599472284, - "reward_std": 0.10670020338147879, - "rewards/cosine_scaled_reward": -0.2326906262896955, - "rewards/format_reward": 0.3750000111758709, + "loss": 0.0008, + "reward": 0.1495568435639143, + "reward_std": 0.1047550356015563, + "rewards/cosine_scaled_reward": 0.03370687458664179, + "rewards/format_reward": 0.8125000111758709, "step": 273 }, { - "completion_length": 2191.750030517578, - "epoch": 0.15657142857142858, - "grad_norm": 0.23128239810466766, - "kl": 0.00417327880859375, + "completion_length": 1047.6250457763672, + "epoch": 0.31314285714285717, + "grad_norm": 0.38608211278915405, + "kl": 0.015705108642578125, "learning_rate": 5.531415671340826e-07, - "loss": 0.0002, - "reward": 0.13589699938893318, - "reward_std": 0.1101944437250495, - "rewards/cosine_scaled_reward": 0.025908786803483963, - "rewards/format_reward": 0.75, + "loss": 0.0006, + "reward": 0.16697457217378542, + "reward_std": 0.1347856274805963, + "rewards/cosine_scaled_reward": 0.032155007666005986, + "rewards/format_reward": 0.8958333432674408, "step": 274 }, { - "completion_length": 2195.166748046875, - "epoch": 0.15714285714285714, - "grad_norm": 0.3097631335258484, - "kl": 0.003200531005859375, + "completion_length": 1519.5625534057617, + "epoch": 0.3142857142857143, + "grad_norm": 0.41990184783935547, + "kl": 0.02295684814453125, "learning_rate": 5.5e-07, - "loss": 0.0001, - "reward": 0.13339716009795666, - "reward_std": 0.07753341924399137, - "rewards/cosine_scaled_reward": 0.038514066487550735, - "rewards/format_reward": 0.7083333395421505, + "loss": 0.0009, + "reward": 0.19299401948228478, + "reward_std": 0.1319624213501811, + "rewards/cosine_scaled_reward": 0.15465315023902804, + "rewards/format_reward": 0.8125000074505806, "step": 275 }, { - "completion_length": 1775.8750610351562, - "epoch": 0.15771428571428572, - "grad_norm": 0.3163658678531647, - "kl": 0.006744384765625, + "completion_length": 1206.583351135254, + "epoch": 0.31542857142857145, + "grad_norm": 0.38451164960861206, + "kl": 0.020355224609375, "learning_rate": 5.468584328659172e-07, - "loss": 0.0003, - "reward": 0.09098343178629875, - "reward_std": 0.12340791895985603, - "rewards/cosine_scaled_reward": -0.12732042837888002, - "rewards/format_reward": 0.7916666865348816, + "loss": 0.0008, + "reward": 0.14626443712040782, + "reward_std": 0.13966891495510936, + "rewards/cosine_scaled_reward": 0.004510253042099066, + "rewards/format_reward": 0.8333333432674408, "step": 276 }, { - "completion_length": 1893.0417175292969, - "epoch": 0.15828571428571428, - "grad_norm": 0.2749994099140167, - "kl": 0.0048980712890625, + "completion_length": 1248.2917022705078, + "epoch": 0.31657142857142856, + "grad_norm": 0.5081427097320557, + "kl": 0.02585601806640625, "learning_rate": 5.437170188473847e-07, - "loss": 0.0002, - "reward": 0.07307121413759887, - "reward_std": 0.12130637839436531, - "rewards/cosine_scaled_reward": -0.20349279139190912, - "rewards/format_reward": 0.833333358168602, + "loss": 0.001, + "reward": 0.14069768914487213, + "reward_std": 0.12813527416437864, + "rewards/cosine_scaled_reward": -0.05632503447122872, + "rewards/format_reward": 0.916666679084301, "step": 277 }, { - "completion_length": 2929.666748046875, - "epoch": 0.15885714285714286, - "grad_norm": 0.2802342474460602, - "kl": 0.00811004638671875, + "completion_length": 1363.895866394043, + "epoch": 0.3177142857142857, + "grad_norm": 0.5567955374717712, + "kl": 0.02330780029296875, "learning_rate": 5.405759110524894e-07, - "loss": 0.0003, - "reward": 0.06634932570159435, - "reward_std": 0.13052548561245203, - "rewards/cosine_scaled_reward": 0.00530102476477623, - "rewards/format_reward": 0.3750000037252903, + "loss": 0.0009, + "reward": 0.1985365085711237, + "reward_std": 0.08099143509753048, + "rewards/cosine_scaled_reward": 0.12419202888850123, + "rewards/format_reward": 0.875, "step": 278 }, { - "completion_length": 1915.7084350585938, - "epoch": 0.15942857142857142, - "grad_norm": 0.28284236788749695, - "kl": 0.004901885986328125, + "completion_length": 1633.9583740234375, + "epoch": 0.31885714285714284, + "grad_norm": 0.42431047558784485, + "kl": 0.035366058349609375, "learning_rate": 5.37435262574394e-07, - "loss": 0.0002, - "reward": 0.175384268630296, - "reward_std": 0.1247609406709671, - "rewards/cosine_scaled_reward": 0.13918372802436352, - "rewards/format_reward": 0.7500000111758709, + "loss": 0.0014, + "reward": 0.10419091582298279, + "reward_std": 0.15187329379841685, + "rewards/cosine_scaled_reward": -0.07383611425757408, + "rewards/format_reward": 0.7500000093132257, "step": 279 }, { - "completion_length": 2258.375045776367, - "epoch": 0.16, - "grad_norm": 0.675750195980072, - "kl": 0.0109710693359375, + "completion_length": 1602.2292251586914, + "epoch": 0.32, + "grad_norm": 0.3582462668418884, + "kl": 0.042247772216796875, "learning_rate": 5.342952264838747e-07, - "loss": 0.0004, - "reward": 0.027627339586615562, - "reward_std": 0.10861307010054588, - "rewards/cosine_scaled_reward": -0.1919801402837038, - "rewards/format_reward": 0.541666679084301, + "loss": 0.0017, + "reward": 0.20170635590329766, + "reward_std": 0.15067243855446577, + "rewards/cosine_scaled_reward": 0.19395790994167328, + "rewards/format_reward": 0.7708333395421505, "step": 280 }, { - "completion_length": 1640.0416870117188, - "epoch": 0.16057142857142856, - "grad_norm": 0.3315058648586273, - "kl": 0.004913330078125, + "completion_length": 2467.0416717529297, + "epoch": 0.3211428571428571, + "grad_norm": 0.29816770553588867, + "kl": 0.05840301513671875, "learning_rate": 5.311559558218603e-07, - "loss": 0.0002, - "reward": 0.17733313888311386, - "reward_std": 0.21304279193282127, - "rewards/cosine_scaled_reward": 0.08402156922966242, - "rewards/format_reward": 0.8750000149011612, + "loss": 0.0023, + "reward": 0.05792845832183957, + "reward_std": 0.14697478245943785, + "rewards/cosine_scaled_reward": -0.10388723947107792, + "rewards/format_reward": 0.541666679084301, "step": 281 }, { - "completion_length": 1198.6667022705078, - "epoch": 0.16114285714285714, - "grad_norm": 0.3147311210632324, - "kl": 0.005733489990234375, + "completion_length": 1241.5416946411133, + "epoch": 0.3222857142857143, + "grad_norm": 0.46031466126441956, + "kl": 0.0266265869140625, "learning_rate": 5.28017603591974e-07, - "loss": 0.0002, - "reward": 0.10686604678630829, - "reward_std": 0.13023934047669172, - "rewards/cosine_scaled_reward": -0.14513825066387653, - "rewards/format_reward": 0.9166666865348816, + "loss": 0.0011, + "reward": 0.13118357677012682, + "reward_std": 0.13791329925879836, + "rewards/cosine_scaled_reward": -0.02069989638403058, + "rewards/format_reward": 0.8125000074505806, "step": 282 }, { - "completion_length": 1904.1250610351562, - "epoch": 0.16171428571428573, - "grad_norm": 0.3399271070957184, - "kl": 0.0055389404296875, + "completion_length": 1868.6250228881836, + "epoch": 0.32342857142857145, + "grad_norm": 0.32365959882736206, + "kl": 0.052989959716796875, "learning_rate": 5.248803227530763e-07, - "loss": 0.0002, - "reward": 0.10958698927424848, - "reward_std": 0.1423885878175497, - "rewards/cosine_scaled_reward": -0.07467210292816162, - "rewards/format_reward": 0.7916666716337204, + "loss": 0.0021, + "reward": 0.18105492927134037, + "reward_std": 0.13572940416634083, + "rewards/cosine_scaled_reward": 0.16773403156548738, + "rewards/format_reward": 0.7291666828095913, "step": 283 }, { - "completion_length": 2422.5834045410156, - "epoch": 0.16228571428571428, - "grad_norm": 0.32275182008743286, - "kl": 0.00556182861328125, + "completion_length": 1280.8542098999023, + "epoch": 0.32457142857142857, + "grad_norm": 0.4519507586956024, + "kl": 0.019351959228515625, "learning_rate": 5.21744266211809e-07, - "loss": 0.0002, - "reward": 0.14030809747055173, - "reward_std": 0.14728851336985826, - "rewards/cosine_scaled_reward": 0.061691079288721085, - "rewards/format_reward": 0.7083333395421505, + "loss": 0.0008, + "reward": 0.11721091519575566, + "reward_std": 0.15023711137473583, + "rewards/cosine_scaled_reward": -0.09081541141495109, + "rewards/format_reward": 0.8541666865348816, "step": 284 }, { - "completion_length": 1614.1666870117188, - "epoch": 0.16285714285714287, - "grad_norm": 0.4587381184101105, - "kl": 0.00586700439453125, + "completion_length": 1140.1250305175781, + "epoch": 0.32571428571428573, + "grad_norm": 0.7309654951095581, + "kl": 0.0377349853515625, "learning_rate": 5.186095868151436e-07, - "loss": 0.0002, - "reward": 0.03370382636785507, - "reward_std": 0.06128286011517048, - "rewards/cosine_scaled_reward": -0.2780035622417927, - "rewards/format_reward": 0.75, + "loss": 0.0015, + "reward": 0.09533977252431214, + "reward_std": 0.11678969115018845, + "rewards/cosine_scaled_reward": -0.12558545544743538, + "rewards/format_reward": 0.8125000186264515, "step": 285 }, { - "completion_length": 2494.3334045410156, - "epoch": 0.16342857142857142, - "grad_norm": 0.552965521812439, - "kl": 0.00891876220703125, + "completion_length": 1449.3958740234375, + "epoch": 0.32685714285714285, + "grad_norm": 0.49403443932533264, + "kl": 0.0523681640625, "learning_rate": 5.154764373429315e-07, - "loss": 0.0004, - "reward": 0.055814516730606556, - "reward_std": 0.13142240978777409, - "rewards/cosine_scaled_reward": -0.1299433447420597, - "rewards/format_reward": 0.5833333469927311, + "loss": 0.0021, + "reward": 0.1131226432044059, + "reward_std": 0.12507515028119087, + "rewards/cosine_scaled_reward": -0.08685349836014211, + "rewards/format_reward": 0.8125000149011612, "step": 286 }, { - "completion_length": 2986.2916870117188, - "epoch": 0.164, - "grad_norm": 0.2565011978149414, - "kl": 0.004932403564453125, + "completion_length": 1521.6250457763672, + "epoch": 0.328, + "grad_norm": 0.858707070350647, + "kl": 0.069488525390625, "learning_rate": 5.123449705004581e-07, - "loss": 0.0002, - "reward": 0.10646794736385345, - "reward_std": 0.1831368077546358, - "rewards/cosine_scaled_reward": 0.04352019354701042, - "rewards/format_reward": 0.5416666716337204, + "loss": 0.0028, + "reward": 0.11133736907504499, + "reward_std": 0.11151464702561498, + "rewards/cosine_scaled_reward": -0.03888722602277994, + "rewards/format_reward": 0.708333345130086, "step": 287 }, { - "completion_length": 2249.416717529297, - "epoch": 0.16457142857142856, - "grad_norm": 0.3778051733970642, - "kl": 0.005828857421875, + "completion_length": 1589.6458892822266, + "epoch": 0.3291428571428571, + "grad_norm": 0.5665430426597595, + "kl": 0.059162139892578125, "learning_rate": 5.09215338910999e-07, - "loss": 0.0002, - "reward": 0.0777837848290801, - "reward_std": 0.06775563955307007, - "rewards/cosine_scaled_reward": -0.020740922540426254, - "rewards/format_reward": 0.5000000111758709, + "loss": 0.0024, + "reward": 0.11726528691360727, + "reward_std": 0.11230942467227578, + "rewards/cosine_scaled_reward": -0.06808646989520639, + "rewards/format_reward": 0.8125000111758709, "step": 288 }, { - "completion_length": 1182.8750534057617, - "epoch": 0.16514285714285715, - "grad_norm": 0.5000633597373962, - "kl": 0.00676727294921875, + "completion_length": 1320.7916831970215, + "epoch": 0.3302857142857143, + "grad_norm": 0.8541005849838257, + "kl": 0.035400390625, "learning_rate": 5.060876951083828e-07, - "loss": 0.0003, - "reward": 0.2432612655684352, - "reward_std": 0.13146781735122204, - "rewards/cosine_scaled_reward": 0.25964413583278656, - "rewards/format_reward": 0.9166666865348816, + "loss": 0.0014, + "reward": 0.12517158885020763, + "reward_std": 0.07708289241418242, + "rewards/cosine_scaled_reward": -0.030706046149134636, + "rewards/format_reward": 0.7916666772216558, "step": 289 }, { - "completion_length": 2173.5416717529297, - "epoch": 0.1657142857142857, - "grad_norm": 0.3588608503341675, - "kl": 0.005374908447265625, + "completion_length": 1165.520866394043, + "epoch": 0.3314285714285714, + "grad_norm": 0.45539847016334534, + "kl": 0.050296783447265625, "learning_rate": 5.02962191529556e-07, - "loss": 0.0002, - "reward": 0.034552792087197304, - "reward_std": 0.1388353668153286, - "rewards/cosine_scaled_reward": -0.16950345784425735, - "rewards/format_reward": 0.5416666679084301, + "loss": 0.002, + "reward": 0.1758332857862115, + "reward_std": 0.14369960315525532, + "rewards/cosine_scaled_reward": 0.06662235781550407, + "rewards/format_reward": 0.895833358168602, "step": 290 }, { - "completion_length": 2345.166732788086, - "epoch": 0.1662857142857143, - "grad_norm": 0.2620290517807007, - "kl": 0.00481414794921875, + "completion_length": 1888.7292556762695, + "epoch": 0.3325714285714286, + "grad_norm": 0.4208749234676361, + "kl": 0.10372543334960938, "learning_rate": 4.998389805071536e-07, - "loss": 0.0002, - "reward": 0.036819197703152895, - "reward_std": 0.03736873436719179, - "rewards/cosine_scaled_reward": -0.22433330863714218, - "rewards/format_reward": 0.6666666828095913, + "loss": 0.0041, + "reward": 0.15313149709254503, + "reward_std": 0.14885053224861622, + "rewards/cosine_scaled_reward": 0.030359832802787423, + "rewards/format_reward": 0.8333333414047956, "step": 291 }, { - "completion_length": 1165.3750305175781, - "epoch": 0.16685714285714287, - "grad_norm": 0.34216010570526123, - "kl": 0.00525665283203125, + "completion_length": 1906.958381652832, + "epoch": 0.33371428571428574, + "grad_norm": 1.015468716621399, + "kl": 0.11415481567382812, "learning_rate": 4.967182142620745e-07, - "loss": 0.0002, - "reward": 0.11193216871470213, - "reward_std": 0.147000752389431, - "rewards/cosine_scaled_reward": -0.10941322520375252, - "rewards/format_reward": 0.8750000149011612, + "loss": 0.0046, + "reward": 0.08511171862483025, + "reward_std": 0.11772025120444596, + "rewards/cosine_scaled_reward": -0.146739911288023, + "rewards/format_reward": 0.7916666865348816, "step": 292 }, { - "completion_length": 1956.7500305175781, - "epoch": 0.16742857142857143, - "grad_norm": 0.23569047451019287, - "kl": 0.005397796630859375, + "completion_length": 1338.8541946411133, + "epoch": 0.33485714285714285, + "grad_norm": 0.5531266331672668, + "kl": 0.0963287353515625, "learning_rate": 4.93600044896063e-07, - "loss": 0.0002, - "reward": 0.0816317368298769, - "reward_std": 0.14073395915329456, - "rewards/cosine_scaled_reward": -0.1344745261594653, - "rewards/format_reward": 0.7500000111758709, + "loss": 0.0039, + "reward": 0.14025127002969384, + "reward_std": 0.12792299408465624, + "rewards/cosine_scaled_reward": -0.01351697463542223, + "rewards/format_reward": 0.8541666772216558, "step": 293 }, { - "completion_length": 1343.5417022705078, - "epoch": 0.168, - "grad_norm": 0.37875431776046753, - "kl": 0.00566864013671875, + "completion_length": 1772.7291870117188, + "epoch": 0.336, + "grad_norm": 0.9273716807365417, + "kl": 0.05928802490234375, "learning_rate": 4.904846243842949e-07, - "loss": 0.0002, - "reward": 0.18713519629091024, - "reward_std": 0.15218950249254704, - "rewards/cosine_scaled_reward": 0.08764517540112138, - "rewards/format_reward": 0.9166666716337204, + "loss": 0.0024, + "reward": 0.14557143417187035, + "reward_std": 0.13822759315371513, + "rewards/cosine_scaled_reward": 0.04577969899401069, + "rewards/format_reward": 0.7500000149011612, "step": 294 }, { - "completion_length": 1497.0833587646484, - "epoch": 0.16857142857142857, - "grad_norm": 0.3290937542915344, - "kl": 0.00817108154296875, + "completion_length": 1621.5833587646484, + "epoch": 0.33714285714285713, + "grad_norm": 0.7160328030586243, + "kl": 0.06582260131835938, "learning_rate": 4.873721045679706e-07, - "loss": 0.0003, - "reward": 0.14200717816129327, - "reward_std": 0.10107252094894648, - "rewards/cosine_scaled_reward": 0.0221601203083992, - "rewards/format_reward": 0.7916666679084301, + "loss": 0.0026, + "reward": 0.15534777799621224, + "reward_std": 0.12278164038434625, + "rewards/cosine_scaled_reward": 0.053630582988262177, + "rewards/format_reward": 0.7916666753590107, "step": 295 }, { - "completion_length": 1112.5000457763672, - "epoch": 0.16914285714285715, - "grad_norm": 0.3060383200645447, - "kl": 0.0038299560546875, + "completion_length": 1988.916732788086, + "epoch": 0.3382857142857143, + "grad_norm": 1.5293620824813843, + "kl": 0.130126953125, "learning_rate": 4.842626371469149e-07, - "loss": 0.0002, - "reward": 0.09541013650596142, - "reward_std": 0.0758298896253109, - "rewards/cosine_scaled_reward": -0.22060698084533215, - "rewards/format_reward": 1.0, + "loss": 0.0052, + "reward": 0.07497212127782404, + "reward_std": 0.15273468242958188, + "rewards/cosine_scaled_reward": -0.14556175749748945, + "rewards/format_reward": 0.7291666902601719, "step": 296 }, { - "completion_length": 2660.750030517578, - "epoch": 0.1697142857142857, - "grad_norm": 0.23520247638225555, - "kl": 0.004154205322265625, + "completion_length": 2342.8959197998047, + "epoch": 0.3394285714285714, + "grad_norm": 1.5157654285430908, + "kl": 0.10904693603515625, "learning_rate": 4.811563736721829e-07, - "loss": 0.0002, - "reward": 0.05843828700017184, - "reward_std": 0.13387714512646198, - "rewards/cosine_scaled_reward": -0.11719884723424911, - "rewards/format_reward": 0.5833333432674408, + "loss": 0.0044, + "reward": 0.08573226723819971, + "reward_std": 0.14976172288879752, + "rewards/cosine_scaled_reward": -0.05238574789837003, + "rewards/format_reward": 0.6041666753590107, "step": 297 }, { - "completion_length": 1781.9583587646484, - "epoch": 0.1702857142857143, - "grad_norm": 0.34224119782447815, - "kl": 0.00775146484375, + "completion_length": 1915.416732788086, + "epoch": 0.3405714285714286, + "grad_norm": 0.8163977861404419, + "kl": 0.1429595947265625, "learning_rate": 4.780534655386743e-07, - "loss": 0.0003, - "reward": 0.18634057487361133, - "reward_std": 0.14846444129943848, - "rewards/cosine_scaled_reward": 0.13383683562278748, - "rewards/format_reward": 0.8333333432674408, + "loss": 0.0057, + "reward": 0.10191577160730958, + "reward_std": 0.10441284067928791, + "rewards/cosine_scaled_reward": -0.054123382084071636, + "rewards/format_reward": 0.7083333469927311, "step": 298 }, { - "completion_length": 1319.3333587646484, - "epoch": 0.17085714285714285, - "grad_norm": 0.43493491411209106, - "kl": 0.005352020263671875, + "completion_length": 1942.7084045410156, + "epoch": 0.3417142857142857, + "grad_norm": 1.951476812362671, + "kl": 0.17220306396484375, "learning_rate": 4.749540639777539e-07, - "loss": 0.0002, - "reward": 0.14997456409037113, - "reward_std": 0.1703295260667801, - "rewards/cosine_scaled_reward": -0.04173681698739529, - "rewards/format_reward": 0.9583333432674408, + "loss": 0.0069, + "reward": 0.09106689637701493, + "reward_std": 0.11453963397070765, + "rewards/cosine_scaled_reward": -0.08032999746501446, + "rewards/format_reward": 0.6875000186264515, "step": 299 }, { - "completion_length": 1676.4584045410156, - "epoch": 0.17142857142857143, - "grad_norm": 0.3143356144428253, - "kl": 0.0052642822265625, + "completion_length": 2121.520881652832, + "epoch": 0.34285714285714286, + "grad_norm": 0.9551680684089661, + "kl": 0.1997222900390625, "learning_rate": 4.7185832004988133e-07, - "loss": 0.0002, - "reward": 0.11242577526718378, - "reward_std": 0.142780059017241, - "rewards/cosine_scaled_reward": -0.04435308463871479, - "rewards/format_reward": 0.7500000111758709, + "loss": 0.008, + "reward": 0.09341209102421999, + "reward_std": 0.11039053509011865, + "rewards/cosine_scaled_reward": -0.07323687896132469, + "rewards/format_reward": 0.6875000149011612, "step": 300 }, { - "completion_length": 2174.6666870117188, - "epoch": 0.172, - "grad_norm": 0.49433818459510803, - "kl": 0.009006500244140625, + "completion_length": 2006.5209007263184, + "epoch": 0.344, + "grad_norm": 1.1690226793289185, + "kl": 0.244354248046875, "learning_rate": 4.68766384637248e-07, - "loss": 0.0004, - "reward": 0.11563278967514634, - "reward_std": 0.2119303159415722, - "rewards/cosine_scaled_reward": 0.04536004364490509, - "rewards/format_reward": 0.5833333395421505, + "loss": 0.0098, + "reward": 0.07932836120016873, + "reward_std": 0.13190083391964436, + "rewards/cosine_scaled_reward": -0.11513608321547508, + "rewards/format_reward": 0.6875000093132257, "step": 301 }, { - "completion_length": 1970.2917175292969, - "epoch": 0.17257142857142857, - "grad_norm": 0.29630419611930847, - "kl": 0.008312225341796875, + "completion_length": 1556.2917137145996, + "epoch": 0.34514285714285714, + "grad_norm": 1.1125872135162354, + "kl": 0.10612106323242188, "learning_rate": 4.656784084364238e-07, - "loss": 0.0003, - "reward": 0.175113957375288, - "reward_std": 0.1281280852854252, - "rewards/cosine_scaled_reward": 0.09974386170506477, - "rewards/format_reward": 0.8333333358168602, + "loss": 0.0042, + "reward": 0.15471726842224598, + "reward_std": 0.10859864950180054, + "rewards/cosine_scaled_reward": 0.06700704153627157, + "rewards/format_reward": 0.7708333414047956, "step": 302 }, { - "completion_length": 2159.166717529297, - "epoch": 0.17314285714285715, - "grad_norm": 0.3525708317756653, - "kl": 0.0053501129150390625, + "completion_length": 1470.5625267028809, + "epoch": 0.3462857142857143, + "grad_norm": 0.939318060874939, + "kl": 0.0955810546875, "learning_rate": 4.6259454195101267e-07, - "loss": 0.0002, - "reward": 0.07677281461656094, - "reward_std": 0.08553566597402096, - "rewards/cosine_scaled_reward": -0.12797834165394306, - "rewards/format_reward": 0.7083333432674408, + "loss": 0.0038, + "reward": 0.16825311817228794, + "reward_std": 0.13590177986770868, + "rewards/cosine_scaled_reward": 0.02981522586196661, + "rewards/format_reward": 0.9166666865348816, "step": 303 }, { - "completion_length": 1881.6666946411133, - "epoch": 0.1737142857142857, - "grad_norm": 0.39991453289985657, - "kl": 0.00827789306640625, + "completion_length": 1612.6458740234375, + "epoch": 0.3474285714285714, + "grad_norm": 1.2758152484893799, + "kl": 0.13702392578125, "learning_rate": 4.59514935484316e-07, - "loss": 0.0003, - "reward": -0.015333778690546751, - "reward_std": 0.06587743479758501, - "rewards/cosine_scaled_reward": -0.33822769671678543, - "rewards/format_reward": 0.5833333358168602, + "loss": 0.0055, + "reward": 0.08096132357604802, + "reward_std": 0.10056991688907146, + "rewards/cosine_scaled_reward": -0.1730497945100069, + "rewards/format_reward": 0.812500013038516, "step": 304 }, { - "completion_length": 2222.0833740234375, - "epoch": 0.1742857142857143, - "grad_norm": 0.3573647737503052, - "kl": 0.01055908203125, + "completion_length": 1444.104232788086, + "epoch": 0.3485714285714286, + "grad_norm": 1.4721267223358154, + "kl": 0.09059906005859375, "learning_rate": 4.5643973913200837e-07, - "loss": 0.0004, - "reward": 0.03121366538107395, - "reward_std": 0.10081714205443859, - "rewards/cosine_scaled_reward": -0.19995495676994324, - "rewards/format_reward": 0.5833333507180214, + "loss": 0.0036, + "reward": 0.10656621214002371, + "reward_std": 0.13656832091510296, + "rewards/cosine_scaled_reward": -0.12406391743570566, + "rewards/format_reward": 0.8750000223517418, "step": 305 }, { - "completion_length": 1664.916763305664, - "epoch": 0.17485714285714285, - "grad_norm": 0.4049828350543976, - "kl": 0.01068115234375, + "completion_length": 1328.6458740234375, + "epoch": 0.3497142857142857, + "grad_norm": 1.2772217988967896, + "kl": 0.14186859130859375, "learning_rate": 4.5336910277482155e-07, - "loss": 0.0004, - "reward": 0.06955286301672459, - "reward_std": 0.10504710860550404, - "rewards/cosine_scaled_reward": -0.16950544342398643, - "rewards/format_reward": 0.7500000074505806, + "loss": 0.0057, + "reward": 0.19892889651237056, + "reward_std": 0.11784483585506678, + "rewards/cosine_scaled_reward": 0.14353829622268677, + "rewards/format_reward": 0.8750000074505806, "step": 306 }, { - "completion_length": 2116.8334350585938, - "epoch": 0.17542857142857143, - "grad_norm": 0.2659807801246643, - "kl": 0.004810333251953125, - "learning_rate": 4.503031760712397e-07, - "loss": 0.0002, - "reward": 0.14551719557493925, - "reward_std": 0.15460799634456635, - "rewards/cosine_scaled_reward": 0.03528590872883797, - "rewards/format_reward": 0.7916666865348816, + "completion_length": 1276.645866394043, + "epoch": 0.35085714285714287, + "grad_norm": 1.0841631889343262, + "kl": 0.130828857421875, + "learning_rate": 4.503031760712397e-07, + "loss": 0.0052, + "reward": 0.12190989218652248, + "reward_std": 0.12431731820106506, + "rewards/cosine_scaled_reward": -0.08158679166808724, + "rewards/format_reward": 0.8750000074505806, "step": 307 }, { - "completion_length": 3202.2083740234375, - "epoch": 0.176, - "grad_norm": 0.24735987186431885, - "kl": 0.00765228271484375, + "completion_length": 2021.0000305175781, + "epoch": 0.352, + "grad_norm": 1.4621251821517944, + "kl": 0.2753448486328125, "learning_rate": 4.4724210845020494e-07, - "loss": 0.0003, - "reward": 0.028550002723932266, - "reward_std": 0.15040935762226582, - "rewards/cosine_scaled_reward": -0.061920275911688805, - "rewards/format_reward": 0.2916666753590107, + "loss": 0.011, + "reward": 0.09099714574404061, + "reward_std": 0.14960693335160613, + "rewards/cosine_scaled_reward": -0.09915194474160671, + "rewards/format_reward": 0.7291666716337204, "step": 308 }, { - "completion_length": 1844.0000305175781, - "epoch": 0.17657142857142857, - "grad_norm": 0.2916047275066376, - "kl": 0.00652313232421875, + "completion_length": 2146.7083740234375, + "epoch": 0.35314285714285715, + "grad_norm": 2.272526502609253, + "kl": 0.4404144287109375, "learning_rate": 4.441860491038345e-07, - "loss": 0.0003, - "reward": 0.1707380348816514, - "reward_std": 0.10255965404212475, - "rewards/cosine_scaled_reward": 0.1485364492982626, - "rewards/format_reward": 0.7083333432674408, + "loss": 0.0176, + "reward": 0.09269980387762189, + "reward_std": 0.14243671763688326, + "rewards/cosine_scaled_reward": -0.06128171645104885, + "rewards/format_reward": 0.666666679084301, "step": 309 }, { - "completion_length": 2489.750045776367, - "epoch": 0.17714285714285713, - "grad_norm": 0.3996852934360504, - "kl": 0.01092529296875, + "completion_length": 1203.4791946411133, + "epoch": 0.35428571428571426, + "grad_norm": 1.225544810295105, + "kl": 0.16626739501953125, "learning_rate": 4.4113514698014953e-07, - "loss": 0.0004, - "reward": 0.044906886643730104, - "reward_std": 0.15476176887750626, - "rewards/cosine_scaled_reward": -0.11812459863722324, - "rewards/format_reward": 0.5000000037252903, + "loss": 0.0066, + "reward": 0.10261391778476536, + "reward_std": 0.08983314875513315, + "rewards/cosine_scaled_reward": -0.16803579218685627, + "rewards/format_reward": 0.916666679084301, "step": 310 }, { - "completion_length": 2800.7500610351562, - "epoch": 0.1777142857142857, - "grad_norm": 0.2494601458311081, - "kl": 0.00762939453125, + "completion_length": 1146.8125343322754, + "epoch": 0.3554285714285714, + "grad_norm": 1.5572764873504639, + "kl": 0.07305145263671875, "learning_rate": 4.3808955077581546e-07, - "loss": 0.0003, - "reward": 0.03491459786891937, - "reward_std": 0.10233313590288162, - "rewards/cosine_scaled_reward": -0.08323951810598373, - "rewards/format_reward": 0.375, + "loss": 0.0029, + "reward": 0.19523340463638306, + "reward_std": 0.14635545574128628, + "rewards/cosine_scaled_reward": 0.08061425480991602, + "rewards/format_reward": 0.9791666716337204, "step": 311 }, { - "completion_length": 1673.6666870117188, - "epoch": 0.1782857142857143, - "grad_norm": 0.2702195346355438, - "kl": 0.00411224365234375, + "completion_length": 1333.0625228881836, + "epoch": 0.3565714285714286, + "grad_norm": 1.1539429426193237, + "kl": 0.34932708740234375, "learning_rate": 4.350494089288943e-07, - "loss": 0.0002, - "reward": 0.20612565986812115, - "reward_std": 0.1918553113937378, - "rewards/cosine_scaled_reward": 0.19074446707963943, - "rewards/format_reward": 0.8333333432674408, + "loss": 0.014, + "reward": 0.1985101569443941, + "reward_std": 0.07966499030590057, + "rewards/cosine_scaled_reward": 0.1561545841395855, + "rewards/format_reward": 0.8541666697710752, "step": 312 }, { - "completion_length": 2605.375045776367, - "epoch": 0.17885714285714285, - "grad_norm": 0.28814107179641724, - "kl": 0.00624847412109375, + "completion_length": 2031.31254196167, + "epoch": 0.3577142857142857, + "grad_norm": 1.5145570039749146, + "kl": 0.6246414184570312, "learning_rate": 4.3201486961161093e-07, - "loss": 0.0003, - "reward": 0.06068434612825513, - "reward_std": 0.1535702757537365, - "rewards/cosine_scaled_reward": -0.05121878907084465, - "rewards/format_reward": 0.4583333395421505, + "loss": 0.0249, + "reward": 0.11477423517499119, + "reward_std": 0.11296425701584667, + "rewards/cosine_scaled_reward": 0.012727040797472, + "rewards/format_reward": 0.6458333432674408, "step": 313 }, { - "completion_length": 2223.4583740234375, - "epoch": 0.17942857142857144, - "grad_norm": 0.3431723117828369, - "kl": 0.010547637939453125, + "completion_length": 1317.7083625793457, + "epoch": 0.3588571428571429, + "grad_norm": 2.5971078872680664, + "kl": 0.4033660888671875, "learning_rate": 4.2898608072313045e-07, - "loss": 0.0004, - "reward": 0.026441825553774834, - "reward_std": 0.04924630746245384, - "rewards/cosine_scaled_reward": -0.2751910872757435, - "rewards/format_reward": 0.7083333544433117, + "loss": 0.0162, + "reward": 0.18170432932674885, + "reward_std": 0.13025694666430354, + "rewards/cosine_scaled_reward": 0.10399175062775612, + "rewards/format_reward": 0.854166679084301, "step": 314 }, { - "completion_length": 1709.5833740234375, - "epoch": 0.18, - "grad_norm": 0.3201163411140442, - "kl": 0.00830078125, + "completion_length": 2108.854248046875, + "epoch": 0.36, + "grad_norm": 2.6029951572418213, + "kl": 0.965606689453125, "learning_rate": 4.2596318988235037e-07, - "loss": 0.0003, - "reward": 0.07538635097444057, - "reward_std": 0.09759460296481848, - "rewards/cosine_scaled_reward": -0.19568588957190514, - "rewards/format_reward": 0.8333333432674408, + "loss": 0.0386, + "reward": 0.09199583710869774, + "reward_std": 0.1252267644740641, + "rewards/cosine_scaled_reward": -0.07803997304290533, + "rewards/format_reward": 0.6875000186264515, "step": 315 }, { - "completion_length": 1850.7084045410156, - "epoch": 0.18057142857142858, - "grad_norm": 0.425157755613327, - "kl": 0.00659942626953125, + "completion_length": 1711.2917366027832, + "epoch": 0.36114285714285715, + "grad_norm": 2.844820499420166, + "kl": 0.6187896728515625, "learning_rate": 4.2294634442070553e-07, - "loss": 0.0003, - "reward": 0.2117317747324705, - "reward_std": 0.12208653800189495, - "rewards/cosine_scaled_reward": 0.21481734700500965, - "rewards/format_reward": 0.7916667014360428, + "loss": 0.0247, + "reward": 0.04600826557725668, + "reward_std": 0.08856615889817476, + "rewards/cosine_scaled_reward": -0.20388950034976006, + "rewards/format_reward": 0.6666666846722364, "step": 316 }, { - "completion_length": 2269.250030517578, - "epoch": 0.18114285714285713, - "grad_norm": 0.3082946836948395, - "kl": 0.006744384765625, + "completion_length": 1576.0625076293945, + "epoch": 0.36228571428571427, + "grad_norm": 2.0948660373687744, + "kl": 0.4916229248046875, "learning_rate": 4.1993569137498776e-07, - "loss": 0.0003, - "reward": 0.01995856501162052, - "reward_std": 0.06604108307510614, - "rewards/cosine_scaled_reward": -0.2132924273610115, - "rewards/format_reward": 0.5416666679084301, + "loss": 0.0197, + "reward": 0.11145175900310278, + "reward_std": 0.11253520660102367, + "rewards/cosine_scaled_reward": -0.05809229984879494, + "rewards/format_reward": 0.7500000186264515, "step": 317 }, { - "completion_length": 2022.9583740234375, - "epoch": 0.18171428571428572, - "grad_norm": 0.27388623356819153, - "kl": 0.0070343017578125, + "completion_length": 1069.9166831970215, + "epoch": 0.36342857142857143, + "grad_norm": 2.052123785018921, + "kl": 0.205841064453125, "learning_rate": 4.1693137748017915e-07, - "loss": 0.0003, - "reward": 0.04815053194761276, - "reward_std": 0.08979168348014355, - "rewards/cosine_scaled_reward": -0.2123359590768814, - "rewards/format_reward": 0.7083333432674408, + "loss": 0.0082, + "reward": 0.1357195656746626, + "reward_std": 0.11260256776586175, + "rewards/cosine_scaled_reward": -0.10857605282217264, + "rewards/format_reward": 1.0, "step": 318 }, { - "completion_length": 1480.2084045410156, - "epoch": 0.18228571428571427, - "grad_norm": 0.3955407738685608, - "kl": 0.00750732421875, + "completion_length": 1185.145866394043, + "epoch": 0.36457142857142855, + "grad_norm": 3.6290059089660645, + "kl": 0.23030853271484375, "learning_rate": 4.1393354916230005e-07, - "loss": 0.0003, - "reward": 0.14766318071633577, - "reward_std": 0.130909638479352, - "rewards/cosine_scaled_reward": 0.039204370230436325, - "rewards/format_reward": 0.7916666716337204, + "loss": 0.0092, + "reward": 0.039664710406214, + "reward_std": 0.09799355687573552, + "rewards/cosine_scaled_reward": -0.29441852401942015, + "rewards/format_reward": 0.8125000074505806, "step": 319 }, { - "completion_length": 1815.9583740234375, - "epoch": 0.18285714285714286, - "grad_norm": 0.3559373617172241, - "kl": 0.00984954833984375, + "completion_length": 822.8125267028809, + "epoch": 0.3657142857142857, + "grad_norm": 10.01123046875, + "kl": 0.42021942138671875, "learning_rate": 4.1094235253127374e-07, - "loss": 0.0004, - "reward": 0.09755610581487417, - "reward_std": 0.12540572695434093, - "rewards/cosine_scaled_reward": -0.08782809972763062, - "rewards/format_reward": 0.7500000111758709, + "loss": 0.0169, + "reward": 0.14526783768087626, + "reward_std": 0.1066361116245389, + "rewards/cosine_scaled_reward": -0.05672251805663109, + "rewards/format_reward": 0.9583333432674408, "step": 320 }, { - "completion_length": 1744.9583587646484, - "epoch": 0.18342857142857144, - "grad_norm": 0.35276076197624207, - "kl": 0.00742340087890625, + "completion_length": 951.9166946411133, + "epoch": 0.3668571428571429, + "grad_norm": 1.7355072498321533, + "kl": 0.166778564453125, "learning_rate": 4.079579333738039e-07, - "loss": 0.0003, - "reward": 0.12307325447909534, - "reward_std": 0.14547176472842693, - "rewards/cosine_scaled_reward": 0.009329738095402718, - "rewards/format_reward": 0.7083333395421505, + "loss": 0.0067, + "reward": 0.16883038450032473, + "reward_std": 0.10923763830214739, + "rewards/cosine_scaled_reward": 0.03077949397265911, + "rewards/format_reward": 0.9166666716337204, "step": 321 }, { - "completion_length": 2137.8750610351562, - "epoch": 0.184, - "grad_norm": 0.30521053075790405, - "kl": 0.0121002197265625, + "completion_length": 1182.9167137145996, + "epoch": 0.368, + "grad_norm": 3.371811628341675, + "kl": 0.32440185546875, "learning_rate": 4.0498043714627006e-07, - "loss": 0.0005, - "reward": 0.04445845208829269, - "reward_std": 0.13378578051924706, - "rewards/cosine_scaled_reward": -0.2241049762815237, - "rewards/format_reward": 0.7083333395421505, + "loss": 0.013, + "reward": 0.10157357528805733, + "reward_std": 0.09876813879236579, + "rewards/cosine_scaled_reward": -0.07484898250550032, + "rewards/format_reward": 0.7291666679084301, "step": 322 }, { - "completion_length": 2655.500030517578, - "epoch": 0.18457142857142858, - "grad_norm": 0.3859151005744934, - "kl": 0.0158538818359375, + "completion_length": 1447.1458587646484, + "epoch": 0.36914285714285716, + "grad_norm": 2.1031486988067627, + "kl": 0.4143524169921875, "learning_rate": 4.020100089676376e-07, - "loss": 0.0006, - "reward": 0.027778119780123234, - "reward_std": 0.11814791522920132, - "rewards/cosine_scaled_reward": -0.10827487707138062, - "rewards/format_reward": 0.3750000111758709, + "loss": 0.0166, + "reward": 0.0939664258621633, + "reward_std": 0.11301013454794884, + "rewards/cosine_scaled_reward": -0.08118974138051271, + "rewards/format_reward": 0.7083333376795053, "step": 323 }, { - "completion_length": 1785.3334045410156, - "epoch": 0.18514285714285714, - "grad_norm": 0.38553640246391296, - "kl": 0.00872802734375, + "completion_length": 1413.3542251586914, + "epoch": 0.3702857142857143, + "grad_norm": 1.4308388233184814, + "kl": 0.449249267578125, "learning_rate": 3.9904679361238526e-07, - "loss": 0.0003, - "reward": 0.09418739820830524, - "reward_std": 0.16541589424014091, - "rewards/cosine_scaled_reward": -0.09726313035935163, - "rewards/format_reward": 0.7500000149011612, + "loss": 0.018, + "reward": 0.054550049535464495, + "reward_std": 0.1179595545399934, + "rewards/cosine_scaled_reward": -0.20703712804242969, + "rewards/format_reward": 0.7291666697710752, "step": 324 }, { - "completion_length": 1953.5833740234375, - "epoch": 0.18571428571428572, - "grad_norm": 0.3402915298938751, - "kl": 0.0072784423828125, + "completion_length": 1824.2708702087402, + "epoch": 0.37142857142857144, + "grad_norm": 2.757559299468994, + "kl": 1.072296142578125, "learning_rate": 3.9609093550344907e-07, - "loss": 0.0003, - "reward": 0.10808768216520548, - "reward_std": 0.10259002726525068, - "rewards/cosine_scaled_reward": -0.0767526775598526, - "rewards/format_reward": 0.7916666716337204, + "loss": 0.0429, + "reward": 0.08934608940035105, + "reward_std": 0.12084951438009739, + "rewards/cosine_scaled_reward": -0.030909400433301926, + "rewards/format_reward": 0.5833333432674408, "step": 325 }, { - "completion_length": 2230.8333587646484, - "epoch": 0.18628571428571428, - "grad_norm": 0.35040411353111267, - "kl": 0.009002685546875, + "completion_length": 1115.270866394043, + "epoch": 0.37257142857142855, + "grad_norm": 2.2729392051696777, + "kl": 0.2136383056640625, "learning_rate": 3.931425787051832e-07, - "loss": 0.0004, - "reward": 0.17596495151519775, - "reward_std": 0.1837935969233513, - "rewards/cosine_scaled_reward": 0.22626550868153572, - "rewards/format_reward": 0.5833333358168602, + "loss": 0.0085, + "reward": 0.16146898362785578, + "reward_std": 0.14930668845772743, + "rewards/cosine_scaled_reward": 0.023914188146591187, + "rewards/format_reward": 0.8958333432674408, "step": 326 }, { - "completion_length": 1227.6250610351562, - "epoch": 0.18685714285714286, - "grad_norm": 0.350586861371994, - "kl": 0.007678985595703125, + "completion_length": 1361.3542022705078, + "epoch": 0.3737142857142857, + "grad_norm": 0.9111059904098511, + "kl": 0.24196624755859375, "learning_rate": 3.902018669163384e-07, - "loss": 0.0003, - "reward": 0.16640940494835377, - "reward_std": 0.09436677768826485, - "rewards/cosine_scaled_reward": 0.012735145166516304, - "rewards/format_reward": 0.9583333432674408, + "loss": 0.0097, + "reward": 0.18883195845410228, + "reward_std": 0.12315699364989996, + "rewards/cosine_scaled_reward": 0.10393881611526012, + "rewards/format_reward": 0.895833333954215, "step": 327 }, { - "completion_length": 1633.9583740234375, - "epoch": 0.18742857142857142, - "grad_norm": 0.32919415831565857, - "kl": 0.007015228271484375, + "completion_length": 1545.9375381469727, + "epoch": 0.37485714285714283, + "grad_norm": 2.3425772190093994, + "kl": 0.56195068359375, "learning_rate": 3.872689434630585e-07, - "loss": 0.0003, - "reward": 0.10486362967640162, - "reward_std": 0.0827696262858808, - "rewards/cosine_scaled_reward": -0.07059448771178722, - "rewards/format_reward": 0.75, + "loss": 0.0225, + "reward": 0.0559651258517988, + "reward_std": 0.11655561625957489, + "rewards/cosine_scaled_reward": -0.21696932520717382, + "rewards/format_reward": 0.7500000149011612, "step": 328 }, { - "completion_length": 849.0417022705078, - "epoch": 0.188, - "grad_norm": 0.3825984001159668, - "kl": 0.00890350341796875, + "completion_length": 979.2708511352539, + "epoch": 0.376, + "grad_norm": 3.092437744140625, + "kl": 0.1307373046875, "learning_rate": 3.843439512918949e-07, - "loss": 0.0004, - "reward": 0.07872669212520123, - "reward_std": 0.07250371528789401, - "rewards/cosine_scaled_reward": -0.26847486943006516, - "rewards/format_reward": 1.0, + "loss": 0.0052, + "reward": 0.16340086178388447, + "reward_std": 0.10502722533419728, + "rewards/cosine_scaled_reward": 0.060619720607064664, + "rewards/format_reward": 0.8125000111758709, "step": 329 }, { - "completion_length": 1930.7084045410156, - "epoch": 0.18857142857142858, - "grad_norm": 0.44835320115089417, - "kl": 0.0089263916015625, + "completion_length": 1155.9375305175781, + "epoch": 0.37714285714285717, + "grad_norm": 2.5028579235076904, + "kl": 0.5152435302734375, "learning_rate": 3.8142703296283953e-07, - "loss": 0.0004, - "reward": 0.1757338629104197, - "reward_std": 0.1174239031970501, - "rewards/cosine_scaled_reward": 0.16307002678513527, - "rewards/format_reward": 0.7083333432674408, + "loss": 0.0206, + "reward": 0.0951260298024863, + "reward_std": 0.10306549491360784, + "rewards/cosine_scaled_reward": -0.17680510133504868, + "rewards/format_reward": 0.8958333432674408, "step": 330 }, { - "completion_length": 2502.6250610351562, - "epoch": 0.18914285714285714, - "grad_norm": 0.2408701777458191, - "kl": 0.00754547119140625, + "completion_length": 1770.0208892822266, + "epoch": 0.3782857142857143, + "grad_norm": 1.7597198486328125, + "kl": 0.6001739501953125, "learning_rate": 3.785183306423767e-07, - "loss": 0.0003, - "reward": 0.10585480247391388, - "reward_std": 0.10277330316603184, - "rewards/cosine_scaled_reward": -0.02224121242761612, - "rewards/format_reward": 0.6666666679084301, + "loss": 0.024, + "reward": 0.10013224184513092, + "reward_std": 0.11445806687697768, + "rewards/cosine_scaled_reward": -0.0341799296438694, + "rewards/format_reward": 0.6458333395421505, "step": 331 }, { - "completion_length": 1428.3750305175781, - "epoch": 0.18971428571428572, - "grad_norm": 0.29567867517471313, - "kl": 0.005764007568359375, + "completion_length": 1468.270881652832, + "epoch": 0.37942857142857145, + "grad_norm": 2.4370524883270264, + "kl": 0.4661865234375, "learning_rate": 3.7561798609655373e-07, - "loss": 0.0002, - "reward": 0.14334412198513746, - "reward_std": 0.18330056220293045, - "rewards/cosine_scaled_reward": 0.006231578998267651, - "rewards/format_reward": 0.8333333358168602, + "loss": 0.0187, + "reward": 0.0839388279709965, + "reward_std": 0.08322880789637566, + "rewards/cosine_scaled_reward": -0.1475684866309166, + "rewards/format_reward": 0.7708333544433117, "step": 332 }, { - "completion_length": 1089.1666870117188, - "epoch": 0.19028571428571428, - "grad_norm": 0.320068895816803, - "kl": 0.00738525390625, + "completion_length": 1169.4166946411133, + "epoch": 0.38057142857142856, + "grad_norm": 1.17229163646698, + "kl": 0.29107666015625, "learning_rate": 3.72726140684072e-07, - "loss": 0.0003, - "reward": 0.11433824151754379, - "reward_std": 0.10415084846317768, - "rewards/cosine_scaled_reward": -0.14351750910282135, - "rewards/format_reward": 0.9583333432674408, + "loss": 0.0117, + "reward": 0.12938493536785245, + "reward_std": 0.13117663795128465, + "rewards/cosine_scaled_reward": -0.09842105722054839, + "rewards/format_reward": 0.9375000149011612, "step": 333 }, { - "completion_length": 2009.916748046875, - "epoch": 0.19085714285714286, - "grad_norm": 0.26171213388442993, - "kl": 0.0092620849609375, + "completion_length": 2001.9375305175781, + "epoch": 0.38171428571428573, + "grad_norm": 2.2998111248016357, + "kl": 1.0654296875, "learning_rate": 3.6984293534939737e-07, - "loss": 0.0004, - "reward": 0.09622881561517715, - "reward_std": 0.08779392950236797, - "rewards/cosine_scaled_reward": -0.17686026357114315, - "rewards/format_reward": 0.9166666865348816, + "loss": 0.0426, + "reward": 0.03344674052641494, + "reward_std": 0.12521635321900249, + "rewards/cosine_scaled_reward": -0.2376094851642847, + "rewards/format_reward": 0.666666679084301, "step": 334 }, { - "completion_length": 2221.166778564453, - "epoch": 0.19142857142857142, - "grad_norm": 0.297913134098053, - "kl": 0.00634002685546875, + "completion_length": 1584.0625534057617, + "epoch": 0.38285714285714284, + "grad_norm": 2.906867504119873, + "kl": 0.5097579956054688, "learning_rate": 3.6696851061588994e-07, - "loss": 0.0003, - "reward": 0.07134210062213242, - "reward_std": 0.17495033890008926, - "rewards/cosine_scaled_reward": -0.12311220541596413, - "rewards/format_reward": 0.666666679084301, + "loss": 0.0204, + "reward": 0.11645684402901679, + "reward_std": 0.12058440665714443, + "rewards/cosine_scaled_reward": -0.04469235986471176, + "rewards/format_reward": 0.7708333432674408, "step": 335 }, { - "completion_length": 1554.2500915527344, - "epoch": 0.192, - "grad_norm": 0.3545062243938446, - "kl": 0.007537841796875, + "completion_length": 1674.354206085205, + "epoch": 0.384, + "grad_norm": 1.6613434553146362, + "kl": 0.6746101379394531, "learning_rate": 3.641030065789562e-07, - "loss": 0.0003, - "reward": 0.13218490220606327, - "reward_std": 0.12200412433594465, - "rewards/cosine_scaled_reward": -0.04995713010430336, - "rewards/format_reward": 0.8750000149011612, + "loss": 0.027, + "reward": 0.10816401499323547, + "reward_std": 0.13928860798478127, + "rewards/cosine_scaled_reward": 0.030387197621166706, + "rewards/format_reward": 0.5625000074505806, "step": 336 }, { - "completion_length": 979.9166870117188, - "epoch": 0.19257142857142856, - "grad_norm": 0.35836145281791687, - "kl": 0.00778961181640625, + "completion_length": 1494.8334045410156, + "epoch": 0.3851428571428571, + "grad_norm": 2.075910806655884, + "kl": 0.5845260620117188, "learning_rate": 3.612465628992203e-07, - "loss": 0.0003, - "reward": 0.2534114308655262, - "reward_std": 0.1473761759698391, - "rewards/cosine_scaled_reward": 0.2439298890531063, - "rewards/format_reward": 1.0, + "loss": 0.0234, + "reward": 0.09280223221867345, + "reward_std": 0.1204152749851346, + "rewards/cosine_scaled_reward": -0.17956852912902832, + "rewards/format_reward": 0.8958333432674408, "step": 337 }, { - "completion_length": 1342.8333435058594, - "epoch": 0.19314285714285714, - "grad_norm": 0.30824002623558044, - "kl": 0.00574493408203125, + "completion_length": 1031.0625076293945, + "epoch": 0.3862857142857143, + "grad_norm": 1.3386896848678589, + "kl": 0.3839874267578125, "learning_rate": 3.5839931879571725e-07, - "loss": 0.0002, - "reward": 0.25899236742407084, - "reward_std": 0.056164965964853764, - "rewards/cosine_scaled_reward": 0.3065415769815445, - "rewards/format_reward": 0.9166666865348816, + "loss": 0.0154, + "reward": 0.12651887256652117, + "reward_std": 0.11517677642405033, + "rewards/cosine_scaled_reward": -0.06461506709456444, + "rewards/format_reward": 0.8541666772216558, "step": 338 }, { - "completion_length": 1336.0000457763672, - "epoch": 0.19371428571428573, - "grad_norm": 0.42957302927970886, - "kl": 0.011322021484375, + "completion_length": 1663.1250305175781, + "epoch": 0.38742857142857146, + "grad_norm": 2.8500008583068848, + "kl": 0.616424560546875, "learning_rate": 3.555614130391079e-07, - "loss": 0.0005, - "reward": 0.12907202867791057, - "reward_std": 0.06525825895369053, - "rewards/cosine_scaled_reward": -0.0594879649579525, - "rewards/format_reward": 0.875, + "loss": 0.0247, + "reward": 0.09466042937128805, + "reward_std": 0.09753736667335033, + "rewards/cosine_scaled_reward": -0.1472719321027398, + "rewards/format_reward": 0.8333333544433117, "step": 339 }, { - "completion_length": 2111.041717529297, - "epoch": 0.19428571428571428, - "grad_norm": 0.23951831459999084, - "kl": 0.005275726318359375, + "completion_length": 1387.4375381469727, + "epoch": 0.38857142857142857, + "grad_norm": 2.0746471881866455, + "kl": 0.4688873291015625, "learning_rate": 3.5273298394491515e-07, - "loss": 0.0002, - "reward": 0.11526640597730875, - "reward_std": 0.09964395221322775, - "rewards/cosine_scaled_reward": 0.044717367738485336, - "rewards/format_reward": 0.5833333358168602, + "loss": 0.0188, + "reward": 0.1363295007031411, + "reward_std": 0.12295715417712927, + "rewards/cosine_scaled_reward": -0.040779574774205685, + "rewards/format_reward": 0.875, "step": 340 }, { - "completion_length": 1666.4583435058594, - "epoch": 0.19485714285714287, - "grad_norm": 0.31324517726898193, - "kl": 0.0053253173828125, + "completion_length": 1612.5208740234375, + "epoch": 0.38971428571428574, + "grad_norm": 2.2794735431671143, + "kl": 0.814697265625, "learning_rate": 3.4991416936678276e-07, - "loss": 0.0002, - "reward": 0.08922275435179472, - "reward_std": 0.11431438475847244, - "rewards/cosine_scaled_reward": -0.11223939806222916, - "rewards/format_reward": 0.75, + "loss": 0.0326, + "reward": 0.1513365504797548, + "reward_std": 0.15308804996311665, + "rewards/cosine_scaled_reward": 0.11684986762702465, + "rewards/format_reward": 0.6250000149011612, "step": 341 }, { - "completion_length": 2162.5833435058594, - "epoch": 0.19542857142857142, - "grad_norm": 0.2762133479118347, - "kl": 0.007244110107421875, + "completion_length": 1542.833381652832, + "epoch": 0.39085714285714285, + "grad_norm": 2.162214756011963, + "kl": 0.7668609619140625, "learning_rate": 3.471051066897562e-07, - "loss": 0.0003, - "reward": 0.10517577454447746, - "reward_std": 0.10100515838712454, - "rewards/cosine_scaled_reward": 0.05931771546602249, - "rewards/format_reward": 0.5, + "loss": 0.0307, + "reward": 0.07486888614948839, + "reward_std": 0.08258337597362697, + "rewards/cosine_scaled_reward": -0.19311499642208219, + "rewards/format_reward": 0.8125000111758709, "step": 342 }, { - "completion_length": 1259.9167022705078, - "epoch": 0.196, - "grad_norm": 0.4986846148967743, - "kl": 0.01161956787109375, + "completion_length": 1584.8750457763672, + "epoch": 0.392, + "grad_norm": 2.241345167160034, + "kl": 0.691986083984375, "learning_rate": 3.4430593282358777e-07, - "loss": 0.0005, - "reward": 0.28647180274128914, - "reward_std": 0.18137922044843435, - "rewards/cosine_scaled_reward": 0.39783449098467827, - "rewards/format_reward": 0.875, + "loss": 0.0277, + "reward": 0.13888886122731492, + "reward_std": 0.1267297170124948, + "rewards/cosine_scaled_reward": 0.026948151644319296, + "rewards/format_reward": 0.7500000186264515, "step": 343 }, { - "completion_length": 2507.375, - "epoch": 0.19657142857142856, - "grad_norm": 0.36488795280456543, - "kl": 0.01192474365234375, + "completion_length": 1414.5833587646484, + "epoch": 0.3931428571428571, + "grad_norm": 4.952951908111572, + "kl": 0.78118896484375, "learning_rate": 3.4151678419606233e-07, - "loss": 0.0005, - "reward": 0.09034067811444402, - "reward_std": 0.09155634045600891, - "rewards/cosine_scaled_reward": 0.053079698234796524, - "rewards/format_reward": 0.4166666828095913, + "loss": 0.0312, + "reward": 0.1714508015429601, + "reward_std": 0.1391323572024703, + "rewards/cosine_scaled_reward": 0.07935434021055698, + "rewards/format_reward": 0.833333358168602, "step": 344 }, { - "completion_length": 2022.9583587646484, - "epoch": 0.19714285714285715, - "grad_norm": 0.2977750599384308, - "kl": 0.012126922607421875, + "completion_length": 1180.333366394043, + "epoch": 0.3942857142857143, + "grad_norm": 1.540398120880127, + "kl": 0.5207366943359375, "learning_rate": 3.387377967463493e-07, - "loss": 0.0005, - "reward": 0.06547224149107933, - "reward_std": 0.11361038591712713, - "rewards/cosine_scaled_reward": -0.12147113494575024, - "rewards/format_reward": 0.625, + "loss": 0.0209, + "reward": 0.13360386714339256, + "reward_std": 0.14522417169064283, + "rewards/cosine_scaled_reward": -0.034749194979667664, + "rewards/format_reward": 0.8541666865348816, "step": 345 }, { - "completion_length": 826.4166946411133, - "epoch": 0.1977142857142857, - "grad_norm": 0.483870267868042, - "kl": 0.00688934326171875, + "completion_length": 1519.6875305175781, + "epoch": 0.3954285714285714, + "grad_norm": 2.235276699066162, + "kl": 0.7563095092773438, "learning_rate": 3.359691059183761e-07, - "loss": 0.0003, - "reward": 0.1543113209772855, - "reward_std": 0.14186884555965662, - "rewards/cosine_scaled_reward": -0.04690166004002094, - "rewards/format_reward": 1.0, + "loss": 0.0302, + "reward": 0.06512028211727738, + "reward_std": 0.11110542109236121, + "rewards/cosine_scaled_reward": -0.17456839326769114, + "rewards/format_reward": 0.7291666865348816, "step": 346 }, { - "completion_length": 1192.0833740234375, - "epoch": 0.1982857142857143, - "grad_norm": 0.3235227167606354, - "kl": 0.00669097900390625, + "completion_length": 1307.8750305175781, + "epoch": 0.3965714285714286, + "grad_norm": 4.259415626525879, + "kl": 0.570068359375, "learning_rate": 3.3321084665422803e-07, - "loss": 0.0003, - "reward": 0.16040955064818263, - "reward_std": 0.12023740820586681, - "rewards/cosine_scaled_reward": -0.0062605226412415504, - "rewards/format_reward": 0.9583333432674408, + "loss": 0.0228, + "reward": 0.08080136496573687, + "reward_std": 0.10251538408920169, + "rewards/cosine_scaled_reward": -0.22406550850064377, + "rewards/format_reward": 0.9166666865348816, "step": 347 }, { - "completion_length": 1555.2084045410156, - "epoch": 0.19885714285714284, - "grad_norm": 0.3759072721004486, - "kl": 0.01299285888671875, + "completion_length": 1316.1250381469727, + "epoch": 0.3977142857142857, + "grad_norm": 1.8449828624725342, + "kl": 0.8987274169921875, "learning_rate": 3.3046315338757026e-07, - "loss": 0.0005, - "reward": 0.18190800957381725, - "reward_std": 0.1719740368425846, - "rewards/cosine_scaled_reward": 0.11778108216822147, - "rewards/format_reward": 0.8333333432674408, + "loss": 0.0359, + "reward": 0.11601153435185552, + "reward_std": 0.12534239329397678, + "rewards/cosine_scaled_reward": -0.09255390800535679, + "rewards/format_reward": 0.8541666939854622, "step": 348 }, { - "completion_length": 2267.4584045410156, - "epoch": 0.19942857142857143, - "grad_norm": 0.4392825663089752, - "kl": 0.009174346923828125, + "completion_length": 1266.5000305175781, + "epoch": 0.39885714285714285, + "grad_norm": 2.301177740097046, + "kl": 0.484771728515625, "learning_rate": 3.2772616003709616e-07, - "loss": 0.0004, - "reward": 0.17164004128426313, - "reward_std": 0.12854371964931488, - "rewards/cosine_scaled_reward": 0.19356799498200417, - "rewards/format_reward": 0.6250000074505806, + "loss": 0.0194, + "reward": 0.11150891752913594, + "reward_std": 0.10962018929421902, + "rewards/cosine_scaled_reward": -0.07765534892678261, + "rewards/format_reward": 0.8125000037252903, "step": 349 }, { - "completion_length": 1689.5000610351562, - "epoch": 0.2, - "grad_norm": 0.37206119298934937, - "kl": 0.007114410400390625, + "completion_length": 1148.3125343322754, + "epoch": 0.4, + "grad_norm": 2.0247983932495117, + "kl": 0.4848480224609375, "learning_rate": 3.250000000000001e-07, - "loss": 0.0003, - "reward": 0.15067086089402437, - "reward_std": 0.08571497537195683, - "rewards/cosine_scaled_reward": 0.026989895850419998, - "rewards/format_reward": 0.8333333432674408, + "loss": 0.0194, + "reward": 0.15430114115588367, + "reward_std": 0.14347478654235601, + "rewards/cosine_scaled_reward": 0.002343377098441124, + "rewards/format_reward": 0.8958333507180214, "step": 350 }, { - "completion_length": 1587.2917175292969, - "epoch": 0.20057142857142857, - "grad_norm": 0.4086715281009674, - "kl": 0.0078125, + "completion_length": 1485.7500457763672, + "epoch": 0.40114285714285713, + "grad_norm": 2.1637814044952393, + "kl": 0.888519287109375, "learning_rate": 3.222848061454764e-07, - "loss": 0.0003, - "reward": 0.11855471413582563, - "reward_std": 0.13508227095007896, - "rewards/cosine_scaled_reward": -0.0912880351243075, - "rewards/format_reward": 0.875, + "loss": 0.0356, + "reward": 0.12535253415990155, + "reward_std": 0.15357061475515366, + "rewards/cosine_scaled_reward": -0.002679265569895506, + "rewards/format_reward": 0.7291666865348816, "step": 351 }, { - "completion_length": 1948.6250610351562, - "epoch": 0.20114285714285715, - "grad_norm": 0.2982023060321808, - "kl": 0.00762176513671875, + "completion_length": 1839.8125534057617, + "epoch": 0.4022857142857143, + "grad_norm": 3.5423998832702637, + "kl": 1.449371337890625, "learning_rate": 3.195807108082429e-07, - "loss": 0.0003, - "reward": 0.18151408556150272, - "reward_std": 0.1891896389424801, - "rewards/cosine_scaled_reward": 0.11615890264511108, - "rewards/format_reward": 0.8333333432674408, + "loss": 0.058, + "reward": 0.09909449616679922, + "reward_std": 0.15971363987773657, + "rewards/cosine_scaled_reward": -0.0335394795256434, + "rewards/format_reward": 0.6458333507180214, "step": 352 }, { - "completion_length": 1194.9167022705078, - "epoch": 0.2017142857142857, - "grad_norm": 0.4766882359981537, - "kl": 0.01186370849609375, + "completion_length": 1224.6875305175781, + "epoch": 0.4034285714285714, + "grad_norm": 3.0395312309265137, + "kl": 0.6749420166015625, "learning_rate": 3.168878457820915e-07, - "loss": 0.0005, - "reward": 0.17849738337099552, - "reward_std": 0.11849691066890955, - "rewards/cosine_scaled_reward": 0.06622898019850254, - "rewards/format_reward": 0.9166666716337204, + "loss": 0.027, + "reward": 0.2176575637422502, + "reward_std": 0.1126766725210473, + "rewards/cosine_scaled_reward": 0.20828061178326607, + "rewards/format_reward": 0.8541666753590107, "step": 353 }, { - "completion_length": 2620.0000610351562, - "epoch": 0.2022857142857143, - "grad_norm": 0.26498156785964966, - "kl": 0.00844573974609375, + "completion_length": 1273.708381652832, + "epoch": 0.4045714285714286, + "grad_norm": 2.6028831005096436, + "kl": 0.7369384765625, "learning_rate": 3.142063423134644e-07, - "loss": 0.0003, - "reward": 0.058990323916077614, - "reward_std": 0.11095090536400676, - "rewards/cosine_scaled_reward": -0.11670423299074173, - "rewards/format_reward": 0.5833333432674408, + "loss": 0.0296, + "reward": 0.12516162917017937, + "reward_std": 0.09799297200515866, + "rewards/cosine_scaled_reward": -0.0536028565838933, + "rewards/format_reward": 0.8333333432674408, "step": 354 }, { - "completion_length": 1842.6250457763672, - "epoch": 0.20285714285714285, - "grad_norm": 0.4641972482204437, - "kl": 0.0177001953125, + "completion_length": 1033.4166946411133, + "epoch": 0.4057142857142857, + "grad_norm": 8.765235900878906, + "kl": 0.76263427734375, "learning_rate": 3.115363310950578e-07, - "loss": 0.0007, - "reward": 0.1758787203580141, - "reward_std": 0.18893010169267654, - "rewards/cosine_scaled_reward": 0.11979078315198421, - "rewards/format_reward": 0.7916666716337204, + "loss": 0.0305, + "reward": 0.2028479753062129, + "reward_std": 0.13977694138884544, + "rewards/cosine_scaled_reward": 0.11576181277632713, + "rewards/format_reward": 0.9583333432674408, "step": 355 }, { - "completion_length": 1170.1250457763672, - "epoch": 0.20342857142857143, - "grad_norm": 0.3953860104084015, - "kl": 0.0081329345703125, + "completion_length": 1755.3959197998047, + "epoch": 0.40685714285714286, + "grad_norm": 8.445033073425293, + "kl": 1.7131805419921875, "learning_rate": 3.0887794225945143e-07, - "loss": 0.0003, - "reward": 0.08785539958626032, - "reward_std": 0.09181052353233099, - "rewards/cosine_scaled_reward": -0.22207785211503506, - "rewards/format_reward": 0.9583333432674408, + "loss": 0.0685, + "reward": 0.06232760299462825, + "reward_std": 0.15073465276509523, + "rewards/cosine_scaled_reward": -0.1625305749475956, + "rewards/format_reward": 0.6875000223517418, "step": 356 }, { - "completion_length": 2746.750030517578, - "epoch": 0.204, - "grad_norm": 0.38290905952453613, - "kl": 0.0120086669921875, + "completion_length": 1907.8750686645508, + "epoch": 0.408, + "grad_norm": 2.7435877323150635, + "kl": 1.49609375, "learning_rate": 3.062313053727671e-07, - "loss": 0.0005, - "reward": 0.0031569083221256733, - "reward_std": 0.0973870512098074, - "rewards/cosine_scaled_reward": -0.17823771573603153, - "rewards/format_reward": 0.3750000149011612, + "loss": 0.0599, + "reward": 0.045256637153215706, + "reward_std": 0.10663844272494316, + "rewards/cosine_scaled_reward": -0.23459493229165673, + "rewards/format_reward": 0.7291666846722364, "step": 357 }, { - "completion_length": 1726.7083587646484, - "epoch": 0.20457142857142857, - "grad_norm": 0.31264233589172363, - "kl": 0.0087890625, + "completion_length": 1480.333366394043, + "epoch": 0.40914285714285714, + "grad_norm": 2.5645945072174072, + "kl": 0.68377685546875, "learning_rate": 3.0359654942835247e-07, - "loss": 0.0004, - "reward": 0.08988821506500244, - "reward_std": 0.09381802566349506, - "rewards/cosine_scaled_reward": -0.11072870343923569, - "rewards/format_reward": 0.75, + "loss": 0.0273, + "reward": 0.1269954121671617, + "reward_std": 0.12238077353686094, + "rewards/cosine_scaled_reward": -0.08821228123269975, + "rewards/format_reward": 0.9166666865348816, "step": 358 }, { - "completion_length": 1186.4166793823242, - "epoch": 0.20514285714285715, - "grad_norm": 0.30002641677856445, - "kl": 0.00556182861328125, + "completion_length": 1268.8750381469727, + "epoch": 0.4102857142857143, + "grad_norm": 9.816429138183594, + "kl": 1.206329345703125, "learning_rate": 3.0097380284049523e-07, - "loss": 0.0002, - "reward": 0.28468725830316544, - "reward_std": 0.1832931600511074, - "rewards/cosine_scaled_reward": 0.3569262996315956, - "rewards/format_reward": 0.9583333432674408, + "loss": 0.0483, + "reward": 0.0941942217759788, + "reward_std": 0.09892157511785626, + "rewards/cosine_scaled_reward": -0.17397763207554817, + "rewards/format_reward": 0.8958333507180214, "step": 359 }, { - "completion_length": 1543.3750457763672, - "epoch": 0.2057142857142857, - "grad_norm": 0.43645501136779785, - "kl": 0.01152801513671875, + "completion_length": 1494.9583587646484, + "epoch": 0.4114285714285714, + "grad_norm": 2.043757915496826, + "kl": 0.8025665283203125, "learning_rate": 2.9836319343816397e-07, - "loss": 0.0005, - "reward": 0.21140727028250694, - "reward_std": 0.12353593099396676, - "rewards/cosine_scaled_reward": 0.23092055320739746, - "rewards/format_reward": 0.791666679084301, + "loss": 0.0321, + "reward": 0.14919199608266354, + "reward_std": 0.16920089721679688, + "rewards/cosine_scaled_reward": 0.01944921351969242, + "rewards/format_reward": 0.833333358168602, "step": 360 }, { - "completion_length": 2161.666778564453, - "epoch": 0.2062857142857143, - "grad_norm": 0.4098803997039795, - "kl": 0.0108184814453125, + "completion_length": 1387.5000457763672, + "epoch": 0.4125714285714286, + "grad_norm": 3.627060890197754, + "kl": 0.604827880859375, "learning_rate": 2.9576484845877793e-07, - "loss": 0.0004, - "reward": 0.07429497828707099, - "reward_std": 0.08556102309376001, - "rewards/cosine_scaled_reward": -0.1573917493224144, - "rewards/format_reward": 0.7500000149011612, + "loss": 0.0243, + "reward": 0.0996842200984247, + "reward_std": 0.12515971716493368, + "rewards/cosine_scaled_reward": -0.14535773918032646, + "rewards/format_reward": 0.8750000149011612, "step": 361 }, { - "completion_length": 1550.6667022705078, - "epoch": 0.20685714285714285, - "grad_norm": 0.2914271354675293, - "kl": 0.00815582275390625, + "completion_length": 1282.7291946411133, + "epoch": 0.4137142857142857, + "grad_norm": 2.7785682678222656, + "kl": 0.9784774780273438, "learning_rate": 2.931788945420058e-07, - "loss": 0.0003, - "reward": 0.20716216741129756, - "reward_std": 0.13167328014969826, - "rewards/cosine_scaled_reward": 0.19552084058523178, - "rewards/format_reward": 0.8333333358168602, + "loss": 0.0392, + "reward": 0.15694484941195697, + "reward_std": 0.08703827066347003, + "rewards/cosine_scaled_reward": 0.005612561479210854, + "rewards/format_reward": 0.8958333432674408, "step": 362 }, { - "completion_length": 2079.0000915527344, - "epoch": 0.20742857142857143, - "grad_norm": 0.24376393854618073, - "kl": 0.00457763671875, + "completion_length": 981.1875267028809, + "epoch": 0.41485714285714287, + "grad_norm": 2.8391177654266357, + "kl": 0.4031524658203125, "learning_rate": 2.9060545772359305e-07, - "loss": 0.0002, - "reward": 0.10386873036623001, - "reward_std": 0.12028491590172052, - "rewards/cosine_scaled_reward": -0.06645945087075233, - "rewards/format_reward": 0.7500000111758709, + "loss": 0.0162, + "reward": 0.20889083773363382, + "reward_std": 0.09412059234455228, + "rewards/cosine_scaled_reward": 0.14475639257580042, + "rewards/format_reward": 0.8958333432674408, "step": 363 }, { - "completion_length": 1426.1666870117188, - "epoch": 0.208, - "grad_norm": 0.3742162585258484, - "kl": 0.005340576171875, + "completion_length": 1328.2500457763672, + "epoch": 0.416, + "grad_norm": 1.4150444269180298, + "kl": 0.5859603881835938, "learning_rate": 2.8804466342921987e-07, - "loss": 0.0002, - "reward": 0.07049184036441147, - "reward_std": 0.09780162759125233, - "rewards/cosine_scaled_reward": -0.19005949795246124, - "rewards/format_reward": 0.7916666679084301, + "loss": 0.0235, + "reward": 0.05011219787411392, + "reward_std": 0.09909166162833571, + "rewards/cosine_scaled_reward": -0.26201771944761276, + "rewards/format_reward": 0.8125000260770321, "step": 364 }, { - "completion_length": 1153.5417175292969, - "epoch": 0.20857142857142857, - "grad_norm": 0.35599327087402344, - "kl": 0.0100555419921875, + "completion_length": 1732.4375457763672, + "epoch": 0.41714285714285715, + "grad_norm": 3.007594585418701, + "kl": 1.2857818603515625, "learning_rate": 2.854966364683872e-07, - "loss": 0.0004, - "reward": 0.166679373010993, - "reward_std": 0.12070106063038111, - "rewards/cosine_scaled_reward": -0.009944230318069458, - "rewards/format_reward": 1.0, + "loss": 0.0514, + "reward": 0.13636061176657677, + "reward_std": 0.1195504111237824, + "rewards/cosine_scaled_reward": 0.015712120453827083, + "rewards/format_reward": 0.750000013038516, "step": 365 }, { - "completion_length": 1327.625015258789, - "epoch": 0.20914285714285713, - "grad_norm": 0.4017048180103302, - "kl": 0.008220672607421875, + "completion_length": 1108.0000495910645, + "epoch": 0.41828571428571426, + "grad_norm": 4.287596702575684, + "kl": 0.290069580078125, "learning_rate": 2.829615010283344e-07, - "loss": 0.0003, - "reward": 0.20296469517052174, - "reward_std": 0.08326233178377151, - "rewards/cosine_scaled_reward": 0.11577039491385221, - "rewards/format_reward": 0.9583333432674408, + "loss": 0.0116, + "reward": 0.15384384151548147, + "reward_std": 0.1040445901453495, + "rewards/cosine_scaled_reward": 0.009631453547626734, + "rewards/format_reward": 0.8750000149011612, "step": 366 }, { - "completion_length": 1248.1667175292969, - "epoch": 0.20971428571428571, - "grad_norm": 0.3973245918750763, - "kl": 0.01091766357421875, + "completion_length": 1712.8542022705078, + "epoch": 0.41942857142857143, + "grad_norm": 2.6099181175231934, + "kl": 0.9123382568359375, "learning_rate": 2.8043938066798645e-07, - "loss": 0.0004, - "reward": 0.14192159054800868, - "reward_std": 0.08490000059828162, - "rewards/cosine_scaled_reward": -0.0199802964925766, - "rewards/format_reward": 0.875, + "loss": 0.0365, + "reward": 0.0756345079280436, + "reward_std": 0.11609864910133183, + "rewards/cosine_scaled_reward": -0.12630227487534285, + "rewards/format_reward": 0.6875000111758709, "step": 367 }, { - "completion_length": 2086.041717529297, - "epoch": 0.2102857142857143, - "grad_norm": 0.37026768922805786, - "kl": 0.01041412353515625, + "completion_length": 1932.4375381469727, + "epoch": 0.4205714285714286, + "grad_norm": 4.4302215576171875, + "kl": 1.157470703125, "learning_rate": 2.7793039831193133e-07, - "loss": 0.0004, - "reward": 0.06479707430116832, - "reward_std": 0.08738323394209146, - "rewards/cosine_scaled_reward": -0.20435304380953312, - "rewards/format_reward": 0.7916666865348816, + "loss": 0.0463, + "reward": 0.05102244240697473, + "reward_std": 0.14196251472458243, + "rewards/cosine_scaled_reward": -0.1656601596623659, + "rewards/format_reward": 0.6250000074505806, "step": 368 }, { - "completion_length": 2242.2916870117188, - "epoch": 0.21085714285714285, - "grad_norm": 0.4381715953350067, - "kl": 0.012054443359375, + "completion_length": 1801.0000686645508, + "epoch": 0.4217142857142857, + "grad_norm": 2.231231451034546, + "kl": 1.43145751953125, "learning_rate": 2.7543467624442956e-07, - "loss": 0.0005, - "reward": 0.02100947010330856, - "reward_std": 0.1212756559252739, - "rewards/cosine_scaled_reward": -0.2307599112391472, - "rewards/format_reward": 0.5833333507180214, + "loss": 0.0572, + "reward": 0.13401565965614282, + "reward_std": 0.1620243601500988, + "rewards/cosine_scaled_reward": 0.007201282307505608, + "rewards/format_reward": 0.7500000204890966, "step": 369 }, { - "completion_length": 1030.1666870117188, - "epoch": 0.21142857142857144, - "grad_norm": 0.3970824182033539, - "kl": 0.00518798828125, + "completion_length": 1558.6250267028809, + "epoch": 0.4228571428571429, + "grad_norm": 2.5228145122528076, + "kl": 1.020751953125, "learning_rate": 2.729523361034538e-07, - "loss": 0.0002, - "reward": 0.07259168848395348, - "reward_std": 0.08260688814334571, - "rewards/cosine_scaled_reward": -0.2663383586332202, - "rewards/format_reward": 0.9583333432674408, + "loss": 0.0408, + "reward": 0.09310736267070752, + "reward_std": 0.09267222543712705, + "rewards/cosine_scaled_reward": -0.09523181803524494, + "rewards/format_reward": 0.7291666865348816, "step": 370 }, { - "completion_length": 1592.0416717529297, - "epoch": 0.212, - "grad_norm": 0.2999872863292694, - "kl": 0.00794219970703125, + "completion_length": 933.7292060852051, + "epoch": 0.424, + "grad_norm": 1.8327668905258179, + "kl": 0.6253280639648438, "learning_rate": 2.7048349887476037e-07, - "loss": 0.0003, - "reward": 0.20434065232984722, - "reward_std": 0.18057554587721825, - "rewards/cosine_scaled_reward": 0.18890590965747833, - "rewards/format_reward": 0.8333333432674408, + "loss": 0.025, + "reward": 0.1391057469882071, + "reward_std": 0.09823393146507442, + "rewards/cosine_scaled_reward": 0.023010117933154106, + "rewards/format_reward": 0.7708333469927311, "step": 371 }, { - "completion_length": 2246.541732788086, - "epoch": 0.21257142857142858, - "grad_norm": 0.3598721921443939, - "kl": 0.01320648193359375, + "completion_length": 1648.6458740234375, + "epoch": 0.42514285714285716, + "grad_norm": 3.7653868198394775, + "kl": 0.7486572265625, "learning_rate": 2.6802828488599294e-07, - "loss": 0.0005, - "reward": 0.020206657238304615, - "reward_std": 0.06623952649533749, - "rewards/cosine_scaled_reward": -0.21290142834186554, - "rewards/format_reward": 0.5416666679084301, + "loss": 0.0299, + "reward": 0.11916764298803173, + "reward_std": 0.16286169085651636, + "rewards/cosine_scaled_reward": -0.006715672556310892, + "rewards/format_reward": 0.7083333544433117, "step": 372 }, { - "completion_length": 1374.8333740234375, - "epoch": 0.21314285714285713, - "grad_norm": 0.3151344656944275, - "kl": 0.0064849853515625, + "completion_length": 899.7083587646484, + "epoch": 0.42628571428571427, + "grad_norm": 2.0490570068359375, + "kl": 0.393463134765625, "learning_rate": 2.655868138008171e-07, - "loss": 0.0003, - "reward": 0.1322591225616634, - "reward_std": 0.09143866831436753, - "rewards/cosine_scaled_reward": -0.06762194633483887, - "rewards/format_reward": 0.9166666865348816, + "loss": 0.0157, + "reward": 0.11241465236525983, + "reward_std": 0.1083641320001334, + "rewards/cosine_scaled_reward": -0.0999544644728303, + "rewards/format_reward": 0.8541666828095913, "step": 373 }, { - "completion_length": 1958.0833587646484, - "epoch": 0.21371428571428572, - "grad_norm": 0.4066512882709503, - "kl": 0.0143890380859375, + "completion_length": 1088.1042022705078, + "epoch": 0.42742857142857144, + "grad_norm": 2.575913667678833, + "kl": 0.3724212646484375, "learning_rate": 2.631592046130896e-07, - "loss": 0.0006, - "reward": 0.03527960833162069, - "reward_std": 0.05944979004561901, - "rewards/cosine_scaled_reward": -0.2710474990308285, - "rewards/format_reward": 0.7500000111758709, + "loss": 0.0149, + "reward": 0.1537247821688652, + "reward_std": 0.144410849083215, + "rewards/cosine_scaled_reward": 0.0019068364053964615, + "rewards/format_reward": 0.8958333432674408, "step": 374 }, { - "completion_length": 2048.6250610351562, - "epoch": 0.21428571428571427, - "grad_norm": 0.31716644763946533, - "kl": 0.00885009765625, + "completion_length": 1255.9375457763672, + "epoch": 0.42857142857142855, + "grad_norm": 4.075099468231201, + "kl": 1.0, "learning_rate": 2.6074557564105724e-07, - "loss": 0.0004, - "reward": 0.06908245524391532, - "reward_std": 0.1118032718077302, - "rewards/cosine_scaled_reward": -0.1307327002286911, - "rewards/format_reward": 0.6666666828095913, + "loss": 0.04, + "reward": 0.11958665121346712, + "reward_std": 0.17436719313263893, + "rewards/cosine_scaled_reward": -0.02865603007376194, + "rewards/format_reward": 0.7500000149011612, "step": 375 }, { - "completion_length": 2341.750030517578, - "epoch": 0.21485714285714286, - "grad_norm": 0.4057767391204834, - "kl": 0.01312255859375, + "completion_length": 1406.4583740234375, + "epoch": 0.4297142857142857, + "grad_norm": 4.192153453826904, + "kl": 1.0216064453125, "learning_rate": 2.583460445215911e-07, - "loss": 0.0005, - "reward": 0.03595755062997341, - "reward_std": 0.09900238458067179, - "rewards/cosine_scaled_reward": -0.16343829687684774, - "rewards/format_reward": 0.5416666679084301, + "loss": 0.0409, + "reward": 0.11533228470943868, + "reward_std": 0.1220639725215733, + "rewards/cosine_scaled_reward": -0.10069759003818035, + "rewards/format_reward": 0.8750000149011612, "step": 376 }, { - "completion_length": 1181.7917022705078, - "epoch": 0.21542857142857144, - "grad_norm": 0.38229188323020935, - "kl": 0.00901031494140625, + "completion_length": 1517.8542175292969, + "epoch": 0.4308571428571429, + "grad_norm": 3.5561301708221436, + "kl": 0.7474365234375, "learning_rate": 2.5596072820445254e-07, - "loss": 0.0004, - "reward": 0.14246569201350212, - "reward_std": 0.10488046705722809, - "rewards/cosine_scaled_reward": 0.0011578947305679321, - "rewards/format_reward": 0.8333333358168602, + "loss": 0.0299, + "reward": 0.06857469491660595, + "reward_std": 0.12185074761509895, + "rewards/cosine_scaled_reward": -0.20741149224340916, + "rewards/format_reward": 0.8125000298023224, "step": 377 }, { - "completion_length": 1232.9167175292969, - "epoch": 0.216, - "grad_norm": 0.3913348615169525, - "kl": 0.00858306884765625, + "completion_length": 1371.6458797454834, + "epoch": 0.432, + "grad_norm": 2.8652353286743164, + "kl": 0.5273590087890625, "learning_rate": 2.5358974294659373e-07, - "loss": 0.0003, - "reward": 0.04195020208135247, - "reward_std": 0.07212261809036136, - "rewards/cosine_scaled_reward": -0.33642251044511795, - "rewards/format_reward": 0.9166666716337204, + "loss": 0.0211, + "reward": 0.15830194216687232, + "reward_std": 0.12496394384652376, + "rewards/cosine_scaled_reward": 0.02524256706237793, + "rewards/format_reward": 0.8333333432674408, "step": 378 }, { - "completion_length": 1434.5, - "epoch": 0.21657142857142858, - "grad_norm": 0.2608247995376587, - "kl": 0.0068359375, + "completion_length": 1719.6666946411133, + "epoch": 0.43314285714285716, + "grad_norm": 1.9505208730697632, + "kl": 0.86883544921875, "learning_rate": 2.512332043064913e-07, - "loss": 0.0003, - "reward": 0.11376794520765543, - "reward_std": 0.13872879184782505, - "rewards/cosine_scaled_reward": -0.12288481742143631, - "rewards/format_reward": 0.9166666865348816, + "loss": 0.0347, + "reward": 0.07183210924267769, + "reward_std": 0.08735731011256576, + "rewards/cosine_scaled_reward": -0.19765841774642467, + "rewards/format_reward": 0.812500013038516, "step": 379 }, { - "completion_length": 911.1250457763672, - "epoch": 0.21714285714285714, - "grad_norm": 0.4077244699001312, - "kl": 0.00713348388671875, + "completion_length": 1268.458366394043, + "epoch": 0.4342857142857143, + "grad_norm": 2.074549436569214, + "kl": 0.7683868408203125, "learning_rate": 2.488912271385139e-07, - "loss": 0.0003, - "reward": 0.16560986638069153, - "reward_std": 0.0655624121427536, - "rewards/cosine_scaled_reward": 0.007554691284894943, - "rewards/format_reward": 0.9583333432674408, + "loss": 0.0308, + "reward": 0.09168538171797991, + "reward_std": 0.1286314483731985, + "rewards/cosine_scaled_reward": -0.10583510436117649, + "rewards/format_reward": 0.7500000055879354, "step": 380 }, { - "completion_length": 1085.2917022705078, - "epoch": 0.21771428571428572, - "grad_norm": 0.40816831588745117, - "kl": 0.0084228515625, + "completion_length": 1851.1042251586914, + "epoch": 0.43542857142857144, + "grad_norm": 3.475543975830078, + "kl": 1.308349609375, "learning_rate": 2.465639255873246e-07, - "loss": 0.0003, - "reward": 0.14622489735484123, - "reward_std": 0.12954704836010933, - "rewards/cosine_scaled_reward": -0.05051956232637167, - "rewards/format_reward": 0.9583333432674408, + "loss": 0.0523, + "reward": 0.017921562888659537, + "reward_std": 0.10069577465765178, + "rewards/cosine_scaled_reward": -0.26895514875650406, + "rewards/format_reward": 0.6458333525806665, "step": 381 }, { - "completion_length": 1053.8750305175781, - "epoch": 0.21828571428571428, - "grad_norm": 0.3615580201148987, - "kl": 0.00667572021484375, + "completion_length": 1056.208351135254, + "epoch": 0.43657142857142855, + "grad_norm": 3.336723566055298, + "kl": 0.65924072265625, "learning_rate": 2.4425141308231765e-07, - "loss": 0.0003, - "reward": 0.23885554261505604, - "reward_std": 0.1068088416941464, - "rewards/cosine_scaled_reward": 0.20313109457492828, - "rewards/format_reward": 1.0, + "loss": 0.0264, + "reward": 0.0698289682622999, + "reward_std": 0.10450809169560671, + "rewards/cosine_scaled_reward": -0.20110327936708927, + "rewards/format_reward": 0.8125000149011612, "step": 382 }, { - "completion_length": 1816.1667022705078, - "epoch": 0.21885714285714286, - "grad_norm": 0.3170018494129181, - "kl": 0.008575439453125, + "completion_length": 1278.520881652832, + "epoch": 0.4377142857142857, + "grad_norm": 1.7827008962631226, + "kl": 1.0176849365234375, "learning_rate": 2.4195380233209006e-07, - "loss": 0.0003, - "reward": 0.1350853955373168, - "reward_std": 0.16153337061405182, - "rewards/cosine_scaled_reward": -0.00037962011992931366, - "rewards/format_reward": 0.791666679084301, + "loss": 0.0407, + "reward": 0.15380790340714157, + "reward_std": 0.1372139612212777, + "rewards/cosine_scaled_reward": 0.03767992998473346, + "rewards/format_reward": 0.8125000074505806, "step": 383 }, { - "completion_length": 1950.375, - "epoch": 0.21942857142857142, - "grad_norm": 0.38474762439727783, - "kl": 0.010650634765625, + "completion_length": 1171.3541946411133, + "epoch": 0.43885714285714283, + "grad_norm": 2.2583577632904053, + "kl": 0.81756591796875, "learning_rate": 2.3967120531894857e-07, - "loss": 0.0004, - "reward": 0.0158225460909307, - "reward_std": 0.07315236143767834, - "rewards/cosine_scaled_reward": -0.3099482133984566, - "rewards/format_reward": 0.7083333432674408, + "loss": 0.0327, + "reward": 0.21438380563631654, + "reward_std": 0.18565166369080544, + "rewards/cosine_scaled_reward": 0.19361148471944034, + "rewards/format_reward": 0.8541666865348816, "step": 384 }, { - "completion_length": 2737.8333740234375, - "epoch": 0.22, - "grad_norm": 0.2779238224029541, - "kl": 0.0101776123046875, + "completion_length": 1468.4584121704102, + "epoch": 0.44, + "grad_norm": 3.9643666744232178, + "kl": 0.9686126708984375, "learning_rate": 2.374037332934512e-07, - "loss": 0.0004, - "reward": 0.06211519613862038, - "reward_std": 0.1654689945280552, - "rewards/cosine_scaled_reward": -0.06907939910888672, - "rewards/format_reward": 0.5000000111758709, + "loss": 0.0388, + "reward": 0.09368883771821856, + "reward_std": 0.12416216172277927, + "rewards/cosine_scaled_reward": -0.13376147765666246, + "rewards/format_reward": 0.8125000298023224, "step": 385 }, { - "completion_length": 1792.2916870117188, - "epoch": 0.22057142857142858, - "grad_norm": 0.4043613374233246, - "kl": 0.01165008544921875, + "completion_length": 1320.7292022705078, + "epoch": 0.44114285714285717, + "grad_norm": 2.8084425926208496, + "kl": 0.906951904296875, "learning_rate": 2.3515149676898552e-07, - "loss": 0.0005, - "reward": 0.09801781643182039, - "reward_std": 0.10368974041193724, - "rewards/cosine_scaled_reward": -0.04495358094573021, - "rewards/format_reward": 0.6666666865348816, + "loss": 0.0363, + "reward": 0.18752056313678622, + "reward_std": 0.09679444809444249, + "rewards/cosine_scaled_reward": 0.0695192702114582, + "rewards/format_reward": 0.9375000149011612, "step": 386 }, { - "completion_length": 2386.7084045410156, - "epoch": 0.22114285714285714, - "grad_norm": 0.27550241351127625, - "kl": 0.0086822509765625, + "completion_length": 1446.6667175292969, + "epoch": 0.4422857142857143, + "grad_norm": 2.2158679962158203, + "kl": 0.94366455078125, "learning_rate": 2.3291460551638237e-07, - "loss": 0.0003, - "reward": 0.2090755831450224, - "reward_std": 0.14855033997446299, - "rewards/cosine_scaled_reward": 0.24086257070302963, - "rewards/format_reward": 0.75, + "loss": 0.0378, + "reward": 0.13372905366122723, + "reward_std": 0.09293439192697406, + "rewards/cosine_scaled_reward": -0.017829248681664467, + "rewards/format_reward": 0.8125000204890966, "step": 387 }, { - "completion_length": 2094.166732788086, - "epoch": 0.22171428571428572, - "grad_norm": 0.3434302508831024, - "kl": 0.0134429931640625, + "completion_length": 1416.458366394043, + "epoch": 0.44342857142857145, + "grad_norm": 3.4314982891082764, + "kl": 1.20458984375, "learning_rate": 2.306931685585657e-07, - "loss": 0.0005, - "reward": 0.07938379980623722, - "reward_std": 0.16526173241436481, - "rewards/cosine_scaled_reward": -0.10007171449251473, - "rewards/format_reward": 0.6666666716337204, + "loss": 0.0482, + "reward": 0.10979634639807045, + "reward_std": 0.14348772866651416, + "rewards/cosine_scaled_reward": -0.09102598764002323, + "rewards/format_reward": 0.8125000074505806, "step": 388 }, { - "completion_length": 1078.2083587646484, - "epoch": 0.22228571428571428, - "grad_norm": 0.36017942428588867, - "kl": 0.008579254150390625, + "completion_length": 1301.3750381469727, + "epoch": 0.44457142857142856, + "grad_norm": 2.1517300605773926, + "kl": 0.8640289306640625, "learning_rate": 2.2848729416523859e-07, - "loss": 0.0003, - "reward": 0.14833149127662182, - "reward_std": 0.1003393866121769, - "rewards/cosine_scaled_reward": -0.044327083975076675, - "rewards/format_reward": 0.9583333432674408, + "loss": 0.0345, + "reward": 0.12174346391111612, + "reward_std": 0.1113532236777246, + "rewards/cosine_scaled_reward": -0.11768272100016475, + "rewards/format_reward": 0.9375000149011612, "step": 389 }, { - "completion_length": 1731.7500457763672, - "epoch": 0.22285714285714286, - "grad_norm": 0.2937983274459839, - "kl": 0.00847625732421875, + "completion_length": 1864.270881652832, + "epoch": 0.44571428571428573, + "grad_norm": 2.703826665878296, + "kl": 1.3235321044921875, "learning_rate": 2.2629708984760706e-07, - "loss": 0.0003, - "reward": 0.1133604384958744, - "reward_std": 0.1449694661423564, - "rewards/cosine_scaled_reward": -0.1039534006267786, - "rewards/format_reward": 0.8750000149011612, + "loss": 0.053, + "reward": 0.08123930124565959, + "reward_std": 0.12660485692322254, + "rewards/cosine_scaled_reward": -0.06472943164408207, + "rewards/format_reward": 0.6041666753590107, "step": 390 }, { - "completion_length": 2893.041717529297, - "epoch": 0.22342857142857142, - "grad_norm": 0.3667258620262146, - "kl": 0.0148468017578125, + "completion_length": 1732.4167137145996, + "epoch": 0.44685714285714284, + "grad_norm": 3.1918063163757324, + "kl": 1.76708984375, "learning_rate": 2.2412266235313973e-07, - "loss": 0.0006, - "reward": 0.023702453976511606, - "reward_std": 0.1544233299791813, - "rewards/cosine_scaled_reward": -0.0989939762512222, - "rewards/format_reward": 0.3333333395421505, + "loss": 0.0708, + "reward": 0.14286700636148453, + "reward_std": 0.18649776838719845, + "rewards/cosine_scaled_reward": 0.028382533695548773, + "rewards/format_reward": 0.7500000111758709, "step": 391 }, { - "completion_length": 1818.9584045410156, - "epoch": 0.224, - "grad_norm": 0.31116607785224915, - "kl": 0.00978851318359375, + "completion_length": 1371.7916946411133, + "epoch": 0.448, + "grad_norm": 3.6219229698181152, + "kl": 0.9765167236328125, "learning_rate": 2.2196411766036487e-07, - "loss": 0.0004, - "reward": 0.18491296656429768, - "reward_std": 0.13328682258725166, - "rewards/cosine_scaled_reward": 0.06605712324380875, - "rewards/format_reward": 0.9583333432674408, + "loss": 0.039, + "reward": 0.06799169280566275, + "reward_std": 0.09887422667816281, + "rewards/cosine_scaled_reward": -0.19014379568398, + "rewards/format_reward": 0.7708333488553762, "step": 392 }, { - "completion_length": 1275.2500610351562, - "epoch": 0.22457142857142856, - "grad_norm": 0.29684415459632874, - "kl": 0.00753021240234375, + "completion_length": 1691.1666870117188, + "epoch": 0.4491428571428571, + "grad_norm": 2.618884801864624, + "kl": 1.6240234375, "learning_rate": 2.1982156097370557e-07, - "loss": 0.0003, - "reward": 0.21253256127238274, - "reward_std": 0.14230689406394958, - "rewards/cosine_scaled_reward": 0.16549073345959187, - "rewards/format_reward": 0.9166666716337204, + "loss": 0.0649, + "reward": 0.06132141873240471, + "reward_std": 0.10926410043612123, + "rewards/cosine_scaled_reward": -0.12576089892536402, + "rewards/format_reward": 0.6041666809469461, "step": 393 }, { - "completion_length": 774.3333587646484, - "epoch": 0.22514285714285714, - "grad_norm": 0.43091732263565063, - "kl": 0.01126861572265625, + "completion_length": 1537.270866394043, + "epoch": 0.4502857142857143, + "grad_norm": 6.063929557800293, + "kl": 1.8046112060546875, "learning_rate": 2.1769509671835223e-07, - "loss": 0.0004, - "reward": 0.16364898160099983, - "reward_std": 0.1272161053493619, - "rewards/cosine_scaled_reward": -0.020448708906769753, - "rewards/format_reward": 1.0, + "loss": 0.0722, + "reward": 0.0685962769202888, + "reward_std": 0.09430173807777464, + "rewards/cosine_scaled_reward": -0.1982439812272787, + "rewards/format_reward": 0.7916666772216558, "step": 394 }, { - "completion_length": 1754.5416870117188, - "epoch": 0.2257142857142857, - "grad_norm": 0.3315636217594147, - "kl": 0.01122283935546875, + "completion_length": 1333.958396911621, + "epoch": 0.4514285714285714, + "grad_norm": 4.921578884124756, + "kl": 1.4940185546875, "learning_rate": 2.1558482853517253e-07, - "loss": 0.0004, - "reward": 0.1863310383632779, - "reward_std": 0.12636925652623177, - "rewards/cosine_scaled_reward": 0.0882805734872818, - "rewards/format_reward": 0.9166666865348816, + "loss": 0.0598, + "reward": 0.1565821710973978, + "reward_std": 0.11654932564124465, + "rewards/cosine_scaled_reward": 0.03457294497638941, + "rewards/format_reward": 0.8125000111758709, "step": 395 }, { - "completion_length": 1317.9583892822266, - "epoch": 0.22628571428571428, - "grad_norm": 0.4063781797885895, - "kl": 0.01039886474609375, + "completion_length": 1335.645866394043, + "epoch": 0.45257142857142857, + "grad_norm": 2.5358009338378906, + "kl": 0.828582763671875, "learning_rate": 2.134908592756607e-07, - "loss": 0.0004, - "reward": 0.07750673312693834, - "reward_std": 0.09616141952574253, - "rewards/cosine_scaled_reward": -0.1879611350595951, - "rewards/format_reward": 0.8333333358168602, + "loss": 0.0332, + "reward": 0.143600944429636, + "reward_std": 0.12354162661358714, + "rewards/cosine_scaled_reward": -0.04586084187030792, + "rewards/format_reward": 0.9166666865348816, "step": 396 }, { - "completion_length": 1440.25, - "epoch": 0.22685714285714287, - "grad_norm": 0.34236493706703186, - "kl": 0.00748443603515625, + "completion_length": 1311.5625381469727, + "epoch": 0.45371428571428574, + "grad_norm": 3.427694082260132, + "kl": 0.9869384765625, "learning_rate": 2.1141329099692406e-07, - "loss": 0.0003, - "reward": 0.06769982213154435, - "reward_std": 0.07849302049726248, - "rewards/cosine_scaled_reward": -0.2383486544713378, - "rewards/format_reward": 0.8750000149011612, + "loss": 0.0395, + "reward": 0.08522352996078553, + "reward_std": 0.11573155457153916, + "rewards/cosine_scaled_reward": -0.11667108163237572, + "rewards/format_reward": 0.7291666883975267, "step": 397 }, { - "completion_length": 1334.4166870117188, - "epoch": 0.22742857142857142, - "grad_norm": 0.3131984770298004, - "kl": 0.007415771484375, + "completion_length": 1535.7500457763672, + "epoch": 0.45485714285714285, + "grad_norm": 1.9625768661499023, + "kl": 1.721282958984375, "learning_rate": 2.0935222495670968e-07, - "loss": 0.0003, - "reward": 0.09769275551661849, - "reward_std": 0.10631104931235313, - "rewards/cosine_scaled_reward": -0.19094078708440065, - "rewards/format_reward": 0.9583333432674408, + "loss": 0.0688, + "reward": 0.10138712753541768, + "reward_std": 0.12253602081909776, + "rewards/cosine_scaled_reward": -0.0959820756688714, + "rewards/format_reward": 0.7708333432674408, "step": 398 }, { - "completion_length": 1302.7500457763672, - "epoch": 0.228, - "grad_norm": 0.36516350507736206, - "kl": 0.01001739501953125, + "completion_length": 1052.1875305175781, + "epoch": 0.456, + "grad_norm": 2.016707181930542, + "kl": 0.39638519287109375, "learning_rate": 2.0730776160846853e-07, - "loss": 0.0004, - "reward": 0.11052368767559528, - "reward_std": 0.12456023506820202, - "rewards/cosine_scaled_reward": -0.1534249554388225, - "rewards/format_reward": 0.9583333432674408, + "loss": 0.0159, + "reward": 0.20977385994046926, + "reward_std": 0.17129726987332106, + "rewards/cosine_scaled_reward": 0.13206836581230164, + "rewards/format_reward": 0.9375000149011612, "step": 399 }, { - "completion_length": 974.4167022705078, - "epoch": 0.22857142857142856, - "grad_norm": 0.35530197620391846, - "kl": 0.0057525634765625, + "completion_length": 998.1458587646484, + "epoch": 0.45714285714285713, + "grad_norm": 1.5737963914871216, + "kl": 0.47527313232421875, "learning_rate": 2.0528000059645995e-07, - "loss": 0.0002, - "reward": 0.2029826734215021, - "reward_std": 0.1195136783644557, - "rewards/cosine_scaled_reward": 0.0965583398938179, - "rewards/format_reward": 1.0, + "loss": 0.019, + "reward": 0.20247652614489198, + "reward_std": 0.11905267764814198, + "rewards/cosine_scaled_reward": 0.13497311808168888, + "rewards/format_reward": 0.9166666716337204, "step": 400 }, { - "completion_length": 1459.4583587646484, - "epoch": 0.22914285714285715, - "grad_norm": 0.46601906418800354, - "kl": 0.0134429931640625, + "completion_length": 1659.1250305175781, + "epoch": 0.4582857142857143, + "grad_norm": 2.9050798416137695, + "kl": 1.2333984375, "learning_rate": 2.032690407508949e-07, - "loss": 0.0005, - "reward": 0.17741917446255684, - "reward_std": 0.07429017499089241, - "rewards/cosine_scaled_reward": 0.14814256876707077, - "rewards/format_reward": 0.75, + "loss": 0.0493, + "reward": 0.13014382123947144, + "reward_std": 0.12995108915492892, + "rewards/cosine_scaled_reward": -0.019444716162979603, + "rewards/format_reward": 0.791666679084301, "step": 401 }, { - "completion_length": 2270.916732788086, - "epoch": 0.2297142857142857, - "grad_norm": 0.24633632600307465, - "kl": 0.0076446533203125, + "completion_length": 1586.0417175292969, + "epoch": 0.4594285714285714, + "grad_norm": 2.764214277267456, + "kl": 1.1788330078125, "learning_rate": 2.0127498008311922e-07, - "loss": 0.0003, - "reward": 0.2581409737467766, - "reward_std": 0.1837962344288826, - "rewards/cosine_scaled_reward": 0.3482528477907181, - "rewards/format_reward": 0.8333333432674408, + "loss": 0.0472, + "reward": 0.11509460117667913, + "reward_std": 0.12621240504086018, + "rewards/cosine_scaled_reward": -0.03602955490350723, + "rewards/format_reward": 0.7291666939854622, "step": 402 }, { - "completion_length": 977.7917098999023, - "epoch": 0.2302857142857143, - "grad_norm": 0.3036160469055176, - "kl": 0.00571441650390625, + "completion_length": 1443.87504196167, + "epoch": 0.4605714285714286, + "grad_norm": 2.6378586292266846, + "kl": 1.0952301025390625, "learning_rate": 1.9929791578083655e-07, - "loss": 0.0002, - "reward": 0.2744885301217437, - "reward_std": 0.07894740859046578, - "rewards/cosine_scaled_reward": 0.31127897277474403, - "rewards/format_reward": 1.0, + "loss": 0.0438, + "reward": 0.14914248324930668, + "reward_std": 0.08150437835138291, + "rewards/cosine_scaled_reward": 0.02718578651547432, + "rewards/format_reward": 0.8125000204890966, "step": 403 }, { - "completion_length": 1544.625015258789, - "epoch": 0.23085714285714284, - "grad_norm": 0.2819896340370178, - "kl": 0.00907135009765625, + "completion_length": 1235.4792098999023, + "epoch": 0.4617142857142857, + "grad_norm": 2.0312211513519287, + "kl": 0.6242218017578125, "learning_rate": 1.9733794420337213e-07, - "loss": 0.0004, - "reward": 0.11523301573470235, - "reward_std": 0.040874656988307834, - "rewards/cosine_scaled_reward": -0.03358080983161926, - "rewards/format_reward": 0.75, + "loss": 0.025, + "reward": 0.15180272003635764, + "reward_std": 0.10177731700241566, + "rewards/cosine_scaled_reward": -0.007191255688667297, + "rewards/format_reward": 0.8958333507180214, "step": 404 }, { - "completion_length": 1330.1250610351562, - "epoch": 0.23142857142857143, - "grad_norm": 0.298233300447464, - "kl": 0.00891876220703125, + "completion_length": 1179.3958587646484, + "epoch": 0.46285714285714286, + "grad_norm": 2.201613664627075, + "kl": 0.848846435546875, "learning_rate": 1.9539516087697517e-07, - "loss": 0.0004, - "reward": 0.18481022119522095, - "reward_std": 0.16554485447704792, - "rewards/cosine_scaled_reward": 0.06647372804582119, - "rewards/format_reward": 0.9583333432674408, + "loss": 0.0339, + "reward": 0.21597139816731215, + "reward_std": 0.16427310602739453, + "rewards/cosine_scaled_reward": 0.154250493273139, + "rewards/format_reward": 0.9375000074505806, "step": 405 }, { - "completion_length": 1268.9583435058594, - "epoch": 0.232, - "grad_norm": 0.41509929299354553, - "kl": 0.0117340087890625, + "completion_length": 1454.9166946411133, + "epoch": 0.464, + "grad_norm": 3.2176761627197266, + "kl": 1.196868896484375, "learning_rate": 1.934696604901642e-07, - "loss": 0.0005, - "reward": 0.18005840852856636, - "reward_std": 0.14741305448114872, - "rewards/cosine_scaled_reward": 0.09081599116325378, - "rewards/format_reward": 0.8750000298023224, + "loss": 0.0477, + "reward": 0.12235562037676573, + "reward_std": 0.12256110971793532, + "rewards/cosine_scaled_reward": -0.045554774114862084, + "rewards/format_reward": 0.791666679084301, "step": 406 }, { - "completion_length": 1569.0416870117188, - "epoch": 0.23257142857142857, - "grad_norm": 0.35921189188957214, - "kl": 0.0102691650390625, + "completion_length": 1515.4792289733887, + "epoch": 0.46514285714285714, + "grad_norm": 3.040894031524658, + "kl": 0.8402099609375, "learning_rate": 1.915615368891117e-07, - "loss": 0.0004, - "reward": 0.24313370883464813, - "reward_std": 0.18073663674294949, - "rewards/cosine_scaled_reward": 0.2787550240755081, - "rewards/format_reward": 0.8750000149011612, + "loss": 0.0336, + "reward": 0.15898172045126557, + "reward_std": 0.10987926088273525, + "rewards/cosine_scaled_reward": 0.03327514789998531, + "rewards/format_reward": 0.8541666939854622, "step": 407 }, { - "completion_length": 912.2500152587891, - "epoch": 0.23314285714285715, - "grad_norm": 0.38133394718170166, - "kl": 0.0138092041015625, + "completion_length": 1372.5625534057617, + "epoch": 0.4662857142857143, + "grad_norm": 1.9494367837905884, + "kl": 0.5256500244140625, "learning_rate": 1.8967088307307e-07, - "loss": 0.0006, - "reward": 0.18693566136062145, - "reward_std": 0.04957200586795807, - "rewards/cosine_scaled_reward": 0.05054560489952564, - "rewards/format_reward": 1.0, + "loss": 0.021, + "reward": 0.18118306156247854, + "reward_std": 0.1542639322578907, + "rewards/cosine_scaled_reward": 0.08791719190776348, + "rewards/format_reward": 0.8750000149011612, "step": 408 }, { - "completion_length": 1824.5000457763672, - "epoch": 0.2337142857142857, - "grad_norm": 0.26921290159225464, - "kl": 0.0072021484375, + "completion_length": 2067.58341217041, + "epoch": 0.4674285714285714, + "grad_norm": 2.919564962387085, + "kl": 1.49102783203125, "learning_rate": 1.8779779118983867e-07, - "loss": 0.0003, - "reward": 0.2339594718068838, - "reward_std": 0.10502585582435131, - "rewards/cosine_scaled_reward": 0.2535390406847, - "rewards/format_reward": 0.8750000149011612, + "loss": 0.0597, + "reward": 0.07456145505420864, + "reward_std": 0.12785195047035813, + "rewards/cosine_scaled_reward": -0.12900145258754492, + "rewards/format_reward": 0.6875000186264515, "step": 409 }, { - "completion_length": 1068.7083587646484, - "epoch": 0.2342857142857143, - "grad_norm": 0.46298354864120483, - "kl": 0.00797271728515625, + "completion_length": 1623.2500228881836, + "epoch": 0.4685714285714286, + "grad_norm": 2.359740972518921, + "kl": 1.49200439453125, "learning_rate": 1.8594235253127372e-07, - "loss": 0.0003, - "reward": 0.1949240928515792, - "reward_std": 0.15359684638679028, - "rewards/cosine_scaled_reward": 0.11527213966473937, - "rewards/format_reward": 0.9166666716337204, + "loss": 0.0597, + "reward": 0.12223792565055192, + "reward_std": 0.13907577726058662, + "rewards/cosine_scaled_reward": -0.026431459933519363, + "rewards/format_reward": 0.7500000167638063, "step": 410 }, { - "completion_length": 1749.5833740234375, - "epoch": 0.23485714285714285, - "grad_norm": 0.45598310232162476, - "kl": 0.00908660888671875, + "completion_length": 1614.6250534057617, + "epoch": 0.4697142857142857, + "grad_norm": 2.7134463787078857, + "kl": 0.764923095703125, "learning_rate": 1.8410465752883758e-07, - "loss": 0.0004, - "reward": 0.0530395470559597, - "reward_std": 0.0732766967266798, - "rewards/cosine_scaled_reward": -0.24037358909845352, - "rewards/format_reward": 0.7916666716337204, + "loss": 0.0306, + "reward": 0.07999165914952755, + "reward_std": 0.12184700695797801, + "rewards/cosine_scaled_reward": -0.16527405753731728, + "rewards/format_reward": 0.7916666865348816, "step": 411 }, { - "completion_length": 2562.375030517578, - "epoch": 0.23542857142857143, - "grad_norm": 0.2796461284160614, - "kl": 0.010040283203125, + "completion_length": 1176.7291946411133, + "epoch": 0.47085714285714286, + "grad_norm": 3.365088701248169, + "kl": 0.94677734375, "learning_rate": 1.822847957491922e-07, - "loss": 0.0004, - "reward": 0.022122640162706375, - "reward_std": 0.09536791127175093, - "rewards/cosine_scaled_reward": -0.2269127182662487, - "rewards/format_reward": 0.5833333432674408, + "loss": 0.0378, + "reward": 0.10302653594408184, + "reward_std": 0.13904161704704165, + "rewards/cosine_scaled_reward": -0.11917255260050297, + "rewards/format_reward": 0.8333333432674408, "step": 412 }, { - "completion_length": 1705.8333740234375, - "epoch": 0.236, - "grad_norm": 0.47761160135269165, - "kl": 0.019866943359375, + "completion_length": 1315.0833854675293, + "epoch": 0.472, + "grad_norm": 3.6166837215423584, + "kl": 0.6557846069335938, "learning_rate": 1.804828558898332e-07, - "loss": 0.0008, - "reward": 0.08965376578271389, - "reward_std": 0.14554089680314064, - "rewards/cosine_scaled_reward": -0.13563106954097748, - "rewards/format_reward": 0.7916666716337204, + "loss": 0.0263, + "reward": 0.13768333243206143, + "reward_std": 0.12120772805064917, + "rewards/cosine_scaled_reward": -0.05770981824025512, + "rewards/format_reward": 0.9166666865348816, "step": 413 }, { - "completion_length": 1398.3750305175781, - "epoch": 0.23657142857142857, - "grad_norm": 0.3065117299556732, - "kl": 0.0072784423828125, + "completion_length": 2013.416748046875, + "epoch": 0.47314285714285714, + "grad_norm": 2.6909852027893066, + "kl": 1.70458984375, "learning_rate": 1.7869892577476722e-07, - "loss": 0.0003, - "reward": 0.1565822046250105, - "reward_std": 0.1707898247987032, - "rewards/cosine_scaled_reward": -0.019366338849067688, - "rewards/format_reward": 0.9583333432674408, + "loss": 0.0682, + "reward": 0.04394434345886111, + "reward_std": 0.12014323053881526, + "rewards/cosine_scaled_reward": -0.24098782893270254, + "rewards/format_reward": 0.7291666772216558, "step": 414 }, { - "completion_length": 1064.5833587646484, - "epoch": 0.23714285714285716, - "grad_norm": 0.34449470043182373, - "kl": 0.00835418701171875, + "completion_length": 1488.583366394043, + "epoch": 0.4742857142857143, + "grad_norm": 2.07572340965271, + "kl": 0.9158935546875, "learning_rate": 1.7693309235023127e-07, - "loss": 0.0003, - "reward": 0.18769755307585, - "reward_std": 0.040396169293671846, - "rewards/cosine_scaled_reward": 0.05374469980597496, - "rewards/format_reward": 1.0, + "loss": 0.0366, + "reward": 0.09156056097708642, + "reward_std": 0.1220615403726697, + "rewards/cosine_scaled_reward": -0.16554791200906038, + "rewards/format_reward": 0.8541666939854622, "step": 415 }, { - "completion_length": 1597.791748046875, - "epoch": 0.2377142857142857, - "grad_norm": 0.33483630418777466, - "kl": 0.00931549072265625, + "completion_length": 1202.708366394043, + "epoch": 0.4754285714285714, + "grad_norm": 2.5329267978668213, + "kl": 0.7248611450195312, "learning_rate": 1.7518544168045524e-07, - "loss": 0.0004, - "reward": 0.17363234143704176, - "reward_std": 0.1324333418160677, - "rewards/cosine_scaled_reward": 0.053535325452685356, - "rewards/format_reward": 0.9166666716337204, + "loss": 0.029, + "reward": 0.20120010571554303, + "reward_std": 0.16196386714000255, + "rewards/cosine_scaled_reward": 0.1876464392989874, + "rewards/format_reward": 0.7916666772216558, "step": 416 }, { - "completion_length": 1124.458366394043, - "epoch": 0.2382857142857143, - "grad_norm": 0.45643943548202515, - "kl": 0.00966644287109375, + "completion_length": 1595.958366394043, + "epoch": 0.4765714285714286, + "grad_norm": 2.1365303993225098, + "kl": 0.925079345703125, "learning_rate": 1.7345605894346726e-07, - "loss": 0.0004, - "reward": 0.1574200503528118, - "reward_std": 0.14907053112983704, - "rewards/cosine_scaled_reward": 0.004368675872683525, - "rewards/format_reward": 0.9166666716337204, + "loss": 0.037, + "reward": 0.13611510070040822, + "reward_std": 0.10463207168504596, + "rewards/cosine_scaled_reward": -0.014729505404829979, + "rewards/format_reward": 0.8125000074505806, "step": 417 }, { - "completion_length": 1326.0000305175781, - "epoch": 0.23885714285714285, - "grad_norm": 0.34442561864852905, - "kl": 0.00894927978515625, + "completion_length": 1091.2292137145996, + "epoch": 0.4777142857142857, + "grad_norm": 3.3795764446258545, + "kl": 0.7227020263671875, "learning_rate": 1.7174502842694212e-07, - "loss": 0.0004, - "reward": 0.17413843423128128, - "reward_std": 0.058942196890711784, - "rewards/cosine_scaled_reward": 0.05713009461760521, - "rewards/format_reward": 0.9166666865348816, + "loss": 0.0289, + "reward": 0.21031097415834665, + "reward_std": 0.12943580746650696, + "rewards/cosine_scaled_reward": 0.15948706772178411, + "rewards/format_reward": 0.8958333432674408, "step": 418 }, { - "completion_length": 1926.9167175292969, - "epoch": 0.23942857142857144, - "grad_norm": 0.4213322699069977, - "kl": 0.0115814208984375, + "completion_length": 1551.8750610351562, + "epoch": 0.47885714285714287, + "grad_norm": 2.417349100112915, + "kl": 0.9874191284179688, "learning_rate": 1.7005243352409333e-07, - "loss": 0.0005, - "reward": 0.07054175285156816, - "reward_std": 0.1377626247704029, - "rewards/cosine_scaled_reward": -0.18770137149840593, - "rewards/format_reward": 0.7916666865348816, + "loss": 0.0395, + "reward": 0.1355010142287938, + "reward_std": 0.15785679733380675, + "rewards/cosine_scaled_reward": -0.02094801003113389, + "rewards/format_reward": 0.8125000149011612, "step": 419 }, { - "completion_length": 1003.1667022705078, - "epoch": 0.24, - "grad_norm": 0.25942546129226685, - "kl": 0.0056304931640625, + "completion_length": 945.6041870117188, + "epoch": 0.48, + "grad_norm": 1.6603590250015259, + "kl": 0.618865966796875, "learning_rate": 1.6837835672960831e-07, - "loss": 0.0002, - "reward": 0.19107356388121843, - "reward_std": 0.04739784402772784, - "rewards/cosine_scaled_reward": 0.06541471788659692, - "rewards/format_reward": 1.0, + "loss": 0.0248, + "reward": 0.10752899164799601, + "reward_std": 0.09324449067935348, + "rewards/cosine_scaled_reward": -0.155439174734056, + "rewards/format_reward": 0.9166666716337204, "step": 420 }, { - "completion_length": 1854.1250457763672, - "epoch": 0.24057142857142857, - "grad_norm": 0.41457414627075195, - "kl": 0.01378631591796875, + "completion_length": 1413.1042251586914, + "epoch": 0.48114285714285715, + "grad_norm": 2.401996374130249, + "kl": 1.11029052734375, "learning_rate": 1.6672287963562852e-07, - "loss": 0.0006, - "reward": 0.11942541692405939, - "reward_std": 0.07678701169788837, - "rewards/cosine_scaled_reward": 0.0008558295667171478, - "rewards/format_reward": 0.7083333358168602, + "loss": 0.0444, + "reward": 0.08400759304640815, + "reward_std": 0.13007194455713034, + "rewards/cosine_scaled_reward": -0.18536985479295254, + "rewards/format_reward": 0.854166679084301, "step": 421 }, { - "completion_length": 1477.0416870117188, - "epoch": 0.24114285714285713, - "grad_norm": 0.34446263313293457, - "kl": 0.0105438232421875, + "completion_length": 1390.4375305175781, + "epoch": 0.48228571428571426, + "grad_norm": 2.419326066970825, + "kl": 1.081390380859375, "learning_rate": 1.6508608292777203e-07, - "loss": 0.0004, - "reward": 0.1485960902646184, - "reward_std": 0.06330296769738197, - "rewards/cosine_scaled_reward": 0.0646209642291069, - "rewards/format_reward": 0.7500000111758709, + "loss": 0.0433, + "reward": 0.13678289717063308, + "reward_std": 0.11409326584544033, + "rewards/cosine_scaled_reward": -0.019857976818457246, + "rewards/format_reward": 0.8333333469927311, "step": 422 }, { - "completion_length": 1188.5833740234375, - "epoch": 0.24171428571428571, - "grad_norm": 0.42733582854270935, - "kl": 0.0096435546875, + "completion_length": 1838.7917022705078, + "epoch": 0.48342857142857143, + "grad_norm": 1.9876563549041748, + "kl": 1.2999267578125, "learning_rate": 1.6346804638120098e-07, - "loss": 0.0004, - "reward": 0.19333018362522125, - "reward_std": 0.16796002350747585, - "rewards/cosine_scaled_reward": 0.1293434426188469, - "rewards/format_reward": 0.8750000149011612, + "loss": 0.052, + "reward": 0.05111281352583319, + "reward_std": 0.11355760088190436, + "rewards/cosine_scaled_reward": -0.21695283614099026, + "rewards/format_reward": 0.729166679084301, "step": 423 }, { - "completion_length": 1010.1667175292969, - "epoch": 0.2422857142857143, - "grad_norm": 0.34660741686820984, - "kl": 0.0093841552734375, + "completion_length": 1542.2292251586914, + "epoch": 0.4845714285714286, + "grad_norm": 2.6723501682281494, + "kl": 1.047271728515625, "learning_rate": 1.6186884885673413e-07, - "loss": 0.0004, - "reward": 0.21930179418995976, - "reward_std": 0.0818589385598898, - "rewards/cosine_scaled_reward": 0.18683160468935966, - "rewards/format_reward": 0.9166666716337204, + "loss": 0.0418, + "reward": 0.09121810318902135, + "reward_std": 0.11254043271765113, + "rewards/cosine_scaled_reward": -0.18606012500822544, + "rewards/format_reward": 0.8958333507180214, "step": 424 }, { - "completion_length": 1683.2083892822266, - "epoch": 0.24285714285714285, - "grad_norm": 0.5022530555725098, - "kl": 0.02095794677734375, + "completion_length": 1012.3541717529297, + "epoch": 0.4857142857142857, + "grad_norm": 2.142672061920166, + "kl": 0.32970428466796875, "learning_rate": 1.6028856829700258e-07, - "loss": 0.0008, - "reward": 0.2759297899901867, - "reward_std": 0.18575101345777512, - "rewards/cosine_scaled_reward": 0.39443655556533486, - "rewards/format_reward": 0.8333333432674408, + "loss": 0.0132, + "reward": 0.27949744602665305, + "reward_std": 0.13918299926444888, + "rewards/cosine_scaled_reward": 0.35947928391397, + "rewards/format_reward": 0.916666679084301, "step": 425 }, { - "completion_length": 1142.3750305175781, - "epoch": 0.24342857142857144, - "grad_norm": 0.5855507850646973, - "kl": 0.0108795166015625, + "completion_length": 1376.7917022705078, + "epoch": 0.4868571428571429, + "grad_norm": 2.2903144359588623, + "kl": 1.4896697998046875, "learning_rate": 1.5872728172265146e-07, - "loss": 0.0004, - "reward": 0.055991355795413256, - "reward_std": 0.08330015558749437, - "rewards/cosine_scaled_reward": -0.2518373355269432, - "rewards/format_reward": 0.833333358168602, + "loss": 0.0595, + "reward": 0.10234351572580636, + "reward_std": 0.1212971555069089, + "rewards/cosine_scaled_reward": -0.1176013108342886, + "rewards/format_reward": 0.8333333507180214, "step": 426 }, { - "completion_length": 1717.7083892822266, - "epoch": 0.244, - "grad_norm": 0.6683474779129028, - "kl": 0.0181732177734375, + "completion_length": 1822.0208892822266, + "epoch": 0.488, + "grad_norm": 3.0833256244659424, + "kl": 0.966033935546875, "learning_rate": 1.5718506522858572e-07, - "loss": 0.0007, - "reward": 0.11663197167217731, - "reward_std": 0.14281198661774397, - "rewards/cosine_scaled_reward": -0.07647238485515118, - "rewards/format_reward": 0.8333333432674408, + "loss": 0.0387, + "reward": 0.1447278270497918, + "reward_std": 0.1653783330693841, + "rewards/cosine_scaled_reward": 0.011570073664188385, + "rewards/format_reward": 0.8125000149011612, "step": 427 }, { - "completion_length": 1789.4583740234375, - "epoch": 0.24457142857142858, - "grad_norm": 0.29647254943847656, - "kl": 0.010288238525390625, + "completion_length": 1313.0625228881836, + "epoch": 0.48914285714285716, + "grad_norm": 1.966538906097412, + "kl": 0.5592117309570312, "learning_rate": 1.5566199398026147e-07, - "loss": 0.0004, - "reward": 0.13982021622359753, - "reward_std": 0.0829594423994422, - "rewards/cosine_scaled_reward": 0.057332220138050616, - "rewards/format_reward": 0.7083333432674408, + "loss": 0.0223, + "reward": 0.12443019635975361, + "reward_std": 0.11524406261742115, + "rewards/cosine_scaled_reward": -0.076734006870538, + "rewards/format_reward": 0.8750000111758709, "step": 428 }, { - "completion_length": 936.2500305175781, - "epoch": 0.24514285714285713, - "grad_norm": 0.33182427287101746, - "kl": 0.00751495361328125, + "completion_length": 1229.2916870117188, + "epoch": 0.49028571428571427, + "grad_norm": 1.638811707496643, + "kl": 1.2337493896484375, "learning_rate": 1.5415814221002265e-07, - "loss": 0.0003, - "reward": 0.1806036289781332, - "reward_std": 0.08534971065819263, - "rewards/cosine_scaled_reward": 0.029523834586143494, - "rewards/format_reward": 1.0, + "loss": 0.0494, + "reward": 0.147400954447221, + "reward_std": 0.1352155078202486, + "rewards/cosine_scaled_reward": -0.012214789167046547, + "rewards/format_reward": 0.8750000074505806, "step": 429 }, { - "completion_length": 1622.2083740234375, - "epoch": 0.24571428571428572, - "grad_norm": 0.4233330488204956, - "kl": 0.0075531005859375, + "completion_length": 979.6458587646484, + "epoch": 0.49142857142857144, + "grad_norm": 1.6275877952575684, + "kl": 0.34814453125, "learning_rate": 1.5267358321348285e-07, - "loss": 0.0003, - "reward": 0.030628697015345097, - "reward_std": 0.058422201313078403, - "rewards/cosine_scaled_reward": -0.2865150347352028, - "rewards/format_reward": 0.75, + "loss": 0.0139, + "reward": 0.1619871830334887, + "reward_std": 0.11312393890693784, + "rewards/cosine_scaled_reward": 0.02661276888102293, + "rewards/format_reward": 0.8958333432674408, "step": 430 }, { - "completion_length": 1064.5416717529297, - "epoch": 0.24628571428571427, - "grad_norm": 0.39948347210884094, - "kl": 0.0070037841796875, + "completion_length": 1446.2292175292969, + "epoch": 0.49257142857142855, + "grad_norm": 3.2351579666137695, + "kl": 1.28387451171875, "learning_rate": 1.5120838934595337e-07, - "loss": 0.0003, - "reward": 0.24056867510080338, - "reward_std": 0.14248921908438206, - "rewards/cosine_scaled_reward": 0.2420511320233345, - "rewards/format_reward": 0.9166666865348816, + "loss": 0.0514, + "reward": 0.07184328138828278, + "reward_std": 0.0726720115635544, + "rewards/cosine_scaled_reward": -0.19962184969335794, + "rewards/format_reward": 0.8125000111758709, "step": 431 }, { - "completion_length": 1284.6666870117188, - "epoch": 0.24685714285714286, - "grad_norm": 0.38756710290908813, - "kl": 0.0093231201171875, + "completion_length": 1804.979232788086, + "epoch": 0.4937142857142857, + "grad_norm": 3.7058115005493164, + "kl": 1.610595703125, "learning_rate": 1.4976263201891613e-07, - "loss": 0.0004, - "reward": 0.14967439509928226, - "reward_std": 0.046333879232406616, - "rewards/cosine_scaled_reward": 0.0047361403703689575, - "rewards/format_reward": 0.875, + "loss": 0.0645, + "reward": 0.05518326349556446, + "reward_std": 0.10842809174209833, + "rewards/cosine_scaled_reward": -0.19627886731177568, + "rewards/format_reward": 0.7083333525806665, "step": 432 }, { - "completion_length": 1339.2083892822266, - "epoch": 0.24742857142857144, - "grad_norm": 0.34880462288856506, - "kl": 0.00974273681640625, + "completion_length": 1190.7292022705078, + "epoch": 0.4948571428571429, + "grad_norm": 1.7561228275299072, + "kl": 0.5131759643554688, "learning_rate": 1.483363816965435e-07, - "loss": 0.0004, - "reward": 0.15797951196145732, - "reward_std": 0.14605247788131237, - "rewards/cosine_scaled_reward": 0.028781283646821976, - "rewards/format_reward": 0.875, + "loss": 0.0205, + "reward": 0.1619516264181584, + "reward_std": 0.10530700022354722, + "rewards/cosine_scaled_reward": 0.022388019686331972, + "rewards/format_reward": 0.895833358168602, "step": 433 }, { - "completion_length": 1483.3750457763672, - "epoch": 0.248, - "grad_norm": 0.4117262363433838, - "kl": 0.0096282958984375, + "completion_length": 1296.0417175292969, + "epoch": 0.496, + "grad_norm": 1.8874526023864746, + "kl": 0.7233428955078125, "learning_rate": 1.469297078922642e-07, - "loss": 0.0004, - "reward": 0.17310748994350433, - "reward_std": 0.10971591155976057, - "rewards/cosine_scaled_reward": 0.09441257268190384, - "rewards/format_reward": 0.8333333358168602, + "loss": 0.0289, + "reward": 0.06844042587908916, + "reward_std": 0.08203518786467612, + "rewards/cosine_scaled_reward": -0.23772956430912018, + "rewards/format_reward": 0.8750000149011612, "step": 434 }, { - "completion_length": 1336.2916870117188, - "epoch": 0.24857142857142858, - "grad_norm": 0.3324863016605377, - "kl": 0.0098114013671875, + "completion_length": 1286.6042213439941, + "epoch": 0.49714285714285716, + "grad_norm": 2.515587568283081, + "kl": 1.2847747802734375, "learning_rate": 1.4554267916537495e-07, - "loss": 0.0004, - "reward": 0.05687671690247953, - "reward_std": 0.05665414920076728, - "rewards/cosine_scaled_reward": -0.269748467952013, - "rewards/format_reward": 0.875, + "loss": 0.0513, + "reward": 0.06948329764418304, + "reward_std": 0.09834526525810361, + "rewards/cosine_scaled_reward": -0.22741128038614988, + "rewards/format_reward": 0.8541666865348816, "step": 435 }, { - "completion_length": 1825.5000915527344, - "epoch": 0.24914285714285714, - "grad_norm": 0.23547585308551788, - "kl": 0.00902557373046875, + "completion_length": 1111.458366394043, + "epoch": 0.4982857142857143, + "grad_norm": 1.83876633644104, + "kl": 0.725067138671875, "learning_rate": 1.4417536311769885e-07, - "loss": 0.0004, - "reward": 0.05375045910477638, - "reward_std": 0.134916627779603, - "rewards/cosine_scaled_reward": -0.1949047544039786, - "rewards/format_reward": 0.7083333507180214, + "loss": 0.029, + "reward": 0.18536534893792123, + "reward_std": 0.12061038403771818, + "rewards/cosine_scaled_reward": 0.11224743165075779, + "rewards/format_reward": 0.8541666865348816, "step": 436 }, { - "completion_length": 1690.3333435058594, - "epoch": 0.24971428571428572, - "grad_norm": 0.252827912569046, - "kl": 0.00861358642578125, + "completion_length": 1064.020866394043, + "epoch": 0.49942857142857144, + "grad_norm": 2.1831390857696533, + "kl": 0.4850616455078125, "learning_rate": 1.4282782639029128e-07, - "loss": 0.0003, - "reward": 0.17899128887802362, - "reward_std": 0.10658581275492907, - "rewards/cosine_scaled_reward": 0.15425992757081985, - "rewards/format_reward": 0.75, + "loss": 0.0194, + "reward": 0.1590421856380999, + "reward_std": 0.09123086743056774, + "rewards/cosine_scaled_reward": -0.028785159811377525, + "rewards/format_reward": 0.9583333432674408, "step": 437 }, { - "completion_length": 1176.208366394043, - "epoch": 0.2502857142857143, - "grad_norm": 0.5680178999900818, - "kl": 0.0163421630859375, + "completion_length": 1594.8750610351562, + "epoch": 0.5005714285714286, + "grad_norm": 2.209979295730591, + "kl": 0.8747100830078125, "learning_rate": 1.4150013466019114e-07, - "loss": 0.0007, - "reward": 0.12644178420305252, - "reward_std": 0.13729064725339413, - "rewards/cosine_scaled_reward": -0.10660480707883835, - "rewards/format_reward": 0.9583333432674408, + "loss": 0.035, + "reward": 0.08124704580404796, + "reward_std": 0.10059291700599715, + "rewards/cosine_scaled_reward": -0.1798710956936702, + "rewards/format_reward": 0.8333333507180214, "step": 438 }, { - "completion_length": 1480.0833740234375, - "epoch": 0.25085714285714283, - "grad_norm": 0.31664028763771057, - "kl": 0.01097869873046875, + "completion_length": 1178.1875305175781, + "epoch": 0.5017142857142857, + "grad_norm": 1.6698002815246582, + "kl": 0.3531494140625, "learning_rate": 1.4019235263722034e-07, - "loss": 0.0004, - "reward": 0.0605952525511384, - "reward_std": 0.07811014354228973, - "rewards/cosine_scaled_reward": -0.26010931842029095, - "rewards/format_reward": 0.8750000149011612, + "loss": 0.0141, + "reward": 0.07717898802366108, + "reward_std": 0.06667181965894997, + "rewards/cosine_scaled_reward": -0.24372687563300133, + "rewards/format_reward": 0.9375000149011612, "step": 439 }, { - "completion_length": 1673.958366394043, - "epoch": 0.25142857142857145, - "grad_norm": 0.3744324743747711, - "kl": 0.0091705322265625, + "completion_length": 1250.7083549499512, + "epoch": 0.5028571428571429, + "grad_norm": 3.8609611988067627, + "kl": 0.908416748046875, "learning_rate": 1.3890454406082956e-07, - "loss": 0.0004, - "reward": 0.01797363953664899, - "reward_std": 0.06721334718167782, - "rewards/cosine_scaled_reward": -0.30165933445096016, - "rewards/format_reward": 0.7083333432674408, + "loss": 0.0363, + "reward": 0.08707587665412575, + "reward_std": 0.0901845304761082, + "rewards/cosine_scaled_reward": -0.19678178988397121, + "rewards/format_reward": 0.8958333432674408, "step": 440 }, { - "completion_length": 932.3750228881836, - "epoch": 0.252, - "grad_norm": 0.2834275960922241, - "kl": 0.0052032470703125, + "completion_length": 1613.2292098999023, + "epoch": 0.504, + "grad_norm": 2.8707664012908936, + "kl": 1.0423583984375, "learning_rate": 1.3763677169699217e-07, - "loss": 0.0002, - "reward": 0.1804841896519065, - "reward_std": 0.08349917223677039, - "rewards/cosine_scaled_reward": 0.032689280807971954, - "rewards/format_reward": 1.0, + "loss": 0.0417, + "reward": 0.1019732168642804, + "reward_std": 0.11968619748950005, + "rewards/cosine_scaled_reward": -0.07690610364079475, + "rewards/format_reward": 0.7500000111758709, "step": 441 }, { - "completion_length": 1722.9584045410156, - "epoch": 0.25257142857142856, - "grad_norm": 0.3624862730503082, - "kl": 0.00960540771484375, + "completion_length": 1124.3125228881836, + "epoch": 0.5051428571428571, + "grad_norm": 3.18782377243042, + "kl": 0.6033172607421875, "learning_rate": 1.3638909733514452e-07, - "loss": 0.0004, - "reward": 0.18544169422239065, - "reward_std": 0.13662213925272226, - "rewards/cosine_scaled_reward": 0.1923333816230297, - "rewards/format_reward": 0.7083333544433117, + "loss": 0.0241, + "reward": 0.1847122572362423, + "reward_std": 0.11424465058371425, + "rewards/cosine_scaled_reward": 0.06125026382505894, + "rewards/format_reward": 0.9375000149011612, "step": 442 }, { - "completion_length": 1247.1667022705078, - "epoch": 0.25314285714285717, - "grad_norm": 0.4204573631286621, - "kl": 0.01018524169921875, + "completion_length": 1618.208396911621, + "epoch": 0.5062857142857143, + "grad_norm": 3.1647167205810547, + "kl": 0.8673095703125, "learning_rate": 1.351615817851748e-07, - "loss": 0.0004, - "reward": 0.09432954154908657, - "reward_std": 0.09576405212283134, - "rewards/cosine_scaled_reward": -0.1410097642801702, - "rewards/format_reward": 0.8333333358168602, + "loss": 0.0347, + "reward": 0.1230522379046306, + "reward_std": 0.08929805480875075, + "rewards/cosine_scaled_reward": -0.06928862258791924, + "rewards/format_reward": 0.8541666939854622, "step": 443 }, { - "completion_length": 1889.4584197998047, - "epoch": 0.2537142857142857, - "grad_norm": 0.3111760914325714, - "kl": 0.012866973876953125, + "completion_length": 1356.6458740234375, + "epoch": 0.5074285714285715, + "grad_norm": 2.186877489089966, + "kl": 0.634979248046875, "learning_rate": 1.3395428487445914e-07, - "loss": 0.0005, - "reward": 0.1778575791977346, - "reward_std": 0.11700630933046341, - "rewards/cosine_scaled_reward": 0.14782167598605156, - "rewards/format_reward": 0.7500000111758709, + "loss": 0.0254, + "reward": 0.11778642190620303, + "reward_std": 0.10000652819871902, + "rewards/cosine_scaled_reward": -0.13054069224745035, + "rewards/format_reward": 0.9375000149011612, "step": 444 }, { - "completion_length": 1837.2500305175781, - "epoch": 0.2542857142857143, - "grad_norm": 0.3862968683242798, - "kl": 0.01141357421875, + "completion_length": 1302.6042022705078, + "epoch": 0.5085714285714286, + "grad_norm": 1.7782143354415894, + "kl": 0.7170562744140625, "learning_rate": 1.3276726544494571e-07, - "loss": 0.0005, - "reward": 0.10465393215417862, - "reward_std": 0.09716109558939934, - "rewards/cosine_scaled_reward": -0.026579685509204865, - "rewards/format_reward": 0.6666666865348816, + "loss": 0.0287, + "reward": 0.09130575158633292, + "reward_std": 0.09221122646704316, + "rewards/cosine_scaled_reward": -0.1979788908502087, + "rewards/format_reward": 0.9166666865348816, "step": 445 }, { - "completion_length": 1610.7083892822266, - "epoch": 0.25485714285714284, - "grad_norm": 0.31398341059684753, - "kl": 0.009929656982421875, + "completion_length": 1449.1667022705078, + "epoch": 0.5097142857142857, + "grad_norm": 2.79809308052063, + "kl": 0.5194091796875, "learning_rate": 1.316005813502869e-07, - "loss": 0.0004, - "reward": 0.10001690965145826, - "reward_std": 0.09864317439496517, - "rewards/cosine_scaled_reward": -0.07858972623944283, - "rewards/format_reward": 0.7500000111758709, + "loss": 0.0207, + "reward": 0.10592652973718941, + "reward_std": 0.0978604587726295, + "rewards/cosine_scaled_reward": -0.12308421358466148, + "rewards/format_reward": 0.854166679084301, "step": 446 }, { - "completion_length": 1593.0000305175781, - "epoch": 0.25542857142857145, - "grad_norm": 0.2847026586532593, - "kl": 0.00848388671875, + "completion_length": 1229.6667137145996, + "epoch": 0.5108571428571429, + "grad_norm": 1.967965006828308, + "kl": 0.792236328125, "learning_rate": 1.3045428945301953e-07, - "loss": 0.0003, - "reward": 0.12400662526488304, - "reward_std": 0.17154778726398945, - "rewards/cosine_scaled_reward": -0.09315278381109238, - "rewards/format_reward": 0.9166666865348816, + "loss": 0.0317, + "reward": 0.12869366817176342, + "reward_std": 0.09020477347075939, + "rewards/cosine_scaled_reward": -0.07901583984494209, + "rewards/format_reward": 0.895833358168602, "step": 447 }, { - "completion_length": 2114.3750610351562, - "epoch": 0.256, - "grad_norm": 0.2734992504119873, - "kl": 0.0105743408203125, + "completion_length": 1233.4167022705078, + "epoch": 0.512, + "grad_norm": 5.0800299644470215, + "kl": 0.8968582153320312, "learning_rate": 1.2932844562179352e-07, - "loss": 0.0004, - "reward": 0.14665351761505008, - "reward_std": 0.21061961725354195, - "rewards/cosine_scaled_reward": 0.07771626731846482, - "rewards/format_reward": 0.7083333395421505, + "loss": 0.0358, + "reward": 0.12176637991797179, + "reward_std": 0.07025636686012149, + "rewards/cosine_scaled_reward": -0.10369249619543552, + "rewards/format_reward": 0.916666679084301, "step": 448 }, { - "completion_length": 1690.3333740234375, - "epoch": 0.25657142857142856, - "grad_norm": 0.4054621458053589, - "kl": 0.01947784423828125, + "completion_length": 1115.104206085205, + "epoch": 0.5131428571428571, + "grad_norm": 5.752594947814941, + "kl": 0.5258560180664062, "learning_rate": 1.2822310472864885e-07, - "loss": 0.0008, - "reward": 0.08519147476181388, - "reward_std": 0.10636863484978676, - "rewards/cosine_scaled_reward": -0.18691626656800508, - "rewards/format_reward": 0.875, + "loss": 0.021, + "reward": 0.09505507163703442, + "reward_std": 0.08659794740378857, + "rewards/cosine_scaled_reward": -0.18167979642748833, + "rewards/format_reward": 0.916666679084301, "step": 449 }, { - "completion_length": 1902.541748046875, - "epoch": 0.2571428571428571, - "grad_norm": 0.3922634422779083, - "kl": 0.0136871337890625, + "completion_length": 1027.0416946411133, + "epoch": 0.5142857142857142, + "grad_norm": 1.9456526041030884, + "kl": 0.33565521240234375, "learning_rate": 1.2713832064634125e-07, - "loss": 0.0005, - "reward": 0.17442963598296046, - "reward_std": 0.112509710714221, - "rewards/cosine_scaled_reward": 0.07242470979690552, - "rewards/format_reward": 0.8750000149011612, + "loss": 0.0134, + "reward": 0.1392190819606185, + "reward_std": 0.08027261192910373, + "rewards/cosine_scaled_reward": -0.06442757230252028, + "rewards/format_reward": 0.9375000074505806, "step": 450 }, { - "completion_length": 1552.7500610351562, - "epoch": 0.25771428571428573, - "grad_norm": 0.3281165659427643, - "kl": 0.0084381103515625, + "completion_length": 1365.3125495910645, + "epoch": 0.5154285714285715, + "grad_norm": 2.0280954837799072, + "kl": 0.9598846435546875, "learning_rate": 1.260741462457165e-07, - "loss": 0.0003, - "reward": 0.1979460008442402, - "reward_std": 0.13919626083225012, - "rewards/cosine_scaled_reward": 0.12410717271268368, - "rewards/format_reward": 0.9166666865348816, + "loss": 0.0384, + "reward": 0.14097292395308614, + "reward_std": 0.1067453664727509, + "rewards/cosine_scaled_reward": -0.06067850440740585, + "rewards/format_reward": 0.9375000074505806, "step": 451 }, { - "completion_length": 1168.5417175292969, - "epoch": 0.2582857142857143, - "grad_norm": 0.3811437785625458, - "kl": 0.00800323486328125, + "completion_length": 1629.979248046875, + "epoch": 0.5165714285714286, + "grad_norm": 2.833832025527954, + "kl": 1.052734375, "learning_rate": 1.2503063339313356e-07, - "loss": 0.0003, - "reward": 0.20214430056512356, - "reward_std": 0.08999810181558132, - "rewards/cosine_scaled_reward": 0.11895790975540876, - "rewards/format_reward": 0.9583333432674408, + "loss": 0.0421, + "reward": 0.1490084226243198, + "reward_std": 0.1209279503673315, + "rewards/cosine_scaled_reward": 0.01308462768793106, + "rewards/format_reward": 0.8333333507180214, "step": 452 }, { - "completion_length": 1383.7083587646484, - "epoch": 0.25885714285714284, - "grad_norm": 0.36133143305778503, - "kl": 0.010040283203125, + "completion_length": 1423.2083702087402, + "epoch": 0.5177142857142857, + "grad_norm": 2.2027335166931152, + "kl": 0.909820556640625, "learning_rate": 1.2400783294793668e-07, - "loss": 0.0004, - "reward": 0.12098074518144131, - "reward_std": 0.12733107572421432, - "rewards/cosine_scaled_reward": -0.10091916844248772, - "rewards/format_reward": 0.9166666865348816, + "loss": 0.0364, + "reward": 0.12369064858648926, + "reward_std": 0.12611138448119164, + "rewards/cosine_scaled_reward": -0.08292348496615887, + "rewards/format_reward": 0.8750000223517418, "step": 453 }, { - "completion_length": 1271.7916870117188, - "epoch": 0.25942857142857145, - "grad_norm": 0.5313466191291809, - "kl": 0.0219879150390625, + "completion_length": 1188.7500381469727, + "epoch": 0.5188571428571429, + "grad_norm": 2.987781524658203, + "kl": 0.386993408203125, "learning_rate": 1.2300579475997657e-07, - "loss": 0.0009, - "reward": 0.1940160132944584, - "reward_std": 0.12792113609611988, - "rewards/cosine_scaled_reward": 0.10657241940498352, - "rewards/format_reward": 0.9166666865348816, + "loss": 0.0155, + "reward": 0.11641452787443995, + "reward_std": 0.0906132124364376, + "rewards/cosine_scaled_reward": -0.12823456013575196, + "rewards/format_reward": 0.9375000074505806, "step": 454 }, { - "completion_length": 1607.208366394043, - "epoch": 0.26, - "grad_norm": 0.37100306153297424, - "kl": 0.0096435546875, + "completion_length": 1531.6875457763672, + "epoch": 0.52, + "grad_norm": 2.0451388359069824, + "kl": 0.9672393798828125, "learning_rate": 1.220245676671809e-07, - "loss": 0.0004, - "reward": 0.1685400577262044, - "reward_std": 0.09649250004440546, - "rewards/cosine_scaled_reward": 0.12060768622905016, - "rewards/format_reward": 0.7500000111758709, + "loss": 0.0387, + "reward": 0.08102269377559423, + "reward_std": 0.10998783679679036, + "rewards/cosine_scaled_reward": -0.17945297434926033, + "rewards/format_reward": 0.8333333507180214, "step": 455 }, { - "completion_length": 1181.9167022705078, - "epoch": 0.26057142857142856, - "grad_norm": 0.44773364067077637, - "kl": 0.00923919677734375, + "completion_length": 1520.6458587646484, + "epoch": 0.5211428571428571, + "grad_norm": 1.6886857748031616, + "kl": 0.6373214721679688, "learning_rate": 1.2106419949317388e-07, - "loss": 0.0004, - "reward": 0.1973871992304339, - "reward_std": 0.1189747229218483, - "rewards/cosine_scaled_reward": 0.14472665637731552, - "rewards/format_reward": 0.875, + "loss": 0.0255, + "reward": 0.04929352249018848, + "reward_std": 0.10546231491025537, + "rewards/cosine_scaled_reward": -0.23321845568716526, + "rewards/format_reward": 0.7500000111758709, "step": 456 }, { - "completion_length": 1377.5417175292969, - "epoch": 0.2611428571428571, - "grad_norm": 0.34346216917037964, - "kl": 0.0081634521484375, + "completion_length": 1495.3750534057617, + "epoch": 0.5222857142857142, + "grad_norm": 3.0165514945983887, + "kl": 0.941650390625, "learning_rate": 1.2012473704494537e-07, - "loss": 0.0003, - "reward": 0.04526542080566287, - "reward_std": 0.05457933805882931, - "rewards/cosine_scaled_reward": -0.32515719532966614, - "rewards/format_reward": 0.9166666865348816, + "loss": 0.0377, + "reward": 0.09918425139039755, + "reward_std": 0.12838075123727322, + "rewards/cosine_scaled_reward": -0.124738659709692, + "rewards/format_reward": 0.8125000111758709, "step": 457 }, { - "completion_length": 1205.5833740234375, - "epoch": 0.26171428571428573, - "grad_norm": 0.4528224766254425, - "kl": 0.0102386474609375, + "completion_length": 1388.5208740234375, + "epoch": 0.5234285714285715, + "grad_norm": 1.8302446603775024, + "kl": 0.6633834838867188, "learning_rate": 1.1920622611056974e-07, - "loss": 0.0004, - "reward": 0.24707153625786304, - "reward_std": 0.11611469089984894, - "rewards/cosine_scaled_reward": 0.2831005919724703, + "loss": 0.0266, + "reward": 0.07702152655110694, + "reward_std": 0.08541670115664601, + "rewards/cosine_scaled_reward": -0.21506773598957807, "rewards/format_reward": 0.875, "step": 458 }, { - "completion_length": 1604.5417175292969, - "epoch": 0.2622857142857143, - "grad_norm": 0.5158527493476868, - "kl": 0.011627197265625, + "completion_length": 1343.5208473205566, + "epoch": 0.5245714285714286, + "grad_norm": 2.486727714538574, + "kl": 0.926055908203125, "learning_rate": 1.1830871145697412e-07, - "loss": 0.0005, - "reward": 0.050534650683403015, - "reward_std": 0.12486429698765278, - "rewards/cosine_scaled_reward": -0.26649756729602814, - "rewards/format_reward": 0.8333333730697632, + "loss": 0.0371, + "reward": 0.1338568499777466, + "reward_std": 0.14767975080758333, + "rewards/cosine_scaled_reward": -0.04322970123030245, + "rewards/format_reward": 0.8541666865348816, "step": 459 }, { - "completion_length": 2053.416717529297, - "epoch": 0.26285714285714284, - "grad_norm": 0.2274622768163681, - "kl": 0.00897216796875, + "completion_length": 1559.1250762939453, + "epoch": 0.5257142857142857, + "grad_norm": 1.7466343641281128, + "kl": 0.7754058837890625, "learning_rate": 1.1743223682775649e-07, - "loss": 0.0004, - "reward": 0.042031632736325264, - "reward_std": 0.08631630428135395, - "rewards/cosine_scaled_reward": -0.23050150647759438, - "rewards/format_reward": 0.7083333432674408, + "loss": 0.031, + "reward": 0.11281750063062645, + "reward_std": 0.09168540453538299, + "rewards/cosine_scaled_reward": -0.10402092151343822, + "rewards/format_reward": 0.8541666865348816, "step": 460 }, { - "completion_length": 1021.5000305175781, - "epoch": 0.2634285714285714, - "grad_norm": 0.4134611189365387, - "kl": 0.015411376953125, + "completion_length": 1441.5417098999023, + "epoch": 0.5268571428571428, + "grad_norm": 2.377612829208374, + "kl": 0.6788558959960938, "learning_rate": 1.1657684494105386e-07, - "loss": 0.0006, - "reward": 0.138437463901937, - "reward_std": 0.0730750598013401, - "rewards/cosine_scaled_reward": 0.033056119456887245, - "rewards/format_reward": 0.7500000111758709, + "loss": 0.0271, + "reward": 0.12857838673517108, + "reward_std": 0.1296999854966998, + "rewards/cosine_scaled_reward": -0.040342007763683796, + "rewards/format_reward": 0.8333333395421505, "step": 461 }, { - "completion_length": 1504.8333435058594, - "epoch": 0.264, - "grad_norm": 0.3614439070224762, - "kl": 0.0085296630859375, + "completion_length": 1260.895866394043, + "epoch": 0.528, + "grad_norm": 5.065842151641846, + "kl": 0.7448501586914062, "learning_rate": 1.1574257748745986e-07, - "loss": 0.0003, - "reward": 0.23874622413131874, - "reward_std": 0.13557261787354946, - "rewards/cosine_scaled_reward": 0.22620555013418198, - "rewards/format_reward": 0.9583333432674408, + "loss": 0.0298, + "reward": 0.08250287733972073, + "reward_std": 0.11116986721754074, + "rewards/cosine_scaled_reward": -0.15972350211814046, + "rewards/format_reward": 0.7916666865348816, "step": 462 }, { - "completion_length": 1844.4583740234375, - "epoch": 0.26457142857142857, - "grad_norm": 0.36927905678749084, - "kl": 0.013092041015625, + "completion_length": 1197.4166946411133, + "epoch": 0.5291428571428571, + "grad_norm": 2.4315950870513916, + "kl": 0.316741943359375, "learning_rate": 1.1492947512799328e-07, - "loss": 0.0005, - "reward": 0.08661322388797998, - "reward_std": 0.10069876257330179, - "rewards/cosine_scaled_reward": -0.16130705457180738, - "rewards/format_reward": 0.8333333358168602, + "loss": 0.0126, + "reward": 0.17255353461951017, + "reward_std": 0.14624354103580117, + "rewards/cosine_scaled_reward": 0.10790145858481992, + "rewards/format_reward": 0.7916666753590107, "step": 463 }, { - "completion_length": 1797.5416717529297, - "epoch": 0.2651428571428571, - "grad_norm": 0.35167327523231506, - "kl": 0.01092529296875, + "completion_length": 1026.7292098999023, + "epoch": 0.5302857142857142, + "grad_norm": 1.8613412380218506, + "kl": 0.590972900390625, "learning_rate": 1.1413757749211602e-07, - "loss": 0.0004, - "reward": 0.041669391095638275, - "reward_std": 0.06646678596735, - "rewards/cosine_scaled_reward": -0.2540195994079113, - "rewards/format_reward": 0.75, + "loss": 0.0236, + "reward": 0.17181929713115096, + "reward_std": 0.12013115221634507, + "rewards/cosine_scaled_reward": 0.02979774959385395, + "rewards/format_reward": 0.9375000149011612, "step": 464 }, { - "completion_length": 1120.6250305175781, - "epoch": 0.26571428571428574, - "grad_norm": 0.290342777967453, - "kl": 0.006744384765625, + "completion_length": 1346.9375305175781, + "epoch": 0.5314285714285715, + "grad_norm": 1.951102614402771, + "kl": 0.7594528198242188, "learning_rate": 1.1336692317580158e-07, - "loss": 0.0003, - "reward": 0.10004028119146824, - "reward_std": 0.09755644854158163, - "rewards/cosine_scaled_reward": -0.18460207618772984, - "rewards/format_reward": 0.9583333432674408, + "loss": 0.0304, + "reward": 0.12367417407222092, + "reward_std": 0.13810825487598777, + "rewards/cosine_scaled_reward": -0.10004201903939247, + "rewards/format_reward": 0.9166666716337204, "step": 465 }, { - "completion_length": 847.5000305175781, - "epoch": 0.2662857142857143, - "grad_norm": 0.39604970812797546, - "kl": 0.0054473876953125, + "completion_length": 1361.2708702087402, + "epoch": 0.5325714285714286, + "grad_norm": 4.891315937042236, + "kl": 0.6375732421875, "learning_rate": 1.1261754973965422e-07, - "loss": 0.0002, - "reward": 0.14204144291579723, - "reward_std": 0.0992696713656187, - "rewards/cosine_scaled_reward": -0.08424053154885769, - "rewards/format_reward": 1.0, + "loss": 0.0255, + "reward": 0.1684982028673403, + "reward_std": 0.13606750033795834, + "rewards/cosine_scaled_reward": 0.051418120972812176, + "rewards/format_reward": 0.875, "step": 466 }, { - "completion_length": 1440.3333435058594, - "epoch": 0.26685714285714285, - "grad_norm": 0.4701591432094574, - "kl": 0.0153045654296875, + "completion_length": 1727.2708740234375, + "epoch": 0.5337142857142857, + "grad_norm": 3.141331434249878, + "kl": 0.8231964111328125, "learning_rate": 1.1188949370707787e-07, - "loss": 0.0006, - "reward": 0.14087377674877644, - "reward_std": 0.12677839933894575, - "rewards/cosine_scaled_reward": 0.018134362995624542, - "rewards/format_reward": 0.7916666716337204, + "loss": 0.0329, + "reward": 0.0738575104624033, + "reward_std": 0.11307883076369762, + "rewards/cosine_scaled_reward": -0.20494843367487192, + "rewards/format_reward": 0.8333333358168602, "step": 467 }, { - "completion_length": 2129.041748046875, - "epoch": 0.2674285714285714, - "grad_norm": 0.3471032977104187, - "kl": 0.0135955810546875, + "completion_length": 1474.2708930969238, + "epoch": 0.5348571428571428, + "grad_norm": 2.9029293060302734, + "kl": 0.9022445678710938, "learning_rate": 1.1118279056249653e-07, - "loss": 0.0005, - "reward": 0.024037884548306465, - "reward_std": 0.09396060439758003, - "rewards/cosine_scaled_reward": -0.22124765627086163, - "rewards/format_reward": 0.5833333432674408, + "loss": 0.0361, + "reward": 0.11058385335491039, + "reward_std": 0.14076802507042885, + "rewards/cosine_scaled_reward": -0.07730268314480782, + "rewards/format_reward": 0.7916666846722364, "step": 468 }, { - "completion_length": 965.5833740234375, - "epoch": 0.268, - "grad_norm": 0.4992268979549408, - "kl": 0.0136871337890625, + "completion_length": 1365.81254196167, + "epoch": 0.536, + "grad_norm": 2.5009942054748535, + "kl": 0.7577972412109375, "learning_rate": 1.1049747474962444e-07, - "loss": 0.0005, - "reward": 0.21583312842994928, - "reward_std": 0.12461132742464542, - "rewards/cosine_scaled_reward": 0.17968885973095894, - "rewards/format_reward": 0.9166666716337204, + "loss": 0.0303, + "reward": 0.14495484717190266, + "reward_std": 0.13738009426742792, + "rewards/cosine_scaled_reward": 0.003895143046975136, + "rewards/format_reward": 0.8333333395421505, "step": 469 }, { - "completion_length": 1527.6666793823242, - "epoch": 0.26857142857142857, - "grad_norm": 0.4218987822532654, - "kl": 0.01016998291015625, + "completion_length": 1764.3333740234375, + "epoch": 0.5371428571428571, + "grad_norm": 3.4764883518218994, + "kl": 1.04931640625, "learning_rate": 1.0983357966978745e-07, - "loss": 0.0004, - "reward": 0.15412186551839113, - "reward_std": 0.11437848303467035, - "rewards/cosine_scaled_reward": 0.03735947608947754, - "rewards/format_reward": 0.8333333358168602, + "loss": 0.042, + "reward": 0.12439248080772813, + "reward_std": 0.1104172533378005, + "rewards/cosine_scaled_reward": -0.042109834030270576, + "rewards/format_reward": 0.791666679084301, "step": 470 }, { - "completion_length": 1656.0833892822266, - "epoch": 0.26914285714285713, - "grad_norm": 0.3551444113254547, - "kl": 0.007755279541015625, + "completion_length": 1697.2083587646484, + "epoch": 0.5382857142857143, + "grad_norm": 3.593810558319092, + "kl": 1.015869140625, "learning_rate": 1.0919113768029517e-07, - "loss": 0.0003, - "reward": 0.15511425444856286, - "reward_std": 0.14628601353615522, - "rewards/cosine_scaled_reward": 0.01941436529159546, - "rewards/format_reward": 0.8750000149011612, + "loss": 0.0406, + "reward": 0.16313168196938932, + "reward_std": 0.1711752563714981, + "rewards/cosine_scaled_reward": 0.08633797615766525, + "rewards/format_reward": 0.7708333544433117, "step": 471 }, { - "completion_length": 2208.6250610351562, - "epoch": 0.26971428571428574, - "grad_norm": 0.33657190203666687, - "kl": 0.015655517578125, + "completion_length": 1437.3125457763672, + "epoch": 0.5394285714285715, + "grad_norm": 2.227132558822632, + "kl": 0.584442138671875, "learning_rate": 1.0857018009286381e-07, - "loss": 0.0006, - "reward": 0.07080578990280628, - "reward_std": 0.13868937082588673, - "rewards/cosine_scaled_reward": -0.12444374524056911, - "rewards/format_reward": 0.6666666679084301, + "loss": 0.0234, + "reward": 0.049232515739277005, + "reward_std": 0.10539399227127433, + "rewards/cosine_scaled_reward": -0.2601514309644699, + "rewards/format_reward": 0.8125000186264515, "step": 472 }, { - "completion_length": 1014.5000152587891, - "epoch": 0.2702857142857143, - "grad_norm": 0.35231611132621765, - "kl": 0.008575439453125, + "completion_length": 1427.6667175292969, + "epoch": 0.5405714285714286, + "grad_norm": 2.773785352706909, + "kl": 0.7134475708007812, "learning_rate": 1.0797073717209013e-07, - "loss": 0.0003, - "reward": 0.0662034135311842, - "reward_std": 0.05356099922209978, - "rewards/cosine_scaled_reward": -0.28560300916433334, - "rewards/format_reward": 0.9583333432674408, + "loss": 0.0286, + "reward": 0.06174903141800314, + "reward_std": 0.0896931691095233, + "rewards/cosine_scaled_reward": -0.2288785793352872, + "rewards/format_reward": 0.8125000055879354, "step": 473 }, { - "completion_length": 1747.6666870117188, - "epoch": 0.27085714285714285, - "grad_norm": 0.2967202663421631, - "kl": 0.008983612060546875, + "completion_length": 1671.583366394043, + "epoch": 0.5417142857142857, + "grad_norm": 3.813199520111084, + "kl": 1.1416015625, "learning_rate": 1.0739283813397639e-07, - "loss": 0.0004, - "reward": 0.2399043757468462, - "reward_std": 0.11284455470740795, - "rewards/cosine_scaled_reward": 0.33540781773626804, - "rewards/format_reward": 0.75, + "loss": 0.0456, + "reward": 0.22307377692777663, + "reward_std": 0.1535223526880145, + "rewards/cosine_scaled_reward": 0.24020265229046345, + "rewards/format_reward": 0.8125000149011612, "step": 474 }, { - "completion_length": 1124.7917022705078, - "epoch": 0.2714285714285714, - "grad_norm": 0.33496546745300293, - "kl": 0.01090240478515625, + "completion_length": 1295.3333587646484, + "epoch": 0.5428571428571428, + "grad_norm": 1.257153868675232, + "kl": 0.5095977783203125, "learning_rate": 1.068365111445064e-07, - "loss": 0.0004, - "reward": 0.19144833646714687, - "reward_std": 0.15066044661216438, - "rewards/cosine_scaled_reward": 0.10481813549995422, - "rewards/format_reward": 0.9166666865348816, + "loss": 0.0204, + "reward": 0.11992977559566498, + "reward_std": 0.13077597226947546, + "rewards/cosine_scaled_reward": -0.1030871132388711, + "rewards/format_reward": 0.8958333432674408, "step": 475 }, { - "completion_length": 1379.3333587646484, - "epoch": 0.272, - "grad_norm": 0.26556169986724854, - "kl": 0.0073699951171875, + "completion_length": 1405.2500457763672, + "epoch": 0.544, + "grad_norm": 1.7341384887695312, + "kl": 0.8976593017578125, "learning_rate": 1.063017833182728e-07, - "loss": 0.0003, - "reward": 0.15770338382571936, - "reward_std": 0.12872092425823212, - "rewards/cosine_scaled_reward": 0.007714824751019478, - "rewards/format_reward": 0.9166666716337204, + "loss": 0.0359, + "reward": 0.11575727723538876, + "reward_std": 0.13892074767500162, + "rewards/cosine_scaled_reward": -0.09050496038980782, + "rewards/format_reward": 0.8541666828095913, "step": 476 }, { - "completion_length": 1432.5833435058594, - "epoch": 0.2725714285714286, - "grad_norm": 0.2708079516887665, - "kl": 0.00745391845703125, + "completion_length": 1334.520866394043, + "epoch": 0.5451428571428572, + "grad_norm": 2.1544551849365234, + "kl": 0.790008544921875, "learning_rate": 1.0578868071715544e-07, - "loss": 0.0003, - "reward": 0.3094392456114292, - "reward_std": 0.08074376359581947, - "rewards/cosine_scaled_reward": 0.5412539541721344, - "rewards/format_reward": 0.75, + "loss": 0.0317, + "reward": 0.1808297468814999, + "reward_std": 0.15548726078122854, + "rewards/cosine_scaled_reward": 0.09577041026204824, + "rewards/format_reward": 0.8541666865348816, "step": 477 }, { - "completion_length": 1313.9583587646484, - "epoch": 0.27314285714285713, - "grad_norm": 0.3747269809246063, - "kl": 0.00879669189453125, + "completion_length": 1640.0417098999023, + "epoch": 0.5462857142857143, + "grad_norm": 2.7642574310302734, + "kl": 1.0536422729492188, "learning_rate": 1.0529722834905125e-07, - "loss": 0.0004, - "reward": 0.14969859644770622, - "reward_std": 0.0952550619840622, - "rewards/cosine_scaled_reward": 0.024618370458483696, - "rewards/format_reward": 0.8333333358168602, + "loss": 0.0421, + "reward": 0.12267802411224693, + "reward_std": 0.11104590399190784, + "rewards/cosine_scaled_reward": -0.03174232318997383, + "rewards/format_reward": 0.7708333469927311, "step": 478 }, { - "completion_length": 1250.8333587646484, - "epoch": 0.2737142857142857, - "grad_norm": 0.35014885663986206, - "kl": 0.0105438232421875, + "completion_length": 1397.3750457763672, + "epoch": 0.5474285714285714, + "grad_norm": 3.118372678756714, + "kl": 0.7571868896484375, "learning_rate": 1.0482745016665526e-07, - "loss": 0.0004, - "reward": 0.09707892872393131, - "reward_std": 0.09250679984688759, - "rewards/cosine_scaled_reward": -0.1726207509636879, - "rewards/format_reward": 0.9166666716337204, + "loss": 0.0302, + "reward": 0.11781717138364911, + "reward_std": 0.11601145751774311, + "rewards/cosine_scaled_reward": -0.06828589458018541, + "rewards/format_reward": 0.8125000186264515, "step": 479 }, { - "completion_length": 1332.5833587646484, - "epoch": 0.2742857142857143, - "grad_norm": 0.526451826095581, - "kl": 0.01734161376953125, + "completion_length": 1380.4375381469727, + "epoch": 0.5485714285714286, + "grad_norm": 2.790198802947998, + "kl": 1.3089599609375, "learning_rate": 1.0437936906629334e-07, - "loss": 0.0007, - "reward": 0.08888316294178367, - "reward_std": 0.09775535203516483, - "rewards/cosine_scaled_reward": -0.1562953796237707, - "rewards/format_reward": 0.8333333432674408, + "loss": 0.0523, + "reward": 0.09327662736177444, + "reward_std": 0.12549111153930426, + "rewards/cosine_scaled_reward": -0.14611217193305492, + "rewards/format_reward": 0.8333333507180214, "step": 480 }, { - "completion_length": 1378.2917175292969, - "epoch": 0.27485714285714286, - "grad_norm": 0.6014401912689209, - "kl": 0.01509857177734375, + "completion_length": 1502.9375457763672, + "epoch": 0.5497142857142857, + "grad_norm": 3.935354232788086, + "kl": 0.792388916015625, "learning_rate": 1.0395300688680625e-07, - "loss": 0.0006, - "reward": 0.08418819680809975, - "reward_std": 0.12936637550592422, - "rewards/cosine_scaled_reward": -0.1695399060845375, - "rewards/format_reward": 0.833333358168602, + "loss": 0.0317, + "reward": 0.06033009593375027, + "reward_std": 0.12011839542537928, + "rewards/cosine_scaled_reward": -0.16942069120705128, + "rewards/format_reward": 0.6875000186264515, "step": 481 }, { - "completion_length": 1868.7500305175781, - "epoch": 0.2754285714285714, - "grad_norm": 0.29042962193489075, - "kl": 0.0122222900390625, + "completion_length": 1320.3541984558105, + "epoch": 0.5508571428571428, + "grad_norm": 3.4273793697357178, + "kl": 0.8202667236328125, "learning_rate": 1.0354838440848501e-07, - "loss": 0.0005, - "reward": 0.05093623013817705, - "reward_std": 0.08514466695487499, - "rewards/cosine_scaled_reward": -0.2666657380759716, - "rewards/format_reward": 0.8333333432674408, + "loss": 0.0328, + "reward": 0.145433189580217, + "reward_std": 0.13374328007921576, + "rewards/cosine_scaled_reward": -0.03734024800360203, + "rewards/format_reward": 0.916666679084301, "step": 482 }, { - "completion_length": 962.0417022705078, - "epoch": 0.276, - "grad_norm": 2.3759164810180664, - "kl": 0.02405548095703125, + "completion_length": 1709.2500534057617, + "epoch": 0.552, + "grad_norm": 3.1256120204925537, + "kl": 1.09405517578125, "learning_rate": 1.0316552135205837e-07, - "loss": 0.001, - "reward": 0.0790173931454774, - "reward_std": 0.07320081850048155, - "rewards/cosine_scaled_reward": -0.20882237516343594, - "rewards/format_reward": 0.8750000149011612, + "loss": 0.0438, + "reward": 0.10491634625941515, + "reward_std": 0.13583884108811617, + "rewards/cosine_scaled_reward": -0.11350399069488049, + "rewards/format_reward": 0.8333333432674408, "step": 483 }, { - "completion_length": 1389.0417175292969, - "epoch": 0.2765714285714286, - "grad_norm": 0.4754798412322998, - "kl": 0.0251922607421875, + "completion_length": 1060.3125457763672, + "epoch": 0.5531428571428572, + "grad_norm": 2.7416789531707764, + "kl": 0.593719482421875, "learning_rate": 1.0280443637773163e-07, - "loss": 0.001, - "reward": 0.16605499107390642, - "reward_std": 0.10074156429618597, - "rewards/cosine_scaled_reward": 0.013021436869166791, - "rewards/format_reward": 0.9583333432674408, + "loss": 0.0237, + "reward": 0.13031714130192995, + "reward_std": 0.12379100965335965, + "rewards/cosine_scaled_reward": -0.09740070005500456, + "rewards/format_reward": 0.9375000074505806, "step": 484 }, { - "completion_length": 1579.0833892822266, - "epoch": 0.27714285714285714, - "grad_norm": 0.3607923686504364, - "kl": 0.0094451904296875, + "completion_length": 1706.8333740234375, + "epoch": 0.5542857142857143, + "grad_norm": 2.7037065029144287, + "kl": 1.599609375, "learning_rate": 1.0246514708427701e-07, - "loss": 0.0004, - "reward": 0.07166456733830273, - "reward_std": 0.09728987328708172, - "rewards/cosine_scaled_reward": -0.18766840733587742, - "rewards/format_reward": 0.791666679084301, + "loss": 0.064, + "reward": 0.07579714641906321, + "reward_std": 0.1142753018066287, + "rewards/cosine_scaled_reward": -0.1662511508911848, + "rewards/format_reward": 0.7708333469927311, "step": 485 }, { - "completion_length": 1408.6250610351562, - "epoch": 0.2777142857142857, - "grad_norm": 0.2610523998737335, - "kl": 0.00855255126953125, + "completion_length": 909.7500267028809, + "epoch": 0.5554285714285714, + "grad_norm": 2.9626100063323975, + "kl": 0.5513458251953125, "learning_rate": 1.0214767000817596e-07, - "loss": 0.0003, - "reward": 0.14911100082099438, - "reward_std": 0.12243578024208546, - "rewards/cosine_scaled_reward": -0.041350213810801506, - "rewards/format_reward": 0.9583333432674408, + "loss": 0.0221, + "reward": 0.16371853230521083, + "reward_std": 0.08725554682314396, + "rewards/cosine_scaled_reward": -0.0292919734492898, + "rewards/format_reward": 1.0, "step": 486 }, { - "completion_length": 1436.3333358764648, - "epoch": 0.2782857142857143, - "grad_norm": 0.37136200070381165, - "kl": 0.0115814208984375, + "completion_length": 977.083366394043, + "epoch": 0.5565714285714286, + "grad_norm": 1.2269593477249146, + "kl": 0.354217529296875, "learning_rate": 1.0185202062281336e-07, - "loss": 0.0005, - "reward": 0.2240826990455389, - "reward_std": 0.06997670792043209, - "rewards/cosine_scaled_reward": 0.2247643545269966, - "rewards/format_reward": 0.875, + "loss": 0.0142, + "reward": 0.18861438240855932, + "reward_std": 0.12223522993735969, + "rewards/cosine_scaled_reward": 0.09728494752198458, + "rewards/format_reward": 0.9166666716337204, "step": 487 }, { - "completion_length": 1474.9167175292969, - "epoch": 0.27885714285714286, - "grad_norm": 0.301455557346344, - "kl": 0.007965087890625, + "completion_length": 1098.43754196167, + "epoch": 0.5577142857142857, + "grad_norm": 3.2133853435516357, + "kl": 0.5051727294921875, "learning_rate": 1.0157821333772304e-07, - "loss": 0.0003, - "reward": 0.19594286940991879, - "reward_std": 0.17231578193604946, - "rewards/cosine_scaled_reward": 0.1186765544116497, - "rewards/format_reward": 0.9166666865348816, + "loss": 0.0202, + "reward": 0.13033967884257436, + "reward_std": 0.08723578602075577, + "rewards/cosine_scaled_reward": -0.08787534758448601, + "rewards/format_reward": 0.9375000074505806, "step": 488 }, { - "completion_length": 1647.3333435058594, - "epoch": 0.2794285714285714, - "grad_norm": 0.2870188057422638, - "kl": 0.007232666015625, + "completion_length": 1465.5000457763672, + "epoch": 0.5588571428571428, + "grad_norm": 4.108081817626953, + "kl": 0.8738555908203125, "learning_rate": 1.013262614978859e-07, - "loss": 0.0003, - "reward": 0.19428860675543547, - "reward_std": 0.1931849978864193, - "rewards/cosine_scaled_reward": 0.13230790570378304, - "rewards/format_reward": 0.875, + "loss": 0.035, + "reward": 0.0360484067350626, + "reward_std": 0.08054300001822412, + "rewards/cosine_scaled_reward": -0.2918727397918701, + "rewards/format_reward": 0.7916666865348816, "step": 489 }, { - "completion_length": 1850.2083587646484, - "epoch": 0.28, - "grad_norm": 0.4214995503425598, - "kl": 0.01214599609375, + "completion_length": 1255.0416984558105, + "epoch": 0.56, + "grad_norm": 1.202156662940979, + "kl": 0.7819290161132812, "learning_rate": 1.0109617738307911e-07, - "loss": 0.0005, - "reward": 0.11115190200507641, - "reward_std": 0.17980430275201797, - "rewards/cosine_scaled_reward": -0.024802304804325104, - "rewards/format_reward": 0.7083333507180214, + "loss": 0.0313, + "reward": 0.15311120147816837, + "reward_std": 0.10377770848572254, + "rewards/cosine_scaled_reward": -0.02998751401901245, + "rewards/format_reward": 0.9583333432674408, "step": 490 }, { - "completion_length": 1484.3333892822266, - "epoch": 0.2805714285714286, - "grad_norm": 0.37037596106529236, - "kl": 0.01509857177734375, + "completion_length": 1578.0625381469727, + "epoch": 0.5611428571428572, + "grad_norm": 1.941928505897522, + "kl": 0.84698486328125, "learning_rate": 1.0088797220727779e-07, - "loss": 0.0006, - "reward": 0.17148346919566393, - "reward_std": 0.07489623595029116, - "rewards/cosine_scaled_reward": 0.08829830959439278, - "rewards/format_reward": 0.8333333358168602, + "loss": 0.0339, + "reward": 0.13159164262469858, + "reward_std": 0.13246915489435196, + "rewards/cosine_scaled_reward": -0.05548815353540704, + "rewards/format_reward": 0.8750000074505806, "step": 491 }, { - "completion_length": 1292.3333740234375, - "epoch": 0.28114285714285714, - "grad_norm": 0.29965201020240784, - "kl": 0.0102691650390625, + "completion_length": 1292.3333587646484, + "epoch": 0.5622857142857143, + "grad_norm": 2.9671390056610107, + "kl": 0.797515869140625, "learning_rate": 1.0070165611810855e-07, - "loss": 0.0004, - "reward": 0.10552676115185022, - "reward_std": 0.11199694685637951, - "rewards/cosine_scaled_reward": -0.1681173350661993, + "loss": 0.0319, + "reward": 0.13928497838787735, + "reward_std": 0.10173067264258862, + "rewards/cosine_scaled_reward": -0.08179534692317247, "rewards/format_reward": 0.9583333432674408, "step": 492 }, { - "completion_length": 2142.6250915527344, - "epoch": 0.2817142857142857, - "grad_norm": 0.5195578932762146, - "kl": 0.02080535888671875, + "completion_length": 1064.8125228881836, + "epoch": 0.5634285714285714, + "grad_norm": 2.1849365234375, + "kl": 0.4040985107421875, "learning_rate": 1.005372381963547e-07, - "loss": 0.0008, - "reward": 0.018468670547008514, - "reward_std": 0.11828110925853252, - "rewards/cosine_scaled_reward": -0.23857781663537025, - "rewards/format_reward": 0.5833333469927311, + "loss": 0.0162, + "reward": 0.13772490341216326, + "reward_std": 0.14829062833450735, + "rewards/cosine_scaled_reward": -0.07700726587790996, + "rewards/format_reward": 0.9583333432674408, "step": 493 }, { - "completion_length": 1982.166732788086, - "epoch": 0.2822857142857143, - "grad_norm": 0.3225327134132385, - "kl": 0.008243560791015625, + "completion_length": 1061.7291870117188, + "epoch": 0.5645714285714286, + "grad_norm": 1.5284849405288696, + "kl": 0.540435791015625, "learning_rate": 1.0039472645551372e-07, - "loss": 0.0003, - "reward": 0.12525426445063204, - "reward_std": 0.11276457970961928, - "rewards/cosine_scaled_reward": -0.07034042105078697, - "rewards/format_reward": 0.8750000149011612, + "loss": 0.0217, + "reward": 0.12483312236145139, + "reward_std": 0.1254338538274169, + "rewards/cosine_scaled_reward": -0.08674540685024112, + "rewards/format_reward": 0.8958333432674408, "step": 494 }, { - "completion_length": 1622.8333435058594, - "epoch": 0.28285714285714286, - "grad_norm": 0.46337267756462097, - "kl": 0.0142059326171875, + "completion_length": 1627.0625305175781, + "epoch": 0.5657142857142857, + "grad_norm": 5.17227029800415, + "kl": 1.3586196899414062, "learning_rate": 1.002741278414069e-07, - "loss": 0.0006, - "reward": 0.1336368229240179, - "reward_std": 0.06697798799723387, - "rewards/cosine_scaled_reward": 0.03765885531902313, - "rewards/format_reward": 0.7083333432674408, + "loss": 0.0544, + "reward": 0.11015307196066715, + "reward_std": 0.13126440905034542, + "rewards/cosine_scaled_reward": -0.08530823234468699, + "rewards/format_reward": 0.8125000149011612, "step": 495 }, { - "completion_length": 748.1250152587891, - "epoch": 0.2834285714285714, - "grad_norm": 0.3830733001232147, - "kl": 0.005741119384765625, + "completion_length": 1575.4167098999023, + "epoch": 0.5668571428571428, + "grad_norm": 3.1292481422424316, + "kl": 1.2574386596679688, "learning_rate": 1.0017544823184055e-07, - "loss": 0.0002, - "reward": 0.2972524566575885, - "reward_std": 0.06938042910769582, - "rewards/cosine_scaled_reward": 0.3789844736456871, - "rewards/format_reward": 1.0, + "loss": 0.0503, + "reward": 0.1306152348406613, + "reward_std": 0.08260433259420097, + "rewards/cosine_scaled_reward": -0.013811783166602254, + "rewards/format_reward": 0.7916666716337204, "step": 496 }, { - "completion_length": 1095.7500305175781, - "epoch": 0.284, - "grad_norm": 0.4513530731201172, - "kl": 0.012371063232421875, + "completion_length": 1092.5208587646484, + "epoch": 0.568, + "grad_norm": 3.035909414291382, + "kl": 0.6652145385742188, "learning_rate": 1.0009869243631952e-07, - "loss": 0.0005, - "reward": 0.23861000314354897, - "reward_std": 0.12132814712822437, - "rewards/cosine_scaled_reward": 0.26830008905380964, - "rewards/format_reward": 0.875, + "loss": 0.0266, + "reward": 0.17385427234694362, + "reward_std": 0.09661010140553117, + "rewards/cosine_scaled_reward": 0.058778489008545876, + "rewards/format_reward": 0.8958333358168602, "step": 497 }, { - "completion_length": 1185.5417175292969, - "epoch": 0.2845714285714286, - "grad_norm": 0.3653818368911743, - "kl": 0.0113372802734375, + "completion_length": 1320.1250457763672, + "epoch": 0.5691428571428572, + "grad_norm": 1.9357889890670776, + "kl": 0.88330078125, "learning_rate": 1.000438641958131e-07, - "loss": 0.0005, - "reward": 0.2222729790955782, - "reward_std": 0.13904497772455215, - "rewards/cosine_scaled_reward": 0.19242076948285103, - "rewards/format_reward": 0.9166666716337204, + "loss": 0.0354, + "reward": 0.13382654823362827, + "reward_std": 0.11791369551792741, + "rewards/cosine_scaled_reward": -0.06090674642473459, + "rewards/format_reward": 0.8958333432674408, "step": 498 }, { - "completion_length": 1228.2083740234375, - "epoch": 0.28514285714285714, - "grad_norm": 0.33757004141807556, - "kl": 0.0083465576171875, + "completion_length": 1220.7292022705078, + "epoch": 0.5702857142857143, + "grad_norm": 2.041574239730835, + "kl": 0.31233978271484375, "learning_rate": 1.0001096618257236e-07, - "loss": 0.0003, - "reward": 0.10967470798641443, - "reward_std": 0.12665395252406597, - "rewards/cosine_scaled_reward": -0.15649681817740202, - "rewards/format_reward": 0.9583333432674408, + "loss": 0.0125, + "reward": 0.15456983912736177, + "reward_std": 0.130745030939579, + "rewards/cosine_scaled_reward": -0.008478153496980667, + "rewards/format_reward": 0.9166666865348816, "step": 499 }, { - "completion_length": 1122.3333587646484, - "epoch": 0.2857142857142857, - "grad_norm": 0.9894787669181824, - "kl": 0.01134490966796875, + "completion_length": 1320.8750457763672, + "epoch": 0.5714285714285714, + "grad_norm": 3.362833023071289, + "kl": 0.72802734375, "learning_rate": 1e-07, - "loss": 0.0005, - "reward": 0.18905188143253326, - "reward_std": 0.11179906129837036, - "rewards/cosine_scaled_reward": 0.07686734944581985, - "rewards/format_reward": 0.9583333432674408, + "loss": 0.0292, + "reward": 0.11918663769029081, + "reward_std": 0.1335056396201253, + "rewards/cosine_scaled_reward": -0.1146287601441145, + "rewards/format_reward": 0.9166666865348816, "step": 500 }, { - "completion_length": 1086.6667175292969, - "epoch": 0.2862857142857143, - "grad_norm": 0.544489324092865, - "kl": 0.0120391845703125, - "learning_rate": 1.0001096618257236e-07, - "loss": 0.0005, - "reward": 0.13621441926807165, - "reward_std": 0.1650450136512518, - "rewards/cosine_scaled_reward": -0.018101891502738, - "rewards/format_reward": 0.8333333432674408, - "step": 501 - }, - { - "epoch": 0.2862857142857143, - "step": 501, + "epoch": 0.5714285714285714, + "step": 500, "total_flos": 0.0, - "train_loss": 9.615809731966185e-07, - "train_runtime": 82.4093, - "train_samples_per_second": 145.615, - "train_steps_per_second": 6.067 + "train_loss": 0.012724974361105416, + "train_runtime": 59499.213, + "train_samples_per_second": 0.403, + "train_steps_per_second": 0.008 } ], "logging_steps": 1,