{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 200, "global_step": 1875, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "completion_length": 309.1416763305664, "epoch": 0.0026666666666666666, "grad_norm": 1.0915299654006958, "kl": 0.000769805908203125, "learning_rate": 2.6595744680851062e-08, "loss": 0.0572, "reward": -0.8166666805744172, "reward_std": 0.30653437227010727, "rewards/accuracy_reward": 0.07500000223517418, "rewards/format_reward": -0.8916666805744171, "step": 5 }, { "completion_length": 335.7166778564453, "epoch": 0.005333333333333333, "grad_norm": 1.6455351114273071, "kl": 0.00115509033203125, "learning_rate": 5.3191489361702123e-08, "loss": 0.0544, "reward": -0.8583333492279053, "reward_std": 0.2569814197719097, "rewards/accuracy_reward": 0.0666666679084301, "rewards/format_reward": -0.925000011920929, "step": 10 }, { "completion_length": 290.56251220703126, "epoch": 0.008, "grad_norm": 1.7345483303070068, "kl": 0.0012176513671875, "learning_rate": 7.978723404255319e-08, "loss": 0.0639, "reward": -0.8333333611488343, "reward_std": 0.4082482993602753, "rewards/accuracy_reward": 0.04166666753590107, "rewards/format_reward": -0.8750000238418579, "step": 15 }, { "completion_length": 320.79584350585935, "epoch": 0.010666666666666666, "grad_norm": 1.5033214092254639, "kl": 0.0012172698974609376, "learning_rate": 1.0638297872340425e-07, "loss": 0.0569, "reward": -0.8416666924953461, "reward_std": 0.2757193736732006, "rewards/accuracy_reward": 0.04166666753590107, "rewards/format_reward": -0.8833333492279053, "step": 20 }, { "completion_length": 341.09584655761716, "epoch": 0.013333333333333334, "grad_norm": 1.9244236946105957, "kl": 0.0012542724609375, "learning_rate": 1.329787234042553e-07, "loss": 0.0252, "reward": -0.8458333492279053, "reward_std": 0.3047572821378708, "rewards/accuracy_reward": 0.08750000149011612, "rewards/format_reward": -0.9333333492279052, "step": 25 }, { "completion_length": 298.28334350585936, "epoch": 0.016, "grad_norm": 2.07985258102417, "kl": 0.0014141082763671875, "learning_rate": 1.5957446808510638e-07, "loss": 0.0589, "reward": -0.8083333551883698, "reward_std": 0.3681695103645325, "rewards/accuracy_reward": 0.05833333432674408, "rewards/format_reward": -0.8666666924953461, "step": 30 }, { "completion_length": 327.3208374023437, "epoch": 0.018666666666666668, "grad_norm": 1.2517731189727783, "kl": 0.0022617340087890624, "learning_rate": 1.8617021276595742e-07, "loss": 0.0912, "reward": -0.854166692495346, "reward_std": 0.3303105406463146, "rewards/accuracy_reward": 0.03750000111758709, "rewards/format_reward": -0.891666692495346, "step": 35 }, { "completion_length": 352.6000122070312, "epoch": 0.021333333333333333, "grad_norm": 0.8213557004928589, "kl": 0.003079986572265625, "learning_rate": 2.127659574468085e-07, "loss": 0.0448, "reward": -0.850000011920929, "reward_std": 0.28401452749967576, "rewards/accuracy_reward": 0.05833333395421505, "rewards/format_reward": -0.9083333432674408, "step": 40 }, { "completion_length": 337.5208435058594, "epoch": 0.024, "grad_norm": 0.7038048505783081, "kl": 0.0064483642578125, "learning_rate": 2.393617021276596e-07, "loss": 0.1086, "reward": -0.8291666865348816, "reward_std": 0.320113442838192, "rewards/accuracy_reward": 0.07083333469927311, "rewards/format_reward": -0.9000000119209289, "step": 45 }, { "completion_length": 353.56251220703126, "epoch": 0.02666666666666667, "grad_norm": 1.7768754959106445, "kl": 0.011151123046875, "learning_rate": 2.659574468085106e-07, "loss": 0.0946, "reward": -0.791666692495346, "reward_std": 0.3645200379192829, "rewards/accuracy_reward": 0.05000000149011612, "rewards/format_reward": -0.8416666805744171, "step": 50 }, { "completion_length": 304.80834197998047, "epoch": 0.029333333333333333, "grad_norm": 1.8845570087432861, "kl": 0.0145263671875, "learning_rate": 2.925531914893617e-07, "loss": 0.1085, "reward": -0.825000011920929, "reward_std": 0.378238408267498, "rewards/accuracy_reward": 0.05000000149011612, "rewards/format_reward": -0.8750000119209289, "step": 55 }, { "completion_length": 300.5541717529297, "epoch": 0.032, "grad_norm": 1.3354154825210571, "kl": 0.0145751953125, "learning_rate": 3.1914893617021275e-07, "loss": 0.0812, "reward": -0.8083333492279052, "reward_std": 0.3903405636548996, "rewards/accuracy_reward": 0.09166666939854622, "rewards/format_reward": -0.900000023841858, "step": 60 }, { "completion_length": 302.21250915527344, "epoch": 0.034666666666666665, "grad_norm": 1.0147898197174072, "kl": 0.022381591796875, "learning_rate": 3.457446808510638e-07, "loss": 0.1457, "reward": -0.7458333492279052, "reward_std": 0.47493031769990923, "rewards/accuracy_reward": 0.03750000111758709, "rewards/format_reward": -0.7833333492279053, "step": 65 }, { "completion_length": 291.40417633056643, "epoch": 0.037333333333333336, "grad_norm": 2.549830675125122, "kl": 0.03157958984375, "learning_rate": 3.7234042553191484e-07, "loss": 0.1052, "reward": -0.7708333551883697, "reward_std": 0.4252162277698517, "rewards/accuracy_reward": 0.06250000037252904, "rewards/format_reward": -0.8333333551883697, "step": 70 }, { "completion_length": 315.60834350585935, "epoch": 0.04, "grad_norm": 1.0364981889724731, "kl": 0.02958984375, "learning_rate": 3.989361702127659e-07, "loss": 0.1479, "reward": -0.7458333551883698, "reward_std": 0.48624068051576613, "rewards/accuracy_reward": 0.06250000111758709, "rewards/format_reward": -0.8083333611488343, "step": 75 }, { "completion_length": 275.8916732788086, "epoch": 0.042666666666666665, "grad_norm": 1.8735177516937256, "kl": 0.03341064453125, "learning_rate": 4.25531914893617e-07, "loss": 0.1291, "reward": -0.7250000238418579, "reward_std": 0.5295502826571464, "rewards/accuracy_reward": 0.05833333395421505, "rewards/format_reward": -0.7833333551883698, "step": 80 }, { "completion_length": 278.6000091552734, "epoch": 0.04533333333333334, "grad_norm": 1.7058619260787964, "kl": 0.04388427734375, "learning_rate": 4.5212765957446806e-07, "loss": 0.1241, "reward": -0.7166666746139526, "reward_std": 0.517094686627388, "rewards/accuracy_reward": 0.05833333395421505, "rewards/format_reward": -0.7750000119209289, "step": 85 }, { "completion_length": 294.21667327880857, "epoch": 0.048, "grad_norm": 2.6495754718780518, "kl": 0.05029296875, "learning_rate": 4.787234042553192e-07, "loss": 0.1663, "reward": -0.6333333402872086, "reward_std": 0.6619763910770416, "rewards/accuracy_reward": 0.08333333507180214, "rewards/format_reward": -0.7166666865348816, "step": 90 }, { "completion_length": 290.7458435058594, "epoch": 0.050666666666666665, "grad_norm": 1.9712241888046265, "kl": 0.064501953125, "learning_rate": 5.053191489361702e-07, "loss": 0.2083, "reward": -0.5291666805744171, "reward_std": 0.7097373753786087, "rewards/accuracy_reward": 0.10416666902601719, "rewards/format_reward": -0.6333333551883698, "step": 95 }, { "completion_length": 276.9541778564453, "epoch": 0.05333333333333334, "grad_norm": 2.6062912940979004, "kl": 0.09736328125, "learning_rate": 5.319148936170212e-07, "loss": 0.1889, "reward": -0.5833333522081375, "reward_std": 0.6344460442662239, "rewards/accuracy_reward": 0.09166666977107525, "rewards/format_reward": -0.675000011920929, "step": 100 }, { "completion_length": 258.17084350585935, "epoch": 0.056, "grad_norm": 3.833866834640503, "kl": 0.1748046875, "learning_rate": 5.585106382978722e-07, "loss": 0.1782, "reward": -0.5166666775941848, "reward_std": 0.7733665883541108, "rewards/accuracy_reward": 0.11666666902601719, "rewards/format_reward": -0.6333333492279053, "step": 105 }, { "completion_length": 276.9500076293945, "epoch": 0.058666666666666666, "grad_norm": 2.573361873626709, "kl": 0.1017578125, "learning_rate": 5.851063829787234e-07, "loss": 0.1876, "reward": -0.49583334624767306, "reward_std": 0.7565078109502792, "rewards/accuracy_reward": 0.12083333656191826, "rewards/format_reward": -0.6166666924953461, "step": 110 }, { "completion_length": 276.2250061035156, "epoch": 0.06133333333333333, "grad_norm": 1.7497327327728271, "kl": 0.075927734375, "learning_rate": 6.117021276595744e-07, "loss": 0.1169, "reward": -0.6041666865348816, "reward_std": 0.6442232474684715, "rewards/accuracy_reward": 0.10416666828095913, "rewards/format_reward": -0.7083333551883697, "step": 115 }, { "completion_length": 258.6750091552734, "epoch": 0.064, "grad_norm": 2.181297779083252, "kl": 0.081201171875, "learning_rate": 6.382978723404255e-07, "loss": 0.1729, "reward": -0.5708333432674408, "reward_std": 0.7166081488132476, "rewards/accuracy_reward": 0.06250000186264515, "rewards/format_reward": -0.6333333522081375, "step": 120 }, { "completion_length": 268.40834045410156, "epoch": 0.06666666666666667, "grad_norm": 4.303333759307861, "kl": 0.133251953125, "learning_rate": 6.648936170212765e-07, "loss": 0.2288, "reward": -0.5125000178813934, "reward_std": 0.7744767606258393, "rewards/accuracy_reward": 0.07083333544433117, "rewards/format_reward": -0.5833333492279053, "step": 125 }, { "completion_length": 299.7708435058594, "epoch": 0.06933333333333333, "grad_norm": 4.013002872467041, "kl": 0.169775390625, "learning_rate": 6.914893617021277e-07, "loss": 0.318, "reward": -0.3000000104308128, "reward_std": 0.8895713210105896, "rewards/accuracy_reward": 0.08333333507180214, "rewards/format_reward": -0.3833333447575569, "step": 130 }, { "completion_length": 235.02500915527344, "epoch": 0.072, "grad_norm": 2.6780853271484375, "kl": 0.173779296875, "learning_rate": 7.180851063829787e-07, "loss": 0.1926, "reward": -0.35416668057441714, "reward_std": 0.7739700466394425, "rewards/accuracy_reward": 0.11250000409781932, "rewards/format_reward": -0.46666668355464935, "step": 135 }, { "completion_length": 257.54584045410155, "epoch": 0.07466666666666667, "grad_norm": 1.999470829963684, "kl": 0.117333984375, "learning_rate": 7.446808510638297e-07, "loss": 0.2742, "reward": -0.23750000968575477, "reward_std": 0.8838598787784576, "rewards/accuracy_reward": 0.10416666977107525, "rewards/format_reward": -0.3416666746139526, "step": 140 }, { "completion_length": 246.15000610351564, "epoch": 0.07733333333333334, "grad_norm": 3.609872579574585, "kl": 0.20576171875, "learning_rate": 7.712765957446808e-07, "loss": 0.2398, "reward": -0.3708333432674408, "reward_std": 0.879425299167633, "rewards/accuracy_reward": 0.11250000149011612, "rewards/format_reward": -0.4833333492279053, "step": 145 }, { "completion_length": 259.1625091552734, "epoch": 0.08, "grad_norm": 4.464442729949951, "kl": 0.21318359375, "learning_rate": 7.978723404255318e-07, "loss": 0.2627, "reward": -0.1833333384245634, "reward_std": 0.9505029916763306, "rewards/accuracy_reward": 0.12500000260770322, "rewards/format_reward": -0.3083333432674408, "step": 150 }, { "completion_length": 236.57083740234376, "epoch": 0.08266666666666667, "grad_norm": 4.6332550048828125, "kl": 0.216796875, "learning_rate": 8.24468085106383e-07, "loss": 0.2414, "reward": -0.23333333767950534, "reward_std": 0.9220583379268646, "rewards/accuracy_reward": 0.13333333544433118, "rewards/format_reward": -0.3666666798293591, "step": 155 }, { "completion_length": 223.12500610351563, "epoch": 0.08533333333333333, "grad_norm": 2.853530168533325, "kl": 0.240625, "learning_rate": 8.51063829787234e-07, "loss": 0.2636, "reward": -0.2166666705161333, "reward_std": 0.9433093965053558, "rewards/accuracy_reward": 0.10833333730697632, "rewards/format_reward": -0.32500001341104506, "step": 160 }, { "completion_length": 187.4041717529297, "epoch": 0.088, "grad_norm": 3.2459123134613037, "kl": 0.2255859375, "learning_rate": 8.77659574468085e-07, "loss": 0.2653, "reward": -0.13750000298023224, "reward_std": 0.9205778002738952, "rewards/accuracy_reward": 0.1291666690260172, "rewards/format_reward": -0.2666666805744171, "step": 165 }, { "completion_length": 199.13333892822266, "epoch": 0.09066666666666667, "grad_norm": 2.3914144039154053, "kl": 0.2478515625, "learning_rate": 9.042553191489361e-07, "loss": 0.242, "reward": -0.2500000067055225, "reward_std": 0.9306629121303558, "rewards/accuracy_reward": 0.05833333432674408, "rewards/format_reward": -0.30833333879709246, "step": 170 }, { "completion_length": 194.1500045776367, "epoch": 0.09333333333333334, "grad_norm": 7.140366554260254, "kl": 0.336328125, "learning_rate": 9.308510638297871e-07, "loss": 0.2465, "reward": 0.11250000335276127, "reward_std": 0.9915949404239655, "rewards/accuracy_reward": 0.2041666742414236, "rewards/format_reward": -0.09166667088866234, "step": 175 }, { "completion_length": 177.92917022705078, "epoch": 0.096, "grad_norm": 3.311171293258667, "kl": 0.31455078125, "learning_rate": 9.574468085106384e-07, "loss": 0.2429, "reward": -0.02083333097398281, "reward_std": 0.9874655485153199, "rewards/accuracy_reward": 0.1458333358168602, "rewards/format_reward": -0.1666666716337204, "step": 180 }, { "completion_length": 204.3000045776367, "epoch": 0.09866666666666667, "grad_norm": 15.891555786132812, "kl": 0.29267578125, "learning_rate": 9.840425531914893e-07, "loss": 0.2048, "reward": -0.04166666828095913, "reward_std": 0.999242752790451, "rewards/accuracy_reward": 0.16666667237877847, "rewards/format_reward": -0.20833333879709243, "step": 185 }, { "completion_length": 169.1041702270508, "epoch": 0.10133333333333333, "grad_norm": 3.4694690704345703, "kl": 0.36845703125, "learning_rate": 9.999965320799375e-07, "loss": 0.2537, "reward": 0.29166667312383654, "reward_std": 0.9786251783370972, "rewards/accuracy_reward": 0.19166667237877846, "rewards/format_reward": 0.10000000670552253, "step": 190 }, { "completion_length": 155.45833740234374, "epoch": 0.104, "grad_norm": 4.827131271362305, "kl": 0.4091796875, "learning_rate": 9.999575185316993e-07, "loss": 0.253, "reward": 0.21666667386889457, "reward_std": 0.9847019255161286, "rewards/accuracy_reward": 0.12500000335276126, "rewards/format_reward": 0.09166666865348816, "step": 195 }, { "completion_length": 153.3083366394043, "epoch": 0.10666666666666667, "grad_norm": 6.091984272003174, "kl": 0.4533203125, "learning_rate": 9.998751599287957e-07, "loss": 0.2823, "reward": 0.37916667610406873, "reward_std": 1.0222100555896758, "rewards/accuracy_reward": 0.16250000521540642, "rewards/format_reward": 0.21666667237877846, "step": 200 }, { "epoch": 0.10666666666666667, "eval_completion_length": 163.56056142171224, "eval_kl": 0.3924609375, "eval_loss": 0.2787472605705261, "eval_reward": 0.34055556610226634, "eval_reward_std": 0.9718689926465353, "eval_rewards/accuracy_reward": 0.09500000228484472, "eval_rewards/format_reward": 0.24555556188027064, "eval_runtime": 368.0531, "eval_samples_per_second": 0.815, "eval_steps_per_second": 0.035, "step": 200 }, { "completion_length": 155.39583740234374, "epoch": 0.10933333333333334, "grad_norm": 4.420968532562256, "kl": 0.378125, "learning_rate": 9.9974946341151e-07, "loss": 0.2678, "reward": 0.42916667759418486, "reward_std": 1.0038800358772277, "rewards/accuracy_reward": 0.17916666977107526, "rewards/format_reward": 0.25000000819563867, "step": 205 }, { "completion_length": 143.2500030517578, "epoch": 0.112, "grad_norm": 40.516544342041016, "kl": 0.4390625, "learning_rate": 9.995804398774126e-07, "loss": 0.303, "reward": 0.6583333551883698, "reward_std": 0.892980021238327, "rewards/accuracy_reward": 0.20000000819563865, "rewards/format_reward": 0.45833334028720857, "step": 210 }, { "completion_length": 129.4250045776367, "epoch": 0.11466666666666667, "grad_norm": 3.4134747982025146, "kl": 0.75546875, "learning_rate": 9.993681039804173e-07, "loss": 0.2308, "reward": 0.6916666924953461, "reward_std": 0.7836884081363678, "rewards/accuracy_reward": 0.16666667200624943, "rewards/format_reward": 0.5250000201165677, "step": 215 }, { "completion_length": 127.5083381652832, "epoch": 0.11733333333333333, "grad_norm": 4.250089645385742, "kl": 0.4599609375, "learning_rate": 9.991124741295105e-07, "loss": 0.2392, "reward": 0.6500000178813934, "reward_std": 0.790798443555832, "rewards/accuracy_reward": 0.10000000298023223, "rewards/format_reward": 0.5500000208616257, "step": 220 }, { "completion_length": 142.82500534057618, "epoch": 0.12, "grad_norm": 3.440370798110962, "kl": 0.3755859375, "learning_rate": 9.988135724871545e-07, "loss": 0.2241, "reward": 0.6125000149011612, "reward_std": 0.9097375214099884, "rewards/accuracy_reward": 0.17083333767950534, "rewards/format_reward": 0.4416666775941849, "step": 225 }, { "completion_length": 142.2708381652832, "epoch": 0.12266666666666666, "grad_norm": 3.0582029819488525, "kl": 0.4955078125, "learning_rate": 9.984714249673673e-07, "loss": 0.2703, "reward": 0.7000000238418579, "reward_std": 0.9109564527869225, "rewards/accuracy_reward": 0.22500000149011612, "rewards/format_reward": 0.47500001490116117, "step": 230 }, { "completion_length": 133.31250381469727, "epoch": 0.12533333333333332, "grad_norm": 4.252659797668457, "kl": 0.50390625, "learning_rate": 9.98086061233475e-07, "loss": 0.312, "reward": 0.8500000298023224, "reward_std": 0.840288233757019, "rewards/accuracy_reward": 0.2833333402872086, "rewards/format_reward": 0.5666666805744172, "step": 235 }, { "completion_length": 118.69167022705078, "epoch": 0.128, "grad_norm": 82.35729217529297, "kl": 1.1455078125, "learning_rate": 9.97657514695541e-07, "loss": 0.2646, "reward": 0.9041666984558105, "reward_std": 0.7261348217725754, "rewards/accuracy_reward": 0.22083334028720855, "rewards/format_reward": 0.6833333551883698, "step": 240 }, { "completion_length": 123.26250381469727, "epoch": 0.13066666666666665, "grad_norm": 3.8186757564544678, "kl": 0.5734375, "learning_rate": 9.971858225074672e-07, "loss": 0.2327, "reward": 0.8333333790302276, "reward_std": 0.7271113842725754, "rewards/accuracy_reward": 0.19166667014360428, "rewards/format_reward": 0.6416666895151139, "step": 245 }, { "completion_length": 139.0875045776367, "epoch": 0.13333333333333333, "grad_norm": 4.611005783081055, "kl": 0.55546875, "learning_rate": 9.966710255637762e-07, "loss": 0.3656, "reward": 0.7083333514630794, "reward_std": 0.7690498679876328, "rewards/accuracy_reward": 0.12500000298023223, "rewards/format_reward": 0.5833333611488343, "step": 250 }, { "completion_length": 131.25000610351563, "epoch": 0.136, "grad_norm": 5.000573635101318, "kl": 0.6904296875, "learning_rate": 9.961131684960634e-07, "loss": 0.3735, "reward": 0.7583333611488342, "reward_std": 0.8809190809726715, "rewards/accuracy_reward": 0.21666667200624942, "rewards/format_reward": 0.5416666835546493, "step": 255 }, { "completion_length": 130.9083381652832, "epoch": 0.13866666666666666, "grad_norm": 10.388055801391602, "kl": 0.87734375, "learning_rate": 9.955122996691277e-07, "loss": 0.4113, "reward": 0.8041666835546494, "reward_std": 0.8410302340984345, "rewards/accuracy_reward": 0.2125000074505806, "rewards/format_reward": 0.5916666954755783, "step": 260 }, { "completion_length": 128.3708366394043, "epoch": 0.14133333333333334, "grad_norm": 5.107520580291748, "kl": 0.790234375, "learning_rate": 9.948684711767799e-07, "loss": 0.3787, "reward": 0.8416666924953461, "reward_std": 0.6999663650989533, "rewards/accuracy_reward": 0.18333333507180213, "rewards/format_reward": 0.6583333522081375, "step": 265 }, { "completion_length": 163.54583816528321, "epoch": 0.144, "grad_norm": 7.847362995147705, "kl": 0.6859375, "learning_rate": 9.941817388373247e-07, "loss": 0.486, "reward": 0.9250000238418579, "reward_std": 0.6243289351463318, "rewards/accuracy_reward": 0.1916666693985462, "rewards/format_reward": 0.7333333551883697, "step": 270 }, { "completion_length": 222.45000915527345, "epoch": 0.14666666666666667, "grad_norm": 4.114348411560059, "kl": 0.951953125, "learning_rate": 9.934521621887221e-07, "loss": 0.5765, "reward": 0.825000011920929, "reward_std": 0.70595743060112, "rewards/accuracy_reward": 0.18333333767950535, "rewards/format_reward": 0.6416666865348816, "step": 275 }, { "completion_length": 270.7250061035156, "epoch": 0.14933333333333335, "grad_norm": 7.752039432525635, "kl": 1.58359375, "learning_rate": 9.926798044834259e-07, "loss": 0.8842, "reward": 0.7625000387430191, "reward_std": 0.8192247807979584, "rewards/accuracy_reward": 0.1541666727513075, "rewards/format_reward": 0.608333346247673, "step": 280 }, { "completion_length": 286.2875045776367, "epoch": 0.152, "grad_norm": 20.332632064819336, "kl": 1.559765625, "learning_rate": 9.91864732682899e-07, "loss": 0.7816, "reward": 0.6875000238418579, "reward_std": 0.8882942378520966, "rewards/accuracy_reward": 0.19583333805203437, "rewards/format_reward": 0.49166667759418486, "step": 285 }, { "completion_length": 321.8916748046875, "epoch": 0.15466666666666667, "grad_norm": 70.37603759765625, "kl": 2.5359375, "learning_rate": 9.910070174518091e-07, "loss": 0.8462, "reward": 0.7041666865348816, "reward_std": 0.9259481251239776, "rewards/accuracy_reward": 0.1875000037252903, "rewards/format_reward": 0.5166666805744171, "step": 290 }, { "completion_length": 311.6041748046875, "epoch": 0.15733333333333333, "grad_norm": 78.9567642211914, "kl": 3.86171875, "learning_rate": 9.90106733151901e-07, "loss": 0.9557, "reward": 0.5708333492279053, "reward_std": 0.9381726026535034, "rewards/accuracy_reward": 0.13750000149011612, "rewards/format_reward": 0.4333333447575569, "step": 295 }, { "completion_length": 250.3291778564453, "epoch": 0.16, "grad_norm": 138.24191284179688, "kl": 8.51640625, "learning_rate": 9.89163957835551e-07, "loss": 1.2961, "reward": 0.6208333462476731, "reward_std": 0.909997683763504, "rewards/accuracy_reward": 0.1541666727513075, "rewards/format_reward": 0.46666667610406876, "step": 300 }, { "completion_length": 215.75833740234376, "epoch": 0.16266666666666665, "grad_norm": 126.323974609375, "kl": 3.709375, "learning_rate": 9.881787732389985e-07, "loss": 0.9302, "reward": 0.6666666895151139, "reward_std": 0.8827720135450363, "rewards/accuracy_reward": 0.16666667088866233, "rewards/format_reward": 0.500000013411045, "step": 305 }, { "completion_length": 180.2916732788086, "epoch": 0.16533333333333333, "grad_norm": 36.90127944946289, "kl": 3.61640625, "learning_rate": 9.871512647752612e-07, "loss": 0.8254, "reward": 0.5916666835546494, "reward_std": 0.9250853776931762, "rewards/accuracy_reward": 0.11666666902601719, "rewards/format_reward": 0.47500001192092894, "step": 310 }, { "completion_length": 159.34167404174804, "epoch": 0.168, "grad_norm": 50.75526428222656, "kl": 5.4921875, "learning_rate": 9.860815215267287e-07, "loss": 0.8761, "reward": 0.704166692495346, "reward_std": 0.9947333693504333, "rewards/accuracy_reward": 0.21250000447034836, "rewards/format_reward": 0.491666679084301, "step": 315 }, { "completion_length": 171.51667327880858, "epoch": 0.17066666666666666, "grad_norm": 50.245628356933594, "kl": 4.3265625, "learning_rate": 9.849696362374397e-07, "loss": 0.8211, "reward": 0.49166668951511383, "reward_std": 0.8819661140441895, "rewards/accuracy_reward": 0.1000000037252903, "rewards/format_reward": 0.3916666813194752, "step": 320 }, { "completion_length": 130.46667098999023, "epoch": 0.17333333333333334, "grad_norm": 25.52703285217285, "kl": 4.159375, "learning_rate": 9.838157053050423e-07, "loss": 0.6859, "reward": 0.8916666805744171, "reward_std": 0.7892452508211136, "rewards/accuracy_reward": 0.21666667312383653, "rewards/format_reward": 0.6750000208616257, "step": 325 }, { "completion_length": 171.3375015258789, "epoch": 0.176, "grad_norm": 38.008914947509766, "kl": 4.453125, "learning_rate": 9.826198287724346e-07, "loss": 0.8512, "reward": 0.8041666924953461, "reward_std": 0.8764552354812623, "rewards/accuracy_reward": 0.26250000633299353, "rewards/format_reward": 0.5416666954755783, "step": 330 }, { "completion_length": 164.07500610351562, "epoch": 0.17866666666666667, "grad_norm": 128.8215789794922, "kl": 5.5421875, "learning_rate": 9.813821103190931e-07, "loss": 0.9175, "reward": 0.8166666984558105, "reward_std": 0.8608273565769196, "rewards/accuracy_reward": 0.25833333991467955, "rewards/format_reward": 0.5583333522081375, "step": 335 }, { "completion_length": 148.26666946411132, "epoch": 0.18133333333333335, "grad_norm": 46.26949691772461, "kl": 3.05859375, "learning_rate": 9.80102657252083e-07, "loss": 0.6734, "reward": 0.8250000238418579, "reward_std": 0.7950194001197814, "rewards/accuracy_reward": 0.21666667349636554, "rewards/format_reward": 0.6083333492279053, "step": 340 }, { "completion_length": 157.37917098999023, "epoch": 0.184, "grad_norm": 20.64828109741211, "kl": 4.228125, "learning_rate": 9.787815804967551e-07, "loss": 0.9426, "reward": 0.9125000298023224, "reward_std": 0.7014284431934357, "rewards/accuracy_reward": 0.22083334252238274, "rewards/format_reward": 0.6916666805744172, "step": 345 }, { "completion_length": 169.56667251586913, "epoch": 0.18666666666666668, "grad_norm": 57.10884094238281, "kl": 2.883203125, "learning_rate": 9.774189945871288e-07, "loss": 0.7312, "reward": 0.7875000238418579, "reward_std": 0.6836557567119599, "rewards/accuracy_reward": 0.16250000484287738, "rewards/format_reward": 0.6250000178813935, "step": 350 }, { "completion_length": 162.88750457763672, "epoch": 0.18933333333333333, "grad_norm": 33.1915397644043, "kl": 3.08203125, "learning_rate": 9.760150176559624e-07, "loss": 0.6298, "reward": 0.9041666865348816, "reward_std": 0.7536427795886993, "rewards/accuracy_reward": 0.24583334252238273, "rewards/format_reward": 0.658333358168602, "step": 355 }, { "completion_length": 127.81666870117188, "epoch": 0.192, "grad_norm": 28.192670822143555, "kl": 7.875, "learning_rate": 9.745697714245118e-07, "loss": 1.1418, "reward": 1.066666692495346, "reward_std": 0.6796198636293411, "rewards/accuracy_reward": 0.3333333428949118, "rewards/format_reward": 0.7333333551883697, "step": 360 }, { "completion_length": 126.42917175292969, "epoch": 0.19466666666666665, "grad_norm": 19.537328720092773, "kl": 1.71875, "learning_rate": 9.730833811919762e-07, "loss": 0.5002, "reward": 0.9875000238418579, "reward_std": 0.6549044132232666, "rewards/accuracy_reward": 0.2541666716337204, "rewards/format_reward": 0.7333333551883697, "step": 365 }, { "completion_length": 129.67500534057618, "epoch": 0.19733333333333333, "grad_norm": 35.49064636230469, "kl": 3.1390625, "learning_rate": 9.715559758246361e-07, "loss": 0.5476, "reward": 1.041666704416275, "reward_std": 0.6668142318725586, "rewards/accuracy_reward": 0.2916666753590107, "rewards/format_reward": 0.7500000178813935, "step": 370 }, { "completion_length": 124.04166870117187, "epoch": 0.2, "grad_norm": 16.886371612548828, "kl": 4.10625, "learning_rate": 9.699876877446812e-07, "loss": 0.6237, "reward": 0.9458333551883698, "reward_std": 0.6157839864492416, "rewards/accuracy_reward": 0.1958333395421505, "rewards/format_reward": 0.7500000238418579, "step": 375 }, { "completion_length": 128.82083740234376, "epoch": 0.20266666666666666, "grad_norm": 60.52346420288086, "kl": 1.719140625, "learning_rate": 9.683786529187285e-07, "loss": 0.4091, "reward": 0.9833333730697632, "reward_std": 0.5964143082499505, "rewards/accuracy_reward": 0.25833334401249886, "rewards/format_reward": 0.7250000178813935, "step": 380 }, { "completion_length": 130.2500030517578, "epoch": 0.20533333333333334, "grad_norm": 8.501641273498535, "kl": 2.725, "learning_rate": 9.667290108460353e-07, "loss": 0.4553, "reward": 0.9625000298023224, "reward_std": 0.6627969831228256, "rewards/accuracy_reward": 0.25416667088866235, "rewards/format_reward": 0.7083333551883697, "step": 385 }, { "completion_length": 151.49167251586914, "epoch": 0.208, "grad_norm": 9.237239837646484, "kl": 3.03125, "learning_rate": 9.650389045464044e-07, "loss": 0.5862, "reward": 1.0083333671092987, "reward_std": 0.6827763438224792, "rewards/accuracy_reward": 0.28333334177732467, "rewards/format_reward": 0.725000011920929, "step": 390 }, { "completion_length": 157.08750381469727, "epoch": 0.21066666666666667, "grad_norm": 21.65406036376953, "kl": 2.93125, "learning_rate": 9.633084805477855e-07, "loss": 0.7111, "reward": 1.0791666984558106, "reward_std": 0.6408874064683914, "rewards/accuracy_reward": 0.3125000070780516, "rewards/format_reward": 0.766666692495346, "step": 395 }, { "completion_length": 177.63750381469725, "epoch": 0.21333333333333335, "grad_norm": 18.198787689208984, "kl": 3.2578125, "learning_rate": 9.615378888735705e-07, "loss": 0.6602, "reward": 0.9416666984558105, "reward_std": 0.7165269427001476, "rewards/accuracy_reward": 0.2583333391696215, "rewards/format_reward": 0.6833333492279052, "step": 400 }, { "epoch": 0.21333333333333335, "eval_completion_length": 176.4338934326172, "eval_kl": 3.930625, "eval_loss": 0.726865291595459, "eval_reward": 0.8177778057257334, "eval_reward_std": 0.6464416084686915, "eval_rewards/accuracy_reward": 0.125555559694767, "eval_rewards/format_reward": 0.6922222431500753, "eval_runtime": 651.5271, "eval_samples_per_second": 0.46, "eval_steps_per_second": 0.02, "step": 400 }, { "completion_length": 156.30417175292968, "epoch": 0.216, "grad_norm": 9.428586959838867, "kl": 3.2203125, "learning_rate": 9.597272830295876e-07, "loss": 0.5783, "reward": 0.9208333551883697, "reward_std": 0.6943224638700485, "rewards/accuracy_reward": 0.22083334028720855, "rewards/format_reward": 0.7000000238418579, "step": 405 }, { "completion_length": 187.24167251586914, "epoch": 0.21866666666666668, "grad_norm": 8.181583404541016, "kl": 2.2078125, "learning_rate": 9.578768199907919e-07, "loss": 0.5979, "reward": 0.8416666924953461, "reward_std": 0.683533999323845, "rewards/accuracy_reward": 0.15833333805203437, "rewards/format_reward": 0.6833333522081375, "step": 410 }, { "completion_length": 196.9791732788086, "epoch": 0.22133333333333333, "grad_norm": 241.11936950683594, "kl": 5.8875, "learning_rate": 9.55986660187658e-07, "loss": 0.9332, "reward": 0.8916666984558106, "reward_std": 0.7562600076198578, "rewards/accuracy_reward": 0.2916666727513075, "rewards/format_reward": 0.6000000089406967, "step": 415 }, { "completion_length": 200.53750610351562, "epoch": 0.224, "grad_norm": 24.950624465942383, "kl": 2.9796875, "learning_rate": 9.540569674922684e-07, "loss": 0.6774, "reward": 0.9208333611488342, "reward_std": 0.7625810235738755, "rewards/accuracy_reward": 0.2625000074505806, "rewards/format_reward": 0.6583333522081375, "step": 420 }, { "completion_length": 198.76250686645508, "epoch": 0.22666666666666666, "grad_norm": 13.279343605041504, "kl": 2.8796875, "learning_rate": 9.520879092041083e-07, "loss": 0.7823, "reward": 0.8708333611488343, "reward_std": 0.6853504031896591, "rewards/accuracy_reward": 0.18750000335276126, "rewards/format_reward": 0.6833333611488343, "step": 425 }, { "completion_length": 132.6208381652832, "epoch": 0.22933333333333333, "grad_norm": 19.399991989135742, "kl": 3.008203125, "learning_rate": 9.500796560355602e-07, "loss": 0.4804, "reward": 0.9458333551883698, "reward_std": 0.5760359674692154, "rewards/accuracy_reward": 0.1791666705161333, "rewards/format_reward": 0.7666666686534882, "step": 430 }, { "completion_length": 156.29583816528321, "epoch": 0.232, "grad_norm": 13.930024147033691, "kl": 2.163671875, "learning_rate": 9.480323820971037e-07, "loss": 0.6149, "reward": 0.9000000298023224, "reward_std": 0.6693511486053467, "rewards/accuracy_reward": 0.16666667014360428, "rewards/format_reward": 0.7333333551883697, "step": 435 }, { "completion_length": 170.9166732788086, "epoch": 0.23466666666666666, "grad_norm": 28.107120513916016, "kl": 3.184375, "learning_rate": 9.459462648822207e-07, "loss": 0.6151, "reward": 0.8916667103767395, "reward_std": 0.6882745712995529, "rewards/accuracy_reward": 0.21666667684912683, "rewards/format_reward": 0.675000011920929, "step": 440 }, { "completion_length": 162.44583892822266, "epoch": 0.23733333333333334, "grad_norm": 20.64507293701172, "kl": 4.196875, "learning_rate": 9.438214852520072e-07, "loss": 0.7043, "reward": 0.829166692495346, "reward_std": 0.6397666782140732, "rewards/accuracy_reward": 0.1458333373069763, "rewards/format_reward": 0.6833333492279052, "step": 445 }, { "completion_length": 133.62917098999023, "epoch": 0.24, "grad_norm": 5.781806945800781, "kl": 2.41796875, "learning_rate": 9.416582274194929e-07, "loss": 0.5327, "reward": 1.0500000238418579, "reward_std": 0.6202212646603584, "rewards/accuracy_reward": 0.2666666727513075, "rewards/format_reward": 0.7833333492279053, "step": 450 }, { "completion_length": 216.83334045410157, "epoch": 0.24266666666666667, "grad_norm": 24.161720275878906, "kl": 1.6671875, "learning_rate": 9.394566789336707e-07, "loss": 0.5952, "reward": 0.8916666984558106, "reward_std": 0.6439453423023224, "rewards/accuracy_reward": 0.19166667051613331, "rewards/format_reward": 0.7000000178813934, "step": 455 }, { "completion_length": 133.31667098999023, "epoch": 0.24533333333333332, "grad_norm": 14.312320709228516, "kl": 2.26484375, "learning_rate": 9.372170306632358e-07, "loss": 0.3488, "reward": 1.0583333730697633, "reward_std": 0.4993865922093391, "rewards/accuracy_reward": 0.2500000037252903, "rewards/format_reward": 0.8083333551883698, "step": 460 }, { "completion_length": 115.00000228881837, "epoch": 0.248, "grad_norm": 24.176897048950195, "kl": 2.22109375, "learning_rate": 9.349394767800396e-07, "loss": 0.3995, "reward": 1.0458333611488342, "reward_std": 0.5499838680028916, "rewards/accuracy_reward": 0.25416667461395265, "rewards/format_reward": 0.791666692495346, "step": 465 }, { "completion_length": 170.7375030517578, "epoch": 0.25066666666666665, "grad_norm": 15.032400131225586, "kl": 3.209375, "learning_rate": 9.326242147422536e-07, "loss": 0.6388, "reward": 0.9458333611488342, "reward_std": 0.6427857339382171, "rewards/accuracy_reward": 0.22916667051613332, "rewards/format_reward": 0.7166666865348816, "step": 470 }, { "completion_length": 178.14167098999025, "epoch": 0.25333333333333335, "grad_norm": 12.969917297363281, "kl": 2.8015625, "learning_rate": 9.302714452772514e-07, "loss": 0.5282, "reward": 0.8666666924953461, "reward_std": 0.6148743867874146, "rewards/accuracy_reward": 0.20833334065973758, "rewards/format_reward": 0.6583333492279053, "step": 475 }, { "completion_length": 160.37916946411133, "epoch": 0.256, "grad_norm": 25.496326446533203, "kl": 2.88515625, "learning_rate": 9.278813723642059e-07, "loss": 0.6265, "reward": 0.7583333671092987, "reward_std": 0.6592622727155686, "rewards/accuracy_reward": 0.09166666939854622, "rewards/format_reward": 0.666666692495346, "step": 480 }, { "completion_length": 156.80833740234374, "epoch": 0.25866666666666666, "grad_norm": 19.942245483398438, "kl": 2.80546875, "learning_rate": 9.254542032164046e-07, "loss": 0.5487, "reward": 1.0916667103767395, "reward_std": 0.5701596170663834, "rewards/accuracy_reward": 0.2916666727513075, "rewards/format_reward": 0.8000000238418579, "step": 485 }, { "completion_length": 190.86667556762694, "epoch": 0.2613333333333333, "grad_norm": 29.593677520751953, "kl": 5.2046875, "learning_rate": 9.229901482632849e-07, "loss": 0.8562, "reward": 0.8750000268220901, "reward_std": 0.7407498300075531, "rewards/accuracy_reward": 0.25833334438502786, "rewards/format_reward": 0.6166666775941849, "step": 490 }, { "completion_length": 185.21250381469727, "epoch": 0.264, "grad_norm": 15.83928108215332, "kl": 3.34765625, "learning_rate": 9.204894211321905e-07, "loss": 0.7039, "reward": 0.9708333551883698, "reward_std": 0.7463637501001358, "rewards/accuracy_reward": 0.30416667759418486, "rewards/format_reward": 0.6666666865348816, "step": 495 }, { "completion_length": 177.40833740234376, "epoch": 0.26666666666666666, "grad_norm": 12.166021347045898, "kl": 4.6421875, "learning_rate": 9.179522386298506e-07, "loss": 0.8557, "reward": 0.8416666924953461, "reward_std": 0.753840970993042, "rewards/accuracy_reward": 0.22500000558793545, "rewards/format_reward": 0.6166666924953461, "step": 500 }, { "completion_length": 201.27083892822264, "epoch": 0.2693333333333333, "grad_norm": 23.116260528564453, "kl": 2.7734375, "learning_rate": 9.153788207235826e-07, "loss": 0.7171, "reward": 1.0333333730697631, "reward_std": 0.7333385825157166, "rewards/accuracy_reward": 0.35000001192092894, "rewards/format_reward": 0.6833333492279052, "step": 505 }, { "completion_length": 168.51250457763672, "epoch": 0.272, "grad_norm": 78.56771850585938, "kl": 3.934375, "learning_rate": 9.127693905222223e-07, "loss": 0.7631, "reward": 0.9583333551883697, "reward_std": 0.7650938987731933, "rewards/accuracy_reward": 0.31666667461395265, "rewards/format_reward": 0.6416666865348816, "step": 510 }, { "completion_length": 179.67500534057618, "epoch": 0.27466666666666667, "grad_norm": 12.310942649841309, "kl": 2.8015625, "learning_rate": 9.1012417425678e-07, "loss": 0.7708, "reward": 1.050000047683716, "reward_std": 0.6466626852750779, "rewards/accuracy_reward": 0.3250000040978193, "rewards/format_reward": 0.7250000238418579, "step": 515 }, { "completion_length": 137.3458381652832, "epoch": 0.2773333333333333, "grad_norm": 23.13488006591797, "kl": 2.5033203125, "learning_rate": 9.074434012608281e-07, "loss": 0.5319, "reward": 1.0208333611488343, "reward_std": 0.5777831941843032, "rewards/accuracy_reward": 0.2541666753590107, "rewards/format_reward": 0.766666692495346, "step": 520 }, { "completion_length": 139.60833663940429, "epoch": 0.28, "grad_norm": 18.197731018066406, "kl": 2.709375, "learning_rate": 9.047273039506174e-07, "loss": 0.6145, "reward": 1.0708333730697632, "reward_std": 0.47921385020017626, "rewards/accuracy_reward": 0.24583334103226662, "rewards/format_reward": 0.8250000238418579, "step": 525 }, { "completion_length": 125.83333816528321, "epoch": 0.2826666666666667, "grad_norm": 62.50031661987305, "kl": 1.6216796875, "learning_rate": 9.019761178049279e-07, "loss": 0.4705, "reward": 1.1375000178813934, "reward_std": 0.4138069227337837, "rewards/accuracy_reward": 0.2958333373069763, "rewards/format_reward": 0.8416666805744171, "step": 530 }, { "completion_length": 179.41667251586915, "epoch": 0.2853333333333333, "grad_norm": 18.075428009033203, "kl": 2.503125, "learning_rate": 8.991900813446522e-07, "loss": 0.6926, "reward": 0.9208333671092988, "reward_std": 0.7170521825551986, "rewards/accuracy_reward": 0.26250000596046447, "rewards/format_reward": 0.6583333551883698, "step": 535 }, { "completion_length": 269.2916717529297, "epoch": 0.288, "grad_norm": 10.559329986572266, "kl": 3.5578125, "learning_rate": 8.963694361121185e-07, "loss": 0.7955, "reward": 0.7375000238418579, "reward_std": 0.7803374290466308, "rewards/accuracy_reward": 0.21250000707805156, "rewards/format_reward": 0.5250000119209289, "step": 540 }, { "completion_length": 171.4625045776367, "epoch": 0.2906666666666667, "grad_norm": 85.9892578125, "kl": 3.36953125, "learning_rate": 8.935144266501468e-07, "loss": 0.7548, "reward": 0.9333333671092987, "reward_std": 0.6961856186389923, "rewards/accuracy_reward": 0.2416666690260172, "rewards/format_reward": 0.6916666805744172, "step": 545 }, { "completion_length": 116.50833663940429, "epoch": 0.29333333333333333, "grad_norm": 61.62466812133789, "kl": 3.23515625, "learning_rate": 8.906253004808504e-07, "loss": 0.6184, "reward": 1.1833333849906922, "reward_std": 0.5228020772337914, "rewards/accuracy_reward": 0.35833334028720853, "rewards/format_reward": 0.8250000238418579, "step": 550 }, { "completion_length": 130.47500381469726, "epoch": 0.296, "grad_norm": 13.287981986999512, "kl": 1.23828125, "learning_rate": 8.877023080841737e-07, "loss": 0.4985, "reward": 1.1375000178813934, "reward_std": 0.4386047780513763, "rewards/accuracy_reward": 0.29583334028720853, "rewards/format_reward": 0.8416666924953461, "step": 555 }, { "completion_length": 171.2916702270508, "epoch": 0.2986666666666667, "grad_norm": 11.248329162597656, "kl": 2.68125, "learning_rate": 8.847457028761782e-07, "loss": 0.7836, "reward": 1.045833373069763, "reward_std": 0.6420545637607574, "rewards/accuracy_reward": 0.28750000447034835, "rewards/format_reward": 0.7583333492279053, "step": 560 }, { "completion_length": 153.5500045776367, "epoch": 0.30133333333333334, "grad_norm": 11.977041244506836, "kl": 2.99296875, "learning_rate": 8.817557411870715e-07, "loss": 0.6474, "reward": 1.1375000417232513, "reward_std": 0.546431428194046, "rewards/accuracy_reward": 0.3458333432674408, "rewards/format_reward": 0.7916666805744171, "step": 565 }, { "completion_length": 157.00833740234376, "epoch": 0.304, "grad_norm": 8.921804428100586, "kl": 2.18046875, "learning_rate": 8.787326822389835e-07, "loss": 0.7647, "reward": 1.0416667103767394, "reward_std": 0.4992914006114006, "rewards/accuracy_reward": 0.2333333373069763, "rewards/format_reward": 0.8083333551883698, "step": 570 }, { "completion_length": 203.42916946411134, "epoch": 0.30666666666666664, "grad_norm": 8.668245315551758, "kl": 1.5703125, "learning_rate": 8.756767881234928e-07, "loss": 0.7449, "reward": 1.0916666865348816, "reward_std": 0.6213793724775314, "rewards/accuracy_reward": 0.3166666731238365, "rewards/format_reward": 0.7750000119209289, "step": 575 }, { "completion_length": 147.91667098999022, "epoch": 0.30933333333333335, "grad_norm": 27.652040481567383, "kl": 2.9046875, "learning_rate": 8.725883237789044e-07, "loss": 0.7151, "reward": 1.1083333551883698, "reward_std": 0.5087577894330024, "rewards/accuracy_reward": 0.27500000670552255, "rewards/format_reward": 0.8333333611488343, "step": 580 }, { "completion_length": 195.73750381469728, "epoch": 0.312, "grad_norm": 5.533527851104736, "kl": 2.3046875, "learning_rate": 8.694675569672799e-07, "loss": 0.7577, "reward": 0.9291666924953461, "reward_std": 0.5898149274289608, "rewards/accuracy_reward": 0.17916667014360427, "rewards/format_reward": 0.7500000178813935, "step": 585 }, { "completion_length": 218.38334121704102, "epoch": 0.31466666666666665, "grad_norm": 7.887436866760254, "kl": 1.71484375, "learning_rate": 8.663147582512231e-07, "loss": 0.6805, "reward": 1.0375000298023225, "reward_std": 0.6552489116787911, "rewards/accuracy_reward": 0.3125000029802322, "rewards/format_reward": 0.7250000178813935, "step": 590 }, { "completion_length": 142.9625030517578, "epoch": 0.31733333333333336, "grad_norm": 39.04753112792969, "kl": 3.50078125, "learning_rate": 8.631302009704233e-07, "loss": 0.7378, "reward": 1.0500000357627868, "reward_std": 0.5874016582965851, "rewards/accuracy_reward": 0.2416666716337204, "rewards/format_reward": 0.8083333551883698, "step": 595 }, { "completion_length": 145.0583366394043, "epoch": 0.32, "grad_norm": 10.890267372131348, "kl": 1.9578125, "learning_rate": 8.59914161217957e-07, "loss": 0.4446, "reward": 1.2125000357627869, "reward_std": 0.33466504961252214, "rewards/accuracy_reward": 0.3208333432674408, "rewards/format_reward": 0.8916666865348816, "step": 600 }, { "epoch": 0.32, "eval_completion_length": 157.21389434814452, "eval_kl": 1.6558333333333333, "eval_loss": 0.4634725749492645, "eval_reward": 0.9744444727897644, "eval_reward_std": 0.43147118786970773, "eval_rewards/accuracy_reward": 0.13000000352660815, "eval_rewards/format_reward": 0.8444444608688354, "eval_runtime": 533.9055, "eval_samples_per_second": 0.562, "eval_steps_per_second": 0.024, "step": 600 }, { "completion_length": 193.183341217041, "epoch": 0.32266666666666666, "grad_norm": 13.141694068908691, "kl": 1.72421875, "learning_rate": 8.566669178163512e-07, "loss": 0.5602, "reward": 0.9708333551883698, "reward_std": 0.5239060014486313, "rewards/accuracy_reward": 0.2041666716337204, "rewards/format_reward": 0.766666692495346, "step": 605 }, { "completion_length": 154.98333663940429, "epoch": 0.3253333333333333, "grad_norm": 9.7343111038208, "kl": 1.919921875, "learning_rate": 8.533887522934114e-07, "loss": 0.4813, "reward": 1.1041667103767394, "reward_std": 0.5670485764741897, "rewards/accuracy_reward": 0.28750000819563865, "rewards/format_reward": 0.8166666924953461, "step": 610 }, { "completion_length": 109.46667022705078, "epoch": 0.328, "grad_norm": 6.92060661315918, "kl": 1.7703125, "learning_rate": 8.500799488578119e-07, "loss": 0.2986, "reward": 1.1000000476837157, "reward_std": 0.2986703909933567, "rewards/accuracy_reward": 0.1916666727513075, "rewards/format_reward": 0.9083333492279053, "step": 615 }, { "completion_length": 181.3708366394043, "epoch": 0.33066666666666666, "grad_norm": 7.871663570404053, "kl": 1.2625, "learning_rate": 8.467407943744573e-07, "loss": 0.6639, "reward": 1.1875000476837159, "reward_std": 0.5026830688118935, "rewards/accuracy_reward": 0.37916667610406873, "rewards/format_reward": 0.8083333551883698, "step": 620 }, { "completion_length": 130.59166946411133, "epoch": 0.3333333333333333, "grad_norm": 9.6685791015625, "kl": 1.58359375, "learning_rate": 8.433715783396114e-07, "loss": 0.5216, "reward": 1.1583333551883697, "reward_std": 0.4340529665350914, "rewards/accuracy_reward": 0.2750000134110451, "rewards/format_reward": 0.8833333492279053, "step": 625 }, { "completion_length": 180.36667022705078, "epoch": 0.336, "grad_norm": 39.6641960144043, "kl": 3.36953125, "learning_rate": 8.399725928557985e-07, "loss": 0.7533, "reward": 1.025000023841858, "reward_std": 0.46841561794281006, "rewards/accuracy_reward": 0.20833333991467953, "rewards/format_reward": 0.8166666865348816, "step": 630 }, { "completion_length": 135.39167022705078, "epoch": 0.33866666666666667, "grad_norm": 11.369102478027344, "kl": 2.64765625, "learning_rate": 8.365441326064788e-07, "loss": 0.5253, "reward": 1.1875000476837159, "reward_std": 0.40690153986215594, "rewards/accuracy_reward": 0.3375000089406967, "rewards/format_reward": 0.8500000238418579, "step": 635 }, { "completion_length": 210.98333816528321, "epoch": 0.3413333333333333, "grad_norm": 12.549598693847656, "kl": 1.3271484375, "learning_rate": 8.330864948305007e-07, "loss": 0.7683, "reward": 1.0166667103767395, "reward_std": 0.4776305049657822, "rewards/accuracy_reward": 0.1833333384245634, "rewards/format_reward": 0.8333333551883697, "step": 640 }, { "completion_length": 151.5333381652832, "epoch": 0.344, "grad_norm": 6.856210231781006, "kl": 1.084765625, "learning_rate": 8.295999792963299e-07, "loss": 0.4446, "reward": 1.200000035762787, "reward_std": 0.43040212616324425, "rewards/accuracy_reward": 0.3250000089406967, "rewards/format_reward": 0.8750000238418579, "step": 645 }, { "completion_length": 269.2375091552734, "epoch": 0.3466666666666667, "grad_norm": 23.80208396911621, "kl": 2.725, "learning_rate": 8.260848882760615e-07, "loss": 0.865, "reward": 0.829166692495346, "reward_std": 0.659246638417244, "rewards/accuracy_reward": 0.1875000014901161, "rewards/format_reward": 0.6416666865348816, "step": 650 }, { "completion_length": 211.78334045410156, "epoch": 0.34933333333333333, "grad_norm": 6.310311317443848, "kl": 1.966015625, "learning_rate": 8.225415265192126e-07, "loss": 0.769, "reward": 0.954166692495346, "reward_std": 0.5517986357212067, "rewards/accuracy_reward": 0.19583333991467952, "rewards/format_reward": 0.7583333551883698, "step": 655 }, { "completion_length": 183.13750457763672, "epoch": 0.352, "grad_norm": 5.835783958435059, "kl": 1.602734375, "learning_rate": 8.18970201226302e-07, "loss": 0.6146, "reward": 1.0166667103767395, "reward_std": 0.44397214204072954, "rewards/accuracy_reward": 0.2166666716337204, "rewards/format_reward": 0.8000000178813934, "step": 660 }, { "completion_length": 199.48750686645508, "epoch": 0.3546666666666667, "grad_norm": 8.435812950134277, "kl": 1.846875, "learning_rate": 8.153712220222163e-07, "loss": 0.7525, "reward": 0.9833333671092988, "reward_std": 0.544944578409195, "rewards/accuracy_reward": 0.21666666828095912, "rewards/format_reward": 0.7666666984558106, "step": 665 }, { "completion_length": 134.15000381469727, "epoch": 0.35733333333333334, "grad_norm": 13.892914772033691, "kl": 1.921484375, "learning_rate": 8.117449009293668e-07, "loss": 0.5067, "reward": 1.2250000417232514, "reward_std": 0.3981220737099648, "rewards/accuracy_reward": 0.35000001154839994, "rewards/format_reward": 0.8750000178813935, "step": 670 }, { "completion_length": 199.30000534057618, "epoch": 0.36, "grad_norm": 53.40914535522461, "kl": 2.228125, "learning_rate": 8.080915523406369e-07, "loss": 0.8388, "reward": 1.1083333611488342, "reward_std": 0.5545098386704922, "rewards/accuracy_reward": 0.3250000089406967, "rewards/format_reward": 0.7833333432674408, "step": 675 }, { "completion_length": 197.45833740234374, "epoch": 0.3626666666666667, "grad_norm": 46.9063720703125, "kl": 1.98671875, "learning_rate": 8.044114929921263e-07, "loss": 0.8575, "reward": 0.9833333492279053, "reward_std": 0.5669769406318664, "rewards/accuracy_reward": 0.2000000011175871, "rewards/format_reward": 0.7833333492279053, "step": 680 }, { "completion_length": 245.34584197998046, "epoch": 0.36533333333333334, "grad_norm": 20.520357131958008, "kl": 2.934375, "learning_rate": 8.007050419356898e-07, "loss": 0.8979, "reward": 0.8958333611488343, "reward_std": 0.6523119986057282, "rewards/accuracy_reward": 0.20416667275130748, "rewards/format_reward": 0.6916666805744172, "step": 685 }, { "completion_length": 312.5666793823242, "epoch": 0.368, "grad_norm": 13.216290473937988, "kl": 3.625, "learning_rate": 7.969725205112765e-07, "loss": 0.9082, "reward": 0.8625000193715096, "reward_std": 0.748971363902092, "rewards/accuracy_reward": 0.3125000085681677, "rewards/format_reward": 0.550000025331974, "step": 690 }, { "completion_length": 352.8666717529297, "epoch": 0.37066666666666664, "grad_norm": 30.815317153930664, "kl": 2.85390625, "learning_rate": 7.93214252319071e-07, "loss": 0.8438, "reward": 0.7708333522081375, "reward_std": 0.8650185167789459, "rewards/accuracy_reward": 0.3125000070780516, "rewards/format_reward": 0.45833334177732465, "step": 695 }, { "completion_length": 232.11667175292968, "epoch": 0.37333333333333335, "grad_norm": 18.286134719848633, "kl": 2.925, "learning_rate": 7.894305631914373e-07, "loss": 0.9608, "reward": 0.9333333596587181, "reward_std": 0.7935751020908356, "rewards/accuracy_reward": 0.31666667610406873, "rewards/format_reward": 0.6166666969656944, "step": 700 }, { "completion_length": 181.43750457763673, "epoch": 0.376, "grad_norm": 20.183544158935547, "kl": 3.06015625, "learning_rate": 7.856217811646706e-07, "loss": 0.7195, "reward": 1.0458333671092988, "reward_std": 0.5713184028863907, "rewards/accuracy_reward": 0.2791666716337204, "rewards/format_reward": 0.7666666984558106, "step": 705 }, { "completion_length": 223.5291717529297, "epoch": 0.37866666666666665, "grad_norm": 11.171494483947754, "kl": 2.15234375, "learning_rate": 7.817882364505568e-07, "loss": 0.6855, "reward": 0.9000000357627869, "reward_std": 0.5742868632078171, "rewards/accuracy_reward": 0.1750000026077032, "rewards/format_reward": 0.7250000149011612, "step": 710 }, { "completion_length": 266.17500457763674, "epoch": 0.38133333333333336, "grad_norm": 10.772181510925293, "kl": 2.66953125, "learning_rate": 7.779302614077448e-07, "loss": 0.7085, "reward": 0.900000023841858, "reward_std": 0.6552800923585892, "rewards/accuracy_reward": 0.24166667312383652, "rewards/format_reward": 0.6583333551883698, "step": 715 }, { "completion_length": 253.56667785644532, "epoch": 0.384, "grad_norm": 15.041406631469727, "kl": 2.715625, "learning_rate": 7.740481905129306e-07, "loss": 0.8722, "reward": 0.9958333611488343, "reward_std": 0.6977577596902848, "rewards/accuracy_reward": 0.3125000037252903, "rewards/format_reward": 0.6833333551883698, "step": 720 }, { "completion_length": 161.67084045410155, "epoch": 0.38666666666666666, "grad_norm": 4.084959030151367, "kl": 2.09296875, "learning_rate": 7.701423603318604e-07, "loss": 0.5005, "reward": 1.1458333611488343, "reward_std": 0.40467526763677597, "rewards/accuracy_reward": 0.3375000059604645, "rewards/format_reward": 0.8083333551883698, "step": 725 }, { "completion_length": 259.2875091552734, "epoch": 0.3893333333333333, "grad_norm": 154.38690185546875, "kl": 1.82265625, "learning_rate": 7.662131094901498e-07, "loss": 0.7136, "reward": 0.8708333611488343, "reward_std": 0.6159812211990356, "rewards/accuracy_reward": 0.19583334289491178, "rewards/format_reward": 0.675000011920929, "step": 730 }, { "completion_length": 261.3000061035156, "epoch": 0.392, "grad_norm": 2285.590576171875, "kl": 98.85625, "learning_rate": 7.622607786439278e-07, "loss": 18.7274, "reward": 0.9541666984558106, "reward_std": 0.6454987242817879, "rewards/accuracy_reward": 0.2541666701436043, "rewards/format_reward": 0.7000000238418579, "step": 735 }, { "completion_length": 293.8916717529297, "epoch": 0.39466666666666667, "grad_norm": 103.3537826538086, "kl": 2.91640625, "learning_rate": 7.582857104503e-07, "loss": 0.7409, "reward": 0.8625000178813934, "reward_std": 0.6774646982550621, "rewards/accuracy_reward": 0.2458333380520344, "rewards/format_reward": 0.6166666880249977, "step": 740 }, { "completion_length": 363.01250915527345, "epoch": 0.3973333333333333, "grad_norm": 54.62760925292969, "kl": 5.337890625, "learning_rate": 7.542882495376435e-07, "loss": 1.3766, "reward": 0.7250000238418579, "reward_std": 0.7046854376792908, "rewards/accuracy_reward": 0.19166667126119136, "rewards/format_reward": 0.533333346247673, "step": 745 }, { "completion_length": 305.82500610351565, "epoch": 0.4, "grad_norm": 95.76229095458984, "kl": 2.378125, "learning_rate": 7.502687424757277e-07, "loss": 0.9832, "reward": 0.9791667044162751, "reward_std": 0.776354917883873, "rewards/accuracy_reward": 0.3291666775941849, "rewards/format_reward": 0.6500000119209289, "step": 750 }, { "completion_length": 265.6125061035156, "epoch": 0.4026666666666667, "grad_norm": 80.57921600341797, "kl": 10.53046875, "learning_rate": 7.462275377456669e-07, "loss": 2.1369, "reward": 0.925000011920929, "reward_std": 0.8046808481216431, "rewards/accuracy_reward": 0.2833333395421505, "rewards/format_reward": 0.6416666805744171, "step": 755 }, { "completion_length": 321.10834045410155, "epoch": 0.4053333333333333, "grad_norm": 34.1876220703125, "kl": 4.4828125, "learning_rate": 7.421649857097091e-07, "loss": 0.9501, "reward": 0.8291666865348816, "reward_std": 0.6768444120883942, "rewards/accuracy_reward": 0.2291666716337204, "rewards/format_reward": 0.6000000149011612, "step": 760 }, { "completion_length": 319.262508392334, "epoch": 0.408, "grad_norm": 117.45841979980469, "kl": 5.225, "learning_rate": 7.380814385808594e-07, "loss": 1.394, "reward": 0.8166666895151138, "reward_std": 0.7352788507938385, "rewards/accuracy_reward": 0.21666667461395264, "rewards/format_reward": 0.6000000163912773, "step": 765 }, { "completion_length": 367.7666809082031, "epoch": 0.4106666666666667, "grad_norm": 66.30304718017578, "kl": 5.2375, "learning_rate": 7.339772503923443e-07, "loss": 1.1981, "reward": 0.8458333551883698, "reward_std": 0.7178668111562729, "rewards/accuracy_reward": 0.2708333406597376, "rewards/format_reward": 0.5750000178813934, "step": 770 }, { "completion_length": 316.7833450317383, "epoch": 0.41333333333333333, "grad_norm": 16.508329391479492, "kl": 4.0140625, "learning_rate": 7.298527769669187e-07, "loss": 1.1483, "reward": 0.9291666865348815, "reward_std": 0.6445024594664573, "rewards/accuracy_reward": 0.25416667461395265, "rewards/format_reward": 0.6750000238418579, "step": 775 }, { "completion_length": 366.72084350585936, "epoch": 0.416, "grad_norm": 88.48445892333984, "kl": 5.3171875, "learning_rate": 7.257083758860157e-07, "loss": 1.5957, "reward": 0.7666666805744171, "reward_std": 0.9076652824878693, "rewards/accuracy_reward": 0.24166667126119137, "rewards/format_reward": 0.5250000208616257, "step": 780 }, { "completion_length": 261.41251068115236, "epoch": 0.4186666666666667, "grad_norm": 58.172264099121094, "kl": 5.23984375, "learning_rate": 7.215444064587462e-07, "loss": 1.4679, "reward": 0.9916666924953461, "reward_std": 0.6018173396587372, "rewards/accuracy_reward": 0.24166667237877845, "rewards/format_reward": 0.7500000178813935, "step": 785 }, { "completion_length": 316.15417709350584, "epoch": 0.42133333333333334, "grad_norm": 47.1014404296875, "kl": 5.3765625, "learning_rate": 7.173612296907472e-07, "loss": 1.0298, "reward": 0.8125000208616256, "reward_std": 0.6595857471227646, "rewards/accuracy_reward": 0.20416667312383652, "rewards/format_reward": 0.6083333522081376, "step": 790 }, { "completion_length": 541.3083404541015, "epoch": 0.424, "grad_norm": 39.66009521484375, "kl": 6.48125, "learning_rate": 7.131592082528835e-07, "loss": 1.4332, "reward": 0.5750000149011611, "reward_std": 0.9711216628551483, "rewards/accuracy_reward": 0.25000000894069674, "rewards/format_reward": 0.325000012665987, "step": 795 }, { "completion_length": 529.3041809082031, "epoch": 0.4266666666666667, "grad_norm": 47.98305892944336, "kl": 7.490625, "learning_rate": 7.089387064498055e-07, "loss": 1.4781, "reward": 0.6041666865348816, "reward_std": 0.9722610518336297, "rewards/accuracy_reward": 0.2291666753590107, "rewards/format_reward": 0.37500000968575475, "step": 800 }, { "epoch": 0.4266666666666667, "eval_completion_length": 626.6472381591797, "eval_kl": 10.508958333333334, "eval_loss": 1.6093299388885498, "eval_reward": 0.24388889610767364, "eval_reward_std": 0.9690509649117788, "eval_rewards/accuracy_reward": 0.08500000188748041, "eval_rewards/format_reward": 0.1588888943195343, "eval_runtime": 1086.9778, "eval_samples_per_second": 0.276, "eval_steps_per_second": 0.012, "step": 800 }, { "completion_length": 575.5916809082031, "epoch": 0.42933333333333334, "grad_norm": 105.59420013427734, "kl": 8.5296875, "learning_rate": 7.047000901883645e-07, "loss": 1.419, "reward": 0.4041666805744171, "reward_std": 0.9605139315128326, "rewards/accuracy_reward": 0.21250000409781933, "rewards/format_reward": 0.19166667610406876, "step": 805 }, { "completion_length": 561.8750213623047, "epoch": 0.432, "grad_norm": 40.40248489379883, "kl": 7.340625, "learning_rate": 7.004437269458894e-07, "loss": 1.4182, "reward": 0.45833334140479565, "reward_std": 0.9228455007076264, "rewards/accuracy_reward": 0.15000000558793544, "rewards/format_reward": 0.3083333432674408, "step": 810 }, { "completion_length": 499.4708450317383, "epoch": 0.43466666666666665, "grad_norm": 15.018860816955566, "kl": 7.36875, "learning_rate": 6.961699857383278e-07, "loss": 1.4916, "reward": 0.6500000096857548, "reward_std": 0.8621464431285858, "rewards/accuracy_reward": 0.2166666742414236, "rewards/format_reward": 0.43333334624767306, "step": 815 }, { "completion_length": 550.2083557128906, "epoch": 0.43733333333333335, "grad_norm": 161.48826599121094, "kl": 5.7875, "learning_rate": 6.91879237088253e-07, "loss": 1.3703, "reward": 0.6125000208616257, "reward_std": 0.9770530998706818, "rewards/accuracy_reward": 0.27083334475755694, "rewards/format_reward": 0.34166667237877846, "step": 820 }, { "completion_length": 456.0291831970215, "epoch": 0.44, "grad_norm": 63.651100158691406, "kl": 6.596875, "learning_rate": 6.875718529927404e-07, "loss": 1.5676, "reward": 0.7250000149011612, "reward_std": 0.8462802618741989, "rewards/accuracy_reward": 0.24166667684912682, "rewards/format_reward": 0.48333334624767305, "step": 825 }, { "completion_length": 405.82500610351565, "epoch": 0.44266666666666665, "grad_norm": 75.36550903320312, "kl": 5.9515625, "learning_rate": 6.832482068911166e-07, "loss": 1.8022, "reward": 0.8083333611488343, "reward_std": 0.8616673052310944, "rewards/accuracy_reward": 0.2500000074505806, "rewards/format_reward": 0.5583333462476731, "step": 830 }, { "completion_length": 348.73751068115234, "epoch": 0.44533333333333336, "grad_norm": 66.93795013427734, "kl": 5.48984375, "learning_rate": 6.789086736325834e-07, "loss": 1.1328, "reward": 0.9583333611488343, "reward_std": 0.6873706102371215, "rewards/accuracy_reward": 0.325000012293458, "rewards/format_reward": 0.6333333551883698, "step": 835 }, { "completion_length": 301.7250061035156, "epoch": 0.448, "grad_norm": 38.354427337646484, "kl": 2.6962890625, "learning_rate": 6.745536294437186e-07, "loss": 0.9267, "reward": 1.0375000298023225, "reward_std": 0.5275938391685486, "rewards/accuracy_reward": 0.32083333767950534, "rewards/format_reward": 0.7166666865348816, "step": 840 }, { "completion_length": 353.62501220703126, "epoch": 0.45066666666666666, "grad_norm": 15.596612930297852, "kl": 5.7234375, "learning_rate": 6.701834518958586e-07, "loss": 1.764, "reward": 0.9833333730697632, "reward_std": 0.7794818341732025, "rewards/accuracy_reward": 0.35000001192092894, "rewards/format_reward": 0.6333333492279053, "step": 845 }, { "completion_length": 303.0666778564453, "epoch": 0.4533333333333333, "grad_norm": 38.20693588256836, "kl": 4.66875, "learning_rate": 6.657985198723643e-07, "loss": 1.1933, "reward": 0.9541666895151139, "reward_std": 0.6846331983804703, "rewards/accuracy_reward": 0.3041666798293591, "rewards/format_reward": 0.6500000149011612, "step": 850 }, { "completion_length": 320.7541793823242, "epoch": 0.456, "grad_norm": 18.968719482421875, "kl": 5.1609375, "learning_rate": 6.613992135357712e-07, "loss": 1.5211, "reward": 0.7916666865348816, "reward_std": 0.698268249630928, "rewards/accuracy_reward": 0.1500000052154064, "rewards/format_reward": 0.6416666865348816, "step": 855 }, { "completion_length": 317.71667633056643, "epoch": 0.45866666666666667, "grad_norm": 37.6462287902832, "kl": 4.6234375, "learning_rate": 6.569859142948327e-07, "loss": 1.3873, "reward": 0.9583333611488343, "reward_std": 0.6036534637212754, "rewards/accuracy_reward": 0.2750000063329935, "rewards/format_reward": 0.6833333522081375, "step": 860 }, { "completion_length": 405.2833450317383, "epoch": 0.4613333333333333, "grad_norm": 65.66597747802734, "kl": 6.6875, "learning_rate": 6.52559004771451e-07, "loss": 1.5637, "reward": 0.7166666906327009, "reward_std": 0.7335242480039597, "rewards/accuracy_reward": 0.20833333507180213, "rewards/format_reward": 0.5083333522081375, "step": 865 }, { "completion_length": 306.6833435058594, "epoch": 0.464, "grad_norm": 40.86724853515625, "kl": 3.3671875, "learning_rate": 6.481188687675057e-07, "loss": 0.9366, "reward": 0.9291666686534882, "reward_std": 0.5080332323908806, "rewards/accuracy_reward": 0.23750000223517417, "rewards/format_reward": 0.6916666865348816, "step": 870 }, { "completion_length": 263.4833419799805, "epoch": 0.4666666666666667, "grad_norm": 28.436981201171875, "kl": 3.7203125, "learning_rate": 6.436658912315788e-07, "loss": 1.093, "reward": 1.0041666984558106, "reward_std": 0.5717462062835693, "rewards/accuracy_reward": 0.2625000078231096, "rewards/format_reward": 0.7416666805744171, "step": 875 }, { "completion_length": 237.6666702270508, "epoch": 0.4693333333333333, "grad_norm": 36.36558532714844, "kl": 2.8359375, "learning_rate": 6.392004582255807e-07, "loss": 0.7905, "reward": 1.104166716337204, "reward_std": 0.5215684860944748, "rewards/accuracy_reward": 0.3375000111758709, "rewards/format_reward": 0.7666666865348816, "step": 880 }, { "completion_length": 254.6166732788086, "epoch": 0.472, "grad_norm": 87.91173553466797, "kl": 4.0609375, "learning_rate": 6.347229568912794e-07, "loss": 1.2084, "reward": 0.9250000298023224, "reward_std": 0.6195752292871475, "rewards/accuracy_reward": 0.2083333395421505, "rewards/format_reward": 0.7166666805744171, "step": 885 }, { "completion_length": 280.2916717529297, "epoch": 0.4746666666666667, "grad_norm": 33.61180114746094, "kl": 3.603125, "learning_rate": 6.302337754167369e-07, "loss": 1.1916, "reward": 1.0166666984558106, "reward_std": 0.6988473400473595, "rewards/accuracy_reward": 0.3000000089406967, "rewards/format_reward": 0.7166666805744171, "step": 890 }, { "completion_length": 193.06666946411133, "epoch": 0.47733333333333333, "grad_norm": 34.74186325073242, "kl": 2.271875, "learning_rate": 6.257333030026538e-07, "loss": 0.8303, "reward": 1.2000000476837158, "reward_std": 0.4528850480914116, "rewards/accuracy_reward": 0.3666666731238365, "rewards/format_reward": 0.8333333551883697, "step": 895 }, { "completion_length": 275.5500061035156, "epoch": 0.48, "grad_norm": 30.625368118286133, "kl": 5.39375, "learning_rate": 6.212219298286261e-07, "loss": 1.5581, "reward": 1.000000035762787, "reward_std": 0.6969257593154907, "rewards/accuracy_reward": 0.26666667237877845, "rewards/format_reward": 0.7333333551883697, "step": 900 }, { "completion_length": 214.02500610351564, "epoch": 0.4826666666666667, "grad_norm": 28.710886001586914, "kl": 3.11328125, "learning_rate": 6.167000470193188e-07, "loss": 1.1685, "reward": 0.9666666924953461, "reward_std": 0.5459988377988338, "rewards/accuracy_reward": 0.19166667386889458, "rewards/format_reward": 0.7750000119209289, "step": 905 }, { "completion_length": 224.62500610351563, "epoch": 0.48533333333333334, "grad_norm": 44.227230072021484, "kl": 2.56953125, "learning_rate": 6.121680466105559e-07, "loss": 0.9869, "reward": 0.9708333671092987, "reward_std": 0.5105708941817284, "rewards/accuracy_reward": 0.18750000819563867, "rewards/format_reward": 0.7833333551883698, "step": 910 }, { "completion_length": 184.80417404174804, "epoch": 0.488, "grad_norm": 24.200429916381836, "kl": 2.596875, "learning_rate": 6.076263215153307e-07, "loss": 0.8057, "reward": 1.1250000238418578, "reward_std": 0.4605237804353237, "rewards/accuracy_reward": 0.3000000089406967, "rewards/format_reward": 0.8250000178813934, "step": 915 }, { "completion_length": 234.8166763305664, "epoch": 0.49066666666666664, "grad_norm": 40.316471099853516, "kl": 4.540625, "learning_rate": 6.030752654897434e-07, "loss": 1.2207, "reward": 0.9833333611488342, "reward_std": 0.5229995906352997, "rewards/accuracy_reward": 0.20833333544433116, "rewards/format_reward": 0.7750000119209289, "step": 920 }, { "completion_length": 311.7916801452637, "epoch": 0.49333333333333335, "grad_norm": 34.297813415527344, "kl": 4.3859375, "learning_rate": 5.985152730988617e-07, "loss": 1.3617, "reward": 0.9625000298023224, "reward_std": 0.7176508605480194, "rewards/accuracy_reward": 0.2625000074505806, "rewards/format_reward": 0.700000011920929, "step": 925 }, { "completion_length": 182.29167404174805, "epoch": 0.496, "grad_norm": 15.424689292907715, "kl": 2.127734375, "learning_rate": 5.939467396825136e-07, "loss": 0.7552, "reward": 1.1791667103767396, "reward_std": 0.3441163420677185, "rewards/accuracy_reward": 0.31250000596046446, "rewards/format_reward": 0.8666666865348815, "step": 930 }, { "completion_length": 286.0208404541016, "epoch": 0.49866666666666665, "grad_norm": 33.052181243896484, "kl": 3.21015625, "learning_rate": 5.893700613210127e-07, "loss": 1.0319, "reward": 0.9208333551883697, "reward_std": 0.6939111322164535, "rewards/accuracy_reward": 0.23750000558793544, "rewards/format_reward": 0.6833333432674408, "step": 935 }, { "completion_length": 343.77500610351564, "epoch": 0.5013333333333333, "grad_norm": 133.8216094970703, "kl": 5.746875, "learning_rate": 5.847856348008188e-07, "loss": 1.4352, "reward": 0.9500000268220902, "reward_std": 0.7578961223363876, "rewards/accuracy_reward": 0.3000000074505806, "rewards/format_reward": 0.6500000178813934, "step": 940 }, { "completion_length": 435.433349609375, "epoch": 0.504, "grad_norm": 20.99445152282715, "kl": 4.63515625, "learning_rate": 5.801938575801371e-07, "loss": 1.3974, "reward": 0.7708333671092987, "reward_std": 0.926627391576767, "rewards/accuracy_reward": 0.27083334103226664, "rewards/format_reward": 0.5000000163912773, "step": 945 }, { "completion_length": 370.6958404541016, "epoch": 0.5066666666666667, "grad_norm": 8.884126663208008, "kl": 3.3875, "learning_rate": 5.755951277544607e-07, "loss": 1.131, "reward": 0.8166666984558105, "reward_std": 0.764404758810997, "rewards/accuracy_reward": 0.2333333395421505, "rewards/format_reward": 0.5833333551883697, "step": 950 }, { "completion_length": 340.5333419799805, "epoch": 0.5093333333333333, "grad_norm": 49.03192138671875, "kl": 3.83671875, "learning_rate": 5.709898440220551e-07, "loss": 1.1697, "reward": 0.9958333760499954, "reward_std": 0.7807205557823181, "rewards/accuracy_reward": 0.37916667461395265, "rewards/format_reward": 0.6166666805744171, "step": 955 }, { "completion_length": 263.5541793823242, "epoch": 0.512, "grad_norm": 9.941174507141113, "kl": 2.30234375, "learning_rate": 5.663784056493936e-07, "loss": 0.9917, "reward": 1.1750000298023224, "reward_std": 0.5471759930253028, "rewards/accuracy_reward": 0.4166666727513075, "rewards/format_reward": 0.7583333492279053, "step": 960 }, { "completion_length": 298.9041778564453, "epoch": 0.5146666666666667, "grad_norm": 28.354358673095703, "kl": 3.20546875, "learning_rate": 5.61761212436541e-07, "loss": 1.051, "reward": 0.862500011920929, "reward_std": 0.6203906744718551, "rewards/accuracy_reward": 0.19583333656191826, "rewards/format_reward": 0.6666666805744171, "step": 965 }, { "completion_length": 181.4708381652832, "epoch": 0.5173333333333333, "grad_norm": 10.501627922058105, "kl": 2.1251953125, "learning_rate": 5.571386646824922e-07, "loss": 0.4847, "reward": 1.1541667103767395, "reward_std": 0.2954378850758076, "rewards/accuracy_reward": 0.29583333879709245, "rewards/format_reward": 0.8583333492279053, "step": 970 }, { "completion_length": 157.45000381469725, "epoch": 0.52, "grad_norm": 8.8187894821167, "kl": 1.551953125, "learning_rate": 5.525111631504677e-07, "loss": 0.5822, "reward": 1.1541666984558105, "reward_std": 0.36384222060441973, "rewards/accuracy_reward": 0.27083334028720857, "rewards/format_reward": 0.8833333492279053, "step": 975 }, { "completion_length": 146.86666870117188, "epoch": 0.5226666666666666, "grad_norm": 8.133127212524414, "kl": 1.458203125, "learning_rate": 5.478791090331677e-07, "loss": 0.5357, "reward": 1.2083333730697632, "reward_std": 0.3662621095776558, "rewards/accuracy_reward": 0.3166666738688946, "rewards/format_reward": 0.891666692495346, "step": 980 }, { "completion_length": 128.45417251586915, "epoch": 0.5253333333333333, "grad_norm": 15.48926830291748, "kl": 1.6015625, "learning_rate": 5.432429039179899e-07, "loss": 0.3506, "reward": 1.1458333671092986, "reward_std": 0.2651623532176018, "rewards/accuracy_reward": 0.2541666731238365, "rewards/format_reward": 0.891666692495346, "step": 985 }, { "completion_length": 146.4000045776367, "epoch": 0.528, "grad_norm": 5.376186370849609, "kl": 1.834765625, "learning_rate": 5.386029497522133e-07, "loss": 0.6246, "reward": 1.2416666865348815, "reward_std": 0.35661301463842393, "rewards/accuracy_reward": 0.3583333447575569, "rewards/format_reward": 0.8833333551883698, "step": 990 }, { "completion_length": 177.82500457763672, "epoch": 0.5306666666666666, "grad_norm": 22.633089065551758, "kl": 1.623828125, "learning_rate": 5.3395964880815e-07, "loss": 0.6805, "reward": 1.141666692495346, "reward_std": 0.358070158213377, "rewards/accuracy_reward": 0.27500000707805156, "rewards/format_reward": 0.8666666805744171, "step": 995 }, { "completion_length": 196.29583969116212, "epoch": 0.5333333333333333, "grad_norm": 47.34584426879883, "kl": 1.69375, "learning_rate": 5.293134036482698e-07, "loss": 0.823, "reward": 1.087500023841858, "reward_std": 0.3883266061544418, "rewards/accuracy_reward": 0.24583334140479565, "rewards/format_reward": 0.8416666924953461, "step": 1000 }, { "epoch": 0.5333333333333333, "eval_completion_length": 175.36000528971354, "eval_kl": 1.351640625, "eval_loss": 0.610393226146698, "eval_reward": 1.0127778077125549, "eval_reward_std": 0.40122534612814587, "eval_rewards/accuracy_reward": 0.1438888931274414, "eval_rewards/format_reward": 0.8688889082272847, "eval_runtime": 672.1413, "eval_samples_per_second": 0.446, "eval_steps_per_second": 0.019, "step": 1000 }, { "completion_length": 131.97083587646483, "epoch": 0.536, "grad_norm": 80.59904479980469, "kl": 1.5953125, "learning_rate": 5.246646170902975e-07, "loss": 0.4407, "reward": 1.1833333492279052, "reward_std": 0.2533145576715469, "rewards/accuracy_reward": 0.26666667461395266, "rewards/format_reward": 0.9166666746139527, "step": 1005 }, { "completion_length": 149.46666946411133, "epoch": 0.5386666666666666, "grad_norm": 6.973750591278076, "kl": 1.4193359375, "learning_rate": 5.200136921722918e-07, "loss": 0.5413, "reward": 1.1666666924953462, "reward_std": 0.3360213190317154, "rewards/accuracy_reward": 0.26666667833924296, "rewards/format_reward": 0.9000000178813934, "step": 1010 }, { "completion_length": 178.3958366394043, "epoch": 0.5413333333333333, "grad_norm": 10.757118225097656, "kl": 1.02265625, "learning_rate": 5.153610321177013e-07, "loss": 0.4342, "reward": 1.0666667103767395, "reward_std": 0.2775675721466541, "rewards/accuracy_reward": 0.17500000670552254, "rewards/format_reward": 0.8916666865348816, "step": 1015 }, { "completion_length": 265.3958404541016, "epoch": 0.544, "grad_norm": 11.506460189819336, "kl": 2.033984375, "learning_rate": 5.107070403004066e-07, "loss": 0.8198, "reward": 0.8750000298023224, "reward_std": 0.6099086761474609, "rewards/accuracy_reward": 0.15833333767950536, "rewards/format_reward": 0.7166666865348816, "step": 1020 }, { "completion_length": 302.5041778564453, "epoch": 0.5466666666666666, "grad_norm": 16.32666015625, "kl": 2.891015625, "learning_rate": 5.060521202097489e-07, "loss": 1.0253, "reward": 1.025000014901161, "reward_std": 0.5060222968459129, "rewards/accuracy_reward": 0.27500000186264517, "rewards/format_reward": 0.7500000208616256, "step": 1025 }, { "completion_length": 416.4416763305664, "epoch": 0.5493333333333333, "grad_norm": 15.603365898132324, "kl": 2.96328125, "learning_rate": 5.013966754155482e-07, "loss": 1.1611, "reward": 0.7500000238418579, "reward_std": 0.8826716184616089, "rewards/accuracy_reward": 0.2000000063329935, "rewards/format_reward": 0.5500000149011612, "step": 1030 }, { "completion_length": 368.07500915527345, "epoch": 0.552, "grad_norm": 10.401843070983887, "kl": 3.10078125, "learning_rate": 4.967411095331149e-07, "loss": 1.114, "reward": 0.8791666865348816, "reward_std": 0.7436762899160385, "rewards/accuracy_reward": 0.2708333432674408, "rewards/format_reward": 0.608333346247673, "step": 1035 }, { "completion_length": 253.62917251586913, "epoch": 0.5546666666666666, "grad_norm": 8.17926025390625, "kl": 2.52109375, "learning_rate": 4.920858261882577e-07, "loss": 0.9692, "reward": 1.0166667014360429, "reward_std": 0.7252250477671623, "rewards/accuracy_reward": 0.3000000059604645, "rewards/format_reward": 0.7166666775941849, "step": 1040 }, { "completion_length": 202.72500228881836, "epoch": 0.5573333333333333, "grad_norm": 23.440996170043945, "kl": 2.301953125, "learning_rate": 4.874312289822899e-07, "loss": 0.8462, "reward": 1.1583333671092988, "reward_std": 0.5490816205739975, "rewards/accuracy_reward": 0.3333333469927311, "rewards/format_reward": 0.825000011920929, "step": 1045 }, { "completion_length": 236.8416748046875, "epoch": 0.56, "grad_norm": 6.160148620605469, "kl": 2.3287109375, "learning_rate": 4.827777214570384e-07, "loss": 0.9984, "reward": 1.066666704416275, "reward_std": 0.5855829656124115, "rewards/accuracy_reward": 0.2833333447575569, "rewards/format_reward": 0.7833333551883698, "step": 1050 }, { "completion_length": 160.41250305175782, "epoch": 0.5626666666666666, "grad_norm": 59.35076904296875, "kl": 2.099609375, "learning_rate": 4.781257070598571e-07, "loss": 0.6582, "reward": 1.2125000357627869, "reward_std": 0.3614451542496681, "rewards/accuracy_reward": 0.3291666731238365, "rewards/format_reward": 0.8833333492279053, "step": 1055 }, { "completion_length": 216.84167404174804, "epoch": 0.5653333333333334, "grad_norm": 11.944228172302246, "kl": 1.9265625, "learning_rate": 4.734755891086498e-07, "loss": 0.8889, "reward": 0.9833333671092988, "reward_std": 0.4978467658162117, "rewards/accuracy_reward": 0.1583333358168602, "rewards/format_reward": 0.8250000238418579, "step": 1060 }, { "completion_length": 204.32500534057618, "epoch": 0.568, "grad_norm": 20.293838500976562, "kl": 2.085546875, "learning_rate": 4.6882777075690346e-07, "loss": 0.6455, "reward": 1.1208333671092987, "reward_std": 0.46157447397708895, "rewards/accuracy_reward": 0.3041666723787785, "rewards/format_reward": 0.8166666865348816, "step": 1065 }, { "completion_length": 239.66666946411132, "epoch": 0.5706666666666667, "grad_norm": 8.901671409606934, "kl": 1.844140625, "learning_rate": 4.6418265495873516e-07, "loss": 0.7643, "reward": 1.1750000357627868, "reward_std": 0.4657398253679276, "rewards/accuracy_reward": 0.35833334624767305, "rewards/format_reward": 0.8166666865348816, "step": 1070 }, { "completion_length": 236.07917327880858, "epoch": 0.5733333333333334, "grad_norm": 10.970005989074707, "kl": 1.715234375, "learning_rate": 4.595406444339576e-07, "loss": 0.7525, "reward": 1.0000000417232513, "reward_std": 0.4881373070180416, "rewards/accuracy_reward": 0.19166667051613331, "rewards/format_reward": 0.8083333492279052, "step": 1075 }, { "completion_length": 317.90001220703124, "epoch": 0.576, "grad_norm": 14.551898956298828, "kl": 3.174609375, "learning_rate": 4.5490214163316397e-07, "loss": 0.9734, "reward": 0.8916666984558106, "reward_std": 0.6427461057901382, "rewards/accuracy_reward": 0.20833333693444728, "rewards/format_reward": 0.6833333522081375, "step": 1080 }, { "completion_length": 227.68750457763673, "epoch": 0.5786666666666667, "grad_norm": 9.023757934570312, "kl": 1.8625, "learning_rate": 4.502675487028369e-07, "loss": 0.8347, "reward": 1.0416666984558105, "reward_std": 0.5368030399084092, "rewards/accuracy_reward": 0.2416666727513075, "rewards/format_reward": 0.8000000238418579, "step": 1085 }, { "completion_length": 288.81667404174806, "epoch": 0.5813333333333334, "grad_norm": 27.050466537475586, "kl": 2.22265625, "learning_rate": 4.456372674504828e-07, "loss": 0.9977, "reward": 0.9666666924953461, "reward_std": 0.5600151270627975, "rewards/accuracy_reward": 0.23333334028720856, "rewards/format_reward": 0.7333333492279053, "step": 1090 }, { "completion_length": 233.35834045410155, "epoch": 0.584, "grad_norm": 99.57305908203125, "kl": 2.521875, "learning_rate": 4.4101169930979677e-07, "loss": 1.0178, "reward": 1.0416666984558105, "reward_std": 0.5232247993350029, "rewards/accuracy_reward": 0.2500000074505806, "rewards/format_reward": 0.7916666865348816, "step": 1095 }, { "completion_length": 237.47084121704103, "epoch": 0.5866666666666667, "grad_norm": 32.285587310791016, "kl": 2.566796875, "learning_rate": 4.3639124530585885e-07, "loss": 0.9553, "reward": 1.0416666984558105, "reward_std": 0.49489557296037673, "rewards/accuracy_reward": 0.24166667237877845, "rewards/format_reward": 0.8000000238418579, "step": 1100 }, { "completion_length": 263.5375045776367, "epoch": 0.5893333333333334, "grad_norm": 14.501636505126953, "kl": 2.036328125, "learning_rate": 4.317763060203664e-07, "loss": 0.7803, "reward": 1.0916667044162751, "reward_std": 0.5974579885601997, "rewards/accuracy_reward": 0.34166668020188806, "rewards/format_reward": 0.7500000119209289, "step": 1105 }, { "completion_length": 190.2458381652832, "epoch": 0.592, "grad_norm": 28.570791244506836, "kl": 2.1984375, "learning_rate": 4.271672815569047e-07, "loss": 0.7099, "reward": 1.1125000298023224, "reward_std": 0.4488224387168884, "rewards/accuracy_reward": 0.2958333443850279, "rewards/format_reward": 0.8166666924953461, "step": 1110 }, { "completion_length": 191.68750381469727, "epoch": 0.5946666666666667, "grad_norm": 56.26449203491211, "kl": 1.941015625, "learning_rate": 4.2256457150625847e-07, "loss": 0.7596, "reward": 1.1041666984558105, "reward_std": 0.43807603493332864, "rewards/accuracy_reward": 0.27916667200624945, "rewards/format_reward": 0.8250000238418579, "step": 1115 }, { "completion_length": 164.62500534057617, "epoch": 0.5973333333333334, "grad_norm": 159.04928588867188, "kl": 2.11171875, "learning_rate": 4.1796857491176966e-07, "loss": 0.7277, "reward": 1.200000035762787, "reward_std": 0.3421368353068829, "rewards/accuracy_reward": 0.30833334289491177, "rewards/format_reward": 0.8916666805744171, "step": 1120 }, { "completion_length": 216.95833892822264, "epoch": 0.6, "grad_norm": 12.805575370788574, "kl": 1.614453125, "learning_rate": 4.133796902347396e-07, "loss": 0.5934, "reward": 1.1250000417232513, "reward_std": 0.4264773324131966, "rewards/accuracy_reward": 0.3083333373069763, "rewards/format_reward": 0.8166666865348816, "step": 1125 }, { "completion_length": 219.3791748046875, "epoch": 0.6026666666666667, "grad_norm": 14.401493072509766, "kl": 2.0857421875, "learning_rate": 4.087983153198848e-07, "loss": 0.6371, "reward": 1.0958333671092988, "reward_std": 0.5428274616599083, "rewards/accuracy_reward": 0.3041666761040688, "rewards/format_reward": 0.7916666865348816, "step": 1130 }, { "completion_length": 189.86667556762694, "epoch": 0.6053333333333333, "grad_norm": 19.875732421875, "kl": 2.033203125, "learning_rate": 4.0422484736084414e-07, "loss": 0.6977, "reward": 1.0750000417232513, "reward_std": 0.40034623965620997, "rewards/accuracy_reward": 0.2083333395421505, "rewards/format_reward": 0.8666666805744171, "step": 1135 }, { "completion_length": 160.80417022705078, "epoch": 0.608, "grad_norm": 9.801048278808594, "kl": 1.8109375, "learning_rate": 3.9965968286574367e-07, "loss": 0.5807, "reward": 1.1375000476837158, "reward_std": 0.3900581821799278, "rewards/accuracy_reward": 0.2625000040978193, "rewards/format_reward": 0.8750000178813935, "step": 1140 }, { "completion_length": 157.98750534057618, "epoch": 0.6106666666666667, "grad_norm": 30.29570770263672, "kl": 1.40703125, "learning_rate": 3.951032176228199e-07, "loss": 0.5565, "reward": 1.200000035762787, "reward_std": 0.2764429710805416, "rewards/accuracy_reward": 0.2916666716337204, "rewards/format_reward": 0.9083333492279053, "step": 1145 }, { "completion_length": 180.90000610351564, "epoch": 0.6133333333333333, "grad_norm": 23.002822875976562, "kl": 1.65390625, "learning_rate": 3.9055584666610596e-07, "loss": 0.6413, "reward": 1.2041666984558106, "reward_std": 0.40405053198337554, "rewards/accuracy_reward": 0.33750001415610315, "rewards/format_reward": 0.8666666924953461, "step": 1150 }, { "completion_length": 186.8958396911621, "epoch": 0.616, "grad_norm": 12.507006645202637, "kl": 1.280078125, "learning_rate": 3.860179642411837e-07, "loss": 0.6498, "reward": 1.1333333432674408, "reward_std": 0.3774823874235153, "rewards/accuracy_reward": 0.2750000037252903, "rewards/format_reward": 0.8583333492279053, "step": 1155 }, { "completion_length": 155.95416946411132, "epoch": 0.6186666666666667, "grad_norm": 3.330310583114624, "kl": 1.2033203125, "learning_rate": 3.8148996377100304e-07, "loss": 0.4313, "reward": 1.2041667103767395, "reward_std": 0.3509316384792328, "rewards/accuracy_reward": 0.3125000074505806, "rewards/format_reward": 0.8916666805744171, "step": 1160 }, { "completion_length": 188.76667327880858, "epoch": 0.6213333333333333, "grad_norm": 40.26872634887695, "kl": 2.188671875, "learning_rate": 3.7697223782177303e-07, "loss": 0.6651, "reward": 1.2583333909511567, "reward_std": 0.4299319893121719, "rewards/accuracy_reward": 0.4083333432674408, "rewards/format_reward": 0.8500000178813935, "step": 1165 }, { "completion_length": 252.42917556762694, "epoch": 0.624, "grad_norm": 16.348541259765625, "kl": 2.37421875, "learning_rate": 3.724651780689285e-07, "loss": 0.9173, "reward": 1.0250000298023223, "reward_std": 0.5082567930221558, "rewards/accuracy_reward": 0.2666666731238365, "rewards/format_reward": 0.7583333551883698, "step": 1170 }, { "completion_length": 239.4791717529297, "epoch": 0.6266666666666667, "grad_norm": 17.348703384399414, "kl": 1.586328125, "learning_rate": 3.679691752631715e-07, "loss": 0.796, "reward": 1.1250000417232513, "reward_std": 0.4487001359462738, "rewards/accuracy_reward": 0.3250000089406967, "rewards/format_reward": 0.8000000238418579, "step": 1175 }, { "completion_length": 184.93750610351563, "epoch": 0.6293333333333333, "grad_norm": 5.322170734405518, "kl": 1.261328125, "learning_rate": 3.6348461919659433e-07, "loss": 0.5975, "reward": 1.200000023841858, "reward_std": 0.394177620112896, "rewards/accuracy_reward": 0.35000001043081286, "rewards/format_reward": 0.8500000178813935, "step": 1180 }, { "completion_length": 149.95417175292968, "epoch": 0.632, "grad_norm": 4.652257919311523, "kl": 1.3166015625, "learning_rate": 3.590118986688865e-07, "loss": 0.5187, "reward": 1.2375000357627868, "reward_std": 0.35248192474246026, "rewards/accuracy_reward": 0.34583334140479566, "rewards/format_reward": 0.8916666805744171, "step": 1185 }, { "completion_length": 214.28750915527343, "epoch": 0.6346666666666667, "grad_norm": 8.401276588439941, "kl": 1.615234375, "learning_rate": 3.5455140145362586e-07, "loss": 0.803, "reward": 1.1166667103767396, "reward_std": 0.385433167219162, "rewards/accuracy_reward": 0.2750000089406967, "rewards/format_reward": 0.8416666865348816, "step": 1190 }, { "completion_length": 243.75000686645507, "epoch": 0.6373333333333333, "grad_norm": 17.25551414489746, "kl": 1.5962890625, "learning_rate": 3.5010351426466003e-07, "loss": 0.7611, "reward": 1.1750000417232513, "reward_std": 0.5330882802605629, "rewards/accuracy_reward": 0.36666667759418486, "rewards/format_reward": 0.8083333492279052, "step": 1195 }, { "completion_length": 185.69167251586913, "epoch": 0.64, "grad_norm": 15.053462982177734, "kl": 1.40078125, "learning_rate": 3.4566862272257917e-07, "loss": 0.7929, "reward": 1.154166692495346, "reward_std": 0.4828508198261261, "rewards/accuracy_reward": 0.30416667461395264, "rewards/format_reward": 0.8500000178813935, "step": 1200 }, { "epoch": 0.64, "eval_completion_length": 263.9861196899414, "eval_kl": 2.4522395833333333, "eval_loss": 0.904167890548706, "eval_reward": 0.9144444712003073, "eval_reward_std": 0.5771439289053281, "eval_rewards/accuracy_reward": 0.16111111541589102, "eval_rewards/format_reward": 0.7533333551883697, "eval_runtime": 815.5292, "eval_samples_per_second": 0.368, "eval_steps_per_second": 0.016, "step": 1200 }, { "completion_length": 224.0416717529297, "epoch": 0.6426666666666667, "grad_norm": 14.006892204284668, "kl": 2.721875, "learning_rate": 3.412471113212837e-07, "loss": 0.8414, "reward": 1.0583333611488341, "reward_std": 0.5013190120458603, "rewards/accuracy_reward": 0.2583333384245634, "rewards/format_reward": 0.8000000178813934, "step": 1205 }, { "completion_length": 250.73750228881835, "epoch": 0.6453333333333333, "grad_norm": 4.346928596496582, "kl": 1.89140625, "learning_rate": 3.3683936339464955e-07, "loss": 0.6544, "reward": 1.1083333611488342, "reward_std": 0.5055985808372497, "rewards/accuracy_reward": 0.33333334103226664, "rewards/format_reward": 0.7750000178813934, "step": 1210 }, { "completion_length": 262.02500915527344, "epoch": 0.648, "grad_norm": 8.122538566589355, "kl": 1.6572265625, "learning_rate": 3.324457610832941e-07, "loss": 0.8986, "reward": 1.1541667103767395, "reward_std": 0.5832742094993592, "rewards/accuracy_reward": 0.3791666805744171, "rewards/format_reward": 0.775000023841858, "step": 1215 }, { "completion_length": 245.7916717529297, "epoch": 0.6506666666666666, "grad_norm": 11.933501243591309, "kl": 1.238671875, "learning_rate": 3.280666853014457e-07, "loss": 0.7222, "reward": 1.1958333611488343, "reward_std": 0.5057858511805534, "rewards/accuracy_reward": 0.40416667573153975, "rewards/format_reward": 0.7916666746139527, "step": 1220 }, { "completion_length": 311.77917633056643, "epoch": 0.6533333333333333, "grad_norm": 13.826354026794434, "kl": 2.342578125, "learning_rate": 3.2370251570391925e-07, "loss": 0.9241, "reward": 0.941666704416275, "reward_std": 0.5843323901295662, "rewards/accuracy_reward": 0.2000000037252903, "rewards/format_reward": 0.7416666865348815, "step": 1225 }, { "completion_length": 288.49167709350587, "epoch": 0.656, "grad_norm": 12.062870979309082, "kl": 2.384375, "learning_rate": 3.1935363065320126e-07, "loss": 0.9282, "reward": 0.9458333492279053, "reward_std": 0.5712588280439377, "rewards/accuracy_reward": 0.19583333730697633, "rewards/format_reward": 0.7500000119209289, "step": 1230 }, { "completion_length": 235.39584197998047, "epoch": 0.6586666666666666, "grad_norm": 7.450344085693359, "kl": 1.758203125, "learning_rate": 3.150204071866464e-07, "loss": 0.6324, "reward": 1.1458333611488343, "reward_std": 0.5423780143260956, "rewards/accuracy_reward": 0.36250000819563866, "rewards/format_reward": 0.7833333551883698, "step": 1235 }, { "completion_length": 197.81250762939453, "epoch": 0.6613333333333333, "grad_norm": 11.524059295654297, "kl": 1.31171875, "learning_rate": 3.107032209837892e-07, "loss": 0.5257, "reward": 1.0458333551883698, "reward_std": 0.334098968654871, "rewards/accuracy_reward": 0.19583333805203437, "rewards/format_reward": 0.850000011920929, "step": 1240 }, { "completion_length": 247.20000762939452, "epoch": 0.664, "grad_norm": 7.961787700653076, "kl": 1.726171875, "learning_rate": 3.064024463337747e-07, "loss": 0.8873, "reward": 0.916666692495346, "reward_std": 0.4848631680011749, "rewards/accuracy_reward": 0.14166667088866233, "rewards/format_reward": 0.775000023841858, "step": 1245 }, { "completion_length": 229.66667633056642, "epoch": 0.6666666666666666, "grad_norm": 23.106739044189453, "kl": 1.5203125, "learning_rate": 3.021184561029071e-07, "loss": 0.8455, "reward": 1.1083333611488342, "reward_std": 0.45471236705780027, "rewards/accuracy_reward": 0.28333334140479566, "rewards/format_reward": 0.8250000238418579, "step": 1250 }, { "completion_length": 316.7458381652832, "epoch": 0.6693333333333333, "grad_norm": 9.744142532348633, "kl": 2.18046875, "learning_rate": 2.9785162170232424e-07, "loss": 0.9333, "reward": 0.829166692495346, "reward_std": 0.6003586441278458, "rewards/accuracy_reward": 0.13750000558793546, "rewards/format_reward": 0.6916666984558105, "step": 1255 }, { "completion_length": 242.23750762939454, "epoch": 0.672, "grad_norm": 25.725797653198242, "kl": 1.94375, "learning_rate": 2.936023130557964e-07, "loss": 0.7897, "reward": 1.079166704416275, "reward_std": 0.5282017394900322, "rewards/accuracy_reward": 0.2875000059604645, "rewards/format_reward": 0.7916666865348816, "step": 1260 }, { "completion_length": 226.35417251586915, "epoch": 0.6746666666666666, "grad_norm": 4.361880779266357, "kl": 1.287109375, "learning_rate": 2.893708985676556e-07, "loss": 0.7382, "reward": 1.1708333611488342, "reward_std": 0.4796324223279953, "rewards/accuracy_reward": 0.33750001415610315, "rewards/format_reward": 0.8333333551883697, "step": 1265 }, { "completion_length": 179.21250839233397, "epoch": 0.6773333333333333, "grad_norm": 10.620832443237305, "kl": 1.5546875, "learning_rate": 2.851577450908553e-07, "loss": 0.5363, "reward": 1.204166692495346, "reward_std": 0.39730483293533325, "rewards/accuracy_reward": 0.3291666775941849, "rewards/format_reward": 0.8750000238418579, "step": 1270 }, { "completion_length": 225.60834045410155, "epoch": 0.68, "grad_norm": 24.622392654418945, "kl": 1.930859375, "learning_rate": 2.809632178951655e-07, "loss": 0.7525, "reward": 1.2000000476837158, "reward_std": 0.43045871555805204, "rewards/accuracy_reward": 0.3833333432674408, "rewards/format_reward": 0.8166666865348816, "step": 1275 }, { "completion_length": 189.73333892822265, "epoch": 0.6826666666666666, "grad_norm": 7.506571292877197, "kl": 1.469921875, "learning_rate": 2.767876806355045e-07, "loss": 0.6633, "reward": 1.1250000298023224, "reward_std": 0.4194158732891083, "rewards/accuracy_reward": 0.2666666753590107, "rewards/format_reward": 0.8583333551883697, "step": 1280 }, { "completion_length": 205.73334045410155, "epoch": 0.6853333333333333, "grad_norm": 6.358190059661865, "kl": 1.3841796875, "learning_rate": 2.7263149532041107e-07, "loss": 0.8039, "reward": 1.2166667103767395, "reward_std": 0.5374398469924927, "rewards/accuracy_reward": 0.3750000111758709, "rewards/format_reward": 0.8416666865348816, "step": 1285 }, { "completion_length": 210.5208427429199, "epoch": 0.688, "grad_norm": 27.677288055419922, "kl": 2.022265625, "learning_rate": 2.6849502228065955e-07, "loss": 0.8598, "reward": 1.0458333790302277, "reward_std": 0.47432570457458495, "rewards/accuracy_reward": 0.2291666693985462, "rewards/format_reward": 0.8166666924953461, "step": 1290 }, { "completion_length": 206.36667022705078, "epoch": 0.6906666666666667, "grad_norm": 27.595359802246094, "kl": 2.200390625, "learning_rate": 2.6437862013801937e-07, "loss": 0.7624, "reward": 1.075000023841858, "reward_std": 0.45619752407073977, "rewards/accuracy_reward": 0.27500000447034834, "rewards/format_reward": 0.8000000238418579, "step": 1295 }, { "completion_length": 281.9166717529297, "epoch": 0.6933333333333334, "grad_norm": 7.714141845703125, "kl": 3.14765625, "learning_rate": 2.6028264577416414e-07, "loss": 0.9706, "reward": 0.8500000357627868, "reward_std": 0.6301615715026856, "rewards/accuracy_reward": 0.1916666690260172, "rewards/format_reward": 0.6583333551883698, "step": 1300 }, { "completion_length": 227.537508392334, "epoch": 0.696, "grad_norm": 11.018620491027832, "kl": 1.490625, "learning_rate": 2.5620745429973046e-07, "loss": 0.7425, "reward": 1.0625000417232513, "reward_std": 0.47583652585744857, "rewards/accuracy_reward": 0.2708333406597376, "rewards/format_reward": 0.7916666865348816, "step": 1305 }, { "completion_length": 206.64584045410157, "epoch": 0.6986666666666667, "grad_norm": 7.6855149269104, "kl": 1.15, "learning_rate": 2.5215339902353093e-07, "loss": 0.6114, "reward": 1.2291666984558105, "reward_std": 0.40088424533605577, "rewards/accuracy_reward": 0.37083333879709246, "rewards/format_reward": 0.8583333492279053, "step": 1310 }, { "completion_length": 236.3791717529297, "epoch": 0.7013333333333334, "grad_norm": 36.869991302490234, "kl": 1.91953125, "learning_rate": 2.4812083142192323e-07, "loss": 0.8435, "reward": 1.1000000357627868, "reward_std": 0.501855157315731, "rewards/accuracy_reward": 0.30000000447034836, "rewards/format_reward": 0.8000000178813934, "step": 1315 }, { "completion_length": 239.29167404174805, "epoch": 0.704, "grad_norm": 34.20293045043945, "kl": 2.9728515625, "learning_rate": 2.441101011083378e-07, "loss": 0.652, "reward": 1.1500000417232514, "reward_std": 0.4413339100778103, "rewards/accuracy_reward": 0.33333334028720857, "rewards/format_reward": 0.8166666865348816, "step": 1320 }, { "completion_length": 196.09583740234376, "epoch": 0.7066666666666667, "grad_norm": 6.715369701385498, "kl": 1.90703125, "learning_rate": 2.4012155580296705e-07, "loss": 0.571, "reward": 1.3291667222976684, "reward_std": 0.47337266951799395, "rewards/accuracy_reward": 0.4958333492279053, "rewards/format_reward": 0.8333333492279053, "step": 1325 }, { "completion_length": 218.24584045410157, "epoch": 0.7093333333333334, "grad_norm": 6.281963348388672, "kl": 1.1951171875, "learning_rate": 2.3615554130262e-07, "loss": 0.7098, "reward": 1.2250000357627868, "reward_std": 0.4241747669875622, "rewards/accuracy_reward": 0.3916666805744171, "rewards/format_reward": 0.8333333551883697, "step": 1330 }, { "completion_length": 233.00000915527343, "epoch": 0.712, "grad_norm": 65.47384643554688, "kl": 1.837109375, "learning_rate": 2.3221240145074095e-07, "loss": 0.8178, "reward": 1.0833333551883697, "reward_std": 0.49581558406353, "rewards/accuracy_reward": 0.3000000059604645, "rewards/format_reward": 0.7833333551883698, "step": 1335 }, { "completion_length": 223.82500839233398, "epoch": 0.7146666666666667, "grad_norm": 4.103445529937744, "kl": 1.4453125, "learning_rate": 2.2829247810760021e-07, "loss": 0.7565, "reward": 1.075000023841858, "reward_std": 0.45809874683618546, "rewards/accuracy_reward": 0.2666666697710752, "rewards/format_reward": 0.8083333551883698, "step": 1340 }, { "completion_length": 205.72083892822266, "epoch": 0.7173333333333334, "grad_norm": 12.356497764587402, "kl": 1.340234375, "learning_rate": 2.2439611112065547e-07, "loss": 0.5622, "reward": 1.0833333671092986, "reward_std": 0.3753781244158745, "rewards/accuracy_reward": 0.2500000026077032, "rewards/format_reward": 0.8333333492279053, "step": 1345 }, { "completion_length": 223.57083969116212, "epoch": 0.72, "grad_norm": 17.527667999267578, "kl": 1.3921875, "learning_rate": 2.2052363829508776e-07, "loss": 0.6411, "reward": 1.2500000536441802, "reward_std": 0.46224844083189964, "rewards/accuracy_reward": 0.41666667014360426, "rewards/format_reward": 0.8333333551883697, "step": 1350 }, { "completion_length": 223.97083892822266, "epoch": 0.7226666666666667, "grad_norm": 6.988903522491455, "kl": 2.165234375, "learning_rate": 2.1667539536451452e-07, "loss": 0.7447, "reward": 1.004166704416275, "reward_std": 0.5843853443861008, "rewards/accuracy_reward": 0.2458333346992731, "rewards/format_reward": 0.7583333611488342, "step": 1355 }, { "completion_length": 207.48334045410155, "epoch": 0.7253333333333334, "grad_norm": 4.7572832107543945, "kl": 1.383203125, "learning_rate": 2.1285171596188268e-07, "loss": 0.6242, "reward": 1.2250000357627868, "reward_std": 0.4148427419364452, "rewards/accuracy_reward": 0.36666668206453323, "rewards/format_reward": 0.8583333551883697, "step": 1360 }, { "completion_length": 198.64167556762695, "epoch": 0.728, "grad_norm": 9.330974578857422, "kl": 1.2017578125, "learning_rate": 2.090529315905431e-07, "loss": 0.5328, "reward": 1.1500000298023223, "reward_std": 0.36292394176125525, "rewards/accuracy_reward": 0.3000000089406967, "rewards/format_reward": 0.8500000089406967, "step": 1365 }, { "completion_length": 175.92083740234375, "epoch": 0.7306666666666667, "grad_norm": 5.827265739440918, "kl": 1.030078125, "learning_rate": 2.052793715955104e-07, "loss": 0.4956, "reward": 1.2375000238418579, "reward_std": 0.41373512148857117, "rewards/accuracy_reward": 0.3625000089406967, "rewards/format_reward": 0.8750000178813935, "step": 1370 }, { "completion_length": 211.1041732788086, "epoch": 0.7333333333333333, "grad_norm": 19.677011489868164, "kl": 1.71640625, "learning_rate": 2.0153136313490943e-07, "loss": 0.6799, "reward": 1.1750000238418579, "reward_std": 0.4355910629034042, "rewards/accuracy_reward": 0.33333333767950535, "rewards/format_reward": 0.8416666805744171, "step": 1375 }, { "completion_length": 178.94584045410156, "epoch": 0.736, "grad_norm": 9.273496627807617, "kl": 2.182421875, "learning_rate": 1.9780923115161158e-07, "loss": 0.7168, "reward": 1.2291667103767394, "reward_std": 0.4490681551396847, "rewards/accuracy_reward": 0.37916667461395265, "rewards/format_reward": 0.8500000178813935, "step": 1380 }, { "completion_length": 169.15000762939454, "epoch": 0.7386666666666667, "grad_norm": 4.525454044342041, "kl": 1.0986328125, "learning_rate": 1.9411329834506286e-07, "loss": 0.5639, "reward": 1.237500047683716, "reward_std": 0.43203722685575485, "rewards/accuracy_reward": 0.3708333432674408, "rewards/format_reward": 0.8666666865348815, "step": 1385 }, { "completion_length": 246.45417175292968, "epoch": 0.7413333333333333, "grad_norm": 6.603713035583496, "kl": 1.836328125, "learning_rate": 1.904438851433068e-07, "loss": 0.9502, "reward": 0.9625000119209289, "reward_std": 0.5920814260840416, "rewards/accuracy_reward": 0.1958333369344473, "rewards/format_reward": 0.7666666865348816, "step": 1390 }, { "completion_length": 228.13750686645508, "epoch": 0.744, "grad_norm": 12.823264122009277, "kl": 2.1423828125, "learning_rate": 1.868013096752043e-07, "loss": 0.6335, "reward": 1.0458333492279053, "reward_std": 0.4450342819094658, "rewards/accuracy_reward": 0.2708333358168602, "rewards/format_reward": 0.7750000178813934, "step": 1395 }, { "completion_length": 179.4166717529297, "epoch": 0.7466666666666667, "grad_norm": 5.9196295738220215, "kl": 1.4501953125, "learning_rate": 1.8318588774285237e-07, "loss": 0.675, "reward": 1.2958333611488342, "reward_std": 0.4500196687877178, "rewards/accuracy_reward": 0.45416668355464934, "rewards/format_reward": 0.8416666924953461, "step": 1400 }, { "epoch": 0.7466666666666667, "eval_completion_length": 254.25000681559246, "eval_kl": 2.1704947916666666, "eval_loss": 0.8455010652542114, "eval_reward": 0.9288889118035635, "eval_reward_std": 0.5875919719537099, "eval_rewards/accuracy_reward": 0.1711111158132553, "eval_rewards/format_reward": 0.7577777977784474, "eval_runtime": 828.1328, "eval_samples_per_second": 0.362, "eval_steps_per_second": 0.016, "step": 1400 }, { "completion_length": 203.6458396911621, "epoch": 0.7493333333333333, "grad_norm": 6.658132553100586, "kl": 1.80625, "learning_rate": 1.7959793279420505e-07, "loss": 0.7215, "reward": 1.1333333730697632, "reward_std": 0.4133936479687691, "rewards/accuracy_reward": 0.2916666746139526, "rewards/format_reward": 0.8416666924953461, "step": 1405 }, { "completion_length": 137.4416702270508, "epoch": 0.752, "grad_norm": 5.461396217346191, "kl": 1.5439453125, "learning_rate": 1.760377558958982e-07, "loss": 0.4963, "reward": 1.1791667282581328, "reward_std": 0.3674156993627548, "rewards/accuracy_reward": 0.287500013038516, "rewards/format_reward": 0.8916666865348816, "step": 1410 }, { "completion_length": 180.80833892822267, "epoch": 0.7546666666666667, "grad_norm": 6.98518180847168, "kl": 1.206640625, "learning_rate": 1.72505665706281e-07, "loss": 0.6087, "reward": 1.2375000536441803, "reward_std": 0.3306782692670822, "rewards/accuracy_reward": 0.3541666746139526, "rewards/format_reward": 0.8833333551883698, "step": 1415 }, { "completion_length": 148.8083396911621, "epoch": 0.7573333333333333, "grad_norm": 9.099882125854492, "kl": 1.595703125, "learning_rate": 1.690019684486557e-07, "loss": 0.5703, "reward": 1.1416666984558106, "reward_std": 0.3741296485066414, "rewards/accuracy_reward": 0.2666666775941849, "rewards/format_reward": 0.8750000119209289, "step": 1420 }, { "completion_length": 205.50000534057617, "epoch": 0.76, "grad_norm": 3.151777744293213, "kl": 1.4462890625, "learning_rate": 1.655269678847292e-07, "loss": 0.6055, "reward": 1.1291666865348815, "reward_std": 0.415025033056736, "rewards/accuracy_reward": 0.2708333432674408, "rewards/format_reward": 0.8583333432674408, "step": 1425 }, { "completion_length": 156.6208381652832, "epoch": 0.7626666666666667, "grad_norm": 8.909594535827637, "kl": 1.776953125, "learning_rate": 1.6208096528827714e-07, "loss": 0.4477, "reward": 1.1916666984558106, "reward_std": 0.291341669857502, "rewards/accuracy_reward": 0.3166666816920042, "rewards/format_reward": 0.8750000178813935, "step": 1430 }, { "completion_length": 214.9083396911621, "epoch": 0.7653333333333333, "grad_norm": 10.612777709960938, "kl": 1.9314453125, "learning_rate": 1.5866425941902522e-07, "loss": 0.7519, "reward": 1.0875000536441803, "reward_std": 0.4693745546042919, "rewards/accuracy_reward": 0.2625000089406967, "rewards/format_reward": 0.8250000178813934, "step": 1435 }, { "completion_length": 223.26250991821288, "epoch": 0.768, "grad_norm": 21.530683517456055, "kl": 1.9796875, "learning_rate": 1.5527714649674638e-07, "loss": 0.8694, "reward": 1.1375000476837158, "reward_std": 0.47066808491945267, "rewards/accuracy_reward": 0.3375000089406967, "rewards/format_reward": 0.8000000238418579, "step": 1440 }, { "completion_length": 226.37083740234374, "epoch": 0.7706666666666667, "grad_norm": 13.953042030334473, "kl": 2.4328125, "learning_rate": 1.5191992017557993e-07, "loss": 0.8953, "reward": 0.9833333551883697, "reward_std": 0.5663923621177673, "rewards/accuracy_reward": 0.2250000074505806, "rewards/format_reward": 0.7583333492279053, "step": 1445 }, { "completion_length": 235.72917404174805, "epoch": 0.7733333333333333, "grad_norm": 89.63887786865234, "kl": 1.90859375, "learning_rate": 1.485928715185721e-07, "loss": 0.6224, "reward": 1.1458333492279054, "reward_std": 0.4521483927965164, "rewards/accuracy_reward": 0.3458333402872086, "rewards/format_reward": 0.8000000238418579, "step": 1450 }, { "completion_length": 238.7125045776367, "epoch": 0.776, "grad_norm": 9.134527206420898, "kl": 0.866015625, "learning_rate": 1.4529628897244212e-07, "loss": 0.7825, "reward": 1.1291666984558106, "reward_std": 0.49776799529790877, "rewards/accuracy_reward": 0.31250000447034837, "rewards/format_reward": 0.8166666984558105, "step": 1455 }, { "completion_length": 201.51250381469725, "epoch": 0.7786666666666666, "grad_norm": 11.216150283813477, "kl": 1.4703125, "learning_rate": 1.4203045834257417e-07, "loss": 0.6511, "reward": 1.137500035762787, "reward_std": 0.4635964795947075, "rewards/accuracy_reward": 0.31250000894069674, "rewards/format_reward": 0.8250000238418579, "step": 1460 }, { "completion_length": 130.93750381469727, "epoch": 0.7813333333333333, "grad_norm": 3.627894639968872, "kl": 1.075, "learning_rate": 1.3879566276823896e-07, "loss": 0.3128, "reward": 1.2708333611488343, "reward_std": 0.2865241147577763, "rewards/accuracy_reward": 0.3375000089406967, "rewards/format_reward": 0.9333333432674408, "step": 1465 }, { "completion_length": 216.83750839233397, "epoch": 0.784, "grad_norm": 12.179669380187988, "kl": 1.70078125, "learning_rate": 1.3559218269804624e-07, "loss": 0.7939, "reward": 1.237500047683716, "reward_std": 0.4636766240000725, "rewards/accuracy_reward": 0.42083334252238275, "rewards/format_reward": 0.8166666865348816, "step": 1470 }, { "completion_length": 158.32917251586915, "epoch": 0.7866666666666666, "grad_norm": 5.9073967933654785, "kl": 1.6234375, "learning_rate": 1.3242029586563054e-07, "loss": 0.4521, "reward": 1.066666692495346, "reward_std": 0.27624749541282656, "rewards/accuracy_reward": 0.20000000447034835, "rewards/format_reward": 0.8666666865348815, "step": 1475 }, { "completion_length": 207.18334197998047, "epoch": 0.7893333333333333, "grad_norm": 3.3254387378692627, "kl": 1.4654296875, "learning_rate": 1.2928027726557255e-07, "loss": 0.5483, "reward": 1.2083333730697632, "reward_std": 0.43673594370484353, "rewards/accuracy_reward": 0.38333334140479564, "rewards/format_reward": 0.8250000178813934, "step": 1480 }, { "completion_length": 235.04167556762695, "epoch": 0.792, "grad_norm": 10.714239120483398, "kl": 1.662109375, "learning_rate": 1.2617239912955757e-07, "loss": 0.8735, "reward": 1.0708333671092987, "reward_std": 0.4854356274008751, "rewards/accuracy_reward": 0.2541666731238365, "rewards/format_reward": 0.8166666865348816, "step": 1485 }, { "completion_length": 176.85834045410155, "epoch": 0.7946666666666666, "grad_norm": 6.300493240356445, "kl": 1.4615234375, "learning_rate": 1.230969309027739e-07, "loss": 0.5979, "reward": 1.262500023841858, "reward_std": 0.41847621351480485, "rewards/accuracy_reward": 0.3958333473652601, "rewards/format_reward": 0.8666666805744171, "step": 1490 }, { "completion_length": 230.1958381652832, "epoch": 0.7973333333333333, "grad_norm": 10.270231246948242, "kl": 2.72734375, "learning_rate": 1.2005413922055248e-07, "loss": 0.9828, "reward": 0.9833333671092988, "reward_std": 0.6273943156003952, "rewards/accuracy_reward": 0.21666667349636554, "rewards/format_reward": 0.7666666865348816, "step": 1495 }, { "completion_length": 277.3833374023437, "epoch": 0.8, "grad_norm": 20.984962463378906, "kl": 2.16640625, "learning_rate": 1.1704428788525029e-07, "loss": 0.8738, "reward": 1.066666692495346, "reward_std": 0.6233089223504067, "rewards/accuracy_reward": 0.3250000089406967, "rewards/format_reward": 0.7416666865348815, "step": 1500 }, { "completion_length": 201.60000762939453, "epoch": 0.8026666666666666, "grad_norm": 5.6333489418029785, "kl": 1.384375, "learning_rate": 1.1406763784337948e-07, "loss": 0.6346, "reward": 1.1416666865348817, "reward_std": 0.4620711088180542, "rewards/accuracy_reward": 0.30833334289491177, "rewards/format_reward": 0.8333333551883697, "step": 1505 }, { "completion_length": 155.48750381469728, "epoch": 0.8053333333333333, "grad_norm": 2.1134753227233887, "kl": 0.8833984375, "learning_rate": 1.111244471629838e-07, "loss": 0.4204, "reward": 1.2833333611488342, "reward_std": 0.3339101344347, "rewards/accuracy_reward": 0.3666666835546494, "rewards/format_reward": 0.9166666805744171, "step": 1510 }, { "completion_length": 208.70000686645508, "epoch": 0.808, "grad_norm": 15.448051452636719, "kl": 1.694140625, "learning_rate": 1.0821497101126487e-07, "loss": 0.725, "reward": 1.079166704416275, "reward_std": 0.46868581771850587, "rewards/accuracy_reward": 0.25416667126119136, "rewards/format_reward": 0.8250000178813934, "step": 1515 }, { "completion_length": 174.8625045776367, "epoch": 0.8106666666666666, "grad_norm": 10.781864166259766, "kl": 1.6875, "learning_rate": 1.0533946163245983e-07, "loss": 0.651, "reward": 1.1500000417232514, "reward_std": 0.4552404969930649, "rewards/accuracy_reward": 0.3166666753590107, "rewards/format_reward": 0.8333333551883697, "step": 1520 }, { "completion_length": 177.89167327880858, "epoch": 0.8133333333333334, "grad_norm": 6.928041458129883, "kl": 1.4318359375, "learning_rate": 1.024981683259723e-07, "loss": 0.6095, "reward": 1.2250000357627868, "reward_std": 0.39699684381484984, "rewards/accuracy_reward": 0.35000001192092894, "rewards/format_reward": 0.8750000238418579, "step": 1525 }, { "completion_length": 183.4708396911621, "epoch": 0.816, "grad_norm": 7.962174892425537, "kl": 1.2412109375, "learning_rate": 9.969133742475883e-08, "loss": 0.6025, "reward": 1.2291666924953462, "reward_std": 0.4208852708339691, "rewards/accuracy_reward": 0.3625000089406967, "rewards/format_reward": 0.8666666924953461, "step": 1530 }, { "completion_length": 256.63334503173826, "epoch": 0.8186666666666667, "grad_norm": 33.930641174316406, "kl": 1.7005859375, "learning_rate": 9.691921227397226e-08, "loss": 0.752, "reward": 0.9458333611488342, "reward_std": 0.4962170884013176, "rewards/accuracy_reward": 0.16250000558793545, "rewards/format_reward": 0.7833333611488342, "step": 1535 }, { "completion_length": 178.5583381652832, "epoch": 0.8213333333333334, "grad_norm": 6.385335445404053, "kl": 1.1046875, "learning_rate": 9.4182033209865e-08, "loss": 0.5621, "reward": 1.1833333730697633, "reward_std": 0.3783799774944782, "rewards/accuracy_reward": 0.3083333406597376, "rewards/format_reward": 0.8750000178813935, "step": 1540 }, { "completion_length": 183.16250610351562, "epoch": 0.824, "grad_norm": 7.927608966827393, "kl": 1.2470703125, "learning_rate": 9.148003753895144e-08, "loss": 0.5574, "reward": 1.1875000476837159, "reward_std": 0.40479181706905365, "rewards/accuracy_reward": 0.33750000409781933, "rewards/format_reward": 0.8500000178813935, "step": 1545 }, { "completion_length": 193.51667404174805, "epoch": 0.8266666666666667, "grad_norm": 8.427675247192383, "kl": 1.1888671875, "learning_rate": 8.881345951743485e-08, "loss": 0.7181, "reward": 1.200000035762787, "reward_std": 0.43247459903359414, "rewards/accuracy_reward": 0.3500000085681677, "rewards/format_reward": 0.8500000178813935, "step": 1550 }, { "completion_length": 190.77500686645507, "epoch": 0.8293333333333334, "grad_norm": 7.281441688537598, "kl": 1.5609375, "learning_rate": 8.618253033089767e-08, "loss": 0.6316, "reward": 1.1625000476837157, "reward_std": 0.3389572203159332, "rewards/accuracy_reward": 0.3041666716337204, "rewards/format_reward": 0.8583333551883697, "step": 1555 }, { "completion_length": 196.6958366394043, "epoch": 0.832, "grad_norm": 3.593385696411133, "kl": 1.7515625, "learning_rate": 8.358747807425826e-08, "loss": 0.6747, "reward": 1.2041666984558106, "reward_std": 0.430715125054121, "rewards/accuracy_reward": 0.3625000145286322, "rewards/format_reward": 0.8416666865348816, "step": 1560 }, { "completion_length": 153.6750045776367, "epoch": 0.8346666666666667, "grad_norm": 3.8720762729644775, "kl": 1.4552734375, "learning_rate": 8.102852773199586e-08, "loss": 0.4917, "reward": 1.2208333671092988, "reward_std": 0.34195019751787187, "rewards/accuracy_reward": 0.3125000111758709, "rewards/format_reward": 0.9083333492279053, "step": 1565 }, { "completion_length": 158.85833816528321, "epoch": 0.8373333333333334, "grad_norm": 4.770650386810303, "kl": 1.3427734375, "learning_rate": 7.850590115864481e-08, "loss": 0.4683, "reward": 1.225000023841858, "reward_std": 0.33489523082971573, "rewards/accuracy_reward": 0.33333334028720857, "rewards/format_reward": 0.8916666865348816, "step": 1570 }, { "completion_length": 193.32500534057618, "epoch": 0.84, "grad_norm": 5.848920822143555, "kl": 1.3138671875, "learning_rate": 7.601981705956039e-08, "loss": 0.4761, "reward": 1.2541667103767395, "reward_std": 0.4497212260961533, "rewards/accuracy_reward": 0.4291666798293591, "rewards/format_reward": 0.8250000178813934, "step": 1575 }, { "completion_length": 217.37500610351563, "epoch": 0.8426666666666667, "grad_norm": 9.334526062011719, "kl": 1.68125, "learning_rate": 7.357049097195773e-08, "loss": 0.6737, "reward": 1.0375000298023225, "reward_std": 0.4624008506536484, "rewards/accuracy_reward": 0.22083333916962147, "rewards/format_reward": 0.8166666805744172, "step": 1580 }, { "completion_length": 238.8541702270508, "epoch": 0.8453333333333334, "grad_norm": 7.903083801269531, "kl": 2.119140625, "learning_rate": 7.115813524622488e-08, "loss": 0.7548, "reward": 1.0375000417232514, "reward_std": 0.6012955874204635, "rewards/accuracy_reward": 0.28750000707805157, "rewards/format_reward": 0.7500000178813935, "step": 1585 }, { "completion_length": 213.15834274291993, "epoch": 0.848, "grad_norm": 4.728046417236328, "kl": 1.553515625, "learning_rate": 6.878295902751319e-08, "loss": 0.6544, "reward": 1.1208333492279052, "reward_std": 0.42975625991821287, "rewards/accuracy_reward": 0.3041666716337204, "rewards/format_reward": 0.8166666865348816, "step": 1590 }, { "completion_length": 180.77500534057617, "epoch": 0.8506666666666667, "grad_norm": 77.55509948730469, "kl": 1.58125, "learning_rate": 6.644516823760437e-08, "loss": 0.5961, "reward": 1.091666692495346, "reward_std": 0.32875102311372756, "rewards/accuracy_reward": 0.2166666690260172, "rewards/format_reward": 0.8750000238418579, "step": 1595 }, { "completion_length": 214.39167404174805, "epoch": 0.8533333333333334, "grad_norm": 25.89487075805664, "kl": 1.84609375, "learning_rate": 6.414496555705801e-08, "loss": 0.6698, "reward": 1.1166666805744172, "reward_std": 0.5117101609706879, "rewards/accuracy_reward": 0.34166666977107524, "rewards/format_reward": 0.7750000178813934, "step": 1600 }, { "epoch": 0.8533333333333334, "eval_completion_length": 246.72556213378905, "eval_kl": 2.08859375, "eval_loss": 0.8056277632713318, "eval_reward": 0.9472222503026326, "eval_reward_std": 0.5439748798807462, "eval_rewards/accuracy_reward": 0.17055555924773216, "eval_rewards/format_reward": 0.7766666837533315, "eval_runtime": 810.1675, "eval_samples_per_second": 0.37, "eval_steps_per_second": 0.016, "step": 1600 }, { "completion_length": 281.73750762939454, "epoch": 0.856, "grad_norm": 19.62468719482422, "kl": 1.675390625, "learning_rate": 6.188255040763929e-08, "loss": 0.9653, "reward": 1.041666704416275, "reward_std": 0.62858667075634, "rewards/accuracy_reward": 0.3000000089406967, "rewards/format_reward": 0.7416666865348815, "step": 1605 }, { "completion_length": 188.87916946411133, "epoch": 0.8586666666666667, "grad_norm": 3.9411680698394775, "kl": 1.18125, "learning_rate": 5.965811893503015e-08, "loss": 0.6174, "reward": 1.2083333730697632, "reward_std": 0.42002698704600333, "rewards/accuracy_reward": 0.35000000819563865, "rewards/format_reward": 0.8583333551883697, "step": 1610 }, { "completion_length": 199.65000534057617, "epoch": 0.8613333333333333, "grad_norm": 11.383617401123047, "kl": 2.05234375, "learning_rate": 5.7471863991823356e-08, "loss": 0.6184, "reward": 1.1125000357627868, "reward_std": 0.4449311882257462, "rewards/accuracy_reward": 0.2875000063329935, "rewards/format_reward": 0.8250000298023223, "step": 1615 }, { "completion_length": 176.72917404174805, "epoch": 0.864, "grad_norm": 1.5305043458938599, "kl": 1.0876953125, "learning_rate": 5.532397512080306e-08, "loss": 0.6773, "reward": 1.2458333611488341, "reward_std": 0.42713540196418764, "rewards/accuracy_reward": 0.3625000111758709, "rewards/format_reward": 0.8833333492279053, "step": 1620 }, { "completion_length": 169.13750457763672, "epoch": 0.8666666666666667, "grad_norm": 5.485569000244141, "kl": 1.5263671875, "learning_rate": 5.321463853851188e-08, "loss": 0.5792, "reward": 1.1875000417232513, "reward_std": 0.39003320038318634, "rewards/accuracy_reward": 0.3125000063329935, "rewards/format_reward": 0.8750000238418579, "step": 1625 }, { "completion_length": 166.21250381469727, "epoch": 0.8693333333333333, "grad_norm": 6.497225761413574, "kl": 1.157421875, "learning_rate": 5.114403711910631e-08, "loss": 0.5003, "reward": 1.1833333849906922, "reward_std": 0.33641326874494554, "rewards/accuracy_reward": 0.2833333358168602, "rewards/format_reward": 0.9000000178813934, "step": 1630 }, { "completion_length": 203.3166717529297, "epoch": 0.872, "grad_norm": 10.349686622619629, "kl": 1.8275390625, "learning_rate": 4.911235037850186e-08, "loss": 0.7575, "reward": 1.2083333790302277, "reward_std": 0.5746691286563873, "rewards/accuracy_reward": 0.4000000149011612, "rewards/format_reward": 0.8083333551883698, "step": 1635 }, { "completion_length": 159.37083740234374, "epoch": 0.8746666666666667, "grad_norm": 3.7331645488739014, "kl": 1.871875, "learning_rate": 4.7119754458809725e-08, "loss": 0.5334, "reward": 1.1375000417232513, "reward_std": 0.37364248782396314, "rewards/accuracy_reward": 0.28750000707805157, "rewards/format_reward": 0.8500000238418579, "step": 1640 }, { "completion_length": 177.79583740234375, "epoch": 0.8773333333333333, "grad_norm": 10.393296241760254, "kl": 1.009765625, "learning_rate": 4.516642211306587e-08, "loss": 0.61, "reward": 1.112500047683716, "reward_std": 0.3519383378326893, "rewards/accuracy_reward": 0.23750000521540643, "rewards/format_reward": 0.8750000178813935, "step": 1645 }, { "completion_length": 188.83750534057617, "epoch": 0.88, "grad_norm": 9.854552268981934, "kl": 1.759375, "learning_rate": 4.325252269025315e-08, "loss": 0.6185, "reward": 1.166666704416275, "reward_std": 0.41237895712256434, "rewards/accuracy_reward": 0.32500001452863214, "rewards/format_reward": 0.8416666865348816, "step": 1650 }, { "completion_length": 232.76250457763672, "epoch": 0.8826666666666667, "grad_norm": 5.535569667816162, "kl": 1.8421875, "learning_rate": 4.137822212061964e-08, "loss": 0.7118, "reward": 1.0375000178813933, "reward_std": 0.4611209347844124, "rewards/accuracy_reward": 0.2625000048428774, "rewards/format_reward": 0.7750000178813934, "step": 1655 }, { "completion_length": 231.8791717529297, "epoch": 0.8853333333333333, "grad_norm": 5.740895748138428, "kl": 1.616796875, "learning_rate": 3.954368290129301e-08, "loss": 0.6385, "reward": 1.2250000357627868, "reward_std": 0.5405340433120728, "rewards/accuracy_reward": 0.4000000074505806, "rewards/format_reward": 0.8250000238418579, "step": 1660 }, { "completion_length": 131.25833740234376, "epoch": 0.888, "grad_norm": 5.200298309326172, "kl": 1.3203125, "learning_rate": 3.774906408219197e-08, "loss": 0.277, "reward": 1.3500000178813933, "reward_std": 0.33660581335425377, "rewards/accuracy_reward": 0.4250000149011612, "rewards/format_reward": 0.925000011920929, "step": 1665 }, { "completion_length": 202.34584426879883, "epoch": 0.8906666666666667, "grad_norm": 7.758249759674072, "kl": 1.4638671875, "learning_rate": 3.5994521252237506e-08, "loss": 0.5761, "reward": 1.1083333790302277, "reward_std": 0.38408626839518545, "rewards/accuracy_reward": 0.26666667126119137, "rewards/format_reward": 0.8416666746139526, "step": 1670 }, { "completion_length": 200.52500534057617, "epoch": 0.8933333333333333, "grad_norm": 12.366177558898926, "kl": 1.696875, "learning_rate": 3.42802065258635e-08, "loss": 0.6531, "reward": 1.0083333611488343, "reward_std": 0.46105473637580874, "rewards/accuracy_reward": 0.20000000670552254, "rewards/format_reward": 0.8083333611488343, "step": 1675 }, { "completion_length": 201.46250381469727, "epoch": 0.896, "grad_norm": 6.266597747802734, "kl": 1.599609375, "learning_rate": 3.260626852982873e-08, "loss": 0.6282, "reward": 1.0958333611488342, "reward_std": 0.43908271491527556, "rewards/accuracy_reward": 0.2541666753590107, "rewards/format_reward": 0.8416666865348816, "step": 1680 }, { "completion_length": 160.81250228881837, "epoch": 0.8986666666666666, "grad_norm": 56.750545501708984, "kl": 1.580078125, "learning_rate": 3.097285239033137e-08, "loss": 0.5331, "reward": 1.1541666984558105, "reward_std": 0.3470060914754868, "rewards/accuracy_reward": 0.26250000596046447, "rewards/format_reward": 0.8916666746139527, "step": 1685 }, { "completion_length": 213.47917556762695, "epoch": 0.9013333333333333, "grad_norm": 11.425201416015625, "kl": 2.06640625, "learning_rate": 2.93800997204271e-08, "loss": 0.7264, "reward": 1.200000035762787, "reward_std": 0.5640496462583542, "rewards/accuracy_reward": 0.40833334140479566, "rewards/format_reward": 0.791666692495346, "step": 1690 }, { "completion_length": 194.2541717529297, "epoch": 0.904, "grad_norm": 20.20305061340332, "kl": 1.2748046875, "learning_rate": 2.7828148607751235e-08, "loss": 0.6901, "reward": 1.2416667103767396, "reward_std": 0.4328203298151493, "rewards/accuracy_reward": 0.39166667461395266, "rewards/format_reward": 0.8500000238418579, "step": 1695 }, { "completion_length": 231.80000991821288, "epoch": 0.9066666666666666, "grad_norm": 20.141338348388672, "kl": 1.8451171875, "learning_rate": 2.6317133602547335e-08, "loss": 0.7004, "reward": 1.0708333730697632, "reward_std": 0.5015199676156044, "rewards/accuracy_reward": 0.2708333432674408, "rewards/format_reward": 0.8000000238418579, "step": 1700 }, { "completion_length": 226.9791732788086, "epoch": 0.9093333333333333, "grad_norm": 8.150571823120117, "kl": 1.451953125, "learning_rate": 2.4847185706001637e-08, "loss": 0.7311, "reward": 1.1750000357627868, "reward_std": 0.4346106082201004, "rewards/accuracy_reward": 0.33333334550261495, "rewards/format_reward": 0.8416666924953461, "step": 1705 }, { "completion_length": 161.9708366394043, "epoch": 0.912, "grad_norm": 7.521031856536865, "kl": 1.84453125, "learning_rate": 2.341843235888563e-08, "loss": 0.6629, "reward": 1.3250000476837158, "reward_std": 0.4102425158023834, "rewards/accuracy_reward": 0.45000000596046447, "rewards/format_reward": 0.8750000238418579, "step": 1710 }, { "completion_length": 181.10417098999022, "epoch": 0.9146666666666666, "grad_norm": 18.987491607666016, "kl": 1.7630859375, "learning_rate": 2.203099743050746e-08, "loss": 0.6879, "reward": 1.170833373069763, "reward_std": 0.42629132717847823, "rewards/accuracy_reward": 0.32083334028720856, "rewards/format_reward": 0.850000011920929, "step": 1715 }, { "completion_length": 263.47084045410156, "epoch": 0.9173333333333333, "grad_norm": 13.362593650817871, "kl": 2.15078125, "learning_rate": 2.068500120797284e-08, "loss": 0.9974, "reward": 1.125000035762787, "reward_std": 0.6081652283668518, "rewards/accuracy_reward": 0.3583333432674408, "rewards/format_reward": 0.766666692495346, "step": 1720 }, { "completion_length": 187.62083892822267, "epoch": 0.92, "grad_norm": 6.908720970153809, "kl": 1.466015625, "learning_rate": 1.9380560385756084e-08, "loss": 0.7325, "reward": 1.2166666984558105, "reward_std": 0.39846049398183825, "rewards/accuracy_reward": 0.3500000100582838, "rewards/format_reward": 0.8666666984558106, "step": 1725 }, { "completion_length": 208.1041687011719, "epoch": 0.9226666666666666, "grad_norm": 7.331279754638672, "kl": 1.965625, "learning_rate": 1.8117788055583284e-08, "loss": 0.8239, "reward": 1.1291667103767395, "reward_std": 0.5210324048995971, "rewards/accuracy_reward": 0.31250001303851604, "rewards/format_reward": 0.8166666805744172, "step": 1730 }, { "completion_length": 219.9958381652832, "epoch": 0.9253333333333333, "grad_norm": 10.382403373718262, "kl": 1.346484375, "learning_rate": 1.68967936966275e-08, "loss": 0.6091, "reward": 1.1708333671092988, "reward_std": 0.4482548341155052, "rewards/accuracy_reward": 0.3375000096857548, "rewards/format_reward": 0.8333333492279053, "step": 1735 }, { "completion_length": 180.16250228881836, "epoch": 0.928, "grad_norm": 8.021461486816406, "kl": 2.263671875, "learning_rate": 1.571768316601718e-08, "loss": 0.583, "reward": 1.0458333790302277, "reward_std": 0.4388016849756241, "rewards/accuracy_reward": 0.23750000335276128, "rewards/format_reward": 0.8083333551883698, "step": 1740 }, { "completion_length": 244.97917251586915, "epoch": 0.9306666666666666, "grad_norm": 27.44346809387207, "kl": 1.7083984375, "learning_rate": 1.4580558689658406e-08, "loss": 0.9154, "reward": 1.1708333671092988, "reward_std": 0.6085290633141994, "rewards/accuracy_reward": 0.379166679084301, "rewards/format_reward": 0.7916666865348816, "step": 1745 }, { "completion_length": 176.87917251586913, "epoch": 0.9333333333333333, "grad_norm": 21.765560150146484, "kl": 1.50078125, "learning_rate": 1.3485518853372624e-08, "loss": 0.5667, "reward": 1.2708333671092986, "reward_std": 0.44170806705951693, "rewards/accuracy_reward": 0.3958333425223827, "rewards/format_reward": 0.8750000238418579, "step": 1750 }, { "completion_length": 245.63750457763672, "epoch": 0.936, "grad_norm": 5.950416564941406, "kl": 2.523828125, "learning_rate": 1.243265859434911e-08, "loss": 0.8785, "reward": 1.1625000417232514, "reward_std": 0.6160432323813438, "rewards/accuracy_reward": 0.40416667312383653, "rewards/format_reward": 0.7583333492279053, "step": 1755 }, { "completion_length": 174.72083892822266, "epoch": 0.9386666666666666, "grad_norm": 19.66292381286621, "kl": 1.318359375, "learning_rate": 1.1422069192914219e-08, "loss": 0.516, "reward": 1.200000035762787, "reward_std": 0.37103949785232543, "rewards/accuracy_reward": 0.33333334028720857, "rewards/format_reward": 0.8666666805744171, "step": 1760 }, { "completion_length": 224.34167709350587, "epoch": 0.9413333333333334, "grad_norm": 6.988897323608398, "kl": 1.60703125, "learning_rate": 1.0453838264617709e-08, "loss": 0.8039, "reward": 0.9708333611488342, "reward_std": 0.5082953691482544, "rewards/accuracy_reward": 0.1791666727513075, "rewards/format_reward": 0.7916666865348816, "step": 1765 }, { "completion_length": 238.2375061035156, "epoch": 0.944, "grad_norm": 12.432623863220215, "kl": 1.816796875, "learning_rate": 9.528049752636714e-09, "loss": 0.6603, "reward": 1.0416666984558105, "reward_std": 0.4743375271558762, "rewards/accuracy_reward": 0.26666667573153974, "rewards/format_reward": 0.775000023841858, "step": 1770 }, { "completion_length": 169.10417251586915, "epoch": 0.9466666666666667, "grad_norm": 7.009900093078613, "kl": 1.668359375, "learning_rate": 8.644783920498e-09, "loss": 0.485, "reward": 1.200000035762787, "reward_std": 0.37502728700637816, "rewards/accuracy_reward": 0.3416666783392429, "rewards/format_reward": 0.8583333551883697, "step": 1775 }, { "completion_length": 233.9000045776367, "epoch": 0.9493333333333334, "grad_norm": 5.138508319854736, "kl": 1.686328125, "learning_rate": 7.804117345119266e-09, "loss": 0.7679, "reward": 1.1333333730697632, "reward_std": 0.5335646510124207, "rewards/accuracy_reward": 0.34166667722165583, "rewards/format_reward": 0.7916666984558105, "step": 1780 }, { "completion_length": 183.32084121704102, "epoch": 0.952, "grad_norm": 3580.284912109375, "kl": 2.233203125, "learning_rate": 7.00612291017022e-09, "loss": 0.6764, "reward": 1.1083333492279053, "reward_std": 0.3922556236386299, "rewards/accuracy_reward": 0.24166667684912682, "rewards/format_reward": 0.8666666924953461, "step": 1785 }, { "completion_length": 191.9708381652832, "epoch": 0.9546666666666667, "grad_norm": 7.294617176055908, "kl": 1.476171875, "learning_rate": 6.2508697997538665e-09, "loss": 0.5961, "reward": 1.0791666865348817, "reward_std": 0.4319505989551544, "rewards/accuracy_reward": 0.2291666753590107, "rewards/format_reward": 0.8500000238418579, "step": 1790 }, { "completion_length": 211.21250610351564, "epoch": 0.9573333333333334, "grad_norm": 4.981256484985352, "kl": 1.1861328125, "learning_rate": 5.538423492408129e-09, "loss": 0.604, "reward": 1.262500035762787, "reward_std": 0.45401586443185804, "rewards/accuracy_reward": 0.39583334848284724, "rewards/format_reward": 0.8666666805744171, "step": 1795 }, { "completion_length": 283.41251068115236, "epoch": 0.96, "grad_norm": 2.2728030681610107, "kl": 1.4833984375, "learning_rate": 4.8688457554291736e-09, "loss": 0.7529, "reward": 1.1791666984558105, "reward_std": 0.535879123210907, "rewards/accuracy_reward": 0.4041666742414236, "rewards/format_reward": 0.7750000178813934, "step": 1800 }, { "epoch": 0.96, "eval_completion_length": 212.29556121826172, "eval_kl": 1.955859375, "eval_loss": 0.6787320971488953, "eval_reward": 0.9722222558657329, "eval_reward_std": 0.4794289442896843, "eval_rewards/accuracy_reward": 0.1600000035762787, "eval_rewards/format_reward": 0.8122222447395324, "eval_runtime": 704.5393, "eval_samples_per_second": 0.426, "eval_steps_per_second": 0.018, "step": 1800 }, { "completion_length": 171.38750381469725, "epoch": 0.9626666666666667, "grad_norm": 7.666684150695801, "kl": 1.298046875, "learning_rate": 4.242194639516416e-09, "loss": 0.5453, "reward": 1.1750000238418579, "reward_std": 0.3607516996562481, "rewards/accuracy_reward": 0.30000000856816766, "rewards/format_reward": 0.8750000178813935, "step": 1805 }, { "completion_length": 182.1208396911621, "epoch": 0.9653333333333334, "grad_norm": 2.6979856491088867, "kl": 1.5603515625, "learning_rate": 3.658524473739544e-09, "loss": 0.7398, "reward": 1.1625000536441803, "reward_std": 0.40499134212732313, "rewards/accuracy_reward": 0.30416667461395264, "rewards/format_reward": 0.8583333551883697, "step": 1810 }, { "completion_length": 216.9875061035156, "epoch": 0.968, "grad_norm": 11.175540924072266, "kl": 1.25625, "learning_rate": 3.1178858608283954e-09, "loss": 0.7517, "reward": 1.0833333790302277, "reward_std": 0.4425746828317642, "rewards/accuracy_reward": 0.24166667386889457, "rewards/format_reward": 0.8416666865348816, "step": 1815 }, { "completion_length": 187.23750762939454, "epoch": 0.9706666666666667, "grad_norm": 7.220974922180176, "kl": 1.28515625, "learning_rate": 2.6203256727859167e-09, "loss": 0.7132, "reward": 1.1583333730697631, "reward_std": 0.43746666610240936, "rewards/accuracy_reward": 0.3083333432674408, "rewards/format_reward": 0.8500000178813935, "step": 1820 }, { "completion_length": 164.65833892822266, "epoch": 0.9733333333333334, "grad_norm": 10.702654838562012, "kl": 1.7052734375, "learning_rate": 2.165887046824133e-09, "loss": 0.5298, "reward": 1.1958333730697632, "reward_std": 0.32650465294718745, "rewards/accuracy_reward": 0.32916667461395266, "rewards/format_reward": 0.8666666865348815, "step": 1825 }, { "completion_length": 198.562508392334, "epoch": 0.976, "grad_norm": 6.067382335662842, "kl": 1.7404296875, "learning_rate": 1.7546093816246387e-09, "loss": 0.6923, "reward": 1.004166704416275, "reward_std": 0.3946992427110672, "rewards/accuracy_reward": 0.17083333544433116, "rewards/format_reward": 0.8333333551883697, "step": 1830 }, { "completion_length": 206.00834197998046, "epoch": 0.9786666666666667, "grad_norm": 8.223440170288086, "kl": 1.7333984375, "learning_rate": 1.3865283339228316e-09, "loss": 0.6633, "reward": 1.091666692495346, "reward_std": 0.45499495714902877, "rewards/accuracy_reward": 0.26666667237877845, "rewards/format_reward": 0.8250000178813934, "step": 1835 }, { "completion_length": 162.4708381652832, "epoch": 0.9813333333333333, "grad_norm": 6.2932610511779785, "kl": 1.397265625, "learning_rate": 1.0616758154161631e-09, "loss": 0.5335, "reward": 1.3208333730697632, "reward_std": 0.27748758494853976, "rewards/accuracy_reward": 0.4041666716337204, "rewards/format_reward": 0.9166666865348816, "step": 1840 }, { "completion_length": 165.61667251586914, "epoch": 0.984, "grad_norm": 9.122838020324707, "kl": 1.4970703125, "learning_rate": 7.80079989997906e-10, "loss": 0.6188, "reward": 1.1958333611488343, "reward_std": 0.4245347425341606, "rewards/accuracy_reward": 0.33750000409781933, "rewards/format_reward": 0.8583333551883697, "step": 1845 }, { "completion_length": 215.14167251586915, "epoch": 0.9866666666666667, "grad_norm": 23.73024559020996, "kl": 1.775, "learning_rate": 5.417652713152199e-10, "loss": 0.5543, "reward": 1.1333333730697632, "reward_std": 0.4075448580086231, "rewards/accuracy_reward": 0.31666667610406873, "rewards/format_reward": 0.8166666835546493, "step": 1850 }, { "completion_length": 185.79167404174805, "epoch": 0.9893333333333333, "grad_norm": 20.497873306274414, "kl": 1.5189453125, "learning_rate": 3.4675232065256574e-10, "loss": 0.4238, "reward": 1.0708333492279052, "reward_std": 0.32569129317998885, "rewards/accuracy_reward": 0.24583334028720855, "rewards/format_reward": 0.8250000059604645, "step": 1855 }, { "completion_length": 225.09583892822266, "epoch": 0.992, "grad_norm": 12.528355598449707, "kl": 1.2953125, "learning_rate": 1.9505804514047264e-10, "loss": 0.4901, "reward": 1.1541666984558105, "reward_std": 0.4231454662978649, "rewards/accuracy_reward": 0.32083334028720856, "rewards/format_reward": 0.8333333373069763, "step": 1860 }, { "completion_length": 221.36667175292968, "epoch": 0.9946666666666667, "grad_norm": 13.679535865783691, "kl": 1.390625, "learning_rate": 8.669559628954326e-11, "loss": 0.8532, "reward": 1.1583333730697631, "reward_std": 0.5791940867900849, "rewards/accuracy_reward": 0.35000000409781934, "rewards/format_reward": 0.8083333611488343, "step": 1865 }, { "completion_length": 158.7708381652832, "epoch": 0.9973333333333333, "grad_norm": 37.47658157348633, "kl": 1.707421875, "learning_rate": 2.1674368850643777e-11, "loss": 0.5296, "reward": 1.1458333611488343, "reward_std": 0.29994617849588395, "rewards/accuracy_reward": 0.2541666738688946, "rewards/format_reward": 0.8916666805744171, "step": 1870 }, { "completion_length": 223.59584197998046, "epoch": 1.0, "grad_norm": 7.715764999389648, "kl": 1.104296875, "learning_rate": 0.0, "loss": 0.7452, "reward": 1.1708333790302277, "reward_std": 0.4897158071398735, "rewards/accuracy_reward": 0.3375000100582838, "rewards/format_reward": 0.8333333492279053, "step": 1875 }, { "epoch": 1.0, "step": 1875, "total_flos": 0.0, "train_loss": 0.738647110915184, "train_runtime": 47500.6212, "train_samples_per_second": 0.316, "train_steps_per_second": 0.039 } ], "logging_steps": 5, "max_steps": 1875, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": false, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }