| { | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 1.0, | |
| "eval_steps": 200, | |
| "global_step": 1875, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "completion_length": 309.1416763305664, | |
| "epoch": 0.0026666666666666666, | |
| "grad_norm": 1.0915299654006958, | |
| "kl": 0.000769805908203125, | |
| "learning_rate": 2.6595744680851062e-08, | |
| "loss": 0.0572, | |
| "reward": -0.8166666805744172, | |
| "reward_std": 0.30653437227010727, | |
| "rewards/accuracy_reward": 0.07500000223517418, | |
| "rewards/format_reward": -0.8916666805744171, | |
| "step": 5 | |
| }, | |
| { | |
| "completion_length": 335.7166778564453, | |
| "epoch": 0.005333333333333333, | |
| "grad_norm": 1.6455351114273071, | |
| "kl": 0.00115509033203125, | |
| "learning_rate": 5.3191489361702123e-08, | |
| "loss": 0.0544, | |
| "reward": -0.8583333492279053, | |
| "reward_std": 0.2569814197719097, | |
| "rewards/accuracy_reward": 0.0666666679084301, | |
| "rewards/format_reward": -0.925000011920929, | |
| "step": 10 | |
| }, | |
| { | |
| "completion_length": 290.56251220703126, | |
| "epoch": 0.008, | |
| "grad_norm": 1.7345483303070068, | |
| "kl": 0.0012176513671875, | |
| "learning_rate": 7.978723404255319e-08, | |
| "loss": 0.0639, | |
| "reward": -0.8333333611488343, | |
| "reward_std": 0.4082482993602753, | |
| "rewards/accuracy_reward": 0.04166666753590107, | |
| "rewards/format_reward": -0.8750000238418579, | |
| "step": 15 | |
| }, | |
| { | |
| "completion_length": 320.79584350585935, | |
| "epoch": 0.010666666666666666, | |
| "grad_norm": 1.5033214092254639, | |
| "kl": 0.0012172698974609376, | |
| "learning_rate": 1.0638297872340425e-07, | |
| "loss": 0.0569, | |
| "reward": -0.8416666924953461, | |
| "reward_std": 0.2757193736732006, | |
| "rewards/accuracy_reward": 0.04166666753590107, | |
| "rewards/format_reward": -0.8833333492279053, | |
| "step": 20 | |
| }, | |
| { | |
| "completion_length": 341.09584655761716, | |
| "epoch": 0.013333333333333334, | |
| "grad_norm": 1.9244236946105957, | |
| "kl": 0.0012542724609375, | |
| "learning_rate": 1.329787234042553e-07, | |
| "loss": 0.0252, | |
| "reward": -0.8458333492279053, | |
| "reward_std": 0.3047572821378708, | |
| "rewards/accuracy_reward": 0.08750000149011612, | |
| "rewards/format_reward": -0.9333333492279052, | |
| "step": 25 | |
| }, | |
| { | |
| "completion_length": 298.28334350585936, | |
| "epoch": 0.016, | |
| "grad_norm": 2.07985258102417, | |
| "kl": 0.0014141082763671875, | |
| "learning_rate": 1.5957446808510638e-07, | |
| "loss": 0.0589, | |
| "reward": -0.8083333551883698, | |
| "reward_std": 0.3681695103645325, | |
| "rewards/accuracy_reward": 0.05833333432674408, | |
| "rewards/format_reward": -0.8666666924953461, | |
| "step": 30 | |
| }, | |
| { | |
| "completion_length": 327.3208374023437, | |
| "epoch": 0.018666666666666668, | |
| "grad_norm": 1.2517731189727783, | |
| "kl": 0.0022617340087890624, | |
| "learning_rate": 1.8617021276595742e-07, | |
| "loss": 0.0912, | |
| "reward": -0.854166692495346, | |
| "reward_std": 0.3303105406463146, | |
| "rewards/accuracy_reward": 0.03750000111758709, | |
| "rewards/format_reward": -0.891666692495346, | |
| "step": 35 | |
| }, | |
| { | |
| "completion_length": 352.6000122070312, | |
| "epoch": 0.021333333333333333, | |
| "grad_norm": 0.8213557004928589, | |
| "kl": 0.003079986572265625, | |
| "learning_rate": 2.127659574468085e-07, | |
| "loss": 0.0448, | |
| "reward": -0.850000011920929, | |
| "reward_std": 0.28401452749967576, | |
| "rewards/accuracy_reward": 0.05833333395421505, | |
| "rewards/format_reward": -0.9083333432674408, | |
| "step": 40 | |
| }, | |
| { | |
| "completion_length": 337.5208435058594, | |
| "epoch": 0.024, | |
| "grad_norm": 0.7038048505783081, | |
| "kl": 0.0064483642578125, | |
| "learning_rate": 2.393617021276596e-07, | |
| "loss": 0.1086, | |
| "reward": -0.8291666865348816, | |
| "reward_std": 0.320113442838192, | |
| "rewards/accuracy_reward": 0.07083333469927311, | |
| "rewards/format_reward": -0.9000000119209289, | |
| "step": 45 | |
| }, | |
| { | |
| "completion_length": 353.56251220703126, | |
| "epoch": 0.02666666666666667, | |
| "grad_norm": 1.7768754959106445, | |
| "kl": 0.011151123046875, | |
| "learning_rate": 2.659574468085106e-07, | |
| "loss": 0.0946, | |
| "reward": -0.791666692495346, | |
| "reward_std": 0.3645200379192829, | |
| "rewards/accuracy_reward": 0.05000000149011612, | |
| "rewards/format_reward": -0.8416666805744171, | |
| "step": 50 | |
| }, | |
| { | |
| "completion_length": 304.80834197998047, | |
| "epoch": 0.029333333333333333, | |
| "grad_norm": 1.8845570087432861, | |
| "kl": 0.0145263671875, | |
| "learning_rate": 2.925531914893617e-07, | |
| "loss": 0.1085, | |
| "reward": -0.825000011920929, | |
| "reward_std": 0.378238408267498, | |
| "rewards/accuracy_reward": 0.05000000149011612, | |
| "rewards/format_reward": -0.8750000119209289, | |
| "step": 55 | |
| }, | |
| { | |
| "completion_length": 300.5541717529297, | |
| "epoch": 0.032, | |
| "grad_norm": 1.3354154825210571, | |
| "kl": 0.0145751953125, | |
| "learning_rate": 3.1914893617021275e-07, | |
| "loss": 0.0812, | |
| "reward": -0.8083333492279052, | |
| "reward_std": 0.3903405636548996, | |
| "rewards/accuracy_reward": 0.09166666939854622, | |
| "rewards/format_reward": -0.900000023841858, | |
| "step": 60 | |
| }, | |
| { | |
| "completion_length": 302.21250915527344, | |
| "epoch": 0.034666666666666665, | |
| "grad_norm": 1.0147898197174072, | |
| "kl": 0.022381591796875, | |
| "learning_rate": 3.457446808510638e-07, | |
| "loss": 0.1457, | |
| "reward": -0.7458333492279052, | |
| "reward_std": 0.47493031769990923, | |
| "rewards/accuracy_reward": 0.03750000111758709, | |
| "rewards/format_reward": -0.7833333492279053, | |
| "step": 65 | |
| }, | |
| { | |
| "completion_length": 291.40417633056643, | |
| "epoch": 0.037333333333333336, | |
| "grad_norm": 2.549830675125122, | |
| "kl": 0.03157958984375, | |
| "learning_rate": 3.7234042553191484e-07, | |
| "loss": 0.1052, | |
| "reward": -0.7708333551883697, | |
| "reward_std": 0.4252162277698517, | |
| "rewards/accuracy_reward": 0.06250000037252904, | |
| "rewards/format_reward": -0.8333333551883697, | |
| "step": 70 | |
| }, | |
| { | |
| "completion_length": 315.60834350585935, | |
| "epoch": 0.04, | |
| "grad_norm": 1.0364981889724731, | |
| "kl": 0.02958984375, | |
| "learning_rate": 3.989361702127659e-07, | |
| "loss": 0.1479, | |
| "reward": -0.7458333551883698, | |
| "reward_std": 0.48624068051576613, | |
| "rewards/accuracy_reward": 0.06250000111758709, | |
| "rewards/format_reward": -0.8083333611488343, | |
| "step": 75 | |
| }, | |
| { | |
| "completion_length": 275.8916732788086, | |
| "epoch": 0.042666666666666665, | |
| "grad_norm": 1.8735177516937256, | |
| "kl": 0.03341064453125, | |
| "learning_rate": 4.25531914893617e-07, | |
| "loss": 0.1291, | |
| "reward": -0.7250000238418579, | |
| "reward_std": 0.5295502826571464, | |
| "rewards/accuracy_reward": 0.05833333395421505, | |
| "rewards/format_reward": -0.7833333551883698, | |
| "step": 80 | |
| }, | |
| { | |
| "completion_length": 278.6000091552734, | |
| "epoch": 0.04533333333333334, | |
| "grad_norm": 1.7058619260787964, | |
| "kl": 0.04388427734375, | |
| "learning_rate": 4.5212765957446806e-07, | |
| "loss": 0.1241, | |
| "reward": -0.7166666746139526, | |
| "reward_std": 0.517094686627388, | |
| "rewards/accuracy_reward": 0.05833333395421505, | |
| "rewards/format_reward": -0.7750000119209289, | |
| "step": 85 | |
| }, | |
| { | |
| "completion_length": 294.21667327880857, | |
| "epoch": 0.048, | |
| "grad_norm": 2.6495754718780518, | |
| "kl": 0.05029296875, | |
| "learning_rate": 4.787234042553192e-07, | |
| "loss": 0.1663, | |
| "reward": -0.6333333402872086, | |
| "reward_std": 0.6619763910770416, | |
| "rewards/accuracy_reward": 0.08333333507180214, | |
| "rewards/format_reward": -0.7166666865348816, | |
| "step": 90 | |
| }, | |
| { | |
| "completion_length": 290.7458435058594, | |
| "epoch": 0.050666666666666665, | |
| "grad_norm": 1.9712241888046265, | |
| "kl": 0.064501953125, | |
| "learning_rate": 5.053191489361702e-07, | |
| "loss": 0.2083, | |
| "reward": -0.5291666805744171, | |
| "reward_std": 0.7097373753786087, | |
| "rewards/accuracy_reward": 0.10416666902601719, | |
| "rewards/format_reward": -0.6333333551883698, | |
| "step": 95 | |
| }, | |
| { | |
| "completion_length": 276.9541778564453, | |
| "epoch": 0.05333333333333334, | |
| "grad_norm": 2.6062912940979004, | |
| "kl": 0.09736328125, | |
| "learning_rate": 5.319148936170212e-07, | |
| "loss": 0.1889, | |
| "reward": -0.5833333522081375, | |
| "reward_std": 0.6344460442662239, | |
| "rewards/accuracy_reward": 0.09166666977107525, | |
| "rewards/format_reward": -0.675000011920929, | |
| "step": 100 | |
| }, | |
| { | |
| "completion_length": 258.17084350585935, | |
| "epoch": 0.056, | |
| "grad_norm": 3.833866834640503, | |
| "kl": 0.1748046875, | |
| "learning_rate": 5.585106382978722e-07, | |
| "loss": 0.1782, | |
| "reward": -0.5166666775941848, | |
| "reward_std": 0.7733665883541108, | |
| "rewards/accuracy_reward": 0.11666666902601719, | |
| "rewards/format_reward": -0.6333333492279053, | |
| "step": 105 | |
| }, | |
| { | |
| "completion_length": 276.9500076293945, | |
| "epoch": 0.058666666666666666, | |
| "grad_norm": 2.573361873626709, | |
| "kl": 0.1017578125, | |
| "learning_rate": 5.851063829787234e-07, | |
| "loss": 0.1876, | |
| "reward": -0.49583334624767306, | |
| "reward_std": 0.7565078109502792, | |
| "rewards/accuracy_reward": 0.12083333656191826, | |
| "rewards/format_reward": -0.6166666924953461, | |
| "step": 110 | |
| }, | |
| { | |
| "completion_length": 276.2250061035156, | |
| "epoch": 0.06133333333333333, | |
| "grad_norm": 1.7497327327728271, | |
| "kl": 0.075927734375, | |
| "learning_rate": 6.117021276595744e-07, | |
| "loss": 0.1169, | |
| "reward": -0.6041666865348816, | |
| "reward_std": 0.6442232474684715, | |
| "rewards/accuracy_reward": 0.10416666828095913, | |
| "rewards/format_reward": -0.7083333551883697, | |
| "step": 115 | |
| }, | |
| { | |
| "completion_length": 258.6750091552734, | |
| "epoch": 0.064, | |
| "grad_norm": 2.181297779083252, | |
| "kl": 0.081201171875, | |
| "learning_rate": 6.382978723404255e-07, | |
| "loss": 0.1729, | |
| "reward": -0.5708333432674408, | |
| "reward_std": 0.7166081488132476, | |
| "rewards/accuracy_reward": 0.06250000186264515, | |
| "rewards/format_reward": -0.6333333522081375, | |
| "step": 120 | |
| }, | |
| { | |
| "completion_length": 268.40834045410156, | |
| "epoch": 0.06666666666666667, | |
| "grad_norm": 4.303333759307861, | |
| "kl": 0.133251953125, | |
| "learning_rate": 6.648936170212765e-07, | |
| "loss": 0.2288, | |
| "reward": -0.5125000178813934, | |
| "reward_std": 0.7744767606258393, | |
| "rewards/accuracy_reward": 0.07083333544433117, | |
| "rewards/format_reward": -0.5833333492279053, | |
| "step": 125 | |
| }, | |
| { | |
| "completion_length": 299.7708435058594, | |
| "epoch": 0.06933333333333333, | |
| "grad_norm": 4.013002872467041, | |
| "kl": 0.169775390625, | |
| "learning_rate": 6.914893617021277e-07, | |
| "loss": 0.318, | |
| "reward": -0.3000000104308128, | |
| "reward_std": 0.8895713210105896, | |
| "rewards/accuracy_reward": 0.08333333507180214, | |
| "rewards/format_reward": -0.3833333447575569, | |
| "step": 130 | |
| }, | |
| { | |
| "completion_length": 235.02500915527344, | |
| "epoch": 0.072, | |
| "grad_norm": 2.6780853271484375, | |
| "kl": 0.173779296875, | |
| "learning_rate": 7.180851063829787e-07, | |
| "loss": 0.1926, | |
| "reward": -0.35416668057441714, | |
| "reward_std": 0.7739700466394425, | |
| "rewards/accuracy_reward": 0.11250000409781932, | |
| "rewards/format_reward": -0.46666668355464935, | |
| "step": 135 | |
| }, | |
| { | |
| "completion_length": 257.54584045410155, | |
| "epoch": 0.07466666666666667, | |
| "grad_norm": 1.999470829963684, | |
| "kl": 0.117333984375, | |
| "learning_rate": 7.446808510638297e-07, | |
| "loss": 0.2742, | |
| "reward": -0.23750000968575477, | |
| "reward_std": 0.8838598787784576, | |
| "rewards/accuracy_reward": 0.10416666977107525, | |
| "rewards/format_reward": -0.3416666746139526, | |
| "step": 140 | |
| }, | |
| { | |
| "completion_length": 246.15000610351564, | |
| "epoch": 0.07733333333333334, | |
| "grad_norm": 3.609872579574585, | |
| "kl": 0.20576171875, | |
| "learning_rate": 7.712765957446808e-07, | |
| "loss": 0.2398, | |
| "reward": -0.3708333432674408, | |
| "reward_std": 0.879425299167633, | |
| "rewards/accuracy_reward": 0.11250000149011612, | |
| "rewards/format_reward": -0.4833333492279053, | |
| "step": 145 | |
| }, | |
| { | |
| "completion_length": 259.1625091552734, | |
| "epoch": 0.08, | |
| "grad_norm": 4.464442729949951, | |
| "kl": 0.21318359375, | |
| "learning_rate": 7.978723404255318e-07, | |
| "loss": 0.2627, | |
| "reward": -0.1833333384245634, | |
| "reward_std": 0.9505029916763306, | |
| "rewards/accuracy_reward": 0.12500000260770322, | |
| "rewards/format_reward": -0.3083333432674408, | |
| "step": 150 | |
| }, | |
| { | |
| "completion_length": 236.57083740234376, | |
| "epoch": 0.08266666666666667, | |
| "grad_norm": 4.6332550048828125, | |
| "kl": 0.216796875, | |
| "learning_rate": 8.24468085106383e-07, | |
| "loss": 0.2414, | |
| "reward": -0.23333333767950534, | |
| "reward_std": 0.9220583379268646, | |
| "rewards/accuracy_reward": 0.13333333544433118, | |
| "rewards/format_reward": -0.3666666798293591, | |
| "step": 155 | |
| }, | |
| { | |
| "completion_length": 223.12500610351563, | |
| "epoch": 0.08533333333333333, | |
| "grad_norm": 2.853530168533325, | |
| "kl": 0.240625, | |
| "learning_rate": 8.51063829787234e-07, | |
| "loss": 0.2636, | |
| "reward": -0.2166666705161333, | |
| "reward_std": 0.9433093965053558, | |
| "rewards/accuracy_reward": 0.10833333730697632, | |
| "rewards/format_reward": -0.32500001341104506, | |
| "step": 160 | |
| }, | |
| { | |
| "completion_length": 187.4041717529297, | |
| "epoch": 0.088, | |
| "grad_norm": 3.2459123134613037, | |
| "kl": 0.2255859375, | |
| "learning_rate": 8.77659574468085e-07, | |
| "loss": 0.2653, | |
| "reward": -0.13750000298023224, | |
| "reward_std": 0.9205778002738952, | |
| "rewards/accuracy_reward": 0.1291666690260172, | |
| "rewards/format_reward": -0.2666666805744171, | |
| "step": 165 | |
| }, | |
| { | |
| "completion_length": 199.13333892822266, | |
| "epoch": 0.09066666666666667, | |
| "grad_norm": 2.3914144039154053, | |
| "kl": 0.2478515625, | |
| "learning_rate": 9.042553191489361e-07, | |
| "loss": 0.242, | |
| "reward": -0.2500000067055225, | |
| "reward_std": 0.9306629121303558, | |
| "rewards/accuracy_reward": 0.05833333432674408, | |
| "rewards/format_reward": -0.30833333879709246, | |
| "step": 170 | |
| }, | |
| { | |
| "completion_length": 194.1500045776367, | |
| "epoch": 0.09333333333333334, | |
| "grad_norm": 7.140366554260254, | |
| "kl": 0.336328125, | |
| "learning_rate": 9.308510638297871e-07, | |
| "loss": 0.2465, | |
| "reward": 0.11250000335276127, | |
| "reward_std": 0.9915949404239655, | |
| "rewards/accuracy_reward": 0.2041666742414236, | |
| "rewards/format_reward": -0.09166667088866234, | |
| "step": 175 | |
| }, | |
| { | |
| "completion_length": 177.92917022705078, | |
| "epoch": 0.096, | |
| "grad_norm": 3.311171293258667, | |
| "kl": 0.31455078125, | |
| "learning_rate": 9.574468085106384e-07, | |
| "loss": 0.2429, | |
| "reward": -0.02083333097398281, | |
| "reward_std": 0.9874655485153199, | |
| "rewards/accuracy_reward": 0.1458333358168602, | |
| "rewards/format_reward": -0.1666666716337204, | |
| "step": 180 | |
| }, | |
| { | |
| "completion_length": 204.3000045776367, | |
| "epoch": 0.09866666666666667, | |
| "grad_norm": 15.891555786132812, | |
| "kl": 0.29267578125, | |
| "learning_rate": 9.840425531914893e-07, | |
| "loss": 0.2048, | |
| "reward": -0.04166666828095913, | |
| "reward_std": 0.999242752790451, | |
| "rewards/accuracy_reward": 0.16666667237877847, | |
| "rewards/format_reward": -0.20833333879709243, | |
| "step": 185 | |
| }, | |
| { | |
| "completion_length": 169.1041702270508, | |
| "epoch": 0.10133333333333333, | |
| "grad_norm": 3.4694690704345703, | |
| "kl": 0.36845703125, | |
| "learning_rate": 9.999965320799375e-07, | |
| "loss": 0.2537, | |
| "reward": 0.29166667312383654, | |
| "reward_std": 0.9786251783370972, | |
| "rewards/accuracy_reward": 0.19166667237877846, | |
| "rewards/format_reward": 0.10000000670552253, | |
| "step": 190 | |
| }, | |
| { | |
| "completion_length": 155.45833740234374, | |
| "epoch": 0.104, | |
| "grad_norm": 4.827131271362305, | |
| "kl": 0.4091796875, | |
| "learning_rate": 9.999575185316993e-07, | |
| "loss": 0.253, | |
| "reward": 0.21666667386889457, | |
| "reward_std": 0.9847019255161286, | |
| "rewards/accuracy_reward": 0.12500000335276126, | |
| "rewards/format_reward": 0.09166666865348816, | |
| "step": 195 | |
| }, | |
| { | |
| "completion_length": 153.3083366394043, | |
| "epoch": 0.10666666666666667, | |
| "grad_norm": 6.091984272003174, | |
| "kl": 0.4533203125, | |
| "learning_rate": 9.998751599287957e-07, | |
| "loss": 0.2823, | |
| "reward": 0.37916667610406873, | |
| "reward_std": 1.0222100555896758, | |
| "rewards/accuracy_reward": 0.16250000521540642, | |
| "rewards/format_reward": 0.21666667237877846, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.10666666666666667, | |
| "eval_completion_length": 163.56056142171224, | |
| "eval_kl": 0.3924609375, | |
| "eval_loss": 0.2787472605705261, | |
| "eval_reward": 0.34055556610226634, | |
| "eval_reward_std": 0.9718689926465353, | |
| "eval_rewards/accuracy_reward": 0.09500000228484472, | |
| "eval_rewards/format_reward": 0.24555556188027064, | |
| "eval_runtime": 368.0531, | |
| "eval_samples_per_second": 0.815, | |
| "eval_steps_per_second": 0.035, | |
| "step": 200 | |
| }, | |
| { | |
| "completion_length": 155.39583740234374, | |
| "epoch": 0.10933333333333334, | |
| "grad_norm": 4.420968532562256, | |
| "kl": 0.378125, | |
| "learning_rate": 9.9974946341151e-07, | |
| "loss": 0.2678, | |
| "reward": 0.42916667759418486, | |
| "reward_std": 1.0038800358772277, | |
| "rewards/accuracy_reward": 0.17916666977107526, | |
| "rewards/format_reward": 0.25000000819563867, | |
| "step": 205 | |
| }, | |
| { | |
| "completion_length": 143.2500030517578, | |
| "epoch": 0.112, | |
| "grad_norm": 40.516544342041016, | |
| "kl": 0.4390625, | |
| "learning_rate": 9.995804398774126e-07, | |
| "loss": 0.303, | |
| "reward": 0.6583333551883698, | |
| "reward_std": 0.892980021238327, | |
| "rewards/accuracy_reward": 0.20000000819563865, | |
| "rewards/format_reward": 0.45833334028720857, | |
| "step": 210 | |
| }, | |
| { | |
| "completion_length": 129.4250045776367, | |
| "epoch": 0.11466666666666667, | |
| "grad_norm": 3.4134747982025146, | |
| "kl": 0.75546875, | |
| "learning_rate": 9.993681039804173e-07, | |
| "loss": 0.2308, | |
| "reward": 0.6916666924953461, | |
| "reward_std": 0.7836884081363678, | |
| "rewards/accuracy_reward": 0.16666667200624943, | |
| "rewards/format_reward": 0.5250000201165677, | |
| "step": 215 | |
| }, | |
| { | |
| "completion_length": 127.5083381652832, | |
| "epoch": 0.11733333333333333, | |
| "grad_norm": 4.250089645385742, | |
| "kl": 0.4599609375, | |
| "learning_rate": 9.991124741295105e-07, | |
| "loss": 0.2392, | |
| "reward": 0.6500000178813934, | |
| "reward_std": 0.790798443555832, | |
| "rewards/accuracy_reward": 0.10000000298023223, | |
| "rewards/format_reward": 0.5500000208616257, | |
| "step": 220 | |
| }, | |
| { | |
| "completion_length": 142.82500534057618, | |
| "epoch": 0.12, | |
| "grad_norm": 3.440370798110962, | |
| "kl": 0.3755859375, | |
| "learning_rate": 9.988135724871545e-07, | |
| "loss": 0.2241, | |
| "reward": 0.6125000149011612, | |
| "reward_std": 0.9097375214099884, | |
| "rewards/accuracy_reward": 0.17083333767950534, | |
| "rewards/format_reward": 0.4416666775941849, | |
| "step": 225 | |
| }, | |
| { | |
| "completion_length": 142.2708381652832, | |
| "epoch": 0.12266666666666666, | |
| "grad_norm": 3.0582029819488525, | |
| "kl": 0.4955078125, | |
| "learning_rate": 9.984714249673673e-07, | |
| "loss": 0.2703, | |
| "reward": 0.7000000238418579, | |
| "reward_std": 0.9109564527869225, | |
| "rewards/accuracy_reward": 0.22500000149011612, | |
| "rewards/format_reward": 0.47500001490116117, | |
| "step": 230 | |
| }, | |
| { | |
| "completion_length": 133.31250381469727, | |
| "epoch": 0.12533333333333332, | |
| "grad_norm": 4.252659797668457, | |
| "kl": 0.50390625, | |
| "learning_rate": 9.98086061233475e-07, | |
| "loss": 0.312, | |
| "reward": 0.8500000298023224, | |
| "reward_std": 0.840288233757019, | |
| "rewards/accuracy_reward": 0.2833333402872086, | |
| "rewards/format_reward": 0.5666666805744172, | |
| "step": 235 | |
| }, | |
| { | |
| "completion_length": 118.69167022705078, | |
| "epoch": 0.128, | |
| "grad_norm": 82.35729217529297, | |
| "kl": 1.1455078125, | |
| "learning_rate": 9.97657514695541e-07, | |
| "loss": 0.2646, | |
| "reward": 0.9041666984558105, | |
| "reward_std": 0.7261348217725754, | |
| "rewards/accuracy_reward": 0.22083334028720855, | |
| "rewards/format_reward": 0.6833333551883698, | |
| "step": 240 | |
| }, | |
| { | |
| "completion_length": 123.26250381469727, | |
| "epoch": 0.13066666666666665, | |
| "grad_norm": 3.8186757564544678, | |
| "kl": 0.5734375, | |
| "learning_rate": 9.971858225074672e-07, | |
| "loss": 0.2327, | |
| "reward": 0.8333333790302276, | |
| "reward_std": 0.7271113842725754, | |
| "rewards/accuracy_reward": 0.19166667014360428, | |
| "rewards/format_reward": 0.6416666895151139, | |
| "step": 245 | |
| }, | |
| { | |
| "completion_length": 139.0875045776367, | |
| "epoch": 0.13333333333333333, | |
| "grad_norm": 4.611005783081055, | |
| "kl": 0.55546875, | |
| "learning_rate": 9.966710255637762e-07, | |
| "loss": 0.3656, | |
| "reward": 0.7083333514630794, | |
| "reward_std": 0.7690498679876328, | |
| "rewards/accuracy_reward": 0.12500000298023223, | |
| "rewards/format_reward": 0.5833333611488343, | |
| "step": 250 | |
| }, | |
| { | |
| "completion_length": 131.25000610351563, | |
| "epoch": 0.136, | |
| "grad_norm": 5.000573635101318, | |
| "kl": 0.6904296875, | |
| "learning_rate": 9.961131684960634e-07, | |
| "loss": 0.3735, | |
| "reward": 0.7583333611488342, | |
| "reward_std": 0.8809190809726715, | |
| "rewards/accuracy_reward": 0.21666667200624942, | |
| "rewards/format_reward": 0.5416666835546493, | |
| "step": 255 | |
| }, | |
| { | |
| "completion_length": 130.9083381652832, | |
| "epoch": 0.13866666666666666, | |
| "grad_norm": 10.388055801391602, | |
| "kl": 0.87734375, | |
| "learning_rate": 9.955122996691277e-07, | |
| "loss": 0.4113, | |
| "reward": 0.8041666835546494, | |
| "reward_std": 0.8410302340984345, | |
| "rewards/accuracy_reward": 0.2125000074505806, | |
| "rewards/format_reward": 0.5916666954755783, | |
| "step": 260 | |
| }, | |
| { | |
| "completion_length": 128.3708366394043, | |
| "epoch": 0.14133333333333334, | |
| "grad_norm": 5.107520580291748, | |
| "kl": 0.790234375, | |
| "learning_rate": 9.948684711767799e-07, | |
| "loss": 0.3787, | |
| "reward": 0.8416666924953461, | |
| "reward_std": 0.6999663650989533, | |
| "rewards/accuracy_reward": 0.18333333507180213, | |
| "rewards/format_reward": 0.6583333522081375, | |
| "step": 265 | |
| }, | |
| { | |
| "completion_length": 163.54583816528321, | |
| "epoch": 0.144, | |
| "grad_norm": 7.847362995147705, | |
| "kl": 0.6859375, | |
| "learning_rate": 9.941817388373247e-07, | |
| "loss": 0.486, | |
| "reward": 0.9250000238418579, | |
| "reward_std": 0.6243289351463318, | |
| "rewards/accuracy_reward": 0.1916666693985462, | |
| "rewards/format_reward": 0.7333333551883697, | |
| "step": 270 | |
| }, | |
| { | |
| "completion_length": 222.45000915527345, | |
| "epoch": 0.14666666666666667, | |
| "grad_norm": 4.114348411560059, | |
| "kl": 0.951953125, | |
| "learning_rate": 9.934521621887221e-07, | |
| "loss": 0.5765, | |
| "reward": 0.825000011920929, | |
| "reward_std": 0.70595743060112, | |
| "rewards/accuracy_reward": 0.18333333767950535, | |
| "rewards/format_reward": 0.6416666865348816, | |
| "step": 275 | |
| }, | |
| { | |
| "completion_length": 270.7250061035156, | |
| "epoch": 0.14933333333333335, | |
| "grad_norm": 7.752039432525635, | |
| "kl": 1.58359375, | |
| "learning_rate": 9.926798044834259e-07, | |
| "loss": 0.8842, | |
| "reward": 0.7625000387430191, | |
| "reward_std": 0.8192247807979584, | |
| "rewards/accuracy_reward": 0.1541666727513075, | |
| "rewards/format_reward": 0.608333346247673, | |
| "step": 280 | |
| }, | |
| { | |
| "completion_length": 286.2875045776367, | |
| "epoch": 0.152, | |
| "grad_norm": 20.332632064819336, | |
| "kl": 1.559765625, | |
| "learning_rate": 9.91864732682899e-07, | |
| "loss": 0.7816, | |
| "reward": 0.6875000238418579, | |
| "reward_std": 0.8882942378520966, | |
| "rewards/accuracy_reward": 0.19583333805203437, | |
| "rewards/format_reward": 0.49166667759418486, | |
| "step": 285 | |
| }, | |
| { | |
| "completion_length": 321.8916748046875, | |
| "epoch": 0.15466666666666667, | |
| "grad_norm": 70.37603759765625, | |
| "kl": 2.5359375, | |
| "learning_rate": 9.910070174518091e-07, | |
| "loss": 0.8462, | |
| "reward": 0.7041666865348816, | |
| "reward_std": 0.9259481251239776, | |
| "rewards/accuracy_reward": 0.1875000037252903, | |
| "rewards/format_reward": 0.5166666805744171, | |
| "step": 290 | |
| }, | |
| { | |
| "completion_length": 311.6041748046875, | |
| "epoch": 0.15733333333333333, | |
| "grad_norm": 78.9567642211914, | |
| "kl": 3.86171875, | |
| "learning_rate": 9.90106733151901e-07, | |
| "loss": 0.9557, | |
| "reward": 0.5708333492279053, | |
| "reward_std": 0.9381726026535034, | |
| "rewards/accuracy_reward": 0.13750000149011612, | |
| "rewards/format_reward": 0.4333333447575569, | |
| "step": 295 | |
| }, | |
| { | |
| "completion_length": 250.3291778564453, | |
| "epoch": 0.16, | |
| "grad_norm": 138.24191284179688, | |
| "kl": 8.51640625, | |
| "learning_rate": 9.89163957835551e-07, | |
| "loss": 1.2961, | |
| "reward": 0.6208333462476731, | |
| "reward_std": 0.909997683763504, | |
| "rewards/accuracy_reward": 0.1541666727513075, | |
| "rewards/format_reward": 0.46666667610406876, | |
| "step": 300 | |
| }, | |
| { | |
| "completion_length": 215.75833740234376, | |
| "epoch": 0.16266666666666665, | |
| "grad_norm": 126.323974609375, | |
| "kl": 3.709375, | |
| "learning_rate": 9.881787732389985e-07, | |
| "loss": 0.9302, | |
| "reward": 0.6666666895151139, | |
| "reward_std": 0.8827720135450363, | |
| "rewards/accuracy_reward": 0.16666667088866233, | |
| "rewards/format_reward": 0.500000013411045, | |
| "step": 305 | |
| }, | |
| { | |
| "completion_length": 180.2916732788086, | |
| "epoch": 0.16533333333333333, | |
| "grad_norm": 36.90127944946289, | |
| "kl": 3.61640625, | |
| "learning_rate": 9.871512647752612e-07, | |
| "loss": 0.8254, | |
| "reward": 0.5916666835546494, | |
| "reward_std": 0.9250853776931762, | |
| "rewards/accuracy_reward": 0.11666666902601719, | |
| "rewards/format_reward": 0.47500001192092894, | |
| "step": 310 | |
| }, | |
| { | |
| "completion_length": 159.34167404174804, | |
| "epoch": 0.168, | |
| "grad_norm": 50.75526428222656, | |
| "kl": 5.4921875, | |
| "learning_rate": 9.860815215267287e-07, | |
| "loss": 0.8761, | |
| "reward": 0.704166692495346, | |
| "reward_std": 0.9947333693504333, | |
| "rewards/accuracy_reward": 0.21250000447034836, | |
| "rewards/format_reward": 0.491666679084301, | |
| "step": 315 | |
| }, | |
| { | |
| "completion_length": 171.51667327880858, | |
| "epoch": 0.17066666666666666, | |
| "grad_norm": 50.245628356933594, | |
| "kl": 4.3265625, | |
| "learning_rate": 9.849696362374397e-07, | |
| "loss": 0.8211, | |
| "reward": 0.49166668951511383, | |
| "reward_std": 0.8819661140441895, | |
| "rewards/accuracy_reward": 0.1000000037252903, | |
| "rewards/format_reward": 0.3916666813194752, | |
| "step": 320 | |
| }, | |
| { | |
| "completion_length": 130.46667098999023, | |
| "epoch": 0.17333333333333334, | |
| "grad_norm": 25.52703285217285, | |
| "kl": 4.159375, | |
| "learning_rate": 9.838157053050423e-07, | |
| "loss": 0.6859, | |
| "reward": 0.8916666805744171, | |
| "reward_std": 0.7892452508211136, | |
| "rewards/accuracy_reward": 0.21666667312383653, | |
| "rewards/format_reward": 0.6750000208616257, | |
| "step": 325 | |
| }, | |
| { | |
| "completion_length": 171.3375015258789, | |
| "epoch": 0.176, | |
| "grad_norm": 38.008914947509766, | |
| "kl": 4.453125, | |
| "learning_rate": 9.826198287724346e-07, | |
| "loss": 0.8512, | |
| "reward": 0.8041666924953461, | |
| "reward_std": 0.8764552354812623, | |
| "rewards/accuracy_reward": 0.26250000633299353, | |
| "rewards/format_reward": 0.5416666954755783, | |
| "step": 330 | |
| }, | |
| { | |
| "completion_length": 164.07500610351562, | |
| "epoch": 0.17866666666666667, | |
| "grad_norm": 128.8215789794922, | |
| "kl": 5.5421875, | |
| "learning_rate": 9.813821103190931e-07, | |
| "loss": 0.9175, | |
| "reward": 0.8166666984558105, | |
| "reward_std": 0.8608273565769196, | |
| "rewards/accuracy_reward": 0.25833333991467955, | |
| "rewards/format_reward": 0.5583333522081375, | |
| "step": 335 | |
| }, | |
| { | |
| "completion_length": 148.26666946411132, | |
| "epoch": 0.18133333333333335, | |
| "grad_norm": 46.26949691772461, | |
| "kl": 3.05859375, | |
| "learning_rate": 9.80102657252083e-07, | |
| "loss": 0.6734, | |
| "reward": 0.8250000238418579, | |
| "reward_std": 0.7950194001197814, | |
| "rewards/accuracy_reward": 0.21666667349636554, | |
| "rewards/format_reward": 0.6083333492279053, | |
| "step": 340 | |
| }, | |
| { | |
| "completion_length": 157.37917098999023, | |
| "epoch": 0.184, | |
| "grad_norm": 20.64828109741211, | |
| "kl": 4.228125, | |
| "learning_rate": 9.787815804967551e-07, | |
| "loss": 0.9426, | |
| "reward": 0.9125000298023224, | |
| "reward_std": 0.7014284431934357, | |
| "rewards/accuracy_reward": 0.22083334252238274, | |
| "rewards/format_reward": 0.6916666805744172, | |
| "step": 345 | |
| }, | |
| { | |
| "completion_length": 169.56667251586913, | |
| "epoch": 0.18666666666666668, | |
| "grad_norm": 57.10884094238281, | |
| "kl": 2.883203125, | |
| "learning_rate": 9.774189945871288e-07, | |
| "loss": 0.7312, | |
| "reward": 0.7875000238418579, | |
| "reward_std": 0.6836557567119599, | |
| "rewards/accuracy_reward": 0.16250000484287738, | |
| "rewards/format_reward": 0.6250000178813935, | |
| "step": 350 | |
| }, | |
| { | |
| "completion_length": 162.88750457763672, | |
| "epoch": 0.18933333333333333, | |
| "grad_norm": 33.1915397644043, | |
| "kl": 3.08203125, | |
| "learning_rate": 9.760150176559624e-07, | |
| "loss": 0.6298, | |
| "reward": 0.9041666865348816, | |
| "reward_std": 0.7536427795886993, | |
| "rewards/accuracy_reward": 0.24583334252238273, | |
| "rewards/format_reward": 0.658333358168602, | |
| "step": 355 | |
| }, | |
| { | |
| "completion_length": 127.81666870117188, | |
| "epoch": 0.192, | |
| "grad_norm": 28.192670822143555, | |
| "kl": 7.875, | |
| "learning_rate": 9.745697714245118e-07, | |
| "loss": 1.1418, | |
| "reward": 1.066666692495346, | |
| "reward_std": 0.6796198636293411, | |
| "rewards/accuracy_reward": 0.3333333428949118, | |
| "rewards/format_reward": 0.7333333551883697, | |
| "step": 360 | |
| }, | |
| { | |
| "completion_length": 126.42917175292969, | |
| "epoch": 0.19466666666666665, | |
| "grad_norm": 19.537328720092773, | |
| "kl": 1.71875, | |
| "learning_rate": 9.730833811919762e-07, | |
| "loss": 0.5002, | |
| "reward": 0.9875000238418579, | |
| "reward_std": 0.6549044132232666, | |
| "rewards/accuracy_reward": 0.2541666716337204, | |
| "rewards/format_reward": 0.7333333551883697, | |
| "step": 365 | |
| }, | |
| { | |
| "completion_length": 129.67500534057618, | |
| "epoch": 0.19733333333333333, | |
| "grad_norm": 35.49064636230469, | |
| "kl": 3.1390625, | |
| "learning_rate": 9.715559758246361e-07, | |
| "loss": 0.5476, | |
| "reward": 1.041666704416275, | |
| "reward_std": 0.6668142318725586, | |
| "rewards/accuracy_reward": 0.2916666753590107, | |
| "rewards/format_reward": 0.7500000178813935, | |
| "step": 370 | |
| }, | |
| { | |
| "completion_length": 124.04166870117187, | |
| "epoch": 0.2, | |
| "grad_norm": 16.886371612548828, | |
| "kl": 4.10625, | |
| "learning_rate": 9.699876877446812e-07, | |
| "loss": 0.6237, | |
| "reward": 0.9458333551883698, | |
| "reward_std": 0.6157839864492416, | |
| "rewards/accuracy_reward": 0.1958333395421505, | |
| "rewards/format_reward": 0.7500000238418579, | |
| "step": 375 | |
| }, | |
| { | |
| "completion_length": 128.82083740234376, | |
| "epoch": 0.20266666666666666, | |
| "grad_norm": 60.52346420288086, | |
| "kl": 1.719140625, | |
| "learning_rate": 9.683786529187285e-07, | |
| "loss": 0.4091, | |
| "reward": 0.9833333730697632, | |
| "reward_std": 0.5964143082499505, | |
| "rewards/accuracy_reward": 0.25833334401249886, | |
| "rewards/format_reward": 0.7250000178813935, | |
| "step": 380 | |
| }, | |
| { | |
| "completion_length": 130.2500030517578, | |
| "epoch": 0.20533333333333334, | |
| "grad_norm": 8.501641273498535, | |
| "kl": 2.725, | |
| "learning_rate": 9.667290108460353e-07, | |
| "loss": 0.4553, | |
| "reward": 0.9625000298023224, | |
| "reward_std": 0.6627969831228256, | |
| "rewards/accuracy_reward": 0.25416667088866235, | |
| "rewards/format_reward": 0.7083333551883697, | |
| "step": 385 | |
| }, | |
| { | |
| "completion_length": 151.49167251586914, | |
| "epoch": 0.208, | |
| "grad_norm": 9.237239837646484, | |
| "kl": 3.03125, | |
| "learning_rate": 9.650389045464044e-07, | |
| "loss": 0.5862, | |
| "reward": 1.0083333671092987, | |
| "reward_std": 0.6827763438224792, | |
| "rewards/accuracy_reward": 0.28333334177732467, | |
| "rewards/format_reward": 0.725000011920929, | |
| "step": 390 | |
| }, | |
| { | |
| "completion_length": 157.08750381469727, | |
| "epoch": 0.21066666666666667, | |
| "grad_norm": 21.65406036376953, | |
| "kl": 2.93125, | |
| "learning_rate": 9.633084805477855e-07, | |
| "loss": 0.7111, | |
| "reward": 1.0791666984558106, | |
| "reward_std": 0.6408874064683914, | |
| "rewards/accuracy_reward": 0.3125000070780516, | |
| "rewards/format_reward": 0.766666692495346, | |
| "step": 395 | |
| }, | |
| { | |
| "completion_length": 177.63750381469725, | |
| "epoch": 0.21333333333333335, | |
| "grad_norm": 18.198787689208984, | |
| "kl": 3.2578125, | |
| "learning_rate": 9.615378888735705e-07, | |
| "loss": 0.6602, | |
| "reward": 0.9416666984558105, | |
| "reward_std": 0.7165269427001476, | |
| "rewards/accuracy_reward": 0.2583333391696215, | |
| "rewards/format_reward": 0.6833333492279052, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.21333333333333335, | |
| "eval_completion_length": 176.4338934326172, | |
| "eval_kl": 3.930625, | |
| "eval_loss": 0.726865291595459, | |
| "eval_reward": 0.8177778057257334, | |
| "eval_reward_std": 0.6464416084686915, | |
| "eval_rewards/accuracy_reward": 0.125555559694767, | |
| "eval_rewards/format_reward": 0.6922222431500753, | |
| "eval_runtime": 651.5271, | |
| "eval_samples_per_second": 0.46, | |
| "eval_steps_per_second": 0.02, | |
| "step": 400 | |
| }, | |
| { | |
| "completion_length": 156.30417175292968, | |
| "epoch": 0.216, | |
| "grad_norm": 9.428586959838867, | |
| "kl": 3.2203125, | |
| "learning_rate": 9.597272830295876e-07, | |
| "loss": 0.5783, | |
| "reward": 0.9208333551883697, | |
| "reward_std": 0.6943224638700485, | |
| "rewards/accuracy_reward": 0.22083334028720855, | |
| "rewards/format_reward": 0.7000000238418579, | |
| "step": 405 | |
| }, | |
| { | |
| "completion_length": 187.24167251586914, | |
| "epoch": 0.21866666666666668, | |
| "grad_norm": 8.181583404541016, | |
| "kl": 2.2078125, | |
| "learning_rate": 9.578768199907919e-07, | |
| "loss": 0.5979, | |
| "reward": 0.8416666924953461, | |
| "reward_std": 0.683533999323845, | |
| "rewards/accuracy_reward": 0.15833333805203437, | |
| "rewards/format_reward": 0.6833333522081375, | |
| "step": 410 | |
| }, | |
| { | |
| "completion_length": 196.9791732788086, | |
| "epoch": 0.22133333333333333, | |
| "grad_norm": 241.11936950683594, | |
| "kl": 5.8875, | |
| "learning_rate": 9.55986660187658e-07, | |
| "loss": 0.9332, | |
| "reward": 0.8916666984558106, | |
| "reward_std": 0.7562600076198578, | |
| "rewards/accuracy_reward": 0.2916666727513075, | |
| "rewards/format_reward": 0.6000000089406967, | |
| "step": 415 | |
| }, | |
| { | |
| "completion_length": 200.53750610351562, | |
| "epoch": 0.224, | |
| "grad_norm": 24.950624465942383, | |
| "kl": 2.9796875, | |
| "learning_rate": 9.540569674922684e-07, | |
| "loss": 0.6774, | |
| "reward": 0.9208333611488342, | |
| "reward_std": 0.7625810235738755, | |
| "rewards/accuracy_reward": 0.2625000074505806, | |
| "rewards/format_reward": 0.6583333522081375, | |
| "step": 420 | |
| }, | |
| { | |
| "completion_length": 198.76250686645508, | |
| "epoch": 0.22666666666666666, | |
| "grad_norm": 13.279343605041504, | |
| "kl": 2.8796875, | |
| "learning_rate": 9.520879092041083e-07, | |
| "loss": 0.7823, | |
| "reward": 0.8708333611488343, | |
| "reward_std": 0.6853504031896591, | |
| "rewards/accuracy_reward": 0.18750000335276126, | |
| "rewards/format_reward": 0.6833333611488343, | |
| "step": 425 | |
| }, | |
| { | |
| "completion_length": 132.6208381652832, | |
| "epoch": 0.22933333333333333, | |
| "grad_norm": 19.399991989135742, | |
| "kl": 3.008203125, | |
| "learning_rate": 9.500796560355602e-07, | |
| "loss": 0.4804, | |
| "reward": 0.9458333551883698, | |
| "reward_std": 0.5760359674692154, | |
| "rewards/accuracy_reward": 0.1791666705161333, | |
| "rewards/format_reward": 0.7666666686534882, | |
| "step": 430 | |
| }, | |
| { | |
| "completion_length": 156.29583816528321, | |
| "epoch": 0.232, | |
| "grad_norm": 13.930024147033691, | |
| "kl": 2.163671875, | |
| "learning_rate": 9.480323820971037e-07, | |
| "loss": 0.6149, | |
| "reward": 0.9000000298023224, | |
| "reward_std": 0.6693511486053467, | |
| "rewards/accuracy_reward": 0.16666667014360428, | |
| "rewards/format_reward": 0.7333333551883697, | |
| "step": 435 | |
| }, | |
| { | |
| "completion_length": 170.9166732788086, | |
| "epoch": 0.23466666666666666, | |
| "grad_norm": 28.107120513916016, | |
| "kl": 3.184375, | |
| "learning_rate": 9.459462648822207e-07, | |
| "loss": 0.6151, | |
| "reward": 0.8916667103767395, | |
| "reward_std": 0.6882745712995529, | |
| "rewards/accuracy_reward": 0.21666667684912683, | |
| "rewards/format_reward": 0.675000011920929, | |
| "step": 440 | |
| }, | |
| { | |
| "completion_length": 162.44583892822266, | |
| "epoch": 0.23733333333333334, | |
| "grad_norm": 20.64507293701172, | |
| "kl": 4.196875, | |
| "learning_rate": 9.438214852520072e-07, | |
| "loss": 0.7043, | |
| "reward": 0.829166692495346, | |
| "reward_std": 0.6397666782140732, | |
| "rewards/accuracy_reward": 0.1458333373069763, | |
| "rewards/format_reward": 0.6833333492279052, | |
| "step": 445 | |
| }, | |
| { | |
| "completion_length": 133.62917098999023, | |
| "epoch": 0.24, | |
| "grad_norm": 5.781806945800781, | |
| "kl": 2.41796875, | |
| "learning_rate": 9.416582274194929e-07, | |
| "loss": 0.5327, | |
| "reward": 1.0500000238418579, | |
| "reward_std": 0.6202212646603584, | |
| "rewards/accuracy_reward": 0.2666666727513075, | |
| "rewards/format_reward": 0.7833333492279053, | |
| "step": 450 | |
| }, | |
| { | |
| "completion_length": 216.83334045410157, | |
| "epoch": 0.24266666666666667, | |
| "grad_norm": 24.161720275878906, | |
| "kl": 1.6671875, | |
| "learning_rate": 9.394566789336707e-07, | |
| "loss": 0.5952, | |
| "reward": 0.8916666984558106, | |
| "reward_std": 0.6439453423023224, | |
| "rewards/accuracy_reward": 0.19166667051613331, | |
| "rewards/format_reward": 0.7000000178813934, | |
| "step": 455 | |
| }, | |
| { | |
| "completion_length": 133.31667098999023, | |
| "epoch": 0.24533333333333332, | |
| "grad_norm": 14.312320709228516, | |
| "kl": 2.26484375, | |
| "learning_rate": 9.372170306632358e-07, | |
| "loss": 0.3488, | |
| "reward": 1.0583333730697633, | |
| "reward_std": 0.4993865922093391, | |
| "rewards/accuracy_reward": 0.2500000037252903, | |
| "rewards/format_reward": 0.8083333551883698, | |
| "step": 460 | |
| }, | |
| { | |
| "completion_length": 115.00000228881837, | |
| "epoch": 0.248, | |
| "grad_norm": 24.176897048950195, | |
| "kl": 2.22109375, | |
| "learning_rate": 9.349394767800396e-07, | |
| "loss": 0.3995, | |
| "reward": 1.0458333611488342, | |
| "reward_std": 0.5499838680028916, | |
| "rewards/accuracy_reward": 0.25416667461395265, | |
| "rewards/format_reward": 0.791666692495346, | |
| "step": 465 | |
| }, | |
| { | |
| "completion_length": 170.7375030517578, | |
| "epoch": 0.25066666666666665, | |
| "grad_norm": 15.032400131225586, | |
| "kl": 3.209375, | |
| "learning_rate": 9.326242147422536e-07, | |
| "loss": 0.6388, | |
| "reward": 0.9458333611488342, | |
| "reward_std": 0.6427857339382171, | |
| "rewards/accuracy_reward": 0.22916667051613332, | |
| "rewards/format_reward": 0.7166666865348816, | |
| "step": 470 | |
| }, | |
| { | |
| "completion_length": 178.14167098999025, | |
| "epoch": 0.25333333333333335, | |
| "grad_norm": 12.969917297363281, | |
| "kl": 2.8015625, | |
| "learning_rate": 9.302714452772514e-07, | |
| "loss": 0.5282, | |
| "reward": 0.8666666924953461, | |
| "reward_std": 0.6148743867874146, | |
| "rewards/accuracy_reward": 0.20833334065973758, | |
| "rewards/format_reward": 0.6583333492279053, | |
| "step": 475 | |
| }, | |
| { | |
| "completion_length": 160.37916946411133, | |
| "epoch": 0.256, | |
| "grad_norm": 25.496326446533203, | |
| "kl": 2.88515625, | |
| "learning_rate": 9.278813723642059e-07, | |
| "loss": 0.6265, | |
| "reward": 0.7583333671092987, | |
| "reward_std": 0.6592622727155686, | |
| "rewards/accuracy_reward": 0.09166666939854622, | |
| "rewards/format_reward": 0.666666692495346, | |
| "step": 480 | |
| }, | |
| { | |
| "completion_length": 156.80833740234374, | |
| "epoch": 0.25866666666666666, | |
| "grad_norm": 19.942245483398438, | |
| "kl": 2.80546875, | |
| "learning_rate": 9.254542032164046e-07, | |
| "loss": 0.5487, | |
| "reward": 1.0916667103767395, | |
| "reward_std": 0.5701596170663834, | |
| "rewards/accuracy_reward": 0.2916666727513075, | |
| "rewards/format_reward": 0.8000000238418579, | |
| "step": 485 | |
| }, | |
| { | |
| "completion_length": 190.86667556762694, | |
| "epoch": 0.2613333333333333, | |
| "grad_norm": 29.593677520751953, | |
| "kl": 5.2046875, | |
| "learning_rate": 9.229901482632849e-07, | |
| "loss": 0.8562, | |
| "reward": 0.8750000268220901, | |
| "reward_std": 0.7407498300075531, | |
| "rewards/accuracy_reward": 0.25833334438502786, | |
| "rewards/format_reward": 0.6166666775941849, | |
| "step": 490 | |
| }, | |
| { | |
| "completion_length": 185.21250381469727, | |
| "epoch": 0.264, | |
| "grad_norm": 15.83928108215332, | |
| "kl": 3.34765625, | |
| "learning_rate": 9.204894211321905e-07, | |
| "loss": 0.7039, | |
| "reward": 0.9708333551883698, | |
| "reward_std": 0.7463637501001358, | |
| "rewards/accuracy_reward": 0.30416667759418486, | |
| "rewards/format_reward": 0.6666666865348816, | |
| "step": 495 | |
| }, | |
| { | |
| "completion_length": 177.40833740234376, | |
| "epoch": 0.26666666666666666, | |
| "grad_norm": 12.166021347045898, | |
| "kl": 4.6421875, | |
| "learning_rate": 9.179522386298506e-07, | |
| "loss": 0.8557, | |
| "reward": 0.8416666924953461, | |
| "reward_std": 0.753840970993042, | |
| "rewards/accuracy_reward": 0.22500000558793545, | |
| "rewards/format_reward": 0.6166666924953461, | |
| "step": 500 | |
| }, | |
| { | |
| "completion_length": 201.27083892822264, | |
| "epoch": 0.2693333333333333, | |
| "grad_norm": 23.116260528564453, | |
| "kl": 2.7734375, | |
| "learning_rate": 9.153788207235826e-07, | |
| "loss": 0.7171, | |
| "reward": 1.0333333730697631, | |
| "reward_std": 0.7333385825157166, | |
| "rewards/accuracy_reward": 0.35000001192092894, | |
| "rewards/format_reward": 0.6833333492279052, | |
| "step": 505 | |
| }, | |
| { | |
| "completion_length": 168.51250457763672, | |
| "epoch": 0.272, | |
| "grad_norm": 78.56771850585938, | |
| "kl": 3.934375, | |
| "learning_rate": 9.127693905222223e-07, | |
| "loss": 0.7631, | |
| "reward": 0.9583333551883697, | |
| "reward_std": 0.7650938987731933, | |
| "rewards/accuracy_reward": 0.31666667461395265, | |
| "rewards/format_reward": 0.6416666865348816, | |
| "step": 510 | |
| }, | |
| { | |
| "completion_length": 179.67500534057618, | |
| "epoch": 0.27466666666666667, | |
| "grad_norm": 12.310942649841309, | |
| "kl": 2.8015625, | |
| "learning_rate": 9.1012417425678e-07, | |
| "loss": 0.7708, | |
| "reward": 1.050000047683716, | |
| "reward_std": 0.6466626852750779, | |
| "rewards/accuracy_reward": 0.3250000040978193, | |
| "rewards/format_reward": 0.7250000238418579, | |
| "step": 515 | |
| }, | |
| { | |
| "completion_length": 137.3458381652832, | |
| "epoch": 0.2773333333333333, | |
| "grad_norm": 23.13488006591797, | |
| "kl": 2.5033203125, | |
| "learning_rate": 9.074434012608281e-07, | |
| "loss": 0.5319, | |
| "reward": 1.0208333611488343, | |
| "reward_std": 0.5777831941843032, | |
| "rewards/accuracy_reward": 0.2541666753590107, | |
| "rewards/format_reward": 0.766666692495346, | |
| "step": 520 | |
| }, | |
| { | |
| "completion_length": 139.60833663940429, | |
| "epoch": 0.28, | |
| "grad_norm": 18.197731018066406, | |
| "kl": 2.709375, | |
| "learning_rate": 9.047273039506174e-07, | |
| "loss": 0.6145, | |
| "reward": 1.0708333730697632, | |
| "reward_std": 0.47921385020017626, | |
| "rewards/accuracy_reward": 0.24583334103226662, | |
| "rewards/format_reward": 0.8250000238418579, | |
| "step": 525 | |
| }, | |
| { | |
| "completion_length": 125.83333816528321, | |
| "epoch": 0.2826666666666667, | |
| "grad_norm": 62.50031661987305, | |
| "kl": 1.6216796875, | |
| "learning_rate": 9.019761178049279e-07, | |
| "loss": 0.4705, | |
| "reward": 1.1375000178813934, | |
| "reward_std": 0.4138069227337837, | |
| "rewards/accuracy_reward": 0.2958333373069763, | |
| "rewards/format_reward": 0.8416666805744171, | |
| "step": 530 | |
| }, | |
| { | |
| "completion_length": 179.41667251586915, | |
| "epoch": 0.2853333333333333, | |
| "grad_norm": 18.075428009033203, | |
| "kl": 2.503125, | |
| "learning_rate": 8.991900813446522e-07, | |
| "loss": 0.6926, | |
| "reward": 0.9208333671092988, | |
| "reward_std": 0.7170521825551986, | |
| "rewards/accuracy_reward": 0.26250000596046447, | |
| "rewards/format_reward": 0.6583333551883698, | |
| "step": 535 | |
| }, | |
| { | |
| "completion_length": 269.2916717529297, | |
| "epoch": 0.288, | |
| "grad_norm": 10.559329986572266, | |
| "kl": 3.5578125, | |
| "learning_rate": 8.963694361121185e-07, | |
| "loss": 0.7955, | |
| "reward": 0.7375000238418579, | |
| "reward_std": 0.7803374290466308, | |
| "rewards/accuracy_reward": 0.21250000707805156, | |
| "rewards/format_reward": 0.5250000119209289, | |
| "step": 540 | |
| }, | |
| { | |
| "completion_length": 171.4625045776367, | |
| "epoch": 0.2906666666666667, | |
| "grad_norm": 85.9892578125, | |
| "kl": 3.36953125, | |
| "learning_rate": 8.935144266501468e-07, | |
| "loss": 0.7548, | |
| "reward": 0.9333333671092987, | |
| "reward_std": 0.6961856186389923, | |
| "rewards/accuracy_reward": 0.2416666690260172, | |
| "rewards/format_reward": 0.6916666805744172, | |
| "step": 545 | |
| }, | |
| { | |
| "completion_length": 116.50833663940429, | |
| "epoch": 0.29333333333333333, | |
| "grad_norm": 61.62466812133789, | |
| "kl": 3.23515625, | |
| "learning_rate": 8.906253004808504e-07, | |
| "loss": 0.6184, | |
| "reward": 1.1833333849906922, | |
| "reward_std": 0.5228020772337914, | |
| "rewards/accuracy_reward": 0.35833334028720853, | |
| "rewards/format_reward": 0.8250000238418579, | |
| "step": 550 | |
| }, | |
| { | |
| "completion_length": 130.47500381469726, | |
| "epoch": 0.296, | |
| "grad_norm": 13.287981986999512, | |
| "kl": 1.23828125, | |
| "learning_rate": 8.877023080841737e-07, | |
| "loss": 0.4985, | |
| "reward": 1.1375000178813934, | |
| "reward_std": 0.4386047780513763, | |
| "rewards/accuracy_reward": 0.29583334028720853, | |
| "rewards/format_reward": 0.8416666924953461, | |
| "step": 555 | |
| }, | |
| { | |
| "completion_length": 171.2916702270508, | |
| "epoch": 0.2986666666666667, | |
| "grad_norm": 11.248329162597656, | |
| "kl": 2.68125, | |
| "learning_rate": 8.847457028761782e-07, | |
| "loss": 0.7836, | |
| "reward": 1.045833373069763, | |
| "reward_std": 0.6420545637607574, | |
| "rewards/accuracy_reward": 0.28750000447034835, | |
| "rewards/format_reward": 0.7583333492279053, | |
| "step": 560 | |
| }, | |
| { | |
| "completion_length": 153.5500045776367, | |
| "epoch": 0.30133333333333334, | |
| "grad_norm": 11.977041244506836, | |
| "kl": 2.99296875, | |
| "learning_rate": 8.817557411870715e-07, | |
| "loss": 0.6474, | |
| "reward": 1.1375000417232513, | |
| "reward_std": 0.546431428194046, | |
| "rewards/accuracy_reward": 0.3458333432674408, | |
| "rewards/format_reward": 0.7916666805744171, | |
| "step": 565 | |
| }, | |
| { | |
| "completion_length": 157.00833740234376, | |
| "epoch": 0.304, | |
| "grad_norm": 8.921804428100586, | |
| "kl": 2.18046875, | |
| "learning_rate": 8.787326822389835e-07, | |
| "loss": 0.7647, | |
| "reward": 1.0416667103767394, | |
| "reward_std": 0.4992914006114006, | |
| "rewards/accuracy_reward": 0.2333333373069763, | |
| "rewards/format_reward": 0.8083333551883698, | |
| "step": 570 | |
| }, | |
| { | |
| "completion_length": 203.42916946411134, | |
| "epoch": 0.30666666666666664, | |
| "grad_norm": 8.668245315551758, | |
| "kl": 1.5703125, | |
| "learning_rate": 8.756767881234928e-07, | |
| "loss": 0.7449, | |
| "reward": 1.0916666865348816, | |
| "reward_std": 0.6213793724775314, | |
| "rewards/accuracy_reward": 0.3166666731238365, | |
| "rewards/format_reward": 0.7750000119209289, | |
| "step": 575 | |
| }, | |
| { | |
| "completion_length": 147.91667098999022, | |
| "epoch": 0.30933333333333335, | |
| "grad_norm": 27.652040481567383, | |
| "kl": 2.9046875, | |
| "learning_rate": 8.725883237789044e-07, | |
| "loss": 0.7151, | |
| "reward": 1.1083333551883698, | |
| "reward_std": 0.5087577894330024, | |
| "rewards/accuracy_reward": 0.27500000670552255, | |
| "rewards/format_reward": 0.8333333611488343, | |
| "step": 580 | |
| }, | |
| { | |
| "completion_length": 195.73750381469728, | |
| "epoch": 0.312, | |
| "grad_norm": 5.533527851104736, | |
| "kl": 2.3046875, | |
| "learning_rate": 8.694675569672799e-07, | |
| "loss": 0.7577, | |
| "reward": 0.9291666924953461, | |
| "reward_std": 0.5898149274289608, | |
| "rewards/accuracy_reward": 0.17916667014360427, | |
| "rewards/format_reward": 0.7500000178813935, | |
| "step": 585 | |
| }, | |
| { | |
| "completion_length": 218.38334121704102, | |
| "epoch": 0.31466666666666665, | |
| "grad_norm": 7.887436866760254, | |
| "kl": 1.71484375, | |
| "learning_rate": 8.663147582512231e-07, | |
| "loss": 0.6805, | |
| "reward": 1.0375000298023225, | |
| "reward_std": 0.6552489116787911, | |
| "rewards/accuracy_reward": 0.3125000029802322, | |
| "rewards/format_reward": 0.7250000178813935, | |
| "step": 590 | |
| }, | |
| { | |
| "completion_length": 142.9625030517578, | |
| "epoch": 0.31733333333333336, | |
| "grad_norm": 39.04753112792969, | |
| "kl": 3.50078125, | |
| "learning_rate": 8.631302009704233e-07, | |
| "loss": 0.7378, | |
| "reward": 1.0500000357627868, | |
| "reward_std": 0.5874016582965851, | |
| "rewards/accuracy_reward": 0.2416666716337204, | |
| "rewards/format_reward": 0.8083333551883698, | |
| "step": 595 | |
| }, | |
| { | |
| "completion_length": 145.0583366394043, | |
| "epoch": 0.32, | |
| "grad_norm": 10.890267372131348, | |
| "kl": 1.9578125, | |
| "learning_rate": 8.59914161217957e-07, | |
| "loss": 0.4446, | |
| "reward": 1.2125000357627869, | |
| "reward_std": 0.33466504961252214, | |
| "rewards/accuracy_reward": 0.3208333432674408, | |
| "rewards/format_reward": 0.8916666865348816, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 0.32, | |
| "eval_completion_length": 157.21389434814452, | |
| "eval_kl": 1.6558333333333333, | |
| "eval_loss": 0.4634725749492645, | |
| "eval_reward": 0.9744444727897644, | |
| "eval_reward_std": 0.43147118786970773, | |
| "eval_rewards/accuracy_reward": 0.13000000352660815, | |
| "eval_rewards/format_reward": 0.8444444608688354, | |
| "eval_runtime": 533.9055, | |
| "eval_samples_per_second": 0.562, | |
| "eval_steps_per_second": 0.024, | |
| "step": 600 | |
| }, | |
| { | |
| "completion_length": 193.183341217041, | |
| "epoch": 0.32266666666666666, | |
| "grad_norm": 13.141694068908691, | |
| "kl": 1.72421875, | |
| "learning_rate": 8.566669178163512e-07, | |
| "loss": 0.5602, | |
| "reward": 0.9708333551883698, | |
| "reward_std": 0.5239060014486313, | |
| "rewards/accuracy_reward": 0.2041666716337204, | |
| "rewards/format_reward": 0.766666692495346, | |
| "step": 605 | |
| }, | |
| { | |
| "completion_length": 154.98333663940429, | |
| "epoch": 0.3253333333333333, | |
| "grad_norm": 9.7343111038208, | |
| "kl": 1.919921875, | |
| "learning_rate": 8.533887522934114e-07, | |
| "loss": 0.4813, | |
| "reward": 1.1041667103767394, | |
| "reward_std": 0.5670485764741897, | |
| "rewards/accuracy_reward": 0.28750000819563865, | |
| "rewards/format_reward": 0.8166666924953461, | |
| "step": 610 | |
| }, | |
| { | |
| "completion_length": 109.46667022705078, | |
| "epoch": 0.328, | |
| "grad_norm": 6.92060661315918, | |
| "kl": 1.7703125, | |
| "learning_rate": 8.500799488578119e-07, | |
| "loss": 0.2986, | |
| "reward": 1.1000000476837157, | |
| "reward_std": 0.2986703909933567, | |
| "rewards/accuracy_reward": 0.1916666727513075, | |
| "rewards/format_reward": 0.9083333492279053, | |
| "step": 615 | |
| }, | |
| { | |
| "completion_length": 181.3708366394043, | |
| "epoch": 0.33066666666666666, | |
| "grad_norm": 7.871663570404053, | |
| "kl": 1.2625, | |
| "learning_rate": 8.467407943744573e-07, | |
| "loss": 0.6639, | |
| "reward": 1.1875000476837159, | |
| "reward_std": 0.5026830688118935, | |
| "rewards/accuracy_reward": 0.37916667610406873, | |
| "rewards/format_reward": 0.8083333551883698, | |
| "step": 620 | |
| }, | |
| { | |
| "completion_length": 130.59166946411133, | |
| "epoch": 0.3333333333333333, | |
| "grad_norm": 9.6685791015625, | |
| "kl": 1.58359375, | |
| "learning_rate": 8.433715783396114e-07, | |
| "loss": 0.5216, | |
| "reward": 1.1583333551883697, | |
| "reward_std": 0.4340529665350914, | |
| "rewards/accuracy_reward": 0.2750000134110451, | |
| "rewards/format_reward": 0.8833333492279053, | |
| "step": 625 | |
| }, | |
| { | |
| "completion_length": 180.36667022705078, | |
| "epoch": 0.336, | |
| "grad_norm": 39.6641960144043, | |
| "kl": 3.36953125, | |
| "learning_rate": 8.399725928557985e-07, | |
| "loss": 0.7533, | |
| "reward": 1.025000023841858, | |
| "reward_std": 0.46841561794281006, | |
| "rewards/accuracy_reward": 0.20833333991467953, | |
| "rewards/format_reward": 0.8166666865348816, | |
| "step": 630 | |
| }, | |
| { | |
| "completion_length": 135.39167022705078, | |
| "epoch": 0.33866666666666667, | |
| "grad_norm": 11.369102478027344, | |
| "kl": 2.64765625, | |
| "learning_rate": 8.365441326064788e-07, | |
| "loss": 0.5253, | |
| "reward": 1.1875000476837159, | |
| "reward_std": 0.40690153986215594, | |
| "rewards/accuracy_reward": 0.3375000089406967, | |
| "rewards/format_reward": 0.8500000238418579, | |
| "step": 635 | |
| }, | |
| { | |
| "completion_length": 210.98333816528321, | |
| "epoch": 0.3413333333333333, | |
| "grad_norm": 12.549598693847656, | |
| "kl": 1.3271484375, | |
| "learning_rate": 8.330864948305007e-07, | |
| "loss": 0.7683, | |
| "reward": 1.0166667103767395, | |
| "reward_std": 0.4776305049657822, | |
| "rewards/accuracy_reward": 0.1833333384245634, | |
| "rewards/format_reward": 0.8333333551883697, | |
| "step": 640 | |
| }, | |
| { | |
| "completion_length": 151.5333381652832, | |
| "epoch": 0.344, | |
| "grad_norm": 6.856210231781006, | |
| "kl": 1.084765625, | |
| "learning_rate": 8.295999792963299e-07, | |
| "loss": 0.4446, | |
| "reward": 1.200000035762787, | |
| "reward_std": 0.43040212616324425, | |
| "rewards/accuracy_reward": 0.3250000089406967, | |
| "rewards/format_reward": 0.8750000238418579, | |
| "step": 645 | |
| }, | |
| { | |
| "completion_length": 269.2375091552734, | |
| "epoch": 0.3466666666666667, | |
| "grad_norm": 23.80208396911621, | |
| "kl": 2.725, | |
| "learning_rate": 8.260848882760615e-07, | |
| "loss": 0.865, | |
| "reward": 0.829166692495346, | |
| "reward_std": 0.659246638417244, | |
| "rewards/accuracy_reward": 0.1875000014901161, | |
| "rewards/format_reward": 0.6416666865348816, | |
| "step": 650 | |
| }, | |
| { | |
| "completion_length": 211.78334045410156, | |
| "epoch": 0.34933333333333333, | |
| "grad_norm": 6.310311317443848, | |
| "kl": 1.966015625, | |
| "learning_rate": 8.225415265192126e-07, | |
| "loss": 0.769, | |
| "reward": 0.954166692495346, | |
| "reward_std": 0.5517986357212067, | |
| "rewards/accuracy_reward": 0.19583333991467952, | |
| "rewards/format_reward": 0.7583333551883698, | |
| "step": 655 | |
| }, | |
| { | |
| "completion_length": 183.13750457763672, | |
| "epoch": 0.352, | |
| "grad_norm": 5.835783958435059, | |
| "kl": 1.602734375, | |
| "learning_rate": 8.18970201226302e-07, | |
| "loss": 0.6146, | |
| "reward": 1.0166667103767395, | |
| "reward_std": 0.44397214204072954, | |
| "rewards/accuracy_reward": 0.2166666716337204, | |
| "rewards/format_reward": 0.8000000178813934, | |
| "step": 660 | |
| }, | |
| { | |
| "completion_length": 199.48750686645508, | |
| "epoch": 0.3546666666666667, | |
| "grad_norm": 8.435812950134277, | |
| "kl": 1.846875, | |
| "learning_rate": 8.153712220222163e-07, | |
| "loss": 0.7525, | |
| "reward": 0.9833333671092988, | |
| "reward_std": 0.544944578409195, | |
| "rewards/accuracy_reward": 0.21666666828095912, | |
| "rewards/format_reward": 0.7666666984558106, | |
| "step": 665 | |
| }, | |
| { | |
| "completion_length": 134.15000381469727, | |
| "epoch": 0.35733333333333334, | |
| "grad_norm": 13.892914772033691, | |
| "kl": 1.921484375, | |
| "learning_rate": 8.117449009293668e-07, | |
| "loss": 0.5067, | |
| "reward": 1.2250000417232514, | |
| "reward_std": 0.3981220737099648, | |
| "rewards/accuracy_reward": 0.35000001154839994, | |
| "rewards/format_reward": 0.8750000178813935, | |
| "step": 670 | |
| }, | |
| { | |
| "completion_length": 199.30000534057618, | |
| "epoch": 0.36, | |
| "grad_norm": 53.40914535522461, | |
| "kl": 2.228125, | |
| "learning_rate": 8.080915523406369e-07, | |
| "loss": 0.8388, | |
| "reward": 1.1083333611488342, | |
| "reward_std": 0.5545098386704922, | |
| "rewards/accuracy_reward": 0.3250000089406967, | |
| "rewards/format_reward": 0.7833333432674408, | |
| "step": 675 | |
| }, | |
| { | |
| "completion_length": 197.45833740234374, | |
| "epoch": 0.3626666666666667, | |
| "grad_norm": 46.9063720703125, | |
| "kl": 1.98671875, | |
| "learning_rate": 8.044114929921263e-07, | |
| "loss": 0.8575, | |
| "reward": 0.9833333492279053, | |
| "reward_std": 0.5669769406318664, | |
| "rewards/accuracy_reward": 0.2000000011175871, | |
| "rewards/format_reward": 0.7833333492279053, | |
| "step": 680 | |
| }, | |
| { | |
| "completion_length": 245.34584197998046, | |
| "epoch": 0.36533333333333334, | |
| "grad_norm": 20.520357131958008, | |
| "kl": 2.934375, | |
| "learning_rate": 8.007050419356898e-07, | |
| "loss": 0.8979, | |
| "reward": 0.8958333611488343, | |
| "reward_std": 0.6523119986057282, | |
| "rewards/accuracy_reward": 0.20416667275130748, | |
| "rewards/format_reward": 0.6916666805744172, | |
| "step": 685 | |
| }, | |
| { | |
| "completion_length": 312.5666793823242, | |
| "epoch": 0.368, | |
| "grad_norm": 13.216290473937988, | |
| "kl": 3.625, | |
| "learning_rate": 7.969725205112765e-07, | |
| "loss": 0.9082, | |
| "reward": 0.8625000193715096, | |
| "reward_std": 0.748971363902092, | |
| "rewards/accuracy_reward": 0.3125000085681677, | |
| "rewards/format_reward": 0.550000025331974, | |
| "step": 690 | |
| }, | |
| { | |
| "completion_length": 352.8666717529297, | |
| "epoch": 0.37066666666666664, | |
| "grad_norm": 30.815317153930664, | |
| "kl": 2.85390625, | |
| "learning_rate": 7.93214252319071e-07, | |
| "loss": 0.8438, | |
| "reward": 0.7708333522081375, | |
| "reward_std": 0.8650185167789459, | |
| "rewards/accuracy_reward": 0.3125000070780516, | |
| "rewards/format_reward": 0.45833334177732465, | |
| "step": 695 | |
| }, | |
| { | |
| "completion_length": 232.11667175292968, | |
| "epoch": 0.37333333333333335, | |
| "grad_norm": 18.286134719848633, | |
| "kl": 2.925, | |
| "learning_rate": 7.894305631914373e-07, | |
| "loss": 0.9608, | |
| "reward": 0.9333333596587181, | |
| "reward_std": 0.7935751020908356, | |
| "rewards/accuracy_reward": 0.31666667610406873, | |
| "rewards/format_reward": 0.6166666969656944, | |
| "step": 700 | |
| }, | |
| { | |
| "completion_length": 181.43750457763673, | |
| "epoch": 0.376, | |
| "grad_norm": 20.183544158935547, | |
| "kl": 3.06015625, | |
| "learning_rate": 7.856217811646706e-07, | |
| "loss": 0.7195, | |
| "reward": 1.0458333671092988, | |
| "reward_std": 0.5713184028863907, | |
| "rewards/accuracy_reward": 0.2791666716337204, | |
| "rewards/format_reward": 0.7666666984558106, | |
| "step": 705 | |
| }, | |
| { | |
| "completion_length": 223.5291717529297, | |
| "epoch": 0.37866666666666665, | |
| "grad_norm": 11.171494483947754, | |
| "kl": 2.15234375, | |
| "learning_rate": 7.817882364505568e-07, | |
| "loss": 0.6855, | |
| "reward": 0.9000000357627869, | |
| "reward_std": 0.5742868632078171, | |
| "rewards/accuracy_reward": 0.1750000026077032, | |
| "rewards/format_reward": 0.7250000149011612, | |
| "step": 710 | |
| }, | |
| { | |
| "completion_length": 266.17500457763674, | |
| "epoch": 0.38133333333333336, | |
| "grad_norm": 10.772181510925293, | |
| "kl": 2.66953125, | |
| "learning_rate": 7.779302614077448e-07, | |
| "loss": 0.7085, | |
| "reward": 0.900000023841858, | |
| "reward_std": 0.6552800923585892, | |
| "rewards/accuracy_reward": 0.24166667312383652, | |
| "rewards/format_reward": 0.6583333551883698, | |
| "step": 715 | |
| }, | |
| { | |
| "completion_length": 253.56667785644532, | |
| "epoch": 0.384, | |
| "grad_norm": 15.041406631469727, | |
| "kl": 2.715625, | |
| "learning_rate": 7.740481905129306e-07, | |
| "loss": 0.8722, | |
| "reward": 0.9958333611488343, | |
| "reward_std": 0.6977577596902848, | |
| "rewards/accuracy_reward": 0.3125000037252903, | |
| "rewards/format_reward": 0.6833333551883698, | |
| "step": 720 | |
| }, | |
| { | |
| "completion_length": 161.67084045410155, | |
| "epoch": 0.38666666666666666, | |
| "grad_norm": 4.084959030151367, | |
| "kl": 2.09296875, | |
| "learning_rate": 7.701423603318604e-07, | |
| "loss": 0.5005, | |
| "reward": 1.1458333611488343, | |
| "reward_std": 0.40467526763677597, | |
| "rewards/accuracy_reward": 0.3375000059604645, | |
| "rewards/format_reward": 0.8083333551883698, | |
| "step": 725 | |
| }, | |
| { | |
| "completion_length": 259.2875091552734, | |
| "epoch": 0.3893333333333333, | |
| "grad_norm": 154.38690185546875, | |
| "kl": 1.82265625, | |
| "learning_rate": 7.662131094901498e-07, | |
| "loss": 0.7136, | |
| "reward": 0.8708333611488343, | |
| "reward_std": 0.6159812211990356, | |
| "rewards/accuracy_reward": 0.19583334289491178, | |
| "rewards/format_reward": 0.675000011920929, | |
| "step": 730 | |
| }, | |
| { | |
| "completion_length": 261.3000061035156, | |
| "epoch": 0.392, | |
| "grad_norm": 2285.590576171875, | |
| "kl": 98.85625, | |
| "learning_rate": 7.622607786439278e-07, | |
| "loss": 18.7274, | |
| "reward": 0.9541666984558106, | |
| "reward_std": 0.6454987242817879, | |
| "rewards/accuracy_reward": 0.2541666701436043, | |
| "rewards/format_reward": 0.7000000238418579, | |
| "step": 735 | |
| }, | |
| { | |
| "completion_length": 293.8916717529297, | |
| "epoch": 0.39466666666666667, | |
| "grad_norm": 103.3537826538086, | |
| "kl": 2.91640625, | |
| "learning_rate": 7.582857104503e-07, | |
| "loss": 0.7409, | |
| "reward": 0.8625000178813934, | |
| "reward_std": 0.6774646982550621, | |
| "rewards/accuracy_reward": 0.2458333380520344, | |
| "rewards/format_reward": 0.6166666880249977, | |
| "step": 740 | |
| }, | |
| { | |
| "completion_length": 363.01250915527345, | |
| "epoch": 0.3973333333333333, | |
| "grad_norm": 54.62760925292969, | |
| "kl": 5.337890625, | |
| "learning_rate": 7.542882495376435e-07, | |
| "loss": 1.3766, | |
| "reward": 0.7250000238418579, | |
| "reward_std": 0.7046854376792908, | |
| "rewards/accuracy_reward": 0.19166667126119136, | |
| "rewards/format_reward": 0.533333346247673, | |
| "step": 745 | |
| }, | |
| { | |
| "completion_length": 305.82500610351565, | |
| "epoch": 0.4, | |
| "grad_norm": 95.76229095458984, | |
| "kl": 2.378125, | |
| "learning_rate": 7.502687424757277e-07, | |
| "loss": 0.9832, | |
| "reward": 0.9791667044162751, | |
| "reward_std": 0.776354917883873, | |
| "rewards/accuracy_reward": 0.3291666775941849, | |
| "rewards/format_reward": 0.6500000119209289, | |
| "step": 750 | |
| }, | |
| { | |
| "completion_length": 265.6125061035156, | |
| "epoch": 0.4026666666666667, | |
| "grad_norm": 80.57921600341797, | |
| "kl": 10.53046875, | |
| "learning_rate": 7.462275377456669e-07, | |
| "loss": 2.1369, | |
| "reward": 0.925000011920929, | |
| "reward_std": 0.8046808481216431, | |
| "rewards/accuracy_reward": 0.2833333395421505, | |
| "rewards/format_reward": 0.6416666805744171, | |
| "step": 755 | |
| }, | |
| { | |
| "completion_length": 321.10834045410155, | |
| "epoch": 0.4053333333333333, | |
| "grad_norm": 34.1876220703125, | |
| "kl": 4.4828125, | |
| "learning_rate": 7.421649857097091e-07, | |
| "loss": 0.9501, | |
| "reward": 0.8291666865348816, | |
| "reward_std": 0.6768444120883942, | |
| "rewards/accuracy_reward": 0.2291666716337204, | |
| "rewards/format_reward": 0.6000000149011612, | |
| "step": 760 | |
| }, | |
| { | |
| "completion_length": 319.262508392334, | |
| "epoch": 0.408, | |
| "grad_norm": 117.45841979980469, | |
| "kl": 5.225, | |
| "learning_rate": 7.380814385808594e-07, | |
| "loss": 1.394, | |
| "reward": 0.8166666895151138, | |
| "reward_std": 0.7352788507938385, | |
| "rewards/accuracy_reward": 0.21666667461395264, | |
| "rewards/format_reward": 0.6000000163912773, | |
| "step": 765 | |
| }, | |
| { | |
| "completion_length": 367.7666809082031, | |
| "epoch": 0.4106666666666667, | |
| "grad_norm": 66.30304718017578, | |
| "kl": 5.2375, | |
| "learning_rate": 7.339772503923443e-07, | |
| "loss": 1.1981, | |
| "reward": 0.8458333551883698, | |
| "reward_std": 0.7178668111562729, | |
| "rewards/accuracy_reward": 0.2708333406597376, | |
| "rewards/format_reward": 0.5750000178813934, | |
| "step": 770 | |
| }, | |
| { | |
| "completion_length": 316.7833450317383, | |
| "epoch": 0.41333333333333333, | |
| "grad_norm": 16.508329391479492, | |
| "kl": 4.0140625, | |
| "learning_rate": 7.298527769669187e-07, | |
| "loss": 1.1483, | |
| "reward": 0.9291666865348815, | |
| "reward_std": 0.6445024594664573, | |
| "rewards/accuracy_reward": 0.25416667461395265, | |
| "rewards/format_reward": 0.6750000238418579, | |
| "step": 775 | |
| }, | |
| { | |
| "completion_length": 366.72084350585936, | |
| "epoch": 0.416, | |
| "grad_norm": 88.48445892333984, | |
| "kl": 5.3171875, | |
| "learning_rate": 7.257083758860157e-07, | |
| "loss": 1.5957, | |
| "reward": 0.7666666805744171, | |
| "reward_std": 0.9076652824878693, | |
| "rewards/accuracy_reward": 0.24166667126119137, | |
| "rewards/format_reward": 0.5250000208616257, | |
| "step": 780 | |
| }, | |
| { | |
| "completion_length": 261.41251068115236, | |
| "epoch": 0.4186666666666667, | |
| "grad_norm": 58.172264099121094, | |
| "kl": 5.23984375, | |
| "learning_rate": 7.215444064587462e-07, | |
| "loss": 1.4679, | |
| "reward": 0.9916666924953461, | |
| "reward_std": 0.6018173396587372, | |
| "rewards/accuracy_reward": 0.24166667237877845, | |
| "rewards/format_reward": 0.7500000178813935, | |
| "step": 785 | |
| }, | |
| { | |
| "completion_length": 316.15417709350584, | |
| "epoch": 0.42133333333333334, | |
| "grad_norm": 47.1014404296875, | |
| "kl": 5.3765625, | |
| "learning_rate": 7.173612296907472e-07, | |
| "loss": 1.0298, | |
| "reward": 0.8125000208616256, | |
| "reward_std": 0.6595857471227646, | |
| "rewards/accuracy_reward": 0.20416667312383652, | |
| "rewards/format_reward": 0.6083333522081376, | |
| "step": 790 | |
| }, | |
| { | |
| "completion_length": 541.3083404541015, | |
| "epoch": 0.424, | |
| "grad_norm": 39.66009521484375, | |
| "kl": 6.48125, | |
| "learning_rate": 7.131592082528835e-07, | |
| "loss": 1.4332, | |
| "reward": 0.5750000149011611, | |
| "reward_std": 0.9711216628551483, | |
| "rewards/accuracy_reward": 0.25000000894069674, | |
| "rewards/format_reward": 0.325000012665987, | |
| "step": 795 | |
| }, | |
| { | |
| "completion_length": 529.3041809082031, | |
| "epoch": 0.4266666666666667, | |
| "grad_norm": 47.98305892944336, | |
| "kl": 7.490625, | |
| "learning_rate": 7.089387064498055e-07, | |
| "loss": 1.4781, | |
| "reward": 0.6041666865348816, | |
| "reward_std": 0.9722610518336297, | |
| "rewards/accuracy_reward": 0.2291666753590107, | |
| "rewards/format_reward": 0.37500000968575475, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 0.4266666666666667, | |
| "eval_completion_length": 626.6472381591797, | |
| "eval_kl": 10.508958333333334, | |
| "eval_loss": 1.6093299388885498, | |
| "eval_reward": 0.24388889610767364, | |
| "eval_reward_std": 0.9690509649117788, | |
| "eval_rewards/accuracy_reward": 0.08500000188748041, | |
| "eval_rewards/format_reward": 0.1588888943195343, | |
| "eval_runtime": 1086.9778, | |
| "eval_samples_per_second": 0.276, | |
| "eval_steps_per_second": 0.012, | |
| "step": 800 | |
| }, | |
| { | |
| "completion_length": 575.5916809082031, | |
| "epoch": 0.42933333333333334, | |
| "grad_norm": 105.59420013427734, | |
| "kl": 8.5296875, | |
| "learning_rate": 7.047000901883645e-07, | |
| "loss": 1.419, | |
| "reward": 0.4041666805744171, | |
| "reward_std": 0.9605139315128326, | |
| "rewards/accuracy_reward": 0.21250000409781933, | |
| "rewards/format_reward": 0.19166667610406876, | |
| "step": 805 | |
| }, | |
| { | |
| "completion_length": 561.8750213623047, | |
| "epoch": 0.432, | |
| "grad_norm": 40.40248489379883, | |
| "kl": 7.340625, | |
| "learning_rate": 7.004437269458894e-07, | |
| "loss": 1.4182, | |
| "reward": 0.45833334140479565, | |
| "reward_std": 0.9228455007076264, | |
| "rewards/accuracy_reward": 0.15000000558793544, | |
| "rewards/format_reward": 0.3083333432674408, | |
| "step": 810 | |
| }, | |
| { | |
| "completion_length": 499.4708450317383, | |
| "epoch": 0.43466666666666665, | |
| "grad_norm": 15.018860816955566, | |
| "kl": 7.36875, | |
| "learning_rate": 6.961699857383278e-07, | |
| "loss": 1.4916, | |
| "reward": 0.6500000096857548, | |
| "reward_std": 0.8621464431285858, | |
| "rewards/accuracy_reward": 0.2166666742414236, | |
| "rewards/format_reward": 0.43333334624767306, | |
| "step": 815 | |
| }, | |
| { | |
| "completion_length": 550.2083557128906, | |
| "epoch": 0.43733333333333335, | |
| "grad_norm": 161.48826599121094, | |
| "kl": 5.7875, | |
| "learning_rate": 6.91879237088253e-07, | |
| "loss": 1.3703, | |
| "reward": 0.6125000208616257, | |
| "reward_std": 0.9770530998706818, | |
| "rewards/accuracy_reward": 0.27083334475755694, | |
| "rewards/format_reward": 0.34166667237877846, | |
| "step": 820 | |
| }, | |
| { | |
| "completion_length": 456.0291831970215, | |
| "epoch": 0.44, | |
| "grad_norm": 63.651100158691406, | |
| "kl": 6.596875, | |
| "learning_rate": 6.875718529927404e-07, | |
| "loss": 1.5676, | |
| "reward": 0.7250000149011612, | |
| "reward_std": 0.8462802618741989, | |
| "rewards/accuracy_reward": 0.24166667684912682, | |
| "rewards/format_reward": 0.48333334624767305, | |
| "step": 825 | |
| }, | |
| { | |
| "completion_length": 405.82500610351565, | |
| "epoch": 0.44266666666666665, | |
| "grad_norm": 75.36550903320312, | |
| "kl": 5.9515625, | |
| "learning_rate": 6.832482068911166e-07, | |
| "loss": 1.8022, | |
| "reward": 0.8083333611488343, | |
| "reward_std": 0.8616673052310944, | |
| "rewards/accuracy_reward": 0.2500000074505806, | |
| "rewards/format_reward": 0.5583333462476731, | |
| "step": 830 | |
| }, | |
| { | |
| "completion_length": 348.73751068115234, | |
| "epoch": 0.44533333333333336, | |
| "grad_norm": 66.93795013427734, | |
| "kl": 5.48984375, | |
| "learning_rate": 6.789086736325834e-07, | |
| "loss": 1.1328, | |
| "reward": 0.9583333611488343, | |
| "reward_std": 0.6873706102371215, | |
| "rewards/accuracy_reward": 0.325000012293458, | |
| "rewards/format_reward": 0.6333333551883698, | |
| "step": 835 | |
| }, | |
| { | |
| "completion_length": 301.7250061035156, | |
| "epoch": 0.448, | |
| "grad_norm": 38.354427337646484, | |
| "kl": 2.6962890625, | |
| "learning_rate": 6.745536294437186e-07, | |
| "loss": 0.9267, | |
| "reward": 1.0375000298023225, | |
| "reward_std": 0.5275938391685486, | |
| "rewards/accuracy_reward": 0.32083333767950534, | |
| "rewards/format_reward": 0.7166666865348816, | |
| "step": 840 | |
| }, | |
| { | |
| "completion_length": 353.62501220703126, | |
| "epoch": 0.45066666666666666, | |
| "grad_norm": 15.596612930297852, | |
| "kl": 5.7234375, | |
| "learning_rate": 6.701834518958586e-07, | |
| "loss": 1.764, | |
| "reward": 0.9833333730697632, | |
| "reward_std": 0.7794818341732025, | |
| "rewards/accuracy_reward": 0.35000001192092894, | |
| "rewards/format_reward": 0.6333333492279053, | |
| "step": 845 | |
| }, | |
| { | |
| "completion_length": 303.0666778564453, | |
| "epoch": 0.4533333333333333, | |
| "grad_norm": 38.20693588256836, | |
| "kl": 4.66875, | |
| "learning_rate": 6.657985198723643e-07, | |
| "loss": 1.1933, | |
| "reward": 0.9541666895151139, | |
| "reward_std": 0.6846331983804703, | |
| "rewards/accuracy_reward": 0.3041666798293591, | |
| "rewards/format_reward": 0.6500000149011612, | |
| "step": 850 | |
| }, | |
| { | |
| "completion_length": 320.7541793823242, | |
| "epoch": 0.456, | |
| "grad_norm": 18.968719482421875, | |
| "kl": 5.1609375, | |
| "learning_rate": 6.613992135357712e-07, | |
| "loss": 1.5211, | |
| "reward": 0.7916666865348816, | |
| "reward_std": 0.698268249630928, | |
| "rewards/accuracy_reward": 0.1500000052154064, | |
| "rewards/format_reward": 0.6416666865348816, | |
| "step": 855 | |
| }, | |
| { | |
| "completion_length": 317.71667633056643, | |
| "epoch": 0.45866666666666667, | |
| "grad_norm": 37.6462287902832, | |
| "kl": 4.6234375, | |
| "learning_rate": 6.569859142948327e-07, | |
| "loss": 1.3873, | |
| "reward": 0.9583333611488343, | |
| "reward_std": 0.6036534637212754, | |
| "rewards/accuracy_reward": 0.2750000063329935, | |
| "rewards/format_reward": 0.6833333522081375, | |
| "step": 860 | |
| }, | |
| { | |
| "completion_length": 405.2833450317383, | |
| "epoch": 0.4613333333333333, | |
| "grad_norm": 65.66597747802734, | |
| "kl": 6.6875, | |
| "learning_rate": 6.52559004771451e-07, | |
| "loss": 1.5637, | |
| "reward": 0.7166666906327009, | |
| "reward_std": 0.7335242480039597, | |
| "rewards/accuracy_reward": 0.20833333507180213, | |
| "rewards/format_reward": 0.5083333522081375, | |
| "step": 865 | |
| }, | |
| { | |
| "completion_length": 306.6833435058594, | |
| "epoch": 0.464, | |
| "grad_norm": 40.86724853515625, | |
| "kl": 3.3671875, | |
| "learning_rate": 6.481188687675057e-07, | |
| "loss": 0.9366, | |
| "reward": 0.9291666686534882, | |
| "reward_std": 0.5080332323908806, | |
| "rewards/accuracy_reward": 0.23750000223517417, | |
| "rewards/format_reward": 0.6916666865348816, | |
| "step": 870 | |
| }, | |
| { | |
| "completion_length": 263.4833419799805, | |
| "epoch": 0.4666666666666667, | |
| "grad_norm": 28.436981201171875, | |
| "kl": 3.7203125, | |
| "learning_rate": 6.436658912315788e-07, | |
| "loss": 1.093, | |
| "reward": 1.0041666984558106, | |
| "reward_std": 0.5717462062835693, | |
| "rewards/accuracy_reward": 0.2625000078231096, | |
| "rewards/format_reward": 0.7416666805744171, | |
| "step": 875 | |
| }, | |
| { | |
| "completion_length": 237.6666702270508, | |
| "epoch": 0.4693333333333333, | |
| "grad_norm": 36.36558532714844, | |
| "kl": 2.8359375, | |
| "learning_rate": 6.392004582255807e-07, | |
| "loss": 0.7905, | |
| "reward": 1.104166716337204, | |
| "reward_std": 0.5215684860944748, | |
| "rewards/accuracy_reward": 0.3375000111758709, | |
| "rewards/format_reward": 0.7666666865348816, | |
| "step": 880 | |
| }, | |
| { | |
| "completion_length": 254.6166732788086, | |
| "epoch": 0.472, | |
| "grad_norm": 87.91173553466797, | |
| "kl": 4.0609375, | |
| "learning_rate": 6.347229568912794e-07, | |
| "loss": 1.2084, | |
| "reward": 0.9250000298023224, | |
| "reward_std": 0.6195752292871475, | |
| "rewards/accuracy_reward": 0.2083333395421505, | |
| "rewards/format_reward": 0.7166666805744171, | |
| "step": 885 | |
| }, | |
| { | |
| "completion_length": 280.2916717529297, | |
| "epoch": 0.4746666666666667, | |
| "grad_norm": 33.61180114746094, | |
| "kl": 3.603125, | |
| "learning_rate": 6.302337754167369e-07, | |
| "loss": 1.1916, | |
| "reward": 1.0166666984558106, | |
| "reward_std": 0.6988473400473595, | |
| "rewards/accuracy_reward": 0.3000000089406967, | |
| "rewards/format_reward": 0.7166666805744171, | |
| "step": 890 | |
| }, | |
| { | |
| "completion_length": 193.06666946411133, | |
| "epoch": 0.47733333333333333, | |
| "grad_norm": 34.74186325073242, | |
| "kl": 2.271875, | |
| "learning_rate": 6.257333030026538e-07, | |
| "loss": 0.8303, | |
| "reward": 1.2000000476837158, | |
| "reward_std": 0.4528850480914116, | |
| "rewards/accuracy_reward": 0.3666666731238365, | |
| "rewards/format_reward": 0.8333333551883697, | |
| "step": 895 | |
| }, | |
| { | |
| "completion_length": 275.5500061035156, | |
| "epoch": 0.48, | |
| "grad_norm": 30.625368118286133, | |
| "kl": 5.39375, | |
| "learning_rate": 6.212219298286261e-07, | |
| "loss": 1.5581, | |
| "reward": 1.000000035762787, | |
| "reward_std": 0.6969257593154907, | |
| "rewards/accuracy_reward": 0.26666667237877845, | |
| "rewards/format_reward": 0.7333333551883697, | |
| "step": 900 | |
| }, | |
| { | |
| "completion_length": 214.02500610351564, | |
| "epoch": 0.4826666666666667, | |
| "grad_norm": 28.710886001586914, | |
| "kl": 3.11328125, | |
| "learning_rate": 6.167000470193188e-07, | |
| "loss": 1.1685, | |
| "reward": 0.9666666924953461, | |
| "reward_std": 0.5459988377988338, | |
| "rewards/accuracy_reward": 0.19166667386889458, | |
| "rewards/format_reward": 0.7750000119209289, | |
| "step": 905 | |
| }, | |
| { | |
| "completion_length": 224.62500610351563, | |
| "epoch": 0.48533333333333334, | |
| "grad_norm": 44.227230072021484, | |
| "kl": 2.56953125, | |
| "learning_rate": 6.121680466105559e-07, | |
| "loss": 0.9869, | |
| "reward": 0.9708333671092987, | |
| "reward_std": 0.5105708941817284, | |
| "rewards/accuracy_reward": 0.18750000819563867, | |
| "rewards/format_reward": 0.7833333551883698, | |
| "step": 910 | |
| }, | |
| { | |
| "completion_length": 184.80417404174804, | |
| "epoch": 0.488, | |
| "grad_norm": 24.200429916381836, | |
| "kl": 2.596875, | |
| "learning_rate": 6.076263215153307e-07, | |
| "loss": 0.8057, | |
| "reward": 1.1250000238418578, | |
| "reward_std": 0.4605237804353237, | |
| "rewards/accuracy_reward": 0.3000000089406967, | |
| "rewards/format_reward": 0.8250000178813934, | |
| "step": 915 | |
| }, | |
| { | |
| "completion_length": 234.8166763305664, | |
| "epoch": 0.49066666666666664, | |
| "grad_norm": 40.316471099853516, | |
| "kl": 4.540625, | |
| "learning_rate": 6.030752654897434e-07, | |
| "loss": 1.2207, | |
| "reward": 0.9833333611488342, | |
| "reward_std": 0.5229995906352997, | |
| "rewards/accuracy_reward": 0.20833333544433116, | |
| "rewards/format_reward": 0.7750000119209289, | |
| "step": 920 | |
| }, | |
| { | |
| "completion_length": 311.7916801452637, | |
| "epoch": 0.49333333333333335, | |
| "grad_norm": 34.297813415527344, | |
| "kl": 4.3859375, | |
| "learning_rate": 5.985152730988617e-07, | |
| "loss": 1.3617, | |
| "reward": 0.9625000298023224, | |
| "reward_std": 0.7176508605480194, | |
| "rewards/accuracy_reward": 0.2625000074505806, | |
| "rewards/format_reward": 0.700000011920929, | |
| "step": 925 | |
| }, | |
| { | |
| "completion_length": 182.29167404174805, | |
| "epoch": 0.496, | |
| "grad_norm": 15.424689292907715, | |
| "kl": 2.127734375, | |
| "learning_rate": 5.939467396825136e-07, | |
| "loss": 0.7552, | |
| "reward": 1.1791667103767396, | |
| "reward_std": 0.3441163420677185, | |
| "rewards/accuracy_reward": 0.31250000596046446, | |
| "rewards/format_reward": 0.8666666865348815, | |
| "step": 930 | |
| }, | |
| { | |
| "completion_length": 286.0208404541016, | |
| "epoch": 0.49866666666666665, | |
| "grad_norm": 33.052181243896484, | |
| "kl": 3.21015625, | |
| "learning_rate": 5.893700613210127e-07, | |
| "loss": 1.0319, | |
| "reward": 0.9208333551883697, | |
| "reward_std": 0.6939111322164535, | |
| "rewards/accuracy_reward": 0.23750000558793544, | |
| "rewards/format_reward": 0.6833333432674408, | |
| "step": 935 | |
| }, | |
| { | |
| "completion_length": 343.77500610351564, | |
| "epoch": 0.5013333333333333, | |
| "grad_norm": 133.8216094970703, | |
| "kl": 5.746875, | |
| "learning_rate": 5.847856348008188e-07, | |
| "loss": 1.4352, | |
| "reward": 0.9500000268220902, | |
| "reward_std": 0.7578961223363876, | |
| "rewards/accuracy_reward": 0.3000000074505806, | |
| "rewards/format_reward": 0.6500000178813934, | |
| "step": 940 | |
| }, | |
| { | |
| "completion_length": 435.433349609375, | |
| "epoch": 0.504, | |
| "grad_norm": 20.99445152282715, | |
| "kl": 4.63515625, | |
| "learning_rate": 5.801938575801371e-07, | |
| "loss": 1.3974, | |
| "reward": 0.7708333671092987, | |
| "reward_std": 0.926627391576767, | |
| "rewards/accuracy_reward": 0.27083334103226664, | |
| "rewards/format_reward": 0.5000000163912773, | |
| "step": 945 | |
| }, | |
| { | |
| "completion_length": 370.6958404541016, | |
| "epoch": 0.5066666666666667, | |
| "grad_norm": 8.884126663208008, | |
| "kl": 3.3875, | |
| "learning_rate": 5.755951277544607e-07, | |
| "loss": 1.131, | |
| "reward": 0.8166666984558105, | |
| "reward_std": 0.764404758810997, | |
| "rewards/accuracy_reward": 0.2333333395421505, | |
| "rewards/format_reward": 0.5833333551883697, | |
| "step": 950 | |
| }, | |
| { | |
| "completion_length": 340.5333419799805, | |
| "epoch": 0.5093333333333333, | |
| "grad_norm": 49.03192138671875, | |
| "kl": 3.83671875, | |
| "learning_rate": 5.709898440220551e-07, | |
| "loss": 1.1697, | |
| "reward": 0.9958333760499954, | |
| "reward_std": 0.7807205557823181, | |
| "rewards/accuracy_reward": 0.37916667461395265, | |
| "rewards/format_reward": 0.6166666805744171, | |
| "step": 955 | |
| }, | |
| { | |
| "completion_length": 263.5541793823242, | |
| "epoch": 0.512, | |
| "grad_norm": 9.941174507141113, | |
| "kl": 2.30234375, | |
| "learning_rate": 5.663784056493936e-07, | |
| "loss": 0.9917, | |
| "reward": 1.1750000298023224, | |
| "reward_std": 0.5471759930253028, | |
| "rewards/accuracy_reward": 0.4166666727513075, | |
| "rewards/format_reward": 0.7583333492279053, | |
| "step": 960 | |
| }, | |
| { | |
| "completion_length": 298.9041778564453, | |
| "epoch": 0.5146666666666667, | |
| "grad_norm": 28.354358673095703, | |
| "kl": 3.20546875, | |
| "learning_rate": 5.61761212436541e-07, | |
| "loss": 1.051, | |
| "reward": 0.862500011920929, | |
| "reward_std": 0.6203906744718551, | |
| "rewards/accuracy_reward": 0.19583333656191826, | |
| "rewards/format_reward": 0.6666666805744171, | |
| "step": 965 | |
| }, | |
| { | |
| "completion_length": 181.4708381652832, | |
| "epoch": 0.5173333333333333, | |
| "grad_norm": 10.501627922058105, | |
| "kl": 2.1251953125, | |
| "learning_rate": 5.571386646824922e-07, | |
| "loss": 0.4847, | |
| "reward": 1.1541667103767395, | |
| "reward_std": 0.2954378850758076, | |
| "rewards/accuracy_reward": 0.29583333879709245, | |
| "rewards/format_reward": 0.8583333492279053, | |
| "step": 970 | |
| }, | |
| { | |
| "completion_length": 157.45000381469725, | |
| "epoch": 0.52, | |
| "grad_norm": 8.8187894821167, | |
| "kl": 1.551953125, | |
| "learning_rate": 5.525111631504677e-07, | |
| "loss": 0.5822, | |
| "reward": 1.1541666984558105, | |
| "reward_std": 0.36384222060441973, | |
| "rewards/accuracy_reward": 0.27083334028720857, | |
| "rewards/format_reward": 0.8833333492279053, | |
| "step": 975 | |
| }, | |
| { | |
| "completion_length": 146.86666870117188, | |
| "epoch": 0.5226666666666666, | |
| "grad_norm": 8.133127212524414, | |
| "kl": 1.458203125, | |
| "learning_rate": 5.478791090331677e-07, | |
| "loss": 0.5357, | |
| "reward": 1.2083333730697632, | |
| "reward_std": 0.3662621095776558, | |
| "rewards/accuracy_reward": 0.3166666738688946, | |
| "rewards/format_reward": 0.891666692495346, | |
| "step": 980 | |
| }, | |
| { | |
| "completion_length": 128.45417251586915, | |
| "epoch": 0.5253333333333333, | |
| "grad_norm": 15.48926830291748, | |
| "kl": 1.6015625, | |
| "learning_rate": 5.432429039179899e-07, | |
| "loss": 0.3506, | |
| "reward": 1.1458333671092986, | |
| "reward_std": 0.2651623532176018, | |
| "rewards/accuracy_reward": 0.2541666731238365, | |
| "rewards/format_reward": 0.891666692495346, | |
| "step": 985 | |
| }, | |
| { | |
| "completion_length": 146.4000045776367, | |
| "epoch": 0.528, | |
| "grad_norm": 5.376186370849609, | |
| "kl": 1.834765625, | |
| "learning_rate": 5.386029497522133e-07, | |
| "loss": 0.6246, | |
| "reward": 1.2416666865348815, | |
| "reward_std": 0.35661301463842393, | |
| "rewards/accuracy_reward": 0.3583333447575569, | |
| "rewards/format_reward": 0.8833333551883698, | |
| "step": 990 | |
| }, | |
| { | |
| "completion_length": 177.82500457763672, | |
| "epoch": 0.5306666666666666, | |
| "grad_norm": 22.633089065551758, | |
| "kl": 1.623828125, | |
| "learning_rate": 5.3395964880815e-07, | |
| "loss": 0.6805, | |
| "reward": 1.141666692495346, | |
| "reward_std": 0.358070158213377, | |
| "rewards/accuracy_reward": 0.27500000707805156, | |
| "rewards/format_reward": 0.8666666805744171, | |
| "step": 995 | |
| }, | |
| { | |
| "completion_length": 196.29583969116212, | |
| "epoch": 0.5333333333333333, | |
| "grad_norm": 47.34584426879883, | |
| "kl": 1.69375, | |
| "learning_rate": 5.293134036482698e-07, | |
| "loss": 0.823, | |
| "reward": 1.087500023841858, | |
| "reward_std": 0.3883266061544418, | |
| "rewards/accuracy_reward": 0.24583334140479565, | |
| "rewards/format_reward": 0.8416666924953461, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.5333333333333333, | |
| "eval_completion_length": 175.36000528971354, | |
| "eval_kl": 1.351640625, | |
| "eval_loss": 0.610393226146698, | |
| "eval_reward": 1.0127778077125549, | |
| "eval_reward_std": 0.40122534612814587, | |
| "eval_rewards/accuracy_reward": 0.1438888931274414, | |
| "eval_rewards/format_reward": 0.8688889082272847, | |
| "eval_runtime": 672.1413, | |
| "eval_samples_per_second": 0.446, | |
| "eval_steps_per_second": 0.019, | |
| "step": 1000 | |
| }, | |
| { | |
| "completion_length": 131.97083587646483, | |
| "epoch": 0.536, | |
| "grad_norm": 80.59904479980469, | |
| "kl": 1.5953125, | |
| "learning_rate": 5.246646170902975e-07, | |
| "loss": 0.4407, | |
| "reward": 1.1833333492279052, | |
| "reward_std": 0.2533145576715469, | |
| "rewards/accuracy_reward": 0.26666667461395266, | |
| "rewards/format_reward": 0.9166666746139527, | |
| "step": 1005 | |
| }, | |
| { | |
| "completion_length": 149.46666946411133, | |
| "epoch": 0.5386666666666666, | |
| "grad_norm": 6.973750591278076, | |
| "kl": 1.4193359375, | |
| "learning_rate": 5.200136921722918e-07, | |
| "loss": 0.5413, | |
| "reward": 1.1666666924953462, | |
| "reward_std": 0.3360213190317154, | |
| "rewards/accuracy_reward": 0.26666667833924296, | |
| "rewards/format_reward": 0.9000000178813934, | |
| "step": 1010 | |
| }, | |
| { | |
| "completion_length": 178.3958366394043, | |
| "epoch": 0.5413333333333333, | |
| "grad_norm": 10.757118225097656, | |
| "kl": 1.02265625, | |
| "learning_rate": 5.153610321177013e-07, | |
| "loss": 0.4342, | |
| "reward": 1.0666667103767395, | |
| "reward_std": 0.2775675721466541, | |
| "rewards/accuracy_reward": 0.17500000670552254, | |
| "rewards/format_reward": 0.8916666865348816, | |
| "step": 1015 | |
| }, | |
| { | |
| "completion_length": 265.3958404541016, | |
| "epoch": 0.544, | |
| "grad_norm": 11.506460189819336, | |
| "kl": 2.033984375, | |
| "learning_rate": 5.107070403004066e-07, | |
| "loss": 0.8198, | |
| "reward": 0.8750000298023224, | |
| "reward_std": 0.6099086761474609, | |
| "rewards/accuracy_reward": 0.15833333767950536, | |
| "rewards/format_reward": 0.7166666865348816, | |
| "step": 1020 | |
| }, | |
| { | |
| "completion_length": 302.5041778564453, | |
| "epoch": 0.5466666666666666, | |
| "grad_norm": 16.32666015625, | |
| "kl": 2.891015625, | |
| "learning_rate": 5.060521202097489e-07, | |
| "loss": 1.0253, | |
| "reward": 1.025000014901161, | |
| "reward_std": 0.5060222968459129, | |
| "rewards/accuracy_reward": 0.27500000186264517, | |
| "rewards/format_reward": 0.7500000208616256, | |
| "step": 1025 | |
| }, | |
| { | |
| "completion_length": 416.4416763305664, | |
| "epoch": 0.5493333333333333, | |
| "grad_norm": 15.603365898132324, | |
| "kl": 2.96328125, | |
| "learning_rate": 5.013966754155482e-07, | |
| "loss": 1.1611, | |
| "reward": 0.7500000238418579, | |
| "reward_std": 0.8826716184616089, | |
| "rewards/accuracy_reward": 0.2000000063329935, | |
| "rewards/format_reward": 0.5500000149011612, | |
| "step": 1030 | |
| }, | |
| { | |
| "completion_length": 368.07500915527345, | |
| "epoch": 0.552, | |
| "grad_norm": 10.401843070983887, | |
| "kl": 3.10078125, | |
| "learning_rate": 4.967411095331149e-07, | |
| "loss": 1.114, | |
| "reward": 0.8791666865348816, | |
| "reward_std": 0.7436762899160385, | |
| "rewards/accuracy_reward": 0.2708333432674408, | |
| "rewards/format_reward": 0.608333346247673, | |
| "step": 1035 | |
| }, | |
| { | |
| "completion_length": 253.62917251586913, | |
| "epoch": 0.5546666666666666, | |
| "grad_norm": 8.17926025390625, | |
| "kl": 2.52109375, | |
| "learning_rate": 4.920858261882577e-07, | |
| "loss": 0.9692, | |
| "reward": 1.0166667014360429, | |
| "reward_std": 0.7252250477671623, | |
| "rewards/accuracy_reward": 0.3000000059604645, | |
| "rewards/format_reward": 0.7166666775941849, | |
| "step": 1040 | |
| }, | |
| { | |
| "completion_length": 202.72500228881836, | |
| "epoch": 0.5573333333333333, | |
| "grad_norm": 23.440996170043945, | |
| "kl": 2.301953125, | |
| "learning_rate": 4.874312289822899e-07, | |
| "loss": 0.8462, | |
| "reward": 1.1583333671092988, | |
| "reward_std": 0.5490816205739975, | |
| "rewards/accuracy_reward": 0.3333333469927311, | |
| "rewards/format_reward": 0.825000011920929, | |
| "step": 1045 | |
| }, | |
| { | |
| "completion_length": 236.8416748046875, | |
| "epoch": 0.56, | |
| "grad_norm": 6.160148620605469, | |
| "kl": 2.3287109375, | |
| "learning_rate": 4.827777214570384e-07, | |
| "loss": 0.9984, | |
| "reward": 1.066666704416275, | |
| "reward_std": 0.5855829656124115, | |
| "rewards/accuracy_reward": 0.2833333447575569, | |
| "rewards/format_reward": 0.7833333551883698, | |
| "step": 1050 | |
| }, | |
| { | |
| "completion_length": 160.41250305175782, | |
| "epoch": 0.5626666666666666, | |
| "grad_norm": 59.35076904296875, | |
| "kl": 2.099609375, | |
| "learning_rate": 4.781257070598571e-07, | |
| "loss": 0.6582, | |
| "reward": 1.2125000357627869, | |
| "reward_std": 0.3614451542496681, | |
| "rewards/accuracy_reward": 0.3291666731238365, | |
| "rewards/format_reward": 0.8833333492279053, | |
| "step": 1055 | |
| }, | |
| { | |
| "completion_length": 216.84167404174804, | |
| "epoch": 0.5653333333333334, | |
| "grad_norm": 11.944228172302246, | |
| "kl": 1.9265625, | |
| "learning_rate": 4.734755891086498e-07, | |
| "loss": 0.8889, | |
| "reward": 0.9833333671092988, | |
| "reward_std": 0.4978467658162117, | |
| "rewards/accuracy_reward": 0.1583333358168602, | |
| "rewards/format_reward": 0.8250000238418579, | |
| "step": 1060 | |
| }, | |
| { | |
| "completion_length": 204.32500534057618, | |
| "epoch": 0.568, | |
| "grad_norm": 20.293838500976562, | |
| "kl": 2.085546875, | |
| "learning_rate": 4.6882777075690346e-07, | |
| "loss": 0.6455, | |
| "reward": 1.1208333671092987, | |
| "reward_std": 0.46157447397708895, | |
| "rewards/accuracy_reward": 0.3041666723787785, | |
| "rewards/format_reward": 0.8166666865348816, | |
| "step": 1065 | |
| }, | |
| { | |
| "completion_length": 239.66666946411132, | |
| "epoch": 0.5706666666666667, | |
| "grad_norm": 8.901671409606934, | |
| "kl": 1.844140625, | |
| "learning_rate": 4.6418265495873516e-07, | |
| "loss": 0.7643, | |
| "reward": 1.1750000357627868, | |
| "reward_std": 0.4657398253679276, | |
| "rewards/accuracy_reward": 0.35833334624767305, | |
| "rewards/format_reward": 0.8166666865348816, | |
| "step": 1070 | |
| }, | |
| { | |
| "completion_length": 236.07917327880858, | |
| "epoch": 0.5733333333333334, | |
| "grad_norm": 10.970005989074707, | |
| "kl": 1.715234375, | |
| "learning_rate": 4.595406444339576e-07, | |
| "loss": 0.7525, | |
| "reward": 1.0000000417232513, | |
| "reward_std": 0.4881373070180416, | |
| "rewards/accuracy_reward": 0.19166667051613331, | |
| "rewards/format_reward": 0.8083333492279052, | |
| "step": 1075 | |
| }, | |
| { | |
| "completion_length": 317.90001220703124, | |
| "epoch": 0.576, | |
| "grad_norm": 14.551898956298828, | |
| "kl": 3.174609375, | |
| "learning_rate": 4.5490214163316397e-07, | |
| "loss": 0.9734, | |
| "reward": 0.8916666984558106, | |
| "reward_std": 0.6427461057901382, | |
| "rewards/accuracy_reward": 0.20833333693444728, | |
| "rewards/format_reward": 0.6833333522081375, | |
| "step": 1080 | |
| }, | |
| { | |
| "completion_length": 227.68750457763673, | |
| "epoch": 0.5786666666666667, | |
| "grad_norm": 9.023757934570312, | |
| "kl": 1.8625, | |
| "learning_rate": 4.502675487028369e-07, | |
| "loss": 0.8347, | |
| "reward": 1.0416666984558105, | |
| "reward_std": 0.5368030399084092, | |
| "rewards/accuracy_reward": 0.2416666727513075, | |
| "rewards/format_reward": 0.8000000238418579, | |
| "step": 1085 | |
| }, | |
| { | |
| "completion_length": 288.81667404174806, | |
| "epoch": 0.5813333333333334, | |
| "grad_norm": 27.050466537475586, | |
| "kl": 2.22265625, | |
| "learning_rate": 4.456372674504828e-07, | |
| "loss": 0.9977, | |
| "reward": 0.9666666924953461, | |
| "reward_std": 0.5600151270627975, | |
| "rewards/accuracy_reward": 0.23333334028720856, | |
| "rewards/format_reward": 0.7333333492279053, | |
| "step": 1090 | |
| }, | |
| { | |
| "completion_length": 233.35834045410155, | |
| "epoch": 0.584, | |
| "grad_norm": 99.57305908203125, | |
| "kl": 2.521875, | |
| "learning_rate": 4.4101169930979677e-07, | |
| "loss": 1.0178, | |
| "reward": 1.0416666984558105, | |
| "reward_std": 0.5232247993350029, | |
| "rewards/accuracy_reward": 0.2500000074505806, | |
| "rewards/format_reward": 0.7916666865348816, | |
| "step": 1095 | |
| }, | |
| { | |
| "completion_length": 237.47084121704103, | |
| "epoch": 0.5866666666666667, | |
| "grad_norm": 32.285587310791016, | |
| "kl": 2.566796875, | |
| "learning_rate": 4.3639124530585885e-07, | |
| "loss": 0.9553, | |
| "reward": 1.0416666984558105, | |
| "reward_std": 0.49489557296037673, | |
| "rewards/accuracy_reward": 0.24166667237877845, | |
| "rewards/format_reward": 0.8000000238418579, | |
| "step": 1100 | |
| }, | |
| { | |
| "completion_length": 263.5375045776367, | |
| "epoch": 0.5893333333333334, | |
| "grad_norm": 14.501636505126953, | |
| "kl": 2.036328125, | |
| "learning_rate": 4.317763060203664e-07, | |
| "loss": 0.7803, | |
| "reward": 1.0916667044162751, | |
| "reward_std": 0.5974579885601997, | |
| "rewards/accuracy_reward": 0.34166668020188806, | |
| "rewards/format_reward": 0.7500000119209289, | |
| "step": 1105 | |
| }, | |
| { | |
| "completion_length": 190.2458381652832, | |
| "epoch": 0.592, | |
| "grad_norm": 28.570791244506836, | |
| "kl": 2.1984375, | |
| "learning_rate": 4.271672815569047e-07, | |
| "loss": 0.7099, | |
| "reward": 1.1125000298023224, | |
| "reward_std": 0.4488224387168884, | |
| "rewards/accuracy_reward": 0.2958333443850279, | |
| "rewards/format_reward": 0.8166666924953461, | |
| "step": 1110 | |
| }, | |
| { | |
| "completion_length": 191.68750381469727, | |
| "epoch": 0.5946666666666667, | |
| "grad_norm": 56.26449203491211, | |
| "kl": 1.941015625, | |
| "learning_rate": 4.2256457150625847e-07, | |
| "loss": 0.7596, | |
| "reward": 1.1041666984558105, | |
| "reward_std": 0.43807603493332864, | |
| "rewards/accuracy_reward": 0.27916667200624945, | |
| "rewards/format_reward": 0.8250000238418579, | |
| "step": 1115 | |
| }, | |
| { | |
| "completion_length": 164.62500534057617, | |
| "epoch": 0.5973333333333334, | |
| "grad_norm": 159.04928588867188, | |
| "kl": 2.11171875, | |
| "learning_rate": 4.1796857491176966e-07, | |
| "loss": 0.7277, | |
| "reward": 1.200000035762787, | |
| "reward_std": 0.3421368353068829, | |
| "rewards/accuracy_reward": 0.30833334289491177, | |
| "rewards/format_reward": 0.8916666805744171, | |
| "step": 1120 | |
| }, | |
| { | |
| "completion_length": 216.95833892822264, | |
| "epoch": 0.6, | |
| "grad_norm": 12.805575370788574, | |
| "kl": 1.614453125, | |
| "learning_rate": 4.133796902347396e-07, | |
| "loss": 0.5934, | |
| "reward": 1.1250000417232513, | |
| "reward_std": 0.4264773324131966, | |
| "rewards/accuracy_reward": 0.3083333373069763, | |
| "rewards/format_reward": 0.8166666865348816, | |
| "step": 1125 | |
| }, | |
| { | |
| "completion_length": 219.3791748046875, | |
| "epoch": 0.6026666666666667, | |
| "grad_norm": 14.401493072509766, | |
| "kl": 2.0857421875, | |
| "learning_rate": 4.087983153198848e-07, | |
| "loss": 0.6371, | |
| "reward": 1.0958333671092988, | |
| "reward_std": 0.5428274616599083, | |
| "rewards/accuracy_reward": 0.3041666761040688, | |
| "rewards/format_reward": 0.7916666865348816, | |
| "step": 1130 | |
| }, | |
| { | |
| "completion_length": 189.86667556762694, | |
| "epoch": 0.6053333333333333, | |
| "grad_norm": 19.875732421875, | |
| "kl": 2.033203125, | |
| "learning_rate": 4.0422484736084414e-07, | |
| "loss": 0.6977, | |
| "reward": 1.0750000417232513, | |
| "reward_std": 0.40034623965620997, | |
| "rewards/accuracy_reward": 0.2083333395421505, | |
| "rewards/format_reward": 0.8666666805744171, | |
| "step": 1135 | |
| }, | |
| { | |
| "completion_length": 160.80417022705078, | |
| "epoch": 0.608, | |
| "grad_norm": 9.801048278808594, | |
| "kl": 1.8109375, | |
| "learning_rate": 3.9965968286574367e-07, | |
| "loss": 0.5807, | |
| "reward": 1.1375000476837158, | |
| "reward_std": 0.3900581821799278, | |
| "rewards/accuracy_reward": 0.2625000040978193, | |
| "rewards/format_reward": 0.8750000178813935, | |
| "step": 1140 | |
| }, | |
| { | |
| "completion_length": 157.98750534057618, | |
| "epoch": 0.6106666666666667, | |
| "grad_norm": 30.29570770263672, | |
| "kl": 1.40703125, | |
| "learning_rate": 3.951032176228199e-07, | |
| "loss": 0.5565, | |
| "reward": 1.200000035762787, | |
| "reward_std": 0.2764429710805416, | |
| "rewards/accuracy_reward": 0.2916666716337204, | |
| "rewards/format_reward": 0.9083333492279053, | |
| "step": 1145 | |
| }, | |
| { | |
| "completion_length": 180.90000610351564, | |
| "epoch": 0.6133333333333333, | |
| "grad_norm": 23.002822875976562, | |
| "kl": 1.65390625, | |
| "learning_rate": 3.9055584666610596e-07, | |
| "loss": 0.6413, | |
| "reward": 1.2041666984558106, | |
| "reward_std": 0.40405053198337554, | |
| "rewards/accuracy_reward": 0.33750001415610315, | |
| "rewards/format_reward": 0.8666666924953461, | |
| "step": 1150 | |
| }, | |
| { | |
| "completion_length": 186.8958396911621, | |
| "epoch": 0.616, | |
| "grad_norm": 12.507006645202637, | |
| "kl": 1.280078125, | |
| "learning_rate": 3.860179642411837e-07, | |
| "loss": 0.6498, | |
| "reward": 1.1333333432674408, | |
| "reward_std": 0.3774823874235153, | |
| "rewards/accuracy_reward": 0.2750000037252903, | |
| "rewards/format_reward": 0.8583333492279053, | |
| "step": 1155 | |
| }, | |
| { | |
| "completion_length": 155.95416946411132, | |
| "epoch": 0.6186666666666667, | |
| "grad_norm": 3.330310583114624, | |
| "kl": 1.2033203125, | |
| "learning_rate": 3.8148996377100304e-07, | |
| "loss": 0.4313, | |
| "reward": 1.2041667103767395, | |
| "reward_std": 0.3509316384792328, | |
| "rewards/accuracy_reward": 0.3125000074505806, | |
| "rewards/format_reward": 0.8916666805744171, | |
| "step": 1160 | |
| }, | |
| { | |
| "completion_length": 188.76667327880858, | |
| "epoch": 0.6213333333333333, | |
| "grad_norm": 40.26872634887695, | |
| "kl": 2.188671875, | |
| "learning_rate": 3.7697223782177303e-07, | |
| "loss": 0.6651, | |
| "reward": 1.2583333909511567, | |
| "reward_std": 0.4299319893121719, | |
| "rewards/accuracy_reward": 0.4083333432674408, | |
| "rewards/format_reward": 0.8500000178813935, | |
| "step": 1165 | |
| }, | |
| { | |
| "completion_length": 252.42917556762694, | |
| "epoch": 0.624, | |
| "grad_norm": 16.348541259765625, | |
| "kl": 2.37421875, | |
| "learning_rate": 3.724651780689285e-07, | |
| "loss": 0.9173, | |
| "reward": 1.0250000298023223, | |
| "reward_std": 0.5082567930221558, | |
| "rewards/accuracy_reward": 0.2666666731238365, | |
| "rewards/format_reward": 0.7583333551883698, | |
| "step": 1170 | |
| }, | |
| { | |
| "completion_length": 239.4791717529297, | |
| "epoch": 0.6266666666666667, | |
| "grad_norm": 17.348703384399414, | |
| "kl": 1.586328125, | |
| "learning_rate": 3.679691752631715e-07, | |
| "loss": 0.796, | |
| "reward": 1.1250000417232513, | |
| "reward_std": 0.4487001359462738, | |
| "rewards/accuracy_reward": 0.3250000089406967, | |
| "rewards/format_reward": 0.8000000238418579, | |
| "step": 1175 | |
| }, | |
| { | |
| "completion_length": 184.93750610351563, | |
| "epoch": 0.6293333333333333, | |
| "grad_norm": 5.322170734405518, | |
| "kl": 1.261328125, | |
| "learning_rate": 3.6348461919659433e-07, | |
| "loss": 0.5975, | |
| "reward": 1.200000023841858, | |
| "reward_std": 0.394177620112896, | |
| "rewards/accuracy_reward": 0.35000001043081286, | |
| "rewards/format_reward": 0.8500000178813935, | |
| "step": 1180 | |
| }, | |
| { | |
| "completion_length": 149.95417175292968, | |
| "epoch": 0.632, | |
| "grad_norm": 4.652257919311523, | |
| "kl": 1.3166015625, | |
| "learning_rate": 3.590118986688865e-07, | |
| "loss": 0.5187, | |
| "reward": 1.2375000357627868, | |
| "reward_std": 0.35248192474246026, | |
| "rewards/accuracy_reward": 0.34583334140479566, | |
| "rewards/format_reward": 0.8916666805744171, | |
| "step": 1185 | |
| }, | |
| { | |
| "completion_length": 214.28750915527343, | |
| "epoch": 0.6346666666666667, | |
| "grad_norm": 8.401276588439941, | |
| "kl": 1.615234375, | |
| "learning_rate": 3.5455140145362586e-07, | |
| "loss": 0.803, | |
| "reward": 1.1166667103767396, | |
| "reward_std": 0.385433167219162, | |
| "rewards/accuracy_reward": 0.2750000089406967, | |
| "rewards/format_reward": 0.8416666865348816, | |
| "step": 1190 | |
| }, | |
| { | |
| "completion_length": 243.75000686645507, | |
| "epoch": 0.6373333333333333, | |
| "grad_norm": 17.25551414489746, | |
| "kl": 1.5962890625, | |
| "learning_rate": 3.5010351426466003e-07, | |
| "loss": 0.7611, | |
| "reward": 1.1750000417232513, | |
| "reward_std": 0.5330882802605629, | |
| "rewards/accuracy_reward": 0.36666667759418486, | |
| "rewards/format_reward": 0.8083333492279052, | |
| "step": 1195 | |
| }, | |
| { | |
| "completion_length": 185.69167251586913, | |
| "epoch": 0.64, | |
| "grad_norm": 15.053462982177734, | |
| "kl": 1.40078125, | |
| "learning_rate": 3.4566862272257917e-07, | |
| "loss": 0.7929, | |
| "reward": 1.154166692495346, | |
| "reward_std": 0.4828508198261261, | |
| "rewards/accuracy_reward": 0.30416667461395264, | |
| "rewards/format_reward": 0.8500000178813935, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 0.64, | |
| "eval_completion_length": 263.9861196899414, | |
| "eval_kl": 2.4522395833333333, | |
| "eval_loss": 0.904167890548706, | |
| "eval_reward": 0.9144444712003073, | |
| "eval_reward_std": 0.5771439289053281, | |
| "eval_rewards/accuracy_reward": 0.16111111541589102, | |
| "eval_rewards/format_reward": 0.7533333551883697, | |
| "eval_runtime": 815.5292, | |
| "eval_samples_per_second": 0.368, | |
| "eval_steps_per_second": 0.016, | |
| "step": 1200 | |
| }, | |
| { | |
| "completion_length": 224.0416717529297, | |
| "epoch": 0.6426666666666667, | |
| "grad_norm": 14.006892204284668, | |
| "kl": 2.721875, | |
| "learning_rate": 3.412471113212837e-07, | |
| "loss": 0.8414, | |
| "reward": 1.0583333611488341, | |
| "reward_std": 0.5013190120458603, | |
| "rewards/accuracy_reward": 0.2583333384245634, | |
| "rewards/format_reward": 0.8000000178813934, | |
| "step": 1205 | |
| }, | |
| { | |
| "completion_length": 250.73750228881835, | |
| "epoch": 0.6453333333333333, | |
| "grad_norm": 4.346928596496582, | |
| "kl": 1.89140625, | |
| "learning_rate": 3.3683936339464955e-07, | |
| "loss": 0.6544, | |
| "reward": 1.1083333611488342, | |
| "reward_std": 0.5055985808372497, | |
| "rewards/accuracy_reward": 0.33333334103226664, | |
| "rewards/format_reward": 0.7750000178813934, | |
| "step": 1210 | |
| }, | |
| { | |
| "completion_length": 262.02500915527344, | |
| "epoch": 0.648, | |
| "grad_norm": 8.122538566589355, | |
| "kl": 1.6572265625, | |
| "learning_rate": 3.324457610832941e-07, | |
| "loss": 0.8986, | |
| "reward": 1.1541667103767395, | |
| "reward_std": 0.5832742094993592, | |
| "rewards/accuracy_reward": 0.3791666805744171, | |
| "rewards/format_reward": 0.775000023841858, | |
| "step": 1215 | |
| }, | |
| { | |
| "completion_length": 245.7916717529297, | |
| "epoch": 0.6506666666666666, | |
| "grad_norm": 11.933501243591309, | |
| "kl": 1.238671875, | |
| "learning_rate": 3.280666853014457e-07, | |
| "loss": 0.7222, | |
| "reward": 1.1958333611488343, | |
| "reward_std": 0.5057858511805534, | |
| "rewards/accuracy_reward": 0.40416667573153975, | |
| "rewards/format_reward": 0.7916666746139527, | |
| "step": 1220 | |
| }, | |
| { | |
| "completion_length": 311.77917633056643, | |
| "epoch": 0.6533333333333333, | |
| "grad_norm": 13.826354026794434, | |
| "kl": 2.342578125, | |
| "learning_rate": 3.2370251570391925e-07, | |
| "loss": 0.9241, | |
| "reward": 0.941666704416275, | |
| "reward_std": 0.5843323901295662, | |
| "rewards/accuracy_reward": 0.2000000037252903, | |
| "rewards/format_reward": 0.7416666865348815, | |
| "step": 1225 | |
| }, | |
| { | |
| "completion_length": 288.49167709350587, | |
| "epoch": 0.656, | |
| "grad_norm": 12.062870979309082, | |
| "kl": 2.384375, | |
| "learning_rate": 3.1935363065320126e-07, | |
| "loss": 0.9282, | |
| "reward": 0.9458333492279053, | |
| "reward_std": 0.5712588280439377, | |
| "rewards/accuracy_reward": 0.19583333730697633, | |
| "rewards/format_reward": 0.7500000119209289, | |
| "step": 1230 | |
| }, | |
| { | |
| "completion_length": 235.39584197998047, | |
| "epoch": 0.6586666666666666, | |
| "grad_norm": 7.450344085693359, | |
| "kl": 1.758203125, | |
| "learning_rate": 3.150204071866464e-07, | |
| "loss": 0.6324, | |
| "reward": 1.1458333611488343, | |
| "reward_std": 0.5423780143260956, | |
| "rewards/accuracy_reward": 0.36250000819563866, | |
| "rewards/format_reward": 0.7833333551883698, | |
| "step": 1235 | |
| }, | |
| { | |
| "completion_length": 197.81250762939453, | |
| "epoch": 0.6613333333333333, | |
| "grad_norm": 11.524059295654297, | |
| "kl": 1.31171875, | |
| "learning_rate": 3.107032209837892e-07, | |
| "loss": 0.5257, | |
| "reward": 1.0458333551883698, | |
| "reward_std": 0.334098968654871, | |
| "rewards/accuracy_reward": 0.19583333805203437, | |
| "rewards/format_reward": 0.850000011920929, | |
| "step": 1240 | |
| }, | |
| { | |
| "completion_length": 247.20000762939452, | |
| "epoch": 0.664, | |
| "grad_norm": 7.961787700653076, | |
| "kl": 1.726171875, | |
| "learning_rate": 3.064024463337747e-07, | |
| "loss": 0.8873, | |
| "reward": 0.916666692495346, | |
| "reward_std": 0.4848631680011749, | |
| "rewards/accuracy_reward": 0.14166667088866233, | |
| "rewards/format_reward": 0.775000023841858, | |
| "step": 1245 | |
| }, | |
| { | |
| "completion_length": 229.66667633056642, | |
| "epoch": 0.6666666666666666, | |
| "grad_norm": 23.106739044189453, | |
| "kl": 1.5203125, | |
| "learning_rate": 3.021184561029071e-07, | |
| "loss": 0.8455, | |
| "reward": 1.1083333611488342, | |
| "reward_std": 0.45471236705780027, | |
| "rewards/accuracy_reward": 0.28333334140479566, | |
| "rewards/format_reward": 0.8250000238418579, | |
| "step": 1250 | |
| }, | |
| { | |
| "completion_length": 316.7458381652832, | |
| "epoch": 0.6693333333333333, | |
| "grad_norm": 9.744142532348633, | |
| "kl": 2.18046875, | |
| "learning_rate": 2.9785162170232424e-07, | |
| "loss": 0.9333, | |
| "reward": 0.829166692495346, | |
| "reward_std": 0.6003586441278458, | |
| "rewards/accuracy_reward": 0.13750000558793546, | |
| "rewards/format_reward": 0.6916666984558105, | |
| "step": 1255 | |
| }, | |
| { | |
| "completion_length": 242.23750762939454, | |
| "epoch": 0.672, | |
| "grad_norm": 25.725797653198242, | |
| "kl": 1.94375, | |
| "learning_rate": 2.936023130557964e-07, | |
| "loss": 0.7897, | |
| "reward": 1.079166704416275, | |
| "reward_std": 0.5282017394900322, | |
| "rewards/accuracy_reward": 0.2875000059604645, | |
| "rewards/format_reward": 0.7916666865348816, | |
| "step": 1260 | |
| }, | |
| { | |
| "completion_length": 226.35417251586915, | |
| "epoch": 0.6746666666666666, | |
| "grad_norm": 4.361880779266357, | |
| "kl": 1.287109375, | |
| "learning_rate": 2.893708985676556e-07, | |
| "loss": 0.7382, | |
| "reward": 1.1708333611488342, | |
| "reward_std": 0.4796324223279953, | |
| "rewards/accuracy_reward": 0.33750001415610315, | |
| "rewards/format_reward": 0.8333333551883697, | |
| "step": 1265 | |
| }, | |
| { | |
| "completion_length": 179.21250839233397, | |
| "epoch": 0.6773333333333333, | |
| "grad_norm": 10.620832443237305, | |
| "kl": 1.5546875, | |
| "learning_rate": 2.851577450908553e-07, | |
| "loss": 0.5363, | |
| "reward": 1.204166692495346, | |
| "reward_std": 0.39730483293533325, | |
| "rewards/accuracy_reward": 0.3291666775941849, | |
| "rewards/format_reward": 0.8750000238418579, | |
| "step": 1270 | |
| }, | |
| { | |
| "completion_length": 225.60834045410155, | |
| "epoch": 0.68, | |
| "grad_norm": 24.622392654418945, | |
| "kl": 1.930859375, | |
| "learning_rate": 2.809632178951655e-07, | |
| "loss": 0.7525, | |
| "reward": 1.2000000476837158, | |
| "reward_std": 0.43045871555805204, | |
| "rewards/accuracy_reward": 0.3833333432674408, | |
| "rewards/format_reward": 0.8166666865348816, | |
| "step": 1275 | |
| }, | |
| { | |
| "completion_length": 189.73333892822265, | |
| "epoch": 0.6826666666666666, | |
| "grad_norm": 7.506571292877197, | |
| "kl": 1.469921875, | |
| "learning_rate": 2.767876806355045e-07, | |
| "loss": 0.6633, | |
| "reward": 1.1250000298023224, | |
| "reward_std": 0.4194158732891083, | |
| "rewards/accuracy_reward": 0.2666666753590107, | |
| "rewards/format_reward": 0.8583333551883697, | |
| "step": 1280 | |
| }, | |
| { | |
| "completion_length": 205.73334045410155, | |
| "epoch": 0.6853333333333333, | |
| "grad_norm": 6.358190059661865, | |
| "kl": 1.3841796875, | |
| "learning_rate": 2.7263149532041107e-07, | |
| "loss": 0.8039, | |
| "reward": 1.2166667103767395, | |
| "reward_std": 0.5374398469924927, | |
| "rewards/accuracy_reward": 0.3750000111758709, | |
| "rewards/format_reward": 0.8416666865348816, | |
| "step": 1285 | |
| }, | |
| { | |
| "completion_length": 210.5208427429199, | |
| "epoch": 0.688, | |
| "grad_norm": 27.677288055419922, | |
| "kl": 2.022265625, | |
| "learning_rate": 2.6849502228065955e-07, | |
| "loss": 0.8598, | |
| "reward": 1.0458333790302277, | |
| "reward_std": 0.47432570457458495, | |
| "rewards/accuracy_reward": 0.2291666693985462, | |
| "rewards/format_reward": 0.8166666924953461, | |
| "step": 1290 | |
| }, | |
| { | |
| "completion_length": 206.36667022705078, | |
| "epoch": 0.6906666666666667, | |
| "grad_norm": 27.595359802246094, | |
| "kl": 2.200390625, | |
| "learning_rate": 2.6437862013801937e-07, | |
| "loss": 0.7624, | |
| "reward": 1.075000023841858, | |
| "reward_std": 0.45619752407073977, | |
| "rewards/accuracy_reward": 0.27500000447034834, | |
| "rewards/format_reward": 0.8000000238418579, | |
| "step": 1295 | |
| }, | |
| { | |
| "completion_length": 281.9166717529297, | |
| "epoch": 0.6933333333333334, | |
| "grad_norm": 7.714141845703125, | |
| "kl": 3.14765625, | |
| "learning_rate": 2.6028264577416414e-07, | |
| "loss": 0.9706, | |
| "reward": 0.8500000357627868, | |
| "reward_std": 0.6301615715026856, | |
| "rewards/accuracy_reward": 0.1916666690260172, | |
| "rewards/format_reward": 0.6583333551883698, | |
| "step": 1300 | |
| }, | |
| { | |
| "completion_length": 227.537508392334, | |
| "epoch": 0.696, | |
| "grad_norm": 11.018620491027832, | |
| "kl": 1.490625, | |
| "learning_rate": 2.5620745429973046e-07, | |
| "loss": 0.7425, | |
| "reward": 1.0625000417232513, | |
| "reward_std": 0.47583652585744857, | |
| "rewards/accuracy_reward": 0.2708333406597376, | |
| "rewards/format_reward": 0.7916666865348816, | |
| "step": 1305 | |
| }, | |
| { | |
| "completion_length": 206.64584045410157, | |
| "epoch": 0.6986666666666667, | |
| "grad_norm": 7.6855149269104, | |
| "kl": 1.15, | |
| "learning_rate": 2.5215339902353093e-07, | |
| "loss": 0.6114, | |
| "reward": 1.2291666984558105, | |
| "reward_std": 0.40088424533605577, | |
| "rewards/accuracy_reward": 0.37083333879709246, | |
| "rewards/format_reward": 0.8583333492279053, | |
| "step": 1310 | |
| }, | |
| { | |
| "completion_length": 236.3791717529297, | |
| "epoch": 0.7013333333333334, | |
| "grad_norm": 36.869991302490234, | |
| "kl": 1.91953125, | |
| "learning_rate": 2.4812083142192323e-07, | |
| "loss": 0.8435, | |
| "reward": 1.1000000357627868, | |
| "reward_std": 0.501855157315731, | |
| "rewards/accuracy_reward": 0.30000000447034836, | |
| "rewards/format_reward": 0.8000000178813934, | |
| "step": 1315 | |
| }, | |
| { | |
| "completion_length": 239.29167404174805, | |
| "epoch": 0.704, | |
| "grad_norm": 34.20293045043945, | |
| "kl": 2.9728515625, | |
| "learning_rate": 2.441101011083378e-07, | |
| "loss": 0.652, | |
| "reward": 1.1500000417232514, | |
| "reward_std": 0.4413339100778103, | |
| "rewards/accuracy_reward": 0.33333334028720857, | |
| "rewards/format_reward": 0.8166666865348816, | |
| "step": 1320 | |
| }, | |
| { | |
| "completion_length": 196.09583740234376, | |
| "epoch": 0.7066666666666667, | |
| "grad_norm": 6.715369701385498, | |
| "kl": 1.90703125, | |
| "learning_rate": 2.4012155580296705e-07, | |
| "loss": 0.571, | |
| "reward": 1.3291667222976684, | |
| "reward_std": 0.47337266951799395, | |
| "rewards/accuracy_reward": 0.4958333492279053, | |
| "rewards/format_reward": 0.8333333492279053, | |
| "step": 1325 | |
| }, | |
| { | |
| "completion_length": 218.24584045410157, | |
| "epoch": 0.7093333333333334, | |
| "grad_norm": 6.281963348388672, | |
| "kl": 1.1951171875, | |
| "learning_rate": 2.3615554130262e-07, | |
| "loss": 0.7098, | |
| "reward": 1.2250000357627868, | |
| "reward_std": 0.4241747669875622, | |
| "rewards/accuracy_reward": 0.3916666805744171, | |
| "rewards/format_reward": 0.8333333551883697, | |
| "step": 1330 | |
| }, | |
| { | |
| "completion_length": 233.00000915527343, | |
| "epoch": 0.712, | |
| "grad_norm": 65.47384643554688, | |
| "kl": 1.837109375, | |
| "learning_rate": 2.3221240145074095e-07, | |
| "loss": 0.8178, | |
| "reward": 1.0833333551883697, | |
| "reward_std": 0.49581558406353, | |
| "rewards/accuracy_reward": 0.3000000059604645, | |
| "rewards/format_reward": 0.7833333551883698, | |
| "step": 1335 | |
| }, | |
| { | |
| "completion_length": 223.82500839233398, | |
| "epoch": 0.7146666666666667, | |
| "grad_norm": 4.103445529937744, | |
| "kl": 1.4453125, | |
| "learning_rate": 2.2829247810760021e-07, | |
| "loss": 0.7565, | |
| "reward": 1.075000023841858, | |
| "reward_std": 0.45809874683618546, | |
| "rewards/accuracy_reward": 0.2666666697710752, | |
| "rewards/format_reward": 0.8083333551883698, | |
| "step": 1340 | |
| }, | |
| { | |
| "completion_length": 205.72083892822266, | |
| "epoch": 0.7173333333333334, | |
| "grad_norm": 12.356497764587402, | |
| "kl": 1.340234375, | |
| "learning_rate": 2.2439611112065547e-07, | |
| "loss": 0.5622, | |
| "reward": 1.0833333671092986, | |
| "reward_std": 0.3753781244158745, | |
| "rewards/accuracy_reward": 0.2500000026077032, | |
| "rewards/format_reward": 0.8333333492279053, | |
| "step": 1345 | |
| }, | |
| { | |
| "completion_length": 223.57083969116212, | |
| "epoch": 0.72, | |
| "grad_norm": 17.527667999267578, | |
| "kl": 1.3921875, | |
| "learning_rate": 2.2052363829508776e-07, | |
| "loss": 0.6411, | |
| "reward": 1.2500000536441802, | |
| "reward_std": 0.46224844083189964, | |
| "rewards/accuracy_reward": 0.41666667014360426, | |
| "rewards/format_reward": 0.8333333551883697, | |
| "step": 1350 | |
| }, | |
| { | |
| "completion_length": 223.97083892822266, | |
| "epoch": 0.7226666666666667, | |
| "grad_norm": 6.988903522491455, | |
| "kl": 2.165234375, | |
| "learning_rate": 2.1667539536451452e-07, | |
| "loss": 0.7447, | |
| "reward": 1.004166704416275, | |
| "reward_std": 0.5843853443861008, | |
| "rewards/accuracy_reward": 0.2458333346992731, | |
| "rewards/format_reward": 0.7583333611488342, | |
| "step": 1355 | |
| }, | |
| { | |
| "completion_length": 207.48334045410155, | |
| "epoch": 0.7253333333333334, | |
| "grad_norm": 4.7572832107543945, | |
| "kl": 1.383203125, | |
| "learning_rate": 2.1285171596188268e-07, | |
| "loss": 0.6242, | |
| "reward": 1.2250000357627868, | |
| "reward_std": 0.4148427419364452, | |
| "rewards/accuracy_reward": 0.36666668206453323, | |
| "rewards/format_reward": 0.8583333551883697, | |
| "step": 1360 | |
| }, | |
| { | |
| "completion_length": 198.64167556762695, | |
| "epoch": 0.728, | |
| "grad_norm": 9.330974578857422, | |
| "kl": 1.2017578125, | |
| "learning_rate": 2.090529315905431e-07, | |
| "loss": 0.5328, | |
| "reward": 1.1500000298023223, | |
| "reward_std": 0.36292394176125525, | |
| "rewards/accuracy_reward": 0.3000000089406967, | |
| "rewards/format_reward": 0.8500000089406967, | |
| "step": 1365 | |
| }, | |
| { | |
| "completion_length": 175.92083740234375, | |
| "epoch": 0.7306666666666667, | |
| "grad_norm": 5.827265739440918, | |
| "kl": 1.030078125, | |
| "learning_rate": 2.052793715955104e-07, | |
| "loss": 0.4956, | |
| "reward": 1.2375000238418579, | |
| "reward_std": 0.41373512148857117, | |
| "rewards/accuracy_reward": 0.3625000089406967, | |
| "rewards/format_reward": 0.8750000178813935, | |
| "step": 1370 | |
| }, | |
| { | |
| "completion_length": 211.1041732788086, | |
| "epoch": 0.7333333333333333, | |
| "grad_norm": 19.677011489868164, | |
| "kl": 1.71640625, | |
| "learning_rate": 2.0153136313490943e-07, | |
| "loss": 0.6799, | |
| "reward": 1.1750000238418579, | |
| "reward_std": 0.4355910629034042, | |
| "rewards/accuracy_reward": 0.33333333767950535, | |
| "rewards/format_reward": 0.8416666805744171, | |
| "step": 1375 | |
| }, | |
| { | |
| "completion_length": 178.94584045410156, | |
| "epoch": 0.736, | |
| "grad_norm": 9.273496627807617, | |
| "kl": 2.182421875, | |
| "learning_rate": 1.9780923115161158e-07, | |
| "loss": 0.7168, | |
| "reward": 1.2291667103767394, | |
| "reward_std": 0.4490681551396847, | |
| "rewards/accuracy_reward": 0.37916667461395265, | |
| "rewards/format_reward": 0.8500000178813935, | |
| "step": 1380 | |
| }, | |
| { | |
| "completion_length": 169.15000762939454, | |
| "epoch": 0.7386666666666667, | |
| "grad_norm": 4.525454044342041, | |
| "kl": 1.0986328125, | |
| "learning_rate": 1.9411329834506286e-07, | |
| "loss": 0.5639, | |
| "reward": 1.237500047683716, | |
| "reward_std": 0.43203722685575485, | |
| "rewards/accuracy_reward": 0.3708333432674408, | |
| "rewards/format_reward": 0.8666666865348815, | |
| "step": 1385 | |
| }, | |
| { | |
| "completion_length": 246.45417175292968, | |
| "epoch": 0.7413333333333333, | |
| "grad_norm": 6.603713035583496, | |
| "kl": 1.836328125, | |
| "learning_rate": 1.904438851433068e-07, | |
| "loss": 0.9502, | |
| "reward": 0.9625000119209289, | |
| "reward_std": 0.5920814260840416, | |
| "rewards/accuracy_reward": 0.1958333369344473, | |
| "rewards/format_reward": 0.7666666865348816, | |
| "step": 1390 | |
| }, | |
| { | |
| "completion_length": 228.13750686645508, | |
| "epoch": 0.744, | |
| "grad_norm": 12.823264122009277, | |
| "kl": 2.1423828125, | |
| "learning_rate": 1.868013096752043e-07, | |
| "loss": 0.6335, | |
| "reward": 1.0458333492279053, | |
| "reward_std": 0.4450342819094658, | |
| "rewards/accuracy_reward": 0.2708333358168602, | |
| "rewards/format_reward": 0.7750000178813934, | |
| "step": 1395 | |
| }, | |
| { | |
| "completion_length": 179.4166717529297, | |
| "epoch": 0.7466666666666667, | |
| "grad_norm": 5.9196295738220215, | |
| "kl": 1.4501953125, | |
| "learning_rate": 1.8318588774285237e-07, | |
| "loss": 0.675, | |
| "reward": 1.2958333611488342, | |
| "reward_std": 0.4500196687877178, | |
| "rewards/accuracy_reward": 0.45416668355464934, | |
| "rewards/format_reward": 0.8416666924953461, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 0.7466666666666667, | |
| "eval_completion_length": 254.25000681559246, | |
| "eval_kl": 2.1704947916666666, | |
| "eval_loss": 0.8455010652542114, | |
| "eval_reward": 0.9288889118035635, | |
| "eval_reward_std": 0.5875919719537099, | |
| "eval_rewards/accuracy_reward": 0.1711111158132553, | |
| "eval_rewards/format_reward": 0.7577777977784474, | |
| "eval_runtime": 828.1328, | |
| "eval_samples_per_second": 0.362, | |
| "eval_steps_per_second": 0.016, | |
| "step": 1400 | |
| }, | |
| { | |
| "completion_length": 203.6458396911621, | |
| "epoch": 0.7493333333333333, | |
| "grad_norm": 6.658132553100586, | |
| "kl": 1.80625, | |
| "learning_rate": 1.7959793279420505e-07, | |
| "loss": 0.7215, | |
| "reward": 1.1333333730697632, | |
| "reward_std": 0.4133936479687691, | |
| "rewards/accuracy_reward": 0.2916666746139526, | |
| "rewards/format_reward": 0.8416666924953461, | |
| "step": 1405 | |
| }, | |
| { | |
| "completion_length": 137.4416702270508, | |
| "epoch": 0.752, | |
| "grad_norm": 5.461396217346191, | |
| "kl": 1.5439453125, | |
| "learning_rate": 1.760377558958982e-07, | |
| "loss": 0.4963, | |
| "reward": 1.1791667282581328, | |
| "reward_std": 0.3674156993627548, | |
| "rewards/accuracy_reward": 0.287500013038516, | |
| "rewards/format_reward": 0.8916666865348816, | |
| "step": 1410 | |
| }, | |
| { | |
| "completion_length": 180.80833892822267, | |
| "epoch": 0.7546666666666667, | |
| "grad_norm": 6.98518180847168, | |
| "kl": 1.206640625, | |
| "learning_rate": 1.72505665706281e-07, | |
| "loss": 0.6087, | |
| "reward": 1.2375000536441803, | |
| "reward_std": 0.3306782692670822, | |
| "rewards/accuracy_reward": 0.3541666746139526, | |
| "rewards/format_reward": 0.8833333551883698, | |
| "step": 1415 | |
| }, | |
| { | |
| "completion_length": 148.8083396911621, | |
| "epoch": 0.7573333333333333, | |
| "grad_norm": 9.099882125854492, | |
| "kl": 1.595703125, | |
| "learning_rate": 1.690019684486557e-07, | |
| "loss": 0.5703, | |
| "reward": 1.1416666984558106, | |
| "reward_std": 0.3741296485066414, | |
| "rewards/accuracy_reward": 0.2666666775941849, | |
| "rewards/format_reward": 0.8750000119209289, | |
| "step": 1420 | |
| }, | |
| { | |
| "completion_length": 205.50000534057617, | |
| "epoch": 0.76, | |
| "grad_norm": 3.151777744293213, | |
| "kl": 1.4462890625, | |
| "learning_rate": 1.655269678847292e-07, | |
| "loss": 0.6055, | |
| "reward": 1.1291666865348815, | |
| "reward_std": 0.415025033056736, | |
| "rewards/accuracy_reward": 0.2708333432674408, | |
| "rewards/format_reward": 0.8583333432674408, | |
| "step": 1425 | |
| }, | |
| { | |
| "completion_length": 156.6208381652832, | |
| "epoch": 0.7626666666666667, | |
| "grad_norm": 8.909594535827637, | |
| "kl": 1.776953125, | |
| "learning_rate": 1.6208096528827714e-07, | |
| "loss": 0.4477, | |
| "reward": 1.1916666984558106, | |
| "reward_std": 0.291341669857502, | |
| "rewards/accuracy_reward": 0.3166666816920042, | |
| "rewards/format_reward": 0.8750000178813935, | |
| "step": 1430 | |
| }, | |
| { | |
| "completion_length": 214.9083396911621, | |
| "epoch": 0.7653333333333333, | |
| "grad_norm": 10.612777709960938, | |
| "kl": 1.9314453125, | |
| "learning_rate": 1.5866425941902522e-07, | |
| "loss": 0.7519, | |
| "reward": 1.0875000536441803, | |
| "reward_std": 0.4693745546042919, | |
| "rewards/accuracy_reward": 0.2625000089406967, | |
| "rewards/format_reward": 0.8250000178813934, | |
| "step": 1435 | |
| }, | |
| { | |
| "completion_length": 223.26250991821288, | |
| "epoch": 0.768, | |
| "grad_norm": 21.530683517456055, | |
| "kl": 1.9796875, | |
| "learning_rate": 1.5527714649674638e-07, | |
| "loss": 0.8694, | |
| "reward": 1.1375000476837158, | |
| "reward_std": 0.47066808491945267, | |
| "rewards/accuracy_reward": 0.3375000089406967, | |
| "rewards/format_reward": 0.8000000238418579, | |
| "step": 1440 | |
| }, | |
| { | |
| "completion_length": 226.37083740234374, | |
| "epoch": 0.7706666666666667, | |
| "grad_norm": 13.953042030334473, | |
| "kl": 2.4328125, | |
| "learning_rate": 1.5191992017557993e-07, | |
| "loss": 0.8953, | |
| "reward": 0.9833333551883697, | |
| "reward_std": 0.5663923621177673, | |
| "rewards/accuracy_reward": 0.2250000074505806, | |
| "rewards/format_reward": 0.7583333492279053, | |
| "step": 1445 | |
| }, | |
| { | |
| "completion_length": 235.72917404174805, | |
| "epoch": 0.7733333333333333, | |
| "grad_norm": 89.63887786865234, | |
| "kl": 1.90859375, | |
| "learning_rate": 1.485928715185721e-07, | |
| "loss": 0.6224, | |
| "reward": 1.1458333492279054, | |
| "reward_std": 0.4521483927965164, | |
| "rewards/accuracy_reward": 0.3458333402872086, | |
| "rewards/format_reward": 0.8000000238418579, | |
| "step": 1450 | |
| }, | |
| { | |
| "completion_length": 238.7125045776367, | |
| "epoch": 0.776, | |
| "grad_norm": 9.134527206420898, | |
| "kl": 0.866015625, | |
| "learning_rate": 1.4529628897244212e-07, | |
| "loss": 0.7825, | |
| "reward": 1.1291666984558106, | |
| "reward_std": 0.49776799529790877, | |
| "rewards/accuracy_reward": 0.31250000447034837, | |
| "rewards/format_reward": 0.8166666984558105, | |
| "step": 1455 | |
| }, | |
| { | |
| "completion_length": 201.51250381469725, | |
| "epoch": 0.7786666666666666, | |
| "grad_norm": 11.216150283813477, | |
| "kl": 1.4703125, | |
| "learning_rate": 1.4203045834257417e-07, | |
| "loss": 0.6511, | |
| "reward": 1.137500035762787, | |
| "reward_std": 0.4635964795947075, | |
| "rewards/accuracy_reward": 0.31250000894069674, | |
| "rewards/format_reward": 0.8250000238418579, | |
| "step": 1460 | |
| }, | |
| { | |
| "completion_length": 130.93750381469727, | |
| "epoch": 0.7813333333333333, | |
| "grad_norm": 3.627894639968872, | |
| "kl": 1.075, | |
| "learning_rate": 1.3879566276823896e-07, | |
| "loss": 0.3128, | |
| "reward": 1.2708333611488343, | |
| "reward_std": 0.2865241147577763, | |
| "rewards/accuracy_reward": 0.3375000089406967, | |
| "rewards/format_reward": 0.9333333432674408, | |
| "step": 1465 | |
| }, | |
| { | |
| "completion_length": 216.83750839233397, | |
| "epoch": 0.784, | |
| "grad_norm": 12.179669380187988, | |
| "kl": 1.70078125, | |
| "learning_rate": 1.3559218269804624e-07, | |
| "loss": 0.7939, | |
| "reward": 1.237500047683716, | |
| "reward_std": 0.4636766240000725, | |
| "rewards/accuracy_reward": 0.42083334252238275, | |
| "rewards/format_reward": 0.8166666865348816, | |
| "step": 1470 | |
| }, | |
| { | |
| "completion_length": 158.32917251586915, | |
| "epoch": 0.7866666666666666, | |
| "grad_norm": 5.9073967933654785, | |
| "kl": 1.6234375, | |
| "learning_rate": 1.3242029586563054e-07, | |
| "loss": 0.4521, | |
| "reward": 1.066666692495346, | |
| "reward_std": 0.27624749541282656, | |
| "rewards/accuracy_reward": 0.20000000447034835, | |
| "rewards/format_reward": 0.8666666865348815, | |
| "step": 1475 | |
| }, | |
| { | |
| "completion_length": 207.18334197998047, | |
| "epoch": 0.7893333333333333, | |
| "grad_norm": 3.3254387378692627, | |
| "kl": 1.4654296875, | |
| "learning_rate": 1.2928027726557255e-07, | |
| "loss": 0.5483, | |
| "reward": 1.2083333730697632, | |
| "reward_std": 0.43673594370484353, | |
| "rewards/accuracy_reward": 0.38333334140479564, | |
| "rewards/format_reward": 0.8250000178813934, | |
| "step": 1480 | |
| }, | |
| { | |
| "completion_length": 235.04167556762695, | |
| "epoch": 0.792, | |
| "grad_norm": 10.714239120483398, | |
| "kl": 1.662109375, | |
| "learning_rate": 1.2617239912955757e-07, | |
| "loss": 0.8735, | |
| "reward": 1.0708333671092987, | |
| "reward_std": 0.4854356274008751, | |
| "rewards/accuracy_reward": 0.2541666731238365, | |
| "rewards/format_reward": 0.8166666865348816, | |
| "step": 1485 | |
| }, | |
| { | |
| "completion_length": 176.85834045410155, | |
| "epoch": 0.7946666666666666, | |
| "grad_norm": 6.300493240356445, | |
| "kl": 1.4615234375, | |
| "learning_rate": 1.230969309027739e-07, | |
| "loss": 0.5979, | |
| "reward": 1.262500023841858, | |
| "reward_std": 0.41847621351480485, | |
| "rewards/accuracy_reward": 0.3958333473652601, | |
| "rewards/format_reward": 0.8666666805744171, | |
| "step": 1490 | |
| }, | |
| { | |
| "completion_length": 230.1958381652832, | |
| "epoch": 0.7973333333333333, | |
| "grad_norm": 10.270231246948242, | |
| "kl": 2.72734375, | |
| "learning_rate": 1.2005413922055248e-07, | |
| "loss": 0.9828, | |
| "reward": 0.9833333671092988, | |
| "reward_std": 0.6273943156003952, | |
| "rewards/accuracy_reward": 0.21666667349636554, | |
| "rewards/format_reward": 0.7666666865348816, | |
| "step": 1495 | |
| }, | |
| { | |
| "completion_length": 277.3833374023437, | |
| "epoch": 0.8, | |
| "grad_norm": 20.984962463378906, | |
| "kl": 2.16640625, | |
| "learning_rate": 1.1704428788525029e-07, | |
| "loss": 0.8738, | |
| "reward": 1.066666692495346, | |
| "reward_std": 0.6233089223504067, | |
| "rewards/accuracy_reward": 0.3250000089406967, | |
| "rewards/format_reward": 0.7416666865348815, | |
| "step": 1500 | |
| }, | |
| { | |
| "completion_length": 201.60000762939453, | |
| "epoch": 0.8026666666666666, | |
| "grad_norm": 5.6333489418029785, | |
| "kl": 1.384375, | |
| "learning_rate": 1.1406763784337948e-07, | |
| "loss": 0.6346, | |
| "reward": 1.1416666865348817, | |
| "reward_std": 0.4620711088180542, | |
| "rewards/accuracy_reward": 0.30833334289491177, | |
| "rewards/format_reward": 0.8333333551883697, | |
| "step": 1505 | |
| }, | |
| { | |
| "completion_length": 155.48750381469728, | |
| "epoch": 0.8053333333333333, | |
| "grad_norm": 2.1134753227233887, | |
| "kl": 0.8833984375, | |
| "learning_rate": 1.111244471629838e-07, | |
| "loss": 0.4204, | |
| "reward": 1.2833333611488342, | |
| "reward_std": 0.3339101344347, | |
| "rewards/accuracy_reward": 0.3666666835546494, | |
| "rewards/format_reward": 0.9166666805744171, | |
| "step": 1510 | |
| }, | |
| { | |
| "completion_length": 208.70000686645508, | |
| "epoch": 0.808, | |
| "grad_norm": 15.448051452636719, | |
| "kl": 1.694140625, | |
| "learning_rate": 1.0821497101126487e-07, | |
| "loss": 0.725, | |
| "reward": 1.079166704416275, | |
| "reward_std": 0.46868581771850587, | |
| "rewards/accuracy_reward": 0.25416667126119136, | |
| "rewards/format_reward": 0.8250000178813934, | |
| "step": 1515 | |
| }, | |
| { | |
| "completion_length": 174.8625045776367, | |
| "epoch": 0.8106666666666666, | |
| "grad_norm": 10.781864166259766, | |
| "kl": 1.6875, | |
| "learning_rate": 1.0533946163245983e-07, | |
| "loss": 0.651, | |
| "reward": 1.1500000417232514, | |
| "reward_std": 0.4552404969930649, | |
| "rewards/accuracy_reward": 0.3166666753590107, | |
| "rewards/format_reward": 0.8333333551883697, | |
| "step": 1520 | |
| }, | |
| { | |
| "completion_length": 177.89167327880858, | |
| "epoch": 0.8133333333333334, | |
| "grad_norm": 6.928041458129883, | |
| "kl": 1.4318359375, | |
| "learning_rate": 1.024981683259723e-07, | |
| "loss": 0.6095, | |
| "reward": 1.2250000357627868, | |
| "reward_std": 0.39699684381484984, | |
| "rewards/accuracy_reward": 0.35000001192092894, | |
| "rewards/format_reward": 0.8750000238418579, | |
| "step": 1525 | |
| }, | |
| { | |
| "completion_length": 183.4708396911621, | |
| "epoch": 0.816, | |
| "grad_norm": 7.962174892425537, | |
| "kl": 1.2412109375, | |
| "learning_rate": 9.969133742475883e-08, | |
| "loss": 0.6025, | |
| "reward": 1.2291666924953462, | |
| "reward_std": 0.4208852708339691, | |
| "rewards/accuracy_reward": 0.3625000089406967, | |
| "rewards/format_reward": 0.8666666924953461, | |
| "step": 1530 | |
| }, | |
| { | |
| "completion_length": 256.63334503173826, | |
| "epoch": 0.8186666666666667, | |
| "grad_norm": 33.930641174316406, | |
| "kl": 1.7005859375, | |
| "learning_rate": 9.691921227397226e-08, | |
| "loss": 0.752, | |
| "reward": 0.9458333611488342, | |
| "reward_std": 0.4962170884013176, | |
| "rewards/accuracy_reward": 0.16250000558793545, | |
| "rewards/format_reward": 0.7833333611488342, | |
| "step": 1535 | |
| }, | |
| { | |
| "completion_length": 178.5583381652832, | |
| "epoch": 0.8213333333333334, | |
| "grad_norm": 6.385335445404053, | |
| "kl": 1.1046875, | |
| "learning_rate": 9.4182033209865e-08, | |
| "loss": 0.5621, | |
| "reward": 1.1833333730697633, | |
| "reward_std": 0.3783799774944782, | |
| "rewards/accuracy_reward": 0.3083333406597376, | |
| "rewards/format_reward": 0.8750000178813935, | |
| "step": 1540 | |
| }, | |
| { | |
| "completion_length": 183.16250610351562, | |
| "epoch": 0.824, | |
| "grad_norm": 7.927608966827393, | |
| "kl": 1.2470703125, | |
| "learning_rate": 9.148003753895144e-08, | |
| "loss": 0.5574, | |
| "reward": 1.1875000476837159, | |
| "reward_std": 0.40479181706905365, | |
| "rewards/accuracy_reward": 0.33750000409781933, | |
| "rewards/format_reward": 0.8500000178813935, | |
| "step": 1545 | |
| }, | |
| { | |
| "completion_length": 193.51667404174805, | |
| "epoch": 0.8266666666666667, | |
| "grad_norm": 8.427675247192383, | |
| "kl": 1.1888671875, | |
| "learning_rate": 8.881345951743485e-08, | |
| "loss": 0.7181, | |
| "reward": 1.200000035762787, | |
| "reward_std": 0.43247459903359414, | |
| "rewards/accuracy_reward": 0.3500000085681677, | |
| "rewards/format_reward": 0.8500000178813935, | |
| "step": 1550 | |
| }, | |
| { | |
| "completion_length": 190.77500686645507, | |
| "epoch": 0.8293333333333334, | |
| "grad_norm": 7.281441688537598, | |
| "kl": 1.5609375, | |
| "learning_rate": 8.618253033089767e-08, | |
| "loss": 0.6316, | |
| "reward": 1.1625000476837157, | |
| "reward_std": 0.3389572203159332, | |
| "rewards/accuracy_reward": 0.3041666716337204, | |
| "rewards/format_reward": 0.8583333551883697, | |
| "step": 1555 | |
| }, | |
| { | |
| "completion_length": 196.6958366394043, | |
| "epoch": 0.832, | |
| "grad_norm": 3.593385696411133, | |
| "kl": 1.7515625, | |
| "learning_rate": 8.358747807425826e-08, | |
| "loss": 0.6747, | |
| "reward": 1.2041666984558106, | |
| "reward_std": 0.430715125054121, | |
| "rewards/accuracy_reward": 0.3625000145286322, | |
| "rewards/format_reward": 0.8416666865348816, | |
| "step": 1560 | |
| }, | |
| { | |
| "completion_length": 153.6750045776367, | |
| "epoch": 0.8346666666666667, | |
| "grad_norm": 3.8720762729644775, | |
| "kl": 1.4552734375, | |
| "learning_rate": 8.102852773199586e-08, | |
| "loss": 0.4917, | |
| "reward": 1.2208333671092988, | |
| "reward_std": 0.34195019751787187, | |
| "rewards/accuracy_reward": 0.3125000111758709, | |
| "rewards/format_reward": 0.9083333492279053, | |
| "step": 1565 | |
| }, | |
| { | |
| "completion_length": 158.85833816528321, | |
| "epoch": 0.8373333333333334, | |
| "grad_norm": 4.770650386810303, | |
| "kl": 1.3427734375, | |
| "learning_rate": 7.850590115864481e-08, | |
| "loss": 0.4683, | |
| "reward": 1.225000023841858, | |
| "reward_std": 0.33489523082971573, | |
| "rewards/accuracy_reward": 0.33333334028720857, | |
| "rewards/format_reward": 0.8916666865348816, | |
| "step": 1570 | |
| }, | |
| { | |
| "completion_length": 193.32500534057618, | |
| "epoch": 0.84, | |
| "grad_norm": 5.848920822143555, | |
| "kl": 1.3138671875, | |
| "learning_rate": 7.601981705956039e-08, | |
| "loss": 0.4761, | |
| "reward": 1.2541667103767395, | |
| "reward_std": 0.4497212260961533, | |
| "rewards/accuracy_reward": 0.4291666798293591, | |
| "rewards/format_reward": 0.8250000178813934, | |
| "step": 1575 | |
| }, | |
| { | |
| "completion_length": 217.37500610351563, | |
| "epoch": 0.8426666666666667, | |
| "grad_norm": 9.334526062011719, | |
| "kl": 1.68125, | |
| "learning_rate": 7.357049097195773e-08, | |
| "loss": 0.6737, | |
| "reward": 1.0375000298023225, | |
| "reward_std": 0.4624008506536484, | |
| "rewards/accuracy_reward": 0.22083333916962147, | |
| "rewards/format_reward": 0.8166666805744172, | |
| "step": 1580 | |
| }, | |
| { | |
| "completion_length": 238.8541702270508, | |
| "epoch": 0.8453333333333334, | |
| "grad_norm": 7.903083801269531, | |
| "kl": 2.119140625, | |
| "learning_rate": 7.115813524622488e-08, | |
| "loss": 0.7548, | |
| "reward": 1.0375000417232514, | |
| "reward_std": 0.6012955874204635, | |
| "rewards/accuracy_reward": 0.28750000707805157, | |
| "rewards/format_reward": 0.7500000178813935, | |
| "step": 1585 | |
| }, | |
| { | |
| "completion_length": 213.15834274291993, | |
| "epoch": 0.848, | |
| "grad_norm": 4.728046417236328, | |
| "kl": 1.553515625, | |
| "learning_rate": 6.878295902751319e-08, | |
| "loss": 0.6544, | |
| "reward": 1.1208333492279052, | |
| "reward_std": 0.42975625991821287, | |
| "rewards/accuracy_reward": 0.3041666716337204, | |
| "rewards/format_reward": 0.8166666865348816, | |
| "step": 1590 | |
| }, | |
| { | |
| "completion_length": 180.77500534057617, | |
| "epoch": 0.8506666666666667, | |
| "grad_norm": 77.55509948730469, | |
| "kl": 1.58125, | |
| "learning_rate": 6.644516823760437e-08, | |
| "loss": 0.5961, | |
| "reward": 1.091666692495346, | |
| "reward_std": 0.32875102311372756, | |
| "rewards/accuracy_reward": 0.2166666690260172, | |
| "rewards/format_reward": 0.8750000238418579, | |
| "step": 1595 | |
| }, | |
| { | |
| "completion_length": 214.39167404174805, | |
| "epoch": 0.8533333333333334, | |
| "grad_norm": 25.89487075805664, | |
| "kl": 1.84609375, | |
| "learning_rate": 6.414496555705801e-08, | |
| "loss": 0.6698, | |
| "reward": 1.1166666805744172, | |
| "reward_std": 0.5117101609706879, | |
| "rewards/accuracy_reward": 0.34166666977107524, | |
| "rewards/format_reward": 0.7750000178813934, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 0.8533333333333334, | |
| "eval_completion_length": 246.72556213378905, | |
| "eval_kl": 2.08859375, | |
| "eval_loss": 0.8056277632713318, | |
| "eval_reward": 0.9472222503026326, | |
| "eval_reward_std": 0.5439748798807462, | |
| "eval_rewards/accuracy_reward": 0.17055555924773216, | |
| "eval_rewards/format_reward": 0.7766666837533315, | |
| "eval_runtime": 810.1675, | |
| "eval_samples_per_second": 0.37, | |
| "eval_steps_per_second": 0.016, | |
| "step": 1600 | |
| }, | |
| { | |
| "completion_length": 281.73750762939454, | |
| "epoch": 0.856, | |
| "grad_norm": 19.62468719482422, | |
| "kl": 1.675390625, | |
| "learning_rate": 6.188255040763929e-08, | |
| "loss": 0.9653, | |
| "reward": 1.041666704416275, | |
| "reward_std": 0.62858667075634, | |
| "rewards/accuracy_reward": 0.3000000089406967, | |
| "rewards/format_reward": 0.7416666865348815, | |
| "step": 1605 | |
| }, | |
| { | |
| "completion_length": 188.87916946411133, | |
| "epoch": 0.8586666666666667, | |
| "grad_norm": 3.9411680698394775, | |
| "kl": 1.18125, | |
| "learning_rate": 5.965811893503015e-08, | |
| "loss": 0.6174, | |
| "reward": 1.2083333730697632, | |
| "reward_std": 0.42002698704600333, | |
| "rewards/accuracy_reward": 0.35000000819563865, | |
| "rewards/format_reward": 0.8583333551883697, | |
| "step": 1610 | |
| }, | |
| { | |
| "completion_length": 199.65000534057617, | |
| "epoch": 0.8613333333333333, | |
| "grad_norm": 11.383617401123047, | |
| "kl": 2.05234375, | |
| "learning_rate": 5.7471863991823356e-08, | |
| "loss": 0.6184, | |
| "reward": 1.1125000357627868, | |
| "reward_std": 0.4449311882257462, | |
| "rewards/accuracy_reward": 0.2875000063329935, | |
| "rewards/format_reward": 0.8250000298023223, | |
| "step": 1615 | |
| }, | |
| { | |
| "completion_length": 176.72917404174805, | |
| "epoch": 0.864, | |
| "grad_norm": 1.5305043458938599, | |
| "kl": 1.0876953125, | |
| "learning_rate": 5.532397512080306e-08, | |
| "loss": 0.6773, | |
| "reward": 1.2458333611488341, | |
| "reward_std": 0.42713540196418764, | |
| "rewards/accuracy_reward": 0.3625000111758709, | |
| "rewards/format_reward": 0.8833333492279053, | |
| "step": 1620 | |
| }, | |
| { | |
| "completion_length": 169.13750457763672, | |
| "epoch": 0.8666666666666667, | |
| "grad_norm": 5.485569000244141, | |
| "kl": 1.5263671875, | |
| "learning_rate": 5.321463853851188e-08, | |
| "loss": 0.5792, | |
| "reward": 1.1875000417232513, | |
| "reward_std": 0.39003320038318634, | |
| "rewards/accuracy_reward": 0.3125000063329935, | |
| "rewards/format_reward": 0.8750000238418579, | |
| "step": 1625 | |
| }, | |
| { | |
| "completion_length": 166.21250381469727, | |
| "epoch": 0.8693333333333333, | |
| "grad_norm": 6.497225761413574, | |
| "kl": 1.157421875, | |
| "learning_rate": 5.114403711910631e-08, | |
| "loss": 0.5003, | |
| "reward": 1.1833333849906922, | |
| "reward_std": 0.33641326874494554, | |
| "rewards/accuracy_reward": 0.2833333358168602, | |
| "rewards/format_reward": 0.9000000178813934, | |
| "step": 1630 | |
| }, | |
| { | |
| "completion_length": 203.3166717529297, | |
| "epoch": 0.872, | |
| "grad_norm": 10.349686622619629, | |
| "kl": 1.8275390625, | |
| "learning_rate": 4.911235037850186e-08, | |
| "loss": 0.7575, | |
| "reward": 1.2083333790302277, | |
| "reward_std": 0.5746691286563873, | |
| "rewards/accuracy_reward": 0.4000000149011612, | |
| "rewards/format_reward": 0.8083333551883698, | |
| "step": 1635 | |
| }, | |
| { | |
| "completion_length": 159.37083740234374, | |
| "epoch": 0.8746666666666667, | |
| "grad_norm": 3.7331645488739014, | |
| "kl": 1.871875, | |
| "learning_rate": 4.7119754458809725e-08, | |
| "loss": 0.5334, | |
| "reward": 1.1375000417232513, | |
| "reward_std": 0.37364248782396314, | |
| "rewards/accuracy_reward": 0.28750000707805157, | |
| "rewards/format_reward": 0.8500000238418579, | |
| "step": 1640 | |
| }, | |
| { | |
| "completion_length": 177.79583740234375, | |
| "epoch": 0.8773333333333333, | |
| "grad_norm": 10.393296241760254, | |
| "kl": 1.009765625, | |
| "learning_rate": 4.516642211306587e-08, | |
| "loss": 0.61, | |
| "reward": 1.112500047683716, | |
| "reward_std": 0.3519383378326893, | |
| "rewards/accuracy_reward": 0.23750000521540643, | |
| "rewards/format_reward": 0.8750000178813935, | |
| "step": 1645 | |
| }, | |
| { | |
| "completion_length": 188.83750534057617, | |
| "epoch": 0.88, | |
| "grad_norm": 9.854552268981934, | |
| "kl": 1.759375, | |
| "learning_rate": 4.325252269025315e-08, | |
| "loss": 0.6185, | |
| "reward": 1.166666704416275, | |
| "reward_std": 0.41237895712256434, | |
| "rewards/accuracy_reward": 0.32500001452863214, | |
| "rewards/format_reward": 0.8416666865348816, | |
| "step": 1650 | |
| }, | |
| { | |
| "completion_length": 232.76250457763672, | |
| "epoch": 0.8826666666666667, | |
| "grad_norm": 5.535569667816162, | |
| "kl": 1.8421875, | |
| "learning_rate": 4.137822212061964e-08, | |
| "loss": 0.7118, | |
| "reward": 1.0375000178813933, | |
| "reward_std": 0.4611209347844124, | |
| "rewards/accuracy_reward": 0.2625000048428774, | |
| "rewards/format_reward": 0.7750000178813934, | |
| "step": 1655 | |
| }, | |
| { | |
| "completion_length": 231.8791717529297, | |
| "epoch": 0.8853333333333333, | |
| "grad_norm": 5.740895748138428, | |
| "kl": 1.616796875, | |
| "learning_rate": 3.954368290129301e-08, | |
| "loss": 0.6385, | |
| "reward": 1.2250000357627868, | |
| "reward_std": 0.5405340433120728, | |
| "rewards/accuracy_reward": 0.4000000074505806, | |
| "rewards/format_reward": 0.8250000238418579, | |
| "step": 1660 | |
| }, | |
| { | |
| "completion_length": 131.25833740234376, | |
| "epoch": 0.888, | |
| "grad_norm": 5.200298309326172, | |
| "kl": 1.3203125, | |
| "learning_rate": 3.774906408219197e-08, | |
| "loss": 0.277, | |
| "reward": 1.3500000178813933, | |
| "reward_std": 0.33660581335425377, | |
| "rewards/accuracy_reward": 0.4250000149011612, | |
| "rewards/format_reward": 0.925000011920929, | |
| "step": 1665 | |
| }, | |
| { | |
| "completion_length": 202.34584426879883, | |
| "epoch": 0.8906666666666667, | |
| "grad_norm": 7.758249759674072, | |
| "kl": 1.4638671875, | |
| "learning_rate": 3.5994521252237506e-08, | |
| "loss": 0.5761, | |
| "reward": 1.1083333790302277, | |
| "reward_std": 0.38408626839518545, | |
| "rewards/accuracy_reward": 0.26666667126119137, | |
| "rewards/format_reward": 0.8416666746139526, | |
| "step": 1670 | |
| }, | |
| { | |
| "completion_length": 200.52500534057617, | |
| "epoch": 0.8933333333333333, | |
| "grad_norm": 12.366177558898926, | |
| "kl": 1.696875, | |
| "learning_rate": 3.42802065258635e-08, | |
| "loss": 0.6531, | |
| "reward": 1.0083333611488343, | |
| "reward_std": 0.46105473637580874, | |
| "rewards/accuracy_reward": 0.20000000670552254, | |
| "rewards/format_reward": 0.8083333611488343, | |
| "step": 1675 | |
| }, | |
| { | |
| "completion_length": 201.46250381469727, | |
| "epoch": 0.896, | |
| "grad_norm": 6.266597747802734, | |
| "kl": 1.599609375, | |
| "learning_rate": 3.260626852982873e-08, | |
| "loss": 0.6282, | |
| "reward": 1.0958333611488342, | |
| "reward_std": 0.43908271491527556, | |
| "rewards/accuracy_reward": 0.2541666753590107, | |
| "rewards/format_reward": 0.8416666865348816, | |
| "step": 1680 | |
| }, | |
| { | |
| "completion_length": 160.81250228881837, | |
| "epoch": 0.8986666666666666, | |
| "grad_norm": 56.750545501708984, | |
| "kl": 1.580078125, | |
| "learning_rate": 3.097285239033137e-08, | |
| "loss": 0.5331, | |
| "reward": 1.1541666984558105, | |
| "reward_std": 0.3470060914754868, | |
| "rewards/accuracy_reward": 0.26250000596046447, | |
| "rewards/format_reward": 0.8916666746139527, | |
| "step": 1685 | |
| }, | |
| { | |
| "completion_length": 213.47917556762695, | |
| "epoch": 0.9013333333333333, | |
| "grad_norm": 11.425201416015625, | |
| "kl": 2.06640625, | |
| "learning_rate": 2.93800997204271e-08, | |
| "loss": 0.7264, | |
| "reward": 1.200000035762787, | |
| "reward_std": 0.5640496462583542, | |
| "rewards/accuracy_reward": 0.40833334140479566, | |
| "rewards/format_reward": 0.791666692495346, | |
| "step": 1690 | |
| }, | |
| { | |
| "completion_length": 194.2541717529297, | |
| "epoch": 0.904, | |
| "grad_norm": 20.20305061340332, | |
| "kl": 1.2748046875, | |
| "learning_rate": 2.7828148607751235e-08, | |
| "loss": 0.6901, | |
| "reward": 1.2416667103767396, | |
| "reward_std": 0.4328203298151493, | |
| "rewards/accuracy_reward": 0.39166667461395266, | |
| "rewards/format_reward": 0.8500000238418579, | |
| "step": 1695 | |
| }, | |
| { | |
| "completion_length": 231.80000991821288, | |
| "epoch": 0.9066666666666666, | |
| "grad_norm": 20.141338348388672, | |
| "kl": 1.8451171875, | |
| "learning_rate": 2.6317133602547335e-08, | |
| "loss": 0.7004, | |
| "reward": 1.0708333730697632, | |
| "reward_std": 0.5015199676156044, | |
| "rewards/accuracy_reward": 0.2708333432674408, | |
| "rewards/format_reward": 0.8000000238418579, | |
| "step": 1700 | |
| }, | |
| { | |
| "completion_length": 226.9791732788086, | |
| "epoch": 0.9093333333333333, | |
| "grad_norm": 8.150571823120117, | |
| "kl": 1.451953125, | |
| "learning_rate": 2.4847185706001637e-08, | |
| "loss": 0.7311, | |
| "reward": 1.1750000357627868, | |
| "reward_std": 0.4346106082201004, | |
| "rewards/accuracy_reward": 0.33333334550261495, | |
| "rewards/format_reward": 0.8416666924953461, | |
| "step": 1705 | |
| }, | |
| { | |
| "completion_length": 161.9708366394043, | |
| "epoch": 0.912, | |
| "grad_norm": 7.521031856536865, | |
| "kl": 1.84453125, | |
| "learning_rate": 2.341843235888563e-08, | |
| "loss": 0.6629, | |
| "reward": 1.3250000476837158, | |
| "reward_std": 0.4102425158023834, | |
| "rewards/accuracy_reward": 0.45000000596046447, | |
| "rewards/format_reward": 0.8750000238418579, | |
| "step": 1710 | |
| }, | |
| { | |
| "completion_length": 181.10417098999022, | |
| "epoch": 0.9146666666666666, | |
| "grad_norm": 18.987491607666016, | |
| "kl": 1.7630859375, | |
| "learning_rate": 2.203099743050746e-08, | |
| "loss": 0.6879, | |
| "reward": 1.170833373069763, | |
| "reward_std": 0.42629132717847823, | |
| "rewards/accuracy_reward": 0.32083334028720856, | |
| "rewards/format_reward": 0.850000011920929, | |
| "step": 1715 | |
| }, | |
| { | |
| "completion_length": 263.47084045410156, | |
| "epoch": 0.9173333333333333, | |
| "grad_norm": 13.362593650817871, | |
| "kl": 2.15078125, | |
| "learning_rate": 2.068500120797284e-08, | |
| "loss": 0.9974, | |
| "reward": 1.125000035762787, | |
| "reward_std": 0.6081652283668518, | |
| "rewards/accuracy_reward": 0.3583333432674408, | |
| "rewards/format_reward": 0.766666692495346, | |
| "step": 1720 | |
| }, | |
| { | |
| "completion_length": 187.62083892822267, | |
| "epoch": 0.92, | |
| "grad_norm": 6.908720970153809, | |
| "kl": 1.466015625, | |
| "learning_rate": 1.9380560385756084e-08, | |
| "loss": 0.7325, | |
| "reward": 1.2166666984558105, | |
| "reward_std": 0.39846049398183825, | |
| "rewards/accuracy_reward": 0.3500000100582838, | |
| "rewards/format_reward": 0.8666666984558106, | |
| "step": 1725 | |
| }, | |
| { | |
| "completion_length": 208.1041687011719, | |
| "epoch": 0.9226666666666666, | |
| "grad_norm": 7.331279754638672, | |
| "kl": 1.965625, | |
| "learning_rate": 1.8117788055583284e-08, | |
| "loss": 0.8239, | |
| "reward": 1.1291667103767395, | |
| "reward_std": 0.5210324048995971, | |
| "rewards/accuracy_reward": 0.31250001303851604, | |
| "rewards/format_reward": 0.8166666805744172, | |
| "step": 1730 | |
| }, | |
| { | |
| "completion_length": 219.9958381652832, | |
| "epoch": 0.9253333333333333, | |
| "grad_norm": 10.382403373718262, | |
| "kl": 1.346484375, | |
| "learning_rate": 1.68967936966275e-08, | |
| "loss": 0.6091, | |
| "reward": 1.1708333671092988, | |
| "reward_std": 0.4482548341155052, | |
| "rewards/accuracy_reward": 0.3375000096857548, | |
| "rewards/format_reward": 0.8333333492279053, | |
| "step": 1735 | |
| }, | |
| { | |
| "completion_length": 180.16250228881836, | |
| "epoch": 0.928, | |
| "grad_norm": 8.021461486816406, | |
| "kl": 2.263671875, | |
| "learning_rate": 1.571768316601718e-08, | |
| "loss": 0.583, | |
| "reward": 1.0458333790302277, | |
| "reward_std": 0.4388016849756241, | |
| "rewards/accuracy_reward": 0.23750000335276128, | |
| "rewards/format_reward": 0.8083333551883698, | |
| "step": 1740 | |
| }, | |
| { | |
| "completion_length": 244.97917251586915, | |
| "epoch": 0.9306666666666666, | |
| "grad_norm": 27.44346809387207, | |
| "kl": 1.7083984375, | |
| "learning_rate": 1.4580558689658406e-08, | |
| "loss": 0.9154, | |
| "reward": 1.1708333671092988, | |
| "reward_std": 0.6085290633141994, | |
| "rewards/accuracy_reward": 0.379166679084301, | |
| "rewards/format_reward": 0.7916666865348816, | |
| "step": 1745 | |
| }, | |
| { | |
| "completion_length": 176.87917251586913, | |
| "epoch": 0.9333333333333333, | |
| "grad_norm": 21.765560150146484, | |
| "kl": 1.50078125, | |
| "learning_rate": 1.3485518853372624e-08, | |
| "loss": 0.5667, | |
| "reward": 1.2708333671092986, | |
| "reward_std": 0.44170806705951693, | |
| "rewards/accuracy_reward": 0.3958333425223827, | |
| "rewards/format_reward": 0.8750000238418579, | |
| "step": 1750 | |
| }, | |
| { | |
| "completion_length": 245.63750457763672, | |
| "epoch": 0.936, | |
| "grad_norm": 5.950416564941406, | |
| "kl": 2.523828125, | |
| "learning_rate": 1.243265859434911e-08, | |
| "loss": 0.8785, | |
| "reward": 1.1625000417232514, | |
| "reward_std": 0.6160432323813438, | |
| "rewards/accuracy_reward": 0.40416667312383653, | |
| "rewards/format_reward": 0.7583333492279053, | |
| "step": 1755 | |
| }, | |
| { | |
| "completion_length": 174.72083892822266, | |
| "epoch": 0.9386666666666666, | |
| "grad_norm": 19.66292381286621, | |
| "kl": 1.318359375, | |
| "learning_rate": 1.1422069192914219e-08, | |
| "loss": 0.516, | |
| "reward": 1.200000035762787, | |
| "reward_std": 0.37103949785232543, | |
| "rewards/accuracy_reward": 0.33333334028720857, | |
| "rewards/format_reward": 0.8666666805744171, | |
| "step": 1760 | |
| }, | |
| { | |
| "completion_length": 224.34167709350587, | |
| "epoch": 0.9413333333333334, | |
| "grad_norm": 6.988897323608398, | |
| "kl": 1.60703125, | |
| "learning_rate": 1.0453838264617709e-08, | |
| "loss": 0.8039, | |
| "reward": 0.9708333611488342, | |
| "reward_std": 0.5082953691482544, | |
| "rewards/accuracy_reward": 0.1791666727513075, | |
| "rewards/format_reward": 0.7916666865348816, | |
| "step": 1765 | |
| }, | |
| { | |
| "completion_length": 238.2375061035156, | |
| "epoch": 0.944, | |
| "grad_norm": 12.432623863220215, | |
| "kl": 1.816796875, | |
| "learning_rate": 9.528049752636714e-09, | |
| "loss": 0.6603, | |
| "reward": 1.0416666984558105, | |
| "reward_std": 0.4743375271558762, | |
| "rewards/accuracy_reward": 0.26666667573153974, | |
| "rewards/format_reward": 0.775000023841858, | |
| "step": 1770 | |
| }, | |
| { | |
| "completion_length": 169.10417251586915, | |
| "epoch": 0.9466666666666667, | |
| "grad_norm": 7.009900093078613, | |
| "kl": 1.668359375, | |
| "learning_rate": 8.644783920498e-09, | |
| "loss": 0.485, | |
| "reward": 1.200000035762787, | |
| "reward_std": 0.37502728700637816, | |
| "rewards/accuracy_reward": 0.3416666783392429, | |
| "rewards/format_reward": 0.8583333551883697, | |
| "step": 1775 | |
| }, | |
| { | |
| "completion_length": 233.9000045776367, | |
| "epoch": 0.9493333333333334, | |
| "grad_norm": 5.138508319854736, | |
| "kl": 1.686328125, | |
| "learning_rate": 7.804117345119266e-09, | |
| "loss": 0.7679, | |
| "reward": 1.1333333730697632, | |
| "reward_std": 0.5335646510124207, | |
| "rewards/accuracy_reward": 0.34166667722165583, | |
| "rewards/format_reward": 0.7916666984558105, | |
| "step": 1780 | |
| }, | |
| { | |
| "completion_length": 183.32084121704102, | |
| "epoch": 0.952, | |
| "grad_norm": 3580.284912109375, | |
| "kl": 2.233203125, | |
| "learning_rate": 7.00612291017022e-09, | |
| "loss": 0.6764, | |
| "reward": 1.1083333492279053, | |
| "reward_std": 0.3922556236386299, | |
| "rewards/accuracy_reward": 0.24166667684912682, | |
| "rewards/format_reward": 0.8666666924953461, | |
| "step": 1785 | |
| }, | |
| { | |
| "completion_length": 191.9708381652832, | |
| "epoch": 0.9546666666666667, | |
| "grad_norm": 7.294617176055908, | |
| "kl": 1.476171875, | |
| "learning_rate": 6.2508697997538665e-09, | |
| "loss": 0.5961, | |
| "reward": 1.0791666865348817, | |
| "reward_std": 0.4319505989551544, | |
| "rewards/accuracy_reward": 0.2291666753590107, | |
| "rewards/format_reward": 0.8500000238418579, | |
| "step": 1790 | |
| }, | |
| { | |
| "completion_length": 211.21250610351564, | |
| "epoch": 0.9573333333333334, | |
| "grad_norm": 4.981256484985352, | |
| "kl": 1.1861328125, | |
| "learning_rate": 5.538423492408129e-09, | |
| "loss": 0.604, | |
| "reward": 1.262500035762787, | |
| "reward_std": 0.45401586443185804, | |
| "rewards/accuracy_reward": 0.39583334848284724, | |
| "rewards/format_reward": 0.8666666805744171, | |
| "step": 1795 | |
| }, | |
| { | |
| "completion_length": 283.41251068115236, | |
| "epoch": 0.96, | |
| "grad_norm": 2.2728030681610107, | |
| "kl": 1.4833984375, | |
| "learning_rate": 4.8688457554291736e-09, | |
| "loss": 0.7529, | |
| "reward": 1.1791666984558105, | |
| "reward_std": 0.535879123210907, | |
| "rewards/accuracy_reward": 0.4041666742414236, | |
| "rewards/format_reward": 0.7750000178813934, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 0.96, | |
| "eval_completion_length": 212.29556121826172, | |
| "eval_kl": 1.955859375, | |
| "eval_loss": 0.6787320971488953, | |
| "eval_reward": 0.9722222558657329, | |
| "eval_reward_std": 0.4794289442896843, | |
| "eval_rewards/accuracy_reward": 0.1600000035762787, | |
| "eval_rewards/format_reward": 0.8122222447395324, | |
| "eval_runtime": 704.5393, | |
| "eval_samples_per_second": 0.426, | |
| "eval_steps_per_second": 0.018, | |
| "step": 1800 | |
| }, | |
| { | |
| "completion_length": 171.38750381469725, | |
| "epoch": 0.9626666666666667, | |
| "grad_norm": 7.666684150695801, | |
| "kl": 1.298046875, | |
| "learning_rate": 4.242194639516416e-09, | |
| "loss": 0.5453, | |
| "reward": 1.1750000238418579, | |
| "reward_std": 0.3607516996562481, | |
| "rewards/accuracy_reward": 0.30000000856816766, | |
| "rewards/format_reward": 0.8750000178813935, | |
| "step": 1805 | |
| }, | |
| { | |
| "completion_length": 182.1208396911621, | |
| "epoch": 0.9653333333333334, | |
| "grad_norm": 2.6979856491088867, | |
| "kl": 1.5603515625, | |
| "learning_rate": 3.658524473739544e-09, | |
| "loss": 0.7398, | |
| "reward": 1.1625000536441803, | |
| "reward_std": 0.40499134212732313, | |
| "rewards/accuracy_reward": 0.30416667461395264, | |
| "rewards/format_reward": 0.8583333551883697, | |
| "step": 1810 | |
| }, | |
| { | |
| "completion_length": 216.9875061035156, | |
| "epoch": 0.968, | |
| "grad_norm": 11.175540924072266, | |
| "kl": 1.25625, | |
| "learning_rate": 3.1178858608283954e-09, | |
| "loss": 0.7517, | |
| "reward": 1.0833333790302277, | |
| "reward_std": 0.4425746828317642, | |
| "rewards/accuracy_reward": 0.24166667386889457, | |
| "rewards/format_reward": 0.8416666865348816, | |
| "step": 1815 | |
| }, | |
| { | |
| "completion_length": 187.23750762939454, | |
| "epoch": 0.9706666666666667, | |
| "grad_norm": 7.220974922180176, | |
| "kl": 1.28515625, | |
| "learning_rate": 2.6203256727859167e-09, | |
| "loss": 0.7132, | |
| "reward": 1.1583333730697631, | |
| "reward_std": 0.43746666610240936, | |
| "rewards/accuracy_reward": 0.3083333432674408, | |
| "rewards/format_reward": 0.8500000178813935, | |
| "step": 1820 | |
| }, | |
| { | |
| "completion_length": 164.65833892822266, | |
| "epoch": 0.9733333333333334, | |
| "grad_norm": 10.702654838562012, | |
| "kl": 1.7052734375, | |
| "learning_rate": 2.165887046824133e-09, | |
| "loss": 0.5298, | |
| "reward": 1.1958333730697632, | |
| "reward_std": 0.32650465294718745, | |
| "rewards/accuracy_reward": 0.32916667461395266, | |
| "rewards/format_reward": 0.8666666865348815, | |
| "step": 1825 | |
| }, | |
| { | |
| "completion_length": 198.562508392334, | |
| "epoch": 0.976, | |
| "grad_norm": 6.067382335662842, | |
| "kl": 1.7404296875, | |
| "learning_rate": 1.7546093816246387e-09, | |
| "loss": 0.6923, | |
| "reward": 1.004166704416275, | |
| "reward_std": 0.3946992427110672, | |
| "rewards/accuracy_reward": 0.17083333544433116, | |
| "rewards/format_reward": 0.8333333551883697, | |
| "step": 1830 | |
| }, | |
| { | |
| "completion_length": 206.00834197998046, | |
| "epoch": 0.9786666666666667, | |
| "grad_norm": 8.223440170288086, | |
| "kl": 1.7333984375, | |
| "learning_rate": 1.3865283339228316e-09, | |
| "loss": 0.6633, | |
| "reward": 1.091666692495346, | |
| "reward_std": 0.45499495714902877, | |
| "rewards/accuracy_reward": 0.26666667237877845, | |
| "rewards/format_reward": 0.8250000178813934, | |
| "step": 1835 | |
| }, | |
| { | |
| "completion_length": 162.4708381652832, | |
| "epoch": 0.9813333333333333, | |
| "grad_norm": 6.2932610511779785, | |
| "kl": 1.397265625, | |
| "learning_rate": 1.0616758154161631e-09, | |
| "loss": 0.5335, | |
| "reward": 1.3208333730697632, | |
| "reward_std": 0.27748758494853976, | |
| "rewards/accuracy_reward": 0.4041666716337204, | |
| "rewards/format_reward": 0.9166666865348816, | |
| "step": 1840 | |
| }, | |
| { | |
| "completion_length": 165.61667251586914, | |
| "epoch": 0.984, | |
| "grad_norm": 9.122838020324707, | |
| "kl": 1.4970703125, | |
| "learning_rate": 7.80079989997906e-10, | |
| "loss": 0.6188, | |
| "reward": 1.1958333611488343, | |
| "reward_std": 0.4245347425341606, | |
| "rewards/accuracy_reward": 0.33750000409781933, | |
| "rewards/format_reward": 0.8583333551883697, | |
| "step": 1845 | |
| }, | |
| { | |
| "completion_length": 215.14167251586915, | |
| "epoch": 0.9866666666666667, | |
| "grad_norm": 23.73024559020996, | |
| "kl": 1.775, | |
| "learning_rate": 5.417652713152199e-10, | |
| "loss": 0.5543, | |
| "reward": 1.1333333730697632, | |
| "reward_std": 0.4075448580086231, | |
| "rewards/accuracy_reward": 0.31666667610406873, | |
| "rewards/format_reward": 0.8166666835546493, | |
| "step": 1850 | |
| }, | |
| { | |
| "completion_length": 185.79167404174805, | |
| "epoch": 0.9893333333333333, | |
| "grad_norm": 20.497873306274414, | |
| "kl": 1.5189453125, | |
| "learning_rate": 3.4675232065256574e-10, | |
| "loss": 0.4238, | |
| "reward": 1.0708333492279052, | |
| "reward_std": 0.32569129317998885, | |
| "rewards/accuracy_reward": 0.24583334028720855, | |
| "rewards/format_reward": 0.8250000059604645, | |
| "step": 1855 | |
| }, | |
| { | |
| "completion_length": 225.09583892822266, | |
| "epoch": 0.992, | |
| "grad_norm": 12.528355598449707, | |
| "kl": 1.2953125, | |
| "learning_rate": 1.9505804514047264e-10, | |
| "loss": 0.4901, | |
| "reward": 1.1541666984558105, | |
| "reward_std": 0.4231454662978649, | |
| "rewards/accuracy_reward": 0.32083334028720856, | |
| "rewards/format_reward": 0.8333333373069763, | |
| "step": 1860 | |
| }, | |
| { | |
| "completion_length": 221.36667175292968, | |
| "epoch": 0.9946666666666667, | |
| "grad_norm": 13.679535865783691, | |
| "kl": 1.390625, | |
| "learning_rate": 8.669559628954326e-11, | |
| "loss": 0.8532, | |
| "reward": 1.1583333730697631, | |
| "reward_std": 0.5791940867900849, | |
| "rewards/accuracy_reward": 0.35000000409781934, | |
| "rewards/format_reward": 0.8083333611488343, | |
| "step": 1865 | |
| }, | |
| { | |
| "completion_length": 158.7708381652832, | |
| "epoch": 0.9973333333333333, | |
| "grad_norm": 37.47658157348633, | |
| "kl": 1.707421875, | |
| "learning_rate": 2.1674368850643777e-11, | |
| "loss": 0.5296, | |
| "reward": 1.1458333611488343, | |
| "reward_std": 0.29994617849588395, | |
| "rewards/accuracy_reward": 0.2541666738688946, | |
| "rewards/format_reward": 0.8916666805744171, | |
| "step": 1870 | |
| }, | |
| { | |
| "completion_length": 223.59584197998046, | |
| "epoch": 1.0, | |
| "grad_norm": 7.715764999389648, | |
| "kl": 1.104296875, | |
| "learning_rate": 0.0, | |
| "loss": 0.7452, | |
| "reward": 1.1708333790302277, | |
| "reward_std": 0.4897158071398735, | |
| "rewards/accuracy_reward": 0.3375000100582838, | |
| "rewards/format_reward": 0.8333333492279053, | |
| "step": 1875 | |
| }, | |
| { | |
| "epoch": 1.0, | |
| "step": 1875, | |
| "total_flos": 0.0, | |
| "train_loss": 0.738647110915184, | |
| "train_runtime": 47500.6212, | |
| "train_samples_per_second": 0.316, | |
| "train_steps_per_second": 0.039 | |
| } | |
| ], | |
| "logging_steps": 5, | |
| "max_steps": 1875, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 1, | |
| "save_steps": 500, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": false, | |
| "should_training_stop": false | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 0.0, | |
| "train_batch_size": 8, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |