Qwen2.5-1.5B-Knowledge-R1-GRPO / trainer_state.json
hzy's picture
Model save
26c526b verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 200,
"global_step": 1875,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"completion_length": 309.1416763305664,
"epoch": 0.0026666666666666666,
"grad_norm": 1.0915299654006958,
"kl": 0.000769805908203125,
"learning_rate": 2.6595744680851062e-08,
"loss": 0.0572,
"reward": -0.8166666805744172,
"reward_std": 0.30653437227010727,
"rewards/accuracy_reward": 0.07500000223517418,
"rewards/format_reward": -0.8916666805744171,
"step": 5
},
{
"completion_length": 335.7166778564453,
"epoch": 0.005333333333333333,
"grad_norm": 1.6455351114273071,
"kl": 0.00115509033203125,
"learning_rate": 5.3191489361702123e-08,
"loss": 0.0544,
"reward": -0.8583333492279053,
"reward_std": 0.2569814197719097,
"rewards/accuracy_reward": 0.0666666679084301,
"rewards/format_reward": -0.925000011920929,
"step": 10
},
{
"completion_length": 290.56251220703126,
"epoch": 0.008,
"grad_norm": 1.7345483303070068,
"kl": 0.0012176513671875,
"learning_rate": 7.978723404255319e-08,
"loss": 0.0639,
"reward": -0.8333333611488343,
"reward_std": 0.4082482993602753,
"rewards/accuracy_reward": 0.04166666753590107,
"rewards/format_reward": -0.8750000238418579,
"step": 15
},
{
"completion_length": 320.79584350585935,
"epoch": 0.010666666666666666,
"grad_norm": 1.5033214092254639,
"kl": 0.0012172698974609376,
"learning_rate": 1.0638297872340425e-07,
"loss": 0.0569,
"reward": -0.8416666924953461,
"reward_std": 0.2757193736732006,
"rewards/accuracy_reward": 0.04166666753590107,
"rewards/format_reward": -0.8833333492279053,
"step": 20
},
{
"completion_length": 341.09584655761716,
"epoch": 0.013333333333333334,
"grad_norm": 1.9244236946105957,
"kl": 0.0012542724609375,
"learning_rate": 1.329787234042553e-07,
"loss": 0.0252,
"reward": -0.8458333492279053,
"reward_std": 0.3047572821378708,
"rewards/accuracy_reward": 0.08750000149011612,
"rewards/format_reward": -0.9333333492279052,
"step": 25
},
{
"completion_length": 298.28334350585936,
"epoch": 0.016,
"grad_norm": 2.07985258102417,
"kl": 0.0014141082763671875,
"learning_rate": 1.5957446808510638e-07,
"loss": 0.0589,
"reward": -0.8083333551883698,
"reward_std": 0.3681695103645325,
"rewards/accuracy_reward": 0.05833333432674408,
"rewards/format_reward": -0.8666666924953461,
"step": 30
},
{
"completion_length": 327.3208374023437,
"epoch": 0.018666666666666668,
"grad_norm": 1.2517731189727783,
"kl": 0.0022617340087890624,
"learning_rate": 1.8617021276595742e-07,
"loss": 0.0912,
"reward": -0.854166692495346,
"reward_std": 0.3303105406463146,
"rewards/accuracy_reward": 0.03750000111758709,
"rewards/format_reward": -0.891666692495346,
"step": 35
},
{
"completion_length": 352.6000122070312,
"epoch": 0.021333333333333333,
"grad_norm": 0.8213557004928589,
"kl": 0.003079986572265625,
"learning_rate": 2.127659574468085e-07,
"loss": 0.0448,
"reward": -0.850000011920929,
"reward_std": 0.28401452749967576,
"rewards/accuracy_reward": 0.05833333395421505,
"rewards/format_reward": -0.9083333432674408,
"step": 40
},
{
"completion_length": 337.5208435058594,
"epoch": 0.024,
"grad_norm": 0.7038048505783081,
"kl": 0.0064483642578125,
"learning_rate": 2.393617021276596e-07,
"loss": 0.1086,
"reward": -0.8291666865348816,
"reward_std": 0.320113442838192,
"rewards/accuracy_reward": 0.07083333469927311,
"rewards/format_reward": -0.9000000119209289,
"step": 45
},
{
"completion_length": 353.56251220703126,
"epoch": 0.02666666666666667,
"grad_norm": 1.7768754959106445,
"kl": 0.011151123046875,
"learning_rate": 2.659574468085106e-07,
"loss": 0.0946,
"reward": -0.791666692495346,
"reward_std": 0.3645200379192829,
"rewards/accuracy_reward": 0.05000000149011612,
"rewards/format_reward": -0.8416666805744171,
"step": 50
},
{
"completion_length": 304.80834197998047,
"epoch": 0.029333333333333333,
"grad_norm": 1.8845570087432861,
"kl": 0.0145263671875,
"learning_rate": 2.925531914893617e-07,
"loss": 0.1085,
"reward": -0.825000011920929,
"reward_std": 0.378238408267498,
"rewards/accuracy_reward": 0.05000000149011612,
"rewards/format_reward": -0.8750000119209289,
"step": 55
},
{
"completion_length": 300.5541717529297,
"epoch": 0.032,
"grad_norm": 1.3354154825210571,
"kl": 0.0145751953125,
"learning_rate": 3.1914893617021275e-07,
"loss": 0.0812,
"reward": -0.8083333492279052,
"reward_std": 0.3903405636548996,
"rewards/accuracy_reward": 0.09166666939854622,
"rewards/format_reward": -0.900000023841858,
"step": 60
},
{
"completion_length": 302.21250915527344,
"epoch": 0.034666666666666665,
"grad_norm": 1.0147898197174072,
"kl": 0.022381591796875,
"learning_rate": 3.457446808510638e-07,
"loss": 0.1457,
"reward": -0.7458333492279052,
"reward_std": 0.47493031769990923,
"rewards/accuracy_reward": 0.03750000111758709,
"rewards/format_reward": -0.7833333492279053,
"step": 65
},
{
"completion_length": 291.40417633056643,
"epoch": 0.037333333333333336,
"grad_norm": 2.549830675125122,
"kl": 0.03157958984375,
"learning_rate": 3.7234042553191484e-07,
"loss": 0.1052,
"reward": -0.7708333551883697,
"reward_std": 0.4252162277698517,
"rewards/accuracy_reward": 0.06250000037252904,
"rewards/format_reward": -0.8333333551883697,
"step": 70
},
{
"completion_length": 315.60834350585935,
"epoch": 0.04,
"grad_norm": 1.0364981889724731,
"kl": 0.02958984375,
"learning_rate": 3.989361702127659e-07,
"loss": 0.1479,
"reward": -0.7458333551883698,
"reward_std": 0.48624068051576613,
"rewards/accuracy_reward": 0.06250000111758709,
"rewards/format_reward": -0.8083333611488343,
"step": 75
},
{
"completion_length": 275.8916732788086,
"epoch": 0.042666666666666665,
"grad_norm": 1.8735177516937256,
"kl": 0.03341064453125,
"learning_rate": 4.25531914893617e-07,
"loss": 0.1291,
"reward": -0.7250000238418579,
"reward_std": 0.5295502826571464,
"rewards/accuracy_reward": 0.05833333395421505,
"rewards/format_reward": -0.7833333551883698,
"step": 80
},
{
"completion_length": 278.6000091552734,
"epoch": 0.04533333333333334,
"grad_norm": 1.7058619260787964,
"kl": 0.04388427734375,
"learning_rate": 4.5212765957446806e-07,
"loss": 0.1241,
"reward": -0.7166666746139526,
"reward_std": 0.517094686627388,
"rewards/accuracy_reward": 0.05833333395421505,
"rewards/format_reward": -0.7750000119209289,
"step": 85
},
{
"completion_length": 294.21667327880857,
"epoch": 0.048,
"grad_norm": 2.6495754718780518,
"kl": 0.05029296875,
"learning_rate": 4.787234042553192e-07,
"loss": 0.1663,
"reward": -0.6333333402872086,
"reward_std": 0.6619763910770416,
"rewards/accuracy_reward": 0.08333333507180214,
"rewards/format_reward": -0.7166666865348816,
"step": 90
},
{
"completion_length": 290.7458435058594,
"epoch": 0.050666666666666665,
"grad_norm": 1.9712241888046265,
"kl": 0.064501953125,
"learning_rate": 5.053191489361702e-07,
"loss": 0.2083,
"reward": -0.5291666805744171,
"reward_std": 0.7097373753786087,
"rewards/accuracy_reward": 0.10416666902601719,
"rewards/format_reward": -0.6333333551883698,
"step": 95
},
{
"completion_length": 276.9541778564453,
"epoch": 0.05333333333333334,
"grad_norm": 2.6062912940979004,
"kl": 0.09736328125,
"learning_rate": 5.319148936170212e-07,
"loss": 0.1889,
"reward": -0.5833333522081375,
"reward_std": 0.6344460442662239,
"rewards/accuracy_reward": 0.09166666977107525,
"rewards/format_reward": -0.675000011920929,
"step": 100
},
{
"completion_length": 258.17084350585935,
"epoch": 0.056,
"grad_norm": 3.833866834640503,
"kl": 0.1748046875,
"learning_rate": 5.585106382978722e-07,
"loss": 0.1782,
"reward": -0.5166666775941848,
"reward_std": 0.7733665883541108,
"rewards/accuracy_reward": 0.11666666902601719,
"rewards/format_reward": -0.6333333492279053,
"step": 105
},
{
"completion_length": 276.9500076293945,
"epoch": 0.058666666666666666,
"grad_norm": 2.573361873626709,
"kl": 0.1017578125,
"learning_rate": 5.851063829787234e-07,
"loss": 0.1876,
"reward": -0.49583334624767306,
"reward_std": 0.7565078109502792,
"rewards/accuracy_reward": 0.12083333656191826,
"rewards/format_reward": -0.6166666924953461,
"step": 110
},
{
"completion_length": 276.2250061035156,
"epoch": 0.06133333333333333,
"grad_norm": 1.7497327327728271,
"kl": 0.075927734375,
"learning_rate": 6.117021276595744e-07,
"loss": 0.1169,
"reward": -0.6041666865348816,
"reward_std": 0.6442232474684715,
"rewards/accuracy_reward": 0.10416666828095913,
"rewards/format_reward": -0.7083333551883697,
"step": 115
},
{
"completion_length": 258.6750091552734,
"epoch": 0.064,
"grad_norm": 2.181297779083252,
"kl": 0.081201171875,
"learning_rate": 6.382978723404255e-07,
"loss": 0.1729,
"reward": -0.5708333432674408,
"reward_std": 0.7166081488132476,
"rewards/accuracy_reward": 0.06250000186264515,
"rewards/format_reward": -0.6333333522081375,
"step": 120
},
{
"completion_length": 268.40834045410156,
"epoch": 0.06666666666666667,
"grad_norm": 4.303333759307861,
"kl": 0.133251953125,
"learning_rate": 6.648936170212765e-07,
"loss": 0.2288,
"reward": -0.5125000178813934,
"reward_std": 0.7744767606258393,
"rewards/accuracy_reward": 0.07083333544433117,
"rewards/format_reward": -0.5833333492279053,
"step": 125
},
{
"completion_length": 299.7708435058594,
"epoch": 0.06933333333333333,
"grad_norm": 4.013002872467041,
"kl": 0.169775390625,
"learning_rate": 6.914893617021277e-07,
"loss": 0.318,
"reward": -0.3000000104308128,
"reward_std": 0.8895713210105896,
"rewards/accuracy_reward": 0.08333333507180214,
"rewards/format_reward": -0.3833333447575569,
"step": 130
},
{
"completion_length": 235.02500915527344,
"epoch": 0.072,
"grad_norm": 2.6780853271484375,
"kl": 0.173779296875,
"learning_rate": 7.180851063829787e-07,
"loss": 0.1926,
"reward": -0.35416668057441714,
"reward_std": 0.7739700466394425,
"rewards/accuracy_reward": 0.11250000409781932,
"rewards/format_reward": -0.46666668355464935,
"step": 135
},
{
"completion_length": 257.54584045410155,
"epoch": 0.07466666666666667,
"grad_norm": 1.999470829963684,
"kl": 0.117333984375,
"learning_rate": 7.446808510638297e-07,
"loss": 0.2742,
"reward": -0.23750000968575477,
"reward_std": 0.8838598787784576,
"rewards/accuracy_reward": 0.10416666977107525,
"rewards/format_reward": -0.3416666746139526,
"step": 140
},
{
"completion_length": 246.15000610351564,
"epoch": 0.07733333333333334,
"grad_norm": 3.609872579574585,
"kl": 0.20576171875,
"learning_rate": 7.712765957446808e-07,
"loss": 0.2398,
"reward": -0.3708333432674408,
"reward_std": 0.879425299167633,
"rewards/accuracy_reward": 0.11250000149011612,
"rewards/format_reward": -0.4833333492279053,
"step": 145
},
{
"completion_length": 259.1625091552734,
"epoch": 0.08,
"grad_norm": 4.464442729949951,
"kl": 0.21318359375,
"learning_rate": 7.978723404255318e-07,
"loss": 0.2627,
"reward": -0.1833333384245634,
"reward_std": 0.9505029916763306,
"rewards/accuracy_reward": 0.12500000260770322,
"rewards/format_reward": -0.3083333432674408,
"step": 150
},
{
"completion_length": 236.57083740234376,
"epoch": 0.08266666666666667,
"grad_norm": 4.6332550048828125,
"kl": 0.216796875,
"learning_rate": 8.24468085106383e-07,
"loss": 0.2414,
"reward": -0.23333333767950534,
"reward_std": 0.9220583379268646,
"rewards/accuracy_reward": 0.13333333544433118,
"rewards/format_reward": -0.3666666798293591,
"step": 155
},
{
"completion_length": 223.12500610351563,
"epoch": 0.08533333333333333,
"grad_norm": 2.853530168533325,
"kl": 0.240625,
"learning_rate": 8.51063829787234e-07,
"loss": 0.2636,
"reward": -0.2166666705161333,
"reward_std": 0.9433093965053558,
"rewards/accuracy_reward": 0.10833333730697632,
"rewards/format_reward": -0.32500001341104506,
"step": 160
},
{
"completion_length": 187.4041717529297,
"epoch": 0.088,
"grad_norm": 3.2459123134613037,
"kl": 0.2255859375,
"learning_rate": 8.77659574468085e-07,
"loss": 0.2653,
"reward": -0.13750000298023224,
"reward_std": 0.9205778002738952,
"rewards/accuracy_reward": 0.1291666690260172,
"rewards/format_reward": -0.2666666805744171,
"step": 165
},
{
"completion_length": 199.13333892822266,
"epoch": 0.09066666666666667,
"grad_norm": 2.3914144039154053,
"kl": 0.2478515625,
"learning_rate": 9.042553191489361e-07,
"loss": 0.242,
"reward": -0.2500000067055225,
"reward_std": 0.9306629121303558,
"rewards/accuracy_reward": 0.05833333432674408,
"rewards/format_reward": -0.30833333879709246,
"step": 170
},
{
"completion_length": 194.1500045776367,
"epoch": 0.09333333333333334,
"grad_norm": 7.140366554260254,
"kl": 0.336328125,
"learning_rate": 9.308510638297871e-07,
"loss": 0.2465,
"reward": 0.11250000335276127,
"reward_std": 0.9915949404239655,
"rewards/accuracy_reward": 0.2041666742414236,
"rewards/format_reward": -0.09166667088866234,
"step": 175
},
{
"completion_length": 177.92917022705078,
"epoch": 0.096,
"grad_norm": 3.311171293258667,
"kl": 0.31455078125,
"learning_rate": 9.574468085106384e-07,
"loss": 0.2429,
"reward": -0.02083333097398281,
"reward_std": 0.9874655485153199,
"rewards/accuracy_reward": 0.1458333358168602,
"rewards/format_reward": -0.1666666716337204,
"step": 180
},
{
"completion_length": 204.3000045776367,
"epoch": 0.09866666666666667,
"grad_norm": 15.891555786132812,
"kl": 0.29267578125,
"learning_rate": 9.840425531914893e-07,
"loss": 0.2048,
"reward": -0.04166666828095913,
"reward_std": 0.999242752790451,
"rewards/accuracy_reward": 0.16666667237877847,
"rewards/format_reward": -0.20833333879709243,
"step": 185
},
{
"completion_length": 169.1041702270508,
"epoch": 0.10133333333333333,
"grad_norm": 3.4694690704345703,
"kl": 0.36845703125,
"learning_rate": 9.999965320799375e-07,
"loss": 0.2537,
"reward": 0.29166667312383654,
"reward_std": 0.9786251783370972,
"rewards/accuracy_reward": 0.19166667237877846,
"rewards/format_reward": 0.10000000670552253,
"step": 190
},
{
"completion_length": 155.45833740234374,
"epoch": 0.104,
"grad_norm": 4.827131271362305,
"kl": 0.4091796875,
"learning_rate": 9.999575185316993e-07,
"loss": 0.253,
"reward": 0.21666667386889457,
"reward_std": 0.9847019255161286,
"rewards/accuracy_reward": 0.12500000335276126,
"rewards/format_reward": 0.09166666865348816,
"step": 195
},
{
"completion_length": 153.3083366394043,
"epoch": 0.10666666666666667,
"grad_norm": 6.091984272003174,
"kl": 0.4533203125,
"learning_rate": 9.998751599287957e-07,
"loss": 0.2823,
"reward": 0.37916667610406873,
"reward_std": 1.0222100555896758,
"rewards/accuracy_reward": 0.16250000521540642,
"rewards/format_reward": 0.21666667237877846,
"step": 200
},
{
"epoch": 0.10666666666666667,
"eval_completion_length": 163.56056142171224,
"eval_kl": 0.3924609375,
"eval_loss": 0.2787472605705261,
"eval_reward": 0.34055556610226634,
"eval_reward_std": 0.9718689926465353,
"eval_rewards/accuracy_reward": 0.09500000228484472,
"eval_rewards/format_reward": 0.24555556188027064,
"eval_runtime": 368.0531,
"eval_samples_per_second": 0.815,
"eval_steps_per_second": 0.035,
"step": 200
},
{
"completion_length": 155.39583740234374,
"epoch": 0.10933333333333334,
"grad_norm": 4.420968532562256,
"kl": 0.378125,
"learning_rate": 9.9974946341151e-07,
"loss": 0.2678,
"reward": 0.42916667759418486,
"reward_std": 1.0038800358772277,
"rewards/accuracy_reward": 0.17916666977107526,
"rewards/format_reward": 0.25000000819563867,
"step": 205
},
{
"completion_length": 143.2500030517578,
"epoch": 0.112,
"grad_norm": 40.516544342041016,
"kl": 0.4390625,
"learning_rate": 9.995804398774126e-07,
"loss": 0.303,
"reward": 0.6583333551883698,
"reward_std": 0.892980021238327,
"rewards/accuracy_reward": 0.20000000819563865,
"rewards/format_reward": 0.45833334028720857,
"step": 210
},
{
"completion_length": 129.4250045776367,
"epoch": 0.11466666666666667,
"grad_norm": 3.4134747982025146,
"kl": 0.75546875,
"learning_rate": 9.993681039804173e-07,
"loss": 0.2308,
"reward": 0.6916666924953461,
"reward_std": 0.7836884081363678,
"rewards/accuracy_reward": 0.16666667200624943,
"rewards/format_reward": 0.5250000201165677,
"step": 215
},
{
"completion_length": 127.5083381652832,
"epoch": 0.11733333333333333,
"grad_norm": 4.250089645385742,
"kl": 0.4599609375,
"learning_rate": 9.991124741295105e-07,
"loss": 0.2392,
"reward": 0.6500000178813934,
"reward_std": 0.790798443555832,
"rewards/accuracy_reward": 0.10000000298023223,
"rewards/format_reward": 0.5500000208616257,
"step": 220
},
{
"completion_length": 142.82500534057618,
"epoch": 0.12,
"grad_norm": 3.440370798110962,
"kl": 0.3755859375,
"learning_rate": 9.988135724871545e-07,
"loss": 0.2241,
"reward": 0.6125000149011612,
"reward_std": 0.9097375214099884,
"rewards/accuracy_reward": 0.17083333767950534,
"rewards/format_reward": 0.4416666775941849,
"step": 225
},
{
"completion_length": 142.2708381652832,
"epoch": 0.12266666666666666,
"grad_norm": 3.0582029819488525,
"kl": 0.4955078125,
"learning_rate": 9.984714249673673e-07,
"loss": 0.2703,
"reward": 0.7000000238418579,
"reward_std": 0.9109564527869225,
"rewards/accuracy_reward": 0.22500000149011612,
"rewards/format_reward": 0.47500001490116117,
"step": 230
},
{
"completion_length": 133.31250381469727,
"epoch": 0.12533333333333332,
"grad_norm": 4.252659797668457,
"kl": 0.50390625,
"learning_rate": 9.98086061233475e-07,
"loss": 0.312,
"reward": 0.8500000298023224,
"reward_std": 0.840288233757019,
"rewards/accuracy_reward": 0.2833333402872086,
"rewards/format_reward": 0.5666666805744172,
"step": 235
},
{
"completion_length": 118.69167022705078,
"epoch": 0.128,
"grad_norm": 82.35729217529297,
"kl": 1.1455078125,
"learning_rate": 9.97657514695541e-07,
"loss": 0.2646,
"reward": 0.9041666984558105,
"reward_std": 0.7261348217725754,
"rewards/accuracy_reward": 0.22083334028720855,
"rewards/format_reward": 0.6833333551883698,
"step": 240
},
{
"completion_length": 123.26250381469727,
"epoch": 0.13066666666666665,
"grad_norm": 3.8186757564544678,
"kl": 0.5734375,
"learning_rate": 9.971858225074672e-07,
"loss": 0.2327,
"reward": 0.8333333790302276,
"reward_std": 0.7271113842725754,
"rewards/accuracy_reward": 0.19166667014360428,
"rewards/format_reward": 0.6416666895151139,
"step": 245
},
{
"completion_length": 139.0875045776367,
"epoch": 0.13333333333333333,
"grad_norm": 4.611005783081055,
"kl": 0.55546875,
"learning_rate": 9.966710255637762e-07,
"loss": 0.3656,
"reward": 0.7083333514630794,
"reward_std": 0.7690498679876328,
"rewards/accuracy_reward": 0.12500000298023223,
"rewards/format_reward": 0.5833333611488343,
"step": 250
},
{
"completion_length": 131.25000610351563,
"epoch": 0.136,
"grad_norm": 5.000573635101318,
"kl": 0.6904296875,
"learning_rate": 9.961131684960634e-07,
"loss": 0.3735,
"reward": 0.7583333611488342,
"reward_std": 0.8809190809726715,
"rewards/accuracy_reward": 0.21666667200624942,
"rewards/format_reward": 0.5416666835546493,
"step": 255
},
{
"completion_length": 130.9083381652832,
"epoch": 0.13866666666666666,
"grad_norm": 10.388055801391602,
"kl": 0.87734375,
"learning_rate": 9.955122996691277e-07,
"loss": 0.4113,
"reward": 0.8041666835546494,
"reward_std": 0.8410302340984345,
"rewards/accuracy_reward": 0.2125000074505806,
"rewards/format_reward": 0.5916666954755783,
"step": 260
},
{
"completion_length": 128.3708366394043,
"epoch": 0.14133333333333334,
"grad_norm": 5.107520580291748,
"kl": 0.790234375,
"learning_rate": 9.948684711767799e-07,
"loss": 0.3787,
"reward": 0.8416666924953461,
"reward_std": 0.6999663650989533,
"rewards/accuracy_reward": 0.18333333507180213,
"rewards/format_reward": 0.6583333522081375,
"step": 265
},
{
"completion_length": 163.54583816528321,
"epoch": 0.144,
"grad_norm": 7.847362995147705,
"kl": 0.6859375,
"learning_rate": 9.941817388373247e-07,
"loss": 0.486,
"reward": 0.9250000238418579,
"reward_std": 0.6243289351463318,
"rewards/accuracy_reward": 0.1916666693985462,
"rewards/format_reward": 0.7333333551883697,
"step": 270
},
{
"completion_length": 222.45000915527345,
"epoch": 0.14666666666666667,
"grad_norm": 4.114348411560059,
"kl": 0.951953125,
"learning_rate": 9.934521621887221e-07,
"loss": 0.5765,
"reward": 0.825000011920929,
"reward_std": 0.70595743060112,
"rewards/accuracy_reward": 0.18333333767950535,
"rewards/format_reward": 0.6416666865348816,
"step": 275
},
{
"completion_length": 270.7250061035156,
"epoch": 0.14933333333333335,
"grad_norm": 7.752039432525635,
"kl": 1.58359375,
"learning_rate": 9.926798044834259e-07,
"loss": 0.8842,
"reward": 0.7625000387430191,
"reward_std": 0.8192247807979584,
"rewards/accuracy_reward": 0.1541666727513075,
"rewards/format_reward": 0.608333346247673,
"step": 280
},
{
"completion_length": 286.2875045776367,
"epoch": 0.152,
"grad_norm": 20.332632064819336,
"kl": 1.559765625,
"learning_rate": 9.91864732682899e-07,
"loss": 0.7816,
"reward": 0.6875000238418579,
"reward_std": 0.8882942378520966,
"rewards/accuracy_reward": 0.19583333805203437,
"rewards/format_reward": 0.49166667759418486,
"step": 285
},
{
"completion_length": 321.8916748046875,
"epoch": 0.15466666666666667,
"grad_norm": 70.37603759765625,
"kl": 2.5359375,
"learning_rate": 9.910070174518091e-07,
"loss": 0.8462,
"reward": 0.7041666865348816,
"reward_std": 0.9259481251239776,
"rewards/accuracy_reward": 0.1875000037252903,
"rewards/format_reward": 0.5166666805744171,
"step": 290
},
{
"completion_length": 311.6041748046875,
"epoch": 0.15733333333333333,
"grad_norm": 78.9567642211914,
"kl": 3.86171875,
"learning_rate": 9.90106733151901e-07,
"loss": 0.9557,
"reward": 0.5708333492279053,
"reward_std": 0.9381726026535034,
"rewards/accuracy_reward": 0.13750000149011612,
"rewards/format_reward": 0.4333333447575569,
"step": 295
},
{
"completion_length": 250.3291778564453,
"epoch": 0.16,
"grad_norm": 138.24191284179688,
"kl": 8.51640625,
"learning_rate": 9.89163957835551e-07,
"loss": 1.2961,
"reward": 0.6208333462476731,
"reward_std": 0.909997683763504,
"rewards/accuracy_reward": 0.1541666727513075,
"rewards/format_reward": 0.46666667610406876,
"step": 300
},
{
"completion_length": 215.75833740234376,
"epoch": 0.16266666666666665,
"grad_norm": 126.323974609375,
"kl": 3.709375,
"learning_rate": 9.881787732389985e-07,
"loss": 0.9302,
"reward": 0.6666666895151139,
"reward_std": 0.8827720135450363,
"rewards/accuracy_reward": 0.16666667088866233,
"rewards/format_reward": 0.500000013411045,
"step": 305
},
{
"completion_length": 180.2916732788086,
"epoch": 0.16533333333333333,
"grad_norm": 36.90127944946289,
"kl": 3.61640625,
"learning_rate": 9.871512647752612e-07,
"loss": 0.8254,
"reward": 0.5916666835546494,
"reward_std": 0.9250853776931762,
"rewards/accuracy_reward": 0.11666666902601719,
"rewards/format_reward": 0.47500001192092894,
"step": 310
},
{
"completion_length": 159.34167404174804,
"epoch": 0.168,
"grad_norm": 50.75526428222656,
"kl": 5.4921875,
"learning_rate": 9.860815215267287e-07,
"loss": 0.8761,
"reward": 0.704166692495346,
"reward_std": 0.9947333693504333,
"rewards/accuracy_reward": 0.21250000447034836,
"rewards/format_reward": 0.491666679084301,
"step": 315
},
{
"completion_length": 171.51667327880858,
"epoch": 0.17066666666666666,
"grad_norm": 50.245628356933594,
"kl": 4.3265625,
"learning_rate": 9.849696362374397e-07,
"loss": 0.8211,
"reward": 0.49166668951511383,
"reward_std": 0.8819661140441895,
"rewards/accuracy_reward": 0.1000000037252903,
"rewards/format_reward": 0.3916666813194752,
"step": 320
},
{
"completion_length": 130.46667098999023,
"epoch": 0.17333333333333334,
"grad_norm": 25.52703285217285,
"kl": 4.159375,
"learning_rate": 9.838157053050423e-07,
"loss": 0.6859,
"reward": 0.8916666805744171,
"reward_std": 0.7892452508211136,
"rewards/accuracy_reward": 0.21666667312383653,
"rewards/format_reward": 0.6750000208616257,
"step": 325
},
{
"completion_length": 171.3375015258789,
"epoch": 0.176,
"grad_norm": 38.008914947509766,
"kl": 4.453125,
"learning_rate": 9.826198287724346e-07,
"loss": 0.8512,
"reward": 0.8041666924953461,
"reward_std": 0.8764552354812623,
"rewards/accuracy_reward": 0.26250000633299353,
"rewards/format_reward": 0.5416666954755783,
"step": 330
},
{
"completion_length": 164.07500610351562,
"epoch": 0.17866666666666667,
"grad_norm": 128.8215789794922,
"kl": 5.5421875,
"learning_rate": 9.813821103190931e-07,
"loss": 0.9175,
"reward": 0.8166666984558105,
"reward_std": 0.8608273565769196,
"rewards/accuracy_reward": 0.25833333991467955,
"rewards/format_reward": 0.5583333522081375,
"step": 335
},
{
"completion_length": 148.26666946411132,
"epoch": 0.18133333333333335,
"grad_norm": 46.26949691772461,
"kl": 3.05859375,
"learning_rate": 9.80102657252083e-07,
"loss": 0.6734,
"reward": 0.8250000238418579,
"reward_std": 0.7950194001197814,
"rewards/accuracy_reward": 0.21666667349636554,
"rewards/format_reward": 0.6083333492279053,
"step": 340
},
{
"completion_length": 157.37917098999023,
"epoch": 0.184,
"grad_norm": 20.64828109741211,
"kl": 4.228125,
"learning_rate": 9.787815804967551e-07,
"loss": 0.9426,
"reward": 0.9125000298023224,
"reward_std": 0.7014284431934357,
"rewards/accuracy_reward": 0.22083334252238274,
"rewards/format_reward": 0.6916666805744172,
"step": 345
},
{
"completion_length": 169.56667251586913,
"epoch": 0.18666666666666668,
"grad_norm": 57.10884094238281,
"kl": 2.883203125,
"learning_rate": 9.774189945871288e-07,
"loss": 0.7312,
"reward": 0.7875000238418579,
"reward_std": 0.6836557567119599,
"rewards/accuracy_reward": 0.16250000484287738,
"rewards/format_reward": 0.6250000178813935,
"step": 350
},
{
"completion_length": 162.88750457763672,
"epoch": 0.18933333333333333,
"grad_norm": 33.1915397644043,
"kl": 3.08203125,
"learning_rate": 9.760150176559624e-07,
"loss": 0.6298,
"reward": 0.9041666865348816,
"reward_std": 0.7536427795886993,
"rewards/accuracy_reward": 0.24583334252238273,
"rewards/format_reward": 0.658333358168602,
"step": 355
},
{
"completion_length": 127.81666870117188,
"epoch": 0.192,
"grad_norm": 28.192670822143555,
"kl": 7.875,
"learning_rate": 9.745697714245118e-07,
"loss": 1.1418,
"reward": 1.066666692495346,
"reward_std": 0.6796198636293411,
"rewards/accuracy_reward": 0.3333333428949118,
"rewards/format_reward": 0.7333333551883697,
"step": 360
},
{
"completion_length": 126.42917175292969,
"epoch": 0.19466666666666665,
"grad_norm": 19.537328720092773,
"kl": 1.71875,
"learning_rate": 9.730833811919762e-07,
"loss": 0.5002,
"reward": 0.9875000238418579,
"reward_std": 0.6549044132232666,
"rewards/accuracy_reward": 0.2541666716337204,
"rewards/format_reward": 0.7333333551883697,
"step": 365
},
{
"completion_length": 129.67500534057618,
"epoch": 0.19733333333333333,
"grad_norm": 35.49064636230469,
"kl": 3.1390625,
"learning_rate": 9.715559758246361e-07,
"loss": 0.5476,
"reward": 1.041666704416275,
"reward_std": 0.6668142318725586,
"rewards/accuracy_reward": 0.2916666753590107,
"rewards/format_reward": 0.7500000178813935,
"step": 370
},
{
"completion_length": 124.04166870117187,
"epoch": 0.2,
"grad_norm": 16.886371612548828,
"kl": 4.10625,
"learning_rate": 9.699876877446812e-07,
"loss": 0.6237,
"reward": 0.9458333551883698,
"reward_std": 0.6157839864492416,
"rewards/accuracy_reward": 0.1958333395421505,
"rewards/format_reward": 0.7500000238418579,
"step": 375
},
{
"completion_length": 128.82083740234376,
"epoch": 0.20266666666666666,
"grad_norm": 60.52346420288086,
"kl": 1.719140625,
"learning_rate": 9.683786529187285e-07,
"loss": 0.4091,
"reward": 0.9833333730697632,
"reward_std": 0.5964143082499505,
"rewards/accuracy_reward": 0.25833334401249886,
"rewards/format_reward": 0.7250000178813935,
"step": 380
},
{
"completion_length": 130.2500030517578,
"epoch": 0.20533333333333334,
"grad_norm": 8.501641273498535,
"kl": 2.725,
"learning_rate": 9.667290108460353e-07,
"loss": 0.4553,
"reward": 0.9625000298023224,
"reward_std": 0.6627969831228256,
"rewards/accuracy_reward": 0.25416667088866235,
"rewards/format_reward": 0.7083333551883697,
"step": 385
},
{
"completion_length": 151.49167251586914,
"epoch": 0.208,
"grad_norm": 9.237239837646484,
"kl": 3.03125,
"learning_rate": 9.650389045464044e-07,
"loss": 0.5862,
"reward": 1.0083333671092987,
"reward_std": 0.6827763438224792,
"rewards/accuracy_reward": 0.28333334177732467,
"rewards/format_reward": 0.725000011920929,
"step": 390
},
{
"completion_length": 157.08750381469727,
"epoch": 0.21066666666666667,
"grad_norm": 21.65406036376953,
"kl": 2.93125,
"learning_rate": 9.633084805477855e-07,
"loss": 0.7111,
"reward": 1.0791666984558106,
"reward_std": 0.6408874064683914,
"rewards/accuracy_reward": 0.3125000070780516,
"rewards/format_reward": 0.766666692495346,
"step": 395
},
{
"completion_length": 177.63750381469725,
"epoch": 0.21333333333333335,
"grad_norm": 18.198787689208984,
"kl": 3.2578125,
"learning_rate": 9.615378888735705e-07,
"loss": 0.6602,
"reward": 0.9416666984558105,
"reward_std": 0.7165269427001476,
"rewards/accuracy_reward": 0.2583333391696215,
"rewards/format_reward": 0.6833333492279052,
"step": 400
},
{
"epoch": 0.21333333333333335,
"eval_completion_length": 176.4338934326172,
"eval_kl": 3.930625,
"eval_loss": 0.726865291595459,
"eval_reward": 0.8177778057257334,
"eval_reward_std": 0.6464416084686915,
"eval_rewards/accuracy_reward": 0.125555559694767,
"eval_rewards/format_reward": 0.6922222431500753,
"eval_runtime": 651.5271,
"eval_samples_per_second": 0.46,
"eval_steps_per_second": 0.02,
"step": 400
},
{
"completion_length": 156.30417175292968,
"epoch": 0.216,
"grad_norm": 9.428586959838867,
"kl": 3.2203125,
"learning_rate": 9.597272830295876e-07,
"loss": 0.5783,
"reward": 0.9208333551883697,
"reward_std": 0.6943224638700485,
"rewards/accuracy_reward": 0.22083334028720855,
"rewards/format_reward": 0.7000000238418579,
"step": 405
},
{
"completion_length": 187.24167251586914,
"epoch": 0.21866666666666668,
"grad_norm": 8.181583404541016,
"kl": 2.2078125,
"learning_rate": 9.578768199907919e-07,
"loss": 0.5979,
"reward": 0.8416666924953461,
"reward_std": 0.683533999323845,
"rewards/accuracy_reward": 0.15833333805203437,
"rewards/format_reward": 0.6833333522081375,
"step": 410
},
{
"completion_length": 196.9791732788086,
"epoch": 0.22133333333333333,
"grad_norm": 241.11936950683594,
"kl": 5.8875,
"learning_rate": 9.55986660187658e-07,
"loss": 0.9332,
"reward": 0.8916666984558106,
"reward_std": 0.7562600076198578,
"rewards/accuracy_reward": 0.2916666727513075,
"rewards/format_reward": 0.6000000089406967,
"step": 415
},
{
"completion_length": 200.53750610351562,
"epoch": 0.224,
"grad_norm": 24.950624465942383,
"kl": 2.9796875,
"learning_rate": 9.540569674922684e-07,
"loss": 0.6774,
"reward": 0.9208333611488342,
"reward_std": 0.7625810235738755,
"rewards/accuracy_reward": 0.2625000074505806,
"rewards/format_reward": 0.6583333522081375,
"step": 420
},
{
"completion_length": 198.76250686645508,
"epoch": 0.22666666666666666,
"grad_norm": 13.279343605041504,
"kl": 2.8796875,
"learning_rate": 9.520879092041083e-07,
"loss": 0.7823,
"reward": 0.8708333611488343,
"reward_std": 0.6853504031896591,
"rewards/accuracy_reward": 0.18750000335276126,
"rewards/format_reward": 0.6833333611488343,
"step": 425
},
{
"completion_length": 132.6208381652832,
"epoch": 0.22933333333333333,
"grad_norm": 19.399991989135742,
"kl": 3.008203125,
"learning_rate": 9.500796560355602e-07,
"loss": 0.4804,
"reward": 0.9458333551883698,
"reward_std": 0.5760359674692154,
"rewards/accuracy_reward": 0.1791666705161333,
"rewards/format_reward": 0.7666666686534882,
"step": 430
},
{
"completion_length": 156.29583816528321,
"epoch": 0.232,
"grad_norm": 13.930024147033691,
"kl": 2.163671875,
"learning_rate": 9.480323820971037e-07,
"loss": 0.6149,
"reward": 0.9000000298023224,
"reward_std": 0.6693511486053467,
"rewards/accuracy_reward": 0.16666667014360428,
"rewards/format_reward": 0.7333333551883697,
"step": 435
},
{
"completion_length": 170.9166732788086,
"epoch": 0.23466666666666666,
"grad_norm": 28.107120513916016,
"kl": 3.184375,
"learning_rate": 9.459462648822207e-07,
"loss": 0.6151,
"reward": 0.8916667103767395,
"reward_std": 0.6882745712995529,
"rewards/accuracy_reward": 0.21666667684912683,
"rewards/format_reward": 0.675000011920929,
"step": 440
},
{
"completion_length": 162.44583892822266,
"epoch": 0.23733333333333334,
"grad_norm": 20.64507293701172,
"kl": 4.196875,
"learning_rate": 9.438214852520072e-07,
"loss": 0.7043,
"reward": 0.829166692495346,
"reward_std": 0.6397666782140732,
"rewards/accuracy_reward": 0.1458333373069763,
"rewards/format_reward": 0.6833333492279052,
"step": 445
},
{
"completion_length": 133.62917098999023,
"epoch": 0.24,
"grad_norm": 5.781806945800781,
"kl": 2.41796875,
"learning_rate": 9.416582274194929e-07,
"loss": 0.5327,
"reward": 1.0500000238418579,
"reward_std": 0.6202212646603584,
"rewards/accuracy_reward": 0.2666666727513075,
"rewards/format_reward": 0.7833333492279053,
"step": 450
},
{
"completion_length": 216.83334045410157,
"epoch": 0.24266666666666667,
"grad_norm": 24.161720275878906,
"kl": 1.6671875,
"learning_rate": 9.394566789336707e-07,
"loss": 0.5952,
"reward": 0.8916666984558106,
"reward_std": 0.6439453423023224,
"rewards/accuracy_reward": 0.19166667051613331,
"rewards/format_reward": 0.7000000178813934,
"step": 455
},
{
"completion_length": 133.31667098999023,
"epoch": 0.24533333333333332,
"grad_norm": 14.312320709228516,
"kl": 2.26484375,
"learning_rate": 9.372170306632358e-07,
"loss": 0.3488,
"reward": 1.0583333730697633,
"reward_std": 0.4993865922093391,
"rewards/accuracy_reward": 0.2500000037252903,
"rewards/format_reward": 0.8083333551883698,
"step": 460
},
{
"completion_length": 115.00000228881837,
"epoch": 0.248,
"grad_norm": 24.176897048950195,
"kl": 2.22109375,
"learning_rate": 9.349394767800396e-07,
"loss": 0.3995,
"reward": 1.0458333611488342,
"reward_std": 0.5499838680028916,
"rewards/accuracy_reward": 0.25416667461395265,
"rewards/format_reward": 0.791666692495346,
"step": 465
},
{
"completion_length": 170.7375030517578,
"epoch": 0.25066666666666665,
"grad_norm": 15.032400131225586,
"kl": 3.209375,
"learning_rate": 9.326242147422536e-07,
"loss": 0.6388,
"reward": 0.9458333611488342,
"reward_std": 0.6427857339382171,
"rewards/accuracy_reward": 0.22916667051613332,
"rewards/format_reward": 0.7166666865348816,
"step": 470
},
{
"completion_length": 178.14167098999025,
"epoch": 0.25333333333333335,
"grad_norm": 12.969917297363281,
"kl": 2.8015625,
"learning_rate": 9.302714452772514e-07,
"loss": 0.5282,
"reward": 0.8666666924953461,
"reward_std": 0.6148743867874146,
"rewards/accuracy_reward": 0.20833334065973758,
"rewards/format_reward": 0.6583333492279053,
"step": 475
},
{
"completion_length": 160.37916946411133,
"epoch": 0.256,
"grad_norm": 25.496326446533203,
"kl": 2.88515625,
"learning_rate": 9.278813723642059e-07,
"loss": 0.6265,
"reward": 0.7583333671092987,
"reward_std": 0.6592622727155686,
"rewards/accuracy_reward": 0.09166666939854622,
"rewards/format_reward": 0.666666692495346,
"step": 480
},
{
"completion_length": 156.80833740234374,
"epoch": 0.25866666666666666,
"grad_norm": 19.942245483398438,
"kl": 2.80546875,
"learning_rate": 9.254542032164046e-07,
"loss": 0.5487,
"reward": 1.0916667103767395,
"reward_std": 0.5701596170663834,
"rewards/accuracy_reward": 0.2916666727513075,
"rewards/format_reward": 0.8000000238418579,
"step": 485
},
{
"completion_length": 190.86667556762694,
"epoch": 0.2613333333333333,
"grad_norm": 29.593677520751953,
"kl": 5.2046875,
"learning_rate": 9.229901482632849e-07,
"loss": 0.8562,
"reward": 0.8750000268220901,
"reward_std": 0.7407498300075531,
"rewards/accuracy_reward": 0.25833334438502786,
"rewards/format_reward": 0.6166666775941849,
"step": 490
},
{
"completion_length": 185.21250381469727,
"epoch": 0.264,
"grad_norm": 15.83928108215332,
"kl": 3.34765625,
"learning_rate": 9.204894211321905e-07,
"loss": 0.7039,
"reward": 0.9708333551883698,
"reward_std": 0.7463637501001358,
"rewards/accuracy_reward": 0.30416667759418486,
"rewards/format_reward": 0.6666666865348816,
"step": 495
},
{
"completion_length": 177.40833740234376,
"epoch": 0.26666666666666666,
"grad_norm": 12.166021347045898,
"kl": 4.6421875,
"learning_rate": 9.179522386298506e-07,
"loss": 0.8557,
"reward": 0.8416666924953461,
"reward_std": 0.753840970993042,
"rewards/accuracy_reward": 0.22500000558793545,
"rewards/format_reward": 0.6166666924953461,
"step": 500
},
{
"completion_length": 201.27083892822264,
"epoch": 0.2693333333333333,
"grad_norm": 23.116260528564453,
"kl": 2.7734375,
"learning_rate": 9.153788207235826e-07,
"loss": 0.7171,
"reward": 1.0333333730697631,
"reward_std": 0.7333385825157166,
"rewards/accuracy_reward": 0.35000001192092894,
"rewards/format_reward": 0.6833333492279052,
"step": 505
},
{
"completion_length": 168.51250457763672,
"epoch": 0.272,
"grad_norm": 78.56771850585938,
"kl": 3.934375,
"learning_rate": 9.127693905222223e-07,
"loss": 0.7631,
"reward": 0.9583333551883697,
"reward_std": 0.7650938987731933,
"rewards/accuracy_reward": 0.31666667461395265,
"rewards/format_reward": 0.6416666865348816,
"step": 510
},
{
"completion_length": 179.67500534057618,
"epoch": 0.27466666666666667,
"grad_norm": 12.310942649841309,
"kl": 2.8015625,
"learning_rate": 9.1012417425678e-07,
"loss": 0.7708,
"reward": 1.050000047683716,
"reward_std": 0.6466626852750779,
"rewards/accuracy_reward": 0.3250000040978193,
"rewards/format_reward": 0.7250000238418579,
"step": 515
},
{
"completion_length": 137.3458381652832,
"epoch": 0.2773333333333333,
"grad_norm": 23.13488006591797,
"kl": 2.5033203125,
"learning_rate": 9.074434012608281e-07,
"loss": 0.5319,
"reward": 1.0208333611488343,
"reward_std": 0.5777831941843032,
"rewards/accuracy_reward": 0.2541666753590107,
"rewards/format_reward": 0.766666692495346,
"step": 520
},
{
"completion_length": 139.60833663940429,
"epoch": 0.28,
"grad_norm": 18.197731018066406,
"kl": 2.709375,
"learning_rate": 9.047273039506174e-07,
"loss": 0.6145,
"reward": 1.0708333730697632,
"reward_std": 0.47921385020017626,
"rewards/accuracy_reward": 0.24583334103226662,
"rewards/format_reward": 0.8250000238418579,
"step": 525
},
{
"completion_length": 125.83333816528321,
"epoch": 0.2826666666666667,
"grad_norm": 62.50031661987305,
"kl": 1.6216796875,
"learning_rate": 9.019761178049279e-07,
"loss": 0.4705,
"reward": 1.1375000178813934,
"reward_std": 0.4138069227337837,
"rewards/accuracy_reward": 0.2958333373069763,
"rewards/format_reward": 0.8416666805744171,
"step": 530
},
{
"completion_length": 179.41667251586915,
"epoch": 0.2853333333333333,
"grad_norm": 18.075428009033203,
"kl": 2.503125,
"learning_rate": 8.991900813446522e-07,
"loss": 0.6926,
"reward": 0.9208333671092988,
"reward_std": 0.7170521825551986,
"rewards/accuracy_reward": 0.26250000596046447,
"rewards/format_reward": 0.6583333551883698,
"step": 535
},
{
"completion_length": 269.2916717529297,
"epoch": 0.288,
"grad_norm": 10.559329986572266,
"kl": 3.5578125,
"learning_rate": 8.963694361121185e-07,
"loss": 0.7955,
"reward": 0.7375000238418579,
"reward_std": 0.7803374290466308,
"rewards/accuracy_reward": 0.21250000707805156,
"rewards/format_reward": 0.5250000119209289,
"step": 540
},
{
"completion_length": 171.4625045776367,
"epoch": 0.2906666666666667,
"grad_norm": 85.9892578125,
"kl": 3.36953125,
"learning_rate": 8.935144266501468e-07,
"loss": 0.7548,
"reward": 0.9333333671092987,
"reward_std": 0.6961856186389923,
"rewards/accuracy_reward": 0.2416666690260172,
"rewards/format_reward": 0.6916666805744172,
"step": 545
},
{
"completion_length": 116.50833663940429,
"epoch": 0.29333333333333333,
"grad_norm": 61.62466812133789,
"kl": 3.23515625,
"learning_rate": 8.906253004808504e-07,
"loss": 0.6184,
"reward": 1.1833333849906922,
"reward_std": 0.5228020772337914,
"rewards/accuracy_reward": 0.35833334028720853,
"rewards/format_reward": 0.8250000238418579,
"step": 550
},
{
"completion_length": 130.47500381469726,
"epoch": 0.296,
"grad_norm": 13.287981986999512,
"kl": 1.23828125,
"learning_rate": 8.877023080841737e-07,
"loss": 0.4985,
"reward": 1.1375000178813934,
"reward_std": 0.4386047780513763,
"rewards/accuracy_reward": 0.29583334028720853,
"rewards/format_reward": 0.8416666924953461,
"step": 555
},
{
"completion_length": 171.2916702270508,
"epoch": 0.2986666666666667,
"grad_norm": 11.248329162597656,
"kl": 2.68125,
"learning_rate": 8.847457028761782e-07,
"loss": 0.7836,
"reward": 1.045833373069763,
"reward_std": 0.6420545637607574,
"rewards/accuracy_reward": 0.28750000447034835,
"rewards/format_reward": 0.7583333492279053,
"step": 560
},
{
"completion_length": 153.5500045776367,
"epoch": 0.30133333333333334,
"grad_norm": 11.977041244506836,
"kl": 2.99296875,
"learning_rate": 8.817557411870715e-07,
"loss": 0.6474,
"reward": 1.1375000417232513,
"reward_std": 0.546431428194046,
"rewards/accuracy_reward": 0.3458333432674408,
"rewards/format_reward": 0.7916666805744171,
"step": 565
},
{
"completion_length": 157.00833740234376,
"epoch": 0.304,
"grad_norm": 8.921804428100586,
"kl": 2.18046875,
"learning_rate": 8.787326822389835e-07,
"loss": 0.7647,
"reward": 1.0416667103767394,
"reward_std": 0.4992914006114006,
"rewards/accuracy_reward": 0.2333333373069763,
"rewards/format_reward": 0.8083333551883698,
"step": 570
},
{
"completion_length": 203.42916946411134,
"epoch": 0.30666666666666664,
"grad_norm": 8.668245315551758,
"kl": 1.5703125,
"learning_rate": 8.756767881234928e-07,
"loss": 0.7449,
"reward": 1.0916666865348816,
"reward_std": 0.6213793724775314,
"rewards/accuracy_reward": 0.3166666731238365,
"rewards/format_reward": 0.7750000119209289,
"step": 575
},
{
"completion_length": 147.91667098999022,
"epoch": 0.30933333333333335,
"grad_norm": 27.652040481567383,
"kl": 2.9046875,
"learning_rate": 8.725883237789044e-07,
"loss": 0.7151,
"reward": 1.1083333551883698,
"reward_std": 0.5087577894330024,
"rewards/accuracy_reward": 0.27500000670552255,
"rewards/format_reward": 0.8333333611488343,
"step": 580
},
{
"completion_length": 195.73750381469728,
"epoch": 0.312,
"grad_norm": 5.533527851104736,
"kl": 2.3046875,
"learning_rate": 8.694675569672799e-07,
"loss": 0.7577,
"reward": 0.9291666924953461,
"reward_std": 0.5898149274289608,
"rewards/accuracy_reward": 0.17916667014360427,
"rewards/format_reward": 0.7500000178813935,
"step": 585
},
{
"completion_length": 218.38334121704102,
"epoch": 0.31466666666666665,
"grad_norm": 7.887436866760254,
"kl": 1.71484375,
"learning_rate": 8.663147582512231e-07,
"loss": 0.6805,
"reward": 1.0375000298023225,
"reward_std": 0.6552489116787911,
"rewards/accuracy_reward": 0.3125000029802322,
"rewards/format_reward": 0.7250000178813935,
"step": 590
},
{
"completion_length": 142.9625030517578,
"epoch": 0.31733333333333336,
"grad_norm": 39.04753112792969,
"kl": 3.50078125,
"learning_rate": 8.631302009704233e-07,
"loss": 0.7378,
"reward": 1.0500000357627868,
"reward_std": 0.5874016582965851,
"rewards/accuracy_reward": 0.2416666716337204,
"rewards/format_reward": 0.8083333551883698,
"step": 595
},
{
"completion_length": 145.0583366394043,
"epoch": 0.32,
"grad_norm": 10.890267372131348,
"kl": 1.9578125,
"learning_rate": 8.59914161217957e-07,
"loss": 0.4446,
"reward": 1.2125000357627869,
"reward_std": 0.33466504961252214,
"rewards/accuracy_reward": 0.3208333432674408,
"rewards/format_reward": 0.8916666865348816,
"step": 600
},
{
"epoch": 0.32,
"eval_completion_length": 157.21389434814452,
"eval_kl": 1.6558333333333333,
"eval_loss": 0.4634725749492645,
"eval_reward": 0.9744444727897644,
"eval_reward_std": 0.43147118786970773,
"eval_rewards/accuracy_reward": 0.13000000352660815,
"eval_rewards/format_reward": 0.8444444608688354,
"eval_runtime": 533.9055,
"eval_samples_per_second": 0.562,
"eval_steps_per_second": 0.024,
"step": 600
},
{
"completion_length": 193.183341217041,
"epoch": 0.32266666666666666,
"grad_norm": 13.141694068908691,
"kl": 1.72421875,
"learning_rate": 8.566669178163512e-07,
"loss": 0.5602,
"reward": 0.9708333551883698,
"reward_std": 0.5239060014486313,
"rewards/accuracy_reward": 0.2041666716337204,
"rewards/format_reward": 0.766666692495346,
"step": 605
},
{
"completion_length": 154.98333663940429,
"epoch": 0.3253333333333333,
"grad_norm": 9.7343111038208,
"kl": 1.919921875,
"learning_rate": 8.533887522934114e-07,
"loss": 0.4813,
"reward": 1.1041667103767394,
"reward_std": 0.5670485764741897,
"rewards/accuracy_reward": 0.28750000819563865,
"rewards/format_reward": 0.8166666924953461,
"step": 610
},
{
"completion_length": 109.46667022705078,
"epoch": 0.328,
"grad_norm": 6.92060661315918,
"kl": 1.7703125,
"learning_rate": 8.500799488578119e-07,
"loss": 0.2986,
"reward": 1.1000000476837157,
"reward_std": 0.2986703909933567,
"rewards/accuracy_reward": 0.1916666727513075,
"rewards/format_reward": 0.9083333492279053,
"step": 615
},
{
"completion_length": 181.3708366394043,
"epoch": 0.33066666666666666,
"grad_norm": 7.871663570404053,
"kl": 1.2625,
"learning_rate": 8.467407943744573e-07,
"loss": 0.6639,
"reward": 1.1875000476837159,
"reward_std": 0.5026830688118935,
"rewards/accuracy_reward": 0.37916667610406873,
"rewards/format_reward": 0.8083333551883698,
"step": 620
},
{
"completion_length": 130.59166946411133,
"epoch": 0.3333333333333333,
"grad_norm": 9.6685791015625,
"kl": 1.58359375,
"learning_rate": 8.433715783396114e-07,
"loss": 0.5216,
"reward": 1.1583333551883697,
"reward_std": 0.4340529665350914,
"rewards/accuracy_reward": 0.2750000134110451,
"rewards/format_reward": 0.8833333492279053,
"step": 625
},
{
"completion_length": 180.36667022705078,
"epoch": 0.336,
"grad_norm": 39.6641960144043,
"kl": 3.36953125,
"learning_rate": 8.399725928557985e-07,
"loss": 0.7533,
"reward": 1.025000023841858,
"reward_std": 0.46841561794281006,
"rewards/accuracy_reward": 0.20833333991467953,
"rewards/format_reward": 0.8166666865348816,
"step": 630
},
{
"completion_length": 135.39167022705078,
"epoch": 0.33866666666666667,
"grad_norm": 11.369102478027344,
"kl": 2.64765625,
"learning_rate": 8.365441326064788e-07,
"loss": 0.5253,
"reward": 1.1875000476837159,
"reward_std": 0.40690153986215594,
"rewards/accuracy_reward": 0.3375000089406967,
"rewards/format_reward": 0.8500000238418579,
"step": 635
},
{
"completion_length": 210.98333816528321,
"epoch": 0.3413333333333333,
"grad_norm": 12.549598693847656,
"kl": 1.3271484375,
"learning_rate": 8.330864948305007e-07,
"loss": 0.7683,
"reward": 1.0166667103767395,
"reward_std": 0.4776305049657822,
"rewards/accuracy_reward": 0.1833333384245634,
"rewards/format_reward": 0.8333333551883697,
"step": 640
},
{
"completion_length": 151.5333381652832,
"epoch": 0.344,
"grad_norm": 6.856210231781006,
"kl": 1.084765625,
"learning_rate": 8.295999792963299e-07,
"loss": 0.4446,
"reward": 1.200000035762787,
"reward_std": 0.43040212616324425,
"rewards/accuracy_reward": 0.3250000089406967,
"rewards/format_reward": 0.8750000238418579,
"step": 645
},
{
"completion_length": 269.2375091552734,
"epoch": 0.3466666666666667,
"grad_norm": 23.80208396911621,
"kl": 2.725,
"learning_rate": 8.260848882760615e-07,
"loss": 0.865,
"reward": 0.829166692495346,
"reward_std": 0.659246638417244,
"rewards/accuracy_reward": 0.1875000014901161,
"rewards/format_reward": 0.6416666865348816,
"step": 650
},
{
"completion_length": 211.78334045410156,
"epoch": 0.34933333333333333,
"grad_norm": 6.310311317443848,
"kl": 1.966015625,
"learning_rate": 8.225415265192126e-07,
"loss": 0.769,
"reward": 0.954166692495346,
"reward_std": 0.5517986357212067,
"rewards/accuracy_reward": 0.19583333991467952,
"rewards/format_reward": 0.7583333551883698,
"step": 655
},
{
"completion_length": 183.13750457763672,
"epoch": 0.352,
"grad_norm": 5.835783958435059,
"kl": 1.602734375,
"learning_rate": 8.18970201226302e-07,
"loss": 0.6146,
"reward": 1.0166667103767395,
"reward_std": 0.44397214204072954,
"rewards/accuracy_reward": 0.2166666716337204,
"rewards/format_reward": 0.8000000178813934,
"step": 660
},
{
"completion_length": 199.48750686645508,
"epoch": 0.3546666666666667,
"grad_norm": 8.435812950134277,
"kl": 1.846875,
"learning_rate": 8.153712220222163e-07,
"loss": 0.7525,
"reward": 0.9833333671092988,
"reward_std": 0.544944578409195,
"rewards/accuracy_reward": 0.21666666828095912,
"rewards/format_reward": 0.7666666984558106,
"step": 665
},
{
"completion_length": 134.15000381469727,
"epoch": 0.35733333333333334,
"grad_norm": 13.892914772033691,
"kl": 1.921484375,
"learning_rate": 8.117449009293668e-07,
"loss": 0.5067,
"reward": 1.2250000417232514,
"reward_std": 0.3981220737099648,
"rewards/accuracy_reward": 0.35000001154839994,
"rewards/format_reward": 0.8750000178813935,
"step": 670
},
{
"completion_length": 199.30000534057618,
"epoch": 0.36,
"grad_norm": 53.40914535522461,
"kl": 2.228125,
"learning_rate": 8.080915523406369e-07,
"loss": 0.8388,
"reward": 1.1083333611488342,
"reward_std": 0.5545098386704922,
"rewards/accuracy_reward": 0.3250000089406967,
"rewards/format_reward": 0.7833333432674408,
"step": 675
},
{
"completion_length": 197.45833740234374,
"epoch": 0.3626666666666667,
"grad_norm": 46.9063720703125,
"kl": 1.98671875,
"learning_rate": 8.044114929921263e-07,
"loss": 0.8575,
"reward": 0.9833333492279053,
"reward_std": 0.5669769406318664,
"rewards/accuracy_reward": 0.2000000011175871,
"rewards/format_reward": 0.7833333492279053,
"step": 680
},
{
"completion_length": 245.34584197998046,
"epoch": 0.36533333333333334,
"grad_norm": 20.520357131958008,
"kl": 2.934375,
"learning_rate": 8.007050419356898e-07,
"loss": 0.8979,
"reward": 0.8958333611488343,
"reward_std": 0.6523119986057282,
"rewards/accuracy_reward": 0.20416667275130748,
"rewards/format_reward": 0.6916666805744172,
"step": 685
},
{
"completion_length": 312.5666793823242,
"epoch": 0.368,
"grad_norm": 13.216290473937988,
"kl": 3.625,
"learning_rate": 7.969725205112765e-07,
"loss": 0.9082,
"reward": 0.8625000193715096,
"reward_std": 0.748971363902092,
"rewards/accuracy_reward": 0.3125000085681677,
"rewards/format_reward": 0.550000025331974,
"step": 690
},
{
"completion_length": 352.8666717529297,
"epoch": 0.37066666666666664,
"grad_norm": 30.815317153930664,
"kl": 2.85390625,
"learning_rate": 7.93214252319071e-07,
"loss": 0.8438,
"reward": 0.7708333522081375,
"reward_std": 0.8650185167789459,
"rewards/accuracy_reward": 0.3125000070780516,
"rewards/format_reward": 0.45833334177732465,
"step": 695
},
{
"completion_length": 232.11667175292968,
"epoch": 0.37333333333333335,
"grad_norm": 18.286134719848633,
"kl": 2.925,
"learning_rate": 7.894305631914373e-07,
"loss": 0.9608,
"reward": 0.9333333596587181,
"reward_std": 0.7935751020908356,
"rewards/accuracy_reward": 0.31666667610406873,
"rewards/format_reward": 0.6166666969656944,
"step": 700
},
{
"completion_length": 181.43750457763673,
"epoch": 0.376,
"grad_norm": 20.183544158935547,
"kl": 3.06015625,
"learning_rate": 7.856217811646706e-07,
"loss": 0.7195,
"reward": 1.0458333671092988,
"reward_std": 0.5713184028863907,
"rewards/accuracy_reward": 0.2791666716337204,
"rewards/format_reward": 0.7666666984558106,
"step": 705
},
{
"completion_length": 223.5291717529297,
"epoch": 0.37866666666666665,
"grad_norm": 11.171494483947754,
"kl": 2.15234375,
"learning_rate": 7.817882364505568e-07,
"loss": 0.6855,
"reward": 0.9000000357627869,
"reward_std": 0.5742868632078171,
"rewards/accuracy_reward": 0.1750000026077032,
"rewards/format_reward": 0.7250000149011612,
"step": 710
},
{
"completion_length": 266.17500457763674,
"epoch": 0.38133333333333336,
"grad_norm": 10.772181510925293,
"kl": 2.66953125,
"learning_rate": 7.779302614077448e-07,
"loss": 0.7085,
"reward": 0.900000023841858,
"reward_std": 0.6552800923585892,
"rewards/accuracy_reward": 0.24166667312383652,
"rewards/format_reward": 0.6583333551883698,
"step": 715
},
{
"completion_length": 253.56667785644532,
"epoch": 0.384,
"grad_norm": 15.041406631469727,
"kl": 2.715625,
"learning_rate": 7.740481905129306e-07,
"loss": 0.8722,
"reward": 0.9958333611488343,
"reward_std": 0.6977577596902848,
"rewards/accuracy_reward": 0.3125000037252903,
"rewards/format_reward": 0.6833333551883698,
"step": 720
},
{
"completion_length": 161.67084045410155,
"epoch": 0.38666666666666666,
"grad_norm": 4.084959030151367,
"kl": 2.09296875,
"learning_rate": 7.701423603318604e-07,
"loss": 0.5005,
"reward": 1.1458333611488343,
"reward_std": 0.40467526763677597,
"rewards/accuracy_reward": 0.3375000059604645,
"rewards/format_reward": 0.8083333551883698,
"step": 725
},
{
"completion_length": 259.2875091552734,
"epoch": 0.3893333333333333,
"grad_norm": 154.38690185546875,
"kl": 1.82265625,
"learning_rate": 7.662131094901498e-07,
"loss": 0.7136,
"reward": 0.8708333611488343,
"reward_std": 0.6159812211990356,
"rewards/accuracy_reward": 0.19583334289491178,
"rewards/format_reward": 0.675000011920929,
"step": 730
},
{
"completion_length": 261.3000061035156,
"epoch": 0.392,
"grad_norm": 2285.590576171875,
"kl": 98.85625,
"learning_rate": 7.622607786439278e-07,
"loss": 18.7274,
"reward": 0.9541666984558106,
"reward_std": 0.6454987242817879,
"rewards/accuracy_reward": 0.2541666701436043,
"rewards/format_reward": 0.7000000238418579,
"step": 735
},
{
"completion_length": 293.8916717529297,
"epoch": 0.39466666666666667,
"grad_norm": 103.3537826538086,
"kl": 2.91640625,
"learning_rate": 7.582857104503e-07,
"loss": 0.7409,
"reward": 0.8625000178813934,
"reward_std": 0.6774646982550621,
"rewards/accuracy_reward": 0.2458333380520344,
"rewards/format_reward": 0.6166666880249977,
"step": 740
},
{
"completion_length": 363.01250915527345,
"epoch": 0.3973333333333333,
"grad_norm": 54.62760925292969,
"kl": 5.337890625,
"learning_rate": 7.542882495376435e-07,
"loss": 1.3766,
"reward": 0.7250000238418579,
"reward_std": 0.7046854376792908,
"rewards/accuracy_reward": 0.19166667126119136,
"rewards/format_reward": 0.533333346247673,
"step": 745
},
{
"completion_length": 305.82500610351565,
"epoch": 0.4,
"grad_norm": 95.76229095458984,
"kl": 2.378125,
"learning_rate": 7.502687424757277e-07,
"loss": 0.9832,
"reward": 0.9791667044162751,
"reward_std": 0.776354917883873,
"rewards/accuracy_reward": 0.3291666775941849,
"rewards/format_reward": 0.6500000119209289,
"step": 750
},
{
"completion_length": 265.6125061035156,
"epoch": 0.4026666666666667,
"grad_norm": 80.57921600341797,
"kl": 10.53046875,
"learning_rate": 7.462275377456669e-07,
"loss": 2.1369,
"reward": 0.925000011920929,
"reward_std": 0.8046808481216431,
"rewards/accuracy_reward": 0.2833333395421505,
"rewards/format_reward": 0.6416666805744171,
"step": 755
},
{
"completion_length": 321.10834045410155,
"epoch": 0.4053333333333333,
"grad_norm": 34.1876220703125,
"kl": 4.4828125,
"learning_rate": 7.421649857097091e-07,
"loss": 0.9501,
"reward": 0.8291666865348816,
"reward_std": 0.6768444120883942,
"rewards/accuracy_reward": 0.2291666716337204,
"rewards/format_reward": 0.6000000149011612,
"step": 760
},
{
"completion_length": 319.262508392334,
"epoch": 0.408,
"grad_norm": 117.45841979980469,
"kl": 5.225,
"learning_rate": 7.380814385808594e-07,
"loss": 1.394,
"reward": 0.8166666895151138,
"reward_std": 0.7352788507938385,
"rewards/accuracy_reward": 0.21666667461395264,
"rewards/format_reward": 0.6000000163912773,
"step": 765
},
{
"completion_length": 367.7666809082031,
"epoch": 0.4106666666666667,
"grad_norm": 66.30304718017578,
"kl": 5.2375,
"learning_rate": 7.339772503923443e-07,
"loss": 1.1981,
"reward": 0.8458333551883698,
"reward_std": 0.7178668111562729,
"rewards/accuracy_reward": 0.2708333406597376,
"rewards/format_reward": 0.5750000178813934,
"step": 770
},
{
"completion_length": 316.7833450317383,
"epoch": 0.41333333333333333,
"grad_norm": 16.508329391479492,
"kl": 4.0140625,
"learning_rate": 7.298527769669187e-07,
"loss": 1.1483,
"reward": 0.9291666865348815,
"reward_std": 0.6445024594664573,
"rewards/accuracy_reward": 0.25416667461395265,
"rewards/format_reward": 0.6750000238418579,
"step": 775
},
{
"completion_length": 366.72084350585936,
"epoch": 0.416,
"grad_norm": 88.48445892333984,
"kl": 5.3171875,
"learning_rate": 7.257083758860157e-07,
"loss": 1.5957,
"reward": 0.7666666805744171,
"reward_std": 0.9076652824878693,
"rewards/accuracy_reward": 0.24166667126119137,
"rewards/format_reward": 0.5250000208616257,
"step": 780
},
{
"completion_length": 261.41251068115236,
"epoch": 0.4186666666666667,
"grad_norm": 58.172264099121094,
"kl": 5.23984375,
"learning_rate": 7.215444064587462e-07,
"loss": 1.4679,
"reward": 0.9916666924953461,
"reward_std": 0.6018173396587372,
"rewards/accuracy_reward": 0.24166667237877845,
"rewards/format_reward": 0.7500000178813935,
"step": 785
},
{
"completion_length": 316.15417709350584,
"epoch": 0.42133333333333334,
"grad_norm": 47.1014404296875,
"kl": 5.3765625,
"learning_rate": 7.173612296907472e-07,
"loss": 1.0298,
"reward": 0.8125000208616256,
"reward_std": 0.6595857471227646,
"rewards/accuracy_reward": 0.20416667312383652,
"rewards/format_reward": 0.6083333522081376,
"step": 790
},
{
"completion_length": 541.3083404541015,
"epoch": 0.424,
"grad_norm": 39.66009521484375,
"kl": 6.48125,
"learning_rate": 7.131592082528835e-07,
"loss": 1.4332,
"reward": 0.5750000149011611,
"reward_std": 0.9711216628551483,
"rewards/accuracy_reward": 0.25000000894069674,
"rewards/format_reward": 0.325000012665987,
"step": 795
},
{
"completion_length": 529.3041809082031,
"epoch": 0.4266666666666667,
"grad_norm": 47.98305892944336,
"kl": 7.490625,
"learning_rate": 7.089387064498055e-07,
"loss": 1.4781,
"reward": 0.6041666865348816,
"reward_std": 0.9722610518336297,
"rewards/accuracy_reward": 0.2291666753590107,
"rewards/format_reward": 0.37500000968575475,
"step": 800
},
{
"epoch": 0.4266666666666667,
"eval_completion_length": 626.6472381591797,
"eval_kl": 10.508958333333334,
"eval_loss": 1.6093299388885498,
"eval_reward": 0.24388889610767364,
"eval_reward_std": 0.9690509649117788,
"eval_rewards/accuracy_reward": 0.08500000188748041,
"eval_rewards/format_reward": 0.1588888943195343,
"eval_runtime": 1086.9778,
"eval_samples_per_second": 0.276,
"eval_steps_per_second": 0.012,
"step": 800
},
{
"completion_length": 575.5916809082031,
"epoch": 0.42933333333333334,
"grad_norm": 105.59420013427734,
"kl": 8.5296875,
"learning_rate": 7.047000901883645e-07,
"loss": 1.419,
"reward": 0.4041666805744171,
"reward_std": 0.9605139315128326,
"rewards/accuracy_reward": 0.21250000409781933,
"rewards/format_reward": 0.19166667610406876,
"step": 805
},
{
"completion_length": 561.8750213623047,
"epoch": 0.432,
"grad_norm": 40.40248489379883,
"kl": 7.340625,
"learning_rate": 7.004437269458894e-07,
"loss": 1.4182,
"reward": 0.45833334140479565,
"reward_std": 0.9228455007076264,
"rewards/accuracy_reward": 0.15000000558793544,
"rewards/format_reward": 0.3083333432674408,
"step": 810
},
{
"completion_length": 499.4708450317383,
"epoch": 0.43466666666666665,
"grad_norm": 15.018860816955566,
"kl": 7.36875,
"learning_rate": 6.961699857383278e-07,
"loss": 1.4916,
"reward": 0.6500000096857548,
"reward_std": 0.8621464431285858,
"rewards/accuracy_reward": 0.2166666742414236,
"rewards/format_reward": 0.43333334624767306,
"step": 815
},
{
"completion_length": 550.2083557128906,
"epoch": 0.43733333333333335,
"grad_norm": 161.48826599121094,
"kl": 5.7875,
"learning_rate": 6.91879237088253e-07,
"loss": 1.3703,
"reward": 0.6125000208616257,
"reward_std": 0.9770530998706818,
"rewards/accuracy_reward": 0.27083334475755694,
"rewards/format_reward": 0.34166667237877846,
"step": 820
},
{
"completion_length": 456.0291831970215,
"epoch": 0.44,
"grad_norm": 63.651100158691406,
"kl": 6.596875,
"learning_rate": 6.875718529927404e-07,
"loss": 1.5676,
"reward": 0.7250000149011612,
"reward_std": 0.8462802618741989,
"rewards/accuracy_reward": 0.24166667684912682,
"rewards/format_reward": 0.48333334624767305,
"step": 825
},
{
"completion_length": 405.82500610351565,
"epoch": 0.44266666666666665,
"grad_norm": 75.36550903320312,
"kl": 5.9515625,
"learning_rate": 6.832482068911166e-07,
"loss": 1.8022,
"reward": 0.8083333611488343,
"reward_std": 0.8616673052310944,
"rewards/accuracy_reward": 0.2500000074505806,
"rewards/format_reward": 0.5583333462476731,
"step": 830
},
{
"completion_length": 348.73751068115234,
"epoch": 0.44533333333333336,
"grad_norm": 66.93795013427734,
"kl": 5.48984375,
"learning_rate": 6.789086736325834e-07,
"loss": 1.1328,
"reward": 0.9583333611488343,
"reward_std": 0.6873706102371215,
"rewards/accuracy_reward": 0.325000012293458,
"rewards/format_reward": 0.6333333551883698,
"step": 835
},
{
"completion_length": 301.7250061035156,
"epoch": 0.448,
"grad_norm": 38.354427337646484,
"kl": 2.6962890625,
"learning_rate": 6.745536294437186e-07,
"loss": 0.9267,
"reward": 1.0375000298023225,
"reward_std": 0.5275938391685486,
"rewards/accuracy_reward": 0.32083333767950534,
"rewards/format_reward": 0.7166666865348816,
"step": 840
},
{
"completion_length": 353.62501220703126,
"epoch": 0.45066666666666666,
"grad_norm": 15.596612930297852,
"kl": 5.7234375,
"learning_rate": 6.701834518958586e-07,
"loss": 1.764,
"reward": 0.9833333730697632,
"reward_std": 0.7794818341732025,
"rewards/accuracy_reward": 0.35000001192092894,
"rewards/format_reward": 0.6333333492279053,
"step": 845
},
{
"completion_length": 303.0666778564453,
"epoch": 0.4533333333333333,
"grad_norm": 38.20693588256836,
"kl": 4.66875,
"learning_rate": 6.657985198723643e-07,
"loss": 1.1933,
"reward": 0.9541666895151139,
"reward_std": 0.6846331983804703,
"rewards/accuracy_reward": 0.3041666798293591,
"rewards/format_reward": 0.6500000149011612,
"step": 850
},
{
"completion_length": 320.7541793823242,
"epoch": 0.456,
"grad_norm": 18.968719482421875,
"kl": 5.1609375,
"learning_rate": 6.613992135357712e-07,
"loss": 1.5211,
"reward": 0.7916666865348816,
"reward_std": 0.698268249630928,
"rewards/accuracy_reward": 0.1500000052154064,
"rewards/format_reward": 0.6416666865348816,
"step": 855
},
{
"completion_length": 317.71667633056643,
"epoch": 0.45866666666666667,
"grad_norm": 37.6462287902832,
"kl": 4.6234375,
"learning_rate": 6.569859142948327e-07,
"loss": 1.3873,
"reward": 0.9583333611488343,
"reward_std": 0.6036534637212754,
"rewards/accuracy_reward": 0.2750000063329935,
"rewards/format_reward": 0.6833333522081375,
"step": 860
},
{
"completion_length": 405.2833450317383,
"epoch": 0.4613333333333333,
"grad_norm": 65.66597747802734,
"kl": 6.6875,
"learning_rate": 6.52559004771451e-07,
"loss": 1.5637,
"reward": 0.7166666906327009,
"reward_std": 0.7335242480039597,
"rewards/accuracy_reward": 0.20833333507180213,
"rewards/format_reward": 0.5083333522081375,
"step": 865
},
{
"completion_length": 306.6833435058594,
"epoch": 0.464,
"grad_norm": 40.86724853515625,
"kl": 3.3671875,
"learning_rate": 6.481188687675057e-07,
"loss": 0.9366,
"reward": 0.9291666686534882,
"reward_std": 0.5080332323908806,
"rewards/accuracy_reward": 0.23750000223517417,
"rewards/format_reward": 0.6916666865348816,
"step": 870
},
{
"completion_length": 263.4833419799805,
"epoch": 0.4666666666666667,
"grad_norm": 28.436981201171875,
"kl": 3.7203125,
"learning_rate": 6.436658912315788e-07,
"loss": 1.093,
"reward": 1.0041666984558106,
"reward_std": 0.5717462062835693,
"rewards/accuracy_reward": 0.2625000078231096,
"rewards/format_reward": 0.7416666805744171,
"step": 875
},
{
"completion_length": 237.6666702270508,
"epoch": 0.4693333333333333,
"grad_norm": 36.36558532714844,
"kl": 2.8359375,
"learning_rate": 6.392004582255807e-07,
"loss": 0.7905,
"reward": 1.104166716337204,
"reward_std": 0.5215684860944748,
"rewards/accuracy_reward": 0.3375000111758709,
"rewards/format_reward": 0.7666666865348816,
"step": 880
},
{
"completion_length": 254.6166732788086,
"epoch": 0.472,
"grad_norm": 87.91173553466797,
"kl": 4.0609375,
"learning_rate": 6.347229568912794e-07,
"loss": 1.2084,
"reward": 0.9250000298023224,
"reward_std": 0.6195752292871475,
"rewards/accuracy_reward": 0.2083333395421505,
"rewards/format_reward": 0.7166666805744171,
"step": 885
},
{
"completion_length": 280.2916717529297,
"epoch": 0.4746666666666667,
"grad_norm": 33.61180114746094,
"kl": 3.603125,
"learning_rate": 6.302337754167369e-07,
"loss": 1.1916,
"reward": 1.0166666984558106,
"reward_std": 0.6988473400473595,
"rewards/accuracy_reward": 0.3000000089406967,
"rewards/format_reward": 0.7166666805744171,
"step": 890
},
{
"completion_length": 193.06666946411133,
"epoch": 0.47733333333333333,
"grad_norm": 34.74186325073242,
"kl": 2.271875,
"learning_rate": 6.257333030026538e-07,
"loss": 0.8303,
"reward": 1.2000000476837158,
"reward_std": 0.4528850480914116,
"rewards/accuracy_reward": 0.3666666731238365,
"rewards/format_reward": 0.8333333551883697,
"step": 895
},
{
"completion_length": 275.5500061035156,
"epoch": 0.48,
"grad_norm": 30.625368118286133,
"kl": 5.39375,
"learning_rate": 6.212219298286261e-07,
"loss": 1.5581,
"reward": 1.000000035762787,
"reward_std": 0.6969257593154907,
"rewards/accuracy_reward": 0.26666667237877845,
"rewards/format_reward": 0.7333333551883697,
"step": 900
},
{
"completion_length": 214.02500610351564,
"epoch": 0.4826666666666667,
"grad_norm": 28.710886001586914,
"kl": 3.11328125,
"learning_rate": 6.167000470193188e-07,
"loss": 1.1685,
"reward": 0.9666666924953461,
"reward_std": 0.5459988377988338,
"rewards/accuracy_reward": 0.19166667386889458,
"rewards/format_reward": 0.7750000119209289,
"step": 905
},
{
"completion_length": 224.62500610351563,
"epoch": 0.48533333333333334,
"grad_norm": 44.227230072021484,
"kl": 2.56953125,
"learning_rate": 6.121680466105559e-07,
"loss": 0.9869,
"reward": 0.9708333671092987,
"reward_std": 0.5105708941817284,
"rewards/accuracy_reward": 0.18750000819563867,
"rewards/format_reward": 0.7833333551883698,
"step": 910
},
{
"completion_length": 184.80417404174804,
"epoch": 0.488,
"grad_norm": 24.200429916381836,
"kl": 2.596875,
"learning_rate": 6.076263215153307e-07,
"loss": 0.8057,
"reward": 1.1250000238418578,
"reward_std": 0.4605237804353237,
"rewards/accuracy_reward": 0.3000000089406967,
"rewards/format_reward": 0.8250000178813934,
"step": 915
},
{
"completion_length": 234.8166763305664,
"epoch": 0.49066666666666664,
"grad_norm": 40.316471099853516,
"kl": 4.540625,
"learning_rate": 6.030752654897434e-07,
"loss": 1.2207,
"reward": 0.9833333611488342,
"reward_std": 0.5229995906352997,
"rewards/accuracy_reward": 0.20833333544433116,
"rewards/format_reward": 0.7750000119209289,
"step": 920
},
{
"completion_length": 311.7916801452637,
"epoch": 0.49333333333333335,
"grad_norm": 34.297813415527344,
"kl": 4.3859375,
"learning_rate": 5.985152730988617e-07,
"loss": 1.3617,
"reward": 0.9625000298023224,
"reward_std": 0.7176508605480194,
"rewards/accuracy_reward": 0.2625000074505806,
"rewards/format_reward": 0.700000011920929,
"step": 925
},
{
"completion_length": 182.29167404174805,
"epoch": 0.496,
"grad_norm": 15.424689292907715,
"kl": 2.127734375,
"learning_rate": 5.939467396825136e-07,
"loss": 0.7552,
"reward": 1.1791667103767396,
"reward_std": 0.3441163420677185,
"rewards/accuracy_reward": 0.31250000596046446,
"rewards/format_reward": 0.8666666865348815,
"step": 930
},
{
"completion_length": 286.0208404541016,
"epoch": 0.49866666666666665,
"grad_norm": 33.052181243896484,
"kl": 3.21015625,
"learning_rate": 5.893700613210127e-07,
"loss": 1.0319,
"reward": 0.9208333551883697,
"reward_std": 0.6939111322164535,
"rewards/accuracy_reward": 0.23750000558793544,
"rewards/format_reward": 0.6833333432674408,
"step": 935
},
{
"completion_length": 343.77500610351564,
"epoch": 0.5013333333333333,
"grad_norm": 133.8216094970703,
"kl": 5.746875,
"learning_rate": 5.847856348008188e-07,
"loss": 1.4352,
"reward": 0.9500000268220902,
"reward_std": 0.7578961223363876,
"rewards/accuracy_reward": 0.3000000074505806,
"rewards/format_reward": 0.6500000178813934,
"step": 940
},
{
"completion_length": 435.433349609375,
"epoch": 0.504,
"grad_norm": 20.99445152282715,
"kl": 4.63515625,
"learning_rate": 5.801938575801371e-07,
"loss": 1.3974,
"reward": 0.7708333671092987,
"reward_std": 0.926627391576767,
"rewards/accuracy_reward": 0.27083334103226664,
"rewards/format_reward": 0.5000000163912773,
"step": 945
},
{
"completion_length": 370.6958404541016,
"epoch": 0.5066666666666667,
"grad_norm": 8.884126663208008,
"kl": 3.3875,
"learning_rate": 5.755951277544607e-07,
"loss": 1.131,
"reward": 0.8166666984558105,
"reward_std": 0.764404758810997,
"rewards/accuracy_reward": 0.2333333395421505,
"rewards/format_reward": 0.5833333551883697,
"step": 950
},
{
"completion_length": 340.5333419799805,
"epoch": 0.5093333333333333,
"grad_norm": 49.03192138671875,
"kl": 3.83671875,
"learning_rate": 5.709898440220551e-07,
"loss": 1.1697,
"reward": 0.9958333760499954,
"reward_std": 0.7807205557823181,
"rewards/accuracy_reward": 0.37916667461395265,
"rewards/format_reward": 0.6166666805744171,
"step": 955
},
{
"completion_length": 263.5541793823242,
"epoch": 0.512,
"grad_norm": 9.941174507141113,
"kl": 2.30234375,
"learning_rate": 5.663784056493936e-07,
"loss": 0.9917,
"reward": 1.1750000298023224,
"reward_std": 0.5471759930253028,
"rewards/accuracy_reward": 0.4166666727513075,
"rewards/format_reward": 0.7583333492279053,
"step": 960
},
{
"completion_length": 298.9041778564453,
"epoch": 0.5146666666666667,
"grad_norm": 28.354358673095703,
"kl": 3.20546875,
"learning_rate": 5.61761212436541e-07,
"loss": 1.051,
"reward": 0.862500011920929,
"reward_std": 0.6203906744718551,
"rewards/accuracy_reward": 0.19583333656191826,
"rewards/format_reward": 0.6666666805744171,
"step": 965
},
{
"completion_length": 181.4708381652832,
"epoch": 0.5173333333333333,
"grad_norm": 10.501627922058105,
"kl": 2.1251953125,
"learning_rate": 5.571386646824922e-07,
"loss": 0.4847,
"reward": 1.1541667103767395,
"reward_std": 0.2954378850758076,
"rewards/accuracy_reward": 0.29583333879709245,
"rewards/format_reward": 0.8583333492279053,
"step": 970
},
{
"completion_length": 157.45000381469725,
"epoch": 0.52,
"grad_norm": 8.8187894821167,
"kl": 1.551953125,
"learning_rate": 5.525111631504677e-07,
"loss": 0.5822,
"reward": 1.1541666984558105,
"reward_std": 0.36384222060441973,
"rewards/accuracy_reward": 0.27083334028720857,
"rewards/format_reward": 0.8833333492279053,
"step": 975
},
{
"completion_length": 146.86666870117188,
"epoch": 0.5226666666666666,
"grad_norm": 8.133127212524414,
"kl": 1.458203125,
"learning_rate": 5.478791090331677e-07,
"loss": 0.5357,
"reward": 1.2083333730697632,
"reward_std": 0.3662621095776558,
"rewards/accuracy_reward": 0.3166666738688946,
"rewards/format_reward": 0.891666692495346,
"step": 980
},
{
"completion_length": 128.45417251586915,
"epoch": 0.5253333333333333,
"grad_norm": 15.48926830291748,
"kl": 1.6015625,
"learning_rate": 5.432429039179899e-07,
"loss": 0.3506,
"reward": 1.1458333671092986,
"reward_std": 0.2651623532176018,
"rewards/accuracy_reward": 0.2541666731238365,
"rewards/format_reward": 0.891666692495346,
"step": 985
},
{
"completion_length": 146.4000045776367,
"epoch": 0.528,
"grad_norm": 5.376186370849609,
"kl": 1.834765625,
"learning_rate": 5.386029497522133e-07,
"loss": 0.6246,
"reward": 1.2416666865348815,
"reward_std": 0.35661301463842393,
"rewards/accuracy_reward": 0.3583333447575569,
"rewards/format_reward": 0.8833333551883698,
"step": 990
},
{
"completion_length": 177.82500457763672,
"epoch": 0.5306666666666666,
"grad_norm": 22.633089065551758,
"kl": 1.623828125,
"learning_rate": 5.3395964880815e-07,
"loss": 0.6805,
"reward": 1.141666692495346,
"reward_std": 0.358070158213377,
"rewards/accuracy_reward": 0.27500000707805156,
"rewards/format_reward": 0.8666666805744171,
"step": 995
},
{
"completion_length": 196.29583969116212,
"epoch": 0.5333333333333333,
"grad_norm": 47.34584426879883,
"kl": 1.69375,
"learning_rate": 5.293134036482698e-07,
"loss": 0.823,
"reward": 1.087500023841858,
"reward_std": 0.3883266061544418,
"rewards/accuracy_reward": 0.24583334140479565,
"rewards/format_reward": 0.8416666924953461,
"step": 1000
},
{
"epoch": 0.5333333333333333,
"eval_completion_length": 175.36000528971354,
"eval_kl": 1.351640625,
"eval_loss": 0.610393226146698,
"eval_reward": 1.0127778077125549,
"eval_reward_std": 0.40122534612814587,
"eval_rewards/accuracy_reward": 0.1438888931274414,
"eval_rewards/format_reward": 0.8688889082272847,
"eval_runtime": 672.1413,
"eval_samples_per_second": 0.446,
"eval_steps_per_second": 0.019,
"step": 1000
},
{
"completion_length": 131.97083587646483,
"epoch": 0.536,
"grad_norm": 80.59904479980469,
"kl": 1.5953125,
"learning_rate": 5.246646170902975e-07,
"loss": 0.4407,
"reward": 1.1833333492279052,
"reward_std": 0.2533145576715469,
"rewards/accuracy_reward": 0.26666667461395266,
"rewards/format_reward": 0.9166666746139527,
"step": 1005
},
{
"completion_length": 149.46666946411133,
"epoch": 0.5386666666666666,
"grad_norm": 6.973750591278076,
"kl": 1.4193359375,
"learning_rate": 5.200136921722918e-07,
"loss": 0.5413,
"reward": 1.1666666924953462,
"reward_std": 0.3360213190317154,
"rewards/accuracy_reward": 0.26666667833924296,
"rewards/format_reward": 0.9000000178813934,
"step": 1010
},
{
"completion_length": 178.3958366394043,
"epoch": 0.5413333333333333,
"grad_norm": 10.757118225097656,
"kl": 1.02265625,
"learning_rate": 5.153610321177013e-07,
"loss": 0.4342,
"reward": 1.0666667103767395,
"reward_std": 0.2775675721466541,
"rewards/accuracy_reward": 0.17500000670552254,
"rewards/format_reward": 0.8916666865348816,
"step": 1015
},
{
"completion_length": 265.3958404541016,
"epoch": 0.544,
"grad_norm": 11.506460189819336,
"kl": 2.033984375,
"learning_rate": 5.107070403004066e-07,
"loss": 0.8198,
"reward": 0.8750000298023224,
"reward_std": 0.6099086761474609,
"rewards/accuracy_reward": 0.15833333767950536,
"rewards/format_reward": 0.7166666865348816,
"step": 1020
},
{
"completion_length": 302.5041778564453,
"epoch": 0.5466666666666666,
"grad_norm": 16.32666015625,
"kl": 2.891015625,
"learning_rate": 5.060521202097489e-07,
"loss": 1.0253,
"reward": 1.025000014901161,
"reward_std": 0.5060222968459129,
"rewards/accuracy_reward": 0.27500000186264517,
"rewards/format_reward": 0.7500000208616256,
"step": 1025
},
{
"completion_length": 416.4416763305664,
"epoch": 0.5493333333333333,
"grad_norm": 15.603365898132324,
"kl": 2.96328125,
"learning_rate": 5.013966754155482e-07,
"loss": 1.1611,
"reward": 0.7500000238418579,
"reward_std": 0.8826716184616089,
"rewards/accuracy_reward": 0.2000000063329935,
"rewards/format_reward": 0.5500000149011612,
"step": 1030
},
{
"completion_length": 368.07500915527345,
"epoch": 0.552,
"grad_norm": 10.401843070983887,
"kl": 3.10078125,
"learning_rate": 4.967411095331149e-07,
"loss": 1.114,
"reward": 0.8791666865348816,
"reward_std": 0.7436762899160385,
"rewards/accuracy_reward": 0.2708333432674408,
"rewards/format_reward": 0.608333346247673,
"step": 1035
},
{
"completion_length": 253.62917251586913,
"epoch": 0.5546666666666666,
"grad_norm": 8.17926025390625,
"kl": 2.52109375,
"learning_rate": 4.920858261882577e-07,
"loss": 0.9692,
"reward": 1.0166667014360429,
"reward_std": 0.7252250477671623,
"rewards/accuracy_reward": 0.3000000059604645,
"rewards/format_reward": 0.7166666775941849,
"step": 1040
},
{
"completion_length": 202.72500228881836,
"epoch": 0.5573333333333333,
"grad_norm": 23.440996170043945,
"kl": 2.301953125,
"learning_rate": 4.874312289822899e-07,
"loss": 0.8462,
"reward": 1.1583333671092988,
"reward_std": 0.5490816205739975,
"rewards/accuracy_reward": 0.3333333469927311,
"rewards/format_reward": 0.825000011920929,
"step": 1045
},
{
"completion_length": 236.8416748046875,
"epoch": 0.56,
"grad_norm": 6.160148620605469,
"kl": 2.3287109375,
"learning_rate": 4.827777214570384e-07,
"loss": 0.9984,
"reward": 1.066666704416275,
"reward_std": 0.5855829656124115,
"rewards/accuracy_reward": 0.2833333447575569,
"rewards/format_reward": 0.7833333551883698,
"step": 1050
},
{
"completion_length": 160.41250305175782,
"epoch": 0.5626666666666666,
"grad_norm": 59.35076904296875,
"kl": 2.099609375,
"learning_rate": 4.781257070598571e-07,
"loss": 0.6582,
"reward": 1.2125000357627869,
"reward_std": 0.3614451542496681,
"rewards/accuracy_reward": 0.3291666731238365,
"rewards/format_reward": 0.8833333492279053,
"step": 1055
},
{
"completion_length": 216.84167404174804,
"epoch": 0.5653333333333334,
"grad_norm": 11.944228172302246,
"kl": 1.9265625,
"learning_rate": 4.734755891086498e-07,
"loss": 0.8889,
"reward": 0.9833333671092988,
"reward_std": 0.4978467658162117,
"rewards/accuracy_reward": 0.1583333358168602,
"rewards/format_reward": 0.8250000238418579,
"step": 1060
},
{
"completion_length": 204.32500534057618,
"epoch": 0.568,
"grad_norm": 20.293838500976562,
"kl": 2.085546875,
"learning_rate": 4.6882777075690346e-07,
"loss": 0.6455,
"reward": 1.1208333671092987,
"reward_std": 0.46157447397708895,
"rewards/accuracy_reward": 0.3041666723787785,
"rewards/format_reward": 0.8166666865348816,
"step": 1065
},
{
"completion_length": 239.66666946411132,
"epoch": 0.5706666666666667,
"grad_norm": 8.901671409606934,
"kl": 1.844140625,
"learning_rate": 4.6418265495873516e-07,
"loss": 0.7643,
"reward": 1.1750000357627868,
"reward_std": 0.4657398253679276,
"rewards/accuracy_reward": 0.35833334624767305,
"rewards/format_reward": 0.8166666865348816,
"step": 1070
},
{
"completion_length": 236.07917327880858,
"epoch": 0.5733333333333334,
"grad_norm": 10.970005989074707,
"kl": 1.715234375,
"learning_rate": 4.595406444339576e-07,
"loss": 0.7525,
"reward": 1.0000000417232513,
"reward_std": 0.4881373070180416,
"rewards/accuracy_reward": 0.19166667051613331,
"rewards/format_reward": 0.8083333492279052,
"step": 1075
},
{
"completion_length": 317.90001220703124,
"epoch": 0.576,
"grad_norm": 14.551898956298828,
"kl": 3.174609375,
"learning_rate": 4.5490214163316397e-07,
"loss": 0.9734,
"reward": 0.8916666984558106,
"reward_std": 0.6427461057901382,
"rewards/accuracy_reward": 0.20833333693444728,
"rewards/format_reward": 0.6833333522081375,
"step": 1080
},
{
"completion_length": 227.68750457763673,
"epoch": 0.5786666666666667,
"grad_norm": 9.023757934570312,
"kl": 1.8625,
"learning_rate": 4.502675487028369e-07,
"loss": 0.8347,
"reward": 1.0416666984558105,
"reward_std": 0.5368030399084092,
"rewards/accuracy_reward": 0.2416666727513075,
"rewards/format_reward": 0.8000000238418579,
"step": 1085
},
{
"completion_length": 288.81667404174806,
"epoch": 0.5813333333333334,
"grad_norm": 27.050466537475586,
"kl": 2.22265625,
"learning_rate": 4.456372674504828e-07,
"loss": 0.9977,
"reward": 0.9666666924953461,
"reward_std": 0.5600151270627975,
"rewards/accuracy_reward": 0.23333334028720856,
"rewards/format_reward": 0.7333333492279053,
"step": 1090
},
{
"completion_length": 233.35834045410155,
"epoch": 0.584,
"grad_norm": 99.57305908203125,
"kl": 2.521875,
"learning_rate": 4.4101169930979677e-07,
"loss": 1.0178,
"reward": 1.0416666984558105,
"reward_std": 0.5232247993350029,
"rewards/accuracy_reward": 0.2500000074505806,
"rewards/format_reward": 0.7916666865348816,
"step": 1095
},
{
"completion_length": 237.47084121704103,
"epoch": 0.5866666666666667,
"grad_norm": 32.285587310791016,
"kl": 2.566796875,
"learning_rate": 4.3639124530585885e-07,
"loss": 0.9553,
"reward": 1.0416666984558105,
"reward_std": 0.49489557296037673,
"rewards/accuracy_reward": 0.24166667237877845,
"rewards/format_reward": 0.8000000238418579,
"step": 1100
},
{
"completion_length": 263.5375045776367,
"epoch": 0.5893333333333334,
"grad_norm": 14.501636505126953,
"kl": 2.036328125,
"learning_rate": 4.317763060203664e-07,
"loss": 0.7803,
"reward": 1.0916667044162751,
"reward_std": 0.5974579885601997,
"rewards/accuracy_reward": 0.34166668020188806,
"rewards/format_reward": 0.7500000119209289,
"step": 1105
},
{
"completion_length": 190.2458381652832,
"epoch": 0.592,
"grad_norm": 28.570791244506836,
"kl": 2.1984375,
"learning_rate": 4.271672815569047e-07,
"loss": 0.7099,
"reward": 1.1125000298023224,
"reward_std": 0.4488224387168884,
"rewards/accuracy_reward": 0.2958333443850279,
"rewards/format_reward": 0.8166666924953461,
"step": 1110
},
{
"completion_length": 191.68750381469727,
"epoch": 0.5946666666666667,
"grad_norm": 56.26449203491211,
"kl": 1.941015625,
"learning_rate": 4.2256457150625847e-07,
"loss": 0.7596,
"reward": 1.1041666984558105,
"reward_std": 0.43807603493332864,
"rewards/accuracy_reward": 0.27916667200624945,
"rewards/format_reward": 0.8250000238418579,
"step": 1115
},
{
"completion_length": 164.62500534057617,
"epoch": 0.5973333333333334,
"grad_norm": 159.04928588867188,
"kl": 2.11171875,
"learning_rate": 4.1796857491176966e-07,
"loss": 0.7277,
"reward": 1.200000035762787,
"reward_std": 0.3421368353068829,
"rewards/accuracy_reward": 0.30833334289491177,
"rewards/format_reward": 0.8916666805744171,
"step": 1120
},
{
"completion_length": 216.95833892822264,
"epoch": 0.6,
"grad_norm": 12.805575370788574,
"kl": 1.614453125,
"learning_rate": 4.133796902347396e-07,
"loss": 0.5934,
"reward": 1.1250000417232513,
"reward_std": 0.4264773324131966,
"rewards/accuracy_reward": 0.3083333373069763,
"rewards/format_reward": 0.8166666865348816,
"step": 1125
},
{
"completion_length": 219.3791748046875,
"epoch": 0.6026666666666667,
"grad_norm": 14.401493072509766,
"kl": 2.0857421875,
"learning_rate": 4.087983153198848e-07,
"loss": 0.6371,
"reward": 1.0958333671092988,
"reward_std": 0.5428274616599083,
"rewards/accuracy_reward": 0.3041666761040688,
"rewards/format_reward": 0.7916666865348816,
"step": 1130
},
{
"completion_length": 189.86667556762694,
"epoch": 0.6053333333333333,
"grad_norm": 19.875732421875,
"kl": 2.033203125,
"learning_rate": 4.0422484736084414e-07,
"loss": 0.6977,
"reward": 1.0750000417232513,
"reward_std": 0.40034623965620997,
"rewards/accuracy_reward": 0.2083333395421505,
"rewards/format_reward": 0.8666666805744171,
"step": 1135
},
{
"completion_length": 160.80417022705078,
"epoch": 0.608,
"grad_norm": 9.801048278808594,
"kl": 1.8109375,
"learning_rate": 3.9965968286574367e-07,
"loss": 0.5807,
"reward": 1.1375000476837158,
"reward_std": 0.3900581821799278,
"rewards/accuracy_reward": 0.2625000040978193,
"rewards/format_reward": 0.8750000178813935,
"step": 1140
},
{
"completion_length": 157.98750534057618,
"epoch": 0.6106666666666667,
"grad_norm": 30.29570770263672,
"kl": 1.40703125,
"learning_rate": 3.951032176228199e-07,
"loss": 0.5565,
"reward": 1.200000035762787,
"reward_std": 0.2764429710805416,
"rewards/accuracy_reward": 0.2916666716337204,
"rewards/format_reward": 0.9083333492279053,
"step": 1145
},
{
"completion_length": 180.90000610351564,
"epoch": 0.6133333333333333,
"grad_norm": 23.002822875976562,
"kl": 1.65390625,
"learning_rate": 3.9055584666610596e-07,
"loss": 0.6413,
"reward": 1.2041666984558106,
"reward_std": 0.40405053198337554,
"rewards/accuracy_reward": 0.33750001415610315,
"rewards/format_reward": 0.8666666924953461,
"step": 1150
},
{
"completion_length": 186.8958396911621,
"epoch": 0.616,
"grad_norm": 12.507006645202637,
"kl": 1.280078125,
"learning_rate": 3.860179642411837e-07,
"loss": 0.6498,
"reward": 1.1333333432674408,
"reward_std": 0.3774823874235153,
"rewards/accuracy_reward": 0.2750000037252903,
"rewards/format_reward": 0.8583333492279053,
"step": 1155
},
{
"completion_length": 155.95416946411132,
"epoch": 0.6186666666666667,
"grad_norm": 3.330310583114624,
"kl": 1.2033203125,
"learning_rate": 3.8148996377100304e-07,
"loss": 0.4313,
"reward": 1.2041667103767395,
"reward_std": 0.3509316384792328,
"rewards/accuracy_reward": 0.3125000074505806,
"rewards/format_reward": 0.8916666805744171,
"step": 1160
},
{
"completion_length": 188.76667327880858,
"epoch": 0.6213333333333333,
"grad_norm": 40.26872634887695,
"kl": 2.188671875,
"learning_rate": 3.7697223782177303e-07,
"loss": 0.6651,
"reward": 1.2583333909511567,
"reward_std": 0.4299319893121719,
"rewards/accuracy_reward": 0.4083333432674408,
"rewards/format_reward": 0.8500000178813935,
"step": 1165
},
{
"completion_length": 252.42917556762694,
"epoch": 0.624,
"grad_norm": 16.348541259765625,
"kl": 2.37421875,
"learning_rate": 3.724651780689285e-07,
"loss": 0.9173,
"reward": 1.0250000298023223,
"reward_std": 0.5082567930221558,
"rewards/accuracy_reward": 0.2666666731238365,
"rewards/format_reward": 0.7583333551883698,
"step": 1170
},
{
"completion_length": 239.4791717529297,
"epoch": 0.6266666666666667,
"grad_norm": 17.348703384399414,
"kl": 1.586328125,
"learning_rate": 3.679691752631715e-07,
"loss": 0.796,
"reward": 1.1250000417232513,
"reward_std": 0.4487001359462738,
"rewards/accuracy_reward": 0.3250000089406967,
"rewards/format_reward": 0.8000000238418579,
"step": 1175
},
{
"completion_length": 184.93750610351563,
"epoch": 0.6293333333333333,
"grad_norm": 5.322170734405518,
"kl": 1.261328125,
"learning_rate": 3.6348461919659433e-07,
"loss": 0.5975,
"reward": 1.200000023841858,
"reward_std": 0.394177620112896,
"rewards/accuracy_reward": 0.35000001043081286,
"rewards/format_reward": 0.8500000178813935,
"step": 1180
},
{
"completion_length": 149.95417175292968,
"epoch": 0.632,
"grad_norm": 4.652257919311523,
"kl": 1.3166015625,
"learning_rate": 3.590118986688865e-07,
"loss": 0.5187,
"reward": 1.2375000357627868,
"reward_std": 0.35248192474246026,
"rewards/accuracy_reward": 0.34583334140479566,
"rewards/format_reward": 0.8916666805744171,
"step": 1185
},
{
"completion_length": 214.28750915527343,
"epoch": 0.6346666666666667,
"grad_norm": 8.401276588439941,
"kl": 1.615234375,
"learning_rate": 3.5455140145362586e-07,
"loss": 0.803,
"reward": 1.1166667103767396,
"reward_std": 0.385433167219162,
"rewards/accuracy_reward": 0.2750000089406967,
"rewards/format_reward": 0.8416666865348816,
"step": 1190
},
{
"completion_length": 243.75000686645507,
"epoch": 0.6373333333333333,
"grad_norm": 17.25551414489746,
"kl": 1.5962890625,
"learning_rate": 3.5010351426466003e-07,
"loss": 0.7611,
"reward": 1.1750000417232513,
"reward_std": 0.5330882802605629,
"rewards/accuracy_reward": 0.36666667759418486,
"rewards/format_reward": 0.8083333492279052,
"step": 1195
},
{
"completion_length": 185.69167251586913,
"epoch": 0.64,
"grad_norm": 15.053462982177734,
"kl": 1.40078125,
"learning_rate": 3.4566862272257917e-07,
"loss": 0.7929,
"reward": 1.154166692495346,
"reward_std": 0.4828508198261261,
"rewards/accuracy_reward": 0.30416667461395264,
"rewards/format_reward": 0.8500000178813935,
"step": 1200
},
{
"epoch": 0.64,
"eval_completion_length": 263.9861196899414,
"eval_kl": 2.4522395833333333,
"eval_loss": 0.904167890548706,
"eval_reward": 0.9144444712003073,
"eval_reward_std": 0.5771439289053281,
"eval_rewards/accuracy_reward": 0.16111111541589102,
"eval_rewards/format_reward": 0.7533333551883697,
"eval_runtime": 815.5292,
"eval_samples_per_second": 0.368,
"eval_steps_per_second": 0.016,
"step": 1200
},
{
"completion_length": 224.0416717529297,
"epoch": 0.6426666666666667,
"grad_norm": 14.006892204284668,
"kl": 2.721875,
"learning_rate": 3.412471113212837e-07,
"loss": 0.8414,
"reward": 1.0583333611488341,
"reward_std": 0.5013190120458603,
"rewards/accuracy_reward": 0.2583333384245634,
"rewards/format_reward": 0.8000000178813934,
"step": 1205
},
{
"completion_length": 250.73750228881835,
"epoch": 0.6453333333333333,
"grad_norm": 4.346928596496582,
"kl": 1.89140625,
"learning_rate": 3.3683936339464955e-07,
"loss": 0.6544,
"reward": 1.1083333611488342,
"reward_std": 0.5055985808372497,
"rewards/accuracy_reward": 0.33333334103226664,
"rewards/format_reward": 0.7750000178813934,
"step": 1210
},
{
"completion_length": 262.02500915527344,
"epoch": 0.648,
"grad_norm": 8.122538566589355,
"kl": 1.6572265625,
"learning_rate": 3.324457610832941e-07,
"loss": 0.8986,
"reward": 1.1541667103767395,
"reward_std": 0.5832742094993592,
"rewards/accuracy_reward": 0.3791666805744171,
"rewards/format_reward": 0.775000023841858,
"step": 1215
},
{
"completion_length": 245.7916717529297,
"epoch": 0.6506666666666666,
"grad_norm": 11.933501243591309,
"kl": 1.238671875,
"learning_rate": 3.280666853014457e-07,
"loss": 0.7222,
"reward": 1.1958333611488343,
"reward_std": 0.5057858511805534,
"rewards/accuracy_reward": 0.40416667573153975,
"rewards/format_reward": 0.7916666746139527,
"step": 1220
},
{
"completion_length": 311.77917633056643,
"epoch": 0.6533333333333333,
"grad_norm": 13.826354026794434,
"kl": 2.342578125,
"learning_rate": 3.2370251570391925e-07,
"loss": 0.9241,
"reward": 0.941666704416275,
"reward_std": 0.5843323901295662,
"rewards/accuracy_reward": 0.2000000037252903,
"rewards/format_reward": 0.7416666865348815,
"step": 1225
},
{
"completion_length": 288.49167709350587,
"epoch": 0.656,
"grad_norm": 12.062870979309082,
"kl": 2.384375,
"learning_rate": 3.1935363065320126e-07,
"loss": 0.9282,
"reward": 0.9458333492279053,
"reward_std": 0.5712588280439377,
"rewards/accuracy_reward": 0.19583333730697633,
"rewards/format_reward": 0.7500000119209289,
"step": 1230
},
{
"completion_length": 235.39584197998047,
"epoch": 0.6586666666666666,
"grad_norm": 7.450344085693359,
"kl": 1.758203125,
"learning_rate": 3.150204071866464e-07,
"loss": 0.6324,
"reward": 1.1458333611488343,
"reward_std": 0.5423780143260956,
"rewards/accuracy_reward": 0.36250000819563866,
"rewards/format_reward": 0.7833333551883698,
"step": 1235
},
{
"completion_length": 197.81250762939453,
"epoch": 0.6613333333333333,
"grad_norm": 11.524059295654297,
"kl": 1.31171875,
"learning_rate": 3.107032209837892e-07,
"loss": 0.5257,
"reward": 1.0458333551883698,
"reward_std": 0.334098968654871,
"rewards/accuracy_reward": 0.19583333805203437,
"rewards/format_reward": 0.850000011920929,
"step": 1240
},
{
"completion_length": 247.20000762939452,
"epoch": 0.664,
"grad_norm": 7.961787700653076,
"kl": 1.726171875,
"learning_rate": 3.064024463337747e-07,
"loss": 0.8873,
"reward": 0.916666692495346,
"reward_std": 0.4848631680011749,
"rewards/accuracy_reward": 0.14166667088866233,
"rewards/format_reward": 0.775000023841858,
"step": 1245
},
{
"completion_length": 229.66667633056642,
"epoch": 0.6666666666666666,
"grad_norm": 23.106739044189453,
"kl": 1.5203125,
"learning_rate": 3.021184561029071e-07,
"loss": 0.8455,
"reward": 1.1083333611488342,
"reward_std": 0.45471236705780027,
"rewards/accuracy_reward": 0.28333334140479566,
"rewards/format_reward": 0.8250000238418579,
"step": 1250
},
{
"completion_length": 316.7458381652832,
"epoch": 0.6693333333333333,
"grad_norm": 9.744142532348633,
"kl": 2.18046875,
"learning_rate": 2.9785162170232424e-07,
"loss": 0.9333,
"reward": 0.829166692495346,
"reward_std": 0.6003586441278458,
"rewards/accuracy_reward": 0.13750000558793546,
"rewards/format_reward": 0.6916666984558105,
"step": 1255
},
{
"completion_length": 242.23750762939454,
"epoch": 0.672,
"grad_norm": 25.725797653198242,
"kl": 1.94375,
"learning_rate": 2.936023130557964e-07,
"loss": 0.7897,
"reward": 1.079166704416275,
"reward_std": 0.5282017394900322,
"rewards/accuracy_reward": 0.2875000059604645,
"rewards/format_reward": 0.7916666865348816,
"step": 1260
},
{
"completion_length": 226.35417251586915,
"epoch": 0.6746666666666666,
"grad_norm": 4.361880779266357,
"kl": 1.287109375,
"learning_rate": 2.893708985676556e-07,
"loss": 0.7382,
"reward": 1.1708333611488342,
"reward_std": 0.4796324223279953,
"rewards/accuracy_reward": 0.33750001415610315,
"rewards/format_reward": 0.8333333551883697,
"step": 1265
},
{
"completion_length": 179.21250839233397,
"epoch": 0.6773333333333333,
"grad_norm": 10.620832443237305,
"kl": 1.5546875,
"learning_rate": 2.851577450908553e-07,
"loss": 0.5363,
"reward": 1.204166692495346,
"reward_std": 0.39730483293533325,
"rewards/accuracy_reward": 0.3291666775941849,
"rewards/format_reward": 0.8750000238418579,
"step": 1270
},
{
"completion_length": 225.60834045410155,
"epoch": 0.68,
"grad_norm": 24.622392654418945,
"kl": 1.930859375,
"learning_rate": 2.809632178951655e-07,
"loss": 0.7525,
"reward": 1.2000000476837158,
"reward_std": 0.43045871555805204,
"rewards/accuracy_reward": 0.3833333432674408,
"rewards/format_reward": 0.8166666865348816,
"step": 1275
},
{
"completion_length": 189.73333892822265,
"epoch": 0.6826666666666666,
"grad_norm": 7.506571292877197,
"kl": 1.469921875,
"learning_rate": 2.767876806355045e-07,
"loss": 0.6633,
"reward": 1.1250000298023224,
"reward_std": 0.4194158732891083,
"rewards/accuracy_reward": 0.2666666753590107,
"rewards/format_reward": 0.8583333551883697,
"step": 1280
},
{
"completion_length": 205.73334045410155,
"epoch": 0.6853333333333333,
"grad_norm": 6.358190059661865,
"kl": 1.3841796875,
"learning_rate": 2.7263149532041107e-07,
"loss": 0.8039,
"reward": 1.2166667103767395,
"reward_std": 0.5374398469924927,
"rewards/accuracy_reward": 0.3750000111758709,
"rewards/format_reward": 0.8416666865348816,
"step": 1285
},
{
"completion_length": 210.5208427429199,
"epoch": 0.688,
"grad_norm": 27.677288055419922,
"kl": 2.022265625,
"learning_rate": 2.6849502228065955e-07,
"loss": 0.8598,
"reward": 1.0458333790302277,
"reward_std": 0.47432570457458495,
"rewards/accuracy_reward": 0.2291666693985462,
"rewards/format_reward": 0.8166666924953461,
"step": 1290
},
{
"completion_length": 206.36667022705078,
"epoch": 0.6906666666666667,
"grad_norm": 27.595359802246094,
"kl": 2.200390625,
"learning_rate": 2.6437862013801937e-07,
"loss": 0.7624,
"reward": 1.075000023841858,
"reward_std": 0.45619752407073977,
"rewards/accuracy_reward": 0.27500000447034834,
"rewards/format_reward": 0.8000000238418579,
"step": 1295
},
{
"completion_length": 281.9166717529297,
"epoch": 0.6933333333333334,
"grad_norm": 7.714141845703125,
"kl": 3.14765625,
"learning_rate": 2.6028264577416414e-07,
"loss": 0.9706,
"reward": 0.8500000357627868,
"reward_std": 0.6301615715026856,
"rewards/accuracy_reward": 0.1916666690260172,
"rewards/format_reward": 0.6583333551883698,
"step": 1300
},
{
"completion_length": 227.537508392334,
"epoch": 0.696,
"grad_norm": 11.018620491027832,
"kl": 1.490625,
"learning_rate": 2.5620745429973046e-07,
"loss": 0.7425,
"reward": 1.0625000417232513,
"reward_std": 0.47583652585744857,
"rewards/accuracy_reward": 0.2708333406597376,
"rewards/format_reward": 0.7916666865348816,
"step": 1305
},
{
"completion_length": 206.64584045410157,
"epoch": 0.6986666666666667,
"grad_norm": 7.6855149269104,
"kl": 1.15,
"learning_rate": 2.5215339902353093e-07,
"loss": 0.6114,
"reward": 1.2291666984558105,
"reward_std": 0.40088424533605577,
"rewards/accuracy_reward": 0.37083333879709246,
"rewards/format_reward": 0.8583333492279053,
"step": 1310
},
{
"completion_length": 236.3791717529297,
"epoch": 0.7013333333333334,
"grad_norm": 36.869991302490234,
"kl": 1.91953125,
"learning_rate": 2.4812083142192323e-07,
"loss": 0.8435,
"reward": 1.1000000357627868,
"reward_std": 0.501855157315731,
"rewards/accuracy_reward": 0.30000000447034836,
"rewards/format_reward": 0.8000000178813934,
"step": 1315
},
{
"completion_length": 239.29167404174805,
"epoch": 0.704,
"grad_norm": 34.20293045043945,
"kl": 2.9728515625,
"learning_rate": 2.441101011083378e-07,
"loss": 0.652,
"reward": 1.1500000417232514,
"reward_std": 0.4413339100778103,
"rewards/accuracy_reward": 0.33333334028720857,
"rewards/format_reward": 0.8166666865348816,
"step": 1320
},
{
"completion_length": 196.09583740234376,
"epoch": 0.7066666666666667,
"grad_norm": 6.715369701385498,
"kl": 1.90703125,
"learning_rate": 2.4012155580296705e-07,
"loss": 0.571,
"reward": 1.3291667222976684,
"reward_std": 0.47337266951799395,
"rewards/accuracy_reward": 0.4958333492279053,
"rewards/format_reward": 0.8333333492279053,
"step": 1325
},
{
"completion_length": 218.24584045410157,
"epoch": 0.7093333333333334,
"grad_norm": 6.281963348388672,
"kl": 1.1951171875,
"learning_rate": 2.3615554130262e-07,
"loss": 0.7098,
"reward": 1.2250000357627868,
"reward_std": 0.4241747669875622,
"rewards/accuracy_reward": 0.3916666805744171,
"rewards/format_reward": 0.8333333551883697,
"step": 1330
},
{
"completion_length": 233.00000915527343,
"epoch": 0.712,
"grad_norm": 65.47384643554688,
"kl": 1.837109375,
"learning_rate": 2.3221240145074095e-07,
"loss": 0.8178,
"reward": 1.0833333551883697,
"reward_std": 0.49581558406353,
"rewards/accuracy_reward": 0.3000000059604645,
"rewards/format_reward": 0.7833333551883698,
"step": 1335
},
{
"completion_length": 223.82500839233398,
"epoch": 0.7146666666666667,
"grad_norm": 4.103445529937744,
"kl": 1.4453125,
"learning_rate": 2.2829247810760021e-07,
"loss": 0.7565,
"reward": 1.075000023841858,
"reward_std": 0.45809874683618546,
"rewards/accuracy_reward": 0.2666666697710752,
"rewards/format_reward": 0.8083333551883698,
"step": 1340
},
{
"completion_length": 205.72083892822266,
"epoch": 0.7173333333333334,
"grad_norm": 12.356497764587402,
"kl": 1.340234375,
"learning_rate": 2.2439611112065547e-07,
"loss": 0.5622,
"reward": 1.0833333671092986,
"reward_std": 0.3753781244158745,
"rewards/accuracy_reward": 0.2500000026077032,
"rewards/format_reward": 0.8333333492279053,
"step": 1345
},
{
"completion_length": 223.57083969116212,
"epoch": 0.72,
"grad_norm": 17.527667999267578,
"kl": 1.3921875,
"learning_rate": 2.2052363829508776e-07,
"loss": 0.6411,
"reward": 1.2500000536441802,
"reward_std": 0.46224844083189964,
"rewards/accuracy_reward": 0.41666667014360426,
"rewards/format_reward": 0.8333333551883697,
"step": 1350
},
{
"completion_length": 223.97083892822266,
"epoch": 0.7226666666666667,
"grad_norm": 6.988903522491455,
"kl": 2.165234375,
"learning_rate": 2.1667539536451452e-07,
"loss": 0.7447,
"reward": 1.004166704416275,
"reward_std": 0.5843853443861008,
"rewards/accuracy_reward": 0.2458333346992731,
"rewards/format_reward": 0.7583333611488342,
"step": 1355
},
{
"completion_length": 207.48334045410155,
"epoch": 0.7253333333333334,
"grad_norm": 4.7572832107543945,
"kl": 1.383203125,
"learning_rate": 2.1285171596188268e-07,
"loss": 0.6242,
"reward": 1.2250000357627868,
"reward_std": 0.4148427419364452,
"rewards/accuracy_reward": 0.36666668206453323,
"rewards/format_reward": 0.8583333551883697,
"step": 1360
},
{
"completion_length": 198.64167556762695,
"epoch": 0.728,
"grad_norm": 9.330974578857422,
"kl": 1.2017578125,
"learning_rate": 2.090529315905431e-07,
"loss": 0.5328,
"reward": 1.1500000298023223,
"reward_std": 0.36292394176125525,
"rewards/accuracy_reward": 0.3000000089406967,
"rewards/format_reward": 0.8500000089406967,
"step": 1365
},
{
"completion_length": 175.92083740234375,
"epoch": 0.7306666666666667,
"grad_norm": 5.827265739440918,
"kl": 1.030078125,
"learning_rate": 2.052793715955104e-07,
"loss": 0.4956,
"reward": 1.2375000238418579,
"reward_std": 0.41373512148857117,
"rewards/accuracy_reward": 0.3625000089406967,
"rewards/format_reward": 0.8750000178813935,
"step": 1370
},
{
"completion_length": 211.1041732788086,
"epoch": 0.7333333333333333,
"grad_norm": 19.677011489868164,
"kl": 1.71640625,
"learning_rate": 2.0153136313490943e-07,
"loss": 0.6799,
"reward": 1.1750000238418579,
"reward_std": 0.4355910629034042,
"rewards/accuracy_reward": 0.33333333767950535,
"rewards/format_reward": 0.8416666805744171,
"step": 1375
},
{
"completion_length": 178.94584045410156,
"epoch": 0.736,
"grad_norm": 9.273496627807617,
"kl": 2.182421875,
"learning_rate": 1.9780923115161158e-07,
"loss": 0.7168,
"reward": 1.2291667103767394,
"reward_std": 0.4490681551396847,
"rewards/accuracy_reward": 0.37916667461395265,
"rewards/format_reward": 0.8500000178813935,
"step": 1380
},
{
"completion_length": 169.15000762939454,
"epoch": 0.7386666666666667,
"grad_norm": 4.525454044342041,
"kl": 1.0986328125,
"learning_rate": 1.9411329834506286e-07,
"loss": 0.5639,
"reward": 1.237500047683716,
"reward_std": 0.43203722685575485,
"rewards/accuracy_reward": 0.3708333432674408,
"rewards/format_reward": 0.8666666865348815,
"step": 1385
},
{
"completion_length": 246.45417175292968,
"epoch": 0.7413333333333333,
"grad_norm": 6.603713035583496,
"kl": 1.836328125,
"learning_rate": 1.904438851433068e-07,
"loss": 0.9502,
"reward": 0.9625000119209289,
"reward_std": 0.5920814260840416,
"rewards/accuracy_reward": 0.1958333369344473,
"rewards/format_reward": 0.7666666865348816,
"step": 1390
},
{
"completion_length": 228.13750686645508,
"epoch": 0.744,
"grad_norm": 12.823264122009277,
"kl": 2.1423828125,
"learning_rate": 1.868013096752043e-07,
"loss": 0.6335,
"reward": 1.0458333492279053,
"reward_std": 0.4450342819094658,
"rewards/accuracy_reward": 0.2708333358168602,
"rewards/format_reward": 0.7750000178813934,
"step": 1395
},
{
"completion_length": 179.4166717529297,
"epoch": 0.7466666666666667,
"grad_norm": 5.9196295738220215,
"kl": 1.4501953125,
"learning_rate": 1.8318588774285237e-07,
"loss": 0.675,
"reward": 1.2958333611488342,
"reward_std": 0.4500196687877178,
"rewards/accuracy_reward": 0.45416668355464934,
"rewards/format_reward": 0.8416666924953461,
"step": 1400
},
{
"epoch": 0.7466666666666667,
"eval_completion_length": 254.25000681559246,
"eval_kl": 2.1704947916666666,
"eval_loss": 0.8455010652542114,
"eval_reward": 0.9288889118035635,
"eval_reward_std": 0.5875919719537099,
"eval_rewards/accuracy_reward": 0.1711111158132553,
"eval_rewards/format_reward": 0.7577777977784474,
"eval_runtime": 828.1328,
"eval_samples_per_second": 0.362,
"eval_steps_per_second": 0.016,
"step": 1400
},
{
"completion_length": 203.6458396911621,
"epoch": 0.7493333333333333,
"grad_norm": 6.658132553100586,
"kl": 1.80625,
"learning_rate": 1.7959793279420505e-07,
"loss": 0.7215,
"reward": 1.1333333730697632,
"reward_std": 0.4133936479687691,
"rewards/accuracy_reward": 0.2916666746139526,
"rewards/format_reward": 0.8416666924953461,
"step": 1405
},
{
"completion_length": 137.4416702270508,
"epoch": 0.752,
"grad_norm": 5.461396217346191,
"kl": 1.5439453125,
"learning_rate": 1.760377558958982e-07,
"loss": 0.4963,
"reward": 1.1791667282581328,
"reward_std": 0.3674156993627548,
"rewards/accuracy_reward": 0.287500013038516,
"rewards/format_reward": 0.8916666865348816,
"step": 1410
},
{
"completion_length": 180.80833892822267,
"epoch": 0.7546666666666667,
"grad_norm": 6.98518180847168,
"kl": 1.206640625,
"learning_rate": 1.72505665706281e-07,
"loss": 0.6087,
"reward": 1.2375000536441803,
"reward_std": 0.3306782692670822,
"rewards/accuracy_reward": 0.3541666746139526,
"rewards/format_reward": 0.8833333551883698,
"step": 1415
},
{
"completion_length": 148.8083396911621,
"epoch": 0.7573333333333333,
"grad_norm": 9.099882125854492,
"kl": 1.595703125,
"learning_rate": 1.690019684486557e-07,
"loss": 0.5703,
"reward": 1.1416666984558106,
"reward_std": 0.3741296485066414,
"rewards/accuracy_reward": 0.2666666775941849,
"rewards/format_reward": 0.8750000119209289,
"step": 1420
},
{
"completion_length": 205.50000534057617,
"epoch": 0.76,
"grad_norm": 3.151777744293213,
"kl": 1.4462890625,
"learning_rate": 1.655269678847292e-07,
"loss": 0.6055,
"reward": 1.1291666865348815,
"reward_std": 0.415025033056736,
"rewards/accuracy_reward": 0.2708333432674408,
"rewards/format_reward": 0.8583333432674408,
"step": 1425
},
{
"completion_length": 156.6208381652832,
"epoch": 0.7626666666666667,
"grad_norm": 8.909594535827637,
"kl": 1.776953125,
"learning_rate": 1.6208096528827714e-07,
"loss": 0.4477,
"reward": 1.1916666984558106,
"reward_std": 0.291341669857502,
"rewards/accuracy_reward": 0.3166666816920042,
"rewards/format_reward": 0.8750000178813935,
"step": 1430
},
{
"completion_length": 214.9083396911621,
"epoch": 0.7653333333333333,
"grad_norm": 10.612777709960938,
"kl": 1.9314453125,
"learning_rate": 1.5866425941902522e-07,
"loss": 0.7519,
"reward": 1.0875000536441803,
"reward_std": 0.4693745546042919,
"rewards/accuracy_reward": 0.2625000089406967,
"rewards/format_reward": 0.8250000178813934,
"step": 1435
},
{
"completion_length": 223.26250991821288,
"epoch": 0.768,
"grad_norm": 21.530683517456055,
"kl": 1.9796875,
"learning_rate": 1.5527714649674638e-07,
"loss": 0.8694,
"reward": 1.1375000476837158,
"reward_std": 0.47066808491945267,
"rewards/accuracy_reward": 0.3375000089406967,
"rewards/format_reward": 0.8000000238418579,
"step": 1440
},
{
"completion_length": 226.37083740234374,
"epoch": 0.7706666666666667,
"grad_norm": 13.953042030334473,
"kl": 2.4328125,
"learning_rate": 1.5191992017557993e-07,
"loss": 0.8953,
"reward": 0.9833333551883697,
"reward_std": 0.5663923621177673,
"rewards/accuracy_reward": 0.2250000074505806,
"rewards/format_reward": 0.7583333492279053,
"step": 1445
},
{
"completion_length": 235.72917404174805,
"epoch": 0.7733333333333333,
"grad_norm": 89.63887786865234,
"kl": 1.90859375,
"learning_rate": 1.485928715185721e-07,
"loss": 0.6224,
"reward": 1.1458333492279054,
"reward_std": 0.4521483927965164,
"rewards/accuracy_reward": 0.3458333402872086,
"rewards/format_reward": 0.8000000238418579,
"step": 1450
},
{
"completion_length": 238.7125045776367,
"epoch": 0.776,
"grad_norm": 9.134527206420898,
"kl": 0.866015625,
"learning_rate": 1.4529628897244212e-07,
"loss": 0.7825,
"reward": 1.1291666984558106,
"reward_std": 0.49776799529790877,
"rewards/accuracy_reward": 0.31250000447034837,
"rewards/format_reward": 0.8166666984558105,
"step": 1455
},
{
"completion_length": 201.51250381469725,
"epoch": 0.7786666666666666,
"grad_norm": 11.216150283813477,
"kl": 1.4703125,
"learning_rate": 1.4203045834257417e-07,
"loss": 0.6511,
"reward": 1.137500035762787,
"reward_std": 0.4635964795947075,
"rewards/accuracy_reward": 0.31250000894069674,
"rewards/format_reward": 0.8250000238418579,
"step": 1460
},
{
"completion_length": 130.93750381469727,
"epoch": 0.7813333333333333,
"grad_norm": 3.627894639968872,
"kl": 1.075,
"learning_rate": 1.3879566276823896e-07,
"loss": 0.3128,
"reward": 1.2708333611488343,
"reward_std": 0.2865241147577763,
"rewards/accuracy_reward": 0.3375000089406967,
"rewards/format_reward": 0.9333333432674408,
"step": 1465
},
{
"completion_length": 216.83750839233397,
"epoch": 0.784,
"grad_norm": 12.179669380187988,
"kl": 1.70078125,
"learning_rate": 1.3559218269804624e-07,
"loss": 0.7939,
"reward": 1.237500047683716,
"reward_std": 0.4636766240000725,
"rewards/accuracy_reward": 0.42083334252238275,
"rewards/format_reward": 0.8166666865348816,
"step": 1470
},
{
"completion_length": 158.32917251586915,
"epoch": 0.7866666666666666,
"grad_norm": 5.9073967933654785,
"kl": 1.6234375,
"learning_rate": 1.3242029586563054e-07,
"loss": 0.4521,
"reward": 1.066666692495346,
"reward_std": 0.27624749541282656,
"rewards/accuracy_reward": 0.20000000447034835,
"rewards/format_reward": 0.8666666865348815,
"step": 1475
},
{
"completion_length": 207.18334197998047,
"epoch": 0.7893333333333333,
"grad_norm": 3.3254387378692627,
"kl": 1.4654296875,
"learning_rate": 1.2928027726557255e-07,
"loss": 0.5483,
"reward": 1.2083333730697632,
"reward_std": 0.43673594370484353,
"rewards/accuracy_reward": 0.38333334140479564,
"rewards/format_reward": 0.8250000178813934,
"step": 1480
},
{
"completion_length": 235.04167556762695,
"epoch": 0.792,
"grad_norm": 10.714239120483398,
"kl": 1.662109375,
"learning_rate": 1.2617239912955757e-07,
"loss": 0.8735,
"reward": 1.0708333671092987,
"reward_std": 0.4854356274008751,
"rewards/accuracy_reward": 0.2541666731238365,
"rewards/format_reward": 0.8166666865348816,
"step": 1485
},
{
"completion_length": 176.85834045410155,
"epoch": 0.7946666666666666,
"grad_norm": 6.300493240356445,
"kl": 1.4615234375,
"learning_rate": 1.230969309027739e-07,
"loss": 0.5979,
"reward": 1.262500023841858,
"reward_std": 0.41847621351480485,
"rewards/accuracy_reward": 0.3958333473652601,
"rewards/format_reward": 0.8666666805744171,
"step": 1490
},
{
"completion_length": 230.1958381652832,
"epoch": 0.7973333333333333,
"grad_norm": 10.270231246948242,
"kl": 2.72734375,
"learning_rate": 1.2005413922055248e-07,
"loss": 0.9828,
"reward": 0.9833333671092988,
"reward_std": 0.6273943156003952,
"rewards/accuracy_reward": 0.21666667349636554,
"rewards/format_reward": 0.7666666865348816,
"step": 1495
},
{
"completion_length": 277.3833374023437,
"epoch": 0.8,
"grad_norm": 20.984962463378906,
"kl": 2.16640625,
"learning_rate": 1.1704428788525029e-07,
"loss": 0.8738,
"reward": 1.066666692495346,
"reward_std": 0.6233089223504067,
"rewards/accuracy_reward": 0.3250000089406967,
"rewards/format_reward": 0.7416666865348815,
"step": 1500
},
{
"completion_length": 201.60000762939453,
"epoch": 0.8026666666666666,
"grad_norm": 5.6333489418029785,
"kl": 1.384375,
"learning_rate": 1.1406763784337948e-07,
"loss": 0.6346,
"reward": 1.1416666865348817,
"reward_std": 0.4620711088180542,
"rewards/accuracy_reward": 0.30833334289491177,
"rewards/format_reward": 0.8333333551883697,
"step": 1505
},
{
"completion_length": 155.48750381469728,
"epoch": 0.8053333333333333,
"grad_norm": 2.1134753227233887,
"kl": 0.8833984375,
"learning_rate": 1.111244471629838e-07,
"loss": 0.4204,
"reward": 1.2833333611488342,
"reward_std": 0.3339101344347,
"rewards/accuracy_reward": 0.3666666835546494,
"rewards/format_reward": 0.9166666805744171,
"step": 1510
},
{
"completion_length": 208.70000686645508,
"epoch": 0.808,
"grad_norm": 15.448051452636719,
"kl": 1.694140625,
"learning_rate": 1.0821497101126487e-07,
"loss": 0.725,
"reward": 1.079166704416275,
"reward_std": 0.46868581771850587,
"rewards/accuracy_reward": 0.25416667126119136,
"rewards/format_reward": 0.8250000178813934,
"step": 1515
},
{
"completion_length": 174.8625045776367,
"epoch": 0.8106666666666666,
"grad_norm": 10.781864166259766,
"kl": 1.6875,
"learning_rate": 1.0533946163245983e-07,
"loss": 0.651,
"reward": 1.1500000417232514,
"reward_std": 0.4552404969930649,
"rewards/accuracy_reward": 0.3166666753590107,
"rewards/format_reward": 0.8333333551883697,
"step": 1520
},
{
"completion_length": 177.89167327880858,
"epoch": 0.8133333333333334,
"grad_norm": 6.928041458129883,
"kl": 1.4318359375,
"learning_rate": 1.024981683259723e-07,
"loss": 0.6095,
"reward": 1.2250000357627868,
"reward_std": 0.39699684381484984,
"rewards/accuracy_reward": 0.35000001192092894,
"rewards/format_reward": 0.8750000238418579,
"step": 1525
},
{
"completion_length": 183.4708396911621,
"epoch": 0.816,
"grad_norm": 7.962174892425537,
"kl": 1.2412109375,
"learning_rate": 9.969133742475883e-08,
"loss": 0.6025,
"reward": 1.2291666924953462,
"reward_std": 0.4208852708339691,
"rewards/accuracy_reward": 0.3625000089406967,
"rewards/format_reward": 0.8666666924953461,
"step": 1530
},
{
"completion_length": 256.63334503173826,
"epoch": 0.8186666666666667,
"grad_norm": 33.930641174316406,
"kl": 1.7005859375,
"learning_rate": 9.691921227397226e-08,
"loss": 0.752,
"reward": 0.9458333611488342,
"reward_std": 0.4962170884013176,
"rewards/accuracy_reward": 0.16250000558793545,
"rewards/format_reward": 0.7833333611488342,
"step": 1535
},
{
"completion_length": 178.5583381652832,
"epoch": 0.8213333333333334,
"grad_norm": 6.385335445404053,
"kl": 1.1046875,
"learning_rate": 9.4182033209865e-08,
"loss": 0.5621,
"reward": 1.1833333730697633,
"reward_std": 0.3783799774944782,
"rewards/accuracy_reward": 0.3083333406597376,
"rewards/format_reward": 0.8750000178813935,
"step": 1540
},
{
"completion_length": 183.16250610351562,
"epoch": 0.824,
"grad_norm": 7.927608966827393,
"kl": 1.2470703125,
"learning_rate": 9.148003753895144e-08,
"loss": 0.5574,
"reward": 1.1875000476837159,
"reward_std": 0.40479181706905365,
"rewards/accuracy_reward": 0.33750000409781933,
"rewards/format_reward": 0.8500000178813935,
"step": 1545
},
{
"completion_length": 193.51667404174805,
"epoch": 0.8266666666666667,
"grad_norm": 8.427675247192383,
"kl": 1.1888671875,
"learning_rate": 8.881345951743485e-08,
"loss": 0.7181,
"reward": 1.200000035762787,
"reward_std": 0.43247459903359414,
"rewards/accuracy_reward": 0.3500000085681677,
"rewards/format_reward": 0.8500000178813935,
"step": 1550
},
{
"completion_length": 190.77500686645507,
"epoch": 0.8293333333333334,
"grad_norm": 7.281441688537598,
"kl": 1.5609375,
"learning_rate": 8.618253033089767e-08,
"loss": 0.6316,
"reward": 1.1625000476837157,
"reward_std": 0.3389572203159332,
"rewards/accuracy_reward": 0.3041666716337204,
"rewards/format_reward": 0.8583333551883697,
"step": 1555
},
{
"completion_length": 196.6958366394043,
"epoch": 0.832,
"grad_norm": 3.593385696411133,
"kl": 1.7515625,
"learning_rate": 8.358747807425826e-08,
"loss": 0.6747,
"reward": 1.2041666984558106,
"reward_std": 0.430715125054121,
"rewards/accuracy_reward": 0.3625000145286322,
"rewards/format_reward": 0.8416666865348816,
"step": 1560
},
{
"completion_length": 153.6750045776367,
"epoch": 0.8346666666666667,
"grad_norm": 3.8720762729644775,
"kl": 1.4552734375,
"learning_rate": 8.102852773199586e-08,
"loss": 0.4917,
"reward": 1.2208333671092988,
"reward_std": 0.34195019751787187,
"rewards/accuracy_reward": 0.3125000111758709,
"rewards/format_reward": 0.9083333492279053,
"step": 1565
},
{
"completion_length": 158.85833816528321,
"epoch": 0.8373333333333334,
"grad_norm": 4.770650386810303,
"kl": 1.3427734375,
"learning_rate": 7.850590115864481e-08,
"loss": 0.4683,
"reward": 1.225000023841858,
"reward_std": 0.33489523082971573,
"rewards/accuracy_reward": 0.33333334028720857,
"rewards/format_reward": 0.8916666865348816,
"step": 1570
},
{
"completion_length": 193.32500534057618,
"epoch": 0.84,
"grad_norm": 5.848920822143555,
"kl": 1.3138671875,
"learning_rate": 7.601981705956039e-08,
"loss": 0.4761,
"reward": 1.2541667103767395,
"reward_std": 0.4497212260961533,
"rewards/accuracy_reward": 0.4291666798293591,
"rewards/format_reward": 0.8250000178813934,
"step": 1575
},
{
"completion_length": 217.37500610351563,
"epoch": 0.8426666666666667,
"grad_norm": 9.334526062011719,
"kl": 1.68125,
"learning_rate": 7.357049097195773e-08,
"loss": 0.6737,
"reward": 1.0375000298023225,
"reward_std": 0.4624008506536484,
"rewards/accuracy_reward": 0.22083333916962147,
"rewards/format_reward": 0.8166666805744172,
"step": 1580
},
{
"completion_length": 238.8541702270508,
"epoch": 0.8453333333333334,
"grad_norm": 7.903083801269531,
"kl": 2.119140625,
"learning_rate": 7.115813524622488e-08,
"loss": 0.7548,
"reward": 1.0375000417232514,
"reward_std": 0.6012955874204635,
"rewards/accuracy_reward": 0.28750000707805157,
"rewards/format_reward": 0.7500000178813935,
"step": 1585
},
{
"completion_length": 213.15834274291993,
"epoch": 0.848,
"grad_norm": 4.728046417236328,
"kl": 1.553515625,
"learning_rate": 6.878295902751319e-08,
"loss": 0.6544,
"reward": 1.1208333492279052,
"reward_std": 0.42975625991821287,
"rewards/accuracy_reward": 0.3041666716337204,
"rewards/format_reward": 0.8166666865348816,
"step": 1590
},
{
"completion_length": 180.77500534057617,
"epoch": 0.8506666666666667,
"grad_norm": 77.55509948730469,
"kl": 1.58125,
"learning_rate": 6.644516823760437e-08,
"loss": 0.5961,
"reward": 1.091666692495346,
"reward_std": 0.32875102311372756,
"rewards/accuracy_reward": 0.2166666690260172,
"rewards/format_reward": 0.8750000238418579,
"step": 1595
},
{
"completion_length": 214.39167404174805,
"epoch": 0.8533333333333334,
"grad_norm": 25.89487075805664,
"kl": 1.84609375,
"learning_rate": 6.414496555705801e-08,
"loss": 0.6698,
"reward": 1.1166666805744172,
"reward_std": 0.5117101609706879,
"rewards/accuracy_reward": 0.34166666977107524,
"rewards/format_reward": 0.7750000178813934,
"step": 1600
},
{
"epoch": 0.8533333333333334,
"eval_completion_length": 246.72556213378905,
"eval_kl": 2.08859375,
"eval_loss": 0.8056277632713318,
"eval_reward": 0.9472222503026326,
"eval_reward_std": 0.5439748798807462,
"eval_rewards/accuracy_reward": 0.17055555924773216,
"eval_rewards/format_reward": 0.7766666837533315,
"eval_runtime": 810.1675,
"eval_samples_per_second": 0.37,
"eval_steps_per_second": 0.016,
"step": 1600
},
{
"completion_length": 281.73750762939454,
"epoch": 0.856,
"grad_norm": 19.62468719482422,
"kl": 1.675390625,
"learning_rate": 6.188255040763929e-08,
"loss": 0.9653,
"reward": 1.041666704416275,
"reward_std": 0.62858667075634,
"rewards/accuracy_reward": 0.3000000089406967,
"rewards/format_reward": 0.7416666865348815,
"step": 1605
},
{
"completion_length": 188.87916946411133,
"epoch": 0.8586666666666667,
"grad_norm": 3.9411680698394775,
"kl": 1.18125,
"learning_rate": 5.965811893503015e-08,
"loss": 0.6174,
"reward": 1.2083333730697632,
"reward_std": 0.42002698704600333,
"rewards/accuracy_reward": 0.35000000819563865,
"rewards/format_reward": 0.8583333551883697,
"step": 1610
},
{
"completion_length": 199.65000534057617,
"epoch": 0.8613333333333333,
"grad_norm": 11.383617401123047,
"kl": 2.05234375,
"learning_rate": 5.7471863991823356e-08,
"loss": 0.6184,
"reward": 1.1125000357627868,
"reward_std": 0.4449311882257462,
"rewards/accuracy_reward": 0.2875000063329935,
"rewards/format_reward": 0.8250000298023223,
"step": 1615
},
{
"completion_length": 176.72917404174805,
"epoch": 0.864,
"grad_norm": 1.5305043458938599,
"kl": 1.0876953125,
"learning_rate": 5.532397512080306e-08,
"loss": 0.6773,
"reward": 1.2458333611488341,
"reward_std": 0.42713540196418764,
"rewards/accuracy_reward": 0.3625000111758709,
"rewards/format_reward": 0.8833333492279053,
"step": 1620
},
{
"completion_length": 169.13750457763672,
"epoch": 0.8666666666666667,
"grad_norm": 5.485569000244141,
"kl": 1.5263671875,
"learning_rate": 5.321463853851188e-08,
"loss": 0.5792,
"reward": 1.1875000417232513,
"reward_std": 0.39003320038318634,
"rewards/accuracy_reward": 0.3125000063329935,
"rewards/format_reward": 0.8750000238418579,
"step": 1625
},
{
"completion_length": 166.21250381469727,
"epoch": 0.8693333333333333,
"grad_norm": 6.497225761413574,
"kl": 1.157421875,
"learning_rate": 5.114403711910631e-08,
"loss": 0.5003,
"reward": 1.1833333849906922,
"reward_std": 0.33641326874494554,
"rewards/accuracy_reward": 0.2833333358168602,
"rewards/format_reward": 0.9000000178813934,
"step": 1630
},
{
"completion_length": 203.3166717529297,
"epoch": 0.872,
"grad_norm": 10.349686622619629,
"kl": 1.8275390625,
"learning_rate": 4.911235037850186e-08,
"loss": 0.7575,
"reward": 1.2083333790302277,
"reward_std": 0.5746691286563873,
"rewards/accuracy_reward": 0.4000000149011612,
"rewards/format_reward": 0.8083333551883698,
"step": 1635
},
{
"completion_length": 159.37083740234374,
"epoch": 0.8746666666666667,
"grad_norm": 3.7331645488739014,
"kl": 1.871875,
"learning_rate": 4.7119754458809725e-08,
"loss": 0.5334,
"reward": 1.1375000417232513,
"reward_std": 0.37364248782396314,
"rewards/accuracy_reward": 0.28750000707805157,
"rewards/format_reward": 0.8500000238418579,
"step": 1640
},
{
"completion_length": 177.79583740234375,
"epoch": 0.8773333333333333,
"grad_norm": 10.393296241760254,
"kl": 1.009765625,
"learning_rate": 4.516642211306587e-08,
"loss": 0.61,
"reward": 1.112500047683716,
"reward_std": 0.3519383378326893,
"rewards/accuracy_reward": 0.23750000521540643,
"rewards/format_reward": 0.8750000178813935,
"step": 1645
},
{
"completion_length": 188.83750534057617,
"epoch": 0.88,
"grad_norm": 9.854552268981934,
"kl": 1.759375,
"learning_rate": 4.325252269025315e-08,
"loss": 0.6185,
"reward": 1.166666704416275,
"reward_std": 0.41237895712256434,
"rewards/accuracy_reward": 0.32500001452863214,
"rewards/format_reward": 0.8416666865348816,
"step": 1650
},
{
"completion_length": 232.76250457763672,
"epoch": 0.8826666666666667,
"grad_norm": 5.535569667816162,
"kl": 1.8421875,
"learning_rate": 4.137822212061964e-08,
"loss": 0.7118,
"reward": 1.0375000178813933,
"reward_std": 0.4611209347844124,
"rewards/accuracy_reward": 0.2625000048428774,
"rewards/format_reward": 0.7750000178813934,
"step": 1655
},
{
"completion_length": 231.8791717529297,
"epoch": 0.8853333333333333,
"grad_norm": 5.740895748138428,
"kl": 1.616796875,
"learning_rate": 3.954368290129301e-08,
"loss": 0.6385,
"reward": 1.2250000357627868,
"reward_std": 0.5405340433120728,
"rewards/accuracy_reward": 0.4000000074505806,
"rewards/format_reward": 0.8250000238418579,
"step": 1660
},
{
"completion_length": 131.25833740234376,
"epoch": 0.888,
"grad_norm": 5.200298309326172,
"kl": 1.3203125,
"learning_rate": 3.774906408219197e-08,
"loss": 0.277,
"reward": 1.3500000178813933,
"reward_std": 0.33660581335425377,
"rewards/accuracy_reward": 0.4250000149011612,
"rewards/format_reward": 0.925000011920929,
"step": 1665
},
{
"completion_length": 202.34584426879883,
"epoch": 0.8906666666666667,
"grad_norm": 7.758249759674072,
"kl": 1.4638671875,
"learning_rate": 3.5994521252237506e-08,
"loss": 0.5761,
"reward": 1.1083333790302277,
"reward_std": 0.38408626839518545,
"rewards/accuracy_reward": 0.26666667126119137,
"rewards/format_reward": 0.8416666746139526,
"step": 1670
},
{
"completion_length": 200.52500534057617,
"epoch": 0.8933333333333333,
"grad_norm": 12.366177558898926,
"kl": 1.696875,
"learning_rate": 3.42802065258635e-08,
"loss": 0.6531,
"reward": 1.0083333611488343,
"reward_std": 0.46105473637580874,
"rewards/accuracy_reward": 0.20000000670552254,
"rewards/format_reward": 0.8083333611488343,
"step": 1675
},
{
"completion_length": 201.46250381469727,
"epoch": 0.896,
"grad_norm": 6.266597747802734,
"kl": 1.599609375,
"learning_rate": 3.260626852982873e-08,
"loss": 0.6282,
"reward": 1.0958333611488342,
"reward_std": 0.43908271491527556,
"rewards/accuracy_reward": 0.2541666753590107,
"rewards/format_reward": 0.8416666865348816,
"step": 1680
},
{
"completion_length": 160.81250228881837,
"epoch": 0.8986666666666666,
"grad_norm": 56.750545501708984,
"kl": 1.580078125,
"learning_rate": 3.097285239033137e-08,
"loss": 0.5331,
"reward": 1.1541666984558105,
"reward_std": 0.3470060914754868,
"rewards/accuracy_reward": 0.26250000596046447,
"rewards/format_reward": 0.8916666746139527,
"step": 1685
},
{
"completion_length": 213.47917556762695,
"epoch": 0.9013333333333333,
"grad_norm": 11.425201416015625,
"kl": 2.06640625,
"learning_rate": 2.93800997204271e-08,
"loss": 0.7264,
"reward": 1.200000035762787,
"reward_std": 0.5640496462583542,
"rewards/accuracy_reward": 0.40833334140479566,
"rewards/format_reward": 0.791666692495346,
"step": 1690
},
{
"completion_length": 194.2541717529297,
"epoch": 0.904,
"grad_norm": 20.20305061340332,
"kl": 1.2748046875,
"learning_rate": 2.7828148607751235e-08,
"loss": 0.6901,
"reward": 1.2416667103767396,
"reward_std": 0.4328203298151493,
"rewards/accuracy_reward": 0.39166667461395266,
"rewards/format_reward": 0.8500000238418579,
"step": 1695
},
{
"completion_length": 231.80000991821288,
"epoch": 0.9066666666666666,
"grad_norm": 20.141338348388672,
"kl": 1.8451171875,
"learning_rate": 2.6317133602547335e-08,
"loss": 0.7004,
"reward": 1.0708333730697632,
"reward_std": 0.5015199676156044,
"rewards/accuracy_reward": 0.2708333432674408,
"rewards/format_reward": 0.8000000238418579,
"step": 1700
},
{
"completion_length": 226.9791732788086,
"epoch": 0.9093333333333333,
"grad_norm": 8.150571823120117,
"kl": 1.451953125,
"learning_rate": 2.4847185706001637e-08,
"loss": 0.7311,
"reward": 1.1750000357627868,
"reward_std": 0.4346106082201004,
"rewards/accuracy_reward": 0.33333334550261495,
"rewards/format_reward": 0.8416666924953461,
"step": 1705
},
{
"completion_length": 161.9708366394043,
"epoch": 0.912,
"grad_norm": 7.521031856536865,
"kl": 1.84453125,
"learning_rate": 2.341843235888563e-08,
"loss": 0.6629,
"reward": 1.3250000476837158,
"reward_std": 0.4102425158023834,
"rewards/accuracy_reward": 0.45000000596046447,
"rewards/format_reward": 0.8750000238418579,
"step": 1710
},
{
"completion_length": 181.10417098999022,
"epoch": 0.9146666666666666,
"grad_norm": 18.987491607666016,
"kl": 1.7630859375,
"learning_rate": 2.203099743050746e-08,
"loss": 0.6879,
"reward": 1.170833373069763,
"reward_std": 0.42629132717847823,
"rewards/accuracy_reward": 0.32083334028720856,
"rewards/format_reward": 0.850000011920929,
"step": 1715
},
{
"completion_length": 263.47084045410156,
"epoch": 0.9173333333333333,
"grad_norm": 13.362593650817871,
"kl": 2.15078125,
"learning_rate": 2.068500120797284e-08,
"loss": 0.9974,
"reward": 1.125000035762787,
"reward_std": 0.6081652283668518,
"rewards/accuracy_reward": 0.3583333432674408,
"rewards/format_reward": 0.766666692495346,
"step": 1720
},
{
"completion_length": 187.62083892822267,
"epoch": 0.92,
"grad_norm": 6.908720970153809,
"kl": 1.466015625,
"learning_rate": 1.9380560385756084e-08,
"loss": 0.7325,
"reward": 1.2166666984558105,
"reward_std": 0.39846049398183825,
"rewards/accuracy_reward": 0.3500000100582838,
"rewards/format_reward": 0.8666666984558106,
"step": 1725
},
{
"completion_length": 208.1041687011719,
"epoch": 0.9226666666666666,
"grad_norm": 7.331279754638672,
"kl": 1.965625,
"learning_rate": 1.8117788055583284e-08,
"loss": 0.8239,
"reward": 1.1291667103767395,
"reward_std": 0.5210324048995971,
"rewards/accuracy_reward": 0.31250001303851604,
"rewards/format_reward": 0.8166666805744172,
"step": 1730
},
{
"completion_length": 219.9958381652832,
"epoch": 0.9253333333333333,
"grad_norm": 10.382403373718262,
"kl": 1.346484375,
"learning_rate": 1.68967936966275e-08,
"loss": 0.6091,
"reward": 1.1708333671092988,
"reward_std": 0.4482548341155052,
"rewards/accuracy_reward": 0.3375000096857548,
"rewards/format_reward": 0.8333333492279053,
"step": 1735
},
{
"completion_length": 180.16250228881836,
"epoch": 0.928,
"grad_norm": 8.021461486816406,
"kl": 2.263671875,
"learning_rate": 1.571768316601718e-08,
"loss": 0.583,
"reward": 1.0458333790302277,
"reward_std": 0.4388016849756241,
"rewards/accuracy_reward": 0.23750000335276128,
"rewards/format_reward": 0.8083333551883698,
"step": 1740
},
{
"completion_length": 244.97917251586915,
"epoch": 0.9306666666666666,
"grad_norm": 27.44346809387207,
"kl": 1.7083984375,
"learning_rate": 1.4580558689658406e-08,
"loss": 0.9154,
"reward": 1.1708333671092988,
"reward_std": 0.6085290633141994,
"rewards/accuracy_reward": 0.379166679084301,
"rewards/format_reward": 0.7916666865348816,
"step": 1745
},
{
"completion_length": 176.87917251586913,
"epoch": 0.9333333333333333,
"grad_norm": 21.765560150146484,
"kl": 1.50078125,
"learning_rate": 1.3485518853372624e-08,
"loss": 0.5667,
"reward": 1.2708333671092986,
"reward_std": 0.44170806705951693,
"rewards/accuracy_reward": 0.3958333425223827,
"rewards/format_reward": 0.8750000238418579,
"step": 1750
},
{
"completion_length": 245.63750457763672,
"epoch": 0.936,
"grad_norm": 5.950416564941406,
"kl": 2.523828125,
"learning_rate": 1.243265859434911e-08,
"loss": 0.8785,
"reward": 1.1625000417232514,
"reward_std": 0.6160432323813438,
"rewards/accuracy_reward": 0.40416667312383653,
"rewards/format_reward": 0.7583333492279053,
"step": 1755
},
{
"completion_length": 174.72083892822266,
"epoch": 0.9386666666666666,
"grad_norm": 19.66292381286621,
"kl": 1.318359375,
"learning_rate": 1.1422069192914219e-08,
"loss": 0.516,
"reward": 1.200000035762787,
"reward_std": 0.37103949785232543,
"rewards/accuracy_reward": 0.33333334028720857,
"rewards/format_reward": 0.8666666805744171,
"step": 1760
},
{
"completion_length": 224.34167709350587,
"epoch": 0.9413333333333334,
"grad_norm": 6.988897323608398,
"kl": 1.60703125,
"learning_rate": 1.0453838264617709e-08,
"loss": 0.8039,
"reward": 0.9708333611488342,
"reward_std": 0.5082953691482544,
"rewards/accuracy_reward": 0.1791666727513075,
"rewards/format_reward": 0.7916666865348816,
"step": 1765
},
{
"completion_length": 238.2375061035156,
"epoch": 0.944,
"grad_norm": 12.432623863220215,
"kl": 1.816796875,
"learning_rate": 9.528049752636714e-09,
"loss": 0.6603,
"reward": 1.0416666984558105,
"reward_std": 0.4743375271558762,
"rewards/accuracy_reward": 0.26666667573153974,
"rewards/format_reward": 0.775000023841858,
"step": 1770
},
{
"completion_length": 169.10417251586915,
"epoch": 0.9466666666666667,
"grad_norm": 7.009900093078613,
"kl": 1.668359375,
"learning_rate": 8.644783920498e-09,
"loss": 0.485,
"reward": 1.200000035762787,
"reward_std": 0.37502728700637816,
"rewards/accuracy_reward": 0.3416666783392429,
"rewards/format_reward": 0.8583333551883697,
"step": 1775
},
{
"completion_length": 233.9000045776367,
"epoch": 0.9493333333333334,
"grad_norm": 5.138508319854736,
"kl": 1.686328125,
"learning_rate": 7.804117345119266e-09,
"loss": 0.7679,
"reward": 1.1333333730697632,
"reward_std": 0.5335646510124207,
"rewards/accuracy_reward": 0.34166667722165583,
"rewards/format_reward": 0.7916666984558105,
"step": 1780
},
{
"completion_length": 183.32084121704102,
"epoch": 0.952,
"grad_norm": 3580.284912109375,
"kl": 2.233203125,
"learning_rate": 7.00612291017022e-09,
"loss": 0.6764,
"reward": 1.1083333492279053,
"reward_std": 0.3922556236386299,
"rewards/accuracy_reward": 0.24166667684912682,
"rewards/format_reward": 0.8666666924953461,
"step": 1785
},
{
"completion_length": 191.9708381652832,
"epoch": 0.9546666666666667,
"grad_norm": 7.294617176055908,
"kl": 1.476171875,
"learning_rate": 6.2508697997538665e-09,
"loss": 0.5961,
"reward": 1.0791666865348817,
"reward_std": 0.4319505989551544,
"rewards/accuracy_reward": 0.2291666753590107,
"rewards/format_reward": 0.8500000238418579,
"step": 1790
},
{
"completion_length": 211.21250610351564,
"epoch": 0.9573333333333334,
"grad_norm": 4.981256484985352,
"kl": 1.1861328125,
"learning_rate": 5.538423492408129e-09,
"loss": 0.604,
"reward": 1.262500035762787,
"reward_std": 0.45401586443185804,
"rewards/accuracy_reward": 0.39583334848284724,
"rewards/format_reward": 0.8666666805744171,
"step": 1795
},
{
"completion_length": 283.41251068115236,
"epoch": 0.96,
"grad_norm": 2.2728030681610107,
"kl": 1.4833984375,
"learning_rate": 4.8688457554291736e-09,
"loss": 0.7529,
"reward": 1.1791666984558105,
"reward_std": 0.535879123210907,
"rewards/accuracy_reward": 0.4041666742414236,
"rewards/format_reward": 0.7750000178813934,
"step": 1800
},
{
"epoch": 0.96,
"eval_completion_length": 212.29556121826172,
"eval_kl": 1.955859375,
"eval_loss": 0.6787320971488953,
"eval_reward": 0.9722222558657329,
"eval_reward_std": 0.4794289442896843,
"eval_rewards/accuracy_reward": 0.1600000035762787,
"eval_rewards/format_reward": 0.8122222447395324,
"eval_runtime": 704.5393,
"eval_samples_per_second": 0.426,
"eval_steps_per_second": 0.018,
"step": 1800
},
{
"completion_length": 171.38750381469725,
"epoch": 0.9626666666666667,
"grad_norm": 7.666684150695801,
"kl": 1.298046875,
"learning_rate": 4.242194639516416e-09,
"loss": 0.5453,
"reward": 1.1750000238418579,
"reward_std": 0.3607516996562481,
"rewards/accuracy_reward": 0.30000000856816766,
"rewards/format_reward": 0.8750000178813935,
"step": 1805
},
{
"completion_length": 182.1208396911621,
"epoch": 0.9653333333333334,
"grad_norm": 2.6979856491088867,
"kl": 1.5603515625,
"learning_rate": 3.658524473739544e-09,
"loss": 0.7398,
"reward": 1.1625000536441803,
"reward_std": 0.40499134212732313,
"rewards/accuracy_reward": 0.30416667461395264,
"rewards/format_reward": 0.8583333551883697,
"step": 1810
},
{
"completion_length": 216.9875061035156,
"epoch": 0.968,
"grad_norm": 11.175540924072266,
"kl": 1.25625,
"learning_rate": 3.1178858608283954e-09,
"loss": 0.7517,
"reward": 1.0833333790302277,
"reward_std": 0.4425746828317642,
"rewards/accuracy_reward": 0.24166667386889457,
"rewards/format_reward": 0.8416666865348816,
"step": 1815
},
{
"completion_length": 187.23750762939454,
"epoch": 0.9706666666666667,
"grad_norm": 7.220974922180176,
"kl": 1.28515625,
"learning_rate": 2.6203256727859167e-09,
"loss": 0.7132,
"reward": 1.1583333730697631,
"reward_std": 0.43746666610240936,
"rewards/accuracy_reward": 0.3083333432674408,
"rewards/format_reward": 0.8500000178813935,
"step": 1820
},
{
"completion_length": 164.65833892822266,
"epoch": 0.9733333333333334,
"grad_norm": 10.702654838562012,
"kl": 1.7052734375,
"learning_rate": 2.165887046824133e-09,
"loss": 0.5298,
"reward": 1.1958333730697632,
"reward_std": 0.32650465294718745,
"rewards/accuracy_reward": 0.32916667461395266,
"rewards/format_reward": 0.8666666865348815,
"step": 1825
},
{
"completion_length": 198.562508392334,
"epoch": 0.976,
"grad_norm": 6.067382335662842,
"kl": 1.7404296875,
"learning_rate": 1.7546093816246387e-09,
"loss": 0.6923,
"reward": 1.004166704416275,
"reward_std": 0.3946992427110672,
"rewards/accuracy_reward": 0.17083333544433116,
"rewards/format_reward": 0.8333333551883697,
"step": 1830
},
{
"completion_length": 206.00834197998046,
"epoch": 0.9786666666666667,
"grad_norm": 8.223440170288086,
"kl": 1.7333984375,
"learning_rate": 1.3865283339228316e-09,
"loss": 0.6633,
"reward": 1.091666692495346,
"reward_std": 0.45499495714902877,
"rewards/accuracy_reward": 0.26666667237877845,
"rewards/format_reward": 0.8250000178813934,
"step": 1835
},
{
"completion_length": 162.4708381652832,
"epoch": 0.9813333333333333,
"grad_norm": 6.2932610511779785,
"kl": 1.397265625,
"learning_rate": 1.0616758154161631e-09,
"loss": 0.5335,
"reward": 1.3208333730697632,
"reward_std": 0.27748758494853976,
"rewards/accuracy_reward": 0.4041666716337204,
"rewards/format_reward": 0.9166666865348816,
"step": 1840
},
{
"completion_length": 165.61667251586914,
"epoch": 0.984,
"grad_norm": 9.122838020324707,
"kl": 1.4970703125,
"learning_rate": 7.80079989997906e-10,
"loss": 0.6188,
"reward": 1.1958333611488343,
"reward_std": 0.4245347425341606,
"rewards/accuracy_reward": 0.33750000409781933,
"rewards/format_reward": 0.8583333551883697,
"step": 1845
},
{
"completion_length": 215.14167251586915,
"epoch": 0.9866666666666667,
"grad_norm": 23.73024559020996,
"kl": 1.775,
"learning_rate": 5.417652713152199e-10,
"loss": 0.5543,
"reward": 1.1333333730697632,
"reward_std": 0.4075448580086231,
"rewards/accuracy_reward": 0.31666667610406873,
"rewards/format_reward": 0.8166666835546493,
"step": 1850
},
{
"completion_length": 185.79167404174805,
"epoch": 0.9893333333333333,
"grad_norm": 20.497873306274414,
"kl": 1.5189453125,
"learning_rate": 3.4675232065256574e-10,
"loss": 0.4238,
"reward": 1.0708333492279052,
"reward_std": 0.32569129317998885,
"rewards/accuracy_reward": 0.24583334028720855,
"rewards/format_reward": 0.8250000059604645,
"step": 1855
},
{
"completion_length": 225.09583892822266,
"epoch": 0.992,
"grad_norm": 12.528355598449707,
"kl": 1.2953125,
"learning_rate": 1.9505804514047264e-10,
"loss": 0.4901,
"reward": 1.1541666984558105,
"reward_std": 0.4231454662978649,
"rewards/accuracy_reward": 0.32083334028720856,
"rewards/format_reward": 0.8333333373069763,
"step": 1860
},
{
"completion_length": 221.36667175292968,
"epoch": 0.9946666666666667,
"grad_norm": 13.679535865783691,
"kl": 1.390625,
"learning_rate": 8.669559628954326e-11,
"loss": 0.8532,
"reward": 1.1583333730697631,
"reward_std": 0.5791940867900849,
"rewards/accuracy_reward": 0.35000000409781934,
"rewards/format_reward": 0.8083333611488343,
"step": 1865
},
{
"completion_length": 158.7708381652832,
"epoch": 0.9973333333333333,
"grad_norm": 37.47658157348633,
"kl": 1.707421875,
"learning_rate": 2.1674368850643777e-11,
"loss": 0.5296,
"reward": 1.1458333611488343,
"reward_std": 0.29994617849588395,
"rewards/accuracy_reward": 0.2541666738688946,
"rewards/format_reward": 0.8916666805744171,
"step": 1870
},
{
"completion_length": 223.59584197998046,
"epoch": 1.0,
"grad_norm": 7.715764999389648,
"kl": 1.104296875,
"learning_rate": 0.0,
"loss": 0.7452,
"reward": 1.1708333790302277,
"reward_std": 0.4897158071398735,
"rewards/accuracy_reward": 0.3375000100582838,
"rewards/format_reward": 0.8333333492279053,
"step": 1875
},
{
"epoch": 1.0,
"step": 1875,
"total_flos": 0.0,
"train_loss": 0.738647110915184,
"train_runtime": 47500.6212,
"train_samples_per_second": 0.316,
"train_steps_per_second": 0.039
}
],
"logging_steps": 5,
"max_steps": 1875,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": false,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}