{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9994666666666666, "eval_steps": 500, "global_step": 937, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio": 0.0, "completion_length": 1774.2917404174805, "epoch": 0.0010666666666666667, "grad_norm": 0.18701235949993134, "kl": 0.0, "learning_rate": 9.999974706864252e-07, "loss": 0.0766, "reward": 0.4270833395421505, "reward_std": 0.4668492004275322, "rewards/accuracy_reward": 0.4270833395421505, "rewards/format_reward": 0.0, "step": 1 }, { "clip_ratio": 0.0, "completion_length": 2005.7500610351562, "epoch": 0.0021333333333333334, "grad_norm": 0.19043712317943573, "kl": 3.331899642944336e-05, "learning_rate": 9.999898827741336e-07, "loss": 0.0651, "reward": 0.4687500027939677, "reward_std": 0.38904277235269547, "rewards/accuracy_reward": 0.4687500027939677, "rewards/format_reward": 0.0, "step": 2 }, { "clip_ratio": 0.0, "completion_length": 1372.8958740234375, "epoch": 0.0032, "grad_norm": 0.18866874277591705, "kl": 2.220645546913147e-05, "learning_rate": 9.999772363484244e-07, "loss": 0.0599, "reward": 0.447916679084301, "reward_std": 0.500461045652628, "rewards/accuracy_reward": 0.447916679084301, "rewards/format_reward": 0.0, "step": 3 }, { "clip_ratio": 0.0, "completion_length": 1659.9271621704102, "epoch": 0.004266666666666667, "grad_norm": 0.16920481622219086, "kl": 1.957640051841736e-05, "learning_rate": 9.999595315514606e-07, "loss": 0.039, "reward": 0.5104166865348816, "reward_std": 0.4596557542681694, "rewards/accuracy_reward": 0.5104166865348816, "rewards/format_reward": 0.0, "step": 4 }, { "clip_ratio": 0.0, "completion_length": 1601.2917022705078, "epoch": 0.005333333333333333, "grad_norm": 0.16836965084075928, "kl": 8.948147296905518e-06, "learning_rate": 9.999367685822688e-07, "loss": 0.0486, "reward": 0.6250000149011612, "reward_std": 0.3569832220673561, "rewards/accuracy_reward": 0.6250000149011612, "rewards/format_reward": 0.0, "step": 5 }, { "clip_ratio": 0.0, "completion_length": 1920.5938186645508, "epoch": 0.0064, "grad_norm": 0.16410422325134277, "kl": 2.4374574422836304e-05, "learning_rate": 9.999089476967368e-07, "loss": 0.0273, "reward": 0.4062500139698386, "reward_std": 0.31201592087745667, "rewards/accuracy_reward": 0.4062500139698386, "rewards/format_reward": 0.0, "step": 6 }, { "clip_ratio": 0.0, "completion_length": 1992.3437728881836, "epoch": 0.007466666666666667, "grad_norm": 0.16315753757953644, "kl": 2.1262094378471375e-05, "learning_rate": 9.9987606920761e-07, "loss": 0.0126, "reward": 0.48958334047347307, "reward_std": 0.30044984444975853, "rewards/accuracy_reward": 0.48958334047347307, "rewards/format_reward": 0.0, "step": 7 }, { "clip_ratio": 0.0, "completion_length": 1878.0729446411133, "epoch": 0.008533333333333334, "grad_norm": 0.18799559772014618, "kl": 1.8259510397911072e-05, "learning_rate": 9.998381334844881e-07, "loss": 0.0576, "reward": 0.5520833386108279, "reward_std": 0.28380872681736946, "rewards/accuracy_reward": 0.5520833386108279, "rewards/format_reward": 0.0, "step": 8 }, { "clip_ratio": 0.0, "completion_length": 1437.770896911621, "epoch": 0.0096, "grad_norm": 0.18718113005161285, "kl": 2.1253712475299835e-05, "learning_rate": 9.99795140953822e-07, "loss": 0.0369, "reward": 0.6770833432674408, "reward_std": 0.427349217236042, "rewards/accuracy_reward": 0.666666679084301, "rewards/format_reward": 0.010416666977107525, "step": 9 }, { "clip_ratio": 0.0, "completion_length": 1738.239646911621, "epoch": 0.010666666666666666, "grad_norm": 0.2009180784225464, "kl": 4.573166370391846e-05, "learning_rate": 9.997470920989077e-07, "loss": 0.064, "reward": 0.43750001210719347, "reward_std": 0.42636168375611305, "rewards/accuracy_reward": 0.43750001210719347, "rewards/format_reward": 0.0, "step": 10 }, { "clip_ratio": 0.0, "completion_length": 1742.427116394043, "epoch": 0.011733333333333333, "grad_norm": 0.21475809812545776, "kl": 8.036941289901733e-05, "learning_rate": 9.996939874598807e-07, "loss": 0.0114, "reward": 0.5520833423361182, "reward_std": 0.4352228157222271, "rewards/accuracy_reward": 0.531250006519258, "rewards/format_reward": 0.02083333395421505, "step": 11 }, { "clip_ratio": 0.0, "completion_length": 1903.1146392822266, "epoch": 0.0128, "grad_norm": 0.17319771647453308, "kl": 8.814036846160889e-05, "learning_rate": 9.996358276337115e-07, "loss": 0.0111, "reward": 0.4583333367481828, "reward_std": 0.36444995552301407, "rewards/accuracy_reward": 0.4583333367481828, "rewards/format_reward": 0.0, "step": 12 }, { "clip_ratio": 0.0, "completion_length": 1597.0937805175781, "epoch": 0.013866666666666666, "grad_norm": 0.20308388769626617, "kl": 0.0001263543963432312, "learning_rate": 9.995726132741977e-07, "loss": 0.0407, "reward": 0.39583334140479565, "reward_std": 0.34121064841747284, "rewards/accuracy_reward": 0.38541667349636555, "rewards/format_reward": 0.010416666977107525, "step": 13 }, { "clip_ratio": 0.0, "completion_length": 1663.947956085205, "epoch": 0.014933333333333333, "grad_norm": 0.22094866633415222, "kl": 0.00014838576316833496, "learning_rate": 9.995043450919568e-07, "loss": 0.094, "reward": 0.30208333767950535, "reward_std": 0.3719097673892975, "rewards/accuracy_reward": 0.28125000558793545, "rewards/format_reward": 0.02083333395421505, "step": 14 }, { "clip_ratio": 0.0, "completion_length": 2394.6250495910645, "epoch": 0.016, "grad_norm": 0.23982514441013336, "kl": 0.00017839670181274414, "learning_rate": 9.994310238544181e-07, "loss": 0.045, "reward": 0.3541666753590107, "reward_std": 0.3615064211189747, "rewards/accuracy_reward": 0.33333333767950535, "rewards/format_reward": 0.02083333395421505, "step": 15 }, { "clip_ratio": 0.0, "completion_length": 1466.7291946411133, "epoch": 0.017066666666666667, "grad_norm": 0.22785434126853943, "kl": 0.0002970695495605469, "learning_rate": 9.993526503858145e-07, "loss": 0.0957, "reward": 0.5000000111758709, "reward_std": 0.49544867128133774, "rewards/accuracy_reward": 0.5000000111758709, "rewards/format_reward": 0.0, "step": 16 }, { "clip_ratio": 0.0, "completion_length": 1815.3229675292969, "epoch": 0.018133333333333335, "grad_norm": 0.14333269000053406, "kl": 0.0002849102020263672, "learning_rate": 9.992692255671732e-07, "loss": 0.0102, "reward": 0.5104166865348816, "reward_std": 0.33437221124768257, "rewards/accuracy_reward": 0.5104166865348816, "rewards/format_reward": 0.0, "step": 17 }, { "clip_ratio": 0.0, "completion_length": 1853.3229370117188, "epoch": 0.0192, "grad_norm": 0.21924446523189545, "kl": 0.00028146058320999146, "learning_rate": 9.991807503363055e-07, "loss": 0.0745, "reward": 0.5833333507180214, "reward_std": 0.37610871344804764, "rewards/accuracy_reward": 0.5833333507180214, "rewards/format_reward": 0.0, "step": 18 }, { "clip_ratio": 0.0, "completion_length": 1893.0417022705078, "epoch": 0.020266666666666665, "grad_norm": 0.14779509603977203, "kl": 0.00043460726737976074, "learning_rate": 9.99087225687796e-07, "loss": 0.0225, "reward": 0.5208333432674408, "reward_std": 0.29638051614165306, "rewards/accuracy_reward": 0.48958334885537624, "rewards/format_reward": 0.031250000931322575, "step": 19 }, { "clip_ratio": 0.0, "completion_length": 1392.4166793823242, "epoch": 0.021333333333333333, "grad_norm": 0.21756593883037567, "kl": 0.0006494522094726562, "learning_rate": 9.989886526729925e-07, "loss": 0.0706, "reward": 0.5208333423361182, "reward_std": 0.3607165887951851, "rewards/accuracy_reward": 0.5208333423361182, "rewards/format_reward": 0.0, "step": 20 }, { "clip_ratio": 0.0, "completion_length": 2272.468780517578, "epoch": 0.0224, "grad_norm": 0.10997360199689865, "kl": 0.000676274299621582, "learning_rate": 9.988850323999922e-07, "loss": 0.0164, "reward": 0.375000006519258, "reward_std": 0.27381766214966774, "rewards/accuracy_reward": 0.3645833386108279, "rewards/format_reward": 0.010416666977107525, "step": 21 }, { "clip_ratio": 0.0, "completion_length": 1460.3333587646484, "epoch": 0.023466666666666667, "grad_norm": 0.2336568385362625, "kl": 0.0015399456024169922, "learning_rate": 9.987763660336326e-07, "loss": 0.1072, "reward": 0.6250000167638063, "reward_std": 0.42103152722120285, "rewards/accuracy_reward": 0.6250000167638063, "rewards/format_reward": 0.0, "step": 22 }, { "clip_ratio": 0.0, "completion_length": 1686.1354446411133, "epoch": 0.024533333333333334, "grad_norm": 0.21012543141841888, "kl": 0.0010382533073425293, "learning_rate": 9.986626547954742e-07, "loss": 0.0269, "reward": 0.5104166744276881, "reward_std": 0.37196434289216995, "rewards/accuracy_reward": 0.4895833386108279, "rewards/format_reward": 0.02083333395421505, "step": 23 }, { "clip_ratio": 0.0, "completion_length": 1397.489616394043, "epoch": 0.0256, "grad_norm": 0.16066192090511322, "kl": 0.0012965202331542969, "learning_rate": 9.9854389996379e-07, "loss": 0.0016, "reward": 0.7395833507180214, "reward_std": 0.2652670443058014, "rewards/accuracy_reward": 0.7291666865348816, "rewards/format_reward": 0.010416666977107525, "step": 24 }, { "clip_ratio": 0.0, "completion_length": 1133.8645973205566, "epoch": 0.02666666666666667, "grad_norm": 0.23755109310150146, "kl": 0.001789093017578125, "learning_rate": 9.9842010287355e-07, "loss": 0.0018, "reward": 0.708333358168602, "reward_std": 0.36289718747138977, "rewards/accuracy_reward": 0.677083358168602, "rewards/format_reward": 0.031250000931322575, "step": 25 }, { "clip_ratio": 0.0, "completion_length": 1863.4896278381348, "epoch": 0.027733333333333332, "grad_norm": 0.23543404042720795, "kl": 0.0016307830810546875, "learning_rate": 9.982912649164061e-07, "loss": 0.0045, "reward": 0.3958333423361182, "reward_std": 0.3312040790915489, "rewards/accuracy_reward": 0.3958333423361182, "rewards/format_reward": 0.0, "step": 26 }, { "clip_ratio": 0.0, "completion_length": 1727.270881652832, "epoch": 0.0288, "grad_norm": 0.1436164826154709, "kl": 0.0019381046295166016, "learning_rate": 9.981573875406763e-07, "loss": 0.0108, "reward": 0.697916679084301, "reward_std": 0.23890409246087074, "rewards/accuracy_reward": 0.697916679084301, "rewards/format_reward": 0.0, "step": 27 }, { "clip_ratio": 0.0, "completion_length": 1998.052131652832, "epoch": 0.029866666666666666, "grad_norm": 0.17359857261180878, "kl": 0.0017195940017700195, "learning_rate": 9.980184722513288e-07, "loss": 0.0499, "reward": 0.5729166865348816, "reward_std": 0.3478570319712162, "rewards/accuracy_reward": 0.5729166865348816, "rewards/format_reward": 0.0, "step": 28 }, { "clip_ratio": 0.0, "completion_length": 2231.8646240234375, "epoch": 0.030933333333333334, "grad_norm": 0.11865271627902985, "kl": 0.0020372867584228516, "learning_rate": 9.978745206099648e-07, "loss": -0.0082, "reward": 0.5104166707023978, "reward_std": 0.23773327097296715, "rewards/accuracy_reward": 0.4791666707023978, "rewards/format_reward": 0.03125, "step": 29 }, { "clip_ratio": 0.0, "completion_length": 1811.9896240234375, "epoch": 0.032, "grad_norm": 0.1826982945203781, "kl": 0.002782106399536133, "learning_rate": 9.977255342348023e-07, "loss": 0.0195, "reward": 0.6666666809469461, "reward_std": 0.3413033299148083, "rewards/accuracy_reward": 0.6666666809469461, "rewards/format_reward": 0.0, "step": 30 }, { "clip_ratio": 0.0, "completion_length": 1324.5208892822266, "epoch": 0.03306666666666667, "grad_norm": 0.19605153799057007, "kl": 0.004055976867675781, "learning_rate": 9.97571514800655e-07, "loss": -0.0667, "reward": 0.6354166828095913, "reward_std": 0.3293267898261547, "rewards/accuracy_reward": 0.6354166828095913, "rewards/format_reward": 0.0, "step": 31 }, { "clip_ratio": 0.0, "completion_length": 2245.166717529297, "epoch": 0.034133333333333335, "grad_norm": 0.1556224673986435, "kl": 0.003000974655151367, "learning_rate": 9.974124640389162e-07, "loss": 0.0367, "reward": 0.4583333507180214, "reward_std": 0.34503670409321785, "rewards/accuracy_reward": 0.4583333507180214, "rewards/format_reward": 0.0, "step": 32 }, { "clip_ratio": 0.0, "completion_length": 2014.0312728881836, "epoch": 0.0352, "grad_norm": 0.16419324278831482, "kl": 0.0023903846740722656, "learning_rate": 9.972483837375379e-07, "loss": 0.0295, "reward": 0.6562500074505806, "reward_std": 0.2018139883875847, "rewards/accuracy_reward": 0.6458333432674408, "rewards/format_reward": 0.010416666977107525, "step": 33 }, { "clip_ratio": 0.0, "completion_length": 1773.427116394043, "epoch": 0.03626666666666667, "grad_norm": 0.20299893617630005, "kl": 0.003962039947509766, "learning_rate": 9.970792757410118e-07, "loss": 0.0104, "reward": 0.6770833460614085, "reward_std": 0.30044984444975853, "rewards/accuracy_reward": 0.6354166818782687, "rewards/format_reward": 0.0416666679084301, "step": 34 }, { "clip_ratio": 0.0, "completion_length": 1414.1562957763672, "epoch": 0.037333333333333336, "grad_norm": 0.12651610374450684, "kl": 0.0036401748657226562, "learning_rate": 9.969051419503473e-07, "loss": 0.0331, "reward": 0.854166679084301, "reward_std": 0.22326868772506714, "rewards/accuracy_reward": 0.8333333358168602, "rewards/format_reward": 0.02083333395421505, "step": 35 }, { "clip_ratio": 0.0, "completion_length": 1070.8646087646484, "epoch": 0.0384, "grad_norm": 0.18587985634803772, "kl": 0.0022962093353271484, "learning_rate": 9.967259843230507e-07, "loss": 0.0235, "reward": 0.8229166846722364, "reward_std": 0.23070911318063736, "rewards/accuracy_reward": 0.8229166846722364, "rewards/format_reward": 0.0, "step": 36 }, { "clip_ratio": 0.0, "completion_length": 2604.395866394043, "epoch": 0.039466666666666664, "grad_norm": 0.09990446269512177, "kl": 0.003547191619873047, "learning_rate": 9.965418048731037e-07, "loss": 0.0219, "reward": 0.4375000027939677, "reward_std": 0.2505146563053131, "rewards/accuracy_reward": 0.4375000027939677, "rewards/format_reward": 0.0, "step": 37 }, { "clip_ratio": 0.0, "completion_length": 1730.9375381469727, "epoch": 0.04053333333333333, "grad_norm": 0.1686713844537735, "kl": 0.00626373291015625, "learning_rate": 9.9635260567094e-07, "loss": 0.013, "reward": 0.7291666744276881, "reward_std": 0.25834736600518227, "rewards/accuracy_reward": 0.7187500102445483, "rewards/format_reward": 0.010416666977107525, "step": 38 }, { "clip_ratio": 0.0, "completion_length": 2586.229202270508, "epoch": 0.0416, "grad_norm": 0.21493321657180786, "kl": 0.007950305938720703, "learning_rate": 9.961583888434218e-07, "loss": 0.1102, "reward": 0.5104166818782687, "reward_std": 0.3676362670958042, "rewards/accuracy_reward": 0.5104166818782687, "rewards/format_reward": 0.0, "step": 39 }, { "clip_ratio": 0.0, "completion_length": 1660.375015258789, "epoch": 0.042666666666666665, "grad_norm": 0.18594913184642792, "kl": 0.0047435760498046875, "learning_rate": 9.959591565738175e-07, "loss": 0.0098, "reward": 0.6770833507180214, "reward_std": 0.27546053379774094, "rewards/accuracy_reward": 0.656250013038516, "rewards/format_reward": 0.02083333395421505, "step": 40 }, { "clip_ratio": 0.0, "completion_length": 2076.3750381469727, "epoch": 0.04373333333333333, "grad_norm": 0.19637811183929443, "kl": 0.004240989685058594, "learning_rate": 9.957549111017753e-07, "loss": 0.0419, "reward": 0.4791666744276881, "reward_std": 0.22633230313658714, "rewards/accuracy_reward": 0.4687500102445483, "rewards/format_reward": 0.010416666977107525, "step": 41 }, { "clip_ratio": 0.0, "completion_length": 1814.2187881469727, "epoch": 0.0448, "grad_norm": 0.18655166029930115, "kl": 0.00447845458984375, "learning_rate": 9.955456547232989e-07, "loss": 0.0229, "reward": 0.6354166818782687, "reward_std": 0.36353693157434464, "rewards/accuracy_reward": 0.6041666744276881, "rewards/format_reward": 0.031250000931322575, "step": 42 }, { "clip_ratio": 0.0, "completion_length": 1901.6666946411133, "epoch": 0.04586666666666667, "grad_norm": 0.15990513563156128, "kl": 0.007071971893310547, "learning_rate": 9.953313897907216e-07, "loss": 0.038, "reward": 0.7187500083819032, "reward_std": 0.21481411904096603, "rewards/accuracy_reward": 0.7083333441987634, "rewards/format_reward": 0.010416666977107525, "step": 43 }, { "clip_ratio": 0.0, "completion_length": 2394.052146911621, "epoch": 0.046933333333333334, "grad_norm": 0.1640961915254593, "kl": 0.0046596527099609375, "learning_rate": 9.951121187126799e-07, "loss": 0.0599, "reward": 0.4687500074505806, "reward_std": 0.2185296081006527, "rewards/accuracy_reward": 0.4687500074505806, "rewards/format_reward": 0.0, "step": 44 }, { "clip_ratio": 0.0, "completion_length": 2111.7500381469727, "epoch": 0.048, "grad_norm": 0.1542307436466217, "kl": 0.004932403564453125, "learning_rate": 9.948878439540863e-07, "loss": 0.0287, "reward": 0.6770833414047956, "reward_std": 0.2504701688885689, "rewards/accuracy_reward": 0.6666666772216558, "rewards/format_reward": 0.010416666977107525, "step": 45 }, { "clip_ratio": 0.0, "completion_length": 2134.3646392822266, "epoch": 0.04906666666666667, "grad_norm": 0.10857236385345459, "kl": 0.004971504211425781, "learning_rate": 9.946585680361016e-07, "loss": 0.0128, "reward": 0.5937500037252903, "reward_std": 0.19867587089538574, "rewards/accuracy_reward": 0.5937500037252903, "rewards/format_reward": 0.0, "step": 46 }, { "clip_ratio": 0.0, "completion_length": 2397.4271392822266, "epoch": 0.050133333333333335, "grad_norm": 0.13481411337852478, "kl": 0.0031180381774902344, "learning_rate": 9.94424293536107e-07, "loss": 0.0543, "reward": 0.5208333460614085, "reward_std": 0.32722481712698936, "rewards/accuracy_reward": 0.5000000083819032, "rewards/format_reward": 0.02083333395421505, "step": 47 }, { "clip_ratio": 0.0, "completion_length": 2188.979217529297, "epoch": 0.0512, "grad_norm": 0.12762576341629028, "kl": 0.003852367401123047, "learning_rate": 9.941850230876738e-07, "loss": 0.0517, "reward": 0.6250000102445483, "reward_std": 0.2621734142303467, "rewards/accuracy_reward": 0.6145833432674408, "rewards/format_reward": 0.010416666977107525, "step": 48 }, { "clip_ratio": 0.0, "completion_length": 2437.9166870117188, "epoch": 0.05226666666666667, "grad_norm": 0.17994792759418488, "kl": 0.0063953399658203125, "learning_rate": 9.939407593805358e-07, "loss": 0.0742, "reward": 0.6250000232830644, "reward_std": 0.2699316218495369, "rewards/accuracy_reward": 0.5937500083819032, "rewards/format_reward": 0.031250000931322575, "step": 49 }, { "clip_ratio": 0.0, "completion_length": 1941.4792251586914, "epoch": 0.05333333333333334, "grad_norm": 0.19777339696884155, "kl": 0.005503654479980469, "learning_rate": 9.936915051605573e-07, "loss": 0.1346, "reward": 0.7291666846722364, "reward_std": 0.31169813871383667, "rewards/accuracy_reward": 0.7291666846722364, "rewards/format_reward": 0.0, "step": 50 }, { "clip_ratio": 0.0, "completion_length": 2798.531295776367, "epoch": 0.0544, "grad_norm": 0.1710447520017624, "kl": 0.004336357116699219, "learning_rate": 9.93437263229703e-07, "loss": 0.0958, "reward": 0.42708333767950535, "reward_std": 0.2876347750425339, "rewards/accuracy_reward": 0.42708333767950535, "rewards/format_reward": 0.0, "step": 51 }, { "clip_ratio": 0.0, "completion_length": 2771.4167289733887, "epoch": 0.055466666666666664, "grad_norm": 0.09168324619531631, "kl": 0.004642486572265625, "learning_rate": 9.931780364460065e-07, "loss": 0.0476, "reward": 0.5520833395421505, "reward_std": 0.2798020653426647, "rewards/accuracy_reward": 0.5520833395421505, "rewards/format_reward": 0.0, "step": 52 }, { "clip_ratio": 0.0, "completion_length": 1201.6771278381348, "epoch": 0.05653333333333333, "grad_norm": 0.26731425523757935, "kl": 0.00766754150390625, "learning_rate": 9.929138277235384e-07, "loss": 0.1075, "reward": 0.8854166865348816, "reward_std": 0.3488067798316479, "rewards/accuracy_reward": 0.8437500149011612, "rewards/format_reward": 0.041666666977107525, "step": 53 }, { "clip_ratio": 0.0, "completion_length": 988.5937728881836, "epoch": 0.0576, "grad_norm": 0.1857045590877533, "kl": 0.006832122802734375, "learning_rate": 9.926446400323727e-07, "loss": 0.0444, "reward": 0.9479166716337204, "reward_std": 0.26293323189020157, "rewards/accuracy_reward": 0.8854166716337204, "rewards/format_reward": 0.06250000093132257, "step": 54 }, { "clip_ratio": 0.0, "completion_length": 2232.4062881469727, "epoch": 0.058666666666666666, "grad_norm": 0.29786041378974915, "kl": 0.0047760009765625, "learning_rate": 9.923704763985545e-07, "loss": 0.0896, "reward": 0.7916666865348816, "reward_std": 0.46465475857257843, "rewards/accuracy_reward": 0.729166679084301, "rewards/format_reward": 0.06250000279396772, "step": 55 }, { "clip_ratio": 0.0, "completion_length": 1850.7396087646484, "epoch": 0.05973333333333333, "grad_norm": 0.13542978465557098, "kl": 0.005771636962890625, "learning_rate": 9.92091339904065e-07, "loss": 0.0128, "reward": 0.7708333469927311, "reward_std": 0.31989311799407005, "rewards/accuracy_reward": 0.7500000037252903, "rewards/format_reward": 0.02083333395421505, "step": 56 }, { "clip_ratio": 0.0, "completion_length": 1955.677146911621, "epoch": 0.0608, "grad_norm": 0.19170919060707092, "kl": 0.0058460235595703125, "learning_rate": 9.918072336867873e-07, "loss": 0.0646, "reward": 0.7395833497866988, "reward_std": 0.26711129397153854, "rewards/accuracy_reward": 0.7395833497866988, "rewards/format_reward": 0.0, "step": 57 }, { "clip_ratio": 0.0, "completion_length": 1804.0729370117188, "epoch": 0.06186666666666667, "grad_norm": 0.016404293477535248, "kl": 0.004487037658691406, "learning_rate": 9.915181609404712e-07, "loss": 0.0002, "reward": 0.75, "reward_std": 0.0, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "step": 58 }, { "clip_ratio": 0.0, "completion_length": 2028.291690826416, "epoch": 0.06293333333333333, "grad_norm": 0.12325101345777512, "kl": 0.005527019500732422, "learning_rate": 9.912241249146973e-07, "loss": 0.0212, "reward": 0.5104166716337204, "reward_std": 0.10518955811858177, "rewards/accuracy_reward": 0.4791666716337204, "rewards/format_reward": 0.03125, "step": 59 }, { "clip_ratio": 0.0, "completion_length": 1441.239616394043, "epoch": 0.064, "grad_norm": 0.18835172057151794, "kl": 0.0075016021728515625, "learning_rate": 9.909251289148405e-07, "loss": 0.0359, "reward": 0.8229166939854622, "reward_std": 0.28564152866601944, "rewards/accuracy_reward": 0.7708333432674408, "rewards/format_reward": 0.052083334885537624, "step": 60 }, { "clip_ratio": 0.0, "completion_length": 1306.572956085205, "epoch": 0.06506666666666666, "grad_norm": 0.2342238575220108, "kl": 0.008769989013671875, "learning_rate": 9.906211763020323e-07, "loss": 0.0261, "reward": 0.9270833656191826, "reward_std": 0.3082825541496277, "rewards/accuracy_reward": 0.8125000074505806, "rewards/format_reward": 0.11458333674818277, "step": 61 }, { "clip_ratio": 0.0, "completion_length": 2291.062545776367, "epoch": 0.06613333333333334, "grad_norm": 0.20412537455558777, "kl": 0.0077724456787109375, "learning_rate": 9.903122704931238e-07, "loss": 0.0758, "reward": 0.3854166781529784, "reward_std": 0.3362164609134197, "rewards/accuracy_reward": 0.3437500111758709, "rewards/format_reward": 0.0416666679084301, "step": 62 }, { "clip_ratio": 0.0, "completion_length": 2112.93754196167, "epoch": 0.0672, "grad_norm": 0.2216949164867401, "kl": 0.008182525634765625, "learning_rate": 9.89998414960647e-07, "loss": 0.078, "reward": 0.6562500335276127, "reward_std": 0.36662935465574265, "rewards/accuracy_reward": 0.5625000074505806, "rewards/format_reward": 0.09375000279396772, "step": 63 }, { "clip_ratio": 0.0, "completion_length": 1843.8021087646484, "epoch": 0.06826666666666667, "grad_norm": 0.25731685757637024, "kl": 0.0081329345703125, "learning_rate": 9.896796132327751e-07, "loss": 0.1182, "reward": 0.7187500149011612, "reward_std": 0.3873845897614956, "rewards/accuracy_reward": 0.6250000074505806, "rewards/format_reward": 0.09375000093132257, "step": 64 }, { "clip_ratio": 0.0, "completion_length": 2183.895866394043, "epoch": 0.06933333333333333, "grad_norm": 0.2281903326511383, "kl": 0.014791488647460938, "learning_rate": 9.89355868893284e-07, "loss": 0.0648, "reward": 0.6458333563059568, "reward_std": 0.3596339784562588, "rewards/accuracy_reward": 0.5729166772216558, "rewards/format_reward": 0.0729166679084301, "step": 65 }, { "clip_ratio": 0.0, "completion_length": 1706.8646392822266, "epoch": 0.0704, "grad_norm": 0.15528523921966553, "kl": 0.00827789306640625, "learning_rate": 9.890271855815112e-07, "loss": -0.0347, "reward": 0.833333371207118, "reward_std": 0.40761860460042953, "rewards/accuracy_reward": 0.6354166809469461, "rewards/format_reward": 0.19791667722165585, "step": 66 }, { "clip_ratio": 0.0, "completion_length": 1939.1875305175781, "epoch": 0.07146666666666666, "grad_norm": 0.19526682794094086, "kl": 0.0071811676025390625, "learning_rate": 9.886935669923155e-07, "loss": 0.1116, "reward": 0.8229166818782687, "reward_std": 0.39551957696676254, "rewards/accuracy_reward": 0.6770833386108279, "rewards/format_reward": 0.14583333767950535, "step": 67 }, { "clip_ratio": 0.0, "completion_length": 1613.114616394043, "epoch": 0.07253333333333334, "grad_norm": 0.2151508629322052, "kl": 0.0092620849609375, "learning_rate": 9.883550168760343e-07, "loss": 0.0513, "reward": 1.0520833730697632, "reward_std": 0.38381822407245636, "rewards/accuracy_reward": 0.8020833432674408, "rewards/format_reward": 0.25000001303851604, "step": 68 }, { "clip_ratio": 0.0, "completion_length": 2083.1666870117188, "epoch": 0.0736, "grad_norm": 0.172663614153862, "kl": 0.008609771728515625, "learning_rate": 9.880115390384432e-07, "loss": -0.0169, "reward": 0.7291666772216558, "reward_std": 0.3138114660978317, "rewards/accuracy_reward": 0.6354166772216558, "rewards/format_reward": 0.0937500037252903, "step": 69 }, { "clip_ratio": 0.0, "completion_length": 1694.2500534057617, "epoch": 0.07466666666666667, "grad_norm": 0.22943955659866333, "kl": 0.009500503540039062, "learning_rate": 9.876631373407115e-07, "loss": 0.0836, "reward": 0.843750030733645, "reward_std": 0.4887804500758648, "rewards/accuracy_reward": 0.6145833460614085, "rewards/format_reward": 0.22916666977107525, "step": 70 }, { "clip_ratio": 0.0, "completion_length": 1159.854206085205, "epoch": 0.07573333333333333, "grad_norm": 0.2230231761932373, "kl": 0.010684967041015625, "learning_rate": 9.8730981569936e-07, "loss": 0.0656, "reward": 1.083333358168602, "reward_std": 0.46291813999414444, "rewards/accuracy_reward": 0.8333333432674408, "rewards/format_reward": 0.2500000102445483, "step": 71 }, { "clip_ratio": 0.0, "completion_length": 1441.0417098999023, "epoch": 0.0768, "grad_norm": 0.2177305519580841, "kl": 0.010761260986328125, "learning_rate": 9.869515780862162e-07, "loss": 0.1599, "reward": 1.0312500447034836, "reward_std": 0.4610872007906437, "rewards/accuracy_reward": 0.8020833414047956, "rewards/format_reward": 0.22916667349636555, "step": 72 }, { "clip_ratio": 0.0, "completion_length": 1886.958366394043, "epoch": 0.07786666666666667, "grad_norm": 0.17976365983486176, "kl": 0.0087738037109375, "learning_rate": 9.865884285283702e-07, "loss": 0.0505, "reward": 0.7395833609625697, "reward_std": 0.42143571749329567, "rewards/accuracy_reward": 0.5937500149011612, "rewards/format_reward": 0.14583333861082792, "step": 73 }, { "clip_ratio": 0.0, "completion_length": 2070.4062881469727, "epoch": 0.07893333333333333, "grad_norm": 0.206215500831604, "kl": 0.009967803955078125, "learning_rate": 9.862203711081293e-07, "loss": 0.0675, "reward": 0.8541666977107525, "reward_std": 0.3208543434739113, "rewards/accuracy_reward": 0.729166679084301, "rewards/format_reward": 0.12500000279396772, "step": 74 }, { "clip_ratio": 0.0, "completion_length": 1815.770866394043, "epoch": 0.08, "grad_norm": 0.23241013288497925, "kl": 0.01342010498046875, "learning_rate": 9.858474099629715e-07, "loss": 0.1304, "reward": 1.0833333656191826, "reward_std": 0.5579452253878117, "rewards/accuracy_reward": 0.7916666865348816, "rewards/format_reward": 0.291666679084301, "step": 75 }, { "clip_ratio": 0.0, "completion_length": 2000.9688262939453, "epoch": 0.08106666666666666, "grad_norm": 0.2039710134267807, "kl": 0.009334564208984375, "learning_rate": 9.854695492855004e-07, "loss": 0.1444, "reward": 0.7916666893288493, "reward_std": 0.4798678606748581, "rewards/accuracy_reward": 0.5937500111758709, "rewards/format_reward": 0.19791667629033327, "step": 76 }, { "clip_ratio": 0.0, "completion_length": 2128.7084045410156, "epoch": 0.08213333333333334, "grad_norm": 0.2619023323059082, "kl": 0.009532928466796875, "learning_rate": 9.850867933233959e-07, "loss": 0.1721, "reward": 0.7291666744276881, "reward_std": 0.47083980962634087, "rewards/accuracy_reward": 0.5625000027939677, "rewards/format_reward": 0.16666667442768812, "step": 77 }, { "clip_ratio": 0.0, "completion_length": 1253.4271240234375, "epoch": 0.0832, "grad_norm": 0.2061297446489334, "kl": 0.012236595153808594, "learning_rate": 9.84699146379369e-07, "loss": 0.0185, "reward": 1.2291666939854622, "reward_std": 0.45226238295435905, "rewards/accuracy_reward": 0.8750000111758709, "rewards/format_reward": 0.35416667629033327, "step": 78 }, { "clip_ratio": 0.0, "completion_length": 1785.6042251586914, "epoch": 0.08426666666666667, "grad_norm": 0.23687240481376648, "kl": 0.0115203857421875, "learning_rate": 9.843066128111114e-07, "loss": 0.1553, "reward": 1.1458333637565374, "reward_std": 0.5026613548398018, "rewards/accuracy_reward": 0.7395833414047956, "rewards/format_reward": 0.40625001583248377, "step": 79 }, { "clip_ratio": 0.0, "completion_length": 1738.1041984558105, "epoch": 0.08533333333333333, "grad_norm": 0.234134703874588, "kl": 0.011470794677734375, "learning_rate": 9.839091970312477e-07, "loss": 0.1784, "reward": 1.0729167014360428, "reward_std": 0.4490736462175846, "rewards/accuracy_reward": 0.7708333507180214, "rewards/format_reward": 0.3020833423361182, "step": 80 }, { "clip_ratio": 0.0, "completion_length": 2155.6354751586914, "epoch": 0.0864, "grad_norm": 0.19609688222408295, "kl": 0.01366424560546875, "learning_rate": 9.835069035072848e-07, "loss": 0.1985, "reward": 0.708333351649344, "reward_std": 0.4369525946676731, "rewards/accuracy_reward": 0.44791668001562357, "rewards/format_reward": 0.260416679084301, "step": 81 }, { "clip_ratio": 0.0, "completion_length": 1839.9583854675293, "epoch": 0.08746666666666666, "grad_norm": 0.2354910969734192, "kl": 0.012722015380859375, "learning_rate": 9.830997367615627e-07, "loss": 0.1165, "reward": 0.9791666893288493, "reward_std": 0.5074816048145294, "rewards/accuracy_reward": 0.6770833460614085, "rewards/format_reward": 0.30208333767950535, "step": 82 }, { "clip_ratio": 0.0, "completion_length": 1579.9792098999023, "epoch": 0.08853333333333334, "grad_norm": 0.22747208178043365, "kl": 0.013027191162109375, "learning_rate": 9.826877013712032e-07, "loss": 0.1324, "reward": 1.1875000447034836, "reward_std": 0.5296823643147945, "rewards/accuracy_reward": 0.7395833507180214, "rewards/format_reward": 0.4479166865348816, "step": 83 }, { "clip_ratio": 0.0, "completion_length": 719.7291870117188, "epoch": 0.0896, "grad_norm": 0.365797221660614, "kl": 0.019989013671875, "learning_rate": 9.82270801968058e-07, "loss": 0.2009, "reward": 1.5937500223517418, "reward_std": 0.5179454982280731, "rewards/accuracy_reward": 0.8645833507180214, "rewards/format_reward": 0.729166679084301, "step": 84 }, { "clip_ratio": 0.0, "completion_length": 1941.3646202087402, "epoch": 0.09066666666666667, "grad_norm": 0.18776486814022064, "kl": 0.011383056640625, "learning_rate": 9.818490432386579e-07, "loss": 0.0585, "reward": 1.0520833730697632, "reward_std": 0.38045133650302887, "rewards/accuracy_reward": 0.6875000074505806, "rewards/format_reward": 0.3645833469927311, "step": 85 }, { "clip_ratio": 0.0, "completion_length": 1811.5104637145996, "epoch": 0.09173333333333333, "grad_norm": 0.23977559804916382, "kl": 0.0134429931640625, "learning_rate": 9.814224299241585e-07, "loss": 0.1228, "reward": 1.0729166865348816, "reward_std": 0.4849005527794361, "rewards/accuracy_reward": 0.6666666716337204, "rewards/format_reward": 0.4062500111758709, "step": 86 }, { "clip_ratio": 0.0, "completion_length": 1089.3437881469727, "epoch": 0.0928, "grad_norm": 0.2976773977279663, "kl": 0.019195556640625, "learning_rate": 9.809909668202877e-07, "loss": 0.0593, "reward": 1.3229167088866234, "reward_std": 0.5590556673705578, "rewards/accuracy_reward": 0.7708333386108279, "rewards/format_reward": 0.5520833488553762, "step": 87 }, { "clip_ratio": 0.0, "completion_length": 1300.8854713439941, "epoch": 0.09386666666666667, "grad_norm": 0.2701471447944641, "kl": 0.0176239013671875, "learning_rate": 9.805546587772927e-07, "loss": 0.0697, "reward": 1.3958333535119891, "reward_std": 0.44593585282564163, "rewards/accuracy_reward": 0.8125000176951289, "rewards/format_reward": 0.5833333432674408, "step": 88 }, { "clip_ratio": 0.0, "completion_length": 2228.541732788086, "epoch": 0.09493333333333333, "grad_norm": 0.26080235838890076, "kl": 0.0147705078125, "learning_rate": 9.801135106998839e-07, "loss": 0.1298, "reward": 0.6250000214204192, "reward_std": 0.3846449702978134, "rewards/accuracy_reward": 0.30208333767950535, "rewards/format_reward": 0.32291667629033327, "step": 89 }, { "clip_ratio": 0.0, "completion_length": 1295.1354598999023, "epoch": 0.096, "grad_norm": 0.2340189665555954, "kl": 0.01932525634765625, "learning_rate": 9.7966752754718e-07, "loss": 0.1027, "reward": 1.427083375863731, "reward_std": 0.38559460639953613, "rewards/accuracy_reward": 0.822916679084301, "rewards/format_reward": 0.6041666744276881, "step": 90 }, { "clip_ratio": 0.0, "completion_length": 1600.3959045410156, "epoch": 0.09706666666666666, "grad_norm": 0.2895304262638092, "kl": 0.0176239013671875, "learning_rate": 9.792167143326537e-07, "loss": 0.1273, "reward": 1.2708333507180214, "reward_std": 0.5329140052199364, "rewards/accuracy_reward": 0.8020833507180214, "rewards/format_reward": 0.4687500027939677, "step": 91 }, { "clip_ratio": 0.0, "completion_length": 1940.739631652832, "epoch": 0.09813333333333334, "grad_norm": 0.30634787678718567, "kl": 0.01779937744140625, "learning_rate": 9.787610761240735e-07, "loss": 0.0887, "reward": 1.0729167079553008, "reward_std": 0.5487616583704948, "rewards/accuracy_reward": 0.6250000139698386, "rewards/format_reward": 0.4479166716337204, "step": 92 }, { "clip_ratio": 0.0, "completion_length": 1719.4375534057617, "epoch": 0.0992, "grad_norm": 0.24846328794956207, "kl": 0.02336883544921875, "learning_rate": 9.78300618043448e-07, "loss": 0.1357, "reward": 1.1562500363215804, "reward_std": 0.5000446438789368, "rewards/accuracy_reward": 0.6458333488553762, "rewards/format_reward": 0.5104166781529784, "step": 93 }, { "clip_ratio": 0.0, "completion_length": 1805.6458892822266, "epoch": 0.10026666666666667, "grad_norm": 0.2803559899330139, "kl": 0.01806640625, "learning_rate": 9.77835345266968e-07, "loss": 0.1755, "reward": 1.2187500298023224, "reward_std": 0.47742901369929314, "rewards/accuracy_reward": 0.7083333386108279, "rewards/format_reward": 0.5104166762903333, "step": 94 }, { "clip_ratio": 0.0, "completion_length": 1425.5625305175781, "epoch": 0.10133333333333333, "grad_norm": 0.3216603100299835, "kl": 0.024749755859375, "learning_rate": 9.773652630249475e-07, "loss": 0.1355, "reward": 1.458333358168602, "reward_std": 0.5219073481857777, "rewards/accuracy_reward": 0.9375000074505806, "rewards/format_reward": 0.5208333572372794, "step": 95 }, { "clip_ratio": 0.0, "completion_length": 1832.8958778381348, "epoch": 0.1024, "grad_norm": 0.29569751024246216, "kl": 0.034336090087890625, "learning_rate": 9.768903766017662e-07, "loss": 0.0353, "reward": 1.0937500214204192, "reward_std": 0.3362164534628391, "rewards/accuracy_reward": 0.6354166679084301, "rewards/format_reward": 0.45833334140479565, "step": 96 }, { "clip_ratio": 0.0, "completion_length": 1232.510425567627, "epoch": 0.10346666666666667, "grad_norm": 0.21783436834812164, "kl": 0.02545928955078125, "learning_rate": 9.764106913358094e-07, "loss": 0.028, "reward": 1.593750037252903, "reward_std": 0.35844462737441063, "rewards/accuracy_reward": 0.916666679084301, "rewards/format_reward": 0.6770833488553762, "step": 97 }, { "clip_ratio": 0.0, "completion_length": 1992.895851135254, "epoch": 0.10453333333333334, "grad_norm": 0.22219501435756683, "kl": 0.0184173583984375, "learning_rate": 9.75926212619408e-07, "loss": 0.1329, "reward": 1.1666666939854622, "reward_std": 0.4547826051712036, "rewards/accuracy_reward": 0.7708333488553762, "rewards/format_reward": 0.39583333767950535, "step": 98 }, { "clip_ratio": 0.0, "completion_length": 1043.12504196167, "epoch": 0.1056, "grad_norm": 0.3042198419570923, "kl": 0.029022216796875, "learning_rate": 9.754369458987779e-07, "loss": 0.1391, "reward": 1.6145833637565374, "reward_std": 0.3727728947997093, "rewards/accuracy_reward": 0.8437500102445483, "rewards/format_reward": 0.7708333535119891, "step": 99 }, { "clip_ratio": 0.0, "completion_length": 1164.802101135254, "epoch": 0.10666666666666667, "grad_norm": 0.2411496639251709, "kl": 0.02048492431640625, "learning_rate": 9.749428966739589e-07, "loss": 0.0633, "reward": 1.5625000409781933, "reward_std": 0.403891209512949, "rewards/accuracy_reward": 0.8645833432674408, "rewards/format_reward": 0.6979166744276881, "step": 100 }, { "clip_ratio": 0.0, "completion_length": 2031.3334121704102, "epoch": 0.10773333333333333, "grad_norm": 0.42925000190734863, "kl": 0.02660369873046875, "learning_rate": 9.74444070498753e-07, "loss": 0.1508, "reward": 1.0416666772216558, "reward_std": 0.49693433195352554, "rewards/accuracy_reward": 0.6666666772216558, "rewards/format_reward": 0.3750000102445483, "step": 101 }, { "clip_ratio": 0.0, "completion_length": 1087.5000228881836, "epoch": 0.1088, "grad_norm": 0.277424156665802, "kl": 0.02606201171875, "learning_rate": 9.739404729806615e-07, "loss": 0.0607, "reward": 1.5312500447034836, "reward_std": 0.34802578389644623, "rewards/accuracy_reward": 0.8125000074505806, "rewards/format_reward": 0.7187500149011612, "step": 102 }, { "clip_ratio": 0.0, "completion_length": 1521.4479522705078, "epoch": 0.10986666666666667, "grad_norm": 0.23330633342266083, "kl": 0.0203704833984375, "learning_rate": 9.734321097808223e-07, "loss": 0.0749, "reward": 1.2916666939854622, "reward_std": 0.3742976523935795, "rewards/accuracy_reward": 0.7500000074505806, "rewards/format_reward": 0.5416666707023978, "step": 103 }, { "clip_ratio": 0.0, "completion_length": 1733.8645973205566, "epoch": 0.11093333333333333, "grad_norm": 0.34998589754104614, "kl": 0.028839111328125, "learning_rate": 9.729189866139463e-07, "loss": 0.0322, "reward": 1.0520833600312471, "reward_std": 0.43308593705296516, "rewards/accuracy_reward": 0.5520833525806665, "rewards/format_reward": 0.5000000149011612, "step": 104 }, { "clip_ratio": 0.0, "completion_length": 1327.1771125793457, "epoch": 0.112, "grad_norm": 0.3030642867088318, "kl": 0.02559661865234375, "learning_rate": 9.72401109248253e-07, "loss": 0.1426, "reward": 1.3750000325962901, "reward_std": 0.42665768787264824, "rewards/accuracy_reward": 0.7604166697710752, "rewards/format_reward": 0.6145833469927311, "step": 105 }, { "clip_ratio": 0.0, "completion_length": 1709.8229598999023, "epoch": 0.11306666666666666, "grad_norm": 0.3039052188396454, "kl": 0.02504730224609375, "learning_rate": 9.718784835054057e-07, "loss": 0.1408, "reward": 1.2812500316649675, "reward_std": 0.4775592312216759, "rewards/accuracy_reward": 0.7395833414047956, "rewards/format_reward": 0.5416666744276881, "step": 106 }, { "clip_ratio": 0.0, "completion_length": 2236.6355056762695, "epoch": 0.11413333333333334, "grad_norm": 0.27138814330101013, "kl": 0.0279693603515625, "learning_rate": 9.71351115260446e-07, "loss": 0.0907, "reward": 1.052083371207118, "reward_std": 0.44266683980822563, "rewards/accuracy_reward": 0.6458333395421505, "rewards/format_reward": 0.4062500102445483, "step": 107 }, { "clip_ratio": 0.0, "completion_length": 1280.4792022705078, "epoch": 0.1152, "grad_norm": 0.35789233446121216, "kl": 0.0234222412109375, "learning_rate": 9.708190104417285e-07, "loss": 0.1767, "reward": 1.4895833507180214, "reward_std": 0.4585767798125744, "rewards/accuracy_reward": 0.9166666865348816, "rewards/format_reward": 0.572916672565043, "step": 108 }, { "clip_ratio": 0.0, "completion_length": 2308.958396911621, "epoch": 0.11626666666666667, "grad_norm": 0.1528209000825882, "kl": 0.0229949951171875, "learning_rate": 9.702821750308522e-07, "loss": 0.0137, "reward": 0.9479166865348816, "reward_std": 0.32401224970817566, "rewards/accuracy_reward": 0.5312500037252903, "rewards/format_reward": 0.4166666744276881, "step": 109 }, { "clip_ratio": 0.0, "completion_length": 1268.8854446411133, "epoch": 0.11733333333333333, "grad_norm": 0.3063293695449829, "kl": 0.0299224853515625, "learning_rate": 9.69740615062596e-07, "loss": 0.1734, "reward": 1.2812500298023224, "reward_std": 0.5065745413303375, "rewards/accuracy_reward": 0.6562500102445483, "rewards/format_reward": 0.6250000111758709, "step": 110 }, { "clip_ratio": 0.0, "completion_length": 1501.8541984558105, "epoch": 0.1184, "grad_norm": 0.23821334540843964, "kl": 0.0287017822265625, "learning_rate": 9.691943366248481e-07, "loss": 0.0376, "reward": 1.3333333656191826, "reward_std": 0.3079647719860077, "rewards/accuracy_reward": 0.7291666753590107, "rewards/format_reward": 0.6041666828095913, "step": 111 }, { "clip_ratio": 0.0, "completion_length": 1986.0417366027832, "epoch": 0.11946666666666667, "grad_norm": 0.3393707871437073, "kl": 0.027587890625, "learning_rate": 9.686433458585398e-07, "loss": 0.1641, "reward": 0.9270833544433117, "reward_std": 0.5288807041943073, "rewards/accuracy_reward": 0.5104166781529784, "rewards/format_reward": 0.4166666679084301, "step": 112 }, { "clip_ratio": 0.0, "completion_length": 1718.8541793823242, "epoch": 0.12053333333333334, "grad_norm": 0.5044848918914795, "kl": 0.0374908447265625, "learning_rate": 9.68087648957575e-07, "loss": 0.1899, "reward": 1.135416679084301, "reward_std": 0.6159082837402821, "rewards/accuracy_reward": 0.6354166818782687, "rewards/format_reward": 0.5000000158324838, "step": 113 }, { "clip_ratio": 0.0, "completion_length": 1240.5937728881836, "epoch": 0.1216, "grad_norm": 0.4392492473125458, "kl": 0.0348052978515625, "learning_rate": 9.675272521687616e-07, "loss": 0.151, "reward": 1.5208333656191826, "reward_std": 0.4753985106945038, "rewards/accuracy_reward": 0.8750000223517418, "rewards/format_reward": 0.6458333544433117, "step": 114 }, { "clip_ratio": 0.0, "completion_length": 2083.2396068573, "epoch": 0.12266666666666666, "grad_norm": 0.33083024621009827, "kl": 0.035308837890625, "learning_rate": 9.669621617917402e-07, "loss": 0.1351, "reward": 1.1562500186264515, "reward_std": 0.4262532405555248, "rewards/accuracy_reward": 0.6666666697710752, "rewards/format_reward": 0.48958334140479565, "step": 115 }, { "clip_ratio": 0.0, "completion_length": 1543.0208587646484, "epoch": 0.12373333333333333, "grad_norm": 0.24876700341701508, "kl": 0.0368194580078125, "learning_rate": 9.663923841789144e-07, "loss": 0.0814, "reward": 1.2916666744276881, "reward_std": 0.3841373212635517, "rewards/accuracy_reward": 0.7187500149011612, "rewards/format_reward": 0.5729166800156236, "step": 116 }, { "clip_ratio": 0.0, "completion_length": 1284.9791984558105, "epoch": 0.1248, "grad_norm": 0.38016289472579956, "kl": 0.0474395751953125, "learning_rate": 9.658179257353786e-07, "loss": 0.1416, "reward": 1.2812500400468707, "reward_std": 0.33223719149827957, "rewards/accuracy_reward": 0.7187500074505806, "rewards/format_reward": 0.5625000139698386, "step": 117 }, { "clip_ratio": 0.0, "completion_length": 1730.1042251586914, "epoch": 0.12586666666666665, "grad_norm": 0.3926825523376465, "kl": 0.0301361083984375, "learning_rate": 9.652387929188463e-07, "loss": 0.1208, "reward": 1.0000000353902578, "reward_std": 0.4988885223865509, "rewards/accuracy_reward": 0.666666679084301, "rewards/format_reward": 0.3333333432674408, "step": 118 }, { "clip_ratio": 0.0, "completion_length": 1875.6771087646484, "epoch": 0.12693333333333334, "grad_norm": 0.2359820306301117, "kl": 0.041717529296875, "learning_rate": 9.646549922395774e-07, "loss": 0.0256, "reward": 1.0937500251457095, "reward_std": 0.27659300714731216, "rewards/accuracy_reward": 0.5833333432674408, "rewards/format_reward": 0.5104166744276881, "step": 119 }, { "clip_ratio": 0.0, "completion_length": 859.7187652587891, "epoch": 0.128, "grad_norm": 0.32542699575424194, "kl": 0.0416259765625, "learning_rate": 9.640665302603054e-07, "loss": 0.0412, "reward": 1.7083333730697632, "reward_std": 0.3842291906476021, "rewards/accuracy_reward": 0.8229166828095913, "rewards/format_reward": 0.8854166865348816, "step": 120 }, { "clip_ratio": 0.0, "completion_length": 2837.0417556762695, "epoch": 0.12906666666666666, "grad_norm": 0.37280964851379395, "kl": 0.0443115234375, "learning_rate": 9.634734135961627e-07, "loss": 0.1297, "reward": 0.635416679084301, "reward_std": 0.4121558666229248, "rewards/accuracy_reward": 0.3541666707023978, "rewards/format_reward": 0.28125000931322575, "step": 121 }, { "clip_ratio": 0.0, "completion_length": 1537.3958549499512, "epoch": 0.13013333333333332, "grad_norm": 0.3010331094264984, "kl": 0.03973388671875, "learning_rate": 9.628756489146074e-07, "loss": 0.0569, "reward": 1.3958333730697632, "reward_std": 0.423760611563921, "rewards/accuracy_reward": 0.7708333432674408, "rewards/format_reward": 0.6250000186264515, "step": 122 }, { "clip_ratio": 0.0, "completion_length": 2193.583381652832, "epoch": 0.1312, "grad_norm": 0.5454201102256775, "kl": 0.05181884765625, "learning_rate": 9.622732429353478e-07, "loss": 0.1632, "reward": 1.0416666865348816, "reward_std": 0.39809365570545197, "rewards/accuracy_reward": 0.5625000027939677, "rewards/format_reward": 0.47916667722165585, "step": 123 }, { "clip_ratio": 0.0, "completion_length": 1723.7708740234375, "epoch": 0.13226666666666667, "grad_norm": 0.3056691288948059, "kl": 0.047149658203125, "learning_rate": 9.616662024302663e-07, "loss": 0.1658, "reward": 1.4062500251457095, "reward_std": 0.42834335193037987, "rewards/accuracy_reward": 0.7604166818782687, "rewards/format_reward": 0.6458333507180214, "step": 124 }, { "clip_ratio": 0.0, "completion_length": 1217.2813034057617, "epoch": 0.13333333333333333, "grad_norm": 0.5317479372024536, "kl": 0.055328369140625, "learning_rate": 9.610545342233444e-07, "loss": 0.1154, "reward": 1.5416667088866234, "reward_std": 0.5501139312982559, "rewards/accuracy_reward": 0.854166679084301, "rewards/format_reward": 0.6875000204890966, "step": 125 }, { "clip_ratio": 0.0, "completion_length": 1585.1666984558105, "epoch": 0.1344, "grad_norm": 0.5633459687232971, "kl": 0.06005859375, "learning_rate": 9.604382451905852e-07, "loss": 0.0398, "reward": 1.1979166977107525, "reward_std": 0.5115581676363945, "rewards/accuracy_reward": 0.6666666744276881, "rewards/format_reward": 0.5312500149011612, "step": 126 }, { "clip_ratio": 0.0, "completion_length": 2355.7500610351562, "epoch": 0.13546666666666668, "grad_norm": 0.822866678237915, "kl": 0.05828857421875, "learning_rate": 9.598173422599363e-07, "loss": 0.2373, "reward": 0.9583333563059568, "reward_std": 0.5695855543017387, "rewards/accuracy_reward": 0.5520833497866988, "rewards/format_reward": 0.40625000931322575, "step": 127 }, { "clip_ratio": 0.0, "completion_length": 2153.9063034057617, "epoch": 0.13653333333333334, "grad_norm": 0.713874340057373, "kl": 0.068756103515625, "learning_rate": 9.59191832411212e-07, "loss": 0.1851, "reward": 1.093750037252903, "reward_std": 0.5032150186598301, "rewards/accuracy_reward": 0.6458333414047956, "rewards/format_reward": 0.4479166818782687, "step": 128 }, { "clip_ratio": 0.0, "completion_length": 997.5833549499512, "epoch": 0.1376, "grad_norm": 1.0707341432571411, "kl": 0.0745849609375, "learning_rate": 9.585617226760147e-07, "loss": 0.1624, "reward": 1.7083333879709244, "reward_std": 0.41239187121391296, "rewards/accuracy_reward": 0.8645833507180214, "rewards/format_reward": 0.8437500074505806, "step": 129 }, { "clip_ratio": 0.0, "completion_length": 1263.7708740234375, "epoch": 0.13866666666666666, "grad_norm": 0.4855239689350128, "kl": 0.068389892578125, "learning_rate": 9.579270201376557e-07, "loss": 0.0828, "reward": 1.4062500447034836, "reward_std": 0.4460373967885971, "rewards/accuracy_reward": 0.6770833469927311, "rewards/format_reward": 0.7291666772216558, "step": 130 }, { "clip_ratio": 0.0, "completion_length": 1630.5937690734863, "epoch": 0.13973333333333332, "grad_norm": 0.7625194787979126, "kl": 0.092681884765625, "learning_rate": 9.572877319310768e-07, "loss": 0.1198, "reward": 1.3958333656191826, "reward_std": 0.5493095219135284, "rewards/accuracy_reward": 0.7187500167638063, "rewards/format_reward": 0.6770833507180214, "step": 131 }, { "clip_ratio": 0.0, "completion_length": 1427.8542289733887, "epoch": 0.1408, "grad_norm": 0.7275360226631165, "kl": 0.10162353515625, "learning_rate": 9.56643865242768e-07, "loss": 0.0633, "reward": 1.4895833805203438, "reward_std": 0.3939802907407284, "rewards/accuracy_reward": 0.8125000074505806, "rewards/format_reward": 0.6770833469927311, "step": 132 }, { "clip_ratio": 0.0, "completion_length": 933.2083625793457, "epoch": 0.14186666666666667, "grad_norm": 0.6620070934295654, "kl": 0.101318359375, "learning_rate": 9.55995427310688e-07, "loss": 0.0786, "reward": 1.7187500149011612, "reward_std": 0.32781485840678215, "rewards/accuracy_reward": 0.854166679084301, "rewards/format_reward": 0.8645833432674408, "step": 133 }, { "clip_ratio": 0.0, "completion_length": 1495.8541870117188, "epoch": 0.14293333333333333, "grad_norm": 0.8901642560958862, "kl": 0.112274169921875, "learning_rate": 9.553424254241831e-07, "loss": 0.0941, "reward": 1.6250000223517418, "reward_std": 0.4055785685777664, "rewards/accuracy_reward": 0.8958333432674408, "rewards/format_reward": 0.7291666734963655, "step": 134 }, { "clip_ratio": 0.0, "completion_length": 1885.8229598999023, "epoch": 0.144, "grad_norm": 1.9587948322296143, "kl": 0.16876220703125, "learning_rate": 9.546848669239045e-07, "loss": 0.1103, "reward": 1.2083333544433117, "reward_std": 0.4303443431854248, "rewards/accuracy_reward": 0.6041666753590107, "rewards/format_reward": 0.6041666772216558, "step": 135 }, { "clip_ratio": 0.0, "completion_length": 1352.9687690734863, "epoch": 0.14506666666666668, "grad_norm": 0.6883363723754883, "kl": 0.135009765625, "learning_rate": 9.540227592017262e-07, "loss": 0.0542, "reward": 1.5312500298023224, "reward_std": 0.33261488750576973, "rewards/accuracy_reward": 0.7916666679084301, "rewards/format_reward": 0.7395833507180214, "step": 136 }, { "clip_ratio": 0.0, "completion_length": 1348.9687728881836, "epoch": 0.14613333333333334, "grad_norm": 2.398375988006592, "kl": 0.15057373046875, "learning_rate": 9.533561097006619e-07, "loss": 0.079, "reward": 1.4583333684131503, "reward_std": 0.5132912807166576, "rewards/accuracy_reward": 0.7395833507180214, "rewards/format_reward": 0.7187500102445483, "step": 137 }, { "clip_ratio": 0.0, "completion_length": 779.8020896911621, "epoch": 0.1472, "grad_norm": 0.8179948925971985, "kl": 0.1387939453125, "learning_rate": 9.526849259147809e-07, "loss": 0.0295, "reward": 1.6562500596046448, "reward_std": 0.5109735615551472, "rewards/accuracy_reward": 0.8020833469927311, "rewards/format_reward": 0.8541666865348816, "step": 138 }, { "clip_ratio": 0.0, "completion_length": 1485.6979675292969, "epoch": 0.14826666666666666, "grad_norm": 1.1039448976516724, "kl": 0.18243408203125, "learning_rate": 9.520092153891243e-07, "loss": 0.0427, "reward": 1.3854167014360428, "reward_std": 0.4788143113255501, "rewards/accuracy_reward": 0.760416679084301, "rewards/format_reward": 0.6250000149011612, "step": 139 }, { "clip_ratio": 0.0, "completion_length": 782.8541946411133, "epoch": 0.14933333333333335, "grad_norm": 1.0269396305084229, "kl": 0.11767578125, "learning_rate": 9.513289857196201e-07, "loss": 0.0514, "reward": 1.6250000298023224, "reward_std": 0.6032419838011265, "rewards/accuracy_reward": 0.7916666865348816, "rewards/format_reward": 0.833333358168602, "step": 140 }, { "clip_ratio": 0.0, "completion_length": 1220.7812805175781, "epoch": 0.1504, "grad_norm": 0.8581716418266296, "kl": 0.17388916015625, "learning_rate": 9.506442445529982e-07, "loss": 0.1079, "reward": 1.5104167014360428, "reward_std": 0.5175886489450932, "rewards/accuracy_reward": 0.7187500149011612, "rewards/format_reward": 0.7916666939854622, "step": 141 }, { "clip_ratio": 0.0, "completion_length": 2013.1875457763672, "epoch": 0.15146666666666667, "grad_norm": 2.089837074279785, "kl": 0.23431396484375, "learning_rate": 9.499549995867032e-07, "loss": 0.1597, "reward": 1.0625000279396772, "reward_std": 0.6058536469936371, "rewards/accuracy_reward": 0.4583333507180214, "rewards/format_reward": 0.6041666828095913, "step": 142 }, { "clip_ratio": 0.0, "completion_length": 2769.447967529297, "epoch": 0.15253333333333333, "grad_norm": 1.0887857675552368, "kl": 0.3216552734375, "learning_rate": 9.492612585688092e-07, "loss": 0.1352, "reward": 0.6458333544433117, "reward_std": 0.34844400733709335, "rewards/accuracy_reward": 0.2812500074505806, "rewards/format_reward": 0.3645833395421505, "step": 143 }, { "clip_ratio": 0.0, "completion_length": 1439.072956085205, "epoch": 0.1536, "grad_norm": 1.8148412704467773, "kl": 0.213623046875, "learning_rate": 9.485630292979321e-07, "loss": 0.0854, "reward": 1.4270833637565374, "reward_std": 0.4695867523550987, "rewards/accuracy_reward": 0.6562500149011612, "rewards/format_reward": 0.770833345130086, "step": 144 }, { "clip_ratio": 0.0, "completion_length": 1420.3750381469727, "epoch": 0.15466666666666667, "grad_norm": 1.576915979385376, "kl": 0.21185302734375, "learning_rate": 9.47860319623142e-07, "loss": 0.0849, "reward": 1.312500050291419, "reward_std": 0.46924449503421783, "rewards/accuracy_reward": 0.5937500093132257, "rewards/format_reward": 0.7187500186264515, "step": 145 }, { "clip_ratio": 0.0, "completion_length": 1337.0000381469727, "epoch": 0.15573333333333333, "grad_norm": 0.9039284586906433, "kl": 0.22003173828125, "learning_rate": 9.47153137443875e-07, "loss": 0.0417, "reward": 1.6041667237877846, "reward_std": 0.40704870223999023, "rewards/accuracy_reward": 0.7916666716337204, "rewards/format_reward": 0.812500013038516, "step": 146 }, { "clip_ratio": 0.0, "completion_length": 1584.5625228881836, "epoch": 0.1568, "grad_norm": 1.363537311553955, "kl": 0.2442626953125, "learning_rate": 9.464414907098443e-07, "loss": 0.0557, "reward": 1.3958333656191826, "reward_std": 0.31147706508636475, "rewards/accuracy_reward": 0.6458333386108279, "rewards/format_reward": 0.7500000111758709, "step": 147 }, { "clip_ratio": 0.0, "completion_length": 1989.0104446411133, "epoch": 0.15786666666666666, "grad_norm": 1.416282296180725, "kl": 0.2830810546875, "learning_rate": 9.457253874209512e-07, "loss": 0.1746, "reward": 1.2395833590999246, "reward_std": 0.5029095597565174, "rewards/accuracy_reward": 0.6250000176951289, "rewards/format_reward": 0.6145833488553762, "step": 148 }, { "clip_ratio": 0.0, "completion_length": 1887.0729522705078, "epoch": 0.15893333333333334, "grad_norm": 2.3967323303222656, "kl": 0.4505615234375, "learning_rate": 9.450048356271946e-07, "loss": 0.2089, "reward": 1.2500000251457095, "reward_std": 0.521727379411459, "rewards/accuracy_reward": 0.5416666744276881, "rewards/format_reward": 0.7083333423361182, "step": 149 }, { "clip_ratio": 0.0, "completion_length": 1803.5208778381348, "epoch": 0.16, "grad_norm": 2.3804256916046143, "kl": 0.403564453125, "learning_rate": 9.442798434285806e-07, "loss": 0.1131, "reward": 1.2604166772216558, "reward_std": 0.35515038296580315, "rewards/accuracy_reward": 0.6354166781529784, "rewards/format_reward": 0.6250000102445483, "step": 150 }, { "clip_ratio": 0.0, "completion_length": 1839.1666984558105, "epoch": 0.16106666666666666, "grad_norm": 4.2157206535339355, "kl": 0.5521240234375, "learning_rate": 9.435504189750322e-07, "loss": 0.2445, "reward": 1.156250037252903, "reward_std": 0.4841529466211796, "rewards/accuracy_reward": 0.5104166828095913, "rewards/format_reward": 0.6458333432674408, "step": 151 }, { "clip_ratio": 0.0, "completion_length": 2317.2604598999023, "epoch": 0.16213333333333332, "grad_norm": 1.5710035562515259, "kl": 0.8486328125, "learning_rate": 9.428165704662967e-07, "loss": 0.1143, "reward": 0.9062500149011612, "reward_std": 0.3221505247056484, "rewards/accuracy_reward": 0.4270833348855376, "rewards/format_reward": 0.47916666977107525, "step": 152 }, { "clip_ratio": 0.0, "completion_length": 1431.7083740234375, "epoch": 0.1632, "grad_norm": 1.04184889793396, "kl": 0.4625244140625, "learning_rate": 9.420783061518543e-07, "loss": 0.0712, "reward": 1.4375000149011612, "reward_std": 0.3457876518368721, "rewards/accuracy_reward": 0.7291666744276881, "rewards/format_reward": 0.7083333414047956, "step": 153 }, { "clip_ratio": 0.0, "completion_length": 2483.687557220459, "epoch": 0.16426666666666667, "grad_norm": 2.589721441268921, "kl": 0.85009765625, "learning_rate": 9.413356343308244e-07, "loss": 0.1643, "reward": 0.8750000298023224, "reward_std": 0.40403280407190323, "rewards/accuracy_reward": 0.42708334140479565, "rewards/format_reward": 0.447916679084301, "step": 154 }, { "clip_ratio": 0.0, "completion_length": 1218.6667175292969, "epoch": 0.16533333333333333, "grad_norm": 2.5739619731903076, "kl": 0.585693359375, "learning_rate": 9.405885633518735e-07, "loss": 0.1568, "reward": 1.6250000447034836, "reward_std": 0.5422121845185757, "rewards/accuracy_reward": 0.8125000223517418, "rewards/format_reward": 0.8125000223517418, "step": 155 }, { "clip_ratio": 0.0, "completion_length": 1869.0625534057617, "epoch": 0.1664, "grad_norm": 2.131330728530884, "kl": 0.8607177734375, "learning_rate": 9.398371016131206e-07, "loss": 0.1812, "reward": 1.2083333590999246, "reward_std": 0.4389325752854347, "rewards/accuracy_reward": 0.5937500102445483, "rewards/format_reward": 0.614583345130086, "step": 156 }, { "clip_ratio": 0.0, "completion_length": 2364.604217529297, "epoch": 0.16746666666666668, "grad_norm": 1.872695803642273, "kl": 1.120361328125, "learning_rate": 9.39081257562043e-07, "loss": 0.1484, "reward": 0.770833345130086, "reward_std": 0.3648338094353676, "rewards/accuracy_reward": 0.3020833358168602, "rewards/format_reward": 0.46875000931322575, "step": 157 }, { "clip_ratio": 0.0, "completion_length": 2046.4167022705078, "epoch": 0.16853333333333334, "grad_norm": 4.385330677032471, "kl": 0.87646484375, "learning_rate": 9.383210396953811e-07, "loss": 0.2732, "reward": 1.031250037252903, "reward_std": 0.5925772674381733, "rewards/accuracy_reward": 0.45833334047347307, "rewards/format_reward": 0.572916679084301, "step": 158 }, { "clip_ratio": 0.0, "completion_length": 2126.947982788086, "epoch": 0.1696, "grad_norm": 2.51013445854187, "kl": 1.0467529296875, "learning_rate": 9.375564565590434e-07, "loss": 0.1835, "reward": 1.0729167005047202, "reward_std": 0.5290332473814487, "rewards/accuracy_reward": 0.4270833386108279, "rewards/format_reward": 0.6458333460614085, "step": 159 }, { "clip_ratio": 0.0, "completion_length": 2042.1771087646484, "epoch": 0.17066666666666666, "grad_norm": 5.503683567047119, "kl": 0.930908203125, "learning_rate": 9.367875167480096e-07, "loss": 0.3232, "reward": 1.02083336468786, "reward_std": 0.5387994796037674, "rewards/accuracy_reward": 0.447916679084301, "rewards/format_reward": 0.5729166707023978, "step": 160 }, { "clip_ratio": 0.0, "completion_length": 1749.6458549499512, "epoch": 0.17173333333333332, "grad_norm": 5.578032493591309, "kl": 0.742431640625, "learning_rate": 9.36014228906235e-07, "loss": 0.2039, "reward": 1.1979167014360428, "reward_std": 0.48741893842816353, "rewards/accuracy_reward": 0.5312500055879354, "rewards/format_reward": 0.6666666865348816, "step": 161 }, { "clip_ratio": 0.0, "completion_length": 2082.343818664551, "epoch": 0.1728, "grad_norm": 3.8469884395599365, "kl": 1.032470703125, "learning_rate": 9.352366017265527e-07, "loss": 0.2116, "reward": 1.0833333525806665, "reward_std": 0.6154673881828785, "rewards/accuracy_reward": 0.479166679084301, "rewards/format_reward": 0.6041666809469461, "step": 162 }, { "clip_ratio": 0.0, "completion_length": 1579.3438262939453, "epoch": 0.17386666666666667, "grad_norm": 3.5108330249786377, "kl": 1.091552734375, "learning_rate": 9.34454643950576e-07, "loss": 0.2616, "reward": 1.3750000223517418, "reward_std": 0.5651338994503021, "rewards/accuracy_reward": 0.6354166762903333, "rewards/format_reward": 0.7395833507180214, "step": 163 }, { "clip_ratio": 0.0, "completion_length": 1338.2812995910645, "epoch": 0.17493333333333333, "grad_norm": 5.076249122619629, "kl": 0.831787109375, "learning_rate": 9.336683643685999e-07, "loss": 0.2261, "reward": 1.3437500223517418, "reward_std": 0.6182434856891632, "rewards/accuracy_reward": 0.5729166846722364, "rewards/format_reward": 0.7708333544433117, "step": 164 }, { "clip_ratio": 0.0, "completion_length": 2246.010482788086, "epoch": 0.176, "grad_norm": 2.9176199436187744, "kl": 1.531982421875, "learning_rate": 9.328777718195028e-07, "loss": 0.2521, "reward": 0.739583351649344, "reward_std": 0.4993966184556484, "rewards/accuracy_reward": 0.2708333386108279, "rewards/format_reward": 0.46875001210719347, "step": 165 }, { "clip_ratio": 0.0, "completion_length": 1139.0312690734863, "epoch": 0.17706666666666668, "grad_norm": 3.0611939430236816, "kl": 1.2666015625, "learning_rate": 9.320828751906466e-07, "loss": 0.2754, "reward": 1.4791667088866234, "reward_std": 0.4715059995651245, "rewards/accuracy_reward": 0.6250000074505806, "rewards/format_reward": 0.8541666865348816, "step": 166 }, { "clip_ratio": 0.0, "completion_length": 2079.92716217041, "epoch": 0.17813333333333334, "grad_norm": 3.909663200378418, "kl": 1.1314697265625, "learning_rate": 9.312836834177776e-07, "loss": 0.2661, "reward": 1.031250011175871, "reward_std": 0.36286717280745506, "rewards/accuracy_reward": 0.4479166716337204, "rewards/format_reward": 0.5833333469927311, "step": 167 }, { "clip_ratio": 0.0, "completion_length": 2127.0521240234375, "epoch": 0.1792, "grad_norm": 5.591472148895264, "kl": 1.592041015625, "learning_rate": 9.30480205484925e-07, "loss": 0.1688, "reward": 1.04166669677943, "reward_std": 0.6420871168375015, "rewards/accuracy_reward": 0.42708335164934397, "rewards/format_reward": 0.6145833497866988, "step": 168 }, { "clip_ratio": 0.0, "completion_length": 1795.958396911621, "epoch": 0.18026666666666666, "grad_norm": 3.7766122817993164, "kl": 1.688232421875, "learning_rate": 9.296724504243003e-07, "loss": 0.3228, "reward": 1.2187500298023224, "reward_std": 0.5375028699636459, "rewards/accuracy_reward": 0.5208333488553762, "rewards/format_reward": 0.697916679084301, "step": 169 }, { "clip_ratio": 0.0, "completion_length": 2157.8125915527344, "epoch": 0.18133333333333335, "grad_norm": 4.6644134521484375, "kl": 1.70751953125, "learning_rate": 9.288604273161965e-07, "loss": 0.3534, "reward": 0.9062500186264515, "reward_std": 0.7508826814591885, "rewards/accuracy_reward": 0.3541666753590107, "rewards/format_reward": 0.5520833544433117, "step": 170 }, { "clip_ratio": 0.0, "completion_length": 1906.7709045410156, "epoch": 0.1824, "grad_norm": 3.9379618167877197, "kl": 1.1201171875, "learning_rate": 9.280441452888847e-07, "loss": 0.3075, "reward": 1.1250000298023224, "reward_std": 0.6791660189628601, "rewards/accuracy_reward": 0.45833334419876337, "rewards/format_reward": 0.6666666828095913, "step": 171 }, { "clip_ratio": 0.0, "completion_length": 1798.166732788086, "epoch": 0.18346666666666667, "grad_norm": 3.0716969966888428, "kl": 1.0343017578125, "learning_rate": 9.272236135185126e-07, "loss": 0.2195, "reward": 1.104166690260172, "reward_std": 0.5092082656919956, "rewards/accuracy_reward": 0.4375000037252903, "rewards/format_reward": 0.6666666865348816, "step": 172 }, { "clip_ratio": 0.0, "completion_length": 1887.291732788086, "epoch": 0.18453333333333333, "grad_norm": 4.783283710479736, "kl": 1.08062744140625, "learning_rate": 9.26398841229001e-07, "loss": 0.2174, "reward": 1.1458333507180214, "reward_std": 0.6386397443711758, "rewards/accuracy_reward": 0.5000000167638063, "rewards/format_reward": 0.6458333507180214, "step": 173 }, { "clip_ratio": 0.0, "completion_length": 1630.9062881469727, "epoch": 0.1856, "grad_norm": 6.4080305099487305, "kl": 0.78277587890625, "learning_rate": 9.255698376919398e-07, "loss": 0.245, "reward": 1.3125000223517418, "reward_std": 0.266142837703228, "rewards/accuracy_reward": 0.6041666744276881, "rewards/format_reward": 0.7083333432674408, "step": 174 }, { "clip_ratio": 0.0, "completion_length": 1461.9688186645508, "epoch": 0.18666666666666668, "grad_norm": 3.1538641452789307, "kl": 0.69085693359375, "learning_rate": 9.247366122264841e-07, "loss": 0.2406, "reward": 1.4062500223517418, "reward_std": 0.6624869257211685, "rewards/accuracy_reward": 0.6041666828095913, "rewards/format_reward": 0.802083358168602, "step": 175 }, { "clip_ratio": 0.0, "completion_length": 2072.2813110351562, "epoch": 0.18773333333333334, "grad_norm": 3.254478693008423, "kl": 1.2373046875, "learning_rate": 9.238991741992491e-07, "loss": 0.3216, "reward": 1.0312500298023224, "reward_std": 0.5931861028075218, "rewards/accuracy_reward": 0.4062500074505806, "rewards/format_reward": 0.6250000149011612, "step": 176 }, { "clip_ratio": 0.0, "completion_length": 1519.0625305175781, "epoch": 0.1888, "grad_norm": 3.353346347808838, "kl": 0.619781494140625, "learning_rate": 9.230575330242059e-07, "loss": 0.2062, "reward": 1.427083358168602, "reward_std": 0.4653523154556751, "rewards/accuracy_reward": 0.6562500055879354, "rewards/format_reward": 0.7708333432674408, "step": 177 }, { "clip_ratio": 0.0, "completion_length": 1323.3437881469727, "epoch": 0.18986666666666666, "grad_norm": 2.781611919403076, "kl": 0.4796142578125, "learning_rate": 9.222116981625737e-07, "loss": 0.1292, "reward": 1.552083358168602, "reward_std": 0.40244080126285553, "rewards/accuracy_reward": 0.7500000074505806, "rewards/format_reward": 0.8020833507180214, "step": 178 }, { "clip_ratio": 0.0, "completion_length": 1167.6667022705078, "epoch": 0.19093333333333334, "grad_norm": 3.0870985984802246, "kl": 0.560302734375, "learning_rate": 9.213616791227157e-07, "loss": 0.1203, "reward": 1.5937500353902578, "reward_std": 0.3326336592435837, "rewards/accuracy_reward": 0.7604166744276881, "rewards/format_reward": 0.8333333460614085, "step": 179 }, { "clip_ratio": 0.0, "completion_length": 1465.5729446411133, "epoch": 0.192, "grad_norm": 2.4987359046936035, "kl": 0.600830078125, "learning_rate": 9.2050748546003e-07, "loss": 0.0579, "reward": 1.5937500298023224, "reward_std": 0.4246576987206936, "rewards/accuracy_reward": 0.7604166772216558, "rewards/format_reward": 0.8333333432674408, "step": 180 }, { "clip_ratio": 0.0, "completion_length": 1409.6771087646484, "epoch": 0.19306666666666666, "grad_norm": 2.7553412914276123, "kl": 0.63348388671875, "learning_rate": 9.196491267768442e-07, "loss": 0.1842, "reward": 1.3229166995733976, "reward_std": 0.4850395992398262, "rewards/accuracy_reward": 0.5729166753590107, "rewards/format_reward": 0.7500000204890966, "step": 181 }, { "clip_ratio": 0.0, "completion_length": 2092.802146911621, "epoch": 0.19413333333333332, "grad_norm": 8.406367301940918, "kl": 1.5791015625, "learning_rate": 9.187866127223061e-07, "loss": 0.1345, "reward": 1.0312500186264515, "reward_std": 0.5448186621069908, "rewards/accuracy_reward": 0.416666672565043, "rewards/format_reward": 0.6145833507180214, "step": 182 }, { "clip_ratio": 0.0, "completion_length": 1508.90629196167, "epoch": 0.1952, "grad_norm": 823.5792846679688, "kl": 3.7071533203125, "learning_rate": 9.179199529922757e-07, "loss": 0.3802, "reward": 1.3020833507180214, "reward_std": 0.3260840103030205, "rewards/accuracy_reward": 0.5000000027939677, "rewards/format_reward": 0.8020833507180214, "step": 183 }, { "clip_ratio": 0.0, "completion_length": 1002.6562728881836, "epoch": 0.19626666666666667, "grad_norm": 2.981387138366699, "kl": 0.3248291015625, "learning_rate": 9.170491573292162e-07, "loss": 0.1318, "reward": 1.6875000298023224, "reward_std": 0.5356915444135666, "rewards/accuracy_reward": 0.8020833544433117, "rewards/format_reward": 0.8854166865348816, "step": 184 }, { "clip_ratio": 0.0, "completion_length": 1552.125057220459, "epoch": 0.19733333333333333, "grad_norm": 2133.77783203125, "kl": 10.4677734375, "learning_rate": 9.161742355220844e-07, "loss": 0.5841, "reward": 1.3854166977107525, "reward_std": 0.4865904748439789, "rewards/accuracy_reward": 0.6354166744276881, "rewards/format_reward": 0.7500000167638063, "step": 185 }, { "clip_ratio": 0.0, "completion_length": 1251.4167022705078, "epoch": 0.1984, "grad_norm": 2.3743228912353516, "kl": 0.48040771484375, "learning_rate": 9.152951974062208e-07, "loss": 0.1388, "reward": 1.5625000223517418, "reward_std": 0.28655456006526947, "rewards/accuracy_reward": 0.7083333432674408, "rewards/format_reward": 0.8541666716337204, "step": 186 }, { "clip_ratio": 0.0, "completion_length": 1543.9479331970215, "epoch": 0.19946666666666665, "grad_norm": 4.138925075531006, "kl": 0.9083251953125, "learning_rate": 9.14412052863239e-07, "loss": 0.1608, "reward": 1.2395833432674408, "reward_std": 0.36822448298335075, "rewards/accuracy_reward": 0.4479166716337204, "rewards/format_reward": 0.7916666716337204, "step": 187 }, { "clip_ratio": 0.0, "completion_length": 1250.7083892822266, "epoch": 0.20053333333333334, "grad_norm": 2.5038328170776367, "kl": 0.285308837890625, "learning_rate": 9.135248118209142e-07, "loss": 0.0744, "reward": 1.5937500149011612, "reward_std": 0.4877205602824688, "rewards/accuracy_reward": 0.8020833432674408, "rewards/format_reward": 0.7916666865348816, "step": 188 }, { "clip_ratio": 0.0, "completion_length": 1619.2813262939453, "epoch": 0.2016, "grad_norm": 4.1387553215026855, "kl": 0.70965576171875, "learning_rate": 9.126334842530727e-07, "loss": 0.181, "reward": 1.3125000298023224, "reward_std": 0.5841807760298252, "rewards/accuracy_reward": 0.541666679084301, "rewards/format_reward": 0.7708333507180214, "step": 189 }, { "clip_ratio": 0.0, "completion_length": 1334.3125381469727, "epoch": 0.20266666666666666, "grad_norm": 3.0489108562469482, "kl": 0.273712158203125, "learning_rate": 9.117380801794782e-07, "loss": 0.0497, "reward": 1.6979166939854622, "reward_std": 0.37816476821899414, "rewards/accuracy_reward": 0.8229166753590107, "rewards/format_reward": 0.8750000149011612, "step": 190 }, { "clip_ratio": 0.0, "completion_length": 1031.0937805175781, "epoch": 0.20373333333333332, "grad_norm": 25.862136840820312, "kl": 0.43017578125, "learning_rate": 9.108386096657203e-07, "loss": 0.2957, "reward": 1.6562500298023224, "reward_std": 0.5250744894146919, "rewards/accuracy_reward": 0.7812500111758709, "rewards/format_reward": 0.8750000149011612, "step": 191 }, { "clip_ratio": 0.0, "completion_length": 1036.239601135254, "epoch": 0.2048, "grad_norm": 1.1769046783447266, "kl": 0.182647705078125, "learning_rate": 9.099350828231014e-07, "loss": 0.0343, "reward": 1.7604167014360428, "reward_std": 0.3228110000491142, "rewards/accuracy_reward": 0.8125000111758709, "rewards/format_reward": 0.9479166716337204, "step": 192 }, { "clip_ratio": 0.0, "completion_length": 1024.6771240234375, "epoch": 0.20586666666666667, "grad_norm": 2.477172374725342, "kl": 0.118377685546875, "learning_rate": 9.090275098085224e-07, "loss": 0.0692, "reward": 1.8750000298023224, "reward_std": 0.22782234475016594, "rewards/accuracy_reward": 0.9375, "rewards/format_reward": 0.9375000074505806, "step": 193 }, { "clip_ratio": 0.0, "completion_length": 1756.8437881469727, "epoch": 0.20693333333333333, "grad_norm": 6.860195159912109, "kl": 0.63568115234375, "learning_rate": 9.081159008243687e-07, "loss": 0.161, "reward": 1.3854166939854622, "reward_std": 0.4775024279952049, "rewards/accuracy_reward": 0.604166679084301, "rewards/format_reward": 0.7812500149011612, "step": 194 }, { "clip_ratio": 0.0, "completion_length": 1987.7292251586914, "epoch": 0.208, "grad_norm": 2.2822182178497314, "kl": 0.4808349609375, "learning_rate": 9.072002661183958e-07, "loss": 0.074, "reward": 1.3541666977107525, "reward_std": 0.4726897403597832, "rewards/accuracy_reward": 0.5729166697710752, "rewards/format_reward": 0.7812500074505806, "step": 195 }, { "clip_ratio": 0.0, "completion_length": 1837.8542175292969, "epoch": 0.20906666666666668, "grad_norm": 1.931454062461853, "kl": 0.517608642578125, "learning_rate": 9.06280615983614e-07, "loss": 0.0742, "reward": 1.3333333805203438, "reward_std": 0.4153929352760315, "rewards/accuracy_reward": 0.5312500149011612, "rewards/format_reward": 0.8020833507180214, "step": 196 }, { "clip_ratio": 0.0, "completion_length": 1430.0416870117188, "epoch": 0.21013333333333334, "grad_norm": 4.724989891052246, "kl": 0.328369140625, "learning_rate": 9.053569607581725e-07, "loss": 0.0822, "reward": 1.6250000521540642, "reward_std": 0.421865351498127, "rewards/accuracy_reward": 0.8020833414047956, "rewards/format_reward": 0.822916679084301, "step": 197 }, { "clip_ratio": 0.0, "completion_length": 1870.3229522705078, "epoch": 0.2112, "grad_norm": 38.04512405395508, "kl": 1.106658935546875, "learning_rate": 9.044293108252431e-07, "loss": 0.1124, "reward": 1.3541666865348816, "reward_std": 0.3369580917060375, "rewards/accuracy_reward": 0.6041666744276881, "rewards/format_reward": 0.7500000055879354, "step": 198 }, { "clip_ratio": 0.0, "completion_length": 1582.1250305175781, "epoch": 0.21226666666666666, "grad_norm": 369.296630859375, "kl": 2.8210601806640625, "learning_rate": 9.034976766129041e-07, "loss": 0.269, "reward": 1.7187500298023224, "reward_std": 0.27324533089995384, "rewards/accuracy_reward": 0.781250006519258, "rewards/format_reward": 0.9375000074505806, "step": 199 }, { "clip_ratio": 0.0, "completion_length": 1518.6875457763672, "epoch": 0.21333333333333335, "grad_norm": 5.9838151931762695, "kl": 0.283599853515625, "learning_rate": 9.025620685940222e-07, "loss": 0.0134, "reward": 1.5625000447034836, "reward_std": 0.35411839932203293, "rewards/accuracy_reward": 0.6979166697710752, "rewards/format_reward": 0.8645833432674408, "step": 200 }, { "clip_ratio": 0.0, "completion_length": 1538.552146911621, "epoch": 0.2144, "grad_norm": 2.6527292728424072, "kl": 0.361053466796875, "learning_rate": 9.016224972861356e-07, "loss": 0.0427, "reward": 1.4479166865348816, "reward_std": 0.2920834720134735, "rewards/accuracy_reward": 0.6145833358168602, "rewards/format_reward": 0.8333333507180214, "step": 201 }, { "clip_ratio": 0.0, "completion_length": 1743.1771392822266, "epoch": 0.21546666666666667, "grad_norm": 33.810062408447266, "kl": 0.704925537109375, "learning_rate": 9.006789732513354e-07, "loss": 0.174, "reward": 1.583333358168602, "reward_std": 0.3741266131401062, "rewards/accuracy_reward": 0.7395833414047956, "rewards/format_reward": 0.8437500037252903, "step": 202 }, { "clip_ratio": 0.0, "completion_length": 1758.4062957763672, "epoch": 0.21653333333333333, "grad_norm": 2.123501777648926, "kl": 0.34552001953125, "learning_rate": 8.997315070961464e-07, "loss": 0.0959, "reward": 1.552083358168602, "reward_std": 0.36208200454711914, "rewards/accuracy_reward": 0.6666666772216558, "rewards/format_reward": 0.8854166716337204, "step": 203 }, { "clip_ratio": 0.0, "completion_length": 1555.0000534057617, "epoch": 0.2176, "grad_norm": 4.16554594039917, "kl": 0.2510986328125, "learning_rate": 8.987801094714088e-07, "loss": 0.0441, "reward": 1.468750037252903, "reward_std": 0.3903345912694931, "rewards/accuracy_reward": 0.6250000149011612, "rewards/format_reward": 0.8437500074505806, "step": 204 }, { "clip_ratio": 0.0, "completion_length": 1415.7292175292969, "epoch": 0.21866666666666668, "grad_norm": 6.0451741218566895, "kl": 0.191558837890625, "learning_rate": 8.978247910721578e-07, "loss": 0.1557, "reward": 1.7291667312383652, "reward_std": 0.3799184523522854, "rewards/accuracy_reward": 0.7500000102445483, "rewards/format_reward": 0.9791666716337204, "step": 205 }, { "clip_ratio": 0.0, "completion_length": 1875.3229675292969, "epoch": 0.21973333333333334, "grad_norm": 9.641579627990723, "kl": 0.26275634765625, "learning_rate": 8.968655626375038e-07, "loss": 0.0559, "reward": 1.7500000447034836, "reward_std": 0.35534025728702545, "rewards/accuracy_reward": 0.8125000223517418, "rewards/format_reward": 0.9375000074505806, "step": 206 }, { "clip_ratio": 0.0, "completion_length": 2336.104217529297, "epoch": 0.2208, "grad_norm": 43.12470626831055, "kl": 0.88690185546875, "learning_rate": 8.959024349505108e-07, "loss": 0.2316, "reward": 1.1250000149011612, "reward_std": 0.492625392973423, "rewards/accuracy_reward": 0.4062500102445483, "rewards/format_reward": 0.7187500149011612, "step": 207 }, { "clip_ratio": 0.0, "completion_length": 2190.8021392822266, "epoch": 0.22186666666666666, "grad_norm": 4.655129909515381, "kl": 0.41680908203125, "learning_rate": 8.949354188380768e-07, "loss": -0.0172, "reward": 1.2708333730697632, "reward_std": 0.4765875115990639, "rewards/accuracy_reward": 0.4895833423361182, "rewards/format_reward": 0.7812500298023224, "step": 208 }, { "clip_ratio": 0.0, "completion_length": 1868.5938186645508, "epoch": 0.22293333333333334, "grad_norm": 2.240370750427246, "kl": 0.225982666015625, "learning_rate": 8.939645251708102e-07, "loss": 0.1364, "reward": 1.4687500335276127, "reward_std": 0.3428783416748047, "rewards/accuracy_reward": 0.6458333432674408, "rewards/format_reward": 0.8229166753590107, "step": 209 }, { "clip_ratio": 0.0, "completion_length": 1668.5000457763672, "epoch": 0.224, "grad_norm": 4.978948593139648, "kl": 0.326690673828125, "learning_rate": 8.92989764862909e-07, "loss": 0.1636, "reward": 1.4791666939854622, "reward_std": 0.4652542732656002, "rewards/accuracy_reward": 0.6979166744276881, "rewards/format_reward": 0.7812500111758709, "step": 210 }, { "clip_ratio": 0.0, "completion_length": 1820.1250457763672, "epoch": 0.22506666666666666, "grad_norm": 5.223400115966797, "kl": 0.2906494140625, "learning_rate": 8.920111488720378e-07, "loss": 0.1686, "reward": 1.4479166939854622, "reward_std": 0.4606770984828472, "rewards/accuracy_reward": 0.6250000055879354, "rewards/format_reward": 0.8229166716337204, "step": 211 }, { "clip_ratio": 0.0, "completion_length": 1564.5000534057617, "epoch": 0.22613333333333333, "grad_norm": 1.587716817855835, "kl": 0.1948089599609375, "learning_rate": 8.91028688199204e-07, "loss": 0.0083, "reward": 1.7187500447034836, "reward_std": 0.31057000532746315, "rewards/accuracy_reward": 0.8125000074505806, "rewards/format_reward": 0.9062500149011612, "step": 212 }, { "clip_ratio": 0.0, "completion_length": 2046.0312805175781, "epoch": 0.2272, "grad_norm": 4.068866729736328, "kl": 0.438812255859375, "learning_rate": 8.900423938886345e-07, "loss": 0.0758, "reward": 1.3541666939854622, "reward_std": 0.5512768216431141, "rewards/accuracy_reward": 0.5625000176951289, "rewards/format_reward": 0.791666679084301, "step": 213 }, { "clip_ratio": 0.0, "completion_length": 2010.3437881469727, "epoch": 0.22826666666666667, "grad_norm": 3.046147108078003, "kl": 0.193939208984375, "learning_rate": 8.890522770276525e-07, "loss": 0.0371, "reward": 1.4270833656191826, "reward_std": 0.38436976447701454, "rewards/accuracy_reward": 0.5416666818782687, "rewards/format_reward": 0.8854166865348816, "step": 214 }, { "clip_ratio": 0.0, "completion_length": 1691.1250381469727, "epoch": 0.22933333333333333, "grad_norm": 2.5607383251190186, "kl": 0.406585693359375, "learning_rate": 8.88058348746551e-07, "loss": 0.0592, "reward": 1.5000000484287739, "reward_std": 0.4401639252901077, "rewards/accuracy_reward": 0.6458333469927311, "rewards/format_reward": 0.8541666753590107, "step": 215 }, { "clip_ratio": 0.0, "completion_length": 2202.322998046875, "epoch": 0.2304, "grad_norm": 3.439923048019409, "kl": 0.39349365234375, "learning_rate": 8.870606202184695e-07, "loss": 0.1401, "reward": 1.447916692122817, "reward_std": 0.4352228157222271, "rewards/accuracy_reward": 0.6562500149011612, "rewards/format_reward": 0.7916666772216558, "step": 216 }, { "clip_ratio": 0.0, "completion_length": 1404.239601135254, "epoch": 0.23146666666666665, "grad_norm": 1.5993566513061523, "kl": 0.17138671875, "learning_rate": 8.860591026592667e-07, "loss": -0.0011, "reward": 1.5729167014360428, "reward_std": 0.3325564116239548, "rewards/accuracy_reward": 0.7083333367481828, "rewards/format_reward": 0.8645833432674408, "step": 217 }, { "clip_ratio": 0.0, "completion_length": 2226.572982788086, "epoch": 0.23253333333333334, "grad_norm": 1.7370694875717163, "kl": 0.523681640625, "learning_rate": 8.850538073273958e-07, "loss": 0.0229, "reward": 1.0833333618938923, "reward_std": 0.5069956891238689, "rewards/accuracy_reward": 0.4062500176951289, "rewards/format_reward": 0.6770833469927311, "step": 218 }, { "clip_ratio": 0.0, "completion_length": 1870.9688110351562, "epoch": 0.2336, "grad_norm": 3.033568859100342, "kl": 0.16973876953125, "learning_rate": 8.840447455237776e-07, "loss": 0.0493, "reward": 1.395833358168602, "reward_std": 0.5017962045967579, "rewards/accuracy_reward": 0.6250000139698386, "rewards/format_reward": 0.7708333395421505, "step": 219 }, { "clip_ratio": 0.0, "completion_length": 1831.0937957763672, "epoch": 0.23466666666666666, "grad_norm": 4.48084831237793, "kl": 0.133392333984375, "learning_rate": 8.830319285916729e-07, "loss": 0.0094, "reward": 1.6562500447034836, "reward_std": 0.37595532462000847, "rewards/accuracy_reward": 0.760416679084301, "rewards/format_reward": 0.8958333432674408, "step": 220 }, { "clip_ratio": 0.0, "completion_length": 1412.8646278381348, "epoch": 0.23573333333333332, "grad_norm": 2.2837796211242676, "kl": 0.262908935546875, "learning_rate": 8.820153679165556e-07, "loss": -0.0129, "reward": 1.4062500037252903, "reward_std": 0.2850641906261444, "rewards/accuracy_reward": 0.5729166772216558, "rewards/format_reward": 0.8333333469927311, "step": 221 }, { "clip_ratio": 0.0, "completion_length": 1654.0209121704102, "epoch": 0.2368, "grad_norm": 10.215902328491211, "kl": 0.2847747802734375, "learning_rate": 8.809950749259846e-07, "loss": 0.0344, "reward": 1.6979166939854622, "reward_std": 0.3535758815705776, "rewards/accuracy_reward": 0.8125000074505806, "rewards/format_reward": 0.885416679084301, "step": 222 }, { "clip_ratio": 0.0, "completion_length": 2406.4896392822266, "epoch": 0.23786666666666667, "grad_norm": 1.5515832901000977, "kl": 0.2773895263671875, "learning_rate": 8.799710610894747e-07, "loss": 0.0704, "reward": 1.2708333432674408, "reward_std": 0.4076950065791607, "rewards/accuracy_reward": 0.4687500074505806, "rewards/format_reward": 0.8020833432674408, "step": 223 }, { "clip_ratio": 0.0, "completion_length": 1814.864631652832, "epoch": 0.23893333333333333, "grad_norm": 1.9577429294586182, "kl": 0.2260894775390625, "learning_rate": 8.789433379183688e-07, "loss": 0.0175, "reward": 1.406250037252903, "reward_std": 0.4987129122018814, "rewards/accuracy_reward": 0.5833333460614085, "rewards/format_reward": 0.822916679084301, "step": 224 }, { "clip_ratio": 0.0, "completion_length": 1767.1771392822266, "epoch": 0.24, "grad_norm": 1.6191619634628296, "kl": 0.3135986328125, "learning_rate": 8.779119169657077e-07, "loss": 0.153, "reward": 1.406250037252903, "reward_std": 0.3813643418252468, "rewards/accuracy_reward": 0.5625000074505806, "rewards/format_reward": 0.8437500149011612, "step": 225 }, { "clip_ratio": 0.0, "completion_length": 2248.7188568115234, "epoch": 0.24106666666666668, "grad_norm": 3.2142527103424072, "kl": 0.26239013671875, "learning_rate": 8.768768098261001e-07, "loss": 0.0698, "reward": 1.3645833507180214, "reward_std": 0.399388175457716, "rewards/accuracy_reward": 0.5625000158324838, "rewards/format_reward": 0.8020833395421505, "step": 226 }, { "clip_ratio": 0.0, "completion_length": 1429.9062805175781, "epoch": 0.24213333333333334, "grad_norm": 1.2355214357376099, "kl": 0.219329833984375, "learning_rate": 8.758380281355932e-07, "loss": 0.0195, "reward": 1.6875000596046448, "reward_std": 0.5177478417754173, "rewards/accuracy_reward": 0.7916666939854622, "rewards/format_reward": 0.8958333507180214, "step": 227 }, { "clip_ratio": 0.0, "completion_length": 1607.8438262939453, "epoch": 0.2432, "grad_norm": 1.5416598320007324, "kl": 0.279754638671875, "learning_rate": 8.747955835715406e-07, "loss": 0.049, "reward": 1.395833358168602, "reward_std": 0.4821494333446026, "rewards/accuracy_reward": 0.614583351649344, "rewards/format_reward": 0.7812500074505806, "step": 228 }, { "clip_ratio": 0.0, "completion_length": 1573.9687957763672, "epoch": 0.24426666666666666, "grad_norm": 5.66356086730957, "kl": 0.438232421875, "learning_rate": 8.737494878524723e-07, "loss": 0.2043, "reward": 1.0833333730697632, "reward_std": 0.5594120845198631, "rewards/accuracy_reward": 0.38541667629033327, "rewards/format_reward": 0.6979166753590107, "step": 229 }, { "clip_ratio": 0.0, "completion_length": 2063.9792251586914, "epoch": 0.24533333333333332, "grad_norm": 1.9646347761154175, "kl": 0.25408935546875, "learning_rate": 8.726997527379619e-07, "loss": -0.0002, "reward": 1.4062500149011612, "reward_std": 0.289574958384037, "rewards/accuracy_reward": 0.6770833432674408, "rewards/format_reward": 0.729166679084301, "step": 230 }, { "clip_ratio": 0.0, "completion_length": 1920.5000610351562, "epoch": 0.2464, "grad_norm": 1.053942322731018, "kl": 0.24371337890625, "learning_rate": 8.716463900284949e-07, "loss": 0.021, "reward": 1.541666716337204, "reward_std": 0.32670827955007553, "rewards/accuracy_reward": 0.5937500074505806, "rewards/format_reward": 0.947916679084301, "step": 231 }, { "clip_ratio": 0.0, "completion_length": 2199.4584045410156, "epoch": 0.24746666666666667, "grad_norm": 2.894688129425049, "kl": 0.552978515625, "learning_rate": 8.705894115653365e-07, "loss": 0.0639, "reward": 1.1041666939854622, "reward_std": 0.5019615069031715, "rewards/accuracy_reward": 0.37500000558793545, "rewards/format_reward": 0.7291666828095913, "step": 232 }, { "clip_ratio": 0.0, "completion_length": 1793.197982788086, "epoch": 0.24853333333333333, "grad_norm": 1.487608551979065, "kl": 0.236328125, "learning_rate": 8.695288292303977e-07, "loss": 0.0217, "reward": 1.5833333879709244, "reward_std": 0.39883434772491455, "rewards/accuracy_reward": 0.656250013038516, "rewards/format_reward": 0.9270833507180214, "step": 233 }, { "clip_ratio": 0.0, "completion_length": 1817.4479751586914, "epoch": 0.2496, "grad_norm": 1.0141682624816895, "kl": 0.24725341796875, "learning_rate": 8.684646549461016e-07, "loss": 0.0822, "reward": 1.4479166939854622, "reward_std": 0.5110773295164108, "rewards/accuracy_reward": 0.6041666800156236, "rewards/format_reward": 0.8437500149011612, "step": 234 }, { "clip_ratio": 0.0, "completion_length": 1853.3334007263184, "epoch": 0.25066666666666665, "grad_norm": 0.8220115900039673, "kl": 0.2499847412109375, "learning_rate": 8.67396900675251e-07, "loss": 0.0792, "reward": 1.5729166865348816, "reward_std": 0.35277800261974335, "rewards/accuracy_reward": 0.6770833432674408, "rewards/format_reward": 0.8958333432674408, "step": 235 }, { "clip_ratio": 0.0, "completion_length": 1562.8021049499512, "epoch": 0.2517333333333333, "grad_norm": 1.1124240159988403, "kl": 2.553314208984375, "learning_rate": 8.663255784208915e-07, "loss": 0.0649, "reward": 1.5104166865348816, "reward_std": 0.24479617923498154, "rewards/accuracy_reward": 0.6041666669771075, "rewards/format_reward": 0.9062500074505806, "step": 236 }, { "clip_ratio": 0.0, "completion_length": 1963.0521621704102, "epoch": 0.2528, "grad_norm": 0.9364346861839294, "kl": 0.398712158203125, "learning_rate": 8.652507002261783e-07, "loss": 0.0469, "reward": 1.2708333507180214, "reward_std": 0.3397647328674793, "rewards/accuracy_reward": 0.48958334140479565, "rewards/format_reward": 0.7812500074505806, "step": 237 }, { "clip_ratio": 0.0, "completion_length": 1410.1771545410156, "epoch": 0.2538666666666667, "grad_norm": 5.20961856842041, "kl": 0.134521484375, "learning_rate": 8.641722781742404e-07, "loss": 0.0431, "reward": 1.656250037252903, "reward_std": 0.383124440908432, "rewards/accuracy_reward": 0.7187500074505806, "rewards/format_reward": 0.9375000149011612, "step": 238 }, { "clip_ratio": 0.0, "completion_length": 1934.9896087646484, "epoch": 0.25493333333333335, "grad_norm": 1.1437102556228638, "kl": 0.420257568359375, "learning_rate": 8.630903243880447e-07, "loss": 0.2244, "reward": 1.1770833507180214, "reward_std": 0.6597242429852486, "rewards/accuracy_reward": 0.4583333535119891, "rewards/format_reward": 0.7187500149011612, "step": 239 }, { "clip_ratio": 0.0, "completion_length": 2341.1979598999023, "epoch": 0.256, "grad_norm": 0.5519979000091553, "kl": 0.4168548583984375, "learning_rate": 8.620048510302597e-07, "loss": 0.073, "reward": 1.135416679084301, "reward_std": 0.3740782104432583, "rewards/accuracy_reward": 0.38541666977107525, "rewards/format_reward": 0.7500000149011612, "step": 240 }, { "clip_ratio": 0.0, "completion_length": 1429.3646354675293, "epoch": 0.25706666666666667, "grad_norm": 0.7625186443328857, "kl": 0.25, "learning_rate": 8.609158703031184e-07, "loss": 0.1329, "reward": 1.5208333656191826, "reward_std": 0.42619096860289574, "rewards/accuracy_reward": 0.6666666809469461, "rewards/format_reward": 0.854166679084301, "step": 241 }, { "clip_ratio": 0.0, "completion_length": 1913.6771392822266, "epoch": 0.2581333333333333, "grad_norm": 1.0741691589355469, "kl": 0.1340484619140625, "learning_rate": 8.598233944482821e-07, "loss": 0.0499, "reward": 1.6666667088866234, "reward_std": 0.42081642523407936, "rewards/accuracy_reward": 0.729166679084301, "rewards/format_reward": 0.9375000074505806, "step": 242 }, { "clip_ratio": 0.0, "completion_length": 1832.7604751586914, "epoch": 0.2592, "grad_norm": 0.8904412984848022, "kl": 0.185760498046875, "learning_rate": 8.58727435746702e-07, "loss": 0.0958, "reward": 1.6770833656191826, "reward_std": 0.29584166035056114, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.9583333432674408, "step": 243 }, { "clip_ratio": 0.0, "completion_length": 1766.9479446411133, "epoch": 0.26026666666666665, "grad_norm": 0.6860199570655823, "kl": 0.20458984375, "learning_rate": 8.576280065184813e-07, "loss": 0.1256, "reward": 1.5312500298023224, "reward_std": 0.31957533583045006, "rewards/accuracy_reward": 0.6562500102445483, "rewards/format_reward": 0.8750000074505806, "step": 244 }, { "clip_ratio": 0.0, "completion_length": 1486.8125381469727, "epoch": 0.2613333333333333, "grad_norm": 2.836918830871582, "kl": 0.269775390625, "learning_rate": 8.565251191227365e-07, "loss": -0.0003, "reward": 1.6145833656191826, "reward_std": 0.5003794655203819, "rewards/accuracy_reward": 0.7604166846722364, "rewards/format_reward": 0.854166679084301, "step": 245 }, { "clip_ratio": 0.0, "completion_length": 1527.1771697998047, "epoch": 0.2624, "grad_norm": 1.0194785594940186, "kl": 0.6246337890625, "learning_rate": 8.554187859574593e-07, "loss": 0.1018, "reward": 1.510416716337204, "reward_std": 0.46897274628281593, "rewards/accuracy_reward": 0.6458333414047956, "rewards/format_reward": 0.8645833432674408, "step": 246 }, { "clip_ratio": 0.0, "completion_length": 1719.8333892822266, "epoch": 0.2634666666666667, "grad_norm": 0.7223559021949768, "kl": 0.26751708984375, "learning_rate": 8.543090194593762e-07, "loss": 0.0187, "reward": 1.3333333507180214, "reward_std": 0.5536307394504547, "rewards/accuracy_reward": 0.5208333414047956, "rewards/format_reward": 0.8125000223517418, "step": 247 }, { "clip_ratio": 0.0, "completion_length": 2148.697998046875, "epoch": 0.26453333333333334, "grad_norm": 1.1031389236450195, "kl": 0.38037109375, "learning_rate": 8.531958321038091e-07, "loss": 0.1609, "reward": 1.2604166865348816, "reward_std": 0.5420085936784744, "rewards/accuracy_reward": 0.4687500102445483, "rewards/format_reward": 0.791666679084301, "step": 248 }, { "clip_ratio": 0.0, "completion_length": 1545.9583892822266, "epoch": 0.2656, "grad_norm": 1.3137245178222656, "kl": 0.47442626953125, "learning_rate": 8.520792364045358e-07, "loss": 0.1104, "reward": 1.3020833656191826, "reward_std": 0.5698424428701401, "rewards/accuracy_reward": 0.5000000102445483, "rewards/format_reward": 0.802083358168602, "step": 249 }, { "clip_ratio": 0.0, "completion_length": 1894.4479751586914, "epoch": 0.26666666666666666, "grad_norm": 0.521087110042572, "kl": 0.237762451171875, "learning_rate": 8.509592449136476e-07, "loss": 0.056, "reward": 1.239583358168602, "reward_std": 0.5633081533014774, "rewards/accuracy_reward": 0.4270833432674408, "rewards/format_reward": 0.8125000074505806, "step": 250 }, { "clip_ratio": 0.0, "completion_length": 1502.2396240234375, "epoch": 0.2677333333333333, "grad_norm": 1.2082853317260742, "kl": 0.15814208984375, "learning_rate": 8.498358702214099e-07, "loss": 0.1036, "reward": 1.5416666939854622, "reward_std": 0.3727410286664963, "rewards/accuracy_reward": 0.6666666846722364, "rewards/format_reward": 0.8750000074505806, "step": 251 }, { "clip_ratio": 0.0, "completion_length": 2058.385482788086, "epoch": 0.2688, "grad_norm": 0.6066608428955078, "kl": 0.287567138671875, "learning_rate": 8.487091249561201e-07, "loss": 0.0874, "reward": 1.4062500298023224, "reward_std": 0.5151247978210449, "rewards/accuracy_reward": 0.5833333367481828, "rewards/format_reward": 0.822916679084301, "step": 252 }, { "clip_ratio": 0.0, "completion_length": 1651.7083587646484, "epoch": 0.26986666666666664, "grad_norm": 0.4232212007045746, "kl": 0.1504974365234375, "learning_rate": 8.475790217839649e-07, "loss": 0.0079, "reward": 1.562500037252903, "reward_std": 0.5178634375333786, "rewards/accuracy_reward": 0.6770833432674408, "rewards/format_reward": 0.885416679084301, "step": 253 }, { "clip_ratio": 0.0, "completion_length": 1870.8541946411133, "epoch": 0.27093333333333336, "grad_norm": 0.6529887318611145, "kl": 0.118377685546875, "learning_rate": 8.464455734088792e-07, "loss": 0.1535, "reward": 1.6979166865348816, "reward_std": 0.37109608575701714, "rewards/accuracy_reward": 0.7500000111758709, "rewards/format_reward": 0.9479166716337204, "step": 254 }, { "clip_ratio": 0.0, "completion_length": 1639.6354446411133, "epoch": 0.272, "grad_norm": 0.809012234210968, "kl": 0.226409912109375, "learning_rate": 8.453087925724023e-07, "loss": 0.1889, "reward": 1.5729167088866234, "reward_std": 0.45508959889411926, "rewards/accuracy_reward": 0.6875000149011612, "rewards/format_reward": 0.885416679084301, "step": 255 }, { "clip_ratio": 0.0, "completion_length": 1709.0938262939453, "epoch": 0.2730666666666667, "grad_norm": 1.7895228862762451, "kl": 0.349212646484375, "learning_rate": 8.441686920535352e-07, "loss": 0.2666, "reward": 1.3958333879709244, "reward_std": 0.6179759874939919, "rewards/accuracy_reward": 0.5416666828095913, "rewards/format_reward": 0.854166679084301, "step": 256 }, { "clip_ratio": 0.0, "completion_length": 1488.8333740234375, "epoch": 0.27413333333333334, "grad_norm": 0.6860136389732361, "kl": 0.293182373046875, "learning_rate": 8.430252846685965e-07, "loss": 0.0801, "reward": 1.6875000149011612, "reward_std": 0.34433629736304283, "rewards/accuracy_reward": 0.7812500037252903, "rewards/format_reward": 0.9062500074505806, "step": 257 }, { "clip_ratio": 0.0, "completion_length": 1531.1667251586914, "epoch": 0.2752, "grad_norm": 2.2987053394317627, "kl": 0.63665771484375, "learning_rate": 8.418785832710787e-07, "loss": 0.2211, "reward": 1.4479166977107525, "reward_std": 0.402406208217144, "rewards/accuracy_reward": 0.5937500102445483, "rewards/format_reward": 0.8541666828095913, "step": 258 }, { "clip_ratio": 0.0, "completion_length": 2180.0313034057617, "epoch": 0.27626666666666666, "grad_norm": 1.2567552328109741, "kl": 0.56939697265625, "learning_rate": 8.407286007515039e-07, "loss": 0.0858, "reward": 1.1770833507180214, "reward_std": 0.538055032491684, "rewards/accuracy_reward": 0.3958333395421505, "rewards/format_reward": 0.7812500149011612, "step": 259 }, { "clip_ratio": 0.0, "completion_length": 1646.302116394043, "epoch": 0.2773333333333333, "grad_norm": 0.8053186535835266, "kl": 0.32696533203125, "learning_rate": 8.395753500372778e-07, "loss": 0.0971, "reward": 1.4375000447034836, "reward_std": 0.5081432908773422, "rewards/accuracy_reward": 0.5416666809469461, "rewards/format_reward": 0.8958333507180214, "step": 260 }, { "clip_ratio": 0.0, "completion_length": 1178.7917137145996, "epoch": 0.2784, "grad_norm": 1.0854579210281372, "kl": 0.385986328125, "learning_rate": 8.384188440925463e-07, "loss": 0.1312, "reward": 1.6770833730697632, "reward_std": 0.38609835505485535, "rewards/accuracy_reward": 0.7604166753590107, "rewards/format_reward": 0.916666679084301, "step": 261 }, { "clip_ratio": 0.0, "completion_length": 1783.7187957763672, "epoch": 0.27946666666666664, "grad_norm": 1.6074930429458618, "kl": 0.5771484375, "learning_rate": 8.372590959180476e-07, "loss": 0.1793, "reward": 1.2083333432674408, "reward_std": 0.510901901870966, "rewards/accuracy_reward": 0.4479166781529784, "rewards/format_reward": 0.7604166865348816, "step": 262 }, { "clip_ratio": 0.0, "completion_length": 1618.8854446411133, "epoch": 0.28053333333333336, "grad_norm": 1.0961273908615112, "kl": 0.446044921875, "learning_rate": 8.360961185509678e-07, "loss": 0.2013, "reward": 1.3229167014360428, "reward_std": 0.5566804073750973, "rewards/accuracy_reward": 0.4791666707023978, "rewards/format_reward": 0.8437500149011612, "step": 263 }, { "clip_ratio": 0.0, "completion_length": 1980.6667175292969, "epoch": 0.2816, "grad_norm": 2.437978982925415, "kl": 0.559326171875, "learning_rate": 8.34929925064793e-07, "loss": 0.1602, "reward": 1.1979167088866234, "reward_std": 0.472545325756073, "rewards/accuracy_reward": 0.3750000102445483, "rewards/format_reward": 0.822916679084301, "step": 264 }, { "clip_ratio": 0.0, "completion_length": 1982.5105056762695, "epoch": 0.2826666666666667, "grad_norm": 1.1949418783187866, "kl": 0.46533203125, "learning_rate": 8.337605285691632e-07, "loss": 0.2213, "reward": 1.0312500223517418, "reward_std": 0.6169419474899769, "rewards/accuracy_reward": 0.2604166716337204, "rewards/format_reward": 0.7708333358168602, "step": 265 }, { "clip_ratio": 0.0, "completion_length": 1638.0625305175781, "epoch": 0.28373333333333334, "grad_norm": 0.8912386894226074, "kl": 0.3704833984375, "learning_rate": 8.325879422097249e-07, "loss": 0.1251, "reward": 1.4270833693444729, "reward_std": 0.4173879250884056, "rewards/accuracy_reward": 0.6250000102445483, "rewards/format_reward": 0.8020833432674408, "step": 266 }, { "clip_ratio": 0.0, "completion_length": 1974.5521392822266, "epoch": 0.2848, "grad_norm": 0.7802460789680481, "kl": 0.62841796875, "learning_rate": 8.314121791679833e-07, "loss": 0.2093, "reward": 1.083333358168602, "reward_std": 0.6368258446455002, "rewards/accuracy_reward": 0.3854166744276881, "rewards/format_reward": 0.697916679084301, "step": 267 }, { "clip_ratio": 0.0, "completion_length": 1408.635440826416, "epoch": 0.28586666666666666, "grad_norm": 0.7876457571983337, "kl": 0.21978759765625, "learning_rate": 8.302332526611531e-07, "loss": 0.0678, "reward": 1.6979167014360428, "reward_std": 0.3801100105047226, "rewards/accuracy_reward": 0.8125000176951289, "rewards/format_reward": 0.885416679084301, "step": 268 }, { "clip_ratio": 0.0, "completion_length": 1472.9479446411133, "epoch": 0.2869333333333333, "grad_norm": 0.9652960896492004, "kl": 0.274658203125, "learning_rate": 8.290511759420114e-07, "loss": 0.1414, "reward": 1.5833333879709244, "reward_std": 0.4702134057879448, "rewards/accuracy_reward": 0.656250006519258, "rewards/format_reward": 0.9270833432674408, "step": 269 }, { "clip_ratio": 0.0, "completion_length": 2061.166717529297, "epoch": 0.288, "grad_norm": 1.0656603574752808, "kl": 0.4866943359375, "learning_rate": 8.278659622987482e-07, "loss": 0.2411, "reward": 1.2187500298023224, "reward_std": 0.5930022709071636, "rewards/accuracy_reward": 0.47916667722165585, "rewards/format_reward": 0.739583358168602, "step": 270 }, { "clip_ratio": 0.0, "completion_length": 1407.2292098999023, "epoch": 0.2890666666666667, "grad_norm": 0.9060713052749634, "kl": 0.3543701171875, "learning_rate": 8.266776250548164e-07, "loss": 0.1955, "reward": 1.5625000298023224, "reward_std": 0.4194052144885063, "rewards/accuracy_reward": 0.6770833432674408, "rewards/format_reward": 0.8854166865348816, "step": 271 }, { "clip_ratio": 0.0, "completion_length": 1137.5833625793457, "epoch": 0.29013333333333335, "grad_norm": 0.908098578453064, "kl": 0.6112060546875, "learning_rate": 8.254861775687829e-07, "loss": 0.158, "reward": 1.552083358168602, "reward_std": 0.4453302435576916, "rewards/accuracy_reward": 0.6979166818782687, "rewards/format_reward": 0.854166679084301, "step": 272 }, { "clip_ratio": 0.0, "completion_length": 1834.9167022705078, "epoch": 0.2912, "grad_norm": 1.4929542541503906, "kl": 0.2888641357421875, "learning_rate": 8.242916332341776e-07, "loss": 0.1286, "reward": 1.4895833805203438, "reward_std": 0.5719096660614014, "rewards/accuracy_reward": 0.6770833469927311, "rewards/format_reward": 0.8125000111758709, "step": 273 }, { "clip_ratio": 0.0, "completion_length": 1654.4792022705078, "epoch": 0.2922666666666667, "grad_norm": 0.5312554240226746, "kl": 0.364959716796875, "learning_rate": 8.230940054793441e-07, "loss": 0.1397, "reward": 1.4479167088866234, "reward_std": 0.4793417602777481, "rewards/accuracy_reward": 0.6145833432674408, "rewards/format_reward": 0.8333333432674408, "step": 274 }, { "clip_ratio": 0.0, "completion_length": 1256.3542022705078, "epoch": 0.29333333333333333, "grad_norm": 0.5127818584442139, "kl": 0.205474853515625, "learning_rate": 8.218933077672872e-07, "loss": 0.0392, "reward": 1.5625000298023224, "reward_std": 0.414396308362484, "rewards/accuracy_reward": 0.6770833414047956, "rewards/format_reward": 0.885416679084301, "step": 275 }, { "clip_ratio": 0.0, "completion_length": 977.2500267028809, "epoch": 0.2944, "grad_norm": 0.5600444674491882, "kl": 0.165374755859375, "learning_rate": 8.206895535955225e-07, "loss": 0.0141, "reward": 1.7916667014360428, "reward_std": 0.3620692007243633, "rewards/accuracy_reward": 0.8645833432674408, "rewards/format_reward": 0.9270833432674408, "step": 276 }, { "clip_ratio": 0.0, "completion_length": 1417.5417137145996, "epoch": 0.29546666666666666, "grad_norm": 0.8361416459083557, "kl": 0.26031494140625, "learning_rate": 8.194827564959247e-07, "loss": -0.0153, "reward": 1.3333333805203438, "reward_std": 0.5745716989040375, "rewards/accuracy_reward": 0.5000000111758709, "rewards/format_reward": 0.8333333507180214, "step": 277 }, { "clip_ratio": 0.0, "completion_length": 1347.6250686645508, "epoch": 0.2965333333333333, "grad_norm": 0.8475180268287659, "kl": 0.2686767578125, "learning_rate": 8.182729300345748e-07, "loss": 0.1436, "reward": 1.427083358168602, "reward_std": 0.6094422340393066, "rewards/accuracy_reward": 0.6041666846722364, "rewards/format_reward": 0.8229166865348816, "step": 278 }, { "clip_ratio": 0.0, "completion_length": 1358.2500343322754, "epoch": 0.2976, "grad_norm": 1.1723148822784424, "kl": 0.164794921875, "learning_rate": 8.170600878116087e-07, "loss": 0.1663, "reward": 1.4479167088866234, "reward_std": 0.4762081578373909, "rewards/accuracy_reward": 0.6354166753590107, "rewards/format_reward": 0.8125000149011612, "step": 279 }, { "clip_ratio": 0.0, "completion_length": 1757.0937957763672, "epoch": 0.2986666666666667, "grad_norm": 0.5767378807067871, "kl": 0.2451171875, "learning_rate": 8.15844243461063e-07, "loss": 0.1453, "reward": 1.5729167014360428, "reward_std": 0.46012451499700546, "rewards/accuracy_reward": 0.7187500158324838, "rewards/format_reward": 0.854166679084301, "step": 280 }, { "clip_ratio": 0.0, "completion_length": 1412.552116394043, "epoch": 0.29973333333333335, "grad_norm": 0.5410394668579102, "kl": 0.2686767578125, "learning_rate": 8.146254106507225e-07, "loss": 0.1332, "reward": 1.4479167014360428, "reward_std": 0.37505171447992325, "rewards/accuracy_reward": 0.5937500111758709, "rewards/format_reward": 0.854166679084301, "step": 281 }, { "clip_ratio": 0.0, "completion_length": 1771.5417022705078, "epoch": 0.3008, "grad_norm": 1.0621534585952759, "kl": 0.272613525390625, "learning_rate": 8.134036030819673e-07, "loss": 0.2514, "reward": 1.427083358168602, "reward_std": 0.5580050200223923, "rewards/accuracy_reward": 0.5937500074505806, "rewards/format_reward": 0.8333333507180214, "step": 282 }, { "clip_ratio": 0.0, "completion_length": 1287.3125457763672, "epoch": 0.30186666666666667, "grad_norm": 0.7947608232498169, "kl": 0.20953369140625, "learning_rate": 8.121788344896168e-07, "loss": 0.0135, "reward": 1.5000000298023224, "reward_std": 0.4432336129248142, "rewards/accuracy_reward": 0.5833333432674408, "rewards/format_reward": 0.916666679084301, "step": 283 }, { "clip_ratio": 0.0, "completion_length": 1052.5417137145996, "epoch": 0.30293333333333333, "grad_norm": 0.629499077796936, "kl": 0.195343017578125, "learning_rate": 8.109511186417767e-07, "loss": 0.1282, "reward": 1.5000000223517418, "reward_std": 0.3794792778789997, "rewards/accuracy_reward": 0.583333345130086, "rewards/format_reward": 0.9166666716337204, "step": 284 }, { "clip_ratio": 0.0, "completion_length": 1366.5000610351562, "epoch": 0.304, "grad_norm": 1.9636276960372925, "kl": 0.50543212890625, "learning_rate": 8.097204693396845e-07, "loss": 0.1107, "reward": 1.5625000149011612, "reward_std": 0.5436407700181007, "rewards/accuracy_reward": 0.6666666865348816, "rewards/format_reward": 0.8958333507180214, "step": 285 }, { "clip_ratio": 0.0, "completion_length": 1280.291706085205, "epoch": 0.30506666666666665, "grad_norm": 1.165725827217102, "kl": 0.27081298828125, "learning_rate": 8.084869004175535e-07, "loss": 0.0942, "reward": 1.4062500223517418, "reward_std": 0.39643871039152145, "rewards/accuracy_reward": 0.5312500009313226, "rewards/format_reward": 0.8750000149011612, "step": 286 }, { "clip_ratio": 0.0, "completion_length": 1424.041690826416, "epoch": 0.3061333333333333, "grad_norm": 1.0519682168960571, "kl": 0.3187408447265625, "learning_rate": 8.072504257424173e-07, "loss": 0.0487, "reward": 1.3645833656191826, "reward_std": 0.4175382927060127, "rewards/accuracy_reward": 0.5104166744276881, "rewards/format_reward": 0.854166679084301, "step": 287 }, { "clip_ratio": 0.0, "completion_length": 1647.1666984558105, "epoch": 0.3072, "grad_norm": 0.5156410932540894, "kl": 0.265167236328125, "learning_rate": 8.060110592139746e-07, "loss": 0.1047, "reward": 1.4791667014360428, "reward_std": 0.45215024054050446, "rewards/accuracy_reward": 0.6458333507180214, "rewards/format_reward": 0.8333333432674408, "step": 288 }, { "clip_ratio": 0.0, "completion_length": 1185.4167022705078, "epoch": 0.3082666666666667, "grad_norm": 0.9654502868652344, "kl": 0.067230224609375, "learning_rate": 8.047688147644327e-07, "loss": 0.067, "reward": 1.8125000298023224, "reward_std": 0.30013206228613853, "rewards/accuracy_reward": 0.8229166828095913, "rewards/format_reward": 0.9895833358168602, "step": 289 }, { "clip_ratio": 0.0, "completion_length": 1318.2604751586914, "epoch": 0.30933333333333335, "grad_norm": 0.733893096446991, "kl": 0.245697021484375, "learning_rate": 8.0352370635835e-07, "loss": 0.0518, "reward": 1.5208333730697632, "reward_std": 0.4748336784541607, "rewards/accuracy_reward": 0.635416679084301, "rewards/format_reward": 0.8854166716337204, "step": 290 }, { "clip_ratio": 0.0, "completion_length": 927.3854598999023, "epoch": 0.3104, "grad_norm": 7.129660606384277, "kl": 0.2369384765625, "learning_rate": 8.022757479924805e-07, "loss": 0.0992, "reward": 1.7395833879709244, "reward_std": 0.4172895923256874, "rewards/accuracy_reward": 0.8020833507180214, "rewards/format_reward": 0.9375000149011612, "step": 291 }, { "clip_ratio": 0.0, "completion_length": 1336.4792098999023, "epoch": 0.31146666666666667, "grad_norm": 1.0306510925292969, "kl": 0.250640869140625, "learning_rate": 8.010249536956157e-07, "loss": 0.2081, "reward": 1.3645833507180214, "reward_std": 0.4482138492166996, "rewards/accuracy_reward": 0.5104166772216558, "rewards/format_reward": 0.8541666865348816, "step": 292 }, { "clip_ratio": 0.0, "completion_length": 1512.0104598999023, "epoch": 0.31253333333333333, "grad_norm": 1.8973042964935303, "kl": 0.2510833740234375, "learning_rate": 7.997713375284263e-07, "loss": 0.1557, "reward": 1.4166667088866234, "reward_std": 0.4112800881266594, "rewards/accuracy_reward": 0.5208333460614085, "rewards/format_reward": 0.8958333432674408, "step": 293 }, { "clip_ratio": 0.0, "completion_length": 1080.4271202087402, "epoch": 0.3136, "grad_norm": 1.1492464542388916, "kl": 0.169891357421875, "learning_rate": 7.985149135833054e-07, "loss": 0.1004, "reward": 1.4895833805203438, "reward_std": 0.48474639654159546, "rewards/accuracy_reward": 0.5833333488553762, "rewards/format_reward": 0.9062500149011612, "step": 294 }, { "clip_ratio": 0.0, "completion_length": 1205.5729484558105, "epoch": 0.31466666666666665, "grad_norm": 0.9352542161941528, "kl": 0.193634033203125, "learning_rate": 7.972556959842089e-07, "loss": 0.0302, "reward": 1.5000000149011612, "reward_std": 0.48415151983499527, "rewards/accuracy_reward": 0.6875000111758709, "rewards/format_reward": 0.8125000149011612, "step": 295 }, { "clip_ratio": 0.0, "completion_length": 1314.5729293823242, "epoch": 0.3157333333333333, "grad_norm": 1.044317603111267, "kl": 0.17327880859375, "learning_rate": 7.959936988864977e-07, "loss": 0.0923, "reward": 1.4062500596046448, "reward_std": 0.49878566339612007, "rewards/accuracy_reward": 0.45833335164934397, "rewards/format_reward": 0.947916679084301, "step": 296 }, { "clip_ratio": 0.0, "completion_length": 691.2812538146973, "epoch": 0.3168, "grad_norm": 0.8354275822639465, "kl": 0.132843017578125, "learning_rate": 7.947289364767781e-07, "loss": -0.0598, "reward": 1.6770833879709244, "reward_std": 0.418355792760849, "rewards/accuracy_reward": 0.7500000149011612, "rewards/format_reward": 0.9270833432674408, "step": 297 }, { "clip_ratio": 0.0, "completion_length": 1138.8541946411133, "epoch": 0.3178666666666667, "grad_norm": 1.8273200988769531, "kl": 0.2562255859375, "learning_rate": 7.934614229727422e-07, "loss": 0.1554, "reward": 1.5000000149011612, "reward_std": 0.5222887694835663, "rewards/accuracy_reward": 0.635416679084301, "rewards/format_reward": 0.8645833432674408, "step": 298 }, { "clip_ratio": 0.0, "completion_length": 963.1042060852051, "epoch": 0.31893333333333335, "grad_norm": 0.9429172277450562, "kl": 0.14398193359375, "learning_rate": 7.921911726230082e-07, "loss": 0.0099, "reward": 1.6354167014360428, "reward_std": 0.3216959051787853, "rewards/accuracy_reward": 0.6979166744276881, "rewards/format_reward": 0.9375000074505806, "step": 299 }, { "clip_ratio": 0.0, "completion_length": 1040.7708625793457, "epoch": 0.32, "grad_norm": 1.7778241634368896, "kl": 0.31884765625, "learning_rate": 7.909181997069602e-07, "loss": 0.1587, "reward": 1.5416666939854622, "reward_std": 0.4231627397239208, "rewards/accuracy_reward": 0.666666679084301, "rewards/format_reward": 0.8750000223517418, "step": 300 }, { "clip_ratio": 0.0, "completion_length": 1482.8958740234375, "epoch": 0.32106666666666667, "grad_norm": 2.5433237552642822, "kl": 0.60394287109375, "learning_rate": 7.896425185345883e-07, "loss": 0.2056, "reward": 1.145833358168602, "reward_std": 0.619047075510025, "rewards/accuracy_reward": 0.3958333469927311, "rewards/format_reward": 0.7500000149011612, "step": 301 }, { "clip_ratio": 0.0, "completion_length": 965.635425567627, "epoch": 0.3221333333333333, "grad_norm": 1.935640811920166, "kl": 0.23553466796875, "learning_rate": 7.883641434463262e-07, "loss": 0.1472, "reward": 1.5625000447034836, "reward_std": 0.572762954980135, "rewards/accuracy_reward": 0.6875000186264515, "rewards/format_reward": 0.8750000223517418, "step": 302 }, { "clip_ratio": 0.0, "completion_length": 974.9271011352539, "epoch": 0.3232, "grad_norm": 1.690371036529541, "kl": 0.284881591796875, "learning_rate": 7.87083088812892e-07, "loss": -0.0177, "reward": 1.708333358168602, "reward_std": 0.3563476577401161, "rewards/accuracy_reward": 0.7812500223517418, "rewards/format_reward": 0.9270833432674408, "step": 303 }, { "clip_ratio": 0.0, "completion_length": 773.2291793823242, "epoch": 0.32426666666666665, "grad_norm": 1.2238810062408447, "kl": 0.21820068359375, "learning_rate": 7.857993690351249e-07, "loss": 0.0559, "reward": 1.7604167014360428, "reward_std": 0.2869829013943672, "rewards/accuracy_reward": 0.822916679084301, "rewards/format_reward": 0.9375000074505806, "step": 304 }, { "clip_ratio": 0.0, "completion_length": 961.7395935058594, "epoch": 0.3253333333333333, "grad_norm": 1.8622803688049316, "kl": 0.3310546875, "learning_rate": 7.845129985438242e-07, "loss": 0.1044, "reward": 1.5937500298023224, "reward_std": 0.4341789707541466, "rewards/accuracy_reward": 0.6666666772216558, "rewards/format_reward": 0.9270833432674408, "step": 305 }, { "clip_ratio": 0.0, "completion_length": 1097.7500228881836, "epoch": 0.3264, "grad_norm": 3.5552356243133545, "kl": 0.893218994140625, "learning_rate": 7.832239917995872e-07, "loss": 0.0876, "reward": 1.4583333730697632, "reward_std": 0.490246519446373, "rewards/accuracy_reward": 0.656250013038516, "rewards/format_reward": 0.8020833432674408, "step": 306 }, { "clip_ratio": 0.0, "completion_length": 1157.8854522705078, "epoch": 0.3274666666666667, "grad_norm": 5.172700881958008, "kl": 0.65338134765625, "learning_rate": 7.819323632926462e-07, "loss": 0.2081, "reward": 1.4166667014360428, "reward_std": 0.5154023952782154, "rewards/accuracy_reward": 0.5729166716337204, "rewards/format_reward": 0.8437500149011612, "step": 307 }, { "clip_ratio": 0.0, "completion_length": 1218.677101135254, "epoch": 0.32853333333333334, "grad_norm": 2.41021466255188, "kl": 0.406005859375, "learning_rate": 7.806381275427054e-07, "loss": 0.1547, "reward": 1.270833358168602, "reward_std": 0.5414909049868584, "rewards/accuracy_reward": 0.40625000838190317, "rewards/format_reward": 0.8645833432674408, "step": 308 }, { "clip_ratio": 0.0, "completion_length": 982.0208473205566, "epoch": 0.3296, "grad_norm": 3.1297335624694824, "kl": 0.46722412109375, "learning_rate": 7.793412990987784e-07, "loss": 0.0511, "reward": 1.4791666828095913, "reward_std": 0.3489444889128208, "rewards/accuracy_reward": 0.6458333432674408, "rewards/format_reward": 0.8333333395421505, "step": 309 }, { "clip_ratio": 0.0, "completion_length": 957.6771087646484, "epoch": 0.33066666666666666, "grad_norm": 4.894863605499268, "kl": 0.418365478515625, "learning_rate": 7.780418925390246e-07, "loss": 0.0081, "reward": 1.6979167014360428, "reward_std": 0.3283577933907509, "rewards/accuracy_reward": 0.7708333414047956, "rewards/format_reward": 0.9270833432674408, "step": 310 }, { "clip_ratio": 0.0, "completion_length": 1161.8229446411133, "epoch": 0.3317333333333333, "grad_norm": 5.087875843048096, "kl": 0.416748046875, "learning_rate": 7.767399224705845e-07, "loss": 0.1285, "reward": 1.458333358168602, "reward_std": 0.50701455026865, "rewards/accuracy_reward": 0.6354166865348816, "rewards/format_reward": 0.822916679084301, "step": 311 }, { "clip_ratio": 0.0, "completion_length": 1054.104206085205, "epoch": 0.3328, "grad_norm": 3.4494948387145996, "kl": 0.532745361328125, "learning_rate": 7.75435403529416e-07, "loss": 0.1465, "reward": 1.4166666977107525, "reward_std": 0.46951768547296524, "rewards/accuracy_reward": 0.5833333414047956, "rewards/format_reward": 0.8333333469927311, "step": 312 }, { "clip_ratio": 0.0, "completion_length": 582.3541870117188, "epoch": 0.33386666666666664, "grad_norm": 3.774197578430176, "kl": 0.263824462890625, "learning_rate": 7.741283503801303e-07, "loss": -0.0436, "reward": 1.666666716337204, "reward_std": 0.4201158806681633, "rewards/accuracy_reward": 0.7395833507180214, "rewards/format_reward": 0.9270833507180214, "step": 313 }, { "clip_ratio": 0.0, "completion_length": 594.8333511352539, "epoch": 0.33493333333333336, "grad_norm": 1.9384793043136597, "kl": 0.111297607421875, "learning_rate": 7.728187777158263e-07, "loss": 0.0028, "reward": 1.8854166865348816, "reward_std": 0.19398127868771553, "rewards/accuracy_reward": 0.9062500074505806, "rewards/format_reward": 0.9791666716337204, "step": 314 }, { "clip_ratio": 0.0, "completion_length": 1069.4896087646484, "epoch": 0.336, "grad_norm": 15.45499038696289, "kl": 0.47100830078125, "learning_rate": 7.715067002579259e-07, "loss": 0.0146, "reward": 1.5000000447034836, "reward_std": 0.33740581199526787, "rewards/accuracy_reward": 0.5937500055879354, "rewards/format_reward": 0.9062500074505806, "step": 315 }, { "clip_ratio": 0.0, "completion_length": 1233.5729446411133, "epoch": 0.3370666666666667, "grad_norm": 8.274024963378906, "kl": 0.459930419921875, "learning_rate": 7.701921327560081e-07, "loss": 0.1138, "reward": 1.3541666865348816, "reward_std": 0.44535424932837486, "rewards/accuracy_reward": 0.4895833395421505, "rewards/format_reward": 0.8645833507180214, "step": 316 }, { "clip_ratio": 0.0, "completion_length": 1027.2917022705078, "epoch": 0.33813333333333334, "grad_norm": 4.972082138061523, "kl": 0.233367919921875, "learning_rate": 7.68875089987644e-07, "loss": 0.0416, "reward": 1.6354167088866234, "reward_std": 0.40076498687267303, "rewards/accuracy_reward": 0.7291666800156236, "rewards/format_reward": 0.9062500149011612, "step": 317 }, { "clip_ratio": 0.0, "completion_length": 829.9583511352539, "epoch": 0.3392, "grad_norm": 3.6147990226745605, "kl": 0.188568115234375, "learning_rate": 7.675555867582297e-07, "loss": -0.0667, "reward": 1.8437500298023224, "reward_std": 0.34904519096016884, "rewards/accuracy_reward": 0.9062500149011612, "rewards/format_reward": 0.9375000074505806, "step": 318 }, { "clip_ratio": 0.0, "completion_length": 1173.0521354675293, "epoch": 0.34026666666666666, "grad_norm": 13.80300235748291, "kl": 0.333160400390625, "learning_rate": 7.662336379008205e-07, "loss": 0.1839, "reward": 1.3229166939854622, "reward_std": 0.5023048371076584, "rewards/accuracy_reward": 0.46875000558793545, "rewards/format_reward": 0.8541666939854622, "step": 319 }, { "clip_ratio": 0.0, "completion_length": 1004.6666870117188, "epoch": 0.3413333333333333, "grad_norm": 2.7146377563476562, "kl": 0.4814453125, "learning_rate": 7.649092582759638e-07, "loss": -0.0163, "reward": 1.3437500223517418, "reward_std": 0.3927299566566944, "rewards/accuracy_reward": 0.5000000009313226, "rewards/format_reward": 0.8437500074505806, "step": 320 }, { "clip_ratio": 0.0, "completion_length": 1296.7083778381348, "epoch": 0.3424, "grad_norm": 6.0118408203125, "kl": 0.496185302734375, "learning_rate": 7.635824627715323e-07, "loss": 0.0405, "reward": 1.239583358168602, "reward_std": 0.3449416197836399, "rewards/accuracy_reward": 0.3958333386108279, "rewards/format_reward": 0.8437500074505806, "step": 321 }, { "clip_ratio": 0.0, "completion_length": 1065.4896087646484, "epoch": 0.34346666666666664, "grad_norm": 6.713414669036865, "kl": 1.63702392578125, "learning_rate": 7.622532663025569e-07, "loss": -0.0299, "reward": 1.3958333507180214, "reward_std": 0.49913502484560013, "rewards/accuracy_reward": 0.5520833525806665, "rewards/format_reward": 0.8437500223517418, "step": 322 }, { "clip_ratio": 0.0, "completion_length": 1247.739631652832, "epoch": 0.34453333333333336, "grad_norm": 6.239667892456055, "kl": 0.64556884765625, "learning_rate": 7.609216838110578e-07, "loss": 0.0646, "reward": 1.4791667088866234, "reward_std": 0.3544144034385681, "rewards/accuracy_reward": 0.593750013038516, "rewards/format_reward": 0.8854166716337204, "step": 323 }, { "clip_ratio": 0.0, "completion_length": 1398.2708740234375, "epoch": 0.3456, "grad_norm": 31.820987701416016, "kl": 1.36273193359375, "learning_rate": 7.595877302658785e-07, "loss": 0.2674, "reward": 1.3854166865348816, "reward_std": 0.49169205874204636, "rewards/accuracy_reward": 0.5416666818782687, "rewards/format_reward": 0.8437500149011612, "step": 324 }, { "clip_ratio": 0.0, "completion_length": 1384.6979484558105, "epoch": 0.3466666666666667, "grad_norm": 10.067298889160156, "kl": 0.679443359375, "learning_rate": 7.582514206625158e-07, "loss": 0.1994, "reward": 1.2812500521540642, "reward_std": 0.5226832777261734, "rewards/accuracy_reward": 0.4895833358168602, "rewards/format_reward": 0.7916666865348816, "step": 325 }, { "clip_ratio": 0.0, "completion_length": 1132.4479446411133, "epoch": 0.34773333333333334, "grad_norm": 10.390936851501465, "kl": 0.541717529296875, "learning_rate": 7.569127700229518e-07, "loss": -0.0186, "reward": 1.6250000298023224, "reward_std": 0.36182602122426033, "rewards/accuracy_reward": 0.7083333432674408, "rewards/format_reward": 0.916666679084301, "step": 326 }, { "clip_ratio": 0.0, "completion_length": 840.770866394043, "epoch": 0.3488, "grad_norm": 1.6628046035766602, "kl": 0.133270263671875, "learning_rate": 7.555717933954856e-07, "loss": 0.0093, "reward": 1.8437500298023224, "reward_std": 0.27773431688547134, "rewards/accuracy_reward": 0.9062500074505806, "rewards/format_reward": 0.9375000074505806, "step": 327 }, { "clip_ratio": 0.0, "completion_length": 888.9687805175781, "epoch": 0.34986666666666666, "grad_norm": 3.119671583175659, "kl": 1.580780029296875, "learning_rate": 7.542285058545633e-07, "loss": 0.0092, "reward": 1.4270833879709244, "reward_std": 0.47405627742409706, "rewards/accuracy_reward": 0.5416666818782687, "rewards/format_reward": 0.8854166865348816, "step": 328 }, { "clip_ratio": 0.0, "completion_length": 1518.239631652832, "epoch": 0.3509333333333333, "grad_norm": 12.579157829284668, "kl": 0.64727783203125, "learning_rate": 7.528829225006088e-07, "loss": 0.201, "reward": 1.3229167014360428, "reward_std": 0.519088301807642, "rewards/accuracy_reward": 0.5104166772216558, "rewards/format_reward": 0.8125000149011612, "step": 329 }, { "clip_ratio": 0.0, "completion_length": 1141.4375267028809, "epoch": 0.352, "grad_norm": 9.715423583984375, "kl": 0.196502685546875, "learning_rate": 7.515350584598544e-07, "loss": 0.0738, "reward": 1.6041666865348816, "reward_std": 0.2143857777118683, "rewards/accuracy_reward": 0.6770833358168602, "rewards/format_reward": 0.9270833432674408, "step": 330 }, { "clip_ratio": 0.0, "completion_length": 1030.208366394043, "epoch": 0.35306666666666664, "grad_norm": 15.31102180480957, "kl": 0.171356201171875, "learning_rate": 7.501849288841704e-07, "loss": 0.043, "reward": 1.3854166939854622, "reward_std": 0.43308593705296516, "rewards/accuracy_reward": 0.447916679084301, "rewards/format_reward": 0.9375000149011612, "step": 331 }, { "clip_ratio": 0.0, "completion_length": 1196.0729446411133, "epoch": 0.35413333333333336, "grad_norm": 7.252166748046875, "kl": 0.17071533203125, "learning_rate": 7.48832548950895e-07, "loss": 0.0472, "reward": 1.5312500298023224, "reward_std": 0.38917475938796997, "rewards/accuracy_reward": 0.6041666772216558, "rewards/format_reward": 0.9270833432674408, "step": 332 }, { "clip_ratio": 0.0, "completion_length": 1048.6146087646484, "epoch": 0.3552, "grad_norm": 0.8475338816642761, "kl": 0.164794921875, "learning_rate": 7.47477933862663e-07, "loss": 0.0903, "reward": 1.4062500298023224, "reward_std": 0.4889158792793751, "rewards/accuracy_reward": 0.5104166716337204, "rewards/format_reward": 0.8958333432674408, "step": 333 }, { "clip_ratio": 0.0, "completion_length": 1100.7812767028809, "epoch": 0.3562666666666667, "grad_norm": 1.2215956449508667, "kl": 0.150238037109375, "learning_rate": 7.461210988472362e-07, "loss": 0.1215, "reward": 1.5000000223517418, "reward_std": 0.4870755188167095, "rewards/accuracy_reward": 0.6041666781529784, "rewards/format_reward": 0.8958333432674408, "step": 334 }, { "clip_ratio": 0.0, "completion_length": 959.8542137145996, "epoch": 0.35733333333333334, "grad_norm": 0.48862820863723755, "kl": 0.086456298828125, "learning_rate": 7.447620591573311e-07, "loss": 0.024, "reward": 1.760416716337204, "reward_std": 0.3585582338273525, "rewards/accuracy_reward": 0.8229166865348816, "rewards/format_reward": 0.9375000074505806, "step": 335 }, { "clip_ratio": 0.0, "completion_length": 958.2187728881836, "epoch": 0.3584, "grad_norm": 0.8009869456291199, "kl": 0.10211181640625, "learning_rate": 7.434008300704479e-07, "loss": 0.1049, "reward": 1.5937500298023224, "reward_std": 0.34535447880625725, "rewards/accuracy_reward": 0.6770833395421505, "rewards/format_reward": 0.9166666716337204, "step": 336 }, { "clip_ratio": 0.0, "completion_length": 755.0521087646484, "epoch": 0.35946666666666666, "grad_norm": 0.34822410345077515, "kl": 0.0480194091796875, "learning_rate": 7.420374268886987e-07, "loss": -0.0483, "reward": 1.9062500298023224, "reward_std": 0.22601452097296715, "rewards/accuracy_reward": 0.9270833507180214, "rewards/format_reward": 0.9791666716337204, "step": 337 }, { "clip_ratio": 0.0, "completion_length": 871.729190826416, "epoch": 0.3605333333333333, "grad_norm": 0.6047457456588745, "kl": 0.10894775390625, "learning_rate": 7.406718649386349e-07, "loss": 0.0381, "reward": 1.7187500149011612, "reward_std": 0.3210354298353195, "rewards/accuracy_reward": 0.7916666716337204, "rewards/format_reward": 0.9270833432674408, "step": 338 }, { "clip_ratio": 0.0, "completion_length": 782.3229331970215, "epoch": 0.3616, "grad_norm": 0.719771146774292, "kl": 0.07958984375, "learning_rate": 7.393041595710766e-07, "loss": 0.065, "reward": 1.7291666865348816, "reward_std": 0.2552092485129833, "rewards/accuracy_reward": 0.760416679084301, "rewards/format_reward": 0.96875, "step": 339 }, { "clip_ratio": 0.0, "completion_length": 1268.3021240234375, "epoch": 0.3626666666666667, "grad_norm": 1.2847076654434204, "kl": 0.130706787109375, "learning_rate": 7.379343261609379e-07, "loss": 0.0403, "reward": 1.5937500447034836, "reward_std": 0.4980827644467354, "rewards/accuracy_reward": 0.6875000149011612, "rewards/format_reward": 0.9062500149011612, "step": 340 }, { "clip_ratio": 0.0, "completion_length": 1064.8541870117188, "epoch": 0.36373333333333335, "grad_norm": 0.9363823533058167, "kl": 0.131988525390625, "learning_rate": 7.365623801070555e-07, "loss": 0.111, "reward": 1.4583333730697632, "reward_std": 0.5342362560331821, "rewards/accuracy_reward": 0.5520833507180214, "rewards/format_reward": 0.9062500149011612, "step": 341 }, { "clip_ratio": 0.0, "completion_length": 916.822940826416, "epoch": 0.3648, "grad_norm": 1.0483967065811157, "kl": 0.117095947265625, "learning_rate": 7.35188336832015e-07, "loss": 0.0687, "reward": 1.7916666865348816, "reward_std": 0.31002992391586304, "rewards/accuracy_reward": 0.8437500074505806, "rewards/format_reward": 0.9479166716337204, "step": 342 }, { "clip_ratio": 0.0, "completion_length": 846.0521087646484, "epoch": 0.3658666666666667, "grad_norm": 1.01327645778656, "kl": 0.10784912109375, "learning_rate": 7.338122117819781e-07, "loss": 0.1145, "reward": 1.6979166865348816, "reward_std": 0.5162635967135429, "rewards/accuracy_reward": 0.8125000149011612, "rewards/format_reward": 0.885416679084301, "step": 343 }, { "clip_ratio": 0.0, "completion_length": 1203.8750267028809, "epoch": 0.36693333333333333, "grad_norm": 0.8397670388221741, "kl": 0.212615966796875, "learning_rate": 7.324340204265078e-07, "loss": 0.1583, "reward": 1.427083358168602, "reward_std": 0.3170727826654911, "rewards/accuracy_reward": 0.6041666716337204, "rewards/format_reward": 0.822916679084301, "step": 344 }, { "clip_ratio": 0.0, "completion_length": 1348.5625267028809, "epoch": 0.368, "grad_norm": 2.001857280731201, "kl": 0.247589111328125, "learning_rate": 7.310537782583958e-07, "loss": 0.1331, "reward": 1.3229167088866234, "reward_std": 0.5851752124726772, "rewards/accuracy_reward": 0.5625000111758709, "rewards/format_reward": 0.760416679084301, "step": 345 }, { "clip_ratio": 0.0, "completion_length": 1013.2812843322754, "epoch": 0.36906666666666665, "grad_norm": 0.6677552461624146, "kl": 0.204925537109375, "learning_rate": 7.296715007934877e-07, "loss": 0.1066, "reward": 1.6250000447034836, "reward_std": 0.3390367962419987, "rewards/accuracy_reward": 0.6875000149011612, "rewards/format_reward": 0.9375000074505806, "step": 346 }, { "clip_ratio": 0.0, "completion_length": 1123.6146125793457, "epoch": 0.3701333333333333, "grad_norm": 1.3507872819900513, "kl": 0.301025390625, "learning_rate": 7.282872035705088e-07, "loss": 0.0487, "reward": 1.4791667014360428, "reward_std": 0.5175114572048187, "rewards/accuracy_reward": 0.6562500149011612, "rewards/format_reward": 0.822916679084301, "step": 347 }, { "clip_ratio": 0.0, "completion_length": 1098.1458549499512, "epoch": 0.3712, "grad_norm": 2.27197003364563, "kl": 0.34222412109375, "learning_rate": 7.269009021508888e-07, "loss": 0.3054, "reward": 1.3854166939854622, "reward_std": 0.3718506693840027, "rewards/accuracy_reward": 0.531250006519258, "rewards/format_reward": 0.8541666865348816, "step": 348 }, { "clip_ratio": 0.0, "completion_length": 805.2916831970215, "epoch": 0.3722666666666667, "grad_norm": 0.9724274277687073, "kl": 0.206451416015625, "learning_rate": 7.255126121185881e-07, "loss": 0.0573, "reward": 1.7291667014360428, "reward_std": 0.3542907014489174, "rewards/accuracy_reward": 0.8229166772216558, "rewards/format_reward": 0.9062500149011612, "step": 349 }, { "clip_ratio": 0.0, "completion_length": 1044.2187805175781, "epoch": 0.37333333333333335, "grad_norm": 1.6361585855484009, "kl": 0.33349609375, "learning_rate": 7.241223490799211e-07, "loss": 0.1167, "reward": 1.5312500596046448, "reward_std": 0.42567015439271927, "rewards/accuracy_reward": 0.6354166828095913, "rewards/format_reward": 0.8958333507180214, "step": 350 }, { "clip_ratio": 0.0, "completion_length": 1145.2291984558105, "epoch": 0.3744, "grad_norm": 1.2437489032745361, "kl": 0.371337890625, "learning_rate": 7.22730128663382e-07, "loss": 0.2104, "reward": 1.4062500298023224, "reward_std": 0.5294820331037045, "rewards/accuracy_reward": 0.572916679084301, "rewards/format_reward": 0.8333333432674408, "step": 351 }, { "clip_ratio": 0.0, "completion_length": 1382.1458702087402, "epoch": 0.37546666666666667, "grad_norm": 3.2521817684173584, "kl": 0.465576171875, "learning_rate": 7.213359665194688e-07, "loss": 0.2481, "reward": 1.2395833656191826, "reward_std": 0.5113514773547649, "rewards/accuracy_reward": 0.4479166753590107, "rewards/format_reward": 0.7916666865348816, "step": 352 }, { "clip_ratio": 0.0, "completion_length": 1021.9896202087402, "epoch": 0.37653333333333333, "grad_norm": 1.0260621309280396, "kl": 0.387603759765625, "learning_rate": 7.199398783205067e-07, "loss": 0.2381, "reward": 1.385416716337204, "reward_std": 0.513124618679285, "rewards/accuracy_reward": 0.5000000176951289, "rewards/format_reward": 0.8854166716337204, "step": 353 }, { "clip_ratio": 0.0, "completion_length": 1091.489601135254, "epoch": 0.3776, "grad_norm": 1.415032982826233, "kl": 0.324066162109375, "learning_rate": 7.18541879760473e-07, "loss": 0.3167, "reward": 1.5208333730697632, "reward_std": 0.512064516544342, "rewards/accuracy_reward": 0.6562500111758709, "rewards/format_reward": 0.864583358168602, "step": 354 }, { "clip_ratio": 0.0, "completion_length": 1148.7396430969238, "epoch": 0.37866666666666665, "grad_norm": 1.2797808647155762, "kl": 0.426849365234375, "learning_rate": 7.171419865548196e-07, "loss": 0.2408, "reward": 1.2291666846722364, "reward_std": 0.5464715287089348, "rewards/accuracy_reward": 0.4270833497866988, "rewards/format_reward": 0.8020833563059568, "step": 355 }, { "clip_ratio": 0.0, "completion_length": 983.7708702087402, "epoch": 0.3797333333333333, "grad_norm": 1.5298693180084229, "kl": 0.27685546875, "learning_rate": 7.157402144402973e-07, "loss": 0.0284, "reward": 1.572916716337204, "reward_std": 0.5262652039527893, "rewards/accuracy_reward": 0.7187500149011612, "rewards/format_reward": 0.854166679084301, "step": 356 }, { "clip_ratio": 0.0, "completion_length": 962.2187843322754, "epoch": 0.3808, "grad_norm": 0.7264670729637146, "kl": 0.21783447265625, "learning_rate": 7.143365791747783e-07, "loss": 0.17, "reward": 1.5104167014360428, "reward_std": 0.4970603957772255, "rewards/accuracy_reward": 0.6562500149011612, "rewards/format_reward": 0.854166679084301, "step": 357 }, { "clip_ratio": 0.0, "completion_length": 1125.9062957763672, "epoch": 0.3818666666666667, "grad_norm": 1.204340934753418, "kl": 0.409332275390625, "learning_rate": 7.12931096537079e-07, "loss": 0.232, "reward": 1.3020833507180214, "reward_std": 0.44261181354522705, "rewards/accuracy_reward": 0.4687500111758709, "rewards/format_reward": 0.8333333507180214, "step": 358 }, { "clip_ratio": 0.0, "completion_length": 1054.6979522705078, "epoch": 0.38293333333333335, "grad_norm": 2.1280171871185303, "kl": 0.301116943359375, "learning_rate": 7.115237823267833e-07, "loss": 0.1586, "reward": 1.3541667088866234, "reward_std": 0.4544401839375496, "rewards/accuracy_reward": 0.4687500027939677, "rewards/format_reward": 0.885416679084301, "step": 359 }, { "clip_ratio": 0.0, "completion_length": 972.1875228881836, "epoch": 0.384, "grad_norm": 1.8355038166046143, "kl": 0.275390625, "learning_rate": 7.101146523640637e-07, "loss": 0.2766, "reward": 1.4687500149011612, "reward_std": 0.5012203902006149, "rewards/accuracy_reward": 0.5833333460614085, "rewards/format_reward": 0.885416679084301, "step": 360 }, { "clip_ratio": 0.0, "completion_length": 1367.6666984558105, "epoch": 0.38506666666666667, "grad_norm": 1.603998064994812, "kl": 0.4791259765625, "learning_rate": 7.087037224895055e-07, "loss": 0.2142, "reward": 1.2500000223517418, "reward_std": 0.5560787320137024, "rewards/accuracy_reward": 0.43750001303851604, "rewards/format_reward": 0.8125000223517418, "step": 361 }, { "clip_ratio": 0.0, "completion_length": 782.1458511352539, "epoch": 0.38613333333333333, "grad_norm": 1.4373447895050049, "kl": 0.198944091796875, "learning_rate": 7.072910085639267e-07, "loss": -0.064, "reward": 1.7083333879709244, "reward_std": 0.2873006835579872, "rewards/accuracy_reward": 0.7395833469927311, "rewards/format_reward": 0.9687500074505806, "step": 362 }, { "clip_ratio": 0.0, "completion_length": 967.7500152587891, "epoch": 0.3872, "grad_norm": 1.1653681993484497, "kl": 0.229095458984375, "learning_rate": 7.058765264682001e-07, "loss": 0.1366, "reward": 1.2187500149011612, "reward_std": 0.3342692330479622, "rewards/accuracy_reward": 0.31250000558793545, "rewards/format_reward": 0.9062500149011612, "step": 363 }, { "clip_ratio": 0.0, "completion_length": 815.5208625793457, "epoch": 0.38826666666666665, "grad_norm": 0.5035037398338318, "kl": 0.16796875, "learning_rate": 7.044602921030764e-07, "loss": 0.0894, "reward": 1.6458333730697632, "reward_std": 0.34470200911164284, "rewards/accuracy_reward": 0.6875000102445483, "rewards/format_reward": 0.9583333432674408, "step": 364 }, { "clip_ratio": 0.0, "completion_length": 1040.166690826416, "epoch": 0.3893333333333333, "grad_norm": 0.8360941410064697, "kl": 0.256317138671875, "learning_rate": 7.030423213890036e-07, "loss": 0.2138, "reward": 1.5729166865348816, "reward_std": 0.4548347741365433, "rewards/accuracy_reward": 0.7083333535119891, "rewards/format_reward": 0.8645833507180214, "step": 365 }, { "clip_ratio": 0.0, "completion_length": 762.0833587646484, "epoch": 0.3904, "grad_norm": 1.6780955791473389, "kl": 0.14801025390625, "learning_rate": 7.016226302659482e-07, "loss": 0.0782, "reward": 1.4687500223517418, "reward_std": 0.3682915084064007, "rewards/accuracy_reward": 0.5208333358168602, "rewards/format_reward": 0.9479166716337204, "step": 366 }, { "clip_ratio": 0.0, "completion_length": 1414.5938148498535, "epoch": 0.3914666666666667, "grad_norm": 0.9848142266273499, "kl": 0.328582763671875, "learning_rate": 7.002012346932176e-07, "loss": 0.1746, "reward": 1.2916667088866234, "reward_std": 0.3980933204293251, "rewards/accuracy_reward": 0.5312500027939677, "rewards/format_reward": 0.7604166846722364, "step": 367 }, { "clip_ratio": 0.0, "completion_length": 805.2604446411133, "epoch": 0.39253333333333335, "grad_norm": 2.4972431659698486, "kl": 0.14532470703125, "learning_rate": 6.987781506492787e-07, "loss": 0.1364, "reward": 1.7187500447034836, "reward_std": 0.4709051474928856, "rewards/accuracy_reward": 0.8125000149011612, "rewards/format_reward": 0.9062500149011612, "step": 368 }, { "clip_ratio": 0.0, "completion_length": 1110.4271049499512, "epoch": 0.3936, "grad_norm": 2.764476776123047, "kl": 0.24267578125, "learning_rate": 6.973533941315795e-07, "loss": 0.267, "reward": 1.218750037252903, "reward_std": 0.512102946639061, "rewards/accuracy_reward": 0.3750000111758709, "rewards/format_reward": 0.8437500149011612, "step": 369 }, { "clip_ratio": 0.0, "completion_length": 966.9583587646484, "epoch": 0.39466666666666667, "grad_norm": 3.155534029006958, "kl": 0.292327880859375, "learning_rate": 6.959269811563688e-07, "loss": 0.1943, "reward": 1.385416679084301, "reward_std": 0.43692220002412796, "rewards/accuracy_reward": 0.5208333432674408, "rewards/format_reward": 0.8645833432674408, "step": 370 }, { "clip_ratio": 0.0, "completion_length": 965.3958740234375, "epoch": 0.3957333333333333, "grad_norm": 0.9313881397247314, "kl": 0.20770263671875, "learning_rate": 6.944989277585163e-07, "loss": 0.1173, "reward": 1.4791667088866234, "reward_std": 0.5170732252299786, "rewards/accuracy_reward": 0.6145833488553762, "rewards/format_reward": 0.8645833507180214, "step": 371 }, { "clip_ratio": 0.0, "completion_length": 1060.7708549499512, "epoch": 0.3968, "grad_norm": 1.0511809587478638, "kl": 0.272003173828125, "learning_rate": 6.930692499913328e-07, "loss": 0.1786, "reward": 1.333333358168602, "reward_std": 0.40783967822790146, "rewards/accuracy_reward": 0.5104166716337204, "rewards/format_reward": 0.8229166865348816, "step": 372 }, { "clip_ratio": 0.0, "completion_length": 977.9792060852051, "epoch": 0.39786666666666665, "grad_norm": 1.0052509307861328, "kl": 0.198394775390625, "learning_rate": 6.916379639263885e-07, "loss": 0.1445, "reward": 1.5937500298023224, "reward_std": 0.365662157535553, "rewards/accuracy_reward": 0.6666666800156236, "rewards/format_reward": 0.9270833432674408, "step": 373 }, { "clip_ratio": 0.0, "completion_length": 1453.9062957763672, "epoch": 0.3989333333333333, "grad_norm": 1.0411237478256226, "kl": 0.31353759765625, "learning_rate": 6.902050856533337e-07, "loss": 0.1947, "reward": 1.1875000298023224, "reward_std": 0.409720566123724, "rewards/accuracy_reward": 0.416666679084301, "rewards/format_reward": 0.7708333432674408, "step": 374 }, { "clip_ratio": 0.0, "completion_length": 1122.5729446411133, "epoch": 0.4, "grad_norm": 0.8255424499511719, "kl": 0.19122314453125, "learning_rate": 6.887706312797172e-07, "loss": 0.1199, "reward": 1.4062500149011612, "reward_std": 0.3334619514644146, "rewards/accuracy_reward": 0.5208333348855376, "rewards/format_reward": 0.885416679084301, "step": 375 }, { "clip_ratio": 0.0, "completion_length": 1046.4791946411133, "epoch": 0.4010666666666667, "grad_norm": 3.676121950149536, "kl": 0.2720947265625, "learning_rate": 6.873346169308052e-07, "loss": 0.1565, "reward": 1.3229166865348816, "reward_std": 0.5201920606195927, "rewards/accuracy_reward": 0.4687500139698386, "rewards/format_reward": 0.854166679084301, "step": 376 }, { "clip_ratio": 0.0, "completion_length": 826.6875457763672, "epoch": 0.40213333333333334, "grad_norm": 2.508950710296631, "kl": 0.165740966796875, "learning_rate": 6.858970587494003e-07, "loss": 0.1498, "reward": 1.6875000298023224, "reward_std": 0.38457342237234116, "rewards/accuracy_reward": 0.7291666772216558, "rewards/format_reward": 0.9583333432674408, "step": 377 }, { "clip_ratio": 0.0, "completion_length": 1094.9479598999023, "epoch": 0.4032, "grad_norm": 0.7185199856758118, "kl": 0.3359375, "learning_rate": 6.8445797289566e-07, "loss": 0.1548, "reward": 1.4062500074505806, "reward_std": 0.2562185227870941, "rewards/accuracy_reward": 0.5520833358168602, "rewards/format_reward": 0.854166679084301, "step": 378 }, { "clip_ratio": 0.0, "completion_length": 1088.1354331970215, "epoch": 0.40426666666666666, "grad_norm": 0.6900866031646729, "kl": 0.281036376953125, "learning_rate": 6.830173755469149e-07, "loss": 0.1165, "reward": 1.4791666977107525, "reward_std": 0.2828211933374405, "rewards/accuracy_reward": 0.6458333395421505, "rewards/format_reward": 0.8333333469927311, "step": 379 }, { "clip_ratio": 0.0, "completion_length": 1280.333351135254, "epoch": 0.4053333333333333, "grad_norm": 0.6138240098953247, "kl": 0.456390380859375, "learning_rate": 6.815752828974869e-07, "loss": 0.1919, "reward": 1.2604167014360428, "reward_std": 0.4229285977780819, "rewards/accuracy_reward": 0.43750000558793545, "rewards/format_reward": 0.822916679084301, "step": 380 }, { "clip_ratio": 0.0, "completion_length": 902.9375228881836, "epoch": 0.4064, "grad_norm": 1.3629505634307861, "kl": 0.239990234375, "learning_rate": 6.80131711158507e-07, "loss": 0.1808, "reward": 1.6666666865348816, "reward_std": 0.43050170689821243, "rewards/accuracy_reward": 0.7395833488553762, "rewards/format_reward": 0.9270833432674408, "step": 381 }, { "clip_ratio": 0.0, "completion_length": 793.7708625793457, "epoch": 0.40746666666666664, "grad_norm": 1.2077202796936035, "kl": 0.217041015625, "learning_rate": 6.786866765577336e-07, "loss": 0.2103, "reward": 1.7083333730697632, "reward_std": 0.4028569795191288, "rewards/accuracy_reward": 0.8020833432674408, "rewards/format_reward": 0.9062500074505806, "step": 382 }, { "clip_ratio": 0.0, "completion_length": 1231.5104637145996, "epoch": 0.40853333333333336, "grad_norm": 1.785702109336853, "kl": 0.36920166015625, "learning_rate": 6.772401953393696e-07, "loss": 0.1348, "reward": 1.302083358168602, "reward_std": 0.40213219076395035, "rewards/accuracy_reward": 0.4791666744276881, "rewards/format_reward": 0.8229166865348816, "step": 383 }, { "clip_ratio": 0.0, "completion_length": 1040.8750190734863, "epoch": 0.4096, "grad_norm": 2.138511896133423, "kl": 0.4578857421875, "learning_rate": 6.757922837638796e-07, "loss": 0.3392, "reward": 1.4895833730697632, "reward_std": 0.6408408060669899, "rewards/accuracy_reward": 0.6145833544433117, "rewards/format_reward": 0.8750000223517418, "step": 384 }, { "clip_ratio": 0.0, "completion_length": 802.8750457763672, "epoch": 0.4106666666666667, "grad_norm": 1.046784520149231, "kl": 0.259857177734375, "learning_rate": 6.743429581078076e-07, "loss": 0.0747, "reward": 1.6458333656191826, "reward_std": 0.29978782683610916, "rewards/accuracy_reward": 0.7395833414047956, "rewards/format_reward": 0.9062500149011612, "step": 385 }, { "clip_ratio": 0.0, "completion_length": 1011.5521011352539, "epoch": 0.41173333333333334, "grad_norm": 0.871096134185791, "kl": 0.3658447265625, "learning_rate": 6.728922346635941e-07, "loss": 0.2731, "reward": 1.5312500447034836, "reward_std": 0.5864512249827385, "rewards/accuracy_reward": 0.6666666828095913, "rewards/format_reward": 0.8645833469927311, "step": 386 }, { "clip_ratio": 0.0, "completion_length": 797.0625267028809, "epoch": 0.4128, "grad_norm": 0.8144972324371338, "kl": 0.26123046875, "learning_rate": 6.714401297393922e-07, "loss": 0.1245, "reward": 1.666666716337204, "reward_std": 0.3884149417281151, "rewards/accuracy_reward": 0.7187500149011612, "rewards/format_reward": 0.947916679084301, "step": 387 }, { "clip_ratio": 0.0, "completion_length": 1023.8854522705078, "epoch": 0.41386666666666666, "grad_norm": 1.9077574014663696, "kl": 0.41729736328125, "learning_rate": 6.69986659658885e-07, "loss": 0.3564, "reward": 1.4375000223517418, "reward_std": 0.4380268417298794, "rewards/accuracy_reward": 0.5937500027939677, "rewards/format_reward": 0.8437500149011612, "step": 388 }, { "clip_ratio": 0.0, "completion_length": 1176.6771202087402, "epoch": 0.4149333333333333, "grad_norm": 2.1152803897857666, "kl": 0.594482421875, "learning_rate": 6.685318407611019e-07, "loss": 0.2635, "reward": 1.3229166865348816, "reward_std": 0.5500838123261929, "rewards/accuracy_reward": 0.5000000083819032, "rewards/format_reward": 0.822916679084301, "step": 389 }, { "clip_ratio": 0.0, "completion_length": 1137.6458930969238, "epoch": 0.416, "grad_norm": 1.5146763324737549, "kl": 0.55462646484375, "learning_rate": 6.67075689400235e-07, "loss": 0.2344, "reward": 1.208333358168602, "reward_std": 0.3566027730703354, "rewards/accuracy_reward": 0.4375, "rewards/format_reward": 0.7708333507180214, "step": 390 }, { "clip_ratio": 0.0, "completion_length": 1371.2083587646484, "epoch": 0.41706666666666664, "grad_norm": 1.61208975315094, "kl": 0.614959716796875, "learning_rate": 6.656182219454548e-07, "loss": 0.1713, "reward": 1.104166679084301, "reward_std": 0.4115760698914528, "rewards/accuracy_reward": 0.28125000558793545, "rewards/format_reward": 0.822916679084301, "step": 391 }, { "clip_ratio": 0.0, "completion_length": 1177.5937805175781, "epoch": 0.41813333333333336, "grad_norm": 1.679085612297058, "kl": 0.44256591796875, "learning_rate": 6.641594547807268e-07, "loss": 0.1816, "reward": 1.1666666865348816, "reward_std": 0.36804045736789703, "rewards/accuracy_reward": 0.35416666977107525, "rewards/format_reward": 0.8125000149011612, "step": 392 }, { "clip_ratio": 0.0, "completion_length": 1169.0312690734863, "epoch": 0.4192, "grad_norm": 2.0038578510284424, "kl": 0.4388427734375, "learning_rate": 6.626994043046269e-07, "loss": 0.1973, "reward": 1.3020833656191826, "reward_std": 0.44222358614206314, "rewards/accuracy_reward": 0.5000000102445483, "rewards/format_reward": 0.8020833507180214, "step": 393 }, { "clip_ratio": 0.0, "completion_length": 1093.9166793823242, "epoch": 0.4202666666666667, "grad_norm": 1.1773083209991455, "kl": 0.348175048828125, "learning_rate": 6.612380869301573e-07, "loss": 0.1622, "reward": 1.4166666939854622, "reward_std": 0.36502527073025703, "rewards/accuracy_reward": 0.5520833358168602, "rewards/format_reward": 0.8645833432674408, "step": 394 }, { "clip_ratio": 0.0, "completion_length": 958.4896011352539, "epoch": 0.42133333333333334, "grad_norm": 0.9272183775901794, "kl": 0.2109375, "learning_rate": 6.597755190845619e-07, "loss": 0.043, "reward": 1.5208333730697632, "reward_std": 0.4148876518011093, "rewards/accuracy_reward": 0.5833333432674408, "rewards/format_reward": 0.9375000149011612, "step": 395 }, { "clip_ratio": 0.0, "completion_length": 957.8333778381348, "epoch": 0.4224, "grad_norm": 0.8411521911621094, "kl": 0.19659423828125, "learning_rate": 6.583117172091415e-07, "loss": 0.0511, "reward": 1.614583358168602, "reward_std": 0.3648170605301857, "rewards/accuracy_reward": 0.7187500055879354, "rewards/format_reward": 0.8958333432674408, "step": 396 }, { "clip_ratio": 0.0, "completion_length": 1113.4062995910645, "epoch": 0.42346666666666666, "grad_norm": 1.1961528062820435, "kl": 0.232086181640625, "learning_rate": 6.568466977590694e-07, "loss": 0.1331, "reward": 1.5000000223517418, "reward_std": 0.3847620487213135, "rewards/accuracy_reward": 0.6041666716337204, "rewards/format_reward": 0.8958333432674408, "step": 397 }, { "clip_ratio": 0.0, "completion_length": 1058.104206085205, "epoch": 0.4245333333333333, "grad_norm": 1.5068055391311646, "kl": 0.2559814453125, "learning_rate": 6.553804772032059e-07, "loss": 0.1528, "reward": 1.4895833805203438, "reward_std": 0.4491768889129162, "rewards/accuracy_reward": 0.614583345130086, "rewards/format_reward": 0.8750000074505806, "step": 398 }, { "clip_ratio": 0.0, "completion_length": 1140.010456085205, "epoch": 0.4256, "grad_norm": 1.3789491653442383, "kl": 0.25872802734375, "learning_rate": 6.539130720239134e-07, "loss": 0.1769, "reward": 1.312500037252903, "reward_std": 0.4560883715748787, "rewards/accuracy_reward": 0.4791666669771075, "rewards/format_reward": 0.833333358168602, "step": 399 }, { "clip_ratio": 0.0, "completion_length": 1524.6041946411133, "epoch": 0.4266666666666667, "grad_norm": 0.7028902173042297, "kl": 0.425811767578125, "learning_rate": 6.524444987168713e-07, "loss": 0.2271, "reward": 1.1250000223517418, "reward_std": 0.566280260682106, "rewards/accuracy_reward": 0.33333334140479565, "rewards/format_reward": 0.7916666939854622, "step": 400 }, { "clip_ratio": 0.0, "completion_length": 927.8958549499512, "epoch": 0.42773333333333335, "grad_norm": 0.9502766132354736, "kl": 0.2158203125, "learning_rate": 6.509747737908904e-07, "loss": 0.0845, "reward": 1.5625000298023224, "reward_std": 0.3746686056256294, "rewards/accuracy_reward": 0.6458333395421505, "rewards/format_reward": 0.916666679084301, "step": 401 }, { "clip_ratio": 0.0, "completion_length": 924.3541946411133, "epoch": 0.4288, "grad_norm": 0.9651437997817993, "kl": 0.228515625, "learning_rate": 6.495039137677267e-07, "loss": 0.1053, "reward": 1.4166667088866234, "reward_std": 0.5610124319791794, "rewards/accuracy_reward": 0.5520833544433117, "rewards/format_reward": 0.8645833432674408, "step": 402 }, { "clip_ratio": 0.0, "completion_length": 1351.4688034057617, "epoch": 0.4298666666666667, "grad_norm": 1.2320345640182495, "kl": 0.19091796875, "learning_rate": 6.480319351818972e-07, "loss": 0.2591, "reward": 1.2187500447034836, "reward_std": 0.6209069006145, "rewards/accuracy_reward": 0.4062500139698386, "rewards/format_reward": 0.8125000149011612, "step": 403 }, { "clip_ratio": 0.0, "completion_length": 1325.4167137145996, "epoch": 0.43093333333333333, "grad_norm": 31.8544921875, "kl": 0.210205078125, "learning_rate": 6.465588545804927e-07, "loss": 0.1599, "reward": 1.4270833507180214, "reward_std": 0.32917357608675957, "rewards/accuracy_reward": 0.5625000027939677, "rewards/format_reward": 0.8645833432674408, "step": 404 }, { "clip_ratio": 0.0, "completion_length": 1317.9687957763672, "epoch": 0.432, "grad_norm": 18.878442764282227, "kl": 0.3192138671875, "learning_rate": 6.450846885229915e-07, "loss": 0.1699, "reward": 1.4270833805203438, "reward_std": 0.5161432698369026, "rewards/accuracy_reward": 0.520833345130086, "rewards/format_reward": 0.9062500149011612, "step": 405 }, { "clip_ratio": 0.0, "completion_length": 1525.5416793823242, "epoch": 0.43306666666666666, "grad_norm": 11.640298843383789, "kl": 0.18597412109375, "learning_rate": 6.436094535810754e-07, "loss": 0.1825, "reward": 1.1250000298023224, "reward_std": 0.4353012591600418, "rewards/accuracy_reward": 0.2916666679084301, "rewards/format_reward": 0.8333333507180214, "step": 406 }, { "clip_ratio": 0.0, "completion_length": 1335.364616394043, "epoch": 0.4341333333333333, "grad_norm": 5.88375997543335, "kl": 1.52130126953125, "learning_rate": 6.421331663384404e-07, "loss": 0.1575, "reward": 1.385416716337204, "reward_std": 0.5242011658847332, "rewards/accuracy_reward": 0.5104166753590107, "rewards/format_reward": 0.8750000149011612, "step": 407 }, { "clip_ratio": 0.0, "completion_length": 1102.2812614440918, "epoch": 0.4352, "grad_norm": 35.51770782470703, "kl": 0.76947021484375, "learning_rate": 6.40655843390613e-07, "loss": 0.0778, "reward": 1.697916716337204, "reward_std": 0.38435108959674835, "rewards/accuracy_reward": 0.8229166744276881, "rewards/format_reward": 0.8750000149011612, "step": 408 }, { "clip_ratio": 0.0, "completion_length": 1012.3021125793457, "epoch": 0.4362666666666667, "grad_norm": 34.375755310058594, "kl": 0.28704833984375, "learning_rate": 6.391775013447621e-07, "loss": 0.0845, "reward": 1.8229167014360428, "reward_std": 0.35347601026296616, "rewards/accuracy_reward": 0.8437500223517418, "rewards/format_reward": 0.9791666716337204, "step": 409 }, { "clip_ratio": 0.0, "completion_length": 971.3854446411133, "epoch": 0.43733333333333335, "grad_norm": 17.26616096496582, "kl": 0.33563232421875, "learning_rate": 6.376981568195124e-07, "loss": 0.0684, "reward": 1.604166716337204, "reward_std": 0.46712398529052734, "rewards/accuracy_reward": 0.6666666846722364, "rewards/format_reward": 0.9375000074505806, "step": 410 }, { "clip_ratio": 0.0, "completion_length": 999.4479370117188, "epoch": 0.4384, "grad_norm": 110.0888900756836, "kl": 2.1038818359375, "learning_rate": 6.362178264447581e-07, "loss": 0.2461, "reward": 1.5520833656191826, "reward_std": 0.4223914369940758, "rewards/accuracy_reward": 0.6354166753590107, "rewards/format_reward": 0.9166666716337204, "step": 411 }, { "clip_ratio": 0.0, "completion_length": 932.2916793823242, "epoch": 0.43946666666666667, "grad_norm": 79.39080047607422, "kl": 0.336669921875, "learning_rate": 6.347365268614758e-07, "loss": 0.1602, "reward": 1.5104166865348816, "reward_std": 0.23890409246087074, "rewards/accuracy_reward": 0.5937500027939677, "rewards/format_reward": 0.9166666716337204, "step": 412 }, { "clip_ratio": 0.0, "completion_length": 1485.3229370117188, "epoch": 0.44053333333333333, "grad_norm": 64.26009368896484, "kl": 0.2237548828125, "learning_rate": 6.33254274721537e-07, "loss": 0.1468, "reward": 1.4166666939854622, "reward_std": 0.5587838850915432, "rewards/accuracy_reward": 0.5937500149011612, "rewards/format_reward": 0.8229166865348816, "step": 413 }, { "clip_ratio": 0.0, "completion_length": 1471.4271697998047, "epoch": 0.4416, "grad_norm": 2521.252685546875, "kl": 9.8607177734375, "learning_rate": 6.317710866875218e-07, "loss": 0.8836, "reward": 1.3645833730697632, "reward_std": 0.5474788174033165, "rewards/accuracy_reward": 0.5208333423361182, "rewards/format_reward": 0.8437500223517418, "step": 414 }, { "clip_ratio": 0.0, "completion_length": 1196.4167022705078, "epoch": 0.44266666666666665, "grad_norm": 84.5479507446289, "kl": 0.17425537109375, "learning_rate": 6.302869794325306e-07, "loss": 0.2437, "reward": 1.5104167088866234, "reward_std": 0.6145225800573826, "rewards/accuracy_reward": 0.6666666828095913, "rewards/format_reward": 0.8437500223517418, "step": 415 }, { "clip_ratio": 0.0, "completion_length": 1319.625015258789, "epoch": 0.4437333333333333, "grad_norm": 54.84637451171875, "kl": 0.69024658203125, "learning_rate": 6.288019696399975e-07, "loss": 0.3185, "reward": 1.3229166939854622, "reward_std": 0.4518851414322853, "rewards/accuracy_reward": 0.4791666716337204, "rewards/format_reward": 0.8437500149011612, "step": 416 }, { "clip_ratio": 0.0, "completion_length": 1123.5312957763672, "epoch": 0.4448, "grad_norm": 593.1818237304688, "kl": 2.4298095703125, "learning_rate": 6.273160740035019e-07, "loss": 0.3677, "reward": 1.4270833730697632, "reward_std": 0.4870987832546234, "rewards/accuracy_reward": 0.5520833469927311, "rewards/format_reward": 0.8750000149011612, "step": 417 }, { "clip_ratio": 0.0, "completion_length": 1274.8646049499512, "epoch": 0.4458666666666667, "grad_norm": 87.65736389160156, "kl": 0.224884033203125, "learning_rate": 6.258293092265818e-07, "loss": 0.1591, "reward": 1.3020833730697632, "reward_std": 0.4722311571240425, "rewards/accuracy_reward": 0.42708334140479565, "rewards/format_reward": 0.8750000149011612, "step": 418 }, { "clip_ratio": 0.0, "completion_length": 1008.6354522705078, "epoch": 0.44693333333333335, "grad_norm": 13.259299278259277, "kl": 0.141815185546875, "learning_rate": 6.243416920225453e-07, "loss": 0.0754, "reward": 1.6041667014360428, "reward_std": 0.40286241471767426, "rewards/accuracy_reward": 0.6666666818782687, "rewards/format_reward": 0.9375000074505806, "step": 419 }, { "clip_ratio": 0.0, "completion_length": 1238.0416946411133, "epoch": 0.448, "grad_norm": 246.11990356445312, "kl": 1.67327880859375, "learning_rate": 6.228532391142827e-07, "loss": 0.4789, "reward": 1.406250037252903, "reward_std": 0.49672670662403107, "rewards/accuracy_reward": 0.5729166828095913, "rewards/format_reward": 0.8333333432674408, "step": 420 }, { "clip_ratio": 0.0, "completion_length": 1314.7500267028809, "epoch": 0.44906666666666667, "grad_norm": 64.00416564941406, "kl": 0.31744384765625, "learning_rate": 6.213639672340797e-07, "loss": 0.175, "reward": 1.375000026077032, "reward_std": 0.4715101718902588, "rewards/accuracy_reward": 0.5833333376795053, "rewards/format_reward": 0.7916666828095913, "step": 421 }, { "clip_ratio": 0.0, "completion_length": 1229.9792022705078, "epoch": 0.45013333333333333, "grad_norm": 27.404314041137695, "kl": 0.668060302734375, "learning_rate": 6.198738931234276e-07, "loss": 0.122, "reward": 1.3020833507180214, "reward_std": 0.5618328638374805, "rewards/accuracy_reward": 0.47916667349636555, "rewards/format_reward": 0.8229166865348816, "step": 422 }, { "clip_ratio": 0.0, "completion_length": 1061.9271278381348, "epoch": 0.4512, "grad_norm": 16.95953369140625, "kl": 0.106231689453125, "learning_rate": 6.183830335328354e-07, "loss": 0.0265, "reward": 1.6875000447034836, "reward_std": 0.2828211933374405, "rewards/accuracy_reward": 0.7500000111758709, "rewards/format_reward": 0.9375000074505806, "step": 423 }, { "clip_ratio": 0.0, "completion_length": 958.0729446411133, "epoch": 0.45226666666666665, "grad_norm": 28.86214828491211, "kl": 0.6239013671875, "learning_rate": 6.168914052216437e-07, "loss": 0.1723, "reward": 1.6250000298023224, "reward_std": 0.3731166943907738, "rewards/accuracy_reward": 0.7083333432674408, "rewards/format_reward": 0.916666679084301, "step": 424 }, { "clip_ratio": 0.0, "completion_length": 1213.3125610351562, "epoch": 0.4533333333333333, "grad_norm": 2575.681396484375, "kl": 13.8638916015625, "learning_rate": 6.153990249578328e-07, "loss": 0.8945, "reward": 1.447916679084301, "reward_std": 0.285232275724411, "rewards/accuracy_reward": 0.5729166697710752, "rewards/format_reward": 0.8750000149011612, "step": 425 }, { "clip_ratio": 0.0, "completion_length": 1073.8646087646484, "epoch": 0.4544, "grad_norm": 1776.67236328125, "kl": 12.0625, "learning_rate": 6.139059095178371e-07, "loss": 0.9893, "reward": 1.5937500223517418, "reward_std": 0.31957533583045006, "rewards/accuracy_reward": 0.6354166744276881, "rewards/format_reward": 0.9583333432674408, "step": 426 }, { "clip_ratio": 0.0, "completion_length": 917.7604446411133, "epoch": 0.4554666666666667, "grad_norm": 41.573211669921875, "kl": 0.64886474609375, "learning_rate": 6.124120756863547e-07, "loss": 0.0847, "reward": 1.4062500149011612, "reward_std": 0.3291121870279312, "rewards/accuracy_reward": 0.4791666744276881, "rewards/format_reward": 0.9270833432674408, "step": 427 }, { "clip_ratio": 0.0, "completion_length": 1000.5625076293945, "epoch": 0.45653333333333335, "grad_norm": 78.51416778564453, "kl": 0.3447265625, "learning_rate": 6.109175402561602e-07, "loss": 0.1635, "reward": 1.4895833656191826, "reward_std": 0.3995768278837204, "rewards/accuracy_reward": 0.6354166865348816, "rewards/format_reward": 0.8541666716337204, "step": 428 }, { "clip_ratio": 0.0, "completion_length": 677.4166831970215, "epoch": 0.4576, "grad_norm": 34.25173568725586, "kl": 0.15863037109375, "learning_rate": 6.094223200279145e-07, "loss": 0.0902, "reward": 1.5625000447034836, "reward_std": 0.31075509265065193, "rewards/accuracy_reward": 0.604166672565043, "rewards/format_reward": 0.9583333432674408, "step": 429 }, { "clip_ratio": 0.0, "completion_length": 1130.7917137145996, "epoch": 0.45866666666666667, "grad_norm": 611.9981079101562, "kl": 5.9510498046875, "learning_rate": 6.079264318099769e-07, "loss": 0.5612, "reward": 1.5416667088866234, "reward_std": 0.35896989330649376, "rewards/accuracy_reward": 0.6250000102445483, "rewards/format_reward": 0.9166666716337204, "step": 430 }, { "clip_ratio": 0.0, "completion_length": 967.3125305175781, "epoch": 0.4597333333333333, "grad_norm": 131.31832885742188, "kl": 1.2967529296875, "learning_rate": 6.064298924182157e-07, "loss": 0.1523, "reward": 1.593750037252903, "reward_std": 0.39825279265642166, "rewards/accuracy_reward": 0.6562500149011612, "rewards/format_reward": 0.9375000149011612, "step": 431 }, { "clip_ratio": 0.0, "completion_length": 1364.9687805175781, "epoch": 0.4608, "grad_norm": 745.2891235351562, "kl": 4.271026611328125, "learning_rate": 6.049327186758191e-07, "loss": 0.5222, "reward": 1.2395833730697632, "reward_std": 0.37972255051136017, "rewards/accuracy_reward": 0.37500000838190317, "rewards/format_reward": 0.8645833432674408, "step": 432 }, { "clip_ratio": 0.0, "completion_length": 1253.5625305175781, "epoch": 0.46186666666666665, "grad_norm": 52.66091537475586, "kl": 0.8101806640625, "learning_rate": 6.034349274131068e-07, "loss": 0.1619, "reward": 1.3333333656191826, "reward_std": 0.4226510338485241, "rewards/accuracy_reward": 0.40625000558793545, "rewards/format_reward": 0.9270833432674408, "step": 433 }, { "clip_ratio": 0.0, "completion_length": 1144.5104598999023, "epoch": 0.4629333333333333, "grad_norm": 58.70127487182617, "kl": 1.2481689453125, "learning_rate": 6.0193653546734e-07, "loss": 0.1093, "reward": 1.6875000298023224, "reward_std": 0.27582165598869324, "rewards/accuracy_reward": 0.7083333423361182, "rewards/format_reward": 0.9791666716337204, "step": 434 }, { "clip_ratio": 0.0, "completion_length": 997.5104675292969, "epoch": 0.464, "grad_norm": 288.7427978515625, "kl": 4.02593994140625, "learning_rate": 6.004375596825324e-07, "loss": 0.4087, "reward": 1.531250037252903, "reward_std": 0.43094127625226974, "rewards/accuracy_reward": 0.6562500158324838, "rewards/format_reward": 0.8750000074505806, "step": 435 }, { "clip_ratio": 0.0, "completion_length": 596.3021125793457, "epoch": 0.4650666666666667, "grad_norm": 4.796201705932617, "kl": 0.114044189453125, "learning_rate": 5.989380169092607e-07, "loss": 0.0515, "reward": 1.9062500298023224, "reward_std": 0.20439471304416656, "rewards/accuracy_reward": 0.916666679084301, "rewards/format_reward": 0.9895833358168602, "step": 436 }, { "clip_ratio": 0.0, "completion_length": 1265.8854598999023, "epoch": 0.46613333333333334, "grad_norm": 38.0161247253418, "kl": 0.342071533203125, "learning_rate": 5.974379240044757e-07, "loss": 0.1287, "reward": 1.4062500149011612, "reward_std": 0.3942745327949524, "rewards/accuracy_reward": 0.5312500102445483, "rewards/format_reward": 0.8750000149011612, "step": 437 }, { "clip_ratio": 0.0, "completion_length": 1063.9375648498535, "epoch": 0.4672, "grad_norm": 4.848820209503174, "kl": 0.138336181640625, "learning_rate": 5.959372978313126e-07, "loss": 0.0117, "reward": 1.4583333507180214, "reward_std": 0.36458762362599373, "rewards/accuracy_reward": 0.5625000083819032, "rewards/format_reward": 0.8958333432674408, "step": 438 }, { "clip_ratio": 0.0, "completion_length": 1370.885456085205, "epoch": 0.46826666666666666, "grad_norm": 39.78768539428711, "kl": 0.15106201171875, "learning_rate": 5.944361552589003e-07, "loss": 0.1002, "reward": 1.343750037252903, "reward_std": 0.43372881412506104, "rewards/accuracy_reward": 0.4687500102445483, "rewards/format_reward": 0.8750000149011612, "step": 439 }, { "clip_ratio": 0.0, "completion_length": 1080.2292137145996, "epoch": 0.4693333333333333, "grad_norm": 56.690887451171875, "kl": 0.576873779296875, "learning_rate": 5.92934513162174e-07, "loss": 0.2128, "reward": 1.5520833656191826, "reward_std": 0.3306523635983467, "rewards/accuracy_reward": 0.6666666744276881, "rewards/format_reward": 0.885416679084301, "step": 440 }, { "clip_ratio": 0.0, "completion_length": 1037.4896087646484, "epoch": 0.4704, "grad_norm": 16.32632064819336, "kl": 0.208282470703125, "learning_rate": 5.914323884216832e-07, "loss": 0.1075, "reward": 1.4687500521540642, "reward_std": 0.5128796398639679, "rewards/accuracy_reward": 0.5729166865348816, "rewards/format_reward": 0.8958333432674408, "step": 441 }, { "clip_ratio": 0.0, "completion_length": 849.4479446411133, "epoch": 0.47146666666666665, "grad_norm": 0.5189881324768066, "kl": 0.063629150390625, "learning_rate": 5.899297979234037e-07, "loss": 0.0337, "reward": 1.8750000298023224, "reward_std": 0.2106524109840393, "rewards/accuracy_reward": 0.8958333432674408, "rewards/format_reward": 0.9791666716337204, "step": 442 }, { "clip_ratio": 0.0, "completion_length": 1275.2604637145996, "epoch": 0.47253333333333336, "grad_norm": 10.518548965454102, "kl": 0.1610107421875, "learning_rate": 5.884267585585467e-07, "loss": 0.1015, "reward": 1.583333358168602, "reward_std": 0.3139677122235298, "rewards/accuracy_reward": 0.6875000074505806, "rewards/format_reward": 0.8958333358168602, "step": 443 }, { "clip_ratio": 0.0, "completion_length": 986.1250343322754, "epoch": 0.4736, "grad_norm": 1.4581832885742188, "kl": 0.062408447265625, "learning_rate": 5.869232872233695e-07, "loss": 0.0638, "reward": 1.7812500298023224, "reward_std": 0.1980806216597557, "rewards/accuracy_reward": 0.8020833376795053, "rewards/format_reward": 0.9791666716337204, "step": 444 }, { "clip_ratio": 0.0, "completion_length": 1108.6771087646484, "epoch": 0.4746666666666667, "grad_norm": 20.779022216796875, "kl": 0.157867431640625, "learning_rate": 5.854194008189851e-07, "loss": 0.1836, "reward": 1.5833333656191826, "reward_std": 0.34167931228876114, "rewards/accuracy_reward": 0.666666679084301, "rewards/format_reward": 0.916666679084301, "step": 445 }, { "clip_ratio": 0.0, "completion_length": 1228.416690826416, "epoch": 0.47573333333333334, "grad_norm": 4.517639636993408, "kl": 0.38446044921875, "learning_rate": 5.839151162511727e-07, "loss": 0.0156, "reward": 1.75, "reward_std": 0.17834587395191193, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 0.96875, "step": 446 }, { "clip_ratio": 0.0, "completion_length": 854.9375228881836, "epoch": 0.4768, "grad_norm": 1.2065224647521973, "kl": 0.062042236328125, "learning_rate": 5.824104504301874e-07, "loss": 0.0921, "reward": 1.5312500447034836, "reward_std": 0.4021575525403023, "rewards/accuracy_reward": 0.5937500223517418, "rewards/format_reward": 0.9375000149011612, "step": 447 }, { "clip_ratio": 0.0, "completion_length": 960.0729293823242, "epoch": 0.47786666666666666, "grad_norm": 2.8080995082855225, "kl": 0.0590362548828125, "learning_rate": 5.809054202705698e-07, "loss": 0.1354, "reward": 1.562500037252903, "reward_std": 0.24164992570877075, "rewards/accuracy_reward": 0.6041666744276881, "rewards/format_reward": 0.9583333432674408, "step": 448 }, { "clip_ratio": 0.0, "completion_length": 844.3333511352539, "epoch": 0.4789333333333333, "grad_norm": 0.3209676444530487, "kl": 0.05377197265625, "learning_rate": 5.794000426909568e-07, "loss": -0.0054, "reward": 1.7187500298023224, "reward_std": 0.22601452097296715, "rewards/accuracy_reward": 0.7187500149011612, "rewards/format_reward": 1.0, "step": 449 }, { "clip_ratio": 0.0, "completion_length": 1069.8333702087402, "epoch": 0.48, "grad_norm": 181.88961791992188, "kl": 5.291717529296875, "learning_rate": 5.778943346138898e-07, "loss": 0.7317, "reward": 1.562500037252903, "reward_std": 0.49867335706949234, "rewards/accuracy_reward": 0.6562500223517418, "rewards/format_reward": 0.9062500074505806, "step": 450 }, { "clip_ratio": 0.0, "completion_length": 1527.4583587646484, "epoch": 0.48106666666666664, "grad_norm": 90.7333984375, "kl": 3.141571044921875, "learning_rate": 5.763883129656265e-07, "loss": 0.3785, "reward": 1.1354167088866234, "reward_std": 0.48482025042176247, "rewards/accuracy_reward": 0.3020833432674408, "rewards/format_reward": 0.8333333507180214, "step": 451 }, { "clip_ratio": 0.0, "completion_length": 882.5208396911621, "epoch": 0.48213333333333336, "grad_norm": 48.06346130371094, "kl": 0.242095947265625, "learning_rate": 5.748819946759489e-07, "loss": 0.0636, "reward": 1.697916716337204, "reward_std": 0.37512117996811867, "rewards/accuracy_reward": 0.7395833507180214, "rewards/format_reward": 0.9583333432674408, "step": 452 }, { "clip_ratio": 0.0, "completion_length": 1034.2083587646484, "epoch": 0.4832, "grad_norm": 177.72445678710938, "kl": 2.21392822265625, "learning_rate": 5.73375396677974e-07, "loss": 0.2768, "reward": 1.5208333507180214, "reward_std": 0.3366379141807556, "rewards/accuracy_reward": 0.593750013038516, "rewards/format_reward": 0.9270833432674408, "step": 453 }, { "clip_ratio": 0.0, "completion_length": 1132.0416946411133, "epoch": 0.4842666666666667, "grad_norm": 29.359024047851562, "kl": 1.2421722412109375, "learning_rate": 5.718685359079631e-07, "loss": 0.1471, "reward": 1.6145833507180214, "reward_std": 0.39312246814370155, "rewards/accuracy_reward": 0.6979166753590107, "rewards/format_reward": 0.916666679084301, "step": 454 }, { "clip_ratio": 0.0, "completion_length": 1311.3542098999023, "epoch": 0.48533333333333334, "grad_norm": 41.005043029785156, "kl": 0.35418701171875, "learning_rate": 5.703614293051308e-07, "loss": 0.0909, "reward": 1.5625000298023224, "reward_std": 0.5347935371100903, "rewards/accuracy_reward": 0.6875000111758709, "rewards/format_reward": 0.8750000074505806, "step": 455 }, { "clip_ratio": 0.0, "completion_length": 1434.5417251586914, "epoch": 0.4864, "grad_norm": 75.74324035644531, "kl": 0.23590087890625, "learning_rate": 5.688540938114564e-07, "loss": 0.2017, "reward": 1.416666716337204, "reward_std": 0.5128495693206787, "rewards/accuracy_reward": 0.5000000158324838, "rewards/format_reward": 0.916666679084301, "step": 456 }, { "clip_ratio": 0.0, "completion_length": 1352.00004196167, "epoch": 0.48746666666666666, "grad_norm": 37.30264663696289, "kl": 0.634918212890625, "learning_rate": 5.67346546371491e-07, "loss": 0.2053, "reward": 1.4375000447034836, "reward_std": 0.5411749184131622, "rewards/accuracy_reward": 0.5937500102445483, "rewards/format_reward": 0.8437500223517418, "step": 457 }, { "clip_ratio": 0.0, "completion_length": 983.7604370117188, "epoch": 0.4885333333333333, "grad_norm": 48.25519561767578, "kl": 0.329681396484375, "learning_rate": 5.658388039321694e-07, "loss": 0.148, "reward": 1.6041666939854622, "reward_std": 0.2859203666448593, "rewards/accuracy_reward": 0.6562500093132257, "rewards/format_reward": 0.947916679084301, "step": 458 }, { "clip_ratio": 0.0, "completion_length": 860.5937805175781, "epoch": 0.4896, "grad_norm": 0.8728722929954529, "kl": 0.17864990234375, "learning_rate": 5.643308834426173e-07, "loss": 0.0306, "reward": 1.656250037252903, "reward_std": 0.23832672834396362, "rewards/accuracy_reward": 0.7187500027939677, "rewards/format_reward": 0.9375000074505806, "step": 459 }, { "clip_ratio": 0.0, "completion_length": 1391.5104370117188, "epoch": 0.49066666666666664, "grad_norm": 89.86829376220703, "kl": 3.979095458984375, "learning_rate": 5.628228018539631e-07, "loss": 0.491, "reward": 1.5833333730697632, "reward_std": 0.44540898501873016, "rewards/accuracy_reward": 0.6979166818782687, "rewards/format_reward": 0.8854166716337204, "step": 460 }, { "clip_ratio": 0.0, "completion_length": 1070.947940826416, "epoch": 0.49173333333333336, "grad_norm": 2.7298476696014404, "kl": 0.3533477783203125, "learning_rate": 5.613145761191451e-07, "loss": 0.0235, "reward": 1.5416666865348816, "reward_std": 0.3220609836280346, "rewards/accuracy_reward": 0.6041666753590107, "rewards/format_reward": 0.9375000074505806, "step": 461 }, { "clip_ratio": 0.0, "completion_length": 1347.1354598999023, "epoch": 0.4928, "grad_norm": 8.43759536743164, "kl": 1.178955078125, "learning_rate": 5.598062231927232e-07, "loss": 0.1993, "reward": 1.3750000149011612, "reward_std": 0.37342574819922447, "rewards/accuracy_reward": 0.5000000111758709, "rewards/format_reward": 0.8750000074505806, "step": 462 }, { "clip_ratio": 0.0, "completion_length": 980.8333892822266, "epoch": 0.4938666666666667, "grad_norm": 3.5615270137786865, "kl": 0.174957275390625, "learning_rate": 5.582977600306856e-07, "loss": 0.1205, "reward": 1.6875000447034836, "reward_std": 0.33051614835858345, "rewards/accuracy_reward": 0.7187500149011612, "rewards/format_reward": 0.96875, "step": 463 }, { "clip_ratio": 0.0, "completion_length": 1073.8229217529297, "epoch": 0.49493333333333334, "grad_norm": 2.344088554382324, "kl": 0.089935302734375, "learning_rate": 5.567892035902612e-07, "loss": 0.026, "reward": 1.614583358168602, "reward_std": 0.3041832111775875, "rewards/accuracy_reward": 0.6875000074505806, "rewards/format_reward": 0.9270833358168602, "step": 464 }, { "clip_ratio": 0.0, "completion_length": 895.2604522705078, "epoch": 0.496, "grad_norm": 241.40060424804688, "kl": 2.470977783203125, "learning_rate": 5.552805708297264e-07, "loss": 0.1642, "reward": 1.7083333656191826, "reward_std": 0.3530357927083969, "rewards/accuracy_reward": 0.7708333507180214, "rewards/format_reward": 0.9375000074505806, "step": 465 }, { "clip_ratio": 0.0, "completion_length": 1455.1666870117188, "epoch": 0.49706666666666666, "grad_norm": 10.934683799743652, "kl": 0.439605712890625, "learning_rate": 5.537718787082162e-07, "loss": 0.1509, "reward": 1.3541667014360428, "reward_std": 0.4359077699482441, "rewards/accuracy_reward": 0.5104166753590107, "rewards/format_reward": 0.8437500149011612, "step": 466 }, { "clip_ratio": 0.0, "completion_length": 1263.1562881469727, "epoch": 0.4981333333333333, "grad_norm": 6.648770332336426, "kl": 0.164031982421875, "learning_rate": 5.522631441855325e-07, "loss": 0.2001, "reward": 1.312500037252903, "reward_std": 0.4301319867372513, "rewards/accuracy_reward": 0.3750000102445483, "rewards/format_reward": 0.9375000149011612, "step": 467 }, { "clip_ratio": 0.0, "completion_length": 1069.8541946411133, "epoch": 0.4992, "grad_norm": 159.56680297851562, "kl": 0.593292236328125, "learning_rate": 5.507543842219541e-07, "loss": 0.1154, "reward": 1.5833333730697632, "reward_std": 0.2916114218533039, "rewards/accuracy_reward": 0.6875000074505806, "rewards/format_reward": 0.8958333358168602, "step": 468 }, { "clip_ratio": 0.0, "completion_length": 979.9792060852051, "epoch": 0.5002666666666666, "grad_norm": 0.7747508883476257, "kl": 0.065338134765625, "learning_rate": 5.492456157780459e-07, "loss": -0.0019, "reward": 1.7395833730697632, "reward_std": 0.3105490952730179, "rewards/accuracy_reward": 0.7604166753590107, "rewards/format_reward": 0.9791666716337204, "step": 469 }, { "clip_ratio": 0.0, "completion_length": 860.9166946411133, "epoch": 0.5013333333333333, "grad_norm": 1.0076816082000732, "kl": 0.201873779296875, "learning_rate": 5.477368558144674e-07, "loss": -0.0253, "reward": 1.6354167014360428, "reward_std": 0.28756027296185493, "rewards/accuracy_reward": 0.6562500111758709, "rewards/format_reward": 0.9791666716337204, "step": 470 }, { "clip_ratio": 0.0, "completion_length": 1240.3437805175781, "epoch": 0.5024, "grad_norm": 0.30497127771377563, "kl": 0.0601043701171875, "learning_rate": 5.462281212917839e-07, "loss": 0.0466, "reward": 1.5937500298023224, "reward_std": 0.29314732179045677, "rewards/accuracy_reward": 0.6145833441987634, "rewards/format_reward": 0.9791666716337204, "step": 471 }, { "clip_ratio": 0.0, "completion_length": 1267.4062881469727, "epoch": 0.5034666666666666, "grad_norm": 3.5861380100250244, "kl": 0.703887939453125, "learning_rate": 5.447194291702737e-07, "loss": 0.2513, "reward": 1.312500037252903, "reward_std": 0.4364953637123108, "rewards/accuracy_reward": 0.4583333358168602, "rewards/format_reward": 0.854166679084301, "step": 472 }, { "clip_ratio": 0.0, "completion_length": 974.8437881469727, "epoch": 0.5045333333333333, "grad_norm": 1.9175397157669067, "kl": 0.19952392578125, "learning_rate": 5.432107964097389e-07, "loss": -0.0385, "reward": 1.4479166865348816, "reward_std": 0.4383159466087818, "rewards/accuracy_reward": 0.5104166828095913, "rewards/format_reward": 0.9375000149011612, "step": 473 }, { "clip_ratio": 0.0, "completion_length": 978.5417137145996, "epoch": 0.5056, "grad_norm": 3.128610372543335, "kl": 0.45489501953125, "learning_rate": 5.417022399693144e-07, "loss": 0.118, "reward": 1.5416667014360428, "reward_std": 0.3715756759047508, "rewards/accuracy_reward": 0.5729166781529784, "rewards/format_reward": 0.9687500074505806, "step": 474 }, { "clip_ratio": 0.0, "completion_length": 989.6666946411133, "epoch": 0.5066666666666667, "grad_norm": 2.618649482727051, "kl": 0.47442626953125, "learning_rate": 5.401937768072769e-07, "loss": 0.023, "reward": 1.562500037252903, "reward_std": 0.31314966827630997, "rewards/accuracy_reward": 0.6354166734963655, "rewards/format_reward": 0.9270833432674408, "step": 475 }, { "clip_ratio": 0.0, "completion_length": 1209.9062805175781, "epoch": 0.5077333333333334, "grad_norm": 0.3048057556152344, "kl": 0.0694427490234375, "learning_rate": 5.386854238808547e-07, "loss": -0.0464, "reward": 1.5208333656191826, "reward_std": 0.4437421075999737, "rewards/accuracy_reward": 0.5833333460614085, "rewards/format_reward": 0.9375000074505806, "step": 476 }, { "clip_ratio": 0.0, "completion_length": 995.2187652587891, "epoch": 0.5088, "grad_norm": 0.8494847416877747, "kl": 0.1318511962890625, "learning_rate": 5.37177198146037e-07, "loss": 0.0434, "reward": 1.7083333730697632, "reward_std": 0.26209891214966774, "rewards/accuracy_reward": 0.729166679084301, "rewards/format_reward": 0.9791666716337204, "step": 477 }, { "clip_ratio": 0.0, "completion_length": 1016.5000228881836, "epoch": 0.5098666666666667, "grad_norm": 3759.301513671875, "kl": 23.200286865234375, "learning_rate": 5.356691165573826e-07, "loss": 2.7576, "reward": 1.6145833730697632, "reward_std": 0.3837018124759197, "rewards/accuracy_reward": 0.6875000074505806, "rewards/format_reward": 0.9270833432674408, "step": 478 }, { "clip_ratio": 0.0, "completion_length": 827.510440826416, "epoch": 0.5109333333333334, "grad_norm": 3.650665044784546, "kl": 0.505584716796875, "learning_rate": 5.341611960678306e-07, "loss": 0.0581, "reward": 1.614583358168602, "reward_std": 0.29390744492411613, "rewards/accuracy_reward": 0.6458333432674408, "rewards/format_reward": 0.96875, "step": 479 }, { "clip_ratio": 0.0, "completion_length": 1121.2500190734863, "epoch": 0.512, "grad_norm": 18.437366485595703, "kl": 0.6036529541015625, "learning_rate": 5.32653453628509e-07, "loss": -0.0085, "reward": 1.4479166939854622, "reward_std": 0.42354104667901993, "rewards/accuracy_reward": 0.5729166716337204, "rewards/format_reward": 0.8750000074505806, "step": 480 }, { "clip_ratio": 0.0, "completion_length": 889.2708740234375, "epoch": 0.5130666666666667, "grad_norm": 13.363719940185547, "kl": 0.39764404296875, "learning_rate": 5.311459061885436e-07, "loss": 0.1321, "reward": 1.5833333879709244, "reward_std": 0.4101930595934391, "rewards/accuracy_reward": 0.6354166744276881, "rewards/format_reward": 0.947916679084301, "step": 481 }, { "clip_ratio": 0.0, "completion_length": 1145.395851135254, "epoch": 0.5141333333333333, "grad_norm": 4.226874351501465, "kl": 0.5244140625, "learning_rate": 5.296385706948692e-07, "loss": 0.1036, "reward": 1.5729166865348816, "reward_std": 0.318786159157753, "rewards/accuracy_reward": 0.6458333358168602, "rewards/format_reward": 0.9270833358168602, "step": 482 }, { "clip_ratio": 0.0, "completion_length": 873.0833549499512, "epoch": 0.5152, "grad_norm": 0.3591465353965759, "kl": 0.050262451171875, "learning_rate": 5.281314640920371e-07, "loss": 0.0348, "reward": 1.7604166939854622, "reward_std": 0.24359868466854095, "rewards/accuracy_reward": 0.7812500149011612, "rewards/format_reward": 0.9791666716337204, "step": 483 }, { "clip_ratio": 0.0, "completion_length": 1121.520839691162, "epoch": 0.5162666666666667, "grad_norm": 1.356518268585205, "kl": 0.08282470703125, "learning_rate": 5.26624603322026e-07, "loss": 0.0313, "reward": 1.3229167014360428, "reward_std": 0.21845510601997375, "rewards/accuracy_reward": 0.35416666977107525, "rewards/format_reward": 0.9687500074505806, "step": 484 }, { "clip_ratio": 0.0, "completion_length": 843.1250343322754, "epoch": 0.5173333333333333, "grad_norm": 6.40158748626709, "kl": 0.090423583984375, "learning_rate": 5.251180053240511e-07, "loss": 0.0202, "reward": 1.750000037252903, "reward_std": 0.32716843485832214, "rewards/accuracy_reward": 0.7916666846722364, "rewards/format_reward": 0.9583333358168602, "step": 485 }, { "clip_ratio": 0.0, "completion_length": 1090.2604637145996, "epoch": 0.5184, "grad_norm": 4.955749034881592, "kl": 0.311309814453125, "learning_rate": 5.236116870343736e-07, "loss": 0.1316, "reward": 1.520833358168602, "reward_std": 0.375509899109602, "rewards/accuracy_reward": 0.6041666734963655, "rewards/format_reward": 0.916666679084301, "step": 486 }, { "clip_ratio": 0.0, "completion_length": 955.9583549499512, "epoch": 0.5194666666666666, "grad_norm": 1.1546536684036255, "kl": 0.3025665283203125, "learning_rate": 5.221056653861102e-07, "loss": 0.0909, "reward": 1.6458333730697632, "reward_std": 0.38057558983564377, "rewards/accuracy_reward": 0.6875000223517418, "rewards/format_reward": 0.9583333432674408, "step": 487 }, { "clip_ratio": 0.0, "completion_length": 1322.1875343322754, "epoch": 0.5205333333333333, "grad_norm": 22.05941390991211, "kl": 0.11602783203125, "learning_rate": 5.205999573090434e-07, "loss": 0.1143, "reward": 1.364583358168602, "reward_std": 0.4962585009634495, "rewards/accuracy_reward": 0.48958334885537624, "rewards/format_reward": 0.8750000074505806, "step": 488 }, { "clip_ratio": 0.0, "completion_length": 872.5833549499512, "epoch": 0.5216, "grad_norm": 14.048080444335938, "kl": 0.45220947265625, "learning_rate": 5.190945797294301e-07, "loss": 0.1018, "reward": 1.6458333805203438, "reward_std": 0.3703707158565521, "rewards/accuracy_reward": 0.7083333432674408, "rewards/format_reward": 0.9375000074505806, "step": 489 }, { "clip_ratio": 0.0, "completion_length": 1007.9062614440918, "epoch": 0.5226666666666666, "grad_norm": 0.8130990266799927, "kl": 0.1074981689453125, "learning_rate": 5.175895495698126e-07, "loss": 0.0862, "reward": 1.677083358168602, "reward_std": 0.30308152735233307, "rewards/accuracy_reward": 0.708333345130086, "rewards/format_reward": 0.9687500074505806, "step": 490 }, { "clip_ratio": 0.0, "completion_length": 936.3125152587891, "epoch": 0.5237333333333334, "grad_norm": 2.514721632003784, "kl": 0.439361572265625, "learning_rate": 5.160848837488273e-07, "loss": 0.128, "reward": 1.5937500298023224, "reward_std": 0.4219443053007126, "rewards/accuracy_reward": 0.6666666753590107, "rewards/format_reward": 0.9270833432674408, "step": 491 }, { "clip_ratio": 0.0, "completion_length": 887.8333587646484, "epoch": 0.5248, "grad_norm": 4.754851341247559, "kl": 1.089324951171875, "learning_rate": 5.145805991810149e-07, "loss": 0.1297, "reward": 1.6041667088866234, "reward_std": 0.2744111120700836, "rewards/accuracy_reward": 0.6458333432674408, "rewards/format_reward": 0.9583333432674408, "step": 492 }, { "clip_ratio": 0.0, "completion_length": 1148.291690826416, "epoch": 0.5258666666666667, "grad_norm": 51.733585357666016, "kl": 0.519378662109375, "learning_rate": 5.130767127766306e-07, "loss": 0.22, "reward": 1.354166679084301, "reward_std": 0.32098614424467087, "rewards/accuracy_reward": 0.43750000558793545, "rewards/format_reward": 0.916666679084301, "step": 493 }, { "clip_ratio": 0.0, "completion_length": 831.9375114440918, "epoch": 0.5269333333333334, "grad_norm": 0.6235740184783936, "kl": 0.20062255859375, "learning_rate": 5.115732414414534e-07, "loss": 0.0566, "reward": 1.7395833432674408, "reward_std": 0.33314163982868195, "rewards/accuracy_reward": 0.7708333432674408, "rewards/format_reward": 0.9687500074505806, "step": 494 }, { "clip_ratio": 0.0, "completion_length": 776.2604446411133, "epoch": 0.528, "grad_norm": 0.761950671672821, "kl": 0.15130615234375, "learning_rate": 5.100702020765963e-07, "loss": 0.0005, "reward": 1.7604166939854622, "reward_std": 0.26212893426418304, "rewards/accuracy_reward": 0.7916666818782687, "rewards/format_reward": 0.9687500074505806, "step": 495 }, { "clip_ratio": 0.0, "completion_length": 932.8750343322754, "epoch": 0.5290666666666667, "grad_norm": 0.49579811096191406, "kl": 0.0848846435546875, "learning_rate": 5.085676115783169e-07, "loss": -0.0108, "reward": 1.5937500298023224, "reward_std": 0.40676072984933853, "rewards/accuracy_reward": 0.6666666818782687, "rewards/format_reward": 0.9270833432674408, "step": 496 }, { "clip_ratio": 0.0, "completion_length": 1092.9167022705078, "epoch": 0.5301333333333333, "grad_norm": 18.881961822509766, "kl": 1.3722686767578125, "learning_rate": 5.070654868378262e-07, "loss": 0.1628, "reward": 1.5208333805203438, "reward_std": 0.3837137147784233, "rewards/accuracy_reward": 0.6041666828095913, "rewards/format_reward": 0.916666679084301, "step": 497 }, { "clip_ratio": 0.0, "completion_length": 860.052116394043, "epoch": 0.5312, "grad_norm": 0.5304872989654541, "kl": 0.0459136962890625, "learning_rate": 5.055638447410996e-07, "loss": -0.0173, "reward": 1.6666666865348816, "reward_std": 0.23389171808958054, "rewards/accuracy_reward": 0.6979166669771075, "rewards/format_reward": 0.9687500074505806, "step": 498 }, { "clip_ratio": 0.0, "completion_length": 1267.9791984558105, "epoch": 0.5322666666666667, "grad_norm": 1.2921442985534668, "kl": 0.2737274169921875, "learning_rate": 5.040627021686874e-07, "loss": 0.2063, "reward": 1.40625, "reward_std": 0.34152011945843697, "rewards/accuracy_reward": 0.541666672565043, "rewards/format_reward": 0.8645833432674408, "step": 499 }, { "clip_ratio": 0.0, "completion_length": 854.5625419616699, "epoch": 0.5333333333333333, "grad_norm": 0.3303286135196686, "kl": 0.038055419921875, "learning_rate": 5.025620759955241e-07, "loss": -0.0144, "reward": 1.614583358168602, "reward_std": 0.32799656316637993, "rewards/accuracy_reward": 0.6354166734963655, "rewards/format_reward": 0.9791666716337204, "step": 500 }, { "clip_ratio": 0.0, "completion_length": 989.2917022705078, "epoch": 0.5344, "grad_norm": 0.44046640396118164, "kl": 0.0551910400390625, "learning_rate": 5.010619830907393e-07, "loss": 0.0506, "reward": 1.2604166865348816, "reward_std": 0.42462046071887016, "rewards/accuracy_reward": 0.3958333432674408, "rewards/format_reward": 0.8645833432674408, "step": 501 }, { "clip_ratio": 0.0, "completion_length": 1111.9375228881836, "epoch": 0.5354666666666666, "grad_norm": 9.999726295471191, "kl": 0.4522857666015625, "learning_rate": 4.995624403174676e-07, "loss": 0.0304, "reward": 1.5104167014360428, "reward_std": 0.39825279265642166, "rewards/accuracy_reward": 0.604166679084301, "rewards/format_reward": 0.9062500149011612, "step": 502 }, { "clip_ratio": 0.0, "completion_length": 1148.8646240234375, "epoch": 0.5365333333333333, "grad_norm": 0.6131556630134583, "kl": 0.068115234375, "learning_rate": 4.980634645326601e-07, "loss": 0.0842, "reward": 1.5104166865348816, "reward_std": 0.35300539806485176, "rewards/accuracy_reward": 0.5520833395421505, "rewards/format_reward": 0.9583333432674408, "step": 503 }, { "clip_ratio": 0.0, "completion_length": 959.8541946411133, "epoch": 0.5376, "grad_norm": 0.1613159477710724, "kl": 0.0384674072265625, "learning_rate": 4.965650725868931e-07, "loss": 0.0297, "reward": 1.7812500149011612, "reward_std": 0.1569838561117649, "rewards/accuracy_reward": 0.8020833376795053, "rewards/format_reward": 0.9791666716337204, "step": 504 }, { "clip_ratio": 0.0, "completion_length": 723.7187614440918, "epoch": 0.5386666666666666, "grad_norm": 0.19780069589614868, "kl": 0.032012939453125, "learning_rate": 4.950672813241809e-07, "loss": -0.009, "reward": 1.8229166865348816, "reward_std": 0.17735834047198296, "rewards/accuracy_reward": 0.8333333460614085, "rewards/format_reward": 0.9895833358168602, "step": 505 }, { "clip_ratio": 0.0, "completion_length": 1027.500015258789, "epoch": 0.5397333333333333, "grad_norm": 0.3331089913845062, "kl": 0.0340728759765625, "learning_rate": 4.935701075817843e-07, "loss": 0.0504, "reward": 1.6666667088866234, "reward_std": 0.20588331669569016, "rewards/accuracy_reward": 0.6875000074505806, "rewards/format_reward": 0.9791666716337204, "step": 506 }, { "clip_ratio": 0.0, "completion_length": 1171.2812728881836, "epoch": 0.5408, "grad_norm": 7.195439338684082, "kl": 0.11244964599609375, "learning_rate": 4.920735681900231e-07, "loss": 0.1318, "reward": 1.7187500447034836, "reward_std": 0.33477523922920227, "rewards/accuracy_reward": 0.781250013038516, "rewards/format_reward": 0.9375000149011612, "step": 507 }, { "clip_ratio": 0.0, "completion_length": 1006.0625343322754, "epoch": 0.5418666666666667, "grad_norm": 0.5811529159545898, "kl": 0.057403564453125, "learning_rate": 4.905776799720854e-07, "loss": 0.008, "reward": 1.5937500521540642, "reward_std": 0.3521696403622627, "rewards/accuracy_reward": 0.6562500149011612, "rewards/format_reward": 0.9375000074505806, "step": 508 }, { "clip_ratio": 0.0, "completion_length": 1065.9375267028809, "epoch": 0.5429333333333334, "grad_norm": 1.2165635824203491, "kl": 0.037017822265625, "learning_rate": 4.890824597438397e-07, "loss": 0.0574, "reward": 1.7187500149011612, "reward_std": 0.4026795066893101, "rewards/accuracy_reward": 0.7604166865348816, "rewards/format_reward": 0.9583333432674408, "step": 509 }, { "clip_ratio": 0.0, "completion_length": 857.604190826416, "epoch": 0.544, "grad_norm": 0.2694817781448364, "kl": 0.037750244140625, "learning_rate": 4.875879243136452e-07, "loss": -0.014, "reward": 1.8125000149011612, "reward_std": 0.2661800757050514, "rewards/accuracy_reward": 0.822916679084301, "rewards/format_reward": 0.9895833358168602, "step": 510 }, { "clip_ratio": 0.0, "completion_length": 744.7500267028809, "epoch": 0.5450666666666667, "grad_norm": 0.36050766706466675, "kl": 0.0471954345703125, "learning_rate": 4.86094090482163e-07, "loss": -0.0344, "reward": 1.9062500149011612, "reward_std": 0.16673531010746956, "rewards/accuracy_reward": 0.9062500074505806, "rewards/format_reward": 1.0, "step": 511 }, { "clip_ratio": 0.0, "completion_length": 853.8958702087402, "epoch": 0.5461333333333334, "grad_norm": 0.3080451190471649, "kl": 0.05609130859375, "learning_rate": 4.846009750421671e-07, "loss": -0.0196, "reward": 1.6354166939854622, "reward_std": 0.18967054039239883, "rewards/accuracy_reward": 0.6666666716337204, "rewards/format_reward": 0.9687500074505806, "step": 512 }, { "clip_ratio": 0.0, "completion_length": 1263.7083892822266, "epoch": 0.5472, "grad_norm": 1.284267544746399, "kl": 0.0446929931640625, "learning_rate": 4.831085947783563e-07, "loss": 0.138, "reward": 1.364583358168602, "reward_std": 0.3560374788939953, "rewards/accuracy_reward": 0.416666672565043, "rewards/format_reward": 0.9479166716337204, "step": 513 }, { "clip_ratio": 0.0, "completion_length": 1404.0417022705078, "epoch": 0.5482666666666667, "grad_norm": 0.511377215385437, "kl": 0.0489654541015625, "learning_rate": 4.816169664671645e-07, "loss": 0.0465, "reward": 1.4166666828095913, "reward_std": 0.2239840179681778, "rewards/accuracy_reward": 0.5104166753590107, "rewards/format_reward": 0.9062500037252903, "step": 514 }, { "clip_ratio": 0.0, "completion_length": 787.9270992279053, "epoch": 0.5493333333333333, "grad_norm": 0.5615193247795105, "kl": 0.044647216796875, "learning_rate": 4.801261068765725e-07, "loss": 0.0281, "reward": 1.583333358168602, "reward_std": 0.35948578268289566, "rewards/accuracy_reward": 0.6250000102445483, "rewards/format_reward": 0.9583333432674408, "step": 515 }, { "clip_ratio": 0.0, "completion_length": 1206.4479446411133, "epoch": 0.5504, "grad_norm": 2.2395873069763184, "kl": 0.046356201171875, "learning_rate": 4.786360327659203e-07, "loss": 0.1542, "reward": 1.6979167014360428, "reward_std": 0.5004544034600258, "rewards/accuracy_reward": 0.8125000298023224, "rewards/format_reward": 0.885416679084301, "step": 516 }, { "clip_ratio": 0.0, "completion_length": 1229.6146507263184, "epoch": 0.5514666666666667, "grad_norm": 0.27496543526649475, "kl": 0.03912353515625, "learning_rate": 4.771467608857172e-07, "loss": 0.0652, "reward": 1.7916666865348816, "reward_std": 0.17188428342342377, "rewards/accuracy_reward": 0.8229166716337204, "rewards/format_reward": 0.96875, "step": 517 }, { "clip_ratio": 0.0, "completion_length": 1653.0729637145996, "epoch": 0.5525333333333333, "grad_norm": 1.173843502998352, "kl": 0.0523681640625, "learning_rate": 4.7565830797745475e-07, "loss": 0.0626, "reward": 1.3541666865348816, "reward_std": 0.3583943210542202, "rewards/accuracy_reward": 0.4895833395421505, "rewards/format_reward": 0.8645833358168602, "step": 518 }, { "clip_ratio": 0.0, "completion_length": 990.6875305175781, "epoch": 0.5536, "grad_norm": 1.2484556436538696, "kl": 0.04754638671875, "learning_rate": 4.741706907734183e-07, "loss": 0.1011, "reward": 1.770833358168602, "reward_std": 0.3177112191915512, "rewards/accuracy_reward": 0.8229166753590107, "rewards/format_reward": 0.947916679084301, "step": 519 }, { "clip_ratio": 0.0, "completion_length": 1078.8750228881836, "epoch": 0.5546666666666666, "grad_norm": 1.2922035455703735, "kl": 0.0792236328125, "learning_rate": 4.7268392599649807e-07, "loss": 0.0598, "reward": 1.6562500223517418, "reward_std": 0.3240548260509968, "rewards/accuracy_reward": 0.7187500149011612, "rewards/format_reward": 0.9375000074505806, "step": 520 }, { "clip_ratio": 0.0, "completion_length": 1730.1562957763672, "epoch": 0.5557333333333333, "grad_norm": 4.274890899658203, "kl": 0.2340087890625, "learning_rate": 4.711980303600025e-07, "loss": 0.2149, "reward": 1.1875000298023224, "reward_std": 0.5018721781671047, "rewards/accuracy_reward": 0.3750000102445483, "rewards/format_reward": 0.8125000149011612, "step": 521 }, { "clip_ratio": 0.0, "completion_length": 930.6771087646484, "epoch": 0.5568, "grad_norm": 0.7662429809570312, "kl": 0.0736083984375, "learning_rate": 4.6971302056746944e-07, "loss": -0.0189, "reward": 1.5729167014360428, "reward_std": 0.2869829088449478, "rewards/accuracy_reward": 0.6458333395421505, "rewards/format_reward": 0.9270833432674408, "step": 522 }, { "clip_ratio": 0.0, "completion_length": 767.3958511352539, "epoch": 0.5578666666666666, "grad_norm": 0.4983062148094177, "kl": 0.0561065673828125, "learning_rate": 4.6822891331247816e-07, "loss": 0.0226, "reward": 1.833333358168602, "reward_std": 0.27158063277602196, "rewards/accuracy_reward": 0.854166679084301, "rewards/format_reward": 0.9791666716337204, "step": 523 }, { "clip_ratio": 0.0, "completion_length": 1294.5208854675293, "epoch": 0.5589333333333333, "grad_norm": 1.835287094116211, "kl": 0.11724853515625, "learning_rate": 4.66745725278463e-07, "loss": 0.063, "reward": 1.572916716337204, "reward_std": 0.3826962113380432, "rewards/accuracy_reward": 0.7083333414047956, "rewards/format_reward": 0.8645833507180214, "step": 524 }, { "clip_ratio": 0.0, "completion_length": 986.895866394043, "epoch": 0.56, "grad_norm": 1.4356944561004639, "kl": 0.081207275390625, "learning_rate": 4.6526347313852443e-07, "loss": 0.109, "reward": 1.7500000447034836, "reward_std": 0.348210871219635, "rewards/accuracy_reward": 0.8333333488553762, "rewards/format_reward": 0.916666679084301, "step": 525 }, { "clip_ratio": 0.0, "completion_length": 998.2500228881836, "epoch": 0.5610666666666667, "grad_norm": 0.5850461721420288, "kl": 0.09051513671875, "learning_rate": 4.6378217355524183e-07, "loss": -0.0069, "reward": 1.864583358168602, "reward_std": 0.21481411904096603, "rewards/accuracy_reward": 0.885416679084301, "rewards/format_reward": 0.9791666716337204, "step": 526 }, { "clip_ratio": 0.0, "completion_length": 1057.8125343322754, "epoch": 0.5621333333333334, "grad_norm": 1.2090662717819214, "kl": 0.124420166015625, "learning_rate": 4.623018431804876e-07, "loss": -0.0083, "reward": 1.697916716337204, "reward_std": 0.34756145626306534, "rewards/accuracy_reward": 0.7604166716337204, "rewards/format_reward": 0.9375000149011612, "step": 527 }, { "clip_ratio": 0.0, "completion_length": 811.6250267028809, "epoch": 0.5632, "grad_norm": 0.46283066272735596, "kl": 0.0629425048828125, "learning_rate": 4.6082249865523793e-07, "loss": -0.0244, "reward": 1.9687500149011612, "reward_std": 0.08474057167768478, "rewards/accuracy_reward": 0.9687500074505806, "rewards/format_reward": 1.0, "step": 528 }, { "clip_ratio": 0.0, "completion_length": 998.3229675292969, "epoch": 0.5642666666666667, "grad_norm": 4.270676612854004, "kl": 0.21612548828125, "learning_rate": 4.5934415660938697e-07, "loss": 0.0547, "reward": 1.6145833730697632, "reward_std": 0.21728428453207016, "rewards/accuracy_reward": 0.635416679084301, "rewards/format_reward": 0.9791666716337204, "step": 529 }, { "clip_ratio": 0.0, "completion_length": 1319.2500381469727, "epoch": 0.5653333333333334, "grad_norm": 15.186071395874023, "kl": 0.3563232421875, "learning_rate": 4.5786683366155963e-07, "loss": 0.1132, "reward": 1.4062500298023224, "reward_std": 0.3875078819692135, "rewards/accuracy_reward": 0.5208333460614085, "rewards/format_reward": 0.8854166865348816, "step": 530 }, { "clip_ratio": 0.0, "completion_length": 1368.5729522705078, "epoch": 0.5664, "grad_norm": 42.63965606689453, "kl": 0.6212158203125, "learning_rate": 4.563905464189247e-07, "loss": 0.1826, "reward": 1.3333333656191826, "reward_std": 0.4672715626657009, "rewards/accuracy_reward": 0.4895833386108279, "rewards/format_reward": 0.8437500149011612, "step": 531 }, { "clip_ratio": 0.0, "completion_length": 1138.2604522705078, "epoch": 0.5674666666666667, "grad_norm": 2.9966073036193848, "kl": 0.11944580078125, "learning_rate": 4.5491531147700844e-07, "loss": 0.0528, "reward": 1.7395833432674408, "reward_std": 0.25231020152568817, "rewards/accuracy_reward": 0.7812500074505806, "rewards/format_reward": 0.9583333358168602, "step": 532 }, { "clip_ratio": 0.0, "completion_length": 1063.4896087646484, "epoch": 0.5685333333333333, "grad_norm": 0.8220905065536499, "kl": 0.096527099609375, "learning_rate": 4.534411454195075e-07, "loss": -0.0119, "reward": 1.7395833432674408, "reward_std": 0.20452256500720978, "rewards/accuracy_reward": 0.7708333358168602, "rewards/format_reward": 0.9687500074505806, "step": 533 }, { "clip_ratio": 0.0, "completion_length": 755.5416946411133, "epoch": 0.5696, "grad_norm": 0.8123533725738525, "kl": 0.07708740234375, "learning_rate": 4.519680648181028e-07, "loss": 0.0113, "reward": 1.9270833730697632, "reward_std": 0.20556553453207016, "rewards/accuracy_reward": 0.9270833507180214, "rewards/format_reward": 1.0, "step": 534 }, { "clip_ratio": 0.0, "completion_length": 616.5833473205566, "epoch": 0.5706666666666667, "grad_norm": 0.7389527559280396, "kl": 0.080230712890625, "learning_rate": 4.504960862322732e-07, "loss": 0.0165, "reward": 1.9062500447034836, "reward_std": 0.2618393227458, "rewards/accuracy_reward": 0.947916679084301, "rewards/format_reward": 0.9583333358168602, "step": 535 }, { "clip_ratio": 0.0, "completion_length": 1467.0521202087402, "epoch": 0.5717333333333333, "grad_norm": 0.7303226590156555, "kl": 0.208221435546875, "learning_rate": 4.4902522620910976e-07, "loss": 0.0004, "reward": 1.2500000223517418, "reward_std": 0.30092189460992813, "rewards/accuracy_reward": 0.3437500027939677, "rewards/format_reward": 0.9062500149011612, "step": 536 }, { "clip_ratio": 0.0, "completion_length": 1360.7187957763672, "epoch": 0.5728, "grad_norm": 2.42128586769104, "kl": 0.261505126953125, "learning_rate": 4.475555012831286e-07, "loss": 0.0766, "reward": 1.4791666716337204, "reward_std": 0.34703297540545464, "rewards/accuracy_reward": 0.5937500074505806, "rewards/format_reward": 0.8854166716337204, "step": 537 }, { "clip_ratio": 0.0, "completion_length": 863.9687843322754, "epoch": 0.5738666666666666, "grad_norm": 0.47356778383255005, "kl": 0.109344482421875, "learning_rate": 4.460869279760866e-07, "loss": 0.0003, "reward": 1.708333358168602, "reward_std": 0.16948115080595016, "rewards/accuracy_reward": 0.7187500102445483, "rewards/format_reward": 0.9895833358168602, "step": 538 }, { "clip_ratio": 0.0, "completion_length": 1085.3438034057617, "epoch": 0.5749333333333333, "grad_norm": 1.1762667894363403, "kl": 0.142303466796875, "learning_rate": 4.446195227967943e-07, "loss": 0.0477, "reward": 1.6875000447034836, "reward_std": 0.3715756759047508, "rewards/accuracy_reward": 0.760416679084301, "rewards/format_reward": 0.9270833432674408, "step": 539 }, { "clip_ratio": 0.0, "completion_length": 1236.656265258789, "epoch": 0.576, "grad_norm": 3.8168044090270996, "kl": 0.16790771484375, "learning_rate": 4.4315330224093064e-07, "loss": 0.0729, "reward": 1.479166716337204, "reward_std": 0.4122267961502075, "rewards/accuracy_reward": 0.5416666734963655, "rewards/format_reward": 0.9375000149011612, "step": 540 }, { "clip_ratio": 0.0, "completion_length": 1686.7187728881836, "epoch": 0.5770666666666666, "grad_norm": 2.7254996299743652, "kl": 0.468505859375, "learning_rate": 4.4168828279085856e-07, "loss": 0.0435, "reward": 1.3645833656191826, "reward_std": 0.49666087329387665, "rewards/accuracy_reward": 0.5104166734963655, "rewards/format_reward": 0.8541666865348816, "step": 541 }, { "clip_ratio": 0.0, "completion_length": 1575.447982788086, "epoch": 0.5781333333333334, "grad_norm": 2.775719165802002, "kl": 0.28704833984375, "learning_rate": 4.4022448091543817e-07, "loss": 0.1425, "reward": 1.4166666939854622, "reward_std": 0.3431961238384247, "rewards/accuracy_reward": 0.5416666716337204, "rewards/format_reward": 0.8750000149011612, "step": 542 }, { "clip_ratio": 0.0, "completion_length": 1211.5417022705078, "epoch": 0.5792, "grad_norm": 2.0754506587982178, "kl": 0.21502685546875, "learning_rate": 4.387619130698428e-07, "loss": 0.0648, "reward": 1.6458333656191826, "reward_std": 0.33279772102832794, "rewards/accuracy_reward": 0.7291666753590107, "rewards/format_reward": 0.916666679084301, "step": 543 }, { "clip_ratio": 0.0, "completion_length": 1365.9167022705078, "epoch": 0.5802666666666667, "grad_norm": 1.5016562938690186, "kl": 0.361114501953125, "learning_rate": 4.3730059569537325e-07, "loss": 0.0154, "reward": 1.6875000447034836, "reward_std": 0.3400267921388149, "rewards/accuracy_reward": 0.750000013038516, "rewards/format_reward": 0.9375000074505806, "step": 544 }, { "clip_ratio": 0.0, "completion_length": 1051.1666793823242, "epoch": 0.5813333333333334, "grad_norm": 0.6383575797080994, "kl": 0.2515869140625, "learning_rate": 4.3584054521927325e-07, "loss": 0.0011, "reward": 1.6666666716337204, "reward_std": 0.31435612589120865, "rewards/accuracy_reward": 0.7291666734963655, "rewards/format_reward": 0.9375000149011612, "step": 545 }, { "clip_ratio": 0.0, "completion_length": 1384.020866394043, "epoch": 0.5824, "grad_norm": 2.1003081798553467, "kl": 0.287750244140625, "learning_rate": 4.343817780545452e-07, "loss": 0.0232, "reward": 1.7500000298023224, "reward_std": 0.30958427116274834, "rewards/accuracy_reward": 0.7812500102445483, "rewards/format_reward": 0.9687500074505806, "step": 546 }, { "clip_ratio": 0.0, "completion_length": 1316.8541946411133, "epoch": 0.5834666666666667, "grad_norm": 1.951837182044983, "kl": 0.28790283203125, "learning_rate": 4.32924310599765e-07, "loss": 0.0689, "reward": 1.5833333730697632, "reward_std": 0.4099063277244568, "rewards/accuracy_reward": 0.6562500176951289, "rewards/format_reward": 0.9270833358168602, "step": 547 }, { "clip_ratio": 0.0, "completion_length": 1366.4167175292969, "epoch": 0.5845333333333333, "grad_norm": 2.5205070972442627, "kl": 0.490478515625, "learning_rate": 4.31468159238898e-07, "loss": 0.0405, "reward": 1.6770833879709244, "reward_std": 0.3717028982937336, "rewards/accuracy_reward": 0.718750013038516, "rewards/format_reward": 0.9583333432674408, "step": 548 }, { "clip_ratio": 0.0, "completion_length": 1256.3958587646484, "epoch": 0.5856, "grad_norm": 3.4707999229431152, "kl": 0.31011962890625, "learning_rate": 4.30013340341115e-07, "loss": 0.1457, "reward": 1.593750037252903, "reward_std": 0.3216141611337662, "rewards/accuracy_reward": 0.6250000149011612, "rewards/format_reward": 0.9687500074505806, "step": 549 }, { "clip_ratio": 0.0, "completion_length": 1460.3125610351562, "epoch": 0.5866666666666667, "grad_norm": 0.9887894988059998, "kl": 0.6949462890625, "learning_rate": 4.285598702606079e-07, "loss": 0.0843, "reward": 1.4166666939854622, "reward_std": 0.3922397643327713, "rewards/accuracy_reward": 0.46875001676380634, "rewards/format_reward": 0.947916679084301, "step": 550 }, { "clip_ratio": 0.0, "completion_length": 1341.8229637145996, "epoch": 0.5877333333333333, "grad_norm": 1.2238372564315796, "kl": 0.67572021484375, "learning_rate": 4.271077653364059e-07, "loss": 0.0833, "reward": 1.583333358168602, "reward_std": 0.37109465152025223, "rewards/accuracy_reward": 0.6458333414047956, "rewards/format_reward": 0.9375000074505806, "step": 551 }, { "clip_ratio": 0.0, "completion_length": 1860.9583740234375, "epoch": 0.5888, "grad_norm": 2.898916006088257, "kl": 1.0804443359375, "learning_rate": 4.2565704189219243e-07, "loss": 0.1398, "reward": 1.2395833730697632, "reward_std": 0.38778845220804214, "rewards/accuracy_reward": 0.34375000838190317, "rewards/format_reward": 0.8958333507180214, "step": 552 }, { "clip_ratio": 0.0, "completion_length": 1022.979190826416, "epoch": 0.5898666666666667, "grad_norm": 1.0747959613800049, "kl": 0.34234619140625, "learning_rate": 4.2420771623612053e-07, "loss": 0.0165, "reward": 1.6770833730697632, "reward_std": 0.32964421063661575, "rewards/accuracy_reward": 0.7395833358168602, "rewards/format_reward": 0.9375000149011612, "step": 553 }, { "clip_ratio": 0.0, "completion_length": 998.2604446411133, "epoch": 0.5909333333333333, "grad_norm": 1.4153343439102173, "kl": 0.59942626953125, "learning_rate": 4.227598046606304e-07, "loss": 0.071, "reward": 1.739583358168602, "reward_std": 0.2874988839030266, "rewards/accuracy_reward": 0.7812500074505806, "rewards/format_reward": 0.9583333432674408, "step": 554 }, { "clip_ratio": 0.0, "completion_length": 1080.3021278381348, "epoch": 0.592, "grad_norm": 2.9174323081970215, "kl": 0.4683837890625, "learning_rate": 4.2131332344226633e-07, "loss": 0.0173, "reward": 1.5833333656191826, "reward_std": 0.37639934569597244, "rewards/accuracy_reward": 0.6666666818782687, "rewards/format_reward": 0.916666679084301, "step": 555 }, { "clip_ratio": 0.0, "completion_length": 955.2187728881836, "epoch": 0.5930666666666666, "grad_norm": 2.4012086391448975, "kl": 0.563720703125, "learning_rate": 4.198682888414929e-07, "loss": -0.0047, "reward": 1.739583358168602, "reward_std": 0.16481656581163406, "rewards/accuracy_reward": 0.7916666716337204, "rewards/format_reward": 0.9479166716337204, "step": 556 }, { "clip_ratio": 0.0, "completion_length": 1287.3958549499512, "epoch": 0.5941333333333333, "grad_norm": 4.240655899047852, "kl": 0.7423095703125, "learning_rate": 4.1842471710251314e-07, "loss": -0.047, "reward": 1.3333333730697632, "reward_std": 0.44682400301098824, "rewards/accuracy_reward": 0.43750000838190317, "rewards/format_reward": 0.8958333432674408, "step": 557 }, { "clip_ratio": 0.0, "completion_length": 1285.1042022705078, "epoch": 0.5952, "grad_norm": 2.106400728225708, "kl": 0.511474609375, "learning_rate": 4.169826244530852e-07, "loss": 0.1082, "reward": 1.5416667088866234, "reward_std": 0.3705526515841484, "rewards/accuracy_reward": 0.5937500074505806, "rewards/format_reward": 0.947916679084301, "step": 558 }, { "clip_ratio": 0.0, "completion_length": 1158.4167022705078, "epoch": 0.5962666666666666, "grad_norm": 2.9380686283111572, "kl": 0.56915283203125, "learning_rate": 4.155420271043399e-07, "loss": 0.1502, "reward": 1.5729167014360428, "reward_std": 0.45139269158244133, "rewards/accuracy_reward": 0.6666666734963655, "rewards/format_reward": 0.9062500223517418, "step": 559 }, { "clip_ratio": 0.0, "completion_length": 1290.8646049499512, "epoch": 0.5973333333333334, "grad_norm": 2.1440930366516113, "kl": 0.521240234375, "learning_rate": 4.1410294125059963e-07, "loss": 0.0091, "reward": 1.354166679084301, "reward_std": 0.2643946297466755, "rewards/accuracy_reward": 0.406250006519258, "rewards/format_reward": 0.947916679084301, "step": 560 }, { "clip_ratio": 0.0, "completion_length": 1190.9375228881836, "epoch": 0.5984, "grad_norm": 4.1599297523498535, "kl": 0.330322265625, "learning_rate": 4.1266538306919495e-07, "loss": 0.0418, "reward": 1.437500037252903, "reward_std": 0.45979804173111916, "rewards/accuracy_reward": 0.5104166781529784, "rewards/format_reward": 0.9270833432674408, "step": 561 }, { "clip_ratio": 0.0, "completion_length": 1145.1666870117188, "epoch": 0.5994666666666667, "grad_norm": 1.9109588861465454, "kl": 0.383544921875, "learning_rate": 4.1122936872028277e-07, "loss": 0.0214, "reward": 1.4895833656191826, "reward_std": 0.3596104346215725, "rewards/accuracy_reward": 0.5416666818782687, "rewards/format_reward": 0.947916679084301, "step": 562 }, { "clip_ratio": 0.0, "completion_length": 1093.3750457763672, "epoch": 0.6005333333333334, "grad_norm": 2.6363561153411865, "kl": 0.2674560546875, "learning_rate": 4.0979491434666634e-07, "loss": 0.1096, "reward": 1.5625000447034836, "reward_std": 0.3243246152997017, "rewards/accuracy_reward": 0.6666666744276881, "rewards/format_reward": 0.8958333507180214, "step": 563 }, { "clip_ratio": 0.0, "completion_length": 1106.958381652832, "epoch": 0.6016, "grad_norm": 3.6697440147399902, "kl": 0.24444580078125, "learning_rate": 4.083620360736116e-07, "loss": 0.0962, "reward": 1.5312500149011612, "reward_std": 0.399420827627182, "rewards/accuracy_reward": 0.5833333395421505, "rewards/format_reward": 0.947916679084301, "step": 564 }, { "clip_ratio": 0.0, "completion_length": 1060.4271087646484, "epoch": 0.6026666666666667, "grad_norm": 9.529276847839355, "kl": 0.19439697265625, "learning_rate": 4.069307500086674e-07, "loss": 0.0632, "reward": 1.5729166939854622, "reward_std": 0.4158751852810383, "rewards/accuracy_reward": 0.6354166828095913, "rewards/format_reward": 0.9375000149011612, "step": 565 }, { "clip_ratio": 0.0, "completion_length": 1161.7812881469727, "epoch": 0.6037333333333333, "grad_norm": 4.243790149688721, "kl": 0.34765625, "learning_rate": 4.055010722414838e-07, "loss": 0.0471, "reward": 1.5416667088866234, "reward_std": 0.3978218361735344, "rewards/accuracy_reward": 0.6354166781529784, "rewards/format_reward": 0.9062500149011612, "step": 566 }, { "clip_ratio": 0.0, "completion_length": 1396.0417289733887, "epoch": 0.6048, "grad_norm": 1.8754830360412598, "kl": 0.2523193359375, "learning_rate": 4.040730188436313e-07, "loss": -0.0355, "reward": 1.427083358168602, "reward_std": 0.3315218612551689, "rewards/accuracy_reward": 0.4791666753590107, "rewards/format_reward": 0.947916679084301, "step": 567 }, { "clip_ratio": 0.0, "completion_length": 813.2396125793457, "epoch": 0.6058666666666667, "grad_norm": 3.3452703952789307, "kl": 0.13275146484375, "learning_rate": 4.0264660586842057e-07, "loss": -0.0345, "reward": 1.7083333730697632, "reward_std": 0.3676062524318695, "rewards/accuracy_reward": 0.760416679084301, "rewards/format_reward": 0.9479166716337204, "step": 568 }, { "clip_ratio": 0.0, "completion_length": 884.6562576293945, "epoch": 0.6069333333333333, "grad_norm": 0.5250332951545715, "kl": 0.13702392578125, "learning_rate": 4.012218493507213e-07, "loss": 0.0085, "reward": 1.5937500447034836, "reward_std": 0.2561890110373497, "rewards/accuracy_reward": 0.6145833432674408, "rewards/format_reward": 0.9791666716337204, "step": 569 }, { "clip_ratio": 0.0, "completion_length": 1327.6563034057617, "epoch": 0.608, "grad_norm": 29.265817642211914, "kl": 0.89306640625, "learning_rate": 3.9979876530678235e-07, "loss": 0.04, "reward": 1.4687500298023224, "reward_std": 0.4133736863732338, "rewards/accuracy_reward": 0.5833333460614085, "rewards/format_reward": 0.8854166865348816, "step": 570 }, { "clip_ratio": 0.0, "completion_length": 1072.7812576293945, "epoch": 0.6090666666666666, "grad_norm": 0.9587487578392029, "kl": 0.23687744140625, "learning_rate": 3.9837736973405165e-07, "loss": 0.114, "reward": 1.6145833879709244, "reward_std": 0.3060160130262375, "rewards/accuracy_reward": 0.6562500102445483, "rewards/format_reward": 0.9583333432674408, "step": 571 }, { "clip_ratio": 0.0, "completion_length": 1137.802101135254, "epoch": 0.6101333333333333, "grad_norm": 2.504870891571045, "kl": 0.2181396484375, "learning_rate": 3.969576786109966e-07, "loss": 0.0277, "reward": 1.4375000223517418, "reward_std": 0.2844015806913376, "rewards/accuracy_reward": 0.5312500009313226, "rewards/format_reward": 0.9062500074505806, "step": 572 }, { "clip_ratio": 0.0, "completion_length": 1612.0833854675293, "epoch": 0.6112, "grad_norm": 2.3138840198516846, "kl": 0.509033203125, "learning_rate": 3.9553970789692346e-07, "loss": 0.1097, "reward": 1.3333333656191826, "reward_std": 0.34927263855934143, "rewards/accuracy_reward": 0.40625000558793545, "rewards/format_reward": 0.9270833432674408, "step": 573 }, { "clip_ratio": 0.0, "completion_length": 917.9583549499512, "epoch": 0.6122666666666666, "grad_norm": 7.510659694671631, "kl": 0.53466796875, "learning_rate": 3.941234735317999e-07, "loss": -0.1673, "reward": 1.3125000474974513, "reward_std": 0.36423856019973755, "rewards/accuracy_reward": 0.5104166744276881, "rewards/format_reward": 0.8020833460614085, "step": 574 }, { "clip_ratio": 0.0, "completion_length": 1224.6354293823242, "epoch": 0.6133333333333333, "grad_norm": 3.048250198364258, "kl": 0.2265625, "learning_rate": 3.927089914360736e-07, "loss": 0.0229, "reward": 1.604166716337204, "reward_std": 0.5064091011881828, "rewards/accuracy_reward": 0.6770833563059568, "rewards/format_reward": 0.9270833507180214, "step": 575 }, { "clip_ratio": 0.0, "completion_length": 1005.6354446411133, "epoch": 0.6144, "grad_norm": 4.1318230628967285, "kl": 0.22613525390625, "learning_rate": 3.9129627751049443e-07, "loss": 0.0939, "reward": 1.614583358168602, "reward_std": 0.47450821846723557, "rewards/accuracy_reward": 0.6666666865348816, "rewards/format_reward": 0.947916679084301, "step": 576 }, { "clip_ratio": 0.0, "completion_length": 1476.1458892822266, "epoch": 0.6154666666666667, "grad_norm": 13.875448226928711, "kl": 0.97344970703125, "learning_rate": 3.8988534763593626e-07, "loss": 0.0987, "reward": 1.5000000298023224, "reward_std": 0.37025700882077217, "rewards/accuracy_reward": 0.5937500074505806, "rewards/format_reward": 0.9062500074505806, "step": 577 }, { "clip_ratio": 0.0, "completion_length": 1267.739631652832, "epoch": 0.6165333333333334, "grad_norm": 1.6409517526626587, "kl": 0.28094482421875, "learning_rate": 3.884762176732169e-07, "loss": 0.0056, "reward": 1.5000000223517418, "reward_std": 0.3055504336953163, "rewards/accuracy_reward": 0.5833333414047956, "rewards/format_reward": 0.916666679084301, "step": 578 }, { "clip_ratio": 0.0, "completion_length": 1195.0000305175781, "epoch": 0.6176, "grad_norm": 1.4281710386276245, "kl": 0.19781494140625, "learning_rate": 3.87068903462921e-07, "loss": 0.0184, "reward": 1.562500037252903, "reward_std": 0.4131728671491146, "rewards/accuracy_reward": 0.6145833414047956, "rewards/format_reward": 0.947916679084301, "step": 579 }, { "clip_ratio": 0.0, "completion_length": 817.1771011352539, "epoch": 0.6186666666666667, "grad_norm": 0.5565060973167419, "kl": 0.15069580078125, "learning_rate": 3.8566342082522174e-07, "loss": -0.0041, "reward": 1.7500000223517418, "reward_std": 0.26919664442539215, "rewards/accuracy_reward": 0.7812500149011612, "rewards/format_reward": 0.9687500074505806, "step": 580 }, { "clip_ratio": 0.0, "completion_length": 917.1562652587891, "epoch": 0.6197333333333334, "grad_norm": 1.0966062545776367, "kl": 0.154022216796875, "learning_rate": 3.8425978555970263e-07, "loss": 0.0147, "reward": 1.7395833730697632, "reward_std": 0.3370062857866287, "rewards/accuracy_reward": 0.7708333488553762, "rewards/format_reward": 0.9687500074505806, "step": 581 }, { "clip_ratio": 0.0, "completion_length": 990.208366394043, "epoch": 0.6208, "grad_norm": 1.5839117765426636, "kl": 0.1610107421875, "learning_rate": 3.828580134451803e-07, "loss": 0.0048, "reward": 1.770833358168602, "reward_std": 0.22603315114974976, "rewards/accuracy_reward": 0.8020833395421505, "rewards/format_reward": 0.96875, "step": 582 }, { "clip_ratio": 0.0, "completion_length": 966.2500152587891, "epoch": 0.6218666666666667, "grad_norm": 2.4596199989318848, "kl": 0.45098876953125, "learning_rate": 3.8145812023952703e-07, "loss": 0.0452, "reward": 1.4062500149011612, "reward_std": 0.34706785529851913, "rewards/accuracy_reward": 0.47916667722165585, "rewards/format_reward": 0.9270833432674408, "step": 583 }, { "clip_ratio": 0.0, "completion_length": 784.8125076293945, "epoch": 0.6229333333333333, "grad_norm": 0.8661391735076904, "kl": 0.10235595703125, "learning_rate": 3.800601216794932e-07, "loss": 0.0282, "reward": 1.8854166865348816, "reward_std": 0.22484370693564415, "rewards/accuracy_reward": 0.9062500074505806, "rewards/format_reward": 0.9791666716337204, "step": 584 }, { "clip_ratio": 0.0, "completion_length": 1097.677116394043, "epoch": 0.624, "grad_norm": 2.622683525085449, "kl": 0.22222900390625, "learning_rate": 3.786640334805313e-07, "loss": 0.0818, "reward": 1.4375000223517418, "reward_std": 0.2334362268447876, "rewards/accuracy_reward": 0.4791666753590107, "rewards/format_reward": 0.9583333432674408, "step": 585 }, { "clip_ratio": 0.0, "completion_length": 1105.4375534057617, "epoch": 0.6250666666666667, "grad_norm": 1.3202171325683594, "kl": 0.150543212890625, "learning_rate": 3.7726987133661804e-07, "loss": 0.0592, "reward": 1.687500037252903, "reward_std": 0.19299374520778656, "rewards/accuracy_reward": 0.697916679084301, "rewards/format_reward": 0.9895833358168602, "step": 586 }, { "clip_ratio": 0.0, "completion_length": 834.0521087646484, "epoch": 0.6261333333333333, "grad_norm": 0.4571961760520935, "kl": 0.125152587890625, "learning_rate": 3.7587765092007906e-07, "loss": 0.0015, "reward": 1.802083358168602, "reward_std": 0.18205293267965317, "rewards/accuracy_reward": 0.8020833507180214, "rewards/format_reward": 1.0, "step": 587 }, { "clip_ratio": 0.0, "completion_length": 1106.614601135254, "epoch": 0.6272, "grad_norm": 1.5413470268249512, "kl": 0.1348876953125, "learning_rate": 3.744873878814121e-07, "loss": 0.1185, "reward": 1.739583358168602, "reward_std": 0.35308028012514114, "rewards/accuracy_reward": 0.7812500149011612, "rewards/format_reward": 0.9583333432674408, "step": 588 }, { "clip_ratio": 0.0, "completion_length": 1573.062515258789, "epoch": 0.6282666666666666, "grad_norm": 1.7936609983444214, "kl": 0.26727294921875, "learning_rate": 3.730990978491112e-07, "loss": 0.0474, "reward": 1.3020833805203438, "reward_std": 0.443328820168972, "rewards/accuracy_reward": 0.35416668094694614, "rewards/format_reward": 0.947916679084301, "step": 589 }, { "clip_ratio": 0.0, "completion_length": 1275.677116394043, "epoch": 0.6293333333333333, "grad_norm": 2.226879358291626, "kl": 0.178863525390625, "learning_rate": 3.7171279642949125e-07, "loss": -0.0081, "reward": 1.3229167014360428, "reward_std": 0.37269312888383865, "rewards/accuracy_reward": 0.3854166716337204, "rewards/format_reward": 0.9375000074505806, "step": 590 }, { "clip_ratio": 0.0, "completion_length": 1004.2916870117188, "epoch": 0.6304, "grad_norm": 12.572297096252441, "kl": 0.118316650390625, "learning_rate": 3.703284992065122e-07, "loss": 0.0427, "reward": 1.583333358168602, "reward_std": 0.37477848678827286, "rewards/accuracy_reward": 0.6562500149011612, "rewards/format_reward": 0.9270833432674408, "step": 591 }, { "clip_ratio": 0.0, "completion_length": 862.4583587646484, "epoch": 0.6314666666666666, "grad_norm": 7.475311279296875, "kl": 0.10040283203125, "learning_rate": 3.6894622174160417e-07, "loss": 0.0321, "reward": 1.5833333730697632, "reward_std": 0.33747728168964386, "rewards/accuracy_reward": 0.6041666753590107, "rewards/format_reward": 0.9791666716337204, "step": 592 }, { "clip_ratio": 0.0, "completion_length": 877.8750305175781, "epoch": 0.6325333333333333, "grad_norm": 2.722390651702881, "kl": 0.1099853515625, "learning_rate": 3.675659795734922e-07, "loss": 0.0165, "reward": 1.666666679084301, "reward_std": 0.23338165134191513, "rewards/accuracy_reward": 0.6979166772216558, "rewards/format_reward": 0.9687500074505806, "step": 593 }, { "clip_ratio": 0.0, "completion_length": 995.0833473205566, "epoch": 0.6336, "grad_norm": 24.570680618286133, "kl": 1.43798828125, "learning_rate": 3.661877882180221e-07, "loss": 0.2243, "reward": 1.5937500149011612, "reward_std": 0.3610902987420559, "rewards/accuracy_reward": 0.6770833414047956, "rewards/format_reward": 0.916666679084301, "step": 594 }, { "clip_ratio": 0.0, "completion_length": 1019.3437652587891, "epoch": 0.6346666666666667, "grad_norm": 14.760440826416016, "kl": 0.202545166015625, "learning_rate": 3.6481166316798484e-07, "loss": 0.0503, "reward": 1.5104166939854622, "reward_std": 0.29062388837337494, "rewards/accuracy_reward": 0.5833333432674408, "rewards/format_reward": 0.9270833432674408, "step": 595 }, { "clip_ratio": 0.0, "completion_length": 864.8229522705078, "epoch": 0.6357333333333334, "grad_norm": 3.083564519882202, "kl": 0.182220458984375, "learning_rate": 3.6343761989294464e-07, "loss": 0.0628, "reward": 1.7187500149011612, "reward_std": 0.2149786315858364, "rewards/accuracy_reward": 0.7395833441987634, "rewards/format_reward": 0.9791666716337204, "step": 596 }, { "clip_ratio": 0.0, "completion_length": 798.395866394043, "epoch": 0.6368, "grad_norm": 9.388452529907227, "kl": 0.1539306640625, "learning_rate": 3.620656738390622e-07, "loss": 0.0713, "reward": 1.729166716337204, "reward_std": 0.4321279153227806, "rewards/accuracy_reward": 0.7604166865348816, "rewards/format_reward": 0.9687500074505806, "step": 597 }, { "clip_ratio": 0.0, "completion_length": 960.9583587646484, "epoch": 0.6378666666666667, "grad_norm": 6.271109104156494, "kl": 0.40185546875, "learning_rate": 3.6069584042892345e-07, "loss": 0.0056, "reward": 1.5208333730697632, "reward_std": 0.3486473150551319, "rewards/accuracy_reward": 0.6041666744276881, "rewards/format_reward": 0.916666679084301, "step": 598 }, { "clip_ratio": 0.0, "completion_length": 1046.1770935058594, "epoch": 0.6389333333333334, "grad_norm": 1.6304008960723877, "kl": 0.1185302734375, "learning_rate": 3.5932813506136493e-07, "loss": 0.0438, "reward": 1.6145833879709244, "reward_std": 0.4094809554517269, "rewards/accuracy_reward": 0.6458333432674408, "rewards/format_reward": 0.9687500074505806, "step": 599 }, { "clip_ratio": 0.0, "completion_length": 887.4791870117188, "epoch": 0.64, "grad_norm": 1.9282420873641968, "kl": 0.10137939453125, "learning_rate": 3.5796257311130153e-07, "loss": 0.0315, "reward": 1.6458333507180214, "reward_std": 0.20281970873475075, "rewards/accuracy_reward": 0.6770833432674408, "rewards/format_reward": 0.9687500074505806, "step": 600 }, { "clip_ratio": 0.0, "completion_length": 1034.2916870117188, "epoch": 0.6410666666666667, "grad_norm": 32.0504035949707, "kl": 0.696014404296875, "learning_rate": 3.5659916992955207e-07, "loss": 0.1698, "reward": 1.583333358168602, "reward_std": 0.48160572350025177, "rewards/accuracy_reward": 0.6770833544433117, "rewards/format_reward": 0.9062500074505806, "step": 601 }, { "clip_ratio": 0.0, "completion_length": 893.1042098999023, "epoch": 0.6421333333333333, "grad_norm": 1.722001552581787, "kl": 0.106475830078125, "learning_rate": 3.55237940842669e-07, "loss": 0.0812, "reward": 1.614583358168602, "reward_std": 0.3013214208185673, "rewards/accuracy_reward": 0.6458333432674408, "rewards/format_reward": 0.9687500074505806, "step": 602 }, { "clip_ratio": 0.0, "completion_length": 835.9062652587891, "epoch": 0.6432, "grad_norm": 1.0314631462097168, "kl": 0.0804443359375, "learning_rate": 3.5387890115276364e-07, "loss": 0.0315, "reward": 1.6458333879709244, "reward_std": 0.32016588002443314, "rewards/accuracy_reward": 0.6666666800156236, "rewards/format_reward": 0.9791666716337204, "step": 603 }, { "clip_ratio": 0.0, "completion_length": 1395.9167175292969, "epoch": 0.6442666666666667, "grad_norm": 15.281646728515625, "kl": 1.753143310546875, "learning_rate": 3.5252206613733703e-07, "loss": 0.256, "reward": 1.3750000298023224, "reward_std": 0.3483247943222523, "rewards/accuracy_reward": 0.46875000838190317, "rewards/format_reward": 0.9062500149011612, "step": 604 }, { "clip_ratio": 0.0, "completion_length": 884.3958549499512, "epoch": 0.6453333333333333, "grad_norm": 2.1040310859680176, "kl": 0.31280517578125, "learning_rate": 3.5116745104910526e-07, "loss": 0.1112, "reward": 1.6979166865348816, "reward_std": 0.3349786065518856, "rewards/accuracy_reward": 0.7916666818782687, "rewards/format_reward": 0.9062500074505806, "step": 605 }, { "clip_ratio": 0.0, "completion_length": 711.0937690734863, "epoch": 0.6464, "grad_norm": 9.785202980041504, "kl": 0.36798095703125, "learning_rate": 3.4981507111582964e-07, "loss": 0.0345, "reward": 1.7604166865348816, "reward_std": 0.30038901045918465, "rewards/accuracy_reward": 0.791666679084301, "rewards/format_reward": 0.9687500074505806, "step": 606 }, { "clip_ratio": 0.0, "completion_length": 1095.083366394043, "epoch": 0.6474666666666666, "grad_norm": 18.800233840942383, "kl": 0.2333984375, "learning_rate": 3.484649415401455e-07, "loss": 0.0946, "reward": 1.4687500447034836, "reward_std": 0.33810924738645554, "rewards/accuracy_reward": 0.5416666734963655, "rewards/format_reward": 0.9270833432674408, "step": 607 }, { "clip_ratio": 0.0, "completion_length": 1081.5729446411133, "epoch": 0.6485333333333333, "grad_norm": 16.855220794677734, "kl": 0.52532958984375, "learning_rate": 3.4711707749939134e-07, "loss": 0.1089, "reward": 1.5937500298023224, "reward_std": 0.33570922538638115, "rewards/accuracy_reward": 0.6666666753590107, "rewards/format_reward": 0.9270833358168602, "step": 608 }, { "clip_ratio": 0.0, "completion_length": 788.5312652587891, "epoch": 0.6496, "grad_norm": 1.625116229057312, "kl": 0.1793212890625, "learning_rate": 3.457714941454367e-07, "loss": 0.1381, "reward": 1.656250037252903, "reward_std": 0.3428783491253853, "rewards/accuracy_reward": 0.7187500223517418, "rewards/format_reward": 0.9375000074505806, "step": 609 }, { "clip_ratio": 0.0, "completion_length": 1150.6979522705078, "epoch": 0.6506666666666666, "grad_norm": 12.408306121826172, "kl": 1.242950439453125, "learning_rate": 3.444282066045143e-07, "loss": 0.1583, "reward": 1.5625000447034836, "reward_std": 0.4535408467054367, "rewards/accuracy_reward": 0.666666679084301, "rewards/format_reward": 0.8958333507180214, "step": 610 }, { "clip_ratio": 0.0, "completion_length": 1012.7916946411133, "epoch": 0.6517333333333334, "grad_norm": 5.02968168258667, "kl": 0.2144775390625, "learning_rate": 3.430872299770482e-07, "loss": 0.0252, "reward": 1.604166716337204, "reward_std": 0.3896602652966976, "rewards/accuracy_reward": 0.6458333488553762, "rewards/format_reward": 0.9583333432674408, "step": 611 }, { "clip_ratio": 0.0, "completion_length": 1084.9479484558105, "epoch": 0.6528, "grad_norm": 22.4835205078125, "kl": 1.90093994140625, "learning_rate": 3.4174857933748427e-07, "loss": 0.319, "reward": 1.5000000223517418, "reward_std": 0.4698401540517807, "rewards/accuracy_reward": 0.6354166744276881, "rewards/format_reward": 0.8645833432674408, "step": 612 }, { "clip_ratio": 0.0, "completion_length": 1185.2187881469727, "epoch": 0.6538666666666667, "grad_norm": 50.94549560546875, "kl": 3.463287353515625, "learning_rate": 3.404122697341216e-07, "loss": 0.3885, "reward": 1.4062500298023224, "reward_std": 0.4417867511510849, "rewards/accuracy_reward": 0.5625000074505806, "rewards/format_reward": 0.8437500149011612, "step": 613 }, { "clip_ratio": 0.0, "completion_length": 959.5521087646484, "epoch": 0.6549333333333334, "grad_norm": 60.851322174072266, "kl": 0.379302978515625, "learning_rate": 3.390783161889422e-07, "loss": 0.1539, "reward": 1.5520833656191826, "reward_std": 0.42869795113801956, "rewards/accuracy_reward": 0.6354166809469461, "rewards/format_reward": 0.916666679084301, "step": 614 }, { "clip_ratio": 0.0, "completion_length": 914.8437728881836, "epoch": 0.656, "grad_norm": 6.720454692840576, "kl": 0.313446044921875, "learning_rate": 3.377467336974432e-07, "loss": 0.0333, "reward": 1.5416666939854622, "reward_std": 0.2652370296418667, "rewards/accuracy_reward": 0.5937500074505806, "rewards/format_reward": 0.9479166716337204, "step": 615 }, { "clip_ratio": 0.0, "completion_length": 894.8125076293945, "epoch": 0.6570666666666667, "grad_norm": 6.657434940338135, "kl": 0.578887939453125, "learning_rate": 3.3641753722846776e-07, "loss": 0.1118, "reward": 1.531250037252903, "reward_std": 0.3340081050992012, "rewards/accuracy_reward": 0.5937500102445483, "rewards/format_reward": 0.9375000074505806, "step": 616 }, { "clip_ratio": 0.0, "completion_length": 711.5416717529297, "epoch": 0.6581333333333333, "grad_norm": 3.054525375366211, "kl": 0.167205810546875, "learning_rate": 3.3509074172403625e-07, "loss": -0.0002, "reward": 1.6666666939854622, "reward_std": 0.23695533350110054, "rewards/accuracy_reward": 0.7291666716337204, "rewards/format_reward": 0.9375000074505806, "step": 617 }, { "clip_ratio": 0.0, "completion_length": 954.0937843322754, "epoch": 0.6592, "grad_norm": 15.97026538848877, "kl": 1.61865234375, "learning_rate": 3.337663620991795e-07, "loss": 0.2356, "reward": 1.427083358168602, "reward_std": 0.46526888385415077, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.8333333507180214, "step": 618 }, { "clip_ratio": 0.0, "completion_length": 814.6458511352539, "epoch": 0.6602666666666667, "grad_norm": 12.89101791381836, "kl": 0.2235107421875, "learning_rate": 3.324444132417703e-07, "loss": 0.0999, "reward": 1.5625000298023224, "reward_std": 0.4349632263183594, "rewards/accuracy_reward": 0.635416679084301, "rewards/format_reward": 0.9270833507180214, "step": 619 }, { "clip_ratio": 0.0, "completion_length": 1028.208366394043, "epoch": 0.6613333333333333, "grad_norm": 5.208676338195801, "kl": 1.238555908203125, "learning_rate": 3.311249100123559e-07, "loss": 0.1919, "reward": 1.614583358168602, "reward_std": 0.4838796518743038, "rewards/accuracy_reward": 0.6875000223517418, "rewards/format_reward": 0.9270833432674408, "step": 620 }, { "clip_ratio": 0.0, "completion_length": 1209.8437805175781, "epoch": 0.6624, "grad_norm": 3.8470444679260254, "kl": 0.1860809326171875, "learning_rate": 3.2980786724399176e-07, "loss": -0.0031, "reward": 1.6354166865348816, "reward_std": 0.2573598325252533, "rewards/accuracy_reward": 0.666666672565043, "rewards/format_reward": 0.96875, "step": 621 }, { "clip_ratio": 0.0, "completion_length": 1086.739631652832, "epoch": 0.6634666666666666, "grad_norm": 14.626192092895508, "kl": 1.3588104248046875, "learning_rate": 3.284932997420742e-07, "loss": 0.24, "reward": 1.479166716337204, "reward_std": 0.38606199249625206, "rewards/accuracy_reward": 0.5208333441987634, "rewards/format_reward": 0.9583333358168602, "step": 622 }, { "clip_ratio": 0.0, "completion_length": 1246.8750534057617, "epoch": 0.6645333333333333, "grad_norm": 15.085412979125977, "kl": 2.21905517578125, "learning_rate": 3.2718122228417366e-07, "loss": 0.3328, "reward": 1.3333333656191826, "reward_std": 0.53714844211936, "rewards/accuracy_reward": 0.47916668001562357, "rewards/format_reward": 0.854166679084301, "step": 623 }, { "clip_ratio": 0.0, "completion_length": 698.1146011352539, "epoch": 0.6656, "grad_norm": 0.45277509093284607, "kl": 0.06561279296875, "learning_rate": 3.2587164961986956e-07, "loss": 0.0013, "reward": 1.8333333730697632, "reward_std": 0.2916114218533039, "rewards/accuracy_reward": 0.8437500149011612, "rewards/format_reward": 0.9895833358168602, "step": 624 }, { "clip_ratio": 0.0, "completion_length": 892.1458587646484, "epoch": 0.6666666666666666, "grad_norm": 2.5573537349700928, "kl": 0.126495361328125, "learning_rate": 3.2456459647058396e-07, "loss": 0.0206, "reward": 1.583333358168602, "reward_std": 0.34503669664263725, "rewards/accuracy_reward": 0.593750013038516, "rewards/format_reward": 0.9895833358168602, "step": 625 }, { "clip_ratio": 0.0, "completion_length": 1087.4479522705078, "epoch": 0.6677333333333333, "grad_norm": 0.5194029808044434, "kl": 0.101593017578125, "learning_rate": 3.2326007752941546e-07, "loss": 0.1096, "reward": 1.635416716337204, "reward_std": 0.32177040725946426, "rewards/accuracy_reward": 0.6875000149011612, "rewards/format_reward": 0.947916679084301, "step": 626 }, { "clip_ratio": 0.0, "completion_length": 734.4375228881836, "epoch": 0.6688, "grad_norm": 2.1165590286254883, "kl": 0.149322509765625, "learning_rate": 3.219581074609754e-07, "loss": 0.0275, "reward": 1.6666667088866234, "reward_std": 0.24955713748931885, "rewards/accuracy_reward": 0.6979166744276881, "rewards/format_reward": 0.9687500074505806, "step": 627 }, { "clip_ratio": 0.0, "completion_length": 1036.2812957763672, "epoch": 0.6698666666666667, "grad_norm": 0.6837100982666016, "kl": 0.333099365234375, "learning_rate": 3.2065870090122157e-07, "loss": 0.0752, "reward": 1.395833358168602, "reward_std": 0.320793516933918, "rewards/accuracy_reward": 0.4895833395421505, "rewards/format_reward": 0.9062500149011612, "step": 628 }, { "clip_ratio": 0.0, "completion_length": 1460.7292098999023, "epoch": 0.6709333333333334, "grad_norm": 1.5506960153579712, "kl": 0.176666259765625, "learning_rate": 3.1936187245729465e-07, "loss": 0.0439, "reward": 1.5312500447034836, "reward_std": 0.48475077748298645, "rewards/accuracy_reward": 0.5937500186264515, "rewards/format_reward": 0.9375000074505806, "step": 629 }, { "clip_ratio": 0.0, "completion_length": 866.7604370117188, "epoch": 0.672, "grad_norm": 0.739509105682373, "kl": 0.0802459716796875, "learning_rate": 3.18067636707354e-07, "loss": 0.0544, "reward": 1.6562500447034836, "reward_std": 0.33380211517214775, "rewards/accuracy_reward": 0.7187500074505806, "rewards/format_reward": 0.9375000149011612, "step": 630 }, { "clip_ratio": 0.0, "completion_length": 913.5937728881836, "epoch": 0.6730666666666667, "grad_norm": 0.8652342557907104, "kl": 0.071380615234375, "learning_rate": 3.167760082004128e-07, "loss": 0.0281, "reward": 1.5104167014360428, "reward_std": 0.2938329428434372, "rewards/accuracy_reward": 0.5625000046566129, "rewards/format_reward": 0.947916679084301, "step": 631 }, { "clip_ratio": 0.0, "completion_length": 1340.0312805175781, "epoch": 0.6741333333333334, "grad_norm": 1.4384431838989258, "kl": 0.28704833984375, "learning_rate": 3.154870014561757e-07, "loss": 0.067, "reward": 1.2187500149011612, "reward_std": 0.3853958137333393, "rewards/accuracy_reward": 0.3437500074505806, "rewards/format_reward": 0.8750000149011612, "step": 632 }, { "clip_ratio": 0.0, "completion_length": 1089.770851135254, "epoch": 0.6752, "grad_norm": 2.923095464706421, "kl": 0.266754150390625, "learning_rate": 3.1420063096487514e-07, "loss": 0.0839, "reward": 1.5208333879709244, "reward_std": 0.4190382808446884, "rewards/accuracy_reward": 0.6145833535119891, "rewards/format_reward": 0.9062500149011612, "step": 633 }, { "clip_ratio": 0.0, "completion_length": 1016.5312728881836, "epoch": 0.6762666666666667, "grad_norm": 1.1481581926345825, "kl": 0.183624267578125, "learning_rate": 3.1291691118710793e-07, "loss": 0.0197, "reward": 1.5625000149011612, "reward_std": 0.35367371141910553, "rewards/accuracy_reward": 0.5833333395421505, "rewards/format_reward": 0.9791666716337204, "step": 634 }, { "clip_ratio": 0.0, "completion_length": 1359.1146392822266, "epoch": 0.6773333333333333, "grad_norm": 1.4839800596237183, "kl": 0.21722412109375, "learning_rate": 3.1163585655367363e-07, "loss": -0.0086, "reward": 1.2916667014360428, "reward_std": 0.5007696375250816, "rewards/accuracy_reward": 0.3750000074505806, "rewards/format_reward": 0.9166666865348816, "step": 635 }, { "clip_ratio": 0.0, "completion_length": 1198.6145935058594, "epoch": 0.6784, "grad_norm": 1.1200307607650757, "kl": 0.2467041015625, "learning_rate": 3.1035748146541173e-07, "loss": 0.0035, "reward": 1.500000037252903, "reward_std": 0.4086421839892864, "rewards/accuracy_reward": 0.541666685603559, "rewards/format_reward": 0.9583333432674408, "step": 636 }, { "clip_ratio": 0.0, "completion_length": 976.5312919616699, "epoch": 0.6794666666666667, "grad_norm": 0.44682320952415466, "kl": 0.072113037109375, "learning_rate": 3.0908180029303965e-07, "loss": 0.008, "reward": 1.5312500447034836, "reward_std": 0.45227735862135887, "rewards/accuracy_reward": 0.6145833423361182, "rewards/format_reward": 0.916666679084301, "step": 637 }, { "clip_ratio": 0.0, "completion_length": 1372.4166793823242, "epoch": 0.6805333333333333, "grad_norm": 12.15744400024414, "kl": 0.795806884765625, "learning_rate": 3.078088273769919e-07, "loss": 0.1718, "reward": 1.2291666865348816, "reward_std": 0.32040610536932945, "rewards/accuracy_reward": 0.3333333367481828, "rewards/format_reward": 0.8958333507180214, "step": 638 }, { "clip_ratio": 0.0, "completion_length": 1298.4479789733887, "epoch": 0.6816, "grad_norm": 4.289257049560547, "kl": 0.95635986328125, "learning_rate": 3.065385770272576e-07, "loss": 0.1438, "reward": 1.3750000298023224, "reward_std": 0.41677897050976753, "rewards/accuracy_reward": 0.5000000139698386, "rewards/format_reward": 0.8750000074505806, "step": 639 }, { "clip_ratio": 0.0, "completion_length": 856.8021202087402, "epoch": 0.6826666666666666, "grad_norm": 1.403261661529541, "kl": 0.153076171875, "learning_rate": 3.0527106352322175e-07, "loss": 0.028, "reward": 1.6770833730697632, "reward_std": 0.31542911008000374, "rewards/accuracy_reward": 0.7083333460614085, "rewards/format_reward": 0.9687500074505806, "step": 640 }, { "clip_ratio": 0.0, "completion_length": 726.3958511352539, "epoch": 0.6837333333333333, "grad_norm": 1.1283363103866577, "kl": 0.130615234375, "learning_rate": 3.040063011135023e-07, "loss": 0.0875, "reward": 1.6770833730697632, "reward_std": 0.3422049209475517, "rewards/accuracy_reward": 0.7083333488553762, "rewards/format_reward": 0.9687500074505806, "step": 641 }, { "clip_ratio": 0.0, "completion_length": 892.3750305175781, "epoch": 0.6848, "grad_norm": 0.6274095177650452, "kl": 0.130157470703125, "learning_rate": 3.027443040157912e-07, "loss": 0.0005, "reward": 1.5833333507180214, "reward_std": 0.16478655114769936, "rewards/accuracy_reward": 0.6354166697710752, "rewards/format_reward": 0.9479166716337204, "step": 642 }, { "clip_ratio": 0.0, "completion_length": 956.3021087646484, "epoch": 0.6858666666666666, "grad_norm": 1.031857967376709, "kl": 0.074737548828125, "learning_rate": 3.014850864166947e-07, "loss": 0.0417, "reward": 1.770833358168602, "reward_std": 0.1977628394961357, "rewards/accuracy_reward": 0.8020833432674408, "rewards/format_reward": 0.9687500074505806, "step": 643 }, { "clip_ratio": 0.0, "completion_length": 1117.1562805175781, "epoch": 0.6869333333333333, "grad_norm": 5.93824577331543, "kl": 0.442840576171875, "learning_rate": 3.0022866247157384e-07, "loss": 0.15, "reward": 1.3333333805203438, "reward_std": 0.384777557104826, "rewards/accuracy_reward": 0.4375000102445483, "rewards/format_reward": 0.8958333507180214, "step": 644 }, { "clip_ratio": 0.0, "completion_length": 1019.0625457763672, "epoch": 0.688, "grad_norm": 0.8183615803718567, "kl": 0.16387939453125, "learning_rate": 2.9897504630438444e-07, "loss": 0.037, "reward": 1.3854167088866234, "reward_std": 0.4402470849454403, "rewards/accuracy_reward": 0.4479166753590107, "rewards/format_reward": 0.9375000149011612, "step": 645 }, { "clip_ratio": 0.0, "completion_length": 1186.6250305175781, "epoch": 0.6890666666666667, "grad_norm": 1.1076335906982422, "kl": 0.190948486328125, "learning_rate": 2.977242520075194e-07, "loss": -0.0297, "reward": 1.2916667014360428, "reward_std": 0.4970863461494446, "rewards/accuracy_reward": 0.40625001303851604, "rewards/format_reward": 0.8854166865348816, "step": 646 }, { "clip_ratio": 0.0, "completion_length": 949.7500305175781, "epoch": 0.6901333333333334, "grad_norm": 1.2940726280212402, "kl": 0.123687744140625, "learning_rate": 2.964762936416501e-07, "loss": -0.0052, "reward": 1.6875000298023224, "reward_std": 0.3272993192076683, "rewards/accuracy_reward": 0.7291666865348816, "rewards/format_reward": 0.9583333432674408, "step": 647 }, { "clip_ratio": 0.0, "completion_length": 1132.34379196167, "epoch": 0.6912, "grad_norm": 1.3681833744049072, "kl": 0.21514892578125, "learning_rate": 2.9523118523556737e-07, "loss": 0.0606, "reward": 1.5104167088866234, "reward_std": 0.4060435928404331, "rewards/accuracy_reward": 0.6145833432674408, "rewards/format_reward": 0.8958333432674408, "step": 648 }, { "clip_ratio": 0.0, "completion_length": 1115.6979522705078, "epoch": 0.6922666666666667, "grad_norm": 0.6763229966163635, "kl": 0.084716796875, "learning_rate": 2.939889407860254e-07, "loss": 0.1146, "reward": 1.5312500223517418, "reward_std": 0.359790101647377, "rewards/accuracy_reward": 0.5937500111758709, "rewards/format_reward": 0.9375000149011612, "step": 649 }, { "clip_ratio": 0.0, "completion_length": 857.8021049499512, "epoch": 0.6933333333333334, "grad_norm": 0.4083541929721832, "kl": 0.06121826171875, "learning_rate": 2.9274957425758273e-07, "loss": 0.0214, "reward": 1.7083333730697632, "reward_std": 0.2776201665401459, "rewards/accuracy_reward": 0.7500000037252903, "rewards/format_reward": 0.9583333432674408, "step": 650 }, { "clip_ratio": 0.0, "completion_length": 967.5937805175781, "epoch": 0.6944, "grad_norm": 0.7651016712188721, "kl": 0.356536865234375, "learning_rate": 2.915130995824465e-07, "loss": 0.0647, "reward": 1.6041666865348816, "reward_std": 0.3425474762916565, "rewards/accuracy_reward": 0.6458333423361182, "rewards/format_reward": 0.9583333358168602, "step": 651 }, { "clip_ratio": 0.0, "completion_length": 1074.4167022705078, "epoch": 0.6954666666666667, "grad_norm": 2.490422010421753, "kl": 0.194732666015625, "learning_rate": 2.902795306603155e-07, "loss": 0.1259, "reward": 1.6041667014360428, "reward_std": 0.33192605152726173, "rewards/accuracy_reward": 0.6666666772216558, "rewards/format_reward": 0.9375000074505806, "step": 652 }, { "clip_ratio": 0.0, "completion_length": 648.9583587646484, "epoch": 0.6965333333333333, "grad_norm": 1.4425081014633179, "kl": 0.0735931396484375, "learning_rate": 2.8904888135822323e-07, "loss": 0.0396, "reward": 1.708333358168602, "reward_std": 0.368838120251894, "rewards/accuracy_reward": 0.7500000111758709, "rewards/format_reward": 0.9583333432674408, "step": 653 }, { "clip_ratio": 0.0, "completion_length": 912.7500381469727, "epoch": 0.6976, "grad_norm": 0.5843232870101929, "kl": 0.091522216796875, "learning_rate": 2.878211655103833e-07, "loss": 0.0019, "reward": 1.5937500298023224, "reward_std": 0.30044984444975853, "rewards/accuracy_reward": 0.6250000074505806, "rewards/format_reward": 0.9687500074505806, "step": 654 }, { "clip_ratio": 0.0, "completion_length": 981.8437652587891, "epoch": 0.6986666666666667, "grad_norm": 0.7024815678596497, "kl": 0.111785888671875, "learning_rate": 2.865963969180327e-07, "loss": 0.0642, "reward": 1.4375000521540642, "reward_std": 0.40964606404304504, "rewards/accuracy_reward": 0.5000000149011612, "rewards/format_reward": 0.9375000149011612, "step": 655 }, { "clip_ratio": 0.0, "completion_length": 1047.2187881469727, "epoch": 0.6997333333333333, "grad_norm": 0.5074041485786438, "kl": 0.1312255859375, "learning_rate": 2.853745893492773e-07, "loss": 0.0697, "reward": 1.5729166939854622, "reward_std": 0.2766379974782467, "rewards/accuracy_reward": 0.6145833395421505, "rewards/format_reward": 0.9583333358168602, "step": 656 }, { "clip_ratio": 0.0, "completion_length": 1103.239616394043, "epoch": 0.7008, "grad_norm": 6.225416660308838, "kl": 0.60150146484375, "learning_rate": 2.84155756538937e-07, "loss": 0.0692, "reward": 1.5000000223517418, "reward_std": 0.2809883914887905, "rewards/accuracy_reward": 0.5416666716337204, "rewards/format_reward": 0.9583333358168602, "step": 657 }, { "clip_ratio": 0.0, "completion_length": 641.3229370117188, "epoch": 0.7018666666666666, "grad_norm": 0.9151586294174194, "kl": 0.057586669921875, "learning_rate": 2.8293991218839137e-07, "loss": -0.0076, "reward": 1.8020833879709244, "reward_std": 0.36325282603502274, "rewards/accuracy_reward": 0.8541666865348816, "rewards/format_reward": 0.947916679084301, "step": 658 }, { "clip_ratio": 0.0, "completion_length": 1223.2916831970215, "epoch": 0.7029333333333333, "grad_norm": 1.6588691473007202, "kl": 0.117584228515625, "learning_rate": 2.8172706996542505e-07, "loss": 0.0651, "reward": 1.6041667014360428, "reward_std": 0.461502131074667, "rewards/accuracy_reward": 0.6354166818782687, "rewards/format_reward": 0.9687500074505806, "step": 659 }, { "clip_ratio": 0.0, "completion_length": 883.8958587646484, "epoch": 0.704, "grad_norm": 1.0666347742080688, "kl": 0.237640380859375, "learning_rate": 2.805172435040754e-07, "loss": -0.0286, "reward": 1.6875000447034836, "reward_std": 0.41948574036359787, "rewards/accuracy_reward": 0.7604166902601719, "rewards/format_reward": 0.9270833432674408, "step": 660 }, { "clip_ratio": 0.0, "completion_length": 892.2500343322754, "epoch": 0.7050666666666666, "grad_norm": 0.6305563449859619, "kl": 0.16796875, "learning_rate": 2.7931044640447756e-07, "loss": 0.065, "reward": 1.6979167088866234, "reward_std": 0.2620038352906704, "rewards/accuracy_reward": 0.7604166697710752, "rewards/format_reward": 0.9375000149011612, "step": 661 }, { "clip_ratio": 0.0, "completion_length": 1088.4375228881836, "epoch": 0.7061333333333333, "grad_norm": 2.0747480392456055, "kl": 0.50091552734375, "learning_rate": 2.781066922327128e-07, "loss": 0.1477, "reward": 1.4583333656191826, "reward_std": 0.5128946527838707, "rewards/accuracy_reward": 0.562500013038516, "rewards/format_reward": 0.8958333432674408, "step": 662 }, { "clip_ratio": 0.0, "completion_length": 1187.2917022705078, "epoch": 0.7072, "grad_norm": 4.1691203117370605, "kl": 0.506561279296875, "learning_rate": 2.7690599452065594e-07, "loss": 0.2036, "reward": 1.5000000298023224, "reward_std": 0.5297778397798538, "rewards/accuracy_reward": 0.5833333460614085, "rewards/format_reward": 0.9166666865348816, "step": 663 }, { "clip_ratio": 0.0, "completion_length": 913.7916793823242, "epoch": 0.7082666666666667, "grad_norm": 0.5313553214073181, "kl": 0.0843505859375, "learning_rate": 2.757083667658223e-07, "loss": -0.0145, "reward": 1.6562500298023224, "reward_std": 0.32917358353734016, "rewards/accuracy_reward": 0.677083345130086, "rewards/format_reward": 0.9791666716337204, "step": 664 }, { "clip_ratio": 0.0, "completion_length": 835.8958511352539, "epoch": 0.7093333333333334, "grad_norm": 4.928295612335205, "kl": 0.421875, "learning_rate": 2.7451382243121715e-07, "loss": 0.1142, "reward": 1.7083333879709244, "reward_std": 0.4274976886808872, "rewards/accuracy_reward": 0.7500000149011612, "rewards/format_reward": 0.9583333358168602, "step": 665 }, { "clip_ratio": 0.0, "completion_length": 1036.0729598999023, "epoch": 0.7104, "grad_norm": 0.6349641680717468, "kl": 0.119964599609375, "learning_rate": 2.7332237494518363e-07, "loss": 0.0247, "reward": 1.6666666865348816, "reward_std": 0.33679112792015076, "rewards/accuracy_reward": 0.7083333535119891, "rewards/format_reward": 0.9583333432674408, "step": 666 }, { "clip_ratio": 0.0, "completion_length": 997.0104446411133, "epoch": 0.7114666666666667, "grad_norm": 2.4918785095214844, "kl": 0.30010986328125, "learning_rate": 2.7213403770125177e-07, "loss": 0.0392, "reward": 1.6041667088866234, "reward_std": 0.44989366829395294, "rewards/accuracy_reward": 0.666666679084301, "rewards/format_reward": 0.9375000149011612, "step": 667 }, { "clip_ratio": 0.0, "completion_length": 871.4896049499512, "epoch": 0.7125333333333334, "grad_norm": 3.289602279663086, "kl": 0.713134765625, "learning_rate": 2.7094882405798845e-07, "loss": 0.1865, "reward": 1.5625000149011612, "reward_std": 0.3819554075598717, "rewards/accuracy_reward": 0.6145833414047956, "rewards/format_reward": 0.9479166716337204, "step": 668 }, { "clip_ratio": 0.0, "completion_length": 762.291690826416, "epoch": 0.7136, "grad_norm": 1.761427879333496, "kl": 0.173095703125, "learning_rate": 2.6976674733884697e-07, "loss": 0.0551, "reward": 1.677083358168602, "reward_std": 0.3454144485294819, "rewards/accuracy_reward": 0.6979166744276881, "rewards/format_reward": 0.9791666716337204, "step": 669 }, { "clip_ratio": 0.0, "completion_length": 1314.7812805175781, "epoch": 0.7146666666666667, "grad_norm": 10.182624816894531, "kl": 1.03778076171875, "learning_rate": 2.6858782083201667e-07, "loss": 0.2645, "reward": 0.9687500298023224, "reward_std": 0.5213157199323177, "rewards/accuracy_reward": 0.17708334140479565, "rewards/format_reward": 0.7916666865348816, "step": 670 }, { "clip_ratio": 0.0, "completion_length": 1088.1041984558105, "epoch": 0.7157333333333333, "grad_norm": 3.976217269897461, "kl": 0.63525390625, "learning_rate": 2.6741205779027505e-07, "loss": 0.157, "reward": 1.4166666865348816, "reward_std": 0.36350977420806885, "rewards/accuracy_reward": 0.5104166818782687, "rewards/format_reward": 0.9062500149011612, "step": 671 }, { "clip_ratio": 0.0, "completion_length": 1121.1875343322754, "epoch": 0.7168, "grad_norm": 6.075131893157959, "kl": 0.512603759765625, "learning_rate": 2.662394714308368e-07, "loss": 0.1438, "reward": 1.447916716337204, "reward_std": 0.4291433356702328, "rewards/accuracy_reward": 0.5312500055879354, "rewards/format_reward": 0.9166666865348816, "step": 672 }, { "clip_ratio": 0.0, "completion_length": 862.4479331970215, "epoch": 0.7178666666666667, "grad_norm": 160.36422729492188, "kl": 2.6207275390625, "learning_rate": 2.6507007493520705e-07, "loss": 0.2569, "reward": 1.6041667014360428, "reward_std": 0.3186477944254875, "rewards/accuracy_reward": 0.6562500055879354, "rewards/format_reward": 0.9479166716337204, "step": 673 }, { "clip_ratio": 0.0, "completion_length": 981.2083740234375, "epoch": 0.7189333333333333, "grad_norm": 0.7098330855369568, "kl": 0.1138916015625, "learning_rate": 2.639038814490323e-07, "loss": 0.0471, "reward": 1.5104167014360428, "reward_std": 0.4105909503996372, "rewards/accuracy_reward": 0.6250000176951289, "rewards/format_reward": 0.885416679084301, "step": 674 }, { "clip_ratio": 0.0, "completion_length": 1081.8021087646484, "epoch": 0.72, "grad_norm": 6.5205559730529785, "kl": 0.8470458984375, "learning_rate": 2.6274090408195227e-07, "loss": 0.1279, "reward": 1.7187500447034836, "reward_std": 0.3710501566529274, "rewards/accuracy_reward": 0.8229166753590107, "rewards/format_reward": 0.895833358168602, "step": 675 }, { "clip_ratio": 0.0, "completion_length": 1009.6354370117188, "epoch": 0.7210666666666666, "grad_norm": 0.9510160088539124, "kl": 0.207611083984375, "learning_rate": 2.6158115590745355e-07, "loss": 0.0592, "reward": 1.437500037252903, "reward_std": 0.39434347301721573, "rewards/accuracy_reward": 0.5104166753590107, "rewards/format_reward": 0.9270833432674408, "step": 676 }, { "clip_ratio": 0.0, "completion_length": 1139.7812957763672, "epoch": 0.7221333333333333, "grad_norm": 4.398621082305908, "kl": 0.975067138671875, "learning_rate": 2.6042464996272206e-07, "loss": 0.0769, "reward": 1.4375000447034836, "reward_std": 0.4710211008787155, "rewards/accuracy_reward": 0.5104166716337204, "rewards/format_reward": 0.9270833432674408, "step": 677 }, { "clip_ratio": 0.0, "completion_length": 961.1666870117188, "epoch": 0.7232, "grad_norm": 2.0873498916625977, "kl": 0.165191650390625, "learning_rate": 2.592713992484962e-07, "loss": -0.0303, "reward": 1.5312500298023224, "reward_std": 0.3082825541496277, "rewards/accuracy_reward": 0.5833333423361182, "rewards/format_reward": 0.9479166716337204, "step": 678 }, { "clip_ratio": 0.0, "completion_length": 1108.7812805175781, "epoch": 0.7242666666666666, "grad_norm": 1.296276330947876, "kl": 0.219757080078125, "learning_rate": 2.5812141672892123e-07, "loss": 0.0309, "reward": 1.5625000298023224, "reward_std": 0.5001791417598724, "rewards/accuracy_reward": 0.6354166781529784, "rewards/format_reward": 0.9270833432674408, "step": 679 }, { "clip_ratio": 0.0, "completion_length": 1082.5000457763672, "epoch": 0.7253333333333334, "grad_norm": 1.211511492729187, "kl": 0.507110595703125, "learning_rate": 2.5697471533140355e-07, "loss": 0.0133, "reward": 1.635416716337204, "reward_std": 0.4115978553891182, "rewards/accuracy_reward": 0.6979166818782687, "rewards/format_reward": 0.9375000074505806, "step": 680 }, { "clip_ratio": 0.0, "completion_length": 1108.7291984558105, "epoch": 0.7264, "grad_norm": 0.707737922668457, "kl": 0.177886962890625, "learning_rate": 2.558313079464648e-07, "loss": 0.031, "reward": 1.3645833656191826, "reward_std": 0.40360889956355095, "rewards/accuracy_reward": 0.4895833460614085, "rewards/format_reward": 0.8750000074505806, "step": 681 }, { "clip_ratio": 0.0, "completion_length": 934.7708435058594, "epoch": 0.7274666666666667, "grad_norm": 5.215168476104736, "kl": 0.4847412109375, "learning_rate": 2.5469120742759753e-07, "loss": 0.0793, "reward": 1.489583358168602, "reward_std": 0.3755987100303173, "rewards/accuracy_reward": 0.5937500074505806, "rewards/format_reward": 0.8958333507180214, "step": 682 }, { "clip_ratio": 0.0, "completion_length": 778.8333511352539, "epoch": 0.7285333333333334, "grad_norm": 0.47144362330436707, "kl": 0.115447998046875, "learning_rate": 2.535544265911208e-07, "loss": -0.0382, "reward": 1.7187500447034836, "reward_std": 0.3311595916748047, "rewards/accuracy_reward": 0.7500000223517418, "rewards/format_reward": 0.9687500074505806, "step": 683 }, { "clip_ratio": 0.0, "completion_length": 967.2187881469727, "epoch": 0.7296, "grad_norm": 4.495987892150879, "kl": 0.325439453125, "learning_rate": 2.5242097821603505e-07, "loss": 0.0106, "reward": 1.3854166865348816, "reward_std": 0.32177040725946426, "rewards/accuracy_reward": 0.43750000558793545, "rewards/format_reward": 0.947916679084301, "step": 684 }, { "clip_ratio": 0.0, "completion_length": 1080.3229446411133, "epoch": 0.7306666666666667, "grad_norm": 1.357442021369934, "kl": 0.287567138671875, "learning_rate": 2.5129087504388003e-07, "loss": -0.0289, "reward": 1.3125000447034836, "reward_std": 0.4295176789164543, "rewards/accuracy_reward": 0.354166672565043, "rewards/format_reward": 0.9583333432674408, "step": 685 }, { "clip_ratio": 0.0, "completion_length": 871.0833625793457, "epoch": 0.7317333333333333, "grad_norm": 6.430542469024658, "kl": 0.325592041015625, "learning_rate": 2.5016412977859005e-07, "loss": 0.1271, "reward": 1.4375000521540642, "reward_std": 0.4537182115018368, "rewards/accuracy_reward": 0.5625000176951289, "rewards/format_reward": 0.8750000223517418, "step": 686 }, { "clip_ratio": 0.0, "completion_length": 1094.7708587646484, "epoch": 0.7328, "grad_norm": 16.837717056274414, "kl": 0.314453125, "learning_rate": 2.4904075508635236e-07, "loss": 0.091, "reward": 1.4895833656191826, "reward_std": 0.41906312108039856, "rewards/accuracy_reward": 0.572916679084301, "rewards/format_reward": 0.9166666865348816, "step": 687 }, { "clip_ratio": 0.0, "completion_length": 1363.0208740234375, "epoch": 0.7338666666666667, "grad_norm": 27.720783233642578, "kl": 0.4267578125, "learning_rate": 2.479207635954643e-07, "loss": 0.1668, "reward": 1.2187500298023224, "reward_std": 0.4818352535367012, "rewards/accuracy_reward": 0.35416667722165585, "rewards/format_reward": 0.8645833507180214, "step": 688 }, { "clip_ratio": 0.0, "completion_length": 1054.3333587646484, "epoch": 0.7349333333333333, "grad_norm": 11.535548210144043, "kl": 0.381927490234375, "learning_rate": 2.4680416789619076e-07, "loss": 0.044, "reward": 1.3750000447034836, "reward_std": 0.40773781016469, "rewards/accuracy_reward": 0.4791666716337204, "rewards/format_reward": 0.8958333432674408, "step": 689 }, { "clip_ratio": 0.0, "completion_length": 1012.8750343322754, "epoch": 0.736, "grad_norm": 1.0356338024139404, "kl": 0.170196533203125, "learning_rate": 2.4569098054062384e-07, "loss": 0.0948, "reward": 1.5104166939854622, "reward_std": 0.3004790209233761, "rewards/accuracy_reward": 0.5520833404734731, "rewards/format_reward": 0.9583333432674408, "step": 690 }, { "clip_ratio": 0.0, "completion_length": 908.0208435058594, "epoch": 0.7370666666666666, "grad_norm": 0.7542375922203064, "kl": 0.1258544921875, "learning_rate": 2.445812140425408e-07, "loss": 0.0051, "reward": 1.4583333507180214, "reward_std": 0.3205810487270355, "rewards/accuracy_reward": 0.5520833432674408, "rewards/format_reward": 0.9062500074505806, "step": 691 }, { "clip_ratio": 0.0, "completion_length": 722.5416870117188, "epoch": 0.7381333333333333, "grad_norm": 0.7455814480781555, "kl": 0.123077392578125, "learning_rate": 2.4347488087726346e-07, "loss": 0.0306, "reward": 1.8125000447034836, "reward_std": 0.39229433983564377, "rewards/accuracy_reward": 0.8541666865348816, "rewards/format_reward": 0.9583333432674408, "step": 692 }, { "clip_ratio": 0.0, "completion_length": 928.5104331970215, "epoch": 0.7392, "grad_norm": 1.7981981039047241, "kl": 0.409912109375, "learning_rate": 2.423719934815187e-07, "loss": 0.0153, "reward": 1.635416716337204, "reward_std": 0.36286717280745506, "rewards/accuracy_reward": 0.6875000102445483, "rewards/format_reward": 0.9479166716337204, "step": 693 }, { "clip_ratio": 0.0, "completion_length": 696.1875267028809, "epoch": 0.7402666666666666, "grad_norm": 1.5391870737075806, "kl": 0.104095458984375, "learning_rate": 2.41272564253298e-07, "loss": 0.0576, "reward": 1.7291666865348816, "reward_std": 0.4050438515841961, "rewards/accuracy_reward": 0.8020833507180214, "rewards/format_reward": 0.9270833432674408, "step": 694 }, { "clip_ratio": 0.0, "completion_length": 1328.0833587646484, "epoch": 0.7413333333333333, "grad_norm": 4.551530361175537, "kl": 1.223785400390625, "learning_rate": 2.401766055517178e-07, "loss": 0.1597, "reward": 1.2604167088866234, "reward_std": 0.43168457970023155, "rewards/accuracy_reward": 0.4062500074505806, "rewards/format_reward": 0.8541666865348816, "step": 695 }, { "clip_ratio": 0.0, "completion_length": 1056.1354370117188, "epoch": 0.7424, "grad_norm": 1.337199330329895, "kl": 0.403656005859375, "learning_rate": 2.390841296968817e-07, "loss": 0.0657, "reward": 1.4270833656191826, "reward_std": 0.3545970916748047, "rewards/accuracy_reward": 0.5000000055879354, "rewards/format_reward": 0.9270833432674408, "step": 696 }, { "clip_ratio": 0.0, "completion_length": 588.1562652587891, "epoch": 0.7434666666666667, "grad_norm": 0.6823047399520874, "kl": 0.1053466796875, "learning_rate": 2.379951489697404e-07, "loss": 0.032, "reward": 1.7187500298023224, "reward_std": 0.3323116935789585, "rewards/accuracy_reward": 0.7708333414047956, "rewards/format_reward": 0.947916679084301, "step": 697 }, { "clip_ratio": 0.0, "completion_length": 1371.4166793823242, "epoch": 0.7445333333333334, "grad_norm": 5.908626556396484, "kl": 0.717620849609375, "learning_rate": 2.3690967561195527e-07, "loss": 0.1489, "reward": 1.2083333656191826, "reward_std": 0.4223487488925457, "rewards/accuracy_reward": 0.3750000074505806, "rewards/format_reward": 0.8333333507180214, "step": 698 }, { "clip_ratio": 0.0, "completion_length": 891.7604370117188, "epoch": 0.7456, "grad_norm": 3.848353385925293, "kl": 1.01715087890625, "learning_rate": 2.3582772182575967e-07, "loss": 0.0464, "reward": 1.3333333656191826, "reward_std": 0.518491305410862, "rewards/accuracy_reward": 0.4270833460614085, "rewards/format_reward": 0.9062500149011612, "step": 699 }, { "clip_ratio": 0.0, "completion_length": 1018.6979370117188, "epoch": 0.7466666666666667, "grad_norm": 0.8497524261474609, "kl": 0.23333740234375, "learning_rate": 2.3474929977382175e-07, "loss": -0.0377, "reward": 1.4687500298023224, "reward_std": 0.47928325086832047, "rewards/accuracy_reward": 0.5208333553746343, "rewards/format_reward": 0.947916679084301, "step": 700 }, { "clip_ratio": 0.0, "completion_length": 988.1041946411133, "epoch": 0.7477333333333334, "grad_norm": 1.7994163036346436, "kl": 0.22723388671875, "learning_rate": 2.3367442157910848e-07, "loss": 0.033, "reward": 1.4791666939854622, "reward_std": 0.4290347881615162, "rewards/accuracy_reward": 0.5729166865348816, "rewards/format_reward": 0.9062500074505806, "step": 701 }, { "clip_ratio": 0.0, "completion_length": 786.6875152587891, "epoch": 0.7488, "grad_norm": 0.5747361183166504, "kl": 0.1256103515625, "learning_rate": 2.3260309932474904e-07, "loss": 0.0832, "reward": 1.7187500298023224, "reward_std": 0.29825780540704727, "rewards/accuracy_reward": 0.7395833488553762, "rewards/format_reward": 0.9791666716337204, "step": 702 }, { "clip_ratio": 0.0, "completion_length": 863.4896087646484, "epoch": 0.7498666666666667, "grad_norm": 3.1733694076538086, "kl": 0.294769287109375, "learning_rate": 2.315353450538982e-07, "loss": 0.0396, "reward": 1.375000037252903, "reward_std": 0.461956899613142, "rewards/accuracy_reward": 0.45833334047347307, "rewards/format_reward": 0.916666679084301, "step": 703 }, { "clip_ratio": 0.0, "completion_length": 906.8750305175781, "epoch": 0.7509333333333333, "grad_norm": 27.232263565063477, "kl": 0.399383544921875, "learning_rate": 2.3047117076960228e-07, "loss": 0.0932, "reward": 1.6354167088866234, "reward_std": 0.4527810290455818, "rewards/accuracy_reward": 0.7083333488553762, "rewards/format_reward": 0.9270833432674408, "step": 704 }, { "clip_ratio": 0.0, "completion_length": 1083.7500228881836, "epoch": 0.752, "grad_norm": 3.5035312175750732, "kl": 0.253082275390625, "learning_rate": 2.294105884346635e-07, "loss": 0.0769, "reward": 1.5208333730697632, "reward_std": 0.4758353419601917, "rewards/accuracy_reward": 0.5937500093132257, "rewards/format_reward": 0.9270833432674408, "step": 705 }, { "clip_ratio": 0.0, "completion_length": 1037.6354446411133, "epoch": 0.7530666666666667, "grad_norm": 1.8923299312591553, "kl": 0.310272216796875, "learning_rate": 2.2835360997150504e-07, "loss": 0.026, "reward": 1.437500037252903, "reward_std": 0.4207856319844723, "rewards/accuracy_reward": 0.5000000074505806, "rewards/format_reward": 0.9375000149011612, "step": 706 }, { "clip_ratio": 0.0, "completion_length": 967.479190826416, "epoch": 0.7541333333333333, "grad_norm": 0.7099140882492065, "kl": 0.15631103515625, "learning_rate": 2.2730024726203827e-07, "loss": -0.038, "reward": 1.6145833730697632, "reward_std": 0.3547466956079006, "rewards/accuracy_reward": 0.6666666772216558, "rewards/format_reward": 0.947916679084301, "step": 707 }, { "clip_ratio": 0.0, "completion_length": 934.354190826416, "epoch": 0.7552, "grad_norm": 0.827660083770752, "kl": 0.1495361328125, "learning_rate": 2.2625051214752774e-07, "loss": 0.0168, "reward": 1.770833358168602, "reward_std": 0.31749148294329643, "rewards/accuracy_reward": 0.8020833507180214, "rewards/format_reward": 0.9687500074505806, "step": 708 }, { "clip_ratio": 0.0, "completion_length": 780.3646011352539, "epoch": 0.7562666666666666, "grad_norm": 0.7398484349250793, "kl": 0.115692138671875, "learning_rate": 2.252044164284593e-07, "loss": -0.0168, "reward": 1.5833333730697632, "reward_std": 0.329414464533329, "rewards/accuracy_reward": 0.6354166772216558, "rewards/format_reward": 0.947916679084301, "step": 709 }, { "clip_ratio": 0.0, "completion_length": 919.0833435058594, "epoch": 0.7573333333333333, "grad_norm": 1.3091974258422852, "kl": 0.24310302734375, "learning_rate": 2.241619718644068e-07, "loss": 0.0355, "reward": 1.6875000447034836, "reward_std": 0.36130407452583313, "rewards/accuracy_reward": 0.7291666772216558, "rewards/format_reward": 0.9583333432674408, "step": 710 }, { "clip_ratio": 0.0, "completion_length": 1170.0937805175781, "epoch": 0.7584, "grad_norm": 0.6972825527191162, "kl": 0.23699951171875, "learning_rate": 2.2312319017389976e-07, "loss": 0.0048, "reward": 1.3750000298023224, "reward_std": 0.3249349743127823, "rewards/accuracy_reward": 0.447916672565043, "rewards/format_reward": 0.9270833507180214, "step": 711 }, { "clip_ratio": 0.0, "completion_length": 927.1562728881836, "epoch": 0.7594666666666666, "grad_norm": 1.7842580080032349, "kl": 0.210845947265625, "learning_rate": 2.2208808303429227e-07, "loss": 0.0452, "reward": 1.437500037252903, "reward_std": 0.39503008499741554, "rewards/accuracy_reward": 0.5312500149011612, "rewards/format_reward": 0.9062500149011612, "step": 712 }, { "clip_ratio": 0.0, "completion_length": 1067.9479675292969, "epoch": 0.7605333333333333, "grad_norm": 1.5189014673233032, "kl": 0.219696044921875, "learning_rate": 2.2105666208163114e-07, "loss": -0.0018, "reward": 1.4895833432674408, "reward_std": 0.3450811840593815, "rewards/accuracy_reward": 0.5416666679084301, "rewards/format_reward": 0.9479166716337204, "step": 713 }, { "clip_ratio": 0.0, "completion_length": 1017.8646087646484, "epoch": 0.7616, "grad_norm": 5.648002624511719, "kl": 0.35919189453125, "learning_rate": 2.2002893891052527e-07, "loss": 0.1539, "reward": 1.4375000298023224, "reward_std": 0.43367520347237587, "rewards/accuracy_reward": 0.5104166772216558, "rewards/format_reward": 0.9270833432674408, "step": 714 }, { "clip_ratio": 0.0, "completion_length": 884.9166946411133, "epoch": 0.7626666666666667, "grad_norm": 1.5512899160385132, "kl": 0.180419921875, "learning_rate": 2.1900492507401542e-07, "loss": 0.0457, "reward": 1.3229166939854622, "reward_std": 0.45191490650177, "rewards/accuracy_reward": 0.42708334885537624, "rewards/format_reward": 0.8958333507180214, "step": 715 }, { "clip_ratio": 0.0, "completion_length": 899.4166889190674, "epoch": 0.7637333333333334, "grad_norm": 4.212532997131348, "kl": 0.292205810546875, "learning_rate": 2.1798463208344438e-07, "loss": 0.1051, "reward": 1.5937500447034836, "reward_std": 0.3960140123963356, "rewards/accuracy_reward": 0.6562500111758709, "rewards/format_reward": 0.9375000149011612, "step": 716 }, { "clip_ratio": 0.0, "completion_length": 895.2500305175781, "epoch": 0.7648, "grad_norm": 1.3778448104858398, "kl": 0.199737548828125, "learning_rate": 2.169680714083271e-07, "loss": 0.0304, "reward": 1.4375000447034836, "reward_std": 0.43354735895991325, "rewards/accuracy_reward": 0.5208333460614085, "rewards/format_reward": 0.916666679084301, "step": 717 }, { "clip_ratio": 0.0, "completion_length": 694.6979370117188, "epoch": 0.7658666666666667, "grad_norm": 1.3334805965423584, "kl": 0.195556640625, "learning_rate": 2.159552544762225e-07, "loss": 0.0149, "reward": 1.6666666865348816, "reward_std": 0.3372039869427681, "rewards/accuracy_reward": 0.7083333358168602, "rewards/format_reward": 0.9583333432674408, "step": 718 }, { "clip_ratio": 0.0, "completion_length": 883.8229446411133, "epoch": 0.7669333333333334, "grad_norm": 1.8701186180114746, "kl": 0.297149658203125, "learning_rate": 2.1494619267260423e-07, "loss": 0.1146, "reward": 1.593750037252903, "reward_std": 0.31725750863552094, "rewards/accuracy_reward": 0.666666679084301, "rewards/format_reward": 0.9270833507180214, "step": 719 }, { "clip_ratio": 0.0, "completion_length": 972.7396087646484, "epoch": 0.768, "grad_norm": 1.7674444913864136, "kl": 0.248809814453125, "learning_rate": 2.1394089734073334e-07, "loss": 0.0588, "reward": 1.489583358168602, "reward_std": 0.47073448821902275, "rewards/accuracy_reward": 0.5625000149011612, "rewards/format_reward": 0.9270833432674408, "step": 720 }, { "clip_ratio": 0.0, "completion_length": 890.0000114440918, "epoch": 0.7690666666666667, "grad_norm": 1.1370450258255005, "kl": 0.173095703125, "learning_rate": 2.129393797815306e-07, "loss": -0.0144, "reward": 1.656250037252903, "reward_std": 0.35308264568448067, "rewards/accuracy_reward": 0.7604166865348816, "rewards/format_reward": 0.8958333432674408, "step": 721 }, { "clip_ratio": 0.0, "completion_length": 987.9271240234375, "epoch": 0.7701333333333333, "grad_norm": 1.0588834285736084, "kl": 0.25982666015625, "learning_rate": 2.1194165125344887e-07, "loss": 0.0176, "reward": 1.2916667014360428, "reward_std": 0.40891727805137634, "rewards/accuracy_reward": 0.38541667722165585, "rewards/format_reward": 0.9062500149011612, "step": 722 }, { "clip_ratio": 0.0, "completion_length": 1072.3542175292969, "epoch": 0.7712, "grad_norm": 3.3874661922454834, "kl": 0.32440185546875, "learning_rate": 2.109477229723474e-07, "loss": 0.0543, "reward": 1.4166666939854622, "reward_std": 0.3817276917397976, "rewards/accuracy_reward": 0.4687500149011612, "rewards/format_reward": 0.947916679084301, "step": 723 }, { "clip_ratio": 0.0, "completion_length": 737.2291946411133, "epoch": 0.7722666666666667, "grad_norm": 0.7529110312461853, "kl": 0.1143798828125, "learning_rate": 2.099576061113655e-07, "loss": 0.0298, "reward": 1.7500000596046448, "reward_std": 0.40108276903629303, "rewards/accuracy_reward": 0.8020833432674408, "rewards/format_reward": 0.947916679084301, "step": 724 }, { "clip_ratio": 0.0, "completion_length": 807.9375228881836, "epoch": 0.7733333333333333, "grad_norm": 1.0176854133605957, "kl": 0.1875, "learning_rate": 2.0897131180079613e-07, "loss": 0.0398, "reward": 1.6666666939854622, "reward_std": 0.3032701797783375, "rewards/accuracy_reward": 0.6875000111758709, "rewards/format_reward": 0.9791666716337204, "step": 725 }, { "clip_ratio": 0.0, "completion_length": 856.6771125793457, "epoch": 0.7744, "grad_norm": 0.9703332185745239, "kl": 0.2115478515625, "learning_rate": 2.079888511279622e-07, "loss": 0.0827, "reward": 1.3437500447034836, "reward_std": 0.28237831965088844, "rewards/accuracy_reward": 0.3958333367481828, "rewards/format_reward": 0.947916679084301, "step": 726 }, { "clip_ratio": 0.0, "completion_length": 1148.0625381469727, "epoch": 0.7754666666666666, "grad_norm": 5.954011917114258, "kl": 0.39178466796875, "learning_rate": 2.07010235137091e-07, "loss": 0.0592, "reward": 1.229166716337204, "reward_std": 0.5766806975007057, "rewards/accuracy_reward": 0.375000006519258, "rewards/format_reward": 0.8541666939854622, "step": 727 }, { "clip_ratio": 0.0, "completion_length": 1000.8542022705078, "epoch": 0.7765333333333333, "grad_norm": 0.8599480986595154, "kl": 0.241119384765625, "learning_rate": 2.060354748291898e-07, "loss": 0.0134, "reward": 1.4895833656191826, "reward_std": 0.35373418405652046, "rewards/accuracy_reward": 0.5416666828095913, "rewards/format_reward": 0.947916679084301, "step": 728 }, { "clip_ratio": 0.0, "completion_length": 778.3958473205566, "epoch": 0.7776, "grad_norm": 1.1848405599594116, "kl": 0.1983642578125, "learning_rate": 2.0506458116192332e-07, "loss": 0.0377, "reward": 1.5833333507180214, "reward_std": 0.4458504989743233, "rewards/accuracy_reward": 0.6458333535119891, "rewards/format_reward": 0.9375000149011612, "step": 729 }, { "clip_ratio": 0.0, "completion_length": 1002.8333740234375, "epoch": 0.7786666666666666, "grad_norm": 1.547785997390747, "kl": 0.27142333984375, "learning_rate": 2.0409756504948916e-07, "loss": 0.0474, "reward": 1.6041666865348816, "reward_std": 0.48500459641218185, "rewards/accuracy_reward": 0.6458333432674408, "rewards/format_reward": 0.9583333432674408, "step": 730 }, { "clip_ratio": 0.0, "completion_length": 1217.614616394043, "epoch": 0.7797333333333333, "grad_norm": 1.3330804109573364, "kl": 0.42578125, "learning_rate": 2.0313443736249624e-07, "loss": 0.0334, "reward": 1.416666679084301, "reward_std": 0.5124132037162781, "rewards/accuracy_reward": 0.5520833535119891, "rewards/format_reward": 0.864583358168602, "step": 731 }, { "clip_ratio": 0.0, "completion_length": 811.0208625793457, "epoch": 0.7808, "grad_norm": 2.3340306282043457, "kl": 0.291259765625, "learning_rate": 2.0217520892784218e-07, "loss": 0.058, "reward": 1.5000000298023224, "reward_std": 0.27726873755455017, "rewards/accuracy_reward": 0.5520833432674408, "rewards/format_reward": 0.947916679084301, "step": 732 }, { "clip_ratio": 0.0, "completion_length": 1068.7604446411133, "epoch": 0.7818666666666667, "grad_norm": 1.4344955682754517, "kl": 0.33612060546875, "learning_rate": 2.0121989052859117e-07, "loss": 0.0604, "reward": 1.302083358168602, "reward_std": 0.24196770787239075, "rewards/accuracy_reward": 0.3645833386108279, "rewards/format_reward": 0.9375000074505806, "step": 733 }, { "clip_ratio": 0.0, "completion_length": 1125.4270973205566, "epoch": 0.7829333333333334, "grad_norm": 10.290349006652832, "kl": 1.21771240234375, "learning_rate": 2.0026849290385352e-07, "loss": 0.1414, "reward": 1.4270833805203438, "reward_std": 0.38045133650302887, "rewards/accuracy_reward": 0.5208333395421505, "rewards/format_reward": 0.9062500149011612, "step": 734 }, { "clip_ratio": 0.0, "completion_length": 1117.6458854675293, "epoch": 0.784, "grad_norm": 21.428125381469727, "kl": 0.62823486328125, "learning_rate": 1.9932102674866469e-07, "loss": 0.1579, "reward": 1.3645833507180214, "reward_std": 0.40390491113066673, "rewards/accuracy_reward": 0.437500006519258, "rewards/format_reward": 0.9270833432674408, "step": 735 }, { "clip_ratio": 0.0, "completion_length": 977.4479331970215, "epoch": 0.7850666666666667, "grad_norm": 1.7059738636016846, "kl": 0.30517578125, "learning_rate": 1.9837750271386428e-07, "loss": 0.0208, "reward": 1.4479167088866234, "reward_std": 0.4304392486810684, "rewards/accuracy_reward": 0.5312500204890966, "rewards/format_reward": 0.9166666865348816, "step": 736 }, { "clip_ratio": 0.0, "completion_length": 925.5208511352539, "epoch": 0.7861333333333334, "grad_norm": 2.0579044818878174, "kl": 0.241943359375, "learning_rate": 1.974379314059777e-07, "loss": -0.0127, "reward": 1.3958333432674408, "reward_std": 0.32745253294706345, "rewards/accuracy_reward": 0.4687500027939677, "rewards/format_reward": 0.9270833432674408, "step": 737 }, { "clip_ratio": 0.0, "completion_length": 942.6875228881836, "epoch": 0.7872, "grad_norm": 1.8747525215148926, "kl": 0.2430419921875, "learning_rate": 1.9650232338709596e-07, "loss": 0.0652, "reward": 1.4895833432674408, "reward_std": 0.3189259320497513, "rewards/accuracy_reward": 0.5416666707023978, "rewards/format_reward": 0.9479166716337204, "step": 738 }, { "clip_ratio": 0.0, "completion_length": 722.5104522705078, "epoch": 0.7882666666666667, "grad_norm": 1.0884095430374146, "kl": 0.2164306640625, "learning_rate": 1.9557068917475683e-07, "loss": 0.0073, "reward": 1.437500037252903, "reward_std": 0.3657444529235363, "rewards/accuracy_reward": 0.500000006519258, "rewards/format_reward": 0.9375000149011612, "step": 739 }, { "clip_ratio": 0.0, "completion_length": 1252.6979522705078, "epoch": 0.7893333333333333, "grad_norm": 3.8189446926116943, "kl": 0.528564453125, "learning_rate": 1.946430392418274e-07, "loss": 0.0416, "reward": 1.3750000223517418, "reward_std": 0.3463601917028427, "rewards/accuracy_reward": 0.4791666753590107, "rewards/format_reward": 0.8958333432674408, "step": 740 }, { "clip_ratio": 0.0, "completion_length": 975.3541870117188, "epoch": 0.7904, "grad_norm": 3.2512094974517822, "kl": 0.29052734375, "learning_rate": 1.937193840163859e-07, "loss": -0.0645, "reward": 1.3958333507180214, "reward_std": 0.42790301889181137, "rewards/accuracy_reward": 0.4375000102445483, "rewards/format_reward": 0.9583333432674408, "step": 741 }, { "clip_ratio": 0.0, "completion_length": 837.6146049499512, "epoch": 0.7914666666666667, "grad_norm": 1.8230537176132202, "kl": 0.2279052734375, "learning_rate": 1.9279973388160408e-07, "loss": 0.0036, "reward": 1.6145833730697632, "reward_std": 0.43504659086465836, "rewards/accuracy_reward": 0.6666666753590107, "rewards/format_reward": 0.947916679084301, "step": 742 }, { "clip_ratio": 0.0, "completion_length": 898.4791870117188, "epoch": 0.7925333333333333, "grad_norm": 1.1278133392333984, "kl": 0.2513427734375, "learning_rate": 1.9188409917563132e-07, "loss": 0.0147, "reward": 1.395833358168602, "reward_std": 0.24955713748931885, "rewards/accuracy_reward": 0.44791666977107525, "rewards/format_reward": 0.947916679084301, "step": 743 }, { "clip_ratio": 0.0, "completion_length": 1120.9479370117188, "epoch": 0.7936, "grad_norm": 3.0750279426574707, "kl": 0.2421875, "learning_rate": 1.909724901914776e-07, "loss": 0.0798, "reward": 1.3645833656191826, "reward_std": 0.40000708773732185, "rewards/accuracy_reward": 0.4375000111758709, "rewards/format_reward": 0.9270833507180214, "step": 744 }, { "clip_ratio": 0.0, "completion_length": 1103.1250457763672, "epoch": 0.7946666666666666, "grad_norm": 2.6668882369995117, "kl": 0.250030517578125, "learning_rate": 1.9006491717689853e-07, "loss": -0.0186, "reward": 1.6250000298023224, "reward_std": 0.44062621146440506, "rewards/accuracy_reward": 0.6770833563059568, "rewards/format_reward": 0.947916679084301, "step": 745 }, { "clip_ratio": 0.0, "completion_length": 854.8958511352539, "epoch": 0.7957333333333333, "grad_norm": 12.840085983276367, "kl": 0.2720947265625, "learning_rate": 1.8916139033427975e-07, "loss": 0.0587, "reward": 1.4375000447034836, "reward_std": 0.46082213521003723, "rewards/accuracy_reward": 0.5312500139698386, "rewards/format_reward": 0.9062500149011612, "step": 746 }, { "clip_ratio": 0.0, "completion_length": 707.5416946411133, "epoch": 0.7968, "grad_norm": 1.5780742168426514, "kl": 0.16748046875, "learning_rate": 1.8826191982052192e-07, "loss": 0.0192, "reward": 1.562500037252903, "reward_std": 0.2979237139225006, "rewards/accuracy_reward": 0.5937500055879354, "rewards/format_reward": 0.9687500074505806, "step": 747 }, { "clip_ratio": 0.0, "completion_length": 1043.2604446411133, "epoch": 0.7978666666666666, "grad_norm": 6.6425700187683105, "kl": 0.3953857421875, "learning_rate": 1.8736651574692734e-07, "loss": 0.0969, "reward": 1.3958333507180214, "reward_std": 0.34754062071442604, "rewards/accuracy_reward": 0.47916667349636555, "rewards/format_reward": 0.916666679084301, "step": 748 }, { "clip_ratio": 0.0, "completion_length": 830.2291870117188, "epoch": 0.7989333333333334, "grad_norm": 1.246835470199585, "kl": 0.20343017578125, "learning_rate": 1.8647518817908574e-07, "loss": 0.0184, "reward": 1.6666667312383652, "reward_std": 0.4226461946964264, "rewards/accuracy_reward": 0.7291666865348816, "rewards/format_reward": 0.9375000149011612, "step": 749 }, { "clip_ratio": 0.0, "completion_length": 766.9687881469727, "epoch": 0.8, "grad_norm": 0.7691798806190491, "kl": 0.1527099609375, "learning_rate": 1.8558794713676101e-07, "loss": -0.0069, "reward": 1.5625000447034836, "reward_std": 0.38382968306541443, "rewards/accuracy_reward": 0.6354166865348816, "rewards/format_reward": 0.9270833507180214, "step": 750 }, { "clip_ratio": 0.0, "completion_length": 1000.770866394043, "epoch": 0.8010666666666667, "grad_norm": 15.842864990234375, "kl": 0.3739013671875, "learning_rate": 1.8470480259377908e-07, "loss": 0.0363, "reward": 1.458333358168602, "reward_std": 0.578138392418623, "rewards/accuracy_reward": 0.5416666809469461, "rewards/format_reward": 0.916666679084301, "step": 751 }, { "clip_ratio": 0.0, "completion_length": 942.4479293823242, "epoch": 0.8021333333333334, "grad_norm": 2.8398404121398926, "kl": 0.2115478515625, "learning_rate": 1.8382576447791561e-07, "loss": 0.033, "reward": 1.645833358168602, "reward_std": 0.44001880660653114, "rewards/accuracy_reward": 0.6770833469927311, "rewards/format_reward": 0.9687500074505806, "step": 752 }, { "clip_ratio": 0.0, "completion_length": 952.2917098999023, "epoch": 0.8032, "grad_norm": 3.9699840545654297, "kl": 0.5777587890625, "learning_rate": 1.829508426707838e-07, "loss": 0.074, "reward": 1.4687500223517418, "reward_std": 0.5219125226140022, "rewards/accuracy_reward": 0.5937500223517418, "rewards/format_reward": 0.8750000149011612, "step": 753 }, { "clip_ratio": 0.0, "completion_length": 757.4271011352539, "epoch": 0.8042666666666667, "grad_norm": 7.9854888916015625, "kl": 0.36993408203125, "learning_rate": 1.8208004700772437e-07, "loss": 0.1452, "reward": 1.6562500149011612, "reward_std": 0.2189716435968876, "rewards/accuracy_reward": 0.7083333432674408, "rewards/format_reward": 0.9479166716337204, "step": 754 }, { "clip_ratio": 0.0, "completion_length": 950.0521125793457, "epoch": 0.8053333333333333, "grad_norm": 2.006345272064209, "kl": 0.247100830078125, "learning_rate": 1.8121338727769387e-07, "loss": 0.0679, "reward": 1.5104167014360428, "reward_std": 0.4440036676824093, "rewards/accuracy_reward": 0.5833333469927311, "rewards/format_reward": 0.9270833507180214, "step": 755 }, { "clip_ratio": 0.0, "completion_length": 961.1562728881836, "epoch": 0.8064, "grad_norm": 0.7762404680252075, "kl": 0.21185302734375, "learning_rate": 1.8035087322315574e-07, "loss": -0.0233, "reward": 1.541666716337204, "reward_std": 0.3484475761651993, "rewards/accuracy_reward": 0.6250000074505806, "rewards/format_reward": 0.916666679084301, "step": 756 }, { "clip_ratio": 0.0, "completion_length": 690.4062652587891, "epoch": 0.8074666666666667, "grad_norm": 1.0976313352584839, "kl": 0.15045166015625, "learning_rate": 1.7949251453997e-07, "loss": -0.0534, "reward": 1.7812500298023224, "reward_std": 0.24248424544930458, "rewards/accuracy_reward": 0.8020833432674408, "rewards/format_reward": 0.9791666716337204, "step": 757 }, { "clip_ratio": 0.0, "completion_length": 860.7916946411133, "epoch": 0.8085333333333333, "grad_norm": 1.160918951034546, "kl": 0.189697265625, "learning_rate": 1.7863832087728442e-07, "loss": 0.0256, "reward": 1.4375000149011612, "reward_std": 0.2871919013559818, "rewards/accuracy_reward": 0.447916672565043, "rewards/format_reward": 0.9895833358168602, "step": 758 }, { "clip_ratio": 0.0, "completion_length": 1134.50004196167, "epoch": 0.8096, "grad_norm": 1.0557599067687988, "kl": 0.274658203125, "learning_rate": 1.777883018374262e-07, "loss": 0.021, "reward": 1.3750000223517418, "reward_std": 0.3799249231815338, "rewards/accuracy_reward": 0.42708334885537624, "rewards/format_reward": 0.947916679084301, "step": 759 }, { "clip_ratio": 0.0, "completion_length": 1140.3854446411133, "epoch": 0.8106666666666666, "grad_norm": 1.095528244972229, "kl": 0.240966796875, "learning_rate": 1.7694246697579418e-07, "loss": 0.0007, "reward": 1.3020833656191826, "reward_std": 0.4519112594425678, "rewards/accuracy_reward": 0.3854166781529784, "rewards/format_reward": 0.9166666865348816, "step": 760 }, { "clip_ratio": 0.0, "completion_length": 943.708366394043, "epoch": 0.8117333333333333, "grad_norm": 1.5596681833267212, "kl": 0.1734619140625, "learning_rate": 1.761008258007508e-07, "loss": -0.015, "reward": 1.666666716337204, "reward_std": 0.39558567106723785, "rewards/accuracy_reward": 0.6979166828095913, "rewards/format_reward": 0.9687500074505806, "step": 761 }, { "clip_ratio": 0.0, "completion_length": 990.5521087646484, "epoch": 0.8128, "grad_norm": 3.032179117202759, "kl": 0.9287109375, "learning_rate": 1.7526338777351597e-07, "loss": 0.0453, "reward": 1.4375000223517418, "reward_std": 0.34503669664263725, "rewards/accuracy_reward": 0.5000000074505806, "rewards/format_reward": 0.9375000074505806, "step": 762 }, { "clip_ratio": 0.0, "completion_length": 870.3437805175781, "epoch": 0.8138666666666666, "grad_norm": 2.0443239212036133, "kl": 0.296661376953125, "learning_rate": 1.7443016230806023e-07, "loss": -0.0222, "reward": 1.4375000223517418, "reward_std": 0.33434219658374786, "rewards/accuracy_reward": 0.4687500037252903, "rewards/format_reward": 0.9687500074505806, "step": 763 }, { "clip_ratio": 0.0, "completion_length": 1011.6875381469727, "epoch": 0.8149333333333333, "grad_norm": 1.160479187965393, "kl": 0.3385009765625, "learning_rate": 1.7360115877099897e-07, "loss": 0.0527, "reward": 1.4375000298023224, "reward_std": 0.42859094217419624, "rewards/accuracy_reward": 0.4895833507180214, "rewards/format_reward": 0.9479166716337204, "step": 764 }, { "clip_ratio": 0.0, "completion_length": 795.9687690734863, "epoch": 0.816, "grad_norm": 2.3876380920410156, "kl": 0.35272216796875, "learning_rate": 1.7277638648148734e-07, "loss": 0.0777, "reward": 1.6562500149011612, "reward_std": 0.31107934564352036, "rewards/accuracy_reward": 0.6979166800156236, "rewards/format_reward": 0.9583333432674408, "step": 765 }, { "clip_ratio": 0.0, "completion_length": 722.4062690734863, "epoch": 0.8170666666666667, "grad_norm": 2.4777677059173584, "kl": 0.22705078125, "learning_rate": 1.719558547111153e-07, "loss": 0.0735, "reward": 1.6979167014360428, "reward_std": 0.25305089354515076, "rewards/accuracy_reward": 0.7500000074505806, "rewards/format_reward": 0.947916679084301, "step": 766 }, { "clip_ratio": 0.0, "completion_length": 1074.7396125793457, "epoch": 0.8181333333333334, "grad_norm": 2.1304819583892822, "kl": 0.27313232421875, "learning_rate": 1.7113957268380347e-07, "loss": 0.0851, "reward": 1.5937500298023224, "reward_std": 0.38442378491163254, "rewards/accuracy_reward": 0.6875000074505806, "rewards/format_reward": 0.9062500149011612, "step": 767 }, { "clip_ratio": 0.0, "completion_length": 1049.1250457763672, "epoch": 0.8192, "grad_norm": 4.155076026916504, "kl": 0.7435302734375, "learning_rate": 1.7032754957569965e-07, "loss": 0.0302, "reward": 1.6979166939854622, "reward_std": 0.2937192916870117, "rewards/accuracy_reward": 0.7395833414047956, "rewards/format_reward": 0.9583333358168602, "step": 768 }, { "clip_ratio": 0.0, "completion_length": 1120.8229370117188, "epoch": 0.8202666666666667, "grad_norm": 2.1990880966186523, "kl": 0.241546630859375, "learning_rate": 1.6951979451507498e-07, "loss": 0.0502, "reward": 1.385416716337204, "reward_std": 0.4335870072245598, "rewards/accuracy_reward": 0.4791666753590107, "rewards/format_reward": 0.9062500149011612, "step": 769 }, { "clip_ratio": 0.0, "completion_length": 960.4167060852051, "epoch": 0.8213333333333334, "grad_norm": 0.6311684846878052, "kl": 0.152923583984375, "learning_rate": 1.6871631658222228e-07, "loss": 0.037, "reward": 1.5104167088866234, "reward_std": 0.30990205332636833, "rewards/accuracy_reward": 0.5520833358168602, "rewards/format_reward": 0.9583333432674408, "step": 770 }, { "clip_ratio": 0.0, "completion_length": 648.6354484558105, "epoch": 0.8224, "grad_norm": 0.6765944361686707, "kl": 0.13336181640625, "learning_rate": 1.679171248093533e-07, "loss": 0.0269, "reward": 1.6979167014360428, "reward_std": 0.3041832111775875, "rewards/accuracy_reward": 0.7291666716337204, "rewards/format_reward": 0.9687500074505806, "step": 771 }, { "clip_ratio": 0.0, "completion_length": 1005.4583740234375, "epoch": 0.8234666666666667, "grad_norm": 1.1520076990127563, "kl": 0.209716796875, "learning_rate": 1.671222281804973e-07, "loss": 0.0195, "reward": 1.5104167014360428, "reward_std": 0.4377918280661106, "rewards/accuracy_reward": 0.5729166762903333, "rewards/format_reward": 0.9375000149011612, "step": 772 }, { "clip_ratio": 0.0, "completion_length": 828.1666946411133, "epoch": 0.8245333333333333, "grad_norm": 1.5678932666778564, "kl": 0.166046142578125, "learning_rate": 1.6633163563140007e-07, "loss": -0.0442, "reward": 1.7291667014360428, "reward_std": 0.2842370681464672, "rewards/accuracy_reward": 0.7604166716337204, "rewards/format_reward": 0.9687500074505806, "step": 773 }, { "clip_ratio": 0.0, "completion_length": 870.0208740234375, "epoch": 0.8256, "grad_norm": 1.0592260360717773, "kl": 0.22418212890625, "learning_rate": 1.6554535604942404e-07, "loss": 0.0063, "reward": 1.4895833730697632, "reward_std": 0.41969112679362297, "rewards/accuracy_reward": 0.5520833460614085, "rewards/format_reward": 0.9375000149011612, "step": 774 }, { "clip_ratio": 0.0, "completion_length": 1129.1354370117188, "epoch": 0.8266666666666667, "grad_norm": 2.106901168823242, "kl": 0.2469482421875, "learning_rate": 1.6476339827344722e-07, "loss": 0.0947, "reward": 1.322916716337204, "reward_std": 0.41504712402820587, "rewards/accuracy_reward": 0.39583334140479565, "rewards/format_reward": 0.9270833507180214, "step": 775 }, { "clip_ratio": 0.0, "completion_length": 1008.9583587646484, "epoch": 0.8277333333333333, "grad_norm": 1.511322259902954, "kl": 0.2313232421875, "learning_rate": 1.6398577109376495e-07, "loss": 0.0654, "reward": 1.645833358168602, "reward_std": 0.3160989172756672, "rewards/accuracy_reward": 0.697916679084301, "rewards/format_reward": 0.9479166716337204, "step": 776 }, { "clip_ratio": 0.0, "completion_length": 1127.9687957763672, "epoch": 0.8288, "grad_norm": 1.6113932132720947, "kl": 0.26025390625, "learning_rate": 1.632124832519904e-07, "loss": -0.017, "reward": 1.427083358168602, "reward_std": 0.24359868466854095, "rewards/accuracy_reward": 0.4583333432674408, "rewards/format_reward": 0.9687500074505806, "step": 777 }, { "clip_ratio": 0.0, "completion_length": 1008.8437957763672, "epoch": 0.8298666666666666, "grad_norm": 1.947967529296875, "kl": 0.37506103515625, "learning_rate": 1.624435434409566e-07, "loss": -0.0025, "reward": 1.4583333730697632, "reward_std": 0.5129223205149174, "rewards/accuracy_reward": 0.5520833432674408, "rewards/format_reward": 0.9062500149011612, "step": 778 }, { "clip_ratio": 0.0, "completion_length": 1175.583381652832, "epoch": 0.8309333333333333, "grad_norm": 4.3604936599731445, "kl": 0.5877685546875, "learning_rate": 1.6167896030461882e-07, "loss": 0.172, "reward": 1.3750000447034836, "reward_std": 0.47958556935191154, "rewards/accuracy_reward": 0.46875001583248377, "rewards/format_reward": 0.9062500149011612, "step": 779 }, { "clip_ratio": 0.0, "completion_length": 942.7396087646484, "epoch": 0.832, "grad_norm": 0.893103837966919, "kl": 0.2122802734375, "learning_rate": 1.609187424379569e-07, "loss": -0.0175, "reward": 1.6041667014360428, "reward_std": 0.27498848363757133, "rewards/accuracy_reward": 0.6145833432674408, "rewards/format_reward": 0.9895833358168602, "step": 780 }, { "clip_ratio": 0.0, "completion_length": 1193.9687957763672, "epoch": 0.8330666666666666, "grad_norm": 1.3292351961135864, "kl": 0.34356689453125, "learning_rate": 1.6016289838687923e-07, "loss": 0.0962, "reward": 1.3229166939854622, "reward_std": 0.3624543137848377, "rewards/accuracy_reward": 0.44791666977107525, "rewards/format_reward": 0.8750000149011612, "step": 781 }, { "clip_ratio": 0.0, "completion_length": 1262.3854598999023, "epoch": 0.8341333333333333, "grad_norm": 5.202003479003906, "kl": 0.2965087890625, "learning_rate": 1.5941143664812647e-07, "loss": 0.1206, "reward": 1.5104167014360428, "reward_std": 0.5827873609960079, "rewards/accuracy_reward": 0.6250000223517418, "rewards/format_reward": 0.885416679084301, "step": 782 }, { "clip_ratio": 0.0, "completion_length": 1151.2500381469727, "epoch": 0.8352, "grad_norm": 1.1233078241348267, "kl": 0.20697021484375, "learning_rate": 1.5866436566917561e-07, "loss": 0.0186, "reward": 1.4166666865348816, "reward_std": 0.4896446615457535, "rewards/accuracy_reward": 0.510416685603559, "rewards/format_reward": 0.9062500149011612, "step": 783 }, { "clip_ratio": 0.0, "completion_length": 918.8854446411133, "epoch": 0.8362666666666667, "grad_norm": 0.8144195079803467, "kl": 0.169677734375, "learning_rate": 1.5792169384814574e-07, "loss": 0.0086, "reward": 1.5833333507180214, "reward_std": 0.2991708368062973, "rewards/accuracy_reward": 0.6145833358168602, "rewards/format_reward": 0.9687500074505806, "step": 784 }, { "clip_ratio": 0.0, "completion_length": 982.291690826416, "epoch": 0.8373333333333334, "grad_norm": 1.9690064191818237, "kl": 0.231170654296875, "learning_rate": 1.571834295337033e-07, "loss": 0.0002, "reward": 1.5416667014360428, "reward_std": 0.42281850427389145, "rewards/accuracy_reward": 0.6145833535119891, "rewards/format_reward": 0.9270833507180214, "step": 785 }, { "clip_ratio": 0.0, "completion_length": 822.5416870117188, "epoch": 0.8384, "grad_norm": 1.160438895225525, "kl": 0.200286865234375, "learning_rate": 1.5644958102496774e-07, "loss": -0.008, "reward": 1.697916679084301, "reward_std": 0.16167844831943512, "rewards/accuracy_reward": 0.7291666744276881, "rewards/format_reward": 0.9687500074505806, "step": 786 }, { "clip_ratio": 0.0, "completion_length": 966.5312652587891, "epoch": 0.8394666666666667, "grad_norm": 3.644686698913574, "kl": 0.31683349609375, "learning_rate": 1.5572015657141928e-07, "loss": 0.0526, "reward": 1.5312500298023224, "reward_std": 0.48770708218216896, "rewards/accuracy_reward": 0.5937500167638063, "rewards/format_reward": 0.9375000149011612, "step": 787 }, { "clip_ratio": 0.0, "completion_length": 986.5312728881836, "epoch": 0.8405333333333334, "grad_norm": 1.4142725467681885, "kl": 0.23681640625, "learning_rate": 1.5499516437280544e-07, "loss": 0.0475, "reward": 1.5937500447034836, "reward_std": 0.3428783416748047, "rewards/accuracy_reward": 0.6458333432674408, "rewards/format_reward": 0.947916679084301, "step": 788 }, { "clip_ratio": 0.0, "completion_length": 905.9271240234375, "epoch": 0.8416, "grad_norm": 1.9726401567459106, "kl": 0.21258544921875, "learning_rate": 1.5427461257904866e-07, "loss": -0.0141, "reward": 1.5833333730697632, "reward_std": 0.29119856283068657, "rewards/accuracy_reward": 0.6041666772216558, "rewards/format_reward": 0.9791666716337204, "step": 789 }, { "clip_ratio": 0.0, "completion_length": 1078.9792098999023, "epoch": 0.8426666666666667, "grad_norm": 2.53570556640625, "kl": 0.2308349609375, "learning_rate": 1.535585092901556e-07, "loss": 0.0449, "reward": 1.4791667014360428, "reward_std": 0.3555065132677555, "rewards/accuracy_reward": 0.5000000158324838, "rewards/format_reward": 0.9791666716337204, "step": 790 }, { "clip_ratio": 0.0, "completion_length": 1172.1354446411133, "epoch": 0.8437333333333333, "grad_norm": 4.247528076171875, "kl": 0.238189697265625, "learning_rate": 1.5284686255612496e-07, "loss": 0.0744, "reward": 1.5625000447034836, "reward_std": 0.5227247253060341, "rewards/accuracy_reward": 0.6458333460614085, "rewards/format_reward": 0.916666679084301, "step": 791 }, { "clip_ratio": 0.0, "completion_length": 941.3125152587891, "epoch": 0.8448, "grad_norm": 2.0456783771514893, "kl": 0.22918701171875, "learning_rate": 1.5213968037685794e-07, "loss": 0.0744, "reward": 1.6145833656191826, "reward_std": 0.4737016186118126, "rewards/accuracy_reward": 0.6875000223517418, "rewards/format_reward": 0.9270833507180214, "step": 792 }, { "clip_ratio": 0.0, "completion_length": 857.5208511352539, "epoch": 0.8458666666666667, "grad_norm": 1.5369303226470947, "kl": 0.13916015625, "learning_rate": 1.514369707020679e-07, "loss": 0.0305, "reward": 1.6250000149011612, "reward_std": 0.3211798593401909, "rewards/accuracy_reward": 0.6770833432674408, "rewards/format_reward": 0.9479166716337204, "step": 793 }, { "clip_ratio": 0.0, "completion_length": 1040.8541870117188, "epoch": 0.8469333333333333, "grad_norm": 1.1746795177459717, "kl": 0.20806884765625, "learning_rate": 1.507387414311908e-07, "loss": 0.015, "reward": 1.4895833656191826, "reward_std": 0.319347620010376, "rewards/accuracy_reward": 0.5520833395421505, "rewards/format_reward": 0.9375000149011612, "step": 794 }, { "clip_ratio": 0.0, "completion_length": 704.8541831970215, "epoch": 0.848, "grad_norm": 1.7647831439971924, "kl": 0.1636962890625, "learning_rate": 1.5004500041329677e-07, "loss": 0.024, "reward": 1.697916716337204, "reward_std": 0.5096255466341972, "rewards/accuracy_reward": 0.770833358168602, "rewards/format_reward": 0.9270833432674408, "step": 795 }, { "clip_ratio": 0.0, "completion_length": 782.197940826416, "epoch": 0.8490666666666666, "grad_norm": 2.7635409832000732, "kl": 0.145355224609375, "learning_rate": 1.493557554470018e-07, "loss": 0.0559, "reward": 1.8854167014360428, "reward_std": 0.22770369052886963, "rewards/accuracy_reward": 0.9062500074505806, "rewards/format_reward": 0.9791666716337204, "step": 796 }, { "clip_ratio": 0.0, "completion_length": 1084.2916946411133, "epoch": 0.8501333333333333, "grad_norm": 1.6833831071853638, "kl": 0.44281005859375, "learning_rate": 1.4867101428037984e-07, "loss": 0.0403, "reward": 1.5729167088866234, "reward_std": 0.46121441945433617, "rewards/accuracy_reward": 0.6458333432674408, "rewards/format_reward": 0.9270833432674408, "step": 797 }, { "clip_ratio": 0.0, "completion_length": 1005.8229446411133, "epoch": 0.8512, "grad_norm": 1.07530677318573, "kl": 0.27227783203125, "learning_rate": 1.4799078461087566e-07, "loss": -0.0342, "reward": 1.3541667014360428, "reward_std": 0.3421004042029381, "rewards/accuracy_reward": 0.4062500149011612, "rewards/format_reward": 0.947916679084301, "step": 798 }, { "clip_ratio": 0.0, "completion_length": 1079.0416946411133, "epoch": 0.8522666666666666, "grad_norm": 3.306112766265869, "kl": 0.2705078125, "learning_rate": 1.473150740852191e-07, "loss": 0.0029, "reward": 1.3958333730697632, "reward_std": 0.34810031950473785, "rewards/accuracy_reward": 0.4270833507180214, "rewards/format_reward": 0.9687500074505806, "step": 799 }, { "clip_ratio": 0.0, "completion_length": 927.9583625793457, "epoch": 0.8533333333333334, "grad_norm": 1.4898066520690918, "kl": 0.26318359375, "learning_rate": 1.4664389029933808e-07, "loss": -0.0383, "reward": 1.3541667014360428, "reward_std": 0.35192636027932167, "rewards/accuracy_reward": 0.42708334140479565, "rewards/format_reward": 0.9270833507180214, "step": 800 }, { "clip_ratio": 0.0, "completion_length": 808.5104370117188, "epoch": 0.8544, "grad_norm": 1.580403447151184, "kl": 0.2164306640625, "learning_rate": 1.4597724079827372e-07, "loss": 0.0074, "reward": 1.4895833656191826, "reward_std": 0.5060476772487164, "rewards/accuracy_reward": 0.572916679084301, "rewards/format_reward": 0.916666679084301, "step": 801 }, { "clip_ratio": 0.0, "completion_length": 811.8229484558105, "epoch": 0.8554666666666667, "grad_norm": 1.4932245016098022, "kl": 0.2420654296875, "learning_rate": 1.4531513307609544e-07, "loss": 0.0726, "reward": 1.6458333656191826, "reward_std": 0.3374767564237118, "rewards/accuracy_reward": 0.7083333488553762, "rewards/format_reward": 0.9375000149011612, "step": 802 }, { "clip_ratio": 0.0, "completion_length": 1022.6458587646484, "epoch": 0.8565333333333334, "grad_norm": 1.143593430519104, "kl": 0.229156494140625, "learning_rate": 1.4465757457581688e-07, "loss": 0.0636, "reward": 1.4166666939854622, "reward_std": 0.27143750712275505, "rewards/accuracy_reward": 0.4687500102445483, "rewards/format_reward": 0.947916679084301, "step": 803 }, { "clip_ratio": 0.0, "completion_length": 890.6041946411133, "epoch": 0.8576, "grad_norm": 1.0793025493621826, "kl": 0.171844482421875, "learning_rate": 1.4400457268931203e-07, "loss": -0.0716, "reward": 1.4583333507180214, "reward_std": 0.35357537120580673, "rewards/accuracy_reward": 0.5312500149011612, "rewards/format_reward": 0.9270833432674408, "step": 804 }, { "clip_ratio": 0.0, "completion_length": 1212.9479598999023, "epoch": 0.8586666666666667, "grad_norm": 1.360930323600769, "kl": 0.262939453125, "learning_rate": 1.4335613475723206e-07, "loss": 0.0809, "reward": 1.5104166939854622, "reward_std": 0.3813658058643341, "rewards/accuracy_reward": 0.5625000149011612, "rewards/format_reward": 0.9479166716337204, "step": 805 }, { "clip_ratio": 0.0, "completion_length": 721.9375228881836, "epoch": 0.8597333333333333, "grad_norm": 1.833483338356018, "kl": 0.3011474609375, "learning_rate": 1.427122680689231e-07, "loss": 0.0098, "reward": 1.5416666865348816, "reward_std": 0.4681377038359642, "rewards/accuracy_reward": 0.6145833469927311, "rewards/format_reward": 0.9270833507180214, "step": 806 }, { "clip_ratio": 0.0, "completion_length": 1299.1250381469727, "epoch": 0.8608, "grad_norm": 0.8212108612060547, "kl": 0.31787109375, "learning_rate": 1.4207297986234413e-07, "loss": 0.0217, "reward": 1.458333358168602, "reward_std": 0.2921244166791439, "rewards/accuracy_reward": 0.5208333404734731, "rewards/format_reward": 0.9375000149011612, "step": 807 }, { "clip_ratio": 0.0, "completion_length": 848.9166946411133, "epoch": 0.8618666666666667, "grad_norm": 1.7335200309753418, "kl": 0.169891357421875, "learning_rate": 1.4143827732398542e-07, "loss": -0.0248, "reward": 1.7395833730697632, "reward_std": 0.2883782275021076, "rewards/accuracy_reward": 0.7708333414047956, "rewards/format_reward": 0.9687500074505806, "step": 808 }, { "clip_ratio": 0.0, "completion_length": 760.802116394043, "epoch": 0.8629333333333333, "grad_norm": 0.9716601967811584, "kl": 0.1861572265625, "learning_rate": 1.4080816758878795e-07, "loss": 0.0575, "reward": 1.8229166865348816, "reward_std": 0.2840975485742092, "rewards/accuracy_reward": 0.8645833469927311, "rewards/format_reward": 0.9583333432674408, "step": 809 }, { "clip_ratio": 0.0, "completion_length": 1121.6562805175781, "epoch": 0.864, "grad_norm": 2.0248258113861084, "kl": 0.2901611328125, "learning_rate": 1.4018265774006365e-07, "loss": 0.0166, "reward": 1.3958333656191826, "reward_std": 0.3707854636013508, "rewards/accuracy_reward": 0.4687500037252903, "rewards/format_reward": 0.9270833432674408, "step": 810 }, { "clip_ratio": 0.0, "completion_length": 905.3125228881836, "epoch": 0.8650666666666667, "grad_norm": 0.8582003712654114, "kl": 0.1873779296875, "learning_rate": 1.395617548094147e-07, "loss": -0.0524, "reward": 1.562500037252903, "reward_std": 0.38269609957933426, "rewards/accuracy_reward": 0.593750013038516, "rewards/format_reward": 0.9687500074505806, "step": 811 }, { "clip_ratio": 0.0, "completion_length": 992.6458740234375, "epoch": 0.8661333333333333, "grad_norm": 2.3336727619171143, "kl": 0.25518798828125, "learning_rate": 1.3894546577665552e-07, "loss": 0.0106, "reward": 1.4375000149011612, "reward_std": 0.2825479060411453, "rewards/accuracy_reward": 0.4687500074505806, "rewards/format_reward": 0.9687500074505806, "step": 812 }, { "clip_ratio": 0.0, "completion_length": 894.239616394043, "epoch": 0.8672, "grad_norm": 1.7896404266357422, "kl": 0.2296142578125, "learning_rate": 1.3833379756973363e-07, "loss": -0.0825, "reward": 1.583333358168602, "reward_std": 0.41135457530617714, "rewards/accuracy_reward": 0.604166679084301, "rewards/format_reward": 0.9791666716337204, "step": 813 }, { "clip_ratio": 0.0, "completion_length": 884.9895935058594, "epoch": 0.8682666666666666, "grad_norm": 2.0375399589538574, "kl": 0.3360595703125, "learning_rate": 1.377267570646521e-07, "loss": 0.1162, "reward": 1.6250000447034836, "reward_std": 0.37510664388537407, "rewards/accuracy_reward": 0.7083333432674408, "rewards/format_reward": 0.916666679084301, "step": 814 }, { "clip_ratio": 0.0, "completion_length": 630.6250076293945, "epoch": 0.8693333333333333, "grad_norm": 1.0730711221694946, "kl": 0.133056640625, "learning_rate": 1.3712435108539247e-07, "loss": -0.0116, "reward": 1.7708333879709244, "reward_std": 0.34480898082256317, "rewards/accuracy_reward": 0.8125000204890966, "rewards/format_reward": 0.9583333358168602, "step": 815 }, { "clip_ratio": 0.0, "completion_length": 992.4479446411133, "epoch": 0.8704, "grad_norm": 3.689910411834717, "kl": 0.2490234375, "learning_rate": 1.3652658640383722e-07, "loss": 0.059, "reward": 1.3750000223517418, "reward_std": 0.4803656004369259, "rewards/accuracy_reward": 0.48958334885537624, "rewards/format_reward": 0.885416679084301, "step": 816 }, { "clip_ratio": 0.0, "completion_length": 798.5312728881836, "epoch": 0.8714666666666666, "grad_norm": 1.6113483905792236, "kl": 0.149810791015625, "learning_rate": 1.359334697396946e-07, "loss": 0.0352, "reward": 1.687500037252903, "reward_std": 0.3138932101428509, "rewards/accuracy_reward": 0.7291666781529784, "rewards/format_reward": 0.9583333432674408, "step": 817 }, { "clip_ratio": 0.0, "completion_length": 1355.5208587646484, "epoch": 0.8725333333333334, "grad_norm": 1.5596377849578857, "kl": 0.2879638671875, "learning_rate": 1.3534500776042248e-07, "loss": 0.0434, "reward": 1.4479167088866234, "reward_std": 0.46335188299417496, "rewards/accuracy_reward": 0.531250013038516, "rewards/format_reward": 0.916666679084301, "step": 818 }, { "clip_ratio": 0.0, "completion_length": 1151.9375381469727, "epoch": 0.8736, "grad_norm": 1.503084421157837, "kl": 0.31329345703125, "learning_rate": 1.3476120708115368e-07, "loss": 0.0194, "reward": 1.583333358168602, "reward_std": 0.4321327470242977, "rewards/accuracy_reward": 0.6770833432674408, "rewards/format_reward": 0.9062500149011612, "step": 819 }, { "clip_ratio": 0.0, "completion_length": 1255.166690826416, "epoch": 0.8746666666666667, "grad_norm": 0.8630591034889221, "kl": 0.252197265625, "learning_rate": 1.3418207426462128e-07, "loss": -0.0269, "reward": 1.4479167088866234, "reward_std": 0.5145560279488564, "rewards/accuracy_reward": 0.5625000167638063, "rewards/format_reward": 0.885416679084301, "step": 820 }, { "clip_ratio": 0.0, "completion_length": 1112.895851135254, "epoch": 0.8757333333333334, "grad_norm": 2.35595965385437, "kl": 0.26861572265625, "learning_rate": 1.3360761582108552e-07, "loss": 0.0222, "reward": 1.6041667237877846, "reward_std": 0.3817530535161495, "rewards/accuracy_reward": 0.6562500204890966, "rewards/format_reward": 0.947916679084301, "step": 821 }, { "clip_ratio": 0.0, "completion_length": 1166.8437881469727, "epoch": 0.8768, "grad_norm": 1.0438975095748901, "kl": 0.26513671875, "learning_rate": 1.330378382082597e-07, "loss": 0.0175, "reward": 1.5208333656191826, "reward_std": 0.2711624354124069, "rewards/accuracy_reward": 0.5416666716337204, "rewards/format_reward": 0.9791666716337204, "step": 822 }, { "clip_ratio": 0.0, "completion_length": 1006.0937805175781, "epoch": 0.8778666666666667, "grad_norm": 1.9326547384262085, "kl": 0.283203125, "learning_rate": 1.3247274783123844e-07, "loss": -0.002, "reward": 1.437500037252903, "reward_std": 0.36804045736789703, "rewards/accuracy_reward": 0.5520833414047956, "rewards/format_reward": 0.885416679084301, "step": 823 }, { "clip_ratio": 0.0, "completion_length": 823.947940826416, "epoch": 0.8789333333333333, "grad_norm": 1.3034135103225708, "kl": 0.348876953125, "learning_rate": 1.319123510424249e-07, "loss": -0.039, "reward": 1.656250037252903, "reward_std": 0.4106651097536087, "rewards/accuracy_reward": 0.7395833497866988, "rewards/format_reward": 0.916666679084301, "step": 824 }, { "clip_ratio": 0.0, "completion_length": 968.8750381469727, "epoch": 0.88, "grad_norm": 2.120932102203369, "kl": 0.24066162109375, "learning_rate": 1.3135665414146019e-07, "loss": -0.0091, "reward": 1.6666667014360428, "reward_std": 0.3138932101428509, "rewards/accuracy_reward": 0.7395833423361182, "rewards/format_reward": 0.9270833432674408, "step": 825 }, { "clip_ratio": 0.0, "completion_length": 902.3125267028809, "epoch": 0.8810666666666667, "grad_norm": 1.1300134658813477, "kl": 0.2630615234375, "learning_rate": 1.3080566337515188e-07, "loss": -0.0077, "reward": 1.3958333730697632, "reward_std": 0.3436807915568352, "rewards/accuracy_reward": 0.45833334419876337, "rewards/format_reward": 0.9375000149011612, "step": 826 }, { "clip_ratio": 0.0, "completion_length": 901.3125305175781, "epoch": 0.8821333333333333, "grad_norm": 1.41934072971344, "kl": 0.25140380859375, "learning_rate": 1.30259384937404e-07, "loss": 0.0717, "reward": 1.489583358168602, "reward_std": 0.3904200829565525, "rewards/accuracy_reward": 0.5729166707023978, "rewards/format_reward": 0.916666679084301, "step": 827 }, { "clip_ratio": 0.0, "completion_length": 856.1979446411133, "epoch": 0.8832, "grad_norm": 1.0813651084899902, "kl": 0.22027587890625, "learning_rate": 1.2971782496914756e-07, "loss": -0.0179, "reward": 1.6250000298023224, "reward_std": 0.3223811611533165, "rewards/accuracy_reward": 0.6458333386108279, "rewards/format_reward": 0.9791666716337204, "step": 828 }, { "clip_ratio": 0.0, "completion_length": 983.5937767028809, "epoch": 0.8842666666666666, "grad_norm": 1.3288891315460205, "kl": 0.23785400390625, "learning_rate": 1.291809895582715e-07, "loss": 0.0038, "reward": 1.6562500447034836, "reward_std": 0.45108332112431526, "rewards/accuracy_reward": 0.6979166716337204, "rewards/format_reward": 0.9583333432674408, "step": 829 }, { "clip_ratio": 0.0, "completion_length": 1249.3229598999023, "epoch": 0.8853333333333333, "grad_norm": 1.4891877174377441, "kl": 0.24615478515625, "learning_rate": 1.286488847395538e-07, "loss": 0.0618, "reward": 1.4375000223517418, "reward_std": 0.32164905965328217, "rewards/accuracy_reward": 0.48958334140479565, "rewards/format_reward": 0.947916679084301, "step": 830 }, { "clip_ratio": 0.0, "completion_length": 1060.6666870117188, "epoch": 0.8864, "grad_norm": 0.9852237701416016, "kl": 0.2525634765625, "learning_rate": 1.2812151649459427e-07, "loss": 0.0279, "reward": 1.4687500223517418, "reward_std": 0.2906983904540539, "rewards/accuracy_reward": 0.5208333358168602, "rewards/format_reward": 0.947916679084301, "step": 831 }, { "clip_ratio": 0.0, "completion_length": 936.0729446411133, "epoch": 0.8874666666666666, "grad_norm": 0.786924421787262, "kl": 0.192474365234375, "learning_rate": 1.2759889075174706e-07, "loss": 0.0213, "reward": 1.6250000447034836, "reward_std": 0.3067758306860924, "rewards/accuracy_reward": 0.6770833432674408, "rewards/format_reward": 0.947916679084301, "step": 832 }, { "clip_ratio": 0.0, "completion_length": 1012.7812881469727, "epoch": 0.8885333333333333, "grad_norm": 1.001796841621399, "kl": 0.243194580078125, "learning_rate": 1.2708101338605376e-07, "loss": -0.0014, "reward": 1.4791666939854622, "reward_std": 0.3699610084295273, "rewards/accuracy_reward": 0.5104166772216558, "rewards/format_reward": 0.9687500074505806, "step": 833 }, { "clip_ratio": 0.0, "completion_length": 852.3437767028809, "epoch": 0.8896, "grad_norm": 2.4660439491271973, "kl": 0.223388671875, "learning_rate": 1.2656789021917777e-07, "loss": 0.0763, "reward": 1.6979166939854622, "reward_std": 0.3311602473258972, "rewards/accuracy_reward": 0.7604166818782687, "rewards/format_reward": 0.9375000074505806, "step": 834 }, { "clip_ratio": 0.0, "completion_length": 752.6666946411133, "epoch": 0.8906666666666667, "grad_norm": 0.9519287347793579, "kl": 0.14208984375, "learning_rate": 1.2605952701933854e-07, "loss": 0.0517, "reward": 1.697916716337204, "reward_std": 0.2702494040131569, "rewards/accuracy_reward": 0.7083333441987634, "rewards/format_reward": 0.9895833358168602, "step": 835 }, { "clip_ratio": 0.0, "completion_length": 988.3021125793457, "epoch": 0.8917333333333334, "grad_norm": 1.210657000541687, "kl": 0.170562744140625, "learning_rate": 1.255559295012469e-07, "loss": 0.0271, "reward": 1.6666667014360428, "reward_std": 0.31766563653945923, "rewards/accuracy_reward": 0.6770833460614085, "rewards/format_reward": 0.9895833358168602, "step": 836 }, { "clip_ratio": 0.0, "completion_length": 694.947940826416, "epoch": 0.8928, "grad_norm": 2.713172197341919, "kl": 0.144989013671875, "learning_rate": 1.25057103326041e-07, "loss": 0.0351, "reward": 1.6354167088866234, "reward_std": 0.372041255235672, "rewards/accuracy_reward": 0.6875000186264515, "rewards/format_reward": 0.9479166716337204, "step": 837 }, { "clip_ratio": 0.0, "completion_length": 800.3854293823242, "epoch": 0.8938666666666667, "grad_norm": 1.4229414463043213, "kl": 0.277587890625, "learning_rate": 1.2456305410122204e-07, "loss": -0.0213, "reward": 1.4791667014360428, "reward_std": 0.32458771765232086, "rewards/accuracy_reward": 0.5416666697710752, "rewards/format_reward": 0.9375000149011612, "step": 838 }, { "clip_ratio": 0.0, "completion_length": 1006.5521087646484, "epoch": 0.8949333333333334, "grad_norm": 1.0240747928619385, "kl": 0.224761962890625, "learning_rate": 1.2407378738059197e-07, "loss": 0.0605, "reward": 1.4270833730697632, "reward_std": 0.35213426128029823, "rewards/accuracy_reward": 0.4791666707023978, "rewards/format_reward": 0.947916679084301, "step": 839 }, { "clip_ratio": 0.0, "completion_length": 958.2187881469727, "epoch": 0.896, "grad_norm": 1.5115375518798828, "kl": 0.26568603515625, "learning_rate": 1.2358930866419058e-07, "loss": 0.0447, "reward": 1.4583333805203438, "reward_std": 0.3236446641385555, "rewards/accuracy_reward": 0.48958334140479565, "rewards/format_reward": 0.9687500074505806, "step": 840 }, { "clip_ratio": 0.0, "completion_length": 833.6875228881836, "epoch": 0.8970666666666667, "grad_norm": 1.1382992267608643, "kl": 0.17822265625, "learning_rate": 1.2310962339823374e-07, "loss": 0.0276, "reward": 1.5625000223517418, "reward_std": 0.24620391800999641, "rewards/accuracy_reward": 0.6041666744276881, "rewards/format_reward": 0.9583333432674408, "step": 841 }, { "clip_ratio": 0.0, "completion_length": 1067.437557220459, "epoch": 0.8981333333333333, "grad_norm": 1.4394313097000122, "kl": 0.2972412109375, "learning_rate": 1.2263473697505248e-07, "loss": 0.0033, "reward": 1.3541667088866234, "reward_std": 0.4056843891739845, "rewards/accuracy_reward": 0.42708334047347307, "rewards/format_reward": 0.9270833432674408, "step": 842 }, { "clip_ratio": 0.0, "completion_length": 929.5625381469727, "epoch": 0.8992, "grad_norm": 2.336404323577881, "kl": 0.279327392578125, "learning_rate": 1.22164654733032e-07, "loss": 0.0186, "reward": 1.4583333507180214, "reward_std": 0.3653324767947197, "rewards/accuracy_reward": 0.5520833386108279, "rewards/format_reward": 0.9062500149011612, "step": 843 }, { "clip_ratio": 0.0, "completion_length": 1299.7291946411133, "epoch": 0.9002666666666667, "grad_norm": 2.581727981567383, "kl": 0.4512939453125, "learning_rate": 1.2169938195655186e-07, "loss": 0.0313, "reward": 1.1770833656191826, "reward_std": 0.4703824631869793, "rewards/accuracy_reward": 0.29166667349636555, "rewards/format_reward": 0.8854166865348816, "step": 844 }, { "clip_ratio": 0.0, "completion_length": 986.5417098999023, "epoch": 0.9013333333333333, "grad_norm": 1.908642292022705, "kl": 0.25042724609375, "learning_rate": 1.2123892387592636e-07, "loss": 0.0221, "reward": 1.5312500223517418, "reward_std": 0.35777487978339195, "rewards/accuracy_reward": 0.6145833507180214, "rewards/format_reward": 0.916666679084301, "step": 845 }, { "clip_ratio": 0.0, "completion_length": 1251.6562767028809, "epoch": 0.9024, "grad_norm": 3.2039759159088135, "kl": 0.3975830078125, "learning_rate": 1.207832856673463e-07, "loss": -0.0475, "reward": 1.1770833805203438, "reward_std": 0.5477559193968773, "rewards/accuracy_reward": 0.33333334419876337, "rewards/format_reward": 0.8437500223517418, "step": 846 }, { "clip_ratio": 0.0, "completion_length": 683.3541793823242, "epoch": 0.9034666666666666, "grad_norm": 1.3555891513824463, "kl": 0.14520263671875, "learning_rate": 1.2033247245281994e-07, "loss": -0.004, "reward": 1.6458333730697632, "reward_std": 0.3904138244688511, "rewards/accuracy_reward": 0.7291666865348816, "rewards/format_reward": 0.9166666865348816, "step": 847 }, { "clip_ratio": 0.0, "completion_length": 899.2396125793457, "epoch": 0.9045333333333333, "grad_norm": 0.8173375725746155, "kl": 0.18084716796875, "learning_rate": 1.198864893001162e-07, "loss": 0.0092, "reward": 1.7604167014360428, "reward_std": 0.3024350181221962, "rewards/accuracy_reward": 0.791666679084301, "rewards/format_reward": 0.9687500074505806, "step": 848 }, { "clip_ratio": 0.0, "completion_length": 766.0625152587891, "epoch": 0.9056, "grad_norm": 0.8218348622322083, "kl": 0.1767578125, "learning_rate": 1.1944534122270718e-07, "loss": -0.0085, "reward": 1.718750037252903, "reward_std": 0.23698534816503525, "rewards/accuracy_reward": 0.7395833432674408, "rewards/format_reward": 0.9791666716337204, "step": 849 }, { "clip_ratio": 0.0, "completion_length": 900.1875228881836, "epoch": 0.9066666666666666, "grad_norm": 1.405907154083252, "kl": 0.3050537109375, "learning_rate": 1.1900903317971215e-07, "loss": 0.0108, "reward": 1.5104166865348816, "reward_std": 0.45224636793136597, "rewards/accuracy_reward": 0.5729166772216558, "rewards/format_reward": 0.9375000149011612, "step": 850 }, { "clip_ratio": 0.0, "completion_length": 845.7604370117188, "epoch": 0.9077333333333333, "grad_norm": 1.0749479532241821, "kl": 0.255615234375, "learning_rate": 1.1857757007584163e-07, "loss": -0.0092, "reward": 1.5937500298023224, "reward_std": 0.44467736035585403, "rewards/accuracy_reward": 0.6354166846722364, "rewards/format_reward": 0.9583333432674408, "step": 851 }, { "clip_ratio": 0.0, "completion_length": 963.9791870117188, "epoch": 0.9088, "grad_norm": 1.3117467164993286, "kl": 0.25823974609375, "learning_rate": 1.181509567613421e-07, "loss": 0.0329, "reward": 1.5208333879709244, "reward_std": 0.4830211251974106, "rewards/accuracy_reward": 0.6145833488553762, "rewards/format_reward": 0.9062500074505806, "step": 852 }, { "clip_ratio": 0.0, "completion_length": 1071.270866394043, "epoch": 0.9098666666666667, "grad_norm": 4.167201042175293, "kl": 0.2919921875, "learning_rate": 1.1772919803194184e-07, "loss": 0.1016, "reward": 1.3437500223517418, "reward_std": 0.26707590743899345, "rewards/accuracy_reward": 0.43750000558793545, "rewards/format_reward": 0.9062500149011612, "step": 853 }, { "clip_ratio": 0.0, "completion_length": 995.0312805175781, "epoch": 0.9109333333333334, "grad_norm": 2.8516054153442383, "kl": 0.26458740234375, "learning_rate": 1.173122986287968e-07, "loss": 0.0336, "reward": 1.3854167014360428, "reward_std": 0.44443753361701965, "rewards/accuracy_reward": 0.5208333367481828, "rewards/format_reward": 0.8645833507180214, "step": 854 }, { "clip_ratio": 0.0, "completion_length": 855.8021087646484, "epoch": 0.912, "grad_norm": 1.7860275506973267, "kl": 0.22503662109375, "learning_rate": 1.1690026323843726e-07, "loss": 0.0056, "reward": 1.6458333879709244, "reward_std": 0.3670288808643818, "rewards/accuracy_reward": 0.6875000149011612, "rewards/format_reward": 0.9583333358168602, "step": 855 }, { "clip_ratio": 0.0, "completion_length": 834.0208435058594, "epoch": 0.9130666666666667, "grad_norm": 0.8977913856506348, "kl": 0.19659423828125, "learning_rate": 1.1649309649271511e-07, "loss": 0.0165, "reward": 1.6041667014360428, "reward_std": 0.3968309946358204, "rewards/accuracy_reward": 0.6250000093132257, "rewards/format_reward": 0.9791666716337204, "step": 856 }, { "clip_ratio": 0.0, "completion_length": 877.989616394043, "epoch": 0.9141333333333334, "grad_norm": 1.3987524509429932, "kl": 0.179229736328125, "learning_rate": 1.1609080296875228e-07, "loss": 0.0502, "reward": 1.7291667088866234, "reward_std": 0.19299375265836716, "rewards/accuracy_reward": 0.7604166697710752, "rewards/format_reward": 0.9687500074505806, "step": 857 }, { "clip_ratio": 0.0, "completion_length": 1046.0729675292969, "epoch": 0.9152, "grad_norm": 1.6334335803985596, "kl": 0.212158203125, "learning_rate": 1.1569338718888843e-07, "loss": 0.0703, "reward": 1.656250037252903, "reward_std": 0.2338472306728363, "rewards/accuracy_reward": 0.6770833414047956, "rewards/format_reward": 0.9791666716337204, "step": 858 }, { "clip_ratio": 0.0, "completion_length": 621.4271030426025, "epoch": 0.9162666666666667, "grad_norm": 1.614911675453186, "kl": 0.24310302734375, "learning_rate": 1.1530085362063094e-07, "loss": 0.0234, "reward": 1.6562500298023224, "reward_std": 0.4237040653824806, "rewards/accuracy_reward": 0.7187500204890966, "rewards/format_reward": 0.9375000149011612, "step": 859 }, { "clip_ratio": 0.0, "completion_length": 1046.083366394043, "epoch": 0.9173333333333333, "grad_norm": 1.2969164848327637, "kl": 0.289886474609375, "learning_rate": 1.1491320667660401e-07, "loss": 0.0189, "reward": 1.677083358168602, "reward_std": 0.38319894298911095, "rewards/accuracy_reward": 0.7083333488553762, "rewards/format_reward": 0.9687500074505806, "step": 860 }, { "clip_ratio": 0.0, "completion_length": 1006.260440826416, "epoch": 0.9184, "grad_norm": 1.986854076385498, "kl": 0.260986328125, "learning_rate": 1.1453045071449969e-07, "loss": 0.0127, "reward": 1.5729167014360428, "reward_std": 0.2852242663502693, "rewards/accuracy_reward": 0.6354166716337204, "rewards/format_reward": 0.9375000149011612, "step": 861 }, { "clip_ratio": 0.0, "completion_length": 1105.9479293823242, "epoch": 0.9194666666666667, "grad_norm": 1.8759348392486572, "kl": 0.40753173828125, "learning_rate": 1.1415259003702843e-07, "loss": 0.037, "reward": 1.1979166865348816, "reward_std": 0.3754154182970524, "rewards/accuracy_reward": 0.3333333386108279, "rewards/format_reward": 0.864583358168602, "step": 862 }, { "clip_ratio": 0.0, "completion_length": 871.4896202087402, "epoch": 0.9205333333333333, "grad_norm": 1.6317452192306519, "kl": 0.26007080078125, "learning_rate": 1.1377962889187072e-07, "loss": 0.0354, "reward": 1.3854167088866234, "reward_std": 0.3189055845141411, "rewards/accuracy_reward": 0.4583333386108279, "rewards/format_reward": 0.9270833507180214, "step": 863 }, { "clip_ratio": 0.0, "completion_length": 752.4687690734863, "epoch": 0.9216, "grad_norm": 2.130063772201538, "kl": 0.22149658203125, "learning_rate": 1.134115714716297e-07, "loss": -0.0006, "reward": 1.635416716337204, "reward_std": 0.4276634007692337, "rewards/accuracy_reward": 0.7395833507180214, "rewards/format_reward": 0.8958333432674408, "step": 864 }, { "clip_ratio": 0.0, "completion_length": 1014.395881652832, "epoch": 0.9226666666666666, "grad_norm": 3.7138776779174805, "kl": 0.25653076171875, "learning_rate": 1.1304842191378376e-07, "loss": 0.1141, "reward": 1.6250000447034836, "reward_std": 0.4192088320851326, "rewards/accuracy_reward": 0.6979166846722364, "rewards/format_reward": 0.9270833507180214, "step": 865 }, { "clip_ratio": 0.0, "completion_length": 912.5104293823242, "epoch": 0.9237333333333333, "grad_norm": 1.5268237590789795, "kl": 0.21722412109375, "learning_rate": 1.1269018430064004e-07, "loss": 0.066, "reward": 1.7395833879709244, "reward_std": 0.40181299299001694, "rewards/accuracy_reward": 0.8020833469927311, "rewards/format_reward": 0.9375000074505806, "step": 866 }, { "clip_ratio": 0.0, "completion_length": 1198.958381652832, "epoch": 0.9248, "grad_norm": 2.45686936378479, "kl": 0.3450927734375, "learning_rate": 1.1233686265928844e-07, "loss": 0.0873, "reward": 1.3125000298023224, "reward_std": 0.521427571773529, "rewards/accuracy_reward": 0.447916679084301, "rewards/format_reward": 0.8645833507180214, "step": 867 }, { "clip_ratio": 0.0, "completion_length": 1123.864616394043, "epoch": 0.9258666666666666, "grad_norm": 2.46215558052063, "kl": 0.28497314453125, "learning_rate": 1.1198846096155678e-07, "loss": -0.0253, "reward": 1.4479167237877846, "reward_std": 0.518234871327877, "rewards/accuracy_reward": 0.5208333432674408, "rewards/format_reward": 0.9270833507180214, "step": 868 }, { "clip_ratio": 0.0, "completion_length": 811.2500190734863, "epoch": 0.9269333333333334, "grad_norm": 1.435261845588684, "kl": 0.271270751953125, "learning_rate": 1.1164498312396562e-07, "loss": 0.0097, "reward": 1.6250000298023224, "reward_std": 0.42063241824507713, "rewards/accuracy_reward": 0.6875000186264515, "rewards/format_reward": 0.9375000074505806, "step": 869 }, { "clip_ratio": 0.0, "completion_length": 986.5208549499512, "epoch": 0.928, "grad_norm": 2.26056170463562, "kl": 0.371551513671875, "learning_rate": 1.1130643300768453e-07, "loss": -0.0316, "reward": 1.3958333656191826, "reward_std": 0.39583809673786163, "rewards/accuracy_reward": 0.48958334047347307, "rewards/format_reward": 0.9062500149011612, "step": 870 }, { "clip_ratio": 0.0, "completion_length": 1216.4062805175781, "epoch": 0.9290666666666667, "grad_norm": 6.7628703117370605, "kl": 0.3233642578125, "learning_rate": 1.1097281441848868e-07, "loss": 0.1096, "reward": 1.4895833730697632, "reward_std": 0.4963187128305435, "rewards/accuracy_reward": 0.5937500149011612, "rewards/format_reward": 0.895833358168602, "step": 871 }, { "clip_ratio": 0.0, "completion_length": 1009.3542098999023, "epoch": 0.9301333333333334, "grad_norm": 1.7428706884384155, "kl": 0.4071044921875, "learning_rate": 1.1064413110671597e-07, "loss": 0.0023, "reward": 1.5208333432674408, "reward_std": 0.31717676669359207, "rewards/accuracy_reward": 0.6145833395421505, "rewards/format_reward": 0.9062500149011612, "step": 872 }, { "clip_ratio": 0.0, "completion_length": 995.8437843322754, "epoch": 0.9312, "grad_norm": 1.5753730535507202, "kl": 0.2760009765625, "learning_rate": 1.1032038676722492e-07, "loss": -0.0232, "reward": 1.6666666865348816, "reward_std": 0.293089184910059, "rewards/accuracy_reward": 0.7083333395421505, "rewards/format_reward": 0.9583333432674408, "step": 873 }, { "clip_ratio": 0.0, "completion_length": 949.6771087646484, "epoch": 0.9322666666666667, "grad_norm": 1.2945363521575928, "kl": 0.2786865234375, "learning_rate": 1.1000158503935303e-07, "loss": 0.0014, "reward": 1.4270833879709244, "reward_std": 0.44607771933078766, "rewards/accuracy_reward": 0.5000000083819032, "rewards/format_reward": 0.9270833432674408, "step": 874 }, { "clip_ratio": 0.0, "completion_length": 951.8437652587891, "epoch": 0.9333333333333333, "grad_norm": 1.0721499919891357, "kl": 0.2825927734375, "learning_rate": 1.0968772950687608e-07, "loss": 0.0222, "reward": 1.5416666865348816, "reward_std": 0.2804777920246124, "rewards/accuracy_reward": 0.5937500083819032, "rewards/format_reward": 0.947916679084301, "step": 875 }, { "clip_ratio": 0.0, "completion_length": 1033.1667022705078, "epoch": 0.9344, "grad_norm": 3.9650440216064453, "kl": 0.42669677734375, "learning_rate": 1.0937882369796763e-07, "loss": 0.1002, "reward": 1.5729167014360428, "reward_std": 0.5071489587426186, "rewards/accuracy_reward": 0.6562500223517418, "rewards/format_reward": 0.9166666716337204, "step": 876 }, { "clip_ratio": 0.0, "completion_length": 797.2708740234375, "epoch": 0.9354666666666667, "grad_norm": 3.3797764778137207, "kl": 0.2645263671875, "learning_rate": 1.0907487108515954e-07, "loss": 0.0851, "reward": 1.6041667088866234, "reward_std": 0.3329674154520035, "rewards/accuracy_reward": 0.6666666716337204, "rewards/format_reward": 0.9375000149011612, "step": 877 }, { "clip_ratio": 0.0, "completion_length": 963.2604560852051, "epoch": 0.9365333333333333, "grad_norm": 1.5051789283752441, "kl": 0.290283203125, "learning_rate": 1.0877587508530265e-07, "loss": 0.0173, "reward": 1.4479167014360428, "reward_std": 0.37400370836257935, "rewards/accuracy_reward": 0.5208333460614085, "rewards/format_reward": 0.9270833507180214, "step": 878 }, { "clip_ratio": 0.0, "completion_length": 1140.2291946411133, "epoch": 0.9376, "grad_norm": 1.968876838684082, "kl": 0.2540283203125, "learning_rate": 1.0848183905952886e-07, "loss": 0.0047, "reward": 1.5520833730697632, "reward_std": 0.38682181015610695, "rewards/accuracy_reward": 0.6145833423361182, "rewards/format_reward": 0.9375000074505806, "step": 879 }, { "clip_ratio": 0.0, "completion_length": 743.6458435058594, "epoch": 0.9386666666666666, "grad_norm": 2.687626838684082, "kl": 0.26312255859375, "learning_rate": 1.0819276631321285e-07, "loss": 0.1113, "reward": 1.6250000521540642, "reward_std": 0.46329230815172195, "rewards/accuracy_reward": 0.7083333432674408, "rewards/format_reward": 0.9166666865348816, "step": 880 }, { "clip_ratio": 0.0, "completion_length": 1235.2396240234375, "epoch": 0.9397333333333333, "grad_norm": 2.584925889968872, "kl": 0.4984130859375, "learning_rate": 1.0790866009593511e-07, "loss": 0.0857, "reward": 1.197916679084301, "reward_std": 0.46526331454515457, "rewards/accuracy_reward": 0.32291667722165585, "rewards/format_reward": 0.8750000149011612, "step": 881 }, { "clip_ratio": 0.0, "completion_length": 1003.4791946411133, "epoch": 0.9408, "grad_norm": 1.0362218618392944, "kl": 0.32135009765625, "learning_rate": 1.0762952360144548e-07, "loss": 0.0105, "reward": 1.3854167014360428, "reward_std": 0.4481824189424515, "rewards/accuracy_reward": 0.4166666753590107, "rewards/format_reward": 0.9687500074505806, "step": 882 }, { "clip_ratio": 0.0, "completion_length": 967.5521087646484, "epoch": 0.9418666666666666, "grad_norm": 2.5335307121276855, "kl": 0.3900146484375, "learning_rate": 1.0735535996762717e-07, "loss": 0.0448, "reward": 1.4270833507180214, "reward_std": 0.498721182346344, "rewards/accuracy_reward": 0.510416679084301, "rewards/format_reward": 0.916666679084301, "step": 883 }, { "clip_ratio": 0.0, "completion_length": 939.8229446411133, "epoch": 0.9429333333333333, "grad_norm": 2.13116192817688, "kl": 0.525390625, "learning_rate": 1.070861722764616e-07, "loss": 0.091, "reward": 1.4270833730697632, "reward_std": 0.4416884146630764, "rewards/accuracy_reward": 0.5208333432674408, "rewards/format_reward": 0.9062500149011612, "step": 884 }, { "clip_ratio": 0.0, "completion_length": 745.8020935058594, "epoch": 0.944, "grad_norm": 1.7519346475601196, "kl": 0.217529296875, "learning_rate": 1.0682196355399347e-07, "loss": 0.0357, "reward": 1.6041666939854622, "reward_std": 0.4100252538919449, "rewards/accuracy_reward": 0.6875000027939677, "rewards/format_reward": 0.916666679084301, "step": 885 }, { "clip_ratio": 0.0, "completion_length": 807.3750228881836, "epoch": 0.9450666666666667, "grad_norm": 1.3273733854293823, "kl": 0.29595947265625, "learning_rate": 1.0656273677029705e-07, "loss": 0.044, "reward": 1.5416667312383652, "reward_std": 0.4027745798230171, "rewards/accuracy_reward": 0.583333345130086, "rewards/format_reward": 0.9583333432674408, "step": 886 }, { "clip_ratio": 0.0, "completion_length": 742.0625190734863, "epoch": 0.9461333333333334, "grad_norm": 2.1772632598876953, "kl": 0.2374267578125, "learning_rate": 1.0630849483944278e-07, "loss": 0.0193, "reward": 1.5729167014360428, "reward_std": 0.46779894083738327, "rewards/accuracy_reward": 0.6145833497866988, "rewards/format_reward": 0.9583333432674408, "step": 887 }, { "clip_ratio": 0.0, "completion_length": 1114.6875457763672, "epoch": 0.9472, "grad_norm": 2.9138357639312744, "kl": 0.464111328125, "learning_rate": 1.0605924061946419e-07, "loss": 0.0032, "reward": 1.250000037252903, "reward_std": 0.3452054485678673, "rewards/accuracy_reward": 0.3125000074505806, "rewards/format_reward": 0.9375000149011612, "step": 888 }, { "clip_ratio": 0.0, "completion_length": 823.614616394043, "epoch": 0.9482666666666667, "grad_norm": 1.8616143465042114, "kl": 0.33984375, "learning_rate": 1.0581497691232605e-07, "loss": 0.076, "reward": 1.7812500447034836, "reward_std": 0.375774621963501, "rewards/accuracy_reward": 0.822916679084301, "rewards/format_reward": 0.9583333432674408, "step": 889 }, { "clip_ratio": 0.0, "completion_length": 995.2187652587891, "epoch": 0.9493333333333334, "grad_norm": 2.7014288902282715, "kl": 0.43536376953125, "learning_rate": 1.05575706463893e-07, "loss": 0.055, "reward": 1.6041667014360428, "reward_std": 0.38697197288274765, "rewards/accuracy_reward": 0.6354166772216558, "rewards/format_reward": 0.9687500074505806, "step": 890 }, { "clip_ratio": 0.0, "completion_length": 1125.3646240234375, "epoch": 0.9504, "grad_norm": 2.045570135116577, "kl": 0.53240966796875, "learning_rate": 1.0534143196389822e-07, "loss": 0.0262, "reward": 1.4687500447034836, "reward_std": 0.4911299757659435, "rewards/accuracy_reward": 0.6145833469927311, "rewards/format_reward": 0.8541666865348816, "step": 891 }, { "clip_ratio": 0.0, "completion_length": 1225.5417098999023, "epoch": 0.9514666666666667, "grad_norm": 4.070314407348633, "kl": 0.3896484375, "learning_rate": 1.0511215604591361e-07, "loss": 0.0466, "reward": 1.3437500149011612, "reward_std": 0.3908129595220089, "rewards/accuracy_reward": 0.4687500102445483, "rewards/format_reward": 0.8750000074505806, "step": 892 }, { "clip_ratio": 0.0, "completion_length": 1003.7083702087402, "epoch": 0.9525333333333333, "grad_norm": 2.8889200687408447, "kl": 0.3616943359375, "learning_rate": 1.0488788128732007e-07, "loss": 0.124, "reward": 1.4375000298023224, "reward_std": 0.41631728783249855, "rewards/accuracy_reward": 0.5416666753590107, "rewards/format_reward": 0.8958333507180214, "step": 893 }, { "clip_ratio": 0.0, "completion_length": 1068.5000228881836, "epoch": 0.9536, "grad_norm": 2.540318012237549, "kl": 0.55029296875, "learning_rate": 1.0466861020927841e-07, "loss": 0.0876, "reward": 1.208333358168602, "reward_std": 0.35725031420588493, "rewards/accuracy_reward": 0.29166668094694614, "rewards/format_reward": 0.916666679084301, "step": 894 }, { "clip_ratio": 0.0, "completion_length": 829.6875381469727, "epoch": 0.9546666666666667, "grad_norm": 1.9189482927322388, "kl": 0.33551025390625, "learning_rate": 1.0445434527670114e-07, "loss": 0.0118, "reward": 1.531250037252903, "reward_std": 0.30752189829945564, "rewards/accuracy_reward": 0.5937500037252903, "rewards/format_reward": 0.9375000149011612, "step": 895 }, { "clip_ratio": 0.0, "completion_length": 992.4792022705078, "epoch": 0.9557333333333333, "grad_norm": 1.5526313781738281, "kl": 0.25665283203125, "learning_rate": 1.0424508889822467e-07, "loss": 0.0827, "reward": 1.656250037252903, "reward_std": 0.3500521034002304, "rewards/accuracy_reward": 0.7083333367481828, "rewards/format_reward": 0.947916679084301, "step": 896 }, { "clip_ratio": 0.0, "completion_length": 1067.2396087646484, "epoch": 0.9568, "grad_norm": 2.4601094722747803, "kl": 0.2857666015625, "learning_rate": 1.0404084342618244e-07, "loss": -0.0088, "reward": 1.6250000149011612, "reward_std": 0.2679456062614918, "rewards/accuracy_reward": 0.7291666716337204, "rewards/format_reward": 0.8958333507180214, "step": 897 }, { "clip_ratio": 0.0, "completion_length": 1037.322940826416, "epoch": 0.9578666666666666, "grad_norm": 1.3647347688674927, "kl": 0.34576416015625, "learning_rate": 1.0384161115657813e-07, "loss": 0.0072, "reward": 1.468750037252903, "reward_std": 0.37953465431928635, "rewards/accuracy_reward": 0.5520833432674408, "rewards/format_reward": 0.916666679084301, "step": 898 }, { "clip_ratio": 0.0, "completion_length": 915.3021202087402, "epoch": 0.9589333333333333, "grad_norm": 1.2627687454223633, "kl": 0.20697021484375, "learning_rate": 1.0364739432906011e-07, "loss": 0.0509, "reward": 1.7083333879709244, "reward_std": 0.3809720575809479, "rewards/accuracy_reward": 0.7812500149011612, "rewards/format_reward": 0.9270833507180214, "step": 899 }, { "clip_ratio": 0.0, "completion_length": 1138.0833702087402, "epoch": 0.96, "grad_norm": 1.7111729383468628, "kl": 0.425537109375, "learning_rate": 1.0345819512689621e-07, "loss": 0.0939, "reward": 1.2604166939854622, "reward_std": 0.42381551861763, "rewards/accuracy_reward": 0.3958333386108279, "rewards/format_reward": 0.8645833507180214, "step": 900 }, { "clip_ratio": 0.0, "completion_length": 814.0729446411133, "epoch": 0.9610666666666666, "grad_norm": 2.265413761138916, "kl": 0.21539306640625, "learning_rate": 1.0327401567694921e-07, "loss": 0.0718, "reward": 1.6562500447034836, "reward_std": 0.37976518273353577, "rewards/accuracy_reward": 0.7187500102445483, "rewards/format_reward": 0.9375000149011612, "step": 901 }, { "clip_ratio": 0.0, "completion_length": 863.0521125793457, "epoch": 0.9621333333333333, "grad_norm": 1.8768540620803833, "kl": 0.2877197265625, "learning_rate": 1.0309485804965266e-07, "loss": 0.0766, "reward": 1.5416667088866234, "reward_std": 0.4115383252501488, "rewards/accuracy_reward": 0.6145833414047956, "rewards/format_reward": 0.9270833507180214, "step": 902 }, { "clip_ratio": 0.0, "completion_length": 851.1458549499512, "epoch": 0.9632, "grad_norm": 2.7482450008392334, "kl": 0.331787109375, "learning_rate": 1.0292072425898808e-07, "loss": 0.0057, "reward": 1.5416667014360428, "reward_std": 0.42045415565371513, "rewards/accuracy_reward": 0.6145833460614085, "rewards/format_reward": 0.9270833432674408, "step": 903 }, { "clip_ratio": 0.0, "completion_length": 857.7187843322754, "epoch": 0.9642666666666667, "grad_norm": 1.0865980386734009, "kl": 0.257568359375, "learning_rate": 1.02751616262462e-07, "loss": 0.0093, "reward": 1.5625000223517418, "reward_std": 0.28913528472185135, "rewards/accuracy_reward": 0.6250000074505806, "rewards/format_reward": 0.9375000149011612, "step": 904 }, { "clip_ratio": 0.0, "completion_length": 1087.3542022705078, "epoch": 0.9653333333333334, "grad_norm": 1.1855332851409912, "kl": 0.384033203125, "learning_rate": 1.0258753596108392e-07, "loss": -0.0129, "reward": 1.3958333730697632, "reward_std": 0.44749053567647934, "rewards/accuracy_reward": 0.4687500149011612, "rewards/format_reward": 0.9270833507180214, "step": 905 }, { "clip_ratio": 0.0, "completion_length": 955.2500381469727, "epoch": 0.9664, "grad_norm": 2.590808629989624, "kl": 0.2403564453125, "learning_rate": 1.0242848519934508e-07, "loss": 0.1179, "reward": 1.7291667088866234, "reward_std": 0.35143227875232697, "rewards/accuracy_reward": 0.812500013038516, "rewards/format_reward": 0.9166666716337204, "step": 906 }, { "clip_ratio": 0.0, "completion_length": 1092.0833740234375, "epoch": 0.9674666666666667, "grad_norm": 1.6088510751724243, "kl": 0.34124755859375, "learning_rate": 1.0227446576519772e-07, "loss": 0.0607, "reward": 1.416666679084301, "reward_std": 0.42565515264868736, "rewards/accuracy_reward": 0.4895833432674408, "rewards/format_reward": 0.9270833432674408, "step": 907 }, { "clip_ratio": 0.0, "completion_length": 774.8750171661377, "epoch": 0.9685333333333334, "grad_norm": 1.3935770988464355, "kl": 0.222900390625, "learning_rate": 1.0212547939003496e-07, "loss": -0.0092, "reward": 1.6666667088866234, "reward_std": 0.2934442237019539, "rewards/accuracy_reward": 0.7187500102445483, "rewards/format_reward": 0.947916679084301, "step": 908 }, { "clip_ratio": 0.0, "completion_length": 955.0625228881836, "epoch": 0.9696, "grad_norm": 2.0161428451538086, "kl": 0.3148193359375, "learning_rate": 1.019815277486713e-07, "loss": 0.032, "reward": 1.4062500298023224, "reward_std": 0.46446023136377335, "rewards/accuracy_reward": 0.510416679084301, "rewards/format_reward": 0.8958333507180214, "step": 909 }, { "clip_ratio": 0.0, "completion_length": 1335.5833778381348, "epoch": 0.9706666666666667, "grad_norm": 1.3265938758850098, "kl": 0.346435546875, "learning_rate": 1.0184261245932374e-07, "loss": 0.0575, "reward": 1.2604167014360428, "reward_std": 0.3877116069197655, "rewards/accuracy_reward": 0.3750000074505806, "rewards/format_reward": 0.885416679084301, "step": 910 }, { "clip_ratio": 0.0, "completion_length": 1062.0520973205566, "epoch": 0.9717333333333333, "grad_norm": 3.251396417617798, "kl": 0.33392333984375, "learning_rate": 1.0170873508359379e-07, "loss": 0.1072, "reward": 1.4895833656191826, "reward_std": 0.5220699533820152, "rewards/accuracy_reward": 0.5729166865348816, "rewards/format_reward": 0.9166666865348816, "step": 911 }, { "clip_ratio": 0.0, "completion_length": 1115.5312881469727, "epoch": 0.9728, "grad_norm": 1.9887382984161377, "kl": 0.30047607421875, "learning_rate": 1.0157989712644977e-07, "loss": 0.1048, "reward": 1.5000000298023224, "reward_std": 0.40314508602023125, "rewards/accuracy_reward": 0.5312500149011612, "rewards/format_reward": 0.9687500074505806, "step": 912 }, { "clip_ratio": 0.0, "completion_length": 1353.9375228881836, "epoch": 0.9738666666666667, "grad_norm": 1.5293323993682861, "kl": 0.4132080078125, "learning_rate": 1.0145610003620982e-07, "loss": 0.041, "reward": 1.3020833507180214, "reward_std": 0.48885734006762505, "rewards/accuracy_reward": 0.45833334513008595, "rewards/format_reward": 0.8437500111758709, "step": 913 }, { "clip_ratio": 0.0, "completion_length": 1008.4791870117188, "epoch": 0.9749333333333333, "grad_norm": 2.539541721343994, "kl": 0.25933837890625, "learning_rate": 1.0133734520452574e-07, "loss": 0.0767, "reward": 1.4583333432674408, "reward_std": 0.30692126974463463, "rewards/accuracy_reward": 0.5208333376795053, "rewards/format_reward": 0.9375000149011612, "step": 914 }, { "clip_ratio": 0.0, "completion_length": 955.677116394043, "epoch": 0.976, "grad_norm": 2.5078229904174805, "kl": 0.29473876953125, "learning_rate": 1.0122363396636742e-07, "loss": 0.1021, "reward": 1.5833333507180214, "reward_std": 0.43194879591464996, "rewards/accuracy_reward": 0.6562500149011612, "rewards/format_reward": 0.9270833507180214, "step": 915 }, { "clip_ratio": 0.0, "completion_length": 781.072940826416, "epoch": 0.9770666666666666, "grad_norm": 1.3488787412643433, "kl": 0.16754150390625, "learning_rate": 1.0111496760000757e-07, "loss": 0.0224, "reward": 1.7604167014360428, "reward_std": 0.3404262103140354, "rewards/accuracy_reward": 0.8020833507180214, "rewards/format_reward": 0.9583333432674408, "step": 916 }, { "clip_ratio": 0.0, "completion_length": 870.2396011352539, "epoch": 0.9781333333333333, "grad_norm": 1.130600929260254, "kl": 0.214569091796875, "learning_rate": 1.0101134732700768e-07, "loss": 0.0422, "reward": 1.7083333656191826, "reward_std": 0.3244344964623451, "rewards/accuracy_reward": 0.7604166781529784, "rewards/format_reward": 0.947916679084301, "step": 917 }, { "clip_ratio": 0.0, "completion_length": 1043.5937805175781, "epoch": 0.9792, "grad_norm": 3.501845598220825, "kl": 0.431640625, "learning_rate": 1.0091277431220393e-07, "loss": 0.0667, "reward": 1.458333358168602, "reward_std": 0.400258656591177, "rewards/accuracy_reward": 0.5312500027939677, "rewards/format_reward": 0.9270833432674408, "step": 918 }, { "clip_ratio": 0.0, "completion_length": 846.5416831970215, "epoch": 0.9802666666666666, "grad_norm": 1.9060888290405273, "kl": 0.23687744140625, "learning_rate": 1.0081924966369448e-07, "loss": 0.0041, "reward": 1.6145833730697632, "reward_std": 0.23832672089338303, "rewards/accuracy_reward": 0.6770833358168602, "rewards/format_reward": 0.9375000149011612, "step": 919 }, { "clip_ratio": 0.0, "completion_length": 1270.8333892822266, "epoch": 0.9813333333333333, "grad_norm": 2.7490394115448, "kl": 0.3651123046875, "learning_rate": 1.0073077443282665e-07, "loss": 0.095, "reward": 1.2187500223517418, "reward_std": 0.36577707529067993, "rewards/accuracy_reward": 0.33333333767950535, "rewards/format_reward": 0.885416679084301, "step": 920 }, { "clip_ratio": 0.0, "completion_length": 1073.5729484558105, "epoch": 0.9824, "grad_norm": 1.1655904054641724, "kl": 0.2713623046875, "learning_rate": 1.0064734961418539e-07, "loss": 0.0387, "reward": 1.7083333432674408, "reward_std": 0.28262239694595337, "rewards/accuracy_reward": 0.750000013038516, "rewards/format_reward": 0.9583333432674408, "step": 921 }, { "clip_ratio": 0.0, "completion_length": 1082.53129196167, "epoch": 0.9834666666666667, "grad_norm": 3.0024359226226807, "kl": 0.26446533203125, "learning_rate": 1.0056897614558193e-07, "loss": 0.0819, "reward": 1.4375000223517418, "reward_std": 0.2401643879711628, "rewards/accuracy_reward": 0.5000000055879354, "rewards/format_reward": 0.9375000074505806, "step": 922 }, { "clip_ratio": 0.0, "completion_length": 941.4271049499512, "epoch": 0.9845333333333334, "grad_norm": 3.27968692779541, "kl": 0.19927978515625, "learning_rate": 1.0049565490804324e-07, "loss": 0.0643, "reward": 1.6250000298023224, "reward_std": 0.45093272253870964, "rewards/accuracy_reward": 0.6770833432674408, "rewards/format_reward": 0.947916679084301, "step": 923 }, { "clip_ratio": 0.0, "completion_length": 1195.1041984558105, "epoch": 0.9856, "grad_norm": 4.580929756164551, "kl": 0.3822021484375, "learning_rate": 1.0042738672580223e-07, "loss": 0.066, "reward": 1.2187500223517418, "reward_std": 0.4470629580318928, "rewards/accuracy_reward": 0.3750000111758709, "rewards/format_reward": 0.8437500223517418, "step": 924 }, { "clip_ratio": 0.0, "completion_length": 888.8229446411133, "epoch": 0.9866666666666667, "grad_norm": 1.9081313610076904, "kl": 0.20770263671875, "learning_rate": 1.0036417236628843e-07, "loss": 0.1251, "reward": 1.6666667014360428, "reward_std": 0.3265094868838787, "rewards/accuracy_reward": 0.6875000074505806, "rewards/format_reward": 0.9791666716337204, "step": 925 }, { "clip_ratio": 0.0, "completion_length": 1098.5104522705078, "epoch": 0.9877333333333334, "grad_norm": 2.9460558891296387, "kl": 0.27386474609375, "learning_rate": 1.0030601254011931e-07, "loss": 0.0401, "reward": 1.3750000298023224, "reward_std": 0.44997796416282654, "rewards/accuracy_reward": 0.4583333423361182, "rewards/format_reward": 0.916666679084301, "step": 926 }, { "clip_ratio": 0.0, "completion_length": 793.2916870117188, "epoch": 0.9888, "grad_norm": 1.0369291305541992, "kl": 0.17816162109375, "learning_rate": 1.002529079010924e-07, "loss": 0.0152, "reward": 1.8333333730697632, "reward_std": 0.29389167577028275, "rewards/accuracy_reward": 0.854166679084301, "rewards/format_reward": 0.9791666716337204, "step": 927 }, { "clip_ratio": 0.0, "completion_length": 1021.1666946411133, "epoch": 0.9898666666666667, "grad_norm": 6.218924045562744, "kl": 0.30419921875, "learning_rate": 1.0020485904617782e-07, "loss": 0.0553, "reward": 1.6354167014360428, "reward_std": 0.4364391565322876, "rewards/accuracy_reward": 0.666666679084301, "rewards/format_reward": 0.9687500074505806, "step": 928 }, { "clip_ratio": 0.0, "completion_length": 1349.114631652832, "epoch": 0.9909333333333333, "grad_norm": 12.851278305053711, "kl": 0.4864501953125, "learning_rate": 1.0016186651551175e-07, "loss": 0.1308, "reward": 1.1875000223517418, "reward_std": 0.3709787204861641, "rewards/accuracy_reward": 0.30208333767950535, "rewards/format_reward": 0.8854166865348816, "step": 929 }, { "clip_ratio": 0.0, "completion_length": 981.4687767028809, "epoch": 0.992, "grad_norm": 15.863187789916992, "kl": 0.41632080078125, "learning_rate": 1.0012393079239002e-07, "loss": 0.0261, "reward": 1.7083333805203438, "reward_std": 0.3790978267788887, "rewards/accuracy_reward": 0.7604166865348816, "rewards/format_reward": 0.947916679084301, "step": 930 }, { "clip_ratio": 0.0, "completion_length": 930.6458625793457, "epoch": 0.9930666666666667, "grad_norm": 7.472486972808838, "kl": 0.35919189453125, "learning_rate": 1.0009105230326312e-07, "loss": 0.0532, "reward": 1.5104167088866234, "reward_std": 0.3663909435272217, "rewards/accuracy_reward": 0.5937500074505806, "rewards/format_reward": 0.916666679084301, "step": 931 }, { "clip_ratio": 0.0, "completion_length": 919.7396240234375, "epoch": 0.9941333333333333, "grad_norm": 10.888038635253906, "kl": 0.24072265625, "learning_rate": 1.000632314177311e-07, "loss": 0.0075, "reward": 1.6250000223517418, "reward_std": 0.42438938096165657, "rewards/accuracy_reward": 0.6770833469927311, "rewards/format_reward": 0.947916679084301, "step": 932 }, { "clip_ratio": 0.0, "completion_length": 929.1041946411133, "epoch": 0.9952, "grad_norm": 32.363121032714844, "kl": 0.7734375, "learning_rate": 1.0004046844853942e-07, "loss": 0.0999, "reward": 1.5416667014360428, "reward_std": 0.28913528472185135, "rewards/accuracy_reward": 0.5729166772216558, "rewards/format_reward": 0.9687500074505806, "step": 933 }, { "clip_ratio": 0.0, "completion_length": 1032.6250343322754, "epoch": 0.9962666666666666, "grad_norm": 1.69345223903656, "kl": 0.217437744140625, "learning_rate": 1.0002276365157562e-07, "loss": -0.0031, "reward": 1.5833333879709244, "reward_std": 0.43135737627744675, "rewards/accuracy_reward": 0.6562500149011612, "rewards/format_reward": 0.9270833432674408, "step": 934 }, { "clip_ratio": 0.0, "completion_length": 1260.895866394043, "epoch": 0.9973333333333333, "grad_norm": 24.261192321777344, "kl": 0.71392822265625, "learning_rate": 1.0001011722586624e-07, "loss": 0.1037, "reward": 1.343750037252903, "reward_std": 0.46963174268603325, "rewards/accuracy_reward": 0.4687500074505806, "rewards/format_reward": 0.8750000149011612, "step": 935 }, { "clip_ratio": 0.0, "completion_length": 995.8333702087402, "epoch": 0.9984, "grad_norm": 57.679264068603516, "kl": 0.82611083984375, "learning_rate": 1.0000252931357484e-07, "loss": 0.112, "reward": 1.4791667088866234, "reward_std": 0.5362065136432648, "rewards/accuracy_reward": 0.5937500139698386, "rewards/format_reward": 0.885416679084301, "step": 936 }, { "clip_ratio": 0.0, "completion_length": 989.6979522705078, "epoch": 0.9994666666666666, "grad_norm": 1.3699493408203125, "kl": 0.203857421875, "learning_rate": 1e-07, "loss": -0.0299, "reward": 1.4375000447034836, "reward_std": 0.31381870806217194, "rewards/accuracy_reward": 0.4687500149011612, "rewards/format_reward": 0.9687500074505806, "step": 937 }, { "epoch": 0.9994666666666666, "step": 937, "total_flos": 0.0, "train_loss": 0.09050674229391166, "train_runtime": 148511.5963, "train_samples_per_second": 0.051, "train_steps_per_second": 0.006 } ], "logging_steps": 1, "max_steps": 937, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }