{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.17142857142857143, "eval_steps": 500, "global_step": 150, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "completion_length": 2700.4271850585938, "cov_mean": -6.0587970438064076e-05, "cov_std": 0.35307812318205833, "entropy": 0.36962890625, "epoch": 0.001142857142857143, "grad_norm": 0.4682573080062866, "kl": 0.0, "learning_rate": 0.0, "loss": -0.0446, "reward": 0.7604166893288493, "reward_std": 0.4268697127699852, "rewards/accuracy_reward": 0.25000001303851604, "rewards/format_reward": 0.5104166669771075, "step": 1, "w_high_ratio": 0.21827427297830582, "w_low_ratio": 0.03724556043744087, "w_max": 2.315404176712036, "w_mean": 1.47113436460495, "w_min": 0.0, "w_std": 0.2791289445012808 }, { "completion_length": 3127.3958435058594, "cov_mean": -2.155053698515985e-05, "cov_std": 0.310540571808815, "entropy": 0.353515625, "epoch": 0.002285714285714286, "grad_norm": 0.534198522567749, "kl": 0.0, "learning_rate": 6.666666666666667e-08, "loss": 0.0058, "reward": 0.6458333637565374, "reward_std": 0.4249730706214905, "rewards/accuracy_reward": 0.2812500102445483, "rewards/format_reward": 0.3645833386108279, "step": 2, "w_high_ratio": 0.05722124548628926, "w_low_ratio": 0.036368744214996696, "w_max": 1.8768170773983002, "w_mean": 1.2113382518291473, "w_min": 0.0, "w_std": 0.19011373445391655 }, { "completion_length": 3699.729248046875, "cov_mean": -7.181393357313937e-05, "cov_std": 0.2876722402870655, "entropy": 0.458984375, "epoch": 0.0034285714285714284, "grad_norm": 0.3820074200630188, "kl": 4.845857620239258e-05, "learning_rate": 1.3333333333333334e-07, "loss": 0.0581, "reward": 0.250000006519258, "reward_std": 0.392750509083271, "rewards/accuracy_reward": 0.0729166679084301, "rewards/format_reward": 0.17708334233611822, "step": 3, "w_high_ratio": 0.0, "w_low_ratio": 0.03734566690400243, "w_max": 1.4001508057117462, "w_mean": 1.0752681195735931, "w_min": 0.0, "w_std": 0.16775447502732277 }, { "completion_length": 2261.197998046875, "cov_mean": 2.3754174435453024e-05, "cov_std": 0.37356993183493614, "entropy": 0.3896484375, "epoch": 0.004571428571428572, "grad_norm": 0.6735726594924927, "kl": 3.325939178466797e-05, "learning_rate": 2e-07, "loss": -0.0291, "reward": 0.9166666865348816, "reward_std": 0.4729222096502781, "rewards/accuracy_reward": 0.19791667070239782, "rewards/format_reward": 0.7187500223517418, "step": 4, "w_high_ratio": 0.1157110151834786, "w_low_ratio": 0.03791455435566604, "w_max": 2.3952889442443848, "w_mean": 1.526582419872284, "w_min": 0.0, "w_std": 0.27415894344449043 }, { "completion_length": 3429.5313720703125, "cov_mean": 6.268694096434047e-05, "cov_std": 0.43890176713466644, "entropy": 0.45556640625, "epoch": 0.005714285714285714, "grad_norm": 0.4359005391597748, "kl": 4.1544437408447266e-05, "learning_rate": 2.6666666666666667e-07, "loss": -0.0158, "reward": 0.385416679084301, "reward_std": 0.4654032774269581, "rewards/accuracy_reward": 0.0416666679084301, "rewards/format_reward": 0.3437500074505806, "step": 5, "w_high_ratio": 0.02077934332191944, "w_low_ratio": 0.05408582789823413, "w_max": 1.6484719216823578, "w_mean": 1.1780387163162231, "w_min": 1.1194353007380611e-33, "w_std": 0.2486402541399002 }, { "completion_length": 3461.1875610351562, "cov_mean": -5.967591278022155e-06, "cov_std": 0.49887532368302345, "entropy": 0.46533203125, "epoch": 0.006857142857142857, "grad_norm": 0.3651033937931061, "kl": 4.723668098449707e-05, "learning_rate": 3.333333333333333e-07, "loss": 0.0963, "reward": 0.3750000027939677, "reward_std": 0.4880600869655609, "rewards/accuracy_reward": 0.09375000093132257, "rewards/format_reward": 0.281250006519258, "step": 6, "w_high_ratio": 0.09943684190511703, "w_low_ratio": 0.05423067696392536, "w_max": 2.1359716653823853, "w_mean": 1.2522149085998535, "w_min": 0.0, "w_std": 0.3030992951244116 }, { "completion_length": 3263.5834350585938, "cov_mean": 1.3439643225865439e-05, "cov_std": 0.48358847945928574, "entropy": 0.3818359375, "epoch": 0.008, "grad_norm": 0.30507004261016846, "kl": 1.7881393432617188e-05, "learning_rate": 4e-07, "loss": -0.0382, "reward": 0.895833358168602, "reward_std": 0.5626667812466621, "rewards/accuracy_reward": 0.2604166716337204, "rewards/format_reward": 0.6354166865348816, "step": 7, "w_high_ratio": 0.0625, "w_low_ratio": 0.061702484264969826, "w_max": 1.6040385067462921, "w_mean": 1.2159111201763153, "w_min": 0.0, "w_std": 0.2597455531358719 }, { "completion_length": 2891.2396240234375, "cov_mean": -2.5899114461935824e-06, "cov_std": 0.31939053907990456, "entropy": 0.352783203125, "epoch": 0.009142857142857144, "grad_norm": 0.2681833803653717, "kl": 2.5928020477294922e-05, "learning_rate": 4.6666666666666666e-07, "loss": -0.0366, "reward": 0.8645833544433117, "reward_std": 0.5030707456171513, "rewards/accuracy_reward": 0.38541667722165585, "rewards/format_reward": 0.47916667722165585, "step": 8, "w_high_ratio": 0.008502780459821224, "w_low_ratio": 0.032900307793170214, "w_max": 1.777650386095047, "w_mean": 1.312690258026123, "w_min": 0.0, "w_std": 0.19663411937654018 }, { "completion_length": 3367.1146850585938, "cov_mean": -1.6302776657539653e-05, "cov_std": 0.46739284694194794, "entropy": 0.455078125, "epoch": 0.010285714285714285, "grad_norm": 0.3553212881088257, "kl": 4.4405460357666016e-05, "learning_rate": 5.333333333333333e-07, "loss": -0.0055, "reward": 0.4687500074505806, "reward_std": 0.5153512582182884, "rewards/accuracy_reward": 0.10416667070239782, "rewards/format_reward": 0.3645833432674408, "step": 9, "w_high_ratio": 0.07327797263860703, "w_low_ratio": 0.05655699595808983, "w_max": 1.8923848271369934, "w_mean": 1.2186144888401031, "w_min": 0.0, "w_std": 0.2829560115933418 }, { "completion_length": 2910.5209350585938, "cov_mean": -4.502756110014161e-05, "cov_std": 0.2934259846806526, "entropy": 0.34228515625, "epoch": 0.011428571428571429, "grad_norm": 0.42780593037605286, "kl": 2.6166439056396484e-05, "learning_rate": 6e-07, "loss": -0.1052, "reward": 0.645833358168602, "reward_std": 0.4341064542531967, "rewards/accuracy_reward": 0.18750001024454832, "rewards/format_reward": 0.4583333469927311, "step": 10, "w_high_ratio": 0.10562621057033539, "w_low_ratio": 0.03302360652014613, "w_max": 2.053061753511429, "w_mean": 1.2806267738342285, "w_min": 0.0, "w_std": 0.18188510835170746 }, { "completion_length": 3628.7188110351562, "cov_mean": 7.874305651967006e-06, "cov_std": 0.26691694743931293, "entropy": 0.37451171875, "epoch": 0.012571428571428572, "grad_norm": 0.3640754818916321, "kl": 3.072619438171387e-05, "learning_rate": 6.666666666666666e-07, "loss": 0.028, "reward": 0.22916667442768812, "reward_std": 0.3545106574892998, "rewards/accuracy_reward": 0.06250000186264515, "rewards/format_reward": 0.16666666697710752, "step": 11, "w_high_ratio": 0.02717638947069645, "w_low_ratio": 0.032686853082850575, "w_max": 1.4469610452651978, "w_mean": 1.1045592427253723, "w_min": 0.25, "w_std": 0.16260053776204586 }, { "completion_length": 2600.3334350585938, "cov_mean": -2.9306334909051657e-05, "cov_std": 0.36532483994960785, "entropy": 0.3974609375, "epoch": 0.013714285714285714, "grad_norm": 0.5842437148094177, "kl": 3.3795833587646484e-05, "learning_rate": 7.333333333333332e-07, "loss": -0.0312, "reward": 0.8541667014360428, "reward_std": 0.42210913449525833, "rewards/accuracy_reward": 0.16666667070239782, "rewards/format_reward": 0.6875000298023224, "step": 12, "w_high_ratio": 0.25580327026546, "w_low_ratio": 0.03646009974181652, "w_max": 2.4061461091041565, "w_mean": 1.5557032227516174, "w_min": 0.0, "w_std": 0.2952301353216171 }, { "completion_length": 3168.260498046875, "cov_mean": -2.8395811568771023e-07, "cov_std": 0.16673796251416206, "entropy": 0.3974609375, "epoch": 0.014857142857142857, "grad_norm": 0.32018211483955383, "kl": 3.4928321838378906e-05, "learning_rate": 8e-07, "loss": 0.013, "reward": 0.6666666865348816, "reward_std": 0.21763009577989578, "rewards/accuracy_reward": 0.21875000558793545, "rewards/format_reward": 0.447916679084301, "step": 13, "w_high_ratio": 0.16240009665489197, "w_low_ratio": 0.019987554755061865, "w_max": 1.9363721311092377, "w_mean": 1.3362610340118408, "w_min": 0.25, "w_std": 0.13108899258077145 }, { "completion_length": 3131.4896240234375, "cov_mean": 2.226169999630656e-05, "cov_std": 0.4046770706772804, "entropy": 0.37109375, "epoch": 0.016, "grad_norm": 0.37404048442840576, "kl": 2.765655517578125e-05, "learning_rate": 8.666666666666667e-07, "loss": -0.0183, "reward": 0.697916679084301, "reward_std": 0.5398248583078384, "rewards/accuracy_reward": 0.23958334140479565, "rewards/format_reward": 0.4583333507180214, "step": 14, "w_high_ratio": 0.04599327966570854, "w_low_ratio": 0.04479631967842579, "w_max": 1.9890422523021698, "w_mean": 1.2714892327785492, "w_min": 0.0, "w_std": 0.2778457775712013 }, { "completion_length": 2876.5000610351562, "cov_mean": 5.642831865770859e-05, "cov_std": 0.2463381662964821, "entropy": 0.3603515625, "epoch": 0.017142857142857144, "grad_norm": 0.3123975694179535, "kl": 2.4527311325073242e-05, "learning_rate": 9.333333333333333e-07, "loss": 0.0189, "reward": 0.6145833358168602, "reward_std": 0.282865684479475, "rewards/accuracy_reward": 0.21875000279396772, "rewards/format_reward": 0.3958333395421505, "step": 15, "w_high_ratio": 0.0, "w_low_ratio": 0.025731001514941454, "w_max": 1.6097791492938995, "w_mean": 1.1706343591213226, "w_min": 0.25, "w_std": 0.15255355089902878 }, { "completion_length": 3825.5313110351562, "cov_mean": 4.868104133493034e-05, "cov_std": 0.24393152818083763, "entropy": 0.462890625, "epoch": 0.018285714285714287, "grad_norm": 0.33849093317985535, "kl": 3.88026237487793e-05, "learning_rate": 1e-06, "loss": 0.0219, "reward": 0.1458333358168602, "reward_std": 0.3029968775808811, "rewards/accuracy_reward": 0.05208333395421505, "rewards/format_reward": 0.09375, "step": 16, "w_high_ratio": 0.016145935282111168, "w_low_ratio": 0.030266874469816685, "w_max": 1.4482556581497192, "w_mean": 1.039628803730011, "w_min": 0.25, "w_std": 0.12409967929124832 }, { "completion_length": 2457.187530517578, "cov_mean": -5.5577158491360024e-05, "cov_std": 0.40591511130332947, "entropy": 0.44580078125, "epoch": 0.019428571428571427, "grad_norm": 0.4333915710449219, "kl": 3.7103891372680664e-05, "learning_rate": 9.998781585307575e-07, "loss": -0.0184, "reward": 0.8645833432674408, "reward_std": 0.4666195958852768, "rewards/accuracy_reward": 0.27083334140479565, "rewards/format_reward": 0.5937500149011612, "step": 17, "w_high_ratio": 0.2576238848268986, "w_low_ratio": 0.045077938586473465, "w_max": 2.4780974686145782, "w_mean": 1.4898322224617004, "w_min": 0.25, "w_std": 0.27607931941747665 }, { "completion_length": 3082.572998046875, "cov_mean": 2.8908147328365885e-05, "cov_std": 0.37330811098217964, "entropy": 0.35791015625, "epoch": 0.02057142857142857, "grad_norm": 0.4184105098247528, "kl": 1.868605613708496e-05, "learning_rate": 9.99512700102336e-07, "loss": -0.0108, "reward": 0.5833333432674408, "reward_std": 0.35651107877492905, "rewards/accuracy_reward": 0.12500000558793545, "rewards/format_reward": 0.4583333507180214, "step": 18, "w_high_ratio": 0.05808360502123833, "w_low_ratio": 0.036713168025016785, "w_max": 1.845141887664795, "w_mean": 1.2742244899272919, "w_min": 0.0, "w_std": 0.20810226537287235 }, { "completion_length": 3121.2084350585938, "cov_mean": -7.294934039236978e-05, "cov_std": 0.43627090007066727, "entropy": 0.390625, "epoch": 0.021714285714285714, "grad_norm": 0.37888550758361816, "kl": 3.045797348022461e-05, "learning_rate": 9.989038226169207e-07, "loss": -0.0515, "reward": 0.8437500223517418, "reward_std": 0.6305291801691055, "rewards/accuracy_reward": 0.37500001676380634, "rewards/format_reward": 0.4687500260770321, "step": 19, "w_high_ratio": 0.07324637286365032, "w_low_ratio": 0.05016931891441345, "w_max": 1.8255594372749329, "w_mean": 1.2571382224559784, "w_min": 0.0, "w_std": 0.2677953541278839 }, { "completion_length": 2640.885498046875, "cov_mean": 4.735650145448744e-05, "cov_std": 0.3673415333032608, "entropy": 0.346923828125, "epoch": 0.022857142857142857, "grad_norm": 0.5563057065010071, "kl": 2.5674700736999512e-05, "learning_rate": 9.98051855792412e-07, "loss": -0.0634, "reward": 0.927083358168602, "reward_std": 0.4865139201283455, "rewards/accuracy_reward": 0.291666679084301, "rewards/format_reward": 0.6354166865348816, "step": 20, "w_high_ratio": 0.1635199710726738, "w_low_ratio": 0.04272336233407259, "w_max": 2.4736950993537903, "w_mean": 1.437729924917221, "w_min": 0.0, "w_std": 0.2519150599837303 }, { "completion_length": 2853.541748046875, "cov_mean": 2.0583035620802548e-05, "cov_std": 0.330572672188282, "entropy": 0.4111328125, "epoch": 0.024, "grad_norm": 1.0757092237472534, "kl": 6.0230493545532227e-05, "learning_rate": 9.969572609838744e-07, "loss": -0.0628, "reward": 0.6458333684131503, "reward_std": 0.4320429190993309, "rewards/accuracy_reward": 0.1875000074505806, "rewards/format_reward": 0.4583333535119891, "step": 21, "w_high_ratio": 0.2964049205183983, "w_low_ratio": 0.028110376093536615, "w_max": 2.3169990181922913, "w_mean": 1.5084502398967743, "w_min": 0.0, "w_std": 0.2527524419128895 }, { "completion_length": 1911.4584045410156, "cov_mean": 5.60426842639572e-05, "cov_std": 0.40649277716875076, "entropy": 0.4189453125, "epoch": 0.025142857142857144, "grad_norm": 0.6271277666091919, "kl": 7.963180541992188e-05, "learning_rate": 9.956206309337066e-07, "loss": -0.0446, "reward": 1.0937500447034836, "reward_std": 0.43423888459801674, "rewards/accuracy_reward": 0.26041666977107525, "rewards/format_reward": 0.833333358168602, "step": 22, "w_high_ratio": 0.31319986283779144, "w_low_ratio": 0.043129971250891685, "w_max": 2.544324040412903, "w_mean": 1.6584191024303436, "w_min": 1.0509738482436128e-45, "w_std": 0.2851412668824196 }, { "completion_length": 2764.9896240234375, "cov_mean": -5.482907317855279e-05, "cov_std": 0.4773280769586563, "entropy": 0.364013671875, "epoch": 0.026285714285714287, "grad_norm": 0.4739457368850708, "kl": 5.441904067993164e-05, "learning_rate": 9.940426894506606e-07, "loss": 0.0215, "reward": 0.677083358168602, "reward_std": 0.5219395384192467, "rewards/accuracy_reward": 0.17708333861082792, "rewards/format_reward": 0.5000000149011612, "step": 23, "w_high_ratio": 0.2707533538341522, "w_low_ratio": 0.053491173312067986, "w_max": 2.4205015003681183, "w_mean": 1.5040302574634552, "w_min": 0.0, "w_std": 0.36683739349246025 }, { "completion_length": 2940.791748046875, "cov_mean": -2.262868292746134e-07, "cov_std": 0.4811120182275772, "entropy": 0.36572265625, "epoch": 0.027428571428571427, "grad_norm": 0.7653826475143433, "kl": 7.31348991394043e-05, "learning_rate": 9.922242910178859e-07, "loss": -0.0493, "reward": 0.8750000149011612, "reward_std": 0.580329179763794, "rewards/accuracy_reward": 0.3125000074505806, "rewards/format_reward": 0.5625, "step": 24, "w_high_ratio": 0.13745100796222687, "w_low_ratio": 0.043023983016610146, "w_max": 2.282612681388855, "w_mean": 1.408221811056137, "w_min": 0.0, "w_std": 0.30761053785681725 }, { "completion_length": 2838.385498046875, "cov_mean": 5.768927030658233e-05, "cov_std": 0.3588615171611309, "entropy": 0.43115234375, "epoch": 0.02857142857142857, "grad_norm": 0.4592779278755188, "kl": 0.0001583397388458252, "learning_rate": 9.901664203302124e-07, "loss": 0.0336, "reward": 0.6458333535119891, "reward_std": 0.42768918722867966, "rewards/accuracy_reward": 0.2083333358168602, "rewards/format_reward": 0.4375000102445483, "step": 25, "w_high_ratio": 0.25193173810839653, "w_low_ratio": 0.03785138111561537, "w_max": 2.232436418533325, "w_mean": 1.3874212205410004, "w_min": 0.0, "w_std": 0.26341583393514156 }, { "completion_length": 3263.9271850585938, "cov_mean": 5.7232594372180756e-05, "cov_std": 0.3583865277469158, "entropy": 0.423828125, "epoch": 0.029714285714285714, "grad_norm": 0.29540950059890747, "kl": 2.8073787689208984e-05, "learning_rate": 9.878701917609207e-07, "loss": -0.0308, "reward": 0.6562500298023224, "reward_std": 0.30247221142053604, "rewards/accuracy_reward": 0.20833334140479565, "rewards/format_reward": 0.4479166716337204, "step": 26, "w_high_ratio": 0.043125174939632416, "w_low_ratio": 0.034932715352624655, "w_max": 1.7730375826358795, "w_mean": 1.1945610046386719, "w_min": 0.0, "w_std": 0.21625201031565666 }, { "completion_length": 3234.354248046875, "cov_mean": -1.585684123028841e-05, "cov_std": 0.3628169037401676, "entropy": 0.4287109375, "epoch": 0.030857142857142857, "grad_norm": 0.23347000777721405, "kl": 7.574260234832764e-05, "learning_rate": 9.853368487582886e-07, "loss": 0.0016, "reward": 0.5729166939854622, "reward_std": 0.4602612778544426, "rewards/accuracy_reward": 0.12500000279396772, "rewards/format_reward": 0.447916679084301, "step": 27, "w_high_ratio": 0.0, "w_low_ratio": 0.049336991272866726, "w_max": 1.539461612701416, "w_mean": 1.1640309989452362, "w_min": 0.0, "w_std": 0.19506899639964104 }, { "completion_length": 3126.1458740234375, "cov_mean": -2.0113498976570554e-05, "cov_std": 0.4664968028664589, "entropy": 0.40576171875, "epoch": 0.032, "grad_norm": 0.3652705252170563, "kl": 9.274482727050781e-05, "learning_rate": 9.825677631722435e-07, "loss": 0.0034, "reward": 0.7916666865348816, "reward_std": 0.5775652155280113, "rewards/accuracy_reward": 0.322916679084301, "rewards/format_reward": 0.4687500149011612, "step": 28, "w_high_ratio": 0.0399150624871254, "w_low_ratio": 0.05781116522848606, "w_max": 1.8517873883247375, "w_mean": 1.2485012710094452, "w_min": 0.0, "w_std": 0.2899218685925007 }, { "completion_length": 3620.3854370117188, "cov_mean": 3.1027989280119073e-06, "cov_std": 0.3902217298746109, "entropy": 0.42919921875, "epoch": 0.03314285714285714, "grad_norm": 0.42134925723075867, "kl": 0.00019466876983642578, "learning_rate": 9.795644345114794e-07, "loss": 0.054, "reward": 0.26041667722165585, "reward_std": 0.39330877363681793, "rewards/accuracy_reward": 0.041666666977107525, "rewards/format_reward": 0.21875000558793545, "step": 29, "w_high_ratio": 0.0, "w_low_ratio": 0.05211624875664711, "w_max": 1.389526218175888, "w_mean": 1.1012998819351196, "w_min": 0.25, "w_std": 0.21629613637924194 }, { "completion_length": 3127.28125, "cov_mean": 7.65450022299774e-06, "cov_std": 0.39614470303058624, "entropy": 0.37109375, "epoch": 0.03428571428571429, "grad_norm": 0.3341064751148224, "kl": 0.0004119873046875, "learning_rate": 9.76328489131448e-07, "loss": 0.0409, "reward": 0.7916666865348816, "reward_std": 0.536014050245285, "rewards/accuracy_reward": 0.2708333395421505, "rewards/format_reward": 0.5208333432674408, "step": 30, "w_high_ratio": 0.09042909741401672, "w_low_ratio": 0.04219994880259037, "w_max": 2.0287185609340668, "w_mean": 1.2833797633647919, "w_min": 0.25, "w_std": 0.2540416121482849 }, { "completion_length": 3311.0418090820312, "cov_mean": 1.0463507578606368e-05, "cov_std": 0.2679142467677593, "entropy": 0.392578125, "epoch": 0.03542857142857143, "grad_norm": 0.34394925832748413, "kl": 0.00025856494903564453, "learning_rate": 9.728616793536587e-07, "loss": 0.008, "reward": 0.5104166772216558, "reward_std": 0.41994429379701614, "rewards/accuracy_reward": 0.19791666977107525, "rewards/format_reward": 0.3125000027939677, "step": 31, "w_high_ratio": 0.013557232916355133, "w_low_ratio": 0.029067183146253228, "w_max": 1.5240460634231567, "w_mean": 1.1379797160625458, "w_min": 0.0, "w_std": 0.1539093293249607 }, { "completion_length": 3302.9793090820312, "cov_mean": -3.0240359592426103e-05, "cov_std": 0.38900837302207947, "entropy": 0.4296875, "epoch": 0.036571428571428574, "grad_norm": 0.3267619013786316, "kl": 0.0003151893615722656, "learning_rate": 9.69165882516764e-07, "loss": 0.0316, "reward": 0.708333333954215, "reward_std": 0.4763314798474312, "rewards/accuracy_reward": 0.29166666977107525, "rewards/format_reward": 0.4166666744276881, "step": 32, "w_high_ratio": 0.03662256337702274, "w_low_ratio": 0.04006141540594399, "w_max": 1.6614282727241516, "w_mean": 1.1914446651935577, "w_min": 0.0, "w_std": 0.22214871272444725 }, { "completion_length": 3574.9063720703125, "cov_mean": -1.1321862984914333e-05, "cov_std": 0.35655253008008003, "entropy": 0.375, "epoch": 0.037714285714285714, "grad_norm": 0.22990448772907257, "kl": 0.00036966800689697266, "learning_rate": 9.65243099959949e-07, "loss": -0.0046, "reward": 0.614583358168602, "reward_std": 0.5987343490123749, "rewards/accuracy_reward": 0.2395833432674408, "rewards/format_reward": 0.375, "step": 33, "w_high_ratio": 0.0, "w_low_ratio": 0.050560859963297844, "w_max": 1.4202565550804138, "w_mean": 1.1010453402996063, "w_min": 9.80908925027372e-45, "w_std": 0.22285480797290802 }, { "completion_length": 2701.3021545410156, "cov_mean": -9.209951758748502e-05, "cov_std": 0.40857063978910446, "entropy": 0.455078125, "epoch": 0.038857142857142854, "grad_norm": 0.49724769592285156, "kl": 0.0010764598846435547, "learning_rate": 9.610954559391704e-07, "loss": -0.0999, "reward": 0.8229166883975267, "reward_std": 0.44509488344192505, "rewards/accuracy_reward": 0.32291668001562357, "rewards/format_reward": 0.5000000158324838, "step": 34, "w_high_ratio": 0.255466103553772, "w_low_ratio": 0.058159707114100456, "w_max": 1.966733694076538, "w_mean": 1.445367842912674, "w_min": 0.0, "w_std": 0.2951628230512142 }, { "completion_length": 3312.5209350585938, "cov_mean": -2.75732190857525e-06, "cov_std": 0.4042964428663254, "entropy": 0.43115234375, "epoch": 0.04, "grad_norm": 0.5397042036056519, "kl": 0.0008138418197631836, "learning_rate": 9.567251964768342e-07, "loss": -0.0246, "reward": 0.6041666716337204, "reward_std": 0.5854796469211578, "rewards/accuracy_reward": 0.2604166753590107, "rewards/format_reward": 0.3437500111758709, "step": 35, "w_high_ratio": 0.05638222396373749, "w_low_ratio": 0.05512247420847416, "w_max": 2.192526876926422, "w_mean": 1.2277101576328278, "w_min": 5.989329002727702e-37, "w_std": 0.2675026059150696 }, { "completion_length": 3587.6354370117188, "cov_mean": -1.1433875897637336e-05, "cov_std": 0.2749627083539963, "entropy": 0.48095703125, "epoch": 0.04114285714285714, "grad_norm": 0.2379421591758728, "kl": 0.000979304313659668, "learning_rate": 9.521346881455354e-07, "loss": -0.0089, "reward": 0.22916666977107525, "reward_std": 0.3393310159444809, "rewards/accuracy_reward": 0.052083334885537624, "rewards/format_reward": 0.17708334140479565, "step": 36, "w_high_ratio": 0.0930290725082159, "w_low_ratio": 0.03267599269747734, "w_max": 1.6210260689258575, "w_mean": 1.149814784526825, "w_min": 0.25, "w_std": 0.14480087533593178 }, { "completion_length": 3585.6458740234375, "cov_mean": -6.411561662389431e-05, "cov_std": 0.3158421888947487, "entropy": 0.42236328125, "epoch": 0.04228571428571429, "grad_norm": 0.2652978301048279, "kl": 0.0006819963455200195, "learning_rate": 9.473264167865171e-07, "loss": 0.0058, "reward": 0.23958334419876337, "reward_std": 0.20556553453207016, "rewards/accuracy_reward": 0.02083333395421505, "rewards/format_reward": 0.21875000558793545, "step": 37, "w_high_ratio": 0.054578784853219986, "w_low_ratio": 0.03975548921152949, "w_max": 1.6195516288280487, "w_mean": 1.1414334774017334, "w_min": 0.0, "w_std": 0.17483297176659107 }, { "completion_length": 3557.1563110351562, "cov_mean": 1.9627160781965358e-05, "cov_std": 0.1310347281396389, "entropy": 0.4443359375, "epoch": 0.04342857142857143, "grad_norm": 0.18166524171829224, "kl": 0.00045049190521240234, "learning_rate": 9.42302986163543e-07, "loss": 0.0055, "reward": 0.2916666716337204, "reward_std": 0.20568452775478363, "rewards/accuracy_reward": 0.1354166716337204, "rewards/format_reward": 0.15625, "step": 38, "w_high_ratio": 0.0, "w_low_ratio": 0.013907193206250668, "w_max": 1.2318150103092194, "w_mean": 1.0593293607234955, "w_min": 0.5, "w_std": 0.0720198005437851 }, { "completion_length": 3032.9688110351562, "cov_mean": 3.0818391678621992e-06, "cov_std": 0.21192274242639542, "entropy": 0.361083984375, "epoch": 0.044571428571428574, "grad_norm": 0.19133131206035614, "kl": 0.001926124095916748, "learning_rate": 9.370671165529144e-07, "loss": 0.0176, "reward": 0.760416679084301, "reward_std": 0.19024790823459625, "rewards/accuracy_reward": 0.2604166669771075, "rewards/format_reward": 0.5000000074505806, "step": 39, "w_high_ratio": 0.0, "w_low_ratio": 0.031908176839351654, "w_max": 1.4641892611980438, "w_mean": 1.1698184311389923, "w_min": 0.5, "w_std": 0.11090587638318539 }, { "completion_length": 3034.4375610351562, "cov_mean": -2.096771822834853e-05, "cov_std": 0.2704497389495373, "entropy": 0.3974609375, "epoch": 0.045714285714285714, "grad_norm": 0.2450522482395172, "kl": 0.0029773712158203125, "learning_rate": 9.316216432703916e-07, "loss": 0.0304, "reward": 0.6145833469927311, "reward_std": 0.3184027150273323, "rewards/accuracy_reward": 0.16666666697710752, "rewards/format_reward": 0.447916679084301, "step": 40, "w_high_ratio": 0.0, "w_low_ratio": 0.03790469467639923, "w_max": 1.6503434479236603, "w_mean": 1.2544237673282623, "w_min": 0.0, "w_std": 0.16268039494752884 }, { "completion_length": 3323.0313110351562, "cov_mean": 1.6318189409503248e-05, "cov_std": 0.3728119507431984, "entropy": 0.37890625, "epoch": 0.046857142857142854, "grad_norm": 0.25028058886528015, "kl": 0.0005707740783691406, "learning_rate": 9.259695151358214e-07, "loss": 0.0052, "reward": 0.4166666716337204, "reward_std": 0.42720501869916916, "rewards/accuracy_reward": 0.06250000186264515, "rewards/format_reward": 0.3541666716337204, "step": 41, "w_high_ratio": 0.0, "w_low_ratio": 0.04678645543754101, "w_max": 1.4909087419509888, "w_mean": 1.1125215888023376, "w_min": 0.0, "w_std": 0.20876972749829292 }, { "completion_length": 2951.6458740234375, "cov_mean": 5.0857947826443706e-05, "cov_std": 0.20116684958338737, "entropy": 0.45947265625, "epoch": 0.048, "grad_norm": 0.20272254943847656, "kl": 0.0004711151123046875, "learning_rate": 9.20113792876298e-07, "loss": 0.0143, "reward": 0.40625000558793545, "reward_std": 0.2362503558397293, "rewards/accuracy_reward": 0.031250000931322575, "rewards/format_reward": 0.3750000027939677, "step": 42, "w_high_ratio": 0.125, "w_low_ratio": 0.02315131900832057, "w_max": 1.6171163022518158, "w_mean": 1.2283784747123718, "w_min": 0.5, "w_std": 0.10980619117617607 }, { "completion_length": 3174.875, "cov_mean": 4.1157167999017474e-05, "cov_std": 0.3693430423736572, "entropy": 0.396484375, "epoch": 0.04914285714285714, "grad_norm": 0.2846878170967102, "kl": 0.0015649795532226562, "learning_rate": 9.140576474687263e-07, "loss": 0.0363, "reward": 0.5000000111758709, "reward_std": 0.39679284393787384, "rewards/accuracy_reward": 0.17708334140479565, "rewards/format_reward": 0.32291666977107525, "step": 43, "w_high_ratio": 0.0364043265581131, "w_low_ratio": 0.052212903276085854, "w_max": 1.6643747389316559, "w_mean": 1.1576766669750214, "w_min": 0.0, "w_std": 0.22118561156094074 }, { "completion_length": 2870.6771697998047, "cov_mean": -0.00011349500164214987, "cov_std": 0.38516905158758163, "entropy": 0.392578125, "epoch": 0.05028571428571429, "grad_norm": 0.36889317631721497, "kl": 0.001346588134765625, "learning_rate": 9.078043584226815e-07, "loss": -0.016, "reward": 0.7708333469927311, "reward_std": 0.4629998579621315, "rewards/accuracy_reward": 0.25000000558793545, "rewards/format_reward": 0.520833333954215, "step": 44, "w_high_ratio": 0.16294695064425468, "w_low_ratio": 0.04376620473340154, "w_max": 1.860895425081253, "w_mean": 1.262012630701065, "w_min": 0.0, "w_std": 0.2177225835621357 }, { "completion_length": 3653.2084350585938, "cov_mean": 2.4295502612403652e-05, "cov_std": 0.34152911603450775, "entropy": 0.41552734375, "epoch": 0.05142857142857143, "grad_norm": 0.3034785985946655, "kl": 0.0036156177520751953, "learning_rate": 9.013573120044966e-07, "loss": 0.0183, "reward": 0.4375, "reward_std": 0.47292545437812805, "rewards/accuracy_reward": 0.17708333861082792, "rewards/format_reward": 0.26041667349636555, "step": 45, "w_high_ratio": 0.0399763397872448, "w_low_ratio": 0.03463862743228674, "w_max": 1.478510558605194, "w_mean": 1.0995357930660248, "w_min": 0.0, "w_std": 0.18589595332741737 }, { "completion_length": 3480.4583740234375, "cov_mean": -8.883081477506494e-06, "cov_std": 0.35656186379492283, "entropy": 0.482421875, "epoch": 0.052571428571428575, "grad_norm": 0.43441376090049744, "kl": 0.0010223388671875, "learning_rate": 8.9471999940354e-07, "loss": 0.1236, "reward": 0.2708333348855376, "reward_std": 0.3359568640589714, "rewards/accuracy_reward": 0.0416666679084301, "rewards/format_reward": 0.22916667442768812, "step": 46, "w_high_ratio": 0.09412252902984619, "w_low_ratio": 0.039234326453879476, "w_max": 1.7104443907737732, "w_mean": 1.1970912516117096, "w_min": 0.0, "w_std": 0.22640072740614414 }, { "completion_length": 3027.9375610351562, "cov_mean": -2.4718745407881215e-05, "cov_std": 0.26147888600826263, "entropy": 0.38720703125, "epoch": 0.053714285714285714, "grad_norm": 0.29539671540260315, "kl": 0.0011167526245117188, "learning_rate": 8.878960148416747e-07, "loss": -0.0436, "reward": 0.8125000149011612, "reward_std": 0.4830247238278389, "rewards/accuracy_reward": 0.291666679084301, "rewards/format_reward": 0.5208333432674408, "step": 47, "w_high_ratio": 0.11265619844198227, "w_low_ratio": 0.028757336549460888, "w_max": 1.9260917007923126, "w_mean": 1.3850333988666534, "w_min": 0.25, "w_std": 0.19246497750282288 }, { "completion_length": 3071.947998046875, "cov_mean": 6.93041113208892e-05, "cov_std": 0.44929926097393036, "entropy": 0.3916015625, "epoch": 0.054857142857142854, "grad_norm": 0.429691344499588, "kl": 0.0033349990844726562, "learning_rate": 8.808890536269229e-07, "loss": 0.0121, "reward": 0.6250000074505806, "reward_std": 0.5002652183175087, "rewards/accuracy_reward": 0.21875000651925802, "rewards/format_reward": 0.40625001303851604, "step": 48, "w_high_ratio": 0.11854390799999237, "w_low_ratio": 0.049413095228374004, "w_max": 2.040738523006439, "w_mean": 1.2706988453865051, "w_min": 0.0, "w_std": 0.28789742290973663 }, { "completion_length": 2590.0208740234375, "cov_mean": 1.1296036518615438e-05, "cov_std": 0.44136329740285873, "entropy": 0.361328125, "epoch": 0.056, "grad_norm": 0.3813425898551941, "kl": 0.006386756896972656, "learning_rate": 8.737029101523929e-07, "loss": 0.0099, "reward": 0.8541667014360428, "reward_std": 0.5753171592950821, "rewards/accuracy_reward": 0.3020833358168602, "rewards/format_reward": 0.5520833507180214, "step": 49, "w_high_ratio": 0.08562466688454151, "w_low_ratio": 0.04912099055945873, "w_max": 2.1155774295330048, "w_mean": 1.313397854566574, "w_min": 1.0880903909928003e-39, "w_std": 0.2910540699958801 }, { "completion_length": 3252.7188110351562, "cov_mean": -5.4081367125036195e-05, "cov_std": 0.2936253622174263, "entropy": 0.341796875, "epoch": 0.05714285714285714, "grad_norm": 0.19821792840957642, "kl": 0.0009126663208007812, "learning_rate": 8.663414758415478e-07, "loss": 0.0282, "reward": 0.5937500204890966, "reward_std": 0.34640391170978546, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.34375000558793545, "step": 50, "w_high_ratio": 0.0625, "w_low_ratio": 0.038133963476866484, "w_max": 1.6234075427055359, "w_mean": 1.1869377791881561, "w_min": 0.25, "w_std": 0.1625902745872736 }, { "completion_length": 2461.135498046875, "cov_mean": 2.296896900588763e-05, "cov_std": 0.34488509595394135, "entropy": 0.43603515625, "epoch": 0.05828571428571429, "grad_norm": 0.37042999267578125, "kl": 0.0047626495361328125, "learning_rate": 8.588087370409302e-07, "loss": 0.0031, "reward": 0.6875000149011612, "reward_std": 0.3103678971529007, "rewards/accuracy_reward": 0.14583333767950535, "rewards/format_reward": 0.5416666716337204, "step": 51, "w_high_ratio": 0.012511095963418484, "w_low_ratio": 0.04256652761250734, "w_max": 1.5135074257850647, "w_mean": 1.154930055141449, "w_min": 0.25, "w_std": 0.20846375823020935 }, { "completion_length": 3097.0000610351562, "cov_mean": 1.4503702914225869e-06, "cov_std": 0.42206430435180664, "entropy": 0.39306640625, "epoch": 0.05942857142857143, "grad_norm": 0.7796747088432312, "kl": 0.0031175613403320312, "learning_rate": 8.511087728614862e-07, "loss": -0.0444, "reward": 0.7083333684131503, "reward_std": 0.5597149804234505, "rewards/accuracy_reward": 0.2812500074505806, "rewards/format_reward": 0.4270833386108279, "step": 52, "w_high_ratio": 0.1064748540520668, "w_low_ratio": 0.0502536753192544, "w_max": 1.7458641231060028, "w_mean": 1.2538467645645142, "w_min": 0.0, "w_std": 0.25843533128499985 }, { "completion_length": 2980.0833740234375, "cov_mean": -1.4053926861379296e-05, "cov_std": 0.5106082037091255, "entropy": 0.4208984375, "epoch": 0.060571428571428575, "grad_norm": 0.43870407342910767, "kl": 0.0011355876922607422, "learning_rate": 8.432457529696548e-07, "loss": -0.0332, "reward": 0.8229167014360428, "reward_std": 0.6252040863037109, "rewards/accuracy_reward": 0.2812500074505806, "rewards/format_reward": 0.5416666865348816, "step": 53, "w_high_ratio": 0.15575578436255455, "w_low_ratio": 0.04851931845769286, "w_max": 2.0333048701286316, "w_mean": 1.3303064107894897, "w_min": 0.0, "w_std": 0.3132343143224716 }, { "completion_length": 3125.135498046875, "cov_mean": 2.5879786335281096e-05, "cov_std": 0.4214501827955246, "entropy": 0.38623046875, "epoch": 0.061714285714285715, "grad_norm": 0.41025811433792114, "kl": 0.0010042190551757812, "learning_rate": 8.352239353294194e-07, "loss": -0.0491, "reward": 0.9479167014360428, "reward_std": 0.6265207231044769, "rewards/accuracy_reward": 0.416666679084301, "rewards/format_reward": 0.5312500149011612, "step": 54, "w_high_ratio": 0.045562680810689926, "w_low_ratio": 0.05255642905831337, "w_max": 1.7965390384197235, "w_mean": 1.294397234916687, "w_min": 0.0, "w_std": 0.2626797705888748 }, { "completion_length": 3290.5521240234375, "cov_mean": 6.297564050328219e-05, "cov_std": 0.3725521042943001, "entropy": 0.3955078125, "epoch": 0.06285714285714286, "grad_norm": 0.3483085632324219, "kl": 0.0009459257125854492, "learning_rate": 8.270476638965461e-07, "loss": 0.0117, "reward": 0.6979166716337204, "reward_std": 0.5277387201786041, "rewards/accuracy_reward": 0.2604166716337204, "rewards/format_reward": 0.4375000149011612, "step": 55, "w_high_ratio": 0.0, "w_low_ratio": 0.04737149551510811, "w_max": 1.5634404420852661, "w_mean": 1.1619892120361328, "w_min": 0.0, "w_std": 0.2019548863172531 }, { "completion_length": 3193.3438110351562, "cov_mean": -6.364414912241045e-05, "cov_std": 0.35638032108545303, "entropy": 0.38623046875, "epoch": 0.064, "grad_norm": 0.22449523210525513, "kl": 0.000545501708984375, "learning_rate": 8.187213662662538e-07, "loss": -0.0369, "reward": 0.6770833432674408, "reward_std": 0.3087990954518318, "rewards/accuracy_reward": 0.2291666716337204, "rewards/format_reward": 0.4479166865348816, "step": 56, "w_high_ratio": 0.0, "w_low_ratio": 0.038341518957167864, "w_max": 1.6050121486186981, "w_mean": 1.1923900246620178, "w_min": 0.0, "w_std": 0.19538425654172897 }, { "completion_length": 3553.1563110351562, "cov_mean": 2.031017220360809e-05, "cov_std": 0.3057239428162575, "entropy": 0.31396484375, "epoch": 0.06514285714285714, "grad_norm": 0.1648004949092865, "kl": 0.0003190040588378906, "learning_rate": 8.102495512755938e-07, "loss": 0.0243, "reward": 0.5520833544433117, "reward_std": 0.47310057282447815, "rewards/accuracy_reward": 0.16666666977107525, "rewards/format_reward": 0.38541668467223644, "step": 57, "w_high_ratio": 0.0, "w_low_ratio": 0.04216110520064831, "w_max": 1.4258966147899628, "w_mean": 1.126460313796997, "w_min": 0.0, "w_std": 0.163984976708889 }, { "completion_length": 2358.1771240234375, "cov_mean": 0.00012649836571654305, "cov_std": 0.3942733556032181, "entropy": 0.32470703125, "epoch": 0.06628571428571428, "grad_norm": 0.3121250867843628, "kl": 0.0030040740966796875, "learning_rate": 8.01636806561836e-07, "loss": -0.0315, "reward": 0.9791667014360428, "reward_std": 0.47231587767601013, "rewards/accuracy_reward": 0.3020833395421505, "rewards/format_reward": 0.6770833358168602, "step": 58, "w_high_ratio": 0.08487696945667267, "w_low_ratio": 0.045225437730550766, "w_max": 1.9617216885089874, "w_mean": 1.3747560679912567, "w_min": 3.898036555555448e-35, "w_std": 0.2643117532134056 }, { "completion_length": 3183.8021850585938, "cov_mean": -4.030091076856479e-06, "cov_std": 0.21410225331783295, "entropy": 0.345703125, "epoch": 0.06742857142857143, "grad_norm": 0.37277111411094666, "kl": 0.0005140304565429688, "learning_rate": 7.928877960781808e-07, "loss": -0.0033, "reward": 0.5104166977107525, "reward_std": 0.2982303276658058, "rewards/accuracy_reward": 0.17708333861082792, "rewards/format_reward": 0.3333333432674408, "step": 59, "w_high_ratio": 0.1216658167541027, "w_low_ratio": 0.023135079303756356, "w_max": 1.7003150880336761, "w_mean": 1.2053856253623962, "w_min": 0.25, "w_std": 0.15523223765194416 }, { "completion_length": 3174.5000610351562, "cov_mean": -1.2183483704575337e-05, "cov_std": 0.27795324102044106, "entropy": 0.3603515625, "epoch": 0.06857142857142857, "grad_norm": 0.26638808846473694, "kl": 0.0025773048400878906, "learning_rate": 7.840072575681468e-07, "loss": 0.0243, "reward": 0.5729167014360428, "reward_std": 0.4113100916147232, "rewards/accuracy_reward": 0.14583334140479565, "rewards/format_reward": 0.4270833507180214, "step": 60, "w_high_ratio": 0.0, "w_low_ratio": 0.03565680282190442, "w_max": 1.410323053598404, "w_mean": 1.124219536781311, "w_min": 0.0, "w_std": 0.15482920035719872 }, { "completion_length": 3304.791748046875, "cov_mean": 1.00996726359881e-05, "cov_std": 0.37024862319231033, "entropy": 0.3603515625, "epoch": 0.06971428571428571, "grad_norm": 0.43320998549461365, "kl": 0.0008983612060546875, "learning_rate": 7.75e-07, "loss": -0.0682, "reward": 0.645833358168602, "reward_std": 0.45523863658308983, "rewards/accuracy_reward": 0.1875000074505806, "rewards/format_reward": 0.4583333432674408, "step": 61, "w_high_ratio": 0.01961296983063221, "w_low_ratio": 0.04602003050968051, "w_max": 1.6873614192008972, "w_mean": 1.205470085144043, "w_min": 0.0, "w_std": 0.22806508466601372 }, { "completion_length": 2739.2500610351562, "cov_mean": 5.323334062268259e-05, "cov_std": 0.4578249305486679, "entropy": 0.31982421875, "epoch": 0.07085714285714285, "grad_norm": 0.3162655532360077, "kl": 0.002288818359375, "learning_rate": 7.658709009626109e-07, "loss": 0.0286, "reward": 0.916666679084301, "reward_std": 0.6331392228603363, "rewards/accuracy_reward": 0.2812500111758709, "rewards/format_reward": 0.6354166939854622, "step": 62, "w_high_ratio": 0.0, "w_low_ratio": 0.05417798087000847, "w_max": 1.768731027841568, "w_mean": 1.2571848034858704, "w_min": 0.0, "w_std": 0.26054797321558 }, { "completion_length": 2615.3021850585938, "cov_mean": 1.778580372047145e-05, "cov_std": 0.4087640196084976, "entropy": 0.4072265625, "epoch": 0.072, "grad_norm": 0.392840176820755, "kl": 0.0025348663330078125, "learning_rate": 7.566249040241553e-07, "loss": -0.0491, "reward": 1.0104167014360428, "reward_std": 0.5578364282846451, "rewards/accuracy_reward": 0.3333333507180214, "rewards/format_reward": 0.677083358168602, "step": 63, "w_high_ratio": 0.008992652408778667, "w_low_ratio": 0.04948492627590895, "w_max": 1.822005033493042, "w_mean": 1.33000847697258, "w_min": 2.0318827732709848e-44, "w_std": 0.24196847900748253 }, { "completion_length": 3171.354248046875, "cov_mean": -0.00014721620300406357, "cov_std": 0.3767973370850086, "entropy": 0.3984375, "epoch": 0.07314285714285715, "grad_norm": 0.4929248094558716, "kl": 0.0016901493072509766, "learning_rate": 7.472670160550848e-07, "loss": -0.033, "reward": 0.6875000204890966, "reward_std": 0.5117540434002876, "rewards/accuracy_reward": 0.2708333460614085, "rewards/format_reward": 0.4166666669771075, "step": 64, "w_high_ratio": 0.14000318944454193, "w_low_ratio": 0.03487167996354401, "w_max": 1.7245290279388428, "w_mean": 1.3096586167812347, "w_min": 0.0, "w_std": 0.2602265626192093 }, { "completion_length": 2907.3126220703125, "cov_mean": 3.645536344265565e-05, "cov_std": 0.29790719598531723, "entropy": 0.3271484375, "epoch": 0.07428571428571429, "grad_norm": 0.32978716492652893, "kl": 0.0017614364624023438, "learning_rate": 7.37802304516818e-07, "loss": -0.0106, "reward": 0.6875000260770321, "reward_std": 0.3368534557521343, "rewards/accuracy_reward": 0.1666666679084301, "rewards/format_reward": 0.5208333395421505, "step": 65, "w_high_ratio": 0.01945015788078308, "w_low_ratio": 0.03530450165271759, "w_max": 1.5614324808120728, "w_mean": 1.165216714143753, "w_min": 0.0, "w_std": 0.17453981935977936 }, { "completion_length": 2260.9896087646484, "cov_mean": -8.368766430066898e-06, "cov_std": 0.21803472004830837, "entropy": 0.310791015625, "epoch": 0.07542857142857143, "grad_norm": 0.49300873279571533, "kl": 0.00205230712890625, "learning_rate": 7.282358947176205e-07, "loss": 0.0218, "reward": 0.8750000055879354, "reward_std": 0.27498848363757133, "rewards/accuracy_reward": 0.3645833348855376, "rewards/format_reward": 0.5104166669771075, "step": 66, "w_high_ratio": 0.0345294363796711, "w_low_ratio": 0.024254919728264213, "w_max": 1.7126049399375916, "w_mean": 1.2293311953544617, "w_min": 1.3662660027166966e-44, "w_std": 0.14490841701626778 }, { "completion_length": 3728.5833740234375, "cov_mean": 5.284800408844603e-06, "cov_std": 0.1643856093287468, "entropy": 0.34765625, "epoch": 0.07657142857142857, "grad_norm": 0.2334880828857422, "kl": 0.001039743423461914, "learning_rate": 7.185729670371604e-07, "loss": 0.0054, "reward": 0.1770833358168602, "reward_std": 0.19108106940984726, "rewards/accuracy_reward": 0.02083333395421505, "rewards/format_reward": 0.15625, "step": 67, "w_high_ratio": 0.0, "w_low_ratio": 0.02258414216339588, "w_max": 1.245382398366928, "w_mean": 1.0691203624010086, "w_min": 0.5, "w_std": 0.10235420987010002 }, { "completion_length": 2355.072982788086, "cov_mean": -1.9176595742464997e-06, "cov_std": 0.27589889243245125, "entropy": 0.3740234375, "epoch": 0.07771428571428571, "grad_norm": 0.37152960896492004, "kl": 0.0041484832763671875, "learning_rate": 7.08818754121241e-07, "loss": -0.0423, "reward": 0.8229167014360428, "reward_std": 0.415886752307415, "rewards/accuracy_reward": 0.2291666753590107, "rewards/format_reward": 0.5937500074505806, "step": 68, "w_high_ratio": 0.012929616495966911, "w_low_ratio": 0.031258232425898314, "w_max": 1.8356628715991974, "w_mean": 1.304297924041748, "w_min": 0.25, "w_std": 0.19349467381834984 }, { "completion_length": 2847.2604370117188, "cov_mean": -4.8151488954317756e-05, "cov_std": 0.28809408843517303, "entropy": 0.4375, "epoch": 0.07885714285714286, "grad_norm": 0.3872397840023041, "kl": 0.0045623779296875, "learning_rate": 6.989785380482312e-07, "loss": -0.0566, "reward": 0.4479166939854622, "reward_std": 0.33732588589191437, "rewards/accuracy_reward": 0.031250000931322575, "rewards/format_reward": 0.4166666865348816, "step": 69, "w_high_ratio": 0.19539788458496332, "w_low_ratio": 0.03220258606597781, "w_max": 2.5382341742515564, "w_mean": 1.4389045536518097, "w_min": 0.0, "w_std": 0.1934449914842844 }, { "completion_length": 3250.4063110351562, "cov_mean": 4.1365366996615194e-05, "cov_std": 0.29582661017775536, "entropy": 0.34326171875, "epoch": 0.08, "grad_norm": 0.23539891839027405, "kl": 0.0022635459899902344, "learning_rate": 6.890576474687263e-07, "loss": 0.0138, "reward": 0.6145833684131503, "reward_std": 0.3936128318309784, "rewards/accuracy_reward": 0.1562500037252903, "rewards/format_reward": 0.4583333535119891, "step": 70, "w_high_ratio": 0.0, "w_low_ratio": 0.03546257223933935, "w_max": 1.3784515261650085, "w_mean": 1.1171683073043823, "w_min": 0.0, "w_std": 0.1633461881428957 }, { "completion_length": 2797.4063415527344, "cov_mean": 4.543551585811656e-05, "cov_std": 0.2623286135494709, "entropy": 0.37744140625, "epoch": 0.08114285714285714, "grad_norm": 1.3858193159103394, "kl": 0.05230998992919922, "learning_rate": 6.790614547199906e-07, "loss": 0.0548, "reward": 0.5729166772216558, "reward_std": 0.24960162490606308, "rewards/accuracy_reward": 0.1770833358168602, "rewards/format_reward": 0.39583333395421505, "step": 71, "w_high_ratio": 0.13244600966572762, "w_low_ratio": 0.025677886325865984, "w_max": 1.9256412386894226, "w_mean": 1.2497790455818176, "w_min": 0.0, "w_std": 0.19411796145141125 }, { "completion_length": 3362.5313720703125, "cov_mean": 3.213349305042357e-05, "cov_std": 0.4226767495274544, "entropy": 0.486328125, "epoch": 0.08228571428571428, "grad_norm": 0.4099564254283905, "kl": 0.004467964172363281, "learning_rate": 6.68995372916741e-07, "loss": -0.0646, "reward": 0.416666679084301, "reward_std": 0.4769069701433182, "rewards/accuracy_reward": 0.0520833358168602, "rewards/format_reward": 0.3645833432674408, "step": 72, "w_high_ratio": 0.08334781229496002, "w_low_ratio": 0.05627091834321618, "w_max": 1.7420941889286041, "w_mean": 1.216987669467926, "w_min": 0.0, "w_std": 0.27180150151252747 }, { "completion_length": 3799.3334350585938, "cov_mean": 6.445545591304835e-05, "cov_std": 0.38433366641402245, "entropy": 0.45068359375, "epoch": 0.08342857142857144, "grad_norm": 0.2338305115699768, "kl": 0.00043392181396484375, "learning_rate": 6.588648530198504e-07, "loss": 0.0467, "reward": 0.4062500027939677, "reward_std": 0.48472320288419724, "rewards/accuracy_reward": 0.1770833395421505, "rewards/format_reward": 0.22916667442768812, "step": 73, "w_high_ratio": 0.0, "w_low_ratio": 0.054560547694563866, "w_max": 1.3258715867996216, "w_mean": 1.0365844666957855, "w_min": 2.0397733447937514e-38, "w_std": 0.1829804591834545 }, { "completion_length": 3411.354248046875, "cov_mean": 2.615013909235131e-05, "cov_std": 0.32939745485782623, "entropy": 0.38916015625, "epoch": 0.08457142857142858, "grad_norm": 0.31615766882896423, "kl": 0.0020766258239746094, "learning_rate": 6.486753808845564e-07, "loss": 0.0447, "reward": 0.5416666716337204, "reward_std": 0.48268113285303116, "rewards/accuracy_reward": 0.2500000037252903, "rewards/format_reward": 0.2916666716337204, "step": 74, "w_high_ratio": 0.017251405864953995, "w_low_ratio": 0.037632704712450504, "w_max": 1.4793908894062042, "w_mean": 1.120386153459549, "w_min": 0.25, "w_std": 0.18761293590068817 }, { "completion_length": 3256.6876220703125, "cov_mean": 7.561668553535128e-07, "cov_std": 0.39095795527100563, "entropy": 0.36669921875, "epoch": 0.08571428571428572, "grad_norm": 0.29881325364112854, "kl": 0.0019350051879882812, "learning_rate": 6.384324742897735e-07, "loss": 0.0709, "reward": 0.5625000260770321, "reward_std": 0.3942164406180382, "rewards/accuracy_reward": 0.1979166716337204, "rewards/format_reward": 0.36458334513008595, "step": 75, "w_high_ratio": 0.05663827061653137, "w_low_ratio": 0.04413987882435322, "w_max": 1.78597491979599, "w_mean": 1.1896328330039978, "w_min": 0.0, "w_std": 0.20989646948873997 }, { "completion_length": 3081.6875610351562, "cov_mean": 7.851473583286861e-06, "cov_std": 0.19348382577300072, "entropy": 0.38671875, "epoch": 0.08685714285714285, "grad_norm": 0.13885051012039185, "kl": 0.0005955696105957031, "learning_rate": 6.281416799501187e-07, "loss": -0.0033, "reward": 0.4583333432674408, "reward_std": 0.22215576469898224, "rewards/accuracy_reward": 0.0520833358168602, "rewards/format_reward": 0.4062500074505806, "step": 76, "w_high_ratio": 0.0, "w_low_ratio": 0.024444932583719492, "w_max": 1.385847419500351, "w_mean": 1.104325294494629, "w_min": 0.25, "w_std": 0.09913814999163151 }, { "completion_length": 3410.2084350585938, "cov_mean": 1.7174193999380805e-05, "cov_std": 0.28355711698532104, "entropy": 0.41064453125, "epoch": 0.088, "grad_norm": 0.220694899559021, "kl": 0.0006546974182128906, "learning_rate": 6.178085705122674e-07, "loss": -0.0111, "reward": 0.46875000558793545, "reward_std": 0.2968830242753029, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.34375000558793545, "step": 77, "w_high_ratio": 0.03967808559536934, "w_low_ratio": 0.03201043838635087, "w_max": 1.494775265455246, "w_mean": 1.1476246118545532, "w_min": 0.25, "w_std": 0.1502416580915451 }, { "completion_length": 3454.6771850585938, "cov_mean": 2.8414152438926976e-05, "cov_std": 0.42589128017425537, "entropy": 0.376953125, "epoch": 0.08914285714285715, "grad_norm": 0.24352695047855377, "kl": 0.0004711151123046875, "learning_rate": 6.074387415372676e-07, "loss": -0.0208, "reward": 0.6354166716337204, "reward_std": 0.5373464152216911, "rewards/accuracy_reward": 0.2500000111758709, "rewards/format_reward": 0.3854166716337204, "step": 78, "w_high_ratio": 0.040874604135751724, "w_low_ratio": 0.04665249306708574, "w_max": 1.7644164860248566, "w_mean": 1.1853148639202118, "w_min": 1.401298464324817e-45, "w_std": 0.24474802613258362 }, { "completion_length": 2533.354217529297, "cov_mean": 4.250231540936511e-06, "cov_std": 0.28236184269189835, "entropy": 0.322265625, "epoch": 0.09028571428571429, "grad_norm": 0.2319362461566925, "kl": 0.0014781951904296875, "learning_rate": 5.97037808470444e-07, "loss": 0.0229, "reward": 0.8541666865348816, "reward_std": 0.3287995010614395, "rewards/accuracy_reward": 0.23958333861082792, "rewards/format_reward": 0.6145833507180214, "step": 79, "w_high_ratio": 0.16244123131036758, "w_low_ratio": 0.035402802750468254, "w_max": 2.477913051843643, "w_mean": 1.4660456776618958, "w_min": 0.3189111649990082, "w_std": 0.194712957367301 }, { "completion_length": 3480.1250610351562, "cov_mean": -2.1006295810366282e-05, "cov_std": 0.1846322864294052, "entropy": 0.4541015625, "epoch": 0.09142857142857143, "grad_norm": 0.3083387017250061, "kl": 0.0014166831970214844, "learning_rate": 5.866114036005362e-07, "loss": 0.0178, "reward": 0.6041666893288493, "reward_std": 0.32849549502134323, "rewards/accuracy_reward": 0.2187500111758709, "rewards/format_reward": 0.3854166781529784, "step": 80, "w_high_ratio": 0.0, "w_low_ratio": 0.023656398989260197, "w_max": 1.5105722546577454, "w_mean": 1.109735906124115, "w_min": 0.25, "w_std": 0.10967518202960491 }, { "completion_length": 3358.6979370117188, "cov_mean": 1.4671212284156354e-05, "cov_std": 0.1834326833486557, "entropy": 0.5087890625, "epoch": 0.09257142857142857, "grad_norm": 0.17795835435390472, "kl": 0.005743980407714844, "learning_rate": 5.761651730097142e-07, "loss": -0.0007, "reward": 0.36458334885537624, "reward_std": 0.28906675428152084, "rewards/accuracy_reward": 0.09375000279396772, "rewards/format_reward": 0.2708333386108279, "step": 81, "w_high_ratio": 0.0625, "w_low_ratio": 0.025394567288458347, "w_max": 1.5137894749641418, "w_mean": 1.1780100166797638, "w_min": 0.25, "w_std": 0.09445377439260483 }, { "completion_length": 3041.3438110351562, "cov_mean": -2.3124930976337055e-05, "cov_std": 0.2670583054423332, "entropy": 0.39501953125, "epoch": 0.09371428571428571, "grad_norm": 0.30143725872039795, "kl": 0.0016946792602539062, "learning_rate": 5.657047735161255e-07, "loss": -0.0214, "reward": 0.7604166716337204, "reward_std": 0.34568001329898834, "rewards/accuracy_reward": 0.3125000074505806, "rewards/format_reward": 0.4479166716337204, "step": 82, "w_high_ratio": 0.04279303178191185, "w_low_ratio": 0.03479792643338442, "w_max": 1.7097257375717163, "w_mean": 1.1838775873184204, "w_min": 0.25, "w_std": 0.16872986778616905 }, { "completion_length": 3054.3646240234375, "cov_mean": -5.689870704372879e-06, "cov_std": 0.17481233924627304, "entropy": 0.43115234375, "epoch": 0.09485714285714286, "grad_norm": 0.4099189341068268, "kl": 0.0016026496887207031, "learning_rate": 5.552358696106288e-07, "loss": 0.0418, "reward": 0.46875, "reward_std": 0.2636485621333122, "rewards/accuracy_reward": 0.16666666697710752, "rewards/format_reward": 0.30208333395421505, "step": 83, "w_high_ratio": 0.0, "w_low_ratio": 0.01848737057298422, "w_max": 1.5311354398727417, "w_mean": 1.116348922252655, "w_min": 0.5, "w_std": 0.08476846665143967 }, { "completion_length": 3265.4375, "cov_mean": 2.6241104933433235e-06, "cov_std": 0.3729252219200134, "entropy": 0.40966796875, "epoch": 0.096, "grad_norm": 0.3238582909107208, "kl": 0.0008416175842285156, "learning_rate": 5.447641303893714e-07, "loss": 0.0273, "reward": 0.6770833432674408, "reward_std": 0.4903585724532604, "rewards/accuracy_reward": 0.2500000074505806, "rewards/format_reward": 0.4270833432674408, "step": 84, "w_high_ratio": 0.01827232539653778, "w_low_ratio": 0.04356031212955713, "w_max": 1.6150497496128082, "w_mean": 1.17939093708992, "w_min": 0.0, "w_std": 0.2226697877049446 }, { "completion_length": 3305.8125, "cov_mean": 5.61498741262767e-05, "cov_std": 0.4452364891767502, "entropy": 0.32861328125, "epoch": 0.09714285714285714, "grad_norm": 0.29220932722091675, "kl": 0.0005831718444824219, "learning_rate": 5.342952264838747e-07, "loss": 0.0609, "reward": 0.6354166865348816, "reward_std": 0.68822330981493, "rewards/accuracy_reward": 0.1875, "rewards/format_reward": 0.447916679084301, "step": 85, "w_high_ratio": 0.0, "w_low_ratio": 0.046569294296205044, "w_max": 1.4473570883274078, "w_mean": 1.1975693106651306, "w_min": 0.0, "w_std": 0.23973201215267181 }, { "completion_length": 3247.2188110351562, "cov_mean": -1.9998861716885585e-05, "cov_std": 0.31147949025034904, "entropy": 0.44189453125, "epoch": 0.09828571428571428, "grad_norm": 0.5364130735397339, "kl": 0.0019941329956054688, "learning_rate": 5.238348269902859e-07, "loss": -0.0757, "reward": 0.6041666865348816, "reward_std": 0.28646790236234665, "rewards/accuracy_reward": 0.1979166716337204, "rewards/format_reward": 0.40625, "step": 86, "w_high_ratio": 0.1384273413568735, "w_low_ratio": 0.027202811557799578, "w_max": 2.1997461020946503, "w_mean": 1.3846492767333984, "w_min": 0.25, "w_std": 0.2737709581851959 }, { "completion_length": 3057.2916870117188, "cov_mean": -5.754891753895208e-05, "cov_std": 0.5682927817106247, "entropy": 0.4931640625, "epoch": 0.09942857142857142, "grad_norm": 0.587156355381012, "kl": 0.0027523040771484375, "learning_rate": 5.133885963994639e-07, "loss": -0.0103, "reward": 0.7083333432674408, "reward_std": 0.560588151216507, "rewards/accuracy_reward": 0.21875000279396772, "rewards/format_reward": 0.4895833432674408, "step": 87, "w_high_ratio": 0.15174898132681847, "w_low_ratio": 0.06858384050428867, "w_max": 2.0574756860733032, "w_mean": 1.3842451870441437, "w_min": 0.0, "w_std": 0.38674013316631317 }, { "completion_length": 2906.0209350585938, "cov_mean": 4.286354305804707e-05, "cov_std": 0.5042391121387482, "entropy": 0.4453125, "epoch": 0.10057142857142858, "grad_norm": 0.44004324078559875, "kl": 0.0045490264892578125, "learning_rate": 5.02962191529556e-07, "loss": 0.0208, "reward": 0.9791666865348816, "reward_std": 0.5309341698884964, "rewards/accuracy_reward": 0.3437500074505806, "rewards/format_reward": 0.6354166716337204, "step": 88, "w_high_ratio": 0.17751162499189377, "w_low_ratio": 0.05830758810043335, "w_max": 2.042703092098236, "w_mean": 1.399275004863739, "w_min": 0.25, "w_std": 0.35736314207315445 }, { "completion_length": 3425.3333740234375, "cov_mean": -2.9065696480756742e-05, "cov_std": 0.3547215014696121, "entropy": 0.39208984375, "epoch": 0.10171428571428572, "grad_norm": 0.41978099942207336, "kl": 0.0021228790283203125, "learning_rate": 4.925612584627324e-07, "loss": -0.0597, "reward": 0.5625000223517418, "reward_std": 0.5216004773974419, "rewards/accuracy_reward": 0.2083333358168602, "rewards/format_reward": 0.354166679084301, "step": 89, "w_high_ratio": 0.017988620325922966, "w_low_ratio": 0.042721704579889774, "w_max": 1.6596693396568298, "w_mean": 1.1673058569431305, "w_min": 0.0, "w_std": 0.22802076116204262 }, { "completion_length": 2660.4375915527344, "cov_mean": 9.994783340516733e-06, "cov_std": 0.2509019151329994, "entropy": 0.546875, "epoch": 0.10285714285714286, "grad_norm": 0.7049030661582947, "kl": 0.0055828094482421875, "learning_rate": 4.821914294877326e-07, "loss": -0.0314, "reward": 0.48958335630595684, "reward_std": 0.24196770787239075, "rewards/accuracy_reward": 0.02083333395421505, "rewards/format_reward": 0.4687500176951289, "step": 90, "w_high_ratio": 0.09902577847242355, "w_low_ratio": 0.025217004818841815, "w_max": 2.4860771000385284, "w_mean": 1.4202724397182465, "w_min": 0.0, "w_std": 0.18205549381673336 }, { "completion_length": 3260.8334350585938, "cov_mean": -1.9221572074457072e-05, "cov_std": 0.2735915407538414, "entropy": 0.41845703125, "epoch": 0.104, "grad_norm": 0.2550157308578491, "kl": 0.0021266937255859375, "learning_rate": 4.7185832004988133e-07, "loss": 0.0421, "reward": 0.5312500316649675, "reward_std": 0.3842952623963356, "rewards/accuracy_reward": 0.14583333674818277, "rewards/format_reward": 0.385416672565043, "step": 91, "w_high_ratio": 0.042984794825315475, "w_low_ratio": 0.027249778620898724, "w_max": 1.7297326922416687, "w_mean": 1.2372649610042572, "w_min": 0.0, "w_std": 0.1690264195203781 }, { "completion_length": 2809.573028564453, "cov_mean": -4.66689198219683e-06, "cov_std": 0.41415752843022346, "entropy": 0.34033203125, "epoch": 0.10514285714285715, "grad_norm": 0.3232761323451996, "kl": 0.0012707710266113281, "learning_rate": 4.6156752571022637e-07, "loss": -0.0264, "reward": 0.7083333432674408, "reward_std": 0.48402564972639084, "rewards/accuracy_reward": 0.16666667722165585, "rewards/format_reward": 0.5416666939854622, "step": 92, "w_high_ratio": 0.0625, "w_low_ratio": 0.04879668727517128, "w_max": 1.7188981473445892, "w_mean": 1.264578402042389, "w_min": 1.0121512603752466e-34, "w_std": 0.23184099607169628 }, { "completion_length": 3819.1458740234375, "cov_mean": -4.416617707647674e-05, "cov_std": 0.18136212974786758, "entropy": 0.533203125, "epoch": 0.10628571428571429, "grad_norm": 0.2244294136762619, "kl": 0.002910614013671875, "learning_rate": 4.513246191154434e-07, "loss": 0.0303, "reward": 0.0937500037252903, "reward_std": 0.1851910501718521, "rewards/accuracy_reward": 0.010416666977107525, "rewards/format_reward": 0.08333333674818277, "step": 93, "w_high_ratio": 0.0, "w_low_ratio": 0.025186134036630392, "w_max": 1.3321870565414429, "w_mean": 1.0297031998634338, "w_min": 0.25, "w_std": 0.09247609600424767 }, { "completion_length": 3216.4376220703125, "cov_mean": -2.7046048671763856e-05, "cov_std": 0.25123433768749237, "entropy": 0.5078125, "epoch": 0.10742857142857143, "grad_norm": 0.2843908965587616, "kl": 0.0036516189575195312, "learning_rate": 4.4113514698014953e-07, "loss": 0.0352, "reward": 0.47916667722165585, "reward_std": 0.28607168793678284, "rewards/accuracy_reward": 0.17708333395421505, "rewards/format_reward": 0.30208334140479565, "step": 94, "w_high_ratio": 0.04015461727976799, "w_low_ratio": 0.02874834556132555, "w_max": 1.7613461911678314, "w_mean": 1.1813118755817413, "w_min": 0.25, "w_std": 0.16030845791101456 }, { "completion_length": 3702.510498046875, "cov_mean": 3.147694224026054e-05, "cov_std": 0.3758469521999359, "entropy": 0.4091796875, "epoch": 0.10857142857142857, "grad_norm": 0.2823449671268463, "kl": 0.0005347728729248047, "learning_rate": 4.3100462708325914e-07, "loss": 0.0264, "reward": 0.416666679084301, "reward_std": 0.5112503468990326, "rewards/accuracy_reward": 0.11458333488553762, "rewards/format_reward": 0.3020833432674408, "step": 95, "w_high_ratio": 0.0, "w_low_ratio": 0.0470161447301507, "w_max": 1.2909899652004242, "w_mean": 1.0598113238811493, "w_min": 0.25, "w_std": 0.19865867495536804 }, { "completion_length": 3136.7188110351562, "cov_mean": 2.809584930218989e-05, "cov_std": 0.38921112939715385, "entropy": 0.41259765625, "epoch": 0.10971428571428571, "grad_norm": 0.4165164828300476, "kl": 0.004611968994140625, "learning_rate": 4.209385452800095e-07, "loss": -0.0197, "reward": 0.677083358168602, "reward_std": 0.4686770662665367, "rewards/accuracy_reward": 0.25000000558793545, "rewards/format_reward": 0.4270833432674408, "step": 96, "w_high_ratio": 0.1767394095659256, "w_low_ratio": 0.046211169101297855, "w_max": 2.1192705631256104, "w_mean": 1.334891527891159, "w_min": 2.0274371819188836e-36, "w_std": 0.27778077498078346 }, { "completion_length": 3338.3958740234375, "cov_mean": 4.9627053158474155e-05, "cov_std": 0.3888591527938843, "entropy": 0.41650390625, "epoch": 0.11085714285714286, "grad_norm": 0.41793736815452576, "kl": 0.0014972686767578125, "learning_rate": 4.1094235253127374e-07, "loss": -0.0162, "reward": 0.5520833563059568, "reward_std": 0.5027666687965393, "rewards/accuracy_reward": 0.20833334419876337, "rewards/format_reward": 0.3437500176951289, "step": 97, "w_high_ratio": 0.036094631999731064, "w_low_ratio": 0.03847068129107356, "w_max": 1.7370452582836151, "w_mean": 1.2170022130012512, "w_min": 0.0, "w_std": 0.2471493650227785 }, { "completion_length": 3080.9584350585938, "cov_mean": -3.7720649288530694e-05, "cov_std": 0.35732631012797356, "entropy": 0.37109375, "epoch": 0.112, "grad_norm": 0.4429682493209839, "kl": 0.0007581710815429688, "learning_rate": 4.0102146195176887e-07, "loss": -0.087, "reward": 0.614583358168602, "reward_std": 0.3441091701388359, "rewards/accuracy_reward": 0.15625000558793545, "rewards/format_reward": 0.4583333507180214, "step": 98, "w_high_ratio": 0.07888161391019821, "w_low_ratio": 0.0333328228443861, "w_max": 1.875957041978836, "w_mean": 1.2836172580718994, "w_min": 0.0, "w_std": 0.24231520667672157 }, { "completion_length": 3051.885467529297, "cov_mean": -8.294958661281271e-06, "cov_std": 0.13039003312587738, "entropy": 0.342529296875, "epoch": 0.11314285714285714, "grad_norm": 0.09671887010335922, "kl": 0.0012235641479492188, "learning_rate": 3.911812458787591e-07, "loss": 0.015, "reward": 0.5520833395421505, "reward_std": 0.22183798253536224, "rewards/accuracy_reward": 0.20833333395421505, "rewards/format_reward": 0.34375000558793545, "step": 99, "w_high_ratio": 0.0, "w_low_ratio": 0.016014322638511658, "w_max": 1.2210949659347534, "w_mean": 1.0700498223304749, "w_min": 0.5234909653663635, "w_std": 0.06671209260821342 }, { "completion_length": 2941.4271850585938, "cov_mean": 3.641827424871735e-05, "cov_std": 0.311247356235981, "entropy": 0.352783203125, "epoch": 0.11428571428571428, "grad_norm": 0.40575236082077026, "kl": 0.0022411346435546875, "learning_rate": 3.8142703296283953e-07, "loss": -0.0488, "reward": 0.8229167088866234, "reward_std": 0.4384620487689972, "rewards/accuracy_reward": 0.3020833507180214, "rewards/format_reward": 0.5208333507180214, "step": 100, "w_high_ratio": 0.057203881442546844, "w_low_ratio": 0.037438319995999336, "w_max": 2.0794378519058228, "w_mean": 1.3217148184776306, "w_min": 0.0, "w_std": 0.19731487706303596 }, { "completion_length": 3064.322967529297, "cov_mean": -0.00016853955486340055, "cov_std": 0.3906340226531029, "entropy": 0.49560546875, "epoch": 0.11542857142857142, "grad_norm": 0.3836102783679962, "kl": 0.0015306472778320312, "learning_rate": 3.7176410528237945e-07, "loss": 0.0071, "reward": 0.6041666716337204, "reward_std": 0.37709444761276245, "rewards/accuracy_reward": 0.20833334419876337, "rewards/format_reward": 0.3958333469927311, "step": 101, "w_high_ratio": 0.039534129202365875, "w_low_ratio": 0.04537520185112953, "w_max": 1.8732931017875671, "w_mean": 1.2176434397697449, "w_min": 1.2642545573704715e-38, "w_std": 0.2533961348235607 }, { "completion_length": 2447.7188415527344, "cov_mean": 6.931000660870268e-05, "cov_std": 0.49404649436473846, "entropy": 0.41943359375, "epoch": 0.11657142857142858, "grad_norm": 0.8746929168701172, "kl": 0.0036649703979492188, "learning_rate": 3.62197695483182e-07, "loss": -0.1189, "reward": 0.8541666939854622, "reward_std": 0.36090725660324097, "rewards/accuracy_reward": 0.15625000279396772, "rewards/format_reward": 0.6979166865348816, "step": 102, "w_high_ratio": 0.21037982031702995, "w_low_ratio": 0.04308201279491186, "w_max": 3.2181393206119537, "w_mean": 1.75843146443367, "w_min": 0.0, "w_std": 0.38950372859835625 }, { "completion_length": 2925.8646850585938, "cov_mean": 7.4649460657383315e-06, "cov_std": 0.41748112440109253, "entropy": 0.384765625, "epoch": 0.11771428571428572, "grad_norm": 0.6049470901489258, "kl": 0.0027923583984375, "learning_rate": 3.5273298394491515e-07, "loss": -0.0624, "reward": 0.8125000298023224, "reward_std": 0.4502665400505066, "rewards/accuracy_reward": 0.21875, "rewards/format_reward": 0.5937500149011612, "step": 103, "w_high_ratio": 0.21904528141021729, "w_low_ratio": 0.04505133908241987, "w_max": 2.339956372976303, "w_mean": 1.4152106940746307, "w_min": 0.0, "w_std": 0.29690178483724594 }, { "completion_length": 3003.3230590820312, "cov_mean": -3.357986315677408e-07, "cov_std": 0.30858776718378067, "entropy": 0.4453125, "epoch": 0.11885714285714286, "grad_norm": 1.4959157705307007, "kl": 0.016025543212890625, "learning_rate": 3.433750959758446e-07, "loss": -0.0262, "reward": 0.5833333432674408, "reward_std": 0.418441042304039, "rewards/accuracy_reward": 0.1666666716337204, "rewards/format_reward": 0.4166666828095913, "step": 104, "w_high_ratio": 0.11394603550434113, "w_low_ratio": 0.0399134736508131, "w_max": 1.7974587678909302, "w_mean": 1.2668271660804749, "w_min": 0.25, "w_std": 0.18237562477588654 }, { "completion_length": 2974.947998046875, "cov_mean": -6.136376214271877e-06, "cov_std": 0.4218733385205269, "entropy": 0.38671875, "epoch": 0.12, "grad_norm": 0.5280556678771973, "kl": 0.004073143005371094, "learning_rate": 3.3412909903738936e-07, "loss": -0.0267, "reward": 0.8020833535119891, "reward_std": 0.6002020314335823, "rewards/accuracy_reward": 0.322916679084301, "rewards/format_reward": 0.4791666818782687, "step": 105, "w_high_ratio": 0.07510412856936455, "w_low_ratio": 0.05238847387954593, "w_max": 2.132063180208206, "w_mean": 1.2634376883506775, "w_min": 0.0, "w_std": 0.2662976738065481 }, { "completion_length": 2262.4375915527344, "cov_mean": 7.913076842669398e-07, "cov_std": 0.2457173652946949, "entropy": 0.291748046875, "epoch": 0.12114285714285715, "grad_norm": 0.2537207007408142, "kl": 0.00292205810546875, "learning_rate": 3.250000000000001e-07, "loss": 0.0557, "reward": 1.0833333535119891, "reward_std": 0.31829095631837845, "rewards/accuracy_reward": 0.4270833395421505, "rewards/format_reward": 0.6562500251457095, "step": 106, "w_high_ratio": 0.05819880962371826, "w_low_ratio": 0.029597220942378044, "w_max": 2.0277227461338043, "w_mean": 1.3821330666542053, "w_min": 0.0, "w_std": 0.1694270297884941 }, { "completion_length": 3115.3021240234375, "cov_mean": -5.469087955134455e-05, "cov_std": 0.3848882205784321, "entropy": 0.5244140625, "epoch": 0.12228571428571429, "grad_norm": 0.38348808884620667, "kl": 0.0030164718627929688, "learning_rate": 3.159927424318531e-07, "loss": -0.0623, "reward": 0.6250000251457095, "reward_std": 0.3455836847424507, "rewards/accuracy_reward": 0.1666666716337204, "rewards/format_reward": 0.4583333535119891, "step": 107, "w_high_ratio": 0.16982319951057434, "w_low_ratio": 0.04572468576952815, "w_max": 1.996431291103363, "w_mean": 1.4110250174999237, "w_min": 0.0, "w_std": 0.2217676378786564 }, { "completion_length": 3326.885498046875, "cov_mean": -3.166737906212802e-05, "cov_std": 0.34976962953805923, "entropy": 0.42529296875, "epoch": 0.12342857142857143, "grad_norm": 0.4025486409664154, "kl": 0.0019197463989257812, "learning_rate": 3.0711220392181934e-07, "loss": 0.0339, "reward": 0.5104166716337204, "reward_std": 0.45747723430395126, "rewards/accuracy_reward": 0.1875000074505806, "rewards/format_reward": 0.3229166716337204, "step": 108, "w_high_ratio": 0.05910671502351761, "w_low_ratio": 0.04742087051272392, "w_max": 1.6517033874988556, "w_mean": 1.2254261672496796, "w_min": 0.25, "w_std": 0.23562095686793327 }, { "completion_length": 3175.3541870117188, "cov_mean": 7.027760148048401e-06, "cov_std": 0.17652258835732937, "entropy": 0.388671875, "epoch": 0.12457142857142857, "grad_norm": 0.47681912779808044, "kl": 0.0006990432739257812, "learning_rate": 2.9836319343816397e-07, "loss": -0.0293, "reward": 0.4895833460614085, "reward_std": 0.20087094604969025, "rewards/accuracy_reward": 0.12500000279396772, "rewards/format_reward": 0.3645833386108279, "step": 109, "w_high_ratio": 0.11448103934526443, "w_low_ratio": 0.018031115527264774, "w_max": 1.5564889311790466, "w_mean": 1.2944257855415344, "w_min": 0.25, "w_std": 0.10801565833389759 }, { "completion_length": 2903.9375610351562, "cov_mean": 2.2163485482451506e-05, "cov_std": 0.3046695999801159, "entropy": 0.37548828125, "epoch": 0.12571428571428572, "grad_norm": 0.574763298034668, "kl": 0.0011281967163085938, "learning_rate": 2.897504487244061e-07, "loss": 0.0666, "reward": 0.6458333656191826, "reward_std": 0.4784049317240715, "rewards/accuracy_reward": 0.16666667256504297, "rewards/format_reward": 0.479166679084301, "step": 110, "w_high_ratio": 0.11397865414619446, "w_low_ratio": 0.036777073866687715, "w_max": 1.7808756828308105, "w_mean": 1.2882322669029236, "w_min": 0.0, "w_std": 0.17456290312111378 }, { "completion_length": 3608.7813110351562, "cov_mean": 2.5331736196676502e-05, "cov_std": 0.3403046578168869, "entropy": 0.54296875, "epoch": 0.12685714285714286, "grad_norm": 0.28652071952819824, "kl": 0.0020961761474609375, "learning_rate": 2.812786337337463e-07, "loss": -0.0593, "reward": 0.4479166716337204, "reward_std": 0.5819729715585709, "rewards/accuracy_reward": 0.19791667442768812, "rewards/format_reward": 0.25000000558793545, "step": 111, "w_high_ratio": 0.0, "w_low_ratio": 0.040594917722046375, "w_max": 1.5843260884284973, "w_mean": 1.1159851551055908, "w_min": 0.0, "w_std": 0.1932711023837328 }, { "completion_length": 3511.9791870117188, "cov_mean": -8.262183837359771e-05, "cov_std": 0.3419903479516506, "entropy": 0.466796875, "epoch": 0.128, "grad_norm": 0.21528662741184235, "kl": 0.0010385513305664062, "learning_rate": 2.729523361034538e-07, "loss": 0.0036, "reward": 0.7395833432674408, "reward_std": 0.4909324310719967, "rewards/accuracy_reward": 0.3229166716337204, "rewards/format_reward": 0.4166666716337204, "step": 112, "w_high_ratio": 0.0, "w_low_ratio": 0.039533226285129786, "w_max": 1.4701470732688904, "w_mean": 1.1230643689632416, "w_min": 2.796174871268219e-27, "w_std": 0.17366146482527256 }, { "completion_length": 2989.6250610351562, "cov_mean": -9.21895634746761e-05, "cov_std": 0.4826783090829849, "entropy": 0.47412109375, "epoch": 0.12914285714285714, "grad_norm": 0.48934122920036316, "kl": 0.0016803741455078125, "learning_rate": 2.6477606467058035e-07, "loss": -0.0453, "reward": 0.739583358168602, "reward_std": 0.46098607778549194, "rewards/accuracy_reward": 0.22916666977107525, "rewards/format_reward": 0.510416679084301, "step": 113, "w_high_ratio": 0.05894821137189865, "w_low_ratio": 0.05880188010632992, "w_max": 2.289375811815262, "w_mean": 1.3520435392856598, "w_min": 0.0, "w_std": 0.3024050109088421 }, { "completion_length": 2825.6771850585938, "cov_mean": 8.050216638366692e-05, "cov_std": 0.36271025612950325, "entropy": 0.38232421875, "epoch": 0.13028571428571428, "grad_norm": 0.3449488580226898, "kl": 0.006592750549316406, "learning_rate": 2.567542470303452e-07, "loss": -0.0924, "reward": 0.6250000298023224, "reward_std": 0.24164991825819016, "rewards/accuracy_reward": 0.0625, "rewards/format_reward": 0.5625000149011612, "step": 114, "w_high_ratio": 0.04218151792883873, "w_low_ratio": 0.03556477651000023, "w_max": 1.9164948165416718, "w_mean": 1.3027912080287933, "w_min": 0.25, "w_std": 0.2133668176829815 }, { "completion_length": 3166.1146240234375, "cov_mean": 5.38662197868689e-06, "cov_std": 0.25203782320022583, "entropy": 0.43505859375, "epoch": 0.13142857142857142, "grad_norm": 0.26439687609672546, "kl": 0.0024480819702148438, "learning_rate": 2.488912271385139e-07, "loss": -0.0066, "reward": 0.5937500298023224, "reward_std": 0.28200745210051537, "rewards/accuracy_reward": 0.2708333358168602, "rewards/format_reward": 0.3229166716337204, "step": 115, "w_high_ratio": 0.10065623372793198, "w_low_ratio": 0.02666568197309971, "w_max": 1.5669940114021301, "w_mean": 1.224227637052536, "w_min": 0.25, "w_std": 0.1499568410217762 }, { "completion_length": 3656.2604370117188, "cov_mean": -1.988131225516554e-05, "cov_std": 0.35770974680781364, "entropy": 0.5078125, "epoch": 0.13257142857142856, "grad_norm": 0.29910722374916077, "kl": 0.0016193389892578125, "learning_rate": 2.411912629590699e-07, "loss": 0.0044, "reward": 0.2812500027939677, "reward_std": 0.3736678585410118, "rewards/accuracy_reward": 0.0833333358168602, "rewards/format_reward": 0.19791667442768812, "step": 116, "w_high_ratio": 0.055632032454013824, "w_low_ratio": 0.04639715701341629, "w_max": 1.5884924530982971, "w_mean": 1.1101520657539368, "w_min": 0.25, "w_std": 0.2023993842303753 }, { "completion_length": 3502.0833740234375, "cov_mean": 1.3217656032793457e-05, "cov_std": 0.24507246166467667, "entropy": 0.48486328125, "epoch": 0.1337142857142857, "grad_norm": 0.19559521973133087, "kl": 0.0014400482177734375, "learning_rate": 2.336585241584522e-07, "loss": 0.0266, "reward": 0.31250000558793545, "reward_std": 0.2934442162513733, "rewards/accuracy_reward": 0.041666666977107525, "rewards/format_reward": 0.2708333386108279, "step": 117, "w_high_ratio": 0.0, "w_low_ratio": 0.03416412137448788, "w_max": 1.3934015035629272, "w_mean": 1.121408373117447, "w_min": 0.25, "w_std": 0.12930788472294807 }, { "completion_length": 3388.2291870117188, "cov_mean": 8.877824029696058e-05, "cov_std": 0.4343739002943039, "entropy": 0.39794921875, "epoch": 0.13485714285714287, "grad_norm": 0.2352270781993866, "kl": 0.001239776611328125, "learning_rate": 2.2629708984760706e-07, "loss": -0.001, "reward": 0.8750000596046448, "reward_std": 0.7063143625855446, "rewards/accuracy_reward": 0.4166666865348816, "rewards/format_reward": 0.458333358168602, "step": 118, "w_high_ratio": 0.025391947478055954, "w_low_ratio": 0.0545379314571619, "w_max": 1.6127934455871582, "w_mean": 1.1610458493232727, "w_min": 0.0, "w_std": 0.23880053497850895 }, { "completion_length": 2543.125030517578, "cov_mean": -5.281608531504389e-05, "cov_std": 0.3761717230081558, "entropy": 0.46533203125, "epoch": 0.136, "grad_norm": 0.4445283114910126, "kl": 0.0053043365478515625, "learning_rate": 2.1911094637307714e-07, "loss": -0.0188, "reward": 0.7812500298023224, "reward_std": 0.3462969809770584, "rewards/accuracy_reward": 0.2083333358168602, "rewards/format_reward": 0.5729166865348816, "step": 119, "w_high_ratio": 0.14202763978391886, "w_low_ratio": 0.04364374093711376, "w_max": 2.151145786046982, "w_mean": 1.4133342802524567, "w_min": 0.0, "w_std": 0.27028138749301434 }, { "completion_length": 2731.8021240234375, "cov_mean": -1.2950695236213505e-06, "cov_std": 0.40660279989242554, "entropy": 0.45849609375, "epoch": 0.13714285714285715, "grad_norm": 0.44800451397895813, "kl": 0.0019931793212890625, "learning_rate": 2.1210398515832536e-07, "loss": 0.0072, "reward": 0.6770833432674408, "reward_std": 0.37227439880371094, "rewards/accuracy_reward": 0.17708333395421505, "rewards/format_reward": 0.5000000074505806, "step": 120, "w_high_ratio": 0.06295246630907059, "w_low_ratio": 0.051405247300863266, "w_max": 1.8777723908424377, "w_mean": 1.2852334678173065, "w_min": 0.0, "w_std": 0.24544718861579895 }, { "completion_length": 1963.3125610351562, "cov_mean": -5.540308461604582e-05, "cov_std": 0.36702772229909897, "entropy": 0.38525390625, "epoch": 0.1382857142857143, "grad_norm": 0.5723959803581238, "kl": 0.00885772705078125, "learning_rate": 2.0528000059645995e-07, "loss": -0.076, "reward": 1.0104167014360428, "reward_std": 0.406433891505003, "rewards/accuracy_reward": 0.2291666679084301, "rewards/format_reward": 0.7812500149011612, "step": 121, "w_high_ratio": 0.25104042887687683, "w_low_ratio": 0.04047479620203376, "w_max": 2.346793830394745, "w_mean": 1.5671572387218475, "w_min": 0.0, "w_std": 0.25984594970941544 }, { "completion_length": 3143.0208740234375, "cov_mean": 1.6946690038821544e-05, "cov_std": 0.2671542540192604, "entropy": 0.453125, "epoch": 0.13942857142857143, "grad_norm": 0.1998731791973114, "kl": 0.0013976097106933594, "learning_rate": 1.986426879955034e-07, "loss": -0.0537, "reward": 0.7812500149011612, "reward_std": 0.4281647428870201, "rewards/accuracy_reward": 0.3020833469927311, "rewards/format_reward": 0.479166679084301, "step": 122, "w_high_ratio": 0.01932075247168541, "w_low_ratio": 0.030062017496675253, "w_max": 1.5524874925613403, "w_mean": 1.134686678647995, "w_min": 0.0, "w_std": 0.1492646411061287 }, { "completion_length": 3167.3959350585938, "cov_mean": -5.797519952466246e-05, "cov_std": 0.44271689653396606, "entropy": 0.4228515625, "epoch": 0.14057142857142857, "grad_norm": 0.32559123635292053, "kl": 0.001354217529296875, "learning_rate": 1.9219564157731844e-07, "loss": -0.0282, "reward": 0.6562500260770321, "reward_std": 0.49003005772829056, "rewards/accuracy_reward": 0.1875, "rewards/format_reward": 0.4687500111758709, "step": 123, "w_high_ratio": 0.03850603708997369, "w_low_ratio": 0.054318103939294815, "w_max": 1.9482422471046448, "w_mean": 1.2491904497146606, "w_min": 0.0, "w_std": 0.273440919816494 }, { "completion_length": 2679.291748046875, "cov_mean": 4.984476254321635e-05, "cov_std": 0.3510932922363281, "entropy": 0.32421875, "epoch": 0.1417142857142857, "grad_norm": 0.5072447061538696, "kl": 0.0061626434326171875, "learning_rate": 1.8594235253127372e-07, "loss": 0.0459, "reward": 0.8645833730697632, "reward_std": 0.44617248326539993, "rewards/accuracy_reward": 0.29166667722165585, "rewards/format_reward": 0.5729166865348816, "step": 124, "w_high_ratio": 0.0684627341106534, "w_low_ratio": 0.03564309095963836, "w_max": 2.0593042075634003, "w_mean": 1.2975650131702423, "w_min": 0.0, "w_std": 0.22546635568141937 }, { "completion_length": 2886.1666870117188, "cov_mean": -3.579185113267158e-05, "cov_std": 0.20644951611757278, "entropy": 0.365478515625, "epoch": 0.14285714285714285, "grad_norm": 1.0346649885177612, "kl": 0.001129150390625, "learning_rate": 1.7988620712370195e-07, "loss": -0.0434, "reward": 0.6041666772216558, "reward_std": 0.16948115080595016, "rewards/accuracy_reward": 0.23958333861082792, "rewards/format_reward": 0.3645833386108279, "step": 125, "w_high_ratio": 0.10017836093902588, "w_low_ratio": 0.013640805147588253, "w_max": 1.8374318480491638, "w_mean": 1.238489419221878, "w_min": 0.25, "w_std": 0.1324586421251297 }, { "completion_length": 3072.1355590820312, "cov_mean": -7.128902507247403e-05, "cov_std": 0.268012635409832, "entropy": 0.42138671875, "epoch": 0.144, "grad_norm": 0.24976088106632233, "kl": 0.001007080078125, "learning_rate": 1.7403048486417868e-07, "loss": -0.003, "reward": 0.7604166939854622, "reward_std": 0.3946729302406311, "rewards/accuracy_reward": 0.2395833395421505, "rewards/format_reward": 0.5208333432674408, "step": 126, "w_high_ratio": 0.0, "w_low_ratio": 0.02901885099709034, "w_max": 1.5019842684268951, "w_mean": 1.154728651046753, "w_min": 0.25, "w_std": 0.13917932659387589 }, { "completion_length": 3639.2084350585938, "cov_mean": -4.922010657537612e-05, "cov_std": 0.40233808010816574, "entropy": 0.4443359375, "epoch": 0.14514285714285713, "grad_norm": 0.2971266508102417, "kl": 0.001190185546875, "learning_rate": 1.6837835672960831e-07, "loss": 0.0023, "reward": 0.291666679084301, "reward_std": 0.3814757987856865, "rewards/accuracy_reward": 0.031250000931322575, "rewards/format_reward": 0.2604166716337204, "step": 127, "w_high_ratio": 0.0, "w_low_ratio": 0.048589578829705715, "w_max": 1.546887069940567, "w_mean": 1.0961028933525085, "w_min": 0.0, "w_std": 0.21367743983864784 }, { "completion_length": 3072.729248046875, "cov_mean": 2.080060630760272e-05, "cov_std": 0.1757410392165184, "entropy": 0.4794921875, "epoch": 0.1462857142857143, "grad_norm": 0.20411358773708344, "kl": 0.0022602081298828125, "learning_rate": 1.6293288344708566e-07, "loss": -0.0426, "reward": 0.8854166865348816, "reward_std": 0.23283471912145615, "rewards/accuracy_reward": 0.416666679084301, "rewards/format_reward": 0.46875, "step": 128, "w_high_ratio": 0.0, "w_low_ratio": 0.023187558632344007, "w_max": 1.2965095043182373, "w_mean": 1.0975826382637024, "w_min": 0.5, "w_std": 0.09349002316594124 }, { "completion_length": 3820.5000610351562, "cov_mean": -7.538520003436133e-05, "cov_std": 0.28777727484703064, "entropy": 0.4658203125, "epoch": 0.14742857142857144, "grad_norm": 0.15165992081165314, "kl": 0.0022630691528320312, "learning_rate": 1.5769701383645698e-07, "loss": 0.0248, "reward": 0.3125, "reward_std": 0.40841156244277954, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.1875, "step": 129, "w_high_ratio": 0.0, "w_low_ratio": 0.03958333469927311, "w_max": 1.3145957589149475, "w_mean": 1.0334198474884033, "w_min": 0.25, "w_std": 0.15427661687135696 }, { "completion_length": 3234.9375610351562, "cov_mean": -4.075149445270654e-05, "cov_std": 0.40202101692557335, "entropy": 0.4580078125, "epoch": 0.14857142857142858, "grad_norm": 0.5406985282897949, "kl": 0.0025501251220703125, "learning_rate": 1.5267358321348285e-07, "loss": 0.0054, "reward": 0.43750001676380634, "reward_std": 0.48847879469394684, "rewards/accuracy_reward": 0.13541666697710752, "rewards/format_reward": 0.30208334140479565, "step": 130, "w_high_ratio": 0.1068541444838047, "w_low_ratio": 0.04362851567566395, "w_max": 2.2782379388809204, "w_mean": 1.2775286734104156, "w_min": 0.0, "w_std": 0.27442070841789246 }, { "completion_length": 2971.3751220703125, "cov_mean": 7.5797214776685e-05, "cov_std": 0.296669140458107, "entropy": 0.42236328125, "epoch": 0.14971428571428572, "grad_norm": 0.3472649157047272, "kl": 0.0027103424072265625, "learning_rate": 1.4786531185446452e-07, "loss": -0.0555, "reward": 0.7812500223517418, "reward_std": 0.35348715633153915, "rewards/accuracy_reward": 0.3437500074505806, "rewards/format_reward": 0.4375000074505806, "step": 131, "w_high_ratio": 0.03943436220288277, "w_low_ratio": 0.02768976055085659, "w_max": 1.649653136730194, "w_mean": 1.195732295513153, "w_min": 0.25, "w_std": 0.17519061639904976 }, { "completion_length": 3084.197998046875, "cov_mean": 8.538130737179017e-05, "cov_std": 0.4518180638551712, "entropy": 0.4111328125, "epoch": 0.15085714285714286, "grad_norm": 0.3087000846862793, "kl": 0.0020132064819335938, "learning_rate": 1.432748035231658e-07, "loss": 0.0503, "reward": 0.6458333507180214, "reward_std": 0.5204262509942055, "rewards/accuracy_reward": 0.2500000074505806, "rewards/format_reward": 0.3958333432674408, "step": 132, "w_high_ratio": 0.16728225722908974, "w_low_ratio": 0.057774459943175316, "w_max": 2.160991072654724, "w_mean": 1.3055765330791473, "w_min": 0.0, "w_std": 0.2905358038842678 }, { "completion_length": 3655.4166870117188, "cov_mean": 7.652423300896771e-05, "cov_std": 0.2661324590444565, "entropy": 0.49365234375, "epoch": 0.152, "grad_norm": 0.18705269694328308, "kl": 0.001617431640625, "learning_rate": 1.3890454406082956e-07, "loss": 0.075, "reward": 0.3645833358168602, "reward_std": 0.23336705565452576, "rewards/accuracy_reward": 0.1666666679084301, "rewards/format_reward": 0.1979166679084301, "step": 133, "w_high_ratio": 0.0, "w_low_ratio": 0.04366964288055897, "w_max": 1.3597778677940369, "w_mean": 1.095180168747902, "w_min": 0.5, "w_std": 0.1226506233215332 }, { "completion_length": 2935.791748046875, "cov_mean": 4.5339866119320504e-05, "cov_std": 0.3691224604845047, "entropy": 0.47900390625, "epoch": 0.15314285714285714, "grad_norm": 0.3127249777317047, "kl": 0.0026226043701171875, "learning_rate": 1.3475690004005097e-07, "loss": 0.0052, "reward": 0.739583358168602, "reward_std": 0.5291576012969017, "rewards/accuracy_reward": 0.25000000931322575, "rewards/format_reward": 0.4895833507180214, "step": 134, "w_high_ratio": 0.0, "w_low_ratio": 0.049329387256875634, "w_max": 1.5319222509860992, "w_mean": 1.2102845907211304, "w_min": 0.0, "w_std": 0.20571278221905231 }, { "completion_length": 2150.0625610351562, "cov_mean": -6.792098974983674e-05, "cov_std": 0.41625837981700897, "entropy": 0.40966796875, "epoch": 0.15428571428571428, "grad_norm": 0.5556284189224243, "kl": 0.00433349609375, "learning_rate": 1.308341174832359e-07, "loss": -0.0542, "reward": 1.2187500149011612, "reward_std": 0.4045410081744194, "rewards/accuracy_reward": 0.5312500074505806, "rewards/format_reward": 0.6875000074505806, "step": 135, "w_high_ratio": 0.1767500415444374, "w_low_ratio": 0.04722660221159458, "w_max": 2.584423005580902, "w_mean": 1.3822406232357025, "w_min": 3.783363098268088e-33, "w_std": 0.2943294197320938 }, { "completion_length": 3219.2188110351562, "cov_mean": 7.927787930839258e-05, "cov_std": 0.36336907744407654, "entropy": 0.36572265625, "epoch": 0.15542857142857142, "grad_norm": 0.4164927303791046, "kl": 0.0019321441650390625, "learning_rate": 1.2713832064634125e-07, "loss": -0.1128, "reward": 0.9062500298023224, "reward_std": 0.520418331027031, "rewards/accuracy_reward": 0.4375000074505806, "rewards/format_reward": 0.4687500149011612, "step": 136, "w_high_ratio": 0.09939645975828171, "w_low_ratio": 0.0331453662365675, "w_max": 1.8468493521213531, "w_mean": 1.2761128842830658, "w_min": 0.25, "w_std": 0.22711537778377533 }, { "completion_length": 3419.6458740234375, "cov_mean": -3.98908814531751e-05, "cov_std": 0.4879545792937279, "entropy": 0.36328125, "epoch": 0.15657142857142858, "grad_norm": 0.6263577938079834, "kl": 0.0011224746704101562, "learning_rate": 1.2367151086855187e-07, "loss": 0.0354, "reward": 0.4479166716337204, "reward_std": 0.5504394620656967, "rewards/accuracy_reward": 0.10416667070239782, "rewards/format_reward": 0.3437500074505806, "step": 137, "w_high_ratio": 0.040824100375175476, "w_low_ratio": 0.054094865918159485, "w_max": 1.8979838490486145, "w_mean": 1.1621877253055573, "w_min": 0.0, "w_std": 0.2735421061515808 }, { "completion_length": 3128.0626220703125, "cov_mean": -2.278413830936188e-05, "cov_std": 0.27040576189756393, "entropy": 0.38330078125, "epoch": 0.15771428571428572, "grad_norm": 0.9205570816993713, "kl": 0.049633026123046875, "learning_rate": 1.2043556548852063e-07, "loss": 0.036, "reward": 0.5520833358168602, "reward_std": 0.3116639107465744, "rewards/accuracy_reward": 0.1666666716337204, "rewards/format_reward": 0.385416679084301, "step": 138, "w_high_ratio": 0.0, "w_low_ratio": 0.038355547934770584, "w_max": 1.4525592625141144, "w_mean": 1.1665604412555695, "w_min": 0.25, "w_std": 0.14510583132505417 }, { "completion_length": 3528.5313110351562, "cov_mean": 3.168399962305557e-05, "cov_std": 0.5140600129961967, "entropy": 0.498046875, "epoch": 0.15885714285714286, "grad_norm": 0.4198448061943054, "kl": 0.0019092559814453125, "learning_rate": 1.1743223682775649e-07, "loss": -0.0322, "reward": 0.5104166865348816, "reward_std": 0.562984399497509, "rewards/accuracy_reward": 0.13541666977107525, "rewards/format_reward": 0.3750000186264515, "step": 139, "w_high_ratio": 0.04249488562345505, "w_low_ratio": 0.06252239271998405, "w_max": 1.7476003468036652, "w_mean": 1.1608193814754486, "w_min": 0.0, "w_std": 0.2752341143786907 }, { "completion_length": 3422.666748046875, "cov_mean": -5.857029464095831e-05, "cov_std": 0.4820387288928032, "entropy": 0.51123046875, "epoch": 0.16, "grad_norm": 0.747706413269043, "kl": 0.00482940673828125, "learning_rate": 1.1466315124171128e-07, "loss": -0.0162, "reward": 0.4791666865348816, "reward_std": 0.521266907453537, "rewards/accuracy_reward": 0.18750000186264515, "rewards/format_reward": 0.2916666753590107, "step": 140, "w_high_ratio": 0.09309478104114532, "w_low_ratio": 0.05631279572844505, "w_max": 2.1824983656406403, "w_mean": 1.2857783138751984, "w_min": 4.5542200090556555e-45, "w_std": 0.3347093164920807 }, { "completion_length": 3233.166748046875, "cov_mean": 2.940024387498852e-06, "cov_std": 0.45166684687137604, "entropy": 0.40185546875, "epoch": 0.16114285714285714, "grad_norm": 0.3896685838699341, "kl": 0.0029087066650390625, "learning_rate": 1.1212980823907929e-07, "loss": -0.0202, "reward": 0.6875000149011612, "reward_std": 0.5667420700192451, "rewards/accuracy_reward": 0.21875000651925802, "rewards/format_reward": 0.4687500149011612, "step": 141, "w_high_ratio": 0.04078603908419609, "w_low_ratio": 0.0461601298302412, "w_max": 1.661450743675232, "w_mean": 1.181685209274292, "w_min": 0.0, "w_std": 0.24826455861330032 }, { "completion_length": 3014.7084045410156, "cov_mean": 1.010456662697834e-05, "cov_std": 0.35788750648498535, "entropy": 0.43212890625, "epoch": 0.16228571428571428, "grad_norm": 0.37326905131340027, "kl": 0.0032243728637695312, "learning_rate": 1.0983357966978745e-07, "loss": 0.0113, "reward": 0.8020833730697632, "reward_std": 0.4403490200638771, "rewards/accuracy_reward": 0.23958334233611822, "rewards/format_reward": 0.5625000074505806, "step": 142, "w_high_ratio": 0.07575653120875359, "w_low_ratio": 0.04324484569951892, "w_max": 1.8279287815093994, "w_mean": 1.3297627568244934, "w_min": 0.0, "w_std": 0.21703040227293968 }, { "completion_length": 3143.479248046875, "cov_mean": 4.2679124817368574e-05, "cov_std": 0.5287895128130913, "entropy": 0.4609375, "epoch": 0.16342857142857142, "grad_norm": 0.8503015637397766, "kl": 0.004160881042480469, "learning_rate": 1.0777570898211405e-07, "loss": -0.0551, "reward": 0.5208333507180214, "reward_std": 0.4103339910507202, "rewards/accuracy_reward": 0.1041666716337204, "rewards/format_reward": 0.416666679084301, "step": 143, "w_high_ratio": 0.06985687837004662, "w_low_ratio": 0.06702116876840591, "w_max": 2.2525435388088226, "w_mean": 1.2808727622032166, "w_min": 0.0, "w_std": 0.33132829889655113 }, { "completion_length": 3187.3750610351562, "cov_mean": -3.3114460165961646e-05, "cov_std": 0.4341953620314598, "entropy": 0.40185546875, "epoch": 0.16457142857142856, "grad_norm": 0.36935943365097046, "kl": 0.0021028518676757812, "learning_rate": 1.0595731054933934e-07, "loss": 0.0152, "reward": 0.6250000223517418, "reward_std": 0.54064517095685, "rewards/accuracy_reward": 0.27083333395421505, "rewards/format_reward": 0.3541666716337204, "step": 144, "w_high_ratio": 0.0, "w_low_ratio": 0.04692948702722788, "w_max": 1.7171835899353027, "w_mean": 1.1371115744113922, "w_min": 0.0, "w_std": 0.24135740101337433 }, { "completion_length": 2363.416748046875, "cov_mean": 7.833678682800382e-05, "cov_std": 0.23034526035189629, "entropy": 0.3486328125, "epoch": 0.1657142857142857, "grad_norm": 0.32663026452064514, "kl": 0.0025043487548828125, "learning_rate": 1.0437936906629334e-07, "loss": 0.0397, "reward": 0.916666692122817, "reward_std": 0.28357625752687454, "rewards/accuracy_reward": 0.3125000111758709, "rewards/format_reward": 0.6041666772216558, "step": 145, "w_high_ratio": 0.0, "w_low_ratio": 0.027998102828860283, "w_max": 1.3861334919929504, "w_mean": 1.1757619678974152, "w_min": 0.2742290794849396, "w_std": 0.13859782367944717 }, { "completion_length": 2963.791748046875, "cov_mean": 3.222269015168422e-05, "cov_std": 0.2941744774580002, "entropy": 0.43359375, "epoch": 0.16685714285714287, "grad_norm": 0.466317743062973, "kl": 0.0017948150634765625, "learning_rate": 1.0304273901612565e-07, "loss": -0.0398, "reward": 0.5312500074505806, "reward_std": 0.31367800384759903, "rewards/accuracy_reward": 0.0625, "rewards/format_reward": 0.4687500074505806, "step": 146, "w_high_ratio": 0.08747749403119087, "w_low_ratio": 0.03374567721039057, "w_max": 2.10350301861763, "w_mean": 1.2755134403705597, "w_min": 0.25, "w_std": 0.2049087956547737 }, { "completion_length": 3316.1875610351562, "cov_mean": -2.5228303002222674e-07, "cov_std": 0.2783627863973379, "entropy": 0.42431640625, "epoch": 0.168, "grad_norm": 0.2964071035385132, "kl": 0.00269317626953125, "learning_rate": 1.0194814420758804e-07, "loss": -0.0063, "reward": 0.4687500102445483, "reward_std": 0.41155891865491867, "rewards/accuracy_reward": 0.1562500037252903, "rewards/format_reward": 0.3125000102445483, "step": 147, "w_high_ratio": 0.08768598735332489, "w_low_ratio": 0.027966859750449657, "w_max": 1.8361600935459137, "w_mean": 1.2343448996543884, "w_min": 0.0, "w_std": 0.19136979151517153 }, { "completion_length": 2667.0625, "cov_mean": 6.741791366948746e-05, "cov_std": 0.2421601451933384, "entropy": 0.3486328125, "epoch": 0.16914285714285715, "grad_norm": 0.5170913338661194, "kl": 0.0018463134765625, "learning_rate": 1.0109617738307911e-07, "loss": -0.0546, "reward": 0.708333358168602, "reward_std": 0.22040386497974396, "rewards/accuracy_reward": 0.1666666679084301, "rewards/format_reward": 0.5416666865348816, "step": 148, "w_high_ratio": 0.10121732205152512, "w_low_ratio": 0.022710513323545456, "w_max": 2.0310742557048798, "w_mean": 1.4218811392784119, "w_min": 0.25, "w_std": 0.20712272450327873 }, { "completion_length": 3187.947998046875, "cov_mean": -3.065193595830351e-05, "cov_std": 0.278855100274086, "entropy": 0.34326171875, "epoch": 0.1702857142857143, "grad_norm": 0.3262748122215271, "kl": 0.011348247528076172, "learning_rate": 1.0048729989766394e-07, "loss": -0.0407, "reward": 0.656250013038516, "reward_std": 0.2951196879148483, "rewards/accuracy_reward": 0.1979166716337204, "rewards/format_reward": 0.45833334140479565, "step": 149, "w_high_ratio": 0.006521851755678654, "w_low_ratio": 0.021616162732243538, "w_max": 1.6156696677207947, "w_mean": 1.1841600239276886, "w_min": 0.0, "w_std": 0.13553307205438614 }, { "completion_length": 3107.3125, "cov_mean": 1.709405751171289e-05, "cov_std": 0.34910060465335846, "entropy": 0.3984375, "epoch": 0.17142857142857143, "grad_norm": 0.33226433396339417, "kl": 0.0028133392333984375, "learning_rate": 1.0012184146924223e-07, "loss": 0.0548, "reward": 0.520833358168602, "reward_std": 0.40651097148656845, "rewards/accuracy_reward": 0.13541666977107525, "rewards/format_reward": 0.3854166679084301, "step": 150, "w_high_ratio": 0.05817456915974617, "w_low_ratio": 0.04275808576494455, "w_max": 1.5990401804447174, "w_mean": 1.167884886264801, "w_min": 0.25, "w_std": 0.18246712163090706 }, { "epoch": 0.17142857142857143, "step": 150, "total_flos": 0.0, "train_loss": -0.007013439348277946, "train_runtime": 12590.1794, "train_samples_per_second": 1.144, "train_steps_per_second": 0.012 } ], "logging_steps": 1, "max_steps": 150, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 50, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 6, "trial_name": null, "trial_params": null }