{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.26763013515321826, "eval_steps": 500, "global_step": 250, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "completion_length": 166.625, "epoch": 0.001070520540612873, "grad_norm": 1.2984755039215088, "kl": 0.0, "learning_rate": 5.319148936170213e-08, "loss": -0.0, "reward": 0.24379686824977398, "reward_std": 0.43905802024528384, "rewards/correctness_reward_func": 0.1875, "rewards/int_reward_func": 0.0546875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0016093750018626451, "step": 1 }, { "completion_length": 155.09375, "epoch": 0.002141041081225746, "grad_norm": 5.433530330657959, "kl": 0.0, "learning_rate": 1.0638297872340426e-07, "loss": -0.0, "reward": 0.7448437176644802, "reward_std": 0.824664918705821, "rewards/correctness_reward_func": 0.59375, "rewards/int_reward_func": 0.1640625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.012968750204890966, "step": 2 }, { "completion_length": 156.65625, "epoch": 0.003211561621838619, "grad_norm": 1.5975662469863892, "kl": 0.0003120364726783009, "learning_rate": 1.5957446808510638e-07, "loss": 0.0, "reward": 0.5617187460884452, "reward_std": 0.6682680626399815, "rewards/correctness_reward_func": 0.40625, "rewards/int_reward_func": 0.1328125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.02265625004656613, "step": 3 }, { "completion_length": 151.328125, "epoch": 0.004282082162451492, "grad_norm": 1.3767573833465576, "kl": 0.00039493154326919466, "learning_rate": 2.1276595744680852e-07, "loss": 0.0, "reward": 0.14101563091389835, "reward_std": 0.40191352693364024, "rewards/correctness_reward_func": 0.125, "rewards/int_reward_func": 0.0546875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.038671874441206455, "step": 4 }, { "completion_length": 143.8125, "epoch": 0.005352602703064365, "grad_norm": 1.7029815912246704, "kl": 0.0003161150925734546, "learning_rate": 2.6595744680851066e-07, "loss": 0.0, "reward": 0.6667499775066972, "reward_std": 0.943404046818614, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.1328125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.03393749863607809, "step": 5 }, { "completion_length": 146.5625, "epoch": 0.006423123243677238, "grad_norm": 4.5992960929870605, "kl": 0.0014150730130495504, "learning_rate": 3.1914893617021275e-07, "loss": 0.0001, "reward": 0.6142812594771385, "reward_std": 0.8342159832827747, "rewards/correctness_reward_func": 0.46875, "rewards/int_reward_func": 0.1328125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.01271874993108213, "step": 6 }, { "completion_length": 163.9375, "epoch": 0.007493643784290111, "grad_norm": 7.240438461303711, "kl": 0.0018134960264433175, "learning_rate": 3.723404255319149e-07, "loss": 0.0001, "reward": 0.24757812730967999, "reward_std": 0.5592395211569965, "rewards/correctness_reward_func": 0.1875, "rewards/int_reward_func": 0.0703125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.010234375484287739, "step": 7 }, { "completion_length": 151.84375, "epoch": 0.008564164324902984, "grad_norm": 4.826539993286133, "kl": 0.0011592731952987378, "learning_rate": 4.2553191489361704e-07, "loss": 0.0, "reward": 0.24482813104987144, "reward_std": 0.4538201582618058, "rewards/correctness_reward_func": 0.15625, "rewards/int_reward_func": 0.046875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.04170312359929085, "step": 8 }, { "completion_length": 148.828125, "epoch": 0.009634684865515858, "grad_norm": 4.097943305969238, "kl": 0.0009884996707114624, "learning_rate": 4.787234042553192e-07, "loss": 0.0, "reward": 0.6986562423408031, "reward_std": 1.089426226913929, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.1171875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.018968748801853508, "step": 9 }, { "completion_length": 160.3125, "epoch": 0.01070520540612873, "grad_norm": 3.2978594303131104, "kl": 0.0008196280577976722, "learning_rate": 5.319148936170213e-07, "loss": 0.0, "reward": 0.48834376223385334, "reward_std": 0.790836479049176, "rewards/correctness_reward_func": 0.375, "rewards/int_reward_func": 0.1171875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.003843750571832061, "step": 10 }, { "completion_length": 146.96875, "epoch": 0.011775725946741603, "grad_norm": 3.537848472595215, "kl": 0.0007083387099555694, "learning_rate": 5.851063829787235e-07, "loss": 0.0, "reward": 0.5805312437005341, "reward_std": 0.7569947894662619, "rewards/correctness_reward_func": 0.46875, "rewards/int_reward_func": 0.1328125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.021031250711530447, "step": 11 }, { "completion_length": 145.890625, "epoch": 0.012846246487354477, "grad_norm": 3.8045260906219482, "kl": 0.0013589818336186, "learning_rate": 6.382978723404255e-07, "loss": 0.0001, "reward": 0.46695311937946826, "reward_std": 0.6352905407547951, "rewards/correctness_reward_func": 0.3125, "rewards/int_reward_func": 0.109375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.04507812508381903, "step": 12 }, { "completion_length": 146.5, "epoch": 0.013916767027967349, "grad_norm": 1.7494301795959473, "kl": 0.000400338125473354, "learning_rate": 6.914893617021278e-07, "loss": 0.0, "reward": 0.5322968787513673, "reward_std": 0.7736401874572039, "rewards/correctness_reward_func": 0.40625, "rewards/int_reward_func": 0.1171875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.00885937490966171, "step": 13 }, { "completion_length": 133.671875, "epoch": 0.014987287568580221, "grad_norm": 1.8054606914520264, "kl": 0.00036212212944519706, "learning_rate": 7.446808510638298e-07, "loss": 0.0, "reward": 0.6521718641743064, "reward_std": 1.0104734068736434, "rewards/correctness_reward_func": 0.53125, "rewards/int_reward_func": 0.1328125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.011890625639352947, "step": 14 }, { "completion_length": 130.0, "epoch": 0.016057808109193095, "grad_norm": 1.5193248987197876, "kl": 0.00036491416904027574, "learning_rate": 7.97872340425532e-07, "loss": 0.0, "reward": 0.5991093763150275, "reward_std": 0.8120721196755767, "rewards/correctness_reward_func": 0.46875, "rewards/int_reward_func": 0.125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.005359375616535544, "step": 15 }, { "completion_length": 162.640625, "epoch": 0.017128328649805968, "grad_norm": 4.938427448272705, "kl": 0.0014603480958612636, "learning_rate": 8.510638297872341e-07, "loss": 0.0001, "reward": 0.2470156280323863, "reward_std": 0.5583461234346032, "rewards/correctness_reward_func": 0.1875, "rewards/int_reward_func": 0.0703125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.010796874179504812, "step": 16 }, { "completion_length": 165.796875, "epoch": 0.01819884919041884, "grad_norm": 3.923428535461426, "kl": 0.001025654159093392, "learning_rate": 9.042553191489363e-07, "loss": 0.0, "reward": 0.5893437387421727, "reward_std": 0.8829143429175019, "rewards/correctness_reward_func": 0.46875, "rewards/int_reward_func": 0.125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.004406251944601536, "step": 17 }, { "completion_length": 152.359375, "epoch": 0.019269369731031716, "grad_norm": 5.248744487762451, "kl": 0.0020197606609144714, "learning_rate": 9.574468085106384e-07, "loss": 0.0001, "reward": 0.46303125098347664, "reward_std": 0.9115365371108055, "rewards/correctness_reward_func": 0.34375, "rewards/int_reward_func": 0.1015625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.017718749470077455, "step": 18 }, { "completion_length": 151.359375, "epoch": 0.020339890271644588, "grad_norm": 1.7978895902633667, "kl": 0.00032554956487729214, "learning_rate": 1.0106382978723404e-06, "loss": 0.0, "reward": 0.48937500442843884, "reward_std": 0.7031663246452808, "rewards/correctness_reward_func": 0.375, "rewards/int_reward_func": 0.109375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.005000000121071935, "step": 19 }, { "completion_length": 175.90625, "epoch": 0.02141041081225746, "grad_norm": 2.1679224967956543, "kl": 0.0004576210667437408, "learning_rate": 1.0638297872340427e-06, "loss": 0.0, "reward": 0.44935936853289604, "reward_std": 0.6872247559949756, "rewards/correctness_reward_func": 0.34375, "rewards/int_reward_func": 0.09375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.011859375052154064, "step": 20 }, { "completion_length": 167.265625, "epoch": 0.022480931352870333, "grad_norm": 1.475581407546997, "kl": 0.0007880991906858981, "learning_rate": 1.1170212765957447e-06, "loss": 0.0, "reward": 0.28507812274619937, "reward_std": 0.5523091573268175, "rewards/correctness_reward_func": 0.21875, "rewards/int_reward_func": 0.0625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.003828124259598553, "step": 21 }, { "completion_length": 153.296875, "epoch": 0.023551451893483205, "grad_norm": 1.6419923305511475, "kl": 0.00039223546627908945, "learning_rate": 1.170212765957447e-06, "loss": 0.0, "reward": 0.6624062322080135, "reward_std": 0.9240671265870333, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.1328125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0295937517657876, "step": 22 }, { "completion_length": 143.515625, "epoch": 0.02462197243409608, "grad_norm": 1.3294110298156738, "kl": 0.00030555322518921457, "learning_rate": 1.223404255319149e-06, "loss": 0.0, "reward": 0.26720312132965773, "reward_std": 0.4894332850817591, "rewards/correctness_reward_func": 0.1875, "rewards/int_reward_func": 0.0546875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.025015624007210135, "step": 23 }, { "completion_length": 148.3125, "epoch": 0.025692492974708953, "grad_norm": 6.915380954742432, "kl": 0.0062694076787011, "learning_rate": 1.276595744680851e-06, "loss": 0.0003, "reward": 0.28068749560043216, "reward_std": 0.5439753192476928, "rewards/correctness_reward_func": 0.1875, "rewards/int_reward_func": 0.0625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.030687499791383743, "step": 24 }, { "completion_length": 145.421875, "epoch": 0.026763013515321826, "grad_norm": 1.5796961784362793, "kl": 0.0008160970010067103, "learning_rate": 1.3297872340425533e-06, "loss": 0.0, "reward": 0.604515643324703, "reward_std": 0.6846362175419927, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.1171875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.049828124698251486, "step": 25 }, { "completion_length": 144.21875, "epoch": 0.027833534055934698, "grad_norm": 3.3064963817596436, "kl": 0.0009157936146948487, "learning_rate": 1.3829787234042555e-06, "loss": 0.0, "reward": 0.5731093874201179, "reward_std": 0.8484273846261203, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.1171875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.018421874032355845, "step": 26 }, { "completion_length": 135.921875, "epoch": 0.02890405459654757, "grad_norm": 1.8949307203292847, "kl": 0.0004942502673657145, "learning_rate": 1.4361702127659578e-06, "loss": 0.0, "reward": 0.6309375101700425, "reward_std": 0.9806207492947578, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.005937499925494194, "step": 27 }, { "completion_length": 153.59375, "epoch": 0.029974575137160443, "grad_norm": 1.7615420818328857, "kl": 0.0004919220991723705, "learning_rate": 1.4893617021276596e-06, "loss": 0.0, "reward": 0.1835937526775524, "reward_std": 0.45627398509532213, "rewards/correctness_reward_func": 0.15625, "rewards/int_reward_func": 0.0546875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.027343749883584678, "step": 28 }, { "completion_length": 164.640625, "epoch": 0.03104509567777332, "grad_norm": 4.3193230628967285, "kl": 0.0013858377351425588, "learning_rate": 1.5425531914893618e-06, "loss": 0.0001, "reward": 0.40034375386312604, "reward_std": 0.7546388749033213, "rewards/correctness_reward_func": 0.28125, "rewards/int_reward_func": 0.109375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.009718750137835741, "step": 29 }, { "completion_length": 128.234375, "epoch": 0.03211561621838619, "grad_norm": 5.113959312438965, "kl": 0.0021859680928173475, "learning_rate": 1.595744680851064e-06, "loss": 0.0001, "reward": 0.8881718653719872, "reward_std": 0.9410315058194101, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.1796875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.02098437474342063, "step": 30 }, { "completion_length": 140.8125, "epoch": 0.03318613675899906, "grad_norm": 7.337815284729004, "kl": 0.00254826245145523, "learning_rate": 1.648936170212766e-06, "loss": 0.0001, "reward": 0.7087812423706055, "reward_std": 0.9508242532610893, "rewards/correctness_reward_func": 0.53125, "rewards/int_reward_func": 0.15625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.021281250286847353, "step": 31 }, { "completion_length": 141.0625, "epoch": 0.034256657299611935, "grad_norm": 4.055324554443359, "kl": 0.0016690000156813767, "learning_rate": 1.7021276595744682e-06, "loss": 0.0001, "reward": 0.7299062572419643, "reward_std": 0.8444140013307333, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.1328125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.03459375072270632, "step": 32 }, { "completion_length": 154.296875, "epoch": 0.03532717784022481, "grad_norm": 3.565863847732544, "kl": 0.0011733200299204327, "learning_rate": 1.7553191489361704e-06, "loss": 0.0, "reward": 0.7152343707857653, "reward_std": 0.986182201653719, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.1640625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.011328124441206455, "step": 33 }, { "completion_length": 156.484375, "epoch": 0.03639769838083768, "grad_norm": 2.782845973968506, "kl": 0.001061345017660642, "learning_rate": 1.8085106382978727e-06, "loss": 0.0, "reward": 0.4085781138855964, "reward_std": 0.6946883676573634, "rewards/correctness_reward_func": 0.3125, "rewards/int_reward_func": 0.09375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0023281261092051864, "step": 34 }, { "completion_length": 166.203125, "epoch": 0.03746821892145055, "grad_norm": 1.6768676042556763, "kl": 0.0008204600453609601, "learning_rate": 1.8617021276595745e-06, "loss": 0.0, "reward": 0.5197031321004033, "reward_std": 0.8639188781380653, "rewards/correctness_reward_func": 0.40625, "rewards/int_reward_func": 0.1171875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.0037343755830079317, "step": 35 }, { "completion_length": 168.828125, "epoch": 0.03853873946206343, "grad_norm": 7.892724990844727, "kl": 0.0029609855264425278, "learning_rate": 1.9148936170212767e-06, "loss": 0.0001, "reward": 0.23489062942098826, "reward_std": 0.356358939781785, "rewards/correctness_reward_func": 0.15625, "rewards/int_reward_func": 0.046875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.031765625230036676, "step": 36 }, { "completion_length": 123.53125, "epoch": 0.039609260002676304, "grad_norm": 2.5262649059295654, "kl": 0.0010402118496131152, "learning_rate": 1.968085106382979e-06, "loss": 0.0, "reward": 0.9631250270176679, "reward_std": 1.0183926988393068, "rewards/correctness_reward_func": 0.71875, "rewards/int_reward_func": 0.1796875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.06468750000931323, "step": 37 }, { "completion_length": 159.46875, "epoch": 0.040679780543289176, "grad_norm": 1.3111544847488403, "kl": 0.0011480498651508242, "learning_rate": 2.021276595744681e-06, "loss": 0.0, "reward": 0.4472187543287873, "reward_std": 0.6874936055392027, "rewards/correctness_reward_func": 0.34375, "rewards/int_reward_func": 0.09375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.009718748508021235, "step": 38 }, { "completion_length": 142.25, "epoch": 0.04175030108390205, "grad_norm": 2.2276878356933594, "kl": 0.0014690053576487117, "learning_rate": 2.074468085106383e-06, "loss": 0.0001, "reward": 1.0481719109229743, "reward_std": 0.7983668614178896, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.1796875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.05598437529988587, "step": 39 }, { "completion_length": 149.71875, "epoch": 0.04282082162451492, "grad_norm": 3.2188539505004883, "kl": 0.002757413443760015, "learning_rate": 2.1276595744680853e-06, "loss": 0.0001, "reward": 0.8439687644131482, "reward_std": 0.9366709599271417, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.171875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.04709374951198697, "step": 40 }, { "completion_length": 153.75, "epoch": 0.04389134216512779, "grad_norm": 1.609779953956604, "kl": 0.002587423972727265, "learning_rate": 2.1808510638297876e-06, "loss": 0.0001, "reward": 0.9616250339895487, "reward_std": 0.8783023115247488, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.1953125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.01631249929778278, "step": 41 }, { "completion_length": 156.15625, "epoch": 0.044961862705740666, "grad_norm": 5.789142608642578, "kl": 0.0045223182532936335, "learning_rate": 2.2340425531914894e-06, "loss": 0.0002, "reward": 0.8380624754354358, "reward_std": 0.881235895678401, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.1796875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.03337499825283885, "step": 42 }, { "completion_length": 161.390625, "epoch": 0.04603238324635354, "grad_norm": 5.589737892150879, "kl": 0.005504744782228954, "learning_rate": 2.2872340425531916e-06, "loss": 0.0002, "reward": 0.8749218666926026, "reward_std": 0.9216371476650238, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.1796875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.007734375540167093, "step": 43 }, { "completion_length": 161.796875, "epoch": 0.04710290378696641, "grad_norm": 5.604761600494385, "kl": 0.0046821657015243545, "learning_rate": 2.340425531914894e-06, "loss": 0.0002, "reward": 0.2252656314522028, "reward_std": 0.47946364153176546, "rewards/correctness_reward_func": 0.15625, "rewards/int_reward_func": 0.0546875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.014328125165775418, "step": 44 }, { "completion_length": 154.453125, "epoch": 0.04817342432757928, "grad_norm": 9.506386756896973, "kl": 0.007164878101320937, "learning_rate": 2.393617021276596e-06, "loss": 0.0003, "reward": 0.6195624829269946, "reward_std": 1.0045473407953978, "rewards/correctness_reward_func": 0.46875, "rewards/int_reward_func": 0.1328125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.01799999945797026, "step": 45 }, { "completion_length": 140.390625, "epoch": 0.04924394486819216, "grad_norm": 3.766388416290283, "kl": 0.00619677483337, "learning_rate": 2.446808510638298e-06, "loss": 0.0002, "reward": 0.9423750173300505, "reward_std": 0.5746848955750465, "rewards/correctness_reward_func": 0.65625, "rewards/int_reward_func": 0.203125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.08299999847076833, "step": 46 }, { "completion_length": 143.09375, "epoch": 0.050314465408805034, "grad_norm": 4.4466705322265625, "kl": 0.009431202546693385, "learning_rate": 2.5e-06, "loss": 0.0004, "reward": 0.9380937227979302, "reward_std": 0.658873830921948, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.1796875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.07090624794363976, "step": 47 }, { "completion_length": 165.0625, "epoch": 0.051384985949417906, "grad_norm": 1.3459303379058838, "kl": 0.00600922666490078, "learning_rate": 2.553191489361702e-06, "loss": 0.0002, "reward": 0.6878281269455329, "reward_std": 0.9325386872515082, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.15625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.03157812531571835, "step": 48 }, { "completion_length": 134.421875, "epoch": 0.05245550649003078, "grad_norm": 1.628507375717163, "kl": 0.008930853742640465, "learning_rate": 2.6063829787234047e-06, "loss": 0.0004, "reward": 0.8599374926416203, "reward_std": 0.9755922555923462, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.1796875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.05525000113993883, "step": 49 }, { "completion_length": 149.5625, "epoch": 0.05352602703064365, "grad_norm": 1.1767226457595825, "kl": 0.012399342958815396, "learning_rate": 2.6595744680851065e-06, "loss": 0.0005, "reward": 1.0091875102370977, "reward_std": 0.8712345249950886, "rewards/correctness_reward_func": 0.71875, "rewards/int_reward_func": 0.203125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.08731249999254942, "step": 50 }, { "completion_length": 169.078125, "epoch": 0.05459654757125652, "grad_norm": 2.5901050567626953, "kl": 0.007969769008923322, "learning_rate": 2.7127659574468084e-06, "loss": 0.0003, "reward": 0.9503125064074993, "reward_std": 0.9690856691449881, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.234375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.028437498025596142, "step": 51 }, { "completion_length": 166.390625, "epoch": 0.055667068111869396, "grad_norm": 3.764841079711914, "kl": 0.011693944863509387, "learning_rate": 2.765957446808511e-06, "loss": 0.0005, "reward": 0.7889687474817038, "reward_std": 0.8161248974502087, "rewards/correctness_reward_func": 0.5625, "rewards/int_reward_func": 0.1875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.03896875097416341, "step": 52 }, { "completion_length": 158.3125, "epoch": 0.05673758865248227, "grad_norm": 4.34719705581665, "kl": 0.00852247714647092, "learning_rate": 2.819148936170213e-06, "loss": 0.0003, "reward": 0.5854531275108457, "reward_std": 0.579142062459141, "rewards/correctness_reward_func": 0.4375, "rewards/int_reward_func": 0.1484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.00048437435179948807, "step": 53 }, { "completion_length": 149.15625, "epoch": 0.05780810919309514, "grad_norm": 1.3242722749710083, "kl": 0.006340013263979927, "learning_rate": 2.8723404255319155e-06, "loss": 0.0003, "reward": 0.7202812694013119, "reward_std": 0.6025513117201626, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.140625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.07965625170618296, "step": 54 }, { "completion_length": 131.0, "epoch": 0.05887862973370801, "grad_norm": 1.2887805700302124, "kl": 0.005301086028339341, "learning_rate": 2.9255319148936174e-06, "loss": 0.0002, "reward": 1.1997031308710575, "reward_std": 0.8454109290614724, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.21875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.10595312505029142, "step": 55 }, { "completion_length": 132.828125, "epoch": 0.059949150274320885, "grad_norm": 3.085810899734497, "kl": 0.03685568018408958, "learning_rate": 2.978723404255319e-06, "loss": 0.0015, "reward": 1.32792192324996, "reward_std": 0.8808077229186893, "rewards/correctness_reward_func": 1.0, "rewards/int_reward_func": 0.265625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.062296870397403836, "step": 56 }, { "completion_length": 169.5, "epoch": 0.061019670814933764, "grad_norm": 3.218595266342163, "kl": 0.019670582871185616, "learning_rate": 3.031914893617022e-06, "loss": 0.0008, "reward": 1.0494843795895576, "reward_std": 0.8383767995983362, "rewards/correctness_reward_func": 0.78125, "rewards/int_reward_func": 0.1953125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.07292187376879156, "step": 57 }, { "completion_length": 164.71875, "epoch": 0.06209019135554664, "grad_norm": 1.189056634902954, "kl": 0.010589714744128287, "learning_rate": 3.0851063829787237e-06, "loss": 0.0004, "reward": 0.9995937808416784, "reward_std": 0.8763523772358894, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.234375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.015218749409541488, "step": 58 }, { "completion_length": 173.109375, "epoch": 0.0631607118961595, "grad_norm": 2.9426801204681396, "kl": 0.010116680678038392, "learning_rate": 3.1382978723404255e-06, "loss": 0.0004, "reward": 0.5431718821637332, "reward_std": 0.5429769204929471, "rewards/correctness_reward_func": 0.40625, "rewards/int_reward_func": 0.1015625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.03535937680862844, "step": 59 }, { "completion_length": 148.328125, "epoch": 0.06423123243677238, "grad_norm": 1.2039525508880615, "kl": 0.012681124440860003, "learning_rate": 3.191489361702128e-06, "loss": 0.0005, "reward": 0.7227343516424298, "reward_std": 0.7870303755626082, "rewards/correctness_reward_func": 0.53125, "rewards/int_reward_func": 0.125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.06648437259718776, "step": 60 }, { "completion_length": 125.5625, "epoch": 0.06530175297738525, "grad_norm": 1.2423548698425293, "kl": 0.007948026963276789, "learning_rate": 3.24468085106383e-06, "loss": 0.0003, "reward": 1.3104218831285834, "reward_std": 0.6878003547899425, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.25, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.12292187649291009, "step": 61 }, { "completion_length": 152.296875, "epoch": 0.06637227351799813, "grad_norm": 5.324446678161621, "kl": 0.043227474874584004, "learning_rate": 3.297872340425532e-06, "loss": 0.0017, "reward": 1.0643124831840396, "reward_std": 0.9667724259197712, "rewards/correctness_reward_func": 0.78125, "rewards/int_reward_func": 0.2109375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.07212500204332173, "step": 62 }, { "completion_length": 157.328125, "epoch": 0.067442794058611, "grad_norm": 1.2027546167373657, "kl": 0.004988896660506725, "learning_rate": 3.3510638297872345e-06, "loss": 0.0002, "reward": 1.1987031551543623, "reward_std": 0.8610715009272099, "rewards/correctness_reward_func": 0.90625, "rewards/int_reward_func": 0.2578125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.034640624886378646, "step": 63 }, { "completion_length": 163.9375, "epoch": 0.06851331459922387, "grad_norm": 1.126978874206543, "kl": 0.004329273069743067, "learning_rate": 3.4042553191489363e-06, "loss": 0.0002, "reward": 0.8524375010747463, "reward_std": 0.742443086579442, "rewards/correctness_reward_func": 0.65625, "rewards/int_reward_func": 0.171875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.024312500143423676, "step": 64 }, { "completion_length": 135.765625, "epoch": 0.06958383513983675, "grad_norm": 4.760307788848877, "kl": 0.0423761896090582, "learning_rate": 3.457446808510639e-06, "loss": 0.0017, "reward": 1.4641093388199806, "reward_std": 1.0102775804698467, "rewards/correctness_reward_func": 1.0625, "rewards/int_reward_func": 0.296875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1047343765385449, "step": 65 }, { "completion_length": 155.796875, "epoch": 0.07065435568044962, "grad_norm": 3.1110665798187256, "kl": 0.061538238427601755, "learning_rate": 3.510638297872341e-06, "loss": 0.0025, "reward": 1.033312514424324, "reward_std": 0.8865859052166343, "rewards/correctness_reward_func": 0.78125, "rewards/int_reward_func": 0.203125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.048937500920146704, "step": 66 }, { "completion_length": 164.578125, "epoch": 0.0717248762210625, "grad_norm": 1.2862604856491089, "kl": 0.0052650388242909685, "learning_rate": 3.5638297872340426e-06, "loss": 0.0002, "reward": 0.7733906293287873, "reward_std": 0.8027388863265514, "rewards/correctness_reward_func": 0.59375, "rewards/int_reward_func": 0.1640625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.015578125370666385, "step": 67 }, { "completion_length": 163.75, "epoch": 0.07279539676167536, "grad_norm": 6.033888816833496, "kl": 0.07815390304313041, "learning_rate": 3.6170212765957453e-06, "loss": 0.0031, "reward": 1.1917500102426857, "reward_std": 0.7897173650562763, "rewards/correctness_reward_func": 0.90625, "rewards/int_reward_func": 0.265625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.019875000230968, "step": 68 }, { "completion_length": 149.40625, "epoch": 0.07386591730228824, "grad_norm": 1.29887056350708, "kl": 0.01987961767008528, "learning_rate": 3.670212765957447e-06, "loss": 0.0008, "reward": 1.1079531812574714, "reward_std": 0.7181853111833334, "rewards/correctness_reward_func": 0.78125, "rewards/int_reward_func": 0.265625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.06107812421396375, "step": 69 }, { "completion_length": 141.859375, "epoch": 0.0749364378429011, "grad_norm": 5.329657077789307, "kl": 0.0800003606127575, "learning_rate": 3.723404255319149e-06, "loss": 0.0032, "reward": 0.9977656248956919, "reward_std": 0.5876570995897055, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.21875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.09151562419719994, "step": 70 }, { "completion_length": 138.5625, "epoch": 0.07600695838351398, "grad_norm": 2.7627108097076416, "kl": 0.03701730686589144, "learning_rate": 3.7765957446808516e-06, "loss": 0.0015, "reward": 1.2351874904707074, "reward_std": 0.8267962019890547, "rewards/correctness_reward_func": 0.90625, "rewards/int_reward_func": 0.25, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.07893750071525574, "step": 71 }, { "completion_length": 133.984375, "epoch": 0.07707747892412686, "grad_norm": 1.414104700088501, "kl": 0.016362678608857095, "learning_rate": 3.8297872340425535e-06, "loss": 0.0007, "reward": 1.827203094959259, "reward_std": 1.0086707267910242, "rewards/correctness_reward_func": 1.375, "rewards/int_reward_func": 0.359375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0928281235974282, "step": 72 }, { "completion_length": 148.0625, "epoch": 0.07814799946473973, "grad_norm": 1.3797897100448608, "kl": 0.010796019807457924, "learning_rate": 3.882978723404256e-06, "loss": 0.0004, "reward": 1.1499999817460775, "reward_std": 0.9028994599357247, "rewards/correctness_reward_func": 0.84375, "rewards/int_reward_func": 0.2109375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.09531250316649675, "step": 73 }, { "completion_length": 155.828125, "epoch": 0.07921852000535261, "grad_norm": 1.2884279489517212, "kl": 0.017199350346345454, "learning_rate": 3.936170212765958e-06, "loss": 0.0007, "reward": 1.0700937574729323, "reward_std": 1.0012164115905762, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.234375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.023218751302920282, "step": 74 }, { "completion_length": 146.96875, "epoch": 0.08028904054596547, "grad_norm": 3.5342090129852295, "kl": 0.05702902490156703, "learning_rate": 3.98936170212766e-06, "loss": 0.0023, "reward": 1.2074843887239695, "reward_std": 0.8127174219116569, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.2578125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.07467187196016312, "step": 75 }, { "completion_length": 135.140625, "epoch": 0.08135956108657835, "grad_norm": 1.7845228910446167, "kl": 0.01959521723620128, "learning_rate": 4.042553191489362e-06, "loss": 0.0008, "reward": 1.4087968692183495, "reward_std": 0.9582029562443495, "rewards/correctness_reward_func": 1.03125, "rewards/int_reward_func": 0.2734375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.10410937643609941, "step": 76 }, { "completion_length": 126.125, "epoch": 0.08243008162719122, "grad_norm": 1.3039053678512573, "kl": 0.008768495463300496, "learning_rate": 4.095744680851064e-06, "loss": 0.0004, "reward": 1.3187343887984753, "reward_std": 0.5690027270466089, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.2578125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.12342187319882214, "step": 77 }, { "completion_length": 148.6875, "epoch": 0.0835006021678041, "grad_norm": 3.3175690174102783, "kl": 0.014638990571256727, "learning_rate": 4.148936170212766e-06, "loss": 0.0006, "reward": 1.4877656111493707, "reward_std": 0.8957763649523258, "rewards/correctness_reward_func": 1.09375, "rewards/int_reward_func": 0.2890625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.10495312558487058, "step": 78 }, { "completion_length": 171.0625, "epoch": 0.08457112270841696, "grad_norm": 5.574616432189941, "kl": 0.01483242801623419, "learning_rate": 4.202127659574468e-06, "loss": 0.0006, "reward": 1.0053124725818634, "reward_std": 1.1163944154977798, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.2265625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.02874999982304871, "step": 79 }, { "completion_length": 150.5625, "epoch": 0.08564164324902984, "grad_norm": 3.4894111156463623, "kl": 0.020942480681696907, "learning_rate": 4.255319148936171e-06, "loss": 0.0008, "reward": 1.081656239926815, "reward_std": 0.8474766900762916, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.234375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.09728124877437949, "step": 80 }, { "completion_length": 145.8125, "epoch": 0.08671216378964271, "grad_norm": 4.479030609130859, "kl": 0.02067101007560268, "learning_rate": 4.308510638297873e-06, "loss": 0.0008, "reward": 0.9469531225040555, "reward_std": 0.5945024443790317, "rewards/correctness_reward_func": 0.625, "rewards/int_reward_func": 0.2421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.07976562529802322, "step": 81 }, { "completion_length": 142.609375, "epoch": 0.08778268433025559, "grad_norm": 9.429178237915039, "kl": 0.032599265803582966, "learning_rate": 4.361702127659575e-06, "loss": 0.0013, "reward": 0.7288906406611204, "reward_std": 0.8307479582726955, "rewards/correctness_reward_func": 0.46875, "rewards/int_reward_func": 0.171875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.08826562575995922, "step": 82 }, { "completion_length": 143.84375, "epoch": 0.08885320487086847, "grad_norm": 1.5900259017944336, "kl": 0.013988179998705164, "learning_rate": 4.414893617021277e-06, "loss": 0.0006, "reward": 1.591531228274107, "reward_std": 0.8249969203025103, "rewards/correctness_reward_func": 1.25, "rewards/int_reward_func": 0.3203125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.021218748996034265, "step": 83 }, { "completion_length": 135.8125, "epoch": 0.08992372541148133, "grad_norm": 4.648512363433838, "kl": 0.06424052006332204, "learning_rate": 4.468085106382979e-06, "loss": 0.0026, "reward": 1.4651249905582517, "reward_std": 0.8705566665157676, "rewards/correctness_reward_func": 1.0625, "rewards/int_reward_func": 0.2890625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.11356249963864684, "step": 84 }, { "completion_length": 123.90625, "epoch": 0.09099424595209421, "grad_norm": 6.136663913726807, "kl": 0.02576264040544629, "learning_rate": 4.521276595744681e-06, "loss": 0.001, "reward": 1.7882343754172325, "reward_std": 0.7715246099978685, "rewards/correctness_reward_func": 1.3125, "rewards/int_reward_func": 0.3671875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.10854687169194221, "step": 85 }, { "completion_length": 132.65625, "epoch": 0.09206476649270708, "grad_norm": 2.7052152156829834, "kl": 0.021323165216017514, "learning_rate": 4.574468085106383e-06, "loss": 0.0009, "reward": 1.2060781214386225, "reward_std": 0.74014887586236, "rewards/correctness_reward_func": 0.84375, "rewards/int_reward_func": 0.25, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.11232812539674342, "step": 86 }, { "completion_length": 142.3125, "epoch": 0.09313528703331996, "grad_norm": 6.834721088409424, "kl": 0.15714708802988753, "learning_rate": 4.6276595744680855e-06, "loss": 0.0063, "reward": 1.4907656013965607, "reward_std": 1.1851906776428223, "rewards/correctness_reward_func": 1.09375, "rewards/int_reward_func": 0.3203125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.07670312328264117, "step": 87 }, { "completion_length": 141.625, "epoch": 0.09420580757393282, "grad_norm": 1.5215015411376953, "kl": 0.012561204843223095, "learning_rate": 4.680851063829788e-06, "loss": 0.0005, "reward": 0.973562479019165, "reward_std": 0.8407629178836942, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.2109375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.07512499787844718, "step": 88 }, { "completion_length": 131.109375, "epoch": 0.0952763281145457, "grad_norm": 4.186502456665039, "kl": 0.029811605345457792, "learning_rate": 4.73404255319149e-06, "loss": 0.0012, "reward": 1.571968775242567, "reward_std": 0.8088337788358331, "rewards/correctness_reward_func": 1.125, "rewards/int_reward_func": 0.3125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.13446875009685755, "step": 89 }, { "completion_length": 137.5625, "epoch": 0.09634684865515857, "grad_norm": 3.539090633392334, "kl": 0.025141435849945992, "learning_rate": 4.787234042553192e-06, "loss": 0.001, "reward": 1.1439843773841858, "reward_std": 1.1783022359013557, "rewards/correctness_reward_func": 0.8125, "rewards/int_reward_func": 0.2421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.08929687412455678, "step": 90 }, { "completion_length": 142.0, "epoch": 0.09741736919577144, "grad_norm": 3.9599854946136475, "kl": 0.03133453679038212, "learning_rate": 4.840425531914894e-06, "loss": 0.0013, "reward": 0.9123750082217157, "reward_std": 0.8763793092221022, "rewards/correctness_reward_func": 0.65625, "rewards/int_reward_func": 0.21875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.037375001003965735, "step": 91 }, { "completion_length": 127.0625, "epoch": 0.09848788973638432, "grad_norm": 5.762697219848633, "kl": 0.02441024547442794, "learning_rate": 4.893617021276596e-06, "loss": 0.001, "reward": 1.4343437626957893, "reward_std": 1.0151535924524069, "rewards/correctness_reward_func": 1.0, "rewards/int_reward_func": 0.328125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.10621875151991844, "step": 92 }, { "completion_length": 128.421875, "epoch": 0.09955841027699719, "grad_norm": 1.7006865739822388, "kl": 0.014295783417765051, "learning_rate": 4.946808510638298e-06, "loss": 0.0006, "reward": 1.4152031522244215, "reward_std": 1.0377220567315817, "rewards/correctness_reward_func": 1.03125, "rewards/int_reward_func": 0.265625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.11832812381908298, "step": 93 }, { "completion_length": 126.703125, "epoch": 0.10062893081761007, "grad_norm": 2.6587624549865723, "kl": 0.022590334410779178, "learning_rate": 5e-06, "loss": 0.0009, "reward": 1.0790781378746033, "reward_std": 0.8811032259836793, "rewards/correctness_reward_func": 0.75, "rewards/int_reward_func": 0.203125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.12595312716439366, "step": 94 }, { "completion_length": 134.25, "epoch": 0.10169945135822293, "grad_norm": 5.803914546966553, "kl": 0.0693736044340767, "learning_rate": 4.999982515602153e-06, "loss": 0.0028, "reward": 1.6232343390583992, "reward_std": 1.171187661588192, "rewards/correctness_reward_func": 1.1875, "rewards/int_reward_func": 0.34375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.09198437177110463, "step": 95 }, { "completion_length": 119.09375, "epoch": 0.10276997189883581, "grad_norm": 2.6233108043670654, "kl": 0.025776030379347503, "learning_rate": 4.999930062653175e-06, "loss": 0.001, "reward": 1.4469374530017376, "reward_std": 0.717207751236856, "rewards/correctness_reward_func": 1.03125, "rewards/int_reward_func": 0.2890625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.12662499537691474, "step": 96 }, { "completion_length": 135.625, "epoch": 0.10384049243944868, "grad_norm": 1.4691489934921265, "kl": 0.018308754893951118, "learning_rate": 4.999842641886752e-06, "loss": 0.0007, "reward": 1.7408281043171883, "reward_std": 0.6604799125343561, "rewards/correctness_reward_func": 1.3125, "rewards/int_reward_func": 0.328125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.10020312923006713, "step": 97 }, { "completion_length": 142.53125, "epoch": 0.10491101298006156, "grad_norm": 3.7322754859924316, "kl": 0.031659536180086434, "learning_rate": 4.999720254525684e-06, "loss": 0.0013, "reward": 1.3402031436562538, "reward_std": 1.0035525700077415, "rewards/correctness_reward_func": 0.96875, "rewards/int_reward_func": 0.3203125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.051140623865649104, "step": 98 }, { "completion_length": 136.859375, "epoch": 0.10598153352067442, "grad_norm": 1.445330023765564, "kl": 0.016985120251774788, "learning_rate": 4.999562902281866e-06, "loss": 0.0007, "reward": 1.4283437356352806, "reward_std": 0.9263063753023744, "rewards/correctness_reward_func": 1.0, "rewards/int_reward_func": 0.296875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.13146874867379665, "step": 99 }, { "completion_length": 145.234375, "epoch": 0.1070520540612873, "grad_norm": 4.298182010650635, "kl": 0.03539641568204388, "learning_rate": 4.999370587356267e-06, "loss": 0.0014, "reward": 1.5638906005769968, "reward_std": 0.9018815001472831, "rewards/correctness_reward_func": 1.15625, "rewards/int_reward_func": 0.328125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.07951562479138374, "step": 100 }, { "completion_length": 126.46875, "epoch": 0.10812257460190017, "grad_norm": 1.7051573991775513, "kl": 0.0210904503474012, "learning_rate": 4.999143312438893e-06, "loss": 0.0008, "reward": 2.032781273126602, "reward_std": 0.882479477673769, "rewards/correctness_reward_func": 1.5, "rewards/int_reward_func": 0.4296875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.10309374984353781, "step": 101 }, { "completion_length": 138.4375, "epoch": 0.10919309514251305, "grad_norm": 5.384690284729004, "kl": 0.05343873624224216, "learning_rate": 4.998881080708759e-06, "loss": 0.0021, "reward": 1.7520312666893005, "reward_std": 1.0768068991601467, "rewards/correctness_reward_func": 1.34375, "rewards/int_reward_func": 0.3203125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.08796874759718776, "step": 102 }, { "completion_length": 121.265625, "epoch": 0.11026361568312593, "grad_norm": 3.4453506469726562, "kl": 0.054925739066675305, "learning_rate": 4.998583895833834e-06, "loss": 0.0022, "reward": 1.6850781589746475, "reward_std": 0.6663676341995597, "rewards/correctness_reward_func": 1.125, "rewards/int_reward_func": 0.4296875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1303906268440187, "step": 103 }, { "completion_length": 114.71875, "epoch": 0.11133413622373879, "grad_norm": 1.5185428857803345, "kl": 0.022636244888417423, "learning_rate": 4.998251761970997e-06, "loss": 0.0009, "reward": 1.7328437007963657, "reward_std": 0.7276732774917036, "rewards/correctness_reward_func": 1.25, "rewards/int_reward_func": 0.34375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.13909374829381704, "step": 104 }, { "completion_length": 114.359375, "epoch": 0.11240465676435167, "grad_norm": 4.903259754180908, "kl": 0.0455300398170948, "learning_rate": 4.997884683765977e-06, "loss": 0.0018, "reward": 2.070781234651804, "reward_std": 0.5790006909519434, "rewards/correctness_reward_func": 1.5, "rewards/int_reward_func": 0.4296875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.14109374955296516, "step": 105 }, { "completion_length": 117.578125, "epoch": 0.11347517730496454, "grad_norm": 7.719193458557129, "kl": 0.07494375784881413, "learning_rate": 4.997482666353287e-06, "loss": 0.003, "reward": 1.8749061971902847, "reward_std": 0.762595918495208, "rewards/correctness_reward_func": 1.3125, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1717812498100102, "step": 106 }, { "completion_length": 132.71875, "epoch": 0.11454569784557742, "grad_norm": 1.9178704023361206, "kl": 0.026816099416464567, "learning_rate": 4.997045715356153e-06, "loss": 0.0011, "reward": 1.4049999862909317, "reward_std": 1.0492134541273117, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.328125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.13937499932944775, "step": 107 }, { "completion_length": 121.21875, "epoch": 0.11561621838619028, "grad_norm": 3.1541106700897217, "kl": 0.04801671905443072, "learning_rate": 4.9965738368864345e-06, "loss": 0.0019, "reward": 1.7343593537807465, "reward_std": 0.9352071397006512, "rewards/correctness_reward_func": 1.21875, "rewards/int_reward_func": 0.359375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1562343705445528, "step": 108 }, { "completion_length": 134.40625, "epoch": 0.11668673892680316, "grad_norm": 3.4600815773010254, "kl": 0.07537143386434764, "learning_rate": 4.996067037544542e-06, "loss": 0.003, "reward": 1.6357812024652958, "reward_std": 0.7928643207997084, "rewards/correctness_reward_func": 1.15625, "rewards/int_reward_func": 0.3359375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.14359374903142452, "step": 109 }, { "completion_length": 121.25, "epoch": 0.11775725946741603, "grad_norm": 1.8081263303756714, "kl": 0.03252584161236882, "learning_rate": 4.995525324419338e-06, "loss": 0.0013, "reward": 1.6038124561309814, "reward_std": 0.9315616749227047, "rewards/correctness_reward_func": 1.09375, "rewards/int_reward_func": 0.3671875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.14287500269711018, "step": 110 }, { "completion_length": 113.734375, "epoch": 0.1188277800080289, "grad_norm": 3.427846670150757, "kl": 0.029214507434517145, "learning_rate": 4.994948705088047e-06, "loss": 0.0012, "reward": 1.7096093818545341, "reward_std": 0.9403769830241799, "rewards/correctness_reward_func": 1.1875, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.14710937440395355, "step": 111 }, { "completion_length": 138.140625, "epoch": 0.11989830054864177, "grad_norm": 5.425636291503906, "kl": 0.11221261869650334, "learning_rate": 4.99433718761614e-06, "loss": 0.0045, "reward": 1.4035156145691872, "reward_std": 1.1017278581857681, "rewards/correctness_reward_func": 0.90625, "rewards/int_reward_func": 0.3515625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.14570312481373549, "step": 112 }, { "completion_length": 128.828125, "epoch": 0.12096882108925465, "grad_norm": 9.655882835388184, "kl": 0.14679548889398575, "learning_rate": 4.993690780557232e-06, "loss": 0.0059, "reward": 1.5967968963086605, "reward_std": 0.9262449182569981, "rewards/correctness_reward_func": 1.09375, "rewards/int_reward_func": 0.3359375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.16710936930030584, "step": 113 }, { "completion_length": 114.3125, "epoch": 0.12203934162986753, "grad_norm": 2.9527218341827393, "kl": 0.03957346314564347, "learning_rate": 4.993009492952951e-06, "loss": 0.0016, "reward": 1.7140468880534172, "reward_std": 0.9286471158266068, "rewards/correctness_reward_func": 1.125, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.16717187454923987, "step": 114 }, { "completion_length": 122.515625, "epoch": 0.1231098621704804, "grad_norm": 4.749152660369873, "kl": 0.05118492292240262, "learning_rate": 4.992293334332821e-06, "loss": 0.002, "reward": 1.9886562526226044, "reward_std": 0.9419979229569435, "rewards/correctness_reward_func": 1.4375, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.14490625634789467, "step": 115 }, { "completion_length": 124.171875, "epoch": 0.12418038271109327, "grad_norm": 4.01917028427124, "kl": 0.10346966434735805, "learning_rate": 4.991542314714122e-06, "loss": 0.0041, "reward": 1.837890625, "reward_std": 0.912613769993186, "rewards/correctness_reward_func": 1.28125, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.15039062406867743, "step": 116 }, { "completion_length": 124.375, "epoch": 0.12525090325170615, "grad_norm": 3.7853267192840576, "kl": 0.07023024489171803, "learning_rate": 4.990756444601757e-06, "loss": 0.0028, "reward": 1.6455781627446413, "reward_std": 0.7009308515116572, "rewards/correctness_reward_func": 1.125, "rewards/int_reward_func": 0.3671875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.15339062316343188, "step": 117 }, { "completion_length": 136.359375, "epoch": 0.126321423792319, "grad_norm": 2.0698678493499756, "kl": 0.03698924113996327, "learning_rate": 4.989935734988098e-06, "loss": 0.0015, "reward": 1.8074062652885914, "reward_std": 0.8670060317963362, "rewards/correctness_reward_func": 1.28125, "rewards/int_reward_func": 0.3671875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1589687503874302, "step": 118 }, { "completion_length": 148.15625, "epoch": 0.12739194433293188, "grad_norm": 7.199705123901367, "kl": 0.180443427991122, "learning_rate": 4.989080197352834e-06, "loss": 0.0072, "reward": 1.179109364748001, "reward_std": 0.7882043793797493, "rewards/correctness_reward_func": 0.71875, "rewards/int_reward_func": 0.328125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.13223437825217843, "step": 119 }, { "completion_length": 122.46875, "epoch": 0.12846246487354476, "grad_norm": 1.6841520071029663, "kl": 0.0505296983756125, "learning_rate": 4.9881898436628165e-06, "loss": 0.002, "reward": 1.9381406530737877, "reward_std": 0.8331695850938559, "rewards/correctness_reward_func": 1.375, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.17251562420278788, "step": 120 }, { "completion_length": 108.1875, "epoch": 0.12953298541415764, "grad_norm": 1.8260003328323364, "kl": 0.059317339677363634, "learning_rate": 4.987264686371881e-06, "loss": 0.0024, "reward": 2.02584370970726, "reward_std": 1.0399171710014343, "rewards/correctness_reward_func": 1.40625, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.19771874882280827, "step": 121 }, { "completion_length": 95.46875, "epoch": 0.1306035059547705, "grad_norm": 2.1317758560180664, "kl": 0.053682942409068346, "learning_rate": 4.986304738420684e-06, "loss": 0.0021, "reward": 2.1616249792277813, "reward_std": 0.6366997184231877, "rewards/correctness_reward_func": 1.5, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.22412499878555536, "step": 122 }, { "completion_length": 118.453125, "epoch": 0.13167402649538337, "grad_norm": 1.487423062324524, "kl": 0.04084368539042771, "learning_rate": 4.985310013236514e-06, "loss": 0.0016, "reward": 2.4017499536275864, "reward_std": 0.6733962241560221, "rewards/correctness_reward_func": 1.71875, "rewards/int_reward_func": 0.4765625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2064374964684248, "step": 123 }, { "completion_length": 136.75, "epoch": 0.13274454703599625, "grad_norm": 6.275110244750977, "kl": 0.10213979217223823, "learning_rate": 4.984280524733107e-06, "loss": 0.0041, "reward": 1.5195625126361847, "reward_std": 0.8954427968710661, "rewards/correctness_reward_func": 1.0625, "rewards/int_reward_func": 0.3125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1445625051856041, "step": 124 }, { "completion_length": 107.84375, "epoch": 0.13381506757660913, "grad_norm": 4.696693420410156, "kl": 0.1205500855576247, "learning_rate": 4.983216287310453e-06, "loss": 0.0048, "reward": 1.828781247138977, "reward_std": 0.92250463552773, "rewards/correctness_reward_func": 1.1875, "rewards/int_reward_func": 0.4296875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.21159374713897705, "step": 125 }, { "completion_length": 90.984375, "epoch": 0.134885588117222, "grad_norm": 5.726349830627441, "kl": 0.14590927632525563, "learning_rate": 4.982117315854594e-06, "loss": 0.0058, "reward": 2.218953087925911, "reward_std": 0.855751893715933, "rewards/correctness_reward_func": 1.53125, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0078125, "rewards/xmlcount_reward_func": 0.24239062331616879, "step": 126 }, { "completion_length": 102.28125, "epoch": 0.13595610865783486, "grad_norm": 2.1168274879455566, "kl": 0.07060433947481215, "learning_rate": 4.980983625737411e-06, "loss": 0.0028, "reward": 2.1402343213558197, "reward_std": 0.6787437000311911, "rewards/correctness_reward_func": 1.46875, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.20273437071591616, "step": 127 }, { "completion_length": 122.0625, "epoch": 0.13702662919844774, "grad_norm": 9.184843063354492, "kl": 0.15047278022393584, "learning_rate": 4.9798152328164165e-06, "loss": 0.006, "reward": 1.5844999551773071, "reward_std": 1.025202952325344, "rewards/correctness_reward_func": 1.03125, "rewards/int_reward_func": 0.359375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.19387499801814556, "step": 128 }, { "completion_length": 107.21875, "epoch": 0.13809714973906062, "grad_norm": 1.8967880010604858, "kl": 0.04243561811745167, "learning_rate": 4.978612153434527e-06, "loss": 0.0017, "reward": 2.1196718886494637, "reward_std": 0.5547986216843128, "rewards/correctness_reward_func": 1.46875, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2134218756109476, "step": 129 }, { "completion_length": 108.03125, "epoch": 0.1391676702796735, "grad_norm": 2.011303186416626, "kl": 0.04303696344140917, "learning_rate": 4.977374404419838e-06, "loss": 0.0017, "reward": 2.1730000376701355, "reward_std": 0.9301177933812141, "rewards/correctness_reward_func": 1.53125, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2042500004172325, "step": 130 }, { "completion_length": 110.453125, "epoch": 0.14023819082028635, "grad_norm": 7.903467178344727, "kl": 0.22786898026242852, "learning_rate": 4.9761020030853854e-06, "loss": 0.0091, "reward": 2.0016875714063644, "reward_std": 0.8600476859137416, "rewards/correctness_reward_func": 1.40625, "rewards/int_reward_func": 0.3984375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.19699999503791332, "step": 131 }, { "completion_length": 132.625, "epoch": 0.14130871136089923, "grad_norm": 5.2613444328308105, "kl": 0.23378165811300278, "learning_rate": 4.9747949672289075e-06, "loss": 0.0094, "reward": 1.4954218715429306, "reward_std": 0.9026085883378983, "rewards/correctness_reward_func": 1.03125, "rewards/int_reward_func": 0.3203125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.14385937619954348, "step": 132 }, { "completion_length": 101.125, "epoch": 0.1423792319015121, "grad_norm": 4.111429214477539, "kl": 0.18476470839232206, "learning_rate": 4.973453315132592e-06, "loss": 0.0074, "reward": 2.419406235218048, "reward_std": 0.6811097683385015, "rewards/correctness_reward_func": 1.75, "rewards/int_reward_func": 0.4765625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.19284374825656414, "step": 133 }, { "completion_length": 110.0625, "epoch": 0.143449752442125, "grad_norm": 4.763584613800049, "kl": 0.1902949649374932, "learning_rate": 4.9720770655628216e-06, "loss": 0.0076, "reward": 2.049671910703182, "reward_std": 0.8413272872567177, "rewards/correctness_reward_func": 1.40625, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.19029687531292439, "step": 134 }, { "completion_length": 95.578125, "epoch": 0.14452027298273787, "grad_norm": 4.345425128936768, "kl": 0.18916460033506155, "learning_rate": 4.970666237769913e-06, "loss": 0.0076, "reward": 2.191656231880188, "reward_std": 0.7033369969576597, "rewards/correctness_reward_func": 1.53125, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.20728124678134918, "step": 135 }, { "completion_length": 84.078125, "epoch": 0.14559079352335072, "grad_norm": 2.940378427505493, "kl": 0.06369929504580796, "learning_rate": 4.9692208514878445e-06, "loss": 0.0025, "reward": 2.3128594160079956, "reward_std": 0.6257542409002781, "rewards/correctness_reward_func": 1.625, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.21910937316715717, "step": 136 }, { "completion_length": 120.328125, "epoch": 0.1466613140639636, "grad_norm": 7.51654052734375, "kl": 0.3002074658870697, "learning_rate": 4.967740926933985e-06, "loss": 0.012, "reward": 1.7032031267881393, "reward_std": 1.0220621526241302, "rewards/correctness_reward_func": 1.15625, "rewards/int_reward_func": 0.3828125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.16414062399417162, "step": 137 }, { "completion_length": 114.046875, "epoch": 0.14773183460457648, "grad_norm": 2.956787347793579, "kl": 0.10229182336479425, "learning_rate": 4.966226484808804e-06, "loss": 0.0041, "reward": 1.6846249997615814, "reward_std": 0.8771754652261734, "rewards/correctness_reward_func": 1.09375, "rewards/int_reward_func": 0.3984375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.19243750255554914, "step": 138 }, { "completion_length": 102.34375, "epoch": 0.14880235514518936, "grad_norm": 4.8385114669799805, "kl": 0.1648537963628769, "learning_rate": 4.96467754629559e-06, "loss": 0.0066, "reward": 1.9405781105160713, "reward_std": 0.6661778870038688, "rewards/correctness_reward_func": 1.3125, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.22182812727987766, "step": 139 }, { "completion_length": 99.921875, "epoch": 0.1498728756858022, "grad_norm": 2.6203041076660156, "kl": 0.06277278368361294, "learning_rate": 4.963094133060148e-06, "loss": 0.0025, "reward": 2.1328750401735306, "reward_std": 0.6620223973877728, "rewards/correctness_reward_func": 1.4375, "rewards/int_reward_func": 0.4609375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.23443749826401472, "step": 140 }, { "completion_length": 92.15625, "epoch": 0.1509433962264151, "grad_norm": 2.787095069885254, "kl": 0.07504080841317773, "learning_rate": 4.961476267250501e-06, "loss": 0.003, "reward": 2.2870156168937683, "reward_std": 0.6934415455907583, "rewards/correctness_reward_func": 1.59375, "rewards/int_reward_func": 0.4609375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.23232812341302633, "step": 141 }, { "completion_length": 111.0, "epoch": 0.15201391676702797, "grad_norm": 2.570060968399048, "kl": 0.06025985535234213, "learning_rate": 4.959823971496575e-06, "loss": 0.0024, "reward": 1.9751719227060676, "reward_std": 0.7296689655631781, "rewards/correctness_reward_func": 1.375, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1939218717161566, "step": 142 }, { "completion_length": 127.40625, "epoch": 0.15308443730764085, "grad_norm": 4.903911590576172, "kl": 0.20708202896639705, "learning_rate": 4.958137268909887e-06, "loss": 0.0083, "reward": 2.063531205058098, "reward_std": 0.8510163221508265, "rewards/correctness_reward_func": 1.5, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.14165624976158142, "step": 143 }, { "completion_length": 98.5625, "epoch": 0.15415495784825373, "grad_norm": 4.191147804260254, "kl": 0.2446250948123634, "learning_rate": 4.9564161830832214e-06, "loss": 0.0098, "reward": 2.337281256914139, "reward_std": 0.6763627836480737, "rewards/correctness_reward_func": 1.65625, "rewards/int_reward_func": 0.4609375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2200937452726066, "step": 144 }, { "completion_length": 88.984375, "epoch": 0.15522547838886658, "grad_norm": 168.51283264160156, "kl": 0.34985177870839834, "learning_rate": 4.954660738090297e-06, "loss": 0.014, "reward": 1.6290156617760658, "reward_std": 0.7621745709329844, "rewards/correctness_reward_func": 1.03125, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.222765626385808, "step": 145 }, { "completion_length": 116.078125, "epoch": 0.15629599892947946, "grad_norm": 4.605787754058838, "kl": 0.20393797848373652, "learning_rate": 4.9528709584854316e-06, "loss": 0.0082, "reward": 1.5738593488931656, "reward_std": 1.115996390581131, "rewards/correctness_reward_func": 1.0, "rewards/int_reward_func": 0.3984375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.17542187124490738, "step": 146 }, { "completion_length": 89.328125, "epoch": 0.15736651947009234, "grad_norm": 4.029605865478516, "kl": 0.21852776128798723, "learning_rate": 4.951046869303202e-06, "loss": 0.0087, "reward": 1.9777500182390213, "reward_std": 0.9872381817549467, "rewards/correctness_reward_func": 1.34375, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.22774999774992466, "step": 147 }, { "completion_length": 106.75, "epoch": 0.15843704001070522, "grad_norm": 3.108353853225708, "kl": 0.07371893431991339, "learning_rate": 4.949188496058089e-06, "loss": 0.0029, "reward": 1.7372031211853027, "reward_std": 0.9112571626901627, "rewards/correctness_reward_func": 1.09375, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2215781332924962, "step": 148 }, { "completion_length": 78.40625, "epoch": 0.15950756055131807, "grad_norm": 5.034030914306641, "kl": 0.11514789052307606, "learning_rate": 4.947295864744121e-06, "loss": 0.0046, "reward": 2.055265612900257, "reward_std": 0.6963230553083122, "rewards/correctness_reward_func": 1.34375, "rewards/int_reward_func": 0.4453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0078125, "rewards/xmlcount_reward_func": 0.258390624076128, "step": 149 }, { "completion_length": 90.484375, "epoch": 0.16057808109193095, "grad_norm": 2.6227409839630127, "kl": 0.09901809925213456, "learning_rate": 4.9453690018345144e-06, "loss": 0.004, "reward": 2.031171888113022, "reward_std": 0.9270219663158059, "rewards/correctness_reward_func": 1.34375, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2499218750745058, "step": 150 }, { "completion_length": 111.640625, "epoch": 0.16164860163254383, "grad_norm": 5.403820037841797, "kl": 0.24172887252643704, "learning_rate": 4.943407934281298e-06, "loss": 0.0097, "reward": 1.693390630185604, "reward_std": 1.0351360142230988, "rewards/correctness_reward_func": 1.0625, "rewards/int_reward_func": 0.4609375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1699531227350235, "step": 151 }, { "completion_length": 92.890625, "epoch": 0.1627191221731567, "grad_norm": 2.174766778945923, "kl": 0.07736781658604741, "learning_rate": 4.941412689514941e-06, "loss": 0.0031, "reward": 2.1249531507492065, "reward_std": 0.82035240996629, "rewards/correctness_reward_func": 1.40625, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2499531265348196, "step": 152 }, { "completion_length": 96.546875, "epoch": 0.16378964271376958, "grad_norm": 2.039672374725342, "kl": 0.07323360512964427, "learning_rate": 4.939383295443966e-06, "loss": 0.0029, "reward": 2.026875004172325, "reward_std": 0.7493576873093843, "rewards/correctness_reward_func": 1.375, "rewards/int_reward_func": 0.4453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.20656250044703484, "step": 153 }, { "completion_length": 106.59375, "epoch": 0.16486016325438244, "grad_norm": 2.2819831371307373, "kl": 0.07374695758335292, "learning_rate": 4.937319780454559e-06, "loss": 0.0029, "reward": 1.7915781140327454, "reward_std": 0.8505431758239865, "rewards/correctness_reward_func": 1.15625, "rewards/int_reward_func": 0.4453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1900156200863421, "step": 154 }, { "completion_length": 91.90625, "epoch": 0.16593068379499532, "grad_norm": 5.251499176025391, "kl": 0.40140265179798007, "learning_rate": 4.9352221734101745e-06, "loss": 0.0161, "reward": 2.1450937539339066, "reward_std": 0.7235295535065234, "rewards/correctness_reward_func": 1.46875, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2388437483459711, "step": 155 }, { "completion_length": 91.375, "epoch": 0.1670012043356082, "grad_norm": 2.6729063987731934, "kl": 0.06791439699009061, "learning_rate": 4.933090503651129e-06, "loss": 0.0027, "reward": 2.2278437092900276, "reward_std": 0.7469283854588866, "rewards/correctness_reward_func": 1.5625, "rewards/int_reward_func": 0.4453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.22003125306218863, "step": 156 }, { "completion_length": 83.53125, "epoch": 0.16807172487622107, "grad_norm": 2.890995979309082, "kl": 0.08941763360053301, "learning_rate": 4.930924800994192e-06, "loss": 0.0036, "reward": 2.040843792259693, "reward_std": 0.6482276869937778, "rewards/correctness_reward_func": 1.3125, "rewards/int_reward_func": 0.4921875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.23615625128149986, "step": 157 }, { "completion_length": 103.4375, "epoch": 0.16914224541683393, "grad_norm": 4.847292900085449, "kl": 0.25062092347070575, "learning_rate": 4.9287250957321685e-06, "loss": 0.01, "reward": 1.8447500094771385, "reward_std": 0.8658850640058517, "rewards/correctness_reward_func": 1.1875, "rewards/int_reward_func": 0.4453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.21193750761449337, "step": 158 }, { "completion_length": 87.78125, "epoch": 0.1702127659574468, "grad_norm": 2.486091136932373, "kl": 0.10755344619974494, "learning_rate": 4.9264914186334775e-06, "loss": 0.0043, "reward": 2.14860936999321, "reward_std": 0.7705556647852063, "rewards/correctness_reward_func": 1.4375, "rewards/int_reward_func": 0.4609375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25017187278717756, "step": 159 }, { "completion_length": 74.8125, "epoch": 0.17128328649805968, "grad_norm": 2.6566174030303955, "kl": 0.09692100062966347, "learning_rate": 4.924223800941718e-06, "loss": 0.0039, "reward": 2.179875001311302, "reward_std": 0.647944641765207, "rewards/correctness_reward_func": 1.40625, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.28924999572336674, "step": 160 }, { "completion_length": 112.390625, "epoch": 0.17235380703867256, "grad_norm": 7.983546257019043, "kl": 0.34650124446488917, "learning_rate": 4.921922274375232e-06, "loss": 0.0139, "reward": 1.7768593654036522, "reward_std": 0.6629852540791035, "rewards/correctness_reward_func": 1.1875, "rewards/int_reward_func": 0.4140625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.17529687471687794, "step": 161 }, { "completion_length": 81.609375, "epoch": 0.17342432757928541, "grad_norm": 3.0785446166992188, "kl": 0.09424351761117578, "learning_rate": 4.919586871126667e-06, "loss": 0.0038, "reward": 2.1120937913656235, "reward_std": 0.947939082980156, "rewards/correctness_reward_func": 1.375, "rewards/int_reward_func": 0.4609375, "rewards/soft_format_reward_func": 0.0078125, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.26834374107420444, "step": 162 }, { "completion_length": 90.421875, "epoch": 0.1744948481198983, "grad_norm": 4.486505508422852, "kl": 0.2375591630116105, "learning_rate": 4.917217623862516e-06, "loss": 0.0095, "reward": 1.9345000088214874, "reward_std": 0.7378783877938986, "rewards/correctness_reward_func": 1.21875, "rewards/int_reward_func": 0.4765625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.23918749950826168, "step": 163 }, { "completion_length": 89.34375, "epoch": 0.17556536866051117, "grad_norm": 5.476224422454834, "kl": 0.3916892586275935, "learning_rate": 4.914814565722671e-06, "loss": 0.0157, "reward": 1.6488437578082085, "reward_std": 0.861721821129322, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2582187484949827, "step": 164 }, { "completion_length": 76.15625, "epoch": 0.17663588920112405, "grad_norm": 2.504824161529541, "kl": 0.09898415254428983, "learning_rate": 4.912377730319951e-06, "loss": 0.004, "reward": 2.149609424173832, "reward_std": 0.8080255158711225, "rewards/correctness_reward_func": 1.4375, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.29023437574505806, "step": 165 }, { "completion_length": 86.703125, "epoch": 0.17770640974173693, "grad_norm": 2.5262093544006348, "kl": 0.10452345060184598, "learning_rate": 4.909907151739634e-06, "loss": 0.0042, "reward": 2.0487187057733536, "reward_std": 1.013342872262001, "rewards/correctness_reward_func": 1.34375, "rewards/int_reward_func": 0.4453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25965624768286943, "step": 166 }, { "completion_length": 83.109375, "epoch": 0.17877693028234978, "grad_norm": 3.6836423873901367, "kl": 0.12263510143384337, "learning_rate": 4.907402864538984e-06, "loss": 0.0049, "reward": 2.0983437597751617, "reward_std": 0.8467487432062626, "rewards/correctness_reward_func": 1.375, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2545937467366457, "step": 167 }, { "completion_length": 105.828125, "epoch": 0.17984745082296266, "grad_norm": 3.9994001388549805, "kl": 0.2718197964131832, "learning_rate": 4.904864903746765e-06, "loss": 0.0109, "reward": 1.590890608727932, "reward_std": 1.0225011110305786, "rewards/correctness_reward_func": 0.96875, "rewards/int_reward_func": 0.3984375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0078125, "rewards/xmlcount_reward_func": 0.21589063154533505, "step": 168 }, { "completion_length": 80.734375, "epoch": 0.18091797136357554, "grad_norm": 3.370049476623535, "kl": 0.0908288094215095, "learning_rate": 4.9022933048627496e-06, "loss": 0.0036, "reward": 1.9604843854904175, "reward_std": 0.6807838249951601, "rewards/correctness_reward_func": 1.1875, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2886093705892563, "step": 169 }, { "completion_length": 89.03125, "epoch": 0.18198849190418842, "grad_norm": 2.7841224670410156, "kl": 0.09452959662303329, "learning_rate": 4.899688103857223e-06, "loss": 0.0038, "reward": 2.052734389901161, "reward_std": 0.789879210293293, "rewards/correctness_reward_func": 1.34375, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25585937313735485, "step": 170 }, { "completion_length": 99.46875, "epoch": 0.18305901244480127, "grad_norm": 2.4192087650299072, "kl": 0.10028906259685755, "learning_rate": 4.897049337170483e-06, "loss": 0.004, "reward": 1.9263124987483025, "reward_std": 0.4447530438192189, "rewards/correctness_reward_func": 1.25, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.23881249967962503, "step": 171 }, { "completion_length": 85.359375, "epoch": 0.18412953298541415, "grad_norm": 2.552004814147949, "kl": 0.10651395656168461, "learning_rate": 4.894377041712327e-06, "loss": 0.0043, "reward": 2.072437509894371, "reward_std": 0.8923071715980768, "rewards/correctness_reward_func": 1.34375, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.244312503375113, "step": 172 }, { "completion_length": 86.828125, "epoch": 0.18520005352602703, "grad_norm": 2.8208720684051514, "kl": 0.08628266118466854, "learning_rate": 4.891671254861535e-06, "loss": 0.0035, "reward": 2.123812586069107, "reward_std": 0.7962243193760514, "rewards/correctness_reward_func": 1.40625, "rewards/int_reward_func": 0.4453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2722499957308173, "step": 173 }, { "completion_length": 68.59375, "epoch": 0.1862705740666399, "grad_norm": 3.199782133102417, "kl": 0.09673942252993584, "learning_rate": 4.8889320144653525e-06, "loss": 0.0039, "reward": 2.4416562616825104, "reward_std": 0.6845003152266145, "rewards/correctness_reward_func": 1.65625, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.3010312579572201, "step": 174 }, { "completion_length": 80.265625, "epoch": 0.1873410946072528, "grad_norm": 2.8050692081451416, "kl": 0.09070709394291043, "learning_rate": 4.886159358838952e-06, "loss": 0.0036, "reward": 2.2726562321186066, "reward_std": 0.713380170520395, "rewards/correctness_reward_func": 1.53125, "rewards/int_reward_func": 0.4609375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2804687600582838, "step": 175 }, { "completion_length": 95.078125, "epoch": 0.18841161514786564, "grad_norm": 3.0878233909606934, "kl": 0.22780032362788916, "learning_rate": 4.883353326764907e-06, "loss": 0.0091, "reward": 2.0888593643903732, "reward_std": 0.632993305567652, "rewards/correctness_reward_func": 1.40625, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.22948437556624413, "step": 176 }, { "completion_length": 92.03125, "epoch": 0.18948213568847852, "grad_norm": 6.2119035720825195, "kl": 0.3357097846455872, "learning_rate": 4.880513957492641e-06, "loss": 0.0134, "reward": 1.9138593599200249, "reward_std": 0.8630610294640064, "rewards/correctness_reward_func": 1.21875, "rewards/int_reward_func": 0.4453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.24979687482118607, "step": 177 }, { "completion_length": 84.390625, "epoch": 0.1905526562290914, "grad_norm": 4.748230457305908, "kl": 0.26475911401212215, "learning_rate": 4.8776412907378845e-06, "loss": 0.0106, "reward": 1.8656718656420708, "reward_std": 0.7014469979330897, "rewards/correctness_reward_func": 1.125, "rewards/int_reward_func": 0.4765625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2641093786805868, "step": 178 }, { "completion_length": 95.0625, "epoch": 0.19162317676970428, "grad_norm": 2.9860572814941406, "kl": 0.1027436142321676, "learning_rate": 4.8747353666821155e-06, "loss": 0.0041, "reward": 1.8935781568288803, "reward_std": 0.8618385540321469, "rewards/correctness_reward_func": 1.15625, "rewards/int_reward_func": 0.4765625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0078125, "rewards/xmlcount_reward_func": 0.25295313261449337, "step": 179 }, { "completion_length": 78.359375, "epoch": 0.19269369731031713, "grad_norm": 4.26141881942749, "kl": 0.19185639871284366, "learning_rate": 4.871796225972e-06, "loss": 0.0077, "reward": 1.882093757390976, "reward_std": 0.762749788351357, "rewards/correctness_reward_func": 1.1875, "rewards/int_reward_func": 0.4140625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2805312527343631, "step": 180 }, { "completion_length": 86.953125, "epoch": 0.19376421785093, "grad_norm": 2.9578616619110107, "kl": 0.09208998270332813, "learning_rate": 4.868823909718823e-06, "loss": 0.0037, "reward": 2.173124998807907, "reward_std": 0.8102906746789813, "rewards/correctness_reward_func": 1.46875, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25125000439584255, "step": 181 }, { "completion_length": 71.671875, "epoch": 0.1948347383915429, "grad_norm": 2.6002249717712402, "kl": 0.10551499295979738, "learning_rate": 4.865818459497911e-06, "loss": 0.0042, "reward": 2.2820468470454216, "reward_std": 0.555876774713397, "rewards/correctness_reward_func": 1.46875, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.3132968805730343, "step": 182 }, { "completion_length": 92.03125, "epoch": 0.19590525893215577, "grad_norm": 6.459475517272949, "kl": 0.2931561325676739, "learning_rate": 4.862779917348055e-06, "loss": 0.0117, "reward": 1.95626562833786, "reward_std": 0.8608931167982519, "rewards/correctness_reward_func": 1.25, "rewards/int_reward_func": 0.4609375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2453281208872795, "step": 183 }, { "completion_length": 102.515625, "epoch": 0.19697577947276865, "grad_norm": 1.968064308166504, "kl": 0.09014055877923965, "learning_rate": 4.859708325770919e-06, "loss": 0.0036, "reward": 1.539296880364418, "reward_std": 0.8695460446178913, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.22679687477648258, "step": 184 }, { "completion_length": 82.21875, "epoch": 0.1980463000133815, "grad_norm": 5.502157688140869, "kl": 0.24213434057310224, "learning_rate": 4.856603727730446e-06, "loss": 0.0097, "reward": 2.103000044822693, "reward_std": 0.7971827173605561, "rewards/correctness_reward_func": 1.375, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.27487499825656414, "step": 185 }, { "completion_length": 96.796875, "epoch": 0.19911682055399438, "grad_norm": 4.163439750671387, "kl": 0.2488319119438529, "learning_rate": 4.853466166652259e-06, "loss": 0.01, "reward": 1.8271406143903732, "reward_std": 0.8965076357126236, "rewards/correctness_reward_func": 1.125, "rewards/int_reward_func": 0.4765625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.22557812742888927, "step": 186 }, { "completion_length": 109.671875, "epoch": 0.20018734109460726, "grad_norm": 2.380798816680908, "kl": 0.07831509876996279, "learning_rate": 4.850295686423048e-06, "loss": 0.0031, "reward": 1.7582030892372131, "reward_std": 0.8107579126954079, "rewards/correctness_reward_func": 1.125, "rewards/int_reward_func": 0.4140625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0078125, "rewards/xmlcount_reward_func": 0.21132812649011612, "step": 187 }, { "completion_length": 96.46875, "epoch": 0.20125786163522014, "grad_norm": 5.34075927734375, "kl": 0.3953818525187671, "learning_rate": 4.8470923313899655e-06, "loss": 0.0158, "reward": 1.9607812352478504, "reward_std": 0.5701902243308723, "rewards/correctness_reward_func": 1.3125, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.22640624921768904, "step": 188 }, { "completion_length": 79.265625, "epoch": 0.202328382175833, "grad_norm": 2.1705665588378906, "kl": 0.10117745213210583, "learning_rate": 4.843856146359999e-06, "loss": 0.004, "reward": 2.0710155963897705, "reward_std": 0.6723045469261706, "rewards/correctness_reward_func": 1.34375, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2741406299173832, "step": 189 }, { "completion_length": 72.46875, "epoch": 0.20339890271644587, "grad_norm": 2.7847604751586914, "kl": 0.12489751679822803, "learning_rate": 4.8405871765993435e-06, "loss": 0.005, "reward": 1.8936093151569366, "reward_std": 0.8906522025354207, "rewards/correctness_reward_func": 1.15625, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0078125, "rewards/xmlcount_reward_func": 0.2920468747615814, "step": 190 }, { "completion_length": 74.3125, "epoch": 0.20446942325705875, "grad_norm": 7.144665241241455, "kl": 0.5474197333678603, "learning_rate": 4.837285467832775e-06, "loss": 0.0219, "reward": 1.9980624914169312, "reward_std": 1.1599431410431862, "rewards/correctness_reward_func": 1.28125, "rewards/int_reward_func": 0.4140625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.3027499932795763, "step": 191 }, { "completion_length": 77.015625, "epoch": 0.20553994379767163, "grad_norm": 2.282410144805908, "kl": 0.12488419935107231, "learning_rate": 4.833951066243004e-06, "loss": 0.005, "reward": 2.0859062671661377, "reward_std": 0.7159075043164194, "rewards/correctness_reward_func": 1.3125, "rewards/int_reward_func": 0.4609375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.3124687448143959, "step": 192 }, { "completion_length": 74.6875, "epoch": 0.20661046433828448, "grad_norm": 5.521911144256592, "kl": 0.38207234255969524, "learning_rate": 4.830584018470036e-06, "loss": 0.0153, "reward": 2.290625035762787, "reward_std": 0.6622507013380527, "rewards/correctness_reward_func": 1.5, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0078125, "rewards/xmlcount_reward_func": 0.2984374985098839, "step": 193 }, { "completion_length": 81.640625, "epoch": 0.20768098487889736, "grad_norm": 5.832190036773682, "kl": 0.24756696447730064, "learning_rate": 4.827184371610511e-06, "loss": 0.0099, "reward": 2.2973125129938126, "reward_std": 0.6708025210537016, "rewards/correctness_reward_func": 1.53125, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.28168749529868364, "step": 194 }, { "completion_length": 74.671875, "epoch": 0.20875150541951024, "grad_norm": 7.309933662414551, "kl": 0.12391441874206066, "learning_rate": 4.8237521732170525e-06, "loss": 0.005, "reward": 2.016703099012375, "reward_std": 1.132209412753582, "rewards/correctness_reward_func": 1.28125, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0078125, "rewards/xmlcount_reward_func": 0.29014062508940697, "step": 195 }, { "completion_length": 73.34375, "epoch": 0.20982202596012312, "grad_norm": 3.4460299015045166, "kl": 0.12927269656211138, "learning_rate": 4.820287471297598e-06, "loss": 0.0052, "reward": 1.9848750308156013, "reward_std": 0.7779360907152295, "rewards/correctness_reward_func": 1.21875, "rewards/int_reward_func": 0.4765625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2895624926313758, "step": 196 }, { "completion_length": 88.28125, "epoch": 0.210892546500736, "grad_norm": 4.137183666229248, "kl": 0.3276713816449046, "learning_rate": 4.816790314314729e-06, "loss": 0.0131, "reward": 1.964468702673912, "reward_std": 0.7385209053754807, "rewards/correctness_reward_func": 1.25, "rewards/int_reward_func": 0.4609375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2535312492400408, "step": 197 }, { "completion_length": 73.453125, "epoch": 0.21196306704134885, "grad_norm": 6.063220500946045, "kl": 0.36109886690974236, "learning_rate": 4.813260751184992e-06, "loss": 0.0144, "reward": 2.3978749811649323, "reward_std": 0.7160034999251366, "rewards/correctness_reward_func": 1.59375, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.31975000351667404, "step": 198 }, { "completion_length": 90.359375, "epoch": 0.21303358758196173, "grad_norm": 9.171374320983887, "kl": 0.7352069662883878, "learning_rate": 4.809698831278217e-06, "loss": 0.0294, "reward": 1.9928593933582306, "reward_std": 0.8052435261197388, "rewards/correctness_reward_func": 1.28125, "rewards/int_reward_func": 0.4453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2662968710064888, "step": 199 }, { "completion_length": 69.953125, "epoch": 0.2141041081225746, "grad_norm": 3.0394396781921387, "kl": 0.14388780342414975, "learning_rate": 4.806104604416824e-06, "loss": 0.0058, "reward": 2.525015652179718, "reward_std": 0.4300219719298184, "rewards/correctness_reward_func": 1.75, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2906406167894602, "step": 200 }, { "completion_length": 92.03125, "epoch": 0.21517462866318748, "grad_norm": 4.5592265129089355, "kl": 0.3974157813936472, "learning_rate": 4.802478120875125e-06, "loss": 0.0159, "reward": 1.534609392285347, "reward_std": 1.030179588124156, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.3828125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2767968699336052, "step": 201 }, { "completion_length": 76.625, "epoch": 0.21624514920380034, "grad_norm": 2.740817070007324, "kl": 0.14991699904203415, "learning_rate": 4.7988194313786275e-06, "loss": 0.006, "reward": 2.134265646338463, "reward_std": 0.8027890680823475, "rewards/correctness_reward_func": 1.375, "rewards/int_reward_func": 0.4765625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.28270312771201134, "step": 202 }, { "completion_length": 86.53125, "epoch": 0.21731566974441321, "grad_norm": 5.491028785705566, "kl": 0.32305468805134296, "learning_rate": 4.795128587103315e-06, "loss": 0.0129, "reward": 2.177546873688698, "reward_std": 0.862015737220645, "rewards/correctness_reward_func": 1.46875, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2556718774139881, "step": 203 }, { "completion_length": 80.640625, "epoch": 0.2183861902850261, "grad_norm": 3.542194366455078, "kl": 0.13027278054505587, "learning_rate": 4.791405639674941e-06, "loss": 0.0052, "reward": 1.6867187470197678, "reward_std": 0.8688598442822695, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.4609375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0078125, "rewards/xmlcount_reward_func": 0.28046874329447746, "step": 204 }, { "completion_length": 75.96875, "epoch": 0.21945671082563897, "grad_norm": 3.078350305557251, "kl": 0.1293167658150196, "learning_rate": 4.7876506411683e-06, "loss": 0.0052, "reward": 2.0141249895095825, "reward_std": 0.7147551532834768, "rewards/correctness_reward_func": 1.21875, "rewards/int_reward_func": 0.4765625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.31881249509751797, "step": 205 }, { "completion_length": 62.734375, "epoch": 0.22052723136625185, "grad_norm": 7.453506946563721, "kl": 0.8046054858714342, "learning_rate": 4.783863644106502e-06, "loss": 0.0322, "reward": 1.6788437813520432, "reward_std": 0.8941609086468816, "rewards/correctness_reward_func": 0.96875, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.3038437496870756, "step": 206 }, { "completion_length": 68.671875, "epoch": 0.2215977519068647, "grad_norm": 4.4995436668396, "kl": 0.14437556639313698, "learning_rate": 4.780044701460239e-06, "loss": 0.0058, "reward": 2.126671925187111, "reward_std": 0.7776627587154508, "rewards/correctness_reward_func": 1.34375, "rewards/int_reward_func": 0.4765625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0078125, "rewards/xmlcount_reward_func": 0.29854688607156277, "step": 207 }, { "completion_length": 54.75, "epoch": 0.22266827244747758, "grad_norm": 3.271150588989258, "kl": 0.2097001215443015, "learning_rate": 4.7761938666470405e-06, "loss": 0.0084, "reward": 1.956812545657158, "reward_std": 0.9061004631221294, "rewards/correctness_reward_func": 1.125, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0078125, "rewards/xmlcount_reward_func": 0.3552500009536743, "step": 208 }, { "completion_length": 72.578125, "epoch": 0.22373879298809046, "grad_norm": 3.0446856021881104, "kl": 0.15493952203541994, "learning_rate": 4.7723111935305275e-06, "loss": 0.0062, "reward": 2.270968735218048, "reward_std": 0.8504615277051926, "rewards/correctness_reward_func": 1.46875, "rewards/int_reward_func": 0.4765625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.32565624453127384, "step": 209 }, { "completion_length": 69.046875, "epoch": 0.22480931352870334, "grad_norm": 5.6605095863342285, "kl": 0.44642951618880033, "learning_rate": 4.7683967364196624e-06, "loss": 0.0179, "reward": 2.0304374992847443, "reward_std": 0.7277556583285332, "rewards/correctness_reward_func": 1.25, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.3116874936968088, "step": 210 }, { "completion_length": 64.9375, "epoch": 0.2258798340693162, "grad_norm": 4.651093482971191, "kl": 0.17161214351654053, "learning_rate": 4.764450550067986e-06, "loss": 0.0069, "reward": 1.935359388589859, "reward_std": 0.850550489500165, "rewards/correctness_reward_func": 1.15625, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.3259843699634075, "step": 211 }, { "completion_length": 99.921875, "epoch": 0.22695035460992907, "grad_norm": 7.470244407653809, "kl": 0.5526934135705233, "learning_rate": 4.760472689672851e-06, "loss": 0.0221, "reward": 1.62957813590765, "reward_std": 0.9631365463137627, "rewards/correctness_reward_func": 1.0, "rewards/int_reward_func": 0.4140625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2155156247317791, "step": 212 }, { "completion_length": 64.296875, "epoch": 0.22802087515054195, "grad_norm": 4.030452251434326, "kl": 0.1725650643929839, "learning_rate": 4.7564632108746524e-06, "loss": 0.0069, "reward": 2.4495781660079956, "reward_std": 0.7331900605931878, "rewards/correctness_reward_func": 1.625, "rewards/int_reward_func": 0.4765625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0078125, "rewards/xmlcount_reward_func": 0.3402031324803829, "step": 213 }, { "completion_length": 57.0625, "epoch": 0.22909139569115483, "grad_norm": 2.44975209236145, "kl": 0.1745634926483035, "learning_rate": 4.752422169756048e-06, "loss": 0.007, "reward": 2.49567186832428, "reward_std": 0.585249027935788, "rewards/correctness_reward_func": 1.65625, "rewards/int_reward_func": 0.4765625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.3628593757748604, "step": 214 }, { "completion_length": 80.609375, "epoch": 0.2301619162317677, "grad_norm": 3.696011781692505, "kl": 0.1569390268996358, "learning_rate": 4.7483496228411754e-06, "loss": 0.0063, "reward": 2.0820469111204147, "reward_std": 0.7507896656170487, "rewards/correctness_reward_func": 1.34375, "rewards/int_reward_func": 0.4765625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.26173438131809235, "step": 215 }, { "completion_length": 67.90625, "epoch": 0.23123243677238056, "grad_norm": 3.6006627082824707, "kl": 0.1786866094917059, "learning_rate": 4.744245627094859e-06, "loss": 0.0071, "reward": 2.0825624614953995, "reward_std": 0.8309154035523534, "rewards/correctness_reward_func": 1.28125, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.015625, "rewards/xmlcount_reward_func": 0.33256249874830246, "step": 216 }, { "completion_length": 82.53125, "epoch": 0.23230295731299344, "grad_norm": 4.304072856903076, "kl": 0.31106262002140284, "learning_rate": 4.740110239921813e-06, "loss": 0.0124, "reward": 1.406374990940094, "reward_std": 0.8875350207090378, "rewards/correctness_reward_func": 0.6875, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2657500021159649, "step": 217 }, { "completion_length": 57.75, "epoch": 0.23337347785360632, "grad_norm": 3.51265025138855, "kl": 0.18941342923790216, "learning_rate": 4.735943519165843e-06, "loss": 0.0076, "reward": 2.366296797990799, "reward_std": 0.6840489963069558, "rewards/correctness_reward_func": 1.5625, "rewards/int_reward_func": 0.4609375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.3428593724966049, "step": 218 }, { "completion_length": 69.6875, "epoch": 0.2344439983942192, "grad_norm": 3.1696012020111084, "kl": 0.144703084602952, "learning_rate": 4.731745523109029e-06, "loss": 0.0058, "reward": 1.7885156497359276, "reward_std": 0.8288924656808376, "rewards/correctness_reward_func": 1.0, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0078125, "rewards/xmlcount_reward_func": 0.3432031273841858, "step": 219 }, { "completion_length": 79.453125, "epoch": 0.23551451893483205, "grad_norm": 3.0204484462738037, "kl": 0.16810880228877068, "learning_rate": 4.72751631047092e-06, "loss": 0.0067, "reward": 1.817734345793724, "reward_std": 1.0543543472886086, "rewards/correctness_reward_func": 1.0625, "rewards/int_reward_func": 0.4296875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0234375, "rewards/xmlcount_reward_func": 0.302109370008111, "step": 220 }, { "completion_length": 89.984375, "epoch": 0.23658503947544493, "grad_norm": 9.887064933776855, "kl": 0.571788308210671, "learning_rate": 4.723255940407704e-06, "loss": 0.0229, "reward": 2.150187447667122, "reward_std": 0.7866512620821595, "rewards/correctness_reward_func": 1.375, "rewards/int_reward_func": 0.4765625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0234375, "rewards/xmlcount_reward_func": 0.27518749609589577, "step": 221 }, { "completion_length": 82.15625, "epoch": 0.2376555600160578, "grad_norm": 3.509312868118286, "kl": 0.16501779574900866, "learning_rate": 4.718964472511386e-06, "loss": 0.0066, "reward": 2.0418750420212746, "reward_std": 0.9721324890851974, "rewards/correctness_reward_func": 1.28125, "rewards/int_reward_func": 0.4453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0234375, "rewards/xmlcount_reward_func": 0.29187500290572643, "step": 222 }, { "completion_length": 69.703125, "epoch": 0.2387260805566707, "grad_norm": 3.7087841033935547, "kl": 0.18095918465405703, "learning_rate": 4.71464196680895e-06, "loss": 0.0072, "reward": 2.18121874332428, "reward_std": 0.870441822335124, "rewards/correctness_reward_func": 1.34375, "rewards/int_reward_func": 0.4765625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.015625, "rewards/xmlcount_reward_func": 0.3452812507748604, "step": 223 }, { "completion_length": 93.453125, "epoch": 0.23979660109728354, "grad_norm": 9.079336166381836, "kl": 0.7006166982464492, "learning_rate": 4.710288483761524e-06, "loss": 0.028, "reward": 2.110390603542328, "reward_std": 0.7916512079536915, "rewards/correctness_reward_func": 1.34375, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.03125, "rewards/xmlcount_reward_func": 0.2666406221687794, "step": 224 }, { "completion_length": 89.796875, "epoch": 0.24086712163789642, "grad_norm": 5.301533222198486, "kl": 0.33772587310522795, "learning_rate": 4.705904084263534e-06, "loss": 0.0135, "reward": 1.9977499693632126, "reward_std": 0.9460650207474828, "rewards/correctness_reward_func": 1.25, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.31025000289082527, "step": 225 }, { "completion_length": 93.375, "epoch": 0.2419376421785093, "grad_norm": 2.236737012863159, "kl": 0.11640047281980515, "learning_rate": 4.701488829641845e-06, "loss": 0.0047, "reward": 2.145703136920929, "reward_std": 0.8284804495051503, "rewards/correctness_reward_func": 1.40625, "rewards/int_reward_func": 0.4453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0078125, "rewards/xmlcount_reward_func": 0.2863281313329935, "step": 226 }, { "completion_length": 81.8125, "epoch": 0.24300816271912218, "grad_norm": 3.1928486824035645, "kl": 0.135368085000664, "learning_rate": 4.697042781654913e-06, "loss": 0.0054, "reward": 2.135328069329262, "reward_std": 0.7982124611735344, "rewards/correctness_reward_func": 1.3125, "rewards/int_reward_func": 0.4765625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0078125, "rewards/xmlcount_reward_func": 0.33845312520861626, "step": 227 }, { "completion_length": 85.453125, "epoch": 0.24407868325973506, "grad_norm": 12.730961799621582, "kl": 0.4075321350246668, "learning_rate": 4.692566002491917e-06, "loss": 0.0163, "reward": 2.0005937218666077, "reward_std": 0.8029458876699209, "rewards/correctness_reward_func": 1.21875, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.34434375166893005, "step": 228 }, { "completion_length": 72.875, "epoch": 0.2451492038003479, "grad_norm": 6.923903942108154, "kl": 0.6862649563699961, "learning_rate": 4.6880585547718845e-06, "loss": 0.0275, "reward": 1.7937656044960022, "reward_std": 0.915380734950304, "rewards/correctness_reward_func": 0.9375, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.015625, "rewards/xmlcount_reward_func": 0.371890626847744, "step": 229 }, { "completion_length": 93.75, "epoch": 0.2462197243409608, "grad_norm": 3.041198968887329, "kl": 0.109968694858253, "learning_rate": 4.683520501542825e-06, "loss": 0.0044, "reward": 2.041828043758869, "reward_std": 0.828435555100441, "rewards/correctness_reward_func": 1.25, "rewards/int_reward_func": 0.4296875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.03125, "rewards/xmlcount_reward_func": 0.3308906201273203, "step": 230 }, { "completion_length": 78.359375, "epoch": 0.24729024488157367, "grad_norm": 3.0014753341674805, "kl": 0.13458310719579458, "learning_rate": 4.67895190628084e-06, "loss": 0.0054, "reward": 2.4933906197547913, "reward_std": 0.8564620353281498, "rewards/correctness_reward_func": 1.625, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0234375, "rewards/xmlcount_reward_func": 0.37620312348008156, "step": 231 }, { "completion_length": 93.8125, "epoch": 0.24836076542218655, "grad_norm": 4.2342848777771, "kl": 0.1281078103929758, "learning_rate": 4.674352832889239e-06, "loss": 0.0051, "reward": 2.4442031383514404, "reward_std": 0.7425322765484452, "rewards/correctness_reward_func": 1.59375, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0234375, "rewards/xmlcount_reward_func": 0.3426406290382147, "step": 232 }, { "completion_length": 91.703125, "epoch": 0.2494312859627994, "grad_norm": 4.6626811027526855, "kl": 0.30947081558406353, "learning_rate": 4.669723345697646e-06, "loss": 0.0124, "reward": 1.7927343994379044, "reward_std": 1.0577923730015755, "rewards/correctness_reward_func": 1.0, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0234375, "rewards/xmlcount_reward_func": 0.3317968789488077, "step": 233 }, { "completion_length": 95.1875, "epoch": 0.2505018065034123, "grad_norm": 2.6523685455322266, "kl": 0.12286860542371869, "learning_rate": 4.665063509461098e-06, "loss": 0.0049, "reward": 2.122484341263771, "reward_std": 0.6617152327671647, "rewards/correctness_reward_func": 1.28125, "rewards/int_reward_func": 0.4453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.015625, "rewards/xmlcount_reward_func": 0.38029688596725464, "step": 234 }, { "completion_length": 100.3125, "epoch": 0.25157232704402516, "grad_norm": 3.743971586227417, "kl": 0.2634156849235296, "learning_rate": 4.660373389359137e-06, "loss": 0.0105, "reward": 1.9493124820291996, "reward_std": 0.6555369319394231, "rewards/correctness_reward_func": 1.1875, "rewards/int_reward_func": 0.4296875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.3321250043809414, "step": 235 }, { "completion_length": 91.890625, "epoch": 0.252642847584638, "grad_norm": 5.421882629394531, "kl": 0.36969919549301267, "learning_rate": 4.655653050994907e-06, "loss": 0.0148, "reward": 2.489093706011772, "reward_std": 0.5151624148711562, "rewards/correctness_reward_func": 1.59375, "rewards/int_reward_func": 0.4765625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.03125, "rewards/xmlcount_reward_func": 0.38753124326467514, "step": 236 }, { "completion_length": 107.421875, "epoch": 0.2537133681252509, "grad_norm": 3.658877372741699, "kl": 0.3179207113571465, "learning_rate": 4.650902560394225e-06, "loss": 0.0127, "reward": 2.1520937085151672, "reward_std": 0.7971650678664446, "rewards/correctness_reward_func": 1.34375, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0078125, "rewards/xmlcount_reward_func": 0.3474062494933605, "step": 237 }, { "completion_length": 96.75, "epoch": 0.25478388866586377, "grad_norm": 2.333758592605591, "kl": 0.14108293130993843, "learning_rate": 4.646121984004666e-06, "loss": 0.0056, "reward": 2.4891093373298645, "reward_std": 0.7755477353930473, "rewards/correctness_reward_func": 1.59375, "rewards/int_reward_func": 0.484375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0234375, "rewards/xmlcount_reward_func": 0.38754688017070293, "step": 238 }, { "completion_length": 97.796875, "epoch": 0.2558544092064767, "grad_norm": 7.903679370880127, "kl": 0.6723966179415584, "learning_rate": 4.641311388694629e-06, "loss": 0.0269, "reward": 2.23892180621624, "reward_std": 0.8879001673776656, "rewards/correctness_reward_func": 1.40625, "rewards/int_reward_func": 0.4140625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0234375, "rewards/xmlcount_reward_func": 0.3951718807220459, "step": 239 }, { "completion_length": 128.1875, "epoch": 0.2569249297470895, "grad_norm": 2.0204544067382812, "kl": 0.09938508365303278, "learning_rate": 4.636470841752405e-06, "loss": 0.004, "reward": 1.7635936960577965, "reward_std": 0.65199055057019, "rewards/correctness_reward_func": 0.96875, "rewards/int_reward_func": 0.390625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0390625, "rewards/xmlcount_reward_func": 0.36515624821186066, "step": 240 }, { "completion_length": 95.03125, "epoch": 0.2579954502877024, "grad_norm": 2.7629244327545166, "kl": 0.16699408926069736, "learning_rate": 4.631600410885231e-06, "loss": 0.0067, "reward": 2.553484320640564, "reward_std": 0.6153040612116456, "rewards/correctness_reward_func": 1.65625, "rewards/int_reward_func": 0.4453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0234375, "rewards/xmlcount_reward_func": 0.42848438024520874, "step": 241 }, { "completion_length": 91.640625, "epoch": 0.2590659708283153, "grad_norm": 2.4757933616638184, "kl": 0.12886409275233746, "learning_rate": 4.626700164218349e-06, "loss": 0.0052, "reward": 2.60553115606308, "reward_std": 0.6941613564267755, "rewards/correctness_reward_func": 1.6875, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.015625, "rewards/xmlcount_reward_func": 0.4336562491953373, "step": 242 }, { "completion_length": 88.0, "epoch": 0.26013649136892814, "grad_norm": 5.133749961853027, "kl": 0.3455530842766166, "learning_rate": 4.621770170294049e-06, "loss": 0.0138, "reward": 1.829562470316887, "reward_std": 0.8116088081151247, "rewards/correctness_reward_func": 1.0, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.015625, "rewards/xmlcount_reward_func": 0.3920625038444996, "step": 243 }, { "completion_length": 106.015625, "epoch": 0.261207011909541, "grad_norm": 2.4294216632843018, "kl": 0.14171195961534977, "learning_rate": 4.6168104980707105e-06, "loss": 0.0057, "reward": 2.2230467945337296, "reward_std": 0.9806447625160217, "rewards/correctness_reward_func": 1.3125, "rewards/int_reward_func": 0.4453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0390625, "rewards/xmlcount_reward_func": 0.4261718839406967, "step": 244 }, { "completion_length": 97.625, "epoch": 0.2622775324501539, "grad_norm": 6.696681976318359, "kl": 0.5324421431869268, "learning_rate": 4.61182121692184e-06, "loss": 0.0213, "reward": 2.5623437613248825, "reward_std": 0.7769194557331502, "rewards/correctness_reward_func": 1.65625, "rewards/int_reward_func": 0.453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0234375, "rewards/xmlcount_reward_func": 0.4295312501490116, "step": 245 }, { "completion_length": 105.328125, "epoch": 0.26334805299076675, "grad_norm": 3.7250571250915527, "kl": 0.3119491417892277, "learning_rate": 4.606802396635098e-06, "loss": 0.0125, "reward": 2.0679530799388885, "reward_std": 1.0327460495755076, "rewards/correctness_reward_func": 1.25, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0078125, "rewards/xmlcount_reward_func": 0.388265622779727, "step": 246 }, { "completion_length": 101.0625, "epoch": 0.26441857353137965, "grad_norm": 2.0395541191101074, "kl": 0.1265430450439453, "learning_rate": 4.601754107411326e-06, "loss": 0.0051, "reward": 2.3826874494552612, "reward_std": 0.881409777328372, "rewards/correctness_reward_func": 1.46875, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.046875, "rewards/xmlcount_reward_func": 0.3983125016093254, "step": 247 }, { "completion_length": 119.25, "epoch": 0.2654890940719925, "grad_norm": 50.28816223144531, "kl": 1.0364861502312124, "learning_rate": 4.596676419863561e-06, "loss": 0.0415, "reward": 1.8263437151908875, "reward_std": 1.0226206295192242, "rewards/correctness_reward_func": 1.03125, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.046875, "rewards/xmlcount_reward_func": 0.37321874499320984, "step": 248 }, { "completion_length": 113.515625, "epoch": 0.26655961461260536, "grad_norm": 18.58391571044922, "kl": 1.4038016851991415, "learning_rate": 4.59156940501605e-06, "loss": 0.0562, "reward": 2.122187450528145, "reward_std": 0.7838817811571062, "rewards/correctness_reward_func": 1.25, "rewards/int_reward_func": 0.421875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.046875, "rewards/xmlcount_reward_func": 0.4034374989569187, "step": 249 }, { "completion_length": 77.8125, "epoch": 0.26763013515321826, "grad_norm": 2.225803852081299, "kl": 0.1744153881445527, "learning_rate": 4.586433134303257e-06, "loss": 0.007, "reward": 2.600734308362007, "reward_std": 0.6244143173098564, "rewards/correctness_reward_func": 1.59375, "rewards/int_reward_func": 0.4453125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.1015625, "rewards/xmlcount_reward_func": 0.4601093679666519, "step": 250 } ], "logging_steps": 1, "max_steps": 934, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 250, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }