| { |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 0.26763013515321826, |
| "eval_steps": 500, |
| "global_step": 250, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "completion_length": 166.625, |
| "epoch": 0.001070520540612873, |
| "grad_norm": 1.2984755039215088, |
| "kl": 0.0, |
| "learning_rate": 5.319148936170213e-08, |
| "loss": -0.0, |
| "reward": 0.24379686824977398, |
| "reward_std": 0.43905802024528384, |
| "rewards/correctness_reward_func": 0.1875, |
| "rewards/int_reward_func": 0.0546875, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.0016093750018626451, |
| "step": 1 |
| }, |
| { |
| "completion_length": 155.09375, |
| "epoch": 0.002141041081225746, |
| "grad_norm": 5.433530330657959, |
| "kl": 0.0, |
| "learning_rate": 1.0638297872340426e-07, |
| "loss": -0.0, |
| "reward": 0.7448437176644802, |
| "reward_std": 0.824664918705821, |
| "rewards/correctness_reward_func": 0.59375, |
| "rewards/int_reward_func": 0.1640625, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.012968750204890966, |
| "step": 2 |
| }, |
| { |
| "completion_length": 156.65625, |
| "epoch": 0.003211561621838619, |
| "grad_norm": 1.5975662469863892, |
| "kl": 0.0003120364726783009, |
| "learning_rate": 1.5957446808510638e-07, |
| "loss": 0.0, |
| "reward": 0.5617187460884452, |
| "reward_std": 0.6682680626399815, |
| "rewards/correctness_reward_func": 0.40625, |
| "rewards/int_reward_func": 0.1328125, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.02265625004656613, |
| "step": 3 |
| }, |
| { |
| "completion_length": 151.328125, |
| "epoch": 0.004282082162451492, |
| "grad_norm": 1.3767573833465576, |
| "kl": 0.00039493154326919466, |
| "learning_rate": 2.1276595744680852e-07, |
| "loss": 0.0, |
| "reward": 0.14101563091389835, |
| "reward_std": 0.40191352693364024, |
| "rewards/correctness_reward_func": 0.125, |
| "rewards/int_reward_func": 0.0546875, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.038671874441206455, |
| "step": 4 |
| }, |
| { |
| "completion_length": 143.8125, |
| "epoch": 0.005352602703064365, |
| "grad_norm": 1.7029815912246704, |
| "kl": 0.0003161150925734546, |
| "learning_rate": 2.6595744680851066e-07, |
| "loss": 0.0, |
| "reward": 0.6667499775066972, |
| "reward_std": 0.943404046818614, |
| "rewards/correctness_reward_func": 0.5, |
| "rewards/int_reward_func": 0.1328125, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.03393749863607809, |
| "step": 5 |
| }, |
| { |
| "completion_length": 146.5625, |
| "epoch": 0.006423123243677238, |
| "grad_norm": 4.5992960929870605, |
| "kl": 0.0014150730130495504, |
| "learning_rate": 3.1914893617021275e-07, |
| "loss": 0.0001, |
| "reward": 0.6142812594771385, |
| "reward_std": 0.8342159832827747, |
| "rewards/correctness_reward_func": 0.46875, |
| "rewards/int_reward_func": 0.1328125, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.01271874993108213, |
| "step": 6 |
| }, |
| { |
| "completion_length": 163.9375, |
| "epoch": 0.007493643784290111, |
| "grad_norm": 7.240438461303711, |
| "kl": 0.0018134960264433175, |
| "learning_rate": 3.723404255319149e-07, |
| "loss": 0.0001, |
| "reward": 0.24757812730967999, |
| "reward_std": 0.5592395211569965, |
| "rewards/correctness_reward_func": 0.1875, |
| "rewards/int_reward_func": 0.0703125, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.010234375484287739, |
| "step": 7 |
| }, |
| { |
| "completion_length": 151.84375, |
| "epoch": 0.008564164324902984, |
| "grad_norm": 4.826539993286133, |
| "kl": 0.0011592731952987378, |
| "learning_rate": 4.2553191489361704e-07, |
| "loss": 0.0, |
| "reward": 0.24482813104987144, |
| "reward_std": 0.4538201582618058, |
| "rewards/correctness_reward_func": 0.15625, |
| "rewards/int_reward_func": 0.046875, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.04170312359929085, |
| "step": 8 |
| }, |
| { |
| "completion_length": 148.828125, |
| "epoch": 0.009634684865515858, |
| "grad_norm": 4.097943305969238, |
| "kl": 0.0009884996707114624, |
| "learning_rate": 4.787234042553192e-07, |
| "loss": 0.0, |
| "reward": 0.6986562423408031, |
| "reward_std": 1.089426226913929, |
| "rewards/correctness_reward_func": 0.5625, |
| "rewards/int_reward_func": 0.1171875, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.018968748801853508, |
| "step": 9 |
| }, |
| { |
| "completion_length": 160.3125, |
| "epoch": 0.01070520540612873, |
| "grad_norm": 3.2978594303131104, |
| "kl": 0.0008196280577976722, |
| "learning_rate": 5.319148936170213e-07, |
| "loss": 0.0, |
| "reward": 0.48834376223385334, |
| "reward_std": 0.790836479049176, |
| "rewards/correctness_reward_func": 0.375, |
| "rewards/int_reward_func": 0.1171875, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.003843750571832061, |
| "step": 10 |
| }, |
| { |
| "completion_length": 146.96875, |
| "epoch": 0.011775725946741603, |
| "grad_norm": 3.537848472595215, |
| "kl": 0.0007083387099555694, |
| "learning_rate": 5.851063829787235e-07, |
| "loss": 0.0, |
| "reward": 0.5805312437005341, |
| "reward_std": 0.7569947894662619, |
| "rewards/correctness_reward_func": 0.46875, |
| "rewards/int_reward_func": 0.1328125, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.021031250711530447, |
| "step": 11 |
| }, |
| { |
| "completion_length": 145.890625, |
| "epoch": 0.012846246487354477, |
| "grad_norm": 3.8045260906219482, |
| "kl": 0.0013589818336186, |
| "learning_rate": 6.382978723404255e-07, |
| "loss": 0.0001, |
| "reward": 0.46695311937946826, |
| "reward_std": 0.6352905407547951, |
| "rewards/correctness_reward_func": 0.3125, |
| "rewards/int_reward_func": 0.109375, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.04507812508381903, |
| "step": 12 |
| }, |
| { |
| "completion_length": 146.5, |
| "epoch": 0.013916767027967349, |
| "grad_norm": 1.7494301795959473, |
| "kl": 0.000400338125473354, |
| "learning_rate": 6.914893617021278e-07, |
| "loss": 0.0, |
| "reward": 0.5322968787513673, |
| "reward_std": 0.7736401874572039, |
| "rewards/correctness_reward_func": 0.40625, |
| "rewards/int_reward_func": 0.1171875, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.00885937490966171, |
| "step": 13 |
| }, |
| { |
| "completion_length": 133.671875, |
| "epoch": 0.014987287568580221, |
| "grad_norm": 1.8054606914520264, |
| "kl": 0.00036212212944519706, |
| "learning_rate": 7.446808510638298e-07, |
| "loss": 0.0, |
| "reward": 0.6521718641743064, |
| "reward_std": 1.0104734068736434, |
| "rewards/correctness_reward_func": 0.53125, |
| "rewards/int_reward_func": 0.1328125, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.011890625639352947, |
| "step": 14 |
| }, |
| { |
| "completion_length": 130.0, |
| "epoch": 0.016057808109193095, |
| "grad_norm": 1.5193248987197876, |
| "kl": 0.00036491416904027574, |
| "learning_rate": 7.97872340425532e-07, |
| "loss": 0.0, |
| "reward": 0.5991093763150275, |
| "reward_std": 0.8120721196755767, |
| "rewards/correctness_reward_func": 0.46875, |
| "rewards/int_reward_func": 0.125, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.005359375616535544, |
| "step": 15 |
| }, |
| { |
| "completion_length": 162.640625, |
| "epoch": 0.017128328649805968, |
| "grad_norm": 4.938427448272705, |
| "kl": 0.0014603480958612636, |
| "learning_rate": 8.510638297872341e-07, |
| "loss": 0.0001, |
| "reward": 0.2470156280323863, |
| "reward_std": 0.5583461234346032, |
| "rewards/correctness_reward_func": 0.1875, |
| "rewards/int_reward_func": 0.0703125, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.010796874179504812, |
| "step": 16 |
| }, |
| { |
| "completion_length": 165.796875, |
| "epoch": 0.01819884919041884, |
| "grad_norm": 3.923428535461426, |
| "kl": 0.001025654159093392, |
| "learning_rate": 9.042553191489363e-07, |
| "loss": 0.0, |
| "reward": 0.5893437387421727, |
| "reward_std": 0.8829143429175019, |
| "rewards/correctness_reward_func": 0.46875, |
| "rewards/int_reward_func": 0.125, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.004406251944601536, |
| "step": 17 |
| }, |
| { |
| "completion_length": 152.359375, |
| "epoch": 0.019269369731031716, |
| "grad_norm": 5.248744487762451, |
| "kl": 0.0020197606609144714, |
| "learning_rate": 9.574468085106384e-07, |
| "loss": 0.0001, |
| "reward": 0.46303125098347664, |
| "reward_std": 0.9115365371108055, |
| "rewards/correctness_reward_func": 0.34375, |
| "rewards/int_reward_func": 0.1015625, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.017718749470077455, |
| "step": 18 |
| }, |
| { |
| "completion_length": 151.359375, |
| "epoch": 0.020339890271644588, |
| "grad_norm": 1.7978895902633667, |
| "kl": 0.00032554956487729214, |
| "learning_rate": 1.0106382978723404e-06, |
| "loss": 0.0, |
| "reward": 0.48937500442843884, |
| "reward_std": 0.7031663246452808, |
| "rewards/correctness_reward_func": 0.375, |
| "rewards/int_reward_func": 0.109375, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.005000000121071935, |
| "step": 19 |
| }, |
| { |
| "completion_length": 175.90625, |
| "epoch": 0.02141041081225746, |
| "grad_norm": 2.1679224967956543, |
| "kl": 0.0004576210667437408, |
| "learning_rate": 1.0638297872340427e-06, |
| "loss": 0.0, |
| "reward": 0.44935936853289604, |
| "reward_std": 0.6872247559949756, |
| "rewards/correctness_reward_func": 0.34375, |
| "rewards/int_reward_func": 0.09375, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.011859375052154064, |
| "step": 20 |
| }, |
| { |
| "completion_length": 167.265625, |
| "epoch": 0.022480931352870333, |
| "grad_norm": 1.475581407546997, |
| "kl": 0.0007880991906858981, |
| "learning_rate": 1.1170212765957447e-06, |
| "loss": 0.0, |
| "reward": 0.28507812274619937, |
| "reward_std": 0.5523091573268175, |
| "rewards/correctness_reward_func": 0.21875, |
| "rewards/int_reward_func": 0.0625, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.003828124259598553, |
| "step": 21 |
| }, |
| { |
| "completion_length": 153.296875, |
| "epoch": 0.023551451893483205, |
| "grad_norm": 1.6419923305511475, |
| "kl": 0.00039223546627908945, |
| "learning_rate": 1.170212765957447e-06, |
| "loss": 0.0, |
| "reward": 0.6624062322080135, |
| "reward_std": 0.9240671265870333, |
| "rewards/correctness_reward_func": 0.5, |
| "rewards/int_reward_func": 0.1328125, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.0295937517657876, |
| "step": 22 |
| }, |
| { |
| "completion_length": 143.515625, |
| "epoch": 0.02462197243409608, |
| "grad_norm": 1.3294110298156738, |
| "kl": 0.00030555322518921457, |
| "learning_rate": 1.223404255319149e-06, |
| "loss": 0.0, |
| "reward": 0.26720312132965773, |
| "reward_std": 0.4894332850817591, |
| "rewards/correctness_reward_func": 0.1875, |
| "rewards/int_reward_func": 0.0546875, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.025015624007210135, |
| "step": 23 |
| }, |
| { |
| "completion_length": 148.3125, |
| "epoch": 0.025692492974708953, |
| "grad_norm": 6.915380954742432, |
| "kl": 0.0062694076787011, |
| "learning_rate": 1.276595744680851e-06, |
| "loss": 0.0003, |
| "reward": 0.28068749560043216, |
| "reward_std": 0.5439753192476928, |
| "rewards/correctness_reward_func": 0.1875, |
| "rewards/int_reward_func": 0.0625, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.030687499791383743, |
| "step": 24 |
| }, |
| { |
| "completion_length": 145.421875, |
| "epoch": 0.026763013515321826, |
| "grad_norm": 1.5796961784362793, |
| "kl": 0.0008160970010067103, |
| "learning_rate": 1.3297872340425533e-06, |
| "loss": 0.0, |
| "reward": 0.604515643324703, |
| "reward_std": 0.6846362175419927, |
| "rewards/correctness_reward_func": 0.4375, |
| "rewards/int_reward_func": 0.1171875, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.049828124698251486, |
| "step": 25 |
| }, |
| { |
| "completion_length": 144.21875, |
| "epoch": 0.027833534055934698, |
| "grad_norm": 3.3064963817596436, |
| "kl": 0.0009157936146948487, |
| "learning_rate": 1.3829787234042555e-06, |
| "loss": 0.0, |
| "reward": 0.5731093874201179, |
| "reward_std": 0.8484273846261203, |
| "rewards/correctness_reward_func": 0.4375, |
| "rewards/int_reward_func": 0.1171875, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.018421874032355845, |
| "step": 26 |
| }, |
| { |
| "completion_length": 135.921875, |
| "epoch": 0.02890405459654757, |
| "grad_norm": 1.8949307203292847, |
| "kl": 0.0004942502673657145, |
| "learning_rate": 1.4361702127659578e-06, |
| "loss": 0.0, |
| "reward": 0.6309375101700425, |
| "reward_std": 0.9806207492947578, |
| "rewards/correctness_reward_func": 0.5, |
| "rewards/int_reward_func": 0.125, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.005937499925494194, |
| "step": 27 |
| }, |
| { |
| "completion_length": 153.59375, |
| "epoch": 0.029974575137160443, |
| "grad_norm": 1.7615420818328857, |
| "kl": 0.0004919220991723705, |
| "learning_rate": 1.4893617021276596e-06, |
| "loss": 0.0, |
| "reward": 0.1835937526775524, |
| "reward_std": 0.45627398509532213, |
| "rewards/correctness_reward_func": 0.15625, |
| "rewards/int_reward_func": 0.0546875, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.027343749883584678, |
| "step": 28 |
| }, |
| { |
| "completion_length": 164.640625, |
| "epoch": 0.03104509567777332, |
| "grad_norm": 4.3193230628967285, |
| "kl": 0.0013858377351425588, |
| "learning_rate": 1.5425531914893618e-06, |
| "loss": 0.0001, |
| "reward": 0.40034375386312604, |
| "reward_std": 0.7546388749033213, |
| "rewards/correctness_reward_func": 0.28125, |
| "rewards/int_reward_func": 0.109375, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.009718750137835741, |
| "step": 29 |
| }, |
| { |
| "completion_length": 128.234375, |
| "epoch": 0.03211561621838619, |
| "grad_norm": 5.113959312438965, |
| "kl": 0.0021859680928173475, |
| "learning_rate": 1.595744680851064e-06, |
| "loss": 0.0001, |
| "reward": 0.8881718653719872, |
| "reward_std": 0.9410315058194101, |
| "rewards/correctness_reward_func": 0.6875, |
| "rewards/int_reward_func": 0.1796875, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.02098437474342063, |
| "step": 30 |
| }, |
| { |
| "completion_length": 140.8125, |
| "epoch": 0.03318613675899906, |
| "grad_norm": 7.337815284729004, |
| "kl": 0.00254826245145523, |
| "learning_rate": 1.648936170212766e-06, |
| "loss": 0.0001, |
| "reward": 0.7087812423706055, |
| "reward_std": 0.9508242532610893, |
| "rewards/correctness_reward_func": 0.53125, |
| "rewards/int_reward_func": 0.15625, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.021281250286847353, |
| "step": 31 |
| }, |
| { |
| "completion_length": 141.0625, |
| "epoch": 0.034256657299611935, |
| "grad_norm": 4.055324554443359, |
| "kl": 0.0016690000156813767, |
| "learning_rate": 1.7021276595744682e-06, |
| "loss": 0.0001, |
| "reward": 0.7299062572419643, |
| "reward_std": 0.8444140013307333, |
| "rewards/correctness_reward_func": 0.5625, |
| "rewards/int_reward_func": 0.1328125, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.03459375072270632, |
| "step": 32 |
| }, |
| { |
| "completion_length": 154.296875, |
| "epoch": 0.03532717784022481, |
| "grad_norm": 3.565863847732544, |
| "kl": 0.0011733200299204327, |
| "learning_rate": 1.7553191489361704e-06, |
| "loss": 0.0, |
| "reward": 0.7152343707857653, |
| "reward_std": 0.986182201653719, |
| "rewards/correctness_reward_func": 0.5625, |
| "rewards/int_reward_func": 0.1640625, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.011328124441206455, |
| "step": 33 |
| }, |
| { |
| "completion_length": 156.484375, |
| "epoch": 0.03639769838083768, |
| "grad_norm": 2.782845973968506, |
| "kl": 0.001061345017660642, |
| "learning_rate": 1.8085106382978727e-06, |
| "loss": 0.0, |
| "reward": 0.4085781138855964, |
| "reward_std": 0.6946883676573634, |
| "rewards/correctness_reward_func": 0.3125, |
| "rewards/int_reward_func": 0.09375, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.0023281261092051864, |
| "step": 34 |
| }, |
| { |
| "completion_length": 166.203125, |
| "epoch": 0.03746821892145055, |
| "grad_norm": 1.6768676042556763, |
| "kl": 0.0008204600453609601, |
| "learning_rate": 1.8617021276595745e-06, |
| "loss": 0.0, |
| "reward": 0.5197031321004033, |
| "reward_std": 0.8639188781380653, |
| "rewards/correctness_reward_func": 0.40625, |
| "rewards/int_reward_func": 0.1171875, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.0037343755830079317, |
| "step": 35 |
| }, |
| { |
| "completion_length": 168.828125, |
| "epoch": 0.03853873946206343, |
| "grad_norm": 7.892724990844727, |
| "kl": 0.0029609855264425278, |
| "learning_rate": 1.9148936170212767e-06, |
| "loss": 0.0001, |
| "reward": 0.23489062942098826, |
| "reward_std": 0.356358939781785, |
| "rewards/correctness_reward_func": 0.15625, |
| "rewards/int_reward_func": 0.046875, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.031765625230036676, |
| "step": 36 |
| }, |
| { |
| "completion_length": 123.53125, |
| "epoch": 0.039609260002676304, |
| "grad_norm": 2.5262649059295654, |
| "kl": 0.0010402118496131152, |
| "learning_rate": 1.968085106382979e-06, |
| "loss": 0.0, |
| "reward": 0.9631250270176679, |
| "reward_std": 1.0183926988393068, |
| "rewards/correctness_reward_func": 0.71875, |
| "rewards/int_reward_func": 0.1796875, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.06468750000931323, |
| "step": 37 |
| }, |
| { |
| "completion_length": 159.46875, |
| "epoch": 0.040679780543289176, |
| "grad_norm": 1.3111544847488403, |
| "kl": 0.0011480498651508242, |
| "learning_rate": 2.021276595744681e-06, |
| "loss": 0.0, |
| "reward": 0.4472187543287873, |
| "reward_std": 0.6874936055392027, |
| "rewards/correctness_reward_func": 0.34375, |
| "rewards/int_reward_func": 0.09375, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.009718748508021235, |
| "step": 38 |
| }, |
| { |
| "completion_length": 142.25, |
| "epoch": 0.04175030108390205, |
| "grad_norm": 2.2276878356933594, |
| "kl": 0.0014690053576487117, |
| "learning_rate": 2.074468085106383e-06, |
| "loss": 0.0001, |
| "reward": 1.0481719109229743, |
| "reward_std": 0.7983668614178896, |
| "rewards/correctness_reward_func": 0.8125, |
| "rewards/int_reward_func": 0.1796875, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.05598437529988587, |
| "step": 39 |
| }, |
| { |
| "completion_length": 149.71875, |
| "epoch": 0.04282082162451492, |
| "grad_norm": 3.2188539505004883, |
| "kl": 0.002757413443760015, |
| "learning_rate": 2.1276595744680853e-06, |
| "loss": 0.0001, |
| "reward": 0.8439687644131482, |
| "reward_std": 0.9366709599271417, |
| "rewards/correctness_reward_func": 0.625, |
| "rewards/int_reward_func": 0.171875, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.04709374951198697, |
| "step": 40 |
| }, |
| { |
| "completion_length": 153.75, |
| "epoch": 0.04389134216512779, |
| "grad_norm": 1.609779953956604, |
| "kl": 0.002587423972727265, |
| "learning_rate": 2.1808510638297876e-06, |
| "loss": 0.0001, |
| "reward": 0.9616250339895487, |
| "reward_std": 0.8783023115247488, |
| "rewards/correctness_reward_func": 0.75, |
| "rewards/int_reward_func": 0.1953125, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.01631249929778278, |
| "step": 41 |
| }, |
| { |
| "completion_length": 156.15625, |
| "epoch": 0.044961862705740666, |
| "grad_norm": 5.789142608642578, |
| "kl": 0.0045223182532936335, |
| "learning_rate": 2.2340425531914894e-06, |
| "loss": 0.0002, |
| "reward": 0.8380624754354358, |
| "reward_std": 0.881235895678401, |
| "rewards/correctness_reward_func": 0.625, |
| "rewards/int_reward_func": 0.1796875, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.03337499825283885, |
| "step": 42 |
| }, |
| { |
| "completion_length": 161.390625, |
| "epoch": 0.04603238324635354, |
| "grad_norm": 5.589737892150879, |
| "kl": 0.005504744782228954, |
| "learning_rate": 2.2872340425531916e-06, |
| "loss": 0.0002, |
| "reward": 0.8749218666926026, |
| "reward_std": 0.9216371476650238, |
| "rewards/correctness_reward_func": 0.6875, |
| "rewards/int_reward_func": 0.1796875, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.007734375540167093, |
| "step": 43 |
| }, |
| { |
| "completion_length": 161.796875, |
| "epoch": 0.04710290378696641, |
| "grad_norm": 5.604761600494385, |
| "kl": 0.0046821657015243545, |
| "learning_rate": 2.340425531914894e-06, |
| "loss": 0.0002, |
| "reward": 0.2252656314522028, |
| "reward_std": 0.47946364153176546, |
| "rewards/correctness_reward_func": 0.15625, |
| "rewards/int_reward_func": 0.0546875, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.014328125165775418, |
| "step": 44 |
| }, |
| { |
| "completion_length": 154.453125, |
| "epoch": 0.04817342432757928, |
| "grad_norm": 9.506386756896973, |
| "kl": 0.007164878101320937, |
| "learning_rate": 2.393617021276596e-06, |
| "loss": 0.0003, |
| "reward": 0.6195624829269946, |
| "reward_std": 1.0045473407953978, |
| "rewards/correctness_reward_func": 0.46875, |
| "rewards/int_reward_func": 0.1328125, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.01799999945797026, |
| "step": 45 |
| }, |
| { |
| "completion_length": 140.390625, |
| "epoch": 0.04924394486819216, |
| "grad_norm": 3.766388416290283, |
| "kl": 0.00619677483337, |
| "learning_rate": 2.446808510638298e-06, |
| "loss": 0.0002, |
| "reward": 0.9423750173300505, |
| "reward_std": 0.5746848955750465, |
| "rewards/correctness_reward_func": 0.65625, |
| "rewards/int_reward_func": 0.203125, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.08299999847076833, |
| "step": 46 |
| }, |
| { |
| "completion_length": 143.09375, |
| "epoch": 0.050314465408805034, |
| "grad_norm": 4.4466705322265625, |
| "kl": 0.009431202546693385, |
| "learning_rate": 2.5e-06, |
| "loss": 0.0004, |
| "reward": 0.9380937227979302, |
| "reward_std": 0.658873830921948, |
| "rewards/correctness_reward_func": 0.6875, |
| "rewards/int_reward_func": 0.1796875, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.07090624794363976, |
| "step": 47 |
| }, |
| { |
| "completion_length": 165.0625, |
| "epoch": 0.051384985949417906, |
| "grad_norm": 1.3459303379058838, |
| "kl": 0.00600922666490078, |
| "learning_rate": 2.553191489361702e-06, |
| "loss": 0.0002, |
| "reward": 0.6878281269455329, |
| "reward_std": 0.9325386872515082, |
| "rewards/correctness_reward_func": 0.5, |
| "rewards/int_reward_func": 0.15625, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.03157812531571835, |
| "step": 48 |
| }, |
| { |
| "completion_length": 134.421875, |
| "epoch": 0.05245550649003078, |
| "grad_norm": 1.628507375717163, |
| "kl": 0.008930853742640465, |
| "learning_rate": 2.6063829787234047e-06, |
| "loss": 0.0004, |
| "reward": 0.8599374926416203, |
| "reward_std": 0.9755922555923462, |
| "rewards/correctness_reward_func": 0.625, |
| "rewards/int_reward_func": 0.1796875, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.05525000113993883, |
| "step": 49 |
| }, |
| { |
| "completion_length": 149.5625, |
| "epoch": 0.05352602703064365, |
| "grad_norm": 1.1767226457595825, |
| "kl": 0.012399342958815396, |
| "learning_rate": 2.6595744680851065e-06, |
| "loss": 0.0005, |
| "reward": 1.0091875102370977, |
| "reward_std": 0.8712345249950886, |
| "rewards/correctness_reward_func": 0.71875, |
| "rewards/int_reward_func": 0.203125, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.08731249999254942, |
| "step": 50 |
| }, |
| { |
| "completion_length": 169.078125, |
| "epoch": 0.05459654757125652, |
| "grad_norm": 2.5901050567626953, |
| "kl": 0.007969769008923322, |
| "learning_rate": 2.7127659574468084e-06, |
| "loss": 0.0003, |
| "reward": 0.9503125064074993, |
| "reward_std": 0.9690856691449881, |
| "rewards/correctness_reward_func": 0.6875, |
| "rewards/int_reward_func": 0.234375, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.028437498025596142, |
| "step": 51 |
| }, |
| { |
| "completion_length": 166.390625, |
| "epoch": 0.055667068111869396, |
| "grad_norm": 3.764841079711914, |
| "kl": 0.011693944863509387, |
| "learning_rate": 2.765957446808511e-06, |
| "loss": 0.0005, |
| "reward": 0.7889687474817038, |
| "reward_std": 0.8161248974502087, |
| "rewards/correctness_reward_func": 0.5625, |
| "rewards/int_reward_func": 0.1875, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.03896875097416341, |
| "step": 52 |
| }, |
| { |
| "completion_length": 158.3125, |
| "epoch": 0.05673758865248227, |
| "grad_norm": 4.34719705581665, |
| "kl": 0.00852247714647092, |
| "learning_rate": 2.819148936170213e-06, |
| "loss": 0.0003, |
| "reward": 0.5854531275108457, |
| "reward_std": 0.579142062459141, |
| "rewards/correctness_reward_func": 0.4375, |
| "rewards/int_reward_func": 0.1484375, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.00048437435179948807, |
| "step": 53 |
| }, |
| { |
| "completion_length": 149.15625, |
| "epoch": 0.05780810919309514, |
| "grad_norm": 1.3242722749710083, |
| "kl": 0.006340013263979927, |
| "learning_rate": 2.8723404255319155e-06, |
| "loss": 0.0003, |
| "reward": 0.7202812694013119, |
| "reward_std": 0.6025513117201626, |
| "rewards/correctness_reward_func": 0.5, |
| "rewards/int_reward_func": 0.140625, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.07965625170618296, |
| "step": 54 |
| }, |
| { |
| "completion_length": 131.0, |
| "epoch": 0.05887862973370801, |
| "grad_norm": 1.2887805700302124, |
| "kl": 0.005301086028339341, |
| "learning_rate": 2.9255319148936174e-06, |
| "loss": 0.0002, |
| "reward": 1.1997031308710575, |
| "reward_std": 0.8454109290614724, |
| "rewards/correctness_reward_func": 0.875, |
| "rewards/int_reward_func": 0.21875, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.10595312505029142, |
| "step": 55 |
| }, |
| { |
| "completion_length": 132.828125, |
| "epoch": 0.059949150274320885, |
| "grad_norm": 3.085810899734497, |
| "kl": 0.03685568018408958, |
| "learning_rate": 2.978723404255319e-06, |
| "loss": 0.0015, |
| "reward": 1.32792192324996, |
| "reward_std": 0.8808077229186893, |
| "rewards/correctness_reward_func": 1.0, |
| "rewards/int_reward_func": 0.265625, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.062296870397403836, |
| "step": 56 |
| }, |
| { |
| "completion_length": 169.5, |
| "epoch": 0.061019670814933764, |
| "grad_norm": 3.218595266342163, |
| "kl": 0.019670582871185616, |
| "learning_rate": 3.031914893617022e-06, |
| "loss": 0.0008, |
| "reward": 1.0494843795895576, |
| "reward_std": 0.8383767995983362, |
| "rewards/correctness_reward_func": 0.78125, |
| "rewards/int_reward_func": 0.1953125, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.07292187376879156, |
| "step": 57 |
| }, |
| { |
| "completion_length": 164.71875, |
| "epoch": 0.06209019135554664, |
| "grad_norm": 1.189056634902954, |
| "kl": 0.010589714744128287, |
| "learning_rate": 3.0851063829787237e-06, |
| "loss": 0.0004, |
| "reward": 0.9995937808416784, |
| "reward_std": 0.8763523772358894, |
| "rewards/correctness_reward_func": 0.75, |
| "rewards/int_reward_func": 0.234375, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.015218749409541488, |
| "step": 58 |
| }, |
| { |
| "completion_length": 173.109375, |
| "epoch": 0.0631607118961595, |
| "grad_norm": 2.9426801204681396, |
| "kl": 0.010116680678038392, |
| "learning_rate": 3.1382978723404255e-06, |
| "loss": 0.0004, |
| "reward": 0.5431718821637332, |
| "reward_std": 0.5429769204929471, |
| "rewards/correctness_reward_func": 0.40625, |
| "rewards/int_reward_func": 0.1015625, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.03535937680862844, |
| "step": 59 |
| }, |
| { |
| "completion_length": 148.328125, |
| "epoch": 0.06423123243677238, |
| "grad_norm": 1.2039525508880615, |
| "kl": 0.012681124440860003, |
| "learning_rate": 3.191489361702128e-06, |
| "loss": 0.0005, |
| "reward": 0.7227343516424298, |
| "reward_std": 0.7870303755626082, |
| "rewards/correctness_reward_func": 0.53125, |
| "rewards/int_reward_func": 0.125, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.06648437259718776, |
| "step": 60 |
| }, |
| { |
| "completion_length": 125.5625, |
| "epoch": 0.06530175297738525, |
| "grad_norm": 1.2423548698425293, |
| "kl": 0.007948026963276789, |
| "learning_rate": 3.24468085106383e-06, |
| "loss": 0.0003, |
| "reward": 1.3104218831285834, |
| "reward_std": 0.6878003547899425, |
| "rewards/correctness_reward_func": 0.9375, |
| "rewards/int_reward_func": 0.25, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.12292187649291009, |
| "step": 61 |
| }, |
| { |
| "completion_length": 152.296875, |
| "epoch": 0.06637227351799813, |
| "grad_norm": 5.324446678161621, |
| "kl": 0.043227474874584004, |
| "learning_rate": 3.297872340425532e-06, |
| "loss": 0.0017, |
| "reward": 1.0643124831840396, |
| "reward_std": 0.9667724259197712, |
| "rewards/correctness_reward_func": 0.78125, |
| "rewards/int_reward_func": 0.2109375, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.07212500204332173, |
| "step": 62 |
| }, |
| { |
| "completion_length": 157.328125, |
| "epoch": 0.067442794058611, |
| "grad_norm": 1.2027546167373657, |
| "kl": 0.004988896660506725, |
| "learning_rate": 3.3510638297872345e-06, |
| "loss": 0.0002, |
| "reward": 1.1987031551543623, |
| "reward_std": 0.8610715009272099, |
| "rewards/correctness_reward_func": 0.90625, |
| "rewards/int_reward_func": 0.2578125, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.034640624886378646, |
| "step": 63 |
| }, |
| { |
| "completion_length": 163.9375, |
| "epoch": 0.06851331459922387, |
| "grad_norm": 1.126978874206543, |
| "kl": 0.004329273069743067, |
| "learning_rate": 3.4042553191489363e-06, |
| "loss": 0.0002, |
| "reward": 0.8524375010747463, |
| "reward_std": 0.742443086579442, |
| "rewards/correctness_reward_func": 0.65625, |
| "rewards/int_reward_func": 0.171875, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.024312500143423676, |
| "step": 64 |
| }, |
| { |
| "completion_length": 135.765625, |
| "epoch": 0.06958383513983675, |
| "grad_norm": 4.760307788848877, |
| "kl": 0.0423761896090582, |
| "learning_rate": 3.457446808510639e-06, |
| "loss": 0.0017, |
| "reward": 1.4641093388199806, |
| "reward_std": 1.0102775804698467, |
| "rewards/correctness_reward_func": 1.0625, |
| "rewards/int_reward_func": 0.296875, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.1047343765385449, |
| "step": 65 |
| }, |
| { |
| "completion_length": 155.796875, |
| "epoch": 0.07065435568044962, |
| "grad_norm": 3.1110665798187256, |
| "kl": 0.061538238427601755, |
| "learning_rate": 3.510638297872341e-06, |
| "loss": 0.0025, |
| "reward": 1.033312514424324, |
| "reward_std": 0.8865859052166343, |
| "rewards/correctness_reward_func": 0.78125, |
| "rewards/int_reward_func": 0.203125, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.048937500920146704, |
| "step": 66 |
| }, |
| { |
| "completion_length": 164.578125, |
| "epoch": 0.0717248762210625, |
| "grad_norm": 1.2862604856491089, |
| "kl": 0.0052650388242909685, |
| "learning_rate": 3.5638297872340426e-06, |
| "loss": 0.0002, |
| "reward": 0.7733906293287873, |
| "reward_std": 0.8027388863265514, |
| "rewards/correctness_reward_func": 0.59375, |
| "rewards/int_reward_func": 0.1640625, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.015578125370666385, |
| "step": 67 |
| }, |
| { |
| "completion_length": 163.75, |
| "epoch": 0.07279539676167536, |
| "grad_norm": 6.033888816833496, |
| "kl": 0.07815390304313041, |
| "learning_rate": 3.6170212765957453e-06, |
| "loss": 0.0031, |
| "reward": 1.1917500102426857, |
| "reward_std": 0.7897173650562763, |
| "rewards/correctness_reward_func": 0.90625, |
| "rewards/int_reward_func": 0.265625, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.019875000230968, |
| "step": 68 |
| }, |
| { |
| "completion_length": 149.40625, |
| "epoch": 0.07386591730228824, |
| "grad_norm": 1.29887056350708, |
| "kl": 0.01987961767008528, |
| "learning_rate": 3.670212765957447e-06, |
| "loss": 0.0008, |
| "reward": 1.1079531812574714, |
| "reward_std": 0.7181853111833334, |
| "rewards/correctness_reward_func": 0.78125, |
| "rewards/int_reward_func": 0.265625, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.06107812421396375, |
| "step": 69 |
| }, |
| { |
| "completion_length": 141.859375, |
| "epoch": 0.0749364378429011, |
| "grad_norm": 5.329657077789307, |
| "kl": 0.0800003606127575, |
| "learning_rate": 3.723404255319149e-06, |
| "loss": 0.0032, |
| "reward": 0.9977656248956919, |
| "reward_std": 0.5876570995897055, |
| "rewards/correctness_reward_func": 0.6875, |
| "rewards/int_reward_func": 0.21875, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.09151562419719994, |
| "step": 70 |
| }, |
| { |
| "completion_length": 138.5625, |
| "epoch": 0.07600695838351398, |
| "grad_norm": 2.7627108097076416, |
| "kl": 0.03701730686589144, |
| "learning_rate": 3.7765957446808516e-06, |
| "loss": 0.0015, |
| "reward": 1.2351874904707074, |
| "reward_std": 0.8267962019890547, |
| "rewards/correctness_reward_func": 0.90625, |
| "rewards/int_reward_func": 0.25, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.07893750071525574, |
| "step": 71 |
| }, |
| { |
| "completion_length": 133.984375, |
| "epoch": 0.07707747892412686, |
| "grad_norm": 1.414104700088501, |
| "kl": 0.016362678608857095, |
| "learning_rate": 3.8297872340425535e-06, |
| "loss": 0.0007, |
| "reward": 1.827203094959259, |
| "reward_std": 1.0086707267910242, |
| "rewards/correctness_reward_func": 1.375, |
| "rewards/int_reward_func": 0.359375, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.0928281235974282, |
| "step": 72 |
| }, |
| { |
| "completion_length": 148.0625, |
| "epoch": 0.07814799946473973, |
| "grad_norm": 1.3797897100448608, |
| "kl": 0.010796019807457924, |
| "learning_rate": 3.882978723404256e-06, |
| "loss": 0.0004, |
| "reward": 1.1499999817460775, |
| "reward_std": 0.9028994599357247, |
| "rewards/correctness_reward_func": 0.84375, |
| "rewards/int_reward_func": 0.2109375, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.09531250316649675, |
| "step": 73 |
| }, |
| { |
| "completion_length": 155.828125, |
| "epoch": 0.07921852000535261, |
| "grad_norm": 1.2884279489517212, |
| "kl": 0.017199350346345454, |
| "learning_rate": 3.936170212765958e-06, |
| "loss": 0.0007, |
| "reward": 1.0700937574729323, |
| "reward_std": 1.0012164115905762, |
| "rewards/correctness_reward_func": 0.8125, |
| "rewards/int_reward_func": 0.234375, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.023218751302920282, |
| "step": 74 |
| }, |
| { |
| "completion_length": 146.96875, |
| "epoch": 0.08028904054596547, |
| "grad_norm": 3.5342090129852295, |
| "kl": 0.05702902490156703, |
| "learning_rate": 3.98936170212766e-06, |
| "loss": 0.0023, |
| "reward": 1.2074843887239695, |
| "reward_std": 0.8127174219116569, |
| "rewards/correctness_reward_func": 0.875, |
| "rewards/int_reward_func": 0.2578125, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.07467187196016312, |
| "step": 75 |
| }, |
| { |
| "completion_length": 135.140625, |
| "epoch": 0.08135956108657835, |
| "grad_norm": 1.7845228910446167, |
| "kl": 0.01959521723620128, |
| "learning_rate": 4.042553191489362e-06, |
| "loss": 0.0008, |
| "reward": 1.4087968692183495, |
| "reward_std": 0.9582029562443495, |
| "rewards/correctness_reward_func": 1.03125, |
| "rewards/int_reward_func": 0.2734375, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.10410937643609941, |
| "step": 76 |
| }, |
| { |
| "completion_length": 126.125, |
| "epoch": 0.08243008162719122, |
| "grad_norm": 1.3039053678512573, |
| "kl": 0.008768495463300496, |
| "learning_rate": 4.095744680851064e-06, |
| "loss": 0.0004, |
| "reward": 1.3187343887984753, |
| "reward_std": 0.5690027270466089, |
| "rewards/correctness_reward_func": 0.9375, |
| "rewards/int_reward_func": 0.2578125, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.12342187319882214, |
| "step": 77 |
| }, |
| { |
| "completion_length": 148.6875, |
| "epoch": 0.0835006021678041, |
| "grad_norm": 3.3175690174102783, |
| "kl": 0.014638990571256727, |
| "learning_rate": 4.148936170212766e-06, |
| "loss": 0.0006, |
| "reward": 1.4877656111493707, |
| "reward_std": 0.8957763649523258, |
| "rewards/correctness_reward_func": 1.09375, |
| "rewards/int_reward_func": 0.2890625, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.10495312558487058, |
| "step": 78 |
| }, |
| { |
| "completion_length": 171.0625, |
| "epoch": 0.08457112270841696, |
| "grad_norm": 5.574616432189941, |
| "kl": 0.01483242801623419, |
| "learning_rate": 4.202127659574468e-06, |
| "loss": 0.0006, |
| "reward": 1.0053124725818634, |
| "reward_std": 1.1163944154977798, |
| "rewards/correctness_reward_func": 0.75, |
| "rewards/int_reward_func": 0.2265625, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.02874999982304871, |
| "step": 79 |
| }, |
| { |
| "completion_length": 150.5625, |
| "epoch": 0.08564164324902984, |
| "grad_norm": 3.4894111156463623, |
| "kl": 0.020942480681696907, |
| "learning_rate": 4.255319148936171e-06, |
| "loss": 0.0008, |
| "reward": 1.081656239926815, |
| "reward_std": 0.8474766900762916, |
| "rewards/correctness_reward_func": 0.75, |
| "rewards/int_reward_func": 0.234375, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.09728124877437949, |
| "step": 80 |
| }, |
| { |
| "completion_length": 145.8125, |
| "epoch": 0.08671216378964271, |
| "grad_norm": 4.479030609130859, |
| "kl": 0.02067101007560268, |
| "learning_rate": 4.308510638297873e-06, |
| "loss": 0.0008, |
| "reward": 0.9469531225040555, |
| "reward_std": 0.5945024443790317, |
| "rewards/correctness_reward_func": 0.625, |
| "rewards/int_reward_func": 0.2421875, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.07976562529802322, |
| "step": 81 |
| }, |
| { |
| "completion_length": 142.609375, |
| "epoch": 0.08778268433025559, |
| "grad_norm": 9.429178237915039, |
| "kl": 0.032599265803582966, |
| "learning_rate": 4.361702127659575e-06, |
| "loss": 0.0013, |
| "reward": 0.7288906406611204, |
| "reward_std": 0.8307479582726955, |
| "rewards/correctness_reward_func": 0.46875, |
| "rewards/int_reward_func": 0.171875, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.08826562575995922, |
| "step": 82 |
| }, |
| { |
| "completion_length": 143.84375, |
| "epoch": 0.08885320487086847, |
| "grad_norm": 1.5900259017944336, |
| "kl": 0.013988179998705164, |
| "learning_rate": 4.414893617021277e-06, |
| "loss": 0.0006, |
| "reward": 1.591531228274107, |
| "reward_std": 0.8249969203025103, |
| "rewards/correctness_reward_func": 1.25, |
| "rewards/int_reward_func": 0.3203125, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.021218748996034265, |
| "step": 83 |
| }, |
| { |
| "completion_length": 135.8125, |
| "epoch": 0.08992372541148133, |
| "grad_norm": 4.648512363433838, |
| "kl": 0.06424052006332204, |
| "learning_rate": 4.468085106382979e-06, |
| "loss": 0.0026, |
| "reward": 1.4651249905582517, |
| "reward_std": 0.8705566665157676, |
| "rewards/correctness_reward_func": 1.0625, |
| "rewards/int_reward_func": 0.2890625, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.11356249963864684, |
| "step": 84 |
| }, |
| { |
| "completion_length": 123.90625, |
| "epoch": 0.09099424595209421, |
| "grad_norm": 6.136663913726807, |
| "kl": 0.02576264040544629, |
| "learning_rate": 4.521276595744681e-06, |
| "loss": 0.001, |
| "reward": 1.7882343754172325, |
| "reward_std": 0.7715246099978685, |
| "rewards/correctness_reward_func": 1.3125, |
| "rewards/int_reward_func": 0.3671875, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.10854687169194221, |
| "step": 85 |
| }, |
| { |
| "completion_length": 132.65625, |
| "epoch": 0.09206476649270708, |
| "grad_norm": 2.7052152156829834, |
| "kl": 0.021323165216017514, |
| "learning_rate": 4.574468085106383e-06, |
| "loss": 0.0009, |
| "reward": 1.2060781214386225, |
| "reward_std": 0.74014887586236, |
| "rewards/correctness_reward_func": 0.84375, |
| "rewards/int_reward_func": 0.25, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.11232812539674342, |
| "step": 86 |
| }, |
| { |
| "completion_length": 142.3125, |
| "epoch": 0.09313528703331996, |
| "grad_norm": 6.834721088409424, |
| "kl": 0.15714708802988753, |
| "learning_rate": 4.6276595744680855e-06, |
| "loss": 0.0063, |
| "reward": 1.4907656013965607, |
| "reward_std": 1.1851906776428223, |
| "rewards/correctness_reward_func": 1.09375, |
| "rewards/int_reward_func": 0.3203125, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.07670312328264117, |
| "step": 87 |
| }, |
| { |
| "completion_length": 141.625, |
| "epoch": 0.09420580757393282, |
| "grad_norm": 1.5215015411376953, |
| "kl": 0.012561204843223095, |
| "learning_rate": 4.680851063829788e-06, |
| "loss": 0.0005, |
| "reward": 0.973562479019165, |
| "reward_std": 0.8407629178836942, |
| "rewards/correctness_reward_func": 0.6875, |
| "rewards/int_reward_func": 0.2109375, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.07512499787844718, |
| "step": 88 |
| }, |
| { |
| "completion_length": 131.109375, |
| "epoch": 0.0952763281145457, |
| "grad_norm": 4.186502456665039, |
| "kl": 0.029811605345457792, |
| "learning_rate": 4.73404255319149e-06, |
| "loss": 0.0012, |
| "reward": 1.571968775242567, |
| "reward_std": 0.8088337788358331, |
| "rewards/correctness_reward_func": 1.125, |
| "rewards/int_reward_func": 0.3125, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.13446875009685755, |
| "step": 89 |
| }, |
| { |
| "completion_length": 137.5625, |
| "epoch": 0.09634684865515857, |
| "grad_norm": 3.539090633392334, |
| "kl": 0.025141435849945992, |
| "learning_rate": 4.787234042553192e-06, |
| "loss": 0.001, |
| "reward": 1.1439843773841858, |
| "reward_std": 1.1783022359013557, |
| "rewards/correctness_reward_func": 0.8125, |
| "rewards/int_reward_func": 0.2421875, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.08929687412455678, |
| "step": 90 |
| }, |
| { |
| "completion_length": 142.0, |
| "epoch": 0.09741736919577144, |
| "grad_norm": 3.9599854946136475, |
| "kl": 0.03133453679038212, |
| "learning_rate": 4.840425531914894e-06, |
| "loss": 0.0013, |
| "reward": 0.9123750082217157, |
| "reward_std": 0.8763793092221022, |
| "rewards/correctness_reward_func": 0.65625, |
| "rewards/int_reward_func": 0.21875, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.037375001003965735, |
| "step": 91 |
| }, |
| { |
| "completion_length": 127.0625, |
| "epoch": 0.09848788973638432, |
| "grad_norm": 5.762697219848633, |
| "kl": 0.02441024547442794, |
| "learning_rate": 4.893617021276596e-06, |
| "loss": 0.001, |
| "reward": 1.4343437626957893, |
| "reward_std": 1.0151535924524069, |
| "rewards/correctness_reward_func": 1.0, |
| "rewards/int_reward_func": 0.328125, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.10621875151991844, |
| "step": 92 |
| }, |
| { |
| "completion_length": 128.421875, |
| "epoch": 0.09955841027699719, |
| "grad_norm": 1.7006865739822388, |
| "kl": 0.014295783417765051, |
| "learning_rate": 4.946808510638298e-06, |
| "loss": 0.0006, |
| "reward": 1.4152031522244215, |
| "reward_std": 1.0377220567315817, |
| "rewards/correctness_reward_func": 1.03125, |
| "rewards/int_reward_func": 0.265625, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.11832812381908298, |
| "step": 93 |
| }, |
| { |
| "completion_length": 126.703125, |
| "epoch": 0.10062893081761007, |
| "grad_norm": 2.6587624549865723, |
| "kl": 0.022590334410779178, |
| "learning_rate": 5e-06, |
| "loss": 0.0009, |
| "reward": 1.0790781378746033, |
| "reward_std": 0.8811032259836793, |
| "rewards/correctness_reward_func": 0.75, |
| "rewards/int_reward_func": 0.203125, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.12595312716439366, |
| "step": 94 |
| }, |
| { |
| "completion_length": 134.25, |
| "epoch": 0.10169945135822293, |
| "grad_norm": 5.803914546966553, |
| "kl": 0.0693736044340767, |
| "learning_rate": 4.999982515602153e-06, |
| "loss": 0.0028, |
| "reward": 1.6232343390583992, |
| "reward_std": 1.171187661588192, |
| "rewards/correctness_reward_func": 1.1875, |
| "rewards/int_reward_func": 0.34375, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.09198437177110463, |
| "step": 95 |
| }, |
| { |
| "completion_length": 119.09375, |
| "epoch": 0.10276997189883581, |
| "grad_norm": 2.6233108043670654, |
| "kl": 0.025776030379347503, |
| "learning_rate": 4.999930062653175e-06, |
| "loss": 0.001, |
| "reward": 1.4469374530017376, |
| "reward_std": 0.717207751236856, |
| "rewards/correctness_reward_func": 1.03125, |
| "rewards/int_reward_func": 0.2890625, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.12662499537691474, |
| "step": 96 |
| }, |
| { |
| "completion_length": 135.625, |
| "epoch": 0.10384049243944868, |
| "grad_norm": 1.4691489934921265, |
| "kl": 0.018308754893951118, |
| "learning_rate": 4.999842641886752e-06, |
| "loss": 0.0007, |
| "reward": 1.7408281043171883, |
| "reward_std": 0.6604799125343561, |
| "rewards/correctness_reward_func": 1.3125, |
| "rewards/int_reward_func": 0.328125, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.10020312923006713, |
| "step": 97 |
| }, |
| { |
| "completion_length": 142.53125, |
| "epoch": 0.10491101298006156, |
| "grad_norm": 3.7322754859924316, |
| "kl": 0.031659536180086434, |
| "learning_rate": 4.999720254525684e-06, |
| "loss": 0.0013, |
| "reward": 1.3402031436562538, |
| "reward_std": 1.0035525700077415, |
| "rewards/correctness_reward_func": 0.96875, |
| "rewards/int_reward_func": 0.3203125, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.051140623865649104, |
| "step": 98 |
| }, |
| { |
| "completion_length": 136.859375, |
| "epoch": 0.10598153352067442, |
| "grad_norm": 1.445330023765564, |
| "kl": 0.016985120251774788, |
| "learning_rate": 4.999562902281866e-06, |
| "loss": 0.0007, |
| "reward": 1.4283437356352806, |
| "reward_std": 0.9263063753023744, |
| "rewards/correctness_reward_func": 1.0, |
| "rewards/int_reward_func": 0.296875, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.13146874867379665, |
| "step": 99 |
| }, |
| { |
| "completion_length": 145.234375, |
| "epoch": 0.1070520540612873, |
| "grad_norm": 4.298182010650635, |
| "kl": 0.03539641568204388, |
| "learning_rate": 4.999370587356267e-06, |
| "loss": 0.0014, |
| "reward": 1.5638906005769968, |
| "reward_std": 0.9018815001472831, |
| "rewards/correctness_reward_func": 1.15625, |
| "rewards/int_reward_func": 0.328125, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.07951562479138374, |
| "step": 100 |
| }, |
| { |
| "completion_length": 126.46875, |
| "epoch": 0.10812257460190017, |
| "grad_norm": 1.7051573991775513, |
| "kl": 0.0210904503474012, |
| "learning_rate": 4.999143312438893e-06, |
| "loss": 0.0008, |
| "reward": 2.032781273126602, |
| "reward_std": 0.882479477673769, |
| "rewards/correctness_reward_func": 1.5, |
| "rewards/int_reward_func": 0.4296875, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.10309374984353781, |
| "step": 101 |
| }, |
| { |
| "completion_length": 138.4375, |
| "epoch": 0.10919309514251305, |
| "grad_norm": 5.384690284729004, |
| "kl": 0.05343873624224216, |
| "learning_rate": 4.998881080708759e-06, |
| "loss": 0.0021, |
| "reward": 1.7520312666893005, |
| "reward_std": 1.0768068991601467, |
| "rewards/correctness_reward_func": 1.34375, |
| "rewards/int_reward_func": 0.3203125, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.08796874759718776, |
| "step": 102 |
| }, |
| { |
| "completion_length": 121.265625, |
| "epoch": 0.11026361568312593, |
| "grad_norm": 3.4453506469726562, |
| "kl": 0.054925739066675305, |
| "learning_rate": 4.998583895833834e-06, |
| "loss": 0.0022, |
| "reward": 1.6850781589746475, |
| "reward_std": 0.6663676341995597, |
| "rewards/correctness_reward_func": 1.125, |
| "rewards/int_reward_func": 0.4296875, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.1303906268440187, |
| "step": 103 |
| }, |
| { |
| "completion_length": 114.71875, |
| "epoch": 0.11133413622373879, |
| "grad_norm": 1.5185428857803345, |
| "kl": 0.022636244888417423, |
| "learning_rate": 4.998251761970997e-06, |
| "loss": 0.0009, |
| "reward": 1.7328437007963657, |
| "reward_std": 0.7276732774917036, |
| "rewards/correctness_reward_func": 1.25, |
| "rewards/int_reward_func": 0.34375, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.13909374829381704, |
| "step": 104 |
| }, |
| { |
| "completion_length": 114.359375, |
| "epoch": 0.11240465676435167, |
| "grad_norm": 4.903259754180908, |
| "kl": 0.0455300398170948, |
| "learning_rate": 4.997884683765977e-06, |
| "loss": 0.0018, |
| "reward": 2.070781234651804, |
| "reward_std": 0.5790006909519434, |
| "rewards/correctness_reward_func": 1.5, |
| "rewards/int_reward_func": 0.4296875, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.14109374955296516, |
| "step": 105 |
| }, |
| { |
| "completion_length": 117.578125, |
| "epoch": 0.11347517730496454, |
| "grad_norm": 7.719193458557129, |
| "kl": 0.07494375784881413, |
| "learning_rate": 4.997482666353287e-06, |
| "loss": 0.003, |
| "reward": 1.8749061971902847, |
| "reward_std": 0.762595918495208, |
| "rewards/correctness_reward_func": 1.3125, |
| "rewards/int_reward_func": 0.390625, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.1717812498100102, |
| "step": 106 |
| }, |
| { |
| "completion_length": 132.71875, |
| "epoch": 0.11454569784557742, |
| "grad_norm": 1.9178704023361206, |
| "kl": 0.026816099416464567, |
| "learning_rate": 4.997045715356153e-06, |
| "loss": 0.0011, |
| "reward": 1.4049999862909317, |
| "reward_std": 1.0492134541273117, |
| "rewards/correctness_reward_func": 0.9375, |
| "rewards/int_reward_func": 0.328125, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.13937499932944775, |
| "step": 107 |
| }, |
| { |
| "completion_length": 121.21875, |
| "epoch": 0.11561621838619028, |
| "grad_norm": 3.1541106700897217, |
| "kl": 0.04801671905443072, |
| "learning_rate": 4.9965738368864345e-06, |
| "loss": 0.0019, |
| "reward": 1.7343593537807465, |
| "reward_std": 0.9352071397006512, |
| "rewards/correctness_reward_func": 1.21875, |
| "rewards/int_reward_func": 0.359375, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.1562343705445528, |
| "step": 108 |
| }, |
| { |
| "completion_length": 134.40625, |
| "epoch": 0.11668673892680316, |
| "grad_norm": 3.4600815773010254, |
| "kl": 0.07537143386434764, |
| "learning_rate": 4.996067037544542e-06, |
| "loss": 0.003, |
| "reward": 1.6357812024652958, |
| "reward_std": 0.7928643207997084, |
| "rewards/correctness_reward_func": 1.15625, |
| "rewards/int_reward_func": 0.3359375, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.14359374903142452, |
| "step": 109 |
| }, |
| { |
| "completion_length": 121.25, |
| "epoch": 0.11775725946741603, |
| "grad_norm": 1.8081263303756714, |
| "kl": 0.03252584161236882, |
| "learning_rate": 4.995525324419338e-06, |
| "loss": 0.0013, |
| "reward": 1.6038124561309814, |
| "reward_std": 0.9315616749227047, |
| "rewards/correctness_reward_func": 1.09375, |
| "rewards/int_reward_func": 0.3671875, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.14287500269711018, |
| "step": 110 |
| }, |
| { |
| "completion_length": 113.734375, |
| "epoch": 0.1188277800080289, |
| "grad_norm": 3.427846670150757, |
| "kl": 0.029214507434517145, |
| "learning_rate": 4.994948705088047e-06, |
| "loss": 0.0012, |
| "reward": 1.7096093818545341, |
| "reward_std": 0.9403769830241799, |
| "rewards/correctness_reward_func": 1.1875, |
| "rewards/int_reward_func": 0.375, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.14710937440395355, |
| "step": 111 |
| }, |
| { |
| "completion_length": 138.140625, |
| "epoch": 0.11989830054864177, |
| "grad_norm": 5.425636291503906, |
| "kl": 0.11221261869650334, |
| "learning_rate": 4.99433718761614e-06, |
| "loss": 0.0045, |
| "reward": 1.4035156145691872, |
| "reward_std": 1.1017278581857681, |
| "rewards/correctness_reward_func": 0.90625, |
| "rewards/int_reward_func": 0.3515625, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.14570312481373549, |
| "step": 112 |
| }, |
| { |
| "completion_length": 128.828125, |
| "epoch": 0.12096882108925465, |
| "grad_norm": 9.655882835388184, |
| "kl": 0.14679548889398575, |
| "learning_rate": 4.993690780557232e-06, |
| "loss": 0.0059, |
| "reward": 1.5967968963086605, |
| "reward_std": 0.9262449182569981, |
| "rewards/correctness_reward_func": 1.09375, |
| "rewards/int_reward_func": 0.3359375, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.16710936930030584, |
| "step": 113 |
| }, |
| { |
| "completion_length": 114.3125, |
| "epoch": 0.12203934162986753, |
| "grad_norm": 2.9527218341827393, |
| "kl": 0.03957346314564347, |
| "learning_rate": 4.993009492952951e-06, |
| "loss": 0.0016, |
| "reward": 1.7140468880534172, |
| "reward_std": 0.9286471158266068, |
| "rewards/correctness_reward_func": 1.125, |
| "rewards/int_reward_func": 0.421875, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.16717187454923987, |
| "step": 114 |
| }, |
| { |
| "completion_length": 122.515625, |
| "epoch": 0.1231098621704804, |
| "grad_norm": 4.749152660369873, |
| "kl": 0.05118492292240262, |
| "learning_rate": 4.992293334332821e-06, |
| "loss": 0.002, |
| "reward": 1.9886562526226044, |
| "reward_std": 0.9419979229569435, |
| "rewards/correctness_reward_func": 1.4375, |
| "rewards/int_reward_func": 0.40625, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.14490625634789467, |
| "step": 115 |
| }, |
| { |
| "completion_length": 124.171875, |
| "epoch": 0.12418038271109327, |
| "grad_norm": 4.01917028427124, |
| "kl": 0.10346966434735805, |
| "learning_rate": 4.991542314714122e-06, |
| "loss": 0.0041, |
| "reward": 1.837890625, |
| "reward_std": 0.912613769993186, |
| "rewards/correctness_reward_func": 1.28125, |
| "rewards/int_reward_func": 0.40625, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.15039062406867743, |
| "step": 116 |
| }, |
| { |
| "completion_length": 124.375, |
| "epoch": 0.12525090325170615, |
| "grad_norm": 3.7853267192840576, |
| "kl": 0.07023024489171803, |
| "learning_rate": 4.990756444601757e-06, |
| "loss": 0.0028, |
| "reward": 1.6455781627446413, |
| "reward_std": 0.7009308515116572, |
| "rewards/correctness_reward_func": 1.125, |
| "rewards/int_reward_func": 0.3671875, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.15339062316343188, |
| "step": 117 |
| }, |
| { |
| "completion_length": 136.359375, |
| "epoch": 0.126321423792319, |
| "grad_norm": 2.0698678493499756, |
| "kl": 0.03698924113996327, |
| "learning_rate": 4.989935734988098e-06, |
| "loss": 0.0015, |
| "reward": 1.8074062652885914, |
| "reward_std": 0.8670060317963362, |
| "rewards/correctness_reward_func": 1.28125, |
| "rewards/int_reward_func": 0.3671875, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.1589687503874302, |
| "step": 118 |
| }, |
| { |
| "completion_length": 148.15625, |
| "epoch": 0.12739194433293188, |
| "grad_norm": 7.199705123901367, |
| "kl": 0.180443427991122, |
| "learning_rate": 4.989080197352834e-06, |
| "loss": 0.0072, |
| "reward": 1.179109364748001, |
| "reward_std": 0.7882043793797493, |
| "rewards/correctness_reward_func": 0.71875, |
| "rewards/int_reward_func": 0.328125, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.13223437825217843, |
| "step": 119 |
| }, |
| { |
| "completion_length": 122.46875, |
| "epoch": 0.12846246487354476, |
| "grad_norm": 1.6841520071029663, |
| "kl": 0.0505296983756125, |
| "learning_rate": 4.9881898436628165e-06, |
| "loss": 0.002, |
| "reward": 1.9381406530737877, |
| "reward_std": 0.8331695850938559, |
| "rewards/correctness_reward_func": 1.375, |
| "rewards/int_reward_func": 0.390625, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.17251562420278788, |
| "step": 120 |
| }, |
| { |
| "completion_length": 108.1875, |
| "epoch": 0.12953298541415764, |
| "grad_norm": 1.8260003328323364, |
| "kl": 0.059317339677363634, |
| "learning_rate": 4.987264686371881e-06, |
| "loss": 0.0024, |
| "reward": 2.02584370970726, |
| "reward_std": 1.0399171710014343, |
| "rewards/correctness_reward_func": 1.40625, |
| "rewards/int_reward_func": 0.421875, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.19771874882280827, |
| "step": 121 |
| }, |
| { |
| "completion_length": 95.46875, |
| "epoch": 0.1306035059547705, |
| "grad_norm": 2.1317758560180664, |
| "kl": 0.053682942409068346, |
| "learning_rate": 4.986304738420684e-06, |
| "loss": 0.0021, |
| "reward": 2.1616249792277813, |
| "reward_std": 0.6366997184231877, |
| "rewards/correctness_reward_func": 1.5, |
| "rewards/int_reward_func": 0.4375, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.22412499878555536, |
| "step": 122 |
| }, |
| { |
| "completion_length": 118.453125, |
| "epoch": 0.13167402649538337, |
| "grad_norm": 1.487423062324524, |
| "kl": 0.04084368539042771, |
| "learning_rate": 4.985310013236514e-06, |
| "loss": 0.0016, |
| "reward": 2.4017499536275864, |
| "reward_std": 0.6733962241560221, |
| "rewards/correctness_reward_func": 1.71875, |
| "rewards/int_reward_func": 0.4765625, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.2064374964684248, |
| "step": 123 |
| }, |
| { |
| "completion_length": 136.75, |
| "epoch": 0.13274454703599625, |
| "grad_norm": 6.275110244750977, |
| "kl": 0.10213979217223823, |
| "learning_rate": 4.984280524733107e-06, |
| "loss": 0.0041, |
| "reward": 1.5195625126361847, |
| "reward_std": 0.8954427968710661, |
| "rewards/correctness_reward_func": 1.0625, |
| "rewards/int_reward_func": 0.3125, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.1445625051856041, |
| "step": 124 |
| }, |
| { |
| "completion_length": 107.84375, |
| "epoch": 0.13381506757660913, |
| "grad_norm": 4.696693420410156, |
| "kl": 0.1205500855576247, |
| "learning_rate": 4.983216287310453e-06, |
| "loss": 0.0048, |
| "reward": 1.828781247138977, |
| "reward_std": 0.92250463552773, |
| "rewards/correctness_reward_func": 1.1875, |
| "rewards/int_reward_func": 0.4296875, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.21159374713897705, |
| "step": 125 |
| }, |
| { |
| "completion_length": 90.984375, |
| "epoch": 0.134885588117222, |
| "grad_norm": 5.726349830627441, |
| "kl": 0.14590927632525563, |
| "learning_rate": 4.982117315854594e-06, |
| "loss": 0.0058, |
| "reward": 2.218953087925911, |
| "reward_std": 0.855751893715933, |
| "rewards/correctness_reward_func": 1.53125, |
| "rewards/int_reward_func": 0.4375, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0078125, |
| "rewards/xmlcount_reward_func": 0.24239062331616879, |
| "step": 126 |
| }, |
| { |
| "completion_length": 102.28125, |
| "epoch": 0.13595610865783486, |
| "grad_norm": 2.1168274879455566, |
| "kl": 0.07060433947481215, |
| "learning_rate": 4.980983625737411e-06, |
| "loss": 0.0028, |
| "reward": 2.1402343213558197, |
| "reward_std": 0.6787437000311911, |
| "rewards/correctness_reward_func": 1.46875, |
| "rewards/int_reward_func": 0.46875, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.20273437071591616, |
| "step": 127 |
| }, |
| { |
| "completion_length": 122.0625, |
| "epoch": 0.13702662919844774, |
| "grad_norm": 9.184843063354492, |
| "kl": 0.15047278022393584, |
| "learning_rate": 4.9798152328164165e-06, |
| "loss": 0.006, |
| "reward": 1.5844999551773071, |
| "reward_std": 1.025202952325344, |
| "rewards/correctness_reward_func": 1.03125, |
| "rewards/int_reward_func": 0.359375, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.19387499801814556, |
| "step": 128 |
| }, |
| { |
| "completion_length": 107.21875, |
| "epoch": 0.13809714973906062, |
| "grad_norm": 1.8967880010604858, |
| "kl": 0.04243561811745167, |
| "learning_rate": 4.978612153434527e-06, |
| "loss": 0.0017, |
| "reward": 2.1196718886494637, |
| "reward_std": 0.5547986216843128, |
| "rewards/correctness_reward_func": 1.46875, |
| "rewards/int_reward_func": 0.4375, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.2134218756109476, |
| "step": 129 |
| }, |
| { |
| "completion_length": 108.03125, |
| "epoch": 0.1391676702796735, |
| "grad_norm": 2.011303186416626, |
| "kl": 0.04303696344140917, |
| "learning_rate": 4.977374404419838e-06, |
| "loss": 0.0017, |
| "reward": 2.1730000376701355, |
| "reward_std": 0.9301177933812141, |
| "rewards/correctness_reward_func": 1.53125, |
| "rewards/int_reward_func": 0.4375, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.2042500004172325, |
| "step": 130 |
| }, |
| { |
| "completion_length": 110.453125, |
| "epoch": 0.14023819082028635, |
| "grad_norm": 7.903467178344727, |
| "kl": 0.22786898026242852, |
| "learning_rate": 4.9761020030853854e-06, |
| "loss": 0.0091, |
| "reward": 2.0016875714063644, |
| "reward_std": 0.8600476859137416, |
| "rewards/correctness_reward_func": 1.40625, |
| "rewards/int_reward_func": 0.3984375, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.19699999503791332, |
| "step": 131 |
| }, |
| { |
| "completion_length": 132.625, |
| "epoch": 0.14130871136089923, |
| "grad_norm": 5.2613444328308105, |
| "kl": 0.23378165811300278, |
| "learning_rate": 4.9747949672289075e-06, |
| "loss": 0.0094, |
| "reward": 1.4954218715429306, |
| "reward_std": 0.9026085883378983, |
| "rewards/correctness_reward_func": 1.03125, |
| "rewards/int_reward_func": 0.3203125, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.14385937619954348, |
| "step": 132 |
| }, |
| { |
| "completion_length": 101.125, |
| "epoch": 0.1423792319015121, |
| "grad_norm": 4.111429214477539, |
| "kl": 0.18476470839232206, |
| "learning_rate": 4.973453315132592e-06, |
| "loss": 0.0074, |
| "reward": 2.419406235218048, |
| "reward_std": 0.6811097683385015, |
| "rewards/correctness_reward_func": 1.75, |
| "rewards/int_reward_func": 0.4765625, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.19284374825656414, |
| "step": 133 |
| }, |
| { |
| "completion_length": 110.0625, |
| "epoch": 0.143449752442125, |
| "grad_norm": 4.763584613800049, |
| "kl": 0.1902949649374932, |
| "learning_rate": 4.9720770655628216e-06, |
| "loss": 0.0076, |
| "reward": 2.049671910703182, |
| "reward_std": 0.8413272872567177, |
| "rewards/correctness_reward_func": 1.40625, |
| "rewards/int_reward_func": 0.453125, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.19029687531292439, |
| "step": 134 |
| }, |
| { |
| "completion_length": 95.578125, |
| "epoch": 0.14452027298273787, |
| "grad_norm": 4.345425128936768, |
| "kl": 0.18916460033506155, |
| "learning_rate": 4.970666237769913e-06, |
| "loss": 0.0076, |
| "reward": 2.191656231880188, |
| "reward_std": 0.7033369969576597, |
| "rewards/correctness_reward_func": 1.53125, |
| "rewards/int_reward_func": 0.453125, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.20728124678134918, |
| "step": 135 |
| }, |
| { |
| "completion_length": 84.078125, |
| "epoch": 0.14559079352335072, |
| "grad_norm": 2.940378427505493, |
| "kl": 0.06369929504580796, |
| "learning_rate": 4.9692208514878445e-06, |
| "loss": 0.0025, |
| "reward": 2.3128594160079956, |
| "reward_std": 0.6257542409002781, |
| "rewards/correctness_reward_func": 1.625, |
| "rewards/int_reward_func": 0.46875, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.21910937316715717, |
| "step": 136 |
| }, |
| { |
| "completion_length": 120.328125, |
| "epoch": 0.1466613140639636, |
| "grad_norm": 7.51654052734375, |
| "kl": 0.3002074658870697, |
| "learning_rate": 4.967740926933985e-06, |
| "loss": 0.012, |
| "reward": 1.7032031267881393, |
| "reward_std": 1.0220621526241302, |
| "rewards/correctness_reward_func": 1.15625, |
| "rewards/int_reward_func": 0.3828125, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.16414062399417162, |
| "step": 137 |
| }, |
| { |
| "completion_length": 114.046875, |
| "epoch": 0.14773183460457648, |
| "grad_norm": 2.956787347793579, |
| "kl": 0.10229182336479425, |
| "learning_rate": 4.966226484808804e-06, |
| "loss": 0.0041, |
| "reward": 1.6846249997615814, |
| "reward_std": 0.8771754652261734, |
| "rewards/correctness_reward_func": 1.09375, |
| "rewards/int_reward_func": 0.3984375, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.19243750255554914, |
| "step": 138 |
| }, |
| { |
| "completion_length": 102.34375, |
| "epoch": 0.14880235514518936, |
| "grad_norm": 4.8385114669799805, |
| "kl": 0.1648537963628769, |
| "learning_rate": 4.96467754629559e-06, |
| "loss": 0.0066, |
| "reward": 1.9405781105160713, |
| "reward_std": 0.6661778870038688, |
| "rewards/correctness_reward_func": 1.3125, |
| "rewards/int_reward_func": 0.40625, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.22182812727987766, |
| "step": 139 |
| }, |
| { |
| "completion_length": 99.921875, |
| "epoch": 0.1498728756858022, |
| "grad_norm": 2.6203041076660156, |
| "kl": 0.06277278368361294, |
| "learning_rate": 4.963094133060148e-06, |
| "loss": 0.0025, |
| "reward": 2.1328750401735306, |
| "reward_std": 0.6620223973877728, |
| "rewards/correctness_reward_func": 1.4375, |
| "rewards/int_reward_func": 0.4609375, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.23443749826401472, |
| "step": 140 |
| }, |
| { |
| "completion_length": 92.15625, |
| "epoch": 0.1509433962264151, |
| "grad_norm": 2.787095069885254, |
| "kl": 0.07504080841317773, |
| "learning_rate": 4.961476267250501e-06, |
| "loss": 0.003, |
| "reward": 2.2870156168937683, |
| "reward_std": 0.6934415455907583, |
| "rewards/correctness_reward_func": 1.59375, |
| "rewards/int_reward_func": 0.4609375, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.23232812341302633, |
| "step": 141 |
| }, |
| { |
| "completion_length": 111.0, |
| "epoch": 0.15201391676702797, |
| "grad_norm": 2.570060968399048, |
| "kl": 0.06025985535234213, |
| "learning_rate": 4.959823971496575e-06, |
| "loss": 0.0024, |
| "reward": 1.9751719227060676, |
| "reward_std": 0.7296689655631781, |
| "rewards/correctness_reward_func": 1.375, |
| "rewards/int_reward_func": 0.40625, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.1939218717161566, |
| "step": 142 |
| }, |
| { |
| "completion_length": 127.40625, |
| "epoch": 0.15308443730764085, |
| "grad_norm": 4.903911590576172, |
| "kl": 0.20708202896639705, |
| "learning_rate": 4.958137268909887e-06, |
| "loss": 0.0083, |
| "reward": 2.063531205058098, |
| "reward_std": 0.8510163221508265, |
| "rewards/correctness_reward_func": 1.5, |
| "rewards/int_reward_func": 0.421875, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.14165624976158142, |
| "step": 143 |
| }, |
| { |
| "completion_length": 98.5625, |
| "epoch": 0.15415495784825373, |
| "grad_norm": 4.191147804260254, |
| "kl": 0.2446250948123634, |
| "learning_rate": 4.9564161830832214e-06, |
| "loss": 0.0098, |
| "reward": 2.337281256914139, |
| "reward_std": 0.6763627836480737, |
| "rewards/correctness_reward_func": 1.65625, |
| "rewards/int_reward_func": 0.4609375, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.2200937452726066, |
| "step": 144 |
| }, |
| { |
| "completion_length": 88.984375, |
| "epoch": 0.15522547838886658, |
| "grad_norm": 168.51283264160156, |
| "kl": 0.34985177870839834, |
| "learning_rate": 4.954660738090297e-06, |
| "loss": 0.014, |
| "reward": 1.6290156617760658, |
| "reward_std": 0.7621745709329844, |
| "rewards/correctness_reward_func": 1.03125, |
| "rewards/int_reward_func": 0.375, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.222765626385808, |
| "step": 145 |
| }, |
| { |
| "completion_length": 116.078125, |
| "epoch": 0.15629599892947946, |
| "grad_norm": 4.605787754058838, |
| "kl": 0.20393797848373652, |
| "learning_rate": 4.9528709584854316e-06, |
| "loss": 0.0082, |
| "reward": 1.5738593488931656, |
| "reward_std": 1.115996390581131, |
| "rewards/correctness_reward_func": 1.0, |
| "rewards/int_reward_func": 0.3984375, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.17542187124490738, |
| "step": 146 |
| }, |
| { |
| "completion_length": 89.328125, |
| "epoch": 0.15736651947009234, |
| "grad_norm": 4.029605865478516, |
| "kl": 0.21852776128798723, |
| "learning_rate": 4.951046869303202e-06, |
| "loss": 0.0087, |
| "reward": 1.9777500182390213, |
| "reward_std": 0.9872381817549467, |
| "rewards/correctness_reward_func": 1.34375, |
| "rewards/int_reward_func": 0.40625, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.22774999774992466, |
| "step": 147 |
| }, |
| { |
| "completion_length": 106.75, |
| "epoch": 0.15843704001070522, |
| "grad_norm": 3.108353853225708, |
| "kl": 0.07371893431991339, |
| "learning_rate": 4.949188496058089e-06, |
| "loss": 0.0029, |
| "reward": 1.7372031211853027, |
| "reward_std": 0.9112571626901627, |
| "rewards/correctness_reward_func": 1.09375, |
| "rewards/int_reward_func": 0.421875, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.2215781332924962, |
| "step": 148 |
| }, |
| { |
| "completion_length": 78.40625, |
| "epoch": 0.15950756055131807, |
| "grad_norm": 5.034030914306641, |
| "kl": 0.11514789052307606, |
| "learning_rate": 4.947295864744121e-06, |
| "loss": 0.0046, |
| "reward": 2.055265612900257, |
| "reward_std": 0.6963230553083122, |
| "rewards/correctness_reward_func": 1.34375, |
| "rewards/int_reward_func": 0.4453125, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0078125, |
| "rewards/xmlcount_reward_func": 0.258390624076128, |
| "step": 149 |
| }, |
| { |
| "completion_length": 90.484375, |
| "epoch": 0.16057808109193095, |
| "grad_norm": 2.6227409839630127, |
| "kl": 0.09901809925213456, |
| "learning_rate": 4.9453690018345144e-06, |
| "loss": 0.004, |
| "reward": 2.031171888113022, |
| "reward_std": 0.9270219663158059, |
| "rewards/correctness_reward_func": 1.34375, |
| "rewards/int_reward_func": 0.4375, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.2499218750745058, |
| "step": 150 |
| }, |
| { |
| "completion_length": 111.640625, |
| "epoch": 0.16164860163254383, |
| "grad_norm": 5.403820037841797, |
| "kl": 0.24172887252643704, |
| "learning_rate": 4.943407934281298e-06, |
| "loss": 0.0097, |
| "reward": 1.693390630185604, |
| "reward_std": 1.0351360142230988, |
| "rewards/correctness_reward_func": 1.0625, |
| "rewards/int_reward_func": 0.4609375, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.1699531227350235, |
| "step": 151 |
| }, |
| { |
| "completion_length": 92.890625, |
| "epoch": 0.1627191221731567, |
| "grad_norm": 2.174766778945923, |
| "kl": 0.07736781658604741, |
| "learning_rate": 4.941412689514941e-06, |
| "loss": 0.0031, |
| "reward": 2.1249531507492065, |
| "reward_std": 0.82035240996629, |
| "rewards/correctness_reward_func": 1.40625, |
| "rewards/int_reward_func": 0.46875, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.2499531265348196, |
| "step": 152 |
| }, |
| { |
| "completion_length": 96.546875, |
| "epoch": 0.16378964271376958, |
| "grad_norm": 2.039672374725342, |
| "kl": 0.07323360512964427, |
| "learning_rate": 4.939383295443966e-06, |
| "loss": 0.0029, |
| "reward": 2.026875004172325, |
| "reward_std": 0.7493576873093843, |
| "rewards/correctness_reward_func": 1.375, |
| "rewards/int_reward_func": 0.4453125, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.20656250044703484, |
| "step": 153 |
| }, |
| { |
| "completion_length": 106.59375, |
| "epoch": 0.16486016325438244, |
| "grad_norm": 2.2819831371307373, |
| "kl": 0.07374695758335292, |
| "learning_rate": 4.937319780454559e-06, |
| "loss": 0.0029, |
| "reward": 1.7915781140327454, |
| "reward_std": 0.8505431758239865, |
| "rewards/correctness_reward_func": 1.15625, |
| "rewards/int_reward_func": 0.4453125, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.1900156200863421, |
| "step": 154 |
| }, |
| { |
| "completion_length": 91.90625, |
| "epoch": 0.16593068379499532, |
| "grad_norm": 5.251499176025391, |
| "kl": 0.40140265179798007, |
| "learning_rate": 4.9352221734101745e-06, |
| "loss": 0.0161, |
| "reward": 2.1450937539339066, |
| "reward_std": 0.7235295535065234, |
| "rewards/correctness_reward_func": 1.46875, |
| "rewards/int_reward_func": 0.4375, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.2388437483459711, |
| "step": 155 |
| }, |
| { |
| "completion_length": 91.375, |
| "epoch": 0.1670012043356082, |
| "grad_norm": 2.6729063987731934, |
| "kl": 0.06791439699009061, |
| "learning_rate": 4.933090503651129e-06, |
| "loss": 0.0027, |
| "reward": 2.2278437092900276, |
| "reward_std": 0.7469283854588866, |
| "rewards/correctness_reward_func": 1.5625, |
| "rewards/int_reward_func": 0.4453125, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.22003125306218863, |
| "step": 156 |
| }, |
| { |
| "completion_length": 83.53125, |
| "epoch": 0.16807172487622107, |
| "grad_norm": 2.890995979309082, |
| "kl": 0.08941763360053301, |
| "learning_rate": 4.930924800994192e-06, |
| "loss": 0.0036, |
| "reward": 2.040843792259693, |
| "reward_std": 0.6482276869937778, |
| "rewards/correctness_reward_func": 1.3125, |
| "rewards/int_reward_func": 0.4921875, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.23615625128149986, |
| "step": 157 |
| }, |
| { |
| "completion_length": 103.4375, |
| "epoch": 0.16914224541683393, |
| "grad_norm": 4.847292900085449, |
| "kl": 0.25062092347070575, |
| "learning_rate": 4.9287250957321685e-06, |
| "loss": 0.01, |
| "reward": 1.8447500094771385, |
| "reward_std": 0.8658850640058517, |
| "rewards/correctness_reward_func": 1.1875, |
| "rewards/int_reward_func": 0.4453125, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.21193750761449337, |
| "step": 158 |
| }, |
| { |
| "completion_length": 87.78125, |
| "epoch": 0.1702127659574468, |
| "grad_norm": 2.486091136932373, |
| "kl": 0.10755344619974494, |
| "learning_rate": 4.9264914186334775e-06, |
| "loss": 0.0043, |
| "reward": 2.14860936999321, |
| "reward_std": 0.7705556647852063, |
| "rewards/correctness_reward_func": 1.4375, |
| "rewards/int_reward_func": 0.4609375, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.25017187278717756, |
| "step": 159 |
| }, |
| { |
| "completion_length": 74.8125, |
| "epoch": 0.17128328649805968, |
| "grad_norm": 2.6566174030303955, |
| "kl": 0.09692100062966347, |
| "learning_rate": 4.924223800941718e-06, |
| "loss": 0.0039, |
| "reward": 2.179875001311302, |
| "reward_std": 0.647944641765207, |
| "rewards/correctness_reward_func": 1.40625, |
| "rewards/int_reward_func": 0.484375, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.28924999572336674, |
| "step": 160 |
| }, |
| { |
| "completion_length": 112.390625, |
| "epoch": 0.17235380703867256, |
| "grad_norm": 7.983546257019043, |
| "kl": 0.34650124446488917, |
| "learning_rate": 4.921922274375232e-06, |
| "loss": 0.0139, |
| "reward": 1.7768593654036522, |
| "reward_std": 0.6629852540791035, |
| "rewards/correctness_reward_func": 1.1875, |
| "rewards/int_reward_func": 0.4140625, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.17529687471687794, |
| "step": 161 |
| }, |
| { |
| "completion_length": 81.609375, |
| "epoch": 0.17342432757928541, |
| "grad_norm": 3.0785446166992188, |
| "kl": 0.09424351761117578, |
| "learning_rate": 4.919586871126667e-06, |
| "loss": 0.0038, |
| "reward": 2.1120937913656235, |
| "reward_std": 0.947939082980156, |
| "rewards/correctness_reward_func": 1.375, |
| "rewards/int_reward_func": 0.4609375, |
| "rewards/soft_format_reward_func": 0.0078125, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.26834374107420444, |
| "step": 162 |
| }, |
| { |
| "completion_length": 90.421875, |
| "epoch": 0.1744948481198983, |
| "grad_norm": 4.486505508422852, |
| "kl": 0.2375591630116105, |
| "learning_rate": 4.917217623862516e-06, |
| "loss": 0.0095, |
| "reward": 1.9345000088214874, |
| "reward_std": 0.7378783877938986, |
| "rewards/correctness_reward_func": 1.21875, |
| "rewards/int_reward_func": 0.4765625, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.23918749950826168, |
| "step": 163 |
| }, |
| { |
| "completion_length": 89.34375, |
| "epoch": 0.17556536866051117, |
| "grad_norm": 5.476224422454834, |
| "kl": 0.3916892586275935, |
| "learning_rate": 4.914814565722671e-06, |
| "loss": 0.0157, |
| "reward": 1.6488437578082085, |
| "reward_std": 0.861721821129322, |
| "rewards/correctness_reward_func": 0.9375, |
| "rewards/int_reward_func": 0.453125, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.2582187484949827, |
| "step": 164 |
| }, |
| { |
| "completion_length": 76.15625, |
| "epoch": 0.17663588920112405, |
| "grad_norm": 2.504824161529541, |
| "kl": 0.09898415254428983, |
| "learning_rate": 4.912377730319951e-06, |
| "loss": 0.004, |
| "reward": 2.149609424173832, |
| "reward_std": 0.8080255158711225, |
| "rewards/correctness_reward_func": 1.4375, |
| "rewards/int_reward_func": 0.421875, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.29023437574505806, |
| "step": 165 |
| }, |
| { |
| "completion_length": 86.703125, |
| "epoch": 0.17770640974173693, |
| "grad_norm": 2.5262093544006348, |
| "kl": 0.10452345060184598, |
| "learning_rate": 4.909907151739634e-06, |
| "loss": 0.0042, |
| "reward": 2.0487187057733536, |
| "reward_std": 1.013342872262001, |
| "rewards/correctness_reward_func": 1.34375, |
| "rewards/int_reward_func": 0.4453125, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.25965624768286943, |
| "step": 166 |
| }, |
| { |
| "completion_length": 83.109375, |
| "epoch": 0.17877693028234978, |
| "grad_norm": 3.6836423873901367, |
| "kl": 0.12263510143384337, |
| "learning_rate": 4.907402864538984e-06, |
| "loss": 0.0049, |
| "reward": 2.0983437597751617, |
| "reward_std": 0.8467487432062626, |
| "rewards/correctness_reward_func": 1.375, |
| "rewards/int_reward_func": 0.46875, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.2545937467366457, |
| "step": 167 |
| }, |
| { |
| "completion_length": 105.828125, |
| "epoch": 0.17984745082296266, |
| "grad_norm": 3.9994001388549805, |
| "kl": 0.2718197964131832, |
| "learning_rate": 4.904864903746765e-06, |
| "loss": 0.0109, |
| "reward": 1.590890608727932, |
| "reward_std": 1.0225011110305786, |
| "rewards/correctness_reward_func": 0.96875, |
| "rewards/int_reward_func": 0.3984375, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0078125, |
| "rewards/xmlcount_reward_func": 0.21589063154533505, |
| "step": 168 |
| }, |
| { |
| "completion_length": 80.734375, |
| "epoch": 0.18091797136357554, |
| "grad_norm": 3.370049476623535, |
| "kl": 0.0908288094215095, |
| "learning_rate": 4.9022933048627496e-06, |
| "loss": 0.0036, |
| "reward": 1.9604843854904175, |
| "reward_std": 0.6807838249951601, |
| "rewards/correctness_reward_func": 1.1875, |
| "rewards/int_reward_func": 0.484375, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.2886093705892563, |
| "step": 169 |
| }, |
| { |
| "completion_length": 89.03125, |
| "epoch": 0.18198849190418842, |
| "grad_norm": 2.7841224670410156, |
| "kl": 0.09452959662303329, |
| "learning_rate": 4.899688103857223e-06, |
| "loss": 0.0038, |
| "reward": 2.052734389901161, |
| "reward_std": 0.789879210293293, |
| "rewards/correctness_reward_func": 1.34375, |
| "rewards/int_reward_func": 0.453125, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.25585937313735485, |
| "step": 170 |
| }, |
| { |
| "completion_length": 99.46875, |
| "epoch": 0.18305901244480127, |
| "grad_norm": 2.4192087650299072, |
| "kl": 0.10028906259685755, |
| "learning_rate": 4.897049337170483e-06, |
| "loss": 0.004, |
| "reward": 1.9263124987483025, |
| "reward_std": 0.4447530438192189, |
| "rewards/correctness_reward_func": 1.25, |
| "rewards/int_reward_func": 0.4375, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.23881249967962503, |
| "step": 171 |
| }, |
| { |
| "completion_length": 85.359375, |
| "epoch": 0.18412953298541415, |
| "grad_norm": 2.552004814147949, |
| "kl": 0.10651395656168461, |
| "learning_rate": 4.894377041712327e-06, |
| "loss": 0.0043, |
| "reward": 2.072437509894371, |
| "reward_std": 0.8923071715980768, |
| "rewards/correctness_reward_func": 1.34375, |
| "rewards/int_reward_func": 0.484375, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.244312503375113, |
| "step": 172 |
| }, |
| { |
| "completion_length": 86.828125, |
| "epoch": 0.18520005352602703, |
| "grad_norm": 2.8208720684051514, |
| "kl": 0.08628266118466854, |
| "learning_rate": 4.891671254861535e-06, |
| "loss": 0.0035, |
| "reward": 2.123812586069107, |
| "reward_std": 0.7962243193760514, |
| "rewards/correctness_reward_func": 1.40625, |
| "rewards/int_reward_func": 0.4453125, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.2722499957308173, |
| "step": 173 |
| }, |
| { |
| "completion_length": 68.59375, |
| "epoch": 0.1862705740666399, |
| "grad_norm": 3.199782133102417, |
| "kl": 0.09673942252993584, |
| "learning_rate": 4.8889320144653525e-06, |
| "loss": 0.0039, |
| "reward": 2.4416562616825104, |
| "reward_std": 0.6845003152266145, |
| "rewards/correctness_reward_func": 1.65625, |
| "rewards/int_reward_func": 0.484375, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.3010312579572201, |
| "step": 174 |
| }, |
| { |
| "completion_length": 80.265625, |
| "epoch": 0.1873410946072528, |
| "grad_norm": 2.8050692081451416, |
| "kl": 0.09070709394291043, |
| "learning_rate": 4.886159358838952e-06, |
| "loss": 0.0036, |
| "reward": 2.2726562321186066, |
| "reward_std": 0.713380170520395, |
| "rewards/correctness_reward_func": 1.53125, |
| "rewards/int_reward_func": 0.4609375, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.2804687600582838, |
| "step": 175 |
| }, |
| { |
| "completion_length": 95.078125, |
| "epoch": 0.18841161514786564, |
| "grad_norm": 3.0878233909606934, |
| "kl": 0.22780032362788916, |
| "learning_rate": 4.883353326764907e-06, |
| "loss": 0.0091, |
| "reward": 2.0888593643903732, |
| "reward_std": 0.632993305567652, |
| "rewards/correctness_reward_func": 1.40625, |
| "rewards/int_reward_func": 0.453125, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.22948437556624413, |
| "step": 176 |
| }, |
| { |
| "completion_length": 92.03125, |
| "epoch": 0.18948213568847852, |
| "grad_norm": 6.2119035720825195, |
| "kl": 0.3357097846455872, |
| "learning_rate": 4.880513957492641e-06, |
| "loss": 0.0134, |
| "reward": 1.9138593599200249, |
| "reward_std": 0.8630610294640064, |
| "rewards/correctness_reward_func": 1.21875, |
| "rewards/int_reward_func": 0.4453125, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.24979687482118607, |
| "step": 177 |
| }, |
| { |
| "completion_length": 84.390625, |
| "epoch": 0.1905526562290914, |
| "grad_norm": 4.748230457305908, |
| "kl": 0.26475911401212215, |
| "learning_rate": 4.8776412907378845e-06, |
| "loss": 0.0106, |
| "reward": 1.8656718656420708, |
| "reward_std": 0.7014469979330897, |
| "rewards/correctness_reward_func": 1.125, |
| "rewards/int_reward_func": 0.4765625, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.2641093786805868, |
| "step": 178 |
| }, |
| { |
| "completion_length": 95.0625, |
| "epoch": 0.19162317676970428, |
| "grad_norm": 2.9860572814941406, |
| "kl": 0.1027436142321676, |
| "learning_rate": 4.8747353666821155e-06, |
| "loss": 0.0041, |
| "reward": 1.8935781568288803, |
| "reward_std": 0.8618385540321469, |
| "rewards/correctness_reward_func": 1.15625, |
| "rewards/int_reward_func": 0.4765625, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0078125, |
| "rewards/xmlcount_reward_func": 0.25295313261449337, |
| "step": 179 |
| }, |
| { |
| "completion_length": 78.359375, |
| "epoch": 0.19269369731031713, |
| "grad_norm": 4.26141881942749, |
| "kl": 0.19185639871284366, |
| "learning_rate": 4.871796225972e-06, |
| "loss": 0.0077, |
| "reward": 1.882093757390976, |
| "reward_std": 0.762749788351357, |
| "rewards/correctness_reward_func": 1.1875, |
| "rewards/int_reward_func": 0.4140625, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.2805312527343631, |
| "step": 180 |
| }, |
| { |
| "completion_length": 86.953125, |
| "epoch": 0.19376421785093, |
| "grad_norm": 2.9578616619110107, |
| "kl": 0.09208998270332813, |
| "learning_rate": 4.868823909718823e-06, |
| "loss": 0.0037, |
| "reward": 2.173124998807907, |
| "reward_std": 0.8102906746789813, |
| "rewards/correctness_reward_func": 1.46875, |
| "rewards/int_reward_func": 0.453125, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.25125000439584255, |
| "step": 181 |
| }, |
| { |
| "completion_length": 71.671875, |
| "epoch": 0.1948347383915429, |
| "grad_norm": 2.6002249717712402, |
| "kl": 0.10551499295979738, |
| "learning_rate": 4.865818459497911e-06, |
| "loss": 0.0042, |
| "reward": 2.2820468470454216, |
| "reward_std": 0.555876774713397, |
| "rewards/correctness_reward_func": 1.46875, |
| "rewards/int_reward_func": 0.5, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.3132968805730343, |
| "step": 182 |
| }, |
| { |
| "completion_length": 92.03125, |
| "epoch": 0.19590525893215577, |
| "grad_norm": 6.459475517272949, |
| "kl": 0.2931561325676739, |
| "learning_rate": 4.862779917348055e-06, |
| "loss": 0.0117, |
| "reward": 1.95626562833786, |
| "reward_std": 0.8608931167982519, |
| "rewards/correctness_reward_func": 1.25, |
| "rewards/int_reward_func": 0.4609375, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.2453281208872795, |
| "step": 183 |
| }, |
| { |
| "completion_length": 102.515625, |
| "epoch": 0.19697577947276865, |
| "grad_norm": 1.968064308166504, |
| "kl": 0.09014055877923965, |
| "learning_rate": 4.859708325770919e-06, |
| "loss": 0.0036, |
| "reward": 1.539296880364418, |
| "reward_std": 0.8695460446178913, |
| "rewards/correctness_reward_func": 0.875, |
| "rewards/int_reward_func": 0.4375, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.22679687477648258, |
| "step": 184 |
| }, |
| { |
| "completion_length": 82.21875, |
| "epoch": 0.1980463000133815, |
| "grad_norm": 5.502157688140869, |
| "kl": 0.24213434057310224, |
| "learning_rate": 4.856603727730446e-06, |
| "loss": 0.0097, |
| "reward": 2.103000044822693, |
| "reward_std": 0.7971827173605561, |
| "rewards/correctness_reward_func": 1.375, |
| "rewards/int_reward_func": 0.453125, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.27487499825656414, |
| "step": 185 |
| }, |
| { |
| "completion_length": 96.796875, |
| "epoch": 0.19911682055399438, |
| "grad_norm": 4.163439750671387, |
| "kl": 0.2488319119438529, |
| "learning_rate": 4.853466166652259e-06, |
| "loss": 0.01, |
| "reward": 1.8271406143903732, |
| "reward_std": 0.8965076357126236, |
| "rewards/correctness_reward_func": 1.125, |
| "rewards/int_reward_func": 0.4765625, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.22557812742888927, |
| "step": 186 |
| }, |
| { |
| "completion_length": 109.671875, |
| "epoch": 0.20018734109460726, |
| "grad_norm": 2.380798816680908, |
| "kl": 0.07831509876996279, |
| "learning_rate": 4.850295686423048e-06, |
| "loss": 0.0031, |
| "reward": 1.7582030892372131, |
| "reward_std": 0.8107579126954079, |
| "rewards/correctness_reward_func": 1.125, |
| "rewards/int_reward_func": 0.4140625, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0078125, |
| "rewards/xmlcount_reward_func": 0.21132812649011612, |
| "step": 187 |
| }, |
| { |
| "completion_length": 96.46875, |
| "epoch": 0.20125786163522014, |
| "grad_norm": 5.34075927734375, |
| "kl": 0.3953818525187671, |
| "learning_rate": 4.8470923313899655e-06, |
| "loss": 0.0158, |
| "reward": 1.9607812352478504, |
| "reward_std": 0.5701902243308723, |
| "rewards/correctness_reward_func": 1.3125, |
| "rewards/int_reward_func": 0.421875, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.22640624921768904, |
| "step": 188 |
| }, |
| { |
| "completion_length": 79.265625, |
| "epoch": 0.202328382175833, |
| "grad_norm": 2.1705665588378906, |
| "kl": 0.10117745213210583, |
| "learning_rate": 4.843856146359999e-06, |
| "loss": 0.004, |
| "reward": 2.0710155963897705, |
| "reward_std": 0.6723045469261706, |
| "rewards/correctness_reward_func": 1.34375, |
| "rewards/int_reward_func": 0.453125, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.2741406299173832, |
| "step": 189 |
| }, |
| { |
| "completion_length": 72.46875, |
| "epoch": 0.20339890271644587, |
| "grad_norm": 2.7847604751586914, |
| "kl": 0.12489751679822803, |
| "learning_rate": 4.8405871765993435e-06, |
| "loss": 0.005, |
| "reward": 1.8936093151569366, |
| "reward_std": 0.8906522025354207, |
| "rewards/correctness_reward_func": 1.15625, |
| "rewards/int_reward_func": 0.4375, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0078125, |
| "rewards/xmlcount_reward_func": 0.2920468747615814, |
| "step": 190 |
| }, |
| { |
| "completion_length": 74.3125, |
| "epoch": 0.20446942325705875, |
| "grad_norm": 7.144665241241455, |
| "kl": 0.5474197333678603, |
| "learning_rate": 4.837285467832775e-06, |
| "loss": 0.0219, |
| "reward": 1.9980624914169312, |
| "reward_std": 1.1599431410431862, |
| "rewards/correctness_reward_func": 1.28125, |
| "rewards/int_reward_func": 0.4140625, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.3027499932795763, |
| "step": 191 |
| }, |
| { |
| "completion_length": 77.015625, |
| "epoch": 0.20553994379767163, |
| "grad_norm": 2.282410144805908, |
| "kl": 0.12488419935107231, |
| "learning_rate": 4.833951066243004e-06, |
| "loss": 0.005, |
| "reward": 2.0859062671661377, |
| "reward_std": 0.7159075043164194, |
| "rewards/correctness_reward_func": 1.3125, |
| "rewards/int_reward_func": 0.4609375, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.3124687448143959, |
| "step": 192 |
| }, |
| { |
| "completion_length": 74.6875, |
| "epoch": 0.20661046433828448, |
| "grad_norm": 5.521911144256592, |
| "kl": 0.38207234255969524, |
| "learning_rate": 4.830584018470036e-06, |
| "loss": 0.0153, |
| "reward": 2.290625035762787, |
| "reward_std": 0.6622507013380527, |
| "rewards/correctness_reward_func": 1.5, |
| "rewards/int_reward_func": 0.484375, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0078125, |
| "rewards/xmlcount_reward_func": 0.2984374985098839, |
| "step": 193 |
| }, |
| { |
| "completion_length": 81.640625, |
| "epoch": 0.20768098487889736, |
| "grad_norm": 5.832190036773682, |
| "kl": 0.24756696447730064, |
| "learning_rate": 4.827184371610511e-06, |
| "loss": 0.0099, |
| "reward": 2.2973125129938126, |
| "reward_std": 0.6708025210537016, |
| "rewards/correctness_reward_func": 1.53125, |
| "rewards/int_reward_func": 0.484375, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.28168749529868364, |
| "step": 194 |
| }, |
| { |
| "completion_length": 74.671875, |
| "epoch": 0.20875150541951024, |
| "grad_norm": 7.309933662414551, |
| "kl": 0.12391441874206066, |
| "learning_rate": 4.8237521732170525e-06, |
| "loss": 0.005, |
| "reward": 2.016703099012375, |
| "reward_std": 1.132209412753582, |
| "rewards/correctness_reward_func": 1.28125, |
| "rewards/int_reward_func": 0.4375, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0078125, |
| "rewards/xmlcount_reward_func": 0.29014062508940697, |
| "step": 195 |
| }, |
| { |
| "completion_length": 73.34375, |
| "epoch": 0.20982202596012312, |
| "grad_norm": 3.4460299015045166, |
| "kl": 0.12927269656211138, |
| "learning_rate": 4.820287471297598e-06, |
| "loss": 0.0052, |
| "reward": 1.9848750308156013, |
| "reward_std": 0.7779360907152295, |
| "rewards/correctness_reward_func": 1.21875, |
| "rewards/int_reward_func": 0.4765625, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.2895624926313758, |
| "step": 196 |
| }, |
| { |
| "completion_length": 88.28125, |
| "epoch": 0.210892546500736, |
| "grad_norm": 4.137183666229248, |
| "kl": 0.3276713816449046, |
| "learning_rate": 4.816790314314729e-06, |
| "loss": 0.0131, |
| "reward": 1.964468702673912, |
| "reward_std": 0.7385209053754807, |
| "rewards/correctness_reward_func": 1.25, |
| "rewards/int_reward_func": 0.4609375, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.2535312492400408, |
| "step": 197 |
| }, |
| { |
| "completion_length": 73.453125, |
| "epoch": 0.21196306704134885, |
| "grad_norm": 6.063220500946045, |
| "kl": 0.36109886690974236, |
| "learning_rate": 4.813260751184992e-06, |
| "loss": 0.0144, |
| "reward": 2.3978749811649323, |
| "reward_std": 0.7160034999251366, |
| "rewards/correctness_reward_func": 1.59375, |
| "rewards/int_reward_func": 0.484375, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.31975000351667404, |
| "step": 198 |
| }, |
| { |
| "completion_length": 90.359375, |
| "epoch": 0.21303358758196173, |
| "grad_norm": 9.171374320983887, |
| "kl": 0.7352069662883878, |
| "learning_rate": 4.809698831278217e-06, |
| "loss": 0.0294, |
| "reward": 1.9928593933582306, |
| "reward_std": 0.8052435261197388, |
| "rewards/correctness_reward_func": 1.28125, |
| "rewards/int_reward_func": 0.4453125, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.2662968710064888, |
| "step": 199 |
| }, |
| { |
| "completion_length": 69.953125, |
| "epoch": 0.2141041081225746, |
| "grad_norm": 3.0394396781921387, |
| "kl": 0.14388780342414975, |
| "learning_rate": 4.806104604416824e-06, |
| "loss": 0.0058, |
| "reward": 2.525015652179718, |
| "reward_std": 0.4300219719298184, |
| "rewards/correctness_reward_func": 1.75, |
| "rewards/int_reward_func": 0.484375, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.2906406167894602, |
| "step": 200 |
| }, |
| { |
| "completion_length": 92.03125, |
| "epoch": 0.21517462866318748, |
| "grad_norm": 4.5592265129089355, |
| "kl": 0.3974157813936472, |
| "learning_rate": 4.802478120875125e-06, |
| "loss": 0.0159, |
| "reward": 1.534609392285347, |
| "reward_std": 1.030179588124156, |
| "rewards/correctness_reward_func": 0.875, |
| "rewards/int_reward_func": 0.3828125, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.2767968699336052, |
| "step": 201 |
| }, |
| { |
| "completion_length": 76.625, |
| "epoch": 0.21624514920380034, |
| "grad_norm": 2.740817070007324, |
| "kl": 0.14991699904203415, |
| "learning_rate": 4.7988194313786275e-06, |
| "loss": 0.006, |
| "reward": 2.134265646338463, |
| "reward_std": 0.8027890680823475, |
| "rewards/correctness_reward_func": 1.375, |
| "rewards/int_reward_func": 0.4765625, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.28270312771201134, |
| "step": 202 |
| }, |
| { |
| "completion_length": 86.53125, |
| "epoch": 0.21731566974441321, |
| "grad_norm": 5.491028785705566, |
| "kl": 0.32305468805134296, |
| "learning_rate": 4.795128587103315e-06, |
| "loss": 0.0129, |
| "reward": 2.177546873688698, |
| "reward_std": 0.862015737220645, |
| "rewards/correctness_reward_func": 1.46875, |
| "rewards/int_reward_func": 0.453125, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.2556718774139881, |
| "step": 203 |
| }, |
| { |
| "completion_length": 80.640625, |
| "epoch": 0.2183861902850261, |
| "grad_norm": 3.542194366455078, |
| "kl": 0.13027278054505587, |
| "learning_rate": 4.791405639674941e-06, |
| "loss": 0.0052, |
| "reward": 1.6867187470197678, |
| "reward_std": 0.8688598442822695, |
| "rewards/correctness_reward_func": 0.9375, |
| "rewards/int_reward_func": 0.4609375, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0078125, |
| "rewards/xmlcount_reward_func": 0.28046874329447746, |
| "step": 204 |
| }, |
| { |
| "completion_length": 75.96875, |
| "epoch": 0.21945671082563897, |
| "grad_norm": 3.078350305557251, |
| "kl": 0.1293167658150196, |
| "learning_rate": 4.7876506411683e-06, |
| "loss": 0.0052, |
| "reward": 2.0141249895095825, |
| "reward_std": 0.7147551532834768, |
| "rewards/correctness_reward_func": 1.21875, |
| "rewards/int_reward_func": 0.4765625, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.31881249509751797, |
| "step": 205 |
| }, |
| { |
| "completion_length": 62.734375, |
| "epoch": 0.22052723136625185, |
| "grad_norm": 7.453506946563721, |
| "kl": 0.8046054858714342, |
| "learning_rate": 4.783863644106502e-06, |
| "loss": 0.0322, |
| "reward": 1.6788437813520432, |
| "reward_std": 0.8941609086468816, |
| "rewards/correctness_reward_func": 0.96875, |
| "rewards/int_reward_func": 0.40625, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.3038437496870756, |
| "step": 206 |
| }, |
| { |
| "completion_length": 68.671875, |
| "epoch": 0.2215977519068647, |
| "grad_norm": 4.4995436668396, |
| "kl": 0.14437556639313698, |
| "learning_rate": 4.780044701460239e-06, |
| "loss": 0.0058, |
| "reward": 2.126671925187111, |
| "reward_std": 0.7776627587154508, |
| "rewards/correctness_reward_func": 1.34375, |
| "rewards/int_reward_func": 0.4765625, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0078125, |
| "rewards/xmlcount_reward_func": 0.29854688607156277, |
| "step": 207 |
| }, |
| { |
| "completion_length": 54.75, |
| "epoch": 0.22266827244747758, |
| "grad_norm": 3.271150588989258, |
| "kl": 0.2097001215443015, |
| "learning_rate": 4.7761938666470405e-06, |
| "loss": 0.0084, |
| "reward": 1.956812545657158, |
| "reward_std": 0.9061004631221294, |
| "rewards/correctness_reward_func": 1.125, |
| "rewards/int_reward_func": 0.46875, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0078125, |
| "rewards/xmlcount_reward_func": 0.3552500009536743, |
| "step": 208 |
| }, |
| { |
| "completion_length": 72.578125, |
| "epoch": 0.22373879298809046, |
| "grad_norm": 3.0446856021881104, |
| "kl": 0.15493952203541994, |
| "learning_rate": 4.7723111935305275e-06, |
| "loss": 0.0062, |
| "reward": 2.270968735218048, |
| "reward_std": 0.8504615277051926, |
| "rewards/correctness_reward_func": 1.46875, |
| "rewards/int_reward_func": 0.4765625, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.32565624453127384, |
| "step": 209 |
| }, |
| { |
| "completion_length": 69.046875, |
| "epoch": 0.22480931352870334, |
| "grad_norm": 5.6605095863342285, |
| "kl": 0.44642951618880033, |
| "learning_rate": 4.7683967364196624e-06, |
| "loss": 0.0179, |
| "reward": 2.0304374992847443, |
| "reward_std": 0.7277556583285332, |
| "rewards/correctness_reward_func": 1.25, |
| "rewards/int_reward_func": 0.46875, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.3116874936968088, |
| "step": 210 |
| }, |
| { |
| "completion_length": 64.9375, |
| "epoch": 0.2258798340693162, |
| "grad_norm": 4.651093482971191, |
| "kl": 0.17161214351654053, |
| "learning_rate": 4.764450550067986e-06, |
| "loss": 0.0069, |
| "reward": 1.935359388589859, |
| "reward_std": 0.850550489500165, |
| "rewards/correctness_reward_func": 1.15625, |
| "rewards/int_reward_func": 0.453125, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.3259843699634075, |
| "step": 211 |
| }, |
| { |
| "completion_length": 99.921875, |
| "epoch": 0.22695035460992907, |
| "grad_norm": 7.470244407653809, |
| "kl": 0.5526934135705233, |
| "learning_rate": 4.760472689672851e-06, |
| "loss": 0.0221, |
| "reward": 1.62957813590765, |
| "reward_std": 0.9631365463137627, |
| "rewards/correctness_reward_func": 1.0, |
| "rewards/int_reward_func": 0.4140625, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.2155156247317791, |
| "step": 212 |
| }, |
| { |
| "completion_length": 64.296875, |
| "epoch": 0.22802087515054195, |
| "grad_norm": 4.030452251434326, |
| "kl": 0.1725650643929839, |
| "learning_rate": 4.7564632108746524e-06, |
| "loss": 0.0069, |
| "reward": 2.4495781660079956, |
| "reward_std": 0.7331900605931878, |
| "rewards/correctness_reward_func": 1.625, |
| "rewards/int_reward_func": 0.4765625, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0078125, |
| "rewards/xmlcount_reward_func": 0.3402031324803829, |
| "step": 213 |
| }, |
| { |
| "completion_length": 57.0625, |
| "epoch": 0.22909139569115483, |
| "grad_norm": 2.44975209236145, |
| "kl": 0.1745634926483035, |
| "learning_rate": 4.752422169756048e-06, |
| "loss": 0.007, |
| "reward": 2.49567186832428, |
| "reward_std": 0.585249027935788, |
| "rewards/correctness_reward_func": 1.65625, |
| "rewards/int_reward_func": 0.4765625, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.3628593757748604, |
| "step": 214 |
| }, |
| { |
| "completion_length": 80.609375, |
| "epoch": 0.2301619162317677, |
| "grad_norm": 3.696011781692505, |
| "kl": 0.1569390268996358, |
| "learning_rate": 4.7483496228411754e-06, |
| "loss": 0.0063, |
| "reward": 2.0820469111204147, |
| "reward_std": 0.7507896656170487, |
| "rewards/correctness_reward_func": 1.34375, |
| "rewards/int_reward_func": 0.4765625, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.26173438131809235, |
| "step": 215 |
| }, |
| { |
| "completion_length": 67.90625, |
| "epoch": 0.23123243677238056, |
| "grad_norm": 3.6006627082824707, |
| "kl": 0.1786866094917059, |
| "learning_rate": 4.744245627094859e-06, |
| "loss": 0.0071, |
| "reward": 2.0825624614953995, |
| "reward_std": 0.8309154035523534, |
| "rewards/correctness_reward_func": 1.28125, |
| "rewards/int_reward_func": 0.453125, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.015625, |
| "rewards/xmlcount_reward_func": 0.33256249874830246, |
| "step": 216 |
| }, |
| { |
| "completion_length": 82.53125, |
| "epoch": 0.23230295731299344, |
| "grad_norm": 4.304072856903076, |
| "kl": 0.31106262002140284, |
| "learning_rate": 4.740110239921813e-06, |
| "loss": 0.0124, |
| "reward": 1.406374990940094, |
| "reward_std": 0.8875350207090378, |
| "rewards/correctness_reward_func": 0.6875, |
| "rewards/int_reward_func": 0.453125, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.2657500021159649, |
| "step": 217 |
| }, |
| { |
| "completion_length": 57.75, |
| "epoch": 0.23337347785360632, |
| "grad_norm": 3.51265025138855, |
| "kl": 0.18941342923790216, |
| "learning_rate": 4.735943519165843e-06, |
| "loss": 0.0076, |
| "reward": 2.366296797990799, |
| "reward_std": 0.6840489963069558, |
| "rewards/correctness_reward_func": 1.5625, |
| "rewards/int_reward_func": 0.4609375, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.3428593724966049, |
| "step": 218 |
| }, |
| { |
| "completion_length": 69.6875, |
| "epoch": 0.2344439983942192, |
| "grad_norm": 3.1696012020111084, |
| "kl": 0.144703084602952, |
| "learning_rate": 4.731745523109029e-06, |
| "loss": 0.0058, |
| "reward": 1.7885156497359276, |
| "reward_std": 0.8288924656808376, |
| "rewards/correctness_reward_func": 1.0, |
| "rewards/int_reward_func": 0.4375, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0078125, |
| "rewards/xmlcount_reward_func": 0.3432031273841858, |
| "step": 219 |
| }, |
| { |
| "completion_length": 79.453125, |
| "epoch": 0.23551451893483205, |
| "grad_norm": 3.0204484462738037, |
| "kl": 0.16810880228877068, |
| "learning_rate": 4.72751631047092e-06, |
| "loss": 0.0067, |
| "reward": 1.817734345793724, |
| "reward_std": 1.0543543472886086, |
| "rewards/correctness_reward_func": 1.0625, |
| "rewards/int_reward_func": 0.4296875, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0234375, |
| "rewards/xmlcount_reward_func": 0.302109370008111, |
| "step": 220 |
| }, |
| { |
| "completion_length": 89.984375, |
| "epoch": 0.23658503947544493, |
| "grad_norm": 9.887064933776855, |
| "kl": 0.571788308210671, |
| "learning_rate": 4.723255940407704e-06, |
| "loss": 0.0229, |
| "reward": 2.150187447667122, |
| "reward_std": 0.7866512620821595, |
| "rewards/correctness_reward_func": 1.375, |
| "rewards/int_reward_func": 0.4765625, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0234375, |
| "rewards/xmlcount_reward_func": 0.27518749609589577, |
| "step": 221 |
| }, |
| { |
| "completion_length": 82.15625, |
| "epoch": 0.2376555600160578, |
| "grad_norm": 3.509312868118286, |
| "kl": 0.16501779574900866, |
| "learning_rate": 4.718964472511386e-06, |
| "loss": 0.0066, |
| "reward": 2.0418750420212746, |
| "reward_std": 0.9721324890851974, |
| "rewards/correctness_reward_func": 1.28125, |
| "rewards/int_reward_func": 0.4453125, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0234375, |
| "rewards/xmlcount_reward_func": 0.29187500290572643, |
| "step": 222 |
| }, |
| { |
| "completion_length": 69.703125, |
| "epoch": 0.2387260805566707, |
| "grad_norm": 3.7087841033935547, |
| "kl": 0.18095918465405703, |
| "learning_rate": 4.71464196680895e-06, |
| "loss": 0.0072, |
| "reward": 2.18121874332428, |
| "reward_std": 0.870441822335124, |
| "rewards/correctness_reward_func": 1.34375, |
| "rewards/int_reward_func": 0.4765625, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.015625, |
| "rewards/xmlcount_reward_func": 0.3452812507748604, |
| "step": 223 |
| }, |
| { |
| "completion_length": 93.453125, |
| "epoch": 0.23979660109728354, |
| "grad_norm": 9.079336166381836, |
| "kl": 0.7006166982464492, |
| "learning_rate": 4.710288483761524e-06, |
| "loss": 0.028, |
| "reward": 2.110390603542328, |
| "reward_std": 0.7916512079536915, |
| "rewards/correctness_reward_func": 1.34375, |
| "rewards/int_reward_func": 0.46875, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.03125, |
| "rewards/xmlcount_reward_func": 0.2666406221687794, |
| "step": 224 |
| }, |
| { |
| "completion_length": 89.796875, |
| "epoch": 0.24086712163789642, |
| "grad_norm": 5.301533222198486, |
| "kl": 0.33772587310522795, |
| "learning_rate": 4.705904084263534e-06, |
| "loss": 0.0135, |
| "reward": 1.9977499693632126, |
| "reward_std": 0.9460650207474828, |
| "rewards/correctness_reward_func": 1.25, |
| "rewards/int_reward_func": 0.4375, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.31025000289082527, |
| "step": 225 |
| }, |
| { |
| "completion_length": 93.375, |
| "epoch": 0.2419376421785093, |
| "grad_norm": 2.236737012863159, |
| "kl": 0.11640047281980515, |
| "learning_rate": 4.701488829641845e-06, |
| "loss": 0.0047, |
| "reward": 2.145703136920929, |
| "reward_std": 0.8284804495051503, |
| "rewards/correctness_reward_func": 1.40625, |
| "rewards/int_reward_func": 0.4453125, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0078125, |
| "rewards/xmlcount_reward_func": 0.2863281313329935, |
| "step": 226 |
| }, |
| { |
| "completion_length": 81.8125, |
| "epoch": 0.24300816271912218, |
| "grad_norm": 3.1928486824035645, |
| "kl": 0.135368085000664, |
| "learning_rate": 4.697042781654913e-06, |
| "loss": 0.0054, |
| "reward": 2.135328069329262, |
| "reward_std": 0.7982124611735344, |
| "rewards/correctness_reward_func": 1.3125, |
| "rewards/int_reward_func": 0.4765625, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0078125, |
| "rewards/xmlcount_reward_func": 0.33845312520861626, |
| "step": 227 |
| }, |
| { |
| "completion_length": 85.453125, |
| "epoch": 0.24407868325973506, |
| "grad_norm": 12.730961799621582, |
| "kl": 0.4075321350246668, |
| "learning_rate": 4.692566002491917e-06, |
| "loss": 0.0163, |
| "reward": 2.0005937218666077, |
| "reward_std": 0.8029458876699209, |
| "rewards/correctness_reward_func": 1.21875, |
| "rewards/int_reward_func": 0.4375, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.34434375166893005, |
| "step": 228 |
| }, |
| { |
| "completion_length": 72.875, |
| "epoch": 0.2451492038003479, |
| "grad_norm": 6.923903942108154, |
| "kl": 0.6862649563699961, |
| "learning_rate": 4.6880585547718845e-06, |
| "loss": 0.0275, |
| "reward": 1.7937656044960022, |
| "reward_std": 0.915380734950304, |
| "rewards/correctness_reward_func": 0.9375, |
| "rewards/int_reward_func": 0.46875, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.015625, |
| "rewards/xmlcount_reward_func": 0.371890626847744, |
| "step": 229 |
| }, |
| { |
| "completion_length": 93.75, |
| "epoch": 0.2462197243409608, |
| "grad_norm": 3.041198968887329, |
| "kl": 0.109968694858253, |
| "learning_rate": 4.683520501542825e-06, |
| "loss": 0.0044, |
| "reward": 2.041828043758869, |
| "reward_std": 0.828435555100441, |
| "rewards/correctness_reward_func": 1.25, |
| "rewards/int_reward_func": 0.4296875, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.03125, |
| "rewards/xmlcount_reward_func": 0.3308906201273203, |
| "step": 230 |
| }, |
| { |
| "completion_length": 78.359375, |
| "epoch": 0.24729024488157367, |
| "grad_norm": 3.0014753341674805, |
| "kl": 0.13458310719579458, |
| "learning_rate": 4.67895190628084e-06, |
| "loss": 0.0054, |
| "reward": 2.4933906197547913, |
| "reward_std": 0.8564620353281498, |
| "rewards/correctness_reward_func": 1.625, |
| "rewards/int_reward_func": 0.46875, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0234375, |
| "rewards/xmlcount_reward_func": 0.37620312348008156, |
| "step": 231 |
| }, |
| { |
| "completion_length": 93.8125, |
| "epoch": 0.24836076542218655, |
| "grad_norm": 4.2342848777771, |
| "kl": 0.1281078103929758, |
| "learning_rate": 4.674352832889239e-06, |
| "loss": 0.0051, |
| "reward": 2.4442031383514404, |
| "reward_std": 0.7425322765484452, |
| "rewards/correctness_reward_func": 1.59375, |
| "rewards/int_reward_func": 0.484375, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0234375, |
| "rewards/xmlcount_reward_func": 0.3426406290382147, |
| "step": 232 |
| }, |
| { |
| "completion_length": 91.703125, |
| "epoch": 0.2494312859627994, |
| "grad_norm": 4.6626811027526855, |
| "kl": 0.30947081558406353, |
| "learning_rate": 4.669723345697646e-06, |
| "loss": 0.0124, |
| "reward": 1.7927343994379044, |
| "reward_std": 1.0577923730015755, |
| "rewards/correctness_reward_func": 1.0, |
| "rewards/int_reward_func": 0.4375, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0234375, |
| "rewards/xmlcount_reward_func": 0.3317968789488077, |
| "step": 233 |
| }, |
| { |
| "completion_length": 95.1875, |
| "epoch": 0.2505018065034123, |
| "grad_norm": 2.6523685455322266, |
| "kl": 0.12286860542371869, |
| "learning_rate": 4.665063509461098e-06, |
| "loss": 0.0049, |
| "reward": 2.122484341263771, |
| "reward_std": 0.6617152327671647, |
| "rewards/correctness_reward_func": 1.28125, |
| "rewards/int_reward_func": 0.4453125, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.015625, |
| "rewards/xmlcount_reward_func": 0.38029688596725464, |
| "step": 234 |
| }, |
| { |
| "completion_length": 100.3125, |
| "epoch": 0.25157232704402516, |
| "grad_norm": 3.743971586227417, |
| "kl": 0.2634156849235296, |
| "learning_rate": 4.660373389359137e-06, |
| "loss": 0.0105, |
| "reward": 1.9493124820291996, |
| "reward_std": 0.6555369319394231, |
| "rewards/correctness_reward_func": 1.1875, |
| "rewards/int_reward_func": 0.4296875, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.3321250043809414, |
| "step": 235 |
| }, |
| { |
| "completion_length": 91.890625, |
| "epoch": 0.252642847584638, |
| "grad_norm": 5.421882629394531, |
| "kl": 0.36969919549301267, |
| "learning_rate": 4.655653050994907e-06, |
| "loss": 0.0148, |
| "reward": 2.489093706011772, |
| "reward_std": 0.5151624148711562, |
| "rewards/correctness_reward_func": 1.59375, |
| "rewards/int_reward_func": 0.4765625, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.03125, |
| "rewards/xmlcount_reward_func": 0.38753124326467514, |
| "step": 236 |
| }, |
| { |
| "completion_length": 107.421875, |
| "epoch": 0.2537133681252509, |
| "grad_norm": 3.658877372741699, |
| "kl": 0.3179207113571465, |
| "learning_rate": 4.650902560394225e-06, |
| "loss": 0.0127, |
| "reward": 2.1520937085151672, |
| "reward_std": 0.7971650678664446, |
| "rewards/correctness_reward_func": 1.34375, |
| "rewards/int_reward_func": 0.453125, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0078125, |
| "rewards/xmlcount_reward_func": 0.3474062494933605, |
| "step": 237 |
| }, |
| { |
| "completion_length": 96.75, |
| "epoch": 0.25478388866586377, |
| "grad_norm": 2.333758592605591, |
| "kl": 0.14108293130993843, |
| "learning_rate": 4.646121984004666e-06, |
| "loss": 0.0056, |
| "reward": 2.4891093373298645, |
| "reward_std": 0.7755477353930473, |
| "rewards/correctness_reward_func": 1.59375, |
| "rewards/int_reward_func": 0.484375, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0234375, |
| "rewards/xmlcount_reward_func": 0.38754688017070293, |
| "step": 238 |
| }, |
| { |
| "completion_length": 97.796875, |
| "epoch": 0.2558544092064767, |
| "grad_norm": 7.903679370880127, |
| "kl": 0.6723966179415584, |
| "learning_rate": 4.641311388694629e-06, |
| "loss": 0.0269, |
| "reward": 2.23892180621624, |
| "reward_std": 0.8879001673776656, |
| "rewards/correctness_reward_func": 1.40625, |
| "rewards/int_reward_func": 0.4140625, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0234375, |
| "rewards/xmlcount_reward_func": 0.3951718807220459, |
| "step": 239 |
| }, |
| { |
| "completion_length": 128.1875, |
| "epoch": 0.2569249297470895, |
| "grad_norm": 2.0204544067382812, |
| "kl": 0.09938508365303278, |
| "learning_rate": 4.636470841752405e-06, |
| "loss": 0.004, |
| "reward": 1.7635936960577965, |
| "reward_std": 0.65199055057019, |
| "rewards/correctness_reward_func": 0.96875, |
| "rewards/int_reward_func": 0.390625, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0390625, |
| "rewards/xmlcount_reward_func": 0.36515624821186066, |
| "step": 240 |
| }, |
| { |
| "completion_length": 95.03125, |
| "epoch": 0.2579954502877024, |
| "grad_norm": 2.7629244327545166, |
| "kl": 0.16699408926069736, |
| "learning_rate": 4.631600410885231e-06, |
| "loss": 0.0067, |
| "reward": 2.553484320640564, |
| "reward_std": 0.6153040612116456, |
| "rewards/correctness_reward_func": 1.65625, |
| "rewards/int_reward_func": 0.4453125, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0234375, |
| "rewards/xmlcount_reward_func": 0.42848438024520874, |
| "step": 241 |
| }, |
| { |
| "completion_length": 91.640625, |
| "epoch": 0.2590659708283153, |
| "grad_norm": 2.4757933616638184, |
| "kl": 0.12886409275233746, |
| "learning_rate": 4.626700164218349e-06, |
| "loss": 0.0052, |
| "reward": 2.60553115606308, |
| "reward_std": 0.6941613564267755, |
| "rewards/correctness_reward_func": 1.6875, |
| "rewards/int_reward_func": 0.46875, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.015625, |
| "rewards/xmlcount_reward_func": 0.4336562491953373, |
| "step": 242 |
| }, |
| { |
| "completion_length": 88.0, |
| "epoch": 0.26013649136892814, |
| "grad_norm": 5.133749961853027, |
| "kl": 0.3455530842766166, |
| "learning_rate": 4.621770170294049e-06, |
| "loss": 0.0138, |
| "reward": 1.829562470316887, |
| "reward_std": 0.8116088081151247, |
| "rewards/correctness_reward_func": 1.0, |
| "rewards/int_reward_func": 0.421875, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.015625, |
| "rewards/xmlcount_reward_func": 0.3920625038444996, |
| "step": 243 |
| }, |
| { |
| "completion_length": 106.015625, |
| "epoch": 0.261207011909541, |
| "grad_norm": 2.4294216632843018, |
| "kl": 0.14171195961534977, |
| "learning_rate": 4.6168104980707105e-06, |
| "loss": 0.0057, |
| "reward": 2.2230467945337296, |
| "reward_std": 0.9806447625160217, |
| "rewards/correctness_reward_func": 1.3125, |
| "rewards/int_reward_func": 0.4453125, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0390625, |
| "rewards/xmlcount_reward_func": 0.4261718839406967, |
| "step": 244 |
| }, |
| { |
| "completion_length": 97.625, |
| "epoch": 0.2622775324501539, |
| "grad_norm": 6.696681976318359, |
| "kl": 0.5324421431869268, |
| "learning_rate": 4.61182121692184e-06, |
| "loss": 0.0213, |
| "reward": 2.5623437613248825, |
| "reward_std": 0.7769194557331502, |
| "rewards/correctness_reward_func": 1.65625, |
| "rewards/int_reward_func": 0.453125, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0234375, |
| "rewards/xmlcount_reward_func": 0.4295312501490116, |
| "step": 245 |
| }, |
| { |
| "completion_length": 105.328125, |
| "epoch": 0.26334805299076675, |
| "grad_norm": 3.7250571250915527, |
| "kl": 0.3119491417892277, |
| "learning_rate": 4.606802396635098e-06, |
| "loss": 0.0125, |
| "reward": 2.0679530799388885, |
| "reward_std": 1.0327460495755076, |
| "rewards/correctness_reward_func": 1.25, |
| "rewards/int_reward_func": 0.421875, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0078125, |
| "rewards/xmlcount_reward_func": 0.388265622779727, |
| "step": 246 |
| }, |
| { |
| "completion_length": 101.0625, |
| "epoch": 0.26441857353137965, |
| "grad_norm": 2.0395541191101074, |
| "kl": 0.1265430450439453, |
| "learning_rate": 4.601754107411326e-06, |
| "loss": 0.0051, |
| "reward": 2.3826874494552612, |
| "reward_std": 0.881409777328372, |
| "rewards/correctness_reward_func": 1.46875, |
| "rewards/int_reward_func": 0.46875, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.046875, |
| "rewards/xmlcount_reward_func": 0.3983125016093254, |
| "step": 247 |
| }, |
| { |
| "completion_length": 119.25, |
| "epoch": 0.2654890940719925, |
| "grad_norm": 50.28816223144531, |
| "kl": 1.0364861502312124, |
| "learning_rate": 4.596676419863561e-06, |
| "loss": 0.0415, |
| "reward": 1.8263437151908875, |
| "reward_std": 1.0226206295192242, |
| "rewards/correctness_reward_func": 1.03125, |
| "rewards/int_reward_func": 0.375, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.046875, |
| "rewards/xmlcount_reward_func": 0.37321874499320984, |
| "step": 248 |
| }, |
| { |
| "completion_length": 113.515625, |
| "epoch": 0.26655961461260536, |
| "grad_norm": 18.58391571044922, |
| "kl": 1.4038016851991415, |
| "learning_rate": 4.59156940501605e-06, |
| "loss": 0.0562, |
| "reward": 2.122187450528145, |
| "reward_std": 0.7838817811571062, |
| "rewards/correctness_reward_func": 1.25, |
| "rewards/int_reward_func": 0.421875, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.046875, |
| "rewards/xmlcount_reward_func": 0.4034374989569187, |
| "step": 249 |
| }, |
| { |
| "completion_length": 77.8125, |
| "epoch": 0.26763013515321826, |
| "grad_norm": 2.225803852081299, |
| "kl": 0.1744153881445527, |
| "learning_rate": 4.586433134303257e-06, |
| "loss": 0.007, |
| "reward": 2.600734308362007, |
| "reward_std": 0.6244143173098564, |
| "rewards/correctness_reward_func": 1.59375, |
| "rewards/int_reward_func": 0.4453125, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.1015625, |
| "rewards/xmlcount_reward_func": 0.4601093679666519, |
| "step": 250 |
| } |
| ], |
| "logging_steps": 1, |
| "max_steps": 934, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 1, |
| "save_steps": 250, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": false |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 0.0, |
| "train_batch_size": 8, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|