Lora-grpo / trainer_state.json
Delta-Vector's picture
Upload folder using huggingface_hub
d490501 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.26763013515321826,
"eval_steps": 500,
"global_step": 250,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"completion_length": 166.625,
"epoch": 0.001070520540612873,
"grad_norm": 1.2984755039215088,
"kl": 0.0,
"learning_rate": 5.319148936170213e-08,
"loss": -0.0,
"reward": 0.24379686824977398,
"reward_std": 0.43905802024528384,
"rewards/correctness_reward_func": 0.1875,
"rewards/int_reward_func": 0.0546875,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.0016093750018626451,
"step": 1
},
{
"completion_length": 155.09375,
"epoch": 0.002141041081225746,
"grad_norm": 5.433530330657959,
"kl": 0.0,
"learning_rate": 1.0638297872340426e-07,
"loss": -0.0,
"reward": 0.7448437176644802,
"reward_std": 0.824664918705821,
"rewards/correctness_reward_func": 0.59375,
"rewards/int_reward_func": 0.1640625,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.012968750204890966,
"step": 2
},
{
"completion_length": 156.65625,
"epoch": 0.003211561621838619,
"grad_norm": 1.5975662469863892,
"kl": 0.0003120364726783009,
"learning_rate": 1.5957446808510638e-07,
"loss": 0.0,
"reward": 0.5617187460884452,
"reward_std": 0.6682680626399815,
"rewards/correctness_reward_func": 0.40625,
"rewards/int_reward_func": 0.1328125,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.02265625004656613,
"step": 3
},
{
"completion_length": 151.328125,
"epoch": 0.004282082162451492,
"grad_norm": 1.3767573833465576,
"kl": 0.00039493154326919466,
"learning_rate": 2.1276595744680852e-07,
"loss": 0.0,
"reward": 0.14101563091389835,
"reward_std": 0.40191352693364024,
"rewards/correctness_reward_func": 0.125,
"rewards/int_reward_func": 0.0546875,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.038671874441206455,
"step": 4
},
{
"completion_length": 143.8125,
"epoch": 0.005352602703064365,
"grad_norm": 1.7029815912246704,
"kl": 0.0003161150925734546,
"learning_rate": 2.6595744680851066e-07,
"loss": 0.0,
"reward": 0.6667499775066972,
"reward_std": 0.943404046818614,
"rewards/correctness_reward_func": 0.5,
"rewards/int_reward_func": 0.1328125,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.03393749863607809,
"step": 5
},
{
"completion_length": 146.5625,
"epoch": 0.006423123243677238,
"grad_norm": 4.5992960929870605,
"kl": 0.0014150730130495504,
"learning_rate": 3.1914893617021275e-07,
"loss": 0.0001,
"reward": 0.6142812594771385,
"reward_std": 0.8342159832827747,
"rewards/correctness_reward_func": 0.46875,
"rewards/int_reward_func": 0.1328125,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.01271874993108213,
"step": 6
},
{
"completion_length": 163.9375,
"epoch": 0.007493643784290111,
"grad_norm": 7.240438461303711,
"kl": 0.0018134960264433175,
"learning_rate": 3.723404255319149e-07,
"loss": 0.0001,
"reward": 0.24757812730967999,
"reward_std": 0.5592395211569965,
"rewards/correctness_reward_func": 0.1875,
"rewards/int_reward_func": 0.0703125,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.010234375484287739,
"step": 7
},
{
"completion_length": 151.84375,
"epoch": 0.008564164324902984,
"grad_norm": 4.826539993286133,
"kl": 0.0011592731952987378,
"learning_rate": 4.2553191489361704e-07,
"loss": 0.0,
"reward": 0.24482813104987144,
"reward_std": 0.4538201582618058,
"rewards/correctness_reward_func": 0.15625,
"rewards/int_reward_func": 0.046875,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.04170312359929085,
"step": 8
},
{
"completion_length": 148.828125,
"epoch": 0.009634684865515858,
"grad_norm": 4.097943305969238,
"kl": 0.0009884996707114624,
"learning_rate": 4.787234042553192e-07,
"loss": 0.0,
"reward": 0.6986562423408031,
"reward_std": 1.089426226913929,
"rewards/correctness_reward_func": 0.5625,
"rewards/int_reward_func": 0.1171875,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.018968748801853508,
"step": 9
},
{
"completion_length": 160.3125,
"epoch": 0.01070520540612873,
"grad_norm": 3.2978594303131104,
"kl": 0.0008196280577976722,
"learning_rate": 5.319148936170213e-07,
"loss": 0.0,
"reward": 0.48834376223385334,
"reward_std": 0.790836479049176,
"rewards/correctness_reward_func": 0.375,
"rewards/int_reward_func": 0.1171875,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.003843750571832061,
"step": 10
},
{
"completion_length": 146.96875,
"epoch": 0.011775725946741603,
"grad_norm": 3.537848472595215,
"kl": 0.0007083387099555694,
"learning_rate": 5.851063829787235e-07,
"loss": 0.0,
"reward": 0.5805312437005341,
"reward_std": 0.7569947894662619,
"rewards/correctness_reward_func": 0.46875,
"rewards/int_reward_func": 0.1328125,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.021031250711530447,
"step": 11
},
{
"completion_length": 145.890625,
"epoch": 0.012846246487354477,
"grad_norm": 3.8045260906219482,
"kl": 0.0013589818336186,
"learning_rate": 6.382978723404255e-07,
"loss": 0.0001,
"reward": 0.46695311937946826,
"reward_std": 0.6352905407547951,
"rewards/correctness_reward_func": 0.3125,
"rewards/int_reward_func": 0.109375,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.04507812508381903,
"step": 12
},
{
"completion_length": 146.5,
"epoch": 0.013916767027967349,
"grad_norm": 1.7494301795959473,
"kl": 0.000400338125473354,
"learning_rate": 6.914893617021278e-07,
"loss": 0.0,
"reward": 0.5322968787513673,
"reward_std": 0.7736401874572039,
"rewards/correctness_reward_func": 0.40625,
"rewards/int_reward_func": 0.1171875,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.00885937490966171,
"step": 13
},
{
"completion_length": 133.671875,
"epoch": 0.014987287568580221,
"grad_norm": 1.8054606914520264,
"kl": 0.00036212212944519706,
"learning_rate": 7.446808510638298e-07,
"loss": 0.0,
"reward": 0.6521718641743064,
"reward_std": 1.0104734068736434,
"rewards/correctness_reward_func": 0.53125,
"rewards/int_reward_func": 0.1328125,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.011890625639352947,
"step": 14
},
{
"completion_length": 130.0,
"epoch": 0.016057808109193095,
"grad_norm": 1.5193248987197876,
"kl": 0.00036491416904027574,
"learning_rate": 7.97872340425532e-07,
"loss": 0.0,
"reward": 0.5991093763150275,
"reward_std": 0.8120721196755767,
"rewards/correctness_reward_func": 0.46875,
"rewards/int_reward_func": 0.125,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.005359375616535544,
"step": 15
},
{
"completion_length": 162.640625,
"epoch": 0.017128328649805968,
"grad_norm": 4.938427448272705,
"kl": 0.0014603480958612636,
"learning_rate": 8.510638297872341e-07,
"loss": 0.0001,
"reward": 0.2470156280323863,
"reward_std": 0.5583461234346032,
"rewards/correctness_reward_func": 0.1875,
"rewards/int_reward_func": 0.0703125,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.010796874179504812,
"step": 16
},
{
"completion_length": 165.796875,
"epoch": 0.01819884919041884,
"grad_norm": 3.923428535461426,
"kl": 0.001025654159093392,
"learning_rate": 9.042553191489363e-07,
"loss": 0.0,
"reward": 0.5893437387421727,
"reward_std": 0.8829143429175019,
"rewards/correctness_reward_func": 0.46875,
"rewards/int_reward_func": 0.125,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.004406251944601536,
"step": 17
},
{
"completion_length": 152.359375,
"epoch": 0.019269369731031716,
"grad_norm": 5.248744487762451,
"kl": 0.0020197606609144714,
"learning_rate": 9.574468085106384e-07,
"loss": 0.0001,
"reward": 0.46303125098347664,
"reward_std": 0.9115365371108055,
"rewards/correctness_reward_func": 0.34375,
"rewards/int_reward_func": 0.1015625,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.017718749470077455,
"step": 18
},
{
"completion_length": 151.359375,
"epoch": 0.020339890271644588,
"grad_norm": 1.7978895902633667,
"kl": 0.00032554956487729214,
"learning_rate": 1.0106382978723404e-06,
"loss": 0.0,
"reward": 0.48937500442843884,
"reward_std": 0.7031663246452808,
"rewards/correctness_reward_func": 0.375,
"rewards/int_reward_func": 0.109375,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.005000000121071935,
"step": 19
},
{
"completion_length": 175.90625,
"epoch": 0.02141041081225746,
"grad_norm": 2.1679224967956543,
"kl": 0.0004576210667437408,
"learning_rate": 1.0638297872340427e-06,
"loss": 0.0,
"reward": 0.44935936853289604,
"reward_std": 0.6872247559949756,
"rewards/correctness_reward_func": 0.34375,
"rewards/int_reward_func": 0.09375,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.011859375052154064,
"step": 20
},
{
"completion_length": 167.265625,
"epoch": 0.022480931352870333,
"grad_norm": 1.475581407546997,
"kl": 0.0007880991906858981,
"learning_rate": 1.1170212765957447e-06,
"loss": 0.0,
"reward": 0.28507812274619937,
"reward_std": 0.5523091573268175,
"rewards/correctness_reward_func": 0.21875,
"rewards/int_reward_func": 0.0625,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.003828124259598553,
"step": 21
},
{
"completion_length": 153.296875,
"epoch": 0.023551451893483205,
"grad_norm": 1.6419923305511475,
"kl": 0.00039223546627908945,
"learning_rate": 1.170212765957447e-06,
"loss": 0.0,
"reward": 0.6624062322080135,
"reward_std": 0.9240671265870333,
"rewards/correctness_reward_func": 0.5,
"rewards/int_reward_func": 0.1328125,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.0295937517657876,
"step": 22
},
{
"completion_length": 143.515625,
"epoch": 0.02462197243409608,
"grad_norm": 1.3294110298156738,
"kl": 0.00030555322518921457,
"learning_rate": 1.223404255319149e-06,
"loss": 0.0,
"reward": 0.26720312132965773,
"reward_std": 0.4894332850817591,
"rewards/correctness_reward_func": 0.1875,
"rewards/int_reward_func": 0.0546875,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.025015624007210135,
"step": 23
},
{
"completion_length": 148.3125,
"epoch": 0.025692492974708953,
"grad_norm": 6.915380954742432,
"kl": 0.0062694076787011,
"learning_rate": 1.276595744680851e-06,
"loss": 0.0003,
"reward": 0.28068749560043216,
"reward_std": 0.5439753192476928,
"rewards/correctness_reward_func": 0.1875,
"rewards/int_reward_func": 0.0625,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.030687499791383743,
"step": 24
},
{
"completion_length": 145.421875,
"epoch": 0.026763013515321826,
"grad_norm": 1.5796961784362793,
"kl": 0.0008160970010067103,
"learning_rate": 1.3297872340425533e-06,
"loss": 0.0,
"reward": 0.604515643324703,
"reward_std": 0.6846362175419927,
"rewards/correctness_reward_func": 0.4375,
"rewards/int_reward_func": 0.1171875,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.049828124698251486,
"step": 25
},
{
"completion_length": 144.21875,
"epoch": 0.027833534055934698,
"grad_norm": 3.3064963817596436,
"kl": 0.0009157936146948487,
"learning_rate": 1.3829787234042555e-06,
"loss": 0.0,
"reward": 0.5731093874201179,
"reward_std": 0.8484273846261203,
"rewards/correctness_reward_func": 0.4375,
"rewards/int_reward_func": 0.1171875,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.018421874032355845,
"step": 26
},
{
"completion_length": 135.921875,
"epoch": 0.02890405459654757,
"grad_norm": 1.8949307203292847,
"kl": 0.0004942502673657145,
"learning_rate": 1.4361702127659578e-06,
"loss": 0.0,
"reward": 0.6309375101700425,
"reward_std": 0.9806207492947578,
"rewards/correctness_reward_func": 0.5,
"rewards/int_reward_func": 0.125,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.005937499925494194,
"step": 27
},
{
"completion_length": 153.59375,
"epoch": 0.029974575137160443,
"grad_norm": 1.7615420818328857,
"kl": 0.0004919220991723705,
"learning_rate": 1.4893617021276596e-06,
"loss": 0.0,
"reward": 0.1835937526775524,
"reward_std": 0.45627398509532213,
"rewards/correctness_reward_func": 0.15625,
"rewards/int_reward_func": 0.0546875,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.027343749883584678,
"step": 28
},
{
"completion_length": 164.640625,
"epoch": 0.03104509567777332,
"grad_norm": 4.3193230628967285,
"kl": 0.0013858377351425588,
"learning_rate": 1.5425531914893618e-06,
"loss": 0.0001,
"reward": 0.40034375386312604,
"reward_std": 0.7546388749033213,
"rewards/correctness_reward_func": 0.28125,
"rewards/int_reward_func": 0.109375,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.009718750137835741,
"step": 29
},
{
"completion_length": 128.234375,
"epoch": 0.03211561621838619,
"grad_norm": 5.113959312438965,
"kl": 0.0021859680928173475,
"learning_rate": 1.595744680851064e-06,
"loss": 0.0001,
"reward": 0.8881718653719872,
"reward_std": 0.9410315058194101,
"rewards/correctness_reward_func": 0.6875,
"rewards/int_reward_func": 0.1796875,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.02098437474342063,
"step": 30
},
{
"completion_length": 140.8125,
"epoch": 0.03318613675899906,
"grad_norm": 7.337815284729004,
"kl": 0.00254826245145523,
"learning_rate": 1.648936170212766e-06,
"loss": 0.0001,
"reward": 0.7087812423706055,
"reward_std": 0.9508242532610893,
"rewards/correctness_reward_func": 0.53125,
"rewards/int_reward_func": 0.15625,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.021281250286847353,
"step": 31
},
{
"completion_length": 141.0625,
"epoch": 0.034256657299611935,
"grad_norm": 4.055324554443359,
"kl": 0.0016690000156813767,
"learning_rate": 1.7021276595744682e-06,
"loss": 0.0001,
"reward": 0.7299062572419643,
"reward_std": 0.8444140013307333,
"rewards/correctness_reward_func": 0.5625,
"rewards/int_reward_func": 0.1328125,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.03459375072270632,
"step": 32
},
{
"completion_length": 154.296875,
"epoch": 0.03532717784022481,
"grad_norm": 3.565863847732544,
"kl": 0.0011733200299204327,
"learning_rate": 1.7553191489361704e-06,
"loss": 0.0,
"reward": 0.7152343707857653,
"reward_std": 0.986182201653719,
"rewards/correctness_reward_func": 0.5625,
"rewards/int_reward_func": 0.1640625,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.011328124441206455,
"step": 33
},
{
"completion_length": 156.484375,
"epoch": 0.03639769838083768,
"grad_norm": 2.782845973968506,
"kl": 0.001061345017660642,
"learning_rate": 1.8085106382978727e-06,
"loss": 0.0,
"reward": 0.4085781138855964,
"reward_std": 0.6946883676573634,
"rewards/correctness_reward_func": 0.3125,
"rewards/int_reward_func": 0.09375,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.0023281261092051864,
"step": 34
},
{
"completion_length": 166.203125,
"epoch": 0.03746821892145055,
"grad_norm": 1.6768676042556763,
"kl": 0.0008204600453609601,
"learning_rate": 1.8617021276595745e-06,
"loss": 0.0,
"reward": 0.5197031321004033,
"reward_std": 0.8639188781380653,
"rewards/correctness_reward_func": 0.40625,
"rewards/int_reward_func": 0.1171875,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.0037343755830079317,
"step": 35
},
{
"completion_length": 168.828125,
"epoch": 0.03853873946206343,
"grad_norm": 7.892724990844727,
"kl": 0.0029609855264425278,
"learning_rate": 1.9148936170212767e-06,
"loss": 0.0001,
"reward": 0.23489062942098826,
"reward_std": 0.356358939781785,
"rewards/correctness_reward_func": 0.15625,
"rewards/int_reward_func": 0.046875,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.031765625230036676,
"step": 36
},
{
"completion_length": 123.53125,
"epoch": 0.039609260002676304,
"grad_norm": 2.5262649059295654,
"kl": 0.0010402118496131152,
"learning_rate": 1.968085106382979e-06,
"loss": 0.0,
"reward": 0.9631250270176679,
"reward_std": 1.0183926988393068,
"rewards/correctness_reward_func": 0.71875,
"rewards/int_reward_func": 0.1796875,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.06468750000931323,
"step": 37
},
{
"completion_length": 159.46875,
"epoch": 0.040679780543289176,
"grad_norm": 1.3111544847488403,
"kl": 0.0011480498651508242,
"learning_rate": 2.021276595744681e-06,
"loss": 0.0,
"reward": 0.4472187543287873,
"reward_std": 0.6874936055392027,
"rewards/correctness_reward_func": 0.34375,
"rewards/int_reward_func": 0.09375,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.009718748508021235,
"step": 38
},
{
"completion_length": 142.25,
"epoch": 0.04175030108390205,
"grad_norm": 2.2276878356933594,
"kl": 0.0014690053576487117,
"learning_rate": 2.074468085106383e-06,
"loss": 0.0001,
"reward": 1.0481719109229743,
"reward_std": 0.7983668614178896,
"rewards/correctness_reward_func": 0.8125,
"rewards/int_reward_func": 0.1796875,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.05598437529988587,
"step": 39
},
{
"completion_length": 149.71875,
"epoch": 0.04282082162451492,
"grad_norm": 3.2188539505004883,
"kl": 0.002757413443760015,
"learning_rate": 2.1276595744680853e-06,
"loss": 0.0001,
"reward": 0.8439687644131482,
"reward_std": 0.9366709599271417,
"rewards/correctness_reward_func": 0.625,
"rewards/int_reward_func": 0.171875,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.04709374951198697,
"step": 40
},
{
"completion_length": 153.75,
"epoch": 0.04389134216512779,
"grad_norm": 1.609779953956604,
"kl": 0.002587423972727265,
"learning_rate": 2.1808510638297876e-06,
"loss": 0.0001,
"reward": 0.9616250339895487,
"reward_std": 0.8783023115247488,
"rewards/correctness_reward_func": 0.75,
"rewards/int_reward_func": 0.1953125,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.01631249929778278,
"step": 41
},
{
"completion_length": 156.15625,
"epoch": 0.044961862705740666,
"grad_norm": 5.789142608642578,
"kl": 0.0045223182532936335,
"learning_rate": 2.2340425531914894e-06,
"loss": 0.0002,
"reward": 0.8380624754354358,
"reward_std": 0.881235895678401,
"rewards/correctness_reward_func": 0.625,
"rewards/int_reward_func": 0.1796875,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.03337499825283885,
"step": 42
},
{
"completion_length": 161.390625,
"epoch": 0.04603238324635354,
"grad_norm": 5.589737892150879,
"kl": 0.005504744782228954,
"learning_rate": 2.2872340425531916e-06,
"loss": 0.0002,
"reward": 0.8749218666926026,
"reward_std": 0.9216371476650238,
"rewards/correctness_reward_func": 0.6875,
"rewards/int_reward_func": 0.1796875,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.007734375540167093,
"step": 43
},
{
"completion_length": 161.796875,
"epoch": 0.04710290378696641,
"grad_norm": 5.604761600494385,
"kl": 0.0046821657015243545,
"learning_rate": 2.340425531914894e-06,
"loss": 0.0002,
"reward": 0.2252656314522028,
"reward_std": 0.47946364153176546,
"rewards/correctness_reward_func": 0.15625,
"rewards/int_reward_func": 0.0546875,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.014328125165775418,
"step": 44
},
{
"completion_length": 154.453125,
"epoch": 0.04817342432757928,
"grad_norm": 9.506386756896973,
"kl": 0.007164878101320937,
"learning_rate": 2.393617021276596e-06,
"loss": 0.0003,
"reward": 0.6195624829269946,
"reward_std": 1.0045473407953978,
"rewards/correctness_reward_func": 0.46875,
"rewards/int_reward_func": 0.1328125,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.01799999945797026,
"step": 45
},
{
"completion_length": 140.390625,
"epoch": 0.04924394486819216,
"grad_norm": 3.766388416290283,
"kl": 0.00619677483337,
"learning_rate": 2.446808510638298e-06,
"loss": 0.0002,
"reward": 0.9423750173300505,
"reward_std": 0.5746848955750465,
"rewards/correctness_reward_func": 0.65625,
"rewards/int_reward_func": 0.203125,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.08299999847076833,
"step": 46
},
{
"completion_length": 143.09375,
"epoch": 0.050314465408805034,
"grad_norm": 4.4466705322265625,
"kl": 0.009431202546693385,
"learning_rate": 2.5e-06,
"loss": 0.0004,
"reward": 0.9380937227979302,
"reward_std": 0.658873830921948,
"rewards/correctness_reward_func": 0.6875,
"rewards/int_reward_func": 0.1796875,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.07090624794363976,
"step": 47
},
{
"completion_length": 165.0625,
"epoch": 0.051384985949417906,
"grad_norm": 1.3459303379058838,
"kl": 0.00600922666490078,
"learning_rate": 2.553191489361702e-06,
"loss": 0.0002,
"reward": 0.6878281269455329,
"reward_std": 0.9325386872515082,
"rewards/correctness_reward_func": 0.5,
"rewards/int_reward_func": 0.15625,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.03157812531571835,
"step": 48
},
{
"completion_length": 134.421875,
"epoch": 0.05245550649003078,
"grad_norm": 1.628507375717163,
"kl": 0.008930853742640465,
"learning_rate": 2.6063829787234047e-06,
"loss": 0.0004,
"reward": 0.8599374926416203,
"reward_std": 0.9755922555923462,
"rewards/correctness_reward_func": 0.625,
"rewards/int_reward_func": 0.1796875,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.05525000113993883,
"step": 49
},
{
"completion_length": 149.5625,
"epoch": 0.05352602703064365,
"grad_norm": 1.1767226457595825,
"kl": 0.012399342958815396,
"learning_rate": 2.6595744680851065e-06,
"loss": 0.0005,
"reward": 1.0091875102370977,
"reward_std": 0.8712345249950886,
"rewards/correctness_reward_func": 0.71875,
"rewards/int_reward_func": 0.203125,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.08731249999254942,
"step": 50
},
{
"completion_length": 169.078125,
"epoch": 0.05459654757125652,
"grad_norm": 2.5901050567626953,
"kl": 0.007969769008923322,
"learning_rate": 2.7127659574468084e-06,
"loss": 0.0003,
"reward": 0.9503125064074993,
"reward_std": 0.9690856691449881,
"rewards/correctness_reward_func": 0.6875,
"rewards/int_reward_func": 0.234375,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.028437498025596142,
"step": 51
},
{
"completion_length": 166.390625,
"epoch": 0.055667068111869396,
"grad_norm": 3.764841079711914,
"kl": 0.011693944863509387,
"learning_rate": 2.765957446808511e-06,
"loss": 0.0005,
"reward": 0.7889687474817038,
"reward_std": 0.8161248974502087,
"rewards/correctness_reward_func": 0.5625,
"rewards/int_reward_func": 0.1875,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.03896875097416341,
"step": 52
},
{
"completion_length": 158.3125,
"epoch": 0.05673758865248227,
"grad_norm": 4.34719705581665,
"kl": 0.00852247714647092,
"learning_rate": 2.819148936170213e-06,
"loss": 0.0003,
"reward": 0.5854531275108457,
"reward_std": 0.579142062459141,
"rewards/correctness_reward_func": 0.4375,
"rewards/int_reward_func": 0.1484375,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.00048437435179948807,
"step": 53
},
{
"completion_length": 149.15625,
"epoch": 0.05780810919309514,
"grad_norm": 1.3242722749710083,
"kl": 0.006340013263979927,
"learning_rate": 2.8723404255319155e-06,
"loss": 0.0003,
"reward": 0.7202812694013119,
"reward_std": 0.6025513117201626,
"rewards/correctness_reward_func": 0.5,
"rewards/int_reward_func": 0.140625,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.07965625170618296,
"step": 54
},
{
"completion_length": 131.0,
"epoch": 0.05887862973370801,
"grad_norm": 1.2887805700302124,
"kl": 0.005301086028339341,
"learning_rate": 2.9255319148936174e-06,
"loss": 0.0002,
"reward": 1.1997031308710575,
"reward_std": 0.8454109290614724,
"rewards/correctness_reward_func": 0.875,
"rewards/int_reward_func": 0.21875,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.10595312505029142,
"step": 55
},
{
"completion_length": 132.828125,
"epoch": 0.059949150274320885,
"grad_norm": 3.085810899734497,
"kl": 0.03685568018408958,
"learning_rate": 2.978723404255319e-06,
"loss": 0.0015,
"reward": 1.32792192324996,
"reward_std": 0.8808077229186893,
"rewards/correctness_reward_func": 1.0,
"rewards/int_reward_func": 0.265625,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.062296870397403836,
"step": 56
},
{
"completion_length": 169.5,
"epoch": 0.061019670814933764,
"grad_norm": 3.218595266342163,
"kl": 0.019670582871185616,
"learning_rate": 3.031914893617022e-06,
"loss": 0.0008,
"reward": 1.0494843795895576,
"reward_std": 0.8383767995983362,
"rewards/correctness_reward_func": 0.78125,
"rewards/int_reward_func": 0.1953125,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.07292187376879156,
"step": 57
},
{
"completion_length": 164.71875,
"epoch": 0.06209019135554664,
"grad_norm": 1.189056634902954,
"kl": 0.010589714744128287,
"learning_rate": 3.0851063829787237e-06,
"loss": 0.0004,
"reward": 0.9995937808416784,
"reward_std": 0.8763523772358894,
"rewards/correctness_reward_func": 0.75,
"rewards/int_reward_func": 0.234375,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.015218749409541488,
"step": 58
},
{
"completion_length": 173.109375,
"epoch": 0.0631607118961595,
"grad_norm": 2.9426801204681396,
"kl": 0.010116680678038392,
"learning_rate": 3.1382978723404255e-06,
"loss": 0.0004,
"reward": 0.5431718821637332,
"reward_std": 0.5429769204929471,
"rewards/correctness_reward_func": 0.40625,
"rewards/int_reward_func": 0.1015625,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.03535937680862844,
"step": 59
},
{
"completion_length": 148.328125,
"epoch": 0.06423123243677238,
"grad_norm": 1.2039525508880615,
"kl": 0.012681124440860003,
"learning_rate": 3.191489361702128e-06,
"loss": 0.0005,
"reward": 0.7227343516424298,
"reward_std": 0.7870303755626082,
"rewards/correctness_reward_func": 0.53125,
"rewards/int_reward_func": 0.125,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.06648437259718776,
"step": 60
},
{
"completion_length": 125.5625,
"epoch": 0.06530175297738525,
"grad_norm": 1.2423548698425293,
"kl": 0.007948026963276789,
"learning_rate": 3.24468085106383e-06,
"loss": 0.0003,
"reward": 1.3104218831285834,
"reward_std": 0.6878003547899425,
"rewards/correctness_reward_func": 0.9375,
"rewards/int_reward_func": 0.25,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.12292187649291009,
"step": 61
},
{
"completion_length": 152.296875,
"epoch": 0.06637227351799813,
"grad_norm": 5.324446678161621,
"kl": 0.043227474874584004,
"learning_rate": 3.297872340425532e-06,
"loss": 0.0017,
"reward": 1.0643124831840396,
"reward_std": 0.9667724259197712,
"rewards/correctness_reward_func": 0.78125,
"rewards/int_reward_func": 0.2109375,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.07212500204332173,
"step": 62
},
{
"completion_length": 157.328125,
"epoch": 0.067442794058611,
"grad_norm": 1.2027546167373657,
"kl": 0.004988896660506725,
"learning_rate": 3.3510638297872345e-06,
"loss": 0.0002,
"reward": 1.1987031551543623,
"reward_std": 0.8610715009272099,
"rewards/correctness_reward_func": 0.90625,
"rewards/int_reward_func": 0.2578125,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.034640624886378646,
"step": 63
},
{
"completion_length": 163.9375,
"epoch": 0.06851331459922387,
"grad_norm": 1.126978874206543,
"kl": 0.004329273069743067,
"learning_rate": 3.4042553191489363e-06,
"loss": 0.0002,
"reward": 0.8524375010747463,
"reward_std": 0.742443086579442,
"rewards/correctness_reward_func": 0.65625,
"rewards/int_reward_func": 0.171875,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.024312500143423676,
"step": 64
},
{
"completion_length": 135.765625,
"epoch": 0.06958383513983675,
"grad_norm": 4.760307788848877,
"kl": 0.0423761896090582,
"learning_rate": 3.457446808510639e-06,
"loss": 0.0017,
"reward": 1.4641093388199806,
"reward_std": 1.0102775804698467,
"rewards/correctness_reward_func": 1.0625,
"rewards/int_reward_func": 0.296875,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.1047343765385449,
"step": 65
},
{
"completion_length": 155.796875,
"epoch": 0.07065435568044962,
"grad_norm": 3.1110665798187256,
"kl": 0.061538238427601755,
"learning_rate": 3.510638297872341e-06,
"loss": 0.0025,
"reward": 1.033312514424324,
"reward_std": 0.8865859052166343,
"rewards/correctness_reward_func": 0.78125,
"rewards/int_reward_func": 0.203125,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.048937500920146704,
"step": 66
},
{
"completion_length": 164.578125,
"epoch": 0.0717248762210625,
"grad_norm": 1.2862604856491089,
"kl": 0.0052650388242909685,
"learning_rate": 3.5638297872340426e-06,
"loss": 0.0002,
"reward": 0.7733906293287873,
"reward_std": 0.8027388863265514,
"rewards/correctness_reward_func": 0.59375,
"rewards/int_reward_func": 0.1640625,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.015578125370666385,
"step": 67
},
{
"completion_length": 163.75,
"epoch": 0.07279539676167536,
"grad_norm": 6.033888816833496,
"kl": 0.07815390304313041,
"learning_rate": 3.6170212765957453e-06,
"loss": 0.0031,
"reward": 1.1917500102426857,
"reward_std": 0.7897173650562763,
"rewards/correctness_reward_func": 0.90625,
"rewards/int_reward_func": 0.265625,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.019875000230968,
"step": 68
},
{
"completion_length": 149.40625,
"epoch": 0.07386591730228824,
"grad_norm": 1.29887056350708,
"kl": 0.01987961767008528,
"learning_rate": 3.670212765957447e-06,
"loss": 0.0008,
"reward": 1.1079531812574714,
"reward_std": 0.7181853111833334,
"rewards/correctness_reward_func": 0.78125,
"rewards/int_reward_func": 0.265625,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.06107812421396375,
"step": 69
},
{
"completion_length": 141.859375,
"epoch": 0.0749364378429011,
"grad_norm": 5.329657077789307,
"kl": 0.0800003606127575,
"learning_rate": 3.723404255319149e-06,
"loss": 0.0032,
"reward": 0.9977656248956919,
"reward_std": 0.5876570995897055,
"rewards/correctness_reward_func": 0.6875,
"rewards/int_reward_func": 0.21875,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.09151562419719994,
"step": 70
},
{
"completion_length": 138.5625,
"epoch": 0.07600695838351398,
"grad_norm": 2.7627108097076416,
"kl": 0.03701730686589144,
"learning_rate": 3.7765957446808516e-06,
"loss": 0.0015,
"reward": 1.2351874904707074,
"reward_std": 0.8267962019890547,
"rewards/correctness_reward_func": 0.90625,
"rewards/int_reward_func": 0.25,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.07893750071525574,
"step": 71
},
{
"completion_length": 133.984375,
"epoch": 0.07707747892412686,
"grad_norm": 1.414104700088501,
"kl": 0.016362678608857095,
"learning_rate": 3.8297872340425535e-06,
"loss": 0.0007,
"reward": 1.827203094959259,
"reward_std": 1.0086707267910242,
"rewards/correctness_reward_func": 1.375,
"rewards/int_reward_func": 0.359375,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.0928281235974282,
"step": 72
},
{
"completion_length": 148.0625,
"epoch": 0.07814799946473973,
"grad_norm": 1.3797897100448608,
"kl": 0.010796019807457924,
"learning_rate": 3.882978723404256e-06,
"loss": 0.0004,
"reward": 1.1499999817460775,
"reward_std": 0.9028994599357247,
"rewards/correctness_reward_func": 0.84375,
"rewards/int_reward_func": 0.2109375,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.09531250316649675,
"step": 73
},
{
"completion_length": 155.828125,
"epoch": 0.07921852000535261,
"grad_norm": 1.2884279489517212,
"kl": 0.017199350346345454,
"learning_rate": 3.936170212765958e-06,
"loss": 0.0007,
"reward": 1.0700937574729323,
"reward_std": 1.0012164115905762,
"rewards/correctness_reward_func": 0.8125,
"rewards/int_reward_func": 0.234375,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.023218751302920282,
"step": 74
},
{
"completion_length": 146.96875,
"epoch": 0.08028904054596547,
"grad_norm": 3.5342090129852295,
"kl": 0.05702902490156703,
"learning_rate": 3.98936170212766e-06,
"loss": 0.0023,
"reward": 1.2074843887239695,
"reward_std": 0.8127174219116569,
"rewards/correctness_reward_func": 0.875,
"rewards/int_reward_func": 0.2578125,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.07467187196016312,
"step": 75
},
{
"completion_length": 135.140625,
"epoch": 0.08135956108657835,
"grad_norm": 1.7845228910446167,
"kl": 0.01959521723620128,
"learning_rate": 4.042553191489362e-06,
"loss": 0.0008,
"reward": 1.4087968692183495,
"reward_std": 0.9582029562443495,
"rewards/correctness_reward_func": 1.03125,
"rewards/int_reward_func": 0.2734375,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.10410937643609941,
"step": 76
},
{
"completion_length": 126.125,
"epoch": 0.08243008162719122,
"grad_norm": 1.3039053678512573,
"kl": 0.008768495463300496,
"learning_rate": 4.095744680851064e-06,
"loss": 0.0004,
"reward": 1.3187343887984753,
"reward_std": 0.5690027270466089,
"rewards/correctness_reward_func": 0.9375,
"rewards/int_reward_func": 0.2578125,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.12342187319882214,
"step": 77
},
{
"completion_length": 148.6875,
"epoch": 0.0835006021678041,
"grad_norm": 3.3175690174102783,
"kl": 0.014638990571256727,
"learning_rate": 4.148936170212766e-06,
"loss": 0.0006,
"reward": 1.4877656111493707,
"reward_std": 0.8957763649523258,
"rewards/correctness_reward_func": 1.09375,
"rewards/int_reward_func": 0.2890625,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.10495312558487058,
"step": 78
},
{
"completion_length": 171.0625,
"epoch": 0.08457112270841696,
"grad_norm": 5.574616432189941,
"kl": 0.01483242801623419,
"learning_rate": 4.202127659574468e-06,
"loss": 0.0006,
"reward": 1.0053124725818634,
"reward_std": 1.1163944154977798,
"rewards/correctness_reward_func": 0.75,
"rewards/int_reward_func": 0.2265625,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.02874999982304871,
"step": 79
},
{
"completion_length": 150.5625,
"epoch": 0.08564164324902984,
"grad_norm": 3.4894111156463623,
"kl": 0.020942480681696907,
"learning_rate": 4.255319148936171e-06,
"loss": 0.0008,
"reward": 1.081656239926815,
"reward_std": 0.8474766900762916,
"rewards/correctness_reward_func": 0.75,
"rewards/int_reward_func": 0.234375,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.09728124877437949,
"step": 80
},
{
"completion_length": 145.8125,
"epoch": 0.08671216378964271,
"grad_norm": 4.479030609130859,
"kl": 0.02067101007560268,
"learning_rate": 4.308510638297873e-06,
"loss": 0.0008,
"reward": 0.9469531225040555,
"reward_std": 0.5945024443790317,
"rewards/correctness_reward_func": 0.625,
"rewards/int_reward_func": 0.2421875,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.07976562529802322,
"step": 81
},
{
"completion_length": 142.609375,
"epoch": 0.08778268433025559,
"grad_norm": 9.429178237915039,
"kl": 0.032599265803582966,
"learning_rate": 4.361702127659575e-06,
"loss": 0.0013,
"reward": 0.7288906406611204,
"reward_std": 0.8307479582726955,
"rewards/correctness_reward_func": 0.46875,
"rewards/int_reward_func": 0.171875,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.08826562575995922,
"step": 82
},
{
"completion_length": 143.84375,
"epoch": 0.08885320487086847,
"grad_norm": 1.5900259017944336,
"kl": 0.013988179998705164,
"learning_rate": 4.414893617021277e-06,
"loss": 0.0006,
"reward": 1.591531228274107,
"reward_std": 0.8249969203025103,
"rewards/correctness_reward_func": 1.25,
"rewards/int_reward_func": 0.3203125,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.021218748996034265,
"step": 83
},
{
"completion_length": 135.8125,
"epoch": 0.08992372541148133,
"grad_norm": 4.648512363433838,
"kl": 0.06424052006332204,
"learning_rate": 4.468085106382979e-06,
"loss": 0.0026,
"reward": 1.4651249905582517,
"reward_std": 0.8705566665157676,
"rewards/correctness_reward_func": 1.0625,
"rewards/int_reward_func": 0.2890625,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.11356249963864684,
"step": 84
},
{
"completion_length": 123.90625,
"epoch": 0.09099424595209421,
"grad_norm": 6.136663913726807,
"kl": 0.02576264040544629,
"learning_rate": 4.521276595744681e-06,
"loss": 0.001,
"reward": 1.7882343754172325,
"reward_std": 0.7715246099978685,
"rewards/correctness_reward_func": 1.3125,
"rewards/int_reward_func": 0.3671875,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.10854687169194221,
"step": 85
},
{
"completion_length": 132.65625,
"epoch": 0.09206476649270708,
"grad_norm": 2.7052152156829834,
"kl": 0.021323165216017514,
"learning_rate": 4.574468085106383e-06,
"loss": 0.0009,
"reward": 1.2060781214386225,
"reward_std": 0.74014887586236,
"rewards/correctness_reward_func": 0.84375,
"rewards/int_reward_func": 0.25,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.11232812539674342,
"step": 86
},
{
"completion_length": 142.3125,
"epoch": 0.09313528703331996,
"grad_norm": 6.834721088409424,
"kl": 0.15714708802988753,
"learning_rate": 4.6276595744680855e-06,
"loss": 0.0063,
"reward": 1.4907656013965607,
"reward_std": 1.1851906776428223,
"rewards/correctness_reward_func": 1.09375,
"rewards/int_reward_func": 0.3203125,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.07670312328264117,
"step": 87
},
{
"completion_length": 141.625,
"epoch": 0.09420580757393282,
"grad_norm": 1.5215015411376953,
"kl": 0.012561204843223095,
"learning_rate": 4.680851063829788e-06,
"loss": 0.0005,
"reward": 0.973562479019165,
"reward_std": 0.8407629178836942,
"rewards/correctness_reward_func": 0.6875,
"rewards/int_reward_func": 0.2109375,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.07512499787844718,
"step": 88
},
{
"completion_length": 131.109375,
"epoch": 0.0952763281145457,
"grad_norm": 4.186502456665039,
"kl": 0.029811605345457792,
"learning_rate": 4.73404255319149e-06,
"loss": 0.0012,
"reward": 1.571968775242567,
"reward_std": 0.8088337788358331,
"rewards/correctness_reward_func": 1.125,
"rewards/int_reward_func": 0.3125,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.13446875009685755,
"step": 89
},
{
"completion_length": 137.5625,
"epoch": 0.09634684865515857,
"grad_norm": 3.539090633392334,
"kl": 0.025141435849945992,
"learning_rate": 4.787234042553192e-06,
"loss": 0.001,
"reward": 1.1439843773841858,
"reward_std": 1.1783022359013557,
"rewards/correctness_reward_func": 0.8125,
"rewards/int_reward_func": 0.2421875,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.08929687412455678,
"step": 90
},
{
"completion_length": 142.0,
"epoch": 0.09741736919577144,
"grad_norm": 3.9599854946136475,
"kl": 0.03133453679038212,
"learning_rate": 4.840425531914894e-06,
"loss": 0.0013,
"reward": 0.9123750082217157,
"reward_std": 0.8763793092221022,
"rewards/correctness_reward_func": 0.65625,
"rewards/int_reward_func": 0.21875,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.037375001003965735,
"step": 91
},
{
"completion_length": 127.0625,
"epoch": 0.09848788973638432,
"grad_norm": 5.762697219848633,
"kl": 0.02441024547442794,
"learning_rate": 4.893617021276596e-06,
"loss": 0.001,
"reward": 1.4343437626957893,
"reward_std": 1.0151535924524069,
"rewards/correctness_reward_func": 1.0,
"rewards/int_reward_func": 0.328125,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.10621875151991844,
"step": 92
},
{
"completion_length": 128.421875,
"epoch": 0.09955841027699719,
"grad_norm": 1.7006865739822388,
"kl": 0.014295783417765051,
"learning_rate": 4.946808510638298e-06,
"loss": 0.0006,
"reward": 1.4152031522244215,
"reward_std": 1.0377220567315817,
"rewards/correctness_reward_func": 1.03125,
"rewards/int_reward_func": 0.265625,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.11832812381908298,
"step": 93
},
{
"completion_length": 126.703125,
"epoch": 0.10062893081761007,
"grad_norm": 2.6587624549865723,
"kl": 0.022590334410779178,
"learning_rate": 5e-06,
"loss": 0.0009,
"reward": 1.0790781378746033,
"reward_std": 0.8811032259836793,
"rewards/correctness_reward_func": 0.75,
"rewards/int_reward_func": 0.203125,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.12595312716439366,
"step": 94
},
{
"completion_length": 134.25,
"epoch": 0.10169945135822293,
"grad_norm": 5.803914546966553,
"kl": 0.0693736044340767,
"learning_rate": 4.999982515602153e-06,
"loss": 0.0028,
"reward": 1.6232343390583992,
"reward_std": 1.171187661588192,
"rewards/correctness_reward_func": 1.1875,
"rewards/int_reward_func": 0.34375,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.09198437177110463,
"step": 95
},
{
"completion_length": 119.09375,
"epoch": 0.10276997189883581,
"grad_norm": 2.6233108043670654,
"kl": 0.025776030379347503,
"learning_rate": 4.999930062653175e-06,
"loss": 0.001,
"reward": 1.4469374530017376,
"reward_std": 0.717207751236856,
"rewards/correctness_reward_func": 1.03125,
"rewards/int_reward_func": 0.2890625,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.12662499537691474,
"step": 96
},
{
"completion_length": 135.625,
"epoch": 0.10384049243944868,
"grad_norm": 1.4691489934921265,
"kl": 0.018308754893951118,
"learning_rate": 4.999842641886752e-06,
"loss": 0.0007,
"reward": 1.7408281043171883,
"reward_std": 0.6604799125343561,
"rewards/correctness_reward_func": 1.3125,
"rewards/int_reward_func": 0.328125,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.10020312923006713,
"step": 97
},
{
"completion_length": 142.53125,
"epoch": 0.10491101298006156,
"grad_norm": 3.7322754859924316,
"kl": 0.031659536180086434,
"learning_rate": 4.999720254525684e-06,
"loss": 0.0013,
"reward": 1.3402031436562538,
"reward_std": 1.0035525700077415,
"rewards/correctness_reward_func": 0.96875,
"rewards/int_reward_func": 0.3203125,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.051140623865649104,
"step": 98
},
{
"completion_length": 136.859375,
"epoch": 0.10598153352067442,
"grad_norm": 1.445330023765564,
"kl": 0.016985120251774788,
"learning_rate": 4.999562902281866e-06,
"loss": 0.0007,
"reward": 1.4283437356352806,
"reward_std": 0.9263063753023744,
"rewards/correctness_reward_func": 1.0,
"rewards/int_reward_func": 0.296875,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.13146874867379665,
"step": 99
},
{
"completion_length": 145.234375,
"epoch": 0.1070520540612873,
"grad_norm": 4.298182010650635,
"kl": 0.03539641568204388,
"learning_rate": 4.999370587356267e-06,
"loss": 0.0014,
"reward": 1.5638906005769968,
"reward_std": 0.9018815001472831,
"rewards/correctness_reward_func": 1.15625,
"rewards/int_reward_func": 0.328125,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.07951562479138374,
"step": 100
},
{
"completion_length": 126.46875,
"epoch": 0.10812257460190017,
"grad_norm": 1.7051573991775513,
"kl": 0.0210904503474012,
"learning_rate": 4.999143312438893e-06,
"loss": 0.0008,
"reward": 2.032781273126602,
"reward_std": 0.882479477673769,
"rewards/correctness_reward_func": 1.5,
"rewards/int_reward_func": 0.4296875,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.10309374984353781,
"step": 101
},
{
"completion_length": 138.4375,
"epoch": 0.10919309514251305,
"grad_norm": 5.384690284729004,
"kl": 0.05343873624224216,
"learning_rate": 4.998881080708759e-06,
"loss": 0.0021,
"reward": 1.7520312666893005,
"reward_std": 1.0768068991601467,
"rewards/correctness_reward_func": 1.34375,
"rewards/int_reward_func": 0.3203125,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.08796874759718776,
"step": 102
},
{
"completion_length": 121.265625,
"epoch": 0.11026361568312593,
"grad_norm": 3.4453506469726562,
"kl": 0.054925739066675305,
"learning_rate": 4.998583895833834e-06,
"loss": 0.0022,
"reward": 1.6850781589746475,
"reward_std": 0.6663676341995597,
"rewards/correctness_reward_func": 1.125,
"rewards/int_reward_func": 0.4296875,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.1303906268440187,
"step": 103
},
{
"completion_length": 114.71875,
"epoch": 0.11133413622373879,
"grad_norm": 1.5185428857803345,
"kl": 0.022636244888417423,
"learning_rate": 4.998251761970997e-06,
"loss": 0.0009,
"reward": 1.7328437007963657,
"reward_std": 0.7276732774917036,
"rewards/correctness_reward_func": 1.25,
"rewards/int_reward_func": 0.34375,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.13909374829381704,
"step": 104
},
{
"completion_length": 114.359375,
"epoch": 0.11240465676435167,
"grad_norm": 4.903259754180908,
"kl": 0.0455300398170948,
"learning_rate": 4.997884683765977e-06,
"loss": 0.0018,
"reward": 2.070781234651804,
"reward_std": 0.5790006909519434,
"rewards/correctness_reward_func": 1.5,
"rewards/int_reward_func": 0.4296875,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.14109374955296516,
"step": 105
},
{
"completion_length": 117.578125,
"epoch": 0.11347517730496454,
"grad_norm": 7.719193458557129,
"kl": 0.07494375784881413,
"learning_rate": 4.997482666353287e-06,
"loss": 0.003,
"reward": 1.8749061971902847,
"reward_std": 0.762595918495208,
"rewards/correctness_reward_func": 1.3125,
"rewards/int_reward_func": 0.390625,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.1717812498100102,
"step": 106
},
{
"completion_length": 132.71875,
"epoch": 0.11454569784557742,
"grad_norm": 1.9178704023361206,
"kl": 0.026816099416464567,
"learning_rate": 4.997045715356153e-06,
"loss": 0.0011,
"reward": 1.4049999862909317,
"reward_std": 1.0492134541273117,
"rewards/correctness_reward_func": 0.9375,
"rewards/int_reward_func": 0.328125,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.13937499932944775,
"step": 107
},
{
"completion_length": 121.21875,
"epoch": 0.11561621838619028,
"grad_norm": 3.1541106700897217,
"kl": 0.04801671905443072,
"learning_rate": 4.9965738368864345e-06,
"loss": 0.0019,
"reward": 1.7343593537807465,
"reward_std": 0.9352071397006512,
"rewards/correctness_reward_func": 1.21875,
"rewards/int_reward_func": 0.359375,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.1562343705445528,
"step": 108
},
{
"completion_length": 134.40625,
"epoch": 0.11668673892680316,
"grad_norm": 3.4600815773010254,
"kl": 0.07537143386434764,
"learning_rate": 4.996067037544542e-06,
"loss": 0.003,
"reward": 1.6357812024652958,
"reward_std": 0.7928643207997084,
"rewards/correctness_reward_func": 1.15625,
"rewards/int_reward_func": 0.3359375,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.14359374903142452,
"step": 109
},
{
"completion_length": 121.25,
"epoch": 0.11775725946741603,
"grad_norm": 1.8081263303756714,
"kl": 0.03252584161236882,
"learning_rate": 4.995525324419338e-06,
"loss": 0.0013,
"reward": 1.6038124561309814,
"reward_std": 0.9315616749227047,
"rewards/correctness_reward_func": 1.09375,
"rewards/int_reward_func": 0.3671875,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.14287500269711018,
"step": 110
},
{
"completion_length": 113.734375,
"epoch": 0.1188277800080289,
"grad_norm": 3.427846670150757,
"kl": 0.029214507434517145,
"learning_rate": 4.994948705088047e-06,
"loss": 0.0012,
"reward": 1.7096093818545341,
"reward_std": 0.9403769830241799,
"rewards/correctness_reward_func": 1.1875,
"rewards/int_reward_func": 0.375,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.14710937440395355,
"step": 111
},
{
"completion_length": 138.140625,
"epoch": 0.11989830054864177,
"grad_norm": 5.425636291503906,
"kl": 0.11221261869650334,
"learning_rate": 4.99433718761614e-06,
"loss": 0.0045,
"reward": 1.4035156145691872,
"reward_std": 1.1017278581857681,
"rewards/correctness_reward_func": 0.90625,
"rewards/int_reward_func": 0.3515625,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.14570312481373549,
"step": 112
},
{
"completion_length": 128.828125,
"epoch": 0.12096882108925465,
"grad_norm": 9.655882835388184,
"kl": 0.14679548889398575,
"learning_rate": 4.993690780557232e-06,
"loss": 0.0059,
"reward": 1.5967968963086605,
"reward_std": 0.9262449182569981,
"rewards/correctness_reward_func": 1.09375,
"rewards/int_reward_func": 0.3359375,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.16710936930030584,
"step": 113
},
{
"completion_length": 114.3125,
"epoch": 0.12203934162986753,
"grad_norm": 2.9527218341827393,
"kl": 0.03957346314564347,
"learning_rate": 4.993009492952951e-06,
"loss": 0.0016,
"reward": 1.7140468880534172,
"reward_std": 0.9286471158266068,
"rewards/correctness_reward_func": 1.125,
"rewards/int_reward_func": 0.421875,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.16717187454923987,
"step": 114
},
{
"completion_length": 122.515625,
"epoch": 0.1231098621704804,
"grad_norm": 4.749152660369873,
"kl": 0.05118492292240262,
"learning_rate": 4.992293334332821e-06,
"loss": 0.002,
"reward": 1.9886562526226044,
"reward_std": 0.9419979229569435,
"rewards/correctness_reward_func": 1.4375,
"rewards/int_reward_func": 0.40625,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.14490625634789467,
"step": 115
},
{
"completion_length": 124.171875,
"epoch": 0.12418038271109327,
"grad_norm": 4.01917028427124,
"kl": 0.10346966434735805,
"learning_rate": 4.991542314714122e-06,
"loss": 0.0041,
"reward": 1.837890625,
"reward_std": 0.912613769993186,
"rewards/correctness_reward_func": 1.28125,
"rewards/int_reward_func": 0.40625,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.15039062406867743,
"step": 116
},
{
"completion_length": 124.375,
"epoch": 0.12525090325170615,
"grad_norm": 3.7853267192840576,
"kl": 0.07023024489171803,
"learning_rate": 4.990756444601757e-06,
"loss": 0.0028,
"reward": 1.6455781627446413,
"reward_std": 0.7009308515116572,
"rewards/correctness_reward_func": 1.125,
"rewards/int_reward_func": 0.3671875,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.15339062316343188,
"step": 117
},
{
"completion_length": 136.359375,
"epoch": 0.126321423792319,
"grad_norm": 2.0698678493499756,
"kl": 0.03698924113996327,
"learning_rate": 4.989935734988098e-06,
"loss": 0.0015,
"reward": 1.8074062652885914,
"reward_std": 0.8670060317963362,
"rewards/correctness_reward_func": 1.28125,
"rewards/int_reward_func": 0.3671875,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.1589687503874302,
"step": 118
},
{
"completion_length": 148.15625,
"epoch": 0.12739194433293188,
"grad_norm": 7.199705123901367,
"kl": 0.180443427991122,
"learning_rate": 4.989080197352834e-06,
"loss": 0.0072,
"reward": 1.179109364748001,
"reward_std": 0.7882043793797493,
"rewards/correctness_reward_func": 0.71875,
"rewards/int_reward_func": 0.328125,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.13223437825217843,
"step": 119
},
{
"completion_length": 122.46875,
"epoch": 0.12846246487354476,
"grad_norm": 1.6841520071029663,
"kl": 0.0505296983756125,
"learning_rate": 4.9881898436628165e-06,
"loss": 0.002,
"reward": 1.9381406530737877,
"reward_std": 0.8331695850938559,
"rewards/correctness_reward_func": 1.375,
"rewards/int_reward_func": 0.390625,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.17251562420278788,
"step": 120
},
{
"completion_length": 108.1875,
"epoch": 0.12953298541415764,
"grad_norm": 1.8260003328323364,
"kl": 0.059317339677363634,
"learning_rate": 4.987264686371881e-06,
"loss": 0.0024,
"reward": 2.02584370970726,
"reward_std": 1.0399171710014343,
"rewards/correctness_reward_func": 1.40625,
"rewards/int_reward_func": 0.421875,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.19771874882280827,
"step": 121
},
{
"completion_length": 95.46875,
"epoch": 0.1306035059547705,
"grad_norm": 2.1317758560180664,
"kl": 0.053682942409068346,
"learning_rate": 4.986304738420684e-06,
"loss": 0.0021,
"reward": 2.1616249792277813,
"reward_std": 0.6366997184231877,
"rewards/correctness_reward_func": 1.5,
"rewards/int_reward_func": 0.4375,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.22412499878555536,
"step": 122
},
{
"completion_length": 118.453125,
"epoch": 0.13167402649538337,
"grad_norm": 1.487423062324524,
"kl": 0.04084368539042771,
"learning_rate": 4.985310013236514e-06,
"loss": 0.0016,
"reward": 2.4017499536275864,
"reward_std": 0.6733962241560221,
"rewards/correctness_reward_func": 1.71875,
"rewards/int_reward_func": 0.4765625,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.2064374964684248,
"step": 123
},
{
"completion_length": 136.75,
"epoch": 0.13274454703599625,
"grad_norm": 6.275110244750977,
"kl": 0.10213979217223823,
"learning_rate": 4.984280524733107e-06,
"loss": 0.0041,
"reward": 1.5195625126361847,
"reward_std": 0.8954427968710661,
"rewards/correctness_reward_func": 1.0625,
"rewards/int_reward_func": 0.3125,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.1445625051856041,
"step": 124
},
{
"completion_length": 107.84375,
"epoch": 0.13381506757660913,
"grad_norm": 4.696693420410156,
"kl": 0.1205500855576247,
"learning_rate": 4.983216287310453e-06,
"loss": 0.0048,
"reward": 1.828781247138977,
"reward_std": 0.92250463552773,
"rewards/correctness_reward_func": 1.1875,
"rewards/int_reward_func": 0.4296875,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.21159374713897705,
"step": 125
},
{
"completion_length": 90.984375,
"epoch": 0.134885588117222,
"grad_norm": 5.726349830627441,
"kl": 0.14590927632525563,
"learning_rate": 4.982117315854594e-06,
"loss": 0.0058,
"reward": 2.218953087925911,
"reward_std": 0.855751893715933,
"rewards/correctness_reward_func": 1.53125,
"rewards/int_reward_func": 0.4375,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0078125,
"rewards/xmlcount_reward_func": 0.24239062331616879,
"step": 126
},
{
"completion_length": 102.28125,
"epoch": 0.13595610865783486,
"grad_norm": 2.1168274879455566,
"kl": 0.07060433947481215,
"learning_rate": 4.980983625737411e-06,
"loss": 0.0028,
"reward": 2.1402343213558197,
"reward_std": 0.6787437000311911,
"rewards/correctness_reward_func": 1.46875,
"rewards/int_reward_func": 0.46875,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.20273437071591616,
"step": 127
},
{
"completion_length": 122.0625,
"epoch": 0.13702662919844774,
"grad_norm": 9.184843063354492,
"kl": 0.15047278022393584,
"learning_rate": 4.9798152328164165e-06,
"loss": 0.006,
"reward": 1.5844999551773071,
"reward_std": 1.025202952325344,
"rewards/correctness_reward_func": 1.03125,
"rewards/int_reward_func": 0.359375,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.19387499801814556,
"step": 128
},
{
"completion_length": 107.21875,
"epoch": 0.13809714973906062,
"grad_norm": 1.8967880010604858,
"kl": 0.04243561811745167,
"learning_rate": 4.978612153434527e-06,
"loss": 0.0017,
"reward": 2.1196718886494637,
"reward_std": 0.5547986216843128,
"rewards/correctness_reward_func": 1.46875,
"rewards/int_reward_func": 0.4375,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.2134218756109476,
"step": 129
},
{
"completion_length": 108.03125,
"epoch": 0.1391676702796735,
"grad_norm": 2.011303186416626,
"kl": 0.04303696344140917,
"learning_rate": 4.977374404419838e-06,
"loss": 0.0017,
"reward": 2.1730000376701355,
"reward_std": 0.9301177933812141,
"rewards/correctness_reward_func": 1.53125,
"rewards/int_reward_func": 0.4375,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.2042500004172325,
"step": 130
},
{
"completion_length": 110.453125,
"epoch": 0.14023819082028635,
"grad_norm": 7.903467178344727,
"kl": 0.22786898026242852,
"learning_rate": 4.9761020030853854e-06,
"loss": 0.0091,
"reward": 2.0016875714063644,
"reward_std": 0.8600476859137416,
"rewards/correctness_reward_func": 1.40625,
"rewards/int_reward_func": 0.3984375,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.19699999503791332,
"step": 131
},
{
"completion_length": 132.625,
"epoch": 0.14130871136089923,
"grad_norm": 5.2613444328308105,
"kl": 0.23378165811300278,
"learning_rate": 4.9747949672289075e-06,
"loss": 0.0094,
"reward": 1.4954218715429306,
"reward_std": 0.9026085883378983,
"rewards/correctness_reward_func": 1.03125,
"rewards/int_reward_func": 0.3203125,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.14385937619954348,
"step": 132
},
{
"completion_length": 101.125,
"epoch": 0.1423792319015121,
"grad_norm": 4.111429214477539,
"kl": 0.18476470839232206,
"learning_rate": 4.973453315132592e-06,
"loss": 0.0074,
"reward": 2.419406235218048,
"reward_std": 0.6811097683385015,
"rewards/correctness_reward_func": 1.75,
"rewards/int_reward_func": 0.4765625,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.19284374825656414,
"step": 133
},
{
"completion_length": 110.0625,
"epoch": 0.143449752442125,
"grad_norm": 4.763584613800049,
"kl": 0.1902949649374932,
"learning_rate": 4.9720770655628216e-06,
"loss": 0.0076,
"reward": 2.049671910703182,
"reward_std": 0.8413272872567177,
"rewards/correctness_reward_func": 1.40625,
"rewards/int_reward_func": 0.453125,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.19029687531292439,
"step": 134
},
{
"completion_length": 95.578125,
"epoch": 0.14452027298273787,
"grad_norm": 4.345425128936768,
"kl": 0.18916460033506155,
"learning_rate": 4.970666237769913e-06,
"loss": 0.0076,
"reward": 2.191656231880188,
"reward_std": 0.7033369969576597,
"rewards/correctness_reward_func": 1.53125,
"rewards/int_reward_func": 0.453125,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.20728124678134918,
"step": 135
},
{
"completion_length": 84.078125,
"epoch": 0.14559079352335072,
"grad_norm": 2.940378427505493,
"kl": 0.06369929504580796,
"learning_rate": 4.9692208514878445e-06,
"loss": 0.0025,
"reward": 2.3128594160079956,
"reward_std": 0.6257542409002781,
"rewards/correctness_reward_func": 1.625,
"rewards/int_reward_func": 0.46875,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.21910937316715717,
"step": 136
},
{
"completion_length": 120.328125,
"epoch": 0.1466613140639636,
"grad_norm": 7.51654052734375,
"kl": 0.3002074658870697,
"learning_rate": 4.967740926933985e-06,
"loss": 0.012,
"reward": 1.7032031267881393,
"reward_std": 1.0220621526241302,
"rewards/correctness_reward_func": 1.15625,
"rewards/int_reward_func": 0.3828125,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.16414062399417162,
"step": 137
},
{
"completion_length": 114.046875,
"epoch": 0.14773183460457648,
"grad_norm": 2.956787347793579,
"kl": 0.10229182336479425,
"learning_rate": 4.966226484808804e-06,
"loss": 0.0041,
"reward": 1.6846249997615814,
"reward_std": 0.8771754652261734,
"rewards/correctness_reward_func": 1.09375,
"rewards/int_reward_func": 0.3984375,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.19243750255554914,
"step": 138
},
{
"completion_length": 102.34375,
"epoch": 0.14880235514518936,
"grad_norm": 4.8385114669799805,
"kl": 0.1648537963628769,
"learning_rate": 4.96467754629559e-06,
"loss": 0.0066,
"reward": 1.9405781105160713,
"reward_std": 0.6661778870038688,
"rewards/correctness_reward_func": 1.3125,
"rewards/int_reward_func": 0.40625,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.22182812727987766,
"step": 139
},
{
"completion_length": 99.921875,
"epoch": 0.1498728756858022,
"grad_norm": 2.6203041076660156,
"kl": 0.06277278368361294,
"learning_rate": 4.963094133060148e-06,
"loss": 0.0025,
"reward": 2.1328750401735306,
"reward_std": 0.6620223973877728,
"rewards/correctness_reward_func": 1.4375,
"rewards/int_reward_func": 0.4609375,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.23443749826401472,
"step": 140
},
{
"completion_length": 92.15625,
"epoch": 0.1509433962264151,
"grad_norm": 2.787095069885254,
"kl": 0.07504080841317773,
"learning_rate": 4.961476267250501e-06,
"loss": 0.003,
"reward": 2.2870156168937683,
"reward_std": 0.6934415455907583,
"rewards/correctness_reward_func": 1.59375,
"rewards/int_reward_func": 0.4609375,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.23232812341302633,
"step": 141
},
{
"completion_length": 111.0,
"epoch": 0.15201391676702797,
"grad_norm": 2.570060968399048,
"kl": 0.06025985535234213,
"learning_rate": 4.959823971496575e-06,
"loss": 0.0024,
"reward": 1.9751719227060676,
"reward_std": 0.7296689655631781,
"rewards/correctness_reward_func": 1.375,
"rewards/int_reward_func": 0.40625,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.1939218717161566,
"step": 142
},
{
"completion_length": 127.40625,
"epoch": 0.15308443730764085,
"grad_norm": 4.903911590576172,
"kl": 0.20708202896639705,
"learning_rate": 4.958137268909887e-06,
"loss": 0.0083,
"reward": 2.063531205058098,
"reward_std": 0.8510163221508265,
"rewards/correctness_reward_func": 1.5,
"rewards/int_reward_func": 0.421875,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.14165624976158142,
"step": 143
},
{
"completion_length": 98.5625,
"epoch": 0.15415495784825373,
"grad_norm": 4.191147804260254,
"kl": 0.2446250948123634,
"learning_rate": 4.9564161830832214e-06,
"loss": 0.0098,
"reward": 2.337281256914139,
"reward_std": 0.6763627836480737,
"rewards/correctness_reward_func": 1.65625,
"rewards/int_reward_func": 0.4609375,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.2200937452726066,
"step": 144
},
{
"completion_length": 88.984375,
"epoch": 0.15522547838886658,
"grad_norm": 168.51283264160156,
"kl": 0.34985177870839834,
"learning_rate": 4.954660738090297e-06,
"loss": 0.014,
"reward": 1.6290156617760658,
"reward_std": 0.7621745709329844,
"rewards/correctness_reward_func": 1.03125,
"rewards/int_reward_func": 0.375,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.222765626385808,
"step": 145
},
{
"completion_length": 116.078125,
"epoch": 0.15629599892947946,
"grad_norm": 4.605787754058838,
"kl": 0.20393797848373652,
"learning_rate": 4.9528709584854316e-06,
"loss": 0.0082,
"reward": 1.5738593488931656,
"reward_std": 1.115996390581131,
"rewards/correctness_reward_func": 1.0,
"rewards/int_reward_func": 0.3984375,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.17542187124490738,
"step": 146
},
{
"completion_length": 89.328125,
"epoch": 0.15736651947009234,
"grad_norm": 4.029605865478516,
"kl": 0.21852776128798723,
"learning_rate": 4.951046869303202e-06,
"loss": 0.0087,
"reward": 1.9777500182390213,
"reward_std": 0.9872381817549467,
"rewards/correctness_reward_func": 1.34375,
"rewards/int_reward_func": 0.40625,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.22774999774992466,
"step": 147
},
{
"completion_length": 106.75,
"epoch": 0.15843704001070522,
"grad_norm": 3.108353853225708,
"kl": 0.07371893431991339,
"learning_rate": 4.949188496058089e-06,
"loss": 0.0029,
"reward": 1.7372031211853027,
"reward_std": 0.9112571626901627,
"rewards/correctness_reward_func": 1.09375,
"rewards/int_reward_func": 0.421875,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.2215781332924962,
"step": 148
},
{
"completion_length": 78.40625,
"epoch": 0.15950756055131807,
"grad_norm": 5.034030914306641,
"kl": 0.11514789052307606,
"learning_rate": 4.947295864744121e-06,
"loss": 0.0046,
"reward": 2.055265612900257,
"reward_std": 0.6963230553083122,
"rewards/correctness_reward_func": 1.34375,
"rewards/int_reward_func": 0.4453125,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0078125,
"rewards/xmlcount_reward_func": 0.258390624076128,
"step": 149
},
{
"completion_length": 90.484375,
"epoch": 0.16057808109193095,
"grad_norm": 2.6227409839630127,
"kl": 0.09901809925213456,
"learning_rate": 4.9453690018345144e-06,
"loss": 0.004,
"reward": 2.031171888113022,
"reward_std": 0.9270219663158059,
"rewards/correctness_reward_func": 1.34375,
"rewards/int_reward_func": 0.4375,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.2499218750745058,
"step": 150
},
{
"completion_length": 111.640625,
"epoch": 0.16164860163254383,
"grad_norm": 5.403820037841797,
"kl": 0.24172887252643704,
"learning_rate": 4.943407934281298e-06,
"loss": 0.0097,
"reward": 1.693390630185604,
"reward_std": 1.0351360142230988,
"rewards/correctness_reward_func": 1.0625,
"rewards/int_reward_func": 0.4609375,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.1699531227350235,
"step": 151
},
{
"completion_length": 92.890625,
"epoch": 0.1627191221731567,
"grad_norm": 2.174766778945923,
"kl": 0.07736781658604741,
"learning_rate": 4.941412689514941e-06,
"loss": 0.0031,
"reward": 2.1249531507492065,
"reward_std": 0.82035240996629,
"rewards/correctness_reward_func": 1.40625,
"rewards/int_reward_func": 0.46875,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.2499531265348196,
"step": 152
},
{
"completion_length": 96.546875,
"epoch": 0.16378964271376958,
"grad_norm": 2.039672374725342,
"kl": 0.07323360512964427,
"learning_rate": 4.939383295443966e-06,
"loss": 0.0029,
"reward": 2.026875004172325,
"reward_std": 0.7493576873093843,
"rewards/correctness_reward_func": 1.375,
"rewards/int_reward_func": 0.4453125,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.20656250044703484,
"step": 153
},
{
"completion_length": 106.59375,
"epoch": 0.16486016325438244,
"grad_norm": 2.2819831371307373,
"kl": 0.07374695758335292,
"learning_rate": 4.937319780454559e-06,
"loss": 0.0029,
"reward": 1.7915781140327454,
"reward_std": 0.8505431758239865,
"rewards/correctness_reward_func": 1.15625,
"rewards/int_reward_func": 0.4453125,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.1900156200863421,
"step": 154
},
{
"completion_length": 91.90625,
"epoch": 0.16593068379499532,
"grad_norm": 5.251499176025391,
"kl": 0.40140265179798007,
"learning_rate": 4.9352221734101745e-06,
"loss": 0.0161,
"reward": 2.1450937539339066,
"reward_std": 0.7235295535065234,
"rewards/correctness_reward_func": 1.46875,
"rewards/int_reward_func": 0.4375,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.2388437483459711,
"step": 155
},
{
"completion_length": 91.375,
"epoch": 0.1670012043356082,
"grad_norm": 2.6729063987731934,
"kl": 0.06791439699009061,
"learning_rate": 4.933090503651129e-06,
"loss": 0.0027,
"reward": 2.2278437092900276,
"reward_std": 0.7469283854588866,
"rewards/correctness_reward_func": 1.5625,
"rewards/int_reward_func": 0.4453125,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.22003125306218863,
"step": 156
},
{
"completion_length": 83.53125,
"epoch": 0.16807172487622107,
"grad_norm": 2.890995979309082,
"kl": 0.08941763360053301,
"learning_rate": 4.930924800994192e-06,
"loss": 0.0036,
"reward": 2.040843792259693,
"reward_std": 0.6482276869937778,
"rewards/correctness_reward_func": 1.3125,
"rewards/int_reward_func": 0.4921875,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.23615625128149986,
"step": 157
},
{
"completion_length": 103.4375,
"epoch": 0.16914224541683393,
"grad_norm": 4.847292900085449,
"kl": 0.25062092347070575,
"learning_rate": 4.9287250957321685e-06,
"loss": 0.01,
"reward": 1.8447500094771385,
"reward_std": 0.8658850640058517,
"rewards/correctness_reward_func": 1.1875,
"rewards/int_reward_func": 0.4453125,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.21193750761449337,
"step": 158
},
{
"completion_length": 87.78125,
"epoch": 0.1702127659574468,
"grad_norm": 2.486091136932373,
"kl": 0.10755344619974494,
"learning_rate": 4.9264914186334775e-06,
"loss": 0.0043,
"reward": 2.14860936999321,
"reward_std": 0.7705556647852063,
"rewards/correctness_reward_func": 1.4375,
"rewards/int_reward_func": 0.4609375,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.25017187278717756,
"step": 159
},
{
"completion_length": 74.8125,
"epoch": 0.17128328649805968,
"grad_norm": 2.6566174030303955,
"kl": 0.09692100062966347,
"learning_rate": 4.924223800941718e-06,
"loss": 0.0039,
"reward": 2.179875001311302,
"reward_std": 0.647944641765207,
"rewards/correctness_reward_func": 1.40625,
"rewards/int_reward_func": 0.484375,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.28924999572336674,
"step": 160
},
{
"completion_length": 112.390625,
"epoch": 0.17235380703867256,
"grad_norm": 7.983546257019043,
"kl": 0.34650124446488917,
"learning_rate": 4.921922274375232e-06,
"loss": 0.0139,
"reward": 1.7768593654036522,
"reward_std": 0.6629852540791035,
"rewards/correctness_reward_func": 1.1875,
"rewards/int_reward_func": 0.4140625,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.17529687471687794,
"step": 161
},
{
"completion_length": 81.609375,
"epoch": 0.17342432757928541,
"grad_norm": 3.0785446166992188,
"kl": 0.09424351761117578,
"learning_rate": 4.919586871126667e-06,
"loss": 0.0038,
"reward": 2.1120937913656235,
"reward_std": 0.947939082980156,
"rewards/correctness_reward_func": 1.375,
"rewards/int_reward_func": 0.4609375,
"rewards/soft_format_reward_func": 0.0078125,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.26834374107420444,
"step": 162
},
{
"completion_length": 90.421875,
"epoch": 0.1744948481198983,
"grad_norm": 4.486505508422852,
"kl": 0.2375591630116105,
"learning_rate": 4.917217623862516e-06,
"loss": 0.0095,
"reward": 1.9345000088214874,
"reward_std": 0.7378783877938986,
"rewards/correctness_reward_func": 1.21875,
"rewards/int_reward_func": 0.4765625,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.23918749950826168,
"step": 163
},
{
"completion_length": 89.34375,
"epoch": 0.17556536866051117,
"grad_norm": 5.476224422454834,
"kl": 0.3916892586275935,
"learning_rate": 4.914814565722671e-06,
"loss": 0.0157,
"reward": 1.6488437578082085,
"reward_std": 0.861721821129322,
"rewards/correctness_reward_func": 0.9375,
"rewards/int_reward_func": 0.453125,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.2582187484949827,
"step": 164
},
{
"completion_length": 76.15625,
"epoch": 0.17663588920112405,
"grad_norm": 2.504824161529541,
"kl": 0.09898415254428983,
"learning_rate": 4.912377730319951e-06,
"loss": 0.004,
"reward": 2.149609424173832,
"reward_std": 0.8080255158711225,
"rewards/correctness_reward_func": 1.4375,
"rewards/int_reward_func": 0.421875,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.29023437574505806,
"step": 165
},
{
"completion_length": 86.703125,
"epoch": 0.17770640974173693,
"grad_norm": 2.5262093544006348,
"kl": 0.10452345060184598,
"learning_rate": 4.909907151739634e-06,
"loss": 0.0042,
"reward": 2.0487187057733536,
"reward_std": 1.013342872262001,
"rewards/correctness_reward_func": 1.34375,
"rewards/int_reward_func": 0.4453125,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.25965624768286943,
"step": 166
},
{
"completion_length": 83.109375,
"epoch": 0.17877693028234978,
"grad_norm": 3.6836423873901367,
"kl": 0.12263510143384337,
"learning_rate": 4.907402864538984e-06,
"loss": 0.0049,
"reward": 2.0983437597751617,
"reward_std": 0.8467487432062626,
"rewards/correctness_reward_func": 1.375,
"rewards/int_reward_func": 0.46875,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.2545937467366457,
"step": 167
},
{
"completion_length": 105.828125,
"epoch": 0.17984745082296266,
"grad_norm": 3.9994001388549805,
"kl": 0.2718197964131832,
"learning_rate": 4.904864903746765e-06,
"loss": 0.0109,
"reward": 1.590890608727932,
"reward_std": 1.0225011110305786,
"rewards/correctness_reward_func": 0.96875,
"rewards/int_reward_func": 0.3984375,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0078125,
"rewards/xmlcount_reward_func": 0.21589063154533505,
"step": 168
},
{
"completion_length": 80.734375,
"epoch": 0.18091797136357554,
"grad_norm": 3.370049476623535,
"kl": 0.0908288094215095,
"learning_rate": 4.9022933048627496e-06,
"loss": 0.0036,
"reward": 1.9604843854904175,
"reward_std": 0.6807838249951601,
"rewards/correctness_reward_func": 1.1875,
"rewards/int_reward_func": 0.484375,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.2886093705892563,
"step": 169
},
{
"completion_length": 89.03125,
"epoch": 0.18198849190418842,
"grad_norm": 2.7841224670410156,
"kl": 0.09452959662303329,
"learning_rate": 4.899688103857223e-06,
"loss": 0.0038,
"reward": 2.052734389901161,
"reward_std": 0.789879210293293,
"rewards/correctness_reward_func": 1.34375,
"rewards/int_reward_func": 0.453125,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.25585937313735485,
"step": 170
},
{
"completion_length": 99.46875,
"epoch": 0.18305901244480127,
"grad_norm": 2.4192087650299072,
"kl": 0.10028906259685755,
"learning_rate": 4.897049337170483e-06,
"loss": 0.004,
"reward": 1.9263124987483025,
"reward_std": 0.4447530438192189,
"rewards/correctness_reward_func": 1.25,
"rewards/int_reward_func": 0.4375,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.23881249967962503,
"step": 171
},
{
"completion_length": 85.359375,
"epoch": 0.18412953298541415,
"grad_norm": 2.552004814147949,
"kl": 0.10651395656168461,
"learning_rate": 4.894377041712327e-06,
"loss": 0.0043,
"reward": 2.072437509894371,
"reward_std": 0.8923071715980768,
"rewards/correctness_reward_func": 1.34375,
"rewards/int_reward_func": 0.484375,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.244312503375113,
"step": 172
},
{
"completion_length": 86.828125,
"epoch": 0.18520005352602703,
"grad_norm": 2.8208720684051514,
"kl": 0.08628266118466854,
"learning_rate": 4.891671254861535e-06,
"loss": 0.0035,
"reward": 2.123812586069107,
"reward_std": 0.7962243193760514,
"rewards/correctness_reward_func": 1.40625,
"rewards/int_reward_func": 0.4453125,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.2722499957308173,
"step": 173
},
{
"completion_length": 68.59375,
"epoch": 0.1862705740666399,
"grad_norm": 3.199782133102417,
"kl": 0.09673942252993584,
"learning_rate": 4.8889320144653525e-06,
"loss": 0.0039,
"reward": 2.4416562616825104,
"reward_std": 0.6845003152266145,
"rewards/correctness_reward_func": 1.65625,
"rewards/int_reward_func": 0.484375,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.3010312579572201,
"step": 174
},
{
"completion_length": 80.265625,
"epoch": 0.1873410946072528,
"grad_norm": 2.8050692081451416,
"kl": 0.09070709394291043,
"learning_rate": 4.886159358838952e-06,
"loss": 0.0036,
"reward": 2.2726562321186066,
"reward_std": 0.713380170520395,
"rewards/correctness_reward_func": 1.53125,
"rewards/int_reward_func": 0.4609375,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.2804687600582838,
"step": 175
},
{
"completion_length": 95.078125,
"epoch": 0.18841161514786564,
"grad_norm": 3.0878233909606934,
"kl": 0.22780032362788916,
"learning_rate": 4.883353326764907e-06,
"loss": 0.0091,
"reward": 2.0888593643903732,
"reward_std": 0.632993305567652,
"rewards/correctness_reward_func": 1.40625,
"rewards/int_reward_func": 0.453125,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.22948437556624413,
"step": 176
},
{
"completion_length": 92.03125,
"epoch": 0.18948213568847852,
"grad_norm": 6.2119035720825195,
"kl": 0.3357097846455872,
"learning_rate": 4.880513957492641e-06,
"loss": 0.0134,
"reward": 1.9138593599200249,
"reward_std": 0.8630610294640064,
"rewards/correctness_reward_func": 1.21875,
"rewards/int_reward_func": 0.4453125,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.24979687482118607,
"step": 177
},
{
"completion_length": 84.390625,
"epoch": 0.1905526562290914,
"grad_norm": 4.748230457305908,
"kl": 0.26475911401212215,
"learning_rate": 4.8776412907378845e-06,
"loss": 0.0106,
"reward": 1.8656718656420708,
"reward_std": 0.7014469979330897,
"rewards/correctness_reward_func": 1.125,
"rewards/int_reward_func": 0.4765625,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.2641093786805868,
"step": 178
},
{
"completion_length": 95.0625,
"epoch": 0.19162317676970428,
"grad_norm": 2.9860572814941406,
"kl": 0.1027436142321676,
"learning_rate": 4.8747353666821155e-06,
"loss": 0.0041,
"reward": 1.8935781568288803,
"reward_std": 0.8618385540321469,
"rewards/correctness_reward_func": 1.15625,
"rewards/int_reward_func": 0.4765625,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0078125,
"rewards/xmlcount_reward_func": 0.25295313261449337,
"step": 179
},
{
"completion_length": 78.359375,
"epoch": 0.19269369731031713,
"grad_norm": 4.26141881942749,
"kl": 0.19185639871284366,
"learning_rate": 4.871796225972e-06,
"loss": 0.0077,
"reward": 1.882093757390976,
"reward_std": 0.762749788351357,
"rewards/correctness_reward_func": 1.1875,
"rewards/int_reward_func": 0.4140625,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.2805312527343631,
"step": 180
},
{
"completion_length": 86.953125,
"epoch": 0.19376421785093,
"grad_norm": 2.9578616619110107,
"kl": 0.09208998270332813,
"learning_rate": 4.868823909718823e-06,
"loss": 0.0037,
"reward": 2.173124998807907,
"reward_std": 0.8102906746789813,
"rewards/correctness_reward_func": 1.46875,
"rewards/int_reward_func": 0.453125,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.25125000439584255,
"step": 181
},
{
"completion_length": 71.671875,
"epoch": 0.1948347383915429,
"grad_norm": 2.6002249717712402,
"kl": 0.10551499295979738,
"learning_rate": 4.865818459497911e-06,
"loss": 0.0042,
"reward": 2.2820468470454216,
"reward_std": 0.555876774713397,
"rewards/correctness_reward_func": 1.46875,
"rewards/int_reward_func": 0.5,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.3132968805730343,
"step": 182
},
{
"completion_length": 92.03125,
"epoch": 0.19590525893215577,
"grad_norm": 6.459475517272949,
"kl": 0.2931561325676739,
"learning_rate": 4.862779917348055e-06,
"loss": 0.0117,
"reward": 1.95626562833786,
"reward_std": 0.8608931167982519,
"rewards/correctness_reward_func": 1.25,
"rewards/int_reward_func": 0.4609375,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.2453281208872795,
"step": 183
},
{
"completion_length": 102.515625,
"epoch": 0.19697577947276865,
"grad_norm": 1.968064308166504,
"kl": 0.09014055877923965,
"learning_rate": 4.859708325770919e-06,
"loss": 0.0036,
"reward": 1.539296880364418,
"reward_std": 0.8695460446178913,
"rewards/correctness_reward_func": 0.875,
"rewards/int_reward_func": 0.4375,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.22679687477648258,
"step": 184
},
{
"completion_length": 82.21875,
"epoch": 0.1980463000133815,
"grad_norm": 5.502157688140869,
"kl": 0.24213434057310224,
"learning_rate": 4.856603727730446e-06,
"loss": 0.0097,
"reward": 2.103000044822693,
"reward_std": 0.7971827173605561,
"rewards/correctness_reward_func": 1.375,
"rewards/int_reward_func": 0.453125,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.27487499825656414,
"step": 185
},
{
"completion_length": 96.796875,
"epoch": 0.19911682055399438,
"grad_norm": 4.163439750671387,
"kl": 0.2488319119438529,
"learning_rate": 4.853466166652259e-06,
"loss": 0.01,
"reward": 1.8271406143903732,
"reward_std": 0.8965076357126236,
"rewards/correctness_reward_func": 1.125,
"rewards/int_reward_func": 0.4765625,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.22557812742888927,
"step": 186
},
{
"completion_length": 109.671875,
"epoch": 0.20018734109460726,
"grad_norm": 2.380798816680908,
"kl": 0.07831509876996279,
"learning_rate": 4.850295686423048e-06,
"loss": 0.0031,
"reward": 1.7582030892372131,
"reward_std": 0.8107579126954079,
"rewards/correctness_reward_func": 1.125,
"rewards/int_reward_func": 0.4140625,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0078125,
"rewards/xmlcount_reward_func": 0.21132812649011612,
"step": 187
},
{
"completion_length": 96.46875,
"epoch": 0.20125786163522014,
"grad_norm": 5.34075927734375,
"kl": 0.3953818525187671,
"learning_rate": 4.8470923313899655e-06,
"loss": 0.0158,
"reward": 1.9607812352478504,
"reward_std": 0.5701902243308723,
"rewards/correctness_reward_func": 1.3125,
"rewards/int_reward_func": 0.421875,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.22640624921768904,
"step": 188
},
{
"completion_length": 79.265625,
"epoch": 0.202328382175833,
"grad_norm": 2.1705665588378906,
"kl": 0.10117745213210583,
"learning_rate": 4.843856146359999e-06,
"loss": 0.004,
"reward": 2.0710155963897705,
"reward_std": 0.6723045469261706,
"rewards/correctness_reward_func": 1.34375,
"rewards/int_reward_func": 0.453125,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.2741406299173832,
"step": 189
},
{
"completion_length": 72.46875,
"epoch": 0.20339890271644587,
"grad_norm": 2.7847604751586914,
"kl": 0.12489751679822803,
"learning_rate": 4.8405871765993435e-06,
"loss": 0.005,
"reward": 1.8936093151569366,
"reward_std": 0.8906522025354207,
"rewards/correctness_reward_func": 1.15625,
"rewards/int_reward_func": 0.4375,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0078125,
"rewards/xmlcount_reward_func": 0.2920468747615814,
"step": 190
},
{
"completion_length": 74.3125,
"epoch": 0.20446942325705875,
"grad_norm": 7.144665241241455,
"kl": 0.5474197333678603,
"learning_rate": 4.837285467832775e-06,
"loss": 0.0219,
"reward": 1.9980624914169312,
"reward_std": 1.1599431410431862,
"rewards/correctness_reward_func": 1.28125,
"rewards/int_reward_func": 0.4140625,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.3027499932795763,
"step": 191
},
{
"completion_length": 77.015625,
"epoch": 0.20553994379767163,
"grad_norm": 2.282410144805908,
"kl": 0.12488419935107231,
"learning_rate": 4.833951066243004e-06,
"loss": 0.005,
"reward": 2.0859062671661377,
"reward_std": 0.7159075043164194,
"rewards/correctness_reward_func": 1.3125,
"rewards/int_reward_func": 0.4609375,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.3124687448143959,
"step": 192
},
{
"completion_length": 74.6875,
"epoch": 0.20661046433828448,
"grad_norm": 5.521911144256592,
"kl": 0.38207234255969524,
"learning_rate": 4.830584018470036e-06,
"loss": 0.0153,
"reward": 2.290625035762787,
"reward_std": 0.6622507013380527,
"rewards/correctness_reward_func": 1.5,
"rewards/int_reward_func": 0.484375,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0078125,
"rewards/xmlcount_reward_func": 0.2984374985098839,
"step": 193
},
{
"completion_length": 81.640625,
"epoch": 0.20768098487889736,
"grad_norm": 5.832190036773682,
"kl": 0.24756696447730064,
"learning_rate": 4.827184371610511e-06,
"loss": 0.0099,
"reward": 2.2973125129938126,
"reward_std": 0.6708025210537016,
"rewards/correctness_reward_func": 1.53125,
"rewards/int_reward_func": 0.484375,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.28168749529868364,
"step": 194
},
{
"completion_length": 74.671875,
"epoch": 0.20875150541951024,
"grad_norm": 7.309933662414551,
"kl": 0.12391441874206066,
"learning_rate": 4.8237521732170525e-06,
"loss": 0.005,
"reward": 2.016703099012375,
"reward_std": 1.132209412753582,
"rewards/correctness_reward_func": 1.28125,
"rewards/int_reward_func": 0.4375,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0078125,
"rewards/xmlcount_reward_func": 0.29014062508940697,
"step": 195
},
{
"completion_length": 73.34375,
"epoch": 0.20982202596012312,
"grad_norm": 3.4460299015045166,
"kl": 0.12927269656211138,
"learning_rate": 4.820287471297598e-06,
"loss": 0.0052,
"reward": 1.9848750308156013,
"reward_std": 0.7779360907152295,
"rewards/correctness_reward_func": 1.21875,
"rewards/int_reward_func": 0.4765625,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.2895624926313758,
"step": 196
},
{
"completion_length": 88.28125,
"epoch": 0.210892546500736,
"grad_norm": 4.137183666229248,
"kl": 0.3276713816449046,
"learning_rate": 4.816790314314729e-06,
"loss": 0.0131,
"reward": 1.964468702673912,
"reward_std": 0.7385209053754807,
"rewards/correctness_reward_func": 1.25,
"rewards/int_reward_func": 0.4609375,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.2535312492400408,
"step": 197
},
{
"completion_length": 73.453125,
"epoch": 0.21196306704134885,
"grad_norm": 6.063220500946045,
"kl": 0.36109886690974236,
"learning_rate": 4.813260751184992e-06,
"loss": 0.0144,
"reward": 2.3978749811649323,
"reward_std": 0.7160034999251366,
"rewards/correctness_reward_func": 1.59375,
"rewards/int_reward_func": 0.484375,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.31975000351667404,
"step": 198
},
{
"completion_length": 90.359375,
"epoch": 0.21303358758196173,
"grad_norm": 9.171374320983887,
"kl": 0.7352069662883878,
"learning_rate": 4.809698831278217e-06,
"loss": 0.0294,
"reward": 1.9928593933582306,
"reward_std": 0.8052435261197388,
"rewards/correctness_reward_func": 1.28125,
"rewards/int_reward_func": 0.4453125,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.2662968710064888,
"step": 199
},
{
"completion_length": 69.953125,
"epoch": 0.2141041081225746,
"grad_norm": 3.0394396781921387,
"kl": 0.14388780342414975,
"learning_rate": 4.806104604416824e-06,
"loss": 0.0058,
"reward": 2.525015652179718,
"reward_std": 0.4300219719298184,
"rewards/correctness_reward_func": 1.75,
"rewards/int_reward_func": 0.484375,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.2906406167894602,
"step": 200
},
{
"completion_length": 92.03125,
"epoch": 0.21517462866318748,
"grad_norm": 4.5592265129089355,
"kl": 0.3974157813936472,
"learning_rate": 4.802478120875125e-06,
"loss": 0.0159,
"reward": 1.534609392285347,
"reward_std": 1.030179588124156,
"rewards/correctness_reward_func": 0.875,
"rewards/int_reward_func": 0.3828125,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.2767968699336052,
"step": 201
},
{
"completion_length": 76.625,
"epoch": 0.21624514920380034,
"grad_norm": 2.740817070007324,
"kl": 0.14991699904203415,
"learning_rate": 4.7988194313786275e-06,
"loss": 0.006,
"reward": 2.134265646338463,
"reward_std": 0.8027890680823475,
"rewards/correctness_reward_func": 1.375,
"rewards/int_reward_func": 0.4765625,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.28270312771201134,
"step": 202
},
{
"completion_length": 86.53125,
"epoch": 0.21731566974441321,
"grad_norm": 5.491028785705566,
"kl": 0.32305468805134296,
"learning_rate": 4.795128587103315e-06,
"loss": 0.0129,
"reward": 2.177546873688698,
"reward_std": 0.862015737220645,
"rewards/correctness_reward_func": 1.46875,
"rewards/int_reward_func": 0.453125,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.2556718774139881,
"step": 203
},
{
"completion_length": 80.640625,
"epoch": 0.2183861902850261,
"grad_norm": 3.542194366455078,
"kl": 0.13027278054505587,
"learning_rate": 4.791405639674941e-06,
"loss": 0.0052,
"reward": 1.6867187470197678,
"reward_std": 0.8688598442822695,
"rewards/correctness_reward_func": 0.9375,
"rewards/int_reward_func": 0.4609375,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0078125,
"rewards/xmlcount_reward_func": 0.28046874329447746,
"step": 204
},
{
"completion_length": 75.96875,
"epoch": 0.21945671082563897,
"grad_norm": 3.078350305557251,
"kl": 0.1293167658150196,
"learning_rate": 4.7876506411683e-06,
"loss": 0.0052,
"reward": 2.0141249895095825,
"reward_std": 0.7147551532834768,
"rewards/correctness_reward_func": 1.21875,
"rewards/int_reward_func": 0.4765625,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.31881249509751797,
"step": 205
},
{
"completion_length": 62.734375,
"epoch": 0.22052723136625185,
"grad_norm": 7.453506946563721,
"kl": 0.8046054858714342,
"learning_rate": 4.783863644106502e-06,
"loss": 0.0322,
"reward": 1.6788437813520432,
"reward_std": 0.8941609086468816,
"rewards/correctness_reward_func": 0.96875,
"rewards/int_reward_func": 0.40625,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.3038437496870756,
"step": 206
},
{
"completion_length": 68.671875,
"epoch": 0.2215977519068647,
"grad_norm": 4.4995436668396,
"kl": 0.14437556639313698,
"learning_rate": 4.780044701460239e-06,
"loss": 0.0058,
"reward": 2.126671925187111,
"reward_std": 0.7776627587154508,
"rewards/correctness_reward_func": 1.34375,
"rewards/int_reward_func": 0.4765625,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0078125,
"rewards/xmlcount_reward_func": 0.29854688607156277,
"step": 207
},
{
"completion_length": 54.75,
"epoch": 0.22266827244747758,
"grad_norm": 3.271150588989258,
"kl": 0.2097001215443015,
"learning_rate": 4.7761938666470405e-06,
"loss": 0.0084,
"reward": 1.956812545657158,
"reward_std": 0.9061004631221294,
"rewards/correctness_reward_func": 1.125,
"rewards/int_reward_func": 0.46875,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0078125,
"rewards/xmlcount_reward_func": 0.3552500009536743,
"step": 208
},
{
"completion_length": 72.578125,
"epoch": 0.22373879298809046,
"grad_norm": 3.0446856021881104,
"kl": 0.15493952203541994,
"learning_rate": 4.7723111935305275e-06,
"loss": 0.0062,
"reward": 2.270968735218048,
"reward_std": 0.8504615277051926,
"rewards/correctness_reward_func": 1.46875,
"rewards/int_reward_func": 0.4765625,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.32565624453127384,
"step": 209
},
{
"completion_length": 69.046875,
"epoch": 0.22480931352870334,
"grad_norm": 5.6605095863342285,
"kl": 0.44642951618880033,
"learning_rate": 4.7683967364196624e-06,
"loss": 0.0179,
"reward": 2.0304374992847443,
"reward_std": 0.7277556583285332,
"rewards/correctness_reward_func": 1.25,
"rewards/int_reward_func": 0.46875,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.3116874936968088,
"step": 210
},
{
"completion_length": 64.9375,
"epoch": 0.2258798340693162,
"grad_norm": 4.651093482971191,
"kl": 0.17161214351654053,
"learning_rate": 4.764450550067986e-06,
"loss": 0.0069,
"reward": 1.935359388589859,
"reward_std": 0.850550489500165,
"rewards/correctness_reward_func": 1.15625,
"rewards/int_reward_func": 0.453125,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.3259843699634075,
"step": 211
},
{
"completion_length": 99.921875,
"epoch": 0.22695035460992907,
"grad_norm": 7.470244407653809,
"kl": 0.5526934135705233,
"learning_rate": 4.760472689672851e-06,
"loss": 0.0221,
"reward": 1.62957813590765,
"reward_std": 0.9631365463137627,
"rewards/correctness_reward_func": 1.0,
"rewards/int_reward_func": 0.4140625,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.2155156247317791,
"step": 212
},
{
"completion_length": 64.296875,
"epoch": 0.22802087515054195,
"grad_norm": 4.030452251434326,
"kl": 0.1725650643929839,
"learning_rate": 4.7564632108746524e-06,
"loss": 0.0069,
"reward": 2.4495781660079956,
"reward_std": 0.7331900605931878,
"rewards/correctness_reward_func": 1.625,
"rewards/int_reward_func": 0.4765625,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0078125,
"rewards/xmlcount_reward_func": 0.3402031324803829,
"step": 213
},
{
"completion_length": 57.0625,
"epoch": 0.22909139569115483,
"grad_norm": 2.44975209236145,
"kl": 0.1745634926483035,
"learning_rate": 4.752422169756048e-06,
"loss": 0.007,
"reward": 2.49567186832428,
"reward_std": 0.585249027935788,
"rewards/correctness_reward_func": 1.65625,
"rewards/int_reward_func": 0.4765625,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.3628593757748604,
"step": 214
},
{
"completion_length": 80.609375,
"epoch": 0.2301619162317677,
"grad_norm": 3.696011781692505,
"kl": 0.1569390268996358,
"learning_rate": 4.7483496228411754e-06,
"loss": 0.0063,
"reward": 2.0820469111204147,
"reward_std": 0.7507896656170487,
"rewards/correctness_reward_func": 1.34375,
"rewards/int_reward_func": 0.4765625,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.26173438131809235,
"step": 215
},
{
"completion_length": 67.90625,
"epoch": 0.23123243677238056,
"grad_norm": 3.6006627082824707,
"kl": 0.1786866094917059,
"learning_rate": 4.744245627094859e-06,
"loss": 0.0071,
"reward": 2.0825624614953995,
"reward_std": 0.8309154035523534,
"rewards/correctness_reward_func": 1.28125,
"rewards/int_reward_func": 0.453125,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.015625,
"rewards/xmlcount_reward_func": 0.33256249874830246,
"step": 216
},
{
"completion_length": 82.53125,
"epoch": 0.23230295731299344,
"grad_norm": 4.304072856903076,
"kl": 0.31106262002140284,
"learning_rate": 4.740110239921813e-06,
"loss": 0.0124,
"reward": 1.406374990940094,
"reward_std": 0.8875350207090378,
"rewards/correctness_reward_func": 0.6875,
"rewards/int_reward_func": 0.453125,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.2657500021159649,
"step": 217
},
{
"completion_length": 57.75,
"epoch": 0.23337347785360632,
"grad_norm": 3.51265025138855,
"kl": 0.18941342923790216,
"learning_rate": 4.735943519165843e-06,
"loss": 0.0076,
"reward": 2.366296797990799,
"reward_std": 0.6840489963069558,
"rewards/correctness_reward_func": 1.5625,
"rewards/int_reward_func": 0.4609375,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.3428593724966049,
"step": 218
},
{
"completion_length": 69.6875,
"epoch": 0.2344439983942192,
"grad_norm": 3.1696012020111084,
"kl": 0.144703084602952,
"learning_rate": 4.731745523109029e-06,
"loss": 0.0058,
"reward": 1.7885156497359276,
"reward_std": 0.8288924656808376,
"rewards/correctness_reward_func": 1.0,
"rewards/int_reward_func": 0.4375,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0078125,
"rewards/xmlcount_reward_func": 0.3432031273841858,
"step": 219
},
{
"completion_length": 79.453125,
"epoch": 0.23551451893483205,
"grad_norm": 3.0204484462738037,
"kl": 0.16810880228877068,
"learning_rate": 4.72751631047092e-06,
"loss": 0.0067,
"reward": 1.817734345793724,
"reward_std": 1.0543543472886086,
"rewards/correctness_reward_func": 1.0625,
"rewards/int_reward_func": 0.4296875,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0234375,
"rewards/xmlcount_reward_func": 0.302109370008111,
"step": 220
},
{
"completion_length": 89.984375,
"epoch": 0.23658503947544493,
"grad_norm": 9.887064933776855,
"kl": 0.571788308210671,
"learning_rate": 4.723255940407704e-06,
"loss": 0.0229,
"reward": 2.150187447667122,
"reward_std": 0.7866512620821595,
"rewards/correctness_reward_func": 1.375,
"rewards/int_reward_func": 0.4765625,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0234375,
"rewards/xmlcount_reward_func": 0.27518749609589577,
"step": 221
},
{
"completion_length": 82.15625,
"epoch": 0.2376555600160578,
"grad_norm": 3.509312868118286,
"kl": 0.16501779574900866,
"learning_rate": 4.718964472511386e-06,
"loss": 0.0066,
"reward": 2.0418750420212746,
"reward_std": 0.9721324890851974,
"rewards/correctness_reward_func": 1.28125,
"rewards/int_reward_func": 0.4453125,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0234375,
"rewards/xmlcount_reward_func": 0.29187500290572643,
"step": 222
},
{
"completion_length": 69.703125,
"epoch": 0.2387260805566707,
"grad_norm": 3.7087841033935547,
"kl": 0.18095918465405703,
"learning_rate": 4.71464196680895e-06,
"loss": 0.0072,
"reward": 2.18121874332428,
"reward_std": 0.870441822335124,
"rewards/correctness_reward_func": 1.34375,
"rewards/int_reward_func": 0.4765625,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.015625,
"rewards/xmlcount_reward_func": 0.3452812507748604,
"step": 223
},
{
"completion_length": 93.453125,
"epoch": 0.23979660109728354,
"grad_norm": 9.079336166381836,
"kl": 0.7006166982464492,
"learning_rate": 4.710288483761524e-06,
"loss": 0.028,
"reward": 2.110390603542328,
"reward_std": 0.7916512079536915,
"rewards/correctness_reward_func": 1.34375,
"rewards/int_reward_func": 0.46875,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.03125,
"rewards/xmlcount_reward_func": 0.2666406221687794,
"step": 224
},
{
"completion_length": 89.796875,
"epoch": 0.24086712163789642,
"grad_norm": 5.301533222198486,
"kl": 0.33772587310522795,
"learning_rate": 4.705904084263534e-06,
"loss": 0.0135,
"reward": 1.9977499693632126,
"reward_std": 0.9460650207474828,
"rewards/correctness_reward_func": 1.25,
"rewards/int_reward_func": 0.4375,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.31025000289082527,
"step": 225
},
{
"completion_length": 93.375,
"epoch": 0.2419376421785093,
"grad_norm": 2.236737012863159,
"kl": 0.11640047281980515,
"learning_rate": 4.701488829641845e-06,
"loss": 0.0047,
"reward": 2.145703136920929,
"reward_std": 0.8284804495051503,
"rewards/correctness_reward_func": 1.40625,
"rewards/int_reward_func": 0.4453125,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0078125,
"rewards/xmlcount_reward_func": 0.2863281313329935,
"step": 226
},
{
"completion_length": 81.8125,
"epoch": 0.24300816271912218,
"grad_norm": 3.1928486824035645,
"kl": 0.135368085000664,
"learning_rate": 4.697042781654913e-06,
"loss": 0.0054,
"reward": 2.135328069329262,
"reward_std": 0.7982124611735344,
"rewards/correctness_reward_func": 1.3125,
"rewards/int_reward_func": 0.4765625,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0078125,
"rewards/xmlcount_reward_func": 0.33845312520861626,
"step": 227
},
{
"completion_length": 85.453125,
"epoch": 0.24407868325973506,
"grad_norm": 12.730961799621582,
"kl": 0.4075321350246668,
"learning_rate": 4.692566002491917e-06,
"loss": 0.0163,
"reward": 2.0005937218666077,
"reward_std": 0.8029458876699209,
"rewards/correctness_reward_func": 1.21875,
"rewards/int_reward_func": 0.4375,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.34434375166893005,
"step": 228
},
{
"completion_length": 72.875,
"epoch": 0.2451492038003479,
"grad_norm": 6.923903942108154,
"kl": 0.6862649563699961,
"learning_rate": 4.6880585547718845e-06,
"loss": 0.0275,
"reward": 1.7937656044960022,
"reward_std": 0.915380734950304,
"rewards/correctness_reward_func": 0.9375,
"rewards/int_reward_func": 0.46875,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.015625,
"rewards/xmlcount_reward_func": 0.371890626847744,
"step": 229
},
{
"completion_length": 93.75,
"epoch": 0.2462197243409608,
"grad_norm": 3.041198968887329,
"kl": 0.109968694858253,
"learning_rate": 4.683520501542825e-06,
"loss": 0.0044,
"reward": 2.041828043758869,
"reward_std": 0.828435555100441,
"rewards/correctness_reward_func": 1.25,
"rewards/int_reward_func": 0.4296875,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.03125,
"rewards/xmlcount_reward_func": 0.3308906201273203,
"step": 230
},
{
"completion_length": 78.359375,
"epoch": 0.24729024488157367,
"grad_norm": 3.0014753341674805,
"kl": 0.13458310719579458,
"learning_rate": 4.67895190628084e-06,
"loss": 0.0054,
"reward": 2.4933906197547913,
"reward_std": 0.8564620353281498,
"rewards/correctness_reward_func": 1.625,
"rewards/int_reward_func": 0.46875,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0234375,
"rewards/xmlcount_reward_func": 0.37620312348008156,
"step": 231
},
{
"completion_length": 93.8125,
"epoch": 0.24836076542218655,
"grad_norm": 4.2342848777771,
"kl": 0.1281078103929758,
"learning_rate": 4.674352832889239e-06,
"loss": 0.0051,
"reward": 2.4442031383514404,
"reward_std": 0.7425322765484452,
"rewards/correctness_reward_func": 1.59375,
"rewards/int_reward_func": 0.484375,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0234375,
"rewards/xmlcount_reward_func": 0.3426406290382147,
"step": 232
},
{
"completion_length": 91.703125,
"epoch": 0.2494312859627994,
"grad_norm": 4.6626811027526855,
"kl": 0.30947081558406353,
"learning_rate": 4.669723345697646e-06,
"loss": 0.0124,
"reward": 1.7927343994379044,
"reward_std": 1.0577923730015755,
"rewards/correctness_reward_func": 1.0,
"rewards/int_reward_func": 0.4375,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0234375,
"rewards/xmlcount_reward_func": 0.3317968789488077,
"step": 233
},
{
"completion_length": 95.1875,
"epoch": 0.2505018065034123,
"grad_norm": 2.6523685455322266,
"kl": 0.12286860542371869,
"learning_rate": 4.665063509461098e-06,
"loss": 0.0049,
"reward": 2.122484341263771,
"reward_std": 0.6617152327671647,
"rewards/correctness_reward_func": 1.28125,
"rewards/int_reward_func": 0.4453125,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.015625,
"rewards/xmlcount_reward_func": 0.38029688596725464,
"step": 234
},
{
"completion_length": 100.3125,
"epoch": 0.25157232704402516,
"grad_norm": 3.743971586227417,
"kl": 0.2634156849235296,
"learning_rate": 4.660373389359137e-06,
"loss": 0.0105,
"reward": 1.9493124820291996,
"reward_std": 0.6555369319394231,
"rewards/correctness_reward_func": 1.1875,
"rewards/int_reward_func": 0.4296875,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.3321250043809414,
"step": 235
},
{
"completion_length": 91.890625,
"epoch": 0.252642847584638,
"grad_norm": 5.421882629394531,
"kl": 0.36969919549301267,
"learning_rate": 4.655653050994907e-06,
"loss": 0.0148,
"reward": 2.489093706011772,
"reward_std": 0.5151624148711562,
"rewards/correctness_reward_func": 1.59375,
"rewards/int_reward_func": 0.4765625,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.03125,
"rewards/xmlcount_reward_func": 0.38753124326467514,
"step": 236
},
{
"completion_length": 107.421875,
"epoch": 0.2537133681252509,
"grad_norm": 3.658877372741699,
"kl": 0.3179207113571465,
"learning_rate": 4.650902560394225e-06,
"loss": 0.0127,
"reward": 2.1520937085151672,
"reward_std": 0.7971650678664446,
"rewards/correctness_reward_func": 1.34375,
"rewards/int_reward_func": 0.453125,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0078125,
"rewards/xmlcount_reward_func": 0.3474062494933605,
"step": 237
},
{
"completion_length": 96.75,
"epoch": 0.25478388866586377,
"grad_norm": 2.333758592605591,
"kl": 0.14108293130993843,
"learning_rate": 4.646121984004666e-06,
"loss": 0.0056,
"reward": 2.4891093373298645,
"reward_std": 0.7755477353930473,
"rewards/correctness_reward_func": 1.59375,
"rewards/int_reward_func": 0.484375,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0234375,
"rewards/xmlcount_reward_func": 0.38754688017070293,
"step": 238
},
{
"completion_length": 97.796875,
"epoch": 0.2558544092064767,
"grad_norm": 7.903679370880127,
"kl": 0.6723966179415584,
"learning_rate": 4.641311388694629e-06,
"loss": 0.0269,
"reward": 2.23892180621624,
"reward_std": 0.8879001673776656,
"rewards/correctness_reward_func": 1.40625,
"rewards/int_reward_func": 0.4140625,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0234375,
"rewards/xmlcount_reward_func": 0.3951718807220459,
"step": 239
},
{
"completion_length": 128.1875,
"epoch": 0.2569249297470895,
"grad_norm": 2.0204544067382812,
"kl": 0.09938508365303278,
"learning_rate": 4.636470841752405e-06,
"loss": 0.004,
"reward": 1.7635936960577965,
"reward_std": 0.65199055057019,
"rewards/correctness_reward_func": 0.96875,
"rewards/int_reward_func": 0.390625,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0390625,
"rewards/xmlcount_reward_func": 0.36515624821186066,
"step": 240
},
{
"completion_length": 95.03125,
"epoch": 0.2579954502877024,
"grad_norm": 2.7629244327545166,
"kl": 0.16699408926069736,
"learning_rate": 4.631600410885231e-06,
"loss": 0.0067,
"reward": 2.553484320640564,
"reward_std": 0.6153040612116456,
"rewards/correctness_reward_func": 1.65625,
"rewards/int_reward_func": 0.4453125,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0234375,
"rewards/xmlcount_reward_func": 0.42848438024520874,
"step": 241
},
{
"completion_length": 91.640625,
"epoch": 0.2590659708283153,
"grad_norm": 2.4757933616638184,
"kl": 0.12886409275233746,
"learning_rate": 4.626700164218349e-06,
"loss": 0.0052,
"reward": 2.60553115606308,
"reward_std": 0.6941613564267755,
"rewards/correctness_reward_func": 1.6875,
"rewards/int_reward_func": 0.46875,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.015625,
"rewards/xmlcount_reward_func": 0.4336562491953373,
"step": 242
},
{
"completion_length": 88.0,
"epoch": 0.26013649136892814,
"grad_norm": 5.133749961853027,
"kl": 0.3455530842766166,
"learning_rate": 4.621770170294049e-06,
"loss": 0.0138,
"reward": 1.829562470316887,
"reward_std": 0.8116088081151247,
"rewards/correctness_reward_func": 1.0,
"rewards/int_reward_func": 0.421875,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.015625,
"rewards/xmlcount_reward_func": 0.3920625038444996,
"step": 243
},
{
"completion_length": 106.015625,
"epoch": 0.261207011909541,
"grad_norm": 2.4294216632843018,
"kl": 0.14171195961534977,
"learning_rate": 4.6168104980707105e-06,
"loss": 0.0057,
"reward": 2.2230467945337296,
"reward_std": 0.9806447625160217,
"rewards/correctness_reward_func": 1.3125,
"rewards/int_reward_func": 0.4453125,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0390625,
"rewards/xmlcount_reward_func": 0.4261718839406967,
"step": 244
},
{
"completion_length": 97.625,
"epoch": 0.2622775324501539,
"grad_norm": 6.696681976318359,
"kl": 0.5324421431869268,
"learning_rate": 4.61182121692184e-06,
"loss": 0.0213,
"reward": 2.5623437613248825,
"reward_std": 0.7769194557331502,
"rewards/correctness_reward_func": 1.65625,
"rewards/int_reward_func": 0.453125,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0234375,
"rewards/xmlcount_reward_func": 0.4295312501490116,
"step": 245
},
{
"completion_length": 105.328125,
"epoch": 0.26334805299076675,
"grad_norm": 3.7250571250915527,
"kl": 0.3119491417892277,
"learning_rate": 4.606802396635098e-06,
"loss": 0.0125,
"reward": 2.0679530799388885,
"reward_std": 1.0327460495755076,
"rewards/correctness_reward_func": 1.25,
"rewards/int_reward_func": 0.421875,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0078125,
"rewards/xmlcount_reward_func": 0.388265622779727,
"step": 246
},
{
"completion_length": 101.0625,
"epoch": 0.26441857353137965,
"grad_norm": 2.0395541191101074,
"kl": 0.1265430450439453,
"learning_rate": 4.601754107411326e-06,
"loss": 0.0051,
"reward": 2.3826874494552612,
"reward_std": 0.881409777328372,
"rewards/correctness_reward_func": 1.46875,
"rewards/int_reward_func": 0.46875,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.046875,
"rewards/xmlcount_reward_func": 0.3983125016093254,
"step": 247
},
{
"completion_length": 119.25,
"epoch": 0.2654890940719925,
"grad_norm": 50.28816223144531,
"kl": 1.0364861502312124,
"learning_rate": 4.596676419863561e-06,
"loss": 0.0415,
"reward": 1.8263437151908875,
"reward_std": 1.0226206295192242,
"rewards/correctness_reward_func": 1.03125,
"rewards/int_reward_func": 0.375,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.046875,
"rewards/xmlcount_reward_func": 0.37321874499320984,
"step": 248
},
{
"completion_length": 113.515625,
"epoch": 0.26655961461260536,
"grad_norm": 18.58391571044922,
"kl": 1.4038016851991415,
"learning_rate": 4.59156940501605e-06,
"loss": 0.0562,
"reward": 2.122187450528145,
"reward_std": 0.7838817811571062,
"rewards/correctness_reward_func": 1.25,
"rewards/int_reward_func": 0.421875,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.046875,
"rewards/xmlcount_reward_func": 0.4034374989569187,
"step": 249
},
{
"completion_length": 77.8125,
"epoch": 0.26763013515321826,
"grad_norm": 2.225803852081299,
"kl": 0.1744153881445527,
"learning_rate": 4.586433134303257e-06,
"loss": 0.007,
"reward": 2.600734308362007,
"reward_std": 0.6244143173098564,
"rewards/correctness_reward_func": 1.59375,
"rewards/int_reward_func": 0.4453125,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.1015625,
"rewards/xmlcount_reward_func": 0.4601093679666519,
"step": 250
}
],
"logging_steps": 1,
"max_steps": 934,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 250,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}