2r32 / checkpoint-350 /trainer_state.json
C10X's picture
Upload folder using huggingface_hub
58960d1 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.0468352736518132,
"eval_steps": 500,
"global_step": 350,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"completion_length": 698.1666870117188,
"epoch": 0.00013381506757660912,
"grad_norm": 0.07569596916437149,
"kl": 0.0006024616304785013,
"learning_rate": 6.684491978609626e-09,
"loss": 0.001,
"reward": -1.8359375,
"reward_std": 0.5859375,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -1.8359375,
"step": 1
},
{
"completion_length": 549.0,
"epoch": 0.00026763013515321824,
"grad_norm": 0.10156559199094772,
"kl": 0.0006554799037985504,
"learning_rate": 1.3368983957219251e-08,
"loss": -0.0055,
"reward": -1.21875,
"reward_std": 0.48828125,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -1.21875,
"step": 2
},
{
"completion_length": 509.66668701171875,
"epoch": 0.0004014452027298274,
"grad_norm": 0.1012749969959259,
"kl": 0.0006122777122072875,
"learning_rate": 2.005347593582888e-08,
"loss": 0.0032,
"reward": -1.2578125,
"reward_std": 0.37890625,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -1.2578125,
"step": 3
},
{
"completion_length": 387.8333435058594,
"epoch": 0.0005352602703064365,
"grad_norm": 0.10009913891553879,
"kl": 0.0005205385386943817,
"learning_rate": 2.6737967914438503e-08,
"loss": 0.0007,
"reward": -0.83203125,
"reward_std": 0.1640625,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.83203125,
"step": 4
},
{
"completion_length": 386.5,
"epoch": 0.0006690753378830456,
"grad_norm": 0.11404310166835785,
"kl": 0.00039041676791384816,
"learning_rate": 3.342245989304813e-08,
"loss": -0.0032,
"reward": -0.859375,
"reward_std": 0.1630859375,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.859375,
"step": 5
},
{
"completion_length": 345.8333435058594,
"epoch": 0.0008028904054596548,
"grad_norm": 0.13447555899620056,
"kl": 0.0005453471094369888,
"learning_rate": 4.010695187165776e-08,
"loss": 0.0036,
"reward": -0.7109375,
"reward_std": 0.357421875,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.7109375,
"step": 6
},
{
"completion_length": 426.16668701171875,
"epoch": 0.0009367054730362638,
"grad_norm": 0.1324268877506256,
"kl": 0.000606791814789176,
"learning_rate": 4.679144385026738e-08,
"loss": 0.0017,
"reward": -1.09375,
"reward_std": 0.72265625,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -1.09375,
"step": 7
},
{
"completion_length": 418.66668701171875,
"epoch": 0.001070520540612873,
"grad_norm": 0.12978878617286682,
"kl": 0.0005688891978934407,
"learning_rate": 5.3475935828877005e-08,
"loss": 0.0005,
"reward": -0.89453125,
"reward_std": 0.392578125,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.89453125,
"step": 8
},
{
"completion_length": 484.5,
"epoch": 0.0012043356081894822,
"grad_norm": 0.09955421835184097,
"kl": 0.0005112257204018533,
"learning_rate": 6.016042780748664e-08,
"loss": 0.0067,
"reward": -1.25,
"reward_std": 0.53515625,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -1.25,
"step": 9
},
{
"completion_length": 230.5,
"epoch": 0.0013381506757660913,
"grad_norm": 0.19524620473384857,
"kl": 0.0006619760533794761,
"learning_rate": 6.684491978609626e-08,
"loss": -0.0006,
"reward": -0.26953125,
"reward_std": 0.349609375,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.26953125,
"step": 10
},
{
"completion_length": 369.5,
"epoch": 0.0014719657433427003,
"grad_norm": 0.1019153892993927,
"kl": 0.0006552126724272966,
"learning_rate": 7.352941176470589e-08,
"loss": -0.004,
"reward": -0.94140625,
"reward_std": 0.279296875,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.94140625,
"step": 11
},
{
"completion_length": 386.16668701171875,
"epoch": 0.0016057808109193096,
"grad_norm": 0.09696059674024582,
"kl": 0.0004603694542311132,
"learning_rate": 8.021390374331552e-08,
"loss": 0.002,
"reward": -0.8671875,
"reward_std": 0.42578125,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.8671875,
"step": 12
},
{
"completion_length": 475.16668701171875,
"epoch": 0.0017395958784959186,
"grad_norm": 0.12413895130157471,
"kl": 0.0004793051048181951,
"learning_rate": 8.689839572192514e-08,
"loss": 0.0,
"reward": -0.9375,
"reward_std": 0.28125,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.9375,
"step": 13
},
{
"completion_length": 370.0,
"epoch": 0.0018734109460725277,
"grad_norm": 0.1305382251739502,
"kl": 0.0005513830110430717,
"learning_rate": 9.358288770053476e-08,
"loss": -0.0018,
"reward": -0.78515625,
"reward_std": 0.263671875,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.78515625,
"step": 14
},
{
"completion_length": 431.66668701171875,
"epoch": 0.002007226013649137,
"grad_norm": 0.10463520139455795,
"kl": 0.00048596435226500034,
"learning_rate": 1.0026737967914439e-07,
"loss": 0.0032,
"reward": -0.84375,
"reward_std": 0.3984375,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.84375,
"step": 15
},
{
"completion_length": 399.5,
"epoch": 0.002141041081225746,
"grad_norm": 0.1404961347579956,
"kl": 0.000555322621949017,
"learning_rate": 1.0695187165775401e-07,
"loss": -0.0057,
"reward": -1.0625,
"reward_std": 0.46484375,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -1.0625,
"step": 16
},
{
"completion_length": 449.8333435058594,
"epoch": 0.002274856148802355,
"grad_norm": 0.10250594466924667,
"kl": 0.00048121344298124313,
"learning_rate": 1.1363636363636364e-07,
"loss": -0.0071,
"reward": -1.0234375,
"reward_std": 0.40234375,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -1.0234375,
"step": 17
},
{
"completion_length": 324.5,
"epoch": 0.0024086712163789645,
"grad_norm": 0.12464314699172974,
"kl": 0.0005811881856061518,
"learning_rate": 1.2032085561497328e-07,
"loss": 0.0033,
"reward": -0.6875,
"reward_std": 0.26171875,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.6875,
"step": 18
},
{
"completion_length": 578.0,
"epoch": 0.0025424862839555735,
"grad_norm": 0.08823499828577042,
"kl": 0.000675913121085614,
"learning_rate": 1.270053475935829e-07,
"loss": 0.0075,
"reward": -1.703125,
"reward_std": 0.5703125,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -1.703125,
"step": 19
},
{
"completion_length": 328.8333435058594,
"epoch": 0.0026763013515321826,
"grad_norm": 0.16708222031593323,
"kl": 0.0006092819385230541,
"learning_rate": 1.3368983957219251e-07,
"loss": 0.0091,
"reward": -0.71484375,
"reward_std": 0.1357421875,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.71484375,
"step": 20
},
{
"completion_length": 415.16668701171875,
"epoch": 0.0028101164191087916,
"grad_norm": 0.10446464270353317,
"kl": 0.0004726095939986408,
"learning_rate": 1.4037433155080215e-07,
"loss": 0.0011,
"reward": -1.0,
"reward_std": 0.50390625,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -1.0,
"step": 21
},
{
"completion_length": 398.5,
"epoch": 0.0029439314866854006,
"grad_norm": 0.10892236977815628,
"kl": 0.000556222046725452,
"learning_rate": 1.4705882352941178e-07,
"loss": 0.0016,
"reward": -0.9765625,
"reward_std": 0.349609375,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.9765625,
"step": 22
},
{
"completion_length": 351.66668701171875,
"epoch": 0.00307774655426201,
"grad_norm": 0.13707049190998077,
"kl": 0.0005205090856179595,
"learning_rate": 1.537433155080214e-07,
"loss": -0.0032,
"reward": -0.7421875,
"reward_std": 0.390625,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.7421875,
"step": 23
},
{
"completion_length": 310.0,
"epoch": 0.003211561621838619,
"grad_norm": 0.1579124480485916,
"kl": 0.0007410722319036722,
"learning_rate": 1.6042780748663104e-07,
"loss": 0.0002,
"reward": -0.65625,
"reward_std": 0.6328125,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.65625,
"step": 24
},
{
"completion_length": 353.0,
"epoch": 0.003345376689415228,
"grad_norm": 0.11555790901184082,
"kl": 0.0005753459990955889,
"learning_rate": 1.6711229946524068e-07,
"loss": -0.0034,
"reward": -0.828125,
"reward_std": 0.314453125,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.828125,
"step": 25
},
{
"completion_length": 416.5,
"epoch": 0.0034791917569918372,
"grad_norm": 0.10537782311439514,
"kl": 0.0006076883291825652,
"learning_rate": 1.7379679144385028e-07,
"loss": -0.0068,
"reward": -0.8359375,
"reward_std": 0.30859375,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.8359375,
"step": 26
},
{
"completion_length": 436.0,
"epoch": 0.0036130068245684463,
"grad_norm": 0.12061028182506561,
"kl": 0.0006918934523127973,
"learning_rate": 1.8048128342245991e-07,
"loss": 0.0033,
"reward": -0.91015625,
"reward_std": 0.92578125,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.91015625,
"step": 27
},
{
"completion_length": 447.16668701171875,
"epoch": 0.0037468218921450553,
"grad_norm": 0.11236874759197235,
"kl": 0.0005188498180359602,
"learning_rate": 1.8716577540106952e-07,
"loss": -0.0021,
"reward": -1.078125,
"reward_std": 0.298828125,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -1.078125,
"step": 28
},
{
"completion_length": 524.5,
"epoch": 0.003880636959721665,
"grad_norm": 0.08638511598110199,
"kl": 0.000413873785873875,
"learning_rate": 1.9385026737967918e-07,
"loss": -0.0027,
"reward": -1.1953125,
"reward_std": 0.58203125,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -1.1953125,
"step": 29
},
{
"completion_length": 433.0,
"epoch": 0.004014452027298274,
"grad_norm": 0.10361335426568985,
"kl": 0.0005174180259928107,
"learning_rate": 2.0053475935828878e-07,
"loss": -0.001,
"reward": -0.8125,
"reward_std": 0.55078125,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.8125,
"step": 30
},
{
"completion_length": 429.3333435058594,
"epoch": 0.004148267094874883,
"grad_norm": 0.09831919521093369,
"kl": 0.0004531377926468849,
"learning_rate": 2.0721925133689842e-07,
"loss": -0.0034,
"reward": -0.82421875,
"reward_std": 0.412109375,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.82421875,
"step": 31
},
{
"completion_length": 352.3333435058594,
"epoch": 0.004282082162451492,
"grad_norm": 0.1168479472398758,
"kl": 0.00041617939132265747,
"learning_rate": 2.1390374331550802e-07,
"loss": 0.012,
"reward": -0.671875,
"reward_std": 0.1455078125,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.671875,
"step": 32
},
{
"completion_length": 328.16668701171875,
"epoch": 0.004415897230028101,
"grad_norm": 0.14010493457317352,
"kl": 0.0006999190663918853,
"learning_rate": 2.2058823529411768e-07,
"loss": -0.0003,
"reward": -0.8203125,
"reward_std": 0.4609375,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.8203125,
"step": 33
},
{
"completion_length": 461.0,
"epoch": 0.00454971229760471,
"grad_norm": 0.07955824583768845,
"kl": 0.000317567668389529,
"learning_rate": 2.2727272727272729e-07,
"loss": 0.0061,
"reward": -0.7421875,
"reward_std": 0.107421875,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.7421875,
"step": 34
},
{
"completion_length": 343.5,
"epoch": 0.004683527365181319,
"grad_norm": 0.192967027425766,
"kl": 0.0003919226583093405,
"learning_rate": 2.3395721925133692e-07,
"loss": -0.0026,
"reward": -0.71875,
"reward_std": 0.26953125,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.71875,
"step": 35
},
{
"completion_length": 468.0,
"epoch": 0.004817342432757929,
"grad_norm": 0.1151042953133583,
"kl": 0.0005731440032832325,
"learning_rate": 2.4064171122994655e-07,
"loss": 0.0008,
"reward": -0.90625,
"reward_std": 0.4375,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.90625,
"step": 36
},
{
"completion_length": 326.66668701171875,
"epoch": 0.004951157500334538,
"grad_norm": 0.13014303147792816,
"kl": 0.0006222401279956102,
"learning_rate": 2.473262032085562e-07,
"loss": 0.0073,
"reward": -0.58984375,
"reward_std": 0.2177734375,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.58984375,
"step": 37
},
{
"completion_length": 470.16668701171875,
"epoch": 0.005084972567911147,
"grad_norm": 0.10929639637470245,
"kl": 0.0005664956988766789,
"learning_rate": 2.540106951871658e-07,
"loss": -0.001,
"reward": -1.2109375,
"reward_std": 0.451171875,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -1.2109375,
"step": 38
},
{
"completion_length": 350.5,
"epoch": 0.005218787635487756,
"grad_norm": 0.121163509786129,
"kl": 0.0006041490705683827,
"learning_rate": 2.606951871657754e-07,
"loss": -0.0012,
"reward": -0.65234375,
"reward_std": 0.404296875,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.65234375,
"step": 39
},
{
"completion_length": 441.5,
"epoch": 0.005352602703064365,
"grad_norm": 0.09024005383253098,
"kl": 0.0005421562236733735,
"learning_rate": 2.6737967914438503e-07,
"loss": 0.0012,
"reward": -0.765625,
"reward_std": 0.7734375,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.765625,
"step": 40
},
{
"completion_length": 587.8333740234375,
"epoch": 0.005486417770640974,
"grad_norm": 0.11247697472572327,
"kl": 0.0009425554308108985,
"learning_rate": 2.740641711229947e-07,
"loss": -0.0003,
"reward": -1.6875,
"reward_std": 0.6171875,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -1.6875,
"step": 41
},
{
"completion_length": 523.5,
"epoch": 0.005620232838217583,
"grad_norm": 0.08999153226613998,
"kl": 0.0004692915244959295,
"learning_rate": 2.807486631016043e-07,
"loss": 0.0003,
"reward": -0.796875,
"reward_std": 1.375,
"rewards/correctness_reward_func": 0.333984375,
"rewards/int_reward_func": 0.08349609375,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -1.2109375,
"step": 42
},
{
"completion_length": 531.0,
"epoch": 0.005754047905794192,
"grad_norm": 0.09678950905799866,
"kl": 0.0005171874072402716,
"learning_rate": 2.8743315508021395e-07,
"loss": -0.0044,
"reward": -1.421875,
"reward_std": 0.4140625,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -1.421875,
"step": 43
},
{
"completion_length": 416.0,
"epoch": 0.005887862973370801,
"grad_norm": 0.11189436912536621,
"kl": 0.00040408255881629884,
"learning_rate": 2.9411764705882356e-07,
"loss": 0.0029,
"reward": -0.921875,
"reward_std": 0.1884765625,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.921875,
"step": 44
},
{
"completion_length": 359.8333435058594,
"epoch": 0.00602167804094741,
"grad_norm": 0.10176176577806473,
"kl": 0.0005236775032244623,
"learning_rate": 3.0080213903743316e-07,
"loss": -0.0032,
"reward": -0.58203125,
"reward_std": 0.4140625,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.58203125,
"step": 45
},
{
"completion_length": 416.8333435058594,
"epoch": 0.00615549310852402,
"grad_norm": 0.12203460931777954,
"kl": 0.0006941946921870112,
"learning_rate": 3.074866310160428e-07,
"loss": 0.0008,
"reward": -1.15625,
"reward_std": 0.50390625,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -1.15625,
"step": 46
},
{
"completion_length": 524.8333740234375,
"epoch": 0.006289308176100629,
"grad_norm": 0.09807480126619339,
"kl": 0.000624034320935607,
"learning_rate": 3.1417112299465243e-07,
"loss": -0.0062,
"reward": -1.1875,
"reward_std": 0.396484375,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -1.1875,
"step": 47
},
{
"completion_length": 419.5,
"epoch": 0.006423123243677238,
"grad_norm": 0.11151473969221115,
"kl": 0.0005637712310999632,
"learning_rate": 3.208556149732621e-07,
"loss": -0.0017,
"reward": -0.94140625,
"reward_std": 0.5546875,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.94140625,
"step": 48
},
{
"completion_length": 375.3333435058594,
"epoch": 0.006556938311253847,
"grad_norm": 0.15443629026412964,
"kl": 0.0006895489059388638,
"learning_rate": 3.275401069518717e-07,
"loss": -0.0021,
"reward": -0.80078125,
"reward_std": 0.6015625,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.80078125,
"step": 49
},
{
"completion_length": 538.5,
"epoch": 0.006690753378830456,
"grad_norm": 0.12232497334480286,
"kl": 0.00044502606033347547,
"learning_rate": 3.3422459893048135e-07,
"loss": 0.0038,
"reward": -1.15625,
"reward_std": 0.345703125,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -1.15625,
"step": 50
},
{
"completion_length": 380.66668701171875,
"epoch": 0.006824568446407065,
"grad_norm": 0.09400169551372528,
"kl": 0.0004400149919092655,
"learning_rate": 3.409090909090909e-07,
"loss": -0.0005,
"reward": -0.75390625,
"reward_std": 0.75390625,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.75390625,
"step": 51
},
{
"completion_length": 302.16668701171875,
"epoch": 0.0069583835139836745,
"grad_norm": 0.18885326385498047,
"kl": 0.0006017067935317755,
"learning_rate": 3.4759358288770056e-07,
"loss": 0.0001,
"reward": -0.494140625,
"reward_std": 0.5078125,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.494140625,
"step": 52
},
{
"completion_length": 310.0,
"epoch": 0.0070921985815602835,
"grad_norm": 0.17508742213249207,
"kl": 0.0006495526758953929,
"learning_rate": 3.542780748663102e-07,
"loss": 0.0001,
"reward": -0.609375,
"reward_std": 0.208984375,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.609375,
"step": 53
},
{
"completion_length": 348.16668701171875,
"epoch": 0.0072260136491368926,
"grad_norm": 0.1143779456615448,
"kl": 0.0005849208100698888,
"learning_rate": 3.6096256684491983e-07,
"loss": -0.0023,
"reward": -0.8984375,
"reward_std": 0.423828125,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.8984375,
"step": 54
},
{
"completion_length": 461.5,
"epoch": 0.007359828716713502,
"grad_norm": 0.10026198625564575,
"kl": 0.0005551945068873465,
"learning_rate": 3.6764705882352943e-07,
"loss": -0.0088,
"reward": -1.03125,
"reward_std": 0.38671875,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -1.03125,
"step": 55
},
{
"completion_length": 581.1666870117188,
"epoch": 0.007493643784290111,
"grad_norm": 0.09014507383108139,
"kl": 0.0004388962115626782,
"learning_rate": 3.7433155080213904e-07,
"loss": 0.0007,
"reward": -1.328125,
"reward_std": 0.1474609375,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -1.328125,
"step": 56
},
{
"completion_length": 320.3333435058594,
"epoch": 0.0076274588518667205,
"grad_norm": 0.09987051039934158,
"kl": 0.0005903591518290341,
"learning_rate": 3.810160427807487e-07,
"loss": -0.0068,
"reward": -0.609375,
"reward_std": 0.2578125,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.609375,
"step": 57
},
{
"completion_length": 362.8333435058594,
"epoch": 0.00776127391944333,
"grad_norm": 0.246050164103508,
"kl": 0.0005056472145952284,
"learning_rate": 3.8770053475935836e-07,
"loss": -0.0027,
"reward": -0.625,
"reward_std": 0.5234375,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.625,
"step": 58
},
{
"completion_length": 524.0,
"epoch": 0.007895088987019938,
"grad_norm": 0.12084438651800156,
"kl": 0.0005575703689828515,
"learning_rate": 3.943850267379679e-07,
"loss": 0.0114,
"reward": -1.1328125,
"reward_std": 0.28125,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -1.1328125,
"step": 59
},
{
"completion_length": 410.3333435058594,
"epoch": 0.008028904054596548,
"grad_norm": 0.10101523995399475,
"kl": 0.0005777844344265759,
"learning_rate": 4.0106951871657757e-07,
"loss": 0.0007,
"reward": -0.94921875,
"reward_std": 0.271484375,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.94921875,
"step": 60
},
{
"completion_length": 717.6666870117188,
"epoch": 0.008162719122173156,
"grad_norm": 0.09220802038908005,
"kl": 0.0006241414812393486,
"learning_rate": 4.077540106951872e-07,
"loss": -0.0078,
"reward": -2.046875,
"reward_std": 0.53515625,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -2.046875,
"step": 61
},
{
"completion_length": 384.66668701171875,
"epoch": 0.008296534189749766,
"grad_norm": 0.10890569537878036,
"kl": 0.00048696936573833227,
"learning_rate": 4.1443850267379683e-07,
"loss": 0.0039,
"reward": -0.921875,
"reward_std": 0.1865234375,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.921875,
"step": 62
},
{
"completion_length": 271.3333435058594,
"epoch": 0.008430349257326376,
"grad_norm": 0.11486776173114777,
"kl": 0.0005599698051810265,
"learning_rate": 4.211229946524065e-07,
"loss": -0.0005,
"reward": -0.201171875,
"reward_std": 0.396484375,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.201171875,
"step": 63
},
{
"completion_length": 385.3333435058594,
"epoch": 0.008564164324902984,
"grad_norm": 0.1193244457244873,
"kl": 0.0006926630157977343,
"learning_rate": 4.2780748663101604e-07,
"loss": 0.0043,
"reward": -0.97265625,
"reward_std": 0.41015625,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.97265625,
"step": 64
},
{
"completion_length": 539.6666870117188,
"epoch": 0.008697979392479594,
"grad_norm": 0.09389720857143402,
"kl": 0.0004780918825417757,
"learning_rate": 4.344919786096257e-07,
"loss": 0.0052,
"reward": -1.234375,
"reward_std": 0.373046875,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -1.234375,
"step": 65
},
{
"completion_length": 259.8333435058594,
"epoch": 0.008831794460056202,
"grad_norm": 0.22691243886947632,
"kl": 0.0008878613589331508,
"learning_rate": 4.4117647058823536e-07,
"loss": -0.0048,
"reward": -0.51953125,
"reward_std": 0.251953125,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.51953125,
"step": 66
},
{
"completion_length": 461.16668701171875,
"epoch": 0.008965609527632812,
"grad_norm": 0.12113010138273239,
"kl": 0.0006317974766716361,
"learning_rate": 4.4786096256684497e-07,
"loss": -0.0067,
"reward": -1.15625,
"reward_std": 0.39453125,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -1.15625,
"step": 67
},
{
"completion_length": 399.8333435058594,
"epoch": 0.00909942459520942,
"grad_norm": 0.1679317206144333,
"kl": 0.000584149791393429,
"learning_rate": 4.5454545454545457e-07,
"loss": -0.0093,
"reward": -0.9609375,
"reward_std": 0.2060546875,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.9609375,
"step": 68
},
{
"completion_length": 381.0,
"epoch": 0.00923323966278603,
"grad_norm": 0.2019040584564209,
"kl": 0.0007442033383995295,
"learning_rate": 4.612299465240642e-07,
"loss": 0.0034,
"reward": -0.63671875,
"reward_std": 0.494140625,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.63671875,
"step": 69
},
{
"completion_length": 455.5,
"epoch": 0.009367054730362638,
"grad_norm": 0.09101377427577972,
"kl": 0.00046143907820805907,
"learning_rate": 4.6791443850267384e-07,
"loss": -0.0057,
"reward": -1.046875,
"reward_std": 0.6328125,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -1.046875,
"step": 70
},
{
"completion_length": 493.8333435058594,
"epoch": 0.009500869797939248,
"grad_norm": 0.09268555790185928,
"kl": 0.00048020537360571325,
"learning_rate": 4.745989304812835e-07,
"loss": -0.0021,
"reward": -1.28125,
"reward_std": 0.54296875,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -1.28125,
"step": 71
},
{
"completion_length": 601.6666870117188,
"epoch": 0.009634684865515858,
"grad_norm": 0.07598231732845306,
"kl": 0.0004928440321236849,
"learning_rate": 4.812834224598931e-07,
"loss": -0.0031,
"reward": -1.328125,
"reward_std": 0.74609375,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -1.328125,
"step": 72
},
{
"completion_length": 449.16668701171875,
"epoch": 0.009768499933092466,
"grad_norm": 0.1203397586941719,
"kl": 0.0006244009709917009,
"learning_rate": 4.879679144385027e-07,
"loss": -0.0055,
"reward": -1.0703125,
"reward_std": 0.474609375,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -1.0703125,
"step": 73
},
{
"completion_length": 499.0,
"epoch": 0.009902315000669076,
"grad_norm": 0.08029637485742569,
"kl": 0.0004115910269320011,
"learning_rate": 4.946524064171124e-07,
"loss": 0.001,
"reward": -1.1953125,
"reward_std": 0.5703125,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -1.1953125,
"step": 74
},
{
"completion_length": 318.8333435058594,
"epoch": 0.010036130068245684,
"grad_norm": 0.10725877434015274,
"kl": 0.0005362802767194808,
"learning_rate": 5.013368983957219e-07,
"loss": -0.0039,
"reward": -0.38671875,
"reward_std": 0.25390625,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.38671875,
"step": 75
},
{
"completion_length": 348.0,
"epoch": 0.010169945135822294,
"grad_norm": 0.1331893801689148,
"kl": 0.0006620581261813641,
"learning_rate": 5.080213903743316e-07,
"loss": -0.0007,
"reward": -0.84765625,
"reward_std": 0.486328125,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.84765625,
"step": 76
},
{
"completion_length": 456.66668701171875,
"epoch": 0.010303760203398902,
"grad_norm": 0.10820724815130234,
"kl": 0.0007615931099280715,
"learning_rate": 5.147058823529412e-07,
"loss": 0.0036,
"reward": -0.953125,
"reward_std": 0.59765625,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.953125,
"step": 77
},
{
"completion_length": 335.66668701171875,
"epoch": 0.010437575270975512,
"grad_norm": 0.13866935670375824,
"kl": 0.0005373357562348247,
"learning_rate": 5.213903743315508e-07,
"loss": 0.0013,
"reward": -0.58203125,
"reward_std": 0.3359375,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.58203125,
"step": 78
},
{
"completion_length": 338.0,
"epoch": 0.01057139033855212,
"grad_norm": 0.1531476229429245,
"kl": 0.000613297161180526,
"learning_rate": 5.280748663101604e-07,
"loss": -0.0006,
"reward": -0.6875,
"reward_std": 0.380859375,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.6875,
"step": 79
},
{
"completion_length": 323.8333435058594,
"epoch": 0.01070520540612873,
"grad_norm": 0.11174651980400085,
"kl": 0.00048220629105344415,
"learning_rate": 5.347593582887701e-07,
"loss": 0.0042,
"reward": -0.5546875,
"reward_std": 0.099609375,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.5546875,
"step": 80
},
{
"completion_length": 425.0,
"epoch": 0.010839020473705338,
"grad_norm": 0.06810642778873444,
"kl": 0.0002865367860067636,
"learning_rate": 5.414438502673798e-07,
"loss": 0.0087,
"reward": -0.921875,
"reward_std": 0.1455078125,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.921875,
"step": 81
},
{
"completion_length": 356.0,
"epoch": 0.010972835541281948,
"grad_norm": 0.12943901121616364,
"kl": 0.0005909207975491881,
"learning_rate": 5.481283422459894e-07,
"loss": -0.0014,
"reward": -0.80078125,
"reward_std": 0.2373046875,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.80078125,
"step": 82
},
{
"completion_length": 408.66668701171875,
"epoch": 0.011106650608858558,
"grad_norm": 0.10401128232479095,
"kl": 0.0005684032803401351,
"learning_rate": 5.54812834224599e-07,
"loss": 0.0067,
"reward": -0.828125,
"reward_std": 0.279296875,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.828125,
"step": 83
},
{
"completion_length": 365.5,
"epoch": 0.011240465676435166,
"grad_norm": 0.11247576773166656,
"kl": 0.0005961977876722813,
"learning_rate": 5.614973262032086e-07,
"loss": 0.0049,
"reward": -0.7890625,
"reward_std": 0.43359375,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.7890625,
"step": 84
},
{
"completion_length": 373.16668701171875,
"epoch": 0.011374280744011776,
"grad_norm": 0.13172324001789093,
"kl": 0.0006587211973965168,
"learning_rate": 5.681818181818182e-07,
"loss": 0.0,
"reward": -0.92578125,
"reward_std": 0.4296875,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.92578125,
"step": 85
},
{
"completion_length": 463.16668701171875,
"epoch": 0.011508095811588384,
"grad_norm": 0.0999283418059349,
"kl": 0.0005339820636436343,
"learning_rate": 5.748663101604279e-07,
"loss": -0.0073,
"reward": -1.1640625,
"reward_std": 0.478515625,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -1.1640625,
"step": 86
},
{
"completion_length": 346.16668701171875,
"epoch": 0.011641910879164994,
"grad_norm": 0.1427423655986786,
"kl": 0.0005965695017948747,
"learning_rate": 5.815508021390375e-07,
"loss": 0.0052,
"reward": -0.86328125,
"reward_std": 0.36328125,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.86328125,
"step": 87
},
{
"completion_length": 464.3333435058594,
"epoch": 0.011775725946741603,
"grad_norm": 0.09077266603708267,
"kl": 0.0005941446870565414,
"learning_rate": 5.882352941176471e-07,
"loss": -0.0034,
"reward": -0.96875,
"reward_std": 0.3203125,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.96875,
"step": 88
},
{
"completion_length": 444.3333435058594,
"epoch": 0.011909541014318212,
"grad_norm": 0.11555906385183334,
"kl": 0.0005076751112937927,
"learning_rate": 5.949197860962567e-07,
"loss": 0.0,
"reward": -0.98046875,
"reward_std": 0.3828125,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.98046875,
"step": 89
},
{
"completion_length": 381.66668701171875,
"epoch": 0.01204335608189482,
"grad_norm": 0.11006759107112885,
"kl": 0.0005213702679611742,
"learning_rate": 6.016042780748663e-07,
"loss": -0.0019,
"reward": -0.8125,
"reward_std": 0.51171875,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.8125,
"step": 90
},
{
"completion_length": 586.6666870117188,
"epoch": 0.01217717114947143,
"grad_norm": 0.0977427214384079,
"kl": 0.0004677172692026943,
"learning_rate": 6.08288770053476e-07,
"loss": 0.0059,
"reward": -1.2265625,
"reward_std": 0.5703125,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -1.2265625,
"step": 91
},
{
"completion_length": 523.5,
"epoch": 0.01231098621704804,
"grad_norm": 0.14523504674434662,
"kl": 0.0007363607874140143,
"learning_rate": 6.149732620320856e-07,
"loss": -0.0029,
"reward": -1.484375,
"reward_std": 0.828125,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -1.484375,
"step": 92
},
{
"completion_length": 452.5,
"epoch": 0.012444801284624649,
"grad_norm": 0.11730131506919861,
"kl": 0.0004316701088100672,
"learning_rate": 6.216577540106952e-07,
"loss": 0.0008,
"reward": -1.328125,
"reward_std": 0.82421875,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -1.328125,
"step": 93
},
{
"completion_length": 471.66668701171875,
"epoch": 0.012578616352201259,
"grad_norm": 0.12073160707950592,
"kl": 0.000504339870531112,
"learning_rate": 6.283422459893049e-07,
"loss": 0.0029,
"reward": -1.1796875,
"reward_std": 0.40625,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -1.1796875,
"step": 94
},
{
"completion_length": 291.66668701171875,
"epoch": 0.012712431419777867,
"grad_norm": 0.17788150906562805,
"kl": 0.0006855755927972496,
"learning_rate": 6.350267379679146e-07,
"loss": -0.0016,
"reward": -0.494140625,
"reward_std": 0.26171875,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.494140625,
"step": 95
},
{
"completion_length": 411.8333435058594,
"epoch": 0.012846246487354477,
"grad_norm": 0.08730936795473099,
"kl": 0.00039596876013092697,
"learning_rate": 6.417112299465242e-07,
"loss": 0.0014,
"reward": -0.890625,
"reward_std": 0.498046875,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.890625,
"step": 96
},
{
"completion_length": 376.5,
"epoch": 0.012980061554931085,
"grad_norm": 0.09182324260473251,
"kl": 0.0004653404466807842,
"learning_rate": 6.483957219251337e-07,
"loss": -0.0036,
"reward": -0.73828125,
"reward_std": 0.33203125,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.73828125,
"step": 97
},
{
"completion_length": 395.66668701171875,
"epoch": 0.013113876622507695,
"grad_norm": 0.12949968874454498,
"kl": 0.0005730512784793973,
"learning_rate": 6.550802139037434e-07,
"loss": -0.0027,
"reward": -0.8359375,
"reward_std": 0.49609375,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.8359375,
"step": 98
},
{
"completion_length": 600.0,
"epoch": 0.013247691690084303,
"grad_norm": 0.08795811235904694,
"kl": 0.000673401344101876,
"learning_rate": 6.61764705882353e-07,
"loss": 0.0018,
"reward": -1.78125,
"reward_std": 0.5078125,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -1.78125,
"step": 99
},
{
"completion_length": 468.0,
"epoch": 0.013381506757660913,
"grad_norm": 0.1182761937379837,
"kl": 0.0005245240754447877,
"learning_rate": 6.684491978609627e-07,
"loss": 0.0088,
"reward": -1.234375,
"reward_std": 0.4296875,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -1.234375,
"step": 100
},
{
"completion_length": 429.8333435058594,
"epoch": 0.013515321825237521,
"grad_norm": 0.10523517429828644,
"kl": 0.0004888825351372361,
"learning_rate": 6.751336898395723e-07,
"loss": 0.0049,
"reward": -0.8671875,
"reward_std": 0.36328125,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.8671875,
"step": 101
},
{
"completion_length": 350.16668701171875,
"epoch": 0.01364913689281413,
"grad_norm": 0.1099957525730133,
"kl": 0.0004989251610822976,
"learning_rate": 6.818181818181818e-07,
"loss": -0.0032,
"reward": -0.6171875,
"reward_std": 0.2275390625,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.6171875,
"step": 102
},
{
"completion_length": 492.0,
"epoch": 0.01378295196039074,
"grad_norm": 0.096939817070961,
"kl": 0.0006083787302486598,
"learning_rate": 6.885026737967915e-07,
"loss": -0.0012,
"reward": -0.84765625,
"reward_std": 0.5078125,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.84765625,
"step": 103
},
{
"completion_length": 520.8333740234375,
"epoch": 0.013916767027967349,
"grad_norm": 0.09864147007465363,
"kl": 0.0004971123998984694,
"learning_rate": 6.951871657754011e-07,
"loss": 0.0051,
"reward": -1.1796875,
"reward_std": 0.341796875,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -1.1796875,
"step": 104
},
{
"completion_length": 528.0,
"epoch": 0.014050582095543959,
"grad_norm": 0.08159384876489639,
"kl": 0.0003954106941819191,
"learning_rate": 7.018716577540107e-07,
"loss": 0.013,
"reward": -1.265625,
"reward_std": 0.25390625,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -1.265625,
"step": 105
},
{
"completion_length": 357.5,
"epoch": 0.014184397163120567,
"grad_norm": 0.103823222219944,
"kl": 0.0004869327531196177,
"learning_rate": 7.085561497326204e-07,
"loss": 0.0014,
"reward": -0.859375,
"reward_std": 0.470703125,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.859375,
"step": 106
},
{
"completion_length": 475.5,
"epoch": 0.014318212230697177,
"grad_norm": 0.0782044380903244,
"kl": 0.0005046841688454151,
"learning_rate": 7.152406417112299e-07,
"loss": 0.0,
"reward": -1.1484375,
"reward_std": 0.4375,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -1.1484375,
"step": 107
},
{
"completion_length": 467.5,
"epoch": 0.014452027298273785,
"grad_norm": 0.09171518683433533,
"kl": 0.0005273159476928413,
"learning_rate": 7.219251336898397e-07,
"loss": -0.0036,
"reward": -0.921875,
"reward_std": 0.4140625,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.921875,
"step": 108
},
{
"completion_length": 461.16668701171875,
"epoch": 0.014585842365850395,
"grad_norm": 0.09841100871562958,
"kl": 0.0006487583741545677,
"learning_rate": 7.286096256684493e-07,
"loss": 0.0033,
"reward": -0.875,
"reward_std": 0.3671875,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.875,
"step": 109
},
{
"completion_length": 358.5,
"epoch": 0.014719657433427003,
"grad_norm": 0.13746988773345947,
"kl": 0.0004324812616687268,
"learning_rate": 7.352941176470589e-07,
"loss": 0.0025,
"reward": -0.68359375,
"reward_std": 0.279296875,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.68359375,
"step": 110
},
{
"completion_length": 559.5,
"epoch": 0.014853472501003613,
"grad_norm": 0.08793191611766815,
"kl": 0.0005514743970707059,
"learning_rate": 7.419786096256686e-07,
"loss": 0.0003,
"reward": -1.5546875,
"reward_std": 0.404296875,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -1.5546875,
"step": 111
},
{
"completion_length": 396.66668701171875,
"epoch": 0.014987287568580221,
"grad_norm": 0.10710439831018448,
"kl": 0.0004422089259605855,
"learning_rate": 7.486631016042781e-07,
"loss": -0.0019,
"reward": -0.87109375,
"reward_std": 0.265625,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.87109375,
"step": 112
},
{
"completion_length": 474.5,
"epoch": 0.015121102636156831,
"grad_norm": 0.09523480385541916,
"kl": 0.00043510389514267445,
"learning_rate": 7.553475935828877e-07,
"loss": -0.0029,
"reward": -0.94921875,
"reward_std": 0.376953125,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.94921875,
"step": 113
},
{
"completion_length": 385.8333435058594,
"epoch": 0.015254917703733441,
"grad_norm": 0.11645786464214325,
"kl": 0.0005059984978288412,
"learning_rate": 7.620320855614974e-07,
"loss": -0.001,
"reward": -0.8671875,
"reward_std": 0.4921875,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.8671875,
"step": 114
},
{
"completion_length": 380.16668701171875,
"epoch": 0.01538873277131005,
"grad_norm": 0.14121747016906738,
"kl": 0.00044106499990448356,
"learning_rate": 7.68716577540107e-07,
"loss": 0.0038,
"reward": -0.703125,
"reward_std": 0.41796875,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.703125,
"step": 115
},
{
"completion_length": 481.0,
"epoch": 0.01552254783888666,
"grad_norm": 0.10021474212408066,
"kl": 0.0005236791330389678,
"learning_rate": 7.754010695187167e-07,
"loss": -0.0135,
"reward": -1.09375,
"reward_std": 0.291015625,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -1.09375,
"step": 116
},
{
"completion_length": 662.0,
"epoch": 0.01565636290646327,
"grad_norm": 0.08310368657112122,
"kl": 0.0005542068392969668,
"learning_rate": 7.820855614973262e-07,
"loss": 0.0026,
"reward": -1.6015625,
"reward_std": 0.6328125,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -1.6015625,
"step": 117
},
{
"completion_length": 469.3333435058594,
"epoch": 0.015790177974039876,
"grad_norm": 0.08709719032049179,
"kl": 0.000453361077234149,
"learning_rate": 7.887700534759358e-07,
"loss": 0.0008,
"reward": -1.140625,
"reward_std": 0.484375,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -1.140625,
"step": 118
},
{
"completion_length": 397.16668701171875,
"epoch": 0.015923993041616485,
"grad_norm": 0.11706002801656723,
"kl": 0.0006737220101058483,
"learning_rate": 7.954545454545455e-07,
"loss": 0.0036,
"reward": -0.51953125,
"reward_std": 0.4453125,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.51953125,
"step": 119
},
{
"completion_length": 431.0,
"epoch": 0.016057808109193095,
"grad_norm": 0.12378671020269394,
"kl": 0.0005190541851334274,
"learning_rate": 8.021390374331551e-07,
"loss": -0.0026,
"reward": -1.0859375,
"reward_std": 0.5390625,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -1.0859375,
"step": 120
},
{
"completion_length": 585.8333740234375,
"epoch": 0.016191623176769705,
"grad_norm": 0.06980501115322113,
"kl": 0.0005238899611867964,
"learning_rate": 8.088235294117648e-07,
"loss": -0.0045,
"reward": -1.6796875,
"reward_std": 0.275390625,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -1.6796875,
"step": 121
},
{
"completion_length": 289.8333435058594,
"epoch": 0.01632543824434631,
"grad_norm": 0.15975068509578705,
"kl": 0.0007231835625134408,
"learning_rate": 8.155080213903745e-07,
"loss": -0.0021,
"reward": -0.482421875,
"reward_std": 0.318359375,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.482421875,
"step": 122
},
{
"completion_length": 507.3333435058594,
"epoch": 0.01645925331192292,
"grad_norm": 0.11071807146072388,
"kl": 0.0004386794753372669,
"learning_rate": 8.22192513368984e-07,
"loss": -0.0013,
"reward": -0.88671875,
"reward_std": 0.5703125,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.88671875,
"step": 123
},
{
"completion_length": 577.0,
"epoch": 0.01659306837949953,
"grad_norm": 0.06843210011720657,
"kl": 0.0004015905724372715,
"learning_rate": 8.288770053475937e-07,
"loss": -0.0058,
"reward": -1.2734375,
"reward_std": 0.349609375,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -1.2734375,
"step": 124
},
{
"completion_length": 485.3333435058594,
"epoch": 0.01672688344707614,
"grad_norm": 0.11638530343770981,
"kl": 0.000609593465924263,
"learning_rate": 8.355614973262033e-07,
"loss": -0.0047,
"reward": -1.15625,
"reward_std": 0.33984375,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -1.15625,
"step": 125
},
{
"completion_length": 449.8333435058594,
"epoch": 0.01686069851465275,
"grad_norm": 0.06978274881839752,
"kl": 0.00046619633212685585,
"learning_rate": 8.42245989304813e-07,
"loss": -0.0045,
"reward": -0.875,
"reward_std": 0.2734375,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.875,
"step": 126
},
{
"completion_length": 358.8333435058594,
"epoch": 0.016994513582229358,
"grad_norm": 0.12115911394357681,
"kl": 0.0006228546844795346,
"learning_rate": 8.489304812834226e-07,
"loss": 0.0012,
"reward": -0.80078125,
"reward_std": 0.2158203125,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.80078125,
"step": 127
},
{
"completion_length": 340.8333435058594,
"epoch": 0.017128328649805968,
"grad_norm": 0.1330835521221161,
"kl": 0.0006046565249562263,
"learning_rate": 8.556149732620321e-07,
"loss": 0.001,
"reward": -0.75,
"reward_std": 0.2421875,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.75,
"step": 128
},
{
"completion_length": 384.66668701171875,
"epoch": 0.017262143717382578,
"grad_norm": 0.1260133534669876,
"kl": 0.0006517590372823179,
"learning_rate": 8.622994652406418e-07,
"loss": -0.0002,
"reward": -0.875,
"reward_std": 0.71875,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.875,
"step": 129
},
{
"completion_length": 438.16668701171875,
"epoch": 0.017395958784959187,
"grad_norm": 0.10832860320806503,
"kl": 0.0005657231668010354,
"learning_rate": 8.689839572192514e-07,
"loss": 0.0007,
"reward": -1.0234375,
"reward_std": 0.578125,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -1.0234375,
"step": 130
},
{
"completion_length": 395.5,
"epoch": 0.017529773852535794,
"grad_norm": 0.14919129014015198,
"kl": 0.0006569415563717484,
"learning_rate": 8.756684491978611e-07,
"loss": -0.0018,
"reward": -0.94921875,
"reward_std": 0.546875,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.94921875,
"step": 131
},
{
"completion_length": 391.16668701171875,
"epoch": 0.017663588920112404,
"grad_norm": 0.14460250735282898,
"kl": 0.0006637731567025185,
"learning_rate": 8.823529411764707e-07,
"loss": -0.0013,
"reward": -0.94921875,
"reward_std": 0.25,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.94921875,
"step": 132
},
{
"completion_length": 441.3333435058594,
"epoch": 0.017797403987689014,
"grad_norm": 0.15532676875591278,
"kl": 0.0004714071692433208,
"learning_rate": 8.890374331550802e-07,
"loss": 0.0046,
"reward": -0.98828125,
"reward_std": 0.251953125,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.98828125,
"step": 133
},
{
"completion_length": 370.8333435058594,
"epoch": 0.017931219055265624,
"grad_norm": 0.11382456123828888,
"kl": 0.0007278465200215578,
"learning_rate": 8.957219251336899e-07,
"loss": -0.0034,
"reward": -0.74609375,
"reward_std": 0.314453125,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.74609375,
"step": 134
},
{
"completion_length": 400.5,
"epoch": 0.018065034122842234,
"grad_norm": 0.14282457530498505,
"kl": 0.00047517273924313486,
"learning_rate": 9.024064171122995e-07,
"loss": -0.006,
"reward": -0.8359375,
"reward_std": 0.322265625,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.8359375,
"step": 135
},
{
"completion_length": 556.3333740234375,
"epoch": 0.01819884919041884,
"grad_norm": 0.09024691581726074,
"kl": 0.00043525476939976215,
"learning_rate": 9.090909090909091e-07,
"loss": -0.0013,
"reward": -1.4296875,
"reward_std": 0.60546875,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -1.4296875,
"step": 136
},
{
"completion_length": 301.66668701171875,
"epoch": 0.01833266425799545,
"grad_norm": 0.1422368437051773,
"kl": 0.0007189570460468531,
"learning_rate": 9.157754010695189e-07,
"loss": -0.0065,
"reward": -0.5859375,
"reward_std": 0.28125,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.5859375,
"step": 137
},
{
"completion_length": 586.0,
"epoch": 0.01846647932557206,
"grad_norm": 0.07946749031543732,
"kl": 0.0004385068896226585,
"learning_rate": 9.224598930481284e-07,
"loss": 0.0013,
"reward": -1.421875,
"reward_std": 0.341796875,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -1.421875,
"step": 138
},
{
"completion_length": 377.0,
"epoch": 0.01860029439314867,
"grad_norm": 0.12276607006788254,
"kl": 0.0007007961976341903,
"learning_rate": 9.29144385026738e-07,
"loss": -0.0027,
"reward": -0.9609375,
"reward_std": 0.48046875,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.9609375,
"step": 139
},
{
"completion_length": 323.8333435058594,
"epoch": 0.018734109460725276,
"grad_norm": 0.11141712218523026,
"kl": 0.0004957327037118375,
"learning_rate": 9.358288770053477e-07,
"loss": -0.0023,
"reward": -0.7109375,
"reward_std": 0.458984375,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.7109375,
"step": 140
},
{
"completion_length": 451.3333435058594,
"epoch": 0.018867924528301886,
"grad_norm": 0.08641522377729416,
"kl": 0.0004216538218315691,
"learning_rate": 9.425133689839573e-07,
"loss": -0.0001,
"reward": -0.89453125,
"reward_std": 0.33984375,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.89453125,
"step": 141
},
{
"completion_length": 373.66668701171875,
"epoch": 0.019001739595878496,
"grad_norm": 0.11334878951311111,
"kl": 0.0005491083720698953,
"learning_rate": 9.49197860962567e-07,
"loss": -0.0032,
"reward": -0.89453125,
"reward_std": 0.40625,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.89453125,
"step": 142
},
{
"completion_length": 365.66668701171875,
"epoch": 0.019135554663455106,
"grad_norm": 0.1251085102558136,
"kl": 0.0006641787476837635,
"learning_rate": 9.558823529411764e-07,
"loss": -0.0044,
"reward": -0.83984375,
"reward_std": 0.302734375,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.83984375,
"step": 143
},
{
"completion_length": 474.66668701171875,
"epoch": 0.019269369731031716,
"grad_norm": 0.09395861625671387,
"kl": 0.0006498050643131137,
"learning_rate": 9.625668449197862e-07,
"loss": 0.0017,
"reward": -1.1328125,
"reward_std": 0.408203125,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -1.1328125,
"step": 144
},
{
"completion_length": 323.3333435058594,
"epoch": 0.019403184798608322,
"grad_norm": 0.12278474867343903,
"kl": 0.0006131879054009914,
"learning_rate": 9.692513368983958e-07,
"loss": 0.0004,
"reward": -0.7265625,
"reward_std": 0.275390625,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.7265625,
"step": 145
},
{
"completion_length": 377.3333435058594,
"epoch": 0.019536999866184932,
"grad_norm": 0.11732782423496246,
"kl": 0.0005958870751783252,
"learning_rate": 9.759358288770054e-07,
"loss": -0.0029,
"reward": -0.83203125,
"reward_std": 0.5703125,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.83203125,
"step": 146
},
{
"completion_length": 440.5,
"epoch": 0.019670814933761542,
"grad_norm": 0.1868724673986435,
"kl": 0.000681176024954766,
"learning_rate": 9.82620320855615e-07,
"loss": -0.0068,
"reward": -1.09375,
"reward_std": 0.53125,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -1.09375,
"step": 147
},
{
"completion_length": 340.8333435058594,
"epoch": 0.019804630001338152,
"grad_norm": 0.1497308760881424,
"kl": 0.0005646012723445892,
"learning_rate": 9.893048128342248e-07,
"loss": -0.0049,
"reward": -0.8125,
"reward_std": 0.396484375,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.8125,
"step": 148
},
{
"completion_length": 570.8333740234375,
"epoch": 0.01993844506891476,
"grad_norm": 0.09082633256912231,
"kl": 0.0005208106595091522,
"learning_rate": 9.959893048128342e-07,
"loss": -0.0026,
"reward": -1.1328125,
"reward_std": 0.671875,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -1.1328125,
"step": 149
},
{
"completion_length": 381.3333435058594,
"epoch": 0.02007226013649137,
"grad_norm": 0.11392635107040405,
"kl": 0.0005461536347866058,
"learning_rate": 1.0026737967914438e-06,
"loss": -0.0045,
"reward": -0.65234375,
"reward_std": 0.46484375,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.65234375,
"step": 150
},
{
"completion_length": 512.8333740234375,
"epoch": 0.020206075204067978,
"grad_norm": 0.10393022745847702,
"kl": 0.0005140831926837564,
"learning_rate": 1.0093582887700537e-06,
"loss": 0.0034,
"reward": -1.3125,
"reward_std": 0.470703125,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -1.3125,
"step": 151
},
{
"completion_length": 352.8333435058594,
"epoch": 0.020339890271644588,
"grad_norm": 0.14165768027305603,
"kl": 0.000577162834815681,
"learning_rate": 1.0160427807486633e-06,
"loss": 0.0018,
"reward": -0.890625,
"reward_std": 0.375,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.890625,
"step": 152
},
{
"completion_length": 379.3333435058594,
"epoch": 0.020473705339221198,
"grad_norm": 0.14219383895397186,
"kl": 0.0006267136195674539,
"learning_rate": 1.0227272727272729e-06,
"loss": -0.0019,
"reward": -0.703125,
"reward_std": 0.484375,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.703125,
"step": 153
},
{
"completion_length": 433.0,
"epoch": 0.020607520406797804,
"grad_norm": 0.09045641869306564,
"kl": 0.0003349175094626844,
"learning_rate": 1.0294117647058825e-06,
"loss": 0.0128,
"reward": -1.1171875,
"reward_std": 0.240234375,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -1.1171875,
"step": 154
},
{
"completion_length": 348.5,
"epoch": 0.020741335474374414,
"grad_norm": 0.1472688764333725,
"kl": 0.0006852279184386134,
"learning_rate": 1.036096256684492e-06,
"loss": -0.0036,
"reward": -0.71875,
"reward_std": 0.279296875,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.71875,
"step": 155
},
{
"completion_length": 388.0,
"epoch": 0.020875150541951024,
"grad_norm": 0.14087940752506256,
"kl": 0.0006020927103236318,
"learning_rate": 1.0427807486631017e-06,
"loss": -0.0057,
"reward": -0.64453125,
"reward_std": 0.32421875,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.64453125,
"step": 156
},
{
"completion_length": 327.66668701171875,
"epoch": 0.021008965609527634,
"grad_norm": 0.13045720756053925,
"kl": 0.0005312262801453471,
"learning_rate": 1.0494652406417113e-06,
"loss": -0.0019,
"reward": -0.53125,
"reward_std": 0.53125,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.53125,
"step": 157
},
{
"completion_length": 409.5,
"epoch": 0.02114278067710424,
"grad_norm": 0.12158454209566116,
"kl": 0.0006615255842916667,
"learning_rate": 1.056149732620321e-06,
"loss": -0.0067,
"reward": -0.80078125,
"reward_std": 0.2490234375,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.80078125,
"step": 158
},
{
"completion_length": 530.6666870117188,
"epoch": 0.02127659574468085,
"grad_norm": 0.1100451648235321,
"kl": 0.0006079694721847773,
"learning_rate": 1.0628342245989305e-06,
"loss": -0.0006,
"reward": -1.5,
"reward_std": 0.64453125,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -1.5,
"step": 159
},
{
"completion_length": 365.0,
"epoch": 0.02141041081225746,
"grad_norm": 0.11980035901069641,
"kl": 0.0005896420334465802,
"learning_rate": 1.0695187165775401e-06,
"loss": -0.0011,
"reward": -0.69921875,
"reward_std": 0.4375,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.69921875,
"step": 160
},
{
"completion_length": 312.5,
"epoch": 0.02154422587983407,
"grad_norm": 0.14624665677547455,
"kl": 0.00077395373955369,
"learning_rate": 1.0762032085561497e-06,
"loss": 0.0041,
"reward": -0.54296875,
"reward_std": 0.3046875,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.54296875,
"step": 161
},
{
"completion_length": 391.8333435058594,
"epoch": 0.021678040947410677,
"grad_norm": 0.1249147579073906,
"kl": 0.0007619769312441349,
"learning_rate": 1.0828877005347595e-06,
"loss": -0.0054,
"reward": -0.875,
"reward_std": 0.361328125,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.875,
"step": 162
},
{
"completion_length": 630.6666870117188,
"epoch": 0.021811856014987287,
"grad_norm": 0.09878282248973846,
"kl": 0.0005383545067161322,
"learning_rate": 1.0895721925133691e-06,
"loss": 0.0016,
"reward": -1.59375,
"reward_std": 1.1171875,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -1.59375,
"step": 163
},
{
"completion_length": 238.1666717529297,
"epoch": 0.021945671082563897,
"grad_norm": 0.16415703296661377,
"kl": 0.0007143677212297916,
"learning_rate": 1.0962566844919787e-06,
"loss": 0.0052,
"reward": -0.34375,
"reward_std": 0.1376953125,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.34375,
"step": 164
},
{
"completion_length": 424.16668701171875,
"epoch": 0.022079486150140507,
"grad_norm": 0.12024425715208054,
"kl": 0.0004885084345005453,
"learning_rate": 1.1029411764705884e-06,
"loss": -0.0011,
"reward": -1.0234375,
"reward_std": 0.287109375,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -1.0234375,
"step": 165
},
{
"completion_length": 345.0,
"epoch": 0.022213301217717116,
"grad_norm": 0.13134251534938812,
"kl": 0.0005485338624566793,
"learning_rate": 1.109625668449198e-06,
"loss": 0.0041,
"reward": -0.640625,
"reward_std": 0.296875,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.640625,
"step": 166
},
{
"completion_length": 449.0,
"epoch": 0.022347116285293723,
"grad_norm": 0.13914933800697327,
"kl": 0.0005790984723716974,
"learning_rate": 1.1163101604278076e-06,
"loss": -0.0024,
"reward": -0.90625,
"reward_std": 0.43359375,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.90625,
"step": 167
},
{
"completion_length": 456.66668701171875,
"epoch": 0.022480931352870333,
"grad_norm": 0.11662891507148743,
"kl": 0.000677458185236901,
"learning_rate": 1.1229946524064172e-06,
"loss": -0.0081,
"reward": -1.21875,
"reward_std": 0.431640625,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -1.21875,
"step": 168
},
{
"completion_length": 338.66668701171875,
"epoch": 0.022614746420446943,
"grad_norm": 0.14155802130699158,
"kl": 0.0005925593432039022,
"learning_rate": 1.1296791443850268e-06,
"loss": 0.0,
"reward": -0.71484375,
"reward_std": 0.40625,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.71484375,
"step": 169
},
{
"completion_length": 265.8333435058594,
"epoch": 0.022748561488023553,
"grad_norm": 0.16593119502067566,
"kl": 0.0005104307783767581,
"learning_rate": 1.1363636363636364e-06,
"loss": 0.0003,
"reward": -0.48046875,
"reward_std": 0.2353515625,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.48046875,
"step": 170
},
{
"completion_length": 461.3333435058594,
"epoch": 0.02288237655560016,
"grad_norm": 0.1283525973558426,
"kl": 0.0006034953985363245,
"learning_rate": 1.143048128342246e-06,
"loss": -0.0006,
"reward": -0.91015625,
"reward_std": 0.55859375,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.91015625,
"step": 171
},
{
"completion_length": 567.3333740234375,
"epoch": 0.02301619162317677,
"grad_norm": 0.09264618158340454,
"kl": 0.00039277609903365374,
"learning_rate": 1.1497326203208558e-06,
"loss": 0.0011,
"reward": -1.296875,
"reward_std": 0.423828125,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -1.296875,
"step": 172
},
{
"completion_length": 442.66668701171875,
"epoch": 0.02315000669075338,
"grad_norm": 0.06924661993980408,
"kl": 0.0002805929980240762,
"learning_rate": 1.1564171122994654e-06,
"loss": 0.0049,
"reward": -1.140625,
"reward_std": 0.130859375,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -1.140625,
"step": 173
},
{
"completion_length": 321.5,
"epoch": 0.02328382175832999,
"grad_norm": 0.15098147094249725,
"kl": 0.0005829234141856432,
"learning_rate": 1.163101604278075e-06,
"loss": 0.0117,
"reward": -0.58203125,
"reward_std": 0.1455078125,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.58203125,
"step": 174
},
{
"completion_length": 416.3333435058594,
"epoch": 0.0234176368259066,
"grad_norm": 0.11847102642059326,
"kl": 0.0006443657330237329,
"learning_rate": 1.1697860962566846e-06,
"loss": -0.0044,
"reward": -0.8671875,
"reward_std": 0.32421875,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.8671875,
"step": 175
},
{
"completion_length": 371.3333435058594,
"epoch": 0.023551451893483205,
"grad_norm": 0.1091599240899086,
"kl": 0.0004576949286274612,
"learning_rate": 1.1764705882352942e-06,
"loss": -0.0054,
"reward": -0.73828125,
"reward_std": 0.296875,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.73828125,
"step": 176
},
{
"completion_length": 438.16668701171875,
"epoch": 0.023685266961059815,
"grad_norm": 0.10089421272277832,
"kl": 0.0004992609028704464,
"learning_rate": 1.1831550802139038e-06,
"loss": 0.0029,
"reward": -0.55859375,
"reward_std": 0.482421875,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.55859375,
"step": 177
},
{
"completion_length": 509.16668701171875,
"epoch": 0.023819082028636425,
"grad_norm": 0.10792536288499832,
"kl": 0.000662465114146471,
"learning_rate": 1.1898395721925134e-06,
"loss": -0.0029,
"reward": -1.21875,
"reward_std": 0.294921875,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -1.21875,
"step": 178
},
{
"completion_length": 347.5,
"epoch": 0.023952897096213035,
"grad_norm": 0.2220248132944107,
"kl": 0.0006923056207597256,
"learning_rate": 1.1965240641711233e-06,
"loss": -0.0063,
"reward": -0.78125,
"reward_std": 0.29296875,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.78125,
"step": 179
},
{
"completion_length": 470.16668701171875,
"epoch": 0.02408671216378964,
"grad_norm": 0.09262672066688538,
"kl": 0.00041312514804303646,
"learning_rate": 1.2032085561497326e-06,
"loss": -0.0037,
"reward": -1.2109375,
"reward_std": 0.458984375,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -1.2109375,
"step": 180
},
{
"completion_length": 484.0,
"epoch": 0.02422052723136625,
"grad_norm": 0.11066435277462006,
"kl": 0.0005693985149264336,
"learning_rate": 1.2098930481283423e-06,
"loss": -0.0112,
"reward": -1.109375,
"reward_std": 0.326171875,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -1.109375,
"step": 181
},
{
"completion_length": 317.8333435058594,
"epoch": 0.02435434229894286,
"grad_norm": 0.1172327920794487,
"kl": 0.0005950028426013887,
"learning_rate": 1.216577540106952e-06,
"loss": -0.0006,
"reward": -0.6328125,
"reward_std": 0.34765625,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.6328125,
"step": 182
},
{
"completion_length": 339.3333435058594,
"epoch": 0.02448815736651947,
"grad_norm": 0.10278620570898056,
"kl": 0.00045496373786590993,
"learning_rate": 1.2232620320855617e-06,
"loss": -0.0003,
"reward": -0.671875,
"reward_std": 0.4140625,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.671875,
"step": 183
},
{
"completion_length": 497.3333435058594,
"epoch": 0.02462197243409608,
"grad_norm": 0.1089860200881958,
"kl": 0.0006303495611064136,
"learning_rate": 1.2299465240641713e-06,
"loss": -0.0019,
"reward": -1.359375,
"reward_std": 1.3984375,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -1.359375,
"step": 184
},
{
"completion_length": 304.66668701171875,
"epoch": 0.024755787501672687,
"grad_norm": 0.14699524641036987,
"kl": 0.0006107437657192349,
"learning_rate": 1.2366310160427809e-06,
"loss": -0.0026,
"reward": -0.51953125,
"reward_std": 0.3359375,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.51953125,
"step": 185
},
{
"completion_length": 360.5,
"epoch": 0.024889602569249297,
"grad_norm": 0.1235690489411354,
"kl": 0.000642502389382571,
"learning_rate": 1.2433155080213905e-06,
"loss": 0.0,
"reward": -0.890625,
"reward_std": 0.1982421875,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.890625,
"step": 186
},
{
"completion_length": 405.8333435058594,
"epoch": 0.025023417636825907,
"grad_norm": 0.13261531293392181,
"kl": 0.0007065389072522521,
"learning_rate": 1.25e-06,
"loss": 0.0017,
"reward": -0.828125,
"reward_std": 0.427734375,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.828125,
"step": 187
},
{
"completion_length": 355.8333435058594,
"epoch": 0.025157232704402517,
"grad_norm": 0.11836958676576614,
"kl": 0.0005762047949247062,
"learning_rate": 1.2566844919786097e-06,
"loss": 0.001,
"reward": -0.53515625,
"reward_std": 0.3046875,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.53515625,
"step": 188
},
{
"completion_length": 295.0,
"epoch": 0.025291047771979124,
"grad_norm": 0.15814770758152008,
"kl": 0.000565587542951107,
"learning_rate": 1.2633689839572193e-06,
"loss": 0.0071,
"reward": -0.53515625,
"reward_std": 0.17578125,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.53515625,
"step": 189
},
{
"completion_length": 317.8333435058594,
"epoch": 0.025424862839555733,
"grad_norm": 0.2327018529176712,
"kl": 0.0006943491753190756,
"learning_rate": 1.2700534759358291e-06,
"loss": -0.0019,
"reward": -0.640625,
"reward_std": 0.375,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.640625,
"step": 190
},
{
"completion_length": 345.66668701171875,
"epoch": 0.025558677907132343,
"grad_norm": 0.15127608180046082,
"kl": 0.0005449084565043449,
"learning_rate": 1.2767379679144387e-06,
"loss": 0.0039,
"reward": -0.6875,
"reward_std": 0.25,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.6875,
"step": 191
},
{
"completion_length": 372.0,
"epoch": 0.025692492974708953,
"grad_norm": 0.1675024777650833,
"kl": 0.0006789276376366615,
"learning_rate": 1.2834224598930483e-06,
"loss": 0.0001,
"reward": -0.9453125,
"reward_std": 0.486328125,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.9453125,
"step": 192
},
{
"completion_length": 260.8333435058594,
"epoch": 0.02582630804228556,
"grad_norm": 0.17227157950401306,
"kl": 0.0005113824736326933,
"learning_rate": 1.2901069518716577e-06,
"loss": -0.0011,
"reward": -0.41796875,
"reward_std": 0.443359375,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.41796875,
"step": 193
},
{
"completion_length": 458.8333435058594,
"epoch": 0.02596012310986217,
"grad_norm": 0.13124048709869385,
"kl": 0.000769376871176064,
"learning_rate": 1.2967914438502673e-06,
"loss": 0.0117,
"reward": -1.265625,
"reward_std": 0.33984375,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -1.265625,
"step": 194
},
{
"completion_length": 482.8333435058594,
"epoch": 0.02609393817743878,
"grad_norm": 0.11438746005296707,
"kl": 0.0005745739908888936,
"learning_rate": 1.303475935828877e-06,
"loss": 0.0003,
"reward": -1.2421875,
"reward_std": 0.625,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -1.2421875,
"step": 195
},
{
"completion_length": 337.3333435058594,
"epoch": 0.02622775324501539,
"grad_norm": 0.12187661230564117,
"kl": 0.0005417331121861935,
"learning_rate": 1.3101604278074868e-06,
"loss": 0.0001,
"reward": -0.63671875,
"reward_std": 0.58984375,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.63671875,
"step": 196
},
{
"completion_length": 304.8333435058594,
"epoch": 0.026361568312592,
"grad_norm": 0.18323078751564026,
"kl": 0.001144442823715508,
"learning_rate": 1.3168449197860964e-06,
"loss": 0.0054,
"reward": -0.66015625,
"reward_std": 0.2099609375,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.66015625,
"step": 197
},
{
"completion_length": 367.3333435058594,
"epoch": 0.026495383380168606,
"grad_norm": 0.13606765866279602,
"kl": 0.0006207119440659881,
"learning_rate": 1.323529411764706e-06,
"loss": 0.0093,
"reward": -0.8359375,
"reward_std": 0.2265625,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.8359375,
"step": 198
},
{
"completion_length": 367.8333435058594,
"epoch": 0.026629198447745216,
"grad_norm": 0.13173972070217133,
"kl": 0.0006991230184212327,
"learning_rate": 1.3302139037433156e-06,
"loss": -0.0032,
"reward": -0.66015625,
"reward_std": 0.482421875,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.66015625,
"step": 199
},
{
"completion_length": 516.8333740234375,
"epoch": 0.026763013515321826,
"grad_norm": 0.113725446164608,
"kl": 0.0006071855314075947,
"learning_rate": 1.3368983957219254e-06,
"loss": 0.0021,
"reward": -1.3046875,
"reward_std": 0.53125,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -1.3046875,
"step": 200
},
{
"completion_length": 505.0,
"epoch": 0.026896828582898435,
"grad_norm": 0.11239483207464218,
"kl": 0.0006069260416552424,
"learning_rate": 1.343582887700535e-06,
"loss": 0.0026,
"reward": -1.21875,
"reward_std": 0.66796875,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -1.21875,
"step": 201
},
{
"completion_length": 398.0,
"epoch": 0.027030643650475042,
"grad_norm": 0.11724357306957245,
"kl": 0.0004885403905063868,
"learning_rate": 1.3502673796791446e-06,
"loss": -0.0039,
"reward": -0.875,
"reward_std": 0.4609375,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.875,
"step": 202
},
{
"completion_length": 549.0,
"epoch": 0.027164458718051652,
"grad_norm": 0.1066315770149231,
"kl": 0.0005430117598734796,
"learning_rate": 1.356951871657754e-06,
"loss": -0.0029,
"reward": -1.171875,
"reward_std": 0.48828125,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -1.171875,
"step": 203
},
{
"completion_length": 565.5,
"epoch": 0.02729827378562826,
"grad_norm": 0.11152154952287674,
"kl": 0.0006415534298866987,
"learning_rate": 1.3636363636363636e-06,
"loss": 0.0046,
"reward": -1.453125,
"reward_std": 0.6796875,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -1.453125,
"step": 204
},
{
"completion_length": 322.16668701171875,
"epoch": 0.02743208885320487,
"grad_norm": 0.14083048701286316,
"kl": 0.0005029004532843828,
"learning_rate": 1.3703208556149732e-06,
"loss": -0.0036,
"reward": -0.48046875,
"reward_std": 0.2490234375,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.48046875,
"step": 205
},
{
"completion_length": 234.1666717529297,
"epoch": 0.02756590392078148,
"grad_norm": 0.18288779258728027,
"kl": 0.0005245095817372203,
"learning_rate": 1.377005347593583e-06,
"loss": -0.0001,
"reward": -0.287109375,
"reward_std": 0.330078125,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.287109375,
"step": 206
},
{
"completion_length": 414.3333435058594,
"epoch": 0.027699718988358088,
"grad_norm": 0.11854821443557739,
"kl": 0.0006570084951817989,
"learning_rate": 1.3836898395721926e-06,
"loss": 0.0049,
"reward": -0.9140625,
"reward_std": 0.376953125,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.9140625,
"step": 207
},
{
"completion_length": 414.16668701171875,
"epoch": 0.027833534055934698,
"grad_norm": 0.13677829504013062,
"kl": 0.000780851929448545,
"learning_rate": 1.3903743315508022e-06,
"loss": -0.005,
"reward": -0.9296875,
"reward_std": 0.35546875,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.9296875,
"step": 208
},
{
"completion_length": 357.16668701171875,
"epoch": 0.027967349123511308,
"grad_norm": 0.10833487659692764,
"kl": 0.0004263838636688888,
"learning_rate": 1.3970588235294119e-06,
"loss": -0.001,
"reward": -0.44921875,
"reward_std": 0.44921875,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.44921875,
"step": 209
},
{
"completion_length": 277.3333435058594,
"epoch": 0.028101164191087918,
"grad_norm": 0.18873073160648346,
"kl": 0.0007301772711798549,
"learning_rate": 1.4037433155080215e-06,
"loss": -0.0014,
"reward": -0.5703125,
"reward_std": 0.392578125,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.5703125,
"step": 210
},
{
"completion_length": 422.16668701171875,
"epoch": 0.028234979258664524,
"grad_norm": 0.140394926071167,
"kl": 0.0007065697573125362,
"learning_rate": 1.4104278074866313e-06,
"loss": 0.0013,
"reward": -1.2109375,
"reward_std": 0.88671875,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -1.2109375,
"step": 211
},
{
"completion_length": 352.16668701171875,
"epoch": 0.028368794326241134,
"grad_norm": 0.12692704796791077,
"kl": 0.0005341452197171748,
"learning_rate": 1.4171122994652409e-06,
"loss": 0.0041,
"reward": -0.55078125,
"reward_std": 0.515625,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.55078125,
"step": 212
},
{
"completion_length": 412.8333435058594,
"epoch": 0.028502609393817744,
"grad_norm": 0.1450049728155136,
"kl": 0.0005619659787043929,
"learning_rate": 1.4237967914438503e-06,
"loss": -0.0049,
"reward": -0.828125,
"reward_std": 0.400390625,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.828125,
"step": 213
},
{
"completion_length": 385.0,
"epoch": 0.028636424461394354,
"grad_norm": 0.11151020228862762,
"kl": 0.000545224582310766,
"learning_rate": 1.4304812834224599e-06,
"loss": -0.0051,
"reward": -0.8671875,
"reward_std": 0.236328125,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.8671875,
"step": 214
},
{
"completion_length": 263.8333435058594,
"epoch": 0.028770239528970964,
"grad_norm": 0.13172751665115356,
"kl": 0.000556406972464174,
"learning_rate": 1.4371657754010695e-06,
"loss": -0.0003,
"reward": -0.419921875,
"reward_std": 0.36328125,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.419921875,
"step": 215
},
{
"completion_length": 312.3333435058594,
"epoch": 0.02890405459654757,
"grad_norm": 0.14570048451423645,
"kl": 0.000702905235812068,
"learning_rate": 1.4438502673796793e-06,
"loss": 0.0041,
"reward": -0.63671875,
"reward_std": 0.306640625,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.63671875,
"step": 216
},
{
"completion_length": 342.8333435058594,
"epoch": 0.02903786966412418,
"grad_norm": 0.13387592136859894,
"kl": 0.0006144473445601761,
"learning_rate": 1.450534759358289e-06,
"loss": -0.0003,
"reward": -0.75,
"reward_std": 0.271484375,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.75,
"step": 217
},
{
"completion_length": 373.3333435058594,
"epoch": 0.02917168473170079,
"grad_norm": 0.14034722745418549,
"kl": 0.0006035550031810999,
"learning_rate": 1.4572192513368985e-06,
"loss": -0.0052,
"reward": -0.6796875,
"reward_std": 0.35546875,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.6796875,
"step": 218
},
{
"completion_length": 370.5,
"epoch": 0.0293054997992774,
"grad_norm": 0.17203155159950256,
"kl": 0.000694015237968415,
"learning_rate": 1.4639037433155081e-06,
"loss": -0.0037,
"reward": -0.88671875,
"reward_std": 0.291015625,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.88671875,
"step": 219
},
{
"completion_length": 573.0,
"epoch": 0.029439314866854006,
"grad_norm": 0.08722779154777527,
"kl": 0.0005780010833404958,
"learning_rate": 1.4705882352941177e-06,
"loss": -0.0014,
"reward": -1.34375,
"reward_std": 0.76953125,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -1.34375,
"step": 220
},
{
"completion_length": 342.66668701171875,
"epoch": 0.029573129934430616,
"grad_norm": 0.15573082864284515,
"kl": 0.0006010913057252765,
"learning_rate": 1.4772727272727275e-06,
"loss": 0.0088,
"reward": -0.8125,
"reward_std": 0.265625,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.8125,
"step": 221
},
{
"completion_length": 475.16668701171875,
"epoch": 0.029706945002007226,
"grad_norm": 0.10412527620792389,
"kl": 0.000707049563061446,
"learning_rate": 1.4839572192513372e-06,
"loss": 0.0038,
"reward": -1.328125,
"reward_std": 0.40625,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -1.328125,
"step": 222
},
{
"completion_length": 379.8333435058594,
"epoch": 0.029840760069583836,
"grad_norm": 0.1168096736073494,
"kl": 0.0006801115232519805,
"learning_rate": 1.4906417112299468e-06,
"loss": 0.0033,
"reward": -0.80859375,
"reward_std": 0.49609375,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.80859375,
"step": 223
},
{
"completion_length": 217.6666717529297,
"epoch": 0.029974575137160443,
"grad_norm": 0.17121706902980804,
"kl": 0.0008745932718738914,
"learning_rate": 1.4973262032085562e-06,
"loss": -0.0019,
"reward": -0.2578125,
"reward_std": 0.35546875,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.2578125,
"step": 224
},
{
"completion_length": 233.83334350585938,
"epoch": 0.030108390204737052,
"grad_norm": 0.18835073709487915,
"kl": 0.0008036958752200007,
"learning_rate": 1.5040106951871658e-06,
"loss": -0.0013,
"reward": -0.396484375,
"reward_std": 0.361328125,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.396484375,
"step": 225
},
{
"completion_length": 301.3333435058594,
"epoch": 0.030242205272313662,
"grad_norm": 0.18656164407730103,
"kl": 0.0008596427505835891,
"learning_rate": 1.5106951871657754e-06,
"loss": -0.0006,
"reward": -0.5625,
"reward_std": 0.259765625,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.5625,
"step": 226
},
{
"completion_length": 398.3333435058594,
"epoch": 0.030376020339890272,
"grad_norm": 0.14680197834968567,
"kl": 0.0007448707474395633,
"learning_rate": 1.5173796791443852e-06,
"loss": 0.0047,
"reward": -0.85546875,
"reward_std": 0.361328125,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.85546875,
"step": 227
},
{
"completion_length": 388.8333435058594,
"epoch": 0.030509835407466882,
"grad_norm": 0.1266445517539978,
"kl": 0.0006551437545567751,
"learning_rate": 1.5240641711229948e-06,
"loss": -0.0011,
"reward": -0.73046875,
"reward_std": 0.220703125,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.73046875,
"step": 228
},
{
"completion_length": 371.8333435058594,
"epoch": 0.03064365047504349,
"grad_norm": 0.18683753907680511,
"kl": 0.0006818679976277053,
"learning_rate": 1.5307486631016044e-06,
"loss": -0.014,
"reward": -0.875,
"reward_std": 0.1455078125,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.875,
"step": 229
},
{
"completion_length": 444.5,
"epoch": 0.0307774655426201,
"grad_norm": 0.1100480780005455,
"kl": 0.00042218127055093646,
"learning_rate": 1.537433155080214e-06,
"loss": 0.0029,
"reward": -1.0390625,
"reward_std": 0.2353515625,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -1.0390625,
"step": 230
},
{
"completion_length": 485.8333435058594,
"epoch": 0.03091128061019671,
"grad_norm": 0.12457609176635742,
"kl": 0.0006653472664766014,
"learning_rate": 1.5441176470588238e-06,
"loss": 0.0025,
"reward": -1.375,
"reward_std": 0.99609375,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -1.375,
"step": 231
},
{
"completion_length": 522.5,
"epoch": 0.03104509567777332,
"grad_norm": 0.09300174564123154,
"kl": 0.0004242811701260507,
"learning_rate": 1.5508021390374334e-06,
"loss": -0.0009,
"reward": -0.890625,
"reward_std": 0.392578125,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.890625,
"step": 232
},
{
"completion_length": 325.3333435058594,
"epoch": 0.031178910745349925,
"grad_norm": 0.17127934098243713,
"kl": 0.0009391449275426567,
"learning_rate": 1.557486631016043e-06,
"loss": 0.0012,
"reward": -0.70703125,
"reward_std": 0.1376953125,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.70703125,
"step": 233
},
{
"completion_length": 309.5,
"epoch": 0.03131272581292654,
"grad_norm": 0.11277639120817184,
"kl": 0.0005117826513014734,
"learning_rate": 1.5641711229946524e-06,
"loss": -0.0065,
"reward": -0.56640625,
"reward_std": 0.1513671875,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.56640625,
"step": 234
},
{
"completion_length": 359.66668701171875,
"epoch": 0.031446540880503145,
"grad_norm": 0.14879803359508514,
"kl": 0.0008924457943066955,
"learning_rate": 1.570855614973262e-06,
"loss": -0.0051,
"reward": -0.9140625,
"reward_std": 0.44921875,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.9140625,
"step": 235
},
{
"completion_length": 402.3333435058594,
"epoch": 0.03158035594807975,
"grad_norm": 0.10776454210281372,
"kl": 0.0005682529299519956,
"learning_rate": 1.5775401069518716e-06,
"loss": -0.0023,
"reward": -0.74609375,
"reward_std": 0.314453125,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.74609375,
"step": 236
},
{
"completion_length": 478.16668701171875,
"epoch": 0.031714171015656364,
"grad_norm": 0.10001283884048462,
"kl": 0.0004951292648911476,
"learning_rate": 1.5842245989304815e-06,
"loss": -0.0045,
"reward": -1.0859375,
"reward_std": 0.458984375,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -1.0859375,
"step": 237
},
{
"completion_length": 356.3333435058594,
"epoch": 0.03184798608323297,
"grad_norm": 0.18665075302124023,
"kl": 0.0009052582900039852,
"learning_rate": 1.590909090909091e-06,
"loss": 0.0032,
"reward": -0.8359375,
"reward_std": 0.369140625,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.8359375,
"step": 238
},
{
"completion_length": 570.0,
"epoch": 0.031981801150809584,
"grad_norm": 0.12214533984661102,
"kl": 0.0008592414669692516,
"learning_rate": 1.5975935828877007e-06,
"loss": -0.0031,
"reward": -1.78125,
"reward_std": 0.4921875,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -1.78125,
"step": 239
},
{
"completion_length": 619.8333740234375,
"epoch": 0.03211561621838619,
"grad_norm": 0.1160222515463829,
"kl": 0.0008299415349029005,
"learning_rate": 1.6042780748663103e-06,
"loss": 0.0026,
"reward": -1.96875,
"reward_std": 0.66796875,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -1.96875,
"step": 240
},
{
"completion_length": 533.1666870117188,
"epoch": 0.0322494312859628,
"grad_norm": 0.08083510398864746,
"kl": 0.0004741963930428028,
"learning_rate": 1.6109625668449199e-06,
"loss": -0.0167,
"reward": -1.109375,
"reward_std": 0.23828125,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -1.109375,
"step": 241
},
{
"completion_length": 375.3333435058594,
"epoch": 0.03238324635353941,
"grad_norm": 0.15658938884735107,
"kl": 0.0007425328949466348,
"learning_rate": 1.6176470588235297e-06,
"loss": -0.0074,
"reward": -0.9140625,
"reward_std": 0.2412109375,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.9140625,
"step": 242
},
{
"completion_length": 456.66668701171875,
"epoch": 0.03251706142111602,
"grad_norm": 0.14069658517837524,
"kl": 0.0008559181587770581,
"learning_rate": 1.6243315508021393e-06,
"loss": 0.0035,
"reward": -1.3515625,
"reward_std": 0.390625,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -1.3515625,
"step": 243
},
{
"completion_length": 294.16668701171875,
"epoch": 0.03265087648869262,
"grad_norm": 0.17595937848091125,
"kl": 0.0008926563896238804,
"learning_rate": 1.631016042780749e-06,
"loss": -0.0018,
"reward": -0.671875,
"reward_std": 0.4140625,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.671875,
"step": 244
},
{
"completion_length": 489.3333435058594,
"epoch": 0.03278469155626924,
"grad_norm": 0.1979399472475052,
"kl": 0.001042112591676414,
"learning_rate": 1.6377005347593583e-06,
"loss": -0.0008,
"reward": -1.359375,
"reward_std": 0.57421875,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -1.359375,
"step": 245
},
{
"completion_length": 416.8333435058594,
"epoch": 0.03291850662384584,
"grad_norm": 0.12472230195999146,
"kl": 0.0007299688877537847,
"learning_rate": 1.644385026737968e-06,
"loss": -0.0026,
"reward": -0.75,
"reward_std": 0.68359375,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.75,
"step": 246
},
{
"completion_length": 333.0,
"epoch": 0.03305232169142246,
"grad_norm": 0.17285002768039703,
"kl": 0.0012012843508273363,
"learning_rate": 1.6510695187165775e-06,
"loss": -0.0048,
"reward": -0.640625,
"reward_std": 0.28515625,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.640625,
"step": 247
},
{
"completion_length": 333.0,
"epoch": 0.03318613675899906,
"grad_norm": 0.12894539535045624,
"kl": 0.0008531633648090065,
"learning_rate": 1.6577540106951873e-06,
"loss": 0.0006,
"reward": -0.515625,
"reward_std": 0.59765625,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.515625,
"step": 248
},
{
"completion_length": 403.66668701171875,
"epoch": 0.03331995182657567,
"grad_norm": 0.17463049292564392,
"kl": 0.0010805390775203705,
"learning_rate": 1.664438502673797e-06,
"loss": 0.0003,
"reward": -0.61328125,
"reward_std": 0.7109375,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.61328125,
"step": 249
},
{
"completion_length": 513.5,
"epoch": 0.03345376689415228,
"grad_norm": 0.12753607332706451,
"kl": 0.0006255035405047238,
"learning_rate": 1.6711229946524065e-06,
"loss": 0.002,
"reward": -1.296875,
"reward_std": 0.6640625,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -1.296875,
"step": 250
},
{
"completion_length": 338.5,
"epoch": 0.03358758196172889,
"grad_norm": 0.1846907138824463,
"kl": 0.0011831402080133557,
"learning_rate": 1.6778074866310161e-06,
"loss": 0.0007,
"reward": -0.7578125,
"reward_std": 0.33984375,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.7578125,
"step": 251
},
{
"completion_length": 260.16668701171875,
"epoch": 0.0337213970293055,
"grad_norm": 0.17691083252429962,
"kl": 0.0010596700012683868,
"learning_rate": 1.684491978609626e-06,
"loss": -0.0024,
"reward": -0.51953125,
"reward_std": 0.287109375,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.51953125,
"step": 252
},
{
"completion_length": 429.66668701171875,
"epoch": 0.03385521209688211,
"grad_norm": 0.1275603324174881,
"kl": 0.0007852836861275136,
"learning_rate": 1.6911764705882356e-06,
"loss": 0.0027,
"reward": -1.015625,
"reward_std": 0.267578125,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -1.015625,
"step": 253
},
{
"completion_length": 318.66668701171875,
"epoch": 0.033989027164458716,
"grad_norm": 0.14481216669082642,
"kl": 0.0007643938879482448,
"learning_rate": 1.6978609625668452e-06,
"loss": -0.0063,
"reward": -0.58203125,
"reward_std": 0.2431640625,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.58203125,
"step": 254
},
{
"completion_length": 398.66668701171875,
"epoch": 0.03412284223203533,
"grad_norm": 0.23024463653564453,
"kl": 0.0010699962731450796,
"learning_rate": 1.7045454545454546e-06,
"loss": -0.0022,
"reward": -0.90234375,
"reward_std": 0.61328125,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.90234375,
"step": 255
},
{
"completion_length": 448.16668701171875,
"epoch": 0.034256657299611935,
"grad_norm": 0.11663252115249634,
"kl": 0.0009288216824643314,
"learning_rate": 1.7112299465240642e-06,
"loss": 0.0009,
"reward": -1.0546875,
"reward_std": 0.7734375,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -1.0546875,
"step": 256
},
{
"completion_length": 309.8333435058594,
"epoch": 0.03439047236718855,
"grad_norm": 0.16083866357803345,
"kl": 0.000840982305817306,
"learning_rate": 1.7179144385026738e-06,
"loss": -0.003,
"reward": -0.6015625,
"reward_std": 0.306640625,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.6015625,
"step": 257
},
{
"completion_length": 390.3333435058594,
"epoch": 0.034524287434765155,
"grad_norm": 0.1549844890832901,
"kl": 0.0006463018362410367,
"learning_rate": 1.7245989304812836e-06,
"loss": -0.0032,
"reward": -0.76953125,
"reward_std": 0.291015625,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.76953125,
"step": 258
},
{
"completion_length": 330.8333435058594,
"epoch": 0.03465810250234176,
"grad_norm": 0.16839320957660675,
"kl": 0.001401308341883123,
"learning_rate": 1.7312834224598932e-06,
"loss": 0.0101,
"reward": -0.765625,
"reward_std": 0.2158203125,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.765625,
"step": 259
},
{
"completion_length": 459.8333435058594,
"epoch": 0.034791917569918375,
"grad_norm": 0.10837510973215103,
"kl": 0.0012012351071462035,
"learning_rate": 1.7379679144385028e-06,
"loss": 0.0023,
"reward": -0.8828125,
"reward_std": 0.271484375,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.8828125,
"step": 260
},
{
"completion_length": 508.0,
"epoch": 0.03492573263749498,
"grad_norm": 0.10151364654302597,
"kl": 0.0011522852582857013,
"learning_rate": 1.7446524064171124e-06,
"loss": 0.0002,
"reward": -1.1484375,
"reward_std": 0.8203125,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -1.1484375,
"step": 261
},
{
"completion_length": 433.5,
"epoch": 0.03505954770507159,
"grad_norm": 0.1335013061761856,
"kl": 0.0009189687552861869,
"learning_rate": 1.7513368983957222e-06,
"loss": -0.0009,
"reward": -1.0625,
"reward_std": 0.66796875,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -1.0625,
"step": 262
},
{
"completion_length": 245.33334350585938,
"epoch": 0.0351933627726482,
"grad_norm": 0.20102928578853607,
"kl": 0.0014157379046082497,
"learning_rate": 1.7580213903743318e-06,
"loss": -0.0003,
"reward": -0.40625,
"reward_std": 0.2177734375,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.40625,
"step": 263
},
{
"completion_length": 682.6666870117188,
"epoch": 0.03532717784022481,
"grad_norm": 0.0937545895576477,
"kl": 0.0005280395271256566,
"learning_rate": 1.7647058823529414e-06,
"loss": -0.0071,
"reward": -1.2578125,
"reward_std": 0.48046875,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -1.2578125,
"step": 264
},
{
"completion_length": 500.3333435058594,
"epoch": 0.03546099290780142,
"grad_norm": 0.17000702023506165,
"kl": 0.0008603067835792899,
"learning_rate": 1.7713903743315508e-06,
"loss": 0.001,
"reward": -1.375,
"reward_std": 0.57421875,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -1.375,
"step": 265
},
{
"completion_length": 366.8333435058594,
"epoch": 0.03559480797537803,
"grad_norm": 0.17021796107292175,
"kl": 0.0011717099696397781,
"learning_rate": 1.7780748663101604e-06,
"loss": -0.0037,
"reward": -0.7734375,
"reward_std": 0.1904296875,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.7734375,
"step": 266
},
{
"completion_length": 287.0,
"epoch": 0.035728623042954634,
"grad_norm": 0.21704187989234924,
"kl": 0.001519282697699964,
"learning_rate": 1.78475935828877e-06,
"loss": 0.0023,
"reward": -0.6328125,
"reward_std": 0.40625,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.6328125,
"step": 267
},
{
"completion_length": 632.0,
"epoch": 0.03586243811053125,
"grad_norm": 0.10228416323661804,
"kl": 0.0007877530297264457,
"learning_rate": 1.7914438502673799e-06,
"loss": 0.0056,
"reward": -1.59375,
"reward_std": 0.42578125,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -1.59375,
"step": 268
},
{
"completion_length": 411.66668701171875,
"epoch": 0.035996253178107854,
"grad_norm": 0.1366869956254959,
"kl": 0.0013742044102400541,
"learning_rate": 1.7981283422459895e-06,
"loss": -0.0045,
"reward": -0.859375,
"reward_std": 0.455078125,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.859375,
"step": 269
},
{
"completion_length": 268.8333435058594,
"epoch": 0.03613006824568447,
"grad_norm": 0.23755821585655212,
"kl": 0.0016824863851070404,
"learning_rate": 1.804812834224599e-06,
"loss": 0.0,
"reward": -0.451171875,
"reward_std": 0.40625,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.451171875,
"step": 270
},
{
"completion_length": 333.16668701171875,
"epoch": 0.036263883313261074,
"grad_norm": 0.1347772628068924,
"kl": 0.0012472581584006548,
"learning_rate": 1.8114973262032087e-06,
"loss": 0.0014,
"reward": -0.71484375,
"reward_std": 0.4765625,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.71484375,
"step": 271
},
{
"completion_length": 404.3333435058594,
"epoch": 0.03639769838083768,
"grad_norm": 0.16489511728286743,
"kl": 0.0020543006248772144,
"learning_rate": 1.8181818181818183e-06,
"loss": 0.0014,
"reward": -1.078125,
"reward_std": 0.703125,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -1.078125,
"step": 272
},
{
"completion_length": 299.0,
"epoch": 0.03653151344841429,
"grad_norm": 0.15176789462566376,
"kl": 0.0013240812113508582,
"learning_rate": 1.8248663101604281e-06,
"loss": 0.0064,
"reward": -0.5234375,
"reward_std": 0.294921875,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.5234375,
"step": 273
},
{
"completion_length": 388.5,
"epoch": 0.0366653285159909,
"grad_norm": 0.11341875791549683,
"kl": 0.0008515861118212342,
"learning_rate": 1.8315508021390377e-06,
"loss": 0.0005,
"reward": -0.7890625,
"reward_std": 0.337890625,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.7890625,
"step": 274
},
{
"completion_length": 244.83334350585938,
"epoch": 0.036799143583567506,
"grad_norm": 0.18841837346553802,
"kl": 0.0024837306700646877,
"learning_rate": 1.8382352941176473e-06,
"loss": 0.0001,
"reward": -0.453125,
"reward_std": 0.33203125,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.453125,
"step": 275
},
{
"completion_length": 283.8333435058594,
"epoch": 0.03693295865114412,
"grad_norm": 0.19005466997623444,
"kl": 0.0023008882999420166,
"learning_rate": 1.8449197860962567e-06,
"loss": 0.0043,
"reward": -0.435546875,
"reward_std": 0.1376953125,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.435546875,
"step": 276
},
{
"completion_length": 413.3333435058594,
"epoch": 0.037066773718720726,
"grad_norm": 0.15900400280952454,
"kl": 0.0016828961670398712,
"learning_rate": 1.8516042780748663e-06,
"loss": 0.003,
"reward": -0.890625,
"reward_std": 0.365234375,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.890625,
"step": 277
},
{
"completion_length": 330.16668701171875,
"epoch": 0.03720058878629734,
"grad_norm": 0.18195514380931854,
"kl": 0.0018070716178044677,
"learning_rate": 1.858288770053476e-06,
"loss": -0.0025,
"reward": -0.51953125,
"reward_std": 0.3359375,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.51953125,
"step": 278
},
{
"completion_length": 331.66668701171875,
"epoch": 0.037334403853873946,
"grad_norm": 0.14856845140457153,
"kl": 0.002008461859077215,
"learning_rate": 1.8649732620320857e-06,
"loss": 0.0001,
"reward": -0.72265625,
"reward_std": 0.345703125,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.72265625,
"step": 279
},
{
"completion_length": 283.3333435058594,
"epoch": 0.03746821892145055,
"grad_norm": 0.1611245721578598,
"kl": 0.001842797501012683,
"learning_rate": 1.8716577540106954e-06,
"loss": 0.0014,
"reward": -0.333984375,
"reward_std": 0.39453125,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.333984375,
"step": 280
},
{
"completion_length": 403.3333435058594,
"epoch": 0.037602033989027166,
"grad_norm": 0.13907144963741302,
"kl": 0.0020113931968808174,
"learning_rate": 1.878342245989305e-06,
"loss": -0.0006,
"reward": -0.8125,
"reward_std": 0.447265625,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.8125,
"step": 281
},
{
"completion_length": 328.8333435058594,
"epoch": 0.03773584905660377,
"grad_norm": 0.13121508061885834,
"kl": 0.001702746725641191,
"learning_rate": 1.8850267379679146e-06,
"loss": 0.0054,
"reward": -0.76171875,
"reward_std": 0.30859375,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.76171875,
"step": 282
},
{
"completion_length": 357.8333435058594,
"epoch": 0.037869664124180386,
"grad_norm": 0.12085507065057755,
"kl": 0.001665423158556223,
"learning_rate": 1.8917112299465244e-06,
"loss": 0.0046,
"reward": -0.796875,
"reward_std": 0.271484375,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.796875,
"step": 283
},
{
"completion_length": 321.0,
"epoch": 0.03800347919175699,
"grad_norm": 0.1415272206068039,
"kl": 0.002183354925364256,
"learning_rate": 1.898395721925134e-06,
"loss": 0.0033,
"reward": -0.60546875,
"reward_std": 0.310546875,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.60546875,
"step": 284
},
{
"completion_length": 326.5,
"epoch": 0.0381372942593336,
"grad_norm": 0.12236207723617554,
"kl": 0.002115039387717843,
"learning_rate": 1.9050802139037436e-06,
"loss": -0.0009,
"reward": -0.73828125,
"reward_std": 0.53125,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.73828125,
"step": 285
},
{
"completion_length": 323.66668701171875,
"epoch": 0.03827110932691021,
"grad_norm": 0.15223853290081024,
"kl": 0.002893569879233837,
"learning_rate": 1.9117647058823528e-06,
"loss": 0.0048,
"reward": -0.58203125,
"reward_std": 0.322265625,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.58203125,
"step": 286
},
{
"completion_length": 209.5,
"epoch": 0.03840492439448682,
"grad_norm": 0.23355820775032043,
"kl": 0.004409749526530504,
"learning_rate": 1.9184491978609626e-06,
"loss": -0.0005,
"reward": -0.22265625,
"reward_std": 0.11328125,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.22265625,
"step": 287
},
{
"completion_length": 373.66668701171875,
"epoch": 0.03853873946206343,
"grad_norm": 0.12933886051177979,
"kl": 0.0021153343841433525,
"learning_rate": 1.9251336898395724e-06,
"loss": 0.0001,
"reward": -0.423828125,
"reward_std": 0.27734375,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.423828125,
"step": 288
},
{
"completion_length": 466.16668701171875,
"epoch": 0.03867255452964004,
"grad_norm": 0.11623230576515198,
"kl": 0.0016589018050581217,
"learning_rate": 1.931818181818182e-06,
"loss": 0.0085,
"reward": -1.125,
"reward_std": 0.400390625,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -1.125,
"step": 289
},
{
"completion_length": 407.8333435058594,
"epoch": 0.038806369597216644,
"grad_norm": 0.1379394680261612,
"kl": 0.0021060972940176725,
"learning_rate": 1.9385026737967916e-06,
"loss": -0.0019,
"reward": -0.80859375,
"reward_std": 0.341796875,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.80859375,
"step": 290
},
{
"completion_length": 251.5,
"epoch": 0.03894018466479326,
"grad_norm": 0.18607591092586517,
"kl": 0.0036474696826189756,
"learning_rate": 1.9451871657754014e-06,
"loss": -0.0008,
"reward": -0.2890625,
"reward_std": 0.248046875,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.2890625,
"step": 291
},
{
"completion_length": 277.8333435058594,
"epoch": 0.039073999732369864,
"grad_norm": 0.14609812200069427,
"kl": 0.0029988684691488743,
"learning_rate": 1.951871657754011e-06,
"loss": -0.001,
"reward": -0.427734375,
"reward_std": 0.388671875,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.427734375,
"step": 292
},
{
"completion_length": 399.16668701171875,
"epoch": 0.03920781479994647,
"grad_norm": 0.13495796918869019,
"kl": 0.002409814391285181,
"learning_rate": 1.9585561497326206e-06,
"loss": 0.0016,
"reward": -0.8046875,
"reward_std": 0.60546875,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.8046875,
"step": 293
},
{
"completion_length": 410.5,
"epoch": 0.039341629867523084,
"grad_norm": 0.12157430499792099,
"kl": 0.002072525443509221,
"learning_rate": 1.96524064171123e-06,
"loss": -0.0015,
"reward": -0.81640625,
"reward_std": 0.6015625,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.81640625,
"step": 294
},
{
"completion_length": 263.8333435058594,
"epoch": 0.03947544493509969,
"grad_norm": 0.16216787695884705,
"kl": 0.0037962980568408966,
"learning_rate": 1.97192513368984e-06,
"loss": 0.0005,
"reward": -0.390625,
"reward_std": 0.091796875,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.390625,
"step": 295
},
{
"completion_length": 284.3333435058594,
"epoch": 0.039609260002676304,
"grad_norm": 0.19671247899532318,
"kl": 0.0041143884882330894,
"learning_rate": 1.9786096256684497e-06,
"loss": 0.001,
"reward": -0.5859375,
"reward_std": 0.3671875,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.5859375,
"step": 296
},
{
"completion_length": 280.8333435058594,
"epoch": 0.03974307507025291,
"grad_norm": 0.15721142292022705,
"kl": 0.002454590518027544,
"learning_rate": 1.985294117647059e-06,
"loss": -0.003,
"reward": -0.328125,
"reward_std": 0.2412109375,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.328125,
"step": 297
},
{
"completion_length": 285.5,
"epoch": 0.03987689013782952,
"grad_norm": 0.19153378903865814,
"kl": 0.0035088087897747755,
"learning_rate": 1.9919786096256685e-06,
"loss": -0.0062,
"reward": -0.5859375,
"reward_std": 0.228515625,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.5859375,
"step": 298
},
{
"completion_length": 206.33334350585938,
"epoch": 0.04001070520540613,
"grad_norm": 0.2719455659389496,
"kl": 0.0049241166561841965,
"learning_rate": 1.9986631016042783e-06,
"loss": 0.0018,
"reward": -0.322265625,
"reward_std": 0.171875,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.322265625,
"step": 299
},
{
"completion_length": 363.0,
"epoch": 0.04014452027298274,
"grad_norm": 0.1396636813879013,
"kl": 0.004824022762477398,
"learning_rate": 2.0053475935828877e-06,
"loss": -0.0037,
"reward": -0.546875,
"reward_std": 0.337890625,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.546875,
"step": 300
},
{
"completion_length": 357.8333435058594,
"epoch": 0.04027833534055935,
"grad_norm": 0.12899059057235718,
"kl": 0.0038679109420627356,
"learning_rate": 2.0120320855614975e-06,
"loss": -0.0,
"reward": -0.73828125,
"reward_std": 0.12060546875,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.73828125,
"step": 301
},
{
"completion_length": 399.0,
"epoch": 0.040412150408135956,
"grad_norm": 0.14958561956882477,
"kl": 0.003215777687728405,
"learning_rate": 2.0187165775401073e-06,
"loss": 0.0031,
"reward": -0.90234375,
"reward_std": 0.41015625,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.90234375,
"step": 302
},
{
"completion_length": 269.5,
"epoch": 0.04054596547571256,
"grad_norm": 0.21223297715187073,
"kl": 0.003930443432182074,
"learning_rate": 2.0254010695187167e-06,
"loss": -0.0004,
"reward": -0.126953125,
"reward_std": 0.365234375,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.126953125,
"step": 303
},
{
"completion_length": 324.5,
"epoch": 0.040679780543289176,
"grad_norm": 0.1299857646226883,
"kl": 0.002138474490493536,
"learning_rate": 2.0320855614973265e-06,
"loss": -0.0062,
"reward": -0.79296875,
"reward_std": 0.212890625,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.79296875,
"step": 304
},
{
"completion_length": 259.0,
"epoch": 0.04081359561086578,
"grad_norm": 0.1926686018705368,
"kl": 0.004935073666274548,
"learning_rate": 2.038770053475936e-06,
"loss": 0.0015,
"reward": -0.400390625,
"reward_std": 0.30859375,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.400390625,
"step": 305
},
{
"completion_length": 344.16668701171875,
"epoch": 0.040947410678442396,
"grad_norm": 0.1324584186077118,
"kl": 0.0024872669018805027,
"learning_rate": 2.0454545454545457e-06,
"loss": -0.0035,
"reward": -0.515625,
"reward_std": 0.310546875,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.515625,
"step": 306
},
{
"completion_length": 394.5,
"epoch": 0.041081225746019,
"grad_norm": 0.13476984202861786,
"kl": 0.0031944592483341694,
"learning_rate": 2.052139037433155e-06,
"loss": 0.0001,
"reward": -0.9921875,
"reward_std": 0.201171875,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.9921875,
"step": 307
},
{
"completion_length": 340.8333435058594,
"epoch": 0.04121504081359561,
"grad_norm": 0.13522304594516754,
"kl": 0.007316060364246368,
"learning_rate": 2.058823529411765e-06,
"loss": -0.001,
"reward": -0.6953125,
"reward_std": 0.63671875,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.6953125,
"step": 308
},
{
"completion_length": 197.1666717529297,
"epoch": 0.04134885588117222,
"grad_norm": 0.18704450130462646,
"kl": 0.004623084794729948,
"learning_rate": 2.0655080213903743e-06,
"loss": 0.0012,
"reward": -0.201171875,
"reward_std": 0.2294921875,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.201171875,
"step": 309
},
{
"completion_length": 256.5,
"epoch": 0.04148267094874883,
"grad_norm": 0.20771555602550507,
"kl": 0.006930126808583736,
"learning_rate": 2.072192513368984e-06,
"loss": 0.0003,
"reward": -0.1953125,
"reward_std": 0.21875,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.1953125,
"step": 310
},
{
"completion_length": 316.66668701171875,
"epoch": 0.041616486016325435,
"grad_norm": 0.15902665257453918,
"kl": 0.004400103818625212,
"learning_rate": 2.0788770053475936e-06,
"loss": -0.0005,
"reward": -0.63671875,
"reward_std": 0.470703125,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.63671875,
"step": 311
},
{
"completion_length": 319.8333435058594,
"epoch": 0.04175030108390205,
"grad_norm": 0.16784004867076874,
"kl": 0.003954706247895956,
"learning_rate": 2.0855614973262034e-06,
"loss": 0.0016,
"reward": -0.6953125,
"reward_std": 0.2431640625,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.6953125,
"step": 312
},
{
"completion_length": 307.8333435058594,
"epoch": 0.041884116151478655,
"grad_norm": 0.18240705132484436,
"kl": 0.007933239452540874,
"learning_rate": 2.092245989304813e-06,
"loss": -0.0029,
"reward": -0.609375,
"reward_std": 0.482421875,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.609375,
"step": 313
},
{
"completion_length": 329.16668701171875,
"epoch": 0.04201793121905527,
"grad_norm": 0.2159043848514557,
"kl": 0.008502164855599403,
"learning_rate": 2.0989304812834226e-06,
"loss": -0.0055,
"reward": -0.66796875,
"reward_std": 0.353515625,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.66796875,
"step": 314
},
{
"completion_length": 419.16668701171875,
"epoch": 0.042151746286631875,
"grad_norm": 0.12527386844158173,
"kl": 0.0033555077388882637,
"learning_rate": 2.1056149732620324e-06,
"loss": 0.0011,
"reward": -0.82421875,
"reward_std": 0.33984375,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.82421875,
"step": 315
},
{
"completion_length": 352.66668701171875,
"epoch": 0.04228556135420848,
"grad_norm": 0.1702316701412201,
"kl": 0.004789026454091072,
"learning_rate": 2.112299465240642e-06,
"loss": -0.0018,
"reward": -0.625,
"reward_std": 0.447265625,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.625,
"step": 316
},
{
"completion_length": 388.16668701171875,
"epoch": 0.042419376421785095,
"grad_norm": 0.1852673441171646,
"kl": 0.002679403405636549,
"learning_rate": 2.118983957219251e-06,
"loss": 0.0042,
"reward": -0.81640625,
"reward_std": 0.36328125,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.81640625,
"step": 317
},
{
"completion_length": 283.3333435058594,
"epoch": 0.0425531914893617,
"grad_norm": 0.21826620399951935,
"kl": 0.007197022438049316,
"learning_rate": 2.125668449197861e-06,
"loss": 0.0039,
"reward": -0.44921875,
"reward_std": 0.265625,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.44921875,
"step": 318
},
{
"completion_length": 311.16668701171875,
"epoch": 0.042687006556938314,
"grad_norm": 0.2093047946691513,
"kl": 0.006402711849659681,
"learning_rate": 2.132352941176471e-06,
"loss": -0.003,
"reward": -0.62890625,
"reward_std": 0.2333984375,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.62890625,
"step": 319
},
{
"completion_length": 261.16668701171875,
"epoch": 0.04282082162451492,
"grad_norm": 0.18230240046977997,
"kl": 0.010287894867360592,
"learning_rate": 2.1390374331550802e-06,
"loss": 0.0014,
"reward": -0.375,
"reward_std": 0.19921875,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.375,
"step": 320
},
{
"completion_length": 241.1666717529297,
"epoch": 0.04295463669209153,
"grad_norm": 0.21121013164520264,
"kl": 0.010337308049201965,
"learning_rate": 2.14572192513369e-06,
"loss": -0.002,
"reward": -0.341796875,
"reward_std": 0.326171875,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.341796875,
"step": 321
},
{
"completion_length": 356.8333435058594,
"epoch": 0.04308845175966814,
"grad_norm": 0.1783302128314972,
"kl": 0.006110279820859432,
"learning_rate": 2.1524064171122994e-06,
"loss": 0.004,
"reward": -0.76171875,
"reward_std": 0.419921875,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.76171875,
"step": 322
},
{
"completion_length": 178.83334350585938,
"epoch": 0.04322226682724475,
"grad_norm": 0.26143407821655273,
"kl": 0.008865730836987495,
"learning_rate": 2.1590909090909092e-06,
"loss": 0.0012,
"reward": -0.0859375,
"reward_std": 0.1552734375,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.0859375,
"step": 323
},
{
"completion_length": 263.16668701171875,
"epoch": 0.043356081894821354,
"grad_norm": 0.19690528512001038,
"kl": 0.006059790961444378,
"learning_rate": 2.165775401069519e-06,
"loss": -0.0015,
"reward": -0.287109375,
"reward_std": 0.376953125,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.287109375,
"step": 324
},
{
"completion_length": 258.0,
"epoch": 0.04348989696239797,
"grad_norm": 0.18371239304542542,
"kl": 0.011453388258814812,
"learning_rate": 2.1724598930481285e-06,
"loss": -0.004,
"reward": -0.365234375,
"reward_std": 0.23828125,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.365234375,
"step": 325
},
{
"completion_length": 174.33334350585938,
"epoch": 0.04362371202997457,
"grad_norm": 0.3012731373310089,
"kl": 0.008510403335094452,
"learning_rate": 2.1791443850267383e-06,
"loss": 0.0,
"reward": -0.1416015625,
"reward_std": 0.0654296875,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.1416015625,
"step": 326
},
{
"completion_length": 419.0,
"epoch": 0.04375752709755119,
"grad_norm": 0.15368571877479553,
"kl": 0.004780076909810305,
"learning_rate": 2.185828877005348e-06,
"loss": 0.0,
"reward": -0.859375,
"reward_std": 0.14453125,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.859375,
"step": 327
},
{
"completion_length": 505.3333435058594,
"epoch": 0.04389134216512779,
"grad_norm": 0.1414371132850647,
"kl": 0.004692884162068367,
"learning_rate": 2.1925133689839575e-06,
"loss": 0.0067,
"reward": -1.3046875,
"reward_std": 0.45703125,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -1.3046875,
"step": 328
},
{
"completion_length": 291.0,
"epoch": 0.0440251572327044,
"grad_norm": 0.13695944845676422,
"kl": 0.006905118003487587,
"learning_rate": 2.199197860962567e-06,
"loss": 0.0042,
"reward": -0.47265625,
"reward_std": 0.322265625,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.47265625,
"step": 329
},
{
"completion_length": 393.3333435058594,
"epoch": 0.04415897230028101,
"grad_norm": 0.15617813169956207,
"kl": 0.00954591017216444,
"learning_rate": 2.2058823529411767e-06,
"loss": 0.0029,
"reward": -0.6640625,
"reward_std": 0.55078125,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.6640625,
"step": 330
},
{
"completion_length": 380.8333435058594,
"epoch": 0.04429278736785762,
"grad_norm": 0.10685895383358002,
"kl": 0.0035809404216706753,
"learning_rate": 2.212566844919786e-06,
"loss": 0.0033,
"reward": -0.55078125,
"reward_std": 0.1953125,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.55078125,
"step": 331
},
{
"completion_length": 348.16668701171875,
"epoch": 0.04442660243543423,
"grad_norm": 0.14546167850494385,
"kl": 0.010220387950539589,
"learning_rate": 2.219251336898396e-06,
"loss": 0.0043,
"reward": -0.9375,
"reward_std": 0.5,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.9375,
"step": 332
},
{
"completion_length": 378.3333435058594,
"epoch": 0.04456041750301084,
"grad_norm": 0.16639195382595062,
"kl": 0.0055643776431679726,
"learning_rate": 2.2259358288770057e-06,
"loss": 0.0069,
"reward": -0.68359375,
"reward_std": 0.251953125,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.68359375,
"step": 333
},
{
"completion_length": 335.3333435058594,
"epoch": 0.044694232570587446,
"grad_norm": 0.15007461607456207,
"kl": 0.005303717218339443,
"learning_rate": 2.232620320855615e-06,
"loss": 0.0057,
"reward": -0.70703125,
"reward_std": 0.232421875,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.70703125,
"step": 334
},
{
"completion_length": 296.66668701171875,
"epoch": 0.04482804763816406,
"grad_norm": 0.1491575390100479,
"kl": 0.005803759675472975,
"learning_rate": 2.239304812834225e-06,
"loss": 0.0166,
"reward": -0.5234375,
"reward_std": 0.11328125,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.5234375,
"step": 335
},
{
"completion_length": 335.5,
"epoch": 0.044961862705740666,
"grad_norm": 0.13715343177318573,
"kl": 0.006967080291360617,
"learning_rate": 2.2459893048128343e-06,
"loss": -0.0008,
"reward": -0.69921875,
"reward_std": 0.28125,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.69921875,
"step": 336
},
{
"completion_length": 309.3333435058594,
"epoch": 0.04509567777331728,
"grad_norm": 0.24047787487506866,
"kl": 0.010651972144842148,
"learning_rate": 2.252673796791444e-06,
"loss": -0.0019,
"reward": -0.5859375,
"reward_std": 0.4296875,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.5859375,
"step": 337
},
{
"completion_length": 353.0,
"epoch": 0.045229492840893885,
"grad_norm": 0.15517961978912354,
"kl": 0.009843084029853344,
"learning_rate": 2.2593582887700535e-06,
"loss": 0.0027,
"reward": -0.8671875,
"reward_std": 0.42578125,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.8671875,
"step": 338
},
{
"completion_length": 212.5,
"epoch": 0.04536330790847049,
"grad_norm": 0.23865048587322235,
"kl": 0.010837538167834282,
"learning_rate": 2.2660427807486634e-06,
"loss": 0.0067,
"reward": -0.2734375,
"reward_std": 0.13671875,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.2734375,
"step": 339
},
{
"completion_length": 363.3333435058594,
"epoch": 0.045497122976047105,
"grad_norm": 0.13407538831233978,
"kl": 0.008769119158387184,
"learning_rate": 2.2727272727272728e-06,
"loss": 0.0023,
"reward": -0.7421875,
"reward_std": 0.2734375,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.7421875,
"step": 340
},
{
"completion_length": 319.16668701171875,
"epoch": 0.04563093804362371,
"grad_norm": 0.16852368414402008,
"kl": 0.006191683933138847,
"learning_rate": 2.2794117647058826e-06,
"loss": 0.0024,
"reward": -0.390625,
"reward_std": 0.23828125,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.390625,
"step": 341
},
{
"completion_length": 145.33334350585938,
"epoch": 0.04576475311120032,
"grad_norm": 0.2680109441280365,
"kl": 0.032642465084791183,
"learning_rate": 2.286096256684492e-06,
"loss": 0.0013,
"reward": -0.0281982421875,
"reward_std": 0.419921875,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.0281982421875,
"step": 342
},
{
"completion_length": 281.5,
"epoch": 0.04589856817877693,
"grad_norm": 0.1808832287788391,
"kl": 0.014683406800031662,
"learning_rate": 2.292780748663102e-06,
"loss": 0.0029,
"reward": -0.396484375,
"reward_std": 0.2080078125,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.396484375,
"step": 343
},
{
"completion_length": 287.66668701171875,
"epoch": 0.04603238324635354,
"grad_norm": 0.1347428560256958,
"kl": 0.005576578434556723,
"learning_rate": 2.2994652406417116e-06,
"loss": 0.0015,
"reward": -0.4140625,
"reward_std": 0.439453125,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.4140625,
"step": 344
},
{
"completion_length": 361.8333435058594,
"epoch": 0.04616619831393015,
"grad_norm": 0.13883595168590546,
"kl": 0.007222681771963835,
"learning_rate": 2.306149732620321e-06,
"loss": -0.0061,
"reward": -0.435546875,
"reward_std": 0.10888671875,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.435546875,
"step": 345
},
{
"completion_length": 229.5,
"epoch": 0.04630001338150676,
"grad_norm": 0.23361890017986298,
"kl": 0.035799503326416016,
"learning_rate": 2.312834224598931e-06,
"loss": 0.0001,
"reward": -0.267578125,
"reward_std": 0.248046875,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.267578125,
"step": 346
},
{
"completion_length": 208.83334350585938,
"epoch": 0.046433828449083364,
"grad_norm": 0.25818759202957153,
"kl": 0.01660415530204773,
"learning_rate": 2.3195187165775402e-06,
"loss": -0.0006,
"reward": -0.2412109375,
"reward_std": 0.37109375,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.2412109375,
"step": 347
},
{
"completion_length": 270.66668701171875,
"epoch": 0.04656764351665998,
"grad_norm": 0.15176504850387573,
"kl": 0.008165406994521618,
"learning_rate": 2.32620320855615e-06,
"loss": 0.0044,
"reward": -0.40625,
"reward_std": 0.1494140625,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.40625,
"step": 348
},
{
"completion_length": 318.16668701171875,
"epoch": 0.046701458584236584,
"grad_norm": 0.14339715242385864,
"kl": 0.007072822656482458,
"learning_rate": 2.3328877005347594e-06,
"loss": -0.0096,
"reward": -0.5625,
"reward_std": 0.2021484375,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.5625,
"step": 349
},
{
"completion_length": 281.0,
"epoch": 0.0468352736518132,
"grad_norm": 0.16681039333343506,
"kl": 0.013782523572444916,
"learning_rate": 2.3395721925133692e-06,
"loss": 0.0001,
"reward": -0.296875,
"reward_std": 0.17578125,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.296875,
"step": 350
}
],
"logging_steps": 1,
"max_steps": 7473,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 50,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}