Q1.5 / trainer_state.json
C10X's picture
Upload trainer_state.json with huggingface_hub
c2314db verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.04014452027298274,
"eval_steps": 500,
"global_step": 300,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"completion_length": 564.25,
"epoch": 0.00013381506757660912,
"grad_norm": 0.14033202826976776,
"kl": 0.0,
"learning_rate": 8.88888888888889e-08,
"loss": 0.0,
"reward": -0.6452499628067017,
"reward_std": 0.8964393734931946,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.125,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.7702499628067017,
"step": 1
},
{
"completion_length": 523.0,
"epoch": 0.00026763013515321824,
"grad_norm": 0.1592259407043457,
"kl": 0.0,
"learning_rate": 1.777777777777778e-07,
"loss": -0.0,
"reward": -0.9787499904632568,
"reward_std": 1.8240368366241455,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.9787499904632568,
"step": 2
},
{
"completion_length": 264.25,
"epoch": 0.0004014452027298274,
"grad_norm": 0.1932271420955658,
"kl": 6.622826731472742e-06,
"learning_rate": 2.666666666666667e-07,
"loss": 0.0,
"reward": -0.226500004529953,
"reward_std": 0.26771190762519836,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.226500004529953,
"step": 3
},
{
"completion_length": 159.0,
"epoch": 0.0005352602703064365,
"grad_norm": 0.27002546191215515,
"kl": 1.078558216249803e-05,
"learning_rate": 3.555555555555556e-07,
"loss": 0.0,
"reward": 0.013750001788139343,
"reward_std": 0.2580934762954712,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.125,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.11124999821186066,
"step": 4
},
{
"completion_length": 320.75,
"epoch": 0.0006690753378830456,
"grad_norm": 0.1831292062997818,
"kl": 8.28549855214078e-06,
"learning_rate": 4.444444444444445e-07,
"loss": 0.0,
"reward": 0.0624999925494194,
"reward_std": 0.21419848501682281,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.125,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.0625,
"step": 5
},
{
"completion_length": 237.25,
"epoch": 0.0008028904054596548,
"grad_norm": 0.18136273324489594,
"kl": 5.389752914197743e-06,
"learning_rate": 5.333333333333335e-07,
"loss": 0.0,
"reward": -0.08349999785423279,
"reward_std": 0.16699999570846558,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.08349999785423279,
"step": 6
},
{
"completion_length": 375.25,
"epoch": 0.0009367054730362638,
"grad_norm": 0.18059618771076202,
"kl": 1.1201269444427453e-05,
"learning_rate": 6.222222222222223e-07,
"loss": 0.0,
"reward": -0.45899999141693115,
"reward_std": 0.9180000424385071,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.45899999141693115,
"step": 7
},
{
"completion_length": 277.0,
"epoch": 0.001070520540612873,
"grad_norm": 0.19177855551242828,
"kl": 5.332793080015108e-06,
"learning_rate": 7.111111111111112e-07,
"loss": 0.0,
"reward": -0.0560000017285347,
"reward_std": 0.1120000034570694,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.0560000017285347,
"step": 8
},
{
"completion_length": 334.5,
"epoch": 0.0012043356081894822,
"grad_norm": 0.27444255352020264,
"kl": 1.158629765996011e-05,
"learning_rate": 8.000000000000001e-07,
"loss": 0.0,
"reward": 0.2122499942779541,
"reward_std": 0.39513909816741943,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.125,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.0872499942779541,
"step": 9
},
{
"completion_length": 297.25,
"epoch": 0.0013381506757660913,
"grad_norm": 0.20776011049747467,
"kl": 8.035244718485046e-06,
"learning_rate": 8.88888888888889e-07,
"loss": 0.0,
"reward": 0.5809999704360962,
"reward_std": 1.0095866918563843,
"rewards/correctness_reward_func": 1.0,
"rewards/int_reward_func": 0.25,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.6690000295639038,
"step": 10
},
{
"completion_length": 236.5,
"epoch": 0.0014719657433427003,
"grad_norm": 9.880962898023427e-05,
"kl": 7.1510494308313355e-06,
"learning_rate": 9.77777777777778e-07,
"loss": 0.0,
"reward": 0.0,
"reward_std": 0.0,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.0,
"step": 11
},
{
"completion_length": 342.25,
"epoch": 0.0016057808109193096,
"grad_norm": 0.11707701534032822,
"kl": 5.569832410401432e-06,
"learning_rate": 1.066666666666667e-06,
"loss": 0.0,
"reward": -0.4452499747276306,
"reward_std": 0.3653622269630432,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.125,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.5702500343322754,
"step": 12
},
{
"completion_length": 261.0,
"epoch": 0.0017395958784959186,
"grad_norm": 0.22486890852451324,
"kl": 7.698228728258982e-06,
"learning_rate": 1.1555555555555556e-06,
"loss": 0.0,
"reward": -0.09549999237060547,
"reward_std": 0.451213538646698,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.125,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.22049999237060547,
"step": 13
},
{
"completion_length": 289.5,
"epoch": 0.0018734109460725277,
"grad_norm": 0.0001465526584070176,
"kl": 8.079272447503172e-06,
"learning_rate": 1.2444444444444445e-06,
"loss": 0.0,
"reward": 0.0,
"reward_std": 0.0,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.0,
"step": 14
},
{
"completion_length": 467.75,
"epoch": 0.002007226013649137,
"grad_norm": 7.054989691823721e-05,
"kl": 8.345767128048465e-06,
"learning_rate": 1.3333333333333334e-06,
"loss": 0.0,
"reward": 0.0,
"reward_std": 0.0,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.0,
"step": 15
},
{
"completion_length": 242.5,
"epoch": 0.002141041081225746,
"grad_norm": 0.20067830383777618,
"kl": 4.895833171758568e-06,
"learning_rate": 1.4222222222222223e-06,
"loss": 0.0,
"reward": 0.5694999694824219,
"reward_std": 0.9794492721557617,
"rewards/correctness_reward_func": 0.5,
"rewards/int_reward_func": 0.125,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.05550000071525574,
"step": 16
},
{
"completion_length": 397.75,
"epoch": 0.002274856148802355,
"grad_norm": 6.06259964115452e-05,
"kl": 7.802555046509951e-06,
"learning_rate": 1.5111111111111112e-06,
"loss": 0.0,
"reward": 0.0,
"reward_std": 0.0,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.0,
"step": 17
},
{
"completion_length": 263.75,
"epoch": 0.0024086712163789645,
"grad_norm": 0.2957823872566223,
"kl": 8.453114787698723e-06,
"learning_rate": 1.6000000000000001e-06,
"loss": 0.0,
"reward": -0.012249999679625034,
"reward_std": 0.02449999935925007,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.012249999679625034,
"step": 18
},
{
"completion_length": 301.0,
"epoch": 0.0025424862839555735,
"grad_norm": 0.277434378862381,
"kl": 8.06238858785946e-06,
"learning_rate": 1.688888888888889e-06,
"loss": 0.0,
"reward": 0.1875,
"reward_std": 0.375,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.125,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.0625,
"step": 19
},
{
"completion_length": 255.75,
"epoch": 0.0026763013515321826,
"grad_norm": 9.750338358571753e-05,
"kl": 7.102700237737736e-06,
"learning_rate": 1.777777777777778e-06,
"loss": 0.0,
"reward": 0.0,
"reward_std": 0.0,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.0,
"step": 20
},
{
"completion_length": 200.5,
"epoch": 0.0028101164191087916,
"grad_norm": 0.29159611463546753,
"kl": 8.880564564606175e-06,
"learning_rate": 1.8666666666666669e-06,
"loss": 0.0,
"reward": -0.1912499964237213,
"reward_std": 0.13046424090862274,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.1912499964237213,
"step": 21
},
{
"completion_length": 265.0,
"epoch": 0.0029439314866854006,
"grad_norm": 0.31326013803482056,
"kl": 1.243360384250991e-05,
"learning_rate": 1.955555555555556e-06,
"loss": 0.0,
"reward": -0.25849997997283936,
"reward_std": 0.8479105830192566,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.125,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.38349997997283936,
"step": 22
},
{
"completion_length": 154.0,
"epoch": 0.00307774655426201,
"grad_norm": 0.2559646666049957,
"kl": 5.530042471946217e-06,
"learning_rate": 2.0444444444444447e-06,
"loss": 0.0,
"reward": -0.05624999850988388,
"reward_std": 0.11249999701976776,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.05624999850988388,
"step": 23
},
{
"completion_length": 189.75,
"epoch": 0.003211561621838619,
"grad_norm": 0.284862220287323,
"kl": 9.174956176138949e-06,
"learning_rate": 2.133333333333334e-06,
"loss": 0.0,
"reward": 0.1277499943971634,
"reward_std": 0.2554999887943268,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.125,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.002749999985098839,
"step": 24
},
{
"completion_length": 268.75,
"epoch": 0.003345376689415228,
"grad_norm": 0.15809978544712067,
"kl": 4.42105692854966e-06,
"learning_rate": 2.222222222222222e-06,
"loss": 0.0,
"reward": -0.2212499976158142,
"reward_std": 0.4424999952316284,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.2212499976158142,
"step": 25
},
{
"completion_length": 210.0,
"epoch": 0.0034791917569918372,
"grad_norm": 0.2747434675693512,
"kl": 8.758857802604325e-06,
"learning_rate": 2.311111111111111e-06,
"loss": 0.0,
"reward": 0.6607499718666077,
"reward_std": 1.2395679950714111,
"rewards/correctness_reward_func": 0.5,
"rewards/int_reward_func": 0.125,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.035750001668930054,
"step": 26
},
{
"completion_length": 437.25,
"epoch": 0.0036130068245684463,
"grad_norm": 0.18731197714805603,
"kl": 1.347947727481369e-05,
"learning_rate": 2.4000000000000003e-06,
"loss": 0.0,
"reward": -0.06724999845027924,
"reward_std": 0.13449999690055847,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.06724999845027924,
"step": 27
},
{
"completion_length": 307.75,
"epoch": 0.0037468218921450553,
"grad_norm": 0.21840307116508484,
"kl": 8.306662493851036e-06,
"learning_rate": 2.488888888888889e-06,
"loss": 0.0,
"reward": -0.07774999737739563,
"reward_std": 0.24599508941173553,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.07774999737739563,
"step": 28
},
{
"completion_length": 620.25,
"epoch": 0.003880636959721665,
"grad_norm": 4.864737275056541e-05,
"kl": 6.508589649456553e-06,
"learning_rate": 2.577777777777778e-06,
"loss": 0.0,
"reward": 0.0,
"reward_std": 0.0,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.0,
"step": 29
},
{
"completion_length": 292.75,
"epoch": 0.004014452027298274,
"grad_norm": 0.4136326313018799,
"kl": 1.4081160770729184e-05,
"learning_rate": 2.666666666666667e-06,
"loss": 0.0,
"reward": -0.05950000137090683,
"reward_std": 0.30902159214019775,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.05950000137090683,
"step": 30
},
{
"completion_length": 398.25,
"epoch": 0.004148267094874883,
"grad_norm": 4.4143911509308964e-05,
"kl": 5.329064151737839e-06,
"learning_rate": 2.755555555555556e-06,
"loss": 0.0,
"reward": 0.0,
"reward_std": 0.0,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.0,
"step": 31
},
{
"completion_length": 262.5,
"epoch": 0.004282082162451492,
"grad_norm": 5.927432721364312e-05,
"kl": 4.8590136429993436e-06,
"learning_rate": 2.8444444444444446e-06,
"loss": 0.0,
"reward": 0.0,
"reward_std": 0.0,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.0,
"step": 32
},
{
"completion_length": 307.0,
"epoch": 0.004415897230028101,
"grad_norm": 0.19121825695037842,
"kl": 6.706204203510424e-06,
"learning_rate": 2.9333333333333338e-06,
"loss": 0.0,
"reward": -0.3400000333786011,
"reward_std": 0.5228951573371887,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.125,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.4650000333786011,
"step": 33
},
{
"completion_length": 463.5,
"epoch": 0.00454971229760471,
"grad_norm": 5.6564931583125144e-05,
"kl": 4.871402779826894e-06,
"learning_rate": 3.0222222222222225e-06,
"loss": 0.0,
"reward": 0.0,
"reward_std": 0.0,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.0,
"step": 34
},
{
"completion_length": 352.25,
"epoch": 0.004683527365181319,
"grad_norm": 0.1656392216682434,
"kl": 6.969175501581049e-06,
"learning_rate": 3.1111111111111116e-06,
"loss": 0.0,
"reward": 0.37549999356269836,
"reward_std": 0.7509999871253967,
"rewards/correctness_reward_func": 0.5,
"rewards/int_reward_func": 0.125,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.24950000643730164,
"step": 35
},
{
"completion_length": 388.75,
"epoch": 0.004817342432757929,
"grad_norm": 5.8849836932495236e-05,
"kl": 7.587910658912733e-06,
"learning_rate": 3.2000000000000003e-06,
"loss": 0.0,
"reward": 0.0,
"reward_std": 0.0,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.0,
"step": 36
},
{
"completion_length": 272.75,
"epoch": 0.004951157500334538,
"grad_norm": 0.19752717018127441,
"kl": 9.09756181499688e-06,
"learning_rate": 3.2888888888888894e-06,
"loss": 0.0,
"reward": 0.06274999678134918,
"reward_std": 0.12549999356269836,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.06274999678134918,
"step": 37
},
{
"completion_length": 265.25,
"epoch": 0.005084972567911147,
"grad_norm": 0.1716996431350708,
"kl": 7.701055437792093e-06,
"learning_rate": 3.377777777777778e-06,
"loss": 0.0,
"reward": -0.17024999856948853,
"reward_std": 0.197299063205719,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.125,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.2952499985694885,
"step": 38
},
{
"completion_length": 198.25,
"epoch": 0.005218787635487756,
"grad_norm": 8.153666567523032e-05,
"kl": 7.68154040997615e-06,
"learning_rate": 3.4666666666666672e-06,
"loss": 0.0,
"reward": 0.0,
"reward_std": 0.0,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.0,
"step": 39
},
{
"completion_length": 357.75,
"epoch": 0.005352602703064365,
"grad_norm": 0.16188818216323853,
"kl": 8.425393389188685e-06,
"learning_rate": 3.555555555555556e-06,
"loss": 0.0,
"reward": -0.09000000357627869,
"reward_std": 0.18000000715255737,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.09000000357627869,
"step": 40
},
{
"completion_length": 322.5,
"epoch": 0.005486417770640974,
"grad_norm": 0.22530034184455872,
"kl": 1.0753658898465801e-05,
"learning_rate": 3.644444444444445e-06,
"loss": 0.0,
"reward": 0.045249998569488525,
"reward_std": 0.09049999713897705,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.125,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.07975000143051147,
"step": 41
},
{
"completion_length": 247.75,
"epoch": 0.005620232838217583,
"grad_norm": 0.00012052639067405835,
"kl": 8.426506610703655e-06,
"learning_rate": 3.7333333333333337e-06,
"loss": 0.0,
"reward": 0.0,
"reward_std": 0.0,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.0,
"step": 42
},
{
"completion_length": 354.25,
"epoch": 0.005754047905794192,
"grad_norm": 0.21594372391700745,
"kl": 1.0382359505456407e-05,
"learning_rate": 3.8222222222222224e-06,
"loss": 0.0,
"reward": 0.11999999731779099,
"reward_std": 0.1444529891014099,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.11999999731779099,
"step": 43
},
{
"completion_length": 212.25,
"epoch": 0.005887862973370801,
"grad_norm": 0.00011440912930993363,
"kl": 7.224463388411095e-06,
"learning_rate": 3.911111111111112e-06,
"loss": 0.0,
"reward": 0.0,
"reward_std": 0.0,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.0,
"step": 44
},
{
"completion_length": 249.75,
"epoch": 0.00602167804094741,
"grad_norm": 0.20934244990348816,
"kl": 1.420614898961503e-05,
"learning_rate": 4.000000000000001e-06,
"loss": 0.0,
"reward": 0.06699999421834946,
"reward_std": 0.12292815744876862,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.06699999421834946,
"step": 45
},
{
"completion_length": 273.0,
"epoch": 0.00615549310852402,
"grad_norm": 0.2055320143699646,
"kl": 1.246560350409709e-05,
"learning_rate": 4.088888888888889e-06,
"loss": 0.0,
"reward": 0.43024998903274536,
"reward_std": 0.8604999780654907,
"rewards/correctness_reward_func": 0.5,
"rewards/int_reward_func": 0.125,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.19474999606609344,
"step": 46
},
{
"completion_length": 350.25,
"epoch": 0.006289308176100629,
"grad_norm": 0.18098776042461395,
"kl": 1.056162545864936e-05,
"learning_rate": 4.177777777777778e-06,
"loss": 0.0,
"reward": -0.20074999332427979,
"reward_std": 0.40149998664855957,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.125,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.3257499933242798,
"step": 47
},
{
"completion_length": 282.25,
"epoch": 0.006423123243677238,
"grad_norm": 0.2726364731788635,
"kl": 1.3284003216540441e-05,
"learning_rate": 4.266666666666668e-06,
"loss": 0.0,
"reward": -0.6389999985694885,
"reward_std": 0.7382077574729919,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.6389999985694885,
"step": 48
},
{
"completion_length": 165.5,
"epoch": 0.006556938311253847,
"grad_norm": 0.439759224653244,
"kl": 1.5033414456411265e-05,
"learning_rate": 4.3555555555555555e-06,
"loss": 0.0,
"reward": 0.5855000019073486,
"reward_std": 1.1710000038146973,
"rewards/correctness_reward_func": 0.5,
"rewards/int_reward_func": 0.125,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.039500001817941666,
"step": 49
},
{
"completion_length": 401.0,
"epoch": 0.006690753378830456,
"grad_norm": 0.1494322270154953,
"kl": 1.3026328815612942e-05,
"learning_rate": 4.444444444444444e-06,
"loss": 0.0,
"reward": -0.11124999821186066,
"reward_std": 0.2224999964237213,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.11124999821186066,
"step": 50
},
{
"completion_length": 205.25,
"epoch": 0.006824568446407065,
"grad_norm": 0.23992237448692322,
"kl": 1.3739524547418114e-05,
"learning_rate": 4.533333333333334e-06,
"loss": 0.0,
"reward": 0.03125,
"reward_std": 0.0625,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.03125,
"step": 51
},
{
"completion_length": 187.25,
"epoch": 0.0069583835139836745,
"grad_norm": 0.2555287480354309,
"kl": 1.5424680896103382e-05,
"learning_rate": 4.622222222222222e-06,
"loss": 0.0,
"reward": -0.07625000178813934,
"reward_std": 0.12490096688270569,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.07625000178813934,
"step": 52
},
{
"completion_length": 230.0,
"epoch": 0.0070921985815602835,
"grad_norm": 0.00012860310380347073,
"kl": 1.0372207725595217e-05,
"learning_rate": 4.711111111111111e-06,
"loss": 0.0,
"reward": 0.0,
"reward_std": 0.0,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.0,
"step": 53
},
{
"completion_length": 299.25,
"epoch": 0.0072260136491368926,
"grad_norm": 0.00015418548719026148,
"kl": 1.1093783541582525e-05,
"learning_rate": 4.800000000000001e-06,
"loss": 0.0,
"reward": 0.0,
"reward_std": 0.0,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.0,
"step": 54
},
{
"completion_length": 392.25,
"epoch": 0.007359828716713502,
"grad_norm": 0.0001652293576626107,
"kl": 1.3498687621904537e-05,
"learning_rate": 4.888888888888889e-06,
"loss": 0.0,
"reward": 0.0,
"reward_std": 0.0,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.0,
"step": 55
},
{
"completion_length": 316.5,
"epoch": 0.007493643784290111,
"grad_norm": 0.14983882009983063,
"kl": 2.462963675498031e-05,
"learning_rate": 4.977777777777778e-06,
"loss": 0.0,
"reward": -0.3332500159740448,
"reward_std": 0.4945647418498993,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.3332500159740448,
"step": 56
},
{
"completion_length": 219.0,
"epoch": 0.0076274588518667205,
"grad_norm": 0.2960438132286072,
"kl": 3.720477252500132e-05,
"learning_rate": 5.0666666666666676e-06,
"loss": 0.0,
"reward": -0.13199999928474426,
"reward_std": 0.2639999985694885,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.13199999928474426,
"step": 57
},
{
"completion_length": 235.5,
"epoch": 0.00776127391944333,
"grad_norm": 0.000978139229118824,
"kl": 3.992651545559056e-05,
"learning_rate": 5.155555555555556e-06,
"loss": 0.0,
"reward": 0.0,
"reward_std": 0.0,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.0,
"step": 58
},
{
"completion_length": 249.0,
"epoch": 0.007895088987019938,
"grad_norm": 0.00023408984998241067,
"kl": 2.2337197151500732e-05,
"learning_rate": 5.244444444444445e-06,
"loss": 0.0,
"reward": 0.0,
"reward_std": 0.0,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.0,
"step": 59
},
{
"completion_length": 263.25,
"epoch": 0.008028904054596548,
"grad_norm": 0.2353731095790863,
"kl": 3.2109041057992727e-05,
"learning_rate": 5.333333333333334e-06,
"loss": 0.0,
"reward": -0.030500000342726707,
"reward_std": 0.061000000685453415,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.030500000342726707,
"step": 60
},
{
"completion_length": 527.75,
"epoch": 0.008162719122173156,
"grad_norm": 0.15068306028842926,
"kl": 2.8251575713511556e-05,
"learning_rate": 5.422222222222223e-06,
"loss": 0.0,
"reward": -0.27649998664855957,
"reward_std": 0.5227195024490356,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.27649998664855957,
"step": 61
},
{
"completion_length": 243.75,
"epoch": 0.008296534189749766,
"grad_norm": 0.21375156939029694,
"kl": 5.114181476528756e-05,
"learning_rate": 5.511111111111112e-06,
"loss": 0.0,
"reward": -0.06224999949336052,
"reward_std": 0.12449999898672104,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.06224999949336052,
"step": 62
},
{
"completion_length": 197.5,
"epoch": 0.008430349257326376,
"grad_norm": 0.20765534043312073,
"kl": 6.65866318740882e-05,
"learning_rate": 5.600000000000001e-06,
"loss": 0.0,
"reward": 0.06274999678134918,
"reward_std": 0.12549999356269836,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.06274999678134918,
"step": 63
},
{
"completion_length": 415.75,
"epoch": 0.008564164324902984,
"grad_norm": 0.00047714909305796027,
"kl": 5.4172531235963106e-05,
"learning_rate": 5.688888888888889e-06,
"loss": 0.0,
"reward": 0.0,
"reward_std": 0.0,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.0,
"step": 64
},
{
"completion_length": 561.25,
"epoch": 0.008697979392479594,
"grad_norm": 0.00017319328617304564,
"kl": 2.3787781174178235e-05,
"learning_rate": 5.777777777777778e-06,
"loss": 0.0,
"reward": 0.0,
"reward_std": 0.0,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.0,
"step": 65
},
{
"completion_length": 266.0,
"epoch": 0.008831794460056202,
"grad_norm": 0.18374453485012054,
"kl": 7.648408063687384e-05,
"learning_rate": 5.8666666666666675e-06,
"loss": 0.0,
"reward": -0.2854999899864197,
"reward_std": 0.40305209159851074,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.125,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.4104999899864197,
"step": 66
},
{
"completion_length": 367.0,
"epoch": 0.008965609527632812,
"grad_norm": 0.00041845859959721565,
"kl": 6.105724605731666e-05,
"learning_rate": 5.955555555555555e-06,
"loss": 0.0,
"reward": 0.0,
"reward_std": 0.0,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.0,
"step": 67
},
{
"completion_length": 165.25,
"epoch": 0.00909942459520942,
"grad_norm": 0.3174028694629669,
"kl": 0.00012760543904732913,
"learning_rate": 6.044444444444445e-06,
"loss": 0.0,
"reward": -0.09300000220537186,
"reward_std": 0.10758562386035919,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.09300000220537186,
"step": 68
},
{
"completion_length": 261.75,
"epoch": 0.00923323966278603,
"grad_norm": 0.0009887454798445106,
"kl": 0.00011684713535942137,
"learning_rate": 6.133333333333334e-06,
"loss": 0.0,
"reward": 0.0,
"reward_std": 0.0,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.0,
"step": 69
},
{
"completion_length": 351.25,
"epoch": 0.009367054730362638,
"grad_norm": 0.22120727598667145,
"kl": 8.070362673606724e-05,
"learning_rate": 6.222222222222223e-06,
"loss": 0.0,
"reward": 0.09224999696016312,
"reward_std": 0.18449999392032623,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.09224999696016312,
"step": 70
},
{
"completion_length": 364.75,
"epoch": 0.009500869797939248,
"grad_norm": 0.14219215512275696,
"kl": 0.00014313386054709554,
"learning_rate": 6.311111111111111e-06,
"loss": 0.0,
"reward": 0.03125,
"reward_std": 0.0625,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.03125,
"step": 71
},
{
"completion_length": 301.0,
"epoch": 0.009634684865515858,
"grad_norm": 0.00125442526768893,
"kl": 0.00022049600374884903,
"learning_rate": 6.4000000000000006e-06,
"loss": 0.0,
"reward": 0.0,
"reward_std": 0.0,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.0,
"step": 72
},
{
"completion_length": 257.5,
"epoch": 0.009768499933092466,
"grad_norm": 0.27416515350341797,
"kl": 0.0003145383088849485,
"learning_rate": 6.488888888888889e-06,
"loss": 0.0,
"reward": 0.20900002121925354,
"reward_std": 0.4749357998371124,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.375,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.16599999368190765,
"step": 73
},
{
"completion_length": 355.75,
"epoch": 0.009902315000669076,
"grad_norm": 0.0006047863862477243,
"kl": 9.019898425322026e-05,
"learning_rate": 6.577777777777779e-06,
"loss": 0.0,
"reward": 0.0,
"reward_std": 0.0,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.0,
"step": 74
},
{
"completion_length": 329.25,
"epoch": 0.010036130068245684,
"grad_norm": 0.19630220532417297,
"kl": 0.00013932358706369996,
"learning_rate": 6.666666666666667e-06,
"loss": 0.0,
"reward": -0.3462499976158142,
"reward_std": 0.6924999952316284,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.3462499976158142,
"step": 75
},
{
"completion_length": 203.0,
"epoch": 0.010169945135822294,
"grad_norm": 0.19907738268375397,
"kl": 0.0003505215863697231,
"learning_rate": 6.755555555555556e-06,
"loss": 0.0,
"reward": -0.08375000208616257,
"reward_std": 0.16750000417232513,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.08375000208616257,
"step": 76
},
{
"completion_length": 156.0,
"epoch": 0.010303760203398902,
"grad_norm": 0.0033248290419578552,
"kl": 0.0005173031822778285,
"learning_rate": 6.844444444444445e-06,
"loss": 0.0,
"reward": 0.0,
"reward_std": 0.0,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.0,
"step": 77
},
{
"completion_length": 248.5,
"epoch": 0.010437575270975512,
"grad_norm": 0.2999407947063446,
"kl": 0.0002213937696069479,
"learning_rate": 6.9333333333333344e-06,
"loss": 0.0,
"reward": 0.125,
"reward_std": 0.25,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.125,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.0,
"step": 78
},
{
"completion_length": 243.0,
"epoch": 0.01057139033855212,
"grad_norm": 0.2764039933681488,
"kl": 0.00041108846198767424,
"learning_rate": 7.022222222222222e-06,
"loss": 0.0,
"reward": -0.08399999886751175,
"reward_std": 0.1679999977350235,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.08399999886751175,
"step": 79
},
{
"completion_length": 577.0,
"epoch": 0.01070520540612873,
"grad_norm": 0.1485264003276825,
"kl": 0.0002135551767423749,
"learning_rate": 7.111111111111112e-06,
"loss": 0.0,
"reward": -0.6702499985694885,
"reward_std": 1.340499997138977,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.6702499985694885,
"step": 80
},
{
"completion_length": 491.75,
"epoch": 0.010839020473705338,
"grad_norm": 0.0006590148550458252,
"kl": 0.000124384619994089,
"learning_rate": 7.2000000000000005e-06,
"loss": 0.0,
"reward": 0.0,
"reward_std": 0.0,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.0,
"step": 81
},
{
"completion_length": 183.25,
"epoch": 0.010972835541281948,
"grad_norm": 0.0028198054060339928,
"kl": 0.000547908479347825,
"learning_rate": 7.28888888888889e-06,
"loss": 0.0,
"reward": 0.0,
"reward_std": 0.0,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.0,
"step": 82
},
{
"completion_length": 312.25,
"epoch": 0.011106650608858558,
"grad_norm": 0.27911630272865295,
"kl": 0.0005270105320960283,
"learning_rate": 7.377777777777778e-06,
"loss": 0.0,
"reward": 0.015999972820281982,
"reward_std": 1.7774159908294678,
"rewards/correctness_reward_func": 0.5,
"rewards/int_reward_func": 0.25,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.734000027179718,
"step": 83
},
{
"completion_length": 348.25,
"epoch": 0.011240465676435166,
"grad_norm": 0.0015268020797520876,
"kl": 0.00035788220702670515,
"learning_rate": 7.4666666666666675e-06,
"loss": 0.0,
"reward": 0.0,
"reward_std": 0.0,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.0,
"step": 84
},
{
"completion_length": 430.5,
"epoch": 0.011374280744011776,
"grad_norm": 0.13874612748622894,
"kl": 0.00037030907697044313,
"learning_rate": 7.555555555555556e-06,
"loss": 0.0,
"reward": -0.6794999837875366,
"reward_std": 1.3589999675750732,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.6794999837875366,
"step": 85
},
{
"completion_length": 264.5,
"epoch": 0.011508095811588384,
"grad_norm": 0.22207802534103394,
"kl": 0.00047897486365400255,
"learning_rate": 7.644444444444445e-06,
"loss": 0.0,
"reward": -0.3617500066757202,
"reward_std": 0.4231094419956207,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.3617500066757202,
"step": 86
},
{
"completion_length": 199.25,
"epoch": 0.011641910879164994,
"grad_norm": 0.26088446378707886,
"kl": 0.000934716546908021,
"learning_rate": 7.733333333333334e-06,
"loss": 0.0,
"reward": -0.17775000631809235,
"reward_std": 0.3555000126361847,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.17775000631809235,
"step": 87
},
{
"completion_length": 432.5,
"epoch": 0.011775725946741603,
"grad_norm": 0.22545722126960754,
"kl": 0.001022200332954526,
"learning_rate": 7.822222222222224e-06,
"loss": 0.0,
"reward": -0.10100000351667404,
"reward_std": 0.20200000703334808,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.10100000351667404,
"step": 88
},
{
"completion_length": 290.0,
"epoch": 0.011909541014318212,
"grad_norm": 0.0022929804399609566,
"kl": 0.0007066897815093398,
"learning_rate": 7.911111111111112e-06,
"loss": 0.0,
"reward": 0.0,
"reward_std": 0.0,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.0,
"step": 89
},
{
"completion_length": 306.5,
"epoch": 0.01204335608189482,
"grad_norm": 0.18963995575904846,
"kl": 0.0011429399019107223,
"learning_rate": 8.000000000000001e-06,
"loss": 0.0,
"reward": -0.3070000112056732,
"reward_std": 0.6140000224113464,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.3070000112056732,
"step": 90
},
{
"completion_length": 306.25,
"epoch": 0.01217717114947143,
"grad_norm": 0.32602232694625854,
"kl": 0.0012934200931340456,
"learning_rate": 8.08888888888889e-06,
"loss": 0.0001,
"reward": 0.021250000223517418,
"reward_std": 0.042500000447034836,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.021250000223517418,
"step": 91
},
{
"completion_length": 178.0,
"epoch": 0.01231098621704804,
"grad_norm": 0.32629159092903137,
"kl": 0.0017813891172409058,
"learning_rate": 8.177777777777779e-06,
"loss": 0.0001,
"reward": 0.6482499837875366,
"reward_std": 1.2964999675750732,
"rewards/correctness_reward_func": 0.5,
"rewards/int_reward_func": 0.125,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.023250000551342964,
"step": 92
},
{
"completion_length": 334.5,
"epoch": 0.012444801284624649,
"grad_norm": 0.0029060724191367626,
"kl": 0.0007846200605854392,
"learning_rate": 8.266666666666667e-06,
"loss": 0.0,
"reward": 0.0,
"reward_std": 0.0,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.0,
"step": 93
},
{
"completion_length": 389.5,
"epoch": 0.012578616352201259,
"grad_norm": 0.0021289298310875893,
"kl": 0.0007502713124267757,
"learning_rate": 8.355555555555556e-06,
"loss": 0.0,
"reward": 0.0,
"reward_std": 0.0,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.0,
"step": 94
},
{
"completion_length": 251.75,
"epoch": 0.012712431419777867,
"grad_norm": 0.011231260374188423,
"kl": 0.003988795448094606,
"learning_rate": 8.444444444444446e-06,
"loss": 0.0002,
"reward": 0.0,
"reward_std": 0.0,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.0,
"step": 95
},
{
"completion_length": 246.25,
"epoch": 0.012846246487354477,
"grad_norm": 0.0036670551635324955,
"kl": 0.0013651195913553238,
"learning_rate": 8.533333333333335e-06,
"loss": 0.0001,
"reward": 0.0,
"reward_std": 0.0,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.0,
"step": 96
},
{
"completion_length": 338.25,
"epoch": 0.012980061554931085,
"grad_norm": 0.1569143831729889,
"kl": 0.0011384707177057862,
"learning_rate": 8.622222222222223e-06,
"loss": 0.0,
"reward": -0.36274999380111694,
"reward_std": 0.8109769225120544,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.36274999380111694,
"step": 97
},
{
"completion_length": 260.5,
"epoch": 0.013113876622507695,
"grad_norm": 0.2566815912723541,
"kl": 0.0018418811960145831,
"learning_rate": 8.711111111111111e-06,
"loss": 0.0001,
"reward": -0.10649999976158142,
"reward_std": 0.21299999952316284,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.10649999976158142,
"step": 98
},
{
"completion_length": 588.5,
"epoch": 0.013247691690084303,
"grad_norm": 0.3212381899356842,
"kl": 0.0008077286183834076,
"learning_rate": 8.8e-06,
"loss": 0.0,
"reward": -0.1547500044107437,
"reward_std": 0.30949997901916504,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.1547500044107437,
"step": 99
},
{
"completion_length": 202.5,
"epoch": 0.013381506757660913,
"grad_norm": 0.31029075384140015,
"kl": 0.0019261679844930768,
"learning_rate": 8.888888888888888e-06,
"loss": 0.0001,
"reward": 0.03125,
"reward_std": 0.0625,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.03125,
"step": 100
},
{
"completion_length": 422.0,
"epoch": 0.013515321825237521,
"grad_norm": 0.2212388813495636,
"kl": 0.0015066334744915366,
"learning_rate": 8.977777777777778e-06,
"loss": 0.0001,
"reward": 0.03324999660253525,
"reward_std": 0.27917900681495667,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.03324999660253525,
"step": 101
},
{
"completion_length": 405.25,
"epoch": 0.01364913689281413,
"grad_norm": 0.0010115457698702812,
"kl": 0.0004617396043613553,
"learning_rate": 9.066666666666667e-06,
"loss": 0.0,
"reward": 0.0,
"reward_std": 0.0,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.0,
"step": 102
},
{
"completion_length": 254.25,
"epoch": 0.01378295196039074,
"grad_norm": 0.0024686024989932775,
"kl": 0.0011656455462798476,
"learning_rate": 9.155555555555557e-06,
"loss": 0.0,
"reward": 0.0,
"reward_std": 0.0,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.0,
"step": 103
},
{
"completion_length": 270.75,
"epoch": 0.013916767027967349,
"grad_norm": 0.3148210048675537,
"kl": 0.0020634387619793415,
"learning_rate": 9.244444444444445e-06,
"loss": 0.0001,
"reward": -0.37025001645088196,
"reward_std": 0.9000205397605896,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.125,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.49525001645088196,
"step": 104
},
{
"completion_length": 447.25,
"epoch": 0.014050582095543959,
"grad_norm": 0.00280591519549489,
"kl": 0.0013167858123779297,
"learning_rate": 9.333333333333334e-06,
"loss": 0.0001,
"reward": 0.0,
"reward_std": 0.0,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.0,
"step": 105
},
{
"completion_length": 250.25,
"epoch": 0.014184397163120567,
"grad_norm": 1.272940993309021,
"kl": 0.010282534174621105,
"learning_rate": 9.422222222222222e-06,
"loss": 0.0004,
"reward": -0.38475000858306885,
"reward_std": 0.7695000171661377,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.38475000858306885,
"step": 106
},
{
"completion_length": 514.0,
"epoch": 0.014318212230697177,
"grad_norm": 0.003415808780118823,
"kl": 0.0018425981979817152,
"learning_rate": 9.511111111111112e-06,
"loss": 0.0001,
"reward": 0.0,
"reward_std": 0.0,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.0,
"step": 107
},
{
"completion_length": 378.25,
"epoch": 0.014452027298273785,
"grad_norm": 0.0043991003185510635,
"kl": 0.0022172422613948584,
"learning_rate": 9.600000000000001e-06,
"loss": 0.0001,
"reward": 0.0,
"reward_std": 0.0,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.0,
"step": 108
},
{
"completion_length": 384.0,
"epoch": 0.014585842365850395,
"grad_norm": 0.15473611652851105,
"kl": 0.003075521672144532,
"learning_rate": 9.688888888888889e-06,
"loss": 0.0001,
"reward": 0.03125,
"reward_std": 0.0625,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.03125,
"step": 109
},
{
"completion_length": 386.25,
"epoch": 0.014719657433427003,
"grad_norm": 0.18655884265899658,
"kl": 0.0021878289990127087,
"learning_rate": 9.777777777777779e-06,
"loss": 0.0001,
"reward": -0.14174999296665192,
"reward_std": 0.28349998593330383,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.14174999296665192,
"step": 110
},
{
"completion_length": 388.5,
"epoch": 0.014853472501003613,
"grad_norm": 0.005759637802839279,
"kl": 0.0032043028622865677,
"learning_rate": 9.866666666666668e-06,
"loss": 0.0001,
"reward": 0.0,
"reward_std": 0.0,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.0,
"step": 111
},
{
"completion_length": 368.0,
"epoch": 0.014987287568580221,
"grad_norm": 0.003658253001049161,
"kl": 0.0023007667623460293,
"learning_rate": 9.955555555555556e-06,
"loss": 0.0001,
"reward": 0.0,
"reward_std": 0.0,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.0,
"step": 112
},
{
"completion_length": 174.0,
"epoch": 0.015121102636156831,
"grad_norm": 0.01522353570908308,
"kl": 0.008999449200928211,
"learning_rate": 1.0044444444444446e-05,
"loss": 0.0004,
"reward": 0.0,
"reward_std": 0.0,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.0,
"step": 113
},
{
"completion_length": 199.0,
"epoch": 0.015254917703733441,
"grad_norm": 0.2540055513381958,
"kl": 0.0061608292162418365,
"learning_rate": 1.0133333333333335e-05,
"loss": 0.0002,
"reward": 0.34974998235702515,
"reward_std": 0.24181587994098663,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.25,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.09974999725818634,
"step": 114
},
{
"completion_length": 261.0,
"epoch": 0.01538873277131005,
"grad_norm": 0.004695532377809286,
"kl": 0.0037373793311417103,
"learning_rate": 1.0222222222222223e-05,
"loss": 0.0001,
"reward": 0.0,
"reward_std": 0.0,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.0,
"step": 115
},
{
"completion_length": 460.75,
"epoch": 0.01552254783888666,
"grad_norm": 0.002644852502271533,
"kl": 0.0022986496333032846,
"learning_rate": 1.0311111111111113e-05,
"loss": 0.0001,
"reward": 0.0,
"reward_std": 0.0,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.0,
"step": 116
},
{
"completion_length": 199.75,
"epoch": 0.01565636290646327,
"grad_norm": 0.5485031604766846,
"kl": 0.0077914525754749775,
"learning_rate": 1.04e-05,
"loss": 0.0003,
"reward": 0.013749999925494194,
"reward_std": 0.11662010848522186,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.013749999925494194,
"step": 117
},
{
"completion_length": 221.5,
"epoch": 0.015790177974039876,
"grad_norm": 0.005294781178236008,
"kl": 0.004005097784101963,
"learning_rate": 1.048888888888889e-05,
"loss": 0.0002,
"reward": 0.0,
"reward_std": 0.0,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.0,
"step": 118
},
{
"completion_length": 261.25,
"epoch": 0.015923993041616485,
"grad_norm": 0.005487331189215183,
"kl": 0.00457819364964962,
"learning_rate": 1.0577777777777778e-05,
"loss": 0.0002,
"reward": 0.0,
"reward_std": 0.0,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.0,
"step": 119
},
{
"completion_length": 303.5,
"epoch": 0.016057808109193095,
"grad_norm": 0.18594208359718323,
"kl": 0.003819538513198495,
"learning_rate": 1.0666666666666667e-05,
"loss": 0.0002,
"reward": 0.23649999499320984,
"reward_std": 0.39350855350494385,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.125,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.11149999499320984,
"step": 120
},
{
"completion_length": 437.0,
"epoch": 0.016191623176769705,
"grad_norm": 0.003429972566664219,
"kl": 0.003270561108365655,
"learning_rate": 1.0755555555555557e-05,
"loss": 0.0001,
"reward": 0.0,
"reward_std": 0.0,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.0,
"step": 121
},
{
"completion_length": 225.75,
"epoch": 0.01632543824434631,
"grad_norm": 0.27138444781303406,
"kl": 0.008114825934171677,
"learning_rate": 1.0844444444444446e-05,
"loss": 0.0003,
"reward": 0.5557500123977661,
"reward_std": 1.1115000247955322,
"rewards/correctness_reward_func": 0.5,
"rewards/int_reward_func": 0.125,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.06925000250339508,
"step": 122
},
{
"completion_length": 228.75,
"epoch": 0.01645925331192292,
"grad_norm": 0.006109706126153469,
"kl": 0.0048021371476352215,
"learning_rate": 1.0933333333333334e-05,
"loss": 0.0002,
"reward": 0.0,
"reward_std": 0.0,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.0,
"step": 123
},
{
"completion_length": 260.25,
"epoch": 0.01659306837949953,
"grad_norm": 0.22336947917938232,
"kl": 0.008044867776334286,
"learning_rate": 1.1022222222222224e-05,
"loss": 0.0003,
"reward": -0.14000000059604645,
"reward_std": 0.2800000011920929,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.14000000059604645,
"step": 124
},
{
"completion_length": 285.0,
"epoch": 0.01672688344707614,
"grad_norm": 0.1823970526456833,
"kl": 0.003958097659051418,
"learning_rate": 1.1111111111111113e-05,
"loss": 0.0002,
"reward": 0.03125,
"reward_std": 0.0625,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.03125,
"step": 125
},
{
"completion_length": 510.0,
"epoch": 0.01686069851465275,
"grad_norm": 0.21305835247039795,
"kl": 0.003483413252979517,
"learning_rate": 1.1200000000000001e-05,
"loss": 0.0001,
"reward": -0.11699999868869781,
"reward_std": 0.23399999737739563,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.11699999868869781,
"step": 126
},
{
"completion_length": 201.0,
"epoch": 0.016994513582229358,
"grad_norm": 0.00481327623128891,
"kl": 0.004589317366480827,
"learning_rate": 1.1288888888888889e-05,
"loss": 0.0002,
"reward": 0.0,
"reward_std": 0.0,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.0,
"step": 127
},
{
"completion_length": 295.25,
"epoch": 0.017128328649805968,
"grad_norm": 0.019327977672219276,
"kl": 0.007599423639476299,
"learning_rate": 1.1377777777777779e-05,
"loss": 0.0003,
"reward": 0.0,
"reward_std": 0.0,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.0,
"step": 128
},
{
"completion_length": 260.0,
"epoch": 0.017262143717382578,
"grad_norm": 0.25404590368270874,
"kl": 0.007972000166773796,
"learning_rate": 1.1466666666666668e-05,
"loss": 0.0003,
"reward": 0.5900000333786011,
"reward_std": 1.0515261888504028,
"rewards/correctness_reward_func": 0.5,
"rewards/int_reward_func": 0.125,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.034999996423721313,
"step": 129
},
{
"completion_length": 366.0,
"epoch": 0.017395958784959187,
"grad_norm": 0.2250823676586151,
"kl": 0.0046564992517232895,
"learning_rate": 1.1555555555555556e-05,
"loss": 0.0002,
"reward": -0.08275000005960464,
"reward_std": 0.25571519136428833,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.08275000005960464,
"step": 130
},
{
"completion_length": 200.25,
"epoch": 0.017529773852535794,
"grad_norm": 0.33885374665260315,
"kl": 0.010599642992019653,
"learning_rate": 1.1644444444444446e-05,
"loss": 0.0004,
"reward": 0.65625,
"reward_std": 1.3125,
"rewards/correctness_reward_func": 0.5,
"rewards/int_reward_func": 0.125,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.03125,
"step": 131
},
{
"completion_length": 203.25,
"epoch": 0.017663588920112404,
"grad_norm": 0.35624194145202637,
"kl": 0.00767766498029232,
"learning_rate": 1.1733333333333335e-05,
"loss": 0.0003,
"reward": 0.18774999678134918,
"reward_std": 0.2974899113178253,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.125,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.06274999678134918,
"step": 132
},
{
"completion_length": 225.0,
"epoch": 0.017797403987689014,
"grad_norm": 0.21870525181293488,
"kl": 0.005271477624773979,
"learning_rate": 1.1822222222222225e-05,
"loss": 0.0002,
"reward": 0.06875000149011612,
"reward_std": 0.13750000298023224,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.125,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.05624999850988388,
"step": 133
},
{
"completion_length": 299.25,
"epoch": 0.017931219055265624,
"grad_norm": 0.008637133985757828,
"kl": 0.00660554226487875,
"learning_rate": 1.191111111111111e-05,
"loss": 0.0003,
"reward": 0.0,
"reward_std": 0.0,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.0,
"step": 134
},
{
"completion_length": 307.25,
"epoch": 0.018065034122842234,
"grad_norm": 0.32039323449134827,
"kl": 0.010449215769767761,
"learning_rate": 1.2e-05,
"loss": 0.0004,
"reward": 0.1067499965429306,
"reward_std": 0.12341629713773727,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.1067499965429306,
"step": 135
},
{
"completion_length": 323.0,
"epoch": 0.01819884919041884,
"grad_norm": 0.1738128513097763,
"kl": 0.00565626285970211,
"learning_rate": 1.208888888888889e-05,
"loss": 0.0002,
"reward": -0.37550002336502075,
"reward_std": 0.7510000467300415,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.125,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.5005000233650208,
"step": 136
},
{
"completion_length": 136.5,
"epoch": 0.01833266425799545,
"grad_norm": 0.5343514084815979,
"kl": 0.010712994262576103,
"learning_rate": 1.217777777777778e-05,
"loss": 0.0004,
"reward": 0.03849999979138374,
"reward_std": 0.07699999213218689,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.03849999979138374,
"step": 137
},
{
"completion_length": 393.5,
"epoch": 0.01846647932557206,
"grad_norm": 0.23455312848091125,
"kl": 0.0038801138289272785,
"learning_rate": 1.2266666666666667e-05,
"loss": 0.0002,
"reward": 0.125,
"reward_std": 0.25,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.125,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.0,
"step": 138
},
{
"completion_length": 227.5,
"epoch": 0.01860029439314867,
"grad_norm": 0.011444750241935253,
"kl": 0.0084530059248209,
"learning_rate": 1.2355555555555557e-05,
"loss": 0.0003,
"reward": 0.0,
"reward_std": 0.0,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.0,
"step": 139
},
{
"completion_length": 214.5,
"epoch": 0.018734109460725276,
"grad_norm": 0.004309752490371466,
"kl": 0.0035294159315526485,
"learning_rate": 1.2444444444444446e-05,
"loss": 0.0001,
"reward": 0.0,
"reward_std": 0.0,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.0,
"step": 140
},
{
"completion_length": 186.75,
"epoch": 0.018867924528301886,
"grad_norm": 0.30907976627349854,
"kl": 0.005284931510686874,
"learning_rate": 1.2533333333333336e-05,
"loss": 0.0002,
"reward": -0.02500000037252903,
"reward_std": 0.05000000074505806,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.02500000037252903,
"step": 141
},
{
"completion_length": 196.5,
"epoch": 0.019001739595878496,
"grad_norm": 0.482105016708374,
"kl": 0.006421719677746296,
"learning_rate": 1.2622222222222222e-05,
"loss": 0.0003,
"reward": 0.5855000019073486,
"reward_std": 1.3796979188919067,
"rewards/correctness_reward_func": 0.5,
"rewards/int_reward_func": 0.125,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.03949999809265137,
"step": 142
},
{
"completion_length": 147.0,
"epoch": 0.019135554663455106,
"grad_norm": 0.4996793866157532,
"kl": 0.008296657353639603,
"learning_rate": 1.2711111111111112e-05,
"loss": 0.0003,
"reward": -0.08299999684095383,
"reward_std": 0.340639591217041,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.125,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.20800000429153442,
"step": 143
},
{
"completion_length": 346.0,
"epoch": 0.019269369731031716,
"grad_norm": 0.2119971662759781,
"kl": 0.004949862137436867,
"learning_rate": 1.2800000000000001e-05,
"loss": 0.0002,
"reward": -0.26100000739097595,
"reward_std": 0.5668544769287109,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.26100000739097595,
"step": 144
},
{
"completion_length": 321.5,
"epoch": 0.019403184798608322,
"grad_norm": 0.2375519573688507,
"kl": 0.003724107053130865,
"learning_rate": 1.288888888888889e-05,
"loss": 0.0001,
"reward": 0.0794999971985817,
"reward_std": 0.49268415570259094,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.25,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.1705000102519989,
"step": 145
},
{
"completion_length": 268.25,
"epoch": 0.019536999866184932,
"grad_norm": 0.3193058967590332,
"kl": 0.0033836988732218742,
"learning_rate": 1.2977777777777779e-05,
"loss": 0.0001,
"reward": -0.11349999159574509,
"reward_std": 0.5760402083396912,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.125,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.23849999904632568,
"step": 146
},
{
"completion_length": 173.25,
"epoch": 0.019670814933761542,
"grad_norm": 0.26112160086631775,
"kl": 0.006745063699781895,
"learning_rate": 1.3066666666666668e-05,
"loss": 0.0003,
"reward": 1.871500015258789,
"reward_std": 1.0523637533187866,
"rewards/correctness_reward_func": 1.5,
"rewards/int_reward_func": 0.5,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.12849999964237213,
"step": 147
},
{
"completion_length": 261.75,
"epoch": 0.019804630001338152,
"grad_norm": 0.2869455814361572,
"kl": 0.008915035054087639,
"learning_rate": 1.3155555555555558e-05,
"loss": 0.0004,
"reward": 0.2330000102519989,
"reward_std": 0.4726901948451996,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.125,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.1080000028014183,
"step": 148
},
{
"completion_length": 401.25,
"epoch": 0.01993844506891476,
"grad_norm": 0.21081912517547607,
"kl": 0.0027889276389032602,
"learning_rate": 1.3244444444444447e-05,
"loss": 0.0001,
"reward": -0.21125000715255737,
"reward_std": 0.24462132155895233,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.21125000715255737,
"step": 149
},
{
"completion_length": 314.25,
"epoch": 0.02007226013649137,
"grad_norm": 0.19632020592689514,
"kl": 0.006448840722441673,
"learning_rate": 1.3333333333333333e-05,
"loss": 0.0003,
"reward": -0.29874998331069946,
"reward_std": 0.35072246193885803,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.29874998331069946,
"step": 150
},
{
"completion_length": 178.5,
"epoch": 0.020206075204067978,
"grad_norm": 0.43643537163734436,
"kl": 0.009863872081041336,
"learning_rate": 1.3422222222222223e-05,
"loss": 0.0004,
"reward": -0.001499999314546585,
"reward_std": 0.14289741218090057,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.001499999314546585,
"step": 151
},
{
"completion_length": 146.25,
"epoch": 0.020339890271644588,
"grad_norm": 0.32021835446357727,
"kl": 0.015754813328385353,
"learning_rate": 1.3511111111111112e-05,
"loss": 0.0006,
"reward": 0.11400000005960464,
"reward_std": 0.1562071293592453,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.11400000005960464,
"step": 152
},
{
"completion_length": 93.75,
"epoch": 0.020473705339221198,
"grad_norm": 0.531770646572113,
"kl": 0.015665946528315544,
"learning_rate": 1.3600000000000002e-05,
"loss": 0.0006,
"reward": 0.7382500171661377,
"reward_std": 1.344576358795166,
"rewards/correctness_reward_func": 0.5,
"rewards/int_reward_func": 0.125,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.1132500022649765,
"step": 153
},
{
"completion_length": 199.75,
"epoch": 0.020607520406797804,
"grad_norm": 0.46382462978363037,
"kl": 0.012057983316481113,
"learning_rate": 1.368888888888889e-05,
"loss": 0.0005,
"reward": 0.09950000047683716,
"reward_std": 0.14664356410503387,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.25,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.15049999952316284,
"step": 154
},
{
"completion_length": 378.0,
"epoch": 0.020741335474374414,
"grad_norm": 0.2936302125453949,
"kl": 0.00859862007200718,
"learning_rate": 1.377777777777778e-05,
"loss": 0.0003,
"reward": -0.07199999690055847,
"reward_std": 0.6001955270767212,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.125,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.19699999690055847,
"step": 155
},
{
"completion_length": 143.75,
"epoch": 0.020875150541951024,
"grad_norm": 0.36545681953430176,
"kl": 0.018919892609119415,
"learning_rate": 1.3866666666666669e-05,
"loss": 0.0008,
"reward": 0.2567500174045563,
"reward_std": 0.42568716406822205,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.125,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.13175000250339508,
"step": 156
},
{
"completion_length": 110.5,
"epoch": 0.021008965609527634,
"grad_norm": 0.5882317423820496,
"kl": 0.027741603553295135,
"learning_rate": 1.3955555555555558e-05,
"loss": 0.0011,
"reward": 0.17000000178813934,
"reward_std": 0.14358505606651306,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.17000000178813934,
"step": 157
},
{
"completion_length": 64.75,
"epoch": 0.02114278067710424,
"grad_norm": 1.1799137592315674,
"kl": 0.056555233895778656,
"learning_rate": 1.4044444444444445e-05,
"loss": 0.0023,
"reward": 0.38724997639656067,
"reward_std": 0.34333789348602295,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.125,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.26225000619888306,
"step": 158
},
{
"completion_length": 194.0,
"epoch": 0.02127659574468085,
"grad_norm": 0.30677542090415955,
"kl": 0.01423485018312931,
"learning_rate": 1.4133333333333334e-05,
"loss": 0.0006,
"reward": 0.10674998164176941,
"reward_std": 0.7794790267944336,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.25,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.1432499885559082,
"step": 159
},
{
"completion_length": 167.0,
"epoch": 0.02141041081225746,
"grad_norm": 0.42824897170066833,
"kl": 0.03500333055853844,
"learning_rate": 1.4222222222222224e-05,
"loss": 0.0014,
"reward": 0.3812499940395355,
"reward_std": 0.1884257048368454,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.25,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.13124999403953552,
"step": 160
},
{
"completion_length": 60.75,
"epoch": 0.02154422587983407,
"grad_norm": 0.5921115279197693,
"kl": 0.05752525106072426,
"learning_rate": 1.4311111111111111e-05,
"loss": 0.0023,
"reward": 0.2929999828338623,
"reward_std": 0.0557195246219635,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.2930000126361847,
"step": 161
},
{
"completion_length": 93.0,
"epoch": 0.021678040947410677,
"grad_norm": 0.6500325202941895,
"kl": 0.05201143026351929,
"learning_rate": 1.4400000000000001e-05,
"loss": 0.0021,
"reward": 0.17274999618530273,
"reward_std": 0.06422551721334457,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.17274999618530273,
"step": 162
},
{
"completion_length": 126.25,
"epoch": 0.021811856014987287,
"grad_norm": 0.44645747542381287,
"kl": 0.03292795643210411,
"learning_rate": 1.448888888888889e-05,
"loss": 0.0013,
"reward": 0.3970000147819519,
"reward_std": 0.22381241619586945,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.25,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.1469999998807907,
"step": 163
},
{
"completion_length": 79.75,
"epoch": 0.021945671082563897,
"grad_norm": 0.5969372391700745,
"kl": 0.06764098256826401,
"learning_rate": 1.457777777777778e-05,
"loss": 0.0027,
"reward": 0.1627500057220459,
"reward_std": 0.09626135975122452,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.1627500057220459,
"step": 164
},
{
"completion_length": 245.75,
"epoch": 0.022079486150140507,
"grad_norm": 0.42619219422340393,
"kl": 0.031476035714149475,
"learning_rate": 1.4666666666666666e-05,
"loss": 0.0013,
"reward": 0.5945000052452087,
"reward_std": 0.4530264735221863,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.375,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.21949999034404755,
"step": 165
},
{
"completion_length": 106.5,
"epoch": 0.022213301217717116,
"grad_norm": 0.6185816526412964,
"kl": 0.07715773582458496,
"learning_rate": 1.4755555555555556e-05,
"loss": 0.0031,
"reward": 0.5134999752044678,
"reward_std": 0.33077535033226013,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.375,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.13850000500679016,
"step": 166
},
{
"completion_length": 80.0,
"epoch": 0.022347116285293723,
"grad_norm": 0.5252732038497925,
"kl": 0.09305495023727417,
"learning_rate": 1.4844444444444445e-05,
"loss": 0.0037,
"reward": 0.23899999260902405,
"reward_std": 0.10646440088748932,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.23899999260902405,
"step": 167
},
{
"completion_length": 279.0,
"epoch": 0.022480931352870333,
"grad_norm": 0.260774165391922,
"kl": 0.04496491700410843,
"learning_rate": 1.4933333333333335e-05,
"loss": 0.0018,
"reward": 0.11250001192092896,
"reward_std": 1.9712051153182983,
"rewards/correctness_reward_func": 1.0,
"rewards/int_reward_func": 0.25,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -1.1374999284744263,
"step": 168
},
{
"completion_length": 108.25,
"epoch": 0.022614746420446943,
"grad_norm": 0.4881371259689331,
"kl": 0.05013870447874069,
"learning_rate": 1.5022222222222223e-05,
"loss": 0.002,
"reward": 2.03725004196167,
"reward_std": 1.0539665222167969,
"rewards/correctness_reward_func": 1.5,
"rewards/int_reward_func": 0.5,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.03725000470876694,
"step": 169
},
{
"completion_length": 99.25,
"epoch": 0.022748561488023553,
"grad_norm": 0.5610305070877075,
"kl": 0.043805379420518875,
"learning_rate": 1.5111111111111112e-05,
"loss": 0.0018,
"reward": 1.1447501182556152,
"reward_std": 1.0834035873413086,
"rewards/correctness_reward_func": 0.5,
"rewards/int_reward_func": 0.5,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.14474999904632568,
"step": 170
},
{
"completion_length": 94.75,
"epoch": 0.02288237655560016,
"grad_norm": 0.4598788917064667,
"kl": 0.0532003678381443,
"learning_rate": 1.5200000000000002e-05,
"loss": 0.0021,
"reward": 0.7122499942779541,
"reward_std": 0.19165311753749847,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.5,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.2122499942779541,
"step": 171
},
{
"completion_length": 137.25,
"epoch": 0.02301619162317677,
"grad_norm": 0.34965983033180237,
"kl": 0.03090197592973709,
"learning_rate": 1.528888888888889e-05,
"loss": 0.0012,
"reward": 0.4805000126361847,
"reward_std": 0.415036141872406,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.375,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.10550001263618469,
"step": 172
},
{
"completion_length": 207.0,
"epoch": 0.02315000669075338,
"grad_norm": 0.5801133513450623,
"kl": 0.02816297486424446,
"learning_rate": 1.537777777777778e-05,
"loss": 0.0011,
"reward": 0.07525002956390381,
"reward_std": 0.49743297696113586,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.375,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.2997499704360962,
"step": 173
},
{
"completion_length": 65.75,
"epoch": 0.02328382175832999,
"grad_norm": 0.6282637715339661,
"kl": 0.11807440221309662,
"learning_rate": 1.546666666666667e-05,
"loss": 0.0047,
"reward": 0.6984999775886536,
"reward_std": 0.19891957938671112,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.375,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.32350003719329834,
"step": 174
},
{
"completion_length": 70.5,
"epoch": 0.0234176368259066,
"grad_norm": 0.6953238248825073,
"kl": 0.069422647356987,
"learning_rate": 1.555555555555556e-05,
"loss": 0.0028,
"reward": 0.6110000014305115,
"reward_std": 0.4198293089866638,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.375,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.23600000143051147,
"step": 175
},
{
"completion_length": 86.5,
"epoch": 0.023551451893483205,
"grad_norm": 1.0817539691925049,
"kl": 0.08881973475217819,
"learning_rate": 1.5644444444444448e-05,
"loss": 0.0036,
"reward": 0.6867499947547913,
"reward_std": 0.059885308146476746,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.5,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.18674999475479126,
"step": 176
},
{
"completion_length": 122.5,
"epoch": 0.023685266961059815,
"grad_norm": 0.3636866509914398,
"kl": 0.056608811020851135,
"learning_rate": 1.5733333333333334e-05,
"loss": 0.0023,
"reward": 1.746999979019165,
"reward_std": 1.2025237083435059,
"rewards/correctness_reward_func": 1.0,
"rewards/int_reward_func": 0.5,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.24700000882148743,
"step": 177
},
{
"completion_length": 90.25,
"epoch": 0.023819082028636425,
"grad_norm": 0.7407636642456055,
"kl": 0.07338247448205948,
"learning_rate": 1.5822222222222224e-05,
"loss": 0.0029,
"reward": 0.5740000009536743,
"reward_std": 0.23982912302017212,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.375,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.19900000095367432,
"step": 178
},
{
"completion_length": 70.75,
"epoch": 0.023952897096213035,
"grad_norm": 0.731931746006012,
"kl": 0.08679507672786713,
"learning_rate": 1.5911111111111113e-05,
"loss": 0.0035,
"reward": 1.2517499923706055,
"reward_std": 0.9867871999740601,
"rewards/correctness_reward_func": 0.5,
"rewards/int_reward_func": 0.5,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.25174999237060547,
"step": 179
},
{
"completion_length": 87.75,
"epoch": 0.02408671216378964,
"grad_norm": 0.40641459822654724,
"kl": 0.05429335683584213,
"learning_rate": 1.6000000000000003e-05,
"loss": 0.0022,
"reward": 1.221750020980835,
"reward_std": 1.0200151205062866,
"rewards/correctness_reward_func": 0.5,
"rewards/int_reward_func": 0.5,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.22175000607967377,
"step": 180
},
{
"completion_length": 98.5,
"epoch": 0.02422052723136625,
"grad_norm": 0.3797653615474701,
"kl": 0.13556654751300812,
"learning_rate": 1.608888888888889e-05,
"loss": 0.0054,
"reward": 0.9449999928474426,
"reward_std": 1.1976945400238037,
"rewards/correctness_reward_func": 0.5,
"rewards/int_reward_func": 0.25,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.19500002264976501,
"step": 181
},
{
"completion_length": 93.5,
"epoch": 0.02435434229894286,
"grad_norm": 0.3987956643104553,
"kl": 0.06347194314002991,
"learning_rate": 1.617777777777778e-05,
"loss": 0.0025,
"reward": 0.5089999437332153,
"reward_std": 0.30833208560943604,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.375,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.1339999884366989,
"step": 182
},
{
"completion_length": 167.75,
"epoch": 0.02448815736651947,
"grad_norm": 0.3698064982891083,
"kl": 0.034620750695466995,
"learning_rate": 1.6266666666666668e-05,
"loss": 0.0014,
"reward": 0.3659999966621399,
"reward_std": 0.4141714572906494,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.5,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.1340000033378601,
"step": 183
},
{
"completion_length": 59.5,
"epoch": 0.02462197243409608,
"grad_norm": 1.24988853931427,
"kl": 0.14011864364147186,
"learning_rate": 1.6355555555555557e-05,
"loss": 0.0056,
"reward": 1.0192499160766602,
"reward_std": 1.1953402757644653,
"rewards/correctness_reward_func": 0.5,
"rewards/int_reward_func": 0.25,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.26924997568130493,
"step": 184
},
{
"completion_length": 80.25,
"epoch": 0.024755787501672687,
"grad_norm": 0.5655855536460876,
"kl": 0.05728255584836006,
"learning_rate": 1.6444444444444444e-05,
"loss": 0.0023,
"reward": 0.8680000305175781,
"reward_std": 1.225602626800537,
"rewards/correctness_reward_func": 0.5,
"rewards/int_reward_func": 0.125,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.24300000071525574,
"step": 185
},
{
"completion_length": 86.75,
"epoch": 0.024889602569249297,
"grad_norm": 0.7822229266166687,
"kl": 0.08518431335687637,
"learning_rate": 1.6533333333333333e-05,
"loss": 0.0034,
"reward": 0.5862500071525574,
"reward_std": 0.19938969612121582,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.5,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.08624999970197678,
"step": 186
},
{
"completion_length": 67.75,
"epoch": 0.025023417636825907,
"grad_norm": 0.5667008757591248,
"kl": 0.06875781714916229,
"learning_rate": 1.6622222222222223e-05,
"loss": 0.0028,
"reward": 1.2515000104904175,
"reward_std": 0.9990040063858032,
"rewards/correctness_reward_func": 0.5,
"rewards/int_reward_func": 0.5,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.2515000104904175,
"step": 187
},
{
"completion_length": 93.0,
"epoch": 0.025157232704402517,
"grad_norm": 0.456777960062027,
"kl": 0.06855905055999756,
"learning_rate": 1.6711111111111112e-05,
"loss": 0.0027,
"reward": 0.9950000047683716,
"reward_std": 1.1864756345748901,
"rewards/correctness_reward_func": 0.5,
"rewards/int_reward_func": 0.25,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.24500000476837158,
"step": 188
},
{
"completion_length": 103.5,
"epoch": 0.025291047771979124,
"grad_norm": 0.6969872713088989,
"kl": 0.09152114391326904,
"learning_rate": 1.6800000000000002e-05,
"loss": 0.0037,
"reward": 1.597749948501587,
"reward_std": 1.3118852376937866,
"rewards/correctness_reward_func": 1.0,
"rewards/int_reward_func": 0.5,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.09775000065565109,
"step": 189
},
{
"completion_length": 59.5,
"epoch": 0.025424862839555733,
"grad_norm": 0.9019510746002197,
"kl": 0.2068648785352707,
"learning_rate": 1.688888888888889e-05,
"loss": 0.0083,
"reward": 0.5332499742507935,
"reward_std": 0.2600556015968323,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.25,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.28325000405311584,
"step": 190
},
{
"completion_length": 121.75,
"epoch": 0.025558677907132343,
"grad_norm": 0.4399167001247406,
"kl": 0.09197084605693817,
"learning_rate": 1.697777777777778e-05,
"loss": 0.0037,
"reward": 0.5910000205039978,
"reward_std": 0.3367471992969513,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.5,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.09100000560283661,
"step": 191
},
{
"completion_length": 57.75,
"epoch": 0.025692492974708953,
"grad_norm": 0.47419285774230957,
"kl": 0.12339113652706146,
"learning_rate": 1.706666666666667e-05,
"loss": 0.0049,
"reward": 0.2979999780654907,
"reward_std": 0.032321300357580185,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.2979999780654907,
"step": 192
},
{
"completion_length": 73.0,
"epoch": 0.02582630804228556,
"grad_norm": 0.5090747475624084,
"kl": 0.11672109365463257,
"learning_rate": 1.7155555555555557e-05,
"loss": 0.0047,
"reward": 1.25,
"reward_std": 1.0,
"rewards/correctness_reward_func": 0.5,
"rewards/int_reward_func": 0.5,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.25,
"step": 193
},
{
"completion_length": 78.0,
"epoch": 0.02596012310986217,
"grad_norm": 0.5393621921539307,
"kl": 0.07206133008003235,
"learning_rate": 1.7244444444444446e-05,
"loss": 0.0029,
"reward": 2.25,
"reward_std": 1.0,
"rewards/correctness_reward_func": 1.5,
"rewards/int_reward_func": 0.5,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.25,
"step": 194
},
{
"completion_length": 76.25,
"epoch": 0.02609393817743878,
"grad_norm": 0.542004406452179,
"kl": 0.12011324614286423,
"learning_rate": 1.7333333333333336e-05,
"loss": 0.0048,
"reward": 0.625,
"reward_std": 0.25,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.375,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.25,
"step": 195
},
{
"completion_length": 106.0,
"epoch": 0.02622775324501539,
"grad_norm": 0.43443137407302856,
"kl": 0.053334061056375504,
"learning_rate": 1.7422222222222222e-05,
"loss": 0.0021,
"reward": 2.25,
"reward_std": 1.0,
"rewards/correctness_reward_func": 1.5,
"rewards/int_reward_func": 0.5,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.25,
"step": 196
},
{
"completion_length": 70.25,
"epoch": 0.026361568312592,
"grad_norm": 0.5421243906021118,
"kl": 0.14290496706962585,
"learning_rate": 1.751111111111111e-05,
"loss": 0.0057,
"reward": 1.1717499494552612,
"reward_std": 0.9182985424995422,
"rewards/correctness_reward_func": 0.5,
"rewards/int_reward_func": 0.5,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.1717499941587448,
"step": 197
},
{
"completion_length": 50.0,
"epoch": 0.026495383380168606,
"grad_norm": 0.5998945832252502,
"kl": 0.16523699462413788,
"learning_rate": 1.76e-05,
"loss": 0.0066,
"reward": 1.125,
"reward_std": 1.108677864074707,
"rewards/correctness_reward_func": 0.5,
"rewards/int_reward_func": 0.375,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.25,
"step": 198
},
{
"completion_length": 83.5,
"epoch": 0.026629198447745216,
"grad_norm": 0.08789081871509552,
"kl": 0.14140745997428894,
"learning_rate": 1.768888888888889e-05,
"loss": 0.0057,
"reward": 0.75,
"reward_std": 0.0,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.5,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.25,
"step": 199
},
{
"completion_length": 94.0,
"epoch": 0.026763013515321826,
"grad_norm": 0.8682589530944824,
"kl": 0.24527359008789062,
"learning_rate": 1.7777777777777777e-05,
"loss": 0.0098,
"reward": 0.715499997138977,
"reward_std": 0.15306098759174347,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.5,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.21549999713897705,
"step": 200
},
{
"completion_length": 85.75,
"epoch": 0.026896828582898435,
"grad_norm": 0.5800766944885254,
"kl": 0.1065763458609581,
"learning_rate": 1.7866666666666666e-05,
"loss": 0.0043,
"reward": 0.6587499976158142,
"reward_std": 0.11285203695297241,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.5,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.1587499976158142,
"step": 201
},
{
"completion_length": 72.0,
"epoch": 0.027030643650475042,
"grad_norm": 0.6035862565040588,
"kl": 0.11155687272548676,
"learning_rate": 1.7955555555555556e-05,
"loss": 0.0045,
"reward": 1.316499948501587,
"reward_std": 0.9577993750572205,
"rewards/correctness_reward_func": 0.5,
"rewards/int_reward_func": 0.5,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.3165000081062317,
"step": 202
},
{
"completion_length": 82.75,
"epoch": 0.027164458718051652,
"grad_norm": 0.44579821825027466,
"kl": 0.0726730078458786,
"learning_rate": 1.8044444444444445e-05,
"loss": 0.0029,
"reward": 1.25,
"reward_std": 1.0,
"rewards/correctness_reward_func": 0.5,
"rewards/int_reward_func": 0.5,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.25,
"step": 203
},
{
"completion_length": 69.0,
"epoch": 0.02729827378562826,
"grad_norm": 0.6042316555976868,
"kl": 0.10065613687038422,
"learning_rate": 1.8133333333333335e-05,
"loss": 0.004,
"reward": 0.7465000152587891,
"reward_std": 0.006999989040195942,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.5,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.24650000035762787,
"step": 204
},
{
"completion_length": 53.75,
"epoch": 0.02743208885320487,
"grad_norm": 0.9847255945205688,
"kl": 0.16128680109977722,
"learning_rate": 1.8222222222222224e-05,
"loss": 0.0065,
"reward": 0.2667500078678131,
"reward_std": 0.033500004559755325,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.2667500078678131,
"step": 205
},
{
"completion_length": 59.5,
"epoch": 0.02756590392078148,
"grad_norm": 0.642667829990387,
"kl": 0.1901407241821289,
"learning_rate": 1.8311111111111114e-05,
"loss": 0.0076,
"reward": 0.503000020980835,
"reward_std": 0.29218029975891113,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.25,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.2529999911785126,
"step": 206
},
{
"completion_length": 76.0,
"epoch": 0.027699718988358088,
"grad_norm": 1.0304243564605713,
"kl": 0.36415040493011475,
"learning_rate": 1.8400000000000003e-05,
"loss": 0.0146,
"reward": 2.25,
"reward_std": 1.0,
"rewards/correctness_reward_func": 1.5,
"rewards/int_reward_func": 0.5,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.25,
"step": 207
},
{
"completion_length": 67.0,
"epoch": 0.027833534055934698,
"grad_norm": 0.7824116945266724,
"kl": 0.35757210850715637,
"learning_rate": 1.848888888888889e-05,
"loss": 0.0143,
"reward": 1.444000005722046,
"reward_std": 1.5823231935501099,
"rewards/correctness_reward_func": 1.0,
"rewards/int_reward_func": 0.25,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.1940000057220459,
"step": 208
},
{
"completion_length": 66.0,
"epoch": 0.027967349123511308,
"grad_norm": 0.6381920576095581,
"kl": 0.16021761298179626,
"learning_rate": 1.857777777777778e-05,
"loss": 0.0064,
"reward": 0.7674999833106995,
"reward_std": 0.03348132595419884,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.5,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.26749998331069946,
"step": 209
},
{
"completion_length": 51.5,
"epoch": 0.028101164191087918,
"grad_norm": 0.029546145349740982,
"kl": 0.1807194948196411,
"learning_rate": 1.866666666666667e-05,
"loss": 0.0072,
"reward": 0.75,
"reward_std": 0.0,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.5,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.25,
"step": 210
},
{
"completion_length": 124.25,
"epoch": 0.028234979258664524,
"grad_norm": 1.192435622215271,
"kl": 0.16933509707450867,
"learning_rate": 1.8755555555555558e-05,
"loss": 0.0068,
"reward": 0.6417499780654907,
"reward_std": 0.12795408070087433,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.5,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.1417500078678131,
"step": 211
},
{
"completion_length": 61.75,
"epoch": 0.028368794326241134,
"grad_norm": 0.4982728660106659,
"kl": 0.10138452053070068,
"learning_rate": 1.8844444444444444e-05,
"loss": 0.0041,
"reward": 0.746999979019165,
"reward_std": 0.06240728497505188,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.5,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.24700000882148743,
"step": 212
},
{
"completion_length": 80.25,
"epoch": 0.028502609393817744,
"grad_norm": 0.4456157386302948,
"kl": 0.10433340817689896,
"learning_rate": 1.8933333333333334e-05,
"loss": 0.0042,
"reward": 1.25,
"reward_std": 1.0,
"rewards/correctness_reward_func": 0.5,
"rewards/int_reward_func": 0.5,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.25,
"step": 213
},
{
"completion_length": 146.25,
"epoch": 0.028636424461394354,
"grad_norm": 0.24999132752418518,
"kl": 0.06005653738975525,
"learning_rate": 1.9022222222222223e-05,
"loss": 0.0024,
"reward": 0.574999988079071,
"reward_std": 0.22246196866035461,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.375,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.20000000298023224,
"step": 214
},
{
"completion_length": 66.25,
"epoch": 0.028770239528970964,
"grad_norm": 0.7781875133514404,
"kl": 0.16475608944892883,
"learning_rate": 1.9111111111111113e-05,
"loss": 0.0066,
"reward": 0.746999979019165,
"reward_std": 0.10004331171512604,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.5,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.24700000882148743,
"step": 215
},
{
"completion_length": 39.75,
"epoch": 0.02890405459654757,
"grad_norm": 0.7784989476203918,
"kl": 0.14634883403778076,
"learning_rate": 1.9200000000000003e-05,
"loss": 0.0059,
"reward": 1.7787500619888306,
"reward_std": 1.170571208000183,
"rewards/correctness_reward_func": 1.0,
"rewards/int_reward_func": 0.5,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.2787500023841858,
"step": 216
},
{
"completion_length": 72.25,
"epoch": 0.02903786966412418,
"grad_norm": 0.4837649464607239,
"kl": 0.09058363735675812,
"learning_rate": 1.928888888888889e-05,
"loss": 0.0036,
"reward": 0.27175000309944153,
"reward_std": 0.3053712248802185,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.125,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.14675000309944153,
"step": 217
},
{
"completion_length": 80.25,
"epoch": 0.02917168473170079,
"grad_norm": 0.9711887836456299,
"kl": 0.07635970413684845,
"learning_rate": 1.9377777777777778e-05,
"loss": 0.0031,
"reward": 1.7442500591278076,
"reward_std": 1.0721337795257568,
"rewards/correctness_reward_func": 1.0,
"rewards/int_reward_func": 0.5,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.24424999952316284,
"step": 218
},
{
"completion_length": 70.5,
"epoch": 0.0293054997992774,
"grad_norm": 0.42600736021995544,
"kl": 0.06140553951263428,
"learning_rate": 1.9466666666666668e-05,
"loss": 0.0025,
"reward": 0.7387499809265137,
"reward_std": 0.05771408975124359,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.5,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.23874998092651367,
"step": 219
},
{
"completion_length": 217.5,
"epoch": 0.029439314866854006,
"grad_norm": 0.5537015199661255,
"kl": 0.05942856892943382,
"learning_rate": 1.9555555555555557e-05,
"loss": 0.0024,
"reward": 0.3062500059604645,
"reward_std": 0.3994557559490204,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.5,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.19374999403953552,
"step": 220
},
{
"completion_length": 135.75,
"epoch": 0.029573129934430616,
"grad_norm": 0.29336312413215637,
"kl": 0.052648480981588364,
"learning_rate": 1.9644444444444447e-05,
"loss": 0.0021,
"reward": 0.5582500100135803,
"reward_std": 0.29113730788230896,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.375,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.18325001001358032,
"step": 221
},
{
"completion_length": 62.0,
"epoch": 0.029706945002007226,
"grad_norm": 0.6704282760620117,
"kl": 0.09709338843822479,
"learning_rate": 1.9733333333333336e-05,
"loss": 0.0039,
"reward": 0.7619999647140503,
"reward_std": 0.038531359285116196,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.5,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.2619999945163727,
"step": 222
},
{
"completion_length": 116.5,
"epoch": 0.029840760069583836,
"grad_norm": 0.2809985876083374,
"kl": 0.11135183274745941,
"learning_rate": 1.9822222222222226e-05,
"loss": 0.0045,
"reward": 0.5497499704360962,
"reward_std": 0.36881014704704285,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.375,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.17475000023841858,
"step": 223
},
{
"completion_length": 60.5,
"epoch": 0.029974575137160443,
"grad_norm": 0.6438567638397217,
"kl": 0.133337140083313,
"learning_rate": 1.9911111111111112e-05,
"loss": 0.0053,
"reward": 1.3017499446868896,
"reward_std": 0.9685468077659607,
"rewards/correctness_reward_func": 0.5,
"rewards/int_reward_func": 0.5,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.3017500042915344,
"step": 224
},
{
"completion_length": 74.0,
"epoch": 0.030108390204737052,
"grad_norm": 0.4634908139705658,
"kl": 0.08658132702112198,
"learning_rate": 2e-05,
"loss": 0.0035,
"reward": 1.0735000371932983,
"reward_std": 1.0860450267791748,
"rewards/correctness_reward_func": 0.5,
"rewards/int_reward_func": 0.375,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.19850000739097595,
"step": 225
},
{
"completion_length": 61.5,
"epoch": 0.030242205272313662,
"grad_norm": 0.6003581285476685,
"kl": 0.12208070605993271,
"learning_rate": 1.9999999060637166e-05,
"loss": 0.0049,
"reward": 0.7789999842643738,
"reward_std": 0.06427544355392456,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.5,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.2789999842643738,
"step": 226
},
{
"completion_length": 73.5,
"epoch": 0.030376020339890272,
"grad_norm": 0.5231612920761108,
"kl": 0.15073804557323456,
"learning_rate": 1.9999996242548837e-05,
"loss": 0.006,
"reward": 1.16225004196167,
"reward_std": 1.0848207473754883,
"rewards/correctness_reward_func": 0.5,
"rewards/int_reward_func": 0.375,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.28724998235702515,
"step": 227
},
{
"completion_length": 72.5,
"epoch": 0.030509835407466882,
"grad_norm": 0.4123309552669525,
"kl": 0.07140478491783142,
"learning_rate": 1.999999154573555e-05,
"loss": 0.0029,
"reward": 0.7667500376701355,
"reward_std": 0.04211394116282463,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.5,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.2667500078678131,
"step": 228
},
{
"completion_length": 88.5,
"epoch": 0.03064365047504349,
"grad_norm": 0.5252578854560852,
"kl": 0.03713103383779526,
"learning_rate": 1.9999984970198176e-05,
"loss": 0.0015,
"reward": 1.5755000114440918,
"reward_std": 1.2587233781814575,
"rewards/correctness_reward_func": 1.0,
"rewards/int_reward_func": 0.375,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.2004999965429306,
"step": 229
},
{
"completion_length": 145.0,
"epoch": 0.0307774655426201,
"grad_norm": 0.4767831563949585,
"kl": 0.07187433540821075,
"learning_rate": 1.999997651593796e-05,
"loss": 0.0029,
"reward": 0.4542499780654907,
"reward_std": 0.4314698874950409,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.5,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.045750007033348083,
"step": 230
},
{
"completion_length": 102.75,
"epoch": 0.03091128061019671,
"grad_norm": 0.4800109267234802,
"kl": 0.07859396934509277,
"learning_rate": 1.9999966182956486e-05,
"loss": 0.0031,
"reward": 0.6054999828338623,
"reward_std": 0.21153488755226135,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.5,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.1054999977350235,
"step": 231
},
{
"completion_length": 190.25,
"epoch": 0.03104509567777332,
"grad_norm": 0.20692415535449982,
"kl": 0.04673830792307854,
"learning_rate": 1.9999953971255692e-05,
"loss": 0.0019,
"reward": 0.15949998795986176,
"reward_std": 0.31504231691360474,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.125,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.03450000286102295,
"step": 232
},
{
"completion_length": 68.0,
"epoch": 0.031178910745349925,
"grad_norm": 0.5570704936981201,
"kl": 0.08012831211090088,
"learning_rate": 1.999993988083788e-05,
"loss": 0.0032,
"reward": 2.252500057220459,
"reward_std": 0.95698082447052,
"rewards/correctness_reward_func": 1.5,
"rewards/int_reward_func": 0.5,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.2524999976158142,
"step": 233
},
{
"completion_length": 131.5,
"epoch": 0.03131272581292654,
"grad_norm": 0.4668695628643036,
"kl": 0.0665660873055458,
"learning_rate": 1.9999923911705693e-05,
"loss": 0.0027,
"reward": 1.5532499551773071,
"reward_std": 1.3141237497329712,
"rewards/correctness_reward_func": 1.0,
"rewards/int_reward_func": 0.375,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.17824998497962952,
"step": 234
},
{
"completion_length": 73.25,
"epoch": 0.031446540880503145,
"grad_norm": 0.5236088633537292,
"kl": 0.14625723659992218,
"learning_rate": 1.9999906063862128e-05,
"loss": 0.0059,
"reward": 1.7309999465942383,
"reward_std": 1.0541173219680786,
"rewards/correctness_reward_func": 1.0,
"rewards/int_reward_func": 0.5,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.23100000619888306,
"step": 235
},
{
"completion_length": 114.25,
"epoch": 0.03158035594807975,
"grad_norm": 0.45700567960739136,
"kl": 0.08767452090978622,
"learning_rate": 1.9999886337310546e-05,
"loss": 0.0035,
"reward": 0.4410000145435333,
"reward_std": 0.4759082794189453,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.375,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.06599999964237213,
"step": 236
},
{
"completion_length": 208.0,
"epoch": 0.031714171015656364,
"grad_norm": 0.45323336124420166,
"kl": 0.0659441202878952,
"learning_rate": 1.999986473205465e-05,
"loss": 0.0026,
"reward": 0.2602500021457672,
"reward_std": 0.6481220722198486,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.5,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.2397499978542328,
"step": 237
},
{
"completion_length": 140.25,
"epoch": 0.03184798608323297,
"grad_norm": 0.4077318608760834,
"kl": 0.05185340344905853,
"learning_rate": 1.999984124809849e-05,
"loss": 0.0021,
"reward": 1.4795000553131104,
"reward_std": 1.379389762878418,
"rewards/correctness_reward_func": 1.0,
"rewards/int_reward_func": 0.375,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.10450000315904617,
"step": 238
},
{
"completion_length": 70.0,
"epoch": 0.031981801150809584,
"grad_norm": 0.926057755947113,
"kl": 0.11616024374961853,
"learning_rate": 1.9999815885446497e-05,
"loss": 0.0046,
"reward": 0.9787499904632568,
"reward_std": 0.3524896502494812,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.5,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.125,
"rewards/xmlcount_reward_func": 0.35374999046325684,
"step": 239
},
{
"completion_length": 293.5,
"epoch": 0.03211561621838619,
"grad_norm": 0.394625723361969,
"kl": 0.017029505223035812,
"learning_rate": 1.9999788644103418e-05,
"loss": 0.0007,
"reward": -0.1302500218153,
"reward_std": 0.6652750968933105,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.5,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.6302499771118164,
"step": 240
},
{
"completion_length": 135.0,
"epoch": 0.0322494312859628,
"grad_norm": 0.2847868502140045,
"kl": 0.09572663903236389,
"learning_rate": 1.9999759524074374e-05,
"loss": 0.0038,
"reward": 0.7019999623298645,
"reward_std": 0.22655829787254333,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.5,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.2019999921321869,
"step": 241
},
{
"completion_length": 112.25,
"epoch": 0.03238324635353941,
"grad_norm": 0.5767569541931152,
"kl": 0.12632526457309723,
"learning_rate": 1.9999728525364848e-05,
"loss": 0.0051,
"reward": 0.6430000066757202,
"reward_std": 0.35005998611450195,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.5,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.14300000667572021,
"step": 242
},
{
"completion_length": 126.5,
"epoch": 0.03251706142111602,
"grad_norm": 0.3967666029930115,
"kl": 0.11589077860116959,
"learning_rate": 1.999969564798065e-05,
"loss": 0.0046,
"reward": 1.4282499551773071,
"reward_std": 0.553694486618042,
"rewards/correctness_reward_func": 0.5,
"rewards/int_reward_func": 0.5,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.25,
"rewards/xmlcount_reward_func": 0.1782499998807907,
"step": 243
},
{
"completion_length": 134.0,
"epoch": 0.03265087648869262,
"grad_norm": 0.38813862204551697,
"kl": 0.06340669095516205,
"learning_rate": 1.999966089192796e-05,
"loss": 0.0025,
"reward": 1.2892500162124634,
"reward_std": 1.5317614078521729,
"rewards/correctness_reward_func": 0.5,
"rewards/int_reward_func": 0.375,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.125,
"rewards/xmlcount_reward_func": 0.2892500162124634,
"step": 244
},
{
"completion_length": 109.75,
"epoch": 0.03278469155626924,
"grad_norm": 0.42066898941993713,
"kl": 0.12392938882112503,
"learning_rate": 1.9999624257213318e-05,
"loss": 0.005,
"reward": 1.7067500352859497,
"reward_std": 1.3346679210662842,
"rewards/correctness_reward_func": 1.0,
"rewards/int_reward_func": 0.5,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.20675000548362732,
"step": 245
},
{
"completion_length": 102.0,
"epoch": 0.03291850662384584,
"grad_norm": 0.704933762550354,
"kl": 0.12700574100017548,
"learning_rate": 1.9999585743843592e-05,
"loss": 0.0051,
"reward": 0.6620000004768372,
"reward_std": 0.6284016370773315,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.375,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.125,
"rewards/xmlcount_reward_func": 0.16200000047683716,
"step": 246
},
{
"completion_length": 65.75,
"epoch": 0.03305232169142246,
"grad_norm": 0.557697057723999,
"kl": 0.16324672102928162,
"learning_rate": 1.9999545351826028e-05,
"loss": 0.0065,
"reward": 1.4605000019073486,
"reward_std": 1.3599356412887573,
"rewards/correctness_reward_func": 0.5,
"rewards/int_reward_func": 0.5,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.125,
"rewards/xmlcount_reward_func": 0.335500031709671,
"step": 247
},
{
"completion_length": 50.25,
"epoch": 0.03318613675899906,
"grad_norm": 0.8064447641372681,
"kl": 0.2082303762435913,
"learning_rate": 1.9999503081168205e-05,
"loss": 0.0083,
"reward": 0.7957500219345093,
"reward_std": 0.09577882289886475,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.5,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.2957499921321869,
"step": 248
},
{
"completion_length": 92.0,
"epoch": 0.03331995182657567,
"grad_norm": 0.8607310652732849,
"kl": 0.11370626091957092,
"learning_rate": 1.999945893187807e-05,
"loss": 0.0045,
"reward": 1.3242499828338623,
"reward_std": 0.351500004529953,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.5,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.375,
"rewards/xmlcount_reward_func": 0.4492499828338623,
"step": 249
},
{
"completion_length": 120.5,
"epoch": 0.03345376689415228,
"grad_norm": 0.46249887347221375,
"kl": 0.09916168451309204,
"learning_rate": 1.9999412903963925e-05,
"loss": 0.004,
"reward": 0.6180000305175781,
"reward_std": 0.44016435742378235,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.375,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.24300000071525574,
"step": 250
},
{
"completion_length": 121.75,
"epoch": 0.03358758196172889,
"grad_norm": 0.7673288583755493,
"kl": 0.14884337782859802,
"learning_rate": 1.9999364997434406e-05,
"loss": 0.006,
"reward": 0.843999981880188,
"reward_std": 1.0066499710083008,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.5,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.25,
"rewards/xmlcount_reward_func": 0.09400001168251038,
"step": 251
},
{
"completion_length": 67.5,
"epoch": 0.0337213970293055,
"grad_norm": 0.6919443607330322,
"kl": 0.20414642989635468,
"learning_rate": 1.9999315212298516e-05,
"loss": 0.0082,
"reward": 2.0,
"reward_std": 1.0,
"rewards/correctness_reward_func": 0.5,
"rewards/int_reward_func": 0.5,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.5,
"rewards/xmlcount_reward_func": 0.5,
"step": 252
},
{
"completion_length": 105.0,
"epoch": 0.03385521209688211,
"grad_norm": 0.4287322461605072,
"kl": 0.16063739359378815,
"learning_rate": 1.999926354856561e-05,
"loss": 0.0064,
"reward": 1.5625,
"reward_std": 0.9213893413543701,
"rewards/correctness_reward_func": 0.5,
"rewards/int_reward_func": 0.5,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.25,
"rewards/xmlcount_reward_func": 0.3125,
"step": 253
},
{
"completion_length": 109.75,
"epoch": 0.033989027164458716,
"grad_norm": 0.5036614537239075,
"kl": 0.10087580978870392,
"learning_rate": 1.9999210006245395e-05,
"loss": 0.004,
"reward": 1.224250078201294,
"reward_std": 0.850592851638794,
"rewards/correctness_reward_func": 0.5,
"rewards/int_reward_func": 0.5,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.22424998879432678,
"step": 254
},
{
"completion_length": 102.5,
"epoch": 0.03412284223203533,
"grad_norm": 0.4636721909046173,
"kl": 0.10581967979669571,
"learning_rate": 1.9999154585347926e-05,
"loss": 0.0042,
"reward": 1.4709999561309814,
"reward_std": 1.0897164344787598,
"rewards/correctness_reward_func": 0.5,
"rewards/int_reward_func": 0.5,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.125,
"rewards/xmlcount_reward_func": 0.3460000157356262,
"step": 255
},
{
"completion_length": 72.25,
"epoch": 0.034256657299611935,
"grad_norm": 0.03021317906677723,
"kl": 0.1458757519721985,
"learning_rate": 1.999909728588362e-05,
"loss": 0.0058,
"reward": 1.5,
"reward_std": 0.0,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.5,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.5,
"rewards/xmlcount_reward_func": 0.5,
"step": 256
},
{
"completion_length": 117.25,
"epoch": 0.03439047236718855,
"grad_norm": 0.4804219901561737,
"kl": 0.1489054560661316,
"learning_rate": 1.999903810786324e-05,
"loss": 0.006,
"reward": 1.563499927520752,
"reward_std": 0.5171675682067871,
"rewards/correctness_reward_func": 0.5,
"rewards/int_reward_func": 0.5,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.25,
"rewards/xmlcount_reward_func": 0.31349998712539673,
"step": 257
},
{
"completion_length": 132.0,
"epoch": 0.034524287434765155,
"grad_norm": 0.31325674057006836,
"kl": 0.08090350031852722,
"learning_rate": 1.99989770512979e-05,
"loss": 0.0032,
"reward": 1.2827500104904175,
"reward_std": 1.164240837097168,
"rewards/correctness_reward_func": 0.5,
"rewards/int_reward_func": 0.5,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.2827500104904175,
"step": 258
},
{
"completion_length": 90.75,
"epoch": 0.03465810250234176,
"grad_norm": 0.40309247374534607,
"kl": 0.1473146677017212,
"learning_rate": 1.999891411619908e-05,
"loss": 0.0059,
"reward": 3.0,
"reward_std": 1.0,
"rewards/correctness_reward_func": 1.5,
"rewards/int_reward_func": 0.5,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.5,
"rewards/xmlcount_reward_func": 0.5,
"step": 259
},
{
"completion_length": 194.75,
"epoch": 0.034791917569918375,
"grad_norm": 0.30801522731781006,
"kl": 0.05805174261331558,
"learning_rate": 1.9998849302578597e-05,
"loss": 0.0023,
"reward": 1.8217499256134033,
"reward_std": 1.0732003450393677,
"rewards/correctness_reward_func": 1.0,
"rewards/int_reward_func": 0.5,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.125,
"rewards/xmlcount_reward_func": 0.1967500001192093,
"step": 260
},
{
"completion_length": 84.25,
"epoch": 0.03492573263749498,
"grad_norm": 0.7973276376724243,
"kl": 0.12621484696865082,
"learning_rate": 1.9998782610448625e-05,
"loss": 0.005,
"reward": 2.209249973297119,
"reward_std": 0.821670413017273,
"rewards/correctness_reward_func": 1.0,
"rewards/int_reward_func": 0.5,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.25,
"rewards/xmlcount_reward_func": 0.45925000309944153,
"step": 261
},
{
"completion_length": 182.25,
"epoch": 0.03505954770507159,
"grad_norm": 0.40601614117622375,
"kl": 0.1828838288784027,
"learning_rate": 1.9998714039821703e-05,
"loss": 0.0073,
"reward": 1.312999963760376,
"reward_std": 1.4628314971923828,
"rewards/correctness_reward_func": 0.5,
"rewards/int_reward_func": 0.5,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.125,
"rewards/xmlcount_reward_func": 0.18799999356269836,
"step": 262
},
{
"completion_length": 125.25,
"epoch": 0.0351933627726482,
"grad_norm": 0.4140985608100891,
"kl": 0.0889941155910492,
"learning_rate": 1.9998643590710707e-05,
"loss": 0.0036,
"reward": 3.125,
"reward_std": 0.4787135720252991,
"rewards/correctness_reward_func": 2.0,
"rewards/int_reward_func": 0.5,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.25,
"rewards/xmlcount_reward_func": 0.375,
"step": 263
},
{
"completion_length": 295.75,
"epoch": 0.03532717784022481,
"grad_norm": 0.24423445761203766,
"kl": 0.05636545270681381,
"learning_rate": 1.9998571263128873e-05,
"loss": 0.0023,
"reward": 0.24674999713897705,
"reward_std": 0.20656456053256989,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.375,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.12825000286102295,
"step": 264
},
{
"completion_length": 177.75,
"epoch": 0.03546099290780142,
"grad_norm": 0.4541739821434021,
"kl": 0.09995388984680176,
"learning_rate": 1.999849705708979e-05,
"loss": 0.004,
"reward": 0.7145000100135803,
"reward_std": 0.2543770372867584,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.5,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.21450001001358032,
"step": 265
},
{
"completion_length": 250.0,
"epoch": 0.03559480797537803,
"grad_norm": 0.2651154398918152,
"kl": 0.0614202618598938,
"learning_rate": 1.99984209726074e-05,
"loss": 0.0025,
"reward": 0.718500018119812,
"reward_std": 0.25749239325523376,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.5,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.21850000321865082,
"step": 266
},
{
"completion_length": 151.75,
"epoch": 0.035728623042954634,
"grad_norm": 0.34357938170433044,
"kl": 0.11051115393638611,
"learning_rate": 1.9998343009695995e-05,
"loss": 0.0044,
"reward": 1.125,
"reward_std": 0.9464846849441528,
"rewards/correctness_reward_func": 0.5,
"rewards/int_reward_func": 0.5,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.125,
"step": 267
},
{
"completion_length": 178.75,
"epoch": 0.03586243811053125,
"grad_norm": 0.6356491446495056,
"kl": 0.16408377885818481,
"learning_rate": 1.9998263168370228e-05,
"loss": 0.0066,
"reward": 0.4662500023841858,
"reward_std": 0.26134318113327026,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.5,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.03375000134110451,
"step": 268
},
{
"completion_length": 114.75,
"epoch": 0.035996253178107854,
"grad_norm": 0.5377089977264404,
"kl": 0.16050302982330322,
"learning_rate": 1.9998181448645087e-05,
"loss": 0.0064,
"reward": 2.5269999504089355,
"reward_std": 1.4152441024780273,
"rewards/correctness_reward_func": 1.5,
"rewards/int_reward_func": 0.5,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.25,
"rewards/xmlcount_reward_func": 0.2770000100135803,
"step": 269
},
{
"completion_length": 123.0,
"epoch": 0.03613006824568447,
"grad_norm": 0.3413686752319336,
"kl": 0.0774858370423317,
"learning_rate": 1.999809785053594e-05,
"loss": 0.0031,
"reward": 2.625,
"reward_std": 0.8539125919342041,
"rewards/correctness_reward_func": 1.5,
"rewards/int_reward_func": 0.5,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.25,
"rewards/xmlcount_reward_func": 0.375,
"step": 270
},
{
"completion_length": 78.0,
"epoch": 0.036263883313261074,
"grad_norm": 0.6230402588844299,
"kl": 0.14687515795230865,
"learning_rate": 1.999801237405848e-05,
"loss": 0.0059,
"reward": 3.0,
"reward_std": 1.0,
"rewards/correctness_reward_func": 1.5,
"rewards/int_reward_func": 0.5,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.5,
"rewards/xmlcount_reward_func": 0.5,
"step": 271
},
{
"completion_length": 240.5,
"epoch": 0.03639769838083768,
"grad_norm": 0.36297154426574707,
"kl": 0.09850015491247177,
"learning_rate": 1.9997925019228775e-05,
"loss": 0.0039,
"reward": 0.5189999938011169,
"reward_std": 0.9096706509590149,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.375,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.125,
"rewards/xmlcount_reward_func": 0.018999993801116943,
"step": 272
},
{
"completion_length": 167.75,
"epoch": 0.03653151344841429,
"grad_norm": 0.2768608629703522,
"kl": 0.06528370082378387,
"learning_rate": 1.999783578606323e-05,
"loss": 0.0026,
"reward": 1.0750000476837158,
"reward_std": 1.1177325248718262,
"rewards/correctness_reward_func": 0.5,
"rewards/int_reward_func": 0.5,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.07500000298023224,
"step": 273
},
{
"completion_length": 205.25,
"epoch": 0.0366653285159909,
"grad_norm": 0.24607378244400024,
"kl": 0.0492008738219738,
"learning_rate": 1.9997744674578615e-05,
"loss": 0.002,
"reward": 1.375,
"reward_std": 1.25,
"rewards/correctness_reward_func": 0.5,
"rewards/int_reward_func": 0.375,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.125,
"rewards/xmlcount_reward_func": 0.375,
"step": 274
},
{
"completion_length": 123.75,
"epoch": 0.036799143583567506,
"grad_norm": 0.3026469647884369,
"kl": 0.11610330641269684,
"learning_rate": 1.9997651684792042e-05,
"loss": 0.0046,
"reward": 2.647249937057495,
"reward_std": 0.850723385810852,
"rewards/correctness_reward_func": 1.5,
"rewards/int_reward_func": 0.5,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.25,
"rewards/xmlcount_reward_func": 0.3972499966621399,
"step": 275
},
{
"completion_length": 191.75,
"epoch": 0.03693295865114412,
"grad_norm": 0.2089061737060547,
"kl": 0.05618685111403465,
"learning_rate": 1.9997556816720985e-05,
"loss": 0.0022,
"reward": 1.6702499389648438,
"reward_std": 1.4511858224868774,
"rewards/correctness_reward_func": 1.0,
"rewards/int_reward_func": 0.5,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.125,
"rewards/xmlcount_reward_func": 0.045249998569488525,
"step": 276
},
{
"completion_length": 207.25,
"epoch": 0.037066773718720726,
"grad_norm": 0.25519034266471863,
"kl": 0.030937891453504562,
"learning_rate": 1.9997460070383264e-05,
"loss": 0.0012,
"reward": 0.9837499856948853,
"reward_std": 0.9404298067092896,
"rewards/correctness_reward_func": 0.5,
"rewards/int_reward_func": 0.5,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.01625000685453415,
"step": 277
},
{
"completion_length": 110.5,
"epoch": 0.03720058878629734,
"grad_norm": 0.2822895050048828,
"kl": 0.06590020656585693,
"learning_rate": 1.9997361445797058e-05,
"loss": 0.0026,
"reward": 2.0512499809265137,
"reward_std": 1.4043407440185547,
"rewards/correctness_reward_func": 1.0,
"rewards/int_reward_func": 0.5,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.125,
"rewards/xmlcount_reward_func": 0.42625001072883606,
"step": 278
},
{
"completion_length": 195.0,
"epoch": 0.037334403853873946,
"grad_norm": 0.3637947738170624,
"kl": 0.08848227560520172,
"learning_rate": 1.9997260942980895e-05,
"loss": 0.0035,
"reward": 1.3247499465942383,
"reward_std": 0.9170319437980652,
"rewards/correctness_reward_func": 1.0,
"rewards/int_reward_func": 0.375,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.125,
"rewards/xmlcount_reward_func": -0.17524999380111694,
"step": 279
},
{
"completion_length": 155.75,
"epoch": 0.03746821892145055,
"grad_norm": 0.3750488758087158,
"kl": 0.0733165591955185,
"learning_rate": 1.9997158561953655e-05,
"loss": 0.0029,
"reward": 2.0625,
"reward_std": 1.328768253326416,
"rewards/correctness_reward_func": 1.0,
"rewards/int_reward_func": 0.5,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.25,
"rewards/xmlcount_reward_func": 0.3125,
"step": 280
},
{
"completion_length": 173.75,
"epoch": 0.037602033989027166,
"grad_norm": 0.2433202862739563,
"kl": 0.07221105694770813,
"learning_rate": 1.9997054302734576e-05,
"loss": 0.0029,
"reward": 3.1524999141693115,
"reward_std": 0.4316924810409546,
"rewards/correctness_reward_func": 2.0,
"rewards/int_reward_func": 0.5,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.25,
"rewards/xmlcount_reward_func": 0.4025000035762787,
"step": 281
},
{
"completion_length": 173.25,
"epoch": 0.03773584905660377,
"grad_norm": 0.2556951940059662,
"kl": 0.043353110551834106,
"learning_rate": 1.9996948165343243e-05,
"loss": 0.0017,
"reward": 0.7929999828338623,
"reward_std": 0.29227155447006226,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.5,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.2929999828338623,
"step": 282
},
{
"completion_length": 127.0,
"epoch": 0.037869664124180386,
"grad_norm": 0.2885083556175232,
"kl": 0.06136604771018028,
"learning_rate": 1.9996840149799594e-05,
"loss": 0.0025,
"reward": 1.2899999618530273,
"reward_std": 0.9839661717414856,
"rewards/correctness_reward_func": 0.5,
"rewards/int_reward_func": 0.5,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.2900000214576721,
"step": 283
},
{
"completion_length": 119.75,
"epoch": 0.03800347919175699,
"grad_norm": 0.4060731828212738,
"kl": 0.0710485652089119,
"learning_rate": 1.9996730256123925e-05,
"loss": 0.0028,
"reward": 1.2477500438690186,
"reward_std": 0.7362886667251587,
"rewards/correctness_reward_func": 0.5,
"rewards/int_reward_func": 0.5,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.24774999916553497,
"step": 284
},
{
"completion_length": 117.25,
"epoch": 0.0381372942593336,
"grad_norm": 0.32145437598228455,
"kl": 0.06920292973518372,
"learning_rate": 1.9996618484336885e-05,
"loss": 0.0028,
"reward": 1.3524999618530273,
"reward_std": 0.9444156885147095,
"rewards/correctness_reward_func": 0.5,
"rewards/int_reward_func": 0.5,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.3525000214576721,
"step": 285
},
{
"completion_length": 145.0,
"epoch": 0.03827110932691021,
"grad_norm": 0.29436415433883667,
"kl": 0.06218549609184265,
"learning_rate": 1.9996504834459467e-05,
"loss": 0.0025,
"reward": 1.503749966621399,
"reward_std": 1.1761995553970337,
"rewards/correctness_reward_func": 1.0,
"rewards/int_reward_func": 0.25,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.125,
"rewards/xmlcount_reward_func": 0.1287499964237213,
"step": 286
},
{
"completion_length": 87.0,
"epoch": 0.03840492439448682,
"grad_norm": 0.520235538482666,
"kl": 0.13112740218639374,
"learning_rate": 1.9996389306513028e-05,
"loss": 0.0052,
"reward": 2.296999931335449,
"reward_std": 0.8679065704345703,
"rewards/correctness_reward_func": 1.5,
"rewards/int_reward_func": 0.375,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.125,
"rewards/xmlcount_reward_func": 0.296999990940094,
"step": 287
},
{
"completion_length": 200.25,
"epoch": 0.03853873946206343,
"grad_norm": 0.21292062103748322,
"kl": 0.03701108694076538,
"learning_rate": 1.9996271900519267e-05,
"loss": 0.0015,
"reward": 0.9787499904632568,
"reward_std": 0.8390643000602722,
"rewards/correctness_reward_func": 0.5,
"rewards/int_reward_func": 0.5,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.02124999463558197,
"step": 288
},
{
"completion_length": 197.0,
"epoch": 0.03867255452964004,
"grad_norm": 0.3041883707046509,
"kl": 0.04169435426592827,
"learning_rate": 1.9996152616500244e-05,
"loss": 0.0017,
"reward": 1.4789999723434448,
"reward_std": 1.2745074033737183,
"rewards/correctness_reward_func": 1.0,
"rewards/int_reward_func": 0.5,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.021000005304813385,
"step": 289
},
{
"completion_length": 116.5,
"epoch": 0.038806369597216644,
"grad_norm": 0.5523695349693298,
"kl": 0.04961012303829193,
"learning_rate": 1.999603145447837e-05,
"loss": 0.002,
"reward": 2.204249858856201,
"reward_std": 0.949548065662384,
"rewards/correctness_reward_func": 1.5,
"rewards/int_reward_func": 0.5,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.2042500078678131,
"step": 290
},
{
"completion_length": 162.25,
"epoch": 0.03894018466479326,
"grad_norm": 0.3126446008682251,
"kl": 0.08022642135620117,
"learning_rate": 1.999590841447641e-05,
"loss": 0.0032,
"reward": 1.3469998836517334,
"reward_std": 1.3204811811447144,
"rewards/correctness_reward_func": 1.0,
"rewards/int_reward_func": 0.375,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.02800000086426735,
"step": 291
},
{
"completion_length": 131.75,
"epoch": 0.039073999732369864,
"grad_norm": 0.20107072591781616,
"kl": 0.0379381999373436,
"learning_rate": 1.9995783496517476e-05,
"loss": 0.0015,
"reward": 0.2239999920129776,
"reward_std": 0.13086380064487457,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.2239999920129776,
"step": 292
},
{
"completion_length": 189.5,
"epoch": 0.03920781479994647,
"grad_norm": 0.20484226942062378,
"kl": 0.03132232278585434,
"learning_rate": 1.999565670062504e-05,
"loss": 0.0013,
"reward": 0.7402499914169312,
"reward_std": 0.5397964715957642,
"rewards/correctness_reward_func": 0.0,
"rewards/int_reward_func": 0.5,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.125,
"rewards/xmlcount_reward_func": 0.11524999141693115,
"step": 293
},
{
"completion_length": 205.75,
"epoch": 0.039341629867523084,
"grad_norm": 0.24561399221420288,
"kl": 0.0185256190598011,
"learning_rate": 1.9995528026822916e-05,
"loss": 0.0007,
"reward": 1.4252500534057617,
"reward_std": 1.6236603260040283,
"rewards/correctness_reward_func": 1.0,
"rewards/int_reward_func": 0.5,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.07475000619888306,
"step": 294
},
{
"completion_length": 201.25,
"epoch": 0.03947544493509969,
"grad_norm": 0.3286190629005432,
"kl": 0.031586844474077225,
"learning_rate": 1.999539747513529e-05,
"loss": 0.0013,
"reward": 2.189500093460083,
"reward_std": 0.9647402763366699,
"rewards/correctness_reward_func": 1.5,
"rewards/int_reward_func": 0.5,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.18950000405311584,
"step": 295
},
{
"completion_length": 121.0,
"epoch": 0.039609260002676304,
"grad_norm": 0.3337399661540985,
"kl": 0.03786204382777214,
"learning_rate": 1.999526504558668e-05,
"loss": 0.0015,
"reward": 2.154250144958496,
"reward_std": 0.9408127069473267,
"rewards/correctness_reward_func": 1.5,
"rewards/int_reward_func": 0.5,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.15424999594688416,
"step": 296
},
{
"completion_length": 106.5,
"epoch": 0.03974307507025291,
"grad_norm": 0.37861064076423645,
"kl": 0.062134772539138794,
"learning_rate": 1.9995130738201966e-05,
"loss": 0.0025,
"reward": 1.1582499742507935,
"reward_std": 0.9733763933181763,
"rewards/correctness_reward_func": 0.5,
"rewards/int_reward_func": 0.5,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.15825000405311584,
"step": 297
},
{
"completion_length": 243.75,
"epoch": 0.03987689013782952,
"grad_norm": 0.16624334454536438,
"kl": 0.03450315073132515,
"learning_rate": 1.9994994553006386e-05,
"loss": 0.0014,
"reward": 0.6399999856948853,
"reward_std": 1.6276360750198364,
"rewards/correctness_reward_func": 0.5,
"rewards/int_reward_func": 0.25,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.10999999940395355,
"step": 298
},
{
"completion_length": 93.25,
"epoch": 0.04001070520540613,
"grad_norm": 0.4910159707069397,
"kl": 0.04987555742263794,
"learning_rate": 1.999485649002552e-05,
"loss": 0.002,
"reward": 0.9787499904632568,
"reward_std": 1.2580289840698242,
"rewards/correctness_reward_func": 0.5,
"rewards/int_reward_func": 0.375,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.10375000536441803,
"step": 299
},
{
"completion_length": 247.5,
"epoch": 0.04014452027298274,
"grad_norm": 0.21725068986415863,
"kl": 0.011182424612343311,
"learning_rate": 1.9994716549285312e-05,
"loss": 0.0004,
"reward": 1.8627500534057617,
"reward_std": 0.7289958000183105,
"rewards/correctness_reward_func": 1.5,
"rewards/int_reward_func": 0.5,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": -0.13725000619888306,
"step": 300
}
],
"logging_steps": 1,
"max_steps": 7473,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}