| { |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 0.05352602703064365, |
| "eval_steps": 500, |
| "global_step": 400, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "completion_length": 564.25, |
| "epoch": 0.00013381506757660912, |
| "grad_norm": 0.14033202826976776, |
| "kl": 0.0, |
| "learning_rate": 8.88888888888889e-08, |
| "loss": 0.0, |
| "reward": -0.6452499628067017, |
| "reward_std": 0.8964393734931946, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.125, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.7702499628067017, |
| "step": 1 |
| }, |
| { |
| "completion_length": 523.0, |
| "epoch": 0.00026763013515321824, |
| "grad_norm": 0.1592259407043457, |
| "kl": 0.0, |
| "learning_rate": 1.777777777777778e-07, |
| "loss": -0.0, |
| "reward": -0.9787499904632568, |
| "reward_std": 1.8240368366241455, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.9787499904632568, |
| "step": 2 |
| }, |
| { |
| "completion_length": 264.25, |
| "epoch": 0.0004014452027298274, |
| "grad_norm": 0.1932271420955658, |
| "kl": 6.622826731472742e-06, |
| "learning_rate": 2.666666666666667e-07, |
| "loss": 0.0, |
| "reward": -0.226500004529953, |
| "reward_std": 0.26771190762519836, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.226500004529953, |
| "step": 3 |
| }, |
| { |
| "completion_length": 159.0, |
| "epoch": 0.0005352602703064365, |
| "grad_norm": 0.27002546191215515, |
| "kl": 1.078558216249803e-05, |
| "learning_rate": 3.555555555555556e-07, |
| "loss": 0.0, |
| "reward": 0.013750001788139343, |
| "reward_std": 0.2580934762954712, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.125, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.11124999821186066, |
| "step": 4 |
| }, |
| { |
| "completion_length": 320.75, |
| "epoch": 0.0006690753378830456, |
| "grad_norm": 0.1831292062997818, |
| "kl": 8.28549855214078e-06, |
| "learning_rate": 4.444444444444445e-07, |
| "loss": 0.0, |
| "reward": 0.0624999925494194, |
| "reward_std": 0.21419848501682281, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.125, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.0625, |
| "step": 5 |
| }, |
| { |
| "completion_length": 237.25, |
| "epoch": 0.0008028904054596548, |
| "grad_norm": 0.18136273324489594, |
| "kl": 5.389752914197743e-06, |
| "learning_rate": 5.333333333333335e-07, |
| "loss": 0.0, |
| "reward": -0.08349999785423279, |
| "reward_std": 0.16699999570846558, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.08349999785423279, |
| "step": 6 |
| }, |
| { |
| "completion_length": 375.25, |
| "epoch": 0.0009367054730362638, |
| "grad_norm": 0.18059618771076202, |
| "kl": 1.1201269444427453e-05, |
| "learning_rate": 6.222222222222223e-07, |
| "loss": 0.0, |
| "reward": -0.45899999141693115, |
| "reward_std": 0.9180000424385071, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.45899999141693115, |
| "step": 7 |
| }, |
| { |
| "completion_length": 277.0, |
| "epoch": 0.001070520540612873, |
| "grad_norm": 0.19177855551242828, |
| "kl": 5.332793080015108e-06, |
| "learning_rate": 7.111111111111112e-07, |
| "loss": 0.0, |
| "reward": -0.0560000017285347, |
| "reward_std": 0.1120000034570694, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.0560000017285347, |
| "step": 8 |
| }, |
| { |
| "completion_length": 334.5, |
| "epoch": 0.0012043356081894822, |
| "grad_norm": 0.27444255352020264, |
| "kl": 1.158629765996011e-05, |
| "learning_rate": 8.000000000000001e-07, |
| "loss": 0.0, |
| "reward": 0.2122499942779541, |
| "reward_std": 0.39513909816741943, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.125, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.0872499942779541, |
| "step": 9 |
| }, |
| { |
| "completion_length": 297.25, |
| "epoch": 0.0013381506757660913, |
| "grad_norm": 0.20776011049747467, |
| "kl": 8.035244718485046e-06, |
| "learning_rate": 8.88888888888889e-07, |
| "loss": 0.0, |
| "reward": 0.5809999704360962, |
| "reward_std": 1.0095866918563843, |
| "rewards/correctness_reward_func": 1.0, |
| "rewards/int_reward_func": 0.25, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.6690000295639038, |
| "step": 10 |
| }, |
| { |
| "completion_length": 236.5, |
| "epoch": 0.0014719657433427003, |
| "grad_norm": 9.880962898023427e-05, |
| "kl": 7.1510494308313355e-06, |
| "learning_rate": 9.77777777777778e-07, |
| "loss": 0.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.0, |
| "step": 11 |
| }, |
| { |
| "completion_length": 342.25, |
| "epoch": 0.0016057808109193096, |
| "grad_norm": 0.11707701534032822, |
| "kl": 5.569832410401432e-06, |
| "learning_rate": 1.066666666666667e-06, |
| "loss": 0.0, |
| "reward": -0.4452499747276306, |
| "reward_std": 0.3653622269630432, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.125, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.5702500343322754, |
| "step": 12 |
| }, |
| { |
| "completion_length": 261.0, |
| "epoch": 0.0017395958784959186, |
| "grad_norm": 0.22486890852451324, |
| "kl": 7.698228728258982e-06, |
| "learning_rate": 1.1555555555555556e-06, |
| "loss": 0.0, |
| "reward": -0.09549999237060547, |
| "reward_std": 0.451213538646698, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.125, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.22049999237060547, |
| "step": 13 |
| }, |
| { |
| "completion_length": 289.5, |
| "epoch": 0.0018734109460725277, |
| "grad_norm": 0.0001465526584070176, |
| "kl": 8.079272447503172e-06, |
| "learning_rate": 1.2444444444444445e-06, |
| "loss": 0.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.0, |
| "step": 14 |
| }, |
| { |
| "completion_length": 467.75, |
| "epoch": 0.002007226013649137, |
| "grad_norm": 7.054989691823721e-05, |
| "kl": 8.345767128048465e-06, |
| "learning_rate": 1.3333333333333334e-06, |
| "loss": 0.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.0, |
| "step": 15 |
| }, |
| { |
| "completion_length": 242.5, |
| "epoch": 0.002141041081225746, |
| "grad_norm": 0.20067830383777618, |
| "kl": 4.895833171758568e-06, |
| "learning_rate": 1.4222222222222223e-06, |
| "loss": 0.0, |
| "reward": 0.5694999694824219, |
| "reward_std": 0.9794492721557617, |
| "rewards/correctness_reward_func": 0.5, |
| "rewards/int_reward_func": 0.125, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.05550000071525574, |
| "step": 16 |
| }, |
| { |
| "completion_length": 397.75, |
| "epoch": 0.002274856148802355, |
| "grad_norm": 6.06259964115452e-05, |
| "kl": 7.802555046509951e-06, |
| "learning_rate": 1.5111111111111112e-06, |
| "loss": 0.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.0, |
| "step": 17 |
| }, |
| { |
| "completion_length": 263.75, |
| "epoch": 0.0024086712163789645, |
| "grad_norm": 0.2957823872566223, |
| "kl": 8.453114787698723e-06, |
| "learning_rate": 1.6000000000000001e-06, |
| "loss": 0.0, |
| "reward": -0.012249999679625034, |
| "reward_std": 0.02449999935925007, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.012249999679625034, |
| "step": 18 |
| }, |
| { |
| "completion_length": 301.0, |
| "epoch": 0.0025424862839555735, |
| "grad_norm": 0.277434378862381, |
| "kl": 8.06238858785946e-06, |
| "learning_rate": 1.688888888888889e-06, |
| "loss": 0.0, |
| "reward": 0.1875, |
| "reward_std": 0.375, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.125, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.0625, |
| "step": 19 |
| }, |
| { |
| "completion_length": 255.75, |
| "epoch": 0.0026763013515321826, |
| "grad_norm": 9.750338358571753e-05, |
| "kl": 7.102700237737736e-06, |
| "learning_rate": 1.777777777777778e-06, |
| "loss": 0.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.0, |
| "step": 20 |
| }, |
| { |
| "completion_length": 200.5, |
| "epoch": 0.0028101164191087916, |
| "grad_norm": 0.29159611463546753, |
| "kl": 8.880564564606175e-06, |
| "learning_rate": 1.8666666666666669e-06, |
| "loss": 0.0, |
| "reward": -0.1912499964237213, |
| "reward_std": 0.13046424090862274, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.1912499964237213, |
| "step": 21 |
| }, |
| { |
| "completion_length": 265.0, |
| "epoch": 0.0029439314866854006, |
| "grad_norm": 0.31326013803482056, |
| "kl": 1.243360384250991e-05, |
| "learning_rate": 1.955555555555556e-06, |
| "loss": 0.0, |
| "reward": -0.25849997997283936, |
| "reward_std": 0.8479105830192566, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.125, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.38349997997283936, |
| "step": 22 |
| }, |
| { |
| "completion_length": 154.0, |
| "epoch": 0.00307774655426201, |
| "grad_norm": 0.2559646666049957, |
| "kl": 5.530042471946217e-06, |
| "learning_rate": 2.0444444444444447e-06, |
| "loss": 0.0, |
| "reward": -0.05624999850988388, |
| "reward_std": 0.11249999701976776, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.05624999850988388, |
| "step": 23 |
| }, |
| { |
| "completion_length": 189.75, |
| "epoch": 0.003211561621838619, |
| "grad_norm": 0.284862220287323, |
| "kl": 9.174956176138949e-06, |
| "learning_rate": 2.133333333333334e-06, |
| "loss": 0.0, |
| "reward": 0.1277499943971634, |
| "reward_std": 0.2554999887943268, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.125, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.002749999985098839, |
| "step": 24 |
| }, |
| { |
| "completion_length": 268.75, |
| "epoch": 0.003345376689415228, |
| "grad_norm": 0.15809978544712067, |
| "kl": 4.42105692854966e-06, |
| "learning_rate": 2.222222222222222e-06, |
| "loss": 0.0, |
| "reward": -0.2212499976158142, |
| "reward_std": 0.4424999952316284, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.2212499976158142, |
| "step": 25 |
| }, |
| { |
| "completion_length": 210.0, |
| "epoch": 0.0034791917569918372, |
| "grad_norm": 0.2747434675693512, |
| "kl": 8.758857802604325e-06, |
| "learning_rate": 2.311111111111111e-06, |
| "loss": 0.0, |
| "reward": 0.6607499718666077, |
| "reward_std": 1.2395679950714111, |
| "rewards/correctness_reward_func": 0.5, |
| "rewards/int_reward_func": 0.125, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.035750001668930054, |
| "step": 26 |
| }, |
| { |
| "completion_length": 437.25, |
| "epoch": 0.0036130068245684463, |
| "grad_norm": 0.18731197714805603, |
| "kl": 1.347947727481369e-05, |
| "learning_rate": 2.4000000000000003e-06, |
| "loss": 0.0, |
| "reward": -0.06724999845027924, |
| "reward_std": 0.13449999690055847, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.06724999845027924, |
| "step": 27 |
| }, |
| { |
| "completion_length": 307.75, |
| "epoch": 0.0037468218921450553, |
| "grad_norm": 0.21840307116508484, |
| "kl": 8.306662493851036e-06, |
| "learning_rate": 2.488888888888889e-06, |
| "loss": 0.0, |
| "reward": -0.07774999737739563, |
| "reward_std": 0.24599508941173553, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.07774999737739563, |
| "step": 28 |
| }, |
| { |
| "completion_length": 620.25, |
| "epoch": 0.003880636959721665, |
| "grad_norm": 4.864737275056541e-05, |
| "kl": 6.508589649456553e-06, |
| "learning_rate": 2.577777777777778e-06, |
| "loss": 0.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.0, |
| "step": 29 |
| }, |
| { |
| "completion_length": 292.75, |
| "epoch": 0.004014452027298274, |
| "grad_norm": 0.4136326313018799, |
| "kl": 1.4081160770729184e-05, |
| "learning_rate": 2.666666666666667e-06, |
| "loss": 0.0, |
| "reward": -0.05950000137090683, |
| "reward_std": 0.30902159214019775, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.05950000137090683, |
| "step": 30 |
| }, |
| { |
| "completion_length": 398.25, |
| "epoch": 0.004148267094874883, |
| "grad_norm": 4.4143911509308964e-05, |
| "kl": 5.329064151737839e-06, |
| "learning_rate": 2.755555555555556e-06, |
| "loss": 0.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.0, |
| "step": 31 |
| }, |
| { |
| "completion_length": 262.5, |
| "epoch": 0.004282082162451492, |
| "grad_norm": 5.927432721364312e-05, |
| "kl": 4.8590136429993436e-06, |
| "learning_rate": 2.8444444444444446e-06, |
| "loss": 0.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.0, |
| "step": 32 |
| }, |
| { |
| "completion_length": 307.0, |
| "epoch": 0.004415897230028101, |
| "grad_norm": 0.19121825695037842, |
| "kl": 6.706204203510424e-06, |
| "learning_rate": 2.9333333333333338e-06, |
| "loss": 0.0, |
| "reward": -0.3400000333786011, |
| "reward_std": 0.5228951573371887, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.125, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.4650000333786011, |
| "step": 33 |
| }, |
| { |
| "completion_length": 463.5, |
| "epoch": 0.00454971229760471, |
| "grad_norm": 5.6564931583125144e-05, |
| "kl": 4.871402779826894e-06, |
| "learning_rate": 3.0222222222222225e-06, |
| "loss": 0.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.0, |
| "step": 34 |
| }, |
| { |
| "completion_length": 352.25, |
| "epoch": 0.004683527365181319, |
| "grad_norm": 0.1656392216682434, |
| "kl": 6.969175501581049e-06, |
| "learning_rate": 3.1111111111111116e-06, |
| "loss": 0.0, |
| "reward": 0.37549999356269836, |
| "reward_std": 0.7509999871253967, |
| "rewards/correctness_reward_func": 0.5, |
| "rewards/int_reward_func": 0.125, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.24950000643730164, |
| "step": 35 |
| }, |
| { |
| "completion_length": 388.75, |
| "epoch": 0.004817342432757929, |
| "grad_norm": 5.8849836932495236e-05, |
| "kl": 7.587910658912733e-06, |
| "learning_rate": 3.2000000000000003e-06, |
| "loss": 0.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.0, |
| "step": 36 |
| }, |
| { |
| "completion_length": 272.75, |
| "epoch": 0.004951157500334538, |
| "grad_norm": 0.19752717018127441, |
| "kl": 9.09756181499688e-06, |
| "learning_rate": 3.2888888888888894e-06, |
| "loss": 0.0, |
| "reward": 0.06274999678134918, |
| "reward_std": 0.12549999356269836, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.06274999678134918, |
| "step": 37 |
| }, |
| { |
| "completion_length": 265.25, |
| "epoch": 0.005084972567911147, |
| "grad_norm": 0.1716996431350708, |
| "kl": 7.701055437792093e-06, |
| "learning_rate": 3.377777777777778e-06, |
| "loss": 0.0, |
| "reward": -0.17024999856948853, |
| "reward_std": 0.197299063205719, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.125, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.2952499985694885, |
| "step": 38 |
| }, |
| { |
| "completion_length": 198.25, |
| "epoch": 0.005218787635487756, |
| "grad_norm": 8.153666567523032e-05, |
| "kl": 7.68154040997615e-06, |
| "learning_rate": 3.4666666666666672e-06, |
| "loss": 0.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.0, |
| "step": 39 |
| }, |
| { |
| "completion_length": 357.75, |
| "epoch": 0.005352602703064365, |
| "grad_norm": 0.16188818216323853, |
| "kl": 8.425393389188685e-06, |
| "learning_rate": 3.555555555555556e-06, |
| "loss": 0.0, |
| "reward": -0.09000000357627869, |
| "reward_std": 0.18000000715255737, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.09000000357627869, |
| "step": 40 |
| }, |
| { |
| "completion_length": 322.5, |
| "epoch": 0.005486417770640974, |
| "grad_norm": 0.22530034184455872, |
| "kl": 1.0753658898465801e-05, |
| "learning_rate": 3.644444444444445e-06, |
| "loss": 0.0, |
| "reward": 0.045249998569488525, |
| "reward_std": 0.09049999713897705, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.125, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.07975000143051147, |
| "step": 41 |
| }, |
| { |
| "completion_length": 247.75, |
| "epoch": 0.005620232838217583, |
| "grad_norm": 0.00012052639067405835, |
| "kl": 8.426506610703655e-06, |
| "learning_rate": 3.7333333333333337e-06, |
| "loss": 0.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.0, |
| "step": 42 |
| }, |
| { |
| "completion_length": 354.25, |
| "epoch": 0.005754047905794192, |
| "grad_norm": 0.21594372391700745, |
| "kl": 1.0382359505456407e-05, |
| "learning_rate": 3.8222222222222224e-06, |
| "loss": 0.0, |
| "reward": 0.11999999731779099, |
| "reward_std": 0.1444529891014099, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.11999999731779099, |
| "step": 43 |
| }, |
| { |
| "completion_length": 212.25, |
| "epoch": 0.005887862973370801, |
| "grad_norm": 0.00011440912930993363, |
| "kl": 7.224463388411095e-06, |
| "learning_rate": 3.911111111111112e-06, |
| "loss": 0.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.0, |
| "step": 44 |
| }, |
| { |
| "completion_length": 249.75, |
| "epoch": 0.00602167804094741, |
| "grad_norm": 0.20934244990348816, |
| "kl": 1.420614898961503e-05, |
| "learning_rate": 4.000000000000001e-06, |
| "loss": 0.0, |
| "reward": 0.06699999421834946, |
| "reward_std": 0.12292815744876862, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.06699999421834946, |
| "step": 45 |
| }, |
| { |
| "completion_length": 273.0, |
| "epoch": 0.00615549310852402, |
| "grad_norm": 0.2055320143699646, |
| "kl": 1.246560350409709e-05, |
| "learning_rate": 4.088888888888889e-06, |
| "loss": 0.0, |
| "reward": 0.43024998903274536, |
| "reward_std": 0.8604999780654907, |
| "rewards/correctness_reward_func": 0.5, |
| "rewards/int_reward_func": 0.125, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.19474999606609344, |
| "step": 46 |
| }, |
| { |
| "completion_length": 350.25, |
| "epoch": 0.006289308176100629, |
| "grad_norm": 0.18098776042461395, |
| "kl": 1.056162545864936e-05, |
| "learning_rate": 4.177777777777778e-06, |
| "loss": 0.0, |
| "reward": -0.20074999332427979, |
| "reward_std": 0.40149998664855957, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.125, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.3257499933242798, |
| "step": 47 |
| }, |
| { |
| "completion_length": 282.25, |
| "epoch": 0.006423123243677238, |
| "grad_norm": 0.2726364731788635, |
| "kl": 1.3284003216540441e-05, |
| "learning_rate": 4.266666666666668e-06, |
| "loss": 0.0, |
| "reward": -0.6389999985694885, |
| "reward_std": 0.7382077574729919, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.6389999985694885, |
| "step": 48 |
| }, |
| { |
| "completion_length": 165.5, |
| "epoch": 0.006556938311253847, |
| "grad_norm": 0.439759224653244, |
| "kl": 1.5033414456411265e-05, |
| "learning_rate": 4.3555555555555555e-06, |
| "loss": 0.0, |
| "reward": 0.5855000019073486, |
| "reward_std": 1.1710000038146973, |
| "rewards/correctness_reward_func": 0.5, |
| "rewards/int_reward_func": 0.125, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.039500001817941666, |
| "step": 49 |
| }, |
| { |
| "completion_length": 401.0, |
| "epoch": 0.006690753378830456, |
| "grad_norm": 0.1494322270154953, |
| "kl": 1.3026328815612942e-05, |
| "learning_rate": 4.444444444444444e-06, |
| "loss": 0.0, |
| "reward": -0.11124999821186066, |
| "reward_std": 0.2224999964237213, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.11124999821186066, |
| "step": 50 |
| }, |
| { |
| "completion_length": 205.25, |
| "epoch": 0.006824568446407065, |
| "grad_norm": 0.23992237448692322, |
| "kl": 1.3739524547418114e-05, |
| "learning_rate": 4.533333333333334e-06, |
| "loss": 0.0, |
| "reward": 0.03125, |
| "reward_std": 0.0625, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.03125, |
| "step": 51 |
| }, |
| { |
| "completion_length": 187.25, |
| "epoch": 0.0069583835139836745, |
| "grad_norm": 0.2555287480354309, |
| "kl": 1.5424680896103382e-05, |
| "learning_rate": 4.622222222222222e-06, |
| "loss": 0.0, |
| "reward": -0.07625000178813934, |
| "reward_std": 0.12490096688270569, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.07625000178813934, |
| "step": 52 |
| }, |
| { |
| "completion_length": 230.0, |
| "epoch": 0.0070921985815602835, |
| "grad_norm": 0.00012860310380347073, |
| "kl": 1.0372207725595217e-05, |
| "learning_rate": 4.711111111111111e-06, |
| "loss": 0.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.0, |
| "step": 53 |
| }, |
| { |
| "completion_length": 299.25, |
| "epoch": 0.0072260136491368926, |
| "grad_norm": 0.00015418548719026148, |
| "kl": 1.1093783541582525e-05, |
| "learning_rate": 4.800000000000001e-06, |
| "loss": 0.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.0, |
| "step": 54 |
| }, |
| { |
| "completion_length": 392.25, |
| "epoch": 0.007359828716713502, |
| "grad_norm": 0.0001652293576626107, |
| "kl": 1.3498687621904537e-05, |
| "learning_rate": 4.888888888888889e-06, |
| "loss": 0.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.0, |
| "step": 55 |
| }, |
| { |
| "completion_length": 316.5, |
| "epoch": 0.007493643784290111, |
| "grad_norm": 0.14983882009983063, |
| "kl": 2.462963675498031e-05, |
| "learning_rate": 4.977777777777778e-06, |
| "loss": 0.0, |
| "reward": -0.3332500159740448, |
| "reward_std": 0.4945647418498993, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.3332500159740448, |
| "step": 56 |
| }, |
| { |
| "completion_length": 219.0, |
| "epoch": 0.0076274588518667205, |
| "grad_norm": 0.2960438132286072, |
| "kl": 3.720477252500132e-05, |
| "learning_rate": 5.0666666666666676e-06, |
| "loss": 0.0, |
| "reward": -0.13199999928474426, |
| "reward_std": 0.2639999985694885, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.13199999928474426, |
| "step": 57 |
| }, |
| { |
| "completion_length": 235.5, |
| "epoch": 0.00776127391944333, |
| "grad_norm": 0.000978139229118824, |
| "kl": 3.992651545559056e-05, |
| "learning_rate": 5.155555555555556e-06, |
| "loss": 0.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.0, |
| "step": 58 |
| }, |
| { |
| "completion_length": 249.0, |
| "epoch": 0.007895088987019938, |
| "grad_norm": 0.00023408984998241067, |
| "kl": 2.2337197151500732e-05, |
| "learning_rate": 5.244444444444445e-06, |
| "loss": 0.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.0, |
| "step": 59 |
| }, |
| { |
| "completion_length": 263.25, |
| "epoch": 0.008028904054596548, |
| "grad_norm": 0.2353731095790863, |
| "kl": 3.2109041057992727e-05, |
| "learning_rate": 5.333333333333334e-06, |
| "loss": 0.0, |
| "reward": -0.030500000342726707, |
| "reward_std": 0.061000000685453415, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.030500000342726707, |
| "step": 60 |
| }, |
| { |
| "completion_length": 527.75, |
| "epoch": 0.008162719122173156, |
| "grad_norm": 0.15068306028842926, |
| "kl": 2.8251575713511556e-05, |
| "learning_rate": 5.422222222222223e-06, |
| "loss": 0.0, |
| "reward": -0.27649998664855957, |
| "reward_std": 0.5227195024490356, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.27649998664855957, |
| "step": 61 |
| }, |
| { |
| "completion_length": 243.75, |
| "epoch": 0.008296534189749766, |
| "grad_norm": 0.21375156939029694, |
| "kl": 5.114181476528756e-05, |
| "learning_rate": 5.511111111111112e-06, |
| "loss": 0.0, |
| "reward": -0.06224999949336052, |
| "reward_std": 0.12449999898672104, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.06224999949336052, |
| "step": 62 |
| }, |
| { |
| "completion_length": 197.5, |
| "epoch": 0.008430349257326376, |
| "grad_norm": 0.20765534043312073, |
| "kl": 6.65866318740882e-05, |
| "learning_rate": 5.600000000000001e-06, |
| "loss": 0.0, |
| "reward": 0.06274999678134918, |
| "reward_std": 0.12549999356269836, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.06274999678134918, |
| "step": 63 |
| }, |
| { |
| "completion_length": 415.75, |
| "epoch": 0.008564164324902984, |
| "grad_norm": 0.00047714909305796027, |
| "kl": 5.4172531235963106e-05, |
| "learning_rate": 5.688888888888889e-06, |
| "loss": 0.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.0, |
| "step": 64 |
| }, |
| { |
| "completion_length": 561.25, |
| "epoch": 0.008697979392479594, |
| "grad_norm": 0.00017319328617304564, |
| "kl": 2.3787781174178235e-05, |
| "learning_rate": 5.777777777777778e-06, |
| "loss": 0.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.0, |
| "step": 65 |
| }, |
| { |
| "completion_length": 266.0, |
| "epoch": 0.008831794460056202, |
| "grad_norm": 0.18374453485012054, |
| "kl": 7.648408063687384e-05, |
| "learning_rate": 5.8666666666666675e-06, |
| "loss": 0.0, |
| "reward": -0.2854999899864197, |
| "reward_std": 0.40305209159851074, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.125, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.4104999899864197, |
| "step": 66 |
| }, |
| { |
| "completion_length": 367.0, |
| "epoch": 0.008965609527632812, |
| "grad_norm": 0.00041845859959721565, |
| "kl": 6.105724605731666e-05, |
| "learning_rate": 5.955555555555555e-06, |
| "loss": 0.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.0, |
| "step": 67 |
| }, |
| { |
| "completion_length": 165.25, |
| "epoch": 0.00909942459520942, |
| "grad_norm": 0.3174028694629669, |
| "kl": 0.00012760543904732913, |
| "learning_rate": 6.044444444444445e-06, |
| "loss": 0.0, |
| "reward": -0.09300000220537186, |
| "reward_std": 0.10758562386035919, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.09300000220537186, |
| "step": 68 |
| }, |
| { |
| "completion_length": 261.75, |
| "epoch": 0.00923323966278603, |
| "grad_norm": 0.0009887454798445106, |
| "kl": 0.00011684713535942137, |
| "learning_rate": 6.133333333333334e-06, |
| "loss": 0.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.0, |
| "step": 69 |
| }, |
| { |
| "completion_length": 351.25, |
| "epoch": 0.009367054730362638, |
| "grad_norm": 0.22120727598667145, |
| "kl": 8.070362673606724e-05, |
| "learning_rate": 6.222222222222223e-06, |
| "loss": 0.0, |
| "reward": 0.09224999696016312, |
| "reward_std": 0.18449999392032623, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.09224999696016312, |
| "step": 70 |
| }, |
| { |
| "completion_length": 364.75, |
| "epoch": 0.009500869797939248, |
| "grad_norm": 0.14219215512275696, |
| "kl": 0.00014313386054709554, |
| "learning_rate": 6.311111111111111e-06, |
| "loss": 0.0, |
| "reward": 0.03125, |
| "reward_std": 0.0625, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.03125, |
| "step": 71 |
| }, |
| { |
| "completion_length": 301.0, |
| "epoch": 0.009634684865515858, |
| "grad_norm": 0.00125442526768893, |
| "kl": 0.00022049600374884903, |
| "learning_rate": 6.4000000000000006e-06, |
| "loss": 0.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.0, |
| "step": 72 |
| }, |
| { |
| "completion_length": 257.5, |
| "epoch": 0.009768499933092466, |
| "grad_norm": 0.27416515350341797, |
| "kl": 0.0003145383088849485, |
| "learning_rate": 6.488888888888889e-06, |
| "loss": 0.0, |
| "reward": 0.20900002121925354, |
| "reward_std": 0.4749357998371124, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.375, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.16599999368190765, |
| "step": 73 |
| }, |
| { |
| "completion_length": 355.75, |
| "epoch": 0.009902315000669076, |
| "grad_norm": 0.0006047863862477243, |
| "kl": 9.019898425322026e-05, |
| "learning_rate": 6.577777777777779e-06, |
| "loss": 0.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.0, |
| "step": 74 |
| }, |
| { |
| "completion_length": 329.25, |
| "epoch": 0.010036130068245684, |
| "grad_norm": 0.19630220532417297, |
| "kl": 0.00013932358706369996, |
| "learning_rate": 6.666666666666667e-06, |
| "loss": 0.0, |
| "reward": -0.3462499976158142, |
| "reward_std": 0.6924999952316284, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.3462499976158142, |
| "step": 75 |
| }, |
| { |
| "completion_length": 203.0, |
| "epoch": 0.010169945135822294, |
| "grad_norm": 0.19907738268375397, |
| "kl": 0.0003505215863697231, |
| "learning_rate": 6.755555555555556e-06, |
| "loss": 0.0, |
| "reward": -0.08375000208616257, |
| "reward_std": 0.16750000417232513, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.08375000208616257, |
| "step": 76 |
| }, |
| { |
| "completion_length": 156.0, |
| "epoch": 0.010303760203398902, |
| "grad_norm": 0.0033248290419578552, |
| "kl": 0.0005173031822778285, |
| "learning_rate": 6.844444444444445e-06, |
| "loss": 0.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.0, |
| "step": 77 |
| }, |
| { |
| "completion_length": 248.5, |
| "epoch": 0.010437575270975512, |
| "grad_norm": 0.2999407947063446, |
| "kl": 0.0002213937696069479, |
| "learning_rate": 6.9333333333333344e-06, |
| "loss": 0.0, |
| "reward": 0.125, |
| "reward_std": 0.25, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.125, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.0, |
| "step": 78 |
| }, |
| { |
| "completion_length": 243.0, |
| "epoch": 0.01057139033855212, |
| "grad_norm": 0.2764039933681488, |
| "kl": 0.00041108846198767424, |
| "learning_rate": 7.022222222222222e-06, |
| "loss": 0.0, |
| "reward": -0.08399999886751175, |
| "reward_std": 0.1679999977350235, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.08399999886751175, |
| "step": 79 |
| }, |
| { |
| "completion_length": 577.0, |
| "epoch": 0.01070520540612873, |
| "grad_norm": 0.1485264003276825, |
| "kl": 0.0002135551767423749, |
| "learning_rate": 7.111111111111112e-06, |
| "loss": 0.0, |
| "reward": -0.6702499985694885, |
| "reward_std": 1.340499997138977, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.6702499985694885, |
| "step": 80 |
| }, |
| { |
| "completion_length": 491.75, |
| "epoch": 0.010839020473705338, |
| "grad_norm": 0.0006590148550458252, |
| "kl": 0.000124384619994089, |
| "learning_rate": 7.2000000000000005e-06, |
| "loss": 0.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.0, |
| "step": 81 |
| }, |
| { |
| "completion_length": 183.25, |
| "epoch": 0.010972835541281948, |
| "grad_norm": 0.0028198054060339928, |
| "kl": 0.000547908479347825, |
| "learning_rate": 7.28888888888889e-06, |
| "loss": 0.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.0, |
| "step": 82 |
| }, |
| { |
| "completion_length": 312.25, |
| "epoch": 0.011106650608858558, |
| "grad_norm": 0.27911630272865295, |
| "kl": 0.0005270105320960283, |
| "learning_rate": 7.377777777777778e-06, |
| "loss": 0.0, |
| "reward": 0.015999972820281982, |
| "reward_std": 1.7774159908294678, |
| "rewards/correctness_reward_func": 0.5, |
| "rewards/int_reward_func": 0.25, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.734000027179718, |
| "step": 83 |
| }, |
| { |
| "completion_length": 348.25, |
| "epoch": 0.011240465676435166, |
| "grad_norm": 0.0015268020797520876, |
| "kl": 0.00035788220702670515, |
| "learning_rate": 7.4666666666666675e-06, |
| "loss": 0.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.0, |
| "step": 84 |
| }, |
| { |
| "completion_length": 430.5, |
| "epoch": 0.011374280744011776, |
| "grad_norm": 0.13874612748622894, |
| "kl": 0.00037030907697044313, |
| "learning_rate": 7.555555555555556e-06, |
| "loss": 0.0, |
| "reward": -0.6794999837875366, |
| "reward_std": 1.3589999675750732, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.6794999837875366, |
| "step": 85 |
| }, |
| { |
| "completion_length": 264.5, |
| "epoch": 0.011508095811588384, |
| "grad_norm": 0.22207802534103394, |
| "kl": 0.00047897486365400255, |
| "learning_rate": 7.644444444444445e-06, |
| "loss": 0.0, |
| "reward": -0.3617500066757202, |
| "reward_std": 0.4231094419956207, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.3617500066757202, |
| "step": 86 |
| }, |
| { |
| "completion_length": 199.25, |
| "epoch": 0.011641910879164994, |
| "grad_norm": 0.26088446378707886, |
| "kl": 0.000934716546908021, |
| "learning_rate": 7.733333333333334e-06, |
| "loss": 0.0, |
| "reward": -0.17775000631809235, |
| "reward_std": 0.3555000126361847, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.17775000631809235, |
| "step": 87 |
| }, |
| { |
| "completion_length": 432.5, |
| "epoch": 0.011775725946741603, |
| "grad_norm": 0.22545722126960754, |
| "kl": 0.001022200332954526, |
| "learning_rate": 7.822222222222224e-06, |
| "loss": 0.0, |
| "reward": -0.10100000351667404, |
| "reward_std": 0.20200000703334808, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.10100000351667404, |
| "step": 88 |
| }, |
| { |
| "completion_length": 290.0, |
| "epoch": 0.011909541014318212, |
| "grad_norm": 0.0022929804399609566, |
| "kl": 0.0007066897815093398, |
| "learning_rate": 7.911111111111112e-06, |
| "loss": 0.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.0, |
| "step": 89 |
| }, |
| { |
| "completion_length": 306.5, |
| "epoch": 0.01204335608189482, |
| "grad_norm": 0.18963995575904846, |
| "kl": 0.0011429399019107223, |
| "learning_rate": 8.000000000000001e-06, |
| "loss": 0.0, |
| "reward": -0.3070000112056732, |
| "reward_std": 0.6140000224113464, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.3070000112056732, |
| "step": 90 |
| }, |
| { |
| "completion_length": 306.25, |
| "epoch": 0.01217717114947143, |
| "grad_norm": 0.32602232694625854, |
| "kl": 0.0012934200931340456, |
| "learning_rate": 8.08888888888889e-06, |
| "loss": 0.0001, |
| "reward": 0.021250000223517418, |
| "reward_std": 0.042500000447034836, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.021250000223517418, |
| "step": 91 |
| }, |
| { |
| "completion_length": 178.0, |
| "epoch": 0.01231098621704804, |
| "grad_norm": 0.32629159092903137, |
| "kl": 0.0017813891172409058, |
| "learning_rate": 8.177777777777779e-06, |
| "loss": 0.0001, |
| "reward": 0.6482499837875366, |
| "reward_std": 1.2964999675750732, |
| "rewards/correctness_reward_func": 0.5, |
| "rewards/int_reward_func": 0.125, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.023250000551342964, |
| "step": 92 |
| }, |
| { |
| "completion_length": 334.5, |
| "epoch": 0.012444801284624649, |
| "grad_norm": 0.0029060724191367626, |
| "kl": 0.0007846200605854392, |
| "learning_rate": 8.266666666666667e-06, |
| "loss": 0.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.0, |
| "step": 93 |
| }, |
| { |
| "completion_length": 389.5, |
| "epoch": 0.012578616352201259, |
| "grad_norm": 0.0021289298310875893, |
| "kl": 0.0007502713124267757, |
| "learning_rate": 8.355555555555556e-06, |
| "loss": 0.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.0, |
| "step": 94 |
| }, |
| { |
| "completion_length": 251.75, |
| "epoch": 0.012712431419777867, |
| "grad_norm": 0.011231260374188423, |
| "kl": 0.003988795448094606, |
| "learning_rate": 8.444444444444446e-06, |
| "loss": 0.0002, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.0, |
| "step": 95 |
| }, |
| { |
| "completion_length": 246.25, |
| "epoch": 0.012846246487354477, |
| "grad_norm": 0.0036670551635324955, |
| "kl": 0.0013651195913553238, |
| "learning_rate": 8.533333333333335e-06, |
| "loss": 0.0001, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.0, |
| "step": 96 |
| }, |
| { |
| "completion_length": 338.25, |
| "epoch": 0.012980061554931085, |
| "grad_norm": 0.1569143831729889, |
| "kl": 0.0011384707177057862, |
| "learning_rate": 8.622222222222223e-06, |
| "loss": 0.0, |
| "reward": -0.36274999380111694, |
| "reward_std": 0.8109769225120544, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.36274999380111694, |
| "step": 97 |
| }, |
| { |
| "completion_length": 260.5, |
| "epoch": 0.013113876622507695, |
| "grad_norm": 0.2566815912723541, |
| "kl": 0.0018418811960145831, |
| "learning_rate": 8.711111111111111e-06, |
| "loss": 0.0001, |
| "reward": -0.10649999976158142, |
| "reward_std": 0.21299999952316284, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.10649999976158142, |
| "step": 98 |
| }, |
| { |
| "completion_length": 588.5, |
| "epoch": 0.013247691690084303, |
| "grad_norm": 0.3212381899356842, |
| "kl": 0.0008077286183834076, |
| "learning_rate": 8.8e-06, |
| "loss": 0.0, |
| "reward": -0.1547500044107437, |
| "reward_std": 0.30949997901916504, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.1547500044107437, |
| "step": 99 |
| }, |
| { |
| "completion_length": 202.5, |
| "epoch": 0.013381506757660913, |
| "grad_norm": 0.31029075384140015, |
| "kl": 0.0019261679844930768, |
| "learning_rate": 8.888888888888888e-06, |
| "loss": 0.0001, |
| "reward": 0.03125, |
| "reward_std": 0.0625, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.03125, |
| "step": 100 |
| }, |
| { |
| "completion_length": 422.0, |
| "epoch": 0.013515321825237521, |
| "grad_norm": 0.2212388813495636, |
| "kl": 0.0015066334744915366, |
| "learning_rate": 8.977777777777778e-06, |
| "loss": 0.0001, |
| "reward": 0.03324999660253525, |
| "reward_std": 0.27917900681495667, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.03324999660253525, |
| "step": 101 |
| }, |
| { |
| "completion_length": 405.25, |
| "epoch": 0.01364913689281413, |
| "grad_norm": 0.0010115457698702812, |
| "kl": 0.0004617396043613553, |
| "learning_rate": 9.066666666666667e-06, |
| "loss": 0.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.0, |
| "step": 102 |
| }, |
| { |
| "completion_length": 254.25, |
| "epoch": 0.01378295196039074, |
| "grad_norm": 0.0024686024989932775, |
| "kl": 0.0011656455462798476, |
| "learning_rate": 9.155555555555557e-06, |
| "loss": 0.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.0, |
| "step": 103 |
| }, |
| { |
| "completion_length": 270.75, |
| "epoch": 0.013916767027967349, |
| "grad_norm": 0.3148210048675537, |
| "kl": 0.0020634387619793415, |
| "learning_rate": 9.244444444444445e-06, |
| "loss": 0.0001, |
| "reward": -0.37025001645088196, |
| "reward_std": 0.9000205397605896, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.125, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.49525001645088196, |
| "step": 104 |
| }, |
| { |
| "completion_length": 447.25, |
| "epoch": 0.014050582095543959, |
| "grad_norm": 0.00280591519549489, |
| "kl": 0.0013167858123779297, |
| "learning_rate": 9.333333333333334e-06, |
| "loss": 0.0001, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.0, |
| "step": 105 |
| }, |
| { |
| "completion_length": 250.25, |
| "epoch": 0.014184397163120567, |
| "grad_norm": 1.272940993309021, |
| "kl": 0.010282534174621105, |
| "learning_rate": 9.422222222222222e-06, |
| "loss": 0.0004, |
| "reward": -0.38475000858306885, |
| "reward_std": 0.7695000171661377, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.38475000858306885, |
| "step": 106 |
| }, |
| { |
| "completion_length": 514.0, |
| "epoch": 0.014318212230697177, |
| "grad_norm": 0.003415808780118823, |
| "kl": 0.0018425981979817152, |
| "learning_rate": 9.511111111111112e-06, |
| "loss": 0.0001, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.0, |
| "step": 107 |
| }, |
| { |
| "completion_length": 378.25, |
| "epoch": 0.014452027298273785, |
| "grad_norm": 0.0043991003185510635, |
| "kl": 0.0022172422613948584, |
| "learning_rate": 9.600000000000001e-06, |
| "loss": 0.0001, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.0, |
| "step": 108 |
| }, |
| { |
| "completion_length": 384.0, |
| "epoch": 0.014585842365850395, |
| "grad_norm": 0.15473611652851105, |
| "kl": 0.003075521672144532, |
| "learning_rate": 9.688888888888889e-06, |
| "loss": 0.0001, |
| "reward": 0.03125, |
| "reward_std": 0.0625, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.03125, |
| "step": 109 |
| }, |
| { |
| "completion_length": 386.25, |
| "epoch": 0.014719657433427003, |
| "grad_norm": 0.18655884265899658, |
| "kl": 0.0021878289990127087, |
| "learning_rate": 9.777777777777779e-06, |
| "loss": 0.0001, |
| "reward": -0.14174999296665192, |
| "reward_std": 0.28349998593330383, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.14174999296665192, |
| "step": 110 |
| }, |
| { |
| "completion_length": 388.5, |
| "epoch": 0.014853472501003613, |
| "grad_norm": 0.005759637802839279, |
| "kl": 0.0032043028622865677, |
| "learning_rate": 9.866666666666668e-06, |
| "loss": 0.0001, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.0, |
| "step": 111 |
| }, |
| { |
| "completion_length": 368.0, |
| "epoch": 0.014987287568580221, |
| "grad_norm": 0.003658253001049161, |
| "kl": 0.0023007667623460293, |
| "learning_rate": 9.955555555555556e-06, |
| "loss": 0.0001, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.0, |
| "step": 112 |
| }, |
| { |
| "completion_length": 174.0, |
| "epoch": 0.015121102636156831, |
| "grad_norm": 0.01522353570908308, |
| "kl": 0.008999449200928211, |
| "learning_rate": 1.0044444444444446e-05, |
| "loss": 0.0004, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.0, |
| "step": 113 |
| }, |
| { |
| "completion_length": 199.0, |
| "epoch": 0.015254917703733441, |
| "grad_norm": 0.2540055513381958, |
| "kl": 0.0061608292162418365, |
| "learning_rate": 1.0133333333333335e-05, |
| "loss": 0.0002, |
| "reward": 0.34974998235702515, |
| "reward_std": 0.24181587994098663, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.25, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.09974999725818634, |
| "step": 114 |
| }, |
| { |
| "completion_length": 261.0, |
| "epoch": 0.01538873277131005, |
| "grad_norm": 0.004695532377809286, |
| "kl": 0.0037373793311417103, |
| "learning_rate": 1.0222222222222223e-05, |
| "loss": 0.0001, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.0, |
| "step": 115 |
| }, |
| { |
| "completion_length": 460.75, |
| "epoch": 0.01552254783888666, |
| "grad_norm": 0.002644852502271533, |
| "kl": 0.0022986496333032846, |
| "learning_rate": 1.0311111111111113e-05, |
| "loss": 0.0001, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.0, |
| "step": 116 |
| }, |
| { |
| "completion_length": 199.75, |
| "epoch": 0.01565636290646327, |
| "grad_norm": 0.5485031604766846, |
| "kl": 0.0077914525754749775, |
| "learning_rate": 1.04e-05, |
| "loss": 0.0003, |
| "reward": 0.013749999925494194, |
| "reward_std": 0.11662010848522186, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.013749999925494194, |
| "step": 117 |
| }, |
| { |
| "completion_length": 221.5, |
| "epoch": 0.015790177974039876, |
| "grad_norm": 0.005294781178236008, |
| "kl": 0.004005097784101963, |
| "learning_rate": 1.048888888888889e-05, |
| "loss": 0.0002, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.0, |
| "step": 118 |
| }, |
| { |
| "completion_length": 261.25, |
| "epoch": 0.015923993041616485, |
| "grad_norm": 0.005487331189215183, |
| "kl": 0.00457819364964962, |
| "learning_rate": 1.0577777777777778e-05, |
| "loss": 0.0002, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.0, |
| "step": 119 |
| }, |
| { |
| "completion_length": 303.5, |
| "epoch": 0.016057808109193095, |
| "grad_norm": 0.18594208359718323, |
| "kl": 0.003819538513198495, |
| "learning_rate": 1.0666666666666667e-05, |
| "loss": 0.0002, |
| "reward": 0.23649999499320984, |
| "reward_std": 0.39350855350494385, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.125, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.11149999499320984, |
| "step": 120 |
| }, |
| { |
| "completion_length": 437.0, |
| "epoch": 0.016191623176769705, |
| "grad_norm": 0.003429972566664219, |
| "kl": 0.003270561108365655, |
| "learning_rate": 1.0755555555555557e-05, |
| "loss": 0.0001, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.0, |
| "step": 121 |
| }, |
| { |
| "completion_length": 225.75, |
| "epoch": 0.01632543824434631, |
| "grad_norm": 0.27138444781303406, |
| "kl": 0.008114825934171677, |
| "learning_rate": 1.0844444444444446e-05, |
| "loss": 0.0003, |
| "reward": 0.5557500123977661, |
| "reward_std": 1.1115000247955322, |
| "rewards/correctness_reward_func": 0.5, |
| "rewards/int_reward_func": 0.125, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.06925000250339508, |
| "step": 122 |
| }, |
| { |
| "completion_length": 228.75, |
| "epoch": 0.01645925331192292, |
| "grad_norm": 0.006109706126153469, |
| "kl": 0.0048021371476352215, |
| "learning_rate": 1.0933333333333334e-05, |
| "loss": 0.0002, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.0, |
| "step": 123 |
| }, |
| { |
| "completion_length": 260.25, |
| "epoch": 0.01659306837949953, |
| "grad_norm": 0.22336947917938232, |
| "kl": 0.008044867776334286, |
| "learning_rate": 1.1022222222222224e-05, |
| "loss": 0.0003, |
| "reward": -0.14000000059604645, |
| "reward_std": 0.2800000011920929, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.14000000059604645, |
| "step": 124 |
| }, |
| { |
| "completion_length": 285.0, |
| "epoch": 0.01672688344707614, |
| "grad_norm": 0.1823970526456833, |
| "kl": 0.003958097659051418, |
| "learning_rate": 1.1111111111111113e-05, |
| "loss": 0.0002, |
| "reward": 0.03125, |
| "reward_std": 0.0625, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.03125, |
| "step": 125 |
| }, |
| { |
| "completion_length": 510.0, |
| "epoch": 0.01686069851465275, |
| "grad_norm": 0.21305835247039795, |
| "kl": 0.003483413252979517, |
| "learning_rate": 1.1200000000000001e-05, |
| "loss": 0.0001, |
| "reward": -0.11699999868869781, |
| "reward_std": 0.23399999737739563, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.11699999868869781, |
| "step": 126 |
| }, |
| { |
| "completion_length": 201.0, |
| "epoch": 0.016994513582229358, |
| "grad_norm": 0.00481327623128891, |
| "kl": 0.004589317366480827, |
| "learning_rate": 1.1288888888888889e-05, |
| "loss": 0.0002, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.0, |
| "step": 127 |
| }, |
| { |
| "completion_length": 295.25, |
| "epoch": 0.017128328649805968, |
| "grad_norm": 0.019327977672219276, |
| "kl": 0.007599423639476299, |
| "learning_rate": 1.1377777777777779e-05, |
| "loss": 0.0003, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.0, |
| "step": 128 |
| }, |
| { |
| "completion_length": 260.0, |
| "epoch": 0.017262143717382578, |
| "grad_norm": 0.25404590368270874, |
| "kl": 0.007972000166773796, |
| "learning_rate": 1.1466666666666668e-05, |
| "loss": 0.0003, |
| "reward": 0.5900000333786011, |
| "reward_std": 1.0515261888504028, |
| "rewards/correctness_reward_func": 0.5, |
| "rewards/int_reward_func": 0.125, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.034999996423721313, |
| "step": 129 |
| }, |
| { |
| "completion_length": 366.0, |
| "epoch": 0.017395958784959187, |
| "grad_norm": 0.2250823676586151, |
| "kl": 0.0046564992517232895, |
| "learning_rate": 1.1555555555555556e-05, |
| "loss": 0.0002, |
| "reward": -0.08275000005960464, |
| "reward_std": 0.25571519136428833, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.08275000005960464, |
| "step": 130 |
| }, |
| { |
| "completion_length": 200.25, |
| "epoch": 0.017529773852535794, |
| "grad_norm": 0.33885374665260315, |
| "kl": 0.010599642992019653, |
| "learning_rate": 1.1644444444444446e-05, |
| "loss": 0.0004, |
| "reward": 0.65625, |
| "reward_std": 1.3125, |
| "rewards/correctness_reward_func": 0.5, |
| "rewards/int_reward_func": 0.125, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.03125, |
| "step": 131 |
| }, |
| { |
| "completion_length": 203.25, |
| "epoch": 0.017663588920112404, |
| "grad_norm": 0.35624194145202637, |
| "kl": 0.00767766498029232, |
| "learning_rate": 1.1733333333333335e-05, |
| "loss": 0.0003, |
| "reward": 0.18774999678134918, |
| "reward_std": 0.2974899113178253, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.125, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.06274999678134918, |
| "step": 132 |
| }, |
| { |
| "completion_length": 225.0, |
| "epoch": 0.017797403987689014, |
| "grad_norm": 0.21870525181293488, |
| "kl": 0.005271477624773979, |
| "learning_rate": 1.1822222222222225e-05, |
| "loss": 0.0002, |
| "reward": 0.06875000149011612, |
| "reward_std": 0.13750000298023224, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.125, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.05624999850988388, |
| "step": 133 |
| }, |
| { |
| "completion_length": 299.25, |
| "epoch": 0.017931219055265624, |
| "grad_norm": 0.008637133985757828, |
| "kl": 0.00660554226487875, |
| "learning_rate": 1.191111111111111e-05, |
| "loss": 0.0003, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.0, |
| "step": 134 |
| }, |
| { |
| "completion_length": 307.25, |
| "epoch": 0.018065034122842234, |
| "grad_norm": 0.32039323449134827, |
| "kl": 0.010449215769767761, |
| "learning_rate": 1.2e-05, |
| "loss": 0.0004, |
| "reward": 0.1067499965429306, |
| "reward_std": 0.12341629713773727, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.1067499965429306, |
| "step": 135 |
| }, |
| { |
| "completion_length": 323.0, |
| "epoch": 0.01819884919041884, |
| "grad_norm": 0.1738128513097763, |
| "kl": 0.00565626285970211, |
| "learning_rate": 1.208888888888889e-05, |
| "loss": 0.0002, |
| "reward": -0.37550002336502075, |
| "reward_std": 0.7510000467300415, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.125, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.5005000233650208, |
| "step": 136 |
| }, |
| { |
| "completion_length": 136.5, |
| "epoch": 0.01833266425799545, |
| "grad_norm": 0.5343514084815979, |
| "kl": 0.010712994262576103, |
| "learning_rate": 1.217777777777778e-05, |
| "loss": 0.0004, |
| "reward": 0.03849999979138374, |
| "reward_std": 0.07699999213218689, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.03849999979138374, |
| "step": 137 |
| }, |
| { |
| "completion_length": 393.5, |
| "epoch": 0.01846647932557206, |
| "grad_norm": 0.23455312848091125, |
| "kl": 0.0038801138289272785, |
| "learning_rate": 1.2266666666666667e-05, |
| "loss": 0.0002, |
| "reward": 0.125, |
| "reward_std": 0.25, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.125, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.0, |
| "step": 138 |
| }, |
| { |
| "completion_length": 227.5, |
| "epoch": 0.01860029439314867, |
| "grad_norm": 0.011444750241935253, |
| "kl": 0.0084530059248209, |
| "learning_rate": 1.2355555555555557e-05, |
| "loss": 0.0003, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.0, |
| "step": 139 |
| }, |
| { |
| "completion_length": 214.5, |
| "epoch": 0.018734109460725276, |
| "grad_norm": 0.004309752490371466, |
| "kl": 0.0035294159315526485, |
| "learning_rate": 1.2444444444444446e-05, |
| "loss": 0.0001, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.0, |
| "step": 140 |
| }, |
| { |
| "completion_length": 186.75, |
| "epoch": 0.018867924528301886, |
| "grad_norm": 0.30907976627349854, |
| "kl": 0.005284931510686874, |
| "learning_rate": 1.2533333333333336e-05, |
| "loss": 0.0002, |
| "reward": -0.02500000037252903, |
| "reward_std": 0.05000000074505806, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.02500000037252903, |
| "step": 141 |
| }, |
| { |
| "completion_length": 196.5, |
| "epoch": 0.019001739595878496, |
| "grad_norm": 0.482105016708374, |
| "kl": 0.006421719677746296, |
| "learning_rate": 1.2622222222222222e-05, |
| "loss": 0.0003, |
| "reward": 0.5855000019073486, |
| "reward_std": 1.3796979188919067, |
| "rewards/correctness_reward_func": 0.5, |
| "rewards/int_reward_func": 0.125, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.03949999809265137, |
| "step": 142 |
| }, |
| { |
| "completion_length": 147.0, |
| "epoch": 0.019135554663455106, |
| "grad_norm": 0.4996793866157532, |
| "kl": 0.008296657353639603, |
| "learning_rate": 1.2711111111111112e-05, |
| "loss": 0.0003, |
| "reward": -0.08299999684095383, |
| "reward_std": 0.340639591217041, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.125, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.20800000429153442, |
| "step": 143 |
| }, |
| { |
| "completion_length": 346.0, |
| "epoch": 0.019269369731031716, |
| "grad_norm": 0.2119971662759781, |
| "kl": 0.004949862137436867, |
| "learning_rate": 1.2800000000000001e-05, |
| "loss": 0.0002, |
| "reward": -0.26100000739097595, |
| "reward_std": 0.5668544769287109, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.26100000739097595, |
| "step": 144 |
| }, |
| { |
| "completion_length": 321.5, |
| "epoch": 0.019403184798608322, |
| "grad_norm": 0.2375519573688507, |
| "kl": 0.003724107053130865, |
| "learning_rate": 1.288888888888889e-05, |
| "loss": 0.0001, |
| "reward": 0.0794999971985817, |
| "reward_std": 0.49268415570259094, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.25, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.1705000102519989, |
| "step": 145 |
| }, |
| { |
| "completion_length": 268.25, |
| "epoch": 0.019536999866184932, |
| "grad_norm": 0.3193058967590332, |
| "kl": 0.0033836988732218742, |
| "learning_rate": 1.2977777777777779e-05, |
| "loss": 0.0001, |
| "reward": -0.11349999159574509, |
| "reward_std": 0.5760402083396912, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.125, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.23849999904632568, |
| "step": 146 |
| }, |
| { |
| "completion_length": 173.25, |
| "epoch": 0.019670814933761542, |
| "grad_norm": 0.26112160086631775, |
| "kl": 0.006745063699781895, |
| "learning_rate": 1.3066666666666668e-05, |
| "loss": 0.0003, |
| "reward": 1.871500015258789, |
| "reward_std": 1.0523637533187866, |
| "rewards/correctness_reward_func": 1.5, |
| "rewards/int_reward_func": 0.5, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.12849999964237213, |
| "step": 147 |
| }, |
| { |
| "completion_length": 261.75, |
| "epoch": 0.019804630001338152, |
| "grad_norm": 0.2869455814361572, |
| "kl": 0.008915035054087639, |
| "learning_rate": 1.3155555555555558e-05, |
| "loss": 0.0004, |
| "reward": 0.2330000102519989, |
| "reward_std": 0.4726901948451996, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.125, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.1080000028014183, |
| "step": 148 |
| }, |
| { |
| "completion_length": 401.25, |
| "epoch": 0.01993844506891476, |
| "grad_norm": 0.21081912517547607, |
| "kl": 0.0027889276389032602, |
| "learning_rate": 1.3244444444444447e-05, |
| "loss": 0.0001, |
| "reward": -0.21125000715255737, |
| "reward_std": 0.24462132155895233, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.21125000715255737, |
| "step": 149 |
| }, |
| { |
| "completion_length": 314.25, |
| "epoch": 0.02007226013649137, |
| "grad_norm": 0.19632020592689514, |
| "kl": 0.006448840722441673, |
| "learning_rate": 1.3333333333333333e-05, |
| "loss": 0.0003, |
| "reward": -0.29874998331069946, |
| "reward_std": 0.35072246193885803, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.29874998331069946, |
| "step": 150 |
| }, |
| { |
| "completion_length": 178.5, |
| "epoch": 0.020206075204067978, |
| "grad_norm": 0.43643537163734436, |
| "kl": 0.009863872081041336, |
| "learning_rate": 1.3422222222222223e-05, |
| "loss": 0.0004, |
| "reward": -0.001499999314546585, |
| "reward_std": 0.14289741218090057, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.001499999314546585, |
| "step": 151 |
| }, |
| { |
| "completion_length": 146.25, |
| "epoch": 0.020339890271644588, |
| "grad_norm": 0.32021835446357727, |
| "kl": 0.015754813328385353, |
| "learning_rate": 1.3511111111111112e-05, |
| "loss": 0.0006, |
| "reward": 0.11400000005960464, |
| "reward_std": 0.1562071293592453, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.11400000005960464, |
| "step": 152 |
| }, |
| { |
| "completion_length": 93.75, |
| "epoch": 0.020473705339221198, |
| "grad_norm": 0.531770646572113, |
| "kl": 0.015665946528315544, |
| "learning_rate": 1.3600000000000002e-05, |
| "loss": 0.0006, |
| "reward": 0.7382500171661377, |
| "reward_std": 1.344576358795166, |
| "rewards/correctness_reward_func": 0.5, |
| "rewards/int_reward_func": 0.125, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.1132500022649765, |
| "step": 153 |
| }, |
| { |
| "completion_length": 199.75, |
| "epoch": 0.020607520406797804, |
| "grad_norm": 0.46382462978363037, |
| "kl": 0.012057983316481113, |
| "learning_rate": 1.368888888888889e-05, |
| "loss": 0.0005, |
| "reward": 0.09950000047683716, |
| "reward_std": 0.14664356410503387, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.25, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.15049999952316284, |
| "step": 154 |
| }, |
| { |
| "completion_length": 378.0, |
| "epoch": 0.020741335474374414, |
| "grad_norm": 0.2936302125453949, |
| "kl": 0.00859862007200718, |
| "learning_rate": 1.377777777777778e-05, |
| "loss": 0.0003, |
| "reward": -0.07199999690055847, |
| "reward_std": 0.6001955270767212, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.125, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.19699999690055847, |
| "step": 155 |
| }, |
| { |
| "completion_length": 143.75, |
| "epoch": 0.020875150541951024, |
| "grad_norm": 0.36545681953430176, |
| "kl": 0.018919892609119415, |
| "learning_rate": 1.3866666666666669e-05, |
| "loss": 0.0008, |
| "reward": 0.2567500174045563, |
| "reward_std": 0.42568716406822205, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.125, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.13175000250339508, |
| "step": 156 |
| }, |
| { |
| "completion_length": 110.5, |
| "epoch": 0.021008965609527634, |
| "grad_norm": 0.5882317423820496, |
| "kl": 0.027741603553295135, |
| "learning_rate": 1.3955555555555558e-05, |
| "loss": 0.0011, |
| "reward": 0.17000000178813934, |
| "reward_std": 0.14358505606651306, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.17000000178813934, |
| "step": 157 |
| }, |
| { |
| "completion_length": 64.75, |
| "epoch": 0.02114278067710424, |
| "grad_norm": 1.1799137592315674, |
| "kl": 0.056555233895778656, |
| "learning_rate": 1.4044444444444445e-05, |
| "loss": 0.0023, |
| "reward": 0.38724997639656067, |
| "reward_std": 0.34333789348602295, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.125, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.26225000619888306, |
| "step": 158 |
| }, |
| { |
| "completion_length": 194.0, |
| "epoch": 0.02127659574468085, |
| "grad_norm": 0.30677542090415955, |
| "kl": 0.01423485018312931, |
| "learning_rate": 1.4133333333333334e-05, |
| "loss": 0.0006, |
| "reward": 0.10674998164176941, |
| "reward_std": 0.7794790267944336, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.25, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.1432499885559082, |
| "step": 159 |
| }, |
| { |
| "completion_length": 167.0, |
| "epoch": 0.02141041081225746, |
| "grad_norm": 0.42824897170066833, |
| "kl": 0.03500333055853844, |
| "learning_rate": 1.4222222222222224e-05, |
| "loss": 0.0014, |
| "reward": 0.3812499940395355, |
| "reward_std": 0.1884257048368454, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.25, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.13124999403953552, |
| "step": 160 |
| }, |
| { |
| "completion_length": 60.75, |
| "epoch": 0.02154422587983407, |
| "grad_norm": 0.5921115279197693, |
| "kl": 0.05752525106072426, |
| "learning_rate": 1.4311111111111111e-05, |
| "loss": 0.0023, |
| "reward": 0.2929999828338623, |
| "reward_std": 0.0557195246219635, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.2930000126361847, |
| "step": 161 |
| }, |
| { |
| "completion_length": 93.0, |
| "epoch": 0.021678040947410677, |
| "grad_norm": 0.6500325202941895, |
| "kl": 0.05201143026351929, |
| "learning_rate": 1.4400000000000001e-05, |
| "loss": 0.0021, |
| "reward": 0.17274999618530273, |
| "reward_std": 0.06422551721334457, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.17274999618530273, |
| "step": 162 |
| }, |
| { |
| "completion_length": 126.25, |
| "epoch": 0.021811856014987287, |
| "grad_norm": 0.44645747542381287, |
| "kl": 0.03292795643210411, |
| "learning_rate": 1.448888888888889e-05, |
| "loss": 0.0013, |
| "reward": 0.3970000147819519, |
| "reward_std": 0.22381241619586945, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.25, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.1469999998807907, |
| "step": 163 |
| }, |
| { |
| "completion_length": 79.75, |
| "epoch": 0.021945671082563897, |
| "grad_norm": 0.5969372391700745, |
| "kl": 0.06764098256826401, |
| "learning_rate": 1.457777777777778e-05, |
| "loss": 0.0027, |
| "reward": 0.1627500057220459, |
| "reward_std": 0.09626135975122452, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.1627500057220459, |
| "step": 164 |
| }, |
| { |
| "completion_length": 245.75, |
| "epoch": 0.022079486150140507, |
| "grad_norm": 0.42619219422340393, |
| "kl": 0.031476035714149475, |
| "learning_rate": 1.4666666666666666e-05, |
| "loss": 0.0013, |
| "reward": 0.5945000052452087, |
| "reward_std": 0.4530264735221863, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.375, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.21949999034404755, |
| "step": 165 |
| }, |
| { |
| "completion_length": 106.5, |
| "epoch": 0.022213301217717116, |
| "grad_norm": 0.6185816526412964, |
| "kl": 0.07715773582458496, |
| "learning_rate": 1.4755555555555556e-05, |
| "loss": 0.0031, |
| "reward": 0.5134999752044678, |
| "reward_std": 0.33077535033226013, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.375, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.13850000500679016, |
| "step": 166 |
| }, |
| { |
| "completion_length": 80.0, |
| "epoch": 0.022347116285293723, |
| "grad_norm": 0.5252732038497925, |
| "kl": 0.09305495023727417, |
| "learning_rate": 1.4844444444444445e-05, |
| "loss": 0.0037, |
| "reward": 0.23899999260902405, |
| "reward_std": 0.10646440088748932, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.23899999260902405, |
| "step": 167 |
| }, |
| { |
| "completion_length": 279.0, |
| "epoch": 0.022480931352870333, |
| "grad_norm": 0.260774165391922, |
| "kl": 0.04496491700410843, |
| "learning_rate": 1.4933333333333335e-05, |
| "loss": 0.0018, |
| "reward": 0.11250001192092896, |
| "reward_std": 1.9712051153182983, |
| "rewards/correctness_reward_func": 1.0, |
| "rewards/int_reward_func": 0.25, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -1.1374999284744263, |
| "step": 168 |
| }, |
| { |
| "completion_length": 108.25, |
| "epoch": 0.022614746420446943, |
| "grad_norm": 0.4881371259689331, |
| "kl": 0.05013870447874069, |
| "learning_rate": 1.5022222222222223e-05, |
| "loss": 0.002, |
| "reward": 2.03725004196167, |
| "reward_std": 1.0539665222167969, |
| "rewards/correctness_reward_func": 1.5, |
| "rewards/int_reward_func": 0.5, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.03725000470876694, |
| "step": 169 |
| }, |
| { |
| "completion_length": 99.25, |
| "epoch": 0.022748561488023553, |
| "grad_norm": 0.5610305070877075, |
| "kl": 0.043805379420518875, |
| "learning_rate": 1.5111111111111112e-05, |
| "loss": 0.0018, |
| "reward": 1.1447501182556152, |
| "reward_std": 1.0834035873413086, |
| "rewards/correctness_reward_func": 0.5, |
| "rewards/int_reward_func": 0.5, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.14474999904632568, |
| "step": 170 |
| }, |
| { |
| "completion_length": 94.75, |
| "epoch": 0.02288237655560016, |
| "grad_norm": 0.4598788917064667, |
| "kl": 0.0532003678381443, |
| "learning_rate": 1.5200000000000002e-05, |
| "loss": 0.0021, |
| "reward": 0.7122499942779541, |
| "reward_std": 0.19165311753749847, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.5, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.2122499942779541, |
| "step": 171 |
| }, |
| { |
| "completion_length": 137.25, |
| "epoch": 0.02301619162317677, |
| "grad_norm": 0.34965983033180237, |
| "kl": 0.03090197592973709, |
| "learning_rate": 1.528888888888889e-05, |
| "loss": 0.0012, |
| "reward": 0.4805000126361847, |
| "reward_std": 0.415036141872406, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.375, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.10550001263618469, |
| "step": 172 |
| }, |
| { |
| "completion_length": 207.0, |
| "epoch": 0.02315000669075338, |
| "grad_norm": 0.5801133513450623, |
| "kl": 0.02816297486424446, |
| "learning_rate": 1.537777777777778e-05, |
| "loss": 0.0011, |
| "reward": 0.07525002956390381, |
| "reward_std": 0.49743297696113586, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.375, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.2997499704360962, |
| "step": 173 |
| }, |
| { |
| "completion_length": 65.75, |
| "epoch": 0.02328382175832999, |
| "grad_norm": 0.6282637715339661, |
| "kl": 0.11807440221309662, |
| "learning_rate": 1.546666666666667e-05, |
| "loss": 0.0047, |
| "reward": 0.6984999775886536, |
| "reward_std": 0.19891957938671112, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.375, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.32350003719329834, |
| "step": 174 |
| }, |
| { |
| "completion_length": 70.5, |
| "epoch": 0.0234176368259066, |
| "grad_norm": 0.6953238248825073, |
| "kl": 0.069422647356987, |
| "learning_rate": 1.555555555555556e-05, |
| "loss": 0.0028, |
| "reward": 0.6110000014305115, |
| "reward_std": 0.4198293089866638, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.375, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.23600000143051147, |
| "step": 175 |
| }, |
| { |
| "completion_length": 86.5, |
| "epoch": 0.023551451893483205, |
| "grad_norm": 1.0817539691925049, |
| "kl": 0.08881973475217819, |
| "learning_rate": 1.5644444444444448e-05, |
| "loss": 0.0036, |
| "reward": 0.6867499947547913, |
| "reward_std": 0.059885308146476746, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.5, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.18674999475479126, |
| "step": 176 |
| }, |
| { |
| "completion_length": 122.5, |
| "epoch": 0.023685266961059815, |
| "grad_norm": 0.3636866509914398, |
| "kl": 0.056608811020851135, |
| "learning_rate": 1.5733333333333334e-05, |
| "loss": 0.0023, |
| "reward": 1.746999979019165, |
| "reward_std": 1.2025237083435059, |
| "rewards/correctness_reward_func": 1.0, |
| "rewards/int_reward_func": 0.5, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.24700000882148743, |
| "step": 177 |
| }, |
| { |
| "completion_length": 90.25, |
| "epoch": 0.023819082028636425, |
| "grad_norm": 0.7407636642456055, |
| "kl": 0.07338247448205948, |
| "learning_rate": 1.5822222222222224e-05, |
| "loss": 0.0029, |
| "reward": 0.5740000009536743, |
| "reward_std": 0.23982912302017212, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.375, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.19900000095367432, |
| "step": 178 |
| }, |
| { |
| "completion_length": 70.75, |
| "epoch": 0.023952897096213035, |
| "grad_norm": 0.731931746006012, |
| "kl": 0.08679507672786713, |
| "learning_rate": 1.5911111111111113e-05, |
| "loss": 0.0035, |
| "reward": 1.2517499923706055, |
| "reward_std": 0.9867871999740601, |
| "rewards/correctness_reward_func": 0.5, |
| "rewards/int_reward_func": 0.5, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.25174999237060547, |
| "step": 179 |
| }, |
| { |
| "completion_length": 87.75, |
| "epoch": 0.02408671216378964, |
| "grad_norm": 0.40641459822654724, |
| "kl": 0.05429335683584213, |
| "learning_rate": 1.6000000000000003e-05, |
| "loss": 0.0022, |
| "reward": 1.221750020980835, |
| "reward_std": 1.0200151205062866, |
| "rewards/correctness_reward_func": 0.5, |
| "rewards/int_reward_func": 0.5, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.22175000607967377, |
| "step": 180 |
| }, |
| { |
| "completion_length": 98.5, |
| "epoch": 0.02422052723136625, |
| "grad_norm": 0.3797653615474701, |
| "kl": 0.13556654751300812, |
| "learning_rate": 1.608888888888889e-05, |
| "loss": 0.0054, |
| "reward": 0.9449999928474426, |
| "reward_std": 1.1976945400238037, |
| "rewards/correctness_reward_func": 0.5, |
| "rewards/int_reward_func": 0.25, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.19500002264976501, |
| "step": 181 |
| }, |
| { |
| "completion_length": 93.5, |
| "epoch": 0.02435434229894286, |
| "grad_norm": 0.3987956643104553, |
| "kl": 0.06347194314002991, |
| "learning_rate": 1.617777777777778e-05, |
| "loss": 0.0025, |
| "reward": 0.5089999437332153, |
| "reward_std": 0.30833208560943604, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.375, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.1339999884366989, |
| "step": 182 |
| }, |
| { |
| "completion_length": 167.75, |
| "epoch": 0.02448815736651947, |
| "grad_norm": 0.3698064982891083, |
| "kl": 0.034620750695466995, |
| "learning_rate": 1.6266666666666668e-05, |
| "loss": 0.0014, |
| "reward": 0.3659999966621399, |
| "reward_std": 0.4141714572906494, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.5, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.1340000033378601, |
| "step": 183 |
| }, |
| { |
| "completion_length": 59.5, |
| "epoch": 0.02462197243409608, |
| "grad_norm": 1.24988853931427, |
| "kl": 0.14011864364147186, |
| "learning_rate": 1.6355555555555557e-05, |
| "loss": 0.0056, |
| "reward": 1.0192499160766602, |
| "reward_std": 1.1953402757644653, |
| "rewards/correctness_reward_func": 0.5, |
| "rewards/int_reward_func": 0.25, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.26924997568130493, |
| "step": 184 |
| }, |
| { |
| "completion_length": 80.25, |
| "epoch": 0.024755787501672687, |
| "grad_norm": 0.5655855536460876, |
| "kl": 0.05728255584836006, |
| "learning_rate": 1.6444444444444444e-05, |
| "loss": 0.0023, |
| "reward": 0.8680000305175781, |
| "reward_std": 1.225602626800537, |
| "rewards/correctness_reward_func": 0.5, |
| "rewards/int_reward_func": 0.125, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.24300000071525574, |
| "step": 185 |
| }, |
| { |
| "completion_length": 86.75, |
| "epoch": 0.024889602569249297, |
| "grad_norm": 0.7822229266166687, |
| "kl": 0.08518431335687637, |
| "learning_rate": 1.6533333333333333e-05, |
| "loss": 0.0034, |
| "reward": 0.5862500071525574, |
| "reward_std": 0.19938969612121582, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.5, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.08624999970197678, |
| "step": 186 |
| }, |
| { |
| "completion_length": 67.75, |
| "epoch": 0.025023417636825907, |
| "grad_norm": 0.5667008757591248, |
| "kl": 0.06875781714916229, |
| "learning_rate": 1.6622222222222223e-05, |
| "loss": 0.0028, |
| "reward": 1.2515000104904175, |
| "reward_std": 0.9990040063858032, |
| "rewards/correctness_reward_func": 0.5, |
| "rewards/int_reward_func": 0.5, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.2515000104904175, |
| "step": 187 |
| }, |
| { |
| "completion_length": 93.0, |
| "epoch": 0.025157232704402517, |
| "grad_norm": 0.456777960062027, |
| "kl": 0.06855905055999756, |
| "learning_rate": 1.6711111111111112e-05, |
| "loss": 0.0027, |
| "reward": 0.9950000047683716, |
| "reward_std": 1.1864756345748901, |
| "rewards/correctness_reward_func": 0.5, |
| "rewards/int_reward_func": 0.25, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.24500000476837158, |
| "step": 188 |
| }, |
| { |
| "completion_length": 103.5, |
| "epoch": 0.025291047771979124, |
| "grad_norm": 0.6969872713088989, |
| "kl": 0.09152114391326904, |
| "learning_rate": 1.6800000000000002e-05, |
| "loss": 0.0037, |
| "reward": 1.597749948501587, |
| "reward_std": 1.3118852376937866, |
| "rewards/correctness_reward_func": 1.0, |
| "rewards/int_reward_func": 0.5, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.09775000065565109, |
| "step": 189 |
| }, |
| { |
| "completion_length": 59.5, |
| "epoch": 0.025424862839555733, |
| "grad_norm": 0.9019510746002197, |
| "kl": 0.2068648785352707, |
| "learning_rate": 1.688888888888889e-05, |
| "loss": 0.0083, |
| "reward": 0.5332499742507935, |
| "reward_std": 0.2600556015968323, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.25, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.28325000405311584, |
| "step": 190 |
| }, |
| { |
| "completion_length": 121.75, |
| "epoch": 0.025558677907132343, |
| "grad_norm": 0.4399167001247406, |
| "kl": 0.09197084605693817, |
| "learning_rate": 1.697777777777778e-05, |
| "loss": 0.0037, |
| "reward": 0.5910000205039978, |
| "reward_std": 0.3367471992969513, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.5, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.09100000560283661, |
| "step": 191 |
| }, |
| { |
| "completion_length": 57.75, |
| "epoch": 0.025692492974708953, |
| "grad_norm": 0.47419285774230957, |
| "kl": 0.12339113652706146, |
| "learning_rate": 1.706666666666667e-05, |
| "loss": 0.0049, |
| "reward": 0.2979999780654907, |
| "reward_std": 0.032321300357580185, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.2979999780654907, |
| "step": 192 |
| }, |
| { |
| "completion_length": 73.0, |
| "epoch": 0.02582630804228556, |
| "grad_norm": 0.5090747475624084, |
| "kl": 0.11672109365463257, |
| "learning_rate": 1.7155555555555557e-05, |
| "loss": 0.0047, |
| "reward": 1.25, |
| "reward_std": 1.0, |
| "rewards/correctness_reward_func": 0.5, |
| "rewards/int_reward_func": 0.5, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.25, |
| "step": 193 |
| }, |
| { |
| "completion_length": 78.0, |
| "epoch": 0.02596012310986217, |
| "grad_norm": 0.5393621921539307, |
| "kl": 0.07206133008003235, |
| "learning_rate": 1.7244444444444446e-05, |
| "loss": 0.0029, |
| "reward": 2.25, |
| "reward_std": 1.0, |
| "rewards/correctness_reward_func": 1.5, |
| "rewards/int_reward_func": 0.5, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.25, |
| "step": 194 |
| }, |
| { |
| "completion_length": 76.25, |
| "epoch": 0.02609393817743878, |
| "grad_norm": 0.542004406452179, |
| "kl": 0.12011324614286423, |
| "learning_rate": 1.7333333333333336e-05, |
| "loss": 0.0048, |
| "reward": 0.625, |
| "reward_std": 0.25, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.375, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.25, |
| "step": 195 |
| }, |
| { |
| "completion_length": 106.0, |
| "epoch": 0.02622775324501539, |
| "grad_norm": 0.43443137407302856, |
| "kl": 0.053334061056375504, |
| "learning_rate": 1.7422222222222222e-05, |
| "loss": 0.0021, |
| "reward": 2.25, |
| "reward_std": 1.0, |
| "rewards/correctness_reward_func": 1.5, |
| "rewards/int_reward_func": 0.5, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.25, |
| "step": 196 |
| }, |
| { |
| "completion_length": 70.25, |
| "epoch": 0.026361568312592, |
| "grad_norm": 0.5421243906021118, |
| "kl": 0.14290496706962585, |
| "learning_rate": 1.751111111111111e-05, |
| "loss": 0.0057, |
| "reward": 1.1717499494552612, |
| "reward_std": 0.9182985424995422, |
| "rewards/correctness_reward_func": 0.5, |
| "rewards/int_reward_func": 0.5, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.1717499941587448, |
| "step": 197 |
| }, |
| { |
| "completion_length": 50.0, |
| "epoch": 0.026495383380168606, |
| "grad_norm": 0.5998945832252502, |
| "kl": 0.16523699462413788, |
| "learning_rate": 1.76e-05, |
| "loss": 0.0066, |
| "reward": 1.125, |
| "reward_std": 1.108677864074707, |
| "rewards/correctness_reward_func": 0.5, |
| "rewards/int_reward_func": 0.375, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.25, |
| "step": 198 |
| }, |
| { |
| "completion_length": 83.5, |
| "epoch": 0.026629198447745216, |
| "grad_norm": 0.08789081871509552, |
| "kl": 0.14140745997428894, |
| "learning_rate": 1.768888888888889e-05, |
| "loss": 0.0057, |
| "reward": 0.75, |
| "reward_std": 0.0, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.5, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.25, |
| "step": 199 |
| }, |
| { |
| "completion_length": 94.0, |
| "epoch": 0.026763013515321826, |
| "grad_norm": 0.8682589530944824, |
| "kl": 0.24527359008789062, |
| "learning_rate": 1.7777777777777777e-05, |
| "loss": 0.0098, |
| "reward": 0.715499997138977, |
| "reward_std": 0.15306098759174347, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.5, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.21549999713897705, |
| "step": 200 |
| }, |
| { |
| "completion_length": 85.75, |
| "epoch": 0.026896828582898435, |
| "grad_norm": 0.5800766944885254, |
| "kl": 0.1065763458609581, |
| "learning_rate": 1.7866666666666666e-05, |
| "loss": 0.0043, |
| "reward": 0.6587499976158142, |
| "reward_std": 0.11285203695297241, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.5, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.1587499976158142, |
| "step": 201 |
| }, |
| { |
| "completion_length": 72.0, |
| "epoch": 0.027030643650475042, |
| "grad_norm": 0.6035862565040588, |
| "kl": 0.11155687272548676, |
| "learning_rate": 1.7955555555555556e-05, |
| "loss": 0.0045, |
| "reward": 1.316499948501587, |
| "reward_std": 0.9577993750572205, |
| "rewards/correctness_reward_func": 0.5, |
| "rewards/int_reward_func": 0.5, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.3165000081062317, |
| "step": 202 |
| }, |
| { |
| "completion_length": 82.75, |
| "epoch": 0.027164458718051652, |
| "grad_norm": 0.44579821825027466, |
| "kl": 0.0726730078458786, |
| "learning_rate": 1.8044444444444445e-05, |
| "loss": 0.0029, |
| "reward": 1.25, |
| "reward_std": 1.0, |
| "rewards/correctness_reward_func": 0.5, |
| "rewards/int_reward_func": 0.5, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.25, |
| "step": 203 |
| }, |
| { |
| "completion_length": 69.0, |
| "epoch": 0.02729827378562826, |
| "grad_norm": 0.6042316555976868, |
| "kl": 0.10065613687038422, |
| "learning_rate": 1.8133333333333335e-05, |
| "loss": 0.004, |
| "reward": 0.7465000152587891, |
| "reward_std": 0.006999989040195942, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.5, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.24650000035762787, |
| "step": 204 |
| }, |
| { |
| "completion_length": 53.75, |
| "epoch": 0.02743208885320487, |
| "grad_norm": 0.9847255945205688, |
| "kl": 0.16128680109977722, |
| "learning_rate": 1.8222222222222224e-05, |
| "loss": 0.0065, |
| "reward": 0.2667500078678131, |
| "reward_std": 0.033500004559755325, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.2667500078678131, |
| "step": 205 |
| }, |
| { |
| "completion_length": 59.5, |
| "epoch": 0.02756590392078148, |
| "grad_norm": 0.642667829990387, |
| "kl": 0.1901407241821289, |
| "learning_rate": 1.8311111111111114e-05, |
| "loss": 0.0076, |
| "reward": 0.503000020980835, |
| "reward_std": 0.29218029975891113, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.25, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.2529999911785126, |
| "step": 206 |
| }, |
| { |
| "completion_length": 76.0, |
| "epoch": 0.027699718988358088, |
| "grad_norm": 1.0304243564605713, |
| "kl": 0.36415040493011475, |
| "learning_rate": 1.8400000000000003e-05, |
| "loss": 0.0146, |
| "reward": 2.25, |
| "reward_std": 1.0, |
| "rewards/correctness_reward_func": 1.5, |
| "rewards/int_reward_func": 0.5, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.25, |
| "step": 207 |
| }, |
| { |
| "completion_length": 67.0, |
| "epoch": 0.027833534055934698, |
| "grad_norm": 0.7824116945266724, |
| "kl": 0.35757210850715637, |
| "learning_rate": 1.848888888888889e-05, |
| "loss": 0.0143, |
| "reward": 1.444000005722046, |
| "reward_std": 1.5823231935501099, |
| "rewards/correctness_reward_func": 1.0, |
| "rewards/int_reward_func": 0.25, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.1940000057220459, |
| "step": 208 |
| }, |
| { |
| "completion_length": 66.0, |
| "epoch": 0.027967349123511308, |
| "grad_norm": 0.6381920576095581, |
| "kl": 0.16021761298179626, |
| "learning_rate": 1.857777777777778e-05, |
| "loss": 0.0064, |
| "reward": 0.7674999833106995, |
| "reward_std": 0.03348132595419884, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.5, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.26749998331069946, |
| "step": 209 |
| }, |
| { |
| "completion_length": 51.5, |
| "epoch": 0.028101164191087918, |
| "grad_norm": 0.029546145349740982, |
| "kl": 0.1807194948196411, |
| "learning_rate": 1.866666666666667e-05, |
| "loss": 0.0072, |
| "reward": 0.75, |
| "reward_std": 0.0, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.5, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.25, |
| "step": 210 |
| }, |
| { |
| "completion_length": 124.25, |
| "epoch": 0.028234979258664524, |
| "grad_norm": 1.192435622215271, |
| "kl": 0.16933509707450867, |
| "learning_rate": 1.8755555555555558e-05, |
| "loss": 0.0068, |
| "reward": 0.6417499780654907, |
| "reward_std": 0.12795408070087433, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.5, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.1417500078678131, |
| "step": 211 |
| }, |
| { |
| "completion_length": 61.75, |
| "epoch": 0.028368794326241134, |
| "grad_norm": 0.4982728660106659, |
| "kl": 0.10138452053070068, |
| "learning_rate": 1.8844444444444444e-05, |
| "loss": 0.0041, |
| "reward": 0.746999979019165, |
| "reward_std": 0.06240728497505188, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.5, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.24700000882148743, |
| "step": 212 |
| }, |
| { |
| "completion_length": 80.25, |
| "epoch": 0.028502609393817744, |
| "grad_norm": 0.4456157386302948, |
| "kl": 0.10433340817689896, |
| "learning_rate": 1.8933333333333334e-05, |
| "loss": 0.0042, |
| "reward": 1.25, |
| "reward_std": 1.0, |
| "rewards/correctness_reward_func": 0.5, |
| "rewards/int_reward_func": 0.5, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.25, |
| "step": 213 |
| }, |
| { |
| "completion_length": 146.25, |
| "epoch": 0.028636424461394354, |
| "grad_norm": 0.24999132752418518, |
| "kl": 0.06005653738975525, |
| "learning_rate": 1.9022222222222223e-05, |
| "loss": 0.0024, |
| "reward": 0.574999988079071, |
| "reward_std": 0.22246196866035461, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.375, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.20000000298023224, |
| "step": 214 |
| }, |
| { |
| "completion_length": 66.25, |
| "epoch": 0.028770239528970964, |
| "grad_norm": 0.7781875133514404, |
| "kl": 0.16475608944892883, |
| "learning_rate": 1.9111111111111113e-05, |
| "loss": 0.0066, |
| "reward": 0.746999979019165, |
| "reward_std": 0.10004331171512604, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.5, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.24700000882148743, |
| "step": 215 |
| }, |
| { |
| "completion_length": 39.75, |
| "epoch": 0.02890405459654757, |
| "grad_norm": 0.7784989476203918, |
| "kl": 0.14634883403778076, |
| "learning_rate": 1.9200000000000003e-05, |
| "loss": 0.0059, |
| "reward": 1.7787500619888306, |
| "reward_std": 1.170571208000183, |
| "rewards/correctness_reward_func": 1.0, |
| "rewards/int_reward_func": 0.5, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.2787500023841858, |
| "step": 216 |
| }, |
| { |
| "completion_length": 72.25, |
| "epoch": 0.02903786966412418, |
| "grad_norm": 0.4837649464607239, |
| "kl": 0.09058363735675812, |
| "learning_rate": 1.928888888888889e-05, |
| "loss": 0.0036, |
| "reward": 0.27175000309944153, |
| "reward_std": 0.3053712248802185, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.125, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.14675000309944153, |
| "step": 217 |
| }, |
| { |
| "completion_length": 80.25, |
| "epoch": 0.02917168473170079, |
| "grad_norm": 0.9711887836456299, |
| "kl": 0.07635970413684845, |
| "learning_rate": 1.9377777777777778e-05, |
| "loss": 0.0031, |
| "reward": 1.7442500591278076, |
| "reward_std": 1.0721337795257568, |
| "rewards/correctness_reward_func": 1.0, |
| "rewards/int_reward_func": 0.5, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.24424999952316284, |
| "step": 218 |
| }, |
| { |
| "completion_length": 70.5, |
| "epoch": 0.0293054997992774, |
| "grad_norm": 0.42600736021995544, |
| "kl": 0.06140553951263428, |
| "learning_rate": 1.9466666666666668e-05, |
| "loss": 0.0025, |
| "reward": 0.7387499809265137, |
| "reward_std": 0.05771408975124359, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.5, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.23874998092651367, |
| "step": 219 |
| }, |
| { |
| "completion_length": 217.5, |
| "epoch": 0.029439314866854006, |
| "grad_norm": 0.5537015199661255, |
| "kl": 0.05942856892943382, |
| "learning_rate": 1.9555555555555557e-05, |
| "loss": 0.0024, |
| "reward": 0.3062500059604645, |
| "reward_std": 0.3994557559490204, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.5, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.19374999403953552, |
| "step": 220 |
| }, |
| { |
| "completion_length": 135.75, |
| "epoch": 0.029573129934430616, |
| "grad_norm": 0.29336312413215637, |
| "kl": 0.052648480981588364, |
| "learning_rate": 1.9644444444444447e-05, |
| "loss": 0.0021, |
| "reward": 0.5582500100135803, |
| "reward_std": 0.29113730788230896, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.375, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.18325001001358032, |
| "step": 221 |
| }, |
| { |
| "completion_length": 62.0, |
| "epoch": 0.029706945002007226, |
| "grad_norm": 0.6704282760620117, |
| "kl": 0.09709338843822479, |
| "learning_rate": 1.9733333333333336e-05, |
| "loss": 0.0039, |
| "reward": 0.7619999647140503, |
| "reward_std": 0.038531359285116196, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.5, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.2619999945163727, |
| "step": 222 |
| }, |
| { |
| "completion_length": 116.5, |
| "epoch": 0.029840760069583836, |
| "grad_norm": 0.2809985876083374, |
| "kl": 0.11135183274745941, |
| "learning_rate": 1.9822222222222226e-05, |
| "loss": 0.0045, |
| "reward": 0.5497499704360962, |
| "reward_std": 0.36881014704704285, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.375, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.17475000023841858, |
| "step": 223 |
| }, |
| { |
| "completion_length": 60.5, |
| "epoch": 0.029974575137160443, |
| "grad_norm": 0.6438567638397217, |
| "kl": 0.133337140083313, |
| "learning_rate": 1.9911111111111112e-05, |
| "loss": 0.0053, |
| "reward": 1.3017499446868896, |
| "reward_std": 0.9685468077659607, |
| "rewards/correctness_reward_func": 0.5, |
| "rewards/int_reward_func": 0.5, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.3017500042915344, |
| "step": 224 |
| }, |
| { |
| "completion_length": 74.0, |
| "epoch": 0.030108390204737052, |
| "grad_norm": 0.4634908139705658, |
| "kl": 0.08658132702112198, |
| "learning_rate": 2e-05, |
| "loss": 0.0035, |
| "reward": 1.0735000371932983, |
| "reward_std": 1.0860450267791748, |
| "rewards/correctness_reward_func": 0.5, |
| "rewards/int_reward_func": 0.375, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.19850000739097595, |
| "step": 225 |
| }, |
| { |
| "completion_length": 61.5, |
| "epoch": 0.030242205272313662, |
| "grad_norm": 0.6003581285476685, |
| "kl": 0.12208070605993271, |
| "learning_rate": 1.9999999060637166e-05, |
| "loss": 0.0049, |
| "reward": 0.7789999842643738, |
| "reward_std": 0.06427544355392456, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.5, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.2789999842643738, |
| "step": 226 |
| }, |
| { |
| "completion_length": 73.5, |
| "epoch": 0.030376020339890272, |
| "grad_norm": 0.5231612920761108, |
| "kl": 0.15073804557323456, |
| "learning_rate": 1.9999996242548837e-05, |
| "loss": 0.006, |
| "reward": 1.16225004196167, |
| "reward_std": 1.0848207473754883, |
| "rewards/correctness_reward_func": 0.5, |
| "rewards/int_reward_func": 0.375, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.28724998235702515, |
| "step": 227 |
| }, |
| { |
| "completion_length": 72.5, |
| "epoch": 0.030509835407466882, |
| "grad_norm": 0.4123309552669525, |
| "kl": 0.07140478491783142, |
| "learning_rate": 1.999999154573555e-05, |
| "loss": 0.0029, |
| "reward": 0.7667500376701355, |
| "reward_std": 0.04211394116282463, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.5, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.2667500078678131, |
| "step": 228 |
| }, |
| { |
| "completion_length": 88.5, |
| "epoch": 0.03064365047504349, |
| "grad_norm": 0.5252578854560852, |
| "kl": 0.03713103383779526, |
| "learning_rate": 1.9999984970198176e-05, |
| "loss": 0.0015, |
| "reward": 1.5755000114440918, |
| "reward_std": 1.2587233781814575, |
| "rewards/correctness_reward_func": 1.0, |
| "rewards/int_reward_func": 0.375, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.2004999965429306, |
| "step": 229 |
| }, |
| { |
| "completion_length": 145.0, |
| "epoch": 0.0307774655426201, |
| "grad_norm": 0.4767831563949585, |
| "kl": 0.07187433540821075, |
| "learning_rate": 1.999997651593796e-05, |
| "loss": 0.0029, |
| "reward": 0.4542499780654907, |
| "reward_std": 0.4314698874950409, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.5, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.045750007033348083, |
| "step": 230 |
| }, |
| { |
| "completion_length": 102.75, |
| "epoch": 0.03091128061019671, |
| "grad_norm": 0.4800109267234802, |
| "kl": 0.07859396934509277, |
| "learning_rate": 1.9999966182956486e-05, |
| "loss": 0.0031, |
| "reward": 0.6054999828338623, |
| "reward_std": 0.21153488755226135, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.5, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.1054999977350235, |
| "step": 231 |
| }, |
| { |
| "completion_length": 190.25, |
| "epoch": 0.03104509567777332, |
| "grad_norm": 0.20692415535449982, |
| "kl": 0.04673830792307854, |
| "learning_rate": 1.9999953971255692e-05, |
| "loss": 0.0019, |
| "reward": 0.15949998795986176, |
| "reward_std": 0.31504231691360474, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.125, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.03450000286102295, |
| "step": 232 |
| }, |
| { |
| "completion_length": 68.0, |
| "epoch": 0.031178910745349925, |
| "grad_norm": 0.5570704936981201, |
| "kl": 0.08012831211090088, |
| "learning_rate": 1.999993988083788e-05, |
| "loss": 0.0032, |
| "reward": 2.252500057220459, |
| "reward_std": 0.95698082447052, |
| "rewards/correctness_reward_func": 1.5, |
| "rewards/int_reward_func": 0.5, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.2524999976158142, |
| "step": 233 |
| }, |
| { |
| "completion_length": 131.5, |
| "epoch": 0.03131272581292654, |
| "grad_norm": 0.4668695628643036, |
| "kl": 0.0665660873055458, |
| "learning_rate": 1.9999923911705693e-05, |
| "loss": 0.0027, |
| "reward": 1.5532499551773071, |
| "reward_std": 1.3141237497329712, |
| "rewards/correctness_reward_func": 1.0, |
| "rewards/int_reward_func": 0.375, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.17824998497962952, |
| "step": 234 |
| }, |
| { |
| "completion_length": 73.25, |
| "epoch": 0.031446540880503145, |
| "grad_norm": 0.5236088633537292, |
| "kl": 0.14625723659992218, |
| "learning_rate": 1.9999906063862128e-05, |
| "loss": 0.0059, |
| "reward": 1.7309999465942383, |
| "reward_std": 1.0541173219680786, |
| "rewards/correctness_reward_func": 1.0, |
| "rewards/int_reward_func": 0.5, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.23100000619888306, |
| "step": 235 |
| }, |
| { |
| "completion_length": 114.25, |
| "epoch": 0.03158035594807975, |
| "grad_norm": 0.45700567960739136, |
| "kl": 0.08767452090978622, |
| "learning_rate": 1.9999886337310546e-05, |
| "loss": 0.0035, |
| "reward": 0.4410000145435333, |
| "reward_std": 0.4759082794189453, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.375, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.06599999964237213, |
| "step": 236 |
| }, |
| { |
| "completion_length": 208.0, |
| "epoch": 0.031714171015656364, |
| "grad_norm": 0.45323336124420166, |
| "kl": 0.0659441202878952, |
| "learning_rate": 1.999986473205465e-05, |
| "loss": 0.0026, |
| "reward": 0.2602500021457672, |
| "reward_std": 0.6481220722198486, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.5, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.2397499978542328, |
| "step": 237 |
| }, |
| { |
| "completion_length": 140.25, |
| "epoch": 0.03184798608323297, |
| "grad_norm": 0.4077318608760834, |
| "kl": 0.05185340344905853, |
| "learning_rate": 1.999984124809849e-05, |
| "loss": 0.0021, |
| "reward": 1.4795000553131104, |
| "reward_std": 1.379389762878418, |
| "rewards/correctness_reward_func": 1.0, |
| "rewards/int_reward_func": 0.375, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.10450000315904617, |
| "step": 238 |
| }, |
| { |
| "completion_length": 70.0, |
| "epoch": 0.031981801150809584, |
| "grad_norm": 0.926057755947113, |
| "kl": 0.11616024374961853, |
| "learning_rate": 1.9999815885446497e-05, |
| "loss": 0.0046, |
| "reward": 0.9787499904632568, |
| "reward_std": 0.3524896502494812, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.5, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.125, |
| "rewards/xmlcount_reward_func": 0.35374999046325684, |
| "step": 239 |
| }, |
| { |
| "completion_length": 293.5, |
| "epoch": 0.03211561621838619, |
| "grad_norm": 0.394625723361969, |
| "kl": 0.017029505223035812, |
| "learning_rate": 1.9999788644103418e-05, |
| "loss": 0.0007, |
| "reward": -0.1302500218153, |
| "reward_std": 0.6652750968933105, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.5, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.6302499771118164, |
| "step": 240 |
| }, |
| { |
| "completion_length": 135.0, |
| "epoch": 0.0322494312859628, |
| "grad_norm": 0.2847868502140045, |
| "kl": 0.09572663903236389, |
| "learning_rate": 1.9999759524074374e-05, |
| "loss": 0.0038, |
| "reward": 0.7019999623298645, |
| "reward_std": 0.22655829787254333, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.5, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.2019999921321869, |
| "step": 241 |
| }, |
| { |
| "completion_length": 112.25, |
| "epoch": 0.03238324635353941, |
| "grad_norm": 0.5767569541931152, |
| "kl": 0.12632526457309723, |
| "learning_rate": 1.9999728525364848e-05, |
| "loss": 0.0051, |
| "reward": 0.6430000066757202, |
| "reward_std": 0.35005998611450195, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.5, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.14300000667572021, |
| "step": 242 |
| }, |
| { |
| "completion_length": 126.5, |
| "epoch": 0.03251706142111602, |
| "grad_norm": 0.3967666029930115, |
| "kl": 0.11589077860116959, |
| "learning_rate": 1.999969564798065e-05, |
| "loss": 0.0046, |
| "reward": 1.4282499551773071, |
| "reward_std": 0.553694486618042, |
| "rewards/correctness_reward_func": 0.5, |
| "rewards/int_reward_func": 0.5, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.25, |
| "rewards/xmlcount_reward_func": 0.1782499998807907, |
| "step": 243 |
| }, |
| { |
| "completion_length": 134.0, |
| "epoch": 0.03265087648869262, |
| "grad_norm": 0.38813862204551697, |
| "kl": 0.06340669095516205, |
| "learning_rate": 1.999966089192796e-05, |
| "loss": 0.0025, |
| "reward": 1.2892500162124634, |
| "reward_std": 1.5317614078521729, |
| "rewards/correctness_reward_func": 0.5, |
| "rewards/int_reward_func": 0.375, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.125, |
| "rewards/xmlcount_reward_func": 0.2892500162124634, |
| "step": 244 |
| }, |
| { |
| "completion_length": 109.75, |
| "epoch": 0.03278469155626924, |
| "grad_norm": 0.42066898941993713, |
| "kl": 0.12392938882112503, |
| "learning_rate": 1.9999624257213318e-05, |
| "loss": 0.005, |
| "reward": 1.7067500352859497, |
| "reward_std": 1.3346679210662842, |
| "rewards/correctness_reward_func": 1.0, |
| "rewards/int_reward_func": 0.5, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.20675000548362732, |
| "step": 245 |
| }, |
| { |
| "completion_length": 102.0, |
| "epoch": 0.03291850662384584, |
| "grad_norm": 0.704933762550354, |
| "kl": 0.12700574100017548, |
| "learning_rate": 1.9999585743843592e-05, |
| "loss": 0.0051, |
| "reward": 0.6620000004768372, |
| "reward_std": 0.6284016370773315, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.375, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.125, |
| "rewards/xmlcount_reward_func": 0.16200000047683716, |
| "step": 246 |
| }, |
| { |
| "completion_length": 65.75, |
| "epoch": 0.03305232169142246, |
| "grad_norm": 0.557697057723999, |
| "kl": 0.16324672102928162, |
| "learning_rate": 1.9999545351826028e-05, |
| "loss": 0.0065, |
| "reward": 1.4605000019073486, |
| "reward_std": 1.3599356412887573, |
| "rewards/correctness_reward_func": 0.5, |
| "rewards/int_reward_func": 0.5, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.125, |
| "rewards/xmlcount_reward_func": 0.335500031709671, |
| "step": 247 |
| }, |
| { |
| "completion_length": 50.25, |
| "epoch": 0.03318613675899906, |
| "grad_norm": 0.8064447641372681, |
| "kl": 0.2082303762435913, |
| "learning_rate": 1.9999503081168205e-05, |
| "loss": 0.0083, |
| "reward": 0.7957500219345093, |
| "reward_std": 0.09577882289886475, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.5, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.2957499921321869, |
| "step": 248 |
| }, |
| { |
| "completion_length": 92.0, |
| "epoch": 0.03331995182657567, |
| "grad_norm": 0.8607310652732849, |
| "kl": 0.11370626091957092, |
| "learning_rate": 1.999945893187807e-05, |
| "loss": 0.0045, |
| "reward": 1.3242499828338623, |
| "reward_std": 0.351500004529953, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.5, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.375, |
| "rewards/xmlcount_reward_func": 0.4492499828338623, |
| "step": 249 |
| }, |
| { |
| "completion_length": 120.5, |
| "epoch": 0.03345376689415228, |
| "grad_norm": 0.46249887347221375, |
| "kl": 0.09916168451309204, |
| "learning_rate": 1.9999412903963925e-05, |
| "loss": 0.004, |
| "reward": 0.6180000305175781, |
| "reward_std": 0.44016435742378235, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.375, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.24300000071525574, |
| "step": 250 |
| }, |
| { |
| "completion_length": 121.75, |
| "epoch": 0.03358758196172889, |
| "grad_norm": 0.7673288583755493, |
| "kl": 0.14884337782859802, |
| "learning_rate": 1.9999364997434406e-05, |
| "loss": 0.006, |
| "reward": 0.843999981880188, |
| "reward_std": 1.0066499710083008, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.5, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.25, |
| "rewards/xmlcount_reward_func": 0.09400001168251038, |
| "step": 251 |
| }, |
| { |
| "completion_length": 67.5, |
| "epoch": 0.0337213970293055, |
| "grad_norm": 0.6919443607330322, |
| "kl": 0.20414642989635468, |
| "learning_rate": 1.9999315212298516e-05, |
| "loss": 0.0082, |
| "reward": 2.0, |
| "reward_std": 1.0, |
| "rewards/correctness_reward_func": 0.5, |
| "rewards/int_reward_func": 0.5, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.5, |
| "rewards/xmlcount_reward_func": 0.5, |
| "step": 252 |
| }, |
| { |
| "completion_length": 105.0, |
| "epoch": 0.03385521209688211, |
| "grad_norm": 0.4287322461605072, |
| "kl": 0.16063739359378815, |
| "learning_rate": 1.999926354856561e-05, |
| "loss": 0.0064, |
| "reward": 1.5625, |
| "reward_std": 0.9213893413543701, |
| "rewards/correctness_reward_func": 0.5, |
| "rewards/int_reward_func": 0.5, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.25, |
| "rewards/xmlcount_reward_func": 0.3125, |
| "step": 253 |
| }, |
| { |
| "completion_length": 109.75, |
| "epoch": 0.033989027164458716, |
| "grad_norm": 0.5036614537239075, |
| "kl": 0.10087580978870392, |
| "learning_rate": 1.9999210006245395e-05, |
| "loss": 0.004, |
| "reward": 1.224250078201294, |
| "reward_std": 0.850592851638794, |
| "rewards/correctness_reward_func": 0.5, |
| "rewards/int_reward_func": 0.5, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.22424998879432678, |
| "step": 254 |
| }, |
| { |
| "completion_length": 102.5, |
| "epoch": 0.03412284223203533, |
| "grad_norm": 0.4636721909046173, |
| "kl": 0.10581967979669571, |
| "learning_rate": 1.9999154585347926e-05, |
| "loss": 0.0042, |
| "reward": 1.4709999561309814, |
| "reward_std": 1.0897164344787598, |
| "rewards/correctness_reward_func": 0.5, |
| "rewards/int_reward_func": 0.5, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.125, |
| "rewards/xmlcount_reward_func": 0.3460000157356262, |
| "step": 255 |
| }, |
| { |
| "completion_length": 72.25, |
| "epoch": 0.034256657299611935, |
| "grad_norm": 0.03021317906677723, |
| "kl": 0.1458757519721985, |
| "learning_rate": 1.999909728588362e-05, |
| "loss": 0.0058, |
| "reward": 1.5, |
| "reward_std": 0.0, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.5, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.5, |
| "rewards/xmlcount_reward_func": 0.5, |
| "step": 256 |
| }, |
| { |
| "completion_length": 117.25, |
| "epoch": 0.03439047236718855, |
| "grad_norm": 0.4804219901561737, |
| "kl": 0.1489054560661316, |
| "learning_rate": 1.999903810786324e-05, |
| "loss": 0.006, |
| "reward": 1.563499927520752, |
| "reward_std": 0.5171675682067871, |
| "rewards/correctness_reward_func": 0.5, |
| "rewards/int_reward_func": 0.5, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.25, |
| "rewards/xmlcount_reward_func": 0.31349998712539673, |
| "step": 257 |
| }, |
| { |
| "completion_length": 132.0, |
| "epoch": 0.034524287434765155, |
| "grad_norm": 0.31325674057006836, |
| "kl": 0.08090350031852722, |
| "learning_rate": 1.99989770512979e-05, |
| "loss": 0.0032, |
| "reward": 1.2827500104904175, |
| "reward_std": 1.164240837097168, |
| "rewards/correctness_reward_func": 0.5, |
| "rewards/int_reward_func": 0.5, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.2827500104904175, |
| "step": 258 |
| }, |
| { |
| "completion_length": 90.75, |
| "epoch": 0.03465810250234176, |
| "grad_norm": 0.40309247374534607, |
| "kl": 0.1473146677017212, |
| "learning_rate": 1.999891411619908e-05, |
| "loss": 0.0059, |
| "reward": 3.0, |
| "reward_std": 1.0, |
| "rewards/correctness_reward_func": 1.5, |
| "rewards/int_reward_func": 0.5, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.5, |
| "rewards/xmlcount_reward_func": 0.5, |
| "step": 259 |
| }, |
| { |
| "completion_length": 194.75, |
| "epoch": 0.034791917569918375, |
| "grad_norm": 0.30801522731781006, |
| "kl": 0.05805174261331558, |
| "learning_rate": 1.9998849302578597e-05, |
| "loss": 0.0023, |
| "reward": 1.8217499256134033, |
| "reward_std": 1.0732003450393677, |
| "rewards/correctness_reward_func": 1.0, |
| "rewards/int_reward_func": 0.5, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.125, |
| "rewards/xmlcount_reward_func": 0.1967500001192093, |
| "step": 260 |
| }, |
| { |
| "completion_length": 84.25, |
| "epoch": 0.03492573263749498, |
| "grad_norm": 0.7973276376724243, |
| "kl": 0.12621484696865082, |
| "learning_rate": 1.9998782610448625e-05, |
| "loss": 0.005, |
| "reward": 2.209249973297119, |
| "reward_std": 0.821670413017273, |
| "rewards/correctness_reward_func": 1.0, |
| "rewards/int_reward_func": 0.5, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.25, |
| "rewards/xmlcount_reward_func": 0.45925000309944153, |
| "step": 261 |
| }, |
| { |
| "completion_length": 182.25, |
| "epoch": 0.03505954770507159, |
| "grad_norm": 0.40601614117622375, |
| "kl": 0.1828838288784027, |
| "learning_rate": 1.9998714039821703e-05, |
| "loss": 0.0073, |
| "reward": 1.312999963760376, |
| "reward_std": 1.4628314971923828, |
| "rewards/correctness_reward_func": 0.5, |
| "rewards/int_reward_func": 0.5, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.125, |
| "rewards/xmlcount_reward_func": 0.18799999356269836, |
| "step": 262 |
| }, |
| { |
| "completion_length": 125.25, |
| "epoch": 0.0351933627726482, |
| "grad_norm": 0.4140985608100891, |
| "kl": 0.0889941155910492, |
| "learning_rate": 1.9998643590710707e-05, |
| "loss": 0.0036, |
| "reward": 3.125, |
| "reward_std": 0.4787135720252991, |
| "rewards/correctness_reward_func": 2.0, |
| "rewards/int_reward_func": 0.5, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.25, |
| "rewards/xmlcount_reward_func": 0.375, |
| "step": 263 |
| }, |
| { |
| "completion_length": 295.75, |
| "epoch": 0.03532717784022481, |
| "grad_norm": 0.24423445761203766, |
| "kl": 0.05636545270681381, |
| "learning_rate": 1.9998571263128873e-05, |
| "loss": 0.0023, |
| "reward": 0.24674999713897705, |
| "reward_std": 0.20656456053256989, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.375, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.12825000286102295, |
| "step": 264 |
| }, |
| { |
| "completion_length": 177.75, |
| "epoch": 0.03546099290780142, |
| "grad_norm": 0.4541739821434021, |
| "kl": 0.09995388984680176, |
| "learning_rate": 1.999849705708979e-05, |
| "loss": 0.004, |
| "reward": 0.7145000100135803, |
| "reward_std": 0.2543770372867584, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.5, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.21450001001358032, |
| "step": 265 |
| }, |
| { |
| "completion_length": 250.0, |
| "epoch": 0.03559480797537803, |
| "grad_norm": 0.2651154398918152, |
| "kl": 0.0614202618598938, |
| "learning_rate": 1.99984209726074e-05, |
| "loss": 0.0025, |
| "reward": 0.718500018119812, |
| "reward_std": 0.25749239325523376, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.5, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.21850000321865082, |
| "step": 266 |
| }, |
| { |
| "completion_length": 151.75, |
| "epoch": 0.035728623042954634, |
| "grad_norm": 0.34357938170433044, |
| "kl": 0.11051115393638611, |
| "learning_rate": 1.9998343009695995e-05, |
| "loss": 0.0044, |
| "reward": 1.125, |
| "reward_std": 0.9464846849441528, |
| "rewards/correctness_reward_func": 0.5, |
| "rewards/int_reward_func": 0.5, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.125, |
| "step": 267 |
| }, |
| { |
| "completion_length": 178.75, |
| "epoch": 0.03586243811053125, |
| "grad_norm": 0.6356491446495056, |
| "kl": 0.16408377885818481, |
| "learning_rate": 1.9998263168370228e-05, |
| "loss": 0.0066, |
| "reward": 0.4662500023841858, |
| "reward_std": 0.26134318113327026, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.5, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.03375000134110451, |
| "step": 268 |
| }, |
| { |
| "completion_length": 114.75, |
| "epoch": 0.035996253178107854, |
| "grad_norm": 0.5377089977264404, |
| "kl": 0.16050302982330322, |
| "learning_rate": 1.9998181448645087e-05, |
| "loss": 0.0064, |
| "reward": 2.5269999504089355, |
| "reward_std": 1.4152441024780273, |
| "rewards/correctness_reward_func": 1.5, |
| "rewards/int_reward_func": 0.5, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.25, |
| "rewards/xmlcount_reward_func": 0.2770000100135803, |
| "step": 269 |
| }, |
| { |
| "completion_length": 123.0, |
| "epoch": 0.03613006824568447, |
| "grad_norm": 0.3413686752319336, |
| "kl": 0.0774858370423317, |
| "learning_rate": 1.999809785053594e-05, |
| "loss": 0.0031, |
| "reward": 2.625, |
| "reward_std": 0.8539125919342041, |
| "rewards/correctness_reward_func": 1.5, |
| "rewards/int_reward_func": 0.5, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.25, |
| "rewards/xmlcount_reward_func": 0.375, |
| "step": 270 |
| }, |
| { |
| "completion_length": 78.0, |
| "epoch": 0.036263883313261074, |
| "grad_norm": 0.6230402588844299, |
| "kl": 0.14687515795230865, |
| "learning_rate": 1.999801237405848e-05, |
| "loss": 0.0059, |
| "reward": 3.0, |
| "reward_std": 1.0, |
| "rewards/correctness_reward_func": 1.5, |
| "rewards/int_reward_func": 0.5, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.5, |
| "rewards/xmlcount_reward_func": 0.5, |
| "step": 271 |
| }, |
| { |
| "completion_length": 240.5, |
| "epoch": 0.03639769838083768, |
| "grad_norm": 0.36297154426574707, |
| "kl": 0.09850015491247177, |
| "learning_rate": 1.9997925019228775e-05, |
| "loss": 0.0039, |
| "reward": 0.5189999938011169, |
| "reward_std": 0.9096706509590149, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.375, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.125, |
| "rewards/xmlcount_reward_func": 0.018999993801116943, |
| "step": 272 |
| }, |
| { |
| "completion_length": 167.75, |
| "epoch": 0.03653151344841429, |
| "grad_norm": 0.2768608629703522, |
| "kl": 0.06528370082378387, |
| "learning_rate": 1.999783578606323e-05, |
| "loss": 0.0026, |
| "reward": 1.0750000476837158, |
| "reward_std": 1.1177325248718262, |
| "rewards/correctness_reward_func": 0.5, |
| "rewards/int_reward_func": 0.5, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.07500000298023224, |
| "step": 273 |
| }, |
| { |
| "completion_length": 205.25, |
| "epoch": 0.0366653285159909, |
| "grad_norm": 0.24607378244400024, |
| "kl": 0.0492008738219738, |
| "learning_rate": 1.9997744674578615e-05, |
| "loss": 0.002, |
| "reward": 1.375, |
| "reward_std": 1.25, |
| "rewards/correctness_reward_func": 0.5, |
| "rewards/int_reward_func": 0.375, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.125, |
| "rewards/xmlcount_reward_func": 0.375, |
| "step": 274 |
| }, |
| { |
| "completion_length": 123.75, |
| "epoch": 0.036799143583567506, |
| "grad_norm": 0.3026469647884369, |
| "kl": 0.11610330641269684, |
| "learning_rate": 1.9997651684792042e-05, |
| "loss": 0.0046, |
| "reward": 2.647249937057495, |
| "reward_std": 0.850723385810852, |
| "rewards/correctness_reward_func": 1.5, |
| "rewards/int_reward_func": 0.5, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.25, |
| "rewards/xmlcount_reward_func": 0.3972499966621399, |
| "step": 275 |
| }, |
| { |
| "completion_length": 191.75, |
| "epoch": 0.03693295865114412, |
| "grad_norm": 0.2089061737060547, |
| "kl": 0.05618685111403465, |
| "learning_rate": 1.9997556816720985e-05, |
| "loss": 0.0022, |
| "reward": 1.6702499389648438, |
| "reward_std": 1.4511858224868774, |
| "rewards/correctness_reward_func": 1.0, |
| "rewards/int_reward_func": 0.5, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.125, |
| "rewards/xmlcount_reward_func": 0.045249998569488525, |
| "step": 276 |
| }, |
| { |
| "completion_length": 207.25, |
| "epoch": 0.037066773718720726, |
| "grad_norm": 0.25519034266471863, |
| "kl": 0.030937891453504562, |
| "learning_rate": 1.9997460070383264e-05, |
| "loss": 0.0012, |
| "reward": 0.9837499856948853, |
| "reward_std": 0.9404298067092896, |
| "rewards/correctness_reward_func": 0.5, |
| "rewards/int_reward_func": 0.5, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.01625000685453415, |
| "step": 277 |
| }, |
| { |
| "completion_length": 110.5, |
| "epoch": 0.03720058878629734, |
| "grad_norm": 0.2822895050048828, |
| "kl": 0.06590020656585693, |
| "learning_rate": 1.9997361445797058e-05, |
| "loss": 0.0026, |
| "reward": 2.0512499809265137, |
| "reward_std": 1.4043407440185547, |
| "rewards/correctness_reward_func": 1.0, |
| "rewards/int_reward_func": 0.5, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.125, |
| "rewards/xmlcount_reward_func": 0.42625001072883606, |
| "step": 278 |
| }, |
| { |
| "completion_length": 195.0, |
| "epoch": 0.037334403853873946, |
| "grad_norm": 0.3637947738170624, |
| "kl": 0.08848227560520172, |
| "learning_rate": 1.9997260942980895e-05, |
| "loss": 0.0035, |
| "reward": 1.3247499465942383, |
| "reward_std": 0.9170319437980652, |
| "rewards/correctness_reward_func": 1.0, |
| "rewards/int_reward_func": 0.375, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.125, |
| "rewards/xmlcount_reward_func": -0.17524999380111694, |
| "step": 279 |
| }, |
| { |
| "completion_length": 155.75, |
| "epoch": 0.03746821892145055, |
| "grad_norm": 0.3750488758087158, |
| "kl": 0.0733165591955185, |
| "learning_rate": 1.9997158561953655e-05, |
| "loss": 0.0029, |
| "reward": 2.0625, |
| "reward_std": 1.328768253326416, |
| "rewards/correctness_reward_func": 1.0, |
| "rewards/int_reward_func": 0.5, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.25, |
| "rewards/xmlcount_reward_func": 0.3125, |
| "step": 280 |
| }, |
| { |
| "completion_length": 173.75, |
| "epoch": 0.037602033989027166, |
| "grad_norm": 0.2433202862739563, |
| "kl": 0.07221105694770813, |
| "learning_rate": 1.9997054302734576e-05, |
| "loss": 0.0029, |
| "reward": 3.1524999141693115, |
| "reward_std": 0.4316924810409546, |
| "rewards/correctness_reward_func": 2.0, |
| "rewards/int_reward_func": 0.5, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.25, |
| "rewards/xmlcount_reward_func": 0.4025000035762787, |
| "step": 281 |
| }, |
| { |
| "completion_length": 173.25, |
| "epoch": 0.03773584905660377, |
| "grad_norm": 0.2556951940059662, |
| "kl": 0.043353110551834106, |
| "learning_rate": 1.9996948165343243e-05, |
| "loss": 0.0017, |
| "reward": 0.7929999828338623, |
| "reward_std": 0.29227155447006226, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.5, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.2929999828338623, |
| "step": 282 |
| }, |
| { |
| "completion_length": 127.0, |
| "epoch": 0.037869664124180386, |
| "grad_norm": 0.2885083556175232, |
| "kl": 0.06136604771018028, |
| "learning_rate": 1.9996840149799594e-05, |
| "loss": 0.0025, |
| "reward": 1.2899999618530273, |
| "reward_std": 0.9839661717414856, |
| "rewards/correctness_reward_func": 0.5, |
| "rewards/int_reward_func": 0.5, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.2900000214576721, |
| "step": 283 |
| }, |
| { |
| "completion_length": 119.75, |
| "epoch": 0.03800347919175699, |
| "grad_norm": 0.4060731828212738, |
| "kl": 0.0710485652089119, |
| "learning_rate": 1.9996730256123925e-05, |
| "loss": 0.0028, |
| "reward": 1.2477500438690186, |
| "reward_std": 0.7362886667251587, |
| "rewards/correctness_reward_func": 0.5, |
| "rewards/int_reward_func": 0.5, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.24774999916553497, |
| "step": 284 |
| }, |
| { |
| "completion_length": 117.25, |
| "epoch": 0.0381372942593336, |
| "grad_norm": 0.32145437598228455, |
| "kl": 0.06920292973518372, |
| "learning_rate": 1.9996618484336885e-05, |
| "loss": 0.0028, |
| "reward": 1.3524999618530273, |
| "reward_std": 0.9444156885147095, |
| "rewards/correctness_reward_func": 0.5, |
| "rewards/int_reward_func": 0.5, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.3525000214576721, |
| "step": 285 |
| }, |
| { |
| "completion_length": 145.0, |
| "epoch": 0.03827110932691021, |
| "grad_norm": 0.29436415433883667, |
| "kl": 0.06218549609184265, |
| "learning_rate": 1.9996504834459467e-05, |
| "loss": 0.0025, |
| "reward": 1.503749966621399, |
| "reward_std": 1.1761995553970337, |
| "rewards/correctness_reward_func": 1.0, |
| "rewards/int_reward_func": 0.25, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.125, |
| "rewards/xmlcount_reward_func": 0.1287499964237213, |
| "step": 286 |
| }, |
| { |
| "completion_length": 87.0, |
| "epoch": 0.03840492439448682, |
| "grad_norm": 0.520235538482666, |
| "kl": 0.13112740218639374, |
| "learning_rate": 1.9996389306513028e-05, |
| "loss": 0.0052, |
| "reward": 2.296999931335449, |
| "reward_std": 0.8679065704345703, |
| "rewards/correctness_reward_func": 1.5, |
| "rewards/int_reward_func": 0.375, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.125, |
| "rewards/xmlcount_reward_func": 0.296999990940094, |
| "step": 287 |
| }, |
| { |
| "completion_length": 200.25, |
| "epoch": 0.03853873946206343, |
| "grad_norm": 0.21292062103748322, |
| "kl": 0.03701108694076538, |
| "learning_rate": 1.9996271900519267e-05, |
| "loss": 0.0015, |
| "reward": 0.9787499904632568, |
| "reward_std": 0.8390643000602722, |
| "rewards/correctness_reward_func": 0.5, |
| "rewards/int_reward_func": 0.5, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.02124999463558197, |
| "step": 288 |
| }, |
| { |
| "completion_length": 197.0, |
| "epoch": 0.03867255452964004, |
| "grad_norm": 0.3041883707046509, |
| "kl": 0.04169435426592827, |
| "learning_rate": 1.9996152616500244e-05, |
| "loss": 0.0017, |
| "reward": 1.4789999723434448, |
| "reward_std": 1.2745074033737183, |
| "rewards/correctness_reward_func": 1.0, |
| "rewards/int_reward_func": 0.5, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.021000005304813385, |
| "step": 289 |
| }, |
| { |
| "completion_length": 116.5, |
| "epoch": 0.038806369597216644, |
| "grad_norm": 0.5523695349693298, |
| "kl": 0.04961012303829193, |
| "learning_rate": 1.999603145447837e-05, |
| "loss": 0.002, |
| "reward": 2.204249858856201, |
| "reward_std": 0.949548065662384, |
| "rewards/correctness_reward_func": 1.5, |
| "rewards/int_reward_func": 0.5, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.2042500078678131, |
| "step": 290 |
| }, |
| { |
| "completion_length": 162.25, |
| "epoch": 0.03894018466479326, |
| "grad_norm": 0.3126446008682251, |
| "kl": 0.08022642135620117, |
| "learning_rate": 1.999590841447641e-05, |
| "loss": 0.0032, |
| "reward": 1.3469998836517334, |
| "reward_std": 1.3204811811447144, |
| "rewards/correctness_reward_func": 1.0, |
| "rewards/int_reward_func": 0.375, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.02800000086426735, |
| "step": 291 |
| }, |
| { |
| "completion_length": 131.75, |
| "epoch": 0.039073999732369864, |
| "grad_norm": 0.20107072591781616, |
| "kl": 0.0379381999373436, |
| "learning_rate": 1.9995783496517476e-05, |
| "loss": 0.0015, |
| "reward": 0.2239999920129776, |
| "reward_std": 0.13086380064487457, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.2239999920129776, |
| "step": 292 |
| }, |
| { |
| "completion_length": 189.5, |
| "epoch": 0.03920781479994647, |
| "grad_norm": 0.20484226942062378, |
| "kl": 0.03132232278585434, |
| "learning_rate": 1.999565670062504e-05, |
| "loss": 0.0013, |
| "reward": 0.7402499914169312, |
| "reward_std": 0.5397964715957642, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.5, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.125, |
| "rewards/xmlcount_reward_func": 0.11524999141693115, |
| "step": 293 |
| }, |
| { |
| "completion_length": 205.75, |
| "epoch": 0.039341629867523084, |
| "grad_norm": 0.24561399221420288, |
| "kl": 0.0185256190598011, |
| "learning_rate": 1.9995528026822916e-05, |
| "loss": 0.0007, |
| "reward": 1.4252500534057617, |
| "reward_std": 1.6236603260040283, |
| "rewards/correctness_reward_func": 1.0, |
| "rewards/int_reward_func": 0.5, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.07475000619888306, |
| "step": 294 |
| }, |
| { |
| "completion_length": 201.25, |
| "epoch": 0.03947544493509969, |
| "grad_norm": 0.3286190629005432, |
| "kl": 0.031586844474077225, |
| "learning_rate": 1.999539747513529e-05, |
| "loss": 0.0013, |
| "reward": 2.189500093460083, |
| "reward_std": 0.9647402763366699, |
| "rewards/correctness_reward_func": 1.5, |
| "rewards/int_reward_func": 0.5, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.18950000405311584, |
| "step": 295 |
| }, |
| { |
| "completion_length": 121.0, |
| "epoch": 0.039609260002676304, |
| "grad_norm": 0.3337399661540985, |
| "kl": 0.03786204382777214, |
| "learning_rate": 1.999526504558668e-05, |
| "loss": 0.0015, |
| "reward": 2.154250144958496, |
| "reward_std": 0.9408127069473267, |
| "rewards/correctness_reward_func": 1.5, |
| "rewards/int_reward_func": 0.5, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.15424999594688416, |
| "step": 296 |
| }, |
| { |
| "completion_length": 106.5, |
| "epoch": 0.03974307507025291, |
| "grad_norm": 0.37861064076423645, |
| "kl": 0.062134772539138794, |
| "learning_rate": 1.9995130738201966e-05, |
| "loss": 0.0025, |
| "reward": 1.1582499742507935, |
| "reward_std": 0.9733763933181763, |
| "rewards/correctness_reward_func": 0.5, |
| "rewards/int_reward_func": 0.5, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.15825000405311584, |
| "step": 297 |
| }, |
| { |
| "completion_length": 243.75, |
| "epoch": 0.03987689013782952, |
| "grad_norm": 0.16624334454536438, |
| "kl": 0.03450315073132515, |
| "learning_rate": 1.9994994553006386e-05, |
| "loss": 0.0014, |
| "reward": 0.6399999856948853, |
| "reward_std": 1.6276360750198364, |
| "rewards/correctness_reward_func": 0.5, |
| "rewards/int_reward_func": 0.25, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.10999999940395355, |
| "step": 298 |
| }, |
| { |
| "completion_length": 93.25, |
| "epoch": 0.04001070520540613, |
| "grad_norm": 0.4910159707069397, |
| "kl": 0.04987555742263794, |
| "learning_rate": 1.999485649002552e-05, |
| "loss": 0.002, |
| "reward": 0.9787499904632568, |
| "reward_std": 1.2580289840698242, |
| "rewards/correctness_reward_func": 0.5, |
| "rewards/int_reward_func": 0.375, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.10375000536441803, |
| "step": 299 |
| }, |
| { |
| "completion_length": 247.5, |
| "epoch": 0.04014452027298274, |
| "grad_norm": 0.21725068986415863, |
| "kl": 0.011182424612343311, |
| "learning_rate": 1.9994716549285312e-05, |
| "loss": 0.0004, |
| "reward": 1.8627500534057617, |
| "reward_std": 0.7289958000183105, |
| "rewards/correctness_reward_func": 1.5, |
| "rewards/int_reward_func": 0.5, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.13725000619888306, |
| "step": 300 |
| }, |
| { |
| "completion_length": 325.5, |
| "epoch": 0.04027833534055935, |
| "grad_norm": 0.22278064489364624, |
| "kl": 0.02587048150599003, |
| "learning_rate": 1.999457473081205e-05, |
| "loss": 0.001, |
| "reward": 0.12974999845027924, |
| "reward_std": 0.6960710287094116, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.5, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.37024998664855957, |
| "step": 301 |
| }, |
| { |
| "completion_length": 137.75, |
| "epoch": 0.040412150408135956, |
| "grad_norm": 0.31161320209503174, |
| "kl": 0.09189829975366592, |
| "learning_rate": 1.999443103463238e-05, |
| "loss": 0.0037, |
| "reward": 2.146749973297119, |
| "reward_std": 1.1551616191864014, |
| "rewards/correctness_reward_func": 1.5, |
| "rewards/int_reward_func": 0.375, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.27175000309944153, |
| "step": 302 |
| }, |
| { |
| "completion_length": 250.25, |
| "epoch": 0.04054596547571256, |
| "grad_norm": 0.22641287744045258, |
| "kl": 0.02101261168718338, |
| "learning_rate": 1.9994285460773294e-05, |
| "loss": 0.0008, |
| "reward": 0.3440000116825104, |
| "reward_std": 0.5435936450958252, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.5, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.15599998831748962, |
| "step": 303 |
| }, |
| { |
| "completion_length": 186.0, |
| "epoch": 0.040679780543289176, |
| "grad_norm": 0.25890541076660156, |
| "kl": 0.02058418095111847, |
| "learning_rate": 1.9994138009262146e-05, |
| "loss": 0.0008, |
| "reward": 0.5234999656677246, |
| "reward_std": 0.26434004306793213, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.5, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.023499995470046997, |
| "step": 304 |
| }, |
| { |
| "completion_length": 131.75, |
| "epoch": 0.04081359561086578, |
| "grad_norm": 0.2875959575176239, |
| "kl": 0.044362254440784454, |
| "learning_rate": 1.999398868012663e-05, |
| "loss": 0.0018, |
| "reward": 2.746500015258789, |
| "reward_std": 0.30240532755851746, |
| "rewards/correctness_reward_func": 2.0, |
| "rewards/int_reward_func": 0.5, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.24650001525878906, |
| "step": 305 |
| }, |
| { |
| "completion_length": 161.75, |
| "epoch": 0.040947410678442396, |
| "grad_norm": 0.1745125651359558, |
| "kl": 0.06078009307384491, |
| "learning_rate": 1.999383747339481e-05, |
| "loss": 0.0024, |
| "reward": 0.9670000076293945, |
| "reward_std": 1.158180832862854, |
| "rewards/correctness_reward_func": 0.5, |
| "rewards/int_reward_func": 0.375, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.09200000017881393, |
| "step": 306 |
| }, |
| { |
| "completion_length": 217.75, |
| "epoch": 0.041081225746019, |
| "grad_norm": 0.23282963037490845, |
| "kl": 0.0332004614174366, |
| "learning_rate": 1.9993684389095095e-05, |
| "loss": 0.0013, |
| "reward": 0.1862500011920929, |
| "reward_std": 0.40866151452064514, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.375, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.1887499988079071, |
| "step": 307 |
| }, |
| { |
| "completion_length": 164.75, |
| "epoch": 0.04121504081359561, |
| "grad_norm": 0.34631866216659546, |
| "kl": 0.028073476627469063, |
| "learning_rate": 1.9993529427256236e-05, |
| "loss": 0.0011, |
| "reward": 2.7482500076293945, |
| "reward_std": 0.0034999847412109375, |
| "rewards/correctness_reward_func": 2.0, |
| "rewards/int_reward_func": 0.5, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.24825000762939453, |
| "step": 308 |
| }, |
| { |
| "completion_length": 130.25, |
| "epoch": 0.04134885588117222, |
| "grad_norm": 0.3634851276874542, |
| "kl": 0.09160412847995758, |
| "learning_rate": 1.9993372587907348e-05, |
| "loss": 0.0037, |
| "reward": 1.8429999351501465, |
| "reward_std": 1.123504638671875, |
| "rewards/correctness_reward_func": 1.0, |
| "rewards/int_reward_func": 0.5, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.34299999475479126, |
| "step": 309 |
| }, |
| { |
| "completion_length": 196.0, |
| "epoch": 0.04148267094874883, |
| "grad_norm": 0.21235792338848114, |
| "kl": 0.04639435186982155, |
| "learning_rate": 1.9993213871077904e-05, |
| "loss": 0.0019, |
| "reward": 1.4287500381469727, |
| "reward_std": 1.2699767351150513, |
| "rewards/correctness_reward_func": 1.0, |
| "rewards/int_reward_func": 0.375, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.053749993443489075, |
| "step": 310 |
| }, |
| { |
| "completion_length": 237.0, |
| "epoch": 0.041616486016325435, |
| "grad_norm": 0.28208789229393005, |
| "kl": 0.01907791756093502, |
| "learning_rate": 1.9993053276797717e-05, |
| "loss": 0.0008, |
| "reward": 0.9800000190734863, |
| "reward_std": 0.35580894351005554, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.5, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.125, |
| "rewards/xmlcount_reward_func": 0.35499998927116394, |
| "step": 311 |
| }, |
| { |
| "completion_length": 216.75, |
| "epoch": 0.04175030108390205, |
| "grad_norm": 0.2388726770877838, |
| "kl": 0.025296643376350403, |
| "learning_rate": 1.999289080509696e-05, |
| "loss": 0.001, |
| "reward": 0.13650000095367432, |
| "reward_std": 0.5733442902565002, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.25, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.11350000649690628, |
| "step": 312 |
| }, |
| { |
| "completion_length": 191.75, |
| "epoch": 0.041884116151478655, |
| "grad_norm": 0.3149755597114563, |
| "kl": 0.055035725235939026, |
| "learning_rate": 1.9992726456006157e-05, |
| "loss": 0.0022, |
| "reward": 1.215000033378601, |
| "reward_std": 1.045577883720398, |
| "rewards/correctness_reward_func": 0.5, |
| "rewards/int_reward_func": 0.375, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.3400000035762787, |
| "step": 313 |
| }, |
| { |
| "completion_length": 262.25, |
| "epoch": 0.04201793121905527, |
| "grad_norm": 0.1590775102376938, |
| "kl": 0.020836668089032173, |
| "learning_rate": 1.9992560229556184e-05, |
| "loss": 0.0008, |
| "reward": 2.6440000534057617, |
| "reward_std": 0.5096639394760132, |
| "rewards/correctness_reward_func": 2.0, |
| "rewards/int_reward_func": 0.5, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.14400000870227814, |
| "step": 314 |
| }, |
| { |
| "completion_length": 272.5, |
| "epoch": 0.042151746286631875, |
| "grad_norm": 0.15592096745967865, |
| "kl": 0.022328346967697144, |
| "learning_rate": 1.9992392125778267e-05, |
| "loss": 0.0009, |
| "reward": 0.5824999809265137, |
| "reward_std": 1.0135630369186401, |
| "rewards/correctness_reward_func": 0.5, |
| "rewards/int_reward_func": 0.5, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.41749998927116394, |
| "step": 315 |
| }, |
| { |
| "completion_length": 217.5, |
| "epoch": 0.04228556135420848, |
| "grad_norm": 0.2921476662158966, |
| "kl": 0.07508786767721176, |
| "learning_rate": 1.9992222144703993e-05, |
| "loss": 0.003, |
| "reward": 1.8257499933242798, |
| "reward_std": 0.8752471208572388, |
| "rewards/correctness_reward_func": 1.5, |
| "rewards/int_reward_func": 0.5, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.17424999177455902, |
| "step": 316 |
| }, |
| { |
| "completion_length": 178.0, |
| "epoch": 0.042419376421785095, |
| "grad_norm": 0.30130401253700256, |
| "kl": 0.014766698703169823, |
| "learning_rate": 1.9992050286365296e-05, |
| "loss": 0.0006, |
| "reward": 1.0767500400543213, |
| "reward_std": 1.127523422241211, |
| "rewards/correctness_reward_func": 0.5, |
| "rewards/int_reward_func": 0.5, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.0767500028014183, |
| "step": 317 |
| }, |
| { |
| "completion_length": 174.75, |
| "epoch": 0.0425531914893617, |
| "grad_norm": 0.19542303681373596, |
| "kl": 0.031994640827178955, |
| "learning_rate": 1.9991876550794465e-05, |
| "loss": 0.0013, |
| "reward": 2.2990000247955322, |
| "reward_std": 1.0411230325698853, |
| "rewards/correctness_reward_func": 1.5, |
| "rewards/int_reward_func": 0.5, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.29899999499320984, |
| "step": 318 |
| }, |
| { |
| "completion_length": 166.75, |
| "epoch": 0.042687006556938314, |
| "grad_norm": 0.44654956459999084, |
| "kl": 0.04502785950899124, |
| "learning_rate": 1.9991700938024134e-05, |
| "loss": 0.0018, |
| "reward": 2.1157500743865967, |
| "reward_std": 0.7704381942749023, |
| "rewards/correctness_reward_func": 1.5, |
| "rewards/int_reward_func": 0.5, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.11574998497962952, |
| "step": 319 |
| }, |
| { |
| "completion_length": 181.5, |
| "epoch": 0.04282082162451492, |
| "grad_norm": 0.16935208439826965, |
| "kl": 0.02041492983698845, |
| "learning_rate": 1.9991523448087303e-05, |
| "loss": 0.0008, |
| "reward": 0.9904999732971191, |
| "reward_std": 0.8769502639770508, |
| "rewards/correctness_reward_func": 0.5, |
| "rewards/int_reward_func": 0.5, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.009499996900558472, |
| "step": 320 |
| }, |
| { |
| "completion_length": 162.5, |
| "epoch": 0.04295463669209153, |
| "grad_norm": 0.23541118204593658, |
| "kl": 0.02936200052499771, |
| "learning_rate": 1.9991344081017312e-05, |
| "loss": 0.0012, |
| "reward": 1.8782498836517334, |
| "reward_std": 1.1906729936599731, |
| "rewards/correctness_reward_func": 1.5, |
| "rewards/int_reward_func": 0.375, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.003249998204410076, |
| "step": 321 |
| }, |
| { |
| "completion_length": 338.0, |
| "epoch": 0.04308845175966814, |
| "grad_norm": 0.17548249661922455, |
| "kl": 0.02140561304986477, |
| "learning_rate": 1.9991162836847863e-05, |
| "loss": 0.0009, |
| "reward": 1.7432500123977661, |
| "reward_std": 1.3156806230545044, |
| "rewards/correctness_reward_func": 1.0, |
| "rewards/int_reward_func": 0.5, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.24324999749660492, |
| "step": 322 |
| }, |
| { |
| "completion_length": 144.0, |
| "epoch": 0.04322226682724475, |
| "grad_norm": 0.15269918739795685, |
| "kl": 0.041680727154016495, |
| "learning_rate": 1.9990979715613005e-05, |
| "loss": 0.0017, |
| "reward": 2.738499879837036, |
| "reward_std": 0.19149500131607056, |
| "rewards/correctness_reward_func": 2.0, |
| "rewards/int_reward_func": 0.5, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.23849999904632568, |
| "step": 323 |
| }, |
| { |
| "completion_length": 144.5, |
| "epoch": 0.043356081894821354, |
| "grad_norm": 0.295144647359848, |
| "kl": 0.04146221652626991, |
| "learning_rate": 1.999079471734714e-05, |
| "loss": 0.0017, |
| "reward": 2.274749994277954, |
| "reward_std": 0.8752064108848572, |
| "rewards/correctness_reward_func": 1.5, |
| "rewards/int_reward_func": 0.5, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.2747499942779541, |
| "step": 324 |
| }, |
| { |
| "completion_length": 166.25, |
| "epoch": 0.04348989696239797, |
| "grad_norm": 0.3544755280017853, |
| "kl": 0.03895579278469086, |
| "learning_rate": 1.9990607842085025e-05, |
| "loss": 0.0016, |
| "reward": 2.1337499618530273, |
| "reward_std": 1.3412890434265137, |
| "rewards/correctness_reward_func": 1.5, |
| "rewards/int_reward_func": 0.375, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.25874999165534973, |
| "step": 325 |
| }, |
| { |
| "completion_length": 115.0, |
| "epoch": 0.04362371202997457, |
| "grad_norm": 0.2468242645263672, |
| "kl": 0.0689246729016304, |
| "learning_rate": 1.9990419089861772e-05, |
| "loss": 0.0028, |
| "reward": 2.1852500438690186, |
| "reward_std": 1.6674143075942993, |
| "rewards/correctness_reward_func": 1.5, |
| "rewards/int_reward_func": 0.375, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.125, |
| "rewards/xmlcount_reward_func": 0.18524999916553497, |
| "step": 326 |
| }, |
| { |
| "completion_length": 227.0, |
| "epoch": 0.04375752709755119, |
| "grad_norm": 0.21105855703353882, |
| "kl": 0.032667145133018494, |
| "learning_rate": 1.999022846071284e-05, |
| "loss": 0.0013, |
| "reward": 1.3339999914169312, |
| "reward_std": 1.1114063262939453, |
| "rewards/correctness_reward_func": 0.5, |
| "rewards/int_reward_func": 0.5, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.33399999141693115, |
| "step": 327 |
| }, |
| { |
| "completion_length": 306.5, |
| "epoch": 0.04389134216512779, |
| "grad_norm": 0.14921171963214874, |
| "kl": 0.01686778850853443, |
| "learning_rate": 1.9990035954674042e-05, |
| "loss": 0.0007, |
| "reward": 0.9865000247955322, |
| "reward_std": 1.2040247917175293, |
| "rewards/correctness_reward_func": 0.5, |
| "rewards/int_reward_func": 0.5, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.013500001281499863, |
| "step": 328 |
| }, |
| { |
| "completion_length": 213.75, |
| "epoch": 0.0440251572327044, |
| "grad_norm": 0.31435203552246094, |
| "kl": 0.04900065064430237, |
| "learning_rate": 1.9989841571781543e-05, |
| "loss": 0.002, |
| "reward": 1.6059999465942383, |
| "reward_std": 1.3639472723007202, |
| "rewards/correctness_reward_func": 1.0, |
| "rewards/int_reward_func": 0.375, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.23100000619888306, |
| "step": 329 |
| }, |
| { |
| "completion_length": 273.25, |
| "epoch": 0.04415897230028101, |
| "grad_norm": 0.26014062762260437, |
| "kl": 0.023984603583812714, |
| "learning_rate": 1.9989645312071867e-05, |
| "loss": 0.001, |
| "reward": 1.1297500133514404, |
| "reward_std": 1.157586693763733, |
| "rewards/correctness_reward_func": 0.5, |
| "rewards/int_reward_func": 0.5, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.12974999845027924, |
| "step": 330 |
| }, |
| { |
| "completion_length": 290.75, |
| "epoch": 0.04429278736785762, |
| "grad_norm": 0.19021719694137573, |
| "kl": 0.017466582357883453, |
| "learning_rate": 1.9989447175581884e-05, |
| "loss": 0.0007, |
| "reward": 1.5542500019073486, |
| "reward_std": 1.1368547677993774, |
| "rewards/correctness_reward_func": 1.0, |
| "rewards/int_reward_func": 0.375, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.17925000190734863, |
| "step": 331 |
| }, |
| { |
| "completion_length": 232.75, |
| "epoch": 0.04442660243543423, |
| "grad_norm": 0.19858774542808533, |
| "kl": 0.03412783890962601, |
| "learning_rate": 1.9989247162348816e-05, |
| "loss": 0.0014, |
| "reward": 0.972000002861023, |
| "reward_std": 1.296072006225586, |
| "rewards/correctness_reward_func": 0.5, |
| "rewards/int_reward_func": 0.5, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.02799999713897705, |
| "step": 332 |
| }, |
| { |
| "completion_length": 418.0, |
| "epoch": 0.04456041750301084, |
| "grad_norm": 0.15417160093784332, |
| "kl": 0.010793618857860565, |
| "learning_rate": 1.9989045272410242e-05, |
| "loss": 0.0004, |
| "reward": 1.5210000276565552, |
| "reward_std": 0.5337172150611877, |
| "rewards/correctness_reward_func": 1.5, |
| "rewards/int_reward_func": 0.5, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.4790000319480896, |
| "step": 333 |
| }, |
| { |
| "completion_length": 201.75, |
| "epoch": 0.044694232570587446, |
| "grad_norm": 0.3863411545753479, |
| "kl": 0.03746401518583298, |
| "learning_rate": 1.9988841505804094e-05, |
| "loss": 0.0015, |
| "reward": 0.4192500114440918, |
| "reward_std": 0.3784824013710022, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.25, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.1692499965429306, |
| "step": 334 |
| }, |
| { |
| "completion_length": 286.5, |
| "epoch": 0.04482804763816406, |
| "grad_norm": 0.1543397456407547, |
| "kl": 0.035325512290000916, |
| "learning_rate": 1.998863586256865e-05, |
| "loss": 0.0014, |
| "reward": 1.4367499351501465, |
| "reward_std": 1.2802351713180542, |
| "rewards/correctness_reward_func": 1.0, |
| "rewards/int_reward_func": 0.375, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.06175000220537186, |
| "step": 335 |
| }, |
| { |
| "completion_length": 301.25, |
| "epoch": 0.044961862705740666, |
| "grad_norm": 0.2297045886516571, |
| "kl": 0.03229169175028801, |
| "learning_rate": 1.9988428342742544e-05, |
| "loss": 0.0013, |
| "reward": -0.07874998450279236, |
| "reward_std": 0.6149121522903442, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.375, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.45374998450279236, |
| "step": 336 |
| }, |
| { |
| "completion_length": 143.0, |
| "epoch": 0.04509567777331728, |
| "grad_norm": 0.26111480593681335, |
| "kl": 0.047426097095012665, |
| "learning_rate": 1.998821894636477e-05, |
| "loss": 0.0019, |
| "reward": 1.0780000686645508, |
| "reward_std": 1.149530053138733, |
| "rewards/correctness_reward_func": 0.5, |
| "rewards/int_reward_func": 0.375, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.203000009059906, |
| "step": 337 |
| }, |
| { |
| "completion_length": 167.5, |
| "epoch": 0.045229492840893885, |
| "grad_norm": 0.20062367618083954, |
| "kl": 0.03473719209432602, |
| "learning_rate": 1.998800767347466e-05, |
| "loss": 0.0014, |
| "reward": 2.8125, |
| "reward_std": 0.125, |
| "rewards/correctness_reward_func": 2.0, |
| "rewards/int_reward_func": 0.5, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.3125, |
| "step": 338 |
| }, |
| { |
| "completion_length": 133.75, |
| "epoch": 0.04536330790847049, |
| "grad_norm": 0.2992906868457794, |
| "kl": 0.0852011889219284, |
| "learning_rate": 1.998779452411191e-05, |
| "loss": 0.0034, |
| "reward": 2.010499954223633, |
| "reward_std": 1.278815746307373, |
| "rewards/correctness_reward_func": 1.5, |
| "rewards/int_reward_func": 0.375, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.1354999989271164, |
| "step": 339 |
| }, |
| { |
| "completion_length": 187.75, |
| "epoch": 0.045497122976047105, |
| "grad_norm": 0.2593173682689667, |
| "kl": 0.040597543120384216, |
| "learning_rate": 1.9987579498316568e-05, |
| "loss": 0.0016, |
| "reward": 1.750249981880188, |
| "reward_std": 1.154989242553711, |
| "rewards/correctness_reward_func": 1.0, |
| "rewards/int_reward_func": 0.5, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.250249981880188, |
| "step": 340 |
| }, |
| { |
| "completion_length": 173.0, |
| "epoch": 0.04563093804362371, |
| "grad_norm": 0.24178040027618408, |
| "kl": 0.05775593966245651, |
| "learning_rate": 1.9987362596129026e-05, |
| "loss": 0.0023, |
| "reward": 0.6192499995231628, |
| "reward_std": 0.3756961226463318, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.5, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.11924999952316284, |
| "step": 341 |
| }, |
| { |
| "completion_length": 118.25, |
| "epoch": 0.04576475311120032, |
| "grad_norm": 0.3264365792274475, |
| "kl": 0.052456341683864594, |
| "learning_rate": 1.998714381759004e-05, |
| "loss": 0.0021, |
| "reward": 1.8232500553131104, |
| "reward_std": 1.1960315704345703, |
| "rewards/correctness_reward_func": 1.0, |
| "rewards/int_reward_func": 0.5, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.3232499957084656, |
| "step": 342 |
| }, |
| { |
| "completion_length": 181.75, |
| "epoch": 0.04589856817877693, |
| "grad_norm": 0.3560587763786316, |
| "kl": 0.04571708291769028, |
| "learning_rate": 1.99869231627407e-05, |
| "loss": 0.0018, |
| "reward": 2.189499855041504, |
| "reward_std": 1.4641177654266357, |
| "rewards/correctness_reward_func": 1.5, |
| "rewards/int_reward_func": 0.375, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.31450000405311584, |
| "step": 343 |
| }, |
| { |
| "completion_length": 189.5, |
| "epoch": 0.04603238324635354, |
| "grad_norm": 0.24474014341831207, |
| "kl": 0.046364784240722656, |
| "learning_rate": 1.9986700631622477e-05, |
| "loss": 0.0019, |
| "reward": 0.9375, |
| "reward_std": 1.2479149103164673, |
| "rewards/correctness_reward_func": 0.5, |
| "rewards/int_reward_func": 0.25, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.1875, |
| "step": 344 |
| }, |
| { |
| "completion_length": 232.0, |
| "epoch": 0.04616619831393015, |
| "grad_norm": 0.23129980266094208, |
| "kl": 0.029866104945540428, |
| "learning_rate": 1.9986476224277167e-05, |
| "loss": 0.0012, |
| "reward": 0.13249999284744263, |
| "reward_std": 0.23499999940395355, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.13249999284744263, |
| "step": 345 |
| }, |
| { |
| "completion_length": 202.75, |
| "epoch": 0.04630001338150676, |
| "grad_norm": 0.2577056288719177, |
| "kl": 0.037398550659418106, |
| "learning_rate": 1.9986249940746937e-05, |
| "loss": 0.0015, |
| "reward": 2.6357498168945312, |
| "reward_std": 0.31156212091445923, |
| "rewards/correctness_reward_func": 2.0, |
| "rewards/int_reward_func": 0.5, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.13574999570846558, |
| "step": 346 |
| }, |
| { |
| "completion_length": 124.25, |
| "epoch": 0.046433828449083364, |
| "grad_norm": 0.526066780090332, |
| "kl": 0.13436244428157806, |
| "learning_rate": 1.9986021781074294e-05, |
| "loss": 0.0054, |
| "reward": 2.7130000591278076, |
| "reward_std": 0.07399996370077133, |
| "rewards/correctness_reward_func": 2.0, |
| "rewards/int_reward_func": 0.5, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.21299999952316284, |
| "step": 347 |
| }, |
| { |
| "completion_length": 234.75, |
| "epoch": 0.04656764351665998, |
| "grad_norm": 0.23598410189151764, |
| "kl": 0.03225242346525192, |
| "learning_rate": 1.9985791745302108e-05, |
| "loss": 0.0013, |
| "reward": 0.3554999828338623, |
| "reward_std": 0.6230471134185791, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.375, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.0195000022649765, |
| "step": 348 |
| }, |
| { |
| "completion_length": 190.0, |
| "epoch": 0.046701458584236584, |
| "grad_norm": 0.268939733505249, |
| "kl": 0.04406317323446274, |
| "learning_rate": 1.998555983347359e-05, |
| "loss": 0.0018, |
| "reward": 0.8125, |
| "reward_std": 0.125, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.5, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.3125, |
| "step": 349 |
| }, |
| { |
| "completion_length": 175.0, |
| "epoch": 0.0468352736518132, |
| "grad_norm": 0.26921752095222473, |
| "kl": 0.0636858195066452, |
| "learning_rate": 1.998532604563232e-05, |
| "loss": 0.0025, |
| "reward": 2.28125, |
| "reward_std": 1.4803398847579956, |
| "rewards/correctness_reward_func": 1.5, |
| "rewards/int_reward_func": 0.375, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.125, |
| "rewards/xmlcount_reward_func": 0.28125, |
| "step": 350 |
| }, |
| { |
| "completion_length": 173.25, |
| "epoch": 0.046969088719389804, |
| "grad_norm": 0.264315664768219, |
| "kl": 0.056716956198215485, |
| "learning_rate": 1.998509038182221e-05, |
| "loss": 0.0023, |
| "reward": 2.802000045776367, |
| "reward_std": 0.5308634042739868, |
| "rewards/correctness_reward_func": 2.0, |
| "rewards/int_reward_func": 0.5, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.125, |
| "rewards/xmlcount_reward_func": 0.1770000010728836, |
| "step": 351 |
| }, |
| { |
| "completion_length": 254.5, |
| "epoch": 0.04710290378696641, |
| "grad_norm": 0.22594784200191498, |
| "kl": 0.03160718083381653, |
| "learning_rate": 1.998485284208754e-05, |
| "loss": 0.0013, |
| "reward": 0.5897499918937683, |
| "reward_std": 0.3205000162124634, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.5, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.08974999934434891, |
| "step": 352 |
| }, |
| { |
| "completion_length": 146.0, |
| "epoch": 0.047236718854543024, |
| "grad_norm": 0.2990126311779022, |
| "kl": 0.05723201110959053, |
| "learning_rate": 1.9984613426472934e-05, |
| "loss": 0.0023, |
| "reward": 1.875, |
| "reward_std": 1.1636866331100464, |
| "rewards/correctness_reward_func": 1.0, |
| "rewards/int_reward_func": 0.5, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.375, |
| "step": 353 |
| }, |
| { |
| "completion_length": 175.25, |
| "epoch": 0.04737053392211963, |
| "grad_norm": 0.3130423426628113, |
| "kl": 0.047985732555389404, |
| "learning_rate": 1.9984372135023375e-05, |
| "loss": 0.0019, |
| "reward": 2.485499858856201, |
| "reward_std": 1.1999337673187256, |
| "rewards/correctness_reward_func": 1.5, |
| "rewards/int_reward_func": 0.5, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.125, |
| "rewards/xmlcount_reward_func": 0.3605000078678131, |
| "step": 354 |
| }, |
| { |
| "completion_length": 209.75, |
| "epoch": 0.047504348989696236, |
| "grad_norm": 0.23971174657344818, |
| "kl": 0.03946828842163086, |
| "learning_rate": 1.9984128967784193e-05, |
| "loss": 0.0016, |
| "reward": 2.875, |
| "reward_std": 0.14433756470680237, |
| "rewards/correctness_reward_func": 2.0, |
| "rewards/int_reward_func": 0.5, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.375, |
| "step": 355 |
| }, |
| { |
| "completion_length": 157.0, |
| "epoch": 0.04763816405727285, |
| "grad_norm": 0.44046178460121155, |
| "kl": 0.06311355531215668, |
| "learning_rate": 1.9983883924801075e-05, |
| "loss": 0.0025, |
| "reward": 0.6875, |
| "reward_std": 0.125, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.5, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.1875, |
| "step": 356 |
| }, |
| { |
| "completion_length": 216.75, |
| "epoch": 0.047771979124849456, |
| "grad_norm": 0.2562897503376007, |
| "kl": 0.05822507664561272, |
| "learning_rate": 1.9983637006120054e-05, |
| "loss": 0.0023, |
| "reward": 2.2200000286102295, |
| "reward_std": 1.0125545263290405, |
| "rewards/correctness_reward_func": 1.5, |
| "rewards/int_reward_func": 0.5, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.2199999988079071, |
| "step": 357 |
| }, |
| { |
| "completion_length": 193.75, |
| "epoch": 0.04790579419242607, |
| "grad_norm": 0.4285341799259186, |
| "kl": 0.1787978708744049, |
| "learning_rate": 1.9983388211787523e-05, |
| "loss": 0.0072, |
| "reward": 2.8627500534057617, |
| "reward_std": 0.6519331932067871, |
| "rewards/correctness_reward_func": 2.0, |
| "rewards/int_reward_func": 0.5, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.125, |
| "rewards/xmlcount_reward_func": 0.23774999380111694, |
| "step": 358 |
| }, |
| { |
| "completion_length": 248.5, |
| "epoch": 0.048039609260002676, |
| "grad_norm": 0.16509771347045898, |
| "kl": 0.03431170433759689, |
| "learning_rate": 1.9983137541850225e-05, |
| "loss": 0.0014, |
| "reward": 2.118499994277954, |
| "reward_std": 1.4345158338546753, |
| "rewards/correctness_reward_func": 1.5, |
| "rewards/int_reward_func": 0.5, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.1184999942779541, |
| "step": 359 |
| }, |
| { |
| "completion_length": 147.25, |
| "epoch": 0.04817342432757928, |
| "grad_norm": 0.3071553111076355, |
| "kl": 0.07505498826503754, |
| "learning_rate": 1.9982884996355248e-05, |
| "loss": 0.003, |
| "reward": 0.9375, |
| "reward_std": 0.7180703282356262, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.375, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.25, |
| "rewards/xmlcount_reward_func": 0.3125, |
| "step": 360 |
| }, |
| { |
| "completion_length": 138.25, |
| "epoch": 0.048307239395155896, |
| "grad_norm": 0.2983650863170624, |
| "kl": 0.08266126364469528, |
| "learning_rate": 1.998263057535004e-05, |
| "loss": 0.0033, |
| "reward": 2.25, |
| "reward_std": 1.1902379989624023, |
| "rewards/correctness_reward_func": 1.0, |
| "rewards/int_reward_func": 0.5, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.25, |
| "rewards/xmlcount_reward_func": 0.5, |
| "step": 361 |
| }, |
| { |
| "completion_length": 205.5, |
| "epoch": 0.0484410544627325, |
| "grad_norm": 0.23639518022537231, |
| "kl": 0.0964873731136322, |
| "learning_rate": 1.9982374278882402e-05, |
| "loss": 0.0039, |
| "reward": 0.5615000128746033, |
| "reward_std": 0.7200433015823364, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.25, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.125, |
| "rewards/xmlcount_reward_func": 0.18649999797344208, |
| "step": 362 |
| }, |
| { |
| "completion_length": 220.0, |
| "epoch": 0.048574869530309116, |
| "grad_norm": 0.1766330450773239, |
| "kl": 0.08415135741233826, |
| "learning_rate": 1.9982116107000485e-05, |
| "loss": 0.0034, |
| "reward": 1.5, |
| "reward_std": 1.0, |
| "rewards/correctness_reward_func": 0.5, |
| "rewards/int_reward_func": 0.5, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.5, |
| "step": 363 |
| }, |
| { |
| "completion_length": 133.0, |
| "epoch": 0.04870868459788572, |
| "grad_norm": 0.2641361951828003, |
| "kl": 0.09088826179504395, |
| "learning_rate": 1.998185605975279e-05, |
| "loss": 0.0036, |
| "reward": 3.0, |
| "reward_std": 0.3535533845424652, |
| "rewards/correctness_reward_func": 2.0, |
| "rewards/int_reward_func": 0.5, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.125, |
| "rewards/xmlcount_reward_func": 0.375, |
| "step": 364 |
| }, |
| { |
| "completion_length": 162.25, |
| "epoch": 0.04884249966546233, |
| "grad_norm": 0.23563699424266815, |
| "kl": 0.05810078978538513, |
| "learning_rate": 1.9981594137188172e-05, |
| "loss": 0.0023, |
| "reward": 2.875, |
| "reward_std": 0.9464846849441528, |
| "rewards/correctness_reward_func": 1.5, |
| "rewards/int_reward_func": 0.5, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.375, |
| "rewards/xmlcount_reward_func": 0.5, |
| "step": 365 |
| }, |
| { |
| "completion_length": 190.5, |
| "epoch": 0.04897631473303894, |
| "grad_norm": 0.17748123407363892, |
| "kl": 0.05805326998233795, |
| "learning_rate": 1.9981330339355846e-05, |
| "loss": 0.0023, |
| "reward": 2.70674991607666, |
| "reward_std": 0.4360606074333191, |
| "rewards/correctness_reward_func": 2.0, |
| "rewards/int_reward_func": 0.5, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.20675000548362732, |
| "step": 366 |
| }, |
| { |
| "completion_length": 120.5, |
| "epoch": 0.04911012980061555, |
| "grad_norm": 0.31116271018981934, |
| "kl": 0.08874954283237457, |
| "learning_rate": 1.9981064666305365e-05, |
| "loss": 0.0035, |
| "reward": 2.0, |
| "reward_std": 1.0, |
| "rewards/correctness_reward_func": 0.5, |
| "rewards/int_reward_func": 0.5, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.5, |
| "rewards/xmlcount_reward_func": 0.5, |
| "step": 367 |
| }, |
| { |
| "completion_length": 111.75, |
| "epoch": 0.04924394486819216, |
| "grad_norm": 0.5003930330276489, |
| "kl": 0.12456310540437698, |
| "learning_rate": 1.9980797118086644e-05, |
| "loss": 0.005, |
| "reward": 2.6005001068115234, |
| "reward_std": 0.7351197004318237, |
| "rewards/correctness_reward_func": 1.5, |
| "rewards/int_reward_func": 0.5, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.125, |
| "rewards/xmlcount_reward_func": 0.4754999876022339, |
| "step": 368 |
| }, |
| { |
| "completion_length": 231.0, |
| "epoch": 0.04937775993576877, |
| "grad_norm": 0.2768784761428833, |
| "kl": 0.06610839068889618, |
| "learning_rate": 1.9980527694749952e-05, |
| "loss": 0.0026, |
| "reward": 2.125, |
| "reward_std": 1.314977765083313, |
| "rewards/correctness_reward_func": 1.0, |
| "rewards/int_reward_func": 0.5, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.125, |
| "rewards/xmlcount_reward_func": 0.5, |
| "step": 369 |
| }, |
| { |
| "completion_length": 138.25, |
| "epoch": 0.049511575003345375, |
| "grad_norm": 0.44074746966362, |
| "kl": 0.13742223381996155, |
| "learning_rate": 1.9980256396345902e-05, |
| "loss": 0.0055, |
| "reward": 2.125, |
| "reward_std": 1.0307763814926147, |
| "rewards/correctness_reward_func": 1.0, |
| "rewards/int_reward_func": 0.375, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.25, |
| "rewards/xmlcount_reward_func": 0.5, |
| "step": 370 |
| }, |
| { |
| "completion_length": 196.75, |
| "epoch": 0.04964539007092199, |
| "grad_norm": 0.2018011510372162, |
| "kl": 0.063106469810009, |
| "learning_rate": 1.997998322292546e-05, |
| "loss": 0.0025, |
| "reward": 1.436500072479248, |
| "reward_std": 0.8756687045097351, |
| "rewards/correctness_reward_func": 0.5, |
| "rewards/int_reward_func": 0.5, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.43650001287460327, |
| "step": 371 |
| }, |
| { |
| "completion_length": 176.25, |
| "epoch": 0.049779205138498595, |
| "grad_norm": 0.2578439712524414, |
| "kl": 0.1250925213098526, |
| "learning_rate": 1.9979708174539954e-05, |
| "loss": 0.005, |
| "reward": 2.5, |
| "reward_std": 1.0, |
| "rewards/correctness_reward_func": 1.5, |
| "rewards/int_reward_func": 0.5, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.5, |
| "step": 372 |
| }, |
| { |
| "completion_length": 201.5, |
| "epoch": 0.0499130202060752, |
| "grad_norm": 0.2045677751302719, |
| "kl": 0.09883946925401688, |
| "learning_rate": 1.9979431251241057e-05, |
| "loss": 0.004, |
| "reward": 1.6414999961853027, |
| "reward_std": 1.2570159435272217, |
| "rewards/correctness_reward_func": 1.0, |
| "rewards/int_reward_func": 0.375, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.125, |
| "rewards/xmlcount_reward_func": 0.14149999618530273, |
| "step": 373 |
| }, |
| { |
| "completion_length": 243.25, |
| "epoch": 0.050046835273651814, |
| "grad_norm": 0.296312153339386, |
| "kl": 0.04688343405723572, |
| "learning_rate": 1.9979152453080795e-05, |
| "loss": 0.0019, |
| "reward": 2.6065001487731934, |
| "reward_std": 0.7384910583496094, |
| "rewards/correctness_reward_func": 1.5, |
| "rewards/int_reward_func": 0.5, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.125, |
| "rewards/xmlcount_reward_func": 0.4814999997615814, |
| "step": 374 |
| }, |
| { |
| "completion_length": 316.75, |
| "epoch": 0.05018065034122842, |
| "grad_norm": 0.16760791838169098, |
| "kl": 0.031152646988630295, |
| "learning_rate": 1.9978871780111544e-05, |
| "loss": 0.0012, |
| "reward": 0.75, |
| "reward_std": 0.5, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.375, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.375, |
| "step": 375 |
| }, |
| { |
| "completion_length": 204.75, |
| "epoch": 0.050314465408805034, |
| "grad_norm": 0.28139740228652954, |
| "kl": 0.07774727046489716, |
| "learning_rate": 1.9978589232386036e-05, |
| "loss": 0.0031, |
| "reward": 1.25, |
| "reward_std": 0.28867512941360474, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.5, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.25, |
| "rewards/xmlcount_reward_func": 0.5, |
| "step": 376 |
| }, |
| { |
| "completion_length": 115.25, |
| "epoch": 0.05044828047638164, |
| "grad_norm": 0.2897559404373169, |
| "kl": 0.10221213102340698, |
| "learning_rate": 1.9978304809957355e-05, |
| "loss": 0.0041, |
| "reward": 3.25, |
| "reward_std": 0.28867512941360474, |
| "rewards/correctness_reward_func": 2.0, |
| "rewards/int_reward_func": 0.5, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.25, |
| "rewards/xmlcount_reward_func": 0.5, |
| "step": 377 |
| }, |
| { |
| "completion_length": 178.0, |
| "epoch": 0.05058209554395825, |
| "grad_norm": 0.22679723799228668, |
| "kl": 0.06140456348657608, |
| "learning_rate": 1.9978018512878938e-05, |
| "loss": 0.0025, |
| "reward": 3.0625, |
| "reward_std": 0.3145764470100403, |
| "rewards/correctness_reward_func": 2.0, |
| "rewards/int_reward_func": 0.5, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.125, |
| "rewards/xmlcount_reward_func": 0.4375, |
| "step": 378 |
| }, |
| { |
| "completion_length": 215.25, |
| "epoch": 0.05071591061153486, |
| "grad_norm": 0.007152177859097719, |
| "kl": 0.04352927207946777, |
| "learning_rate": 1.997773034120457e-05, |
| "loss": 0.0017, |
| "reward": 3.0, |
| "reward_std": 0.0, |
| "rewards/correctness_reward_func": 2.0, |
| "rewards/int_reward_func": 0.5, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.5, |
| "step": 379 |
| }, |
| { |
| "completion_length": 193.5, |
| "epoch": 0.05084972567911147, |
| "grad_norm": 0.3108645975589752, |
| "kl": 0.083529032766819, |
| "learning_rate": 1.9977440294988388e-05, |
| "loss": 0.0033, |
| "reward": 2.4375, |
| "reward_std": 0.9655525088310242, |
| "rewards/correctness_reward_func": 1.5, |
| "rewards/int_reward_func": 0.5, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.4375, |
| "step": 380 |
| }, |
| { |
| "completion_length": 153.0, |
| "epoch": 0.05098354074668808, |
| "grad_norm": 0.2845223546028137, |
| "kl": 0.05798358842730522, |
| "learning_rate": 1.9977148374284886e-05, |
| "loss": 0.0023, |
| "reward": 1.975000023841858, |
| "reward_std": 1.5607155561447144, |
| "rewards/correctness_reward_func": 1.0, |
| "rewards/int_reward_func": 0.375, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.25, |
| "rewards/xmlcount_reward_func": 0.3499999940395355, |
| "step": 381 |
| }, |
| { |
| "completion_length": 165.25, |
| "epoch": 0.05111735581426469, |
| "grad_norm": 0.4504396617412567, |
| "kl": 0.06921510398387909, |
| "learning_rate": 1.997685457914891e-05, |
| "loss": 0.0028, |
| "reward": 2.995500087738037, |
| "reward_std": 0.009000062942504883, |
| "rewards/correctness_reward_func": 2.0, |
| "rewards/int_reward_func": 0.5, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.49549999833106995, |
| "step": 382 |
| }, |
| { |
| "completion_length": 225.75, |
| "epoch": 0.05125117088184129, |
| "grad_norm": 0.7333559989929199, |
| "kl": 0.3219738006591797, |
| "learning_rate": 1.997655890963565e-05, |
| "loss": 0.0129, |
| "reward": 2.0, |
| "reward_std": 1.154700517654419, |
| "rewards/correctness_reward_func": 1.0, |
| "rewards/int_reward_func": 0.5, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.5, |
| "step": 383 |
| }, |
| { |
| "completion_length": 272.25, |
| "epoch": 0.051384985949417906, |
| "grad_norm": 0.2574349641799927, |
| "kl": 0.055140066891908646, |
| "learning_rate": 1.9976261365800666e-05, |
| "loss": 0.0022, |
| "reward": 1.40625, |
| "reward_std": 1.455503225326538, |
| "rewards/correctness_reward_func": 0.5, |
| "rewards/int_reward_func": 0.375, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.125, |
| "rewards/xmlcount_reward_func": 0.40625, |
| "step": 384 |
| }, |
| { |
| "completion_length": 188.75, |
| "epoch": 0.05151880101699451, |
| "grad_norm": 0.34119075536727905, |
| "kl": 0.05860240384936333, |
| "learning_rate": 1.9975961947699848e-05, |
| "loss": 0.0023, |
| "reward": 2.75, |
| "reward_std": 1.1902379989624023, |
| "rewards/correctness_reward_func": 1.5, |
| "rewards/int_reward_func": 0.5, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.25, |
| "rewards/xmlcount_reward_func": 0.5, |
| "step": 385 |
| }, |
| { |
| "completion_length": 210.0, |
| "epoch": 0.05165261608457112, |
| "grad_norm": 0.20947568118572235, |
| "kl": 0.07599402219057083, |
| "learning_rate": 1.997566065538945e-05, |
| "loss": 0.003, |
| "reward": 1.9175000190734863, |
| "reward_std": 0.8663477301597595, |
| "rewards/correctness_reward_func": 1.0, |
| "rewards/int_reward_func": 0.5, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.125, |
| "rewards/xmlcount_reward_func": 0.29249998927116394, |
| "step": 386 |
| }, |
| { |
| "completion_length": 213.5, |
| "epoch": 0.05178643115214773, |
| "grad_norm": 0.17745548486709595, |
| "kl": 0.05279946327209473, |
| "learning_rate": 1.9975357488926077e-05, |
| "loss": 0.0021, |
| "reward": 2.25, |
| "reward_std": 1.1902379989624023, |
| "rewards/correctness_reward_func": 1.0, |
| "rewards/int_reward_func": 0.5, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.25, |
| "rewards/xmlcount_reward_func": 0.5, |
| "step": 387 |
| }, |
| { |
| "completion_length": 118.5, |
| "epoch": 0.05192024621972434, |
| "grad_norm": 0.24092203378677368, |
| "kl": 0.09460026025772095, |
| "learning_rate": 1.997505244836669e-05, |
| "loss": 0.0038, |
| "reward": 3.375, |
| "reward_std": 0.25, |
| "rewards/correctness_reward_func": 2.0, |
| "rewards/int_reward_func": 0.5, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.375, |
| "rewards/xmlcount_reward_func": 0.5, |
| "step": 388 |
| }, |
| { |
| "completion_length": 199.25, |
| "epoch": 0.05205406128730095, |
| "grad_norm": 0.23589850962162018, |
| "kl": 0.035461872816085815, |
| "learning_rate": 1.997474553376859e-05, |
| "loss": 0.0014, |
| "reward": 0.5, |
| "reward_std": 0.5773502588272095, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.25, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.25, |
| "step": 389 |
| }, |
| { |
| "completion_length": 235.25, |
| "epoch": 0.05218787635487756, |
| "grad_norm": 0.16748349368572235, |
| "kl": 0.052706725895404816, |
| "learning_rate": 1.9974436745189444e-05, |
| "loss": 0.0021, |
| "reward": 2.5, |
| "reward_std": 1.0, |
| "rewards/correctness_reward_func": 1.5, |
| "rewards/int_reward_func": 0.5, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.5, |
| "step": 390 |
| }, |
| { |
| "completion_length": 134.25, |
| "epoch": 0.052321691422454165, |
| "grad_norm": 0.30344611406326294, |
| "kl": 0.07609152793884277, |
| "learning_rate": 1.997412608268726e-05, |
| "loss": 0.003, |
| "reward": 2.9075000286102295, |
| "reward_std": 0.18499994277954102, |
| "rewards/correctness_reward_func": 2.0, |
| "rewards/int_reward_func": 0.5, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.4074999988079071, |
| "step": 391 |
| }, |
| { |
| "completion_length": 223.0, |
| "epoch": 0.05245550649003078, |
| "grad_norm": 0.16224347054958344, |
| "kl": 0.07183663547039032, |
| "learning_rate": 1.9973813546320412e-05, |
| "loss": 0.0029, |
| "reward": 2.625, |
| "reward_std": 0.75, |
| "rewards/correctness_reward_func": 1.5, |
| "rewards/int_reward_func": 0.5, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.125, |
| "rewards/xmlcount_reward_func": 0.5, |
| "step": 392 |
| }, |
| { |
| "completion_length": 257.0, |
| "epoch": 0.052589321557607385, |
| "grad_norm": 0.022620225325226784, |
| "kl": 0.04837610572576523, |
| "learning_rate": 1.997349913614761e-05, |
| "loss": 0.0019, |
| "reward": 3.0, |
| "reward_std": 0.0, |
| "rewards/correctness_reward_func": 2.0, |
| "rewards/int_reward_func": 0.5, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.5, |
| "step": 393 |
| }, |
| { |
| "completion_length": 342.5, |
| "epoch": 0.052723136625184, |
| "grad_norm": 0.16642311215400696, |
| "kl": 0.036274395883083344, |
| "learning_rate": 1.9973182852227917e-05, |
| "loss": 0.0015, |
| "reward": 2.125, |
| "reward_std": 1.0307763814926147, |
| "rewards/correctness_reward_func": 1.0, |
| "rewards/int_reward_func": 0.5, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.125, |
| "rewards/xmlcount_reward_func": 0.5, |
| "step": 394 |
| }, |
| { |
| "completion_length": 191.25, |
| "epoch": 0.052856951692760605, |
| "grad_norm": 0.20706374943256378, |
| "kl": 0.06688298285007477, |
| "learning_rate": 1.9972864694620767e-05, |
| "loss": 0.0027, |
| "reward": 2.25, |
| "reward_std": 1.5, |
| "rewards/correctness_reward_func": 1.5, |
| "rewards/int_reward_func": 0.375, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.375, |
| "step": 395 |
| }, |
| { |
| "completion_length": 124.75, |
| "epoch": 0.05299076676033721, |
| "grad_norm": 0.254744291305542, |
| "kl": 0.09049347043037415, |
| "learning_rate": 1.9972544663385927e-05, |
| "loss": 0.0036, |
| "reward": 2.902750015258789, |
| "reward_std": 0.19449996948242188, |
| "rewards/correctness_reward_func": 2.0, |
| "rewards/int_reward_func": 0.5, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.40275001525878906, |
| "step": 396 |
| }, |
| { |
| "completion_length": 184.5, |
| "epoch": 0.053124581827913825, |
| "grad_norm": 0.30488380789756775, |
| "kl": 0.08555734157562256, |
| "learning_rate": 1.997222275858352e-05, |
| "loss": 0.0034, |
| "reward": 2.375, |
| "reward_std": 1.25, |
| "rewards/correctness_reward_func": 1.5, |
| "rewards/int_reward_func": 0.375, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.5, |
| "step": 397 |
| }, |
| { |
| "completion_length": 187.5, |
| "epoch": 0.05325839689549043, |
| "grad_norm": 0.21778874099254608, |
| "kl": 0.04590243473649025, |
| "learning_rate": 1.9971898980274024e-05, |
| "loss": 0.0018, |
| "reward": 2.625, |
| "reward_std": 1.4361406564712524, |
| "rewards/correctness_reward_func": 1.5, |
| "rewards/int_reward_func": 0.375, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.25, |
| "rewards/xmlcount_reward_func": 0.5, |
| "step": 398 |
| }, |
| { |
| "completion_length": 273.25, |
| "epoch": 0.053392211963067045, |
| "grad_norm": 0.16300994157791138, |
| "kl": 0.040011197328567505, |
| "learning_rate": 1.9971573328518273e-05, |
| "loss": 0.0016, |
| "reward": 2.5, |
| "reward_std": 1.0, |
| "rewards/correctness_reward_func": 1.5, |
| "rewards/int_reward_func": 0.5, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": 0.5, |
| "step": 399 |
| }, |
| { |
| "completion_length": 267.25, |
| "epoch": 0.05352602703064365, |
| "grad_norm": 0.2036687582731247, |
| "kl": 0.04908774048089981, |
| "learning_rate": 1.997124580337744e-05, |
| "loss": 0.002, |
| "reward": 1.0625, |
| "reward_std": 0.3145764470100403, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.5, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.125, |
| "rewards/xmlcount_reward_func": 0.4375, |
| "step": 400 |
| } |
| ], |
| "logging_steps": 1, |
| "max_steps": 7473, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 1, |
| "save_steps": 100, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": false |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 0.0, |
| "train_batch_size": 1, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|