diff --git "a/trainer_state.json" "b/trainer_state.json" --- "a/trainer_state.json" +++ "b/trainer_state.json" @@ -1,3377 +1,49 @@ -{ - "best_metric": null, - "best_model_checkpoint": null, - "epoch": 1.0, - "eval_steps": 500, - "global_step": 209, - "is_hyper_param_search": false, - "is_local_process_zero": true, - "is_world_process_zero": true, - "log_history": [ - { - "completion_length": 691.5, - "epoch": 0.004784688995215311, - "grad_norm": 0.16871348023414612, - "kl": 0.0, - "learning_rate": 2.3809523809523811e-07, - "loss": -0.0, - "reward": -0.1928749978542328, - "reward_std": 0.5455328822135925, - "rewards/correctness_reward_func": 0.0, - "rewards/int_reward_func": 0.0, - "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.0, - "rewards/xmlcount_reward_func": -0.1928749978542328, - "step": 1 - }, - { - "completion_length": 407.0, - "epoch": 0.009569377990430622, - "grad_norm": 0.14342093467712402, - "kl": 0.0, - "learning_rate": 4.7619047619047623e-07, - "loss": -0.0, - "reward": -0.3125, - "reward_std": 0.5840203166007996, - "rewards/correctness_reward_func": 0.0, - "rewards/int_reward_func": 0.0, - "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.0, - "rewards/xmlcount_reward_func": -0.3125, - "step": 2 - }, - { - "completion_length": 552.5, - "epoch": 0.014354066985645933, - "grad_norm": 0.1257794201374054, - "kl": 5.403280283644563e-06, - "learning_rate": 7.142857142857143e-07, - "loss": 0.0, - "reward": -0.40062499046325684, - "reward_std": 0.889271080493927, - "rewards/correctness_reward_func": 0.0, - "rewards/int_reward_func": 0.0, - "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.0, - "rewards/xmlcount_reward_func": -0.40062499046325684, - "step": 3 - }, - { - "completion_length": 394.5, - "epoch": 0.019138755980861243, - "grad_norm": 4.7705951146781445e-05, - "kl": 5.921259798924439e-06, - "learning_rate": 9.523809523809525e-07, - "loss": 0.0, - "reward": 0.0, - "reward_std": 0.0, - "rewards/correctness_reward_func": 0.0, - "rewards/int_reward_func": 0.0, - "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.0, - "rewards/xmlcount_reward_func": 0.0, - "step": 4 - }, - { - "completion_length": 540.25, - "epoch": 0.023923444976076555, - "grad_norm": 0.16090112924575806, - "kl": 6.965140983083984e-06, - "learning_rate": 1.1904761904761906e-06, - "loss": 0.0, - "reward": 0.4424999952316284, - "reward_std": 1.171410083770752, - "rewards/correctness_reward_func": 0.5, - "rewards/int_reward_func": 0.125, - "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.0, - "rewards/xmlcount_reward_func": -0.18250000476837158, - "step": 5 - }, - { - "completion_length": 273.125, - "epoch": 0.028708133971291867, - "grad_norm": 0.17737899720668793, - "kl": 7.017245934548555e-06, - "learning_rate": 1.4285714285714286e-06, - "loss": 0.0, - "reward": -0.09487500786781311, - "reward_std": 0.23548395931720734, - "rewards/correctness_reward_func": 0.0, - "rewards/int_reward_func": 0.0, - "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.0, - "rewards/xmlcount_reward_func": -0.09487500786781311, - "step": 6 - }, - { - "completion_length": 863.0, - "epoch": 0.03349282296650718, - "grad_norm": 0.13521058857440948, - "kl": 8.277294909930788e-06, - "learning_rate": 1.6666666666666667e-06, - "loss": 0.0, - "reward": -0.4580000042915344, - "reward_std": 1.0750995874404907, - "rewards/correctness_reward_func": 0.0, - "rewards/int_reward_func": 0.125, - "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.0, - "rewards/xmlcount_reward_func": -0.5830000042915344, - "step": 7 - }, - { - "completion_length": 290.5, - "epoch": 0.03827751196172249, - "grad_norm": 0.13415156304836273, - "kl": 5.513447831617668e-06, - "learning_rate": 1.904761904761905e-06, - "loss": 0.0, - "reward": 0.11012500524520874, - "reward_std": 0.7398378849029541, - "rewards/correctness_reward_func": 0.25, - "rewards/int_reward_func": 0.0625, - "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.0, - "rewards/xmlcount_reward_func": -0.20237499475479126, - "step": 8 - }, - { - "completion_length": 420.875, - "epoch": 0.0430622009569378, - "grad_norm": 0.09467775374650955, - "kl": 5.87351632930222e-06, - "learning_rate": 2.1428571428571427e-06, - "loss": 0.0, - "reward": 0.02187499962747097, - "reward_std": 0.06187184154987335, - "rewards/correctness_reward_func": 0.0, - "rewards/int_reward_func": 0.0, - "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.0, - "rewards/xmlcount_reward_func": 0.02187499962747097, - "step": 9 - }, - { - "completion_length": 585.25, - "epoch": 0.04784688995215311, - "grad_norm": 0.17043040692806244, - "kl": 7.560887297586305e-06, - "learning_rate": 2.380952380952381e-06, - "loss": 0.0, - "reward": -0.10100000351667404, - "reward_std": 0.2856711447238922, - "rewards/correctness_reward_func": 0.0, - "rewards/int_reward_func": 0.0, - "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.0, - "rewards/xmlcount_reward_func": -0.10100000351667404, - "step": 10 - }, - { - "completion_length": 373.125, - "epoch": 0.05263157894736842, - "grad_norm": 0.21753454208374023, - "kl": 8.626009730505757e-06, - "learning_rate": 2.6190476190476192e-06, - "loss": 0.0, - "reward": -0.09137500077486038, - "reward_std": 0.34128451347351074, - "rewards/correctness_reward_func": 0.0, - "rewards/int_reward_func": 0.0, - "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.0, - "rewards/xmlcount_reward_func": -0.09137500077486038, - "step": 11 - }, - { - "completion_length": 219.0, - "epoch": 0.05741626794258373, - "grad_norm": 7.077892223605886e-05, - "kl": 3.93135633203201e-06, - "learning_rate": 2.8571428571428573e-06, - "loss": 0.0, - "reward": 0.0, - "reward_std": 0.0, - "rewards/correctness_reward_func": 0.0, - "rewards/int_reward_func": 0.0, - "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.0, - "rewards/xmlcount_reward_func": 0.0, - "step": 12 - }, - { - "completion_length": 411.25, - "epoch": 0.06220095693779904, - "grad_norm": 0.21400289237499237, - "kl": 1.1138845366076566e-05, - "learning_rate": 3.0952380952380957e-06, - "loss": 0.0, - "reward": -0.23725000023841858, - "reward_std": 0.4926139712333679, - "rewards/correctness_reward_func": 0.0, - "rewards/int_reward_func": 0.0, - "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.0, - "rewards/xmlcount_reward_func": -0.23725000023841858, - "step": 13 - }, - { - "completion_length": 252.875, - "epoch": 0.06698564593301436, - "grad_norm": 0.21576255559921265, - "kl": 9.548981324769557e-06, - "learning_rate": 3.3333333333333333e-06, - "loss": 0.0, - "reward": -0.12124999612569809, - "reward_std": 0.38313212990760803, - "rewards/correctness_reward_func": 0.0, - "rewards/int_reward_func": 0.0, - "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.0, - "rewards/xmlcount_reward_func": -0.12124999612569809, - "step": 14 - }, - { - "completion_length": 281.375, - "epoch": 0.07177033492822966, - "grad_norm": 0.2283598929643631, - "kl": 1.0562575880612712e-05, - "learning_rate": 3.5714285714285718e-06, - "loss": 0.0, - "reward": 0.25187501311302185, - "reward_std": 1.467455267906189, - "rewards/correctness_reward_func": 0.5, - "rewards/int_reward_func": 0.25, - "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.0, - "rewards/xmlcount_reward_func": -0.49812498688697815, - "step": 15 - }, - { - "completion_length": 369.375, - "epoch": 0.07655502392344497, - "grad_norm": 0.14756156504154205, - "kl": 7.879279110056814e-06, - "learning_rate": 3.80952380952381e-06, - "loss": 0.0, - "reward": -0.05525000020861626, - "reward_std": 0.1562705934047699, - "rewards/correctness_reward_func": 0.0, - "rewards/int_reward_func": 0.0, - "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.0, - "rewards/xmlcount_reward_func": -0.05525000020861626, - "step": 16 - }, - { - "completion_length": 457.625, - "epoch": 0.08133971291866028, - "grad_norm": 0.2603680491447449, - "kl": 1.730734402372036e-05, - "learning_rate": 4.047619047619048e-06, - "loss": 0.0, - "reward": 0.0963749885559082, - "reward_std": 1.1280322074890137, - "rewards/correctness_reward_func": 0.25, - "rewards/int_reward_func": 0.25, - "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.0, - "rewards/xmlcount_reward_func": -0.4036250114440918, - "step": 17 - }, - { - "completion_length": 386.625, - "epoch": 0.0861244019138756, - "grad_norm": 0.00014964047295507044, - "kl": 9.983908967114985e-06, - "learning_rate": 4.2857142857142855e-06, - "loss": 0.0, - "reward": 0.0, - "reward_std": 0.0, - "rewards/correctness_reward_func": 0.0, - "rewards/int_reward_func": 0.0, - "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.0, - "rewards/xmlcount_reward_func": 0.0, - "step": 18 - }, - { - "completion_length": 440.75, - "epoch": 0.09090909090909091, - "grad_norm": 0.00017180817667394876, - "kl": 1.582676668476779e-05, - "learning_rate": 4.523809523809524e-06, - "loss": 0.0, - "reward": 0.0, - "reward_std": 0.0, - "rewards/correctness_reward_func": 0.0, - "rewards/int_reward_func": 0.0, - "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.0, - "rewards/xmlcount_reward_func": 0.0, - "step": 19 - }, - { - "completion_length": 523.125, - "epoch": 0.09569377990430622, - "grad_norm": 0.1381252110004425, - "kl": 1.0601282156130765e-05, - "learning_rate": 4.761904761904762e-06, - "loss": 0.0, - "reward": 0.04699999839067459, - "reward_std": 0.13293607532978058, - "rewards/correctness_reward_func": 0.0, - "rewards/int_reward_func": 0.0, - "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.0, - "rewards/xmlcount_reward_func": 0.04699999839067459, - "step": 20 - }, - { - "completion_length": 345.125, - "epoch": 0.10047846889952153, - "grad_norm": 0.16844867169857025, - "kl": 1.5000337043602485e-05, - "learning_rate": 5e-06, - "loss": 0.0, - "reward": 0.28575000166893005, - "reward_std": 0.8961601853370667, - "rewards/correctness_reward_func": 0.25, - "rewards/int_reward_func": 0.125, - "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.0, - "rewards/xmlcount_reward_func": -0.08924999833106995, - "step": 21 - }, - { - "completion_length": 184.25, - "epoch": 0.10526315789473684, - "grad_norm": 0.0002866210415959358, - "kl": 1.6033938663895242e-05, - "learning_rate": 4.999650952964643e-06, - "loss": 0.0, - "reward": 0.0, - "reward_std": 0.0, - "rewards/correctness_reward_func": 0.0, - "rewards/int_reward_func": 0.0, - "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.0, - "rewards/xmlcount_reward_func": 0.0, - "step": 22 - }, - { - "completion_length": 389.625, - "epoch": 0.11004784688995216, - "grad_norm": 0.22452810406684875, - "kl": 3.2393058063462377e-05, - "learning_rate": 4.998603909325636e-06, - "loss": 0.0, - "reward": 0.28337499499320984, - "reward_std": 0.8015055060386658, - "rewards/correctness_reward_func": 0.25, - "rewards/int_reward_func": 0.0625, - "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.0, - "rewards/xmlcount_reward_func": -0.029124999418854713, - "step": 23 - }, - { - "completion_length": 335.375, - "epoch": 0.11483253588516747, - "grad_norm": 0.0004265092429704964, - "kl": 2.8664630008279346e-05, - "learning_rate": 4.996859161456965e-06, - "loss": 0.0, - "reward": 0.0, - "reward_std": 0.0, - "rewards/correctness_reward_func": 0.0, - "rewards/int_reward_func": 0.0, - "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.0, - "rewards/xmlcount_reward_func": 0.0, - "step": 24 - }, - { - "completion_length": 231.0, - "epoch": 0.11961722488038277, - "grad_norm": 0.322028785943985, - "kl": 3.968990858993493e-05, - "learning_rate": 4.994417196557884e-06, - "loss": 0.0, - "reward": 0.057500001043081284, - "reward_std": 0.16263455152511597, - "rewards/correctness_reward_func": 0.0, - "rewards/int_reward_func": 0.0625, - "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.0, - "rewards/xmlcount_reward_func": -0.004999999888241291, - "step": 25 - }, - { - "completion_length": 741.0, - "epoch": 0.12440191387559808, - "grad_norm": 0.11335761845111847, - "kl": 2.1649524569511414e-05, - "learning_rate": 4.991278696516879e-06, - "loss": 0.0, - "reward": -0.26512500643730164, - "reward_std": 0.6683556437492371, - "rewards/correctness_reward_func": 0.0, - "rewards/int_reward_func": 0.0625, - "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.0, - "rewards/xmlcount_reward_func": -0.32762500643730164, - "step": 26 - }, - { - "completion_length": 579.375, - "epoch": 0.1291866028708134, - "grad_norm": 0.16209539771080017, - "kl": 3.5106801078654826e-05, - "learning_rate": 4.98744453772126e-06, - "loss": 0.0, - "reward": -0.37037500739097595, - "reward_std": 0.5334842205047607, - "rewards/correctness_reward_func": 0.0, - "rewards/int_reward_func": 0.0625, - "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.0, - "rewards/xmlcount_reward_func": -0.43287500739097595, - "step": 27 - }, - { - "completion_length": 468.625, - "epoch": 0.1339712918660287, - "grad_norm": 0.14830249547958374, - "kl": 5.958353722235188e-05, - "learning_rate": 4.982915790812436e-06, - "loss": 0.0, - "reward": -0.484375, - "reward_std": 1.0699535608291626, - "rewards/correctness_reward_func": 0.0, - "rewards/int_reward_func": 0.0, - "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.0, - "rewards/xmlcount_reward_func": -0.484375, - "step": 28 - }, - { - "completion_length": 422.625, - "epoch": 0.13875598086124402, - "grad_norm": 0.1415875107049942, - "kl": 4.186444130027667e-05, - "learning_rate": 4.977693720386951e-06, - "loss": 0.0, - "reward": -0.14512500166893005, - "reward_std": 0.27205851674079895, - "rewards/correctness_reward_func": 0.0, - "rewards/int_reward_func": 0.0, - "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.0, - "rewards/xmlcount_reward_func": -0.14512500166893005, - "step": 29 - }, - { - "completion_length": 538.0, - "epoch": 0.14354066985645933, - "grad_norm": 0.0002491377235855907, - "kl": 2.5585266484995373e-05, - "learning_rate": 4.9717797846433655e-06, - "loss": 0.0, - "reward": 0.0, - "reward_std": 0.0, - "rewards/correctness_reward_func": 0.0, - "rewards/int_reward_func": 0.0, - "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.0, - "rewards/xmlcount_reward_func": 0.0, - "step": 30 - }, - { - "completion_length": 481.0, - "epoch": 0.14832535885167464, - "grad_norm": 0.1954212486743927, - "kl": 4.6573721192544326e-05, - "learning_rate": 4.965175634975072e-06, - "loss": 0.0, - "reward": -0.035875000059604645, - "reward_std": 0.10146982967853546, - "rewards/correctness_reward_func": 0.0, - "rewards/int_reward_func": 0.0625, - "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.0, - "rewards/xmlcount_reward_func": -0.09837500005960464, - "step": 31 - }, - { - "completion_length": 458.5, - "epoch": 0.15311004784688995, - "grad_norm": 0.1300901472568512, - "kl": 3.143128560623154e-05, - "learning_rate": 4.9578831155091585e-06, - "loss": 0.0, - "reward": -0.1340000033378601, - "reward_std": 0.2986603379249573, - "rewards/correctness_reward_func": 0.0, - "rewards/int_reward_func": 0.0, - "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.0, - "rewards/xmlcount_reward_func": -0.1340000033378601, - "step": 32 - }, - { - "completion_length": 277.375, - "epoch": 0.15789473684210525, - "grad_norm": 0.0005956153036095202, - "kl": 6.650951399933547e-05, - "learning_rate": 4.949904262591467e-06, - "loss": 0.0, - "reward": 0.0, - "reward_std": 0.0, - "rewards/correctness_reward_func": 0.0, - "rewards/int_reward_func": 0.0, - "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.0, - "rewards/xmlcount_reward_func": 0.0, - "step": 33 - }, - { - "completion_length": 224.5, - "epoch": 0.16267942583732056, - "grad_norm": 0.0008701166370883584, - "kl": 8.910963515518233e-05, - "learning_rate": 4.941241304217962e-06, - "loss": 0.0, - "reward": 0.0, - "reward_std": 0.0, - "rewards/correctness_reward_func": 0.0, - "rewards/int_reward_func": 0.0, - "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.0, - "rewards/xmlcount_reward_func": 0.0, - "step": 34 - }, - { - "completion_length": 571.875, - "epoch": 0.1674641148325359, - "grad_norm": 0.08852925151586533, - "kl": 8.730205445317551e-05, - "learning_rate": 4.931896659412593e-06, - "loss": 0.0, - "reward": 0.10175000131130219, - "reward_std": 0.8402824401855469, - "rewards/correctness_reward_func": 0.25, - "rewards/int_reward_func": 0.0625, - "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.0, - "rewards/xmlcount_reward_func": -0.21074999868869781, - "step": 35 - }, - { - "completion_length": 378.875, - "epoch": 0.1722488038277512, - "grad_norm": 0.0003980418259743601, - "kl": 5.1553357479861006e-05, - "learning_rate": 4.921872937551814e-06, - "loss": 0.0, - "reward": 0.0, - "reward_std": 0.0, - "rewards/correctness_reward_func": 0.0, - "rewards/int_reward_func": 0.0, - "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.0, - "rewards/xmlcount_reward_func": 0.0, - "step": 36 - }, - { - "completion_length": 381.25, - "epoch": 0.17703349282296652, - "grad_norm": 0.00038408322143368423, - "kl": 5.539537232834846e-05, - "learning_rate": 4.911172937635942e-06, - "loss": 0.0, - "reward": 0.0, - "reward_std": 0.0, - "rewards/correctness_reward_func": 0.0, - "rewards/int_reward_func": 0.0, - "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.0, - "rewards/xmlcount_reward_func": 0.0, - "step": 37 - }, - { - "completion_length": 484.875, - "epoch": 0.18181818181818182, - "grad_norm": 0.0009112621773965657, - "kl": 0.00013256669626571238, - "learning_rate": 4.899799647507577e-06, - "loss": 0.0, - "reward": 0.0, - "reward_std": 0.0, - "rewards/correctness_reward_func": 0.0, - "rewards/int_reward_func": 0.0, - "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.0, - "rewards/xmlcount_reward_func": 0.0, - "step": 38 - }, - { - "completion_length": 709.125, - "epoch": 0.18660287081339713, - "grad_norm": 0.0746121034026146, - "kl": 4.9048678192775697e-05, - "learning_rate": 4.887756243017282e-06, - "loss": 0.0, - "reward": -0.5247499942779541, - "reward_std": 1.4842170476913452, - "rewards/correctness_reward_func": 0.0, - "rewards/int_reward_func": 0.0, - "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.0, - "rewards/xmlcount_reward_func": -0.5247499942779541, - "step": 39 - }, - { - "completion_length": 455.875, - "epoch": 0.19138755980861244, - "grad_norm": 0.1864699274301529, - "kl": 9.326951840193942e-05, - "learning_rate": 4.87504608713676e-06, - "loss": 0.0, - "reward": -0.12825000286102295, - "reward_std": 0.41556185483932495, - "rewards/correctness_reward_func": 0.0, - "rewards/int_reward_func": 0.0, - "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.0, - "rewards/xmlcount_reward_func": -0.12825000286102295, - "step": 40 - }, - { - "completion_length": 641.875, - "epoch": 0.19617224880382775, - "grad_norm": 0.0006094383425079286, - "kl": 8.655336569063365e-05, - "learning_rate": 4.861672729019798e-06, - "loss": 0.0, - "reward": 0.0, - "reward_std": 0.0, - "rewards/correctness_reward_func": 0.0, - "rewards/int_reward_func": 0.0, - "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.0, - "rewards/xmlcount_reward_func": 0.0, - "step": 41 - }, - { - "completion_length": 497.125, - "epoch": 0.20095693779904306, - "grad_norm": 0.00045034781214781106, - "kl": 7.392139377770945e-05, - "learning_rate": 4.847639903011196e-06, - "loss": 0.0, - "reward": 0.0, - "reward_std": 0.0, - "rewards/correctness_reward_func": 0.0, - "rewards/int_reward_func": 0.0, - "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.0, - "rewards/xmlcount_reward_func": 0.0, - "step": 42 - }, - { - "completion_length": 292.75, - "epoch": 0.20574162679425836, - "grad_norm": 0.1903112530708313, - "kl": 0.00015642325161024928, - "learning_rate": 4.832951527604007e-06, - "loss": 0.0, - "reward": 0.2068749964237213, - "reward_std": 0.49162107706069946, - "rewards/correctness_reward_func": 0.25, - "rewards/int_reward_func": 0.0625, - "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.0, - "rewards/xmlcount_reward_func": -0.10562499612569809, - "step": 43 - }, - { - "completion_length": 751.25, - "epoch": 0.21052631578947367, - "grad_norm": 0.00024830171605572104, - "kl": 4.184572389931418e-05, - "learning_rate": 4.817611704345344e-06, - "loss": 0.0, - "reward": 0.0, - "reward_std": 0.0, - "rewards/correctness_reward_func": 0.0, - "rewards/int_reward_func": 0.0, - "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.0, - "rewards/xmlcount_reward_func": 0.0, - "step": 44 - }, - { - "completion_length": 330.125, - "epoch": 0.215311004784689, - "grad_norm": 0.30768927931785583, - "kl": 0.0003127607924398035, - "learning_rate": 4.801624716691072e-06, - "loss": 0.0, - "reward": -0.3476249873638153, - "reward_std": 0.7044243812561035, - "rewards/correctness_reward_func": 0.0, - "rewards/int_reward_func": 0.125, - "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.0, - "rewards/xmlcount_reward_func": -0.4726249873638153, - "step": 45 - }, - { - "completion_length": 415.625, - "epoch": 0.22009569377990432, - "grad_norm": 0.16692359745502472, - "kl": 0.00011098584946012124, - "learning_rate": 4.784995028809707e-06, - "loss": 0.0, - "reward": 0.014750003814697266, - "reward_std": 1.0180972814559937, - "rewards/correctness_reward_func": 0.25, - "rewards/int_reward_func": 0.0625, - "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.0, - "rewards/xmlcount_reward_func": -0.29774999618530273, - "step": 46 - }, - { - "completion_length": 394.375, - "epoch": 0.22488038277511962, - "grad_norm": 0.0005798178026452661, - "kl": 8.575244282837957e-05, - "learning_rate": 4.767727284335852e-06, - "loss": 0.0, - "reward": 0.0, - "reward_std": 0.0, - "rewards/correctness_reward_func": 0.0, - "rewards/int_reward_func": 0.0, - "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.0, - "rewards/xmlcount_reward_func": 0.0, - "step": 47 - }, - { - "completion_length": 246.75, - "epoch": 0.22966507177033493, - "grad_norm": 0.28091752529144287, - "kl": 0.00018115453713107854, - "learning_rate": 4.74982630507352e-06, - "loss": 0.0, - "reward": 0.0022499999031424522, - "reward_std": 0.006363960448652506, - "rewards/correctness_reward_func": 0.0, - "rewards/int_reward_func": 0.0, - "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.0, - "rewards/xmlcount_reward_func": 0.0022499999031424522, - "step": 48 - }, - { - "completion_length": 438.5, - "epoch": 0.23444976076555024, - "grad_norm": 0.16050425171852112, - "kl": 0.00014010320592205971, - "learning_rate": 4.731297089649704e-06, - "loss": 0.0, - "reward": -0.18050000071525574, - "reward_std": 0.5105310678482056, - "rewards/correctness_reward_func": 0.0, - "rewards/int_reward_func": 0.0, - "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.0, - "rewards/xmlcount_reward_func": -0.18050000071525574, - "step": 49 - }, - { - "completion_length": 311.5, - "epoch": 0.23923444976076555, - "grad_norm": 0.25653553009033203, - "kl": 0.0002142376033589244, - "learning_rate": 4.7121448121185716e-06, - "loss": 0.0, - "reward": 0.010124999098479748, - "reward_std": 0.08468502014875412, - "rewards/correctness_reward_func": 0.0, - "rewards/int_reward_func": 0.0625, - "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.0, - "rewards/xmlcount_reward_func": -0.05237499997019768, - "step": 50 - }, - { - "completion_length": 206.125, - "epoch": 0.24401913875598086, - "grad_norm": 0.24129430949687958, - "kl": 0.00015309952141251415, - "learning_rate": 4.692374820516679e-06, - "loss": 0.0, - "reward": -0.03137499839067459, - "reward_std": 0.1459578573703766, - "rewards/correctness_reward_func": 0.0, - "rewards/int_reward_func": 0.0, - "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.0, - "rewards/xmlcount_reward_func": -0.03137499839067459, - "step": 51 - }, - { - "completion_length": 312.625, - "epoch": 0.24880382775119617, - "grad_norm": 0.0005034372443333268, - "kl": 7.53342974348925e-05, - "learning_rate": 4.671992635369592e-06, - "loss": 0.0, - "reward": 0.0, - "reward_std": 0.0, - "rewards/correctness_reward_func": 0.0, - "rewards/int_reward_func": 0.0, - "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.0, - "rewards/xmlcount_reward_func": 0.0, - "step": 52 - }, - { - "completion_length": 385.0, - "epoch": 0.2535885167464115, - "grad_norm": 0.0004993649199604988, - "kl": 9.168663382297382e-05, - "learning_rate": 4.651003948150349e-06, - "loss": 0.0, - "reward": 0.0, - "reward_std": 0.0, - "rewards/correctness_reward_func": 0.0, - "rewards/int_reward_func": 0.0, - "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.0, - "rewards/xmlcount_reward_func": 0.0, - "step": 53 - }, - { - "completion_length": 166.0, - "epoch": 0.2583732057416268, - "grad_norm": 0.001001685275696218, - "kl": 0.00014484986604657024, - "learning_rate": 4.62941461969019e-06, - "loss": 0.0, - "reward": 0.0, - "reward_std": 0.0, - "rewards/correctness_reward_func": 0.0, - "rewards/int_reward_func": 0.0, - "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.0, - "rewards/xmlcount_reward_func": 0.0, - "step": 54 - }, - { - "completion_length": 338.875, - "epoch": 0.2631578947368421, - "grad_norm": 0.14065545797348022, - "kl": 0.00018690418801270425, - "learning_rate": 4.607230678541993e-06, - "loss": 0.0, - "reward": -0.38087502121925354, - "reward_std": 1.18196439743042, - "rewards/correctness_reward_func": 0.0, - "rewards/int_reward_func": 0.0, - "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.0, - "rewards/xmlcount_reward_func": -0.38087502121925354, - "step": 55 - }, - { - "completion_length": 483.75, - "epoch": 0.2679425837320574, - "grad_norm": 0.13930781185626984, - "kl": 0.0001991519093280658, - "learning_rate": 4.584458319296868e-06, - "loss": 0.0, - "reward": -0.765874981880188, - "reward_std": 1.2982186079025269, - "rewards/correctness_reward_func": 0.0, - "rewards/int_reward_func": 0.0625, - "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.0, - "rewards/xmlcount_reward_func": -0.828374981880188, - "step": 56 - }, - { - "completion_length": 582.875, - "epoch": 0.2727272727272727, - "grad_norm": 0.11850852519273758, - "kl": 8.17330801510252e-05, - "learning_rate": 4.561103900854401e-06, - "loss": 0.0, - "reward": -0.17162500321865082, - "reward_std": 0.5060643553733826, - "rewards/correctness_reward_func": 0.0, - "rewards/int_reward_func": 0.0, - "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.0, - "rewards/xmlcount_reward_func": -0.171625018119812, - "step": 57 - }, - { - "completion_length": 242.25, - "epoch": 0.27751196172248804, - "grad_norm": 0.0009432674269191921, - "kl": 0.00014569271297659725, - "learning_rate": 4.5371739446470085e-06, - "loss": 0.0, - "reward": 0.0, - "reward_std": 0.0, - "rewards/correctness_reward_func": 0.0, - "rewards/int_reward_func": 0.0, - "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.0, - "rewards/xmlcount_reward_func": 0.0, - "step": 58 - }, - { - "completion_length": 183.375, - "epoch": 0.2822966507177033, - "grad_norm": 0.23372147977352142, - "kl": 0.0002689736138563603, - "learning_rate": 4.512675132818908e-06, - "loss": 0.0, - "reward": 0.9496250152587891, - "reward_std": 1.3218873739242554, - "rewards/correctness_reward_func": 0.75, - "rewards/int_reward_func": 0.1875, - "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.0, - "rewards/xmlcount_reward_func": 0.012125000357627869, - "step": 59 - }, - { - "completion_length": 442.875, - "epoch": 0.28708133971291866, - "grad_norm": 0.20401550829410553, - "kl": 0.00011811378499260172, - "learning_rate": 4.487614306360208e-06, - "loss": 0.0, - "reward": -0.008375000208616257, - "reward_std": 0.18639278411865234, - "rewards/correctness_reward_func": 0.0, - "rewards/int_reward_func": 0.0, - "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.0, - "rewards/xmlcount_reward_func": -0.008375000208616257, - "step": 60 - }, - { - "completion_length": 294.0, - "epoch": 0.291866028708134, - "grad_norm": 0.16501188278198242, - "kl": 0.00019776706176344305, - "learning_rate": 4.461998463196653e-06, - "loss": 0.0, - "reward": -0.14887499809265137, - "reward_std": 0.421082079410553, - "rewards/correctness_reward_func": 0.0, - "rewards/int_reward_func": 0.0, - "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.0, - "rewards/xmlcount_reward_func": -0.14887499809265137, - "step": 61 - }, - { - "completion_length": 471.875, - "epoch": 0.2966507177033493, - "grad_norm": 0.14112193882465363, - "kl": 0.00012477418931666762, - "learning_rate": 4.435834756235534e-06, - "loss": 0.0, - "reward": -0.05587500333786011, - "reward_std": 0.3656686544418335, - "rewards/correctness_reward_func": 0.0, - "rewards/int_reward_func": 0.0, - "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.0, - "rewards/xmlcount_reward_func": -0.05587500333786011, - "step": 62 - }, - { - "completion_length": 720.625, - "epoch": 0.3014354066985646, - "grad_norm": 0.15206323564052582, - "kl": 0.00012914805847685784, - "learning_rate": 4.409130491368331e-06, - "loss": 0.0, - "reward": 0.015625, - "reward_std": 0.04419417306780815, - "rewards/correctness_reward_func": 0.0, - "rewards/int_reward_func": 0.0, - "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.0, - "rewards/xmlcount_reward_func": 0.015625, - "step": 63 - }, - { - "completion_length": 193.25, - "epoch": 0.3062200956937799, - "grad_norm": 0.2196258306503296, - "kl": 0.0005463301204144955, - "learning_rate": 4.381893125430629e-06, - "loss": 0.0, - "reward": 0.015625, - "reward_std": 0.04419417306780815, - "rewards/correctness_reward_func": 0.0, - "rewards/int_reward_func": 0.0, - "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.0, - "rewards/xmlcount_reward_func": 0.015625, - "step": 64 - }, - { - "completion_length": 752.125, - "epoch": 0.31100478468899523, - "grad_norm": 0.0012149108806625009, - "kl": 0.0001829929678933695, - "learning_rate": 4.354130264119894e-06, - "loss": 0.0, - "reward": 0.0, - "reward_std": 0.0, - "rewards/correctness_reward_func": 0.0, - "rewards/int_reward_func": 0.0, - "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.0, - "rewards/xmlcount_reward_func": 0.0, - "step": 65 - }, - { - "completion_length": 446.25, - "epoch": 0.3157894736842105, - "grad_norm": 0.2558337450027466, - "kl": 0.00029457241180352867, - "learning_rate": 4.325849659871674e-06, - "loss": 0.0, - "reward": -0.04975000023841858, - "reward_std": 0.257072389125824, - "rewards/correctness_reward_func": 0.0, - "rewards/int_reward_func": 0.0, - "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.0, - "rewards/xmlcount_reward_func": -0.04975000023841858, - "step": 66 - }, - { - "completion_length": 305.75, - "epoch": 0.32057416267942584, - "grad_norm": 0.14558549225330353, - "kl": 0.00023881300876382738, - "learning_rate": 4.297059209694824e-06, - "loss": 0.0, - "reward": -0.5303750038146973, - "reward_std": 1.5001270771026611, - "rewards/correctness_reward_func": 0.0, - "rewards/int_reward_func": 0.0, - "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.0, - "rewards/xmlcount_reward_func": -0.5303750038146973, - "step": 67 - }, - { - "completion_length": 337.125, - "epoch": 0.3253588516746411, - "grad_norm": 0.1623697131872177, - "kl": 0.0002290508127771318, - "learning_rate": 4.267766952966369e-06, - "loss": 0.0, - "reward": 0.015625, - "reward_std": 0.04419417306780815, - "rewards/correctness_reward_func": 0.0, - "rewards/int_reward_func": 0.0, - "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.0, - "rewards/xmlcount_reward_func": 0.015625, - "step": 68 - }, - { - "completion_length": 335.625, - "epoch": 0.33014354066985646, - "grad_norm": 0.19331970810890198, - "kl": 0.00046445277985185385, - "learning_rate": 4.237981069186606e-06, - "loss": 0.0, - "reward": -0.25737500190734863, - "reward_std": 0.7348853945732117, - "rewards/correctness_reward_func": 0.0, - "rewards/int_reward_func": 0.0, - "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.0, - "rewards/xmlcount_reward_func": -0.25737497210502625, - "step": 69 - }, - { - "completion_length": 482.125, - "epoch": 0.3349282296650718, - "grad_norm": 0.0010412519332021475, - "kl": 0.00024410485639236867, - "learning_rate": 4.207709875695078e-06, - "loss": 0.0, - "reward": 0.0, - "reward_std": 0.0, - "rewards/correctness_reward_func": 0.0, - "rewards/int_reward_func": 0.0, - "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.0, - "rewards/xmlcount_reward_func": 0.0, - "step": 70 - }, - { - "completion_length": 253.625, - "epoch": 0.3397129186602871, - "grad_norm": 0.0011541300918906927, - "kl": 0.0002700735640246421, - "learning_rate": 4.176961825348059e-06, - "loss": 0.0, - "reward": 0.0, - "reward_std": 0.0, - "rewards/correctness_reward_func": 0.0, - "rewards/int_reward_func": 0.0, - "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.0, - "rewards/xmlcount_reward_func": 0.0, - "step": 71 - }, - { - "completion_length": 315.625, - "epoch": 0.3444976076555024, - "grad_norm": 0.17885912954807281, - "kl": 0.00044736871495842934, - "learning_rate": 4.1457455041582044e-06, - "loss": 0.0, - "reward": -0.21512500941753387, - "reward_std": 0.39851561188697815, - "rewards/correctness_reward_func": 0.0, - "rewards/int_reward_func": 0.0625, - "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.0, - "rewards/xmlcount_reward_func": -0.27762502431869507, - "step": 72 - }, - { - "completion_length": 440.75, - "epoch": 0.3492822966507177, - "grad_norm": 0.0013392317341640592, - "kl": 0.0003827627224382013, - "learning_rate": 4.114069628897006e-06, - "loss": 0.0, - "reward": 0.0, - "reward_std": 0.0, - "rewards/correctness_reward_func": 0.0, - "rewards/int_reward_func": 0.0, - "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.0, - "rewards/xmlcount_reward_func": 0.0, - "step": 73 - }, - { - "completion_length": 497.625, - "epoch": 0.35406698564593303, - "grad_norm": 0.22193162143230438, - "kl": 0.0005096244858577847, - "learning_rate": 4.081943044660746e-06, - "loss": 0.0, - "reward": 0.04724999889731407, - "reward_std": 0.13364318013191223, - "rewards/correctness_reward_func": 0.0, - "rewards/int_reward_func": 0.0, - "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.0, - "rewards/xmlcount_reward_func": 0.04724999889731407, - "step": 74 - }, - { - "completion_length": 244.625, - "epoch": 0.3588516746411483, - "grad_norm": 0.2988503873348236, - "kl": 0.001066714059561491, - "learning_rate": 4.049374722400613e-06, - "loss": 0.0, - "reward": 0.04987499862909317, - "reward_std": 0.09467905759811401, - "rewards/correctness_reward_func": 0.0, - "rewards/int_reward_func": 0.0, - "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.0, - "rewards/xmlcount_reward_func": 0.04987499862909317, - "step": 75 - }, - { - "completion_length": 391.625, - "epoch": 0.36363636363636365, - "grad_norm": 0.15623344480991364, - "kl": 0.0004087413544766605, - "learning_rate": 4.016373756417669e-06, - "loss": 0.0, - "reward": 0.04125000163912773, - "reward_std": 0.11667262762784958, - "rewards/correctness_reward_func": 0.0, - "rewards/int_reward_func": 0.0, - "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.0, - "rewards/xmlcount_reward_func": 0.04125000163912773, - "step": 76 - }, - { - "completion_length": 467.625, - "epoch": 0.3684210526315789, - "grad_norm": 0.1304023414850235, - "kl": 0.00029414560412988067, - "learning_rate": 3.982949361823388e-06, - "loss": 0.0, - "reward": 0.04699999839067459, - "reward_std": 0.13293607532978058, - "rewards/correctness_reward_func": 0.0, - "rewards/int_reward_func": 0.0, - "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.0, - "rewards/xmlcount_reward_func": 0.04699999839067459, - "step": 77 - }, - { - "completion_length": 293.875, - "epoch": 0.37320574162679426, - "grad_norm": 0.18905651569366455, - "kl": 0.0007206588634289801, - "learning_rate": 3.949110871966444e-06, - "loss": 0.0, - "reward": -0.0951249971985817, - "reward_std": 0.269054114818573, - "rewards/correctness_reward_func": 0.0, - "rewards/int_reward_func": 0.0, - "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.0, - "rewards/xmlcount_reward_func": -0.0951249971985817, - "step": 78 - }, - { - "completion_length": 340.25, - "epoch": 0.37799043062200954, - "grad_norm": 0.16222861409187317, - "kl": 0.0009388496982865036, - "learning_rate": 3.914867735826489e-06, - "loss": 0.0, - "reward": -0.0012499999720603228, - "reward_std": 0.0035355337895452976, - "rewards/correctness_reward_func": 0.0, - "rewards/int_reward_func": 0.0, - "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.0, - "rewards/xmlcount_reward_func": -0.0012499999720603228, - "step": 79 - }, - { - "completion_length": 248.75, - "epoch": 0.3827751196172249, - "grad_norm": 0.0018477136036381125, - "kl": 0.0005005531711503863, - "learning_rate": 3.880229515375642e-06, - "loss": 0.0, - "reward": 0.0, - "reward_std": 0.0, - "rewards/correctness_reward_func": 0.0, - "rewards/int_reward_func": 0.0, - "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.0, - "rewards/xmlcount_reward_func": 0.0, - "step": 80 - }, - { - "completion_length": 398.5, - "epoch": 0.3875598086124402, - "grad_norm": 0.0027135207783430815, - "kl": 0.0006622711080126464, - "learning_rate": 3.845205882908432e-06, - "loss": 0.0, - "reward": 0.0, - "reward_std": 0.0, - "rewards/correctness_reward_func": 0.0, - "rewards/int_reward_func": 0.0, - "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.0, - "rewards/xmlcount_reward_func": 0.0, - "step": 81 - }, - { - "completion_length": 226.0, - "epoch": 0.3923444976076555, - "grad_norm": 0.22955507040023804, - "kl": 0.0008301762863993645, - "learning_rate": 3.8098066183409223e-06, - "loss": 0.0, - "reward": 0.18949998915195465, - "reward_std": 0.773678183555603, - "rewards/correctness_reward_func": 0.25, - "rewards/int_reward_func": 0.125, - "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.0, - "rewards/xmlcount_reward_func": -0.18549999594688416, - "step": 82 - }, - { - "completion_length": 392.75, - "epoch": 0.39712918660287083, - "grad_norm": 0.24336187541484833, - "kl": 0.0009957973379641771, - "learning_rate": 3.774041606479794e-06, - "loss": 0.0, - "reward": -0.10050000250339508, - "reward_std": 0.20037394762039185, - "rewards/correctness_reward_func": 0.0, - "rewards/int_reward_func": 0.0, - "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.0, - "rewards/xmlcount_reward_func": -0.10050000250339508, - "step": 83 - }, - { - "completion_length": 523.5, - "epoch": 0.4019138755980861, - "grad_norm": 0.0021875619422644377, - "kl": 0.0005730820703320205, - "learning_rate": 3.737920834262134e-06, - "loss": 0.0, - "reward": 0.0, - "reward_std": 0.0, - "rewards/correctness_reward_func": 0.0, - "rewards/int_reward_func": 0.0, - "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.0, - "rewards/xmlcount_reward_func": 0.0, - "step": 84 - }, - { - "completion_length": 149.125, - "epoch": 0.40669856459330145, - "grad_norm": 0.27528756856918335, - "kl": 0.001626197132281959, - "learning_rate": 3.7014543879667097e-06, - "loss": 0.0001, - "reward": 0.2956250011920929, - "reward_std": 0.8296367526054382, - "rewards/correctness_reward_func": 0.25, - "rewards/int_reward_func": 0.0625, - "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.0, - "rewards/xmlcount_reward_func": -0.016875002533197403, - "step": 85 - }, - { - "completion_length": 367.125, - "epoch": 0.41148325358851673, - "grad_norm": 0.2960017919540405, - "kl": 0.0007133784238249063, - "learning_rate": 3.6646524503974955e-06, - "loss": 0.0, - "reward": 0.078125, - "reward_std": 0.22097086906433105, - "rewards/correctness_reward_func": 0.0, - "rewards/int_reward_func": 0.0625, - "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.0, - "rewards/xmlcount_reward_func": 0.015625, - "step": 86 - }, - { - "completion_length": 448.125, - "epoch": 0.41626794258373206, - "grad_norm": 0.19843104481697083, - "kl": 0.0006177867180667818, - "learning_rate": 3.627525298040255e-06, - "loss": 0.0, - "reward": -0.00975000113248825, - "reward_std": 0.17306047677993774, - "rewards/correctness_reward_func": 0.0, - "rewards/int_reward_func": 0.0625, - "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.0, - "rewards/xmlcount_reward_func": -0.07225000113248825, - "step": 87 - }, - { - "completion_length": 340.25, - "epoch": 0.42105263157894735, - "grad_norm": 0.23097792267799377, - "kl": 0.004078476224094629, - "learning_rate": 3.5900832981929574e-06, - "loss": 0.0002, - "reward": -0.2122499942779541, - "reward_std": 0.6463525891304016, - "rewards/correctness_reward_func": 0.0, - "rewards/int_reward_func": 0.0, - "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.0, - "rewards/xmlcount_reward_func": -0.2122499942779541, - "step": 88 - }, - { - "completion_length": 473.0, - "epoch": 0.4258373205741627, - "grad_norm": 0.00352810719050467, - "kl": 0.0008594461833126843, - "learning_rate": 3.552336906070838e-06, - "loss": 0.0, - "reward": 0.0, - "reward_std": 0.0, - "rewards/correctness_reward_func": 0.0, - "rewards/int_reward_func": 0.0, - "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.0, - "rewards/xmlcount_reward_func": 0.0, - "step": 89 - }, - { - "completion_length": 221.0, - "epoch": 0.430622009569378, - "grad_norm": 0.2433868646621704, - "kl": 0.0008713887655176222, - "learning_rate": 3.5142966618869096e-06, - "loss": 0.0, - "reward": -0.1054999977350235, - "reward_std": 0.22372113168239594, - "rewards/correctness_reward_func": 0.0, - "rewards/int_reward_func": 0.0, - "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.0, - "rewards/xmlcount_reward_func": -0.1054999977350235, - "step": 90 - }, - { - "completion_length": 689.625, - "epoch": 0.4354066985645933, - "grad_norm": 0.19714897871017456, - "kl": 0.000722284079529345, - "learning_rate": 3.4759731879087373e-06, - "loss": 0.0, - "reward": -0.12437500059604645, - "reward_std": 0.580796480178833, - "rewards/correctness_reward_func": 0.0, - "rewards/int_reward_func": 0.0625, - "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.0, - "rewards/xmlcount_reward_func": -0.18687500059604645, - "step": 91 - }, - { - "completion_length": 382.0, - "epoch": 0.44019138755980863, - "grad_norm": 0.18885308504104614, - "kl": 0.0009461307781748474, - "learning_rate": 3.4373771854923032e-06, - "loss": 0.0, - "reward": 0.15937501192092896, - "reward_std": 0.9205639958381653, - "rewards/correctness_reward_func": 0.25, - "rewards/int_reward_func": 0.25, - "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.0, - "rewards/xmlcount_reward_func": -0.34062501788139343, - "step": 92 - }, - { - "completion_length": 507.25, - "epoch": 0.4449760765550239, - "grad_norm": 0.18828719854354858, - "kl": 0.0007705223397351801, - "learning_rate": 3.398519432093782e-06, - "loss": 0.0, - "reward": -0.08250000327825546, - "reward_std": 0.23836825788021088, - "rewards/correctness_reward_func": 0.0, - "rewards/int_reward_func": 0.0, - "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.0, - "rewards/xmlcount_reward_func": -0.08250000327825546, - "step": 93 - }, - { - "completion_length": 370.375, - "epoch": 0.44976076555023925, - "grad_norm": 0.24172991514205933, - "kl": 0.001011163112707436, - "learning_rate": 3.3594107782600754e-06, - "loss": 0.0, - "reward": -0.0040000006556510925, - "reward_std": 0.125724196434021, - "rewards/correctness_reward_func": 0.0, - "rewards/int_reward_func": 0.0, - "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.0, - "rewards/xmlcount_reward_func": -0.0040000006556510925, - "step": 94 - }, - { - "completion_length": 310.25, - "epoch": 0.45454545454545453, - "grad_norm": 0.19088450074195862, - "kl": 0.0008973844815045595, - "learning_rate": 3.3200621445989227e-06, - "loss": 0.0, - "reward": -0.04125000163912773, - "reward_std": 0.17858392000198364, - "rewards/correctness_reward_func": 0.0, - "rewards/int_reward_func": 0.0, - "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.0, - "rewards/xmlcount_reward_func": -0.04125000163912773, - "step": 95 - }, - { - "completion_length": 429.75, - "epoch": 0.45933014354066987, - "grad_norm": 0.15251575410366058, - "kl": 0.0008894134080037475, - "learning_rate": 3.2804845187294666e-06, - "loss": 0.0, - "reward": -0.5216249823570251, - "reward_std": 1.2643816471099854, - "rewards/correctness_reward_func": 0.0, - "rewards/int_reward_func": 0.0625, - "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.0, - "rewards/xmlcount_reward_func": -0.5841249823570251, - "step": 96 - }, - { - "completion_length": 608.75, - "epoch": 0.46411483253588515, - "grad_norm": 0.004114707466214895, - "kl": 0.001418918720446527, - "learning_rate": 3.2406889522140854e-06, - "loss": 0.0001, - "reward": 0.0, - "reward_std": 0.0, - "rewards/correctness_reward_func": 0.0, - "rewards/int_reward_func": 0.0, - "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.0, - "rewards/xmlcount_reward_func": 0.0, - "step": 97 - }, - { - "completion_length": 283.75, - "epoch": 0.4688995215311005, - "grad_norm": 0.19318333268165588, - "kl": 0.0011450113961473107, - "learning_rate": 3.2006865574723907e-06, - "loss": 0.0, - "reward": -0.14900000393390656, - "reward_std": 0.24991369247436523, - "rewards/correctness_reward_func": 0.0, - "rewards/int_reward_func": 0.0, - "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.0, - "rewards/xmlcount_reward_func": -0.14900000393390656, - "step": 98 - }, - { - "completion_length": 363.5, - "epoch": 0.47368421052631576, - "grad_norm": 0.2640697956085205, - "kl": 0.0014903987757861614, - "learning_rate": 3.1604885046782158e-06, - "loss": 0.0001, - "reward": -0.054999999701976776, - "reward_std": 0.15556350350379944, - "rewards/correctness_reward_func": 0.0, - "rewards/int_reward_func": 0.0625, - "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.0, - "rewards/xmlcount_reward_func": -0.11749999970197678, - "step": 99 - }, - { - "completion_length": 598.75, - "epoch": 0.4784688995215311, - "grad_norm": 0.0019880945328623056, - "kl": 0.0007876585004851222, - "learning_rate": 3.1201060186404836e-06, - "loss": 0.0, - "reward": 0.0, - "reward_std": 0.0, - "rewards/correctness_reward_func": 0.0, - "rewards/int_reward_func": 0.0, - "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.0, - "rewards/xmlcount_reward_func": 0.0, - "step": 100 - }, - { - "completion_length": 511.625, - "epoch": 0.48325358851674644, - "grad_norm": 0.1489640474319458, - "kl": 0.0011680320603772998, - "learning_rate": 3.0795503756688212e-06, - "loss": 0.0, - "reward": -0.08962500095367432, - "reward_std": 0.25349777936935425, - "rewards/correctness_reward_func": 0.0, - "rewards/int_reward_func": 0.0, - "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.0, - "rewards/xmlcount_reward_func": -0.08962500095367432, - "step": 101 - }, - { - "completion_length": 675.375, - "epoch": 0.4880382775119617, - "grad_norm": 0.13992537558078766, - "kl": 0.0005181314772926271, - "learning_rate": 3.038832900424784e-06, - "loss": 0.0, - "reward": -0.07262499630451202, - "reward_std": 0.20541450381278992, - "rewards/correctness_reward_func": 0.0, - "rewards/int_reward_func": 0.0625, - "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.0, - "rewards/xmlcount_reward_func": -0.13512499630451202, - "step": 102 - }, - { - "completion_length": 431.0, - "epoch": 0.49282296650717705, - "grad_norm": 0.1417127102613449, - "kl": 0.0010593538172543049, - "learning_rate": 2.9979649627595904e-06, - "loss": 0.0, - "reward": -0.048375003039836884, - "reward_std": 0.13682517409324646, - "rewards/correctness_reward_func": 0.0, - "rewards/int_reward_func": 0.0625, - "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.0, - "rewards/xmlcount_reward_func": -0.11087500303983688, - "step": 103 - }, - { - "completion_length": 301.0, - "epoch": 0.49760765550239233, - "grad_norm": 0.2090919315814972, - "kl": 0.001190081238746643, - "learning_rate": 2.9569579745392263e-06, - "loss": 0.0, - "reward": -0.09950000047683716, - "reward_std": 0.2814285159111023, - "rewards/correctness_reward_func": 0.0, - "rewards/int_reward_func": 0.0, - "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.0, - "rewards/xmlcount_reward_func": -0.09950000047683716, - "step": 104 - }, - { - "completion_length": 408.5, - "epoch": 0.5023923444976076, - "grad_norm": 0.3273250162601471, - "kl": 0.00099503668025136, - "learning_rate": 2.9158233864578256e-06, - "loss": 0.0, - "reward": 0.078125, - "reward_std": 0.22097086906433105, - "rewards/correctness_reward_func": 0.0, - "rewards/int_reward_func": 0.0625, - "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.0, - "rewards/xmlcount_reward_func": 0.015625, - "step": 105 - }, - { - "completion_length": 269.125, - "epoch": 0.507177033492823, - "grad_norm": 0.30214449763298035, - "kl": 0.0014681998873129487, - "learning_rate": 2.8745726848402037e-06, - "loss": 0.0001, - "reward": -0.06849999725818634, - "reward_std": 0.19374725222587585, - "rewards/correctness_reward_func": 0.0, - "rewards/int_reward_func": 0.0, - "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.0, - "rewards/xmlcount_reward_func": -0.06849999725818634, - "step": 106 - }, - { - "completion_length": 388.625, - "epoch": 0.5119617224880383, - "grad_norm": 0.0030519121792167425, - "kl": 0.0012719701044261456, - "learning_rate": 2.8332173884344477e-06, - "loss": 0.0001, - "reward": 0.0, - "reward_std": 0.0, - "rewards/correctness_reward_func": 0.0, - "rewards/int_reward_func": 0.0, - "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.0, - "rewards/xmlcount_reward_func": 0.0, - "step": 107 - }, - { - "completion_length": 480.125, - "epoch": 0.5167464114832536, - "grad_norm": 0.20971782505512238, - "kl": 0.0014634812250733376, - "learning_rate": 2.791769045195441e-06, - "loss": 0.0001, - "reward": -0.17350000143051147, - "reward_std": 0.35396167635917664, - "rewards/correctness_reward_func": 0.0, - "rewards/int_reward_func": 0.0, - "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.0, - "rewards/xmlcount_reward_func": -0.17350000143051147, - "step": 108 - }, - { - "completion_length": 289.625, - "epoch": 0.5215311004784688, - "grad_norm": 0.16469347476959229, - "kl": 0.001274844747968018, - "learning_rate": 2.7502392290602463e-06, - "loss": 0.0001, - "reward": -0.16437500715255737, - "reward_std": 0.4649227261543274, - "rewards/correctness_reward_func": 0.0, - "rewards/int_reward_func": 0.0625, - "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.0, - "rewards/xmlcount_reward_func": -0.22687500715255737, - "step": 109 - }, - { - "completion_length": 335.375, - "epoch": 0.5263157894736842, - "grad_norm": 0.1470983624458313, - "kl": 0.0009442528826184571, - "learning_rate": 2.708639536716225e-06, - "loss": 0.0, - "reward": -0.19949999451637268, - "reward_std": 0.5642712116241455, - "rewards/correctness_reward_func": 0.0, - "rewards/int_reward_func": 0.0, - "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.0, - "rewards/xmlcount_reward_func": -0.19949999451637268, - "step": 110 - }, - { - "completion_length": 202.0, - "epoch": 0.5311004784688995, - "grad_norm": 0.005657874513417482, - "kl": 0.0020560992415994406, - "learning_rate": 2.6669815843628043e-06, - "loss": 0.0001, - "reward": 0.0, - "reward_std": 0.0, - "rewards/correctness_reward_func": 0.0, - "rewards/int_reward_func": 0.0, - "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.0, - "rewards/xmlcount_reward_func": 0.0, - "step": 111 - }, - { - "completion_length": 411.875, - "epoch": 0.5358851674641149, - "grad_norm": 0.15740437805652618, - "kl": 0.0006687308778055012, - "learning_rate": 2.625277004467798e-06, - "loss": 0.0, - "reward": -0.304625004529953, - "reward_std": 0.5726122856140137, - "rewards/correctness_reward_func": 0.0, - "rewards/int_reward_func": 0.0, - "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.0, - "rewards/xmlcount_reward_func": -0.304625004529953, - "step": 112 - }, - { - "completion_length": 509.25, - "epoch": 0.5406698564593302, - "grad_norm": 0.0011531610507518053, - "kl": 0.00017873164324555546, - "learning_rate": 2.5835374425191867e-06, - "loss": 0.0, - "reward": 0.0, - "reward_std": 0.0, - "rewards/correctness_reward_func": 0.0, - "rewards/int_reward_func": 0.0, - "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.0, - "rewards/xmlcount_reward_func": 0.0, - "step": 113 - }, - { - "completion_length": 377.75, - "epoch": 0.5454545454545454, - "grad_norm": 0.0029194948729127645, - "kl": 0.0013420789036899805, - "learning_rate": 2.5417745537732524e-06, - "loss": 0.0001, - "reward": 0.0, - "reward_std": 0.0, - "rewards/correctness_reward_func": 0.0, - "rewards/int_reward_func": 0.0, - "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.0, - "rewards/xmlcount_reward_func": 0.0, - "step": 114 - }, - { - "completion_length": 281.25, - "epoch": 0.5502392344497608, - "grad_norm": 0.31120428442955017, - "kl": 0.0016515403985977173, - "learning_rate": 2.5e-06, - "loss": 0.0001, - "reward": 0.03125, - "reward_std": 0.0883883461356163, - "rewards/correctness_reward_func": 0.0, - "rewards/int_reward_func": 0.0, - "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.0, - "rewards/xmlcount_reward_func": 0.03125, - "step": 115 - }, - { - "completion_length": 418.875, - "epoch": 0.5550239234449761, - "grad_norm": 0.001880040392279625, - "kl": 0.0005604763864539564, - "learning_rate": 2.4582254462267476e-06, - "loss": 0.0, - "reward": 0.0, - "reward_std": 0.0, - "rewards/correctness_reward_func": 0.0, - "rewards/int_reward_func": 0.0, - "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.0, - "rewards/xmlcount_reward_func": 0.0, - "step": 116 - }, - { - "completion_length": 553.5, - "epoch": 0.5598086124401914, - "grad_norm": 0.2177758365869522, - "kl": 0.0008635988924652338, - "learning_rate": 2.4164625574808145e-06, - "loss": 0.0, - "reward": 0.06274999678134918, - "reward_std": 0.11619041860103607, - "rewards/correctness_reward_func": 0.0, - "rewards/int_reward_func": 0.0, - "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.0, - "rewards/xmlcount_reward_func": 0.06274999678134918, - "step": 117 - }, - { - "completion_length": 341.0, - "epoch": 0.5645933014354066, - "grad_norm": 0.19450907409191132, - "kl": 0.0016808846266940236, - "learning_rate": 2.3747229955322022e-06, - "loss": 0.0001, - "reward": -0.08187499642372131, - "reward_std": 0.23157745599746704, - "rewards/correctness_reward_func": 0.0, - "rewards/int_reward_func": 0.0, - "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.0, - "rewards/xmlcount_reward_func": -0.08187499642372131, - "step": 118 - }, - { - "completion_length": 649.75, - "epoch": 0.569377990430622, - "grad_norm": 0.34568649530410767, - "kl": 0.0005209344089962542, - "learning_rate": 2.333018415637196e-06, - "loss": 0.0, - "reward": -0.03175000101327896, - "reward_std": 0.0898025631904602, - "rewards/correctness_reward_func": 0.0, - "rewards/int_reward_func": 0.0, - "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.0, - "rewards/xmlcount_reward_func": -0.03175000101327896, - "step": 119 - }, - { - "completion_length": 809.75, - "epoch": 0.5741626794258373, - "grad_norm": 0.0016891593113541603, - "kl": 0.000417547911638394, - "learning_rate": 2.291360463283776e-06, - "loss": 0.0, - "reward": 0.0, - "reward_std": 0.0, - "rewards/correctness_reward_func": 0.0, - "rewards/int_reward_func": 0.0, - "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.0, - "rewards/xmlcount_reward_func": 0.0, - "step": 120 - }, - { - "completion_length": 245.0, - "epoch": 0.5789473684210527, - "grad_norm": 0.005028568208217621, - "kl": 0.002079009311273694, - "learning_rate": 2.249760770939754e-06, - "loss": 0.0001, - "reward": 0.0, - "reward_std": 0.0, - "rewards/correctness_reward_func": 0.0, - "rewards/int_reward_func": 0.0, - "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.0, - "rewards/xmlcount_reward_func": 0.0, - "step": 121 - }, - { - "completion_length": 296.125, - "epoch": 0.583732057416268, - "grad_norm": 0.19734592735767365, - "kl": 0.0013405050849542022, - "learning_rate": 2.2082309548045595e-06, - "loss": 0.0001, - "reward": 0.015625, - "reward_std": 0.04419417306780815, - "rewards/correctness_reward_func": 0.0, - "rewards/int_reward_func": 0.0, - "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.0, - "rewards/xmlcount_reward_func": 0.015625, - "step": 122 - }, - { - "completion_length": 462.375, - "epoch": 0.5885167464114832, - "grad_norm": 0.174298956990242, - "kl": 0.0012827491154894233, - "learning_rate": 2.1667826115655536e-06, - "loss": 0.0001, - "reward": -0.3242499828338623, - "reward_std": 0.756491482257843, - "rewards/correctness_reward_func": 0.0, - "rewards/int_reward_func": 0.0, - "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.0, - "rewards/xmlcount_reward_func": -0.3242499828338623, - "step": 123 - }, - { - "completion_length": 187.375, - "epoch": 0.5933014354066986, - "grad_norm": 0.3845757246017456, - "kl": 0.0024415762163698673, - "learning_rate": 2.1254273151597967e-06, - "loss": 0.0001, - "reward": -0.027499999850988388, - "reward_std": 0.07778175175189972, - "rewards/correctness_reward_func": 0.0, - "rewards/int_reward_func": 0.0, - "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.0, - "rewards/xmlcount_reward_func": -0.027499999850988388, - "step": 124 - }, - { - "completion_length": 473.0, - "epoch": 0.5980861244019139, - "grad_norm": 0.19211901724338531, - "kl": 0.0008851208258420229, - "learning_rate": 2.0841766135421753e-06, - "loss": 0.0, - "reward": 0.005125001072883606, - "reward_std": 0.01449569221585989, - "rewards/correctness_reward_func": 0.0, - "rewards/int_reward_func": 0.0625, - "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.0, - "rewards/xmlcount_reward_func": -0.057374998927116394, - "step": 125 - }, - { - "completion_length": 315.875, - "epoch": 0.6028708133971292, - "grad_norm": 0.2990177869796753, - "kl": 0.001107478397898376, - "learning_rate": 2.043042025460775e-06, - "loss": 0.0, - "reward": -0.002374999923631549, - "reward_std": 0.006717514246702194, - "rewards/correctness_reward_func": 0.0, - "rewards/int_reward_func": 0.0, - "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.0, - "rewards/xmlcount_reward_func": -0.002374999923631549, - "step": 126 - }, - { - "completion_length": 198.375, - "epoch": 0.6076555023923444, - "grad_norm": 0.0063351416029036045, - "kl": 0.0022553387098014355, - "learning_rate": 2.0020350372404104e-06, - "loss": 0.0001, - "reward": 0.0, - "reward_std": 0.0, - "rewards/correctness_reward_func": 0.0, - "rewards/int_reward_func": 0.0, - "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.0, - "rewards/xmlcount_reward_func": 0.0, - "step": 127 - }, - { - "completion_length": 479.625, - "epoch": 0.6124401913875598, - "grad_norm": 0.002753006760030985, - "kl": 0.0011882447870448232, - "learning_rate": 1.9611670995752164e-06, - "loss": 0.0, - "reward": 0.0, - "reward_std": 0.0, - "rewards/correctness_reward_func": 0.0, - "rewards/int_reward_func": 0.0, - "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.0, - "rewards/xmlcount_reward_func": 0.0, - "step": 128 - }, - { - "completion_length": 809.875, - "epoch": 0.6172248803827751, - "grad_norm": 0.0011510710464790463, - "kl": 0.0003804916050285101, - "learning_rate": 1.920449624331179e-06, - "loss": 0.0, - "reward": 0.0, - "reward_std": 0.0, - "rewards/correctness_reward_func": 0.0, - "rewards/int_reward_func": 0.0, - "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.0, - "rewards/xmlcount_reward_func": 0.0, - "step": 129 - }, - { - "completion_length": 343.5, - "epoch": 0.6220095693779905, - "grad_norm": 0.0033244211226701736, - "kl": 0.0008740820921957493, - "learning_rate": 1.8798939813595169e-06, - "loss": 0.0, - "reward": 0.0, - "reward_std": 0.0, - "rewards/correctness_reward_func": 0.0, - "rewards/int_reward_func": 0.0, - "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.0, - "rewards/xmlcount_reward_func": 0.0, - "step": 130 - }, - { - "completion_length": 400.75, - "epoch": 0.6267942583732058, - "grad_norm": 0.2215835005044937, - "kl": 0.0016903302166610956, - "learning_rate": 1.8395114953217853e-06, - "loss": 0.0001, - "reward": -0.04649999737739563, - "reward_std": 0.37665554881095886, - "rewards/correctness_reward_func": 0.0, - "rewards/int_reward_func": 0.0625, - "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.0, - "rewards/xmlcount_reward_func": -0.10899999737739563, - "step": 131 - }, - { - "completion_length": 225.25, - "epoch": 0.631578947368421, - "grad_norm": 0.003872235771268606, - "kl": 0.001767667941749096, - "learning_rate": 1.7993134425276095e-06, - "loss": 0.0001, - "reward": 0.0, - "reward_std": 0.0, - "rewards/correctness_reward_func": 0.0, - "rewards/int_reward_func": 0.0, - "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.0, - "rewards/xmlcount_reward_func": 0.0, - "step": 132 - }, - { - "completion_length": 307.25, - "epoch": 0.6363636363636364, - "grad_norm": 0.25055792927742004, - "kl": 0.004441059194505215, - "learning_rate": 1.7593110477859155e-06, - "loss": 0.0002, - "reward": -0.2758750021457672, - "reward_std": 0.7802923321723938, - "rewards/correctness_reward_func": 0.0, - "rewards/int_reward_func": 0.0, - "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.0, - "rewards/xmlcount_reward_func": -0.2758750021457672, - "step": 133 - }, - { - "completion_length": 272.125, - "epoch": 0.6411483253588517, - "grad_norm": 0.002152123022824526, - "kl": 0.000766753451898694, - "learning_rate": 1.7195154812705344e-06, - "loss": 0.0, - "reward": 0.0, - "reward_std": 0.0, - "rewards/correctness_reward_func": 0.0, - "rewards/int_reward_func": 0.0, - "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.0, - "rewards/xmlcount_reward_func": 0.0, - "step": 134 - }, - { - "completion_length": 305.125, - "epoch": 0.645933014354067, - "grad_norm": 0.002228331984952092, - "kl": 0.0006539694149978459, - "learning_rate": 1.6799378554010773e-06, - "loss": 0.0, - "reward": 0.0, - "reward_std": 0.0, - "rewards/correctness_reward_func": 0.0, - "rewards/int_reward_func": 0.0, - "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.0, - "rewards/xmlcount_reward_func": 0.0, - "step": 135 - }, - { - "completion_length": 638.25, - "epoch": 0.6507177033492823, - "grad_norm": 0.1335768848657608, - "kl": 0.0007467590039595962, - "learning_rate": 1.640589221739926e-06, - "loss": 0.0, - "reward": -0.5721249580383301, - "reward_std": 1.767741084098816, - "rewards/correctness_reward_func": 0.0, - "rewards/int_reward_func": 0.0, - "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.0, - "rewards/xmlcount_reward_func": -0.5721249580383301, - "step": 136 - }, - { - "completion_length": 534.625, - "epoch": 0.6555023923444976, - "grad_norm": 0.16583861410617828, - "kl": 0.0008073956123553216, - "learning_rate": 1.6014805679062185e-06, - "loss": 0.0, - "reward": -0.051375001668930054, - "reward_std": 0.1453104466199875, - "rewards/correctness_reward_func": 0.0, - "rewards/int_reward_func": 0.0625, - "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.0, - "rewards/xmlcount_reward_func": -0.11387500166893005, - "step": 137 - }, - { - "completion_length": 329.625, - "epoch": 0.6602870813397129, - "grad_norm": 0.0035209464840590954, - "kl": 0.0013154707849025726, - "learning_rate": 1.5626228145076976e-06, - "loss": 0.0001, - "reward": 0.0, - "reward_std": 0.0, - "rewards/correctness_reward_func": 0.0, - "rewards/int_reward_func": 0.0, - "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.0, - "rewards/xmlcount_reward_func": 0.0, - "step": 138 - }, - { - "completion_length": 558.125, - "epoch": 0.6650717703349283, - "grad_norm": 0.1203470379114151, - "kl": 0.0009491723612882197, - "learning_rate": 1.5240268120912631e-06, - "loss": 0.0, - "reward": 0.015625, - "reward_std": 0.04419417306780815, - "rewards/correctness_reward_func": 0.0, - "rewards/int_reward_func": 0.0, - "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.0, - "rewards/xmlcount_reward_func": 0.015625, - "step": 139 - }, - { - "completion_length": 329.75, - "epoch": 0.6698564593301436, - "grad_norm": 0.24603353440761566, - "kl": 0.0025463057681918144, - "learning_rate": 1.4857033381130912e-06, - "loss": 0.0001, - "reward": -0.11762499809265137, - "reward_std": 0.3326937258243561, - "rewards/correctness_reward_func": 0.0, - "rewards/int_reward_func": 0.0, - "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.0, - "rewards/xmlcount_reward_func": -0.11762499809265137, - "step": 140 - }, - { - "completion_length": 250.75, - "epoch": 0.6746411483253588, - "grad_norm": 0.1753588765859604, - "kl": 0.002383821876719594, - "learning_rate": 1.4476630939291631e-06, - "loss": 0.0001, - "reward": 0.015625, - "reward_std": 0.04419417306780815, - "rewards/correctness_reward_func": 0.0, - "rewards/int_reward_func": 0.0, - "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.0, - "rewards/xmlcount_reward_func": 0.015625, - "step": 141 - }, - { - "completion_length": 489.0, - "epoch": 0.6794258373205742, - "grad_norm": 0.002371679525822401, - "kl": 0.0010258345864713192, - "learning_rate": 1.4099167018070436e-06, - "loss": 0.0, - "reward": 0.0, - "reward_std": 0.0, - "rewards/correctness_reward_func": 0.0, - "rewards/int_reward_func": 0.0, - "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.0, - "rewards/xmlcount_reward_func": 0.0, - "step": 142 - }, - { - "completion_length": 358.875, - "epoch": 0.6842105263157895, - "grad_norm": 0.19335803389549255, - "kl": 0.0008560760761611164, - "learning_rate": 1.372474701959745e-06, - "loss": 0.0, - "reward": -0.07362499833106995, - "reward_std": 0.20824295282363892, - "rewards/correctness_reward_func": 0.0, - "rewards/int_reward_func": 0.0, - "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.0, - "rewards/xmlcount_reward_func": -0.07362499833106995, - "step": 143 - }, - { - "completion_length": 589.125, - "epoch": 0.6889952153110048, - "grad_norm": 0.2443452626466751, - "kl": 0.0007797165890224278, - "learning_rate": 1.3353475496025049e-06, - "loss": 0.0, - "reward": -0.0572500005364418, - "reward_std": 0.43959587812423706, - "rewards/correctness_reward_func": 0.0, - "rewards/int_reward_func": 0.0, - "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.0, - "rewards/xmlcount_reward_func": -0.0572500005364418, - "step": 144 - }, - { - "completion_length": 421.625, - "epoch": 0.69377990430622, - "grad_norm": 0.004174467176198959, - "kl": 0.0017937483498826623, - "learning_rate": 1.2985456120332907e-06, - "loss": 0.0001, - "reward": 0.0, - "reward_std": 0.0, - "rewards/correctness_reward_func": 0.0, - "rewards/int_reward_func": 0.0, - "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.0, - "rewards/xmlcount_reward_func": 0.0, - "step": 145 - }, - { - "completion_length": 374.125, - "epoch": 0.6985645933014354, - "grad_norm": 0.0018811938352882862, - "kl": 0.0008153574890457094, - "learning_rate": 1.2620791657378664e-06, - "loss": 0.0, - "reward": 0.0, - "reward_std": 0.0, - "rewards/correctness_reward_func": 0.0, - "rewards/int_reward_func": 0.0, - "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.0, - "rewards/xmlcount_reward_func": 0.0, - "step": 146 - }, - { - "completion_length": 361.125, - "epoch": 0.7033492822966507, - "grad_norm": 0.003773119766265154, - "kl": 0.001508379471488297, - "learning_rate": 1.2259583935202063e-06, - "loss": 0.0001, - "reward": 0.0, - "reward_std": 0.0, - "rewards/correctness_reward_func": 0.0, - "rewards/int_reward_func": 0.0, - "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.0, - "rewards/xmlcount_reward_func": 0.0, - "step": 147 - }, - { - "completion_length": 496.625, - "epoch": 0.7081339712918661, - "grad_norm": 0.15050680935382843, - "kl": 0.0005133798695169389, - "learning_rate": 1.1901933816590787e-06, - "loss": 0.0, - "reward": -0.05524999648332596, - "reward_std": 0.1562705934047699, - "rewards/correctness_reward_func": 0.0, - "rewards/int_reward_func": 0.0625, - "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.0, - "rewards/xmlcount_reward_func": -0.11774999648332596, - "step": 148 - }, - { - "completion_length": 264.875, - "epoch": 0.7129186602870813, - "grad_norm": 0.02200624905526638, - "kl": 0.0019598701037466526, - "learning_rate": 1.1547941170915686e-06, - "loss": 0.0001, - "reward": 0.0, - "reward_std": 0.0, - "rewards/correctness_reward_func": 0.0, - "rewards/int_reward_func": 0.0, - "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.0, - "rewards/xmlcount_reward_func": 0.0, - "step": 149 - }, - { - "completion_length": 273.0, - "epoch": 0.7177033492822966, - "grad_norm": 0.005137371364980936, - "kl": 0.0018933727405965328, - "learning_rate": 1.1197704846243587e-06, - "loss": 0.0001, - "reward": 0.0, - "reward_std": 0.0, - "rewards/correctness_reward_func": 0.0, - "rewards/int_reward_func": 0.0, - "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.0, - "rewards/xmlcount_reward_func": 0.0, - "step": 150 - }, - { - "completion_length": 313.0, - "epoch": 0.722488038277512, - "grad_norm": 0.0021030320785939693, - "kl": 0.0007236721576191485, - "learning_rate": 1.0851322641735119e-06, - "loss": 0.0, - "reward": 0.0, - "reward_std": 0.0, - "rewards/correctness_reward_func": 0.0, - "rewards/int_reward_func": 0.0, - "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.0, - "rewards/xmlcount_reward_func": 0.0, - "step": 151 - }, - { - "completion_length": 430.625, - "epoch": 0.7272727272727273, - "grad_norm": 0.2271677553653717, - "kl": 0.0013897118624299765, - "learning_rate": 1.0508891280335562e-06, - "loss": 0.0001, - "reward": 0.01575000025331974, - "reward_std": 0.044547729194164276, - "rewards/correctness_reward_func": 0.0, - "rewards/int_reward_func": 0.0, - "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.0, - "rewards/xmlcount_reward_func": 0.01575000025331974, - "step": 152 - }, - { - "completion_length": 260.375, - "epoch": 0.7320574162679426, - "grad_norm": 0.009021527133882046, - "kl": 0.004078378435224295, - "learning_rate": 1.0170506381766121e-06, - "loss": 0.0002, - "reward": 0.0, - "reward_std": 0.0, - "rewards/correctness_reward_func": 0.0, - "rewards/int_reward_func": 0.0, - "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.0, - "rewards/xmlcount_reward_func": 0.0, - "step": 153 - }, - { - "completion_length": 236.375, - "epoch": 0.7368421052631579, - "grad_norm": 0.20537212491035461, - "kl": 0.0018334795022383332, - "learning_rate": 9.836262435823316e-07, - "loss": 0.0001, - "reward": -0.03487499803304672, - "reward_std": 0.2179812788963318, - "rewards/correctness_reward_func": 0.0, - "rewards/int_reward_func": 0.0, - "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.0, - "rewards/xmlcount_reward_func": -0.03487499803304672, - "step": 154 - }, - { - "completion_length": 328.0, - "epoch": 0.7416267942583732, - "grad_norm": 0.16007816791534424, - "kl": 0.001545554492622614, - "learning_rate": 9.506252775993882e-07, - "loss": 0.0001, - "reward": 0.25687500834465027, - "reward_std": 0.7265522480010986, - "rewards/correctness_reward_func": 0.25, - "rewards/int_reward_func": 0.0625, - "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.0, - "rewards/xmlcount_reward_func": -0.05562499910593033, - "step": 155 - }, - { - "completion_length": 387.25, - "epoch": 0.7464114832535885, - "grad_norm": 0.0015982083277776837, - "kl": 0.00045389338629320264, - "learning_rate": 9.180569553392535e-07, - "loss": 0.0, - "reward": 0.0, - "reward_std": 0.0, - "rewards/correctness_reward_func": 0.0, - "rewards/int_reward_func": 0.0, - "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.0, - "rewards/xmlcount_reward_func": 0.0, - "step": 156 - }, - { - "completion_length": 654.25, - "epoch": 0.7511961722488039, - "grad_norm": 0.4694746732711792, - "kl": 0.0017813507001847029, - "learning_rate": 8.85930371102994e-07, - "loss": 0.0001, - "reward": 0.032749999314546585, - "reward_std": 0.09263098239898682, - "rewards/correctness_reward_func": 0.0, - "rewards/int_reward_func": 0.0625, - "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.0, - "rewards/xmlcount_reward_func": -0.029750000685453415, - "step": 157 - }, - { - "completion_length": 179.75, - "epoch": 0.7559808612440191, - "grad_norm": 0.32413187623023987, - "kl": 0.0018096218118444085, - "learning_rate": 8.542544958417962e-07, - "loss": 0.0001, - "reward": 0.07512500137090683, - "reward_std": 0.23334550857543945, - "rewards/correctness_reward_func": 0.0, - "rewards/int_reward_func": 0.0625, - "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.0, - "rewards/xmlcount_reward_func": 0.01262499950826168, - "step": 158 - }, - { - "completion_length": 395.875, - "epoch": 0.7607655502392344, - "grad_norm": 0.18941275775432587, - "kl": 0.001029332634061575, - "learning_rate": 8.23038174651942e-07, - "loss": 0.0, - "reward": 0.015625, - "reward_std": 0.04419417306780815, - "rewards/correctness_reward_func": 0.0, - "rewards/int_reward_func": 0.0, - "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.0, - "rewards/xmlcount_reward_func": 0.015625, - "step": 159 - }, - { - "completion_length": 241.375, - "epoch": 0.7655502392344498, - "grad_norm": 0.001631981460377574, - "kl": 0.0005422905087471008, - "learning_rate": 7.922901243049231e-07, - "loss": 0.0, - "reward": 0.0, - "reward_std": 0.0, - "rewards/correctness_reward_func": 0.0, - "rewards/int_reward_func": 0.0, - "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.0, - "rewards/xmlcount_reward_func": 0.0, - "step": 160 - }, - { - "completion_length": 402.5, - "epoch": 0.7703349282296651, - "grad_norm": 0.002342314226552844, - "kl": 0.0009872502414509654, - "learning_rate": 7.620189308133943e-07, - "loss": 0.0, - "reward": 0.0, - "reward_std": 0.0, - "rewards/correctness_reward_func": 0.0, - "rewards/int_reward_func": 0.0, - "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.0, - "rewards/xmlcount_reward_func": 0.0, - "step": 161 - }, - { - "completion_length": 322.75, - "epoch": 0.7751196172248804, - "grad_norm": 0.20069563388824463, - "kl": 0.001466510584577918, - "learning_rate": 7.322330470336314e-07, - "loss": 0.0001, - "reward": -0.0651250034570694, - "reward_std": 0.18420132994651794, - "rewards/correctness_reward_func": 0.0, - "rewards/int_reward_func": 0.0, - "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.0, - "rewards/xmlcount_reward_func": -0.0651250034570694, - "step": 162 - }, - { - "completion_length": 604.0, - "epoch": 0.7799043062200957, - "grad_norm": 0.0010124208638444543, - "kl": 0.0003470084920991212, - "learning_rate": 7.029407903051771e-07, - "loss": 0.0, - "reward": 0.0, - "reward_std": 0.0, - "rewards/correctness_reward_func": 0.0, - "rewards/int_reward_func": 0.0, - "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.0, - "rewards/xmlcount_reward_func": 0.0, - "step": 163 - }, - { - "completion_length": 205.625, - "epoch": 0.784688995215311, - "grad_norm": 0.0032879177015274763, - "kl": 0.0016050429549068213, - "learning_rate": 6.741503401283273e-07, - "loss": 0.0001, - "reward": 0.0, - "reward_std": 0.0, - "rewards/correctness_reward_func": 0.0, - "rewards/int_reward_func": 0.0, - "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.0, - "rewards/xmlcount_reward_func": 0.0, - "step": 164 - }, - { - "completion_length": 524.25, - "epoch": 0.7894736842105263, - "grad_norm": 0.27872103452682495, - "kl": 0.0011987939942628145, - "learning_rate": 6.458697358801061e-07, - "loss": 0.0, - "reward": -0.008124999701976776, - "reward_std": 0.08552099019289017, - "rewards/correctness_reward_func": 0.0, - "rewards/int_reward_func": 0.0, - "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.0, - "rewards/xmlcount_reward_func": -0.008124999701976776, - "step": 165 - }, - { - "completion_length": 480.5, - "epoch": 0.7942583732057417, - "grad_norm": 0.18480350077152252, - "kl": 0.00131966732442379, - "learning_rate": 6.181068745693716e-07, - "loss": 0.0001, - "reward": -0.1316249966621399, - "reward_std": 0.3722917139530182, - "rewards/correctness_reward_func": 0.0, - "rewards/int_reward_func": 0.0, - "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.0, - "rewards/xmlcount_reward_func": -0.1316249966621399, - "step": 166 - }, - { - "completion_length": 528.0, - "epoch": 0.7990430622009569, - "grad_norm": 0.0013328964123502374, - "kl": 0.0005824090330861509, - "learning_rate": 5.908695086316701e-07, - "loss": 0.0, - "reward": 0.0, - "reward_std": 0.0, - "rewards/correctness_reward_func": 0.0, - "rewards/int_reward_func": 0.0, - "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.0, - "rewards/xmlcount_reward_func": 0.0, - "step": 167 - }, - { - "completion_length": 769.25, - "epoch": 0.8038277511961722, - "grad_norm": 0.0013855216093361378, - "kl": 0.0005960515700280666, - "learning_rate": 5.641652437644668e-07, - "loss": 0.0, - "reward": 0.0, - "reward_std": 0.0, - "rewards/correctness_reward_func": 0.0, - "rewards/int_reward_func": 0.0, - "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.0, - "rewards/xmlcount_reward_func": 0.0, - "step": 168 - }, - { - "completion_length": 271.75, - "epoch": 0.8086124401913876, - "grad_norm": 0.0021942348685115576, - "kl": 0.0005290519329719245, - "learning_rate": 5.380015368033476e-07, - "loss": 0.0, - "reward": 0.0, - "reward_std": 0.0, - "rewards/correctness_reward_func": 0.0, - "rewards/int_reward_func": 0.0, - "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.0, - "rewards/xmlcount_reward_func": 0.0, - "step": 169 - }, - { - "completion_length": 293.875, - "epoch": 0.8133971291866029, - "grad_norm": 0.1995118409395218, - "kl": 0.0019058845937252045, - "learning_rate": 5.123856936397925e-07, - "loss": 0.0001, - "reward": -0.07100000232458115, - "reward_std": 0.20081833004951477, - "rewards/correctness_reward_func": 0.0, - "rewards/int_reward_func": 0.0, - "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.0, - "rewards/xmlcount_reward_func": -0.07100000232458115, - "step": 170 - }, - { - "completion_length": 338.75, - "epoch": 0.8181818181818182, - "grad_norm": 0.2031295895576477, - "kl": 0.002219364745542407, - "learning_rate": 4.873248671810929e-07, - "loss": 0.0001, - "reward": -0.33812499046325684, - "reward_std": 0.6984007358551025, - "rewards/correctness_reward_func": 0.0, - "rewards/int_reward_func": 0.0, - "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.0, - "rewards/xmlcount_reward_func": -0.33812499046325684, - "step": 171 - }, - { - "completion_length": 338.125, - "epoch": 0.8229665071770335, - "grad_norm": 0.001654602587223053, - "kl": 0.0005403622635640204, - "learning_rate": 4.628260553529917e-07, - "loss": 0.0, - "reward": 0.0, - "reward_std": 0.0, - "rewards/correctness_reward_func": 0.0, - "rewards/int_reward_func": 0.0, - "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.0, - "rewards/xmlcount_reward_func": 0.0, - "step": 172 - }, - { - "completion_length": 541.5, - "epoch": 0.8277511961722488, - "grad_norm": 0.0025884828064590693, - "kl": 0.0010407287627458572, - "learning_rate": 4.388960991455998e-07, - "loss": 0.0, - "reward": 0.0, - "reward_std": 0.0, - "rewards/correctness_reward_func": 0.0, - "rewards/int_reward_func": 0.0, - "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.0, - "rewards/xmlcount_reward_func": 0.0, - "step": 173 - }, - { - "completion_length": 316.5, - "epoch": 0.8325358851674641, - "grad_norm": 0.004390951711684465, - "kl": 0.0020219332072883844, - "learning_rate": 4.155416807031326e-07, - "loss": 0.0001, - "reward": 0.0, - "reward_std": 0.0, - "rewards/correctness_reward_func": 0.0, - "rewards/int_reward_func": 0.0, - "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.0, - "rewards/xmlcount_reward_func": 0.0, - "step": 174 - }, - { - "completion_length": 262.875, - "epoch": 0.8373205741626795, - "grad_norm": 0.24399395287036896, - "kl": 0.0021297840867191553, - "learning_rate": 3.927693214580075e-07, - "loss": 0.0001, - "reward": -0.04699999839067459, - "reward_std": 0.13293607532978058, - "rewards/correctness_reward_func": 0.0, - "rewards/int_reward_func": 0.0625, - "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.0, - "rewards/xmlcount_reward_func": -0.10949999839067459, - "step": 175 - }, - { - "completion_length": 435.75, - "epoch": 0.8421052631578947, - "grad_norm": 0.001452272874303162, - "kl": 0.00046882437891326845, - "learning_rate": 3.7058538030980946e-07, - "loss": 0.0, - "reward": 0.0, - "reward_std": 0.0, - "rewards/correctness_reward_func": 0.0, - "rewards/int_reward_func": 0.0, - "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.0, - "rewards/xmlcount_reward_func": 0.0, - "step": 176 - }, - { - "completion_length": 544.25, - "epoch": 0.84688995215311, - "grad_norm": 0.12825724482536316, - "kl": 0.0010842983610928059, - "learning_rate": 3.489960518496521e-07, - "loss": 0.0, - "reward": -0.09724999964237213, - "reward_std": 0.2750645577907562, - "rewards/correctness_reward_func": 0.0, - "rewards/int_reward_func": 0.0, - "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.0, - "rewards/xmlcount_reward_func": -0.09724999964237213, - "step": 177 - }, - { - "completion_length": 486.625, - "epoch": 0.8516746411483254, - "grad_norm": 0.0027086371555924416, - "kl": 0.0011139459675177932, - "learning_rate": 3.2800736463040883e-07, - "loss": 0.0, - "reward": 0.0, - "reward_std": 0.0, - "rewards/correctness_reward_func": 0.0, - "rewards/int_reward_func": 0.0, - "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.0, - "rewards/xmlcount_reward_func": 0.0, - "step": 178 - }, - { - "completion_length": 496.875, - "epoch": 0.8564593301435407, - "grad_norm": 0.00243828515522182, - "kl": 0.0009955157293006778, - "learning_rate": 3.076251794833213e-07, - "loss": 0.0, - "reward": 0.0, - "reward_std": 0.0, - "rewards/correctness_reward_func": 0.0, - "rewards/int_reward_func": 0.0, - "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.0, - "rewards/xmlcount_reward_func": 0.0, - "step": 179 - }, - { - "completion_length": 370.625, - "epoch": 0.861244019138756, - "grad_norm": 0.002080935752019286, - "kl": 0.0007676138775423169, - "learning_rate": 2.878551878814287e-07, - "loss": 0.0, - "reward": 0.0, - "reward_std": 0.0, - "rewards/correctness_reward_func": 0.0, - "rewards/int_reward_func": 0.0, - "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.0, - "rewards/xmlcount_reward_func": 0.0, - "step": 180 - }, - { - "completion_length": 591.875, - "epoch": 0.8660287081339713, - "grad_norm": 0.20895244181156158, - "kl": 0.001007277867756784, - "learning_rate": 2.6870291035029724e-07, - "loss": 0.0, - "reward": 0.015625, - "reward_std": 0.04419417306780815, - "rewards/correctness_reward_func": 0.0, - "rewards/int_reward_func": 0.0, - "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.0, - "rewards/xmlcount_reward_func": 0.015625, - "step": 181 - }, - { - "completion_length": 625.125, - "epoch": 0.8708133971291866, - "grad_norm": 0.0017763936193659902, - "kl": 0.0007598320953547955, - "learning_rate": 2.501736949264805e-07, - "loss": 0.0, - "reward": 0.0, - "reward_std": 0.0, - "rewards/correctness_reward_func": 0.0, - "rewards/int_reward_func": 0.0, - "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.0, - "rewards/xmlcount_reward_func": 0.0, - "step": 182 - }, - { - "completion_length": 424.5, - "epoch": 0.8755980861244019, - "grad_norm": 0.301670640707016, - "kl": 0.0022517747711390257, - "learning_rate": 2.3227271566414827e-07, - "loss": 0.0001, - "reward": -0.032624997198581696, - "reward_std": 0.1498236507177353, - "rewards/correctness_reward_func": 0.0, - "rewards/int_reward_func": 0.0, - "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.0, - "rewards/xmlcount_reward_func": -0.032624997198581696, - "step": 183 - }, - { - "completion_length": 442.375, - "epoch": 0.8803827751196173, - "grad_norm": 0.0037657152861356735, - "kl": 0.0014969324693083763, - "learning_rate": 2.1500497119029324e-07, - "loss": 0.0001, - "reward": 0.0, - "reward_std": 0.0, - "rewards/correctness_reward_func": 0.0, - "rewards/int_reward_func": 0.0, - "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.0, - "rewards/xmlcount_reward_func": 0.0, - "step": 184 - }, - { - "completion_length": 652.25, - "epoch": 0.8851674641148325, - "grad_norm": 0.19338779151439667, - "kl": 0.0006012283847667277, - "learning_rate": 1.9837528330892781e-07, - "loss": 0.0, - "reward": -0.07612500339746475, - "reward_std": 0.21531401574611664, - "rewards/correctness_reward_func": 0.0, - "rewards/int_reward_func": 0.0, - "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.0, - "rewards/xmlcount_reward_func": -0.07612500339746475, - "step": 185 - }, - { - "completion_length": 441.375, - "epoch": 0.8899521531100478, - "grad_norm": 0.004363907501101494, - "kl": 0.0016148254508152604, - "learning_rate": 1.823882956546566e-07, - "loss": 0.0001, - "reward": 0.0, - "reward_std": 0.0, - "rewards/correctness_reward_func": 0.0, - "rewards/int_reward_func": 0.0, - "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.0, - "rewards/xmlcount_reward_func": 0.0, - "step": 186 - }, - { - "completion_length": 389.25, - "epoch": 0.8947368421052632, - "grad_norm": 0.14348091185092926, - "kl": 0.0007823936175554991, - "learning_rate": 1.6704847239599364e-07, - "loss": 0.0, - "reward": -0.21924999356269836, - "reward_std": 0.6201326251029968, - "rewards/correctness_reward_func": 0.0, - "rewards/int_reward_func": 0.0, - "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.0, - "rewards/xmlcount_reward_func": -0.21924999356269836, - "step": 187 - }, - { - "completion_length": 308.125, - "epoch": 0.8995215311004785, - "grad_norm": 0.14146213233470917, - "kl": 0.0014763720100745559, - "learning_rate": 1.5236009698880532e-07, - "loss": 0.0001, - "reward": -0.025499999523162842, - "reward_std": 0.16306614875793457, - "rewards/correctness_reward_func": 0.0, - "rewards/int_reward_func": 0.0, - "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.0, - "rewards/xmlcount_reward_func": -0.025499999523162842, - "step": 188 - }, - { - "completion_length": 191.0, - "epoch": 0.9043062200956937, - "grad_norm": 0.005878888536244631, - "kl": 0.0025621799286454916, - "learning_rate": 1.3832727098020333e-07, - "loss": 0.0001, - "reward": 0.0, - "reward_std": 0.0, - "rewards/correctness_reward_func": 0.0, - "rewards/int_reward_func": 0.0, - "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.0, - "rewards/xmlcount_reward_func": 0.0, - "step": 189 - }, - { - "completion_length": 295.75, - "epoch": 0.9090909090909091, - "grad_norm": 0.006161710247397423, - "kl": 0.0028807735070586205, - "learning_rate": 1.2495391286323988e-07, - "loss": 0.0001, - "reward": 0.0, - "reward_std": 0.0, - "rewards/correctness_reward_func": 0.0, - "rewards/int_reward_func": 0.0, - "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.0, - "rewards/xmlcount_reward_func": 0.0, - "step": 190 - }, - { - "completion_length": 440.75, - "epoch": 0.9138755980861244, - "grad_norm": 0.0018302258104085922, - "kl": 0.00062660250114277, - "learning_rate": 1.1224375698271894e-07, - "loss": 0.0, - "reward": 0.0, - "reward_std": 0.0, - "rewards/correctness_reward_func": 0.0, - "rewards/int_reward_func": 0.0, - "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.0, - "rewards/xmlcount_reward_func": 0.0, - "step": 191 - }, - { - "completion_length": 265.375, - "epoch": 0.9186602870813397, - "grad_norm": 0.27701327204704285, - "kl": 0.0017912256298586726, - "learning_rate": 1.0020035249242304e-07, - "loss": 0.0001, - "reward": -0.026750000193715096, - "reward_std": 0.07566042244434357, - "rewards/correctness_reward_func": 0.0, - "rewards/int_reward_func": 0.0, - "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.0, - "rewards/xmlcount_reward_func": -0.026750000193715096, - "step": 192 - }, - { - "completion_length": 306.375, - "epoch": 0.9234449760765551, - "grad_norm": 0.20841118693351746, - "kl": 0.001377519336529076, - "learning_rate": 8.882706236405886e-08, - "loss": 0.0001, - "reward": 0.10949999839067459, - "reward_std": 0.20544515550136566, - "rewards/correctness_reward_func": 0.0, - "rewards/int_reward_func": 0.0625, - "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.0, - "rewards/xmlcount_reward_func": 0.04699999839067459, - "step": 193 - }, - { - "completion_length": 584.75, - "epoch": 0.9282296650717703, - "grad_norm": 0.0015115796122699976, - "kl": 0.0006448360509239137, - "learning_rate": 7.812706244818669e-08, - "loss": 0.0, - "reward": 0.0, - "reward_std": 0.0, - "rewards/correctness_reward_func": 0.0, - "rewards/int_reward_func": 0.0, - "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.0, - "rewards/xmlcount_reward_func": 0.0, - "step": 194 - }, - { - "completion_length": 263.625, - "epoch": 0.9330143540669856, - "grad_norm": 0.008000055328011513, - "kl": 0.0035724889021366835, - "learning_rate": 6.810334058740736e-08, - "loss": 0.0001, - "reward": 0.0, - "reward_std": 0.0, - "rewards/correctness_reward_func": 0.0, - "rewards/int_reward_func": 0.0, - "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.0, - "rewards/xmlcount_reward_func": 0.0, - "step": 195 - }, - { - "completion_length": 430.5, - "epoch": 0.937799043062201, - "grad_norm": 0.14772750437259674, - "kl": 0.001422423985786736, - "learning_rate": 5.8758695782038245e-08, - "loss": 0.0001, - "reward": -0.6988750100135803, - "reward_std": 1.9767169952392578, - "rewards/correctness_reward_func": 0.0, - "rewards/int_reward_func": 0.0, - "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.0, - "rewards/xmlcount_reward_func": -0.6988750100135803, - "step": 196 - }, - { - "completion_length": 318.875, - "epoch": 0.9425837320574163, - "grad_norm": 0.002858949825167656, - "kl": 0.0010267450707033277, - "learning_rate": 5.009573740853313e-08, - "loss": 0.0, - "reward": 0.0, - "reward_std": 0.0, - "rewards/correctness_reward_func": 0.0, - "rewards/int_reward_func": 0.0, - "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.0, - "rewards/xmlcount_reward_func": 0.0, - "step": 197 - }, - { - "completion_length": 669.375, - "epoch": 0.9473684210526315, - "grad_norm": 0.13853329420089722, - "kl": 0.0006254981271922588, - "learning_rate": 4.211688449084123e-08, - "loss": 0.0, - "reward": -0.07575000077486038, - "reward_std": 0.2877284288406372, - "rewards/correctness_reward_func": 0.0, - "rewards/int_reward_func": 0.0, - "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.0, - "rewards/xmlcount_reward_func": -0.07575000077486038, - "step": 198 - }, - { - "completion_length": 158.5, - "epoch": 0.9521531100478469, - "grad_norm": 0.34314221143722534, - "kl": 0.0019462838536128402, - "learning_rate": 3.4824365024928585e-08, - "loss": 0.0001, - "reward": -0.006624999921768904, - "reward_std": 0.01873832941055298, - "rewards/correctness_reward_func": 0.0, - "rewards/int_reward_func": 0.0, - "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.0, - "rewards/xmlcount_reward_func": -0.006624999921768904, - "step": 199 - }, - { - "completion_length": 716.875, - "epoch": 0.9569377990430622, - "grad_norm": 0.0014241288881748915, - "kl": 0.0005508558824658394, - "learning_rate": 2.8220215356634662e-08, - "loss": 0.0, - "reward": 0.0, - "reward_std": 0.0, - "rewards/correctness_reward_func": 0.0, - "rewards/int_reward_func": 0.0, - "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.0, - "rewards/xmlcount_reward_func": 0.0, - "step": 200 - }, - { - "completion_length": 500.875, - "epoch": 0.9617224880382775, - "grad_norm": 0.001157790538854897, - "kl": 0.0004124369006603956, - "learning_rate": 2.230627961304993e-08, - "loss": 0.0, - "reward": 0.0, - "reward_std": 0.0, - "rewards/correctness_reward_func": 0.0, - "rewards/int_reward_func": 0.0, - "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.0, - "rewards/xmlcount_reward_func": 0.0, - "step": 201 - }, - { - "completion_length": 304.75, - "epoch": 0.9665071770334929, - "grad_norm": 0.18415965139865875, - "kl": 0.0014049847377464175, - "learning_rate": 1.708420918756476e-08, - "loss": 0.0001, - "reward": -0.048624999821186066, - "reward_std": 0.1375322788953781, - "rewards/correctness_reward_func": 0.0, - "rewards/int_reward_func": 0.0, - "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.0, - "rewards/xmlcount_reward_func": -0.048624999821186066, - "step": 202 - }, - { - "completion_length": 212.875, - "epoch": 0.9712918660287081, - "grad_norm": 0.004437373951077461, - "kl": 0.0017340413760393858, - "learning_rate": 1.255546227873966e-08, - "loss": 0.0001, - "reward": 0.0, - "reward_std": 0.0, - "rewards/correctness_reward_func": 0.0, - "rewards/int_reward_func": 0.0, - "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.0, - "rewards/xmlcount_reward_func": 0.0, - "step": 203 - }, - { - "completion_length": 295.125, - "epoch": 0.9760765550239234, - "grad_norm": 0.44255179166793823, - "kl": 0.008340583182871342, - "learning_rate": 8.721303483121002e-09, - "loss": 0.0003, - "reward": -0.042750000953674316, - "reward_std": 0.1209152564406395, - "rewards/correctness_reward_func": 0.0, - "rewards/int_reward_func": 0.0, - "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.0, - "rewards/xmlcount_reward_func": -0.042750000953674316, - "step": 204 - }, - { - "completion_length": 268.625, - "epoch": 0.9808612440191388, - "grad_norm": 0.1835043728351593, - "kl": 0.0025195367634296417, - "learning_rate": 5.582803442117091e-09, - "loss": 0.0001, - "reward": 0.04637499898672104, - "reward_std": 0.13116830587387085, - "rewards/correctness_reward_func": 0.0, - "rewards/int_reward_func": 0.0, - "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.0, - "rewards/xmlcount_reward_func": 0.04637499898672104, - "step": 205 - }, - { - "completion_length": 413.375, - "epoch": 0.9856459330143541, - "grad_norm": 0.0029366558883339167, - "kl": 0.0013898334000259638, - "learning_rate": 3.1408385430356513e-09, - "loss": 0.0001, - "reward": 0.0, - "reward_std": 0.0, - "rewards/correctness_reward_func": 0.0, - "rewards/int_reward_func": 0.0, - "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.0, - "rewards/xmlcount_reward_func": 0.0, - "step": 206 - }, - { - "completion_length": 371.375, - "epoch": 0.9904306220095693, - "grad_norm": 0.19324921071529388, - "kl": 0.00130726199131459, - "learning_rate": 1.3960906743634706e-09, - "loss": 0.0001, - "reward": -0.24437500536441803, - "reward_std": 0.5071607232093811, - "rewards/correctness_reward_func": 0.0, - "rewards/int_reward_func": 0.0, - "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.0, - "rewards/xmlcount_reward_func": -0.24437500536441803, - "step": 207 - }, - { - "completion_length": 1108.125, - "epoch": 0.9952153110047847, - "grad_norm": 0.15215414762496948, - "kl": 0.0008032690966501832, - "learning_rate": 3.490470353573194e-10, - "loss": 0.0, - "reward": -0.17787499725818634, - "reward_std": 0.5031064748764038, - "rewards/correctness_reward_func": 0.0, - "rewards/int_reward_func": 0.0, - "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.0, - "rewards/xmlcount_reward_func": -0.17787499725818634, - "step": 208 - }, - { - "completion_length": 602.875, - "epoch": 1.0, - "grad_norm": 0.12628914415836334, - "kl": 0.000902353844139725, - "learning_rate": 0.0, - "loss": 0.0, - "reward": 0.04699999839067459, - "reward_std": 0.13293607532978058, - "rewards/correctness_reward_func": 0.0, - "rewards/int_reward_func": 0.0, - "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.0, - "rewards/xmlcount_reward_func": 0.04699999839067459, - "step": 209 - } - ], - "logging_steps": 1, - "max_steps": 209, - "num_input_tokens_seen": 0, - "num_train_epochs": 1, - "save_steps": 100, - "stateful_callbacks": { - "TrainerControl": { - "args": { - "should_epoch_stop": false, - "should_evaluate": false, - "should_log": false, - "should_save": true, - "should_training_stop": true - }, - "attributes": {} - } - }, - "total_flos": 0.0, - "train_batch_size": 8, - "trial_name": null, - "trial_params": null -} +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 209, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "completion_length": 691.5, + "epoch": 0.004784688995215311, + "grad_norm": 0.16871348023414612, + "kl": 0.0, + "learning_rate": 2.3809523809523811e-07, + "loss": -0.0, + "reward": -0.1928749978542328, + "reward_std": 0.5455328822135925, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.1928749978542328, + "step": 1 + } +], +"logging_steps": 1, +"max_steps": 1, +"num_input_tokens_seen": 0, +"num_train_epochs": 1, +"save_steps": 100, +"stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } +}, +"total_flos": 0.0, +"train_batch_size": 8, +"trial_name": null, +"trial_params": null +}