{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.7142857142857143, "eval_steps": 500, "global_step": 100, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "completion_length": 200.0, "epoch": 0.007142857142857143, "grad_norm": 0.00043654805631376803, "kl": 1.1190734767296817e-05, "learning_rate": 3.5714285714285714e-06, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/length_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 1 }, { "completion_length": 292.875, "epoch": 0.014285714285714285, "grad_norm": 0.1742347776889801, "kl": 5.753119239670923e-06, "learning_rate": 7.142857142857143e-06, "loss": 0.0, "reward": -0.3997499942779541, "reward_std": 0.15906040370464325, "rewards/correctness_reward_func": 0.0, "rewards/length_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.3997499942779541, "step": 2 }, { "completion_length": 445.5, "epoch": 0.02142857142857143, "grad_norm": 0.0913790687918663, "kl": 5.179101663088659e-06, "learning_rate": 1.0714285714285714e-05, "loss": 0.0, "reward": -0.7059999704360962, "reward_std": 0.729467511177063, "rewards/correctness_reward_func": 0.25, "rewards/length_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.9559999704360962, "step": 3 }, { "completion_length": 397.875, "epoch": 0.02857142857142857, "grad_norm": 0.18434225022792816, "kl": 1.612883534107823e-05, "learning_rate": 1.4285714285714285e-05, "loss": 0.0, "reward": -0.7477500438690186, "reward_std": 0.5131211876869202, "rewards/correctness_reward_func": 0.0, "rewards/length_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.7477500438690186, "step": 4 }, { "completion_length": 374.25, "epoch": 0.03571428571428571, "grad_norm": 0.15434053540229797, "kl": 3.564520739018917e-05, "learning_rate": 1.785714285714286e-05, "loss": 0.0, "reward": -0.4762499928474426, "reward_std": 0.7022607922554016, "rewards/correctness_reward_func": 0.25, "rewards/length_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.7262499928474426, "step": 5 }, { "completion_length": 264.25, "epoch": 0.04285714285714286, "grad_norm": 0.14609746634960175, "kl": 0.0002129438507836312, "learning_rate": 2.1428571428571428e-05, "loss": 0.0, "reward": -0.1522500216960907, "reward_std": 0.6662349700927734, "rewards/correctness_reward_func": 0.25, "rewards/length_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.4022500216960907, "step": 6 }, { "completion_length": 221.5, "epoch": 0.05, "grad_norm": 0.22101597487926483, "kl": 0.0012037234846502542, "learning_rate": 2.5e-05, "loss": 0.0, "reward": -0.11937500536441803, "reward_std": 0.2714921534061432, "rewards/correctness_reward_func": 0.0, "rewards/length_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.11937499791383743, "step": 7 }, { "completion_length": 247.875, "epoch": 0.05714285714285714, "grad_norm": 0.22249825298786163, "kl": 0.0021207458339631557, "learning_rate": 2.857142857142857e-05, "loss": 0.0001, "reward": -0.19712500274181366, "reward_std": 0.36591196060180664, "rewards/correctness_reward_func": 0.0, "rewards/length_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.19712500274181366, "step": 8 }, { "completion_length": 511.75, "epoch": 0.06428571428571428, "grad_norm": 0.11614850163459778, "kl": 0.0037874511908739805, "learning_rate": 3.2142857142857144e-05, "loss": 0.0002, "reward": -0.7689999938011169, "reward_std": 1.3408536911010742, "rewards/correctness_reward_func": 0.25, "rewards/length_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -1.0189999341964722, "step": 9 }, { "completion_length": 221.625, "epoch": 0.07142857142857142, "grad_norm": 0.21436981856822968, "kl": 0.012746547348797321, "learning_rate": 3.571428571428572e-05, "loss": 0.0005, "reward": -0.23862498998641968, "reward_std": 0.11265870183706284, "rewards/correctness_reward_func": 0.0, "rewards/length_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.23862498998641968, "step": 10 }, { "completion_length": 292.75, "epoch": 0.07857142857142857, "grad_norm": 0.19347621500492096, "kl": 0.011033562943339348, "learning_rate": 3.928571428571429e-05, "loss": 0.0004, "reward": -0.4468750059604645, "reward_std": 0.16547630727291107, "rewards/correctness_reward_func": 0.0, "rewards/length_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.4468749761581421, "step": 11 }, { "completion_length": 110.625, "epoch": 0.08571428571428572, "grad_norm": 0.44592323899269104, "kl": 0.060783885419368744, "learning_rate": 4.2857142857142856e-05, "loss": 0.0024, "reward": 1.424625039100647, "reward_std": 1.0278345346450806, "rewards/correctness_reward_func": 1.25, "rewards/length_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.17462500929832458, "step": 12 }, { "completion_length": 678.75, "epoch": 0.09285714285714286, "grad_norm": 0.1104876920580864, "kl": 0.013322519138455391, "learning_rate": 4.642857142857143e-05, "loss": 0.0005, "reward": 0.468500018119812, "reward_std": 0.9677569270133972, "rewards/correctness_reward_func": 1.0, "rewards/length_reward_func": 0.125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.656499981880188, "step": 13 }, { "completion_length": 495.25, "epoch": 0.1, "grad_norm": 0.13062289357185364, "kl": 0.021864598616957664, "learning_rate": 5e-05, "loss": 0.0009, "reward": -0.5831249952316284, "reward_std": 0.676399827003479, "rewards/correctness_reward_func": 0.0, "rewards/length_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.5831249952316284, "step": 14 }, { "completion_length": 781.375, "epoch": 0.10714285714285714, "grad_norm": 0.09523279964923859, "kl": 0.013723693788051605, "learning_rate": 4.999222955002041e-05, "loss": 0.0005, "reward": -0.13512498140335083, "reward_std": 0.9264968633651733, "rewards/correctness_reward_func": 0.0, "rewards/length_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.13512498140335083, "step": 15 }, { "completion_length": 405.5, "epoch": 0.11428571428571428, "grad_norm": 0.15296146273612976, "kl": 0.03523610904812813, "learning_rate": 4.996892303047306e-05, "loss": 0.0014, "reward": 2.193000078201294, "reward_std": 0.43767958879470825, "rewards/correctness_reward_func": 2.0, "rewards/length_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.19300000369548798, "step": 16 }, { "completion_length": 214.75, "epoch": 0.12142857142857143, "grad_norm": 0.2985721230506897, "kl": 0.0879557877779007, "learning_rate": 4.9930094929529506e-05, "loss": 0.0035, "reward": -0.011874988675117493, "reward_std": 0.6198051571846008, "rewards/correctness_reward_func": 0.0, "rewards/length_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.011875003576278687, "step": 17 }, { "completion_length": 121.25, "epoch": 0.12857142857142856, "grad_norm": 0.417566180229187, "kl": 0.22521352767944336, "learning_rate": 4.987576938413504e-05, "loss": 0.009, "reward": 2.4171249866485596, "reward_std": 0.1779337227344513, "rewards/correctness_reward_func": 2.0, "rewards/length_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.41712498664855957, "step": 18 }, { "completion_length": 179.0, "epoch": 0.1357142857142857, "grad_norm": 0.285080224275589, "kl": 0.10962098836898804, "learning_rate": 4.9805980165004304e-05, "loss": 0.0044, "reward": 1.75, "reward_std": 1.0350983142852783, "rewards/correctness_reward_func": 1.25, "rewards/length_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.5, "step": 19 }, { "completion_length": 282.875, "epoch": 0.14285714285714285, "grad_norm": 0.27518561482429504, "kl": 0.06473983079195023, "learning_rate": 4.972077065562821e-05, "loss": 0.0026, "reward": 1.2120000123977661, "reward_std": 1.0715053081512451, "rewards/correctness_reward_func": 0.75, "rewards/length_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.4620000123977661, "step": 20 }, { "completion_length": 146.0, "epoch": 0.15, "grad_norm": 0.01954607106745243, "kl": 0.1001419946551323, "learning_rate": 4.962019382530521e-05, "loss": 0.004, "reward": 2.5, "reward_std": 0.0, "rewards/correctness_reward_func": 2.0, "rewards/length_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.5, "step": 21 }, { "completion_length": 233.125, "epoch": 0.15714285714285714, "grad_norm": 0.019903944805264473, "kl": 0.0685255229473114, "learning_rate": 4.9504312196213596e-05, "loss": 0.0027, "reward": 2.5, "reward_std": 0.0, "rewards/correctness_reward_func": 2.0, "rewards/length_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.5, "step": 22 }, { "completion_length": 199.125, "epoch": 0.16428571428571428, "grad_norm": 0.022628186270594597, "kl": 0.08269491791725159, "learning_rate": 4.937319780454559e-05, "loss": 0.0033, "reward": 2.5, "reward_std": 0.0, "rewards/correctness_reward_func": 2.0, "rewards/length_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.5, "step": 23 }, { "completion_length": 388.25, "epoch": 0.17142857142857143, "grad_norm": 0.12237099558115005, "kl": 0.031046129763126373, "learning_rate": 4.922693215572695e-05, "loss": 0.0012, "reward": 1.5, "reward_std": 1.0690449476242065, "rewards/correctness_reward_func": 1.0, "rewards/length_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.5, "step": 24 }, { "completion_length": 262.875, "epoch": 0.17857142857142858, "grad_norm": 0.026575006544589996, "kl": 0.08844916522502899, "learning_rate": 4.90656061737503e-05, "loss": 0.0035, "reward": 2.5, "reward_std": 0.0, "rewards/correctness_reward_func": 2.0, "rewards/length_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.5, "step": 25 }, { "completion_length": 290.125, "epoch": 0.18571428571428572, "grad_norm": 0.5788246989250183, "kl": 0.06681957095861435, "learning_rate": 4.888932014465352e-05, "loss": 0.0027, "reward": 0.375, "reward_std": 0.2314550280570984, "rewards/correctness_reward_func": 0.0, "rewards/length_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.375, "step": 26 }, { "completion_length": 224.375, "epoch": 0.19285714285714287, "grad_norm": 0.4455007314682007, "kl": 0.2504517138004303, "learning_rate": 4.86981836541783e-05, "loss": 0.01, "reward": 1.625, "reward_std": 1.2174328565597534, "rewards/correctness_reward_func": 1.25, "rewards/length_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.375, "step": 27 }, { "completion_length": 232.125, "epoch": 0.2, "grad_norm": 0.23774927854537964, "kl": 0.1069759875535965, "learning_rate": 4.849231551964771e-05, "loss": 0.0043, "reward": 1.921875, "reward_std": 0.8937718272209167, "rewards/correctness_reward_func": 1.5, "rewards/length_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.421875, "step": 28 }, { "completion_length": 458.625, "epoch": 0.20714285714285716, "grad_norm": 0.09729224443435669, "kl": 0.03021763078868389, "learning_rate": 4.827184371610511e-05, "loss": 0.0012, "reward": 1.75, "reward_std": 1.0350983142852783, "rewards/correctness_reward_func": 1.25, "rewards/length_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.5, "step": 29 }, { "completion_length": 422.375, "epoch": 0.21428571428571427, "grad_norm": 0.12195795774459839, "kl": 0.04913492873311043, "learning_rate": 4.803690529676019e-05, "loss": 0.002, "reward": 2.25, "reward_std": 0.5345224738121033, "rewards/correctness_reward_func": 1.75, "rewards/length_reward_func": 0.0625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.4375, "step": 30 }, { "completion_length": 270.875, "epoch": 0.22142857142857142, "grad_norm": 0.21031659841537476, "kl": 0.0698024183511734, "learning_rate": 4.778764630779183e-05, "loss": 0.0028, "reward": 2.21875, "reward_std": 0.6999680995941162, "rewards/correctness_reward_func": 1.75, "rewards/length_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.46875, "step": 31 }, { "completion_length": 377.5, "epoch": 0.22857142857142856, "grad_norm": 0.010702410712838173, "kl": 0.037429843097925186, "learning_rate": 4.752422169756048e-05, "loss": 0.0015, "reward": 2.5, "reward_std": 0.0, "rewards/correctness_reward_func": 2.0, "rewards/length_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.5, "step": 32 }, { "completion_length": 360.125, "epoch": 0.2357142857142857, "grad_norm": 0.14157415926456451, "kl": 0.08244010806083679, "learning_rate": 4.724679522028672e-05, "loss": 0.0033, "reward": 1.6486248970031738, "reward_std": 1.1074291467666626, "rewards/correctness_reward_func": 1.25, "rewards/length_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.3986250162124634, "step": 33 }, { "completion_length": 116.125, "epoch": 0.24285714285714285, "grad_norm": 1.3088772296905518, "kl": 0.7027589082717896, "learning_rate": 4.6955539334255716e-05, "loss": 0.0281, "reward": 1.75, "reward_std": 1.0350983142852783, "rewards/correctness_reward_func": 1.25, "rewards/length_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.5, "step": 34 }, { "completion_length": 311.0, "epoch": 0.25, "grad_norm": 0.7312092781066895, "kl": 0.07371841371059418, "learning_rate": 4.665063509461097e-05, "loss": 0.0029, "reward": 0.6698750257492065, "reward_std": 0.7472349405288696, "rewards/correctness_reward_func": 0.25, "rewards/length_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.41987499594688416, "step": 35 }, { "completion_length": 329.625, "epoch": 0.2571428571428571, "grad_norm": 0.006386851891875267, "kl": 0.039320699870586395, "learning_rate": 4.6332272040803895e-05, "loss": 0.0016, "reward": 2.5, "reward_std": 0.0, "rewards/correctness_reward_func": 2.0, "rewards/length_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.5, "step": 36 }, { "completion_length": 365.875, "epoch": 0.2642857142857143, "grad_norm": 0.20290029048919678, "kl": 0.05483713746070862, "learning_rate": 4.600064807876929e-05, "loss": 0.0022, "reward": 1.6407499313354492, "reward_std": 1.0948845148086548, "rewards/correctness_reward_func": 1.25, "rewards/length_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.390749990940094, "step": 37 }, { "completion_length": 287.0, "epoch": 0.2714285714285714, "grad_norm": 0.19799265265464783, "kl": 0.13818374276161194, "learning_rate": 4.5655969357899874e-05, "loss": 0.0055, "reward": 1.875999927520752, "reward_std": 0.9384979605674744, "rewards/correctness_reward_func": 1.5, "rewards/length_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 38 }, { "completion_length": 289.875, "epoch": 0.2785714285714286, "grad_norm": 0.37082406878471375, "kl": 0.14701411128044128, "learning_rate": 4.529845014289642e-05, "loss": 0.0059, "reward": 2.0653750896453857, "reward_std": 0.6956271529197693, "rewards/correctness_reward_func": 1.75, "rewards/length_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.3153750002384186, "step": 39 }, { "completion_length": 725.125, "epoch": 0.2857142857142857, "grad_norm": 0.11310902237892151, "kl": 0.05320233479142189, "learning_rate": 4.4928312680573064e-05, "loss": 0.0021, "reward": 0.5361250042915344, "reward_std": 0.2760908007621765, "rewards/correctness_reward_func": 0.0, "rewards/length_reward_func": 0.1875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.3486250042915344, "step": 40 }, { "completion_length": 373.625, "epoch": 0.29285714285714287, "grad_norm": 0.005955233704298735, "kl": 0.09833689779043198, "learning_rate": 4.454578706170075e-05, "loss": 0.0039, "reward": 0.36500000953674316, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/length_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.36500000953674316, "step": 41 }, { "completion_length": 337.75, "epoch": 0.3, "grad_norm": 0.18655544519424438, "kl": 0.13665920495986938, "learning_rate": 4.415111107797445e-05, "loss": 0.0055, "reward": 1.3464999198913574, "reward_std": 1.0498690605163574, "rewards/correctness_reward_func": 1.0, "rewards/length_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.3465000092983246, "step": 42 }, { "completion_length": 463.75, "epoch": 0.30714285714285716, "grad_norm": 0.18999366462230682, "kl": 0.1055900901556015, "learning_rate": 4.374453007419336e-05, "loss": 0.0042, "reward": 0.3206250071525574, "reward_std": 0.08869842439889908, "rewards/correctness_reward_func": 0.0, "rewards/length_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.3206250071525574, "step": 43 }, { "completion_length": 350.25, "epoch": 0.3142857142857143, "grad_norm": 0.16436553001403809, "kl": 0.12820430099964142, "learning_rate": 4.332629679574566e-05, "loss": 0.0051, "reward": 2.3486249446868896, "reward_std": 0.04361177235841751, "rewards/correctness_reward_func": 2.0, "rewards/length_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.3486250042915344, "step": 44 }, { "completion_length": 396.625, "epoch": 0.32142857142857145, "grad_norm": 0.12755045294761658, "kl": 0.10351184010505676, "learning_rate": 4.2896671231492966e-05, "loss": 0.0041, "reward": 2.1152501106262207, "reward_std": 0.707207977771759, "rewards/correctness_reward_func": 1.75, "rewards/length_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.36524999141693115, "step": 45 }, { "completion_length": 323.0, "epoch": 0.32857142857142857, "grad_norm": 0.1433708518743515, "kl": 0.12643718719482422, "learning_rate": 4.245592045215182e-05, "loss": 0.0051, "reward": 2.085624933242798, "reward_std": 0.6997721791267395, "rewards/correctness_reward_func": 1.75, "rewards/length_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.3356249928474426, "step": 46 }, { "completion_length": 465.875, "epoch": 0.3357142857142857, "grad_norm": 0.14527598023414612, "kl": 0.09910832345485687, "learning_rate": 4.2004318444272985e-05, "loss": 0.004, "reward": 1.0987499952316284, "reward_std": 1.0445955991744995, "rewards/correctness_reward_func": 0.75, "rewards/length_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.3487499952316284, "step": 47 }, { "completion_length": 443.875, "epoch": 0.34285714285714286, "grad_norm": 0.11678742617368698, "kl": 0.09625618904829025, "learning_rate": 4.154214593992149e-05, "loss": 0.0039, "reward": 2.1332499980926514, "reward_std": 0.716006875038147, "rewards/correctness_reward_func": 1.75, "rewards/length_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.38324999809265137, "step": 48 }, { "completion_length": 298.875, "epoch": 0.35, "grad_norm": 0.2462267130613327, "kl": 0.14918765425682068, "learning_rate": 4.1069690242163484e-05, "loss": 0.006, "reward": 1.3640000820159912, "reward_std": 1.0677127838134766, "rewards/correctness_reward_func": 1.0, "rewards/length_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.36400002241134644, "step": 49 }, { "completion_length": 604.125, "epoch": 0.35714285714285715, "grad_norm": 0.11957182735204697, "kl": 0.06581288576126099, "learning_rate": 4.058724504646834e-05, "loss": 0.0026, "reward": 0.42887499928474426, "reward_std": 0.1762550324201584, "rewards/correctness_reward_func": 0.0, "rewards/length_reward_func": 0.0625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.36637499928474426, "step": 50 }, { "completion_length": 630.75, "epoch": 0.36428571428571427, "grad_norm": 0.20287089049816132, "kl": 0.15163259208202362, "learning_rate": 4.009511025813694e-05, "loss": 0.0061, "reward": 0.3474999964237213, "reward_std": 0.10380475223064423, "rewards/correctness_reward_func": 0.0, "rewards/length_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.3475000262260437, "step": 51 }, { "completion_length": 409.875, "epoch": 0.37142857142857144, "grad_norm": 0.15633811056613922, "kl": 0.12077239155769348, "learning_rate": 3.959359180586975e-05, "loss": 0.0048, "reward": 1.6152499914169312, "reward_std": 1.0375232696533203, "rewards/correctness_reward_func": 1.25, "rewards/length_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.36524999141693115, "step": 52 }, { "completion_length": 473.875, "epoch": 0.37857142857142856, "grad_norm": 0.1574457734823227, "kl": 0.08710946887731552, "learning_rate": 3.908300145159055e-05, "loss": 0.0035, "reward": 0.8807500004768372, "reward_std": 0.9173192381858826, "rewards/correctness_reward_func": 0.5, "rewards/length_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.38075000047683716, "step": 53 }, { "completion_length": 321.25, "epoch": 0.38571428571428573, "grad_norm": 0.19621425867080688, "kl": 0.17770174145698547, "learning_rate": 3.856365659664399e-05, "loss": 0.0071, "reward": 2.1021249294281006, "reward_std": 0.7031054496765137, "rewards/correctness_reward_func": 1.75, "rewards/length_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.35212498903274536, "step": 54 }, { "completion_length": 273.75, "epoch": 0.39285714285714285, "grad_norm": 0.19145628809928894, "kl": 0.16191110014915466, "learning_rate": 3.803588008448745e-05, "loss": 0.0065, "reward": 0.6318750381469727, "reward_std": 0.7018797993659973, "rewards/correctness_reward_func": 0.25, "rewards/length_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.38187500834465027, "step": 55 }, { "completion_length": 281.25, "epoch": 0.4, "grad_norm": 0.30190473794937134, "kl": 0.1685158759355545, "learning_rate": 3.7500000000000003e-05, "loss": 0.0067, "reward": 0.8665000200271606, "reward_std": 0.9249003529548645, "rewards/correctness_reward_func": 0.5, "rewards/length_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.36650002002716064, "step": 56 }, { "completion_length": 340.75, "epoch": 0.40714285714285714, "grad_norm": 0.15627902746200562, "kl": 0.12346489727497101, "learning_rate": 3.695634946553296e-05, "loss": 0.0049, "reward": 0.32987499237060547, "reward_std": 0.06595548242330551, "rewards/correctness_reward_func": 0.0, "rewards/length_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.32987502217292786, "step": 57 }, { "completion_length": 993.25, "epoch": 0.4142857142857143, "grad_norm": 0.0931173712015152, "kl": 0.018140610307455063, "learning_rate": 3.6405266433829075e-05, "loss": 0.0007, "reward": 0.4088750183582306, "reward_std": 0.2790803611278534, "rewards/correctness_reward_func": 0.0, "rewards/length_reward_func": 0.25, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1588750034570694, "step": 58 }, { "completion_length": 474.875, "epoch": 0.42142857142857143, "grad_norm": 0.10973533242940903, "kl": 0.09137643128633499, "learning_rate": 3.5847093477938956e-05, "loss": 0.0037, "reward": 0.42787498235702515, "reward_std": 0.17703060805797577, "rewards/correctness_reward_func": 0.0, "rewards/length_reward_func": 0.0625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.36537498235702515, "step": 59 }, { "completion_length": 326.0, "epoch": 0.42857142857142855, "grad_norm": 0.19174934923648834, "kl": 0.13062816858291626, "learning_rate": 3.5282177578265296e-05, "loss": 0.0052, "reward": 0.6150000095367432, "reward_std": 0.7071067690849304, "rewards/correctness_reward_func": 0.25, "rewards/length_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.36500000953674316, "step": 60 }, { "completion_length": 180.125, "epoch": 0.4357142857142857, "grad_norm": 0.48899680376052856, "kl": 0.4511120319366455, "learning_rate": 3.471086990686737e-05, "loss": 0.018, "reward": 2.324000120162964, "reward_std": 0.11677447706460953, "rewards/correctness_reward_func": 2.0, "rewards/length_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.3240000009536743, "step": 61 }, { "completion_length": 442.0, "epoch": 0.44285714285714284, "grad_norm": 0.14630813896656036, "kl": 0.11019708961248398, "learning_rate": 3.413352560915988e-05, "loss": 0.0044, "reward": 0.17625001072883606, "reward_std": 0.5338656306266785, "rewards/correctness_reward_func": 0.0, "rewards/length_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.17625001072883606, "step": 62 }, { "completion_length": 337.625, "epoch": 0.45, "grad_norm": 0.15681755542755127, "kl": 0.1342012584209442, "learning_rate": 3.355050358314172e-05, "loss": 0.0054, "reward": 1.1151249408721924, "reward_std": 1.0349948406219482, "rewards/correctness_reward_func": 0.75, "rewards/length_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.36512500047683716, "step": 63 }, { "completion_length": 255.375, "epoch": 0.45714285714285713, "grad_norm": 0.15988513827323914, "kl": 0.14537671208381653, "learning_rate": 3.2962166256292113e-05, "loss": 0.0059, "reward": 2.36537504196167, "reward_std": 0.0005174623220227659, "rewards/correctness_reward_func": 2.0, "rewards/length_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.36537498235702515, "step": 64 }, { "completion_length": 303.875, "epoch": 0.4642857142857143, "grad_norm": 0.6239961981773376, "kl": 0.10595186054706573, "learning_rate": 3.2368879360272606e-05, "loss": 0.0042, "reward": 2.1156249046325684, "reward_std": 0.7073594331741333, "rewards/correctness_reward_func": 1.75, "rewards/length_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.3656249940395355, "step": 65 }, { "completion_length": 554.625, "epoch": 0.4714285714285714, "grad_norm": 0.0981917455792427, "kl": 0.06483708322048187, "learning_rate": 3.177101170357513e-05, "loss": 0.0026, "reward": 0.33550000190734863, "reward_std": 0.08505627512931824, "rewards/correctness_reward_func": 0.0, "rewards/length_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.33550000190734863, "step": 66 }, { "completion_length": 164.0, "epoch": 0.4785714285714286, "grad_norm": 0.2880488634109497, "kl": 0.24433362483978271, "learning_rate": 3.116893494225734e-05, "loss": 0.0098, "reward": 0.41474997997283936, "reward_std": 0.3112049102783203, "rewards/correctness_reward_func": 0.0, "rewards/length_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0625, "rewards/xmlcount_reward_func": 0.35224997997283936, "step": 67 }, { "completion_length": 327.375, "epoch": 0.4857142857142857, "grad_norm": 0.15951593220233917, "kl": 0.09316325187683105, "learning_rate": 3.056302334890786e-05, "loss": 0.0037, "reward": 1.013374924659729, "reward_std": 0.8874584436416626, "rewards/correctness_reward_func": 0.75, "rewards/length_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2633749842643738, "step": 68 }, { "completion_length": 183.625, "epoch": 0.4928571428571429, "grad_norm": 0.19407892227172852, "kl": 0.2086183726787567, "learning_rate": 2.9953653579984942e-05, "loss": 0.0083, "reward": 2.365499973297119, "reward_std": 0.0005344200180843472, "rewards/correctness_reward_func": 2.0, "rewards/length_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.36549997329711914, "step": 69 }, { "completion_length": 260.375, "epoch": 0.5, "grad_norm": 0.2512143850326538, "kl": 0.14164677262306213, "learning_rate": 2.9341204441673266e-05, "loss": 0.0057, "reward": 2.020250082015991, "reward_std": 0.7143486142158508, "rewards/correctness_reward_func": 1.75, "rewards/length_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.27024999260902405, "step": 70 }, { "completion_length": 191.125, "epoch": 0.5071428571428571, "grad_norm": 0.3029208779335022, "kl": 0.11700859665870667, "learning_rate": 2.872605665440436e-05, "loss": 0.0047, "reward": 2.012125015258789, "reward_std": 0.7928214073181152, "rewards/correctness_reward_func": 1.5, "rewards/length_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0625, "rewards/xmlcount_reward_func": 0.44962501525878906, "step": 71 }, { "completion_length": 500.5, "epoch": 0.5142857142857142, "grad_norm": 0.12518590688705444, "kl": 0.02341982163488865, "learning_rate": 2.8108592616187133e-05, "loss": 0.0009, "reward": -0.19187498092651367, "reward_std": 0.7031666040420532, "rewards/correctness_reward_func": 0.0, "rewards/length_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.19187499582767487, "step": 72 }, { "completion_length": 710.25, "epoch": 0.5214285714285715, "grad_norm": 0.11030412465333939, "kl": 0.01801430992782116, "learning_rate": 2.748919616489542e-05, "loss": 0.0007, "reward": -0.19962501525878906, "reward_std": 1.141786813735962, "rewards/correctness_reward_func": 0.0, "rewards/length_reward_func": 0.125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.3246249854564667, "step": 73 }, { "completion_length": 395.0, "epoch": 0.5285714285714286, "grad_norm": 0.08873239159584045, "kl": 0.02234470844268799, "learning_rate": 2.686825233966061e-05, "loss": 0.0009, "reward": 1.4856250286102295, "reward_std": 0.9701153039932251, "rewards/correctness_reward_func": 1.75, "rewards/length_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.2643750011920929, "step": 74 }, { "completion_length": 386.125, "epoch": 0.5357142857142857, "grad_norm": 0.09125195443630219, "kl": 0.029970454052090645, "learning_rate": 2.624614714151743e-05, "loss": 0.0012, "reward": 1.2940000295639038, "reward_std": 0.5996603965759277, "rewards/correctness_reward_func": 1.75, "rewards/length_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.45600003004074097, "step": 75 }, { "completion_length": 239.375, "epoch": 0.5428571428571428, "grad_norm": 0.1535108983516693, "kl": 0.05824340879917145, "learning_rate": 2.5623267293451826e-05, "loss": 0.0023, "reward": 1.394374966621399, "reward_std": 0.8498432040214539, "rewards/correctness_reward_func": 1.5, "rewards/length_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.10562500357627869, "step": 76 }, { "completion_length": 351.625, "epoch": 0.55, "grad_norm": 0.15809431672096252, "kl": 0.02210851013660431, "learning_rate": 2.5e-05, "loss": 0.0009, "reward": 1.4718749523162842, "reward_std": 1.3246859312057495, "rewards/correctness_reward_func": 1.5, "rewards/length_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.02812500298023224, "step": 77 }, { "completion_length": 199.375, "epoch": 0.5571428571428572, "grad_norm": 0.20733776688575745, "kl": 0.06919591128826141, "learning_rate": 2.4376732706548183e-05, "loss": 0.0028, "reward": 2.0334999561309814, "reward_std": 0.9354116320610046, "rewards/correctness_reward_func": 1.75, "rewards/length_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0625, "rewards/xmlcount_reward_func": 0.22100000083446503, "step": 78 }, { "completion_length": 490.625, "epoch": 0.5642857142857143, "grad_norm": 0.3020256459712982, "kl": 0.020964641124010086, "learning_rate": 2.375385285848257e-05, "loss": 0.0008, "reward": 0.2606250047683716, "reward_std": 1.5538617372512817, "rewards/correctness_reward_func": 0.75, "rewards/length_reward_func": 0.125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.6143749952316284, "step": 79 }, { "completion_length": 257.875, "epoch": 0.5714285714285714, "grad_norm": 0.17333918809890747, "kl": 0.040599275380373, "learning_rate": 2.3131747660339394e-05, "loss": 0.0016, "reward": 1.8927500247955322, "reward_std": 0.6703715324401855, "rewards/correctness_reward_func": 1.75, "rewards/length_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.14274999499320984, "step": 80 }, { "completion_length": 292.625, "epoch": 0.5785714285714286, "grad_norm": 0.14176879823207855, "kl": 0.03724910691380501, "learning_rate": 2.251080383510459e-05, "loss": 0.0015, "reward": 2.164875030517578, "reward_std": 0.35948193073272705, "rewards/correctness_reward_func": 2.0, "rewards/length_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.16487500071525574, "step": 81 }, { "completion_length": 500.25, "epoch": 0.5857142857142857, "grad_norm": 0.1182783842086792, "kl": 0.023212479427456856, "learning_rate": 2.189140738381288e-05, "loss": 0.0009, "reward": 0.7456250190734863, "reward_std": 1.171677589416504, "rewards/correctness_reward_func": 0.5, "rewards/length_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.24562500417232513, "step": 82 }, { "completion_length": 487.125, "epoch": 0.5928571428571429, "grad_norm": 0.12226809561252594, "kl": 0.019420940428972244, "learning_rate": 2.1273943345595637e-05, "loss": 0.0008, "reward": 0.3148750066757202, "reward_std": 1.1947816610336304, "rewards/correctness_reward_func": 0.25, "rewards/length_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.06487500667572021, "step": 83 }, { "completion_length": 152.875, "epoch": 0.6, "grad_norm": 0.27960774302482605, "kl": 0.07769262790679932, "learning_rate": 2.0658795558326743e-05, "loss": 0.0031, "reward": 2.5625, "reward_std": 0.1767766922712326, "rewards/correctness_reward_func": 2.0, "rewards/length_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0625, "rewards/xmlcount_reward_func": 0.5, "step": 84 }, { "completion_length": 235.75, "epoch": 0.6071428571428571, "grad_norm": 0.2650700807571411, "kl": 0.0834224745631218, "learning_rate": 2.0046346420015067e-05, "loss": 0.0033, "reward": 2.243499994277954, "reward_std": 0.5382997393608093, "rewards/correctness_reward_func": 1.75, "rewards/length_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0625, "rewards/xmlcount_reward_func": 0.4309999942779541, "step": 85 }, { "completion_length": 324.0, "epoch": 0.6142857142857143, "grad_norm": 0.00965797994285822, "kl": 0.04011977091431618, "learning_rate": 1.9436976651092144e-05, "loss": 0.0016, "reward": 0.5, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/length_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.5, "step": 86 }, { "completion_length": 227.0, "epoch": 0.6214285714285714, "grad_norm": 0.19126032292842865, "kl": 0.12643316388130188, "learning_rate": 1.8831065057742657e-05, "loss": 0.0051, "reward": 2.5, "reward_std": 0.0, "rewards/correctness_reward_func": 2.0, "rewards/length_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.5, "step": 87 }, { "completion_length": 510.75, "epoch": 0.6285714285714286, "grad_norm": 0.003844304708763957, "kl": 0.016257166862487793, "learning_rate": 1.8228988296424877e-05, "loss": 0.0007, "reward": 0.5, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/length_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.5, "step": 88 }, { "completion_length": 243.0, "epoch": 0.6357142857142857, "grad_norm": 0.008268176577985287, "kl": 0.045144014060497284, "learning_rate": 1.7631120639727393e-05, "loss": 0.0018, "reward": 2.5, "reward_std": 0.0, "rewards/correctness_reward_func": 2.0, "rewards/length_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.5, "step": 89 }, { "completion_length": 235.25, "epoch": 0.6428571428571429, "grad_norm": 0.14653581380844116, "kl": 0.038456518203020096, "learning_rate": 1.7037833743707892e-05, "loss": 0.0015, "reward": 2.375, "reward_std": 0.5824823379516602, "rewards/correctness_reward_func": 1.75, "rewards/length_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.125, "rewards/xmlcount_reward_func": 0.5, "step": 90 }, { "completion_length": 220.125, "epoch": 0.65, "grad_norm": 0.14805400371551514, "kl": 0.07326260954141617, "learning_rate": 1.6449496416858284e-05, "loss": 0.0029, "reward": 1.3125, "reward_std": 1.1319231986999512, "rewards/correctness_reward_func": 0.75, "rewards/length_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0625, "rewards/xmlcount_reward_func": 0.5, "step": 91 }, { "completion_length": 449.0, "epoch": 0.6571428571428571, "grad_norm": 0.11858955770730972, "kl": 0.027846258133649826, "learning_rate": 1.5866474390840125e-05, "loss": 0.0011, "reward": 0.75, "reward_std": 0.7071067690849304, "rewards/correctness_reward_func": 0.25, "rewards/length_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.5, "step": 92 }, { "completion_length": 219.5, "epoch": 0.6642857142857143, "grad_norm": 0.21257686614990234, "kl": 0.058622974902391434, "learning_rate": 1.5289130093132632e-05, "loss": 0.0023, "reward": 1.046875, "reward_std": 0.9159731268882751, "rewards/correctness_reward_func": 0.5, "rewards/length_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0625, "rewards/xmlcount_reward_func": 0.484375, "step": 93 }, { "completion_length": 191.125, "epoch": 0.6714285714285714, "grad_norm": 0.017704889178276062, "kl": 0.06602377444505692, "learning_rate": 1.4717822421734718e-05, "loss": 0.0026, "reward": 2.5, "reward_std": 0.0, "rewards/correctness_reward_func": 2.0, "rewards/length_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.5, "step": 94 }, { "completion_length": 727.75, "epoch": 0.6785714285714286, "grad_norm": 0.07640424370765686, "kl": 0.015565132722258568, "learning_rate": 1.4152906522061048e-05, "loss": 0.0006, "reward": 1.875, "reward_std": 1.0264363288879395, "rewards/correctness_reward_func": 1.25, "rewards/length_reward_func": 0.125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.5, "step": 95 }, { "completion_length": 333.625, "epoch": 0.6857142857142857, "grad_norm": 0.011164901778101921, "kl": 0.03891594707965851, "learning_rate": 1.3594733566170926e-05, "loss": 0.0016, "reward": 2.5, "reward_std": 0.0, "rewards/correctness_reward_func": 2.0, "rewards/length_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.5, "step": 96 }, { "completion_length": 795.5, "epoch": 0.6928571428571428, "grad_norm": 0.07001640647649765, "kl": 0.014953548088669777, "learning_rate": 1.3043650534467053e-05, "loss": 0.0006, "reward": 1.203125, "reward_std": 0.9863747954368591, "rewards/correctness_reward_func": 0.5, "rewards/length_reward_func": 0.25, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.453125, "step": 97 }, { "completion_length": 342.5, "epoch": 0.7, "grad_norm": 0.015015755780041218, "kl": 0.03935668244957924, "learning_rate": 1.2500000000000006e-05, "loss": 0.0016, "reward": 2.5, "reward_std": 0.0, "rewards/correctness_reward_func": 2.0, "rewards/length_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.5, "step": 98 }, { "completion_length": 550.25, "epoch": 0.7071428571428572, "grad_norm": 0.10194458067417145, "kl": 0.022631347179412842, "learning_rate": 1.196411991551255e-05, "loss": 0.0009, "reward": 1.5, "reward_std": 1.0690449476242065, "rewards/correctness_reward_func": 1.0, "rewards/length_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.5, "step": 99 }, { "completion_length": 247.375, "epoch": 0.7142857142857143, "grad_norm": 0.15261337161064148, "kl": 0.08087805658578873, "learning_rate": 1.1436343403356017e-05, "loss": 0.0032, "reward": 2.0625, "reward_std": 0.979704737663269, "rewards/correctness_reward_func": 1.5, "rewards/length_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0625, "rewards/xmlcount_reward_func": 0.5, "step": 100 } ], "logging_steps": 1, "max_steps": 140, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }