| { | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 0.7142857142857143, | |
| "eval_steps": 500, | |
| "global_step": 100, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "completion_length": 200.0, | |
| "epoch": 0.007142857142857143, | |
| "grad_norm": 0.00043654805631376803, | |
| "kl": 1.1190734767296817e-05, | |
| "learning_rate": 3.5714285714285714e-06, | |
| "loss": 0.0, | |
| "reward": 0.125, | |
| "reward_std": 0.0, | |
| "rewards/correctness_reward_func": 0.0, | |
| "rewards/length_reward_func": 0.0, | |
| "rewards/soft_format_reward_func": 0.0, | |
| "rewards/strict_format_reward_func": 0.0, | |
| "rewards/xmlcount_reward_func": 0.125, | |
| "step": 1 | |
| }, | |
| { | |
| "completion_length": 292.875, | |
| "epoch": 0.014285714285714285, | |
| "grad_norm": 0.1742347776889801, | |
| "kl": 5.753119239670923e-06, | |
| "learning_rate": 7.142857142857143e-06, | |
| "loss": 0.0, | |
| "reward": -0.3997499942779541, | |
| "reward_std": 0.15906040370464325, | |
| "rewards/correctness_reward_func": 0.0, | |
| "rewards/length_reward_func": 0.0, | |
| "rewards/soft_format_reward_func": 0.0, | |
| "rewards/strict_format_reward_func": 0.0, | |
| "rewards/xmlcount_reward_func": -0.3997499942779541, | |
| "step": 2 | |
| }, | |
| { | |
| "completion_length": 445.5, | |
| "epoch": 0.02142857142857143, | |
| "grad_norm": 0.0913790687918663, | |
| "kl": 5.179101663088659e-06, | |
| "learning_rate": 1.0714285714285714e-05, | |
| "loss": 0.0, | |
| "reward": -0.7059999704360962, | |
| "reward_std": 0.729467511177063, | |
| "rewards/correctness_reward_func": 0.25, | |
| "rewards/length_reward_func": 0.0, | |
| "rewards/soft_format_reward_func": 0.0, | |
| "rewards/strict_format_reward_func": 0.0, | |
| "rewards/xmlcount_reward_func": -0.9559999704360962, | |
| "step": 3 | |
| }, | |
| { | |
| "completion_length": 397.875, | |
| "epoch": 0.02857142857142857, | |
| "grad_norm": 0.18434225022792816, | |
| "kl": 1.612883534107823e-05, | |
| "learning_rate": 1.4285714285714285e-05, | |
| "loss": 0.0, | |
| "reward": -0.7477500438690186, | |
| "reward_std": 0.5131211876869202, | |
| "rewards/correctness_reward_func": 0.0, | |
| "rewards/length_reward_func": 0.0, | |
| "rewards/soft_format_reward_func": 0.0, | |
| "rewards/strict_format_reward_func": 0.0, | |
| "rewards/xmlcount_reward_func": -0.7477500438690186, | |
| "step": 4 | |
| }, | |
| { | |
| "completion_length": 374.25, | |
| "epoch": 0.03571428571428571, | |
| "grad_norm": 0.15434053540229797, | |
| "kl": 3.564520739018917e-05, | |
| "learning_rate": 1.785714285714286e-05, | |
| "loss": 0.0, | |
| "reward": -0.4762499928474426, | |
| "reward_std": 0.7022607922554016, | |
| "rewards/correctness_reward_func": 0.25, | |
| "rewards/length_reward_func": 0.0, | |
| "rewards/soft_format_reward_func": 0.0, | |
| "rewards/strict_format_reward_func": 0.0, | |
| "rewards/xmlcount_reward_func": -0.7262499928474426, | |
| "step": 5 | |
| }, | |
| { | |
| "completion_length": 264.25, | |
| "epoch": 0.04285714285714286, | |
| "grad_norm": 0.14609746634960175, | |
| "kl": 0.0002129438507836312, | |
| "learning_rate": 2.1428571428571428e-05, | |
| "loss": 0.0, | |
| "reward": -0.1522500216960907, | |
| "reward_std": 0.6662349700927734, | |
| "rewards/correctness_reward_func": 0.25, | |
| "rewards/length_reward_func": 0.0, | |
| "rewards/soft_format_reward_func": 0.0, | |
| "rewards/strict_format_reward_func": 0.0, | |
| "rewards/xmlcount_reward_func": -0.4022500216960907, | |
| "step": 6 | |
| }, | |
| { | |
| "completion_length": 221.5, | |
| "epoch": 0.05, | |
| "grad_norm": 0.22101597487926483, | |
| "kl": 0.0012037234846502542, | |
| "learning_rate": 2.5e-05, | |
| "loss": 0.0, | |
| "reward": -0.11937500536441803, | |
| "reward_std": 0.2714921534061432, | |
| "rewards/correctness_reward_func": 0.0, | |
| "rewards/length_reward_func": 0.0, | |
| "rewards/soft_format_reward_func": 0.0, | |
| "rewards/strict_format_reward_func": 0.0, | |
| "rewards/xmlcount_reward_func": -0.11937499791383743, | |
| "step": 7 | |
| }, | |
| { | |
| "completion_length": 247.875, | |
| "epoch": 0.05714285714285714, | |
| "grad_norm": 0.22249825298786163, | |
| "kl": 0.0021207458339631557, | |
| "learning_rate": 2.857142857142857e-05, | |
| "loss": 0.0001, | |
| "reward": -0.19712500274181366, | |
| "reward_std": 0.36591196060180664, | |
| "rewards/correctness_reward_func": 0.0, | |
| "rewards/length_reward_func": 0.0, | |
| "rewards/soft_format_reward_func": 0.0, | |
| "rewards/strict_format_reward_func": 0.0, | |
| "rewards/xmlcount_reward_func": -0.19712500274181366, | |
| "step": 8 | |
| }, | |
| { | |
| "completion_length": 511.75, | |
| "epoch": 0.06428571428571428, | |
| "grad_norm": 0.11614850163459778, | |
| "kl": 0.0037874511908739805, | |
| "learning_rate": 3.2142857142857144e-05, | |
| "loss": 0.0002, | |
| "reward": -0.7689999938011169, | |
| "reward_std": 1.3408536911010742, | |
| "rewards/correctness_reward_func": 0.25, | |
| "rewards/length_reward_func": 0.0, | |
| "rewards/soft_format_reward_func": 0.0, | |
| "rewards/strict_format_reward_func": 0.0, | |
| "rewards/xmlcount_reward_func": -1.0189999341964722, | |
| "step": 9 | |
| }, | |
| { | |
| "completion_length": 221.625, | |
| "epoch": 0.07142857142857142, | |
| "grad_norm": 0.21436981856822968, | |
| "kl": 0.012746547348797321, | |
| "learning_rate": 3.571428571428572e-05, | |
| "loss": 0.0005, | |
| "reward": -0.23862498998641968, | |
| "reward_std": 0.11265870183706284, | |
| "rewards/correctness_reward_func": 0.0, | |
| "rewards/length_reward_func": 0.0, | |
| "rewards/soft_format_reward_func": 0.0, | |
| "rewards/strict_format_reward_func": 0.0, | |
| "rewards/xmlcount_reward_func": -0.23862498998641968, | |
| "step": 10 | |
| }, | |
| { | |
| "completion_length": 292.75, | |
| "epoch": 0.07857142857142857, | |
| "grad_norm": 0.19347621500492096, | |
| "kl": 0.011033562943339348, | |
| "learning_rate": 3.928571428571429e-05, | |
| "loss": 0.0004, | |
| "reward": -0.4468750059604645, | |
| "reward_std": 0.16547630727291107, | |
| "rewards/correctness_reward_func": 0.0, | |
| "rewards/length_reward_func": 0.0, | |
| "rewards/soft_format_reward_func": 0.0, | |
| "rewards/strict_format_reward_func": 0.0, | |
| "rewards/xmlcount_reward_func": -0.4468749761581421, | |
| "step": 11 | |
| }, | |
| { | |
| "completion_length": 110.625, | |
| "epoch": 0.08571428571428572, | |
| "grad_norm": 0.44592323899269104, | |
| "kl": 0.060783885419368744, | |
| "learning_rate": 4.2857142857142856e-05, | |
| "loss": 0.0024, | |
| "reward": 1.424625039100647, | |
| "reward_std": 1.0278345346450806, | |
| "rewards/correctness_reward_func": 1.25, | |
| "rewards/length_reward_func": 0.0, | |
| "rewards/soft_format_reward_func": 0.0, | |
| "rewards/strict_format_reward_func": 0.0, | |
| "rewards/xmlcount_reward_func": 0.17462500929832458, | |
| "step": 12 | |
| }, | |
| { | |
| "completion_length": 678.75, | |
| "epoch": 0.09285714285714286, | |
| "grad_norm": 0.1104876920580864, | |
| "kl": 0.013322519138455391, | |
| "learning_rate": 4.642857142857143e-05, | |
| "loss": 0.0005, | |
| "reward": 0.468500018119812, | |
| "reward_std": 0.9677569270133972, | |
| "rewards/correctness_reward_func": 1.0, | |
| "rewards/length_reward_func": 0.125, | |
| "rewards/soft_format_reward_func": 0.0, | |
| "rewards/strict_format_reward_func": 0.0, | |
| "rewards/xmlcount_reward_func": -0.656499981880188, | |
| "step": 13 | |
| }, | |
| { | |
| "completion_length": 495.25, | |
| "epoch": 0.1, | |
| "grad_norm": 0.13062289357185364, | |
| "kl": 0.021864598616957664, | |
| "learning_rate": 5e-05, | |
| "loss": 0.0009, | |
| "reward": -0.5831249952316284, | |
| "reward_std": 0.676399827003479, | |
| "rewards/correctness_reward_func": 0.0, | |
| "rewards/length_reward_func": 0.0, | |
| "rewards/soft_format_reward_func": 0.0, | |
| "rewards/strict_format_reward_func": 0.0, | |
| "rewards/xmlcount_reward_func": -0.5831249952316284, | |
| "step": 14 | |
| }, | |
| { | |
| "completion_length": 781.375, | |
| "epoch": 0.10714285714285714, | |
| "grad_norm": 0.09523279964923859, | |
| "kl": 0.013723693788051605, | |
| "learning_rate": 4.999222955002041e-05, | |
| "loss": 0.0005, | |
| "reward": -0.13512498140335083, | |
| "reward_std": 0.9264968633651733, | |
| "rewards/correctness_reward_func": 0.0, | |
| "rewards/length_reward_func": 0.0, | |
| "rewards/soft_format_reward_func": 0.0, | |
| "rewards/strict_format_reward_func": 0.0, | |
| "rewards/xmlcount_reward_func": -0.13512498140335083, | |
| "step": 15 | |
| }, | |
| { | |
| "completion_length": 405.5, | |
| "epoch": 0.11428571428571428, | |
| "grad_norm": 0.15296146273612976, | |
| "kl": 0.03523610904812813, | |
| "learning_rate": 4.996892303047306e-05, | |
| "loss": 0.0014, | |
| "reward": 2.193000078201294, | |
| "reward_std": 0.43767958879470825, | |
| "rewards/correctness_reward_func": 2.0, | |
| "rewards/length_reward_func": 0.0, | |
| "rewards/soft_format_reward_func": 0.0, | |
| "rewards/strict_format_reward_func": 0.0, | |
| "rewards/xmlcount_reward_func": 0.19300000369548798, | |
| "step": 16 | |
| }, | |
| { | |
| "completion_length": 214.75, | |
| "epoch": 0.12142857142857143, | |
| "grad_norm": 0.2985721230506897, | |
| "kl": 0.0879557877779007, | |
| "learning_rate": 4.9930094929529506e-05, | |
| "loss": 0.0035, | |
| "reward": -0.011874988675117493, | |
| "reward_std": 0.6198051571846008, | |
| "rewards/correctness_reward_func": 0.0, | |
| "rewards/length_reward_func": 0.0, | |
| "rewards/soft_format_reward_func": 0.0, | |
| "rewards/strict_format_reward_func": 0.0, | |
| "rewards/xmlcount_reward_func": -0.011875003576278687, | |
| "step": 17 | |
| }, | |
| { | |
| "completion_length": 121.25, | |
| "epoch": 0.12857142857142856, | |
| "grad_norm": 0.417566180229187, | |
| "kl": 0.22521352767944336, | |
| "learning_rate": 4.987576938413504e-05, | |
| "loss": 0.009, | |
| "reward": 2.4171249866485596, | |
| "reward_std": 0.1779337227344513, | |
| "rewards/correctness_reward_func": 2.0, | |
| "rewards/length_reward_func": 0.0, | |
| "rewards/soft_format_reward_func": 0.0, | |
| "rewards/strict_format_reward_func": 0.0, | |
| "rewards/xmlcount_reward_func": 0.41712498664855957, | |
| "step": 18 | |
| }, | |
| { | |
| "completion_length": 179.0, | |
| "epoch": 0.1357142857142857, | |
| "grad_norm": 0.285080224275589, | |
| "kl": 0.10962098836898804, | |
| "learning_rate": 4.9805980165004304e-05, | |
| "loss": 0.0044, | |
| "reward": 1.75, | |
| "reward_std": 1.0350983142852783, | |
| "rewards/correctness_reward_func": 1.25, | |
| "rewards/length_reward_func": 0.0, | |
| "rewards/soft_format_reward_func": 0.0, | |
| "rewards/strict_format_reward_func": 0.0, | |
| "rewards/xmlcount_reward_func": 0.5, | |
| "step": 19 | |
| }, | |
| { | |
| "completion_length": 282.875, | |
| "epoch": 0.14285714285714285, | |
| "grad_norm": 0.27518561482429504, | |
| "kl": 0.06473983079195023, | |
| "learning_rate": 4.972077065562821e-05, | |
| "loss": 0.0026, | |
| "reward": 1.2120000123977661, | |
| "reward_std": 1.0715053081512451, | |
| "rewards/correctness_reward_func": 0.75, | |
| "rewards/length_reward_func": 0.0, | |
| "rewards/soft_format_reward_func": 0.0, | |
| "rewards/strict_format_reward_func": 0.0, | |
| "rewards/xmlcount_reward_func": 0.4620000123977661, | |
| "step": 20 | |
| }, | |
| { | |
| "completion_length": 146.0, | |
| "epoch": 0.15, | |
| "grad_norm": 0.01954607106745243, | |
| "kl": 0.1001419946551323, | |
| "learning_rate": 4.962019382530521e-05, | |
| "loss": 0.004, | |
| "reward": 2.5, | |
| "reward_std": 0.0, | |
| "rewards/correctness_reward_func": 2.0, | |
| "rewards/length_reward_func": 0.0, | |
| "rewards/soft_format_reward_func": 0.0, | |
| "rewards/strict_format_reward_func": 0.0, | |
| "rewards/xmlcount_reward_func": 0.5, | |
| "step": 21 | |
| }, | |
| { | |
| "completion_length": 233.125, | |
| "epoch": 0.15714285714285714, | |
| "grad_norm": 0.019903944805264473, | |
| "kl": 0.0685255229473114, | |
| "learning_rate": 4.9504312196213596e-05, | |
| "loss": 0.0027, | |
| "reward": 2.5, | |
| "reward_std": 0.0, | |
| "rewards/correctness_reward_func": 2.0, | |
| "rewards/length_reward_func": 0.0, | |
| "rewards/soft_format_reward_func": 0.0, | |
| "rewards/strict_format_reward_func": 0.0, | |
| "rewards/xmlcount_reward_func": 0.5, | |
| "step": 22 | |
| }, | |
| { | |
| "completion_length": 199.125, | |
| "epoch": 0.16428571428571428, | |
| "grad_norm": 0.022628186270594597, | |
| "kl": 0.08269491791725159, | |
| "learning_rate": 4.937319780454559e-05, | |
| "loss": 0.0033, | |
| "reward": 2.5, | |
| "reward_std": 0.0, | |
| "rewards/correctness_reward_func": 2.0, | |
| "rewards/length_reward_func": 0.0, | |
| "rewards/soft_format_reward_func": 0.0, | |
| "rewards/strict_format_reward_func": 0.0, | |
| "rewards/xmlcount_reward_func": 0.5, | |
| "step": 23 | |
| }, | |
| { | |
| "completion_length": 388.25, | |
| "epoch": 0.17142857142857143, | |
| "grad_norm": 0.12237099558115005, | |
| "kl": 0.031046129763126373, | |
| "learning_rate": 4.922693215572695e-05, | |
| "loss": 0.0012, | |
| "reward": 1.5, | |
| "reward_std": 1.0690449476242065, | |
| "rewards/correctness_reward_func": 1.0, | |
| "rewards/length_reward_func": 0.0, | |
| "rewards/soft_format_reward_func": 0.0, | |
| "rewards/strict_format_reward_func": 0.0, | |
| "rewards/xmlcount_reward_func": 0.5, | |
| "step": 24 | |
| }, | |
| { | |
| "completion_length": 262.875, | |
| "epoch": 0.17857142857142858, | |
| "grad_norm": 0.026575006544589996, | |
| "kl": 0.08844916522502899, | |
| "learning_rate": 4.90656061737503e-05, | |
| "loss": 0.0035, | |
| "reward": 2.5, | |
| "reward_std": 0.0, | |
| "rewards/correctness_reward_func": 2.0, | |
| "rewards/length_reward_func": 0.0, | |
| "rewards/soft_format_reward_func": 0.0, | |
| "rewards/strict_format_reward_func": 0.0, | |
| "rewards/xmlcount_reward_func": 0.5, | |
| "step": 25 | |
| }, | |
| { | |
| "completion_length": 290.125, | |
| "epoch": 0.18571428571428572, | |
| "grad_norm": 0.5788246989250183, | |
| "kl": 0.06681957095861435, | |
| "learning_rate": 4.888932014465352e-05, | |
| "loss": 0.0027, | |
| "reward": 0.375, | |
| "reward_std": 0.2314550280570984, | |
| "rewards/correctness_reward_func": 0.0, | |
| "rewards/length_reward_func": 0.0, | |
| "rewards/soft_format_reward_func": 0.0, | |
| "rewards/strict_format_reward_func": 0.0, | |
| "rewards/xmlcount_reward_func": 0.375, | |
| "step": 26 | |
| }, | |
| { | |
| "completion_length": 224.375, | |
| "epoch": 0.19285714285714287, | |
| "grad_norm": 0.4455007314682007, | |
| "kl": 0.2504517138004303, | |
| "learning_rate": 4.86981836541783e-05, | |
| "loss": 0.01, | |
| "reward": 1.625, | |
| "reward_std": 1.2174328565597534, | |
| "rewards/correctness_reward_func": 1.25, | |
| "rewards/length_reward_func": 0.0, | |
| "rewards/soft_format_reward_func": 0.0, | |
| "rewards/strict_format_reward_func": 0.0, | |
| "rewards/xmlcount_reward_func": 0.375, | |
| "step": 27 | |
| }, | |
| { | |
| "completion_length": 232.125, | |
| "epoch": 0.2, | |
| "grad_norm": 0.23774927854537964, | |
| "kl": 0.1069759875535965, | |
| "learning_rate": 4.849231551964771e-05, | |
| "loss": 0.0043, | |
| "reward": 1.921875, | |
| "reward_std": 0.8937718272209167, | |
| "rewards/correctness_reward_func": 1.5, | |
| "rewards/length_reward_func": 0.0, | |
| "rewards/soft_format_reward_func": 0.0, | |
| "rewards/strict_format_reward_func": 0.0, | |
| "rewards/xmlcount_reward_func": 0.421875, | |
| "step": 28 | |
| }, | |
| { | |
| "completion_length": 458.625, | |
| "epoch": 0.20714285714285716, | |
| "grad_norm": 0.09729224443435669, | |
| "kl": 0.03021763078868389, | |
| "learning_rate": 4.827184371610511e-05, | |
| "loss": 0.0012, | |
| "reward": 1.75, | |
| "reward_std": 1.0350983142852783, | |
| "rewards/correctness_reward_func": 1.25, | |
| "rewards/length_reward_func": 0.0, | |
| "rewards/soft_format_reward_func": 0.0, | |
| "rewards/strict_format_reward_func": 0.0, | |
| "rewards/xmlcount_reward_func": 0.5, | |
| "step": 29 | |
| }, | |
| { | |
| "completion_length": 422.375, | |
| "epoch": 0.21428571428571427, | |
| "grad_norm": 0.12195795774459839, | |
| "kl": 0.04913492873311043, | |
| "learning_rate": 4.803690529676019e-05, | |
| "loss": 0.002, | |
| "reward": 2.25, | |
| "reward_std": 0.5345224738121033, | |
| "rewards/correctness_reward_func": 1.75, | |
| "rewards/length_reward_func": 0.0625, | |
| "rewards/soft_format_reward_func": 0.0, | |
| "rewards/strict_format_reward_func": 0.0, | |
| "rewards/xmlcount_reward_func": 0.4375, | |
| "step": 30 | |
| }, | |
| { | |
| "completion_length": 270.875, | |
| "epoch": 0.22142857142857142, | |
| "grad_norm": 0.21031659841537476, | |
| "kl": 0.0698024183511734, | |
| "learning_rate": 4.778764630779183e-05, | |
| "loss": 0.0028, | |
| "reward": 2.21875, | |
| "reward_std": 0.6999680995941162, | |
| "rewards/correctness_reward_func": 1.75, | |
| "rewards/length_reward_func": 0.0, | |
| "rewards/soft_format_reward_func": 0.0, | |
| "rewards/strict_format_reward_func": 0.0, | |
| "rewards/xmlcount_reward_func": 0.46875, | |
| "step": 31 | |
| }, | |
| { | |
| "completion_length": 377.5, | |
| "epoch": 0.22857142857142856, | |
| "grad_norm": 0.010702410712838173, | |
| "kl": 0.037429843097925186, | |
| "learning_rate": 4.752422169756048e-05, | |
| "loss": 0.0015, | |
| "reward": 2.5, | |
| "reward_std": 0.0, | |
| "rewards/correctness_reward_func": 2.0, | |
| "rewards/length_reward_func": 0.0, | |
| "rewards/soft_format_reward_func": 0.0, | |
| "rewards/strict_format_reward_func": 0.0, | |
| "rewards/xmlcount_reward_func": 0.5, | |
| "step": 32 | |
| }, | |
| { | |
| "completion_length": 360.125, | |
| "epoch": 0.2357142857142857, | |
| "grad_norm": 0.14157415926456451, | |
| "kl": 0.08244010806083679, | |
| "learning_rate": 4.724679522028672e-05, | |
| "loss": 0.0033, | |
| "reward": 1.6486248970031738, | |
| "reward_std": 1.1074291467666626, | |
| "rewards/correctness_reward_func": 1.25, | |
| "rewards/length_reward_func": 0.0, | |
| "rewards/soft_format_reward_func": 0.0, | |
| "rewards/strict_format_reward_func": 0.0, | |
| "rewards/xmlcount_reward_func": 0.3986250162124634, | |
| "step": 33 | |
| }, | |
| { | |
| "completion_length": 116.125, | |
| "epoch": 0.24285714285714285, | |
| "grad_norm": 1.3088772296905518, | |
| "kl": 0.7027589082717896, | |
| "learning_rate": 4.6955539334255716e-05, | |
| "loss": 0.0281, | |
| "reward": 1.75, | |
| "reward_std": 1.0350983142852783, | |
| "rewards/correctness_reward_func": 1.25, | |
| "rewards/length_reward_func": 0.0, | |
| "rewards/soft_format_reward_func": 0.0, | |
| "rewards/strict_format_reward_func": 0.0, | |
| "rewards/xmlcount_reward_func": 0.5, | |
| "step": 34 | |
| }, | |
| { | |
| "completion_length": 311.0, | |
| "epoch": 0.25, | |
| "grad_norm": 0.7312092781066895, | |
| "kl": 0.07371841371059418, | |
| "learning_rate": 4.665063509461097e-05, | |
| "loss": 0.0029, | |
| "reward": 0.6698750257492065, | |
| "reward_std": 0.7472349405288696, | |
| "rewards/correctness_reward_func": 0.25, | |
| "rewards/length_reward_func": 0.0, | |
| "rewards/soft_format_reward_func": 0.0, | |
| "rewards/strict_format_reward_func": 0.0, | |
| "rewards/xmlcount_reward_func": 0.41987499594688416, | |
| "step": 35 | |
| }, | |
| { | |
| "completion_length": 329.625, | |
| "epoch": 0.2571428571428571, | |
| "grad_norm": 0.006386851891875267, | |
| "kl": 0.039320699870586395, | |
| "learning_rate": 4.6332272040803895e-05, | |
| "loss": 0.0016, | |
| "reward": 2.5, | |
| "reward_std": 0.0, | |
| "rewards/correctness_reward_func": 2.0, | |
| "rewards/length_reward_func": 0.0, | |
| "rewards/soft_format_reward_func": 0.0, | |
| "rewards/strict_format_reward_func": 0.0, | |
| "rewards/xmlcount_reward_func": 0.5, | |
| "step": 36 | |
| }, | |
| { | |
| "completion_length": 365.875, | |
| "epoch": 0.2642857142857143, | |
| "grad_norm": 0.20290029048919678, | |
| "kl": 0.05483713746070862, | |
| "learning_rate": 4.600064807876929e-05, | |
| "loss": 0.0022, | |
| "reward": 1.6407499313354492, | |
| "reward_std": 1.0948845148086548, | |
| "rewards/correctness_reward_func": 1.25, | |
| "rewards/length_reward_func": 0.0, | |
| "rewards/soft_format_reward_func": 0.0, | |
| "rewards/strict_format_reward_func": 0.0, | |
| "rewards/xmlcount_reward_func": 0.390749990940094, | |
| "step": 37 | |
| }, | |
| { | |
| "completion_length": 287.0, | |
| "epoch": 0.2714285714285714, | |
| "grad_norm": 0.19799265265464783, | |
| "kl": 0.13818374276161194, | |
| "learning_rate": 4.5655969357899874e-05, | |
| "loss": 0.0055, | |
| "reward": 1.875999927520752, | |
| "reward_std": 0.9384979605674744, | |
| "rewards/correctness_reward_func": 1.5, | |
| "rewards/length_reward_func": 0.0, | |
| "rewards/soft_format_reward_func": 0.0, | |
| "rewards/strict_format_reward_func": 0.0, | |
| "rewards/xmlcount_reward_func": 0.37599998712539673, | |
| "step": 38 | |
| }, | |
| { | |
| "completion_length": 289.875, | |
| "epoch": 0.2785714285714286, | |
| "grad_norm": 0.37082406878471375, | |
| "kl": 0.14701411128044128, | |
| "learning_rate": 4.529845014289642e-05, | |
| "loss": 0.0059, | |
| "reward": 2.0653750896453857, | |
| "reward_std": 0.6956271529197693, | |
| "rewards/correctness_reward_func": 1.75, | |
| "rewards/length_reward_func": 0.0, | |
| "rewards/soft_format_reward_func": 0.0, | |
| "rewards/strict_format_reward_func": 0.0, | |
| "rewards/xmlcount_reward_func": 0.3153750002384186, | |
| "step": 39 | |
| }, | |
| { | |
| "completion_length": 725.125, | |
| "epoch": 0.2857142857142857, | |
| "grad_norm": 0.11310902237892151, | |
| "kl": 0.05320233479142189, | |
| "learning_rate": 4.4928312680573064e-05, | |
| "loss": 0.0021, | |
| "reward": 0.5361250042915344, | |
| "reward_std": 0.2760908007621765, | |
| "rewards/correctness_reward_func": 0.0, | |
| "rewards/length_reward_func": 0.1875, | |
| "rewards/soft_format_reward_func": 0.0, | |
| "rewards/strict_format_reward_func": 0.0, | |
| "rewards/xmlcount_reward_func": 0.3486250042915344, | |
| "step": 40 | |
| }, | |
| { | |
| "completion_length": 373.625, | |
| "epoch": 0.29285714285714287, | |
| "grad_norm": 0.005955233704298735, | |
| "kl": 0.09833689779043198, | |
| "learning_rate": 4.454578706170075e-05, | |
| "loss": 0.0039, | |
| "reward": 0.36500000953674316, | |
| "reward_std": 0.0, | |
| "rewards/correctness_reward_func": 0.0, | |
| "rewards/length_reward_func": 0.0, | |
| "rewards/soft_format_reward_func": 0.0, | |
| "rewards/strict_format_reward_func": 0.0, | |
| "rewards/xmlcount_reward_func": 0.36500000953674316, | |
| "step": 41 | |
| }, | |
| { | |
| "completion_length": 337.75, | |
| "epoch": 0.3, | |
| "grad_norm": 0.18655544519424438, | |
| "kl": 0.13665920495986938, | |
| "learning_rate": 4.415111107797445e-05, | |
| "loss": 0.0055, | |
| "reward": 1.3464999198913574, | |
| "reward_std": 1.0498690605163574, | |
| "rewards/correctness_reward_func": 1.0, | |
| "rewards/length_reward_func": 0.0, | |
| "rewards/soft_format_reward_func": 0.0, | |
| "rewards/strict_format_reward_func": 0.0, | |
| "rewards/xmlcount_reward_func": 0.3465000092983246, | |
| "step": 42 | |
| }, | |
| { | |
| "completion_length": 463.75, | |
| "epoch": 0.30714285714285716, | |
| "grad_norm": 0.18999366462230682, | |
| "kl": 0.1055900901556015, | |
| "learning_rate": 4.374453007419336e-05, | |
| "loss": 0.0042, | |
| "reward": 0.3206250071525574, | |
| "reward_std": 0.08869842439889908, | |
| "rewards/correctness_reward_func": 0.0, | |
| "rewards/length_reward_func": 0.0, | |
| "rewards/soft_format_reward_func": 0.0, | |
| "rewards/strict_format_reward_func": 0.0, | |
| "rewards/xmlcount_reward_func": 0.3206250071525574, | |
| "step": 43 | |
| }, | |
| { | |
| "completion_length": 350.25, | |
| "epoch": 0.3142857142857143, | |
| "grad_norm": 0.16436553001403809, | |
| "kl": 0.12820430099964142, | |
| "learning_rate": 4.332629679574566e-05, | |
| "loss": 0.0051, | |
| "reward": 2.3486249446868896, | |
| "reward_std": 0.04361177235841751, | |
| "rewards/correctness_reward_func": 2.0, | |
| "rewards/length_reward_func": 0.0, | |
| "rewards/soft_format_reward_func": 0.0, | |
| "rewards/strict_format_reward_func": 0.0, | |
| "rewards/xmlcount_reward_func": 0.3486250042915344, | |
| "step": 44 | |
| }, | |
| { | |
| "completion_length": 396.625, | |
| "epoch": 0.32142857142857145, | |
| "grad_norm": 0.12755045294761658, | |
| "kl": 0.10351184010505676, | |
| "learning_rate": 4.2896671231492966e-05, | |
| "loss": 0.0041, | |
| "reward": 2.1152501106262207, | |
| "reward_std": 0.707207977771759, | |
| "rewards/correctness_reward_func": 1.75, | |
| "rewards/length_reward_func": 0.0, | |
| "rewards/soft_format_reward_func": 0.0, | |
| "rewards/strict_format_reward_func": 0.0, | |
| "rewards/xmlcount_reward_func": 0.36524999141693115, | |
| "step": 45 | |
| }, | |
| { | |
| "completion_length": 323.0, | |
| "epoch": 0.32857142857142857, | |
| "grad_norm": 0.1433708518743515, | |
| "kl": 0.12643718719482422, | |
| "learning_rate": 4.245592045215182e-05, | |
| "loss": 0.0051, | |
| "reward": 2.085624933242798, | |
| "reward_std": 0.6997721791267395, | |
| "rewards/correctness_reward_func": 1.75, | |
| "rewards/length_reward_func": 0.0, | |
| "rewards/soft_format_reward_func": 0.0, | |
| "rewards/strict_format_reward_func": 0.0, | |
| "rewards/xmlcount_reward_func": 0.3356249928474426, | |
| "step": 46 | |
| }, | |
| { | |
| "completion_length": 465.875, | |
| "epoch": 0.3357142857142857, | |
| "grad_norm": 0.14527598023414612, | |
| "kl": 0.09910832345485687, | |
| "learning_rate": 4.2004318444272985e-05, | |
| "loss": 0.004, | |
| "reward": 1.0987499952316284, | |
| "reward_std": 1.0445955991744995, | |
| "rewards/correctness_reward_func": 0.75, | |
| "rewards/length_reward_func": 0.0, | |
| "rewards/soft_format_reward_func": 0.0, | |
| "rewards/strict_format_reward_func": 0.0, | |
| "rewards/xmlcount_reward_func": 0.3487499952316284, | |
| "step": 47 | |
| }, | |
| { | |
| "completion_length": 443.875, | |
| "epoch": 0.34285714285714286, | |
| "grad_norm": 0.11678742617368698, | |
| "kl": 0.09625618904829025, | |
| "learning_rate": 4.154214593992149e-05, | |
| "loss": 0.0039, | |
| "reward": 2.1332499980926514, | |
| "reward_std": 0.716006875038147, | |
| "rewards/correctness_reward_func": 1.75, | |
| "rewards/length_reward_func": 0.0, | |
| "rewards/soft_format_reward_func": 0.0, | |
| "rewards/strict_format_reward_func": 0.0, | |
| "rewards/xmlcount_reward_func": 0.38324999809265137, | |
| "step": 48 | |
| }, | |
| { | |
| "completion_length": 298.875, | |
| "epoch": 0.35, | |
| "grad_norm": 0.2462267130613327, | |
| "kl": 0.14918765425682068, | |
| "learning_rate": 4.1069690242163484e-05, | |
| "loss": 0.006, | |
| "reward": 1.3640000820159912, | |
| "reward_std": 1.0677127838134766, | |
| "rewards/correctness_reward_func": 1.0, | |
| "rewards/length_reward_func": 0.0, | |
| "rewards/soft_format_reward_func": 0.0, | |
| "rewards/strict_format_reward_func": 0.0, | |
| "rewards/xmlcount_reward_func": 0.36400002241134644, | |
| "step": 49 | |
| }, | |
| { | |
| "completion_length": 604.125, | |
| "epoch": 0.35714285714285715, | |
| "grad_norm": 0.11957182735204697, | |
| "kl": 0.06581288576126099, | |
| "learning_rate": 4.058724504646834e-05, | |
| "loss": 0.0026, | |
| "reward": 0.42887499928474426, | |
| "reward_std": 0.1762550324201584, | |
| "rewards/correctness_reward_func": 0.0, | |
| "rewards/length_reward_func": 0.0625, | |
| "rewards/soft_format_reward_func": 0.0, | |
| "rewards/strict_format_reward_func": 0.0, | |
| "rewards/xmlcount_reward_func": 0.36637499928474426, | |
| "step": 50 | |
| }, | |
| { | |
| "completion_length": 630.75, | |
| "epoch": 0.36428571428571427, | |
| "grad_norm": 0.20287089049816132, | |
| "kl": 0.15163259208202362, | |
| "learning_rate": 4.009511025813694e-05, | |
| "loss": 0.0061, | |
| "reward": 0.3474999964237213, | |
| "reward_std": 0.10380475223064423, | |
| "rewards/correctness_reward_func": 0.0, | |
| "rewards/length_reward_func": 0.0, | |
| "rewards/soft_format_reward_func": 0.0, | |
| "rewards/strict_format_reward_func": 0.0, | |
| "rewards/xmlcount_reward_func": 0.3475000262260437, | |
| "step": 51 | |
| }, | |
| { | |
| "completion_length": 409.875, | |
| "epoch": 0.37142857142857144, | |
| "grad_norm": 0.15633811056613922, | |
| "kl": 0.12077239155769348, | |
| "learning_rate": 3.959359180586975e-05, | |
| "loss": 0.0048, | |
| "reward": 1.6152499914169312, | |
| "reward_std": 1.0375232696533203, | |
| "rewards/correctness_reward_func": 1.25, | |
| "rewards/length_reward_func": 0.0, | |
| "rewards/soft_format_reward_func": 0.0, | |
| "rewards/strict_format_reward_func": 0.0, | |
| "rewards/xmlcount_reward_func": 0.36524999141693115, | |
| "step": 52 | |
| }, | |
| { | |
| "completion_length": 473.875, | |
| "epoch": 0.37857142857142856, | |
| "grad_norm": 0.1574457734823227, | |
| "kl": 0.08710946887731552, | |
| "learning_rate": 3.908300145159055e-05, | |
| "loss": 0.0035, | |
| "reward": 0.8807500004768372, | |
| "reward_std": 0.9173192381858826, | |
| "rewards/correctness_reward_func": 0.5, | |
| "rewards/length_reward_func": 0.0, | |
| "rewards/soft_format_reward_func": 0.0, | |
| "rewards/strict_format_reward_func": 0.0, | |
| "rewards/xmlcount_reward_func": 0.38075000047683716, | |
| "step": 53 | |
| }, | |
| { | |
| "completion_length": 321.25, | |
| "epoch": 0.38571428571428573, | |
| "grad_norm": 0.19621425867080688, | |
| "kl": 0.17770174145698547, | |
| "learning_rate": 3.856365659664399e-05, | |
| "loss": 0.0071, | |
| "reward": 2.1021249294281006, | |
| "reward_std": 0.7031054496765137, | |
| "rewards/correctness_reward_func": 1.75, | |
| "rewards/length_reward_func": 0.0, | |
| "rewards/soft_format_reward_func": 0.0, | |
| "rewards/strict_format_reward_func": 0.0, | |
| "rewards/xmlcount_reward_func": 0.35212498903274536, | |
| "step": 54 | |
| }, | |
| { | |
| "completion_length": 273.75, | |
| "epoch": 0.39285714285714285, | |
| "grad_norm": 0.19145628809928894, | |
| "kl": 0.16191110014915466, | |
| "learning_rate": 3.803588008448745e-05, | |
| "loss": 0.0065, | |
| "reward": 0.6318750381469727, | |
| "reward_std": 0.7018797993659973, | |
| "rewards/correctness_reward_func": 0.25, | |
| "rewards/length_reward_func": 0.0, | |
| "rewards/soft_format_reward_func": 0.0, | |
| "rewards/strict_format_reward_func": 0.0, | |
| "rewards/xmlcount_reward_func": 0.38187500834465027, | |
| "step": 55 | |
| }, | |
| { | |
| "completion_length": 281.25, | |
| "epoch": 0.4, | |
| "grad_norm": 0.30190473794937134, | |
| "kl": 0.1685158759355545, | |
| "learning_rate": 3.7500000000000003e-05, | |
| "loss": 0.0067, | |
| "reward": 0.8665000200271606, | |
| "reward_std": 0.9249003529548645, | |
| "rewards/correctness_reward_func": 0.5, | |
| "rewards/length_reward_func": 0.0, | |
| "rewards/soft_format_reward_func": 0.0, | |
| "rewards/strict_format_reward_func": 0.0, | |
| "rewards/xmlcount_reward_func": 0.36650002002716064, | |
| "step": 56 | |
| }, | |
| { | |
| "completion_length": 340.75, | |
| "epoch": 0.40714285714285714, | |
| "grad_norm": 0.15627902746200562, | |
| "kl": 0.12346489727497101, | |
| "learning_rate": 3.695634946553296e-05, | |
| "loss": 0.0049, | |
| "reward": 0.32987499237060547, | |
| "reward_std": 0.06595548242330551, | |
| "rewards/correctness_reward_func": 0.0, | |
| "rewards/length_reward_func": 0.0, | |
| "rewards/soft_format_reward_func": 0.0, | |
| "rewards/strict_format_reward_func": 0.0, | |
| "rewards/xmlcount_reward_func": 0.32987502217292786, | |
| "step": 57 | |
| }, | |
| { | |
| "completion_length": 993.25, | |
| "epoch": 0.4142857142857143, | |
| "grad_norm": 0.0931173712015152, | |
| "kl": 0.018140610307455063, | |
| "learning_rate": 3.6405266433829075e-05, | |
| "loss": 0.0007, | |
| "reward": 0.4088750183582306, | |
| "reward_std": 0.2790803611278534, | |
| "rewards/correctness_reward_func": 0.0, | |
| "rewards/length_reward_func": 0.25, | |
| "rewards/soft_format_reward_func": 0.0, | |
| "rewards/strict_format_reward_func": 0.0, | |
| "rewards/xmlcount_reward_func": 0.1588750034570694, | |
| "step": 58 | |
| }, | |
| { | |
| "completion_length": 474.875, | |
| "epoch": 0.42142857142857143, | |
| "grad_norm": 0.10973533242940903, | |
| "kl": 0.09137643128633499, | |
| "learning_rate": 3.5847093477938956e-05, | |
| "loss": 0.0037, | |
| "reward": 0.42787498235702515, | |
| "reward_std": 0.17703060805797577, | |
| "rewards/correctness_reward_func": 0.0, | |
| "rewards/length_reward_func": 0.0625, | |
| "rewards/soft_format_reward_func": 0.0, | |
| "rewards/strict_format_reward_func": 0.0, | |
| "rewards/xmlcount_reward_func": 0.36537498235702515, | |
| "step": 59 | |
| }, | |
| { | |
| "completion_length": 326.0, | |
| "epoch": 0.42857142857142855, | |
| "grad_norm": 0.19174934923648834, | |
| "kl": 0.13062816858291626, | |
| "learning_rate": 3.5282177578265296e-05, | |
| "loss": 0.0052, | |
| "reward": 0.6150000095367432, | |
| "reward_std": 0.7071067690849304, | |
| "rewards/correctness_reward_func": 0.25, | |
| "rewards/length_reward_func": 0.0, | |
| "rewards/soft_format_reward_func": 0.0, | |
| "rewards/strict_format_reward_func": 0.0, | |
| "rewards/xmlcount_reward_func": 0.36500000953674316, | |
| "step": 60 | |
| }, | |
| { | |
| "completion_length": 180.125, | |
| "epoch": 0.4357142857142857, | |
| "grad_norm": 0.48899680376052856, | |
| "kl": 0.4511120319366455, | |
| "learning_rate": 3.471086990686737e-05, | |
| "loss": 0.018, | |
| "reward": 2.324000120162964, | |
| "reward_std": 0.11677447706460953, | |
| "rewards/correctness_reward_func": 2.0, | |
| "rewards/length_reward_func": 0.0, | |
| "rewards/soft_format_reward_func": 0.0, | |
| "rewards/strict_format_reward_func": 0.0, | |
| "rewards/xmlcount_reward_func": 0.3240000009536743, | |
| "step": 61 | |
| }, | |
| { | |
| "completion_length": 442.0, | |
| "epoch": 0.44285714285714284, | |
| "grad_norm": 0.14630813896656036, | |
| "kl": 0.11019708961248398, | |
| "learning_rate": 3.413352560915988e-05, | |
| "loss": 0.0044, | |
| "reward": 0.17625001072883606, | |
| "reward_std": 0.5338656306266785, | |
| "rewards/correctness_reward_func": 0.0, | |
| "rewards/length_reward_func": 0.0, | |
| "rewards/soft_format_reward_func": 0.0, | |
| "rewards/strict_format_reward_func": 0.0, | |
| "rewards/xmlcount_reward_func": 0.17625001072883606, | |
| "step": 62 | |
| }, | |
| { | |
| "completion_length": 337.625, | |
| "epoch": 0.45, | |
| "grad_norm": 0.15681755542755127, | |
| "kl": 0.1342012584209442, | |
| "learning_rate": 3.355050358314172e-05, | |
| "loss": 0.0054, | |
| "reward": 1.1151249408721924, | |
| "reward_std": 1.0349948406219482, | |
| "rewards/correctness_reward_func": 0.75, | |
| "rewards/length_reward_func": 0.0, | |
| "rewards/soft_format_reward_func": 0.0, | |
| "rewards/strict_format_reward_func": 0.0, | |
| "rewards/xmlcount_reward_func": 0.36512500047683716, | |
| "step": 63 | |
| }, | |
| { | |
| "completion_length": 255.375, | |
| "epoch": 0.45714285714285713, | |
| "grad_norm": 0.15988513827323914, | |
| "kl": 0.14537671208381653, | |
| "learning_rate": 3.2962166256292113e-05, | |
| "loss": 0.0059, | |
| "reward": 2.36537504196167, | |
| "reward_std": 0.0005174623220227659, | |
| "rewards/correctness_reward_func": 2.0, | |
| "rewards/length_reward_func": 0.0, | |
| "rewards/soft_format_reward_func": 0.0, | |
| "rewards/strict_format_reward_func": 0.0, | |
| "rewards/xmlcount_reward_func": 0.36537498235702515, | |
| "step": 64 | |
| }, | |
| { | |
| "completion_length": 303.875, | |
| "epoch": 0.4642857142857143, | |
| "grad_norm": 0.6239961981773376, | |
| "kl": 0.10595186054706573, | |
| "learning_rate": 3.2368879360272606e-05, | |
| "loss": 0.0042, | |
| "reward": 2.1156249046325684, | |
| "reward_std": 0.7073594331741333, | |
| "rewards/correctness_reward_func": 1.75, | |
| "rewards/length_reward_func": 0.0, | |
| "rewards/soft_format_reward_func": 0.0, | |
| "rewards/strict_format_reward_func": 0.0, | |
| "rewards/xmlcount_reward_func": 0.3656249940395355, | |
| "step": 65 | |
| }, | |
| { | |
| "completion_length": 554.625, | |
| "epoch": 0.4714285714285714, | |
| "grad_norm": 0.0981917455792427, | |
| "kl": 0.06483708322048187, | |
| "learning_rate": 3.177101170357513e-05, | |
| "loss": 0.0026, | |
| "reward": 0.33550000190734863, | |
| "reward_std": 0.08505627512931824, | |
| "rewards/correctness_reward_func": 0.0, | |
| "rewards/length_reward_func": 0.0, | |
| "rewards/soft_format_reward_func": 0.0, | |
| "rewards/strict_format_reward_func": 0.0, | |
| "rewards/xmlcount_reward_func": 0.33550000190734863, | |
| "step": 66 | |
| }, | |
| { | |
| "completion_length": 164.0, | |
| "epoch": 0.4785714285714286, | |
| "grad_norm": 0.2880488634109497, | |
| "kl": 0.24433362483978271, | |
| "learning_rate": 3.116893494225734e-05, | |
| "loss": 0.0098, | |
| "reward": 0.41474997997283936, | |
| "reward_std": 0.3112049102783203, | |
| "rewards/correctness_reward_func": 0.0, | |
| "rewards/length_reward_func": 0.0, | |
| "rewards/soft_format_reward_func": 0.0, | |
| "rewards/strict_format_reward_func": 0.0625, | |
| "rewards/xmlcount_reward_func": 0.35224997997283936, | |
| "step": 67 | |
| }, | |
| { | |
| "completion_length": 327.375, | |
| "epoch": 0.4857142857142857, | |
| "grad_norm": 0.15951593220233917, | |
| "kl": 0.09316325187683105, | |
| "learning_rate": 3.056302334890786e-05, | |
| "loss": 0.0037, | |
| "reward": 1.013374924659729, | |
| "reward_std": 0.8874584436416626, | |
| "rewards/correctness_reward_func": 0.75, | |
| "rewards/length_reward_func": 0.0, | |
| "rewards/soft_format_reward_func": 0.0, | |
| "rewards/strict_format_reward_func": 0.0, | |
| "rewards/xmlcount_reward_func": 0.2633749842643738, | |
| "step": 68 | |
| }, | |
| { | |
| "completion_length": 183.625, | |
| "epoch": 0.4928571428571429, | |
| "grad_norm": 0.19407892227172852, | |
| "kl": 0.2086183726787567, | |
| "learning_rate": 2.9953653579984942e-05, | |
| "loss": 0.0083, | |
| "reward": 2.365499973297119, | |
| "reward_std": 0.0005344200180843472, | |
| "rewards/correctness_reward_func": 2.0, | |
| "rewards/length_reward_func": 0.0, | |
| "rewards/soft_format_reward_func": 0.0, | |
| "rewards/strict_format_reward_func": 0.0, | |
| "rewards/xmlcount_reward_func": 0.36549997329711914, | |
| "step": 69 | |
| }, | |
| { | |
| "completion_length": 260.375, | |
| "epoch": 0.5, | |
| "grad_norm": 0.2512143850326538, | |
| "kl": 0.14164677262306213, | |
| "learning_rate": 2.9341204441673266e-05, | |
| "loss": 0.0057, | |
| "reward": 2.020250082015991, | |
| "reward_std": 0.7143486142158508, | |
| "rewards/correctness_reward_func": 1.75, | |
| "rewards/length_reward_func": 0.0, | |
| "rewards/soft_format_reward_func": 0.0, | |
| "rewards/strict_format_reward_func": 0.0, | |
| "rewards/xmlcount_reward_func": 0.27024999260902405, | |
| "step": 70 | |
| }, | |
| { | |
| "completion_length": 191.125, | |
| "epoch": 0.5071428571428571, | |
| "grad_norm": 0.3029208779335022, | |
| "kl": 0.11700859665870667, | |
| "learning_rate": 2.872605665440436e-05, | |
| "loss": 0.0047, | |
| "reward": 2.012125015258789, | |
| "reward_std": 0.7928214073181152, | |
| "rewards/correctness_reward_func": 1.5, | |
| "rewards/length_reward_func": 0.0, | |
| "rewards/soft_format_reward_func": 0.0, | |
| "rewards/strict_format_reward_func": 0.0625, | |
| "rewards/xmlcount_reward_func": 0.44962501525878906, | |
| "step": 71 | |
| }, | |
| { | |
| "completion_length": 500.5, | |
| "epoch": 0.5142857142857142, | |
| "grad_norm": 0.12518590688705444, | |
| "kl": 0.02341982163488865, | |
| "learning_rate": 2.8108592616187133e-05, | |
| "loss": 0.0009, | |
| "reward": -0.19187498092651367, | |
| "reward_std": 0.7031666040420532, | |
| "rewards/correctness_reward_func": 0.0, | |
| "rewards/length_reward_func": 0.0, | |
| "rewards/soft_format_reward_func": 0.0, | |
| "rewards/strict_format_reward_func": 0.0, | |
| "rewards/xmlcount_reward_func": -0.19187499582767487, | |
| "step": 72 | |
| }, | |
| { | |
| "completion_length": 710.25, | |
| "epoch": 0.5214285714285715, | |
| "grad_norm": 0.11030412465333939, | |
| "kl": 0.01801430992782116, | |
| "learning_rate": 2.748919616489542e-05, | |
| "loss": 0.0007, | |
| "reward": -0.19962501525878906, | |
| "reward_std": 1.141786813735962, | |
| "rewards/correctness_reward_func": 0.0, | |
| "rewards/length_reward_func": 0.125, | |
| "rewards/soft_format_reward_func": 0.0, | |
| "rewards/strict_format_reward_func": 0.0, | |
| "rewards/xmlcount_reward_func": -0.3246249854564667, | |
| "step": 73 | |
| }, | |
| { | |
| "completion_length": 395.0, | |
| "epoch": 0.5285714285714286, | |
| "grad_norm": 0.08873239159584045, | |
| "kl": 0.02234470844268799, | |
| "learning_rate": 2.686825233966061e-05, | |
| "loss": 0.0009, | |
| "reward": 1.4856250286102295, | |
| "reward_std": 0.9701153039932251, | |
| "rewards/correctness_reward_func": 1.75, | |
| "rewards/length_reward_func": 0.0, | |
| "rewards/soft_format_reward_func": 0.0, | |
| "rewards/strict_format_reward_func": 0.0, | |
| "rewards/xmlcount_reward_func": -0.2643750011920929, | |
| "step": 74 | |
| }, | |
| { | |
| "completion_length": 386.125, | |
| "epoch": 0.5357142857142857, | |
| "grad_norm": 0.09125195443630219, | |
| "kl": 0.029970454052090645, | |
| "learning_rate": 2.624614714151743e-05, | |
| "loss": 0.0012, | |
| "reward": 1.2940000295639038, | |
| "reward_std": 0.5996603965759277, | |
| "rewards/correctness_reward_func": 1.75, | |
| "rewards/length_reward_func": 0.0, | |
| "rewards/soft_format_reward_func": 0.0, | |
| "rewards/strict_format_reward_func": 0.0, | |
| "rewards/xmlcount_reward_func": -0.45600003004074097, | |
| "step": 75 | |
| }, | |
| { | |
| "completion_length": 239.375, | |
| "epoch": 0.5428571428571428, | |
| "grad_norm": 0.1535108983516693, | |
| "kl": 0.05824340879917145, | |
| "learning_rate": 2.5623267293451826e-05, | |
| "loss": 0.0023, | |
| "reward": 1.394374966621399, | |
| "reward_std": 0.8498432040214539, | |
| "rewards/correctness_reward_func": 1.5, | |
| "rewards/length_reward_func": 0.0, | |
| "rewards/soft_format_reward_func": 0.0, | |
| "rewards/strict_format_reward_func": 0.0, | |
| "rewards/xmlcount_reward_func": -0.10562500357627869, | |
| "step": 76 | |
| }, | |
| { | |
| "completion_length": 351.625, | |
| "epoch": 0.55, | |
| "grad_norm": 0.15809431672096252, | |
| "kl": 0.02210851013660431, | |
| "learning_rate": 2.5e-05, | |
| "loss": 0.0009, | |
| "reward": 1.4718749523162842, | |
| "reward_std": 1.3246859312057495, | |
| "rewards/correctness_reward_func": 1.5, | |
| "rewards/length_reward_func": 0.0, | |
| "rewards/soft_format_reward_func": 0.0, | |
| "rewards/strict_format_reward_func": 0.0, | |
| "rewards/xmlcount_reward_func": -0.02812500298023224, | |
| "step": 77 | |
| }, | |
| { | |
| "completion_length": 199.375, | |
| "epoch": 0.5571428571428572, | |
| "grad_norm": 0.20733776688575745, | |
| "kl": 0.06919591128826141, | |
| "learning_rate": 2.4376732706548183e-05, | |
| "loss": 0.0028, | |
| "reward": 2.0334999561309814, | |
| "reward_std": 0.9354116320610046, | |
| "rewards/correctness_reward_func": 1.75, | |
| "rewards/length_reward_func": 0.0, | |
| "rewards/soft_format_reward_func": 0.0, | |
| "rewards/strict_format_reward_func": 0.0625, | |
| "rewards/xmlcount_reward_func": 0.22100000083446503, | |
| "step": 78 | |
| }, | |
| { | |
| "completion_length": 490.625, | |
| "epoch": 0.5642857142857143, | |
| "grad_norm": 0.3020256459712982, | |
| "kl": 0.020964641124010086, | |
| "learning_rate": 2.375385285848257e-05, | |
| "loss": 0.0008, | |
| "reward": 0.2606250047683716, | |
| "reward_std": 1.5538617372512817, | |
| "rewards/correctness_reward_func": 0.75, | |
| "rewards/length_reward_func": 0.125, | |
| "rewards/soft_format_reward_func": 0.0, | |
| "rewards/strict_format_reward_func": 0.0, | |
| "rewards/xmlcount_reward_func": -0.6143749952316284, | |
| "step": 79 | |
| }, | |
| { | |
| "completion_length": 257.875, | |
| "epoch": 0.5714285714285714, | |
| "grad_norm": 0.17333918809890747, | |
| "kl": 0.040599275380373, | |
| "learning_rate": 2.3131747660339394e-05, | |
| "loss": 0.0016, | |
| "reward": 1.8927500247955322, | |
| "reward_std": 0.6703715324401855, | |
| "rewards/correctness_reward_func": 1.75, | |
| "rewards/length_reward_func": 0.0, | |
| "rewards/soft_format_reward_func": 0.0, | |
| "rewards/strict_format_reward_func": 0.0, | |
| "rewards/xmlcount_reward_func": 0.14274999499320984, | |
| "step": 80 | |
| }, | |
| { | |
| "completion_length": 292.625, | |
| "epoch": 0.5785714285714286, | |
| "grad_norm": 0.14176879823207855, | |
| "kl": 0.03724910691380501, | |
| "learning_rate": 2.251080383510459e-05, | |
| "loss": 0.0015, | |
| "reward": 2.164875030517578, | |
| "reward_std": 0.35948193073272705, | |
| "rewards/correctness_reward_func": 2.0, | |
| "rewards/length_reward_func": 0.0, | |
| "rewards/soft_format_reward_func": 0.0, | |
| "rewards/strict_format_reward_func": 0.0, | |
| "rewards/xmlcount_reward_func": 0.16487500071525574, | |
| "step": 81 | |
| }, | |
| { | |
| "completion_length": 500.25, | |
| "epoch": 0.5857142857142857, | |
| "grad_norm": 0.1182783842086792, | |
| "kl": 0.023212479427456856, | |
| "learning_rate": 2.189140738381288e-05, | |
| "loss": 0.0009, | |
| "reward": 0.7456250190734863, | |
| "reward_std": 1.171677589416504, | |
| "rewards/correctness_reward_func": 0.5, | |
| "rewards/length_reward_func": 0.0, | |
| "rewards/soft_format_reward_func": 0.0, | |
| "rewards/strict_format_reward_func": 0.0, | |
| "rewards/xmlcount_reward_func": 0.24562500417232513, | |
| "step": 82 | |
| }, | |
| { | |
| "completion_length": 487.125, | |
| "epoch": 0.5928571428571429, | |
| "grad_norm": 0.12226809561252594, | |
| "kl": 0.019420940428972244, | |
| "learning_rate": 2.1273943345595637e-05, | |
| "loss": 0.0008, | |
| "reward": 0.3148750066757202, | |
| "reward_std": 1.1947816610336304, | |
| "rewards/correctness_reward_func": 0.25, | |
| "rewards/length_reward_func": 0.0, | |
| "rewards/soft_format_reward_func": 0.0, | |
| "rewards/strict_format_reward_func": 0.0, | |
| "rewards/xmlcount_reward_func": 0.06487500667572021, | |
| "step": 83 | |
| }, | |
| { | |
| "completion_length": 152.875, | |
| "epoch": 0.6, | |
| "grad_norm": 0.27960774302482605, | |
| "kl": 0.07769262790679932, | |
| "learning_rate": 2.0658795558326743e-05, | |
| "loss": 0.0031, | |
| "reward": 2.5625, | |
| "reward_std": 0.1767766922712326, | |
| "rewards/correctness_reward_func": 2.0, | |
| "rewards/length_reward_func": 0.0, | |
| "rewards/soft_format_reward_func": 0.0, | |
| "rewards/strict_format_reward_func": 0.0625, | |
| "rewards/xmlcount_reward_func": 0.5, | |
| "step": 84 | |
| }, | |
| { | |
| "completion_length": 235.75, | |
| "epoch": 0.6071428571428571, | |
| "grad_norm": 0.2650700807571411, | |
| "kl": 0.0834224745631218, | |
| "learning_rate": 2.0046346420015067e-05, | |
| "loss": 0.0033, | |
| "reward": 2.243499994277954, | |
| "reward_std": 0.5382997393608093, | |
| "rewards/correctness_reward_func": 1.75, | |
| "rewards/length_reward_func": 0.0, | |
| "rewards/soft_format_reward_func": 0.0, | |
| "rewards/strict_format_reward_func": 0.0625, | |
| "rewards/xmlcount_reward_func": 0.4309999942779541, | |
| "step": 85 | |
| }, | |
| { | |
| "completion_length": 324.0, | |
| "epoch": 0.6142857142857143, | |
| "grad_norm": 0.00965797994285822, | |
| "kl": 0.04011977091431618, | |
| "learning_rate": 1.9436976651092144e-05, | |
| "loss": 0.0016, | |
| "reward": 0.5, | |
| "reward_std": 0.0, | |
| "rewards/correctness_reward_func": 0.0, | |
| "rewards/length_reward_func": 0.0, | |
| "rewards/soft_format_reward_func": 0.0, | |
| "rewards/strict_format_reward_func": 0.0, | |
| "rewards/xmlcount_reward_func": 0.5, | |
| "step": 86 | |
| }, | |
| { | |
| "completion_length": 227.0, | |
| "epoch": 0.6214285714285714, | |
| "grad_norm": 0.19126032292842865, | |
| "kl": 0.12643316388130188, | |
| "learning_rate": 1.8831065057742657e-05, | |
| "loss": 0.0051, | |
| "reward": 2.5, | |
| "reward_std": 0.0, | |
| "rewards/correctness_reward_func": 2.0, | |
| "rewards/length_reward_func": 0.0, | |
| "rewards/soft_format_reward_func": 0.0, | |
| "rewards/strict_format_reward_func": 0.0, | |
| "rewards/xmlcount_reward_func": 0.5, | |
| "step": 87 | |
| }, | |
| { | |
| "completion_length": 510.75, | |
| "epoch": 0.6285714285714286, | |
| "grad_norm": 0.003844304708763957, | |
| "kl": 0.016257166862487793, | |
| "learning_rate": 1.8228988296424877e-05, | |
| "loss": 0.0007, | |
| "reward": 0.5, | |
| "reward_std": 0.0, | |
| "rewards/correctness_reward_func": 0.0, | |
| "rewards/length_reward_func": 0.0, | |
| "rewards/soft_format_reward_func": 0.0, | |
| "rewards/strict_format_reward_func": 0.0, | |
| "rewards/xmlcount_reward_func": 0.5, | |
| "step": 88 | |
| }, | |
| { | |
| "completion_length": 243.0, | |
| "epoch": 0.6357142857142857, | |
| "grad_norm": 0.008268176577985287, | |
| "kl": 0.045144014060497284, | |
| "learning_rate": 1.7631120639727393e-05, | |
| "loss": 0.0018, | |
| "reward": 2.5, | |
| "reward_std": 0.0, | |
| "rewards/correctness_reward_func": 2.0, | |
| "rewards/length_reward_func": 0.0, | |
| "rewards/soft_format_reward_func": 0.0, | |
| "rewards/strict_format_reward_func": 0.0, | |
| "rewards/xmlcount_reward_func": 0.5, | |
| "step": 89 | |
| }, | |
| { | |
| "completion_length": 235.25, | |
| "epoch": 0.6428571428571429, | |
| "grad_norm": 0.14653581380844116, | |
| "kl": 0.038456518203020096, | |
| "learning_rate": 1.7037833743707892e-05, | |
| "loss": 0.0015, | |
| "reward": 2.375, | |
| "reward_std": 0.5824823379516602, | |
| "rewards/correctness_reward_func": 1.75, | |
| "rewards/length_reward_func": 0.0, | |
| "rewards/soft_format_reward_func": 0.0, | |
| "rewards/strict_format_reward_func": 0.125, | |
| "rewards/xmlcount_reward_func": 0.5, | |
| "step": 90 | |
| }, | |
| { | |
| "completion_length": 220.125, | |
| "epoch": 0.65, | |
| "grad_norm": 0.14805400371551514, | |
| "kl": 0.07326260954141617, | |
| "learning_rate": 1.6449496416858284e-05, | |
| "loss": 0.0029, | |
| "reward": 1.3125, | |
| "reward_std": 1.1319231986999512, | |
| "rewards/correctness_reward_func": 0.75, | |
| "rewards/length_reward_func": 0.0, | |
| "rewards/soft_format_reward_func": 0.0, | |
| "rewards/strict_format_reward_func": 0.0625, | |
| "rewards/xmlcount_reward_func": 0.5, | |
| "step": 91 | |
| }, | |
| { | |
| "completion_length": 449.0, | |
| "epoch": 0.6571428571428571, | |
| "grad_norm": 0.11858955770730972, | |
| "kl": 0.027846258133649826, | |
| "learning_rate": 1.5866474390840125e-05, | |
| "loss": 0.0011, | |
| "reward": 0.75, | |
| "reward_std": 0.7071067690849304, | |
| "rewards/correctness_reward_func": 0.25, | |
| "rewards/length_reward_func": 0.0, | |
| "rewards/soft_format_reward_func": 0.0, | |
| "rewards/strict_format_reward_func": 0.0, | |
| "rewards/xmlcount_reward_func": 0.5, | |
| "step": 92 | |
| }, | |
| { | |
| "completion_length": 219.5, | |
| "epoch": 0.6642857142857143, | |
| "grad_norm": 0.21257686614990234, | |
| "kl": 0.058622974902391434, | |
| "learning_rate": 1.5289130093132632e-05, | |
| "loss": 0.0023, | |
| "reward": 1.046875, | |
| "reward_std": 0.9159731268882751, | |
| "rewards/correctness_reward_func": 0.5, | |
| "rewards/length_reward_func": 0.0, | |
| "rewards/soft_format_reward_func": 0.0, | |
| "rewards/strict_format_reward_func": 0.0625, | |
| "rewards/xmlcount_reward_func": 0.484375, | |
| "step": 93 | |
| }, | |
| { | |
| "completion_length": 191.125, | |
| "epoch": 0.6714285714285714, | |
| "grad_norm": 0.017704889178276062, | |
| "kl": 0.06602377444505692, | |
| "learning_rate": 1.4717822421734718e-05, | |
| "loss": 0.0026, | |
| "reward": 2.5, | |
| "reward_std": 0.0, | |
| "rewards/correctness_reward_func": 2.0, | |
| "rewards/length_reward_func": 0.0, | |
| "rewards/soft_format_reward_func": 0.0, | |
| "rewards/strict_format_reward_func": 0.0, | |
| "rewards/xmlcount_reward_func": 0.5, | |
| "step": 94 | |
| }, | |
| { | |
| "completion_length": 727.75, | |
| "epoch": 0.6785714285714286, | |
| "grad_norm": 0.07640424370765686, | |
| "kl": 0.015565132722258568, | |
| "learning_rate": 1.4152906522061048e-05, | |
| "loss": 0.0006, | |
| "reward": 1.875, | |
| "reward_std": 1.0264363288879395, | |
| "rewards/correctness_reward_func": 1.25, | |
| "rewards/length_reward_func": 0.125, | |
| "rewards/soft_format_reward_func": 0.0, | |
| "rewards/strict_format_reward_func": 0.0, | |
| "rewards/xmlcount_reward_func": 0.5, | |
| "step": 95 | |
| }, | |
| { | |
| "completion_length": 333.625, | |
| "epoch": 0.6857142857142857, | |
| "grad_norm": 0.011164901778101921, | |
| "kl": 0.03891594707965851, | |
| "learning_rate": 1.3594733566170926e-05, | |
| "loss": 0.0016, | |
| "reward": 2.5, | |
| "reward_std": 0.0, | |
| "rewards/correctness_reward_func": 2.0, | |
| "rewards/length_reward_func": 0.0, | |
| "rewards/soft_format_reward_func": 0.0, | |
| "rewards/strict_format_reward_func": 0.0, | |
| "rewards/xmlcount_reward_func": 0.5, | |
| "step": 96 | |
| }, | |
| { | |
| "completion_length": 795.5, | |
| "epoch": 0.6928571428571428, | |
| "grad_norm": 0.07001640647649765, | |
| "kl": 0.014953548088669777, | |
| "learning_rate": 1.3043650534467053e-05, | |
| "loss": 0.0006, | |
| "reward": 1.203125, | |
| "reward_std": 0.9863747954368591, | |
| "rewards/correctness_reward_func": 0.5, | |
| "rewards/length_reward_func": 0.25, | |
| "rewards/soft_format_reward_func": 0.0, | |
| "rewards/strict_format_reward_func": 0.0, | |
| "rewards/xmlcount_reward_func": 0.453125, | |
| "step": 97 | |
| }, | |
| { | |
| "completion_length": 342.5, | |
| "epoch": 0.7, | |
| "grad_norm": 0.015015755780041218, | |
| "kl": 0.03935668244957924, | |
| "learning_rate": 1.2500000000000006e-05, | |
| "loss": 0.0016, | |
| "reward": 2.5, | |
| "reward_std": 0.0, | |
| "rewards/correctness_reward_func": 2.0, | |
| "rewards/length_reward_func": 0.0, | |
| "rewards/soft_format_reward_func": 0.0, | |
| "rewards/strict_format_reward_func": 0.0, | |
| "rewards/xmlcount_reward_func": 0.5, | |
| "step": 98 | |
| }, | |
| { | |
| "completion_length": 550.25, | |
| "epoch": 0.7071428571428572, | |
| "grad_norm": 0.10194458067417145, | |
| "kl": 0.022631347179412842, | |
| "learning_rate": 1.196411991551255e-05, | |
| "loss": 0.0009, | |
| "reward": 1.5, | |
| "reward_std": 1.0690449476242065, | |
| "rewards/correctness_reward_func": 1.0, | |
| "rewards/length_reward_func": 0.0, | |
| "rewards/soft_format_reward_func": 0.0, | |
| "rewards/strict_format_reward_func": 0.0, | |
| "rewards/xmlcount_reward_func": 0.5, | |
| "step": 99 | |
| }, | |
| { | |
| "completion_length": 247.375, | |
| "epoch": 0.7142857142857143, | |
| "grad_norm": 0.15261337161064148, | |
| "kl": 0.08087805658578873, | |
| "learning_rate": 1.1436343403356017e-05, | |
| "loss": 0.0032, | |
| "reward": 2.0625, | |
| "reward_std": 0.979704737663269, | |
| "rewards/correctness_reward_func": 1.5, | |
| "rewards/length_reward_func": 0.0, | |
| "rewards/soft_format_reward_func": 0.0, | |
| "rewards/strict_format_reward_func": 0.0625, | |
| "rewards/xmlcount_reward_func": 0.5, | |
| "step": 100 | |
| } | |
| ], | |
| "logging_steps": 1, | |
| "max_steps": 140, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 1, | |
| "save_steps": 100, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": false | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 0.0, | |
| "train_batch_size": 8, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |