| { |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 0.0468352736518132, |
| "eval_steps": 500, |
| "global_step": 350, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "completion_length": 698.1666870117188, |
| "epoch": 0.00013381506757660912, |
| "grad_norm": 0.07569596916437149, |
| "kl": 0.0006024616304785013, |
| "learning_rate": 6.684491978609626e-09, |
| "loss": 0.001, |
| "reward": -1.8359375, |
| "reward_std": 0.5859375, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -1.8359375, |
| "step": 1 |
| }, |
| { |
| "completion_length": 549.0, |
| "epoch": 0.00026763013515321824, |
| "grad_norm": 0.10156559199094772, |
| "kl": 0.0006554799037985504, |
| "learning_rate": 1.3368983957219251e-08, |
| "loss": -0.0055, |
| "reward": -1.21875, |
| "reward_std": 0.48828125, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -1.21875, |
| "step": 2 |
| }, |
| { |
| "completion_length": 509.66668701171875, |
| "epoch": 0.0004014452027298274, |
| "grad_norm": 0.1012749969959259, |
| "kl": 0.0006122777122072875, |
| "learning_rate": 2.005347593582888e-08, |
| "loss": 0.0032, |
| "reward": -1.2578125, |
| "reward_std": 0.37890625, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -1.2578125, |
| "step": 3 |
| }, |
| { |
| "completion_length": 387.8333435058594, |
| "epoch": 0.0005352602703064365, |
| "grad_norm": 0.10009913891553879, |
| "kl": 0.0005205385386943817, |
| "learning_rate": 2.6737967914438503e-08, |
| "loss": 0.0007, |
| "reward": -0.83203125, |
| "reward_std": 0.1640625, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.83203125, |
| "step": 4 |
| }, |
| { |
| "completion_length": 386.5, |
| "epoch": 0.0006690753378830456, |
| "grad_norm": 0.11404310166835785, |
| "kl": 0.00039041676791384816, |
| "learning_rate": 3.342245989304813e-08, |
| "loss": -0.0032, |
| "reward": -0.859375, |
| "reward_std": 0.1630859375, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.859375, |
| "step": 5 |
| }, |
| { |
| "completion_length": 345.8333435058594, |
| "epoch": 0.0008028904054596548, |
| "grad_norm": 0.13447555899620056, |
| "kl": 0.0005453471094369888, |
| "learning_rate": 4.010695187165776e-08, |
| "loss": 0.0036, |
| "reward": -0.7109375, |
| "reward_std": 0.357421875, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.7109375, |
| "step": 6 |
| }, |
| { |
| "completion_length": 426.16668701171875, |
| "epoch": 0.0009367054730362638, |
| "grad_norm": 0.1324268877506256, |
| "kl": 0.000606791814789176, |
| "learning_rate": 4.679144385026738e-08, |
| "loss": 0.0017, |
| "reward": -1.09375, |
| "reward_std": 0.72265625, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -1.09375, |
| "step": 7 |
| }, |
| { |
| "completion_length": 418.66668701171875, |
| "epoch": 0.001070520540612873, |
| "grad_norm": 0.12978878617286682, |
| "kl": 0.0005688891978934407, |
| "learning_rate": 5.3475935828877005e-08, |
| "loss": 0.0005, |
| "reward": -0.89453125, |
| "reward_std": 0.392578125, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.89453125, |
| "step": 8 |
| }, |
| { |
| "completion_length": 484.5, |
| "epoch": 0.0012043356081894822, |
| "grad_norm": 0.09955421835184097, |
| "kl": 0.0005112257204018533, |
| "learning_rate": 6.016042780748664e-08, |
| "loss": 0.0067, |
| "reward": -1.25, |
| "reward_std": 0.53515625, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -1.25, |
| "step": 9 |
| }, |
| { |
| "completion_length": 230.5, |
| "epoch": 0.0013381506757660913, |
| "grad_norm": 0.19524620473384857, |
| "kl": 0.0006619760533794761, |
| "learning_rate": 6.684491978609626e-08, |
| "loss": -0.0006, |
| "reward": -0.26953125, |
| "reward_std": 0.349609375, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.26953125, |
| "step": 10 |
| }, |
| { |
| "completion_length": 369.5, |
| "epoch": 0.0014719657433427003, |
| "grad_norm": 0.1019153892993927, |
| "kl": 0.0006552126724272966, |
| "learning_rate": 7.352941176470589e-08, |
| "loss": -0.004, |
| "reward": -0.94140625, |
| "reward_std": 0.279296875, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.94140625, |
| "step": 11 |
| }, |
| { |
| "completion_length": 386.16668701171875, |
| "epoch": 0.0016057808109193096, |
| "grad_norm": 0.09696059674024582, |
| "kl": 0.0004603694542311132, |
| "learning_rate": 8.021390374331552e-08, |
| "loss": 0.002, |
| "reward": -0.8671875, |
| "reward_std": 0.42578125, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.8671875, |
| "step": 12 |
| }, |
| { |
| "completion_length": 475.16668701171875, |
| "epoch": 0.0017395958784959186, |
| "grad_norm": 0.12413895130157471, |
| "kl": 0.0004793051048181951, |
| "learning_rate": 8.689839572192514e-08, |
| "loss": 0.0, |
| "reward": -0.9375, |
| "reward_std": 0.28125, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.9375, |
| "step": 13 |
| }, |
| { |
| "completion_length": 370.0, |
| "epoch": 0.0018734109460725277, |
| "grad_norm": 0.1305382251739502, |
| "kl": 0.0005513830110430717, |
| "learning_rate": 9.358288770053476e-08, |
| "loss": -0.0018, |
| "reward": -0.78515625, |
| "reward_std": 0.263671875, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.78515625, |
| "step": 14 |
| }, |
| { |
| "completion_length": 431.66668701171875, |
| "epoch": 0.002007226013649137, |
| "grad_norm": 0.10463520139455795, |
| "kl": 0.00048596435226500034, |
| "learning_rate": 1.0026737967914439e-07, |
| "loss": 0.0032, |
| "reward": -0.84375, |
| "reward_std": 0.3984375, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.84375, |
| "step": 15 |
| }, |
| { |
| "completion_length": 399.5, |
| "epoch": 0.002141041081225746, |
| "grad_norm": 0.1404961347579956, |
| "kl": 0.000555322621949017, |
| "learning_rate": 1.0695187165775401e-07, |
| "loss": -0.0057, |
| "reward": -1.0625, |
| "reward_std": 0.46484375, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -1.0625, |
| "step": 16 |
| }, |
| { |
| "completion_length": 449.8333435058594, |
| "epoch": 0.002274856148802355, |
| "grad_norm": 0.10250594466924667, |
| "kl": 0.00048121344298124313, |
| "learning_rate": 1.1363636363636364e-07, |
| "loss": -0.0071, |
| "reward": -1.0234375, |
| "reward_std": 0.40234375, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -1.0234375, |
| "step": 17 |
| }, |
| { |
| "completion_length": 324.5, |
| "epoch": 0.0024086712163789645, |
| "grad_norm": 0.12464314699172974, |
| "kl": 0.0005811881856061518, |
| "learning_rate": 1.2032085561497328e-07, |
| "loss": 0.0033, |
| "reward": -0.6875, |
| "reward_std": 0.26171875, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.6875, |
| "step": 18 |
| }, |
| { |
| "completion_length": 578.0, |
| "epoch": 0.0025424862839555735, |
| "grad_norm": 0.08823499828577042, |
| "kl": 0.000675913121085614, |
| "learning_rate": 1.270053475935829e-07, |
| "loss": 0.0075, |
| "reward": -1.703125, |
| "reward_std": 0.5703125, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -1.703125, |
| "step": 19 |
| }, |
| { |
| "completion_length": 328.8333435058594, |
| "epoch": 0.0026763013515321826, |
| "grad_norm": 0.16708222031593323, |
| "kl": 0.0006092819385230541, |
| "learning_rate": 1.3368983957219251e-07, |
| "loss": 0.0091, |
| "reward": -0.71484375, |
| "reward_std": 0.1357421875, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.71484375, |
| "step": 20 |
| }, |
| { |
| "completion_length": 415.16668701171875, |
| "epoch": 0.0028101164191087916, |
| "grad_norm": 0.10446464270353317, |
| "kl": 0.0004726095939986408, |
| "learning_rate": 1.4037433155080215e-07, |
| "loss": 0.0011, |
| "reward": -1.0, |
| "reward_std": 0.50390625, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -1.0, |
| "step": 21 |
| }, |
| { |
| "completion_length": 398.5, |
| "epoch": 0.0029439314866854006, |
| "grad_norm": 0.10892236977815628, |
| "kl": 0.000556222046725452, |
| "learning_rate": 1.4705882352941178e-07, |
| "loss": 0.0016, |
| "reward": -0.9765625, |
| "reward_std": 0.349609375, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.9765625, |
| "step": 22 |
| }, |
| { |
| "completion_length": 351.66668701171875, |
| "epoch": 0.00307774655426201, |
| "grad_norm": 0.13707049190998077, |
| "kl": 0.0005205090856179595, |
| "learning_rate": 1.537433155080214e-07, |
| "loss": -0.0032, |
| "reward": -0.7421875, |
| "reward_std": 0.390625, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.7421875, |
| "step": 23 |
| }, |
| { |
| "completion_length": 310.0, |
| "epoch": 0.003211561621838619, |
| "grad_norm": 0.1579124480485916, |
| "kl": 0.0007410722319036722, |
| "learning_rate": 1.6042780748663104e-07, |
| "loss": 0.0002, |
| "reward": -0.65625, |
| "reward_std": 0.6328125, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.65625, |
| "step": 24 |
| }, |
| { |
| "completion_length": 353.0, |
| "epoch": 0.003345376689415228, |
| "grad_norm": 0.11555790901184082, |
| "kl": 0.0005753459990955889, |
| "learning_rate": 1.6711229946524068e-07, |
| "loss": -0.0034, |
| "reward": -0.828125, |
| "reward_std": 0.314453125, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.828125, |
| "step": 25 |
| }, |
| { |
| "completion_length": 416.5, |
| "epoch": 0.0034791917569918372, |
| "grad_norm": 0.10537782311439514, |
| "kl": 0.0006076883291825652, |
| "learning_rate": 1.7379679144385028e-07, |
| "loss": -0.0068, |
| "reward": -0.8359375, |
| "reward_std": 0.30859375, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.8359375, |
| "step": 26 |
| }, |
| { |
| "completion_length": 436.0, |
| "epoch": 0.0036130068245684463, |
| "grad_norm": 0.12061028182506561, |
| "kl": 0.0006918934523127973, |
| "learning_rate": 1.8048128342245991e-07, |
| "loss": 0.0033, |
| "reward": -0.91015625, |
| "reward_std": 0.92578125, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.91015625, |
| "step": 27 |
| }, |
| { |
| "completion_length": 447.16668701171875, |
| "epoch": 0.0037468218921450553, |
| "grad_norm": 0.11236874759197235, |
| "kl": 0.0005188498180359602, |
| "learning_rate": 1.8716577540106952e-07, |
| "loss": -0.0021, |
| "reward": -1.078125, |
| "reward_std": 0.298828125, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -1.078125, |
| "step": 28 |
| }, |
| { |
| "completion_length": 524.5, |
| "epoch": 0.003880636959721665, |
| "grad_norm": 0.08638511598110199, |
| "kl": 0.000413873785873875, |
| "learning_rate": 1.9385026737967918e-07, |
| "loss": -0.0027, |
| "reward": -1.1953125, |
| "reward_std": 0.58203125, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -1.1953125, |
| "step": 29 |
| }, |
| { |
| "completion_length": 433.0, |
| "epoch": 0.004014452027298274, |
| "grad_norm": 0.10361335426568985, |
| "kl": 0.0005174180259928107, |
| "learning_rate": 2.0053475935828878e-07, |
| "loss": -0.001, |
| "reward": -0.8125, |
| "reward_std": 0.55078125, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.8125, |
| "step": 30 |
| }, |
| { |
| "completion_length": 429.3333435058594, |
| "epoch": 0.004148267094874883, |
| "grad_norm": 0.09831919521093369, |
| "kl": 0.0004531377926468849, |
| "learning_rate": 2.0721925133689842e-07, |
| "loss": -0.0034, |
| "reward": -0.82421875, |
| "reward_std": 0.412109375, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.82421875, |
| "step": 31 |
| }, |
| { |
| "completion_length": 352.3333435058594, |
| "epoch": 0.004282082162451492, |
| "grad_norm": 0.1168479472398758, |
| "kl": 0.00041617939132265747, |
| "learning_rate": 2.1390374331550802e-07, |
| "loss": 0.012, |
| "reward": -0.671875, |
| "reward_std": 0.1455078125, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.671875, |
| "step": 32 |
| }, |
| { |
| "completion_length": 328.16668701171875, |
| "epoch": 0.004415897230028101, |
| "grad_norm": 0.14010493457317352, |
| "kl": 0.0006999190663918853, |
| "learning_rate": 2.2058823529411768e-07, |
| "loss": -0.0003, |
| "reward": -0.8203125, |
| "reward_std": 0.4609375, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.8203125, |
| "step": 33 |
| }, |
| { |
| "completion_length": 461.0, |
| "epoch": 0.00454971229760471, |
| "grad_norm": 0.07955824583768845, |
| "kl": 0.000317567668389529, |
| "learning_rate": 2.2727272727272729e-07, |
| "loss": 0.0061, |
| "reward": -0.7421875, |
| "reward_std": 0.107421875, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.7421875, |
| "step": 34 |
| }, |
| { |
| "completion_length": 343.5, |
| "epoch": 0.004683527365181319, |
| "grad_norm": 0.192967027425766, |
| "kl": 0.0003919226583093405, |
| "learning_rate": 2.3395721925133692e-07, |
| "loss": -0.0026, |
| "reward": -0.71875, |
| "reward_std": 0.26953125, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.71875, |
| "step": 35 |
| }, |
| { |
| "completion_length": 468.0, |
| "epoch": 0.004817342432757929, |
| "grad_norm": 0.1151042953133583, |
| "kl": 0.0005731440032832325, |
| "learning_rate": 2.4064171122994655e-07, |
| "loss": 0.0008, |
| "reward": -0.90625, |
| "reward_std": 0.4375, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.90625, |
| "step": 36 |
| }, |
| { |
| "completion_length": 326.66668701171875, |
| "epoch": 0.004951157500334538, |
| "grad_norm": 0.13014303147792816, |
| "kl": 0.0006222401279956102, |
| "learning_rate": 2.473262032085562e-07, |
| "loss": 0.0073, |
| "reward": -0.58984375, |
| "reward_std": 0.2177734375, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.58984375, |
| "step": 37 |
| }, |
| { |
| "completion_length": 470.16668701171875, |
| "epoch": 0.005084972567911147, |
| "grad_norm": 0.10929639637470245, |
| "kl": 0.0005664956988766789, |
| "learning_rate": 2.540106951871658e-07, |
| "loss": -0.001, |
| "reward": -1.2109375, |
| "reward_std": 0.451171875, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -1.2109375, |
| "step": 38 |
| }, |
| { |
| "completion_length": 350.5, |
| "epoch": 0.005218787635487756, |
| "grad_norm": 0.121163509786129, |
| "kl": 0.0006041490705683827, |
| "learning_rate": 2.606951871657754e-07, |
| "loss": -0.0012, |
| "reward": -0.65234375, |
| "reward_std": 0.404296875, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.65234375, |
| "step": 39 |
| }, |
| { |
| "completion_length": 441.5, |
| "epoch": 0.005352602703064365, |
| "grad_norm": 0.09024005383253098, |
| "kl": 0.0005421562236733735, |
| "learning_rate": 2.6737967914438503e-07, |
| "loss": 0.0012, |
| "reward": -0.765625, |
| "reward_std": 0.7734375, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.765625, |
| "step": 40 |
| }, |
| { |
| "completion_length": 587.8333740234375, |
| "epoch": 0.005486417770640974, |
| "grad_norm": 0.11247697472572327, |
| "kl": 0.0009425554308108985, |
| "learning_rate": 2.740641711229947e-07, |
| "loss": -0.0003, |
| "reward": -1.6875, |
| "reward_std": 0.6171875, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -1.6875, |
| "step": 41 |
| }, |
| { |
| "completion_length": 523.5, |
| "epoch": 0.005620232838217583, |
| "grad_norm": 0.08999153226613998, |
| "kl": 0.0004692915244959295, |
| "learning_rate": 2.807486631016043e-07, |
| "loss": 0.0003, |
| "reward": -0.796875, |
| "reward_std": 1.375, |
| "rewards/correctness_reward_func": 0.333984375, |
| "rewards/int_reward_func": 0.08349609375, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -1.2109375, |
| "step": 42 |
| }, |
| { |
| "completion_length": 531.0, |
| "epoch": 0.005754047905794192, |
| "grad_norm": 0.09678950905799866, |
| "kl": 0.0005171874072402716, |
| "learning_rate": 2.8743315508021395e-07, |
| "loss": -0.0044, |
| "reward": -1.421875, |
| "reward_std": 0.4140625, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -1.421875, |
| "step": 43 |
| }, |
| { |
| "completion_length": 416.0, |
| "epoch": 0.005887862973370801, |
| "grad_norm": 0.11189436912536621, |
| "kl": 0.00040408255881629884, |
| "learning_rate": 2.9411764705882356e-07, |
| "loss": 0.0029, |
| "reward": -0.921875, |
| "reward_std": 0.1884765625, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.921875, |
| "step": 44 |
| }, |
| { |
| "completion_length": 359.8333435058594, |
| "epoch": 0.00602167804094741, |
| "grad_norm": 0.10176176577806473, |
| "kl": 0.0005236775032244623, |
| "learning_rate": 3.0080213903743316e-07, |
| "loss": -0.0032, |
| "reward": -0.58203125, |
| "reward_std": 0.4140625, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.58203125, |
| "step": 45 |
| }, |
| { |
| "completion_length": 416.8333435058594, |
| "epoch": 0.00615549310852402, |
| "grad_norm": 0.12203460931777954, |
| "kl": 0.0006941946921870112, |
| "learning_rate": 3.074866310160428e-07, |
| "loss": 0.0008, |
| "reward": -1.15625, |
| "reward_std": 0.50390625, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -1.15625, |
| "step": 46 |
| }, |
| { |
| "completion_length": 524.8333740234375, |
| "epoch": 0.006289308176100629, |
| "grad_norm": 0.09807480126619339, |
| "kl": 0.000624034320935607, |
| "learning_rate": 3.1417112299465243e-07, |
| "loss": -0.0062, |
| "reward": -1.1875, |
| "reward_std": 0.396484375, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -1.1875, |
| "step": 47 |
| }, |
| { |
| "completion_length": 419.5, |
| "epoch": 0.006423123243677238, |
| "grad_norm": 0.11151473969221115, |
| "kl": 0.0005637712310999632, |
| "learning_rate": 3.208556149732621e-07, |
| "loss": -0.0017, |
| "reward": -0.94140625, |
| "reward_std": 0.5546875, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.94140625, |
| "step": 48 |
| }, |
| { |
| "completion_length": 375.3333435058594, |
| "epoch": 0.006556938311253847, |
| "grad_norm": 0.15443629026412964, |
| "kl": 0.0006895489059388638, |
| "learning_rate": 3.275401069518717e-07, |
| "loss": -0.0021, |
| "reward": -0.80078125, |
| "reward_std": 0.6015625, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.80078125, |
| "step": 49 |
| }, |
| { |
| "completion_length": 538.5, |
| "epoch": 0.006690753378830456, |
| "grad_norm": 0.12232497334480286, |
| "kl": 0.00044502606033347547, |
| "learning_rate": 3.3422459893048135e-07, |
| "loss": 0.0038, |
| "reward": -1.15625, |
| "reward_std": 0.345703125, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -1.15625, |
| "step": 50 |
| }, |
| { |
| "completion_length": 380.66668701171875, |
| "epoch": 0.006824568446407065, |
| "grad_norm": 0.09400169551372528, |
| "kl": 0.0004400149919092655, |
| "learning_rate": 3.409090909090909e-07, |
| "loss": -0.0005, |
| "reward": -0.75390625, |
| "reward_std": 0.75390625, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.75390625, |
| "step": 51 |
| }, |
| { |
| "completion_length": 302.16668701171875, |
| "epoch": 0.0069583835139836745, |
| "grad_norm": 0.18885326385498047, |
| "kl": 0.0006017067935317755, |
| "learning_rate": 3.4759358288770056e-07, |
| "loss": 0.0001, |
| "reward": -0.494140625, |
| "reward_std": 0.5078125, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.494140625, |
| "step": 52 |
| }, |
| { |
| "completion_length": 310.0, |
| "epoch": 0.0070921985815602835, |
| "grad_norm": 0.17508742213249207, |
| "kl": 0.0006495526758953929, |
| "learning_rate": 3.542780748663102e-07, |
| "loss": 0.0001, |
| "reward": -0.609375, |
| "reward_std": 0.208984375, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.609375, |
| "step": 53 |
| }, |
| { |
| "completion_length": 348.16668701171875, |
| "epoch": 0.0072260136491368926, |
| "grad_norm": 0.1143779456615448, |
| "kl": 0.0005849208100698888, |
| "learning_rate": 3.6096256684491983e-07, |
| "loss": -0.0023, |
| "reward": -0.8984375, |
| "reward_std": 0.423828125, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.8984375, |
| "step": 54 |
| }, |
| { |
| "completion_length": 461.5, |
| "epoch": 0.007359828716713502, |
| "grad_norm": 0.10026198625564575, |
| "kl": 0.0005551945068873465, |
| "learning_rate": 3.6764705882352943e-07, |
| "loss": -0.0088, |
| "reward": -1.03125, |
| "reward_std": 0.38671875, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -1.03125, |
| "step": 55 |
| }, |
| { |
| "completion_length": 581.1666870117188, |
| "epoch": 0.007493643784290111, |
| "grad_norm": 0.09014507383108139, |
| "kl": 0.0004388962115626782, |
| "learning_rate": 3.7433155080213904e-07, |
| "loss": 0.0007, |
| "reward": -1.328125, |
| "reward_std": 0.1474609375, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -1.328125, |
| "step": 56 |
| }, |
| { |
| "completion_length": 320.3333435058594, |
| "epoch": 0.0076274588518667205, |
| "grad_norm": 0.09987051039934158, |
| "kl": 0.0005903591518290341, |
| "learning_rate": 3.810160427807487e-07, |
| "loss": -0.0068, |
| "reward": -0.609375, |
| "reward_std": 0.2578125, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.609375, |
| "step": 57 |
| }, |
| { |
| "completion_length": 362.8333435058594, |
| "epoch": 0.00776127391944333, |
| "grad_norm": 0.246050164103508, |
| "kl": 0.0005056472145952284, |
| "learning_rate": 3.8770053475935836e-07, |
| "loss": -0.0027, |
| "reward": -0.625, |
| "reward_std": 0.5234375, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.625, |
| "step": 58 |
| }, |
| { |
| "completion_length": 524.0, |
| "epoch": 0.007895088987019938, |
| "grad_norm": 0.12084438651800156, |
| "kl": 0.0005575703689828515, |
| "learning_rate": 3.943850267379679e-07, |
| "loss": 0.0114, |
| "reward": -1.1328125, |
| "reward_std": 0.28125, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -1.1328125, |
| "step": 59 |
| }, |
| { |
| "completion_length": 410.3333435058594, |
| "epoch": 0.008028904054596548, |
| "grad_norm": 0.10101523995399475, |
| "kl": 0.0005777844344265759, |
| "learning_rate": 4.0106951871657757e-07, |
| "loss": 0.0007, |
| "reward": -0.94921875, |
| "reward_std": 0.271484375, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.94921875, |
| "step": 60 |
| }, |
| { |
| "completion_length": 717.6666870117188, |
| "epoch": 0.008162719122173156, |
| "grad_norm": 0.09220802038908005, |
| "kl": 0.0006241414812393486, |
| "learning_rate": 4.077540106951872e-07, |
| "loss": -0.0078, |
| "reward": -2.046875, |
| "reward_std": 0.53515625, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -2.046875, |
| "step": 61 |
| }, |
| { |
| "completion_length": 384.66668701171875, |
| "epoch": 0.008296534189749766, |
| "grad_norm": 0.10890569537878036, |
| "kl": 0.00048696936573833227, |
| "learning_rate": 4.1443850267379683e-07, |
| "loss": 0.0039, |
| "reward": -0.921875, |
| "reward_std": 0.1865234375, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.921875, |
| "step": 62 |
| }, |
| { |
| "completion_length": 271.3333435058594, |
| "epoch": 0.008430349257326376, |
| "grad_norm": 0.11486776173114777, |
| "kl": 0.0005599698051810265, |
| "learning_rate": 4.211229946524065e-07, |
| "loss": -0.0005, |
| "reward": -0.201171875, |
| "reward_std": 0.396484375, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.201171875, |
| "step": 63 |
| }, |
| { |
| "completion_length": 385.3333435058594, |
| "epoch": 0.008564164324902984, |
| "grad_norm": 0.1193244457244873, |
| "kl": 0.0006926630157977343, |
| "learning_rate": 4.2780748663101604e-07, |
| "loss": 0.0043, |
| "reward": -0.97265625, |
| "reward_std": 0.41015625, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.97265625, |
| "step": 64 |
| }, |
| { |
| "completion_length": 539.6666870117188, |
| "epoch": 0.008697979392479594, |
| "grad_norm": 0.09389720857143402, |
| "kl": 0.0004780918825417757, |
| "learning_rate": 4.344919786096257e-07, |
| "loss": 0.0052, |
| "reward": -1.234375, |
| "reward_std": 0.373046875, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -1.234375, |
| "step": 65 |
| }, |
| { |
| "completion_length": 259.8333435058594, |
| "epoch": 0.008831794460056202, |
| "grad_norm": 0.22691243886947632, |
| "kl": 0.0008878613589331508, |
| "learning_rate": 4.4117647058823536e-07, |
| "loss": -0.0048, |
| "reward": -0.51953125, |
| "reward_std": 0.251953125, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.51953125, |
| "step": 66 |
| }, |
| { |
| "completion_length": 461.16668701171875, |
| "epoch": 0.008965609527632812, |
| "grad_norm": 0.12113010138273239, |
| "kl": 0.0006317974766716361, |
| "learning_rate": 4.4786096256684497e-07, |
| "loss": -0.0067, |
| "reward": -1.15625, |
| "reward_std": 0.39453125, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -1.15625, |
| "step": 67 |
| }, |
| { |
| "completion_length": 399.8333435058594, |
| "epoch": 0.00909942459520942, |
| "grad_norm": 0.1679317206144333, |
| "kl": 0.000584149791393429, |
| "learning_rate": 4.5454545454545457e-07, |
| "loss": -0.0093, |
| "reward": -0.9609375, |
| "reward_std": 0.2060546875, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.9609375, |
| "step": 68 |
| }, |
| { |
| "completion_length": 381.0, |
| "epoch": 0.00923323966278603, |
| "grad_norm": 0.2019040584564209, |
| "kl": 0.0007442033383995295, |
| "learning_rate": 4.612299465240642e-07, |
| "loss": 0.0034, |
| "reward": -0.63671875, |
| "reward_std": 0.494140625, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.63671875, |
| "step": 69 |
| }, |
| { |
| "completion_length": 455.5, |
| "epoch": 0.009367054730362638, |
| "grad_norm": 0.09101377427577972, |
| "kl": 0.00046143907820805907, |
| "learning_rate": 4.6791443850267384e-07, |
| "loss": -0.0057, |
| "reward": -1.046875, |
| "reward_std": 0.6328125, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -1.046875, |
| "step": 70 |
| }, |
| { |
| "completion_length": 493.8333435058594, |
| "epoch": 0.009500869797939248, |
| "grad_norm": 0.09268555790185928, |
| "kl": 0.00048020537360571325, |
| "learning_rate": 4.745989304812835e-07, |
| "loss": -0.0021, |
| "reward": -1.28125, |
| "reward_std": 0.54296875, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -1.28125, |
| "step": 71 |
| }, |
| { |
| "completion_length": 601.6666870117188, |
| "epoch": 0.009634684865515858, |
| "grad_norm": 0.07598231732845306, |
| "kl": 0.0004928440321236849, |
| "learning_rate": 4.812834224598931e-07, |
| "loss": -0.0031, |
| "reward": -1.328125, |
| "reward_std": 0.74609375, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -1.328125, |
| "step": 72 |
| }, |
| { |
| "completion_length": 449.16668701171875, |
| "epoch": 0.009768499933092466, |
| "grad_norm": 0.1203397586941719, |
| "kl": 0.0006244009709917009, |
| "learning_rate": 4.879679144385027e-07, |
| "loss": -0.0055, |
| "reward": -1.0703125, |
| "reward_std": 0.474609375, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -1.0703125, |
| "step": 73 |
| }, |
| { |
| "completion_length": 499.0, |
| "epoch": 0.009902315000669076, |
| "grad_norm": 0.08029637485742569, |
| "kl": 0.0004115910269320011, |
| "learning_rate": 4.946524064171124e-07, |
| "loss": 0.001, |
| "reward": -1.1953125, |
| "reward_std": 0.5703125, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -1.1953125, |
| "step": 74 |
| }, |
| { |
| "completion_length": 318.8333435058594, |
| "epoch": 0.010036130068245684, |
| "grad_norm": 0.10725877434015274, |
| "kl": 0.0005362802767194808, |
| "learning_rate": 5.013368983957219e-07, |
| "loss": -0.0039, |
| "reward": -0.38671875, |
| "reward_std": 0.25390625, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.38671875, |
| "step": 75 |
| }, |
| { |
| "completion_length": 348.0, |
| "epoch": 0.010169945135822294, |
| "grad_norm": 0.1331893801689148, |
| "kl": 0.0006620581261813641, |
| "learning_rate": 5.080213903743316e-07, |
| "loss": -0.0007, |
| "reward": -0.84765625, |
| "reward_std": 0.486328125, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.84765625, |
| "step": 76 |
| }, |
| { |
| "completion_length": 456.66668701171875, |
| "epoch": 0.010303760203398902, |
| "grad_norm": 0.10820724815130234, |
| "kl": 0.0007615931099280715, |
| "learning_rate": 5.147058823529412e-07, |
| "loss": 0.0036, |
| "reward": -0.953125, |
| "reward_std": 0.59765625, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.953125, |
| "step": 77 |
| }, |
| { |
| "completion_length": 335.66668701171875, |
| "epoch": 0.010437575270975512, |
| "grad_norm": 0.13866935670375824, |
| "kl": 0.0005373357562348247, |
| "learning_rate": 5.213903743315508e-07, |
| "loss": 0.0013, |
| "reward": -0.58203125, |
| "reward_std": 0.3359375, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.58203125, |
| "step": 78 |
| }, |
| { |
| "completion_length": 338.0, |
| "epoch": 0.01057139033855212, |
| "grad_norm": 0.1531476229429245, |
| "kl": 0.000613297161180526, |
| "learning_rate": 5.280748663101604e-07, |
| "loss": -0.0006, |
| "reward": -0.6875, |
| "reward_std": 0.380859375, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.6875, |
| "step": 79 |
| }, |
| { |
| "completion_length": 323.8333435058594, |
| "epoch": 0.01070520540612873, |
| "grad_norm": 0.11174651980400085, |
| "kl": 0.00048220629105344415, |
| "learning_rate": 5.347593582887701e-07, |
| "loss": 0.0042, |
| "reward": -0.5546875, |
| "reward_std": 0.099609375, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.5546875, |
| "step": 80 |
| }, |
| { |
| "completion_length": 425.0, |
| "epoch": 0.010839020473705338, |
| "grad_norm": 0.06810642778873444, |
| "kl": 0.0002865367860067636, |
| "learning_rate": 5.414438502673798e-07, |
| "loss": 0.0087, |
| "reward": -0.921875, |
| "reward_std": 0.1455078125, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.921875, |
| "step": 81 |
| }, |
| { |
| "completion_length": 356.0, |
| "epoch": 0.010972835541281948, |
| "grad_norm": 0.12943901121616364, |
| "kl": 0.0005909207975491881, |
| "learning_rate": 5.481283422459894e-07, |
| "loss": -0.0014, |
| "reward": -0.80078125, |
| "reward_std": 0.2373046875, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.80078125, |
| "step": 82 |
| }, |
| { |
| "completion_length": 408.66668701171875, |
| "epoch": 0.011106650608858558, |
| "grad_norm": 0.10401128232479095, |
| "kl": 0.0005684032803401351, |
| "learning_rate": 5.54812834224599e-07, |
| "loss": 0.0067, |
| "reward": -0.828125, |
| "reward_std": 0.279296875, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.828125, |
| "step": 83 |
| }, |
| { |
| "completion_length": 365.5, |
| "epoch": 0.011240465676435166, |
| "grad_norm": 0.11247576773166656, |
| "kl": 0.0005961977876722813, |
| "learning_rate": 5.614973262032086e-07, |
| "loss": 0.0049, |
| "reward": -0.7890625, |
| "reward_std": 0.43359375, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.7890625, |
| "step": 84 |
| }, |
| { |
| "completion_length": 373.16668701171875, |
| "epoch": 0.011374280744011776, |
| "grad_norm": 0.13172324001789093, |
| "kl": 0.0006587211973965168, |
| "learning_rate": 5.681818181818182e-07, |
| "loss": 0.0, |
| "reward": -0.92578125, |
| "reward_std": 0.4296875, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.92578125, |
| "step": 85 |
| }, |
| { |
| "completion_length": 463.16668701171875, |
| "epoch": 0.011508095811588384, |
| "grad_norm": 0.0999283418059349, |
| "kl": 0.0005339820636436343, |
| "learning_rate": 5.748663101604279e-07, |
| "loss": -0.0073, |
| "reward": -1.1640625, |
| "reward_std": 0.478515625, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -1.1640625, |
| "step": 86 |
| }, |
| { |
| "completion_length": 346.16668701171875, |
| "epoch": 0.011641910879164994, |
| "grad_norm": 0.1427423655986786, |
| "kl": 0.0005965695017948747, |
| "learning_rate": 5.815508021390375e-07, |
| "loss": 0.0052, |
| "reward": -0.86328125, |
| "reward_std": 0.36328125, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.86328125, |
| "step": 87 |
| }, |
| { |
| "completion_length": 464.3333435058594, |
| "epoch": 0.011775725946741603, |
| "grad_norm": 0.09077266603708267, |
| "kl": 0.0005941446870565414, |
| "learning_rate": 5.882352941176471e-07, |
| "loss": -0.0034, |
| "reward": -0.96875, |
| "reward_std": 0.3203125, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.96875, |
| "step": 88 |
| }, |
| { |
| "completion_length": 444.3333435058594, |
| "epoch": 0.011909541014318212, |
| "grad_norm": 0.11555906385183334, |
| "kl": 0.0005076751112937927, |
| "learning_rate": 5.949197860962567e-07, |
| "loss": 0.0, |
| "reward": -0.98046875, |
| "reward_std": 0.3828125, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.98046875, |
| "step": 89 |
| }, |
| { |
| "completion_length": 381.66668701171875, |
| "epoch": 0.01204335608189482, |
| "grad_norm": 0.11006759107112885, |
| "kl": 0.0005213702679611742, |
| "learning_rate": 6.016042780748663e-07, |
| "loss": -0.0019, |
| "reward": -0.8125, |
| "reward_std": 0.51171875, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.8125, |
| "step": 90 |
| }, |
| { |
| "completion_length": 586.6666870117188, |
| "epoch": 0.01217717114947143, |
| "grad_norm": 0.0977427214384079, |
| "kl": 0.0004677172692026943, |
| "learning_rate": 6.08288770053476e-07, |
| "loss": 0.0059, |
| "reward": -1.2265625, |
| "reward_std": 0.5703125, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -1.2265625, |
| "step": 91 |
| }, |
| { |
| "completion_length": 523.5, |
| "epoch": 0.01231098621704804, |
| "grad_norm": 0.14523504674434662, |
| "kl": 0.0007363607874140143, |
| "learning_rate": 6.149732620320856e-07, |
| "loss": -0.0029, |
| "reward": -1.484375, |
| "reward_std": 0.828125, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -1.484375, |
| "step": 92 |
| }, |
| { |
| "completion_length": 452.5, |
| "epoch": 0.012444801284624649, |
| "grad_norm": 0.11730131506919861, |
| "kl": 0.0004316701088100672, |
| "learning_rate": 6.216577540106952e-07, |
| "loss": 0.0008, |
| "reward": -1.328125, |
| "reward_std": 0.82421875, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -1.328125, |
| "step": 93 |
| }, |
| { |
| "completion_length": 471.66668701171875, |
| "epoch": 0.012578616352201259, |
| "grad_norm": 0.12073160707950592, |
| "kl": 0.000504339870531112, |
| "learning_rate": 6.283422459893049e-07, |
| "loss": 0.0029, |
| "reward": -1.1796875, |
| "reward_std": 0.40625, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -1.1796875, |
| "step": 94 |
| }, |
| { |
| "completion_length": 291.66668701171875, |
| "epoch": 0.012712431419777867, |
| "grad_norm": 0.17788150906562805, |
| "kl": 0.0006855755927972496, |
| "learning_rate": 6.350267379679146e-07, |
| "loss": -0.0016, |
| "reward": -0.494140625, |
| "reward_std": 0.26171875, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.494140625, |
| "step": 95 |
| }, |
| { |
| "completion_length": 411.8333435058594, |
| "epoch": 0.012846246487354477, |
| "grad_norm": 0.08730936795473099, |
| "kl": 0.00039596876013092697, |
| "learning_rate": 6.417112299465242e-07, |
| "loss": 0.0014, |
| "reward": -0.890625, |
| "reward_std": 0.498046875, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.890625, |
| "step": 96 |
| }, |
| { |
| "completion_length": 376.5, |
| "epoch": 0.012980061554931085, |
| "grad_norm": 0.09182324260473251, |
| "kl": 0.0004653404466807842, |
| "learning_rate": 6.483957219251337e-07, |
| "loss": -0.0036, |
| "reward": -0.73828125, |
| "reward_std": 0.33203125, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.73828125, |
| "step": 97 |
| }, |
| { |
| "completion_length": 395.66668701171875, |
| "epoch": 0.013113876622507695, |
| "grad_norm": 0.12949968874454498, |
| "kl": 0.0005730512784793973, |
| "learning_rate": 6.550802139037434e-07, |
| "loss": -0.0027, |
| "reward": -0.8359375, |
| "reward_std": 0.49609375, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.8359375, |
| "step": 98 |
| }, |
| { |
| "completion_length": 600.0, |
| "epoch": 0.013247691690084303, |
| "grad_norm": 0.08795811235904694, |
| "kl": 0.000673401344101876, |
| "learning_rate": 6.61764705882353e-07, |
| "loss": 0.0018, |
| "reward": -1.78125, |
| "reward_std": 0.5078125, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -1.78125, |
| "step": 99 |
| }, |
| { |
| "completion_length": 468.0, |
| "epoch": 0.013381506757660913, |
| "grad_norm": 0.1182761937379837, |
| "kl": 0.0005245240754447877, |
| "learning_rate": 6.684491978609627e-07, |
| "loss": 0.0088, |
| "reward": -1.234375, |
| "reward_std": 0.4296875, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -1.234375, |
| "step": 100 |
| }, |
| { |
| "completion_length": 429.8333435058594, |
| "epoch": 0.013515321825237521, |
| "grad_norm": 0.10523517429828644, |
| "kl": 0.0004888825351372361, |
| "learning_rate": 6.751336898395723e-07, |
| "loss": 0.0049, |
| "reward": -0.8671875, |
| "reward_std": 0.36328125, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.8671875, |
| "step": 101 |
| }, |
| { |
| "completion_length": 350.16668701171875, |
| "epoch": 0.01364913689281413, |
| "grad_norm": 0.1099957525730133, |
| "kl": 0.0004989251610822976, |
| "learning_rate": 6.818181818181818e-07, |
| "loss": -0.0032, |
| "reward": -0.6171875, |
| "reward_std": 0.2275390625, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.6171875, |
| "step": 102 |
| }, |
| { |
| "completion_length": 492.0, |
| "epoch": 0.01378295196039074, |
| "grad_norm": 0.096939817070961, |
| "kl": 0.0006083787302486598, |
| "learning_rate": 6.885026737967915e-07, |
| "loss": -0.0012, |
| "reward": -0.84765625, |
| "reward_std": 0.5078125, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.84765625, |
| "step": 103 |
| }, |
| { |
| "completion_length": 520.8333740234375, |
| "epoch": 0.013916767027967349, |
| "grad_norm": 0.09864147007465363, |
| "kl": 0.0004971123998984694, |
| "learning_rate": 6.951871657754011e-07, |
| "loss": 0.0051, |
| "reward": -1.1796875, |
| "reward_std": 0.341796875, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -1.1796875, |
| "step": 104 |
| }, |
| { |
| "completion_length": 528.0, |
| "epoch": 0.014050582095543959, |
| "grad_norm": 0.08159384876489639, |
| "kl": 0.0003954106941819191, |
| "learning_rate": 7.018716577540107e-07, |
| "loss": 0.013, |
| "reward": -1.265625, |
| "reward_std": 0.25390625, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -1.265625, |
| "step": 105 |
| }, |
| { |
| "completion_length": 357.5, |
| "epoch": 0.014184397163120567, |
| "grad_norm": 0.103823222219944, |
| "kl": 0.0004869327531196177, |
| "learning_rate": 7.085561497326204e-07, |
| "loss": 0.0014, |
| "reward": -0.859375, |
| "reward_std": 0.470703125, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.859375, |
| "step": 106 |
| }, |
| { |
| "completion_length": 475.5, |
| "epoch": 0.014318212230697177, |
| "grad_norm": 0.0782044380903244, |
| "kl": 0.0005046841688454151, |
| "learning_rate": 7.152406417112299e-07, |
| "loss": 0.0, |
| "reward": -1.1484375, |
| "reward_std": 0.4375, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -1.1484375, |
| "step": 107 |
| }, |
| { |
| "completion_length": 467.5, |
| "epoch": 0.014452027298273785, |
| "grad_norm": 0.09171518683433533, |
| "kl": 0.0005273159476928413, |
| "learning_rate": 7.219251336898397e-07, |
| "loss": -0.0036, |
| "reward": -0.921875, |
| "reward_std": 0.4140625, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.921875, |
| "step": 108 |
| }, |
| { |
| "completion_length": 461.16668701171875, |
| "epoch": 0.014585842365850395, |
| "grad_norm": 0.09841100871562958, |
| "kl": 0.0006487583741545677, |
| "learning_rate": 7.286096256684493e-07, |
| "loss": 0.0033, |
| "reward": -0.875, |
| "reward_std": 0.3671875, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.875, |
| "step": 109 |
| }, |
| { |
| "completion_length": 358.5, |
| "epoch": 0.014719657433427003, |
| "grad_norm": 0.13746988773345947, |
| "kl": 0.0004324812616687268, |
| "learning_rate": 7.352941176470589e-07, |
| "loss": 0.0025, |
| "reward": -0.68359375, |
| "reward_std": 0.279296875, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.68359375, |
| "step": 110 |
| }, |
| { |
| "completion_length": 559.5, |
| "epoch": 0.014853472501003613, |
| "grad_norm": 0.08793191611766815, |
| "kl": 0.0005514743970707059, |
| "learning_rate": 7.419786096256686e-07, |
| "loss": 0.0003, |
| "reward": -1.5546875, |
| "reward_std": 0.404296875, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -1.5546875, |
| "step": 111 |
| }, |
| { |
| "completion_length": 396.66668701171875, |
| "epoch": 0.014987287568580221, |
| "grad_norm": 0.10710439831018448, |
| "kl": 0.0004422089259605855, |
| "learning_rate": 7.486631016042781e-07, |
| "loss": -0.0019, |
| "reward": -0.87109375, |
| "reward_std": 0.265625, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.87109375, |
| "step": 112 |
| }, |
| { |
| "completion_length": 474.5, |
| "epoch": 0.015121102636156831, |
| "grad_norm": 0.09523480385541916, |
| "kl": 0.00043510389514267445, |
| "learning_rate": 7.553475935828877e-07, |
| "loss": -0.0029, |
| "reward": -0.94921875, |
| "reward_std": 0.376953125, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.94921875, |
| "step": 113 |
| }, |
| { |
| "completion_length": 385.8333435058594, |
| "epoch": 0.015254917703733441, |
| "grad_norm": 0.11645786464214325, |
| "kl": 0.0005059984978288412, |
| "learning_rate": 7.620320855614974e-07, |
| "loss": -0.001, |
| "reward": -0.8671875, |
| "reward_std": 0.4921875, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.8671875, |
| "step": 114 |
| }, |
| { |
| "completion_length": 380.16668701171875, |
| "epoch": 0.01538873277131005, |
| "grad_norm": 0.14121747016906738, |
| "kl": 0.00044106499990448356, |
| "learning_rate": 7.68716577540107e-07, |
| "loss": 0.0038, |
| "reward": -0.703125, |
| "reward_std": 0.41796875, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.703125, |
| "step": 115 |
| }, |
| { |
| "completion_length": 481.0, |
| "epoch": 0.01552254783888666, |
| "grad_norm": 0.10021474212408066, |
| "kl": 0.0005236791330389678, |
| "learning_rate": 7.754010695187167e-07, |
| "loss": -0.0135, |
| "reward": -1.09375, |
| "reward_std": 0.291015625, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -1.09375, |
| "step": 116 |
| }, |
| { |
| "completion_length": 662.0, |
| "epoch": 0.01565636290646327, |
| "grad_norm": 0.08310368657112122, |
| "kl": 0.0005542068392969668, |
| "learning_rate": 7.820855614973262e-07, |
| "loss": 0.0026, |
| "reward": -1.6015625, |
| "reward_std": 0.6328125, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -1.6015625, |
| "step": 117 |
| }, |
| { |
| "completion_length": 469.3333435058594, |
| "epoch": 0.015790177974039876, |
| "grad_norm": 0.08709719032049179, |
| "kl": 0.000453361077234149, |
| "learning_rate": 7.887700534759358e-07, |
| "loss": 0.0008, |
| "reward": -1.140625, |
| "reward_std": 0.484375, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -1.140625, |
| "step": 118 |
| }, |
| { |
| "completion_length": 397.16668701171875, |
| "epoch": 0.015923993041616485, |
| "grad_norm": 0.11706002801656723, |
| "kl": 0.0006737220101058483, |
| "learning_rate": 7.954545454545455e-07, |
| "loss": 0.0036, |
| "reward": -0.51953125, |
| "reward_std": 0.4453125, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.51953125, |
| "step": 119 |
| }, |
| { |
| "completion_length": 431.0, |
| "epoch": 0.016057808109193095, |
| "grad_norm": 0.12378671020269394, |
| "kl": 0.0005190541851334274, |
| "learning_rate": 8.021390374331551e-07, |
| "loss": -0.0026, |
| "reward": -1.0859375, |
| "reward_std": 0.5390625, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -1.0859375, |
| "step": 120 |
| }, |
| { |
| "completion_length": 585.8333740234375, |
| "epoch": 0.016191623176769705, |
| "grad_norm": 0.06980501115322113, |
| "kl": 0.0005238899611867964, |
| "learning_rate": 8.088235294117648e-07, |
| "loss": -0.0045, |
| "reward": -1.6796875, |
| "reward_std": 0.275390625, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -1.6796875, |
| "step": 121 |
| }, |
| { |
| "completion_length": 289.8333435058594, |
| "epoch": 0.01632543824434631, |
| "grad_norm": 0.15975068509578705, |
| "kl": 0.0007231835625134408, |
| "learning_rate": 8.155080213903745e-07, |
| "loss": -0.0021, |
| "reward": -0.482421875, |
| "reward_std": 0.318359375, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.482421875, |
| "step": 122 |
| }, |
| { |
| "completion_length": 507.3333435058594, |
| "epoch": 0.01645925331192292, |
| "grad_norm": 0.11071807146072388, |
| "kl": 0.0004386794753372669, |
| "learning_rate": 8.22192513368984e-07, |
| "loss": -0.0013, |
| "reward": -0.88671875, |
| "reward_std": 0.5703125, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.88671875, |
| "step": 123 |
| }, |
| { |
| "completion_length": 577.0, |
| "epoch": 0.01659306837949953, |
| "grad_norm": 0.06843210011720657, |
| "kl": 0.0004015905724372715, |
| "learning_rate": 8.288770053475937e-07, |
| "loss": -0.0058, |
| "reward": -1.2734375, |
| "reward_std": 0.349609375, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -1.2734375, |
| "step": 124 |
| }, |
| { |
| "completion_length": 485.3333435058594, |
| "epoch": 0.01672688344707614, |
| "grad_norm": 0.11638530343770981, |
| "kl": 0.000609593465924263, |
| "learning_rate": 8.355614973262033e-07, |
| "loss": -0.0047, |
| "reward": -1.15625, |
| "reward_std": 0.33984375, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -1.15625, |
| "step": 125 |
| }, |
| { |
| "completion_length": 449.8333435058594, |
| "epoch": 0.01686069851465275, |
| "grad_norm": 0.06978274881839752, |
| "kl": 0.00046619633212685585, |
| "learning_rate": 8.42245989304813e-07, |
| "loss": -0.0045, |
| "reward": -0.875, |
| "reward_std": 0.2734375, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.875, |
| "step": 126 |
| }, |
| { |
| "completion_length": 358.8333435058594, |
| "epoch": 0.016994513582229358, |
| "grad_norm": 0.12115911394357681, |
| "kl": 0.0006228546844795346, |
| "learning_rate": 8.489304812834226e-07, |
| "loss": 0.0012, |
| "reward": -0.80078125, |
| "reward_std": 0.2158203125, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.80078125, |
| "step": 127 |
| }, |
| { |
| "completion_length": 340.8333435058594, |
| "epoch": 0.017128328649805968, |
| "grad_norm": 0.1330835521221161, |
| "kl": 0.0006046565249562263, |
| "learning_rate": 8.556149732620321e-07, |
| "loss": 0.001, |
| "reward": -0.75, |
| "reward_std": 0.2421875, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.75, |
| "step": 128 |
| }, |
| { |
| "completion_length": 384.66668701171875, |
| "epoch": 0.017262143717382578, |
| "grad_norm": 0.1260133534669876, |
| "kl": 0.0006517590372823179, |
| "learning_rate": 8.622994652406418e-07, |
| "loss": -0.0002, |
| "reward": -0.875, |
| "reward_std": 0.71875, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.875, |
| "step": 129 |
| }, |
| { |
| "completion_length": 438.16668701171875, |
| "epoch": 0.017395958784959187, |
| "grad_norm": 0.10832860320806503, |
| "kl": 0.0005657231668010354, |
| "learning_rate": 8.689839572192514e-07, |
| "loss": 0.0007, |
| "reward": -1.0234375, |
| "reward_std": 0.578125, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -1.0234375, |
| "step": 130 |
| }, |
| { |
| "completion_length": 395.5, |
| "epoch": 0.017529773852535794, |
| "grad_norm": 0.14919129014015198, |
| "kl": 0.0006569415563717484, |
| "learning_rate": 8.756684491978611e-07, |
| "loss": -0.0018, |
| "reward": -0.94921875, |
| "reward_std": 0.546875, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.94921875, |
| "step": 131 |
| }, |
| { |
| "completion_length": 391.16668701171875, |
| "epoch": 0.017663588920112404, |
| "grad_norm": 0.14460250735282898, |
| "kl": 0.0006637731567025185, |
| "learning_rate": 8.823529411764707e-07, |
| "loss": -0.0013, |
| "reward": -0.94921875, |
| "reward_std": 0.25, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.94921875, |
| "step": 132 |
| }, |
| { |
| "completion_length": 441.3333435058594, |
| "epoch": 0.017797403987689014, |
| "grad_norm": 0.15532676875591278, |
| "kl": 0.0004714071692433208, |
| "learning_rate": 8.890374331550802e-07, |
| "loss": 0.0046, |
| "reward": -0.98828125, |
| "reward_std": 0.251953125, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.98828125, |
| "step": 133 |
| }, |
| { |
| "completion_length": 370.8333435058594, |
| "epoch": 0.017931219055265624, |
| "grad_norm": 0.11382456123828888, |
| "kl": 0.0007278465200215578, |
| "learning_rate": 8.957219251336899e-07, |
| "loss": -0.0034, |
| "reward": -0.74609375, |
| "reward_std": 0.314453125, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.74609375, |
| "step": 134 |
| }, |
| { |
| "completion_length": 400.5, |
| "epoch": 0.018065034122842234, |
| "grad_norm": 0.14282457530498505, |
| "kl": 0.00047517273924313486, |
| "learning_rate": 9.024064171122995e-07, |
| "loss": -0.006, |
| "reward": -0.8359375, |
| "reward_std": 0.322265625, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.8359375, |
| "step": 135 |
| }, |
| { |
| "completion_length": 556.3333740234375, |
| "epoch": 0.01819884919041884, |
| "grad_norm": 0.09024691581726074, |
| "kl": 0.00043525476939976215, |
| "learning_rate": 9.090909090909091e-07, |
| "loss": -0.0013, |
| "reward": -1.4296875, |
| "reward_std": 0.60546875, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -1.4296875, |
| "step": 136 |
| }, |
| { |
| "completion_length": 301.66668701171875, |
| "epoch": 0.01833266425799545, |
| "grad_norm": 0.1422368437051773, |
| "kl": 0.0007189570460468531, |
| "learning_rate": 9.157754010695189e-07, |
| "loss": -0.0065, |
| "reward": -0.5859375, |
| "reward_std": 0.28125, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.5859375, |
| "step": 137 |
| }, |
| { |
| "completion_length": 586.0, |
| "epoch": 0.01846647932557206, |
| "grad_norm": 0.07946749031543732, |
| "kl": 0.0004385068896226585, |
| "learning_rate": 9.224598930481284e-07, |
| "loss": 0.0013, |
| "reward": -1.421875, |
| "reward_std": 0.341796875, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -1.421875, |
| "step": 138 |
| }, |
| { |
| "completion_length": 377.0, |
| "epoch": 0.01860029439314867, |
| "grad_norm": 0.12276607006788254, |
| "kl": 0.0007007961976341903, |
| "learning_rate": 9.29144385026738e-07, |
| "loss": -0.0027, |
| "reward": -0.9609375, |
| "reward_std": 0.48046875, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.9609375, |
| "step": 139 |
| }, |
| { |
| "completion_length": 323.8333435058594, |
| "epoch": 0.018734109460725276, |
| "grad_norm": 0.11141712218523026, |
| "kl": 0.0004957327037118375, |
| "learning_rate": 9.358288770053477e-07, |
| "loss": -0.0023, |
| "reward": -0.7109375, |
| "reward_std": 0.458984375, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.7109375, |
| "step": 140 |
| }, |
| { |
| "completion_length": 451.3333435058594, |
| "epoch": 0.018867924528301886, |
| "grad_norm": 0.08641522377729416, |
| "kl": 0.0004216538218315691, |
| "learning_rate": 9.425133689839573e-07, |
| "loss": -0.0001, |
| "reward": -0.89453125, |
| "reward_std": 0.33984375, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.89453125, |
| "step": 141 |
| }, |
| { |
| "completion_length": 373.66668701171875, |
| "epoch": 0.019001739595878496, |
| "grad_norm": 0.11334878951311111, |
| "kl": 0.0005491083720698953, |
| "learning_rate": 9.49197860962567e-07, |
| "loss": -0.0032, |
| "reward": -0.89453125, |
| "reward_std": 0.40625, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.89453125, |
| "step": 142 |
| }, |
| { |
| "completion_length": 365.66668701171875, |
| "epoch": 0.019135554663455106, |
| "grad_norm": 0.1251085102558136, |
| "kl": 0.0006641787476837635, |
| "learning_rate": 9.558823529411764e-07, |
| "loss": -0.0044, |
| "reward": -0.83984375, |
| "reward_std": 0.302734375, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.83984375, |
| "step": 143 |
| }, |
| { |
| "completion_length": 474.66668701171875, |
| "epoch": 0.019269369731031716, |
| "grad_norm": 0.09395861625671387, |
| "kl": 0.0006498050643131137, |
| "learning_rate": 9.625668449197862e-07, |
| "loss": 0.0017, |
| "reward": -1.1328125, |
| "reward_std": 0.408203125, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -1.1328125, |
| "step": 144 |
| }, |
| { |
| "completion_length": 323.3333435058594, |
| "epoch": 0.019403184798608322, |
| "grad_norm": 0.12278474867343903, |
| "kl": 0.0006131879054009914, |
| "learning_rate": 9.692513368983958e-07, |
| "loss": 0.0004, |
| "reward": -0.7265625, |
| "reward_std": 0.275390625, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.7265625, |
| "step": 145 |
| }, |
| { |
| "completion_length": 377.3333435058594, |
| "epoch": 0.019536999866184932, |
| "grad_norm": 0.11732782423496246, |
| "kl": 0.0005958870751783252, |
| "learning_rate": 9.759358288770054e-07, |
| "loss": -0.0029, |
| "reward": -0.83203125, |
| "reward_std": 0.5703125, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.83203125, |
| "step": 146 |
| }, |
| { |
| "completion_length": 440.5, |
| "epoch": 0.019670814933761542, |
| "grad_norm": 0.1868724673986435, |
| "kl": 0.000681176024954766, |
| "learning_rate": 9.82620320855615e-07, |
| "loss": -0.0068, |
| "reward": -1.09375, |
| "reward_std": 0.53125, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -1.09375, |
| "step": 147 |
| }, |
| { |
| "completion_length": 340.8333435058594, |
| "epoch": 0.019804630001338152, |
| "grad_norm": 0.1497308760881424, |
| "kl": 0.0005646012723445892, |
| "learning_rate": 9.893048128342248e-07, |
| "loss": -0.0049, |
| "reward": -0.8125, |
| "reward_std": 0.396484375, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.8125, |
| "step": 148 |
| }, |
| { |
| "completion_length": 570.8333740234375, |
| "epoch": 0.01993844506891476, |
| "grad_norm": 0.09082633256912231, |
| "kl": 0.0005208106595091522, |
| "learning_rate": 9.959893048128342e-07, |
| "loss": -0.0026, |
| "reward": -1.1328125, |
| "reward_std": 0.671875, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -1.1328125, |
| "step": 149 |
| }, |
| { |
| "completion_length": 381.3333435058594, |
| "epoch": 0.02007226013649137, |
| "grad_norm": 0.11392635107040405, |
| "kl": 0.0005461536347866058, |
| "learning_rate": 1.0026737967914438e-06, |
| "loss": -0.0045, |
| "reward": -0.65234375, |
| "reward_std": 0.46484375, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.65234375, |
| "step": 150 |
| }, |
| { |
| "completion_length": 512.8333740234375, |
| "epoch": 0.020206075204067978, |
| "grad_norm": 0.10393022745847702, |
| "kl": 0.0005140831926837564, |
| "learning_rate": 1.0093582887700537e-06, |
| "loss": 0.0034, |
| "reward": -1.3125, |
| "reward_std": 0.470703125, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -1.3125, |
| "step": 151 |
| }, |
| { |
| "completion_length": 352.8333435058594, |
| "epoch": 0.020339890271644588, |
| "grad_norm": 0.14165768027305603, |
| "kl": 0.000577162834815681, |
| "learning_rate": 1.0160427807486633e-06, |
| "loss": 0.0018, |
| "reward": -0.890625, |
| "reward_std": 0.375, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.890625, |
| "step": 152 |
| }, |
| { |
| "completion_length": 379.3333435058594, |
| "epoch": 0.020473705339221198, |
| "grad_norm": 0.14219383895397186, |
| "kl": 0.0006267136195674539, |
| "learning_rate": 1.0227272727272729e-06, |
| "loss": -0.0019, |
| "reward": -0.703125, |
| "reward_std": 0.484375, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.703125, |
| "step": 153 |
| }, |
| { |
| "completion_length": 433.0, |
| "epoch": 0.020607520406797804, |
| "grad_norm": 0.09045641869306564, |
| "kl": 0.0003349175094626844, |
| "learning_rate": 1.0294117647058825e-06, |
| "loss": 0.0128, |
| "reward": -1.1171875, |
| "reward_std": 0.240234375, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -1.1171875, |
| "step": 154 |
| }, |
| { |
| "completion_length": 348.5, |
| "epoch": 0.020741335474374414, |
| "grad_norm": 0.1472688764333725, |
| "kl": 0.0006852279184386134, |
| "learning_rate": 1.036096256684492e-06, |
| "loss": -0.0036, |
| "reward": -0.71875, |
| "reward_std": 0.279296875, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.71875, |
| "step": 155 |
| }, |
| { |
| "completion_length": 388.0, |
| "epoch": 0.020875150541951024, |
| "grad_norm": 0.14087940752506256, |
| "kl": 0.0006020927103236318, |
| "learning_rate": 1.0427807486631017e-06, |
| "loss": -0.0057, |
| "reward": -0.64453125, |
| "reward_std": 0.32421875, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.64453125, |
| "step": 156 |
| }, |
| { |
| "completion_length": 327.66668701171875, |
| "epoch": 0.021008965609527634, |
| "grad_norm": 0.13045720756053925, |
| "kl": 0.0005312262801453471, |
| "learning_rate": 1.0494652406417113e-06, |
| "loss": -0.0019, |
| "reward": -0.53125, |
| "reward_std": 0.53125, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.53125, |
| "step": 157 |
| }, |
| { |
| "completion_length": 409.5, |
| "epoch": 0.02114278067710424, |
| "grad_norm": 0.12158454209566116, |
| "kl": 0.0006615255842916667, |
| "learning_rate": 1.056149732620321e-06, |
| "loss": -0.0067, |
| "reward": -0.80078125, |
| "reward_std": 0.2490234375, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.80078125, |
| "step": 158 |
| }, |
| { |
| "completion_length": 530.6666870117188, |
| "epoch": 0.02127659574468085, |
| "grad_norm": 0.1100451648235321, |
| "kl": 0.0006079694721847773, |
| "learning_rate": 1.0628342245989305e-06, |
| "loss": -0.0006, |
| "reward": -1.5, |
| "reward_std": 0.64453125, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -1.5, |
| "step": 159 |
| }, |
| { |
| "completion_length": 365.0, |
| "epoch": 0.02141041081225746, |
| "grad_norm": 0.11980035901069641, |
| "kl": 0.0005896420334465802, |
| "learning_rate": 1.0695187165775401e-06, |
| "loss": -0.0011, |
| "reward": -0.69921875, |
| "reward_std": 0.4375, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.69921875, |
| "step": 160 |
| }, |
| { |
| "completion_length": 312.5, |
| "epoch": 0.02154422587983407, |
| "grad_norm": 0.14624665677547455, |
| "kl": 0.00077395373955369, |
| "learning_rate": 1.0762032085561497e-06, |
| "loss": 0.0041, |
| "reward": -0.54296875, |
| "reward_std": 0.3046875, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.54296875, |
| "step": 161 |
| }, |
| { |
| "completion_length": 391.8333435058594, |
| "epoch": 0.021678040947410677, |
| "grad_norm": 0.1249147579073906, |
| "kl": 0.0007619769312441349, |
| "learning_rate": 1.0828877005347595e-06, |
| "loss": -0.0054, |
| "reward": -0.875, |
| "reward_std": 0.361328125, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.875, |
| "step": 162 |
| }, |
| { |
| "completion_length": 630.6666870117188, |
| "epoch": 0.021811856014987287, |
| "grad_norm": 0.09878282248973846, |
| "kl": 0.0005383545067161322, |
| "learning_rate": 1.0895721925133691e-06, |
| "loss": 0.0016, |
| "reward": -1.59375, |
| "reward_std": 1.1171875, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -1.59375, |
| "step": 163 |
| }, |
| { |
| "completion_length": 238.1666717529297, |
| "epoch": 0.021945671082563897, |
| "grad_norm": 0.16415703296661377, |
| "kl": 0.0007143677212297916, |
| "learning_rate": 1.0962566844919787e-06, |
| "loss": 0.0052, |
| "reward": -0.34375, |
| "reward_std": 0.1376953125, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.34375, |
| "step": 164 |
| }, |
| { |
| "completion_length": 424.16668701171875, |
| "epoch": 0.022079486150140507, |
| "grad_norm": 0.12024425715208054, |
| "kl": 0.0004885084345005453, |
| "learning_rate": 1.1029411764705884e-06, |
| "loss": -0.0011, |
| "reward": -1.0234375, |
| "reward_std": 0.287109375, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -1.0234375, |
| "step": 165 |
| }, |
| { |
| "completion_length": 345.0, |
| "epoch": 0.022213301217717116, |
| "grad_norm": 0.13134251534938812, |
| "kl": 0.0005485338624566793, |
| "learning_rate": 1.109625668449198e-06, |
| "loss": 0.0041, |
| "reward": -0.640625, |
| "reward_std": 0.296875, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.640625, |
| "step": 166 |
| }, |
| { |
| "completion_length": 449.0, |
| "epoch": 0.022347116285293723, |
| "grad_norm": 0.13914933800697327, |
| "kl": 0.0005790984723716974, |
| "learning_rate": 1.1163101604278076e-06, |
| "loss": -0.0024, |
| "reward": -0.90625, |
| "reward_std": 0.43359375, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.90625, |
| "step": 167 |
| }, |
| { |
| "completion_length": 456.66668701171875, |
| "epoch": 0.022480931352870333, |
| "grad_norm": 0.11662891507148743, |
| "kl": 0.000677458185236901, |
| "learning_rate": 1.1229946524064172e-06, |
| "loss": -0.0081, |
| "reward": -1.21875, |
| "reward_std": 0.431640625, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -1.21875, |
| "step": 168 |
| }, |
| { |
| "completion_length": 338.66668701171875, |
| "epoch": 0.022614746420446943, |
| "grad_norm": 0.14155802130699158, |
| "kl": 0.0005925593432039022, |
| "learning_rate": 1.1296791443850268e-06, |
| "loss": 0.0, |
| "reward": -0.71484375, |
| "reward_std": 0.40625, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.71484375, |
| "step": 169 |
| }, |
| { |
| "completion_length": 265.8333435058594, |
| "epoch": 0.022748561488023553, |
| "grad_norm": 0.16593119502067566, |
| "kl": 0.0005104307783767581, |
| "learning_rate": 1.1363636363636364e-06, |
| "loss": 0.0003, |
| "reward": -0.48046875, |
| "reward_std": 0.2353515625, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.48046875, |
| "step": 170 |
| }, |
| { |
| "completion_length": 461.3333435058594, |
| "epoch": 0.02288237655560016, |
| "grad_norm": 0.1283525973558426, |
| "kl": 0.0006034953985363245, |
| "learning_rate": 1.143048128342246e-06, |
| "loss": -0.0006, |
| "reward": -0.91015625, |
| "reward_std": 0.55859375, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.91015625, |
| "step": 171 |
| }, |
| { |
| "completion_length": 567.3333740234375, |
| "epoch": 0.02301619162317677, |
| "grad_norm": 0.09264618158340454, |
| "kl": 0.00039277609903365374, |
| "learning_rate": 1.1497326203208558e-06, |
| "loss": 0.0011, |
| "reward": -1.296875, |
| "reward_std": 0.423828125, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -1.296875, |
| "step": 172 |
| }, |
| { |
| "completion_length": 442.66668701171875, |
| "epoch": 0.02315000669075338, |
| "grad_norm": 0.06924661993980408, |
| "kl": 0.0002805929980240762, |
| "learning_rate": 1.1564171122994654e-06, |
| "loss": 0.0049, |
| "reward": -1.140625, |
| "reward_std": 0.130859375, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -1.140625, |
| "step": 173 |
| }, |
| { |
| "completion_length": 321.5, |
| "epoch": 0.02328382175832999, |
| "grad_norm": 0.15098147094249725, |
| "kl": 0.0005829234141856432, |
| "learning_rate": 1.163101604278075e-06, |
| "loss": 0.0117, |
| "reward": -0.58203125, |
| "reward_std": 0.1455078125, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.58203125, |
| "step": 174 |
| }, |
| { |
| "completion_length": 416.3333435058594, |
| "epoch": 0.0234176368259066, |
| "grad_norm": 0.11847102642059326, |
| "kl": 0.0006443657330237329, |
| "learning_rate": 1.1697860962566846e-06, |
| "loss": -0.0044, |
| "reward": -0.8671875, |
| "reward_std": 0.32421875, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.8671875, |
| "step": 175 |
| }, |
| { |
| "completion_length": 371.3333435058594, |
| "epoch": 0.023551451893483205, |
| "grad_norm": 0.1091599240899086, |
| "kl": 0.0004576949286274612, |
| "learning_rate": 1.1764705882352942e-06, |
| "loss": -0.0054, |
| "reward": -0.73828125, |
| "reward_std": 0.296875, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.73828125, |
| "step": 176 |
| }, |
| { |
| "completion_length": 438.16668701171875, |
| "epoch": 0.023685266961059815, |
| "grad_norm": 0.10089421272277832, |
| "kl": 0.0004992609028704464, |
| "learning_rate": 1.1831550802139038e-06, |
| "loss": 0.0029, |
| "reward": -0.55859375, |
| "reward_std": 0.482421875, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.55859375, |
| "step": 177 |
| }, |
| { |
| "completion_length": 509.16668701171875, |
| "epoch": 0.023819082028636425, |
| "grad_norm": 0.10792536288499832, |
| "kl": 0.000662465114146471, |
| "learning_rate": 1.1898395721925134e-06, |
| "loss": -0.0029, |
| "reward": -1.21875, |
| "reward_std": 0.294921875, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -1.21875, |
| "step": 178 |
| }, |
| { |
| "completion_length": 347.5, |
| "epoch": 0.023952897096213035, |
| "grad_norm": 0.2220248132944107, |
| "kl": 0.0006923056207597256, |
| "learning_rate": 1.1965240641711233e-06, |
| "loss": -0.0063, |
| "reward": -0.78125, |
| "reward_std": 0.29296875, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.78125, |
| "step": 179 |
| }, |
| { |
| "completion_length": 470.16668701171875, |
| "epoch": 0.02408671216378964, |
| "grad_norm": 0.09262672066688538, |
| "kl": 0.00041312514804303646, |
| "learning_rate": 1.2032085561497326e-06, |
| "loss": -0.0037, |
| "reward": -1.2109375, |
| "reward_std": 0.458984375, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -1.2109375, |
| "step": 180 |
| }, |
| { |
| "completion_length": 484.0, |
| "epoch": 0.02422052723136625, |
| "grad_norm": 0.11066435277462006, |
| "kl": 0.0005693985149264336, |
| "learning_rate": 1.2098930481283423e-06, |
| "loss": -0.0112, |
| "reward": -1.109375, |
| "reward_std": 0.326171875, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -1.109375, |
| "step": 181 |
| }, |
| { |
| "completion_length": 317.8333435058594, |
| "epoch": 0.02435434229894286, |
| "grad_norm": 0.1172327920794487, |
| "kl": 0.0005950028426013887, |
| "learning_rate": 1.216577540106952e-06, |
| "loss": -0.0006, |
| "reward": -0.6328125, |
| "reward_std": 0.34765625, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.6328125, |
| "step": 182 |
| }, |
| { |
| "completion_length": 339.3333435058594, |
| "epoch": 0.02448815736651947, |
| "grad_norm": 0.10278620570898056, |
| "kl": 0.00045496373786590993, |
| "learning_rate": 1.2232620320855617e-06, |
| "loss": -0.0003, |
| "reward": -0.671875, |
| "reward_std": 0.4140625, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.671875, |
| "step": 183 |
| }, |
| { |
| "completion_length": 497.3333435058594, |
| "epoch": 0.02462197243409608, |
| "grad_norm": 0.1089860200881958, |
| "kl": 0.0006303495611064136, |
| "learning_rate": 1.2299465240641713e-06, |
| "loss": -0.0019, |
| "reward": -1.359375, |
| "reward_std": 1.3984375, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -1.359375, |
| "step": 184 |
| }, |
| { |
| "completion_length": 304.66668701171875, |
| "epoch": 0.024755787501672687, |
| "grad_norm": 0.14699524641036987, |
| "kl": 0.0006107437657192349, |
| "learning_rate": 1.2366310160427809e-06, |
| "loss": -0.0026, |
| "reward": -0.51953125, |
| "reward_std": 0.3359375, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.51953125, |
| "step": 185 |
| }, |
| { |
| "completion_length": 360.5, |
| "epoch": 0.024889602569249297, |
| "grad_norm": 0.1235690489411354, |
| "kl": 0.000642502389382571, |
| "learning_rate": 1.2433155080213905e-06, |
| "loss": 0.0, |
| "reward": -0.890625, |
| "reward_std": 0.1982421875, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.890625, |
| "step": 186 |
| }, |
| { |
| "completion_length": 405.8333435058594, |
| "epoch": 0.025023417636825907, |
| "grad_norm": 0.13261531293392181, |
| "kl": 0.0007065389072522521, |
| "learning_rate": 1.25e-06, |
| "loss": 0.0017, |
| "reward": -0.828125, |
| "reward_std": 0.427734375, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.828125, |
| "step": 187 |
| }, |
| { |
| "completion_length": 355.8333435058594, |
| "epoch": 0.025157232704402517, |
| "grad_norm": 0.11836958676576614, |
| "kl": 0.0005762047949247062, |
| "learning_rate": 1.2566844919786097e-06, |
| "loss": 0.001, |
| "reward": -0.53515625, |
| "reward_std": 0.3046875, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.53515625, |
| "step": 188 |
| }, |
| { |
| "completion_length": 295.0, |
| "epoch": 0.025291047771979124, |
| "grad_norm": 0.15814770758152008, |
| "kl": 0.000565587542951107, |
| "learning_rate": 1.2633689839572193e-06, |
| "loss": 0.0071, |
| "reward": -0.53515625, |
| "reward_std": 0.17578125, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.53515625, |
| "step": 189 |
| }, |
| { |
| "completion_length": 317.8333435058594, |
| "epoch": 0.025424862839555733, |
| "grad_norm": 0.2327018529176712, |
| "kl": 0.0006943491753190756, |
| "learning_rate": 1.2700534759358291e-06, |
| "loss": -0.0019, |
| "reward": -0.640625, |
| "reward_std": 0.375, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.640625, |
| "step": 190 |
| }, |
| { |
| "completion_length": 345.66668701171875, |
| "epoch": 0.025558677907132343, |
| "grad_norm": 0.15127608180046082, |
| "kl": 0.0005449084565043449, |
| "learning_rate": 1.2767379679144387e-06, |
| "loss": 0.0039, |
| "reward": -0.6875, |
| "reward_std": 0.25, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.6875, |
| "step": 191 |
| }, |
| { |
| "completion_length": 372.0, |
| "epoch": 0.025692492974708953, |
| "grad_norm": 0.1675024777650833, |
| "kl": 0.0006789276376366615, |
| "learning_rate": 1.2834224598930483e-06, |
| "loss": 0.0001, |
| "reward": -0.9453125, |
| "reward_std": 0.486328125, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.9453125, |
| "step": 192 |
| }, |
| { |
| "completion_length": 260.8333435058594, |
| "epoch": 0.02582630804228556, |
| "grad_norm": 0.17227157950401306, |
| "kl": 0.0005113824736326933, |
| "learning_rate": 1.2901069518716577e-06, |
| "loss": -0.0011, |
| "reward": -0.41796875, |
| "reward_std": 0.443359375, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.41796875, |
| "step": 193 |
| }, |
| { |
| "completion_length": 458.8333435058594, |
| "epoch": 0.02596012310986217, |
| "grad_norm": 0.13124048709869385, |
| "kl": 0.000769376871176064, |
| "learning_rate": 1.2967914438502673e-06, |
| "loss": 0.0117, |
| "reward": -1.265625, |
| "reward_std": 0.33984375, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -1.265625, |
| "step": 194 |
| }, |
| { |
| "completion_length": 482.8333435058594, |
| "epoch": 0.02609393817743878, |
| "grad_norm": 0.11438746005296707, |
| "kl": 0.0005745739908888936, |
| "learning_rate": 1.303475935828877e-06, |
| "loss": 0.0003, |
| "reward": -1.2421875, |
| "reward_std": 0.625, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -1.2421875, |
| "step": 195 |
| }, |
| { |
| "completion_length": 337.3333435058594, |
| "epoch": 0.02622775324501539, |
| "grad_norm": 0.12187661230564117, |
| "kl": 0.0005417331121861935, |
| "learning_rate": 1.3101604278074868e-06, |
| "loss": 0.0001, |
| "reward": -0.63671875, |
| "reward_std": 0.58984375, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.63671875, |
| "step": 196 |
| }, |
| { |
| "completion_length": 304.8333435058594, |
| "epoch": 0.026361568312592, |
| "grad_norm": 0.18323078751564026, |
| "kl": 0.001144442823715508, |
| "learning_rate": 1.3168449197860964e-06, |
| "loss": 0.0054, |
| "reward": -0.66015625, |
| "reward_std": 0.2099609375, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.66015625, |
| "step": 197 |
| }, |
| { |
| "completion_length": 367.3333435058594, |
| "epoch": 0.026495383380168606, |
| "grad_norm": 0.13606765866279602, |
| "kl": 0.0006207119440659881, |
| "learning_rate": 1.323529411764706e-06, |
| "loss": 0.0093, |
| "reward": -0.8359375, |
| "reward_std": 0.2265625, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.8359375, |
| "step": 198 |
| }, |
| { |
| "completion_length": 367.8333435058594, |
| "epoch": 0.026629198447745216, |
| "grad_norm": 0.13173972070217133, |
| "kl": 0.0006991230184212327, |
| "learning_rate": 1.3302139037433156e-06, |
| "loss": -0.0032, |
| "reward": -0.66015625, |
| "reward_std": 0.482421875, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.66015625, |
| "step": 199 |
| }, |
| { |
| "completion_length": 516.8333740234375, |
| "epoch": 0.026763013515321826, |
| "grad_norm": 0.113725446164608, |
| "kl": 0.0006071855314075947, |
| "learning_rate": 1.3368983957219254e-06, |
| "loss": 0.0021, |
| "reward": -1.3046875, |
| "reward_std": 0.53125, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -1.3046875, |
| "step": 200 |
| }, |
| { |
| "completion_length": 505.0, |
| "epoch": 0.026896828582898435, |
| "grad_norm": 0.11239483207464218, |
| "kl": 0.0006069260416552424, |
| "learning_rate": 1.343582887700535e-06, |
| "loss": 0.0026, |
| "reward": -1.21875, |
| "reward_std": 0.66796875, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -1.21875, |
| "step": 201 |
| }, |
| { |
| "completion_length": 398.0, |
| "epoch": 0.027030643650475042, |
| "grad_norm": 0.11724357306957245, |
| "kl": 0.0004885403905063868, |
| "learning_rate": 1.3502673796791446e-06, |
| "loss": -0.0039, |
| "reward": -0.875, |
| "reward_std": 0.4609375, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.875, |
| "step": 202 |
| }, |
| { |
| "completion_length": 549.0, |
| "epoch": 0.027164458718051652, |
| "grad_norm": 0.1066315770149231, |
| "kl": 0.0005430117598734796, |
| "learning_rate": 1.356951871657754e-06, |
| "loss": -0.0029, |
| "reward": -1.171875, |
| "reward_std": 0.48828125, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -1.171875, |
| "step": 203 |
| }, |
| { |
| "completion_length": 565.5, |
| "epoch": 0.02729827378562826, |
| "grad_norm": 0.11152154952287674, |
| "kl": 0.0006415534298866987, |
| "learning_rate": 1.3636363636363636e-06, |
| "loss": 0.0046, |
| "reward": -1.453125, |
| "reward_std": 0.6796875, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -1.453125, |
| "step": 204 |
| }, |
| { |
| "completion_length": 322.16668701171875, |
| "epoch": 0.02743208885320487, |
| "grad_norm": 0.14083048701286316, |
| "kl": 0.0005029004532843828, |
| "learning_rate": 1.3703208556149732e-06, |
| "loss": -0.0036, |
| "reward": -0.48046875, |
| "reward_std": 0.2490234375, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.48046875, |
| "step": 205 |
| }, |
| { |
| "completion_length": 234.1666717529297, |
| "epoch": 0.02756590392078148, |
| "grad_norm": 0.18288779258728027, |
| "kl": 0.0005245095817372203, |
| "learning_rate": 1.377005347593583e-06, |
| "loss": -0.0001, |
| "reward": -0.287109375, |
| "reward_std": 0.330078125, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.287109375, |
| "step": 206 |
| }, |
| { |
| "completion_length": 414.3333435058594, |
| "epoch": 0.027699718988358088, |
| "grad_norm": 0.11854821443557739, |
| "kl": 0.0006570084951817989, |
| "learning_rate": 1.3836898395721926e-06, |
| "loss": 0.0049, |
| "reward": -0.9140625, |
| "reward_std": 0.376953125, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.9140625, |
| "step": 207 |
| }, |
| { |
| "completion_length": 414.16668701171875, |
| "epoch": 0.027833534055934698, |
| "grad_norm": 0.13677829504013062, |
| "kl": 0.000780851929448545, |
| "learning_rate": 1.3903743315508022e-06, |
| "loss": -0.005, |
| "reward": -0.9296875, |
| "reward_std": 0.35546875, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.9296875, |
| "step": 208 |
| }, |
| { |
| "completion_length": 357.16668701171875, |
| "epoch": 0.027967349123511308, |
| "grad_norm": 0.10833487659692764, |
| "kl": 0.0004263838636688888, |
| "learning_rate": 1.3970588235294119e-06, |
| "loss": -0.001, |
| "reward": -0.44921875, |
| "reward_std": 0.44921875, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.44921875, |
| "step": 209 |
| }, |
| { |
| "completion_length": 277.3333435058594, |
| "epoch": 0.028101164191087918, |
| "grad_norm": 0.18873073160648346, |
| "kl": 0.0007301772711798549, |
| "learning_rate": 1.4037433155080215e-06, |
| "loss": -0.0014, |
| "reward": -0.5703125, |
| "reward_std": 0.392578125, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.5703125, |
| "step": 210 |
| }, |
| { |
| "completion_length": 422.16668701171875, |
| "epoch": 0.028234979258664524, |
| "grad_norm": 0.140394926071167, |
| "kl": 0.0007065697573125362, |
| "learning_rate": 1.4104278074866313e-06, |
| "loss": 0.0013, |
| "reward": -1.2109375, |
| "reward_std": 0.88671875, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -1.2109375, |
| "step": 211 |
| }, |
| { |
| "completion_length": 352.16668701171875, |
| "epoch": 0.028368794326241134, |
| "grad_norm": 0.12692704796791077, |
| "kl": 0.0005341452197171748, |
| "learning_rate": 1.4171122994652409e-06, |
| "loss": 0.0041, |
| "reward": -0.55078125, |
| "reward_std": 0.515625, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.55078125, |
| "step": 212 |
| }, |
| { |
| "completion_length": 412.8333435058594, |
| "epoch": 0.028502609393817744, |
| "grad_norm": 0.1450049728155136, |
| "kl": 0.0005619659787043929, |
| "learning_rate": 1.4237967914438503e-06, |
| "loss": -0.0049, |
| "reward": -0.828125, |
| "reward_std": 0.400390625, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.828125, |
| "step": 213 |
| }, |
| { |
| "completion_length": 385.0, |
| "epoch": 0.028636424461394354, |
| "grad_norm": 0.11151020228862762, |
| "kl": 0.000545224582310766, |
| "learning_rate": 1.4304812834224599e-06, |
| "loss": -0.0051, |
| "reward": -0.8671875, |
| "reward_std": 0.236328125, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.8671875, |
| "step": 214 |
| }, |
| { |
| "completion_length": 263.8333435058594, |
| "epoch": 0.028770239528970964, |
| "grad_norm": 0.13172751665115356, |
| "kl": 0.000556406972464174, |
| "learning_rate": 1.4371657754010695e-06, |
| "loss": -0.0003, |
| "reward": -0.419921875, |
| "reward_std": 0.36328125, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.419921875, |
| "step": 215 |
| }, |
| { |
| "completion_length": 312.3333435058594, |
| "epoch": 0.02890405459654757, |
| "grad_norm": 0.14570048451423645, |
| "kl": 0.000702905235812068, |
| "learning_rate": 1.4438502673796793e-06, |
| "loss": 0.0041, |
| "reward": -0.63671875, |
| "reward_std": 0.306640625, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.63671875, |
| "step": 216 |
| }, |
| { |
| "completion_length": 342.8333435058594, |
| "epoch": 0.02903786966412418, |
| "grad_norm": 0.13387592136859894, |
| "kl": 0.0006144473445601761, |
| "learning_rate": 1.450534759358289e-06, |
| "loss": -0.0003, |
| "reward": -0.75, |
| "reward_std": 0.271484375, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.75, |
| "step": 217 |
| }, |
| { |
| "completion_length": 373.3333435058594, |
| "epoch": 0.02917168473170079, |
| "grad_norm": 0.14034722745418549, |
| "kl": 0.0006035550031810999, |
| "learning_rate": 1.4572192513368985e-06, |
| "loss": -0.0052, |
| "reward": -0.6796875, |
| "reward_std": 0.35546875, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.6796875, |
| "step": 218 |
| }, |
| { |
| "completion_length": 370.5, |
| "epoch": 0.0293054997992774, |
| "grad_norm": 0.17203155159950256, |
| "kl": 0.000694015237968415, |
| "learning_rate": 1.4639037433155081e-06, |
| "loss": -0.0037, |
| "reward": -0.88671875, |
| "reward_std": 0.291015625, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.88671875, |
| "step": 219 |
| }, |
| { |
| "completion_length": 573.0, |
| "epoch": 0.029439314866854006, |
| "grad_norm": 0.08722779154777527, |
| "kl": 0.0005780010833404958, |
| "learning_rate": 1.4705882352941177e-06, |
| "loss": -0.0014, |
| "reward": -1.34375, |
| "reward_std": 0.76953125, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -1.34375, |
| "step": 220 |
| }, |
| { |
| "completion_length": 342.66668701171875, |
| "epoch": 0.029573129934430616, |
| "grad_norm": 0.15573082864284515, |
| "kl": 0.0006010913057252765, |
| "learning_rate": 1.4772727272727275e-06, |
| "loss": 0.0088, |
| "reward": -0.8125, |
| "reward_std": 0.265625, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.8125, |
| "step": 221 |
| }, |
| { |
| "completion_length": 475.16668701171875, |
| "epoch": 0.029706945002007226, |
| "grad_norm": 0.10412527620792389, |
| "kl": 0.000707049563061446, |
| "learning_rate": 1.4839572192513372e-06, |
| "loss": 0.0038, |
| "reward": -1.328125, |
| "reward_std": 0.40625, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -1.328125, |
| "step": 222 |
| }, |
| { |
| "completion_length": 379.8333435058594, |
| "epoch": 0.029840760069583836, |
| "grad_norm": 0.1168096736073494, |
| "kl": 0.0006801115232519805, |
| "learning_rate": 1.4906417112299468e-06, |
| "loss": 0.0033, |
| "reward": -0.80859375, |
| "reward_std": 0.49609375, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.80859375, |
| "step": 223 |
| }, |
| { |
| "completion_length": 217.6666717529297, |
| "epoch": 0.029974575137160443, |
| "grad_norm": 0.17121706902980804, |
| "kl": 0.0008745932718738914, |
| "learning_rate": 1.4973262032085562e-06, |
| "loss": -0.0019, |
| "reward": -0.2578125, |
| "reward_std": 0.35546875, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.2578125, |
| "step": 224 |
| }, |
| { |
| "completion_length": 233.83334350585938, |
| "epoch": 0.030108390204737052, |
| "grad_norm": 0.18835073709487915, |
| "kl": 0.0008036958752200007, |
| "learning_rate": 1.5040106951871658e-06, |
| "loss": -0.0013, |
| "reward": -0.396484375, |
| "reward_std": 0.361328125, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.396484375, |
| "step": 225 |
| }, |
| { |
| "completion_length": 301.3333435058594, |
| "epoch": 0.030242205272313662, |
| "grad_norm": 0.18656164407730103, |
| "kl": 0.0008596427505835891, |
| "learning_rate": 1.5106951871657754e-06, |
| "loss": -0.0006, |
| "reward": -0.5625, |
| "reward_std": 0.259765625, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.5625, |
| "step": 226 |
| }, |
| { |
| "completion_length": 398.3333435058594, |
| "epoch": 0.030376020339890272, |
| "grad_norm": 0.14680197834968567, |
| "kl": 0.0007448707474395633, |
| "learning_rate": 1.5173796791443852e-06, |
| "loss": 0.0047, |
| "reward": -0.85546875, |
| "reward_std": 0.361328125, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.85546875, |
| "step": 227 |
| }, |
| { |
| "completion_length": 388.8333435058594, |
| "epoch": 0.030509835407466882, |
| "grad_norm": 0.1266445517539978, |
| "kl": 0.0006551437545567751, |
| "learning_rate": 1.5240641711229948e-06, |
| "loss": -0.0011, |
| "reward": -0.73046875, |
| "reward_std": 0.220703125, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.73046875, |
| "step": 228 |
| }, |
| { |
| "completion_length": 371.8333435058594, |
| "epoch": 0.03064365047504349, |
| "grad_norm": 0.18683753907680511, |
| "kl": 0.0006818679976277053, |
| "learning_rate": 1.5307486631016044e-06, |
| "loss": -0.014, |
| "reward": -0.875, |
| "reward_std": 0.1455078125, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.875, |
| "step": 229 |
| }, |
| { |
| "completion_length": 444.5, |
| "epoch": 0.0307774655426201, |
| "grad_norm": 0.1100480780005455, |
| "kl": 0.00042218127055093646, |
| "learning_rate": 1.537433155080214e-06, |
| "loss": 0.0029, |
| "reward": -1.0390625, |
| "reward_std": 0.2353515625, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -1.0390625, |
| "step": 230 |
| }, |
| { |
| "completion_length": 485.8333435058594, |
| "epoch": 0.03091128061019671, |
| "grad_norm": 0.12457609176635742, |
| "kl": 0.0006653472664766014, |
| "learning_rate": 1.5441176470588238e-06, |
| "loss": 0.0025, |
| "reward": -1.375, |
| "reward_std": 0.99609375, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -1.375, |
| "step": 231 |
| }, |
| { |
| "completion_length": 522.5, |
| "epoch": 0.03104509567777332, |
| "grad_norm": 0.09300174564123154, |
| "kl": 0.0004242811701260507, |
| "learning_rate": 1.5508021390374334e-06, |
| "loss": -0.0009, |
| "reward": -0.890625, |
| "reward_std": 0.392578125, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.890625, |
| "step": 232 |
| }, |
| { |
| "completion_length": 325.3333435058594, |
| "epoch": 0.031178910745349925, |
| "grad_norm": 0.17127934098243713, |
| "kl": 0.0009391449275426567, |
| "learning_rate": 1.557486631016043e-06, |
| "loss": 0.0012, |
| "reward": -0.70703125, |
| "reward_std": 0.1376953125, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.70703125, |
| "step": 233 |
| }, |
| { |
| "completion_length": 309.5, |
| "epoch": 0.03131272581292654, |
| "grad_norm": 0.11277639120817184, |
| "kl": 0.0005117826513014734, |
| "learning_rate": 1.5641711229946524e-06, |
| "loss": -0.0065, |
| "reward": -0.56640625, |
| "reward_std": 0.1513671875, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.56640625, |
| "step": 234 |
| }, |
| { |
| "completion_length": 359.66668701171875, |
| "epoch": 0.031446540880503145, |
| "grad_norm": 0.14879803359508514, |
| "kl": 0.0008924457943066955, |
| "learning_rate": 1.570855614973262e-06, |
| "loss": -0.0051, |
| "reward": -0.9140625, |
| "reward_std": 0.44921875, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.9140625, |
| "step": 235 |
| }, |
| { |
| "completion_length": 402.3333435058594, |
| "epoch": 0.03158035594807975, |
| "grad_norm": 0.10776454210281372, |
| "kl": 0.0005682529299519956, |
| "learning_rate": 1.5775401069518716e-06, |
| "loss": -0.0023, |
| "reward": -0.74609375, |
| "reward_std": 0.314453125, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.74609375, |
| "step": 236 |
| }, |
| { |
| "completion_length": 478.16668701171875, |
| "epoch": 0.031714171015656364, |
| "grad_norm": 0.10001283884048462, |
| "kl": 0.0004951292648911476, |
| "learning_rate": 1.5842245989304815e-06, |
| "loss": -0.0045, |
| "reward": -1.0859375, |
| "reward_std": 0.458984375, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -1.0859375, |
| "step": 237 |
| }, |
| { |
| "completion_length": 356.3333435058594, |
| "epoch": 0.03184798608323297, |
| "grad_norm": 0.18665075302124023, |
| "kl": 0.0009052582900039852, |
| "learning_rate": 1.590909090909091e-06, |
| "loss": 0.0032, |
| "reward": -0.8359375, |
| "reward_std": 0.369140625, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.8359375, |
| "step": 238 |
| }, |
| { |
| "completion_length": 570.0, |
| "epoch": 0.031981801150809584, |
| "grad_norm": 0.12214533984661102, |
| "kl": 0.0008592414669692516, |
| "learning_rate": 1.5975935828877007e-06, |
| "loss": -0.0031, |
| "reward": -1.78125, |
| "reward_std": 0.4921875, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -1.78125, |
| "step": 239 |
| }, |
| { |
| "completion_length": 619.8333740234375, |
| "epoch": 0.03211561621838619, |
| "grad_norm": 0.1160222515463829, |
| "kl": 0.0008299415349029005, |
| "learning_rate": 1.6042780748663103e-06, |
| "loss": 0.0026, |
| "reward": -1.96875, |
| "reward_std": 0.66796875, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -1.96875, |
| "step": 240 |
| }, |
| { |
| "completion_length": 533.1666870117188, |
| "epoch": 0.0322494312859628, |
| "grad_norm": 0.08083510398864746, |
| "kl": 0.0004741963930428028, |
| "learning_rate": 1.6109625668449199e-06, |
| "loss": -0.0167, |
| "reward": -1.109375, |
| "reward_std": 0.23828125, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -1.109375, |
| "step": 241 |
| }, |
| { |
| "completion_length": 375.3333435058594, |
| "epoch": 0.03238324635353941, |
| "grad_norm": 0.15658938884735107, |
| "kl": 0.0007425328949466348, |
| "learning_rate": 1.6176470588235297e-06, |
| "loss": -0.0074, |
| "reward": -0.9140625, |
| "reward_std": 0.2412109375, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.9140625, |
| "step": 242 |
| }, |
| { |
| "completion_length": 456.66668701171875, |
| "epoch": 0.03251706142111602, |
| "grad_norm": 0.14069658517837524, |
| "kl": 0.0008559181587770581, |
| "learning_rate": 1.6243315508021393e-06, |
| "loss": 0.0035, |
| "reward": -1.3515625, |
| "reward_std": 0.390625, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -1.3515625, |
| "step": 243 |
| }, |
| { |
| "completion_length": 294.16668701171875, |
| "epoch": 0.03265087648869262, |
| "grad_norm": 0.17595937848091125, |
| "kl": 0.0008926563896238804, |
| "learning_rate": 1.631016042780749e-06, |
| "loss": -0.0018, |
| "reward": -0.671875, |
| "reward_std": 0.4140625, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.671875, |
| "step": 244 |
| }, |
| { |
| "completion_length": 489.3333435058594, |
| "epoch": 0.03278469155626924, |
| "grad_norm": 0.1979399472475052, |
| "kl": 0.001042112591676414, |
| "learning_rate": 1.6377005347593583e-06, |
| "loss": -0.0008, |
| "reward": -1.359375, |
| "reward_std": 0.57421875, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -1.359375, |
| "step": 245 |
| }, |
| { |
| "completion_length": 416.8333435058594, |
| "epoch": 0.03291850662384584, |
| "grad_norm": 0.12472230195999146, |
| "kl": 0.0007299688877537847, |
| "learning_rate": 1.644385026737968e-06, |
| "loss": -0.0026, |
| "reward": -0.75, |
| "reward_std": 0.68359375, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.75, |
| "step": 246 |
| }, |
| { |
| "completion_length": 333.0, |
| "epoch": 0.03305232169142246, |
| "grad_norm": 0.17285002768039703, |
| "kl": 0.0012012843508273363, |
| "learning_rate": 1.6510695187165775e-06, |
| "loss": -0.0048, |
| "reward": -0.640625, |
| "reward_std": 0.28515625, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.640625, |
| "step": 247 |
| }, |
| { |
| "completion_length": 333.0, |
| "epoch": 0.03318613675899906, |
| "grad_norm": 0.12894539535045624, |
| "kl": 0.0008531633648090065, |
| "learning_rate": 1.6577540106951873e-06, |
| "loss": 0.0006, |
| "reward": -0.515625, |
| "reward_std": 0.59765625, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.515625, |
| "step": 248 |
| }, |
| { |
| "completion_length": 403.66668701171875, |
| "epoch": 0.03331995182657567, |
| "grad_norm": 0.17463049292564392, |
| "kl": 0.0010805390775203705, |
| "learning_rate": 1.664438502673797e-06, |
| "loss": 0.0003, |
| "reward": -0.61328125, |
| "reward_std": 0.7109375, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.61328125, |
| "step": 249 |
| }, |
| { |
| "completion_length": 513.5, |
| "epoch": 0.03345376689415228, |
| "grad_norm": 0.12753607332706451, |
| "kl": 0.0006255035405047238, |
| "learning_rate": 1.6711229946524065e-06, |
| "loss": 0.002, |
| "reward": -1.296875, |
| "reward_std": 0.6640625, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -1.296875, |
| "step": 250 |
| }, |
| { |
| "completion_length": 338.5, |
| "epoch": 0.03358758196172889, |
| "grad_norm": 0.1846907138824463, |
| "kl": 0.0011831402080133557, |
| "learning_rate": 1.6778074866310161e-06, |
| "loss": 0.0007, |
| "reward": -0.7578125, |
| "reward_std": 0.33984375, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.7578125, |
| "step": 251 |
| }, |
| { |
| "completion_length": 260.16668701171875, |
| "epoch": 0.0337213970293055, |
| "grad_norm": 0.17691083252429962, |
| "kl": 0.0010596700012683868, |
| "learning_rate": 1.684491978609626e-06, |
| "loss": -0.0024, |
| "reward": -0.51953125, |
| "reward_std": 0.287109375, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.51953125, |
| "step": 252 |
| }, |
| { |
| "completion_length": 429.66668701171875, |
| "epoch": 0.03385521209688211, |
| "grad_norm": 0.1275603324174881, |
| "kl": 0.0007852836861275136, |
| "learning_rate": 1.6911764705882356e-06, |
| "loss": 0.0027, |
| "reward": -1.015625, |
| "reward_std": 0.267578125, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -1.015625, |
| "step": 253 |
| }, |
| { |
| "completion_length": 318.66668701171875, |
| "epoch": 0.033989027164458716, |
| "grad_norm": 0.14481216669082642, |
| "kl": 0.0007643938879482448, |
| "learning_rate": 1.6978609625668452e-06, |
| "loss": -0.0063, |
| "reward": -0.58203125, |
| "reward_std": 0.2431640625, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.58203125, |
| "step": 254 |
| }, |
| { |
| "completion_length": 398.66668701171875, |
| "epoch": 0.03412284223203533, |
| "grad_norm": 0.23024463653564453, |
| "kl": 0.0010699962731450796, |
| "learning_rate": 1.7045454545454546e-06, |
| "loss": -0.0022, |
| "reward": -0.90234375, |
| "reward_std": 0.61328125, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.90234375, |
| "step": 255 |
| }, |
| { |
| "completion_length": 448.16668701171875, |
| "epoch": 0.034256657299611935, |
| "grad_norm": 0.11663252115249634, |
| "kl": 0.0009288216824643314, |
| "learning_rate": 1.7112299465240642e-06, |
| "loss": 0.0009, |
| "reward": -1.0546875, |
| "reward_std": 0.7734375, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -1.0546875, |
| "step": 256 |
| }, |
| { |
| "completion_length": 309.8333435058594, |
| "epoch": 0.03439047236718855, |
| "grad_norm": 0.16083866357803345, |
| "kl": 0.000840982305817306, |
| "learning_rate": 1.7179144385026738e-06, |
| "loss": -0.003, |
| "reward": -0.6015625, |
| "reward_std": 0.306640625, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.6015625, |
| "step": 257 |
| }, |
| { |
| "completion_length": 390.3333435058594, |
| "epoch": 0.034524287434765155, |
| "grad_norm": 0.1549844890832901, |
| "kl": 0.0006463018362410367, |
| "learning_rate": 1.7245989304812836e-06, |
| "loss": -0.0032, |
| "reward": -0.76953125, |
| "reward_std": 0.291015625, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.76953125, |
| "step": 258 |
| }, |
| { |
| "completion_length": 330.8333435058594, |
| "epoch": 0.03465810250234176, |
| "grad_norm": 0.16839320957660675, |
| "kl": 0.001401308341883123, |
| "learning_rate": 1.7312834224598932e-06, |
| "loss": 0.0101, |
| "reward": -0.765625, |
| "reward_std": 0.2158203125, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.765625, |
| "step": 259 |
| }, |
| { |
| "completion_length": 459.8333435058594, |
| "epoch": 0.034791917569918375, |
| "grad_norm": 0.10837510973215103, |
| "kl": 0.0012012351071462035, |
| "learning_rate": 1.7379679144385028e-06, |
| "loss": 0.0023, |
| "reward": -0.8828125, |
| "reward_std": 0.271484375, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.8828125, |
| "step": 260 |
| }, |
| { |
| "completion_length": 508.0, |
| "epoch": 0.03492573263749498, |
| "grad_norm": 0.10151364654302597, |
| "kl": 0.0011522852582857013, |
| "learning_rate": 1.7446524064171124e-06, |
| "loss": 0.0002, |
| "reward": -1.1484375, |
| "reward_std": 0.8203125, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -1.1484375, |
| "step": 261 |
| }, |
| { |
| "completion_length": 433.5, |
| "epoch": 0.03505954770507159, |
| "grad_norm": 0.1335013061761856, |
| "kl": 0.0009189687552861869, |
| "learning_rate": 1.7513368983957222e-06, |
| "loss": -0.0009, |
| "reward": -1.0625, |
| "reward_std": 0.66796875, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -1.0625, |
| "step": 262 |
| }, |
| { |
| "completion_length": 245.33334350585938, |
| "epoch": 0.0351933627726482, |
| "grad_norm": 0.20102928578853607, |
| "kl": 0.0014157379046082497, |
| "learning_rate": 1.7580213903743318e-06, |
| "loss": -0.0003, |
| "reward": -0.40625, |
| "reward_std": 0.2177734375, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.40625, |
| "step": 263 |
| }, |
| { |
| "completion_length": 682.6666870117188, |
| "epoch": 0.03532717784022481, |
| "grad_norm": 0.0937545895576477, |
| "kl": 0.0005280395271256566, |
| "learning_rate": 1.7647058823529414e-06, |
| "loss": -0.0071, |
| "reward": -1.2578125, |
| "reward_std": 0.48046875, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -1.2578125, |
| "step": 264 |
| }, |
| { |
| "completion_length": 500.3333435058594, |
| "epoch": 0.03546099290780142, |
| "grad_norm": 0.17000702023506165, |
| "kl": 0.0008603067835792899, |
| "learning_rate": 1.7713903743315508e-06, |
| "loss": 0.001, |
| "reward": -1.375, |
| "reward_std": 0.57421875, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -1.375, |
| "step": 265 |
| }, |
| { |
| "completion_length": 366.8333435058594, |
| "epoch": 0.03559480797537803, |
| "grad_norm": 0.17021796107292175, |
| "kl": 0.0011717099696397781, |
| "learning_rate": 1.7780748663101604e-06, |
| "loss": -0.0037, |
| "reward": -0.7734375, |
| "reward_std": 0.1904296875, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.7734375, |
| "step": 266 |
| }, |
| { |
| "completion_length": 287.0, |
| "epoch": 0.035728623042954634, |
| "grad_norm": 0.21704187989234924, |
| "kl": 0.001519282697699964, |
| "learning_rate": 1.78475935828877e-06, |
| "loss": 0.0023, |
| "reward": -0.6328125, |
| "reward_std": 0.40625, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.6328125, |
| "step": 267 |
| }, |
| { |
| "completion_length": 632.0, |
| "epoch": 0.03586243811053125, |
| "grad_norm": 0.10228416323661804, |
| "kl": 0.0007877530297264457, |
| "learning_rate": 1.7914438502673799e-06, |
| "loss": 0.0056, |
| "reward": -1.59375, |
| "reward_std": 0.42578125, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -1.59375, |
| "step": 268 |
| }, |
| { |
| "completion_length": 411.66668701171875, |
| "epoch": 0.035996253178107854, |
| "grad_norm": 0.1366869956254959, |
| "kl": 0.0013742044102400541, |
| "learning_rate": 1.7981283422459895e-06, |
| "loss": -0.0045, |
| "reward": -0.859375, |
| "reward_std": 0.455078125, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.859375, |
| "step": 269 |
| }, |
| { |
| "completion_length": 268.8333435058594, |
| "epoch": 0.03613006824568447, |
| "grad_norm": 0.23755821585655212, |
| "kl": 0.0016824863851070404, |
| "learning_rate": 1.804812834224599e-06, |
| "loss": 0.0, |
| "reward": -0.451171875, |
| "reward_std": 0.40625, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.451171875, |
| "step": 270 |
| }, |
| { |
| "completion_length": 333.16668701171875, |
| "epoch": 0.036263883313261074, |
| "grad_norm": 0.1347772628068924, |
| "kl": 0.0012472581584006548, |
| "learning_rate": 1.8114973262032087e-06, |
| "loss": 0.0014, |
| "reward": -0.71484375, |
| "reward_std": 0.4765625, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.71484375, |
| "step": 271 |
| }, |
| { |
| "completion_length": 404.3333435058594, |
| "epoch": 0.03639769838083768, |
| "grad_norm": 0.16489511728286743, |
| "kl": 0.0020543006248772144, |
| "learning_rate": 1.8181818181818183e-06, |
| "loss": 0.0014, |
| "reward": -1.078125, |
| "reward_std": 0.703125, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -1.078125, |
| "step": 272 |
| }, |
| { |
| "completion_length": 299.0, |
| "epoch": 0.03653151344841429, |
| "grad_norm": 0.15176789462566376, |
| "kl": 0.0013240812113508582, |
| "learning_rate": 1.8248663101604281e-06, |
| "loss": 0.0064, |
| "reward": -0.5234375, |
| "reward_std": 0.294921875, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.5234375, |
| "step": 273 |
| }, |
| { |
| "completion_length": 388.5, |
| "epoch": 0.0366653285159909, |
| "grad_norm": 0.11341875791549683, |
| "kl": 0.0008515861118212342, |
| "learning_rate": 1.8315508021390377e-06, |
| "loss": 0.0005, |
| "reward": -0.7890625, |
| "reward_std": 0.337890625, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.7890625, |
| "step": 274 |
| }, |
| { |
| "completion_length": 244.83334350585938, |
| "epoch": 0.036799143583567506, |
| "grad_norm": 0.18841837346553802, |
| "kl": 0.0024837306700646877, |
| "learning_rate": 1.8382352941176473e-06, |
| "loss": 0.0001, |
| "reward": -0.453125, |
| "reward_std": 0.33203125, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.453125, |
| "step": 275 |
| }, |
| { |
| "completion_length": 283.8333435058594, |
| "epoch": 0.03693295865114412, |
| "grad_norm": 0.19005466997623444, |
| "kl": 0.0023008882999420166, |
| "learning_rate": 1.8449197860962567e-06, |
| "loss": 0.0043, |
| "reward": -0.435546875, |
| "reward_std": 0.1376953125, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.435546875, |
| "step": 276 |
| }, |
| { |
| "completion_length": 413.3333435058594, |
| "epoch": 0.037066773718720726, |
| "grad_norm": 0.15900400280952454, |
| "kl": 0.0016828961670398712, |
| "learning_rate": 1.8516042780748663e-06, |
| "loss": 0.003, |
| "reward": -0.890625, |
| "reward_std": 0.365234375, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.890625, |
| "step": 277 |
| }, |
| { |
| "completion_length": 330.16668701171875, |
| "epoch": 0.03720058878629734, |
| "grad_norm": 0.18195514380931854, |
| "kl": 0.0018070716178044677, |
| "learning_rate": 1.858288770053476e-06, |
| "loss": -0.0025, |
| "reward": -0.51953125, |
| "reward_std": 0.3359375, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.51953125, |
| "step": 278 |
| }, |
| { |
| "completion_length": 331.66668701171875, |
| "epoch": 0.037334403853873946, |
| "grad_norm": 0.14856845140457153, |
| "kl": 0.002008461859077215, |
| "learning_rate": 1.8649732620320857e-06, |
| "loss": 0.0001, |
| "reward": -0.72265625, |
| "reward_std": 0.345703125, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.72265625, |
| "step": 279 |
| }, |
| { |
| "completion_length": 283.3333435058594, |
| "epoch": 0.03746821892145055, |
| "grad_norm": 0.1611245721578598, |
| "kl": 0.001842797501012683, |
| "learning_rate": 1.8716577540106954e-06, |
| "loss": 0.0014, |
| "reward": -0.333984375, |
| "reward_std": 0.39453125, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.333984375, |
| "step": 280 |
| }, |
| { |
| "completion_length": 403.3333435058594, |
| "epoch": 0.037602033989027166, |
| "grad_norm": 0.13907144963741302, |
| "kl": 0.0020113931968808174, |
| "learning_rate": 1.878342245989305e-06, |
| "loss": -0.0006, |
| "reward": -0.8125, |
| "reward_std": 0.447265625, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.8125, |
| "step": 281 |
| }, |
| { |
| "completion_length": 328.8333435058594, |
| "epoch": 0.03773584905660377, |
| "grad_norm": 0.13121508061885834, |
| "kl": 0.001702746725641191, |
| "learning_rate": 1.8850267379679146e-06, |
| "loss": 0.0054, |
| "reward": -0.76171875, |
| "reward_std": 0.30859375, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.76171875, |
| "step": 282 |
| }, |
| { |
| "completion_length": 357.8333435058594, |
| "epoch": 0.037869664124180386, |
| "grad_norm": 0.12085507065057755, |
| "kl": 0.001665423158556223, |
| "learning_rate": 1.8917112299465244e-06, |
| "loss": 0.0046, |
| "reward": -0.796875, |
| "reward_std": 0.271484375, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.796875, |
| "step": 283 |
| }, |
| { |
| "completion_length": 321.0, |
| "epoch": 0.03800347919175699, |
| "grad_norm": 0.1415272206068039, |
| "kl": 0.002183354925364256, |
| "learning_rate": 1.898395721925134e-06, |
| "loss": 0.0033, |
| "reward": -0.60546875, |
| "reward_std": 0.310546875, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.60546875, |
| "step": 284 |
| }, |
| { |
| "completion_length": 326.5, |
| "epoch": 0.0381372942593336, |
| "grad_norm": 0.12236207723617554, |
| "kl": 0.002115039387717843, |
| "learning_rate": 1.9050802139037436e-06, |
| "loss": -0.0009, |
| "reward": -0.73828125, |
| "reward_std": 0.53125, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.73828125, |
| "step": 285 |
| }, |
| { |
| "completion_length": 323.66668701171875, |
| "epoch": 0.03827110932691021, |
| "grad_norm": 0.15223853290081024, |
| "kl": 0.002893569879233837, |
| "learning_rate": 1.9117647058823528e-06, |
| "loss": 0.0048, |
| "reward": -0.58203125, |
| "reward_std": 0.322265625, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.58203125, |
| "step": 286 |
| }, |
| { |
| "completion_length": 209.5, |
| "epoch": 0.03840492439448682, |
| "grad_norm": 0.23355820775032043, |
| "kl": 0.004409749526530504, |
| "learning_rate": 1.9184491978609626e-06, |
| "loss": -0.0005, |
| "reward": -0.22265625, |
| "reward_std": 0.11328125, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.22265625, |
| "step": 287 |
| }, |
| { |
| "completion_length": 373.66668701171875, |
| "epoch": 0.03853873946206343, |
| "grad_norm": 0.12933886051177979, |
| "kl": 0.0021153343841433525, |
| "learning_rate": 1.9251336898395724e-06, |
| "loss": 0.0001, |
| "reward": -0.423828125, |
| "reward_std": 0.27734375, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.423828125, |
| "step": 288 |
| }, |
| { |
| "completion_length": 466.16668701171875, |
| "epoch": 0.03867255452964004, |
| "grad_norm": 0.11623230576515198, |
| "kl": 0.0016589018050581217, |
| "learning_rate": 1.931818181818182e-06, |
| "loss": 0.0085, |
| "reward": -1.125, |
| "reward_std": 0.400390625, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -1.125, |
| "step": 289 |
| }, |
| { |
| "completion_length": 407.8333435058594, |
| "epoch": 0.038806369597216644, |
| "grad_norm": 0.1379394680261612, |
| "kl": 0.0021060972940176725, |
| "learning_rate": 1.9385026737967916e-06, |
| "loss": -0.0019, |
| "reward": -0.80859375, |
| "reward_std": 0.341796875, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.80859375, |
| "step": 290 |
| }, |
| { |
| "completion_length": 251.5, |
| "epoch": 0.03894018466479326, |
| "grad_norm": 0.18607591092586517, |
| "kl": 0.0036474696826189756, |
| "learning_rate": 1.9451871657754014e-06, |
| "loss": -0.0008, |
| "reward": -0.2890625, |
| "reward_std": 0.248046875, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.2890625, |
| "step": 291 |
| }, |
| { |
| "completion_length": 277.8333435058594, |
| "epoch": 0.039073999732369864, |
| "grad_norm": 0.14609812200069427, |
| "kl": 0.0029988684691488743, |
| "learning_rate": 1.951871657754011e-06, |
| "loss": -0.001, |
| "reward": -0.427734375, |
| "reward_std": 0.388671875, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.427734375, |
| "step": 292 |
| }, |
| { |
| "completion_length": 399.16668701171875, |
| "epoch": 0.03920781479994647, |
| "grad_norm": 0.13495796918869019, |
| "kl": 0.002409814391285181, |
| "learning_rate": 1.9585561497326206e-06, |
| "loss": 0.0016, |
| "reward": -0.8046875, |
| "reward_std": 0.60546875, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.8046875, |
| "step": 293 |
| }, |
| { |
| "completion_length": 410.5, |
| "epoch": 0.039341629867523084, |
| "grad_norm": 0.12157430499792099, |
| "kl": 0.002072525443509221, |
| "learning_rate": 1.96524064171123e-06, |
| "loss": -0.0015, |
| "reward": -0.81640625, |
| "reward_std": 0.6015625, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.81640625, |
| "step": 294 |
| }, |
| { |
| "completion_length": 263.8333435058594, |
| "epoch": 0.03947544493509969, |
| "grad_norm": 0.16216787695884705, |
| "kl": 0.0037962980568408966, |
| "learning_rate": 1.97192513368984e-06, |
| "loss": 0.0005, |
| "reward": -0.390625, |
| "reward_std": 0.091796875, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.390625, |
| "step": 295 |
| }, |
| { |
| "completion_length": 284.3333435058594, |
| "epoch": 0.039609260002676304, |
| "grad_norm": 0.19671247899532318, |
| "kl": 0.0041143884882330894, |
| "learning_rate": 1.9786096256684497e-06, |
| "loss": 0.001, |
| "reward": -0.5859375, |
| "reward_std": 0.3671875, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.5859375, |
| "step": 296 |
| }, |
| { |
| "completion_length": 280.8333435058594, |
| "epoch": 0.03974307507025291, |
| "grad_norm": 0.15721142292022705, |
| "kl": 0.002454590518027544, |
| "learning_rate": 1.985294117647059e-06, |
| "loss": -0.003, |
| "reward": -0.328125, |
| "reward_std": 0.2412109375, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.328125, |
| "step": 297 |
| }, |
| { |
| "completion_length": 285.5, |
| "epoch": 0.03987689013782952, |
| "grad_norm": 0.19153378903865814, |
| "kl": 0.0035088087897747755, |
| "learning_rate": 1.9919786096256685e-06, |
| "loss": -0.0062, |
| "reward": -0.5859375, |
| "reward_std": 0.228515625, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.5859375, |
| "step": 298 |
| }, |
| { |
| "completion_length": 206.33334350585938, |
| "epoch": 0.04001070520540613, |
| "grad_norm": 0.2719455659389496, |
| "kl": 0.0049241166561841965, |
| "learning_rate": 1.9986631016042783e-06, |
| "loss": 0.0018, |
| "reward": -0.322265625, |
| "reward_std": 0.171875, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.322265625, |
| "step": 299 |
| }, |
| { |
| "completion_length": 363.0, |
| "epoch": 0.04014452027298274, |
| "grad_norm": 0.1396636813879013, |
| "kl": 0.004824022762477398, |
| "learning_rate": 2.0053475935828877e-06, |
| "loss": -0.0037, |
| "reward": -0.546875, |
| "reward_std": 0.337890625, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.546875, |
| "step": 300 |
| }, |
| { |
| "completion_length": 357.8333435058594, |
| "epoch": 0.04027833534055935, |
| "grad_norm": 0.12899059057235718, |
| "kl": 0.0038679109420627356, |
| "learning_rate": 2.0120320855614975e-06, |
| "loss": -0.0, |
| "reward": -0.73828125, |
| "reward_std": 0.12060546875, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.73828125, |
| "step": 301 |
| }, |
| { |
| "completion_length": 399.0, |
| "epoch": 0.040412150408135956, |
| "grad_norm": 0.14958561956882477, |
| "kl": 0.003215777687728405, |
| "learning_rate": 2.0187165775401073e-06, |
| "loss": 0.0031, |
| "reward": -0.90234375, |
| "reward_std": 0.41015625, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.90234375, |
| "step": 302 |
| }, |
| { |
| "completion_length": 269.5, |
| "epoch": 0.04054596547571256, |
| "grad_norm": 0.21223297715187073, |
| "kl": 0.003930443432182074, |
| "learning_rate": 2.0254010695187167e-06, |
| "loss": -0.0004, |
| "reward": -0.126953125, |
| "reward_std": 0.365234375, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.126953125, |
| "step": 303 |
| }, |
| { |
| "completion_length": 324.5, |
| "epoch": 0.040679780543289176, |
| "grad_norm": 0.1299857646226883, |
| "kl": 0.002138474490493536, |
| "learning_rate": 2.0320855614973265e-06, |
| "loss": -0.0062, |
| "reward": -0.79296875, |
| "reward_std": 0.212890625, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.79296875, |
| "step": 304 |
| }, |
| { |
| "completion_length": 259.0, |
| "epoch": 0.04081359561086578, |
| "grad_norm": 0.1926686018705368, |
| "kl": 0.004935073666274548, |
| "learning_rate": 2.038770053475936e-06, |
| "loss": 0.0015, |
| "reward": -0.400390625, |
| "reward_std": 0.30859375, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.400390625, |
| "step": 305 |
| }, |
| { |
| "completion_length": 344.16668701171875, |
| "epoch": 0.040947410678442396, |
| "grad_norm": 0.1324584186077118, |
| "kl": 0.0024872669018805027, |
| "learning_rate": 2.0454545454545457e-06, |
| "loss": -0.0035, |
| "reward": -0.515625, |
| "reward_std": 0.310546875, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.515625, |
| "step": 306 |
| }, |
| { |
| "completion_length": 394.5, |
| "epoch": 0.041081225746019, |
| "grad_norm": 0.13476984202861786, |
| "kl": 0.0031944592483341694, |
| "learning_rate": 2.052139037433155e-06, |
| "loss": 0.0001, |
| "reward": -0.9921875, |
| "reward_std": 0.201171875, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.9921875, |
| "step": 307 |
| }, |
| { |
| "completion_length": 340.8333435058594, |
| "epoch": 0.04121504081359561, |
| "grad_norm": 0.13522304594516754, |
| "kl": 0.007316060364246368, |
| "learning_rate": 2.058823529411765e-06, |
| "loss": -0.001, |
| "reward": -0.6953125, |
| "reward_std": 0.63671875, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.6953125, |
| "step": 308 |
| }, |
| { |
| "completion_length": 197.1666717529297, |
| "epoch": 0.04134885588117222, |
| "grad_norm": 0.18704450130462646, |
| "kl": 0.004623084794729948, |
| "learning_rate": 2.0655080213903743e-06, |
| "loss": 0.0012, |
| "reward": -0.201171875, |
| "reward_std": 0.2294921875, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.201171875, |
| "step": 309 |
| }, |
| { |
| "completion_length": 256.5, |
| "epoch": 0.04148267094874883, |
| "grad_norm": 0.20771555602550507, |
| "kl": 0.006930126808583736, |
| "learning_rate": 2.072192513368984e-06, |
| "loss": 0.0003, |
| "reward": -0.1953125, |
| "reward_std": 0.21875, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.1953125, |
| "step": 310 |
| }, |
| { |
| "completion_length": 316.66668701171875, |
| "epoch": 0.041616486016325435, |
| "grad_norm": 0.15902665257453918, |
| "kl": 0.004400103818625212, |
| "learning_rate": 2.0788770053475936e-06, |
| "loss": -0.0005, |
| "reward": -0.63671875, |
| "reward_std": 0.470703125, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.63671875, |
| "step": 311 |
| }, |
| { |
| "completion_length": 319.8333435058594, |
| "epoch": 0.04175030108390205, |
| "grad_norm": 0.16784004867076874, |
| "kl": 0.003954706247895956, |
| "learning_rate": 2.0855614973262034e-06, |
| "loss": 0.0016, |
| "reward": -0.6953125, |
| "reward_std": 0.2431640625, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.6953125, |
| "step": 312 |
| }, |
| { |
| "completion_length": 307.8333435058594, |
| "epoch": 0.041884116151478655, |
| "grad_norm": 0.18240705132484436, |
| "kl": 0.007933239452540874, |
| "learning_rate": 2.092245989304813e-06, |
| "loss": -0.0029, |
| "reward": -0.609375, |
| "reward_std": 0.482421875, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.609375, |
| "step": 313 |
| }, |
| { |
| "completion_length": 329.16668701171875, |
| "epoch": 0.04201793121905527, |
| "grad_norm": 0.2159043848514557, |
| "kl": 0.008502164855599403, |
| "learning_rate": 2.0989304812834226e-06, |
| "loss": -0.0055, |
| "reward": -0.66796875, |
| "reward_std": 0.353515625, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.66796875, |
| "step": 314 |
| }, |
| { |
| "completion_length": 419.16668701171875, |
| "epoch": 0.042151746286631875, |
| "grad_norm": 0.12527386844158173, |
| "kl": 0.0033555077388882637, |
| "learning_rate": 2.1056149732620324e-06, |
| "loss": 0.0011, |
| "reward": -0.82421875, |
| "reward_std": 0.33984375, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.82421875, |
| "step": 315 |
| }, |
| { |
| "completion_length": 352.66668701171875, |
| "epoch": 0.04228556135420848, |
| "grad_norm": 0.1702316701412201, |
| "kl": 0.004789026454091072, |
| "learning_rate": 2.112299465240642e-06, |
| "loss": -0.0018, |
| "reward": -0.625, |
| "reward_std": 0.447265625, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.625, |
| "step": 316 |
| }, |
| { |
| "completion_length": 388.16668701171875, |
| "epoch": 0.042419376421785095, |
| "grad_norm": 0.1852673441171646, |
| "kl": 0.002679403405636549, |
| "learning_rate": 2.118983957219251e-06, |
| "loss": 0.0042, |
| "reward": -0.81640625, |
| "reward_std": 0.36328125, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.81640625, |
| "step": 317 |
| }, |
| { |
| "completion_length": 283.3333435058594, |
| "epoch": 0.0425531914893617, |
| "grad_norm": 0.21826620399951935, |
| "kl": 0.007197022438049316, |
| "learning_rate": 2.125668449197861e-06, |
| "loss": 0.0039, |
| "reward": -0.44921875, |
| "reward_std": 0.265625, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.44921875, |
| "step": 318 |
| }, |
| { |
| "completion_length": 311.16668701171875, |
| "epoch": 0.042687006556938314, |
| "grad_norm": 0.2093047946691513, |
| "kl": 0.006402711849659681, |
| "learning_rate": 2.132352941176471e-06, |
| "loss": -0.003, |
| "reward": -0.62890625, |
| "reward_std": 0.2333984375, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.62890625, |
| "step": 319 |
| }, |
| { |
| "completion_length": 261.16668701171875, |
| "epoch": 0.04282082162451492, |
| "grad_norm": 0.18230240046977997, |
| "kl": 0.010287894867360592, |
| "learning_rate": 2.1390374331550802e-06, |
| "loss": 0.0014, |
| "reward": -0.375, |
| "reward_std": 0.19921875, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.375, |
| "step": 320 |
| }, |
| { |
| "completion_length": 241.1666717529297, |
| "epoch": 0.04295463669209153, |
| "grad_norm": 0.21121013164520264, |
| "kl": 0.010337308049201965, |
| "learning_rate": 2.14572192513369e-06, |
| "loss": -0.002, |
| "reward": -0.341796875, |
| "reward_std": 0.326171875, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.341796875, |
| "step": 321 |
| }, |
| { |
| "completion_length": 356.8333435058594, |
| "epoch": 0.04308845175966814, |
| "grad_norm": 0.1783302128314972, |
| "kl": 0.006110279820859432, |
| "learning_rate": 2.1524064171122994e-06, |
| "loss": 0.004, |
| "reward": -0.76171875, |
| "reward_std": 0.419921875, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.76171875, |
| "step": 322 |
| }, |
| { |
| "completion_length": 178.83334350585938, |
| "epoch": 0.04322226682724475, |
| "grad_norm": 0.26143407821655273, |
| "kl": 0.008865730836987495, |
| "learning_rate": 2.1590909090909092e-06, |
| "loss": 0.0012, |
| "reward": -0.0859375, |
| "reward_std": 0.1552734375, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.0859375, |
| "step": 323 |
| }, |
| { |
| "completion_length": 263.16668701171875, |
| "epoch": 0.043356081894821354, |
| "grad_norm": 0.19690528512001038, |
| "kl": 0.006059790961444378, |
| "learning_rate": 2.165775401069519e-06, |
| "loss": -0.0015, |
| "reward": -0.287109375, |
| "reward_std": 0.376953125, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.287109375, |
| "step": 324 |
| }, |
| { |
| "completion_length": 258.0, |
| "epoch": 0.04348989696239797, |
| "grad_norm": 0.18371239304542542, |
| "kl": 0.011453388258814812, |
| "learning_rate": 2.1724598930481285e-06, |
| "loss": -0.004, |
| "reward": -0.365234375, |
| "reward_std": 0.23828125, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.365234375, |
| "step": 325 |
| }, |
| { |
| "completion_length": 174.33334350585938, |
| "epoch": 0.04362371202997457, |
| "grad_norm": 0.3012731373310089, |
| "kl": 0.008510403335094452, |
| "learning_rate": 2.1791443850267383e-06, |
| "loss": 0.0, |
| "reward": -0.1416015625, |
| "reward_std": 0.0654296875, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.1416015625, |
| "step": 326 |
| }, |
| { |
| "completion_length": 419.0, |
| "epoch": 0.04375752709755119, |
| "grad_norm": 0.15368571877479553, |
| "kl": 0.004780076909810305, |
| "learning_rate": 2.185828877005348e-06, |
| "loss": 0.0, |
| "reward": -0.859375, |
| "reward_std": 0.14453125, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.859375, |
| "step": 327 |
| }, |
| { |
| "completion_length": 505.3333435058594, |
| "epoch": 0.04389134216512779, |
| "grad_norm": 0.1414371132850647, |
| "kl": 0.004692884162068367, |
| "learning_rate": 2.1925133689839575e-06, |
| "loss": 0.0067, |
| "reward": -1.3046875, |
| "reward_std": 0.45703125, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -1.3046875, |
| "step": 328 |
| }, |
| { |
| "completion_length": 291.0, |
| "epoch": 0.0440251572327044, |
| "grad_norm": 0.13695944845676422, |
| "kl": 0.006905118003487587, |
| "learning_rate": 2.199197860962567e-06, |
| "loss": 0.0042, |
| "reward": -0.47265625, |
| "reward_std": 0.322265625, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.47265625, |
| "step": 329 |
| }, |
| { |
| "completion_length": 393.3333435058594, |
| "epoch": 0.04415897230028101, |
| "grad_norm": 0.15617813169956207, |
| "kl": 0.00954591017216444, |
| "learning_rate": 2.2058823529411767e-06, |
| "loss": 0.0029, |
| "reward": -0.6640625, |
| "reward_std": 0.55078125, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.6640625, |
| "step": 330 |
| }, |
| { |
| "completion_length": 380.8333435058594, |
| "epoch": 0.04429278736785762, |
| "grad_norm": 0.10685895383358002, |
| "kl": 0.0035809404216706753, |
| "learning_rate": 2.212566844919786e-06, |
| "loss": 0.0033, |
| "reward": -0.55078125, |
| "reward_std": 0.1953125, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.55078125, |
| "step": 331 |
| }, |
| { |
| "completion_length": 348.16668701171875, |
| "epoch": 0.04442660243543423, |
| "grad_norm": 0.14546167850494385, |
| "kl": 0.010220387950539589, |
| "learning_rate": 2.219251336898396e-06, |
| "loss": 0.0043, |
| "reward": -0.9375, |
| "reward_std": 0.5, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.9375, |
| "step": 332 |
| }, |
| { |
| "completion_length": 378.3333435058594, |
| "epoch": 0.04456041750301084, |
| "grad_norm": 0.16639195382595062, |
| "kl": 0.0055643776431679726, |
| "learning_rate": 2.2259358288770057e-06, |
| "loss": 0.0069, |
| "reward": -0.68359375, |
| "reward_std": 0.251953125, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.68359375, |
| "step": 333 |
| }, |
| { |
| "completion_length": 335.3333435058594, |
| "epoch": 0.044694232570587446, |
| "grad_norm": 0.15007461607456207, |
| "kl": 0.005303717218339443, |
| "learning_rate": 2.232620320855615e-06, |
| "loss": 0.0057, |
| "reward": -0.70703125, |
| "reward_std": 0.232421875, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.70703125, |
| "step": 334 |
| }, |
| { |
| "completion_length": 296.66668701171875, |
| "epoch": 0.04482804763816406, |
| "grad_norm": 0.1491575390100479, |
| "kl": 0.005803759675472975, |
| "learning_rate": 2.239304812834225e-06, |
| "loss": 0.0166, |
| "reward": -0.5234375, |
| "reward_std": 0.11328125, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.5234375, |
| "step": 335 |
| }, |
| { |
| "completion_length": 335.5, |
| "epoch": 0.044961862705740666, |
| "grad_norm": 0.13715343177318573, |
| "kl": 0.006967080291360617, |
| "learning_rate": 2.2459893048128343e-06, |
| "loss": -0.0008, |
| "reward": -0.69921875, |
| "reward_std": 0.28125, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.69921875, |
| "step": 336 |
| }, |
| { |
| "completion_length": 309.3333435058594, |
| "epoch": 0.04509567777331728, |
| "grad_norm": 0.24047787487506866, |
| "kl": 0.010651972144842148, |
| "learning_rate": 2.252673796791444e-06, |
| "loss": -0.0019, |
| "reward": -0.5859375, |
| "reward_std": 0.4296875, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.5859375, |
| "step": 337 |
| }, |
| { |
| "completion_length": 353.0, |
| "epoch": 0.045229492840893885, |
| "grad_norm": 0.15517961978912354, |
| "kl": 0.009843084029853344, |
| "learning_rate": 2.2593582887700535e-06, |
| "loss": 0.0027, |
| "reward": -0.8671875, |
| "reward_std": 0.42578125, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.8671875, |
| "step": 338 |
| }, |
| { |
| "completion_length": 212.5, |
| "epoch": 0.04536330790847049, |
| "grad_norm": 0.23865048587322235, |
| "kl": 0.010837538167834282, |
| "learning_rate": 2.2660427807486634e-06, |
| "loss": 0.0067, |
| "reward": -0.2734375, |
| "reward_std": 0.13671875, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.2734375, |
| "step": 339 |
| }, |
| { |
| "completion_length": 363.3333435058594, |
| "epoch": 0.045497122976047105, |
| "grad_norm": 0.13407538831233978, |
| "kl": 0.008769119158387184, |
| "learning_rate": 2.2727272727272728e-06, |
| "loss": 0.0023, |
| "reward": -0.7421875, |
| "reward_std": 0.2734375, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.7421875, |
| "step": 340 |
| }, |
| { |
| "completion_length": 319.16668701171875, |
| "epoch": 0.04563093804362371, |
| "grad_norm": 0.16852368414402008, |
| "kl": 0.006191683933138847, |
| "learning_rate": 2.2794117647058826e-06, |
| "loss": 0.0024, |
| "reward": -0.390625, |
| "reward_std": 0.23828125, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.390625, |
| "step": 341 |
| }, |
| { |
| "completion_length": 145.33334350585938, |
| "epoch": 0.04576475311120032, |
| "grad_norm": 0.2680109441280365, |
| "kl": 0.032642465084791183, |
| "learning_rate": 2.286096256684492e-06, |
| "loss": 0.0013, |
| "reward": -0.0281982421875, |
| "reward_std": 0.419921875, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.0281982421875, |
| "step": 342 |
| }, |
| { |
| "completion_length": 281.5, |
| "epoch": 0.04589856817877693, |
| "grad_norm": 0.1808832287788391, |
| "kl": 0.014683406800031662, |
| "learning_rate": 2.292780748663102e-06, |
| "loss": 0.0029, |
| "reward": -0.396484375, |
| "reward_std": 0.2080078125, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.396484375, |
| "step": 343 |
| }, |
| { |
| "completion_length": 287.66668701171875, |
| "epoch": 0.04603238324635354, |
| "grad_norm": 0.1347428560256958, |
| "kl": 0.005576578434556723, |
| "learning_rate": 2.2994652406417116e-06, |
| "loss": 0.0015, |
| "reward": -0.4140625, |
| "reward_std": 0.439453125, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.4140625, |
| "step": 344 |
| }, |
| { |
| "completion_length": 361.8333435058594, |
| "epoch": 0.04616619831393015, |
| "grad_norm": 0.13883595168590546, |
| "kl": 0.007222681771963835, |
| "learning_rate": 2.306149732620321e-06, |
| "loss": -0.0061, |
| "reward": -0.435546875, |
| "reward_std": 0.10888671875, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.435546875, |
| "step": 345 |
| }, |
| { |
| "completion_length": 229.5, |
| "epoch": 0.04630001338150676, |
| "grad_norm": 0.23361890017986298, |
| "kl": 0.035799503326416016, |
| "learning_rate": 2.312834224598931e-06, |
| "loss": 0.0001, |
| "reward": -0.267578125, |
| "reward_std": 0.248046875, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.267578125, |
| "step": 346 |
| }, |
| { |
| "completion_length": 208.83334350585938, |
| "epoch": 0.046433828449083364, |
| "grad_norm": 0.25818759202957153, |
| "kl": 0.01660415530204773, |
| "learning_rate": 2.3195187165775402e-06, |
| "loss": -0.0006, |
| "reward": -0.2412109375, |
| "reward_std": 0.37109375, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.2412109375, |
| "step": 347 |
| }, |
| { |
| "completion_length": 270.66668701171875, |
| "epoch": 0.04656764351665998, |
| "grad_norm": 0.15176504850387573, |
| "kl": 0.008165406994521618, |
| "learning_rate": 2.32620320855615e-06, |
| "loss": 0.0044, |
| "reward": -0.40625, |
| "reward_std": 0.1494140625, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.40625, |
| "step": 348 |
| }, |
| { |
| "completion_length": 318.16668701171875, |
| "epoch": 0.046701458584236584, |
| "grad_norm": 0.14339715242385864, |
| "kl": 0.007072822656482458, |
| "learning_rate": 2.3328877005347594e-06, |
| "loss": -0.0096, |
| "reward": -0.5625, |
| "reward_std": 0.2021484375, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.5625, |
| "step": 349 |
| }, |
| { |
| "completion_length": 281.0, |
| "epoch": 0.0468352736518132, |
| "grad_norm": 0.16681039333343506, |
| "kl": 0.013782523572444916, |
| "learning_rate": 2.3395721925133692e-06, |
| "loss": 0.0001, |
| "reward": -0.296875, |
| "reward_std": 0.17578125, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.296875, |
| "step": 350 |
| } |
| ], |
| "logging_steps": 1, |
| "max_steps": 7473, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 1, |
| "save_steps": 50, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": false |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 0.0, |
| "train_batch_size": 1, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|