| { |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 0.013381506757660913, |
| "eval_steps": 500, |
| "global_step": 100, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "completion_length": 698.1666870117188, |
| "epoch": 0.00013381506757660912, |
| "grad_norm": 0.07569596916437149, |
| "kl": 0.0006024616304785013, |
| "learning_rate": 6.684491978609626e-09, |
| "loss": 0.001, |
| "reward": -1.8359375, |
| "reward_std": 0.5859375, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -1.8359375, |
| "step": 1 |
| }, |
| { |
| "completion_length": 549.0, |
| "epoch": 0.00026763013515321824, |
| "grad_norm": 0.10156559199094772, |
| "kl": 0.0006554799037985504, |
| "learning_rate": 1.3368983957219251e-08, |
| "loss": -0.0055, |
| "reward": -1.21875, |
| "reward_std": 0.48828125, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -1.21875, |
| "step": 2 |
| }, |
| { |
| "completion_length": 509.66668701171875, |
| "epoch": 0.0004014452027298274, |
| "grad_norm": 0.1012749969959259, |
| "kl": 0.0006122777122072875, |
| "learning_rate": 2.005347593582888e-08, |
| "loss": 0.0032, |
| "reward": -1.2578125, |
| "reward_std": 0.37890625, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -1.2578125, |
| "step": 3 |
| }, |
| { |
| "completion_length": 387.8333435058594, |
| "epoch": 0.0005352602703064365, |
| "grad_norm": 0.10009913891553879, |
| "kl": 0.0005205385386943817, |
| "learning_rate": 2.6737967914438503e-08, |
| "loss": 0.0007, |
| "reward": -0.83203125, |
| "reward_std": 0.1640625, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.83203125, |
| "step": 4 |
| }, |
| { |
| "completion_length": 386.5, |
| "epoch": 0.0006690753378830456, |
| "grad_norm": 0.11404310166835785, |
| "kl": 0.00039041676791384816, |
| "learning_rate": 3.342245989304813e-08, |
| "loss": -0.0032, |
| "reward": -0.859375, |
| "reward_std": 0.1630859375, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.859375, |
| "step": 5 |
| }, |
| { |
| "completion_length": 345.8333435058594, |
| "epoch": 0.0008028904054596548, |
| "grad_norm": 0.13447555899620056, |
| "kl": 0.0005453471094369888, |
| "learning_rate": 4.010695187165776e-08, |
| "loss": 0.0036, |
| "reward": -0.7109375, |
| "reward_std": 0.357421875, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.7109375, |
| "step": 6 |
| }, |
| { |
| "completion_length": 426.16668701171875, |
| "epoch": 0.0009367054730362638, |
| "grad_norm": 0.1324268877506256, |
| "kl": 0.000606791814789176, |
| "learning_rate": 4.679144385026738e-08, |
| "loss": 0.0017, |
| "reward": -1.09375, |
| "reward_std": 0.72265625, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -1.09375, |
| "step": 7 |
| }, |
| { |
| "completion_length": 418.66668701171875, |
| "epoch": 0.001070520540612873, |
| "grad_norm": 0.12978878617286682, |
| "kl": 0.0005688891978934407, |
| "learning_rate": 5.3475935828877005e-08, |
| "loss": 0.0005, |
| "reward": -0.89453125, |
| "reward_std": 0.392578125, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.89453125, |
| "step": 8 |
| }, |
| { |
| "completion_length": 484.5, |
| "epoch": 0.0012043356081894822, |
| "grad_norm": 0.09955421835184097, |
| "kl": 0.0005112257204018533, |
| "learning_rate": 6.016042780748664e-08, |
| "loss": 0.0067, |
| "reward": -1.25, |
| "reward_std": 0.53515625, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -1.25, |
| "step": 9 |
| }, |
| { |
| "completion_length": 230.5, |
| "epoch": 0.0013381506757660913, |
| "grad_norm": 0.19524620473384857, |
| "kl": 0.0006619760533794761, |
| "learning_rate": 6.684491978609626e-08, |
| "loss": -0.0006, |
| "reward": -0.26953125, |
| "reward_std": 0.349609375, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.26953125, |
| "step": 10 |
| }, |
| { |
| "completion_length": 369.5, |
| "epoch": 0.0014719657433427003, |
| "grad_norm": 0.1019153892993927, |
| "kl": 0.0006552126724272966, |
| "learning_rate": 7.352941176470589e-08, |
| "loss": -0.004, |
| "reward": -0.94140625, |
| "reward_std": 0.279296875, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.94140625, |
| "step": 11 |
| }, |
| { |
| "completion_length": 386.16668701171875, |
| "epoch": 0.0016057808109193096, |
| "grad_norm": 0.09696059674024582, |
| "kl": 0.0004603694542311132, |
| "learning_rate": 8.021390374331552e-08, |
| "loss": 0.002, |
| "reward": -0.8671875, |
| "reward_std": 0.42578125, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.8671875, |
| "step": 12 |
| }, |
| { |
| "completion_length": 475.16668701171875, |
| "epoch": 0.0017395958784959186, |
| "grad_norm": 0.12413895130157471, |
| "kl": 0.0004793051048181951, |
| "learning_rate": 8.689839572192514e-08, |
| "loss": 0.0, |
| "reward": -0.9375, |
| "reward_std": 0.28125, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.9375, |
| "step": 13 |
| }, |
| { |
| "completion_length": 370.0, |
| "epoch": 0.0018734109460725277, |
| "grad_norm": 0.1305382251739502, |
| "kl": 0.0005513830110430717, |
| "learning_rate": 9.358288770053476e-08, |
| "loss": -0.0018, |
| "reward": -0.78515625, |
| "reward_std": 0.263671875, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.78515625, |
| "step": 14 |
| }, |
| { |
| "completion_length": 431.66668701171875, |
| "epoch": 0.002007226013649137, |
| "grad_norm": 0.10463520139455795, |
| "kl": 0.00048596435226500034, |
| "learning_rate": 1.0026737967914439e-07, |
| "loss": 0.0032, |
| "reward": -0.84375, |
| "reward_std": 0.3984375, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.84375, |
| "step": 15 |
| }, |
| { |
| "completion_length": 399.5, |
| "epoch": 0.002141041081225746, |
| "grad_norm": 0.1404961347579956, |
| "kl": 0.000555322621949017, |
| "learning_rate": 1.0695187165775401e-07, |
| "loss": -0.0057, |
| "reward": -1.0625, |
| "reward_std": 0.46484375, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -1.0625, |
| "step": 16 |
| }, |
| { |
| "completion_length": 449.8333435058594, |
| "epoch": 0.002274856148802355, |
| "grad_norm": 0.10250594466924667, |
| "kl": 0.00048121344298124313, |
| "learning_rate": 1.1363636363636364e-07, |
| "loss": -0.0071, |
| "reward": -1.0234375, |
| "reward_std": 0.40234375, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -1.0234375, |
| "step": 17 |
| }, |
| { |
| "completion_length": 324.5, |
| "epoch": 0.0024086712163789645, |
| "grad_norm": 0.12464314699172974, |
| "kl": 0.0005811881856061518, |
| "learning_rate": 1.2032085561497328e-07, |
| "loss": 0.0033, |
| "reward": -0.6875, |
| "reward_std": 0.26171875, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.6875, |
| "step": 18 |
| }, |
| { |
| "completion_length": 578.0, |
| "epoch": 0.0025424862839555735, |
| "grad_norm": 0.08823499828577042, |
| "kl": 0.000675913121085614, |
| "learning_rate": 1.270053475935829e-07, |
| "loss": 0.0075, |
| "reward": -1.703125, |
| "reward_std": 0.5703125, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -1.703125, |
| "step": 19 |
| }, |
| { |
| "completion_length": 328.8333435058594, |
| "epoch": 0.0026763013515321826, |
| "grad_norm": 0.16708222031593323, |
| "kl": 0.0006092819385230541, |
| "learning_rate": 1.3368983957219251e-07, |
| "loss": 0.0091, |
| "reward": -0.71484375, |
| "reward_std": 0.1357421875, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.71484375, |
| "step": 20 |
| }, |
| { |
| "completion_length": 415.16668701171875, |
| "epoch": 0.0028101164191087916, |
| "grad_norm": 0.10446464270353317, |
| "kl": 0.0004726095939986408, |
| "learning_rate": 1.4037433155080215e-07, |
| "loss": 0.0011, |
| "reward": -1.0, |
| "reward_std": 0.50390625, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -1.0, |
| "step": 21 |
| }, |
| { |
| "completion_length": 398.5, |
| "epoch": 0.0029439314866854006, |
| "grad_norm": 0.10892236977815628, |
| "kl": 0.000556222046725452, |
| "learning_rate": 1.4705882352941178e-07, |
| "loss": 0.0016, |
| "reward": -0.9765625, |
| "reward_std": 0.349609375, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.9765625, |
| "step": 22 |
| }, |
| { |
| "completion_length": 351.66668701171875, |
| "epoch": 0.00307774655426201, |
| "grad_norm": 0.13707049190998077, |
| "kl": 0.0005205090856179595, |
| "learning_rate": 1.537433155080214e-07, |
| "loss": -0.0032, |
| "reward": -0.7421875, |
| "reward_std": 0.390625, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.7421875, |
| "step": 23 |
| }, |
| { |
| "completion_length": 310.0, |
| "epoch": 0.003211561621838619, |
| "grad_norm": 0.1579124480485916, |
| "kl": 0.0007410722319036722, |
| "learning_rate": 1.6042780748663104e-07, |
| "loss": 0.0002, |
| "reward": -0.65625, |
| "reward_std": 0.6328125, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.65625, |
| "step": 24 |
| }, |
| { |
| "completion_length": 353.0, |
| "epoch": 0.003345376689415228, |
| "grad_norm": 0.11555790901184082, |
| "kl": 0.0005753459990955889, |
| "learning_rate": 1.6711229946524068e-07, |
| "loss": -0.0034, |
| "reward": -0.828125, |
| "reward_std": 0.314453125, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.828125, |
| "step": 25 |
| }, |
| { |
| "completion_length": 416.5, |
| "epoch": 0.0034791917569918372, |
| "grad_norm": 0.10537782311439514, |
| "kl": 0.0006076883291825652, |
| "learning_rate": 1.7379679144385028e-07, |
| "loss": -0.0068, |
| "reward": -0.8359375, |
| "reward_std": 0.30859375, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.8359375, |
| "step": 26 |
| }, |
| { |
| "completion_length": 436.0, |
| "epoch": 0.0036130068245684463, |
| "grad_norm": 0.12061028182506561, |
| "kl": 0.0006918934523127973, |
| "learning_rate": 1.8048128342245991e-07, |
| "loss": 0.0033, |
| "reward": -0.91015625, |
| "reward_std": 0.92578125, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.91015625, |
| "step": 27 |
| }, |
| { |
| "completion_length": 447.16668701171875, |
| "epoch": 0.0037468218921450553, |
| "grad_norm": 0.11236874759197235, |
| "kl": 0.0005188498180359602, |
| "learning_rate": 1.8716577540106952e-07, |
| "loss": -0.0021, |
| "reward": -1.078125, |
| "reward_std": 0.298828125, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -1.078125, |
| "step": 28 |
| }, |
| { |
| "completion_length": 524.5, |
| "epoch": 0.003880636959721665, |
| "grad_norm": 0.08638511598110199, |
| "kl": 0.000413873785873875, |
| "learning_rate": 1.9385026737967918e-07, |
| "loss": -0.0027, |
| "reward": -1.1953125, |
| "reward_std": 0.58203125, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -1.1953125, |
| "step": 29 |
| }, |
| { |
| "completion_length": 433.0, |
| "epoch": 0.004014452027298274, |
| "grad_norm": 0.10361335426568985, |
| "kl": 0.0005174180259928107, |
| "learning_rate": 2.0053475935828878e-07, |
| "loss": -0.001, |
| "reward": -0.8125, |
| "reward_std": 0.55078125, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.8125, |
| "step": 30 |
| }, |
| { |
| "completion_length": 429.3333435058594, |
| "epoch": 0.004148267094874883, |
| "grad_norm": 0.09831919521093369, |
| "kl": 0.0004531377926468849, |
| "learning_rate": 2.0721925133689842e-07, |
| "loss": -0.0034, |
| "reward": -0.82421875, |
| "reward_std": 0.412109375, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.82421875, |
| "step": 31 |
| }, |
| { |
| "completion_length": 352.3333435058594, |
| "epoch": 0.004282082162451492, |
| "grad_norm": 0.1168479472398758, |
| "kl": 0.00041617939132265747, |
| "learning_rate": 2.1390374331550802e-07, |
| "loss": 0.012, |
| "reward": -0.671875, |
| "reward_std": 0.1455078125, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.671875, |
| "step": 32 |
| }, |
| { |
| "completion_length": 328.16668701171875, |
| "epoch": 0.004415897230028101, |
| "grad_norm": 0.14010493457317352, |
| "kl": 0.0006999190663918853, |
| "learning_rate": 2.2058823529411768e-07, |
| "loss": -0.0003, |
| "reward": -0.8203125, |
| "reward_std": 0.4609375, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.8203125, |
| "step": 33 |
| }, |
| { |
| "completion_length": 461.0, |
| "epoch": 0.00454971229760471, |
| "grad_norm": 0.07955824583768845, |
| "kl": 0.000317567668389529, |
| "learning_rate": 2.2727272727272729e-07, |
| "loss": 0.0061, |
| "reward": -0.7421875, |
| "reward_std": 0.107421875, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.7421875, |
| "step": 34 |
| }, |
| { |
| "completion_length": 343.5, |
| "epoch": 0.004683527365181319, |
| "grad_norm": 0.192967027425766, |
| "kl": 0.0003919226583093405, |
| "learning_rate": 2.3395721925133692e-07, |
| "loss": -0.0026, |
| "reward": -0.71875, |
| "reward_std": 0.26953125, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.71875, |
| "step": 35 |
| }, |
| { |
| "completion_length": 468.0, |
| "epoch": 0.004817342432757929, |
| "grad_norm": 0.1151042953133583, |
| "kl": 0.0005731440032832325, |
| "learning_rate": 2.4064171122994655e-07, |
| "loss": 0.0008, |
| "reward": -0.90625, |
| "reward_std": 0.4375, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.90625, |
| "step": 36 |
| }, |
| { |
| "completion_length": 326.66668701171875, |
| "epoch": 0.004951157500334538, |
| "grad_norm": 0.13014303147792816, |
| "kl": 0.0006222401279956102, |
| "learning_rate": 2.473262032085562e-07, |
| "loss": 0.0073, |
| "reward": -0.58984375, |
| "reward_std": 0.2177734375, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.58984375, |
| "step": 37 |
| }, |
| { |
| "completion_length": 470.16668701171875, |
| "epoch": 0.005084972567911147, |
| "grad_norm": 0.10929639637470245, |
| "kl": 0.0005664956988766789, |
| "learning_rate": 2.540106951871658e-07, |
| "loss": -0.001, |
| "reward": -1.2109375, |
| "reward_std": 0.451171875, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -1.2109375, |
| "step": 38 |
| }, |
| { |
| "completion_length": 350.5, |
| "epoch": 0.005218787635487756, |
| "grad_norm": 0.121163509786129, |
| "kl": 0.0006041490705683827, |
| "learning_rate": 2.606951871657754e-07, |
| "loss": -0.0012, |
| "reward": -0.65234375, |
| "reward_std": 0.404296875, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.65234375, |
| "step": 39 |
| }, |
| { |
| "completion_length": 441.5, |
| "epoch": 0.005352602703064365, |
| "grad_norm": 0.09024005383253098, |
| "kl": 0.0005421562236733735, |
| "learning_rate": 2.6737967914438503e-07, |
| "loss": 0.0012, |
| "reward": -0.765625, |
| "reward_std": 0.7734375, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.765625, |
| "step": 40 |
| }, |
| { |
| "completion_length": 587.8333740234375, |
| "epoch": 0.005486417770640974, |
| "grad_norm": 0.11247697472572327, |
| "kl": 0.0009425554308108985, |
| "learning_rate": 2.740641711229947e-07, |
| "loss": -0.0003, |
| "reward": -1.6875, |
| "reward_std": 0.6171875, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -1.6875, |
| "step": 41 |
| }, |
| { |
| "completion_length": 523.5, |
| "epoch": 0.005620232838217583, |
| "grad_norm": 0.08999153226613998, |
| "kl": 0.0004692915244959295, |
| "learning_rate": 2.807486631016043e-07, |
| "loss": 0.0003, |
| "reward": -0.796875, |
| "reward_std": 1.375, |
| "rewards/correctness_reward_func": 0.333984375, |
| "rewards/int_reward_func": 0.08349609375, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -1.2109375, |
| "step": 42 |
| }, |
| { |
| "completion_length": 531.0, |
| "epoch": 0.005754047905794192, |
| "grad_norm": 0.09678950905799866, |
| "kl": 0.0005171874072402716, |
| "learning_rate": 2.8743315508021395e-07, |
| "loss": -0.0044, |
| "reward": -1.421875, |
| "reward_std": 0.4140625, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -1.421875, |
| "step": 43 |
| }, |
| { |
| "completion_length": 416.0, |
| "epoch": 0.005887862973370801, |
| "grad_norm": 0.11189436912536621, |
| "kl": 0.00040408255881629884, |
| "learning_rate": 2.9411764705882356e-07, |
| "loss": 0.0029, |
| "reward": -0.921875, |
| "reward_std": 0.1884765625, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.921875, |
| "step": 44 |
| }, |
| { |
| "completion_length": 359.8333435058594, |
| "epoch": 0.00602167804094741, |
| "grad_norm": 0.10176176577806473, |
| "kl": 0.0005236775032244623, |
| "learning_rate": 3.0080213903743316e-07, |
| "loss": -0.0032, |
| "reward": -0.58203125, |
| "reward_std": 0.4140625, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.58203125, |
| "step": 45 |
| }, |
| { |
| "completion_length": 416.8333435058594, |
| "epoch": 0.00615549310852402, |
| "grad_norm": 0.12203460931777954, |
| "kl": 0.0006941946921870112, |
| "learning_rate": 3.074866310160428e-07, |
| "loss": 0.0008, |
| "reward": -1.15625, |
| "reward_std": 0.50390625, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -1.15625, |
| "step": 46 |
| }, |
| { |
| "completion_length": 524.8333740234375, |
| "epoch": 0.006289308176100629, |
| "grad_norm": 0.09807480126619339, |
| "kl": 0.000624034320935607, |
| "learning_rate": 3.1417112299465243e-07, |
| "loss": -0.0062, |
| "reward": -1.1875, |
| "reward_std": 0.396484375, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -1.1875, |
| "step": 47 |
| }, |
| { |
| "completion_length": 419.5, |
| "epoch": 0.006423123243677238, |
| "grad_norm": 0.11151473969221115, |
| "kl": 0.0005637712310999632, |
| "learning_rate": 3.208556149732621e-07, |
| "loss": -0.0017, |
| "reward": -0.94140625, |
| "reward_std": 0.5546875, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.94140625, |
| "step": 48 |
| }, |
| { |
| "completion_length": 375.3333435058594, |
| "epoch": 0.006556938311253847, |
| "grad_norm": 0.15443629026412964, |
| "kl": 0.0006895489059388638, |
| "learning_rate": 3.275401069518717e-07, |
| "loss": -0.0021, |
| "reward": -0.80078125, |
| "reward_std": 0.6015625, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.80078125, |
| "step": 49 |
| }, |
| { |
| "completion_length": 538.5, |
| "epoch": 0.006690753378830456, |
| "grad_norm": 0.12232497334480286, |
| "kl": 0.00044502606033347547, |
| "learning_rate": 3.3422459893048135e-07, |
| "loss": 0.0038, |
| "reward": -1.15625, |
| "reward_std": 0.345703125, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -1.15625, |
| "step": 50 |
| }, |
| { |
| "completion_length": 380.66668701171875, |
| "epoch": 0.006824568446407065, |
| "grad_norm": 0.09400169551372528, |
| "kl": 0.0004400149919092655, |
| "learning_rate": 3.409090909090909e-07, |
| "loss": -0.0005, |
| "reward": -0.75390625, |
| "reward_std": 0.75390625, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.75390625, |
| "step": 51 |
| }, |
| { |
| "completion_length": 302.16668701171875, |
| "epoch": 0.0069583835139836745, |
| "grad_norm": 0.18885326385498047, |
| "kl": 0.0006017067935317755, |
| "learning_rate": 3.4759358288770056e-07, |
| "loss": 0.0001, |
| "reward": -0.494140625, |
| "reward_std": 0.5078125, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.494140625, |
| "step": 52 |
| }, |
| { |
| "completion_length": 310.0, |
| "epoch": 0.0070921985815602835, |
| "grad_norm": 0.17508742213249207, |
| "kl": 0.0006495526758953929, |
| "learning_rate": 3.542780748663102e-07, |
| "loss": 0.0001, |
| "reward": -0.609375, |
| "reward_std": 0.208984375, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.609375, |
| "step": 53 |
| }, |
| { |
| "completion_length": 348.16668701171875, |
| "epoch": 0.0072260136491368926, |
| "grad_norm": 0.1143779456615448, |
| "kl": 0.0005849208100698888, |
| "learning_rate": 3.6096256684491983e-07, |
| "loss": -0.0023, |
| "reward": -0.8984375, |
| "reward_std": 0.423828125, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.8984375, |
| "step": 54 |
| }, |
| { |
| "completion_length": 461.5, |
| "epoch": 0.007359828716713502, |
| "grad_norm": 0.10026198625564575, |
| "kl": 0.0005551945068873465, |
| "learning_rate": 3.6764705882352943e-07, |
| "loss": -0.0088, |
| "reward": -1.03125, |
| "reward_std": 0.38671875, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -1.03125, |
| "step": 55 |
| }, |
| { |
| "completion_length": 581.1666870117188, |
| "epoch": 0.007493643784290111, |
| "grad_norm": 0.09014507383108139, |
| "kl": 0.0004388962115626782, |
| "learning_rate": 3.7433155080213904e-07, |
| "loss": 0.0007, |
| "reward": -1.328125, |
| "reward_std": 0.1474609375, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -1.328125, |
| "step": 56 |
| }, |
| { |
| "completion_length": 320.3333435058594, |
| "epoch": 0.0076274588518667205, |
| "grad_norm": 0.09987051039934158, |
| "kl": 0.0005903591518290341, |
| "learning_rate": 3.810160427807487e-07, |
| "loss": -0.0068, |
| "reward": -0.609375, |
| "reward_std": 0.2578125, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.609375, |
| "step": 57 |
| }, |
| { |
| "completion_length": 362.8333435058594, |
| "epoch": 0.00776127391944333, |
| "grad_norm": 0.246050164103508, |
| "kl": 0.0005056472145952284, |
| "learning_rate": 3.8770053475935836e-07, |
| "loss": -0.0027, |
| "reward": -0.625, |
| "reward_std": 0.5234375, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.625, |
| "step": 58 |
| }, |
| { |
| "completion_length": 524.0, |
| "epoch": 0.007895088987019938, |
| "grad_norm": 0.12084438651800156, |
| "kl": 0.0005575703689828515, |
| "learning_rate": 3.943850267379679e-07, |
| "loss": 0.0114, |
| "reward": -1.1328125, |
| "reward_std": 0.28125, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -1.1328125, |
| "step": 59 |
| }, |
| { |
| "completion_length": 410.3333435058594, |
| "epoch": 0.008028904054596548, |
| "grad_norm": 0.10101523995399475, |
| "kl": 0.0005777844344265759, |
| "learning_rate": 4.0106951871657757e-07, |
| "loss": 0.0007, |
| "reward": -0.94921875, |
| "reward_std": 0.271484375, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.94921875, |
| "step": 60 |
| }, |
| { |
| "completion_length": 717.6666870117188, |
| "epoch": 0.008162719122173156, |
| "grad_norm": 0.09220802038908005, |
| "kl": 0.0006241414812393486, |
| "learning_rate": 4.077540106951872e-07, |
| "loss": -0.0078, |
| "reward": -2.046875, |
| "reward_std": 0.53515625, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -2.046875, |
| "step": 61 |
| }, |
| { |
| "completion_length": 384.66668701171875, |
| "epoch": 0.008296534189749766, |
| "grad_norm": 0.10890569537878036, |
| "kl": 0.00048696936573833227, |
| "learning_rate": 4.1443850267379683e-07, |
| "loss": 0.0039, |
| "reward": -0.921875, |
| "reward_std": 0.1865234375, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.921875, |
| "step": 62 |
| }, |
| { |
| "completion_length": 271.3333435058594, |
| "epoch": 0.008430349257326376, |
| "grad_norm": 0.11486776173114777, |
| "kl": 0.0005599698051810265, |
| "learning_rate": 4.211229946524065e-07, |
| "loss": -0.0005, |
| "reward": -0.201171875, |
| "reward_std": 0.396484375, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.201171875, |
| "step": 63 |
| }, |
| { |
| "completion_length": 385.3333435058594, |
| "epoch": 0.008564164324902984, |
| "grad_norm": 0.1193244457244873, |
| "kl": 0.0006926630157977343, |
| "learning_rate": 4.2780748663101604e-07, |
| "loss": 0.0043, |
| "reward": -0.97265625, |
| "reward_std": 0.41015625, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.97265625, |
| "step": 64 |
| }, |
| { |
| "completion_length": 539.6666870117188, |
| "epoch": 0.008697979392479594, |
| "grad_norm": 0.09389720857143402, |
| "kl": 0.0004780918825417757, |
| "learning_rate": 4.344919786096257e-07, |
| "loss": 0.0052, |
| "reward": -1.234375, |
| "reward_std": 0.373046875, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -1.234375, |
| "step": 65 |
| }, |
| { |
| "completion_length": 259.8333435058594, |
| "epoch": 0.008831794460056202, |
| "grad_norm": 0.22691243886947632, |
| "kl": 0.0008878613589331508, |
| "learning_rate": 4.4117647058823536e-07, |
| "loss": -0.0048, |
| "reward": -0.51953125, |
| "reward_std": 0.251953125, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.51953125, |
| "step": 66 |
| }, |
| { |
| "completion_length": 461.16668701171875, |
| "epoch": 0.008965609527632812, |
| "grad_norm": 0.12113010138273239, |
| "kl": 0.0006317974766716361, |
| "learning_rate": 4.4786096256684497e-07, |
| "loss": -0.0067, |
| "reward": -1.15625, |
| "reward_std": 0.39453125, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -1.15625, |
| "step": 67 |
| }, |
| { |
| "completion_length": 399.8333435058594, |
| "epoch": 0.00909942459520942, |
| "grad_norm": 0.1679317206144333, |
| "kl": 0.000584149791393429, |
| "learning_rate": 4.5454545454545457e-07, |
| "loss": -0.0093, |
| "reward": -0.9609375, |
| "reward_std": 0.2060546875, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.9609375, |
| "step": 68 |
| }, |
| { |
| "completion_length": 381.0, |
| "epoch": 0.00923323966278603, |
| "grad_norm": 0.2019040584564209, |
| "kl": 0.0007442033383995295, |
| "learning_rate": 4.612299465240642e-07, |
| "loss": 0.0034, |
| "reward": -0.63671875, |
| "reward_std": 0.494140625, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.63671875, |
| "step": 69 |
| }, |
| { |
| "completion_length": 455.5, |
| "epoch": 0.009367054730362638, |
| "grad_norm": 0.09101377427577972, |
| "kl": 0.00046143907820805907, |
| "learning_rate": 4.6791443850267384e-07, |
| "loss": -0.0057, |
| "reward": -1.046875, |
| "reward_std": 0.6328125, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -1.046875, |
| "step": 70 |
| }, |
| { |
| "completion_length": 493.8333435058594, |
| "epoch": 0.009500869797939248, |
| "grad_norm": 0.09268555790185928, |
| "kl": 0.00048020537360571325, |
| "learning_rate": 4.745989304812835e-07, |
| "loss": -0.0021, |
| "reward": -1.28125, |
| "reward_std": 0.54296875, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -1.28125, |
| "step": 71 |
| }, |
| { |
| "completion_length": 601.6666870117188, |
| "epoch": 0.009634684865515858, |
| "grad_norm": 0.07598231732845306, |
| "kl": 0.0004928440321236849, |
| "learning_rate": 4.812834224598931e-07, |
| "loss": -0.0031, |
| "reward": -1.328125, |
| "reward_std": 0.74609375, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -1.328125, |
| "step": 72 |
| }, |
| { |
| "completion_length": 449.16668701171875, |
| "epoch": 0.009768499933092466, |
| "grad_norm": 0.1203397586941719, |
| "kl": 0.0006244009709917009, |
| "learning_rate": 4.879679144385027e-07, |
| "loss": -0.0055, |
| "reward": -1.0703125, |
| "reward_std": 0.474609375, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -1.0703125, |
| "step": 73 |
| }, |
| { |
| "completion_length": 499.0, |
| "epoch": 0.009902315000669076, |
| "grad_norm": 0.08029637485742569, |
| "kl": 0.0004115910269320011, |
| "learning_rate": 4.946524064171124e-07, |
| "loss": 0.001, |
| "reward": -1.1953125, |
| "reward_std": 0.5703125, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -1.1953125, |
| "step": 74 |
| }, |
| { |
| "completion_length": 318.8333435058594, |
| "epoch": 0.010036130068245684, |
| "grad_norm": 0.10725877434015274, |
| "kl": 0.0005362802767194808, |
| "learning_rate": 5.013368983957219e-07, |
| "loss": -0.0039, |
| "reward": -0.38671875, |
| "reward_std": 0.25390625, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.38671875, |
| "step": 75 |
| }, |
| { |
| "completion_length": 348.0, |
| "epoch": 0.010169945135822294, |
| "grad_norm": 0.1331893801689148, |
| "kl": 0.0006620581261813641, |
| "learning_rate": 5.080213903743316e-07, |
| "loss": -0.0007, |
| "reward": -0.84765625, |
| "reward_std": 0.486328125, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.84765625, |
| "step": 76 |
| }, |
| { |
| "completion_length": 456.66668701171875, |
| "epoch": 0.010303760203398902, |
| "grad_norm": 0.10820724815130234, |
| "kl": 0.0007615931099280715, |
| "learning_rate": 5.147058823529412e-07, |
| "loss": 0.0036, |
| "reward": -0.953125, |
| "reward_std": 0.59765625, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.953125, |
| "step": 77 |
| }, |
| { |
| "completion_length": 335.66668701171875, |
| "epoch": 0.010437575270975512, |
| "grad_norm": 0.13866935670375824, |
| "kl": 0.0005373357562348247, |
| "learning_rate": 5.213903743315508e-07, |
| "loss": 0.0013, |
| "reward": -0.58203125, |
| "reward_std": 0.3359375, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.58203125, |
| "step": 78 |
| }, |
| { |
| "completion_length": 338.0, |
| "epoch": 0.01057139033855212, |
| "grad_norm": 0.1531476229429245, |
| "kl": 0.000613297161180526, |
| "learning_rate": 5.280748663101604e-07, |
| "loss": -0.0006, |
| "reward": -0.6875, |
| "reward_std": 0.380859375, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.6875, |
| "step": 79 |
| }, |
| { |
| "completion_length": 323.8333435058594, |
| "epoch": 0.01070520540612873, |
| "grad_norm": 0.11174651980400085, |
| "kl": 0.00048220629105344415, |
| "learning_rate": 5.347593582887701e-07, |
| "loss": 0.0042, |
| "reward": -0.5546875, |
| "reward_std": 0.099609375, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.5546875, |
| "step": 80 |
| }, |
| { |
| "completion_length": 425.0, |
| "epoch": 0.010839020473705338, |
| "grad_norm": 0.06810642778873444, |
| "kl": 0.0002865367860067636, |
| "learning_rate": 5.414438502673798e-07, |
| "loss": 0.0087, |
| "reward": -0.921875, |
| "reward_std": 0.1455078125, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.921875, |
| "step": 81 |
| }, |
| { |
| "completion_length": 356.0, |
| "epoch": 0.010972835541281948, |
| "grad_norm": 0.12943901121616364, |
| "kl": 0.0005909207975491881, |
| "learning_rate": 5.481283422459894e-07, |
| "loss": -0.0014, |
| "reward": -0.80078125, |
| "reward_std": 0.2373046875, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.80078125, |
| "step": 82 |
| }, |
| { |
| "completion_length": 408.66668701171875, |
| "epoch": 0.011106650608858558, |
| "grad_norm": 0.10401128232479095, |
| "kl": 0.0005684032803401351, |
| "learning_rate": 5.54812834224599e-07, |
| "loss": 0.0067, |
| "reward": -0.828125, |
| "reward_std": 0.279296875, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.828125, |
| "step": 83 |
| }, |
| { |
| "completion_length": 365.5, |
| "epoch": 0.011240465676435166, |
| "grad_norm": 0.11247576773166656, |
| "kl": 0.0005961977876722813, |
| "learning_rate": 5.614973262032086e-07, |
| "loss": 0.0049, |
| "reward": -0.7890625, |
| "reward_std": 0.43359375, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.7890625, |
| "step": 84 |
| }, |
| { |
| "completion_length": 373.16668701171875, |
| "epoch": 0.011374280744011776, |
| "grad_norm": 0.13172324001789093, |
| "kl": 0.0006587211973965168, |
| "learning_rate": 5.681818181818182e-07, |
| "loss": 0.0, |
| "reward": -0.92578125, |
| "reward_std": 0.4296875, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.92578125, |
| "step": 85 |
| }, |
| { |
| "completion_length": 463.16668701171875, |
| "epoch": 0.011508095811588384, |
| "grad_norm": 0.0999283418059349, |
| "kl": 0.0005339820636436343, |
| "learning_rate": 5.748663101604279e-07, |
| "loss": -0.0073, |
| "reward": -1.1640625, |
| "reward_std": 0.478515625, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -1.1640625, |
| "step": 86 |
| }, |
| { |
| "completion_length": 346.16668701171875, |
| "epoch": 0.011641910879164994, |
| "grad_norm": 0.1427423655986786, |
| "kl": 0.0005965695017948747, |
| "learning_rate": 5.815508021390375e-07, |
| "loss": 0.0052, |
| "reward": -0.86328125, |
| "reward_std": 0.36328125, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.86328125, |
| "step": 87 |
| }, |
| { |
| "completion_length": 464.3333435058594, |
| "epoch": 0.011775725946741603, |
| "grad_norm": 0.09077266603708267, |
| "kl": 0.0005941446870565414, |
| "learning_rate": 5.882352941176471e-07, |
| "loss": -0.0034, |
| "reward": -0.96875, |
| "reward_std": 0.3203125, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.96875, |
| "step": 88 |
| }, |
| { |
| "completion_length": 444.3333435058594, |
| "epoch": 0.011909541014318212, |
| "grad_norm": 0.11555906385183334, |
| "kl": 0.0005076751112937927, |
| "learning_rate": 5.949197860962567e-07, |
| "loss": 0.0, |
| "reward": -0.98046875, |
| "reward_std": 0.3828125, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.98046875, |
| "step": 89 |
| }, |
| { |
| "completion_length": 381.66668701171875, |
| "epoch": 0.01204335608189482, |
| "grad_norm": 0.11006759107112885, |
| "kl": 0.0005213702679611742, |
| "learning_rate": 6.016042780748663e-07, |
| "loss": -0.0019, |
| "reward": -0.8125, |
| "reward_std": 0.51171875, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.8125, |
| "step": 90 |
| }, |
| { |
| "completion_length": 586.6666870117188, |
| "epoch": 0.01217717114947143, |
| "grad_norm": 0.0977427214384079, |
| "kl": 0.0004677172692026943, |
| "learning_rate": 6.08288770053476e-07, |
| "loss": 0.0059, |
| "reward": -1.2265625, |
| "reward_std": 0.5703125, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -1.2265625, |
| "step": 91 |
| }, |
| { |
| "completion_length": 523.5, |
| "epoch": 0.01231098621704804, |
| "grad_norm": 0.14523504674434662, |
| "kl": 0.0007363607874140143, |
| "learning_rate": 6.149732620320856e-07, |
| "loss": -0.0029, |
| "reward": -1.484375, |
| "reward_std": 0.828125, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -1.484375, |
| "step": 92 |
| }, |
| { |
| "completion_length": 452.5, |
| "epoch": 0.012444801284624649, |
| "grad_norm": 0.11730131506919861, |
| "kl": 0.0004316701088100672, |
| "learning_rate": 6.216577540106952e-07, |
| "loss": 0.0008, |
| "reward": -1.328125, |
| "reward_std": 0.82421875, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -1.328125, |
| "step": 93 |
| }, |
| { |
| "completion_length": 471.66668701171875, |
| "epoch": 0.012578616352201259, |
| "grad_norm": 0.12073160707950592, |
| "kl": 0.000504339870531112, |
| "learning_rate": 6.283422459893049e-07, |
| "loss": 0.0029, |
| "reward": -1.1796875, |
| "reward_std": 0.40625, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -1.1796875, |
| "step": 94 |
| }, |
| { |
| "completion_length": 291.66668701171875, |
| "epoch": 0.012712431419777867, |
| "grad_norm": 0.17788150906562805, |
| "kl": 0.0006855755927972496, |
| "learning_rate": 6.350267379679146e-07, |
| "loss": -0.0016, |
| "reward": -0.494140625, |
| "reward_std": 0.26171875, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.494140625, |
| "step": 95 |
| }, |
| { |
| "completion_length": 411.8333435058594, |
| "epoch": 0.012846246487354477, |
| "grad_norm": 0.08730936795473099, |
| "kl": 0.00039596876013092697, |
| "learning_rate": 6.417112299465242e-07, |
| "loss": 0.0014, |
| "reward": -0.890625, |
| "reward_std": 0.498046875, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.890625, |
| "step": 96 |
| }, |
| { |
| "completion_length": 376.5, |
| "epoch": 0.012980061554931085, |
| "grad_norm": 0.09182324260473251, |
| "kl": 0.0004653404466807842, |
| "learning_rate": 6.483957219251337e-07, |
| "loss": -0.0036, |
| "reward": -0.73828125, |
| "reward_std": 0.33203125, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.73828125, |
| "step": 97 |
| }, |
| { |
| "completion_length": 395.66668701171875, |
| "epoch": 0.013113876622507695, |
| "grad_norm": 0.12949968874454498, |
| "kl": 0.0005730512784793973, |
| "learning_rate": 6.550802139037434e-07, |
| "loss": -0.0027, |
| "reward": -0.8359375, |
| "reward_std": 0.49609375, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.8359375, |
| "step": 98 |
| }, |
| { |
| "completion_length": 600.0, |
| "epoch": 0.013247691690084303, |
| "grad_norm": 0.08795811235904694, |
| "kl": 0.000673401344101876, |
| "learning_rate": 6.61764705882353e-07, |
| "loss": 0.0018, |
| "reward": -1.78125, |
| "reward_std": 0.5078125, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -1.78125, |
| "step": 99 |
| }, |
| { |
| "completion_length": 468.0, |
| "epoch": 0.013381506757660913, |
| "grad_norm": 0.1182761937379837, |
| "kl": 0.0005245240754447877, |
| "learning_rate": 6.684491978609627e-07, |
| "loss": 0.0088, |
| "reward": -1.234375, |
| "reward_std": 0.4296875, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -1.234375, |
| "step": 100 |
| } |
| ], |
| "logging_steps": 1, |
| "max_steps": 7473, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 1, |
| "save_steps": 50, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": false |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 0.0, |
| "train_batch_size": 1, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|