| { |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 0.006690753378830456, |
| "eval_steps": 500, |
| "global_step": 50, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "completion_length": 698.1666870117188, |
| "epoch": 0.00013381506757660912, |
| "grad_norm": 0.07569596916437149, |
| "kl": 0.0006024616304785013, |
| "learning_rate": 6.684491978609626e-09, |
| "loss": 0.001, |
| "reward": -1.8359375, |
| "reward_std": 0.5859375, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -1.8359375, |
| "step": 1 |
| }, |
| { |
| "completion_length": 549.0, |
| "epoch": 0.00026763013515321824, |
| "grad_norm": 0.10156559199094772, |
| "kl": 0.0006554799037985504, |
| "learning_rate": 1.3368983957219251e-08, |
| "loss": -0.0055, |
| "reward": -1.21875, |
| "reward_std": 0.48828125, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -1.21875, |
| "step": 2 |
| }, |
| { |
| "completion_length": 509.66668701171875, |
| "epoch": 0.0004014452027298274, |
| "grad_norm": 0.1012749969959259, |
| "kl": 0.0006122777122072875, |
| "learning_rate": 2.005347593582888e-08, |
| "loss": 0.0032, |
| "reward": -1.2578125, |
| "reward_std": 0.37890625, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -1.2578125, |
| "step": 3 |
| }, |
| { |
| "completion_length": 387.8333435058594, |
| "epoch": 0.0005352602703064365, |
| "grad_norm": 0.10009913891553879, |
| "kl": 0.0005205385386943817, |
| "learning_rate": 2.6737967914438503e-08, |
| "loss": 0.0007, |
| "reward": -0.83203125, |
| "reward_std": 0.1640625, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.83203125, |
| "step": 4 |
| }, |
| { |
| "completion_length": 386.5, |
| "epoch": 0.0006690753378830456, |
| "grad_norm": 0.11404310166835785, |
| "kl": 0.00039041676791384816, |
| "learning_rate": 3.342245989304813e-08, |
| "loss": -0.0032, |
| "reward": -0.859375, |
| "reward_std": 0.1630859375, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.859375, |
| "step": 5 |
| }, |
| { |
| "completion_length": 345.8333435058594, |
| "epoch": 0.0008028904054596548, |
| "grad_norm": 0.13447555899620056, |
| "kl": 0.0005453471094369888, |
| "learning_rate": 4.010695187165776e-08, |
| "loss": 0.0036, |
| "reward": -0.7109375, |
| "reward_std": 0.357421875, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.7109375, |
| "step": 6 |
| }, |
| { |
| "completion_length": 426.16668701171875, |
| "epoch": 0.0009367054730362638, |
| "grad_norm": 0.1324268877506256, |
| "kl": 0.000606791814789176, |
| "learning_rate": 4.679144385026738e-08, |
| "loss": 0.0017, |
| "reward": -1.09375, |
| "reward_std": 0.72265625, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -1.09375, |
| "step": 7 |
| }, |
| { |
| "completion_length": 418.66668701171875, |
| "epoch": 0.001070520540612873, |
| "grad_norm": 0.12978878617286682, |
| "kl": 0.0005688891978934407, |
| "learning_rate": 5.3475935828877005e-08, |
| "loss": 0.0005, |
| "reward": -0.89453125, |
| "reward_std": 0.392578125, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.89453125, |
| "step": 8 |
| }, |
| { |
| "completion_length": 484.5, |
| "epoch": 0.0012043356081894822, |
| "grad_norm": 0.09955421835184097, |
| "kl": 0.0005112257204018533, |
| "learning_rate": 6.016042780748664e-08, |
| "loss": 0.0067, |
| "reward": -1.25, |
| "reward_std": 0.53515625, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -1.25, |
| "step": 9 |
| }, |
| { |
| "completion_length": 230.5, |
| "epoch": 0.0013381506757660913, |
| "grad_norm": 0.19524620473384857, |
| "kl": 0.0006619760533794761, |
| "learning_rate": 6.684491978609626e-08, |
| "loss": -0.0006, |
| "reward": -0.26953125, |
| "reward_std": 0.349609375, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.26953125, |
| "step": 10 |
| }, |
| { |
| "completion_length": 369.5, |
| "epoch": 0.0014719657433427003, |
| "grad_norm": 0.1019153892993927, |
| "kl": 0.0006552126724272966, |
| "learning_rate": 7.352941176470589e-08, |
| "loss": -0.004, |
| "reward": -0.94140625, |
| "reward_std": 0.279296875, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.94140625, |
| "step": 11 |
| }, |
| { |
| "completion_length": 386.16668701171875, |
| "epoch": 0.0016057808109193096, |
| "grad_norm": 0.09696059674024582, |
| "kl": 0.0004603694542311132, |
| "learning_rate": 8.021390374331552e-08, |
| "loss": 0.002, |
| "reward": -0.8671875, |
| "reward_std": 0.42578125, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.8671875, |
| "step": 12 |
| }, |
| { |
| "completion_length": 475.16668701171875, |
| "epoch": 0.0017395958784959186, |
| "grad_norm": 0.12413895130157471, |
| "kl": 0.0004793051048181951, |
| "learning_rate": 8.689839572192514e-08, |
| "loss": 0.0, |
| "reward": -0.9375, |
| "reward_std": 0.28125, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.9375, |
| "step": 13 |
| }, |
| { |
| "completion_length": 370.0, |
| "epoch": 0.0018734109460725277, |
| "grad_norm": 0.1305382251739502, |
| "kl": 0.0005513830110430717, |
| "learning_rate": 9.358288770053476e-08, |
| "loss": -0.0018, |
| "reward": -0.78515625, |
| "reward_std": 0.263671875, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.78515625, |
| "step": 14 |
| }, |
| { |
| "completion_length": 431.66668701171875, |
| "epoch": 0.002007226013649137, |
| "grad_norm": 0.10463520139455795, |
| "kl": 0.00048596435226500034, |
| "learning_rate": 1.0026737967914439e-07, |
| "loss": 0.0032, |
| "reward": -0.84375, |
| "reward_std": 0.3984375, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.84375, |
| "step": 15 |
| }, |
| { |
| "completion_length": 399.5, |
| "epoch": 0.002141041081225746, |
| "grad_norm": 0.1404961347579956, |
| "kl": 0.000555322621949017, |
| "learning_rate": 1.0695187165775401e-07, |
| "loss": -0.0057, |
| "reward": -1.0625, |
| "reward_std": 0.46484375, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -1.0625, |
| "step": 16 |
| }, |
| { |
| "completion_length": 449.8333435058594, |
| "epoch": 0.002274856148802355, |
| "grad_norm": 0.10250594466924667, |
| "kl": 0.00048121344298124313, |
| "learning_rate": 1.1363636363636364e-07, |
| "loss": -0.0071, |
| "reward": -1.0234375, |
| "reward_std": 0.40234375, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -1.0234375, |
| "step": 17 |
| }, |
| { |
| "completion_length": 324.5, |
| "epoch": 0.0024086712163789645, |
| "grad_norm": 0.12464314699172974, |
| "kl": 0.0005811881856061518, |
| "learning_rate": 1.2032085561497328e-07, |
| "loss": 0.0033, |
| "reward": -0.6875, |
| "reward_std": 0.26171875, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.6875, |
| "step": 18 |
| }, |
| { |
| "completion_length": 578.0, |
| "epoch": 0.0025424862839555735, |
| "grad_norm": 0.08823499828577042, |
| "kl": 0.000675913121085614, |
| "learning_rate": 1.270053475935829e-07, |
| "loss": 0.0075, |
| "reward": -1.703125, |
| "reward_std": 0.5703125, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -1.703125, |
| "step": 19 |
| }, |
| { |
| "completion_length": 328.8333435058594, |
| "epoch": 0.0026763013515321826, |
| "grad_norm": 0.16708222031593323, |
| "kl": 0.0006092819385230541, |
| "learning_rate": 1.3368983957219251e-07, |
| "loss": 0.0091, |
| "reward": -0.71484375, |
| "reward_std": 0.1357421875, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.71484375, |
| "step": 20 |
| }, |
| { |
| "completion_length": 415.16668701171875, |
| "epoch": 0.0028101164191087916, |
| "grad_norm": 0.10446464270353317, |
| "kl": 0.0004726095939986408, |
| "learning_rate": 1.4037433155080215e-07, |
| "loss": 0.0011, |
| "reward": -1.0, |
| "reward_std": 0.50390625, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -1.0, |
| "step": 21 |
| }, |
| { |
| "completion_length": 398.5, |
| "epoch": 0.0029439314866854006, |
| "grad_norm": 0.10892236977815628, |
| "kl": 0.000556222046725452, |
| "learning_rate": 1.4705882352941178e-07, |
| "loss": 0.0016, |
| "reward": -0.9765625, |
| "reward_std": 0.349609375, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.9765625, |
| "step": 22 |
| }, |
| { |
| "completion_length": 351.66668701171875, |
| "epoch": 0.00307774655426201, |
| "grad_norm": 0.13707049190998077, |
| "kl": 0.0005205090856179595, |
| "learning_rate": 1.537433155080214e-07, |
| "loss": -0.0032, |
| "reward": -0.7421875, |
| "reward_std": 0.390625, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.7421875, |
| "step": 23 |
| }, |
| { |
| "completion_length": 310.0, |
| "epoch": 0.003211561621838619, |
| "grad_norm": 0.1579124480485916, |
| "kl": 0.0007410722319036722, |
| "learning_rate": 1.6042780748663104e-07, |
| "loss": 0.0002, |
| "reward": -0.65625, |
| "reward_std": 0.6328125, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.65625, |
| "step": 24 |
| }, |
| { |
| "completion_length": 353.0, |
| "epoch": 0.003345376689415228, |
| "grad_norm": 0.11555790901184082, |
| "kl": 0.0005753459990955889, |
| "learning_rate": 1.6711229946524068e-07, |
| "loss": -0.0034, |
| "reward": -0.828125, |
| "reward_std": 0.314453125, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.828125, |
| "step": 25 |
| }, |
| { |
| "completion_length": 416.5, |
| "epoch": 0.0034791917569918372, |
| "grad_norm": 0.10537782311439514, |
| "kl": 0.0006076883291825652, |
| "learning_rate": 1.7379679144385028e-07, |
| "loss": -0.0068, |
| "reward": -0.8359375, |
| "reward_std": 0.30859375, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.8359375, |
| "step": 26 |
| }, |
| { |
| "completion_length": 436.0, |
| "epoch": 0.0036130068245684463, |
| "grad_norm": 0.12061028182506561, |
| "kl": 0.0006918934523127973, |
| "learning_rate": 1.8048128342245991e-07, |
| "loss": 0.0033, |
| "reward": -0.91015625, |
| "reward_std": 0.92578125, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.91015625, |
| "step": 27 |
| }, |
| { |
| "completion_length": 447.16668701171875, |
| "epoch": 0.0037468218921450553, |
| "grad_norm": 0.11236874759197235, |
| "kl": 0.0005188498180359602, |
| "learning_rate": 1.8716577540106952e-07, |
| "loss": -0.0021, |
| "reward": -1.078125, |
| "reward_std": 0.298828125, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -1.078125, |
| "step": 28 |
| }, |
| { |
| "completion_length": 524.5, |
| "epoch": 0.003880636959721665, |
| "grad_norm": 0.08638511598110199, |
| "kl": 0.000413873785873875, |
| "learning_rate": 1.9385026737967918e-07, |
| "loss": -0.0027, |
| "reward": -1.1953125, |
| "reward_std": 0.58203125, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -1.1953125, |
| "step": 29 |
| }, |
| { |
| "completion_length": 433.0, |
| "epoch": 0.004014452027298274, |
| "grad_norm": 0.10361335426568985, |
| "kl": 0.0005174180259928107, |
| "learning_rate": 2.0053475935828878e-07, |
| "loss": -0.001, |
| "reward": -0.8125, |
| "reward_std": 0.55078125, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.8125, |
| "step": 30 |
| }, |
| { |
| "completion_length": 429.3333435058594, |
| "epoch": 0.004148267094874883, |
| "grad_norm": 0.09831919521093369, |
| "kl": 0.0004531377926468849, |
| "learning_rate": 2.0721925133689842e-07, |
| "loss": -0.0034, |
| "reward": -0.82421875, |
| "reward_std": 0.412109375, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.82421875, |
| "step": 31 |
| }, |
| { |
| "completion_length": 352.3333435058594, |
| "epoch": 0.004282082162451492, |
| "grad_norm": 0.1168479472398758, |
| "kl": 0.00041617939132265747, |
| "learning_rate": 2.1390374331550802e-07, |
| "loss": 0.012, |
| "reward": -0.671875, |
| "reward_std": 0.1455078125, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.671875, |
| "step": 32 |
| }, |
| { |
| "completion_length": 328.16668701171875, |
| "epoch": 0.004415897230028101, |
| "grad_norm": 0.14010493457317352, |
| "kl": 0.0006999190663918853, |
| "learning_rate": 2.2058823529411768e-07, |
| "loss": -0.0003, |
| "reward": -0.8203125, |
| "reward_std": 0.4609375, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.8203125, |
| "step": 33 |
| }, |
| { |
| "completion_length": 461.0, |
| "epoch": 0.00454971229760471, |
| "grad_norm": 0.07955824583768845, |
| "kl": 0.000317567668389529, |
| "learning_rate": 2.2727272727272729e-07, |
| "loss": 0.0061, |
| "reward": -0.7421875, |
| "reward_std": 0.107421875, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.7421875, |
| "step": 34 |
| }, |
| { |
| "completion_length": 343.5, |
| "epoch": 0.004683527365181319, |
| "grad_norm": 0.192967027425766, |
| "kl": 0.0003919226583093405, |
| "learning_rate": 2.3395721925133692e-07, |
| "loss": -0.0026, |
| "reward": -0.71875, |
| "reward_std": 0.26953125, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.71875, |
| "step": 35 |
| }, |
| { |
| "completion_length": 468.0, |
| "epoch": 0.004817342432757929, |
| "grad_norm": 0.1151042953133583, |
| "kl": 0.0005731440032832325, |
| "learning_rate": 2.4064171122994655e-07, |
| "loss": 0.0008, |
| "reward": -0.90625, |
| "reward_std": 0.4375, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.90625, |
| "step": 36 |
| }, |
| { |
| "completion_length": 326.66668701171875, |
| "epoch": 0.004951157500334538, |
| "grad_norm": 0.13014303147792816, |
| "kl": 0.0006222401279956102, |
| "learning_rate": 2.473262032085562e-07, |
| "loss": 0.0073, |
| "reward": -0.58984375, |
| "reward_std": 0.2177734375, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.58984375, |
| "step": 37 |
| }, |
| { |
| "completion_length": 470.16668701171875, |
| "epoch": 0.005084972567911147, |
| "grad_norm": 0.10929639637470245, |
| "kl": 0.0005664956988766789, |
| "learning_rate": 2.540106951871658e-07, |
| "loss": -0.001, |
| "reward": -1.2109375, |
| "reward_std": 0.451171875, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -1.2109375, |
| "step": 38 |
| }, |
| { |
| "completion_length": 350.5, |
| "epoch": 0.005218787635487756, |
| "grad_norm": 0.121163509786129, |
| "kl": 0.0006041490705683827, |
| "learning_rate": 2.606951871657754e-07, |
| "loss": -0.0012, |
| "reward": -0.65234375, |
| "reward_std": 0.404296875, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.65234375, |
| "step": 39 |
| }, |
| { |
| "completion_length": 441.5, |
| "epoch": 0.005352602703064365, |
| "grad_norm": 0.09024005383253098, |
| "kl": 0.0005421562236733735, |
| "learning_rate": 2.6737967914438503e-07, |
| "loss": 0.0012, |
| "reward": -0.765625, |
| "reward_std": 0.7734375, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.765625, |
| "step": 40 |
| }, |
| { |
| "completion_length": 587.8333740234375, |
| "epoch": 0.005486417770640974, |
| "grad_norm": 0.11247697472572327, |
| "kl": 0.0009425554308108985, |
| "learning_rate": 2.740641711229947e-07, |
| "loss": -0.0003, |
| "reward": -1.6875, |
| "reward_std": 0.6171875, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -1.6875, |
| "step": 41 |
| }, |
| { |
| "completion_length": 523.5, |
| "epoch": 0.005620232838217583, |
| "grad_norm": 0.08999153226613998, |
| "kl": 0.0004692915244959295, |
| "learning_rate": 2.807486631016043e-07, |
| "loss": 0.0003, |
| "reward": -0.796875, |
| "reward_std": 1.375, |
| "rewards/correctness_reward_func": 0.333984375, |
| "rewards/int_reward_func": 0.08349609375, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -1.2109375, |
| "step": 42 |
| }, |
| { |
| "completion_length": 531.0, |
| "epoch": 0.005754047905794192, |
| "grad_norm": 0.09678950905799866, |
| "kl": 0.0005171874072402716, |
| "learning_rate": 2.8743315508021395e-07, |
| "loss": -0.0044, |
| "reward": -1.421875, |
| "reward_std": 0.4140625, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -1.421875, |
| "step": 43 |
| }, |
| { |
| "completion_length": 416.0, |
| "epoch": 0.005887862973370801, |
| "grad_norm": 0.11189436912536621, |
| "kl": 0.00040408255881629884, |
| "learning_rate": 2.9411764705882356e-07, |
| "loss": 0.0029, |
| "reward": -0.921875, |
| "reward_std": 0.1884765625, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.921875, |
| "step": 44 |
| }, |
| { |
| "completion_length": 359.8333435058594, |
| "epoch": 0.00602167804094741, |
| "grad_norm": 0.10176176577806473, |
| "kl": 0.0005236775032244623, |
| "learning_rate": 3.0080213903743316e-07, |
| "loss": -0.0032, |
| "reward": -0.58203125, |
| "reward_std": 0.4140625, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.58203125, |
| "step": 45 |
| }, |
| { |
| "completion_length": 416.8333435058594, |
| "epoch": 0.00615549310852402, |
| "grad_norm": 0.12203460931777954, |
| "kl": 0.0006941946921870112, |
| "learning_rate": 3.074866310160428e-07, |
| "loss": 0.0008, |
| "reward": -1.15625, |
| "reward_std": 0.50390625, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -1.15625, |
| "step": 46 |
| }, |
| { |
| "completion_length": 524.8333740234375, |
| "epoch": 0.006289308176100629, |
| "grad_norm": 0.09807480126619339, |
| "kl": 0.000624034320935607, |
| "learning_rate": 3.1417112299465243e-07, |
| "loss": -0.0062, |
| "reward": -1.1875, |
| "reward_std": 0.396484375, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -1.1875, |
| "step": 47 |
| }, |
| { |
| "completion_length": 419.5, |
| "epoch": 0.006423123243677238, |
| "grad_norm": 0.11151473969221115, |
| "kl": 0.0005637712310999632, |
| "learning_rate": 3.208556149732621e-07, |
| "loss": -0.0017, |
| "reward": -0.94140625, |
| "reward_std": 0.5546875, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.94140625, |
| "step": 48 |
| }, |
| { |
| "completion_length": 375.3333435058594, |
| "epoch": 0.006556938311253847, |
| "grad_norm": 0.15443629026412964, |
| "kl": 0.0006895489059388638, |
| "learning_rate": 3.275401069518717e-07, |
| "loss": -0.0021, |
| "reward": -0.80078125, |
| "reward_std": 0.6015625, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -0.80078125, |
| "step": 49 |
| }, |
| { |
| "completion_length": 538.5, |
| "epoch": 0.006690753378830456, |
| "grad_norm": 0.12232497334480286, |
| "kl": 0.00044502606033347547, |
| "learning_rate": 3.3422459893048135e-07, |
| "loss": 0.0038, |
| "reward": -1.15625, |
| "reward_std": 0.345703125, |
| "rewards/correctness_reward_func": 0.0, |
| "rewards/int_reward_func": 0.0, |
| "rewards/soft_format_reward_func": 0.0, |
| "rewards/strict_format_reward_func": 0.0, |
| "rewards/xmlcount_reward_func": -1.15625, |
| "step": 50 |
| } |
| ], |
| "logging_steps": 1, |
| "max_steps": 7473, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 1, |
| "save_steps": 50, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": false |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 0.0, |
| "train_batch_size": 1, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|