diff --git "a/checkpoint-200/trainer_state.json" "b/checkpoint-200/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoint-200/trainer_state.json" @@ -0,0 +1,3233 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.026763013515321826, + "eval_steps": 500, + "global_step": 200, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "completion_length": 698.1666870117188, + "epoch": 0.00013381506757660912, + "grad_norm": 0.07569596916437149, + "kl": 0.0006024616304785013, + "learning_rate": 6.684491978609626e-09, + "loss": 0.001, + "reward": -1.8359375, + "reward_std": 0.5859375, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -1.8359375, + "step": 1 + }, + { + "completion_length": 549.0, + "epoch": 0.00026763013515321824, + "grad_norm": 0.10156559199094772, + "kl": 0.0006554799037985504, + "learning_rate": 1.3368983957219251e-08, + "loss": -0.0055, + "reward": -1.21875, + "reward_std": 0.48828125, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -1.21875, + "step": 2 + }, + { + "completion_length": 509.66668701171875, + "epoch": 0.0004014452027298274, + "grad_norm": 0.1012749969959259, + "kl": 0.0006122777122072875, + "learning_rate": 2.005347593582888e-08, + "loss": 0.0032, + "reward": -1.2578125, + "reward_std": 0.37890625, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -1.2578125, + "step": 3 + }, + { + "completion_length": 387.8333435058594, + "epoch": 0.0005352602703064365, + "grad_norm": 0.10009913891553879, + "kl": 0.0005205385386943817, + "learning_rate": 2.6737967914438503e-08, + "loss": 0.0007, + "reward": -0.83203125, + "reward_std": 0.1640625, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.83203125, + "step": 4 + }, + { + "completion_length": 386.5, + "epoch": 0.0006690753378830456, + "grad_norm": 0.11404310166835785, + "kl": 0.00039041676791384816, + "learning_rate": 3.342245989304813e-08, + "loss": -0.0032, + "reward": -0.859375, + "reward_std": 0.1630859375, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.859375, + "step": 5 + }, + { + "completion_length": 345.8333435058594, + "epoch": 0.0008028904054596548, + "grad_norm": 0.13447555899620056, + "kl": 0.0005453471094369888, + "learning_rate": 4.010695187165776e-08, + "loss": 0.0036, + "reward": -0.7109375, + "reward_std": 0.357421875, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.7109375, + "step": 6 + }, + { + "completion_length": 426.16668701171875, + "epoch": 0.0009367054730362638, + "grad_norm": 0.1324268877506256, + "kl": 0.000606791814789176, + "learning_rate": 4.679144385026738e-08, + "loss": 0.0017, + "reward": -1.09375, + "reward_std": 0.72265625, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -1.09375, + "step": 7 + }, + { + "completion_length": 418.66668701171875, + "epoch": 0.001070520540612873, + "grad_norm": 0.12978878617286682, + "kl": 0.0005688891978934407, + "learning_rate": 5.3475935828877005e-08, + "loss": 0.0005, + "reward": -0.89453125, + "reward_std": 0.392578125, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.89453125, + "step": 8 + }, + { + "completion_length": 484.5, + "epoch": 0.0012043356081894822, + "grad_norm": 0.09955421835184097, + "kl": 0.0005112257204018533, + "learning_rate": 6.016042780748664e-08, + "loss": 0.0067, + "reward": -1.25, + "reward_std": 0.53515625, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -1.25, + "step": 9 + }, + { + "completion_length": 230.5, + "epoch": 0.0013381506757660913, + "grad_norm": 0.19524620473384857, + "kl": 0.0006619760533794761, + "learning_rate": 6.684491978609626e-08, + "loss": -0.0006, + "reward": -0.26953125, + "reward_std": 0.349609375, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.26953125, + "step": 10 + }, + { + "completion_length": 369.5, + "epoch": 0.0014719657433427003, + "grad_norm": 0.1019153892993927, + "kl": 0.0006552126724272966, + "learning_rate": 7.352941176470589e-08, + "loss": -0.004, + "reward": -0.94140625, + "reward_std": 0.279296875, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.94140625, + "step": 11 + }, + { + "completion_length": 386.16668701171875, + "epoch": 0.0016057808109193096, + "grad_norm": 0.09696059674024582, + "kl": 0.0004603694542311132, + "learning_rate": 8.021390374331552e-08, + "loss": 0.002, + "reward": -0.8671875, + "reward_std": 0.42578125, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.8671875, + "step": 12 + }, + { + "completion_length": 475.16668701171875, + "epoch": 0.0017395958784959186, + "grad_norm": 0.12413895130157471, + "kl": 0.0004793051048181951, + "learning_rate": 8.689839572192514e-08, + "loss": 0.0, + "reward": -0.9375, + "reward_std": 0.28125, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.9375, + "step": 13 + }, + { + "completion_length": 370.0, + "epoch": 0.0018734109460725277, + "grad_norm": 0.1305382251739502, + "kl": 0.0005513830110430717, + "learning_rate": 9.358288770053476e-08, + "loss": -0.0018, + "reward": -0.78515625, + "reward_std": 0.263671875, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.78515625, + "step": 14 + }, + { + "completion_length": 431.66668701171875, + "epoch": 0.002007226013649137, + "grad_norm": 0.10463520139455795, + "kl": 0.00048596435226500034, + "learning_rate": 1.0026737967914439e-07, + "loss": 0.0032, + "reward": -0.84375, + "reward_std": 0.3984375, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.84375, + "step": 15 + }, + { + "completion_length": 399.5, + "epoch": 0.002141041081225746, + "grad_norm": 0.1404961347579956, + "kl": 0.000555322621949017, + "learning_rate": 1.0695187165775401e-07, + "loss": -0.0057, + "reward": -1.0625, + "reward_std": 0.46484375, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -1.0625, + "step": 16 + }, + { + "completion_length": 449.8333435058594, + "epoch": 0.002274856148802355, + "grad_norm": 0.10250594466924667, + "kl": 0.00048121344298124313, + "learning_rate": 1.1363636363636364e-07, + "loss": -0.0071, + "reward": -1.0234375, + "reward_std": 0.40234375, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -1.0234375, + "step": 17 + }, + { + "completion_length": 324.5, + "epoch": 0.0024086712163789645, + "grad_norm": 0.12464314699172974, + "kl": 0.0005811881856061518, + "learning_rate": 1.2032085561497328e-07, + "loss": 0.0033, + "reward": -0.6875, + "reward_std": 0.26171875, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.6875, + "step": 18 + }, + { + "completion_length": 578.0, + "epoch": 0.0025424862839555735, + "grad_norm": 0.08823499828577042, + "kl": 0.000675913121085614, + "learning_rate": 1.270053475935829e-07, + "loss": 0.0075, + "reward": -1.703125, + "reward_std": 0.5703125, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -1.703125, + "step": 19 + }, + { + "completion_length": 328.8333435058594, + "epoch": 0.0026763013515321826, + "grad_norm": 0.16708222031593323, + "kl": 0.0006092819385230541, + "learning_rate": 1.3368983957219251e-07, + "loss": 0.0091, + "reward": -0.71484375, + "reward_std": 0.1357421875, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.71484375, + "step": 20 + }, + { + "completion_length": 415.16668701171875, + "epoch": 0.0028101164191087916, + "grad_norm": 0.10446464270353317, + "kl": 0.0004726095939986408, + "learning_rate": 1.4037433155080215e-07, + "loss": 0.0011, + "reward": -1.0, + "reward_std": 0.50390625, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -1.0, + "step": 21 + }, + { + "completion_length": 398.5, + "epoch": 0.0029439314866854006, + "grad_norm": 0.10892236977815628, + "kl": 0.000556222046725452, + "learning_rate": 1.4705882352941178e-07, + "loss": 0.0016, + "reward": -0.9765625, + "reward_std": 0.349609375, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.9765625, + "step": 22 + }, + { + "completion_length": 351.66668701171875, + "epoch": 0.00307774655426201, + "grad_norm": 0.13707049190998077, + "kl": 0.0005205090856179595, + "learning_rate": 1.537433155080214e-07, + "loss": -0.0032, + "reward": -0.7421875, + "reward_std": 0.390625, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.7421875, + "step": 23 + }, + { + "completion_length": 310.0, + "epoch": 0.003211561621838619, + "grad_norm": 0.1579124480485916, + "kl": 0.0007410722319036722, + "learning_rate": 1.6042780748663104e-07, + "loss": 0.0002, + "reward": -0.65625, + "reward_std": 0.6328125, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.65625, + "step": 24 + }, + { + "completion_length": 353.0, + "epoch": 0.003345376689415228, + "grad_norm": 0.11555790901184082, + "kl": 0.0005753459990955889, + "learning_rate": 1.6711229946524068e-07, + "loss": -0.0034, + "reward": -0.828125, + "reward_std": 0.314453125, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.828125, + "step": 25 + }, + { + "completion_length": 416.5, + "epoch": 0.0034791917569918372, + "grad_norm": 0.10537782311439514, + "kl": 0.0006076883291825652, + "learning_rate": 1.7379679144385028e-07, + "loss": -0.0068, + "reward": -0.8359375, + "reward_std": 0.30859375, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.8359375, + "step": 26 + }, + { + "completion_length": 436.0, + "epoch": 0.0036130068245684463, + "grad_norm": 0.12061028182506561, + "kl": 0.0006918934523127973, + "learning_rate": 1.8048128342245991e-07, + "loss": 0.0033, + "reward": -0.91015625, + "reward_std": 0.92578125, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.91015625, + "step": 27 + }, + { + "completion_length": 447.16668701171875, + "epoch": 0.0037468218921450553, + "grad_norm": 0.11236874759197235, + "kl": 0.0005188498180359602, + "learning_rate": 1.8716577540106952e-07, + "loss": -0.0021, + "reward": -1.078125, + "reward_std": 0.298828125, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -1.078125, + "step": 28 + }, + { + "completion_length": 524.5, + "epoch": 0.003880636959721665, + "grad_norm": 0.08638511598110199, + "kl": 0.000413873785873875, + "learning_rate": 1.9385026737967918e-07, + "loss": -0.0027, + "reward": -1.1953125, + "reward_std": 0.58203125, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -1.1953125, + "step": 29 + }, + { + "completion_length": 433.0, + "epoch": 0.004014452027298274, + "grad_norm": 0.10361335426568985, + "kl": 0.0005174180259928107, + "learning_rate": 2.0053475935828878e-07, + "loss": -0.001, + "reward": -0.8125, + "reward_std": 0.55078125, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.8125, + "step": 30 + }, + { + "completion_length": 429.3333435058594, + "epoch": 0.004148267094874883, + "grad_norm": 0.09831919521093369, + "kl": 0.0004531377926468849, + "learning_rate": 2.0721925133689842e-07, + "loss": -0.0034, + "reward": -0.82421875, + "reward_std": 0.412109375, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.82421875, + "step": 31 + }, + { + "completion_length": 352.3333435058594, + "epoch": 0.004282082162451492, + "grad_norm": 0.1168479472398758, + "kl": 0.00041617939132265747, + "learning_rate": 2.1390374331550802e-07, + "loss": 0.012, + "reward": -0.671875, + "reward_std": 0.1455078125, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.671875, + "step": 32 + }, + { + "completion_length": 328.16668701171875, + "epoch": 0.004415897230028101, + "grad_norm": 0.14010493457317352, + "kl": 0.0006999190663918853, + "learning_rate": 2.2058823529411768e-07, + "loss": -0.0003, + "reward": -0.8203125, + "reward_std": 0.4609375, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.8203125, + "step": 33 + }, + { + "completion_length": 461.0, + "epoch": 0.00454971229760471, + "grad_norm": 0.07955824583768845, + "kl": 0.000317567668389529, + "learning_rate": 2.2727272727272729e-07, + "loss": 0.0061, + "reward": -0.7421875, + "reward_std": 0.107421875, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.7421875, + "step": 34 + }, + { + "completion_length": 343.5, + "epoch": 0.004683527365181319, + "grad_norm": 0.192967027425766, + "kl": 0.0003919226583093405, + "learning_rate": 2.3395721925133692e-07, + "loss": -0.0026, + "reward": -0.71875, + "reward_std": 0.26953125, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.71875, + "step": 35 + }, + { + "completion_length": 468.0, + "epoch": 0.004817342432757929, + "grad_norm": 0.1151042953133583, + "kl": 0.0005731440032832325, + "learning_rate": 2.4064171122994655e-07, + "loss": 0.0008, + "reward": -0.90625, + "reward_std": 0.4375, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.90625, + "step": 36 + }, + { + "completion_length": 326.66668701171875, + "epoch": 0.004951157500334538, + "grad_norm": 0.13014303147792816, + "kl": 0.0006222401279956102, + "learning_rate": 2.473262032085562e-07, + "loss": 0.0073, + "reward": -0.58984375, + "reward_std": 0.2177734375, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.58984375, + "step": 37 + }, + { + "completion_length": 470.16668701171875, + "epoch": 0.005084972567911147, + "grad_norm": 0.10929639637470245, + "kl": 0.0005664956988766789, + "learning_rate": 2.540106951871658e-07, + "loss": -0.001, + "reward": -1.2109375, + "reward_std": 0.451171875, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -1.2109375, + "step": 38 + }, + { + "completion_length": 350.5, + "epoch": 0.005218787635487756, + "grad_norm": 0.121163509786129, + "kl": 0.0006041490705683827, + "learning_rate": 2.606951871657754e-07, + "loss": -0.0012, + "reward": -0.65234375, + "reward_std": 0.404296875, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.65234375, + "step": 39 + }, + { + "completion_length": 441.5, + "epoch": 0.005352602703064365, + "grad_norm": 0.09024005383253098, + "kl": 0.0005421562236733735, + "learning_rate": 2.6737967914438503e-07, + "loss": 0.0012, + "reward": -0.765625, + "reward_std": 0.7734375, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.765625, + "step": 40 + }, + { + "completion_length": 587.8333740234375, + "epoch": 0.005486417770640974, + "grad_norm": 0.11247697472572327, + "kl": 0.0009425554308108985, + "learning_rate": 2.740641711229947e-07, + "loss": -0.0003, + "reward": -1.6875, + "reward_std": 0.6171875, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -1.6875, + "step": 41 + }, + { + "completion_length": 523.5, + "epoch": 0.005620232838217583, + "grad_norm": 0.08999153226613998, + "kl": 0.0004692915244959295, + "learning_rate": 2.807486631016043e-07, + "loss": 0.0003, + "reward": -0.796875, + "reward_std": 1.375, + "rewards/correctness_reward_func": 0.333984375, + "rewards/int_reward_func": 0.08349609375, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -1.2109375, + "step": 42 + }, + { + "completion_length": 531.0, + "epoch": 0.005754047905794192, + "grad_norm": 0.09678950905799866, + "kl": 0.0005171874072402716, + "learning_rate": 2.8743315508021395e-07, + "loss": -0.0044, + "reward": -1.421875, + "reward_std": 0.4140625, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -1.421875, + "step": 43 + }, + { + "completion_length": 416.0, + "epoch": 0.005887862973370801, + "grad_norm": 0.11189436912536621, + "kl": 0.00040408255881629884, + "learning_rate": 2.9411764705882356e-07, + "loss": 0.0029, + "reward": -0.921875, + "reward_std": 0.1884765625, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.921875, + "step": 44 + }, + { + "completion_length": 359.8333435058594, + "epoch": 0.00602167804094741, + "grad_norm": 0.10176176577806473, + "kl": 0.0005236775032244623, + "learning_rate": 3.0080213903743316e-07, + "loss": -0.0032, + "reward": -0.58203125, + "reward_std": 0.4140625, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.58203125, + "step": 45 + }, + { + "completion_length": 416.8333435058594, + "epoch": 0.00615549310852402, + "grad_norm": 0.12203460931777954, + "kl": 0.0006941946921870112, + "learning_rate": 3.074866310160428e-07, + "loss": 0.0008, + "reward": -1.15625, + "reward_std": 0.50390625, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -1.15625, + "step": 46 + }, + { + "completion_length": 524.8333740234375, + "epoch": 0.006289308176100629, + "grad_norm": 0.09807480126619339, + "kl": 0.000624034320935607, + "learning_rate": 3.1417112299465243e-07, + "loss": -0.0062, + "reward": -1.1875, + "reward_std": 0.396484375, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -1.1875, + "step": 47 + }, + { + "completion_length": 419.5, + "epoch": 0.006423123243677238, + "grad_norm": 0.11151473969221115, + "kl": 0.0005637712310999632, + "learning_rate": 3.208556149732621e-07, + "loss": -0.0017, + "reward": -0.94140625, + "reward_std": 0.5546875, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.94140625, + "step": 48 + }, + { + "completion_length": 375.3333435058594, + "epoch": 0.006556938311253847, + "grad_norm": 0.15443629026412964, + "kl": 0.0006895489059388638, + "learning_rate": 3.275401069518717e-07, + "loss": -0.0021, + "reward": -0.80078125, + "reward_std": 0.6015625, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.80078125, + "step": 49 + }, + { + "completion_length": 538.5, + "epoch": 0.006690753378830456, + "grad_norm": 0.12232497334480286, + "kl": 0.00044502606033347547, + "learning_rate": 3.3422459893048135e-07, + "loss": 0.0038, + "reward": -1.15625, + "reward_std": 0.345703125, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -1.15625, + "step": 50 + }, + { + "completion_length": 380.66668701171875, + "epoch": 0.006824568446407065, + "grad_norm": 0.09400169551372528, + "kl": 0.0004400149919092655, + "learning_rate": 3.409090909090909e-07, + "loss": -0.0005, + "reward": -0.75390625, + "reward_std": 0.75390625, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.75390625, + "step": 51 + }, + { + "completion_length": 302.16668701171875, + "epoch": 0.0069583835139836745, + "grad_norm": 0.18885326385498047, + "kl": 0.0006017067935317755, + "learning_rate": 3.4759358288770056e-07, + "loss": 0.0001, + "reward": -0.494140625, + "reward_std": 0.5078125, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.494140625, + "step": 52 + }, + { + "completion_length": 310.0, + "epoch": 0.0070921985815602835, + "grad_norm": 0.17508742213249207, + "kl": 0.0006495526758953929, + "learning_rate": 3.542780748663102e-07, + "loss": 0.0001, + "reward": -0.609375, + "reward_std": 0.208984375, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.609375, + "step": 53 + }, + { + "completion_length": 348.16668701171875, + "epoch": 0.0072260136491368926, + "grad_norm": 0.1143779456615448, + "kl": 0.0005849208100698888, + "learning_rate": 3.6096256684491983e-07, + "loss": -0.0023, + "reward": -0.8984375, + "reward_std": 0.423828125, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.8984375, + "step": 54 + }, + { + "completion_length": 461.5, + "epoch": 0.007359828716713502, + "grad_norm": 0.10026198625564575, + "kl": 0.0005551945068873465, + "learning_rate": 3.6764705882352943e-07, + "loss": -0.0088, + "reward": -1.03125, + "reward_std": 0.38671875, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -1.03125, + "step": 55 + }, + { + "completion_length": 581.1666870117188, + "epoch": 0.007493643784290111, + "grad_norm": 0.09014507383108139, + "kl": 0.0004388962115626782, + "learning_rate": 3.7433155080213904e-07, + "loss": 0.0007, + "reward": -1.328125, + "reward_std": 0.1474609375, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -1.328125, + "step": 56 + }, + { + "completion_length": 320.3333435058594, + "epoch": 0.0076274588518667205, + "grad_norm": 0.09987051039934158, + "kl": 0.0005903591518290341, + "learning_rate": 3.810160427807487e-07, + "loss": -0.0068, + "reward": -0.609375, + "reward_std": 0.2578125, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.609375, + "step": 57 + }, + { + "completion_length": 362.8333435058594, + "epoch": 0.00776127391944333, + "grad_norm": 0.246050164103508, + "kl": 0.0005056472145952284, + "learning_rate": 3.8770053475935836e-07, + "loss": -0.0027, + "reward": -0.625, + "reward_std": 0.5234375, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.625, + "step": 58 + }, + { + "completion_length": 524.0, + "epoch": 0.007895088987019938, + "grad_norm": 0.12084438651800156, + "kl": 0.0005575703689828515, + "learning_rate": 3.943850267379679e-07, + "loss": 0.0114, + "reward": -1.1328125, + "reward_std": 0.28125, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -1.1328125, + "step": 59 + }, + { + "completion_length": 410.3333435058594, + "epoch": 0.008028904054596548, + "grad_norm": 0.10101523995399475, + "kl": 0.0005777844344265759, + "learning_rate": 4.0106951871657757e-07, + "loss": 0.0007, + "reward": -0.94921875, + "reward_std": 0.271484375, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.94921875, + "step": 60 + }, + { + "completion_length": 717.6666870117188, + "epoch": 0.008162719122173156, + "grad_norm": 0.09220802038908005, + "kl": 0.0006241414812393486, + "learning_rate": 4.077540106951872e-07, + "loss": -0.0078, + "reward": -2.046875, + "reward_std": 0.53515625, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -2.046875, + "step": 61 + }, + { + "completion_length": 384.66668701171875, + "epoch": 0.008296534189749766, + "grad_norm": 0.10890569537878036, + "kl": 0.00048696936573833227, + "learning_rate": 4.1443850267379683e-07, + "loss": 0.0039, + "reward": -0.921875, + "reward_std": 0.1865234375, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.921875, + "step": 62 + }, + { + "completion_length": 271.3333435058594, + "epoch": 0.008430349257326376, + "grad_norm": 0.11486776173114777, + "kl": 0.0005599698051810265, + "learning_rate": 4.211229946524065e-07, + "loss": -0.0005, + "reward": -0.201171875, + "reward_std": 0.396484375, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.201171875, + "step": 63 + }, + { + "completion_length": 385.3333435058594, + "epoch": 0.008564164324902984, + "grad_norm": 0.1193244457244873, + "kl": 0.0006926630157977343, + "learning_rate": 4.2780748663101604e-07, + "loss": 0.0043, + "reward": -0.97265625, + "reward_std": 0.41015625, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.97265625, + "step": 64 + }, + { + "completion_length": 539.6666870117188, + "epoch": 0.008697979392479594, + "grad_norm": 0.09389720857143402, + "kl": 0.0004780918825417757, + "learning_rate": 4.344919786096257e-07, + "loss": 0.0052, + "reward": -1.234375, + "reward_std": 0.373046875, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -1.234375, + "step": 65 + }, + { + "completion_length": 259.8333435058594, + "epoch": 0.008831794460056202, + "grad_norm": 0.22691243886947632, + "kl": 0.0008878613589331508, + "learning_rate": 4.4117647058823536e-07, + "loss": -0.0048, + "reward": -0.51953125, + "reward_std": 0.251953125, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.51953125, + "step": 66 + }, + { + "completion_length": 461.16668701171875, + "epoch": 0.008965609527632812, + "grad_norm": 0.12113010138273239, + "kl": 0.0006317974766716361, + "learning_rate": 4.4786096256684497e-07, + "loss": -0.0067, + "reward": -1.15625, + "reward_std": 0.39453125, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -1.15625, + "step": 67 + }, + { + "completion_length": 399.8333435058594, + "epoch": 0.00909942459520942, + "grad_norm": 0.1679317206144333, + "kl": 0.000584149791393429, + "learning_rate": 4.5454545454545457e-07, + "loss": -0.0093, + "reward": -0.9609375, + "reward_std": 0.2060546875, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.9609375, + "step": 68 + }, + { + "completion_length": 381.0, + "epoch": 0.00923323966278603, + "grad_norm": 0.2019040584564209, + "kl": 0.0007442033383995295, + "learning_rate": 4.612299465240642e-07, + "loss": 0.0034, + "reward": -0.63671875, + "reward_std": 0.494140625, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.63671875, + "step": 69 + }, + { + "completion_length": 455.5, + "epoch": 0.009367054730362638, + "grad_norm": 0.09101377427577972, + "kl": 0.00046143907820805907, + "learning_rate": 4.6791443850267384e-07, + "loss": -0.0057, + "reward": -1.046875, + "reward_std": 0.6328125, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -1.046875, + "step": 70 + }, + { + "completion_length": 493.8333435058594, + "epoch": 0.009500869797939248, + "grad_norm": 0.09268555790185928, + "kl": 0.00048020537360571325, + "learning_rate": 4.745989304812835e-07, + "loss": -0.0021, + "reward": -1.28125, + "reward_std": 0.54296875, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -1.28125, + "step": 71 + }, + { + "completion_length": 601.6666870117188, + "epoch": 0.009634684865515858, + "grad_norm": 0.07598231732845306, + "kl": 0.0004928440321236849, + "learning_rate": 4.812834224598931e-07, + "loss": -0.0031, + "reward": -1.328125, + "reward_std": 0.74609375, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -1.328125, + "step": 72 + }, + { + "completion_length": 449.16668701171875, + "epoch": 0.009768499933092466, + "grad_norm": 0.1203397586941719, + "kl": 0.0006244009709917009, + "learning_rate": 4.879679144385027e-07, + "loss": -0.0055, + "reward": -1.0703125, + "reward_std": 0.474609375, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -1.0703125, + "step": 73 + }, + { + "completion_length": 499.0, + "epoch": 0.009902315000669076, + "grad_norm": 0.08029637485742569, + "kl": 0.0004115910269320011, + "learning_rate": 4.946524064171124e-07, + "loss": 0.001, + "reward": -1.1953125, + "reward_std": 0.5703125, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -1.1953125, + "step": 74 + }, + { + "completion_length": 318.8333435058594, + "epoch": 0.010036130068245684, + "grad_norm": 0.10725877434015274, + "kl": 0.0005362802767194808, + "learning_rate": 5.013368983957219e-07, + "loss": -0.0039, + "reward": -0.38671875, + "reward_std": 0.25390625, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.38671875, + "step": 75 + }, + { + "completion_length": 348.0, + "epoch": 0.010169945135822294, + "grad_norm": 0.1331893801689148, + "kl": 0.0006620581261813641, + "learning_rate": 5.080213903743316e-07, + "loss": -0.0007, + "reward": -0.84765625, + "reward_std": 0.486328125, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.84765625, + "step": 76 + }, + { + "completion_length": 456.66668701171875, + "epoch": 0.010303760203398902, + "grad_norm": 0.10820724815130234, + "kl": 0.0007615931099280715, + "learning_rate": 5.147058823529412e-07, + "loss": 0.0036, + "reward": -0.953125, + "reward_std": 0.59765625, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.953125, + "step": 77 + }, + { + "completion_length": 335.66668701171875, + "epoch": 0.010437575270975512, + "grad_norm": 0.13866935670375824, + "kl": 0.0005373357562348247, + "learning_rate": 5.213903743315508e-07, + "loss": 0.0013, + "reward": -0.58203125, + "reward_std": 0.3359375, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.58203125, + "step": 78 + }, + { + "completion_length": 338.0, + "epoch": 0.01057139033855212, + "grad_norm": 0.1531476229429245, + "kl": 0.000613297161180526, + "learning_rate": 5.280748663101604e-07, + "loss": -0.0006, + "reward": -0.6875, + "reward_std": 0.380859375, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.6875, + "step": 79 + }, + { + "completion_length": 323.8333435058594, + "epoch": 0.01070520540612873, + "grad_norm": 0.11174651980400085, + "kl": 0.00048220629105344415, + "learning_rate": 5.347593582887701e-07, + "loss": 0.0042, + "reward": -0.5546875, + "reward_std": 0.099609375, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.5546875, + "step": 80 + }, + { + "completion_length": 425.0, + "epoch": 0.010839020473705338, + "grad_norm": 0.06810642778873444, + "kl": 0.0002865367860067636, + "learning_rate": 5.414438502673798e-07, + "loss": 0.0087, + "reward": -0.921875, + "reward_std": 0.1455078125, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.921875, + "step": 81 + }, + { + "completion_length": 356.0, + "epoch": 0.010972835541281948, + "grad_norm": 0.12943901121616364, + "kl": 0.0005909207975491881, + "learning_rate": 5.481283422459894e-07, + "loss": -0.0014, + "reward": -0.80078125, + "reward_std": 0.2373046875, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.80078125, + "step": 82 + }, + { + "completion_length": 408.66668701171875, + "epoch": 0.011106650608858558, + "grad_norm": 0.10401128232479095, + "kl": 0.0005684032803401351, + "learning_rate": 5.54812834224599e-07, + "loss": 0.0067, + "reward": -0.828125, + "reward_std": 0.279296875, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.828125, + "step": 83 + }, + { + "completion_length": 365.5, + "epoch": 0.011240465676435166, + "grad_norm": 0.11247576773166656, + "kl": 0.0005961977876722813, + "learning_rate": 5.614973262032086e-07, + "loss": 0.0049, + "reward": -0.7890625, + "reward_std": 0.43359375, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.7890625, + "step": 84 + }, + { + "completion_length": 373.16668701171875, + "epoch": 0.011374280744011776, + "grad_norm": 0.13172324001789093, + "kl": 0.0006587211973965168, + "learning_rate": 5.681818181818182e-07, + "loss": 0.0, + "reward": -0.92578125, + "reward_std": 0.4296875, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.92578125, + "step": 85 + }, + { + "completion_length": 463.16668701171875, + "epoch": 0.011508095811588384, + "grad_norm": 0.0999283418059349, + "kl": 0.0005339820636436343, + "learning_rate": 5.748663101604279e-07, + "loss": -0.0073, + "reward": -1.1640625, + "reward_std": 0.478515625, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -1.1640625, + "step": 86 + }, + { + "completion_length": 346.16668701171875, + "epoch": 0.011641910879164994, + "grad_norm": 0.1427423655986786, + "kl": 0.0005965695017948747, + "learning_rate": 5.815508021390375e-07, + "loss": 0.0052, + "reward": -0.86328125, + "reward_std": 0.36328125, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.86328125, + "step": 87 + }, + { + "completion_length": 464.3333435058594, + "epoch": 0.011775725946741603, + "grad_norm": 0.09077266603708267, + "kl": 0.0005941446870565414, + "learning_rate": 5.882352941176471e-07, + "loss": -0.0034, + "reward": -0.96875, + "reward_std": 0.3203125, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.96875, + "step": 88 + }, + { + "completion_length": 444.3333435058594, + "epoch": 0.011909541014318212, + "grad_norm": 0.11555906385183334, + "kl": 0.0005076751112937927, + "learning_rate": 5.949197860962567e-07, + "loss": 0.0, + "reward": -0.98046875, + "reward_std": 0.3828125, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.98046875, + "step": 89 + }, + { + "completion_length": 381.66668701171875, + "epoch": 0.01204335608189482, + "grad_norm": 0.11006759107112885, + "kl": 0.0005213702679611742, + "learning_rate": 6.016042780748663e-07, + "loss": -0.0019, + "reward": -0.8125, + "reward_std": 0.51171875, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.8125, + "step": 90 + }, + { + "completion_length": 586.6666870117188, + "epoch": 0.01217717114947143, + "grad_norm": 0.0977427214384079, + "kl": 0.0004677172692026943, + "learning_rate": 6.08288770053476e-07, + "loss": 0.0059, + "reward": -1.2265625, + "reward_std": 0.5703125, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -1.2265625, + "step": 91 + }, + { + "completion_length": 523.5, + "epoch": 0.01231098621704804, + "grad_norm": 0.14523504674434662, + "kl": 0.0007363607874140143, + "learning_rate": 6.149732620320856e-07, + "loss": -0.0029, + "reward": -1.484375, + "reward_std": 0.828125, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -1.484375, + "step": 92 + }, + { + "completion_length": 452.5, + "epoch": 0.012444801284624649, + "grad_norm": 0.11730131506919861, + "kl": 0.0004316701088100672, + "learning_rate": 6.216577540106952e-07, + "loss": 0.0008, + "reward": -1.328125, + "reward_std": 0.82421875, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -1.328125, + "step": 93 + }, + { + "completion_length": 471.66668701171875, + "epoch": 0.012578616352201259, + "grad_norm": 0.12073160707950592, + "kl": 0.000504339870531112, + "learning_rate": 6.283422459893049e-07, + "loss": 0.0029, + "reward": -1.1796875, + "reward_std": 0.40625, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -1.1796875, + "step": 94 + }, + { + "completion_length": 291.66668701171875, + "epoch": 0.012712431419777867, + "grad_norm": 0.17788150906562805, + "kl": 0.0006855755927972496, + "learning_rate": 6.350267379679146e-07, + "loss": -0.0016, + "reward": -0.494140625, + "reward_std": 0.26171875, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.494140625, + "step": 95 + }, + { + "completion_length": 411.8333435058594, + "epoch": 0.012846246487354477, + "grad_norm": 0.08730936795473099, + "kl": 0.00039596876013092697, + "learning_rate": 6.417112299465242e-07, + "loss": 0.0014, + "reward": -0.890625, + "reward_std": 0.498046875, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.890625, + "step": 96 + }, + { + "completion_length": 376.5, + "epoch": 0.012980061554931085, + "grad_norm": 0.09182324260473251, + "kl": 0.0004653404466807842, + "learning_rate": 6.483957219251337e-07, + "loss": -0.0036, + "reward": -0.73828125, + "reward_std": 0.33203125, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.73828125, + "step": 97 + }, + { + "completion_length": 395.66668701171875, + "epoch": 0.013113876622507695, + "grad_norm": 0.12949968874454498, + "kl": 0.0005730512784793973, + "learning_rate": 6.550802139037434e-07, + "loss": -0.0027, + "reward": -0.8359375, + "reward_std": 0.49609375, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.8359375, + "step": 98 + }, + { + "completion_length": 600.0, + "epoch": 0.013247691690084303, + "grad_norm": 0.08795811235904694, + "kl": 0.000673401344101876, + "learning_rate": 6.61764705882353e-07, + "loss": 0.0018, + "reward": -1.78125, + "reward_std": 0.5078125, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -1.78125, + "step": 99 + }, + { + "completion_length": 468.0, + "epoch": 0.013381506757660913, + "grad_norm": 0.1182761937379837, + "kl": 0.0005245240754447877, + "learning_rate": 6.684491978609627e-07, + "loss": 0.0088, + "reward": -1.234375, + "reward_std": 0.4296875, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -1.234375, + "step": 100 + }, + { + "completion_length": 429.8333435058594, + "epoch": 0.013515321825237521, + "grad_norm": 0.10523517429828644, + "kl": 0.0004888825351372361, + "learning_rate": 6.751336898395723e-07, + "loss": 0.0049, + "reward": -0.8671875, + "reward_std": 0.36328125, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.8671875, + "step": 101 + }, + { + "completion_length": 350.16668701171875, + "epoch": 0.01364913689281413, + "grad_norm": 0.1099957525730133, + "kl": 0.0004989251610822976, + "learning_rate": 6.818181818181818e-07, + "loss": -0.0032, + "reward": -0.6171875, + "reward_std": 0.2275390625, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.6171875, + "step": 102 + }, + { + "completion_length": 492.0, + "epoch": 0.01378295196039074, + "grad_norm": 0.096939817070961, + "kl": 0.0006083787302486598, + "learning_rate": 6.885026737967915e-07, + "loss": -0.0012, + "reward": -0.84765625, + "reward_std": 0.5078125, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.84765625, + "step": 103 + }, + { + "completion_length": 520.8333740234375, + "epoch": 0.013916767027967349, + "grad_norm": 0.09864147007465363, + "kl": 0.0004971123998984694, + "learning_rate": 6.951871657754011e-07, + "loss": 0.0051, + "reward": -1.1796875, + "reward_std": 0.341796875, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -1.1796875, + "step": 104 + }, + { + "completion_length": 528.0, + "epoch": 0.014050582095543959, + "grad_norm": 0.08159384876489639, + "kl": 0.0003954106941819191, + "learning_rate": 7.018716577540107e-07, + "loss": 0.013, + "reward": -1.265625, + "reward_std": 0.25390625, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -1.265625, + "step": 105 + }, + { + "completion_length": 357.5, + "epoch": 0.014184397163120567, + "grad_norm": 0.103823222219944, + "kl": 0.0004869327531196177, + "learning_rate": 7.085561497326204e-07, + "loss": 0.0014, + "reward": -0.859375, + "reward_std": 0.470703125, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.859375, + "step": 106 + }, + { + "completion_length": 475.5, + "epoch": 0.014318212230697177, + "grad_norm": 0.0782044380903244, + "kl": 0.0005046841688454151, + "learning_rate": 7.152406417112299e-07, + "loss": 0.0, + "reward": -1.1484375, + "reward_std": 0.4375, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -1.1484375, + "step": 107 + }, + { + "completion_length": 467.5, + "epoch": 0.014452027298273785, + "grad_norm": 0.09171518683433533, + "kl": 0.0005273159476928413, + "learning_rate": 7.219251336898397e-07, + "loss": -0.0036, + "reward": -0.921875, + "reward_std": 0.4140625, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.921875, + "step": 108 + }, + { + "completion_length": 461.16668701171875, + "epoch": 0.014585842365850395, + "grad_norm": 0.09841100871562958, + "kl": 0.0006487583741545677, + "learning_rate": 7.286096256684493e-07, + "loss": 0.0033, + "reward": -0.875, + "reward_std": 0.3671875, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.875, + "step": 109 + }, + { + "completion_length": 358.5, + "epoch": 0.014719657433427003, + "grad_norm": 0.13746988773345947, + "kl": 0.0004324812616687268, + "learning_rate": 7.352941176470589e-07, + "loss": 0.0025, + "reward": -0.68359375, + "reward_std": 0.279296875, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.68359375, + "step": 110 + }, + { + "completion_length": 559.5, + "epoch": 0.014853472501003613, + "grad_norm": 0.08793191611766815, + "kl": 0.0005514743970707059, + "learning_rate": 7.419786096256686e-07, + "loss": 0.0003, + "reward": -1.5546875, + "reward_std": 0.404296875, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -1.5546875, + "step": 111 + }, + { + "completion_length": 396.66668701171875, + "epoch": 0.014987287568580221, + "grad_norm": 0.10710439831018448, + "kl": 0.0004422089259605855, + "learning_rate": 7.486631016042781e-07, + "loss": -0.0019, + "reward": -0.87109375, + "reward_std": 0.265625, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.87109375, + "step": 112 + }, + { + "completion_length": 474.5, + "epoch": 0.015121102636156831, + "grad_norm": 0.09523480385541916, + "kl": 0.00043510389514267445, + "learning_rate": 7.553475935828877e-07, + "loss": -0.0029, + "reward": -0.94921875, + "reward_std": 0.376953125, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.94921875, + "step": 113 + }, + { + "completion_length": 385.8333435058594, + "epoch": 0.015254917703733441, + "grad_norm": 0.11645786464214325, + "kl": 0.0005059984978288412, + "learning_rate": 7.620320855614974e-07, + "loss": -0.001, + "reward": -0.8671875, + "reward_std": 0.4921875, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.8671875, + "step": 114 + }, + { + "completion_length": 380.16668701171875, + "epoch": 0.01538873277131005, + "grad_norm": 0.14121747016906738, + "kl": 0.00044106499990448356, + "learning_rate": 7.68716577540107e-07, + "loss": 0.0038, + "reward": -0.703125, + "reward_std": 0.41796875, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.703125, + "step": 115 + }, + { + "completion_length": 481.0, + "epoch": 0.01552254783888666, + "grad_norm": 0.10021474212408066, + "kl": 0.0005236791330389678, + "learning_rate": 7.754010695187167e-07, + "loss": -0.0135, + "reward": -1.09375, + "reward_std": 0.291015625, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -1.09375, + "step": 116 + }, + { + "completion_length": 662.0, + "epoch": 0.01565636290646327, + "grad_norm": 0.08310368657112122, + "kl": 0.0005542068392969668, + "learning_rate": 7.820855614973262e-07, + "loss": 0.0026, + "reward": -1.6015625, + "reward_std": 0.6328125, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -1.6015625, + "step": 117 + }, + { + "completion_length": 469.3333435058594, + "epoch": 0.015790177974039876, + "grad_norm": 0.08709719032049179, + "kl": 0.000453361077234149, + "learning_rate": 7.887700534759358e-07, + "loss": 0.0008, + "reward": -1.140625, + "reward_std": 0.484375, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -1.140625, + "step": 118 + }, + { + "completion_length": 397.16668701171875, + "epoch": 0.015923993041616485, + "grad_norm": 0.11706002801656723, + "kl": 0.0006737220101058483, + "learning_rate": 7.954545454545455e-07, + "loss": 0.0036, + "reward": -0.51953125, + "reward_std": 0.4453125, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.51953125, + "step": 119 + }, + { + "completion_length": 431.0, + "epoch": 0.016057808109193095, + "grad_norm": 0.12378671020269394, + "kl": 0.0005190541851334274, + "learning_rate": 8.021390374331551e-07, + "loss": -0.0026, + "reward": -1.0859375, + "reward_std": 0.5390625, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -1.0859375, + "step": 120 + }, + { + "completion_length": 585.8333740234375, + "epoch": 0.016191623176769705, + "grad_norm": 0.06980501115322113, + "kl": 0.0005238899611867964, + "learning_rate": 8.088235294117648e-07, + "loss": -0.0045, + "reward": -1.6796875, + "reward_std": 0.275390625, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -1.6796875, + "step": 121 + }, + { + "completion_length": 289.8333435058594, + "epoch": 0.01632543824434631, + "grad_norm": 0.15975068509578705, + "kl": 0.0007231835625134408, + "learning_rate": 8.155080213903745e-07, + "loss": -0.0021, + "reward": -0.482421875, + "reward_std": 0.318359375, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.482421875, + "step": 122 + }, + { + "completion_length": 507.3333435058594, + "epoch": 0.01645925331192292, + "grad_norm": 0.11071807146072388, + "kl": 0.0004386794753372669, + "learning_rate": 8.22192513368984e-07, + "loss": -0.0013, + "reward": -0.88671875, + "reward_std": 0.5703125, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.88671875, + "step": 123 + }, + { + "completion_length": 577.0, + "epoch": 0.01659306837949953, + "grad_norm": 0.06843210011720657, + "kl": 0.0004015905724372715, + "learning_rate": 8.288770053475937e-07, + "loss": -0.0058, + "reward": -1.2734375, + "reward_std": 0.349609375, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -1.2734375, + "step": 124 + }, + { + "completion_length": 485.3333435058594, + "epoch": 0.01672688344707614, + "grad_norm": 0.11638530343770981, + "kl": 0.000609593465924263, + "learning_rate": 8.355614973262033e-07, + "loss": -0.0047, + "reward": -1.15625, + "reward_std": 0.33984375, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -1.15625, + "step": 125 + }, + { + "completion_length": 449.8333435058594, + "epoch": 0.01686069851465275, + "grad_norm": 0.06978274881839752, + "kl": 0.00046619633212685585, + "learning_rate": 8.42245989304813e-07, + "loss": -0.0045, + "reward": -0.875, + "reward_std": 0.2734375, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.875, + "step": 126 + }, + { + "completion_length": 358.8333435058594, + "epoch": 0.016994513582229358, + "grad_norm": 0.12115911394357681, + "kl": 0.0006228546844795346, + "learning_rate": 8.489304812834226e-07, + "loss": 0.0012, + "reward": -0.80078125, + "reward_std": 0.2158203125, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.80078125, + "step": 127 + }, + { + "completion_length": 340.8333435058594, + "epoch": 0.017128328649805968, + "grad_norm": 0.1330835521221161, + "kl": 0.0006046565249562263, + "learning_rate": 8.556149732620321e-07, + "loss": 0.001, + "reward": -0.75, + "reward_std": 0.2421875, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.75, + "step": 128 + }, + { + "completion_length": 384.66668701171875, + "epoch": 0.017262143717382578, + "grad_norm": 0.1260133534669876, + "kl": 0.0006517590372823179, + "learning_rate": 8.622994652406418e-07, + "loss": -0.0002, + "reward": -0.875, + "reward_std": 0.71875, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.875, + "step": 129 + }, + { + "completion_length": 438.16668701171875, + "epoch": 0.017395958784959187, + "grad_norm": 0.10832860320806503, + "kl": 0.0005657231668010354, + "learning_rate": 8.689839572192514e-07, + "loss": 0.0007, + "reward": -1.0234375, + "reward_std": 0.578125, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -1.0234375, + "step": 130 + }, + { + "completion_length": 395.5, + "epoch": 0.017529773852535794, + "grad_norm": 0.14919129014015198, + "kl": 0.0006569415563717484, + "learning_rate": 8.756684491978611e-07, + "loss": -0.0018, + "reward": -0.94921875, + "reward_std": 0.546875, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.94921875, + "step": 131 + }, + { + "completion_length": 391.16668701171875, + "epoch": 0.017663588920112404, + "grad_norm": 0.14460250735282898, + "kl": 0.0006637731567025185, + "learning_rate": 8.823529411764707e-07, + "loss": -0.0013, + "reward": -0.94921875, + "reward_std": 0.25, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.94921875, + "step": 132 + }, + { + "completion_length": 441.3333435058594, + "epoch": 0.017797403987689014, + "grad_norm": 0.15532676875591278, + "kl": 0.0004714071692433208, + "learning_rate": 8.890374331550802e-07, + "loss": 0.0046, + "reward": -0.98828125, + "reward_std": 0.251953125, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.98828125, + "step": 133 + }, + { + "completion_length": 370.8333435058594, + "epoch": 0.017931219055265624, + "grad_norm": 0.11382456123828888, + "kl": 0.0007278465200215578, + "learning_rate": 8.957219251336899e-07, + "loss": -0.0034, + "reward": -0.74609375, + "reward_std": 0.314453125, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.74609375, + "step": 134 + }, + { + "completion_length": 400.5, + "epoch": 0.018065034122842234, + "grad_norm": 0.14282457530498505, + "kl": 0.00047517273924313486, + "learning_rate": 9.024064171122995e-07, + "loss": -0.006, + "reward": -0.8359375, + "reward_std": 0.322265625, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.8359375, + "step": 135 + }, + { + "completion_length": 556.3333740234375, + "epoch": 0.01819884919041884, + "grad_norm": 0.09024691581726074, + "kl": 0.00043525476939976215, + "learning_rate": 9.090909090909091e-07, + "loss": -0.0013, + "reward": -1.4296875, + "reward_std": 0.60546875, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -1.4296875, + "step": 136 + }, + { + "completion_length": 301.66668701171875, + "epoch": 0.01833266425799545, + "grad_norm": 0.1422368437051773, + "kl": 0.0007189570460468531, + "learning_rate": 9.157754010695189e-07, + "loss": -0.0065, + "reward": -0.5859375, + "reward_std": 0.28125, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.5859375, + "step": 137 + }, + { + "completion_length": 586.0, + "epoch": 0.01846647932557206, + "grad_norm": 0.07946749031543732, + "kl": 0.0004385068896226585, + "learning_rate": 9.224598930481284e-07, + "loss": 0.0013, + "reward": -1.421875, + "reward_std": 0.341796875, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -1.421875, + "step": 138 + }, + { + "completion_length": 377.0, + "epoch": 0.01860029439314867, + "grad_norm": 0.12276607006788254, + "kl": 0.0007007961976341903, + "learning_rate": 9.29144385026738e-07, + "loss": -0.0027, + "reward": -0.9609375, + "reward_std": 0.48046875, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.9609375, + "step": 139 + }, + { + "completion_length": 323.8333435058594, + "epoch": 0.018734109460725276, + "grad_norm": 0.11141712218523026, + "kl": 0.0004957327037118375, + "learning_rate": 9.358288770053477e-07, + "loss": -0.0023, + "reward": -0.7109375, + "reward_std": 0.458984375, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.7109375, + "step": 140 + }, + { + "completion_length": 451.3333435058594, + "epoch": 0.018867924528301886, + "grad_norm": 0.08641522377729416, + "kl": 0.0004216538218315691, + "learning_rate": 9.425133689839573e-07, + "loss": -0.0001, + "reward": -0.89453125, + "reward_std": 0.33984375, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.89453125, + "step": 141 + }, + { + "completion_length": 373.66668701171875, + "epoch": 0.019001739595878496, + "grad_norm": 0.11334878951311111, + "kl": 0.0005491083720698953, + "learning_rate": 9.49197860962567e-07, + "loss": -0.0032, + "reward": -0.89453125, + "reward_std": 0.40625, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.89453125, + "step": 142 + }, + { + "completion_length": 365.66668701171875, + "epoch": 0.019135554663455106, + "grad_norm": 0.1251085102558136, + "kl": 0.0006641787476837635, + "learning_rate": 9.558823529411764e-07, + "loss": -0.0044, + "reward": -0.83984375, + "reward_std": 0.302734375, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.83984375, + "step": 143 + }, + { + "completion_length": 474.66668701171875, + "epoch": 0.019269369731031716, + "grad_norm": 0.09395861625671387, + "kl": 0.0006498050643131137, + "learning_rate": 9.625668449197862e-07, + "loss": 0.0017, + "reward": -1.1328125, + "reward_std": 0.408203125, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -1.1328125, + "step": 144 + }, + { + "completion_length": 323.3333435058594, + "epoch": 0.019403184798608322, + "grad_norm": 0.12278474867343903, + "kl": 0.0006131879054009914, + "learning_rate": 9.692513368983958e-07, + "loss": 0.0004, + "reward": -0.7265625, + "reward_std": 0.275390625, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.7265625, + "step": 145 + }, + { + "completion_length": 377.3333435058594, + "epoch": 0.019536999866184932, + "grad_norm": 0.11732782423496246, + "kl": 0.0005958870751783252, + "learning_rate": 9.759358288770054e-07, + "loss": -0.0029, + "reward": -0.83203125, + "reward_std": 0.5703125, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.83203125, + "step": 146 + }, + { + "completion_length": 440.5, + "epoch": 0.019670814933761542, + "grad_norm": 0.1868724673986435, + "kl": 0.000681176024954766, + "learning_rate": 9.82620320855615e-07, + "loss": -0.0068, + "reward": -1.09375, + "reward_std": 0.53125, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -1.09375, + "step": 147 + }, + { + "completion_length": 340.8333435058594, + "epoch": 0.019804630001338152, + "grad_norm": 0.1497308760881424, + "kl": 0.0005646012723445892, + "learning_rate": 9.893048128342248e-07, + "loss": -0.0049, + "reward": -0.8125, + "reward_std": 0.396484375, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.8125, + "step": 148 + }, + { + "completion_length": 570.8333740234375, + "epoch": 0.01993844506891476, + "grad_norm": 0.09082633256912231, + "kl": 0.0005208106595091522, + "learning_rate": 9.959893048128342e-07, + "loss": -0.0026, + "reward": -1.1328125, + "reward_std": 0.671875, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -1.1328125, + "step": 149 + }, + { + "completion_length": 381.3333435058594, + "epoch": 0.02007226013649137, + "grad_norm": 0.11392635107040405, + "kl": 0.0005461536347866058, + "learning_rate": 1.0026737967914438e-06, + "loss": -0.0045, + "reward": -0.65234375, + "reward_std": 0.46484375, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.65234375, + "step": 150 + }, + { + "completion_length": 512.8333740234375, + "epoch": 0.020206075204067978, + "grad_norm": 0.10393022745847702, + "kl": 0.0005140831926837564, + "learning_rate": 1.0093582887700537e-06, + "loss": 0.0034, + "reward": -1.3125, + "reward_std": 0.470703125, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -1.3125, + "step": 151 + }, + { + "completion_length": 352.8333435058594, + "epoch": 0.020339890271644588, + "grad_norm": 0.14165768027305603, + "kl": 0.000577162834815681, + "learning_rate": 1.0160427807486633e-06, + "loss": 0.0018, + "reward": -0.890625, + "reward_std": 0.375, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.890625, + "step": 152 + }, + { + "completion_length": 379.3333435058594, + "epoch": 0.020473705339221198, + "grad_norm": 0.14219383895397186, + "kl": 0.0006267136195674539, + "learning_rate": 1.0227272727272729e-06, + "loss": -0.0019, + "reward": -0.703125, + "reward_std": 0.484375, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.703125, + "step": 153 + }, + { + "completion_length": 433.0, + "epoch": 0.020607520406797804, + "grad_norm": 0.09045641869306564, + "kl": 0.0003349175094626844, + "learning_rate": 1.0294117647058825e-06, + "loss": 0.0128, + "reward": -1.1171875, + "reward_std": 0.240234375, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -1.1171875, + "step": 154 + }, + { + "completion_length": 348.5, + "epoch": 0.020741335474374414, + "grad_norm": 0.1472688764333725, + "kl": 0.0006852279184386134, + "learning_rate": 1.036096256684492e-06, + "loss": -0.0036, + "reward": -0.71875, + "reward_std": 0.279296875, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.71875, + "step": 155 + }, + { + "completion_length": 388.0, + "epoch": 0.020875150541951024, + "grad_norm": 0.14087940752506256, + "kl": 0.0006020927103236318, + "learning_rate": 1.0427807486631017e-06, + "loss": -0.0057, + "reward": -0.64453125, + "reward_std": 0.32421875, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.64453125, + "step": 156 + }, + { + "completion_length": 327.66668701171875, + "epoch": 0.021008965609527634, + "grad_norm": 0.13045720756053925, + "kl": 0.0005312262801453471, + "learning_rate": 1.0494652406417113e-06, + "loss": -0.0019, + "reward": -0.53125, + "reward_std": 0.53125, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.53125, + "step": 157 + }, + { + "completion_length": 409.5, + "epoch": 0.02114278067710424, + "grad_norm": 0.12158454209566116, + "kl": 0.0006615255842916667, + "learning_rate": 1.056149732620321e-06, + "loss": -0.0067, + "reward": -0.80078125, + "reward_std": 0.2490234375, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.80078125, + "step": 158 + }, + { + "completion_length": 530.6666870117188, + "epoch": 0.02127659574468085, + "grad_norm": 0.1100451648235321, + "kl": 0.0006079694721847773, + "learning_rate": 1.0628342245989305e-06, + "loss": -0.0006, + "reward": -1.5, + "reward_std": 0.64453125, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -1.5, + "step": 159 + }, + { + "completion_length": 365.0, + "epoch": 0.02141041081225746, + "grad_norm": 0.11980035901069641, + "kl": 0.0005896420334465802, + "learning_rate": 1.0695187165775401e-06, + "loss": -0.0011, + "reward": -0.69921875, + "reward_std": 0.4375, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.69921875, + "step": 160 + }, + { + "completion_length": 312.5, + "epoch": 0.02154422587983407, + "grad_norm": 0.14624665677547455, + "kl": 0.00077395373955369, + "learning_rate": 1.0762032085561497e-06, + "loss": 0.0041, + "reward": -0.54296875, + "reward_std": 0.3046875, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.54296875, + "step": 161 + }, + { + "completion_length": 391.8333435058594, + "epoch": 0.021678040947410677, + "grad_norm": 0.1249147579073906, + "kl": 0.0007619769312441349, + "learning_rate": 1.0828877005347595e-06, + "loss": -0.0054, + "reward": -0.875, + "reward_std": 0.361328125, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.875, + "step": 162 + }, + { + "completion_length": 630.6666870117188, + "epoch": 0.021811856014987287, + "grad_norm": 0.09878282248973846, + "kl": 0.0005383545067161322, + "learning_rate": 1.0895721925133691e-06, + "loss": 0.0016, + "reward": -1.59375, + "reward_std": 1.1171875, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -1.59375, + "step": 163 + }, + { + "completion_length": 238.1666717529297, + "epoch": 0.021945671082563897, + "grad_norm": 0.16415703296661377, + "kl": 0.0007143677212297916, + "learning_rate": 1.0962566844919787e-06, + "loss": 0.0052, + "reward": -0.34375, + "reward_std": 0.1376953125, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.34375, + "step": 164 + }, + { + "completion_length": 424.16668701171875, + "epoch": 0.022079486150140507, + "grad_norm": 0.12024425715208054, + "kl": 0.0004885084345005453, + "learning_rate": 1.1029411764705884e-06, + "loss": -0.0011, + "reward": -1.0234375, + "reward_std": 0.287109375, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -1.0234375, + "step": 165 + }, + { + "completion_length": 345.0, + "epoch": 0.022213301217717116, + "grad_norm": 0.13134251534938812, + "kl": 0.0005485338624566793, + "learning_rate": 1.109625668449198e-06, + "loss": 0.0041, + "reward": -0.640625, + "reward_std": 0.296875, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.640625, + "step": 166 + }, + { + "completion_length": 449.0, + "epoch": 0.022347116285293723, + "grad_norm": 0.13914933800697327, + "kl": 0.0005790984723716974, + "learning_rate": 1.1163101604278076e-06, + "loss": -0.0024, + "reward": -0.90625, + "reward_std": 0.43359375, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.90625, + "step": 167 + }, + { + "completion_length": 456.66668701171875, + "epoch": 0.022480931352870333, + "grad_norm": 0.11662891507148743, + "kl": 0.000677458185236901, + "learning_rate": 1.1229946524064172e-06, + "loss": -0.0081, + "reward": -1.21875, + "reward_std": 0.431640625, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -1.21875, + "step": 168 + }, + { + "completion_length": 338.66668701171875, + "epoch": 0.022614746420446943, + "grad_norm": 0.14155802130699158, + "kl": 0.0005925593432039022, + "learning_rate": 1.1296791443850268e-06, + "loss": 0.0, + "reward": -0.71484375, + "reward_std": 0.40625, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.71484375, + "step": 169 + }, + { + "completion_length": 265.8333435058594, + "epoch": 0.022748561488023553, + "grad_norm": 0.16593119502067566, + "kl": 0.0005104307783767581, + "learning_rate": 1.1363636363636364e-06, + "loss": 0.0003, + "reward": -0.48046875, + "reward_std": 0.2353515625, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.48046875, + "step": 170 + }, + { + "completion_length": 461.3333435058594, + "epoch": 0.02288237655560016, + "grad_norm": 0.1283525973558426, + "kl": 0.0006034953985363245, + "learning_rate": 1.143048128342246e-06, + "loss": -0.0006, + "reward": -0.91015625, + "reward_std": 0.55859375, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.91015625, + "step": 171 + }, + { + "completion_length": 567.3333740234375, + "epoch": 0.02301619162317677, + "grad_norm": 0.09264618158340454, + "kl": 0.00039277609903365374, + "learning_rate": 1.1497326203208558e-06, + "loss": 0.0011, + "reward": -1.296875, + "reward_std": 0.423828125, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -1.296875, + "step": 172 + }, + { + "completion_length": 442.66668701171875, + "epoch": 0.02315000669075338, + "grad_norm": 0.06924661993980408, + "kl": 0.0002805929980240762, + "learning_rate": 1.1564171122994654e-06, + "loss": 0.0049, + "reward": -1.140625, + "reward_std": 0.130859375, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -1.140625, + "step": 173 + }, + { + "completion_length": 321.5, + "epoch": 0.02328382175832999, + "grad_norm": 0.15098147094249725, + "kl": 0.0005829234141856432, + "learning_rate": 1.163101604278075e-06, + "loss": 0.0117, + "reward": -0.58203125, + "reward_std": 0.1455078125, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.58203125, + "step": 174 + }, + { + "completion_length": 416.3333435058594, + "epoch": 0.0234176368259066, + "grad_norm": 0.11847102642059326, + "kl": 0.0006443657330237329, + "learning_rate": 1.1697860962566846e-06, + "loss": -0.0044, + "reward": -0.8671875, + "reward_std": 0.32421875, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.8671875, + "step": 175 + }, + { + "completion_length": 371.3333435058594, + "epoch": 0.023551451893483205, + "grad_norm": 0.1091599240899086, + "kl": 0.0004576949286274612, + "learning_rate": 1.1764705882352942e-06, + "loss": -0.0054, + "reward": -0.73828125, + "reward_std": 0.296875, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.73828125, + "step": 176 + }, + { + "completion_length": 438.16668701171875, + "epoch": 0.023685266961059815, + "grad_norm": 0.10089421272277832, + "kl": 0.0004992609028704464, + "learning_rate": 1.1831550802139038e-06, + "loss": 0.0029, + "reward": -0.55859375, + "reward_std": 0.482421875, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.55859375, + "step": 177 + }, + { + "completion_length": 509.16668701171875, + "epoch": 0.023819082028636425, + "grad_norm": 0.10792536288499832, + "kl": 0.000662465114146471, + "learning_rate": 1.1898395721925134e-06, + "loss": -0.0029, + "reward": -1.21875, + "reward_std": 0.294921875, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -1.21875, + "step": 178 + }, + { + "completion_length": 347.5, + "epoch": 0.023952897096213035, + "grad_norm": 0.2220248132944107, + "kl": 0.0006923056207597256, + "learning_rate": 1.1965240641711233e-06, + "loss": -0.0063, + "reward": -0.78125, + "reward_std": 0.29296875, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.78125, + "step": 179 + }, + { + "completion_length": 470.16668701171875, + "epoch": 0.02408671216378964, + "grad_norm": 0.09262672066688538, + "kl": 0.00041312514804303646, + "learning_rate": 1.2032085561497326e-06, + "loss": -0.0037, + "reward": -1.2109375, + "reward_std": 0.458984375, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -1.2109375, + "step": 180 + }, + { + "completion_length": 484.0, + "epoch": 0.02422052723136625, + "grad_norm": 0.11066435277462006, + "kl": 0.0005693985149264336, + "learning_rate": 1.2098930481283423e-06, + "loss": -0.0112, + "reward": -1.109375, + "reward_std": 0.326171875, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -1.109375, + "step": 181 + }, + { + "completion_length": 317.8333435058594, + "epoch": 0.02435434229894286, + "grad_norm": 0.1172327920794487, + "kl": 0.0005950028426013887, + "learning_rate": 1.216577540106952e-06, + "loss": -0.0006, + "reward": -0.6328125, + "reward_std": 0.34765625, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.6328125, + "step": 182 + }, + { + "completion_length": 339.3333435058594, + "epoch": 0.02448815736651947, + "grad_norm": 0.10278620570898056, + "kl": 0.00045496373786590993, + "learning_rate": 1.2232620320855617e-06, + "loss": -0.0003, + "reward": -0.671875, + "reward_std": 0.4140625, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.671875, + "step": 183 + }, + { + "completion_length": 497.3333435058594, + "epoch": 0.02462197243409608, + "grad_norm": 0.1089860200881958, + "kl": 0.0006303495611064136, + "learning_rate": 1.2299465240641713e-06, + "loss": -0.0019, + "reward": -1.359375, + "reward_std": 1.3984375, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -1.359375, + "step": 184 + }, + { + "completion_length": 304.66668701171875, + "epoch": 0.024755787501672687, + "grad_norm": 0.14699524641036987, + "kl": 0.0006107437657192349, + "learning_rate": 1.2366310160427809e-06, + "loss": -0.0026, + "reward": -0.51953125, + "reward_std": 0.3359375, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.51953125, + "step": 185 + }, + { + "completion_length": 360.5, + "epoch": 0.024889602569249297, + "grad_norm": 0.1235690489411354, + "kl": 0.000642502389382571, + "learning_rate": 1.2433155080213905e-06, + "loss": 0.0, + "reward": -0.890625, + "reward_std": 0.1982421875, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.890625, + "step": 186 + }, + { + "completion_length": 405.8333435058594, + "epoch": 0.025023417636825907, + "grad_norm": 0.13261531293392181, + "kl": 0.0007065389072522521, + "learning_rate": 1.25e-06, + "loss": 0.0017, + "reward": -0.828125, + "reward_std": 0.427734375, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.828125, + "step": 187 + }, + { + "completion_length": 355.8333435058594, + "epoch": 0.025157232704402517, + "grad_norm": 0.11836958676576614, + "kl": 0.0005762047949247062, + "learning_rate": 1.2566844919786097e-06, + "loss": 0.001, + "reward": -0.53515625, + "reward_std": 0.3046875, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.53515625, + "step": 188 + }, + { + "completion_length": 295.0, + "epoch": 0.025291047771979124, + "grad_norm": 0.15814770758152008, + "kl": 0.000565587542951107, + "learning_rate": 1.2633689839572193e-06, + "loss": 0.0071, + "reward": -0.53515625, + "reward_std": 0.17578125, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.53515625, + "step": 189 + }, + { + "completion_length": 317.8333435058594, + "epoch": 0.025424862839555733, + "grad_norm": 0.2327018529176712, + "kl": 0.0006943491753190756, + "learning_rate": 1.2700534759358291e-06, + "loss": -0.0019, + "reward": -0.640625, + "reward_std": 0.375, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.640625, + "step": 190 + }, + { + "completion_length": 345.66668701171875, + "epoch": 0.025558677907132343, + "grad_norm": 0.15127608180046082, + "kl": 0.0005449084565043449, + "learning_rate": 1.2767379679144387e-06, + "loss": 0.0039, + "reward": -0.6875, + "reward_std": 0.25, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.6875, + "step": 191 + }, + { + "completion_length": 372.0, + "epoch": 0.025692492974708953, + "grad_norm": 0.1675024777650833, + "kl": 0.0006789276376366615, + "learning_rate": 1.2834224598930483e-06, + "loss": 0.0001, + "reward": -0.9453125, + "reward_std": 0.486328125, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.9453125, + "step": 192 + }, + { + "completion_length": 260.8333435058594, + "epoch": 0.02582630804228556, + "grad_norm": 0.17227157950401306, + "kl": 0.0005113824736326933, + "learning_rate": 1.2901069518716577e-06, + "loss": -0.0011, + "reward": -0.41796875, + "reward_std": 0.443359375, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.41796875, + "step": 193 + }, + { + "completion_length": 458.8333435058594, + "epoch": 0.02596012310986217, + "grad_norm": 0.13124048709869385, + "kl": 0.000769376871176064, + "learning_rate": 1.2967914438502673e-06, + "loss": 0.0117, + "reward": -1.265625, + "reward_std": 0.33984375, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -1.265625, + "step": 194 + }, + { + "completion_length": 482.8333435058594, + "epoch": 0.02609393817743878, + "grad_norm": 0.11438746005296707, + "kl": 0.0005745739908888936, + "learning_rate": 1.303475935828877e-06, + "loss": 0.0003, + "reward": -1.2421875, + "reward_std": 0.625, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -1.2421875, + "step": 195 + }, + { + "completion_length": 337.3333435058594, + "epoch": 0.02622775324501539, + "grad_norm": 0.12187661230564117, + "kl": 0.0005417331121861935, + "learning_rate": 1.3101604278074868e-06, + "loss": 0.0001, + "reward": -0.63671875, + "reward_std": 0.58984375, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.63671875, + "step": 196 + }, + { + "completion_length": 304.8333435058594, + "epoch": 0.026361568312592, + "grad_norm": 0.18323078751564026, + "kl": 0.001144442823715508, + "learning_rate": 1.3168449197860964e-06, + "loss": 0.0054, + "reward": -0.66015625, + "reward_std": 0.2099609375, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.66015625, + "step": 197 + }, + { + "completion_length": 367.3333435058594, + "epoch": 0.026495383380168606, + "grad_norm": 0.13606765866279602, + "kl": 0.0006207119440659881, + "learning_rate": 1.323529411764706e-06, + "loss": 0.0093, + "reward": -0.8359375, + "reward_std": 0.2265625, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.8359375, + "step": 198 + }, + { + "completion_length": 367.8333435058594, + "epoch": 0.026629198447745216, + "grad_norm": 0.13173972070217133, + "kl": 0.0006991230184212327, + "learning_rate": 1.3302139037433156e-06, + "loss": -0.0032, + "reward": -0.66015625, + "reward_std": 0.482421875, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.66015625, + "step": 199 + }, + { + "completion_length": 516.8333740234375, + "epoch": 0.026763013515321826, + "grad_norm": 0.113725446164608, + "kl": 0.0006071855314075947, + "learning_rate": 1.3368983957219254e-06, + "loss": 0.0021, + "reward": -1.3046875, + "reward_std": 0.53125, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -1.3046875, + "step": 200 + } + ], + "logging_steps": 1, + "max_steps": 7473, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 50, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}