| { | |
| "best_global_step": null, | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 0.4086845466155811, | |
| "eval_steps": 500, | |
| "global_step": 400, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 286.25, | |
| "epoch": 0.0010217113665389529, | |
| "grad_norm": 6.432072639465332, | |
| "kl": 0.00101470947265625, | |
| "learning_rate": 9.989775051124745e-07, | |
| "loss": 0.0, | |
| "reward": 1.5, | |
| "reward_std": 0.6933577060699463, | |
| "rewards/format_reward": 0.84375, | |
| "rewards/score_reward": 0.65625, | |
| "step": 1 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 298.6875, | |
| "epoch": 0.0020434227330779057, | |
| "grad_norm": 41.437564849853516, | |
| "kl": 0.0011653900146484375, | |
| "learning_rate": 9.97955010224949e-07, | |
| "loss": 0.0, | |
| "reward": 1.375, | |
| "reward_std": 0.6811521649360657, | |
| "rewards/format_reward": 0.875, | |
| "rewards/score_reward": 0.5, | |
| "step": 2 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 297.59375, | |
| "epoch": 0.0030651340996168583, | |
| "grad_norm": 5.488102436065674, | |
| "kl": 0.0012264251708984375, | |
| "learning_rate": 9.969325153374232e-07, | |
| "loss": 0.0, | |
| "reward": 1.46875, | |
| "reward_std": 0.6645826250314713, | |
| "rewards/format_reward": 0.875, | |
| "rewards/score_reward": 0.59375, | |
| "step": 3 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 320.4375, | |
| "epoch": 0.004086845466155811, | |
| "grad_norm": 9.45959186553955, | |
| "kl": 0.002105712890625, | |
| "learning_rate": 9.959100204498977e-07, | |
| "loss": 0.0, | |
| "reward": 1.3125, | |
| "reward_std": 0.6546904295682907, | |
| "rewards/format_reward": 0.8125, | |
| "rewards/score_reward": 0.5, | |
| "step": 4 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 302.1875, | |
| "epoch": 0.005108556832694764, | |
| "grad_norm": 33.590248107910156, | |
| "kl": 0.00212860107421875, | |
| "learning_rate": 9.94887525562372e-07, | |
| "loss": 0.0, | |
| "reward": 1.46875, | |
| "reward_std": 0.573425218462944, | |
| "rewards/format_reward": 0.96875, | |
| "rewards/score_reward": 0.5, | |
| "step": 5 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 333.5, | |
| "epoch": 0.006130268199233717, | |
| "grad_norm": 6.729313373565674, | |
| "kl": 0.00249481201171875, | |
| "learning_rate": 9.938650306748465e-07, | |
| "loss": 0.0, | |
| "reward": 1.5625, | |
| "reward_std": 0.5081327110528946, | |
| "rewards/format_reward": 0.96875, | |
| "rewards/score_reward": 0.59375, | |
| "step": 6 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 302.40625, | |
| "epoch": 0.007151979565772669, | |
| "grad_norm": 18.135210037231445, | |
| "kl": 0.0063018798828125, | |
| "learning_rate": 9.92842535787321e-07, | |
| "loss": 0.0, | |
| "reward": 1.59375, | |
| "reward_std": 0.5145231708884239, | |
| "rewards/format_reward": 0.96875, | |
| "rewards/score_reward": 0.625, | |
| "step": 7 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 328.875, | |
| "epoch": 0.008173690932311623, | |
| "grad_norm": 9.06258487701416, | |
| "kl": 0.00640869140625, | |
| "learning_rate": 9.918200408997955e-07, | |
| "loss": 0.0, | |
| "reward": 1.5, | |
| "reward_std": 0.5597654432058334, | |
| "rewards/format_reward": 0.90625, | |
| "rewards/score_reward": 0.59375, | |
| "step": 8 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 325.375, | |
| "epoch": 0.009195402298850575, | |
| "grad_norm": 6.420322895050049, | |
| "kl": 0.00702667236328125, | |
| "learning_rate": 9.9079754601227e-07, | |
| "loss": 0.0, | |
| "reward": 1.6875, | |
| "reward_std": 0.49022960662841797, | |
| "rewards/format_reward": 0.96875, | |
| "rewards/score_reward": 0.71875, | |
| "step": 9 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 305.90625, | |
| "epoch": 0.010217113665389528, | |
| "grad_norm": 18.078149795532227, | |
| "kl": 0.011444091796875, | |
| "learning_rate": 9.897750511247443e-07, | |
| "loss": 0.0, | |
| "reward": 1.53125, | |
| "reward_std": 0.4355708882212639, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.53125, | |
| "step": 10 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 348.21875, | |
| "epoch": 0.01123882503192848, | |
| "grad_norm": 7.699387550354004, | |
| "kl": 0.01019287109375, | |
| "learning_rate": 9.887525562372188e-07, | |
| "loss": 0.0, | |
| "reward": 1.46875, | |
| "reward_std": 0.6350298076868057, | |
| "rewards/format_reward": 0.875, | |
| "rewards/score_reward": 0.59375, | |
| "step": 11 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 345.40625, | |
| "epoch": 0.012260536398467433, | |
| "grad_norm": 5.215458393096924, | |
| "kl": 0.0137939453125, | |
| "learning_rate": 9.87730061349693e-07, | |
| "loss": 0.0, | |
| "reward": 1.53125, | |
| "reward_std": 0.5217924863100052, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.53125, | |
| "step": 12 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 343.0, | |
| "epoch": 0.013282247765006385, | |
| "grad_norm": 28.669469833374023, | |
| "kl": 0.016448974609375, | |
| "learning_rate": 9.867075664621678e-07, | |
| "loss": 0.0, | |
| "reward": 1.53125, | |
| "reward_std": 0.5123760402202606, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.53125, | |
| "step": 13 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 353.78125, | |
| "epoch": 0.014303959131545339, | |
| "grad_norm": 8.11595630645752, | |
| "kl": 0.0168609619140625, | |
| "learning_rate": 9.85685071574642e-07, | |
| "loss": 0.0, | |
| "reward": 1.375, | |
| "reward_std": 0.6134846806526184, | |
| "rewards/format_reward": 0.875, | |
| "rewards/score_reward": 0.5, | |
| "step": 14 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 354.0625, | |
| "epoch": 0.01532567049808429, | |
| "grad_norm": 6.02888298034668, | |
| "kl": 0.0177764892578125, | |
| "learning_rate": 9.846625766871166e-07, | |
| "loss": 0.0, | |
| "reward": 1.5, | |
| "reward_std": 0.5813874304294586, | |
| "rewards/format_reward": 0.96875, | |
| "rewards/score_reward": 0.53125, | |
| "step": 15 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 343.84375, | |
| "epoch": 0.016347381864623246, | |
| "grad_norm": 6.745128631591797, | |
| "kl": 0.0180816650390625, | |
| "learning_rate": 9.836400817995909e-07, | |
| "loss": 0.0, | |
| "reward": 1.59375, | |
| "reward_std": 0.494472935795784, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.59375, | |
| "step": 16 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 317.65625, | |
| "epoch": 0.017369093231162196, | |
| "grad_norm": 12.856358528137207, | |
| "kl": 0.022674560546875, | |
| "learning_rate": 9.826175869120654e-07, | |
| "loss": 0.0, | |
| "reward": 1.59375, | |
| "reward_std": 0.5217924863100052, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.59375, | |
| "step": 17 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 340.40625, | |
| "epoch": 0.01839080459770115, | |
| "grad_norm": 5.885622024536133, | |
| "kl": 0.01983642578125, | |
| "learning_rate": 9.815950920245399e-07, | |
| "loss": 0.0, | |
| "reward": 1.5, | |
| "reward_std": 0.5081327110528946, | |
| "rewards/format_reward": 0.96875, | |
| "rewards/score_reward": 0.53125, | |
| "step": 18 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 331.8125, | |
| "epoch": 0.019412515964240103, | |
| "grad_norm": 9.882148742675781, | |
| "kl": 0.025909423828125, | |
| "learning_rate": 9.805725971370141e-07, | |
| "loss": 0.0, | |
| "reward": 1.46875, | |
| "reward_std": 0.6674923896789551, | |
| "rewards/format_reward": 0.90625, | |
| "rewards/score_reward": 0.5625, | |
| "step": 19 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 325.96875, | |
| "epoch": 0.020434227330779056, | |
| "grad_norm": 12.713711738586426, | |
| "kl": 0.026275634765625, | |
| "learning_rate": 9.795501022494888e-07, | |
| "loss": 0.0, | |
| "reward": 1.46875, | |
| "reward_std": 0.5145231708884239, | |
| "rewards/format_reward": 0.96875, | |
| "rewards/score_reward": 0.5, | |
| "step": 20 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 344.15625, | |
| "epoch": 0.021455938697318006, | |
| "grad_norm": 7.077400207519531, | |
| "kl": 0.028045654296875, | |
| "learning_rate": 9.785276073619631e-07, | |
| "loss": 0.0, | |
| "reward": 1.5625, | |
| "reward_std": 0.5512787848711014, | |
| "rewards/format_reward": 0.96875, | |
| "rewards/score_reward": 0.59375, | |
| "step": 21 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 342.46875, | |
| "epoch": 0.02247765006385696, | |
| "grad_norm": 5.0989837646484375, | |
| "kl": 0.027130126953125, | |
| "learning_rate": 9.775051124744376e-07, | |
| "loss": 0.0, | |
| "reward": 1.5, | |
| "reward_std": 0.5483793616294861, | |
| "rewards/format_reward": 0.96875, | |
| "rewards/score_reward": 0.53125, | |
| "step": 22 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 339.3125, | |
| "epoch": 0.023499361430395913, | |
| "grad_norm": 27.992931365966797, | |
| "kl": 0.046783447265625, | |
| "learning_rate": 9.76482617586912e-07, | |
| "loss": 0.0, | |
| "reward": 1.5625, | |
| "reward_std": 0.5260358154773712, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.5625, | |
| "step": 23 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 344.90625, | |
| "epoch": 0.024521072796934867, | |
| "grad_norm": 16.492258071899414, | |
| "kl": 0.02996826171875, | |
| "learning_rate": 9.754601226993864e-07, | |
| "loss": 0.0, | |
| "reward": 1.4375, | |
| "reward_std": 0.5166193693876266, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.4375, | |
| "step": 24 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 339.53125, | |
| "epoch": 0.02554278416347382, | |
| "grad_norm": 10.976400375366211, | |
| "kl": 0.05401611328125, | |
| "learning_rate": 9.74437627811861e-07, | |
| "loss": 0.0001, | |
| "reward": 1.59375, | |
| "reward_std": 0.3787454217672348, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.59375, | |
| "step": 25 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 332.25, | |
| "epoch": 0.02656449553001277, | |
| "grad_norm": 4.896389961242676, | |
| "kl": 0.0478515625, | |
| "learning_rate": 9.734151329243352e-07, | |
| "loss": 0.0, | |
| "reward": 1.6875, | |
| "reward_std": 0.49022960662841797, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.6875, | |
| "step": 26 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 332.9375, | |
| "epoch": 0.027586206896551724, | |
| "grad_norm": 23.714187622070312, | |
| "kl": 0.04449462890625, | |
| "learning_rate": 9.7239263803681e-07, | |
| "loss": 0.0, | |
| "reward": 1.46875, | |
| "reward_std": 0.47137709707021713, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.46875, | |
| "step": 27 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 342.15625, | |
| "epoch": 0.028607918263090677, | |
| "grad_norm": 18.24022102355957, | |
| "kl": 0.032135009765625, | |
| "learning_rate": 9.713701431492842e-07, | |
| "loss": 0.0, | |
| "reward": 1.625, | |
| "reward_std": 0.4355512708425522, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.625, | |
| "step": 28 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 317.96875, | |
| "epoch": 0.02962962962962963, | |
| "grad_norm": 9.736083984375, | |
| "kl": 0.035491943359375, | |
| "learning_rate": 9.703476482617587e-07, | |
| "loss": 0.0, | |
| "reward": 1.5625, | |
| "reward_std": 0.5647513717412949, | |
| "rewards/format_reward": 0.96875, | |
| "rewards/score_reward": 0.59375, | |
| "step": 29 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 332.75, | |
| "epoch": 0.03065134099616858, | |
| "grad_norm": 4.943301200866699, | |
| "kl": 0.03399658203125, | |
| "learning_rate": 9.69325153374233e-07, | |
| "loss": 0.0, | |
| "reward": 1.625, | |
| "reward_std": 0.563484326004982, | |
| "rewards/format_reward": 0.96875, | |
| "rewards/score_reward": 0.65625, | |
| "step": 30 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 319.96875, | |
| "epoch": 0.03167305236270754, | |
| "grad_norm": 17.685562133789062, | |
| "kl": 0.0460205078125, | |
| "learning_rate": 9.683026584867075e-07, | |
| "loss": 0.0, | |
| "reward": 1.5, | |
| "reward_std": 0.49022960662841797, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.5, | |
| "step": 31 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 322.9375, | |
| "epoch": 0.03269476372924649, | |
| "grad_norm": 15.011503219604492, | |
| "kl": 0.04730224609375, | |
| "learning_rate": 9.67280163599182e-07, | |
| "loss": 0.0, | |
| "reward": 1.5625, | |
| "reward_std": 0.5260358154773712, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.5625, | |
| "step": 32 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 325.75, | |
| "epoch": 0.03371647509578544, | |
| "grad_norm": 6.8639020919799805, | |
| "kl": 0.04168701171875, | |
| "learning_rate": 9.662576687116565e-07, | |
| "loss": 0.0, | |
| "reward": 1.5625, | |
| "reward_std": 0.5081327110528946, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.5625, | |
| "step": 33 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 317.90625, | |
| "epoch": 0.03473818646232439, | |
| "grad_norm": 6.531950950622559, | |
| "kl": 0.039306640625, | |
| "learning_rate": 9.65235173824131e-07, | |
| "loss": 0.0, | |
| "reward": 1.65625, | |
| "reward_std": 0.494472935795784, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.65625, | |
| "step": 34 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 300.0, | |
| "epoch": 0.035759897828863345, | |
| "grad_norm": 5.936326026916504, | |
| "kl": 0.04559326171875, | |
| "learning_rate": 9.642126789366053e-07, | |
| "loss": 0.0, | |
| "reward": 1.5625, | |
| "reward_std": 0.5260358154773712, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.5625, | |
| "step": 35 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 307.875, | |
| "epoch": 0.0367816091954023, | |
| "grad_norm": 4.012608528137207, | |
| "kl": 0.05096435546875, | |
| "learning_rate": 9.631901840490798e-07, | |
| "loss": 0.0001, | |
| "reward": 1.75, | |
| "reward_std": 0.4261348247528076, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.75, | |
| "step": 36 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 294.40625, | |
| "epoch": 0.03780332056194125, | |
| "grad_norm": 8.631688117980957, | |
| "kl": 0.0521240234375, | |
| "learning_rate": 9.62167689161554e-07, | |
| "loss": 0.0001, | |
| "reward": 1.53125, | |
| "reward_std": 0.5038893818855286, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.53125, | |
| "step": 37 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 320.65625, | |
| "epoch": 0.038825031928480205, | |
| "grad_norm": 12.322967529296875, | |
| "kl": 0.046630859375, | |
| "learning_rate": 9.611451942740285e-07, | |
| "loss": 0.0, | |
| "reward": 1.5625, | |
| "reward_std": 0.4765502139925957, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.5625, | |
| "step": 38 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 294.75, | |
| "epoch": 0.03984674329501916, | |
| "grad_norm": 7.365263938903809, | |
| "kl": 0.054931640625, | |
| "learning_rate": 9.60122699386503e-07, | |
| "loss": 0.0001, | |
| "reward": 1.53125, | |
| "reward_std": 0.5217924863100052, | |
| "rewards/format_reward": 0.96875, | |
| "rewards/score_reward": 0.5625, | |
| "step": 39 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 305.0625, | |
| "epoch": 0.04086845466155811, | |
| "grad_norm": 7.387457847595215, | |
| "kl": 0.067626953125, | |
| "learning_rate": 9.591002044989775e-07, | |
| "loss": 0.0001, | |
| "reward": 1.6875, | |
| "reward_std": 0.3924051970243454, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.6875, | |
| "step": 40 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 296.40625, | |
| "epoch": 0.041890166028097066, | |
| "grad_norm": 15.78101921081543, | |
| "kl": 0.05401611328125, | |
| "learning_rate": 9.580777096114518e-07, | |
| "loss": 0.0001, | |
| "reward": 1.53125, | |
| "reward_std": 0.5302791446447372, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.53125, | |
| "step": 41 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 285.1875, | |
| "epoch": 0.04291187739463601, | |
| "grad_norm": 14.837868690490723, | |
| "kl": 0.05877685546875, | |
| "learning_rate": 9.570552147239263e-07, | |
| "loss": 0.0001, | |
| "reward": 1.65625, | |
| "reward_std": 0.5038893818855286, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.65625, | |
| "step": 42 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 288.34375, | |
| "epoch": 0.043933588761174966, | |
| "grad_norm": 16.0447940826416, | |
| "kl": 0.0618896484375, | |
| "learning_rate": 9.560327198364008e-07, | |
| "loss": 0.0001, | |
| "reward": 1.71875, | |
| "reward_std": 0.3471629247069359, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.71875, | |
| "step": 43 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 290.75, | |
| "epoch": 0.04495530012771392, | |
| "grad_norm": 5.146910667419434, | |
| "kl": 0.0574951171875, | |
| "learning_rate": 9.55010224948875e-07, | |
| "loss": 0.0001, | |
| "reward": 1.75, | |
| "reward_std": 0.3650856465101242, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.75, | |
| "step": 44 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 304.0625, | |
| "epoch": 0.04597701149425287, | |
| "grad_norm": 7.404935359954834, | |
| "kl": 0.0672607421875, | |
| "learning_rate": 9.539877300613496e-07, | |
| "loss": 0.0001, | |
| "reward": 1.6875, | |
| "reward_std": 0.49022960662841797, | |
| "rewards/format_reward": 0.96875, | |
| "rewards/score_reward": 0.71875, | |
| "step": 45 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 277.3125, | |
| "epoch": 0.046998722860791826, | |
| "grad_norm": 8.61013126373291, | |
| "kl": 0.0732421875, | |
| "learning_rate": 9.529652351738241e-07, | |
| "loss": 0.0001, | |
| "reward": 1.5625, | |
| "reward_std": 0.5260358154773712, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.5625, | |
| "step": 46 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 286.15625, | |
| "epoch": 0.04802043422733078, | |
| "grad_norm": 4.638645648956299, | |
| "kl": 0.065673828125, | |
| "learning_rate": 9.519427402862985e-07, | |
| "loss": 0.0001, | |
| "reward": 1.78125, | |
| "reward_std": 0.3471629247069359, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.78125, | |
| "step": 47 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 279.65625, | |
| "epoch": 0.04904214559386973, | |
| "grad_norm": 9.735628128051758, | |
| "kl": 0.060302734375, | |
| "learning_rate": 9.509202453987729e-07, | |
| "loss": 0.0001, | |
| "reward": 1.65625, | |
| "reward_std": 0.4807935431599617, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.65625, | |
| "step": 48 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 291.875, | |
| "epoch": 0.05006385696040869, | |
| "grad_norm": 5.960241794586182, | |
| "kl": 0.0555419921875, | |
| "learning_rate": 9.498977505112475e-07, | |
| "loss": 0.0001, | |
| "reward": 1.625, | |
| "reward_std": 0.5175491571426392, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.625, | |
| "step": 49 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 279.90625, | |
| "epoch": 0.05108556832694764, | |
| "grad_norm": 5.679952144622803, | |
| "kl": 0.06298828125, | |
| "learning_rate": 9.488752556237219e-07, | |
| "loss": 0.0001, | |
| "reward": 1.625, | |
| "reward_std": 0.49022960662841797, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.625, | |
| "step": 50 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 299.90625, | |
| "epoch": 0.05210727969348659, | |
| "grad_norm": 6.777224540710449, | |
| "kl": 0.06329345703125, | |
| "learning_rate": 9.478527607361963e-07, | |
| "loss": 0.0001, | |
| "reward": 1.65625, | |
| "reward_std": 0.4807935431599617, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.65625, | |
| "step": 51 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 266.84375, | |
| "epoch": 0.05312899106002554, | |
| "grad_norm": 8.792196273803711, | |
| "kl": 0.071533203125, | |
| "learning_rate": 9.468302658486708e-07, | |
| "loss": 0.0001, | |
| "reward": 1.71875, | |
| "reward_std": 0.4765698313713074, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.71875, | |
| "step": 52 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 284.59375, | |
| "epoch": 0.054150702426564494, | |
| "grad_norm": 5.025711536407471, | |
| "kl": 0.06463623046875, | |
| "learning_rate": 9.458077709611452e-07, | |
| "loss": 0.0001, | |
| "reward": 1.65625, | |
| "reward_std": 0.4807935431599617, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.65625, | |
| "step": 53 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 277.75, | |
| "epoch": 0.05517241379310345, | |
| "grad_norm": 10.143484115600586, | |
| "kl": 0.06658935546875, | |
| "learning_rate": 9.447852760736195e-07, | |
| "loss": 0.0001, | |
| "reward": 1.75, | |
| "reward_std": 0.4261348247528076, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.75, | |
| "step": 54 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 285.875, | |
| "epoch": 0.0561941251596424, | |
| "grad_norm": 5.966069221496582, | |
| "kl": 0.08685302734375, | |
| "learning_rate": 9.437627811860939e-07, | |
| "loss": 0.0001, | |
| "reward": 1.59375, | |
| "reward_std": 0.5217924863100052, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.59375, | |
| "step": 55 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 305.34375, | |
| "epoch": 0.057215836526181355, | |
| "grad_norm": 45.916839599609375, | |
| "kl": 0.064208984375, | |
| "learning_rate": 9.427402862985685e-07, | |
| "loss": 0.0001, | |
| "reward": 1.71875, | |
| "reward_std": 0.4628904387354851, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.71875, | |
| "step": 56 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 283.3125, | |
| "epoch": 0.05823754789272031, | |
| "grad_norm": 5.061546802520752, | |
| "kl": 0.06317138671875, | |
| "learning_rate": 9.417177914110429e-07, | |
| "loss": 0.0001, | |
| "reward": 1.65625, | |
| "reward_std": 0.5038893818855286, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.65625, | |
| "step": 57 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 287.34375, | |
| "epoch": 0.05925925925925926, | |
| "grad_norm": 6.127585411071777, | |
| "kl": 0.05999755859375, | |
| "learning_rate": 9.406952965235173e-07, | |
| "loss": 0.0001, | |
| "reward": 1.625, | |
| "reward_std": 0.5081327110528946, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.625, | |
| "step": 58 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 286.71875, | |
| "epoch": 0.060280970625798215, | |
| "grad_norm": 8.098198890686035, | |
| "kl": 0.0767822265625, | |
| "learning_rate": 9.396728016359918e-07, | |
| "loss": 0.0001, | |
| "reward": 1.6875, | |
| "reward_std": 0.4808131605386734, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.6875, | |
| "step": 59 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 284.21875, | |
| "epoch": 0.06130268199233716, | |
| "grad_norm": 10.709068298339844, | |
| "kl": 0.0743408203125, | |
| "learning_rate": 9.386503067484662e-07, | |
| "loss": 0.0001, | |
| "reward": 1.53125, | |
| "reward_std": 0.5217924863100052, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.53125, | |
| "step": 60 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 289.375, | |
| "epoch": 0.062324393358876115, | |
| "grad_norm": 20.22757339477539, | |
| "kl": 0.0645751953125, | |
| "learning_rate": 9.376278118609406e-07, | |
| "loss": 0.0001, | |
| "reward": 1.5625, | |
| "reward_std": 0.5175491571426392, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.5625, | |
| "step": 61 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 293.03125, | |
| "epoch": 0.06334610472541508, | |
| "grad_norm": 8.406437873840332, | |
| "kl": 0.0809326171875, | |
| "learning_rate": 9.36605316973415e-07, | |
| "loss": 0.0001, | |
| "reward": 1.6875, | |
| "reward_std": 0.4355512708425522, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.6875, | |
| "step": 62 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 285.125, | |
| "epoch": 0.06436781609195402, | |
| "grad_norm": 4.2449235916137695, | |
| "kl": 0.06787109375, | |
| "learning_rate": 9.355828220858896e-07, | |
| "loss": 0.0001, | |
| "reward": 1.78125, | |
| "reward_std": 0.3471629247069359, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.78125, | |
| "step": 63 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 289.1875, | |
| "epoch": 0.06538952745849298, | |
| "grad_norm": 13.196094512939453, | |
| "kl": 0.0770263671875, | |
| "learning_rate": 9.34560327198364e-07, | |
| "loss": 0.0001, | |
| "reward": 1.625, | |
| "reward_std": 0.5175491571426392, | |
| "rewards/format_reward": 0.96875, | |
| "rewards/score_reward": 0.65625, | |
| "step": 64 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 287.1875, | |
| "epoch": 0.06641123882503193, | |
| "grad_norm": 7.091306209564209, | |
| "kl": 0.081787109375, | |
| "learning_rate": 9.335378323108384e-07, | |
| "loss": 0.0001, | |
| "reward": 1.625, | |
| "reward_std": 0.4765502139925957, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.625, | |
| "step": 65 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 285.15625, | |
| "epoch": 0.06743295019157088, | |
| "grad_norm": 8.960472106933594, | |
| "kl": 0.0849609375, | |
| "learning_rate": 9.325153374233128e-07, | |
| "loss": 0.0001, | |
| "reward": 1.6875, | |
| "reward_std": 0.4671337679028511, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.6875, | |
| "step": 66 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 296.3125, | |
| "epoch": 0.06845466155810984, | |
| "grad_norm": 8.031961441040039, | |
| "kl": 0.0882568359375, | |
| "learning_rate": 9.314928425357873e-07, | |
| "loss": 0.0001, | |
| "reward": 1.8125, | |
| "reward_std": 0.408231720328331, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.8125, | |
| "step": 67 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 286.875, | |
| "epoch": 0.06947637292464878, | |
| "grad_norm": 3.796295642852783, | |
| "kl": 0.0850830078125, | |
| "learning_rate": 9.304703476482617e-07, | |
| "loss": 0.0001, | |
| "reward": 1.8125, | |
| "reward_std": 0.2587745785713196, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.8125, | |
| "step": 68 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 307.3125, | |
| "epoch": 0.07049808429118774, | |
| "grad_norm": 6.970209121704102, | |
| "kl": 0.0810546875, | |
| "learning_rate": 9.294478527607362e-07, | |
| "loss": 0.0001, | |
| "reward": 1.71875, | |
| "reward_std": 0.4218914955854416, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.71875, | |
| "step": 69 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 291.34375, | |
| "epoch": 0.07151979565772669, | |
| "grad_norm": 4.853825092315674, | |
| "kl": 0.07666015625, | |
| "learning_rate": 9.284253578732107e-07, | |
| "loss": 0.0001, | |
| "reward": 1.78125, | |
| "reward_std": 0.4218914955854416, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.78125, | |
| "step": 70 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 285.375, | |
| "epoch": 0.07254150702426565, | |
| "grad_norm": 18.912513732910156, | |
| "kl": 0.0770263671875, | |
| "learning_rate": 9.27402862985685e-07, | |
| "loss": 0.0001, | |
| "reward": 1.78125, | |
| "reward_std": 0.3608423173427582, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.78125, | |
| "step": 71 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 270.09375, | |
| "epoch": 0.0735632183908046, | |
| "grad_norm": 11.021360397338867, | |
| "kl": 0.0799560546875, | |
| "learning_rate": 9.263803680981594e-07, | |
| "loss": 0.0001, | |
| "reward": 1.5625, | |
| "reward_std": 0.5468482673168182, | |
| "rewards/format_reward": 0.96875, | |
| "rewards/score_reward": 0.59375, | |
| "step": 72 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 308.46875, | |
| "epoch": 0.07458492975734356, | |
| "grad_norm": 11.074939727783203, | |
| "kl": 0.0819091796875, | |
| "learning_rate": 9.253578732106338e-07, | |
| "loss": 0.0001, | |
| "reward": 1.5625, | |
| "reward_std": 0.38298875093460083, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.5625, | |
| "step": 73 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 289.21875, | |
| "epoch": 0.0756066411238825, | |
| "grad_norm": 10.272281646728516, | |
| "kl": 0.0859375, | |
| "learning_rate": 9.243353783231083e-07, | |
| "loss": 0.0001, | |
| "reward": 1.6875, | |
| "reward_std": 0.4671337679028511, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.6875, | |
| "step": 74 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 303.8125, | |
| "epoch": 0.07662835249042145, | |
| "grad_norm": 5.375434875488281, | |
| "kl": 0.07275390625, | |
| "learning_rate": 9.233128834355828e-07, | |
| "loss": 0.0001, | |
| "reward": 1.8125, | |
| "reward_std": 0.2587745785713196, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.8125, | |
| "step": 75 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 296.71875, | |
| "epoch": 0.07765006385696041, | |
| "grad_norm": 10.003912925720215, | |
| "kl": 0.079833984375, | |
| "learning_rate": 9.222903885480572e-07, | |
| "loss": 0.0001, | |
| "reward": 1.5625, | |
| "reward_std": 0.49022960662841797, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.5625, | |
| "step": 76 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 297.53125, | |
| "epoch": 0.07867177522349936, | |
| "grad_norm": 10.563873291015625, | |
| "kl": 0.1048583984375, | |
| "learning_rate": 9.212678936605317e-07, | |
| "loss": 0.0001, | |
| "reward": 1.59375, | |
| "reward_std": 0.5217924863100052, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.59375, | |
| "step": 77 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 289.59375, | |
| "epoch": 0.07969348659003832, | |
| "grad_norm": 5.303869247436523, | |
| "kl": 0.0816650390625, | |
| "learning_rate": 9.202453987730061e-07, | |
| "loss": 0.0001, | |
| "reward": 1.84375, | |
| "reward_std": 0.24511480331420898, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.84375, | |
| "step": 78 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 302.78125, | |
| "epoch": 0.08071519795657726, | |
| "grad_norm": 200.08316040039062, | |
| "kl": 0.0753173828125, | |
| "learning_rate": 9.192229038854805e-07, | |
| "loss": 0.0001, | |
| "reward": 1.6875, | |
| "reward_std": 0.4671337679028511, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.6875, | |
| "step": 79 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 275.375, | |
| "epoch": 0.08173690932311622, | |
| "grad_norm": 10.642571449279785, | |
| "kl": 0.0750732421875, | |
| "learning_rate": 9.182004089979549e-07, | |
| "loss": 0.0001, | |
| "reward": 1.5625, | |
| "reward_std": 0.5081327110528946, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.5625, | |
| "step": 80 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 289.28125, | |
| "epoch": 0.08275862068965517, | |
| "grad_norm": 8.558609008789062, | |
| "kl": 0.074462890625, | |
| "learning_rate": 9.171779141104294e-07, | |
| "loss": 0.0001, | |
| "reward": 1.71875, | |
| "reward_std": 0.4397946000099182, | |
| "rewards/format_reward": 0.96875, | |
| "rewards/score_reward": 0.75, | |
| "step": 81 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 277.28125, | |
| "epoch": 0.08378033205619413, | |
| "grad_norm": 8.048367500305176, | |
| "kl": 0.10791015625, | |
| "learning_rate": 9.161554192229039e-07, | |
| "loss": 0.0001, | |
| "reward": 1.75, | |
| "reward_std": 0.4492306634783745, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.75, | |
| "step": 82 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 282.4375, | |
| "epoch": 0.08480204342273308, | |
| "grad_norm": 7.685079097747803, | |
| "kl": 0.073486328125, | |
| "learning_rate": 9.151329243353783e-07, | |
| "loss": 0.0001, | |
| "reward": 1.8125, | |
| "reward_std": 0.408231720328331, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.8125, | |
| "step": 83 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 291.125, | |
| "epoch": 0.08582375478927202, | |
| "grad_norm": 17.51926612854004, | |
| "kl": 0.07470703125, | |
| "learning_rate": 9.141104294478528e-07, | |
| "loss": 0.0001, | |
| "reward": 1.65625, | |
| "reward_std": 0.47137709707021713, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.65625, | |
| "step": 84 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 285.5625, | |
| "epoch": 0.08684546615581099, | |
| "grad_norm": 10.134940147399902, | |
| "kl": 0.1995849609375, | |
| "learning_rate": 9.130879345603272e-07, | |
| "loss": 0.0002, | |
| "reward": 1.71875, | |
| "reward_std": 0.5195090994238853, | |
| "rewards/format_reward": 0.96875, | |
| "rewards/score_reward": 0.75, | |
| "step": 85 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 293.28125, | |
| "epoch": 0.08786717752234993, | |
| "grad_norm": 6.437090873718262, | |
| "kl": 0.0802001953125, | |
| "learning_rate": 9.120654396728016e-07, | |
| "loss": 0.0001, | |
| "reward": 1.875, | |
| "reward_std": 0.2925042062997818, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.875, | |
| "step": 86 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 299.59375, | |
| "epoch": 0.08888888888888889, | |
| "grad_norm": 33.120174407958984, | |
| "kl": 0.083251953125, | |
| "learning_rate": 9.11042944785276e-07, | |
| "loss": 0.0001, | |
| "reward": 1.8125, | |
| "reward_std": 0.249358132481575, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.8125, | |
| "step": 87 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 274.3125, | |
| "epoch": 0.08991060025542784, | |
| "grad_norm": 12.132689476013184, | |
| "kl": 0.1241455078125, | |
| "learning_rate": 9.100204498977506e-07, | |
| "loss": 0.0001, | |
| "reward": 1.78125, | |
| "reward_std": 0.4218914955854416, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.78125, | |
| "step": 88 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 294.90625, | |
| "epoch": 0.0909323116219668, | |
| "grad_norm": 5.772459030151367, | |
| "kl": 0.084228515625, | |
| "learning_rate": 9.08997955010225e-07, | |
| "loss": 0.0001, | |
| "reward": 1.5625, | |
| "reward_std": 0.5418623387813568, | |
| "rewards/format_reward": 0.96875, | |
| "rewards/score_reward": 0.59375, | |
| "step": 89 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 298.3125, | |
| "epoch": 0.09195402298850575, | |
| "grad_norm": 5.2794389724731445, | |
| "kl": 0.077392578125, | |
| "learning_rate": 9.079754601226993e-07, | |
| "loss": 0.0001, | |
| "reward": 1.75, | |
| "reward_std": 0.3745020925998688, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.75, | |
| "step": 90 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 312.625, | |
| "epoch": 0.0929757343550447, | |
| "grad_norm": 4.3061370849609375, | |
| "kl": 0.0804443359375, | |
| "learning_rate": 9.069529652351737e-07, | |
| "loss": 0.0001, | |
| "reward": 1.90625, | |
| "reward_std": 0.2041158601641655, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.90625, | |
| "step": 91 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 312.6875, | |
| "epoch": 0.09399744572158365, | |
| "grad_norm": 6.9055304527282715, | |
| "kl": 0.0926513671875, | |
| "learning_rate": 9.059304703476482e-07, | |
| "loss": 0.0001, | |
| "reward": 1.6875, | |
| "reward_std": 0.4671337679028511, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.6875, | |
| "step": 92 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 304.0, | |
| "epoch": 0.0950191570881226, | |
| "grad_norm": 7.965025424957275, | |
| "kl": 0.0736083984375, | |
| "learning_rate": 9.049079754601226e-07, | |
| "loss": 0.0001, | |
| "reward": 1.65625, | |
| "reward_std": 0.4807935431599617, | |
| "rewards/format_reward": 0.9375, | |
| "rewards/score_reward": 0.71875, | |
| "step": 93 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 314.9375, | |
| "epoch": 0.09604086845466156, | |
| "grad_norm": 17.374303817749023, | |
| "kl": 0.0794677734375, | |
| "learning_rate": 9.03885480572597e-07, | |
| "loss": 0.0001, | |
| "reward": 1.6875, | |
| "reward_std": 0.49022960662841797, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.6875, | |
| "step": 94 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 315.0625, | |
| "epoch": 0.0970625798212005, | |
| "grad_norm": 6.828720569610596, | |
| "kl": 0.0804443359375, | |
| "learning_rate": 9.028629856850716e-07, | |
| "loss": 0.0001, | |
| "reward": 1.84375, | |
| "reward_std": 0.24511480331420898, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.84375, | |
| "step": 95 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 328.5625, | |
| "epoch": 0.09808429118773947, | |
| "grad_norm": 8.306486129760742, | |
| "kl": 0.078857421875, | |
| "learning_rate": 9.01840490797546e-07, | |
| "loss": 0.0001, | |
| "reward": 1.71875, | |
| "reward_std": 0.4534739926457405, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.71875, | |
| "step": 96 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 320.6875, | |
| "epoch": 0.09910600255427841, | |
| "grad_norm": 5.204671859741211, | |
| "kl": 0.08203125, | |
| "learning_rate": 9.008179959100204e-07, | |
| "loss": 0.0001, | |
| "reward": 1.59375, | |
| "reward_std": 0.5123760402202606, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.59375, | |
| "step": 97 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 340.5, | |
| "epoch": 0.10012771392081737, | |
| "grad_norm": 6.855844974517822, | |
| "kl": 0.0723876953125, | |
| "learning_rate": 8.997955010224948e-07, | |
| "loss": 0.0001, | |
| "reward": 1.875, | |
| "reward_std": 0.2925042062997818, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.875, | |
| "step": 98 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 337.75, | |
| "epoch": 0.10114942528735632, | |
| "grad_norm": 3.856675148010254, | |
| "kl": 0.077880859375, | |
| "learning_rate": 8.987730061349693e-07, | |
| "loss": 0.0001, | |
| "reward": 1.6875, | |
| "reward_std": 0.3924051970243454, | |
| "rewards/format_reward": 0.96875, | |
| "rewards/score_reward": 0.71875, | |
| "step": 99 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 315.90625, | |
| "epoch": 0.10217113665389528, | |
| "grad_norm": 8.271965026855469, | |
| "kl": 0.07696533203125, | |
| "learning_rate": 8.977505112474437e-07, | |
| "loss": 0.0001, | |
| "reward": 1.6875, | |
| "reward_std": 0.49022960662841797, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.6875, | |
| "step": 100 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 329.5, | |
| "epoch": 0.10319284802043423, | |
| "grad_norm": 6.739444255828857, | |
| "kl": 0.082275390625, | |
| "learning_rate": 8.967280163599181e-07, | |
| "loss": 0.0001, | |
| "reward": 1.75, | |
| "reward_std": 0.4355512708425522, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.75, | |
| "step": 101 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 322.46875, | |
| "epoch": 0.10421455938697317, | |
| "grad_norm": 33.61738204956055, | |
| "kl": 0.0731201171875, | |
| "learning_rate": 8.957055214723927e-07, | |
| "loss": 0.0001, | |
| "reward": 1.5625, | |
| "reward_std": 0.49022960662841797, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.5625, | |
| "step": 102 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 329.125, | |
| "epoch": 0.10523627075351213, | |
| "grad_norm": 33.1614875793457, | |
| "kl": 0.081787109375, | |
| "learning_rate": 8.946830265848671e-07, | |
| "loss": 0.0001, | |
| "reward": 1.6875, | |
| "reward_std": 0.4808131605386734, | |
| "rewards/format_reward": 0.96875, | |
| "rewards/score_reward": 0.71875, | |
| "step": 103 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 332.03125, | |
| "epoch": 0.10625798212005108, | |
| "grad_norm": 8.005935668945312, | |
| "kl": 0.0765380859375, | |
| "learning_rate": 8.936605316973415e-07, | |
| "loss": 0.0001, | |
| "reward": 1.65625, | |
| "reward_std": 0.494472935795784, | |
| "rewards/format_reward": 0.96875, | |
| "rewards/score_reward": 0.6875, | |
| "step": 104 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 297.09375, | |
| "epoch": 0.10727969348659004, | |
| "grad_norm": 4.864773750305176, | |
| "kl": 0.080078125, | |
| "learning_rate": 8.926380368098159e-07, | |
| "loss": 0.0001, | |
| "reward": 1.84375, | |
| "reward_std": 0.3061639815568924, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.84375, | |
| "step": 105 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 308.84375, | |
| "epoch": 0.10830140485312899, | |
| "grad_norm": 6.101370811462402, | |
| "kl": 0.0797119140625, | |
| "learning_rate": 8.916155419222904e-07, | |
| "loss": 0.0001, | |
| "reward": 1.78125, | |
| "reward_std": 0.4218914955854416, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.78125, | |
| "step": 106 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 331.5625, | |
| "epoch": 0.10932311621966795, | |
| "grad_norm": 5.485221862792969, | |
| "kl": 0.081298828125, | |
| "learning_rate": 8.905930470347647e-07, | |
| "loss": 0.0001, | |
| "reward": 1.78125, | |
| "reward_std": 0.3608423173427582, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.78125, | |
| "step": 107 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 313.21875, | |
| "epoch": 0.1103448275862069, | |
| "grad_norm": 44.665809631347656, | |
| "kl": 0.084228515625, | |
| "learning_rate": 8.895705521472392e-07, | |
| "loss": 0.0001, | |
| "reward": 1.71875, | |
| "reward_std": 0.4397946000099182, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.71875, | |
| "step": 108 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 324.21875, | |
| "epoch": 0.11136653895274586, | |
| "grad_norm": 5.604710102081299, | |
| "kl": 0.0872802734375, | |
| "learning_rate": 8.885480572597137e-07, | |
| "loss": 0.0001, | |
| "reward": 1.75, | |
| "reward_std": 0.4355512708425522, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.75, | |
| "step": 109 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 300.78125, | |
| "epoch": 0.1123882503192848, | |
| "grad_norm": 5.592599868774414, | |
| "kl": 0.072265625, | |
| "learning_rate": 8.875255623721881e-07, | |
| "loss": 0.0001, | |
| "reward": 1.71875, | |
| "reward_std": 0.4628904387354851, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.71875, | |
| "step": 110 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 319.5625, | |
| "epoch": 0.11340996168582375, | |
| "grad_norm": 5.424222469329834, | |
| "kl": 0.0809326171875, | |
| "learning_rate": 8.865030674846625e-07, | |
| "loss": 0.0001, | |
| "reward": 1.8125, | |
| "reward_std": 0.408231720328331, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.8125, | |
| "step": 111 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 314.8125, | |
| "epoch": 0.11443167305236271, | |
| "grad_norm": 8.746443748474121, | |
| "kl": 0.0892333984375, | |
| "learning_rate": 8.854805725971369e-07, | |
| "loss": 0.0001, | |
| "reward": 1.6875, | |
| "reward_std": 0.4671337679028511, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.6875, | |
| "step": 112 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 319.28125, | |
| "epoch": 0.11545338441890166, | |
| "grad_norm": 7.040622711181641, | |
| "kl": 0.087890625, | |
| "learning_rate": 8.844580777096114e-07, | |
| "loss": 0.0001, | |
| "reward": 1.8125, | |
| "reward_std": 0.3335031494498253, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.8125, | |
| "step": 113 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 327.625, | |
| "epoch": 0.11647509578544062, | |
| "grad_norm": 3.7573049068450928, | |
| "kl": 0.0821533203125, | |
| "learning_rate": 8.834355828220858e-07, | |
| "loss": 0.0001, | |
| "reward": 1.84375, | |
| "reward_std": 0.22201896458864212, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.84375, | |
| "step": 114 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 320.8125, | |
| "epoch": 0.11749680715197956, | |
| "grad_norm": 8.398815155029297, | |
| "kl": 0.0875244140625, | |
| "learning_rate": 8.824130879345603e-07, | |
| "loss": 0.0001, | |
| "reward": 1.90625, | |
| "reward_std": 0.2041158601641655, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.90625, | |
| "step": 115 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 323.0, | |
| "epoch": 0.11851851851851852, | |
| "grad_norm": 5.150262355804443, | |
| "kl": 0.0899658203125, | |
| "learning_rate": 8.813905930470347e-07, | |
| "loss": 0.0001, | |
| "reward": 1.8125, | |
| "reward_std": 0.3335031494498253, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.8125, | |
| "step": 116 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 333.46875, | |
| "epoch": 0.11954022988505747, | |
| "grad_norm": 5.349157333374023, | |
| "kl": 0.080810546875, | |
| "learning_rate": 8.803680981595092e-07, | |
| "loss": 0.0001, | |
| "reward": 1.84375, | |
| "reward_std": 0.3198433741927147, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.84375, | |
| "step": 117 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 315.15625, | |
| "epoch": 0.12056194125159643, | |
| "grad_norm": 5.52967643737793, | |
| "kl": 0.083984375, | |
| "learning_rate": 8.793456032719836e-07, | |
| "loss": 0.0001, | |
| "reward": 1.8125, | |
| "reward_std": 0.3335031494498253, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.8125, | |
| "step": 118 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 323.71875, | |
| "epoch": 0.12158365261813538, | |
| "grad_norm": 7.248080730438232, | |
| "kl": 0.096435546875, | |
| "learning_rate": 8.78323108384458e-07, | |
| "loss": 0.0001, | |
| "reward": 1.8125, | |
| "reward_std": 0.3471825420856476, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.8125, | |
| "step": 119 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 347.125, | |
| "epoch": 0.12260536398467432, | |
| "grad_norm": 14.262347221374512, | |
| "kl": 0.0821533203125, | |
| "learning_rate": 8.773006134969325e-07, | |
| "loss": 0.0001, | |
| "reward": 1.9375, | |
| "reward_std": 0.1767766922712326, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.9375, | |
| "step": 120 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 326.28125, | |
| "epoch": 0.12362707535121328, | |
| "grad_norm": 10.96322250366211, | |
| "kl": 0.084228515625, | |
| "learning_rate": 8.76278118609407e-07, | |
| "loss": 0.0001, | |
| "reward": 1.6875, | |
| "reward_std": 0.49022960662841797, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.6875, | |
| "step": 121 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 340.84375, | |
| "epoch": 0.12464878671775223, | |
| "grad_norm": 3.051522731781006, | |
| "kl": 0.0831298828125, | |
| "learning_rate": 8.752556237218814e-07, | |
| "loss": 0.0001, | |
| "reward": 1.96875, | |
| "reward_std": 0.0883883461356163, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.96875, | |
| "step": 122 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 336.40625, | |
| "epoch": 0.12567049808429118, | |
| "grad_norm": 6.98160457611084, | |
| "kl": 0.0775146484375, | |
| "learning_rate": 8.742331288343558e-07, | |
| "loss": 0.0001, | |
| "reward": 1.8125, | |
| "reward_std": 0.3945523276925087, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.8125, | |
| "step": 123 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 329.90625, | |
| "epoch": 0.12669220945083015, | |
| "grad_norm": 7.224276065826416, | |
| "kl": 0.0823974609375, | |
| "learning_rate": 8.732106339468303e-07, | |
| "loss": 0.0001, | |
| "reward": 1.78125, | |
| "reward_std": 0.24511480331420898, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.78125, | |
| "step": 124 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 334.1875, | |
| "epoch": 0.1277139208173691, | |
| "grad_norm": 3.3383820056915283, | |
| "kl": 0.0880126953125, | |
| "learning_rate": 8.721881390593046e-07, | |
| "loss": 0.0001, | |
| "reward": 1.9375, | |
| "reward_std": 0.1767766922712326, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.9375, | |
| "step": 125 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 330.21875, | |
| "epoch": 0.12873563218390804, | |
| "grad_norm": 8.736552238464355, | |
| "kl": 0.0888671875, | |
| "learning_rate": 8.71165644171779e-07, | |
| "loss": 0.0001, | |
| "reward": 1.84375, | |
| "reward_std": 0.3061639815568924, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.84375, | |
| "step": 126 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 319.78125, | |
| "epoch": 0.129757343550447, | |
| "grad_norm": 5.903339862823486, | |
| "kl": 0.0841064453125, | |
| "learning_rate": 8.701431492842535e-07, | |
| "loss": 0.0001, | |
| "reward": 1.84375, | |
| "reward_std": 0.3808925524353981, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.84375, | |
| "step": 127 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 336.75, | |
| "epoch": 0.13077905491698597, | |
| "grad_norm": 18.871429443359375, | |
| "kl": 0.0933837890625, | |
| "learning_rate": 8.69120654396728e-07, | |
| "loss": 0.0001, | |
| "reward": 1.84375, | |
| "reward_std": 0.3198433741927147, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.84375, | |
| "step": 128 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 333.84375, | |
| "epoch": 0.1318007662835249, | |
| "grad_norm": 6.758423328399658, | |
| "kl": 0.0882568359375, | |
| "learning_rate": 8.680981595092024e-07, | |
| "loss": 0.0001, | |
| "reward": 1.75, | |
| "reward_std": 0.481486439704895, | |
| "rewards/format_reward": 0.96875, | |
| "rewards/score_reward": 0.78125, | |
| "step": 129 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 320.8125, | |
| "epoch": 0.13282247765006386, | |
| "grad_norm": 4.686440944671631, | |
| "kl": 0.0865478515625, | |
| "learning_rate": 8.670756646216768e-07, | |
| "loss": 0.0001, | |
| "reward": 1.8125, | |
| "reward_std": 0.3945523276925087, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.8125, | |
| "step": 130 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 325.0, | |
| "epoch": 0.1338441890166028, | |
| "grad_norm": 3.1548657417297363, | |
| "kl": 0.086181640625, | |
| "learning_rate": 8.660531697341513e-07, | |
| "loss": 0.0001, | |
| "reward": 1.96875, | |
| "reward_std": 0.0883883461356163, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.96875, | |
| "step": 131 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 314.125, | |
| "epoch": 0.13486590038314175, | |
| "grad_norm": 4.451632022857666, | |
| "kl": 0.096435546875, | |
| "learning_rate": 8.650306748466257e-07, | |
| "loss": 0.0001, | |
| "reward": 1.84375, | |
| "reward_std": 0.3061639815568924, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.84375, | |
| "step": 132 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 316.09375, | |
| "epoch": 0.13588761174968073, | |
| "grad_norm": 6.362185001373291, | |
| "kl": 0.087890625, | |
| "learning_rate": 8.640081799591001e-07, | |
| "loss": 0.0001, | |
| "reward": 1.78125, | |
| "reward_std": 0.24511480331420898, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.78125, | |
| "step": 133 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 320.46875, | |
| "epoch": 0.13690932311621967, | |
| "grad_norm": 7.970372200012207, | |
| "kl": 0.0863037109375, | |
| "learning_rate": 8.629856850715747e-07, | |
| "loss": 0.0001, | |
| "reward": 1.59375, | |
| "reward_std": 0.5123760402202606, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.59375, | |
| "step": 134 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 315.40625, | |
| "epoch": 0.13793103448275862, | |
| "grad_norm": 10.395743370056152, | |
| "kl": 0.0943603515625, | |
| "learning_rate": 8.619631901840491e-07, | |
| "loss": 0.0001, | |
| "reward": 1.875, | |
| "reward_std": 0.2925042062997818, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.875, | |
| "step": 135 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 334.0625, | |
| "epoch": 0.13895274584929757, | |
| "grad_norm": 6.411652088165283, | |
| "kl": 0.101318359375, | |
| "learning_rate": 8.609406952965235e-07, | |
| "loss": 0.0001, | |
| "reward": 1.78125, | |
| "reward_std": 0.4218914955854416, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.78125, | |
| "step": 136 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 339.90625, | |
| "epoch": 0.13997445721583654, | |
| "grad_norm": 17.097640991210938, | |
| "kl": 0.098388671875, | |
| "learning_rate": 8.599182004089979e-07, | |
| "loss": 0.0001, | |
| "reward": 1.90625, | |
| "reward_std": 0.2651650384068489, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.90625, | |
| "step": 137 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 313.25, | |
| "epoch": 0.1409961685823755, | |
| "grad_norm": 11.207436561584473, | |
| "kl": 0.1197509765625, | |
| "learning_rate": 8.588957055214724e-07, | |
| "loss": 0.0001, | |
| "reward": 1.71875, | |
| "reward_std": 0.4765698313713074, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.71875, | |
| "step": 138 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 340.59375, | |
| "epoch": 0.14201787994891443, | |
| "grad_norm": 8.548532485961914, | |
| "kl": 0.08642578125, | |
| "learning_rate": 8.578732106339468e-07, | |
| "loss": 0.0001, | |
| "reward": 1.71875, | |
| "reward_std": 0.4765698313713074, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.71875, | |
| "step": 139 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 313.15625, | |
| "epoch": 0.14303959131545338, | |
| "grad_norm": 7.379908561706543, | |
| "kl": 0.09326171875, | |
| "learning_rate": 8.568507157464212e-07, | |
| "loss": 0.0001, | |
| "reward": 1.84375, | |
| "reward_std": 0.24511480331420898, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.84375, | |
| "step": 140 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 330.21875, | |
| "epoch": 0.14406130268199233, | |
| "grad_norm": 4.854974269866943, | |
| "kl": 0.096435546875, | |
| "learning_rate": 8.558282208588958e-07, | |
| "loss": 0.0001, | |
| "reward": 1.90625, | |
| "reward_std": 0.2651650384068489, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.90625, | |
| "step": 141 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 324.25, | |
| "epoch": 0.1450830140485313, | |
| "grad_norm": 6.276727676391602, | |
| "kl": 0.08935546875, | |
| "learning_rate": 8.548057259713702e-07, | |
| "loss": 0.0001, | |
| "reward": 1.875, | |
| "reward_std": 0.2925042062997818, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.875, | |
| "step": 142 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 336.15625, | |
| "epoch": 0.14610472541507025, | |
| "grad_norm": 4.057552814483643, | |
| "kl": 0.094970703125, | |
| "learning_rate": 8.537832310838445e-07, | |
| "loss": 0.0001, | |
| "reward": 1.875, | |
| "reward_std": 0.2925042062997818, | |
| "rewards/format_reward": 0.96875, | |
| "rewards/score_reward": 0.90625, | |
| "step": 143 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 327.40625, | |
| "epoch": 0.1471264367816092, | |
| "grad_norm": 13.658071517944336, | |
| "kl": 0.099365234375, | |
| "learning_rate": 8.527607361963189e-07, | |
| "loss": 0.0001, | |
| "reward": 1.8125, | |
| "reward_std": 0.408231720328331, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.8125, | |
| "step": 144 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 322.8125, | |
| "epoch": 0.14814814814814814, | |
| "grad_norm": 8.570537567138672, | |
| "kl": 0.0906982421875, | |
| "learning_rate": 8.517382413087934e-07, | |
| "loss": 0.0001, | |
| "reward": 1.875, | |
| "reward_std": 0.2314550280570984, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.875, | |
| "step": 145 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 324.96875, | |
| "epoch": 0.14916985951468711, | |
| "grad_norm": 17.7524471282959, | |
| "kl": 0.0909423828125, | |
| "learning_rate": 8.507157464212678e-07, | |
| "loss": 0.0001, | |
| "reward": 1.71875, | |
| "reward_std": 0.4628904387354851, | |
| "rewards/format_reward": 0.96875, | |
| "rewards/score_reward": 0.75, | |
| "step": 146 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 335.15625, | |
| "epoch": 0.15019157088122606, | |
| "grad_norm": 5.43731689453125, | |
| "kl": 0.094482421875, | |
| "learning_rate": 8.496932515337423e-07, | |
| "loss": 0.0001, | |
| "reward": 1.90625, | |
| "reward_std": 0.2651650384068489, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.90625, | |
| "step": 147 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 322.21875, | |
| "epoch": 0.151213282247765, | |
| "grad_norm": 4.30055046081543, | |
| "kl": 0.110595703125, | |
| "learning_rate": 8.486707566462167e-07, | |
| "loss": 0.0001, | |
| "reward": 1.90625, | |
| "reward_std": 0.2041158601641655, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.90625, | |
| "step": 148 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 317.21875, | |
| "epoch": 0.15223499361430395, | |
| "grad_norm": 3.138890504837036, | |
| "kl": 0.1077880859375, | |
| "learning_rate": 8.476482617586912e-07, | |
| "loss": 0.0001, | |
| "reward": 1.875, | |
| "reward_std": 0.2925042062997818, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.875, | |
| "step": 149 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 302.0, | |
| "epoch": 0.1532567049808429, | |
| "grad_norm": 6.791632652282715, | |
| "kl": 0.1146240234375, | |
| "learning_rate": 8.466257668711656e-07, | |
| "loss": 0.0001, | |
| "reward": 1.875, | |
| "reward_std": 0.2925042062997818, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.875, | |
| "step": 150 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 320.15625, | |
| "epoch": 0.15427841634738187, | |
| "grad_norm": 6.996349811553955, | |
| "kl": 0.12353515625, | |
| "learning_rate": 8.4560327198364e-07, | |
| "loss": 0.0001, | |
| "reward": 1.84375, | |
| "reward_std": 0.3061639815568924, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.84375, | |
| "step": 151 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 301.65625, | |
| "epoch": 0.15530012771392082, | |
| "grad_norm": 17.680313110351562, | |
| "kl": 0.0958251953125, | |
| "learning_rate": 8.445807770961145e-07, | |
| "loss": 0.0001, | |
| "reward": 1.78125, | |
| "reward_std": 0.3608423173427582, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.78125, | |
| "step": 152 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 309.0, | |
| "epoch": 0.15632183908045977, | |
| "grad_norm": 4.913498401641846, | |
| "kl": 0.1185302734375, | |
| "learning_rate": 8.435582822085889e-07, | |
| "loss": 0.0001, | |
| "reward": 1.9375, | |
| "reward_std": 0.1767766922712326, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.9375, | |
| "step": 153 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 299.1875, | |
| "epoch": 0.15734355044699871, | |
| "grad_norm": 8.393632888793945, | |
| "kl": 0.1197509765625, | |
| "learning_rate": 8.425357873210634e-07, | |
| "loss": 0.0001, | |
| "reward": 1.875, | |
| "reward_std": 0.2925042062997818, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.875, | |
| "step": 154 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 304.53125, | |
| "epoch": 0.1583652618135377, | |
| "grad_norm": 15.038782119750977, | |
| "kl": 0.107666015625, | |
| "learning_rate": 8.415132924335378e-07, | |
| "loss": 0.0001, | |
| "reward": 1.8125, | |
| "reward_std": 0.3335031494498253, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.8125, | |
| "step": 155 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 312.75, | |
| "epoch": 0.15938697318007664, | |
| "grad_norm": 4.5288987159729, | |
| "kl": 0.109619140625, | |
| "learning_rate": 8.404907975460123e-07, | |
| "loss": 0.0001, | |
| "reward": 1.90625, | |
| "reward_std": 0.2651650384068489, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.90625, | |
| "step": 156 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 302.46875, | |
| "epoch": 0.16040868454661558, | |
| "grad_norm": 10.11666488647461, | |
| "kl": 0.1051025390625, | |
| "learning_rate": 8.394683026584867e-07, | |
| "loss": 0.0001, | |
| "reward": 1.78125, | |
| "reward_std": 0.4218914955854416, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.78125, | |
| "step": 157 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 299.53125, | |
| "epoch": 0.16143039591315453, | |
| "grad_norm": 3.016226053237915, | |
| "kl": 0.114501953125, | |
| "learning_rate": 8.384458077709611e-07, | |
| "loss": 0.0001, | |
| "reward": 1.9375, | |
| "reward_std": 0.1767766922712326, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.9375, | |
| "step": 158 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 314.65625, | |
| "epoch": 0.16245210727969348, | |
| "grad_norm": 7.130186557769775, | |
| "kl": 0.1036376953125, | |
| "learning_rate": 8.374233128834356e-07, | |
| "loss": 0.0001, | |
| "reward": 1.8125, | |
| "reward_std": 0.3471825420856476, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.8125, | |
| "step": 159 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 301.9375, | |
| "epoch": 0.16347381864623245, | |
| "grad_norm": 4.870277404785156, | |
| "kl": 0.1094970703125, | |
| "learning_rate": 8.3640081799591e-07, | |
| "loss": 0.0001, | |
| "reward": 1.78125, | |
| "reward_std": 0.3608423173427582, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.78125, | |
| "step": 160 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 318.78125, | |
| "epoch": 0.1644955300127714, | |
| "grad_norm": 5.395737648010254, | |
| "kl": 0.1004638671875, | |
| "learning_rate": 8.353783231083844e-07, | |
| "loss": 0.0001, | |
| "reward": 1.875, | |
| "reward_std": 0.2177756354212761, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.875, | |
| "step": 161 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 300.0625, | |
| "epoch": 0.16551724137931034, | |
| "grad_norm": 10.07991886138916, | |
| "kl": 0.1109619140625, | |
| "learning_rate": 8.343558282208588e-07, | |
| "loss": 0.0001, | |
| "reward": 1.8125, | |
| "reward_std": 0.3335031494498253, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.8125, | |
| "step": 162 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 305.4375, | |
| "epoch": 0.1665389527458493, | |
| "grad_norm": 19.749267578125, | |
| "kl": 0.1131591796875, | |
| "learning_rate": 8.333333333333333e-07, | |
| "loss": 0.0001, | |
| "reward": 1.8125, | |
| "reward_std": 0.408231720328331, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.8125, | |
| "step": 163 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 314.0, | |
| "epoch": 0.16756066411238826, | |
| "grad_norm": 5.89938497543335, | |
| "kl": 0.10498046875, | |
| "learning_rate": 8.323108384458077e-07, | |
| "loss": 0.0001, | |
| "reward": 1.84375, | |
| "reward_std": 0.3061639815568924, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.84375, | |
| "step": 164 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 298.3125, | |
| "epoch": 0.1685823754789272, | |
| "grad_norm": 7.932590484619141, | |
| "kl": 0.105224609375, | |
| "learning_rate": 8.312883435582821e-07, | |
| "loss": 0.0001, | |
| "reward": 1.75, | |
| "reward_std": 0.3745020925998688, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.75, | |
| "step": 165 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 295.0625, | |
| "epoch": 0.16960408684546616, | |
| "grad_norm": 2.4129507541656494, | |
| "kl": 0.1126708984375, | |
| "learning_rate": 8.302658486707566e-07, | |
| "loss": 0.0001, | |
| "reward": 1.9375, | |
| "reward_std": 0.1767766922712326, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.9375, | |
| "step": 166 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 305.53125, | |
| "epoch": 0.1706257982120051, | |
| "grad_norm": 2.6215834617614746, | |
| "kl": 0.1224365234375, | |
| "learning_rate": 8.292433537832311e-07, | |
| "loss": 0.0001, | |
| "reward": 1.9375, | |
| "reward_std": 0.1767766922712326, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.9375, | |
| "step": 167 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 304.6875, | |
| "epoch": 0.17164750957854405, | |
| "grad_norm": 4.020922660827637, | |
| "kl": 0.10693359375, | |
| "learning_rate": 8.282208588957055e-07, | |
| "loss": 0.0001, | |
| "reward": 1.875, | |
| "reward_std": 0.2925042062997818, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.875, | |
| "step": 168 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 302.3125, | |
| "epoch": 0.17266922094508302, | |
| "grad_norm": 18.06279945373535, | |
| "kl": 0.1505126953125, | |
| "learning_rate": 8.271983640081799e-07, | |
| "loss": 0.0002, | |
| "reward": 1.90625, | |
| "reward_std": 0.2041158601641655, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.90625, | |
| "step": 169 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 282.5625, | |
| "epoch": 0.17369093231162197, | |
| "grad_norm": 10.634078025817871, | |
| "kl": 0.12451171875, | |
| "learning_rate": 8.261758691206544e-07, | |
| "loss": 0.0001, | |
| "reward": 1.84375, | |
| "reward_std": 0.3808925524353981, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.84375, | |
| "step": 170 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 289.3125, | |
| "epoch": 0.17471264367816092, | |
| "grad_norm": 6.507157325744629, | |
| "kl": 0.1072998046875, | |
| "learning_rate": 8.251533742331288e-07, | |
| "loss": 0.0001, | |
| "reward": 1.875, | |
| "reward_std": 0.2925042062997818, | |
| "rewards/format_reward": 0.96875, | |
| "rewards/score_reward": 0.90625, | |
| "step": 171 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 292.46875, | |
| "epoch": 0.17573435504469986, | |
| "grad_norm": 6.69488525390625, | |
| "kl": 0.1121826171875, | |
| "learning_rate": 8.241308793456032e-07, | |
| "loss": 0.0001, | |
| "reward": 1.90625, | |
| "reward_std": 0.2041158601641655, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.90625, | |
| "step": 172 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 300.375, | |
| "epoch": 0.17675606641123884, | |
| "grad_norm": 5.108429908752441, | |
| "kl": 0.115234375, | |
| "learning_rate": 8.231083844580777e-07, | |
| "loss": 0.0001, | |
| "reward": 1.8125, | |
| "reward_std": 0.3335031494498253, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.8125, | |
| "step": 173 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 296.375, | |
| "epoch": 0.17777777777777778, | |
| "grad_norm": 13.315193176269531, | |
| "kl": 0.1217041015625, | |
| "learning_rate": 8.220858895705522e-07, | |
| "loss": 0.0001, | |
| "reward": 1.78125, | |
| "reward_std": 0.4355708882212639, | |
| "rewards/format_reward": 0.9375, | |
| "rewards/score_reward": 0.84375, | |
| "step": 174 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 306.0625, | |
| "epoch": 0.17879948914431673, | |
| "grad_norm": 19.67743492126465, | |
| "kl": 0.11572265625, | |
| "learning_rate": 8.210633946830266e-07, | |
| "loss": 0.0001, | |
| "reward": 1.8125, | |
| "reward_std": 0.408231720328331, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.8125, | |
| "step": 175 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 304.03125, | |
| "epoch": 0.17982120051085568, | |
| "grad_norm": 5.040560245513916, | |
| "kl": 0.10888671875, | |
| "learning_rate": 8.20040899795501e-07, | |
| "loss": 0.0001, | |
| "reward": 1.9375, | |
| "reward_std": 0.1767766922712326, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.9375, | |
| "step": 176 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 310.65625, | |
| "epoch": 0.18084291187739462, | |
| "grad_norm": 4.212230205535889, | |
| "kl": 0.1094970703125, | |
| "learning_rate": 8.190184049079755e-07, | |
| "loss": 0.0001, | |
| "reward": 1.90625, | |
| "reward_std": 0.2041158601641655, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.90625, | |
| "step": 177 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 306.8125, | |
| "epoch": 0.1818646232439336, | |
| "grad_norm": 3.313608169555664, | |
| "kl": 0.12255859375, | |
| "learning_rate": 8.179959100204498e-07, | |
| "loss": 0.0001, | |
| "reward": 1.90625, | |
| "reward_std": 0.2041158601641655, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.90625, | |
| "step": 178 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 286.90625, | |
| "epoch": 0.18288633461047255, | |
| "grad_norm": 2.3506696224212646, | |
| "kl": 0.1290283203125, | |
| "learning_rate": 8.169734151329242e-07, | |
| "loss": 0.0001, | |
| "reward": 1.9375, | |
| "reward_std": 0.1157275140285492, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.9375, | |
| "step": 179 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 295.34375, | |
| "epoch": 0.1839080459770115, | |
| "grad_norm": 9.331832885742188, | |
| "kl": 0.1142578125, | |
| "learning_rate": 8.159509202453987e-07, | |
| "loss": 0.0001, | |
| "reward": 1.84375, | |
| "reward_std": 0.3061639815568924, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.84375, | |
| "step": 180 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 311.15625, | |
| "epoch": 0.18492975734355044, | |
| "grad_norm": 5.550638198852539, | |
| "kl": 0.1156005859375, | |
| "learning_rate": 8.149284253578732e-07, | |
| "loss": 0.0001, | |
| "reward": 1.75, | |
| "reward_std": 0.4261348247528076, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.75, | |
| "step": 181 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 321.5625, | |
| "epoch": 0.1859514687100894, | |
| "grad_norm": 4.090088844299316, | |
| "kl": 0.1102294921875, | |
| "learning_rate": 8.139059304703476e-07, | |
| "loss": 0.0001, | |
| "reward": 1.84375, | |
| "reward_std": 0.24511480331420898, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.84375, | |
| "step": 182 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 296.0, | |
| "epoch": 0.18697318007662836, | |
| "grad_norm": 16.630859375, | |
| "kl": 0.116943359375, | |
| "learning_rate": 8.12883435582822e-07, | |
| "loss": 0.0001, | |
| "reward": 1.875, | |
| "reward_std": 0.3535533845424652, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.875, | |
| "step": 183 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 304.0, | |
| "epoch": 0.1879948914431673, | |
| "grad_norm": 18.008068084716797, | |
| "kl": 0.1181640625, | |
| "learning_rate": 8.118609406952965e-07, | |
| "loss": 0.0001, | |
| "reward": 1.90625, | |
| "reward_std": 0.2651650384068489, | |
| "rewards/format_reward": 0.96875, | |
| "rewards/score_reward": 0.9375, | |
| "step": 184 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 322.0625, | |
| "epoch": 0.18901660280970625, | |
| "grad_norm": 4.149680137634277, | |
| "kl": 0.1103515625, | |
| "learning_rate": 8.108384458077709e-07, | |
| "loss": 0.0001, | |
| "reward": 1.78125, | |
| "reward_std": 0.4218914955854416, | |
| "rewards/format_reward": 0.90625, | |
| "rewards/score_reward": 0.875, | |
| "step": 185 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 326.65625, | |
| "epoch": 0.1900383141762452, | |
| "grad_norm": 3.189527988433838, | |
| "kl": 0.1256103515625, | |
| "learning_rate": 8.098159509202454e-07, | |
| "loss": 0.0001, | |
| "reward": 1.9375, | |
| "reward_std": 0.1767766922712326, | |
| "rewards/format_reward": 0.96875, | |
| "rewards/score_reward": 0.96875, | |
| "step": 186 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 310.34375, | |
| "epoch": 0.19106002554278417, | |
| "grad_norm": 6.5971360206604, | |
| "kl": 0.1153564453125, | |
| "learning_rate": 8.087934560327198e-07, | |
| "loss": 0.0001, | |
| "reward": 1.875, | |
| "reward_std": 0.2314550280570984, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.875, | |
| "step": 187 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 312.28125, | |
| "epoch": 0.19208173690932312, | |
| "grad_norm": 3.084676742553711, | |
| "kl": 0.107666015625, | |
| "learning_rate": 8.077709611451943e-07, | |
| "loss": 0.0001, | |
| "reward": 1.90625, | |
| "reward_std": 0.2041158601641655, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.90625, | |
| "step": 188 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 310.6875, | |
| "epoch": 0.19310344827586207, | |
| "grad_norm": 6.6285858154296875, | |
| "kl": 0.12890625, | |
| "learning_rate": 8.067484662576687e-07, | |
| "loss": 0.0001, | |
| "reward": 1.8125, | |
| "reward_std": 0.408231720328331, | |
| "rewards/format_reward": 0.96875, | |
| "rewards/score_reward": 0.84375, | |
| "step": 189 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 307.59375, | |
| "epoch": 0.194125159642401, | |
| "grad_norm": 6.375519752502441, | |
| "kl": 0.122314453125, | |
| "learning_rate": 8.057259713701431e-07, | |
| "loss": 0.0001, | |
| "reward": 1.75, | |
| "reward_std": 0.4492306634783745, | |
| "rewards/format_reward": 0.9375, | |
| "rewards/score_reward": 0.8125, | |
| "step": 190 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 301.03125, | |
| "epoch": 0.19514687100894, | |
| "grad_norm": 7.942838191986084, | |
| "kl": 0.1104736328125, | |
| "learning_rate": 8.047034764826176e-07, | |
| "loss": 0.0001, | |
| "reward": 1.875, | |
| "reward_std": 0.3535533845424652, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.875, | |
| "step": 191 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 319.71875, | |
| "epoch": 0.19616858237547893, | |
| "grad_norm": 7.067020893096924, | |
| "kl": 0.106201171875, | |
| "learning_rate": 8.03680981595092e-07, | |
| "loss": 0.0001, | |
| "reward": 1.875, | |
| "reward_std": 0.2925042062997818, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.875, | |
| "step": 192 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 302.375, | |
| "epoch": 0.19719029374201788, | |
| "grad_norm": 3.544130563735962, | |
| "kl": 0.1158447265625, | |
| "learning_rate": 8.026584867075665e-07, | |
| "loss": 0.0001, | |
| "reward": 1.96875, | |
| "reward_std": 0.0883883461356163, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.96875, | |
| "step": 193 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 310.375, | |
| "epoch": 0.19821200510855683, | |
| "grad_norm": 2.3012943267822266, | |
| "kl": 0.12451171875, | |
| "learning_rate": 8.016359918200409e-07, | |
| "loss": 0.0001, | |
| "reward": 1.9375, | |
| "reward_std": 0.1157275140285492, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.9375, | |
| "step": 194 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 322.84375, | |
| "epoch": 0.19923371647509577, | |
| "grad_norm": 1.7693819999694824, | |
| "kl": 0.120361328125, | |
| "learning_rate": 8.006134969325153e-07, | |
| "loss": 0.0001, | |
| "reward": 1.96875, | |
| "reward_std": 0.0883883461356163, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.96875, | |
| "step": 195 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 306.09375, | |
| "epoch": 0.20025542784163475, | |
| "grad_norm": 9.119519233703613, | |
| "kl": 0.1484375, | |
| "learning_rate": 7.995910020449897e-07, | |
| "loss": 0.0001, | |
| "reward": 1.84375, | |
| "reward_std": 0.3808925524353981, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.84375, | |
| "step": 196 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 311.1875, | |
| "epoch": 0.2012771392081737, | |
| "grad_norm": 13.540163040161133, | |
| "kl": 0.1158447265625, | |
| "learning_rate": 7.985685071574641e-07, | |
| "loss": 0.0001, | |
| "reward": 1.8125, | |
| "reward_std": 0.408231720328331, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.8125, | |
| "step": 197 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 328.75, | |
| "epoch": 0.20229885057471264, | |
| "grad_norm": 40.199424743652344, | |
| "kl": 0.1607666015625, | |
| "learning_rate": 7.975460122699385e-07, | |
| "loss": 0.0002, | |
| "reward": 1.84375, | |
| "reward_std": 0.3808925524353981, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.84375, | |
| "step": 198 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 316.09375, | |
| "epoch": 0.2033205619412516, | |
| "grad_norm": 1.4208647012710571, | |
| "kl": 0.1356201171875, | |
| "learning_rate": 7.965235173824131e-07, | |
| "loss": 0.0001, | |
| "reward": 1.96875, | |
| "reward_std": 0.0883883461356163, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.96875, | |
| "step": 199 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 311.1875, | |
| "epoch": 0.20434227330779056, | |
| "grad_norm": 7.4315996170043945, | |
| "kl": 0.1317138671875, | |
| "learning_rate": 7.955010224948875e-07, | |
| "loss": 0.0001, | |
| "reward": 1.90625, | |
| "reward_std": 0.2041158601641655, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.90625, | |
| "step": 200 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 316.5625, | |
| "epoch": 0.2053639846743295, | |
| "grad_norm": 7.12362813949585, | |
| "kl": 0.123046875, | |
| "learning_rate": 7.944785276073619e-07, | |
| "loss": 0.0001, | |
| "reward": 1.84375, | |
| "reward_std": 0.3198433741927147, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.84375, | |
| "step": 201 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 314.40625, | |
| "epoch": 0.20638569604086845, | |
| "grad_norm": 12.200655937194824, | |
| "kl": 0.131103515625, | |
| "learning_rate": 7.934560327198364e-07, | |
| "loss": 0.0001, | |
| "reward": 1.90625, | |
| "reward_std": 0.2651650384068489, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.90625, | |
| "step": 202 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 312.21875, | |
| "epoch": 0.2074074074074074, | |
| "grad_norm": 3.5837337970733643, | |
| "kl": 0.135986328125, | |
| "learning_rate": 7.924335378323108e-07, | |
| "loss": 0.0001, | |
| "reward": 1.96875, | |
| "reward_std": 0.0883883461356163, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.96875, | |
| "step": 203 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 320.375, | |
| "epoch": 0.20842911877394635, | |
| "grad_norm": 13.410809516906738, | |
| "kl": 0.1207275390625, | |
| "learning_rate": 7.914110429447852e-07, | |
| "loss": 0.0001, | |
| "reward": 1.78125, | |
| "reward_std": 0.3608423173427582, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.78125, | |
| "step": 204 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 321.03125, | |
| "epoch": 0.20945083014048532, | |
| "grad_norm": 7.2692084312438965, | |
| "kl": 0.1302490234375, | |
| "learning_rate": 7.903885480572596e-07, | |
| "loss": 0.0001, | |
| "reward": 1.875, | |
| "reward_std": 0.2925042062997818, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.875, | |
| "step": 205 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 316.5625, | |
| "epoch": 0.21047254150702427, | |
| "grad_norm": 5.394606590270996, | |
| "kl": 0.13525390625, | |
| "learning_rate": 7.893660531697342e-07, | |
| "loss": 0.0001, | |
| "reward": 1.90625, | |
| "reward_std": 0.2651650384068489, | |
| "rewards/format_reward": 0.96875, | |
| "rewards/score_reward": 0.9375, | |
| "step": 206 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 330.53125, | |
| "epoch": 0.21149425287356322, | |
| "grad_norm": 5.349623203277588, | |
| "kl": 0.1356201171875, | |
| "learning_rate": 7.883435582822086e-07, | |
| "loss": 0.0001, | |
| "reward": 1.875, | |
| "reward_std": 0.2177756354212761, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.875, | |
| "step": 207 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 298.3125, | |
| "epoch": 0.21251596424010216, | |
| "grad_norm": 2.81088924407959, | |
| "kl": 0.140869140625, | |
| "learning_rate": 7.87321063394683e-07, | |
| "loss": 0.0001, | |
| "reward": 1.9375, | |
| "reward_std": 0.1767766922712326, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.9375, | |
| "step": 208 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 321.15625, | |
| "epoch": 0.21353767560664114, | |
| "grad_norm": 3.7844772338867188, | |
| "kl": 0.141357421875, | |
| "learning_rate": 7.862985685071575e-07, | |
| "loss": 0.0001, | |
| "reward": 1.96875, | |
| "reward_std": 0.0883883461356163, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.96875, | |
| "step": 209 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 313.28125, | |
| "epoch": 0.21455938697318008, | |
| "grad_norm": 4.153626918792725, | |
| "kl": 0.14404296875, | |
| "learning_rate": 7.852760736196319e-07, | |
| "loss": 0.0001, | |
| "reward": 1.875, | |
| "reward_std": 0.2177756354212761, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.875, | |
| "step": 210 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 323.03125, | |
| "epoch": 0.21558109833971903, | |
| "grad_norm": 2.0860559940338135, | |
| "kl": 0.12841796875, | |
| "learning_rate": 7.842535787321063e-07, | |
| "loss": 0.0001, | |
| "reward": 1.9375, | |
| "reward_std": 0.1157275140285492, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.9375, | |
| "step": 211 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 323.28125, | |
| "epoch": 0.21660280970625798, | |
| "grad_norm": 22.485321044921875, | |
| "kl": 0.139892578125, | |
| "learning_rate": 7.832310838445806e-07, | |
| "loss": 0.0001, | |
| "reward": 1.875, | |
| "reward_std": 0.2925042062997818, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.875, | |
| "step": 212 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 319.59375, | |
| "epoch": 0.21762452107279692, | |
| "grad_norm": 0.013539531268179417, | |
| "kl": 0.1484375, | |
| "learning_rate": 7.822085889570552e-07, | |
| "loss": 0.0001, | |
| "reward": 2.0, | |
| "reward_std": 0.0, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 1.0, | |
| "step": 213 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 320.8125, | |
| "epoch": 0.2186462324393359, | |
| "grad_norm": 3.0891494750976562, | |
| "kl": 0.1585693359375, | |
| "learning_rate": 7.811860940695296e-07, | |
| "loss": 0.0002, | |
| "reward": 1.9375, | |
| "reward_std": 0.1157275140285492, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.9375, | |
| "step": 214 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 309.71875, | |
| "epoch": 0.21966794380587484, | |
| "grad_norm": 5.0978899002075195, | |
| "kl": 0.167724609375, | |
| "learning_rate": 7.80163599182004e-07, | |
| "loss": 0.0002, | |
| "reward": 1.875, | |
| "reward_std": 0.3535533845424652, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.875, | |
| "step": 215 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 294.65625, | |
| "epoch": 0.2206896551724138, | |
| "grad_norm": 3.0226728916168213, | |
| "kl": 0.134765625, | |
| "learning_rate": 7.791411042944785e-07, | |
| "loss": 0.0001, | |
| "reward": 1.9375, | |
| "reward_std": 0.1767766922712326, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.9375, | |
| "step": 216 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 318.84375, | |
| "epoch": 0.22171136653895274, | |
| "grad_norm": 18.734834671020508, | |
| "kl": 0.139404296875, | |
| "learning_rate": 7.781186094069529e-07, | |
| "loss": 0.0001, | |
| "reward": 1.90625, | |
| "reward_std": 0.2651650384068489, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.90625, | |
| "step": 217 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 301.46875, | |
| "epoch": 0.2227330779054917, | |
| "grad_norm": 10.709063529968262, | |
| "kl": 0.1416015625, | |
| "learning_rate": 7.770961145194273e-07, | |
| "loss": 0.0001, | |
| "reward": 1.90625, | |
| "reward_std": 0.2651650384068489, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.90625, | |
| "step": 218 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 302.0625, | |
| "epoch": 0.22375478927203066, | |
| "grad_norm": 2.3834197521209717, | |
| "kl": 0.15478515625, | |
| "learning_rate": 7.760736196319018e-07, | |
| "loss": 0.0002, | |
| "reward": 1.96875, | |
| "reward_std": 0.0883883461356163, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.96875, | |
| "step": 219 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 311.96875, | |
| "epoch": 0.2247765006385696, | |
| "grad_norm": 45.573795318603516, | |
| "kl": 0.16552734375, | |
| "learning_rate": 7.750511247443763e-07, | |
| "loss": 0.0002, | |
| "reward": 1.9375, | |
| "reward_std": 0.1767766922712326, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.9375, | |
| "step": 220 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 290.65625, | |
| "epoch": 0.22579821200510855, | |
| "grad_norm": 104.85749816894531, | |
| "kl": 0.1455078125, | |
| "learning_rate": 7.740286298568507e-07, | |
| "loss": 0.0001, | |
| "reward": 1.9375, | |
| "reward_std": 0.1767766922712326, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.9375, | |
| "step": 221 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 304.28125, | |
| "epoch": 0.2268199233716475, | |
| "grad_norm": 4.561243534088135, | |
| "kl": 0.146240234375, | |
| "learning_rate": 7.730061349693251e-07, | |
| "loss": 0.0001, | |
| "reward": 1.875, | |
| "reward_std": 0.2925042062997818, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.875, | |
| "step": 222 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 303.125, | |
| "epoch": 0.22784163473818647, | |
| "grad_norm": 2.4489190578460693, | |
| "kl": 0.15234375, | |
| "learning_rate": 7.719836400817995e-07, | |
| "loss": 0.0002, | |
| "reward": 1.96875, | |
| "reward_std": 0.0883883461356163, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.96875, | |
| "step": 223 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 284.125, | |
| "epoch": 0.22886334610472542, | |
| "grad_norm": 6.217945098876953, | |
| "kl": 0.164306640625, | |
| "learning_rate": 7.70961145194274e-07, | |
| "loss": 0.0002, | |
| "reward": 1.84375, | |
| "reward_std": 0.3061639815568924, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.84375, | |
| "step": 224 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 280.15625, | |
| "epoch": 0.22988505747126436, | |
| "grad_norm": 20.53363800048828, | |
| "kl": 0.16845703125, | |
| "learning_rate": 7.699386503067485e-07, | |
| "loss": 0.0002, | |
| "reward": 1.9375, | |
| "reward_std": 0.1767766922712326, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.9375, | |
| "step": 225 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 277.09375, | |
| "epoch": 0.2309067688378033, | |
| "grad_norm": 5.487651824951172, | |
| "kl": 0.160400390625, | |
| "learning_rate": 7.689161554192229e-07, | |
| "loss": 0.0002, | |
| "reward": 1.96875, | |
| "reward_std": 0.0883883461356163, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.96875, | |
| "step": 226 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 287.5, | |
| "epoch": 0.23192848020434229, | |
| "grad_norm": 14.836731910705566, | |
| "kl": 0.19482421875, | |
| "learning_rate": 7.678936605316974e-07, | |
| "loss": 0.0002, | |
| "reward": 1.8125, | |
| "reward_std": 0.408231720328331, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.8125, | |
| "step": 227 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 270.5, | |
| "epoch": 0.23295019157088123, | |
| "grad_norm": 7.288157939910889, | |
| "kl": 0.173828125, | |
| "learning_rate": 7.668711656441718e-07, | |
| "loss": 0.0002, | |
| "reward": 1.90625, | |
| "reward_std": 0.2041158601641655, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.90625, | |
| "step": 228 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 263.71875, | |
| "epoch": 0.23397190293742018, | |
| "grad_norm": 23.819374084472656, | |
| "kl": 0.17236328125, | |
| "learning_rate": 7.658486707566462e-07, | |
| "loss": 0.0002, | |
| "reward": 1.875, | |
| "reward_std": 0.2925042062997818, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.875, | |
| "step": 229 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 267.21875, | |
| "epoch": 0.23499361430395913, | |
| "grad_norm": 0.014544324018061161, | |
| "kl": 0.171142578125, | |
| "learning_rate": 7.648261758691205e-07, | |
| "loss": 0.0002, | |
| "reward": 2.0, | |
| "reward_std": 0.0, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 1.0, | |
| "step": 230 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 255.46875, | |
| "epoch": 0.23601532567049807, | |
| "grad_norm": 9.193100929260254, | |
| "kl": 0.20947265625, | |
| "learning_rate": 7.63803680981595e-07, | |
| "loss": 0.0002, | |
| "reward": 1.84375, | |
| "reward_std": 0.24511480331420898, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.84375, | |
| "step": 231 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 260.59375, | |
| "epoch": 0.23703703703703705, | |
| "grad_norm": 8.360955238342285, | |
| "kl": 0.2080078125, | |
| "learning_rate": 7.627811860940695e-07, | |
| "loss": 0.0002, | |
| "reward": 1.875, | |
| "reward_std": 0.3535533845424652, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.875, | |
| "step": 232 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 257.375, | |
| "epoch": 0.238058748403576, | |
| "grad_norm": 10.487519264221191, | |
| "kl": 0.186767578125, | |
| "learning_rate": 7.617586912065439e-07, | |
| "loss": 0.0002, | |
| "reward": 1.9375, | |
| "reward_std": 0.1767766922712326, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.9375, | |
| "step": 233 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 256.59375, | |
| "epoch": 0.23908045977011494, | |
| "grad_norm": 8.354109764099121, | |
| "kl": 0.202880859375, | |
| "learning_rate": 7.607361963190184e-07, | |
| "loss": 0.0002, | |
| "reward": 1.875, | |
| "reward_std": 0.2177756354212761, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.875, | |
| "step": 234 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 254.125, | |
| "epoch": 0.24010217113665389, | |
| "grad_norm": 12.272588729858398, | |
| "kl": 0.186767578125, | |
| "learning_rate": 7.597137014314928e-07, | |
| "loss": 0.0002, | |
| "reward": 1.875, | |
| "reward_std": 0.2925042062997818, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.875, | |
| "step": 235 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 252.40625, | |
| "epoch": 0.24112388250319286, | |
| "grad_norm": 22.70075225830078, | |
| "kl": 0.19873046875, | |
| "learning_rate": 7.586912065439672e-07, | |
| "loss": 0.0002, | |
| "reward": 1.875, | |
| "reward_std": 0.2925042062997818, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.875, | |
| "step": 236 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 272.3125, | |
| "epoch": 0.2421455938697318, | |
| "grad_norm": 4.7315216064453125, | |
| "kl": 0.175537109375, | |
| "learning_rate": 7.576687116564416e-07, | |
| "loss": 0.0002, | |
| "reward": 1.90625, | |
| "reward_std": 0.2651650384068489, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.90625, | |
| "step": 237 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 257.75, | |
| "epoch": 0.24316730523627075, | |
| "grad_norm": 5.250993251800537, | |
| "kl": 0.172607421875, | |
| "learning_rate": 7.566462167689162e-07, | |
| "loss": 0.0002, | |
| "reward": 1.84375, | |
| "reward_std": 0.3061639815568924, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.84375, | |
| "step": 238 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 256.3125, | |
| "epoch": 0.2441890166028097, | |
| "grad_norm": 6.388535976409912, | |
| "kl": 0.19189453125, | |
| "learning_rate": 7.556237218813906e-07, | |
| "loss": 0.0002, | |
| "reward": 1.875, | |
| "reward_std": 0.2314550280570984, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.875, | |
| "step": 239 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 253.84375, | |
| "epoch": 0.24521072796934865, | |
| "grad_norm": 9.918744087219238, | |
| "kl": 0.19091796875, | |
| "learning_rate": 7.54601226993865e-07, | |
| "loss": 0.0002, | |
| "reward": 1.84375, | |
| "reward_std": 0.3198433741927147, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.84375, | |
| "step": 240 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 248.1875, | |
| "epoch": 0.24623243933588762, | |
| "grad_norm": 4.066084861755371, | |
| "kl": 0.183349609375, | |
| "learning_rate": 7.535787321063395e-07, | |
| "loss": 0.0002, | |
| "reward": 1.90625, | |
| "reward_std": 0.2651650384068489, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.90625, | |
| "step": 241 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 248.90625, | |
| "epoch": 0.24725415070242657, | |
| "grad_norm": 5.032376766204834, | |
| "kl": 0.183349609375, | |
| "learning_rate": 7.525562372188139e-07, | |
| "loss": 0.0002, | |
| "reward": 1.875, | |
| "reward_std": 0.2925042062997818, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.875, | |
| "step": 242 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 252.59375, | |
| "epoch": 0.2482758620689655, | |
| "grad_norm": 5.675601005554199, | |
| "kl": 0.191650390625, | |
| "learning_rate": 7.515337423312883e-07, | |
| "loss": 0.0002, | |
| "reward": 1.90625, | |
| "reward_std": 0.2651650384068489, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.90625, | |
| "step": 243 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 242.75, | |
| "epoch": 0.24929757343550446, | |
| "grad_norm": 4.9524149894714355, | |
| "kl": 0.189453125, | |
| "learning_rate": 7.505112474437627e-07, | |
| "loss": 0.0002, | |
| "reward": 1.90625, | |
| "reward_std": 0.2041158601641655, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.90625, | |
| "step": 244 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 236.875, | |
| "epoch": 0.2503192848020434, | |
| "grad_norm": 10.226370811462402, | |
| "kl": 0.25830078125, | |
| "learning_rate": 7.494887525562373e-07, | |
| "loss": 0.0003, | |
| "reward": 1.90625, | |
| "reward_std": 0.2651650384068489, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.90625, | |
| "step": 245 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 250.0625, | |
| "epoch": 0.25134099616858235, | |
| "grad_norm": 4.868337631225586, | |
| "kl": 0.217041015625, | |
| "learning_rate": 7.484662576687117e-07, | |
| "loss": 0.0002, | |
| "reward": 1.84375, | |
| "reward_std": 0.3198433741927147, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.84375, | |
| "step": 246 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 232.34375, | |
| "epoch": 0.25236270753512136, | |
| "grad_norm": 4.280240535736084, | |
| "kl": 0.203857421875, | |
| "learning_rate": 7.47443762781186e-07, | |
| "loss": 0.0002, | |
| "reward": 1.90625, | |
| "reward_std": 0.2041158601641655, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.90625, | |
| "step": 247 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 224.65625, | |
| "epoch": 0.2533844189016603, | |
| "grad_norm": 6.966822624206543, | |
| "kl": 0.20458984375, | |
| "learning_rate": 7.464212678936604e-07, | |
| "loss": 0.0002, | |
| "reward": 1.84375, | |
| "reward_std": 0.2773705795407295, | |
| "rewards/format_reward": 0.96875, | |
| "rewards/score_reward": 0.875, | |
| "step": 248 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 239.5, | |
| "epoch": 0.25440613026819925, | |
| "grad_norm": 11.961800575256348, | |
| "kl": 0.206787109375, | |
| "learning_rate": 7.453987730061349e-07, | |
| "loss": 0.0002, | |
| "reward": 1.875, | |
| "reward_std": 0.2925042062997818, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.875, | |
| "step": 249 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 238.75, | |
| "epoch": 0.2554278416347382, | |
| "grad_norm": 3.842238426208496, | |
| "kl": 0.19873046875, | |
| "learning_rate": 7.443762781186093e-07, | |
| "loss": 0.0002, | |
| "reward": 1.9375, | |
| "reward_std": 0.1767766922712326, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.9375, | |
| "step": 250 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 240.96875, | |
| "epoch": 0.25644955300127714, | |
| "grad_norm": 45.54724884033203, | |
| "kl": 0.206787109375, | |
| "learning_rate": 7.433537832310837e-07, | |
| "loss": 0.0002, | |
| "reward": 1.84375, | |
| "reward_std": 0.3808925524353981, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.84375, | |
| "step": 251 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 234.53125, | |
| "epoch": 0.2574712643678161, | |
| "grad_norm": 9.205824851989746, | |
| "kl": 0.224853515625, | |
| "learning_rate": 7.423312883435583e-07, | |
| "loss": 0.0002, | |
| "reward": 1.875, | |
| "reward_std": 0.2925042062997818, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.875, | |
| "step": 252 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 235.03125, | |
| "epoch": 0.25849297573435503, | |
| "grad_norm": 8.615628242492676, | |
| "kl": 0.222412109375, | |
| "learning_rate": 7.413087934560327e-07, | |
| "loss": 0.0002, | |
| "reward": 1.9375, | |
| "reward_std": 0.1767766922712326, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.9375, | |
| "step": 253 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 222.8125, | |
| "epoch": 0.259514687100894, | |
| "grad_norm": 30.646860122680664, | |
| "kl": 0.2119140625, | |
| "learning_rate": 7.402862985685071e-07, | |
| "loss": 0.0002, | |
| "reward": 1.96875, | |
| "reward_std": 0.0883883461356163, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.96875, | |
| "step": 254 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 232.0625, | |
| "epoch": 0.26053639846743293, | |
| "grad_norm": 4.058384418487549, | |
| "kl": 0.26513671875, | |
| "learning_rate": 7.392638036809815e-07, | |
| "loss": 0.0003, | |
| "reward": 1.9375, | |
| "reward_std": 0.1767766922712326, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.9375, | |
| "step": 255 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 238.9375, | |
| "epoch": 0.26155810983397193, | |
| "grad_norm": 2.932230234146118, | |
| "kl": 0.218994140625, | |
| "learning_rate": 7.38241308793456e-07, | |
| "loss": 0.0002, | |
| "reward": 1.96875, | |
| "reward_std": 0.0883883461356163, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.96875, | |
| "step": 256 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 236.03125, | |
| "epoch": 0.2625798212005109, | |
| "grad_norm": 7.106131076812744, | |
| "kl": 0.2197265625, | |
| "learning_rate": 7.372188139059304e-07, | |
| "loss": 0.0002, | |
| "reward": 1.8125, | |
| "reward_std": 0.3104073107242584, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.8125, | |
| "step": 257 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 232.0, | |
| "epoch": 0.2636015325670498, | |
| "grad_norm": 24.948230743408203, | |
| "kl": 0.241943359375, | |
| "learning_rate": 7.361963190184049e-07, | |
| "loss": 0.0002, | |
| "reward": 1.9375, | |
| "reward_std": 0.1767766922712326, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.9375, | |
| "step": 258 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 228.75, | |
| "epoch": 0.26462324393358877, | |
| "grad_norm": 5.059873104095459, | |
| "kl": 0.23291015625, | |
| "learning_rate": 7.351738241308794e-07, | |
| "loss": 0.0002, | |
| "reward": 1.96875, | |
| "reward_std": 0.0883883461356163, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.96875, | |
| "step": 259 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 231.21875, | |
| "epoch": 0.2656449553001277, | |
| "grad_norm": 4.365972518920898, | |
| "kl": 0.224853515625, | |
| "learning_rate": 7.341513292433538e-07, | |
| "loss": 0.0002, | |
| "reward": 1.90625, | |
| "reward_std": 0.2041158601641655, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.90625, | |
| "step": 260 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 222.0625, | |
| "epoch": 0.26666666666666666, | |
| "grad_norm": 4.550088405609131, | |
| "kl": 0.234375, | |
| "learning_rate": 7.331288343558282e-07, | |
| "loss": 0.0002, | |
| "reward": 1.9375, | |
| "reward_std": 0.1767766922712326, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.9375, | |
| "step": 261 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 222.875, | |
| "epoch": 0.2676883780332056, | |
| "grad_norm": 17.764812469482422, | |
| "kl": 0.234375, | |
| "learning_rate": 7.321063394683026e-07, | |
| "loss": 0.0002, | |
| "reward": 1.90625, | |
| "reward_std": 0.2041158601641655, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.90625, | |
| "step": 262 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 220.5625, | |
| "epoch": 0.26871008939974456, | |
| "grad_norm": 7.222673416137695, | |
| "kl": 0.2490234375, | |
| "learning_rate": 7.310838445807771e-07, | |
| "loss": 0.0002, | |
| "reward": 1.96875, | |
| "reward_std": 0.0883883461356163, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.96875, | |
| "step": 263 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 220.375, | |
| "epoch": 0.2697318007662835, | |
| "grad_norm": 4.224782943725586, | |
| "kl": 0.249755859375, | |
| "learning_rate": 7.300613496932515e-07, | |
| "loss": 0.0002, | |
| "reward": 1.90625, | |
| "reward_std": 0.2651650384068489, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.90625, | |
| "step": 264 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 209.53125, | |
| "epoch": 0.2707535121328225, | |
| "grad_norm": 2.8941519260406494, | |
| "kl": 0.23388671875, | |
| "learning_rate": 7.29038854805726e-07, | |
| "loss": 0.0002, | |
| "reward": 1.96875, | |
| "reward_std": 0.0883883461356163, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.96875, | |
| "step": 265 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 215.125, | |
| "epoch": 0.27177522349936145, | |
| "grad_norm": 2.982320785522461, | |
| "kl": 0.24169921875, | |
| "learning_rate": 7.280163599182004e-07, | |
| "loss": 0.0002, | |
| "reward": 1.96875, | |
| "reward_std": 0.0883883461356163, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.96875, | |
| "step": 266 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 214.625, | |
| "epoch": 0.2727969348659004, | |
| "grad_norm": 4.667297840118408, | |
| "kl": 0.21923828125, | |
| "learning_rate": 7.269938650306748e-07, | |
| "loss": 0.0002, | |
| "reward": 1.90625, | |
| "reward_std": 0.2651650384068489, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.90625, | |
| "step": 267 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 212.4375, | |
| "epoch": 0.27381864623243934, | |
| "grad_norm": 6.684914588928223, | |
| "kl": 0.245849609375, | |
| "learning_rate": 7.259713701431492e-07, | |
| "loss": 0.0002, | |
| "reward": 1.9375, | |
| "reward_std": 0.1767766922712326, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.9375, | |
| "step": 268 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 210.625, | |
| "epoch": 0.2748403575989783, | |
| "grad_norm": 7.654986381530762, | |
| "kl": 0.234619140625, | |
| "learning_rate": 7.249488752556236e-07, | |
| "loss": 0.0002, | |
| "reward": 1.9375, | |
| "reward_std": 0.1157275140285492, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.9375, | |
| "step": 269 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 216.34375, | |
| "epoch": 0.27586206896551724, | |
| "grad_norm": 8.086624145507812, | |
| "kl": 0.2763671875, | |
| "learning_rate": 7.239263803680981e-07, | |
| "loss": 0.0003, | |
| "reward": 1.96875, | |
| "reward_std": 0.0883883461356163, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.96875, | |
| "step": 270 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 203.15625, | |
| "epoch": 0.2768837803320562, | |
| "grad_norm": 2.0906248092651367, | |
| "kl": 0.25341796875, | |
| "learning_rate": 7.229038854805726e-07, | |
| "loss": 0.0003, | |
| "reward": 1.96875, | |
| "reward_std": 0.0883883461356163, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.96875, | |
| "step": 271 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 216.15625, | |
| "epoch": 0.27790549169859513, | |
| "grad_norm": 5.824390888214111, | |
| "kl": 0.232666015625, | |
| "learning_rate": 7.21881390593047e-07, | |
| "loss": 0.0002, | |
| "reward": 1.90625, | |
| "reward_std": 0.2651650384068489, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.90625, | |
| "step": 272 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 197.71875, | |
| "epoch": 0.2789272030651341, | |
| "grad_norm": 5.233481407165527, | |
| "kl": 0.38330078125, | |
| "learning_rate": 7.208588957055214e-07, | |
| "loss": 0.0004, | |
| "reward": 1.875, | |
| "reward_std": 0.2925042062997818, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.875, | |
| "step": 273 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 207.0, | |
| "epoch": 0.2799489144316731, | |
| "grad_norm": 5.433725833892822, | |
| "kl": 0.271240234375, | |
| "learning_rate": 7.198364008179959e-07, | |
| "loss": 0.0003, | |
| "reward": 1.9375, | |
| "reward_std": 0.1767766922712326, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.9375, | |
| "step": 274 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 203.15625, | |
| "epoch": 0.280970625798212, | |
| "grad_norm": 6.671786785125732, | |
| "kl": 0.2568359375, | |
| "learning_rate": 7.188139059304703e-07, | |
| "loss": 0.0003, | |
| "reward": 1.90625, | |
| "reward_std": 0.2041158601641655, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.90625, | |
| "step": 275 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 208.5, | |
| "epoch": 0.281992337164751, | |
| "grad_norm": 7.154035568237305, | |
| "kl": 0.271484375, | |
| "learning_rate": 7.177914110429447e-07, | |
| "loss": 0.0003, | |
| "reward": 1.90625, | |
| "reward_std": 0.2041158601641655, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.90625, | |
| "step": 276 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 205.71875, | |
| "epoch": 0.2830140485312899, | |
| "grad_norm": 2.843066453933716, | |
| "kl": 0.2822265625, | |
| "learning_rate": 7.167689161554193e-07, | |
| "loss": 0.0003, | |
| "reward": 1.90625, | |
| "reward_std": 0.2041158601641655, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.90625, | |
| "step": 277 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 199.8125, | |
| "epoch": 0.28403575989782887, | |
| "grad_norm": 9.365588188171387, | |
| "kl": 0.269287109375, | |
| "learning_rate": 7.157464212678937e-07, | |
| "loss": 0.0003, | |
| "reward": 1.78125, | |
| "reward_std": 0.4218914955854416, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.78125, | |
| "step": 278 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 197.78125, | |
| "epoch": 0.2850574712643678, | |
| "grad_norm": 7.305079460144043, | |
| "kl": 0.252197265625, | |
| "learning_rate": 7.147239263803681e-07, | |
| "loss": 0.0003, | |
| "reward": 1.96875, | |
| "reward_std": 0.0883883461356163, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.96875, | |
| "step": 279 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 188.84375, | |
| "epoch": 0.28607918263090676, | |
| "grad_norm": 2.625617504119873, | |
| "kl": 0.2861328125, | |
| "learning_rate": 7.137014314928425e-07, | |
| "loss": 0.0003, | |
| "reward": 1.9375, | |
| "reward_std": 0.1157275140285492, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.9375, | |
| "step": 280 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 197.21875, | |
| "epoch": 0.2871008939974457, | |
| "grad_norm": 9.687397003173828, | |
| "kl": 0.26123046875, | |
| "learning_rate": 7.12678936605317e-07, | |
| "loss": 0.0003, | |
| "reward": 1.90625, | |
| "reward_std": 0.2651650384068489, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.90625, | |
| "step": 281 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 200.0625, | |
| "epoch": 0.28812260536398465, | |
| "grad_norm": 5.423101902008057, | |
| "kl": 0.282958984375, | |
| "learning_rate": 7.116564417177914e-07, | |
| "loss": 0.0003, | |
| "reward": 1.875, | |
| "reward_std": 0.2314550280570984, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.875, | |
| "step": 282 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 186.9375, | |
| "epoch": 0.28914431673052365, | |
| "grad_norm": 2.387115001678467, | |
| "kl": 0.272705078125, | |
| "learning_rate": 7.106339468302657e-07, | |
| "loss": 0.0003, | |
| "reward": 1.96875, | |
| "reward_std": 0.0883883461356163, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.96875, | |
| "step": 283 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 203.53125, | |
| "epoch": 0.2901660280970626, | |
| "grad_norm": 5.688641548156738, | |
| "kl": 0.25634765625, | |
| "learning_rate": 7.096114519427403e-07, | |
| "loss": 0.0003, | |
| "reward": 1.90625, | |
| "reward_std": 0.2651650384068489, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.90625, | |
| "step": 284 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 177.71875, | |
| "epoch": 0.29118773946360155, | |
| "grad_norm": 4.236676216125488, | |
| "kl": 0.28759765625, | |
| "learning_rate": 7.085889570552147e-07, | |
| "loss": 0.0003, | |
| "reward": 1.9375, | |
| "reward_std": 0.1157275140285492, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.9375, | |
| "step": 285 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 186.78125, | |
| "epoch": 0.2922094508301405, | |
| "grad_norm": 0.015448813326656818, | |
| "kl": 0.28515625, | |
| "learning_rate": 7.075664621676891e-07, | |
| "loss": 0.0003, | |
| "reward": 2.0, | |
| "reward_std": 0.0, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 1.0, | |
| "step": 286 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 172.4375, | |
| "epoch": 0.29323116219667944, | |
| "grad_norm": 0.020298132672905922, | |
| "kl": 0.29150390625, | |
| "learning_rate": 7.065439672801635e-07, | |
| "loss": 0.0003, | |
| "reward": 2.0, | |
| "reward_std": 0.0, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 1.0, | |
| "step": 287 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 171.65625, | |
| "epoch": 0.2942528735632184, | |
| "grad_norm": 5.144425392150879, | |
| "kl": 0.30419921875, | |
| "learning_rate": 7.05521472392638e-07, | |
| "loss": 0.0003, | |
| "reward": 1.90625, | |
| "reward_std": 0.2041158601641655, | |
| "rewards/format_reward": 0.96875, | |
| "rewards/score_reward": 0.9375, | |
| "step": 288 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 165.59375, | |
| "epoch": 0.29527458492975733, | |
| "grad_norm": 7.851678848266602, | |
| "kl": 0.28173828125, | |
| "learning_rate": 7.044989775051124e-07, | |
| "loss": 0.0003, | |
| "reward": 1.9375, | |
| "reward_std": 0.1767766922712326, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.9375, | |
| "step": 289 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 161.71875, | |
| "epoch": 0.2962962962962963, | |
| "grad_norm": 0.027636835351586342, | |
| "kl": 0.35400390625, | |
| "learning_rate": 7.034764826175868e-07, | |
| "loss": 0.0004, | |
| "reward": 2.0, | |
| "reward_std": 0.0, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 1.0, | |
| "step": 290 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 157.1875, | |
| "epoch": 0.2973180076628352, | |
| "grad_norm": 0.02421570010483265, | |
| "kl": 0.3076171875, | |
| "learning_rate": 7.024539877300614e-07, | |
| "loss": 0.0003, | |
| "reward": 2.0, | |
| "reward_std": 0.0, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 1.0, | |
| "step": 291 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 165.96875, | |
| "epoch": 0.29833971902937423, | |
| "grad_norm": 8.629912376403809, | |
| "kl": 0.2998046875, | |
| "learning_rate": 7.014314928425358e-07, | |
| "loss": 0.0003, | |
| "reward": 1.875, | |
| "reward_std": 0.3535533845424652, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.875, | |
| "step": 292 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 160.6875, | |
| "epoch": 0.2993614303959132, | |
| "grad_norm": 13.829442024230957, | |
| "kl": 0.30517578125, | |
| "learning_rate": 7.004089979550102e-07, | |
| "loss": 0.0003, | |
| "reward": 1.9375, | |
| "reward_std": 0.1767766922712326, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.9375, | |
| "step": 293 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 167.84375, | |
| "epoch": 0.3003831417624521, | |
| "grad_norm": 3.7873756885528564, | |
| "kl": 0.3505859375, | |
| "learning_rate": 6.993865030674846e-07, | |
| "loss": 0.0004, | |
| "reward": 1.96875, | |
| "reward_std": 0.0883883461356163, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.96875, | |
| "step": 294 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 153.5, | |
| "epoch": 0.30140485312899107, | |
| "grad_norm": 4.651973247528076, | |
| "kl": 0.330078125, | |
| "learning_rate": 6.983640081799591e-07, | |
| "loss": 0.0003, | |
| "reward": 1.96875, | |
| "reward_std": 0.0883883461356163, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.96875, | |
| "step": 295 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 161.9375, | |
| "epoch": 0.30242656449553, | |
| "grad_norm": 0.09622839093208313, | |
| "kl": 0.34033203125, | |
| "learning_rate": 6.973415132924335e-07, | |
| "loss": 0.0003, | |
| "reward": 2.0, | |
| "reward_std": 0.0, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 1.0, | |
| "step": 296 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 166.03125, | |
| "epoch": 0.30344827586206896, | |
| "grad_norm": 6.2677812576293945, | |
| "kl": 0.3583984375, | |
| "learning_rate": 6.96319018404908e-07, | |
| "loss": 0.0004, | |
| "reward": 1.9375, | |
| "reward_std": 0.1157275140285492, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.9375, | |
| "step": 297 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 176.53125, | |
| "epoch": 0.3044699872286079, | |
| "grad_norm": 6.2895355224609375, | |
| "kl": 0.32275390625, | |
| "learning_rate": 6.952965235173824e-07, | |
| "loss": 0.0003, | |
| "reward": 1.96875, | |
| "reward_std": 0.0883883461356163, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.96875, | |
| "step": 298 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 170.9375, | |
| "epoch": 0.30549169859514685, | |
| "grad_norm": 31.085887908935547, | |
| "kl": 0.30419921875, | |
| "learning_rate": 6.942740286298569e-07, | |
| "loss": 0.0003, | |
| "reward": 1.84375, | |
| "reward_std": 0.3808925524353981, | |
| "rewards/format_reward": 0.96875, | |
| "rewards/score_reward": 0.875, | |
| "step": 299 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 170.59375, | |
| "epoch": 0.3065134099616858, | |
| "grad_norm": 5.726772785186768, | |
| "kl": 0.31640625, | |
| "learning_rate": 6.932515337423313e-07, | |
| "loss": 0.0003, | |
| "reward": 1.9375, | |
| "reward_std": 0.1767766922712326, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.9375, | |
| "step": 300 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 186.84375, | |
| "epoch": 0.3075351213282248, | |
| "grad_norm": 2.8802192211151123, | |
| "kl": 0.29638671875, | |
| "learning_rate": 6.922290388548056e-07, | |
| "loss": 0.0003, | |
| "reward": 1.96875, | |
| "reward_std": 0.0883883461356163, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.96875, | |
| "step": 301 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 182.0, | |
| "epoch": 0.30855683269476375, | |
| "grad_norm": 5.975210189819336, | |
| "kl": 0.34716796875, | |
| "learning_rate": 6.912065439672801e-07, | |
| "loss": 0.0003, | |
| "reward": 1.90625, | |
| "reward_std": 0.2651650384068489, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.90625, | |
| "step": 302 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 158.09375, | |
| "epoch": 0.3095785440613027, | |
| "grad_norm": 29.659189224243164, | |
| "kl": 0.330078125, | |
| "learning_rate": 6.901840490797545e-07, | |
| "loss": 0.0003, | |
| "reward": 1.90625, | |
| "reward_std": 0.2041158601641655, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.90625, | |
| "step": 303 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 166.375, | |
| "epoch": 0.31060025542784164, | |
| "grad_norm": 0.06261244416236877, | |
| "kl": 0.34716796875, | |
| "learning_rate": 6.89161554192229e-07, | |
| "loss": 0.0003, | |
| "reward": 2.0, | |
| "reward_std": 0.0, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 1.0, | |
| "step": 304 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 160.40625, | |
| "epoch": 0.3116219667943806, | |
| "grad_norm": 43.00814437866211, | |
| "kl": 0.310546875, | |
| "learning_rate": 6.881390593047034e-07, | |
| "loss": 0.0003, | |
| "reward": 1.96875, | |
| "reward_std": 0.0883883461356163, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.96875, | |
| "step": 305 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 170.625, | |
| "epoch": 0.31264367816091954, | |
| "grad_norm": 6.167644500732422, | |
| "kl": 0.333984375, | |
| "learning_rate": 6.871165644171779e-07, | |
| "loss": 0.0003, | |
| "reward": 1.9375, | |
| "reward_std": 0.1767766922712326, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.9375, | |
| "step": 306 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 148.25, | |
| "epoch": 0.3136653895274585, | |
| "grad_norm": 10.081515312194824, | |
| "kl": 0.35205078125, | |
| "learning_rate": 6.860940695296523e-07, | |
| "loss": 0.0004, | |
| "reward": 1.875, | |
| "reward_std": 0.2925042062997818, | |
| "rewards/format_reward": 0.96875, | |
| "rewards/score_reward": 0.90625, | |
| "step": 307 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 145.6875, | |
| "epoch": 0.31468710089399743, | |
| "grad_norm": 12.819108963012695, | |
| "kl": 0.375, | |
| "learning_rate": 6.850715746421267e-07, | |
| "loss": 0.0004, | |
| "reward": 1.90625, | |
| "reward_std": 0.2041158601641655, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.90625, | |
| "step": 308 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 162.21875, | |
| "epoch": 0.3157088122605364, | |
| "grad_norm": 4.192606449127197, | |
| "kl": 0.35986328125, | |
| "learning_rate": 6.840490797546012e-07, | |
| "loss": 0.0004, | |
| "reward": 1.96875, | |
| "reward_std": 0.0883883461356163, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.96875, | |
| "step": 309 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 152.125, | |
| "epoch": 0.3167305236270754, | |
| "grad_norm": 5.13338565826416, | |
| "kl": 0.3212890625, | |
| "learning_rate": 6.830265848670757e-07, | |
| "loss": 0.0003, | |
| "reward": 1.96875, | |
| "reward_std": 0.0883883461356163, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.96875, | |
| "step": 310 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 152.6875, | |
| "epoch": 0.3177522349936143, | |
| "grad_norm": 14.295976638793945, | |
| "kl": 0.37451171875, | |
| "learning_rate": 6.820040899795501e-07, | |
| "loss": 0.0004, | |
| "reward": 1.9375, | |
| "reward_std": 0.1767766922712326, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.9375, | |
| "step": 311 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 142.53125, | |
| "epoch": 0.31877394636015327, | |
| "grad_norm": 0.0234597809612751, | |
| "kl": 0.357421875, | |
| "learning_rate": 6.809815950920245e-07, | |
| "loss": 0.0004, | |
| "reward": 2.0, | |
| "reward_std": 0.0, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 1.0, | |
| "step": 312 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 142.09375, | |
| "epoch": 0.3197956577266922, | |
| "grad_norm": 9.601189613342285, | |
| "kl": 0.38818359375, | |
| "learning_rate": 6.79959100204499e-07, | |
| "loss": 0.0004, | |
| "reward": 1.90625, | |
| "reward_std": 0.2651650384068489, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.90625, | |
| "step": 313 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 125.6875, | |
| "epoch": 0.32081736909323116, | |
| "grad_norm": 16.259977340698242, | |
| "kl": 0.3955078125, | |
| "learning_rate": 6.789366053169734e-07, | |
| "loss": 0.0004, | |
| "reward": 1.8125, | |
| "reward_std": 0.3104073107242584, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.8125, | |
| "step": 314 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 126.53125, | |
| "epoch": 0.3218390804597701, | |
| "grad_norm": 22.091760635375977, | |
| "kl": 0.3837890625, | |
| "learning_rate": 6.779141104294478e-07, | |
| "loss": 0.0004, | |
| "reward": 1.9375, | |
| "reward_std": 0.1767766922712326, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.9375, | |
| "step": 315 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 144.78125, | |
| "epoch": 0.32286079182630906, | |
| "grad_norm": 6.2110443115234375, | |
| "kl": 0.365234375, | |
| "learning_rate": 6.768916155419223e-07, | |
| "loss": 0.0004, | |
| "reward": 1.875, | |
| "reward_std": 0.2925042062997818, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.875, | |
| "step": 316 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 113.21875, | |
| "epoch": 0.323882503192848, | |
| "grad_norm": 7.100230693817139, | |
| "kl": 0.388671875, | |
| "learning_rate": 6.758691206543968e-07, | |
| "loss": 0.0004, | |
| "reward": 1.9375, | |
| "reward_std": 0.1767766922712326, | |
| "rewards/format_reward": 0.96875, | |
| "rewards/score_reward": 0.96875, | |
| "step": 317 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 120.53125, | |
| "epoch": 0.32490421455938695, | |
| "grad_norm": 5.682871341705322, | |
| "kl": 0.37109375, | |
| "learning_rate": 6.748466257668711e-07, | |
| "loss": 0.0004, | |
| "reward": 1.96875, | |
| "reward_std": 0.0883883461356163, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.96875, | |
| "step": 318 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 126.0, | |
| "epoch": 0.32592592592592595, | |
| "grad_norm": 7.511983394622803, | |
| "kl": 0.38037109375, | |
| "learning_rate": 6.738241308793455e-07, | |
| "loss": 0.0004, | |
| "reward": 1.90625, | |
| "reward_std": 0.2651650384068489, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.90625, | |
| "step": 319 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 116.28125, | |
| "epoch": 0.3269476372924649, | |
| "grad_norm": 5.289898872375488, | |
| "kl": 0.36083984375, | |
| "learning_rate": 6.7280163599182e-07, | |
| "loss": 0.0004, | |
| "reward": 1.9375, | |
| "reward_std": 0.1767766922712326, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.9375, | |
| "step": 320 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 132.6875, | |
| "epoch": 0.32796934865900385, | |
| "grad_norm": 6.084203243255615, | |
| "kl": 0.35693359375, | |
| "learning_rate": 6.717791411042944e-07, | |
| "loss": 0.0004, | |
| "reward": 1.90625, | |
| "reward_std": 0.1293872892856598, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.90625, | |
| "step": 321 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 124.8125, | |
| "epoch": 0.3289910600255428, | |
| "grad_norm": 9.163487434387207, | |
| "kl": 0.345703125, | |
| "learning_rate": 6.707566462167688e-07, | |
| "loss": 0.0003, | |
| "reward": 1.90625, | |
| "reward_std": 0.2651650384068489, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.90625, | |
| "step": 322 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 109.625, | |
| "epoch": 0.33001277139208174, | |
| "grad_norm": 6.358760833740234, | |
| "kl": 0.400390625, | |
| "learning_rate": 6.697341513292433e-07, | |
| "loss": 0.0004, | |
| "reward": 1.96875, | |
| "reward_std": 0.0883883461356163, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.96875, | |
| "step": 323 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 130.96875, | |
| "epoch": 0.3310344827586207, | |
| "grad_norm": 4.290017127990723, | |
| "kl": 0.39208984375, | |
| "learning_rate": 6.687116564417178e-07, | |
| "loss": 0.0004, | |
| "reward": 1.96875, | |
| "reward_std": 0.0883883461356163, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.96875, | |
| "step": 324 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 118.4375, | |
| "epoch": 0.33205619412515963, | |
| "grad_norm": 9.01698112487793, | |
| "kl": 0.37060546875, | |
| "learning_rate": 6.676891615541922e-07, | |
| "loss": 0.0004, | |
| "reward": 1.90625, | |
| "reward_std": 0.2041158601641655, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.90625, | |
| "step": 325 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 123.875, | |
| "epoch": 0.3330779054916986, | |
| "grad_norm": 6.9146575927734375, | |
| "kl": 0.3818359375, | |
| "learning_rate": 6.666666666666666e-07, | |
| "loss": 0.0004, | |
| "reward": 1.96875, | |
| "reward_std": 0.0883883461356163, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.96875, | |
| "step": 326 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 109.1875, | |
| "epoch": 0.3340996168582375, | |
| "grad_norm": 12.801935195922852, | |
| "kl": 0.38427734375, | |
| "learning_rate": 6.656441717791411e-07, | |
| "loss": 0.0004, | |
| "reward": 1.96875, | |
| "reward_std": 0.0883883461356163, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.96875, | |
| "step": 327 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 114.34375, | |
| "epoch": 0.3351213282247765, | |
| "grad_norm": 0.07020504027605057, | |
| "kl": 0.37841796875, | |
| "learning_rate": 6.646216768916155e-07, | |
| "loss": 0.0004, | |
| "reward": 2.0, | |
| "reward_std": 0.0, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 1.0, | |
| "step": 328 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 114.96875, | |
| "epoch": 0.3361430395913155, | |
| "grad_norm": 0.04202645272016525, | |
| "kl": 0.404296875, | |
| "learning_rate": 6.635991820040899e-07, | |
| "loss": 0.0004, | |
| "reward": 2.0, | |
| "reward_std": 0.0, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 1.0, | |
| "step": 329 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 103.53125, | |
| "epoch": 0.3371647509578544, | |
| "grad_norm": 10.052809715270996, | |
| "kl": 0.42041015625, | |
| "learning_rate": 6.625766871165644e-07, | |
| "loss": 0.0004, | |
| "reward": 1.84375, | |
| "reward_std": 0.30173346400260925, | |
| "rewards/format_reward": 0.96875, | |
| "rewards/score_reward": 0.875, | |
| "step": 330 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 103.53125, | |
| "epoch": 0.33818646232439337, | |
| "grad_norm": 4.102263927459717, | |
| "kl": 0.3994140625, | |
| "learning_rate": 6.615541922290389e-07, | |
| "loss": 0.0004, | |
| "reward": 1.96875, | |
| "reward_std": 0.0883883461356163, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.96875, | |
| "step": 331 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 109.71875, | |
| "epoch": 0.3392081736909323, | |
| "grad_norm": 9.033509254455566, | |
| "kl": 0.41552734375, | |
| "learning_rate": 6.605316973415133e-07, | |
| "loss": 0.0004, | |
| "reward": 1.96875, | |
| "reward_std": 0.0883883461356163, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.96875, | |
| "step": 332 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 111.46875, | |
| "epoch": 0.34022988505747126, | |
| "grad_norm": 10.601028442382812, | |
| "kl": 0.412109375, | |
| "learning_rate": 6.595092024539877e-07, | |
| "loss": 0.0004, | |
| "reward": 1.9375, | |
| "reward_std": 0.1767766922712326, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.9375, | |
| "step": 333 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 106.6875, | |
| "epoch": 0.3412515964240102, | |
| "grad_norm": 5.923055171966553, | |
| "kl": 0.41015625, | |
| "learning_rate": 6.584867075664622e-07, | |
| "loss": 0.0004, | |
| "reward": 1.9375, | |
| "reward_std": 0.1157275140285492, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.9375, | |
| "step": 334 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 99.375, | |
| "epoch": 0.34227330779054915, | |
| "grad_norm": 0.044527485966682434, | |
| "kl": 0.4306640625, | |
| "learning_rate": 6.574642126789366e-07, | |
| "loss": 0.0004, | |
| "reward": 2.0, | |
| "reward_std": 0.0, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 1.0, | |
| "step": 335 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 103.0, | |
| "epoch": 0.3432950191570881, | |
| "grad_norm": 5.072457790374756, | |
| "kl": 0.40869140625, | |
| "learning_rate": 6.56441717791411e-07, | |
| "loss": 0.0004, | |
| "reward": 1.96875, | |
| "reward_std": 0.0883883461356163, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.96875, | |
| "step": 336 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 86.96875, | |
| "epoch": 0.3443167305236271, | |
| "grad_norm": 0.11313924193382263, | |
| "kl": 0.44921875, | |
| "learning_rate": 6.554192229038854e-07, | |
| "loss": 0.0004, | |
| "reward": 2.0, | |
| "reward_std": 0.0, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 1.0, | |
| "step": 337 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 99.125, | |
| "epoch": 0.34533844189016605, | |
| "grad_norm": 3.877697706222534, | |
| "kl": 0.408203125, | |
| "learning_rate": 6.543967280163599e-07, | |
| "loss": 0.0004, | |
| "reward": 1.96875, | |
| "reward_std": 0.0883883461356163, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.96875, | |
| "step": 338 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 87.8125, | |
| "epoch": 0.346360153256705, | |
| "grad_norm": 10.061586380004883, | |
| "kl": 0.41455078125, | |
| "learning_rate": 6.533742331288343e-07, | |
| "loss": 0.0004, | |
| "reward": 1.90625, | |
| "reward_std": 0.2651650384068489, | |
| "rewards/format_reward": 0.9375, | |
| "rewards/score_reward": 0.96875, | |
| "step": 339 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 102.59375, | |
| "epoch": 0.34738186462324394, | |
| "grad_norm": 15.71408462524414, | |
| "kl": 0.41650390625, | |
| "learning_rate": 6.523517382413087e-07, | |
| "loss": 0.0004, | |
| "reward": 1.90625, | |
| "reward_std": 0.2651650384068489, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.90625, | |
| "step": 340 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 83.75, | |
| "epoch": 0.3484035759897829, | |
| "grad_norm": 40.79656219482422, | |
| "kl": 0.4287109375, | |
| "learning_rate": 6.513292433537832e-07, | |
| "loss": 0.0004, | |
| "reward": 1.96875, | |
| "reward_std": 0.0883883461356163, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.96875, | |
| "step": 341 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 95.53125, | |
| "epoch": 0.34942528735632183, | |
| "grad_norm": 5.584923267364502, | |
| "kl": 0.3837890625, | |
| "learning_rate": 6.503067484662576e-07, | |
| "loss": 0.0004, | |
| "reward": 1.96875, | |
| "reward_std": 0.0883883461356163, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.96875, | |
| "step": 342 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 99.46875, | |
| "epoch": 0.3504469987228608, | |
| "grad_norm": 4.431502819061279, | |
| "kl": 0.49658203125, | |
| "learning_rate": 6.492842535787321e-07, | |
| "loss": 0.0005, | |
| "reward": 1.96875, | |
| "reward_std": 0.0883883461356163, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.96875, | |
| "step": 343 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 86.8125, | |
| "epoch": 0.3514687100893997, | |
| "grad_norm": 12.653791427612305, | |
| "kl": 0.40966796875, | |
| "learning_rate": 6.482617586912065e-07, | |
| "loss": 0.0004, | |
| "reward": 1.875, | |
| "reward_std": 0.2314550280570984, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.875, | |
| "step": 344 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 90.34375, | |
| "epoch": 0.3524904214559387, | |
| "grad_norm": 17.112207412719727, | |
| "kl": 0.44384765625, | |
| "learning_rate": 6.47239263803681e-07, | |
| "loss": 0.0004, | |
| "reward": 1.96875, | |
| "reward_std": 0.0883883461356163, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.96875, | |
| "step": 345 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 97.8125, | |
| "epoch": 0.3535121328224777, | |
| "grad_norm": 8.341893196105957, | |
| "kl": 0.4013671875, | |
| "learning_rate": 6.462167689161554e-07, | |
| "loss": 0.0004, | |
| "reward": 1.9375, | |
| "reward_std": 0.1157275140285492, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.9375, | |
| "step": 346 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 93.40625, | |
| "epoch": 0.3545338441890166, | |
| "grad_norm": 3.5475313663482666, | |
| "kl": 0.4365234375, | |
| "learning_rate": 6.451942740286298e-07, | |
| "loss": 0.0004, | |
| "reward": 1.96875, | |
| "reward_std": 0.0883883461356163, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.96875, | |
| "step": 347 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 86.28125, | |
| "epoch": 0.35555555555555557, | |
| "grad_norm": 5.190430164337158, | |
| "kl": 0.40771484375, | |
| "learning_rate": 6.441717791411042e-07, | |
| "loss": 0.0004, | |
| "reward": 1.96875, | |
| "reward_std": 0.0883883461356163, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.96875, | |
| "step": 348 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 90.21875, | |
| "epoch": 0.3565772669220945, | |
| "grad_norm": 3.8718748092651367, | |
| "kl": 0.40771484375, | |
| "learning_rate": 6.431492842535788e-07, | |
| "loss": 0.0004, | |
| "reward": 1.96875, | |
| "reward_std": 0.0883883461356163, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.96875, | |
| "step": 349 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 77.9375, | |
| "epoch": 0.35759897828863346, | |
| "grad_norm": 29.05414581298828, | |
| "kl": 0.427734375, | |
| "learning_rate": 6.421267893660532e-07, | |
| "loss": 0.0004, | |
| "reward": 1.84375, | |
| "reward_std": 0.3808925524353981, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.84375, | |
| "step": 350 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 81.03125, | |
| "epoch": 0.3586206896551724, | |
| "grad_norm": 0.8640346527099609, | |
| "kl": 1.10595703125, | |
| "learning_rate": 6.411042944785276e-07, | |
| "loss": 0.0011, | |
| "reward": 2.0, | |
| "reward_std": 0.0, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 1.0, | |
| "step": 351 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 82.90625, | |
| "epoch": 0.35964240102171136, | |
| "grad_norm": 0.03076520748436451, | |
| "kl": 0.42724609375, | |
| "learning_rate": 6.400817995910021e-07, | |
| "loss": 0.0004, | |
| "reward": 2.0, | |
| "reward_std": 0.0, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 1.0, | |
| "step": 352 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 73.5, | |
| "epoch": 0.3606641123882503, | |
| "grad_norm": 34.51716995239258, | |
| "kl": 0.4150390625, | |
| "learning_rate": 6.390593047034764e-07, | |
| "loss": 0.0004, | |
| "reward": 1.9375, | |
| "reward_std": 0.1767766922712326, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.9375, | |
| "step": 353 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 73.09375, | |
| "epoch": 0.36168582375478925, | |
| "grad_norm": 8.387367248535156, | |
| "kl": 0.45556640625, | |
| "learning_rate": 6.380368098159508e-07, | |
| "loss": 0.0005, | |
| "reward": 1.90625, | |
| "reward_std": 0.2041158601641655, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.90625, | |
| "step": 354 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 74.8125, | |
| "epoch": 0.36270753512132825, | |
| "grad_norm": 7.5955986976623535, | |
| "kl": 0.4521484375, | |
| "learning_rate": 6.370143149284252e-07, | |
| "loss": 0.0005, | |
| "reward": 1.96875, | |
| "reward_std": 0.0883883461356163, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.96875, | |
| "step": 355 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 67.96875, | |
| "epoch": 0.3637292464878672, | |
| "grad_norm": 60.00629425048828, | |
| "kl": 0.42138671875, | |
| "learning_rate": 6.359918200408998e-07, | |
| "loss": 0.0004, | |
| "reward": 1.9375, | |
| "reward_std": 0.1767766922712326, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.9375, | |
| "step": 356 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 83.21875, | |
| "epoch": 0.36475095785440614, | |
| "grad_norm": 4.175691604614258, | |
| "kl": 0.37841796875, | |
| "learning_rate": 6.349693251533742e-07, | |
| "loss": 0.0004, | |
| "reward": 1.96875, | |
| "reward_std": 0.0883883461356163, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.96875, | |
| "step": 357 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 77.21875, | |
| "epoch": 0.3657726692209451, | |
| "grad_norm": 0.028226764872670174, | |
| "kl": 0.39404296875, | |
| "learning_rate": 6.339468302658486e-07, | |
| "loss": 0.0004, | |
| "reward": 2.0, | |
| "reward_std": 0.0, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 1.0, | |
| "step": 358 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 77.34375, | |
| "epoch": 0.36679438058748404, | |
| "grad_norm": 21.541412353515625, | |
| "kl": 0.42138671875, | |
| "learning_rate": 6.329243353783231e-07, | |
| "loss": 0.0004, | |
| "reward": 1.9375, | |
| "reward_std": 0.1767766922712326, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.9375, | |
| "step": 359 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 82.0625, | |
| "epoch": 0.367816091954023, | |
| "grad_norm": 0.05148075520992279, | |
| "kl": 0.4072265625, | |
| "learning_rate": 6.319018404907975e-07, | |
| "loss": 0.0004, | |
| "reward": 2.0, | |
| "reward_std": 0.0, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 1.0, | |
| "step": 360 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 83.4375, | |
| "epoch": 0.36883780332056193, | |
| "grad_norm": 25.259191513061523, | |
| "kl": 0.41064453125, | |
| "learning_rate": 6.308793456032719e-07, | |
| "loss": 0.0004, | |
| "reward": 1.96875, | |
| "reward_std": 0.0883883461356163, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.96875, | |
| "step": 361 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 93.78125, | |
| "epoch": 0.3698595146871009, | |
| "grad_norm": 6.4033589363098145, | |
| "kl": 0.38427734375, | |
| "learning_rate": 6.298568507157464e-07, | |
| "loss": 0.0004, | |
| "reward": 1.96875, | |
| "reward_std": 0.0883883461356163, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.96875, | |
| "step": 362 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 89.8125, | |
| "epoch": 0.3708812260536398, | |
| "grad_norm": 0.0617324635386467, | |
| "kl": 0.39453125, | |
| "learning_rate": 6.288343558282209e-07, | |
| "loss": 0.0004, | |
| "reward": 2.0, | |
| "reward_std": 0.0, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 1.0, | |
| "step": 363 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 80.90625, | |
| "epoch": 0.3719029374201788, | |
| "grad_norm": 7.903273582458496, | |
| "kl": 0.4169921875, | |
| "learning_rate": 6.278118609406953e-07, | |
| "loss": 0.0004, | |
| "reward": 1.96875, | |
| "reward_std": 0.0883883461356163, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.96875, | |
| "step": 364 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 92.9375, | |
| "epoch": 0.37292464878671777, | |
| "grad_norm": 13.616974830627441, | |
| "kl": 0.40185546875, | |
| "learning_rate": 6.267893660531697e-07, | |
| "loss": 0.0004, | |
| "reward": 1.90625, | |
| "reward_std": 0.2651650384068489, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.90625, | |
| "step": 365 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 92.125, | |
| "epoch": 0.3739463601532567, | |
| "grad_norm": 0.10754834115505219, | |
| "kl": 0.36962890625, | |
| "learning_rate": 6.257668711656442e-07, | |
| "loss": 0.0004, | |
| "reward": 2.0, | |
| "reward_std": 0.0, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 1.0, | |
| "step": 366 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 96.46875, | |
| "epoch": 0.37496807151979566, | |
| "grad_norm": 3.4671356678009033, | |
| "kl": 0.3798828125, | |
| "learning_rate": 6.247443762781186e-07, | |
| "loss": 0.0004, | |
| "reward": 1.96875, | |
| "reward_std": 0.0883883461356163, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.96875, | |
| "step": 367 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 99.78125, | |
| "epoch": 0.3759897828863346, | |
| "grad_norm": 0.10950777679681778, | |
| "kl": 0.388671875, | |
| "learning_rate": 6.23721881390593e-07, | |
| "loss": 0.0004, | |
| "reward": 2.0, | |
| "reward_std": 0.0, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 1.0, | |
| "step": 368 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 91.25, | |
| "epoch": 0.37701149425287356, | |
| "grad_norm": 8.460134506225586, | |
| "kl": 0.41796875, | |
| "learning_rate": 6.226993865030675e-07, | |
| "loss": 0.0004, | |
| "reward": 1.9375, | |
| "reward_std": 0.1767766922712326, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.9375, | |
| "step": 369 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 92.6875, | |
| "epoch": 0.3780332056194125, | |
| "grad_norm": 0.07789568603038788, | |
| "kl": 0.38818359375, | |
| "learning_rate": 6.21676891615542e-07, | |
| "loss": 0.0004, | |
| "reward": 2.0, | |
| "reward_std": 0.0, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 1.0, | |
| "step": 370 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 96.375, | |
| "epoch": 0.37905491698595145, | |
| "grad_norm": 56.66305923461914, | |
| "kl": 0.3955078125, | |
| "learning_rate": 6.206543967280163e-07, | |
| "loss": 0.0004, | |
| "reward": 1.90625, | |
| "reward_std": 0.2651650384068489, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.90625, | |
| "step": 371 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 95.375, | |
| "epoch": 0.3800766283524904, | |
| "grad_norm": 14.545409202575684, | |
| "kl": 0.408203125, | |
| "learning_rate": 6.196319018404907e-07, | |
| "loss": 0.0004, | |
| "reward": 1.96875, | |
| "reward_std": 0.0883883461356163, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.96875, | |
| "step": 372 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 96.53125, | |
| "epoch": 0.3810983397190294, | |
| "grad_norm": 0.019573474302887917, | |
| "kl": 0.37646484375, | |
| "learning_rate": 6.186094069529652e-07, | |
| "loss": 0.0004, | |
| "reward": 2.0, | |
| "reward_std": 0.0, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 1.0, | |
| "step": 373 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 94.25, | |
| "epoch": 0.38212005108556835, | |
| "grad_norm": 27.49530029296875, | |
| "kl": 0.39990234375, | |
| "learning_rate": 6.175869120654396e-07, | |
| "loss": 0.0004, | |
| "reward": 1.9375, | |
| "reward_std": 0.1767766922712326, | |
| "rewards/format_reward": 0.96875, | |
| "rewards/score_reward": 0.96875, | |
| "step": 374 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 105.71875, | |
| "epoch": 0.3831417624521073, | |
| "grad_norm": 33.705142974853516, | |
| "kl": 0.3798828125, | |
| "learning_rate": 6.165644171779141e-07, | |
| "loss": 0.0004, | |
| "reward": 1.9375, | |
| "reward_std": 0.1767766922712326, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.9375, | |
| "step": 375 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 106.84375, | |
| "epoch": 0.38416347381864624, | |
| "grad_norm": 7.8572587966918945, | |
| "kl": 0.38623046875, | |
| "learning_rate": 6.155419222903885e-07, | |
| "loss": 0.0004, | |
| "reward": 1.96875, | |
| "reward_std": 0.0883883461356163, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.96875, | |
| "step": 376 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 102.65625, | |
| "epoch": 0.3851851851851852, | |
| "grad_norm": 7.864315032958984, | |
| "kl": 0.35498046875, | |
| "learning_rate": 6.14519427402863e-07, | |
| "loss": 0.0004, | |
| "reward": 1.96875, | |
| "reward_std": 0.0883883461356163, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.96875, | |
| "step": 377 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 107.96875, | |
| "epoch": 0.38620689655172413, | |
| "grad_norm": 30.97239112854004, | |
| "kl": 0.38525390625, | |
| "learning_rate": 6.134969325153374e-07, | |
| "loss": 0.0004, | |
| "reward": 1.96875, | |
| "reward_std": 0.0883883461356163, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.96875, | |
| "step": 378 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 114.28125, | |
| "epoch": 0.3872286079182631, | |
| "grad_norm": 0.04002818092703819, | |
| "kl": 0.365234375, | |
| "learning_rate": 6.124744376278118e-07, | |
| "loss": 0.0004, | |
| "reward": 2.0, | |
| "reward_std": 0.0, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 1.0, | |
| "step": 379 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 126.09375, | |
| "epoch": 0.388250319284802, | |
| "grad_norm": 8.548779487609863, | |
| "kl": 0.40673828125, | |
| "learning_rate": 6.114519427402862e-07, | |
| "loss": 0.0004, | |
| "reward": 1.9375, | |
| "reward_std": 0.1767766922712326, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.9375, | |
| "step": 380 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 129.15625, | |
| "epoch": 0.389272030651341, | |
| "grad_norm": 9.612067222595215, | |
| "kl": 0.34423828125, | |
| "learning_rate": 6.104294478527607e-07, | |
| "loss": 0.0003, | |
| "reward": 1.90625, | |
| "reward_std": 0.2041158601641655, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.90625, | |
| "step": 381 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 127.78125, | |
| "epoch": 0.39029374201788, | |
| "grad_norm": 0.043425094336271286, | |
| "kl": 0.3974609375, | |
| "learning_rate": 6.094069529652352e-07, | |
| "loss": 0.0004, | |
| "reward": 2.0, | |
| "reward_std": 0.0, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 1.0, | |
| "step": 382 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 133.09375, | |
| "epoch": 0.3913154533844189, | |
| "grad_norm": 4.689680576324463, | |
| "kl": 0.37451171875, | |
| "learning_rate": 6.083844580777096e-07, | |
| "loss": 0.0004, | |
| "reward": 1.96875, | |
| "reward_std": 0.0883883461356163, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.96875, | |
| "step": 383 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 129.125, | |
| "epoch": 0.39233716475095787, | |
| "grad_norm": 3.6167423725128174, | |
| "kl": 0.35986328125, | |
| "learning_rate": 6.073619631901841e-07, | |
| "loss": 0.0004, | |
| "reward": 1.96875, | |
| "reward_std": 0.0883883461356163, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.96875, | |
| "step": 384 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 101.625, | |
| "epoch": 0.3933588761174968, | |
| "grad_norm": 0.10112284868955612, | |
| "kl": 0.40869140625, | |
| "learning_rate": 6.063394683026585e-07, | |
| "loss": 0.0004, | |
| "reward": 2.0, | |
| "reward_std": 0.0, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 1.0, | |
| "step": 385 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 118.03125, | |
| "epoch": 0.39438058748403576, | |
| "grad_norm": 13.891422271728516, | |
| "kl": 0.40771484375, | |
| "learning_rate": 6.053169734151329e-07, | |
| "loss": 0.0004, | |
| "reward": 1.90625, | |
| "reward_std": 0.2651650384068489, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.90625, | |
| "step": 386 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 131.96875, | |
| "epoch": 0.3954022988505747, | |
| "grad_norm": 9.368993759155273, | |
| "kl": 0.390625, | |
| "learning_rate": 6.042944785276073e-07, | |
| "loss": 0.0004, | |
| "reward": 1.96875, | |
| "reward_std": 0.0883883461356163, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.96875, | |
| "step": 387 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 123.34375, | |
| "epoch": 0.39642401021711365, | |
| "grad_norm": 5.774983882904053, | |
| "kl": 0.3720703125, | |
| "learning_rate": 6.032719836400819e-07, | |
| "loss": 0.0004, | |
| "reward": 1.96875, | |
| "reward_std": 0.0883883461356163, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.96875, | |
| "step": 388 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 117.4375, | |
| "epoch": 0.3974457215836526, | |
| "grad_norm": 6.034429550170898, | |
| "kl": 0.3828125, | |
| "learning_rate": 6.022494887525562e-07, | |
| "loss": 0.0004, | |
| "reward": 1.96875, | |
| "reward_std": 0.0883883461356163, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.96875, | |
| "step": 389 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 115.625, | |
| "epoch": 0.39846743295019155, | |
| "grad_norm": 5.9506611824035645, | |
| "kl": 0.412109375, | |
| "learning_rate": 6.012269938650306e-07, | |
| "loss": 0.0004, | |
| "reward": 1.9375, | |
| "reward_std": 0.1157275140285492, | |
| "rewards/format_reward": 0.96875, | |
| "rewards/score_reward": 0.96875, | |
| "step": 390 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 124.75, | |
| "epoch": 0.39948914431673055, | |
| "grad_norm": 5.446381568908691, | |
| "kl": 0.38916015625, | |
| "learning_rate": 6.002044989775051e-07, | |
| "loss": 0.0004, | |
| "reward": 1.96875, | |
| "reward_std": 0.0883883461356163, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.96875, | |
| "step": 391 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 113.90625, | |
| "epoch": 0.4005108556832695, | |
| "grad_norm": 6.560061454772949, | |
| "kl": 0.408203125, | |
| "learning_rate": 5.991820040899795e-07, | |
| "loss": 0.0004, | |
| "reward": 1.96875, | |
| "reward_std": 0.0883883461356163, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.96875, | |
| "step": 392 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 118.65625, | |
| "epoch": 0.40153256704980844, | |
| "grad_norm": 9.356738090515137, | |
| "kl": 0.39697265625, | |
| "learning_rate": 5.981595092024539e-07, | |
| "loss": 0.0004, | |
| "reward": 1.90625, | |
| "reward_std": 0.2651650384068489, | |
| "rewards/format_reward": 0.96875, | |
| "rewards/score_reward": 0.9375, | |
| "step": 393 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 112.125, | |
| "epoch": 0.4025542784163474, | |
| "grad_norm": 8.674097061157227, | |
| "kl": 0.396484375, | |
| "learning_rate": 5.971370143149283e-07, | |
| "loss": 0.0004, | |
| "reward": 1.9375, | |
| "reward_std": 0.1767766922712326, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.9375, | |
| "step": 394 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 113.1875, | |
| "epoch": 0.40357598978288634, | |
| "grad_norm": 0.051747072488069534, | |
| "kl": 0.384765625, | |
| "learning_rate": 5.961145194274029e-07, | |
| "loss": 0.0004, | |
| "reward": 2.0, | |
| "reward_std": 0.0, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 1.0, | |
| "step": 395 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 109.15625, | |
| "epoch": 0.4045977011494253, | |
| "grad_norm": 3.3397045135498047, | |
| "kl": 0.4306640625, | |
| "learning_rate": 5.950920245398773e-07, | |
| "loss": 0.0004, | |
| "reward": 1.96875, | |
| "reward_std": 0.0883883461356163, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.96875, | |
| "step": 396 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 110.21875, | |
| "epoch": 0.40561941251596423, | |
| "grad_norm": 10.139057159423828, | |
| "kl": 0.47021484375, | |
| "learning_rate": 5.940695296523517e-07, | |
| "loss": 0.0005, | |
| "reward": 1.96875, | |
| "reward_std": 0.0883883461356163, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.96875, | |
| "step": 397 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 99.4375, | |
| "epoch": 0.4066411238825032, | |
| "grad_norm": 3.0981621742248535, | |
| "kl": 0.42529296875, | |
| "learning_rate": 5.930470347648262e-07, | |
| "loss": 0.0004, | |
| "reward": 1.96875, | |
| "reward_std": 0.0883883461356163, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.96875, | |
| "step": 398 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 98.125, | |
| "epoch": 0.4076628352490421, | |
| "grad_norm": 4.855159759521484, | |
| "kl": 0.45166015625, | |
| "learning_rate": 5.920245398773006e-07, | |
| "loss": 0.0005, | |
| "reward": 1.96875, | |
| "reward_std": 0.0883883461356163, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.96875, | |
| "step": 399 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 94.1875, | |
| "epoch": 0.4086845466155811, | |
| "grad_norm": 7.649573802947998, | |
| "kl": 0.44580078125, | |
| "learning_rate": 5.91002044989775e-07, | |
| "loss": 0.0004, | |
| "reward": 1.96875, | |
| "reward_std": 0.0883883461356163, | |
| "rewards/format_reward": 1.0, | |
| "rewards/score_reward": 0.96875, | |
| "step": 400 | |
| } | |
| ], | |
| "logging_steps": 1.0, | |
| "max_steps": 978, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 1, | |
| "save_steps": 200, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": false | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 0.0, | |
| "train_batch_size": 1, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |