| { | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 99.88888888888889, | |
| "eval_steps": 50, | |
| "global_step": 400, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 535.125, | |
| "epoch": 0.2222222222222222, | |
| "grad_norm": 1.7916724271880604, | |
| "kl": 0.0, | |
| "learning_rate": 5e-08, | |
| "loss": 0.0583, | |
| "reward": 2.3125, | |
| "reward_std": 1.1971687823534012, | |
| "rewards/accuracy_reward_staging": 0.09375, | |
| "rewards/format_reward": 0.875, | |
| "rewards/format_reward_staging": 0.96875, | |
| "step": 1 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 555.90625, | |
| "epoch": 0.4444444444444444, | |
| "grad_norm": 1.5555075403521712, | |
| "kl": 0.0, | |
| "learning_rate": 1e-07, | |
| "loss": -0.0705, | |
| "reward": 2.5625, | |
| "reward_std": 1.2858919501304626, | |
| "rewards/accuracy_reward_staging": 0.15625, | |
| "rewards/format_reward": 0.84375, | |
| "rewards/format_reward_staging": 0.9375, | |
| "step": 2 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 541.46875, | |
| "epoch": 0.6666666666666666, | |
| "grad_norm": 1.6594522931688669, | |
| "kl": 0.0010576248168945312, | |
| "learning_rate": 1.5e-07, | |
| "loss": -0.0235, | |
| "reward": 2.59375, | |
| "reward_std": 1.6232599020004272, | |
| "rewards/accuracy_reward_staging": 0.15625, | |
| "rewards/format_reward": 0.90625, | |
| "rewards/format_reward_staging": 0.90625, | |
| "step": 3 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 613.25, | |
| "epoch": 0.8888888888888888, | |
| "grad_norm": 2.3276142189283164, | |
| "kl": 0.0011081695556640625, | |
| "learning_rate": 2e-07, | |
| "loss": 0.1029, | |
| "reward": 2.875, | |
| "reward_std": 1.8071783781051636, | |
| "rewards/accuracy_reward_staging": 0.21875, | |
| "rewards/format_reward": 0.8125, | |
| "rewards/format_reward_staging": 0.96875, | |
| "step": 4 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 649.84375, | |
| "epoch": 1.2222222222222223, | |
| "grad_norm": 1.5167959821278052, | |
| "kl": 0.0010709762573242188, | |
| "learning_rate": 2.5e-07, | |
| "loss": 0.0003, | |
| "reward": 2.84375, | |
| "reward_std": 1.7606024742126465, | |
| "rewards/accuracy_reward_staging": 0.21875, | |
| "rewards/format_reward": 0.78125, | |
| "rewards/format_reward_staging": 0.96875, | |
| "step": 5 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 574.25, | |
| "epoch": 1.4444444444444444, | |
| "grad_norm": 1.491122536644779, | |
| "kl": 0.0009145736694335938, | |
| "learning_rate": 3e-07, | |
| "loss": 0.0377, | |
| "reward": 2.75, | |
| "reward_std": 1.8017165958881378, | |
| "rewards/accuracy_reward_staging": 0.1875, | |
| "rewards/format_reward": 0.90625, | |
| "rewards/format_reward_staging": 0.90625, | |
| "step": 6 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 550.46875, | |
| "epoch": 1.6666666666666665, | |
| "grad_norm": 1.5321454699600687, | |
| "kl": 0.0016422271728515625, | |
| "learning_rate": 3.5e-07, | |
| "loss": 0.0173, | |
| "reward": 2.8125, | |
| "reward_std": 1.498587191104889, | |
| "rewards/accuracy_reward_staging": 0.1875, | |
| "rewards/format_reward": 0.90625, | |
| "rewards/format_reward_staging": 0.96875, | |
| "step": 7 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 530.0625, | |
| "epoch": 1.8888888888888888, | |
| "grad_norm": 1.7429693147530465, | |
| "kl": 0.0010614395141601562, | |
| "learning_rate": 4e-07, | |
| "loss": 0.0413, | |
| "reward": 3.15625, | |
| "reward_std": 2.1272581219673157, | |
| "rewards/accuracy_reward_staging": 0.28125, | |
| "rewards/format_reward": 0.78125, | |
| "rewards/format_reward_staging": 0.96875, | |
| "step": 8 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 643.875, | |
| "epoch": 2.2222222222222223, | |
| "grad_norm": 1.53726074310182, | |
| "kl": 0.0013751983642578125, | |
| "learning_rate": 4.5e-07, | |
| "loss": -0.005, | |
| "reward": 3.125, | |
| "reward_std": 2.054091453552246, | |
| "rewards/accuracy_reward_staging": 0.25, | |
| "rewards/format_reward": 0.90625, | |
| "rewards/format_reward_staging": 0.96875, | |
| "step": 9 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 614.09375, | |
| "epoch": 2.4444444444444446, | |
| "grad_norm": 1.3654100960829842, | |
| "kl": 0.0012149810791015625, | |
| "learning_rate": 5e-07, | |
| "loss": -0.0164, | |
| "reward": 2.59375, | |
| "reward_std": 1.0483438968658447, | |
| "rewards/accuracy_reward_staging": 0.125, | |
| "rewards/format_reward": 0.96875, | |
| "rewards/format_reward_staging": 1.0, | |
| "step": 10 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 555.3125, | |
| "epoch": 2.6666666666666665, | |
| "grad_norm": 1.4260001116361793, | |
| "kl": 0.0010051727294921875, | |
| "learning_rate": 5.5e-07, | |
| "loss": 0.0251, | |
| "reward": 3.0625, | |
| "reward_std": 1.7733518332242966, | |
| "rewards/accuracy_reward_staging": 0.25, | |
| "rewards/format_reward": 0.84375, | |
| "rewards/format_reward_staging": 0.96875, | |
| "step": 11 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 552.5625, | |
| "epoch": 2.888888888888889, | |
| "grad_norm": 1.5253120629648043, | |
| "kl": 0.001361846923828125, | |
| "learning_rate": 6e-07, | |
| "loss": 0.0285, | |
| "reward": 3.3125, | |
| "reward_std": 1.9136751294136047, | |
| "rewards/accuracy_reward_staging": 0.28125, | |
| "rewards/format_reward": 0.90625, | |
| "rewards/format_reward_staging": 1.0, | |
| "step": 12 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 579.71875, | |
| "epoch": 3.2222222222222223, | |
| "grad_norm": 1.5612924435198745, | |
| "kl": 0.0019207000732421875, | |
| "learning_rate": 6.5e-07, | |
| "loss": 0.0829, | |
| "reward": 2.0625, | |
| "reward_std": 0.5475594997406006, | |
| "rewards/accuracy_reward_staging": 0.0625, | |
| "rewards/format_reward": 0.8125, | |
| "rewards/format_reward_staging": 0.9375, | |
| "step": 13 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 648.125, | |
| "epoch": 3.4444444444444446, | |
| "grad_norm": 1.472369166378751, | |
| "kl": 0.0019435882568359375, | |
| "learning_rate": 7e-07, | |
| "loss": 0.0889, | |
| "reward": 2.3125, | |
| "reward_std": 1.3669461011886597, | |
| "rewards/accuracy_reward_staging": 0.125, | |
| "rewards/format_reward": 0.84375, | |
| "rewards/format_reward_staging": 0.84375, | |
| "step": 14 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 660.78125, | |
| "epoch": 3.6666666666666665, | |
| "grad_norm": 1.2833764786982476, | |
| "kl": 0.00171661376953125, | |
| "learning_rate": 7.5e-07, | |
| "loss": -0.0032, | |
| "reward": 2.28125, | |
| "reward_std": 0.9946783781051636, | |
| "rewards/accuracy_reward_staging": 0.09375, | |
| "rewards/format_reward": 0.84375, | |
| "rewards/format_reward_staging": 0.96875, | |
| "step": 15 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 555.625, | |
| "epoch": 3.888888888888889, | |
| "grad_norm": 1.7981216304584955, | |
| "kl": 0.003185272216796875, | |
| "learning_rate": 8e-07, | |
| "loss": 0.0022, | |
| "reward": 4.09375, | |
| "reward_std": 2.7086294293403625, | |
| "rewards/accuracy_reward_staging": 0.4375, | |
| "rewards/format_reward": 0.9375, | |
| "rewards/format_reward_staging": 0.96875, | |
| "step": 16 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 581.375, | |
| "epoch": 4.222222222222222, | |
| "grad_norm": 1.8924801483136653, | |
| "kl": 0.003849029541015625, | |
| "learning_rate": 8.499999999999999e-07, | |
| "loss": 0.0192, | |
| "reward": 2.8125, | |
| "reward_std": 1.4357599020004272, | |
| "rewards/accuracy_reward_staging": 0.1875, | |
| "rewards/format_reward": 0.90625, | |
| "rewards/format_reward_staging": 0.96875, | |
| "step": 17 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 626.875, | |
| "epoch": 4.444444444444445, | |
| "grad_norm": 1.4237753323985947, | |
| "kl": 0.004940032958984375, | |
| "learning_rate": 9e-07, | |
| "loss": 0.0048, | |
| "reward": 2.78125, | |
| "reward_std": 1.6875, | |
| "rewards/accuracy_reward_staging": 0.1875, | |
| "rewards/format_reward": 0.875, | |
| "rewards/format_reward_staging": 0.96875, | |
| "step": 18 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 583.875, | |
| "epoch": 4.666666666666667, | |
| "grad_norm": 1.4401282377616447, | |
| "kl": 0.00505828857421875, | |
| "learning_rate": 9.499999999999999e-07, | |
| "loss": 0.0016, | |
| "reward": 3.4375, | |
| "reward_std": 2.3147872537374496, | |
| "rewards/accuracy_reward_staging": 0.34375, | |
| "rewards/format_reward": 0.8125, | |
| "rewards/format_reward_staging": 0.90625, | |
| "step": 19 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 562.8125, | |
| "epoch": 4.888888888888889, | |
| "grad_norm": 1.1629869227175655, | |
| "kl": 0.00585174560546875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0, | |
| "reward": 2.5625, | |
| "reward_std": 0.9797460436820984, | |
| "rewards/accuracy_reward_staging": 0.125, | |
| "rewards/format_reward": 0.9375, | |
| "rewards/format_reward_staging": 1.0, | |
| "step": 20 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 614.875, | |
| "epoch": 5.222222222222222, | |
| "grad_norm": 1.6115188653051613, | |
| "kl": 0.00612640380859375, | |
| "learning_rate": 9.999829128320873e-07, | |
| "loss": 0.0565, | |
| "reward": 3.28125, | |
| "reward_std": 2.4976893961429596, | |
| "rewards/accuracy_reward_staging": 0.28125, | |
| "rewards/format_reward": 0.90625, | |
| "rewards/format_reward_staging": 0.96875, | |
| "step": 21 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 559.46875, | |
| "epoch": 5.444444444444445, | |
| "grad_norm": 1.465512353981508, | |
| "kl": 0.00824737548828125, | |
| "learning_rate": 9.999316524962345e-07, | |
| "loss": 0.0541, | |
| "reward": 3.3125, | |
| "reward_std": 1.8101893961429596, | |
| "rewards/accuracy_reward_staging": 0.28125, | |
| "rewards/format_reward": 0.9375, | |
| "rewards/format_reward_staging": 0.96875, | |
| "step": 22 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 572.59375, | |
| "epoch": 5.666666666666667, | |
| "grad_norm": 1.5847579776558225, | |
| "kl": 0.0093841552734375, | |
| "learning_rate": 9.998462224960173e-07, | |
| "loss": 0.06, | |
| "reward": 3.6875, | |
| "reward_std": 2.443375587463379, | |
| "rewards/accuracy_reward_staging": 0.34375, | |
| "rewards/format_reward": 0.96875, | |
| "rewards/format_reward_staging": 1.0, | |
| "step": 23 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 645.9375, | |
| "epoch": 5.888888888888889, | |
| "grad_norm": 1.8362203993654154, | |
| "kl": 0.00734710693359375, | |
| "learning_rate": 9.99726628670463e-07, | |
| "loss": 0.0368, | |
| "reward": 3.03125, | |
| "reward_std": 2.283504918217659, | |
| "rewards/accuracy_reward_staging": 0.28125, | |
| "rewards/format_reward": 0.71875, | |
| "rewards/format_reward_staging": 0.90625, | |
| "step": 24 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 572.5, | |
| "epoch": 6.222222222222222, | |
| "grad_norm": 1.6415108932304052, | |
| "kl": 0.0096588134765625, | |
| "learning_rate": 9.995728791936505e-07, | |
| "loss": 0.0267, | |
| "reward": 2.96875, | |
| "reward_std": 1.7760016024112701, | |
| "rewards/accuracy_reward_staging": 0.21875, | |
| "rewards/format_reward": 0.90625, | |
| "rewards/format_reward_staging": 0.96875, | |
| "step": 25 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 579.96875, | |
| "epoch": 6.444444444444445, | |
| "grad_norm": 1.4689069714869325, | |
| "kl": 0.010345458984375, | |
| "learning_rate": 9.993849845741523e-07, | |
| "loss": 0.1034, | |
| "reward": 2.5625, | |
| "reward_std": 1.1108438968658447, | |
| "rewards/accuracy_reward_staging": 0.15625, | |
| "rewards/format_reward": 0.875, | |
| "rewards/format_reward_staging": 0.90625, | |
| "step": 26 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 542.46875, | |
| "epoch": 6.666666666666667, | |
| "grad_norm": 1.7253968854719324, | |
| "kl": 0.01122283935546875, | |
| "learning_rate": 9.991629576543163e-07, | |
| "loss": -0.0129, | |
| "reward": 2.625, | |
| "reward_std": 1.316565990447998, | |
| "rewards/accuracy_reward_staging": 0.1875, | |
| "rewards/format_reward": 0.78125, | |
| "rewards/format_reward_staging": 0.90625, | |
| "step": 27 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 598.0, | |
| "epoch": 6.888888888888889, | |
| "grad_norm": 1.439672104037944, | |
| "kl": 0.0132293701171875, | |
| "learning_rate": 9.989068136093872e-07, | |
| "loss": 0.0324, | |
| "reward": 3.375, | |
| "reward_std": 2.423195868730545, | |
| "rewards/accuracy_reward_staging": 0.3125, | |
| "rewards/format_reward": 0.875, | |
| "rewards/format_reward_staging": 0.9375, | |
| "step": 28 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 557.5625, | |
| "epoch": 7.222222222222222, | |
| "grad_norm": 1.53093980357088, | |
| "kl": 0.0146942138671875, | |
| "learning_rate": 9.986165699464705e-07, | |
| "loss": -0.0074, | |
| "reward": 3.125, | |
| "reward_std": 2.0308370888233185, | |
| "rewards/accuracy_reward_staging": 0.25, | |
| "rewards/format_reward": 0.90625, | |
| "rewards/format_reward_staging": 0.96875, | |
| "step": 29 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 574.90625, | |
| "epoch": 7.444444444444445, | |
| "grad_norm": 1.0715134693817079, | |
| "kl": 0.0147857666015625, | |
| "learning_rate": 9.982922465033348e-07, | |
| "loss": -0.0166, | |
| "reward": 2.5, | |
| "reward_std": 0.9858438968658447, | |
| "rewards/accuracy_reward_staging": 0.125, | |
| "rewards/format_reward": 0.875, | |
| "rewards/format_reward_staging": 1.0, | |
| "step": 30 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 602.65625, | |
| "epoch": 7.666666666666667, | |
| "grad_norm": 1.4389686833903352, | |
| "kl": 0.01611328125, | |
| "learning_rate": 9.979338654470567e-07, | |
| "loss": 0.0875, | |
| "reward": 2.4375, | |
| "reward_std": 1.2930222898721695, | |
| "rewards/accuracy_reward_staging": 0.125, | |
| "rewards/format_reward": 0.875, | |
| "rewards/format_reward_staging": 0.9375, | |
| "step": 31 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 669.5625, | |
| "epoch": 7.888888888888889, | |
| "grad_norm": 1.0489321524468773, | |
| "kl": 0.01910400390625, | |
| "learning_rate": 9.975414512725056e-07, | |
| "loss": 0.0185, | |
| "reward": 2.5625, | |
| "reward_std": 1.037847101688385, | |
| "rewards/accuracy_reward_staging": 0.125, | |
| "rewards/format_reward": 0.96875, | |
| "rewards/format_reward_staging": 0.96875, | |
| "step": 32 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 532.625, | |
| "epoch": 8.222222222222221, | |
| "grad_norm": 1.5157291140048736, | |
| "kl": 0.01885986328125, | |
| "learning_rate": 9.971150308006687e-07, | |
| "loss": -0.0001, | |
| "reward": 4.125, | |
| "reward_std": 2.000675529241562, | |
| "rewards/accuracy_reward_staging": 0.4375, | |
| "rewards/format_reward": 0.9375, | |
| "rewards/format_reward_staging": 1.0, | |
| "step": 33 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 591.03125, | |
| "epoch": 8.444444444444445, | |
| "grad_norm": 1.5963578785319679, | |
| "kl": 0.0192413330078125, | |
| "learning_rate": 9.966546331768192e-07, | |
| "loss": 0.1269, | |
| "reward": 2.875, | |
| "reward_std": 2.112294152379036, | |
| "rewards/accuracy_reward_staging": 0.25, | |
| "rewards/format_reward": 0.78125, | |
| "rewards/format_reward_staging": 0.84375, | |
| "step": 34 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 603.09375, | |
| "epoch": 8.666666666666666, | |
| "grad_norm": 1.4508455813252856, | |
| "kl": 0.01494598388671875, | |
| "learning_rate": 9.961602898685223e-07, | |
| "loss": 0.0585, | |
| "reward": 3.3125, | |
| "reward_std": 2.0126227736473083, | |
| "rewards/accuracy_reward_staging": 0.3125, | |
| "rewards/format_reward": 0.8125, | |
| "rewards/format_reward_staging": 0.9375, | |
| "step": 35 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 579.9375, | |
| "epoch": 8.88888888888889, | |
| "grad_norm": 1.196537394176258, | |
| "kl": 0.0169830322265625, | |
| "learning_rate": 9.956320346634875e-07, | |
| "loss": 0.0166, | |
| "reward": 2.78125, | |
| "reward_std": 1.3710740953683853, | |
| "rewards/accuracy_reward_staging": 0.1875, | |
| "rewards/format_reward": 0.84375, | |
| "rewards/format_reward_staging": 1.0, | |
| "step": 36 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 572.96875, | |
| "epoch": 9.222222222222221, | |
| "grad_norm": 1.4031846103728705, | |
| "kl": 0.0164794921875, | |
| "learning_rate": 9.95069903667256e-07, | |
| "loss": 0.0257, | |
| "reward": 2.65625, | |
| "reward_std": 1.4369846880435944, | |
| "rewards/accuracy_reward_staging": 0.1875, | |
| "rewards/format_reward": 0.8125, | |
| "rewards/format_reward_staging": 0.90625, | |
| "step": 37 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 533.8125, | |
| "epoch": 9.444444444444445, | |
| "grad_norm": 1.7378697171564481, | |
| "kl": 0.019744873046875, | |
| "learning_rate": 9.944739353007341e-07, | |
| "loss": 0.0651, | |
| "reward": 3.6875, | |
| "reward_std": 2.841255784034729, | |
| "rewards/accuracy_reward_staging": 0.375, | |
| "rewards/format_reward": 0.84375, | |
| "rewards/format_reward_staging": 0.96875, | |
| "step": 38 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 575.5625, | |
| "epoch": 9.666666666666666, | |
| "grad_norm": 1.6742496883549038, | |
| "kl": 0.018218994140625, | |
| "learning_rate": 9.938441702975689e-07, | |
| "loss": 0.0249, | |
| "reward": 2.4375, | |
| "reward_std": 1.2126952707767487, | |
| "rewards/accuracy_reward_staging": 0.125, | |
| "rewards/format_reward": 0.84375, | |
| "rewards/format_reward_staging": 0.96875, | |
| "step": 39 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 541.375, | |
| "epoch": 9.88888888888889, | |
| "grad_norm": 1.6853780037379804, | |
| "kl": 0.0196533203125, | |
| "learning_rate": 9.931806517013612e-07, | |
| "loss": 0.0121, | |
| "reward": 2.5625, | |
| "reward_std": 1.3815238624811172, | |
| "rewards/accuracy_reward_staging": 0.15625, | |
| "rewards/format_reward": 0.84375, | |
| "rewards/format_reward_staging": 0.9375, | |
| "step": 40 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 550.90625, | |
| "epoch": 10.222222222222221, | |
| "grad_norm": 1.2047759950332129, | |
| "kl": 0.017730712890625, | |
| "learning_rate": 9.924834248627258e-07, | |
| "loss": 0.0398, | |
| "reward": 2.8125, | |
| "reward_std": 1.4487498700618744, | |
| "rewards/accuracy_reward_staging": 0.1875, | |
| "rewards/format_reward": 0.90625, | |
| "rewards/format_reward_staging": 0.96875, | |
| "step": 41 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 587.34375, | |
| "epoch": 10.444444444444445, | |
| "grad_norm": 2.2662890327219642, | |
| "kl": 0.032135009765625, | |
| "learning_rate": 9.917525374361911e-07, | |
| "loss": 0.0402, | |
| "reward": 3.375, | |
| "reward_std": 2.6460810601711273, | |
| "rewards/accuracy_reward_staging": 0.3125, | |
| "rewards/format_reward": 0.84375, | |
| "rewards/format_reward_staging": 0.96875, | |
| "step": 42 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 552.4375, | |
| "epoch": 10.666666666666666, | |
| "grad_norm": 0.8485843884389722, | |
| "kl": 0.021148681640625, | |
| "learning_rate": 9.909880393769418e-07, | |
| "loss": 0.0349, | |
| "reward": 2.5, | |
| "reward_std": 1.045437604188919, | |
| "rewards/accuracy_reward_staging": 0.125, | |
| "rewards/format_reward": 0.9375, | |
| "rewards/format_reward_staging": 0.9375, | |
| "step": 43 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 626.625, | |
| "epoch": 10.88888888888889, | |
| "grad_norm": 1.4242611362455049, | |
| "kl": 0.018280029296875, | |
| "learning_rate": 9.901899829374047e-07, | |
| "loss": 0.0405, | |
| "reward": 3.03125, | |
| "reward_std": 2.107846677303314, | |
| "rewards/accuracy_reward_staging": 0.25, | |
| "rewards/format_reward": 0.875, | |
| "rewards/format_reward_staging": 0.90625, | |
| "step": 44 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 536.8125, | |
| "epoch": 11.222222222222221, | |
| "grad_norm": 1.6000184113652984, | |
| "kl": 0.025238037109375, | |
| "learning_rate": 9.893584226636772e-07, | |
| "loss": -0.0471, | |
| "reward": 2.78125, | |
| "reward_std": 1.6772827804088593, | |
| "rewards/accuracy_reward_staging": 0.1875, | |
| "rewards/format_reward": 0.84375, | |
| "rewards/format_reward_staging": 1.0, | |
| "step": 45 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 588.65625, | |
| "epoch": 11.444444444444445, | |
| "grad_norm": 1.2633801476740014, | |
| "kl": 0.02093505859375, | |
| "learning_rate": 9.884934153917996e-07, | |
| "loss": 0.027, | |
| "reward": 2.4375, | |
| "reward_std": 1.226884126663208, | |
| "rewards/accuracy_reward_staging": 0.125, | |
| "rewards/format_reward": 0.875, | |
| "rewards/format_reward_staging": 0.9375, | |
| "step": 46 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 524.1875, | |
| "epoch": 11.666666666666666, | |
| "grad_norm": 1.7228504370636915, | |
| "kl": 0.020599365234375, | |
| "learning_rate": 9.8759502024387e-07, | |
| "loss": -0.0016, | |
| "reward": 3.125, | |
| "reward_std": 1.8041669130325317, | |
| "rewards/accuracy_reward_staging": 0.28125, | |
| "rewards/format_reward": 0.8125, | |
| "rewards/format_reward_staging": 0.90625, | |
| "step": 47 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 572.875, | |
| "epoch": 11.88888888888889, | |
| "grad_norm": 7.859881611636793, | |
| "kl": 0.063079833984375, | |
| "learning_rate": 9.866632986240029e-07, | |
| "loss": 0.0482, | |
| "reward": 3.25, | |
| "reward_std": 2.0755133628845215, | |
| "rewards/accuracy_reward_staging": 0.28125, | |
| "rewards/format_reward": 0.875, | |
| "rewards/format_reward_staging": 0.96875, | |
| "step": 48 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 577.5625, | |
| "epoch": 12.222222222222221, | |
| "grad_norm": 1.7851397304147796, | |
| "kl": 0.0205078125, | |
| "learning_rate": 9.856983142141337e-07, | |
| "loss": 0.0509, | |
| "reward": 3.3125, | |
| "reward_std": 2.14286145567894, | |
| "rewards/accuracy_reward_staging": 0.3125, | |
| "rewards/format_reward": 0.8125, | |
| "rewards/format_reward_staging": 0.9375, | |
| "step": 49 | |
| }, | |
| { | |
| "epoch": 12.444444444444445, | |
| "grad_norm": 1.585137209096838, | |
| "learning_rate": 9.847001329696652e-07, | |
| "loss": -0.0125, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 12.444444444444445, | |
| "eval_clip_ratio": 0.0, | |
| "eval_completion_length": 597.925, | |
| "eval_kl": 0.02578125, | |
| "eval_loss": 0.024221811443567276, | |
| "eval_reward": 2.625, | |
| "eval_reward_std": 1.6041045665740967, | |
| "eval_rewards/accuracy_reward_staging": 0.175, | |
| "eval_rewards/format_reward": 0.8, | |
| "eval_rewards/format_reward_staging": 0.95, | |
| "eval_runtime": 51.776, | |
| "eval_samples_per_second": 0.695, | |
| "eval_steps_per_second": 0.097, | |
| "step": 50 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 567.046875, | |
| "epoch": 12.666666666666666, | |
| "grad_norm": 1.6497404550782266, | |
| "kl": 0.020294189453125, | |
| "learning_rate": 9.836688231149592e-07, | |
| "loss": -0.0235, | |
| "reward": 3.328125, | |
| "reward_std": 2.148952841758728, | |
| "rewards/accuracy_reward_staging": 0.296875, | |
| "rewards/format_reward": 0.890625, | |
| "rewards/format_reward_staging": 0.953125, | |
| "step": 51 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 655.3125, | |
| "epoch": 12.88888888888889, | |
| "grad_norm": 1.0110588489868237, | |
| "kl": 0.018829345703125, | |
| "learning_rate": 9.826044551386742e-07, | |
| "loss": -0.0207, | |
| "reward": 2.5625, | |
| "reward_std": 1.046603798866272, | |
| "rewards/accuracy_reward_staging": 0.125, | |
| "rewards/format_reward": 0.96875, | |
| "rewards/format_reward_staging": 0.96875, | |
| "step": 52 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 571.65625, | |
| "epoch": 13.222222222222221, | |
| "grad_norm": 1.5942717910970237, | |
| "kl": 0.0233154296875, | |
| "learning_rate": 9.81507101788948e-07, | |
| "loss": 0.0327, | |
| "reward": 2.96875, | |
| "reward_std": 2.0815286338329315, | |
| "rewards/accuracy_reward_staging": 0.21875, | |
| "rewards/format_reward": 0.875, | |
| "rewards/format_reward_staging": 1.0, | |
| "step": 53 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 531.0, | |
| "epoch": 13.444444444444445, | |
| "grad_norm": 1.6431487531106521, | |
| "kl": 0.02325439453125, | |
| "learning_rate": 9.803768380684242e-07, | |
| "loss": -0.005, | |
| "reward": 3.1875, | |
| "reward_std": 2.4305797815322876, | |
| "rewards/accuracy_reward_staging": 0.28125, | |
| "rewards/format_reward": 0.875, | |
| "rewards/format_reward_staging": 0.90625, | |
| "step": 54 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 530.71875, | |
| "epoch": 13.666666666666666, | |
| "grad_norm": 1.3532727186337274, | |
| "kl": 0.021026611328125, | |
| "learning_rate": 9.792137412291263e-07, | |
| "loss": -0.0091, | |
| "reward": 3.09375, | |
| "reward_std": 1.5625, | |
| "rewards/accuracy_reward_staging": 0.21875, | |
| "rewards/format_reward": 1.0, | |
| "rewards/format_reward_staging": 1.0, | |
| "step": 55 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 573.375, | |
| "epoch": 13.88888888888889, | |
| "grad_norm": 1.4441812945667367, | |
| "kl": 0.024932861328125, | |
| "learning_rate": 9.780178907671788e-07, | |
| "loss": 0.0275, | |
| "reward": 3.34375, | |
| "reward_std": 2.1209341287612915, | |
| "rewards/accuracy_reward_staging": 0.3125, | |
| "rewards/format_reward": 0.84375, | |
| "rewards/format_reward_staging": 0.9375, | |
| "step": 56 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 520.15625, | |
| "epoch": 14.222222222222221, | |
| "grad_norm": 1.6824005469371979, | |
| "kl": 0.026092529296875, | |
| "learning_rate": 9.76789368417372e-07, | |
| "loss": -0.0531, | |
| "reward": 2.8125, | |
| "reward_std": 1.377088338136673, | |
| "rewards/accuracy_reward_staging": 0.1875, | |
| "rewards/format_reward": 0.875, | |
| "rewards/format_reward_staging": 1.0, | |
| "step": 57 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 599.96875, | |
| "epoch": 14.444444444444445, | |
| "grad_norm": 1.4915574785365073, | |
| "kl": 0.021026611328125, | |
| "learning_rate": 9.755282581475767e-07, | |
| "loss": 0.0364, | |
| "reward": 4.9375, | |
| "reward_std": 2.745547831058502, | |
| "rewards/accuracy_reward_staging": 0.59375, | |
| "rewards/format_reward": 0.96875, | |
| "rewards/format_reward_staging": 1.0, | |
| "step": 58 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 549.71875, | |
| "epoch": 14.666666666666666, | |
| "grad_norm": 1.4821961551515155, | |
| "kl": 0.02593994140625, | |
| "learning_rate": 9.742346461530047e-07, | |
| "loss": 0.0872, | |
| "reward": 2.53125, | |
| "reward_std": 1.4375, | |
| "rewards/accuracy_reward_staging": 0.125, | |
| "rewards/format_reward": 0.9375, | |
| "rewards/format_reward_staging": 0.96875, | |
| "step": 59 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 601.375, | |
| "epoch": 14.88888888888889, | |
| "grad_norm": 1.249530356824017, | |
| "kl": 0.023406982421875, | |
| "learning_rate": 9.729086208503173e-07, | |
| "loss": 0.0652, | |
| "reward": 2.4375, | |
| "reward_std": 1.1680222749710083, | |
| "rewards/accuracy_reward_staging": 0.125, | |
| "rewards/format_reward": 0.875, | |
| "rewards/format_reward_staging": 0.9375, | |
| "step": 60 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 613.71875, | |
| "epoch": 15.222222222222221, | |
| "grad_norm": 1.4621397072761817, | |
| "kl": 0.0252685546875, | |
| "learning_rate": 9.715502728715825e-07, | |
| "loss": 0.0108, | |
| "reward": 2.96875, | |
| "reward_std": 1.8319481909275055, | |
| "rewards/accuracy_reward_staging": 0.21875, | |
| "rewards/format_reward": 0.90625, | |
| "rewards/format_reward_staging": 0.96875, | |
| "step": 61 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 621.71875, | |
| "epoch": 15.444444444444445, | |
| "grad_norm": 1.4960047973343167, | |
| "kl": 0.023590087890625, | |
| "learning_rate": 9.701596950580807e-07, | |
| "loss": -0.008, | |
| "reward": 3.21875, | |
| "reward_std": 2.3255662322044373, | |
| "rewards/accuracy_reward_staging": 0.28125, | |
| "rewards/format_reward": 0.875, | |
| "rewards/format_reward_staging": 0.9375, | |
| "step": 62 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 559.0, | |
| "epoch": 15.666666666666666, | |
| "grad_norm": 1.377229747116843, | |
| "kl": 0.031097412109375, | |
| "learning_rate": 9.687369824539576e-07, | |
| "loss": 0.072, | |
| "reward": 2.9375, | |
| "reward_std": 1.7239685356616974, | |
| "rewards/accuracy_reward_staging": 0.21875, | |
| "rewards/format_reward": 0.875, | |
| "rewards/format_reward_staging": 0.96875, | |
| "step": 63 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 542.125, | |
| "epoch": 15.88888888888889, | |
| "grad_norm": 1.3837348765591453, | |
| "kl": 0.034423828125, | |
| "learning_rate": 9.672822322997304e-07, | |
| "loss": 0.0508, | |
| "reward": 2.28125, | |
| "reward_std": 1.1752630770206451, | |
| "rewards/accuracy_reward_staging": 0.09375, | |
| "rewards/format_reward": 0.90625, | |
| "rewards/format_reward_staging": 0.90625, | |
| "step": 64 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 645.625, | |
| "epoch": 16.22222222222222, | |
| "grad_norm": 1.2294440183285422, | |
| "kl": 0.023651123046875, | |
| "learning_rate": 9.657955440256395e-07, | |
| "loss": -0.0012, | |
| "reward": 2.59375, | |
| "reward_std": 1.0483438968658447, | |
| "rewards/accuracy_reward_staging": 0.125, | |
| "rewards/format_reward": 0.96875, | |
| "rewards/format_reward_staging": 1.0, | |
| "step": 65 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 593.46875, | |
| "epoch": 16.444444444444443, | |
| "grad_norm": 1.5888259552277046, | |
| "kl": 0.02777099609375, | |
| "learning_rate": 9.642770192448535e-07, | |
| "loss": 0.0496, | |
| "reward": 3.71875, | |
| "reward_std": 2.2672154307365417, | |
| "rewards/accuracy_reward_staging": 0.40625, | |
| "rewards/format_reward": 0.71875, | |
| "rewards/format_reward_staging": 0.96875, | |
| "step": 66 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 574.875, | |
| "epoch": 16.666666666666668, | |
| "grad_norm": 1.5785381612535059, | |
| "kl": 0.034942626953125, | |
| "learning_rate": 9.627267617465243e-07, | |
| "loss": -0.0426, | |
| "reward": 3.03125, | |
| "reward_std": 1.496883064508438, | |
| "rewards/accuracy_reward_staging": 0.21875, | |
| "rewards/format_reward": 0.9375, | |
| "rewards/format_reward_staging": 1.0, | |
| "step": 67 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 566.4375, | |
| "epoch": 16.88888888888889, | |
| "grad_norm": 1.5972037559178247, | |
| "kl": 0.026702880859375, | |
| "learning_rate": 9.611448774886923e-07, | |
| "loss": 0.005, | |
| "reward": 3.15625, | |
| "reward_std": 1.9091877937316895, | |
| "rewards/accuracy_reward_staging": 0.25, | |
| "rewards/format_reward": 0.9375, | |
| "rewards/format_reward_staging": 0.96875, | |
| "step": 68 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 624.21875, | |
| "epoch": 17.22222222222222, | |
| "grad_norm": 2.781118760006495, | |
| "kl": 0.042724609375, | |
| "learning_rate": 9.595314745910455e-07, | |
| "loss": 0.0926, | |
| "reward": 3.3125, | |
| "reward_std": 2.4584514498710632, | |
| "rewards/accuracy_reward_staging": 0.3125, | |
| "rewards/format_reward": 0.8125, | |
| "rewards/format_reward_staging": 0.9375, | |
| "step": 69 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 545.65625, | |
| "epoch": 17.444444444444443, | |
| "grad_norm": 1.6678524207695304, | |
| "kl": 0.028411865234375, | |
| "learning_rate": 9.578866633275286e-07, | |
| "loss": 0.0606, | |
| "reward": 3.75, | |
| "reward_std": 2.237764596939087, | |
| "rewards/accuracy_reward_staging": 0.375, | |
| "rewards/format_reward": 0.9375, | |
| "rewards/format_reward_staging": 0.9375, | |
| "step": 70 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 524.0, | |
| "epoch": 17.666666666666668, | |
| "grad_norm": 1.4163449995088322, | |
| "kl": 0.032470703125, | |
| "learning_rate": 9.562105561188068e-07, | |
| "loss": 0.0105, | |
| "reward": 3.40625, | |
| "reward_std": 1.9233438968658447, | |
| "rewards/accuracy_reward_staging": 0.28125, | |
| "rewards/format_reward": 1.0, | |
| "rewards/format_reward_staging": 1.0, | |
| "step": 71 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 619.8125, | |
| "epoch": 17.88888888888889, | |
| "grad_norm": 1.330864720677356, | |
| "kl": 0.02783203125, | |
| "learning_rate": 9.545032675245813e-07, | |
| "loss": 0.0232, | |
| "reward": 2.875, | |
| "reward_std": 1.5208123177289963, | |
| "rewards/accuracy_reward_staging": 0.21875, | |
| "rewards/format_reward": 0.875, | |
| "rewards/format_reward_staging": 0.90625, | |
| "step": 72 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 615.5625, | |
| "epoch": 18.22222222222222, | |
| "grad_norm": 1.3940735985441313, | |
| "kl": 0.0289306640625, | |
| "learning_rate": 9.527649142357594e-07, | |
| "loss": 0.0449, | |
| "reward": 4.8125, | |
| "reward_std": 3.365248918533325, | |
| "rewards/accuracy_reward_staging": 0.59375, | |
| "rewards/format_reward": 0.90625, | |
| "rewards/format_reward_staging": 0.9375, | |
| "step": 73 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 560.6875, | |
| "epoch": 18.444444444444443, | |
| "grad_norm": 1.573293312721447, | |
| "kl": 0.031890869140625, | |
| "learning_rate": 9.509956150664795e-07, | |
| "loss": 0.0727, | |
| "reward": 2.40625, | |
| "reward_std": 1.0983919501304626, | |
| "rewards/accuracy_reward_staging": 0.125, | |
| "rewards/format_reward": 0.78125, | |
| "rewards/format_reward_staging": 1.0, | |
| "step": 74 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 597.71875, | |
| "epoch": 18.666666666666668, | |
| "grad_norm": 1.3288952801951834, | |
| "kl": 0.028411865234375, | |
| "learning_rate": 9.491954909459894e-07, | |
| "loss": 0.0299, | |
| "reward": 4.125, | |
| "reward_std": 2.0565126538276672, | |
| "rewards/accuracy_reward_staging": 0.4375, | |
| "rewards/format_reward": 0.96875, | |
| "rewards/format_reward_staging": 0.96875, | |
| "step": 75 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 566.875, | |
| "epoch": 18.88888888888889, | |
| "grad_norm": 1.600349042443185, | |
| "kl": 0.03497314453125, | |
| "learning_rate": 9.473646649103817e-07, | |
| "loss": 0.0048, | |
| "reward": 3.46875, | |
| "reward_std": 2.3291621804237366, | |
| "rewards/accuracy_reward_staging": 0.3125, | |
| "rewards/format_reward": 0.9375, | |
| "rewards/format_reward_staging": 0.96875, | |
| "step": 76 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 527.15625, | |
| "epoch": 19.22222222222222, | |
| "grad_norm": 2.0117307258354242, | |
| "kl": 0.034820556640625, | |
| "learning_rate": 9.455032620941839e-07, | |
| "loss": 0.0076, | |
| "reward": 3.15625, | |
| "reward_std": 1.690910965204239, | |
| "rewards/accuracy_reward_staging": 0.25, | |
| "rewards/format_reward": 0.9375, | |
| "rewards/format_reward_staging": 0.96875, | |
| "step": 77 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 548.125, | |
| "epoch": 19.444444444444443, | |
| "grad_norm": 1.2154400614249532, | |
| "kl": 0.034393310546875, | |
| "learning_rate": 9.436114097218058e-07, | |
| "loss": 0.0153, | |
| "reward": 2.34375, | |
| "reward_std": 0.9375, | |
| "rewards/accuracy_reward_staging": 0.09375, | |
| "rewards/format_reward": 0.90625, | |
| "rewards/format_reward_staging": 0.96875, | |
| "step": 78 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 599.9375, | |
| "epoch": 19.666666666666668, | |
| "grad_norm": 1.6406138056170174, | |
| "kl": 0.029205322265625, | |
| "learning_rate": 9.416892370988442e-07, | |
| "loss": 0.0752, | |
| "reward": 2.75, | |
| "reward_std": 1.9128470420837402, | |
| "rewards/accuracy_reward_staging": 0.1875, | |
| "rewards/format_reward": 0.875, | |
| "rewards/format_reward_staging": 0.9375, | |
| "step": 79 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 553.6875, | |
| "epoch": 19.88888888888889, | |
| "grad_norm": 1.527942838909739, | |
| "kl": 0.030364990234375, | |
| "learning_rate": 9.397368756032444e-07, | |
| "loss": -0.0126, | |
| "reward": 4.34375, | |
| "reward_std": 3.079783648252487, | |
| "rewards/accuracy_reward_staging": 0.5, | |
| "rewards/format_reward": 0.875, | |
| "rewards/format_reward_staging": 0.96875, | |
| "step": 80 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 561.53125, | |
| "epoch": 20.22222222222222, | |
| "grad_norm": 1.6342025185675375, | |
| "kl": 0.032318115234375, | |
| "learning_rate": 9.377544586763214e-07, | |
| "loss": -0.0331, | |
| "reward": 4.0625, | |
| "reward_std": 2.1620407104492188, | |
| "rewards/accuracy_reward_staging": 0.4375, | |
| "rewards/format_reward": 0.90625, | |
| "rewards/format_reward_staging": 0.96875, | |
| "step": 81 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 608.125, | |
| "epoch": 20.444444444444443, | |
| "grad_norm": 0.9760974984694354, | |
| "kl": 0.03082275390625, | |
| "learning_rate": 9.357421218136386e-07, | |
| "loss": -0.0281, | |
| "reward": 2.90625, | |
| "reward_std": 1.3726893961429596, | |
| "rewards/accuracy_reward_staging": 0.1875, | |
| "rewards/format_reward": 0.96875, | |
| "rewards/format_reward_staging": 1.0, | |
| "step": 82 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 607.1875, | |
| "epoch": 20.666666666666668, | |
| "grad_norm": 2.9650567483991894, | |
| "kl": 0.0552978515625, | |
| "learning_rate": 9.337000025557476e-07, | |
| "loss": 0.0494, | |
| "reward": 2.78125, | |
| "reward_std": 1.907078742980957, | |
| "rewards/accuracy_reward_staging": 0.21875, | |
| "rewards/format_reward": 0.8125, | |
| "rewards/format_reward_staging": 0.875, | |
| "step": 83 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 648.5625, | |
| "epoch": 20.88888888888889, | |
| "grad_norm": 1.4868637308996282, | |
| "kl": 0.04937744140625, | |
| "learning_rate": 9.316282404787869e-07, | |
| "loss": 0.0813, | |
| "reward": 2.4375, | |
| "reward_std": 1.534547746181488, | |
| "rewards/accuracy_reward_staging": 0.15625, | |
| "rewards/format_reward": 0.78125, | |
| "rewards/format_reward_staging": 0.875, | |
| "step": 84 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 598.0, | |
| "epoch": 21.22222222222222, | |
| "grad_norm": 1.5354307812124717, | |
| "kl": 0.03326416015625, | |
| "learning_rate": 9.295269771849425e-07, | |
| "loss": 0.1102, | |
| "reward": 3.53125, | |
| "reward_std": 2.2993226647377014, | |
| "rewards/accuracy_reward_staging": 0.34375, | |
| "rewards/format_reward": 0.875, | |
| "rewards/format_reward_staging": 0.9375, | |
| "step": 85 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 585.25, | |
| "epoch": 21.444444444444443, | |
| "grad_norm": 1.21306534102283, | |
| "kl": 0.03662109375, | |
| "learning_rate": 9.273963562927694e-07, | |
| "loss": 0.0034, | |
| "reward": 2.90625, | |
| "reward_std": 1.0483438968658447, | |
| "rewards/accuracy_reward_staging": 0.1875, | |
| "rewards/format_reward": 0.96875, | |
| "rewards/format_reward_staging": 1.0, | |
| "step": 86 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 568.71875, | |
| "epoch": 21.666666666666668, | |
| "grad_norm": 11.152096799903676, | |
| "kl": 0.09722900390625, | |
| "learning_rate": 9.252365234273753e-07, | |
| "loss": 0.0125, | |
| "reward": 3.1875, | |
| "reward_std": 1.9283326417207718, | |
| "rewards/accuracy_reward_staging": 0.28125, | |
| "rewards/format_reward": 0.8125, | |
| "rewards/format_reward_staging": 0.96875, | |
| "step": 87 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 567.0625, | |
| "epoch": 21.88888888888889, | |
| "grad_norm": 1.4820021533223564, | |
| "kl": 0.04046630859375, | |
| "learning_rate": 9.230476262104676e-07, | |
| "loss": 0.0631, | |
| "reward": 3.40625, | |
| "reward_std": 2.233847141265869, | |
| "rewards/accuracy_reward_staging": 0.3125, | |
| "rewards/format_reward": 0.84375, | |
| "rewards/format_reward_staging": 1.0, | |
| "step": 88 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 571.625, | |
| "epoch": 22.22222222222222, | |
| "grad_norm": 1.6685149630374954, | |
| "kl": 0.04840087890625, | |
| "learning_rate": 9.208298142502635e-07, | |
| "loss": 0.057, | |
| "reward": 2.90625, | |
| "reward_std": 1.7658206820487976, | |
| "rewards/accuracy_reward_staging": 0.21875, | |
| "rewards/format_reward": 0.84375, | |
| "rewards/format_reward_staging": 0.96875, | |
| "step": 89 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 652.0, | |
| "epoch": 22.444444444444443, | |
| "grad_norm": 1.2598669750057847, | |
| "kl": 0.037872314453125, | |
| "learning_rate": 9.185832391312642e-07, | |
| "loss": 0.0397, | |
| "reward": 2.1875, | |
| "reward_std": 1.0936830341815948, | |
| "rewards/accuracy_reward_staging": 0.09375, | |
| "rewards/format_reward": 0.875, | |
| "rewards/format_reward_staging": 0.84375, | |
| "step": 90 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 512.84375, | |
| "epoch": 22.666666666666668, | |
| "grad_norm": 1.464116156191612, | |
| "kl": 0.0408935546875, | |
| "learning_rate": 9.163080544038952e-07, | |
| "loss": 0.0325, | |
| "reward": 3.1875, | |
| "reward_std": 2.0054054260253906, | |
| "rewards/accuracy_reward_staging": 0.28125, | |
| "rewards/format_reward": 0.84375, | |
| "rewards/format_reward_staging": 0.9375, | |
| "step": 91 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 595.0, | |
| "epoch": 22.88888888888889, | |
| "grad_norm": 1.7502771652549964, | |
| "kl": 0.0543212890625, | |
| "learning_rate": 9.1400441557401e-07, | |
| "loss": 0.1198, | |
| "reward": 4.375, | |
| "reward_std": 2.6551371216773987, | |
| "rewards/accuracy_reward_staging": 0.53125, | |
| "rewards/format_reward": 0.8125, | |
| "rewards/format_reward_staging": 0.90625, | |
| "step": 92 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 575.375, | |
| "epoch": 23.22222222222222, | |
| "grad_norm": 1.5494132503619473, | |
| "kl": 0.04376220703125, | |
| "learning_rate": 9.116724800922629e-07, | |
| "loss": 0.1098, | |
| "reward": 3.6875, | |
| "reward_std": 1.9493454694747925, | |
| "rewards/accuracy_reward_staging": 0.375, | |
| "rewards/format_reward": 0.875, | |
| "rewards/format_reward_staging": 0.9375, | |
| "step": 93 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 577.5, | |
| "epoch": 23.444444444444443, | |
| "grad_norm": 1.2511045169588764, | |
| "kl": 0.0521240234375, | |
| "learning_rate": 9.093124073433462e-07, | |
| "loss": 0.0389, | |
| "reward": 3.5625, | |
| "reward_std": 2.1182020902633667, | |
| "rewards/accuracy_reward_staging": 0.34375, | |
| "rewards/format_reward": 0.84375, | |
| "rewards/format_reward_staging": 1.0, | |
| "step": 94 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 590.78125, | |
| "epoch": 23.666666666666668, | |
| "grad_norm": 1.5974928179741261, | |
| "kl": 0.045074462890625, | |
| "learning_rate": 9.069243586350975e-07, | |
| "loss": -0.0127, | |
| "reward": 4.09375, | |
| "reward_std": 2.1429253816604614, | |
| "rewards/accuracy_reward_staging": 0.4375, | |
| "rewards/format_reward": 0.90625, | |
| "rewards/format_reward_staging": 1.0, | |
| "step": 95 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 454.1875, | |
| "epoch": 23.88888888888889, | |
| "grad_norm": 1.885519118261372, | |
| "kl": 0.0450439453125, | |
| "learning_rate": 9.045084971874737e-07, | |
| "loss": 0.0469, | |
| "reward": 4.0625, | |
| "reward_std": 2.76924729347229, | |
| "rewards/accuracy_reward_staging": 0.4375, | |
| "rewards/format_reward": 0.90625, | |
| "rewards/format_reward_staging": 0.96875, | |
| "step": 96 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 538.0625, | |
| "epoch": 24.22222222222222, | |
| "grad_norm": 1.5682355592026038, | |
| "kl": 0.05267333984375, | |
| "learning_rate": 9.020649881213958e-07, | |
| "loss": 0.0061, | |
| "reward": 3.40625, | |
| "reward_std": 2.1967990398406982, | |
| "rewards/accuracy_reward_staging": 0.3125, | |
| "rewards/format_reward": 0.90625, | |
| "rewards/format_reward_staging": 0.9375, | |
| "step": 97 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 632.75, | |
| "epoch": 24.444444444444443, | |
| "grad_norm": 1.2736403946455588, | |
| "kl": 0.044189453125, | |
| "learning_rate": 8.995939984474623e-07, | |
| "loss": 0.0172, | |
| "reward": 3.84375, | |
| "reward_std": 2.4564297795295715, | |
| "rewards/accuracy_reward_staging": 0.40625, | |
| "rewards/format_reward": 0.8125, | |
| "rewards/format_reward_staging": 1.0, | |
| "step": 98 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 582.90625, | |
| "epoch": 24.666666666666668, | |
| "grad_norm": 1.5123549273068009, | |
| "kl": 0.04638671875, | |
| "learning_rate": 8.970956970545355e-07, | |
| "loss": 0.0662, | |
| "reward": 3.78125, | |
| "reward_std": 2.7111909985542297, | |
| "rewards/accuracy_reward_staging": 0.375, | |
| "rewards/format_reward": 0.9375, | |
| "rewards/format_reward_staging": 0.96875, | |
| "step": 99 | |
| }, | |
| { | |
| "epoch": 24.88888888888889, | |
| "grad_norm": 1.7849471498217702, | |
| "learning_rate": 8.945702546981968e-07, | |
| "loss": 0.142, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 24.88888888888889, | |
| "eval_clip_ratio": 0.0, | |
| "eval_completion_length": 511.125, | |
| "eval_kl": 0.073193359375, | |
| "eval_loss": -0.007530718110501766, | |
| "eval_reward": 2.075, | |
| "eval_reward_std": 0.6665439963340759, | |
| "eval_rewards/accuracy_reward_staging": 0.05, | |
| "eval_rewards/format_reward": 0.85, | |
| "eval_rewards/format_reward_staging": 0.975, | |
| "eval_runtime": 50.3514, | |
| "eval_samples_per_second": 0.715, | |
| "eval_steps_per_second": 0.099, | |
| "step": 100 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 581.21875, | |
| "epoch": 25.22222222222222, | |
| "grad_norm": 1.8652012487452174, | |
| "kl": 0.05792236328125, | |
| "learning_rate": 8.920178439890764e-07, | |
| "loss": 0.0112, | |
| "reward": 3.46875, | |
| "reward_std": 1.8295301795005798, | |
| "rewards/accuracy_reward_staging": 0.328125, | |
| "rewards/format_reward": 0.890625, | |
| "rewards/format_reward_staging": 0.9375, | |
| "step": 101 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 597.875, | |
| "epoch": 25.444444444444443, | |
| "grad_norm": 4.84795988570716, | |
| "kl": 0.06231689453125, | |
| "learning_rate": 8.894386393810562e-07, | |
| "loss": 0.0844, | |
| "reward": 2.875, | |
| "reward_std": 1.6470783054828644, | |
| "rewards/accuracy_reward_staging": 0.21875, | |
| "rewards/format_reward": 0.8125, | |
| "rewards/format_reward_staging": 0.96875, | |
| "step": 102 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 562.5, | |
| "epoch": 25.666666666666668, | |
| "grad_norm": 1.8754521848828094, | |
| "kl": 0.052978515625, | |
| "learning_rate": 8.868328171593446e-07, | |
| "loss": -0.0154, | |
| "reward": 4.25, | |
| "reward_std": 2.547704756259918, | |
| "rewards/accuracy_reward_staging": 0.46875, | |
| "rewards/format_reward": 0.90625, | |
| "rewards/format_reward_staging": 1.0, | |
| "step": 103 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 537.3125, | |
| "epoch": 25.88888888888889, | |
| "grad_norm": 1.797756724546587, | |
| "kl": 0.05206298828125, | |
| "learning_rate": 8.842005554284295e-07, | |
| "loss": -0.0275, | |
| "reward": 3.84375, | |
| "reward_std": 2.51630362868309, | |
| "rewards/accuracy_reward_staging": 0.40625, | |
| "rewards/format_reward": 0.90625, | |
| "rewards/format_reward_staging": 0.90625, | |
| "step": 104 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 622.53125, | |
| "epoch": 26.22222222222222, | |
| "grad_norm": 1.300955660750681, | |
| "kl": 0.05413818359375, | |
| "learning_rate": 8.815420340999033e-07, | |
| "loss": 0.0637, | |
| "reward": 3.84375, | |
| "reward_std": 1.3620327413082123, | |
| "rewards/accuracy_reward_staging": 0.40625, | |
| "rewards/format_reward": 0.875, | |
| "rewards/format_reward_staging": 0.9375, | |
| "step": 105 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 594.25, | |
| "epoch": 26.444444444444443, | |
| "grad_norm": 1.43085940167237, | |
| "kl": 0.0439453125, | |
| "learning_rate": 8.788574348801674e-07, | |
| "loss": 0.0768, | |
| "reward": 4.625, | |
| "reward_std": 1.9858438968658447, | |
| "rewards/accuracy_reward_staging": 0.53125, | |
| "rewards/format_reward": 0.96875, | |
| "rewards/format_reward_staging": 1.0, | |
| "step": 106 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 582.15625, | |
| "epoch": 26.666666666666668, | |
| "grad_norm": 1.6135777138066925, | |
| "kl": 0.06390380859375, | |
| "learning_rate": 8.761469412580124e-07, | |
| "loss": 0.0142, | |
| "reward": 1.96875, | |
| "reward_std": 1.00966876745224, | |
| "rewards/accuracy_reward_staging": 0.0625, | |
| "rewards/format_reward": 0.78125, | |
| "rewards/format_reward_staging": 0.875, | |
| "step": 107 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 587.0625, | |
| "epoch": 26.88888888888889, | |
| "grad_norm": 2.1207393888337887, | |
| "kl": 0.06134033203125, | |
| "learning_rate": 8.734107384920769e-07, | |
| "loss": 0.0242, | |
| "reward": 4.125, | |
| "reward_std": 3.0213340520858765, | |
| "rewards/accuracy_reward_staging": 0.4375, | |
| "rewards/format_reward": 0.9375, | |
| "rewards/format_reward_staging": 1.0, | |
| "step": 108 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 548.40625, | |
| "epoch": 27.22222222222222, | |
| "grad_norm": 1.5591626897717148, | |
| "kl": 0.0465087890625, | |
| "learning_rate": 8.706490135981855e-07, | |
| "loss": -0.0282, | |
| "reward": 4.5625, | |
| "reward_std": 2.360237419605255, | |
| "rewards/accuracy_reward_staging": 0.53125, | |
| "rewards/format_reward": 0.9375, | |
| "rewards/format_reward_staging": 0.96875, | |
| "step": 109 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 598.8125, | |
| "epoch": 27.444444444444443, | |
| "grad_norm": 1.1645802297935302, | |
| "kl": 0.04632568359375, | |
| "learning_rate": 8.678619553365658e-07, | |
| "loss": -0.0278, | |
| "reward": 3.21875, | |
| "reward_std": 1.8432062864303589, | |
| "rewards/accuracy_reward_staging": 0.25, | |
| "rewards/format_reward": 0.96875, | |
| "rewards/format_reward_staging": 1.0, | |
| "step": 110 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 610.21875, | |
| "epoch": 27.666666666666668, | |
| "grad_norm": 1.7617563181087859, | |
| "kl": 0.0550537109375, | |
| "learning_rate": 8.650497541989481e-07, | |
| "loss": -0.0219, | |
| "reward": 2.84375, | |
| "reward_std": 1.7233919501304626, | |
| "rewards/accuracy_reward_staging": 0.1875, | |
| "rewards/format_reward": 0.9375, | |
| "rewards/format_reward_staging": 0.96875, | |
| "step": 111 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 602.5, | |
| "epoch": 27.88888888888889, | |
| "grad_norm": 1.4790592908519822, | |
| "kl": 0.04412841796875, | |
| "learning_rate": 8.622126023955445e-07, | |
| "loss": 0.0624, | |
| "reward": 3.65625, | |
| "reward_std": 2.0483438968658447, | |
| "rewards/accuracy_reward_staging": 0.34375, | |
| "rewards/format_reward": 0.96875, | |
| "rewards/format_reward_staging": 0.96875, | |
| "step": 112 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 602.21875, | |
| "epoch": 28.22222222222222, | |
| "grad_norm": 1.4599487812585739, | |
| "kl": 0.0484619140625, | |
| "learning_rate": 8.593506938419119e-07, | |
| "loss": 0.0459, | |
| "reward": 3.84375, | |
| "reward_std": 0.9925079494714737, | |
| "rewards/accuracy_reward_staging": 0.40625, | |
| "rewards/format_reward": 0.84375, | |
| "rewards/format_reward_staging": 0.96875, | |
| "step": 113 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 569.71875, | |
| "epoch": 28.444444444444443, | |
| "grad_norm": 1.3832144010184737, | |
| "kl": 0.0467529296875, | |
| "learning_rate": 8.564642241456986e-07, | |
| "loss": 0.0025, | |
| "reward": 3.71875, | |
| "reward_std": 1.9233438968658447, | |
| "rewards/accuracy_reward_staging": 0.34375, | |
| "rewards/format_reward": 1.0, | |
| "rewards/format_reward_staging": 1.0, | |
| "step": 114 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 548.78125, | |
| "epoch": 28.666666666666668, | |
| "grad_norm": 1.803131512178923, | |
| "kl": 0.0576171875, | |
| "learning_rate": 8.535533905932737e-07, | |
| "loss": -0.0187, | |
| "reward": 3.90625, | |
| "reward_std": 2.563826858997345, | |
| "rewards/accuracy_reward_staging": 0.4375, | |
| "rewards/format_reward": 0.75, | |
| "rewards/format_reward_staging": 0.96875, | |
| "step": 115 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 564.4375, | |
| "epoch": 28.88888888888889, | |
| "grad_norm": 1.6402368950584498, | |
| "kl": 0.05029296875, | |
| "learning_rate": 8.506183921362442e-07, | |
| "loss": -0.0174, | |
| "reward": 3.3125, | |
| "reward_std": 2.5466037690639496, | |
| "rewards/accuracy_reward_staging": 0.28125, | |
| "rewards/format_reward": 0.9375, | |
| "rewards/format_reward_staging": 0.96875, | |
| "step": 116 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 563.5625, | |
| "epoch": 29.22222222222222, | |
| "grad_norm": 1.4985773194128882, | |
| "kl": 0.04840087890625, | |
| "learning_rate": 8.47659429377856e-07, | |
| "loss": -0.0153, | |
| "reward": 3.875, | |
| "reward_std": 2.3320942521095276, | |
| "rewards/accuracy_reward_staging": 0.375, | |
| "rewards/format_reward": 1.0, | |
| "rewards/format_reward_staging": 1.0, | |
| "step": 117 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 565.34375, | |
| "epoch": 29.444444444444443, | |
| "grad_norm": 1.7501844147476033, | |
| "kl": 0.05194091796875, | |
| "learning_rate": 8.446767045592829e-07, | |
| "loss": 0.0359, | |
| "reward": 3.84375, | |
| "reward_std": 2.3963494896888733, | |
| "rewards/accuracy_reward_staging": 0.40625, | |
| "rewards/format_reward": 0.84375, | |
| "rewards/format_reward_staging": 0.96875, | |
| "step": 118 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 574.5, | |
| "epoch": 29.666666666666668, | |
| "grad_norm": 1.2571401212163673, | |
| "kl": 0.0498046875, | |
| "learning_rate": 8.416704215458042e-07, | |
| "loss": 0.0187, | |
| "reward": 3.3125, | |
| "reward_std": 1.125, | |
| "rewards/accuracy_reward_staging": 0.28125, | |
| "rewards/format_reward": 0.96875, | |
| "rewards/format_reward_staging": 0.9375, | |
| "step": 119 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 597.25, | |
| "epoch": 29.88888888888889, | |
| "grad_norm": 1.2235795288016953, | |
| "kl": 0.04754638671875, | |
| "learning_rate": 8.386407858128706e-07, | |
| "loss": -0.0144, | |
| "reward": 3.25, | |
| "reward_std": 1.5358919501304626, | |
| "rewards/accuracy_reward_staging": 0.25, | |
| "rewards/format_reward": 1.0, | |
| "rewards/format_reward_staging": 1.0, | |
| "step": 120 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 597.15625, | |
| "epoch": 30.22222222222222, | |
| "grad_norm": 1.6274382257749778, | |
| "kl": 0.060791015625, | |
| "learning_rate": 8.355880044320597e-07, | |
| "loss": 0.0121, | |
| "reward": 3.34375, | |
| "reward_std": 2.7569093704223633, | |
| "rewards/accuracy_reward_staging": 0.34375, | |
| "rewards/format_reward": 0.78125, | |
| "rewards/format_reward_staging": 0.84375, | |
| "step": 121 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 592.8125, | |
| "epoch": 30.444444444444443, | |
| "grad_norm": 2.5186927220968895, | |
| "kl": 0.09588623046875, | |
| "learning_rate": 8.325122860569241e-07, | |
| "loss": 0.0081, | |
| "reward": 3.15625, | |
| "reward_std": 2.1270195841789246, | |
| "rewards/accuracy_reward_staging": 0.28125, | |
| "rewards/format_reward": 0.875, | |
| "rewards/format_reward_staging": 0.875, | |
| "step": 122 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 564.1875, | |
| "epoch": 30.666666666666668, | |
| "grad_norm": 1.4932442148368137, | |
| "kl": 0.04656982421875, | |
| "learning_rate": 8.294138409087289e-07, | |
| "loss": 0.0298, | |
| "reward": 3.625, | |
| "reward_std": 2.008278489112854, | |
| "rewards/accuracy_reward_staging": 0.34375, | |
| "rewards/format_reward": 0.90625, | |
| "rewards/format_reward_staging": 1.0, | |
| "step": 123 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 607.375, | |
| "epoch": 30.88888888888889, | |
| "grad_norm": 3.4718877576698746, | |
| "kl": 0.076904296875, | |
| "learning_rate": 8.262928807620843e-07, | |
| "loss": -0.0234, | |
| "reward": 3.6875, | |
| "reward_std": 2.751339912414551, | |
| "rewards/accuracy_reward_staging": 0.375, | |
| "rewards/format_reward": 0.90625, | |
| "rewards/format_reward_staging": 0.90625, | |
| "step": 124 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 531.5625, | |
| "epoch": 31.22222222222222, | |
| "grad_norm": 1.622119125741056, | |
| "kl": 0.05914306640625, | |
| "learning_rate": 8.231496189304704e-07, | |
| "loss": 0.0119, | |
| "reward": 3.78125, | |
| "reward_std": 1.9775724411010742, | |
| "rewards/accuracy_reward_staging": 0.375, | |
| "rewards/format_reward": 0.90625, | |
| "rewards/format_reward_staging": 1.0, | |
| "step": 125 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 645.84375, | |
| "epoch": 31.444444444444443, | |
| "grad_norm": 1.6061164218143151, | |
| "kl": 0.0496826171875, | |
| "learning_rate": 8.199842702516582e-07, | |
| "loss": 0.0355, | |
| "reward": 3.90625, | |
| "reward_std": 2.5803541243076324, | |
| "rewards/accuracy_reward_staging": 0.4375, | |
| "rewards/format_reward": 0.78125, | |
| "rewards/format_reward_staging": 0.9375, | |
| "step": 126 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 597.3125, | |
| "epoch": 31.666666666666668, | |
| "grad_norm": 1.3457598005679037, | |
| "kl": 0.0526123046875, | |
| "learning_rate": 8.167970510730252e-07, | |
| "loss": -0.0134, | |
| "reward": 3.15625, | |
| "reward_std": 1.8007422089576721, | |
| "rewards/accuracy_reward_staging": 0.25, | |
| "rewards/format_reward": 0.9375, | |
| "rewards/format_reward_staging": 0.96875, | |
| "step": 127 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 640.25, | |
| "epoch": 31.88888888888889, | |
| "grad_norm": 1.5569181185599603, | |
| "kl": 0.058349609375, | |
| "learning_rate": 8.135881792367685e-07, | |
| "loss": -0.0192, | |
| "reward": 3.59375, | |
| "reward_std": 1.5271694660186768, | |
| "rewards/accuracy_reward_staging": 0.34375, | |
| "rewards/format_reward": 0.875, | |
| "rewards/format_reward_staging": 1.0, | |
| "step": 128 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 574.34375, | |
| "epoch": 32.22222222222222, | |
| "grad_norm": 1.6790409041925978, | |
| "kl": 0.05426025390625, | |
| "learning_rate": 8.103578740650156e-07, | |
| "loss": -0.0013, | |
| "reward": 3.8125, | |
| "reward_std": 2.151860535144806, | |
| "rewards/accuracy_reward_staging": 0.375, | |
| "rewards/format_reward": 0.9375, | |
| "rewards/format_reward_staging": 1.0, | |
| "step": 129 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 568.0, | |
| "epoch": 32.44444444444444, | |
| "grad_norm": 1.7164186713447234, | |
| "kl": 0.0628662109375, | |
| "learning_rate": 8.071063563448339e-07, | |
| "loss": 0.0355, | |
| "reward": 3.09375, | |
| "reward_std": 2.110320746898651, | |
| "rewards/accuracy_reward_staging": 0.28125, | |
| "rewards/format_reward": 0.78125, | |
| "rewards/format_reward_staging": 0.90625, | |
| "step": 130 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 612.125, | |
| "epoch": 32.666666666666664, | |
| "grad_norm": 1.4484213626473657, | |
| "kl": 0.0438232421875, | |
| "learning_rate": 8.038338483131406e-07, | |
| "loss": 0.0675, | |
| "reward": 2.65625, | |
| "reward_std": 1.5483438968658447, | |
| "rewards/accuracy_reward_staging": 0.15625, | |
| "rewards/format_reward": 0.90625, | |
| "rewards/format_reward_staging": 0.96875, | |
| "step": 131 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 633.375, | |
| "epoch": 32.888888888888886, | |
| "grad_norm": 1.4888928164051263, | |
| "kl": 0.046630859375, | |
| "learning_rate": 8.005405736415125e-07, | |
| "loss": 0.003, | |
| "reward": 3.5625, | |
| "reward_std": 2.257579743862152, | |
| "rewards/accuracy_reward_staging": 0.3125, | |
| "rewards/format_reward": 1.0, | |
| "rewards/format_reward_staging": 1.0, | |
| "step": 132 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 608.28125, | |
| "epoch": 33.22222222222222, | |
| "grad_norm": 1.4537634594396451, | |
| "kl": 0.05352783203125, | |
| "learning_rate": 7.97226757420899e-07, | |
| "loss": 0.0072, | |
| "reward": 4.53125, | |
| "reward_std": 2.650395154953003, | |
| "rewards/accuracy_reward_staging": 0.53125, | |
| "rewards/format_reward": 0.90625, | |
| "rewards/format_reward_staging": 0.96875, | |
| "step": 133 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 589.625, | |
| "epoch": 33.44444444444444, | |
| "grad_norm": 5.103167634384414, | |
| "kl": 0.107421875, | |
| "learning_rate": 7.938926261462365e-07, | |
| "loss": 0.0303, | |
| "reward": 3.96875, | |
| "reward_std": 1.4233438968658447, | |
| "rewards/accuracy_reward_staging": 0.40625, | |
| "rewards/format_reward": 0.9375, | |
| "rewards/format_reward_staging": 1.0, | |
| "step": 134 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 576.90625, | |
| "epoch": 33.666666666666664, | |
| "grad_norm": 5.13739196509469, | |
| "kl": 0.09185791015625, | |
| "learning_rate": 7.905384077009692e-07, | |
| "loss": 0.0254, | |
| "reward": 3.40625, | |
| "reward_std": 2.5271694660186768, | |
| "rewards/accuracy_reward_staging": 0.3125, | |
| "rewards/format_reward": 0.875, | |
| "rewards/format_reward_staging": 0.96875, | |
| "step": 135 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 565.0, | |
| "epoch": 33.888888888888886, | |
| "grad_norm": 1.3347218031999781, | |
| "kl": 0.05279541015625, | |
| "learning_rate": 7.871643313414718e-07, | |
| "loss": -0.0269, | |
| "reward": 3.78125, | |
| "reward_std": 1.9108592867851257, | |
| "rewards/accuracy_reward_staging": 0.375, | |
| "rewards/format_reward": 0.9375, | |
| "rewards/format_reward_staging": 0.96875, | |
| "step": 136 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 596.96875, | |
| "epoch": 34.22222222222222, | |
| "grad_norm": 1.6203773256898213, | |
| "kl": 0.05377197265625, | |
| "learning_rate": 7.837706276813818e-07, | |
| "loss": -0.0507, | |
| "reward": 3.78125, | |
| "reward_std": 2.8475868701934814, | |
| "rewards/accuracy_reward_staging": 0.40625, | |
| "rewards/format_reward": 0.84375, | |
| "rewards/format_reward_staging": 0.90625, | |
| "step": 137 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 548.90625, | |
| "epoch": 34.44444444444444, | |
| "grad_norm": 1.7589228637659193, | |
| "kl": 0.0518798828125, | |
| "learning_rate": 7.803575286758363e-07, | |
| "loss": 0.0256, | |
| "reward": 3.84375, | |
| "reward_std": 2.3770764470100403, | |
| "rewards/accuracy_reward_staging": 0.40625, | |
| "rewards/format_reward": 0.875, | |
| "rewards/format_reward_staging": 0.9375, | |
| "step": 138 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 574.90625, | |
| "epoch": 34.666666666666664, | |
| "grad_norm": 1.465848261824115, | |
| "kl": 0.05047607421875, | |
| "learning_rate": 7.769252676056186e-07, | |
| "loss": 0.0121, | |
| "reward": 3.0, | |
| "reward_std": 1.999484658241272, | |
| "rewards/accuracy_reward_staging": 0.21875, | |
| "rewards/format_reward": 0.9375, | |
| "rewards/format_reward_staging": 0.96875, | |
| "step": 139 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 621.5625, | |
| "epoch": 34.888888888888886, | |
| "grad_norm": 1.699502675045011, | |
| "kl": 0.04669189453125, | |
| "learning_rate": 7.734740790612136e-07, | |
| "loss": -0.0043, | |
| "reward": 3.65625, | |
| "reward_std": 2.740947127342224, | |
| "rewards/accuracy_reward_staging": 0.40625, | |
| "rewards/format_reward": 0.78125, | |
| "rewards/format_reward_staging": 0.84375, | |
| "step": 140 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 575.03125, | |
| "epoch": 35.22222222222222, | |
| "grad_norm": 1.4180294898308454, | |
| "kl": 0.04791259765625, | |
| "learning_rate": 7.700041989267736e-07, | |
| "loss": 0.0128, | |
| "reward": 3.9375, | |
| "reward_std": 1.6851893663406372, | |
| "rewards/accuracy_reward_staging": 0.40625, | |
| "rewards/format_reward": 0.90625, | |
| "rewards/format_reward_staging": 1.0, | |
| "step": 141 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 634.3125, | |
| "epoch": 35.44444444444444, | |
| "grad_norm": 0.97669552258444, | |
| "kl": 0.04840087890625, | |
| "learning_rate": 7.665158643639969e-07, | |
| "loss": 0.0078, | |
| "reward": 3.90625, | |
| "reward_std": 1.2753951847553253, | |
| "rewards/accuracy_reward_staging": 0.40625, | |
| "rewards/format_reward": 0.875, | |
| "rewards/format_reward_staging": 1.0, | |
| "step": 142 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 564.9375, | |
| "epoch": 35.666666666666664, | |
| "grad_norm": 1.4705421421024347, | |
| "kl": 0.0458984375, | |
| "learning_rate": 7.63009313795917e-07, | |
| "loss": 0.0007, | |
| "reward": 3.375, | |
| "reward_std": 1.9858438968658447, | |
| "rewards/accuracy_reward_staging": 0.28125, | |
| "rewards/format_reward": 0.96875, | |
| "rewards/format_reward_staging": 1.0, | |
| "step": 143 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 630.8125, | |
| "epoch": 35.888888888888886, | |
| "grad_norm": 1.4040857696410018, | |
| "kl": 0.0491943359375, | |
| "learning_rate": 7.594847868906076e-07, | |
| "loss": 0.0157, | |
| "reward": 4.53125, | |
| "reward_std": 1.881795346736908, | |
| "rewards/accuracy_reward_staging": 0.5625, | |
| "rewards/format_reward": 0.78125, | |
| "rewards/format_reward_staging": 0.9375, | |
| "step": 144 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 597.4375, | |
| "epoch": 36.22222222222222, | |
| "grad_norm": 1.7416315495447303, | |
| "kl": 0.05291748046875, | |
| "learning_rate": 7.559425245448005e-07, | |
| "loss": 0.1534, | |
| "reward": 4.125, | |
| "reward_std": 1.7268692255020142, | |
| "rewards/accuracy_reward_staging": 0.46875, | |
| "rewards/format_reward": 0.875, | |
| "rewards/format_reward_staging": 0.90625, | |
| "step": 145 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 543.34375, | |
| "epoch": 36.44444444444444, | |
| "grad_norm": 1.3338618690781434, | |
| "kl": 0.05255126953125, | |
| "learning_rate": 7.523827688674219e-07, | |
| "loss": 0.0048, | |
| "reward": 3.46875, | |
| "reward_std": 1.7618454992771149, | |
| "rewards/accuracy_reward_staging": 0.3125, | |
| "rewards/format_reward": 0.9375, | |
| "rewards/format_reward_staging": 0.96875, | |
| "step": 146 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 579.625, | |
| "epoch": 36.666666666666664, | |
| "grad_norm": 1.8245501253344487, | |
| "kl": 0.04931640625, | |
| "learning_rate": 7.488057631630437e-07, | |
| "loss": 0.0975, | |
| "reward": 3.78125, | |
| "reward_std": 2.0842358469963074, | |
| "rewards/accuracy_reward_staging": 0.375, | |
| "rewards/format_reward": 0.90625, | |
| "rewards/format_reward_staging": 1.0, | |
| "step": 147 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 556.25, | |
| "epoch": 36.888888888888886, | |
| "grad_norm": 1.60039839729214, | |
| "kl": 0.0479736328125, | |
| "learning_rate": 7.452117519152541e-07, | |
| "loss": -0.0225, | |
| "reward": 4.75, | |
| "reward_std": 2.8358521461486816, | |
| "rewards/accuracy_reward_staging": 0.59375, | |
| "rewards/format_reward": 0.84375, | |
| "rewards/format_reward_staging": 0.9375, | |
| "step": 148 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 575.59375, | |
| "epoch": 37.22222222222222, | |
| "grad_norm": 1.9262708460562594, | |
| "kl": 0.04852294921875, | |
| "learning_rate": 7.416009807699481e-07, | |
| "loss": 0.0694, | |
| "reward": 3.875, | |
| "reward_std": 2.4488722383975983, | |
| "rewards/accuracy_reward_staging": 0.40625, | |
| "rewards/format_reward": 0.90625, | |
| "rewards/format_reward_staging": 0.9375, | |
| "step": 149 | |
| }, | |
| { | |
| "epoch": 37.44444444444444, | |
| "grad_norm": 1.4126152849193538, | |
| "learning_rate": 7.379736965185368e-07, | |
| "loss": 0.0461, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 37.44444444444444, | |
| "eval_clip_ratio": 0.0, | |
| "eval_completion_length": 583.05, | |
| "eval_kl": 0.045751953125, | |
| "eval_loss": -0.002990193199366331, | |
| "eval_reward": 2.725, | |
| "eval_reward_std": 1.3047046661376953, | |
| "eval_rewards/accuracy_reward_staging": 0.175, | |
| "eval_rewards/format_reward": 0.875, | |
| "eval_rewards/format_reward_staging": 0.975, | |
| "eval_runtime": 52.1348, | |
| "eval_samples_per_second": 0.691, | |
| "eval_steps_per_second": 0.096, | |
| "step": 150 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 566.234375, | |
| "epoch": 37.666666666666664, | |
| "grad_norm": 1.6128745248404066, | |
| "kl": 0.0498046875, | |
| "learning_rate": 7.343301470810807e-07, | |
| "loss": 0.0205, | |
| "reward": 3.8125, | |
| "reward_std": 2.2092738151550293, | |
| "rewards/accuracy_reward_staging": 0.375, | |
| "rewards/format_reward": 0.953125, | |
| "rewards/format_reward_staging": 0.984375, | |
| "step": 151 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 561.6875, | |
| "epoch": 37.888888888888886, | |
| "grad_norm": 1.6630192300364095, | |
| "kl": 0.051513671875, | |
| "learning_rate": 7.306705814893439e-07, | |
| "loss": 0.0613, | |
| "reward": 4.75, | |
| "reward_std": 3.510585069656372, | |
| "rewards/accuracy_reward_staging": 0.5625, | |
| "rewards/format_reward": 0.96875, | |
| "rewards/format_reward_staging": 0.96875, | |
| "step": 152 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 587.40625, | |
| "epoch": 38.22222222222222, | |
| "grad_norm": 1.5821951322432892, | |
| "kl": 0.0535888671875, | |
| "learning_rate": 7.269952498697734e-07, | |
| "loss": 0.0053, | |
| "reward": 3.78125, | |
| "reward_std": 2.4141127467155457, | |
| "rewards/accuracy_reward_staging": 0.4375, | |
| "rewards/format_reward": 0.75, | |
| "rewards/format_reward_staging": 0.84375, | |
| "step": 153 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 572.6875, | |
| "epoch": 38.44444444444444, | |
| "grad_norm": 2.2424195208633453, | |
| "kl": 0.07366943359375, | |
| "learning_rate": 7.233044034264033e-07, | |
| "loss": 0.0315, | |
| "reward": 3.84375, | |
| "reward_std": 2.3134855031967163, | |
| "rewards/accuracy_reward_staging": 0.4375, | |
| "rewards/format_reward": 0.75, | |
| "rewards/format_reward_staging": 0.90625, | |
| "step": 154 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 555.9375, | |
| "epoch": 38.666666666666664, | |
| "grad_norm": 1.529067187866317, | |
| "kl": 0.05157470703125, | |
| "learning_rate": 7.195982944236852e-07, | |
| "loss": 0.0321, | |
| "reward": 2.8125, | |
| "reward_std": 1.796603798866272, | |
| "rewards/accuracy_reward_staging": 0.1875, | |
| "rewards/format_reward": 0.9375, | |
| "rewards/format_reward_staging": 0.9375, | |
| "step": 155 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 630.0, | |
| "epoch": 38.888888888888886, | |
| "grad_norm": 1.579548286063579, | |
| "kl": 0.050537109375, | |
| "learning_rate": 7.158771761692464e-07, | |
| "loss": 0.0309, | |
| "reward": 4.28125, | |
| "reward_std": 2.8335397839546204, | |
| "rewards/accuracy_reward_staging": 0.5, | |
| "rewards/format_reward": 0.875, | |
| "rewards/format_reward_staging": 0.90625, | |
| "step": 156 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 563.0625, | |
| "epoch": 39.22222222222222, | |
| "grad_norm": 1.4963033786464435, | |
| "kl": 0.050048828125, | |
| "learning_rate": 7.121413029965769e-07, | |
| "loss": 0.0482, | |
| "reward": 3.8125, | |
| "reward_std": 2.3843142986297607, | |
| "rewards/accuracy_reward_staging": 0.40625, | |
| "rewards/format_reward": 0.84375, | |
| "rewards/format_reward_staging": 0.9375, | |
| "step": 157 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 576.59375, | |
| "epoch": 39.44444444444444, | |
| "grad_norm": 1.4775879396602463, | |
| "kl": 0.054443359375, | |
| "learning_rate": 7.083909302476452e-07, | |
| "loss": 0.0164, | |
| "reward": 3.71875, | |
| "reward_std": 1.9704924821853638, | |
| "rewards/accuracy_reward_staging": 0.40625, | |
| "rewards/format_reward": 0.78125, | |
| "rewards/format_reward_staging": 0.90625, | |
| "step": 158 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 555.8125, | |
| "epoch": 39.666666666666664, | |
| "grad_norm": 1.7649876199956425, | |
| "kl": 0.0699462890625, | |
| "learning_rate": 7.04626314255447e-07, | |
| "loss": 0.0019, | |
| "reward": 4.4375, | |
| "reward_std": 2.7981574535369873, | |
| "rewards/accuracy_reward_staging": 0.5, | |
| "rewards/format_reward": 0.9375, | |
| "rewards/format_reward_staging": 1.0, | |
| "step": 159 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 498.75, | |
| "epoch": 39.888888888888886, | |
| "grad_norm": 1.3915513369029784, | |
| "kl": 0.0543212890625, | |
| "learning_rate": 7.008477123264847e-07, | |
| "loss": 0.0433, | |
| "reward": 2.90625, | |
| "reward_std": 1.3342358469963074, | |
| "rewards/accuracy_reward_staging": 0.1875, | |
| "rewards/format_reward": 0.96875, | |
| "rewards/format_reward_staging": 1.0, | |
| "step": 160 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 569.53125, | |
| "epoch": 40.22222222222222, | |
| "grad_norm": 1.562840542671513, | |
| "kl": 0.053955078125, | |
| "learning_rate": 6.970553827231808e-07, | |
| "loss": 0.0164, | |
| "reward": 4.625, | |
| "reward_std": 2.55762779712677, | |
| "rewards/accuracy_reward_staging": 0.53125, | |
| "rewards/format_reward": 0.96875, | |
| "rewards/format_reward_staging": 1.0, | |
| "step": 161 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 538.3125, | |
| "epoch": 40.44444444444444, | |
| "grad_norm": 1.4692239574350316, | |
| "kl": 0.0526123046875, | |
| "learning_rate": 6.932495846462261e-07, | |
| "loss": -0.0164, | |
| "reward": 3.65625, | |
| "reward_std": 1.8189646005630493, | |
| "rewards/accuracy_reward_staging": 0.34375, | |
| "rewards/format_reward": 0.96875, | |
| "rewards/format_reward_staging": 0.96875, | |
| "step": 162 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 531.875, | |
| "epoch": 40.666666666666664, | |
| "grad_norm": 1.5332016106483515, | |
| "kl": 0.05316162109375, | |
| "learning_rate": 6.894305782168638e-07, | |
| "loss": -0.0429, | |
| "reward": 4.3125, | |
| "reward_std": 2.5211293697357178, | |
| "rewards/accuracy_reward_staging": 0.46875, | |
| "rewards/format_reward": 0.96875, | |
| "rewards/format_reward_staging": 1.0, | |
| "step": 163 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 545.125, | |
| "epoch": 40.888888888888886, | |
| "grad_norm": 13.136996472534078, | |
| "kl": 0.11846923828125, | |
| "learning_rate": 6.855986244591103e-07, | |
| "loss": -0.0235, | |
| "reward": 3.28125, | |
| "reward_std": 2.338345527648926, | |
| "rewards/accuracy_reward_staging": 0.3125, | |
| "rewards/format_reward": 0.8125, | |
| "rewards/format_reward_staging": 0.90625, | |
| "step": 164 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 543.40625, | |
| "epoch": 41.22222222222222, | |
| "grad_norm": 1.3242743159265937, | |
| "kl": 0.04998779296875, | |
| "learning_rate": 6.817539852819148e-07, | |
| "loss": 0.0115, | |
| "reward": 3.1875, | |
| "reward_std": 1.375, | |
| "rewards/accuracy_reward_staging": 0.25, | |
| "rewards/format_reward": 0.96875, | |
| "rewards/format_reward_staging": 0.96875, | |
| "step": 165 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 520.1875, | |
| "epoch": 41.44444444444444, | |
| "grad_norm": 1.2491437366023406, | |
| "kl": 0.05328369140625, | |
| "learning_rate": 6.778969234612583e-07, | |
| "loss": 0.0198, | |
| "reward": 4.84375, | |
| "reward_std": 1.7444601655006409, | |
| "rewards/accuracy_reward_staging": 0.59375, | |
| "rewards/format_reward": 0.90625, | |
| "rewards/format_reward_staging": 0.96875, | |
| "step": 166 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 619.59375, | |
| "epoch": 41.666666666666664, | |
| "grad_norm": 1.6792411977674075, | |
| "kl": 0.05487060546875, | |
| "learning_rate": 6.740277026221922e-07, | |
| "loss": 0.011, | |
| "reward": 3.21875, | |
| "reward_std": 2.509488582611084, | |
| "rewards/accuracy_reward_staging": 0.28125, | |
| "rewards/format_reward": 0.84375, | |
| "rewards/format_reward_staging": 0.96875, | |
| "step": 167 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 539.125, | |
| "epoch": 41.888888888888886, | |
| "grad_norm": 2.61987913810964, | |
| "kl": 0.08526611328125, | |
| "learning_rate": 6.701465872208216e-07, | |
| "loss": 0.0355, | |
| "reward": 5.71875, | |
| "reward_std": 2.992280900478363, | |
| "rewards/accuracy_reward_staging": 0.78125, | |
| "rewards/format_reward": 0.8125, | |
| "rewards/format_reward_staging": 1.0, | |
| "step": 168 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 601.65625, | |
| "epoch": 42.22222222222222, | |
| "grad_norm": 1.5317010015458066, | |
| "kl": 0.0543212890625, | |
| "learning_rate": 6.662538425262284e-07, | |
| "loss": -0.0412, | |
| "reward": 3.75, | |
| "reward_std": 2.802945911884308, | |
| "rewards/accuracy_reward_staging": 0.375, | |
| "rewards/format_reward": 0.90625, | |
| "rewards/format_reward_staging": 0.96875, | |
| "step": 169 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 607.5, | |
| "epoch": 42.44444444444444, | |
| "grad_norm": 1.5445749846218586, | |
| "kl": 0.05462646484375, | |
| "learning_rate": 6.623497346023417e-07, | |
| "loss": -0.0053, | |
| "reward": 3.0625, | |
| "reward_std": 1.4321783781051636, | |
| "rewards/accuracy_reward_staging": 0.25, | |
| "rewards/format_reward": 0.90625, | |
| "rewards/format_reward_staging": 0.90625, | |
| "step": 170 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 531.53125, | |
| "epoch": 42.666666666666664, | |
| "grad_norm": 1.727090236953967, | |
| "kl": 0.05303955078125, | |
| "learning_rate": 6.584345302897522e-07, | |
| "loss": 0.0752, | |
| "reward": 4.9375, | |
| "reward_std": 2.6843830347061157, | |
| "rewards/accuracy_reward_staging": 0.59375, | |
| "rewards/format_reward": 0.96875, | |
| "rewards/format_reward_staging": 1.0, | |
| "step": 171 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 511.125, | |
| "epoch": 42.888888888888886, | |
| "grad_norm": 1.463526052255072, | |
| "kl": 0.05108642578125, | |
| "learning_rate": 6.545084971874736e-07, | |
| "loss": -0.0218, | |
| "reward": 4.28125, | |
| "reward_std": 2.3289482593536377, | |
| "rewards/accuracy_reward_staging": 0.46875, | |
| "rewards/format_reward": 0.9375, | |
| "rewards/format_reward_staging": 1.0, | |
| "step": 172 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 591.46875, | |
| "epoch": 43.22222222222222, | |
| "grad_norm": 1.6155726361043279, | |
| "kl": 0.06103515625, | |
| "learning_rate": 6.505719036346537e-07, | |
| "loss": 0.0385, | |
| "reward": 3.3125, | |
| "reward_std": 2.2124131619930267, | |
| "rewards/accuracy_reward_staging": 0.3125, | |
| "rewards/format_reward": 0.84375, | |
| "rewards/format_reward_staging": 0.90625, | |
| "step": 173 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 570.96875, | |
| "epoch": 43.44444444444444, | |
| "grad_norm": 1.3751712901264466, | |
| "kl": 0.0545654296875, | |
| "learning_rate": 6.466250186922324e-07, | |
| "loss": 0.0063, | |
| "reward": 3.1875, | |
| "reward_std": 2.130874752998352, | |
| "rewards/accuracy_reward_staging": 0.3125, | |
| "rewards/format_reward": 0.78125, | |
| "rewards/format_reward_staging": 0.84375, | |
| "step": 174 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 545.71875, | |
| "epoch": 43.666666666666664, | |
| "grad_norm": 1.4756595692040109, | |
| "kl": 0.059326171875, | |
| "learning_rate": 6.426681121245527e-07, | |
| "loss": -0.0295, | |
| "reward": 3.59375, | |
| "reward_std": 2.3869778215885162, | |
| "rewards/accuracy_reward_staging": 0.34375, | |
| "rewards/format_reward": 0.9375, | |
| "rewards/format_reward_staging": 0.9375, | |
| "step": 175 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 595.1875, | |
| "epoch": 43.888888888888886, | |
| "grad_norm": 1.4156928353056575, | |
| "kl": 0.050048828125, | |
| "learning_rate": 6.387014543809223e-07, | |
| "loss": -0.0245, | |
| "reward": 3.625, | |
| "reward_std": 2.184383064508438, | |
| "rewards/accuracy_reward_staging": 0.34375, | |
| "rewards/format_reward": 0.96875, | |
| "rewards/format_reward_staging": 0.9375, | |
| "step": 176 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 549.28125, | |
| "epoch": 44.22222222222222, | |
| "grad_norm": 1.6625979237822903, | |
| "kl": 0.05389404296875, | |
| "learning_rate": 6.347253165771289e-07, | |
| "loss": 0.0393, | |
| "reward": 4.34375, | |
| "reward_std": 2.0728103518486023, | |
| "rewards/accuracy_reward_staging": 0.46875, | |
| "rewards/format_reward": 1.0, | |
| "rewards/format_reward_staging": 1.0, | |
| "step": 177 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 616.4375, | |
| "epoch": 44.44444444444444, | |
| "grad_norm": 0.9016620089051227, | |
| "kl": 0.04852294921875, | |
| "learning_rate": 6.307399704769098e-07, | |
| "loss": 0.0327, | |
| "reward": 3.3125, | |
| "reward_std": 1.9239110946655273, | |
| "rewards/accuracy_reward_staging": 0.28125, | |
| "rewards/format_reward": 0.9375, | |
| "rewards/format_reward_staging": 0.96875, | |
| "step": 178 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 554.40625, | |
| "epoch": 44.666666666666664, | |
| "grad_norm": 1.4445648538792832, | |
| "kl": 0.06365966796875, | |
| "learning_rate": 6.26745688473377e-07, | |
| "loss": 0.0527, | |
| "reward": 2.90625, | |
| "reward_std": 1.2700245678424835, | |
| "rewards/accuracy_reward_staging": 0.21875, | |
| "rewards/format_reward": 0.875, | |
| "rewards/format_reward_staging": 0.9375, | |
| "step": 179 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 596.6875, | |
| "epoch": 44.888888888888886, | |
| "grad_norm": 1.5856705116806837, | |
| "kl": 0.06280517578125, | |
| "learning_rate": 6.227427435703995e-07, | |
| "loss": 0.0488, | |
| "reward": 3.59375, | |
| "reward_std": 2.1598991453647614, | |
| "rewards/accuracy_reward_staging": 0.34375, | |
| "rewards/format_reward": 0.875, | |
| "rewards/format_reward_staging": 1.0, | |
| "step": 180 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 540.375, | |
| "epoch": 45.22222222222222, | |
| "grad_norm": 1.4832995345417785, | |
| "kl": 0.0472412109375, | |
| "learning_rate": 6.187314093639443e-07, | |
| "loss": 0.021, | |
| "reward": 3.8125, | |
| "reward_std": 2.2678900957107544, | |
| "rewards/accuracy_reward_staging": 0.375, | |
| "rewards/format_reward": 0.96875, | |
| "rewards/format_reward_staging": 0.96875, | |
| "step": 181 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 596.75, | |
| "epoch": 45.44444444444444, | |
| "grad_norm": 1.5871845074006228, | |
| "kl": 0.048828125, | |
| "learning_rate": 6.147119600233758e-07, | |
| "loss": -0.025, | |
| "reward": 4.40625, | |
| "reward_std": 2.732926845550537, | |
| "rewards/accuracy_reward_staging": 0.5, | |
| "rewards/format_reward": 0.96875, | |
| "rewards/format_reward_staging": 0.9375, | |
| "step": 182 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 636.21875, | |
| "epoch": 45.666666666666664, | |
| "grad_norm": 1.1682267885626447, | |
| "kl": 0.050537109375, | |
| "learning_rate": 6.106846702727172e-07, | |
| "loss": -0.0041, | |
| "reward": 3.5625, | |
| "reward_std": 1.9367179870605469, | |
| "rewards/accuracy_reward_staging": 0.34375, | |
| "rewards/format_reward": 0.875, | |
| "rewards/format_reward_staging": 0.96875, | |
| "step": 183 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 567.6875, | |
| "epoch": 45.888888888888886, | |
| "grad_norm": 1.182505436622169, | |
| "kl": 0.052490234375, | |
| "learning_rate": 6.066498153718734e-07, | |
| "loss": -0.0104, | |
| "reward": 3.96875, | |
| "reward_std": 1.8926886320114136, | |
| "rewards/accuracy_reward_staging": 0.40625, | |
| "rewards/format_reward": 0.96875, | |
| "rewards/format_reward_staging": 0.96875, | |
| "step": 184 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 556.84375, | |
| "epoch": 46.22222222222222, | |
| "grad_norm": 74.95843070592915, | |
| "kl": 0.51153564453125, | |
| "learning_rate": 6.026076710978171e-07, | |
| "loss": -0.0099, | |
| "reward": 4.03125, | |
| "reward_std": 2.5020731687545776, | |
| "rewards/accuracy_reward_staging": 0.4375, | |
| "rewards/format_reward": 0.90625, | |
| "rewards/format_reward_staging": 0.9375, | |
| "step": 185 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 635.9375, | |
| "epoch": 46.44444444444444, | |
| "grad_norm": 1.1802575443084546, | |
| "kl": 0.046630859375, | |
| "learning_rate": 5.985585137257401e-07, | |
| "loss": -0.0104, | |
| "reward": 3.75, | |
| "reward_std": 1.5358919501304626, | |
| "rewards/accuracy_reward_staging": 0.375, | |
| "rewards/format_reward": 0.9375, | |
| "rewards/format_reward_staging": 0.9375, | |
| "step": 186 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 584.625, | |
| "epoch": 46.666666666666664, | |
| "grad_norm": 1.540364554923698, | |
| "kl": 0.053955078125, | |
| "learning_rate": 5.945026200101702e-07, | |
| "loss": 0.0173, | |
| "reward": 3.71875, | |
| "reward_std": 2.7078438997268677, | |
| "rewards/accuracy_reward_staging": 0.375, | |
| "rewards/format_reward": 0.90625, | |
| "rewards/format_reward_staging": 0.9375, | |
| "step": 187 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 511.375, | |
| "epoch": 46.888888888888886, | |
| "grad_norm": 1.3487938182691792, | |
| "kl": 0.05859375, | |
| "learning_rate": 5.90440267166055e-07, | |
| "loss": 0.0363, | |
| "reward": 3.125, | |
| "reward_std": 2.2170365154743195, | |
| "rewards/accuracy_reward_staging": 0.28125, | |
| "rewards/format_reward": 0.75, | |
| "rewards/format_reward_staging": 0.96875, | |
| "step": 188 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 536.40625, | |
| "epoch": 47.22222222222222, | |
| "grad_norm": 1.7030200868844614, | |
| "kl": 0.054931640625, | |
| "learning_rate": 5.863717328498152e-07, | |
| "loss": 0.0328, | |
| "reward": 3.84375, | |
| "reward_std": 2.070079743862152, | |
| "rewards/accuracy_reward_staging": 0.40625, | |
| "rewards/format_reward": 0.84375, | |
| "rewards/format_reward_staging": 0.96875, | |
| "step": 189 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 555.875, | |
| "epoch": 47.44444444444444, | |
| "grad_norm": 1.7566836455673576, | |
| "kl": 0.05218505859375, | |
| "learning_rate": 5.82297295140367e-07, | |
| "loss": -0.0381, | |
| "reward": 3.75, | |
| "reward_std": 2.009314328432083, | |
| "rewards/accuracy_reward_staging": 0.375, | |
| "rewards/format_reward": 0.90625, | |
| "rewards/format_reward_staging": 0.96875, | |
| "step": 190 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 545.8125, | |
| "epoch": 47.666666666666664, | |
| "grad_norm": 1.594063347049537, | |
| "kl": 0.05426025390625, | |
| "learning_rate": 5.782172325201155e-07, | |
| "loss": 0.0535, | |
| "reward": 3.21875, | |
| "reward_std": 1.7700316905975342, | |
| "rewards/accuracy_reward_staging": 0.25, | |
| "rewards/format_reward": 0.96875, | |
| "rewards/format_reward_staging": 1.0, | |
| "step": 191 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 622.8125, | |
| "epoch": 47.888888888888886, | |
| "grad_norm": 1.5439318867500331, | |
| "kl": 0.04937744140625, | |
| "learning_rate": 5.741318238559209e-07, | |
| "loss": -0.0012, | |
| "reward": 4.75, | |
| "reward_std": 2.4349581599235535, | |
| "rewards/accuracy_reward_staging": 0.5625, | |
| "rewards/format_reward": 0.9375, | |
| "rewards/format_reward_staging": 1.0, | |
| "step": 192 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 583.75, | |
| "epoch": 48.22222222222222, | |
| "grad_norm": 2.5201319810344454, | |
| "kl": 0.0770263671875, | |
| "learning_rate": 5.700413483800389e-07, | |
| "loss": -0.0762, | |
| "reward": 3.4375, | |
| "reward_std": 1.82216876745224, | |
| "rewards/accuracy_reward_staging": 0.34375, | |
| "rewards/format_reward": 0.8125, | |
| "rewards/format_reward_staging": 0.90625, | |
| "step": 193 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 599.59375, | |
| "epoch": 48.44444444444444, | |
| "grad_norm": 1.473198815087056, | |
| "kl": 0.05352783203125, | |
| "learning_rate": 5.659460856710345e-07, | |
| "loss": -0.0055, | |
| "reward": 3.5625, | |
| "reward_std": 1.9599019289016724, | |
| "rewards/accuracy_reward_staging": 0.34375, | |
| "rewards/format_reward": 0.90625, | |
| "rewards/format_reward_staging": 0.9375, | |
| "step": 194 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 569.75, | |
| "epoch": 48.666666666666664, | |
| "grad_norm": 1.6168573027198114, | |
| "kl": 0.05010986328125, | |
| "learning_rate": 5.618463156346739e-07, | |
| "loss": -0.0075, | |
| "reward": 4.21875, | |
| "reward_std": 1.739636391401291, | |
| "rewards/accuracy_reward_staging": 0.46875, | |
| "rewards/format_reward": 0.9375, | |
| "rewards/format_reward_staging": 0.9375, | |
| "step": 195 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 577.375, | |
| "epoch": 48.888888888888886, | |
| "grad_norm": 1.5839729942600627, | |
| "kl": 0.04180908203125, | |
| "learning_rate": 5.577423184847931e-07, | |
| "loss": 0.0086, | |
| "reward": 3.875, | |
| "reward_std": 2.332531690597534, | |
| "rewards/accuracy_reward_staging": 0.375, | |
| "rewards/format_reward": 1.0, | |
| "rewards/format_reward_staging": 1.0, | |
| "step": 196 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 532.96875, | |
| "epoch": 49.22222222222222, | |
| "grad_norm": 1.5767088515541903, | |
| "kl": 0.04962158203125, | |
| "learning_rate": 5.536343747241459e-07, | |
| "loss": 0.0159, | |
| "reward": 4.15625, | |
| "reward_std": 1.9809716939926147, | |
| "rewards/accuracy_reward_staging": 0.4375, | |
| "rewards/format_reward": 0.96875, | |
| "rewards/format_reward_staging": 1.0, | |
| "step": 197 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 578.28125, | |
| "epoch": 49.44444444444444, | |
| "grad_norm": 1.3049917889915577, | |
| "kl": 0.04583740234375, | |
| "learning_rate": 5.495227651252315e-07, | |
| "loss": 0.0386, | |
| "reward": 4.53125, | |
| "reward_std": 1.7373294830322266, | |
| "rewards/accuracy_reward_staging": 0.53125, | |
| "rewards/format_reward": 0.90625, | |
| "rewards/format_reward_staging": 0.96875, | |
| "step": 198 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 546.0625, | |
| "epoch": 49.666666666666664, | |
| "grad_norm": 1.3164741992532543, | |
| "kl": 0.0504150390625, | |
| "learning_rate": 5.454077707111041e-07, | |
| "loss": 0.0142, | |
| "reward": 4.65625, | |
| "reward_std": 1.945079743862152, | |
| "rewards/accuracy_reward_staging": 0.53125, | |
| "rewards/format_reward": 1.0, | |
| "rewards/format_reward_staging": 1.0, | |
| "step": 199 | |
| }, | |
| { | |
| "epoch": 49.888888888888886, | |
| "grad_norm": 1.3350172420738584, | |
| "learning_rate": 5.412896727361662e-07, | |
| "loss": 0.0656, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 49.888888888888886, | |
| "eval_clip_ratio": 0.0, | |
| "eval_completion_length": 600.85, | |
| "eval_kl": 0.047802734375, | |
| "eval_loss": 0.025471828877925873, | |
| "eval_reward": 2.6, | |
| "eval_reward_std": 1.3353363513946532, | |
| "eval_rewards/accuracy_reward_staging": 0.15, | |
| "eval_rewards/format_reward": 0.9, | |
| "eval_rewards/format_reward_staging": 0.95, | |
| "eval_runtime": 52.2669, | |
| "eval_samples_per_second": 0.689, | |
| "eval_steps_per_second": 0.096, | |
| "step": 200 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 614.453125, | |
| "epoch": 50.22222222222222, | |
| "grad_norm": 1.284721283794701, | |
| "kl": 0.05389404296875, | |
| "learning_rate": 5.371687526669439e-07, | |
| "loss": 0.0086, | |
| "reward": 3.421875, | |
| "reward_std": 2.202674761414528, | |
| "rewards/accuracy_reward_staging": 0.34375, | |
| "rewards/format_reward": 0.8125, | |
| "rewards/format_reward_staging": 0.890625, | |
| "step": 201 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 545.53125, | |
| "epoch": 50.44444444444444, | |
| "grad_norm": 1.235503506247465, | |
| "kl": 0.0528564453125, | |
| "learning_rate": 5.330452921628497e-07, | |
| "loss": -0.0137, | |
| "reward": 3.5625, | |
| "reward_std": 1.246154248714447, | |
| "rewards/accuracy_reward_staging": 0.34375, | |
| "rewards/format_reward": 0.875, | |
| "rewards/format_reward_staging": 0.96875, | |
| "step": 202 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 560.25, | |
| "epoch": 50.666666666666664, | |
| "grad_norm": 1.9492031211380043, | |
| "kl": 0.0654296875, | |
| "learning_rate": 5.28919573056932e-07, | |
| "loss": -0.049, | |
| "reward": 4.28125, | |
| "reward_std": 2.934589922428131, | |
| "rewards/accuracy_reward_staging": 0.5, | |
| "rewards/format_reward": 0.875, | |
| "rewards/format_reward_staging": 0.90625, | |
| "step": 203 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 599.3125, | |
| "epoch": 50.888888888888886, | |
| "grad_norm": 1.567290502007258, | |
| "kl": 0.04364013671875, | |
| "learning_rate": 5.247918773366111e-07, | |
| "loss": 0.0937, | |
| "reward": 3.875, | |
| "reward_std": 1.930722177028656, | |
| "rewards/accuracy_reward_staging": 0.40625, | |
| "rewards/format_reward": 0.90625, | |
| "rewards/format_reward_staging": 0.9375, | |
| "step": 204 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 562.6875, | |
| "epoch": 51.22222222222222, | |
| "grad_norm": 1.4477817793212922, | |
| "kl": 0.05084228515625, | |
| "learning_rate": 5.206624871244065e-07, | |
| "loss": 0.0148, | |
| "reward": 2.90625, | |
| "reward_std": 1.4091877937316895, | |
| "rewards/accuracy_reward_staging": 0.1875, | |
| "rewards/format_reward": 0.96875, | |
| "rewards/format_reward_staging": 1.0, | |
| "step": 205 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 605.625, | |
| "epoch": 51.44444444444444, | |
| "grad_norm": 1.5674813338685252, | |
| "kl": 0.04931640625, | |
| "learning_rate": 5.165316846586541e-07, | |
| "loss": 0.0963, | |
| "reward": 3.125, | |
| "reward_std": 2.1649354100227356, | |
| "rewards/accuracy_reward_staging": 0.28125, | |
| "rewards/format_reward": 0.78125, | |
| "rewards/format_reward_staging": 0.9375, | |
| "step": 206 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 572.5625, | |
| "epoch": 51.666666666666664, | |
| "grad_norm": 1.521375079838418, | |
| "kl": 0.046875, | |
| "learning_rate": 5.123997522742151e-07, | |
| "loss": 0.0215, | |
| "reward": 3.71875, | |
| "reward_std": 2.047757565975189, | |
| "rewards/accuracy_reward_staging": 0.375, | |
| "rewards/format_reward": 0.90625, | |
| "rewards/format_reward_staging": 0.9375, | |
| "step": 207 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 524.4375, | |
| "epoch": 51.888888888888886, | |
| "grad_norm": 1.637742061840183, | |
| "kl": 0.04779052734375, | |
| "learning_rate": 5.082669723831793e-07, | |
| "loss": -0.0249, | |
| "reward": 3.59375, | |
| "reward_std": 2.858625650405884, | |
| "rewards/accuracy_reward_staging": 0.34375, | |
| "rewards/format_reward": 0.90625, | |
| "rewards/format_reward_staging": 0.96875, | |
| "step": 208 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 566.875, | |
| "epoch": 52.22222222222222, | |
| "grad_norm": 1.5832843072882397, | |
| "kl": 0.04449462890625, | |
| "learning_rate": 5.041336274555625e-07, | |
| "loss": -0.063, | |
| "reward": 2.84375, | |
| "reward_std": 1.2771694660186768, | |
| "rewards/accuracy_reward_staging": 0.1875, | |
| "rewards/format_reward": 0.9375, | |
| "rewards/format_reward_staging": 0.96875, | |
| "step": 209 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 576.0625, | |
| "epoch": 52.44444444444444, | |
| "grad_norm": 1.5508316387978383, | |
| "kl": 0.06103515625, | |
| "learning_rate": 5e-07, | |
| "loss": -0.0291, | |
| "reward": 4.0, | |
| "reward_std": 2.082531690597534, | |
| "rewards/accuracy_reward_staging": 0.40625, | |
| "rewards/format_reward": 0.96875, | |
| "rewards/format_reward_staging": 1.0, | |
| "step": 210 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 611.71875, | |
| "epoch": 52.666666666666664, | |
| "grad_norm": 1.6164552079690877, | |
| "kl": 0.04437255859375, | |
| "learning_rate": 4.958663725444375e-07, | |
| "loss": 0.0102, | |
| "reward": 4.40625, | |
| "reward_std": 2.5580477714538574, | |
| "rewards/accuracy_reward_staging": 0.5, | |
| "rewards/format_reward": 0.9375, | |
| "rewards/format_reward_staging": 0.96875, | |
| "step": 211 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 563.3125, | |
| "epoch": 52.888888888888886, | |
| "grad_norm": 1.5726439650006456, | |
| "kl": 0.05096435546875, | |
| "learning_rate": 4.917330276168208e-07, | |
| "loss": -0.0031, | |
| "reward": 4.96875, | |
| "reward_std": 2.3175911903381348, | |
| "rewards/accuracy_reward_staging": 0.625, | |
| "rewards/format_reward": 0.84375, | |
| "rewards/format_reward_staging": 1.0, | |
| "step": 212 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 616.375, | |
| "epoch": 53.22222222222222, | |
| "grad_norm": 1.7880025106936461, | |
| "kl": 0.04498291015625, | |
| "learning_rate": 4.87600247725785e-07, | |
| "loss": 0.066, | |
| "reward": 3.1875, | |
| "reward_std": 1.891027882695198, | |
| "rewards/accuracy_reward_staging": 0.28125, | |
| "rewards/format_reward": 0.875, | |
| "rewards/format_reward_staging": 0.90625, | |
| "step": 213 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 556.6875, | |
| "epoch": 53.44444444444444, | |
| "grad_norm": 2.0232137942713573, | |
| "kl": 0.0498046875, | |
| "learning_rate": 4.834683153413459e-07, | |
| "loss": 0.0311, | |
| "reward": 3.5625, | |
| "reward_std": 1.6434174478054047, | |
| "rewards/accuracy_reward_staging": 0.34375, | |
| "rewards/format_reward": 0.875, | |
| "rewards/format_reward_staging": 0.96875, | |
| "step": 214 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 555.09375, | |
| "epoch": 53.666666666666664, | |
| "grad_norm": 1.4253180533139413, | |
| "kl": 0.0416259765625, | |
| "learning_rate": 4.793375128755933e-07, | |
| "loss": -0.0401, | |
| "reward": 4.03125, | |
| "reward_std": 2.570079743862152, | |
| "rewards/accuracy_reward_staging": 0.40625, | |
| "rewards/format_reward": 1.0, | |
| "rewards/format_reward_staging": 1.0, | |
| "step": 215 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 597.4375, | |
| "epoch": 53.888888888888886, | |
| "grad_norm": 1.740974206086713, | |
| "kl": 0.04815673828125, | |
| "learning_rate": 4.752081226633888e-07, | |
| "loss": -0.038, | |
| "reward": 4.34375, | |
| "reward_std": 2.6059716939926147, | |
| "rewards/accuracy_reward_staging": 0.5, | |
| "rewards/format_reward": 0.875, | |
| "rewards/format_reward_staging": 0.96875, | |
| "step": 216 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 564.0625, | |
| "epoch": 54.22222222222222, | |
| "grad_norm": 1.6032171003103113, | |
| "kl": 0.05572509765625, | |
| "learning_rate": 4.71080426943068e-07, | |
| "loss": 0.0092, | |
| "reward": 3.0625, | |
| "reward_std": 1.996816635131836, | |
| "rewards/accuracy_reward_staging": 0.28125, | |
| "rewards/format_reward": 0.75, | |
| "rewards/format_reward_staging": 0.90625, | |
| "step": 217 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 530.03125, | |
| "epoch": 54.44444444444444, | |
| "grad_norm": 1.3127356050754018, | |
| "kl": 0.0540771484375, | |
| "learning_rate": 4.669547078371503e-07, | |
| "loss": -0.0245, | |
| "reward": 6.59375, | |
| "reward_std": 2.073159486055374, | |
| "rewards/accuracy_reward_staging": 0.9375, | |
| "rewards/format_reward": 0.9375, | |
| "rewards/format_reward_staging": 0.96875, | |
| "step": 218 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 608.3125, | |
| "epoch": 54.666666666666664, | |
| "grad_norm": 1.7016991990394592, | |
| "kl": 0.05010986328125, | |
| "learning_rate": 4.628312473330562e-07, | |
| "loss": 0.0702, | |
| "reward": 3.875, | |
| "reward_std": 2.482748866081238, | |
| "rewards/accuracy_reward_staging": 0.40625, | |
| "rewards/format_reward": 0.875, | |
| "rewards/format_reward_staging": 0.96875, | |
| "step": 219 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 549.375, | |
| "epoch": 54.888888888888886, | |
| "grad_norm": 1.3141608271515532, | |
| "kl": 0.04742431640625, | |
| "learning_rate": 4.5871032726383385e-07, | |
| "loss": 0.0552, | |
| "reward": 3.125, | |
| "reward_std": 1.3886407911777496, | |
| "rewards/accuracy_reward_staging": 0.25, | |
| "rewards/format_reward": 0.90625, | |
| "rewards/format_reward_staging": 0.96875, | |
| "step": 220 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 578.71875, | |
| "epoch": 55.22222222222222, | |
| "grad_norm": 1.3932263109990592, | |
| "kl": 0.04364013671875, | |
| "learning_rate": 4.5459222928889587e-07, | |
| "loss": 0.051, | |
| "reward": 3.71875, | |
| "reward_std": 1.7805703282356262, | |
| "rewards/accuracy_reward_staging": 0.375, | |
| "rewards/format_reward": 0.90625, | |
| "rewards/format_reward_staging": 0.9375, | |
| "step": 221 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 588.4375, | |
| "epoch": 55.44444444444444, | |
| "grad_norm": 1.5339621078239263, | |
| "kl": 0.04962158203125, | |
| "learning_rate": 4.5047723487476864e-07, | |
| "loss": -0.0216, | |
| "reward": 3.46875, | |
| "reward_std": 2.488185405731201, | |
| "rewards/accuracy_reward_staging": 0.3125, | |
| "rewards/format_reward": 0.90625, | |
| "rewards/format_reward_staging": 1.0, | |
| "step": 222 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 585.34375, | |
| "epoch": 55.666666666666664, | |
| "grad_norm": 1.6607509386936015, | |
| "kl": 0.04962158203125, | |
| "learning_rate": 4.463656252758542e-07, | |
| "loss": 0.0452, | |
| "reward": 3.8125, | |
| "reward_std": 2.171033263206482, | |
| "rewards/accuracy_reward_staging": 0.40625, | |
| "rewards/format_reward": 0.84375, | |
| "rewards/format_reward_staging": 0.9375, | |
| "step": 223 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 631.75, | |
| "epoch": 55.888888888888886, | |
| "grad_norm": 1.5614778713632624, | |
| "kl": 0.04669189453125, | |
| "learning_rate": 4.4225768151520694e-07, | |
| "loss": 0.0801, | |
| "reward": 3.5625, | |
| "reward_std": 2.430722177028656, | |
| "rewards/accuracy_reward_staging": 0.34375, | |
| "rewards/format_reward": 0.90625, | |
| "rewards/format_reward_staging": 0.9375, | |
| "step": 224 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 613.75, | |
| "epoch": 56.22222222222222, | |
| "grad_norm": 1.5004046938088074, | |
| "kl": 0.05950927734375, | |
| "learning_rate": 4.381536843653261e-07, | |
| "loss": 0.0698, | |
| "reward": 3.59375, | |
| "reward_std": 2.5734615325927734, | |
| "rewards/accuracy_reward_staging": 0.375, | |
| "rewards/format_reward": 0.78125, | |
| "rewards/format_reward_staging": 0.9375, | |
| "step": 225 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 565.375, | |
| "epoch": 56.44444444444444, | |
| "grad_norm": 1.3766714019303354, | |
| "kl": 0.04168701171875, | |
| "learning_rate": 4.340539143289655e-07, | |
| "loss": 0.0233, | |
| "reward": 3.5, | |
| "reward_std": 2.0, | |
| "rewards/accuracy_reward_staging": 0.3125, | |
| "rewards/format_reward": 0.96875, | |
| "rewards/format_reward_staging": 0.96875, | |
| "step": 226 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 553.125, | |
| "epoch": 56.666666666666664, | |
| "grad_norm": 1.307050706736634, | |
| "kl": 0.05133056640625, | |
| "learning_rate": 4.2995865161996104e-07, | |
| "loss": 0.0181, | |
| "reward": 4.0625, | |
| "reward_std": 2.421202301979065, | |
| "rewards/accuracy_reward_staging": 0.4375, | |
| "rewards/format_reward": 0.90625, | |
| "rewards/format_reward_staging": 0.96875, | |
| "step": 227 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 516.4375, | |
| "epoch": 56.888888888888886, | |
| "grad_norm": 1.5405733998671278, | |
| "kl": 0.0562744140625, | |
| "learning_rate": 4.258681761440789e-07, | |
| "loss": 0.0017, | |
| "reward": 4.03125, | |
| "reward_std": 2.49512779712677, | |
| "rewards/accuracy_reward_staging": 0.40625, | |
| "rewards/format_reward": 1.0, | |
| "rewards/format_reward_staging": 1.0, | |
| "step": 228 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 621.84375, | |
| "epoch": 57.22222222222222, | |
| "grad_norm": 1.606949877632979, | |
| "kl": 0.044189453125, | |
| "learning_rate": 4.2178276747988444e-07, | |
| "loss": -0.0076, | |
| "reward": 4.3125, | |
| "reward_std": 2.390491783618927, | |
| "rewards/accuracy_reward_staging": 0.5, | |
| "rewards/format_reward": 0.90625, | |
| "rewards/format_reward_staging": 0.90625, | |
| "step": 229 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 597.84375, | |
| "epoch": 57.44444444444444, | |
| "grad_norm": 1.5411205221206894, | |
| "kl": 0.0574951171875, | |
| "learning_rate": 4.1770270485963294e-07, | |
| "loss": -0.0387, | |
| "reward": 3.125, | |
| "reward_std": 2.1638975143432617, | |
| "rewards/accuracy_reward_staging": 0.25, | |
| "rewards/format_reward": 0.90625, | |
| "rewards/format_reward_staging": 0.96875, | |
| "step": 230 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 603.1875, | |
| "epoch": 57.666666666666664, | |
| "grad_norm": 1.3383276534008064, | |
| "kl": 0.04473876953125, | |
| "learning_rate": 4.1362826715018497e-07, | |
| "loss": 0.0122, | |
| "reward": 3.6875, | |
| "reward_std": 1.9202269613742828, | |
| "rewards/accuracy_reward_staging": 0.375, | |
| "rewards/format_reward": 0.90625, | |
| "rewards/format_reward_staging": 0.90625, | |
| "step": 231 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 552.25, | |
| "epoch": 57.888888888888886, | |
| "grad_norm": 1.7484795613881616, | |
| "kl": 0.06341552734375, | |
| "learning_rate": 4.095597328339452e-07, | |
| "loss": -0.0426, | |
| "reward": 4.46875, | |
| "reward_std": 2.5560158491134644, | |
| "rewards/accuracy_reward_staging": 0.53125, | |
| "rewards/format_reward": 0.84375, | |
| "rewards/format_reward_staging": 0.96875, | |
| "step": 232 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 585.125, | |
| "epoch": 58.22222222222222, | |
| "grad_norm": 1.5442704440086175, | |
| "kl": 0.05377197265625, | |
| "learning_rate": 4.0549737998982994e-07, | |
| "loss": -0.0062, | |
| "reward": 3.65625, | |
| "reward_std": 2.2512659430503845, | |
| "rewards/accuracy_reward_staging": 0.34375, | |
| "rewards/format_reward": 0.96875, | |
| "rewards/format_reward_staging": 0.96875, | |
| "step": 233 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 601.59375, | |
| "epoch": 58.44444444444444, | |
| "grad_norm": 1.3070749287077408, | |
| "kl": 0.05706787109375, | |
| "learning_rate": 4.0144148627425986e-07, | |
| "loss": 0.0357, | |
| "reward": 4.5625, | |
| "reward_std": 2.173893690109253, | |
| "rewards/accuracy_reward_staging": 0.53125, | |
| "rewards/format_reward": 0.90625, | |
| "rewards/format_reward_staging": 1.0, | |
| "step": 234 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 549.25, | |
| "epoch": 58.666666666666664, | |
| "grad_norm": 1.568215525888831, | |
| "kl": 0.04644775390625, | |
| "learning_rate": 3.973923289021829e-07, | |
| "loss": -0.0236, | |
| "reward": 3.375, | |
| "reward_std": 2.125, | |
| "rewards/accuracy_reward_staging": 0.28125, | |
| "rewards/format_reward": 0.96875, | |
| "rewards/format_reward_staging": 1.0, | |
| "step": 235 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 559.875, | |
| "epoch": 58.888888888888886, | |
| "grad_norm": 1.247655763308189, | |
| "kl": 0.05523681640625, | |
| "learning_rate": 3.9335018462812664e-07, | |
| "loss": 0.0335, | |
| "reward": 4.40625, | |
| "reward_std": 1.7515006065368652, | |
| "rewards/accuracy_reward_staging": 0.5, | |
| "rewards/format_reward": 0.9375, | |
| "rewards/format_reward_staging": 0.96875, | |
| "step": 236 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 608.75, | |
| "epoch": 59.22222222222222, | |
| "grad_norm": 1.4876134624852135, | |
| "kl": 0.05291748046875, | |
| "learning_rate": 3.893153297272828e-07, | |
| "loss": 0.0246, | |
| "reward": 3.28125, | |
| "reward_std": 1.5280899405479431, | |
| "rewards/accuracy_reward_staging": 0.3125, | |
| "rewards/format_reward": 0.78125, | |
| "rewards/format_reward_staging": 0.9375, | |
| "step": 237 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 583.0, | |
| "epoch": 59.44444444444444, | |
| "grad_norm": 1.6243663627358595, | |
| "kl": 0.04718017578125, | |
| "learning_rate": 3.8528803997662423e-07, | |
| "loss": -0.0226, | |
| "reward": 4.5625, | |
| "reward_std": 2.9370444416999817, | |
| "rewards/accuracy_reward_staging": 0.53125, | |
| "rewards/format_reward": 0.9375, | |
| "rewards/format_reward_staging": 0.96875, | |
| "step": 238 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 584.90625, | |
| "epoch": 59.666666666666664, | |
| "grad_norm": 1.579154131750563, | |
| "kl": 0.05328369140625, | |
| "learning_rate": 3.812685906360557e-07, | |
| "loss": -0.0118, | |
| "reward": 3.5625, | |
| "reward_std": 1.8252411782741547, | |
| "rewards/accuracy_reward_staging": 0.34375, | |
| "rewards/format_reward": 0.875, | |
| "rewards/format_reward_staging": 0.96875, | |
| "step": 239 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 602.875, | |
| "epoch": 59.888888888888886, | |
| "grad_norm": 1.6568742956735238, | |
| "kl": 0.05029296875, | |
| "learning_rate": 3.772572564296004e-07, | |
| "loss": 0.0049, | |
| "reward": 4.21875, | |
| "reward_std": 2.6711304783821106, | |
| "rewards/accuracy_reward_staging": 0.46875, | |
| "rewards/format_reward": 0.90625, | |
| "rewards/format_reward_staging": 0.96875, | |
| "step": 240 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 527.71875, | |
| "epoch": 60.22222222222222, | |
| "grad_norm": 1.5110623474715636, | |
| "kl": 0.05316162109375, | |
| "learning_rate": 3.7325431152662294e-07, | |
| "loss": 0.004, | |
| "reward": 3.65625, | |
| "reward_std": 2.44047012925148, | |
| "rewards/accuracy_reward_staging": 0.34375, | |
| "rewards/format_reward": 0.96875, | |
| "rewards/format_reward_staging": 0.96875, | |
| "step": 241 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 607.71875, | |
| "epoch": 60.44444444444444, | |
| "grad_norm": 1.5588742571636938, | |
| "kl": 0.05126953125, | |
| "learning_rate": 3.692600295230901e-07, | |
| "loss": 0.0174, | |
| "reward": 4.125, | |
| "reward_std": 2.93262779712677, | |
| "rewards/accuracy_reward_staging": 0.4375, | |
| "rewards/format_reward": 0.96875, | |
| "rewards/format_reward_staging": 0.96875, | |
| "step": 242 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 598.375, | |
| "epoch": 60.666666666666664, | |
| "grad_norm": 1.4200468362196192, | |
| "kl": 0.05487060546875, | |
| "learning_rate": 3.6527468342287096e-07, | |
| "loss": 0.1256, | |
| "reward": 3.8125, | |
| "reward_std": 2.782258152961731, | |
| "rewards/accuracy_reward_staging": 0.40625, | |
| "rewards/format_reward": 0.875, | |
| "rewards/format_reward_staging": 0.90625, | |
| "step": 243 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 598.9375, | |
| "epoch": 60.888888888888886, | |
| "grad_norm": 2.112230364965324, | |
| "kl": 0.06414794921875, | |
| "learning_rate": 3.612985456190778e-07, | |
| "loss": -0.0099, | |
| "reward": 4.0625, | |
| "reward_std": 2.503733992576599, | |
| "rewards/accuracy_reward_staging": 0.4375, | |
| "rewards/format_reward": 0.875, | |
| "rewards/format_reward_staging": 1.0, | |
| "step": 244 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 580.84375, | |
| "epoch": 61.22222222222222, | |
| "grad_norm": 1.5256554050376716, | |
| "kl": 0.0540771484375, | |
| "learning_rate": 3.5733188787544746e-07, | |
| "loss": 0.0285, | |
| "reward": 3.75, | |
| "reward_std": 2.553140878677368, | |
| "rewards/accuracy_reward_staging": 0.375, | |
| "rewards/format_reward": 0.90625, | |
| "rewards/format_reward_staging": 0.96875, | |
| "step": 245 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 553.53125, | |
| "epoch": 61.44444444444444, | |
| "grad_norm": 1.5714805767176323, | |
| "kl": 0.0645751953125, | |
| "learning_rate": 3.533749813077677e-07, | |
| "loss": 0.0666, | |
| "reward": 4.71875, | |
| "reward_std": 2.595756232738495, | |
| "rewards/accuracy_reward_staging": 0.59375, | |
| "rewards/format_reward": 0.8125, | |
| "rewards/format_reward_staging": 0.9375, | |
| "step": 246 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 570.90625, | |
| "epoch": 61.666666666666664, | |
| "grad_norm": 1.3717833382169582, | |
| "kl": 0.05242919921875, | |
| "learning_rate": 3.4942809636534633e-07, | |
| "loss": 0.0464, | |
| "reward": 4.375, | |
| "reward_std": 1.9917186498641968, | |
| "rewards/accuracy_reward_staging": 0.5, | |
| "rewards/format_reward": 0.90625, | |
| "rewards/format_reward_staging": 0.96875, | |
| "step": 247 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 648.6875, | |
| "epoch": 61.888888888888886, | |
| "grad_norm": 1.281888219474357, | |
| "kl": 0.05694580078125, | |
| "learning_rate": 3.454915028125263e-07, | |
| "loss": -0.0053, | |
| "reward": 4.1875, | |
| "reward_std": 1.8432075381278992, | |
| "rewards/accuracy_reward_staging": 0.46875, | |
| "rewards/format_reward": 0.9375, | |
| "rewards/format_reward_staging": 0.90625, | |
| "step": 248 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 601.28125, | |
| "epoch": 62.22222222222222, | |
| "grad_norm": 1.2189149322070956, | |
| "kl": 0.05279541015625, | |
| "learning_rate": 3.415654697102478e-07, | |
| "loss": -0.0095, | |
| "reward": 3.65625, | |
| "reward_std": 1.4233438968658447, | |
| "rewards/accuracy_reward_staging": 0.34375, | |
| "rewards/format_reward": 0.96875, | |
| "rewards/format_reward_staging": 0.96875, | |
| "step": 249 | |
| }, | |
| { | |
| "epoch": 62.44444444444444, | |
| "grad_norm": 1.7869776388477054, | |
| "learning_rate": 3.3765026539765827e-07, | |
| "loss": 0.0694, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 62.44444444444444, | |
| "eval_clip_ratio": 0.0, | |
| "eval_completion_length": 597.85, | |
| "eval_kl": 0.050439453125, | |
| "eval_loss": 0.033870112150907516, | |
| "eval_reward": 2.5, | |
| "eval_reward_std": 1.5911447525024414, | |
| "eval_rewards/accuracy_reward_staging": 0.15, | |
| "eval_rewards/format_reward": 0.825, | |
| "eval_rewards/format_reward_staging": 0.925, | |
| "eval_runtime": 53.5113, | |
| "eval_samples_per_second": 0.673, | |
| "eval_steps_per_second": 0.093, | |
| "step": 250 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 543.40625, | |
| "epoch": 62.666666666666664, | |
| "grad_norm": 1.580273768681387, | |
| "kl": 0.058380126953125, | |
| "learning_rate": 3.337461574737716e-07, | |
| "loss": 0.0381, | |
| "reward": 3.59375, | |
| "reward_std": 1.963532954454422, | |
| "rewards/accuracy_reward_staging": 0.359375, | |
| "rewards/format_reward": 0.859375, | |
| "rewards/format_reward_staging": 0.9375, | |
| "step": 251 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 629.9375, | |
| "epoch": 62.888888888888886, | |
| "grad_norm": 1.4446074061408753, | |
| "kl": 0.04742431640625, | |
| "learning_rate": 3.2985341277917846e-07, | |
| "loss": 0.0576, | |
| "reward": 3.5625, | |
| "reward_std": 1.8048822581768036, | |
| "rewards/accuracy_reward_staging": 0.34375, | |
| "rewards/format_reward": 0.875, | |
| "rewards/format_reward_staging": 0.96875, | |
| "step": 252 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 610.03125, | |
| "epoch": 63.22222222222222, | |
| "grad_norm": 2.4797282896452084, | |
| "kl": 0.06011962890625, | |
| "learning_rate": 3.2597229737780774e-07, | |
| "loss": 0.0258, | |
| "reward": 2.71875, | |
| "reward_std": 1.841366171836853, | |
| "rewards/accuracy_reward_staging": 0.21875, | |
| "rewards/format_reward": 0.78125, | |
| "rewards/format_reward_staging": 0.84375, | |
| "step": 253 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 535.8125, | |
| "epoch": 63.44444444444444, | |
| "grad_norm": 1.413545282954978, | |
| "kl": 0.04827880859375, | |
| "learning_rate": 3.221030765387417e-07, | |
| "loss": 0.0266, | |
| "reward": 4.0, | |
| "reward_std": 1.7409893572330475, | |
| "rewards/accuracy_reward_staging": 0.4375, | |
| "rewards/format_reward": 0.90625, | |
| "rewards/format_reward_staging": 0.90625, | |
| "step": 254 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 625.125, | |
| "epoch": 63.666666666666664, | |
| "grad_norm": 1.4981329871397806, | |
| "kl": 0.04962158203125, | |
| "learning_rate": 3.1824601471808497e-07, | |
| "loss": 0.0841, | |
| "reward": 4.5625, | |
| "reward_std": 3.0762142539024353, | |
| "rewards/accuracy_reward_staging": 0.5625, | |
| "rewards/format_reward": 0.8125, | |
| "rewards/format_reward_staging": 0.9375, | |
| "step": 255 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 633.125, | |
| "epoch": 63.888888888888886, | |
| "grad_norm": 1.4534244133457102, | |
| "kl": 0.04656982421875, | |
| "learning_rate": 3.1440137554088953e-07, | |
| "loss": 0.029, | |
| "reward": 3.84375, | |
| "reward_std": 2.296931117773056, | |
| "rewards/accuracy_reward_staging": 0.375, | |
| "rewards/format_reward": 0.96875, | |
| "rewards/format_reward_staging": 1.0, | |
| "step": 256 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 519.59375, | |
| "epoch": 64.22222222222223, | |
| "grad_norm": 1.586114819510263, | |
| "kl": 0.05950927734375, | |
| "learning_rate": 3.1056942178313604e-07, | |
| "loss": 0.0666, | |
| "reward": 4.375, | |
| "reward_std": 2.7632179856300354, | |
| "rewards/accuracy_reward_staging": 0.5, | |
| "rewards/format_reward": 0.9375, | |
| "rewards/format_reward_staging": 0.9375, | |
| "step": 257 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 554.5, | |
| "epoch": 64.44444444444444, | |
| "grad_norm": 1.4820895514814123, | |
| "kl": 0.057373046875, | |
| "learning_rate": 3.06750415353774e-07, | |
| "loss": 0.015, | |
| "reward": 4.34375, | |
| "reward_std": 2.6667675375938416, | |
| "rewards/accuracy_reward_staging": 0.46875, | |
| "rewards/format_reward": 1.0, | |
| "rewards/format_reward_staging": 1.0, | |
| "step": 258 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 594.3125, | |
| "epoch": 64.66666666666667, | |
| "grad_norm": 1.4710703551980364, | |
| "kl": 0.05108642578125, | |
| "learning_rate": 3.029446172768193e-07, | |
| "loss": -0.0532, | |
| "reward": 3.71875, | |
| "reward_std": 1.9592358469963074, | |
| "rewards/accuracy_reward_staging": 0.375, | |
| "rewards/format_reward": 0.875, | |
| "rewards/format_reward_staging": 0.96875, | |
| "step": 259 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 615.125, | |
| "epoch": 64.88888888888889, | |
| "grad_norm": 1.195494273136427, | |
| "kl": 0.05078125, | |
| "learning_rate": 2.9915228767351535e-07, | |
| "loss": -0.0471, | |
| "reward": 3.71875, | |
| "reward_std": 1.5842358469963074, | |
| "rewards/accuracy_reward_staging": 0.34375, | |
| "rewards/format_reward": 1.0, | |
| "rewards/format_reward_staging": 1.0, | |
| "step": 260 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 582.78125, | |
| "epoch": 65.22222222222223, | |
| "grad_norm": 1.0142470745003664, | |
| "kl": 0.0562744140625, | |
| "learning_rate": 2.9537368574455303e-07, | |
| "loss": 0.0116, | |
| "reward": 3.90625, | |
| "reward_std": 1.3764855861663818, | |
| "rewards/accuracy_reward_staging": 0.40625, | |
| "rewards/format_reward": 0.90625, | |
| "rewards/format_reward_staging": 0.96875, | |
| "step": 261 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 551.5625, | |
| "epoch": 65.44444444444444, | |
| "grad_norm": 1.2621208103940496, | |
| "kl": 0.045166015625, | |
| "learning_rate": 2.916090697523549e-07, | |
| "loss": 0.0065, | |
| "reward": 3.5625, | |
| "reward_std": 1.8217839002609253, | |
| "rewards/accuracy_reward_staging": 0.3125, | |
| "rewards/format_reward": 1.0, | |
| "rewards/format_reward_staging": 1.0, | |
| "step": 262 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 639.625, | |
| "epoch": 65.66666666666667, | |
| "grad_norm": 1.2178177535688726, | |
| "kl": 0.06982421875, | |
| "learning_rate": 2.878586970034232e-07, | |
| "loss": 0.0063, | |
| "reward": 2.8125, | |
| "reward_std": 1.2878219783306122, | |
| "rewards/accuracy_reward_staging": 0.21875, | |
| "rewards/format_reward": 0.84375, | |
| "rewards/format_reward_staging": 0.875, | |
| "step": 263 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 582.875, | |
| "epoch": 65.88888888888889, | |
| "grad_norm": 1.6224844954097977, | |
| "kl": 0.04864501953125, | |
| "learning_rate": 2.841228238307536e-07, | |
| "loss": -0.0201, | |
| "reward": 5.03125, | |
| "reward_std": 2.2327269315719604, | |
| "rewards/accuracy_reward_staging": 0.625, | |
| "rewards/format_reward": 0.9375, | |
| "rewards/format_reward_staging": 0.96875, | |
| "step": 264 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 637.15625, | |
| "epoch": 66.22222222222223, | |
| "grad_norm": 1.2308430687264216, | |
| "kl": 0.05279541015625, | |
| "learning_rate": 2.8040170557631485e-07, | |
| "loss": 0.0153, | |
| "reward": 3.46875, | |
| "reward_std": 2.0372338593006134, | |
| "rewards/accuracy_reward_staging": 0.3125, | |
| "rewards/format_reward": 0.9375, | |
| "rewards/format_reward_staging": 0.96875, | |
| "step": 265 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 600.4375, | |
| "epoch": 66.44444444444444, | |
| "grad_norm": 1.5321111568180714, | |
| "kl": 0.04827880859375, | |
| "learning_rate": 2.7669559657359673e-07, | |
| "loss": -0.0491, | |
| "reward": 3.8125, | |
| "reward_std": 2.4646694660186768, | |
| "rewards/accuracy_reward_staging": 0.375, | |
| "rewards/format_reward": 0.9375, | |
| "rewards/format_reward_staging": 1.0, | |
| "step": 266 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 615.5, | |
| "epoch": 66.66666666666667, | |
| "grad_norm": 1.3679824700888612, | |
| "kl": 0.05462646484375, | |
| "learning_rate": 2.730047501302266e-07, | |
| "loss": 0.0308, | |
| "reward": 3.09375, | |
| "reward_std": 2.642750769853592, | |
| "rewards/accuracy_reward_staging": 0.28125, | |
| "rewards/format_reward": 0.78125, | |
| "rewards/format_reward_staging": 0.90625, | |
| "step": 267 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 549.5, | |
| "epoch": 66.88888888888889, | |
| "grad_norm": 1.5033503223897624, | |
| "kl": 0.0577392578125, | |
| "learning_rate": 2.6932941851065615e-07, | |
| "loss": -0.0215, | |
| "reward": 4.9375, | |
| "reward_std": 2.482675850391388, | |
| "rewards/accuracy_reward_staging": 0.59375, | |
| "rewards/format_reward": 0.96875, | |
| "rewards/format_reward_staging": 1.0, | |
| "step": 268 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 553.34375, | |
| "epoch": 67.22222222222223, | |
| "grad_norm": 1.4290897914516596, | |
| "kl": 0.05340576171875, | |
| "learning_rate": 2.656698529189193e-07, | |
| "loss": 0.0366, | |
| "reward": 3.78125, | |
| "reward_std": 1.9895031452178955, | |
| "rewards/accuracy_reward_staging": 0.375, | |
| "rewards/format_reward": 0.90625, | |
| "rewards/format_reward_staging": 1.0, | |
| "step": 269 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 575.1875, | |
| "epoch": 67.44444444444444, | |
| "grad_norm": 1.579184850033335, | |
| "kl": 0.0518798828125, | |
| "learning_rate": 2.620263034814632e-07, | |
| "loss": 0.0078, | |
| "reward": 4.4375, | |
| "reward_std": 2.323539137840271, | |
| "rewards/accuracy_reward_staging": 0.53125, | |
| "rewards/format_reward": 0.875, | |
| "rewards/format_reward_staging": 0.90625, | |
| "step": 270 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 584.34375, | |
| "epoch": 67.66666666666667, | |
| "grad_norm": 1.482511162141472, | |
| "kl": 0.0482177734375, | |
| "learning_rate": 2.58399019230052e-07, | |
| "loss": -0.0587, | |
| "reward": 3.6875, | |
| "reward_std": 2.195499747991562, | |
| "rewards/accuracy_reward_staging": 0.34375, | |
| "rewards/format_reward": 0.96875, | |
| "rewards/format_reward_staging": 1.0, | |
| "step": 271 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 592.0, | |
| "epoch": 67.88888888888889, | |
| "grad_norm": 1.4652114217200525, | |
| "kl": 0.049560546875, | |
| "learning_rate": 2.547882480847461e-07, | |
| "loss": 0.0021, | |
| "reward": 3.1875, | |
| "reward_std": 2.073539137840271, | |
| "rewards/accuracy_reward_staging": 0.25, | |
| "rewards/format_reward": 0.96875, | |
| "rewards/format_reward_staging": 0.96875, | |
| "step": 272 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 598.59375, | |
| "epoch": 68.22222222222223, | |
| "grad_norm": 1.5345326135427537, | |
| "kl": 0.04913330078125, | |
| "learning_rate": 2.5119423683695657e-07, | |
| "loss": -0.0357, | |
| "reward": 4.25, | |
| "reward_std": 2.9848236441612244, | |
| "rewards/accuracy_reward_staging": 0.46875, | |
| "rewards/format_reward": 0.9375, | |
| "rewards/format_reward_staging": 0.96875, | |
| "step": 273 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 597.4375, | |
| "epoch": 68.44444444444444, | |
| "grad_norm": 1.5990838171502337, | |
| "kl": 0.061279296875, | |
| "learning_rate": 2.476172311325783e-07, | |
| "loss": 0.0292, | |
| "reward": 5.1875, | |
| "reward_std": 2.957588255405426, | |
| "rewards/accuracy_reward_staging": 0.6875, | |
| "rewards/format_reward": 0.84375, | |
| "rewards/format_reward_staging": 0.90625, | |
| "step": 274 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 632.71875, | |
| "epoch": 68.66666666666667, | |
| "grad_norm": 2.3340038254897566, | |
| "kl": 0.06951904296875, | |
| "learning_rate": 2.440574754551996e-07, | |
| "loss": 0.0246, | |
| "reward": 3.5, | |
| "reward_std": 2.0238241851329803, | |
| "rewards/accuracy_reward_staging": 0.3125, | |
| "rewards/format_reward": 0.9375, | |
| "rewards/format_reward_staging": 1.0, | |
| "step": 275 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 648.0625, | |
| "epoch": 68.88888888888889, | |
| "grad_norm": 1.5884432300379054, | |
| "kl": 0.04443359375, | |
| "learning_rate": 2.4051521310939254e-07, | |
| "loss": 0.1177, | |
| "reward": 4.0, | |
| "reward_std": 1.8069141209125519, | |
| "rewards/accuracy_reward_staging": 0.4375, | |
| "rewards/format_reward": 0.90625, | |
| "rewards/format_reward_staging": 0.90625, | |
| "step": 276 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 546.625, | |
| "epoch": 69.22222222222223, | |
| "grad_norm": 2.985922020759926, | |
| "kl": 0.10992431640625, | |
| "learning_rate": 2.3699068620408301e-07, | |
| "loss": 0.0152, | |
| "reward": 3.15625, | |
| "reward_std": 1.511039137840271, | |
| "rewards/accuracy_reward_staging": 0.25, | |
| "rewards/format_reward": 0.90625, | |
| "rewards/format_reward_staging": 1.0, | |
| "step": 277 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 602.25, | |
| "epoch": 69.44444444444444, | |
| "grad_norm": 1.5923870878518707, | |
| "kl": 0.056396484375, | |
| "learning_rate": 2.3348413563600323e-07, | |
| "loss": 0.0176, | |
| "reward": 4.5, | |
| "reward_std": 2.31710484623909, | |
| "rewards/accuracy_reward_staging": 0.53125, | |
| "rewards/format_reward": 0.875, | |
| "rewards/format_reward_staging": 0.96875, | |
| "step": 278 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 598.46875, | |
| "epoch": 69.66666666666667, | |
| "grad_norm": 1.5951806343259411, | |
| "kl": 0.04925537109375, | |
| "learning_rate": 2.2999580107322654e-07, | |
| "loss": 0.0929, | |
| "reward": 4.9375, | |
| "reward_std": 2.494741439819336, | |
| "rewards/accuracy_reward_staging": 0.59375, | |
| "rewards/format_reward": 0.96875, | |
| "rewards/format_reward_staging": 1.0, | |
| "step": 279 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 520.6875, | |
| "epoch": 69.88888888888889, | |
| "grad_norm": 1.5129561001363085, | |
| "kl": 0.0699462890625, | |
| "learning_rate": 2.2652592093878665e-07, | |
| "loss": 0.0125, | |
| "reward": 4.25, | |
| "reward_std": 1.878759890794754, | |
| "rewards/accuracy_reward_staging": 0.5, | |
| "rewards/format_reward": 0.8125, | |
| "rewards/format_reward_staging": 0.9375, | |
| "step": 280 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 645.0, | |
| "epoch": 70.22222222222223, | |
| "grad_norm": 1.5182810808110982, | |
| "kl": 0.0643310546875, | |
| "learning_rate": 2.2307473239438152e-07, | |
| "loss": 0.01, | |
| "reward": 4.40625, | |
| "reward_std": 2.5910332798957825, | |
| "rewards/accuracy_reward_staging": 0.5, | |
| "rewards/format_reward": 0.90625, | |
| "rewards/format_reward_staging": 1.0, | |
| "step": 281 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 569.21875, | |
| "epoch": 70.44444444444444, | |
| "grad_norm": 1.8382342741040079, | |
| "kl": 0.05499267578125, | |
| "learning_rate": 2.1964247132416368e-07, | |
| "loss": 0.0019, | |
| "reward": 4.40625, | |
| "reward_std": 3.0214737951755524, | |
| "rewards/accuracy_reward_staging": 0.5625, | |
| "rewards/format_reward": 0.6875, | |
| "rewards/format_reward_staging": 0.90625, | |
| "step": 282 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 608.0, | |
| "epoch": 70.66666666666667, | |
| "grad_norm": 1.7202842060510593, | |
| "kl": 0.04736328125, | |
| "learning_rate": 2.1622937231861822e-07, | |
| "loss": 0.0307, | |
| "reward": 3.375, | |
| "reward_std": 2.42453271150589, | |
| "rewards/accuracy_reward_staging": 0.3125, | |
| "rewards/format_reward": 0.9375, | |
| "rewards/format_reward_staging": 0.875, | |
| "step": 283 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 600.9375, | |
| "epoch": 70.88888888888889, | |
| "grad_norm": 1.4517912073118557, | |
| "kl": 0.04290771484375, | |
| "learning_rate": 2.128356686585282e-07, | |
| "loss": 0.0476, | |
| "reward": 3.75, | |
| "reward_std": 1.7858919501304626, | |
| "rewards/accuracy_reward_staging": 0.375, | |
| "rewards/format_reward": 0.9375, | |
| "rewards/format_reward_staging": 0.9375, | |
| "step": 284 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 649.53125, | |
| "epoch": 71.22222222222223, | |
| "grad_norm": 1.259142583692514, | |
| "kl": 0.0467529296875, | |
| "learning_rate": 2.0946159229903088e-07, | |
| "loss": 0.0839, | |
| "reward": 2.84375, | |
| "reward_std": 1.5846085250377655, | |
| "rewards/accuracy_reward_staging": 0.21875, | |
| "rewards/format_reward": 0.875, | |
| "rewards/format_reward_staging": 0.875, | |
| "step": 285 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 625.875, | |
| "epoch": 71.44444444444444, | |
| "grad_norm": 1.340679975407133, | |
| "kl": 0.0565185546875, | |
| "learning_rate": 2.0610737385376348e-07, | |
| "loss": 0.0085, | |
| "reward": 3.59375, | |
| "reward_std": 1.975972980260849, | |
| "rewards/accuracy_reward_staging": 0.375, | |
| "rewards/format_reward": 0.84375, | |
| "rewards/format_reward_staging": 0.875, | |
| "step": 286 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 603.34375, | |
| "epoch": 71.66666666666667, | |
| "grad_norm": 1.4289962866267603, | |
| "kl": 0.06005859375, | |
| "learning_rate": 2.0277324257910106e-07, | |
| "loss": 0.0185, | |
| "reward": 5.5, | |
| "reward_std": 2.4536279439926147, | |
| "rewards/accuracy_reward_staging": 0.75, | |
| "rewards/format_reward": 0.8125, | |
| "rewards/format_reward_staging": 0.9375, | |
| "step": 287 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 558.25, | |
| "epoch": 71.88888888888889, | |
| "grad_norm": 1.6936868571901433, | |
| "kl": 0.0546875, | |
| "learning_rate": 1.9945942635848745e-07, | |
| "loss": 0.0145, | |
| "reward": 3.78125, | |
| "reward_std": 2.1591877937316895, | |
| "rewards/accuracy_reward_staging": 0.375, | |
| "rewards/format_reward": 0.9375, | |
| "rewards/format_reward_staging": 0.96875, | |
| "step": 288 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 636.28125, | |
| "epoch": 72.22222222222223, | |
| "grad_norm": 1.3457751552584203, | |
| "kl": 0.0472412109375, | |
| "learning_rate": 1.9616615168685942e-07, | |
| "loss": 0.0082, | |
| "reward": 3.375, | |
| "reward_std": 1.7216877937316895, | |
| "rewards/accuracy_reward_staging": 0.28125, | |
| "rewards/format_reward": 0.96875, | |
| "rewards/format_reward_staging": 1.0, | |
| "step": 289 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 565.1875, | |
| "epoch": 72.44444444444444, | |
| "grad_norm": 1.1708504647450504, | |
| "kl": 0.0599365234375, | |
| "learning_rate": 1.9289364365516607e-07, | |
| "loss": 0.015, | |
| "reward": 4.46875, | |
| "reward_std": 1.2958193719387054, | |
| "rewards/accuracy_reward_staging": 0.53125, | |
| "rewards/format_reward": 0.875, | |
| "rewards/format_reward_staging": 0.9375, | |
| "step": 290 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 597.125, | |
| "epoch": 72.66666666666667, | |
| "grad_norm": 2.00516555564966, | |
| "kl": 0.065185546875, | |
| "learning_rate": 1.896421259349844e-07, | |
| "loss": 0.0357, | |
| "reward": 4.21875, | |
| "reward_std": 2.589491307735443, | |
| "rewards/accuracy_reward_staging": 0.46875, | |
| "rewards/format_reward": 0.90625, | |
| "rewards/format_reward_staging": 0.96875, | |
| "step": 291 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 672.75, | |
| "epoch": 72.88888888888889, | |
| "grad_norm": 2.8218244852832335, | |
| "kl": 0.09649658203125, | |
| "learning_rate": 1.8641182076323148e-07, | |
| "loss": -0.0058, | |
| "reward": 5.03125, | |
| "reward_std": 3.2576534748077393, | |
| "rewards/accuracy_reward_staging": 0.625, | |
| "rewards/format_reward": 0.9375, | |
| "rewards/format_reward_staging": 0.96875, | |
| "step": 292 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 577.65625, | |
| "epoch": 73.22222222222223, | |
| "grad_norm": 1.7581055322994823, | |
| "kl": 0.06195068359375, | |
| "learning_rate": 1.8320294892697475e-07, | |
| "loss": 0.0534, | |
| "reward": 3.0, | |
| "reward_std": 2.1200742721557617, | |
| "rewards/accuracy_reward_staging": 0.28125, | |
| "rewards/format_reward": 0.75, | |
| "rewards/format_reward_staging": 0.84375, | |
| "step": 293 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 562.6875, | |
| "epoch": 73.44444444444444, | |
| "grad_norm": 1.5050675135024016, | |
| "kl": 0.0499267578125, | |
| "learning_rate": 1.8001572974834168e-07, | |
| "loss": 0.0343, | |
| "reward": 4.0, | |
| "reward_std": 1.9108919501304626, | |
| "rewards/accuracy_reward_staging": 0.40625, | |
| "rewards/format_reward": 0.96875, | |
| "rewards/format_reward_staging": 1.0, | |
| "step": 294 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 590.5, | |
| "epoch": 73.66666666666667, | |
| "grad_norm": 12.895158631725321, | |
| "kl": 0.12432861328125, | |
| "learning_rate": 1.768503810695295e-07, | |
| "loss": 0.0513, | |
| "reward": 3.46875, | |
| "reward_std": 1.6672459840774536, | |
| "rewards/accuracy_reward_staging": 0.3125, | |
| "rewards/format_reward": 0.90625, | |
| "rewards/format_reward_staging": 1.0, | |
| "step": 295 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 609.125, | |
| "epoch": 73.88888888888889, | |
| "grad_norm": 1.7151482870021748, | |
| "kl": 0.07269287109375, | |
| "learning_rate": 1.7370711923791564e-07, | |
| "loss": -0.0106, | |
| "reward": 5.625, | |
| "reward_std": 2.8527393341064453, | |
| "rewards/accuracy_reward_staging": 0.78125, | |
| "rewards/format_reward": 0.84375, | |
| "rewards/format_reward_staging": 0.875, | |
| "step": 296 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 675.34375, | |
| "epoch": 74.22222222222223, | |
| "grad_norm": 1.6055158333490096, | |
| "kl": 0.0538330078125, | |
| "learning_rate": 1.70586159091271e-07, | |
| "loss": 0.0916, | |
| "reward": 3.53125, | |
| "reward_std": 2.737855911254883, | |
| "rewards/accuracy_reward_staging": 0.40625, | |
| "rewards/format_reward": 0.6875, | |
| "rewards/format_reward_staging": 0.8125, | |
| "step": 297 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 638.0, | |
| "epoch": 74.44444444444444, | |
| "grad_norm": 1.3927154732757459, | |
| "kl": 0.0494384765625, | |
| "learning_rate": 1.674877139430758e-07, | |
| "loss": -0.0039, | |
| "reward": 3.5, | |
| "reward_std": 2.132579743862152, | |
| "rewards/accuracy_reward_staging": 0.3125, | |
| "rewards/format_reward": 0.96875, | |
| "rewards/format_reward_staging": 0.96875, | |
| "step": 298 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 597.78125, | |
| "epoch": 74.66666666666667, | |
| "grad_norm": 1.2941317675033293, | |
| "kl": 0.05804443359375, | |
| "learning_rate": 1.6441199556794034e-07, | |
| "loss": 0.0582, | |
| "reward": 3.28125, | |
| "reward_std": 2.0324151515960693, | |
| "rewards/accuracy_reward_staging": 0.28125, | |
| "rewards/format_reward": 0.90625, | |
| "rewards/format_reward_staging": 0.96875, | |
| "step": 299 | |
| }, | |
| { | |
| "epoch": 74.88888888888889, | |
| "grad_norm": 1.1895166775660873, | |
| "learning_rate": 1.6135921418712955e-07, | |
| "loss": 0.0154, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 74.88888888888889, | |
| "eval_clip_ratio": 0.0, | |
| "eval_completion_length": 558.5, | |
| "eval_kl": 0.055908203125, | |
| "eval_loss": 0.04830198734998703, | |
| "eval_reward": 3.25, | |
| "eval_reward_std": 2.227747082710266, | |
| "eval_rewards/accuracy_reward_staging": 0.275, | |
| "eval_rewards/format_reward": 0.9, | |
| "eval_rewards/format_reward_staging": 0.975, | |
| "eval_runtime": 50.8525, | |
| "eval_samples_per_second": 0.708, | |
| "eval_steps_per_second": 0.098, | |
| "step": 300 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 522.46875, | |
| "epoch": 75.22222222222223, | |
| "grad_norm": 1.2719829976418617, | |
| "kl": 0.05950927734375, | |
| "learning_rate": 1.5832957845419582e-07, | |
| "loss": -0.0239, | |
| "reward": 4.078125, | |
| "reward_std": 1.734619602560997, | |
| "rewards/accuracy_reward_staging": 0.4375, | |
| "rewards/format_reward": 0.921875, | |
| "rewards/format_reward_staging": 0.96875, | |
| "step": 301 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 531.75, | |
| "epoch": 75.44444444444444, | |
| "grad_norm": 1.4701942608061176, | |
| "kl": 0.05584716796875, | |
| "learning_rate": 1.553232954407171e-07, | |
| "loss": -0.0222, | |
| "reward": 4.46875, | |
| "reward_std": 1.8445461988449097, | |
| "rewards/accuracy_reward_staging": 0.5, | |
| "rewards/format_reward": 0.96875, | |
| "rewards/format_reward_staging": 1.0, | |
| "step": 302 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 606.6875, | |
| "epoch": 75.66666666666667, | |
| "grad_norm": 0.979841248868734, | |
| "kl": 0.0506591796875, | |
| "learning_rate": 1.52340570622144e-07, | |
| "loss": 0.0094, | |
| "reward": 4.34375, | |
| "reward_std": 1.0341877937316895, | |
| "rewards/accuracy_reward_staging": 0.46875, | |
| "rewards/format_reward": 1.0, | |
| "rewards/format_reward_staging": 1.0, | |
| "step": 303 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 645.0, | |
| "epoch": 75.88888888888889, | |
| "grad_norm": 1.2907279139619887, | |
| "kl": 0.05084228515625, | |
| "learning_rate": 1.493816078637557e-07, | |
| "loss": 0.0349, | |
| "reward": 4.03125, | |
| "reward_std": 2.768365204334259, | |
| "rewards/accuracy_reward_staging": 0.4375, | |
| "rewards/format_reward": 0.90625, | |
| "rewards/format_reward_staging": 0.9375, | |
| "step": 304 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 612.03125, | |
| "epoch": 76.22222222222223, | |
| "grad_norm": 1.3052082852261886, | |
| "kl": 0.06219482421875, | |
| "learning_rate": 1.4644660940672627e-07, | |
| "loss": 0.0241, | |
| "reward": 3.90625, | |
| "reward_std": 1.5625, | |
| "rewards/accuracy_reward_staging": 0.40625, | |
| "rewards/format_reward": 0.875, | |
| "rewards/format_reward_staging": 1.0, | |
| "step": 305 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 552.0, | |
| "epoch": 76.44444444444444, | |
| "grad_norm": 1.6816507380806482, | |
| "kl": 0.0640869140625, | |
| "learning_rate": 1.435357758543015e-07, | |
| "loss": 0.0623, | |
| "reward": 3.5, | |
| "reward_std": 2.3343209326267242, | |
| "rewards/accuracy_reward_staging": 0.34375, | |
| "rewards/format_reward": 0.875, | |
| "rewards/format_reward_staging": 0.90625, | |
| "step": 306 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 605.96875, | |
| "epoch": 76.66666666666667, | |
| "grad_norm": 1.7963549332670843, | |
| "kl": 0.05462646484375, | |
| "learning_rate": 1.4064930615808806e-07, | |
| "loss": -0.0141, | |
| "reward": 3.90625, | |
| "reward_std": 3.359531879425049, | |
| "rewards/accuracy_reward_staging": 0.40625, | |
| "rewards/format_reward": 0.90625, | |
| "rewards/format_reward_staging": 0.96875, | |
| "step": 307 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 559.5625, | |
| "epoch": 76.88888888888889, | |
| "grad_norm": 1.3270350222684457, | |
| "kl": 0.0548095703125, | |
| "learning_rate": 1.3778739760445552e-07, | |
| "loss": 0.0232, | |
| "reward": 3.53125, | |
| "reward_std": 2.031329423189163, | |
| "rewards/accuracy_reward_staging": 0.34375, | |
| "rewards/format_reward": 0.875, | |
| "rewards/format_reward_staging": 0.9375, | |
| "step": 308 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 574.9375, | |
| "epoch": 77.22222222222223, | |
| "grad_norm": 1.403581677625956, | |
| "kl": 0.0579833984375, | |
| "learning_rate": 1.349502458010519e-07, | |
| "loss": 0.0045, | |
| "reward": 3.40625, | |
| "reward_std": 1.5280899405479431, | |
| "rewards/accuracy_reward_staging": 0.3125, | |
| "rewards/format_reward": 0.90625, | |
| "rewards/format_reward_staging": 0.9375, | |
| "step": 309 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 571.4375, | |
| "epoch": 77.44444444444444, | |
| "grad_norm": 1.4518085181139868, | |
| "kl": 0.05694580078125, | |
| "learning_rate": 1.321380446634342e-07, | |
| "loss": -0.0332, | |
| "reward": 4.53125, | |
| "reward_std": 2.796904981136322, | |
| "rewards/accuracy_reward_staging": 0.5625, | |
| "rewards/format_reward": 0.8125, | |
| "rewards/format_reward_staging": 0.90625, | |
| "step": 310 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 592.375, | |
| "epoch": 77.66666666666667, | |
| "grad_norm": 1.4193852483613092, | |
| "kl": 0.04937744140625, | |
| "learning_rate": 1.2935098640181457e-07, | |
| "loss": 0.0097, | |
| "reward": 3.71875, | |
| "reward_std": 1.6591877937316895, | |
| "rewards/accuracy_reward_staging": 0.34375, | |
| "rewards/format_reward": 1.0, | |
| "rewards/format_reward_staging": 1.0, | |
| "step": 311 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 576.5625, | |
| "epoch": 77.88888888888889, | |
| "grad_norm": 1.5563017115814217, | |
| "kl": 0.055419921875, | |
| "learning_rate": 1.2658926150792322e-07, | |
| "loss": 0.0595, | |
| "reward": 4.03125, | |
| "reward_std": 2.8135814666748047, | |
| "rewards/accuracy_reward_staging": 0.4375, | |
| "rewards/format_reward": 0.875, | |
| "rewards/format_reward_staging": 0.96875, | |
| "step": 312 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 618.15625, | |
| "epoch": 78.22222222222223, | |
| "grad_norm": 1.7591845192775064, | |
| "kl": 0.05523681640625, | |
| "learning_rate": 1.2385305874198775e-07, | |
| "loss": -0.0554, | |
| "reward": 2.625, | |
| "reward_std": 1.8215623199939728, | |
| "rewards/accuracy_reward_staging": 0.1875, | |
| "rewards/format_reward": 0.78125, | |
| "rewards/format_reward_staging": 0.90625, | |
| "step": 313 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 554.40625, | |
| "epoch": 78.44444444444444, | |
| "grad_norm": 1.532211221198721, | |
| "kl": 0.04986572265625, | |
| "learning_rate": 1.2114256511983274e-07, | |
| "loss": 0.0323, | |
| "reward": 4.40625, | |
| "reward_std": 2.975598633289337, | |
| "rewards/accuracy_reward_staging": 0.5, | |
| "rewards/format_reward": 0.9375, | |
| "rewards/format_reward_staging": 0.96875, | |
| "step": 314 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 563.71875, | |
| "epoch": 78.66666666666667, | |
| "grad_norm": 1.827273461906554, | |
| "kl": 0.054931640625, | |
| "learning_rate": 1.1845796590009683e-07, | |
| "loss": 0.1089, | |
| "reward": 4.28125, | |
| "reward_std": 2.9560980796813965, | |
| "rewards/accuracy_reward_staging": 0.5, | |
| "rewards/format_reward": 0.875, | |
| "rewards/format_reward_staging": 0.90625, | |
| "step": 315 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 607.125, | |
| "epoch": 78.88888888888889, | |
| "grad_norm": 1.5705329008308428, | |
| "kl": 0.05218505859375, | |
| "learning_rate": 1.1579944457157059e-07, | |
| "loss": 0.0714, | |
| "reward": 3.53125, | |
| "reward_std": 2.3595376014709473, | |
| "rewards/accuracy_reward_staging": 0.375, | |
| "rewards/format_reward": 0.8125, | |
| "rewards/format_reward_staging": 0.84375, | |
| "step": 316 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 576.59375, | |
| "epoch": 79.22222222222223, | |
| "grad_norm": 1.2784333227971698, | |
| "kl": 0.04998779296875, | |
| "learning_rate": 1.1316718284065535e-07, | |
| "loss": -0.0327, | |
| "reward": 3.1875, | |
| "reward_std": 1.75, | |
| "rewards/accuracy_reward_staging": 0.25, | |
| "rewards/format_reward": 0.96875, | |
| "rewards/format_reward_staging": 0.96875, | |
| "step": 317 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 598.6875, | |
| "epoch": 79.44444444444444, | |
| "grad_norm": 1.2996658268690655, | |
| "kl": 0.05255126953125, | |
| "learning_rate": 1.1056136061894384e-07, | |
| "loss": -0.0387, | |
| "reward": 4.5, | |
| "reward_std": 1.6467358469963074, | |
| "rewards/accuracy_reward_staging": 0.53125, | |
| "rewards/format_reward": 0.875, | |
| "rewards/format_reward_staging": 0.96875, | |
| "step": 318 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 576.25, | |
| "epoch": 79.66666666666667, | |
| "grad_norm": 1.5574614421463486, | |
| "kl": 0.04937744140625, | |
| "learning_rate": 1.0798215601092353e-07, | |
| "loss": 0.0303, | |
| "reward": 4.375, | |
| "reward_std": 2.325068473815918, | |
| "rewards/accuracy_reward_staging": 0.5, | |
| "rewards/format_reward": 0.90625, | |
| "rewards/format_reward_staging": 0.96875, | |
| "step": 319 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 626.0625, | |
| "epoch": 79.88888888888889, | |
| "grad_norm": 1.5754896085780978, | |
| "kl": 0.0546875, | |
| "learning_rate": 1.0542974530180327e-07, | |
| "loss": 0.0137, | |
| "reward": 4.1875, | |
| "reward_std": 2.3722406029701233, | |
| "rewards/accuracy_reward_staging": 0.46875, | |
| "rewards/format_reward": 0.90625, | |
| "rewards/format_reward_staging": 0.9375, | |
| "step": 320 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 550.71875, | |
| "epoch": 80.22222222222223, | |
| "grad_norm": 1.610726394417215, | |
| "kl": 0.06024169921875, | |
| "learning_rate": 1.0290430294546448e-07, | |
| "loss": 0.013, | |
| "reward": 3.90625, | |
| "reward_std": 2.4335986375808716, | |
| "rewards/accuracy_reward_staging": 0.40625, | |
| "rewards/format_reward": 0.9375, | |
| "rewards/format_reward_staging": 0.9375, | |
| "step": 321 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 655.25, | |
| "epoch": 80.44444444444444, | |
| "grad_norm": 1.4911116339078845, | |
| "kl": 0.0523681640625, | |
| "learning_rate": 1.0040600155253764e-07, | |
| "loss": 0.0332, | |
| "reward": 2.78125, | |
| "reward_std": 1.3004322350025177, | |
| "rewards/accuracy_reward_staging": 0.1875, | |
| "rewards/format_reward": 0.875, | |
| "rewards/format_reward_staging": 0.96875, | |
| "step": 322 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 519.3125, | |
| "epoch": 80.66666666666667, | |
| "grad_norm": 1.6907992973095978, | |
| "kl": 0.0538330078125, | |
| "learning_rate": 9.793501187860431e-08, | |
| "loss": -0.0401, | |
| "reward": 4.0, | |
| "reward_std": 2.362515449523926, | |
| "rewards/accuracy_reward_staging": 0.40625, | |
| "rewards/format_reward": 0.96875, | |
| "rewards/format_reward_staging": 1.0, | |
| "step": 323 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 538.5, | |
| "epoch": 80.88888888888889, | |
| "grad_norm": 1.646427054544714, | |
| "kl": 0.063232421875, | |
| "learning_rate": 9.549150281252632e-08, | |
| "loss": -0.0039, | |
| "reward": 4.15625, | |
| "reward_std": 2.2053900957107544, | |
| "rewards/accuracy_reward_staging": 0.4375, | |
| "rewards/format_reward": 0.96875, | |
| "rewards/format_reward_staging": 1.0, | |
| "step": 324 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 562.03125, | |
| "epoch": 81.22222222222223, | |
| "grad_norm": 1.2401904323679076, | |
| "kl": 0.059814453125, | |
| "learning_rate": 9.307564136490254e-08, | |
| "loss": 0.0337, | |
| "reward": 2.6875, | |
| "reward_std": 1.4073790609836578, | |
| "rewards/accuracy_reward_staging": 0.15625, | |
| "rewards/format_reward": 0.90625, | |
| "rewards/format_reward_staging": 1.0, | |
| "step": 325 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 593.3125, | |
| "epoch": 81.44444444444444, | |
| "grad_norm": 1.5857599353606664, | |
| "kl": 0.0521240234375, | |
| "learning_rate": 9.068759265665382e-08, | |
| "loss": 0.0031, | |
| "reward": 3.46875, | |
| "reward_std": 2.041439712047577, | |
| "rewards/accuracy_reward_staging": 0.3125, | |
| "rewards/format_reward": 0.9375, | |
| "rewards/format_reward_staging": 0.96875, | |
| "step": 326 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 636.15625, | |
| "epoch": 81.66666666666667, | |
| "grad_norm": 1.464039724178544, | |
| "kl": 0.04815673828125, | |
| "learning_rate": 8.832751990773712e-08, | |
| "loss": -0.033, | |
| "reward": 4.0625, | |
| "reward_std": 2.3850997388362885, | |
| "rewards/accuracy_reward_staging": 0.4375, | |
| "rewards/format_reward": 0.9375, | |
| "rewards/format_reward_staging": 0.9375, | |
| "step": 327 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 550.625, | |
| "epoch": 81.88888888888889, | |
| "grad_norm": 1.546052526056493, | |
| "kl": 0.05743408203125, | |
| "learning_rate": 8.599558442598998e-08, | |
| "loss": 0.0427, | |
| "reward": 4.15625, | |
| "reward_std": 2.8091025352478027, | |
| "rewards/accuracy_reward_staging": 0.5, | |
| "rewards/format_reward": 0.78125, | |
| "rewards/format_reward_staging": 0.875, | |
| "step": 328 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 592.53125, | |
| "epoch": 82.22222222222223, | |
| "grad_norm": 1.359491111417973, | |
| "kl": 0.05621337890625, | |
| "learning_rate": 8.369194559610481e-08, | |
| "loss": 0.0752, | |
| "reward": 3.03125, | |
| "reward_std": 1.4954701960086823, | |
| "rewards/accuracy_reward_staging": 0.25, | |
| "rewards/format_reward": 0.875, | |
| "rewards/format_reward_staging": 0.90625, | |
| "step": 329 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 608.78125, | |
| "epoch": 82.44444444444444, | |
| "grad_norm": 1.4795564527051304, | |
| "kl": 0.05267333984375, | |
| "learning_rate": 8.141676086873573e-08, | |
| "loss": 0.0759, | |
| "reward": 3.28125, | |
| "reward_std": 1.9649099707603455, | |
| "rewards/accuracy_reward_staging": 0.28125, | |
| "rewards/format_reward": 0.9375, | |
| "rewards/format_reward_staging": 0.9375, | |
| "step": 330 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 583.0625, | |
| "epoch": 82.66666666666667, | |
| "grad_norm": 1.6241400838987166, | |
| "kl": 0.0562744140625, | |
| "learning_rate": 7.917018574973644e-08, | |
| "loss": 0.0196, | |
| "reward": 4.40625, | |
| "reward_std": 2.2960872054100037, | |
| "rewards/accuracy_reward_staging": 0.53125, | |
| "rewards/format_reward": 0.78125, | |
| "rewards/format_reward_staging": 0.96875, | |
| "step": 331 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 553.5, | |
| "epoch": 82.88888888888889, | |
| "grad_norm": 1.5339258798115873, | |
| "kl": 0.0474853515625, | |
| "learning_rate": 7.695237378953224e-08, | |
| "loss": -0.0209, | |
| "reward": 4.5, | |
| "reward_std": 2.332531690597534, | |
| "rewards/accuracy_reward_staging": 0.5, | |
| "rewards/format_reward": 1.0, | |
| "rewards/format_reward_staging": 1.0, | |
| "step": 332 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 588.34375, | |
| "epoch": 83.22222222222223, | |
| "grad_norm": 1.5601324006274968, | |
| "kl": 0.05804443359375, | |
| "learning_rate": 7.476347657262455e-08, | |
| "loss": -0.039, | |
| "reward": 4.71875, | |
| "reward_std": 2.439529001712799, | |
| "rewards/accuracy_reward_staging": 0.5625, | |
| "rewards/format_reward": 0.9375, | |
| "rewards/format_reward_staging": 0.96875, | |
| "step": 333 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 558.25, | |
| "epoch": 83.44444444444444, | |
| "grad_norm": 1.5625484764568953, | |
| "kl": 0.05841064453125, | |
| "learning_rate": 7.260364370723043e-08, | |
| "loss": -0.0022, | |
| "reward": 3.875, | |
| "reward_std": 2.6049662828445435, | |
| "rewards/accuracy_reward_staging": 0.40625, | |
| "rewards/format_reward": 0.875, | |
| "rewards/format_reward_staging": 0.96875, | |
| "step": 334 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 538.53125, | |
| "epoch": 83.66666666666667, | |
| "grad_norm": 1.6578933804792892, | |
| "kl": 0.064697265625, | |
| "learning_rate": 7.047302281505735e-08, | |
| "loss": 0.0178, | |
| "reward": 3.6875, | |
| "reward_std": 1.93262779712677, | |
| "rewards/accuracy_reward_staging": 0.34375, | |
| "rewards/format_reward": 0.96875, | |
| "rewards/format_reward_staging": 1.0, | |
| "step": 335 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 579.3125, | |
| "epoch": 83.88888888888889, | |
| "grad_norm": 1.737744709396854, | |
| "kl": 0.05303955078125, | |
| "learning_rate": 6.837175952121304e-08, | |
| "loss": -0.056, | |
| "reward": 3.875, | |
| "reward_std": 2.5176164507865906, | |
| "rewards/accuracy_reward_staging": 0.40625, | |
| "rewards/format_reward": 0.875, | |
| "rewards/format_reward_staging": 0.96875, | |
| "step": 336 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 559.25, | |
| "epoch": 84.22222222222223, | |
| "grad_norm": 1.5777065212148367, | |
| "kl": 0.0582275390625, | |
| "learning_rate": 6.629999744425235e-08, | |
| "loss": -0.0542, | |
| "reward": 3.3125, | |
| "reward_std": 1.8360159397125244, | |
| "rewards/accuracy_reward_staging": 0.3125, | |
| "rewards/format_reward": 0.84375, | |
| "rewards/format_reward_staging": 0.90625, | |
| "step": 337 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 604.03125, | |
| "epoch": 84.44444444444444, | |
| "grad_norm": 1.7153117337295096, | |
| "kl": 0.05419921875, | |
| "learning_rate": 6.42578781863613e-08, | |
| "loss": 0.0782, | |
| "reward": 3.625, | |
| "reward_std": 3.0492074489593506, | |
| "rewards/accuracy_reward_staging": 0.375, | |
| "rewards/format_reward": 0.84375, | |
| "rewards/format_reward_staging": 0.90625, | |
| "step": 338 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 591.3125, | |
| "epoch": 84.66666666666667, | |
| "grad_norm": 1.3347688495066687, | |
| "kl": 0.053466796875, | |
| "learning_rate": 6.22455413236786e-08, | |
| "loss": -0.0014, | |
| "reward": 3.03125, | |
| "reward_std": 1.389709249138832, | |
| "rewards/accuracy_reward_staging": 0.25, | |
| "rewards/format_reward": 0.875, | |
| "rewards/format_reward_staging": 0.90625, | |
| "step": 339 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 633.6875, | |
| "epoch": 84.88888888888889, | |
| "grad_norm": 1.389754239090955, | |
| "kl": 0.04827880859375, | |
| "learning_rate": 6.026312439675551e-08, | |
| "loss": 0.0256, | |
| "reward": 4.25, | |
| "reward_std": 2.171033263206482, | |
| "rewards/accuracy_reward_staging": 0.46875, | |
| "rewards/format_reward": 0.9375, | |
| "rewards/format_reward_staging": 0.96875, | |
| "step": 340 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 550.59375, | |
| "epoch": 85.22222222222223, | |
| "grad_norm": 1.7000896266286212, | |
| "kl": 0.06982421875, | |
| "learning_rate": 5.831076290115572e-08, | |
| "loss": 0.0243, | |
| "reward": 4.15625, | |
| "reward_std": 2.343973159790039, | |
| "rewards/accuracy_reward_staging": 0.46875, | |
| "rewards/format_reward": 0.84375, | |
| "rewards/format_reward_staging": 0.96875, | |
| "step": 341 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 600.8125, | |
| "epoch": 85.44444444444444, | |
| "grad_norm": 1.4833240972495387, | |
| "kl": 0.056884765625, | |
| "learning_rate": 5.638859027819409e-08, | |
| "loss": 0.0553, | |
| "reward": 3.53125, | |
| "reward_std": 2.29950013756752, | |
| "rewards/accuracy_reward_staging": 0.34375, | |
| "rewards/format_reward": 0.875, | |
| "rewards/format_reward_staging": 0.9375, | |
| "step": 342 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 650.875, | |
| "epoch": 85.66666666666667, | |
| "grad_norm": 1.3618290470634058, | |
| "kl": 0.04998779296875, | |
| "learning_rate": 5.44967379058161e-08, | |
| "loss": -0.0017, | |
| "reward": 5.0, | |
| "reward_std": 2.3323360979557037, | |
| "rewards/accuracy_reward_staging": 0.625, | |
| "rewards/format_reward": 0.9375, | |
| "rewards/format_reward_staging": 0.9375, | |
| "step": 343 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 576.75, | |
| "epoch": 85.88888888888889, | |
| "grad_norm": 1.749186308713559, | |
| "kl": 0.05303955078125, | |
| "learning_rate": 5.263533508961826e-08, | |
| "loss": 0.0794, | |
| "reward": 3.34375, | |
| "reward_std": 2.082039564847946, | |
| "rewards/accuracy_reward_staging": 0.3125, | |
| "rewards/format_reward": 0.84375, | |
| "rewards/format_reward_staging": 0.9375, | |
| "step": 344 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 573.3125, | |
| "epoch": 86.22222222222223, | |
| "grad_norm": 1.4322968854479468, | |
| "kl": 0.05615234375, | |
| "learning_rate": 5.080450905401057e-08, | |
| "loss": 0.0153, | |
| "reward": 4.25, | |
| "reward_std": 1.8755539804697037, | |
| "rewards/accuracy_reward_staging": 0.46875, | |
| "rewards/format_reward": 0.9375, | |
| "rewards/format_reward_staging": 0.96875, | |
| "step": 345 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 546.59375, | |
| "epoch": 86.44444444444444, | |
| "grad_norm": 1.2682803454826486, | |
| "kl": 0.053955078125, | |
| "learning_rate": 4.9004384933520547e-08, | |
| "loss": 0.0083, | |
| "reward": 3.53125, | |
| "reward_std": 1.3726893961429596, | |
| "rewards/accuracy_reward_staging": 0.3125, | |
| "rewards/format_reward": 0.96875, | |
| "rewards/format_reward_staging": 1.0, | |
| "step": 346 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 580.21875, | |
| "epoch": 86.66666666666667, | |
| "grad_norm": 1.5693179766123742, | |
| "kl": 0.05389404296875, | |
| "learning_rate": 4.723508576424062e-08, | |
| "loss": -0.0063, | |
| "reward": 3.46875, | |
| "reward_std": 2.777799040079117, | |
| "rewards/accuracy_reward_staging": 0.34375, | |
| "rewards/format_reward": 0.84375, | |
| "rewards/format_reward_staging": 0.90625, | |
| "step": 347 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 595.375, | |
| "epoch": 86.88888888888889, | |
| "grad_norm": 1.6314212035379032, | |
| "kl": 0.053955078125, | |
| "learning_rate": 4.549673247541874e-08, | |
| "loss": -0.01, | |
| "reward": 4.15625, | |
| "reward_std": 2.311874210834503, | |
| "rewards/accuracy_reward_staging": 0.46875, | |
| "rewards/format_reward": 0.875, | |
| "rewards/format_reward_staging": 0.9375, | |
| "step": 348 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 568.71875, | |
| "epoch": 87.22222222222223, | |
| "grad_norm": 1.658360075079596, | |
| "kl": 0.0572509765625, | |
| "learning_rate": 4.37894438811931e-08, | |
| "loss": 0.0064, | |
| "reward": 3.5625, | |
| "reward_std": 2.875, | |
| "rewards/accuracy_reward_staging": 0.34375, | |
| "rewards/format_reward": 0.84375, | |
| "rewards/format_reward_staging": 1.0, | |
| "step": 349 | |
| }, | |
| { | |
| "epoch": 87.44444444444444, | |
| "grad_norm": 1.4586421894209247, | |
| "learning_rate": 4.2113336672471245e-08, | |
| "loss": 0.0579, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 87.44444444444444, | |
| "eval_clip_ratio": 0.0, | |
| "eval_completion_length": 609.05, | |
| "eval_kl": 0.052783203125, | |
| "eval_loss": 0.033656854182481766, | |
| "eval_reward": 2.45, | |
| "eval_reward_std": 1.6229771614074706, | |
| "eval_rewards/accuracy_reward_staging": 0.15, | |
| "eval_rewards/format_reward": 0.825, | |
| "eval_rewards/format_reward_staging": 0.875, | |
| "eval_runtime": 55.2193, | |
| "eval_samples_per_second": 0.652, | |
| "eval_steps_per_second": 0.091, | |
| "step": 350 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 609.578125, | |
| "epoch": 87.66666666666667, | |
| "grad_norm": 1.6263684105864271, | |
| "kl": 0.0521240234375, | |
| "learning_rate": 4.0468525408954456e-08, | |
| "loss": 0.0832, | |
| "reward": 4.265625, | |
| "reward_std": 2.632143199443817, | |
| "rewards/accuracy_reward_staging": 0.484375, | |
| "rewards/format_reward": 0.90625, | |
| "rewards/format_reward_staging": 0.9375, | |
| "step": 351 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 630.0, | |
| "epoch": 87.88888888888889, | |
| "grad_norm": 1.406310532139818, | |
| "kl": 0.0509033203125, | |
| "learning_rate": 3.8855122511307626e-08, | |
| "loss": 0.0517, | |
| "reward": 3.0625, | |
| "reward_std": 1.3608438968658447, | |
| "rewards/accuracy_reward_staging": 0.21875, | |
| "rewards/format_reward": 0.96875, | |
| "rewards/format_reward_staging": 1.0, | |
| "step": 352 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 592.46875, | |
| "epoch": 88.22222222222223, | |
| "grad_norm": 1.505022624138408, | |
| "kl": 0.05255126953125, | |
| "learning_rate": 3.727323825347578e-08, | |
| "loss": 0.0469, | |
| "reward": 4.25, | |
| "reward_std": 2.023455113172531, | |
| "rewards/accuracy_reward_staging": 0.46875, | |
| "rewards/format_reward": 0.9375, | |
| "rewards/format_reward_staging": 0.96875, | |
| "step": 353 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 570.21875, | |
| "epoch": 88.44444444444444, | |
| "grad_norm": 1.7398525080011868, | |
| "kl": 0.051025390625, | |
| "learning_rate": 3.572298075514652e-08, | |
| "loss": 0.0079, | |
| "reward": 5.25, | |
| "reward_std": 2.496154248714447, | |
| "rewards/accuracy_reward_staging": 0.65625, | |
| "rewards/format_reward": 1.0, | |
| "rewards/format_reward_staging": 0.96875, | |
| "step": 354 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 609.75, | |
| "epoch": 88.66666666666667, | |
| "grad_norm": 1.4985931737207225, | |
| "kl": 0.05255126953125, | |
| "learning_rate": 3.420445597436056e-08, | |
| "loss": 0.0262, | |
| "reward": 4.09375, | |
| "reward_std": 2.1352776885032654, | |
| "rewards/accuracy_reward_staging": 0.4375, | |
| "rewards/format_reward": 0.9375, | |
| "rewards/format_reward_staging": 0.96875, | |
| "step": 355 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 673.75, | |
| "epoch": 88.88888888888889, | |
| "grad_norm": 1.4906376909879282, | |
| "kl": 0.05889892578125, | |
| "learning_rate": 3.271776770026963e-08, | |
| "loss": 0.0716, | |
| "reward": 3.28125, | |
| "reward_std": 2.086387515068054, | |
| "rewards/accuracy_reward_staging": 0.3125, | |
| "rewards/format_reward": 0.78125, | |
| "rewards/format_reward_staging": 0.9375, | |
| "step": 356 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 549.53125, | |
| "epoch": 89.22222222222223, | |
| "grad_norm": 1.87310658244872, | |
| "kl": 0.08197021484375, | |
| "learning_rate": 3.1263017546042326e-08, | |
| "loss": 0.0395, | |
| "reward": 3.90625, | |
| "reward_std": 2.444858193397522, | |
| "rewards/accuracy_reward_staging": 0.40625, | |
| "rewards/format_reward": 0.875, | |
| "rewards/format_reward_staging": 1.0, | |
| "step": 357 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 570.34375, | |
| "epoch": 89.44444444444444, | |
| "grad_norm": 1.6047992153607105, | |
| "kl": 0.05328369140625, | |
| "learning_rate": 2.9840304941919416e-08, | |
| "loss": 0.0128, | |
| "reward": 4.09375, | |
| "reward_std": 3.0098507404327393, | |
| "rewards/accuracy_reward_staging": 0.4375, | |
| "rewards/format_reward": 0.9375, | |
| "rewards/format_reward_staging": 0.96875, | |
| "step": 358 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 588.09375, | |
| "epoch": 89.66666666666667, | |
| "grad_norm": 1.3977481210272724, | |
| "kl": 0.0628662109375, | |
| "learning_rate": 2.8449727128417367e-08, | |
| "loss": 0.0184, | |
| "reward": 3.6875, | |
| "reward_std": 1.197430670261383, | |
| "rewards/accuracy_reward_staging": 0.40625, | |
| "rewards/format_reward": 0.78125, | |
| "rewards/format_reward_staging": 0.875, | |
| "step": 359 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 663.0625, | |
| "epoch": 89.88888888888889, | |
| "grad_norm": 1.521856541607207, | |
| "kl": 0.04925537109375, | |
| "learning_rate": 2.7091379149682682e-08, | |
| "loss": -0.0485, | |
| "reward": 4.34375, | |
| "reward_std": 2.930923640727997, | |
| "rewards/accuracy_reward_staging": 0.46875, | |
| "rewards/format_reward": 1.0, | |
| "rewards/format_reward_staging": 1.0, | |
| "step": 360 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 585.0625, | |
| "epoch": 90.22222222222223, | |
| "grad_norm": 1.8447806012116532, | |
| "kl": 0.05377197265625, | |
| "learning_rate": 2.5765353846995297e-08, | |
| "loss": 0.049, | |
| "reward": 4.53125, | |
| "reward_std": 3.1164740920066833, | |
| "rewards/accuracy_reward_staging": 0.5625, | |
| "rewards/format_reward": 0.8125, | |
| "rewards/format_reward_staging": 0.90625, | |
| "step": 361 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 650.21875, | |
| "epoch": 90.44444444444444, | |
| "grad_norm": 1.184344841195356, | |
| "kl": 0.04827880859375, | |
| "learning_rate": 2.4471741852423233e-08, | |
| "loss": 0.0432, | |
| "reward": 2.96875, | |
| "reward_std": 1.5483438968658447, | |
| "rewards/accuracy_reward_staging": 0.21875, | |
| "rewards/format_reward": 0.9375, | |
| "rewards/format_reward_staging": 0.9375, | |
| "step": 362 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 547.875, | |
| "epoch": 90.66666666666667, | |
| "grad_norm": 1.4816241418571312, | |
| "kl": 0.065185546875, | |
| "learning_rate": 2.3210631582627927e-08, | |
| "loss": -0.007, | |
| "reward": 4.53125, | |
| "reward_std": 2.563981920480728, | |
| "rewards/accuracy_reward_staging": 0.53125, | |
| "rewards/format_reward": 0.90625, | |
| "rewards/format_reward_staging": 0.96875, | |
| "step": 363 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 646.875, | |
| "epoch": 90.88888888888889, | |
| "grad_norm": 1.4324587884354845, | |
| "kl": 0.0557861328125, | |
| "learning_rate": 2.1982109232821176e-08, | |
| "loss": 0.0456, | |
| "reward": 4.6875, | |
| "reward_std": 2.595020294189453, | |
| "rewards/accuracy_reward_staging": 0.5625, | |
| "rewards/format_reward": 0.90625, | |
| "rewards/format_reward_staging": 0.96875, | |
| "step": 364 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 609.875, | |
| "epoch": 91.22222222222223, | |
| "grad_norm": 1.406708270283001, | |
| "kl": 0.04571533203125, | |
| "learning_rate": 2.0786258770873645e-08, | |
| "loss": -0.0323, | |
| "reward": 5.28125, | |
| "reward_std": 2.358702301979065, | |
| "rewards/accuracy_reward_staging": 0.65625, | |
| "rewards/format_reward": 1.0, | |
| "rewards/format_reward_staging": 1.0, | |
| "step": 365 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 625.96875, | |
| "epoch": 91.44444444444444, | |
| "grad_norm": 1.254136427080868, | |
| "kl": 0.0477294921875, | |
| "learning_rate": 1.9623161931575926e-08, | |
| "loss": 0.0391, | |
| "reward": 4.25, | |
| "reward_std": 1.3912444412708282, | |
| "rewards/accuracy_reward_staging": 0.46875, | |
| "rewards/format_reward": 0.9375, | |
| "rewards/format_reward_staging": 0.96875, | |
| "step": 366 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 584.8125, | |
| "epoch": 91.66666666666667, | |
| "grad_norm": 1.4411232733747643, | |
| "kl": 0.057861328125, | |
| "learning_rate": 1.849289821105199e-08, | |
| "loss": 0.0171, | |
| "reward": 3.125, | |
| "reward_std": 1.5756275057792664, | |
| "rewards/accuracy_reward_staging": 0.25, | |
| "rewards/format_reward": 0.9375, | |
| "rewards/format_reward_staging": 0.9375, | |
| "step": 367 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 629.1875, | |
| "epoch": 91.88888888888889, | |
| "grad_norm": 1.1371389339839404, | |
| "kl": 0.051513671875, | |
| "learning_rate": 1.7395544861325718e-08, | |
| "loss": 0.011, | |
| "reward": 3.53125, | |
| "reward_std": 1.816932737827301, | |
| "rewards/accuracy_reward_staging": 0.3125, | |
| "rewards/format_reward": 0.96875, | |
| "rewards/format_reward_staging": 1.0, | |
| "step": 368 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 615.21875, | |
| "epoch": 92.22222222222223, | |
| "grad_norm": 1.2832104145352503, | |
| "kl": 0.046142578125, | |
| "learning_rate": 1.6331176885040876e-08, | |
| "loss": 0.0567, | |
| "reward": 3.78125, | |
| "reward_std": 1.9511407613754272, | |
| "rewards/accuracy_reward_staging": 0.375, | |
| "rewards/format_reward": 0.9375, | |
| "rewards/format_reward_staging": 0.96875, | |
| "step": 369 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 561.375, | |
| "epoch": 92.44444444444444, | |
| "grad_norm": 1.4397679570391773, | |
| "kl": 0.05340576171875, | |
| "learning_rate": 1.5299867030334813e-08, | |
| "loss": 0.0089, | |
| "reward": 3.1875, | |
| "reward_std": 1.2975594997406006, | |
| "rewards/accuracy_reward_staging": 0.25, | |
| "rewards/format_reward": 0.9375, | |
| "rewards/format_reward_staging": 1.0, | |
| "step": 370 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 547.0625, | |
| "epoch": 92.66666666666667, | |
| "grad_norm": 1.5669540097130739, | |
| "kl": 0.066650390625, | |
| "learning_rate": 1.4301685785866213e-08, | |
| "loss": -0.0198, | |
| "reward": 4.46875, | |
| "reward_std": 2.9167675375938416, | |
| "rewards/accuracy_reward_staging": 0.5, | |
| "rewards/format_reward": 0.96875, | |
| "rewards/format_reward_staging": 1.0, | |
| "step": 371 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 588.1875, | |
| "epoch": 92.88888888888889, | |
| "grad_norm": 1.6359800713030668, | |
| "kl": 0.05194091796875, | |
| "learning_rate": 1.3336701375997127e-08, | |
| "loss": 0.0226, | |
| "reward": 4.1875, | |
| "reward_std": 2.957531690597534, | |
| "rewards/accuracy_reward_staging": 0.4375, | |
| "rewards/format_reward": 1.0, | |
| "rewards/format_reward_staging": 1.0, | |
| "step": 372 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 596.375, | |
| "epoch": 93.22222222222223, | |
| "grad_norm": 22.007037588121534, | |
| "kl": 0.2752685546875, | |
| "learning_rate": 1.240497975613014e-08, | |
| "loss": -0.0325, | |
| "reward": 3.75, | |
| "reward_std": 1.8229495882987976, | |
| "rewards/accuracy_reward_staging": 0.40625, | |
| "rewards/format_reward": 0.8125, | |
| "rewards/format_reward_staging": 0.90625, | |
| "step": 373 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 554.75, | |
| "epoch": 93.44444444444444, | |
| "grad_norm": 1.5726784994170007, | |
| "kl": 0.05316162109375, | |
| "learning_rate": 1.1506584608200364e-08, | |
| "loss": 0.0904, | |
| "reward": 2.75, | |
| "reward_std": 1.680722177028656, | |
| "rewards/accuracy_reward_staging": 0.1875, | |
| "rewards/format_reward": 0.90625, | |
| "rewards/format_reward_staging": 0.90625, | |
| "step": 374 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 597.53125, | |
| "epoch": 93.66666666666667, | |
| "grad_norm": 1.2719220301062524, | |
| "kl": 0.05828857421875, | |
| "learning_rate": 1.0641577336322761e-08, | |
| "loss": 0.0199, | |
| "reward": 4.96875, | |
| "reward_std": 2.2166852056980133, | |
| "rewards/accuracy_reward_staging": 0.625, | |
| "rewards/format_reward": 0.875, | |
| "rewards/format_reward_staging": 0.96875, | |
| "step": 375 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 640.1875, | |
| "epoch": 93.88888888888889, | |
| "grad_norm": 2.653449397011027, | |
| "kl": 0.07366943359375, | |
| "learning_rate": 9.810017062595321e-09, | |
| "loss": 0.0336, | |
| "reward": 4.0, | |
| "reward_std": 2.329674154520035, | |
| "rewards/accuracy_reward_staging": 0.4375, | |
| "rewards/format_reward": 0.875, | |
| "rewards/format_reward_staging": 0.9375, | |
| "step": 376 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 511.21875, | |
| "epoch": 94.22222222222223, | |
| "grad_norm": 1.4821827826071337, | |
| "kl": 0.04779052734375, | |
| "learning_rate": 9.011960623058201e-09, | |
| "loss": -0.0241, | |
| "reward": 4.53125, | |
| "reward_std": 1.9632892608642578, | |
| "rewards/accuracy_reward_staging": 0.53125, | |
| "rewards/format_reward": 0.90625, | |
| "rewards/format_reward_staging": 0.96875, | |
| "step": 377 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 516.34375, | |
| "epoch": 94.44444444444444, | |
| "grad_norm": 1.424328758939937, | |
| "kl": 0.055419921875, | |
| "learning_rate": 8.247462563808816e-09, | |
| "loss": 0.018, | |
| "reward": 4.46875, | |
| "reward_std": 2.4695461988449097, | |
| "rewards/accuracy_reward_staging": 0.5, | |
| "rewards/format_reward": 0.96875, | |
| "rewards/format_reward_staging": 1.0, | |
| "step": 378 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 630.75, | |
| "epoch": 94.66666666666667, | |
| "grad_norm": 1.3627094949939382, | |
| "kl": 0.05291748046875, | |
| "learning_rate": 7.516575137274162e-09, | |
| "loss": 0.05, | |
| "reward": 3.9375, | |
| "reward_std": 2.0698782801628113, | |
| "rewards/accuracy_reward_staging": 0.40625, | |
| "rewards/format_reward": 0.9375, | |
| "rewards/format_reward_staging": 0.96875, | |
| "step": 379 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 632.625, | |
| "epoch": 94.88888888888889, | |
| "grad_norm": 1.1699961416103346, | |
| "kl": 0.05120849609375, | |
| "learning_rate": 6.819348298638839e-09, | |
| "loss": 0.0182, | |
| "reward": 3.1875, | |
| "reward_std": 2.152123808860779, | |
| "rewards/accuracy_reward_staging": 0.25, | |
| "rewards/format_reward": 0.96875, | |
| "rewards/format_reward_staging": 0.96875, | |
| "step": 380 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 618.8125, | |
| "epoch": 95.22222222222223, | |
| "grad_norm": 1.362463533881549, | |
| "kl": 0.06512451171875, | |
| "learning_rate": 6.15582970243117e-09, | |
| "loss": 0.0677, | |
| "reward": 3.375, | |
| "reward_std": 1.7910222113132477, | |
| "rewards/accuracy_reward_staging": 0.3125, | |
| "rewards/format_reward": 0.875, | |
| "rewards/format_reward_staging": 0.9375, | |
| "step": 381 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 551.15625, | |
| "epoch": 95.44444444444444, | |
| "grad_norm": 1.297999563247604, | |
| "kl": 0.06048583984375, | |
| "learning_rate": 5.526064699265753e-09, | |
| "loss": 0.0032, | |
| "reward": 3.96875, | |
| "reward_std": 1.8312554359436035, | |
| "rewards/accuracy_reward_staging": 0.40625, | |
| "rewards/format_reward": 0.96875, | |
| "rewards/format_reward_staging": 0.96875, | |
| "step": 382 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 604.625, | |
| "epoch": 95.66666666666667, | |
| "grad_norm": 1.486555198053525, | |
| "kl": 0.05755615234375, | |
| "learning_rate": 4.9300963327441044e-09, | |
| "loss": 0.043, | |
| "reward": 4.40625, | |
| "reward_std": 3.2432121634483337, | |
| "rewards/accuracy_reward_staging": 0.5, | |
| "rewards/format_reward": 0.9375, | |
| "rewards/format_reward_staging": 0.96875, | |
| "step": 383 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 583.875, | |
| "epoch": 95.88888888888889, | |
| "grad_norm": 1.5255994079961044, | |
| "kl": 0.05120849609375, | |
| "learning_rate": 4.367965336512403e-09, | |
| "loss": -0.0079, | |
| "reward": 3.8125, | |
| "reward_std": 1.9035333096981049, | |
| "rewards/accuracy_reward_staging": 0.375, | |
| "rewards/format_reward": 0.9375, | |
| "rewards/format_reward_staging": 1.0, | |
| "step": 384 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 500.53125, | |
| "epoch": 96.22222222222223, | |
| "grad_norm": 1.3113670847804533, | |
| "kl": 0.05438232421875, | |
| "learning_rate": 3.8397101314774915e-09, | |
| "loss": -0.0184, | |
| "reward": 3.28125, | |
| "reward_std": 1.0818375647068024, | |
| "rewards/accuracy_reward_staging": 0.28125, | |
| "rewards/format_reward": 0.9375, | |
| "rewards/format_reward_staging": 0.9375, | |
| "step": 385 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 554.8125, | |
| "epoch": 96.44444444444444, | |
| "grad_norm": 1.7546175198232294, | |
| "kl": 0.056640625, | |
| "learning_rate": 3.3453668231809283e-09, | |
| "loss": -0.0321, | |
| "reward": 5.5, | |
| "reward_std": 3.5806562304496765, | |
| "rewards/accuracy_reward_staging": 0.71875, | |
| "rewards/format_reward": 0.9375, | |
| "rewards/format_reward_staging": 0.96875, | |
| "step": 386 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 632.9375, | |
| "epoch": 96.66666666666667, | |
| "grad_norm": 1.3830980489663667, | |
| "kl": 0.050537109375, | |
| "learning_rate": 2.8849691993311777e-09, | |
| "loss": 0.0483, | |
| "reward": 3.59375, | |
| "reward_std": 2.2675071954727173, | |
| "rewards/accuracy_reward_staging": 0.34375, | |
| "rewards/format_reward": 0.90625, | |
| "rewards/format_reward_staging": 0.96875, | |
| "step": 387 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 565.0, | |
| "epoch": 96.88888888888889, | |
| "grad_norm": 1.6187360157512092, | |
| "kl": 0.0645751953125, | |
| "learning_rate": 2.458548727494292e-09, | |
| "loss": 0.0672, | |
| "reward": 4.1875, | |
| "reward_std": 2.5254639387130737, | |
| "rewards/accuracy_reward_staging": 0.5, | |
| "rewards/format_reward": 0.78125, | |
| "rewards/format_reward_staging": 0.90625, | |
| "step": 388 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 601.3125, | |
| "epoch": 97.22222222222223, | |
| "grad_norm": 1.513261818035226, | |
| "kl": 0.05316162109375, | |
| "learning_rate": 2.066134552943077e-09, | |
| "loss": -0.054, | |
| "reward": 4.0, | |
| "reward_std": 2.443375587463379, | |
| "rewards/accuracy_reward_staging": 0.40625, | |
| "rewards/format_reward": 0.96875, | |
| "rewards/format_reward_staging": 1.0, | |
| "step": 389 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 618.53125, | |
| "epoch": 97.44444444444444, | |
| "grad_norm": 1.2736528121828654, | |
| "kl": 0.04791259765625, | |
| "learning_rate": 1.7077534966650765e-09, | |
| "loss": 0.0219, | |
| "reward": 5.375, | |
| "reward_std": 2.514360010623932, | |
| "rewards/accuracy_reward_staging": 0.6875, | |
| "rewards/format_reward": 0.9375, | |
| "rewards/format_reward_staging": 1.0, | |
| "step": 390 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 593.1875, | |
| "epoch": 97.66666666666667, | |
| "grad_norm": 1.5869628592439236, | |
| "kl": 0.0782470703125, | |
| "learning_rate": 1.383430053529422e-09, | |
| "loss": -0.01, | |
| "reward": 3.40625, | |
| "reward_std": 1.3685379922389984, | |
| "rewards/accuracy_reward_staging": 0.3125, | |
| "rewards/format_reward": 0.90625, | |
| "rewards/format_reward_staging": 0.9375, | |
| "step": 391 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 646.5, | |
| "epoch": 97.88888888888889, | |
| "grad_norm": 1.3206401873227125, | |
| "kl": 0.052490234375, | |
| "learning_rate": 1.0931863906127325e-09, | |
| "loss": -0.0253, | |
| "reward": 3.65625, | |
| "reward_std": 1.6694981455802917, | |
| "rewards/accuracy_reward_staging": 0.34375, | |
| "rewards/format_reward": 0.96875, | |
| "rewards/format_reward_staging": 0.96875, | |
| "step": 392 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 573.78125, | |
| "epoch": 98.22222222222223, | |
| "grad_norm": 1.8394528163274233, | |
| "kl": 0.0552978515625, | |
| "learning_rate": 8.370423456837139e-10, | |
| "loss": 0.0136, | |
| "reward": 4.625, | |
| "reward_std": 2.260310411453247, | |
| "rewards/accuracy_reward_staging": 0.5625, | |
| "rewards/format_reward": 0.84375, | |
| "rewards/format_reward_staging": 0.96875, | |
| "step": 393 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 621.53125, | |
| "epoch": 98.44444444444444, | |
| "grad_norm": 1.5558051348782826, | |
| "kl": 0.06475830078125, | |
| "learning_rate": 6.150154258476314e-10, | |
| "loss": -0.0687, | |
| "reward": 4.71875, | |
| "reward_std": 2.5687596797943115, | |
| "rewards/accuracy_reward_staging": 0.5625, | |
| "rewards/format_reward": 0.96875, | |
| "rewards/format_reward_staging": 0.9375, | |
| "step": 394 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 595.75, | |
| "epoch": 98.66666666666667, | |
| "grad_norm": 1.1453949481051802, | |
| "kl": 0.048095703125, | |
| "learning_rate": 4.271208063494902e-10, | |
| "loss": -0.0004, | |
| "reward": 2.5625, | |
| "reward_std": 1.1108438968658447, | |
| "rewards/accuracy_reward_staging": 0.125, | |
| "rewards/format_reward": 0.96875, | |
| "rewards/format_reward_staging": 0.96875, | |
| "step": 395 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 612.9375, | |
| "epoch": 98.88888888888889, | |
| "grad_norm": 1.627509813072313, | |
| "kl": 0.05230712890625, | |
| "learning_rate": 2.733713295369755e-10, | |
| "loss": -0.0208, | |
| "reward": 4.375, | |
| "reward_std": 2.401917338371277, | |
| "rewards/accuracy_reward_staging": 0.5, | |
| "rewards/format_reward": 0.9375, | |
| "rewards/format_reward_staging": 0.9375, | |
| "step": 396 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 598.03125, | |
| "epoch": 99.22222222222223, | |
| "grad_norm": 1.3542702485509543, | |
| "kl": 0.0565185546875, | |
| "learning_rate": 1.53777503982655e-10, | |
| "loss": 0.0102, | |
| "reward": 3.3125, | |
| "reward_std": 2.002065122127533, | |
| "rewards/accuracy_reward_staging": 0.28125, | |
| "rewards/format_reward": 0.90625, | |
| "rewards/format_reward_staging": 1.0, | |
| "step": 397 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 589.84375, | |
| "epoch": 99.44444444444444, | |
| "grad_norm": 1.524546929028841, | |
| "kl": 0.06207275390625, | |
| "learning_rate": 6.834750376549791e-11, | |
| "loss": 0.0366, | |
| "reward": 4.625, | |
| "reward_std": 2.895161896944046, | |
| "rewards/accuracy_reward_staging": 0.5625, | |
| "rewards/format_reward": 0.875, | |
| "rewards/format_reward_staging": 0.9375, | |
| "step": 398 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 563.6875, | |
| "epoch": 99.66666666666667, | |
| "grad_norm": 1.4232250672089828, | |
| "kl": 0.05712890625, | |
| "learning_rate": 1.7087167912710476e-11, | |
| "loss": 0.0203, | |
| "reward": 3.875, | |
| "reward_std": 1.875, | |
| "rewards/accuracy_reward_staging": 0.375, | |
| "rewards/format_reward": 1.0, | |
| "rewards/format_reward_staging": 1.0, | |
| "step": 399 | |
| }, | |
| { | |
| "epoch": 99.88888888888889, | |
| "grad_norm": 1.8247807880781484, | |
| "learning_rate": 0.0, | |
| "loss": 0.0974, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 99.88888888888889, | |
| "eval_clip_ratio": 0.0, | |
| "eval_completion_length": 644.175, | |
| "eval_kl": 0.052978515625, | |
| "eval_loss": 0.008646870031952858, | |
| "eval_reward": 2.3, | |
| "eval_reward_std": 1.2995877504348754, | |
| "eval_rewards/accuracy_reward_staging": 0.125, | |
| "eval_rewards/format_reward": 0.8, | |
| "eval_rewards/format_reward_staging": 0.875, | |
| "eval_runtime": 54.9436, | |
| "eval_samples_per_second": 0.655, | |
| "eval_steps_per_second": 0.091, | |
| "step": 400 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 498.375, | |
| "epoch": 99.88888888888889, | |
| "kl": 0.0625, | |
| "reward": 3.28125, | |
| "reward_std": 2.418270230293274, | |
| "rewards/accuracy_reward_staging": 0.28125, | |
| "rewards/format_reward": 0.9375, | |
| "rewards/format_reward_staging": 0.9375, | |
| "step": 400, | |
| "total_flos": 0.0, | |
| "train_loss": 0.01939338302021497, | |
| "train_runtime": 14839.7642, | |
| "train_samples_per_second": 0.243, | |
| "train_steps_per_second": 0.027 | |
| } | |
| ], | |
| "logging_steps": 1, | |
| "max_steps": 400, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 100, | |
| "save_steps": 200, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 0.0, | |
| "train_batch_size": 8, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |