{ "best_metric": null, "best_model_checkpoint": null, "epoch": 100.0, "eval_steps": 10, "global_step": 200, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio": 0.0, "completion_length": 941.515625, "epoch": 0.5, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 5e-08, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward_staging": 0.0, "rewards/format_reward": 0.0, "rewards/format_reward_staging": 0.0, "step": 1 }, { "clip_ratio": 0.0, "completion_length": 957.15625, "epoch": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 1e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward_staging": 0.0, "rewards/format_reward": 0.0, "rewards/format_reward_staging": 0.0, "step": 2 }, { "clip_ratio": 0.0, "completion_length": 947.609375, "epoch": 1.5, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 1.5e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward_staging": 0.0, "rewards/format_reward": 0.0, "rewards/format_reward_staging": 0.0, "step": 3 }, { "clip_ratio": 0.0, "completion_length": 905.96875, "epoch": 2.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 2e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward_staging": 0.0, "rewards/format_reward": 0.0, "rewards/format_reward_staging": 0.0, "step": 4 }, { "clip_ratio": 0.0, "completion_length": 934.28125, "epoch": 2.5, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 2.5e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward_staging": 0.0, "rewards/format_reward": 0.0, "rewards/format_reward_staging": 0.0, "step": 5 }, { "clip_ratio": 0.0, "completion_length": 842.84375, "epoch": 3.0, "grad_norm": 0.4104181280494549, "kl": 0.0, "learning_rate": 3e-07, "loss": 0.0097, "reward": 0.015625, "reward_std": 0.0625, "rewards/accuracy_reward_staging": 0.0, "rewards/format_reward": 0.0, "rewards/format_reward_staging": 0.015625, "step": 6 }, { "clip_ratio": 0.0, "completion_length": 1060.203125, "epoch": 3.5, "grad_norm": 0.006270841170194027, "kl": 0.0004673004150390625, "learning_rate": 3.5e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward_staging": 0.0, "rewards/format_reward": 0.0, "rewards/format_reward_staging": 0.0, "step": 7 }, { "clip_ratio": 0.0, "completion_length": 846.78125, "epoch": 4.0, "grad_norm": 0.18416362793200072, "kl": 0.0004229545593261719, "learning_rate": 4e-07, "loss": 0.0438, "reward": 0.015625, "reward_std": 0.0625, "rewards/accuracy_reward_staging": 0.0, "rewards/format_reward": 0.0, "rewards/format_reward_staging": 0.015625, "step": 8 }, { "clip_ratio": 0.0, "completion_length": 939.875, "epoch": 4.5, "grad_norm": 0.004467489660870973, "kl": 0.00043773651123046875, "learning_rate": 4.5e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward_staging": 0.0, "rewards/format_reward": 0.0, "rewards/format_reward_staging": 0.0, "step": 9 }, { "epoch": 5.0, "grad_norm": 0.003060223517037833, "learning_rate": 5e-07, "loss": 0.0, "step": 10 }, { "epoch": 5.0, "eval_clip_ratio": 0.0, "eval_completion_length": 914.8203125, "eval_kl": 0.0004506111145019531, "eval_loss": 0.0222869124263525, "eval_reward": 0.0078125, "eval_reward_std": 0.03125, "eval_rewards/accuracy_reward_staging": 0.0, "eval_rewards/format_reward": 0.0, "eval_rewards/format_reward_staging": 0.0078125, "eval_runtime": 102.8292, "eval_samples_per_second": 0.078, "eval_steps_per_second": 0.01, "step": 10 }, { "clip_ratio": 0.0, "completion_length": 931.875, "epoch": 5.5, "grad_norm": 0.0029719679687620396, "kl": 0.0004215240478515625, "learning_rate": 5.5e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward_staging": 0.0, "rewards/format_reward": 0.0, "rewards/format_reward_staging": 0.0, "step": 11 }, { "clip_ratio": 0.0, "completion_length": 840.71875, "epoch": 6.0, "grad_norm": 0.003151997120265335, "kl": 0.00043010711669921875, "learning_rate": 6e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward_staging": 0.0, "rewards/format_reward": 0.0, "rewards/format_reward_staging": 0.0, "step": 12 }, { "clip_ratio": 0.0, "completion_length": 1021.125, "epoch": 6.5, "grad_norm": 0.00293567226380183, "kl": 0.00043010711669921875, "learning_rate": 6.5e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward_staging": 0.0, "rewards/format_reward": 0.0, "rewards/format_reward_staging": 0.0, "step": 13 }, { "clip_ratio": 0.0, "completion_length": 901.53125, "epoch": 7.0, "grad_norm": 0.0029155273721110384, "kl": 0.0004425048828125, "learning_rate": 7e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward_staging": 0.0, "rewards/format_reward": 0.0, "rewards/format_reward_staging": 0.0, "step": 14 }, { "clip_ratio": 0.0, "completion_length": 882.046875, "epoch": 7.5, "grad_norm": 0.24400359215564857, "kl": 0.00048542022705078125, "learning_rate": 7.5e-07, "loss": 0.0581, "reward": 0.015625, "reward_std": 0.0625, "rewards/accuracy_reward_staging": 0.0, "rewards/format_reward": 0.0, "rewards/format_reward_staging": 0.015625, "step": 15 }, { "clip_ratio": 0.0, "completion_length": 957.84375, "epoch": 8.0, "grad_norm": 0.0030236272302122703, "kl": 0.0004591941833496094, "learning_rate": 8e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward_staging": 0.0, "rewards/format_reward": 0.0, "rewards/format_reward_staging": 0.0, "step": 16 }, { "clip_ratio": 0.0, "completion_length": 878.1875, "epoch": 8.5, "grad_norm": 0.005634063780769768, "kl": 0.0004887580871582031, "learning_rate": 8.499999999999999e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward_staging": 0.0, "rewards/format_reward": 0.0, "rewards/format_reward_staging": 0.0, "step": 17 }, { "clip_ratio": 0.0, "completion_length": 840.09375, "epoch": 9.0, "grad_norm": 0.0033459555512032678, "kl": 0.0004935264587402344, "learning_rate": 9e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward_staging": 0.0, "rewards/format_reward": 0.0, "rewards/format_reward_staging": 0.0, "step": 18 }, { "clip_ratio": 0.0, "completion_length": 949.34375, "epoch": 9.5, "grad_norm": 0.1871077676079831, "kl": 0.00047397613525390625, "learning_rate": 9.499999999999999e-07, "loss": 0.0603, "reward": 0.015625, "reward_std": 0.0625, "rewards/accuracy_reward_staging": 0.0, "rewards/format_reward": 0.0, "rewards/format_reward_staging": 0.015625, "step": 19 }, { "epoch": 10.0, "grad_norm": 0.48673171940966303, "learning_rate": 1e-06, "loss": 0.0174, "step": 20 }, { "epoch": 10.0, "eval_clip_ratio": 0.0, "eval_completion_length": 883.921875, "eval_kl": 0.00046753883361816406, "eval_loss": 1.7391932487953454e-05, "eval_reward": 0.0, "eval_reward_std": 0.0, "eval_rewards/accuracy_reward_staging": 0.0, "eval_rewards/format_reward": 0.0, "eval_rewards/format_reward_staging": 0.0, "eval_runtime": 98.7699, "eval_samples_per_second": 0.081, "eval_steps_per_second": 0.01, "step": 20 }, { "clip_ratio": 0.0, "completion_length": 912.484375, "epoch": 10.5, "grad_norm": 0.3321658665461902, "kl": 0.0004425048828125, "learning_rate": 9.99931462820376e-07, "loss": 0.0141, "reward": 0.015625, "reward_std": 0.0625, "rewards/accuracy_reward_staging": 0.0, "rewards/format_reward": 0.0, "rewards/format_reward_staging": 0.015625, "step": 21 }, { "clip_ratio": 0.0, "completion_length": 749.5625, "epoch": 11.0, "grad_norm": 0.003786135169019501, "kl": 0.00048828125, "learning_rate": 9.997258721585931e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward_staging": 0.0, "rewards/format_reward": 0.0, "rewards/format_reward_staging": 0.0, "step": 22 }, { "clip_ratio": 0.0, "completion_length": 927.234375, "epoch": 11.5, "grad_norm": 0.002638559729174169, "kl": 0.0004210472106933594, "learning_rate": 9.993832906395582e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward_staging": 0.0, "rewards/format_reward": 0.0, "rewards/format_reward_staging": 0.0, "step": 23 }, { "clip_ratio": 0.0, "completion_length": 847.6875, "epoch": 12.0, "grad_norm": 0.0037353511628820143, "kl": 0.0004801750183105469, "learning_rate": 9.989038226169207e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward_staging": 0.0, "rewards/format_reward": 0.0, "rewards/format_reward_staging": 0.0, "step": 24 }, { "clip_ratio": 0.0, "completion_length": 937.25, "epoch": 12.5, "grad_norm": 0.0041230093967703964, "kl": 0.00046634674072265625, "learning_rate": 9.982876141412855e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward_staging": 0.0, "rewards/format_reward": 0.0, "rewards/format_reward_staging": 0.0, "step": 25 }, { "clip_ratio": 0.0, "completion_length": 936.625, "epoch": 13.0, "grad_norm": 0.0048191989969733635, "kl": 0.0005083084106445312, "learning_rate": 9.975348529157229e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward_staging": 0.0, "rewards/format_reward": 0.0, "rewards/format_reward_staging": 0.0, "step": 26 }, { "clip_ratio": 0.0, "completion_length": 860.578125, "epoch": 13.5, "grad_norm": 0.004235845464067776, "kl": 0.00048828125, "learning_rate": 9.96645768238595e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward_staging": 0.0, "rewards/format_reward": 0.0, "rewards/format_reward_staging": 0.0, "step": 27 }, { "clip_ratio": 0.0, "completion_length": 842.3125, "epoch": 14.0, "grad_norm": 0.0035023722132941533, "kl": 0.0004673004150390625, "learning_rate": 9.956206309337066e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward_staging": 0.0, "rewards/format_reward": 0.0, "rewards/format_reward_staging": 0.0, "step": 28 }, { "clip_ratio": 0.0, "completion_length": 876.859375, "epoch": 14.5, "grad_norm": 0.0054294046901942426, "kl": 0.0005173683166503906, "learning_rate": 9.944597532678119e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward_staging": 0.0, "rewards/format_reward": 0.0, "rewards/format_reward_staging": 0.0, "step": 29 }, { "epoch": 15.0, "grad_norm": 0.0031965438782979136, "learning_rate": 9.931634888554935e-07, "loss": 0.0, "step": 30 }, { "epoch": 15.0, "eval_clip_ratio": 0.0, "eval_completion_length": 986.3515625, "eval_kl": 0.0004820823669433594, "eval_loss": 1.6995334590319544e-05, "eval_reward": 0.0, "eval_reward_std": 0.0, "eval_rewards/accuracy_reward_staging": 0.0, "eval_rewards/format_reward": 0.0, "eval_rewards/format_reward_staging": 0.0, "eval_runtime": 108.5726, "eval_samples_per_second": 0.074, "eval_steps_per_second": 0.009, "step": 30 }, { "clip_ratio": 0.0, "completion_length": 900.5234375, "epoch": 15.5, "grad_norm": 0.004036025288001792, "kl": 0.000476837158203125, "learning_rate": 9.917322325514487e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward_staging": 0.0, "rewards/format_reward": 0.0, "rewards/format_reward_staging": 0.0, "step": 31 }, { "clip_ratio": 0.0, "completion_length": 934.3125, "epoch": 16.0, "grad_norm": 0.006195144986033682, "kl": 0.0006299018859863281, "learning_rate": 9.901664203302124e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward_staging": 0.0, "rewards/format_reward": 0.0, "rewards/format_reward_staging": 0.0, "step": 32 }, { "clip_ratio": 0.0, "completion_length": 920.390625, "epoch": 16.5, "grad_norm": 0.0059456824839291, "kl": 0.0005106925964355469, "learning_rate": 9.88466529153356e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward_staging": 0.0, "rewards/format_reward": 0.0, "rewards/format_reward_staging": 0.0, "step": 33 }, { "clip_ratio": 0.0, "completion_length": 879.125, "epoch": 17.0, "grad_norm": 0.21435517887811947, "kl": 0.000614166259765625, "learning_rate": 9.866330768241983e-07, "loss": 0.0454, "reward": 0.015625, "reward_std": 0.0625, "rewards/accuracy_reward_staging": 0.0, "rewards/format_reward": 0.0, "rewards/format_reward_staging": 0.015625, "step": 34 }, { "clip_ratio": 0.0, "completion_length": 871.34375, "epoch": 17.5, "grad_norm": 0.0066464547863155045, "kl": 0.0006079673767089844, "learning_rate": 9.846666218300807e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward_staging": 0.0, "rewards/format_reward": 0.0, "rewards/format_reward_staging": 0.0, "step": 35 }, { "clip_ratio": 0.0, "completion_length": 802.6875, "epoch": 18.0, "grad_norm": 0.2319706301168723, "kl": 0.0005307197570800781, "learning_rate": 9.825677631722435e-07, "loss": 0.0365, "reward": 0.015625, "reward_std": 0.0625, "rewards/accuracy_reward_staging": 0.0, "rewards/format_reward": 0.0, "rewards/format_reward_staging": 0.015625, "step": 36 }, { "clip_ratio": 0.0, "completion_length": 898.46875, "epoch": 18.5, "grad_norm": 0.007249327675941041, "kl": 0.0006723403930664062, "learning_rate": 9.80337140183366e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward_staging": 0.0, "rewards/format_reward": 0.0, "rewards/format_reward_staging": 0.0, "step": 37 }, { "clip_ratio": 0.0, "completion_length": 902.125, "epoch": 19.0, "grad_norm": 0.007027408347733654, "kl": 0.0005617141723632812, "learning_rate": 9.779754323328192e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward_staging": 0.0, "rewards/format_reward": 0.0, "rewards/format_reward_staging": 0.0, "step": 38 }, { "clip_ratio": 0.0, "completion_length": 876.8125, "epoch": 19.5, "grad_norm": 0.24622076919830327, "kl": 0.0005426406860351562, "learning_rate": 9.754833590196926e-07, "loss": 0.0332, "reward": 0.015625, "reward_std": 0.0625, "rewards/accuracy_reward_staging": 0.0, "rewards/format_reward": 0.0, "rewards/format_reward_staging": 0.015625, "step": 39 }, { "epoch": 20.0, "grad_norm": 0.0061991261734385215, "learning_rate": 9.728616793536587e-07, "loss": 0.0, "step": 40 }, { "epoch": 20.0, "eval_clip_ratio": 0.0, "eval_completion_length": 896.96875, "eval_kl": 0.0005970001220703125, "eval_loss": 2.167217098758556e-05, "eval_reward": 0.0, "eval_reward_std": 0.0, "eval_rewards/accuracy_reward_staging": 0.0, "eval_rewards/format_reward": 0.0, "eval_rewards/format_reward_staging": 0.0, "eval_runtime": 103.8756, "eval_samples_per_second": 0.077, "eval_steps_per_second": 0.01, "step": 40 }, { "clip_ratio": 0.0, "completion_length": 826.8203125, "epoch": 20.5, "grad_norm": 0.36000165453483507, "kl": 0.0005981922149658203, "learning_rate": 9.701111919237408e-07, "loss": 0.0126, "reward": 0.0078125, "reward_std": 0.03125, "rewards/accuracy_reward_staging": 0.0, "rewards/format_reward": 0.0, "rewards/format_reward_staging": 0.0078125, "step": 41 }, { "clip_ratio": 0.0, "completion_length": 848.5, "epoch": 21.0, "grad_norm": 0.010730892722707015, "kl": 0.0006737709045410156, "learning_rate": 9.672327345550543e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward_staging": 0.0, "rewards/format_reward": 0.0, "rewards/format_reward_staging": 0.0, "step": 42 }, { "clip_ratio": 0.0, "completion_length": 815.03125, "epoch": 21.5, "grad_norm": 0.4479793536674205, "kl": 0.00072479248046875, "learning_rate": 9.64227184053598e-07, "loss": 0.0694, "reward": 0.03125, "reward_std": 0.125, "rewards/accuracy_reward_staging": 0.0, "rewards/format_reward": 0.0, "rewards/format_reward_staging": 0.03125, "step": 43 }, { "clip_ratio": 0.0, "completion_length": 1039.59375, "epoch": 22.0, "grad_norm": 0.005097432717583717, "kl": 0.0005826950073242188, "learning_rate": 9.610954559391704e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward_staging": 0.0, "rewards/format_reward": 0.0, "rewards/format_reward_staging": 0.0, "step": 44 }, { "clip_ratio": 0.0, "completion_length": 913.5, "epoch": 22.5, "grad_norm": 0.005095664845686755, "kl": 0.0005693435668945312, "learning_rate": 9.578385041664925e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward_staging": 0.0, "rewards/format_reward": 0.0, "rewards/format_reward_staging": 0.0, "step": 45 }, { "clip_ratio": 0.0, "completion_length": 982.3125, "epoch": 23.0, "grad_norm": 0.008066078534943774, "kl": 0.0007781982421875, "learning_rate": 9.54457320834625e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward_staging": 0.0, "rewards/format_reward": 0.0, "rewards/format_reward_staging": 0.0, "step": 46 }, { "clip_ratio": 0.0, "completion_length": 940.015625, "epoch": 23.5, "grad_norm": 0.007637537595000865, "kl": 0.0008058547973632812, "learning_rate": 9.509529358847654e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward_staging": 0.0, "rewards/format_reward": 0.0, "rewards/format_reward_staging": 0.0, "step": 47 }, { "clip_ratio": 0.0, "completion_length": 968.125, "epoch": 24.0, "grad_norm": 0.3097871453627547, "kl": 0.0007429122924804688, "learning_rate": 9.473264167865171e-07, "loss": 0.0144, "reward": 0.015625, "reward_std": 0.0625, "rewards/accuracy_reward_staging": 0.0, "rewards/format_reward": 0.0, "rewards/format_reward_staging": 0.015625, "step": 48 }, { "clip_ratio": 0.0, "completion_length": 1062.859375, "epoch": 24.5, "grad_norm": 0.005902735175410845, "kl": 0.0006656646728515625, "learning_rate": 9.43578868212728e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward_staging": 0.0, "rewards/format_reward": 0.0, "rewards/format_reward_staging": 0.0, "step": 49 }, { "epoch": 25.0, "grad_norm": 0.007727395228257341, "learning_rate": 9.397114317029974e-07, "loss": 0.0, "step": 50 }, { "epoch": 25.0, "eval_clip_ratio": 0.0, "eval_completion_length": 896.96875, "eval_kl": 0.000972747802734375, "eval_loss": 2.9567998353741132e-05, "eval_reward": 0.0, "eval_reward_std": 0.0, "eval_rewards/accuracy_reward_staging": 0.0, "eval_rewards/format_reward": 0.0, "eval_rewards/format_reward_staging": 0.0, "eval_runtime": 112.2758, "eval_samples_per_second": 0.071, "eval_steps_per_second": 0.009, "step": 50 }, { "clip_ratio": 0.0, "completion_length": 945.0, "epoch": 25.5, "grad_norm": 0.008522169760744136, "kl": 0.0008330345153808594, "learning_rate": 9.357252853159505e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward_staging": 0.0, "rewards/format_reward": 0.0, "rewards/format_reward_staging": 0.0, "step": 51 }, { "clip_ratio": 0.0, "completion_length": 916.71875, "epoch": 26.0, "grad_norm": 0.007229054230337371, "kl": 0.0007457733154296875, "learning_rate": 9.316216432703916e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward_staging": 0.0, "rewards/format_reward": 0.0, "rewards/format_reward_staging": 0.0, "step": 52 }, { "clip_ratio": 0.0, "completion_length": 848.4375, "epoch": 26.5, "grad_norm": 0.009711666023450752, "kl": 0.00109100341796875, "learning_rate": 9.274017555754407e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward_staging": 0.0, "rewards/format_reward": 0.0, "rewards/format_reward_staging": 0.0, "step": 53 }, { "clip_ratio": 0.0, "completion_length": 1112.53125, "epoch": 27.0, "grad_norm": 0.007287564769621722, "kl": 0.0007829666137695312, "learning_rate": 9.230669076497687e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward_staging": 0.0, "rewards/format_reward": 0.0, "rewards/format_reward_staging": 0.0, "step": 54 }, { "clip_ratio": 0.0, "completion_length": 901.5625, "epoch": 27.5, "grad_norm": 0.010695472961060359, "kl": 0.0011224746704101562, "learning_rate": 9.186184199300463e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward_staging": 0.0, "rewards/format_reward": 0.0, "rewards/format_reward_staging": 0.0, "step": 55 }, { "clip_ratio": 0.0, "completion_length": 1000.40625, "epoch": 28.0, "grad_norm": 0.012234323967281642, "kl": 0.0009059906005859375, "learning_rate": 9.140576474687263e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward_staging": 0.0, "rewards/format_reward": 0.0, "rewards/format_reward_staging": 0.0, "step": 56 }, { "clip_ratio": 0.0, "completion_length": 1010.1875, "epoch": 28.5, "grad_norm": 0.009177302889350094, "kl": 0.0009927749633789062, "learning_rate": 9.093859795212817e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward_staging": 0.0, "rewards/format_reward": 0.0, "rewards/format_reward_staging": 0.0, "step": 57 }, { "clip_ratio": 0.0, "completion_length": 910.34375, "epoch": 29.0, "grad_norm": 0.011497131889061145, "kl": 0.0014095306396484375, "learning_rate": 9.046048391230247e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward_staging": 0.0, "rewards/format_reward": 0.0, "rewards/format_reward_staging": 0.0, "step": 58 }, { "clip_ratio": 0.0, "completion_length": 846.640625, "epoch": 29.5, "grad_norm": 0.009586334839630907, "kl": 0.001155853271484375, "learning_rate": 8.997156826556369e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward_staging": 0.0, "rewards/format_reward": 0.0, "rewards/format_reward_staging": 0.0, "step": 59 }, { "epoch": 30.0, "grad_norm": 0.01647829222650522, "learning_rate": 8.9471999940354e-07, "loss": 0.0, "step": 60 }, { "epoch": 30.0, "eval_clip_ratio": 0.0, "eval_completion_length": 907.8828125, "eval_kl": 0.0010533332824707031, "eval_loss": 0.021646613255143166, "eval_reward": 0.0078125, "eval_reward_std": 0.03125, "eval_rewards/accuracy_reward_staging": 0.0, "eval_rewards/format_reward": 0.0, "eval_rewards/format_reward_staging": 0.0078125, "eval_runtime": 107.6329, "eval_samples_per_second": 0.074, "eval_steps_per_second": 0.009, "step": 60 }, { "clip_ratio": 0.0, "completion_length": 920.4375, "epoch": 30.5, "grad_norm": 0.0168591367199892, "kl": 0.0012812614440917969, "learning_rate": 8.896193111002475e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward_staging": 0.0, "rewards/format_reward": 0.0, "rewards/format_reward_staging": 0.0, "step": 61 }, { "clip_ratio": 0.0, "completion_length": 970.28125, "epoch": 31.0, "grad_norm": 0.008710794047022944, "kl": 0.0010786056518554688, "learning_rate": 8.844151714648274e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward_staging": 0.0, "rewards/format_reward": 0.0, "rewards/format_reward_staging": 0.0, "step": 62 }, { "clip_ratio": 0.0, "completion_length": 957.59375, "epoch": 31.5, "grad_norm": 0.012243414806160322, "kl": 0.0009698867797851562, "learning_rate": 8.791091657286267e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward_staging": 0.0, "rewards/format_reward": 0.0, "rewards/format_reward_staging": 0.0, "step": 63 }, { "clip_ratio": 0.0, "completion_length": 855.59375, "epoch": 32.0, "grad_norm": 0.015193617397552813, "kl": 0.001148223876953125, "learning_rate": 8.737029101523929e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward_staging": 0.0, "rewards/format_reward": 0.0, "rewards/format_reward_staging": 0.0, "step": 64 }, { "clip_ratio": 0.0, "completion_length": 1009.546875, "epoch": 32.5, "grad_norm": 0.007369522915137431, "kl": 0.0008792877197265625, "learning_rate": 8.681980515339463e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward_staging": 0.0, "rewards/format_reward": 0.0, "rewards/format_reward_staging": 0.0, "step": 65 }, { "clip_ratio": 0.0, "completion_length": 868.84375, "epoch": 33.0, "grad_norm": 0.015494723417675436, "kl": 0.0012235641479492188, "learning_rate": 8.625962667065487e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward_staging": 0.0, "rewards/format_reward": 0.0, "rewards/format_reward_staging": 0.0, "step": 66 }, { "clip_ratio": 0.0, "completion_length": 877.046875, "epoch": 33.5, "grad_norm": 0.022596736280910343, "kl": 0.0012197494506835938, "learning_rate": 8.568992620281243e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward_staging": 0.0, "rewards/format_reward": 0.0, "rewards/format_reward_staging": 0.0, "step": 67 }, { "clip_ratio": 0.0, "completion_length": 1001.34375, "epoch": 34.0, "grad_norm": 0.005352216682577747, "kl": 0.0008544921875, "learning_rate": 8.511087728614862e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward_staging": 0.0, "rewards/format_reward": 0.0, "rewards/format_reward_staging": 0.0, "step": 68 }, { "clip_ratio": 0.0, "completion_length": 924.84375, "epoch": 34.5, "grad_norm": 0.005953464155512391, "kl": 0.000751495361328125, "learning_rate": 8.452265630457282e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward_staging": 0.0, "rewards/format_reward": 0.0, "rewards/format_reward_staging": 0.0, "step": 69 }, { "epoch": 35.0, "grad_norm": 0.016277308014848297, "learning_rate": 8.392544243589427e-07, "loss": 0.0, "step": 70 }, { "epoch": 35.0, "eval_clip_ratio": 0.0, "eval_completion_length": 864.765625, "eval_kl": 0.00107574462890625, "eval_loss": 3.6555644328473136e-05, "eval_reward": 0.0, "eval_reward_std": 0.0, "eval_rewards/accuracy_reward_staging": 0.0, "eval_rewards/format_reward": 0.0, "eval_rewards/format_reward_staging": 0.0, "eval_runtime": 103.6058, "eval_samples_per_second": 0.077, "eval_steps_per_second": 0.01, "step": 70 }, { "clip_ratio": 0.0, "completion_length": 888.2890625, "epoch": 35.5, "grad_norm": 0.00851896616755137, "kl": 0.0011534690856933594, "learning_rate": 8.331941759724268e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward_staging": 0.0, "rewards/format_reward": 0.0, "rewards/format_reward_staging": 0.0, "step": 71 }, { "clip_ratio": 0.0, "completion_length": 894.71875, "epoch": 36.0, "grad_norm": 0.007086845304726659, "kl": 0.0008459091186523438, "learning_rate": 8.270476638965461e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward_staging": 0.0, "rewards/format_reward": 0.0, "rewards/format_reward_staging": 0.0, "step": 72 }, { "clip_ratio": 0.0, "completion_length": 987.375, "epoch": 36.5, "grad_norm": 0.005738334037241088, "kl": 0.0007448196411132812, "learning_rate": 8.208167604184217e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward_staging": 0.0, "rewards/format_reward": 0.0, "rewards/format_reward_staging": 0.0, "step": 73 }, { "clip_ratio": 0.0, "completion_length": 812.09375, "epoch": 37.0, "grad_norm": 0.01060348535065019, "kl": 0.00112152099609375, "learning_rate": 8.145033635316128e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward_staging": 0.0, "rewards/format_reward": 0.0, "rewards/format_reward_staging": 0.0, "step": 74 }, { "clip_ratio": 0.0, "completion_length": 858.890625, "epoch": 37.5, "grad_norm": 0.25436864848634955, "kl": 0.0011377334594726562, "learning_rate": 8.081093963579707e-07, "loss": 0.0323, "reward": 0.015625, "reward_std": 0.0625, "rewards/accuracy_reward_staging": 0.0, "rewards/format_reward": 0.0, "rewards/format_reward_staging": 0.015625, "step": 75 }, { "clip_ratio": 0.0, "completion_length": 940.40625, "epoch": 38.0, "grad_norm": 0.006724100368173609, "kl": 0.0008859634399414062, "learning_rate": 8.01636806561836e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward_staging": 0.0, "rewards/format_reward": 0.0, "rewards/format_reward_staging": 0.0, "step": 76 }, { "clip_ratio": 0.0, "completion_length": 895.34375, "epoch": 38.5, "grad_norm": 0.008819033954111552, "kl": 0.0010356903076171875, "learning_rate": 7.950875657567621e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward_staging": 0.0, "rewards/format_reward": 0.0, "rewards/format_reward_staging": 0.0, "step": 77 }, { "clip_ratio": 0.0, "completion_length": 974.9375, "epoch": 39.0, "grad_norm": 0.013437649890405814, "kl": 0.0010128021240234375, "learning_rate": 7.884636689049422e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward_staging": 0.0, "rewards/format_reward": 0.0, "rewards/format_reward_staging": 0.0, "step": 78 }, { "clip_ratio": 0.0, "completion_length": 868.8125, "epoch": 39.5, "grad_norm": 0.36102678136071253, "kl": 0.00128936767578125, "learning_rate": 7.817671337095244e-07, "loss": 0.0071, "reward": 0.015625, "reward_std": 0.0625, "rewards/accuracy_reward_staging": 0.0, "rewards/format_reward": 0.0, "rewards/format_reward_staging": 0.015625, "step": 79 }, { "epoch": 40.0, "grad_norm": 0.005225754916645614, "learning_rate": 7.75e-07, "loss": 0.0, "step": 80 }, { "epoch": 40.0, "eval_clip_ratio": 0.0, "eval_completion_length": 863.53125, "eval_kl": 0.0010199546813964844, "eval_loss": 3.5483633837429807e-05, "eval_reward": 0.0, "eval_reward_std": 0.0, "eval_rewards/accuracy_reward_staging": 0.0, "eval_rewards/format_reward": 0.0, "eval_rewards/format_reward_staging": 0.0, "eval_runtime": 93.6372, "eval_samples_per_second": 0.085, "eval_steps_per_second": 0.011, "step": 80 }, { "clip_ratio": 0.0, "completion_length": 945.6015625, "epoch": 40.5, "grad_norm": 0.3853250090014461, "kl": 0.0010247230529785156, "learning_rate": 7.681643291108517e-07, "loss": 0.0662, "reward": 0.015625, "reward_std": 0.0625, "rewards/accuracy_reward_staging": 0.0, "rewards/format_reward": 0.0, "rewards/format_reward_staging": 0.015625, "step": 81 }, { "clip_ratio": 0.0, "completion_length": 928.375, "epoch": 41.0, "grad_norm": 0.005636824414465673, "kl": 0.0008144378662109375, "learning_rate": 7.612622032536507e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward_staging": 0.0, "rewards/format_reward": 0.0, "rewards/format_reward_staging": 0.0, "step": 82 }, { "clip_ratio": 0.0, "completion_length": 928.90625, "epoch": 41.5, "grad_norm": 0.009695421562075205, "kl": 0.001068115234375, "learning_rate": 7.54295724882796e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward_staging": 0.0, "rewards/format_reward": 0.0, "rewards/format_reward_staging": 0.0, "step": 83 }, { "clip_ratio": 0.0, "completion_length": 963.625, "epoch": 42.0, "grad_norm": 0.009278458906845803, "kl": 0.0011796951293945312, "learning_rate": 7.472670160550848e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward_staging": 0.0, "rewards/format_reward": 0.0, "rewards/format_reward_staging": 0.0, "step": 84 }, { "clip_ratio": 0.0, "completion_length": 971.390625, "epoch": 42.5, "grad_norm": 0.007871724093899331, "kl": 0.0010309219360351562, "learning_rate": 7.401782177833147e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward_staging": 0.0, "rewards/format_reward": 0.0, "rewards/format_reward_staging": 0.0, "step": 85 }, { "clip_ratio": 0.0, "completion_length": 665.90625, "epoch": 43.0, "grad_norm": 0.2548433246597271, "kl": 0.0026493072509765625, "learning_rate": 7.330314893841101e-07, "loss": 0.0591, "reward": 0.015625, "reward_std": 0.0625, "rewards/accuracy_reward_staging": 0.0, "rewards/format_reward": 0.0, "rewards/format_reward_staging": 0.015625, "step": 86 }, { "clip_ratio": 0.0, "completion_length": 832.6875, "epoch": 43.5, "grad_norm": 0.008654348640434956, "kl": 0.0010986328125, "learning_rate": 7.258290078201731e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward_staging": 0.0, "rewards/format_reward": 0.0, "rewards/format_reward_staging": 0.0, "step": 87 }, { "clip_ratio": 0.0, "completion_length": 971.375, "epoch": 44.0, "grad_norm": 0.23717723076741856, "kl": 0.0013666152954101562, "learning_rate": 7.185729670371604e-07, "loss": 0.0375, "reward": 0.015625, "reward_std": 0.0625, "rewards/accuracy_reward_staging": 0.0, "rewards/format_reward": 0.0, "rewards/format_reward_staging": 0.015625, "step": 88 }, { "clip_ratio": 0.0, "completion_length": 962.234375, "epoch": 44.5, "grad_norm": 0.015377158658563683, "kl": 0.0014400482177734375, "learning_rate": 7.11265577295385e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward_staging": 0.0, "rewards/format_reward": 0.0, "rewards/format_reward_staging": 0.0, "step": 89 }, { "epoch": 45.0, "grad_norm": 0.008932566857599763, "learning_rate": 7.039090644965509e-07, "loss": 0.0, "step": 90 }, { "epoch": 45.0, "eval_clip_ratio": 0.0, "eval_completion_length": 926.8125, "eval_kl": 0.001495361328125, "eval_loss": 5.114699524710886e-05, "eval_reward": 0.0, "eval_reward_std": 0.0, "eval_rewards/accuracy_reward_staging": 0.0, "eval_rewards/format_reward": 0.0, "eval_rewards/format_reward_staging": 0.0, "eval_runtime": 99.6982, "eval_samples_per_second": 0.08, "eval_steps_per_second": 0.01, "step": 90 }, { "clip_ratio": 0.0, "completion_length": 893.046875, "epoch": 45.5, "grad_norm": 0.010366882749113264, "kl": 0.0011758804321289062, "learning_rate": 6.965056695057204e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward_staging": 0.0, "rewards/format_reward": 0.0, "rewards/format_reward_staging": 0.0, "step": 91 }, { "clip_ratio": 0.0, "completion_length": 904.78125, "epoch": 46.0, "grad_norm": 0.02599396238517655, "kl": 0.0017518997192382812, "learning_rate": 6.890576474687263e-07, "loss": 0.0001, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward_staging": 0.0, "rewards/format_reward": 0.0, "rewards/format_reward_staging": 0.0, "step": 92 }, { "clip_ratio": 0.0, "completion_length": 848.65625, "epoch": 46.5, "grad_norm": 0.5691502068122966, "kl": 0.0011138916015625, "learning_rate": 6.815672671252315e-07, "loss": -0.1031, "reward": 0.015625, "reward_std": 0.0625, "rewards/accuracy_reward_staging": 0.0, "rewards/format_reward": 0.0, "rewards/format_reward_staging": 0.015625, "step": 93 }, { "clip_ratio": 0.0, "completion_length": 896.9375, "epoch": 47.0, "grad_norm": 0.011737006406008286, "kl": 0.00153350830078125, "learning_rate": 6.740368101176495e-07, "loss": 0.0001, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward_staging": 0.0, "rewards/format_reward": 0.0, "rewards/format_reward_staging": 0.0, "step": 94 }, { "clip_ratio": 0.0, "completion_length": 845.0625, "epoch": 47.5, "grad_norm": 0.387460398723794, "kl": 0.0025997161865234375, "learning_rate": 6.664685702961344e-07, "loss": 0.1011, "reward": 0.046875, "reward_std": 0.14789125323295593, "rewards/accuracy_reward_staging": 0.0, "rewards/format_reward": 0.0, "rewards/format_reward_staging": 0.046875, "step": 95 }, { "clip_ratio": 0.0, "completion_length": 825.6875, "epoch": 48.0, "grad_norm": 0.3148662082343754, "kl": 0.0015659332275390625, "learning_rate": 6.588648530198504e-07, "loss": -0.0819, "reward": 0.015625, "reward_std": 0.0625, "rewards/accuracy_reward_staging": 0.0, "rewards/format_reward": 0.0, "rewards/format_reward_staging": 0.015625, "step": 96 }, { "clip_ratio": 0.0, "completion_length": 836.265625, "epoch": 48.5, "grad_norm": 0.010907755471241348, "kl": 0.0014362335205078125, "learning_rate": 6.512279744547392e-07, "loss": 0.0001, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward_staging": 0.0, "rewards/format_reward": 0.0, "rewards/format_reward_staging": 0.0, "step": 97 }, { "clip_ratio": 0.0, "completion_length": 875.25, "epoch": 49.0, "grad_norm": 0.012995813252816308, "kl": 0.0014848709106445312, "learning_rate": 6.435602608679916e-07, "loss": 0.0001, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward_staging": 0.0, "rewards/format_reward": 0.0, "rewards/format_reward_staging": 0.0, "step": 98 }, { "clip_ratio": 0.0, "completion_length": 940.625, "epoch": 49.5, "grad_norm": 0.012426286814278077, "kl": 0.00176239013671875, "learning_rate": 6.358640479194451e-07, "loss": 0.0001, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward_staging": 0.0, "rewards/format_reward": 0.0, "rewards/format_reward_staging": 0.0, "step": 99 }, { "epoch": 50.0, "grad_norm": 0.01574734631355814, "learning_rate": 6.281416799501187e-07, "loss": 0.0001, "step": 100 }, { "epoch": 50.0, "eval_clip_ratio": 0.0, "eval_completion_length": 877.0859375, "eval_kl": 0.00176239013671875, "eval_loss": 5.840706580784172e-05, "eval_reward": 0.0, "eval_reward_std": 0.0, "eval_rewards/accuracy_reward_staging": 0.0, "eval_rewards/format_reward": 0.0, "eval_rewards/format_reward_staging": 0.0, "eval_runtime": 104.8754, "eval_samples_per_second": 0.076, "eval_steps_per_second": 0.01, "step": 100 }, { "clip_ratio": 0.0, "completion_length": 893.6875, "epoch": 50.5, "grad_norm": 0.01032773418561725, "kl": 0.0016312599182128906, "learning_rate": 6.203955092681039e-07, "loss": 0.0001, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward_staging": 0.0, "rewards/format_reward": 0.0, "rewards/format_reward_staging": 0.0, "step": 101 }, { "clip_ratio": 0.0, "completion_length": 849.125, "epoch": 51.0, "grad_norm": 0.015022844032421302, "kl": 0.0020303726196289062, "learning_rate": 6.126278954320294e-07, "loss": 0.0001, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward_staging": 0.0, "rewards/format_reward": 0.0, "rewards/format_reward_staging": 0.0, "step": 102 }, { "clip_ratio": 0.0, "completion_length": 989.890625, "epoch": 51.5, "grad_norm": 0.01614919998047259, "kl": 0.00213623046875, "learning_rate": 6.048412045323164e-07, "loss": 0.0001, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward_staging": 0.0, "rewards/format_reward": 0.0, "rewards/format_reward_staging": 0.0, "step": 103 }, { "clip_ratio": 0.0, "completion_length": 942.59375, "epoch": 52.0, "grad_norm": 0.010917168106263793, "kl": 0.0014476776123046875, "learning_rate": 5.97037808470444e-07, "loss": 0.0001, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward_staging": 0.0, "rewards/format_reward": 0.0, "rewards/format_reward_staging": 0.0, "step": 104 }, { "clip_ratio": 0.0, "completion_length": 944.640625, "epoch": 52.5, "grad_norm": 0.26063398005499927, "kl": 0.002063751220703125, "learning_rate": 5.892200842364462e-07, "loss": 0.0398, "reward": 0.015625, "reward_std": 0.0625, "rewards/accuracy_reward_staging": 0.0, "rewards/format_reward": 0.0, "rewards/format_reward_staging": 0.015625, "step": 105 }, { "clip_ratio": 0.0, "completion_length": 859.4375, "epoch": 53.0, "grad_norm": 0.3786401255845219, "kl": 0.0019893646240234375, "learning_rate": 5.813904131848564e-07, "loss": 0.0037, "reward": 0.015625, "reward_std": 0.0625, "rewards/accuracy_reward_staging": 0.0, "rewards/format_reward": 0.0, "rewards/format_reward_staging": 0.015625, "step": 106 }, { "clip_ratio": 0.0, "completion_length": 982.65625, "epoch": 53.5, "grad_norm": 0.017106845969968518, "kl": 0.0019397735595703125, "learning_rate": 5.735511803093248e-07, "loss": 0.0001, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward_staging": 0.0, "rewards/format_reward": 0.0, "rewards/format_reward_staging": 0.0, "step": 107 }, { "clip_ratio": 0.0, "completion_length": 764.34375, "epoch": 54.0, "grad_norm": 0.01473506964168249, "kl": 0.0020160675048828125, "learning_rate": 5.657047735161255e-07, "loss": 0.0001, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward_staging": 0.0, "rewards/format_reward": 0.0, "rewards/format_reward_staging": 0.0, "step": 108 }, { "clip_ratio": 0.0, "completion_length": 908.171875, "epoch": 54.5, "grad_norm": 0.018754498587991306, "kl": 0.0021953582763671875, "learning_rate": 5.578535828967777e-07, "loss": 0.0001, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward_staging": 0.0, "rewards/format_reward": 0.0, "rewards/format_reward_staging": 0.0, "step": 109 }, { "epoch": 55.0, "grad_norm": 0.010670548584562288, "learning_rate": 5.5e-07, "loss": 0.0001, "step": 110 }, { "epoch": 55.0, "eval_clip_ratio": 0.0, "eval_completion_length": 948.3125, "eval_kl": 0.0020236968994140625, "eval_loss": 6.496578134829178e-05, "eval_reward": 0.0, "eval_reward_std": 0.0, "eval_rewards/accuracy_reward_staging": 0.0, "eval_rewards/format_reward": 0.0, "eval_rewards/format_reward_staging": 0.0, "eval_runtime": 100.0071, "eval_samples_per_second": 0.08, "eval_steps_per_second": 0.01, "step": 110 }, { "clip_ratio": 0.0, "completion_length": 990.34375, "epoch": 55.5, "grad_norm": 0.015563623719699342, "kl": 0.0019483566284179688, "learning_rate": 5.421464171032224e-07, "loss": 0.0001, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward_staging": 0.0, "rewards/format_reward": 0.0, "rewards/format_reward_staging": 0.0, "step": 111 }, { "clip_ratio": 0.0, "completion_length": 855.28125, "epoch": 56.0, "grad_norm": 0.008970140876741549, "kl": 0.001338958740234375, "learning_rate": 5.342952264838747e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward_staging": 0.0, "rewards/format_reward": 0.0, "rewards/format_reward_staging": 0.0, "step": 112 }, { "clip_ratio": 0.0, "completion_length": 982.640625, "epoch": 56.5, "grad_norm": 0.0108924950539754, "kl": 0.0016727447509765625, "learning_rate": 5.264488196906752e-07, "loss": 0.0001, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward_staging": 0.0, "rewards/format_reward": 0.0, "rewards/format_reward_staging": 0.0, "step": 113 }, { "clip_ratio": 0.0, "completion_length": 903.78125, "epoch": 57.0, "grad_norm": 0.2686051732486419, "kl": 0.0024242401123046875, "learning_rate": 5.186095868151436e-07, "loss": 0.0391, "reward": 0.015625, "reward_std": 0.0625, "rewards/accuracy_reward_staging": 0.0, "rewards/format_reward": 0.0, "rewards/format_reward_staging": 0.015625, "step": 114 }, { "clip_ratio": 0.0, "completion_length": 853.375, "epoch": 57.5, "grad_norm": 0.2831838787851241, "kl": 0.0024967193603515625, "learning_rate": 5.107799157635538e-07, "loss": 0.0351, "reward": 0.015625, "reward_std": 0.0625, "rewards/accuracy_reward_staging": 0.0, "rewards/format_reward": 0.0, "rewards/format_reward_staging": 0.015625, "step": 115 }, { "clip_ratio": 0.0, "completion_length": 917.34375, "epoch": 58.0, "grad_norm": 0.013530004501295191, "kl": 0.001804351806640625, "learning_rate": 5.02962191529556e-07, "loss": 0.0001, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward_staging": 0.0, "rewards/format_reward": 0.0, "rewards/format_reward_staging": 0.0, "step": 116 }, { "clip_ratio": 0.0, "completion_length": 859.03125, "epoch": 58.5, "grad_norm": 0.44797396450129406, "kl": 0.0030193328857421875, "learning_rate": 4.951587954676837e-07, "loss": -0.0663, "reward": 0.03125, "reward_std": 0.08539125323295593, "rewards/accuracy_reward_staging": 0.0, "rewards/format_reward": 0.0, "rewards/format_reward_staging": 0.03125, "step": 117 }, { "clip_ratio": 0.0, "completion_length": 896.625, "epoch": 59.0, "grad_norm": 0.00998304849357523, "kl": 0.0016765594482421875, "learning_rate": 4.873721045679706e-07, "loss": 0.0001, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward_staging": 0.0, "rewards/format_reward": 0.0, "rewards/format_reward_staging": 0.0, "step": 118 }, { "clip_ratio": 0.0, "completion_length": 954.359375, "epoch": 59.5, "grad_norm": 0.22962099915558717, "kl": 0.0020122528076171875, "learning_rate": 4.79604490731896e-07, "loss": 0.0424, "reward": 0.015625, "reward_std": 0.0625, "rewards/accuracy_reward_staging": 0.0, "rewards/format_reward": 0.0, "rewards/format_reward_staging": 0.015625, "step": 119 }, { "epoch": 60.0, "grad_norm": 0.02248418123929179, "learning_rate": 4.7185832004988133e-07, "loss": 0.0001, "step": 120 }, { "epoch": 60.0, "eval_clip_ratio": 0.0, "eval_completion_length": 960.3671875, "eval_kl": 0.0021600723266601562, "eval_loss": 7.169261516537517e-05, "eval_reward": 0.0, "eval_reward_std": 0.0, "eval_rewards/accuracy_reward_staging": 0.0, "eval_rewards/format_reward": 0.0, "eval_rewards/format_reward_staging": 0.0, "eval_runtime": 107.7429, "eval_samples_per_second": 0.074, "eval_steps_per_second": 0.009, "step": 120 }, { "clip_ratio": 0.0, "completion_length": 837.0390625, "epoch": 60.5, "grad_norm": 0.017761869908686022, "kl": 0.0027360916137695312, "learning_rate": 4.641359520805548e-07, "loss": 0.0001, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward_staging": 0.0, "rewards/format_reward": 0.0, "rewards/format_reward_staging": 0.0, "step": 121 }, { "clip_ratio": 0.0, "completion_length": 822.75, "epoch": 61.0, "grad_norm": 0.016374329005476508, "kl": 0.00225830078125, "learning_rate": 4.5643973913200837e-07, "loss": 0.0001, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward_staging": 0.0, "rewards/format_reward": 0.0, "rewards/format_reward_staging": 0.0, "step": 122 }, { "clip_ratio": 0.0, "completion_length": 873.46875, "epoch": 61.5, "grad_norm": 0.22223040398437557, "kl": 0.0031833648681640625, "learning_rate": 4.4877202554526084e-07, "loss": 0.0424, "reward": 0.015625, "reward_std": 0.0625, "rewards/accuracy_reward_staging": 0.0, "rewards/format_reward": 0.0, "rewards/format_reward_staging": 0.015625, "step": 123 }, { "clip_ratio": 0.0, "completion_length": 1016.71875, "epoch": 62.0, "grad_norm": 0.01056012461452998, "kl": 0.001789093017578125, "learning_rate": 4.4113514698014953e-07, "loss": 0.0001, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward_staging": 0.0, "rewards/format_reward": 0.0, "rewards/format_reward_staging": 0.0, "step": 124 }, { "clip_ratio": 0.0, "completion_length": 867.015625, "epoch": 62.5, "grad_norm": 0.012502277305124584, "kl": 0.0022029876708984375, "learning_rate": 4.3353142970386557e-07, "loss": 0.0001, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward_staging": 0.0, "rewards/format_reward": 0.0, "rewards/format_reward_staging": 0.0, "step": 125 }, { "clip_ratio": 0.0, "completion_length": 1008.1875, "epoch": 63.0, "grad_norm": 0.017467667097429012, "kl": 0.002529144287109375, "learning_rate": 4.2596318988235037e-07, "loss": 0.0001, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward_staging": 0.0, "rewards/format_reward": 0.0, "rewards/format_reward_staging": 0.0, "step": 126 }, { "clip_ratio": 0.0, "completion_length": 876.40625, "epoch": 63.5, "grad_norm": 0.27826851686759685, "kl": 0.003124237060546875, "learning_rate": 4.1843273287476854e-07, "loss": 0.0208, "reward": 0.015625, "reward_std": 0.0625, "rewards/accuracy_reward_staging": 0.0, "rewards/format_reward": 0.0, "rewards/format_reward_staging": 0.015625, "step": 127 }, { "clip_ratio": 0.0, "completion_length": 931.4375, "epoch": 64.0, "grad_norm": 0.010111725537803692, "kl": 0.0016422271728515625, "learning_rate": 4.1094235253127374e-07, "loss": 0.0001, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward_staging": 0.0, "rewards/format_reward": 0.0, "rewards/format_reward_staging": 0.0, "step": 128 }, { "clip_ratio": 0.0, "completion_length": 940.890625, "epoch": 64.5, "grad_norm": 0.19504698473227727, "kl": 0.003879547119140625, "learning_rate": 4.034943304942796e-07, "loss": 0.0002, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward_staging": 0.0, "rewards/format_reward": 0.0, "rewards/format_reward_staging": 0.0, "step": 129 }, { "epoch": 65.0, "grad_norm": 0.3594474052537692, "learning_rate": 3.9609093550344907e-07, "loss": -0.031, "step": 130 }, { "epoch": 65.0, "eval_clip_ratio": 0.0, "eval_completion_length": 872.6484375, "eval_kl": 0.0025424957275390625, "eval_loss": -0.011794866062700748, "eval_reward": 0.015625, "eval_reward_std": 0.042695626616477966, "eval_rewards/accuracy_reward_staging": 0.0, "eval_rewards/format_reward": 0.0, "eval_rewards/format_reward_staging": 0.015625, "eval_runtime": 95.6188, "eval_samples_per_second": 0.084, "eval_steps_per_second": 0.01, "step": 130 }, { "clip_ratio": 0.0, "completion_length": 827.2578125, "epoch": 65.5, "grad_norm": 0.031146258604450788, "kl": 0.002330780029296875, "learning_rate": 3.8873442270461485e-07, "loss": 0.0001, "reward": 0.0078125, "reward_std": 0.03125, "rewards/accuracy_reward_staging": 0.0, "rewards/format_reward": 0.0, "rewards/format_reward_staging": 0.0078125, "step": 131 }, { "clip_ratio": 0.0, "completion_length": 1005.28125, "epoch": 66.0, "grad_norm": 0.23823090391570464, "kl": 0.00301361083984375, "learning_rate": 3.8142703296283953e-07, "loss": 0.0414, "reward": 0.03125, "reward_std": 0.08539125323295593, "rewards/accuracy_reward_staging": 0.0, "rewards/format_reward": 0.0, "rewards/format_reward_staging": 0.03125, "step": 132 }, { "clip_ratio": 0.0, "completion_length": 965.671875, "epoch": 66.5, "grad_norm": 0.016282343372095484, "kl": 0.0027065277099609375, "learning_rate": 3.7417099217982686e-07, "loss": 0.0001, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward_staging": 0.0, "rewards/format_reward": 0.0, "rewards/format_reward_staging": 0.0, "step": 133 }, { "clip_ratio": 0.0, "completion_length": 950.9375, "epoch": 67.0, "grad_norm": 0.37418953088110135, "kl": 0.00205230712890625, "learning_rate": 3.6696851061588994e-07, "loss": 0.0608, "reward": 0.03125, "reward_std": 0.125, "rewards/accuracy_reward_staging": 0.0, "rewards/format_reward": 0.0, "rewards/format_reward_staging": 0.03125, "step": 134 }, { "clip_ratio": 0.0, "completion_length": 911.234375, "epoch": 67.5, "grad_norm": 0.012255261965707507, "kl": 0.0020465850830078125, "learning_rate": 3.5982178221668533e-07, "loss": 0.0001, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward_staging": 0.0, "rewards/format_reward": 0.0, "rewards/format_reward_staging": 0.0, "step": 135 }, { "clip_ratio": 0.0, "completion_length": 979.71875, "epoch": 68.0, "grad_norm": 0.26206279671926186, "kl": 0.002727508544921875, "learning_rate": 3.5273298394491515e-07, "loss": 0.0389, "reward": 0.015625, "reward_std": 0.0625, "rewards/accuracy_reward_staging": 0.0, "rewards/format_reward": 0.0, "rewards/format_reward_staging": 0.015625, "step": 136 }, { "clip_ratio": 0.0, "completion_length": 933.796875, "epoch": 68.5, "grad_norm": 0.01763374341802393, "kl": 0.002559661865234375, "learning_rate": 3.45704275117204e-07, "loss": 0.0001, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward_staging": 0.0, "rewards/format_reward": 0.0, "rewards/format_reward_staging": 0.0, "step": 137 }, { "clip_ratio": 0.0, "completion_length": 905.125, "epoch": 69.0, "grad_norm": 0.3004009853199538, "kl": 0.002872467041015625, "learning_rate": 3.387377967463493e-07, "loss": 0.0281, "reward": 0.015625, "reward_std": 0.0625, "rewards/accuracy_reward_staging": 0.0, "rewards/format_reward": 0.0, "rewards/format_reward_staging": 0.015625, "step": 138 }, { "clip_ratio": 0.0, "completion_length": 951.5625, "epoch": 69.5, "grad_norm": 0.0130597651929469, "kl": 0.0021648406982421875, "learning_rate": 3.3183567088914833e-07, "loss": 0.0001, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward_staging": 0.0, "rewards/format_reward": 0.0, "rewards/format_reward_staging": 0.0, "step": 139 }, { "epoch": 70.0, "grad_norm": 0.01647973523518642, "learning_rate": 3.250000000000001e-07, "loss": 0.0001, "step": 140 }, { "epoch": 70.0, "eval_clip_ratio": 0.0, "eval_completion_length": 888.59375, "eval_kl": 0.0026531219482421875, "eval_loss": 0.02596096135675907, "eval_reward": 0.0078125, "eval_reward_std": 0.03125, "eval_rewards/accuracy_reward_staging": 0.0, "eval_rewards/format_reward": 0.0, "eval_rewards/format_reward_staging": 0.0078125, "eval_runtime": 98.3732, "eval_samples_per_second": 0.081, "eval_steps_per_second": 0.01, "step": 140 }, { "clip_ratio": 0.0, "completion_length": 873.234375, "epoch": 70.5, "grad_norm": 0.2834208593274132, "kl": 0.0033597946166992188, "learning_rate": 3.182328662904756e-07, "loss": 0.0572, "reward": 0.015625, "reward_std": 0.042695626616477966, "rewards/accuracy_reward_staging": 0.0, "rewards/format_reward": 0.0, "rewards/format_reward_staging": 0.015625, "step": 141 }, { "clip_ratio": 0.0, "completion_length": 1037.5625, "epoch": 71.0, "grad_norm": 0.010414710134713713, "kl": 0.0019664764404296875, "learning_rate": 3.115363310950578e-07, "loss": 0.0001, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward_staging": 0.0, "rewards/format_reward": 0.0, "rewards/format_reward_staging": 0.0, "step": 142 }, { "clip_ratio": 0.0, "completion_length": 931.96875, "epoch": 71.5, "grad_norm": 0.27576871169806716, "kl": 0.0029125213623046875, "learning_rate": 3.0491243424323783e-07, "loss": 0.0371, "reward": 0.015625, "reward_std": 0.0625, "rewards/accuracy_reward_staging": 0.0, "rewards/format_reward": 0.0, "rewards/format_reward_staging": 0.015625, "step": 143 }, { "clip_ratio": 0.0, "completion_length": 892.28125, "epoch": 72.0, "grad_norm": 0.3229311945497025, "kl": 0.002956390380859375, "learning_rate": 2.9836319343816397e-07, "loss": -0.0668, "reward": 0.015625, "reward_std": 0.0625, "rewards/accuracy_reward_staging": 0.0, "rewards/format_reward": 0.0, "rewards/format_reward_staging": 0.015625, "step": 144 }, { "clip_ratio": 0.0, "completion_length": 794.59375, "epoch": 72.5, "grad_norm": 0.24794430012534288, "kl": 0.00383758544921875, "learning_rate": 2.918906036420294e-07, "loss": 0.0331, "reward": 0.015625, "reward_std": 0.0625, "rewards/accuracy_reward_staging": 0.0, "rewards/format_reward": 0.0, "rewards/format_reward_staging": 0.015625, "step": 145 }, { "clip_ratio": 0.0, "completion_length": 893.875, "epoch": 73.0, "grad_norm": 0.018126291722898823, "kl": 0.00266265869140625, "learning_rate": 2.854966364683872e-07, "loss": 0.0001, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward_staging": 0.0, "rewards/format_reward": 0.0, "rewards/format_reward_staging": 0.0, "step": 146 }, { "clip_ratio": 0.0, "completion_length": 954.375, "epoch": 73.5, "grad_norm": 0.2235013065843689, "kl": 0.002880096435546875, "learning_rate": 2.791832395815782e-07, "loss": 0.0445, "reward": 0.015625, "reward_std": 0.0625, "rewards/accuracy_reward_staging": 0.0, "rewards/format_reward": 0.0, "rewards/format_reward_staging": 0.015625, "step": 147 }, { "clip_ratio": 0.0, "completion_length": 804.0, "epoch": 74.0, "grad_norm": 0.3025438980959449, "kl": 0.003345489501953125, "learning_rate": 2.729523361034538e-07, "loss": 0.0521, "reward": 0.03125, "reward_std": 0.08539125323295593, "rewards/accuracy_reward_staging": 0.0, "rewards/format_reward": 0.0, "rewards/format_reward_staging": 0.03125, "step": 148 }, { "clip_ratio": 0.0, "completion_length": 971.578125, "epoch": 74.5, "grad_norm": 0.024475677293083028, "kl": 0.0037384033203125, "learning_rate": 2.6680582402757324e-07, "loss": 0.0001, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward_staging": 0.0, "rewards/format_reward": 0.0, "rewards/format_reward_staging": 0.0, "step": 149 }, { "epoch": 75.0, "grad_norm": 0.012017701370876262, "learning_rate": 2.6074557564105724e-07, "loss": 0.0001, "step": 150 }, { "epoch": 75.0, "eval_clip_ratio": 0.0, "eval_completion_length": 862.15625, "eval_kl": 0.0033111572265625, "eval_loss": 0.02151690237224102, "eval_reward": 0.0078125, "eval_reward_std": 0.03125, "eval_rewards/accuracy_reward_staging": 0.0, "eval_rewards/format_reward": 0.0, "eval_rewards/format_reward_staging": 0.0078125, "eval_runtime": 95.2607, "eval_samples_per_second": 0.084, "eval_steps_per_second": 0.01, "step": 150 }, { "clip_ratio": 0.0, "completion_length": 1021.8515625, "epoch": 75.5, "grad_norm": 0.344266822655093, "kl": 0.002719879150390625, "learning_rate": 2.547734369542718e-07, "loss": 0.0005, "reward": 0.015625, "reward_std": 0.042695626616477966, "rewards/accuracy_reward_staging": 0.0, "rewards/format_reward": 0.0, "rewards/format_reward_staging": 0.015625, "step": 151 }, { "clip_ratio": 0.0, "completion_length": 840.5, "epoch": 76.0, "grad_norm": 0.23923411010062942, "kl": 0.00366973876953125, "learning_rate": 2.488912271385139e-07, "loss": 0.042, "reward": 0.015625, "reward_std": 0.0625, "rewards/accuracy_reward_staging": 0.0, "rewards/format_reward": 0.0, "rewards/format_reward_staging": 0.015625, "step": 152 }, { "clip_ratio": 0.0, "completion_length": 944.609375, "epoch": 76.5, "grad_norm": 0.016984013306757723, "kl": 0.00287628173828125, "learning_rate": 2.4310073797187573e-07, "loss": 0.0001, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward_staging": 0.0, "rewards/format_reward": 0.0, "rewards/format_reward_staging": 0.0, "step": 153 }, { "clip_ratio": 0.0, "completion_length": 980.09375, "epoch": 77.0, "grad_norm": 0.013783286152037033, "kl": 0.002635955810546875, "learning_rate": 2.374037332934512e-07, "loss": 0.0001, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward_staging": 0.0, "rewards/format_reward": 0.0, "rewards/format_reward_staging": 0.0, "step": 154 }, { "clip_ratio": 0.0, "completion_length": 877.71875, "epoch": 77.5, "grad_norm": 0.17935562272538372, "kl": 0.003742218017578125, "learning_rate": 2.3180194846605364e-07, "loss": 0.0379, "reward": 0.015625, "reward_std": 0.0625, "rewards/accuracy_reward_staging": 0.0, "rewards/format_reward": 0.0, "rewards/format_reward_staging": 0.015625, "step": 155 }, { "clip_ratio": 0.0, "completion_length": 982.125, "epoch": 78.0, "grad_norm": 0.22095835889243898, "kl": 0.00313568115234375, "learning_rate": 2.2629708984760706e-07, "loss": 0.0429, "reward": 0.015625, "reward_std": 0.0625, "rewards/accuracy_reward_staging": 0.0, "rewards/format_reward": 0.0, "rewards/format_reward_staging": 0.015625, "step": 156 }, { "clip_ratio": 0.0, "completion_length": 853.6875, "epoch": 78.5, "grad_norm": 0.014922010725132247, "kl": 0.00323486328125, "learning_rate": 2.2089083427137329e-07, "loss": 0.0001, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward_staging": 0.0, "rewards/format_reward": 0.0, "rewards/format_reward_staging": 0.0, "step": 157 }, { "clip_ratio": 0.0, "completion_length": 725.46875, "epoch": 79.0, "grad_norm": 0.02027511748538313, "kl": 0.003879547119140625, "learning_rate": 2.1558482853517253e-07, "loss": 0.0001, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward_staging": 0.0, "rewards/format_reward": 0.0, "rewards/format_reward_staging": 0.0, "step": 158 }, { "clip_ratio": 0.0, "completion_length": 792.5, "epoch": 79.5, "grad_norm": 0.27874692769469644, "kl": 0.00482940673828125, "learning_rate": 2.1038068889975259e-07, "loss": 0.0339, "reward": 0.015625, "reward_std": 0.0625, "rewards/accuracy_reward_staging": 0.0, "rewards/format_reward": 0.0, "rewards/format_reward_staging": 0.015625, "step": 159 }, { "epoch": 80.0, "grad_norm": 0.01202834608521387, "learning_rate": 2.0528000059645995e-07, "loss": 0.0001, "step": 160 }, { "epoch": 80.0, "eval_clip_ratio": 0.0, "eval_completion_length": 898.7109375, "eval_kl": 0.0037212371826171875, "eval_loss": -0.011074024252593517, "eval_reward": 0.015625, "eval_reward_std": 0.042695626616477966, "eval_rewards/accuracy_reward_staging": 0.0, "eval_rewards/format_reward": 0.0, "eval_rewards/format_reward_staging": 0.015625, "eval_runtime": 106.259, "eval_samples_per_second": 0.075, "eval_steps_per_second": 0.009, "step": 160 }, { "clip_ratio": 0.0, "completion_length": 979.0859375, "epoch": 80.5, "grad_norm": 0.018630946038723055, "kl": 0.002685546875, "learning_rate": 2.0028431734436308e-07, "loss": 0.0001, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward_staging": 0.0, "rewards/format_reward": 0.0, "rewards/format_reward_staging": 0.0, "step": 161 }, { "clip_ratio": 0.0, "completion_length": 854.25, "epoch": 81.0, "grad_norm": 0.020703125475339885, "kl": 0.003864288330078125, "learning_rate": 1.9539516087697517e-07, "loss": 0.0001, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward_staging": 0.0, "rewards/format_reward": 0.0, "rewards/format_reward_staging": 0.0, "step": 162 }, { "clip_ratio": 0.0, "completion_length": 842.71875, "epoch": 81.5, "grad_norm": 0.028387308129338176, "kl": 0.004302978515625, "learning_rate": 1.9061402047871833e-07, "loss": 0.0002, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward_staging": 0.0, "rewards/format_reward": 0.0, "rewards/format_reward_staging": 0.0, "step": 163 }, { "clip_ratio": 0.0, "completion_length": 834.75, "epoch": 82.0, "grad_norm": 0.01641578674675751, "kl": 0.0029449462890625, "learning_rate": 1.8594235253127372e-07, "loss": 0.0001, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward_staging": 0.0, "rewards/format_reward": 0.0, "rewards/format_reward_staging": 0.0, "step": 164 }, { "clip_ratio": 0.0, "completion_length": 842.40625, "epoch": 82.5, "grad_norm": 0.021721834190437855, "kl": 0.0041656494140625, "learning_rate": 1.8138158006995363e-07, "loss": 0.0001, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward_staging": 0.0, "rewards/format_reward": 0.0, "rewards/format_reward_staging": 0.0, "step": 165 }, { "clip_ratio": 0.0, "completion_length": 837.8125, "epoch": 83.0, "grad_norm": 0.01842609716407046, "kl": 0.003582000732421875, "learning_rate": 1.7693309235023127e-07, "loss": 0.0001, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward_staging": 0.0, "rewards/format_reward": 0.0, "rewards/format_reward_staging": 0.0, "step": 166 }, { "clip_ratio": 0.0, "completion_length": 837.453125, "epoch": 83.5, "grad_norm": 0.27337633068496925, "kl": 0.005832672119140625, "learning_rate": 1.7259824442455923e-07, "loss": 0.0532, "reward": 0.03125, "reward_std": 0.08539125323295593, "rewards/accuracy_reward_staging": 0.0, "rewards/format_reward": 0.0, "rewards/format_reward_staging": 0.03125, "step": 167 }, { "clip_ratio": 0.0, "completion_length": 934.5625, "epoch": 84.0, "grad_norm": 0.012528179411346417, "kl": 0.0025787353515625, "learning_rate": 1.6837835672960831e-07, "loss": 0.0001, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward_staging": 0.0, "rewards/format_reward": 0.0, "rewards/format_reward_staging": 0.0, "step": 168 }, { "clip_ratio": 0.0, "completion_length": 906.21875, "epoch": 84.5, "grad_norm": 0.016944597423777116, "kl": 0.003490447998046875, "learning_rate": 1.6427471468404952e-07, "loss": 0.0001, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward_staging": 0.0, "rewards/format_reward": 0.0, "rewards/format_reward_staging": 0.0, "step": 169 }, { "epoch": 85.0, "grad_norm": 0.397851893169485, "learning_rate": 1.6028856829700258e-07, "loss": -0.057, "step": 170 }, { "epoch": 85.0, "eval_clip_ratio": 0.0, "eval_completion_length": 884.7109375, "eval_kl": 0.0037250518798828125, "eval_loss": 0.0001178277816507034, "eval_reward": 0.0, "eval_reward_std": 0.0, "eval_rewards/accuracy_reward_staging": 0.0, "eval_rewards/format_reward": 0.0, "eval_rewards/format_reward_staging": 0.0, "eval_runtime": 94.4429, "eval_samples_per_second": 0.085, "eval_steps_per_second": 0.011, "step": 170 }, { "clip_ratio": 0.0, "completion_length": 889.3359375, "epoch": 85.5, "grad_norm": 0.0165553280450213, "kl": 0.0038013458251953125, "learning_rate": 1.5642113178727193e-07, "loss": 0.0001, "reward": 0.015625, "reward_std": 0.042695626616477966, "rewards/accuracy_reward_staging": 0.0, "rewards/format_reward": 0.0, "rewards/format_reward_staging": 0.015625, "step": 171 }, { "clip_ratio": 0.0, "completion_length": 931.34375, "epoch": 86.0, "grad_norm": 0.22070193047632167, "kl": 0.00353240966796875, "learning_rate": 1.5267358321348285e-07, "loss": 0.0258, "reward": 0.015625, "reward_std": 0.0625, "rewards/accuracy_reward_staging": 0.0, "rewards/format_reward": 0.0, "rewards/format_reward_staging": 0.015625, "step": 172 }, { "clip_ratio": 0.0, "completion_length": 902.5625, "epoch": 86.5, "grad_norm": 0.014212788161515211, "kl": 0.002574920654296875, "learning_rate": 1.4904706411523448e-07, "loss": 0.0001, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward_staging": 0.0, "rewards/format_reward": 0.0, "rewards/format_reward_staging": 0.0, "step": 173 }, { "clip_ratio": 0.0, "completion_length": 734.3125, "epoch": 87.0, "grad_norm": 0.5334088299783731, "kl": 0.005619049072265625, "learning_rate": 1.4554267916537495e-07, "loss": 0.0865, "reward": 0.046875, "reward_std": 0.1875, "rewards/accuracy_reward_staging": 0.0, "rewards/format_reward": 0.0, "rewards/format_reward_staging": 0.046875, "step": 174 }, { "clip_ratio": 0.0, "completion_length": 846.8125, "epoch": 87.5, "grad_norm": 0.3350297851743498, "kl": 0.005451202392578125, "learning_rate": 1.4216149583350755e-07, "loss": 0.0788, "reward": 0.03125, "reward_std": 0.125, "rewards/accuracy_reward_staging": 0.0, "rewards/format_reward": 0.0, "rewards/format_reward_staging": 0.03125, "step": 175 }, { "clip_ratio": 0.0, "completion_length": 896.90625, "epoch": 88.0, "grad_norm": 0.012136250169886781, "kl": 0.002605438232421875, "learning_rate": 1.3890454406082956e-07, "loss": 0.0001, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward_staging": 0.0, "rewards/format_reward": 0.0, "rewards/format_reward_staging": 0.0, "step": 176 }, { "clip_ratio": 0.0, "completion_length": 924.78125, "epoch": 88.5, "grad_norm": 0.020936369813142675, "kl": 0.004558563232421875, "learning_rate": 1.3577281594640182e-07, "loss": 0.0001, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward_staging": 0.0, "rewards/format_reward": 0.0, "rewards/format_reward_staging": 0.0, "step": 177 }, { "clip_ratio": 0.0, "completion_length": 1008.28125, "epoch": 89.0, "grad_norm": 0.017396797408268067, "kl": 0.003875732421875, "learning_rate": 1.3276726544494571e-07, "loss": 0.0001, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward_staging": 0.0, "rewards/format_reward": 0.0, "rewards/format_reward_staging": 0.0, "step": 178 }, { "clip_ratio": 0.0, "completion_length": 972.84375, "epoch": 89.5, "grad_norm": 0.46133339595558576, "kl": 0.003589630126953125, "learning_rate": 1.2988880807625927e-07, "loss": -0.0749, "reward": 0.015625, "reward_std": 0.0625, "rewards/accuracy_reward_staging": 0.0, "rewards/format_reward": 0.0, "rewards/format_reward_staging": 0.015625, "step": 179 }, { "epoch": 90.0, "grad_norm": 0.2749649880366191, "learning_rate": 1.2713832064634125e-07, "loss": 0.0315, "step": 180 }, { "epoch": 90.0, "eval_clip_ratio": 0.0, "eval_completion_length": 928.6796875, "eval_kl": 0.0047626495361328125, "eval_loss": 0.06055343151092529, "eval_reward": 0.03125, "eval_reward_std": 0.10519562661647797, "eval_rewards/accuracy_reward_staging": 0.0, "eval_rewards/format_reward": 0.0, "eval_rewards/format_reward_staging": 0.03125, "eval_runtime": 101.2345, "eval_samples_per_second": 0.079, "eval_steps_per_second": 0.01, "step": 180 }, { "clip_ratio": 0.0, "completion_length": 887.515625, "epoch": 90.5, "grad_norm": 0.22634210714921138, "kl": 0.004863739013671875, "learning_rate": 1.2451664098030743e-07, "loss": 0.0395, "reward": 0.015625, "reward_std": 0.0625, "rewards/accuracy_reward_staging": 0.0, "rewards/format_reward": 0.0, "rewards/format_reward_staging": 0.015625, "step": 181 }, { "clip_ratio": 0.0, "completion_length": 975.1875, "epoch": 91.0, "grad_norm": 0.019843010782603646, "kl": 0.0040130615234375, "learning_rate": 1.220245676671809e-07, "loss": 0.0001, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward_staging": 0.0, "rewards/format_reward": 0.0, "rewards/format_reward_staging": 0.0, "step": 182 }, { "clip_ratio": 0.0, "completion_length": 936.265625, "epoch": 91.5, "grad_norm": 0.35206959012280886, "kl": 0.003910064697265625, "learning_rate": 1.1966285981663407e-07, "loss": 0.0049, "reward": 0.046875, "reward_std": 0.14789125323295593, "rewards/accuracy_reward_staging": 0.0, "rewards/format_reward": 0.0, "rewards/format_reward_staging": 0.046875, "step": 183 }, { "clip_ratio": 0.0, "completion_length": 825.53125, "epoch": 92.0, "grad_norm": 0.3276867393127649, "kl": 0.0047760009765625, "learning_rate": 1.1743223682775649e-07, "loss": 0.0324, "reward": 0.015625, "reward_std": 0.0625, "rewards/accuracy_reward_staging": 0.0, "rewards/format_reward": 0.0, "rewards/format_reward_staging": 0.015625, "step": 184 }, { "clip_ratio": 0.0, "completion_length": 976.5, "epoch": 92.5, "grad_norm": 0.24825006369205177, "kl": 0.004093170166015625, "learning_rate": 1.1533337816991931e-07, "loss": 0.0156, "reward": 0.015625, "reward_std": 0.0625, "rewards/accuracy_reward_staging": 0.0, "rewards/format_reward": 0.0, "rewards/format_reward_staging": 0.015625, "step": 185 }, { "clip_ratio": 0.0, "completion_length": 803.0625, "epoch": 93.0, "grad_norm": 0.022316490692140336, "kl": 0.0040130615234375, "learning_rate": 1.1336692317580158e-07, "loss": 0.0001, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward_staging": 0.0, "rewards/format_reward": 0.0, "rewards/format_reward_staging": 0.0, "step": 186 }, { "clip_ratio": 0.0, "completion_length": 825.28125, "epoch": 93.5, "grad_norm": 0.4846328870102803, "kl": 0.00514984130859375, "learning_rate": 1.1153347084664419e-07, "loss": -0.0498, "reward": 0.03125, "reward_std": 0.125, "rewards/accuracy_reward_staging": 0.0, "rewards/format_reward": 0.0, "rewards/format_reward_staging": 0.03125, "step": 187 }, { "clip_ratio": 0.0, "completion_length": 1054.5, "epoch": 94.0, "grad_norm": 0.22552887394065263, "kl": 0.003173828125, "learning_rate": 1.0983357966978745e-07, "loss": 0.0259, "reward": 0.015625, "reward_std": 0.0625, "rewards/accuracy_reward_staging": 0.0, "rewards/format_reward": 0.0, "rewards/format_reward_staging": 0.015625, "step": 188 }, { "clip_ratio": 0.0, "completion_length": 896.578125, "epoch": 94.5, "grad_norm": 0.28343113199408254, "kl": 0.0047607421875, "learning_rate": 1.0826776744855121e-07, "loss": 0.0366, "reward": 0.015625, "reward_std": 0.0625, "rewards/accuracy_reward_staging": 0.0, "rewards/format_reward": 0.0, "rewards/format_reward_staging": 0.015625, "step": 189 }, { "epoch": 95.0, "grad_norm": 0.018331093652178724, "learning_rate": 1.068365111445064e-07, "loss": 0.0001, "step": 190 }, { "epoch": 95.0, "eval_clip_ratio": 0.0, "eval_completion_length": 926.5390625, "eval_kl": 0.004039764404296875, "eval_loss": 0.0001301074807997793, "eval_reward": 0.0, "eval_reward_std": 0.0, "eval_rewards/accuracy_reward_staging": 0.0, "eval_rewards/format_reward": 0.0, "eval_rewards/format_reward_staging": 0.0, "eval_runtime": 100.4368, "eval_samples_per_second": 0.08, "eval_steps_per_second": 0.01, "step": 190 }, { "clip_ratio": 0.0, "completion_length": 902.8046875, "epoch": 95.5, "grad_norm": 0.27554893857164553, "kl": 0.0040225982666015625, "learning_rate": 1.0554024673218806e-07, "loss": 0.052, "reward": 0.0078125, "reward_std": 0.03125, "rewards/accuracy_reward_staging": 0.0, "rewards/format_reward": 0.0, "rewards/format_reward_staging": 0.0078125, "step": 191 }, { "clip_ratio": 0.0, "completion_length": 753.625, "epoch": 96.0, "grad_norm": 0.4490555309724536, "kl": 0.004604339599609375, "learning_rate": 1.0437936906629334e-07, "loss": -0.0745, "reward": 0.015625, "reward_std": 0.0625, "rewards/accuracy_reward_staging": 0.0, "rewards/format_reward": 0.0, "rewards/format_reward_staging": 0.015625, "step": 192 }, { "clip_ratio": 0.0, "completion_length": 829.953125, "epoch": 96.5, "grad_norm": 0.02546191399674691, "kl": 0.00514984130859375, "learning_rate": 1.0335423176140511e-07, "loss": 0.0002, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward_staging": 0.0, "rewards/format_reward": 0.0, "rewards/format_reward_staging": 0.0, "step": 193 }, { "clip_ratio": 0.0, "completion_length": 852.78125, "epoch": 97.0, "grad_norm": 0.01515669426337304, "kl": 0.003253936767578125, "learning_rate": 1.0246514708427701e-07, "loss": 0.0001, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward_staging": 0.0, "rewards/format_reward": 0.0, "rewards/format_reward_staging": 0.0, "step": 194 }, { "clip_ratio": 0.0, "completion_length": 901.71875, "epoch": 97.5, "grad_norm": 0.01672130582287373, "kl": 0.003467559814453125, "learning_rate": 1.017123858587145e-07, "loss": 0.0001, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward_staging": 0.0, "rewards/format_reward": 0.0, "rewards/format_reward_staging": 0.0, "step": 195 }, { "clip_ratio": 0.0, "completion_length": 904.375, "epoch": 98.0, "grad_norm": 0.021287930078995584, "kl": 0.004909515380859375, "learning_rate": 1.0109617738307911e-07, "loss": 0.0002, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward_staging": 0.0, "rewards/format_reward": 0.0, "rewards/format_reward_staging": 0.0, "step": 196 }, { "clip_ratio": 0.0, "completion_length": 970.984375, "epoch": 98.5, "grad_norm": 0.024329146341496674, "kl": 0.00463104248046875, "learning_rate": 1.0061670936044178e-07, "loss": 0.0001, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward_staging": 0.0, "rewards/format_reward": 0.0, "rewards/format_reward_staging": 0.0, "step": 197 }, { "clip_ratio": 0.0, "completion_length": 837.75, "epoch": 99.0, "grad_norm": 0.25418431822719284, "kl": 0.004810333251953125, "learning_rate": 1.002741278414069e-07, "loss": 0.0424, "reward": 0.015625, "reward_std": 0.0625, "rewards/accuracy_reward_staging": 0.0, "rewards/format_reward": 0.0, "rewards/format_reward_staging": 0.015625, "step": 198 }, { "clip_ratio": 0.0, "completion_length": 894.921875, "epoch": 99.5, "grad_norm": 0.015548877308025595, "kl": 0.0036163330078125, "learning_rate": 1.0006853717962393e-07, "loss": 0.0001, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward_staging": 0.0, "rewards/format_reward": 0.0, "rewards/format_reward_staging": 0.0, "step": 199 }, { "epoch": 100.0, "grad_norm": 0.02199600718524651, "learning_rate": 1e-07, "loss": 0.0002, "step": 200 }, { "epoch": 100.0, "eval_clip_ratio": 0.0, "eval_completion_length": 909.0390625, "eval_kl": 0.004390716552734375, "eval_loss": 0.02110510692000389, "eval_reward": 0.0078125, "eval_reward_std": 0.03125, "eval_rewards/accuracy_reward_staging": 0.0, "eval_rewards/format_reward": 0.0, "eval_rewards/format_reward_staging": 0.0078125, "eval_runtime": 101.442, "eval_samples_per_second": 0.079, "eval_steps_per_second": 0.01, "step": 200 }, { "clip_ratio": 0.0, "completion_length": 893.78125, "epoch": 100.0, "kl": 0.004932403564453125, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward_staging": 0.0, "rewards/format_reward": 0.0, "rewards/format_reward_staging": 0.0, "step": 200, "total_flos": 0.0, "train_loss": 0.007099090418751075, "train_runtime": 14240.9534, "train_samples_per_second": 0.056, "train_steps_per_second": 0.014 } ], "logging_steps": 1, "max_steps": 200, "num_input_tokens_seen": 0, "num_train_epochs": 100, "save_steps": 40, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 16, "trial_name": null, "trial_params": null }