diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,14034 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.36986301369863, + "eval_steps": 500, + "global_step": 1000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "clip_ratio": 0.0, + "completion_length": 463.84375, + "epoch": 0.0013698630136986301, + "grad_norm": 3.912295341491699, + "kl": 0.0006895065307617188, + "learning_rate": 9.995433789954337e-07, + "loss": 0.0, + "reward": 0.734375, + "reward_std": 0.6482069045305252, + "rewards/accuracy_reward": 0.296875, + "rewards/format_reward": 0.4375, + "step": 1 + }, + { + "clip_ratio": 0.0, + "completion_length": 480.53125, + "epoch": 0.0027397260273972603, + "grad_norm": 4.425003528594971, + "kl": 0.0008821487426757812, + "learning_rate": 9.990867579908674e-07, + "loss": 0.0, + "reward": 0.734375, + "reward_std": 0.5391269624233246, + "rewards/accuracy_reward": 0.296875, + "rewards/format_reward": 0.4375, + "step": 2 + }, + { + "clip_ratio": 0.0, + "completion_length": 407.46875, + "epoch": 0.00410958904109589, + "grad_norm": 4.136044979095459, + "kl": 0.0013284683227539062, + "learning_rate": 9.986301369863014e-07, + "loss": 0.0, + "reward": 0.909375011920929, + "reward_std": 0.49507734179496765, + "rewards/accuracy_reward": 0.25312499701976776, + "rewards/format_reward": 0.65625, + "step": 3 + }, + { + "clip_ratio": 0.0, + "completion_length": 463.9375, + "epoch": 0.005479452054794521, + "grad_norm": 3.070124864578247, + "kl": 0.0018978118896484375, + "learning_rate": 9.98173515981735e-07, + "loss": 0.0, + "reward": 0.859375, + "reward_std": 0.5979855433106422, + "rewards/accuracy_reward": 0.265625, + "rewards/format_reward": 0.59375, + "step": 4 + }, + { + "clip_ratio": 0.0, + "completion_length": 348.75, + "epoch": 0.00684931506849315, + "grad_norm": 3.5212478637695312, + "kl": 0.00206756591796875, + "learning_rate": 9.977168949771688e-07, + "loss": 0.0, + "reward": 1.3125, + "reward_std": 0.6811521649360657, + "rewards/accuracy_reward": 0.53125, + "rewards/format_reward": 0.78125, + "step": 5 + }, + { + "clip_ratio": 0.0, + "completion_length": 363.0625, + "epoch": 0.00821917808219178, + "grad_norm": 3.7381842136383057, + "kl": 0.002559661865234375, + "learning_rate": 9.972602739726028e-07, + "loss": 0.0, + "reward": 1.109375, + "reward_std": 0.3039700835943222, + "rewards/accuracy_reward": 0.171875, + "rewards/format_reward": 0.9375, + "step": 6 + }, + { + "clip_ratio": 0.0, + "completion_length": 332.1875, + "epoch": 0.009589041095890411, + "grad_norm": 2.0565128326416016, + "kl": 0.003376007080078125, + "learning_rate": 9.968036529680365e-07, + "loss": 0.0, + "reward": 1.296875, + "reward_std": 0.1530819907784462, + "rewards/accuracy_reward": 0.296875, + "rewards/format_reward": 1.0, + "step": 7 + }, + { + "clip_ratio": 0.0, + "completion_length": 381.5625, + "epoch": 0.010958904109589041, + "grad_norm": 3.6933867931365967, + "kl": 0.004985809326171875, + "learning_rate": 9.963470319634703e-07, + "loss": 0.0, + "reward": 1.390625, + "reward_std": 0.5591665953397751, + "rewards/accuracy_reward": 0.515625, + "rewards/format_reward": 0.875, + "step": 8 + }, + { + "clip_ratio": 0.0, + "completion_length": 539.625, + "epoch": 0.012328767123287671, + "grad_norm": 15.956212043762207, + "kl": 0.006072998046875, + "learning_rate": 9.95890410958904e-07, + "loss": 0.0, + "reward": 1.1119791865348816, + "reward_std": 0.5166353359818459, + "rewards/accuracy_reward": 0.2369791567325592, + "rewards/format_reward": 0.875, + "step": 9 + }, + { + "clip_ratio": 0.0, + "completion_length": 557.15625, + "epoch": 0.0136986301369863, + "grad_norm": 2.227487802505493, + "kl": 0.0097808837890625, + "learning_rate": 9.954337899543377e-07, + "loss": 0.0, + "reward": 1.0885416567325592, + "reward_std": 0.4930662214756012, + "rewards/accuracy_reward": 0.2760416716337204, + "rewards/format_reward": 0.8125, + "step": 10 + }, + { + "clip_ratio": 0.0, + "completion_length": 365.625, + "epoch": 0.015068493150684932, + "grad_norm": 4.018799781799316, + "kl": 0.00763702392578125, + "learning_rate": 9.949771689497717e-07, + "loss": 0.0, + "reward": 1.234375, + "reward_std": 0.49322642385959625, + "rewards/accuracy_reward": 0.328125, + "rewards/format_reward": 0.90625, + "step": 11 + }, + { + "clip_ratio": 0.0, + "completion_length": 506.96875, + "epoch": 0.01643835616438356, + "grad_norm": 2.298476219177246, + "kl": 0.01721954345703125, + "learning_rate": 9.945205479452054e-07, + "loss": 0.0, + "reward": 1.15625, + "reward_std": 0.5379246100783348, + "rewards/accuracy_reward": 0.28125, + "rewards/format_reward": 0.875, + "step": 12 + }, + { + "clip_ratio": 0.0, + "completion_length": 581.78125, + "epoch": 0.01780821917808219, + "grad_norm": 5.038638591766357, + "kl": 0.015594482421875, + "learning_rate": 9.940639269406391e-07, + "loss": 0.0, + "reward": 1.1875, + "reward_std": 0.48483333736658096, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 0.875, + "step": 13 + }, + { + "clip_ratio": 0.0, + "completion_length": 341.40625, + "epoch": 0.019178082191780823, + "grad_norm": 3.0773346424102783, + "kl": 0.0150146484375, + "learning_rate": 9.93607305936073e-07, + "loss": 0.0, + "reward": 1.15625, + "reward_std": 0.3471629247069359, + "rewards/accuracy_reward": 0.1875, + "rewards/format_reward": 0.96875, + "step": 14 + }, + { + "clip_ratio": 0.0, + "completion_length": 564.25, + "epoch": 0.02054794520547945, + "grad_norm": 4.9919514656066895, + "kl": 0.02197265625, + "learning_rate": 9.931506849315068e-07, + "loss": 0.0, + "reward": 1.234375, + "reward_std": 0.40913281589746475, + "rewards/accuracy_reward": 0.265625, + "rewards/format_reward": 0.96875, + "step": 15 + }, + { + "clip_ratio": 0.0, + "completion_length": 573.25, + "epoch": 0.021917808219178082, + "grad_norm": 4.952914714813232, + "kl": 0.0227508544921875, + "learning_rate": 9.926940639269406e-07, + "loss": 0.0, + "reward": 1.3359375, + "reward_std": 0.3955412805080414, + "rewards/accuracy_reward": 0.4296875, + "rewards/format_reward": 0.90625, + "step": 16 + }, + { + "clip_ratio": 0.0, + "completion_length": 447.96875, + "epoch": 0.023287671232876714, + "grad_norm": 2.7455577850341797, + "kl": 0.0212554931640625, + "learning_rate": 9.922374429223745e-07, + "loss": 0.0, + "reward": 1.25, + "reward_std": 0.4765502139925957, + "rewards/accuracy_reward": 0.28125, + "rewards/format_reward": 0.96875, + "step": 17 + }, + { + "clip_ratio": 0.0, + "completion_length": 607.875, + "epoch": 0.024657534246575342, + "grad_norm": 5.91363525390625, + "kl": 0.021270751953125, + "learning_rate": 9.917808219178082e-07, + "loss": 0.0, + "reward": 1.2265625, + "reward_std": 0.5312308222055435, + "rewards/accuracy_reward": 0.3203125, + "rewards/format_reward": 0.90625, + "step": 18 + }, + { + "clip_ratio": 0.0, + "completion_length": 538.09375, + "epoch": 0.026027397260273973, + "grad_norm": 4.743810653686523, + "kl": 0.0315704345703125, + "learning_rate": 9.91324200913242e-07, + "loss": 0.0, + "reward": 1.1875, + "reward_std": 0.6161536350846291, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 0.875, + "step": 19 + }, + { + "clip_ratio": 0.0, + "completion_length": 450.5, + "epoch": 0.0273972602739726, + "grad_norm": 1.7055182456970215, + "kl": 0.027923583984375, + "learning_rate": 9.908675799086757e-07, + "loss": 0.0, + "reward": 1.234375, + "reward_std": 0.13939543068408966, + "rewards/accuracy_reward": 0.234375, + "rewards/format_reward": 1.0, + "step": 20 + }, + { + "clip_ratio": 0.0, + "completion_length": 600.6875, + "epoch": 0.028767123287671233, + "grad_norm": 2.169266700744629, + "kl": 0.04107666015625, + "learning_rate": 9.904109589041094e-07, + "loss": 0.0, + "reward": 1.3229166865348816, + "reward_std": 0.23428862541913986, + "rewards/accuracy_reward": 0.3541666567325592, + "rewards/format_reward": 0.96875, + "step": 21 + }, + { + "clip_ratio": 0.0, + "completion_length": 531.53125, + "epoch": 0.030136986301369864, + "grad_norm": 3.142946720123291, + "kl": 0.0318145751953125, + "learning_rate": 9.899543378995434e-07, + "loss": 0.0, + "reward": 1.2890625, + "reward_std": 0.46334072202444077, + "rewards/accuracy_reward": 0.3515625, + "rewards/format_reward": 0.9375, + "step": 22 + }, + { + "clip_ratio": 0.0, + "completion_length": 383.65625, + "epoch": 0.031506849315068496, + "grad_norm": 4.148040294647217, + "kl": 0.021270751953125, + "learning_rate": 9.894977168949771e-07, + "loss": 0.0, + "reward": 1.234375, + "reward_std": 0.3653144985437393, + "rewards/accuracy_reward": 0.234375, + "rewards/format_reward": 1.0, + "step": 23 + }, + { + "clip_ratio": 0.0, + "completion_length": 524.15625, + "epoch": 0.03287671232876712, + "grad_norm": 3.037674903869629, + "kl": 0.03924560546875, + "learning_rate": 9.89041095890411e-07, + "loss": 0.0, + "reward": 1.3385416865348816, + "reward_std": 0.3099621832370758, + "rewards/accuracy_reward": 0.3697916567325592, + "rewards/format_reward": 0.96875, + "step": 24 + }, + { + "clip_ratio": 0.0, + "completion_length": 554.1875, + "epoch": 0.03424657534246575, + "grad_norm": 4.072643280029297, + "kl": 0.036590576171875, + "learning_rate": 9.885844748858448e-07, + "loss": 0.0, + "reward": 1.453125, + "reward_std": 0.40609828382730484, + "rewards/accuracy_reward": 0.453125, + "rewards/format_reward": 1.0, + "step": 25 + }, + { + "clip_ratio": 0.0, + "completion_length": 414.90625, + "epoch": 0.03561643835616438, + "grad_norm": 6.202883243560791, + "kl": 0.03265380859375, + "learning_rate": 9.881278538812785e-07, + "loss": 0.0, + "reward": 1.171875, + "reward_std": 0.19583626091480255, + "rewards/accuracy_reward": 0.171875, + "rewards/format_reward": 1.0, + "step": 26 + }, + { + "clip_ratio": 0.0, + "completion_length": 531.75, + "epoch": 0.036986301369863014, + "grad_norm": 2.829223155975342, + "kl": 0.0391693115234375, + "learning_rate": 9.876712328767123e-07, + "loss": 0.0, + "reward": 1.46875, + "reward_std": 0.22084104642271996, + "rewards/accuracy_reward": 0.46875, + "rewards/format_reward": 1.0, + "step": 27 + }, + { + "clip_ratio": 0.0, + "completion_length": 398.96875, + "epoch": 0.038356164383561646, + "grad_norm": 3.0730113983154297, + "kl": 0.043853759765625, + "learning_rate": 9.87214611872146e-07, + "loss": 0.0, + "reward": 1.46875, + "reward_std": 0.36339621990919113, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.96875, + "step": 28 + }, + { + "clip_ratio": 0.0, + "completion_length": 627.34375, + "epoch": 0.03972602739726028, + "grad_norm": 3.6202750205993652, + "kl": 0.0521240234375, + "learning_rate": 9.867579908675797e-07, + "loss": 0.0001, + "reward": 1.43359375, + "reward_std": 0.4707936607301235, + "rewards/accuracy_reward": 0.52734375, + "rewards/format_reward": 0.90625, + "step": 29 + }, + { + "clip_ratio": 0.0, + "completion_length": 345.75, + "epoch": 0.0410958904109589, + "grad_norm": 2.320294141769409, + "kl": 0.040008544921875, + "learning_rate": 9.863013698630137e-07, + "loss": 0.0, + "reward": 1.0, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.03125, + "rewards/format_reward": 0.96875, + "step": 30 + }, + { + "clip_ratio": 0.0, + "completion_length": 598.125, + "epoch": 0.04246575342465753, + "grad_norm": 3.044395685195923, + "kl": 0.05633544921875, + "learning_rate": 9.858447488584474e-07, + "loss": 0.0001, + "reward": 1.46875, + "reward_std": 0.346555445343256, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.96875, + "step": 31 + }, + { + "clip_ratio": 0.0, + "completion_length": 316.65625, + "epoch": 0.043835616438356165, + "grad_norm": 2.4935569763183594, + "kl": 0.04290771484375, + "learning_rate": 9.853881278538814e-07, + "loss": 0.0, + "reward": 1.09375, + "reward_std": 0.2041158601641655, + "rewards/accuracy_reward": 0.09375, + "rewards/format_reward": 1.0, + "step": 32 + }, + { + "clip_ratio": 0.0, + "completion_length": 430.46875, + "epoch": 0.045205479452054796, + "grad_norm": 2.3980815410614014, + "kl": 0.06085205078125, + "learning_rate": 9.84931506849315e-07, + "loss": 0.0001, + "reward": 1.28125, + "reward_std": 0.30371319502592087, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 0.96875, + "step": 33 + }, + { + "clip_ratio": 0.0, + "completion_length": 556.6875, + "epoch": 0.04657534246575343, + "grad_norm": 1.6388932466506958, + "kl": 0.04888916015625, + "learning_rate": 9.844748858447488e-07, + "loss": 0.0, + "reward": 1.421875, + "reward_std": 0.1711306795477867, + "rewards/accuracy_reward": 0.421875, + "rewards/format_reward": 1.0, + "step": 34 + }, + { + "clip_ratio": 0.0, + "completion_length": 525.09375, + "epoch": 0.04794520547945205, + "grad_norm": 2.9538469314575195, + "kl": 0.054229736328125, + "learning_rate": 9.840182648401826e-07, + "loss": 0.0001, + "reward": 1.34375, + "reward_std": 0.1523548737168312, + "rewards/accuracy_reward": 0.34375, + "rewards/format_reward": 1.0, + "step": 35 + }, + { + "clip_ratio": 0.0, + "completion_length": 396.28125, + "epoch": 0.049315068493150684, + "grad_norm": 1.2823041677474976, + "kl": 0.032958984375, + "learning_rate": 9.835616438356163e-07, + "loss": 0.0, + "reward": 1.21875, + "reward_std": 0.0578637570142746, + "rewards/accuracy_reward": 0.21875, + "rewards/format_reward": 1.0, + "step": 36 + }, + { + "clip_ratio": 0.0, + "completion_length": 581.96875, + "epoch": 0.050684931506849315, + "grad_norm": 10.917831420898438, + "kl": 0.0721435546875, + "learning_rate": 9.831050228310502e-07, + "loss": 0.0001, + "reward": 1.50390625, + "reward_std": 0.08883348293602467, + "rewards/accuracy_reward": 0.50390625, + "rewards/format_reward": 1.0, + "step": 37 + }, + { + "clip_ratio": 0.0, + "completion_length": 489.15625, + "epoch": 0.052054794520547946, + "grad_norm": 3.121203660964966, + "kl": 0.04541015625, + "learning_rate": 9.82648401826484e-07, + "loss": 0.0, + "reward": 1.25, + "reward_std": 0.2925042062997818, + "rewards/accuracy_reward": 0.28125, + "rewards/format_reward": 0.96875, + "step": 38 + }, + { + "clip_ratio": 0.0, + "completion_length": 476.96875, + "epoch": 0.05342465753424658, + "grad_norm": 2.3028512001037598, + "kl": 0.0438232421875, + "learning_rate": 9.821917808219177e-07, + "loss": 0.0, + "reward": 1.4739583134651184, + "reward_std": 0.2700696364045143, + "rewards/accuracy_reward": 0.4739583432674408, + "rewards/format_reward": 1.0, + "step": 39 + }, + { + "clip_ratio": 0.0, + "completion_length": 626.96875, + "epoch": 0.0547945205479452, + "grad_norm": 1.8145931959152222, + "kl": 0.056488037109375, + "learning_rate": 9.817351598173517e-07, + "loss": 0.0001, + "reward": 1.2421875, + "reward_std": 0.3533598557114601, + "rewards/accuracy_reward": 0.3046875, + "rewards/format_reward": 0.9375, + "step": 40 + }, + { + "clip_ratio": 0.0, + "completion_length": 507.59375, + "epoch": 0.056164383561643834, + "grad_norm": 3.3131184577941895, + "kl": 0.044189453125, + "learning_rate": 9.812785388127854e-07, + "loss": 0.0, + "reward": 1.34375, + "reward_std": 0.4578060656785965, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 0.96875, + "step": 41 + }, + { + "clip_ratio": 0.0, + "completion_length": 639.9375, + "epoch": 0.057534246575342465, + "grad_norm": 8.756609916687012, + "kl": 0.059814453125, + "learning_rate": 9.808219178082191e-07, + "loss": 0.0001, + "reward": 1.5703125, + "reward_std": 0.4013843312859535, + "rewards/accuracy_reward": 0.6015625, + "rewards/format_reward": 0.96875, + "step": 42 + }, + { + "clip_ratio": 0.0, + "completion_length": 472.96875, + "epoch": 0.0589041095890411, + "grad_norm": 2.866149663925171, + "kl": 0.0538330078125, + "learning_rate": 9.803652968036529e-07, + "loss": 0.0001, + "reward": 1.3515625, + "reward_std": 0.34194046072661877, + "rewards/accuracy_reward": 0.3515625, + "rewards/format_reward": 1.0, + "step": 43 + }, + { + "clip_ratio": 0.0, + "completion_length": 667.09375, + "epoch": 0.06027397260273973, + "grad_norm": 6.409434795379639, + "kl": 0.058013916015625, + "learning_rate": 9.799086757990868e-07, + "loss": 0.0001, + "reward": 1.5218749642372131, + "reward_std": 0.34158414881676435, + "rewards/accuracy_reward": 0.5843749791383743, + "rewards/format_reward": 0.9375, + "step": 44 + }, + { + "clip_ratio": 0.0, + "completion_length": 716.84375, + "epoch": 0.06164383561643835, + "grad_norm": 1.960483431816101, + "kl": 0.0870361328125, + "learning_rate": 9.794520547945205e-07, + "loss": 0.0001, + "reward": 1.3515625, + "reward_std": 0.26203832402825356, + "rewards/accuracy_reward": 0.3828125, + "rewards/format_reward": 0.96875, + "step": 45 + }, + { + "clip_ratio": 0.0, + "completion_length": 633.34375, + "epoch": 0.06301369863013699, + "grad_norm": 1.6870055198669434, + "kl": 0.06658935546875, + "learning_rate": 9.789954337899543e-07, + "loss": 0.0001, + "reward": 1.2578125, + "reward_std": 0.32489965856075287, + "rewards/accuracy_reward": 0.3828125, + "rewards/format_reward": 0.875, + "step": 46 + }, + { + "clip_ratio": 0.0, + "completion_length": 515.34375, + "epoch": 0.06438356164383562, + "grad_norm": 2.6664040088653564, + "kl": 0.0615234375, + "learning_rate": 9.78538812785388e-07, + "loss": 0.0001, + "reward": 1.3828125, + "reward_std": 0.1984097883105278, + "rewards/accuracy_reward": 0.3828125, + "rewards/format_reward": 1.0, + "step": 47 + }, + { + "clip_ratio": 0.0, + "completion_length": 758.625, + "epoch": 0.06575342465753424, + "grad_norm": 6.174232482910156, + "kl": 0.07745361328125, + "learning_rate": 9.78082191780822e-07, + "loss": 0.0001, + "reward": 1.2946428656578064, + "reward_std": 0.6038303673267365, + "rewards/accuracy_reward": 0.4821428507566452, + "rewards/format_reward": 0.8125, + "step": 48 + }, + { + "clip_ratio": 0.0, + "completion_length": 478.3125, + "epoch": 0.06712328767123288, + "grad_norm": 4.74515438079834, + "kl": 0.06597900390625, + "learning_rate": 9.776255707762557e-07, + "loss": 0.0001, + "reward": 1.2734375, + "reward_std": 0.24417023360729218, + "rewards/accuracy_reward": 0.2734375, + "rewards/format_reward": 1.0, + "step": 49 + }, + { + "clip_ratio": 0.0, + "completion_length": 534.71875, + "epoch": 0.0684931506849315, + "grad_norm": 3.3720693588256836, + "kl": 0.056671142578125, + "learning_rate": 9.771689497716894e-07, + "loss": 0.0001, + "reward": 1.5, + "reward_std": 0.23194295540452003, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 50 + }, + { + "clip_ratio": 0.0, + "completion_length": 610.125, + "epoch": 0.06986301369863014, + "grad_norm": 1.075783133506775, + "kl": 0.07354736328125, + "learning_rate": 9.767123287671234e-07, + "loss": 0.0001, + "reward": 1.140625, + "reward_std": 0.1530819907784462, + "rewards/accuracy_reward": 0.171875, + "rewards/format_reward": 0.96875, + "step": 51 + }, + { + "clip_ratio": 0.0, + "completion_length": 541.5, + "epoch": 0.07123287671232877, + "grad_norm": 4.1653876304626465, + "kl": 0.0670166015625, + "learning_rate": 9.762557077625571e-07, + "loss": 0.0001, + "reward": 1.4375, + "reward_std": 0.35381053015589714, + "rewards/accuracy_reward": 0.46875, + "rewards/format_reward": 0.96875, + "step": 52 + }, + { + "clip_ratio": 0.0, + "completion_length": 659.1875, + "epoch": 0.07260273972602739, + "grad_norm": 2.497183322906494, + "kl": 0.07391357421875, + "learning_rate": 9.757990867579908e-07, + "loss": 0.0001, + "reward": 1.703125, + "reward_std": 0.2172447368502617, + "rewards/accuracy_reward": 0.703125, + "rewards/format_reward": 1.0, + "step": 53 + }, + { + "clip_ratio": 0.0, + "completion_length": 531.5, + "epoch": 0.07397260273972603, + "grad_norm": 1.2262613773345947, + "kl": 0.06640625, + "learning_rate": 9.753424657534246e-07, + "loss": 0.0001, + "reward": 1.15625, + "reward_std": 0.1356339044868946, + "rewards/accuracy_reward": 0.1875, + "rewards/format_reward": 0.96875, + "step": 54 + }, + { + "clip_ratio": 0.0, + "completion_length": 710.59375, + "epoch": 0.07534246575342465, + "grad_norm": 1.7770954370498657, + "kl": 0.0960693359375, + "learning_rate": 9.748858447488583e-07, + "loss": 0.0001, + "reward": 1.6171875, + "reward_std": 0.18259718269109726, + "rewards/accuracy_reward": 0.6171875, + "rewards/format_reward": 1.0, + "step": 55 + }, + { + "clip_ratio": 0.0, + "completion_length": 621.71875, + "epoch": 0.07671232876712329, + "grad_norm": 2.9477083683013916, + "kl": 0.08294677734375, + "learning_rate": 9.744292237442923e-07, + "loss": 0.0001, + "reward": 1.375, + "reward_std": 0.2698745857924223, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 1.0, + "step": 56 + }, + { + "clip_ratio": 0.0, + "completion_length": 633.40625, + "epoch": 0.07808219178082192, + "grad_norm": 2.187389850616455, + "kl": 0.07904052734375, + "learning_rate": 9.73972602739726e-07, + "loss": 0.0001, + "reward": 1.4620535373687744, + "reward_std": 0.3073258101940155, + "rewards/accuracy_reward": 0.493303582072258, + "rewards/format_reward": 0.96875, + "step": 57 + }, + { + "clip_ratio": 0.0, + "completion_length": 722.9375, + "epoch": 0.07945205479452055, + "grad_norm": 1.8998942375183105, + "kl": 0.1063232421875, + "learning_rate": 9.735159817351597e-07, + "loss": 0.0001, + "reward": 1.3177083432674408, + "reward_std": 0.2553338035941124, + "rewards/accuracy_reward": 0.3489583283662796, + "rewards/format_reward": 0.96875, + "step": 58 + }, + { + "clip_ratio": 0.0, + "completion_length": 606.84375, + "epoch": 0.08082191780821918, + "grad_norm": 2.1120872497558594, + "kl": 0.07696533203125, + "learning_rate": 9.730593607305937e-07, + "loss": 0.0001, + "reward": 1.265625, + "reward_std": 0.26977966725826263, + "rewards/accuracy_reward": 0.265625, + "rewards/format_reward": 1.0, + "step": 59 + }, + { + "clip_ratio": 0.0, + "completion_length": 413.28125, + "epoch": 0.0821917808219178, + "grad_norm": 3.2552480697631836, + "kl": 0.06304931640625, + "learning_rate": 9.726027397260274e-07, + "loss": 0.0001, + "reward": 1.125, + "reward_std": 0.2177756354212761, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 1.0, + "step": 60 + }, + { + "clip_ratio": 0.0, + "completion_length": 510.9375, + "epoch": 0.08356164383561644, + "grad_norm": 2.9126460552215576, + "kl": 0.0733642578125, + "learning_rate": 9.721461187214611e-07, + "loss": 0.0001, + "reward": 1.328125, + "reward_std": 0.28930897638201714, + "rewards/accuracy_reward": 0.328125, + "rewards/format_reward": 1.0, + "step": 61 + }, + { + "clip_ratio": 0.0, + "completion_length": 511.8125, + "epoch": 0.08493150684931507, + "grad_norm": 1.73414146900177, + "kl": 0.0865478515625, + "learning_rate": 9.716894977168949e-07, + "loss": 0.0001, + "reward": 1.203125, + "reward_std": 0.33669837564229965, + "rewards/accuracy_reward": 0.234375, + "rewards/format_reward": 0.96875, + "step": 62 + }, + { + "clip_ratio": 0.0, + "completion_length": 692.09375, + "epoch": 0.0863013698630137, + "grad_norm": 1.9965529441833496, + "kl": 0.1036376953125, + "learning_rate": 9.712328767123286e-07, + "loss": 0.0001, + "reward": 1.7749256193637848, + "reward_std": 0.1739531122148037, + "rewards/accuracy_reward": 0.7749256044626236, + "rewards/format_reward": 1.0, + "step": 63 + }, + { + "clip_ratio": 0.0, + "completion_length": 695.1875, + "epoch": 0.08767123287671233, + "grad_norm": 1.9106062650680542, + "kl": 0.094482421875, + "learning_rate": 9.707762557077626e-07, + "loss": 0.0001, + "reward": 1.580729216337204, + "reward_std": 0.07768097147345543, + "rewards/accuracy_reward": 0.5807291567325592, + "rewards/format_reward": 1.0, + "step": 64 + }, + { + "clip_ratio": 0.0, + "completion_length": 511.75, + "epoch": 0.08904109589041095, + "grad_norm": 2.800123929977417, + "kl": 0.0806884765625, + "learning_rate": 9.703196347031963e-07, + "loss": 0.0001, + "reward": 1.50390625, + "reward_std": 0.30879483185708523, + "rewards/accuracy_reward": 0.50390625, + "rewards/format_reward": 1.0, + "step": 65 + }, + { + "clip_ratio": 0.0, + "completion_length": 559.1875, + "epoch": 0.09041095890410959, + "grad_norm": 3.2412710189819336, + "kl": 0.0830078125, + "learning_rate": 9.6986301369863e-07, + "loss": 0.0001, + "reward": 1.515625, + "reward_std": 0.5082386285066605, + "rewards/accuracy_reward": 0.546875, + "rewards/format_reward": 0.96875, + "step": 66 + }, + { + "clip_ratio": 0.0, + "completion_length": 457.75, + "epoch": 0.09178082191780822, + "grad_norm": 4.994422912597656, + "kl": 0.0894775390625, + "learning_rate": 9.69406392694064e-07, + "loss": 0.0001, + "reward": 1.1796875, + "reward_std": 0.12073516845703125, + "rewards/accuracy_reward": 0.1796875, + "rewards/format_reward": 1.0, + "step": 67 + }, + { + "clip_ratio": 0.0, + "completion_length": 532.15625, + "epoch": 0.09315068493150686, + "grad_norm": 0.836651623249054, + "kl": 0.0897216796875, + "learning_rate": 9.689497716894977e-07, + "loss": 0.0001, + "reward": 1.1796875, + "reward_std": 0.04005437344312668, + "rewards/accuracy_reward": 0.1796875, + "rewards/format_reward": 1.0, + "step": 68 + }, + { + "clip_ratio": 0.0, + "completion_length": 488.65625, + "epoch": 0.09452054794520548, + "grad_norm": 3.8262500762939453, + "kl": 0.0775146484375, + "learning_rate": 9.684931506849314e-07, + "loss": 0.0001, + "reward": 1.3515625, + "reward_std": 0.15467960201203823, + "rewards/accuracy_reward": 0.3515625, + "rewards/format_reward": 1.0, + "step": 69 + }, + { + "clip_ratio": 0.0, + "completion_length": 525.3125, + "epoch": 0.0958904109589041, + "grad_norm": 15.291179656982422, + "kl": 0.069091796875, + "learning_rate": 9.680365296803652e-07, + "loss": 0.0001, + "reward": 1.2265625, + "reward_std": 0.14807433634996414, + "rewards/accuracy_reward": 0.2265625, + "rewards/format_reward": 1.0, + "step": 70 + }, + { + "clip_ratio": 0.0, + "completion_length": 480.1875, + "epoch": 0.09726027397260274, + "grad_norm": 2.252143383026123, + "kl": 0.07574462890625, + "learning_rate": 9.675799086757991e-07, + "loss": 0.0001, + "reward": 1.359375, + "reward_std": 0.2109457477927208, + "rewards/accuracy_reward": 0.359375, + "rewards/format_reward": 1.0, + "step": 71 + }, + { + "clip_ratio": 0.0, + "completion_length": 575.8125, + "epoch": 0.09863013698630137, + "grad_norm": 2.375014305114746, + "kl": 0.0933837890625, + "learning_rate": 9.671232876712329e-07, + "loss": 0.0001, + "reward": 1.359375, + "reward_std": 0.2414703369140625, + "rewards/accuracy_reward": 0.359375, + "rewards/format_reward": 1.0, + "step": 72 + }, + { + "clip_ratio": 0.0, + "completion_length": 424.03125, + "epoch": 0.1, + "grad_norm": 3.685784101486206, + "kl": 0.05877685546875, + "learning_rate": 9.666666666666666e-07, + "loss": 0.0001, + "reward": 1.421875, + "reward_std": 0.37769732251763344, + "rewards/accuracy_reward": 0.421875, + "rewards/format_reward": 1.0, + "step": 73 + }, + { + "clip_ratio": 0.0, + "completion_length": 496.0, + "epoch": 0.10136986301369863, + "grad_norm": 2.5997228622436523, + "kl": 0.07012939453125, + "learning_rate": 9.662100456621003e-07, + "loss": 0.0001, + "reward": 1.5, + "reward_std": 0.31046149134635925, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 74 + }, + { + "clip_ratio": 0.0, + "completion_length": 598.75, + "epoch": 0.10273972602739725, + "grad_norm": 2.172786235809326, + "kl": 0.12713623046875, + "learning_rate": 9.657534246575343e-07, + "loss": 0.0001, + "reward": 1.234375, + "reward_std": 0.17358146235346794, + "rewards/accuracy_reward": 0.234375, + "rewards/format_reward": 1.0, + "step": 75 + }, + { + "clip_ratio": 0.0, + "completion_length": 653.59375, + "epoch": 0.10410958904109589, + "grad_norm": 2.230001449584961, + "kl": 0.1158447265625, + "learning_rate": 9.65296803652968e-07, + "loss": 0.0001, + "reward": 1.453125, + "reward_std": 0.23453032225370407, + "rewards/accuracy_reward": 0.484375, + "rewards/format_reward": 0.96875, + "step": 76 + }, + { + "clip_ratio": 0.0, + "completion_length": 612.0, + "epoch": 0.10547945205479452, + "grad_norm": 3.474895477294922, + "kl": 0.1109619140625, + "learning_rate": 9.648401826484017e-07, + "loss": 0.0001, + "reward": 1.7578125, + "reward_std": 0.15931576862931252, + "rewards/accuracy_reward": 0.7578125, + "rewards/format_reward": 1.0, + "step": 77 + }, + { + "clip_ratio": 0.0, + "completion_length": 433.8125, + "epoch": 0.10684931506849316, + "grad_norm": 2.6400644779205322, + "kl": 0.0609130859375, + "learning_rate": 9.643835616438357e-07, + "loss": 0.0001, + "reward": 1.28125, + "reward_std": 0.2709311693906784, + "rewards/accuracy_reward": 0.28125, + "rewards/format_reward": 1.0, + "step": 78 + }, + { + "clip_ratio": 0.0, + "completion_length": 556.4375, + "epoch": 0.10821917808219178, + "grad_norm": 3.992541790008545, + "kl": 0.09515380859375, + "learning_rate": 9.639269406392694e-07, + "loss": 0.0001, + "reward": 1.7578125, + "reward_std": 0.4703022539615631, + "rewards/accuracy_reward": 0.7578125, + "rewards/format_reward": 1.0, + "step": 79 + }, + { + "clip_ratio": 0.0, + "completion_length": 647.96875, + "epoch": 0.1095890410958904, + "grad_norm": 3.139620065689087, + "kl": 0.106689453125, + "learning_rate": 9.634703196347032e-07, + "loss": 0.0001, + "reward": 1.53515625, + "reward_std": 0.22270986810326576, + "rewards/accuracy_reward": 0.53515625, + "rewards/format_reward": 1.0, + "step": 80 + }, + { + "clip_ratio": 0.0, + "completion_length": 626.8125, + "epoch": 0.11095890410958904, + "grad_norm": 1.872832179069519, + "kl": 0.1011962890625, + "learning_rate": 9.630136986301369e-07, + "loss": 0.0001, + "reward": 1.6640625, + "reward_std": 0.14312389120459557, + "rewards/accuracy_reward": 0.6640625, + "rewards/format_reward": 1.0, + "step": 81 + }, + { + "clip_ratio": 0.0, + "completion_length": 568.96875, + "epoch": 0.11232876712328767, + "grad_norm": 1.6903170347213745, + "kl": 0.0982666015625, + "learning_rate": 9.625570776255706e-07, + "loss": 0.0001, + "reward": 1.234375, + "reward_std": 0.1804211586713791, + "rewards/accuracy_reward": 0.234375, + "rewards/format_reward": 1.0, + "step": 82 + }, + { + "clip_ratio": 0.0, + "completion_length": 578.625, + "epoch": 0.1136986301369863, + "grad_norm": 2.276421546936035, + "kl": 0.095947265625, + "learning_rate": 9.621004566210046e-07, + "loss": 0.0001, + "reward": 1.4453125, + "reward_std": 0.232578843832016, + "rewards/accuracy_reward": 0.4453125, + "rewards/format_reward": 1.0, + "step": 83 + }, + { + "clip_ratio": 0.0, + "completion_length": 576.03125, + "epoch": 0.11506849315068493, + "grad_norm": 3.133164882659912, + "kl": 0.109130859375, + "learning_rate": 9.616438356164383e-07, + "loss": 0.0001, + "reward": 1.6015625, + "reward_std": 0.2483602836728096, + "rewards/accuracy_reward": 0.6015625, + "rewards/format_reward": 1.0, + "step": 84 + }, + { + "clip_ratio": 0.0, + "completion_length": 512.8125, + "epoch": 0.11643835616438356, + "grad_norm": 2.0008928775787354, + "kl": 0.0712890625, + "learning_rate": 9.61187214611872e-07, + "loss": 0.0001, + "reward": 1.328125, + "reward_std": 0.30617379024624825, + "rewards/accuracy_reward": 0.359375, + "rewards/format_reward": 0.96875, + "step": 85 + }, + { + "clip_ratio": 0.0, + "completion_length": 523.75, + "epoch": 0.1178082191780822, + "grad_norm": 1.2373316287994385, + "kl": 0.082275390625, + "learning_rate": 9.60730593607306e-07, + "loss": 0.0001, + "reward": 1.390625, + "reward_std": 0.10205793008208275, + "rewards/accuracy_reward": 0.390625, + "rewards/format_reward": 1.0, + "step": 86 + }, + { + "clip_ratio": 0.0, + "completion_length": 417.5, + "epoch": 0.11917808219178082, + "grad_norm": 22.696931838989258, + "kl": 0.0906982421875, + "learning_rate": 9.602739726027397e-07, + "loss": 0.0001, + "reward": 1.171875, + "reward_std": 0.13258251920342445, + "rewards/accuracy_reward": 0.171875, + "rewards/format_reward": 1.0, + "step": 87 + }, + { + "clip_ratio": 0.0, + "completion_length": 365.84375, + "epoch": 0.12054794520547946, + "grad_norm": 4.310155868530273, + "kl": 0.05987548828125, + "learning_rate": 9.598173515981735e-07, + "loss": 0.0001, + "reward": 1.1875, + "reward_std": 0.3104073107242584, + "rewards/accuracy_reward": 0.1875, + "rewards/format_reward": 1.0, + "step": 88 + }, + { + "clip_ratio": 0.0, + "completion_length": 588.21875, + "epoch": 0.12191780821917808, + "grad_norm": 0.8474344611167908, + "kl": 0.09967041015625, + "learning_rate": 9.593607305936072e-07, + "loss": 0.0001, + "reward": 1.1875, + "reward_std": 0.0578637570142746, + "rewards/accuracy_reward": 0.1875, + "rewards/format_reward": 1.0, + "step": 89 + }, + { + "clip_ratio": 0.0, + "completion_length": 642.03125, + "epoch": 0.1232876712328767, + "grad_norm": 2.66658091545105, + "kl": 0.12939453125, + "learning_rate": 9.58904109589041e-07, + "loss": 0.0001, + "reward": 1.4296875, + "reward_std": 0.2822495624423027, + "rewards/accuracy_reward": 0.4609375, + "rewards/format_reward": 0.96875, + "step": 90 + }, + { + "clip_ratio": 0.0, + "completion_length": 631.40625, + "epoch": 0.12465753424657534, + "grad_norm": 1.2776012420654297, + "kl": 0.1162109375, + "learning_rate": 9.584474885844749e-07, + "loss": 0.0001, + "reward": 1.34375, + "reward_std": 0.10888781771063805, + "rewards/accuracy_reward": 0.34375, + "rewards/format_reward": 1.0, + "step": 91 + }, + { + "clip_ratio": 0.0, + "completion_length": 463.25, + "epoch": 0.12602739726027398, + "grad_norm": 1.5968950986862183, + "kl": 0.09906005859375, + "learning_rate": 9.579908675799086e-07, + "loss": 0.0001, + "reward": 1.3125, + "reward_std": 0.1356339044868946, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 1.0, + "step": 92 + }, + { + "clip_ratio": 0.0, + "completion_length": 601.21875, + "epoch": 0.1273972602739726, + "grad_norm": 6.543825149536133, + "kl": 0.112060546875, + "learning_rate": 9.575342465753423e-07, + "loss": 0.0001, + "reward": 1.5546875, + "reward_std": 0.29614376835525036, + "rewards/accuracy_reward": 0.5546875, + "rewards/format_reward": 1.0, + "step": 93 + }, + { + "clip_ratio": 0.0, + "completion_length": 710.5625, + "epoch": 0.12876712328767123, + "grad_norm": 4.022386074066162, + "kl": 0.1082763671875, + "learning_rate": 9.570776255707763e-07, + "loss": 0.0001, + "reward": 1.2916666865348816, + "reward_std": 0.289409551769495, + "rewards/accuracy_reward": 0.3854166716337204, + "rewards/format_reward": 0.90625, + "step": 94 + }, + { + "clip_ratio": 0.0, + "completion_length": 499.5625, + "epoch": 0.13013698630136986, + "grad_norm": 1.6726783514022827, + "kl": 0.103271484375, + "learning_rate": 9.5662100456621e-07, + "loss": 0.0001, + "reward": 1.2265625, + "reward_std": 0.14807433634996414, + "rewards/accuracy_reward": 0.2265625, + "rewards/format_reward": 1.0, + "step": 95 + }, + { + "clip_ratio": 0.0, + "completion_length": 636.5, + "epoch": 0.13150684931506848, + "grad_norm": 2.4275333881378174, + "kl": 0.109130859375, + "learning_rate": 9.561643835616437e-07, + "loss": 0.0001, + "reward": 1.4453125, + "reward_std": 0.35531364381313324, + "rewards/accuracy_reward": 0.4765625, + "rewards/format_reward": 0.96875, + "step": 96 + }, + { + "clip_ratio": 0.0, + "completion_length": 639.40625, + "epoch": 0.13287671232876713, + "grad_norm": 1.4377013444900513, + "kl": 0.12249755859375, + "learning_rate": 9.557077625570777e-07, + "loss": 0.0001, + "reward": 1.6145833432674408, + "reward_std": 0.15587851032614708, + "rewards/accuracy_reward": 0.6145833432674408, + "rewards/format_reward": 1.0, + "step": 97 + }, + { + "clip_ratio": 0.0, + "completion_length": 697.5625, + "epoch": 0.13424657534246576, + "grad_norm": 2.026230573654175, + "kl": 0.1431884765625, + "learning_rate": 9.552511415525114e-07, + "loss": 0.0001, + "reward": 1.6875, + "reward_std": 0.18185461685061455, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 1.0, + "step": 98 + }, + { + "clip_ratio": 0.0, + "completion_length": 616.28125, + "epoch": 0.13561643835616438, + "grad_norm": 4.95074462890625, + "kl": 0.153564453125, + "learning_rate": 9.547945205479452e-07, + "loss": 0.0002, + "reward": 2.0625, + "reward_std": 0.3226073309779167, + "rewards/accuracy_reward": 1.09375, + "rewards/format_reward": 0.96875, + "step": 99 + }, + { + "clip_ratio": 0.0, + "completion_length": 694.40625, + "epoch": 0.136986301369863, + "grad_norm": 3.2093119621276855, + "kl": 0.1156005859375, + "learning_rate": 9.54337899543379e-07, + "loss": 0.0001, + "reward": 1.4140625, + "reward_std": 0.18956539407372475, + "rewards/accuracy_reward": 0.4453125, + "rewards/format_reward": 0.96875, + "step": 100 + }, + { + "clip_ratio": 0.0, + "completion_length": 465.21875, + "epoch": 0.13835616438356163, + "grad_norm": 3.3832147121429443, + "kl": 0.0748291015625, + "learning_rate": 9.538812785388126e-07, + "loss": 0.0001, + "reward": 1.2604166865348816, + "reward_std": 0.24269168078899384, + "rewards/accuracy_reward": 0.2604166567325592, + "rewards/format_reward": 1.0, + "step": 101 + }, + { + "clip_ratio": 0.0, + "completion_length": 451.53125, + "epoch": 0.13972602739726028, + "grad_norm": 0.012525072321295738, + "kl": 0.08740234375, + "learning_rate": 9.534246575342465e-07, + "loss": 0.0001, + "reward": 1.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 102 + }, + { + "clip_ratio": 0.0, + "completion_length": 740.4375, + "epoch": 0.1410958904109589, + "grad_norm": 1.9927910566329956, + "kl": 0.1253662109375, + "learning_rate": 9.529680365296803e-07, + "loss": 0.0001, + "reward": 1.59375, + "reward_std": 0.16675157472491264, + "rewards/accuracy_reward": 0.59375, + "rewards/format_reward": 1.0, + "step": 103 + }, + { + "clip_ratio": 0.0, + "completion_length": 503.375, + "epoch": 0.14246575342465753, + "grad_norm": 3.8206396102905273, + "kl": 0.1165771484375, + "learning_rate": 9.525114155251142e-07, + "loss": 0.0001, + "reward": 0.9375, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.9375, + "step": 104 + }, + { + "clip_ratio": 0.0, + "completion_length": 696.6875, + "epoch": 0.14383561643835616, + "grad_norm": 8.162276268005371, + "kl": 0.134521484375, + "learning_rate": 9.520547945205479e-07, + "loss": 0.0001, + "reward": 1.6171875, + "reward_std": 0.19097032584249973, + "rewards/accuracy_reward": 0.6171875, + "rewards/format_reward": 1.0, + "step": 105 + }, + { + "clip_ratio": 0.0, + "completion_length": 656.78125, + "epoch": 0.14520547945205478, + "grad_norm": 2.644918441772461, + "kl": 0.1160888671875, + "learning_rate": 9.515981735159817e-07, + "loss": 0.0001, + "reward": 1.86328125, + "reward_std": 0.41913160867989063, + "rewards/accuracy_reward": 0.86328125, + "rewards/format_reward": 1.0, + "step": 106 + }, + { + "clip_ratio": 0.0, + "completion_length": 613.90625, + "epoch": 0.14657534246575343, + "grad_norm": 3.6577141284942627, + "kl": 0.1168212890625, + "learning_rate": 9.511415525114155e-07, + "loss": 0.0001, + "reward": 1.359375, + "reward_std": 0.19939782842993736, + "rewards/accuracy_reward": 0.359375, + "rewards/format_reward": 1.0, + "step": 107 + }, + { + "clip_ratio": 0.0, + "completion_length": 679.875, + "epoch": 0.14794520547945206, + "grad_norm": 4.23774528503418, + "kl": 0.1524658203125, + "learning_rate": 9.506849315068493e-07, + "loss": 0.0002, + "reward": 1.7565104365348816, + "reward_std": 0.21211734786629677, + "rewards/accuracy_reward": 0.7565104365348816, + "rewards/format_reward": 1.0, + "step": 108 + }, + { + "clip_ratio": 0.0, + "completion_length": 684.34375, + "epoch": 0.14931506849315068, + "grad_norm": 0.013699422590434551, + "kl": 0.1331787109375, + "learning_rate": 9.50228310502283e-07, + "loss": 0.0001, + "reward": 1.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 109 + }, + { + "clip_ratio": 0.0, + "completion_length": 584.3125, + "epoch": 0.1506849315068493, + "grad_norm": 9.379846572875977, + "kl": 0.1199951171875, + "learning_rate": 9.497716894977168e-07, + "loss": 0.0001, + "reward": 1.640625, + "reward_std": 0.36036762222647667, + "rewards/accuracy_reward": 0.640625, + "rewards/format_reward": 1.0, + "step": 110 + }, + { + "clip_ratio": 0.0, + "completion_length": 718.03125, + "epoch": 0.15205479452054796, + "grad_norm": 4.443063735961914, + "kl": 0.126708984375, + "learning_rate": 9.493150684931507e-07, + "loss": 0.0001, + "reward": 1.4427083134651184, + "reward_std": 0.17040568217635155, + "rewards/accuracy_reward": 0.4739583283662796, + "rewards/format_reward": 0.96875, + "step": 111 + }, + { + "clip_ratio": 0.0, + "completion_length": 735.9375, + "epoch": 0.15342465753424658, + "grad_norm": 2.7463929653167725, + "kl": 0.1220703125, + "learning_rate": 9.488584474885845e-07, + "loss": 0.0001, + "reward": 1.46875, + "reward_std": 0.47249409183859825, + "rewards/accuracy_reward": 0.59375, + "rewards/format_reward": 0.875, + "step": 112 + }, + { + "clip_ratio": 0.0, + "completion_length": 444.25, + "epoch": 0.1547945205479452, + "grad_norm": 3.2011280059814453, + "kl": 0.0909423828125, + "learning_rate": 9.484018264840182e-07, + "loss": 0.0001, + "reward": 1.15625, + "reward_std": 0.3061639815568924, + "rewards/accuracy_reward": 0.15625, + "rewards/format_reward": 1.0, + "step": 113 + }, + { + "clip_ratio": 0.0, + "completion_length": 504.8125, + "epoch": 0.15616438356164383, + "grad_norm": 3.5561389923095703, + "kl": 0.10040283203125, + "learning_rate": 9.47945205479452e-07, + "loss": 0.0001, + "reward": 1.203125, + "reward_std": 0.15992168709635735, + "rewards/accuracy_reward": 0.203125, + "rewards/format_reward": 1.0, + "step": 114 + }, + { + "clip_ratio": 0.0, + "completion_length": 519.15625, + "epoch": 0.15753424657534246, + "grad_norm": 1.8714385032653809, + "kl": 0.100341796875, + "learning_rate": 9.474885844748858e-07, + "loss": 0.0001, + "reward": 1.265625, + "reward_std": 0.19408093392848969, + "rewards/accuracy_reward": 0.265625, + "rewards/format_reward": 1.0, + "step": 115 + }, + { + "clip_ratio": 0.0, + "completion_length": 411.625, + "epoch": 0.1589041095890411, + "grad_norm": 6.825204849243164, + "kl": 0.086181640625, + "learning_rate": 9.470319634703196e-07, + "loss": 0.0001, + "reward": 1.21875, + "reward_std": 0.2756393924355507, + "rewards/accuracy_reward": 0.21875, + "rewards/format_reward": 1.0, + "step": 116 + }, + { + "clip_ratio": 0.0, + "completion_length": 616.34375, + "epoch": 0.16027397260273973, + "grad_norm": 1.6678544282913208, + "kl": 0.112060546875, + "learning_rate": 9.465753424657534e-07, + "loss": 0.0001, + "reward": 1.3333333134651184, + "reward_std": 0.1900147907435894, + "rewards/accuracy_reward": 0.3645833134651184, + "rewards/format_reward": 0.96875, + "step": 117 + }, + { + "clip_ratio": 0.0, + "completion_length": 418.1875, + "epoch": 0.16164383561643836, + "grad_norm": 2.2251501083374023, + "kl": 0.0775146484375, + "learning_rate": 9.461187214611872e-07, + "loss": 0.0001, + "reward": 1.1953125, + "reward_std": 0.022097086533904076, + "rewards/accuracy_reward": 0.1953125, + "rewards/format_reward": 1.0, + "step": 118 + }, + { + "clip_ratio": 0.0, + "completion_length": 633.8125, + "epoch": 0.16301369863013698, + "grad_norm": 1.6605937480926514, + "kl": 0.1004638671875, + "learning_rate": 9.45662100456621e-07, + "loss": 0.0001, + "reward": 1.6875, + "reward_std": 0.1872510462999344, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 1.0, + "step": 119 + }, + { + "clip_ratio": 0.0, + "completion_length": 488.3125, + "epoch": 0.1643835616438356, + "grad_norm": 10.110955238342285, + "kl": 0.1434326171875, + "learning_rate": 9.452054794520548e-07, + "loss": 0.0001, + "reward": 1.171875, + "reward_std": 0.13258251920342445, + "rewards/accuracy_reward": 0.171875, + "rewards/format_reward": 1.0, + "step": 120 + }, + { + "clip_ratio": 0.0, + "completion_length": 670.28125, + "epoch": 0.16575342465753426, + "grad_norm": 1.275918960571289, + "kl": 0.1370849609375, + "learning_rate": 9.447488584474885e-07, + "loss": 0.0001, + "reward": 1.1302083432674408, + "reward_std": 0.014731401577591896, + "rewards/accuracy_reward": 0.1302083283662796, + "rewards/format_reward": 1.0, + "step": 121 + }, + { + "clip_ratio": 0.0, + "completion_length": 479.28125, + "epoch": 0.16712328767123288, + "grad_norm": 2.001127004623413, + "kl": 0.1129150390625, + "learning_rate": 9.442922374429223e-07, + "loss": 0.0001, + "reward": 1.3125, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 1.0, + "step": 122 + }, + { + "clip_ratio": 0.0, + "completion_length": 420.59375, + "epoch": 0.1684931506849315, + "grad_norm": 4.077696800231934, + "kl": 0.0904541015625, + "learning_rate": 9.438356164383561e-07, + "loss": 0.0001, + "reward": 1.375, + "reward_std": 0.2619796171784401, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 1.0, + "step": 123 + }, + { + "clip_ratio": 0.0, + "completion_length": 491.8125, + "epoch": 0.16986301369863013, + "grad_norm": 4.269510269165039, + "kl": 0.09326171875, + "learning_rate": 9.4337899543379e-07, + "loss": 0.0001, + "reward": 1.390625, + "reward_std": 0.2665942460298538, + "rewards/accuracy_reward": 0.390625, + "rewards/format_reward": 1.0, + "step": 124 + }, + { + "clip_ratio": 0.0, + "completion_length": 578.78125, + "epoch": 0.17123287671232876, + "grad_norm": 3.122952938079834, + "kl": 0.1368408203125, + "learning_rate": 9.429223744292237e-07, + "loss": 0.0001, + "reward": 1.453125, + "reward_std": 0.16415906324982643, + "rewards/accuracy_reward": 0.453125, + "rewards/format_reward": 1.0, + "step": 125 + }, + { + "clip_ratio": 0.0, + "completion_length": 522.25, + "epoch": 0.1726027397260274, + "grad_norm": 1.53658926486969, + "kl": 0.129150390625, + "learning_rate": 9.424657534246575e-07, + "loss": 0.0001, + "reward": 1.66015625, + "reward_std": 0.19974715635180473, + "rewards/accuracy_reward": 0.66015625, + "rewards/format_reward": 1.0, + "step": 126 + }, + { + "clip_ratio": 0.0, + "completion_length": 548.1875, + "epoch": 0.17397260273972603, + "grad_norm": 2.161501884460449, + "kl": 0.117919921875, + "learning_rate": 9.420091324200913e-07, + "loss": 0.0001, + "reward": 1.25, + "reward_std": 0.1462521031498909, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 1.0, + "step": 127 + }, + { + "clip_ratio": 0.0, + "completion_length": 301.65625, + "epoch": 0.17534246575342466, + "grad_norm": 2.8414783477783203, + "kl": 0.0875244140625, + "learning_rate": 9.41552511415525e-07, + "loss": 0.0001, + "reward": 1.5625, + "reward_std": 0.3335031494498253, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 128 + }, + { + "clip_ratio": 0.0, + "completion_length": 607.3125, + "epoch": 0.17671232876712328, + "grad_norm": 2.295241355895996, + "kl": 0.1317138671875, + "learning_rate": 9.410958904109588e-07, + "loss": 0.0001, + "reward": 1.6302083134651184, + "reward_std": 0.18143897131085396, + "rewards/accuracy_reward": 0.6302083283662796, + "rewards/format_reward": 1.0, + "step": 129 + }, + { + "clip_ratio": 0.0, + "completion_length": 596.71875, + "epoch": 0.1780821917808219, + "grad_norm": 2.436352491378784, + "kl": 0.13232421875, + "learning_rate": 9.406392694063926e-07, + "loss": 0.0001, + "reward": 1.7109375, + "reward_std": 0.2758216764777899, + "rewards/accuracy_reward": 0.7109375, + "rewards/format_reward": 1.0, + "step": 130 + }, + { + "clip_ratio": 0.0, + "completion_length": 539.25, + "epoch": 0.17945205479452056, + "grad_norm": 3.0529682636260986, + "kl": 0.13720703125, + "learning_rate": 9.401826484018265e-07, + "loss": 0.0001, + "reward": 1.50390625, + "reward_std": 0.07282309234142303, + "rewards/accuracy_reward": 0.50390625, + "rewards/format_reward": 1.0, + "step": 131 + }, + { + "clip_ratio": 0.0, + "completion_length": 483.15625, + "epoch": 0.18082191780821918, + "grad_norm": 1.2933306694030762, + "kl": 0.123291015625, + "learning_rate": 9.397260273972603e-07, + "loss": 0.0001, + "reward": 1.09375, + "reward_std": 0.1293872892856598, + "rewards/accuracy_reward": 0.09375, + "rewards/format_reward": 1.0, + "step": 132 + }, + { + "clip_ratio": 0.0, + "completion_length": 562.71875, + "epoch": 0.1821917808219178, + "grad_norm": 1.558491826057434, + "kl": 0.11572265625, + "learning_rate": 9.39269406392694e-07, + "loss": 0.0001, + "reward": 1.5677083432674408, + "reward_std": 0.0725951585918665, + "rewards/accuracy_reward": 0.5677083432674408, + "rewards/format_reward": 1.0, + "step": 133 + }, + { + "clip_ratio": 0.0, + "completion_length": 488.78125, + "epoch": 0.18356164383561643, + "grad_norm": 2.861250400543213, + "kl": 0.113525390625, + "learning_rate": 9.388127853881278e-07, + "loss": 0.0001, + "reward": 1.375, + "reward_std": 0.2966036908328533, + "rewards/accuracy_reward": 0.40625, + "rewards/format_reward": 0.96875, + "step": 134 + }, + { + "clip_ratio": 0.0, + "completion_length": 567.28125, + "epoch": 0.18493150684931506, + "grad_norm": 2.4764702320098877, + "kl": 0.123291015625, + "learning_rate": 9.383561643835616e-07, + "loss": 0.0001, + "reward": 1.5825892686843872, + "reward_std": 0.20715469866991043, + "rewards/accuracy_reward": 0.5825892686843872, + "rewards/format_reward": 1.0, + "step": 135 + }, + { + "clip_ratio": 0.0, + "completion_length": 539.53125, + "epoch": 0.1863013698630137, + "grad_norm": 2.0686910152435303, + "kl": 0.1162109375, + "learning_rate": 9.378995433789953e-07, + "loss": 0.0001, + "reward": 1.40234375, + "reward_std": 0.21405773982405663, + "rewards/accuracy_reward": 0.40234375, + "rewards/format_reward": 1.0, + "step": 136 + }, + { + "clip_ratio": 0.0, + "completion_length": 659.84375, + "epoch": 0.18767123287671234, + "grad_norm": 1.7358382940292358, + "kl": 0.1298828125, + "learning_rate": 9.374429223744292e-07, + "loss": 0.0001, + "reward": 1.6507812142372131, + "reward_std": 0.2213343046605587, + "rewards/accuracy_reward": 0.6820312440395355, + "rewards/format_reward": 0.96875, + "step": 137 + }, + { + "clip_ratio": 0.0, + "completion_length": 395.5, + "epoch": 0.18904109589041096, + "grad_norm": 2.3077900409698486, + "kl": 0.114501953125, + "learning_rate": 9.36986301369863e-07, + "loss": 0.0001, + "reward": 1.2265625, + "reward_std": 0.22621294669806957, + "rewards/accuracy_reward": 0.2265625, + "rewards/format_reward": 1.0, + "step": 138 + }, + { + "clip_ratio": 0.0, + "completion_length": 580.59375, + "epoch": 0.19041095890410958, + "grad_norm": 0.7606338262557983, + "kl": 0.135498046875, + "learning_rate": 9.365296803652968e-07, + "loss": 0.0001, + "reward": 1.3333333134651184, + "reward_std": 0.07042950391769409, + "rewards/accuracy_reward": 0.3333333432674408, + "rewards/format_reward": 1.0, + "step": 139 + }, + { + "clip_ratio": 0.0, + "completion_length": 691.9375, + "epoch": 0.1917808219178082, + "grad_norm": 3.7832796573638916, + "kl": 0.1358642578125, + "learning_rate": 9.360730593607306e-07, + "loss": 0.0001, + "reward": 1.526041716337204, + "reward_std": 0.2930229790508747, + "rewards/accuracy_reward": 0.5885416567325592, + "rewards/format_reward": 0.9375, + "step": 140 + }, + { + "clip_ratio": 0.0, + "completion_length": 681.34375, + "epoch": 0.19315068493150686, + "grad_norm": 5.2146830558776855, + "kl": 0.1507568359375, + "learning_rate": 9.356164383561643e-07, + "loss": 0.0002, + "reward": 1.734375, + "reward_std": 0.16887323930859566, + "rewards/accuracy_reward": 0.734375, + "rewards/format_reward": 1.0, + "step": 141 + }, + { + "clip_ratio": 0.0, + "completion_length": 628.59375, + "epoch": 0.19452054794520549, + "grad_norm": 1.6805075407028198, + "kl": 0.158935546875, + "learning_rate": 9.351598173515981e-07, + "loss": 0.0002, + "reward": 1.3515625, + "reward_std": 0.07996084354817867, + "rewards/accuracy_reward": 0.3515625, + "rewards/format_reward": 1.0, + "step": 142 + }, + { + "clip_ratio": 0.0, + "completion_length": 453.8125, + "epoch": 0.1958904109589041, + "grad_norm": 2.2252542972564697, + "kl": 0.130615234375, + "learning_rate": 9.347031963470319e-07, + "loss": 0.0001, + "reward": 1.28125, + "reward_std": 0.0883883461356163, + "rewards/accuracy_reward": 0.28125, + "rewards/format_reward": 1.0, + "step": 143 + }, + { + "clip_ratio": 0.0, + "completion_length": 487.96875, + "epoch": 0.19726027397260273, + "grad_norm": 2.0735678672790527, + "kl": 0.164306640625, + "learning_rate": 9.342465753424658e-07, + "loss": 0.0002, + "reward": 1.4791666567325592, + "reward_std": 0.23902175202965736, + "rewards/accuracy_reward": 0.4791666567325592, + "rewards/format_reward": 1.0, + "step": 144 + }, + { + "clip_ratio": 0.0, + "completion_length": 534.96875, + "epoch": 0.19863013698630136, + "grad_norm": 1.4199141263961792, + "kl": 0.1151123046875, + "learning_rate": 9.337899543378995e-07, + "loss": 0.0001, + "reward": 1.328125, + "reward_std": 0.220042884349823, + "rewards/accuracy_reward": 0.328125, + "rewards/format_reward": 1.0, + "step": 145 + }, + { + "clip_ratio": 0.0, + "completion_length": 446.3125, + "epoch": 0.2, + "grad_norm": 4.550744533538818, + "kl": 0.13037109375, + "learning_rate": 9.333333333333333e-07, + "loss": 0.0001, + "reward": 1.53125, + "reward_std": 0.2041158601641655, + "rewards/accuracy_reward": 0.53125, + "rewards/format_reward": 1.0, + "step": 146 + }, + { + "clip_ratio": 0.0, + "completion_length": 583.625, + "epoch": 0.20136986301369864, + "grad_norm": 0.7138487100601196, + "kl": 0.1424560546875, + "learning_rate": 9.328767123287671e-07, + "loss": 0.0001, + "reward": 1.15625, + "reward_std": 0.0578637570142746, + "rewards/accuracy_reward": 0.15625, + "rewards/format_reward": 1.0, + "step": 147 + }, + { + "clip_ratio": 0.0, + "completion_length": 536.65625, + "epoch": 0.20273972602739726, + "grad_norm": 5.086813449859619, + "kl": 0.1173095703125, + "learning_rate": 9.324200913242009e-07, + "loss": 0.0001, + "reward": 1.4609375, + "reward_std": 0.05476716160774231, + "rewards/accuracy_reward": 0.4609375, + "rewards/format_reward": 1.0, + "step": 148 + }, + { + "clip_ratio": 0.0, + "completion_length": 673.65625, + "epoch": 0.20410958904109588, + "grad_norm": 0.8646766543388367, + "kl": 0.1317138671875, + "learning_rate": 9.319634703196346e-07, + "loss": 0.0001, + "reward": 1.5104166567325592, + "reward_std": 0.32082508504390717, + "rewards/accuracy_reward": 0.5416666865348816, + "rewards/format_reward": 0.96875, + "step": 149 + }, + { + "clip_ratio": 0.0, + "completion_length": 396.40625, + "epoch": 0.2054794520547945, + "grad_norm": 2.680182456970215, + "kl": 0.12353515625, + "learning_rate": 9.315068493150684e-07, + "loss": 0.0001, + "reward": 1.40625, + "reward_std": 0.4045617878437042, + "rewards/accuracy_reward": 0.40625, + "rewards/format_reward": 1.0, + "step": 150 + }, + { + "clip_ratio": 0.0, + "completion_length": 614.90625, + "epoch": 0.20684931506849316, + "grad_norm": 1.8212926387786865, + "kl": 0.108642578125, + "learning_rate": 9.310502283105023e-07, + "loss": 0.0001, + "reward": 1.3177083432674408, + "reward_std": 0.13903126679360867, + "rewards/accuracy_reward": 0.3177083432674408, + "rewards/format_reward": 1.0, + "step": 151 + }, + { + "clip_ratio": 0.0, + "completion_length": 510.625, + "epoch": 0.20821917808219179, + "grad_norm": 2.6356329917907715, + "kl": 0.1246337890625, + "learning_rate": 9.30593607305936e-07, + "loss": 0.0001, + "reward": 1.4812500178813934, + "reward_std": 0.45198121294379234, + "rewards/accuracy_reward": 0.512499988079071, + "rewards/format_reward": 0.96875, + "step": 152 + }, + { + "clip_ratio": 0.0, + "completion_length": 449.8125, + "epoch": 0.2095890410958904, + "grad_norm": 3.026435613632202, + "kl": 0.1207275390625, + "learning_rate": 9.301369863013698e-07, + "loss": 0.0001, + "reward": 1.3125, + "reward_std": 0.3230287954211235, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 1.0, + "step": 153 + }, + { + "clip_ratio": 0.0, + "completion_length": 630.15625, + "epoch": 0.21095890410958903, + "grad_norm": 0.8813155293464661, + "kl": 0.1131591796875, + "learning_rate": 9.296803652968036e-07, + "loss": 0.0001, + "reward": 1.3294270634651184, + "reward_std": 0.18091023340821266, + "rewards/accuracy_reward": 0.3606770634651184, + "rewards/format_reward": 0.96875, + "step": 154 + }, + { + "clip_ratio": 0.0, + "completion_length": 514.59375, + "epoch": 0.21232876712328766, + "grad_norm": 2.931427001953125, + "kl": 0.1202392578125, + "learning_rate": 9.292237442922374e-07, + "loss": 0.0001, + "reward": 1.515625, + "reward_std": 0.17358146235346794, + "rewards/accuracy_reward": 0.515625, + "rewards/format_reward": 1.0, + "step": 155 + }, + { + "clip_ratio": 0.0, + "completion_length": 426.25, + "epoch": 0.2136986301369863, + "grad_norm": 1.421940803527832, + "kl": 0.1243896484375, + "learning_rate": 9.287671232876712e-07, + "loss": 0.0001, + "reward": 1.125, + "reward_std": 0.13363061845302582, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 1.0, + "step": 156 + }, + { + "clip_ratio": 0.0, + "completion_length": 627.15625, + "epoch": 0.21506849315068494, + "grad_norm": 0.9390948414802551, + "kl": 0.134521484375, + "learning_rate": 9.28310502283105e-07, + "loss": 0.0001, + "reward": 1.35546875, + "reward_std": 0.06765139661729336, + "rewards/accuracy_reward": 0.35546875, + "rewards/format_reward": 1.0, + "step": 157 + }, + { + "clip_ratio": 0.0, + "completion_length": 721.1875, + "epoch": 0.21643835616438356, + "grad_norm": 1.8904517889022827, + "kl": 0.130615234375, + "learning_rate": 9.278538812785388e-07, + "loss": 0.0001, + "reward": 1.3515625, + "reward_std": 0.10474801808595657, + "rewards/accuracy_reward": 0.3515625, + "rewards/format_reward": 1.0, + "step": 158 + }, + { + "clip_ratio": 0.0, + "completion_length": 517.71875, + "epoch": 0.21780821917808219, + "grad_norm": 6.320685386657715, + "kl": 0.1202392578125, + "learning_rate": 9.273972602739726e-07, + "loss": 0.0001, + "reward": 1.4107142686843872, + "reward_std": 0.2957366779446602, + "rewards/accuracy_reward": 0.4107142984867096, + "rewards/format_reward": 1.0, + "step": 159 + }, + { + "clip_ratio": 0.0, + "completion_length": 613.5625, + "epoch": 0.2191780821917808, + "grad_norm": 0.6069585680961609, + "kl": 0.1357421875, + "learning_rate": 9.269406392694063e-07, + "loss": 0.0001, + "reward": 1.1875, + "reward_std": 0.06681530922651291, + "rewards/accuracy_reward": 0.1875, + "rewards/format_reward": 1.0, + "step": 160 + }, + { + "clip_ratio": 0.0, + "completion_length": 366.25, + "epoch": 0.22054794520547946, + "grad_norm": 3.5432889461517334, + "kl": 0.18798828125, + "learning_rate": 9.264840182648401e-07, + "loss": 0.0002, + "reward": 1.5, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 161 + }, + { + "clip_ratio": 0.0, + "completion_length": 645.9375, + "epoch": 0.2219178082191781, + "grad_norm": 2.56186580657959, + "kl": 0.149169921875, + "learning_rate": 9.260273972602739e-07, + "loss": 0.0001, + "reward": 1.6875, + "reward_std": 0.1825428232550621, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 1.0, + "step": 162 + }, + { + "clip_ratio": 0.0, + "completion_length": 556.8125, + "epoch": 0.2232876712328767, + "grad_norm": 5.973612308502197, + "kl": 0.14306640625, + "learning_rate": 9.255707762557077e-07, + "loss": 0.0001, + "reward": 1.359375, + "reward_std": 0.19939782842993736, + "rewards/accuracy_reward": 0.359375, + "rewards/format_reward": 1.0, + "step": 163 + }, + { + "clip_ratio": 0.0, + "completion_length": 401.125, + "epoch": 0.22465753424657534, + "grad_norm": 1.2704259157180786, + "kl": 0.14794921875, + "learning_rate": 9.251141552511416e-07, + "loss": 0.0001, + "reward": 1.1796875, + "reward_std": 0.04005437344312668, + "rewards/accuracy_reward": 0.1796875, + "rewards/format_reward": 1.0, + "step": 164 + }, + { + "clip_ratio": 0.0, + "completion_length": 500.28125, + "epoch": 0.22602739726027396, + "grad_norm": 2.514188289642334, + "kl": 0.126220703125, + "learning_rate": 9.246575342465753e-07, + "loss": 0.0001, + "reward": 1.515625, + "reward_std": 0.2561880201101303, + "rewards/accuracy_reward": 0.515625, + "rewards/format_reward": 1.0, + "step": 165 + }, + { + "clip_ratio": 0.0, + "completion_length": 701.75, + "epoch": 0.2273972602739726, + "grad_norm": 1.1485939025878906, + "kl": 0.116943359375, + "learning_rate": 9.242009132420091e-07, + "loss": 0.0001, + "reward": 1.265625, + "reward_std": 0.2461063265800476, + "rewards/accuracy_reward": 0.328125, + "rewards/format_reward": 0.9375, + "step": 166 + }, + { + "clip_ratio": 0.0, + "completion_length": 625.125, + "epoch": 0.22876712328767124, + "grad_norm": 16.20563316345215, + "kl": 0.132568359375, + "learning_rate": 9.237442922374429e-07, + "loss": 0.0001, + "reward": 1.550000011920929, + "reward_std": 0.15476782992482185, + "rewards/accuracy_reward": 0.5499999970197678, + "rewards/format_reward": 1.0, + "step": 167 + }, + { + "clip_ratio": 0.0, + "completion_length": 476.78125, + "epoch": 0.23013698630136986, + "grad_norm": 0.7318564057350159, + "kl": 0.146728515625, + "learning_rate": 9.232876712328766e-07, + "loss": 0.0001, + "reward": 1.21875, + "reward_std": 0.0578637570142746, + "rewards/accuracy_reward": 0.21875, + "rewards/format_reward": 1.0, + "step": 168 + }, + { + "clip_ratio": 0.0, + "completion_length": 577.28125, + "epoch": 0.23150684931506849, + "grad_norm": 3.2940330505371094, + "kl": 0.136962890625, + "learning_rate": 9.228310502283104e-07, + "loss": 0.0001, + "reward": 1.5013020634651184, + "reward_std": 0.0952342264354229, + "rewards/accuracy_reward": 0.5013020783662796, + "rewards/format_reward": 1.0, + "step": 169 + }, + { + "clip_ratio": 0.0, + "completion_length": 475.6875, + "epoch": 0.2328767123287671, + "grad_norm": 6.060558795928955, + "kl": 0.1484375, + "learning_rate": 9.223744292237442e-07, + "loss": 0.0001, + "reward": 1.4140625, + "reward_std": 0.2114126794040203, + "rewards/accuracy_reward": 0.4140625, + "rewards/format_reward": 1.0, + "step": 170 + }, + { + "clip_ratio": 0.0, + "completion_length": 559.96875, + "epoch": 0.23424657534246576, + "grad_norm": 2.6800572872161865, + "kl": 0.1197509765625, + "learning_rate": 9.219178082191781e-07, + "loss": 0.0001, + "reward": 1.4947916567325592, + "reward_std": 0.1304523590952158, + "rewards/accuracy_reward": 0.4947916567325592, + "rewards/format_reward": 1.0, + "step": 171 + }, + { + "clip_ratio": 0.0, + "completion_length": 597.0625, + "epoch": 0.2356164383561644, + "grad_norm": 1.535984992980957, + "kl": 0.2501220703125, + "learning_rate": 9.214611872146119e-07, + "loss": 0.0002, + "reward": 1.28125, + "reward_std": 0.1293872892856598, + "rewards/accuracy_reward": 0.28125, + "rewards/format_reward": 1.0, + "step": 172 + }, + { + "clip_ratio": 0.0, + "completion_length": 520.125, + "epoch": 0.236986301369863, + "grad_norm": 1.8922423124313354, + "kl": 0.1363525390625, + "learning_rate": 9.210045662100456e-07, + "loss": 0.0001, + "reward": 1.3671875, + "reward_std": 0.16834918968379498, + "rewards/accuracy_reward": 0.3671875, + "rewards/format_reward": 1.0, + "step": 173 + }, + { + "clip_ratio": 0.0, + "completion_length": 489.4375, + "epoch": 0.23835616438356164, + "grad_norm": 4.069134712219238, + "kl": 0.154052734375, + "learning_rate": 9.205479452054794e-07, + "loss": 0.0002, + "reward": 1.5, + "reward_std": 0.33614395931363106, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 174 + }, + { + "clip_ratio": 0.0, + "completion_length": 562.1875, + "epoch": 0.23972602739726026, + "grad_norm": 3.013641119003296, + "kl": 0.1435546875, + "learning_rate": 9.200913242009132e-07, + "loss": 0.0001, + "reward": 1.21875, + "reward_std": 0.1552036553621292, + "rewards/accuracy_reward": 0.21875, + "rewards/format_reward": 1.0, + "step": 175 + }, + { + "clip_ratio": 0.0, + "completion_length": 602.9375, + "epoch": 0.2410958904109589, + "grad_norm": 1.713585376739502, + "kl": 0.1307373046875, + "learning_rate": 9.196347031963469e-07, + "loss": 0.0001, + "reward": 1.6640625, + "reward_std": 0.202580526471138, + "rewards/accuracy_reward": 0.6640625, + "rewards/format_reward": 1.0, + "step": 176 + }, + { + "clip_ratio": 0.0, + "completion_length": 458.90625, + "epoch": 0.24246575342465754, + "grad_norm": 1.653497576713562, + "kl": 0.133544921875, + "learning_rate": 9.191780821917808e-07, + "loss": 0.0001, + "reward": 1.3828125, + "reward_std": 0.23987272381782532, + "rewards/accuracy_reward": 0.3828125, + "rewards/format_reward": 1.0, + "step": 177 + }, + { + "clip_ratio": 0.0, + "completion_length": 302.875, + "epoch": 0.24383561643835616, + "grad_norm": 1.537979006767273, + "kl": 0.167236328125, + "learning_rate": 9.187214611872146e-07, + "loss": 0.0002, + "reward": 1.0625, + "reward_std": 0.1157275140285492, + "rewards/accuracy_reward": 0.0625, + "rewards/format_reward": 1.0, + "step": 178 + }, + { + "clip_ratio": 0.0, + "completion_length": 538.28125, + "epoch": 0.2452054794520548, + "grad_norm": 1.1757584810256958, + "kl": 0.146484375, + "learning_rate": 9.182648401826484e-07, + "loss": 0.0001, + "reward": 1.6953125, + "reward_std": 0.08516896516084671, + "rewards/accuracy_reward": 0.6953125, + "rewards/format_reward": 1.0, + "step": 179 + }, + { + "clip_ratio": 0.0, + "completion_length": 525.3125, + "epoch": 0.2465753424657534, + "grad_norm": 2.4092350006103516, + "kl": 0.143310546875, + "learning_rate": 9.178082191780822e-07, + "loss": 0.0001, + "reward": 1.4921875, + "reward_std": 0.22854942083358765, + "rewards/accuracy_reward": 0.4921875, + "rewards/format_reward": 1.0, + "step": 180 + }, + { + "clip_ratio": 0.0, + "completion_length": 616.53125, + "epoch": 0.24794520547945206, + "grad_norm": 2.0598623752593994, + "kl": 0.1513671875, + "learning_rate": 9.173515981735159e-07, + "loss": 0.0002, + "reward": 1.3125, + "reward_std": 0.1157275140285492, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 1.0, + "step": 181 + }, + { + "clip_ratio": 0.0, + "completion_length": 592.84375, + "epoch": 0.2493150684931507, + "grad_norm": 3.4757208824157715, + "kl": 0.15380859375, + "learning_rate": 9.168949771689497e-07, + "loss": 0.0002, + "reward": 1.4322916567325592, + "reward_std": 0.11389755457639694, + "rewards/accuracy_reward": 0.4322916716337204, + "rewards/format_reward": 1.0, + "step": 182 + }, + { + "clip_ratio": 0.0, + "completion_length": 428.53125, + "epoch": 0.25068493150684934, + "grad_norm": 1.2352172136306763, + "kl": 0.1416015625, + "learning_rate": 9.164383561643835e-07, + "loss": 0.0001, + "reward": 1.7265625, + "reward_std": 0.14262642711400986, + "rewards/accuracy_reward": 0.7265625, + "rewards/format_reward": 1.0, + "step": 183 + }, + { + "clip_ratio": 0.0, + "completion_length": 479.46875, + "epoch": 0.25205479452054796, + "grad_norm": 2.0595240592956543, + "kl": 0.143798828125, + "learning_rate": 9.159817351598174e-07, + "loss": 0.0001, + "reward": 1.671875, + "reward_std": 0.3808670938014984, + "rewards/accuracy_reward": 0.671875, + "rewards/format_reward": 1.0, + "step": 184 + }, + { + "clip_ratio": 0.0, + "completion_length": 542.75, + "epoch": 0.2534246575342466, + "grad_norm": 2.4244847297668457, + "kl": 0.14208984375, + "learning_rate": 9.155251141552511e-07, + "loss": 0.0001, + "reward": 1.3046875, + "reward_std": 0.04339781776070595, + "rewards/accuracy_reward": 0.3046875, + "rewards/format_reward": 1.0, + "step": 185 + }, + { + "clip_ratio": 0.0, + "completion_length": 411.1875, + "epoch": 0.2547945205479452, + "grad_norm": 2.7618906497955322, + "kl": 0.138671875, + "learning_rate": 9.150684931506849e-07, + "loss": 0.0001, + "reward": 1.3828125, + "reward_std": 0.13782460056245327, + "rewards/accuracy_reward": 0.3828125, + "rewards/format_reward": 1.0, + "step": 186 + }, + { + "clip_ratio": 0.0, + "completion_length": 480.375, + "epoch": 0.25616438356164384, + "grad_norm": 1.3447990417480469, + "kl": 0.16015625, + "learning_rate": 9.146118721461187e-07, + "loss": 0.0002, + "reward": 1.2942708432674408, + "reward_std": 0.11083479970693588, + "rewards/accuracy_reward": 0.2942708283662796, + "rewards/format_reward": 1.0, + "step": 187 + }, + { + "clip_ratio": 0.0, + "completion_length": 482.59375, + "epoch": 0.25753424657534246, + "grad_norm": 1.3052458763122559, + "kl": 0.17041015625, + "learning_rate": 9.141552511415525e-07, + "loss": 0.0002, + "reward": 1.40625, + "reward_std": 0.1246790662407875, + "rewards/accuracy_reward": 0.40625, + "rewards/format_reward": 1.0, + "step": 188 + }, + { + "clip_ratio": 0.0, + "completion_length": 524.96875, + "epoch": 0.2589041095890411, + "grad_norm": 1.417614221572876, + "kl": 0.1513671875, + "learning_rate": 9.136986301369862e-07, + "loss": 0.0002, + "reward": 1.2109375, + "reward_std": 0.17315101623535156, + "rewards/accuracy_reward": 0.2109375, + "rewards/format_reward": 1.0, + "step": 189 + }, + { + "clip_ratio": 0.0, + "completion_length": 485.75, + "epoch": 0.2602739726027397, + "grad_norm": 1.8264148235321045, + "kl": 0.179931640625, + "learning_rate": 9.1324200913242e-07, + "loss": 0.0002, + "reward": 1.359375, + "reward_std": 0.09143973141908646, + "rewards/accuracy_reward": 0.359375, + "rewards/format_reward": 1.0, + "step": 190 + }, + { + "clip_ratio": 0.0, + "completion_length": 398.78125, + "epoch": 0.26164383561643834, + "grad_norm": 2.271404266357422, + "kl": 0.276123046875, + "learning_rate": 9.127853881278539e-07, + "loss": 0.0003, + "reward": 1.5, + "reward_std": 0.1293042004108429, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 191 + }, + { + "clip_ratio": 0.0, + "completion_length": 579.90625, + "epoch": 0.26301369863013696, + "grad_norm": 1.1621103286743164, + "kl": 0.146484375, + "learning_rate": 9.123287671232876e-07, + "loss": 0.0001, + "reward": 1.34375, + "reward_std": 0.10888781771063805, + "rewards/accuracy_reward": 0.34375, + "rewards/format_reward": 1.0, + "step": 192 + }, + { + "clip_ratio": 0.0, + "completion_length": 418.71875, + "epoch": 0.26438356164383564, + "grad_norm": 2.4608054161071777, + "kl": 0.1552734375, + "learning_rate": 9.118721461187214e-07, + "loss": 0.0002, + "reward": 1.84375, + "reward_std": 0.2704022154211998, + "rewards/accuracy_reward": 0.84375, + "rewards/format_reward": 1.0, + "step": 193 + }, + { + "clip_ratio": 0.0, + "completion_length": 422.65625, + "epoch": 0.26575342465753427, + "grad_norm": 2.2332265377044678, + "kl": 0.152587890625, + "learning_rate": 9.114155251141552e-07, + "loss": 0.0002, + "reward": 1.5546875, + "reward_std": 0.27746163308620453, + "rewards/accuracy_reward": 0.5546875, + "rewards/format_reward": 1.0, + "step": 194 + }, + { + "clip_ratio": 0.0, + "completion_length": 527.59375, + "epoch": 0.2671232876712329, + "grad_norm": 1.5108232498168945, + "kl": 0.13525390625, + "learning_rate": 9.10958904109589e-07, + "loss": 0.0001, + "reward": 1.375, + "reward_std": 0.115727499127388, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 1.0, + "step": 195 + }, + { + "clip_ratio": 0.0, + "completion_length": 435.625, + "epoch": 0.2684931506849315, + "grad_norm": 4.214125633239746, + "kl": 0.14013671875, + "learning_rate": 9.105022831050228e-07, + "loss": 0.0001, + "reward": 1.3984375, + "reward_std": 0.18013210594654083, + "rewards/accuracy_reward": 0.3984375, + "rewards/format_reward": 1.0, + "step": 196 + }, + { + "clip_ratio": 0.0, + "completion_length": 539.3125, + "epoch": 0.26986301369863014, + "grad_norm": 5.064935207366943, + "kl": 0.165771484375, + "learning_rate": 9.100456621004566e-07, + "loss": 0.0002, + "reward": 1.7109375, + "reward_std": 0.27000918984413147, + "rewards/accuracy_reward": 0.7109375, + "rewards/format_reward": 1.0, + "step": 197 + }, + { + "clip_ratio": 0.0, + "completion_length": 428.4375, + "epoch": 0.27123287671232876, + "grad_norm": 1.5454697608947754, + "kl": 0.18994140625, + "learning_rate": 9.095890410958904e-07, + "loss": 0.0002, + "reward": 1.4296875, + "reward_std": 0.13488983362913132, + "rewards/accuracy_reward": 0.4296875, + "rewards/format_reward": 1.0, + "step": 198 + }, + { + "clip_ratio": 0.0, + "completion_length": 391.34375, + "epoch": 0.2726027397260274, + "grad_norm": 4.849545478820801, + "kl": 0.1414794921875, + "learning_rate": 9.091324200913242e-07, + "loss": 0.0001, + "reward": 1.234375, + "reward_std": 0.432646207511425, + "rewards/accuracy_reward": 0.296875, + "rewards/format_reward": 0.9375, + "step": 199 + }, + { + "clip_ratio": 0.0, + "completion_length": 587.40625, + "epoch": 0.273972602739726, + "grad_norm": 1.7864863872528076, + "kl": 0.173583984375, + "learning_rate": 9.08675799086758e-07, + "loss": 0.0002, + "reward": 1.53125, + "reward_std": 0.14913516864180565, + "rewards/accuracy_reward": 0.53125, + "rewards/format_reward": 1.0, + "step": 200 + }, + { + "clip_ratio": 0.0, + "completion_length": 515.65625, + "epoch": 0.27534246575342464, + "grad_norm": 1.9118916988372803, + "kl": 0.177734375, + "learning_rate": 9.082191780821917e-07, + "loss": 0.0002, + "reward": 1.640625, + "reward_std": 0.35035815089941025, + "rewards/accuracy_reward": 0.640625, + "rewards/format_reward": 1.0, + "step": 201 + }, + { + "clip_ratio": 0.0, + "completion_length": 428.84375, + "epoch": 0.27671232876712326, + "grad_norm": 3.346449613571167, + "kl": 0.16943359375, + "learning_rate": 9.077625570776255e-07, + "loss": 0.0002, + "reward": 1.875, + "reward_std": 0.4419417232275009, + "rewards/accuracy_reward": 0.90625, + "rewards/format_reward": 0.96875, + "step": 202 + }, + { + "clip_ratio": 0.0, + "completion_length": 607.90625, + "epoch": 0.27808219178082194, + "grad_norm": 2.130422830581665, + "kl": 0.161865234375, + "learning_rate": 9.073059360730593e-07, + "loss": 0.0002, + "reward": 1.2890625, + "reward_std": 0.27090023458004, + "rewards/accuracy_reward": 0.4140625, + "rewards/format_reward": 0.875, + "step": 203 + }, + { + "clip_ratio": 0.0, + "completion_length": 521.65625, + "epoch": 0.27945205479452057, + "grad_norm": 2.6185011863708496, + "kl": 0.1533203125, + "learning_rate": 9.068493150684932e-07, + "loss": 0.0002, + "reward": 1.625, + "reward_std": 0.18702642805874348, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 204 + }, + { + "clip_ratio": 0.0, + "completion_length": 430.5625, + "epoch": 0.2808219178082192, + "grad_norm": 5.361741065979004, + "kl": 0.174072265625, + "learning_rate": 9.063926940639269e-07, + "loss": 0.0002, + "reward": 1.34375, + "reward_std": 0.19149437546730042, + "rewards/accuracy_reward": 0.34375, + "rewards/format_reward": 1.0, + "step": 205 + }, + { + "clip_ratio": 0.0, + "completion_length": 417.5625, + "epoch": 0.2821917808219178, + "grad_norm": 1.3938766717910767, + "kl": 0.155517578125, + "learning_rate": 9.059360730593607e-07, + "loss": 0.0002, + "reward": 1.453125, + "reward_std": 0.10205793008208275, + "rewards/accuracy_reward": 0.453125, + "rewards/format_reward": 1.0, + "step": 206 + }, + { + "clip_ratio": 0.0, + "completion_length": 437.875, + "epoch": 0.28356164383561644, + "grad_norm": 1.8740441799163818, + "kl": 0.191162109375, + "learning_rate": 9.054794520547945e-07, + "loss": 0.0002, + "reward": 1.3125, + "reward_std": 0.1872510462999344, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 1.0, + "step": 207 + }, + { + "clip_ratio": 0.0, + "completion_length": 438.59375, + "epoch": 0.28493150684931506, + "grad_norm": 10.041444778442383, + "kl": 0.17919921875, + "learning_rate": 9.050228310502282e-07, + "loss": 0.0002, + "reward": 1.4453125, + "reward_std": 0.26258746162056923, + "rewards/accuracy_reward": 0.4453125, + "rewards/format_reward": 1.0, + "step": 208 + }, + { + "clip_ratio": 0.0, + "completion_length": 443.15625, + "epoch": 0.2863013698630137, + "grad_norm": 2.726414203643799, + "kl": 0.166015625, + "learning_rate": 9.04566210045662e-07, + "loss": 0.0002, + "reward": 1.453125, + "reward_std": 0.0646936446428299, + "rewards/accuracy_reward": 0.453125, + "rewards/format_reward": 1.0, + "step": 209 + }, + { + "clip_ratio": 0.0, + "completion_length": 337.84375, + "epoch": 0.2876712328767123, + "grad_norm": 1.979183554649353, + "kl": 0.167724609375, + "learning_rate": 9.041095890410958e-07, + "loss": 0.0002, + "reward": 1.28125, + "reward_std": 0.0883883461356163, + "rewards/accuracy_reward": 0.28125, + "rewards/format_reward": 1.0, + "step": 210 + }, + { + "clip_ratio": 0.0, + "completion_length": 512.84375, + "epoch": 0.28904109589041094, + "grad_norm": 1.7484036684036255, + "kl": 0.17041015625, + "learning_rate": 9.036529680365297e-07, + "loss": 0.0002, + "reward": 1.453125, + "reward_std": 0.15992168709635735, + "rewards/accuracy_reward": 0.453125, + "rewards/format_reward": 1.0, + "step": 211 + }, + { + "clip_ratio": 0.0, + "completion_length": 473.84375, + "epoch": 0.29041095890410956, + "grad_norm": 0.7065235376358032, + "kl": 0.157958984375, + "learning_rate": 9.031963470319635e-07, + "loss": 0.0002, + "reward": 1.1953125, + "reward_std": 0.09704047441482544, + "rewards/accuracy_reward": 0.1953125, + "rewards/format_reward": 1.0, + "step": 212 + }, + { + "clip_ratio": 0.0, + "completion_length": 529.21875, + "epoch": 0.29178082191780824, + "grad_norm": 2.361640453338623, + "kl": 0.18505859375, + "learning_rate": 9.027397260273972e-07, + "loss": 0.0002, + "reward": 1.6070312559604645, + "reward_std": 0.23584069684147835, + "rewards/accuracy_reward": 0.6070312410593033, + "rewards/format_reward": 1.0, + "step": 213 + }, + { + "clip_ratio": 0.0, + "completion_length": 644.40625, + "epoch": 0.29315068493150687, + "grad_norm": 2.628955125808716, + "kl": 0.17041015625, + "learning_rate": 9.02283105022831e-07, + "loss": 0.0002, + "reward": 1.42578125, + "reward_std": 0.028628919273614883, + "rewards/accuracy_reward": 0.42578125, + "rewards/format_reward": 1.0, + "step": 214 + }, + { + "clip_ratio": 0.0, + "completion_length": 460.65625, + "epoch": 0.2945205479452055, + "grad_norm": 5.236007213592529, + "kl": 0.173828125, + "learning_rate": 9.018264840182648e-07, + "loss": 0.0002, + "reward": 1.643750011920929, + "reward_std": 0.30251236632466316, + "rewards/accuracy_reward": 0.643750011920929, + "rewards/format_reward": 1.0, + "step": 215 + }, + { + "clip_ratio": 0.0, + "completion_length": 440.25, + "epoch": 0.2958904109589041, + "grad_norm": 3.1358797550201416, + "kl": 0.176513671875, + "learning_rate": 9.013698630136985e-07, + "loss": 0.0002, + "reward": 1.4453125, + "reward_std": 0.22854942083358765, + "rewards/accuracy_reward": 0.4453125, + "rewards/format_reward": 1.0, + "step": 216 + }, + { + "clip_ratio": 0.0, + "completion_length": 446.75, + "epoch": 0.29726027397260274, + "grad_norm": 1.5145561695098877, + "kl": 0.185791015625, + "learning_rate": 9.009132420091324e-07, + "loss": 0.0002, + "reward": 1.19140625, + "reward_std": 0.11935807205736637, + "rewards/accuracy_reward": 0.19140625, + "rewards/format_reward": 1.0, + "step": 217 + }, + { + "clip_ratio": 0.0, + "completion_length": 377.40625, + "epoch": 0.29863013698630136, + "grad_norm": 1.9805744886398315, + "kl": 0.181884765625, + "learning_rate": 9.004566210045662e-07, + "loss": 0.0002, + "reward": 1.28125, + "reward_std": 0.19149437546730042, + "rewards/accuracy_reward": 0.28125, + "rewards/format_reward": 1.0, + "step": 218 + }, + { + "clip_ratio": 0.0, + "completion_length": 429.84375, + "epoch": 0.3, + "grad_norm": 8.512849807739258, + "kl": 0.184814453125, + "learning_rate": 9e-07, + "loss": 0.0002, + "reward": 1.53125, + "reward_std": 0.22301281243562698, + "rewards/accuracy_reward": 0.53125, + "rewards/format_reward": 1.0, + "step": 219 + }, + { + "clip_ratio": 0.0, + "completion_length": 415.09375, + "epoch": 0.3013698630136986, + "grad_norm": 0.7778679132461548, + "kl": 0.225341796875, + "learning_rate": 8.995433789954338e-07, + "loss": 0.0002, + "reward": 1.34375, + "reward_std": 0.0578637570142746, + "rewards/accuracy_reward": 0.34375, + "rewards/format_reward": 1.0, + "step": 220 + }, + { + "clip_ratio": 0.0, + "completion_length": 480.75, + "epoch": 0.30273972602739724, + "grad_norm": 6.034634590148926, + "kl": 0.19775390625, + "learning_rate": 8.990867579908675e-07, + "loss": 0.0002, + "reward": 1.375, + "reward_std": 0.16279494389891624, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 1.0, + "step": 221 + }, + { + "clip_ratio": 0.0, + "completion_length": 501.40625, + "epoch": 0.3041095890410959, + "grad_norm": 0.8172019720077515, + "kl": 0.258544921875, + "learning_rate": 8.986301369863013e-07, + "loss": 0.0003, + "reward": 1.578125, + "reward_std": 0.0646936446428299, + "rewards/accuracy_reward": 0.578125, + "rewards/format_reward": 1.0, + "step": 222 + }, + { + "clip_ratio": 0.0, + "completion_length": 468.28125, + "epoch": 0.30547945205479454, + "grad_norm": 2.73528790473938, + "kl": 0.19140625, + "learning_rate": 8.981735159817351e-07, + "loss": 0.0002, + "reward": 1.3645833432674408, + "reward_std": 0.2686460316181183, + "rewards/accuracy_reward": 0.3645833283662796, + "rewards/format_reward": 1.0, + "step": 223 + }, + { + "clip_ratio": 0.0, + "completion_length": 475.09375, + "epoch": 0.30684931506849317, + "grad_norm": 0.015807894989848137, + "kl": 0.178466796875, + "learning_rate": 8.97716894977169e-07, + "loss": 0.0002, + "reward": 1.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 224 + }, + { + "clip_ratio": 0.0, + "completion_length": 673.03125, + "epoch": 0.3082191780821918, + "grad_norm": 1.2931923866271973, + "kl": 0.155029296875, + "learning_rate": 8.972602739726027e-07, + "loss": 0.0002, + "reward": 1.3515625, + "reward_std": 0.2547192648053169, + "rewards/accuracy_reward": 0.3828125, + "rewards/format_reward": 0.96875, + "step": 225 + }, + { + "clip_ratio": 0.0, + "completion_length": 581.40625, + "epoch": 0.3095890410958904, + "grad_norm": 2.7091639041900635, + "kl": 0.183349609375, + "learning_rate": 8.968036529680365e-07, + "loss": 0.0002, + "reward": 1.390625, + "reward_std": 0.2109457477927208, + "rewards/accuracy_reward": 0.390625, + "rewards/format_reward": 1.0, + "step": 226 + }, + { + "clip_ratio": 0.0, + "completion_length": 497.9375, + "epoch": 0.31095890410958904, + "grad_norm": 1.3506500720977783, + "kl": 0.1748046875, + "learning_rate": 8.963470319634703e-07, + "loss": 0.0002, + "reward": 1.09375, + "reward_std": 0.22201896458864212, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 0.96875, + "step": 227 + }, + { + "clip_ratio": 0.0, + "completion_length": 377.8125, + "epoch": 0.31232876712328766, + "grad_norm": 1.4593608379364014, + "kl": 0.183349609375, + "learning_rate": 8.958904109589041e-07, + "loss": 0.0002, + "reward": 1.34375, + "reward_std": 0.1293872892856598, + "rewards/accuracy_reward": 0.34375, + "rewards/format_reward": 1.0, + "step": 228 + }, + { + "clip_ratio": 0.0, + "completion_length": 566.3125, + "epoch": 0.3136986301369863, + "grad_norm": 0.8097538948059082, + "kl": 0.159423828125, + "learning_rate": 8.954337899543378e-07, + "loss": 0.0002, + "reward": 1.1875, + "reward_std": 0.06681530922651291, + "rewards/accuracy_reward": 0.1875, + "rewards/format_reward": 1.0, + "step": 229 + }, + { + "clip_ratio": 0.0, + "completion_length": 608.09375, + "epoch": 0.3150684931506849, + "grad_norm": 2.8190650939941406, + "kl": 0.18408203125, + "learning_rate": 8.949771689497716e-07, + "loss": 0.0002, + "reward": 1.40625, + "reward_std": 0.1552036553621292, + "rewards/accuracy_reward": 0.40625, + "rewards/format_reward": 1.0, + "step": 230 + }, + { + "clip_ratio": 0.0, + "completion_length": 519.1875, + "epoch": 0.31643835616438354, + "grad_norm": 1.5466049909591675, + "kl": 0.1650390625, + "learning_rate": 8.945205479452055e-07, + "loss": 0.0002, + "reward": 1.40625, + "reward_std": 0.27339156717061996, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 0.96875, + "step": 231 + }, + { + "clip_ratio": 0.0, + "completion_length": 601.09375, + "epoch": 0.3178082191780822, + "grad_norm": 8.43017864227295, + "kl": 0.15771484375, + "learning_rate": 8.940639269406392e-07, + "loss": 0.0002, + "reward": 1.203125, + "reward_std": 0.26621313393116, + "rewards/accuracy_reward": 0.234375, + "rewards/format_reward": 0.96875, + "step": 232 + }, + { + "clip_ratio": 0.0, + "completion_length": 458.96875, + "epoch": 0.31917808219178084, + "grad_norm": 3.6506810188293457, + "kl": 0.197509765625, + "learning_rate": 8.93607305936073e-07, + "loss": 0.0002, + "reward": 1.625, + "reward_std": 0.4045617878437042, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 233 + }, + { + "clip_ratio": 0.0, + "completion_length": 642.0, + "epoch": 0.32054794520547947, + "grad_norm": 2.9570024013519287, + "kl": 0.17041015625, + "learning_rate": 8.931506849315068e-07, + "loss": 0.0002, + "reward": 1.4375, + "reward_std": 0.3128525000065565, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.9375, + "step": 234 + }, + { + "clip_ratio": 0.0, + "completion_length": 569.78125, + "epoch": 0.3219178082191781, + "grad_norm": 1.4737908840179443, + "kl": 0.158203125, + "learning_rate": 8.926940639269406e-07, + "loss": 0.0002, + "reward": 1.765625, + "reward_std": 0.41709377616643906, + "rewards/accuracy_reward": 0.765625, + "rewards/format_reward": 1.0, + "step": 235 + }, + { + "clip_ratio": 0.0, + "completion_length": 656.59375, + "epoch": 0.3232876712328767, + "grad_norm": 3.560518264770508, + "kl": 0.168701171875, + "learning_rate": 8.922374429223744e-07, + "loss": 0.0002, + "reward": 1.6796875, + "reward_std": 0.12393621355295181, + "rewards/accuracy_reward": 0.6796875, + "rewards/format_reward": 1.0, + "step": 236 + }, + { + "clip_ratio": 0.0, + "completion_length": 568.59375, + "epoch": 0.32465753424657534, + "grad_norm": 1.0155640840530396, + "kl": 0.16259765625, + "learning_rate": 8.917808219178081e-07, + "loss": 0.0002, + "reward": 1.28125, + "reward_std": 0.07312605157494545, + "rewards/accuracy_reward": 0.28125, + "rewards/format_reward": 1.0, + "step": 237 + }, + { + "clip_ratio": 0.0, + "completion_length": 572.84375, + "epoch": 0.32602739726027397, + "grad_norm": 0.018812214955687523, + "kl": 0.177490234375, + "learning_rate": 8.91324200913242e-07, + "loss": 0.0002, + "reward": 1.25, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 1.0, + "step": 238 + }, + { + "clip_ratio": 0.0, + "completion_length": 714.375, + "epoch": 0.3273972602739726, + "grad_norm": 1.2518051862716675, + "kl": 0.1611328125, + "learning_rate": 8.908675799086758e-07, + "loss": 0.0002, + "reward": 1.8567708134651184, + "reward_std": 0.23625470884144306, + "rewards/accuracy_reward": 0.8880208134651184, + "rewards/format_reward": 0.96875, + "step": 239 + }, + { + "clip_ratio": 0.0, + "completion_length": 471.78125, + "epoch": 0.3287671232876712, + "grad_norm": 0.015509501099586487, + "kl": 0.189453125, + "learning_rate": 8.904109589041095e-07, + "loss": 0.0002, + "reward": 1.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 240 + }, + { + "clip_ratio": 0.0, + "completion_length": 660.5625, + "epoch": 0.33013698630136984, + "grad_norm": 2.026989459991455, + "kl": 0.18701171875, + "learning_rate": 8.899543378995433e-07, + "loss": 0.0002, + "reward": 1.6953125, + "reward_std": 0.229622982442379, + "rewards/accuracy_reward": 0.6953125, + "rewards/format_reward": 1.0, + "step": 241 + }, + { + "clip_ratio": 0.0, + "completion_length": 691.1875, + "epoch": 0.3315068493150685, + "grad_norm": 1.3158351182937622, + "kl": 0.180419921875, + "learning_rate": 8.894977168949771e-07, + "loss": 0.0002, + "reward": 1.65625, + "reward_std": 0.1462521031498909, + "rewards/accuracy_reward": 0.65625, + "rewards/format_reward": 1.0, + "step": 242 + }, + { + "clip_ratio": 0.0, + "completion_length": 721.46875, + "epoch": 0.33287671232876714, + "grad_norm": 1.1662763357162476, + "kl": 0.147216796875, + "learning_rate": 8.890410958904109e-07, + "loss": 0.0001, + "reward": 1.7299107313156128, + "reward_std": 0.14230500534176826, + "rewards/accuracy_reward": 0.7299107015132904, + "rewards/format_reward": 1.0, + "step": 243 + }, + { + "clip_ratio": 0.0, + "completion_length": 594.09375, + "epoch": 0.33424657534246577, + "grad_norm": 2.274336814880371, + "kl": 0.17529296875, + "learning_rate": 8.885844748858448e-07, + "loss": 0.0002, + "reward": 1.4114583432674408, + "reward_std": 0.17236988991498947, + "rewards/accuracy_reward": 0.4114583432674408, + "rewards/format_reward": 1.0, + "step": 244 + }, + { + "clip_ratio": 0.0, + "completion_length": 450.25, + "epoch": 0.3356164383561644, + "grad_norm": 1.401793360710144, + "kl": 0.201416015625, + "learning_rate": 8.881278538812785e-07, + "loss": 0.0002, + "reward": 1.328125, + "reward_std": 0.12255740165710449, + "rewards/accuracy_reward": 0.328125, + "rewards/format_reward": 1.0, + "step": 245 + }, + { + "clip_ratio": 0.0, + "completion_length": 584.71875, + "epoch": 0.336986301369863, + "grad_norm": 1.7275149822235107, + "kl": 0.1708984375, + "learning_rate": 8.876712328767123e-07, + "loss": 0.0002, + "reward": 1.4192708432674408, + "reward_std": 0.12374339066445827, + "rewards/accuracy_reward": 0.4192708432674408, + "rewards/format_reward": 1.0, + "step": 246 + }, + { + "clip_ratio": 0.0, + "completion_length": 548.875, + "epoch": 0.33835616438356164, + "grad_norm": 10.723580360412598, + "kl": 0.16357421875, + "learning_rate": 8.872146118721461e-07, + "loss": 0.0002, + "reward": 1.2890625, + "reward_std": 0.16597744077444077, + "rewards/accuracy_reward": 0.2890625, + "rewards/format_reward": 1.0, + "step": 247 + }, + { + "clip_ratio": 0.0, + "completion_length": 618.375, + "epoch": 0.33972602739726027, + "grad_norm": 5.1156816482543945, + "kl": 0.235107421875, + "learning_rate": 8.867579908675798e-07, + "loss": 0.0002, + "reward": 1.625, + "reward_std": 0.24456444010138512, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 248 + }, + { + "clip_ratio": 0.0, + "completion_length": 449.625, + "epoch": 0.3410958904109589, + "grad_norm": 2.348053216934204, + "kl": 0.185546875, + "learning_rate": 8.863013698630136e-07, + "loss": 0.0002, + "reward": 1.421875, + "reward_std": 0.29355230554938316, + "rewards/accuracy_reward": 0.421875, + "rewards/format_reward": 1.0, + "step": 249 + }, + { + "clip_ratio": 0.0, + "completion_length": 694.375, + "epoch": 0.3424657534246575, + "grad_norm": 0.8780984878540039, + "kl": 0.171630859375, + "learning_rate": 8.858447488584474e-07, + "loss": 0.0002, + "reward": 1.234375, + "reward_std": 0.1173202246427536, + "rewards/accuracy_reward": 0.265625, + "rewards/format_reward": 0.96875, + "step": 250 + }, + { + "clip_ratio": 0.0, + "completion_length": 533.1875, + "epoch": 0.34383561643835614, + "grad_norm": 3.7509207725524902, + "kl": 0.19140625, + "learning_rate": 8.853881278538813e-07, + "loss": 0.0002, + "reward": 1.4375, + "reward_std": 0.2734241336584091, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 251 + }, + { + "clip_ratio": 0.0, + "completion_length": 571.5, + "epoch": 0.3452054794520548, + "grad_norm": 1.7510371208190918, + "kl": 0.3330078125, + "learning_rate": 8.849315068493151e-07, + "loss": 0.0003, + "reward": 1.53125, + "reward_std": 0.26409146934747696, + "rewards/accuracy_reward": 0.53125, + "rewards/format_reward": 1.0, + "step": 252 + }, + { + "clip_ratio": 0.0, + "completion_length": 497.0625, + "epoch": 0.34657534246575344, + "grad_norm": 0.7126864790916443, + "kl": 0.1669921875, + "learning_rate": 8.844748858447488e-07, + "loss": 0.0002, + "reward": 1.2890625, + "reward_std": 0.3738361746072769, + "rewards/accuracy_reward": 0.3515625, + "rewards/format_reward": 0.9375, + "step": 253 + }, + { + "clip_ratio": 0.0, + "completion_length": 350.6875, + "epoch": 0.34794520547945207, + "grad_norm": 2.1832709312438965, + "kl": 0.1943359375, + "learning_rate": 8.840182648401826e-07, + "loss": 0.0002, + "reward": 1.3515625, + "reward_std": 0.1815449744462967, + "rewards/accuracy_reward": 0.3515625, + "rewards/format_reward": 1.0, + "step": 254 + }, + { + "clip_ratio": 0.0, + "completion_length": 349.0, + "epoch": 0.3493150684931507, + "grad_norm": 2.6141257286071777, + "kl": 0.217041015625, + "learning_rate": 8.835616438356164e-07, + "loss": 0.0002, + "reward": 1.203125, + "reward_std": 0.22097086533904076, + "rewards/accuracy_reward": 0.203125, + "rewards/format_reward": 1.0, + "step": 255 + }, + { + "clip_ratio": 0.0, + "completion_length": 601.78125, + "epoch": 0.3506849315068493, + "grad_norm": 1.7095686197280884, + "kl": 0.1640625, + "learning_rate": 8.831050228310501e-07, + "loss": 0.0002, + "reward": 1.3671875, + "reward_std": 0.21604011207818985, + "rewards/accuracy_reward": 0.3984375, + "rewards/format_reward": 0.96875, + "step": 256 + }, + { + "clip_ratio": 0.0, + "completion_length": 666.25, + "epoch": 0.35205479452054794, + "grad_norm": 1.052132487297058, + "kl": 0.1669921875, + "learning_rate": 8.826484018264839e-07, + "loss": 0.0002, + "reward": 1.609375, + "reward_std": 0.1315089538693428, + "rewards/accuracy_reward": 0.609375, + "rewards/format_reward": 1.0, + "step": 257 + }, + { + "clip_ratio": 0.0, + "completion_length": 630.0, + "epoch": 0.35342465753424657, + "grad_norm": 17.632997512817383, + "kl": 0.168212890625, + "learning_rate": 8.821917808219178e-07, + "loss": 0.0002, + "reward": 1.671875, + "reward_std": 0.38394393771886826, + "rewards/accuracy_reward": 0.703125, + "rewards/format_reward": 0.96875, + "step": 258 + }, + { + "clip_ratio": 0.0, + "completion_length": 533.46875, + "epoch": 0.3547945205479452, + "grad_norm": 1.7030237913131714, + "kl": 0.17041015625, + "learning_rate": 8.817351598173516e-07, + "loss": 0.0002, + "reward": 1.53125, + "reward_std": 0.1552036553621292, + "rewards/accuracy_reward": 0.53125, + "rewards/format_reward": 1.0, + "step": 259 + }, + { + "clip_ratio": 0.0, + "completion_length": 385.09375, + "epoch": 0.3561643835616438, + "grad_norm": 3.2231457233428955, + "kl": 0.19873046875, + "learning_rate": 8.812785388127854e-07, + "loss": 0.0002, + "reward": 1.15625, + "reward_std": 0.1293872892856598, + "rewards/accuracy_reward": 0.15625, + "rewards/format_reward": 1.0, + "step": 260 + }, + { + "clip_ratio": 0.0, + "completion_length": 697.28125, + "epoch": 0.35753424657534244, + "grad_norm": 1.5386288166046143, + "kl": 0.18798828125, + "learning_rate": 8.808219178082191e-07, + "loss": 0.0002, + "reward": 1.5078125, + "reward_std": 0.1916224267333746, + "rewards/accuracy_reward": 0.5390625, + "rewards/format_reward": 0.96875, + "step": 261 + }, + { + "clip_ratio": 0.0, + "completion_length": 533.25, + "epoch": 0.3589041095890411, + "grad_norm": 17.967370986938477, + "kl": 0.21240234375, + "learning_rate": 8.803652968036529e-07, + "loss": 0.0002, + "reward": 1.6458333432674408, + "reward_std": 0.26592448726296425, + "rewards/accuracy_reward": 0.6458333432674408, + "rewards/format_reward": 1.0, + "step": 262 + }, + { + "clip_ratio": 0.0, + "completion_length": 553.34375, + "epoch": 0.36027397260273974, + "grad_norm": 1.2522979974746704, + "kl": 0.169921875, + "learning_rate": 8.799086757990867e-07, + "loss": 0.0002, + "reward": 1.4791666269302368, + "reward_std": 0.12335556373000145, + "rewards/accuracy_reward": 0.4791666567325592, + "rewards/format_reward": 1.0, + "step": 263 + }, + { + "clip_ratio": 0.0, + "completion_length": 380.53125, + "epoch": 0.36164383561643837, + "grad_norm": 1.5335299968719482, + "kl": 0.187744140625, + "learning_rate": 8.794520547945205e-07, + "loss": 0.0002, + "reward": 1.28125, + "reward_std": 0.1962025985121727, + "rewards/accuracy_reward": 0.28125, + "rewards/format_reward": 1.0, + "step": 264 + }, + { + "clip_ratio": 0.0, + "completion_length": 556.9375, + "epoch": 0.363013698630137, + "grad_norm": 1.35916006565094, + "kl": 0.24365234375, + "learning_rate": 8.789954337899543e-07, + "loss": 0.0002, + "reward": 1.390625, + "reward_std": 0.13258251920342445, + "rewards/accuracy_reward": 0.421875, + "rewards/format_reward": 0.96875, + "step": 265 + }, + { + "clip_ratio": 0.0, + "completion_length": 546.75, + "epoch": 0.3643835616438356, + "grad_norm": 0.8804590702056885, + "kl": 0.2041015625, + "learning_rate": 8.785388127853881e-07, + "loss": 0.0002, + "reward": 1.3515625, + "reward_std": 0.09704046696424484, + "rewards/accuracy_reward": 0.3515625, + "rewards/format_reward": 1.0, + "step": 266 + }, + { + "clip_ratio": 0.0, + "completion_length": 594.875, + "epoch": 0.36575342465753424, + "grad_norm": 1.8236092329025269, + "kl": 0.188720703125, + "learning_rate": 8.780821917808219e-07, + "loss": 0.0002, + "reward": 2.078125, + "reward_std": 0.0776018276810646, + "rewards/accuracy_reward": 1.078125, + "rewards/format_reward": 1.0, + "step": 267 + }, + { + "clip_ratio": 0.0, + "completion_length": 386.59375, + "epoch": 0.36712328767123287, + "grad_norm": 11.162297248840332, + "kl": 0.190185546875, + "learning_rate": 8.776255707762557e-07, + "loss": 0.0002, + "reward": 1.5859375, + "reward_std": 0.23224157467484474, + "rewards/accuracy_reward": 0.5859375, + "rewards/format_reward": 1.0, + "step": 268 + }, + { + "clip_ratio": 0.0, + "completion_length": 578.5625, + "epoch": 0.3684931506849315, + "grad_norm": 2.6213197708129883, + "kl": 0.214111328125, + "learning_rate": 8.771689497716894e-07, + "loss": 0.0002, + "reward": 1.58984375, + "reward_std": 0.33964164927601814, + "rewards/accuracy_reward": 0.58984375, + "rewards/format_reward": 1.0, + "step": 269 + }, + { + "clip_ratio": 0.0, + "completion_length": 543.5, + "epoch": 0.3698630136986301, + "grad_norm": 2.206378221511841, + "kl": 0.1650390625, + "learning_rate": 8.767123287671232e-07, + "loss": 0.0002, + "reward": 1.6437499523162842, + "reward_std": 0.4507673643529415, + "rewards/accuracy_reward": 0.6749999970197678, + "rewards/format_reward": 0.96875, + "step": 270 + }, + { + "clip_ratio": 0.0, + "completion_length": 611.84375, + "epoch": 0.37123287671232874, + "grad_norm": 3.658581018447876, + "kl": 0.167236328125, + "learning_rate": 8.762557077625571e-07, + "loss": 0.0002, + "reward": 1.2125000059604645, + "reward_std": 0.09996108617633581, + "rewards/accuracy_reward": 0.2124999761581421, + "rewards/format_reward": 1.0, + "step": 271 + }, + { + "clip_ratio": 0.0, + "completion_length": 384.375, + "epoch": 0.3726027397260274, + "grad_norm": 2.4175188541412354, + "kl": 0.258056640625, + "learning_rate": 8.757990867579908e-07, + "loss": 0.0003, + "reward": 1.515625, + "reward_std": 0.30721208080649376, + "rewards/accuracy_reward": 0.515625, + "rewards/format_reward": 1.0, + "step": 272 + }, + { + "clip_ratio": 0.0, + "completion_length": 484.53125, + "epoch": 0.37397260273972605, + "grad_norm": 1.4815560579299927, + "kl": 0.1787109375, + "learning_rate": 8.753424657534246e-07, + "loss": 0.0002, + "reward": 1.359375, + "reward_std": 0.1804211586713791, + "rewards/accuracy_reward": 0.359375, + "rewards/format_reward": 1.0, + "step": 273 + }, + { + "clip_ratio": 0.0, + "completion_length": 636.0, + "epoch": 0.37534246575342467, + "grad_norm": 2.208836078643799, + "kl": 0.159423828125, + "learning_rate": 8.748858447488584e-07, + "loss": 0.0002, + "reward": 1.5859375, + "reward_std": 0.19568835757672787, + "rewards/accuracy_reward": 0.5859375, + "rewards/format_reward": 1.0, + "step": 274 + }, + { + "clip_ratio": 0.0, + "completion_length": 570.1875, + "epoch": 0.3767123287671233, + "grad_norm": 2.3311572074890137, + "kl": 0.16748046875, + "learning_rate": 8.744292237442922e-07, + "loss": 0.0002, + "reward": 1.609375, + "reward_std": 0.269338458776474, + "rewards/accuracy_reward": 0.609375, + "rewards/format_reward": 1.0, + "step": 275 + }, + { + "clip_ratio": 0.0, + "completion_length": 502.78125, + "epoch": 0.3780821917808219, + "grad_norm": 2.8278110027313232, + "kl": 0.22265625, + "learning_rate": 8.73972602739726e-07, + "loss": 0.0002, + "reward": 1.296875, + "reward_std": 0.1583191677927971, + "rewards/accuracy_reward": 0.296875, + "rewards/format_reward": 1.0, + "step": 276 + }, + { + "clip_ratio": 0.0, + "completion_length": 708.03125, + "epoch": 0.37945205479452054, + "grad_norm": 1.0638788938522339, + "kl": 0.224853515625, + "learning_rate": 8.735159817351597e-07, + "loss": 0.0002, + "reward": 1.3984375, + "reward_std": 0.13941731117665768, + "rewards/accuracy_reward": 0.4296875, + "rewards/format_reward": 0.96875, + "step": 277 + }, + { + "clip_ratio": 0.0, + "completion_length": 582.6875, + "epoch": 0.38082191780821917, + "grad_norm": 0.551222026348114, + "kl": 0.21728515625, + "learning_rate": 8.730593607305936e-07, + "loss": 0.0002, + "reward": 1.078125, + "reward_std": 0.13258251547813416, + "rewards/accuracy_reward": 0.109375, + "rewards/format_reward": 0.96875, + "step": 278 + }, + { + "clip_ratio": 0.0, + "completion_length": 418.125, + "epoch": 0.3821917808219178, + "grad_norm": 2.8419177532196045, + "kl": 0.177978515625, + "learning_rate": 8.726027397260274e-07, + "loss": 0.0002, + "reward": 1.515625, + "reward_std": 0.3098084479570389, + "rewards/accuracy_reward": 0.515625, + "rewards/format_reward": 1.0, + "step": 279 + }, + { + "clip_ratio": 0.0, + "completion_length": 613.46875, + "epoch": 0.3835616438356164, + "grad_norm": 2.1832010746002197, + "kl": 0.171875, + "learning_rate": 8.721461187214611e-07, + "loss": 0.0002, + "reward": 1.734375, + "reward_std": 0.22673699632287025, + "rewards/accuracy_reward": 0.734375, + "rewards/format_reward": 1.0, + "step": 280 + }, + { + "clip_ratio": 0.0, + "completion_length": 407.8125, + "epoch": 0.38493150684931504, + "grad_norm": 1.6464120149612427, + "kl": 0.18115234375, + "learning_rate": 8.716894977168949e-07, + "loss": 0.0002, + "reward": 1.203125, + "reward_std": 0.22097086161375046, + "rewards/accuracy_reward": 0.203125, + "rewards/format_reward": 1.0, + "step": 281 + }, + { + "clip_ratio": 0.0, + "completion_length": 549.125, + "epoch": 0.3863013698630137, + "grad_norm": 1.0540616512298584, + "kl": 0.1845703125, + "learning_rate": 8.712328767123287e-07, + "loss": 0.0002, + "reward": 1.546875, + "reward_std": 0.05444390885531902, + "rewards/accuracy_reward": 0.546875, + "rewards/format_reward": 1.0, + "step": 282 + }, + { + "clip_ratio": 0.0, + "completion_length": 583.4375, + "epoch": 0.38767123287671235, + "grad_norm": 0.87820965051651, + "kl": 0.15380859375, + "learning_rate": 8.707762557077625e-07, + "loss": 0.0002, + "reward": 1.3020833134651184, + "reward_std": 0.029462769627571106, + "rewards/accuracy_reward": 0.3020833283662796, + "rewards/format_reward": 1.0, + "step": 283 + }, + { + "clip_ratio": 0.0, + "completion_length": 651.53125, + "epoch": 0.38904109589041097, + "grad_norm": 3.814786434173584, + "kl": 0.184326171875, + "learning_rate": 8.703196347031964e-07, + "loss": 0.0002, + "reward": 1.5679687559604645, + "reward_std": 0.125592902302742, + "rewards/accuracy_reward": 0.5679687559604645, + "rewards/format_reward": 1.0, + "step": 284 + }, + { + "clip_ratio": 0.0, + "completion_length": 660.59375, + "epoch": 0.3904109589041096, + "grad_norm": 8.2400484085083, + "kl": 0.18896484375, + "learning_rate": 8.698630136986301e-07, + "loss": 0.0002, + "reward": 1.7677083015441895, + "reward_std": 0.2733229286968708, + "rewards/accuracy_reward": 0.7989583611488342, + "rewards/format_reward": 0.96875, + "step": 285 + }, + { + "clip_ratio": 0.0, + "completion_length": 505.28125, + "epoch": 0.3917808219178082, + "grad_norm": 1.994737982749939, + "kl": 0.18994140625, + "learning_rate": 8.694063926940639e-07, + "loss": 0.0002, + "reward": 1.4140625, + "reward_std": 0.10613362491130829, + "rewards/accuracy_reward": 0.4140625, + "rewards/format_reward": 1.0, + "step": 286 + }, + { + "clip_ratio": 0.0, + "completion_length": 511.8125, + "epoch": 0.39315068493150684, + "grad_norm": 5.628194808959961, + "kl": 0.166748046875, + "learning_rate": 8.689497716894977e-07, + "loss": 0.0002, + "reward": 1.7578125, + "reward_std": 0.21267853677272797, + "rewards/accuracy_reward": 0.7578125, + "rewards/format_reward": 1.0, + "step": 287 + }, + { + "clip_ratio": 0.0, + "completion_length": 445.21875, + "epoch": 0.39452054794520547, + "grad_norm": 2.055222749710083, + "kl": 0.202392578125, + "learning_rate": 8.684931506849314e-07, + "loss": 0.0002, + "reward": 1.7187499701976776, + "reward_std": 0.23613503947854042, + "rewards/accuracy_reward": 0.71875, + "rewards/format_reward": 1.0, + "step": 288 + }, + { + "clip_ratio": 0.0, + "completion_length": 406.75, + "epoch": 0.3958904109589041, + "grad_norm": 0.7133424282073975, + "kl": 0.19189453125, + "learning_rate": 8.680365296803652e-07, + "loss": 0.0002, + "reward": 1.5703125, + "reward_std": 0.05725783854722977, + "rewards/accuracy_reward": 0.5703125, + "rewards/format_reward": 1.0, + "step": 289 + }, + { + "clip_ratio": 0.0, + "completion_length": 573.25, + "epoch": 0.3972602739726027, + "grad_norm": 1.4115186929702759, + "kl": 0.185546875, + "learning_rate": 8.67579908675799e-07, + "loss": 0.0002, + "reward": 1.96875, + "reward_std": 0.23833239078521729, + "rewards/accuracy_reward": 0.96875, + "rewards/format_reward": 1.0, + "step": 290 + }, + { + "clip_ratio": 0.0, + "completion_length": 531.625, + "epoch": 0.39863013698630134, + "grad_norm": 1.865753173828125, + "kl": 0.2314453125, + "learning_rate": 8.671232876712329e-07, + "loss": 0.0002, + "reward": 1.5546875, + "reward_std": 0.18630647659301758, + "rewards/accuracy_reward": 0.5546875, + "rewards/format_reward": 1.0, + "step": 291 + }, + { + "clip_ratio": 0.0, + "completion_length": 523.46875, + "epoch": 0.4, + "grad_norm": 1.9343034029006958, + "kl": 0.18505859375, + "learning_rate": 8.666666666666667e-07, + "loss": 0.0002, + "reward": 1.640625, + "reward_std": 0.31587694957852364, + "rewards/accuracy_reward": 0.640625, + "rewards/format_reward": 1.0, + "step": 292 + }, + { + "clip_ratio": 0.0, + "completion_length": 515.9375, + "epoch": 0.40136986301369865, + "grad_norm": 1.4373409748077393, + "kl": 0.17578125, + "learning_rate": 8.662100456621004e-07, + "loss": 0.0002, + "reward": 1.6388888359069824, + "reward_std": 0.1706457920372486, + "rewards/accuracy_reward": 0.6388888955116272, + "rewards/format_reward": 1.0, + "step": 293 + }, + { + "clip_ratio": 0.0, + "completion_length": 509.96875, + "epoch": 0.40273972602739727, + "grad_norm": 1.1212681531906128, + "kl": 0.19921875, + "learning_rate": 8.657534246575342e-07, + "loss": 0.0002, + "reward": 1.203125, + "reward_std": 0.0289318785071373, + "rewards/accuracy_reward": 0.203125, + "rewards/format_reward": 1.0, + "step": 294 + }, + { + "clip_ratio": 0.0, + "completion_length": 536.65625, + "epoch": 0.4041095890410959, + "grad_norm": 1.6588208675384521, + "kl": 0.193359375, + "learning_rate": 8.65296803652968e-07, + "loss": 0.0002, + "reward": 1.3971354067325592, + "reward_std": 0.03875125199556351, + "rewards/accuracy_reward": 0.3971354067325592, + "rewards/format_reward": 1.0, + "step": 295 + }, + { + "clip_ratio": 0.0, + "completion_length": 582.1875, + "epoch": 0.4054794520547945, + "grad_norm": 1.8998637199401855, + "kl": 0.22900390625, + "learning_rate": 8.648401826484017e-07, + "loss": 0.0002, + "reward": 1.96875, + "reward_std": 0.2519447058439255, + "rewards/accuracy_reward": 0.96875, + "rewards/format_reward": 1.0, + "step": 296 + }, + { + "clip_ratio": 0.0, + "completion_length": 514.46875, + "epoch": 0.40684931506849314, + "grad_norm": 1.1315598487854004, + "kl": 0.178955078125, + "learning_rate": 8.643835616438355e-07, + "loss": 0.0002, + "reward": 1.3828125, + "reward_std": 0.09021057933568954, + "rewards/accuracy_reward": 0.3828125, + "rewards/format_reward": 1.0, + "step": 297 + }, + { + "clip_ratio": 0.0, + "completion_length": 481.6875, + "epoch": 0.40821917808219177, + "grad_norm": 1.4266315698623657, + "kl": 0.209716796875, + "learning_rate": 8.639269406392694e-07, + "loss": 0.0002, + "reward": 1.3203125, + "reward_std": 0.022097086533904076, + "rewards/accuracy_reward": 0.3203125, + "rewards/format_reward": 1.0, + "step": 298 + }, + { + "clip_ratio": 0.0, + "completion_length": 432.6875, + "epoch": 0.4095890410958904, + "grad_norm": 0.9578768014907837, + "kl": 0.186279296875, + "learning_rate": 8.634703196347032e-07, + "loss": 0.0002, + "reward": 1.203125, + "reward_std": 0.0646936446428299, + "rewards/accuracy_reward": 0.203125, + "rewards/format_reward": 1.0, + "step": 299 + }, + { + "clip_ratio": 0.0, + "completion_length": 602.875, + "epoch": 0.410958904109589, + "grad_norm": 1.1748281717300415, + "kl": 0.247802734375, + "learning_rate": 8.63013698630137e-07, + "loss": 0.0002, + "reward": 1.3671875, + "reward_std": 0.08679073117673397, + "rewards/accuracy_reward": 0.3671875, + "rewards/format_reward": 1.0, + "step": 300 + }, + { + "clip_ratio": 0.0, + "completion_length": 459.375, + "epoch": 0.4123287671232877, + "grad_norm": 1.2540714740753174, + "kl": 0.2158203125, + "learning_rate": 8.625570776255707e-07, + "loss": 0.0002, + "reward": 1.9296875, + "reward_std": 0.1649293377995491, + "rewards/accuracy_reward": 0.9296875, + "rewards/format_reward": 1.0, + "step": 301 + }, + { + "clip_ratio": 0.0, + "completion_length": 583.5625, + "epoch": 0.4136986301369863, + "grad_norm": 1.9239349365234375, + "kl": 0.167724609375, + "learning_rate": 8.621004566210045e-07, + "loss": 0.0002, + "reward": 1.5446428060531616, + "reward_std": 0.05746740661561489, + "rewards/accuracy_reward": 0.5446428954601288, + "rewards/format_reward": 1.0, + "step": 302 + }, + { + "clip_ratio": 0.0, + "completion_length": 524.46875, + "epoch": 0.41506849315068495, + "grad_norm": 2.8443050384521484, + "kl": 0.185546875, + "learning_rate": 8.616438356164383e-07, + "loss": 0.0002, + "reward": 1.5546875, + "reward_std": 0.15467960387468338, + "rewards/accuracy_reward": 0.5546875, + "rewards/format_reward": 1.0, + "step": 303 + }, + { + "clip_ratio": 0.0, + "completion_length": 369.8125, + "epoch": 0.41643835616438357, + "grad_norm": 2.6606931686401367, + "kl": 0.226318359375, + "learning_rate": 8.611872146118721e-07, + "loss": 0.0002, + "reward": 1.71875, + "reward_std": 0.38481390848755836, + "rewards/accuracy_reward": 0.71875, + "rewards/format_reward": 1.0, + "step": 304 + }, + { + "clip_ratio": 0.0, + "completion_length": 574.625, + "epoch": 0.4178082191780822, + "grad_norm": 1.6216801404953003, + "kl": 0.1943359375, + "learning_rate": 8.607305936073059e-07, + "loss": 0.0002, + "reward": 1.21875, + "reward_std": 0.2041158601641655, + "rewards/accuracy_reward": 0.21875, + "rewards/format_reward": 1.0, + "step": 305 + }, + { + "clip_ratio": 0.0, + "completion_length": 566.3125, + "epoch": 0.4191780821917808, + "grad_norm": 2.814319372177124, + "kl": 0.189453125, + "learning_rate": 8.602739726027397e-07, + "loss": 0.0002, + "reward": 1.515625, + "reward_std": 0.36036762222647667, + "rewards/accuracy_reward": 0.515625, + "rewards/format_reward": 1.0, + "step": 306 + }, + { + "clip_ratio": 0.0, + "completion_length": 473.53125, + "epoch": 0.42054794520547945, + "grad_norm": 2.310194730758667, + "kl": 0.187255859375, + "learning_rate": 8.598173515981735e-07, + "loss": 0.0002, + "reward": 1.578125, + "reward_std": 0.30617379024624825, + "rewards/accuracy_reward": 0.578125, + "rewards/format_reward": 1.0, + "step": 307 + }, + { + "clip_ratio": 0.0, + "completion_length": 659.25, + "epoch": 0.42191780821917807, + "grad_norm": 0.9056942462921143, + "kl": 0.190673828125, + "learning_rate": 8.593607305936073e-07, + "loss": 0.0002, + "reward": 1.359375, + "reward_std": 0.19408094882965088, + "rewards/accuracy_reward": 0.359375, + "rewards/format_reward": 1.0, + "step": 308 + }, + { + "clip_ratio": 0.0, + "completion_length": 389.1875, + "epoch": 0.4232876712328767, + "grad_norm": 0.03478769585490227, + "kl": 0.239501953125, + "learning_rate": 8.58904109589041e-07, + "loss": 0.0002, + "reward": 1.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 309 + }, + { + "clip_ratio": 0.0, + "completion_length": 453.59375, + "epoch": 0.4246575342465753, + "grad_norm": 7.321046829223633, + "kl": 0.204345703125, + "learning_rate": 8.584474885844748e-07, + "loss": 0.0002, + "reward": 1.3046875, + "reward_std": 0.30941806733608246, + "rewards/accuracy_reward": 0.3359375, + "rewards/format_reward": 0.96875, + "step": 310 + }, + { + "clip_ratio": 0.0, + "completion_length": 475.28125, + "epoch": 0.426027397260274, + "grad_norm": 3.1986887454986572, + "kl": 0.221435546875, + "learning_rate": 8.579908675799087e-07, + "loss": 0.0002, + "reward": 1.421875, + "reward_std": 0.4004939943552017, + "rewards/accuracy_reward": 0.453125, + "rewards/format_reward": 0.96875, + "step": 311 + }, + { + "clip_ratio": 0.0, + "completion_length": 576.9375, + "epoch": 0.4273972602739726, + "grad_norm": 1.7835962772369385, + "kl": 0.18798828125, + "learning_rate": 8.575342465753424e-07, + "loss": 0.0002, + "reward": 2.171875, + "reward_std": 0.23280548676848412, + "rewards/accuracy_reward": 1.171875, + "rewards/format_reward": 1.0, + "step": 312 + }, + { + "clip_ratio": 0.0, + "completion_length": 506.5625, + "epoch": 0.42876712328767125, + "grad_norm": 2.5982441902160645, + "kl": 0.175537109375, + "learning_rate": 8.570776255707762e-07, + "loss": 0.0002, + "reward": 1.46875, + "reward_std": 0.0883883461356163, + "rewards/accuracy_reward": 0.46875, + "rewards/format_reward": 1.0, + "step": 313 + }, + { + "clip_ratio": 0.0, + "completion_length": 481.71875, + "epoch": 0.4301369863013699, + "grad_norm": 0.013101520948112011, + "kl": 0.212890625, + "learning_rate": 8.5662100456621e-07, + "loss": 0.0002, + "reward": 1.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 314 + }, + { + "clip_ratio": 0.0, + "completion_length": 465.71875, + "epoch": 0.4315068493150685, + "grad_norm": 5.282835960388184, + "kl": 0.212890625, + "learning_rate": 8.561643835616438e-07, + "loss": 0.0002, + "reward": 1.1796875, + "reward_std": 0.12073516845703125, + "rewards/accuracy_reward": 0.1796875, + "rewards/format_reward": 1.0, + "step": 315 + }, + { + "clip_ratio": 0.0, + "completion_length": 538.96875, + "epoch": 0.4328767123287671, + "grad_norm": 2.862443447113037, + "kl": 0.21142578125, + "learning_rate": 8.557077625570776e-07, + "loss": 0.0002, + "reward": 2.234375, + "reward_std": 0.44417304918169975, + "rewards/accuracy_reward": 1.234375, + "rewards/format_reward": 1.0, + "step": 316 + }, + { + "clip_ratio": 0.0, + "completion_length": 707.71875, + "epoch": 0.43424657534246575, + "grad_norm": 1.707274317741394, + "kl": 0.19287109375, + "learning_rate": 8.552511415525113e-07, + "loss": 0.0002, + "reward": 1.7218749523162842, + "reward_std": 0.0844996627420187, + "rewards/accuracy_reward": 0.721875011920929, + "rewards/format_reward": 1.0, + "step": 317 + }, + { + "clip_ratio": 0.0, + "completion_length": 455.0, + "epoch": 0.43561643835616437, + "grad_norm": 2.739851713180542, + "kl": 0.216796875, + "learning_rate": 8.547945205479452e-07, + "loss": 0.0002, + "reward": 1.328125, + "reward_std": 0.29355230554938316, + "rewards/accuracy_reward": 0.328125, + "rewards/format_reward": 1.0, + "step": 318 + }, + { + "clip_ratio": 0.0, + "completion_length": 501.71875, + "epoch": 0.436986301369863, + "grad_norm": 1.7768224477767944, + "kl": 0.217529296875, + "learning_rate": 8.54337899543379e-07, + "loss": 0.0002, + "reward": 1.25, + "reward_std": 0.3535533770918846, + "rewards/accuracy_reward": 0.28125, + "rewards/format_reward": 0.96875, + "step": 319 + }, + { + "clip_ratio": 0.0, + "completion_length": 387.8125, + "epoch": 0.4383561643835616, + "grad_norm": 5.192182540893555, + "kl": 0.357421875, + "learning_rate": 8.538812785388127e-07, + "loss": 0.0004, + "reward": 1.3697916865348816, + "reward_std": 0.1467284932732582, + "rewards/accuracy_reward": 0.3697916567325592, + "rewards/format_reward": 1.0, + "step": 320 + }, + { + "clip_ratio": 0.0, + "completion_length": 466.8125, + "epoch": 0.4397260273972603, + "grad_norm": 1.898450493812561, + "kl": 0.24365234375, + "learning_rate": 8.534246575342465e-07, + "loss": 0.0002, + "reward": 1.2265625, + "reward_std": 0.11048543266952038, + "rewards/accuracy_reward": 0.2265625, + "rewards/format_reward": 1.0, + "step": 321 + }, + { + "clip_ratio": 0.0, + "completion_length": 412.375, + "epoch": 0.4410958904109589, + "grad_norm": 1.9675666093826294, + "kl": 0.231689453125, + "learning_rate": 8.529680365296803e-07, + "loss": 0.0002, + "reward": 1.375, + "reward_std": 0.1825428232550621, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 1.0, + "step": 322 + }, + { + "clip_ratio": 0.0, + "completion_length": 380.5, + "epoch": 0.44246575342465755, + "grad_norm": 1.3606637716293335, + "kl": 0.257568359375, + "learning_rate": 8.52511415525114e-07, + "loss": 0.0003, + "reward": 1.25, + "reward_std": 0.0704294964671135, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 1.0, + "step": 323 + }, + { + "clip_ratio": 0.0, + "completion_length": 673.0, + "epoch": 0.4438356164383562, + "grad_norm": 1.0777534246444702, + "kl": 0.18701171875, + "learning_rate": 8.52054794520548e-07, + "loss": 0.0002, + "reward": 1.6614583134651184, + "reward_std": 0.0961906760931015, + "rewards/accuracy_reward": 0.6614583432674408, + "rewards/format_reward": 1.0, + "step": 324 + }, + { + "clip_ratio": 0.0, + "completion_length": 640.09375, + "epoch": 0.4452054794520548, + "grad_norm": 1.1463043689727783, + "kl": 0.205078125, + "learning_rate": 8.515981735159817e-07, + "loss": 0.0002, + "reward": 1.390625, + "reward_std": 0.12255740165710449, + "rewards/accuracy_reward": 0.390625, + "rewards/format_reward": 1.0, + "step": 325 + }, + { + "clip_ratio": 0.0, + "completion_length": 483.25, + "epoch": 0.4465753424657534, + "grad_norm": 3.938135862350464, + "kl": 0.256591796875, + "learning_rate": 8.511415525114155e-07, + "loss": 0.0003, + "reward": 1.60546875, + "reward_std": 0.2536969259381294, + "rewards/accuracy_reward": 0.60546875, + "rewards/format_reward": 1.0, + "step": 326 + }, + { + "clip_ratio": 0.0, + "completion_length": 707.46875, + "epoch": 0.44794520547945205, + "grad_norm": 2.0289745330810547, + "kl": 0.170166015625, + "learning_rate": 8.506849315068493e-07, + "loss": 0.0002, + "reward": 1.75, + "reward_std": 0.16622394509613514, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 327 + }, + { + "clip_ratio": 0.0, + "completion_length": 496.15625, + "epoch": 0.44931506849315067, + "grad_norm": 1.461342215538025, + "kl": 0.2412109375, + "learning_rate": 8.50228310502283e-07, + "loss": 0.0002, + "reward": 1.390625, + "reward_std": 0.1446593925356865, + "rewards/accuracy_reward": 0.390625, + "rewards/format_reward": 1.0, + "step": 328 + }, + { + "clip_ratio": 0.0, + "completion_length": 693.28125, + "epoch": 0.4506849315068493, + "grad_norm": 1.320560336112976, + "kl": 0.18310546875, + "learning_rate": 8.497716894977168e-07, + "loss": 0.0002, + "reward": 1.703125, + "reward_std": 0.3912949990481138, + "rewards/accuracy_reward": 0.734375, + "rewards/format_reward": 0.96875, + "step": 329 + }, + { + "clip_ratio": 0.0, + "completion_length": 649.65625, + "epoch": 0.4520547945205479, + "grad_norm": 2.9369583129882812, + "kl": 0.19384765625, + "learning_rate": 8.493150684931506e-07, + "loss": 0.0002, + "reward": 1.80859375, + "reward_std": 0.1918780878186226, + "rewards/accuracy_reward": 0.80859375, + "rewards/format_reward": 1.0, + "step": 330 + }, + { + "clip_ratio": 0.0, + "completion_length": 461.03125, + "epoch": 0.4534246575342466, + "grad_norm": 2.696607828140259, + "kl": 0.252685546875, + "learning_rate": 8.488584474885845e-07, + "loss": 0.0003, + "reward": 1.45703125, + "reward_std": 0.16619354858994484, + "rewards/accuracy_reward": 0.45703125, + "rewards/format_reward": 1.0, + "step": 331 + }, + { + "clip_ratio": 0.0, + "completion_length": 378.0625, + "epoch": 0.4547945205479452, + "grad_norm": 1.493513822555542, + "kl": 0.244140625, + "learning_rate": 8.484018264840183e-07, + "loss": 0.0002, + "reward": 1.234375, + "reward_std": 0.1530819907784462, + "rewards/accuracy_reward": 0.234375, + "rewards/format_reward": 1.0, + "step": 332 + }, + { + "clip_ratio": 0.0, + "completion_length": 503.59375, + "epoch": 0.45616438356164385, + "grad_norm": 2.641517162322998, + "kl": 0.220703125, + "learning_rate": 8.47945205479452e-07, + "loss": 0.0002, + "reward": 1.7890625, + "reward_std": 0.2441160511225462, + "rewards/accuracy_reward": 0.7890625, + "rewards/format_reward": 1.0, + "step": 333 + }, + { + "clip_ratio": 0.0, + "completion_length": 580.84375, + "epoch": 0.4575342465753425, + "grad_norm": 1.104002594947815, + "kl": 0.206298828125, + "learning_rate": 8.474885844748858e-07, + "loss": 0.0002, + "reward": 1.4296875, + "reward_std": 0.09704047441482544, + "rewards/accuracy_reward": 0.4296875, + "rewards/format_reward": 1.0, + "step": 334 + }, + { + "clip_ratio": 0.0, + "completion_length": 480.6875, + "epoch": 0.4589041095890411, + "grad_norm": 2.944154977798462, + "kl": 0.2412109375, + "learning_rate": 8.470319634703196e-07, + "loss": 0.0002, + "reward": 1.3671875, + "reward_std": 0.1678851991891861, + "rewards/accuracy_reward": 0.3671875, + "rewards/format_reward": 1.0, + "step": 335 + }, + { + "clip_ratio": 0.0, + "completion_length": 633.9375, + "epoch": 0.4602739726027397, + "grad_norm": 1.3954813480377197, + "kl": 1.03955078125, + "learning_rate": 8.465753424657533e-07, + "loss": 0.001, + "reward": 1.1614583432674408, + "reward_std": 0.026702914386987686, + "rewards/accuracy_reward": 0.1614583283662796, + "rewards/format_reward": 1.0, + "step": 336 + }, + { + "clip_ratio": 0.0, + "completion_length": 606.25, + "epoch": 0.46164383561643835, + "grad_norm": 1.2163310050964355, + "kl": 0.23876953125, + "learning_rate": 8.461187214611871e-07, + "loss": 0.0002, + "reward": 1.75, + "reward_std": 0.1733490191400051, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 337 + }, + { + "clip_ratio": 0.0, + "completion_length": 477.625, + "epoch": 0.46301369863013697, + "grad_norm": 1.4970253705978394, + "kl": 0.2275390625, + "learning_rate": 8.45662100456621e-07, + "loss": 0.0002, + "reward": 1.34375, + "reward_std": 0.1356339044868946, + "rewards/accuracy_reward": 0.34375, + "rewards/format_reward": 1.0, + "step": 338 + }, + { + "clip_ratio": 0.0, + "completion_length": 580.5, + "epoch": 0.4643835616438356, + "grad_norm": 2.083859920501709, + "kl": 0.19677734375, + "learning_rate": 8.452054794520548e-07, + "loss": 0.0002, + "reward": 1.62109375, + "reward_std": 0.06917708925902843, + "rewards/accuracy_reward": 0.62109375, + "rewards/format_reward": 1.0, + "step": 339 + }, + { + "clip_ratio": 0.0, + "completion_length": 503.34375, + "epoch": 0.4657534246575342, + "grad_norm": 0.6715368628501892, + "kl": 0.2392578125, + "learning_rate": 8.447488584474886e-07, + "loss": 0.0002, + "reward": 1.49609375, + "reward_std": 0.04555431008338928, + "rewards/accuracy_reward": 0.49609375, + "rewards/format_reward": 1.0, + "step": 340 + }, + { + "clip_ratio": 0.0, + "completion_length": 613.625, + "epoch": 0.4671232876712329, + "grad_norm": 1.0103129148483276, + "kl": 0.19287109375, + "learning_rate": 8.442922374429223e-07, + "loss": 0.0002, + "reward": 1.3606770634651184, + "reward_std": 0.040511311031877995, + "rewards/accuracy_reward": 0.3606770783662796, + "rewards/format_reward": 1.0, + "step": 341 + }, + { + "clip_ratio": 0.0, + "completion_length": 512.5, + "epoch": 0.4684931506849315, + "grad_norm": 0.989227831363678, + "kl": 0.186767578125, + "learning_rate": 8.438356164383561e-07, + "loss": 0.0002, + "reward": 1.84375, + "reward_std": 0.21564550511538982, + "rewards/accuracy_reward": 0.84375, + "rewards/format_reward": 1.0, + "step": 342 + }, + { + "clip_ratio": 0.0, + "completion_length": 534.0, + "epoch": 0.46986301369863015, + "grad_norm": 1.0388092994689941, + "kl": 0.19189453125, + "learning_rate": 8.433789954337899e-07, + "loss": 0.0002, + "reward": 1.34375, + "reward_std": 0.0883883461356163, + "rewards/accuracy_reward": 0.34375, + "rewards/format_reward": 1.0, + "step": 343 + }, + { + "clip_ratio": 0.0, + "completion_length": 502.3125, + "epoch": 0.4712328767123288, + "grad_norm": 2.6138434410095215, + "kl": 0.216552734375, + "learning_rate": 8.429223744292237e-07, + "loss": 0.0002, + "reward": 1.234375, + "reward_std": 0.1437886729836464, + "rewards/accuracy_reward": 0.234375, + "rewards/format_reward": 1.0, + "step": 344 + }, + { + "clip_ratio": 0.0, + "completion_length": 666.34375, + "epoch": 0.4726027397260274, + "grad_norm": 2.721027135848999, + "kl": 0.189453125, + "learning_rate": 8.424657534246576e-07, + "loss": 0.0002, + "reward": 1.453125, + "reward_std": 0.2909114882349968, + "rewards/accuracy_reward": 0.515625, + "rewards/format_reward": 0.9375, + "step": 345 + }, + { + "clip_ratio": 0.0, + "completion_length": 521.65625, + "epoch": 0.473972602739726, + "grad_norm": 1.8512462377548218, + "kl": 0.235107421875, + "learning_rate": 8.420091324200913e-07, + "loss": 0.0002, + "reward": 1.2109375, + "reward_std": 0.11048543266952038, + "rewards/accuracy_reward": 0.2109375, + "rewards/format_reward": 1.0, + "step": 346 + }, + { + "clip_ratio": 0.0, + "completion_length": 502.125, + "epoch": 0.47534246575342465, + "grad_norm": 0.8181569576263428, + "kl": 0.232177734375, + "learning_rate": 8.415525114155251e-07, + "loss": 0.0002, + "reward": 1.2447916567325592, + "reward_std": 0.05193428695201874, + "rewards/accuracy_reward": 0.2447916865348816, + "rewards/format_reward": 1.0, + "step": 347 + }, + { + "clip_ratio": 0.0, + "completion_length": 625.875, + "epoch": 0.4767123287671233, + "grad_norm": 1.4510343074798584, + "kl": 0.21044921875, + "learning_rate": 8.410958904109589e-07, + "loss": 0.0002, + "reward": 2.015625, + "reward_std": 0.3187600150704384, + "rewards/accuracy_reward": 1.015625, + "rewards/format_reward": 1.0, + "step": 348 + }, + { + "clip_ratio": 0.0, + "completion_length": 632.625, + "epoch": 0.4780821917808219, + "grad_norm": 0.8320886492729187, + "kl": 0.21875, + "learning_rate": 8.406392694063926e-07, + "loss": 0.0002, + "reward": 1.328125, + "reward_std": 0.2824692949652672, + "rewards/accuracy_reward": 0.359375, + "rewards/format_reward": 0.96875, + "step": 349 + }, + { + "clip_ratio": 0.0, + "completion_length": 485.75, + "epoch": 0.4794520547945205, + "grad_norm": 2.6687960624694824, + "kl": 0.2578125, + "learning_rate": 8.401826484018264e-07, + "loss": 0.0003, + "reward": 1.32421875, + "reward_std": 0.26008394733071327, + "rewards/accuracy_reward": 0.32421875, + "rewards/format_reward": 1.0, + "step": 350 + }, + { + "clip_ratio": 0.0, + "completion_length": 614.84375, + "epoch": 0.4808219178082192, + "grad_norm": 1.2626047134399414, + "kl": 0.199462890625, + "learning_rate": 8.397260273972603e-07, + "loss": 0.0002, + "reward": 1.453125, + "reward_std": 0.19960851781070232, + "rewards/accuracy_reward": 0.484375, + "rewards/format_reward": 0.96875, + "step": 351 + }, + { + "clip_ratio": 0.0, + "completion_length": 760.0, + "epoch": 0.4821917808219178, + "grad_norm": 1.4241095781326294, + "kl": 0.26806640625, + "learning_rate": 8.39269406392694e-07, + "loss": 0.0003, + "reward": 1.5312499701976776, + "reward_std": 0.19424722902476788, + "rewards/accuracy_reward": 0.5624999850988388, + "rewards/format_reward": 0.96875, + "step": 352 + }, + { + "clip_ratio": 0.0, + "completion_length": 389.09375, + "epoch": 0.48356164383561645, + "grad_norm": 2.1359047889709473, + "kl": 0.23779296875, + "learning_rate": 8.388127853881279e-07, + "loss": 0.0002, + "reward": 1.359375, + "reward_std": 0.27564920112490654, + "rewards/accuracy_reward": 0.359375, + "rewards/format_reward": 1.0, + "step": 353 + }, + { + "clip_ratio": 0.0, + "completion_length": 498.59375, + "epoch": 0.4849315068493151, + "grad_norm": 2.235769510269165, + "kl": 0.20361328125, + "learning_rate": 8.383561643835616e-07, + "loss": 0.0002, + "reward": 1.25, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 1.0, + "step": 354 + }, + { + "clip_ratio": 0.0, + "completion_length": 754.40625, + "epoch": 0.4863013698630137, + "grad_norm": 0.011801384389400482, + "kl": 0.194091796875, + "learning_rate": 8.378995433789954e-07, + "loss": 0.0002, + "reward": 1.125, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 1.0, + "step": 355 + }, + { + "clip_ratio": 0.0, + "completion_length": 571.09375, + "epoch": 0.4876712328767123, + "grad_norm": 3.116180181503296, + "kl": 0.214599609375, + "learning_rate": 8.374429223744292e-07, + "loss": 0.0002, + "reward": 1.9453125, + "reward_std": 0.35993071645498276, + "rewards/accuracy_reward": 0.9453125, + "rewards/format_reward": 1.0, + "step": 356 + }, + { + "clip_ratio": 0.0, + "completion_length": 500.90625, + "epoch": 0.48904109589041095, + "grad_norm": 2.013319492340088, + "kl": 0.21240234375, + "learning_rate": 8.369863013698629e-07, + "loss": 0.0002, + "reward": 1.5546875, + "reward_std": 0.19226671755313873, + "rewards/accuracy_reward": 0.5546875, + "rewards/format_reward": 1.0, + "step": 357 + }, + { + "clip_ratio": 0.0, + "completion_length": 684.53125, + "epoch": 0.4904109589041096, + "grad_norm": 1.1122536659240723, + "kl": 0.219970703125, + "learning_rate": 8.365296803652968e-07, + "loss": 0.0002, + "reward": 1.6927083134651184, + "reward_std": 0.13533581793308258, + "rewards/accuracy_reward": 0.6927083432674408, + "rewards/format_reward": 1.0, + "step": 358 + }, + { + "clip_ratio": 0.0, + "completion_length": 591.5, + "epoch": 0.4917808219178082, + "grad_norm": 3.6900923252105713, + "kl": 0.253173828125, + "learning_rate": 8.360730593607306e-07, + "loss": 0.0003, + "reward": 1.140625, + "reward_std": 0.04419417306780815, + "rewards/accuracy_reward": 0.140625, + "rewards/format_reward": 1.0, + "step": 359 + }, + { + "clip_ratio": 0.0, + "completion_length": 402.375, + "epoch": 0.4931506849315068, + "grad_norm": 1.4678456783294678, + "kl": 0.254150390625, + "learning_rate": 8.356164383561643e-07, + "loss": 0.0003, + "reward": 1.375, + "reward_std": 0.2925042062997818, + "rewards/accuracy_reward": 0.40625, + "rewards/format_reward": 0.96875, + "step": 360 + }, + { + "clip_ratio": 0.0, + "completion_length": 464.90625, + "epoch": 0.4945205479452055, + "grad_norm": 0.8065510392189026, + "kl": 0.238037109375, + "learning_rate": 8.351598173515981e-07, + "loss": 0.0002, + "reward": 1.484375, + "reward_std": 0.0289318785071373, + "rewards/accuracy_reward": 0.484375, + "rewards/format_reward": 1.0, + "step": 361 + }, + { + "clip_ratio": 0.0, + "completion_length": 437.53125, + "epoch": 0.4958904109589041, + "grad_norm": 0.8244228959083557, + "kl": 0.2685546875, + "learning_rate": 8.347031963470319e-07, + "loss": 0.0003, + "reward": 1.1953125, + "reward_std": 0.04005437344312668, + "rewards/accuracy_reward": 0.1953125, + "rewards/format_reward": 1.0, + "step": 362 + }, + { + "clip_ratio": 0.0, + "completion_length": 360.59375, + "epoch": 0.49726027397260275, + "grad_norm": 0.8816384077072144, + "kl": 0.24267578125, + "learning_rate": 8.342465753424657e-07, + "loss": 0.0002, + "reward": 1.234375, + "reward_std": 0.04419417306780815, + "rewards/accuracy_reward": 0.234375, + "rewards/format_reward": 1.0, + "step": 363 + }, + { + "clip_ratio": 0.0, + "completion_length": 694.5, + "epoch": 0.4986301369863014, + "grad_norm": 1.5277591943740845, + "kl": 0.229248046875, + "learning_rate": 8.337899543378996e-07, + "loss": 0.0002, + "reward": 1.9697916805744171, + "reward_std": 0.2954293917864561, + "rewards/accuracy_reward": 0.9697916805744171, + "rewards/format_reward": 1.0, + "step": 364 + }, + { + "clip_ratio": 0.0, + "completion_length": 584.96875, + "epoch": 0.5, + "grad_norm": 0.6433670520782471, + "kl": 0.241455078125, + "learning_rate": 8.333333333333333e-07, + "loss": 0.0002, + "reward": 1.453125, + "reward_std": 0.13258251547813416, + "rewards/accuracy_reward": 0.453125, + "rewards/format_reward": 1.0, + "step": 365 + }, + { + "clip_ratio": 0.0, + "completion_length": 321.78125, + "epoch": 0.5013698630136987, + "grad_norm": 2.3756937980651855, + "kl": 0.272216796875, + "learning_rate": 8.328767123287671e-07, + "loss": 0.0003, + "reward": 1.28125, + "reward_std": 0.2346404492855072, + "rewards/accuracy_reward": 0.28125, + "rewards/format_reward": 1.0, + "step": 366 + }, + { + "clip_ratio": 0.0, + "completion_length": 733.40625, + "epoch": 0.5027397260273972, + "grad_norm": 2.2720232009887695, + "kl": 0.206298828125, + "learning_rate": 8.324200913242009e-07, + "loss": 0.0002, + "reward": 1.8359375, + "reward_std": 0.1747150868177414, + "rewards/accuracy_reward": 0.8359375, + "rewards/format_reward": 1.0, + "step": 367 + }, + { + "clip_ratio": 0.0, + "completion_length": 443.53125, + "epoch": 0.5041095890410959, + "grad_norm": 2.717717170715332, + "kl": 0.2490234375, + "learning_rate": 8.319634703196346e-07, + "loss": 0.0002, + "reward": 1.15625, + "reward_std": 0.1293872892856598, + "rewards/accuracy_reward": 0.15625, + "rewards/format_reward": 1.0, + "step": 368 + }, + { + "clip_ratio": 0.0, + "completion_length": 319.96875, + "epoch": 0.5054794520547945, + "grad_norm": 1.975050449371338, + "kl": 0.249267578125, + "learning_rate": 8.315068493150684e-07, + "loss": 0.0002, + "reward": 1.21875, + "reward_std": 0.0883883461356163, + "rewards/accuracy_reward": 0.21875, + "rewards/format_reward": 1.0, + "step": 369 + }, + { + "clip_ratio": 0.0, + "completion_length": 548.09375, + "epoch": 0.5068493150684932, + "grad_norm": 4.310245990753174, + "kl": 0.254638671875, + "learning_rate": 8.310502283105022e-07, + "loss": 0.0003, + "reward": 1.59375, + "reward_std": 0.2177756354212761, + "rewards/accuracy_reward": 0.59375, + "rewards/format_reward": 1.0, + "step": 370 + }, + { + "clip_ratio": 0.0, + "completion_length": 323.375, + "epoch": 0.5082191780821917, + "grad_norm": 487.17987060546875, + "kl": 0.27783203125, + "learning_rate": 8.305936073059361e-07, + "loss": 0.0003, + "reward": 1.46875, + "reward_std": 0.2346404492855072, + "rewards/accuracy_reward": 0.46875, + "rewards/format_reward": 1.0, + "step": 371 + }, + { + "clip_ratio": 0.0, + "completion_length": 449.71875, + "epoch": 0.5095890410958904, + "grad_norm": 6.600228309631348, + "kl": 0.24365234375, + "learning_rate": 8.301369863013699e-07, + "loss": 0.0002, + "reward": 1.46875, + "reward_std": 0.17965975776314735, + "rewards/accuracy_reward": 0.46875, + "rewards/format_reward": 1.0, + "step": 372 + }, + { + "clip_ratio": 0.0, + "completion_length": 409.6875, + "epoch": 0.510958904109589, + "grad_norm": 1.2109966278076172, + "kl": 0.24609375, + "learning_rate": 8.296803652968036e-07, + "loss": 0.0002, + "reward": 1.33984375, + "reward_std": 0.08086705580353737, + "rewards/accuracy_reward": 0.33984375, + "rewards/format_reward": 1.0, + "step": 373 + }, + { + "clip_ratio": 0.0, + "completion_length": 525.96875, + "epoch": 0.5123287671232877, + "grad_norm": 1.2593879699707031, + "kl": 0.23974609375, + "learning_rate": 8.292237442922374e-07, + "loss": 0.0002, + "reward": 1.9322916567325592, + "reward_std": 0.07365694083273411, + "rewards/accuracy_reward": 0.9322916716337204, + "rewards/format_reward": 1.0, + "step": 374 + }, + { + "clip_ratio": 0.0, + "completion_length": 654.34375, + "epoch": 0.5136986301369864, + "grad_norm": 1.1873910427093506, + "kl": 0.228515625, + "learning_rate": 8.287671232876712e-07, + "loss": 0.0002, + "reward": 1.5208333730697632, + "reward_std": 0.06780947372317314, + "rewards/accuracy_reward": 0.5208333134651184, + "rewards/format_reward": 1.0, + "step": 375 + }, + { + "clip_ratio": 0.0, + "completion_length": 592.625, + "epoch": 0.5150684931506849, + "grad_norm": 2.3675568103790283, + "kl": 0.3232421875, + "learning_rate": 8.283105022831049e-07, + "loss": 0.0003, + "reward": 1.74609375, + "reward_std": 0.18580568581819534, + "rewards/accuracy_reward": 0.74609375, + "rewards/format_reward": 1.0, + "step": 376 + }, + { + "clip_ratio": 0.0, + "completion_length": 413.6875, + "epoch": 0.5164383561643836, + "grad_norm": 1.0833803415298462, + "kl": 0.29248046875, + "learning_rate": 8.278538812785387e-07, + "loss": 0.0003, + "reward": 1.2109375, + "reward_std": 0.022097086533904076, + "rewards/accuracy_reward": 0.2109375, + "rewards/format_reward": 1.0, + "step": 377 + }, + { + "clip_ratio": 0.0, + "completion_length": 556.5625, + "epoch": 0.5178082191780822, + "grad_norm": 2.686486005783081, + "kl": 0.234130859375, + "learning_rate": 8.273972602739726e-07, + "loss": 0.0002, + "reward": 1.8828125, + "reward_std": 0.08731903322041035, + "rewards/accuracy_reward": 0.8828125, + "rewards/format_reward": 1.0, + "step": 378 + }, + { + "clip_ratio": 0.0, + "completion_length": 428.8125, + "epoch": 0.5191780821917809, + "grad_norm": 0.7580247521400452, + "kl": 0.2861328125, + "learning_rate": 8.269406392694064e-07, + "loss": 0.0003, + "reward": 1.2578125, + "reward_std": 0.09704047441482544, + "rewards/accuracy_reward": 0.2578125, + "rewards/format_reward": 1.0, + "step": 379 + }, + { + "clip_ratio": 0.0, + "completion_length": 399.46875, + "epoch": 0.5205479452054794, + "grad_norm": 0.9390192627906799, + "kl": 0.2568359375, + "learning_rate": 8.264840182648402e-07, + "loss": 0.0003, + "reward": 1.359375, + "reward_std": 0.19408094882965088, + "rewards/accuracy_reward": 0.359375, + "rewards/format_reward": 1.0, + "step": 380 + }, + { + "clip_ratio": 0.0, + "completion_length": 440.59375, + "epoch": 0.5219178082191781, + "grad_norm": 1.6535121202468872, + "kl": 0.2880859375, + "learning_rate": 8.260273972602739e-07, + "loss": 0.0003, + "reward": 1.4765625, + "reward_std": 0.11048543266952038, + "rewards/accuracy_reward": 0.4765625, + "rewards/format_reward": 1.0, + "step": 381 + }, + { + "clip_ratio": 0.0, + "completion_length": 631.625, + "epoch": 0.5232876712328767, + "grad_norm": 32.29432678222656, + "kl": 0.25146484375, + "learning_rate": 8.255707762557077e-07, + "loss": 0.0003, + "reward": 1.578125, + "reward_std": 0.10205793008208275, + "rewards/accuracy_reward": 0.578125, + "rewards/format_reward": 1.0, + "step": 382 + }, + { + "clip_ratio": 0.0, + "completion_length": 654.96875, + "epoch": 0.5246575342465754, + "grad_norm": 4.199236869812012, + "kl": 0.243896484375, + "learning_rate": 8.251141552511415e-07, + "loss": 0.0002, + "reward": 1.4739583134651184, + "reward_std": 0.10995287448167801, + "rewards/accuracy_reward": 0.4739583283662796, + "rewards/format_reward": 1.0, + "step": 383 + }, + { + "clip_ratio": 0.0, + "completion_length": 287.28125, + "epoch": 0.5260273972602739, + "grad_norm": 14.366601943969727, + "kl": 0.306640625, + "learning_rate": 8.246575342465753e-07, + "loss": 0.0003, + "reward": 1.421875, + "reward_std": 0.2877064570784569, + "rewards/accuracy_reward": 0.421875, + "rewards/format_reward": 1.0, + "step": 384 + }, + { + "clip_ratio": 0.0, + "completion_length": 419.9375, + "epoch": 0.5273972602739726, + "grad_norm": 1.405211329460144, + "kl": 0.2666015625, + "learning_rate": 8.242009132420092e-07, + "loss": 0.0003, + "reward": 1.5494791269302368, + "reward_std": 0.10189040005207062, + "rewards/accuracy_reward": 0.5494791567325592, + "rewards/format_reward": 1.0, + "step": 385 + }, + { + "clip_ratio": 0.0, + "completion_length": 501.96875, + "epoch": 0.5287671232876713, + "grad_norm": 1.3458356857299805, + "kl": 0.250244140625, + "learning_rate": 8.237442922374429e-07, + "loss": 0.0003, + "reward": 1.4140625, + "reward_std": 0.15268651396036148, + "rewards/accuracy_reward": 0.4140625, + "rewards/format_reward": 1.0, + "step": 386 + }, + { + "clip_ratio": 0.0, + "completion_length": 679.59375, + "epoch": 0.5301369863013699, + "grad_norm": 1.6495295763015747, + "kl": 0.23291015625, + "learning_rate": 8.232876712328767e-07, + "loss": 0.0002, + "reward": 1.7203125059604645, + "reward_std": 0.15397237055003643, + "rewards/accuracy_reward": 0.7203124761581421, + "rewards/format_reward": 1.0, + "step": 387 + }, + { + "clip_ratio": 0.0, + "completion_length": 518.90625, + "epoch": 0.5315068493150685, + "grad_norm": 1.8638428449630737, + "kl": 0.28076171875, + "learning_rate": 8.228310502283105e-07, + "loss": 0.0003, + "reward": 1.421875, + "reward_std": 0.0289318785071373, + "rewards/accuracy_reward": 0.421875, + "rewards/format_reward": 1.0, + "step": 388 + }, + { + "clip_ratio": 0.0, + "completion_length": 477.375, + "epoch": 0.5328767123287671, + "grad_norm": 2.2824759483337402, + "kl": 0.250732421875, + "learning_rate": 8.223744292237442e-07, + "loss": 0.0003, + "reward": 1.8055555820465088, + "reward_std": 0.19911032635718584, + "rewards/accuracy_reward": 0.8055555671453476, + "rewards/format_reward": 1.0, + "step": 389 + }, + { + "clip_ratio": 0.0, + "completion_length": 428.59375, + "epoch": 0.5342465753424658, + "grad_norm": 2.9996278285980225, + "kl": 0.259765625, + "learning_rate": 8.21917808219178e-07, + "loss": 0.0003, + "reward": 2.0, + "reward_std": 0.16151439771056175, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 390 + }, + { + "clip_ratio": 0.0, + "completion_length": 446.9375, + "epoch": 0.5356164383561643, + "grad_norm": 2.858344316482544, + "kl": 0.2744140625, + "learning_rate": 8.214611872146119e-07, + "loss": 0.0003, + "reward": 1.3828125, + "reward_std": 0.2894125059247017, + "rewards/accuracy_reward": 0.3828125, + "rewards/format_reward": 1.0, + "step": 391 + }, + { + "clip_ratio": 0.0, + "completion_length": 477.03125, + "epoch": 0.536986301369863, + "grad_norm": 5.41072940826416, + "kl": 0.30810546875, + "learning_rate": 8.210045662100456e-07, + "loss": 0.0003, + "reward": 1.21875, + "reward_std": 0.1552036553621292, + "rewards/accuracy_reward": 0.21875, + "rewards/format_reward": 1.0, + "step": 392 + }, + { + "clip_ratio": 0.0, + "completion_length": 717.78125, + "epoch": 0.5383561643835616, + "grad_norm": 1.8393654823303223, + "kl": 0.301025390625, + "learning_rate": 8.205479452054795e-07, + "loss": 0.0003, + "reward": 1.40625, + "reward_std": 0.2041158601641655, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 0.96875, + "step": 393 + }, + { + "clip_ratio": 0.0, + "completion_length": 344.125, + "epoch": 0.5397260273972603, + "grad_norm": 1.726664662361145, + "kl": 0.287109375, + "learning_rate": 8.200913242009132e-07, + "loss": 0.0003, + "reward": 1.265625, + "reward_std": 0.17730122804641724, + "rewards/accuracy_reward": 0.265625, + "rewards/format_reward": 1.0, + "step": 394 + }, + { + "clip_ratio": 0.0, + "completion_length": 593.46875, + "epoch": 0.541095890410959, + "grad_norm": 1.4600183963775635, + "kl": 0.23193359375, + "learning_rate": 8.19634703196347e-07, + "loss": 0.0002, + "reward": 1.578125, + "reward_std": 0.1983242630958557, + "rewards/accuracy_reward": 0.578125, + "rewards/format_reward": 1.0, + "step": 395 + }, + { + "clip_ratio": 0.0, + "completion_length": 610.34375, + "epoch": 0.5424657534246575, + "grad_norm": 1.1658108234405518, + "kl": 0.25390625, + "learning_rate": 8.191780821917808e-07, + "loss": 0.0003, + "reward": 1.9140625, + "reward_std": 0.1649293377995491, + "rewards/accuracy_reward": 0.9140625, + "rewards/format_reward": 1.0, + "step": 396 + }, + { + "clip_ratio": 0.0, + "completion_length": 634.875, + "epoch": 0.5438356164383562, + "grad_norm": 1.2310694456100464, + "kl": 0.251953125, + "learning_rate": 8.187214611872145e-07, + "loss": 0.0003, + "reward": 1.453125, + "reward_std": 0.10205793008208275, + "rewards/accuracy_reward": 0.453125, + "rewards/format_reward": 1.0, + "step": 397 + }, + { + "clip_ratio": 0.0, + "completion_length": 502.3125, + "epoch": 0.5452054794520548, + "grad_norm": 1.6130096912384033, + "kl": 0.244140625, + "learning_rate": 8.182648401826484e-07, + "loss": 0.0002, + "reward": 1.6171875, + "reward_std": 0.0765409953892231, + "rewards/accuracy_reward": 0.6171875, + "rewards/format_reward": 1.0, + "step": 398 + }, + { + "clip_ratio": 0.0, + "completion_length": 700.3125, + "epoch": 0.5465753424657535, + "grad_norm": 1.6435996294021606, + "kl": 0.229736328125, + "learning_rate": 8.178082191780822e-07, + "loss": 0.0002, + "reward": 1.5677083134651184, + "reward_std": 0.3693719878792763, + "rewards/accuracy_reward": 0.6302083283662796, + "rewards/format_reward": 0.9375, + "step": 399 + }, + { + "clip_ratio": 0.0, + "completion_length": 347.75, + "epoch": 0.547945205479452, + "grad_norm": 0.7573147416114807, + "kl": 0.2626953125, + "learning_rate": 8.173515981735159e-07, + "loss": 0.0003, + "reward": 1.703125, + "reward_std": 0.0646936446428299, + "rewards/accuracy_reward": 0.703125, + "rewards/format_reward": 1.0, + "step": 400 + }, + { + "clip_ratio": 0.0, + "completion_length": 858.03125, + "epoch": 0.5493150684931507, + "grad_norm": 2.761258602142334, + "kl": 0.218994140625, + "learning_rate": 8.168949771689498e-07, + "loss": 0.0002, + "reward": 1.125, + "reward_std": 0.2925042062997818, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 0.875, + "step": 401 + }, + { + "clip_ratio": 0.0, + "completion_length": 362.5625, + "epoch": 0.5506849315068493, + "grad_norm": 4.398726463317871, + "kl": 0.285400390625, + "learning_rate": 8.164383561643835e-07, + "loss": 0.0003, + "reward": 1.40625, + "reward_std": 0.24511480331420898, + "rewards/accuracy_reward": 0.40625, + "rewards/format_reward": 1.0, + "step": 402 + }, + { + "clip_ratio": 0.0, + "completion_length": 824.625, + "epoch": 0.552054794520548, + "grad_norm": 1.2996392250061035, + "kl": 0.2373046875, + "learning_rate": 8.159817351598172e-07, + "loss": 0.0002, + "reward": 1.6875, + "reward_std": 0.6573142260313034, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 0.875, + "step": 403 + }, + { + "clip_ratio": 0.0, + "completion_length": 847.0, + "epoch": 0.5534246575342465, + "grad_norm": 1.484923243522644, + "kl": 0.2275390625, + "learning_rate": 8.155251141552512e-07, + "loss": 0.0002, + "reward": 1.640625, + "reward_std": 0.5366625860333443, + "rewards/accuracy_reward": 0.765625, + "rewards/format_reward": 0.875, + "step": 404 + }, + { + "clip_ratio": 0.0, + "completion_length": 619.21875, + "epoch": 0.5547945205479452, + "grad_norm": 2.970003843307495, + "kl": 0.255859375, + "learning_rate": 8.150684931506849e-07, + "loss": 0.0003, + "reward": 1.4921875, + "reward_std": 0.336070965975523, + "rewards/accuracy_reward": 0.5234375, + "rewards/format_reward": 0.96875, + "step": 405 + }, + { + "clip_ratio": 0.0, + "completion_length": 843.25, + "epoch": 0.5561643835616439, + "grad_norm": 2.474311113357544, + "kl": 0.22802734375, + "learning_rate": 8.146118721461187e-07, + "loss": 0.0002, + "reward": 1.90625, + "reward_std": 0.5572120249271393, + "rewards/accuracy_reward": 0.96875, + "rewards/format_reward": 0.9375, + "step": 406 + }, + { + "clip_ratio": 0.0, + "completion_length": 494.5625, + "epoch": 0.5575342465753425, + "grad_norm": 3.557459592819214, + "kl": 0.23876953125, + "learning_rate": 8.141552511415525e-07, + "loss": 0.0002, + "reward": 1.6197916567325592, + "reward_std": 0.1845790259540081, + "rewards/accuracy_reward": 0.6197916567325592, + "rewards/format_reward": 1.0, + "step": 407 + }, + { + "clip_ratio": 0.0, + "completion_length": 509.65625, + "epoch": 0.5589041095890411, + "grad_norm": 4.341843605041504, + "kl": 0.25634765625, + "learning_rate": 8.136986301369862e-07, + "loss": 0.0003, + "reward": 1.4375, + "reward_std": 0.49022960662841797, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.9375, + "step": 408 + }, + { + "clip_ratio": 0.0, + "completion_length": 659.375, + "epoch": 0.5602739726027397, + "grad_norm": 1.4180370569229126, + "kl": 0.231689453125, + "learning_rate": 8.132420091324201e-07, + "loss": 0.0002, + "reward": 1.59375, + "reward_std": 0.30327702313661575, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 0.96875, + "step": 409 + }, + { + "clip_ratio": 0.0, + "completion_length": 618.65625, + "epoch": 0.5616438356164384, + "grad_norm": 1.4287060499191284, + "kl": 0.273193359375, + "learning_rate": 8.127853881278538e-07, + "loss": 0.0003, + "reward": 1.484375, + "reward_std": 0.2497537788003683, + "rewards/accuracy_reward": 0.546875, + "rewards/format_reward": 0.9375, + "step": 410 + }, + { + "clip_ratio": 0.0, + "completion_length": 769.59375, + "epoch": 0.563013698630137, + "grad_norm": 1.4561328887939453, + "kl": 0.249267578125, + "learning_rate": 8.123287671232877e-07, + "loss": 0.0002, + "reward": 1.328125, + "reward_std": 0.09143973141908646, + "rewards/accuracy_reward": 0.328125, + "rewards/format_reward": 1.0, + "step": 411 + }, + { + "clip_ratio": 0.0, + "completion_length": 329.5625, + "epoch": 0.5643835616438356, + "grad_norm": 1.4629502296447754, + "kl": 0.26904296875, + "learning_rate": 8.118721461187215e-07, + "loss": 0.0003, + "reward": 1.34375, + "reward_std": 0.1293872892856598, + "rewards/accuracy_reward": 0.34375, + "rewards/format_reward": 1.0, + "step": 412 + }, + { + "clip_ratio": 0.0, + "completion_length": 458.34375, + "epoch": 0.5657534246575342, + "grad_norm": 4.4141669273376465, + "kl": 0.276611328125, + "learning_rate": 8.114155251141552e-07, + "loss": 0.0003, + "reward": 1.3828125, + "reward_std": 0.17702843621373177, + "rewards/accuracy_reward": 0.3828125, + "rewards/format_reward": 1.0, + "step": 413 + }, + { + "clip_ratio": 0.0, + "completion_length": 432.84375, + "epoch": 0.5671232876712329, + "grad_norm": 0.015535110607743263, + "kl": 0.278564453125, + "learning_rate": 8.10958904109589e-07, + "loss": 0.0003, + "reward": 1.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 414 + }, + { + "clip_ratio": 0.0, + "completion_length": 547.53125, + "epoch": 0.5684931506849316, + "grad_norm": 5.981386661529541, + "kl": 0.2919921875, + "learning_rate": 8.105022831050228e-07, + "loss": 0.0003, + "reward": 1.640625, + "reward_std": 0.2777610570192337, + "rewards/accuracy_reward": 0.640625, + "rewards/format_reward": 1.0, + "step": 415 + }, + { + "clip_ratio": 0.0, + "completion_length": 462.8125, + "epoch": 0.5698630136986301, + "grad_norm": 2.892124652862549, + "kl": 0.279296875, + "learning_rate": 8.100456621004565e-07, + "loss": 0.0003, + "reward": 1.765625, + "reward_std": 0.35141606256365776, + "rewards/accuracy_reward": 0.765625, + "rewards/format_reward": 1.0, + "step": 416 + }, + { + "clip_ratio": 0.0, + "completion_length": 685.75, + "epoch": 0.5712328767123288, + "grad_norm": 1.2724870443344116, + "kl": 0.252685546875, + "learning_rate": 8.095890410958903e-07, + "loss": 0.0003, + "reward": 1.390625, + "reward_std": 0.12255740165710449, + "rewards/accuracy_reward": 0.390625, + "rewards/format_reward": 1.0, + "step": 417 + }, + { + "clip_ratio": 0.0, + "completion_length": 443.0, + "epoch": 0.5726027397260274, + "grad_norm": 2.105574131011963, + "kl": 0.296875, + "learning_rate": 8.091324200913242e-07, + "loss": 0.0003, + "reward": 1.46875, + "reward_std": 0.2041158601641655, + "rewards/accuracy_reward": 0.46875, + "rewards/format_reward": 1.0, + "step": 418 + }, + { + "clip_ratio": 0.0, + "completion_length": 442.21875, + "epoch": 0.5739726027397261, + "grad_norm": 4.214569091796875, + "kl": 0.3125, + "learning_rate": 8.08675799086758e-07, + "loss": 0.0003, + "reward": 1.640625, + "reward_std": 0.18648964539170265, + "rewards/accuracy_reward": 0.640625, + "rewards/format_reward": 1.0, + "step": 419 + }, + { + "clip_ratio": 0.0, + "completion_length": 436.59375, + "epoch": 0.5753424657534246, + "grad_norm": 4.964509963989258, + "kl": 0.2587890625, + "learning_rate": 8.082191780821918e-07, + "loss": 0.0003, + "reward": 1.453125, + "reward_std": 0.28664068691432476, + "rewards/accuracy_reward": 0.453125, + "rewards/format_reward": 1.0, + "step": 420 + }, + { + "clip_ratio": 0.0, + "completion_length": 669.875, + "epoch": 0.5767123287671233, + "grad_norm": 2.2054591178894043, + "kl": 0.243408203125, + "learning_rate": 8.077625570776255e-07, + "loss": 0.0002, + "reward": 1.71484375, + "reward_std": 0.11598511412739754, + "rewards/accuracy_reward": 0.71484375, + "rewards/format_reward": 1.0, + "step": 421 + }, + { + "clip_ratio": 0.0, + "completion_length": 543.125, + "epoch": 0.5780821917808219, + "grad_norm": 2.5887465476989746, + "kl": 0.28125, + "learning_rate": 8.073059360730593e-07, + "loss": 0.0003, + "reward": 1.515625, + "reward_std": 0.23243396915495396, + "rewards/accuracy_reward": 0.515625, + "rewards/format_reward": 1.0, + "step": 422 + }, + { + "clip_ratio": 0.0, + "completion_length": 539.84375, + "epoch": 0.5794520547945206, + "grad_norm": 3.1099772453308105, + "kl": 0.29345703125, + "learning_rate": 8.068493150684931e-07, + "loss": 0.0003, + "reward": 1.4810267686843872, + "reward_std": 0.10061977338045835, + "rewards/accuracy_reward": 0.4810267835855484, + "rewards/format_reward": 1.0, + "step": 423 + }, + { + "clip_ratio": 0.0, + "completion_length": 427.875, + "epoch": 0.5808219178082191, + "grad_norm": 2.3216264247894287, + "kl": 0.57421875, + "learning_rate": 8.063926940639269e-07, + "loss": 0.0006, + "reward": 1.140625, + "reward_std": 0.0289318785071373, + "rewards/accuracy_reward": 0.140625, + "rewards/format_reward": 1.0, + "step": 424 + }, + { + "clip_ratio": 0.0, + "completion_length": 314.0, + "epoch": 0.5821917808219178, + "grad_norm": 1.1300055980682373, + "kl": 0.27392578125, + "learning_rate": 8.059360730593608e-07, + "loss": 0.0003, + "reward": 1.140625, + "reward_std": 0.04419417306780815, + "rewards/accuracy_reward": 0.140625, + "rewards/format_reward": 1.0, + "step": 425 + }, + { + "clip_ratio": 0.0, + "completion_length": 517.5, + "epoch": 0.5835616438356165, + "grad_norm": 1.6136479377746582, + "kl": 0.2802734375, + "learning_rate": 8.054794520547945e-07, + "loss": 0.0003, + "reward": 1.859375, + "reward_std": 0.19408093392848969, + "rewards/accuracy_reward": 0.859375, + "rewards/format_reward": 1.0, + "step": 426 + }, + { + "clip_ratio": 0.0, + "completion_length": 300.6875, + "epoch": 0.584931506849315, + "grad_norm": 3.859463930130005, + "kl": 0.30029296875, + "learning_rate": 8.050228310502283e-07, + "loss": 0.0003, + "reward": 1.4375, + "reward_std": 0.4182215705513954, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 427 + }, + { + "clip_ratio": 0.0, + "completion_length": 491.25, + "epoch": 0.5863013698630137, + "grad_norm": 3.137112617492676, + "kl": 0.2880859375, + "learning_rate": 8.045662100456621e-07, + "loss": 0.0003, + "reward": 1.4713541865348816, + "reward_std": 0.18944428488612175, + "rewards/accuracy_reward": 0.4713541716337204, + "rewards/format_reward": 1.0, + "step": 428 + }, + { + "clip_ratio": 0.0, + "completion_length": 589.84375, + "epoch": 0.5876712328767123, + "grad_norm": 2.1864824295043945, + "kl": 0.30810546875, + "learning_rate": 8.041095890410958e-07, + "loss": 0.0003, + "reward": 1.6171875, + "reward_std": 0.14428602531552315, + "rewards/accuracy_reward": 0.6171875, + "rewards/format_reward": 1.0, + "step": 429 + }, + { + "clip_ratio": 0.0, + "completion_length": 412.46875, + "epoch": 0.589041095890411, + "grad_norm": 4.553158283233643, + "kl": 0.33642578125, + "learning_rate": 8.036529680365296e-07, + "loss": 0.0003, + "reward": 1.5390625, + "reward_std": 0.25012245774269104, + "rewards/accuracy_reward": 0.5390625, + "rewards/format_reward": 1.0, + "step": 430 + }, + { + "clip_ratio": 0.0, + "completion_length": 521.96875, + "epoch": 0.5904109589041096, + "grad_norm": 2.7253074645996094, + "kl": 0.31884765625, + "learning_rate": 8.031963470319635e-07, + "loss": 0.0003, + "reward": 1.453125, + "reward_std": 0.0646936446428299, + "rewards/accuracy_reward": 0.453125, + "rewards/format_reward": 1.0, + "step": 431 + }, + { + "clip_ratio": 0.0, + "completion_length": 434.8125, + "epoch": 0.5917808219178082, + "grad_norm": 1.4084954261779785, + "kl": 0.275634765625, + "learning_rate": 8.027397260273972e-07, + "loss": 0.0003, + "reward": 1.19921875, + "reward_std": 0.06874492764472961, + "rewards/accuracy_reward": 0.19921875, + "rewards/format_reward": 1.0, + "step": 432 + }, + { + "clip_ratio": 0.0, + "completion_length": 485.96875, + "epoch": 0.5931506849315068, + "grad_norm": 4.675118446350098, + "kl": 0.31787109375, + "learning_rate": 8.022831050228311e-07, + "loss": 0.0003, + "reward": 1.640625, + "reward_std": 0.10205793008208275, + "rewards/accuracy_reward": 0.640625, + "rewards/format_reward": 1.0, + "step": 433 + }, + { + "clip_ratio": 0.0, + "completion_length": 479.9375, + "epoch": 0.5945205479452055, + "grad_norm": 2.0734643936157227, + "kl": 0.2802734375, + "learning_rate": 8.018264840182648e-07, + "loss": 0.0003, + "reward": 1.671875, + "reward_std": 0.15101328492164612, + "rewards/accuracy_reward": 0.671875, + "rewards/format_reward": 1.0, + "step": 434 + }, + { + "clip_ratio": 0.0, + "completion_length": 320.59375, + "epoch": 0.5958904109589042, + "grad_norm": 4.225161075592041, + "kl": 0.31201171875, + "learning_rate": 8.013698630136985e-07, + "loss": 0.0003, + "reward": 1.53125, + "reward_std": 0.24511480331420898, + "rewards/accuracy_reward": 0.53125, + "rewards/format_reward": 1.0, + "step": 435 + }, + { + "clip_ratio": 0.0, + "completion_length": 651.125, + "epoch": 0.5972602739726027, + "grad_norm": 1.829791784286499, + "kl": 0.298828125, + "learning_rate": 8.009132420091324e-07, + "loss": 0.0003, + "reward": 1.66015625, + "reward_std": 0.3770730784162879, + "rewards/accuracy_reward": 0.6914062350988388, + "rewards/format_reward": 0.96875, + "step": 436 + }, + { + "clip_ratio": 0.0, + "completion_length": 207.25, + "epoch": 0.5986301369863014, + "grad_norm": 3.862016439437866, + "kl": 0.26318359375, + "learning_rate": 8.004566210045661e-07, + "loss": 0.0003, + "reward": 1.46875, + "reward_std": 0.3198433741927147, + "rewards/accuracy_reward": 0.46875, + "rewards/format_reward": 1.0, + "step": 437 + }, + { + "clip_ratio": 0.0, + "completion_length": 546.21875, + "epoch": 0.6, + "grad_norm": 3.7712178230285645, + "kl": 0.283203125, + "learning_rate": 8e-07, + "loss": 0.0003, + "reward": 1.5989583134651184, + "reward_std": 0.37473164498806, + "rewards/accuracy_reward": 0.6302083134651184, + "rewards/format_reward": 0.96875, + "step": 438 + }, + { + "clip_ratio": 0.0, + "completion_length": 540.53125, + "epoch": 0.6013698630136987, + "grad_norm": 3.2269580364227295, + "kl": 0.2978515625, + "learning_rate": 7.995433789954338e-07, + "loss": 0.0003, + "reward": 1.40625, + "reward_std": 0.18662459589540958, + "rewards/accuracy_reward": 0.4062499850988388, + "rewards/format_reward": 1.0, + "step": 439 + }, + { + "clip_ratio": 0.0, + "completion_length": 528.15625, + "epoch": 0.6027397260273972, + "grad_norm": 2.537423610687256, + "kl": 0.30517578125, + "learning_rate": 7.990867579908675e-07, + "loss": 0.0003, + "reward": 1.4791666567325592, + "reward_std": 0.22078385017812252, + "rewards/accuracy_reward": 0.4791666567325592, + "rewards/format_reward": 1.0, + "step": 440 + }, + { + "clip_ratio": 0.0, + "completion_length": 448.1875, + "epoch": 0.6041095890410959, + "grad_norm": 2.5535776615142822, + "kl": 0.28759765625, + "learning_rate": 7.986301369863014e-07, + "loss": 0.0003, + "reward": 1.59375, + "reward_std": 0.2167138308286667, + "rewards/accuracy_reward": 0.59375, + "rewards/format_reward": 1.0, + "step": 441 + }, + { + "clip_ratio": 0.0, + "completion_length": 666.71875, + "epoch": 0.6054794520547945, + "grad_norm": 2.6353836059570312, + "kl": 0.27392578125, + "learning_rate": 7.981735159817351e-07, + "loss": 0.0003, + "reward": 1.8463541567325592, + "reward_std": 0.2513718083500862, + "rewards/accuracy_reward": 0.8776041567325592, + "rewards/format_reward": 0.96875, + "step": 442 + }, + { + "clip_ratio": 0.0, + "completion_length": 640.34375, + "epoch": 0.6068493150684932, + "grad_norm": 2.2128899097442627, + "kl": 0.265625, + "learning_rate": 7.977168949771688e-07, + "loss": 0.0003, + "reward": 1.859375, + "reward_std": 0.17806704714894295, + "rewards/accuracy_reward": 0.859375, + "rewards/format_reward": 1.0, + "step": 443 + }, + { + "clip_ratio": 0.0, + "completion_length": 427.8125, + "epoch": 0.6082191780821918, + "grad_norm": 2.6061644554138184, + "kl": 0.287109375, + "learning_rate": 7.972602739726027e-07, + "loss": 0.0003, + "reward": 1.4375, + "reward_std": 0.09127141162753105, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 444 + }, + { + "clip_ratio": 0.0, + "completion_length": 308.46875, + "epoch": 0.6095890410958904, + "grad_norm": 13.305716514587402, + "kl": 0.30712890625, + "learning_rate": 7.968036529680365e-07, + "loss": 0.0003, + "reward": 1.4196428656578064, + "reward_std": 0.2756448173895478, + "rewards/accuracy_reward": 0.419642835855484, + "rewards/format_reward": 1.0, + "step": 445 + }, + { + "clip_ratio": 0.0, + "completion_length": 561.03125, + "epoch": 0.6109589041095891, + "grad_norm": 3.1724693775177, + "kl": 0.27685546875, + "learning_rate": 7.963470319634703e-07, + "loss": 0.0003, + "reward": 1.296875, + "reward_std": 0.1446593925356865, + "rewards/accuracy_reward": 0.296875, + "rewards/format_reward": 1.0, + "step": 446 + }, + { + "clip_ratio": 0.0, + "completion_length": 562.15625, + "epoch": 0.6123287671232877, + "grad_norm": 3.927133798599243, + "kl": 0.369140625, + "learning_rate": 7.958904109589041e-07, + "loss": 0.0004, + "reward": 1.564843773841858, + "reward_std": 0.18459024094045162, + "rewards/accuracy_reward": 0.5648437440395355, + "rewards/format_reward": 1.0, + "step": 447 + }, + { + "clip_ratio": 0.0, + "completion_length": 617.625, + "epoch": 0.6136986301369863, + "grad_norm": 1.9001963138580322, + "kl": 0.26806640625, + "learning_rate": 7.954337899543378e-07, + "loss": 0.0003, + "reward": 1.6171875, + "reward_std": 0.148872472345829, + "rewards/accuracy_reward": 0.6171875149011612, + "rewards/format_reward": 1.0, + "step": 448 + }, + { + "clip_ratio": 0.0, + "completion_length": 466.96875, + "epoch": 0.6150684931506849, + "grad_norm": 0.014812873676419258, + "kl": 0.29345703125, + "learning_rate": 7.949771689497717e-07, + "loss": 0.0003, + "reward": 1.25, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 1.0, + "step": 449 + }, + { + "clip_ratio": 0.0, + "completion_length": 555.25, + "epoch": 0.6164383561643836, + "grad_norm": 4.2387614250183105, + "kl": 0.345703125, + "learning_rate": 7.945205479452054e-07, + "loss": 0.0003, + "reward": 1.5859375, + "reward_std": 0.21959786862134933, + "rewards/accuracy_reward": 0.5859375, + "rewards/format_reward": 1.0, + "step": 450 + }, + { + "clip_ratio": 0.0, + "completion_length": 483.5, + "epoch": 0.6178082191780822, + "grad_norm": 1.1416374444961548, + "kl": 0.29345703125, + "learning_rate": 7.940639269406393e-07, + "loss": 0.0003, + "reward": 1.4531249701976776, + "reward_std": 0.11806126311421394, + "rewards/accuracy_reward": 0.453125, + "rewards/format_reward": 1.0, + "step": 451 + }, + { + "clip_ratio": 0.0, + "completion_length": 562.5, + "epoch": 0.6191780821917808, + "grad_norm": 2.7325708866119385, + "kl": 0.296875, + "learning_rate": 7.936073059360731e-07, + "loss": 0.0003, + "reward": 1.8020833432674408, + "reward_std": 0.1516590639948845, + "rewards/accuracy_reward": 0.8020833283662796, + "rewards/format_reward": 1.0, + "step": 452 + }, + { + "clip_ratio": 0.0, + "completion_length": 698.15625, + "epoch": 0.6205479452054794, + "grad_norm": 1.962207317352295, + "kl": 0.2685546875, + "learning_rate": 7.931506849315068e-07, + "loss": 0.0003, + "reward": 1.4609375, + "reward_std": 0.10126157477498055, + "rewards/accuracy_reward": 0.4609375, + "rewards/format_reward": 1.0, + "step": 453 + }, + { + "clip_ratio": 0.0, + "completion_length": 594.25, + "epoch": 0.6219178082191781, + "grad_norm": 2.9467625617980957, + "kl": 0.254150390625, + "learning_rate": 7.926940639269406e-07, + "loss": 0.0003, + "reward": 1.7825521230697632, + "reward_std": 0.37998438626527786, + "rewards/accuracy_reward": 0.8138020634651184, + "rewards/format_reward": 0.96875, + "step": 454 + }, + { + "clip_ratio": 0.0, + "completion_length": 578.5625, + "epoch": 0.6232876712328768, + "grad_norm": 2.3912293910980225, + "kl": 0.299072265625, + "learning_rate": 7.922374429223744e-07, + "loss": 0.0003, + "reward": 1.71484375, + "reward_std": 0.41244056448340416, + "rewards/accuracy_reward": 0.74609375, + "rewards/format_reward": 0.96875, + "step": 455 + }, + { + "clip_ratio": 0.0, + "completion_length": 459.84375, + "epoch": 0.6246575342465753, + "grad_norm": 2.0481388568878174, + "kl": 0.271240234375, + "learning_rate": 7.917808219178081e-07, + "loss": 0.0003, + "reward": 1.4296875, + "reward_std": 0.23987272381782532, + "rewards/accuracy_reward": 0.4296875, + "rewards/format_reward": 1.0, + "step": 456 + }, + { + "clip_ratio": 0.0, + "completion_length": 582.1875, + "epoch": 0.626027397260274, + "grad_norm": 1.0337698459625244, + "kl": 0.2783203125, + "learning_rate": 7.91324200913242e-07, + "loss": 0.0003, + "reward": 1.140625, + "reward_std": 0.04419417306780815, + "rewards/accuracy_reward": 0.140625, + "rewards/format_reward": 1.0, + "step": 457 + }, + { + "clip_ratio": 0.0, + "completion_length": 479.8125, + "epoch": 0.6273972602739726, + "grad_norm": 3.085940361022949, + "kl": 0.267578125, + "learning_rate": 7.908675799086758e-07, + "loss": 0.0003, + "reward": 1.3984375, + "reward_std": 0.21758441254496574, + "rewards/accuracy_reward": 0.4296875, + "rewards/format_reward": 0.96875, + "step": 458 + }, + { + "clip_ratio": 0.0, + "completion_length": 722.125, + "epoch": 0.6287671232876713, + "grad_norm": 2.2344439029693604, + "kl": 0.243408203125, + "learning_rate": 7.904109589041096e-07, + "loss": 0.0002, + "reward": 1.7083333730697632, + "reward_std": 0.10860283300280571, + "rewards/accuracy_reward": 0.7083333432674408, + "rewards/format_reward": 1.0, + "step": 459 + }, + { + "clip_ratio": 0.0, + "completion_length": 448.84375, + "epoch": 0.6301369863013698, + "grad_norm": 2.974860429763794, + "kl": 0.2705078125, + "learning_rate": 7.899543378995434e-07, + "loss": 0.0003, + "reward": 1.453125, + "reward_std": 0.11100946366786957, + "rewards/accuracy_reward": 0.453125, + "rewards/format_reward": 1.0, + "step": 460 + }, + { + "clip_ratio": 0.0, + "completion_length": 468.53125, + "epoch": 0.6315068493150685, + "grad_norm": 2.6200695037841797, + "kl": 0.289794921875, + "learning_rate": 7.894977168949771e-07, + "loss": 0.0003, + "reward": 1.265625, + "reward_std": 0.1804211586713791, + "rewards/accuracy_reward": 0.265625, + "rewards/format_reward": 1.0, + "step": 461 + }, + { + "clip_ratio": 0.0, + "completion_length": 731.0, + "epoch": 0.6328767123287671, + "grad_norm": 2.422001838684082, + "kl": 0.307861328125, + "learning_rate": 7.890410958904109e-07, + "loss": 0.0003, + "reward": 1.9296875, + "reward_std": 0.31823596358299255, + "rewards/accuracy_reward": 0.9296875, + "rewards/format_reward": 1.0, + "step": 462 + }, + { + "clip_ratio": 0.0, + "completion_length": 685.46875, + "epoch": 0.6342465753424658, + "grad_norm": 2.2778875827789307, + "kl": 0.2734375, + "learning_rate": 7.885844748858447e-07, + "loss": 0.0003, + "reward": 1.7734375, + "reward_std": 0.1592222936451435, + "rewards/accuracy_reward": 0.7734375, + "rewards/format_reward": 1.0, + "step": 463 + }, + { + "clip_ratio": 0.0, + "completion_length": 683.90625, + "epoch": 0.6356164383561644, + "grad_norm": 1.6627824306488037, + "kl": 0.251708984375, + "learning_rate": 7.881278538812784e-07, + "loss": 0.0003, + "reward": 1.5598958134651184, + "reward_std": 0.04366161487996578, + "rewards/accuracy_reward": 0.5598958283662796, + "rewards/format_reward": 1.0, + "step": 464 + }, + { + "clip_ratio": 0.0, + "completion_length": 450.0625, + "epoch": 0.636986301369863, + "grad_norm": 2.5756311416625977, + "kl": 0.251220703125, + "learning_rate": 7.876712328767124e-07, + "loss": 0.0003, + "reward": 1.203125, + "reward_std": 0.2824692949652672, + "rewards/accuracy_reward": 0.234375, + "rewards/format_reward": 0.96875, + "step": 465 + }, + { + "clip_ratio": 0.0, + "completion_length": 682.375, + "epoch": 0.6383561643835617, + "grad_norm": 2.5796759128570557, + "kl": 0.2705078125, + "learning_rate": 7.872146118721461e-07, + "loss": 0.0003, + "reward": 1.4140625, + "reward_std": 0.03234682232141495, + "rewards/accuracy_reward": 0.4140625, + "rewards/format_reward": 1.0, + "step": 466 + }, + { + "clip_ratio": 0.0, + "completion_length": 731.21875, + "epoch": 0.6397260273972603, + "grad_norm": 2.126073122024536, + "kl": 0.257568359375, + "learning_rate": 7.867579908675798e-07, + "loss": 0.0003, + "reward": 1.7682291567325592, + "reward_std": 0.19275827147066593, + "rewards/accuracy_reward": 0.768229141831398, + "rewards/format_reward": 1.0, + "step": 467 + }, + { + "clip_ratio": 0.0, + "completion_length": 554.9375, + "epoch": 0.6410958904109589, + "grad_norm": 1.904000163078308, + "kl": 0.2880859375, + "learning_rate": 7.863013698630137e-07, + "loss": 0.0003, + "reward": 1.65625, + "reward_std": 0.1246790662407875, + "rewards/accuracy_reward": 0.65625, + "rewards/format_reward": 1.0, + "step": 468 + }, + { + "clip_ratio": 0.0, + "completion_length": 690.0, + "epoch": 0.6424657534246575, + "grad_norm": 1.2763168811798096, + "kl": 0.28125, + "learning_rate": 7.858447488584474e-07, + "loss": 0.0003, + "reward": 1.421875, + "reward_std": 0.06646592170000076, + "rewards/accuracy_reward": 0.421875, + "rewards/format_reward": 1.0, + "step": 469 + }, + { + "clip_ratio": 0.0, + "completion_length": 689.65625, + "epoch": 0.6438356164383562, + "grad_norm": 1.074062705039978, + "kl": 0.2880859375, + "learning_rate": 7.853881278538812e-07, + "loss": 0.0003, + "reward": 1.578125, + "reward_std": 0.09300297498703003, + "rewards/accuracy_reward": 0.609375, + "rewards/format_reward": 0.96875, + "step": 470 + }, + { + "clip_ratio": 0.0, + "completion_length": 320.03125, + "epoch": 0.6452054794520548, + "grad_norm": 7.623113632202148, + "kl": 0.27197265625, + "learning_rate": 7.849315068493151e-07, + "loss": 0.0003, + "reward": 1.28125, + "reward_std": 0.3471629247069359, + "rewards/accuracy_reward": 0.28125, + "rewards/format_reward": 1.0, + "step": 471 + }, + { + "clip_ratio": 0.0, + "completion_length": 330.34375, + "epoch": 0.6465753424657534, + "grad_norm": 1.6552479267120361, + "kl": 0.3212890625, + "learning_rate": 7.844748858447488e-07, + "loss": 0.0003, + "reward": 1.34375, + "reward_std": 0.16279494389891624, + "rewards/accuracy_reward": 0.34375, + "rewards/format_reward": 1.0, + "step": 472 + }, + { + "clip_ratio": 0.0, + "completion_length": 434.875, + "epoch": 0.647945205479452, + "grad_norm": 2.9016270637512207, + "kl": 0.2880859375, + "learning_rate": 7.840182648401827e-07, + "loss": 0.0003, + "reward": 1.421875, + "reward_std": 0.3365500792860985, + "rewards/accuracy_reward": 0.453125, + "rewards/format_reward": 0.96875, + "step": 473 + }, + { + "clip_ratio": 0.0, + "completion_length": 436.46875, + "epoch": 0.6493150684931507, + "grad_norm": 2.371548652648926, + "kl": 0.30126953125, + "learning_rate": 7.835616438356164e-07, + "loss": 0.0003, + "reward": 1.6510416567325592, + "reward_std": 0.15049929916858673, + "rewards/accuracy_reward": 0.6510416716337204, + "rewards/format_reward": 1.0, + "step": 474 + }, + { + "clip_ratio": 0.0, + "completion_length": 535.125, + "epoch": 0.6506849315068494, + "grad_norm": 1.709067940711975, + "kl": 0.29541015625, + "learning_rate": 7.831050228310501e-07, + "loss": 0.0003, + "reward": 1.5, + "reward_std": 0.1825428232550621, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 475 + }, + { + "clip_ratio": 0.0, + "completion_length": 317.53125, + "epoch": 0.6520547945205479, + "grad_norm": 0.8692818284034729, + "kl": 0.29931640625, + "learning_rate": 7.82648401826484e-07, + "loss": 0.0003, + "reward": 1.5729166269302368, + "reward_std": 0.07952611148357391, + "rewards/accuracy_reward": 0.5729166567325592, + "rewards/format_reward": 1.0, + "step": 476 + }, + { + "clip_ratio": 0.0, + "completion_length": 441.5, + "epoch": 0.6534246575342466, + "grad_norm": 3.1770925521850586, + "kl": 0.2998046875, + "learning_rate": 7.821917808219177e-07, + "loss": 0.0003, + "reward": 1.875, + "reward_std": 0.27724190801382065, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 477 + }, + { + "clip_ratio": 0.0, + "completion_length": 734.28125, + "epoch": 0.6547945205479452, + "grad_norm": 6.152999401092529, + "kl": 0.271484375, + "learning_rate": 7.817351598173516e-07, + "loss": 0.0003, + "reward": 1.5234375, + "reward_std": 0.11048543080687523, + "rewards/accuracy_reward": 0.5234375, + "rewards/format_reward": 1.0, + "step": 478 + }, + { + "clip_ratio": 0.0, + "completion_length": 575.75, + "epoch": 0.6561643835616439, + "grad_norm": 2.7290778160095215, + "kl": 0.269287109375, + "learning_rate": 7.812785388127854e-07, + "loss": 0.0003, + "reward": 1.5729166865348816, + "reward_std": 0.18791258335113525, + "rewards/accuracy_reward": 0.5729166865348816, + "rewards/format_reward": 1.0, + "step": 479 + }, + { + "clip_ratio": 0.0, + "completion_length": 463.03125, + "epoch": 0.6575342465753424, + "grad_norm": 1.3419233560562134, + "kl": 0.322021484375, + "learning_rate": 7.808219178082191e-07, + "loss": 0.0003, + "reward": 1.375, + "reward_std": 0.0883883461356163, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 1.0, + "step": 480 + }, + { + "clip_ratio": 0.0, + "completion_length": 624.03125, + "epoch": 0.6589041095890411, + "grad_norm": 2.2850310802459717, + "kl": 0.2744140625, + "learning_rate": 7.80365296803653e-07, + "loss": 0.0003, + "reward": 1.63671875, + "reward_std": 0.17153325304389, + "rewards/accuracy_reward": 0.63671875, + "rewards/format_reward": 1.0, + "step": 481 + }, + { + "clip_ratio": 0.0, + "completion_length": 606.03125, + "epoch": 0.6602739726027397, + "grad_norm": 1.9118156433105469, + "kl": 0.247802734375, + "learning_rate": 7.799086757990867e-07, + "loss": 0.0002, + "reward": 1.53125, + "reward_std": 0.102588826790452, + "rewards/accuracy_reward": 0.53125, + "rewards/format_reward": 1.0, + "step": 482 + }, + { + "clip_ratio": 0.0, + "completion_length": 325.625, + "epoch": 0.6616438356164384, + "grad_norm": 15.575130462646484, + "kl": 0.304931640625, + "learning_rate": 7.794520547945204e-07, + "loss": 0.0003, + "reward": 1.34375, + "reward_std": 0.18861131370067596, + "rewards/accuracy_reward": 0.34375, + "rewards/format_reward": 1.0, + "step": 483 + }, + { + "clip_ratio": 0.0, + "completion_length": 507.46875, + "epoch": 0.663013698630137, + "grad_norm": 2.0344674587249756, + "kl": 0.3017578125, + "learning_rate": 7.789954337899543e-07, + "loss": 0.0003, + "reward": 1.5234375, + "reward_std": 0.29708079993724823, + "rewards/accuracy_reward": 0.5546875, + "rewards/format_reward": 0.96875, + "step": 484 + }, + { + "clip_ratio": 0.0, + "completion_length": 773.5625, + "epoch": 0.6643835616438356, + "grad_norm": 1.7942973375320435, + "kl": 0.3359375, + "learning_rate": 7.785388127853881e-07, + "loss": 0.0003, + "reward": 1.59375, + "reward_std": 0.1514892801642418, + "rewards/accuracy_reward": 0.59375, + "rewards/format_reward": 1.0, + "step": 485 + }, + { + "clip_ratio": 0.0, + "completion_length": 480.84375, + "epoch": 0.6657534246575343, + "grad_norm": 4.269023418426514, + "kl": 0.285400390625, + "learning_rate": 7.780821917808219e-07, + "loss": 0.0003, + "reward": 1.359375, + "reward_std": 0.43254324793815613, + "rewards/accuracy_reward": 0.390625, + "rewards/format_reward": 0.96875, + "step": 486 + }, + { + "clip_ratio": 0.0, + "completion_length": 642.53125, + "epoch": 0.6671232876712329, + "grad_norm": 1.3577572107315063, + "kl": 0.267578125, + "learning_rate": 7.776255707762557e-07, + "loss": 0.0003, + "reward": 1.453125, + "reward_std": 0.13258251920342445, + "rewards/accuracy_reward": 0.484375, + "rewards/format_reward": 0.96875, + "step": 487 + }, + { + "clip_ratio": 0.0, + "completion_length": 797.6875, + "epoch": 0.6684931506849315, + "grad_norm": 0.8058923482894897, + "kl": 0.247802734375, + "learning_rate": 7.771689497716894e-07, + "loss": 0.0002, + "reward": 1.15625, + "reward_std": 0.0578637570142746, + "rewards/accuracy_reward": 0.15625, + "rewards/format_reward": 1.0, + "step": 488 + }, + { + "clip_ratio": 0.0, + "completion_length": 631.90625, + "epoch": 0.6698630136986301, + "grad_norm": 1.1615171432495117, + "kl": 0.2861328125, + "learning_rate": 7.767123287671233e-07, + "loss": 0.0003, + "reward": 1.3802083432674408, + "reward_std": 0.0725951585918665, + "rewards/accuracy_reward": 0.3802083432674408, + "rewards/format_reward": 1.0, + "step": 489 + }, + { + "clip_ratio": 0.0, + "completion_length": 511.8125, + "epoch": 0.6712328767123288, + "grad_norm": 2.5159947872161865, + "kl": 0.248779296875, + "learning_rate": 7.76255707762557e-07, + "loss": 0.0002, + "reward": 1.2421875, + "reward_std": 0.19887377880513668, + "rewards/accuracy_reward": 0.2734375, + "rewards/format_reward": 0.96875, + "step": 490 + }, + { + "clip_ratio": 0.0, + "completion_length": 630.34375, + "epoch": 0.6726027397260274, + "grad_norm": 3.6898646354675293, + "kl": 0.26513671875, + "learning_rate": 7.757990867579909e-07, + "loss": 0.0003, + "reward": 1.5572916269302368, + "reward_std": 0.22042623907327652, + "rewards/accuracy_reward": 0.5572916567325592, + "rewards/format_reward": 1.0, + "step": 491 + }, + { + "clip_ratio": 0.0, + "completion_length": 443.15625, + "epoch": 0.673972602739726, + "grad_norm": 0.8622454404830933, + "kl": 0.2578125, + "learning_rate": 7.753424657534247e-07, + "loss": 0.0003, + "reward": 1.6875, + "reward_std": 0.06681530922651291, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 1.0, + "step": 492 + }, + { + "clip_ratio": 0.0, + "completion_length": 514.40625, + "epoch": 0.6753424657534246, + "grad_norm": 6.587668418884277, + "kl": 0.26904296875, + "learning_rate": 7.748858447488584e-07, + "loss": 0.0003, + "reward": 1.7760416269302368, + "reward_std": 0.11938937567174435, + "rewards/accuracy_reward": 0.7760416567325592, + "rewards/format_reward": 1.0, + "step": 493 + }, + { + "clip_ratio": 0.0, + "completion_length": 649.5625, + "epoch": 0.6767123287671233, + "grad_norm": 1.662431001663208, + "kl": 0.2685546875, + "learning_rate": 7.744292237442922e-07, + "loss": 0.0003, + "reward": 1.5390625, + "reward_std": 0.15148437581956387, + "rewards/accuracy_reward": 0.5390625, + "rewards/format_reward": 1.0, + "step": 494 + }, + { + "clip_ratio": 0.0, + "completion_length": 841.1875, + "epoch": 0.678082191780822, + "grad_norm": 2.413971424102783, + "kl": 0.218994140625, + "learning_rate": 7.73972602739726e-07, + "loss": 0.0002, + "reward": 1.2734375, + "reward_std": 0.4280551001429558, + "rewards/accuracy_reward": 0.3671875, + "rewards/format_reward": 0.90625, + "step": 495 + }, + { + "clip_ratio": 0.0, + "completion_length": 699.34375, + "epoch": 0.6794520547945205, + "grad_norm": 1.9714837074279785, + "kl": 0.2646484375, + "learning_rate": 7.735159817351597e-07, + "loss": 0.0003, + "reward": 1.4921875, + "reward_std": 0.43332719057798386, + "rewards/accuracy_reward": 0.5859375, + "rewards/format_reward": 0.90625, + "step": 496 + }, + { + "clip_ratio": 0.0, + "completion_length": 355.53125, + "epoch": 0.6808219178082192, + "grad_norm": 5.755354881286621, + "kl": 0.29296875, + "learning_rate": 7.730593607305936e-07, + "loss": 0.0003, + "reward": 1.375, + "reward_std": 0.3945523276925087, + "rewards/accuracy_reward": 0.40625, + "rewards/format_reward": 0.96875, + "step": 497 + }, + { + "clip_ratio": 0.0, + "completion_length": 889.03125, + "epoch": 0.6821917808219178, + "grad_norm": 1.7405959367752075, + "kl": 0.22509765625, + "learning_rate": 7.726027397260274e-07, + "loss": 0.0002, + "reward": 1.7942708134651184, + "reward_std": 0.7046211212873459, + "rewards/accuracy_reward": 0.9192708432674408, + "rewards/format_reward": 0.875, + "step": 498 + }, + { + "clip_ratio": 0.0, + "completion_length": 740.78125, + "epoch": 0.6835616438356165, + "grad_norm": 2.4895272254943848, + "kl": 0.259521484375, + "learning_rate": 7.721461187214611e-07, + "loss": 0.0003, + "reward": 1.3125, + "reward_std": 0.6559129282832146, + "rewards/accuracy_reward": 0.46875, + "rewards/format_reward": 0.84375, + "step": 499 + }, + { + "clip_ratio": 0.0, + "completion_length": 684.9375, + "epoch": 0.684931506849315, + "grad_norm": 1.2708057165145874, + "kl": 0.2705078125, + "learning_rate": 7.71689497716895e-07, + "loss": 0.0003, + "reward": 1.078125, + "reward_std": 0.3463020324707031, + "rewards/accuracy_reward": 0.171875, + "rewards/format_reward": 0.90625, + "step": 500 + }, + { + "clip_ratio": 0.0, + "completion_length": 385.6875, + "epoch": 0.6863013698630137, + "grad_norm": 2.6224188804626465, + "kl": 0.279052734375, + "learning_rate": 7.712328767123287e-07, + "loss": 0.0003, + "reward": 0.96875, + "reward_std": 0.2041158601641655, + "rewards/accuracy_reward": 0.03125, + "rewards/format_reward": 0.9375, + "step": 501 + }, + { + "clip_ratio": 0.0, + "completion_length": 511.8125, + "epoch": 0.6876712328767123, + "grad_norm": 3.9175403118133545, + "kl": 0.35009765625, + "learning_rate": 7.707762557077625e-07, + "loss": 0.0004, + "reward": 1.625, + "reward_std": 0.29514501616358757, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 502 + }, + { + "clip_ratio": 0.0, + "completion_length": 627.9375, + "epoch": 0.689041095890411, + "grad_norm": 6.975749492645264, + "kl": 0.266357421875, + "learning_rate": 7.703196347031963e-07, + "loss": 0.0003, + "reward": 2.0, + "reward_std": 0.45176807790994644, + "rewards/accuracy_reward": 1.03125, + "rewards/format_reward": 0.96875, + "step": 503 + }, + { + "clip_ratio": 0.0, + "completion_length": 349.9375, + "epoch": 0.6904109589041096, + "grad_norm": 5.680784225463867, + "kl": 0.28369140625, + "learning_rate": 7.6986301369863e-07, + "loss": 0.0003, + "reward": 1.46875, + "reward_std": 0.4729364886879921, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.96875, + "step": 504 + }, + { + "clip_ratio": 0.0, + "completion_length": 821.1875, + "epoch": 0.6917808219178082, + "grad_norm": 1.110270380973816, + "kl": 0.23046875, + "learning_rate": 7.69406392694064e-07, + "loss": 0.0002, + "reward": 0.9375, + "reward_std": 0.1157275140285492, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.9375, + "step": 505 + }, + { + "clip_ratio": 0.0, + "completion_length": 656.78125, + "epoch": 0.6931506849315069, + "grad_norm": 1.595416784286499, + "kl": 0.2509765625, + "learning_rate": 7.689497716894977e-07, + "loss": 0.0003, + "reward": 1.8359375, + "reward_std": 0.26719603314995766, + "rewards/accuracy_reward": 0.8359375, + "rewards/format_reward": 1.0, + "step": 506 + }, + { + "clip_ratio": 0.0, + "completion_length": 606.8125, + "epoch": 0.6945205479452055, + "grad_norm": 4.328030586242676, + "kl": 0.26171875, + "learning_rate": 7.684931506849314e-07, + "loss": 0.0003, + "reward": 1.234375, + "reward_std": 0.0289318785071373, + "rewards/accuracy_reward": 0.234375, + "rewards/format_reward": 1.0, + "step": 507 + }, + { + "clip_ratio": 0.0, + "completion_length": 608.65625, + "epoch": 0.6958904109589041, + "grad_norm": 0.007987846620380878, + "kl": 0.261474609375, + "learning_rate": 7.680365296803653e-07, + "loss": 0.0003, + "reward": 1.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 508 + }, + { + "clip_ratio": 0.0, + "completion_length": 501.03125, + "epoch": 0.6972602739726027, + "grad_norm": 3.3370308876037598, + "kl": 0.268310546875, + "learning_rate": 7.67579908675799e-07, + "loss": 0.0003, + "reward": 1.1875, + "reward_std": 0.2619796171784401, + "rewards/accuracy_reward": 0.21875, + "rewards/format_reward": 0.96875, + "step": 509 + }, + { + "clip_ratio": 0.0, + "completion_length": 597.25, + "epoch": 0.6986301369863014, + "grad_norm": 3.816755533218384, + "kl": 0.287109375, + "learning_rate": 7.671232876712328e-07, + "loss": 0.0003, + "reward": 1.09375, + "reward_std": 0.1293872892856598, + "rewards/accuracy_reward": 0.09375, + "rewards/format_reward": 1.0, + "step": 510 + }, + { + "clip_ratio": 0.0, + "completion_length": 431.0, + "epoch": 0.7, + "grad_norm": 0.028308337554335594, + "kl": 0.325439453125, + "learning_rate": 7.666666666666667e-07, + "loss": 0.0003, + "reward": 1.25, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 1.0, + "step": 511 + }, + { + "clip_ratio": 0.0, + "completion_length": 758.375, + "epoch": 0.7013698630136986, + "grad_norm": 2.2149853706359863, + "kl": 0.2958984375, + "learning_rate": 7.662100456621004e-07, + "loss": 0.0003, + "reward": 1.6875, + "reward_std": 0.05892554949969053, + "rewards/accuracy_reward": 0.6874999850988388, + "rewards/format_reward": 1.0, + "step": 512 + }, + { + "clip_ratio": 0.0, + "completion_length": 593.5625, + "epoch": 0.7027397260273973, + "grad_norm": 3.7374677658081055, + "kl": 0.328857421875, + "learning_rate": 7.657534246575343e-07, + "loss": 0.0003, + "reward": 1.7734375, + "reward_std": 0.30776159279048443, + "rewards/accuracy_reward": 0.7734375, + "rewards/format_reward": 1.0, + "step": 513 + }, + { + "clip_ratio": 0.0, + "completion_length": 581.09375, + "epoch": 0.7041095890410959, + "grad_norm": 1.2948936223983765, + "kl": 0.260986328125, + "learning_rate": 7.65296803652968e-07, + "loss": 0.0003, + "reward": 1.6953125, + "reward_std": 0.15467960201203823, + "rewards/accuracy_reward": 0.6953125, + "rewards/format_reward": 1.0, + "step": 514 + }, + { + "clip_ratio": 0.0, + "completion_length": 438.53125, + "epoch": 0.7054794520547946, + "grad_norm": 6.833772659301758, + "kl": 0.3134765625, + "learning_rate": 7.648401826484017e-07, + "loss": 0.0003, + "reward": 1.75, + "reward_std": 0.22778154164552689, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 515 + }, + { + "clip_ratio": 0.0, + "completion_length": 288.03125, + "epoch": 0.7068493150684931, + "grad_norm": 8.628284454345703, + "kl": 0.315185546875, + "learning_rate": 7.643835616438356e-07, + "loss": 0.0003, + "reward": 1.5, + "reward_std": 0.18861131370067596, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 516 + }, + { + "clip_ratio": 0.0, + "completion_length": 709.09375, + "epoch": 0.7082191780821918, + "grad_norm": 3.9952471256256104, + "kl": 0.2568359375, + "learning_rate": 7.639269406392693e-07, + "loss": 0.0003, + "reward": 1.4375, + "reward_std": 0.08548713475465775, + "rewards/accuracy_reward": 0.4374999850988388, + "rewards/format_reward": 1.0, + "step": 517 + }, + { + "clip_ratio": 0.0, + "completion_length": 569.03125, + "epoch": 0.7095890410958904, + "grad_norm": 0.01444815844297409, + "kl": 0.294921875, + "learning_rate": 7.634703196347032e-07, + "loss": 0.0003, + "reward": 1.25, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 1.0, + "step": 518 + }, + { + "clip_ratio": 0.0, + "completion_length": 435.21875, + "epoch": 0.7109589041095891, + "grad_norm": 3.509876251220703, + "kl": 0.31005859375, + "learning_rate": 7.63013698630137e-07, + "loss": 0.0003, + "reward": 1.640625, + "reward_std": 0.3361537680029869, + "rewards/accuracy_reward": 0.640625, + "rewards/format_reward": 1.0, + "step": 519 + }, + { + "clip_ratio": 0.0, + "completion_length": 570.625, + "epoch": 0.7123287671232876, + "grad_norm": 2.812932252883911, + "kl": 0.3154296875, + "learning_rate": 7.625570776255707e-07, + "loss": 0.0003, + "reward": 1.578125, + "reward_std": 0.2109457477927208, + "rewards/accuracy_reward": 0.578125, + "rewards/format_reward": 1.0, + "step": 520 + }, + { + "clip_ratio": 0.0, + "completion_length": 564.75, + "epoch": 0.7136986301369863, + "grad_norm": 1.0527565479278564, + "kl": 0.28515625, + "learning_rate": 7.621004566210046e-07, + "loss": 0.0003, + "reward": 1.21875, + "reward_std": 0.0578637570142746, + "rewards/accuracy_reward": 0.21875, + "rewards/format_reward": 1.0, + "step": 521 + }, + { + "clip_ratio": 0.0, + "completion_length": 583.125, + "epoch": 0.7150684931506849, + "grad_norm": 1.4227604866027832, + "kl": 0.26123046875, + "learning_rate": 7.616438356164383e-07, + "loss": 0.0003, + "reward": 1.6328125, + "reward_std": 0.20593809336423874, + "rewards/accuracy_reward": 0.6328125, + "rewards/format_reward": 1.0, + "step": 522 + }, + { + "clip_ratio": 0.0, + "completion_length": 411.34375, + "epoch": 0.7164383561643836, + "grad_norm": 1.042665719985962, + "kl": 0.3056640625, + "learning_rate": 7.61187214611872e-07, + "loss": 0.0003, + "reward": 1.21875, + "reward_std": 0.0578637570142746, + "rewards/accuracy_reward": 0.21875, + "rewards/format_reward": 1.0, + "step": 523 + }, + { + "clip_ratio": 0.0, + "completion_length": 697.65625, + "epoch": 0.7178082191780822, + "grad_norm": 4.018407344818115, + "kl": 0.2666015625, + "learning_rate": 7.607305936073059e-07, + "loss": 0.0003, + "reward": 1.6953125, + "reward_std": 0.16724733635783195, + "rewards/accuracy_reward": 0.6953125, + "rewards/format_reward": 1.0, + "step": 524 + }, + { + "clip_ratio": 0.0, + "completion_length": 565.78125, + "epoch": 0.7191780821917808, + "grad_norm": 1.4768673181533813, + "kl": 0.28564453125, + "learning_rate": 7.602739726027397e-07, + "loss": 0.0003, + "reward": 1.4114583134651184, + "reward_std": 0.09781630150973797, + "rewards/accuracy_reward": 0.4114583283662796, + "rewards/format_reward": 1.0, + "step": 525 + }, + { + "clip_ratio": 0.0, + "completion_length": 602.15625, + "epoch": 0.7205479452054795, + "grad_norm": 1.8076670169830322, + "kl": 0.254638671875, + "learning_rate": 7.598173515981735e-07, + "loss": 0.0003, + "reward": 1.875, + "reward_std": 0.13363061845302582, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 526 + }, + { + "clip_ratio": 0.0, + "completion_length": 553.625, + "epoch": 0.7219178082191781, + "grad_norm": 2.97518253326416, + "kl": 0.26708984375, + "learning_rate": 7.593607305936073e-07, + "loss": 0.0003, + "reward": 1.59375, + "reward_std": 0.1552036553621292, + "rewards/accuracy_reward": 0.59375, + "rewards/format_reward": 1.0, + "step": 527 + }, + { + "clip_ratio": 0.0, + "completion_length": 690.125, + "epoch": 0.7232876712328767, + "grad_norm": 3.0159289836883545, + "kl": 0.29248046875, + "learning_rate": 7.58904109589041e-07, + "loss": 0.0003, + "reward": 1.6875, + "reward_std": 0.3495672009885311, + "rewards/accuracy_reward": 0.71875, + "rewards/format_reward": 0.96875, + "step": 528 + }, + { + "clip_ratio": 0.0, + "completion_length": 547.53125, + "epoch": 0.7246575342465753, + "grad_norm": 0.024200087413191795, + "kl": 0.3154296875, + "learning_rate": 7.584474885844749e-07, + "loss": 0.0003, + "reward": 1.125, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 1.0, + "step": 529 + }, + { + "clip_ratio": 0.0, + "completion_length": 559.3125, + "epoch": 0.726027397260274, + "grad_norm": 16.51392936706543, + "kl": 0.2783203125, + "learning_rate": 7.579908675799086e-07, + "loss": 0.0003, + "reward": 1.7708333134651184, + "reward_std": 0.24923127330839634, + "rewards/accuracy_reward": 0.7708333283662796, + "rewards/format_reward": 1.0, + "step": 530 + }, + { + "clip_ratio": 0.0, + "completion_length": 670.75, + "epoch": 0.7273972602739726, + "grad_norm": 2.6126623153686523, + "kl": 0.275390625, + "learning_rate": 7.575342465753424e-07, + "loss": 0.0003, + "reward": 1.7552083432674408, + "reward_std": 0.14971192181110382, + "rewards/accuracy_reward": 0.7552083432674408, + "rewards/format_reward": 1.0, + "step": 531 + }, + { + "clip_ratio": 0.0, + "completion_length": 566.21875, + "epoch": 0.7287671232876712, + "grad_norm": 1.1948318481445312, + "kl": 0.2802734375, + "learning_rate": 7.570776255707763e-07, + "loss": 0.0003, + "reward": 1.4375, + "reward_std": 0.1157275140285492, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 532 + }, + { + "clip_ratio": 0.0, + "completion_length": 527.375, + "epoch": 0.7301369863013699, + "grad_norm": 23.645620346069336, + "kl": 0.320556640625, + "learning_rate": 7.5662100456621e-07, + "loss": 0.0003, + "reward": 1.46875, + "reward_std": 0.0883883461356163, + "rewards/accuracy_reward": 0.46875, + "rewards/format_reward": 1.0, + "step": 533 + }, + { + "clip_ratio": 0.0, + "completion_length": 403.96875, + "epoch": 0.7315068493150685, + "grad_norm": 3.456782579421997, + "kl": 0.36669921875, + "learning_rate": 7.561643835616438e-07, + "loss": 0.0004, + "reward": 1.6354166269302368, + "reward_std": 0.20183072239160538, + "rewards/accuracy_reward": 0.6354166865348816, + "rewards/format_reward": 1.0, + "step": 534 + }, + { + "clip_ratio": 0.0, + "completion_length": 277.625, + "epoch": 0.7328767123287672, + "grad_norm": 2.4733662605285645, + "kl": 0.30615234375, + "learning_rate": 7.557077625570776e-07, + "loss": 0.0003, + "reward": 1.15625, + "reward_std": 0.1293872892856598, + "rewards/accuracy_reward": 0.15625, + "rewards/format_reward": 1.0, + "step": 535 + }, + { + "clip_ratio": 0.0, + "completion_length": 543.875, + "epoch": 0.7342465753424657, + "grad_norm": 3.866555690765381, + "kl": 0.3017578125, + "learning_rate": 7.552511415525113e-07, + "loss": 0.0003, + "reward": 1.875, + "reward_std": 0.2177756354212761, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 536 + }, + { + "clip_ratio": 0.0, + "completion_length": 422.59375, + "epoch": 0.7356164383561644, + "grad_norm": 85.90696716308594, + "kl": 0.3330078125, + "learning_rate": 7.547945205479452e-07, + "loss": 0.0003, + "reward": 1.75, + "reward_std": 0.22461533173918724, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 537 + }, + { + "clip_ratio": 0.0, + "completion_length": 305.5625, + "epoch": 0.736986301369863, + "grad_norm": 5.220126628875732, + "kl": 0.30078125, + "learning_rate": 7.54337899543379e-07, + "loss": 0.0003, + "reward": 1.3125, + "reward_std": 0.20044592767953873, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 1.0, + "step": 538 + }, + { + "clip_ratio": 0.0, + "completion_length": 261.40625, + "epoch": 0.7383561643835617, + "grad_norm": 5.28268575668335, + "kl": 0.310546875, + "learning_rate": 7.538812785388127e-07, + "loss": 0.0003, + "reward": 1.7890625, + "reward_std": 0.26380185037851334, + "rewards/accuracy_reward": 0.7890625, + "rewards/format_reward": 1.0, + "step": 539 + }, + { + "clip_ratio": 0.0, + "completion_length": 691.09375, + "epoch": 0.7397260273972602, + "grad_norm": 2.411632537841797, + "kl": 0.29296875, + "learning_rate": 7.534246575342466e-07, + "loss": 0.0003, + "reward": 1.65625, + "reward_std": 0.1552036553621292, + "rewards/accuracy_reward": 0.65625, + "rewards/format_reward": 1.0, + "step": 540 + }, + { + "clip_ratio": 0.0, + "completion_length": 563.8125, + "epoch": 0.7410958904109589, + "grad_norm": 1.7370656728744507, + "kl": 0.31689453125, + "learning_rate": 7.529680365296803e-07, + "loss": 0.0003, + "reward": 1.375, + "reward_std": 0.13363061845302582, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 1.0, + "step": 541 + }, + { + "clip_ratio": 0.0, + "completion_length": 556.75, + "epoch": 0.7424657534246575, + "grad_norm": 1.8685580492019653, + "kl": 0.2685546875, + "learning_rate": 7.525114155251141e-07, + "loss": 0.0003, + "reward": 1.5052083432674408, + "reward_std": 0.1630059964954853, + "rewards/accuracy_reward": 0.5052083283662796, + "rewards/format_reward": 1.0, + "step": 542 + }, + { + "clip_ratio": 0.0, + "completion_length": 416.1875, + "epoch": 0.7438356164383562, + "grad_norm": 4.826626777648926, + "kl": 0.28369140625, + "learning_rate": 7.520547945205479e-07, + "loss": 0.0003, + "reward": 1.546875, + "reward_std": 0.319402813911438, + "rewards/accuracy_reward": 0.546875, + "rewards/format_reward": 1.0, + "step": 543 + }, + { + "clip_ratio": 0.0, + "completion_length": 549.8125, + "epoch": 0.7452054794520548, + "grad_norm": 1.5942007303237915, + "kl": 0.27001953125, + "learning_rate": 7.515981735159816e-07, + "loss": 0.0003, + "reward": 1.4375, + "reward_std": 0.06681530922651291, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 544 + }, + { + "clip_ratio": 0.0, + "completion_length": 548.1875, + "epoch": 0.7465753424657534, + "grad_norm": 0.8855934739112854, + "kl": 0.2890625, + "learning_rate": 7.511415525114156e-07, + "loss": 0.0003, + "reward": 1.4453125, + "reward_std": 0.0521576851606369, + "rewards/accuracy_reward": 0.4453125, + "rewards/format_reward": 1.0, + "step": 545 + }, + { + "clip_ratio": 0.0, + "completion_length": 416.75, + "epoch": 0.7479452054794521, + "grad_norm": 9.443655967712402, + "kl": 0.31640625, + "learning_rate": 7.506849315068493e-07, + "loss": 0.0003, + "reward": 1.53125, + "reward_std": 0.17358146235346794, + "rewards/accuracy_reward": 0.53125, + "rewards/format_reward": 1.0, + "step": 546 + }, + { + "clip_ratio": 0.0, + "completion_length": 559.25, + "epoch": 0.7493150684931507, + "grad_norm": 1.2628391981124878, + "kl": 0.2822265625, + "learning_rate": 7.50228310502283e-07, + "loss": 0.0003, + "reward": 1.478124976158142, + "reward_std": 0.08902433887124062, + "rewards/accuracy_reward": 0.4781249761581421, + "rewards/format_reward": 1.0, + "step": 547 + }, + { + "clip_ratio": 0.0, + "completion_length": 683.21875, + "epoch": 0.7506849315068493, + "grad_norm": 2.95210599899292, + "kl": 0.349609375, + "learning_rate": 7.497716894977169e-07, + "loss": 0.0004, + "reward": 1.7135416865348816, + "reward_std": 0.18521836958825588, + "rewards/accuracy_reward": 0.7135416716337204, + "rewards/format_reward": 1.0, + "step": 548 + }, + { + "clip_ratio": 0.0, + "completion_length": 717.96875, + "epoch": 0.7520547945205479, + "grad_norm": 17.074256896972656, + "kl": 0.2978515625, + "learning_rate": 7.493150684931506e-07, + "loss": 0.0003, + "reward": 1.6614583134651184, + "reward_std": 0.1296813301742077, + "rewards/accuracy_reward": 0.6614583432674408, + "rewards/format_reward": 1.0, + "step": 549 + }, + { + "clip_ratio": 0.0, + "completion_length": 460.6875, + "epoch": 0.7534246575342466, + "grad_norm": 3.607158660888672, + "kl": 0.283203125, + "learning_rate": 7.488584474885844e-07, + "loss": 0.0003, + "reward": 1.640625, + "reward_std": 0.3824852555990219, + "rewards/accuracy_reward": 0.671875, + "rewards/format_reward": 0.96875, + "step": 550 + }, + { + "clip_ratio": 0.0, + "completion_length": 683.0, + "epoch": 0.7547945205479452, + "grad_norm": 1.7502480745315552, + "kl": 0.26708984375, + "learning_rate": 7.484018264840183e-07, + "loss": 0.0003, + "reward": 1.6238839328289032, + "reward_std": 0.12080634757876396, + "rewards/accuracy_reward": 0.623883917927742, + "rewards/format_reward": 1.0, + "step": 551 + }, + { + "clip_ratio": 0.0, + "completion_length": 528.1875, + "epoch": 0.7561643835616438, + "grad_norm": 6.180083274841309, + "kl": 0.29345703125, + "learning_rate": 7.47945205479452e-07, + "loss": 0.0003, + "reward": 1.8828125, + "reward_std": 0.31077960692346096, + "rewards/accuracy_reward": 0.8828124850988388, + "rewards/format_reward": 1.0, + "step": 552 + }, + { + "clip_ratio": 0.0, + "completion_length": 549.1875, + "epoch": 0.7575342465753425, + "grad_norm": 0.9904180765151978, + "kl": 0.2626953125, + "learning_rate": 7.474885844748859e-07, + "loss": 0.0003, + "reward": 1.546875, + "reward_std": 0.0646936446428299, + "rewards/accuracy_reward": 0.546875, + "rewards/format_reward": 1.0, + "step": 553 + }, + { + "clip_ratio": 0.0, + "completion_length": 716.78125, + "epoch": 0.7589041095890411, + "grad_norm": 2.229074478149414, + "kl": 0.24365234375, + "learning_rate": 7.470319634703196e-07, + "loss": 0.0002, + "reward": 1.8260416388511658, + "reward_std": 0.1378844864666462, + "rewards/accuracy_reward": 0.826041653752327, + "rewards/format_reward": 1.0, + "step": 554 + }, + { + "clip_ratio": 0.0, + "completion_length": 594.5625, + "epoch": 0.7602739726027398, + "grad_norm": 4.273021697998047, + "kl": 0.277587890625, + "learning_rate": 7.465753424657533e-07, + "loss": 0.0003, + "reward": 1.8828125, + "reward_std": 0.15702588856220245, + "rewards/accuracy_reward": 0.8828125, + "rewards/format_reward": 1.0, + "step": 555 + }, + { + "clip_ratio": 0.0, + "completion_length": 304.03125, + "epoch": 0.7616438356164383, + "grad_norm": 0.02915891632437706, + "kl": 0.322265625, + "learning_rate": 7.461187214611872e-07, + "loss": 0.0003, + "reward": 1.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 556 + }, + { + "clip_ratio": 0.0, + "completion_length": 565.90625, + "epoch": 0.763013698630137, + "grad_norm": 1.1693438291549683, + "kl": 0.29638671875, + "learning_rate": 7.456621004566209e-07, + "loss": 0.0003, + "reward": 1.34375, + "reward_std": 0.06681530922651291, + "rewards/accuracy_reward": 0.34375, + "rewards/format_reward": 1.0, + "step": 557 + }, + { + "clip_ratio": 0.0, + "completion_length": 465.09375, + "epoch": 0.7643835616438356, + "grad_norm": 0.9773749113082886, + "kl": 0.3115234375, + "learning_rate": 7.452054794520548e-07, + "loss": 0.0003, + "reward": 1.3046875, + "reward_std": 0.09704047441482544, + "rewards/accuracy_reward": 0.3046875, + "rewards/format_reward": 1.0, + "step": 558 + }, + { + "clip_ratio": 0.0, + "completion_length": 331.03125, + "epoch": 0.7657534246575343, + "grad_norm": 6.028942108154297, + "kl": 0.28271484375, + "learning_rate": 7.447488584474886e-07, + "loss": 0.0003, + "reward": 1.765625, + "reward_std": 0.10698894783854485, + "rewards/accuracy_reward": 0.765625, + "rewards/format_reward": 1.0, + "step": 559 + }, + { + "clip_ratio": 0.0, + "completion_length": 469.125, + "epoch": 0.7671232876712328, + "grad_norm": 6.394768714904785, + "kl": 0.3134765625, + "learning_rate": 7.442922374429223e-07, + "loss": 0.0003, + "reward": 1.609375, + "reward_std": 0.17236988618969917, + "rewards/accuracy_reward": 0.609375, + "rewards/format_reward": 1.0, + "step": 560 + }, + { + "clip_ratio": 0.0, + "completion_length": 768.28125, + "epoch": 0.7684931506849315, + "grad_norm": 1.2454359531402588, + "kl": 0.275390625, + "learning_rate": 7.438356164383562e-07, + "loss": 0.0003, + "reward": 1.4140625, + "reward_std": 0.09704046696424484, + "rewards/accuracy_reward": 0.4140625, + "rewards/format_reward": 1.0, + "step": 561 + }, + { + "clip_ratio": 0.0, + "completion_length": 476.40625, + "epoch": 0.7698630136986301, + "grad_norm": 4.182791233062744, + "kl": 0.27001953125, + "learning_rate": 7.433789954337899e-07, + "loss": 0.0003, + "reward": 1.484375, + "reward_std": 0.23144521936774254, + "rewards/accuracy_reward": 0.484375, + "rewards/format_reward": 1.0, + "step": 562 + }, + { + "clip_ratio": 0.0, + "completion_length": 430.8125, + "epoch": 0.7712328767123288, + "grad_norm": 2.8904168605804443, + "kl": 0.3212890625, + "learning_rate": 7.429223744292236e-07, + "loss": 0.0003, + "reward": 1.40625, + "reward_std": 0.1293872892856598, + "rewards/accuracy_reward": 0.40625, + "rewards/format_reward": 1.0, + "step": 563 + }, + { + "clip_ratio": 0.0, + "completion_length": 630.15625, + "epoch": 0.7726027397260274, + "grad_norm": 10.556282997131348, + "kl": 0.28759765625, + "learning_rate": 7.424657534246575e-07, + "loss": 0.0003, + "reward": 1.46875, + "reward_std": 0.047245558351278305, + "rewards/accuracy_reward": 0.46875, + "rewards/format_reward": 1.0, + "step": 564 + }, + { + "clip_ratio": 0.0, + "completion_length": 788.96875, + "epoch": 0.773972602739726, + "grad_norm": 2.3608124256134033, + "kl": 0.26708984375, + "learning_rate": 7.420091324200913e-07, + "loss": 0.0003, + "reward": 1.4427083730697632, + "reward_std": 0.10979808866977692, + "rewards/accuracy_reward": 0.4427083432674408, + "rewards/format_reward": 1.0, + "step": 565 + }, + { + "clip_ratio": 0.0, + "completion_length": 600.4375, + "epoch": 0.7753424657534247, + "grad_norm": 1.940722942352295, + "kl": 0.26904296875, + "learning_rate": 7.415525114155251e-07, + "loss": 0.0003, + "reward": 1.78125, + "reward_std": 0.16675157472491264, + "rewards/accuracy_reward": 0.78125, + "rewards/format_reward": 1.0, + "step": 566 + }, + { + "clip_ratio": 0.0, + "completion_length": 606.8125, + "epoch": 0.7767123287671233, + "grad_norm": 1.715049147605896, + "kl": 0.27685546875, + "learning_rate": 7.410958904109589e-07, + "loss": 0.0003, + "reward": 1.62109375, + "reward_std": 0.20764102414250374, + "rewards/accuracy_reward": 0.65234375, + "rewards/format_reward": 0.96875, + "step": 567 + }, + { + "clip_ratio": 0.0, + "completion_length": 634.90625, + "epoch": 0.7780821917808219, + "grad_norm": 2.0080249309539795, + "kl": 0.255859375, + "learning_rate": 7.406392694063926e-07, + "loss": 0.0003, + "reward": 1.4375, + "reward_std": 0.1462521031498909, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 568 + }, + { + "clip_ratio": 0.0, + "completion_length": 792.75, + "epoch": 0.7794520547945205, + "grad_norm": 1.9278701543807983, + "kl": 0.27001953125, + "learning_rate": 7.401826484018265e-07, + "loss": 0.0003, + "reward": 1.375, + "reward_std": 0.07312605157494545, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 1.0, + "step": 569 + }, + { + "clip_ratio": 0.0, + "completion_length": 298.28125, + "epoch": 0.7808219178082192, + "grad_norm": 0.017607873305678368, + "kl": 0.31640625, + "learning_rate": 7.397260273972602e-07, + "loss": 0.0003, + "reward": 1.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 570 + }, + { + "clip_ratio": 0.0, + "completion_length": 608.0625, + "epoch": 0.7821917808219178, + "grad_norm": 4.558350086212158, + "kl": 0.3037109375, + "learning_rate": 7.39269406392694e-07, + "loss": 0.0003, + "reward": 1.25, + "reward_std": 0.2925042062997818, + "rewards/accuracy_reward": 0.28125, + "rewards/format_reward": 0.96875, + "step": 571 + }, + { + "clip_ratio": 0.0, + "completion_length": 448.625, + "epoch": 0.7835616438356164, + "grad_norm": 5.511167049407959, + "kl": 0.361328125, + "learning_rate": 7.388127853881279e-07, + "loss": 0.0004, + "reward": 1.6484375, + "reward_std": 0.2647695615887642, + "rewards/accuracy_reward": 0.6796875, + "rewards/format_reward": 0.96875, + "step": 572 + }, + { + "clip_ratio": 0.0, + "completion_length": 610.5, + "epoch": 0.7849315068493151, + "grad_norm": 3.1493077278137207, + "kl": 0.3115234375, + "learning_rate": 7.383561643835616e-07, + "loss": 0.0003, + "reward": 1.4453125, + "reward_std": 0.09021057933568954, + "rewards/accuracy_reward": 0.4453125, + "rewards/format_reward": 1.0, + "step": 573 + }, + { + "clip_ratio": 0.0, + "completion_length": 659.9375, + "epoch": 0.7863013698630137, + "grad_norm": 2.7337605953216553, + "kl": 0.266357421875, + "learning_rate": 7.378995433789954e-07, + "loss": 0.0003, + "reward": 1.5859375, + "reward_std": 0.4949648827314377, + "rewards/accuracy_reward": 0.7109375, + "rewards/format_reward": 0.875, + "step": 574 + }, + { + "clip_ratio": 0.0, + "completion_length": 803.21875, + "epoch": 0.7876712328767124, + "grad_norm": 1.5574761629104614, + "kl": 0.269775390625, + "learning_rate": 7.374429223744292e-07, + "loss": 0.0003, + "reward": 1.703125, + "reward_std": 0.3444985970854759, + "rewards/accuracy_reward": 0.734375, + "rewards/format_reward": 0.96875, + "step": 575 + }, + { + "clip_ratio": 0.0, + "completion_length": 594.21875, + "epoch": 0.7890410958904109, + "grad_norm": 2.1322855949401855, + "kl": 0.28125, + "learning_rate": 7.369863013698629e-07, + "loss": 0.0003, + "reward": 1.6328125, + "reward_std": 0.38375694304704666, + "rewards/accuracy_reward": 0.6640625, + "rewards/format_reward": 0.96875, + "step": 576 + }, + { + "clip_ratio": 0.0, + "completion_length": 811.1875, + "epoch": 0.7904109589041096, + "grad_norm": 2.1284778118133545, + "kl": 0.27099609375, + "learning_rate": 7.365296803652968e-07, + "loss": 0.0003, + "reward": 1.65625, + "reward_std": 0.2121911160647869, + "rewards/accuracy_reward": 0.6875000298023224, + "rewards/format_reward": 0.96875, + "step": 577 + }, + { + "clip_ratio": 0.0, + "completion_length": 432.84375, + "epoch": 0.7917808219178082, + "grad_norm": 2.501584529876709, + "kl": 0.2880859375, + "learning_rate": 7.360730593607306e-07, + "loss": 0.0003, + "reward": 1.7604166269302368, + "reward_std": 0.19149437546730042, + "rewards/accuracy_reward": 0.7604166269302368, + "rewards/format_reward": 1.0, + "step": 578 + }, + { + "clip_ratio": 0.0, + "completion_length": 709.5625, + "epoch": 0.7931506849315069, + "grad_norm": 2.271083354949951, + "kl": 0.2861328125, + "learning_rate": 7.356164383561643e-07, + "loss": 0.0003, + "reward": 1.7109375, + "reward_std": 0.29368035681545734, + "rewards/accuracy_reward": 0.7421875, + "rewards/format_reward": 0.96875, + "step": 579 + }, + { + "clip_ratio": 0.0, + "completion_length": 570.21875, + "epoch": 0.7945205479452054, + "grad_norm": 0.75428706407547, + "kl": 0.2734375, + "learning_rate": 7.351598173515982e-07, + "loss": 0.0003, + "reward": 1.3828125, + "reward_std": 0.26397860050201416, + "rewards/accuracy_reward": 0.4453125, + "rewards/format_reward": 0.9375, + "step": 580 + }, + { + "clip_ratio": 0.0, + "completion_length": 551.28125, + "epoch": 0.7958904109589041, + "grad_norm": 6.066575527191162, + "kl": 0.29150390625, + "learning_rate": 7.347031963470319e-07, + "loss": 0.0003, + "reward": 2.0625, + "reward_std": 0.21965250372886658, + "rewards/accuracy_reward": 1.0625, + "rewards/format_reward": 1.0, + "step": 581 + }, + { + "clip_ratio": 0.0, + "completion_length": 565.875, + "epoch": 0.7972602739726027, + "grad_norm": 2.4375851154327393, + "kl": 0.28125, + "learning_rate": 7.342465753424657e-07, + "loss": 0.0003, + "reward": 1.7734375, + "reward_std": 0.14333692379295826, + "rewards/accuracy_reward": 0.7734375, + "rewards/format_reward": 1.0, + "step": 582 + }, + { + "clip_ratio": 0.0, + "completion_length": 560.75, + "epoch": 0.7986301369863014, + "grad_norm": 4.929551601409912, + "kl": 0.30712890625, + "learning_rate": 7.337899543378995e-07, + "loss": 0.0003, + "reward": 1.5885416865348816, + "reward_std": 0.33561520278453827, + "rewards/accuracy_reward": 0.6197916716337204, + "rewards/format_reward": 0.96875, + "step": 583 + }, + { + "clip_ratio": 0.0, + "completion_length": 583.96875, + "epoch": 0.8, + "grad_norm": 3.666161298751831, + "kl": 0.294921875, + "learning_rate": 7.333333333333332e-07, + "loss": 0.0003, + "reward": 1.6875, + "reward_std": 0.20616560243070126, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 1.0, + "step": 584 + }, + { + "clip_ratio": 0.0, + "completion_length": 237.90625, + "epoch": 0.8013698630136986, + "grad_norm": 3.3543715476989746, + "kl": 0.31201171875, + "learning_rate": 7.328767123287672e-07, + "loss": 0.0003, + "reward": 1.4140625, + "reward_std": 0.22327817976474762, + "rewards/accuracy_reward": 0.4140625, + "rewards/format_reward": 1.0, + "step": 585 + }, + { + "clip_ratio": 0.0, + "completion_length": 377.5625, + "epoch": 0.8027397260273973, + "grad_norm": 4.409224987030029, + "kl": 0.28857421875, + "learning_rate": 7.324200913242009e-07, + "loss": 0.0003, + "reward": 1.875, + "reward_std": 0.1934976615011692, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 586 + }, + { + "clip_ratio": 0.0, + "completion_length": 545.78125, + "epoch": 0.8041095890410959, + "grad_norm": 2.494868278503418, + "kl": 0.3046875, + "learning_rate": 7.319634703196346e-07, + "loss": 0.0003, + "reward": 1.703125, + "reward_std": 0.1157275140285492, + "rewards/accuracy_reward": 0.703125, + "rewards/format_reward": 1.0, + "step": 587 + }, + { + "clip_ratio": 0.0, + "completion_length": 409.3125, + "epoch": 0.8054794520547945, + "grad_norm": 4.713694095611572, + "kl": 0.3271484375, + "learning_rate": 7.315068493150685e-07, + "loss": 0.0003, + "reward": 1.21875, + "reward_std": 0.29305070638656616, + "rewards/accuracy_reward": 0.28125, + "rewards/format_reward": 0.9375, + "step": 588 + }, + { + "clip_ratio": 0.0, + "completion_length": 567.4375, + "epoch": 0.8068493150684931, + "grad_norm": 4.22615909576416, + "kl": 0.288818359375, + "learning_rate": 7.310502283105022e-07, + "loss": 0.0003, + "reward": 1.6979166269302368, + "reward_std": 0.14746366813778877, + "rewards/accuracy_reward": 0.6979166567325592, + "rewards/format_reward": 1.0, + "step": 589 + }, + { + "clip_ratio": 0.0, + "completion_length": 515.46875, + "epoch": 0.8082191780821918, + "grad_norm": 3.3240842819213867, + "kl": 0.29296875, + "learning_rate": 7.30593607305936e-07, + "loss": 0.0003, + "reward": 1.8515625, + "reward_std": 0.07912752032279968, + "rewards/accuracy_reward": 0.8515625, + "rewards/format_reward": 1.0, + "step": 590 + }, + { + "clip_ratio": 0.0, + "completion_length": 371.59375, + "epoch": 0.8095890410958904, + "grad_norm": 2.9559454917907715, + "kl": 0.45556640625, + "learning_rate": 7.301369863013699e-07, + "loss": 0.0005, + "reward": 1.3984375, + "reward_std": 0.08679073117673397, + "rewards/accuracy_reward": 0.3984375, + "rewards/format_reward": 1.0, + "step": 591 + }, + { + "clip_ratio": 0.0, + "completion_length": 698.65625, + "epoch": 0.810958904109589, + "grad_norm": 6.2642693519592285, + "kl": 0.29150390625, + "learning_rate": 7.296803652968036e-07, + "loss": 0.0003, + "reward": 1.390625, + "reward_std": 0.1825350895524025, + "rewards/accuracy_reward": 0.421875, + "rewards/format_reward": 0.96875, + "step": 592 + }, + { + "clip_ratio": 0.0, + "completion_length": 542.78125, + "epoch": 0.8123287671232877, + "grad_norm": 1.58978271484375, + "kl": 0.2802734375, + "learning_rate": 7.292237442922375e-07, + "loss": 0.0003, + "reward": 1.453125, + "reward_std": 0.10669417306780815, + "rewards/accuracy_reward": 0.453125, + "rewards/format_reward": 1.0, + "step": 593 + }, + { + "clip_ratio": 0.0, + "completion_length": 551.75, + "epoch": 0.8136986301369863, + "grad_norm": 3.9518697261810303, + "kl": 0.2890625, + "learning_rate": 7.287671232876712e-07, + "loss": 0.0003, + "reward": 1.9453125, + "reward_std": 0.26032389141619205, + "rewards/accuracy_reward": 0.9453125, + "rewards/format_reward": 1.0, + "step": 594 + }, + { + "clip_ratio": 0.0, + "completion_length": 398.9375, + "epoch": 0.815068493150685, + "grad_norm": 2.4345860481262207, + "kl": 0.30078125, + "learning_rate": 7.283105022831049e-07, + "loss": 0.0003, + "reward": 1.390625, + "reward_std": 0.17358146235346794, + "rewards/accuracy_reward": 0.390625, + "rewards/format_reward": 1.0, + "step": 595 + }, + { + "clip_ratio": 0.0, + "completion_length": 566.90625, + "epoch": 0.8164383561643835, + "grad_norm": 1.7585220336914062, + "kl": 0.260009765625, + "learning_rate": 7.278538812785388e-07, + "loss": 0.0003, + "reward": 1.7447916269302368, + "reward_std": 0.11100948229432106, + "rewards/accuracy_reward": 0.7447916567325592, + "rewards/format_reward": 1.0, + "step": 596 + }, + { + "clip_ratio": 0.0, + "completion_length": 414.03125, + "epoch": 0.8178082191780822, + "grad_norm": 10.006606101989746, + "kl": 0.2744140625, + "learning_rate": 7.273972602739725e-07, + "loss": 0.0003, + "reward": 1.3828125, + "reward_std": 0.08679073117673397, + "rewards/accuracy_reward": 0.3828125, + "rewards/format_reward": 1.0, + "step": 597 + }, + { + "clip_ratio": 0.0, + "completion_length": 571.84375, + "epoch": 0.8191780821917808, + "grad_norm": 3.1597371101379395, + "kl": 0.293701171875, + "learning_rate": 7.269406392694064e-07, + "loss": 0.0003, + "reward": 1.6015625, + "reward_std": 0.1936504878103733, + "rewards/accuracy_reward": 0.6015625, + "rewards/format_reward": 1.0, + "step": 598 + }, + { + "clip_ratio": 0.0, + "completion_length": 715.8125, + "epoch": 0.8205479452054795, + "grad_norm": 1.5528578758239746, + "kl": 0.28076171875, + "learning_rate": 7.264840182648402e-07, + "loss": 0.0003, + "reward": 1.4765625, + "reward_std": 0.07860555313527584, + "rewards/accuracy_reward": 0.4765625, + "rewards/format_reward": 1.0, + "step": 599 + }, + { + "clip_ratio": 0.0, + "completion_length": 751.15625, + "epoch": 0.821917808219178, + "grad_norm": 1.6795881986618042, + "kl": 0.312744140625, + "learning_rate": 7.260273972602739e-07, + "loss": 0.0003, + "reward": 1.875, + "reward_std": 0.21018434315919876, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 600 + }, + { + "clip_ratio": 0.0, + "completion_length": 279.125, + "epoch": 0.8232876712328767, + "grad_norm": 1.0556623935699463, + "kl": 0.32861328125, + "learning_rate": 7.255707762557078e-07, + "loss": 0.0003, + "reward": 1.171875, + "reward_std": 0.0289318785071373, + "rewards/accuracy_reward": 0.171875, + "rewards/format_reward": 1.0, + "step": 601 + }, + { + "clip_ratio": 0.0, + "completion_length": 638.03125, + "epoch": 0.8246575342465754, + "grad_norm": 4.618863105773926, + "kl": 0.28955078125, + "learning_rate": 7.251141552511415e-07, + "loss": 0.0003, + "reward": 1.609375, + "reward_std": 0.3098084628582001, + "rewards/accuracy_reward": 0.609375, + "rewards/format_reward": 1.0, + "step": 602 + }, + { + "clip_ratio": 0.0, + "completion_length": 724.4375, + "epoch": 0.826027397260274, + "grad_norm": 3.4050896167755127, + "kl": 0.2763671875, + "learning_rate": 7.246575342465752e-07, + "loss": 0.0003, + "reward": 1.6015625, + "reward_std": 0.1325697861611843, + "rewards/accuracy_reward": 0.6015625, + "rewards/format_reward": 1.0, + "step": 603 + }, + { + "clip_ratio": 0.0, + "completion_length": 779.09375, + "epoch": 0.8273972602739726, + "grad_norm": 2.157498836517334, + "kl": 0.27197265625, + "learning_rate": 7.242009132420091e-07, + "loss": 0.0003, + "reward": 2.125, + "reward_std": 0.4762524124234915, + "rewards/accuracy_reward": 1.15625, + "rewards/format_reward": 0.96875, + "step": 604 + }, + { + "clip_ratio": 0.0, + "completion_length": 460.0, + "epoch": 0.8287671232876712, + "grad_norm": 2.8244545459747314, + "kl": 0.29443359375, + "learning_rate": 7.237442922374429e-07, + "loss": 0.0003, + "reward": 1.671875, + "reward_std": 0.22097086161375046, + "rewards/accuracy_reward": 0.671875, + "rewards/format_reward": 1.0, + "step": 605 + }, + { + "clip_ratio": 0.0, + "completion_length": 597.0, + "epoch": 0.8301369863013699, + "grad_norm": 3.9018266201019287, + "kl": 0.28271484375, + "learning_rate": 7.232876712328767e-07, + "loss": 0.0003, + "reward": 1.5736607313156128, + "reward_std": 0.1770894043147564, + "rewards/accuracy_reward": 0.5736607313156128, + "rewards/format_reward": 1.0, + "step": 606 + }, + { + "clip_ratio": 0.0, + "completion_length": 425.84375, + "epoch": 0.8315068493150685, + "grad_norm": 3.512537956237793, + "kl": 0.3349609375, + "learning_rate": 7.228310502283105e-07, + "loss": 0.0003, + "reward": 1.375, + "reward_std": 0.1472245752811432, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 1.0, + "step": 607 + }, + { + "clip_ratio": 0.0, + "completion_length": 641.84375, + "epoch": 0.8328767123287671, + "grad_norm": 2.6885643005371094, + "kl": 0.28564453125, + "learning_rate": 7.223744292237442e-07, + "loss": 0.0003, + "reward": 1.4765625, + "reward_std": 0.29774628579616547, + "rewards/accuracy_reward": 0.5390625, + "rewards/format_reward": 0.9375, + "step": 608 + }, + { + "clip_ratio": 0.0, + "completion_length": 468.8125, + "epoch": 0.8342465753424657, + "grad_norm": 2.3910205364227295, + "kl": 0.345947265625, + "learning_rate": 7.219178082191781e-07, + "loss": 0.0003, + "reward": 1.140625, + "reward_std": 0.3463020324707031, + "rewards/accuracy_reward": 0.203125, + "rewards/format_reward": 0.9375, + "step": 609 + }, + { + "clip_ratio": 0.0, + "completion_length": 475.6875, + "epoch": 0.8356164383561644, + "grad_norm": 1.3065999746322632, + "kl": 0.326171875, + "learning_rate": 7.214611872146118e-07, + "loss": 0.0003, + "reward": 1.390625, + "reward_std": 0.08456665836274624, + "rewards/accuracy_reward": 0.390625, + "rewards/format_reward": 1.0, + "step": 610 + }, + { + "clip_ratio": 0.0, + "completion_length": 621.9375, + "epoch": 0.836986301369863, + "grad_norm": 1.250556468963623, + "kl": 0.34521484375, + "learning_rate": 7.210045662100456e-07, + "loss": 0.0003, + "reward": 1.359375, + "reward_std": 0.27564920112490654, + "rewards/accuracy_reward": 0.421875, + "rewards/format_reward": 0.9375, + "step": 611 + }, + { + "clip_ratio": 0.0, + "completion_length": 808.1875, + "epoch": 0.8383561643835616, + "grad_norm": 1.4332400560379028, + "kl": 0.263671875, + "learning_rate": 7.205479452054795e-07, + "loss": 0.0003, + "reward": 1.8890624940395355, + "reward_std": 0.47034546732902527, + "rewards/accuracy_reward": 0.9828124940395355, + "rewards/format_reward": 0.90625, + "step": 612 + }, + { + "clip_ratio": 0.0, + "completion_length": 819.65625, + "epoch": 0.8397260273972603, + "grad_norm": 1.6501152515411377, + "kl": 0.258544921875, + "learning_rate": 7.200913242009132e-07, + "loss": 0.0003, + "reward": 1.75, + "reward_std": 0.4389287531375885, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 0.9375, + "step": 613 + }, + { + "clip_ratio": 0.0, + "completion_length": 588.34375, + "epoch": 0.8410958904109589, + "grad_norm": 0.8255899548530579, + "kl": 0.30078125, + "learning_rate": 7.19634703196347e-07, + "loss": 0.0003, + "reward": 1.765625, + "reward_std": 0.0289318785071373, + "rewards/accuracy_reward": 0.765625, + "rewards/format_reward": 1.0, + "step": 614 + }, + { + "clip_ratio": 0.0, + "completion_length": 467.28125, + "epoch": 0.8424657534246576, + "grad_norm": 0.9710884094238281, + "kl": 0.30859375, + "learning_rate": 7.191780821917808e-07, + "loss": 0.0003, + "reward": 1.484375, + "reward_std": 0.0289318785071373, + "rewards/accuracy_reward": 0.484375, + "rewards/format_reward": 1.0, + "step": 615 + }, + { + "clip_ratio": 0.0, + "completion_length": 790.28125, + "epoch": 0.8438356164383561, + "grad_norm": 1.5235772132873535, + "kl": 0.28076171875, + "learning_rate": 7.187214611872145e-07, + "loss": 0.0003, + "reward": 1.75, + "reward_std": 0.15580293536186218, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 616 + }, + { + "clip_ratio": 0.0, + "completion_length": 753.9375, + "epoch": 0.8452054794520548, + "grad_norm": 2.7367019653320312, + "kl": 0.279296875, + "learning_rate": 7.182648401826484e-07, + "loss": 0.0003, + "reward": 2.5677083134651184, + "reward_std": 0.48400574177503586, + "rewards/accuracy_reward": 1.5989583134651184, + "rewards/format_reward": 0.96875, + "step": 617 + }, + { + "clip_ratio": 0.0, + "completion_length": 550.375, + "epoch": 0.8465753424657534, + "grad_norm": 3.4342260360717773, + "kl": 0.29248046875, + "learning_rate": 7.178082191780822e-07, + "loss": 0.0003, + "reward": 1.97265625, + "reward_std": 0.23257054761052132, + "rewards/accuracy_reward": 0.97265625, + "rewards/format_reward": 1.0, + "step": 618 + }, + { + "clip_ratio": 0.0, + "completion_length": 558.65625, + "epoch": 0.8479452054794521, + "grad_norm": 5.605570316314697, + "kl": 0.28662109375, + "learning_rate": 7.173515981735159e-07, + "loss": 0.0003, + "reward": 1.8854166269302368, + "reward_std": 0.23936405219137669, + "rewards/accuracy_reward": 0.885416641831398, + "rewards/format_reward": 1.0, + "step": 619 + }, + { + "clip_ratio": 0.0, + "completion_length": 275.84375, + "epoch": 0.8493150684931506, + "grad_norm": 2.4277820587158203, + "kl": 0.318359375, + "learning_rate": 7.168949771689498e-07, + "loss": 0.0003, + "reward": 1.6328125, + "reward_std": 0.13782460056245327, + "rewards/accuracy_reward": 0.6328125, + "rewards/format_reward": 1.0, + "step": 620 + }, + { + "clip_ratio": 0.0, + "completion_length": 668.28125, + "epoch": 0.8506849315068493, + "grad_norm": 1.7949193716049194, + "kl": 0.28271484375, + "learning_rate": 7.164383561643835e-07, + "loss": 0.0003, + "reward": 1.6276041567325592, + "reward_std": 0.09027346037328243, + "rewards/accuracy_reward": 0.6276041567325592, + "rewards/format_reward": 1.0, + "step": 621 + }, + { + "clip_ratio": 0.0, + "completion_length": 706.1875, + "epoch": 0.852054794520548, + "grad_norm": 3.527575969696045, + "kl": 0.27978515625, + "learning_rate": 7.159817351598173e-07, + "loss": 0.0003, + "reward": 1.421875, + "reward_std": 0.04419417306780815, + "rewards/accuracy_reward": 0.421875, + "rewards/format_reward": 1.0, + "step": 622 + }, + { + "clip_ratio": 0.0, + "completion_length": 700.46875, + "epoch": 0.8534246575342466, + "grad_norm": 1.6975399255752563, + "kl": 0.28515625, + "learning_rate": 7.155251141552511e-07, + "loss": 0.0003, + "reward": 1.4375, + "reward_std": 0.0936255231499672, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 623 + }, + { + "clip_ratio": 0.0, + "completion_length": 539.59375, + "epoch": 0.8547945205479452, + "grad_norm": 4.774352550506592, + "kl": 0.3291015625, + "learning_rate": 7.150684931506848e-07, + "loss": 0.0003, + "reward": 1.6901041865348816, + "reward_std": 0.15677691251039505, + "rewards/accuracy_reward": 0.690104141831398, + "rewards/format_reward": 1.0, + "step": 624 + }, + { + "clip_ratio": 0.0, + "completion_length": 529.5625, + "epoch": 0.8561643835616438, + "grad_norm": 3.3761487007141113, + "kl": 0.3056640625, + "learning_rate": 7.146118721461188e-07, + "loss": 0.0003, + "reward": 1.68359375, + "reward_std": 0.26938531920313835, + "rewards/accuracy_reward": 0.68359375, + "rewards/format_reward": 1.0, + "step": 625 + }, + { + "clip_ratio": 0.0, + "completion_length": 267.375, + "epoch": 0.8575342465753425, + "grad_norm": 3.5964181423187256, + "kl": 0.38427734375, + "learning_rate": 7.141552511415525e-07, + "loss": 0.0004, + "reward": 1.515625, + "reward_std": 0.17782479152083397, + "rewards/accuracy_reward": 0.515625, + "rewards/format_reward": 1.0, + "step": 626 + }, + { + "clip_ratio": 0.0, + "completion_length": 200.78125, + "epoch": 0.8589041095890411, + "grad_norm": 1.3047560453414917, + "kl": 0.3671875, + "learning_rate": 7.136986301369862e-07, + "loss": 0.0004, + "reward": 1.453125, + "reward_std": 0.02670290134847164, + "rewards/accuracy_reward": 0.453125, + "rewards/format_reward": 1.0, + "step": 627 + }, + { + "clip_ratio": 0.0, + "completion_length": 500.15625, + "epoch": 0.8602739726027397, + "grad_norm": 3.4180963039398193, + "kl": 0.37451171875, + "learning_rate": 7.132420091324201e-07, + "loss": 0.0004, + "reward": 1.75, + "reward_std": 0.11659536883234978, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 628 + }, + { + "clip_ratio": 0.0, + "completion_length": 503.59375, + "epoch": 0.8616438356164383, + "grad_norm": 3.0317859649658203, + "kl": 0.31884765625, + "learning_rate": 7.127853881278538e-07, + "loss": 0.0003, + "reward": 1.9609375, + "reward_std": 0.40251583606004715, + "rewards/accuracy_reward": 0.9921875, + "rewards/format_reward": 0.96875, + "step": 629 + }, + { + "clip_ratio": 0.0, + "completion_length": 349.46875, + "epoch": 0.863013698630137, + "grad_norm": 5.503389358520508, + "kl": 0.404296875, + "learning_rate": 7.123287671232876e-07, + "loss": 0.0004, + "reward": 1.5, + "reward_std": 0.3335031494498253, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 630 + }, + { + "clip_ratio": 0.0, + "completion_length": 446.84375, + "epoch": 0.8643835616438356, + "grad_norm": 1.256745457649231, + "kl": 0.33642578125, + "learning_rate": 7.118721461187215e-07, + "loss": 0.0003, + "reward": 1.6197916269302368, + "reward_std": 0.051934316754341125, + "rewards/accuracy_reward": 0.6197916269302368, + "rewards/format_reward": 1.0, + "step": 631 + }, + { + "clip_ratio": 0.0, + "completion_length": 481.125, + "epoch": 0.8657534246575342, + "grad_norm": 2.3345208168029785, + "kl": 0.35302734375, + "learning_rate": 7.114155251141552e-07, + "loss": 0.0004, + "reward": 1.40625, + "reward_std": 0.10888781771063805, + "rewards/accuracy_reward": 0.40625, + "rewards/format_reward": 1.0, + "step": 632 + }, + { + "clip_ratio": 0.0, + "completion_length": 465.625, + "epoch": 0.8671232876712329, + "grad_norm": 2.5264086723327637, + "kl": 0.33544921875, + "learning_rate": 7.109589041095891e-07, + "loss": 0.0003, + "reward": 1.609375, + "reward_std": 0.2382849156856537, + "rewards/accuracy_reward": 0.609375, + "rewards/format_reward": 1.0, + "step": 633 + }, + { + "clip_ratio": 0.0, + "completion_length": 480.78125, + "epoch": 0.8684931506849315, + "grad_norm": 2.600130081176758, + "kl": 0.34765625, + "learning_rate": 7.105022831050228e-07, + "loss": 0.0003, + "reward": 1.9562499523162842, + "reward_std": 0.1930173598229885, + "rewards/accuracy_reward": 0.956250011920929, + "rewards/format_reward": 1.0, + "step": 634 + }, + { + "clip_ratio": 0.0, + "completion_length": 490.625, + "epoch": 0.8698630136986302, + "grad_norm": 3.292919874191284, + "kl": 0.35791015625, + "learning_rate": 7.100456621004565e-07, + "loss": 0.0004, + "reward": 1.8541666567325592, + "reward_std": 0.193493926897645, + "rewards/accuracy_reward": 0.8541666716337204, + "rewards/format_reward": 1.0, + "step": 635 + }, + { + "clip_ratio": 0.0, + "completion_length": 344.59375, + "epoch": 0.8712328767123287, + "grad_norm": 14.443897247314453, + "kl": 0.42919921875, + "learning_rate": 7.095890410958904e-07, + "loss": 0.0004, + "reward": 1.1875, + "reward_std": 0.06681530922651291, + "rewards/accuracy_reward": 0.1875, + "rewards/format_reward": 1.0, + "step": 636 + }, + { + "clip_ratio": 0.0, + "completion_length": 234.125, + "epoch": 0.8726027397260274, + "grad_norm": 1.4970674514770508, + "kl": 0.3974609375, + "learning_rate": 7.091324200913241e-07, + "loss": 0.0004, + "reward": 1.2265625, + "reward_std": 0.03234682232141495, + "rewards/accuracy_reward": 0.2265625, + "rewards/format_reward": 1.0, + "step": 637 + }, + { + "clip_ratio": 0.0, + "completion_length": 227.375, + "epoch": 0.873972602739726, + "grad_norm": 4.312900543212891, + "kl": 0.39453125, + "learning_rate": 7.08675799086758e-07, + "loss": 0.0004, + "reward": 1.484375, + "reward_std": 0.27092268504202366, + "rewards/accuracy_reward": 0.4843750298023224, + "rewards/format_reward": 1.0, + "step": 638 + }, + { + "clip_ratio": 0.0, + "completion_length": 266.6875, + "epoch": 0.8753424657534247, + "grad_norm": 1.0829259157180786, + "kl": 0.49462890625, + "learning_rate": 7.082191780821918e-07, + "loss": 0.0005, + "reward": 1.3125, + "reward_std": 0.10022296756505966, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 1.0, + "step": 639 + }, + { + "clip_ratio": 0.0, + "completion_length": 379.5, + "epoch": 0.8767123287671232, + "grad_norm": 5.12158203125, + "kl": 0.37353515625, + "learning_rate": 7.077625570776255e-07, + "loss": 0.0004, + "reward": 1.375, + "reward_std": 0.25693800300359726, + "rewards/accuracy_reward": 0.40625, + "rewards/format_reward": 0.96875, + "step": 640 + }, + { + "clip_ratio": 0.0, + "completion_length": 413.96875, + "epoch": 0.8780821917808219, + "grad_norm": 3.457838773727417, + "kl": 0.376953125, + "learning_rate": 7.073059360730594e-07, + "loss": 0.0004, + "reward": 1.578125, + "reward_std": 0.30377669632434845, + "rewards/accuracy_reward": 0.578125, + "rewards/format_reward": 1.0, + "step": 641 + }, + { + "clip_ratio": 0.0, + "completion_length": 542.09375, + "epoch": 0.8794520547945206, + "grad_norm": 3.6919641494750977, + "kl": 0.31689453125, + "learning_rate": 7.068493150684931e-07, + "loss": 0.0003, + "reward": 1.859375, + "reward_std": 0.34486518055200577, + "rewards/accuracy_reward": 0.890625, + "rewards/format_reward": 0.96875, + "step": 642 + }, + { + "clip_ratio": 0.0, + "completion_length": 627.125, + "epoch": 0.8808219178082192, + "grad_norm": 1.8475896120071411, + "kl": 0.34619140625, + "learning_rate": 7.063926940639268e-07, + "loss": 0.0003, + "reward": 1.78125, + "reward_std": 0.4355708882212639, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 0.84375, + "step": 643 + }, + { + "clip_ratio": 0.0, + "completion_length": 680.34375, + "epoch": 0.8821917808219178, + "grad_norm": 2.7510480880737305, + "kl": 0.29150390625, + "learning_rate": 7.059360730593607e-07, + "loss": 0.0003, + "reward": 1.578125, + "reward_std": 0.2629348188638687, + "rewards/accuracy_reward": 0.640625, + "rewards/format_reward": 0.9375, + "step": 644 + }, + { + "clip_ratio": 0.0, + "completion_length": 421.53125, + "epoch": 0.8835616438356164, + "grad_norm": 4.39385986328125, + "kl": 0.3525390625, + "learning_rate": 7.054794520547945e-07, + "loss": 0.0004, + "reward": 1.125, + "reward_std": 0.3104073107242584, + "rewards/accuracy_reward": 0.1875, + "rewards/format_reward": 0.9375, + "step": 645 + }, + { + "clip_ratio": 0.0, + "completion_length": 332.90625, + "epoch": 0.8849315068493151, + "grad_norm": 3.439211368560791, + "kl": 0.32080078125, + "learning_rate": 7.050228310502283e-07, + "loss": 0.0003, + "reward": 1.15625, + "reward_std": 0.3438149690628052, + "rewards/accuracy_reward": 0.21875, + "rewards/format_reward": 0.9375, + "step": 646 + }, + { + "clip_ratio": 0.0, + "completion_length": 331.5625, + "epoch": 0.8863013698630137, + "grad_norm": 2.4772706031799316, + "kl": 0.3466796875, + "learning_rate": 7.045662100456621e-07, + "loss": 0.0003, + "reward": 1.15625, + "reward_std": 0.22201896458864212, + "rewards/accuracy_reward": 0.15625, + "rewards/format_reward": 1.0, + "step": 647 + }, + { + "clip_ratio": 0.0, + "completion_length": 590.21875, + "epoch": 0.8876712328767123, + "grad_norm": 1.3561179637908936, + "kl": 0.341796875, + "learning_rate": 7.041095890410958e-07, + "loss": 0.0003, + "reward": 1.4765625, + "reward_std": 0.10474801808595657, + "rewards/accuracy_reward": 0.4765625, + "rewards/format_reward": 1.0, + "step": 648 + }, + { + "clip_ratio": 0.0, + "completion_length": 464.78125, + "epoch": 0.8890410958904109, + "grad_norm": 45.053157806396484, + "kl": 0.33154296875, + "learning_rate": 7.036529680365297e-07, + "loss": 0.0003, + "reward": 1.59375, + "reward_std": 0.4626970961689949, + "rewards/accuracy_reward": 0.65625, + "rewards/format_reward": 0.9375, + "step": 649 + }, + { + "clip_ratio": 0.0, + "completion_length": 336.5, + "epoch": 0.8904109589041096, + "grad_norm": 3.172072649002075, + "kl": 0.3642578125, + "learning_rate": 7.031963470319634e-07, + "loss": 0.0004, + "reward": 1.3515625, + "reward_std": 0.3067816346883774, + "rewards/accuracy_reward": 0.3828125, + "rewards/format_reward": 0.96875, + "step": 650 + }, + { + "clip_ratio": 0.0, + "completion_length": 448.53125, + "epoch": 0.8917808219178082, + "grad_norm": 1.1374866962432861, + "kl": 0.3310546875, + "learning_rate": 7.027397260273972e-07, + "loss": 0.0003, + "reward": 1.184374988079071, + "reward_std": 0.022903122007846832, + "rewards/accuracy_reward": 0.18437501788139343, + "rewards/format_reward": 1.0, + "step": 651 + }, + { + "clip_ratio": 0.0, + "completion_length": 639.3125, + "epoch": 0.8931506849315068, + "grad_norm": 2.0539586544036865, + "kl": 0.29931640625, + "learning_rate": 7.022831050228311e-07, + "loss": 0.0003, + "reward": 1.6953125, + "reward_std": 0.24830512702465057, + "rewards/accuracy_reward": 0.7265625, + "rewards/format_reward": 0.96875, + "step": 652 + }, + { + "clip_ratio": 0.0, + "completion_length": 467.65625, + "epoch": 0.8945205479452055, + "grad_norm": 3.450404405593872, + "kl": 0.3037109375, + "learning_rate": 7.018264840182648e-07, + "loss": 0.0003, + "reward": 1.65625, + "reward_std": 0.49012404680252075, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 0.96875, + "step": 653 + }, + { + "clip_ratio": 0.0, + "completion_length": 453.96875, + "epoch": 0.8958904109589041, + "grad_norm": 5.490792274475098, + "kl": 0.3271484375, + "learning_rate": 7.013698630136986e-07, + "loss": 0.0003, + "reward": 1.2750000059604645, + "reward_std": 0.13220207020640373, + "rewards/accuracy_reward": 0.2750000059604645, + "rewards/format_reward": 1.0, + "step": 654 + }, + { + "clip_ratio": 0.0, + "completion_length": 474.78125, + "epoch": 0.8972602739726028, + "grad_norm": 4.403593063354492, + "kl": 0.3408203125, + "learning_rate": 7.009132420091324e-07, + "loss": 0.0003, + "reward": 1.203125, + "reward_std": 0.30935921147465706, + "rewards/accuracy_reward": 0.265625, + "rewards/format_reward": 0.9375, + "step": 655 + }, + { + "clip_ratio": 0.0, + "completion_length": 529.5, + "epoch": 0.8986301369863013, + "grad_norm": 1.4611150026321411, + "kl": 0.3193359375, + "learning_rate": 7.004566210045661e-07, + "loss": 0.0003, + "reward": 1.359375, + "reward_std": 0.10205793008208275, + "rewards/accuracy_reward": 0.390625, + "rewards/format_reward": 0.96875, + "step": 656 + }, + { + "clip_ratio": 0.0, + "completion_length": 347.375, + "epoch": 0.9, + "grad_norm": 3.657111644744873, + "kl": 0.330078125, + "learning_rate": 7e-07, + "loss": 0.0003, + "reward": 1.640625, + "reward_std": 0.28412990644574165, + "rewards/accuracy_reward": 0.640625, + "rewards/format_reward": 1.0, + "step": 657 + }, + { + "clip_ratio": 0.0, + "completion_length": 547.5625, + "epoch": 0.9013698630136986, + "grad_norm": 2.3929004669189453, + "kl": 0.3193359375, + "learning_rate": 6.995433789954338e-07, + "loss": 0.0003, + "reward": 1.859375, + "reward_std": 0.30617378652095795, + "rewards/accuracy_reward": 0.859375, + "rewards/format_reward": 1.0, + "step": 658 + }, + { + "clip_ratio": 0.0, + "completion_length": 484.4375, + "epoch": 0.9027397260273973, + "grad_norm": 2.0510473251342773, + "kl": 0.3330078125, + "learning_rate": 6.990867579908675e-07, + "loss": 0.0003, + "reward": 1.6953125, + "reward_std": 0.2706219367682934, + "rewards/accuracy_reward": 0.6953125, + "rewards/format_reward": 1.0, + "step": 659 + }, + { + "clip_ratio": 0.0, + "completion_length": 683.28125, + "epoch": 0.9041095890410958, + "grad_norm": 9.588709831237793, + "kl": 0.32958984375, + "learning_rate": 6.986301369863014e-07, + "loss": 0.0003, + "reward": 1.9114583730697632, + "reward_std": 0.13045240752398968, + "rewards/accuracy_reward": 0.9114583432674408, + "rewards/format_reward": 1.0, + "step": 660 + }, + { + "clip_ratio": 0.0, + "completion_length": 266.28125, + "epoch": 0.9054794520547945, + "grad_norm": 3.2910804748535156, + "kl": 0.373046875, + "learning_rate": 6.981735159817351e-07, + "loss": 0.0004, + "reward": 1.53125, + "reward_std": 0.3061639815568924, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 0.96875, + "step": 661 + }, + { + "clip_ratio": 0.0, + "completion_length": 581.09375, + "epoch": 0.9068493150684932, + "grad_norm": 1.6819573640823364, + "kl": 0.32080078125, + "learning_rate": 6.977168949771689e-07, + "loss": 0.0003, + "reward": 1.5052083134651184, + "reward_std": 0.11100949719548225, + "rewards/accuracy_reward": 0.5052083134651184, + "rewards/format_reward": 1.0, + "step": 662 + }, + { + "clip_ratio": 0.0, + "completion_length": 657.21875, + "epoch": 0.9082191780821918, + "grad_norm": 1.5459468364715576, + "kl": 0.3544921875, + "learning_rate": 6.972602739726027e-07, + "loss": 0.0004, + "reward": 1.453125, + "reward_std": 0.10205793008208275, + "rewards/accuracy_reward": 0.453125, + "rewards/format_reward": 1.0, + "step": 663 + }, + { + "clip_ratio": 0.0, + "completion_length": 608.4375, + "epoch": 0.9095890410958904, + "grad_norm": 2.212812900543213, + "kl": 0.3076171875, + "learning_rate": 6.968036529680364e-07, + "loss": 0.0003, + "reward": 1.7638888359069824, + "reward_std": 0.22503389045596123, + "rewards/accuracy_reward": 0.7951388657093048, + "rewards/format_reward": 0.96875, + "step": 664 + }, + { + "clip_ratio": 0.0, + "completion_length": 762.3125, + "epoch": 0.910958904109589, + "grad_norm": 2.2994203567504883, + "kl": 0.29345703125, + "learning_rate": 6.963470319634704e-07, + "loss": 0.0003, + "reward": 1.9127604067325592, + "reward_std": 0.33594274893403053, + "rewards/accuracy_reward": 0.9440104067325592, + "rewards/format_reward": 0.96875, + "step": 665 + }, + { + "clip_ratio": 0.0, + "completion_length": 574.3125, + "epoch": 0.9123287671232877, + "grad_norm": 6.928981304168701, + "kl": 0.2998046875, + "learning_rate": 6.958904109589041e-07, + "loss": 0.0003, + "reward": 1.2291666567325592, + "reward_std": 0.022271782159805298, + "rewards/accuracy_reward": 0.2291666567325592, + "rewards/format_reward": 1.0, + "step": 666 + }, + { + "clip_ratio": 0.0, + "completion_length": 253.15625, + "epoch": 0.9136986301369863, + "grad_norm": 7.005854606628418, + "kl": 0.3505859375, + "learning_rate": 6.954337899543378e-07, + "loss": 0.0004, + "reward": 1.3125, + "reward_std": 0.2925042062997818, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 1.0, + "step": 667 + }, + { + "clip_ratio": 0.0, + "completion_length": 367.78125, + "epoch": 0.915068493150685, + "grad_norm": 4.68707275390625, + "kl": 0.37255859375, + "learning_rate": 6.949771689497717e-07, + "loss": 0.0004, + "reward": 0.96875, + "reward_std": 0.0883883461356163, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.96875, + "step": 668 + }, + { + "clip_ratio": 0.0, + "completion_length": 447.375, + "epoch": 0.9164383561643835, + "grad_norm": 3.7123045921325684, + "kl": 0.380859375, + "learning_rate": 6.945205479452054e-07, + "loss": 0.0004, + "reward": 1.671875, + "reward_std": 0.41054617613554, + "rewards/accuracy_reward": 0.671875, + "rewards/format_reward": 1.0, + "step": 669 + }, + { + "clip_ratio": 0.0, + "completion_length": 583.65625, + "epoch": 0.9178082191780822, + "grad_norm": 1.5095117092132568, + "kl": 0.30859375, + "learning_rate": 6.940639269406392e-07, + "loss": 0.0003, + "reward": 1.44921875, + "reward_std": 0.055597566068172455, + "rewards/accuracy_reward": 0.44921875, + "rewards/format_reward": 1.0, + "step": 670 + }, + { + "clip_ratio": 0.0, + "completion_length": 580.25, + "epoch": 0.9191780821917809, + "grad_norm": 1.7842953205108643, + "kl": 0.40478515625, + "learning_rate": 6.93607305936073e-07, + "loss": 0.0004, + "reward": 1.1796875, + "reward_std": 0.022097086533904076, + "rewards/accuracy_reward": 0.2109375, + "rewards/format_reward": 0.96875, + "step": 671 + }, + { + "clip_ratio": 0.0, + "completion_length": 496.84375, + "epoch": 0.9205479452054794, + "grad_norm": 3.0466763973236084, + "kl": 0.32470703125, + "learning_rate": 6.931506849315068e-07, + "loss": 0.0003, + "reward": 1.62109375, + "reward_std": 0.17449257895350456, + "rewards/accuracy_reward": 0.62109375, + "rewards/format_reward": 1.0, + "step": 672 + }, + { + "clip_ratio": 0.0, + "completion_length": 622.59375, + "epoch": 0.9219178082191781, + "grad_norm": 6.0069193840026855, + "kl": 0.3232421875, + "learning_rate": 6.926940639269407e-07, + "loss": 0.0003, + "reward": 1.4479166567325592, + "reward_std": 0.17747542820870876, + "rewards/accuracy_reward": 0.4479166567325592, + "rewards/format_reward": 1.0, + "step": 673 + }, + { + "clip_ratio": 0.0, + "completion_length": 583.59375, + "epoch": 0.9232876712328767, + "grad_norm": 2.055826425552368, + "kl": 0.3134765625, + "learning_rate": 6.922374429223744e-07, + "loss": 0.0003, + "reward": 1.75, + "reward_std": 0.24671732261776924, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 674 + }, + { + "clip_ratio": 0.0, + "completion_length": 595.25, + "epoch": 0.9246575342465754, + "grad_norm": 0.03342582285404205, + "kl": 0.3740234375, + "learning_rate": 6.917808219178081e-07, + "loss": 0.0004, + "reward": 1.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 675 + }, + { + "clip_ratio": 0.0, + "completion_length": 472.21875, + "epoch": 0.9260273972602739, + "grad_norm": 2.0904788970947266, + "kl": 0.34375, + "learning_rate": 6.91324200913242e-07, + "loss": 0.0003, + "reward": 1.3541666865348816, + "reward_std": 0.10767630115151405, + "rewards/accuracy_reward": 0.3541666865348816, + "rewards/format_reward": 1.0, + "step": 676 + }, + { + "clip_ratio": 0.0, + "completion_length": 589.59375, + "epoch": 0.9273972602739726, + "grad_norm": 2.1896920204162598, + "kl": 0.32666015625, + "learning_rate": 6.908675799086757e-07, + "loss": 0.0003, + "reward": 1.453125, + "reward_std": 0.20648781582713127, + "rewards/accuracy_reward": 0.484375, + "rewards/format_reward": 0.96875, + "step": 677 + }, + { + "clip_ratio": 0.0, + "completion_length": 612.15625, + "epoch": 0.9287671232876712, + "grad_norm": 1.9878590106964111, + "kl": 0.32666015625, + "learning_rate": 6.904109589041097e-07, + "loss": 0.0003, + "reward": 1.546875, + "reward_std": 0.1446593925356865, + "rewards/accuracy_reward": 0.609375, + "rewards/format_reward": 0.9375, + "step": 678 + }, + { + "clip_ratio": 0.0, + "completion_length": 521.78125, + "epoch": 0.9301369863013699, + "grad_norm": 4.627434253692627, + "kl": 0.3154296875, + "learning_rate": 6.899543378995434e-07, + "loss": 0.0003, + "reward": 1.811718761920929, + "reward_std": 0.12913026381283998, + "rewards/accuracy_reward": 0.811718761920929, + "rewards/format_reward": 1.0, + "step": 679 + }, + { + "clip_ratio": 0.0, + "completion_length": 703.84375, + "epoch": 0.9315068493150684, + "grad_norm": 2.76521897315979, + "kl": 0.3232421875, + "learning_rate": 6.894977168949771e-07, + "loss": 0.0003, + "reward": 2.1796875, + "reward_std": 0.18400542438030243, + "rewards/accuracy_reward": 1.1796875, + "rewards/format_reward": 1.0, + "step": 680 + }, + { + "clip_ratio": 0.0, + "completion_length": 573.75, + "epoch": 0.9328767123287671, + "grad_norm": 3.8087267875671387, + "kl": 0.43505859375, + "learning_rate": 6.89041095890411e-07, + "loss": 0.0004, + "reward": 1.359375, + "reward_std": 0.17782479152083397, + "rewards/accuracy_reward": 0.359375, + "rewards/format_reward": 1.0, + "step": 681 + }, + { + "clip_ratio": 0.0, + "completion_length": 346.0625, + "epoch": 0.9342465753424658, + "grad_norm": 1.986623764038086, + "kl": 0.35986328125, + "learning_rate": 6.885844748858447e-07, + "loss": 0.0004, + "reward": 1.25, + "reward_std": 0.13363061845302582, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 1.0, + "step": 682 + }, + { + "clip_ratio": 0.0, + "completion_length": 594.40625, + "epoch": 0.9356164383561644, + "grad_norm": 2.0661747455596924, + "kl": 0.3330078125, + "learning_rate": 6.881278538812784e-07, + "loss": 0.0003, + "reward": 1.515625, + "reward_std": 0.18997547030448914, + "rewards/accuracy_reward": 0.515625, + "rewards/format_reward": 1.0, + "step": 683 + }, + { + "clip_ratio": 0.0, + "completion_length": 703.46875, + "epoch": 0.936986301369863, + "grad_norm": 13.223034858703613, + "kl": 0.33251953125, + "learning_rate": 6.876712328767123e-07, + "loss": 0.0003, + "reward": 2.14453125, + "reward_std": 0.1712184101343155, + "rewards/accuracy_reward": 1.14453125, + "rewards/format_reward": 1.0, + "step": 684 + }, + { + "clip_ratio": 0.0, + "completion_length": 377.0625, + "epoch": 0.9383561643835616, + "grad_norm": 3.540472984313965, + "kl": 0.341796875, + "learning_rate": 6.872146118721461e-07, + "loss": 0.0003, + "reward": 1.37109375, + "reward_std": 0.14941447600722313, + "rewards/accuracy_reward": 0.37109375, + "rewards/format_reward": 1.0, + "step": 685 + }, + { + "clip_ratio": 0.0, + "completion_length": 344.71875, + "epoch": 0.9397260273972603, + "grad_norm": 4.10709285736084, + "kl": 0.36865234375, + "learning_rate": 6.867579908675799e-07, + "loss": 0.0004, + "reward": 1.09375, + "reward_std": 0.22201896458864212, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 0.96875, + "step": 686 + }, + { + "clip_ratio": 0.0, + "completion_length": 766.8125, + "epoch": 0.9410958904109589, + "grad_norm": 2.0390625, + "kl": 0.3642578125, + "learning_rate": 6.863013698630137e-07, + "loss": 0.0004, + "reward": 1.7942708134651184, + "reward_std": 0.28445861861109734, + "rewards/accuracy_reward": 0.8255208432674408, + "rewards/format_reward": 0.96875, + "step": 687 + }, + { + "clip_ratio": 0.0, + "completion_length": 573.40625, + "epoch": 0.9424657534246575, + "grad_norm": 2.7066261768341064, + "kl": 0.3212890625, + "learning_rate": 6.858447488584474e-07, + "loss": 0.0003, + "reward": 1.390625, + "reward_std": 0.10205793008208275, + "rewards/accuracy_reward": 0.390625, + "rewards/format_reward": 1.0, + "step": 688 + }, + { + "clip_ratio": 0.0, + "completion_length": 447.59375, + "epoch": 0.9438356164383561, + "grad_norm": 1.330697774887085, + "kl": 0.36767578125, + "learning_rate": 6.853881278538813e-07, + "loss": 0.0004, + "reward": 1.40625, + "reward_std": 0.22301282733678818, + "rewards/accuracy_reward": 0.40625, + "rewards/format_reward": 1.0, + "step": 689 + }, + { + "clip_ratio": 0.0, + "completion_length": 582.25, + "epoch": 0.9452054794520548, + "grad_norm": 1.8517435789108276, + "kl": 0.3125, + "learning_rate": 6.84931506849315e-07, + "loss": 0.0003, + "reward": 1.4921875, + "reward_std": 0.23285578191280365, + "rewards/accuracy_reward": 0.5234375, + "rewards/format_reward": 0.96875, + "step": 690 + }, + { + "clip_ratio": 0.0, + "completion_length": 502.96875, + "epoch": 0.9465753424657535, + "grad_norm": 7.800582408905029, + "kl": 0.33154296875, + "learning_rate": 6.844748858447487e-07, + "loss": 0.0003, + "reward": 1.21875, + "reward_std": 0.1552036553621292, + "rewards/accuracy_reward": 0.21875, + "rewards/format_reward": 1.0, + "step": 691 + }, + { + "clip_ratio": 0.0, + "completion_length": 326.5, + "epoch": 0.947945205479452, + "grad_norm": 2.3165299892425537, + "kl": 0.3642578125, + "learning_rate": 6.840182648401827e-07, + "loss": 0.0004, + "reward": 1.25, + "reward_std": 0.1462521031498909, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 1.0, + "step": 692 + }, + { + "clip_ratio": 0.0, + "completion_length": 488.1875, + "epoch": 0.9493150684931507, + "grad_norm": 1.7786669731140137, + "kl": 0.31591796875, + "learning_rate": 6.835616438356164e-07, + "loss": 0.0003, + "reward": 1.3828125, + "reward_std": 0.16173411160707474, + "rewards/accuracy_reward": 0.3828125, + "rewards/format_reward": 1.0, + "step": 693 + }, + { + "clip_ratio": 0.0, + "completion_length": 457.96875, + "epoch": 0.9506849315068493, + "grad_norm": 0.027479765936732292, + "kl": 0.33447265625, + "learning_rate": 6.831050228310502e-07, + "loss": 0.0003, + "reward": 1.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 694 + }, + { + "clip_ratio": 0.0, + "completion_length": 417.96875, + "epoch": 0.952054794520548, + "grad_norm": 2.6095175743103027, + "kl": 0.36181640625, + "learning_rate": 6.82648401826484e-07, + "loss": 0.0004, + "reward": 1.4609375, + "reward_std": 0.20912351459264755, + "rewards/accuracy_reward": 0.4921875, + "rewards/format_reward": 0.96875, + "step": 695 + }, + { + "clip_ratio": 0.0, + "completion_length": 600.96875, + "epoch": 0.9534246575342465, + "grad_norm": 1.3289512395858765, + "kl": 0.328125, + "learning_rate": 6.821917808219177e-07, + "loss": 0.0003, + "reward": 1.34375, + "reward_std": 0.0578637570142746, + "rewards/accuracy_reward": 0.34375, + "rewards/format_reward": 1.0, + "step": 696 + }, + { + "clip_ratio": 0.0, + "completion_length": 569.1875, + "epoch": 0.9547945205479452, + "grad_norm": 0.9503890872001648, + "kl": 0.33642578125, + "learning_rate": 6.817351598173516e-07, + "loss": 0.0003, + "reward": 1.243749976158142, + "reward_std": 0.04580627381801605, + "rewards/accuracy_reward": 0.24375002086162567, + "rewards/format_reward": 1.0, + "step": 697 + }, + { + "clip_ratio": 0.0, + "completion_length": 600.0625, + "epoch": 0.9561643835616438, + "grad_norm": 6.267343044281006, + "kl": 0.34130859375, + "learning_rate": 6.812785388127854e-07, + "loss": 0.0003, + "reward": 1.2890625, + "reward_std": 0.14807433634996414, + "rewards/accuracy_reward": 0.2890625, + "rewards/format_reward": 1.0, + "step": 698 + }, + { + "clip_ratio": 0.0, + "completion_length": 480.59375, + "epoch": 0.9575342465753425, + "grad_norm": 3.089637517929077, + "kl": 0.33935546875, + "learning_rate": 6.808219178082191e-07, + "loss": 0.0003, + "reward": 1.4895833134651184, + "reward_std": 0.2777084931731224, + "rewards/accuracy_reward": 0.4895833134651184, + "rewards/format_reward": 1.0, + "step": 699 + }, + { + "clip_ratio": 0.0, + "completion_length": 606.03125, + "epoch": 0.958904109589041, + "grad_norm": 1.276449203491211, + "kl": 1.47900390625, + "learning_rate": 6.80365296803653e-07, + "loss": 0.0015, + "reward": 1.7890625, + "reward_std": 0.06629125773906708, + "rewards/accuracy_reward": 0.7890625, + "rewards/format_reward": 1.0, + "step": 700 + }, + { + "clip_ratio": 0.0, + "completion_length": 490.3125, + "epoch": 0.9602739726027397, + "grad_norm": 0.03158112242817879, + "kl": 0.37646484375, + "learning_rate": 6.799086757990867e-07, + "loss": 0.0004, + "reward": 1.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 701 + }, + { + "clip_ratio": 0.0, + "completion_length": 446.4375, + "epoch": 0.9616438356164384, + "grad_norm": 2.683041572570801, + "kl": 0.31982421875, + "learning_rate": 6.794520547945205e-07, + "loss": 0.0003, + "reward": 1.453125, + "reward_std": 0.1751839816570282, + "rewards/accuracy_reward": 0.453125, + "rewards/format_reward": 1.0, + "step": 702 + }, + { + "clip_ratio": 0.0, + "completion_length": 744.75, + "epoch": 0.963013698630137, + "grad_norm": 3.1106810569763184, + "kl": 0.32958984375, + "learning_rate": 6.789954337899543e-07, + "loss": 0.0003, + "reward": 1.9375, + "reward_std": 0.328794926404953, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 1.0, + "step": 703 + }, + { + "clip_ratio": 0.0, + "completion_length": 721.6875, + "epoch": 0.9643835616438357, + "grad_norm": 2.060368776321411, + "kl": 0.31201171875, + "learning_rate": 6.78538812785388e-07, + "loss": 0.0003, + "reward": 1.53125, + "reward_std": 0.1422954723238945, + "rewards/accuracy_reward": 0.53125, + "rewards/format_reward": 1.0, + "step": 704 + }, + { + "clip_ratio": 0.0, + "completion_length": 616.1875, + "epoch": 0.9657534246575342, + "grad_norm": 1.8324370384216309, + "kl": 0.30078125, + "learning_rate": 6.78082191780822e-07, + "loss": 0.0003, + "reward": 1.34375, + "reward_std": 0.1293872892856598, + "rewards/accuracy_reward": 0.34375, + "rewards/format_reward": 1.0, + "step": 705 + }, + { + "clip_ratio": 0.0, + "completion_length": 412.0, + "epoch": 0.9671232876712329, + "grad_norm": 6.562459468841553, + "kl": 0.341796875, + "learning_rate": 6.776255707762557e-07, + "loss": 0.0003, + "reward": 1.5546875, + "reward_std": 0.4100441411137581, + "rewards/accuracy_reward": 0.5546875, + "rewards/format_reward": 1.0, + "step": 706 + }, + { + "clip_ratio": 0.0, + "completion_length": 461.4375, + "epoch": 0.9684931506849315, + "grad_norm": 2.6876327991485596, + "kl": 0.36376953125, + "learning_rate": 6.771689497716894e-07, + "loss": 0.0004, + "reward": 1.453125, + "reward_std": 0.18807452730834484, + "rewards/accuracy_reward": 0.453125, + "rewards/format_reward": 1.0, + "step": 707 + }, + { + "clip_ratio": 0.0, + "completion_length": 315.4375, + "epoch": 0.9698630136986301, + "grad_norm": 1.8336397409439087, + "kl": 0.35595703125, + "learning_rate": 6.767123287671233e-07, + "loss": 0.0004, + "reward": 1.703125, + "reward_std": 0.13258251920342445, + "rewards/accuracy_reward": 0.703125, + "rewards/format_reward": 1.0, + "step": 708 + }, + { + "clip_ratio": 0.0, + "completion_length": 563.90625, + "epoch": 0.9712328767123287, + "grad_norm": 4.356468200683594, + "kl": 0.32275390625, + "learning_rate": 6.76255707762557e-07, + "loss": 0.0003, + "reward": 1.3828125, + "reward_std": 0.09069566056132317, + "rewards/accuracy_reward": 0.3828125, + "rewards/format_reward": 1.0, + "step": 709 + }, + { + "clip_ratio": 0.0, + "completion_length": 452.8125, + "epoch": 0.9726027397260274, + "grad_norm": 1.405023217201233, + "kl": 0.328125, + "learning_rate": 6.757990867579907e-07, + "loss": 0.0003, + "reward": 1.484375, + "reward_std": 0.04419417306780815, + "rewards/accuracy_reward": 0.484375, + "rewards/format_reward": 1.0, + "step": 710 + }, + { + "clip_ratio": 0.0, + "completion_length": 324.1875, + "epoch": 0.9739726027397261, + "grad_norm": 3.4619688987731934, + "kl": 0.361328125, + "learning_rate": 6.753424657534246e-07, + "loss": 0.0004, + "reward": 1.8125, + "reward_std": 0.1157275140285492, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 1.0, + "step": 711 + }, + { + "clip_ratio": 0.0, + "completion_length": 290.59375, + "epoch": 0.9753424657534246, + "grad_norm": 3.3762919902801514, + "kl": 0.33642578125, + "learning_rate": 6.748858447488584e-07, + "loss": 0.0003, + "reward": 1.59375, + "reward_std": 0.3072218894958496, + "rewards/accuracy_reward": 0.59375, + "rewards/format_reward": 1.0, + "step": 712 + }, + { + "clip_ratio": 0.0, + "completion_length": 757.53125, + "epoch": 0.9767123287671233, + "grad_norm": 1.4996120929718018, + "kl": 0.29296875, + "learning_rate": 6.744292237442923e-07, + "loss": 0.0003, + "reward": 1.518750011920929, + "reward_std": 0.2324894331395626, + "rewards/accuracy_reward": 0.5812499821186066, + "rewards/format_reward": 0.9375, + "step": 713 + }, + { + "clip_ratio": 0.0, + "completion_length": 517.5, + "epoch": 0.9780821917808219, + "grad_norm": 1.4486969709396362, + "kl": 0.3935546875, + "learning_rate": 6.73972602739726e-07, + "loss": 0.0004, + "reward": 1.421875, + "reward_std": 0.11100948229432106, + "rewards/accuracy_reward": 0.421875, + "rewards/format_reward": 1.0, + "step": 714 + }, + { + "clip_ratio": 0.0, + "completion_length": 668.375, + "epoch": 0.9794520547945206, + "grad_norm": 2.0050597190856934, + "kl": 0.310546875, + "learning_rate": 6.735159817351597e-07, + "loss": 0.0003, + "reward": 1.625, + "reward_std": 0.11230766586959362, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 715 + }, + { + "clip_ratio": 0.0, + "completion_length": 544.59375, + "epoch": 0.9808219178082191, + "grad_norm": 1.8048930168151855, + "kl": 0.31787109375, + "learning_rate": 6.730593607305936e-07, + "loss": 0.0003, + "reward": 1.59375, + "reward_std": 0.08065321296453476, + "rewards/accuracy_reward": 0.59375, + "rewards/format_reward": 1.0, + "step": 716 + }, + { + "clip_ratio": 0.0, + "completion_length": 533.78125, + "epoch": 0.9821917808219178, + "grad_norm": 0.010665824636816978, + "kl": 0.32568359375, + "learning_rate": 6.726027397260273e-07, + "loss": 0.0003, + "reward": 1.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 717 + }, + { + "clip_ratio": 0.0, + "completion_length": 655.6875, + "epoch": 0.9835616438356164, + "grad_norm": 1.4615073204040527, + "kl": 0.3134765625, + "learning_rate": 6.721461187214613e-07, + "loss": 0.0003, + "reward": 1.5078125, + "reward_std": 0.12534979730844498, + "rewards/accuracy_reward": 0.5078125, + "rewards/format_reward": 1.0, + "step": 718 + }, + { + "clip_ratio": 0.0, + "completion_length": 437.4375, + "epoch": 0.9849315068493151, + "grad_norm": 1.7689210176467896, + "kl": 0.341796875, + "learning_rate": 6.71689497716895e-07, + "loss": 0.0003, + "reward": 1.509374976158142, + "reward_std": 0.10276375338435173, + "rewards/accuracy_reward": 0.5093749761581421, + "rewards/format_reward": 1.0, + "step": 719 + }, + { + "clip_ratio": 0.0, + "completion_length": 521.53125, + "epoch": 0.9863013698630136, + "grad_norm": 7.66063117980957, + "kl": 0.33544921875, + "learning_rate": 6.712328767123287e-07, + "loss": 0.0003, + "reward": 1.63671875, + "reward_std": 0.25478895008563995, + "rewards/accuracy_reward": 0.63671875, + "rewards/format_reward": 1.0, + "step": 720 + }, + { + "clip_ratio": 0.0, + "completion_length": 672.34375, + "epoch": 0.9876712328767123, + "grad_norm": 3.410891532897949, + "kl": 0.3125, + "learning_rate": 6.707762557077626e-07, + "loss": 0.0003, + "reward": 2.002604156732559, + "reward_std": 0.156676534563303, + "rewards/accuracy_reward": 1.0026041567325592, + "rewards/format_reward": 1.0, + "step": 721 + }, + { + "clip_ratio": 0.0, + "completion_length": 421.59375, + "epoch": 0.989041095890411, + "grad_norm": 2.7770752906799316, + "kl": 0.359375, + "learning_rate": 6.703196347031963e-07, + "loss": 0.0004, + "reward": 1.390625, + "reward_std": 0.2824692949652672, + "rewards/accuracy_reward": 0.390625, + "rewards/format_reward": 1.0, + "step": 722 + }, + { + "clip_ratio": 0.0, + "completion_length": 403.40625, + "epoch": 0.9904109589041096, + "grad_norm": 4.406149387359619, + "kl": 0.3603515625, + "learning_rate": 6.6986301369863e-07, + "loss": 0.0004, + "reward": 1.7109375, + "reward_std": 0.3282610699534416, + "rewards/accuracy_reward": 0.7109375, + "rewards/format_reward": 1.0, + "step": 723 + }, + { + "clip_ratio": 0.0, + "completion_length": 426.5625, + "epoch": 0.9917808219178083, + "grad_norm": 3.31536602973938, + "kl": 0.390625, + "learning_rate": 6.694063926940639e-07, + "loss": 0.0004, + "reward": 1.453125, + "reward_std": 0.24556982144713402, + "rewards/accuracy_reward": 0.453125, + "rewards/format_reward": 1.0, + "step": 724 + }, + { + "clip_ratio": 0.0, + "completion_length": 524.75, + "epoch": 0.9931506849315068, + "grad_norm": 4.0502424240112305, + "kl": 0.39697265625, + "learning_rate": 6.689497716894977e-07, + "loss": 0.0004, + "reward": 1.8671875, + "reward_std": 0.16954397037625313, + "rewards/accuracy_reward": 0.8671875, + "rewards/format_reward": 1.0, + "step": 725 + }, + { + "clip_ratio": 0.0, + "completion_length": 615.25, + "epoch": 0.9945205479452055, + "grad_norm": 4.198473930358887, + "kl": 0.31005859375, + "learning_rate": 6.684931506849316e-07, + "loss": 0.0003, + "reward": 1.7578125, + "reward_std": 0.17160805128514767, + "rewards/accuracy_reward": 0.7578125, + "rewards/format_reward": 1.0, + "step": 726 + }, + { + "clip_ratio": 0.0, + "completion_length": 530.1875, + "epoch": 0.9958904109589041, + "grad_norm": 2.3717732429504395, + "kl": 0.33544921875, + "learning_rate": 6.680365296803653e-07, + "loss": 0.0003, + "reward": 2.010416656732559, + "reward_std": 0.2956680431962013, + "rewards/accuracy_reward": 1.0104166567325592, + "rewards/format_reward": 1.0, + "step": 727 + }, + { + "clip_ratio": 0.0, + "completion_length": 523.90625, + "epoch": 0.9972602739726028, + "grad_norm": 1.4983484745025635, + "kl": 0.3330078125, + "learning_rate": 6.67579908675799e-07, + "loss": 0.0003, + "reward": 1.671875, + "reward_std": 0.0646936446428299, + "rewards/accuracy_reward": 0.671875, + "rewards/format_reward": 1.0, + "step": 728 + }, + { + "clip_ratio": 0.0, + "completion_length": 431.34375, + "epoch": 0.9986301369863013, + "grad_norm": 2.0858328342437744, + "kl": 0.42578125, + "learning_rate": 6.671232876712329e-07, + "loss": 0.0004, + "reward": 1.22265625, + "reward_std": 0.11446718871593475, + "rewards/accuracy_reward": 0.22265625, + "rewards/format_reward": 1.0, + "step": 729 + }, + { + "clip_ratio": 0.0, + "completion_length": 510.15625, + "epoch": 1.0, + "grad_norm": 3.0337514877319336, + "kl": 0.3486328125, + "learning_rate": 6.666666666666666e-07, + "loss": 0.0003, + "reward": 1.5234375, + "reward_std": 0.1662898138165474, + "rewards/accuracy_reward": 0.5234375, + "rewards/format_reward": 1.0, + "step": 730 + }, + { + "clip_ratio": 0.0, + "completion_length": 391.5625, + "epoch": 1.0013698630136987, + "grad_norm": 1.1392573118209839, + "kl": 0.33984375, + "learning_rate": 6.662100456621003e-07, + "loss": 0.0003, + "reward": 1.2265625, + "reward_std": 0.03234682232141495, + "rewards/accuracy_reward": 0.2265625, + "rewards/format_reward": 1.0, + "step": 731 + }, + { + "clip_ratio": 0.0, + "completion_length": 482.75, + "epoch": 1.0027397260273974, + "grad_norm": 3.7406272888183594, + "kl": 0.3173828125, + "learning_rate": 6.657534246575343e-07, + "loss": 0.0003, + "reward": 1.0, + "reward_std": 0.2925042062997818, + "rewards/accuracy_reward": 0.0625, + "rewards/format_reward": 0.9375, + "step": 732 + }, + { + "clip_ratio": 0.0, + "completion_length": 407.84375, + "epoch": 1.0041095890410958, + "grad_norm": 1.7799715995788574, + "kl": 0.34912109375, + "learning_rate": 6.65296803652968e-07, + "loss": 0.0003, + "reward": 1.484375, + "reward_std": 0.22097086161375046, + "rewards/accuracy_reward": 0.484375, + "rewards/format_reward": 1.0, + "step": 733 + }, + { + "clip_ratio": 0.0, + "completion_length": 498.59375, + "epoch": 1.0054794520547945, + "grad_norm": 6.9609904289245605, + "kl": 0.34375, + "learning_rate": 6.648401826484019e-07, + "loss": 0.0003, + "reward": 1.8333333134651184, + "reward_std": 0.17943118885159492, + "rewards/accuracy_reward": 0.8333333134651184, + "rewards/format_reward": 1.0, + "step": 734 + }, + { + "clip_ratio": 0.0, + "completion_length": 586.25, + "epoch": 1.0068493150684932, + "grad_norm": 2.033268928527832, + "kl": 0.3408203125, + "learning_rate": 6.643835616438356e-07, + "loss": 0.0003, + "reward": 1.7291666269302368, + "reward_std": 0.1323188804090023, + "rewards/accuracy_reward": 0.7291666567325592, + "rewards/format_reward": 1.0, + "step": 735 + }, + { + "clip_ratio": 0.0, + "completion_length": 522.78125, + "epoch": 1.0082191780821919, + "grad_norm": 1.8167824745178223, + "kl": 0.35107421875, + "learning_rate": 6.639269406392693e-07, + "loss": 0.0004, + "reward": 1.81640625, + "reward_std": 0.2947664186358452, + "rewards/accuracy_reward": 0.81640625, + "rewards/format_reward": 1.0, + "step": 736 + }, + { + "clip_ratio": 0.0, + "completion_length": 311.4375, + "epoch": 1.0095890410958903, + "grad_norm": 8.44912338256836, + "kl": 0.37841796875, + "learning_rate": 6.634703196347032e-07, + "loss": 0.0004, + "reward": 1.4609375, + "reward_std": 0.3800046369433403, + "rewards/accuracy_reward": 0.4609375, + "rewards/format_reward": 1.0, + "step": 737 + }, + { + "clip_ratio": 0.0, + "completion_length": 499.15625, + "epoch": 1.010958904109589, + "grad_norm": 2.3125061988830566, + "kl": 0.34716796875, + "learning_rate": 6.63013698630137e-07, + "loss": 0.0003, + "reward": 1.7330728769302368, + "reward_std": 0.23545794188976288, + "rewards/accuracy_reward": 0.7643229067325592, + "rewards/format_reward": 0.96875, + "step": 738 + }, + { + "clip_ratio": 0.0, + "completion_length": 435.9375, + "epoch": 1.0123287671232877, + "grad_norm": 3.3824410438537598, + "kl": 0.34521484375, + "learning_rate": 6.625570776255707e-07, + "loss": 0.0003, + "reward": 1.453125, + "reward_std": 0.1173202246427536, + "rewards/accuracy_reward": 0.453125, + "rewards/format_reward": 1.0, + "step": 739 + }, + { + "clip_ratio": 0.0, + "completion_length": 402.25, + "epoch": 1.0136986301369864, + "grad_norm": 2.3408203125, + "kl": 0.35205078125, + "learning_rate": 6.621004566210046e-07, + "loss": 0.0004, + "reward": 1.4609375, + "reward_std": 0.19531989470124245, + "rewards/accuracy_reward": 0.4609375, + "rewards/format_reward": 1.0, + "step": 740 + }, + { + "clip_ratio": 0.0, + "completion_length": 623.28125, + "epoch": 1.015068493150685, + "grad_norm": 2.0831456184387207, + "kl": 0.349609375, + "learning_rate": 6.616438356164383e-07, + "loss": 0.0003, + "reward": 1.7421875, + "reward_std": 0.09021057933568954, + "rewards/accuracy_reward": 0.7421875, + "rewards/format_reward": 1.0, + "step": 741 + }, + { + "clip_ratio": 0.0, + "completion_length": 312.03125, + "epoch": 1.0164383561643835, + "grad_norm": 2.4400479793548584, + "kl": 0.333984375, + "learning_rate": 6.61187214611872e-07, + "loss": 0.0003, + "reward": 1.5234375, + "reward_std": 0.22642776370048523, + "rewards/accuracy_reward": 0.5234375, + "rewards/format_reward": 1.0, + "step": 742 + }, + { + "clip_ratio": 0.0, + "completion_length": 515.15625, + "epoch": 1.0178082191780822, + "grad_norm": 1.4530420303344727, + "kl": 0.35107421875, + "learning_rate": 6.607305936073059e-07, + "loss": 0.0004, + "reward": 1.4765625, + "reward_std": 0.1708463504910469, + "rewards/accuracy_reward": 0.4765625, + "rewards/format_reward": 1.0, + "step": 743 + }, + { + "clip_ratio": 0.0, + "completion_length": 352.125, + "epoch": 1.0191780821917809, + "grad_norm": 3.7937586307525635, + "kl": 0.419921875, + "learning_rate": 6.602739726027396e-07, + "loss": 0.0004, + "reward": 1.375, + "reward_std": 0.1629730723798275, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 1.0, + "step": 744 + }, + { + "clip_ratio": 0.0, + "completion_length": 467.0, + "epoch": 1.0205479452054795, + "grad_norm": 1.1267149448394775, + "kl": 0.40283203125, + "learning_rate": 6.598173515981736e-07, + "loss": 0.0004, + "reward": 0.96875, + "reward_std": 0.0883883461356163, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.96875, + "step": 745 + }, + { + "clip_ratio": 0.0, + "completion_length": 455.9375, + "epoch": 1.021917808219178, + "grad_norm": 1.195306658744812, + "kl": 0.35009765625, + "learning_rate": 6.593607305936073e-07, + "loss": 0.0003, + "reward": 1.34375, + "reward_std": 0.0578637570142746, + "rewards/accuracy_reward": 0.34375, + "rewards/format_reward": 1.0, + "step": 746 + }, + { + "clip_ratio": 0.0, + "completion_length": 565.15625, + "epoch": 1.0232876712328767, + "grad_norm": 1.5478026866912842, + "kl": 0.3720703125, + "learning_rate": 6.58904109589041e-07, + "loss": 0.0004, + "reward": 1.703125, + "reward_std": 0.0646936446428299, + "rewards/accuracy_reward": 0.703125, + "rewards/format_reward": 1.0, + "step": 747 + }, + { + "clip_ratio": 0.0, + "completion_length": 431.1875, + "epoch": 1.0246575342465754, + "grad_norm": 2.914033889770508, + "kl": 0.33740234375, + "learning_rate": 6.584474885844749e-07, + "loss": 0.0003, + "reward": 1.86328125, + "reward_std": 0.5137732066214085, + "rewards/accuracy_reward": 0.89453125, + "rewards/format_reward": 0.96875, + "step": 748 + }, + { + "clip_ratio": 0.0, + "completion_length": 556.71875, + "epoch": 1.026027397260274, + "grad_norm": 1.969694972038269, + "kl": 0.3681640625, + "learning_rate": 6.579908675799086e-07, + "loss": 0.0004, + "reward": 1.6796875, + "reward_std": 0.11048541404306889, + "rewards/accuracy_reward": 0.6796875298023224, + "rewards/format_reward": 1.0, + "step": 749 + }, + { + "clip_ratio": 0.0, + "completion_length": 338.75, + "epoch": 1.0273972602739727, + "grad_norm": 5.528670787811279, + "kl": 0.39990234375, + "learning_rate": 6.575342465753423e-07, + "loss": 0.0004, + "reward": 1.6015625, + "reward_std": 0.17859892547130585, + "rewards/accuracy_reward": 0.6015625, + "rewards/format_reward": 1.0, + "step": 750 + }, + { + "clip_ratio": 0.0, + "completion_length": 428.46875, + "epoch": 1.0287671232876712, + "grad_norm": 4.909811973571777, + "kl": 0.39013671875, + "learning_rate": 6.570776255707762e-07, + "loss": 0.0004, + "reward": 1.640625, + "reward_std": 0.0776018276810646, + "rewards/accuracy_reward": 0.640625, + "rewards/format_reward": 1.0, + "step": 751 + }, + { + "clip_ratio": 0.0, + "completion_length": 455.4375, + "epoch": 1.0301369863013699, + "grad_norm": 5.930173397064209, + "kl": 0.35498046875, + "learning_rate": 6.5662100456621e-07, + "loss": 0.0004, + "reward": 1.7734375, + "reward_std": 0.2153516486287117, + "rewards/accuracy_reward": 0.7734375, + "rewards/format_reward": 1.0, + "step": 752 + }, + { + "clip_ratio": 0.0, + "completion_length": 577.5, + "epoch": 1.0315068493150685, + "grad_norm": 2.4682910442352295, + "kl": 0.3642578125, + "learning_rate": 6.561643835616439e-07, + "loss": 0.0004, + "reward": 1.7369791865348816, + "reward_std": 0.2828553803265095, + "rewards/accuracy_reward": 0.7682291865348816, + "rewards/format_reward": 0.96875, + "step": 753 + }, + { + "clip_ratio": 0.0, + "completion_length": 442.28125, + "epoch": 1.0328767123287672, + "grad_norm": 3.169569969177246, + "kl": 0.3759765625, + "learning_rate": 6.557077625570776e-07, + "loss": 0.0004, + "reward": 1.625, + "reward_std": 0.21937815099954605, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 754 + }, + { + "clip_ratio": 0.0, + "completion_length": 558.6875, + "epoch": 1.0342465753424657, + "grad_norm": 3.002521514892578, + "kl": 0.35986328125, + "learning_rate": 6.552511415525113e-07, + "loss": 0.0004, + "reward": 1.671875, + "reward_std": 0.22208165377378464, + "rewards/accuracy_reward": 0.671875, + "rewards/format_reward": 1.0, + "step": 755 + }, + { + "clip_ratio": 0.0, + "completion_length": 546.5625, + "epoch": 1.0356164383561643, + "grad_norm": 2.0301320552825928, + "kl": 0.36865234375, + "learning_rate": 6.547945205479452e-07, + "loss": 0.0004, + "reward": 1.8772321939468384, + "reward_std": 0.2395726516842842, + "rewards/accuracy_reward": 0.908482164144516, + "rewards/format_reward": 0.96875, + "step": 756 + }, + { + "clip_ratio": 0.0, + "completion_length": 335.46875, + "epoch": 1.036986301369863, + "grad_norm": 2.746330738067627, + "kl": 0.4189453125, + "learning_rate": 6.543378995433789e-07, + "loss": 0.0004, + "reward": 1.75, + "reward_std": 0.249358132481575, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 757 + }, + { + "clip_ratio": 0.0, + "completion_length": 454.34375, + "epoch": 1.0383561643835617, + "grad_norm": 2.1871116161346436, + "kl": 0.40625, + "learning_rate": 6.538812785388129e-07, + "loss": 0.0004, + "reward": 1.6692708134651184, + "reward_std": 0.11145787499845028, + "rewards/accuracy_reward": 0.6692708134651184, + "rewards/format_reward": 1.0, + "step": 758 + }, + { + "clip_ratio": 0.0, + "completion_length": 458.96875, + "epoch": 1.0397260273972604, + "grad_norm": 15.84890365600586, + "kl": 0.40673828125, + "learning_rate": 6.534246575342466e-07, + "loss": 0.0004, + "reward": 1.7109375, + "reward_std": 0.21829968504607677, + "rewards/accuracy_reward": 0.7109375, + "rewards/format_reward": 1.0, + "step": 759 + }, + { + "clip_ratio": 0.0, + "completion_length": 354.09375, + "epoch": 1.0410958904109588, + "grad_norm": 2.4201431274414062, + "kl": 0.37158203125, + "learning_rate": 6.529680365296803e-07, + "loss": 0.0004, + "reward": 1.5390625, + "reward_std": 0.18201877176761627, + "rewards/accuracy_reward": 0.5390625, + "rewards/format_reward": 1.0, + "step": 760 + }, + { + "clip_ratio": 0.0, + "completion_length": 325.375, + "epoch": 1.0424657534246575, + "grad_norm": 1.7726272344589233, + "kl": 0.4150390625, + "learning_rate": 6.525114155251142e-07, + "loss": 0.0004, + "reward": 1.1640625, + "reward_std": 0.03234682232141495, + "rewards/accuracy_reward": 0.1640625, + "rewards/format_reward": 1.0, + "step": 761 + }, + { + "clip_ratio": 0.0, + "completion_length": 406.59375, + "epoch": 1.0438356164383562, + "grad_norm": 3.5865135192871094, + "kl": 0.35986328125, + "learning_rate": 6.520547945205479e-07, + "loss": 0.0004, + "reward": 1.703125, + "reward_std": 0.36083250865340233, + "rewards/accuracy_reward": 0.703125, + "rewards/format_reward": 1.0, + "step": 762 + }, + { + "clip_ratio": 0.0, + "completion_length": 480.4375, + "epoch": 1.0452054794520549, + "grad_norm": 4.15688943862915, + "kl": 0.37939453125, + "learning_rate": 6.515981735159816e-07, + "loss": 0.0004, + "reward": 1.78125, + "reward_std": 0.0883883461356163, + "rewards/accuracy_reward": 0.78125, + "rewards/format_reward": 1.0, + "step": 763 + }, + { + "clip_ratio": 0.0, + "completion_length": 525.5625, + "epoch": 1.0465753424657533, + "grad_norm": 1.4329558610916138, + "kl": 0.361328125, + "learning_rate": 6.511415525114155e-07, + "loss": 0.0004, + "reward": 1.5026041865348816, + "reward_std": 0.19893107935786247, + "rewards/accuracy_reward": 0.5338541716337204, + "rewards/format_reward": 0.96875, + "step": 764 + }, + { + "clip_ratio": 0.0, + "completion_length": 389.625, + "epoch": 1.047945205479452, + "grad_norm": 2.296790599822998, + "kl": 0.3896484375, + "learning_rate": 6.506849315068493e-07, + "loss": 0.0004, + "reward": 1.8984375, + "reward_std": 0.13098490238189697, + "rewards/accuracy_reward": 0.8984375, + "rewards/format_reward": 1.0, + "step": 765 + }, + { + "clip_ratio": 0.0, + "completion_length": 280.0625, + "epoch": 1.0493150684931507, + "grad_norm": 4.319180011749268, + "kl": 0.47119140625, + "learning_rate": 6.502283105022832e-07, + "loss": 0.0005, + "reward": 1.44921875, + "reward_std": 0.29051198065280914, + "rewards/accuracy_reward": 0.44921875, + "rewards/format_reward": 1.0, + "step": 766 + }, + { + "clip_ratio": 0.0, + "completion_length": 392.8125, + "epoch": 1.0506849315068494, + "grad_norm": 1.033851146697998, + "kl": 0.45166015625, + "learning_rate": 6.497716894977169e-07, + "loss": 0.0005, + "reward": 1.375, + "reward_std": 0.1602174937725067, + "rewards/accuracy_reward": 0.40625, + "rewards/format_reward": 0.96875, + "step": 767 + }, + { + "clip_ratio": 0.0, + "completion_length": 514.625, + "epoch": 1.0520547945205478, + "grad_norm": 3.8175528049468994, + "kl": 0.408203125, + "learning_rate": 6.493150684931506e-07, + "loss": 0.0004, + "reward": 1.3125, + "reward_std": 0.1872510462999344, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 1.0, + "step": 768 + }, + { + "clip_ratio": 0.0, + "completion_length": 501.90625, + "epoch": 1.0534246575342465, + "grad_norm": 3.4146058559417725, + "kl": 0.37744140625, + "learning_rate": 6.488584474885845e-07, + "loss": 0.0004, + "reward": 1.46875, + "reward_std": 0.3107999712228775, + "rewards/accuracy_reward": 0.53125, + "rewards/format_reward": 0.9375, + "step": 769 + }, + { + "clip_ratio": 0.0, + "completion_length": 364.4375, + "epoch": 1.0547945205479452, + "grad_norm": 2.580096960067749, + "kl": 0.390625, + "learning_rate": 6.484018264840182e-07, + "loss": 0.0004, + "reward": 1.5703125, + "reward_std": 0.11353681609034538, + "rewards/accuracy_reward": 0.5703125, + "rewards/format_reward": 1.0, + "step": 770 + }, + { + "clip_ratio": 0.0, + "completion_length": 299.6875, + "epoch": 1.0561643835616439, + "grad_norm": 3.621427297592163, + "kl": 0.4423828125, + "learning_rate": 6.479452054794519e-07, + "loss": 0.0004, + "reward": 1.09375, + "reward_std": 0.2041158601641655, + "rewards/accuracy_reward": 0.09375, + "rewards/format_reward": 1.0, + "step": 771 + }, + { + "clip_ratio": 0.0, + "completion_length": 444.125, + "epoch": 1.0575342465753426, + "grad_norm": 6.84398889541626, + "kl": 0.5302734375, + "learning_rate": 6.474885844748859e-07, + "loss": 0.0005, + "reward": 1.328125, + "reward_std": 0.30537302792072296, + "rewards/accuracy_reward": 0.359375, + "rewards/format_reward": 0.96875, + "step": 772 + }, + { + "clip_ratio": 0.0, + "completion_length": 696.125, + "epoch": 1.058904109589041, + "grad_norm": 2.4137730598449707, + "kl": 0.3642578125, + "learning_rate": 6.470319634703196e-07, + "loss": 0.0004, + "reward": 1.65625, + "reward_std": 0.5298669151961803, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 0.90625, + "step": 773 + }, + { + "clip_ratio": 0.0, + "completion_length": 552.1875, + "epoch": 1.0602739726027397, + "grad_norm": 2.266726493835449, + "kl": 0.36181640625, + "learning_rate": 6.465753424657535e-07, + "loss": 0.0004, + "reward": 1.390625, + "reward_std": 0.2643740847706795, + "rewards/accuracy_reward": 0.453125, + "rewards/format_reward": 0.9375, + "step": 774 + }, + { + "clip_ratio": 0.0, + "completion_length": 521.53125, + "epoch": 1.0616438356164384, + "grad_norm": 2.0069427490234375, + "kl": 0.39306640625, + "learning_rate": 6.461187214611872e-07, + "loss": 0.0004, + "reward": 1.6328125, + "reward_std": 0.20269311219453812, + "rewards/accuracy_reward": 0.6640625, + "rewards/format_reward": 0.96875, + "step": 775 + }, + { + "clip_ratio": 0.0, + "completion_length": 539.0625, + "epoch": 1.063013698630137, + "grad_norm": 2.6422464847564697, + "kl": 0.35205078125, + "learning_rate": 6.456621004566209e-07, + "loss": 0.0004, + "reward": 1.3671875, + "reward_std": 0.08891239576041698, + "rewards/accuracy_reward": 0.3671875, + "rewards/format_reward": 1.0, + "step": 776 + }, + { + "clip_ratio": 0.0, + "completion_length": 396.0, + "epoch": 1.0643835616438355, + "grad_norm": 2.95044207572937, + "kl": 0.4296875, + "learning_rate": 6.452054794520548e-07, + "loss": 0.0004, + "reward": 1.21875, + "reward_std": 0.2651650384068489, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 0.96875, + "step": 777 + }, + { + "clip_ratio": 0.0, + "completion_length": 618.21875, + "epoch": 1.0657534246575342, + "grad_norm": 3.2683823108673096, + "kl": 0.33740234375, + "learning_rate": 6.447488584474886e-07, + "loss": 0.0003, + "reward": 1.296875, + "reward_std": 0.7756812795996666, + "rewards/accuracy_reward": 0.578125, + "rewards/format_reward": 0.71875, + "step": 778 + }, + { + "clip_ratio": 0.0, + "completion_length": 453.40625, + "epoch": 1.0671232876712329, + "grad_norm": 3.376239061355591, + "kl": 0.38623046875, + "learning_rate": 6.442922374429223e-07, + "loss": 0.0004, + "reward": 1.4140625, + "reward_std": 0.5326974391937256, + "rewards/accuracy_reward": 0.4765625, + "rewards/format_reward": 0.9375, + "step": 779 + }, + { + "clip_ratio": 0.0, + "completion_length": 429.84375, + "epoch": 1.0684931506849316, + "grad_norm": 0.967551052570343, + "kl": 0.42138671875, + "learning_rate": 6.438356164383562e-07, + "loss": 0.0004, + "reward": 1.40625, + "reward_std": 0.1735912710428238, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 0.96875, + "step": 780 + }, + { + "clip_ratio": 0.0, + "completion_length": 412.125, + "epoch": 1.0698630136986302, + "grad_norm": 7.696181297302246, + "kl": 0.38427734375, + "learning_rate": 6.433789954337899e-07, + "loss": 0.0004, + "reward": 1.265625, + "reward_std": 0.1446593925356865, + "rewards/accuracy_reward": 0.265625, + "rewards/format_reward": 1.0, + "step": 781 + }, + { + "clip_ratio": 0.0, + "completion_length": 382.5625, + "epoch": 1.0712328767123287, + "grad_norm": 3.0563812255859375, + "kl": 0.4501953125, + "learning_rate": 6.429223744292238e-07, + "loss": 0.0005, + "reward": 1.640625, + "reward_std": 0.17358146235346794, + "rewards/accuracy_reward": 0.640625, + "rewards/format_reward": 1.0, + "step": 782 + }, + { + "clip_ratio": 0.0, + "completion_length": 391.96875, + "epoch": 1.0726027397260274, + "grad_norm": 0.024443048983812332, + "kl": 0.388671875, + "learning_rate": 6.424657534246575e-07, + "loss": 0.0004, + "reward": 1.25, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 1.0, + "step": 783 + }, + { + "clip_ratio": 0.0, + "completion_length": 412.40625, + "epoch": 1.073972602739726, + "grad_norm": 1.1630728244781494, + "kl": 0.365234375, + "learning_rate": 6.420091324200912e-07, + "loss": 0.0004, + "reward": 1.484375, + "reward_std": 0.0289318785071373, + "rewards/accuracy_reward": 0.484375, + "rewards/format_reward": 1.0, + "step": 784 + }, + { + "clip_ratio": 0.0, + "completion_length": 538.1875, + "epoch": 1.0753424657534247, + "grad_norm": 3.8575427532196045, + "kl": 0.38916015625, + "learning_rate": 6.415525114155252e-07, + "loss": 0.0004, + "reward": 1.375, + "reward_std": 0.4482543617486954, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 0.9375, + "step": 785 + }, + { + "clip_ratio": 0.0, + "completion_length": 401.6875, + "epoch": 1.0767123287671232, + "grad_norm": 3.8265440464019775, + "kl": 0.37060546875, + "learning_rate": 6.410958904109589e-07, + "loss": 0.0004, + "reward": 1.3984375, + "reward_std": 0.2753252908587456, + "rewards/accuracy_reward": 0.3984375, + "rewards/format_reward": 1.0, + "step": 786 + }, + { + "clip_ratio": 0.0, + "completion_length": 389.5625, + "epoch": 1.0780821917808219, + "grad_norm": 10.559552192687988, + "kl": 0.38232421875, + "learning_rate": 6.406392694063926e-07, + "loss": 0.0004, + "reward": 1.4765625, + "reward_std": 0.1541428230702877, + "rewards/accuracy_reward": 0.4765625, + "rewards/format_reward": 1.0, + "step": 787 + }, + { + "clip_ratio": 0.0, + "completion_length": 516.8125, + "epoch": 1.0794520547945206, + "grad_norm": 2.3642754554748535, + "kl": 0.35791015625, + "learning_rate": 6.401826484018265e-07, + "loss": 0.0004, + "reward": 1.40625, + "reward_std": 0.10888781771063805, + "rewards/accuracy_reward": 0.40625, + "rewards/format_reward": 1.0, + "step": 788 + }, + { + "clip_ratio": 0.0, + "completion_length": 597.8125, + "epoch": 1.0808219178082192, + "grad_norm": 2.5423686504364014, + "kl": 0.3876953125, + "learning_rate": 6.397260273972602e-07, + "loss": 0.0004, + "reward": 1.8723958432674408, + "reward_std": 0.20485981926321983, + "rewards/accuracy_reward": 0.8723958283662796, + "rewards/format_reward": 1.0, + "step": 789 + }, + { + "clip_ratio": 0.0, + "completion_length": 626.28125, + "epoch": 1.0821917808219177, + "grad_norm": 0.012947800569236279, + "kl": 0.38232421875, + "learning_rate": 6.39269406392694e-07, + "loss": 0.0004, + "reward": 1.75, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 790 + }, + { + "clip_ratio": 0.0, + "completion_length": 514.875, + "epoch": 1.0835616438356164, + "grad_norm": 2.922663688659668, + "kl": 0.38818359375, + "learning_rate": 6.388127853881278e-07, + "loss": 0.0004, + "reward": 2.0546875, + "reward_std": 0.2646155208349228, + "rewards/accuracy_reward": 1.0546875, + "rewards/format_reward": 1.0, + "step": 791 + }, + { + "clip_ratio": 0.0, + "completion_length": 636.09375, + "epoch": 1.084931506849315, + "grad_norm": 1.6584970951080322, + "kl": 0.4111328125, + "learning_rate": 6.383561643835616e-07, + "loss": 0.0004, + "reward": 1.84375, + "reward_std": 0.19999710842967033, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 0.96875, + "step": 792 + }, + { + "clip_ratio": 0.0, + "completion_length": 349.8125, + "epoch": 1.0863013698630137, + "grad_norm": 1.3865472078323364, + "kl": 0.39599609375, + "learning_rate": 6.378995433789955e-07, + "loss": 0.0004, + "reward": 1.2135416567325592, + "reward_std": 0.026702916249632835, + "rewards/accuracy_reward": 0.2135416567325592, + "rewards/format_reward": 1.0, + "step": 793 + }, + { + "clip_ratio": 0.0, + "completion_length": 559.65625, + "epoch": 1.0876712328767124, + "grad_norm": 9.972989082336426, + "kl": 0.33984375, + "learning_rate": 6.374429223744292e-07, + "loss": 0.0003, + "reward": 1.453125, + "reward_std": 0.3874029070138931, + "rewards/accuracy_reward": 0.546875, + "rewards/format_reward": 0.90625, + "step": 794 + }, + { + "clip_ratio": 0.0, + "completion_length": 418.5, + "epoch": 1.0890410958904109, + "grad_norm": 3.65055513381958, + "kl": 0.39306640625, + "learning_rate": 6.369863013698629e-07, + "loss": 0.0004, + "reward": 1.734375, + "reward_std": 0.33669837564229965, + "rewards/accuracy_reward": 0.765625, + "rewards/format_reward": 0.96875, + "step": 795 + }, + { + "clip_ratio": 0.0, + "completion_length": 385.75, + "epoch": 1.0904109589041096, + "grad_norm": 7.232997417449951, + "kl": 0.38671875, + "learning_rate": 6.365296803652968e-07, + "loss": 0.0004, + "reward": 1.4375, + "reward_std": 0.23827510699629784, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 796 + }, + { + "clip_ratio": 0.0, + "completion_length": 293.34375, + "epoch": 1.0917808219178082, + "grad_norm": 4.632352828979492, + "kl": 0.39208984375, + "learning_rate": 6.360730593607305e-07, + "loss": 0.0004, + "reward": 1.578125, + "reward_std": 0.24831003323197365, + "rewards/accuracy_reward": 0.578125, + "rewards/format_reward": 1.0, + "step": 797 + }, + { + "clip_ratio": 0.0, + "completion_length": 502.90625, + "epoch": 1.093150684931507, + "grad_norm": 2.5987162590026855, + "kl": 0.3681640625, + "learning_rate": 6.356164383561645e-07, + "loss": 0.0004, + "reward": 1.640625, + "reward_std": 0.3057369217276573, + "rewards/accuracy_reward": 0.640625, + "rewards/format_reward": 1.0, + "step": 798 + }, + { + "clip_ratio": 0.0, + "completion_length": 623.25, + "epoch": 1.0945205479452054, + "grad_norm": 2.42681884765625, + "kl": 0.380859375, + "learning_rate": 6.351598173515982e-07, + "loss": 0.0004, + "reward": 1.953125, + "reward_std": 0.2366182692348957, + "rewards/accuracy_reward": 0.953125, + "rewards/format_reward": 1.0, + "step": 799 + }, + { + "clip_ratio": 0.0, + "completion_length": 608.6875, + "epoch": 1.095890410958904, + "grad_norm": 2.0286333560943604, + "kl": 0.3779296875, + "learning_rate": 6.347031963470319e-07, + "loss": 0.0004, + "reward": 1.6484375, + "reward_std": 0.13488983362913132, + "rewards/accuracy_reward": 0.6484375, + "rewards/format_reward": 1.0, + "step": 800 + }, + { + "clip_ratio": 0.0, + "completion_length": 397.625, + "epoch": 1.0972602739726027, + "grad_norm": 3.4189932346343994, + "kl": 0.36962890625, + "learning_rate": 6.342465753424658e-07, + "loss": 0.0004, + "reward": 1.21875, + "reward_std": 0.1552036553621292, + "rewards/accuracy_reward": 0.21875, + "rewards/format_reward": 1.0, + "step": 801 + }, + { + "clip_ratio": 0.0, + "completion_length": 670.96875, + "epoch": 1.0986301369863014, + "grad_norm": 2.2040603160858154, + "kl": 0.39013671875, + "learning_rate": 6.337899543378995e-07, + "loss": 0.0004, + "reward": 1.7276785373687744, + "reward_std": 0.21270384266972542, + "rewards/accuracy_reward": 0.7589285373687744, + "rewards/format_reward": 0.96875, + "step": 802 + }, + { + "clip_ratio": 0.0, + "completion_length": 318.96875, + "epoch": 1.1, + "grad_norm": 2.4739291667938232, + "kl": 0.35107421875, + "learning_rate": 6.333333333333332e-07, + "loss": 0.0004, + "reward": 1.25, + "reward_std": 0.1462521031498909, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 1.0, + "step": 803 + }, + { + "clip_ratio": 0.0, + "completion_length": 524.8125, + "epoch": 1.1013698630136985, + "grad_norm": 3.376389980316162, + "kl": 0.39453125, + "learning_rate": 6.328767123287671e-07, + "loss": 0.0004, + "reward": 2.2265625, + "reward_std": 0.40516645461320877, + "rewards/accuracy_reward": 1.2265625, + "rewards/format_reward": 1.0, + "step": 804 + }, + { + "clip_ratio": 0.0, + "completion_length": 271.3125, + "epoch": 1.1027397260273972, + "grad_norm": 3.8003089427948, + "kl": 0.36767578125, + "learning_rate": 6.324200913242009e-07, + "loss": 0.0004, + "reward": 1.6328125, + "reward_std": 0.23646268248558044, + "rewards/accuracy_reward": 0.6328125, + "rewards/format_reward": 1.0, + "step": 805 + }, + { + "clip_ratio": 0.0, + "completion_length": 488.375, + "epoch": 1.104109589041096, + "grad_norm": 0.9171125888824463, + "kl": 0.39794921875, + "learning_rate": 6.319634703196348e-07, + "loss": 0.0004, + "reward": 1.234375, + "reward_std": 0.0289318785071373, + "rewards/accuracy_reward": 0.234375, + "rewards/format_reward": 1.0, + "step": 806 + }, + { + "clip_ratio": 0.0, + "completion_length": 602.375, + "epoch": 1.1054794520547946, + "grad_norm": 2.2937474250793457, + "kl": 0.46240234375, + "learning_rate": 6.315068493150685e-07, + "loss": 0.0005, + "reward": 1.953125, + "reward_std": 0.22367356345057487, + "rewards/accuracy_reward": 0.953125, + "rewards/format_reward": 1.0, + "step": 807 + }, + { + "clip_ratio": 0.0, + "completion_length": 499.25, + "epoch": 1.106849315068493, + "grad_norm": 1.2261008024215698, + "kl": 0.39794921875, + "learning_rate": 6.310502283105022e-07, + "loss": 0.0004, + "reward": 1.6875, + "reward_std": 0.06681530922651291, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 1.0, + "step": 808 + }, + { + "clip_ratio": 0.0, + "completion_length": 574.65625, + "epoch": 1.1082191780821917, + "grad_norm": 10.46350383758545, + "kl": 0.41455078125, + "learning_rate": 6.305936073059361e-07, + "loss": 0.0004, + "reward": 1.5546875, + "reward_std": 0.19728106819093227, + "rewards/accuracy_reward": 0.5859375, + "rewards/format_reward": 0.96875, + "step": 809 + }, + { + "clip_ratio": 0.0, + "completion_length": 393.0625, + "epoch": 1.1095890410958904, + "grad_norm": 0.021392615512013435, + "kl": 0.39013671875, + "learning_rate": 6.301369863013698e-07, + "loss": 0.0004, + "reward": 1.25, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 1.0, + "step": 810 + }, + { + "clip_ratio": 0.0, + "completion_length": 345.0, + "epoch": 1.110958904109589, + "grad_norm": 7.004191875457764, + "kl": 0.33984375, + "learning_rate": 6.296803652968035e-07, + "loss": 0.0003, + "reward": 1.03125, + "reward_std": 0.0883883461356163, + "rewards/accuracy_reward": 0.03125, + "rewards/format_reward": 1.0, + "step": 811 + }, + { + "clip_ratio": 0.0, + "completion_length": 397.21875, + "epoch": 1.1123287671232878, + "grad_norm": 1.1587871313095093, + "kl": 0.38330078125, + "learning_rate": 6.292237442922375e-07, + "loss": 0.0004, + "reward": 1.21875, + "reward_std": 0.0578637570142746, + "rewards/accuracy_reward": 0.21875, + "rewards/format_reward": 1.0, + "step": 812 + }, + { + "clip_ratio": 0.0, + "completion_length": 497.53125, + "epoch": 1.1136986301369862, + "grad_norm": 1.5848588943481445, + "kl": 0.42041015625, + "learning_rate": 6.287671232876712e-07, + "loss": 0.0004, + "reward": 1.9352678656578064, + "reward_std": 0.08208167925477028, + "rewards/accuracy_reward": 0.9352678507566452, + "rewards/format_reward": 1.0, + "step": 813 + }, + { + "clip_ratio": 0.0, + "completion_length": 485.90625, + "epoch": 1.115068493150685, + "grad_norm": 2.042673349380493, + "kl": 0.3955078125, + "learning_rate": 6.283105022831051e-07, + "loss": 0.0004, + "reward": 1.6822916865348816, + "reward_std": 0.031000984832644463, + "rewards/accuracy_reward": 0.6822916716337204, + "rewards/format_reward": 1.0, + "step": 814 + }, + { + "clip_ratio": 0.0, + "completion_length": 400.625, + "epoch": 1.1164383561643836, + "grad_norm": 3.0837056636810303, + "kl": 0.3740234375, + "learning_rate": 6.278538812785388e-07, + "loss": 0.0004, + "reward": 1.296875, + "reward_std": 0.15992168709635735, + "rewards/accuracy_reward": 0.296875, + "rewards/format_reward": 1.0, + "step": 815 + }, + { + "clip_ratio": 0.0, + "completion_length": 577.84375, + "epoch": 1.1178082191780823, + "grad_norm": 2.6489899158477783, + "kl": 0.4033203125, + "learning_rate": 6.273972602739725e-07, + "loss": 0.0004, + "reward": 1.66015625, + "reward_std": 0.12530867382884026, + "rewards/accuracy_reward": 0.66015625, + "rewards/format_reward": 1.0, + "step": 816 + }, + { + "clip_ratio": 0.0, + "completion_length": 481.25, + "epoch": 1.1191780821917807, + "grad_norm": 2.778820514678955, + "kl": 0.412109375, + "learning_rate": 6.269406392694064e-07, + "loss": 0.0004, + "reward": 1.6093749403953552, + "reward_std": 0.2615504954010248, + "rewards/accuracy_reward": 0.6406249850988388, + "rewards/format_reward": 0.96875, + "step": 817 + }, + { + "clip_ratio": 0.0, + "completion_length": 487.5, + "epoch": 1.1205479452054794, + "grad_norm": 2.434713363647461, + "kl": 0.39013671875, + "learning_rate": 6.264840182648402e-07, + "loss": 0.0004, + "reward": 1.421875, + "reward_std": 0.15992168709635735, + "rewards/accuracy_reward": 0.421875, + "rewards/format_reward": 1.0, + "step": 818 + }, + { + "clip_ratio": 0.0, + "completion_length": 287.0625, + "epoch": 1.121917808219178, + "grad_norm": 5.153371334075928, + "kl": 0.4091796875, + "learning_rate": 6.260273972602739e-07, + "loss": 0.0004, + "reward": 1.796875, + "reward_std": 0.2057085707783699, + "rewards/accuracy_reward": 0.796875, + "rewards/format_reward": 1.0, + "step": 819 + }, + { + "clip_ratio": 0.0, + "completion_length": 396.0625, + "epoch": 1.1232876712328768, + "grad_norm": 1.432228922843933, + "kl": 0.37451171875, + "learning_rate": 6.255707762557078e-07, + "loss": 0.0004, + "reward": 1.2421875, + "reward_std": 0.022097086533904076, + "rewards/accuracy_reward": 0.2734375, + "rewards/format_reward": 0.96875, + "step": 820 + }, + { + "clip_ratio": 0.0, + "completion_length": 596.5625, + "epoch": 1.1246575342465754, + "grad_norm": 0.9751577377319336, + "kl": 0.396484375, + "learning_rate": 6.251141552511415e-07, + "loss": 0.0004, + "reward": 1.484375, + "reward_std": 0.1966201364994049, + "rewards/accuracy_reward": 0.515625, + "rewards/format_reward": 0.96875, + "step": 821 + }, + { + "clip_ratio": 0.0, + "completion_length": 565.125, + "epoch": 1.126027397260274, + "grad_norm": 1.529309868812561, + "kl": 0.3896484375, + "learning_rate": 6.246575342465754e-07, + "loss": 0.0004, + "reward": 1.4140625, + "reward_std": 0.0810895636677742, + "rewards/accuracy_reward": 0.4140625, + "rewards/format_reward": 1.0, + "step": 822 + }, + { + "clip_ratio": 0.0, + "completion_length": 498.65625, + "epoch": 1.1273972602739726, + "grad_norm": 2.0530521869659424, + "kl": 0.404296875, + "learning_rate": 6.242009132420091e-07, + "loss": 0.0004, + "reward": 1.796875, + "reward_std": 0.24911179021000862, + "rewards/accuracy_reward": 0.828125, + "rewards/format_reward": 0.96875, + "step": 823 + }, + { + "clip_ratio": 0.0, + "completion_length": 617.65625, + "epoch": 1.1287671232876713, + "grad_norm": 1.90122389793396, + "kl": 0.376953125, + "learning_rate": 6.237442922374428e-07, + "loss": 0.0004, + "reward": 1.5924479365348816, + "reward_std": 0.15607357770204544, + "rewards/accuracy_reward": 0.5924479216337204, + "rewards/format_reward": 1.0, + "step": 824 + }, + { + "clip_ratio": 0.0, + "completion_length": 559.5, + "epoch": 1.13013698630137, + "grad_norm": 2.433654546737671, + "kl": 0.3994140625, + "learning_rate": 6.232876712328768e-07, + "loss": 0.0004, + "reward": 1.9285714030265808, + "reward_std": 0.07229206152260303, + "rewards/accuracy_reward": 0.9285714030265808, + "rewards/format_reward": 1.0, + "step": 825 + }, + { + "clip_ratio": 0.0, + "completion_length": 371.78125, + "epoch": 1.1315068493150684, + "grad_norm": 1.0846009254455566, + "kl": 0.40673828125, + "learning_rate": 6.228310502283105e-07, + "loss": 0.0004, + "reward": 1.390625, + "reward_std": 0.08010874688625336, + "rewards/accuracy_reward": 0.390625, + "rewards/format_reward": 1.0, + "step": 826 + }, + { + "clip_ratio": 0.0, + "completion_length": 353.375, + "epoch": 1.132876712328767, + "grad_norm": 3.237276554107666, + "kl": 0.42041015625, + "learning_rate": 6.223744292237442e-07, + "loss": 0.0004, + "reward": 1.609375, + "reward_std": 0.24567490443587303, + "rewards/accuracy_reward": 0.640625, + "rewards/format_reward": 0.96875, + "step": 827 + }, + { + "clip_ratio": 0.0, + "completion_length": 261.875, + "epoch": 1.1342465753424658, + "grad_norm": 1.206854224205017, + "kl": 0.37841796875, + "learning_rate": 6.219178082191781e-07, + "loss": 0.0004, + "reward": 1.1640625, + "reward_std": 0.046501487493515015, + "rewards/accuracy_reward": 0.1640625, + "rewards/format_reward": 1.0, + "step": 828 + }, + { + "clip_ratio": 0.0, + "completion_length": 397.8125, + "epoch": 1.1356164383561644, + "grad_norm": 0.02847045287489891, + "kl": 0.41259765625, + "learning_rate": 6.214611872146118e-07, + "loss": 0.0004, + "reward": 1.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 829 + }, + { + "clip_ratio": 0.0, + "completion_length": 459.15625, + "epoch": 1.1369863013698631, + "grad_norm": 9.929925918579102, + "kl": 0.390625, + "learning_rate": 6.210045662100457e-07, + "loss": 0.0004, + "reward": 1.8880208730697632, + "reward_std": 0.16971025243401527, + "rewards/accuracy_reward": 0.8880208432674408, + "rewards/format_reward": 1.0, + "step": 830 + }, + { + "clip_ratio": 0.0, + "completion_length": 378.3125, + "epoch": 1.1383561643835616, + "grad_norm": 1.0134655237197876, + "kl": 0.404296875, + "learning_rate": 6.205479452054794e-07, + "loss": 0.0004, + "reward": 1.4375, + "reward_std": 0.06681530922651291, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 831 + }, + { + "clip_ratio": 0.0, + "completion_length": 472.71875, + "epoch": 1.1397260273972603, + "grad_norm": 1.1163495779037476, + "kl": 0.39990234375, + "learning_rate": 6.200913242009132e-07, + "loss": 0.0004, + "reward": 1.4375, + "reward_std": 0.09449111670255661, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 832 + }, + { + "clip_ratio": 0.0, + "completion_length": 301.1875, + "epoch": 1.141095890410959, + "grad_norm": 2.4487106800079346, + "kl": 0.40185546875, + "learning_rate": 6.196347031963471e-07, + "loss": 0.0004, + "reward": 1.46875, + "reward_std": 0.0883883461356163, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.96875, + "step": 833 + }, + { + "clip_ratio": 0.0, + "completion_length": 516.15625, + "epoch": 1.1424657534246576, + "grad_norm": 8.19469165802002, + "kl": 0.376953125, + "learning_rate": 6.191780821917808e-07, + "loss": 0.0004, + "reward": 1.8093750476837158, + "reward_std": 0.04783010669052601, + "rewards/accuracy_reward": 0.809374988079071, + "rewards/format_reward": 1.0, + "step": 834 + }, + { + "clip_ratio": 0.0, + "completion_length": 454.21875, + "epoch": 1.143835616438356, + "grad_norm": 0.01866794563829899, + "kl": 0.4033203125, + "learning_rate": 6.187214611872145e-07, + "loss": 0.0004, + "reward": 1.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 835 + }, + { + "clip_ratio": 0.0, + "completion_length": 570.21875, + "epoch": 1.1452054794520548, + "grad_norm": 0.9545954465866089, + "kl": 0.41162109375, + "learning_rate": 6.182648401826484e-07, + "loss": 0.0004, + "reward": 2.1468749940395355, + "reward_std": 0.048065248876810074, + "rewards/accuracy_reward": 1.1468749940395355, + "rewards/format_reward": 1.0, + "step": 836 + }, + { + "clip_ratio": 0.0, + "completion_length": 488.8125, + "epoch": 1.1465753424657534, + "grad_norm": 2.1958019733428955, + "kl": 0.40283203125, + "learning_rate": 6.178082191780821e-07, + "loss": 0.0004, + "reward": 1.8671875, + "reward_std": 0.20912351086735725, + "rewards/accuracy_reward": 0.8671875, + "rewards/format_reward": 1.0, + "step": 837 + }, + { + "clip_ratio": 0.0, + "completion_length": 416.125, + "epoch": 1.1479452054794521, + "grad_norm": 1.0978032350540161, + "kl": 0.3486328125, + "learning_rate": 6.173515981735161e-07, + "loss": 0.0003, + "reward": 1.44921875, + "reward_std": 0.0817250907421112, + "rewards/accuracy_reward": 0.44921875, + "rewards/format_reward": 1.0, + "step": 838 + }, + { + "clip_ratio": 0.0, + "completion_length": 484.03125, + "epoch": 1.1493150684931508, + "grad_norm": 0.019236719235777855, + "kl": 0.4033203125, + "learning_rate": 6.168949771689498e-07, + "loss": 0.0004, + "reward": 1.625, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 839 + }, + { + "clip_ratio": 0.0, + "completion_length": 448.65625, + "epoch": 1.1506849315068493, + "grad_norm": 0.01787232607603073, + "kl": 0.431640625, + "learning_rate": 6.164383561643835e-07, + "loss": 0.0004, + "reward": 1.4375, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 840 + }, + { + "clip_ratio": 0.0, + "completion_length": 390.53125, + "epoch": 1.152054794520548, + "grad_norm": 5.282984733581543, + "kl": 0.40185546875, + "learning_rate": 6.159817351598174e-07, + "loss": 0.0004, + "reward": 1.734375, + "reward_std": 0.22097086533904076, + "rewards/accuracy_reward": 0.734375, + "rewards/format_reward": 1.0, + "step": 841 + }, + { + "clip_ratio": 0.0, + "completion_length": 537.6875, + "epoch": 1.1534246575342466, + "grad_norm": 0.8666876554489136, + "kl": 0.39599609375, + "learning_rate": 6.155251141552511e-07, + "loss": 0.0004, + "reward": 1.65625, + "reward_std": 0.11410887539386749, + "rewards/accuracy_reward": 0.65625, + "rewards/format_reward": 1.0, + "step": 842 + }, + { + "clip_ratio": 0.0, + "completion_length": 270.90625, + "epoch": 1.1547945205479453, + "grad_norm": 2.4815289974212646, + "kl": 0.43505859375, + "learning_rate": 6.150684931506848e-07, + "loss": 0.0004, + "reward": 1.3984375, + "reward_std": 0.2441160511225462, + "rewards/accuracy_reward": 0.3984375, + "rewards/format_reward": 1.0, + "step": 843 + }, + { + "clip_ratio": 0.0, + "completion_length": 360.84375, + "epoch": 1.1561643835616437, + "grad_norm": 8.32512378692627, + "kl": 0.37744140625, + "learning_rate": 6.146118721461187e-07, + "loss": 0.0004, + "reward": 1.6875, + "reward_std": 0.1356339044868946, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 1.0, + "step": 844 + }, + { + "clip_ratio": 0.0, + "completion_length": 637.96875, + "epoch": 1.1575342465753424, + "grad_norm": 5.9347052574157715, + "kl": 0.40380859375, + "learning_rate": 6.141552511415525e-07, + "loss": 0.0004, + "reward": 1.9375, + "reward_std": 0.19219962693750858, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 1.0, + "step": 845 + }, + { + "clip_ratio": 0.0, + "completion_length": 155.78125, + "epoch": 1.158904109589041, + "grad_norm": 2.557178020477295, + "kl": 0.35595703125, + "learning_rate": 6.136986301369864e-07, + "loss": 0.0004, + "reward": 1.28125, + "reward_std": 0.0883883461356163, + "rewards/accuracy_reward": 0.28125, + "rewards/format_reward": 1.0, + "step": 846 + }, + { + "clip_ratio": 0.0, + "completion_length": 364.09375, + "epoch": 1.1602739726027398, + "grad_norm": 0.9839144945144653, + "kl": 0.369140625, + "learning_rate": 6.132420091324201e-07, + "loss": 0.0004, + "reward": 1.21875, + "reward_std": 0.0578637570142746, + "rewards/accuracy_reward": 0.21875, + "rewards/format_reward": 1.0, + "step": 847 + }, + { + "clip_ratio": 0.0, + "completion_length": 416.1875, + "epoch": 1.1616438356164385, + "grad_norm": 0.01052401214838028, + "kl": 0.3525390625, + "learning_rate": 6.127853881278538e-07, + "loss": 0.0004, + "reward": 1.25, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 1.0, + "step": 848 + }, + { + "clip_ratio": 0.0, + "completion_length": 586.25, + "epoch": 1.163013698630137, + "grad_norm": 2.7697222232818604, + "kl": 0.35546875, + "learning_rate": 6.123287671232877e-07, + "loss": 0.0004, + "reward": 1.774999976158142, + "reward_std": 0.13462574779987335, + "rewards/accuracy_reward": 0.7749999463558197, + "rewards/format_reward": 1.0, + "step": 849 + }, + { + "clip_ratio": 0.0, + "completion_length": 524.90625, + "epoch": 1.1643835616438356, + "grad_norm": 3.054478406906128, + "kl": 0.35205078125, + "learning_rate": 6.118721461187214e-07, + "loss": 0.0004, + "reward": 1.5390625, + "reward_std": 0.2896413579583168, + "rewards/accuracy_reward": 0.5390625, + "rewards/format_reward": 1.0, + "step": 850 + }, + { + "clip_ratio": 0.0, + "completion_length": 540.125, + "epoch": 1.1657534246575343, + "grad_norm": 1.4414507150650024, + "kl": 0.3310546875, + "learning_rate": 6.114155251141551e-07, + "loss": 0.0003, + "reward": 1.4375, + "reward_std": 0.07312605157494545, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 851 + }, + { + "clip_ratio": 0.0, + "completion_length": 641.875, + "epoch": 1.167123287671233, + "grad_norm": 0.9515066146850586, + "kl": 0.35302734375, + "learning_rate": 6.109589041095891e-07, + "loss": 0.0004, + "reward": 1.421875, + "reward_std": 0.0646936446428299, + "rewards/accuracy_reward": 0.421875, + "rewards/format_reward": 1.0, + "step": 852 + }, + { + "clip_ratio": 0.0, + "completion_length": 430.6875, + "epoch": 1.1684931506849314, + "grad_norm": 2.1378560066223145, + "kl": 0.36083984375, + "learning_rate": 6.105022831050228e-07, + "loss": 0.0004, + "reward": 2.0, + "reward_std": 0.12179600074887276, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 853 + }, + { + "clip_ratio": 0.0, + "completion_length": 543.9375, + "epoch": 1.16986301369863, + "grad_norm": 1.0797525644302368, + "kl": 0.3515625, + "learning_rate": 6.100456621004567e-07, + "loss": 0.0004, + "reward": 1.421875, + "reward_std": 0.0646936446428299, + "rewards/accuracy_reward": 0.421875, + "rewards/format_reward": 1.0, + "step": 854 + }, + { + "clip_ratio": 0.0, + "completion_length": 308.71875, + "epoch": 1.1712328767123288, + "grad_norm": 2.498821258544922, + "kl": 0.341796875, + "learning_rate": 6.095890410958904e-07, + "loss": 0.0003, + "reward": 1.375, + "reward_std": 0.13363061845302582, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 1.0, + "step": 855 + }, + { + "clip_ratio": 0.0, + "completion_length": 443.53125, + "epoch": 1.1726027397260275, + "grad_norm": 1.0982065200805664, + "kl": 0.36279296875, + "learning_rate": 6.091324200913241e-07, + "loss": 0.0004, + "reward": 1.21875, + "reward_std": 0.033407654613256454, + "rewards/accuracy_reward": 0.21875, + "rewards/format_reward": 1.0, + "step": 856 + }, + { + "clip_ratio": 0.0, + "completion_length": 404.75, + "epoch": 1.1739726027397261, + "grad_norm": 6.744996547698975, + "kl": 0.3828125, + "learning_rate": 6.08675799086758e-07, + "loss": 0.0004, + "reward": 1.8333333730697632, + "reward_std": 0.13888481445610523, + "rewards/accuracy_reward": 0.8333333432674408, + "rewards/format_reward": 1.0, + "step": 857 + }, + { + "clip_ratio": 0.0, + "completion_length": 579.25, + "epoch": 1.1753424657534246, + "grad_norm": 3.830198049545288, + "kl": 0.36181640625, + "learning_rate": 6.082191780821918e-07, + "loss": 0.0004, + "reward": 1.4375, + "reward_std": 0.3230287954211235, + "rewards/accuracy_reward": 0.46875, + "rewards/format_reward": 0.96875, + "step": 858 + }, + { + "clip_ratio": 0.0, + "completion_length": 463.59375, + "epoch": 1.1767123287671233, + "grad_norm": 2.6295642852783203, + "kl": 0.35009765625, + "learning_rate": 6.077625570776255e-07, + "loss": 0.0003, + "reward": 1.1875, + "reward_std": 0.1157275140285492, + "rewards/accuracy_reward": 0.1875, + "rewards/format_reward": 1.0, + "step": 859 + }, + { + "clip_ratio": 0.0, + "completion_length": 428.25, + "epoch": 1.178082191780822, + "grad_norm": 3.0436506271362305, + "kl": 0.35498046875, + "learning_rate": 6.073059360730594e-07, + "loss": 0.0004, + "reward": 1.921875, + "reward_std": 0.1446593925356865, + "rewards/accuracy_reward": 0.921875, + "rewards/format_reward": 1.0, + "step": 860 + }, + { + "clip_ratio": 0.0, + "completion_length": 590.28125, + "epoch": 1.1794520547945206, + "grad_norm": 3.9150311946868896, + "kl": 0.39208984375, + "learning_rate": 6.068493150684931e-07, + "loss": 0.0004, + "reward": 1.484375, + "reward_std": 0.1366081517189741, + "rewards/accuracy_reward": 0.4843749701976776, + "rewards/format_reward": 1.0, + "step": 861 + }, + { + "clip_ratio": 0.0, + "completion_length": 577.53125, + "epoch": 1.180821917808219, + "grad_norm": 1.7237287759780884, + "kl": 0.33984375, + "learning_rate": 6.06392694063927e-07, + "loss": 0.0003, + "reward": 1.609375, + "reward_std": 0.39774755015969276, + "rewards/accuracy_reward": 0.671875, + "rewards/format_reward": 0.9375, + "step": 862 + }, + { + "clip_ratio": 0.0, + "completion_length": 558.21875, + "epoch": 1.1821917808219178, + "grad_norm": 4.21384334564209, + "kl": 0.33203125, + "learning_rate": 6.059360730593607e-07, + "loss": 0.0003, + "reward": 1.7738094925880432, + "reward_std": 0.27128167636692524, + "rewards/accuracy_reward": 0.8050595223903656, + "rewards/format_reward": 0.96875, + "step": 863 + }, + { + "clip_ratio": 0.0, + "completion_length": 703.3125, + "epoch": 1.1835616438356165, + "grad_norm": 1.8811252117156982, + "kl": 0.36279296875, + "learning_rate": 6.054794520547944e-07, + "loss": 0.0004, + "reward": 1.9895833134651184, + "reward_std": 0.1106601133942604, + "rewards/accuracy_reward": 0.9895833283662796, + "rewards/format_reward": 1.0, + "step": 864 + }, + { + "clip_ratio": 0.0, + "completion_length": 552.3125, + "epoch": 1.1849315068493151, + "grad_norm": 1.4166743755340576, + "kl": 0.357421875, + "learning_rate": 6.050228310502284e-07, + "loss": 0.0004, + "reward": 1.8359375, + "reward_std": 0.061278700828552246, + "rewards/accuracy_reward": 0.8359375, + "rewards/format_reward": 1.0, + "step": 865 + }, + { + "clip_ratio": 0.0, + "completion_length": 352.34375, + "epoch": 1.1863013698630138, + "grad_norm": 3.5078675746917725, + "kl": 0.37109375, + "learning_rate": 6.045662100456621e-07, + "loss": 0.0004, + "reward": 1.5, + "reward_std": 0.2644027303904295, + "rewards/accuracy_reward": 0.4999999850988388, + "rewards/format_reward": 1.0, + "step": 866 + }, + { + "clip_ratio": 0.0, + "completion_length": 494.53125, + "epoch": 1.1876712328767123, + "grad_norm": 2.0775046348571777, + "kl": 0.44140625, + "learning_rate": 6.041095890410958e-07, + "loss": 0.0004, + "reward": 1.4375, + "reward_std": 0.1462521031498909, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 1.0, + "step": 867 + }, + { + "clip_ratio": 0.0, + "completion_length": 617.40625, + "epoch": 1.189041095890411, + "grad_norm": 3.112804651260376, + "kl": 0.349609375, + "learning_rate": 6.036529680365297e-07, + "loss": 0.0003, + "reward": 1.7890625, + "reward_std": 0.3950253389775753, + "rewards/accuracy_reward": 0.8203125, + "rewards/format_reward": 0.96875, + "step": 868 + }, + { + "clip_ratio": 0.0, + "completion_length": 738.625, + "epoch": 1.1904109589041096, + "grad_norm": 0.8393311500549316, + "kl": 0.34765625, + "learning_rate": 6.031963470319634e-07, + "loss": 0.0003, + "reward": 1.46875, + "reward_std": 0.0883883461356163, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.96875, + "step": 869 + }, + { + "clip_ratio": 0.0, + "completion_length": 455.5, + "epoch": 1.191780821917808, + "grad_norm": 1.9282116889953613, + "kl": 0.3583984375, + "learning_rate": 6.027397260273972e-07, + "loss": 0.0004, + "reward": 1.1875, + "reward_std": 0.06681530922651291, + "rewards/accuracy_reward": 0.1875, + "rewards/format_reward": 1.0, + "step": 870 + }, + { + "clip_ratio": 0.0, + "completion_length": 578.90625, + "epoch": 1.1931506849315068, + "grad_norm": 1.3332931995391846, + "kl": 0.37451171875, + "learning_rate": 6.02283105022831e-07, + "loss": 0.0004, + "reward": 1.7109375, + "reward_std": 0.0765409953892231, + "rewards/accuracy_reward": 0.7109375, + "rewards/format_reward": 1.0, + "step": 871 + }, + { + "clip_ratio": 0.0, + "completion_length": 515.5625, + "epoch": 1.1945205479452055, + "grad_norm": 2.1982920169830322, + "kl": 0.36279296875, + "learning_rate": 6.018264840182648e-07, + "loss": 0.0004, + "reward": 1.2083333134651184, + "reward_std": 0.031497031450271606, + "rewards/accuracy_reward": 0.2083333432674408, + "rewards/format_reward": 1.0, + "step": 872 + }, + { + "clip_ratio": 0.0, + "completion_length": 619.4375, + "epoch": 1.1958904109589041, + "grad_norm": 1.4797850847244263, + "kl": 0.37353515625, + "learning_rate": 6.013698630136987e-07, + "loss": 0.0004, + "reward": 1.3776041567325592, + "reward_std": 0.10277634114027023, + "rewards/accuracy_reward": 0.3776041567325592, + "rewards/format_reward": 1.0, + "step": 873 + }, + { + "clip_ratio": 0.0, + "completion_length": 551.28125, + "epoch": 1.1972602739726028, + "grad_norm": 1.9213581085205078, + "kl": 0.37158203125, + "learning_rate": 6.009132420091324e-07, + "loss": 0.0004, + "reward": 1.5833333432674408, + "reward_std": 0.09757299907505512, + "rewards/accuracy_reward": 0.5833333283662796, + "rewards/format_reward": 1.0, + "step": 874 + }, + { + "clip_ratio": 0.0, + "completion_length": 425.0625, + "epoch": 1.1986301369863013, + "grad_norm": 3.607898712158203, + "kl": 0.4130859375, + "learning_rate": 6.004566210045661e-07, + "loss": 0.0004, + "reward": 1.7734375, + "reward_std": 0.2514927387237549, + "rewards/accuracy_reward": 0.7734375, + "rewards/format_reward": 1.0, + "step": 875 + }, + { + "clip_ratio": 0.0, + "completion_length": 402.34375, + "epoch": 1.2, + "grad_norm": 5.052751541137695, + "kl": 0.46923828125, + "learning_rate": 6e-07, + "loss": 0.0005, + "reward": 1.546875, + "reward_std": 0.15992168709635735, + "rewards/accuracy_reward": 0.546875, + "rewards/format_reward": 1.0, + "step": 876 + }, + { + "clip_ratio": 0.0, + "completion_length": 585.96875, + "epoch": 1.2013698630136986, + "grad_norm": 4.230776309967041, + "kl": 0.46435546875, + "learning_rate": 5.995433789954337e-07, + "loss": 0.0005, + "reward": 1.8444940447807312, + "reward_std": 0.1678207330405712, + "rewards/accuracy_reward": 0.8444940149784088, + "rewards/format_reward": 1.0, + "step": 877 + }, + { + "clip_ratio": 0.0, + "completion_length": 316.78125, + "epoch": 1.2027397260273973, + "grad_norm": 8.833257675170898, + "kl": 0.41357421875, + "learning_rate": 5.990867579908675e-07, + "loss": 0.0004, + "reward": 1.5, + "reward_std": 0.1462521031498909, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 878 + }, + { + "clip_ratio": 0.0, + "completion_length": 437.96875, + "epoch": 1.2041095890410958, + "grad_norm": 6.765456199645996, + "kl": 0.45166015625, + "learning_rate": 5.986301369863014e-07, + "loss": 0.0005, + "reward": 1.8531250357627869, + "reward_std": 0.34787509217858315, + "rewards/accuracy_reward": 0.8843750059604645, + "rewards/format_reward": 0.96875, + "step": 879 + }, + { + "clip_ratio": 0.0, + "completion_length": 524.6875, + "epoch": 1.2054794520547945, + "grad_norm": 1.5083391666412354, + "kl": 0.39013671875, + "learning_rate": 5.981735159817351e-07, + "loss": 0.0004, + "reward": 1.5703125, + "reward_std": 0.0657544769346714, + "rewards/accuracy_reward": 0.5703125, + "rewards/format_reward": 1.0, + "step": 880 + }, + { + "clip_ratio": 0.0, + "completion_length": 284.84375, + "epoch": 1.2068493150684931, + "grad_norm": 0.8882075548171997, + "kl": 0.4072265625, + "learning_rate": 5.97716894977169e-07, + "loss": 0.0004, + "reward": 1.453125, + "reward_std": 0.0867956355214119, + "rewards/accuracy_reward": 0.453125, + "rewards/format_reward": 1.0, + "step": 881 + }, + { + "clip_ratio": 0.0, + "completion_length": 305.40625, + "epoch": 1.2082191780821918, + "grad_norm": 1.7317863702774048, + "kl": 0.427734375, + "learning_rate": 5.972602739726027e-07, + "loss": 0.0004, + "reward": 1.65625, + "reward_std": 0.0883883461356163, + "rewards/accuracy_reward": 0.65625, + "rewards/format_reward": 1.0, + "step": 882 + }, + { + "clip_ratio": 0.0, + "completion_length": 560.3125, + "epoch": 1.2095890410958905, + "grad_norm": 11.200748443603516, + "kl": 0.46435546875, + "learning_rate": 5.968036529680364e-07, + "loss": 0.0005, + "reward": 1.5625, + "reward_std": 0.1157275140285492, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 883 + }, + { + "clip_ratio": 0.0, + "completion_length": 425.90625, + "epoch": 1.210958904109589, + "grad_norm": 5.06436014175415, + "kl": 0.44775390625, + "learning_rate": 5.963470319634703e-07, + "loss": 0.0004, + "reward": 1.625, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 884 + }, + { + "clip_ratio": 0.0, + "completion_length": 322.125, + "epoch": 1.2123287671232876, + "grad_norm": 3.117278575897217, + "kl": 0.39013671875, + "learning_rate": 5.958904109589041e-07, + "loss": 0.0004, + "reward": 1.15625, + "reward_std": 0.3061639815568924, + "rewards/accuracy_reward": 0.15625, + "rewards/format_reward": 1.0, + "step": 885 + }, + { + "clip_ratio": 0.0, + "completion_length": 679.0625, + "epoch": 1.2136986301369863, + "grad_norm": 3.001260757446289, + "kl": 0.390625, + "learning_rate": 5.95433789954338e-07, + "loss": 0.0004, + "reward": 2.359375, + "reward_std": 0.0289318785071373, + "rewards/accuracy_reward": 1.359375, + "rewards/format_reward": 1.0, + "step": 886 + }, + { + "clip_ratio": 0.0, + "completion_length": 422.78125, + "epoch": 1.215068493150685, + "grad_norm": 4.375697135925293, + "kl": 0.39306640625, + "learning_rate": 5.949771689497717e-07, + "loss": 0.0004, + "reward": 1.5625, + "reward_std": 0.2177756354212761, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 887 + }, + { + "clip_ratio": 0.0, + "completion_length": 574.71875, + "epoch": 1.2164383561643834, + "grad_norm": 1.2586559057235718, + "kl": 0.3837890625, + "learning_rate": 5.945205479452054e-07, + "loss": 0.0004, + "reward": 1.421875, + "reward_std": 0.0646936446428299, + "rewards/accuracy_reward": 0.421875, + "rewards/format_reward": 1.0, + "step": 888 + }, + { + "clip_ratio": 0.0, + "completion_length": 536.21875, + "epoch": 1.2178082191780821, + "grad_norm": 0.010667498223483562, + "kl": 0.3828125, + "learning_rate": 5.940639269406393e-07, + "loss": 0.0004, + "reward": 1.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 889 + }, + { + "clip_ratio": 0.0, + "completion_length": 631.0625, + "epoch": 1.2191780821917808, + "grad_norm": 4.928837299346924, + "kl": 0.37646484375, + "learning_rate": 5.93607305936073e-07, + "loss": 0.0004, + "reward": 2.0390625, + "reward_std": 0.061278700828552246, + "rewards/accuracy_reward": 1.0390625, + "rewards/format_reward": 1.0, + "step": 890 + }, + { + "clip_ratio": 0.0, + "completion_length": 567.6875, + "epoch": 1.2205479452054795, + "grad_norm": 3.472158193588257, + "kl": 0.39794921875, + "learning_rate": 5.931506849315067e-07, + "loss": 0.0004, + "reward": 1.6875, + "reward_std": 0.2177756354212761, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 1.0, + "step": 891 + }, + { + "clip_ratio": 0.0, + "completion_length": 517.375, + "epoch": 1.2219178082191782, + "grad_norm": 1.6116597652435303, + "kl": 0.41357421875, + "learning_rate": 5.926940639269407e-07, + "loss": 0.0004, + "reward": 1.109375, + "reward_std": 0.04419417306780815, + "rewards/accuracy_reward": 0.109375, + "rewards/format_reward": 1.0, + "step": 892 + }, + { + "clip_ratio": 0.0, + "completion_length": 548.46875, + "epoch": 1.2232876712328766, + "grad_norm": 2.2251503467559814, + "kl": 0.39697265625, + "learning_rate": 5.922374429223744e-07, + "loss": 0.0004, + "reward": 1.8541666865348816, + "reward_std": 0.32778636924922466, + "rewards/accuracy_reward": 0.8854166567325592, + "rewards/format_reward": 0.96875, + "step": 893 + }, + { + "clip_ratio": 0.0, + "completion_length": 571.15625, + "epoch": 1.2246575342465753, + "grad_norm": 2.8448946475982666, + "kl": 0.3955078125, + "learning_rate": 5.917808219178083e-07, + "loss": 0.0004, + "reward": 1.375, + "reward_std": 0.1552036553621292, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 1.0, + "step": 894 + }, + { + "clip_ratio": 0.0, + "completion_length": 538.8125, + "epoch": 1.226027397260274, + "grad_norm": 2.6070163249969482, + "kl": 0.400390625, + "learning_rate": 5.91324200913242e-07, + "loss": 0.0004, + "reward": 1.734375, + "reward_std": 0.04419417306780815, + "rewards/accuracy_reward": 0.734375, + "rewards/format_reward": 1.0, + "step": 895 + }, + { + "clip_ratio": 0.0, + "completion_length": 655.21875, + "epoch": 1.2273972602739727, + "grad_norm": 1.7733893394470215, + "kl": 0.38232421875, + "learning_rate": 5.908675799086757e-07, + "loss": 0.0004, + "reward": 1.55859375, + "reward_std": 0.07789094373583794, + "rewards/accuracy_reward": 0.55859375, + "rewards/format_reward": 1.0, + "step": 896 + }, + { + "clip_ratio": 0.0, + "completion_length": 431.40625, + "epoch": 1.2287671232876711, + "grad_norm": 4.120666027069092, + "kl": 0.40087890625, + "learning_rate": 5.904109589041096e-07, + "loss": 0.0004, + "reward": 1.6651785373687744, + "reward_std": 0.17251565493643284, + "rewards/accuracy_reward": 0.6651785373687744, + "rewards/format_reward": 1.0, + "step": 897 + }, + { + "clip_ratio": 0.0, + "completion_length": 547.0625, + "epoch": 1.2301369863013698, + "grad_norm": 2.522732973098755, + "kl": 0.38720703125, + "learning_rate": 5.899543378995433e-07, + "loss": 0.0004, + "reward": 1.6119791567325592, + "reward_std": 0.25090846233069897, + "rewards/accuracy_reward": 0.6119791716337204, + "rewards/format_reward": 1.0, + "step": 898 + }, + { + "clip_ratio": 0.0, + "completion_length": 572.9375, + "epoch": 1.2315068493150685, + "grad_norm": 2.1330246925354004, + "kl": 0.390625, + "learning_rate": 5.894977168949771e-07, + "loss": 0.0004, + "reward": 1.25, + "reward_std": 0.2925042062997818, + "rewards/accuracy_reward": 0.28125, + "rewards/format_reward": 0.96875, + "step": 899 + }, + { + "clip_ratio": 0.0, + "completion_length": 482.0625, + "epoch": 1.2328767123287672, + "grad_norm": 6.462682247161865, + "kl": 0.38671875, + "learning_rate": 5.89041095890411e-07, + "loss": 0.0004, + "reward": 1.4609375, + "reward_std": 0.1662898138165474, + "rewards/accuracy_reward": 0.4609375, + "rewards/format_reward": 1.0, + "step": 900 + }, + { + "clip_ratio": 0.0, + "completion_length": 550.21875, + "epoch": 1.2342465753424658, + "grad_norm": 2.9799962043762207, + "kl": 0.37939453125, + "learning_rate": 5.885844748858447e-07, + "loss": 0.0004, + "reward": 2.119791656732559, + "reward_std": 0.014731401577591896, + "rewards/accuracy_reward": 1.1197916567325592, + "rewards/format_reward": 1.0, + "step": 901 + }, + { + "clip_ratio": 0.0, + "completion_length": 422.90625, + "epoch": 1.2356164383561643, + "grad_norm": 3.0818071365356445, + "kl": 0.4287109375, + "learning_rate": 5.881278538812785e-07, + "loss": 0.0004, + "reward": 1.546875, + "reward_std": 0.22733328863978386, + "rewards/accuracy_reward": 0.5468750149011612, + "rewards/format_reward": 1.0, + "step": 902 + }, + { + "clip_ratio": 0.0, + "completion_length": 457.28125, + "epoch": 1.236986301369863, + "grad_norm": 0.7816328406333923, + "kl": 0.41259765625, + "learning_rate": 5.876712328767123e-07, + "loss": 0.0004, + "reward": 1.609375, + "reward_std": 0.17278572916984558, + "rewards/accuracy_reward": 0.640625, + "rewards/format_reward": 0.96875, + "step": 903 + }, + { + "clip_ratio": 0.0, + "completion_length": 578.96875, + "epoch": 1.2383561643835617, + "grad_norm": 2.238757848739624, + "kl": 0.37060546875, + "learning_rate": 5.87214611872146e-07, + "loss": 0.0004, + "reward": 1.8671875, + "reward_std": 0.27381716668605804, + "rewards/accuracy_reward": 0.8984375, + "rewards/format_reward": 0.96875, + "step": 904 + }, + { + "clip_ratio": 0.0, + "completion_length": 568.5625, + "epoch": 1.2397260273972603, + "grad_norm": 2.3652467727661133, + "kl": 0.38232421875, + "learning_rate": 5.8675799086758e-07, + "loss": 0.0004, + "reward": 1.5364583134651184, + "reward_std": 0.13729207031428814, + "rewards/accuracy_reward": 0.5364583283662796, + "rewards/format_reward": 1.0, + "step": 905 + }, + { + "clip_ratio": 0.0, + "completion_length": 601.53125, + "epoch": 1.2410958904109588, + "grad_norm": 0.013698413036763668, + "kl": 0.3779296875, + "learning_rate": 5.863013698630137e-07, + "loss": 0.0004, + "reward": 1.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 906 + }, + { + "clip_ratio": 0.0, + "completion_length": 468.875, + "epoch": 1.2424657534246575, + "grad_norm": 2.4582483768463135, + "kl": 0.35595703125, + "learning_rate": 5.858447488584474e-07, + "loss": 0.0004, + "reward": 1.484375, + "reward_std": 0.04419417306780815, + "rewards/accuracy_reward": 0.484375, + "rewards/format_reward": 1.0, + "step": 907 + }, + { + "clip_ratio": 0.0, + "completion_length": 414.53125, + "epoch": 1.2438356164383562, + "grad_norm": 1.652496576309204, + "kl": 0.4560546875, + "learning_rate": 5.853881278538813e-07, + "loss": 0.0005, + "reward": 1.03125, + "reward_std": 0.0883883461356163, + "rewards/accuracy_reward": 0.03125, + "rewards/format_reward": 1.0, + "step": 908 + }, + { + "clip_ratio": 0.0, + "completion_length": 557.53125, + "epoch": 1.2452054794520548, + "grad_norm": 1.0350927114486694, + "kl": 0.3720703125, + "learning_rate": 5.84931506849315e-07, + "loss": 0.0004, + "reward": 1.671875, + "reward_std": 0.0646936446428299, + "rewards/accuracy_reward": 0.671875, + "rewards/format_reward": 1.0, + "step": 909 + }, + { + "clip_ratio": 0.0, + "completion_length": 528.375, + "epoch": 1.2465753424657535, + "grad_norm": 2.5157017707824707, + "kl": 0.40625, + "learning_rate": 5.844748858447488e-07, + "loss": 0.0004, + "reward": 1.4947916567325592, + "reward_std": 0.1395920068025589, + "rewards/accuracy_reward": 0.4947916567325592, + "rewards/format_reward": 1.0, + "step": 910 + }, + { + "clip_ratio": 0.0, + "completion_length": 358.375, + "epoch": 1.247945205479452, + "grad_norm": 1.965340256690979, + "kl": 0.37060546875, + "learning_rate": 5.840182648401826e-07, + "loss": 0.0004, + "reward": 1.203125, + "reward_std": 0.1530819907784462, + "rewards/accuracy_reward": 0.203125, + "rewards/format_reward": 1.0, + "step": 911 + }, + { + "clip_ratio": 0.0, + "completion_length": 463.15625, + "epoch": 1.2493150684931507, + "grad_norm": 1.2656036615371704, + "kl": 0.39697265625, + "learning_rate": 5.835616438356164e-07, + "loss": 0.0004, + "reward": 1.359375, + "reward_std": 0.12255740165710449, + "rewards/accuracy_reward": 0.359375, + "rewards/format_reward": 1.0, + "step": 912 + }, + { + "clip_ratio": 0.0, + "completion_length": 378.375, + "epoch": 1.2506849315068493, + "grad_norm": 2.7013890743255615, + "kl": 0.37451171875, + "learning_rate": 5.831050228310503e-07, + "loss": 0.0004, + "reward": 1.734375, + "reward_std": 0.22097086533904076, + "rewards/accuracy_reward": 0.734375, + "rewards/format_reward": 1.0, + "step": 913 + }, + { + "clip_ratio": 0.0, + "completion_length": 610.34375, + "epoch": 1.252054794520548, + "grad_norm": 2.896122932434082, + "kl": 0.3740234375, + "learning_rate": 5.82648401826484e-07, + "loss": 0.0004, + "reward": 2.0625, + "reward_std": 0.26197961531579494, + "rewards/accuracy_reward": 1.0625, + "rewards/format_reward": 1.0, + "step": 914 + }, + { + "clip_ratio": 0.0, + "completion_length": 382.09375, + "epoch": 1.2534246575342465, + "grad_norm": 3.789921998977661, + "kl": 0.3935546875, + "learning_rate": 5.821917808219177e-07, + "loss": 0.0004, + "reward": 1.328125, + "reward_std": 0.28930897638201714, + "rewards/accuracy_reward": 0.328125, + "rewards/format_reward": 1.0, + "step": 915 + }, + { + "clip_ratio": 0.0, + "completion_length": 542.21875, + "epoch": 1.2547945205479452, + "grad_norm": 2.50876522064209, + "kl": 0.408203125, + "learning_rate": 5.817351598173516e-07, + "loss": 0.0004, + "reward": 1.671875, + "reward_std": 0.1583191677927971, + "rewards/accuracy_reward": 0.671875, + "rewards/format_reward": 1.0, + "step": 916 + }, + { + "clip_ratio": 0.0, + "completion_length": 632.125, + "epoch": 1.2561643835616438, + "grad_norm": 2.9093079566955566, + "kl": 0.380859375, + "learning_rate": 5.812785388127853e-07, + "loss": 0.0004, + "reward": 2.0078125, + "reward_std": 0.163336630910635, + "rewards/accuracy_reward": 1.0078125, + "rewards/format_reward": 1.0, + "step": 917 + }, + { + "clip_ratio": 0.0, + "completion_length": 578.65625, + "epoch": 1.2575342465753425, + "grad_norm": 1.01085364818573, + "kl": 0.37353515625, + "learning_rate": 5.808219178082191e-07, + "loss": 0.0004, + "reward": 1.484375, + "reward_std": 0.021564556285738945, + "rewards/accuracy_reward": 0.484375, + "rewards/format_reward": 1.0, + "step": 918 + }, + { + "clip_ratio": 0.0, + "completion_length": 572.0625, + "epoch": 1.2589041095890412, + "grad_norm": 1.2397353649139404, + "kl": 0.36962890625, + "learning_rate": 5.80365296803653e-07, + "loss": 0.0004, + "reward": 1.1875, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.21875, + "rewards/format_reward": 0.96875, + "step": 919 + }, + { + "clip_ratio": 0.0, + "completion_length": 478.5, + "epoch": 1.2602739726027397, + "grad_norm": 1.3165117502212524, + "kl": 0.3916015625, + "learning_rate": 5.799086757990867e-07, + "loss": 0.0004, + "reward": 1.5625, + "reward_std": 0.1157275140285492, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 920 + }, + { + "clip_ratio": 0.0, + "completion_length": 682.6875, + "epoch": 1.2616438356164383, + "grad_norm": 2.271702289581299, + "kl": 0.3701171875, + "learning_rate": 5.794520547945206e-07, + "loss": 0.0004, + "reward": 1.77734375, + "reward_std": 0.32882310450077057, + "rewards/accuracy_reward": 0.80859375, + "rewards/format_reward": 0.96875, + "step": 921 + }, + { + "clip_ratio": 0.0, + "completion_length": 375.09375, + "epoch": 1.263013698630137, + "grad_norm": 0.017480703070759773, + "kl": 0.3984375, + "learning_rate": 5.789954337899543e-07, + "loss": 0.0004, + "reward": 1.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 922 + }, + { + "clip_ratio": 0.0, + "completion_length": 504.65625, + "epoch": 1.2643835616438357, + "grad_norm": 1.695114016532898, + "kl": 0.4033203125, + "learning_rate": 5.78538812785388e-07, + "loss": 0.0004, + "reward": 1.5234375, + "reward_std": 0.11048543266952038, + "rewards/accuracy_reward": 0.5234375, + "rewards/format_reward": 1.0, + "step": 923 + }, + { + "clip_ratio": 0.0, + "completion_length": 596.90625, + "epoch": 1.2657534246575342, + "grad_norm": 1.5401252508163452, + "kl": 0.353515625, + "learning_rate": 5.780821917808219e-07, + "loss": 0.0004, + "reward": 1.7708333134651184, + "reward_std": 0.05891544930636883, + "rewards/accuracy_reward": 0.7708333283662796, + "rewards/format_reward": 1.0, + "step": 924 + }, + { + "clip_ratio": 0.0, + "completion_length": 469.78125, + "epoch": 1.2671232876712328, + "grad_norm": 3.2033495903015137, + "kl": 0.39013671875, + "learning_rate": 5.776255707762557e-07, + "loss": 0.0004, + "reward": 1.55859375, + "reward_std": 0.27621358167380095, + "rewards/accuracy_reward": 0.58984375, + "rewards/format_reward": 0.96875, + "step": 925 + }, + { + "clip_ratio": 0.0, + "completion_length": 585.28125, + "epoch": 1.2684931506849315, + "grad_norm": 3.77433705329895, + "kl": 0.4111328125, + "learning_rate": 5.771689497716896e-07, + "loss": 0.0004, + "reward": 1.3177083134651184, + "reward_std": 0.4321344643831253, + "rewards/accuracy_reward": 0.3802083283662796, + "rewards/format_reward": 0.9375, + "step": 926 + }, + { + "clip_ratio": 0.0, + "completion_length": 591.78125, + "epoch": 1.2698630136986302, + "grad_norm": 4.000980854034424, + "kl": 0.3642578125, + "learning_rate": 5.767123287671233e-07, + "loss": 0.0004, + "reward": 1.515625, + "reward_std": 0.0828370526432991, + "rewards/accuracy_reward": 0.5156249701976776, + "rewards/format_reward": 1.0, + "step": 927 + }, + { + "clip_ratio": 0.0, + "completion_length": 379.28125, + "epoch": 1.2712328767123289, + "grad_norm": 2.266773223876953, + "kl": 0.37060546875, + "learning_rate": 5.76255707762557e-07, + "loss": 0.0004, + "reward": 1.53125, + "reward_std": 0.2041158601641655, + "rewards/accuracy_reward": 0.53125, + "rewards/format_reward": 1.0, + "step": 928 + }, + { + "clip_ratio": 0.0, + "completion_length": 498.125, + "epoch": 1.2726027397260273, + "grad_norm": 5.215886116027832, + "kl": 0.4228515625, + "learning_rate": 5.757990867579909e-07, + "loss": 0.0004, + "reward": 1.6822916269302368, + "reward_std": 0.24831003323197365, + "rewards/accuracy_reward": 0.7135416269302368, + "rewards/format_reward": 0.96875, + "step": 929 + }, + { + "clip_ratio": 0.0, + "completion_length": 477.78125, + "epoch": 1.273972602739726, + "grad_norm": 1.8586878776550293, + "kl": 0.44384765625, + "learning_rate": 5.753424657534246e-07, + "loss": 0.0004, + "reward": 1.390625, + "reward_std": 0.2302001230418682, + "rewards/accuracy_reward": 0.421875, + "rewards/format_reward": 0.96875, + "step": 930 + }, + { + "clip_ratio": 0.0, + "completion_length": 773.1875, + "epoch": 1.2753424657534247, + "grad_norm": 1.8589192628860474, + "kl": 0.36376953125, + "learning_rate": 5.748858447488583e-07, + "loss": 0.0004, + "reward": 2.052604168653488, + "reward_std": 0.25580091029405594, + "rewards/accuracy_reward": 1.0838541686534882, + "rewards/format_reward": 0.96875, + "step": 931 + }, + { + "clip_ratio": 0.0, + "completion_length": 592.84375, + "epoch": 1.2767123287671232, + "grad_norm": 1.5999672412872314, + "kl": 1.4580078125, + "learning_rate": 5.744292237442923e-07, + "loss": 0.0015, + "reward": 1.46875, + "reward_std": 0.033407654613256454, + "rewards/accuracy_reward": 0.46875, + "rewards/format_reward": 1.0, + "step": 932 + }, + { + "clip_ratio": 0.0, + "completion_length": 668.84375, + "epoch": 1.2780821917808218, + "grad_norm": 1.7803735733032227, + "kl": 0.39013671875, + "learning_rate": 5.73972602739726e-07, + "loss": 0.0004, + "reward": 2.435156285762787, + "reward_std": 0.16129423771053553, + "rewards/accuracy_reward": 1.4351562857627869, + "rewards/format_reward": 1.0, + "step": 933 + }, + { + "clip_ratio": 0.0, + "completion_length": 625.46875, + "epoch": 1.2794520547945205, + "grad_norm": 3.8296427726745605, + "kl": 0.39404296875, + "learning_rate": 5.735159817351598e-07, + "loss": 0.0004, + "reward": 1.65625, + "reward_std": 0.22201896272599697, + "rewards/accuracy_reward": 0.65625, + "rewards/format_reward": 1.0, + "step": 934 + }, + { + "clip_ratio": 0.0, + "completion_length": 509.03125, + "epoch": 1.2808219178082192, + "grad_norm": 0.021630197763442993, + "kl": 0.4111328125, + "learning_rate": 5.730593607305936e-07, + "loss": 0.0004, + "reward": 1.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 1.0, + "step": 935 + }, + { + "clip_ratio": 0.0, + "completion_length": 511.90625, + "epoch": 1.2821917808219179, + "grad_norm": 1.9150142669677734, + "kl": 0.42236328125, + "learning_rate": 5.726027397260273e-07, + "loss": 0.0004, + "reward": 1.4791666269302368, + "reward_std": 0.3344755917787552, + "rewards/accuracy_reward": 0.5104166716337204, + "rewards/format_reward": 0.96875, + "step": 936 + }, + { + "clip_ratio": 0.0, + "completion_length": 778.40625, + "epoch": 1.2835616438356166, + "grad_norm": 1.2064579725265503, + "kl": 0.3486328125, + "learning_rate": 5.721461187214612e-07, + "loss": 0.0003, + "reward": 1.671875, + "reward_std": 0.22097086533904076, + "rewards/accuracy_reward": 0.703125, + "rewards/format_reward": 0.96875, + "step": 937 + }, + { + "clip_ratio": 0.0, + "completion_length": 283.5625, + "epoch": 1.284931506849315, + "grad_norm": 2.410706043243408, + "kl": 0.44140625, + "learning_rate": 5.716894977168949e-07, + "loss": 0.0004, + "reward": 1.0625, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.0625, + "rewards/format_reward": 1.0, + "step": 938 + }, + { + "clip_ratio": 0.0, + "completion_length": 592.0, + "epoch": 1.2863013698630137, + "grad_norm": 2.2251358032226562, + "kl": 0.3798828125, + "learning_rate": 5.712328767123287e-07, + "loss": 0.0004, + "reward": 1.734375, + "reward_std": 0.24511976540088654, + "rewards/accuracy_reward": 0.734375, + "rewards/format_reward": 1.0, + "step": 939 + }, + { + "clip_ratio": 0.0, + "completion_length": 461.78125, + "epoch": 1.2876712328767124, + "grad_norm": 2.1631131172180176, + "kl": 0.4345703125, + "learning_rate": 5.707762557077626e-07, + "loss": 0.0004, + "reward": 1.3177083134651184, + "reward_std": 0.3885781615972519, + "rewards/accuracy_reward": 0.3489583134651184, + "rewards/format_reward": 0.96875, + "step": 940 + }, + { + "clip_ratio": 0.0, + "completion_length": 517.125, + "epoch": 1.2890410958904108, + "grad_norm": 2.293469190597534, + "kl": 0.51708984375, + "learning_rate": 5.703196347031963e-07, + "loss": 0.0005, + "reward": 1.421875, + "reward_std": 0.1446593925356865, + "rewards/accuracy_reward": 0.421875, + "rewards/format_reward": 1.0, + "step": 941 + }, + { + "clip_ratio": 0.0, + "completion_length": 559.53125, + "epoch": 1.2904109589041095, + "grad_norm": 0.01647823676466942, + "kl": 0.3896484375, + "learning_rate": 5.698630136986301e-07, + "loss": 0.0004, + "reward": 2.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.5, + "rewards/format_reward": 1.0, + "step": 942 + }, + { + "clip_ratio": 0.0, + "completion_length": 623.71875, + "epoch": 1.2917808219178082, + "grad_norm": 2.266577959060669, + "kl": 0.80126953125, + "learning_rate": 5.694063926940639e-07, + "loss": 0.0008, + "reward": 1.5963541865348816, + "reward_std": 0.2827804908156395, + "rewards/accuracy_reward": 0.6276041865348816, + "rewards/format_reward": 0.96875, + "step": 943 + }, + { + "clip_ratio": 0.0, + "completion_length": 576.25, + "epoch": 1.2931506849315069, + "grad_norm": 3.0220417976379395, + "kl": 0.40234375, + "learning_rate": 5.689497716894976e-07, + "loss": 0.0004, + "reward": 1.515625, + "reward_std": 0.16828217171132565, + "rewards/accuracy_reward": 0.515625, + "rewards/format_reward": 1.0, + "step": 944 + }, + { + "clip_ratio": 0.0, + "completion_length": 693.875, + "epoch": 1.2945205479452055, + "grad_norm": 0.9308492541313171, + "kl": 0.3837890625, + "learning_rate": 5.684931506849316e-07, + "loss": 0.0004, + "reward": 1.6796875, + "reward_std": 0.09704047441482544, + "rewards/accuracy_reward": 0.6796875, + "rewards/format_reward": 1.0, + "step": 945 + }, + { + "clip_ratio": 0.0, + "completion_length": 581.5, + "epoch": 1.2958904109589042, + "grad_norm": 1.6099046468734741, + "kl": 0.5966796875, + "learning_rate": 5.680365296803653e-07, + "loss": 0.0006, + "reward": 1.3203125, + "reward_std": 0.10436524450778961, + "rewards/accuracy_reward": 0.3203125, + "rewards/format_reward": 1.0, + "step": 946 + }, + { + "clip_ratio": 0.0, + "completion_length": 635.0625, + "epoch": 1.2972602739726027, + "grad_norm": 2.74033784866333, + "kl": 0.3798828125, + "learning_rate": 5.67579908675799e-07, + "loss": 0.0004, + "reward": 1.8953125476837158, + "reward_std": 0.3537486009299755, + "rewards/accuracy_reward": 0.9265625178813934, + "rewards/format_reward": 0.96875, + "step": 947 + }, + { + "clip_ratio": 0.0, + "completion_length": 654.90625, + "epoch": 1.2986301369863014, + "grad_norm": 1.12486732006073, + "kl": 0.41796875, + "learning_rate": 5.671232876712329e-07, + "loss": 0.0004, + "reward": 1.5078125, + "reward_std": 0.306659497320652, + "rewards/accuracy_reward": 0.5703125, + "rewards/format_reward": 0.9375, + "step": 948 + }, + { + "clip_ratio": 0.0, + "completion_length": 574.1875, + "epoch": 1.3, + "grad_norm": 0.8883933424949646, + "kl": 0.41259765625, + "learning_rate": 5.666666666666666e-07, + "loss": 0.0004, + "reward": 1.734375, + "reward_std": 0.04419417306780815, + "rewards/accuracy_reward": 0.734375, + "rewards/format_reward": 1.0, + "step": 949 + }, + { + "clip_ratio": 0.0, + "completion_length": 503.03125, + "epoch": 1.3013698630136985, + "grad_norm": 3.347505569458008, + "kl": 0.39111328125, + "learning_rate": 5.662100456621004e-07, + "loss": 0.0004, + "reward": 1.515625, + "reward_std": 0.1304589118808508, + "rewards/accuracy_reward": 0.515625, + "rewards/format_reward": 1.0, + "step": 950 + }, + { + "clip_ratio": 0.0, + "completion_length": 493.15625, + "epoch": 1.3027397260273972, + "grad_norm": 2.3713202476501465, + "kl": 0.46923828125, + "learning_rate": 5.657534246575342e-07, + "loss": 0.0005, + "reward": 1.2864583730697632, + "reward_std": 0.13258253410458565, + "rewards/accuracy_reward": 0.3177083432674408, + "rewards/format_reward": 0.96875, + "step": 951 + }, + { + "clip_ratio": 0.0, + "completion_length": 650.3125, + "epoch": 1.3041095890410959, + "grad_norm": 3.497941017150879, + "kl": 0.37890625, + "learning_rate": 5.65296803652968e-07, + "loss": 0.0004, + "reward": 1.0625, + "reward_std": 0.1157275140285492, + "rewards/accuracy_reward": 0.0625, + "rewards/format_reward": 1.0, + "step": 952 + }, + { + "clip_ratio": 0.0, + "completion_length": 620.25, + "epoch": 1.3054794520547945, + "grad_norm": 2.711280107498169, + "kl": 0.38818359375, + "learning_rate": 5.648401826484019e-07, + "loss": 0.0004, + "reward": 1.70703125, + "reward_std": 0.19521116837859154, + "rewards/accuracy_reward": 0.70703125, + "rewards/format_reward": 1.0, + "step": 953 + }, + { + "clip_ratio": 0.0, + "completion_length": 506.21875, + "epoch": 1.3068493150684932, + "grad_norm": 2.053891181945801, + "kl": 0.4052734375, + "learning_rate": 5.643835616438356e-07, + "loss": 0.0004, + "reward": 1.5625, + "reward_std": 0.249358132481575, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 954 + }, + { + "clip_ratio": 0.0, + "completion_length": 473.5625, + "epoch": 1.308219178082192, + "grad_norm": 6.742806911468506, + "kl": 0.4111328125, + "learning_rate": 5.639269406392693e-07, + "loss": 0.0004, + "reward": 1.3177083134651184, + "reward_std": 0.16407955065369606, + "rewards/accuracy_reward": 0.3489583283662796, + "rewards/format_reward": 0.96875, + "step": 955 + }, + { + "clip_ratio": 0.0, + "completion_length": 397.0, + "epoch": 1.3095890410958904, + "grad_norm": 3.2806613445281982, + "kl": 0.6689453125, + "learning_rate": 5.634703196347032e-07, + "loss": 0.0007, + "reward": 1.34375, + "reward_std": 0.482940673828125, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 0.96875, + "step": 956 + }, + { + "clip_ratio": 0.0, + "completion_length": 721.625, + "epoch": 1.310958904109589, + "grad_norm": 2.4216856956481934, + "kl": 0.380859375, + "learning_rate": 5.630136986301369e-07, + "loss": 0.0004, + "reward": 1.8515625, + "reward_std": 0.1936504878103733, + "rewards/accuracy_reward": 0.8515625, + "rewards/format_reward": 1.0, + "step": 957 + }, + { + "clip_ratio": 0.0, + "completion_length": 626.96875, + "epoch": 1.3123287671232877, + "grad_norm": 2.3214023113250732, + "kl": 0.375, + "learning_rate": 5.625570776255707e-07, + "loss": 0.0004, + "reward": 1.46875, + "reward_std": 0.2884480655193329, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.96875, + "step": 958 + }, + { + "clip_ratio": 0.0, + "completion_length": 606.1875, + "epoch": 1.3136986301369862, + "grad_norm": 0.8679585456848145, + "kl": 0.41845703125, + "learning_rate": 5.621004566210046e-07, + "loss": 0.0004, + "reward": 1.21875, + "reward_std": 0.0578637570142746, + "rewards/accuracy_reward": 0.21875, + "rewards/format_reward": 1.0, + "step": 959 + }, + { + "clip_ratio": 0.0, + "completion_length": 591.90625, + "epoch": 1.3150684931506849, + "grad_norm": 1.6525453329086304, + "kl": 0.37451171875, + "learning_rate": 5.616438356164383e-07, + "loss": 0.0004, + "reward": 1.5, + "reward_std": 0.2925042062997818, + "rewards/accuracy_reward": 0.53125, + "rewards/format_reward": 0.96875, + "step": 960 + }, + { + "clip_ratio": 0.0, + "completion_length": 617.59375, + "epoch": 1.3164383561643835, + "grad_norm": 1.9350600242614746, + "kl": 0.37255859375, + "learning_rate": 5.611872146118722e-07, + "loss": 0.0004, + "reward": 1.8984375, + "reward_std": 0.11048543266952038, + "rewards/accuracy_reward": 0.9296875, + "rewards/format_reward": 0.96875, + "step": 961 + }, + { + "clip_ratio": 0.0, + "completion_length": 417.8125, + "epoch": 1.3178082191780822, + "grad_norm": 2.944469928741455, + "kl": 0.416015625, + "learning_rate": 5.607305936073059e-07, + "loss": 0.0004, + "reward": 1.5234375, + "reward_std": 0.4395580645650625, + "rewards/accuracy_reward": 0.5546875, + "rewards/format_reward": 0.96875, + "step": 962 + }, + { + "clip_ratio": 0.0, + "completion_length": 625.84375, + "epoch": 1.319178082191781, + "grad_norm": 3.110598564147949, + "kl": 0.39990234375, + "learning_rate": 5.602739726027396e-07, + "loss": 0.0004, + "reward": 1.4921875, + "reward_std": 0.1297733597457409, + "rewards/accuracy_reward": 0.4921874701976776, + "rewards/format_reward": 1.0, + "step": 963 + }, + { + "clip_ratio": 0.0, + "completion_length": 516.3125, + "epoch": 1.3205479452054796, + "grad_norm": 3.190995216369629, + "kl": 0.39306640625, + "learning_rate": 5.598173515981735e-07, + "loss": 0.0004, + "reward": 1.6953125, + "reward_std": 0.2972095049917698, + "rewards/accuracy_reward": 0.6953125, + "rewards/format_reward": 1.0, + "step": 964 + }, + { + "clip_ratio": 0.0, + "completion_length": 624.34375, + "epoch": 1.321917808219178, + "grad_norm": 1.7255425453186035, + "kl": 0.41748046875, + "learning_rate": 5.593607305936073e-07, + "loss": 0.0004, + "reward": 1.515625, + "reward_std": 0.24831003323197365, + "rewards/accuracy_reward": 0.546875, + "rewards/format_reward": 0.96875, + "step": 965 + }, + { + "clip_ratio": 0.0, + "completion_length": 684.375, + "epoch": 1.3232876712328767, + "grad_norm": 2.058529853820801, + "kl": 0.36181640625, + "learning_rate": 5.589041095890411e-07, + "loss": 0.0004, + "reward": 1.84375, + "reward_std": 0.30173346400260925, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 0.96875, + "step": 966 + }, + { + "clip_ratio": 0.0, + "completion_length": 666.5625, + "epoch": 1.3246575342465754, + "grad_norm": 0.08733844757080078, + "kl": 0.435546875, + "learning_rate": 5.584474885844749e-07, + "loss": 0.0004, + "reward": 2.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 967 + }, + { + "clip_ratio": 0.0, + "completion_length": 534.5, + "epoch": 1.3260273972602739, + "grad_norm": 4.318344593048096, + "kl": 0.40234375, + "learning_rate": 5.579908675799086e-07, + "loss": 0.0004, + "reward": 1.09375, + "reward_std": 0.2041158601641655, + "rewards/accuracy_reward": 0.09375, + "rewards/format_reward": 1.0, + "step": 968 + }, + { + "clip_ratio": 0.0, + "completion_length": 345.4375, + "epoch": 1.3273972602739725, + "grad_norm": 7.782508373260498, + "kl": 0.4287109375, + "learning_rate": 5.575342465753425e-07, + "loss": 0.0004, + "reward": 1.28125, + "reward_std": 0.4218914955854416, + "rewards/accuracy_reward": 0.28125, + "rewards/format_reward": 1.0, + "step": 969 + }, + { + "clip_ratio": 0.0, + "completion_length": 784.46875, + "epoch": 1.3287671232876712, + "grad_norm": 3.4223484992980957, + "kl": 0.35205078125, + "learning_rate": 5.570776255707762e-07, + "loss": 0.0004, + "reward": 1.15625, + "reward_std": 0.2651650384068489, + "rewards/accuracy_reward": 0.21875, + "rewards/format_reward": 0.9375, + "step": 970 + }, + { + "clip_ratio": 0.0, + "completion_length": 512.125, + "epoch": 1.33013698630137, + "grad_norm": 1.055464267730713, + "kl": 0.39599609375, + "learning_rate": 5.566210045662099e-07, + "loss": 0.0004, + "reward": 1.45703125, + "reward_std": 0.028628919273614883, + "rewards/accuracy_reward": 0.45703125, + "rewards/format_reward": 1.0, + "step": 971 + }, + { + "clip_ratio": 0.0, + "completion_length": 455.53125, + "epoch": 1.3315068493150686, + "grad_norm": 3.628035068511963, + "kl": 0.41015625, + "learning_rate": 5.561643835616439e-07, + "loss": 0.0004, + "reward": 1.46875, + "reward_std": 0.4397946000099182, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.96875, + "step": 972 + }, + { + "clip_ratio": 0.0, + "completion_length": 442.3125, + "epoch": 1.3328767123287673, + "grad_norm": 1.331992745399475, + "kl": 0.4189453125, + "learning_rate": 5.557077625570776e-07, + "loss": 0.0004, + "reward": 1.59375, + "reward_std": 0.1293872892856598, + "rewards/accuracy_reward": 0.59375, + "rewards/format_reward": 1.0, + "step": 973 + }, + { + "clip_ratio": 0.0, + "completion_length": 660.0625, + "epoch": 1.3342465753424657, + "grad_norm": 1.930756688117981, + "kl": 0.373046875, + "learning_rate": 5.552511415525114e-07, + "loss": 0.0004, + "reward": 1.625, + "reward_std": 0.2925042062997818, + "rewards/accuracy_reward": 0.65625, + "rewards/format_reward": 0.96875, + "step": 974 + }, + { + "clip_ratio": 0.0, + "completion_length": 542.09375, + "epoch": 1.3356164383561644, + "grad_norm": 9.222930908203125, + "kl": 0.39697265625, + "learning_rate": 5.547945205479452e-07, + "loss": 0.0004, + "reward": 1.5, + "reward_std": 0.3777071312069893, + "rewards/accuracy_reward": 0.53125, + "rewards/format_reward": 0.96875, + "step": 975 + }, + { + "clip_ratio": 0.0, + "completion_length": 683.3125, + "epoch": 1.336986301369863, + "grad_norm": 2.1612720489501953, + "kl": 0.38427734375, + "learning_rate": 5.543378995433789e-07, + "loss": 0.0004, + "reward": 1.7994791567325592, + "reward_std": 0.37991958670318127, + "rewards/accuracy_reward": 0.8307291567325592, + "rewards/format_reward": 0.96875, + "step": 976 + }, + { + "clip_ratio": 0.0, + "completion_length": 667.65625, + "epoch": 1.3383561643835615, + "grad_norm": 3.2220661640167236, + "kl": 0.3798828125, + "learning_rate": 5.538812785388128e-07, + "loss": 0.0004, + "reward": 1.1953125, + "reward_std": 0.12073516845703125, + "rewards/accuracy_reward": 0.2265625, + "rewards/format_reward": 0.96875, + "step": 977 + }, + { + "clip_ratio": 0.0, + "completion_length": 709.03125, + "epoch": 1.3397260273972602, + "grad_norm": 7.830074310302734, + "kl": 0.39794921875, + "learning_rate": 5.534246575342465e-07, + "loss": 0.0004, + "reward": 1.765625, + "reward_std": 0.05444390885531902, + "rewards/accuracy_reward": 0.765625, + "rewards/format_reward": 1.0, + "step": 978 + }, + { + "clip_ratio": 0.0, + "completion_length": 673.5625, + "epoch": 1.341095890410959, + "grad_norm": 2.3540923595428467, + "kl": 0.3916015625, + "learning_rate": 5.529680365296803e-07, + "loss": 0.0004, + "reward": 1.03125, + "reward_std": 0.2041158601641655, + "rewards/accuracy_reward": 0.0625, + "rewards/format_reward": 0.96875, + "step": 979 + }, + { + "clip_ratio": 0.0, + "completion_length": 534.375, + "epoch": 1.3424657534246576, + "grad_norm": 12.305758476257324, + "kl": 0.44287109375, + "learning_rate": 5.525114155251142e-07, + "loss": 0.0004, + "reward": 1.9375, + "reward_std": 0.2884863168001175, + "rewards/accuracy_reward": 0.9375000596046448, + "rewards/format_reward": 1.0, + "step": 980 + }, + { + "clip_ratio": 0.0, + "completion_length": 665.3125, + "epoch": 1.3438356164383563, + "grad_norm": 5.535678863525391, + "kl": 0.3837890625, + "learning_rate": 5.520547945205479e-07, + "loss": 0.0004, + "reward": 1.576562523841858, + "reward_std": 0.16898519545793533, + "rewards/accuracy_reward": 0.5765625238418579, + "rewards/format_reward": 1.0, + "step": 981 + }, + { + "clip_ratio": 0.0, + "completion_length": 740.0, + "epoch": 1.345205479452055, + "grad_norm": 0.8980807065963745, + "kl": 0.4013671875, + "learning_rate": 5.515981735159817e-07, + "loss": 0.0004, + "reward": 2.253125011920929, + "reward_std": 0.13258251547813416, + "rewards/accuracy_reward": 1.253125011920929, + "rewards/format_reward": 1.0, + "step": 982 + }, + { + "clip_ratio": 0.0, + "completion_length": 435.0, + "epoch": 1.3465753424657534, + "grad_norm": 1.950405240058899, + "kl": 0.40380859375, + "learning_rate": 5.511415525114155e-07, + "loss": 0.0004, + "reward": 1.5625, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 1.0, + "step": 983 + }, + { + "clip_ratio": 0.0, + "completion_length": 785.625, + "epoch": 1.347945205479452, + "grad_norm": 0.8518653512001038, + "kl": 0.357421875, + "learning_rate": 5.506849315068492e-07, + "loss": 0.0004, + "reward": 1.21875, + "reward_std": 0.0883883461356163, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 0.96875, + "step": 984 + }, + { + "clip_ratio": 0.0, + "completion_length": 726.53125, + "epoch": 1.3493150684931507, + "grad_norm": 1.3810690641403198, + "kl": 0.40283203125, + "learning_rate": 5.502283105022832e-07, + "loss": 0.0004, + "reward": 1.7838541269302368, + "reward_std": 0.05391141213476658, + "rewards/accuracy_reward": 0.7838541567325592, + "rewards/format_reward": 1.0, + "step": 985 + }, + { + "clip_ratio": 0.0, + "completion_length": 675.5625, + "epoch": 1.3506849315068492, + "grad_norm": 3.373875141143799, + "kl": 0.36474609375, + "learning_rate": 5.497716894977169e-07, + "loss": 0.0004, + "reward": 1.84375, + "reward_std": 0.28276579082012177, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 0.96875, + "step": 986 + }, + { + "clip_ratio": 0.0, + "completion_length": 655.59375, + "epoch": 1.3520547945205479, + "grad_norm": 1.4311509132385254, + "kl": 0.4384765625, + "learning_rate": 5.493150684931506e-07, + "loss": 0.0004, + "reward": 1.5364583134651184, + "reward_std": 0.36767221987247467, + "rewards/accuracy_reward": 0.5677083283662796, + "rewards/format_reward": 0.96875, + "step": 987 + }, + { + "clip_ratio": 0.0, + "completion_length": 621.25, + "epoch": 1.3534246575342466, + "grad_norm": 4.090206146240234, + "kl": 0.40185546875, + "learning_rate": 5.488584474885845e-07, + "loss": 0.0004, + "reward": 2.08203125, + "reward_std": 0.19203272461891174, + "rewards/accuracy_reward": 1.08203125, + "rewards/format_reward": 1.0, + "step": 988 + }, + { + "clip_ratio": 0.0, + "completion_length": 418.34375, + "epoch": 1.3547945205479452, + "grad_norm": 0.023737359791994095, + "kl": 0.44921875, + "learning_rate": 5.484018264840182e-07, + "loss": 0.0005, + "reward": 1.25, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 1.0, + "step": 989 + }, + { + "clip_ratio": 0.0, + "completion_length": 583.9375, + "epoch": 1.356164383561644, + "grad_norm": 3.110661029815674, + "kl": 0.40283203125, + "learning_rate": 5.47945205479452e-07, + "loss": 0.0004, + "reward": 1.5104166567325592, + "reward_std": 0.18662460334599018, + "rewards/accuracy_reward": 0.5104166567325592, + "rewards/format_reward": 1.0, + "step": 990 + }, + { + "clip_ratio": 0.0, + "completion_length": 621.4375, + "epoch": 1.3575342465753424, + "grad_norm": 2.7677109241485596, + "kl": 0.3935546875, + "learning_rate": 5.474885844748858e-07, + "loss": 0.0004, + "reward": 1.7395833134651184, + "reward_std": 0.09139656275510788, + "rewards/accuracy_reward": 0.7395833432674408, + "rewards/format_reward": 1.0, + "step": 991 + }, + { + "clip_ratio": 0.0, + "completion_length": 611.59375, + "epoch": 1.358904109589041, + "grad_norm": 1.7430964708328247, + "kl": 0.37451171875, + "learning_rate": 5.470319634703196e-07, + "loss": 0.0004, + "reward": 1.765625, + "reward_std": 0.1804211586713791, + "rewards/accuracy_reward": 0.765625, + "rewards/format_reward": 1.0, + "step": 992 + }, + { + "clip_ratio": 0.0, + "completion_length": 731.75, + "epoch": 1.3602739726027397, + "grad_norm": 3.005925178527832, + "kl": 0.38525390625, + "learning_rate": 5.465753424657535e-07, + "loss": 0.0004, + "reward": 2.6171875, + "reward_std": 0.022097086533904076, + "rewards/accuracy_reward": 1.6171875, + "rewards/format_reward": 1.0, + "step": 993 + }, + { + "clip_ratio": 0.0, + "completion_length": 609.21875, + "epoch": 1.3616438356164384, + "grad_norm": 1.7805390357971191, + "kl": 0.39306640625, + "learning_rate": 5.461187214611872e-07, + "loss": 0.0004, + "reward": 1.8854166269302368, + "reward_std": 0.10773438680917025, + "rewards/accuracy_reward": 0.8854166269302368, + "rewards/format_reward": 1.0, + "step": 994 + }, + { + "clip_ratio": 0.0, + "completion_length": 307.5, + "epoch": 1.3630136986301369, + "grad_norm": 3.8102054595947266, + "kl": 0.46337890625, + "learning_rate": 5.456621004566209e-07, + "loss": 0.0005, + "reward": 1.65625, + "reward_std": 0.4218914955854416, + "rewards/accuracy_reward": 0.65625, + "rewards/format_reward": 1.0, + "step": 995 + }, + { + "clip_ratio": 0.0, + "completion_length": 712.1875, + "epoch": 1.3643835616438356, + "grad_norm": 1.8652178049087524, + "kl": 0.3828125, + "learning_rate": 5.452054794520548e-07, + "loss": 0.0004, + "reward": 1.7265625, + "reward_std": 0.03234682232141495, + "rewards/accuracy_reward": 0.7265625, + "rewards/format_reward": 1.0, + "step": 996 + }, + { + "clip_ratio": 0.0, + "completion_length": 396.96875, + "epoch": 1.3657534246575342, + "grad_norm": 3.4569380283355713, + "kl": 0.4541015625, + "learning_rate": 5.447488584474885e-07, + "loss": 0.0005, + "reward": 1.25, + "reward_std": 0.2314550280570984, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 1.0, + "step": 997 + }, + { + "clip_ratio": 0.0, + "completion_length": 550.40625, + "epoch": 1.367123287671233, + "grad_norm": 0.03730163723230362, + "kl": 0.39794921875, + "learning_rate": 5.442922374429223e-07, + "loss": 0.0004, + "reward": 1.375, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 1.0, + "step": 998 + }, + { + "clip_ratio": 0.0, + "completion_length": 727.78125, + "epoch": 1.3684931506849316, + "grad_norm": 1.656997799873352, + "kl": 0.39501953125, + "learning_rate": 5.438356164383562e-07, + "loss": 0.0004, + "reward": 1.676041603088379, + "reward_std": 0.11559091322124004, + "rewards/accuracy_reward": 0.6760416626930237, + "rewards/format_reward": 1.0, + "step": 999 + }, + { + "clip_ratio": 0.0, + "completion_length": 724.75, + "epoch": 1.36986301369863, + "grad_norm": 1.8627666234970093, + "kl": 0.45703125, + "learning_rate": 5.433789954337899e-07, + "loss": 0.0005, + "reward": 1.546875, + "reward_std": 0.2690962068736553, + "rewards/accuracy_reward": 0.578125, + "rewards/format_reward": 0.96875, + "step": 1000 + } + ], + "logging_steps": 1.0, + "max_steps": 2190, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}