diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,10702 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 820, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "completion_length": 788.5000305175781, + "epoch": 0.0012195121951219512, + "grad_norm": 0.3571978509426117, + "kl": 0.0, + "learning_rate": 3.658536585365854e-08, + "loss": 0.0178, + "reward": 0.10416666977107525, + "reward_std": 0.07216878235340118, + "rewards/accuracy_reward": 0.10416666977107525, + "rewards/format_reward": 0.0, + "step": 1 + }, + { + "completion_length": 595.1875305175781, + "epoch": 0.0024390243902439024, + "grad_norm": 0.3311821520328522, + "kl": 0.0, + "learning_rate": 7.317073170731708e-08, + "loss": -0.006, + "reward": 0.1458333358168602, + "reward_std": 0.03608439117670059, + "rewards/accuracy_reward": 0.1458333358168602, + "rewards/format_reward": 0.0, + "step": 2 + }, + { + "completion_length": 894.6041870117188, + "epoch": 0.003658536585365854, + "grad_norm": 0.5522251129150391, + "kl": 0.00023651123046875, + "learning_rate": 1.097560975609756e-07, + "loss": -0.0317, + "reward": 0.1875000074505806, + "reward_std": 0.18042195588350296, + "rewards/accuracy_reward": 0.1875000074505806, + "rewards/format_reward": 0.0, + "step": 3 + }, + { + "completion_length": 913.2083435058594, + "epoch": 0.004878048780487805, + "grad_norm": 0.2455306351184845, + "kl": 0.00029087066650390625, + "learning_rate": 1.4634146341463415e-07, + "loss": -0.0142, + "reward": 0.1666666716337204, + "reward_std": 0.03608439117670059, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 4 + }, + { + "completion_length": 645.5625, + "epoch": 0.006097560975609756, + "grad_norm": 0.20754282176494598, + "kl": 0.0003032684326171875, + "learning_rate": 1.8292682926829268e-07, + "loss": -0.0035, + "reward": 0.08333333395421505, + "reward_std": 0.03608439117670059, + "rewards/accuracy_reward": 0.08333333395421505, + "rewards/format_reward": 0.0, + "step": 5 + }, + { + "completion_length": 825.25, + "epoch": 0.007317073170731708, + "grad_norm": 0.33052483201026917, + "kl": 0.000278472900390625, + "learning_rate": 2.195121951219512e-07, + "loss": 0.0511, + "reward": 0.0416666679084301, + "reward_std": 0.07216878235340118, + "rewards/accuracy_reward": 0.0416666679084301, + "rewards/format_reward": 0.0, + "step": 6 + }, + { + "completion_length": 646.5208435058594, + "epoch": 0.00853658536585366, + "grad_norm": 0.6244280934333801, + "kl": 0.00029754638671875, + "learning_rate": 2.5609756097560976e-07, + "loss": -0.0356, + "reward": 0.06250000186264515, + "reward_std": 0.10825317353010178, + "rewards/accuracy_reward": 0.06250000186264515, + "rewards/format_reward": 0.0, + "step": 7 + }, + { + "completion_length": 723.1041870117188, + "epoch": 0.00975609756097561, + "grad_norm": 0.3806585371494293, + "kl": 0.00030422210693359375, + "learning_rate": 2.926829268292683e-07, + "loss": 0.0408, + "reward": 0.1250000037252903, + "reward_std": 0.07216878235340118, + "rewards/accuracy_reward": 0.1250000037252903, + "rewards/format_reward": 0.0, + "step": 8 + }, + { + "completion_length": 752.7083435058594, + "epoch": 0.01097560975609756, + "grad_norm": 0.3775721490383148, + "kl": 0.00028228759765625, + "learning_rate": 3.2926829268292686e-07, + "loss": 0.0091, + "reward": 0.1458333358168602, + "reward_std": 0.03608439117670059, + "rewards/accuracy_reward": 0.1458333358168602, + "rewards/format_reward": 0.0, + "step": 9 + }, + { + "completion_length": 880.8541870117188, + "epoch": 0.012195121951219513, + "grad_norm": 0.16199147701263428, + "kl": 0.00026607513427734375, + "learning_rate": 3.6585365853658536e-07, + "loss": 0.0043, + "reward": 0.1666666679084301, + "reward_std": 0.03608439117670059, + "rewards/accuracy_reward": 0.1666666679084301, + "rewards/format_reward": 0.0, + "step": 10 + }, + { + "completion_length": 752.1458435058594, + "epoch": 0.013414634146341463, + "grad_norm": 0.5467624068260193, + "kl": 0.000339508056640625, + "learning_rate": 4.0243902439024396e-07, + "loss": -0.056, + "reward": 0.1666666679084301, + "reward_std": 0.07216878235340118, + "rewards/accuracy_reward": 0.1666666679084301, + "rewards/format_reward": 0.0, + "step": 11 + }, + { + "completion_length": 790.125, + "epoch": 0.014634146341463415, + "grad_norm": 0.3221971392631531, + "kl": 0.0002956390380859375, + "learning_rate": 4.390243902439024e-07, + "loss": -0.0217, + "reward": 0.2500000074505806, + "reward_std": 0.07216878235340118, + "rewards/accuracy_reward": 0.2500000074505806, + "rewards/format_reward": 0.0, + "step": 12 + }, + { + "completion_length": 661.7291870117188, + "epoch": 0.015853658536585366, + "grad_norm": 0.5072605609893799, + "kl": 0.00029659271240234375, + "learning_rate": 4.75609756097561e-07, + "loss": -0.0177, + "reward": 0.1041666679084301, + "reward_std": 0.14433756470680237, + "rewards/accuracy_reward": 0.1041666679084301, + "rewards/format_reward": 0.0, + "step": 13 + }, + { + "completion_length": 620.8750305175781, + "epoch": 0.01707317073170732, + "grad_norm": 0.32282891869544983, + "kl": 0.00041866302490234375, + "learning_rate": 5.121951219512195e-07, + "loss": 0.0156, + "reward": 0.08333333395421505, + "reward_std": 0.03608439117670059, + "rewards/accuracy_reward": 0.08333333395421505, + "rewards/format_reward": 0.0, + "step": 14 + }, + { + "completion_length": 821.2291870117188, + "epoch": 0.018292682926829267, + "grad_norm": 0.2993911802768707, + "kl": 0.00032138824462890625, + "learning_rate": 5.48780487804878e-07, + "loss": 0.0265, + "reward": 0.0833333358168602, + "reward_std": 0.03608439117670059, + "rewards/accuracy_reward": 0.0833333358168602, + "rewards/format_reward": 0.0, + "step": 15 + }, + { + "completion_length": 632.0416870117188, + "epoch": 0.01951219512195122, + "grad_norm": 0.1648959368467331, + "kl": 0.000415802001953125, + "learning_rate": 5.853658536585366e-07, + "loss": -0.0027, + "reward": 0.1666666716337204, + "reward_std": 0.03608439117670059, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 16 + }, + { + "completion_length": 606.5208435058594, + "epoch": 0.020731707317073172, + "grad_norm": 0.4805357754230499, + "kl": 0.0004749298095703125, + "learning_rate": 6.219512195121951e-07, + "loss": -0.0221, + "reward": 0.2291666716337204, + "reward_std": 0.07216878235340118, + "rewards/accuracy_reward": 0.2291666716337204, + "rewards/format_reward": 0.0, + "step": 17 + }, + { + "completion_length": 603.0625305175781, + "epoch": 0.02195121951219512, + "grad_norm": 0.09931223839521408, + "kl": 0.0008544921875, + "learning_rate": 6.585365853658537e-07, + "loss": 0.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 18 + }, + { + "completion_length": 764.8125305175781, + "epoch": 0.023170731707317073, + "grad_norm": 0.02471252717077732, + "kl": 0.0004911422729492188, + "learning_rate": 6.951219512195122e-07, + "loss": 0.0, + "reward": 0.0625, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0625, + "rewards/format_reward": 0.0, + "step": 19 + }, + { + "completion_length": 608.6875305175781, + "epoch": 0.024390243902439025, + "grad_norm": 0.4140380918979645, + "kl": 0.0008373260498046875, + "learning_rate": 7.317073170731707e-07, + "loss": -0.0022, + "reward": 0.1250000037252903, + "reward_std": 0.07216878235340118, + "rewards/accuracy_reward": 0.1250000037252903, + "rewards/format_reward": 0.0, + "step": 20 + }, + { + "completion_length": 810.7292175292969, + "epoch": 0.025609756097560974, + "grad_norm": 0.43750235438346863, + "kl": 0.001323699951171875, + "learning_rate": 7.682926829268293e-07, + "loss": -0.0126, + "reward": 0.229166679084301, + "reward_std": 0.10825317353010178, + "rewards/accuracy_reward": 0.229166679084301, + "rewards/format_reward": 0.0, + "step": 21 + }, + { + "completion_length": 733.875, + "epoch": 0.026829268292682926, + "grad_norm": 0.0427839532494545, + "kl": 0.0009918212890625, + "learning_rate": 8.048780487804879e-07, + "loss": 0.0, + "reward": 0.0625, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0625, + "rewards/format_reward": 0.0, + "step": 22 + }, + { + "completion_length": 719.0208435058594, + "epoch": 0.02804878048780488, + "grad_norm": 0.3788954019546509, + "kl": 0.005901336669921875, + "learning_rate": 8.414634146341464e-07, + "loss": -0.0154, + "reward": 0.0416666679084301, + "reward_std": 0.07216878235340118, + "rewards/accuracy_reward": 0.0416666679084301, + "rewards/format_reward": 0.0, + "step": 23 + }, + { + "completion_length": 668.7708435058594, + "epoch": 0.02926829268292683, + "grad_norm": 0.6176497936248779, + "kl": 0.00179290771484375, + "learning_rate": 8.780487804878048e-07, + "loss": 0.0085, + "reward": 0.06250000186264515, + "reward_std": 0.10825317353010178, + "rewards/accuracy_reward": 0.06250000186264515, + "rewards/format_reward": 0.0, + "step": 24 + }, + { + "completion_length": 859.4166870117188, + "epoch": 0.03048780487804878, + "grad_norm": 0.34154024720191956, + "kl": 0.0024566650390625, + "learning_rate": 9.146341463414634e-07, + "loss": 0.0001, + "reward": 0.0625, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0625, + "rewards/format_reward": 0.0, + "step": 25 + }, + { + "completion_length": 600.7708435058594, + "epoch": 0.03170731707317073, + "grad_norm": 0.6522072553634644, + "kl": 0.005767822265625, + "learning_rate": 9.51219512195122e-07, + "loss": -0.0238, + "reward": 0.14583333395421505, + "reward_std": 0.10825317353010178, + "rewards/accuracy_reward": 0.14583333395421505, + "rewards/format_reward": 0.0, + "step": 26 + }, + { + "completion_length": 773.3541870117188, + "epoch": 0.032926829268292684, + "grad_norm": 0.07703638821840286, + "kl": 0.00222015380859375, + "learning_rate": 9.878048780487806e-07, + "loss": 0.0001, + "reward": 0.3125, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 0.0, + "step": 27 + }, + { + "completion_length": 944.4583435058594, + "epoch": 0.03414634146341464, + "grad_norm": 0.37970709800720215, + "kl": 0.001628875732421875, + "learning_rate": 1.024390243902439e-06, + "loss": -0.0156, + "reward": 0.291666679084301, + "reward_std": 0.07216878235340118, + "rewards/accuracy_reward": 0.291666679084301, + "rewards/format_reward": 0.0, + "step": 28 + }, + { + "completion_length": 838.8333435058594, + "epoch": 0.03536585365853658, + "grad_norm": 0.06090879812836647, + "kl": 0.001903533935546875, + "learning_rate": 1.0609756097560976e-06, + "loss": 0.0001, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 29 + }, + { + "completion_length": 771.0833435058594, + "epoch": 0.036585365853658534, + "grad_norm": 0.3602464199066162, + "kl": 0.00479888916015625, + "learning_rate": 1.097560975609756e-06, + "loss": 0.0039, + "reward": 0.1041666679084301, + "reward_std": 0.07216878235340118, + "rewards/accuracy_reward": 0.1041666679084301, + "rewards/format_reward": 0.0, + "step": 30 + }, + { + "completion_length": 664.7500305175781, + "epoch": 0.03780487804878049, + "grad_norm": 0.4147832691669464, + "kl": 0.0021514892578125, + "learning_rate": 1.1341463414634146e-06, + "loss": 0.0062, + "reward": 0.125, + "reward_std": 0.07216878235340118, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 0.0, + "step": 31 + }, + { + "completion_length": 860.0208740234375, + "epoch": 0.03902439024390244, + "grad_norm": 0.03816133737564087, + "kl": 0.00128936767578125, + "learning_rate": 1.1707317073170732e-06, + "loss": 0.0, + "reward": 0.0625, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0625, + "rewards/format_reward": 0.0, + "step": 32 + }, + { + "completion_length": 725.3125, + "epoch": 0.04024390243902439, + "grad_norm": 0.0702565535902977, + "kl": 0.002208709716796875, + "learning_rate": 1.2073170731707318e-06, + "loss": 0.0001, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 33 + }, + { + "completion_length": 651.3958435058594, + "epoch": 0.041463414634146344, + "grad_norm": 0.04627303034067154, + "kl": 0.001392364501953125, + "learning_rate": 1.2439024390243902e-06, + "loss": 0.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 34 + }, + { + "completion_length": 865.2291870117188, + "epoch": 0.042682926829268296, + "grad_norm": 0.35550418496131897, + "kl": 0.002017974853515625, + "learning_rate": 1.2804878048780488e-06, + "loss": -0.0023, + "reward": 0.0416666679084301, + "reward_std": 0.07216878235340118, + "rewards/accuracy_reward": 0.0416666679084301, + "rewards/format_reward": 0.0, + "step": 35 + }, + { + "completion_length": 950.8333435058594, + "epoch": 0.04390243902439024, + "grad_norm": 0.3738349974155426, + "kl": 0.001232147216796875, + "learning_rate": 1.3170731707317074e-06, + "loss": -0.0004, + "reward": 0.06250000186264515, + "reward_std": 0.10825317353010178, + "rewards/accuracy_reward": 0.06250000186264515, + "rewards/format_reward": 0.0, + "step": 36 + }, + { + "completion_length": 663.4375305175781, + "epoch": 0.045121951219512194, + "grad_norm": 0.5210116505622864, + "kl": 0.002288818359375, + "learning_rate": 1.3536585365853658e-06, + "loss": 0.0308, + "reward": 0.229166679084301, + "reward_std": 0.10825317353010178, + "rewards/accuracy_reward": 0.229166679084301, + "rewards/format_reward": 0.0, + "step": 37 + }, + { + "completion_length": 690.5, + "epoch": 0.046341463414634146, + "grad_norm": 0.39043503999710083, + "kl": 0.009716033935546875, + "learning_rate": 1.3902439024390244e-06, + "loss": 0.0009, + "reward": 0.2083333432674408, + "reward_std": 0.03608439117670059, + "rewards/accuracy_reward": 0.2083333432674408, + "rewards/format_reward": 0.0, + "step": 38 + }, + { + "completion_length": 679.2916870117188, + "epoch": 0.0475609756097561, + "grad_norm": 0.20443572103977203, + "kl": 0.00389862060546875, + "learning_rate": 1.4268292682926828e-06, + "loss": 0.0001, + "reward": 0.0625, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0625, + "rewards/format_reward": 0.0, + "step": 39 + }, + { + "completion_length": 637.7708740234375, + "epoch": 0.04878048780487805, + "grad_norm": 0.3915785551071167, + "kl": 0.001926422119140625, + "learning_rate": 1.4634146341463414e-06, + "loss": 0.0156, + "reward": 0.1458333432674408, + "reward_std": 0.03608439117670059, + "rewards/accuracy_reward": 0.1458333432674408, + "rewards/format_reward": 0.0, + "step": 40 + }, + { + "completion_length": 742.4166870117188, + "epoch": 0.05, + "grad_norm": 0.4709751009941101, + "kl": 0.002841949462890625, + "learning_rate": 1.5e-06, + "loss": 0.0118, + "reward": 0.16666667722165585, + "reward_std": 0.07216878235340118, + "rewards/accuracy_reward": 0.16666667722165585, + "rewards/format_reward": 0.0, + "step": 41 + }, + { + "completion_length": 794.6666870117188, + "epoch": 0.05121951219512195, + "grad_norm": 0.09043405950069427, + "kl": 0.001739501953125, + "learning_rate": 1.5365853658536586e-06, + "loss": 0.0001, + "reward": 0.0625, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0625, + "rewards/format_reward": 0.0, + "step": 42 + }, + { + "completion_length": 653.7708740234375, + "epoch": 0.0524390243902439, + "grad_norm": 0.4868049919605255, + "kl": 0.001434326171875, + "learning_rate": 1.5731707317073172e-06, + "loss": 0.0249, + "reward": 0.1041666679084301, + "reward_std": 0.07216878235340118, + "rewards/accuracy_reward": 0.1041666679084301, + "rewards/format_reward": 0.0, + "step": 43 + }, + { + "completion_length": 551.6458435058594, + "epoch": 0.05365853658536585, + "grad_norm": 0.3079073429107666, + "kl": 0.00128173828125, + "learning_rate": 1.6097560975609759e-06, + "loss": 0.0083, + "reward": 0.2916666716337204, + "reward_std": 0.03608439117670059, + "rewards/accuracy_reward": 0.2916666716337204, + "rewards/format_reward": 0.0, + "step": 44 + }, + { + "completion_length": 794.8333435058594, + "epoch": 0.054878048780487805, + "grad_norm": 0.5330808758735657, + "kl": 0.000972747802734375, + "learning_rate": 1.6463414634146342e-06, + "loss": -0.0414, + "reward": 0.1666666716337204, + "reward_std": 0.10825317353010178, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 45 + }, + { + "completion_length": 604.5833435058594, + "epoch": 0.05609756097560976, + "grad_norm": 0.5505648851394653, + "kl": 0.00283050537109375, + "learning_rate": 1.6829268292682928e-06, + "loss": -0.0118, + "reward": 0.2291666716337204, + "reward_std": 0.07216878235340118, + "rewards/accuracy_reward": 0.2291666716337204, + "rewards/format_reward": 0.0, + "step": 46 + }, + { + "completion_length": 541.2291717529297, + "epoch": 0.05731707317073171, + "grad_norm": 0.6158074736595154, + "kl": 0.00244903564453125, + "learning_rate": 1.719512195121951e-06, + "loss": 0.0012, + "reward": 0.1875000074505806, + "reward_std": 0.10825317353010178, + "rewards/accuracy_reward": 0.1875000074505806, + "rewards/format_reward": 0.0, + "step": 47 + }, + { + "completion_length": 671.0416870117188, + "epoch": 0.05853658536585366, + "grad_norm": 0.1726604402065277, + "kl": 0.005340576171875, + "learning_rate": 1.7560975609756096e-06, + "loss": 0.0001, + "reward": 0.0625, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0625, + "rewards/format_reward": 0.0, + "step": 48 + }, + { + "completion_length": 757.9166870117188, + "epoch": 0.05975609756097561, + "grad_norm": 0.08531015366315842, + "kl": 0.002162933349609375, + "learning_rate": 1.7926829268292682e-06, + "loss": 0.0001, + "reward": 0.125, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 0.0, + "step": 49 + }, + { + "completion_length": 729.4583740234375, + "epoch": 0.06097560975609756, + "grad_norm": 0.42457133531570435, + "kl": 0.0019073486328125, + "learning_rate": 1.8292682926829268e-06, + "loss": -0.033, + "reward": 0.1458333395421505, + "reward_std": 0.07216878235340118, + "rewards/accuracy_reward": 0.1458333395421505, + "rewards/format_reward": 0.0, + "step": 50 + }, + { + "completion_length": 641.375, + "epoch": 0.06219512195121951, + "grad_norm": 0.04091706499457359, + "kl": 0.002422332763671875, + "learning_rate": 1.8658536585365854e-06, + "loss": 0.0001, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 51 + }, + { + "completion_length": 760.2708740234375, + "epoch": 0.06341463414634146, + "grad_norm": 0.3898300528526306, + "kl": 0.00279998779296875, + "learning_rate": 1.902439024390244e-06, + "loss": -0.0003, + "reward": 0.2500000074505806, + "reward_std": 0.07216878235340118, + "rewards/accuracy_reward": 0.2500000074505806, + "rewards/format_reward": 0.0, + "step": 52 + }, + { + "completion_length": 744.9375305175781, + "epoch": 0.06463414634146342, + "grad_norm": 0.09664002805948257, + "kl": 0.0029296875, + "learning_rate": 1.9390243902439024e-06, + "loss": 0.0018, + "reward": 0.3333333432674408, + "reward_std": 0.03608439117670059, + "rewards/accuracy_reward": 0.3333333432674408, + "rewards/format_reward": 0.0, + "step": 53 + }, + { + "completion_length": 771.5625, + "epoch": 0.06585365853658537, + "grad_norm": 0.644061803817749, + "kl": 0.00276947021484375, + "learning_rate": 1.9756097560975613e-06, + "loss": -0.0595, + "reward": 0.2500000149011612, + "reward_std": 0.18042195588350296, + "rewards/accuracy_reward": 0.2500000149011612, + "rewards/format_reward": 0.0, + "step": 54 + }, + { + "completion_length": 714.1041870117188, + "epoch": 0.06707317073170732, + "grad_norm": 0.4097103178501129, + "kl": 0.00342559814453125, + "learning_rate": 2.0121951219512197e-06, + "loss": 0.0054, + "reward": 0.2083333432674408, + "reward_std": 0.03608439117670059, + "rewards/accuracy_reward": 0.2083333432674408, + "rewards/format_reward": 0.0, + "step": 55 + }, + { + "completion_length": 667.5625305175781, + "epoch": 0.06829268292682927, + "grad_norm": 0.690647304058075, + "kl": 0.0050201416015625, + "learning_rate": 2.048780487804878e-06, + "loss": 0.0823, + "reward": 0.1666666716337204, + "reward_std": 0.14433756470680237, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 56 + }, + { + "completion_length": 631.9375, + "epoch": 0.06951219512195123, + "grad_norm": 0.21440348029136658, + "kl": 0.0043792724609375, + "learning_rate": 2.0853658536585364e-06, + "loss": 0.0048, + "reward": 0.02083333395421505, + "reward_std": 0.03608439117670059, + "rewards/accuracy_reward": 0.02083333395421505, + "rewards/format_reward": 0.0, + "step": 57 + }, + { + "completion_length": 617.8958435058594, + "epoch": 0.07073170731707316, + "grad_norm": 0.363092303276062, + "kl": 0.005340576171875, + "learning_rate": 2.1219512195121953e-06, + "loss": 0.0056, + "reward": 0.125, + "reward_std": 0.07216878235340118, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 0.0, + "step": 58 + }, + { + "completion_length": 794.4375, + "epoch": 0.07195121951219512, + "grad_norm": 0.46107953786849976, + "kl": 0.008514404296875, + "learning_rate": 2.1585365853658537e-06, + "loss": 0.012, + "reward": 0.1458333358168602, + "reward_std": 0.10825317353010178, + "rewards/accuracy_reward": 0.1458333358168602, + "rewards/format_reward": 0.0, + "step": 59 + }, + { + "completion_length": 769.2083740234375, + "epoch": 0.07317073170731707, + "grad_norm": 0.6187219023704529, + "kl": 0.0089111328125, + "learning_rate": 2.195121951219512e-06, + "loss": 0.0335, + "reward": 0.1041666679084301, + "reward_std": 0.14433756470680237, + "rewards/accuracy_reward": 0.1041666679084301, + "rewards/format_reward": 0.0, + "step": 60 + }, + { + "completion_length": 766.9791870117188, + "epoch": 0.07439024390243902, + "grad_norm": 0.5316298007965088, + "kl": 0.005126953125, + "learning_rate": 2.231707317073171e-06, + "loss": -0.0355, + "reward": 0.2083333358168602, + "reward_std": 0.10825317353010178, + "rewards/accuracy_reward": 0.2083333358168602, + "rewards/format_reward": 0.0, + "step": 61 + }, + { + "completion_length": 622.1041870117188, + "epoch": 0.07560975609756097, + "grad_norm": 0.6351970434188843, + "kl": 0.0066375732421875, + "learning_rate": 2.2682926829268293e-06, + "loss": 0.0001, + "reward": 0.1666666679084301, + "reward_std": 0.14433756470680237, + "rewards/accuracy_reward": 0.1666666679084301, + "rewards/format_reward": 0.0, + "step": 62 + }, + { + "completion_length": 640.4791870117188, + "epoch": 0.07682926829268293, + "grad_norm": 0.4834135174751282, + "kl": 0.0060577392578125, + "learning_rate": 2.304878048780488e-06, + "loss": -0.0195, + "reward": 0.3125, + "reward_std": 0.07216878235340118, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 0.0, + "step": 63 + }, + { + "completion_length": 700.7708435058594, + "epoch": 0.07804878048780488, + "grad_norm": 0.34847137331962585, + "kl": 0.00543975830078125, + "learning_rate": 2.3414634146341465e-06, + "loss": 0.0004, + "reward": 0.1041666716337204, + "reward_std": 0.03608439117670059, + "rewards/accuracy_reward": 0.1041666716337204, + "rewards/format_reward": 0.0, + "step": 64 + }, + { + "completion_length": 704.4583435058594, + "epoch": 0.07926829268292683, + "grad_norm": 0.5386676788330078, + "kl": 0.005706787109375, + "learning_rate": 2.378048780487805e-06, + "loss": -0.0559, + "reward": 0.1458333358168602, + "reward_std": 0.14433756470680237, + "rewards/accuracy_reward": 0.1458333358168602, + "rewards/format_reward": 0.0, + "step": 65 + }, + { + "completion_length": 825.2083740234375, + "epoch": 0.08048780487804878, + "grad_norm": 0.6664050817489624, + "kl": 0.0067596435546875, + "learning_rate": 2.4146341463414637e-06, + "loss": 0.0535, + "reward": 0.1458333358168602, + "reward_std": 0.14433756470680237, + "rewards/accuracy_reward": 0.1458333358168602, + "rewards/format_reward": 0.0, + "step": 66 + }, + { + "completion_length": 743.7916717529297, + "epoch": 0.08170731707317073, + "grad_norm": 0.6769405603408813, + "kl": 0.0077056884765625, + "learning_rate": 2.451219512195122e-06, + "loss": 0.0357, + "reward": 0.2083333432674408, + "reward_std": 0.18042195588350296, + "rewards/accuracy_reward": 0.2083333432674408, + "rewards/format_reward": 0.0, + "step": 67 + }, + { + "completion_length": 681.4166870117188, + "epoch": 0.08292682926829269, + "grad_norm": 0.11391153931617737, + "kl": 0.005401611328125, + "learning_rate": 2.4878048780487805e-06, + "loss": 0.0002, + "reward": 0.0625, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0625, + "rewards/format_reward": 0.0, + "step": 68 + }, + { + "completion_length": 906.8125305175781, + "epoch": 0.08414634146341464, + "grad_norm": 0.48767825961112976, + "kl": 0.01153564453125, + "learning_rate": 2.524390243902439e-06, + "loss": -0.0238, + "reward": 0.4166666865348816, + "reward_std": 0.14433756470680237, + "rewards/accuracy_reward": 0.4166666865348816, + "rewards/format_reward": 0.0, + "step": 69 + }, + { + "completion_length": 617.2916717529297, + "epoch": 0.08536585365853659, + "grad_norm": 0.8901994228363037, + "kl": 0.015533447265625, + "learning_rate": 2.5609756097560977e-06, + "loss": -0.0081, + "reward": 0.1875000074505806, + "reward_std": 0.10825317353010178, + "rewards/accuracy_reward": 0.1875000074505806, + "rewards/format_reward": 0.0, + "step": 70 + }, + { + "completion_length": 775.3541870117188, + "epoch": 0.08658536585365853, + "grad_norm": 0.5658962726593018, + "kl": 0.014251708984375, + "learning_rate": 2.597560975609756e-06, + "loss": 0.0227, + "reward": 0.3125, + "reward_std": 0.14433756470680237, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 0.0, + "step": 71 + }, + { + "completion_length": 717.9375305175781, + "epoch": 0.08780487804878048, + "grad_norm": 0.5440481305122375, + "kl": 0.009857177734375, + "learning_rate": 2.634146341463415e-06, + "loss": -0.0534, + "reward": 0.0833333358168602, + "reward_std": 0.03608439117670059, + "rewards/accuracy_reward": 0.0833333358168602, + "rewards/format_reward": 0.0, + "step": 72 + }, + { + "completion_length": 718.6458740234375, + "epoch": 0.08902439024390243, + "grad_norm": 0.6130486130714417, + "kl": 0.012054443359375, + "learning_rate": 2.6707317073170733e-06, + "loss": 0.0151, + "reward": 0.2708333432674408, + "reward_std": 0.14433756470680237, + "rewards/accuracy_reward": 0.2708333432674408, + "rewards/format_reward": 0.0, + "step": 73 + }, + { + "completion_length": 813.5625305175781, + "epoch": 0.09024390243902439, + "grad_norm": 0.11815643310546875, + "kl": 0.0111541748046875, + "learning_rate": 2.7073170731707317e-06, + "loss": 0.0004, + "reward": 0.1875, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.1875, + "rewards/format_reward": 0.0, + "step": 74 + }, + { + "completion_length": 825.8541870117188, + "epoch": 0.09146341463414634, + "grad_norm": 395.2074279785156, + "kl": 4.0714111328125, + "learning_rate": 2.7439024390243905e-06, + "loss": 0.1267, + "reward": 0.2500000074505806, + "reward_std": 0.14433756470680237, + "rewards/accuracy_reward": 0.2500000074505806, + "rewards/format_reward": 0.0, + "step": 75 + }, + { + "completion_length": 799.9583435058594, + "epoch": 0.09268292682926829, + "grad_norm": 0.4025568664073944, + "kl": 0.013458251953125, + "learning_rate": 2.780487804878049e-06, + "loss": -0.0142, + "reward": 0.3333333432674408, + "reward_std": 0.14433756470680237, + "rewards/accuracy_reward": 0.3333333432674408, + "rewards/format_reward": 0.0, + "step": 76 + }, + { + "completion_length": 971.3333435058594, + "epoch": 0.09390243902439024, + "grad_norm": 0.5087530016899109, + "kl": 0.016357421875, + "learning_rate": 2.8170731707317073e-06, + "loss": -0.0119, + "reward": 0.10416666977107525, + "reward_std": 0.14433756470680237, + "rewards/accuracy_reward": 0.10416666977107525, + "rewards/format_reward": 0.0, + "step": 77 + }, + { + "completion_length": 587.6041870117188, + "epoch": 0.0951219512195122, + "grad_norm": 0.38013386726379395, + "kl": 0.01312255859375, + "learning_rate": 2.8536585365853657e-06, + "loss": 0.0125, + "reward": 0.0416666679084301, + "reward_std": 0.07216878235340118, + "rewards/accuracy_reward": 0.0416666679084301, + "rewards/format_reward": 0.0, + "step": 78 + }, + { + "completion_length": 659.6041870117188, + "epoch": 0.09634146341463415, + "grad_norm": 0.058323778212070465, + "kl": 0.013397216796875, + "learning_rate": 2.8902439024390245e-06, + "loss": 0.0004, + "reward": 0.125, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 0.0, + "step": 79 + }, + { + "completion_length": 680.1041870117188, + "epoch": 0.0975609756097561, + "grad_norm": 0.39666855335235596, + "kl": 0.01202392578125, + "learning_rate": 2.926829268292683e-06, + "loss": 0.0016, + "reward": 0.1666666716337204, + "reward_std": 0.07216878235340118, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 80 + }, + { + "completion_length": 898.7708740234375, + "epoch": 0.09878048780487805, + "grad_norm": 0.5175566673278809, + "kl": 0.01434326171875, + "learning_rate": 2.9634146341463417e-06, + "loss": 0.018, + "reward": 0.14583333395421505, + "reward_std": 0.14433756470680237, + "rewards/accuracy_reward": 0.14583333395421505, + "rewards/format_reward": 0.0, + "step": 81 + }, + { + "completion_length": 584.2083435058594, + "epoch": 0.1, + "grad_norm": 0.25543463230133057, + "kl": 0.01416015625, + "learning_rate": 3e-06, + "loss": 0.0109, + "reward": 0.08333333395421505, + "reward_std": 0.03608439117670059, + "rewards/accuracy_reward": 0.08333333395421505, + "rewards/format_reward": 0.0, + "step": 82 + }, + { + "completion_length": 652.0416870117188, + "epoch": 0.10121951219512196, + "grad_norm": 0.5867159962654114, + "kl": 0.0198974609375, + "learning_rate": 2.9999864091183917e-06, + "loss": -0.0393, + "reward": 0.125, + "reward_std": 0.07216878235340118, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 0.0, + "step": 83 + }, + { + "completion_length": 657.2708740234375, + "epoch": 0.1024390243902439, + "grad_norm": 0.5001187324523926, + "kl": 0.014007568359375, + "learning_rate": 2.999945636719849e-06, + "loss": 0.0109, + "reward": 0.1875000111758709, + "reward_std": 0.07216878235340118, + "rewards/accuracy_reward": 0.1875000111758709, + "rewards/format_reward": 0.0, + "step": 84 + }, + { + "completion_length": 672.2916870117188, + "epoch": 0.10365853658536585, + "grad_norm": 0.2978525757789612, + "kl": 0.014129638671875, + "learning_rate": 2.999877683543216e-06, + "loss": -0.017, + "reward": 0.1041666679084301, + "reward_std": 0.03608439117670059, + "rewards/accuracy_reward": 0.1041666679084301, + "rewards/format_reward": 0.0, + "step": 85 + }, + { + "completion_length": 654.8958435058594, + "epoch": 0.1048780487804878, + "grad_norm": 0.4366808533668518, + "kl": 0.009307861328125, + "learning_rate": 2.999782550819884e-06, + "loss": -0.0144, + "reward": 0.3333333432674408, + "reward_std": 0.10825317353010178, + "rewards/accuracy_reward": 0.3333333432674408, + "rewards/format_reward": 0.0, + "step": 86 + }, + { + "completion_length": 725.7083435058594, + "epoch": 0.10609756097560975, + "grad_norm": 0.332344651222229, + "kl": 0.012420654296875, + "learning_rate": 2.99966024027377e-06, + "loss": 0.0065, + "reward": 0.1458333358168602, + "reward_std": 0.03608439117670059, + "rewards/accuracy_reward": 0.1458333358168602, + "rewards/format_reward": 0.0, + "step": 87 + }, + { + "completion_length": 814.4583435058594, + "epoch": 0.1073170731707317, + "grad_norm": 0.4384961724281311, + "kl": 0.010833740234375, + "learning_rate": 2.9995107541212846e-06, + "loss": -0.0281, + "reward": 0.0833333358168602, + "reward_std": 0.10825317353010178, + "rewards/accuracy_reward": 0.0833333358168602, + "rewards/format_reward": 0.0, + "step": 88 + }, + { + "completion_length": 704.3125305175781, + "epoch": 0.10853658536585366, + "grad_norm": 0.32473617792129517, + "kl": 0.011474609375, + "learning_rate": 2.999334095071293e-06, + "loss": 0.0134, + "reward": 0.0833333358168602, + "reward_std": 0.03608439117670059, + "rewards/accuracy_reward": 0.0833333358168602, + "rewards/format_reward": 0.0, + "step": 89 + }, + { + "completion_length": 691.3958435058594, + "epoch": 0.10975609756097561, + "grad_norm": 0.4972739517688751, + "kl": 0.012054443359375, + "learning_rate": 2.9991302663250642e-06, + "loss": 0.0078, + "reward": 0.2500000149011612, + "reward_std": 0.10825317353010178, + "rewards/accuracy_reward": 0.2500000149011612, + "rewards/format_reward": 0.0, + "step": 90 + }, + { + "completion_length": 602.7916870117188, + "epoch": 0.11097560975609756, + "grad_norm": 0.06607077270746231, + "kl": 0.01300048828125, + "learning_rate": 2.9988992715762147e-06, + "loss": 0.0005, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 91 + }, + { + "completion_length": 793.6666870117188, + "epoch": 0.11219512195121951, + "grad_norm": 0.38537999987602234, + "kl": 0.013641357421875, + "learning_rate": 2.9986411150106423e-06, + "loss": 0.021, + "reward": 0.1041666679084301, + "reward_std": 0.07216878235340118, + "rewards/accuracy_reward": 0.1041666679084301, + "rewards/format_reward": 0.0, + "step": 92 + }, + { + "completion_length": 774.5000305175781, + "epoch": 0.11341463414634147, + "grad_norm": 0.3016974925994873, + "kl": 0.013336181640625, + "learning_rate": 2.9983558013064455e-06, + "loss": -0.0093, + "reward": 0.2708333358168602, + "reward_std": 0.03608439117670059, + "rewards/accuracy_reward": 0.2708333358168602, + "rewards/format_reward": 0.0, + "step": 93 + }, + { + "completion_length": 645.1875305175781, + "epoch": 0.11463414634146342, + "grad_norm": 0.5931347012519836, + "kl": 0.01019287109375, + "learning_rate": 2.998043335633845e-06, + "loss": 0.0087, + "reward": 0.1875000074505806, + "reward_std": 0.10825317353010178, + "rewards/accuracy_reward": 0.1875000074505806, + "rewards/format_reward": 0.0, + "step": 94 + }, + { + "completion_length": 724.2916870117188, + "epoch": 0.11585365853658537, + "grad_norm": 0.2517394721508026, + "kl": 0.015899658203125, + "learning_rate": 2.997703723655086e-06, + "loss": 0.0087, + "reward": 0.08333333395421505, + "reward_std": 0.03608439117670059, + "rewards/accuracy_reward": 0.08333333395421505, + "rewards/format_reward": 0.0, + "step": 95 + }, + { + "completion_length": 671.7083740234375, + "epoch": 0.11707317073170732, + "grad_norm": 0.12199469655752182, + "kl": 0.014068603515625, + "learning_rate": 2.9973369715243363e-06, + "loss": 0.0005, + "reward": 0.25, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 0.0, + "step": 96 + }, + { + "completion_length": 665.0833435058594, + "epoch": 0.11829268292682926, + "grad_norm": 0.4756318926811218, + "kl": 0.0115966796875, + "learning_rate": 2.996943085887577e-06, + "loss": -0.003, + "reward": 0.2083333432674408, + "reward_std": 0.10825317353010178, + "rewards/accuracy_reward": 0.2083333432674408, + "rewards/format_reward": 0.0, + "step": 97 + }, + { + "completion_length": 662.1250305175781, + "epoch": 0.11951219512195121, + "grad_norm": 0.3721674680709839, + "kl": 0.01690673828125, + "learning_rate": 2.996522073882477e-06, + "loss": -0.0076, + "reward": 0.0625, + "reward_std": 0.07216878235340118, + "rewards/accuracy_reward": 0.0625, + "rewards/format_reward": 0.0, + "step": 98 + }, + { + "completion_length": 706.9791870117188, + "epoch": 0.12073170731707317, + "grad_norm": 0.4329390525817871, + "kl": 0.011962890625, + "learning_rate": 2.9960739431382697e-06, + "loss": -0.0022, + "reward": 0.16666667722165585, + "reward_std": 0.07216878235340118, + "rewards/accuracy_reward": 0.16666667722165585, + "rewards/format_reward": 0.0, + "step": 99 + }, + { + "completion_length": 675.2708435058594, + "epoch": 0.12195121951219512, + "grad_norm": 0.08323477953672409, + "kl": 0.01898193359375, + "learning_rate": 2.9955987017756107e-06, + "loss": 0.0007, + "reward": 0.0625, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0625, + "rewards/format_reward": 0.0, + "step": 100 + }, + { + "completion_length": 774.4166870117188, + "epoch": 0.12317073170731707, + "grad_norm": 0.3017697334289551, + "kl": 0.014556884765625, + "learning_rate": 2.9950963584064327e-06, + "loss": -0.0116, + "reward": 0.25, + "reward_std": 0.07216878235340118, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 0.0, + "step": 101 + }, + { + "completion_length": 690.4791870117188, + "epoch": 0.12439024390243902, + "grad_norm": 10.445072174072266, + "kl": 0.10589599609375, + "learning_rate": 2.9945669221337873e-06, + "loss": -0.023, + "reward": 0.10416666977107525, + "reward_std": 0.07216878235340118, + "rewards/accuracy_reward": 0.10416666977107525, + "rewards/format_reward": 0.0, + "step": 102 + }, + { + "completion_length": 690.6666870117188, + "epoch": 0.12560975609756098, + "grad_norm": 0.24977770447731018, + "kl": 0.016754150390625, + "learning_rate": 2.994010402551682e-06, + "loss": 0.0083, + "reward": 0.2083333432674408, + "reward_std": 0.03608439117670059, + "rewards/accuracy_reward": 0.2083333432674408, + "rewards/format_reward": 0.0, + "step": 103 + }, + { + "completion_length": 768.1458740234375, + "epoch": 0.12682926829268293, + "grad_norm": 0.5401036739349365, + "kl": 0.0157470703125, + "learning_rate": 2.9934268097449068e-06, + "loss": -0.0023, + "reward": 0.2083333395421505, + "reward_std": 0.10825317353010178, + "rewards/accuracy_reward": 0.2083333395421505, + "rewards/format_reward": 0.0, + "step": 104 + }, + { + "completion_length": 716.8958435058594, + "epoch": 0.12804878048780488, + "grad_norm": 0.3317832350730896, + "kl": 0.01983642578125, + "learning_rate": 2.9928161542888487e-06, + "loss": 0.0046, + "reward": 0.06250000186264515, + "reward_std": 0.07216878235340118, + "rewards/accuracy_reward": 0.06250000186264515, + "rewards/format_reward": 0.0, + "step": 105 + }, + { + "completion_length": 739.6458435058594, + "epoch": 0.12926829268292683, + "grad_norm": 0.2833709120750427, + "kl": 0.0157470703125, + "learning_rate": 2.9921784472493023e-06, + "loss": 0.0306, + "reward": 0.1041666716337204, + "reward_std": 0.03608439117670059, + "rewards/accuracy_reward": 0.1041666716337204, + "rewards/format_reward": 0.0, + "step": 106 + }, + { + "completion_length": 685.7083435058594, + "epoch": 0.13048780487804879, + "grad_norm": 0.49674850702285767, + "kl": 0.02630615234375, + "learning_rate": 2.9915137001822686e-06, + "loss": -0.0083, + "reward": 0.1666666679084301, + "reward_std": 0.07216878235340118, + "rewards/accuracy_reward": 0.1666666679084301, + "rewards/format_reward": 0.0, + "step": 107 + }, + { + "completion_length": 962.9166870117188, + "epoch": 0.13170731707317074, + "grad_norm": 0.23559360206127167, + "kl": 0.01373291015625, + "learning_rate": 2.9908219251337465e-06, + "loss": 0.0306, + "reward": 0.1458333358168602, + "reward_std": 0.03608439117670059, + "rewards/accuracy_reward": 0.1458333358168602, + "rewards/format_reward": 0.0, + "step": 108 + }, + { + "completion_length": 756.2500305175781, + "epoch": 0.1329268292682927, + "grad_norm": 0.49854040145874023, + "kl": 0.01385498046875, + "learning_rate": 2.9901031346395125e-06, + "loss": -0.0227, + "reward": 0.10416666977107525, + "reward_std": 0.07216878235340118, + "rewards/accuracy_reward": 0.10416666977107525, + "rewards/format_reward": 0.0, + "step": 109 + }, + { + "completion_length": 880.1458740234375, + "epoch": 0.13414634146341464, + "grad_norm": 0.35081374645233154, + "kl": 0.01531982421875, + "learning_rate": 2.9893573417248957e-06, + "loss": 0.0152, + "reward": 0.1458333432674408, + "reward_std": 0.03608439117670059, + "rewards/accuracy_reward": 0.1458333432674408, + "rewards/format_reward": 0.0, + "step": 110 + }, + { + "completion_length": 783.8541870117188, + "epoch": 0.1353658536585366, + "grad_norm": 0.32845669984817505, + "kl": 0.0166015625, + "learning_rate": 2.98858455990454e-06, + "loss": 0.0296, + "reward": 0.1458333358168602, + "reward_std": 0.03608439117670059, + "rewards/accuracy_reward": 0.1458333358168602, + "rewards/format_reward": 0.0, + "step": 111 + }, + { + "completion_length": 924.6666870117188, + "epoch": 0.13658536585365855, + "grad_norm": 0.8032549619674683, + "kl": 0.05633544921875, + "learning_rate": 2.987784803182161e-06, + "loss": 0.0036, + "reward": 0.0416666679084301, + "reward_std": 0.07216878235340118, + "rewards/accuracy_reward": 0.0416666679084301, + "rewards/format_reward": 0.0, + "step": 112 + }, + { + "completion_length": 793.3958435058594, + "epoch": 0.1378048780487805, + "grad_norm": 0.03511551022529602, + "kl": 0.013946533203125, + "learning_rate": 2.9869580860502894e-06, + "loss": 0.0005, + "reward": 0.0625, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0625, + "rewards/format_reward": 0.0, + "step": 113 + }, + { + "completion_length": 724.6666870117188, + "epoch": 0.13902439024390245, + "grad_norm": 0.04026523232460022, + "kl": 0.013092041015625, + "learning_rate": 2.9861044234900125e-06, + "loss": 0.0005, + "reward": 0.0625, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0625, + "rewards/format_reward": 0.0, + "step": 114 + }, + { + "completion_length": 1096.5625610351562, + "epoch": 0.1402439024390244, + "grad_norm": 0.046590324491262436, + "kl": 0.014556884765625, + "learning_rate": 2.985223830970699e-06, + "loss": 0.0005, + "reward": 0.0625, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0625, + "rewards/format_reward": 0.0, + "step": 115 + }, + { + "completion_length": 756.1458740234375, + "epoch": 0.14146341463414633, + "grad_norm": 1.1626088619232178, + "kl": 0.079345703125, + "learning_rate": 2.98431632444972e-06, + "loss": -0.0112, + "reward": 0.1666666679084301, + "reward_std": 0.10825317353010178, + "rewards/accuracy_reward": 0.1666666679084301, + "rewards/format_reward": 0.0, + "step": 116 + }, + { + "completion_length": 537.8125, + "epoch": 0.14268292682926828, + "grad_norm": 0.26080095767974854, + "kl": 0.019287109375, + "learning_rate": 2.9833819203721614e-06, + "loss": 0.0128, + "reward": 0.2083333358168602, + "reward_std": 0.03608439117670059, + "rewards/accuracy_reward": 0.2083333358168602, + "rewards/format_reward": 0.0, + "step": 117 + }, + { + "completion_length": 636.1041870117188, + "epoch": 0.14390243902439023, + "grad_norm": 0.4711505174636841, + "kl": 0.0189208984375, + "learning_rate": 2.982420635670523e-06, + "loss": 0.0116, + "reward": 0.1666666716337204, + "reward_std": 0.07216878235340118, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 118 + }, + { + "completion_length": 633.5625305175781, + "epoch": 0.14512195121951219, + "grad_norm": 0.05762294679880142, + "kl": 0.015533447265625, + "learning_rate": 2.981432487764413e-06, + "loss": 0.0006, + "reward": 0.25, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 0.0, + "step": 119 + }, + { + "completion_length": 568.2291870117188, + "epoch": 0.14634146341463414, + "grad_norm": 0.5497531890869141, + "kl": 0.01641845703125, + "learning_rate": 2.980417494560234e-06, + "loss": 0.0081, + "reward": 0.16666667722165585, + "reward_std": 0.07216878235340118, + "rewards/accuracy_reward": 0.16666667722165585, + "rewards/format_reward": 0.0, + "step": 120 + }, + { + "completion_length": 732.0416870117188, + "epoch": 0.1475609756097561, + "grad_norm": 0.5708346366882324, + "kl": 0.015960693359375, + "learning_rate": 2.979375674450855e-06, + "loss": -0.0526, + "reward": 0.1041666679084301, + "reward_std": 0.07216878235340118, + "rewards/accuracy_reward": 0.1041666679084301, + "rewards/format_reward": 0.0, + "step": 121 + }, + { + "completion_length": 672.0208435058594, + "epoch": 0.14878048780487804, + "grad_norm": 0.21933433413505554, + "kl": 0.01849365234375, + "learning_rate": 2.9783070463152816e-06, + "loss": 0.008, + "reward": 0.02083333395421505, + "reward_std": 0.03608439117670059, + "rewards/accuracy_reward": 0.02083333395421505, + "rewards/format_reward": 0.0, + "step": 122 + }, + { + "completion_length": 673.6875, + "epoch": 0.15, + "grad_norm": 0.5358403325080872, + "kl": 0.02154541015625, + "learning_rate": 2.9772116295183124e-06, + "loss": -0.0399, + "reward": 0.1875000074505806, + "reward_std": 0.14433756470680237, + "rewards/accuracy_reward": 0.1875000074505806, + "rewards/format_reward": 0.0, + "step": 123 + }, + { + "completion_length": 678.8333740234375, + "epoch": 0.15121951219512195, + "grad_norm": 0.45023179054260254, + "kl": 0.022705078125, + "learning_rate": 2.9760894439101857e-06, + "loss": 0.0313, + "reward": 0.25, + "reward_std": 0.07216878235340118, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 0.0, + "step": 124 + }, + { + "completion_length": 702.5000305175781, + "epoch": 0.1524390243902439, + "grad_norm": 0.20217838883399963, + "kl": 0.013641357421875, + "learning_rate": 2.974940509826225e-06, + "loss": 0.0027, + "reward": 0.0833333358168602, + "reward_std": 0.03608439117670059, + "rewards/accuracy_reward": 0.0833333358168602, + "rewards/format_reward": 0.0, + "step": 125 + }, + { + "completion_length": 633.3541870117188, + "epoch": 0.15365853658536585, + "grad_norm": 0.37631967663764954, + "kl": 0.02264404296875, + "learning_rate": 2.973764848086466e-06, + "loss": -0.0185, + "reward": 0.125, + "reward_std": 0.07216878235340118, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 0.0, + "step": 126 + }, + { + "completion_length": 554.8750152587891, + "epoch": 0.1548780487804878, + "grad_norm": 0.3253299593925476, + "kl": 0.02276611328125, + "learning_rate": 2.9725624799952824e-06, + "loss": -0.0038, + "reward": 0.1458333358168602, + "reward_std": 0.03608439117670059, + "rewards/accuracy_reward": 0.1458333358168602, + "rewards/format_reward": 0.0, + "step": 127 + }, + { + "completion_length": 584.6458587646484, + "epoch": 0.15609756097560976, + "grad_norm": 0.39743635058403015, + "kl": 0.02117919921875, + "learning_rate": 2.9713334273409965e-06, + "loss": 0.0128, + "reward": 0.25, + "reward_std": 0.07216878235340118, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 0.0, + "step": 128 + }, + { + "completion_length": 737.2083435058594, + "epoch": 0.1573170731707317, + "grad_norm": 0.37635689973831177, + "kl": 0.01800537109375, + "learning_rate": 2.9700777123954867e-06, + "loss": -0.0073, + "reward": 0.0833333358168602, + "reward_std": 0.03608439117670059, + "rewards/accuracy_reward": 0.0833333358168602, + "rewards/format_reward": 0.0, + "step": 129 + }, + { + "completion_length": 751.875, + "epoch": 0.15853658536585366, + "grad_norm": 0.7148156762123108, + "kl": 0.0213623046875, + "learning_rate": 2.968795357913784e-06, + "loss": 0.0008, + "reward": 0.0625, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0625, + "rewards/format_reward": 0.0, + "step": 130 + }, + { + "completion_length": 941.3125305175781, + "epoch": 0.1597560975609756, + "grad_norm": 0.10271207243204117, + "kl": 0.019287109375, + "learning_rate": 2.9674863871336603e-06, + "loss": 0.0006, + "reward": 0.125, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 0.0, + "step": 131 + }, + { + "completion_length": 761.4375, + "epoch": 0.16097560975609757, + "grad_norm": 0.21008461713790894, + "kl": 0.0179443359375, + "learning_rate": 2.9661508237752034e-06, + "loss": 0.0088, + "reward": 0.08333333395421505, + "reward_std": 0.03608439117670059, + "rewards/accuracy_reward": 0.08333333395421505, + "rewards/format_reward": 0.0, + "step": 132 + }, + { + "completion_length": 679.3541870117188, + "epoch": 0.16219512195121952, + "grad_norm": 0.3089422881603241, + "kl": 0.0194091796875, + "learning_rate": 2.9647886920403916e-06, + "loss": 0.024, + "reward": 0.1458333358168602, + "reward_std": 0.03608439117670059, + "rewards/accuracy_reward": 0.1458333358168602, + "rewards/format_reward": 0.0, + "step": 133 + }, + { + "completion_length": 667.9166870117188, + "epoch": 0.16341463414634147, + "grad_norm": 0.07238946855068207, + "kl": 0.015869140625, + "learning_rate": 2.9634000166126534e-06, + "loss": 0.0006, + "reward": 0.375, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 0.0, + "step": 134 + }, + { + "completion_length": 620.2291870117188, + "epoch": 0.16463414634146342, + "grad_norm": 0.44060084223747253, + "kl": 0.02508544921875, + "learning_rate": 2.9619848226564196e-06, + "loss": -0.0035, + "reward": 0.1875000074505806, + "reward_std": 0.07216878235340118, + "rewards/accuracy_reward": 0.1875000074505806, + "rewards/format_reward": 0.0, + "step": 135 + }, + { + "completion_length": 505.79168701171875, + "epoch": 0.16585365853658537, + "grad_norm": 0.60687255859375, + "kl": 0.0185546875, + "learning_rate": 2.9605431358166687e-06, + "loss": -0.0126, + "reward": 0.12500000558793545, + "reward_std": 0.10825317353010178, + "rewards/accuracy_reward": 0.12500000558793545, + "rewards/format_reward": 0.0, + "step": 136 + }, + { + "completion_length": 592.6875305175781, + "epoch": 0.16707317073170733, + "grad_norm": 0.7305315136909485, + "kl": 0.0240478515625, + "learning_rate": 2.9590749822184602e-06, + "loss": -0.0122, + "reward": 0.2708333432674408, + "reward_std": 0.10825317353010178, + "rewards/accuracy_reward": 0.2708333432674408, + "rewards/format_reward": 0.0, + "step": 137 + }, + { + "completion_length": 604.3750152587891, + "epoch": 0.16829268292682928, + "grad_norm": 0.5852400660514832, + "kl": 0.0186767578125, + "learning_rate": 2.9575803884664634e-06, + "loss": 0.0194, + "reward": 0.2708333432674408, + "reward_std": 0.14433756470680237, + "rewards/accuracy_reward": 0.2708333432674408, + "rewards/format_reward": 0.0, + "step": 138 + }, + { + "completion_length": 675.6666870117188, + "epoch": 0.16951219512195123, + "grad_norm": 0.2013079971075058, + "kl": 0.02276611328125, + "learning_rate": 2.9560593816444746e-06, + "loss": 0.0004, + "reward": 0.02083333395421505, + "reward_std": 0.03608439117670059, + "rewards/accuracy_reward": 0.02083333395421505, + "rewards/format_reward": 0.0, + "step": 139 + }, + { + "completion_length": 504.37501525878906, + "epoch": 0.17073170731707318, + "grad_norm": 0.4904243052005768, + "kl": 0.0238037109375, + "learning_rate": 2.9545119893149243e-06, + "loss": -0.0117, + "reward": 0.0416666679084301, + "reward_std": 0.07216878235340118, + "rewards/accuracy_reward": 0.0416666679084301, + "rewards/format_reward": 0.0, + "step": 140 + }, + { + "completion_length": 681.2083435058594, + "epoch": 0.1719512195121951, + "grad_norm": 0.6175960302352905, + "kl": 0.024658203125, + "learning_rate": 2.9529382395183812e-06, + "loss": -0.0032, + "reward": 0.06250000186264515, + "reward_std": 0.10825317353010178, + "rewards/accuracy_reward": 0.06250000186264515, + "rewards/format_reward": 0.0, + "step": 141 + }, + { + "completion_length": 726.3125, + "epoch": 0.17317073170731706, + "grad_norm": 0.07112989574670792, + "kl": 0.01910400390625, + "learning_rate": 2.9513381607730403e-06, + "loss": 0.0007, + "reward": 0.0625, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0625, + "rewards/format_reward": 0.0, + "step": 142 + }, + { + "completion_length": 650.2083740234375, + "epoch": 0.174390243902439, + "grad_norm": 0.37218180298805237, + "kl": 0.0191650390625, + "learning_rate": 2.949711782074211e-06, + "loss": 0.0127, + "reward": 0.1458333432674408, + "reward_std": 0.03608439117670059, + "rewards/accuracy_reward": 0.1458333432674408, + "rewards/format_reward": 0.0, + "step": 143 + }, + { + "completion_length": 814.2708435058594, + "epoch": 0.17560975609756097, + "grad_norm": 0.05150744691491127, + "kl": 0.01824951171875, + "learning_rate": 2.948059132893786e-06, + "loss": 0.0006, + "reward": 0.125, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 0.0, + "step": 144 + }, + { + "completion_length": 687.2291870117188, + "epoch": 0.17682926829268292, + "grad_norm": 0.2909289300441742, + "kl": 0.02252197265625, + "learning_rate": 2.9463802431797115e-06, + "loss": 0.0009, + "reward": 0.1041666679084301, + "reward_std": 0.07216878235340118, + "rewards/accuracy_reward": 0.1041666679084301, + "rewards/format_reward": 0.0, + "step": 145 + }, + { + "completion_length": 607.1458740234375, + "epoch": 0.17804878048780487, + "grad_norm": 0.468717485666275, + "kl": 0.0185546875, + "learning_rate": 2.9446751433554426e-06, + "loss": 0.0035, + "reward": 0.10416666977107525, + "reward_std": 0.07216878235340118, + "rewards/accuracy_reward": 0.10416666977107525, + "rewards/format_reward": 0.0, + "step": 146 + }, + { + "completion_length": 590.0833435058594, + "epoch": 0.17926829268292682, + "grad_norm": 0.2879053056240082, + "kl": 0.01934814453125, + "learning_rate": 2.942943864319392e-06, + "loss": -0.0179, + "reward": 0.06250000186264515, + "reward_std": 0.07216878235340118, + "rewards/accuracy_reward": 0.06250000186264515, + "rewards/format_reward": 0.0, + "step": 147 + }, + { + "completion_length": 552.7083435058594, + "epoch": 0.18048780487804877, + "grad_norm": 0.11373342573642731, + "kl": 0.019775390625, + "learning_rate": 2.941186437444372e-06, + "loss": 0.0008, + "reward": 0.1875, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.1875, + "rewards/format_reward": 0.0, + "step": 148 + }, + { + "completion_length": 507.62501525878906, + "epoch": 0.18170731707317073, + "grad_norm": 0.4177855849266052, + "kl": 0.02447509765625, + "learning_rate": 2.939402894577022e-06, + "loss": 0.0069, + "reward": 0.02083333395421505, + "reward_std": 0.03608439117670059, + "rewards/accuracy_reward": 0.02083333395421505, + "rewards/format_reward": 0.0, + "step": 149 + }, + { + "completion_length": 425.4166717529297, + "epoch": 0.18292682926829268, + "grad_norm": 0.3714848458766937, + "kl": 0.02252197265625, + "learning_rate": 2.9375932680372358e-06, + "loss": -0.0108, + "reward": 0.2083333432674408, + "reward_std": 0.03608439117670059, + "rewards/accuracy_reward": 0.2083333432674408, + "rewards/format_reward": 0.0, + "step": 150 + }, + { + "completion_length": 693.5000305175781, + "epoch": 0.18414634146341463, + "grad_norm": 0.3068605363368988, + "kl": 0.0174560546875, + "learning_rate": 2.935757590617574e-06, + "loss": 0.0115, + "reward": 0.1666666679084301, + "reward_std": 0.07216878235340118, + "rewards/accuracy_reward": 0.1666666679084301, + "rewards/format_reward": 0.0, + "step": 151 + }, + { + "completion_length": 863.25, + "epoch": 0.18536585365853658, + "grad_norm": 0.07176525145769119, + "kl": 0.014923095703125, + "learning_rate": 2.9338958955826685e-06, + "loss": 0.0006, + "reward": 0.125, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 0.0, + "step": 152 + }, + { + "completion_length": 561.2916870117188, + "epoch": 0.18658536585365854, + "grad_norm": 0.051739297807216644, + "kl": 0.0205078125, + "learning_rate": 2.9320082166686226e-06, + "loss": 0.0007, + "reward": 0.125, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 0.0, + "step": 153 + }, + { + "completion_length": 621.0, + "epoch": 0.1878048780487805, + "grad_norm": 0.26465901732444763, + "kl": 0.0167236328125, + "learning_rate": 2.9300945880823955e-06, + "loss": -0.0025, + "reward": 0.0416666679084301, + "reward_std": 0.03608439117670059, + "rewards/accuracy_reward": 0.0416666679084301, + "rewards/format_reward": 0.0, + "step": 154 + }, + { + "completion_length": 525.875, + "epoch": 0.18902439024390244, + "grad_norm": 0.6293399930000305, + "kl": 0.0224609375, + "learning_rate": 2.928155044501189e-06, + "loss": -0.0075, + "reward": 0.12500000558793545, + "reward_std": 0.10825317353010178, + "rewards/accuracy_reward": 0.12500000558793545, + "rewards/format_reward": 0.0, + "step": 155 + }, + { + "completion_length": 561.7916870117188, + "epoch": 0.1902439024390244, + "grad_norm": 0.41370439529418945, + "kl": 0.016876220703125, + "learning_rate": 2.9261896210718106e-06, + "loss": 0.0014, + "reward": 0.1666666716337204, + "reward_std": 0.07216878235340118, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 156 + }, + { + "completion_length": 748.8750305175781, + "epoch": 0.19146341463414634, + "grad_norm": 0.04764688387513161, + "kl": 0.0169677734375, + "learning_rate": 2.924198353410044e-06, + "loss": 0.0006, + "reward": 0.125, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 0.0, + "step": 157 + }, + { + "completion_length": 576.6666717529297, + "epoch": 0.1926829268292683, + "grad_norm": 0.5345750451087952, + "kl": 0.02020263671875, + "learning_rate": 2.9221812776000003e-06, + "loss": 0.0161, + "reward": 0.1250000037252903, + "reward_std": 0.10825317353010178, + "rewards/accuracy_reward": 0.1250000037252903, + "rewards/format_reward": 0.0, + "step": 158 + }, + { + "completion_length": 717.7291870117188, + "epoch": 0.19390243902439025, + "grad_norm": 0.2683437168598175, + "kl": 0.016448974609375, + "learning_rate": 2.9201384301934632e-06, + "loss": -0.0001, + "reward": 0.0416666679084301, + "reward_std": 0.07216878235340118, + "rewards/accuracy_reward": 0.0416666679084301, + "rewards/format_reward": 0.0, + "step": 159 + }, + { + "completion_length": 744.0833435058594, + "epoch": 0.1951219512195122, + "grad_norm": 0.05052180215716362, + "kl": 0.0198974609375, + "learning_rate": 2.9180698482092302e-06, + "loss": 0.0007, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 160 + }, + { + "completion_length": 791.9166870117188, + "epoch": 0.19634146341463415, + "grad_norm": 0.04119595140218735, + "kl": 0.016937255859375, + "learning_rate": 2.9159755691324377e-06, + "loss": 0.0006, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 161 + }, + { + "completion_length": 639.9375305175781, + "epoch": 0.1975609756097561, + "grad_norm": 0.37889334559440613, + "kl": 0.020751953125, + "learning_rate": 2.913855630913884e-06, + "loss": -0.0038, + "reward": 0.1041666679084301, + "reward_std": 0.07216878235340118, + "rewards/accuracy_reward": 0.1041666679084301, + "rewards/format_reward": 0.0, + "step": 162 + }, + { + "completion_length": 868.8125, + "epoch": 0.19878048780487806, + "grad_norm": 0.27075132727622986, + "kl": 0.01708984375, + "learning_rate": 2.911710071969342e-06, + "loss": 0.0158, + "reward": 0.10416666977107525, + "reward_std": 0.07216878235340118, + "rewards/accuracy_reward": 0.10416666977107525, + "rewards/format_reward": 0.0, + "step": 163 + }, + { + "completion_length": 890.9791870117188, + "epoch": 0.2, + "grad_norm": 0.285118043422699, + "kl": 0.014984130859375, + "learning_rate": 2.9095389311788626e-06, + "loss": -0.0051, + "reward": 0.2291666716337204, + "reward_std": 0.03608439117670059, + "rewards/accuracy_reward": 0.2291666716337204, + "rewards/format_reward": 0.0, + "step": 164 + }, + { + "completion_length": 745.7916870117188, + "epoch": 0.20121951219512196, + "grad_norm": 0.4438501000404358, + "kl": 0.016265869140625, + "learning_rate": 2.9073422478860678e-06, + "loss": -0.0643, + "reward": 0.1250000037252903, + "reward_std": 0.10825317353010178, + "rewards/accuracy_reward": 0.1250000037252903, + "rewards/format_reward": 0.0, + "step": 165 + }, + { + "completion_length": 664.1666870117188, + "epoch": 0.20243902439024392, + "grad_norm": 0.26312437653541565, + "kl": 0.02099609375, + "learning_rate": 2.9051200618974418e-06, + "loss": 0.0026, + "reward": 0.02083333395421505, + "reward_std": 0.03608439117670059, + "rewards/accuracy_reward": 0.02083333395421505, + "rewards/format_reward": 0.0, + "step": 166 + }, + { + "completion_length": 731.6875, + "epoch": 0.20365853658536584, + "grad_norm": 0.34627678990364075, + "kl": 0.01849365234375, + "learning_rate": 2.9028724134816064e-06, + "loss": -0.0197, + "reward": 0.1041666679084301, + "reward_std": 0.07216878235340118, + "rewards/accuracy_reward": 0.1041666679084301, + "rewards/format_reward": 0.0, + "step": 167 + }, + { + "completion_length": 688.1041870117188, + "epoch": 0.2048780487804878, + "grad_norm": 0.4049510657787323, + "kl": 0.017578125, + "learning_rate": 2.9005993433685932e-06, + "loss": 0.013, + "reward": 0.10416666977107525, + "reward_std": 0.07216878235340118, + "rewards/accuracy_reward": 0.10416666977107525, + "rewards/format_reward": 0.0, + "step": 168 + }, + { + "completion_length": 683.3750305175781, + "epoch": 0.20609756097560974, + "grad_norm": 0.5772159099578857, + "kl": 0.01739501953125, + "learning_rate": 2.8983008927491046e-06, + "loss": 0.0063, + "reward": 0.2708333432674408, + "reward_std": 0.10825318098068237, + "rewards/accuracy_reward": 0.2708333432674408, + "rewards/format_reward": 0.0, + "step": 169 + }, + { + "completion_length": 693.4583740234375, + "epoch": 0.2073170731707317, + "grad_norm": 0.3611339032649994, + "kl": 0.0157470703125, + "learning_rate": 2.8959771032737673e-06, + "loss": -0.0406, + "reward": 0.0833333358168602, + "reward_std": 0.10825317353010178, + "rewards/accuracy_reward": 0.0833333358168602, + "rewards/format_reward": 0.0, + "step": 170 + }, + { + "completion_length": 630.0000305175781, + "epoch": 0.20853658536585365, + "grad_norm": 0.33315309882164, + "kl": 0.01806640625, + "learning_rate": 2.8936280170523784e-06, + "loss": -0.016, + "reward": 0.16666667722165585, + "reward_std": 0.07216878235340118, + "rewards/accuracy_reward": 0.16666667722165585, + "rewards/format_reward": 0.0, + "step": 171 + }, + { + "completion_length": 736.4166870117188, + "epoch": 0.2097560975609756, + "grad_norm": 0.5158050060272217, + "kl": 0.0198974609375, + "learning_rate": 2.8912536766531423e-06, + "loss": -0.0491, + "reward": 0.125, + "reward_std": 0.10825317353010178, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 0.0, + "step": 172 + }, + { + "completion_length": 731.9166870117188, + "epoch": 0.21097560975609755, + "grad_norm": 0.22972215712070465, + "kl": 0.017333984375, + "learning_rate": 2.8888541251018963e-06, + "loss": -0.0104, + "reward": 0.1458333432674408, + "reward_std": 0.03608439117670059, + "rewards/accuracy_reward": 0.1458333432674408, + "rewards/format_reward": 0.0, + "step": 173 + }, + { + "completion_length": 803.3333435058594, + "epoch": 0.2121951219512195, + "grad_norm": 0.7990434169769287, + "kl": 0.02093505859375, + "learning_rate": 2.8864294058813364e-06, + "loss": -0.112, + "reward": 0.2500000149011612, + "reward_std": 0.21650634706020355, + "rewards/accuracy_reward": 0.2500000149011612, + "rewards/format_reward": 0.0, + "step": 174 + }, + { + "completion_length": 572.9583740234375, + "epoch": 0.21341463414634146, + "grad_norm": 0.4772682189941406, + "kl": 0.02081298828125, + "learning_rate": 2.883979562930225e-06, + "loss": -0.0108, + "reward": 0.2083333432674408, + "reward_std": 0.03608439117670059, + "rewards/accuracy_reward": 0.2083333432674408, + "rewards/format_reward": 0.0, + "step": 175 + }, + { + "completion_length": 669.0208435058594, + "epoch": 0.2146341463414634, + "grad_norm": 0.05744696035981178, + "kl": 0.02032470703125, + "learning_rate": 2.8815046406425954e-06, + "loss": 0.0007, + "reward": 0.125, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 0.0, + "step": 176 + }, + { + "completion_length": 815.25, + "epoch": 0.21585365853658536, + "grad_norm": 0.2521149516105652, + "kl": 0.01373291015625, + "learning_rate": 2.8790046838669493e-06, + "loss": 0.0314, + "reward": 0.0833333358168602, + "reward_std": 0.03608439117670059, + "rewards/accuracy_reward": 0.0833333358168602, + "rewards/format_reward": 0.0, + "step": 177 + }, + { + "completion_length": 688.3750305175781, + "epoch": 0.21707317073170732, + "grad_norm": 0.6815643906593323, + "kl": 0.02545166015625, + "learning_rate": 2.876479737905442e-06, + "loss": -0.0403, + "reward": 0.1041666679084301, + "reward_std": 0.18042195588350296, + "rewards/accuracy_reward": 0.1041666679084301, + "rewards/format_reward": 0.0, + "step": 178 + }, + { + "completion_length": 618.25, + "epoch": 0.21829268292682927, + "grad_norm": 0.5136005878448486, + "kl": 0.02203369140625, + "learning_rate": 2.8739298485130627e-06, + "loss": -0.0078, + "reward": 0.2083333395421505, + "reward_std": 0.10825317353010178, + "rewards/accuracy_reward": 0.2083333395421505, + "rewards/format_reward": 0.0, + "step": 179 + }, + { + "completion_length": 671.2500305175781, + "epoch": 0.21951219512195122, + "grad_norm": 0.4481271803379059, + "kl": 0.0186767578125, + "learning_rate": 2.8713550618968034e-06, + "loss": 0.0089, + "reward": 0.1666666716337204, + "reward_std": 0.07216878235340118, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 180 + }, + { + "completion_length": 777.4791870117188, + "epoch": 0.22073170731707317, + "grad_norm": 0.3541518449783325, + "kl": 0.02325439453125, + "learning_rate": 2.8687554247148247e-06, + "loss": 0.0262, + "reward": 0.12500000558793545, + "reward_std": 0.10825317353010178, + "rewards/accuracy_reward": 0.12500000558793545, + "rewards/format_reward": 0.0, + "step": 181 + }, + { + "completion_length": 725.6875305175781, + "epoch": 0.22195121951219512, + "grad_norm": 0.09448660165071487, + "kl": 0.0201416015625, + "learning_rate": 2.8661309840756093e-06, + "loss": 0.0008, + "reward": 0.125, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 0.0, + "step": 182 + }, + { + "completion_length": 852.5833435058594, + "epoch": 0.22317073170731708, + "grad_norm": 0.30420514941215515, + "kl": 0.018310546875, + "learning_rate": 2.863481787537105e-06, + "loss": 0.005, + "reward": 0.02083333395421505, + "reward_std": 0.03608439117670059, + "rewards/accuracy_reward": 0.02083333395421505, + "rewards/format_reward": 0.0, + "step": 183 + }, + { + "completion_length": 676.0833435058594, + "epoch": 0.22439024390243903, + "grad_norm": 0.7469632029533386, + "kl": 0.0177001953125, + "learning_rate": 2.8608078831058682e-06, + "loss": 0.015, + "reward": 0.1875000074505806, + "reward_std": 0.10825317353010178, + "rewards/accuracy_reward": 0.1875000074505806, + "rewards/format_reward": 0.0, + "step": 184 + }, + { + "completion_length": 737.5833740234375, + "epoch": 0.22560975609756098, + "grad_norm": 0.4696647822856903, + "kl": 0.0260009765625, + "learning_rate": 2.8581093192361895e-06, + "loss": 0.0463, + "reward": 0.2291666716337204, + "reward_std": 0.18042195588350296, + "rewards/accuracy_reward": 0.2291666716337204, + "rewards/format_reward": 0.0, + "step": 185 + }, + { + "completion_length": 859.0, + "epoch": 0.22682926829268293, + "grad_norm": 0.04222070053219795, + "kl": 0.0218505859375, + "learning_rate": 2.8553861448292185e-06, + "loss": 0.0008, + "reward": 0.125, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 0.0, + "step": 186 + }, + { + "completion_length": 733.7708435058594, + "epoch": 0.2280487804878049, + "grad_norm": 0.4025222659111023, + "kl": 0.0279541015625, + "learning_rate": 2.852638409232077e-06, + "loss": 0.0001, + "reward": 0.1041666679084301, + "reward_std": 0.10825317353010178, + "rewards/accuracy_reward": 0.1041666679084301, + "rewards/format_reward": 0.0, + "step": 187 + }, + { + "completion_length": 564.2916717529297, + "epoch": 0.22926829268292684, + "grad_norm": 0.32440370321273804, + "kl": 0.0225830078125, + "learning_rate": 2.8498661622369637e-06, + "loss": 0.0085, + "reward": 0.14583333395421505, + "reward_std": 0.03608439117670059, + "rewards/accuracy_reward": 0.14583333395421505, + "rewards/format_reward": 0.0, + "step": 188 + }, + { + "completion_length": 830.2500305175781, + "epoch": 0.2304878048780488, + "grad_norm": 0.2527843117713928, + "kl": 0.0234375, + "learning_rate": 2.8470694540802527e-06, + "loss": 0.0077, + "reward": 0.0416666679084301, + "reward_std": 0.03608439117670059, + "rewards/accuracy_reward": 0.0416666679084301, + "rewards/format_reward": 0.0, + "step": 189 + }, + { + "completion_length": 990.375, + "epoch": 0.23170731707317074, + "grad_norm": 0.5628884434700012, + "kl": 0.04888916015625, + "learning_rate": 2.8442483354415836e-06, + "loss": 0.0041, + "reward": 0.25, + "reward_std": 0.07216878235340118, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 0.0, + "step": 190 + }, + { + "completion_length": 761.8125, + "epoch": 0.2329268292682927, + "grad_norm": 0.4500414729118347, + "kl": 0.02325439453125, + "learning_rate": 2.841402857442942e-06, + "loss": -0.0141, + "reward": 0.2083333432674408, + "reward_std": 0.10825317353010178, + "rewards/accuracy_reward": 0.2083333432674408, + "rewards/format_reward": 0.0, + "step": 191 + }, + { + "completion_length": 928.3958740234375, + "epoch": 0.23414634146341465, + "grad_norm": 0.32092925906181335, + "kl": 0.0218505859375, + "learning_rate": 2.8385330716477335e-06, + "loss": 0.0019, + "reward": 0.06250000186264515, + "reward_std": 0.07216878235340118, + "rewards/accuracy_reward": 0.06250000186264515, + "rewards/format_reward": 0.0, + "step": 192 + }, + { + "completion_length": 524.9375305175781, + "epoch": 0.23536585365853657, + "grad_norm": 0.08342643827199936, + "kl": 0.128875732421875, + "learning_rate": 2.835639030059851e-06, + "loss": 0.0008, + "reward": 0.125, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 0.0, + "step": 193 + }, + { + "completion_length": 728.5833435058594, + "epoch": 0.23658536585365852, + "grad_norm": 0.43060678243637085, + "kl": 0.0198974609375, + "learning_rate": 2.8327207851227295e-06, + "loss": 0.0183, + "reward": 0.125, + "reward_std": 0.07216878235340118, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 0.0, + "step": 194 + }, + { + "completion_length": 804.6666870117188, + "epoch": 0.23780487804878048, + "grad_norm": 0.2729571461677551, + "kl": 0.02264404296875, + "learning_rate": 2.829778389718398e-06, + "loss": 0.0081, + "reward": 0.2708333432674408, + "reward_std": 0.03608439117670059, + "rewards/accuracy_reward": 0.2708333432674408, + "rewards/format_reward": 0.0, + "step": 195 + }, + { + "completion_length": 701.6666870117188, + "epoch": 0.23902439024390243, + "grad_norm": 0.46106624603271484, + "kl": 0.02728271484375, + "learning_rate": 2.826811897166519e-06, + "loss": -0.0018, + "reward": 0.1041666679084301, + "reward_std": 0.07216878235340118, + "rewards/accuracy_reward": 0.1041666679084301, + "rewards/format_reward": 0.0, + "step": 196 + }, + { + "completion_length": 764.1666870117188, + "epoch": 0.24024390243902438, + "grad_norm": 0.1949763149023056, + "kl": 0.0191650390625, + "learning_rate": 2.8238213612234255e-06, + "loss": -0.0161, + "reward": 0.1458333358168602, + "reward_std": 0.03608439117670059, + "rewards/accuracy_reward": 0.1458333358168602, + "rewards/format_reward": 0.0, + "step": 197 + }, + { + "completion_length": 691.0625305175781, + "epoch": 0.24146341463414633, + "grad_norm": 0.05811993405222893, + "kl": 0.0220947265625, + "learning_rate": 2.8208068360811445e-06, + "loss": 0.0008, + "reward": 0.1875, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.1875, + "rewards/format_reward": 0.0, + "step": 198 + }, + { + "completion_length": 981.8125, + "epoch": 0.2426829268292683, + "grad_norm": 0.40761807560920715, + "kl": 0.02154541015625, + "learning_rate": 2.8177683763664137e-06, + "loss": -0.0305, + "reward": 0.1458333358168602, + "reward_std": 0.10825317353010178, + "rewards/accuracy_reward": 0.1458333358168602, + "rewards/format_reward": 0.0, + "step": 199 + }, + { + "completion_length": 962.7500305175781, + "epoch": 0.24390243902439024, + "grad_norm": 0.26158013939857483, + "kl": 0.0181884765625, + "learning_rate": 2.8147060371396953e-06, + "loss": 0.034, + "reward": 0.1041666679084301, + "reward_std": 0.07216878235340118, + "rewards/accuracy_reward": 0.1041666679084301, + "rewards/format_reward": 0.0, + "step": 200 + }, + { + "completion_length": 662.2083435058594, + "epoch": 0.2451219512195122, + "grad_norm": 0.4212491512298584, + "kl": 0.022216796875, + "learning_rate": 2.8116198738941766e-06, + "loss": -0.0079, + "reward": 0.1041666716337204, + "reward_std": 0.03608439117670059, + "rewards/accuracy_reward": 0.1041666716337204, + "rewards/format_reward": 0.0, + "step": 201 + }, + { + "completion_length": 836.3541870117188, + "epoch": 0.24634146341463414, + "grad_norm": 0.3516237437725067, + "kl": 0.02349853515625, + "learning_rate": 2.8085099425547627e-06, + "loss": -0.004, + "reward": 0.1250000037252903, + "reward_std": 0.10825317353010178, + "rewards/accuracy_reward": 0.1250000037252903, + "rewards/format_reward": 0.0, + "step": 202 + }, + { + "completion_length": 744.4583435058594, + "epoch": 0.2475609756097561, + "grad_norm": 0.5145571827888489, + "kl": 0.02130126953125, + "learning_rate": 2.8053762994770646e-06, + "loss": -0.0356, + "reward": 0.16666667722165585, + "reward_std": 0.10825317353010178, + "rewards/accuracy_reward": 0.16666667722165585, + "rewards/format_reward": 0.0, + "step": 203 + }, + { + "completion_length": 899.5416870117188, + "epoch": 0.24878048780487805, + "grad_norm": 1.0642642974853516, + "kl": 0.0482177734375, + "learning_rate": 2.8022190014463794e-06, + "loss": 0.0028, + "reward": 0.1041666679084301, + "reward_std": 0.07216878235340118, + "rewards/accuracy_reward": 0.1041666679084301, + "rewards/format_reward": 0.0, + "step": 204 + }, + { + "completion_length": 656.0625, + "epoch": 0.25, + "grad_norm": 0.4658428132534027, + "kl": 0.0225830078125, + "learning_rate": 2.7990381056766585e-06, + "loss": 0.0129, + "reward": 0.1041666716337204, + "reward_std": 0.03608439117670059, + "rewards/accuracy_reward": 0.1041666716337204, + "rewards/format_reward": 0.0, + "step": 205 + }, + { + "completion_length": 529.6041870117188, + "epoch": 0.25121951219512195, + "grad_norm": 0.6593291163444519, + "kl": 0.02252197265625, + "learning_rate": 2.795833669809471e-06, + "loss": 0.0031, + "reward": 0.2708333358168602, + "reward_std": 0.10825317353010178, + "rewards/accuracy_reward": 0.2708333358168602, + "rewards/format_reward": 0.0, + "step": 206 + }, + { + "completion_length": 883.5833740234375, + "epoch": 0.2524390243902439, + "grad_norm": 0.4920080900192261, + "kl": 0.02020263671875, + "learning_rate": 2.7926057519129634e-06, + "loss": 0.0473, + "reward": 0.1041666679084301, + "reward_std": 0.14433756470680237, + "rewards/accuracy_reward": 0.1041666679084301, + "rewards/format_reward": 0.0, + "step": 207 + }, + { + "completion_length": 704.2083435058594, + "epoch": 0.25365853658536586, + "grad_norm": 0.3727148771286011, + "kl": 0.01800537109375, + "learning_rate": 2.7893544104808017e-06, + "loss": -0.0068, + "reward": 0.16666667722165585, + "reward_std": 0.07216878235340118, + "rewards/accuracy_reward": 0.16666667722165585, + "rewards/format_reward": 0.0, + "step": 208 + }, + { + "completion_length": 555.1458587646484, + "epoch": 0.2548780487804878, + "grad_norm": 0.6752776503562927, + "kl": 0.023681640625, + "learning_rate": 2.7860797044311143e-06, + "loss": 0.0138, + "reward": 0.27083333395421505, + "reward_std": 0.14433756470680237, + "rewards/accuracy_reward": 0.27083333395421505, + "rewards/format_reward": 0.0, + "step": 209 + }, + { + "completion_length": 644.2708740234375, + "epoch": 0.25609756097560976, + "grad_norm": 0.35868731141090393, + "kl": 0.020751953125, + "learning_rate": 2.7827816931054245e-06, + "loss": -0.0067, + "reward": 0.0833333358168602, + "reward_std": 0.03608439117670059, + "rewards/accuracy_reward": 0.0833333358168602, + "rewards/format_reward": 0.0, + "step": 210 + }, + { + "completion_length": 760.9583740234375, + "epoch": 0.2573170731707317, + "grad_norm": 0.4104251265525818, + "kl": 0.0220947265625, + "learning_rate": 2.7794604362675733e-06, + "loss": -0.0301, + "reward": 0.1458333358168602, + "reward_std": 0.10825317353010178, + "rewards/accuracy_reward": 0.1458333358168602, + "rewards/format_reward": 0.0, + "step": 211 + }, + { + "completion_length": 791.0625, + "epoch": 0.25853658536585367, + "grad_norm": 0.51336669921875, + "kl": 0.02239990234375, + "learning_rate": 2.7761159941026403e-06, + "loss": 0.0342, + "reward": 0.14583333395421505, + "reward_std": 0.03608439117670059, + "rewards/accuracy_reward": 0.14583333395421505, + "rewards/format_reward": 0.0, + "step": 212 + }, + { + "completion_length": 577.2916870117188, + "epoch": 0.2597560975609756, + "grad_norm": 0.2733917534351349, + "kl": 0.024169921875, + "learning_rate": 2.772748427215848e-06, + "loss": 0.0023, + "reward": 0.14583333395421505, + "reward_std": 0.03608439117670059, + "rewards/accuracy_reward": 0.14583333395421505, + "rewards/format_reward": 0.0, + "step": 213 + }, + { + "completion_length": 569.5000152587891, + "epoch": 0.26097560975609757, + "grad_norm": 0.63326495885849, + "kl": 0.02398681640625, + "learning_rate": 2.7693577966314664e-06, + "loss": -0.0395, + "reward": 0.2291666716337204, + "reward_std": 0.18042195588350296, + "rewards/accuracy_reward": 0.2291666716337204, + "rewards/format_reward": 0.0, + "step": 214 + }, + { + "completion_length": 731.6875305175781, + "epoch": 0.2621951219512195, + "grad_norm": 0.5346475839614868, + "kl": 0.0211181640625, + "learning_rate": 2.7659441637917076e-06, + "loss": 0.0211, + "reward": 0.2916666716337204, + "reward_std": 0.14433756470680237, + "rewards/accuracy_reward": 0.2916666716337204, + "rewards/format_reward": 0.0, + "step": 215 + }, + { + "completion_length": 870.7500610351562, + "epoch": 0.2634146341463415, + "grad_norm": 0.5171618461608887, + "kl": 0.0230712890625, + "learning_rate": 2.7625075905556117e-06, + "loss": 0.0235, + "reward": 0.1875000074505806, + "reward_std": 0.10825317353010178, + "rewards/accuracy_reward": 0.1875000074505806, + "rewards/format_reward": 0.0, + "step": 216 + }, + { + "completion_length": 658.0, + "epoch": 0.2646341463414634, + "grad_norm": 0.2690303325653076, + "kl": 0.03155517578125, + "learning_rate": 2.7590481391979253e-06, + "loss": 0.0162, + "reward": 0.1458333358168602, + "reward_std": 0.03608439117670059, + "rewards/accuracy_reward": 0.1458333358168602, + "rewards/format_reward": 0.0, + "step": 217 + }, + { + "completion_length": 802.6666870117188, + "epoch": 0.2658536585365854, + "grad_norm": 0.3439900875091553, + "kl": 0.02734375, + "learning_rate": 2.755565872407973e-06, + "loss": 0.004, + "reward": 0.02083333395421505, + "reward_std": 0.03608439117670059, + "rewards/accuracy_reward": 0.02083333395421505, + "rewards/format_reward": 0.0, + "step": 218 + }, + { + "completion_length": 749.9375, + "epoch": 0.26707317073170733, + "grad_norm": 0.5028407573699951, + "kl": 0.02398681640625, + "learning_rate": 2.7520608532885228e-06, + "loss": -0.0342, + "reward": 0.2291666716337204, + "reward_std": 0.10825317353010178, + "rewards/accuracy_reward": 0.2291666716337204, + "rewards/format_reward": 0.0, + "step": 219 + }, + { + "completion_length": 770.5, + "epoch": 0.2682926829268293, + "grad_norm": 0.1258758008480072, + "kl": 0.0250244140625, + "learning_rate": 2.7485331453546407e-06, + "loss": 0.0009, + "reward": 0.0625, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0625, + "rewards/format_reward": 0.0, + "step": 220 + }, + { + "completion_length": 976.0208740234375, + "epoch": 0.26951219512195124, + "grad_norm": 0.22799670696258545, + "kl": 0.02197265625, + "learning_rate": 2.744982812532542e-06, + "loss": 0.0478, + "reward": 0.2083333432674408, + "reward_std": 0.03608439117670059, + "rewards/accuracy_reward": 0.2083333432674408, + "rewards/format_reward": 0.0, + "step": 221 + }, + { + "completion_length": 659.6458435058594, + "epoch": 0.2707317073170732, + "grad_norm": 0.35914433002471924, + "kl": 0.0286865234375, + "learning_rate": 2.7414099191584305e-06, + "loss": -0.0094, + "reward": 0.4166666865348816, + "reward_std": 0.03608439117670059, + "rewards/accuracy_reward": 0.4166666865348816, + "rewards/format_reward": 0.0, + "step": 222 + }, + { + "completion_length": 730.8125, + "epoch": 0.27195121951219514, + "grad_norm": 0.4243104159832001, + "kl": 0.02203369140625, + "learning_rate": 2.7378145299773337e-06, + "loss": 0.0084, + "reward": 0.1250000037252903, + "reward_std": 0.10825317353010178, + "rewards/accuracy_reward": 0.1250000037252903, + "rewards/format_reward": 0.0, + "step": 223 + }, + { + "completion_length": 587.3125305175781, + "epoch": 0.2731707317073171, + "grad_norm": 0.4017314016819, + "kl": 0.02728271484375, + "learning_rate": 2.7341967101419303e-06, + "loss": 0.0112, + "reward": 0.2500000149011612, + "reward_std": 0.07216878235340118, + "rewards/accuracy_reward": 0.2500000149011612, + "rewards/format_reward": 0.0, + "step": 224 + }, + { + "completion_length": 661.5625305175781, + "epoch": 0.27439024390243905, + "grad_norm": 0.3526459038257599, + "kl": 0.02374267578125, + "learning_rate": 2.730556525211368e-06, + "loss": -0.0158, + "reward": 0.1875000111758709, + "reward_std": 0.07216878235340118, + "rewards/accuracy_reward": 0.1875000111758709, + "rewards/format_reward": 0.0, + "step": 225 + }, + { + "completion_length": 649.7083435058594, + "epoch": 0.275609756097561, + "grad_norm": 0.12818405032157898, + "kl": 0.0234375, + "learning_rate": 2.726894041150077e-06, + "loss": 0.001, + "reward": 0.0625, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0625, + "rewards/format_reward": 0.0, + "step": 226 + }, + { + "completion_length": 956.9791870117188, + "epoch": 0.27682926829268295, + "grad_norm": 2.025303363800049, + "kl": 0.09014892578125, + "learning_rate": 2.7232093243265727e-06, + "loss": 0.0229, + "reward": 0.1875, + "reward_std": 0.07216878235340118, + "rewards/accuracy_reward": 0.1875, + "rewards/format_reward": 0.0, + "step": 227 + }, + { + "completion_length": 624.3125305175781, + "epoch": 0.2780487804878049, + "grad_norm": 0.2305293083190918, + "kl": 0.02728271484375, + "learning_rate": 2.7195024415122565e-06, + "loss": 0.0024, + "reward": 0.1666666716337204, + "reward_std": 0.03608439117670059, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 228 + }, + { + "completion_length": 624.2916870117188, + "epoch": 0.27926829268292686, + "grad_norm": 0.5692446231842041, + "kl": 0.02569580078125, + "learning_rate": 2.715773459880202e-06, + "loss": -0.0621, + "reward": 0.2083333432674408, + "reward_std": 0.14433756470680237, + "rewards/accuracy_reward": 0.2083333432674408, + "rewards/format_reward": 0.0, + "step": 229 + }, + { + "completion_length": 821.6875, + "epoch": 0.2804878048780488, + "grad_norm": 0.7158800959587097, + "kl": 0.02783203125, + "learning_rate": 2.7120224470039394e-06, + "loss": -0.0085, + "reward": 0.1458333358168602, + "reward_std": 0.18042195588350296, + "rewards/accuracy_reward": 0.1458333358168602, + "rewards/format_reward": 0.0, + "step": 230 + }, + { + "completion_length": 608.7916870117188, + "epoch": 0.2817073170731707, + "grad_norm": 0.5108224153518677, + "kl": 0.02252197265625, + "learning_rate": 2.7082494708562316e-06, + "loss": 0.0071, + "reward": 0.1041666716337204, + "reward_std": 0.07216878235340118, + "rewards/accuracy_reward": 0.1041666716337204, + "rewards/format_reward": 0.0, + "step": 231 + }, + { + "completion_length": 690.5833435058594, + "epoch": 0.28292682926829266, + "grad_norm": 0.3444475531578064, + "kl": 0.0269775390625, + "learning_rate": 2.7044545998078414e-06, + "loss": 0.0132, + "reward": 0.0416666679084301, + "reward_std": 0.07216878235340118, + "rewards/accuracy_reward": 0.0416666679084301, + "rewards/format_reward": 0.0, + "step": 232 + }, + { + "completion_length": 780.3333740234375, + "epoch": 0.2841463414634146, + "grad_norm": 0.1466454416513443, + "kl": 0.0250244140625, + "learning_rate": 2.7006379026262924e-06, + "loss": 0.0002, + "reward": 0.02083333395421505, + "reward_std": 0.03608439117670059, + "rewards/accuracy_reward": 0.02083333395421505, + "rewards/format_reward": 0.0, + "step": 233 + }, + { + "completion_length": 595.2083740234375, + "epoch": 0.28536585365853656, + "grad_norm": 0.051908962428569794, + "kl": 0.08935546875, + "learning_rate": 2.696799448474625e-06, + "loss": 0.001, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 234 + }, + { + "completion_length": 857.4166870117188, + "epoch": 0.2865853658536585, + "grad_norm": 0.3672059178352356, + "kl": 0.030029296875, + "learning_rate": 2.69293930691014e-06, + "loss": 0.0098, + "reward": 0.06250000186264515, + "reward_std": 0.10825317353010178, + "rewards/accuracy_reward": 0.06250000186264515, + "rewards/format_reward": 0.0, + "step": 235 + }, + { + "completion_length": 766.4791870117188, + "epoch": 0.28780487804878047, + "grad_norm": 0.13399188220500946, + "kl": 0.02374267578125, + "learning_rate": 2.689057547883139e-06, + "loss": 0.0001, + "reward": 0.20833333395421505, + "reward_std": 0.03608439117670059, + "rewards/accuracy_reward": 0.20833333395421505, + "rewards/format_reward": 0.0, + "step": 236 + }, + { + "completion_length": 666.7500305175781, + "epoch": 0.2890243902439024, + "grad_norm": 0.5673995614051819, + "kl": 0.02789306640625, + "learning_rate": 2.6851542417356605e-06, + "loss": -0.0061, + "reward": 0.229166679084301, + "reward_std": 0.07216878235340118, + "rewards/accuracy_reward": 0.229166679084301, + "rewards/format_reward": 0.0, + "step": 237 + }, + { + "completion_length": 721.4375305175781, + "epoch": 0.29024390243902437, + "grad_norm": 0.31549733877182007, + "kl": 0.02764892578125, + "learning_rate": 2.6812294592001984e-06, + "loss": -0.0241, + "reward": 0.02083333395421505, + "reward_std": 0.03608439117670059, + "rewards/accuracy_reward": 0.02083333395421505, + "rewards/format_reward": 0.0, + "step": 238 + }, + { + "completion_length": 702.8333435058594, + "epoch": 0.2914634146341463, + "grad_norm": 0.4895757734775543, + "kl": 0.0269775390625, + "learning_rate": 2.677283271398427e-06, + "loss": 0.0011, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 239 + }, + { + "completion_length": 734.9791870117188, + "epoch": 0.2926829268292683, + "grad_norm": 0.28384703397750854, + "kl": 0.03662109375, + "learning_rate": 2.673315749839907e-06, + "loss": -0.0144, + "reward": 0.02083333395421505, + "reward_std": 0.03608439117670059, + "rewards/accuracy_reward": 0.02083333395421505, + "rewards/format_reward": 0.0, + "step": 240 + }, + { + "completion_length": 763.6666870117188, + "epoch": 0.2939024390243902, + "grad_norm": 0.4405684173107147, + "kl": 0.027099609375, + "learning_rate": 2.669326966420793e-06, + "loss": 0.0024, + "reward": 0.25, + "reward_std": 0.07216878235340118, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 0.0, + "step": 241 + }, + { + "completion_length": 674.2708435058594, + "epoch": 0.2951219512195122, + "grad_norm": 0.6043628454208374, + "kl": 0.03167724609375, + "learning_rate": 2.6653169934225295e-06, + "loss": -0.0699, + "reward": 0.06250000186264515, + "reward_std": 0.10825317353010178, + "rewards/accuracy_reward": 0.06250000186264515, + "rewards/format_reward": 0.0, + "step": 242 + }, + { + "completion_length": 645.2708435058594, + "epoch": 0.29634146341463413, + "grad_norm": 0.5713904500007629, + "kl": 0.02581787109375, + "learning_rate": 2.661285903510541e-06, + "loss": -0.0441, + "reward": 0.2291666716337204, + "reward_std": 0.10825317353010178, + "rewards/accuracy_reward": 0.2291666716337204, + "rewards/format_reward": 0.0, + "step": 243 + }, + { + "completion_length": 781.5000305175781, + "epoch": 0.2975609756097561, + "grad_norm": 0.6145543456077576, + "kl": 0.022705078125, + "learning_rate": 2.6572337697329145e-06, + "loss": -0.008, + "reward": 0.2500000074505806, + "reward_std": 0.14433756470680237, + "rewards/accuracy_reward": 0.2500000074505806, + "rewards/format_reward": 0.0, + "step": 244 + }, + { + "completion_length": 862.4375305175781, + "epoch": 0.29878048780487804, + "grad_norm": 0.36308160424232483, + "kl": 0.02362060546875, + "learning_rate": 2.6531606655190777e-06, + "loss": 0.0404, + "reward": 0.125, + "reward_std": 0.07216878235340118, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 0.0, + "step": 245 + }, + { + "completion_length": 779.2500305175781, + "epoch": 0.3, + "grad_norm": 0.5605431795120239, + "kl": 0.0257568359375, + "learning_rate": 2.649066664678467e-06, + "loss": 0.0311, + "reward": 0.1250000037252903, + "reward_std": 0.18042195588350296, + "rewards/accuracy_reward": 0.1250000037252903, + "rewards/format_reward": 0.0, + "step": 246 + }, + { + "completion_length": 854.9375305175781, + "epoch": 0.30121951219512194, + "grad_norm": 0.4492291212081909, + "kl": 0.02496337890625, + "learning_rate": 2.64495184139919e-06, + "loss": 0.0258, + "reward": 0.125, + "reward_std": 0.10825317353010178, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 0.0, + "step": 247 + }, + { + "completion_length": 865.6250610351562, + "epoch": 0.3024390243902439, + "grad_norm": 0.71879643201828, + "kl": 0.02789306640625, + "learning_rate": 2.640816270246681e-06, + "loss": 0.0375, + "reward": 0.1458333395421505, + "reward_std": 0.21650634706020355, + "rewards/accuracy_reward": 0.1458333395421505, + "rewards/format_reward": 0.0, + "step": 248 + }, + { + "completion_length": 1087.0000305175781, + "epoch": 0.30365853658536585, + "grad_norm": 0.2714973986148834, + "kl": 0.02093505859375, + "learning_rate": 2.636660026162351e-06, + "loss": -0.0026, + "reward": 0.2291666716337204, + "reward_std": 0.10825317353010178, + "rewards/accuracy_reward": 0.2291666716337204, + "rewards/format_reward": 0.0, + "step": 249 + }, + { + "completion_length": 963.3750305175781, + "epoch": 0.3048780487804878, + "grad_norm": 0.5230698585510254, + "kl": 0.0335693359375, + "learning_rate": 2.6324831844622278e-06, + "loss": 0.0096, + "reward": 0.1458333358168602, + "reward_std": 0.14433756470680237, + "rewards/accuracy_reward": 0.1458333358168602, + "rewards/format_reward": 0.0, + "step": 250 + }, + { + "completion_length": 855.3541870117188, + "epoch": 0.30609756097560975, + "grad_norm": 0.34206509590148926, + "kl": 0.0244140625, + "learning_rate": 2.628285820835593e-06, + "loss": 0.0278, + "reward": 0.27083333395421505, + "reward_std": 0.10825317353010178, + "rewards/accuracy_reward": 0.27083333395421505, + "rewards/format_reward": 0.0, + "step": 251 + }, + { + "completion_length": 850.0417175292969, + "epoch": 0.3073170731707317, + "grad_norm": 0.35433900356292725, + "kl": 0.025390625, + "learning_rate": 2.6240680113436096e-06, + "loss": -0.0119, + "reward": 0.0833333358168602, + "reward_std": 0.07216878235340118, + "rewards/accuracy_reward": 0.0833333358168602, + "rewards/format_reward": 0.0, + "step": 252 + }, + { + "completion_length": 1090.8750610351562, + "epoch": 0.30853658536585366, + "grad_norm": 0.2618762254714966, + "kl": 0.02801513671875, + "learning_rate": 2.619829832417944e-06, + "loss": 0.0758, + "reward": 0.0625, + "reward_std": 0.07216878235340118, + "rewards/accuracy_reward": 0.0625, + "rewards/format_reward": 0.0, + "step": 253 + }, + { + "completion_length": 679.8541870117188, + "epoch": 0.3097560975609756, + "grad_norm": 0.6165598034858704, + "kl": 0.02508544921875, + "learning_rate": 2.6155713608593796e-06, + "loss": 0.0086, + "reward": 0.3125000149011612, + "reward_std": 0.10825317353010178, + "rewards/accuracy_reward": 0.3125000149011612, + "rewards/format_reward": 0.0, + "step": 254 + }, + { + "completion_length": 809.3125305175781, + "epoch": 0.31097560975609756, + "grad_norm": 0.2922210991382599, + "kl": 0.02557373046875, + "learning_rate": 2.6112926738364267e-06, + "loss": 0.036, + "reward": 0.125, + "reward_std": 0.07216878235340118, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 0.0, + "step": 255 + }, + { + "completion_length": 780.2291870117188, + "epoch": 0.3121951219512195, + "grad_norm": 0.4339911937713623, + "kl": 0.03070068359375, + "learning_rate": 2.606993848883924e-06, + "loss": 0.041, + "reward": 0.16666667722165585, + "reward_std": 0.07216878235340118, + "rewards/accuracy_reward": 0.16666667722165585, + "rewards/format_reward": 0.0, + "step": 256 + }, + { + "completion_length": 943.6458740234375, + "epoch": 0.31341463414634146, + "grad_norm": 0.19445890188217163, + "kl": 0.02685546875, + "learning_rate": 2.6026749639016327e-06, + "loss": 0.0082, + "reward": 0.1458333358168602, + "reward_std": 0.03608439117670059, + "rewards/accuracy_reward": 0.1458333358168602, + "rewards/format_reward": 0.0, + "step": 257 + }, + { + "completion_length": 870.3750305175781, + "epoch": 0.3146341463414634, + "grad_norm": 0.36287394165992737, + "kl": 0.031494140625, + "learning_rate": 2.5983360971528252e-06, + "loss": 0.0174, + "reward": 0.229166679084301, + "reward_std": 0.07216878235340118, + "rewards/accuracy_reward": 0.229166679084301, + "rewards/format_reward": 0.0, + "step": 258 + }, + { + "completion_length": 649.9375, + "epoch": 0.31585365853658537, + "grad_norm": 0.5976565480232239, + "kl": 0.02911376953125, + "learning_rate": 2.5939773272628674e-06, + "loss": 0.0043, + "reward": 0.2708333432674408, + "reward_std": 0.14433756470680237, + "rewards/accuracy_reward": 0.2708333432674408, + "rewards/format_reward": 0.0, + "step": 259 + }, + { + "completion_length": 733.9166870117188, + "epoch": 0.3170731707317073, + "grad_norm": 0.6212018728256226, + "kl": 0.02813720703125, + "learning_rate": 2.5895987332177935e-06, + "loss": -0.0088, + "reward": 0.125, + "reward_std": 0.18042195588350296, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 0.0, + "step": 260 + }, + { + "completion_length": 699.7291870117188, + "epoch": 0.3182926829268293, + "grad_norm": 0.24755185842514038, + "kl": 0.02996826171875, + "learning_rate": 2.5852003943628746e-06, + "loss": 0.0008, + "reward": 0.08333333395421505, + "reward_std": 0.03608439117670059, + "rewards/accuracy_reward": 0.08333333395421505, + "rewards/format_reward": 0.0, + "step": 261 + }, + { + "completion_length": 832.0625305175781, + "epoch": 0.3195121951219512, + "grad_norm": 0.28362536430358887, + "kl": 0.0272216796875, + "learning_rate": 2.5807823904011804e-06, + "loss": 0.0071, + "reward": 0.0416666679084301, + "reward_std": 0.03608439117670059, + "rewards/accuracy_reward": 0.0416666679084301, + "rewards/format_reward": 0.0, + "step": 262 + }, + { + "completion_length": 752.9375, + "epoch": 0.3207317073170732, + "grad_norm": 0.6556203365325928, + "kl": 0.0247802734375, + "learning_rate": 2.576344801392137e-06, + "loss": -0.006, + "reward": 0.2500000149011612, + "reward_std": 0.18042195588350296, + "rewards/accuracy_reward": 0.2500000149011612, + "rewards/format_reward": 0.0, + "step": 263 + }, + { + "completion_length": 812.4791870117188, + "epoch": 0.32195121951219513, + "grad_norm": 0.5754515528678894, + "kl": 0.02923583984375, + "learning_rate": 2.571887707750072e-06, + "loss": -0.0423, + "reward": 0.0833333358168602, + "reward_std": 0.10825317353010178, + "rewards/accuracy_reward": 0.0833333358168602, + "rewards/format_reward": 0.0, + "step": 264 + }, + { + "completion_length": 936.2708435058594, + "epoch": 0.3231707317073171, + "grad_norm": 0.26100462675094604, + "kl": 0.02996826171875, + "learning_rate": 2.5674111902427625e-06, + "loss": 0.023, + "reward": 0.125, + "reward_std": 0.07216878235340118, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 0.0, + "step": 265 + }, + { + "completion_length": 683.9375305175781, + "epoch": 0.32439024390243903, + "grad_norm": 0.24268393218517303, + "kl": 0.02978515625, + "learning_rate": 2.5629153299899673e-06, + "loss": -0.0018, + "reward": 0.2291666716337204, + "reward_std": 0.03608439117670059, + "rewards/accuracy_reward": 0.2291666716337204, + "rewards/format_reward": 0.0, + "step": 266 + }, + { + "completion_length": 757.6458435058594, + "epoch": 0.325609756097561, + "grad_norm": 0.5983391404151917, + "kl": 0.05010986328125, + "learning_rate": 2.5584002084619593e-06, + "loss": 0.0316, + "reward": 0.2916666716337204, + "reward_std": 0.21650636196136475, + "rewards/accuracy_reward": 0.2916666716337204, + "rewards/format_reward": 0.0, + "step": 267 + }, + { + "completion_length": 1014.1458740234375, + "epoch": 0.32682926829268294, + "grad_norm": 0.23932863771915436, + "kl": 0.028076171875, + "learning_rate": 2.5538659074780484e-06, + "loss": 0.0211, + "reward": 0.02083333395421505, + "reward_std": 0.03608439117670059, + "rewards/accuracy_reward": 0.02083333395421505, + "rewards/format_reward": 0.0, + "step": 268 + }, + { + "completion_length": 748.7708435058594, + "epoch": 0.3280487804878049, + "grad_norm": 0.4234470725059509, + "kl": 0.03076171875, + "learning_rate": 2.549312509205097e-06, + "loss": 0.0318, + "reward": 0.12500000558793545, + "reward_std": 0.07216878235340118, + "rewards/accuracy_reward": 0.12500000558793545, + "rewards/format_reward": 0.0, + "step": 269 + }, + { + "completion_length": 779.1041870117188, + "epoch": 0.32926829268292684, + "grad_norm": 0.5329450964927673, + "kl": 0.03021240234375, + "learning_rate": 2.5447400961560355e-06, + "loss": -0.0543, + "reward": 0.1458333395421505, + "reward_std": 0.10825317353010178, + "rewards/accuracy_reward": 0.1458333395421505, + "rewards/format_reward": 0.0, + "step": 270 + }, + { + "completion_length": 728.125, + "epoch": 0.3304878048780488, + "grad_norm": 0.5748668313026428, + "kl": 0.0338134765625, + "learning_rate": 2.5401487511883627e-06, + "loss": -0.0385, + "reward": 0.1875, + "reward_std": 0.18042195588350296, + "rewards/accuracy_reward": 0.1875, + "rewards/format_reward": 0.0, + "step": 271 + }, + { + "completion_length": 635.1666870117188, + "epoch": 0.33170731707317075, + "grad_norm": 0.7328594326972961, + "kl": 0.02838134765625, + "learning_rate": 2.5355385575026464e-06, + "loss": 0.0339, + "reward": 0.229166679084301, + "reward_std": 0.07216878235340118, + "rewards/accuracy_reward": 0.229166679084301, + "rewards/format_reward": 0.0, + "step": 272 + }, + { + "completion_length": 786.9166870117188, + "epoch": 0.3329268292682927, + "grad_norm": 0.056253425776958466, + "kl": 0.0277099609375, + "learning_rate": 2.5309095986410155e-06, + "loss": 0.001, + "reward": 0.125, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 0.0, + "step": 273 + }, + { + "completion_length": 575.5208587646484, + "epoch": 0.33414634146341465, + "grad_norm": 0.05611734464764595, + "kl": 0.0244140625, + "learning_rate": 2.5262619584856456e-06, + "loss": 0.0009, + "reward": 0.3125, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 0.0, + "step": 274 + }, + { + "completion_length": 632.1666870117188, + "epoch": 0.3353658536585366, + "grad_norm": 0.06363707035779953, + "kl": 0.0218505859375, + "learning_rate": 2.52159572125724e-06, + "loss": 0.0008, + "reward": 0.0625, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0625, + "rewards/format_reward": 0.0, + "step": 275 + }, + { + "completion_length": 915.6458435058594, + "epoch": 0.33658536585365856, + "grad_norm": 0.5183939933776855, + "kl": 0.027587890625, + "learning_rate": 2.5169109715135015e-06, + "loss": 0.0111, + "reward": 0.1458333395421505, + "reward_std": 0.10825317353010178, + "rewards/accuracy_reward": 0.1458333395421505, + "rewards/format_reward": 0.0, + "step": 276 + }, + { + "completion_length": 791.625, + "epoch": 0.3378048780487805, + "grad_norm": 0.32279711961746216, + "kl": 0.02484130859375, + "learning_rate": 2.512207794147603e-06, + "loss": 0.0133, + "reward": 0.1666666716337204, + "reward_std": 0.07216878235340118, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 277 + }, + { + "completion_length": 712.0000305175781, + "epoch": 0.33902439024390246, + "grad_norm": 0.18284721672534943, + "kl": 0.0255126953125, + "learning_rate": 2.507486274386647e-06, + "loss": -0.0013, + "reward": 0.02083333395421505, + "reward_std": 0.03608439117670059, + "rewards/accuracy_reward": 0.02083333395421505, + "rewards/format_reward": 0.0, + "step": 278 + }, + { + "completion_length": 719.6041870117188, + "epoch": 0.3402439024390244, + "grad_norm": 0.3969678580760956, + "kl": 0.0311279296875, + "learning_rate": 2.5027464977901206e-06, + "loss": -0.0471, + "reward": 0.0625, + "reward_std": 0.07216878235340118, + "rewards/accuracy_reward": 0.0625, + "rewards/format_reward": 0.0, + "step": 279 + }, + { + "completion_length": 662.8125305175781, + "epoch": 0.34146341463414637, + "grad_norm": 0.278129518032074, + "kl": 0.02740478515625, + "learning_rate": 2.4979885502483478e-06, + "loss": -0.0116, + "reward": 0.1458333432674408, + "reward_std": 0.03608439117670059, + "rewards/accuracy_reward": 0.1458333432674408, + "rewards/format_reward": 0.0, + "step": 280 + }, + { + "completion_length": 631.0833435058594, + "epoch": 0.3426829268292683, + "grad_norm": 0.49812057614326477, + "kl": 0.0302734375, + "learning_rate": 2.4932125179809316e-06, + "loss": -0.0037, + "reward": 0.1458333358168602, + "reward_std": 0.10825317353010178, + "rewards/accuracy_reward": 0.1458333358168602, + "rewards/format_reward": 0.0, + "step": 281 + }, + { + "completion_length": 840.5208740234375, + "epoch": 0.3439024390243902, + "grad_norm": 0.6025025248527527, + "kl": 0.03045654296875, + "learning_rate": 2.4884184875351897e-06, + "loss": 0.0369, + "reward": 0.1458333395421505, + "reward_std": 0.14433756470680237, + "rewards/accuracy_reward": 0.1458333395421505, + "rewards/format_reward": 0.0, + "step": 282 + }, + { + "completion_length": 601.2500305175781, + "epoch": 0.34512195121951217, + "grad_norm": 0.2603875696659088, + "kl": 0.03564453125, + "learning_rate": 2.48360654578459e-06, + "loss": 0.0017, + "reward": 0.0833333358168602, + "reward_std": 0.03608439117670059, + "rewards/accuracy_reward": 0.0833333358168602, + "rewards/format_reward": 0.0, + "step": 283 + }, + { + "completion_length": 605.9791870117188, + "epoch": 0.3463414634146341, + "grad_norm": 0.4111523926258087, + "kl": 0.02874755859375, + "learning_rate": 2.4787767799271725e-06, + "loss": 0.0172, + "reward": 0.1250000037252903, + "reward_std": 0.10825317353010178, + "rewards/accuracy_reward": 0.1250000037252903, + "rewards/format_reward": 0.0, + "step": 284 + }, + { + "completion_length": 581.3125, + "epoch": 0.3475609756097561, + "grad_norm": 0.3759603798389435, + "kl": 0.03076171875, + "learning_rate": 2.473929277483972e-06, + "loss": -0.0094, + "reward": 0.2083333358168602, + "reward_std": 0.03608439117670059, + "rewards/accuracy_reward": 0.2083333358168602, + "rewards/format_reward": 0.0, + "step": 285 + }, + { + "completion_length": 815.0625305175781, + "epoch": 0.348780487804878, + "grad_norm": 0.30721497535705566, + "kl": 0.0318603515625, + "learning_rate": 2.4690641262974317e-06, + "loss": 0.0639, + "reward": 0.0625, + "reward_std": 0.07216878235340118, + "rewards/accuracy_reward": 0.0625, + "rewards/format_reward": 0.0, + "step": 286 + }, + { + "completion_length": 812.2708435058594, + "epoch": 0.35, + "grad_norm": 0.05051800608634949, + "kl": 0.02484130859375, + "learning_rate": 2.464181414529809e-06, + "loss": 0.001, + "reward": 0.125, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 0.0, + "step": 287 + }, + { + "completion_length": 760.7291870117188, + "epoch": 0.35121951219512193, + "grad_norm": 0.3336050510406494, + "kl": 0.03076171875, + "learning_rate": 2.4592812306615812e-06, + "loss": -0.0171, + "reward": 0.0416666679084301, + "reward_std": 0.07216878235340118, + "rewards/accuracy_reward": 0.0416666679084301, + "rewards/format_reward": 0.0, + "step": 288 + }, + { + "completion_length": 664.7083740234375, + "epoch": 0.3524390243902439, + "grad_norm": 0.5336496829986572, + "kl": 0.03125, + "learning_rate": 2.4543636634898398e-06, + "loss": 0.0195, + "reward": 0.1041666679084301, + "reward_std": 0.18042195588350296, + "rewards/accuracy_reward": 0.1041666679084301, + "rewards/format_reward": 0.0, + "step": 289 + }, + { + "completion_length": 866.4375, + "epoch": 0.35365853658536583, + "grad_norm": 0.29412227869033813, + "kl": 0.02923583984375, + "learning_rate": 2.4494288021266825e-06, + "loss": 0.0126, + "reward": 0.1041666679084301, + "reward_std": 0.07216878235340118, + "rewards/accuracy_reward": 0.1041666679084301, + "rewards/format_reward": 0.0, + "step": 290 + }, + { + "completion_length": 922.4791870117188, + "epoch": 0.3548780487804878, + "grad_norm": 0.62317955493927, + "kl": 0.0618896484375, + "learning_rate": 2.444476735997598e-06, + "loss": 0.0498, + "reward": 0.2708333432674408, + "reward_std": 0.21650634706020355, + "rewards/accuracy_reward": 0.2708333432674408, + "rewards/format_reward": 0.0, + "step": 291 + }, + { + "completion_length": 728.375, + "epoch": 0.35609756097560974, + "grad_norm": 0.48821818828582764, + "kl": 0.03179931640625, + "learning_rate": 2.439507554839846e-06, + "loss": -0.0207, + "reward": 0.2083333358168602, + "reward_std": 0.18042195588350296, + "rewards/accuracy_reward": 0.2083333358168602, + "rewards/format_reward": 0.0, + "step": 292 + }, + { + "completion_length": 655.6458740234375, + "epoch": 0.3573170731707317, + "grad_norm": 0.3668544888496399, + "kl": 0.028076171875, + "learning_rate": 2.4345213487008296e-06, + "loss": -0.0002, + "reward": 0.10416666977107525, + "reward_std": 0.07216878235340118, + "rewards/accuracy_reward": 0.10416666977107525, + "rewards/format_reward": 0.0, + "step": 293 + }, + { + "completion_length": 563.5625305175781, + "epoch": 0.35853658536585364, + "grad_norm": 0.2510969340801239, + "kl": 0.029296875, + "learning_rate": 2.4295182079364655e-06, + "loss": 0.0075, + "reward": 0.1666666716337204, + "reward_std": 0.03608439117670059, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 294 + }, + { + "completion_length": 640.6458740234375, + "epoch": 0.3597560975609756, + "grad_norm": 0.4731411635875702, + "kl": 0.0267333984375, + "learning_rate": 2.424498223209545e-06, + "loss": 0.0057, + "reward": 0.1666666716337204, + "reward_std": 0.10825317353010178, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 295 + }, + { + "completion_length": 784.1875305175781, + "epoch": 0.36097560975609755, + "grad_norm": 0.43168067932128906, + "kl": 0.03045654296875, + "learning_rate": 2.4194614854880937e-06, + "loss": -0.0009, + "reward": 0.06250000186264515, + "reward_std": 0.07216878235340118, + "rewards/accuracy_reward": 0.06250000186264515, + "rewards/format_reward": 0.0, + "step": 296 + }, + { + "completion_length": 666.4791870117188, + "epoch": 0.3621951219512195, + "grad_norm": 0.41461437940597534, + "kl": 0.0250244140625, + "learning_rate": 2.4144080860437184e-06, + "loss": 0.0125, + "reward": 0.1250000037252903, + "reward_std": 0.10825317353010178, + "rewards/accuracy_reward": 0.1250000037252903, + "rewards/format_reward": 0.0, + "step": 297 + }, + { + "completion_length": 742.7291870117188, + "epoch": 0.36341463414634145, + "grad_norm": 0.056942686438560486, + "kl": 0.026123046875, + "learning_rate": 2.409338116449957e-06, + "loss": 0.001, + "reward": 0.1875, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.1875, + "rewards/format_reward": 0.0, + "step": 298 + }, + { + "completion_length": 702.1458435058594, + "epoch": 0.3646341463414634, + "grad_norm": 0.5765194296836853, + "kl": 0.0224609375, + "learning_rate": 2.404251668580619e-06, + "loss": 0.0231, + "reward": 0.3333333432674408, + "reward_std": 0.10825317353010178, + "rewards/accuracy_reward": 0.3333333432674408, + "rewards/format_reward": 0.0, + "step": 299 + }, + { + "completion_length": 716.8333435058594, + "epoch": 0.36585365853658536, + "grad_norm": 0.5342187881469727, + "kl": 0.02703857421875, + "learning_rate": 2.3991488346081183e-06, + "loss": -0.0256, + "reward": 0.2916666865348816, + "reward_std": 0.14433756470680237, + "rewards/accuracy_reward": 0.2916666865348816, + "rewards/format_reward": 0.0, + "step": 300 + }, + { + "completion_length": 646.7291870117188, + "epoch": 0.3670731707317073, + "grad_norm": 0.07587277144193649, + "kl": 0.0301513671875, + "learning_rate": 2.3940297070018048e-06, + "loss": 0.0012, + "reward": 0.125, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 0.0, + "step": 301 + }, + { + "completion_length": 719.8541870117188, + "epoch": 0.36829268292682926, + "grad_norm": 0.1976253092288971, + "kl": 0.02813720703125, + "learning_rate": 2.388894378526288e-06, + "loss": 0.0088, + "reward": 0.2083333432674408, + "reward_std": 0.03608439117670059, + "rewards/accuracy_reward": 0.2083333432674408, + "rewards/format_reward": 0.0, + "step": 302 + }, + { + "completion_length": 654.0833740234375, + "epoch": 0.3695121951219512, + "grad_norm": 0.5830801725387573, + "kl": 0.03106689453125, + "learning_rate": 2.383742942239757e-06, + "loss": 0.02, + "reward": 0.1666666716337204, + "reward_std": 0.14433757960796356, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 303 + }, + { + "completion_length": 631.0, + "epoch": 0.37073170731707317, + "grad_norm": 1.7362228631973267, + "kl": 0.0513916015625, + "learning_rate": 2.3785754914922923e-06, + "loss": 0.0032, + "reward": 0.1875, + "reward_std": 0.07216878235340118, + "rewards/accuracy_reward": 0.1875, + "rewards/format_reward": 0.0, + "step": 304 + }, + { + "completion_length": 704.5833435058594, + "epoch": 0.3719512195121951, + "grad_norm": 0.7385122776031494, + "kl": 0.0291748046875, + "learning_rate": 2.3733921199241755e-06, + "loss": -0.0092, + "reward": 0.2083333358168602, + "reward_std": 0.18042195588350296, + "rewards/accuracy_reward": 0.2083333358168602, + "rewards/format_reward": 0.0, + "step": 305 + }, + { + "completion_length": 819.3750305175781, + "epoch": 0.37317073170731707, + "grad_norm": 0.3535645008087158, + "kl": 0.03369140625, + "learning_rate": 2.3681929214641924e-06, + "loss": 0.0263, + "reward": 0.229166679084301, + "reward_std": 0.07216878235340118, + "rewards/accuracy_reward": 0.229166679084301, + "rewards/format_reward": 0.0, + "step": 306 + }, + { + "completion_length": 774.2083435058594, + "epoch": 0.374390243902439, + "grad_norm": 0.48355501890182495, + "kl": 0.03045654296875, + "learning_rate": 2.362977990327931e-06, + "loss": -0.0385, + "reward": 0.2500000149011612, + "reward_std": 0.14433756470680237, + "rewards/accuracy_reward": 0.2500000149011612, + "rewards/format_reward": 0.0, + "step": 307 + }, + { + "completion_length": 787.75, + "epoch": 0.375609756097561, + "grad_norm": 0.5030492544174194, + "kl": 0.02679443359375, + "learning_rate": 2.357747421016073e-06, + "loss": -0.07, + "reward": 0.125, + "reward_std": 0.10825317353010178, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 0.0, + "step": 308 + }, + { + "completion_length": 892.0625305175781, + "epoch": 0.37682926829268293, + "grad_norm": 0.1832209974527359, + "kl": 0.02886962890625, + "learning_rate": 2.3525013083126835e-06, + "loss": -0.0045, + "reward": 0.0833333358168602, + "reward_std": 0.03608439117670059, + "rewards/accuracy_reward": 0.0833333358168602, + "rewards/format_reward": 0.0, + "step": 309 + }, + { + "completion_length": 713.125, + "epoch": 0.3780487804878049, + "grad_norm": 0.3876541554927826, + "kl": 0.032470703125, + "learning_rate": 2.34723974728349e-06, + "loss": 0.0125, + "reward": 0.229166679084301, + "reward_std": 0.07216878235340118, + "rewards/accuracy_reward": 0.229166679084301, + "rewards/format_reward": 0.0, + "step": 310 + }, + { + "completion_length": 814.3541870117188, + "epoch": 0.37926829268292683, + "grad_norm": 0.2946406900882721, + "kl": 0.03204345703125, + "learning_rate": 2.341962833274165e-06, + "loss": 0.0051, + "reward": 0.0833333358168602, + "reward_std": 0.03608439117670059, + "rewards/accuracy_reward": 0.0833333358168602, + "rewards/format_reward": 0.0, + "step": 311 + }, + { + "completion_length": 808.9583740234375, + "epoch": 0.3804878048780488, + "grad_norm": 0.1276874542236328, + "kl": 0.03515625, + "learning_rate": 2.336670661908592e-06, + "loss": 0.0081, + "reward": 0.02083333395421505, + "reward_std": 0.03608439117670059, + "rewards/accuracy_reward": 0.02083333395421505, + "rewards/format_reward": 0.0, + "step": 312 + }, + { + "completion_length": 950.375, + "epoch": 0.38170731707317074, + "grad_norm": 0.2518679201602936, + "kl": 0.02764892578125, + "learning_rate": 2.3313633290871373e-06, + "loss": -0.0234, + "reward": 0.1875, + "reward_std": 0.07216878235340118, + "rewards/accuracy_reward": 0.1875, + "rewards/format_reward": 0.0, + "step": 313 + }, + { + "completion_length": 734.1666870117188, + "epoch": 0.3829268292682927, + "grad_norm": 0.32292279601097107, + "kl": 0.033447265625, + "learning_rate": 2.3260409309849103e-06, + "loss": -0.0036, + "reward": 0.1875, + "reward_std": 0.10825317353010178, + "rewards/accuracy_reward": 0.1875, + "rewards/format_reward": 0.0, + "step": 314 + }, + { + "completion_length": 892.4166870117188, + "epoch": 0.38414634146341464, + "grad_norm": 0.2906545400619507, + "kl": 0.0340576171875, + "learning_rate": 2.3207035640500206e-06, + "loss": -0.0361, + "reward": 0.125, + "reward_std": 0.07216878235340118, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 0.0, + "step": 315 + }, + { + "completion_length": 826.3333435058594, + "epoch": 0.3853658536585366, + "grad_norm": 0.500372052192688, + "kl": 0.03460693359375, + "learning_rate": 2.315351325001832e-06, + "loss": 0.0285, + "reward": 0.125, + "reward_std": 0.07216878235340118, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 0.0, + "step": 316 + }, + { + "completion_length": 968.1666870117188, + "epoch": 0.38658536585365855, + "grad_norm": 0.15128959715366364, + "kl": 0.02838134765625, + "learning_rate": 2.3099843108292062e-06, + "loss": 0.0349, + "reward": 0.1458333358168602, + "reward_std": 0.03608439117670059, + "rewards/accuracy_reward": 0.1458333358168602, + "rewards/format_reward": 0.0, + "step": 317 + }, + { + "completion_length": 1018.5625, + "epoch": 0.3878048780487805, + "grad_norm": 0.25746986269950867, + "kl": 0.0301513671875, + "learning_rate": 2.3046026187887498e-06, + "loss": -0.0357, + "reward": 0.1666666716337204, + "reward_std": 0.07216878235340118, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 318 + }, + { + "completion_length": 822.1041870117188, + "epoch": 0.38902439024390245, + "grad_norm": 0.2673456072807312, + "kl": 0.03369140625, + "learning_rate": 2.2992063464030482e-06, + "loss": -0.0471, + "reward": 0.0833333358168602, + "reward_std": 0.03608439117670059, + "rewards/accuracy_reward": 0.0833333358168602, + "rewards/format_reward": 0.0, + "step": 319 + }, + { + "completion_length": 761.8958435058594, + "epoch": 0.3902439024390244, + "grad_norm": 0.10515403747558594, + "kl": 0.03021240234375, + "learning_rate": 2.293795591458901e-06, + "loss": 0.0011, + "reward": 0.0625, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0625, + "rewards/format_reward": 0.0, + "step": 320 + }, + { + "completion_length": 594.7291870117188, + "epoch": 0.39146341463414636, + "grad_norm": 0.32800784707069397, + "kl": 0.03155517578125, + "learning_rate": 2.288370452005547e-06, + "loss": -0.0235, + "reward": 0.08333333395421505, + "reward_std": 0.03608439117670059, + "rewards/accuracy_reward": 0.08333333395421505, + "rewards/format_reward": 0.0, + "step": 321 + }, + { + "completion_length": 684.8541870117188, + "epoch": 0.3926829268292683, + "grad_norm": 0.0611780546605587, + "kl": 0.02734375, + "learning_rate": 2.2829310263528907e-06, + "loss": 0.001, + "reward": 0.1875, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.1875, + "rewards/format_reward": 0.0, + "step": 322 + }, + { + "completion_length": 779.6041870117188, + "epoch": 0.39390243902439026, + "grad_norm": 0.35459983348846436, + "kl": 0.02886962890625, + "learning_rate": 2.2774774130697184e-06, + "loss": 0.0159, + "reward": 0.2083333432674408, + "reward_std": 0.07216878235340118, + "rewards/accuracy_reward": 0.2083333432674408, + "rewards/format_reward": 0.0, + "step": 323 + }, + { + "completion_length": 792.125, + "epoch": 0.3951219512195122, + "grad_norm": 0.49110984802246094, + "kl": 0.03131103515625, + "learning_rate": 2.2720097109819135e-06, + "loss": 0.048, + "reward": 0.1458333395421505, + "reward_std": 0.10825317353010178, + "rewards/accuracy_reward": 0.1458333395421505, + "rewards/format_reward": 0.0, + "step": 324 + }, + { + "completion_length": 781.1458435058594, + "epoch": 0.39634146341463417, + "grad_norm": 0.9487172365188599, + "kl": 0.0322265625, + "learning_rate": 2.2665280191706656e-06, + "loss": 0.0379, + "reward": 0.25, + "reward_std": 0.07216878235340118, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 0.0, + "step": 325 + }, + { + "completion_length": 823.6250305175781, + "epoch": 0.3975609756097561, + "grad_norm": 0.45459306240081787, + "kl": 0.03363037109375, + "learning_rate": 2.2610324369706735e-06, + "loss": 0.0376, + "reward": 0.1666666716337204, + "reward_std": 0.14433756470680237, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 326 + }, + { + "completion_length": 528.4791717529297, + "epoch": 0.39878048780487807, + "grad_norm": 0.35636627674102783, + "kl": 0.03955078125, + "learning_rate": 2.2555230639683464e-06, + "loss": 0.0086, + "reward": 0.2708333432674408, + "reward_std": 0.03608439117670059, + "rewards/accuracy_reward": 0.2708333432674408, + "rewards/format_reward": 0.0, + "step": 327 + }, + { + "completion_length": 632.2083435058594, + "epoch": 0.4, + "grad_norm": 0.7059880495071411, + "kl": 0.02734375, + "learning_rate": 2.25e-06, + "loss": -0.0038, + "reward": 0.2083333358168602, + "reward_std": 0.18042195588350296, + "rewards/accuracy_reward": 0.2083333358168602, + "rewards/format_reward": 0.0, + "step": 328 + }, + { + "completion_length": 597.8541717529297, + "epoch": 0.401219512195122, + "grad_norm": 0.45517703890800476, + "kl": 0.03338623046875, + "learning_rate": 2.2444633451500453e-06, + "loss": 0.0128, + "reward": 0.125, + "reward_std": 0.07216878235340118, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 0.0, + "step": 329 + }, + { + "completion_length": 750.7083435058594, + "epoch": 0.4024390243902439, + "grad_norm": 0.07014621794223785, + "kl": 0.02850341796875, + "learning_rate": 2.2389131997491756e-06, + "loss": 0.001, + "reward": 0.1875, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.1875, + "rewards/format_reward": 0.0, + "step": 330 + }, + { + "completion_length": 736.0625, + "epoch": 0.4036585365853659, + "grad_norm": 0.37191396951675415, + "kl": 0.02838134765625, + "learning_rate": 2.2333496643725505e-06, + "loss": 0.0431, + "reward": 0.2083333432674408, + "reward_std": 0.10825317353010178, + "rewards/accuracy_reward": 0.2083333432674408, + "rewards/format_reward": 0.0, + "step": 331 + }, + { + "completion_length": 845.6250305175781, + "epoch": 0.40487804878048783, + "grad_norm": 0.052367597818374634, + "kl": 0.0240478515625, + "learning_rate": 2.2277728398379705e-06, + "loss": 0.0009, + "reward": 0.0625, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0625, + "rewards/format_reward": 0.0, + "step": 332 + }, + { + "completion_length": 994.6250610351562, + "epoch": 0.4060975609756098, + "grad_norm": 0.31657665967941284, + "kl": 0.03240966796875, + "learning_rate": 2.2221828272040517e-06, + "loss": 0.0022, + "reward": 0.0833333358168602, + "reward_std": 0.07216878235340118, + "rewards/accuracy_reward": 0.0833333358168602, + "rewards/format_reward": 0.0, + "step": 333 + }, + { + "completion_length": 674.9791870117188, + "epoch": 0.4073170731707317, + "grad_norm": 0.05391751974821091, + "kl": 0.02685546875, + "learning_rate": 2.2165797277683943e-06, + "loss": 0.001, + "reward": 0.0625, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0625, + "rewards/format_reward": 0.0, + "step": 334 + }, + { + "completion_length": 672.7291870117188, + "epoch": 0.40853658536585363, + "grad_norm": 0.2743265628814697, + "kl": 0.03350830078125, + "learning_rate": 2.2109636430657463e-06, + "loss": 0.0015, + "reward": 0.0833333358168602, + "reward_std": 0.03608439117670059, + "rewards/accuracy_reward": 0.0833333358168602, + "rewards/format_reward": 0.0, + "step": 335 + }, + { + "completion_length": 604.9375, + "epoch": 0.4097560975609756, + "grad_norm": 0.40125370025634766, + "kl": 0.03033447265625, + "learning_rate": 2.2053346748661633e-06, + "loss": 0.0156, + "reward": 0.16666667722165585, + "reward_std": 0.07216878235340118, + "rewards/accuracy_reward": 0.16666667722165585, + "rewards/format_reward": 0.0, + "step": 336 + }, + { + "completion_length": 1189.8125, + "epoch": 0.41097560975609754, + "grad_norm": 0.13064952194690704, + "kl": 0.02838134765625, + "learning_rate": 2.1996929251731665e-06, + "loss": 0.002, + "reward": 0.2083333358168602, + "reward_std": 0.03608439117670059, + "rewards/accuracy_reward": 0.2083333358168602, + "rewards/format_reward": 0.0, + "step": 337 + }, + { + "completion_length": 603.6875305175781, + "epoch": 0.4121951219512195, + "grad_norm": 0.7183840274810791, + "kl": 0.0283203125, + "learning_rate": 2.194038496221892e-06, + "loss": 0.0167, + "reward": 0.2083333395421505, + "reward_std": 0.14433756470680237, + "rewards/accuracy_reward": 0.2083333395421505, + "rewards/format_reward": 0.0, + "step": 338 + }, + { + "completion_length": 777.9375305175781, + "epoch": 0.41341463414634144, + "grad_norm": 0.042287491261959076, + "kl": 0.0262451171875, + "learning_rate": 2.188371490477239e-06, + "loss": 0.0009, + "reward": 0.125, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 0.0, + "step": 339 + }, + { + "completion_length": 842.4792175292969, + "epoch": 0.4146341463414634, + "grad_norm": 0.29391470551490784, + "kl": 0.0264892578125, + "learning_rate": 2.182692010632013e-06, + "loss": 0.0147, + "reward": 0.1666666679084301, + "reward_std": 0.10825317353010178, + "rewards/accuracy_reward": 0.1666666679084301, + "rewards/format_reward": 0.0, + "step": 340 + }, + { + "completion_length": 677.5833435058594, + "epoch": 0.41585365853658535, + "grad_norm": 0.4390711784362793, + "kl": 0.0345458984375, + "learning_rate": 2.177000159605065e-06, + "loss": 0.0028, + "reward": 0.1041666716337204, + "reward_std": 0.07216878235340118, + "rewards/accuracy_reward": 0.1041666716337204, + "rewards/format_reward": 0.0, + "step": 341 + }, + { + "completion_length": 760.4791870117188, + "epoch": 0.4170731707317073, + "grad_norm": 0.3324912190437317, + "kl": 0.02978515625, + "learning_rate": 2.1712960405394265e-06, + "loss": -0.0057, + "reward": 0.1666666716337204, + "reward_std": 0.03608439117670059, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 342 + }, + { + "completion_length": 658.0208435058594, + "epoch": 0.41829268292682925, + "grad_norm": 0.42109569907188416, + "kl": 0.0302734375, + "learning_rate": 2.1655797568004397e-06, + "loss": 0.0047, + "reward": 0.2500000149011612, + "reward_std": 0.07216878235340118, + "rewards/accuracy_reward": 0.2500000149011612, + "rewards/format_reward": 0.0, + "step": 343 + }, + { + "completion_length": 844.0208435058594, + "epoch": 0.4195121951219512, + "grad_norm": 0.5153623819351196, + "kl": 0.029296875, + "learning_rate": 2.1598514119738853e-06, + "loss": 0.0467, + "reward": 0.08333333395421505, + "reward_std": 0.10825317353010178, + "rewards/accuracy_reward": 0.08333333395421505, + "rewards/format_reward": 0.0, + "step": 344 + }, + { + "completion_length": 1024.7292175292969, + "epoch": 0.42073170731707316, + "grad_norm": 0.32257041335105896, + "kl": 0.0279541015625, + "learning_rate": 2.154111109864105e-06, + "loss": 0.0028, + "reward": 0.1666666716337204, + "reward_std": 0.07216878235340118, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 345 + }, + { + "completion_length": 634.3541870117188, + "epoch": 0.4219512195121951, + "grad_norm": 0.43821436166763306, + "kl": 0.0289306640625, + "learning_rate": 2.1483589544921202e-06, + "loss": 0.0102, + "reward": 0.0625, + "reward_std": 0.07216878235340118, + "rewards/accuracy_reward": 0.0625, + "rewards/format_reward": 0.0, + "step": 346 + }, + { + "completion_length": 988.8750305175781, + "epoch": 0.42317073170731706, + "grad_norm": 0.2754349410533905, + "kl": 0.02734375, + "learning_rate": 2.1425950500937493e-06, + "loss": 0.0076, + "reward": 0.0416666679084301, + "reward_std": 0.07216878235340118, + "rewards/accuracy_reward": 0.0416666679084301, + "rewards/format_reward": 0.0, + "step": 347 + }, + { + "completion_length": 675.8541870117188, + "epoch": 0.424390243902439, + "grad_norm": 1.2542449235916138, + "kl": 0.03582763671875, + "learning_rate": 2.1368195011177142e-06, + "loss": 0.0095, + "reward": 0.2291666716337204, + "reward_std": 0.03608439117670059, + "rewards/accuracy_reward": 0.2291666716337204, + "rewards/format_reward": 0.0, + "step": 348 + }, + { + "completion_length": 866.3750305175781, + "epoch": 0.42560975609756097, + "grad_norm": 0.4978950023651123, + "kl": 0.03076171875, + "learning_rate": 2.1310324122237512e-06, + "loss": 0.0125, + "reward": 0.06250000186264515, + "reward_std": 0.10825317353010178, + "rewards/accuracy_reward": 0.06250000186264515, + "rewards/format_reward": 0.0, + "step": 349 + }, + { + "completion_length": 871.4375305175781, + "epoch": 0.4268292682926829, + "grad_norm": 0.4244663417339325, + "kl": 0.02947998046875, + "learning_rate": 2.125233888280715e-06, + "loss": -0.0582, + "reward": 0.1041666679084301, + "reward_std": 0.14433756470680237, + "rewards/accuracy_reward": 0.1041666679084301, + "rewards/format_reward": 0.0, + "step": 350 + }, + { + "completion_length": 857.8333740234375, + "epoch": 0.42804878048780487, + "grad_norm": 0.5104000568389893, + "kl": 0.02020263671875, + "learning_rate": 2.1194240343646732e-06, + "loss": -0.0086, + "reward": 0.2500000149011612, + "reward_std": 0.07216878235340118, + "rewards/accuracy_reward": 0.2500000149011612, + "rewards/format_reward": 0.0, + "step": 351 + }, + { + "completion_length": 779.0625305175781, + "epoch": 0.4292682926829268, + "grad_norm": 0.2433476448059082, + "kl": 0.03094482421875, + "learning_rate": 2.11360295575701e-06, + "loss": 0.0085, + "reward": 0.10416666977107525, + "reward_std": 0.07216878235340118, + "rewards/accuracy_reward": 0.10416666977107525, + "rewards/format_reward": 0.0, + "step": 352 + }, + { + "completion_length": 1070.5625610351562, + "epoch": 0.4304878048780488, + "grad_norm": 0.24915798008441925, + "kl": 0.026123046875, + "learning_rate": 2.1077707579425114e-06, + "loss": 0.0376, + "reward": 0.14583333395421505, + "reward_std": 0.03608439117670059, + "rewards/accuracy_reward": 0.14583333395421505, + "rewards/format_reward": 0.0, + "step": 353 + }, + { + "completion_length": 953.8541870117188, + "epoch": 0.4317073170731707, + "grad_norm": 0.4156853258609772, + "kl": 0.0296630859375, + "learning_rate": 2.1019275466074585e-06, + "loss": 0.0097, + "reward": 0.0833333358168602, + "reward_std": 0.10825317353010178, + "rewards/accuracy_reward": 0.0833333358168602, + "rewards/format_reward": 0.0, + "step": 354 + }, + { + "completion_length": 1144.7916870117188, + "epoch": 0.4329268292682927, + "grad_norm": 0.2654878795146942, + "kl": 0.0277099609375, + "learning_rate": 2.0960734276377082e-06, + "loss": -0.0253, + "reward": 0.02083333395421505, + "reward_std": 0.03608439117670059, + "rewards/accuracy_reward": 0.02083333395421505, + "rewards/format_reward": 0.0, + "step": 355 + }, + { + "completion_length": 1072.1458740234375, + "epoch": 0.43414634146341463, + "grad_norm": 0.2536885440349579, + "kl": 0.05084228515625, + "learning_rate": 2.0902085071167774e-06, + "loss": -0.0073, + "reward": 0.02083333395421505, + "reward_std": 0.03608439117670059, + "rewards/accuracy_reward": 0.02083333395421505, + "rewards/format_reward": 0.0, + "step": 356 + }, + { + "completion_length": 804.7500305175781, + "epoch": 0.4353658536585366, + "grad_norm": 0.4465596675872803, + "kl": 0.032470703125, + "learning_rate": 2.0843328913239216e-06, + "loss": -0.097, + "reward": 0.2083333395421505, + "reward_std": 0.10825317353010178, + "rewards/accuracy_reward": 0.2083333395421505, + "rewards/format_reward": 0.0, + "step": 357 + }, + { + "completion_length": 990.1666870117188, + "epoch": 0.43658536585365854, + "grad_norm": 679.5493774414062, + "kl": 4.13946533203125, + "learning_rate": 2.0784466867322037e-06, + "loss": 0.0965, + "reward": 0.16666667722165585, + "reward_std": 0.07216878235340118, + "rewards/accuracy_reward": 0.16666667722165585, + "rewards/format_reward": 0.0, + "step": 358 + }, + { + "completion_length": 1086.2500610351562, + "epoch": 0.4378048780487805, + "grad_norm": 0.4612940549850464, + "kl": 0.02392578125, + "learning_rate": 2.0725500000065715e-06, + "loss": 0.0144, + "reward": 0.1666666716337204, + "reward_std": 0.14433756470680237, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 359 + }, + { + "completion_length": 880.9583435058594, + "epoch": 0.43902439024390244, + "grad_norm": 0.6914082765579224, + "kl": 0.0379638671875, + "learning_rate": 2.0666429380019185e-06, + "loss": 0.0259, + "reward": 0.08333333395421505, + "reward_std": 0.03608439117670059, + "rewards/accuracy_reward": 0.08333333395421505, + "rewards/format_reward": 0.0, + "step": 360 + }, + { + "completion_length": 758.5000305175781, + "epoch": 0.4402439024390244, + "grad_norm": 0.5643234252929688, + "kl": 0.038818359375, + "learning_rate": 2.060725607761153e-06, + "loss": -0.0063, + "reward": 0.1458333358168602, + "reward_std": 0.07216878235340118, + "rewards/accuracy_reward": 0.1458333358168602, + "rewards/format_reward": 0.0, + "step": 361 + }, + { + "completion_length": 842.0833435058594, + "epoch": 0.44146341463414634, + "grad_norm": 0.24680182337760925, + "kl": 0.0428466796875, + "learning_rate": 2.0547981165132547e-06, + "loss": 0.0014, + "reward": 0.0625, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0625, + "rewards/format_reward": 0.0, + "step": 362 + }, + { + "completion_length": 1108.1250610351562, + "epoch": 0.4426829268292683, + "grad_norm": 0.41155484318733215, + "kl": 0.03021240234375, + "learning_rate": 2.048860571671332e-06, + "loss": -0.028, + "reward": 0.1041666716337204, + "reward_std": 0.07216878235340118, + "rewards/accuracy_reward": 0.1041666716337204, + "rewards/format_reward": 0.0, + "step": 363 + }, + { + "completion_length": 865.8125305175781, + "epoch": 0.44390243902439025, + "grad_norm": 0.2589362859725952, + "kl": 0.0301513671875, + "learning_rate": 2.0429130808306767e-06, + "loss": 0.0356, + "reward": 0.16666667722165585, + "reward_std": 0.07216878235340118, + "rewards/accuracy_reward": 0.16666667722165585, + "rewards/format_reward": 0.0, + "step": 364 + }, + { + "completion_length": 799.2708435058594, + "epoch": 0.4451219512195122, + "grad_norm": 0.5328904390335083, + "kl": 0.031005859375, + "learning_rate": 2.036955751766815e-06, + "loss": -0.0409, + "reward": 0.2500000111758709, + "reward_std": 0.14433756470680237, + "rewards/accuracy_reward": 0.2500000111758709, + "rewards/format_reward": 0.0, + "step": 365 + }, + { + "completion_length": 863.5000305175781, + "epoch": 0.44634146341463415, + "grad_norm": 0.5315119624137878, + "kl": 0.03106689453125, + "learning_rate": 2.030988692433552e-06, + "loss": 0.0315, + "reward": 0.12500000558793545, + "reward_std": 0.18042195588350296, + "rewards/accuracy_reward": 0.12500000558793545, + "rewards/format_reward": 0.0, + "step": 366 + }, + { + "completion_length": 1099.1875305175781, + "epoch": 0.4475609756097561, + "grad_norm": 0.3300071358680725, + "kl": 0.030029296875, + "learning_rate": 2.0250120109610155e-06, + "loss": 0.0887, + "reward": 0.2083333395421505, + "reward_std": 0.07216878235340118, + "rewards/accuracy_reward": 0.2083333395421505, + "rewards/format_reward": 0.0, + "step": 367 + }, + { + "completion_length": 849.4583435058594, + "epoch": 0.44878048780487806, + "grad_norm": 0.30705785751342773, + "kl": 0.029052734375, + "learning_rate": 2.019025815653701e-06, + "loss": -0.0104, + "reward": 0.3333333432674408, + "reward_std": 0.07216878235340118, + "rewards/accuracy_reward": 0.3333333432674408, + "rewards/format_reward": 0.0, + "step": 368 + }, + { + "completion_length": 746.2916870117188, + "epoch": 0.45, + "grad_norm": 0.5189336538314819, + "kl": 0.032470703125, + "learning_rate": 2.0130302149885033e-06, + "loss": -0.0545, + "reward": 0.1041666679084301, + "reward_std": 0.14433756470680237, + "rewards/accuracy_reward": 0.1041666679084301, + "rewards/format_reward": 0.0, + "step": 369 + }, + { + "completion_length": 721.7291870117188, + "epoch": 0.45121951219512196, + "grad_norm": 0.21197453141212463, + "kl": 0.03759765625, + "learning_rate": 2.007025317612754e-06, + "loss": 0.0083, + "reward": 0.1458333358168602, + "reward_std": 0.03608439117670059, + "rewards/accuracy_reward": 0.1458333358168602, + "rewards/format_reward": 0.0, + "step": 370 + }, + { + "completion_length": 923.8541870117188, + "epoch": 0.4524390243902439, + "grad_norm": 0.3330663740634918, + "kl": 0.0364990234375, + "learning_rate": 2.001011232342253e-06, + "loss": -0.0072, + "reward": 0.229166679084301, + "reward_std": 0.07216878235340118, + "rewards/accuracy_reward": 0.229166679084301, + "rewards/format_reward": 0.0, + "step": 371 + }, + { + "completion_length": 1017.0833435058594, + "epoch": 0.45365853658536587, + "grad_norm": 0.5465441942214966, + "kl": 0.0428466796875, + "learning_rate": 1.994988068159294e-06, + "loss": 0.0243, + "reward": 0.2291666716337204, + "reward_std": 0.14433756470680237, + "rewards/accuracy_reward": 0.2291666716337204, + "rewards/format_reward": 0.0, + "step": 372 + }, + { + "completion_length": 813.5833740234375, + "epoch": 0.4548780487804878, + "grad_norm": 0.37892287969589233, + "kl": 0.03466796875, + "learning_rate": 1.9889559342106926e-06, + "loss": 0.0091, + "reward": 0.1666666716337204, + "reward_std": 0.14433756470680237, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 373 + }, + { + "completion_length": 818.5833740234375, + "epoch": 0.4560975609756098, + "grad_norm": 0.06201218068599701, + "kl": 0.031494140625, + "learning_rate": 1.9829149398058068e-06, + "loss": 0.0012, + "reward": 0.25, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 0.0, + "step": 374 + }, + { + "completion_length": 808.6041870117188, + "epoch": 0.4573170731707317, + "grad_norm": 0.6525385975837708, + "kl": 0.036376953125, + "learning_rate": 1.976865194414555e-06, + "loss": -0.0442, + "reward": 0.1666666716337204, + "reward_std": 0.18042195588350296, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 375 + }, + { + "completion_length": 652.3125305175781, + "epoch": 0.4585365853658537, + "grad_norm": 0.5023518800735474, + "kl": 0.0386962890625, + "learning_rate": 1.9708068076654364e-06, + "loss": -0.0344, + "reward": 0.1666666716337204, + "reward_std": 0.07216878235340118, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 376 + }, + { + "completion_length": 729.75, + "epoch": 0.45975609756097563, + "grad_norm": 0.23177191615104675, + "kl": 0.03369140625, + "learning_rate": 1.9647398893435394e-06, + "loss": 0.0079, + "reward": 0.2291666716337204, + "reward_std": 0.03608439117670059, + "rewards/accuracy_reward": 0.2291666716337204, + "rewards/format_reward": 0.0, + "step": 377 + }, + { + "completion_length": 752.1666870117188, + "epoch": 0.4609756097560976, + "grad_norm": 0.4666472375392914, + "kl": 0.051513671875, + "learning_rate": 1.9586645493885565e-06, + "loss": -0.0459, + "reward": 0.1666666716337204, + "reward_std": 0.10825317353010178, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 378 + }, + { + "completion_length": 664.6875305175781, + "epoch": 0.46219512195121953, + "grad_norm": 0.5903889536857605, + "kl": 0.0306396484375, + "learning_rate": 1.9525808978927886e-06, + "loss": 0.0618, + "reward": 0.1666666716337204, + "reward_std": 0.14433756470680237, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 379 + }, + { + "completion_length": 864.8541870117188, + "epoch": 0.4634146341463415, + "grad_norm": 0.34605127573013306, + "kl": 0.02838134765625, + "learning_rate": 1.946489045099152e-06, + "loss": 0.0032, + "reward": 0.06250000186264515, + "reward_std": 0.07216878235340118, + "rewards/accuracy_reward": 0.06250000186264515, + "rewards/format_reward": 0.0, + "step": 380 + }, + { + "completion_length": 747.9166870117188, + "epoch": 0.46463414634146344, + "grad_norm": 0.5324747562408447, + "kl": 0.032958984375, + "learning_rate": 1.94038910139918e-06, + "loss": 0.0287, + "reward": 0.2291666716337204, + "reward_std": 0.10825317353010178, + "rewards/accuracy_reward": 0.2291666716337204, + "rewards/format_reward": 0.0, + "step": 381 + }, + { + "completion_length": 593.4375, + "epoch": 0.4658536585365854, + "grad_norm": 0.550981879234314, + "kl": 0.033447265625, + "learning_rate": 1.934281177331023e-06, + "loss": 0.0041, + "reward": 0.1666666679084301, + "reward_std": 0.10825317353010178, + "rewards/accuracy_reward": 0.1666666679084301, + "rewards/format_reward": 0.0, + "step": 382 + }, + { + "completion_length": 740.3958435058594, + "epoch": 0.46707317073170734, + "grad_norm": 0.26112014055252075, + "kl": 0.03411865234375, + "learning_rate": 1.928165383577445e-06, + "loss": 0.0041, + "reward": 0.0416666679084301, + "reward_std": 0.07216878235340118, + "rewards/accuracy_reward": 0.0416666679084301, + "rewards/format_reward": 0.0, + "step": 383 + }, + { + "completion_length": 730.7291870117188, + "epoch": 0.4682926829268293, + "grad_norm": 0.6180046200752258, + "kl": 0.030029296875, + "learning_rate": 1.9220418309638175e-06, + "loss": -0.0243, + "reward": 0.1875, + "reward_std": 0.18042196333408356, + "rewards/accuracy_reward": 0.1875, + "rewards/format_reward": 0.0, + "step": 384 + }, + { + "completion_length": 766.4375, + "epoch": 0.4695121951219512, + "grad_norm": 0.6600415706634521, + "kl": 0.0411376953125, + "learning_rate": 1.915910630456112e-06, + "loss": 0.0005, + "reward": 0.1666666716337204, + "reward_std": 0.18042195588350296, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 385 + }, + { + "completion_length": 723.0208587646484, + "epoch": 0.47073170731707314, + "grad_norm": 0.6182783842086792, + "kl": 0.03955078125, + "learning_rate": 1.909771893158889e-06, + "loss": 0.0118, + "reward": 0.10416666977107525, + "reward_std": 0.14433756470680237, + "rewards/accuracy_reward": 0.10416666977107525, + "rewards/format_reward": 0.0, + "step": 386 + }, + { + "completion_length": 688.4583435058594, + "epoch": 0.4719512195121951, + "grad_norm": 0.42049577832221985, + "kl": 0.02972412109375, + "learning_rate": 1.9036257303132843e-06, + "loss": -0.0022, + "reward": 0.3750000223517418, + "reward_std": 0.07216878235340118, + "rewards/accuracy_reward": 0.3750000223517418, + "rewards/format_reward": 0.0, + "step": 387 + }, + { + "completion_length": 921.5625305175781, + "epoch": 0.47317073170731705, + "grad_norm": 0.39355793595314026, + "kl": 0.03167724609375, + "learning_rate": 1.8974722532949929e-06, + "loss": 0.0195, + "reward": 0.1875000111758709, + "reward_std": 0.10825317353010178, + "rewards/accuracy_reward": 0.1875000111758709, + "rewards/format_reward": 0.0, + "step": 388 + }, + { + "completion_length": 567.2291870117188, + "epoch": 0.474390243902439, + "grad_norm": 0.5436845421791077, + "kl": 0.0390625, + "learning_rate": 1.8913115736122519e-06, + "loss": 0.0301, + "reward": 0.2500000149011612, + "reward_std": 0.14433756470680237, + "rewards/accuracy_reward": 0.2500000149011612, + "rewards/format_reward": 0.0, + "step": 389 + }, + { + "completion_length": 614.9791870117188, + "epoch": 0.47560975609756095, + "grad_norm": 0.5892400741577148, + "kl": 0.03302001953125, + "learning_rate": 1.8851438029038191e-06, + "loss": 0.0559, + "reward": 0.2916666865348816, + "reward_std": 0.14433755725622177, + "rewards/accuracy_reward": 0.2916666865348816, + "rewards/format_reward": 0.0, + "step": 390 + }, + { + "completion_length": 885.6041870117188, + "epoch": 0.4768292682926829, + "grad_norm": 0.06690337508916855, + "kl": 0.032470703125, + "learning_rate": 1.8789690529369492e-06, + "loss": 0.0011, + "reward": 0.25, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 0.0, + "step": 391 + }, + { + "completion_length": 493.85418701171875, + "epoch": 0.47804878048780486, + "grad_norm": 0.6229822039604187, + "kl": 0.034912109375, + "learning_rate": 1.8727874356053706e-06, + "loss": -0.0013, + "reward": 0.2708333432674408, + "reward_std": 0.14433756470680237, + "rewards/accuracy_reward": 0.2708333432674408, + "rewards/format_reward": 0.0, + "step": 392 + }, + { + "completion_length": 755.2916870117188, + "epoch": 0.4792682926829268, + "grad_norm": 0.31777453422546387, + "kl": 0.036376953125, + "learning_rate": 1.8665990629272555e-06, + "loss": 0.0094, + "reward": 0.0833333358168602, + "reward_std": 0.07216878235340118, + "rewards/accuracy_reward": 0.0833333358168602, + "rewards/format_reward": 0.0, + "step": 393 + }, + { + "completion_length": 728.6250305175781, + "epoch": 0.48048780487804876, + "grad_norm": 0.4183621108531952, + "kl": 0.03448486328125, + "learning_rate": 1.8604040470431908e-06, + "loss": -0.0205, + "reward": 0.06250000186264515, + "reward_std": 0.07216878235340118, + "rewards/accuracy_reward": 0.06250000186264515, + "rewards/format_reward": 0.0, + "step": 394 + }, + { + "completion_length": 554.4583435058594, + "epoch": 0.4817073170731707, + "grad_norm": 0.5221788287162781, + "kl": 0.0399169921875, + "learning_rate": 1.8542025002141474e-06, + "loss": 0.0101, + "reward": 0.1875000111758709, + "reward_std": 0.10825317353010178, + "rewards/accuracy_reward": 0.1875000111758709, + "rewards/format_reward": 0.0, + "step": 395 + }, + { + "completion_length": 802.2291870117188, + "epoch": 0.48292682926829267, + "grad_norm": 0.22250708937644958, + "kl": 0.03021240234375, + "learning_rate": 1.8479945348194423e-06, + "loss": 0.0055, + "reward": 0.02083333395421505, + "reward_std": 0.03608439117670059, + "rewards/accuracy_reward": 0.02083333395421505, + "rewards/format_reward": 0.0, + "step": 396 + }, + { + "completion_length": 646.2916870117188, + "epoch": 0.4841463414634146, + "grad_norm": 0.3303127586841583, + "kl": 0.037109375, + "learning_rate": 1.8417802633547067e-06, + "loss": -0.0063, + "reward": 0.1666666679084301, + "reward_std": 0.07216878235340118, + "rewards/accuracy_reward": 0.1666666679084301, + "rewards/format_reward": 0.0, + "step": 397 + }, + { + "completion_length": 712.6666870117188, + "epoch": 0.4853658536585366, + "grad_norm": 0.6952998042106628, + "kl": 0.042236328125, + "learning_rate": 1.8355597984298435e-06, + "loss": -0.0255, + "reward": 0.14583333395421505, + "reward_std": 0.18042195588350296, + "rewards/accuracy_reward": 0.14583333395421505, + "rewards/format_reward": 0.0, + "step": 398 + }, + { + "completion_length": 680.8541870117188, + "epoch": 0.4865853658536585, + "grad_norm": 0.6096604466438293, + "kl": 0.03057861328125, + "learning_rate": 1.8293332527669897e-06, + "loss": 0.0025, + "reward": 0.1875000074505806, + "reward_std": 0.18042195588350296, + "rewards/accuracy_reward": 0.1875000074505806, + "rewards/format_reward": 0.0, + "step": 399 + }, + { + "completion_length": 697.625, + "epoch": 0.4878048780487805, + "grad_norm": 0.5263100266456604, + "kl": 0.0400390625, + "learning_rate": 1.823100739198472e-06, + "loss": 0.0056, + "reward": 0.1875, + "reward_std": 0.14433756470680237, + "rewards/accuracy_reward": 0.1875, + "rewards/format_reward": 0.0, + "step": 400 + }, + { + "completion_length": 717.2916870117188, + "epoch": 0.48902439024390243, + "grad_norm": 0.2937505841255188, + "kl": 0.0421142578125, + "learning_rate": 1.816862370664762e-06, + "loss": 0.0369, + "reward": 0.1875000074505806, + "reward_std": 0.07216878235340118, + "rewards/accuracy_reward": 0.1875000074505806, + "rewards/format_reward": 0.0, + "step": 401 + }, + { + "completion_length": 737.1250305175781, + "epoch": 0.4902439024390244, + "grad_norm": 0.5524131059646606, + "kl": 0.03460693359375, + "learning_rate": 1.8106182602124312e-06, + "loss": -0.0016, + "reward": 0.16666667722165585, + "reward_std": 0.07216878235340118, + "rewards/accuracy_reward": 0.16666667722165585, + "rewards/format_reward": 0.0, + "step": 402 + }, + { + "completion_length": 689.3541870117188, + "epoch": 0.49146341463414633, + "grad_norm": 0.6056103110313416, + "kl": 0.0382080078125, + "learning_rate": 1.8043685209921002e-06, + "loss": 0.0203, + "reward": 0.1458333358168602, + "reward_std": 0.18042195588350296, + "rewards/accuracy_reward": 0.1458333358168602, + "rewards/format_reward": 0.0, + "step": 403 + }, + { + "completion_length": 821.8958435058594, + "epoch": 0.4926829268292683, + "grad_norm": 0.770128607749939, + "kl": 0.035400390625, + "learning_rate": 1.7981132662563906e-06, + "loss": 0.0777, + "reward": 0.2500000149011612, + "reward_std": 0.21650634706020355, + "rewards/accuracy_reward": 0.2500000149011612, + "rewards/format_reward": 0.0, + "step": 404 + }, + { + "completion_length": 869.4166870117188, + "epoch": 0.49390243902439024, + "grad_norm": 0.3313486576080322, + "kl": 0.03070068359375, + "learning_rate": 1.7918526093578702e-06, + "loss": -0.0011, + "reward": 0.1666666716337204, + "reward_std": 0.07216878235340118, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 405 + }, + { + "completion_length": 905.875, + "epoch": 0.4951219512195122, + "grad_norm": 0.29705655574798584, + "kl": 0.0396728515625, + "learning_rate": 1.7855866637470027e-06, + "loss": -0.0047, + "reward": 0.14583333395421505, + "reward_std": 0.10825317353010178, + "rewards/accuracy_reward": 0.14583333395421505, + "rewards/format_reward": 0.0, + "step": 406 + }, + { + "completion_length": 667.7083740234375, + "epoch": 0.49634146341463414, + "grad_norm": 0.6838599443435669, + "kl": 0.0286865234375, + "learning_rate": 1.7793155429700868e-06, + "loss": -0.0007, + "reward": 0.2083333395421505, + "reward_std": 0.21650634706020355, + "rewards/accuracy_reward": 0.2083333395421505, + "rewards/format_reward": 0.0, + "step": 407 + }, + { + "completion_length": 877.5625, + "epoch": 0.4975609756097561, + "grad_norm": 0.3647572100162506, + "kl": 0.03082275390625, + "learning_rate": 1.7730393606672033e-06, + "loss": -0.0071, + "reward": 0.2291666716337204, + "reward_std": 0.10825317353010178, + "rewards/accuracy_reward": 0.2291666716337204, + "rewards/format_reward": 0.0, + "step": 408 + }, + { + "completion_length": 876.4791870117188, + "epoch": 0.49878048780487805, + "grad_norm": 0.46186333894729614, + "kl": 0.03302001953125, + "learning_rate": 1.7667582305701528e-06, + "loss": -0.0742, + "reward": 0.3333333432674408, + "reward_std": 0.10825317353010178, + "rewards/accuracy_reward": 0.3333333432674408, + "rewards/format_reward": 0.0, + "step": 409 + }, + { + "completion_length": 639.5833740234375, + "epoch": 0.5, + "grad_norm": 0.5355751514434814, + "kl": 0.0338134765625, + "learning_rate": 1.7604722665003958e-06, + "loss": -0.0021, + "reward": 0.2083333432674408, + "reward_std": 0.18042195588350296, + "rewards/accuracy_reward": 0.2083333432674408, + "rewards/format_reward": 0.0, + "step": 410 + }, + { + "completion_length": 850.5208740234375, + "epoch": 0.501219512195122, + "grad_norm": 0.4516288638114929, + "kl": 0.033935546875, + "learning_rate": 1.7541815823669903e-06, + "loss": 0.02, + "reward": 0.10416666977107525, + "reward_std": 0.07216878235340118, + "rewards/accuracy_reward": 0.10416666977107525, + "rewards/format_reward": 0.0, + "step": 411 + }, + { + "completion_length": 691.6250305175781, + "epoch": 0.5024390243902439, + "grad_norm": 0.4676379859447479, + "kl": 0.0311279296875, + "learning_rate": 1.7478862921645273e-06, + "loss": 0.0353, + "reward": 0.14583333395421505, + "reward_std": 0.14433756470680237, + "rewards/accuracy_reward": 0.14583333395421505, + "rewards/format_reward": 0.0, + "step": 412 + }, + { + "completion_length": 688.0, + "epoch": 0.5036585365853659, + "grad_norm": 0.4021396040916443, + "kl": 0.03631591796875, + "learning_rate": 1.7415865099710657e-06, + "loss": 0.0129, + "reward": 0.1250000037252903, + "reward_std": 0.07216878235340118, + "rewards/accuracy_reward": 0.1250000037252903, + "rewards/format_reward": 0.0, + "step": 413 + }, + { + "completion_length": 838.8541870117188, + "epoch": 0.5048780487804878, + "grad_norm": 0.3340761363506317, + "kl": 0.03106689453125, + "learning_rate": 1.735282349946064e-06, + "loss": 0.0196, + "reward": 0.10416666977107525, + "reward_std": 0.07216878235340118, + "rewards/accuracy_reward": 0.10416666977107525, + "rewards/format_reward": 0.0, + "step": 414 + }, + { + "completion_length": 763.1458740234375, + "epoch": 0.5060975609756098, + "grad_norm": 0.46428605914115906, + "kl": 0.03955078125, + "learning_rate": 1.7289739263283118e-06, + "loss": 0.0211, + "reward": 0.1041666679084301, + "reward_std": 0.07216878235340118, + "rewards/accuracy_reward": 0.1041666679084301, + "rewards/format_reward": 0.0, + "step": 415 + }, + { + "completion_length": 724.7291870117188, + "epoch": 0.5073170731707317, + "grad_norm": 0.42527034878730774, + "kl": 0.03302001953125, + "learning_rate": 1.7226613534338608e-06, + "loss": -0.0064, + "reward": 0.12500000558793545, + "reward_std": 0.07216878235340118, + "rewards/accuracy_reward": 0.12500000558793545, + "rewards/format_reward": 0.0, + "step": 416 + }, + { + "completion_length": 772.3541870117188, + "epoch": 0.5085365853658537, + "grad_norm": 0.16283953189849854, + "kl": 0.02685546875, + "learning_rate": 1.716344745653952e-06, + "loss": 0.0059, + "reward": 0.1666666716337204, + "reward_std": 0.03608439117670059, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 417 + }, + { + "completion_length": 788.2916870117188, + "epoch": 0.5097560975609756, + "grad_norm": 0.2448461353778839, + "kl": 0.02716064453125, + "learning_rate": 1.7100242174529439e-06, + "loss": 0.0199, + "reward": 0.1666666716337204, + "reward_std": 0.03608439117670059, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 418 + }, + { + "completion_length": 701.0625305175781, + "epoch": 0.5109756097560976, + "grad_norm": 0.544904351234436, + "kl": 0.03387451171875, + "learning_rate": 1.7036998833662359e-06, + "loss": -0.0098, + "reward": 0.1666666716337204, + "reward_std": 0.07216878235340118, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 419 + }, + { + "completion_length": 799.9166870117188, + "epoch": 0.5121951219512195, + "grad_norm": 0.06163305044174194, + "kl": 0.02587890625, + "learning_rate": 1.6973718579981973e-06, + "loss": 0.001, + "reward": 0.0625, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0625, + "rewards/format_reward": 0.0, + "step": 420 + }, + { + "completion_length": 742.2916870117188, + "epoch": 0.5134146341463415, + "grad_norm": 0.3775089979171753, + "kl": 0.03271484375, + "learning_rate": 1.6910402560200854e-06, + "loss": -0.0004, + "reward": 0.0625, + "reward_std": 0.07216878235340118, + "rewards/accuracy_reward": 0.0625, + "rewards/format_reward": 0.0, + "step": 421 + }, + { + "completion_length": 830.2083435058594, + "epoch": 0.5146341463414634, + "grad_norm": 0.3336365222930908, + "kl": 0.03155517578125, + "learning_rate": 1.6847051921679702e-06, + "loss": 0.0057, + "reward": 0.0416666679084301, + "reward_std": 0.07216878235340118, + "rewards/accuracy_reward": 0.0416666679084301, + "rewards/format_reward": 0.0, + "step": 422 + }, + { + "completion_length": 676.0416870117188, + "epoch": 0.5158536585365854, + "grad_norm": 0.493982195854187, + "kl": 0.0255126953125, + "learning_rate": 1.6783667812406569e-06, + "loss": 0.0064, + "reward": 0.2291666716337204, + "reward_std": 0.07216878235340118, + "rewards/accuracy_reward": 0.2291666716337204, + "rewards/format_reward": 0.0, + "step": 423 + }, + { + "completion_length": 816.1041870117188, + "epoch": 0.5170731707317073, + "grad_norm": 0.3415720462799072, + "kl": 0.02813720703125, + "learning_rate": 1.672025138097601e-06, + "loss": 0.0539, + "reward": 0.1041666716337204, + "reward_std": 0.07216878235340118, + "rewards/accuracy_reward": 0.1041666716337204, + "rewards/format_reward": 0.0, + "step": 424 + }, + { + "completion_length": 791.0625305175781, + "epoch": 0.5182926829268293, + "grad_norm": 0.756782591342926, + "kl": 0.02923583984375, + "learning_rate": 1.6656803776568307e-06, + "loss": 0.0526, + "reward": 0.2708333432674408, + "reward_std": 0.18042195588350296, + "rewards/accuracy_reward": 0.2708333432674408, + "rewards/format_reward": 0.0, + "step": 425 + }, + { + "completion_length": 754.1666870117188, + "epoch": 0.5195121951219512, + "grad_norm": 0.4986019432544708, + "kl": 0.0341796875, + "learning_rate": 1.6593326148928643e-06, + "loss": 0.001, + "reward": 0.02083333395421505, + "reward_std": 0.03608439117670059, + "rewards/accuracy_reward": 0.02083333395421505, + "rewards/format_reward": 0.0, + "step": 426 + }, + { + "completion_length": 776.0416870117188, + "epoch": 0.5207317073170732, + "grad_norm": 0.1987488865852356, + "kl": 0.02716064453125, + "learning_rate": 1.652981964834623e-06, + "loss": 0.0324, + "reward": 0.02083333395421505, + "reward_std": 0.03608439117670059, + "rewards/accuracy_reward": 0.02083333395421505, + "rewards/format_reward": 0.0, + "step": 427 + }, + { + "completion_length": 732.1041870117188, + "epoch": 0.5219512195121951, + "grad_norm": 0.05336523428559303, + "kl": 0.02783203125, + "learning_rate": 1.6466285425633527e-06, + "loss": 0.0011, + "reward": 0.1875, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.1875, + "rewards/format_reward": 0.0, + "step": 428 + }, + { + "completion_length": 686.9791870117188, + "epoch": 0.5231707317073171, + "grad_norm": 0.5836074948310852, + "kl": 0.02978515625, + "learning_rate": 1.6402724632105323e-06, + "loss": 0.0141, + "reward": 0.1875000074505806, + "reward_std": 0.10825317353010178, + "rewards/accuracy_reward": 0.1875000074505806, + "rewards/format_reward": 0.0, + "step": 429 + }, + { + "completion_length": 660.0625305175781, + "epoch": 0.524390243902439, + "grad_norm": 0.3314565122127533, + "kl": 0.03704833984375, + "learning_rate": 1.6339138419557916e-06, + "loss": 0.0029, + "reward": 0.02083333395421505, + "reward_std": 0.03608439117670059, + "rewards/accuracy_reward": 0.02083333395421505, + "rewards/format_reward": 0.0, + "step": 430 + }, + { + "completion_length": 806.9791870117188, + "epoch": 0.525609756097561, + "grad_norm": 0.3738638460636139, + "kl": 0.0345458984375, + "learning_rate": 1.6275527940248218e-06, + "loss": 0.0445, + "reward": 0.16666667722165585, + "reward_std": 0.07216878235340118, + "rewards/accuracy_reward": 0.16666667722165585, + "rewards/format_reward": 0.0, + "step": 431 + }, + { + "completion_length": 1050.1250305175781, + "epoch": 0.526829268292683, + "grad_norm": 0.4248029589653015, + "kl": 0.026611328125, + "learning_rate": 1.6211894346872887e-06, + "loss": -0.0202, + "reward": 0.2083333432674408, + "reward_std": 0.10825318098068237, + "rewards/accuracy_reward": 0.2083333432674408, + "rewards/format_reward": 0.0, + "step": 432 + }, + { + "completion_length": 839.0833435058594, + "epoch": 0.5280487804878049, + "grad_norm": 0.40769124031066895, + "kl": 0.0311279296875, + "learning_rate": 1.614823879254744e-06, + "loss": -0.0006, + "reward": 0.08333333395421505, + "reward_std": 0.10825317353010178, + "rewards/accuracy_reward": 0.08333333395421505, + "rewards/format_reward": 0.0, + "step": 433 + }, + { + "completion_length": 747.9166870117188, + "epoch": 0.5292682926829269, + "grad_norm": 0.4294043183326721, + "kl": 0.0472412109375, + "learning_rate": 1.6084562430785336e-06, + "loss": -0.0104, + "reward": 0.2083333432674408, + "reward_std": 0.07216878235340118, + "rewards/accuracy_reward": 0.2083333432674408, + "rewards/format_reward": 0.0, + "step": 434 + }, + { + "completion_length": 946.7708740234375, + "epoch": 0.5304878048780488, + "grad_norm": 0.3011494278907776, + "kl": 0.034912109375, + "learning_rate": 1.6020866415477108e-06, + "loss": -0.0333, + "reward": 0.08333333395421505, + "reward_std": 0.03608439117670059, + "rewards/accuracy_reward": 0.08333333395421505, + "rewards/format_reward": 0.0, + "step": 435 + }, + { + "completion_length": 1171.1458740234375, + "epoch": 0.5317073170731708, + "grad_norm": 0.45685434341430664, + "kl": 0.02496337890625, + "learning_rate": 1.5957151900869425e-06, + "loss": 0.0143, + "reward": 0.125, + "reward_std": 0.10825317353010178, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 0.0, + "step": 436 + }, + { + "completion_length": 731.5, + "epoch": 0.5329268292682927, + "grad_norm": 0.5969831943511963, + "kl": 0.03338623046875, + "learning_rate": 1.5893420041544193e-06, + "loss": -0.0248, + "reward": 0.1458333358168602, + "reward_std": 0.18042195588350296, + "rewards/accuracy_reward": 0.1458333358168602, + "rewards/format_reward": 0.0, + "step": 437 + }, + { + "completion_length": 736.6041870117188, + "epoch": 0.5341463414634147, + "grad_norm": 0.4960964322090149, + "kl": 0.02801513671875, + "learning_rate": 1.582967199239761e-06, + "loss": 0.081, + "reward": 0.1250000037252903, + "reward_std": 0.10825317353010178, + "rewards/accuracy_reward": 0.1250000037252903, + "rewards/format_reward": 0.0, + "step": 438 + }, + { + "completion_length": 662.6875305175781, + "epoch": 0.5353658536585366, + "grad_norm": 0.2524319291114807, + "kl": 0.02606201171875, + "learning_rate": 1.5765908908619258e-06, + "loss": 0.0336, + "reward": 0.2083333358168602, + "reward_std": 0.03608439117670059, + "rewards/accuracy_reward": 0.2083333358168602, + "rewards/format_reward": 0.0, + "step": 439 + }, + { + "completion_length": 758.0208435058594, + "epoch": 0.5365853658536586, + "grad_norm": 0.21499498188495636, + "kl": 0.0301513671875, + "learning_rate": 1.5702131945671182e-06, + "loss": -0.0047, + "reward": 0.0416666679084301, + "reward_std": 0.03608439117670059, + "rewards/accuracy_reward": 0.0416666679084301, + "rewards/format_reward": 0.0, + "step": 440 + }, + { + "completion_length": 931.4583435058594, + "epoch": 0.5378048780487805, + "grad_norm": 0.46516576409339905, + "kl": 0.032470703125, + "learning_rate": 1.5638342259266904e-06, + "loss": -0.0083, + "reward": 0.18750000558793545, + "reward_std": 0.07216878235340118, + "rewards/accuracy_reward": 0.18750000558793545, + "rewards/format_reward": 0.0, + "step": 441 + }, + { + "completion_length": 762.7708435058594, + "epoch": 0.5390243902439025, + "grad_norm": 0.6176576614379883, + "kl": 0.0413818359375, + "learning_rate": 1.5574541005350532e-06, + "loss": -0.0412, + "reward": 0.1875, + "reward_std": 0.10825318098068237, + "rewards/accuracy_reward": 0.1875, + "rewards/format_reward": 0.0, + "step": 442 + }, + { + "completion_length": 845.1666870117188, + "epoch": 0.5402439024390244, + "grad_norm": 0.36604827642440796, + "kl": 0.0323486328125, + "learning_rate": 1.5510729340075781e-06, + "loss": 0.0028, + "reward": 0.02083333395421505, + "reward_std": 0.03608439117670059, + "rewards/accuracy_reward": 0.02083333395421505, + "rewards/format_reward": 0.0, + "step": 443 + }, + { + "completion_length": 822.5208435058594, + "epoch": 0.5414634146341464, + "grad_norm": 0.4656050205230713, + "kl": 0.031494140625, + "learning_rate": 1.544690841978504e-06, + "loss": -0.0166, + "reward": 0.1875000111758709, + "reward_std": 0.10825317353010178, + "rewards/accuracy_reward": 0.1875000111758709, + "rewards/format_reward": 0.0, + "step": 444 + }, + { + "completion_length": 863.9791870117188, + "epoch": 0.5426829268292683, + "grad_norm": 0.5311189293861389, + "kl": 0.0374755859375, + "learning_rate": 1.5383079400988402e-06, + "loss": -0.0338, + "reward": 0.1458333358168602, + "reward_std": 0.10825317353010178, + "rewards/accuracy_reward": 0.1458333358168602, + "rewards/format_reward": 0.0, + "step": 445 + }, + { + "completion_length": 712.3125305175781, + "epoch": 0.5439024390243903, + "grad_norm": 0.5392478704452515, + "kl": 0.02532958984375, + "learning_rate": 1.5319243440342713e-06, + "loss": -0.0118, + "reward": 0.0833333358168602, + "reward_std": 0.10825317353010178, + "rewards/accuracy_reward": 0.0833333358168602, + "rewards/format_reward": 0.0, + "step": 446 + }, + { + "completion_length": 713.6666870117188, + "epoch": 0.5451219512195122, + "grad_norm": 0.6092529892921448, + "kl": 0.0330810546875, + "learning_rate": 1.5255401694630625e-06, + "loss": 0.0047, + "reward": 0.2916666716337204, + "reward_std": 0.14433756470680237, + "rewards/accuracy_reward": 0.2916666716337204, + "rewards/format_reward": 0.0, + "step": 447 + }, + { + "completion_length": 767.2083435058594, + "epoch": 0.5463414634146342, + "grad_norm": 0.20800291001796722, + "kl": 0.0302734375, + "learning_rate": 1.5191555320739608e-06, + "loss": 0.014, + "reward": 0.14583333395421505, + "reward_std": 0.03608439117670059, + "rewards/accuracy_reward": 0.14583333395421505, + "rewards/format_reward": 0.0, + "step": 448 + }, + { + "completion_length": 755.1875305175781, + "epoch": 0.5475609756097561, + "grad_norm": 0.605426549911499, + "kl": 0.0333251953125, + "learning_rate": 1.5127705475641014e-06, + "loss": -0.0052, + "reward": 0.3333333432674408, + "reward_std": 0.18042195588350296, + "rewards/accuracy_reward": 0.3333333432674408, + "rewards/format_reward": 0.0, + "step": 449 + }, + { + "completion_length": 782.125, + "epoch": 0.5487804878048781, + "grad_norm": 0.37740781903266907, + "kl": 0.02899169921875, + "learning_rate": 1.5063853316369081e-06, + "loss": 0.0065, + "reward": 0.02083333395421505, + "reward_std": 0.03608439117670059, + "rewards/accuracy_reward": 0.02083333395421505, + "rewards/format_reward": 0.0, + "step": 450 + }, + { + "completion_length": 802.1666870117188, + "epoch": 0.55, + "grad_norm": 0.07578609138727188, + "kl": 0.02288818359375, + "learning_rate": 1.5e-06, + "loss": 0.0009, + "reward": 0.125, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 0.0, + "step": 451 + }, + { + "completion_length": 790.3125, + "epoch": 0.551219512195122, + "grad_norm": 0.4336966872215271, + "kl": 0.02813720703125, + "learning_rate": 1.4936146683630921e-06, + "loss": 0.0157, + "reward": 0.14583333395421505, + "reward_std": 0.14433756470680237, + "rewards/accuracy_reward": 0.14583333395421505, + "rewards/format_reward": 0.0, + "step": 452 + }, + { + "completion_length": 895.9375, + "epoch": 0.552439024390244, + "grad_norm": 0.32502347230911255, + "kl": 0.02935791015625, + "learning_rate": 1.4872294524358989e-06, + "loss": -0.0093, + "reward": 0.0416666679084301, + "reward_std": 0.07216878235340118, + "rewards/accuracy_reward": 0.0416666679084301, + "rewards/format_reward": 0.0, + "step": 453 + }, + { + "completion_length": 1040.1667175292969, + "epoch": 0.5536585365853659, + "grad_norm": 0.14283445477485657, + "kl": 0.028076171875, + "learning_rate": 1.4808444679260396e-06, + "loss": 0.001, + "reward": 0.25, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 0.0, + "step": 454 + }, + { + "completion_length": 1108.2500610351562, + "epoch": 0.5548780487804879, + "grad_norm": 0.30400022864341736, + "kl": 0.03131103515625, + "learning_rate": 1.4744598305369376e-06, + "loss": 0.0327, + "reward": 0.06250000186264515, + "reward_std": 0.07216878235340118, + "rewards/accuracy_reward": 0.06250000186264515, + "rewards/format_reward": 0.0, + "step": 455 + }, + { + "completion_length": 726.125, + "epoch": 0.5560975609756098, + "grad_norm": 0.09212367236614227, + "kl": 0.0345458984375, + "learning_rate": 1.4680756559657292e-06, + "loss": 0.0013, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 456 + }, + { + "completion_length": 756.4166870117188, + "epoch": 0.5573170731707318, + "grad_norm": 0.49157455563545227, + "kl": 0.03619384765625, + "learning_rate": 1.4616920599011603e-06, + "loss": 0.0927, + "reward": 0.2083333358168602, + "reward_std": 0.14433756470680237, + "rewards/accuracy_reward": 0.2083333358168602, + "rewards/format_reward": 0.0, + "step": 457 + }, + { + "completion_length": 990.3333435058594, + "epoch": 0.5585365853658537, + "grad_norm": 0.4339282512664795, + "kl": 0.0247802734375, + "learning_rate": 1.4553091580214963e-06, + "loss": 0.0336, + "reward": 0.0416666679084301, + "reward_std": 0.03608439117670059, + "rewards/accuracy_reward": 0.0416666679084301, + "rewards/format_reward": 0.0, + "step": 458 + }, + { + "completion_length": 806.1666870117188, + "epoch": 0.5597560975609757, + "grad_norm": 0.5246623754501343, + "kl": 0.0361328125, + "learning_rate": 1.4489270659924222e-06, + "loss": -0.029, + "reward": 0.1250000037252903, + "reward_std": 0.10825317353010178, + "rewards/accuracy_reward": 0.1250000037252903, + "rewards/format_reward": 0.0, + "step": 459 + }, + { + "completion_length": 801.8958435058594, + "epoch": 0.5609756097560976, + "grad_norm": 0.4816710948944092, + "kl": 0.027099609375, + "learning_rate": 1.442545899464947e-06, + "loss": 0.0141, + "reward": 0.2291666716337204, + "reward_std": 0.10825317353010178, + "rewards/accuracy_reward": 0.2291666716337204, + "rewards/format_reward": 0.0, + "step": 460 + }, + { + "completion_length": 758.0833740234375, + "epoch": 0.5621951219512196, + "grad_norm": 0.20983240008354187, + "kl": 0.029296875, + "learning_rate": 1.4361657740733103e-06, + "loss": 0.0403, + "reward": 0.14583333395421505, + "reward_std": 0.03608439117670059, + "rewards/accuracy_reward": 0.14583333395421505, + "rewards/format_reward": 0.0, + "step": 461 + }, + { + "completion_length": 829.5, + "epoch": 0.5634146341463414, + "grad_norm": 0.4363538324832916, + "kl": 0.03021240234375, + "learning_rate": 1.429786805432882e-06, + "loss": 0.0002, + "reward": 0.3125000149011612, + "reward_std": 0.14433756470680237, + "rewards/accuracy_reward": 0.3125000149011612, + "rewards/format_reward": 0.0, + "step": 462 + }, + { + "completion_length": 1017.4166870117188, + "epoch": 0.5646341463414634, + "grad_norm": 0.0842226967215538, + "kl": 0.027587890625, + "learning_rate": 1.4234091091380743e-06, + "loss": 0.001, + "reward": 0.1875, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.1875, + "rewards/format_reward": 0.0, + "step": 463 + }, + { + "completion_length": 803.875, + "epoch": 0.5658536585365853, + "grad_norm": 0.18806934356689453, + "kl": 0.02984619140625, + "learning_rate": 1.4170328007602395e-06, + "loss": -0.0075, + "reward": 0.0416666679084301, + "reward_std": 0.03608439117670059, + "rewards/accuracy_reward": 0.0416666679084301, + "rewards/format_reward": 0.0, + "step": 464 + }, + { + "completion_length": 1089.1041870117188, + "epoch": 0.5670731707317073, + "grad_norm": 0.19883829355239868, + "kl": 0.02545166015625, + "learning_rate": 1.4106579958455812e-06, + "loss": 0.0119, + "reward": 0.0416666679084301, + "reward_std": 0.03608439117670059, + "rewards/accuracy_reward": 0.0416666679084301, + "rewards/format_reward": 0.0, + "step": 465 + }, + { + "completion_length": 893.5000305175781, + "epoch": 0.5682926829268292, + "grad_norm": 0.4463866353034973, + "kl": 0.0289306640625, + "learning_rate": 1.4042848099130574e-06, + "loss": 0.0065, + "reward": 0.1041666679084301, + "reward_std": 0.14433756470680237, + "rewards/accuracy_reward": 0.1041666679084301, + "rewards/format_reward": 0.0, + "step": 466 + }, + { + "completion_length": 734.5416870117188, + "epoch": 0.5695121951219512, + "grad_norm": 0.6711140275001526, + "kl": 0.03326416015625, + "learning_rate": 1.3979133584522893e-06, + "loss": 0.0101, + "reward": 0.2083333432674408, + "reward_std": 0.14433756470680237, + "rewards/accuracy_reward": 0.2083333432674408, + "rewards/format_reward": 0.0, + "step": 467 + }, + { + "completion_length": 709.7083435058594, + "epoch": 0.5707317073170731, + "grad_norm": 0.7721737623214722, + "kl": 0.02752685546875, + "learning_rate": 1.391543756921467e-06, + "loss": -0.0311, + "reward": 0.16666667722165585, + "reward_std": 0.14433756470680237, + "rewards/accuracy_reward": 0.16666667722165585, + "rewards/format_reward": 0.0, + "step": 468 + }, + { + "completion_length": 895.5625305175781, + "epoch": 0.5719512195121951, + "grad_norm": 0.05096851661801338, + "kl": 0.02825927734375, + "learning_rate": 1.3851761207452565e-06, + "loss": 0.001, + "reward": 0.125, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 0.0, + "step": 469 + }, + { + "completion_length": 853.4583435058594, + "epoch": 0.573170731707317, + "grad_norm": 0.4231189787387848, + "kl": 0.0272216796875, + "learning_rate": 1.3788105653127118e-06, + "loss": 0.0083, + "reward": 0.06250000186264515, + "reward_std": 0.07216878235340118, + "rewards/accuracy_reward": 0.06250000186264515, + "rewards/format_reward": 0.0, + "step": 470 + }, + { + "completion_length": 651.0833435058594, + "epoch": 0.574390243902439, + "grad_norm": 0.3461414575576782, + "kl": 0.0380859375, + "learning_rate": 1.3724472059751785e-06, + "loss": 0.0157, + "reward": 0.2083333358168602, + "reward_std": 0.03608439117670059, + "rewards/accuracy_reward": 0.2083333358168602, + "rewards/format_reward": 0.0, + "step": 471 + }, + { + "completion_length": 867.8958435058594, + "epoch": 0.5756097560975609, + "grad_norm": 0.05793582275509834, + "kl": 0.02783203125, + "learning_rate": 1.3660861580442087e-06, + "loss": 0.001, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 472 + }, + { + "completion_length": 875.6875, + "epoch": 0.5768292682926829, + "grad_norm": 0.5400838851928711, + "kl": 0.02728271484375, + "learning_rate": 1.3597275367894676e-06, + "loss": -0.0105, + "reward": 0.125, + "reward_std": 0.07216878235340118, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 0.0, + "step": 473 + }, + { + "completion_length": 818.2500610351562, + "epoch": 0.5780487804878048, + "grad_norm": 0.7908319234848022, + "kl": 0.03375244140625, + "learning_rate": 1.3533714574366473e-06, + "loss": 0.0058, + "reward": 0.1458333395421505, + "reward_std": 0.10825317353010178, + "rewards/accuracy_reward": 0.1458333395421505, + "rewards/format_reward": 0.0, + "step": 474 + }, + { + "completion_length": 815.2500305175781, + "epoch": 0.5792682926829268, + "grad_norm": 0.5779252052307129, + "kl": 0.0318603515625, + "learning_rate": 1.3470180351653773e-06, + "loss": 0.0174, + "reward": 0.10416666977107525, + "reward_std": 0.14433756470680237, + "rewards/accuracy_reward": 0.10416666977107525, + "rewards/format_reward": 0.0, + "step": 475 + }, + { + "completion_length": 864.2916870117188, + "epoch": 0.5804878048780487, + "grad_norm": 0.3415527045726776, + "kl": 0.02471923828125, + "learning_rate": 1.3406673851071362e-06, + "loss": 0.0053, + "reward": 0.06250000186264515, + "reward_std": 0.07216878235340118, + "rewards/accuracy_reward": 0.06250000186264515, + "rewards/format_reward": 0.0, + "step": 476 + }, + { + "completion_length": 960.3333435058594, + "epoch": 0.5817073170731707, + "grad_norm": 0.29808786511421204, + "kl": 0.0283203125, + "learning_rate": 1.3343196223431698e-06, + "loss": 0.0058, + "reward": 0.2708333432674408, + "reward_std": 0.10825317353010178, + "rewards/accuracy_reward": 0.2708333432674408, + "rewards/format_reward": 0.0, + "step": 477 + }, + { + "completion_length": 946.3333740234375, + "epoch": 0.5829268292682926, + "grad_norm": 0.35267508029937744, + "kl": 0.0269775390625, + "learning_rate": 1.3279748619023995e-06, + "loss": 0.0228, + "reward": 0.1458333358168602, + "reward_std": 0.10825317353010178, + "rewards/accuracy_reward": 0.1458333358168602, + "rewards/format_reward": 0.0, + "step": 478 + }, + { + "completion_length": 922.4375305175781, + "epoch": 0.5841463414634146, + "grad_norm": 0.29554396867752075, + "kl": 0.02947998046875, + "learning_rate": 1.3216332187593434e-06, + "loss": -0.0132, + "reward": 0.1458333358168602, + "reward_std": 0.03608439117670059, + "rewards/accuracy_reward": 0.1458333358168602, + "rewards/format_reward": 0.0, + "step": 479 + }, + { + "completion_length": 858.0833435058594, + "epoch": 0.5853658536585366, + "grad_norm": 0.39898625016212463, + "kl": 0.03106689453125, + "learning_rate": 1.3152948078320297e-06, + "loss": -0.0085, + "reward": 0.02083333395421505, + "reward_std": 0.03608439117670059, + "rewards/accuracy_reward": 0.02083333395421505, + "rewards/format_reward": 0.0, + "step": 480 + }, + { + "completion_length": 581.6041717529297, + "epoch": 0.5865853658536585, + "grad_norm": 0.7595959305763245, + "kl": 0.0352783203125, + "learning_rate": 1.3089597439799151e-06, + "loss": -0.0163, + "reward": 0.3541666716337204, + "reward_std": 0.14433756470680237, + "rewards/accuracy_reward": 0.3541666716337204, + "rewards/format_reward": 0.0, + "step": 481 + }, + { + "completion_length": 935.5417175292969, + "epoch": 0.5878048780487805, + "grad_norm": 0.4653733968734741, + "kl": 0.04296875, + "learning_rate": 1.3026281420018034e-06, + "loss": -0.0067, + "reward": 0.1666666679084301, + "reward_std": 0.14433756470680237, + "rewards/accuracy_reward": 0.1666666679084301, + "rewards/format_reward": 0.0, + "step": 482 + }, + { + "completion_length": 888.1041870117188, + "epoch": 0.5890243902439024, + "grad_norm": 0.18640004098415375, + "kl": 0.034423828125, + "learning_rate": 1.2963001166337642e-06, + "loss": 0.0006, + "reward": 0.12500000558793545, + "reward_std": 0.07216878235340118, + "rewards/accuracy_reward": 0.12500000558793545, + "rewards/format_reward": 0.0, + "step": 483 + }, + { + "completion_length": 898.3542175292969, + "epoch": 0.5902439024390244, + "grad_norm": 0.6631487607955933, + "kl": 0.02813720703125, + "learning_rate": 1.2899757825470568e-06, + "loss": -0.0036, + "reward": 0.14583333395421505, + "reward_std": 0.14433756470680237, + "rewards/accuracy_reward": 0.14583333395421505, + "rewards/format_reward": 0.0, + "step": 484 + }, + { + "completion_length": 723.5208435058594, + "epoch": 0.5914634146341463, + "grad_norm": 0.36477863788604736, + "kl": 0.02996826171875, + "learning_rate": 1.283655254346048e-06, + "loss": -0.0048, + "reward": 0.08333333395421505, + "reward_std": 0.03608439117670059, + "rewards/accuracy_reward": 0.08333333395421505, + "rewards/format_reward": 0.0, + "step": 485 + }, + { + "completion_length": 887.1875305175781, + "epoch": 0.5926829268292683, + "grad_norm": 0.4081045389175415, + "kl": 0.05609130859375, + "learning_rate": 1.2773386465661395e-06, + "loss": 0.0024, + "reward": 0.0833333358168602, + "reward_std": 0.03608439117670059, + "rewards/accuracy_reward": 0.0833333358168602, + "rewards/format_reward": 0.0, + "step": 486 + }, + { + "completion_length": 802.3333435058594, + "epoch": 0.5939024390243902, + "grad_norm": 0.25304004549980164, + "kl": 0.028076171875, + "learning_rate": 1.2710260736716882e-06, + "loss": -0.0011, + "reward": 0.02083333395421505, + "reward_std": 0.03608439117670059, + "rewards/accuracy_reward": 0.02083333395421505, + "rewards/format_reward": 0.0, + "step": 487 + }, + { + "completion_length": 902.0833435058594, + "epoch": 0.5951219512195122, + "grad_norm": 0.3382227122783661, + "kl": 0.02691650390625, + "learning_rate": 1.264717650053936e-06, + "loss": 0.0269, + "reward": 0.0416666679084301, + "reward_std": 0.07216878235340118, + "rewards/accuracy_reward": 0.0416666679084301, + "rewards/format_reward": 0.0, + "step": 488 + }, + { + "completion_length": 964.2291870117188, + "epoch": 0.5963414634146341, + "grad_norm": 0.5060334205627441, + "kl": 0.02862548828125, + "learning_rate": 1.2584134900289346e-06, + "loss": -0.0156, + "reward": 0.1666666716337204, + "reward_std": 0.14433756470680237, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 489 + }, + { + "completion_length": 1043.3958740234375, + "epoch": 0.5975609756097561, + "grad_norm": 0.2051764875650406, + "kl": 0.0284423828125, + "learning_rate": 1.2521137078354728e-06, + "loss": 0.0004, + "reward": 0.1666666716337204, + "reward_std": 0.03608439117670059, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 490 + }, + { + "completion_length": 929.3333740234375, + "epoch": 0.598780487804878, + "grad_norm": 0.4943280518054962, + "kl": 0.02301025390625, + "learning_rate": 1.2458184176330102e-06, + "loss": 0.0281, + "reward": 0.1875, + "reward_std": 0.10825317353010178, + "rewards/accuracy_reward": 0.1875, + "rewards/format_reward": 0.0, + "step": 491 + }, + { + "completion_length": 832.5833740234375, + "epoch": 0.6, + "grad_norm": 0.11139194667339325, + "kl": 0.02972412109375, + "learning_rate": 1.2395277334996047e-06, + "loss": 0.001, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 492 + }, + { + "completion_length": 745.7708435058594, + "epoch": 0.6012195121951219, + "grad_norm": 0.04622248560190201, + "kl": 0.0257568359375, + "learning_rate": 1.2332417694298477e-06, + "loss": 0.0008, + "reward": 0.3125, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 0.0, + "step": 493 + }, + { + "completion_length": 866.1458740234375, + "epoch": 0.6024390243902439, + "grad_norm": 0.06395512074232101, + "kl": 0.0565185546875, + "learning_rate": 1.2269606393327968e-06, + "loss": 0.0012, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 494 + }, + { + "completion_length": 779.8958740234375, + "epoch": 0.6036585365853658, + "grad_norm": 0.5273105502128601, + "kl": 0.03179931640625, + "learning_rate": 1.2206844570299133e-06, + "loss": -0.112, + "reward": 0.0833333358168602, + "reward_std": 0.10825317353010178, + "rewards/accuracy_reward": 0.0833333358168602, + "rewards/format_reward": 0.0, + "step": 495 + }, + { + "completion_length": 780.1458740234375, + "epoch": 0.6048780487804878, + "grad_norm": 0.4124651849269867, + "kl": 0.025634765625, + "learning_rate": 1.2144133362529974e-06, + "loss": -0.0126, + "reward": 0.2708333358168602, + "reward_std": 0.10825317353010178, + "rewards/accuracy_reward": 0.2708333358168602, + "rewards/format_reward": 0.0, + "step": 496 + }, + { + "completion_length": 829.9791870117188, + "epoch": 0.6060975609756097, + "grad_norm": 0.7791106700897217, + "kl": 0.03582763671875, + "learning_rate": 1.2081473906421298e-06, + "loss": 0.0441, + "reward": 0.2291666716337204, + "reward_std": 0.14433756470680237, + "rewards/accuracy_reward": 0.2291666716337204, + "rewards/format_reward": 0.0, + "step": 497 + }, + { + "completion_length": 690.7500305175781, + "epoch": 0.6073170731707317, + "grad_norm": 0.5013418793678284, + "kl": 0.03131103515625, + "learning_rate": 1.20188673374361e-06, + "loss": 0.0256, + "reward": 0.1875, + "reward_std": 0.14433756470680237, + "rewards/accuracy_reward": 0.1875, + "rewards/format_reward": 0.0, + "step": 498 + }, + { + "completion_length": 727.9375305175781, + "epoch": 0.6085365853658536, + "grad_norm": 0.5570080280303955, + "kl": 0.02862548828125, + "learning_rate": 1.1956314790078998e-06, + "loss": -0.0023, + "reward": 0.2708333432674408, + "reward_std": 0.18042195588350296, + "rewards/accuracy_reward": 0.2708333432674408, + "rewards/format_reward": 0.0, + "step": 499 + }, + { + "completion_length": 867.0416870117188, + "epoch": 0.6097560975609756, + "grad_norm": 0.04908730089664459, + "kl": 0.02886962890625, + "learning_rate": 1.189381739787569e-06, + "loss": 0.0009, + "reward": 0.125, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 0.0, + "step": 500 + }, + { + "completion_length": 783.8333740234375, + "epoch": 0.6109756097560975, + "grad_norm": 0.3778320252895355, + "kl": 0.02886962890625, + "learning_rate": 1.1831376293352378e-06, + "loss": 0.0196, + "reward": 0.1875000074505806, + "reward_std": 0.10825317353010178, + "rewards/accuracy_reward": 0.1875000074505806, + "rewards/format_reward": 0.0, + "step": 501 + }, + { + "completion_length": 792.1250305175781, + "epoch": 0.6121951219512195, + "grad_norm": 1.6423802375793457, + "kl": 0.05072021484375, + "learning_rate": 1.176899260801528e-06, + "loss": -0.0162, + "reward": 0.2083333358168602, + "reward_std": 0.14433756470680237, + "rewards/accuracy_reward": 0.2083333358168602, + "rewards/format_reward": 0.0, + "step": 502 + }, + { + "completion_length": 828.8333740234375, + "epoch": 0.6134146341463415, + "grad_norm": 0.5353675484657288, + "kl": 0.0289306640625, + "learning_rate": 1.1706667472330101e-06, + "loss": -0.0059, + "reward": 0.1875000074505806, + "reward_std": 0.10825317353010178, + "rewards/accuracy_reward": 0.1875000074505806, + "rewards/format_reward": 0.0, + "step": 503 + }, + { + "completion_length": 709.5000305175781, + "epoch": 0.6146341463414634, + "grad_norm": 0.4470565915107727, + "kl": 0.026123046875, + "learning_rate": 1.1644402015701568e-06, + "loss": 0.0265, + "reward": 0.1666666679084301, + "reward_std": 0.14433756470680237, + "rewards/accuracy_reward": 0.1666666679084301, + "rewards/format_reward": 0.0, + "step": 504 + }, + { + "completion_length": 827.3541870117188, + "epoch": 0.6158536585365854, + "grad_norm": 0.5625233054161072, + "kl": 0.0302734375, + "learning_rate": 1.158219736645294e-06, + "loss": 0.0489, + "reward": 0.0833333358168602, + "reward_std": 0.14433757960796356, + "rewards/accuracy_reward": 0.0833333358168602, + "rewards/format_reward": 0.0, + "step": 505 + }, + { + "completion_length": 803.9166870117188, + "epoch": 0.6170731707317073, + "grad_norm": 0.3888726532459259, + "kl": 0.02508544921875, + "learning_rate": 1.152005465180558e-06, + "loss": 0.0052, + "reward": 0.229166679084301, + "reward_std": 0.07216878235340118, + "rewards/accuracy_reward": 0.229166679084301, + "rewards/format_reward": 0.0, + "step": 506 + }, + { + "completion_length": 900.4791870117188, + "epoch": 0.6182926829268293, + "grad_norm": 0.3920578956604004, + "kl": 0.0260009765625, + "learning_rate": 1.145797499785853e-06, + "loss": -0.0216, + "reward": 0.1458333358168602, + "reward_std": 0.10825317353010178, + "rewards/accuracy_reward": 0.1458333358168602, + "rewards/format_reward": 0.0, + "step": 507 + }, + { + "completion_length": 784.7291870117188, + "epoch": 0.6195121951219512, + "grad_norm": 0.4152125418186188, + "kl": 0.02838134765625, + "learning_rate": 1.1395959529568088e-06, + "loss": -0.0235, + "reward": 0.08333333395421505, + "reward_std": 0.03608439117670059, + "rewards/accuracy_reward": 0.08333333395421505, + "rewards/format_reward": 0.0, + "step": 508 + }, + { + "completion_length": 699.4583435058594, + "epoch": 0.6207317073170732, + "grad_norm": 0.3461558520793915, + "kl": 0.0263671875, + "learning_rate": 1.1334009370727446e-06, + "loss": 0.0797, + "reward": 0.2708333358168602, + "reward_std": 0.07216878235340118, + "rewards/accuracy_reward": 0.2708333358168602, + "rewards/format_reward": 0.0, + "step": 509 + }, + { + "completion_length": 560.3125, + "epoch": 0.6219512195121951, + "grad_norm": 0.6555963754653931, + "kl": 0.02825927734375, + "learning_rate": 1.127212564394629e-06, + "loss": 0.0049, + "reward": 0.375, + "reward_std": 0.18042196333408356, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 0.0, + "step": 510 + }, + { + "completion_length": 892.4166870117188, + "epoch": 0.6231707317073171, + "grad_norm": 0.4940139055252075, + "kl": 0.02618408203125, + "learning_rate": 1.1210309470630509e-06, + "loss": 0.1071, + "reward": 0.1875000111758709, + "reward_std": 0.14433756470680237, + "rewards/accuracy_reward": 0.1875000111758709, + "rewards/format_reward": 0.0, + "step": 511 + }, + { + "completion_length": 854.7083435058594, + "epoch": 0.624390243902439, + "grad_norm": 0.5197833776473999, + "kl": 0.0255126953125, + "learning_rate": 1.1148561970961818e-06, + "loss": -0.0257, + "reward": 0.1458333358168602, + "reward_std": 0.10825317353010178, + "rewards/accuracy_reward": 0.1458333358168602, + "rewards/format_reward": 0.0, + "step": 512 + }, + { + "completion_length": 963.0833740234375, + "epoch": 0.625609756097561, + "grad_norm": 0.26763102412223816, + "kl": 0.0277099609375, + "learning_rate": 1.1086884263877486e-06, + "loss": 0.0028, + "reward": 0.10416666977107525, + "reward_std": 0.07216878235340118, + "rewards/accuracy_reward": 0.10416666977107525, + "rewards/format_reward": 0.0, + "step": 513 + }, + { + "completion_length": 795.7917175292969, + "epoch": 0.6268292682926829, + "grad_norm": 0.04478263109922409, + "kl": 0.03985595703125, + "learning_rate": 1.1025277467050079e-06, + "loss": 0.001, + "reward": 0.0625, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0625, + "rewards/format_reward": 0.0, + "step": 514 + }, + { + "completion_length": 962.8125305175781, + "epoch": 0.6280487804878049, + "grad_norm": 0.3781687915325165, + "kl": 0.03082275390625, + "learning_rate": 1.0963742696867162e-06, + "loss": 0.0034, + "reward": 0.0833333358168602, + "reward_std": 0.03608439117670059, + "rewards/accuracy_reward": 0.0833333358168602, + "rewards/format_reward": 0.0, + "step": 515 + }, + { + "completion_length": 823.6250305175781, + "epoch": 0.6292682926829268, + "grad_norm": 0.44658133387565613, + "kl": 0.031494140625, + "learning_rate": 1.0902281068411114e-06, + "loss": -0.0129, + "reward": 0.2500000074505806, + "reward_std": 0.10825317353010178, + "rewards/accuracy_reward": 0.2500000074505806, + "rewards/format_reward": 0.0, + "step": 516 + }, + { + "completion_length": 748.2916870117188, + "epoch": 0.6304878048780488, + "grad_norm": 0.44513779878616333, + "kl": 0.0255126953125, + "learning_rate": 1.084089369543888e-06, + "loss": 0.0591, + "reward": 0.1041666716337204, + "reward_std": 0.07216878235340118, + "rewards/accuracy_reward": 0.1041666716337204, + "rewards/format_reward": 0.0, + "step": 517 + }, + { + "completion_length": 778.8333740234375, + "epoch": 0.6317073170731707, + "grad_norm": 0.35178038477897644, + "kl": 0.030029296875, + "learning_rate": 1.077958169036183e-06, + "loss": -0.0142, + "reward": 0.0416666679084301, + "reward_std": 0.07216878235340118, + "rewards/accuracy_reward": 0.0416666679084301, + "rewards/format_reward": 0.0, + "step": 518 + }, + { + "completion_length": 889.2708435058594, + "epoch": 0.6329268292682927, + "grad_norm": 0.26045531034469604, + "kl": 0.02838134765625, + "learning_rate": 1.0718346164225556e-06, + "loss": 0.0006, + "reward": 0.2083333358168602, + "reward_std": 0.03608439117670059, + "rewards/accuracy_reward": 0.2083333358168602, + "rewards/format_reward": 0.0, + "step": 519 + }, + { + "completion_length": 910.5416870117188, + "epoch": 0.6341463414634146, + "grad_norm": 0.47047415375709534, + "kl": 0.02703857421875, + "learning_rate": 1.0657188226689772e-06, + "loss": 0.047, + "reward": 0.10416666977107525, + "reward_std": 0.14433756470680237, + "rewards/accuracy_reward": 0.10416666977107525, + "rewards/format_reward": 0.0, + "step": 520 + }, + { + "completion_length": 841.9792175292969, + "epoch": 0.6353658536585366, + "grad_norm": 0.2454436719417572, + "kl": 0.02545166015625, + "learning_rate": 1.0596108986008203e-06, + "loss": 0.0034, + "reward": 0.2708333432674408, + "reward_std": 0.03608439117670059, + "rewards/accuracy_reward": 0.2708333432674408, + "rewards/format_reward": 0.0, + "step": 521 + }, + { + "completion_length": 791.6250305175781, + "epoch": 0.6365853658536585, + "grad_norm": 0.08019955456256866, + "kl": 0.02874755859375, + "learning_rate": 1.0535109549008482e-06, + "loss": 0.0011, + "reward": 0.0625, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0625, + "rewards/format_reward": 0.0, + "step": 522 + }, + { + "completion_length": 808.9791870117188, + "epoch": 0.6378048780487805, + "grad_norm": 0.3111408054828644, + "kl": 0.03118896484375, + "learning_rate": 1.0474191021072117e-06, + "loss": -0.0016, + "reward": 0.0833333358168602, + "reward_std": 0.03608439117670059, + "rewards/accuracy_reward": 0.0833333358168602, + "rewards/format_reward": 0.0, + "step": 523 + }, + { + "completion_length": 816.1458435058594, + "epoch": 0.6390243902439025, + "grad_norm": 0.4471191167831421, + "kl": 0.02532958984375, + "learning_rate": 1.0413354506114434e-06, + "loss": -0.0062, + "reward": 0.1041666679084301, + "reward_std": 0.10825317353010178, + "rewards/accuracy_reward": 0.1041666679084301, + "rewards/format_reward": 0.0, + "step": 524 + }, + { + "completion_length": 682.3750305175781, + "epoch": 0.6402439024390244, + "grad_norm": 0.44450777769088745, + "kl": 0.025634765625, + "learning_rate": 1.0352601106564607e-06, + "loss": 0.0312, + "reward": 0.20833333395421505, + "reward_std": 0.10825317353010178, + "rewards/accuracy_reward": 0.20833333395421505, + "rewards/format_reward": 0.0, + "step": 525 + }, + { + "completion_length": 646.6875305175781, + "epoch": 0.6414634146341464, + "grad_norm": 0.5126345157623291, + "kl": 0.0330810546875, + "learning_rate": 1.0291931923345635e-06, + "loss": 0.0703, + "reward": 0.1666666679084301, + "reward_std": 0.14433756470680237, + "rewards/accuracy_reward": 0.1666666679084301, + "rewards/format_reward": 0.0, + "step": 526 + }, + { + "completion_length": 702.7916870117188, + "epoch": 0.6426829268292683, + "grad_norm": 0.5051405429840088, + "kl": 0.02252197265625, + "learning_rate": 1.0231348055854452e-06, + "loss": 0.0099, + "reward": 0.2083333395421505, + "reward_std": 0.10825317353010178, + "rewards/accuracy_reward": 0.2083333395421505, + "rewards/format_reward": 0.0, + "step": 527 + }, + { + "completion_length": 879.0625, + "epoch": 0.6439024390243903, + "grad_norm": 0.31973937153816223, + "kl": 0.03155517578125, + "learning_rate": 1.0170850601941937e-06, + "loss": -0.0368, + "reward": 0.2291666716337204, + "reward_std": 0.07216878235340118, + "rewards/accuracy_reward": 0.2291666716337204, + "rewards/format_reward": 0.0, + "step": 528 + }, + { + "completion_length": 880.9791870117188, + "epoch": 0.6451219512195122, + "grad_norm": 0.25314292311668396, + "kl": 0.0260009765625, + "learning_rate": 1.0110440657893074e-06, + "loss": -0.008, + "reward": 0.3541666716337204, + "reward_std": 0.03608439117670059, + "rewards/accuracy_reward": 0.3541666716337204, + "rewards/format_reward": 0.0, + "step": 529 + }, + { + "completion_length": 1073.8750610351562, + "epoch": 0.6463414634146342, + "grad_norm": 0.4375230669975281, + "kl": 0.02862548828125, + "learning_rate": 1.0050119318407061e-06, + "loss": -0.0044, + "reward": 0.1250000037252903, + "reward_std": 0.10825317353010178, + "rewards/accuracy_reward": 0.1250000037252903, + "rewards/format_reward": 0.0, + "step": 530 + }, + { + "completion_length": 703.3750305175781, + "epoch": 0.6475609756097561, + "grad_norm": 0.4382186233997345, + "kl": 0.0386962890625, + "learning_rate": 9.98988767657747e-07, + "loss": 0.0117, + "reward": 0.10416666977107525, + "reward_std": 0.07216878235340118, + "rewards/accuracy_reward": 0.10416666977107525, + "rewards/format_reward": 0.0, + "step": 531 + }, + { + "completion_length": 947.1041870117188, + "epoch": 0.6487804878048781, + "grad_norm": 0.3478910028934479, + "kl": 0.02777099609375, + "learning_rate": 9.929746823872462e-07, + "loss": 0.0117, + "reward": 0.1666666716337204, + "reward_std": 0.07216878235340118, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 532 + }, + { + "completion_length": 968.8333740234375, + "epoch": 0.65, + "grad_norm": 0.05612090975046158, + "kl": 0.03887939453125, + "learning_rate": 9.86969785011497e-07, + "loss": 0.001, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 533 + }, + { + "completion_length": 907.7500305175781, + "epoch": 0.651219512195122, + "grad_norm": 0.5268975496292114, + "kl": 0.02972412109375, + "learning_rate": 9.809741843462994e-07, + "loss": 0.0468, + "reward": 0.12500000558793545, + "reward_std": 0.18042195588350296, + "rewards/accuracy_reward": 0.12500000558793545, + "rewards/format_reward": 0.0, + "step": 534 + }, + { + "completion_length": 785.0833435058594, + "epoch": 0.6524390243902439, + "grad_norm": 0.47635316848754883, + "kl": 0.02850341796875, + "learning_rate": 9.749879890389848e-07, + "loss": -0.017, + "reward": 0.1666666716337204, + "reward_std": 0.14433756470680237, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 535 + }, + { + "completion_length": 847.6875305175781, + "epoch": 0.6536585365853659, + "grad_norm": 0.18279653787612915, + "kl": 0.02557373046875, + "learning_rate": 9.690113075664488e-07, + "loss": -0.002, + "reward": 0.14583333395421505, + "reward_std": 0.03608439117670059, + "rewards/accuracy_reward": 0.14583333395421505, + "rewards/format_reward": 0.0, + "step": 536 + }, + { + "completion_length": 901.2083740234375, + "epoch": 0.6548780487804878, + "grad_norm": 0.5397875905036926, + "kl": 0.0302734375, + "learning_rate": 9.630442482331853e-07, + "loss": 0.0856, + "reward": 0.18750000558793545, + "reward_std": 0.25259073823690414, + "rewards/accuracy_reward": 0.18750000558793545, + "rewards/format_reward": 0.0, + "step": 537 + }, + { + "completion_length": 1034.0208435058594, + "epoch": 0.6560975609756098, + "grad_norm": 0.3381046950817108, + "kl": 0.0242919921875, + "learning_rate": 9.57086919169323e-07, + "loss": 0.042, + "reward": 0.1250000037252903, + "reward_std": 0.10825317353010178, + "rewards/accuracy_reward": 0.1250000037252903, + "rewards/format_reward": 0.0, + "step": 538 + }, + { + "completion_length": 997.8125305175781, + "epoch": 0.6573170731707317, + "grad_norm": 0.64218670129776, + "kl": 0.03057861328125, + "learning_rate": 9.511394283286686e-07, + "loss": 0.1128, + "reward": 0.2083333432674408, + "reward_std": 0.21650634706020355, + "rewards/accuracy_reward": 0.2083333432674408, + "rewards/format_reward": 0.0, + "step": 539 + }, + { + "completion_length": 717.875, + "epoch": 0.6585365853658537, + "grad_norm": 0.3277949392795563, + "kl": 0.02752685546875, + "learning_rate": 9.452018834867454e-07, + "loss": 0.0327, + "reward": 0.1666666679084301, + "reward_std": 0.07216878235340118, + "rewards/accuracy_reward": 0.1666666679084301, + "rewards/format_reward": 0.0, + "step": 540 + }, + { + "completion_length": 1029.4583740234375, + "epoch": 0.6597560975609756, + "grad_norm": 0.24999314546585083, + "kl": 0.025390625, + "learning_rate": 9.392743922388469e-07, + "loss": 0.0099, + "reward": 0.1458333358168602, + "reward_std": 0.03608439117670059, + "rewards/accuracy_reward": 0.1458333358168602, + "rewards/format_reward": 0.0, + "step": 541 + }, + { + "completion_length": 912.1458435058594, + "epoch": 0.6609756097560976, + "grad_norm": 0.1514778882265091, + "kl": 0.02947998046875, + "learning_rate": 9.333570619980818e-07, + "loss": 0.0011, + "reward": 0.125, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 0.0, + "step": 542 + }, + { + "completion_length": 840.1458740234375, + "epoch": 0.6621951219512195, + "grad_norm": 0.3129235804080963, + "kl": 0.02838134765625, + "learning_rate": 9.27449999993429e-07, + "loss": -0.0009, + "reward": 0.1250000037252903, + "reward_std": 0.07216878235340118, + "rewards/accuracy_reward": 0.1250000037252903, + "rewards/format_reward": 0.0, + "step": 543 + }, + { + "completion_length": 928.0000305175781, + "epoch": 0.6634146341463415, + "grad_norm": 0.4312836229801178, + "kl": 0.02886962890625, + "learning_rate": 9.215533132677969e-07, + "loss": 0.0046, + "reward": 0.229166679084301, + "reward_std": 0.07216878235340118, + "rewards/accuracy_reward": 0.229166679084301, + "rewards/format_reward": 0.0, + "step": 544 + }, + { + "completion_length": 765.4167175292969, + "epoch": 0.6646341463414634, + "grad_norm": 0.7276366949081421, + "kl": 0.02789306640625, + "learning_rate": 9.156671086760788e-07, + "loss": 0.0027, + "reward": 0.2291666679084301, + "reward_std": 0.07216878235340118, + "rewards/accuracy_reward": 0.2291666679084301, + "rewards/format_reward": 0.0, + "step": 545 + }, + { + "completion_length": 906.8958435058594, + "epoch": 0.6658536585365854, + "grad_norm": 0.4692193269729614, + "kl": 0.057373046875, + "learning_rate": 9.097914928832228e-07, + "loss": -0.084, + "reward": 0.2708333358168602, + "reward_std": 0.10825317353010178, + "rewards/accuracy_reward": 0.2708333358168602, + "rewards/format_reward": 0.0, + "step": 546 + }, + { + "completion_length": 1031.8333740234375, + "epoch": 0.6670731707317074, + "grad_norm": 0.21384288370609283, + "kl": 0.0313720703125, + "learning_rate": 9.039265723622923e-07, + "loss": 0.0179, + "reward": 0.08333333395421505, + "reward_std": 0.03608439117670059, + "rewards/accuracy_reward": 0.08333333395421505, + "rewards/format_reward": 0.0, + "step": 547 + }, + { + "completion_length": 984.6875, + "epoch": 0.6682926829268293, + "grad_norm": 0.32777276635169983, + "kl": 0.03143310546875, + "learning_rate": 8.980724533925419e-07, + "loss": 0.0412, + "reward": 0.10416666977107525, + "reward_std": 0.07216878235340118, + "rewards/accuracy_reward": 0.10416666977107525, + "rewards/format_reward": 0.0, + "step": 548 + }, + { + "completion_length": 822.2291870117188, + "epoch": 0.6695121951219513, + "grad_norm": 0.06951643526554108, + "kl": 0.02813720703125, + "learning_rate": 8.922292420574888e-07, + "loss": 0.0011, + "reward": 0.125, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 0.0, + "step": 549 + }, + { + "completion_length": 1072.8333740234375, + "epoch": 0.6707317073170732, + "grad_norm": 0.33174851536750793, + "kl": 0.03363037109375, + "learning_rate": 8.863970442429902e-07, + "loss": 0.0145, + "reward": 0.06250000186264515, + "reward_std": 0.07216878235340118, + "rewards/accuracy_reward": 0.06250000186264515, + "rewards/format_reward": 0.0, + "step": 550 + }, + { + "completion_length": 718.5416870117188, + "epoch": 0.6719512195121952, + "grad_norm": 0.3611091375350952, + "kl": 0.0299072265625, + "learning_rate": 8.805759656353275e-07, + "loss": 0.0043, + "reward": 0.1250000037252903, + "reward_std": 0.10825317353010178, + "rewards/accuracy_reward": 0.1250000037252903, + "rewards/format_reward": 0.0, + "step": 551 + }, + { + "completion_length": 1112.2917175292969, + "epoch": 0.6731707317073171, + "grad_norm": 0.23453758656978607, + "kl": 0.03106689453125, + "learning_rate": 8.74766111719286e-07, + "loss": 0.0303, + "reward": 0.1875000074505806, + "reward_std": 0.10825317353010178, + "rewards/accuracy_reward": 0.1875000074505806, + "rewards/format_reward": 0.0, + "step": 552 + }, + { + "completion_length": 1011.8333740234375, + "epoch": 0.6743902439024391, + "grad_norm": 0.4298003613948822, + "kl": 0.02471923828125, + "learning_rate": 8.689675877762487e-07, + "loss": 0.0376, + "reward": 0.1875000074505806, + "reward_std": 0.10825317353010178, + "rewards/accuracy_reward": 0.1875000074505806, + "rewards/format_reward": 0.0, + "step": 553 + }, + { + "completion_length": 880.875, + "epoch": 0.675609756097561, + "grad_norm": 0.2480362057685852, + "kl": 0.02435302734375, + "learning_rate": 8.631804988822859e-07, + "loss": 0.0226, + "reward": 0.1458333432674408, + "reward_std": 0.03608439117670059, + "rewards/accuracy_reward": 0.1458333432674408, + "rewards/format_reward": 0.0, + "step": 554 + }, + { + "completion_length": 931.6666870117188, + "epoch": 0.676829268292683, + "grad_norm": 0.45659956336021423, + "kl": 0.0318603515625, + "learning_rate": 8.574049499062509e-07, + "loss": 0.0662, + "reward": 0.458333358168602, + "reward_std": 0.14433756470680237, + "rewards/accuracy_reward": 0.458333358168602, + "rewards/format_reward": 0.0, + "step": 555 + }, + { + "completion_length": 1066.6458435058594, + "epoch": 0.6780487804878049, + "grad_norm": 0.3029688894748688, + "kl": 0.02972412109375, + "learning_rate": 8.516410455078793e-07, + "loss": 0.0435, + "reward": 0.1666666679084301, + "reward_std": 0.07216878235340118, + "rewards/accuracy_reward": 0.1666666679084301, + "rewards/format_reward": 0.0, + "step": 556 + }, + { + "completion_length": 1015.1042175292969, + "epoch": 0.6792682926829269, + "grad_norm": 0.43346521258354187, + "kl": 0.026611328125, + "learning_rate": 8.458888901358958e-07, + "loss": 0.0408, + "reward": 0.1666666716337204, + "reward_std": 0.18042195588350296, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 557 + }, + { + "completion_length": 1045.7917175292969, + "epoch": 0.6804878048780488, + "grad_norm": 0.21469931304454803, + "kl": 0.0299072265625, + "learning_rate": 8.401485880261151e-07, + "loss": 0.0019, + "reward": 0.0416666679084301, + "reward_std": 0.07216878235340118, + "rewards/accuracy_reward": 0.0416666679084301, + "rewards/format_reward": 0.0, + "step": 558 + }, + { + "completion_length": 825.8125305175781, + "epoch": 0.6817073170731708, + "grad_norm": 0.052236396819353104, + "kl": 0.021240234375, + "learning_rate": 8.344202431995604e-07, + "loss": 0.0008, + "reward": 0.0625, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0625, + "rewards/format_reward": 0.0, + "step": 559 + }, + { + "completion_length": 1029.2500610351562, + "epoch": 0.6829268292682927, + "grad_norm": 0.06884250044822693, + "kl": 0.03179931640625, + "learning_rate": 8.287039594605737e-07, + "loss": 0.0012, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 560 + }, + { + "completion_length": 837.125, + "epoch": 0.6841463414634147, + "grad_norm": 0.8303191661834717, + "kl": 0.0313720703125, + "learning_rate": 8.229998403949348e-07, + "loss": 0.0064, + "reward": 0.0625, + "reward_std": 0.07216878235340118, + "rewards/accuracy_reward": 0.0625, + "rewards/format_reward": 0.0, + "step": 561 + }, + { + "completion_length": 1083.7708740234375, + "epoch": 0.6853658536585366, + "grad_norm": 0.4762817323207855, + "kl": 0.03076171875, + "learning_rate": 8.173079893679873e-07, + "loss": -0.0835, + "reward": 0.1458333358168602, + "reward_std": 0.10825317353010178, + "rewards/accuracy_reward": 0.1458333358168602, + "rewards/format_reward": 0.0, + "step": 562 + }, + { + "completion_length": 942.0000305175781, + "epoch": 0.6865853658536586, + "grad_norm": 0.39529234170913696, + "kl": 0.02484130859375, + "learning_rate": 8.116285095227604e-07, + "loss": 0.0101, + "reward": 0.06250000186264515, + "reward_std": 0.10825317353010178, + "rewards/accuracy_reward": 0.06250000186264515, + "rewards/format_reward": 0.0, + "step": 563 + }, + { + "completion_length": 1088.4166870117188, + "epoch": 0.6878048780487804, + "grad_norm": 0.35131967067718506, + "kl": 0.0321044921875, + "learning_rate": 8.05961503778108e-07, + "loss": -0.0654, + "reward": 0.02083333395421505, + "reward_std": 0.03608439117670059, + "rewards/accuracy_reward": 0.02083333395421505, + "rewards/format_reward": 0.0, + "step": 564 + }, + { + "completion_length": 1007.2291870117188, + "epoch": 0.6890243902439024, + "grad_norm": 0.12090548872947693, + "kl": 0.02606201171875, + "learning_rate": 8.003070748268339e-07, + "loss": 0.0043, + "reward": 0.08333333395421505, + "reward_std": 0.03608439117670059, + "rewards/accuracy_reward": 0.08333333395421505, + "rewards/format_reward": 0.0, + "step": 565 + }, + { + "completion_length": 940.6250305175781, + "epoch": 0.6902439024390243, + "grad_norm": 0.33971157670021057, + "kl": 0.02880859375, + "learning_rate": 7.94665325133837e-07, + "loss": -0.0299, + "reward": 0.1875000074505806, + "reward_std": 0.07216878235340118, + "rewards/accuracy_reward": 0.1875000074505806, + "rewards/format_reward": 0.0, + "step": 566 + }, + { + "completion_length": 1149.3958435058594, + "epoch": 0.6914634146341463, + "grad_norm": 0.5320213437080383, + "kl": 0.0594482421875, + "learning_rate": 7.890363569342539e-07, + "loss": 0.0018, + "reward": 0.0833333358168602, + "reward_std": 0.03608439117670059, + "rewards/accuracy_reward": 0.0833333358168602, + "rewards/format_reward": 0.0, + "step": 567 + }, + { + "completion_length": 644.1041870117188, + "epoch": 0.6926829268292682, + "grad_norm": 0.6505311727523804, + "kl": 0.03271484375, + "learning_rate": 7.834202722316054e-07, + "loss": -0.0397, + "reward": 0.12500000558793545, + "reward_std": 0.07216878235340118, + "rewards/accuracy_reward": 0.12500000558793545, + "rewards/format_reward": 0.0, + "step": 568 + }, + { + "completion_length": 821.2500305175781, + "epoch": 0.6939024390243902, + "grad_norm": 0.28424742817878723, + "kl": 0.02581787109375, + "learning_rate": 7.778171727959482e-07, + "loss": -0.0189, + "reward": 0.2083333432674408, + "reward_std": 0.03608439117670059, + "rewards/accuracy_reward": 0.2083333432674408, + "rewards/format_reward": 0.0, + "step": 569 + }, + { + "completion_length": 986.1458740234375, + "epoch": 0.6951219512195121, + "grad_norm": 0.3112906217575073, + "kl": 0.02734375, + "learning_rate": 7.722271601620293e-07, + "loss": -0.0013, + "reward": 0.1666666716337204, + "reward_std": 0.07216878235340118, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 570 + }, + { + "completion_length": 873.5833435058594, + "epoch": 0.6963414634146341, + "grad_norm": 0.04128978028893471, + "kl": 0.0260009765625, + "learning_rate": 7.6665033562745e-07, + "loss": 0.0008, + "reward": 0.125, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 0.0, + "step": 571 + }, + { + "completion_length": 774.3750305175781, + "epoch": 0.697560975609756, + "grad_norm": 0.4388665556907654, + "kl": 0.0338134765625, + "learning_rate": 7.610868002508248e-07, + "loss": -0.0205, + "reward": 0.2083333432674408, + "reward_std": 0.14433756470680237, + "rewards/accuracy_reward": 0.2083333432674408, + "rewards/format_reward": 0.0, + "step": 572 + }, + { + "completion_length": 771.5000305175781, + "epoch": 0.698780487804878, + "grad_norm": 5.242128372192383, + "kl": 0.05743408203125, + "learning_rate": 7.555366548499551e-07, + "loss": 0.0609, + "reward": 0.2916666865348816, + "reward_std": 0.07216878235340118, + "rewards/accuracy_reward": 0.2916666865348816, + "rewards/format_reward": 0.0, + "step": 573 + }, + { + "completion_length": 815.7916870117188, + "epoch": 0.7, + "grad_norm": 0.44563284516334534, + "kl": 0.02752685546875, + "learning_rate": 7.500000000000003e-07, + "loss": 0.0092, + "reward": 0.1250000037252903, + "reward_std": 0.10825317353010178, + "rewards/accuracy_reward": 0.1250000037252903, + "rewards/format_reward": 0.0, + "step": 574 + }, + { + "completion_length": 963.7708740234375, + "epoch": 0.7012195121951219, + "grad_norm": 0.32968661189079285, + "kl": 0.02679443359375, + "learning_rate": 7.444769360316534e-07, + "loss": 0.0105, + "reward": 0.1666666716337204, + "reward_std": 0.07216878235340118, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 575 + }, + { + "completion_length": 969.4375, + "epoch": 0.7024390243902439, + "grad_norm": 0.4815066158771515, + "kl": 0.025390625, + "learning_rate": 7.389675630293269e-07, + "loss": -0.0301, + "reward": 0.1458333395421505, + "reward_std": 0.10825317353010178, + "rewards/accuracy_reward": 0.1458333395421505, + "rewards/format_reward": 0.0, + "step": 576 + }, + { + "completion_length": 816.0000305175781, + "epoch": 0.7036585365853658, + "grad_norm": 0.2536729574203491, + "kl": 0.03240966796875, + "learning_rate": 7.334719808293342e-07, + "loss": 0.0069, + "reward": 0.08333333395421505, + "reward_std": 0.03608439117670059, + "rewards/accuracy_reward": 0.08333333395421505, + "rewards/format_reward": 0.0, + "step": 577 + }, + { + "completion_length": 830.5833435058594, + "epoch": 0.7048780487804878, + "grad_norm": 0.3585840165615082, + "kl": 0.02716064453125, + "learning_rate": 7.279902890180865e-07, + "loss": 0.0016, + "reward": 0.25000000558793545, + "reward_std": 0.07216878235340118, + "rewards/accuracy_reward": 0.25000000558793545, + "rewards/format_reward": 0.0, + "step": 578 + }, + { + "completion_length": 932.3750305175781, + "epoch": 0.7060975609756097, + "grad_norm": 0.5187066793441772, + "kl": 0.02978515625, + "learning_rate": 7.225225869302818e-07, + "loss": -0.0782, + "reward": 0.2708333432674408, + "reward_std": 0.10825317353010178, + "rewards/accuracy_reward": 0.2708333432674408, + "rewards/format_reward": 0.0, + "step": 579 + }, + { + "completion_length": 1005.7083740234375, + "epoch": 0.7073170731707317, + "grad_norm": 0.313052237033844, + "kl": 0.0267333984375, + "learning_rate": 7.1706897364711e-07, + "loss": 0.0132, + "reward": 0.0416666679084301, + "reward_std": 0.07216878235340118, + "rewards/accuracy_reward": 0.0416666679084301, + "rewards/format_reward": 0.0, + "step": 580 + }, + { + "completion_length": 745.5416870117188, + "epoch": 0.7085365853658536, + "grad_norm": 0.38321879506111145, + "kl": 0.03131103515625, + "learning_rate": 7.116295479944533e-07, + "loss": 0.0082, + "reward": 0.16666667722165585, + "reward_std": 0.07216878235340118, + "rewards/accuracy_reward": 0.16666667722165585, + "rewards/format_reward": 0.0, + "step": 581 + }, + { + "completion_length": 1140.0000305175781, + "epoch": 0.7097560975609756, + "grad_norm": 0.6155075430870056, + "kl": 0.03070068359375, + "learning_rate": 7.062044085410991e-07, + "loss": -0.084, + "reward": 0.1666666716337204, + "reward_std": 0.18042195588350296, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 582 + }, + { + "completion_length": 850.8958435058594, + "epoch": 0.7109756097560975, + "grad_norm": 0.4988707900047302, + "kl": 0.02691650390625, + "learning_rate": 7.007936535969516e-07, + "loss": 0.0107, + "reward": 0.08333333395421505, + "reward_std": 0.10825317353010178, + "rewards/accuracy_reward": 0.08333333395421505, + "rewards/format_reward": 0.0, + "step": 583 + }, + { + "completion_length": 1001.2083740234375, + "epoch": 0.7121951219512195, + "grad_norm": 0.4897194504737854, + "kl": 0.03070068359375, + "learning_rate": 6.9539738121125e-07, + "loss": 0.0243, + "reward": 0.1250000037252903, + "reward_std": 0.10825317353010178, + "rewards/accuracy_reward": 0.1250000037252903, + "rewards/format_reward": 0.0, + "step": 584 + }, + { + "completion_length": 866.8125305175781, + "epoch": 0.7134146341463414, + "grad_norm": 0.5088192224502563, + "kl": 0.03009033203125, + "learning_rate": 6.90015689170794e-07, + "loss": 0.05, + "reward": 0.1458333395421505, + "reward_std": 0.21650634706020355, + "rewards/accuracy_reward": 0.1458333395421505, + "rewards/format_reward": 0.0, + "step": 585 + }, + { + "completion_length": 1086.8958740234375, + "epoch": 0.7146341463414634, + "grad_norm": 0.391956090927124, + "kl": 0.0238037109375, + "learning_rate": 6.846486749981684e-07, + "loss": 0.0635, + "reward": 0.1666666679084301, + "reward_std": 0.14433756470680237, + "rewards/accuracy_reward": 0.1666666679084301, + "rewards/format_reward": 0.0, + "step": 586 + }, + { + "completion_length": 1109.1875610351562, + "epoch": 0.7158536585365853, + "grad_norm": 0.5406737923622131, + "kl": 0.03082275390625, + "learning_rate": 6.792964359499794e-07, + "loss": 0.0022, + "reward": 0.2083333432674408, + "reward_std": 0.03608439117670059, + "rewards/accuracy_reward": 0.2083333432674408, + "rewards/format_reward": 0.0, + "step": 587 + }, + { + "completion_length": 965.4583740234375, + "epoch": 0.7170731707317073, + "grad_norm": 0.472937673330307, + "kl": 0.025146484375, + "learning_rate": 6.739590690150903e-07, + "loss": 0.027, + "reward": 0.3333333432674408, + "reward_std": 0.10825317353010178, + "rewards/accuracy_reward": 0.3333333432674408, + "rewards/format_reward": 0.0, + "step": 588 + }, + { + "completion_length": 740.2708435058594, + "epoch": 0.7182926829268292, + "grad_norm": 0.7443292140960693, + "kl": 0.025390625, + "learning_rate": 6.686366709128632e-07, + "loss": 0.0367, + "reward": 0.1458333395421505, + "reward_std": 0.14433756470680237, + "rewards/accuracy_reward": 0.1458333395421505, + "rewards/format_reward": 0.0, + "step": 589 + }, + { + "completion_length": 623.6666870117188, + "epoch": 0.7195121951219512, + "grad_norm": 0.3671242594718933, + "kl": 0.027099609375, + "learning_rate": 6.633293380914087e-07, + "loss": -0.0144, + "reward": 0.02083333395421505, + "reward_std": 0.03608439117670059, + "rewards/accuracy_reward": 0.02083333395421505, + "rewards/format_reward": 0.0, + "step": 590 + }, + { + "completion_length": 743.3541870117188, + "epoch": 0.7207317073170731, + "grad_norm": 0.4817129969596863, + "kl": 0.0281982421875, + "learning_rate": 6.580371667258349e-07, + "loss": 0.0248, + "reward": 0.1666666716337204, + "reward_std": 0.10825317353010178, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 591 + }, + { + "completion_length": 914.7708435058594, + "epoch": 0.7219512195121951, + "grad_norm": 0.6760240197181702, + "kl": 0.03094482421875, + "learning_rate": 6.527602527165099e-07, + "loss": 0.0382, + "reward": 0.0625, + "reward_std": 0.07216878235340118, + "rewards/accuracy_reward": 0.0625, + "rewards/format_reward": 0.0, + "step": 592 + }, + { + "completion_length": 1096.8333740234375, + "epoch": 0.723170731707317, + "grad_norm": 0.40590059757232666, + "kl": 0.0230712890625, + "learning_rate": 6.474986916873168e-07, + "loss": 0.0277, + "reward": 0.1875000111758709, + "reward_std": 0.10825317353010178, + "rewards/accuracy_reward": 0.1875000111758709, + "rewards/format_reward": 0.0, + "step": 593 + }, + { + "completion_length": 762.5625305175781, + "epoch": 0.724390243902439, + "grad_norm": 0.23105137050151825, + "kl": 0.033935546875, + "learning_rate": 6.422525789839273e-07, + "loss": 0.0089, + "reward": 0.02083333395421505, + "reward_std": 0.03608439117670059, + "rewards/accuracy_reward": 0.02083333395421505, + "rewards/format_reward": 0.0, + "step": 594 + }, + { + "completion_length": 787.4375305175781, + "epoch": 0.725609756097561, + "grad_norm": 0.48707565665245056, + "kl": 0.02789306640625, + "learning_rate": 6.370220096720692e-07, + "loss": -0.0576, + "reward": 0.12500000558793545, + "reward_std": 0.10825317353010178, + "rewards/accuracy_reward": 0.12500000558793545, + "rewards/format_reward": 0.0, + "step": 595 + }, + { + "completion_length": 706.9166870117188, + "epoch": 0.7268292682926829, + "grad_norm": 0.042059846222400665, + "kl": 0.02459716796875, + "learning_rate": 6.318070785358074e-07, + "loss": 0.0009, + "reward": 0.125, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 0.0, + "step": 596 + }, + { + "completion_length": 860.6041870117188, + "epoch": 0.7280487804878049, + "grad_norm": 0.5042432546615601, + "kl": 0.028564453125, + "learning_rate": 6.266078800758249e-07, + "loss": -0.0065, + "reward": 0.2500000149011612, + "reward_std": 0.21650634706020355, + "rewards/accuracy_reward": 0.2500000149011612, + "rewards/format_reward": 0.0, + "step": 597 + }, + { + "completion_length": 914.4583740234375, + "epoch": 0.7292682926829268, + "grad_norm": 0.3709144592285156, + "kl": 0.029052734375, + "learning_rate": 6.214245085077078e-07, + "loss": 0.0667, + "reward": 0.1875000074505806, + "reward_std": 0.07216878235340118, + "rewards/accuracy_reward": 0.1875000074505806, + "rewards/format_reward": 0.0, + "step": 598 + }, + { + "completion_length": 767.2708740234375, + "epoch": 0.7304878048780488, + "grad_norm": 0.6649799346923828, + "kl": 0.02618408203125, + "learning_rate": 6.162570577602433e-07, + "loss": -0.0633, + "reward": 0.1458333395421505, + "reward_std": 0.14433756470680237, + "rewards/accuracy_reward": 0.1458333395421505, + "rewards/format_reward": 0.0, + "step": 599 + }, + { + "completion_length": 687.1041870117188, + "epoch": 0.7317073170731707, + "grad_norm": 0.3504408299922943, + "kl": 0.027099609375, + "learning_rate": 6.11105621473712e-07, + "loss": 0.0053, + "reward": 0.1875, + "reward_std": 0.07216878235340118, + "rewards/accuracy_reward": 0.1875, + "rewards/format_reward": 0.0, + "step": 600 + }, + { + "completion_length": 663.4791870117188, + "epoch": 0.7329268292682927, + "grad_norm": 0.33722177147865295, + "kl": 0.031494140625, + "learning_rate": 6.059702929981952e-07, + "loss": 0.0021, + "reward": 0.1041666716337204, + "reward_std": 0.03608439117670059, + "rewards/accuracy_reward": 0.1041666716337204, + "rewards/format_reward": 0.0, + "step": 601 + }, + { + "completion_length": 927.5000305175781, + "epoch": 0.7341463414634146, + "grad_norm": 0.20690011978149414, + "kl": 0.029296875, + "learning_rate": 6.008511653918821e-07, + "loss": 0.0659, + "reward": 0.1041666716337204, + "reward_std": 0.03608439117670059, + "rewards/accuracy_reward": 0.1041666716337204, + "rewards/format_reward": 0.0, + "step": 602 + }, + { + "completion_length": 945.1458740234375, + "epoch": 0.7353658536585366, + "grad_norm": 0.3113418519496918, + "kl": 0.02423095703125, + "learning_rate": 5.957483314193813e-07, + "loss": -0.0218, + "reward": 0.02083333395421505, + "reward_std": 0.03608439117670059, + "rewards/accuracy_reward": 0.02083333395421505, + "rewards/format_reward": 0.0, + "step": 603 + }, + { + "completion_length": 1067.1250305175781, + "epoch": 0.7365853658536585, + "grad_norm": 0.16814640164375305, + "kl": 0.02593994140625, + "learning_rate": 5.906618835500434e-07, + "loss": -0.0261, + "reward": 0.02083333395421505, + "reward_std": 0.03608439117670059, + "rewards/accuracy_reward": 0.02083333395421505, + "rewards/format_reward": 0.0, + "step": 604 + }, + { + "completion_length": 867.8333435058594, + "epoch": 0.7378048780487805, + "grad_norm": 0.46364933252334595, + "kl": 0.0235595703125, + "learning_rate": 5.855919139562815e-07, + "loss": 0.0116, + "reward": 0.1666666716337204, + "reward_std": 0.10825317353010178, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 605 + }, + { + "completion_length": 735.8750305175781, + "epoch": 0.7390243902439024, + "grad_norm": 0.2824901044368744, + "kl": 0.025146484375, + "learning_rate": 5.805385145119064e-07, + "loss": 0.0078, + "reward": 0.0833333358168602, + "reward_std": 0.03608439117670059, + "rewards/accuracy_reward": 0.0833333358168602, + "rewards/format_reward": 0.0, + "step": 606 + }, + { + "completion_length": 648.3541870117188, + "epoch": 0.7402439024390244, + "grad_norm": 0.21588559448719025, + "kl": 0.0252685546875, + "learning_rate": 5.755017767904543e-07, + "loss": -0.0065, + "reward": 0.2708333432674408, + "reward_std": 0.03608439117670059, + "rewards/accuracy_reward": 0.2708333432674408, + "rewards/format_reward": 0.0, + "step": 607 + }, + { + "completion_length": 936.625, + "epoch": 0.7414634146341463, + "grad_norm": 0.5255969166755676, + "kl": 0.02447509765625, + "learning_rate": 5.704817920635348e-07, + "loss": 0.0084, + "reward": 0.10416666977107525, + "reward_std": 0.18042195588350296, + "rewards/accuracy_reward": 0.10416666977107525, + "rewards/format_reward": 0.0, + "step": 608 + }, + { + "completion_length": 646.2916870117188, + "epoch": 0.7426829268292683, + "grad_norm": 0.3593496084213257, + "kl": 0.02374267578125, + "learning_rate": 5.654786512991705e-07, + "loss": -0.0195, + "reward": 0.20833333395421505, + "reward_std": 0.03608439117670059, + "rewards/accuracy_reward": 0.20833333395421505, + "rewards/format_reward": 0.0, + "step": 609 + }, + { + "completion_length": 762.4791870117188, + "epoch": 0.7439024390243902, + "grad_norm": 0.5494648814201355, + "kl": 0.02764892578125, + "learning_rate": 5.60492445160154e-07, + "loss": 0.0277, + "reward": 0.1458333395421505, + "reward_std": 0.10825317353010178, + "rewards/accuracy_reward": 0.1458333395421505, + "rewards/format_reward": 0.0, + "step": 610 + }, + { + "completion_length": 615.0625, + "epoch": 0.7451219512195122, + "grad_norm": 0.4446452558040619, + "kl": 0.02777099609375, + "learning_rate": 5.555232640024021e-07, + "loss": 0.0182, + "reward": 0.125, + "reward_std": 0.07216878235340118, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 0.0, + "step": 611 + }, + { + "completion_length": 1102.7292175292969, + "epoch": 0.7463414634146341, + "grad_norm": 0.5812187194824219, + "kl": 0.02154541015625, + "learning_rate": 5.505711978733175e-07, + "loss": 0.0239, + "reward": 0.2291666716337204, + "reward_std": 0.14433756470680237, + "rewards/accuracy_reward": 0.2291666716337204, + "rewards/format_reward": 0.0, + "step": 612 + }, + { + "completion_length": 718.625, + "epoch": 0.7475609756097561, + "grad_norm": 0.34378528594970703, + "kl": 0.02716064453125, + "learning_rate": 5.456363365101606e-07, + "loss": 0.0557, + "reward": 0.0833333358168602, + "reward_std": 0.10825317353010178, + "rewards/accuracy_reward": 0.0833333358168602, + "rewards/format_reward": 0.0, + "step": 613 + }, + { + "completion_length": 839.0625, + "epoch": 0.748780487804878, + "grad_norm": 0.28123462200164795, + "kl": 0.0296630859375, + "learning_rate": 5.407187693384191e-07, + "loss": -0.0126, + "reward": 0.1666666679084301, + "reward_std": 0.03608439117670059, + "rewards/accuracy_reward": 0.1666666679084301, + "rewards/format_reward": 0.0, + "step": 614 + }, + { + "completion_length": 853.4166870117188, + "epoch": 0.75, + "grad_norm": 0.2266978621482849, + "kl": 0.02679443359375, + "learning_rate": 5.358185854701909e-07, + "loss": 0.0021, + "reward": 0.125, + "reward_std": 0.07216878235340118, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 0.0, + "step": 615 + }, + { + "completion_length": 841.8541870117188, + "epoch": 0.751219512195122, + "grad_norm": 0.3853289783000946, + "kl": 0.030029296875, + "learning_rate": 5.309358737025682e-07, + "loss": -0.006, + "reward": 0.06250000186264515, + "reward_std": 0.10825317353010178, + "rewards/accuracy_reward": 0.06250000186264515, + "rewards/format_reward": 0.0, + "step": 616 + }, + { + "completion_length": 699.5416870117188, + "epoch": 0.7524390243902439, + "grad_norm": 0.16332949697971344, + "kl": 0.02691650390625, + "learning_rate": 5.26070722516028e-07, + "loss": -0.0019, + "reward": 0.08333333395421505, + "reward_std": 0.03608439117670059, + "rewards/accuracy_reward": 0.08333333395421505, + "rewards/format_reward": 0.0, + "step": 617 + }, + { + "completion_length": 603.0833435058594, + "epoch": 0.7536585365853659, + "grad_norm": 0.3333573043346405, + "kl": 0.02734375, + "learning_rate": 5.21223220072828e-07, + "loss": 0.0047, + "reward": 0.1458333358168602, + "reward_std": 0.03608439117670059, + "rewards/accuracy_reward": 0.1458333358168602, + "rewards/format_reward": 0.0, + "step": 618 + }, + { + "completion_length": 825.9791870117188, + "epoch": 0.7548780487804878, + "grad_norm": 0.4847300350666046, + "kl": 0.02606201171875, + "learning_rate": 5.163934542154106e-07, + "loss": -0.0903, + "reward": 0.14583333395421505, + "reward_std": 0.10825317353010178, + "rewards/accuracy_reward": 0.14583333395421505, + "rewards/format_reward": 0.0, + "step": 619 + }, + { + "completion_length": 893.6875, + "epoch": 0.7560975609756098, + "grad_norm": 0.3289225995540619, + "kl": 0.0340576171875, + "learning_rate": 5.115815124648103e-07, + "loss": -0.023, + "reward": 0.0625, + "reward_std": 0.07216878235340118, + "rewards/accuracy_reward": 0.0625, + "rewards/format_reward": 0.0, + "step": 620 + }, + { + "completion_length": 945.3333740234375, + "epoch": 0.7573170731707317, + "grad_norm": 0.35044562816619873, + "kl": 0.0244140625, + "learning_rate": 5.067874820190684e-07, + "loss": -0.0447, + "reward": 0.1458333432674408, + "reward_std": 0.10825317353010178, + "rewards/accuracy_reward": 0.1458333432674408, + "rewards/format_reward": 0.0, + "step": 621 + }, + { + "completion_length": 774.5625305175781, + "epoch": 0.7585365853658537, + "grad_norm": 0.23236961662769318, + "kl": 0.02313232421875, + "learning_rate": 5.020114497516521e-07, + "loss": 0.0038, + "reward": 0.10416666977107525, + "reward_std": 0.07216878235340118, + "rewards/accuracy_reward": 0.10416666977107525, + "rewards/format_reward": 0.0, + "step": 622 + }, + { + "completion_length": 845.3541870117188, + "epoch": 0.7597560975609756, + "grad_norm": 0.5932947397232056, + "kl": 0.02593994140625, + "learning_rate": 4.972535022098795e-07, + "loss": -0.0275, + "reward": 0.3125000149011612, + "reward_std": 0.18042195588350296, + "rewards/accuracy_reward": 0.3125000149011612, + "rewards/format_reward": 0.0, + "step": 623 + }, + { + "completion_length": 858.6875305175781, + "epoch": 0.7609756097560976, + "grad_norm": 0.04889252781867981, + "kl": 0.02398681640625, + "learning_rate": 4.925137256133533e-07, + "loss": 0.0009, + "reward": 0.125, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 0.0, + "step": 624 + }, + { + "completion_length": 679.8333740234375, + "epoch": 0.7621951219512195, + "grad_norm": 0.5832393169403076, + "kl": 0.02545166015625, + "learning_rate": 4.877922058523971e-07, + "loss": 0.022, + "reward": 0.2083333432674408, + "reward_std": 0.14433756470680237, + "rewards/accuracy_reward": 0.2083333432674408, + "rewards/format_reward": 0.0, + "step": 625 + }, + { + "completion_length": 909.9792175292969, + "epoch": 0.7634146341463415, + "grad_norm": 0.5005730986595154, + "kl": 0.0291748046875, + "learning_rate": 4.830890284864985e-07, + "loss": 0.005, + "reward": 0.0416666679084301, + "reward_std": 0.07216878235340118, + "rewards/accuracy_reward": 0.0416666679084301, + "rewards/format_reward": 0.0, + "step": 626 + }, + { + "completion_length": 749.4583740234375, + "epoch": 0.7646341463414634, + "grad_norm": 0.39322492480278015, + "kl": 0.0283203125, + "learning_rate": 4.784042787427605e-07, + "loss": -0.0427, + "reward": 0.2083333432674408, + "reward_std": 0.03608439117670059, + "rewards/accuracy_reward": 0.2083333432674408, + "rewards/format_reward": 0.0, + "step": 627 + }, + { + "completion_length": 819.6250305175781, + "epoch": 0.7658536585365854, + "grad_norm": 0.3320612609386444, + "kl": 0.0233154296875, + "learning_rate": 4.7373804151435456e-07, + "loss": -0.0096, + "reward": 0.1041666679084301, + "reward_std": 0.07216878235340118, + "rewards/accuracy_reward": 0.1041666679084301, + "rewards/format_reward": 0.0, + "step": 628 + }, + { + "completion_length": 822.9791870117188, + "epoch": 0.7670731707317073, + "grad_norm": 0.23255078494548798, + "kl": 0.02508544921875, + "learning_rate": 4.6909040135898463e-07, + "loss": 0.0098, + "reward": 0.02083333395421505, + "reward_std": 0.03608439117670059, + "rewards/accuracy_reward": 0.02083333395421505, + "rewards/format_reward": 0.0, + "step": 629 + }, + { + "completion_length": 744.3958740234375, + "epoch": 0.7682926829268293, + "grad_norm": 0.3943890333175659, + "kl": 0.02325439453125, + "learning_rate": 4.6446144249735345e-07, + "loss": 0.0175, + "reward": 0.10416666977107525, + "reward_std": 0.07216878235340118, + "rewards/accuracy_reward": 0.10416666977107525, + "rewards/format_reward": 0.0, + "step": 630 + }, + { + "completion_length": 929.8333435058594, + "epoch": 0.7695121951219512, + "grad_norm": 0.2944657802581787, + "kl": 0.02777099609375, + "learning_rate": 4.598512488116376e-07, + "loss": 0.003, + "reward": 0.25, + "reward_std": 0.07216878235340118, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 0.0, + "step": 631 + }, + { + "completion_length": 699.0416870117188, + "epoch": 0.7707317073170732, + "grad_norm": 0.5173816084861755, + "kl": 0.02801513671875, + "learning_rate": 4.552599038439651e-07, + "loss": 0.0126, + "reward": 0.1250000037252903, + "reward_std": 0.07216878235340118, + "rewards/accuracy_reward": 0.1250000037252903, + "rewards/format_reward": 0.0, + "step": 632 + }, + { + "completion_length": 850.4166870117188, + "epoch": 0.7719512195121951, + "grad_norm": 0.46213850378990173, + "kl": 0.0316162109375, + "learning_rate": 4.506874907949034e-07, + "loss": 0.0377, + "reward": 0.0625, + "reward_std": 0.10825317353010178, + "rewards/accuracy_reward": 0.0625, + "rewards/format_reward": 0.0, + "step": 633 + }, + { + "completion_length": 648.7500305175781, + "epoch": 0.7731707317073171, + "grad_norm": 0.3204668164253235, + "kl": 0.02276611328125, + "learning_rate": 4.461340925219522e-07, + "loss": -0.0045, + "reward": 0.08333333395421505, + "reward_std": 0.03608439117670059, + "rewards/accuracy_reward": 0.08333333395421505, + "rewards/format_reward": 0.0, + "step": 634 + }, + { + "completion_length": 1001.9375305175781, + "epoch": 0.774390243902439, + "grad_norm": 0.1694246381521225, + "kl": 0.02435302734375, + "learning_rate": 4.4159979153804064e-07, + "loss": -0.0036, + "reward": 0.08333333395421505, + "reward_std": 0.03608439117670059, + "rewards/accuracy_reward": 0.08333333395421505, + "rewards/format_reward": 0.0, + "step": 635 + }, + { + "completion_length": 636.1875, + "epoch": 0.775609756097561, + "grad_norm": 0.518161416053772, + "kl": 0.02294921875, + "learning_rate": 4.3708467001003305e-07, + "loss": 0.0107, + "reward": 0.1875000111758709, + "reward_std": 0.10825317353010178, + "rewards/accuracy_reward": 0.1875000111758709, + "rewards/format_reward": 0.0, + "step": 636 + }, + { + "completion_length": 979.5208740234375, + "epoch": 0.776829268292683, + "grad_norm": 0.22282478213310242, + "kl": 0.024871826171875, + "learning_rate": 4.3258880975723777e-07, + "loss": 0.0294, + "reward": 0.1458333358168602, + "reward_std": 0.03608439117670059, + "rewards/accuracy_reward": 0.1458333358168602, + "rewards/format_reward": 0.0, + "step": 637 + }, + { + "completion_length": 995.0625305175781, + "epoch": 0.7780487804878049, + "grad_norm": 0.21094514429569244, + "kl": 0.02630615234375, + "learning_rate": 4.2811229224992807e-07, + "loss": 0.0009, + "reward": 0.2291666716337204, + "reward_std": 0.03608439117670059, + "rewards/accuracy_reward": 0.2291666716337204, + "rewards/format_reward": 0.0, + "step": 638 + }, + { + "completion_length": 774.4375305175781, + "epoch": 0.7792682926829269, + "grad_norm": 0.5504517555236816, + "kl": 0.026123046875, + "learning_rate": 4.2365519860786316e-07, + "loss": 0.0057, + "reward": 0.1458333395421505, + "reward_std": 0.14433756470680237, + "rewards/accuracy_reward": 0.1458333395421505, + "rewards/format_reward": 0.0, + "step": 639 + }, + { + "completion_length": 861.0625, + "epoch": 0.7804878048780488, + "grad_norm": 0.11976215988397598, + "kl": 0.0213623046875, + "learning_rate": 4.192176095988196e-07, + "loss": 0.002, + "reward": 0.2083333358168602, + "reward_std": 0.03608439117670059, + "rewards/accuracy_reward": 0.2083333358168602, + "rewards/format_reward": 0.0, + "step": 640 + }, + { + "completion_length": 867.6875305175781, + "epoch": 0.7817073170731708, + "grad_norm": 0.31083089113235474, + "kl": 0.0318603515625, + "learning_rate": 4.147996056371258e-07, + "loss": 0.006, + "reward": 0.12500000558793545, + "reward_std": 0.07216878235340118, + "rewards/accuracy_reward": 0.12500000558793545, + "rewards/format_reward": 0.0, + "step": 641 + }, + { + "completion_length": 666.8333435058594, + "epoch": 0.7829268292682927, + "grad_norm": 0.12398859858512878, + "kl": 0.02935791015625, + "learning_rate": 4.1040126678220656e-07, + "loss": 0.001, + "reward": 0.125, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 0.0, + "step": 642 + }, + { + "completion_length": 733.7291870117188, + "epoch": 0.7841463414634147, + "grad_norm": 0.35403457283973694, + "kl": 0.0262451171875, + "learning_rate": 4.060226727371327e-07, + "loss": -0.0117, + "reward": 0.3125000149011612, + "reward_std": 0.07216878235340118, + "rewards/accuracy_reward": 0.3125000149011612, + "rewards/format_reward": 0.0, + "step": 643 + }, + { + "completion_length": 926.9791870117188, + "epoch": 0.7853658536585366, + "grad_norm": 0.22392979264259338, + "kl": 0.02423095703125, + "learning_rate": 4.0166390284717475e-07, + "loss": -0.0329, + "reward": 0.1666666679084301, + "reward_std": 0.03608439117670059, + "rewards/accuracy_reward": 0.1666666679084301, + "rewards/format_reward": 0.0, + "step": 644 + }, + { + "completion_length": 881.4583740234375, + "epoch": 0.7865853658536586, + "grad_norm": 0.27586719393730164, + "kl": 0.02484130859375, + "learning_rate": 3.973250360983677e-07, + "loss": 0.0033, + "reward": 0.1041666679084301, + "reward_std": 0.03608439117670059, + "rewards/accuracy_reward": 0.1041666679084301, + "rewards/format_reward": 0.0, + "step": 645 + }, + { + "completion_length": 817.9375305175781, + "epoch": 0.7878048780487805, + "grad_norm": 0.366715669631958, + "kl": 0.0218505859375, + "learning_rate": 3.930061511160762e-07, + "loss": -0.0048, + "reward": 0.1458333395421505, + "reward_std": 0.07216878235340118, + "rewards/accuracy_reward": 0.1458333395421505, + "rewards/format_reward": 0.0, + "step": 646 + }, + { + "completion_length": 749.5625305175781, + "epoch": 0.7890243902439025, + "grad_norm": 0.3225473165512085, + "kl": 0.0262451171875, + "learning_rate": 3.8870732616357364e-07, + "loss": 0.0126, + "reward": 0.1666666716337204, + "reward_std": 0.07216878235340118, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 647 + }, + { + "completion_length": 922.1041870117188, + "epoch": 0.7902439024390244, + "grad_norm": 0.49840256571769714, + "kl": 0.02642822265625, + "learning_rate": 3.8442863914062065e-07, + "loss": -0.0015, + "reward": 0.1041666679084301, + "reward_std": 0.14433756470680237, + "rewards/accuracy_reward": 0.1041666679084301, + "rewards/format_reward": 0.0, + "step": 648 + }, + { + "completion_length": 629.3541870117188, + "epoch": 0.7914634146341464, + "grad_norm": 0.5496554970741272, + "kl": 0.02532958984375, + "learning_rate": 3.8017016758205597e-07, + "loss": -0.0105, + "reward": 0.2291666716337204, + "reward_std": 0.14433756470680237, + "rewards/accuracy_reward": 0.2291666716337204, + "rewards/format_reward": 0.0, + "step": 649 + }, + { + "completion_length": 666.8958740234375, + "epoch": 0.7926829268292683, + "grad_norm": 0.45331189036369324, + "kl": 0.031982421875, + "learning_rate": 3.759319886563905e-07, + "loss": -0.0191, + "reward": 0.25, + "reward_std": 0.14433756470680237, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 0.0, + "step": 650 + }, + { + "completion_length": 1043.6250610351562, + "epoch": 0.7939024390243903, + "grad_norm": 0.46646979451179504, + "kl": 0.024169921875, + "learning_rate": 3.7171417916440714e-07, + "loss": 0.1326, + "reward": 0.1250000037252903, + "reward_std": 0.10825317353010178, + "rewards/accuracy_reward": 0.1250000037252903, + "rewards/format_reward": 0.0, + "step": 651 + }, + { + "completion_length": 1071.1875610351562, + "epoch": 0.7951219512195122, + "grad_norm": 0.3821835517883301, + "kl": 0.022705078125, + "learning_rate": 3.6751681553777236e-07, + "loss": 0.0294, + "reward": 0.2291666716337204, + "reward_std": 0.14433756470680237, + "rewards/accuracy_reward": 0.2291666716337204, + "rewards/format_reward": 0.0, + "step": 652 + }, + { + "completion_length": 844.9791870117188, + "epoch": 0.7963414634146342, + "grad_norm": 0.30517253279685974, + "kl": 0.0240478515625, + "learning_rate": 3.633399738376491e-07, + "loss": 0.0046, + "reward": 0.0416666679084301, + "reward_std": 0.03608439117670059, + "rewards/accuracy_reward": 0.0416666679084301, + "rewards/format_reward": 0.0, + "step": 653 + }, + { + "completion_length": 838.4792175292969, + "epoch": 0.7975609756097561, + "grad_norm": 0.5359827876091003, + "kl": 0.03106689453125, + "learning_rate": 3.5918372975331933e-07, + "loss": 0.0247, + "reward": 0.229166679084301, + "reward_std": 0.18042195588350296, + "rewards/accuracy_reward": 0.229166679084301, + "rewards/format_reward": 0.0, + "step": 654 + }, + { + "completion_length": 1229.2500610351562, + "epoch": 0.7987804878048781, + "grad_norm": 0.46620362997055054, + "kl": 0.0257568359375, + "learning_rate": 3.5504815860081056e-07, + "loss": -0.0116, + "reward": 0.1875000074505806, + "reward_std": 0.10825317353010178, + "rewards/accuracy_reward": 0.1875000074505806, + "rewards/format_reward": 0.0, + "step": 655 + }, + { + "completion_length": 768.3125, + "epoch": 0.8, + "grad_norm": 0.5677731037139893, + "kl": 0.03173828125, + "learning_rate": 3.5093333532153313e-07, + "loss": 0.0289, + "reward": 0.2708333432674408, + "reward_std": 0.18042195588350296, + "rewards/accuracy_reward": 0.2708333432674408, + "rewards/format_reward": 0.0, + "step": 656 + }, + { + "completion_length": 994.2916870117188, + "epoch": 0.801219512195122, + "grad_norm": 0.13922348618507385, + "kl": 0.0245361328125, + "learning_rate": 3.468393344809222e-07, + "loss": 0.0132, + "reward": 0.0833333358168602, + "reward_std": 0.03608439117670059, + "rewards/accuracy_reward": 0.0833333358168602, + "rewards/format_reward": 0.0, + "step": 657 + }, + { + "completion_length": 898.0208435058594, + "epoch": 0.802439024390244, + "grad_norm": 0.24220433831214905, + "kl": 0.02447509765625, + "learning_rate": 3.4276623026708556e-07, + "loss": 0.0095, + "reward": 0.2708333432674408, + "reward_std": 0.03608439117670059, + "rewards/accuracy_reward": 0.2708333432674408, + "rewards/format_reward": 0.0, + "step": 658 + }, + { + "completion_length": 786.2500305175781, + "epoch": 0.8036585365853659, + "grad_norm": 0.34243243932724, + "kl": 0.02618408203125, + "learning_rate": 3.3871409648945955e-07, + "loss": 0.0175, + "reward": 0.2083333358168602, + "reward_std": 0.03608439117670059, + "rewards/accuracy_reward": 0.2083333358168602, + "rewards/format_reward": 0.0, + "step": 659 + }, + { + "completion_length": 809.3333435058594, + "epoch": 0.8048780487804879, + "grad_norm": 0.45875081419944763, + "kl": 0.02392578125, + "learning_rate": 3.346830065774706e-07, + "loss": 0.0062, + "reward": 0.1666666716337204, + "reward_std": 0.07216878235340118, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 660 + }, + { + "completion_length": 864.7083740234375, + "epoch": 0.8060975609756098, + "grad_norm": 0.3514421582221985, + "kl": 0.0238037109375, + "learning_rate": 3.306730335792075e-07, + "loss": -0.0071, + "reward": 0.1041666679084301, + "reward_std": 0.07216878235340118, + "rewards/accuracy_reward": 0.1041666679084301, + "rewards/format_reward": 0.0, + "step": 661 + }, + { + "completion_length": 615.25, + "epoch": 0.8073170731707318, + "grad_norm": 0.5857967734336853, + "kl": 0.02655029296875, + "learning_rate": 3.266842501600934e-07, + "loss": -0.0196, + "reward": 0.1875, + "reward_std": 0.10825317353010178, + "rewards/accuracy_reward": 0.1875, + "rewards/format_reward": 0.0, + "step": 662 + }, + { + "completion_length": 780.3333740234375, + "epoch": 0.8085365853658537, + "grad_norm": 0.5814476013183594, + "kl": 0.02825927734375, + "learning_rate": 3.2271672860157324e-07, + "loss": 0.0054, + "reward": 0.1875, + "reward_std": 0.18042195588350296, + "rewards/accuracy_reward": 0.1875, + "rewards/format_reward": 0.0, + "step": 663 + }, + { + "completion_length": 860.0416870117188, + "epoch": 0.8097560975609757, + "grad_norm": 0.25556638836860657, + "kl": 0.0343017578125, + "learning_rate": 3.187705407998018e-07, + "loss": 0.0115, + "reward": 0.06250000186264515, + "reward_std": 0.07216878235340118, + "rewards/accuracy_reward": 0.06250000186264515, + "rewards/format_reward": 0.0, + "step": 664 + }, + { + "completion_length": 735.75, + "epoch": 0.8109756097560976, + "grad_norm": 0.5753107666969299, + "kl": 0.026123046875, + "learning_rate": 3.148457582643398e-07, + "loss": 0.0642, + "reward": 0.2500000074505806, + "reward_std": 0.18042195588350296, + "rewards/accuracy_reward": 0.2500000074505806, + "rewards/format_reward": 0.0, + "step": 665 + }, + { + "completion_length": 1075.9375, + "epoch": 0.8121951219512196, + "grad_norm": 0.04850023239850998, + "kl": 0.0252685546875, + "learning_rate": 3.1094245211686106e-07, + "loss": 0.0008, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 666 + }, + { + "completion_length": 955.8750610351562, + "epoch": 0.8134146341463414, + "grad_norm": 0.0982297733426094, + "kl": 0.024169921875, + "learning_rate": 3.070606930898602e-07, + "loss": 0.0046, + "reward": 0.08333333395421505, + "reward_std": 0.03608439117670059, + "rewards/accuracy_reward": 0.08333333395421505, + "rewards/format_reward": 0.0, + "step": 667 + }, + { + "completion_length": 863.0416870117188, + "epoch": 0.8146341463414634, + "grad_norm": 0.4044858515262604, + "kl": 0.0328369140625, + "learning_rate": 3.032005515253751e-07, + "loss": 0.0122, + "reward": 0.2083333432674408, + "reward_std": 0.10825317353010178, + "rewards/accuracy_reward": 0.2083333432674408, + "rewards/format_reward": 0.0, + "step": 668 + }, + { + "completion_length": 818.6250305175781, + "epoch": 0.8158536585365853, + "grad_norm": 0.38262733817100525, + "kl": 0.02764892578125, + "learning_rate": 2.9936209737370727e-07, + "loss": -0.0002, + "reward": 0.1458333395421505, + "reward_std": 0.10825317353010178, + "rewards/accuracy_reward": 0.1458333395421505, + "rewards/format_reward": 0.0, + "step": 669 + }, + { + "completion_length": 820.3958740234375, + "epoch": 0.8170731707317073, + "grad_norm": 0.44554057717323303, + "kl": 0.02392578125, + "learning_rate": 2.955454001921588e-07, + "loss": 0.0221, + "reward": 0.2083333432674408, + "reward_std": 0.10825317353010178, + "rewards/accuracy_reward": 0.2083333432674408, + "rewards/format_reward": 0.0, + "step": 670 + }, + { + "completion_length": 706.2083435058594, + "epoch": 0.8182926829268292, + "grad_norm": 0.5388452410697937, + "kl": 0.0235595703125, + "learning_rate": 2.917505291437683e-07, + "loss": 0.0026, + "reward": 0.12500000558793545, + "reward_std": 0.07216878235340118, + "rewards/accuracy_reward": 0.12500000558793545, + "rewards/format_reward": 0.0, + "step": 671 + }, + { + "completion_length": 759.5000305175781, + "epoch": 0.8195121951219512, + "grad_norm": 0.24352578818798065, + "kl": 0.080078125, + "learning_rate": 2.879775529960603e-07, + "loss": 0.0065, + "reward": 0.2500000149011612, + "reward_std": 0.07216878235340118, + "rewards/accuracy_reward": 0.2500000149011612, + "rewards/format_reward": 0.0, + "step": 672 + }, + { + "completion_length": 817.3958435058594, + "epoch": 0.8207317073170731, + "grad_norm": 0.6730024218559265, + "kl": 0.02984619140625, + "learning_rate": 2.842265401197982e-07, + "loss": 0.01, + "reward": 0.2916666679084301, + "reward_std": 0.14433756470680237, + "rewards/accuracy_reward": 0.2916666679084301, + "rewards/format_reward": 0.0, + "step": 673 + }, + { + "completion_length": 978.3125, + "epoch": 0.8219512195121951, + "grad_norm": 0.4777490794658661, + "kl": 0.02362060546875, + "learning_rate": 2.8049755848774337e-07, + "loss": -0.0511, + "reward": 0.2083333432674408, + "reward_std": 0.10825317353010178, + "rewards/accuracy_reward": 0.2083333432674408, + "rewards/format_reward": 0.0, + "step": 674 + }, + { + "completion_length": 830.3541870117188, + "epoch": 0.823170731707317, + "grad_norm": 0.44030094146728516, + "kl": 0.03741455078125, + "learning_rate": 2.7679067567342766e-07, + "loss": -0.0183, + "reward": 0.2916666716337204, + "reward_std": 0.14433756470680237, + "rewards/accuracy_reward": 0.2916666716337204, + "rewards/format_reward": 0.0, + "step": 675 + }, + { + "completion_length": 970.1666870117188, + "epoch": 0.824390243902439, + "grad_norm": 0.43740084767341614, + "kl": 0.02618408203125, + "learning_rate": 2.7310595884992354e-07, + "loss": 0.0676, + "reward": 0.125, + "reward_std": 0.10825317353010178, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 0.0, + "step": 676 + }, + { + "completion_length": 720.5000305175781, + "epoch": 0.8256097560975609, + "grad_norm": 0.5037770867347717, + "kl": 0.02215576171875, + "learning_rate": 2.6944347478863226e-07, + "loss": 0.0005, + "reward": 0.2083333358168602, + "reward_std": 0.14433756470680237, + "rewards/accuracy_reward": 0.2083333358168602, + "rewards/format_reward": 0.0, + "step": 677 + }, + { + "completion_length": 679.2916870117188, + "epoch": 0.8268292682926829, + "grad_norm": 0.5386813282966614, + "kl": 0.023681640625, + "learning_rate": 2.658032898580702e-07, + "loss": 0.0202, + "reward": 0.2500000074505806, + "reward_std": 0.10825317353010178, + "rewards/accuracy_reward": 0.2500000074505806, + "rewards/format_reward": 0.0, + "step": 678 + }, + { + "completion_length": 824.7083435058594, + "epoch": 0.8280487804878048, + "grad_norm": 0.5326714515686035, + "kl": 0.02606201171875, + "learning_rate": 2.621854700226663e-07, + "loss": 0.0196, + "reward": 0.1666666716337204, + "reward_std": 0.10825317353010178, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 679 + }, + { + "completion_length": 772.2916870117188, + "epoch": 0.8292682926829268, + "grad_norm": 0.36251839995384216, + "kl": 0.02960205078125, + "learning_rate": 2.5859008084156986e-07, + "loss": 0.0207, + "reward": 0.2083333432674408, + "reward_std": 0.10825317353010178, + "rewards/accuracy_reward": 0.2083333432674408, + "rewards/format_reward": 0.0, + "step": 680 + }, + { + "completion_length": 1063.3542175292969, + "epoch": 0.8304878048780487, + "grad_norm": 0.3171287477016449, + "kl": 0.0240478515625, + "learning_rate": 2.5501718746745766e-07, + "loss": -0.016, + "reward": 0.1041666679084301, + "reward_std": 0.03608439117670059, + "rewards/accuracy_reward": 0.1041666679084301, + "rewards/format_reward": 0.0, + "step": 681 + }, + { + "completion_length": 958.7291870117188, + "epoch": 0.8317073170731707, + "grad_norm": 0.04372232034802437, + "kl": 0.025390625, + "learning_rate": 2.514668546453592e-07, + "loss": 0.0009, + "reward": 0.0625, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0625, + "rewards/format_reward": 0.0, + "step": 682 + }, + { + "completion_length": 701.7083435058594, + "epoch": 0.8329268292682926, + "grad_norm": 0.5557974576950073, + "kl": 0.02490234375, + "learning_rate": 2.4793914671147745e-07, + "loss": -0.0015, + "reward": 0.2500000149011612, + "reward_std": 0.10825317353010178, + "rewards/accuracy_reward": 0.2500000149011612, + "rewards/format_reward": 0.0, + "step": 683 + }, + { + "completion_length": 828.6875305175781, + "epoch": 0.8341463414634146, + "grad_norm": 0.5050874352455139, + "kl": 0.0203857421875, + "learning_rate": 2.4443412759202745e-07, + "loss": -0.0188, + "reward": 0.2708333358168602, + "reward_std": 0.10825317353010178, + "rewards/accuracy_reward": 0.2708333358168602, + "rewards/format_reward": 0.0, + "step": 684 + }, + { + "completion_length": 703.7708435058594, + "epoch": 0.8353658536585366, + "grad_norm": 0.5317684412002563, + "kl": 0.032470703125, + "learning_rate": 2.4095186080207505e-07, + "loss": -0.0035, + "reward": 0.14583333395421505, + "reward_std": 0.10825317353010178, + "rewards/accuracy_reward": 0.14583333395421505, + "rewards/format_reward": 0.0, + "step": 685 + }, + { + "completion_length": 779.8541870117188, + "epoch": 0.8365853658536585, + "grad_norm": 0.4637664556503296, + "kl": 0.029296875, + "learning_rate": 2.3749240944438845e-07, + "loss": 0.023, + "reward": 0.1458333358168602, + "reward_std": 0.18042195588350296, + "rewards/accuracy_reward": 0.1458333358168602, + "rewards/format_reward": 0.0, + "step": 686 + }, + { + "completion_length": 781.6875305175781, + "epoch": 0.8378048780487805, + "grad_norm": 0.4552900493144989, + "kl": 0.02520751953125, + "learning_rate": 2.3405583620829268e-07, + "loss": 0.0113, + "reward": 0.1666666716337204, + "reward_std": 0.14433756470680237, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 687 + }, + { + "completion_length": 861.0625305175781, + "epoch": 0.8390243902439024, + "grad_norm": 0.5198604464530945, + "kl": 0.02117919921875, + "learning_rate": 2.3064220336853398e-07, + "loss": -0.0567, + "reward": 0.3541666716337204, + "reward_std": 0.10825317353010178, + "rewards/accuracy_reward": 0.3541666716337204, + "rewards/format_reward": 0.0, + "step": 688 + }, + { + "completion_length": 784.625, + "epoch": 0.8402439024390244, + "grad_norm": 0.37156882882118225, + "kl": 0.0289306640625, + "learning_rate": 2.272515727841527e-07, + "loss": -0.0117, + "reward": 0.2500000149011612, + "reward_std": 0.10825317353010178, + "rewards/accuracy_reward": 0.2500000149011612, + "rewards/format_reward": 0.0, + "step": 689 + }, + { + "completion_length": 993.1667175292969, + "epoch": 0.8414634146341463, + "grad_norm": 0.42797571420669556, + "kl": 0.0313720703125, + "learning_rate": 2.2388400589735985e-07, + "loss": 0.0018, + "reward": 0.0416666679084301, + "reward_std": 0.07216878235340118, + "rewards/accuracy_reward": 0.0416666679084301, + "rewards/format_reward": 0.0, + "step": 690 + }, + { + "completion_length": 807.8333435058594, + "epoch": 0.8426829268292683, + "grad_norm": 0.3258882164955139, + "kl": 0.0267333984375, + "learning_rate": 2.205395637324264e-07, + "loss": -0.0123, + "reward": 0.20833333395421505, + "reward_std": 0.10825317353010178, + "rewards/accuracy_reward": 0.20833333395421505, + "rewards/format_reward": 0.0, + "step": 691 + }, + { + "completion_length": 652.2291870117188, + "epoch": 0.8439024390243902, + "grad_norm": 0.5457414984703064, + "kl": 0.029541015625, + "learning_rate": 2.1721830689457583e-07, + "loss": 0.0421, + "reward": 0.1250000037252903, + "reward_std": 0.18042195588350296, + "rewards/accuracy_reward": 0.1250000037252903, + "rewards/format_reward": 0.0, + "step": 692 + }, + { + "completion_length": 672.4166870117188, + "epoch": 0.8451219512195122, + "grad_norm": 0.4368482828140259, + "kl": 0.02880859375, + "learning_rate": 2.1392029556888576e-07, + "loss": 0.0331, + "reward": 0.1875, + "reward_std": 0.07216878235340118, + "rewards/accuracy_reward": 0.1875, + "rewards/format_reward": 0.0, + "step": 693 + }, + { + "completion_length": 723.6458435058594, + "epoch": 0.8463414634146341, + "grad_norm": 0.41581639647483826, + "kl": 0.02862548828125, + "learning_rate": 2.1064558951919854e-07, + "loss": 0.0154, + "reward": 0.3333333432674408, + "reward_std": 0.10825317353010178, + "rewards/accuracy_reward": 0.3333333432674408, + "rewards/format_reward": 0.0, + "step": 694 + }, + { + "completion_length": 751.6875, + "epoch": 0.8475609756097561, + "grad_norm": 0.2076808363199234, + "kl": 0.0267333984375, + "learning_rate": 2.0739424808703638e-07, + "loss": -0.0015, + "reward": 0.0416666679084301, + "reward_std": 0.03608439117670059, + "rewards/accuracy_reward": 0.0416666679084301, + "rewards/format_reward": 0.0, + "step": 695 + }, + { + "completion_length": 965.2708435058594, + "epoch": 0.848780487804878, + "grad_norm": 0.1890517622232437, + "kl": 0.085205078125, + "learning_rate": 2.0416633019052882e-07, + "loss": -0.0136, + "reward": 0.0833333358168602, + "reward_std": 0.03608439117670059, + "rewards/accuracy_reward": 0.0833333358168602, + "rewards/format_reward": 0.0, + "step": 696 + }, + { + "completion_length": 715.3333435058594, + "epoch": 0.85, + "grad_norm": 0.3903053402900696, + "kl": 0.0257568359375, + "learning_rate": 2.0096189432334195e-07, + "loss": -0.0048, + "reward": 0.12500000558793545, + "reward_std": 0.10825317353010178, + "rewards/accuracy_reward": 0.12500000558793545, + "rewards/format_reward": 0.0, + "step": 697 + }, + { + "completion_length": 1027.1250305175781, + "epoch": 0.8512195121951219, + "grad_norm": 0.1765126883983612, + "kl": 0.0223388671875, + "learning_rate": 1.9778099855362085e-07, + "loss": -0.0027, + "reward": 0.1666666716337204, + "reward_std": 0.03608439117670059, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 698 + }, + { + "completion_length": 798.6458740234375, + "epoch": 0.8524390243902439, + "grad_norm": 0.5328000783920288, + "kl": 0.02410888671875, + "learning_rate": 1.9462370052293544e-07, + "loss": 0.005, + "reward": 0.1666666716337204, + "reward_std": 0.14433756470680237, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 699 + }, + { + "completion_length": 913.8750305175781, + "epoch": 0.8536585365853658, + "grad_norm": 0.8566571474075317, + "kl": 0.0328369140625, + "learning_rate": 1.9149005744523757e-07, + "loss": 0.0011, + "reward": 0.0625, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0625, + "rewards/format_reward": 0.0, + "step": 700 + }, + { + "completion_length": 729.0833435058594, + "epoch": 0.8548780487804878, + "grad_norm": 0.45158228278160095, + "kl": 0.030029296875, + "learning_rate": 1.8838012610582356e-07, + "loss": 0.0429, + "reward": 0.1458333358168602, + "reward_std": 0.14433756470680237, + "rewards/accuracy_reward": 0.1458333358168602, + "rewards/format_reward": 0.0, + "step": 701 + }, + { + "completion_length": 836.5416870117188, + "epoch": 0.8560975609756097, + "grad_norm": 0.3114115595817566, + "kl": 0.0238037109375, + "learning_rate": 1.852939628603046e-07, + "loss": -0.0105, + "reward": 0.1041666679084301, + "reward_std": 0.07216878235340118, + "rewards/accuracy_reward": 0.1041666679084301, + "rewards/format_reward": 0.0, + "step": 702 + }, + { + "completion_length": 730.625, + "epoch": 0.8573170731707317, + "grad_norm": 0.3165081739425659, + "kl": 0.02349853515625, + "learning_rate": 1.822316236335867e-07, + "loss": -0.0146, + "reward": 0.1666666716337204, + "reward_std": 0.07216878235340118, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 703 + }, + { + "completion_length": 1010.5416870117188, + "epoch": 0.8585365853658536, + "grad_norm": 0.25261008739471436, + "kl": 0.0235595703125, + "learning_rate": 1.7919316391885593e-07, + "loss": 0.0463, + "reward": 0.1875, + "reward_std": 0.07216878235340118, + "rewards/accuracy_reward": 0.1875, + "rewards/format_reward": 0.0, + "step": 704 + }, + { + "completion_length": 785.5625305175781, + "epoch": 0.8597560975609756, + "grad_norm": 0.3985195755958557, + "kl": 0.0279541015625, + "learning_rate": 1.761786387765743e-07, + "loss": -0.0239, + "reward": 0.2500000149011612, + "reward_std": 0.07216878235340118, + "rewards/accuracy_reward": 0.2500000149011612, + "rewards/format_reward": 0.0, + "step": 705 + }, + { + "completion_length": 849.5, + "epoch": 0.8609756097560975, + "grad_norm": 0.37897011637687683, + "kl": 0.024169921875, + "learning_rate": 1.731881028334808e-07, + "loss": 0.0273, + "reward": 0.10416666977107525, + "reward_std": 0.07216878235340118, + "rewards/accuracy_reward": 0.10416666977107525, + "rewards/format_reward": 0.0, + "step": 706 + }, + { + "completion_length": 1048.3125, + "epoch": 0.8621951219512195, + "grad_norm": 0.1622416377067566, + "kl": 0.02752685546875, + "learning_rate": 1.7022161028160244e-07, + "loss": 0.0162, + "reward": 0.08333333395421505, + "reward_std": 0.03608439117670059, + "rewards/accuracy_reward": 0.08333333395421505, + "rewards/format_reward": 0.0, + "step": 707 + }, + { + "completion_length": 802.9166870117188, + "epoch": 0.8634146341463415, + "grad_norm": 0.4028719365596771, + "kl": 0.0225830078125, + "learning_rate": 1.6727921487727095e-07, + "loss": 0.0212, + "reward": 0.2916666716337204, + "reward_std": 0.07216878235340118, + "rewards/accuracy_reward": 0.2916666716337204, + "rewards/format_reward": 0.0, + "step": 708 + }, + { + "completion_length": 723.5416870117188, + "epoch": 0.8646341463414634, + "grad_norm": 0.2419525682926178, + "kl": 0.02032470703125, + "learning_rate": 1.64360969940149e-07, + "loss": -0.0051, + "reward": 0.20833333395421505, + "reward_std": 0.03608439117670059, + "rewards/accuracy_reward": 0.20833333395421505, + "rewards/format_reward": 0.0, + "step": 709 + }, + { + "completion_length": 1010.3125, + "epoch": 0.8658536585365854, + "grad_norm": 0.04453768953680992, + "kl": 0.0252685546875, + "learning_rate": 1.6146692835226669e-07, + "loss": 0.0009, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 710 + }, + { + "completion_length": 1053.875, + "epoch": 0.8670731707317073, + "grad_norm": 0.3233415186405182, + "kl": 0.0244140625, + "learning_rate": 1.5859714255705843e-07, + "loss": 0.0632, + "reward": 0.2083333395421505, + "reward_std": 0.07216878235340118, + "rewards/accuracy_reward": 0.2083333395421505, + "rewards/format_reward": 0.0, + "step": 711 + }, + { + "completion_length": 745.6041870117188, + "epoch": 0.8682926829268293, + "grad_norm": 0.3168937861919403, + "kl": 0.0281982421875, + "learning_rate": 1.5575166455841677e-07, + "loss": 0.048, + "reward": 0.1458333358168602, + "reward_std": 0.03608439117670059, + "rewards/accuracy_reward": 0.1458333358168602, + "rewards/format_reward": 0.0, + "step": 712 + }, + { + "completion_length": 700.4166870117188, + "epoch": 0.8695121951219512, + "grad_norm": 0.4607069194316864, + "kl": 0.02447509765625, + "learning_rate": 1.5293054591974726e-07, + "loss": -0.0158, + "reward": 0.1875000111758709, + "reward_std": 0.10825317353010178, + "rewards/accuracy_reward": 0.1875000111758709, + "rewards/format_reward": 0.0, + "step": 713 + }, + { + "completion_length": 1071.3125305175781, + "epoch": 0.8707317073170732, + "grad_norm": 0.27966293692588806, + "kl": 0.020263671875, + "learning_rate": 1.501338377630362e-07, + "loss": 0.0557, + "reward": 0.1875, + "reward_std": 0.07216878235340118, + "rewards/accuracy_reward": 0.1875, + "rewards/format_reward": 0.0, + "step": 714 + }, + { + "completion_length": 937.6458435058594, + "epoch": 0.8719512195121951, + "grad_norm": 0.27274319529533386, + "kl": 0.04486083984375, + "learning_rate": 1.473615907679229e-07, + "loss": 0.0042, + "reward": 0.125, + "reward_std": 0.07216878235340118, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 0.0, + "step": 715 + }, + { + "completion_length": 880.2500305175781, + "epoch": 0.8731707317073171, + "grad_norm": 0.3870413899421692, + "kl": 0.024169921875, + "learning_rate": 1.446138551707814e-07, + "loss": -0.0014, + "reward": 0.2083333395421505, + "reward_std": 0.10825317353010178, + "rewards/accuracy_reward": 0.2083333395421505, + "rewards/format_reward": 0.0, + "step": 716 + }, + { + "completion_length": 1127.7916870117188, + "epoch": 0.874390243902439, + "grad_norm": 0.1338748186826706, + "kl": 0.0244140625, + "learning_rate": 1.4189068076381078e-07, + "loss": 0.0268, + "reward": 0.02083333395421505, + "reward_std": 0.03608439117670059, + "rewards/accuracy_reward": 0.02083333395421505, + "rewards/format_reward": 0.0, + "step": 717 + }, + { + "completion_length": 919.7708435058594, + "epoch": 0.875609756097561, + "grad_norm": 0.27817457914352417, + "kl": 0.020477294921875, + "learning_rate": 1.3919211689413207e-07, + "loss": 0.0074, + "reward": 0.2708333432674408, + "reward_std": 0.03608439117670059, + "rewards/accuracy_reward": 0.2708333432674408, + "rewards/format_reward": 0.0, + "step": 718 + }, + { + "completion_length": 723.7291870117188, + "epoch": 0.8768292682926829, + "grad_norm": 0.06823945790529251, + "kl": 0.02557373046875, + "learning_rate": 1.365182124628949e-07, + "loss": 0.001, + "reward": 0.1875, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.1875, + "rewards/format_reward": 0.0, + "step": 719 + }, + { + "completion_length": 1069.0000610351562, + "epoch": 0.8780487804878049, + "grad_norm": 0.13704067468643188, + "kl": 0.0267333984375, + "learning_rate": 1.3386901592439071e-07, + "loss": 0.0003, + "reward": 0.1458333432674408, + "reward_std": 0.03608439117670059, + "rewards/accuracy_reward": 0.1458333432674408, + "rewards/format_reward": 0.0, + "step": 720 + }, + { + "completion_length": 632.8125305175781, + "epoch": 0.8792682926829268, + "grad_norm": 0.2622615098953247, + "kl": 0.0281982421875, + "learning_rate": 1.3124457528517503e-07, + "loss": 0.0065, + "reward": 0.08333333395421505, + "reward_std": 0.03608439117670059, + "rewards/accuracy_reward": 0.08333333395421505, + "rewards/format_reward": 0.0, + "step": 721 + }, + { + "completion_length": 881.5416870117188, + "epoch": 0.8804878048780488, + "grad_norm": 0.24476896226406097, + "kl": 0.02850341796875, + "learning_rate": 1.2864493810319676e-07, + "loss": 0.0161, + "reward": 0.02083333395421505, + "reward_std": 0.03608439117670059, + "rewards/accuracy_reward": 0.02083333395421505, + "rewards/format_reward": 0.0, + "step": 722 + }, + { + "completion_length": 832.8750305175781, + "epoch": 0.8817073170731707, + "grad_norm": 0.41959550976753235, + "kl": 0.02484130859375, + "learning_rate": 1.260701514869379e-07, + "loss": 0.0916, + "reward": 0.2708333432674408, + "reward_std": 0.10825317353010178, + "rewards/accuracy_reward": 0.2708333432674408, + "rewards/format_reward": 0.0, + "step": 723 + }, + { + "completion_length": 1051.7083435058594, + "epoch": 0.8829268292682927, + "grad_norm": 0.37093260884284973, + "kl": 0.0228271484375, + "learning_rate": 1.2352026209455808e-07, + "loss": -0.0032, + "reward": 0.06250000186264515, + "reward_std": 0.10825317353010178, + "rewards/accuracy_reward": 0.06250000186264515, + "rewards/format_reward": 0.0, + "step": 724 + }, + { + "completion_length": 1032.4791870117188, + "epoch": 0.8841463414634146, + "grad_norm": 0.4205034673213959, + "kl": 0.02294921875, + "learning_rate": 1.209953161330507e-07, + "loss": 0.013, + "reward": 0.2500000111758709, + "reward_std": 0.10825317353010178, + "rewards/accuracy_reward": 0.2500000111758709, + "rewards/format_reward": 0.0, + "step": 725 + }, + { + "completion_length": 672.6458435058594, + "epoch": 0.8853658536585366, + "grad_norm": 0.32758957147598267, + "kl": 0.02978515625, + "learning_rate": 1.1849535935740474e-07, + "loss": 0.0171, + "reward": 0.1458333432674408, + "reward_std": 0.03608439117670059, + "rewards/accuracy_reward": 0.1458333432674408, + "rewards/format_reward": 0.0, + "step": 726 + }, + { + "completion_length": 1033.6875305175781, + "epoch": 0.8865853658536585, + "grad_norm": 0.2726954221725464, + "kl": 0.0230712890625, + "learning_rate": 1.1602043706977538e-07, + "loss": 0.0574, + "reward": 0.0416666679084301, + "reward_std": 0.03608439117670059, + "rewards/accuracy_reward": 0.0416666679084301, + "rewards/format_reward": 0.0, + "step": 727 + }, + { + "completion_length": 668.8958435058594, + "epoch": 0.8878048780487805, + "grad_norm": 0.4613305628299713, + "kl": 0.02630615234375, + "learning_rate": 1.1357059411866355e-07, + "loss": 0.0132, + "reward": 0.2500000074505806, + "reward_std": 0.07216878235340118, + "rewards/accuracy_reward": 0.2500000074505806, + "rewards/format_reward": 0.0, + "step": 728 + }, + { + "completion_length": 1183.0416870117188, + "epoch": 0.8890243902439025, + "grad_norm": 0.3565730154514313, + "kl": 0.0208740234375, + "learning_rate": 1.1114587489810352e-07, + "loss": 0.0297, + "reward": 0.18750000558793545, + "reward_std": 0.10825317353010178, + "rewards/accuracy_reward": 0.18750000558793545, + "rewards/format_reward": 0.0, + "step": 729 + }, + { + "completion_length": 764.0833435058594, + "epoch": 0.8902439024390244, + "grad_norm": 0.38788408041000366, + "kl": 0.02801513671875, + "learning_rate": 1.0874632334685808e-07, + "loss": 0.0557, + "reward": 0.1875000074505806, + "reward_std": 0.10825317353010178, + "rewards/accuracy_reward": 0.1875000074505806, + "rewards/format_reward": 0.0, + "step": 730 + }, + { + "completion_length": 773.1250305175781, + "epoch": 0.8914634146341464, + "grad_norm": 0.16996777057647705, + "kl": 0.02484130859375, + "learning_rate": 1.0637198294762152e-07, + "loss": 0.0126, + "reward": 0.02083333395421505, + "reward_std": 0.03608439117670059, + "rewards/accuracy_reward": 0.02083333395421505, + "rewards/format_reward": 0.0, + "step": 731 + }, + { + "completion_length": 1004.3958435058594, + "epoch": 0.8926829268292683, + "grad_norm": 0.27469712495803833, + "kl": 0.0269775390625, + "learning_rate": 1.0402289672623272e-07, + "loss": 0.0084, + "reward": 0.08333333395421505, + "reward_std": 0.03608439117670059, + "rewards/accuracy_reward": 0.08333333395421505, + "rewards/format_reward": 0.0, + "step": 732 + }, + { + "completion_length": 1066.2291870117188, + "epoch": 0.8939024390243903, + "grad_norm": 0.09178400784730911, + "kl": 0.02496337890625, + "learning_rate": 1.0169910725089548e-07, + "loss": 0.0009, + "reward": 0.1875, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.1875, + "rewards/format_reward": 0.0, + "step": 733 + }, + { + "completion_length": 953.7708740234375, + "epoch": 0.8951219512195122, + "grad_norm": 0.2735711634159088, + "kl": 0.023681640625, + "learning_rate": 9.940065663140663e-08, + "loss": 0.0439, + "reward": 0.0416666679084301, + "reward_std": 0.07216878235340118, + "rewards/accuracy_reward": 0.0416666679084301, + "rewards/format_reward": 0.0, + "step": 734 + }, + { + "completion_length": 837.6042175292969, + "epoch": 0.8963414634146342, + "grad_norm": 0.47646215558052063, + "kl": 0.02392578125, + "learning_rate": 9.71275865183936e-08, + "loss": 0.0015, + "reward": 0.2083333358168602, + "reward_std": 0.03608439117670059, + "rewards/accuracy_reward": 0.2083333358168602, + "rewards/format_reward": 0.0, + "step": 735 + }, + { + "completion_length": 929.4166870117188, + "epoch": 0.8975609756097561, + "grad_norm": 0.43811649084091187, + "kl": 0.02923583984375, + "learning_rate": 9.487993810255823e-08, + "loss": 0.0975, + "reward": 0.1875, + "reward_std": 0.18042195588350296, + "rewards/accuracy_reward": 0.1875, + "rewards/format_reward": 0.0, + "step": 736 + }, + { + "completion_length": 994.0833740234375, + "epoch": 0.8987804878048781, + "grad_norm": 1.1990416049957275, + "kl": 0.0303955078125, + "learning_rate": 9.265775211393224e-08, + "loss": -0.0442, + "reward": 0.1875000074505806, + "reward_std": 0.10825317353010178, + "rewards/accuracy_reward": 0.1875000074505806, + "rewards/format_reward": 0.0, + "step": 737 + }, + { + "completion_length": 819.0000305175781, + "epoch": 0.9, + "grad_norm": 0.17224682867527008, + "kl": 0.02337646484375, + "learning_rate": 9.046106882113752e-08, + "loss": -0.0084, + "reward": 0.14583333395421505, + "reward_std": 0.03608439117670059, + "rewards/accuracy_reward": 0.14583333395421505, + "rewards/format_reward": 0.0, + "step": 738 + }, + { + "completion_length": 1121.4583435058594, + "epoch": 0.901219512195122, + "grad_norm": 0.3978723883628845, + "kl": 0.02642822265625, + "learning_rate": 8.828992803065772e-08, + "loss": -0.0758, + "reward": 0.1875000074505806, + "reward_std": 0.14433756470680237, + "rewards/accuracy_reward": 0.1875000074505806, + "rewards/format_reward": 0.0, + "step": 739 + }, + { + "completion_length": 716.4583435058594, + "epoch": 0.9024390243902439, + "grad_norm": 0.7046301364898682, + "kl": 0.03155517578125, + "learning_rate": 8.614436908611617e-08, + "loss": 0.0477, + "reward": 0.3333333358168602, + "reward_std": 0.21650634706020355, + "rewards/accuracy_reward": 0.3333333358168602, + "rewards/format_reward": 0.0, + "step": 740 + }, + { + "completion_length": 1084.0208740234375, + "epoch": 0.9036585365853659, + "grad_norm": 0.5535774827003479, + "kl": 0.031982421875, + "learning_rate": 8.402443086756273e-08, + "loss": -0.1231, + "reward": 0.08333333395421505, + "reward_std": 0.10825317353010178, + "rewards/accuracy_reward": 0.08333333395421505, + "rewards/format_reward": 0.0, + "step": 741 + }, + { + "completion_length": 919.0625305175781, + "epoch": 0.9048780487804878, + "grad_norm": 0.23134127259254456, + "kl": 0.02215576171875, + "learning_rate": 8.193015179076996e-08, + "loss": 0.0253, + "reward": 0.1458333358168602, + "reward_std": 0.03608439117670059, + "rewards/accuracy_reward": 0.1458333358168602, + "rewards/format_reward": 0.0, + "step": 742 + }, + { + "completion_length": 807.6875, + "epoch": 0.9060975609756098, + "grad_norm": 0.04322041571140289, + "kl": 0.02398681640625, + "learning_rate": 7.986156980653653e-08, + "loss": 0.0009, + "reward": 0.0625, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0625, + "rewards/format_reward": 0.0, + "step": 743 + }, + { + "completion_length": 938.7292175292969, + "epoch": 0.9073170731707317, + "grad_norm": 0.21795502305030823, + "kl": 0.02130126953125, + "learning_rate": 7.781872239999993e-08, + "loss": -0.0017, + "reward": 0.1458333432674408, + "reward_std": 0.03608439117670059, + "rewards/accuracy_reward": 0.1458333432674408, + "rewards/format_reward": 0.0, + "step": 744 + }, + { + "completion_length": 762.6875, + "epoch": 0.9085365853658537, + "grad_norm": 0.2852475345134735, + "kl": 0.0247802734375, + "learning_rate": 7.580164658995603e-08, + "loss": 0.0202, + "reward": 0.10416666977107525, + "reward_std": 0.07216878235340118, + "rewards/accuracy_reward": 0.10416666977107525, + "rewards/format_reward": 0.0, + "step": 745 + }, + { + "completion_length": 974.8333435058594, + "epoch": 0.9097560975609756, + "grad_norm": 0.24209746718406677, + "kl": 0.02294921875, + "learning_rate": 7.381037892818959e-08, + "loss": -0.0242, + "reward": 0.1250000037252903, + "reward_std": 0.07216878235340118, + "rewards/accuracy_reward": 0.1250000037252903, + "rewards/format_reward": 0.0, + "step": 746 + }, + { + "completion_length": 886.3125, + "epoch": 0.9109756097560976, + "grad_norm": 0.47765588760375977, + "kl": 0.02490234375, + "learning_rate": 7.184495549881131e-08, + "loss": -0.0703, + "reward": 0.1458333395421505, + "reward_std": 0.14433756470680237, + "rewards/accuracy_reward": 0.1458333395421505, + "rewards/format_reward": 0.0, + "step": 747 + }, + { + "completion_length": 766.6458435058594, + "epoch": 0.9121951219512195, + "grad_norm": 0.3742899000644684, + "kl": 0.02581787109375, + "learning_rate": 6.990541191760418e-08, + "loss": -0.004, + "reward": 0.3541666716337204, + "reward_std": 0.03608439117670059, + "rewards/accuracy_reward": 0.3541666716337204, + "rewards/format_reward": 0.0, + "step": 748 + }, + { + "completion_length": 741.5000305175781, + "epoch": 0.9134146341463415, + "grad_norm": 0.4017605781555176, + "kl": 0.029052734375, + "learning_rate": 6.799178333137784e-08, + "loss": 0.0276, + "reward": 0.0625, + "reward_std": 0.07216878235340118, + "rewards/accuracy_reward": 0.0625, + "rewards/format_reward": 0.0, + "step": 749 + }, + { + "completion_length": 1010.5833740234375, + "epoch": 0.9146341463414634, + "grad_norm": 0.27121710777282715, + "kl": 0.0203857421875, + "learning_rate": 6.610410441733156e-08, + "loss": 0.0389, + "reward": 0.0625, + "reward_std": 0.07216878235340118, + "rewards/accuracy_reward": 0.0625, + "rewards/format_reward": 0.0, + "step": 750 + }, + { + "completion_length": 790.3333740234375, + "epoch": 0.9158536585365854, + "grad_norm": 0.05629832670092583, + "kl": 0.02557373046875, + "learning_rate": 6.424240938242643e-08, + "loss": 0.0009, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 751 + }, + { + "completion_length": 862.3125, + "epoch": 0.9170731707317074, + "grad_norm": 0.24488425254821777, + "kl": 0.0301513671875, + "learning_rate": 6.24067319627642e-08, + "loss": 0.0261, + "reward": 0.0833333358168602, + "reward_std": 0.03608439117670059, + "rewards/accuracy_reward": 0.0833333358168602, + "rewards/format_reward": 0.0, + "step": 752 + }, + { + "completion_length": 889.4791870117188, + "epoch": 0.9182926829268293, + "grad_norm": 0.47951433062553406, + "kl": 0.0274658203125, + "learning_rate": 6.059710542297824e-08, + "loss": 0.011, + "reward": 0.12500000558793545, + "reward_std": 0.07216878235340118, + "rewards/accuracy_reward": 0.12500000558793545, + "rewards/format_reward": 0.0, + "step": 753 + }, + { + "completion_length": 845.4583435058594, + "epoch": 0.9195121951219513, + "grad_norm": 0.33234408497810364, + "kl": 0.02655029296875, + "learning_rate": 5.8813562555628585e-08, + "loss": -0.0212, + "reward": 0.0416666679084301, + "reward_std": 0.03608439117670059, + "rewards/accuracy_reward": 0.0416666679084301, + "rewards/format_reward": 0.0, + "step": 754 + }, + { + "completion_length": 733.3541870117188, + "epoch": 0.9207317073170732, + "grad_norm": 0.6557818651199341, + "kl": 0.031005859375, + "learning_rate": 5.7056135680607965e-08, + "loss": 0.046, + "reward": 0.3125, + "reward_std": 0.18042195588350296, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 0.0, + "step": 755 + }, + { + "completion_length": 630.2083435058594, + "epoch": 0.9219512195121952, + "grad_norm": 0.6298221945762634, + "kl": 0.02850341796875, + "learning_rate": 5.532485664455755e-08, + "loss": 0.0159, + "reward": 0.2708333432674408, + "reward_std": 0.14433756470680237, + "rewards/accuracy_reward": 0.2708333432674408, + "rewards/format_reward": 0.0, + "step": 756 + }, + { + "completion_length": 761.6875305175781, + "epoch": 0.9231707317073171, + "grad_norm": 0.6046322584152222, + "kl": 0.032470703125, + "learning_rate": 5.3619756820288525e-08, + "loss": -0.0381, + "reward": 0.3541666716337204, + "reward_std": 0.14433756470680237, + "rewards/accuracy_reward": 0.3541666716337204, + "rewards/format_reward": 0.0, + "step": 757 + }, + { + "completion_length": 716.9375305175781, + "epoch": 0.9243902439024391, + "grad_norm": 0.4098731279373169, + "kl": 0.027587890625, + "learning_rate": 5.194086710621404e-08, + "loss": 0.0823, + "reward": 0.125, + "reward_std": 0.10825317353010178, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 0.0, + "step": 758 + }, + { + "completion_length": 937.5, + "epoch": 0.925609756097561, + "grad_norm": 0.37171196937561035, + "kl": 0.02362060546875, + "learning_rate": 5.0288217925789025e-08, + "loss": 0.0248, + "reward": 0.1666666716337204, + "reward_std": 0.07216878235340118, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 759 + }, + { + "completion_length": 794.0208740234375, + "epoch": 0.926829268292683, + "grad_norm": 0.20126201212406158, + "kl": 0.0208740234375, + "learning_rate": 4.86618392269596e-08, + "loss": -0.007, + "reward": 0.1458333358168602, + "reward_std": 0.03608439117670059, + "rewards/accuracy_reward": 0.1458333358168602, + "rewards/format_reward": 0.0, + "step": 760 + }, + { + "completion_length": 825.875, + "epoch": 0.9280487804878049, + "grad_norm": 0.6448650360107422, + "kl": 0.03265380859375, + "learning_rate": 4.70617604816192e-08, + "loss": 0.0139, + "reward": 0.2291666716337204, + "reward_std": 0.14433756470680237, + "rewards/accuracy_reward": 0.2291666716337204, + "rewards/format_reward": 0.0, + "step": 761 + }, + { + "completion_length": 883.6666870117188, + "epoch": 0.9292682926829269, + "grad_norm": 0.42482876777648926, + "kl": 0.02520751953125, + "learning_rate": 4.54880106850758e-08, + "loss": -0.0098, + "reward": 0.12500000558793545, + "reward_std": 0.10825317353010178, + "rewards/accuracy_reward": 0.12500000558793545, + "rewards/format_reward": 0.0, + "step": 762 + }, + { + "completion_length": 1032.2708435058594, + "epoch": 0.9304878048780488, + "grad_norm": 0.46651068329811096, + "kl": 0.0216064453125, + "learning_rate": 4.394061835552554e-08, + "loss": -0.0285, + "reward": 0.10416666977107525, + "reward_std": 0.10825317353010178, + "rewards/accuracy_reward": 0.10416666977107525, + "rewards/format_reward": 0.0, + "step": 763 + }, + { + "completion_length": 801.9375305175781, + "epoch": 0.9317073170731708, + "grad_norm": 0.04411710798740387, + "kl": 0.0208740234375, + "learning_rate": 4.2419611533536296e-08, + "loss": 0.0007, + "reward": 0.1875, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.1875, + "rewards/format_reward": 0.0, + "step": 764 + }, + { + "completion_length": 791.7916870117188, + "epoch": 0.9329268292682927, + "grad_norm": 0.41850683093070984, + "kl": 0.026123046875, + "learning_rate": 4.0925017781539896e-08, + "loss": 0.0028, + "reward": 0.1458333358168602, + "reward_std": 0.10825317353010178, + "rewards/accuracy_reward": 0.1458333358168602, + "rewards/format_reward": 0.0, + "step": 765 + }, + { + "completion_length": 867.875, + "epoch": 0.9341463414634147, + "grad_norm": 0.04630811884999275, + "kl": 0.02459716796875, + "learning_rate": 3.9456864183331557e-08, + "loss": 0.0009, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 766 + }, + { + "completion_length": 806.6458435058594, + "epoch": 0.9353658536585366, + "grad_norm": 0.04593589901924133, + "kl": 0.02264404296875, + "learning_rate": 3.80151773435804e-08, + "loss": 0.0008, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 767 + }, + { + "completion_length": 833.9791870117188, + "epoch": 0.9365853658536586, + "grad_norm": 0.2245476394891739, + "kl": 0.02105712890625, + "learning_rate": 3.659998338734671e-08, + "loss": 0.0015, + "reward": 0.0833333358168602, + "reward_std": 0.03608439117670059, + "rewards/accuracy_reward": 0.0833333358168602, + "rewards/format_reward": 0.0, + "step": 768 + }, + { + "completion_length": 763.0625, + "epoch": 0.9378048780487804, + "grad_norm": 0.0570383220911026, + "kl": 0.0294189453125, + "learning_rate": 3.5211307959608475e-08, + "loss": 0.001, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 769 + }, + { + "completion_length": 796.9375, + "epoch": 0.9390243902439024, + "grad_norm": 0.28452613949775696, + "kl": 0.0203857421875, + "learning_rate": 3.3849176224796884e-08, + "loss": -0.0315, + "reward": 0.1875000074505806, + "reward_std": 0.07216878235340118, + "rewards/accuracy_reward": 0.1875000074505806, + "rewards/format_reward": 0.0, + "step": 770 + }, + { + "completion_length": 746.1458435058594, + "epoch": 0.9402439024390243, + "grad_norm": 0.5315723419189453, + "kl": 0.0302734375, + "learning_rate": 3.2513612866339916e-08, + "loss": 0.0077, + "reward": 0.125, + "reward_std": 0.14433756470680237, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 0.0, + "step": 771 + }, + { + "completion_length": 879.4375610351562, + "epoch": 0.9414634146341463, + "grad_norm": 0.756249725818634, + "kl": 0.0343017578125, + "learning_rate": 3.1204642086215817e-08, + "loss": -0.0351, + "reward": 0.1666666716337204, + "reward_std": 0.18042195588350296, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 772 + }, + { + "completion_length": 782.3750305175781, + "epoch": 0.9426829268292682, + "grad_norm": 0.28776443004608154, + "kl": 0.022216796875, + "learning_rate": 2.992228760451349e-08, + "loss": 0.0504, + "reward": 0.1666666679084301, + "reward_std": 0.03608439117670059, + "rewards/accuracy_reward": 0.1666666679084301, + "rewards/format_reward": 0.0, + "step": 773 + }, + { + "completion_length": 723.0833740234375, + "epoch": 0.9439024390243902, + "grad_norm": 0.5710537433624268, + "kl": 0.0245361328125, + "learning_rate": 2.8666572659003965e-08, + "loss": -0.0, + "reward": 0.1250000037252903, + "reward_std": 0.18042195588350296, + "rewards/accuracy_reward": 0.1250000037252903, + "rewards/format_reward": 0.0, + "step": 774 + }, + { + "completion_length": 816.8125305175781, + "epoch": 0.9451219512195121, + "grad_norm": 0.343589723110199, + "kl": 0.01898193359375, + "learning_rate": 2.743752000471761e-08, + "loss": 0.0147, + "reward": 0.2916666716337204, + "reward_std": 0.03608439117670059, + "rewards/accuracy_reward": 0.2916666716337204, + "rewards/format_reward": 0.0, + "step": 775 + }, + { + "completion_length": 814.2291870117188, + "epoch": 0.9463414634146341, + "grad_norm": 0.40398523211479187, + "kl": 0.0257568359375, + "learning_rate": 2.6235151913533595e-08, + "loss": 0.0236, + "reward": 0.1041666716337204, + "reward_std": 0.07216878235340118, + "rewards/accuracy_reward": 0.1041666716337204, + "rewards/format_reward": 0.0, + "step": 776 + }, + { + "completion_length": 745.4791870117188, + "epoch": 0.947560975609756, + "grad_norm": 0.6614434719085693, + "kl": 0.02471923828125, + "learning_rate": 2.50594901737749e-08, + "loss": 0.0419, + "reward": 0.12500000558793545, + "reward_std": 0.18042195588350296, + "rewards/accuracy_reward": 0.12500000558793545, + "rewards/format_reward": 0.0, + "step": 777 + }, + { + "completion_length": 758.8333435058594, + "epoch": 0.948780487804878, + "grad_norm": 0.2255949229001999, + "kl": 0.02142333984375, + "learning_rate": 2.3910556089814294e-08, + "loss": 0.0001, + "reward": 0.20833333395421505, + "reward_std": 0.03608439117670059, + "rewards/accuracy_reward": 0.20833333395421505, + "rewards/format_reward": 0.0, + "step": 778 + }, + { + "completion_length": 945.1875610351562, + "epoch": 0.95, + "grad_norm": 0.42945098876953125, + "kl": 0.0296630859375, + "learning_rate": 2.278837048168797e-08, + "loss": 0.0276, + "reward": 0.1250000037252903, + "reward_std": 0.14433756470680237, + "rewards/accuracy_reward": 0.1250000037252903, + "rewards/format_reward": 0.0, + "step": 779 + }, + { + "completion_length": 918.0000305175781, + "epoch": 0.9512195121951219, + "grad_norm": 0.04331444576382637, + "kl": 0.01953125, + "learning_rate": 2.1692953684718187e-08, + "loss": 0.0008, + "reward": 0.0625, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0625, + "rewards/format_reward": 0.0, + "step": 780 + }, + { + "completion_length": 679.4791870117188, + "epoch": 0.9524390243902439, + "grad_norm": 0.41175445914268494, + "kl": 0.02374267578125, + "learning_rate": 2.0624325549144894e-08, + "loss": 0.0085, + "reward": 0.1458333358168602, + "reward_std": 0.10825317353010178, + "rewards/accuracy_reward": 0.1458333358168602, + "rewards/format_reward": 0.0, + "step": 781 + }, + { + "completion_length": 994.3125610351562, + "epoch": 0.9536585365853658, + "grad_norm": 0.17546556890010834, + "kl": 0.02337646484375, + "learning_rate": 1.9582505439766028e-08, + "loss": 0.0414, + "reward": 0.02083333395421505, + "reward_std": 0.03608439117670059, + "rewards/accuracy_reward": 0.02083333395421505, + "rewards/format_reward": 0.0, + "step": 782 + }, + { + "completion_length": 831.7291870117188, + "epoch": 0.9548780487804878, + "grad_norm": 0.4530733823776245, + "kl": 0.0316162109375, + "learning_rate": 1.856751223558695e-08, + "loss": -0.0156, + "reward": 0.2291666716337204, + "reward_std": 0.14433756470680237, + "rewards/accuracy_reward": 0.2291666716337204, + "rewards/format_reward": 0.0, + "step": 783 + }, + { + "completion_length": 720.4583435058594, + "epoch": 0.9560975609756097, + "grad_norm": 0.41587570309638977, + "kl": 0.0303955078125, + "learning_rate": 1.7579364329477375e-08, + "loss": 0.0231, + "reward": 0.12500000558793545, + "reward_std": 0.10825317353010178, + "rewards/accuracy_reward": 0.12500000558793545, + "rewards/format_reward": 0.0, + "step": 784 + }, + { + "completion_length": 896.4791870117188, + "epoch": 0.9573170731707317, + "grad_norm": 0.17405299842357635, + "kl": 0.02294921875, + "learning_rate": 1.661807962783851e-08, + "loss": 0.0575, + "reward": 0.14583333395421505, + "reward_std": 0.03608439117670059, + "rewards/accuracy_reward": 0.14583333395421505, + "rewards/format_reward": 0.0, + "step": 785 + }, + { + "completion_length": 773.2708435058594, + "epoch": 0.9585365853658536, + "grad_norm": 0.39356529712677, + "kl": 0.0245361328125, + "learning_rate": 1.5683675550279943e-08, + "loss": 0.0176, + "reward": 0.1666666679084301, + "reward_std": 0.10825317353010178, + "rewards/accuracy_reward": 0.1666666679084301, + "rewards/format_reward": 0.0, + "step": 786 + }, + { + "completion_length": 956.1250305175781, + "epoch": 0.9597560975609756, + "grad_norm": 0.3414008915424347, + "kl": 0.02886962890625, + "learning_rate": 1.4776169029301234e-08, + "loss": 0.0331, + "reward": 0.2916666716337204, + "reward_std": 0.07216878235340118, + "rewards/accuracy_reward": 0.2916666716337204, + "rewards/format_reward": 0.0, + "step": 787 + }, + { + "completion_length": 773.2291870117188, + "epoch": 0.9609756097560975, + "grad_norm": 0.34748509526252747, + "kl": 0.02716064453125, + "learning_rate": 1.3895576509987685e-08, + "loss": 0.0049, + "reward": 0.08333333395421505, + "reward_std": 0.03608439117670059, + "rewards/accuracy_reward": 0.08333333395421505, + "rewards/format_reward": 0.0, + "step": 788 + }, + { + "completion_length": 677.2916870117188, + "epoch": 0.9621951219512195, + "grad_norm": 0.3142717182636261, + "kl": 0.02398681640625, + "learning_rate": 1.3041913949710715e-08, + "loss": 0.0035, + "reward": 0.14583333395421505, + "reward_std": 0.03608439117670059, + "rewards/accuracy_reward": 0.14583333395421505, + "rewards/format_reward": 0.0, + "step": 789 + }, + { + "completion_length": 958.1250305175781, + "epoch": 0.9634146341463414, + "grad_norm": 0.25102928280830383, + "kl": 0.0235595703125, + "learning_rate": 1.2215196817839447e-08, + "loss": 0.0045, + "reward": 0.1458333432674408, + "reward_std": 0.03608439117670059, + "rewards/accuracy_reward": 0.1458333432674408, + "rewards/format_reward": 0.0, + "step": 790 + }, + { + "completion_length": 911.7708435058594, + "epoch": 0.9646341463414634, + "grad_norm": 0.34268632531166077, + "kl": 0.02667236328125, + "learning_rate": 1.1415440095460083e-08, + "loss": 0.0186, + "reward": 0.2083333432674408, + "reward_std": 0.07216878235340118, + "rewards/accuracy_reward": 0.2083333432674408, + "rewards/format_reward": 0.0, + "step": 791 + }, + { + "completion_length": 738.9583740234375, + "epoch": 0.9658536585365853, + "grad_norm": 0.31120502948760986, + "kl": 0.02655029296875, + "learning_rate": 1.06426582751043e-08, + "loss": 0.0329, + "reward": 0.2500000149011612, + "reward_std": 0.10825317353010178, + "rewards/accuracy_reward": 0.2500000149011612, + "rewards/format_reward": 0.0, + "step": 792 + }, + { + "completion_length": 906.5208435058594, + "epoch": 0.9670731707317073, + "grad_norm": 0.4276106357574463, + "kl": 0.02557373046875, + "learning_rate": 9.896865360487451e-09, + "loss": 0.0771, + "reward": 0.14583333395421505, + "reward_std": 0.10825317353010178, + "rewards/accuracy_reward": 0.14583333395421505, + "rewards/format_reward": 0.0, + "step": 793 + }, + { + "completion_length": 715.0833435058594, + "epoch": 0.9682926829268292, + "grad_norm": 0.6245980858802795, + "kl": 0.03021240234375, + "learning_rate": 9.178074866253605e-09, + "loss": -0.0076, + "reward": 0.1666666716337204, + "reward_std": 0.10825317353010178, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 794 + }, + { + "completion_length": 973.9583435058594, + "epoch": 0.9695121951219512, + "grad_norm": 0.40942537784576416, + "kl": 0.0260009765625, + "learning_rate": 8.486299817731412e-09, + "loss": 0.0285, + "reward": 0.2708333432674408, + "reward_std": 0.14433756470680237, + "rewards/accuracy_reward": 0.2708333432674408, + "rewards/format_reward": 0.0, + "step": 795 + }, + { + "completion_length": 1057.3125305175781, + "epoch": 0.9707317073170731, + "grad_norm": 0.3129737973213196, + "kl": 0.023193359375, + "learning_rate": 7.821552750697958e-09, + "loss": 0.0336, + "reward": 0.1875000111758709, + "reward_std": 0.10825317353010178, + "rewards/accuracy_reward": 0.1875000111758709, + "rewards/format_reward": 0.0, + "step": 796 + }, + { + "completion_length": 657.4583740234375, + "epoch": 0.9719512195121951, + "grad_norm": 0.4153045117855072, + "kl": 0.02880859375, + "learning_rate": 7.1838457111516044e-09, + "loss": -0.0107, + "reward": 0.1458333358168602, + "reward_std": 0.10825317353010178, + "rewards/accuracy_reward": 0.1458333358168602, + "rewards/format_reward": 0.0, + "step": 797 + }, + { + "completion_length": 1035.6875610351562, + "epoch": 0.973170731707317, + "grad_norm": 0.0584120973944664, + "kl": 0.023193359375, + "learning_rate": 6.573190255093342e-09, + "loss": 0.0009, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 798 + }, + { + "completion_length": 841.0416870117188, + "epoch": 0.974390243902439, + "grad_norm": 0.5308164954185486, + "kl": 0.03045654296875, + "learning_rate": 5.989597448317785e-09, + "loss": 0.0024, + "reward": 0.1458333358168602, + "reward_std": 0.10825317353010178, + "rewards/accuracy_reward": 0.1458333358168602, + "rewards/format_reward": 0.0, + "step": 799 + }, + { + "completion_length": 785.0208740234375, + "epoch": 0.975609756097561, + "grad_norm": 0.4039956331253052, + "kl": 0.0228271484375, + "learning_rate": 5.433077866212999e-09, + "loss": 0.0233, + "reward": 0.291666679084301, + "reward_std": 0.07216878235340118, + "rewards/accuracy_reward": 0.291666679084301, + "rewards/format_reward": 0.0, + "step": 800 + }, + { + "completion_length": 893.2500305175781, + "epoch": 0.9768292682926829, + "grad_norm": 0.5676692128181458, + "kl": 0.03875732421875, + "learning_rate": 4.903641593567654e-09, + "loss": -0.0039, + "reward": 0.229166679084301, + "reward_std": 0.10825317353010178, + "rewards/accuracy_reward": 0.229166679084301, + "rewards/format_reward": 0.0, + "step": 801 + }, + { + "completion_length": 719.5416870117188, + "epoch": 0.9780487804878049, + "grad_norm": 0.3240124583244324, + "kl": 0.02880859375, + "learning_rate": 4.401298224389338e-09, + "loss": 0.0029, + "reward": 0.1041666679084301, + "reward_std": 0.03608439117670059, + "rewards/accuracy_reward": 0.1041666679084301, + "rewards/format_reward": 0.0, + "step": 802 + }, + { + "completion_length": 845.0833435058594, + "epoch": 0.9792682926829268, + "grad_norm": 0.33833006024360657, + "kl": 0.026123046875, + "learning_rate": 3.926056861730532e-09, + "loss": 0.0627, + "reward": 0.10416666977107525, + "reward_std": 0.07216878235340118, + "rewards/accuracy_reward": 0.10416666977107525, + "rewards/format_reward": 0.0, + "step": 803 + }, + { + "completion_length": 827.5208435058594, + "epoch": 0.9804878048780488, + "grad_norm": 0.3172595798969269, + "kl": 0.02911376953125, + "learning_rate": 3.4779261175232334e-09, + "loss": -0.0376, + "reward": 0.12500000558793545, + "reward_std": 0.10825317353010178, + "rewards/accuracy_reward": 0.12500000558793545, + "rewards/format_reward": 0.0, + "step": 804 + }, + { + "completion_length": 686.9583435058594, + "epoch": 0.9817073170731707, + "grad_norm": 0.3726345896720886, + "kl": 0.0220947265625, + "learning_rate": 3.0569141124234256e-09, + "loss": -0.0216, + "reward": 0.1041666716337204, + "reward_std": 0.07216878235340118, + "rewards/accuracy_reward": 0.1041666716337204, + "rewards/format_reward": 0.0, + "step": 805 + }, + { + "completion_length": 852.8333435058594, + "epoch": 0.9829268292682927, + "grad_norm": 0.4700186848640442, + "kl": 0.0269775390625, + "learning_rate": 2.6630284756635204e-09, + "loss": -0.0481, + "reward": 0.0833333358168602, + "reward_std": 0.10825317353010178, + "rewards/accuracy_reward": 0.0833333358168602, + "rewards/format_reward": 0.0, + "step": 806 + }, + { + "completion_length": 827.2916870117188, + "epoch": 0.9841463414634146, + "grad_norm": 0.05038120225071907, + "kl": 0.02862548828125, + "learning_rate": 2.2962763449141387e-09, + "loss": 0.001, + "reward": 0.1875, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.1875, + "rewards/format_reward": 0.0, + "step": 807 + }, + { + "completion_length": 835.0, + "epoch": 0.9853658536585366, + "grad_norm": 0.3867391049861908, + "kl": 0.02215576171875, + "learning_rate": 1.9566643661550478e-09, + "loss": 0.0422, + "reward": 0.2083333432674408, + "reward_std": 0.10825317353010178, + "rewards/accuracy_reward": 0.2083333432674408, + "rewards/format_reward": 0.0, + "step": 808 + }, + { + "completion_length": 799.6458435058594, + "epoch": 0.9865853658536585, + "grad_norm": 0.3166770935058594, + "kl": 0.023681640625, + "learning_rate": 1.6441986935545884e-09, + "loss": -0.0102, + "reward": 0.02083333395421505, + "reward_std": 0.03608439117670059, + "rewards/accuracy_reward": 0.02083333395421505, + "rewards/format_reward": 0.0, + "step": 809 + }, + { + "completion_length": 727.6041870117188, + "epoch": 0.9878048780487805, + "grad_norm": 0.17712058126926422, + "kl": 0.0240478515625, + "learning_rate": 1.3588849893579336e-09, + "loss": -0.0024, + "reward": 0.0833333358168602, + "reward_std": 0.03608439117670059, + "rewards/accuracy_reward": 0.0833333358168602, + "rewards/format_reward": 0.0, + "step": 810 + }, + { + "completion_length": 1028.0833740234375, + "epoch": 0.9890243902439024, + "grad_norm": 0.04795070365071297, + "kl": 0.0245361328125, + "learning_rate": 1.1007284237850025e-09, + "loss": 0.0009, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 811 + }, + { + "completion_length": 856.7291870117188, + "epoch": 0.9902439024390244, + "grad_norm": 0.36679479479789734, + "kl": 0.0206298828125, + "learning_rate": 8.697336749358687e-10, + "loss": -0.0008, + "reward": 0.1458333358168602, + "reward_std": 0.03608439117670059, + "rewards/accuracy_reward": 0.1458333358168602, + "rewards/format_reward": 0.0, + "step": 812 + }, + { + "completion_length": 1035.4167175292969, + "epoch": 0.9914634146341463, + "grad_norm": 0.2617432773113251, + "kl": 0.0228271484375, + "learning_rate": 6.659049287071617e-10, + "loss": 0.0244, + "reward": 0.1666666716337204, + "reward_std": 0.07216878235340118, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 813 + }, + { + "completion_length": 999.3125305175781, + "epoch": 0.9926829268292683, + "grad_norm": 0.4567527174949646, + "kl": 0.02532958984375, + "learning_rate": 4.892458787154608e-10, + "loss": 0.0007, + "reward": 0.0416666679084301, + "reward_std": 0.07216878235340118, + "rewards/accuracy_reward": 0.0416666679084301, + "rewards/format_reward": 0.0, + "step": 814 + }, + { + "completion_length": 858.7291870117188, + "epoch": 0.9939024390243902, + "grad_norm": 0.6128376126289368, + "kl": 0.0572509765625, + "learning_rate": 3.397597262300156e-10, + "loss": -0.0398, + "reward": 0.2500000149011612, + "reward_std": 0.18042195588350296, + "rewards/accuracy_reward": 0.2500000149011612, + "rewards/format_reward": 0.0, + "step": 815 + }, + { + "completion_length": 1070.7708740234375, + "epoch": 0.9951219512195122, + "grad_norm": 0.3184763491153717, + "kl": 0.02508544921875, + "learning_rate": 2.1744918011595837e-10, + "loss": 0.0767, + "reward": 0.18750000558793545, + "reward_std": 0.10825317353010178, + "rewards/accuracy_reward": 0.18750000558793545, + "rewards/format_reward": 0.0, + "step": 816 + }, + { + "completion_length": 882.7917175292969, + "epoch": 0.9963414634146341, + "grad_norm": 0.33201614022254944, + "kl": 0.030517578125, + "learning_rate": 1.2231645678401072e-10, + "loss": 0.0595, + "reward": 0.2083333358168602, + "reward_std": 0.14433756470680237, + "rewards/accuracy_reward": 0.2083333358168602, + "rewards/format_reward": 0.0, + "step": 817 + }, + { + "completion_length": 911.9583740234375, + "epoch": 0.9975609756097561, + "grad_norm": 0.33788737654685974, + "kl": 0.02301025390625, + "learning_rate": 5.436328015101522e-11, + "loss": 0.0052, + "reward": 0.1666666716337204, + "reward_std": 0.07216878235340118, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 818 + }, + { + "completion_length": 693.2083740234375, + "epoch": 0.998780487804878, + "grad_norm": 0.5074701905250549, + "kl": 0.01910400390625, + "learning_rate": 1.359088160846067e-11, + "loss": -0.0122, + "reward": 0.1875000111758709, + "reward_std": 0.07216878235340118, + "rewards/accuracy_reward": 0.1875000111758709, + "rewards/format_reward": 0.0, + "step": 819 + }, + { + "completion_length": 1057.03125, + "epoch": 1.0, + "grad_norm": 0.466864675283432, + "kl": 0.02581787109375, + "learning_rate": 0.0, + "loss": -0.0094, + "reward": 0.18750000558793545, + "reward_std": 0.10825317353010178, + "rewards/accuracy_reward": 0.18750000558793545, + "rewards/format_reward": 0.0, + "step": 820 + }, + { + "epoch": 1.0, + "step": 820, + "total_flos": 0.0, + "train_loss": 0.004393076130298962, + "train_runtime": 23663.0394, + "train_samples_per_second": 0.554, + "train_steps_per_second": 0.035 + } + ], + "logging_steps": 1, + "max_steps": 820, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +}