{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.0, "eval_steps": 500, "global_step": 1640, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "completion_length": 973.2708740234375, "epoch": 0.0012195121951219512, "grad_norm": 0.251608282327652, "kl": 0.0, "learning_rate": 1.829268292682927e-08, "loss": -0.0402, "reward": 0.14583333395421505, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.14583333395421505, "rewards/format_reward": 0.0, "step": 1 }, { "completion_length": 746.2500305175781, "epoch": 0.0024390243902439024, "grad_norm": 0.21636046469211578, "kl": 0.0, "learning_rate": 3.658536585365854e-08, "loss": -0.0147, "reward": 0.2500000074505806, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.2500000074505806, "rewards/format_reward": 0.0, "step": 2 }, { "completion_length": 820.1875305175781, "epoch": 0.003658536585365854, "grad_norm": 0.27297210693359375, "kl": 0.00014448165893554688, "learning_rate": 5.48780487804878e-08, "loss": 0.0056, "reward": 0.2500000074505806, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.2500000074505806, "rewards/format_reward": 0.0, "step": 3 }, { "completion_length": 895.0625305175781, "epoch": 0.004878048780487805, "grad_norm": 0.2643326222896576, "kl": 0.00015592575073242188, "learning_rate": 7.317073170731708e-08, "loss": 0.0084, "reward": 0.1666666716337204, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.1666666716337204, "rewards/format_reward": 0.0, "step": 4 }, { "completion_length": 703.6666870117188, "epoch": 0.006097560975609756, "grad_norm": 0.4451855719089508, "kl": 0.000438690185546875, "learning_rate": 9.146341463414634e-08, "loss": 0.0052, "reward": 0.1458333395421505, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.1458333395421505, "rewards/format_reward": 0.0, "step": 5 }, { "completion_length": 796.1875305175781, "epoch": 0.007317073170731708, "grad_norm": 0.5127825140953064, "kl": 0.0002808570861816406, "learning_rate": 1.097560975609756e-07, "loss": 0.0049, "reward": 0.1250000037252903, "reward_std": 0.21650634706020355, "rewards/accuracy_reward": 0.1250000037252903, "rewards/format_reward": 0.0, "step": 6 }, { "completion_length": 785.3750305175781, "epoch": 0.00853658536585366, "grad_norm": 0.39015892148017883, "kl": 0.00020456314086914062, "learning_rate": 1.2804878048780488e-07, "loss": 0.018, "reward": 0.1458333358168602, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.1458333358168602, "rewards/format_reward": 0.0, "step": 7 }, { "completion_length": 842.5416870117188, "epoch": 0.00975609756097561, "grad_norm": 0.4429241716861725, "kl": 0.000247955322265625, "learning_rate": 1.4634146341463415e-07, "loss": 0.0162, "reward": 0.10416666977107525, "reward_std": 0.18042195588350296, "rewards/accuracy_reward": 0.10416666977107525, "rewards/format_reward": 0.0, "step": 8 }, { "completion_length": 1027.9166870117188, "epoch": 0.01097560975609756, "grad_norm": 0.2266790121793747, "kl": 0.00025653839111328125, "learning_rate": 1.6463414634146343e-07, "loss": 0.032, "reward": 0.229166679084301, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.229166679084301, "rewards/format_reward": 0.0, "step": 9 }, { "completion_length": 829.4375305175781, "epoch": 0.012195121951219513, "grad_norm": 0.24806636571884155, "kl": 0.00016164779663085938, "learning_rate": 1.8292682926829268e-07, "loss": -0.0073, "reward": 0.18750000558793545, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.18750000558793545, "rewards/format_reward": 0.0, "step": 10 }, { "completion_length": 995.5833740234375, "epoch": 0.013414634146341463, "grad_norm": 0.26613786816596985, "kl": 0.00015735626220703125, "learning_rate": 2.0121951219512198e-07, "loss": -0.0021, "reward": 0.1458333395421505, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.1458333395421505, "rewards/format_reward": 0.0, "step": 11 }, { "completion_length": 754.7083435058594, "epoch": 0.014634146341463415, "grad_norm": 0.3803806006908417, "kl": 0.0001609325408935547, "learning_rate": 2.195121951219512e-07, "loss": 0.0279, "reward": 0.3541666716337204, "reward_std": 0.18042195588350296, "rewards/accuracy_reward": 0.3541666716337204, "rewards/format_reward": 0.0, "step": 12 }, { "completion_length": 823.6458740234375, "epoch": 0.015853658536585366, "grad_norm": 0.4089258909225464, "kl": 0.00024271011352539062, "learning_rate": 2.378048780487805e-07, "loss": 0.0255, "reward": 0.0833333358168602, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.0833333358168602, "rewards/format_reward": 0.0, "step": 13 }, { "completion_length": 625.1875305175781, "epoch": 0.01707317073170732, "grad_norm": 0.22270222008228302, "kl": 0.000247955322265625, "learning_rate": 2.5609756097560976e-07, "loss": 0.0044, "reward": 0.2083333358168602, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.2083333358168602, "rewards/format_reward": 0.0, "step": 14 }, { "completion_length": 787.5, "epoch": 0.018292682926829267, "grad_norm": 0.259090781211853, "kl": 0.0001811981201171875, "learning_rate": 2.74390243902439e-07, "loss": -0.0023, "reward": 0.1458333432674408, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.1458333432674408, "rewards/format_reward": 0.0, "step": 15 }, { "completion_length": 874.1041870117188, "epoch": 0.01951219512195122, "grad_norm": 0.17729870975017548, "kl": 0.00018215179443359375, "learning_rate": 2.926829268292683e-07, "loss": 0.0084, "reward": 0.1666666716337204, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.1666666716337204, "rewards/format_reward": 0.0, "step": 16 }, { "completion_length": 807.4583435058594, "epoch": 0.020731707317073172, "grad_norm": 0.43330103158950806, "kl": 0.0002493858337402344, "learning_rate": 3.1097560975609756e-07, "loss": -0.071, "reward": 0.3125000149011612, "reward_std": 0.18042195588350296, "rewards/accuracy_reward": 0.3125000149011612, "rewards/format_reward": 0.0, "step": 17 }, { "completion_length": 699.9375305175781, "epoch": 0.02195121951219512, "grad_norm": 0.39200469851493835, "kl": 0.00019550323486328125, "learning_rate": 3.2926829268292686e-07, "loss": 0.0067, "reward": 0.08333333395421505, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.08333333395421505, "rewards/format_reward": 0.0, "step": 18 }, { "completion_length": 984.2708435058594, "epoch": 0.023170731707317073, "grad_norm": 0.19872254133224487, "kl": 0.00021219253540039062, "learning_rate": 3.475609756097561e-07, "loss": -0.0194, "reward": 0.1041666716337204, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.1041666716337204, "rewards/format_reward": 0.0, "step": 19 }, { "completion_length": 670.2083740234375, "epoch": 0.024390243902439025, "grad_norm": 0.14872349798679352, "kl": 0.00016355514526367188, "learning_rate": 3.6585365853658536e-07, "loss": -0.011, "reward": 0.14583333395421505, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.14583333395421505, "rewards/format_reward": 0.0, "step": 20 }, { "completion_length": 789.9583740234375, "epoch": 0.025609756097560974, "grad_norm": 0.3848355710506439, "kl": 0.00023126602172851562, "learning_rate": 3.8414634146341466e-07, "loss": 0.0121, "reward": 0.3333333432674408, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.3333333432674408, "rewards/format_reward": 0.0, "step": 21 }, { "completion_length": 881.9375305175781, "epoch": 0.026829268292682926, "grad_norm": 0.1362966001033783, "kl": 0.00018167495727539062, "learning_rate": 4.0243902439024396e-07, "loss": 0.0005, "reward": 0.08333333395421505, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.08333333395421505, "rewards/format_reward": 0.0, "step": 22 }, { "completion_length": 890.4791870117188, "epoch": 0.02804878048780488, "grad_norm": 0.3315347135066986, "kl": 0.00039196014404296875, "learning_rate": 4.207317073170732e-07, "loss": 0.0345, "reward": 0.08333333395421505, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.08333333395421505, "rewards/format_reward": 0.0, "step": 23 }, { "completion_length": 885.3541870117188, "epoch": 0.02926829268292683, "grad_norm": 0.24437867105007172, "kl": 0.000217437744140625, "learning_rate": 4.390243902439024e-07, "loss": 0.0338, "reward": 0.1250000037252903, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.1250000037252903, "rewards/format_reward": 0.0, "step": 24 }, { "completion_length": 861.4375305175781, "epoch": 0.03048780487804878, "grad_norm": 0.3951435685157776, "kl": 0.00027179718017578125, "learning_rate": 4.573170731707317e-07, "loss": 0.0056, "reward": 0.1041666679084301, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.1041666679084301, "rewards/format_reward": 0.0, "step": 25 }, { "completion_length": 703.4166870117188, "epoch": 0.03170731707317073, "grad_norm": 0.18676485121250153, "kl": 0.00022268295288085938, "learning_rate": 4.75609756097561e-07, "loss": 0.0009, "reward": 0.08333333395421505, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.08333333395421505, "rewards/format_reward": 0.0, "step": 26 }, { "completion_length": 807.0, "epoch": 0.032926829268292684, "grad_norm": 0.007806174457073212, "kl": 0.0003829002380371094, "learning_rate": 4.939024390243903e-07, "loss": 0.0, "reward": 0.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.0, "step": 27 }, { "completion_length": 899.1250305175781, "epoch": 0.03414634146341464, "grad_norm": 0.011494847945868969, "kl": 0.0003495216369628906, "learning_rate": 5.121951219512195e-07, "loss": 0.0, "reward": 0.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.0, "step": 28 }, { "completion_length": 890.3541870117188, "epoch": 0.03536585365853658, "grad_norm": 0.3775041401386261, "kl": 0.0006818771362304688, "learning_rate": 5.304878048780488e-07, "loss": 0.036, "reward": 0.0625, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.0625, "rewards/format_reward": 0.0, "step": 29 }, { "completion_length": 892.8541870117188, "epoch": 0.036585365853658534, "grad_norm": 0.28957894444465637, "kl": 0.00028514862060546875, "learning_rate": 5.48780487804878e-07, "loss": 0.0076, "reward": 0.10416666977107525, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.10416666977107525, "rewards/format_reward": 0.0, "step": 30 }, { "completion_length": 884.2708435058594, "epoch": 0.03780487804878049, "grad_norm": 0.3704320788383484, "kl": 0.00039577484130859375, "learning_rate": 5.670731707317073e-07, "loss": 0.0416, "reward": 0.1250000037252903, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.1250000037252903, "rewards/format_reward": 0.0, "step": 31 }, { "completion_length": 1051.0833740234375, "epoch": 0.03902439024390244, "grad_norm": 0.35672083497047424, "kl": 0.0004138946533203125, "learning_rate": 5.853658536585366e-07, "loss": 0.0371, "reward": 0.1458333358168602, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.1458333358168602, "rewards/format_reward": 0.0, "step": 32 }, { "completion_length": 1034.4375610351562, "epoch": 0.04024390243902439, "grad_norm": 0.25910377502441406, "kl": 0.000431060791015625, "learning_rate": 6.036585365853659e-07, "loss": -0.0011, "reward": 0.0416666679084301, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.0416666679084301, "rewards/format_reward": 0.0, "step": 33 }, { "completion_length": 881.4166870117188, "epoch": 0.041463414634146344, "grad_norm": 0.22427509725093842, "kl": 0.000640869140625, "learning_rate": 6.219512195121951e-07, "loss": -0.0109, "reward": 0.02083333395421505, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.02083333395421505, "rewards/format_reward": 0.0, "step": 34 }, { "completion_length": 928.9791870117188, "epoch": 0.042682926829268296, "grad_norm": 0.6121564507484436, "kl": 0.0008544921875, "learning_rate": 6.402439024390244e-07, "loss": 0.0896, "reward": 0.2083333432674408, "reward_std": 0.21650634706020355, "rewards/accuracy_reward": 0.2083333432674408, "rewards/format_reward": 0.0, "step": 35 }, { "completion_length": 757.2083435058594, "epoch": 0.04390243902439024, "grad_norm": 0.39821097254753113, "kl": 0.0007953643798828125, "learning_rate": 6.585365853658537e-07, "loss": 0.0038, "reward": 0.1666666679084301, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.1666666679084301, "rewards/format_reward": 0.0, "step": 36 }, { "completion_length": 807.8958435058594, "epoch": 0.045121951219512194, "grad_norm": 0.2658926546573639, "kl": 0.00089263916015625, "learning_rate": 6.768292682926829e-07, "loss": 0.0234, "reward": 0.2500000149011612, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.2500000149011612, "rewards/format_reward": 0.0, "step": 37 }, { "completion_length": 747.2083435058594, "epoch": 0.046341463414634146, "grad_norm": 0.46477219462394714, "kl": 0.0008087158203125, "learning_rate": 6.951219512195122e-07, "loss": 0.0491, "reward": 0.2916666716337204, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.2916666716337204, "rewards/format_reward": 0.0, "step": 38 }, { "completion_length": 1019.4375, "epoch": 0.0475609756097561, "grad_norm": 0.23934951424598694, "kl": 0.000652313232421875, "learning_rate": 7.134146341463414e-07, "loss": 0.0004, "reward": 0.1875000111758709, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.1875000111758709, "rewards/format_reward": 0.0, "step": 39 }, { "completion_length": 931.2291870117188, "epoch": 0.04878048780487805, "grad_norm": 0.2701385021209717, "kl": 0.0013561248779296875, "learning_rate": 7.317073170731707e-07, "loss": -0.0122, "reward": 0.1458333432674408, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.1458333432674408, "rewards/format_reward": 0.0, "step": 40 }, { "completion_length": 803.1250305175781, "epoch": 0.05, "grad_norm": 0.23006533086299896, "kl": 0.001125335693359375, "learning_rate": 7.5e-07, "loss": -0.0063, "reward": 0.1875, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.1875, "rewards/format_reward": 0.0, "step": 41 }, { "completion_length": 1055.7916870117188, "epoch": 0.05121951219512195, "grad_norm": 0.22470912337303162, "kl": 0.00102996826171875, "learning_rate": 7.682926829268293e-07, "loss": 0.0126, "reward": 0.14583333395421505, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.14583333395421505, "rewards/format_reward": 0.0, "step": 42 }, { "completion_length": 889.5833435058594, "epoch": 0.0524390243902439, "grad_norm": 0.43547120690345764, "kl": 0.0010833740234375, "learning_rate": 7.865853658536586e-07, "loss": 0.0203, "reward": 0.14583333395421505, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.14583333395421505, "rewards/format_reward": 0.0, "step": 43 }, { "completion_length": 708.0416870117188, "epoch": 0.05365853658536585, "grad_norm": 0.3968406617641449, "kl": 0.0013256072998046875, "learning_rate": 8.048780487804879e-07, "loss": 0.0382, "reward": 0.3125000149011612, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.3125000149011612, "rewards/format_reward": 0.0, "step": 44 }, { "completion_length": 811.1666870117188, "epoch": 0.054878048780487805, "grad_norm": 0.3708551228046417, "kl": 0.0020694732666015625, "learning_rate": 8.231707317073171e-07, "loss": 0.0054, "reward": 0.2708333432674408, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.2708333432674408, "rewards/format_reward": 0.0, "step": 45 }, { "completion_length": 776.6666870117188, "epoch": 0.05609756097560976, "grad_norm": 0.32095998525619507, "kl": 0.0015716552734375, "learning_rate": 8.414634146341464e-07, "loss": 0.0035, "reward": 0.2916666716337204, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.2916666716337204, "rewards/format_reward": 0.0, "step": 46 }, { "completion_length": 851.3541870117188, "epoch": 0.05731707317073171, "grad_norm": 0.3490510880947113, "kl": 0.00119781494140625, "learning_rate": 8.597560975609755e-07, "loss": 0.0469, "reward": 0.2500000074505806, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.2500000074505806, "rewards/format_reward": 0.0, "step": 47 }, { "completion_length": 818.7500305175781, "epoch": 0.05853658536585366, "grad_norm": 0.17611879110336304, "kl": 0.0006074905395507812, "learning_rate": 8.780487804878048e-07, "loss": 0.0069, "reward": 0.08333333395421505, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.08333333395421505, "rewards/format_reward": 0.0, "step": 48 }, { "completion_length": 860.3125305175781, "epoch": 0.05975609756097561, "grad_norm": 0.3276488184928894, "kl": 0.00099945068359375, "learning_rate": 8.963414634146341e-07, "loss": 0.01, "reward": 0.2083333432674408, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.2083333432674408, "rewards/format_reward": 0.0, "step": 49 }, { "completion_length": 869.1875, "epoch": 0.06097560975609756, "grad_norm": 0.3612532317638397, "kl": 0.00119781494140625, "learning_rate": 9.146341463414634e-07, "loss": 0.0327, "reward": 0.12500000558793545, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.12500000558793545, "rewards/format_reward": 0.0, "step": 50 }, { "completion_length": 902.25, "epoch": 0.06219512195121951, "grad_norm": 0.27144840359687805, "kl": 0.00298309326171875, "learning_rate": 9.329268292682927e-07, "loss": 0.0318, "reward": 0.06250000186264515, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.06250000186264515, "rewards/format_reward": 0.0, "step": 51 }, { "completion_length": 678.9375152587891, "epoch": 0.06341463414634146, "grad_norm": 0.32639819383621216, "kl": 0.0028533935546875, "learning_rate": 9.51219512195122e-07, "loss": 0.0379, "reward": 0.3541666865348816, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.3541666865348816, "rewards/format_reward": 0.0, "step": 52 }, { "completion_length": 716.6041870117188, "epoch": 0.06463414634146342, "grad_norm": 0.03865401819348335, "kl": 0.002239227294921875, "learning_rate": 9.695121951219512e-07, "loss": 0.0001, "reward": 0.375, "reward_std": 0.0, "rewards/accuracy_reward": 0.375, "rewards/format_reward": 0.0, "step": 53 }, { "completion_length": 941.6458435058594, "epoch": 0.06585365853658537, "grad_norm": 0.32155147194862366, "kl": 0.003543853759765625, "learning_rate": 9.878048780487806e-07, "loss": 0.0288, "reward": 0.2708333432674408, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.2708333432674408, "rewards/format_reward": 0.0, "step": 54 }, { "completion_length": 781.7916870117188, "epoch": 0.06707317073170732, "grad_norm": 0.3982411026954651, "kl": 0.001758575439453125, "learning_rate": 1.0060975609756098e-06, "loss": 0.0154, "reward": 0.2708333432674408, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.2708333432674408, "rewards/format_reward": 0.0, "step": 55 }, { "completion_length": 878.5000610351562, "epoch": 0.06829268292682927, "grad_norm": 0.3388366103172302, "kl": 0.00228118896484375, "learning_rate": 1.024390243902439e-06, "loss": 0.0073, "reward": 0.2083333432674408, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.2083333432674408, "rewards/format_reward": 0.0, "step": 56 }, { "completion_length": 831.6458435058594, "epoch": 0.06951219512195123, "grad_norm": 0.2990972697734833, "kl": 0.001674652099609375, "learning_rate": 1.0426829268292682e-06, "loss": -0.0105, "reward": 0.0416666679084301, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.0416666679084301, "rewards/format_reward": 0.0, "step": 57 }, { "completion_length": 868.7083740234375, "epoch": 0.07073170731707316, "grad_norm": 0.2800074517726898, "kl": 0.003631591796875, "learning_rate": 1.0609756097560976e-06, "loss": 0.0269, "reward": 0.12500000558793545, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.12500000558793545, "rewards/format_reward": 0.0, "step": 58 }, { "completion_length": 765.5208740234375, "epoch": 0.07195121951219512, "grad_norm": 0.48817238211631775, "kl": 0.00269317626953125, "learning_rate": 1.0792682926829268e-06, "loss": 0.0563, "reward": 0.2291666679084301, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.2291666679084301, "rewards/format_reward": 0.0, "step": 59 }, { "completion_length": 934.75, "epoch": 0.07317073170731707, "grad_norm": 0.39176151156425476, "kl": 0.00350189208984375, "learning_rate": 1.097560975609756e-06, "loss": -0.0061, "reward": 0.125, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "step": 60 }, { "completion_length": 730.6875, "epoch": 0.07439024390243902, "grad_norm": 0.22731173038482666, "kl": 0.0076904296875, "learning_rate": 1.1158536585365854e-06, "loss": 0.0143, "reward": 0.1666666716337204, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.1666666716337204, "rewards/format_reward": 0.0, "step": 61 }, { "completion_length": 840.1458435058594, "epoch": 0.07560975609756097, "grad_norm": 0.3801591694355011, "kl": 0.00402069091796875, "learning_rate": 1.1341463414634146e-06, "loss": 0.036, "reward": 0.2083333358168602, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.2083333358168602, "rewards/format_reward": 0.0, "step": 62 }, { "completion_length": 823.4791870117188, "epoch": 0.07682926829268293, "grad_norm": 0.30463361740112305, "kl": 0.0087127685546875, "learning_rate": 1.152439024390244e-06, "loss": 0.0263, "reward": 0.3125, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.3125, "rewards/format_reward": 0.0, "step": 63 }, { "completion_length": 1038.8541870117188, "epoch": 0.07804878048780488, "grad_norm": 0.3331913948059082, "kl": 0.0054473876953125, "learning_rate": 1.1707317073170732e-06, "loss": 0.023, "reward": 0.1250000037252903, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.1250000037252903, "rewards/format_reward": 0.0, "step": 64 }, { "completion_length": 843.4166870117188, "epoch": 0.07926829268292683, "grad_norm": 0.3433734178543091, "kl": 0.00675201416015625, "learning_rate": 1.1890243902439024e-06, "loss": -0.0137, "reward": 0.1875000074505806, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.1875000074505806, "rewards/format_reward": 0.0, "step": 65 }, { "completion_length": 699.5208740234375, "epoch": 0.08048780487804878, "grad_norm": 0.4205603003501892, "kl": 0.0050811767578125, "learning_rate": 1.2073170731707318e-06, "loss": 0.0059, "reward": 0.2708333432674408, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.2708333432674408, "rewards/format_reward": 0.0, "step": 66 }, { "completion_length": 831.6041870117188, "epoch": 0.08170731707317073, "grad_norm": 0.23319682478904724, "kl": 0.0108642578125, "learning_rate": 1.225609756097561e-06, "loss": -0.0045, "reward": 0.1875000111758709, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.1875000111758709, "rewards/format_reward": 0.0, "step": 67 }, { "completion_length": 869.2291870117188, "epoch": 0.08292682926829269, "grad_norm": 0.2670533061027527, "kl": 0.002765655517578125, "learning_rate": 1.2439024390243902e-06, "loss": -0.0049, "reward": 0.08333333395421505, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.08333333395421505, "rewards/format_reward": 0.0, "step": 68 }, { "completion_length": 997.7083740234375, "epoch": 0.08414634146341464, "grad_norm": 0.46562182903289795, "kl": 0.006195068359375, "learning_rate": 1.2621951219512194e-06, "loss": -0.0183, "reward": 0.3958333432674408, "reward_std": 0.21650634706020355, "rewards/accuracy_reward": 0.3958333432674408, "rewards/format_reward": 0.0, "step": 69 }, { "completion_length": 757.6666870117188, "epoch": 0.08536585365853659, "grad_norm": 0.3912566304206848, "kl": 0.007232666015625, "learning_rate": 1.2804878048780488e-06, "loss": 0.0097, "reward": 0.2291666716337204, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.2291666716337204, "rewards/format_reward": 0.0, "step": 70 }, { "completion_length": 906.9583740234375, "epoch": 0.08658536585365853, "grad_norm": 0.3713250160217285, "kl": 0.008331298828125, "learning_rate": 1.298780487804878e-06, "loss": -0.0206, "reward": 0.375, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.375, "rewards/format_reward": 0.0, "step": 71 }, { "completion_length": 943.4166870117188, "epoch": 0.08780487804878048, "grad_norm": 0.24080723524093628, "kl": 0.005340576171875, "learning_rate": 1.3170731707317074e-06, "loss": 0.0203, "reward": 0.0833333358168602, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.0833333358168602, "rewards/format_reward": 0.0, "step": 72 }, { "completion_length": 785.8958740234375, "epoch": 0.08902439024390243, "grad_norm": 0.30575063824653625, "kl": 0.0057220458984375, "learning_rate": 1.3353658536585366e-06, "loss": -0.0039, "reward": 0.2916666716337204, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.2916666716337204, "rewards/format_reward": 0.0, "step": 73 }, { "completion_length": 798.0625, "epoch": 0.09024390243902439, "grad_norm": 0.26189175248146057, "kl": 0.0051116943359375, "learning_rate": 1.3536585365853658e-06, "loss": -0.0086, "reward": 0.3125, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.3125, "rewards/format_reward": 0.0, "step": 74 }, { "completion_length": 741.1250305175781, "epoch": 0.09146341463414634, "grad_norm": 0.37493857741355896, "kl": 0.00738525390625, "learning_rate": 1.3719512195121952e-06, "loss": 0.0134, "reward": 0.2916666716337204, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.2916666716337204, "rewards/format_reward": 0.0, "step": 75 }, { "completion_length": 892.3958435058594, "epoch": 0.09268292682926829, "grad_norm": 0.5211586356163025, "kl": 0.0086822509765625, "learning_rate": 1.3902439024390244e-06, "loss": -0.0278, "reward": 0.3958333432674408, "reward_std": 0.18042195588350296, "rewards/accuracy_reward": 0.3958333432674408, "rewards/format_reward": 0.0, "step": 76 }, { "completion_length": 1142.7917175292969, "epoch": 0.09390243902439024, "grad_norm": 0.39293408393859863, "kl": 0.0071868896484375, "learning_rate": 1.4085365853658536e-06, "loss": -0.0297, "reward": 0.0833333358168602, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.0833333358168602, "rewards/format_reward": 0.0, "step": 77 }, { "completion_length": 941.4583740234375, "epoch": 0.0951219512195122, "grad_norm": 0.3415144383907318, "kl": 0.00823974609375, "learning_rate": 1.4268292682926828e-06, "loss": 0.0074, "reward": 0.1666666679084301, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.1666666679084301, "rewards/format_reward": 0.0, "step": 78 }, { "completion_length": 821.4166870117188, "epoch": 0.09634146341463415, "grad_norm": 0.3430582582950592, "kl": 0.00689697265625, "learning_rate": 1.4451219512195122e-06, "loss": 0.0004, "reward": 0.2291666716337204, "reward_std": 0.10825316607952118, "rewards/accuracy_reward": 0.2291666716337204, "rewards/format_reward": 0.0, "step": 79 }, { "completion_length": 693.3958740234375, "epoch": 0.0975609756097561, "grad_norm": 0.3961006999015808, "kl": 0.007293701171875, "learning_rate": 1.4634146341463414e-06, "loss": 0.0405, "reward": 0.2708333432674408, "reward_std": 0.18042195588350296, "rewards/accuracy_reward": 0.2708333432674408, "rewards/format_reward": 0.0, "step": 80 }, { "completion_length": 871.0208740234375, "epoch": 0.09878048780487805, "grad_norm": 0.3630426228046417, "kl": 0.0084381103515625, "learning_rate": 1.4817073170731708e-06, "loss": -0.0229, "reward": 0.3125000149011612, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.3125000149011612, "rewards/format_reward": 0.0, "step": 81 }, { "completion_length": 879.8125305175781, "epoch": 0.1, "grad_norm": 0.23860616981983185, "kl": 0.005859375, "learning_rate": 1.5e-06, "loss": -0.004, "reward": 0.1458333358168602, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.1458333358168602, "rewards/format_reward": 0.0, "step": 82 }, { "completion_length": 906.5625305175781, "epoch": 0.10121951219512196, "grad_norm": 0.35472050309181213, "kl": 0.0065460205078125, "learning_rate": 1.5182926829268292e-06, "loss": 0.0317, "reward": 0.2083333358168602, "reward_std": 0.18042195588350296, "rewards/accuracy_reward": 0.2083333358168602, "rewards/format_reward": 0.0, "step": 83 }, { "completion_length": 729.25, "epoch": 0.1024390243902439, "grad_norm": 0.5809890031814575, "kl": 0.0046539306640625, "learning_rate": 1.5365853658536586e-06, "loss": -0.0099, "reward": 0.3541666865348816, "reward_std": 0.25259073823690414, "rewards/accuracy_reward": 0.3541666865348816, "rewards/format_reward": 0.0, "step": 84 }, { "completion_length": 720.6250305175781, "epoch": 0.10365853658536585, "grad_norm": 0.19817733764648438, "kl": 0.0052337646484375, "learning_rate": 1.5548780487804878e-06, "loss": 0.0044, "reward": 0.0833333358168602, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.0833333358168602, "rewards/format_reward": 0.0, "step": 85 }, { "completion_length": 820.4583740234375, "epoch": 0.1048780487804878, "grad_norm": 0.3655123710632324, "kl": 0.00380706787109375, "learning_rate": 1.5731707317073172e-06, "loss": 0.0448, "reward": 0.3541666716337204, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.3541666716337204, "rewards/format_reward": 0.0, "step": 86 }, { "completion_length": 807.1666870117188, "epoch": 0.10609756097560975, "grad_norm": 0.057082075625658035, "kl": 0.007843017578125, "learning_rate": 1.5914634146341464e-06, "loss": 0.0002, "reward": 0.125, "reward_std": 0.0, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "step": 87 }, { "completion_length": 865.5208435058594, "epoch": 0.1073170731707317, "grad_norm": 0.29376837611198425, "kl": 0.004058837890625, "learning_rate": 1.6097560975609759e-06, "loss": -0.0335, "reward": 0.12500000558793545, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.12500000558793545, "rewards/format_reward": 0.0, "step": 88 }, { "completion_length": 833.5208435058594, "epoch": 0.10853658536585366, "grad_norm": 0.2153203785419464, "kl": 0.0058441162109375, "learning_rate": 1.628048780487805e-06, "loss": -0.0077, "reward": 0.12500000558793545, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.12500000558793545, "rewards/format_reward": 0.0, "step": 89 }, { "completion_length": 1033.8750610351562, "epoch": 0.10975609756097561, "grad_norm": 0.3933962881565094, "kl": 0.0074005126953125, "learning_rate": 1.6463414634146342e-06, "loss": 0.0848, "reward": 0.3125000149011612, "reward_std": 0.18042195588350296, "rewards/accuracy_reward": 0.3125000149011612, "rewards/format_reward": 0.0, "step": 90 }, { "completion_length": 803.6250305175781, "epoch": 0.11097560975609756, "grad_norm": 0.3615931272506714, "kl": 0.008636474609375, "learning_rate": 1.6646341463414637e-06, "loss": 0.001, "reward": 0.0833333358168602, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.0833333358168602, "rewards/format_reward": 0.0, "step": 91 }, { "completion_length": 1039.5625610351562, "epoch": 0.11219512195121951, "grad_norm": 0.5466797947883606, "kl": 0.008209228515625, "learning_rate": 1.6829268292682928e-06, "loss": 0.0668, "reward": 0.2708333358168602, "reward_std": 0.25259073823690414, "rewards/accuracy_reward": 0.2708333358168602, "rewards/format_reward": 0.0, "step": 92 }, { "completion_length": 845.4375, "epoch": 0.11341463414634147, "grad_norm": 0.2390030175447464, "kl": 0.009246826171875, "learning_rate": 1.7012195121951218e-06, "loss": -0.0019, "reward": 0.3125000149011612, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.3125000149011612, "rewards/format_reward": 0.0, "step": 93 }, { "completion_length": 809.5208435058594, "epoch": 0.11463414634146342, "grad_norm": 0.3774675130844116, "kl": 0.00933074951171875, "learning_rate": 1.719512195121951e-06, "loss": 0.0187, "reward": 0.1875000074505806, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.1875000074505806, "rewards/format_reward": 0.0, "step": 94 }, { "completion_length": 994.0625610351562, "epoch": 0.11585365853658537, "grad_norm": 0.43488016724586487, "kl": 0.00860595703125, "learning_rate": 1.7378048780487804e-06, "loss": 0.0297, "reward": 0.2916666716337204, "reward_std": 0.21650634706020355, "rewards/accuracy_reward": 0.2916666716337204, "rewards/format_reward": 0.0, "step": 95 }, { "completion_length": 774.9583435058594, "epoch": 0.11707317073170732, "grad_norm": 0.3485778570175171, "kl": 0.009124755859375, "learning_rate": 1.7560975609756096e-06, "loss": 0.0124, "reward": 0.291666679084301, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.291666679084301, "rewards/format_reward": 0.0, "step": 96 }, { "completion_length": 955.1250305175781, "epoch": 0.11829268292682926, "grad_norm": 0.2105070799589157, "kl": 0.006011962890625, "learning_rate": 1.774390243902439e-06, "loss": -0.0018, "reward": 0.25, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.0, "step": 97 }, { "completion_length": 859.5625305175781, "epoch": 0.11951219512195121, "grad_norm": 0.32763683795928955, "kl": 0.0069580078125, "learning_rate": 1.7926829268292682e-06, "loss": 0.0389, "reward": 0.2708333358168602, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.2708333358168602, "rewards/format_reward": 0.0, "step": 98 }, { "completion_length": 956.7917175292969, "epoch": 0.12073170731707317, "grad_norm": 0.2654731571674347, "kl": 0.0085601806640625, "learning_rate": 1.8109756097560976e-06, "loss": 0.0415, "reward": 0.1875, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.1875, "rewards/format_reward": 0.0, "step": 99 }, { "completion_length": 932.6250305175781, "epoch": 0.12195121951219512, "grad_norm": 0.41214823722839355, "kl": 0.0066986083984375, "learning_rate": 1.8292682926829268e-06, "loss": 0.0127, "reward": 0.229166679084301, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.229166679084301, "rewards/format_reward": 0.0, "step": 100 }, { "completion_length": 682.5000305175781, "epoch": 0.12317073170731707, "grad_norm": 0.2776435613632202, "kl": 0.01116943359375, "learning_rate": 1.847560975609756e-06, "loss": 0.0071, "reward": 0.4166666716337204, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.4166666716337204, "rewards/format_reward": 0.0, "step": 101 }, { "completion_length": 929.6250305175781, "epoch": 0.12439024390243902, "grad_norm": 0.31854933500289917, "kl": 0.0063323974609375, "learning_rate": 1.8658536585365854e-06, "loss": 0.0229, "reward": 0.1875000074505806, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.1875000074505806, "rewards/format_reward": 0.0, "step": 102 }, { "completion_length": 883.4791870117188, "epoch": 0.12560975609756098, "grad_norm": 0.29245489835739136, "kl": 0.007659912109375, "learning_rate": 1.8841463414634146e-06, "loss": 0.0269, "reward": 0.2916666716337204, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.2916666716337204, "rewards/format_reward": 0.0, "step": 103 }, { "completion_length": 936.6458435058594, "epoch": 0.12682926829268293, "grad_norm": 0.3715755343437195, "kl": 0.010101318359375, "learning_rate": 1.902439024390244e-06, "loss": 0.002, "reward": 0.2291666716337204, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.2291666716337204, "rewards/format_reward": 0.0, "step": 104 }, { "completion_length": 879.2083435058594, "epoch": 0.12804878048780488, "grad_norm": 0.2341831624507904, "kl": 0.010101318359375, "learning_rate": 1.9207317073170733e-06, "loss": 0.0155, "reward": 0.12500000558793545, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.12500000558793545, "rewards/format_reward": 0.0, "step": 105 }, { "completion_length": 995.5000305175781, "epoch": 0.12926829268292683, "grad_norm": 0.5579499006271362, "kl": 0.010284423828125, "learning_rate": 1.9390243902439024e-06, "loss": 0.0483, "reward": 0.3333333432674408, "reward_std": 0.21650633960962296, "rewards/accuracy_reward": 0.3333333432674408, "rewards/format_reward": 0.0, "step": 106 }, { "completion_length": 628.0208740234375, "epoch": 0.13048780487804879, "grad_norm": 0.336506724357605, "kl": 0.0086669921875, "learning_rate": 1.9573170731707316e-06, "loss": 0.0107, "reward": 0.2916666716337204, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.2916666716337204, "rewards/format_reward": 0.0, "step": 107 }, { "completion_length": 883.7500610351562, "epoch": 0.13170731707317074, "grad_norm": 0.20924407243728638, "kl": 0.0108642578125, "learning_rate": 1.9756097560975613e-06, "loss": 0.0053, "reward": 0.2708333432674408, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.2708333432674408, "rewards/format_reward": 0.0, "step": 108 }, { "completion_length": 682.0000305175781, "epoch": 0.1329268292682927, "grad_norm": 0.466841459274292, "kl": 0.05511474609375, "learning_rate": 1.9939024390243905e-06, "loss": 0.0083, "reward": 0.3333333432674408, "reward_std": 0.18042195588350296, "rewards/accuracy_reward": 0.3333333432674408, "rewards/format_reward": 0.0, "step": 109 }, { "completion_length": 902.5, "epoch": 0.13414634146341464, "grad_norm": 0.3656620681285858, "kl": 0.0137939453125, "learning_rate": 2.0121951219512197e-06, "loss": 0.0468, "reward": 0.3541666865348816, "reward_std": 0.18042196333408356, "rewards/accuracy_reward": 0.3541666865348816, "rewards/format_reward": 0.0, "step": 110 }, { "completion_length": 673.4375, "epoch": 0.1353658536585366, "grad_norm": 0.3500896394252777, "kl": 0.01605224609375, "learning_rate": 2.030487804878049e-06, "loss": 0.0208, "reward": 0.4166666716337204, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.4166666716337204, "rewards/format_reward": 0.0, "step": 111 }, { "completion_length": 1001.6666870117188, "epoch": 0.13658536585365855, "grad_norm": 0.30633047223091125, "kl": 0.01153564453125, "learning_rate": 2.048780487804878e-06, "loss": 0.0197, "reward": 0.0833333358168602, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.0833333358168602, "rewards/format_reward": 0.0, "step": 112 }, { "completion_length": 843.1875, "epoch": 0.1378048780487805, "grad_norm": 0.20953886210918427, "kl": 0.014923095703125, "learning_rate": 2.0670731707317072e-06, "loss": 0.0025, "reward": 0.1875000074505806, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.1875000074505806, "rewards/format_reward": 0.0, "step": 113 }, { "completion_length": 1128.1458740234375, "epoch": 0.13902439024390245, "grad_norm": 0.40073439478874207, "kl": 0.012115478515625, "learning_rate": 2.0853658536585364e-06, "loss": 0.0954, "reward": 0.2291666716337204, "reward_std": 0.18042195588350296, "rewards/accuracy_reward": 0.2291666716337204, "rewards/format_reward": 0.0, "step": 114 }, { "completion_length": 1261.875, "epoch": 0.1402439024390244, "grad_norm": 0.06579089909791946, "kl": 0.04351806640625, "learning_rate": 2.1036585365853656e-06, "loss": -0.0005, "reward": 0.1041666716337204, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.1041666716337204, "rewards/format_reward": 0.0, "step": 115 }, { "completion_length": 1249.8541870117188, "epoch": 0.14146341463414633, "grad_norm": 0.22086507081985474, "kl": 0.016204833984375, "learning_rate": 2.1219512195121953e-06, "loss": 0.0223, "reward": 0.2708333432674408, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.2708333432674408, "rewards/format_reward": 0.0, "step": 116 }, { "completion_length": 925.6667175292969, "epoch": 0.14268292682926828, "grad_norm": 0.39552637934684753, "kl": 0.01605224609375, "learning_rate": 2.1402439024390245e-06, "loss": 0.1733, "reward": 0.4375000298023224, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.4375000298023224, "rewards/format_reward": 0.0, "step": 117 }, { "completion_length": 1106.2708740234375, "epoch": 0.14390243902439023, "grad_norm": 0.05041651800274849, "kl": 0.014862060546875, "learning_rate": 2.1585365853658537e-06, "loss": 0.0005, "reward": 0.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.0, "step": 118 }, { "completion_length": 893.0833740234375, "epoch": 0.14512195121951219, "grad_norm": 0.23187892138957977, "kl": 0.01953125, "learning_rate": 2.176829268292683e-06, "loss": 0.0373, "reward": 0.3333333544433117, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.3333333544433117, "rewards/format_reward": 0.0, "step": 119 }, { "completion_length": 1099.8958435058594, "epoch": 0.14634146341463414, "grad_norm": 0.37374091148376465, "kl": 0.017578125, "learning_rate": 2.195121951219512e-06, "loss": 0.1167, "reward": 0.291666679084301, "reward_std": 0.18042195588350296, "rewards/accuracy_reward": 0.291666679084301, "rewards/format_reward": 0.0, "step": 120 }, { "completion_length": 1553.0416870117188, "epoch": 0.1475609756097561, "grad_norm": 0.18386586010456085, "kl": 0.0186767578125, "learning_rate": 2.2134146341463417e-06, "loss": 0.0006, "reward": 0.08333333395421505, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.08333333395421505, "rewards/format_reward": 0.0, "step": 121 }, { "completion_length": 1248.5416870117188, "epoch": 0.14878048780487804, "grad_norm": 0.37695062160491943, "kl": 0.01995849609375, "learning_rate": 2.231707317073171e-06, "loss": 0.2, "reward": 0.1666666716337204, "reward_std": 0.18042195588350296, "rewards/accuracy_reward": 0.1666666716337204, "rewards/format_reward": 0.0, "step": 122 }, { "completion_length": 910.5416870117188, "epoch": 0.15, "grad_norm": 1.0668362379074097, "kl": 0.0341796875, "learning_rate": 2.25e-06, "loss": -0.0113, "reward": 0.3541666716337204, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.3541666716337204, "rewards/format_reward": 0.0, "step": 123 }, { "completion_length": 1080.375, "epoch": 0.15121951219512195, "grad_norm": 0.2541584372520447, "kl": 0.01959228515625, "learning_rate": 2.2682926829268293e-06, "loss": 0.0623, "reward": 0.3541666716337204, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.3541666716337204, "rewards/format_reward": 0.0, "step": 124 }, { "completion_length": 1526.7916870117188, "epoch": 0.1524390243902439, "grad_norm": 0.17064549028873444, "kl": 0.01776123046875, "learning_rate": 2.2865853658536584e-06, "loss": -0.0205, "reward": 0.14583333395421505, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.14583333395421505, "rewards/format_reward": 0.0, "step": 125 }, { "completion_length": 1344.5000610351562, "epoch": 0.15365853658536585, "grad_norm": 0.263281911611557, "kl": 0.019287109375, "learning_rate": 2.304878048780488e-06, "loss": -0.0047, "reward": 0.2916666679084301, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.2916666679084301, "rewards/format_reward": 0.0, "step": 126 }, { "completion_length": 1013.6666870117188, "epoch": 0.1548780487804878, "grad_norm": 0.4602361023426056, "kl": 0.02886962890625, "learning_rate": 2.3231707317073173e-06, "loss": 0.0312, "reward": 0.5208333432674408, "reward_std": 0.21650634706020355, "rewards/accuracy_reward": 0.5208333432674408, "rewards/format_reward": 0.0, "step": 127 }, { "completion_length": 1398.2083740234375, "epoch": 0.15609756097560976, "grad_norm": 0.07369079440832138, "kl": 0.024658203125, "learning_rate": 2.3414634146341465e-06, "loss": 0.0009, "reward": 0.375, "reward_std": 0.0, "rewards/accuracy_reward": 0.375, "rewards/format_reward": 0.0, "step": 128 }, { "completion_length": 1492.4375610351562, "epoch": 0.1573170731707317, "grad_norm": 0.34992021322250366, "kl": 0.02154541015625, "learning_rate": 2.3597560975609757e-06, "loss": 0.0632, "reward": 0.1041666679084301, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.1041666679084301, "rewards/format_reward": 0.0, "step": 129 }, { "completion_length": 1639.6041870117188, "epoch": 0.15853658536585366, "grad_norm": 0.29814255237579346, "kl": 0.02520751953125, "learning_rate": 2.378048780487805e-06, "loss": 0.0961, "reward": 0.2083333432674408, "reward_std": 0.18042195588350296, "rewards/accuracy_reward": 0.2083333432674408, "rewards/format_reward": 0.0, "step": 130 }, { "completion_length": 1588.7500610351562, "epoch": 0.1597560975609756, "grad_norm": 0.2149789184331894, "kl": 0.0281982421875, "learning_rate": 2.3963414634146345e-06, "loss": 0.0012, "reward": 0.1666666716337204, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.1666666716337204, "rewards/format_reward": 0.0, "step": 131 }, { "completion_length": 1835.5833740234375, "epoch": 0.16097560975609757, "grad_norm": 0.14360736310482025, "kl": 0.02545166015625, "learning_rate": 2.4146341463414637e-06, "loss": 0.0702, "reward": 0.1666666679084301, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.1666666679084301, "rewards/format_reward": 0.0, "step": 132 }, { "completion_length": 1631.6458740234375, "epoch": 0.16219512195121952, "grad_norm": 0.28000956773757935, "kl": 0.026123046875, "learning_rate": 2.432926829268293e-06, "loss": 0.0831, "reward": 0.2916666716337204, "reward_std": 0.21650634706020355, "rewards/accuracy_reward": 0.2916666716337204, "rewards/format_reward": 0.0, "step": 133 }, { "completion_length": 1266.3125610351562, "epoch": 0.16341463414634147, "grad_norm": 0.2712150812149048, "kl": 0.0250244140625, "learning_rate": 2.451219512195122e-06, "loss": 0.1113, "reward": 0.5208333432674408, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.5208333432674408, "rewards/format_reward": 0.0, "step": 134 }, { "completion_length": 1264.3541870117188, "epoch": 0.16463414634146342, "grad_norm": 0.32521137595176697, "kl": 0.02850341796875, "learning_rate": 2.4695121951219513e-06, "loss": 0.0752, "reward": 0.4166666716337204, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.4166666716337204, "rewards/format_reward": 0.0, "step": 135 }, { "completion_length": 1071.625, "epoch": 0.16585365853658537, "grad_norm": 0.32249101996421814, "kl": 0.0284423828125, "learning_rate": 2.4878048780487805e-06, "loss": -0.0046, "reward": 0.1666666716337204, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.1666666716337204, "rewards/format_reward": 0.0, "step": 136 }, { "completion_length": 1550.4375610351562, "epoch": 0.16707317073170733, "grad_norm": 0.33831775188446045, "kl": 0.03515625, "learning_rate": 2.5060975609756097e-06, "loss": 0.1029, "reward": 0.3750000149011612, "reward_std": 0.18042195588350296, "rewards/accuracy_reward": 0.3750000149011612, "rewards/format_reward": 0.0, "step": 137 }, { "completion_length": 1375.7500610351562, "epoch": 0.16829268292682928, "grad_norm": 0.19221460819244385, "kl": 0.0257568359375, "learning_rate": 2.524390243902439e-06, "loss": 0.0836, "reward": 0.3750000149011612, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.3750000149011612, "rewards/format_reward": 0.0, "step": 138 }, { "completion_length": 1883.479248046875, "epoch": 0.16951219512195123, "grad_norm": 0.130360409617424, "kl": 0.0352783203125, "learning_rate": 2.5426829268292685e-06, "loss": 0.0352, "reward": 0.08333333395421505, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.08333333395421505, "rewards/format_reward": 0.0, "step": 139 }, { "completion_length": 1811.9791870117188, "epoch": 0.17073170731707318, "grad_norm": 0.39922600984573364, "kl": 0.034912109375, "learning_rate": 2.5609756097560977e-06, "loss": 0.2314, "reward": 0.25, "reward_std": 0.25259073078632355, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.0, "step": 140 }, { "completion_length": 1884.3333740234375, "epoch": 0.1719512195121951, "grad_norm": 0.24000468850135803, "kl": 0.0455322265625, "learning_rate": 2.579268292682927e-06, "loss": 0.0686, "reward": 0.2083333432674408, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.2083333432674408, "rewards/format_reward": 0.0, "step": 141 }, { "completion_length": 2433.4583740234375, "epoch": 0.17317073170731706, "grad_norm": 0.32084038853645325, "kl": 0.053466796875, "learning_rate": 2.597560975609756e-06, "loss": -0.0349, "reward": 0.2083333432674408, "reward_std": 0.10825316607952118, "rewards/accuracy_reward": 0.2083333432674408, "rewards/format_reward": 0.0, "step": 142 }, { "completion_length": 1753.7708740234375, "epoch": 0.174390243902439, "grad_norm": 0.23981675505638123, "kl": 0.0496826171875, "learning_rate": 2.6158536585365853e-06, "loss": 0.1036, "reward": 0.2500000074505806, "reward_std": 0.18042195588350296, "rewards/accuracy_reward": 0.2500000074505806, "rewards/format_reward": 0.0, "step": 143 }, { "completion_length": 1808.3958740234375, "epoch": 0.17560975609756097, "grad_norm": 0.19555623829364777, "kl": 0.088623046875, "learning_rate": 2.634146341463415e-06, "loss": 0.038, "reward": 0.14583333395421505, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.14583333395421505, "rewards/format_reward": 0.0, "step": 144 }, { "completion_length": 1885.5, "epoch": 0.17682926829268292, "grad_norm": 0.17379753291606903, "kl": 0.05810546875, "learning_rate": 2.652439024390244e-06, "loss": 0.0462, "reward": 0.1666666716337204, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.1666666716337204, "rewards/format_reward": 0.0, "step": 145 }, { "completion_length": 1810.5000610351562, "epoch": 0.17804878048780487, "grad_norm": 0.23447324335575104, "kl": 0.048583984375, "learning_rate": 2.6707317073170733e-06, "loss": 0.0426, "reward": 0.3333333432674408, "reward_std": 0.21650634706020355, "rewards/accuracy_reward": 0.3125, "rewards/format_reward": 0.02083333395421505, "step": 146 }, { "completion_length": 1701.0, "epoch": 0.17926829268292682, "grad_norm": 0.2001214176416397, "kl": 0.047119140625, "learning_rate": 2.6890243902439025e-06, "loss": 0.0903, "reward": 0.12500000558793545, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.12500000558793545, "rewards/format_reward": 0.0, "step": 147 }, { "completion_length": 1525.5833740234375, "epoch": 0.18048780487804877, "grad_norm": 0.2016114443540573, "kl": 0.0396728515625, "learning_rate": 2.7073170731707317e-06, "loss": 0.0688, "reward": 0.2708333432674408, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.2708333432674408, "rewards/format_reward": 0.0, "step": 148 }, { "completion_length": 1922.0208740234375, "epoch": 0.18170731707317073, "grad_norm": 0.1354978084564209, "kl": 0.064697265625, "learning_rate": 2.7256097560975613e-06, "loss": 0.0216, "reward": 0.0625, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.0625, "rewards/format_reward": 0.0, "step": 149 }, { "completion_length": 1657.6875610351562, "epoch": 0.18292682926829268, "grad_norm": 0.19032008945941925, "kl": 0.0504150390625, "learning_rate": 2.7439024390243905e-06, "loss": 0.0572, "reward": 0.25, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.0, "step": 150 }, { "completion_length": 1605.3333740234375, "epoch": 0.18414634146341463, "grad_norm": 0.19046162068843842, "kl": 0.032958984375, "learning_rate": 2.7621951219512197e-06, "loss": 0.0846, "reward": 0.1666666716337204, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.1666666716337204, "rewards/format_reward": 0.0, "step": 151 }, { "completion_length": 1913.3541870117188, "epoch": 0.18536585365853658, "grad_norm": 0.15275095403194427, "kl": 0.0401611328125, "learning_rate": 2.780487804878049e-06, "loss": -0.0208, "reward": 0.1666666716337204, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.1666666716337204, "rewards/format_reward": 0.0, "step": 152 }, { "completion_length": 1432.7708740234375, "epoch": 0.18658536585365854, "grad_norm": 0.3081569969654083, "kl": 0.0340576171875, "learning_rate": 2.798780487804878e-06, "loss": 0.0815, "reward": 0.3541666716337204, "reward_std": 0.21650634706020355, "rewards/accuracy_reward": 0.3541666716337204, "rewards/format_reward": 0.0, "step": 153 }, { "completion_length": 1972.2500610351562, "epoch": 0.1878048780487805, "grad_norm": 0.2554978132247925, "kl": 0.037353515625, "learning_rate": 2.8170731707317073e-06, "loss": 0.0533, "reward": 0.0625, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.0625, "rewards/format_reward": 0.0, "step": 154 }, { "completion_length": 1890.2708740234375, "epoch": 0.18902439024390244, "grad_norm": 0.23432564735412598, "kl": 0.0467529296875, "learning_rate": 2.8353658536585365e-06, "loss": 0.0025, "reward": 0.229166679084301, "reward_std": 0.18042195588350296, "rewards/accuracy_reward": 0.229166679084301, "rewards/format_reward": 0.0, "step": 155 }, { "completion_length": 1519.3750610351562, "epoch": 0.1902439024390244, "grad_norm": 0.24794629216194153, "kl": 0.03076171875, "learning_rate": 2.8536585365853657e-06, "loss": 0.1204, "reward": 0.20833333395421505, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.20833333395421505, "rewards/format_reward": 0.0, "step": 156 }, { "completion_length": 1954.375, "epoch": 0.19146341463414634, "grad_norm": 0.14570744335651398, "kl": 0.03106689453125, "learning_rate": 2.8719512195121953e-06, "loss": 0.0312, "reward": 0.1666666716337204, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.1666666716337204, "rewards/format_reward": 0.0, "step": 157 }, { "completion_length": 1679.4375610351562, "epoch": 0.1926829268292683, "grad_norm": 0.23852987587451935, "kl": 0.041015625, "learning_rate": 2.8902439024390245e-06, "loss": -0.0082, "reward": 0.2708333432674408, "reward_std": 0.18042195588350296, "rewards/accuracy_reward": 0.2708333432674408, "rewards/format_reward": 0.0, "step": 158 }, { "completion_length": 1470.916748046875, "epoch": 0.19390243902439025, "grad_norm": 0.2570444941520691, "kl": 0.039306640625, "learning_rate": 2.9085365853658537e-06, "loss": 0.1336, "reward": 0.08333333395421505, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.08333333395421505, "rewards/format_reward": 0.0, "step": 159 }, { "completion_length": 1912.541748046875, "epoch": 0.1951219512195122, "grad_norm": 0.2783234119415283, "kl": 0.0369873046875, "learning_rate": 2.926829268292683e-06, "loss": 0.0958, "reward": 0.1666666716337204, "reward_std": 0.25259073823690414, "rewards/accuracy_reward": 0.1666666716337204, "rewards/format_reward": 0.0, "step": 160 }, { "completion_length": 1547.6458740234375, "epoch": 0.19634146341463415, "grad_norm": 0.1358698457479477, "kl": 0.03521728515625, "learning_rate": 2.945121951219512e-06, "loss": 0.0126, "reward": 0.0416666679084301, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.0416666679084301, "rewards/format_reward": 0.0, "step": 161 }, { "completion_length": 1402.8333740234375, "epoch": 0.1975609756097561, "grad_norm": 0.2525395452976227, "kl": 0.033935546875, "learning_rate": 2.9634146341463417e-06, "loss": -0.0009, "reward": 0.1875000111758709, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.1875000111758709, "rewards/format_reward": 0.0, "step": 162 }, { "completion_length": 1746.6458740234375, "epoch": 0.19878048780487806, "grad_norm": 0.1487884223461151, "kl": 0.034423828125, "learning_rate": 2.981707317073171e-06, "loss": 0.0309, "reward": 0.1666666679084301, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.1666666679084301, "rewards/format_reward": 0.0, "step": 163 }, { "completion_length": 1724.4583740234375, "epoch": 0.2, "grad_norm": 0.2516140937805176, "kl": 0.0372314453125, "learning_rate": 3e-06, "loss": 0.0148, "reward": 0.3333333432674408, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.3333333432674408, "rewards/format_reward": 0.0, "step": 164 }, { "completion_length": 1089.3333740234375, "epoch": 0.20121951219512196, "grad_norm": 0.3022782802581787, "kl": 0.02685546875, "learning_rate": 2.9999966022757497e-06, "loss": 0.1024, "reward": 0.3125000149011612, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.3125000149011612, "rewards/format_reward": 0.0, "step": 165 }, { "completion_length": 1710.1458740234375, "epoch": 0.20243902439024392, "grad_norm": 0.38490527868270874, "kl": 0.0396728515625, "learning_rate": 2.9999864091183917e-06, "loss": 0.1086, "reward": 0.125, "reward_std": 0.18042195588350296, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "step": 166 }, { "completion_length": 1551.6875, "epoch": 0.20365853658536584, "grad_norm": 0.4562043845653534, "kl": 0.0345458984375, "learning_rate": 2.999969420574104e-06, "loss": 0.0898, "reward": 0.2291666716337204, "reward_std": 0.18042195588350296, "rewards/accuracy_reward": 0.2291666716337204, "rewards/format_reward": 0.0, "step": 167 }, { "completion_length": 1459.4166870117188, "epoch": 0.2048780487804878, "grad_norm": 0.21141178905963898, "kl": 0.029296875, "learning_rate": 2.999945636719849e-06, "loss": -0.0465, "reward": 0.1875, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.1875, "rewards/format_reward": 0.0, "step": 168 }, { "completion_length": 1055.6458435058594, "epoch": 0.20609756097560974, "grad_norm": 0.22224751114845276, "kl": 0.02972412109375, "learning_rate": 2.9999150576633756e-06, "loss": 0.109, "reward": 0.4166666865348816, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.4166666865348816, "rewards/format_reward": 0.0, "step": 169 }, { "completion_length": 1462.1875610351562, "epoch": 0.2073170731707317, "grad_norm": 0.17986340820789337, "kl": 0.03564453125, "learning_rate": 2.999877683543216e-06, "loss": 0.0317, "reward": 0.1458333395421505, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.1458333395421505, "rewards/format_reward": 0.0, "step": 170 }, { "completion_length": 1626.3125610351562, "epoch": 0.20853658536585365, "grad_norm": 0.2842521369457245, "kl": 0.0394287109375, "learning_rate": 2.9998335145286857e-06, "loss": 0.1621, "reward": 0.2708333432674408, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.2708333432674408, "rewards/format_reward": 0.0, "step": 171 }, { "completion_length": 1560.8958740234375, "epoch": 0.2097560975609756, "grad_norm": 0.1268962025642395, "kl": 0.037841796875, "learning_rate": 2.999782550819884e-06, "loss": 0.0404, "reward": 0.1041666716337204, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.1041666716337204, "rewards/format_reward": 0.0, "step": 172 }, { "completion_length": 1509.5416870117188, "epoch": 0.21097560975609755, "grad_norm": 0.10690096765756607, "kl": 0.03277587890625, "learning_rate": 2.9997247926476918e-06, "loss": 0.0248, "reward": 0.1666666716337204, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.1666666716337204, "rewards/format_reward": 0.0, "step": 173 }, { "completion_length": 1845.1875, "epoch": 0.2121951219512195, "grad_norm": 0.1804507076740265, "kl": 0.0416259765625, "learning_rate": 2.99966024027377e-06, "loss": 0.0708, "reward": 0.229166679084301, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.229166679084301, "rewards/format_reward": 0.0, "step": 174 }, { "completion_length": 1426.5625610351562, "epoch": 0.21341463414634146, "grad_norm": 0.3114238977432251, "kl": 0.0394287109375, "learning_rate": 2.999588893990561e-06, "loss": 0.0728, "reward": 0.27083333395421505, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.27083333395421505, "rewards/format_reward": 0.0, "step": 175 }, { "completion_length": 2135.7501220703125, "epoch": 0.2146341463414634, "grad_norm": 0.2029908299446106, "kl": 0.0428466796875, "learning_rate": 2.9995107541212846e-06, "loss": 0.1168, "reward": 0.2083333432674408, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.2083333432674408, "rewards/format_reward": 0.0, "step": 176 }, { "completion_length": 1527.5833740234375, "epoch": 0.21585365853658536, "grad_norm": 0.25787416100502014, "kl": 0.0362548828125, "learning_rate": 2.999425821019938e-06, "loss": 0.0446, "reward": 0.1666666679084301, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.1666666679084301, "rewards/format_reward": 0.0, "step": 177 }, { "completion_length": 1889.5208740234375, "epoch": 0.21707317073170732, "grad_norm": 0.2915896475315094, "kl": 0.128173828125, "learning_rate": 2.999334095071293e-06, "loss": 0.1539, "reward": 0.2083333395421505, "reward_std": 0.18042195588350296, "rewards/accuracy_reward": 0.2083333395421505, "rewards/format_reward": 0.0, "step": 178 }, { "completion_length": 1726.104248046875, "epoch": 0.21829268292682927, "grad_norm": 0.24176959693431854, "kl": 0.052978515625, "learning_rate": 2.999235576690896e-06, "loss": 0.0916, "reward": 0.2708333395421505, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.2708333395421505, "rewards/format_reward": 0.0, "step": 179 }, { "completion_length": 1697.166748046875, "epoch": 0.21951219512195122, "grad_norm": 0.22746720910072327, "kl": 0.0643310546875, "learning_rate": 2.9991302663250642e-06, "loss": 0.0431, "reward": 0.2083333432674408, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.2083333432674408, "rewards/format_reward": 0.0, "step": 180 }, { "completion_length": 2614.416748046875, "epoch": 0.22073170731707317, "grad_norm": 0.20286248624324799, "kl": 0.07080078125, "learning_rate": 2.9990181644508856e-06, "loss": 0.0138, "reward": 0.1875000111758709, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.1875000111758709, "rewards/format_reward": 0.0, "step": 181 }, { "completion_length": 2142.1250610351562, "epoch": 0.22195121951219512, "grad_norm": 0.11354032158851624, "kl": 0.06396484375, "learning_rate": 2.9988992715762147e-06, "loss": 0.0467, "reward": 0.1458333358168602, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.1458333358168602, "rewards/format_reward": 0.0, "step": 182 }, { "completion_length": 2264.25, "epoch": 0.22317073170731708, "grad_norm": 0.16217917203903198, "kl": 0.0733642578125, "learning_rate": 2.998773588239673e-06, "loss": 0.0336, "reward": 0.0416666679084301, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.0416666679084301, "rewards/format_reward": 0.0, "step": 183 }, { "completion_length": 1902.916748046875, "epoch": 0.22439024390243903, "grad_norm": 0.1260344237089157, "kl": 0.06201171875, "learning_rate": 2.9986411150106423e-06, "loss": 0.0029, "reward": 0.125, "reward_std": 0.0, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "step": 184 }, { "completion_length": 2245.0833740234375, "epoch": 0.22560975609756098, "grad_norm": 0.24795877933502197, "kl": 0.0927734375, "learning_rate": 2.998501852489266e-06, "loss": 0.0634, "reward": 0.2291666679084301, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.2291666679084301, "rewards/format_reward": 0.0, "step": 185 }, { "completion_length": 2453.1251220703125, "epoch": 0.22682926829268293, "grad_norm": 0.14966094493865967, "kl": 0.0791015625, "learning_rate": 2.9983558013064455e-06, "loss": 0.0485, "reward": 0.1666666716337204, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.1666666716337204, "rewards/format_reward": 0.0, "step": 186 }, { "completion_length": 2534.3751220703125, "epoch": 0.2280487804878049, "grad_norm": 0.15999601781368256, "kl": 0.073974609375, "learning_rate": 2.998202962123836e-06, "loss": 0.119, "reward": 0.2291666716337204, "reward_std": 0.18042195588350296, "rewards/accuracy_reward": 0.2291666716337204, "rewards/format_reward": 0.0, "step": 187 }, { "completion_length": 1823.1250610351562, "epoch": 0.22926829268292684, "grad_norm": 0.1819969266653061, "kl": 0.0517578125, "learning_rate": 2.998043335633845e-06, "loss": 0.0227, "reward": 0.1666666716337204, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.1666666716337204, "rewards/format_reward": 0.0, "step": 188 }, { "completion_length": 2523.875, "epoch": 0.2304878048780488, "grad_norm": 0.1063833013176918, "kl": 0.0523681640625, "learning_rate": 2.997876922559628e-06, "loss": 0.0515, "reward": 0.06250000186264515, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.06250000186264515, "rewards/format_reward": 0.0, "step": 189 }, { "completion_length": 2327.0626220703125, "epoch": 0.23170731707317074, "grad_norm": 0.19144243001937866, "kl": 0.078369140625, "learning_rate": 2.997703723655086e-06, "loss": 0.1176, "reward": 0.2083333358168602, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.2083333358168602, "rewards/format_reward": 0.0, "step": 190 }, { "completion_length": 2001.4375610351562, "epoch": 0.2329268292682927, "grad_norm": 0.1476822942495346, "kl": 0.048583984375, "learning_rate": 2.9975237397048618e-06, "loss": 0.0574, "reward": 0.1666666716337204, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.1666666716337204, "rewards/format_reward": 0.0, "step": 191 }, { "completion_length": 2143.125, "epoch": 0.23414634146341465, "grad_norm": 0.18168318271636963, "kl": 0.0589599609375, "learning_rate": 2.9973369715243363e-06, "loss": 0.0389, "reward": 0.125, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "step": 192 }, { "completion_length": 1624.5208740234375, "epoch": 0.23536585365853657, "grad_norm": 0.36823156476020813, "kl": 0.0450439453125, "learning_rate": 2.997143419959626e-06, "loss": 0.0288, "reward": 0.14583333395421505, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.14583333395421505, "rewards/format_reward": 0.0, "step": 193 }, { "completion_length": 1843.6875, "epoch": 0.23658536585365852, "grad_norm": 14.430148124694824, "kl": 0.1033935546875, "learning_rate": 2.996943085887577e-06, "loss": 0.1265, "reward": 0.1458333395421505, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.1458333395421505, "rewards/format_reward": 0.0, "step": 194 }, { "completion_length": 1235.9583740234375, "epoch": 0.23780487804878048, "grad_norm": 0.27972450852394104, "kl": 0.0362548828125, "learning_rate": 2.9967359702157616e-06, "loss": 0.011, "reward": 0.3541666716337204, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.3541666716337204, "rewards/format_reward": 0.0, "step": 195 }, { "completion_length": 1946.8958740234375, "epoch": 0.23902439024390243, "grad_norm": 0.15726293623447418, "kl": 0.0419921875, "learning_rate": 2.996522073882477e-06, "loss": 0.0123, "reward": 0.1875, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.1875, "rewards/format_reward": 0.0, "step": 196 }, { "completion_length": 1443.791748046875, "epoch": 0.24024390243902438, "grad_norm": 0.400764137506485, "kl": 0.04736328125, "learning_rate": 2.996301397856737e-06, "loss": 0.1831, "reward": 0.3125000149011612, "reward_std": 0.25259073823690414, "rewards/accuracy_reward": 0.3125000149011612, "rewards/format_reward": 0.0, "step": 197 }, { "completion_length": 1856.2083740234375, "epoch": 0.24146341463414633, "grad_norm": 0.3818769156932831, "kl": 0.0526123046875, "learning_rate": 2.9960739431382697e-06, "loss": 0.0235, "reward": 0.2708333432674408, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.2708333432674408, "rewards/format_reward": 0.0, "step": 198 }, { "completion_length": 1427.0416870117188, "epoch": 0.2426829268292683, "grad_norm": 0.17303015291690826, "kl": 0.0465087890625, "learning_rate": 2.9958397107575134e-06, "loss": 0.0248, "reward": 0.2916666716337204, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.2916666716337204, "rewards/format_reward": 0.0, "step": 199 }, { "completion_length": 1394.1875, "epoch": 0.24390243902439024, "grad_norm": 0.15026994049549103, "kl": 0.0545654296875, "learning_rate": 2.9955987017756107e-06, "loss": 0.0144, "reward": 0.1875, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.1875, "rewards/format_reward": 0.0, "step": 200 }, { "completion_length": 1516.7291870117188, "epoch": 0.2451219512195122, "grad_norm": 0.19180533289909363, "kl": 0.04541015625, "learning_rate": 2.9953509172844047e-06, "loss": 0.0813, "reward": 0.2083333358168602, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.2083333358168602, "rewards/format_reward": 0.0, "step": 201 }, { "completion_length": 2201.854248046875, "epoch": 0.24634146341463414, "grad_norm": 0.3112233579158783, "kl": 0.0589599609375, "learning_rate": 2.9950963584064327e-06, "loss": 0.1567, "reward": 0.3125000149011612, "reward_std": 0.28867512196302414, "rewards/accuracy_reward": 0.3125000149011612, "rewards/format_reward": 0.0, "step": 202 }, { "completion_length": 1703.854248046875, "epoch": 0.2475609756097561, "grad_norm": 0.13127818703651428, "kl": 0.058349609375, "learning_rate": 2.9948350262949224e-06, "loss": 0.0892, "reward": 0.1666666716337204, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.1666666716337204, "rewards/format_reward": 0.0, "step": 203 }, { "completion_length": 1960.0001220703125, "epoch": 0.24878048780487805, "grad_norm": 0.22910051047801971, "kl": 0.0498046875, "learning_rate": 2.9945669221337873e-06, "loss": 0.0996, "reward": 0.1458333358168602, "reward_std": 0.18042195588350296, "rewards/accuracy_reward": 0.1458333358168602, "rewards/format_reward": 0.0, "step": 204 }, { "completion_length": 1366.625, "epoch": 0.25, "grad_norm": 0.3151746988296509, "kl": 0.0543212890625, "learning_rate": 2.994292047137618e-06, "loss": 0.0571, "reward": 0.2083333432674408, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.2083333432674408, "rewards/format_reward": 0.0, "step": 205 }, { "completion_length": 1032.7916870117188, "epoch": 0.25121951219512195, "grad_norm": 0.35160189867019653, "kl": 1.169921875, "learning_rate": 2.994010402551682e-06, "loss": 0.1593, "reward": 0.4375000149011612, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.4375000149011612, "rewards/format_reward": 0.0, "step": 206 }, { "completion_length": 1767.3126220703125, "epoch": 0.2524390243902439, "grad_norm": 0.3725665211677551, "kl": 0.05078125, "learning_rate": 2.993721989651913e-06, "loss": 0.1003, "reward": 0.1250000037252903, "reward_std": 0.18042195588350296, "rewards/accuracy_reward": 0.1250000037252903, "rewards/format_reward": 0.0, "step": 207 }, { "completion_length": 1163.9375305175781, "epoch": 0.25365853658536586, "grad_norm": 0.24222548305988312, "kl": 0.0523681640625, "learning_rate": 2.9934268097449068e-06, "loss": -0.0101, "reward": 0.2083333432674408, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.2083333432674408, "rewards/format_reward": 0.0, "step": 208 }, { "completion_length": 1104.6666870117188, "epoch": 0.2548780487804878, "grad_norm": 0.2728269696235657, "kl": 0.04693603515625, "learning_rate": 2.9931248641679173e-06, "loss": 0.0744, "reward": 0.3750000149011612, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.3750000149011612, "rewards/format_reward": 0.0, "step": 209 }, { "completion_length": 1256.5833740234375, "epoch": 0.25609756097560976, "grad_norm": 0.4811817705631256, "kl": 0.0521240234375, "learning_rate": 2.9928161542888487e-06, "loss": 0.1457, "reward": 0.2291666679084301, "reward_std": 0.18042195588350296, "rewards/accuracy_reward": 0.2291666679084301, "rewards/format_reward": 0.0, "step": 210 }, { "completion_length": 1646.5000610351562, "epoch": 0.2573170731707317, "grad_norm": 0.1947222501039505, "kl": 0.0662841796875, "learning_rate": 2.9925006815062483e-06, "loss": -0.0073, "reward": 0.14583333395421505, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.14583333395421505, "rewards/format_reward": 0.0, "step": 211 }, { "completion_length": 1581.0, "epoch": 0.25853658536585367, "grad_norm": 0.25163155794143677, "kl": 0.087890625, "learning_rate": 2.9921784472493023e-06, "loss": 0.0921, "reward": 0.2083333432674408, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.2083333432674408, "rewards/format_reward": 0.0, "step": 212 }, { "completion_length": 1567.416748046875, "epoch": 0.2597560975609756, "grad_norm": 1.025305986404419, "kl": 0.1005859375, "learning_rate": 2.9918494529778285e-06, "loss": 0.1623, "reward": 0.2708333432674408, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.2708333432674408, "rewards/format_reward": 0.0, "step": 213 }, { "completion_length": 1868.979248046875, "epoch": 0.26097560975609757, "grad_norm": 0.30298322439193726, "kl": 0.103271484375, "learning_rate": 2.9915137001822686e-06, "loss": 0.088, "reward": 0.2083333358168602, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.2083333358168602, "rewards/format_reward": 0.0, "step": 214 }, { "completion_length": 1856.7083740234375, "epoch": 0.2621951219512195, "grad_norm": 0.24291731417179108, "kl": 0.084716796875, "learning_rate": 2.9911711903836845e-06, "loss": 0.147, "reward": 0.3541666716337204, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.3541666716337204, "rewards/format_reward": 0.0, "step": 215 }, { "completion_length": 1902.1251220703125, "epoch": 0.2634146341463415, "grad_norm": 0.20493283867835999, "kl": 0.11572265625, "learning_rate": 2.9908219251337465e-06, "loss": 0.0579, "reward": 0.2083333358168602, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.2083333358168602, "rewards/format_reward": 0.0, "step": 216 }, { "completion_length": 1532.6875610351562, "epoch": 0.2646341463414634, "grad_norm": 0.20061595737934113, "kl": 0.0703125, "learning_rate": 2.9904659060147314e-06, "loss": 0.1154, "reward": 0.2500000074505806, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.2500000074505806, "rewards/format_reward": 0.0, "step": 217 }, { "completion_length": 2503.8751220703125, "epoch": 0.2658536585365854, "grad_norm": 0.2270926684141159, "kl": 0.108154296875, "learning_rate": 2.9901031346395125e-06, "loss": 0.0054, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "step": 218 }, { "completion_length": 2234.729248046875, "epoch": 0.26707317073170733, "grad_norm": 0.25283458828926086, "kl": 0.095703125, "learning_rate": 2.9897336126515525e-06, "loss": 0.0788, "reward": 0.1875, "reward_std": 0.10825318098068237, "rewards/accuracy_reward": 0.1875, "rewards/format_reward": 0.0, "step": 219 }, { "completion_length": 2447.541748046875, "epoch": 0.2682926829268293, "grad_norm": 0.12416581809520721, "kl": 0.091064453125, "learning_rate": 2.9893573417248957e-06, "loss": 0.1125, "reward": 0.1250000037252903, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.1250000037252903, "rewards/format_reward": 0.0, "step": 220 }, { "completion_length": 2169.1458740234375, "epoch": 0.26951219512195124, "grad_norm": 0.18030035495758057, "kl": 0.096435546875, "learning_rate": 2.9889743235641627e-06, "loss": 0.1243, "reward": 0.2708333432674408, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.2708333432674408, "rewards/format_reward": 0.0, "step": 221 }, { "completion_length": 2030.125, "epoch": 0.2707317073170732, "grad_norm": 0.1275298297405243, "kl": 0.0579833984375, "learning_rate": 2.98858455990454e-06, "loss": 0.1023, "reward": 0.4166666865348816, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.4166666865348816, "rewards/format_reward": 0.0, "step": 222 }, { "completion_length": 2457.625, "epoch": 0.27195121951219514, "grad_norm": 0.16787779331207275, "kl": 0.083984375, "learning_rate": 2.988188052511774e-06, "loss": 0.0452, "reward": 0.1875, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.1875, "rewards/format_reward": 0.0, "step": 223 }, { "completion_length": 1908.6875, "epoch": 0.2731707317073171, "grad_norm": 0.13521677255630493, "kl": 0.06396484375, "learning_rate": 2.987784803182161e-06, "loss": 0.1097, "reward": 0.2500000149011612, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.2500000149011612, "rewards/format_reward": 0.0, "step": 224 }, { "completion_length": 1931.041748046875, "epoch": 0.27439024390243905, "grad_norm": 0.13458870351314545, "kl": 0.052978515625, "learning_rate": 2.9873748137425413e-06, "loss": 0.0445, "reward": 0.14583333395421505, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.14583333395421505, "rewards/format_reward": 0.0, "step": 225 }, { "completion_length": 1825.8958740234375, "epoch": 0.275609756097561, "grad_norm": 0.1985807865858078, "kl": 0.0548095703125, "learning_rate": 2.9869580860502894e-06, "loss": -0.0369, "reward": 0.1041666679084301, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.1041666679084301, "rewards/format_reward": 0.0, "step": 226 }, { "completion_length": 2253.3126220703125, "epoch": 0.27682926829268295, "grad_norm": 0.17767678201198578, "kl": 0.0694580078125, "learning_rate": 2.986534621993307e-06, "loss": 0.0372, "reward": 0.2291666716337204, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.2291666716337204, "rewards/format_reward": 0.0, "step": 227 }, { "completion_length": 2764.229248046875, "epoch": 0.2780487804878049, "grad_norm": 15.245186805725098, "kl": 0.1453857421875, "learning_rate": 2.9861044234900125e-06, "loss": 0.0564, "reward": 0.2291666716337204, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.2291666716337204, "rewards/format_reward": 0.0, "step": 228 }, { "completion_length": 2271.979248046875, "epoch": 0.27926829268292686, "grad_norm": 0.09283185750246048, "kl": 0.0518798828125, "learning_rate": 2.9856674924893338e-06, "loss": 0.0027, "reward": 0.1875, "reward_std": 0.0, "rewards/accuracy_reward": 0.1875, "rewards/format_reward": 0.0, "step": 229 }, { "completion_length": 2877.854248046875, "epoch": 0.2804878048780488, "grad_norm": 0.1514541357755661, "kl": 0.069580078125, "learning_rate": 2.985223830970699e-06, "loss": 0.0237, "reward": 0.08333333395421505, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.08333333395421505, "rewards/format_reward": 0.0, "step": 230 }, { "completion_length": 1448.3958740234375, "epoch": 0.2817073170731707, "grad_norm": 0.14002852141857147, "kl": 0.042724609375, "learning_rate": 2.984773440944027e-06, "loss": 0.0612, "reward": 0.1666666716337204, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.1666666716337204, "rewards/format_reward": 0.0, "step": 231 }, { "completion_length": 1860.7708740234375, "epoch": 0.28292682926829266, "grad_norm": 0.15355679392814636, "kl": 0.0489501953125, "learning_rate": 2.98431632444972e-06, "loss": 0.0409, "reward": 0.1666666716337204, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.1458333432674408, "rewards/format_reward": 0.02083333395421505, "step": 232 }, { "completion_length": 1933.5625610351562, "epoch": 0.2841463414634146, "grad_norm": 0.19013522565364838, "kl": 0.05322265625, "learning_rate": 2.983852483558652e-06, "loss": 0.052, "reward": 0.16666667722165585, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.16666667722165585, "rewards/format_reward": 0.0, "step": 233 }, { "completion_length": 2254.729248046875, "epoch": 0.28536585365853656, "grad_norm": 0.13274292647838593, "kl": 0.0518798828125, "learning_rate": 2.9833819203721614e-06, "loss": 0.006, "reward": 0.0416666679084301, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.0416666679084301, "rewards/format_reward": 0.0, "step": 234 }, { "completion_length": 2080.479248046875, "epoch": 0.2865853658536585, "grad_norm": 0.22805815935134888, "kl": 0.0595703125, "learning_rate": 2.98290463702204e-06, "loss": 0.0451, "reward": 0.2500000074505806, "reward_std": 0.18042195588350296, "rewards/accuracy_reward": 0.2500000074505806, "rewards/format_reward": 0.0, "step": 235 }, { "completion_length": 1452.0625610351562, "epoch": 0.28780487804878047, "grad_norm": 0.08134093880653381, "kl": 0.03009033203125, "learning_rate": 2.982420635670523e-06, "loss": 0.0242, "reward": 0.20833333395421505, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.20833333395421505, "rewards/format_reward": 0.0, "step": 236 }, { "completion_length": 1564.104248046875, "epoch": 0.2890243902439024, "grad_norm": 0.22132782638072968, "kl": 0.0430908203125, "learning_rate": 2.9819299185102824e-06, "loss": 0.0289, "reward": 0.2916666716337204, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.2916666716337204, "rewards/format_reward": 0.0, "step": 237 }, { "completion_length": 1978.8958740234375, "epoch": 0.29024390243902437, "grad_norm": 0.16645598411560059, "kl": 0.0439453125, "learning_rate": 2.981432487764413e-06, "loss": 0.0733, "reward": 0.0625, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.0625, "rewards/format_reward": 0.0, "step": 238 }, { "completion_length": 2122.729248046875, "epoch": 0.2914634146341463, "grad_norm": 0.14445273578166962, "kl": 0.03466796875, "learning_rate": 2.9809283456864257e-06, "loss": 0.0442, "reward": 0.06250000186264515, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.06250000186264515, "rewards/format_reward": 0.0, "step": 239 }, { "completion_length": 1627.7083740234375, "epoch": 0.2926829268292683, "grad_norm": 0.2052607536315918, "kl": 0.0322265625, "learning_rate": 2.980417494560234e-06, "loss": 0.0442, "reward": 0.06250000186264515, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.06250000186264515, "rewards/format_reward": 0.0, "step": 240 }, { "completion_length": 1845.2500610351562, "epoch": 0.2939024390243902, "grad_norm": 0.2668360471725464, "kl": 0.040283203125, "learning_rate": 2.9798999367001467e-06, "loss": 0.1107, "reward": 0.3125000149011612, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.3125000149011612, "rewards/format_reward": 0.0, "step": 241 }, { "completion_length": 1981.5833740234375, "epoch": 0.2951219512195122, "grad_norm": 0.7613481879234314, "kl": 0.048828125, "learning_rate": 2.979375674450855e-06, "loss": 0.0295, "reward": 0.1666666716337204, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.1666666716337204, "rewards/format_reward": 0.0, "step": 242 }, { "completion_length": 1472.5208740234375, "epoch": 0.29634146341463413, "grad_norm": 0.23411507904529572, "kl": 0.0472412109375, "learning_rate": 2.9788447101874246e-06, "loss": 0.1042, "reward": 0.2916666716337204, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.2916666716337204, "rewards/format_reward": 0.0, "step": 243 }, { "completion_length": 1375.7916870117188, "epoch": 0.2975609756097561, "grad_norm": 0.26672086119651794, "kl": 0.089111328125, "learning_rate": 2.9783070463152816e-06, "loss": 0.1359, "reward": 0.3333333432674408, "reward_std": 0.21650634706020355, "rewards/accuracy_reward": 0.3333333432674408, "rewards/format_reward": 0.0, "step": 244 }, { "completion_length": 1839.1041870117188, "epoch": 0.29878048780487804, "grad_norm": 0.16482682526111603, "kl": 0.0364990234375, "learning_rate": 2.977762685270205e-06, "loss": -0.0134, "reward": 0.125, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "step": 245 }, { "completion_length": 1403.2708740234375, "epoch": 0.3, "grad_norm": 0.19812476634979248, "kl": 0.038818359375, "learning_rate": 2.9772116295183124e-06, "loss": 0.0392, "reward": 0.1666666716337204, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.1666666716337204, "rewards/format_reward": 0.0, "step": 246 }, { "completion_length": 1905.0208740234375, "epoch": 0.30121951219512194, "grad_norm": 0.22495272755622864, "kl": 0.04345703125, "learning_rate": 2.976653881556051e-06, "loss": 0.0419, "reward": 0.2083333432674408, "reward_std": 0.18042195588350296, "rewards/accuracy_reward": 0.2083333432674408, "rewards/format_reward": 0.0, "step": 247 }, { "completion_length": 2164.979248046875, "epoch": 0.3024390243902439, "grad_norm": 0.22480511665344238, "kl": 0.049560546875, "learning_rate": 2.9760894439101857e-06, "loss": 0.1341, "reward": 0.1666666716337204, "reward_std": 0.21650634706020355, "rewards/accuracy_reward": 0.1666666716337204, "rewards/format_reward": 0.0, "step": 248 }, { "completion_length": 1758.3334350585938, "epoch": 0.30365853658536585, "grad_norm": 0.22447431087493896, "kl": 0.065673828125, "learning_rate": 2.9755183191377888e-06, "loss": 0.0443, "reward": 0.291666679084301, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.291666679084301, "rewards/format_reward": 0.0, "step": 249 }, { "completion_length": 1800.1666870117188, "epoch": 0.3048780487804878, "grad_norm": 0.16171671450138092, "kl": 0.0455322265625, "learning_rate": 2.974940509826225e-06, "loss": 0.069, "reward": 0.14583333395421505, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.14583333395421505, "rewards/format_reward": 0.0, "step": 250 }, { "completion_length": 1399.479248046875, "epoch": 0.30609756097560975, "grad_norm": 0.2546021342277527, "kl": 0.046142578125, "learning_rate": 2.9743560185931443e-06, "loss": 0.1156, "reward": 0.2916666679084301, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.2916666679084301, "rewards/format_reward": 0.0, "step": 251 }, { "completion_length": 1716.0000610351562, "epoch": 0.3073170731707317, "grad_norm": 0.30597400665283203, "kl": 0.047607421875, "learning_rate": 2.973764848086466e-06, "loss": 0.1777, "reward": 0.2083333358168602, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.2083333358168602, "rewards/format_reward": 0.0, "step": 252 }, { "completion_length": 1833.5625610351562, "epoch": 0.30853658536585366, "grad_norm": 0.9375303983688354, "kl": 0.0728759765625, "learning_rate": 2.9731670009843704e-06, "loss": 0.0782, "reward": 0.20833333395421505, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.20833333395421505, "rewards/format_reward": 0.0, "step": 253 }, { "completion_length": 1342.0833740234375, "epoch": 0.3097560975609756, "grad_norm": 0.16923321783542633, "kl": 0.0413818359375, "learning_rate": 2.9725624799952824e-06, "loss": -0.0121, "reward": 0.3333333432674408, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.3333333432674408, "rewards/format_reward": 0.0, "step": 254 }, { "completion_length": 1430.1458740234375, "epoch": 0.31097560975609756, "grad_norm": 0.23225951194763184, "kl": 0.1304931640625, "learning_rate": 2.971951287857863e-06, "loss": 0.1052, "reward": 0.1875000074505806, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.1875000074505806, "rewards/format_reward": 0.0, "step": 255 }, { "completion_length": 1492.25, "epoch": 0.3121951219512195, "grad_norm": 0.2100754827260971, "kl": 0.043701171875, "learning_rate": 2.9713334273409965e-06, "loss": -0.0086, "reward": 0.16666667722165585, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.16666667722165585, "rewards/format_reward": 0.0, "step": 256 }, { "completion_length": 1829.9375, "epoch": 0.31341463414634146, "grad_norm": 0.1497432142496109, "kl": 0.0477294921875, "learning_rate": 2.970708901243774e-06, "loss": 0.0141, "reward": 0.1875000074505806, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.1875000074505806, "rewards/format_reward": 0.0, "step": 257 }, { "completion_length": 1750.375, "epoch": 0.3146341463414634, "grad_norm": 0.2294205278158188, "kl": 0.0540771484375, "learning_rate": 2.9700777123954867e-06, "loss": 0.0912, "reward": 0.229166679084301, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.229166679084301, "rewards/format_reward": 0.0, "step": 258 }, { "completion_length": 1672.875, "epoch": 0.31585365853658537, "grad_norm": 0.10803260654211044, "kl": 0.0452880859375, "learning_rate": 2.969439863655608e-06, "loss": 0.0477, "reward": 0.2916666865348816, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.2916666865348816, "rewards/format_reward": 0.0, "step": 259 }, { "completion_length": 1613.25, "epoch": 0.3170731707317073, "grad_norm": 0.21779678761959076, "kl": 0.047119140625, "learning_rate": 2.968795357913784e-06, "loss": 0.0895, "reward": 0.0833333358168602, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.0833333358168602, "rewards/format_reward": 0.0, "step": 260 }, { "completion_length": 1799.6458740234375, "epoch": 0.3182926829268293, "grad_norm": 0.1968408077955246, "kl": 0.0570068359375, "learning_rate": 2.968144198089819e-06, "loss": 0.0895, "reward": 0.3125000149011612, "reward_std": 0.18042195588350296, "rewards/accuracy_reward": 0.3125000149011612, "rewards/format_reward": 0.0, "step": 261 }, { "completion_length": 1247.1666870117188, "epoch": 0.3195121951219512, "grad_norm": 0.24609629809856415, "kl": 0.04443359375, "learning_rate": 2.9674863871336603e-06, "loss": 0.0207, "reward": 0.12500000558793545, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.12500000558793545, "rewards/format_reward": 0.0, "step": 262 }, { "completion_length": 1346.0833740234375, "epoch": 0.3207317073170732, "grad_norm": 0.2786525785923004, "kl": 0.049560546875, "learning_rate": 2.966821928025389e-06, "loss": 0.08, "reward": 0.2083333358168602, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.2083333358168602, "rewards/format_reward": 0.0, "step": 263 }, { "completion_length": 1544.7916870117188, "epoch": 0.32195121951219513, "grad_norm": 0.2146446853876114, "kl": 0.077392578125, "learning_rate": 2.9661508237752034e-06, "loss": 0.161, "reward": 0.1041666716337204, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.1041666716337204, "rewards/format_reward": 0.0, "step": 264 }, { "completion_length": 2214.666748046875, "epoch": 0.3231707317073171, "grad_norm": 0.1608172208070755, "kl": 0.0654296875, "learning_rate": 2.9654730774234067e-06, "loss": 0.0672, "reward": 0.1458333432674408, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.1458333432674408, "rewards/format_reward": 0.0, "step": 265 }, { "completion_length": 1518.7708740234375, "epoch": 0.32439024390243903, "grad_norm": 0.1421271115541458, "kl": 0.0408935546875, "learning_rate": 2.9647886920403916e-06, "loss": 0.0143, "reward": 0.229166679084301, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.229166679084301, "rewards/format_reward": 0.0, "step": 266 }, { "completion_length": 2144.541748046875, "epoch": 0.325609756097561, "grad_norm": 0.17404739558696747, "kl": 0.06689453125, "learning_rate": 2.9640976707266297e-06, "loss": 0.1098, "reward": 0.2500000149011612, "reward_std": 0.14433757215738297, "rewards/accuracy_reward": 0.2500000149011612, "rewards/format_reward": 0.0, "step": 267 }, { "completion_length": 2003.4375610351562, "epoch": 0.32682926829268294, "grad_norm": 0.319216787815094, "kl": 0.067138671875, "learning_rate": 2.9634000166126534e-06, "loss": 0.0536, "reward": 0.125, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "step": 268 }, { "completion_length": 2589.3333740234375, "epoch": 0.3280487804878049, "grad_norm": 0.12199385464191437, "kl": 0.06494140625, "learning_rate": 2.962695732859045e-06, "loss": 0.0347, "reward": 0.10416666977107525, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.10416666977107525, "rewards/format_reward": 0.0, "step": 269 }, { "completion_length": 1742.3958740234375, "epoch": 0.32926829268292684, "grad_norm": 0.12943227589130402, "kl": 0.0618896484375, "learning_rate": 2.9619848226564196e-06, "loss": 0.0715, "reward": 0.14583333395421505, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.14583333395421505, "rewards/format_reward": 0.0, "step": 270 }, { "completion_length": 2020.3959350585938, "epoch": 0.3304878048780488, "grad_norm": 0.11941098421812057, "kl": 0.0556640625, "learning_rate": 2.961267289225414e-06, "loss": 0.118, "reward": 0.1458333432674408, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.1458333432674408, "rewards/format_reward": 0.0, "step": 271 }, { "completion_length": 2024.3958740234375, "epoch": 0.33170731707317075, "grad_norm": 0.18178167939186096, "kl": 0.0521240234375, "learning_rate": 2.9605431358166687e-06, "loss": 0.1238, "reward": 0.2500000149011612, "reward_std": 0.18042195588350296, "rewards/accuracy_reward": 0.2500000149011612, "rewards/format_reward": 0.0, "step": 272 }, { "completion_length": 1690.4166870117188, "epoch": 0.3329268292682927, "grad_norm": 0.10511098057031631, "kl": 0.05224609375, "learning_rate": 2.959812365710815e-06, "loss": 0.0463, "reward": 0.1666666716337204, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.1666666716337204, "rewards/format_reward": 0.0, "step": 273 }, { "completion_length": 1778.8750610351562, "epoch": 0.33414634146341465, "grad_norm": 0.14942613244056702, "kl": 0.0577392578125, "learning_rate": 2.9590749822184602e-06, "loss": 0.0831, "reward": 0.2916666716337204, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.2916666716337204, "rewards/format_reward": 0.0, "step": 274 }, { "completion_length": 1326.5208740234375, "epoch": 0.3353658536585366, "grad_norm": 0.12064136564731598, "kl": 0.0697021484375, "learning_rate": 2.958330988680172e-06, "loss": 0.0327, "reward": 0.1458333358168602, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.1458333358168602, "rewards/format_reward": 0.0, "step": 275 }, { "completion_length": 1679.1041870117188, "epoch": 0.33658536585365856, "grad_norm": 0.28148481249809265, "kl": 0.0535888671875, "learning_rate": 2.9575803884664634e-06, "loss": -0.0058, "reward": 0.229166679084301, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.229166679084301, "rewards/format_reward": 0.0, "step": 276 }, { "completion_length": 1548.7708740234375, "epoch": 0.3378048780487805, "grad_norm": 0.21928665041923523, "kl": 0.056396484375, "learning_rate": 2.9568231849777785e-06, "loss": 0.132, "reward": 0.2708333432674408, "reward_std": 0.14433755725622177, "rewards/accuracy_reward": 0.2708333432674408, "rewards/format_reward": 0.0, "step": 277 }, { "completion_length": 1741.3333740234375, "epoch": 0.33902439024390246, "grad_norm": 0.2810451090335846, "kl": 0.0496826171875, "learning_rate": 2.9560593816444746e-06, "loss": 0.0657, "reward": 0.10416666977107525, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.10416666977107525, "rewards/format_reward": 0.0, "step": 278 }, { "completion_length": 1726.1250610351562, "epoch": 0.3402439024390244, "grad_norm": 0.18711470067501068, "kl": 0.058837890625, "learning_rate": 2.9552889819268095e-06, "loss": 0.1002, "reward": 0.2291666716337204, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.2291666716337204, "rewards/format_reward": 0.0, "step": 279 }, { "completion_length": 1876.25, "epoch": 0.34146341463414637, "grad_norm": 0.19467337429523468, "kl": 0.0528564453125, "learning_rate": 2.9545119893149243e-06, "loss": 0.0811, "reward": 0.2708333432674408, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.2708333432674408, "rewards/format_reward": 0.0, "step": 280 }, { "completion_length": 1538.7083740234375, "epoch": 0.3426829268292683, "grad_norm": 0.22520393133163452, "kl": 0.0545654296875, "learning_rate": 2.953728407328828e-06, "loss": -0.0122, "reward": 0.14583333395421505, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.14583333395421505, "rewards/format_reward": 0.0, "step": 281 }, { "completion_length": 1224.6666870117188, "epoch": 0.3439024390243902, "grad_norm": 0.25890904664993286, "kl": 0.050537109375, "learning_rate": 2.9529382395183812e-06, "loss": 0.11, "reward": 0.2500000149011612, "reward_std": 0.18042195588350296, "rewards/accuracy_reward": 0.2500000149011612, "rewards/format_reward": 0.0, "step": 282 }, { "completion_length": 1285.6875, "epoch": 0.34512195121951217, "grad_norm": 0.2045072466135025, "kl": 0.0509033203125, "learning_rate": 2.9521414894632797e-06, "loss": 0.1145, "reward": 0.20833333395421505, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.20833333395421505, "rewards/format_reward": 0.0, "step": 283 }, { "completion_length": 1388.1666870117188, "epoch": 0.3463414634146341, "grad_norm": 0.403850793838501, "kl": 0.0540771484375, "learning_rate": 2.9513381607730403e-06, "loss": 0.1234, "reward": 0.2291666716337204, "reward_std": 0.18042194843292236, "rewards/accuracy_reward": 0.2291666716337204, "rewards/format_reward": 0.0, "step": 284 }, { "completion_length": 1427.4583740234375, "epoch": 0.3475609756097561, "grad_norm": 0.19482865929603577, "kl": 0.0474853515625, "learning_rate": 2.9505282570869825e-06, "loss": 0.0038, "reward": 0.2916666716337204, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.2708333432674408, "rewards/format_reward": 0.02083333395421505, "step": 285 }, { "completion_length": 1893.9583740234375, "epoch": 0.348780487804878, "grad_norm": 0.1747361272573471, "kl": 0.06884765625, "learning_rate": 2.949711782074211e-06, "loss": 0.0203, "reward": 0.10416666977107525, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.10416666977107525, "rewards/format_reward": 0.0, "step": 286 }, { "completion_length": 1326.8541870117188, "epoch": 0.35, "grad_norm": 0.05849047005176544, "kl": 0.039306640625, "learning_rate": 2.9488887394336023e-06, "loss": 0.0019, "reward": 0.1875, "reward_std": 0.0, "rewards/accuracy_reward": 0.1875, "rewards/format_reward": 0.0, "step": 287 }, { "completion_length": 1374.8541870117188, "epoch": 0.35121951219512193, "grad_norm": 0.11791527271270752, "kl": 0.05419921875, "learning_rate": 2.948059132893786e-06, "loss": 0.0396, "reward": 0.2083333358168602, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.2083333358168602, "rewards/format_reward": 0.0, "step": 288 }, { "completion_length": 1827.1876220703125, "epoch": 0.3524390243902439, "grad_norm": 0.1812276691198349, "kl": 0.0489501953125, "learning_rate": 2.947222966213127e-06, "loss": 0.0809, "reward": 0.0833333358168602, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.0833333358168602, "rewards/format_reward": 0.0, "step": 289 }, { "completion_length": 1512.666748046875, "epoch": 0.35365853658536583, "grad_norm": 0.19956235587596893, "kl": 0.0435791015625, "learning_rate": 2.9463802431797115e-06, "loss": 0.0176, "reward": 0.2500000149011612, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.2500000149011612, "rewards/format_reward": 0.0, "step": 290 }, { "completion_length": 2007.2083740234375, "epoch": 0.3548780487804878, "grad_norm": 0.20655225217342377, "kl": 0.06982421875, "learning_rate": 2.945530967611326e-06, "loss": 0.097, "reward": 0.1666666716337204, "reward_std": 0.18042195588350296, "rewards/accuracy_reward": 0.1666666716337204, "rewards/format_reward": 0.0, "step": 291 }, { "completion_length": 1623.729248046875, "epoch": 0.35609756097560974, "grad_norm": 0.23635146021842957, "kl": 0.05078125, "learning_rate": 2.9446751433554426e-06, "loss": 0.1396, "reward": 0.3125000149011612, "reward_std": 0.21650634706020355, "rewards/accuracy_reward": 0.3125000149011612, "rewards/format_reward": 0.0, "step": 292 }, { "completion_length": 1586.3958740234375, "epoch": 0.3573170731707317, "grad_norm": 0.14283964037895203, "kl": 0.05224609375, "learning_rate": 2.9438127742892012e-06, "loss": 0.0841, "reward": 0.125, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "step": 293 }, { "completion_length": 1515.8958740234375, "epoch": 0.35853658536585364, "grad_norm": 0.10759755223989487, "kl": 0.063232421875, "learning_rate": 2.942943864319392e-06, "loss": 0.0024, "reward": 0.2083333358168602, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.2083333358168602, "rewards/format_reward": 0.0, "step": 294 }, { "completion_length": 1364.5416870117188, "epoch": 0.3597560975609756, "grad_norm": 0.28814077377319336, "kl": 0.04296875, "learning_rate": 2.9420684173824365e-06, "loss": 0.0919, "reward": 0.229166679084301, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.229166679084301, "rewards/format_reward": 0.0, "step": 295 }, { "completion_length": 1646.8541870117188, "epoch": 0.36097560975609755, "grad_norm": 0.10829178243875504, "kl": 0.0501708984375, "learning_rate": 2.941186437444372e-06, "loss": 0.0144, "reward": 0.0416666679084301, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.0416666679084301, "rewards/format_reward": 0.0, "step": 296 }, { "completion_length": 953.2708740234375, "epoch": 0.3621951219512195, "grad_norm": 0.17340049147605896, "kl": 0.0362548828125, "learning_rate": 2.94029792850083e-06, "loss": -0.0052, "reward": 0.1666666716337204, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.1666666716337204, "rewards/format_reward": 0.0, "step": 297 }, { "completion_length": 1209.0833740234375, "epoch": 0.36341463414634145, "grad_norm": 0.22808800637722015, "kl": 0.03759765625, "learning_rate": 2.939402894577022e-06, "loss": 0.0929, "reward": 0.2708333432674408, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.2708333432674408, "rewards/format_reward": 0.0, "step": 298 }, { "completion_length": 1447.6041870117188, "epoch": 0.3646341463414634, "grad_norm": 0.14471293985843658, "kl": 0.0372314453125, "learning_rate": 2.9385013397277197e-06, "loss": 0.0402, "reward": 0.29166667722165585, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.29166667722165585, "rewards/format_reward": 0.0, "step": 299 }, { "completion_length": 1879.5833740234375, "epoch": 0.36585365853658536, "grad_norm": 0.17423301935195923, "kl": 0.05810546875, "learning_rate": 2.9375932680372358e-06, "loss": 0.0222, "reward": 0.2500000149011612, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.2500000149011612, "rewards/format_reward": 0.0, "step": 300 }, { "completion_length": 1804.0416870117188, "epoch": 0.3670731707317073, "grad_norm": 0.1541019231081009, "kl": 0.0484619140625, "learning_rate": 2.936678683619407e-06, "loss": -0.0067, "reward": 0.1666666716337204, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.1458333358168602, "rewards/format_reward": 0.02083333395421505, "step": 301 }, { "completion_length": 1366.2500610351562, "epoch": 0.36829268292682926, "grad_norm": 0.12781226634979248, "kl": 0.04443359375, "learning_rate": 2.935757590617574e-06, "loss": 0.0558, "reward": 0.2083333395421505, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.2083333395421505, "rewards/format_reward": 0.0, "step": 302 }, { "completion_length": 1225.9583740234375, "epoch": 0.3695121951219512, "grad_norm": 0.23931077122688293, "kl": 0.03948974609375, "learning_rate": 2.9348299932045632e-06, "loss": 0.1285, "reward": 0.2083333358168602, "reward_std": 0.14433755725622177, "rewards/accuracy_reward": 0.2083333358168602, "rewards/format_reward": 0.0, "step": 303 }, { "completion_length": 1403.4166870117188, "epoch": 0.37073170731707317, "grad_norm": 0.20454244315624237, "kl": 0.0455322265625, "learning_rate": 2.9338958955826685e-06, "loss": 0.1692, "reward": 0.2291666716337204, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.2291666716337204, "rewards/format_reward": 0.0, "step": 304 }, { "completion_length": 1516.9791870117188, "epoch": 0.3719512195121951, "grad_norm": 0.3296282887458801, "kl": 0.03857421875, "learning_rate": 2.932955301983631e-06, "loss": -0.0222, "reward": 0.1875, "reward_std": 0.18042195588350296, "rewards/accuracy_reward": 0.1875, "rewards/format_reward": 0.0, "step": 305 }, { "completion_length": 1778.7916870117188, "epoch": 0.37317073170731707, "grad_norm": 0.16183285415172577, "kl": 0.0552978515625, "learning_rate": 2.9320082166686226e-06, "loss": 0.0549, "reward": 0.2291666716337204, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.2291666716337204, "rewards/format_reward": 0.0, "step": 306 }, { "completion_length": 1728.9166870117188, "epoch": 0.374390243902439, "grad_norm": 0.2384568452835083, "kl": 0.0556640625, "learning_rate": 2.9310546439282207e-06, "loss": 0.1255, "reward": 0.2916666716337204, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.2916666716337204, "rewards/format_reward": 0.0, "step": 307 }, { "completion_length": 1246.375, "epoch": 0.375609756097561, "grad_norm": 0.16777871549129486, "kl": 0.03662109375, "learning_rate": 2.9300945880823955e-06, "loss": 0.0754, "reward": 0.1875, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.1875, "rewards/format_reward": 0.0, "step": 308 }, { "completion_length": 1529.0000610351562, "epoch": 0.37682926829268293, "grad_norm": 0.1534949243068695, "kl": 0.04095458984375, "learning_rate": 2.9291280534804884e-06, "loss": 0.0743, "reward": 0.2083333358168602, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.2083333358168602, "rewards/format_reward": 0.0, "step": 309 }, { "completion_length": 1842.5208740234375, "epoch": 0.3780487804878049, "grad_norm": 0.22654692828655243, "kl": 0.061767578125, "learning_rate": 2.928155044501189e-06, "loss": 0.1141, "reward": 0.3333333432674408, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.3333333432674408, "rewards/format_reward": 0.0, "step": 310 }, { "completion_length": 1686.9791870117188, "epoch": 0.37926829268292683, "grad_norm": 0.21132716536521912, "kl": 0.048583984375, "learning_rate": 2.9271755655525186e-06, "loss": 0.1108, "reward": 0.1041666679084301, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.08333333395421505, "rewards/format_reward": 0.02083333395421505, "step": 311 }, { "completion_length": 1089.1666870117188, "epoch": 0.3804878048780488, "grad_norm": 0.2806246876716614, "kl": 0.0469970703125, "learning_rate": 2.9261896210718106e-06, "loss": 0.025, "reward": 0.1666666716337204, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.1666666716337204, "rewards/format_reward": 0.0, "step": 312 }, { "completion_length": 1979.8334350585938, "epoch": 0.38170731707317074, "grad_norm": 0.20625810325145721, "kl": 0.1033935546875, "learning_rate": 2.925197215525688e-06, "loss": 0.1587, "reward": 0.229166679084301, "reward_std": 0.18042195588350296, "rewards/accuracy_reward": 0.229166679084301, "rewards/format_reward": 0.0, "step": 313 }, { "completion_length": 1556.0, "epoch": 0.3829268292682927, "grad_norm": 0.20645713806152344, "kl": 0.0535888671875, "learning_rate": 2.924198353410044e-06, "loss": 0.0714, "reward": 0.3333333358168602, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.3333333358168602, "rewards/format_reward": 0.0, "step": 314 }, { "completion_length": 1445.4375610351562, "epoch": 0.38414634146341464, "grad_norm": 0.22600673139095306, "kl": 0.0462646484375, "learning_rate": 2.923193039250024e-06, "loss": 0.0665, "reward": 0.1875000111758709, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.1875000111758709, "rewards/format_reward": 0.0, "step": 315 }, { "completion_length": 1767.7500610351562, "epoch": 0.3853658536585366, "grad_norm": 0.27526500821113586, "kl": 0.0552978515625, "learning_rate": 2.9221812776000003e-06, "loss": 0.1939, "reward": 0.25, "reward_std": 0.21650634706020355, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.0, "step": 316 }, { "completion_length": 1354.0208740234375, "epoch": 0.38658536585365855, "grad_norm": 0.17785799503326416, "kl": 0.053466796875, "learning_rate": 2.9211630730435564e-06, "loss": 0.0422, "reward": 0.3125000149011612, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.3125000149011612, "rewards/format_reward": 0.0, "step": 317 }, { "completion_length": 1435.7916870117188, "epoch": 0.3878048780487805, "grad_norm": 0.3112121820449829, "kl": 0.052734375, "learning_rate": 2.9201384301934632e-06, "loss": 0.1855, "reward": 0.3125000074505806, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.3125000074505806, "rewards/format_reward": 0.0, "step": 318 }, { "completion_length": 1513.8541870117188, "epoch": 0.38902439024390245, "grad_norm": 0.13673458993434906, "kl": 0.0606689453125, "learning_rate": 2.91910735369166e-06, "loss": 0.0043, "reward": 0.1041666679084301, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.1041666679084301, "rewards/format_reward": 0.0, "step": 319 }, { "completion_length": 1117.3542175292969, "epoch": 0.3902439024390244, "grad_norm": 0.2540631890296936, "kl": 0.04443359375, "learning_rate": 2.9180698482092302e-06, "loss": 0.0845, "reward": 0.2500000074505806, "reward_std": 0.18042195588350296, "rewards/accuracy_reward": 0.2500000074505806, "rewards/format_reward": 0.0, "step": 320 }, { "completion_length": 1496.666748046875, "epoch": 0.39146341463414636, "grad_norm": 0.15195991098880768, "kl": 0.055419921875, "learning_rate": 2.917025918446385e-06, "loss": 0.0354, "reward": 0.1875, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.1875, "rewards/format_reward": 0.0, "step": 321 }, { "completion_length": 1144.4166870117188, "epoch": 0.3926829268292683, "grad_norm": 0.24764421582221985, "kl": 0.0406494140625, "learning_rate": 2.9159755691324377e-06, "loss": 0.0591, "reward": 0.4166666865348816, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.4166666865348816, "rewards/format_reward": 0.0, "step": 322 }, { "completion_length": 1238.0625, "epoch": 0.39390243902439026, "grad_norm": 0.302700012922287, "kl": 0.0504150390625, "learning_rate": 2.9149188050257847e-06, "loss": 0.1575, "reward": 0.3333333544433117, "reward_std": 0.21650633960962296, "rewards/accuracy_reward": 0.3333333544433117, "rewards/format_reward": 0.0, "step": 323 }, { "completion_length": 1862.3333740234375, "epoch": 0.3951219512195122, "grad_norm": 0.25302961468696594, "kl": 0.069091796875, "learning_rate": 2.913855630913884e-06, "loss": 0.1944, "reward": 0.25, "reward_std": 0.21650634706020355, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.0, "step": 324 }, { "completion_length": 1243.1875610351562, "epoch": 0.39634146341463417, "grad_norm": 0.256081223487854, "kl": 0.0521240234375, "learning_rate": 2.912786051613232e-06, "loss": 0.1478, "reward": 0.3125000149011612, "reward_std": 0.18042195588350296, "rewards/accuracy_reward": 0.3125000149011612, "rewards/format_reward": 0.0, "step": 325 }, { "completion_length": 1392.166748046875, "epoch": 0.3975609756097561, "grad_norm": 0.18309538066387177, "kl": 0.06884765625, "learning_rate": 2.911710071969342e-06, "loss": 0.054, "reward": 0.25, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.0, "step": 326 }, { "completion_length": 1130.1041870117188, "epoch": 0.39878048780487807, "grad_norm": 0.1589204967021942, "kl": 0.0377197265625, "learning_rate": 2.910627696856725e-06, "loss": 0.0589, "reward": 0.3958333432674408, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.3958333432674408, "rewards/format_reward": 0.0, "step": 327 }, { "completion_length": 1120.4583740234375, "epoch": 0.4, "grad_norm": 0.35976317524909973, "kl": 0.0538330078125, "learning_rate": 2.9095389311788626e-06, "loss": 0.1675, "reward": 0.25, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.0, "step": 328 }, { "completion_length": 1578.104248046875, "epoch": 0.401219512195122, "grad_norm": 0.20205625891685486, "kl": 0.076171875, "learning_rate": 2.9084437798681894e-06, "loss": 0.1792, "reward": 0.2291666716337204, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.2291666716337204, "rewards/format_reward": 0.0, "step": 329 }, { "completion_length": 1677.7291870117188, "epoch": 0.4024390243902439, "grad_norm": 0.15404236316680908, "kl": 0.073486328125, "learning_rate": 2.9073422478860678e-06, "loss": 0.006, "reward": 0.2083333358168602, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.2083333358168602, "rewards/format_reward": 0.0, "step": 330 }, { "completion_length": 1399.3333740234375, "epoch": 0.4036585365853659, "grad_norm": 0.2214285284280777, "kl": 0.0643310546875, "learning_rate": 2.906234340222768e-06, "loss": 0.03, "reward": 0.1875, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.1875, "rewards/format_reward": 0.0, "step": 331 }, { "completion_length": 1783.4583740234375, "epoch": 0.40487804878048783, "grad_norm": 0.14713653922080994, "kl": 0.065673828125, "learning_rate": 2.9051200618974418e-06, "loss": 0.0049, "reward": 0.0625, "reward_std": 0.0, "rewards/accuracy_reward": 0.0625, "rewards/format_reward": 0.0, "step": 332 }, { "completion_length": 1758.9791870117188, "epoch": 0.4060975609756098, "grad_norm": 0.16985398530960083, "kl": 0.07666015625, "learning_rate": 2.903999417958104e-06, "loss": 0.0649, "reward": 0.1666666716337204, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.1666666716337204, "rewards/format_reward": 0.0, "step": 333 }, { "completion_length": 1849.4584350585938, "epoch": 0.4073170731707317, "grad_norm": 0.12413925677537918, "kl": 0.073486328125, "learning_rate": 2.9028724134816064e-06, "loss": 0.0305, "reward": 0.2083333358168602, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.2083333358168602, "rewards/format_reward": 0.0, "step": 334 }, { "completion_length": 1856.2708740234375, "epoch": 0.40853658536585363, "grad_norm": 0.16786454617977142, "kl": 0.0709228515625, "learning_rate": 2.9017390535736164e-06, "loss": 0.0567, "reward": 0.1458333358168602, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.1458333358168602, "rewards/format_reward": 0.0, "step": 335 }, { "completion_length": 1565.4375, "epoch": 0.4097560975609756, "grad_norm": 0.10165859013795853, "kl": 0.056396484375, "learning_rate": 2.9005993433685932e-06, "loss": 0.0037, "reward": 0.1875, "reward_std": 0.0, "rewards/accuracy_reward": 0.1875, "rewards/format_reward": 0.0, "step": 336 }, { "completion_length": 1560.2083740234375, "epoch": 0.41097560975609754, "grad_norm": 0.14114977419376373, "kl": 0.0655517578125, "learning_rate": 2.899453288029765e-06, "loss": -0.007, "reward": 0.2708333432674408, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.2708333432674408, "rewards/format_reward": 0.0, "step": 337 }, { "completion_length": 1438.1458740234375, "epoch": 0.4121951219512195, "grad_norm": 0.305000901222229, "kl": 0.047119140625, "learning_rate": 2.8983008927491046e-06, "loss": 0.0077, "reward": 0.2291666716337204, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.2291666716337204, "rewards/format_reward": 0.0, "step": 338 }, { "completion_length": 1639.1458740234375, "epoch": 0.41341463414634144, "grad_norm": 0.1008121445775032, "kl": 0.046142578125, "learning_rate": 2.8971421627473075e-06, "loss": 0.0022, "reward": 0.1458333358168602, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.1458333358168602, "rewards/format_reward": 0.0, "step": 339 }, { "completion_length": 1843.5000610351562, "epoch": 0.4146341463414634, "grad_norm": 0.10296133160591125, "kl": 0.0548095703125, "learning_rate": 2.8959771032737673e-06, "loss": -0.0001, "reward": 0.1041666716337204, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.1041666716337204, "rewards/format_reward": 0.0, "step": 340 }, { "completion_length": 1764.666748046875, "epoch": 0.41585365853658535, "grad_norm": 0.22910647094249725, "kl": 0.0535888671875, "learning_rate": 2.8948057196065517e-06, "loss": 0.0919, "reward": 0.2291666679084301, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.2291666679084301, "rewards/format_reward": 0.0, "step": 341 }, { "completion_length": 1748.604248046875, "epoch": 0.4170731707317073, "grad_norm": 0.28481927514076233, "kl": 0.0460205078125, "learning_rate": 2.8936280170523784e-06, "loss": 0.1541, "reward": 0.3125000149011612, "reward_std": 0.25259073823690414, "rewards/accuracy_reward": 0.3125000149011612, "rewards/format_reward": 0.0, "step": 342 }, { "completion_length": 1324.6875, "epoch": 0.41829268292682925, "grad_norm": 0.16286993026733398, "kl": 0.03826904296875, "learning_rate": 2.892444000946593e-06, "loss": 0.0544, "reward": 0.2500000074505806, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.2500000074505806, "rewards/format_reward": 0.0, "step": 343 }, { "completion_length": 1230.6875610351562, "epoch": 0.4195121951219512, "grad_norm": 0.22794990241527557, "kl": 0.04248046875, "learning_rate": 2.8912536766531423e-06, "loss": 0.0567, "reward": 0.229166679084301, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.229166679084301, "rewards/format_reward": 0.0, "step": 344 }, { "completion_length": 1264.2708740234375, "epoch": 0.42073170731707316, "grad_norm": 0.35351109504699707, "kl": 0.0576171875, "learning_rate": 2.8900570495645504e-06, "loss": 0.001, "reward": 0.2291666716337204, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.2291666716337204, "rewards/format_reward": 0.0, "step": 345 }, { "completion_length": 1340.6666870117188, "epoch": 0.4219512195121951, "grad_norm": 0.33874788880348206, "kl": 0.0400390625, "learning_rate": 2.8888541251018963e-06, "loss": 0.0849, "reward": 0.1666666679084301, "reward_std": 0.21650634706020355, "rewards/accuracy_reward": 0.1666666679084301, "rewards/format_reward": 0.0, "step": 346 }, { "completion_length": 1409.6875, "epoch": 0.42317073170731706, "grad_norm": 0.24520732462406158, "kl": 0.0360107421875, "learning_rate": 2.887644908714788e-06, "loss": 0.0975, "reward": 0.08333333395421505, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.08333333395421505, "rewards/format_reward": 0.0, "step": 347 }, { "completion_length": 977.6041870117188, "epoch": 0.424390243902439, "grad_norm": 0.21606609225273132, "kl": 0.02960205078125, "learning_rate": 2.8864294058813364e-06, "loss": 0.0266, "reward": 0.2708333358168602, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.2708333358168602, "rewards/format_reward": 0.0, "step": 348 }, { "completion_length": 1056.2916870117188, "epoch": 0.42560975609756097, "grad_norm": 0.39258772134780884, "kl": 0.037353515625, "learning_rate": 2.8852076221081333e-06, "loss": 0.0413, "reward": 0.2083333358168602, "reward_std": 0.18042195588350296, "rewards/accuracy_reward": 0.2083333358168602, "rewards/format_reward": 0.0, "step": 349 }, { "completion_length": 1330.9375610351562, "epoch": 0.4268292682926829, "grad_norm": 0.21891246736049652, "kl": 0.0457763671875, "learning_rate": 2.883979562930225e-06, "loss": 0.074, "reward": 0.08333333395421505, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.08333333395421505, "rewards/format_reward": 0.0, "step": 350 }, { "completion_length": 1092.5000610351562, "epoch": 0.42804878048780487, "grad_norm": 0.14052334427833557, "kl": 0.0950927734375, "learning_rate": 2.8827452339110856e-06, "loss": 0.064, "reward": 0.2291666716337204, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.2291666716337204, "rewards/format_reward": 0.0, "step": 351 }, { "completion_length": 1292.3333740234375, "epoch": 0.4292682926829268, "grad_norm": 0.14718182384967804, "kl": 0.036376953125, "learning_rate": 2.8815046406425954e-06, "loss": 0.0066, "reward": 0.2291666716337204, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.2291666716337204, "rewards/format_reward": 0.0, "step": 352 }, { "completion_length": 1312.2500610351562, "epoch": 0.4304878048780488, "grad_norm": 0.28290706872940063, "kl": 0.03179931640625, "learning_rate": 2.8802577887450124e-06, "loss": -0.0227, "reward": 0.25, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.0, "step": 353 }, { "completion_length": 1590.8750610351562, "epoch": 0.4317073170731707, "grad_norm": 0.19730785489082336, "kl": 0.0416259765625, "learning_rate": 2.8790046838669493e-06, "loss": 0.0633, "reward": 0.1875000074505806, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.1875000074505806, "rewards/format_reward": 0.0, "step": 354 }, { "completion_length": 1340.5833740234375, "epoch": 0.4329268292682927, "grad_norm": 0.32272911071777344, "kl": 0.037109375, "learning_rate": 2.877745331685345e-06, "loss": -0.0225, "reward": 0.1041666679084301, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.1041666679084301, "rewards/format_reward": 0.0, "step": 355 }, { "completion_length": 1146.9792175292969, "epoch": 0.43414634146341463, "grad_norm": 0.26675140857696533, "kl": 0.032470703125, "learning_rate": 2.876479737905442e-06, "loss": 0.1268, "reward": 0.1458333395421505, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.1458333395421505, "rewards/format_reward": 0.0, "step": 356 }, { "completion_length": 1439.5000610351562, "epoch": 0.4353658536585366, "grad_norm": 0.17541539669036865, "kl": 0.0404052734375, "learning_rate": 2.875207908260758e-06, "loss": 0.0494, "reward": 0.2291666716337204, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.2291666716337204, "rewards/format_reward": 0.0, "step": 357 }, { "completion_length": 1329.7500610351562, "epoch": 0.43658536585365854, "grad_norm": 0.21956294775009155, "kl": 0.0382080078125, "learning_rate": 2.8739298485130627e-06, "loss": 0.023, "reward": 0.2916666716337204, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.2916666716337204, "rewards/format_reward": 0.0, "step": 358 }, { "completion_length": 1404.875, "epoch": 0.4378048780487805, "grad_norm": 0.199632927775383, "kl": 0.038330078125, "learning_rate": 2.8726455644523473e-06, "loss": 0.0293, "reward": 0.2708333432674408, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.2708333432674408, "rewards/format_reward": 0.0, "step": 359 }, { "completion_length": 1618.6458740234375, "epoch": 0.43902439024390244, "grad_norm": 0.21699486672878265, "kl": 0.0408935546875, "learning_rate": 2.8713550618968034e-06, "loss": 0.0427, "reward": 0.2291666716337204, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.2291666716337204, "rewards/format_reward": 0.0, "step": 360 }, { "completion_length": 1790.604248046875, "epoch": 0.4402439024390244, "grad_norm": 0.13285081088542938, "kl": 0.04541015625, "learning_rate": 2.8700583466927935e-06, "loss": 0.0552, "reward": 0.1458333395421505, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.1458333395421505, "rewards/format_reward": 0.0, "step": 361 }, { "completion_length": 1869.2708740234375, "epoch": 0.44146341463414634, "grad_norm": 0.09731125086545944, "kl": 0.0435791015625, "learning_rate": 2.8687554247148247e-06, "loss": 0.0048, "reward": 0.1041666716337204, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.1041666716337204, "rewards/format_reward": 0.0, "step": 362 }, { "completion_length": 2136.604248046875, "epoch": 0.4426829268292683, "grad_norm": 0.1871100813150406, "kl": 0.056640625, "learning_rate": 2.8674463018655245e-06, "loss": 0.1163, "reward": 0.1666666679084301, "reward_std": 0.18042195588350296, "rewards/accuracy_reward": 0.1666666679084301, "rewards/format_reward": 0.0, "step": 363 }, { "completion_length": 1892.9375610351562, "epoch": 0.44390243902439025, "grad_norm": 0.22789418697357178, "kl": 0.0433349609375, "learning_rate": 2.8661309840756093e-06, "loss": 0.0037, "reward": 0.2708333432674408, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.2708333432674408, "rewards/format_reward": 0.0, "step": 364 }, { "completion_length": 1789.479248046875, "epoch": 0.4451219512195122, "grad_norm": 0.29830899834632874, "kl": 0.0538330078125, "learning_rate": 2.8648094773038625e-06, "loss": 0.0351, "reward": 0.3125000149011612, "reward_std": 0.21650636196136475, "rewards/accuracy_reward": 0.3125000149011612, "rewards/format_reward": 0.0, "step": 365 }, { "completion_length": 2077.8959350585938, "epoch": 0.44634146341463415, "grad_norm": 0.19889052212238312, "kl": 0.04345703125, "learning_rate": 2.863481787537105e-06, "loss": 0.0693, "reward": 0.2291666716337204, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.2291666716337204, "rewards/format_reward": 0.0, "step": 366 }, { "completion_length": 1562.2291870117188, "epoch": 0.4475609756097561, "grad_norm": 0.1966758519411087, "kl": 0.04443359375, "learning_rate": 2.8621479207901685e-06, "loss": 0.0997, "reward": 0.1666666716337204, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.1666666716337204, "rewards/format_reward": 0.0, "step": 367 }, { "completion_length": 1414.2083740234375, "epoch": 0.44878048780487806, "grad_norm": 0.22300279140472412, "kl": 0.039794921875, "learning_rate": 2.8608078831058682e-06, "loss": 0.0397, "reward": 0.3125000149011612, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.3125000149011612, "rewards/format_reward": 0.0, "step": 368 }, { "completion_length": 1160.0416870117188, "epoch": 0.45, "grad_norm": 0.30040064454078674, "kl": 0.0352783203125, "learning_rate": 2.859461680554975e-06, "loss": -0.052, "reward": 0.2083333432674408, "reward_std": 0.21650634706020355, "rewards/accuracy_reward": 0.2083333432674408, "rewards/format_reward": 0.0, "step": 369 }, { "completion_length": 1438.6041870117188, "epoch": 0.45121951219512196, "grad_norm": 0.1837051510810852, "kl": 0.0372314453125, "learning_rate": 2.8581093192361895e-06, "loss": 0.103, "reward": 0.2916666716337204, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.2916666716337204, "rewards/format_reward": 0.0, "step": 370 }, { "completion_length": 1300.8125610351562, "epoch": 0.4524390243902439, "grad_norm": 1.8417969942092896, "kl": 0.0439453125, "learning_rate": 2.8567508052761125e-06, "loss": 0.0164, "reward": 0.3541666716337204, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.3541666716337204, "rewards/format_reward": 0.0, "step": 371 }, { "completion_length": 1123.4167175292969, "epoch": 0.45365853658536587, "grad_norm": 0.31571900844573975, "kl": 0.0333251953125, "learning_rate": 2.8553861448292185e-06, "loss": 0.0374, "reward": 0.2083333358168602, "reward_std": 0.18042195588350296, "rewards/accuracy_reward": 0.2083333358168602, "rewards/format_reward": 0.0, "step": 372 }, { "completion_length": 881.125, "epoch": 0.4548780487804878, "grad_norm": 0.37031301856040955, "kl": 0.0357666015625, "learning_rate": 2.854015344077828e-06, "loss": 0.0171, "reward": 0.3125000149011612, "reward_std": 0.21650634706020355, "rewards/accuracy_reward": 0.3125000149011612, "rewards/format_reward": 0.0, "step": 373 }, { "completion_length": 950.9167175292969, "epoch": 0.4560975609756098, "grad_norm": 0.30248159170150757, "kl": 0.0218505859375, "learning_rate": 2.852638409232077e-06, "loss": -0.0, "reward": 0.2916666865348816, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.2916666865348816, "rewards/format_reward": 0.0, "step": 374 }, { "completion_length": 1096.2291870117188, "epoch": 0.4573170731707317, "grad_norm": 0.5415692925453186, "kl": 0.04180908203125, "learning_rate": 2.8512553465298938e-06, "loss": 0.113, "reward": 0.25, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.0, "step": 375 }, { "completion_length": 999.3750305175781, "epoch": 0.4585365853658537, "grad_norm": 0.2519807517528534, "kl": 0.0286865234375, "learning_rate": 2.8498661622369637e-06, "loss": 0.0684, "reward": 0.229166679084301, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.229166679084301, "rewards/format_reward": 0.0, "step": 376 }, { "completion_length": 949.0833435058594, "epoch": 0.45975609756097563, "grad_norm": 0.43923333287239075, "kl": 0.02801513671875, "learning_rate": 2.848470862646709e-06, "loss": 0.0467, "reward": 0.4166666716337204, "reward_std": 0.28867512941360474, "rewards/accuracy_reward": 0.4166666716337204, "rewards/format_reward": 0.0, "step": 377 }, { "completion_length": 1178.0, "epoch": 0.4609756097560976, "grad_norm": 0.5409067273139954, "kl": 0.035400390625, "learning_rate": 2.8470694540802527e-06, "loss": 0.0799, "reward": 0.1875000074505806, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.1875000074505806, "rewards/format_reward": 0.0, "step": 378 }, { "completion_length": 1207.9375, "epoch": 0.46219512195121953, "grad_norm": 0.297665536403656, "kl": 0.034912109375, "learning_rate": 2.8456619428863958e-06, "loss": 0.0713, "reward": 0.2500000149011612, "reward_std": 0.14433757215738297, "rewards/accuracy_reward": 0.2500000149011612, "rewards/format_reward": 0.0, "step": 379 }, { "completion_length": 1274.7708740234375, "epoch": 0.4634146341463415, "grad_norm": 0.2379186749458313, "kl": 0.029052734375, "learning_rate": 2.8442483354415836e-06, "loss": 0.0696, "reward": 0.125, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "step": 380 }, { "completion_length": 1116.0208435058594, "epoch": 0.46463414634146344, "grad_norm": 0.2444387823343277, "kl": 0.03662109375, "learning_rate": 2.842828638149881e-06, "loss": 0.011, "reward": 0.2500000074505806, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.2500000074505806, "rewards/format_reward": 0.0, "step": 381 }, { "completion_length": 1328.4583740234375, "epoch": 0.4658536585365854, "grad_norm": 0.29103797674179077, "kl": 0.031494140625, "learning_rate": 2.841402857442942e-06, "loss": 0.0964, "reward": 0.16666667722165585, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.16666667722165585, "rewards/format_reward": 0.0, "step": 382 }, { "completion_length": 1327.0416870117188, "epoch": 0.46707317073170734, "grad_norm": 0.27820536494255066, "kl": 0.0511474609375, "learning_rate": 2.839970999779978e-06, "loss": 0.0601, "reward": 0.2708333432674408, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.2708333432674408, "rewards/format_reward": 0.0, "step": 383 }, { "completion_length": 1154.729248046875, "epoch": 0.4682926829268293, "grad_norm": 0.41225165128707886, "kl": 0.031982421875, "learning_rate": 2.8385330716477335e-06, "loss": 0.1585, "reward": 0.3125, "reward_std": 0.28867512196302414, "rewards/accuracy_reward": 0.3125, "rewards/format_reward": 0.0, "step": 384 }, { "completion_length": 1210.2916870117188, "epoch": 0.4695121951219512, "grad_norm": 0.2080121785402298, "kl": 0.05517578125, "learning_rate": 2.8370890795604523e-06, "loss": 0.0758, "reward": 0.2291666716337204, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.2291666716337204, "rewards/format_reward": 0.0, "step": 385 }, { "completion_length": 1365.0000610351562, "epoch": 0.47073170731707314, "grad_norm": 0.22689980268478394, "kl": 0.0433349609375, "learning_rate": 2.835639030059851e-06, "loss": 0.095, "reward": 0.2083333432674408, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.2083333432674408, "rewards/format_reward": 0.0, "step": 386 }, { "completion_length": 1188.2292175292969, "epoch": 0.4719512195121951, "grad_norm": 0.19032619893550873, "kl": 0.04095458984375, "learning_rate": 2.834182929715087e-06, "loss": 0.0076, "reward": 0.3125000074505806, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.3125000074505806, "rewards/format_reward": 0.0, "step": 387 }, { "completion_length": 1653.6458740234375, "epoch": 0.47317073170731705, "grad_norm": 0.31270819902420044, "kl": 0.057373046875, "learning_rate": 2.8327207851227295e-06, "loss": 0.1424, "reward": 0.3125000074505806, "reward_std": 0.18042195588350296, "rewards/accuracy_reward": 0.3125000074505806, "rewards/format_reward": 0.0, "step": 388 }, { "completion_length": 1491.3333740234375, "epoch": 0.474390243902439, "grad_norm": 0.210535928606987, "kl": 0.052001953125, "learning_rate": 2.831252602906732e-06, "loss": 0.1105, "reward": 0.3750000149011612, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.3750000149011612, "rewards/format_reward": 0.0, "step": 389 }, { "completion_length": 1337.6875, "epoch": 0.47560975609756095, "grad_norm": 0.23256336152553558, "kl": 0.0504150390625, "learning_rate": 2.829778389718398e-06, "loss": 0.1667, "reward": 0.3125, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.3125, "rewards/format_reward": 0.0, "step": 390 }, { "completion_length": 1153.0833740234375, "epoch": 0.4768292682926829, "grad_norm": 0.27207329869270325, "kl": 0.0440673828125, "learning_rate": 2.828298152236354e-06, "loss": 0.1266, "reward": 0.2916666716337204, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.2916666716337204, "rewards/format_reward": 0.0, "step": 391 }, { "completion_length": 1194.0, "epoch": 0.47804878048780486, "grad_norm": 0.34358105063438416, "kl": 0.047607421875, "learning_rate": 2.826811897166519e-06, "loss": 0.1167, "reward": 0.2916666716337204, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.2916666716337204, "rewards/format_reward": 0.0, "step": 392 }, { "completion_length": 1558.2083740234375, "epoch": 0.4792682926829268, "grad_norm": 0.28318658471107483, "kl": 0.080078125, "learning_rate": 2.8253196312420727e-06, "loss": 0.1105, "reward": 0.1041666716337204, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.1041666716337204, "rewards/format_reward": 0.0, "step": 393 }, { "completion_length": 1704.354248046875, "epoch": 0.48048780487804876, "grad_norm": 0.18758158385753632, "kl": 0.169921875, "learning_rate": 2.8238213612234255e-06, "loss": 0.1243, "reward": 0.1458333395421505, "reward_std": 0.21650634706020355, "rewards/accuracy_reward": 0.1458333395421505, "rewards/format_reward": 0.0, "step": 394 }, { "completion_length": 1349.7916870117188, "epoch": 0.4817073170731707, "grad_norm": 0.3759765923023224, "kl": 0.064697265625, "learning_rate": 2.822317093898189e-06, "loss": 0.1108, "reward": 0.2708333358168602, "reward_std": 0.18042195588350296, "rewards/accuracy_reward": 0.2708333358168602, "rewards/format_reward": 0.0, "step": 395 }, { "completion_length": 1822.9375610351562, "epoch": 0.48292682926829267, "grad_norm": 0.11863032728433609, "kl": 0.0777587890625, "learning_rate": 2.8208068360811445e-06, "loss": 0.033, "reward": 0.02083333395421505, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.02083333395421505, "rewards/format_reward": 0.0, "step": 396 }, { "completion_length": 1555.1250610351562, "epoch": 0.4841463414634146, "grad_norm": 0.30500730872154236, "kl": 0.079833984375, "learning_rate": 2.8192905946142097e-06, "loss": 0.1257, "reward": 0.2083333432674408, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.2083333432674408, "rewards/format_reward": 0.0, "step": 397 }, { "completion_length": 1335.416748046875, "epoch": 0.4853658536585366, "grad_norm": 0.25227734446525574, "kl": 0.068603515625, "learning_rate": 2.8177683763664137e-06, "loss": 0.0124, "reward": 0.14583333395421505, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.14583333395421505, "rewards/format_reward": 0.0, "step": 398 }, { "completion_length": 1354.541748046875, "epoch": 0.4865853658536585, "grad_norm": 0.18838754296302795, "kl": 0.0751953125, "learning_rate": 2.816240188233859e-06, "loss": 0.143, "reward": 0.1666666716337204, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.1666666716337204, "rewards/format_reward": 0.0, "step": 399 }, { "completion_length": 1734.7500610351562, "epoch": 0.4878048780487805, "grad_norm": 0.1608065962791443, "kl": 0.1005859375, "learning_rate": 2.8147060371396953e-06, "loss": 0.0471, "reward": 0.1875, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.1875, "rewards/format_reward": 0.0, "step": 400 }, { "completion_length": 1418.9375, "epoch": 0.48902439024390243, "grad_norm": 0.24483974277973175, "kl": 0.0677490234375, "learning_rate": 2.813165930034086e-06, "loss": 0.0526, "reward": 0.2500000149011612, "reward_std": 0.18042195588350296, "rewards/accuracy_reward": 0.2500000149011612, "rewards/format_reward": 0.0, "step": 401 }, { "completion_length": 1527.2500610351562, "epoch": 0.4902439024390244, "grad_norm": 0.34487003087997437, "kl": 0.0703125, "learning_rate": 2.8116198738941766e-06, "loss": 0.038, "reward": 0.18750000558793545, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.18750000558793545, "rewards/format_reward": 0.0, "step": 402 }, { "completion_length": 1277.7708740234375, "epoch": 0.49146341463414633, "grad_norm": 0.17776264250278473, "kl": 0.0684814453125, "learning_rate": 2.8100678757240637e-06, "loss": 0.059, "reward": 0.1875, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.1875, "rewards/format_reward": 0.0, "step": 403 }, { "completion_length": 1158.5, "epoch": 0.4926829268292683, "grad_norm": 0.2504693567752838, "kl": 0.0615234375, "learning_rate": 2.8085099425547627e-06, "loss": 0.0553, "reward": 0.2708333358168602, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.2708333358168602, "rewards/format_reward": 0.0, "step": 404 }, { "completion_length": 1852.1875, "epoch": 0.49390243902439024, "grad_norm": 0.18786410987377167, "kl": 0.07373046875, "learning_rate": 2.8069460814441764e-06, "loss": 0.1003, "reward": 0.1458333358168602, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.1458333358168602, "rewards/format_reward": 0.0, "step": 405 }, { "completion_length": 1766.729248046875, "epoch": 0.4951219512195122, "grad_norm": 0.11878161132335663, "kl": 0.0703125, "learning_rate": 2.8053762994770646e-06, "loss": 0.0532, "reward": 0.1666666716337204, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.1666666716337204, "rewards/format_reward": 0.0, "step": 406 }, { "completion_length": 1437.8333740234375, "epoch": 0.49634146341463414, "grad_norm": 0.18048304319381714, "kl": 0.0609130859375, "learning_rate": 2.803800603765008e-06, "loss": 0.1224, "reward": 0.1666666716337204, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.1666666716337204, "rewards/format_reward": 0.0, "step": 407 }, { "completion_length": 1439.2916870117188, "epoch": 0.4975609756097561, "grad_norm": 0.22953608632087708, "kl": 0.05682373046875, "learning_rate": 2.8022190014463794e-06, "loss": 0.0213, "reward": 0.16666667722165585, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.16666667722165585, "rewards/format_reward": 0.0, "step": 408 }, { "completion_length": 1297.8958740234375, "epoch": 0.49878048780487805, "grad_norm": 0.24577409029006958, "kl": 0.04931640625, "learning_rate": 2.80063149968631e-06, "loss": 0.0559, "reward": 0.3541666716337204, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.3541666716337204, "rewards/format_reward": 0.0, "step": 409 }, { "completion_length": 1632.229248046875, "epoch": 0.5, "grad_norm": 0.18608181178569794, "kl": 0.07568359375, "learning_rate": 2.7990381056766585e-06, "loss": 0.0622, "reward": 0.1666666716337204, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.1666666716337204, "rewards/format_reward": 0.0, "step": 410 }, { "completion_length": 1753.479248046875, "epoch": 0.501219512195122, "grad_norm": 0.18717730045318604, "kl": 0.071533203125, "learning_rate": 2.7974388266359745e-06, "loss": 0.0037, "reward": 0.1875, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.1875, "rewards/format_reward": 0.0, "step": 411 }, { "completion_length": 1317.9166870117188, "epoch": 0.5024390243902439, "grad_norm": 0.22639930248260498, "kl": 0.0557861328125, "learning_rate": 2.795833669809471e-06, "loss": 0.0207, "reward": 0.1458333358168602, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.1458333358168602, "rewards/format_reward": 0.0, "step": 412 }, { "completion_length": 1190.0625, "epoch": 0.5036585365853659, "grad_norm": 0.12377037107944489, "kl": 0.04931640625, "learning_rate": 2.794222642468989e-06, "loss": 0.0313, "reward": 0.2916666716337204, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.2916666716337204, "rewards/format_reward": 0.0, "step": 413 }, { "completion_length": 1517.5208740234375, "epoch": 0.5048780487804878, "grad_norm": 0.19605574011802673, "kl": 0.0616455078125, "learning_rate": 2.7926057519129634e-06, "loss": 0.0668, "reward": 0.1666666716337204, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.1666666716337204, "rewards/format_reward": 0.0, "step": 414 }, { "completion_length": 1734.1875, "epoch": 0.5060975609756098, "grad_norm": 0.1061120554804802, "kl": 0.0706787109375, "learning_rate": 2.790983005466392e-06, "loss": 0.0924, "reward": 0.1666666679084301, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.1666666679084301, "rewards/format_reward": 0.0, "step": 415 }, { "completion_length": 1568.8333740234375, "epoch": 0.5073170731707317, "grad_norm": 0.24775585532188416, "kl": 0.0552978515625, "learning_rate": 2.7893544104808017e-06, "loss": 0.093, "reward": 0.14583333395421505, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.14583333395421505, "rewards/format_reward": 0.0, "step": 416 }, { "completion_length": 1270.2500610351562, "epoch": 0.5085365853658537, "grad_norm": 0.13752508163452148, "kl": 0.043212890625, "learning_rate": 2.7877199743342145e-06, "loss": 0.0847, "reward": 0.1666666716337204, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.1666666716337204, "rewards/format_reward": 0.0, "step": 417 }, { "completion_length": 1386.104248046875, "epoch": 0.5097560975609756, "grad_norm": 0.06608755886554718, "kl": 0.0482177734375, "learning_rate": 2.7860797044311143e-06, "loss": 0.0028, "reward": 0.1875, "reward_std": 0.0, "rewards/accuracy_reward": 0.1875, "rewards/format_reward": 0.0, "step": 418 }, { "completion_length": 1179.2708740234375, "epoch": 0.5109756097560976, "grad_norm": 0.12374904006719589, "kl": 0.0428466796875, "learning_rate": 2.784433608202415e-06, "loss": 0.0386, "reward": 0.1458333358168602, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.1458333358168602, "rewards/format_reward": 0.0, "step": 419 }, { "completion_length": 1443.0208740234375, "epoch": 0.5121951219512195, "grad_norm": 5.924796104431152, "kl": 0.089111328125, "learning_rate": 2.7827816931054245e-06, "loss": 0.0041, "reward": 0.0625, "reward_std": 0.0, "rewards/accuracy_reward": 0.0625, "rewards/format_reward": 0.0, "step": 420 }, { "completion_length": 1319.9375610351562, "epoch": 0.5134146341463415, "grad_norm": 0.2563548982143402, "kl": 0.050537109375, "learning_rate": 2.7811239666238117e-06, "loss": 0.0337, "reward": 0.0625, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.0625, "rewards/format_reward": 0.0, "step": 421 }, { "completion_length": 1454.5208740234375, "epoch": 0.5146341463414634, "grad_norm": 0.18849150836467743, "kl": 0.04833984375, "learning_rate": 2.7794604362675733e-06, "loss": 0.1014, "reward": 0.08333333395421505, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.08333333395421505, "rewards/format_reward": 0.0, "step": 422 }, { "completion_length": 1556.5000610351562, "epoch": 0.5158536585365854, "grad_norm": 0.4775267541408539, "kl": 0.039306640625, "learning_rate": 2.777791109573e-06, "loss": 0.0488, "reward": 0.229166679084301, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.229166679084301, "rewards/format_reward": 0.0, "step": 423 }, { "completion_length": 1247.75, "epoch": 0.5170731707317073, "grad_norm": 0.2322523295879364, "kl": 0.037353515625, "learning_rate": 2.7761159941026403e-06, "loss": 0.0786, "reward": 0.1875000111758709, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.1875000111758709, "rewards/format_reward": 0.0, "step": 424 }, { "completion_length": 1167.5833740234375, "epoch": 0.5182926829268293, "grad_norm": 0.2629512846469879, "kl": 0.0377197265625, "learning_rate": 2.7744350974452685e-06, "loss": 0.1342, "reward": 0.395833358168602, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.395833358168602, "rewards/format_reward": 0.0, "step": 425 }, { "completion_length": 1291.9583740234375, "epoch": 0.5195121951219512, "grad_norm": 0.2063109129667282, "kl": 0.060302734375, "learning_rate": 2.772748427215848e-06, "loss": 0.0577, "reward": 0.06250000186264515, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.06250000186264515, "rewards/format_reward": 0.0, "step": 426 }, { "completion_length": 1211.0833740234375, "epoch": 0.5207317073170732, "grad_norm": 0.2745192050933838, "kl": 0.0472412109375, "learning_rate": 2.7710559910555e-06, "loss": 0.0652, "reward": 0.1458333358168602, "reward_std": 0.18042195588350296, "rewards/accuracy_reward": 0.1458333358168602, "rewards/format_reward": 0.0, "step": 427 }, { "completion_length": 1026.8125, "epoch": 0.5219512195121951, "grad_norm": 0.30060166120529175, "kl": 0.0367431640625, "learning_rate": 2.7693577966314664e-06, "loss": 0.1704, "reward": 0.3541666716337204, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.3541666716337204, "rewards/format_reward": 0.0, "step": 428 }, { "completion_length": 1103.0000610351562, "epoch": 0.5231707317073171, "grad_norm": 0.18619495630264282, "kl": 0.0426025390625, "learning_rate": 2.7676538516370753e-06, "loss": 0.0261, "reward": 0.1875, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.1875, "rewards/format_reward": 0.0, "step": 429 }, { "completion_length": 935.8958740234375, "epoch": 0.524390243902439, "grad_norm": 0.20496368408203125, "kl": 0.0374755859375, "learning_rate": 2.7659441637917076e-06, "loss": 0.0691, "reward": 0.2083333395421505, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.2083333395421505, "rewards/format_reward": 0.0, "step": 430 }, { "completion_length": 857.6875305175781, "epoch": 0.525609756097561, "grad_norm": 0.3608001470565796, "kl": 0.0460205078125, "learning_rate": 2.76422874084076e-06, "loss": 0.0025, "reward": 0.1666666679084301, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.1666666679084301, "rewards/format_reward": 0.0, "step": 431 }, { "completion_length": 946.4792175292969, "epoch": 0.526829268292683, "grad_norm": 0.23060624301433563, "kl": 0.031005859375, "learning_rate": 2.7625075905556117e-06, "loss": 0.0419, "reward": 0.3541666865348816, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.3541666865348816, "rewards/format_reward": 0.0, "step": 432 }, { "completion_length": 1289.4375610351562, "epoch": 0.5280487804878049, "grad_norm": 0.2740060091018677, "kl": 0.05224609375, "learning_rate": 2.760780720733588e-06, "loss": 0.0771, "reward": 0.2708333358168602, "reward_std": 0.18042195588350296, "rewards/accuracy_reward": 0.2708333358168602, "rewards/format_reward": 0.0, "step": 433 }, { "completion_length": 984.8125, "epoch": 0.5292682926829269, "grad_norm": 0.1269846111536026, "kl": 0.04150390625, "learning_rate": 2.7590481391979253e-06, "loss": 0.0031, "reward": 0.2083333432674408, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.2083333432674408, "rewards/format_reward": 0.0, "step": 434 }, { "completion_length": 971.5208740234375, "epoch": 0.5304878048780488, "grad_norm": 0.26722022891044617, "kl": 0.0374755859375, "learning_rate": 2.757309853797736e-06, "loss": -0.0046, "reward": 0.2291666716337204, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.2291666716337204, "rewards/format_reward": 0.0, "step": 435 }, { "completion_length": 1290.1666870117188, "epoch": 0.5317073170731708, "grad_norm": 0.1854739636182785, "kl": 0.0390625, "learning_rate": 2.755565872407973e-06, "loss": 0.0296, "reward": 0.1666666716337204, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.1666666716337204, "rewards/format_reward": 0.0, "step": 436 }, { "completion_length": 867.9166870117188, "epoch": 0.5329268292682927, "grad_norm": 0.3844335973262787, "kl": 0.041259765625, "learning_rate": 2.7538162029293933e-06, "loss": 0.139, "reward": 0.3333333432674408, "reward_std": 0.21650634706020355, "rewards/accuracy_reward": 0.3333333432674408, "rewards/format_reward": 0.0, "step": 437 }, { "completion_length": 674.8750305175781, "epoch": 0.5341463414634147, "grad_norm": 0.30910828709602356, "kl": 0.0367431640625, "learning_rate": 2.7520608532885228e-06, "loss": 0.0333, "reward": 0.2083333358168602, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.2083333358168602, "rewards/format_reward": 0.0, "step": 438 }, { "completion_length": 785.8125305175781, "epoch": 0.5353658536585366, "grad_norm": 0.044259827584028244, "kl": 0.043212890625, "learning_rate": 2.75029983143762e-06, "loss": 0.0019, "reward": 0.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.0, "step": 439 }, { "completion_length": 993.2708435058594, "epoch": 0.5365853658536586, "grad_norm": 0.1698787659406662, "kl": 0.048583984375, "learning_rate": 2.7485331453546407e-06, "loss": 0.0429, "reward": 0.2083333358168602, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.2083333358168602, "rewards/format_reward": 0.0, "step": 440 }, { "completion_length": 1106.2291870117188, "epoch": 0.5378048780487805, "grad_norm": 0.29578039050102234, "kl": 0.046142578125, "learning_rate": 2.7467608030432016e-06, "loss": 0.0045, "reward": 0.3333333358168602, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.3333333358168602, "rewards/format_reward": 0.0, "step": 441 }, { "completion_length": 899.1458740234375, "epoch": 0.5390243902439025, "grad_norm": 0.3156502842903137, "kl": 0.042724609375, "learning_rate": 2.744982812532542e-06, "loss": 0.0678, "reward": 0.3333333432674408, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.3333333432674408, "rewards/format_reward": 0.0, "step": 442 }, { "completion_length": 1144.8750305175781, "epoch": 0.5402439024390244, "grad_norm": 0.13950711488723755, "kl": 0.063232421875, "learning_rate": 2.743199181877492e-06, "loss": 0.073, "reward": 0.2916666865348816, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.2916666865348816, "rewards/format_reward": 0.0, "step": 443 }, { "completion_length": 1378.5416870117188, "epoch": 0.5414634146341464, "grad_norm": 0.2245616763830185, "kl": 0.0677490234375, "learning_rate": 2.7414099191584305e-06, "loss": 0.0981, "reward": 0.3125000149011612, "reward_std": 0.18042195588350296, "rewards/accuracy_reward": 0.2916666865348816, "rewards/format_reward": 0.02083333395421505, "step": 444 }, { "completion_length": 1002.4375610351562, "epoch": 0.5426829268292683, "grad_norm": 1.132480263710022, "kl": 0.0614013671875, "learning_rate": 2.739615032481253e-06, "loss": 0.0408, "reward": 0.1666666716337204, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.1666666716337204, "rewards/format_reward": 0.0, "step": 445 }, { "completion_length": 1103.1250610351562, "epoch": 0.5439024390243903, "grad_norm": 0.2091972976922989, "kl": 0.0439453125, "learning_rate": 2.7378145299773337e-06, "loss": 0.0327, "reward": 0.0416666679084301, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.0416666679084301, "rewards/format_reward": 0.0, "step": 446 }, { "completion_length": 1195.7708740234375, "epoch": 0.5451219512195122, "grad_norm": 0.21537700295448303, "kl": 0.05908203125, "learning_rate": 2.7360084198034864e-06, "loss": 0.0149, "reward": 0.4375000149011612, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.4375000149011612, "rewards/format_reward": 0.0, "step": 447 }, { "completion_length": 1282.7708740234375, "epoch": 0.5463414634146342, "grad_norm": 0.09660086035728455, "kl": 0.0556640625, "learning_rate": 2.7341967101419303e-06, "loss": 0.0292, "reward": 0.2083333358168602, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.2083333358168602, "rewards/format_reward": 0.0, "step": 448 }, { "completion_length": 1311.0208740234375, "epoch": 0.5475609756097561, "grad_norm": 0.2985081374645233, "kl": 0.077392578125, "learning_rate": 2.7323794092002518e-06, "loss": 0.1144, "reward": 0.4166666865348816, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.4166666865348816, "rewards/format_reward": 0.0, "step": 449 }, { "completion_length": 1565.8125, "epoch": 0.5487804878048781, "grad_norm": 0.16280443966388702, "kl": 0.0771484375, "learning_rate": 2.730556525211368e-06, "loss": -0.0171, "reward": 0.0416666679084301, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.0416666679084301, "rewards/format_reward": 0.0, "step": 450 }, { "completion_length": 1437.666748046875, "epoch": 0.55, "grad_norm": 0.13337887823581696, "kl": 0.0634765625, "learning_rate": 2.728728066433488e-06, "loss": 0.0518, "reward": 0.1666666716337204, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.1666666716337204, "rewards/format_reward": 0.0, "step": 451 }, { "completion_length": 1637.9583740234375, "epoch": 0.551219512195122, "grad_norm": 0.7839400172233582, "kl": 0.1005859375, "learning_rate": 2.726894041150077e-06, "loss": 0.0629, "reward": 0.1666666716337204, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.1666666716337204, "rewards/format_reward": 0.0, "step": 452 }, { "completion_length": 1816.1250610351562, "epoch": 0.552439024390244, "grad_norm": 0.19779804348945618, "kl": 0.0736083984375, "learning_rate": 2.7250544576698174e-06, "loss": 0.0425, "reward": 0.06250000186264515, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.0416666679084301, "rewards/format_reward": 0.02083333395421505, "step": 453 }, { "completion_length": 1698.4375, "epoch": 0.5536585365853659, "grad_norm": 0.11766696721315384, "kl": 0.070556640625, "learning_rate": 2.7232093243265727e-06, "loss": 0.0805, "reward": 0.2916666865348816, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.2916666865348816, "rewards/format_reward": 0.0, "step": 454 }, { "completion_length": 1189.8750610351562, "epoch": 0.5548780487804879, "grad_norm": 0.1796276569366455, "kl": 0.0560302734375, "learning_rate": 2.7213586494793492e-06, "loss": 0.0659, "reward": 0.08333333395421505, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.08333333395421505, "rewards/format_reward": 0.0, "step": 455 }, { "completion_length": 1544.1250610351562, "epoch": 0.5560975609756098, "grad_norm": 0.22805029153823853, "kl": 0.07421875, "learning_rate": 2.7195024415122565e-06, "loss": 0.153, "reward": 0.1041666679084301, "reward_std": 0.18042195588350296, "rewards/accuracy_reward": 0.0833333358168602, "rewards/format_reward": 0.02083333395421505, "step": 456 }, { "completion_length": 1442.604248046875, "epoch": 0.5573170731707318, "grad_norm": 0.28826913237571716, "kl": 0.0616455078125, "learning_rate": 2.7176407088344726e-06, "loss": 0.1045, "reward": 0.25, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.2291666716337204, "rewards/format_reward": 0.02083333395421505, "step": 457 }, { "completion_length": 1223.1458740234375, "epoch": 0.5585365853658537, "grad_norm": 0.13151633739471436, "kl": 0.0552978515625, "learning_rate": 2.715773459880202e-06, "loss": 0.0394, "reward": 0.0416666679084301, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.0416666679084301, "rewards/format_reward": 0.0, "step": 458 }, { "completion_length": 1160.1041870117188, "epoch": 0.5597560975609757, "grad_norm": 0.28270423412323, "kl": 0.055419921875, "learning_rate": 2.7139007031086414e-06, "loss": 0.0181, "reward": 0.10416666977107525, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.10416666977107525, "rewards/format_reward": 0.0, "step": 459 }, { "completion_length": 1319.9791870117188, "epoch": 0.5609756097560976, "grad_norm": 0.37884706258773804, "kl": 0.05419921875, "learning_rate": 2.7120224470039394e-06, "loss": -0.0192, "reward": 0.3750000149011612, "reward_std": 0.18042195588350296, "rewards/accuracy_reward": 0.3750000149011612, "rewards/format_reward": 0.0, "step": 460 }, { "completion_length": 778.9583435058594, "epoch": 0.5621951219512196, "grad_norm": 0.3582049608230591, "kl": 0.06005859375, "learning_rate": 2.710138700075157e-06, "loss": -0.0113, "reward": 0.3541666865348816, "reward_std": 0.18042195588350296, "rewards/accuracy_reward": 0.3541666865348816, "rewards/format_reward": 0.0, "step": 461 }, { "completion_length": 1029.1041870117188, "epoch": 0.5634146341463414, "grad_norm": 0.1818416714668274, "kl": 0.0457763671875, "learning_rate": 2.7082494708562316e-06, "loss": 0.0075, "reward": 0.3333333432674408, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.3333333432674408, "rewards/format_reward": 0.0, "step": 462 }, { "completion_length": 1187.1250305175781, "epoch": 0.5646341463414634, "grad_norm": 0.11220806837081909, "kl": 0.0535888671875, "learning_rate": 2.706354767905936e-06, "loss": 0.0434, "reward": 0.2083333358168602, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.2083333358168602, "rewards/format_reward": 0.0, "step": 463 }, { "completion_length": 958.875, "epoch": 0.5658536585365853, "grad_norm": 0.04939443618059158, "kl": 0.040283203125, "learning_rate": 2.7044545998078414e-06, "loss": 0.0021, "reward": 0.0625, "reward_std": 0.0, "rewards/accuracy_reward": 0.0625, "rewards/format_reward": 0.0, "step": 464 }, { "completion_length": 1404.375, "epoch": 0.5670731707317073, "grad_norm": 0.2152305394411087, "kl": 0.0555419921875, "learning_rate": 2.702548975170277e-06, "loss": 0.0593, "reward": 0.1041666679084301, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.1041666679084301, "rewards/format_reward": 0.0, "step": 465 }, { "completion_length": 1072.6042175292969, "epoch": 0.5682926829268292, "grad_norm": 0.21922890841960907, "kl": 0.0535888671875, "learning_rate": 2.7006379026262924e-06, "loss": -0.0193, "reward": 0.1041666679084301, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.08333333395421505, "rewards/format_reward": 0.02083333395421505, "step": 466 }, { "completion_length": 1168.5000610351562, "epoch": 0.5695121951219512, "grad_norm": 0.36720722913742065, "kl": 0.0565185546875, "learning_rate": 2.6987213908336185e-06, "loss": -0.035, "reward": 0.2916666716337204, "reward_std": 0.21650634706020355, "rewards/accuracy_reward": 0.2708333358168602, "rewards/format_reward": 0.02083333395421505, "step": 467 }, { "completion_length": 738.0208435058594, "epoch": 0.5707317073170731, "grad_norm": 0.2384078949689865, "kl": 0.0830078125, "learning_rate": 2.696799448474625e-06, "loss": 0.0177, "reward": 0.2291666679084301, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.2291666679084301, "rewards/format_reward": 0.0, "step": 468 }, { "completion_length": 1138.6041870117188, "epoch": 0.5719512195121951, "grad_norm": 0.13069722056388855, "kl": 0.05322265625, "learning_rate": 2.694872084256287e-06, "loss": 0.0051, "reward": 0.1666666679084301, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.1666666679084301, "rewards/format_reward": 0.0, "step": 469 }, { "completion_length": 1095.3333740234375, "epoch": 0.573170731707317, "grad_norm": 0.20416928827762604, "kl": 0.0361328125, "learning_rate": 2.69293930691014e-06, "loss": 0.0289, "reward": 0.1666666716337204, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.1666666716337204, "rewards/format_reward": 0.0, "step": 470 }, { "completion_length": 957.6041870117188, "epoch": 0.574390243902439, "grad_norm": 0.19630835950374603, "kl": 0.0406494140625, "learning_rate": 2.691001125192243e-06, "loss": 0.0987, "reward": 0.2500000074505806, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.2500000074505806, "rewards/format_reward": 0.0, "step": 471 }, { "completion_length": 1503.6666870117188, "epoch": 0.5756097560975609, "grad_norm": 0.27781251072883606, "kl": 0.052001953125, "learning_rate": 2.689057547883139e-06, "loss": 0.0444, "reward": 0.1041666716337204, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.1041666716337204, "rewards/format_reward": 0.0, "step": 472 }, { "completion_length": 1112.9375, "epoch": 0.5768292682926829, "grad_norm": 0.17690780758857727, "kl": 0.0364990234375, "learning_rate": 2.687108583787815e-06, "loss": -0.0049, "reward": 0.1666666679084301, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.1666666679084301, "rewards/format_reward": 0.0, "step": 473 }, { "completion_length": 1363.5208740234375, "epoch": 0.5780487804878048, "grad_norm": 0.24803577363491058, "kl": 0.1256103515625, "learning_rate": 2.6851542417356605e-06, "loss": 0.1488, "reward": 0.2291666716337204, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.2291666716337204, "rewards/format_reward": 0.0, "step": 474 }, { "completion_length": 1684.0208740234375, "epoch": 0.5792682926829268, "grad_norm": 0.2815094292163849, "kl": 0.0484619140625, "learning_rate": 2.683194530580429e-06, "loss": 0.0227, "reward": 0.1041666679084301, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.1041666679084301, "rewards/format_reward": 0.0, "step": 475 }, { "completion_length": 1131.3333740234375, "epoch": 0.5804878048780487, "grad_norm": 0.2404363602399826, "kl": 0.037109375, "learning_rate": 2.6812294592001984e-06, "loss": 0.0644, "reward": 0.1458333395421505, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.1458333395421505, "rewards/format_reward": 0.0, "step": 476 }, { "completion_length": 1365.5417175292969, "epoch": 0.5817073170731707, "grad_norm": 0.22256530821323395, "kl": 0.047607421875, "learning_rate": 2.67925903649733e-06, "loss": 0.1315, "reward": 0.3125000149011612, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.3125000149011612, "rewards/format_reward": 0.0, "step": 477 }, { "completion_length": 1505.125, "epoch": 0.5829268292682926, "grad_norm": 0.2298562377691269, "kl": 0.0440673828125, "learning_rate": 2.677283271398427e-06, "loss": 0.1158, "reward": 0.229166679084301, "reward_std": 0.21650635451078415, "rewards/accuracy_reward": 0.229166679084301, "rewards/format_reward": 0.0, "step": 478 }, { "completion_length": 1712.4583740234375, "epoch": 0.5841463414634146, "grad_norm": 0.3043181300163269, "kl": 0.0555419921875, "learning_rate": 2.6753021728542965e-06, "loss": 0.0561, "reward": 0.2291666716337204, "reward_std": 0.18042195588350296, "rewards/accuracy_reward": 0.2083333432674408, "rewards/format_reward": 0.02083333395421505, "step": 479 }, { "completion_length": 1515.104248046875, "epoch": 0.5853658536585366, "grad_norm": 0.08216846734285355, "kl": 0.0465087890625, "learning_rate": 2.673315749839907e-06, "loss": 0.0281, "reward": 0.02083333395421505, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.02083333395421505, "rewards/format_reward": 0.0, "step": 480 }, { "completion_length": 990.375, "epoch": 0.5865853658536585, "grad_norm": 0.24126873910427094, "kl": 0.0416259765625, "learning_rate": 2.6713240113543487e-06, "loss": 0.0088, "reward": 0.458333358168602, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.458333358168602, "rewards/format_reward": 0.0, "step": 481 }, { "completion_length": 1097.125, "epoch": 0.5878048780487805, "grad_norm": 0.3801729083061218, "kl": 0.04736328125, "learning_rate": 2.669326966420793e-06, "loss": 0.1985, "reward": 0.2500000149011612, "reward_std": 0.21650633960962296, "rewards/accuracy_reward": 0.2500000149011612, "rewards/format_reward": 0.0, "step": 482 }, { "completion_length": 1169.7291870117188, "epoch": 0.5890243902439024, "grad_norm": 0.14997708797454834, "kl": 0.0557861328125, "learning_rate": 2.66732462408645e-06, "loss": 0.0039, "reward": 0.2083333432674408, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.2083333432674408, "rewards/format_reward": 0.0, "step": 483 }, { "completion_length": 1166.3958740234375, "epoch": 0.5902439024390244, "grad_norm": 0.24672070145606995, "kl": 0.0418701171875, "learning_rate": 2.6653169934225295e-06, "loss": 0.0789, "reward": 0.1666666679084301, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.1666666679084301, "rewards/format_reward": 0.0, "step": 484 }, { "completion_length": 1335.7708740234375, "epoch": 0.5914634146341463, "grad_norm": 0.1735910326242447, "kl": 0.0411376953125, "learning_rate": 2.6633040835241987e-06, "loss": 0.0728, "reward": 0.125, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "step": 485 }, { "completion_length": 1121.1666870117188, "epoch": 0.5926829268292683, "grad_norm": 0.2157490849494934, "kl": 0.0391845703125, "learning_rate": 2.661285903510541e-06, "loss": 0.0758, "reward": 0.125, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "step": 486 }, { "completion_length": 1581.6041870117188, "epoch": 0.5939024390243902, "grad_norm": 0.27000123262405396, "kl": 0.072265625, "learning_rate": 2.659262462524515e-06, "loss": 0.1263, "reward": 0.1875000074505806, "reward_std": 0.18042195588350296, "rewards/accuracy_reward": 0.1666666716337204, "rewards/format_reward": 0.02083333395421505, "step": 487 }, { "completion_length": 1651.1875610351562, "epoch": 0.5951219512195122, "grad_norm": 0.19407932460308075, "kl": 0.059326171875, "learning_rate": 2.6572337697329145e-06, "loss": 0.0808, "reward": 0.08333333395421505, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.08333333395421505, "rewards/format_reward": 0.0, "step": 488 }, { "completion_length": 1309.1041870117188, "epoch": 0.5963414634146341, "grad_norm": 0.12798021733760834, "kl": 0.05859375, "learning_rate": 2.6551998343263237e-06, "loss": 0.0054, "reward": 0.14583333395421505, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.14583333395421505, "rewards/format_reward": 0.0, "step": 489 }, { "completion_length": 1194.625, "epoch": 0.5975609756097561, "grad_norm": 0.16306421160697937, "kl": 0.0546875, "learning_rate": 2.6531606655190777e-06, "loss": 0.0574, "reward": 0.2083333395421505, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.2083333395421505, "rewards/format_reward": 0.0, "step": 490 }, { "completion_length": 1320.1666870117188, "epoch": 0.598780487804878, "grad_norm": 0.2973303198814392, "kl": 0.0498046875, "learning_rate": 2.651116272549222e-06, "loss": 0.023, "reward": 0.1875, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.1875, "rewards/format_reward": 0.0, "step": 491 }, { "completion_length": 1426.4583740234375, "epoch": 0.6, "grad_norm": 0.10100162029266357, "kl": 0.060302734375, "learning_rate": 2.649066664678467e-06, "loss": 0.0026, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "step": 492 }, { "completion_length": 1087.4583740234375, "epoch": 0.6012195121951219, "grad_norm": 0.21180382370948792, "kl": 0.044189453125, "learning_rate": 2.64701185119215e-06, "loss": 0.1241, "reward": 0.3125, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.3125, "rewards/format_reward": 0.0, "step": 493 }, { "completion_length": 1077.7500610351562, "epoch": 0.6024390243902439, "grad_norm": 0.1880342662334442, "kl": 0.05810546875, "learning_rate": 2.64495184139919e-06, "loss": -0.0126, "reward": 0.1250000037252903, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.1250000037252903, "rewards/format_reward": 0.0, "step": 494 }, { "completion_length": 1699.6666870117188, "epoch": 0.6036585365853658, "grad_norm": 0.27347323298454285, "kl": 0.07177734375, "learning_rate": 2.642886644632047e-06, "loss": 0.0149, "reward": 0.08333333395421505, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.06250000186264515, "rewards/format_reward": 0.02083333395421505, "step": 495 }, { "completion_length": 1039.2292175292969, "epoch": 0.6048780487804878, "grad_norm": 0.30174925923347473, "kl": 0.0496826171875, "learning_rate": 2.640816270246681e-06, "loss": -0.0058, "reward": 0.2916666716337204, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.2708333432674408, "rewards/format_reward": 0.02083333395421505, "step": 496 }, { "completion_length": 1039.875, "epoch": 0.6060975609756097, "grad_norm": 0.30453553795814514, "kl": 0.0396728515625, "learning_rate": 2.6387407276225055e-06, "loss": 0.1335, "reward": 0.2916666716337204, "reward_std": 0.18042195588350296, "rewards/accuracy_reward": 0.2916666716337204, "rewards/format_reward": 0.0, "step": 497 }, { "completion_length": 1335.1666870117188, "epoch": 0.6073170731707317, "grad_norm": 0.17169587314128876, "kl": 0.0625, "learning_rate": 2.636660026162351e-06, "loss": 0.0067, "reward": 0.2083333395421505, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.2083333395421505, "rewards/format_reward": 0.0, "step": 498 }, { "completion_length": 1355.5833740234375, "epoch": 0.6085365853658536, "grad_norm": 0.2508138418197632, "kl": 0.0516357421875, "learning_rate": 2.6345741752924154e-06, "loss": 0.0888, "reward": 0.3125000149011612, "reward_std": 0.18042195588350296, "rewards/accuracy_reward": 0.3125000149011612, "rewards/format_reward": 0.0, "step": 499 }, { "completion_length": 1596.0833740234375, "epoch": 0.6097560975609756, "grad_norm": 0.21253181993961334, "kl": 0.0555419921875, "learning_rate": 2.6324831844622278e-06, "loss": 0.1028, "reward": 0.2500000074505806, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.2291666716337204, "rewards/format_reward": 0.02083333395421505, "step": 500 }, { "completion_length": 1455.979248046875, "epoch": 0.6109756097560975, "grad_norm": 0.2276027351617813, "kl": 0.060302734375, "learning_rate": 2.6303870631446013e-06, "loss": 0.0198, "reward": 0.1875000074505806, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.1875000074505806, "rewards/format_reward": 0.0, "step": 501 }, { "completion_length": 1299.791748046875, "epoch": 0.6121951219512195, "grad_norm": 0.1989363580942154, "kl": 0.0565185546875, "learning_rate": 2.628285820835593e-06, "loss": 0.0878, "reward": 0.2708333432674408, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.2708333432674408, "rewards/format_reward": 0.0, "step": 502 }, { "completion_length": 1726.8959350585938, "epoch": 0.6134146341463415, "grad_norm": 0.14849476516246796, "kl": 0.0677490234375, "learning_rate": 2.6261794670544584e-06, "loss": 0.0424, "reward": 0.1250000037252903, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.1250000037252903, "rewards/format_reward": 0.0, "step": 503 }, { "completion_length": 1562.0625, "epoch": 0.6146341463414634, "grad_norm": 0.13582201302051544, "kl": 0.062255859375, "learning_rate": 2.6240680113436096e-06, "loss": 0.0629, "reward": 0.14583333395421505, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.14583333395421505, "rewards/format_reward": 0.0, "step": 504 }, { "completion_length": 1658.8125610351562, "epoch": 0.6158536585365854, "grad_norm": 0.1977536678314209, "kl": 0.0645751953125, "learning_rate": 2.6219514632685732e-06, "loss": 0.0363, "reward": 0.10416666977107525, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.0833333358168602, "rewards/format_reward": 0.02083333395421505, "step": 505 }, { "completion_length": 1048.2292175292969, "epoch": 0.6170731707317073, "grad_norm": 0.2311759740114212, "kl": 0.0509033203125, "learning_rate": 2.619829832417944e-06, "loss": 0.0574, "reward": 0.2708333358168602, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.2708333358168602, "rewards/format_reward": 0.0, "step": 506 }, { "completion_length": 1363.8125610351562, "epoch": 0.6182926829268293, "grad_norm": 0.2936003804206848, "kl": 0.06298828125, "learning_rate": 2.6177031284033447e-06, "loss": 0.0193, "reward": 0.2083333358168602, "reward_std": 0.21650633960962296, "rewards/accuracy_reward": 0.1666666716337204, "rewards/format_reward": 0.0416666679084301, "step": 507 }, { "completion_length": 1696.75, "epoch": 0.6195121951219512, "grad_norm": 0.168170765042305, "kl": 0.0655517578125, "learning_rate": 2.6155713608593796e-06, "loss": 0.0578, "reward": 0.1666666716337204, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.1666666716337204, "rewards/format_reward": 0.0, "step": 508 }, { "completion_length": 1585.6875610351562, "epoch": 0.6207317073170732, "grad_norm": 0.11840315163135529, "kl": 0.055908203125, "learning_rate": 2.6134345394435936e-06, "loss": 0.0242, "reward": 0.20833333395421505, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.20833333395421505, "rewards/format_reward": 0.0, "step": 509 }, { "completion_length": 1523.0625610351562, "epoch": 0.6219512195121951, "grad_norm": 0.2206415832042694, "kl": 0.07373046875, "learning_rate": 2.6112926738364267e-06, "loss": 0.0635, "reward": 0.25000000558793545, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.22916667722165585, "rewards/format_reward": 0.02083333395421505, "step": 510 }, { "completion_length": 1168.125, "epoch": 0.6231707317073171, "grad_norm": 0.36636242270469666, "kl": 0.04425048828125, "learning_rate": 2.6091457737411704e-06, "loss": 0.0256, "reward": 0.3333333432674408, "reward_std": 0.21650634706020355, "rewards/accuracy_reward": 0.291666679084301, "rewards/format_reward": 0.0416666679084301, "step": 511 }, { "completion_length": 1164.4375610351562, "epoch": 0.624390243902439, "grad_norm": 0.1898491531610489, "kl": 0.048095703125, "learning_rate": 2.606993848883924e-06, "loss": 0.0033, "reward": 0.1875000074505806, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.1666666716337204, "rewards/format_reward": 0.02083333395421505, "step": 512 }, { "completion_length": 1449.1458740234375, "epoch": 0.625609756097561, "grad_norm": 0.16993702948093414, "kl": 0.061279296875, "learning_rate": 2.6048369090135504e-06, "loss": 0.0012, "reward": 0.1666666716337204, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.1666666716337204, "rewards/format_reward": 0.0, "step": 513 }, { "completion_length": 1298.9375610351562, "epoch": 0.6268292682926829, "grad_norm": 0.188385471701622, "kl": 0.0517578125, "learning_rate": 2.6026749639016327e-06, "loss": 0.0336, "reward": 0.1666666716337204, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.1250000037252903, "rewards/format_reward": 0.0416666679084301, "step": 514 }, { "completion_length": 1865.375, "epoch": 0.6280487804878049, "grad_norm": 0.1277674436569214, "kl": 0.056884765625, "learning_rate": 2.600508023342428e-06, "loss": 0.0107, "reward": 0.08333333395421505, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.08333333395421505, "rewards/format_reward": 0.0, "step": 515 }, { "completion_length": 1496.0833740234375, "epoch": 0.6292682926829268, "grad_norm": 0.18677006661891937, "kl": 0.0616455078125, "learning_rate": 2.5983360971528252e-06, "loss": 0.0695, "reward": 0.2708333358168602, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.2291666716337204, "rewards/format_reward": 0.0416666679084301, "step": 516 }, { "completion_length": 1251.3750610351562, "epoch": 0.6304878048780488, "grad_norm": 0.36834806203842163, "kl": 0.0499267578125, "learning_rate": 2.5961591951722993e-06, "loss": 0.1318, "reward": 0.2708333432674408, "reward_std": 0.25259073823690414, "rewards/accuracy_reward": 0.1666666679084301, "rewards/format_reward": 0.1041666679084301, "step": 517 }, { "completion_length": 1607.8125, "epoch": 0.6317073170731707, "grad_norm": 0.4252477288246155, "kl": 0.0595703125, "learning_rate": 2.5939773272628674e-06, "loss": 0.0948, "reward": 0.16666667722165585, "reward_std": 0.21650633960962296, "rewards/accuracy_reward": 0.10416666977107525, "rewards/format_reward": 0.0625, "step": 518 }, { "completion_length": 1406.8541870117188, "epoch": 0.6329268292682927, "grad_norm": 0.2951771914958954, "kl": 0.049072265625, "learning_rate": 2.5917905033090436e-06, "loss": 0.0129, "reward": 0.2916666716337204, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.2083333358168602, "rewards/format_reward": 0.08333333395421505, "step": 519 }, { "completion_length": 1534.5208740234375, "epoch": 0.6341463414634146, "grad_norm": 0.31893834471702576, "kl": 0.0692138671875, "learning_rate": 2.5895987332177935e-06, "loss": -0.0189, "reward": 0.12500000558793545, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.12500000558793545, "rewards/format_reward": 0.0, "step": 520 }, { "completion_length": 1495.3541870117188, "epoch": 0.6353658536585366, "grad_norm": 0.47182220220565796, "kl": 0.052734375, "learning_rate": 2.587402026918492e-06, "loss": -0.0025, "reward": 0.4166666716337204, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.1666666716337204, "step": 521 }, { "completion_length": 1343.5208740234375, "epoch": 0.6365853658536585, "grad_norm": 1.3042664527893066, "kl": 0.0570068359375, "learning_rate": 2.5852003943628746e-06, "loss": 0.0214, "reward": 0.3125000149011612, "reward_std": 0.32475951313972473, "rewards/accuracy_reward": 0.1875, "rewards/format_reward": 0.1250000037252903, "step": 522 }, { "completion_length": 1460.041748046875, "epoch": 0.6378048780487805, "grad_norm": 0.3791583180427551, "kl": 0.0567626953125, "learning_rate": 2.5829938455249958e-06, "loss": 0.0304, "reward": 0.2708333358168602, "reward_std": 0.18042195588350296, "rewards/accuracy_reward": 0.1666666716337204, "rewards/format_reward": 0.10416666977107525, "step": 523 }, { "completion_length": 1232.1250610351562, "epoch": 0.6390243902439025, "grad_norm": 0.30961230397224426, "kl": 0.05078125, "learning_rate": 2.5807823904011804e-06, "loss": 0.1432, "reward": 0.25, "reward_std": 0.21650634706020355, "rewards/accuracy_reward": 0.1458333358168602, "rewards/format_reward": 0.1041666679084301, "step": 524 }, { "completion_length": 1286.8125610351562, "epoch": 0.6402439024390244, "grad_norm": 0.5515032410621643, "kl": 0.0535888671875, "learning_rate": 2.578566039009983e-06, "loss": -0.0019, "reward": 0.4166666865348816, "reward_std": 0.32475951313972473, "rewards/accuracy_reward": 0.16666667722165585, "rewards/format_reward": 0.25, "step": 525 }, { "completion_length": 1127.7708740234375, "epoch": 0.6414634146341464, "grad_norm": 0.4442518949508667, "kl": 0.0595703125, "learning_rate": 2.576344801392137e-06, "loss": 0.2436, "reward": 0.458333358168602, "reward_std": 0.21650633960962296, "rewards/accuracy_reward": 0.2083333395421505, "rewards/format_reward": 0.25, "step": 526 }, { "completion_length": 1529.916748046875, "epoch": 0.6426829268292683, "grad_norm": 0.35047128796577454, "kl": 0.0565185546875, "learning_rate": 2.5741186876105127e-06, "loss": 0.0015, "reward": 0.3333333432674408, "reward_std": 0.25259073823690414, "rewards/accuracy_reward": 0.1666666716337204, "rewards/format_reward": 0.1666666716337204, "step": 527 }, { "completion_length": 1643.7708740234375, "epoch": 0.6439024390243903, "grad_norm": 0.33923107385635376, "kl": 0.0679931640625, "learning_rate": 2.571887707750072e-06, "loss": 0.0828, "reward": 0.395833358168602, "reward_std": 0.21650633960962296, "rewards/accuracy_reward": 0.1666666716337204, "rewards/format_reward": 0.2291666716337204, "step": 528 }, { "completion_length": 1681.2291870117188, "epoch": 0.6451219512195122, "grad_norm": 0.29589563608169556, "kl": 0.06689453125, "learning_rate": 2.56965187191782e-06, "loss": 0.0375, "reward": 0.6041666865348816, "reward_std": 0.25259073078632355, "rewards/accuracy_reward": 0.4166666716337204, "rewards/format_reward": 0.1875000074505806, "step": 529 }, { "completion_length": 1482.5416870117188, "epoch": 0.6463414634146342, "grad_norm": 0.39176300168037415, "kl": 0.068115234375, "learning_rate": 2.5674111902427625e-06, "loss": 0.0098, "reward": 0.3541666716337204, "reward_std": 0.25259073823690414, "rewards/accuracy_reward": 0.2291666679084301, "rewards/format_reward": 0.125, "step": 530 }, { "completion_length": 1705.0833740234375, "epoch": 0.6475609756097561, "grad_norm": 0.2790674567222595, "kl": 0.078369140625, "learning_rate": 2.5651656728758566e-06, "loss": 0.0247, "reward": 0.291666679084301, "reward_std": 0.25259075313806534, "rewards/accuracy_reward": 0.1458333395421505, "rewards/format_reward": 0.1458333395421505, "step": 531 }, { "completion_length": 1535.6250610351562, "epoch": 0.6487804878048781, "grad_norm": 0.33650991320610046, "kl": 0.070556640625, "learning_rate": 2.5629153299899673e-06, "loss": 0.1307, "reward": 0.3750000149011612, "reward_std": 0.25259073823690414, "rewards/accuracy_reward": 0.2916666716337204, "rewards/format_reward": 0.0833333358168602, "step": 532 }, { "completion_length": 1602.1458740234375, "epoch": 0.65, "grad_norm": 0.3244190216064453, "kl": 0.07373046875, "learning_rate": 2.5606601717798212e-06, "loss": -0.0058, "reward": 0.1458333395421505, "reward_std": 0.21650634706020355, "rewards/accuracy_reward": 0.06250000186264515, "rewards/format_reward": 0.08333333395421505, "step": 533 }, { "completion_length": 1641.854248046875, "epoch": 0.651219512195122, "grad_norm": 0.30571281909942627, "kl": 0.08740234375, "learning_rate": 2.5584002084619593e-06, "loss": 0.0964, "reward": 0.1875, "reward_std": 0.25259073823690414, "rewards/accuracy_reward": 0.1041666679084301, "rewards/format_reward": 0.08333333395421505, "step": 534 }, { "completion_length": 1259.4791870117188, "epoch": 0.6524390243902439, "grad_norm": 0.5386083722114563, "kl": 0.087158203125, "learning_rate": 2.5561354502746907e-06, "loss": 0.1023, "reward": 0.3333333432674408, "reward_std": 0.32475951313972473, "rewards/accuracy_reward": 0.1250000037252903, "rewards/format_reward": 0.2083333432674408, "step": 535 }, { "completion_length": 1341.6458740234375, "epoch": 0.6536585365853659, "grad_norm": 1.0190380811691284, "kl": 0.1103515625, "learning_rate": 2.5538659074780484e-06, "loss": 0.0651, "reward": 0.3333333432674408, "reward_std": 0.25259073078632355, "rewards/accuracy_reward": 0.1666666679084301, "rewards/format_reward": 0.1666666679084301, "step": 536 }, { "completion_length": 1137.0000610351562, "epoch": 0.6548780487804878, "grad_norm": 0.4159141778945923, "kl": 0.097900390625, "learning_rate": 2.551591590353738e-06, "loss": 0.0564, "reward": 0.3125000149011612, "reward_std": 0.18042195588350296, "rewards/accuracy_reward": 0.1458333395421505, "rewards/format_reward": 0.1666666716337204, "step": 537 }, { "completion_length": 935.2500305175781, "epoch": 0.6560975609756098, "grad_norm": 0.7289742827415466, "kl": 0.16064453125, "learning_rate": 2.549312509205097e-06, "loss": -0.0099, "reward": 0.1458333395421505, "reward_std": 0.18042195588350296, "rewards/accuracy_reward": 0.0833333358168602, "rewards/format_reward": 0.0625, "step": 538 }, { "completion_length": 1087.729248046875, "epoch": 0.6573170731707317, "grad_norm": 0.5276138186454773, "kl": 0.2470703125, "learning_rate": 2.5470286743570447e-06, "loss": 0.0763, "reward": 0.1041666679084301, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.1041666679084301, "rewards/format_reward": 0.0, "step": 539 }, { "completion_length": 1276.8541870117188, "epoch": 0.6585365853658537, "grad_norm": 0.7154669165611267, "kl": 0.3427734375, "learning_rate": 2.5447400961560355e-06, "loss": 0.0131, "reward": 0.16666667722165585, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.16666667722165585, "rewards/format_reward": 0.0, "step": 540 }, { "completion_length": 1188.5000610351562, "epoch": 0.6597560975609756, "grad_norm": 0.5348107814788818, "kl": 0.3330078125, "learning_rate": 2.542446784970013e-06, "loss": 0.0055, "reward": 0.2708333358168602, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.2708333358168602, "rewards/format_reward": 0.0, "step": 541 }, { "completion_length": 1699.8125610351562, "epoch": 0.6609756097560976, "grad_norm": 0.8080800771713257, "kl": 0.2080078125, "learning_rate": 2.5401487511883627e-06, "loss": 0.0824, "reward": 0.2083333432674408, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.2083333432674408, "rewards/format_reward": 0.0, "step": 542 }, { "completion_length": 2211.166748046875, "epoch": 0.6621951219512195, "grad_norm": 0.46947646141052246, "kl": 0.146728515625, "learning_rate": 2.5378460052218646e-06, "loss": 0.0768, "reward": 0.1875, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.1875, "rewards/format_reward": 0.0, "step": 543 }, { "completion_length": 2537.4375, "epoch": 0.6634146341463415, "grad_norm": 0.17736782133579254, "kl": 0.119873046875, "learning_rate": 2.5355385575026464e-06, "loss": 0.0676, "reward": 0.2500000074505806, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.2500000074505806, "rewards/format_reward": 0.0, "step": 544 }, { "completion_length": 3070.729248046875, "epoch": 0.6646341463414634, "grad_norm": 1.07599675655365, "kl": 0.216796875, "learning_rate": 2.5332264184841366e-06, "loss": 0.052, "reward": 0.25, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.0, "step": 545 }, { "completion_length": 3293.8125, "epoch": 0.6658536585365854, "grad_norm": 0.11890757083892822, "kl": 0.080810546875, "learning_rate": 2.5309095986410155e-06, "loss": 0.1254, "reward": 0.3333333432674408, "reward_std": 0.18042195588350296, "rewards/accuracy_reward": 0.3333333432674408, "rewards/format_reward": 0.0, "step": 546 }, { "completion_length": 3493.604248046875, "epoch": 0.6670731707317074, "grad_norm": 0.08753108978271484, "kl": 0.23681640625, "learning_rate": 2.5285881084691706e-06, "loss": 0.0323, "reward": 0.0833333358168602, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.0833333358168602, "rewards/format_reward": 0.0, "step": 547 }, { "completion_length": 3722.52099609375, "epoch": 0.6682926829268293, "grad_norm": 0.10485602170228958, "kl": 0.0859375, "learning_rate": 2.5262619584856456e-06, "loss": 0.0035, "reward": 0.0625, "reward_std": 0.0, "rewards/accuracy_reward": 0.0625, "rewards/format_reward": 0.0, "step": 548 }, { "completion_length": 3305.52099609375, "epoch": 0.6695121951219513, "grad_norm": 0.298432856798172, "kl": 0.113037109375, "learning_rate": 2.5239311592285966e-06, "loss": 0.0038, "reward": 0.125, "reward_std": 0.0, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "step": 549 }, { "completion_length": 3981.2708740234375, "epoch": 0.6707317073170732, "grad_norm": 0.09088010340929031, "kl": 0.090087890625, "learning_rate": 2.52159572125724e-06, "loss": 0.003, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "step": 550 }, { "completion_length": 3769.104248046875, "epoch": 0.6719512195121952, "grad_norm": 0.09075552225112915, "kl": 0.129150390625, "learning_rate": 2.5192556551518086e-06, "loss": 0.003, "reward": 0.0625, "reward_std": 0.0, "rewards/accuracy_reward": 0.0625, "rewards/format_reward": 0.0, "step": 551 }, { "completion_length": 3749.791748046875, "epoch": 0.6731707317073171, "grad_norm": 0.19361332058906555, "kl": 0.097900390625, "learning_rate": 2.5169109715135015e-06, "loss": 0.0031, "reward": 0.125, "reward_std": 0.0, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "step": 552 }, { "completion_length": 3634.6458740234375, "epoch": 0.6743902439024391, "grad_norm": 0.10965518653392792, "kl": 0.1318359375, "learning_rate": 2.514561680964437e-06, "loss": 0.0027, "reward": 0.125, "reward_std": 0.0, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "step": 553 }, { "completion_length": 3812.9376220703125, "epoch": 0.675609756097561, "grad_norm": 0.09198274463415146, "kl": 0.0633544921875, "learning_rate": 2.512207794147603e-06, "loss": 0.0025, "reward": 0.125, "reward_std": 0.0, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "step": 554 }, { "completion_length": 4025.916748046875, "epoch": 0.676829268292683, "grad_norm": 0.07839851826429367, "kl": 0.0570068359375, "learning_rate": 2.5098493217268116e-06, "loss": 0.0023, "reward": 0.375, "reward_std": 0.0, "rewards/accuracy_reward": 0.375, "rewards/format_reward": 0.0, "step": 555 }, { "completion_length": 3608.2708740234375, "epoch": 0.6780487804878049, "grad_norm": 0.0819554477930069, "kl": 0.102294921875, "learning_rate": 2.507486274386647e-06, "loss": 0.0319, "reward": 0.08333333395421505, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.08333333395421505, "rewards/format_reward": 0.0, "step": 556 }, { "completion_length": 3511.0208740234375, "epoch": 0.6792682926829269, "grad_norm": 0.06574513763189316, "kl": 0.130859375, "learning_rate": 2.505118662832421e-06, "loss": 0.0019, "reward": 0.0625, "reward_std": 0.0, "rewards/accuracy_reward": 0.0625, "rewards/format_reward": 0.0, "step": 557 }, { "completion_length": 3563.104248046875, "epoch": 0.6804878048780488, "grad_norm": 0.06478916853666306, "kl": 0.2353515625, "learning_rate": 2.5027464977901206e-06, "loss": 0.0309, "reward": 0.02083333395421505, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.02083333395421505, "rewards/format_reward": 0.0, "step": 558 }, { "completion_length": 2793.979248046875, "epoch": 0.6817073170731708, "grad_norm": 0.05855738744139671, "kl": 0.13525390625, "learning_rate": 2.5003697900063643e-06, "loss": 0.0019, "reward": 0.0625, "reward_std": 0.0, "rewards/accuracy_reward": 0.0625, "rewards/format_reward": 0.0, "step": 559 }, { "completion_length": 3448.291748046875, "epoch": 0.6829268292682927, "grad_norm": 0.055873386561870575, "kl": 0.064697265625, "learning_rate": 2.4979885502483478e-06, "loss": 0.0295, "reward": 0.02083333395421505, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.02083333395421505, "rewards/format_reward": 0.0, "step": 560 }, { "completion_length": 2942.1875, "epoch": 0.6841463414634147, "grad_norm": 0.03953621909022331, "kl": 0.06298828125, "learning_rate": 2.4956027893038004e-06, "loss": 0.0016, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "step": 561 }, { "completion_length": 3429.6876220703125, "epoch": 0.6853658536585366, "grad_norm": 0.0740460455417633, "kl": 0.079345703125, "learning_rate": 2.4932125179809316e-06, "loss": 0.0295, "reward": 0.08333333395421505, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.08333333395421505, "rewards/format_reward": 0.0, "step": 562 }, { "completion_length": 3318.52099609375, "epoch": 0.6865853658536586, "grad_norm": 0.12191849946975708, "kl": 0.053955078125, "learning_rate": 2.4908177471083855e-06, "loss": 0.0592, "reward": 0.0833333358168602, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.0833333358168602, "rewards/format_reward": 0.0, "step": 563 }, { "completion_length": 2935.354248046875, "epoch": 0.6878048780487804, "grad_norm": 0.07754002511501312, "kl": 0.11865234375, "learning_rate": 2.4884184875351897e-06, "loss": 0.0245, "reward": 0.02083333395421505, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.02083333395421505, "rewards/format_reward": 0.0, "step": 564 }, { "completion_length": 2964.3125, "epoch": 0.6890243902439024, "grad_norm": 0.09392862766981125, "kl": 0.2734375, "learning_rate": 2.486014750130708e-06, "loss": 0.0419, "reward": 0.1041666679084301, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.1041666679084301, "rewards/format_reward": 0.0, "step": 565 }, { "completion_length": 2574.7500610351562, "epoch": 0.6902439024390243, "grad_norm": 0.03392501920461655, "kl": 0.084228515625, "learning_rate": 2.48360654578459e-06, "loss": 0.0015, "reward": 0.125, "reward_std": 0.0, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "step": 566 }, { "completion_length": 2646.729248046875, "epoch": 0.6914634146341463, "grad_norm": 0.17704911530017853, "kl": 0.06591796875, "learning_rate": 2.48119388540672e-06, "loss": 0.0327, "reward": 0.08333333395421505, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.08333333395421505, "rewards/format_reward": 0.0, "step": 567 }, { "completion_length": 2399.9376220703125, "epoch": 0.6926829268292682, "grad_norm": 0.08066665381193161, "kl": 0.060791015625, "learning_rate": 2.4787767799271725e-06, "loss": 0.0606, "reward": 0.1041666716337204, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.1041666716337204, "rewards/format_reward": 0.0, "step": 568 }, { "completion_length": 2745.3126220703125, "epoch": 0.6939024390243902, "grad_norm": 0.0406438484787941, "kl": 0.06640625, "learning_rate": 2.476355240296157e-06, "loss": 0.0016, "reward": 0.125, "reward_std": 0.0, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "step": 569 }, { "completion_length": 2265.8751220703125, "epoch": 0.6951219512195121, "grad_norm": 0.6126076579093933, "kl": 0.06396484375, "learning_rate": 2.473929277483972e-06, "loss": 0.0977, "reward": 0.12500000558793545, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.12500000558793545, "rewards/format_reward": 0.0, "step": 570 }, { "completion_length": 1698.5625610351562, "epoch": 0.6963414634146341, "grad_norm": 0.15366536378860474, "kl": 0.114990234375, "learning_rate": 2.4714989024809555e-06, "loss": 0.0946, "reward": 0.1666666716337204, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.1666666716337204, "rewards/format_reward": 0.0, "step": 571 }, { "completion_length": 2134.0834350585938, "epoch": 0.697560975609756, "grad_norm": 0.09421119838953018, "kl": 0.09130859375, "learning_rate": 2.4690641262974317e-06, "loss": 0.0537, "reward": 0.1041666716337204, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.1041666716337204, "rewards/format_reward": 0.0, "step": 572 }, { "completion_length": 1905.1876220703125, "epoch": 0.698780487804878, "grad_norm": 0.13076740503311157, "kl": 0.07958984375, "learning_rate": 2.4666249599636654e-06, "loss": 0.0513, "reward": 0.2916666865348816, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.2916666865348816, "rewards/format_reward": 0.0, "step": 573 }, { "completion_length": 2079.5833740234375, "epoch": 0.7, "grad_norm": 0.05781978741288185, "kl": 0.052490234375, "learning_rate": 2.464181414529809e-06, "loss": 0.0189, "reward": 0.08333333395421505, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.08333333395421505, "rewards/format_reward": 0.0, "step": 574 }, { "completion_length": 1757.9375610351562, "epoch": 0.7012195121951219, "grad_norm": 0.21642546355724335, "kl": 0.06884765625, "learning_rate": 2.4617335010658546e-06, "loss": 0.0292, "reward": 0.1666666716337204, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.1666666716337204, "rewards/format_reward": 0.0, "step": 575 }, { "completion_length": 1726.0416870117188, "epoch": 0.7024390243902439, "grad_norm": 0.18158158659934998, "kl": 0.0628662109375, "learning_rate": 2.4592812306615812e-06, "loss": 0.1202, "reward": 0.1458333358168602, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.1458333358168602, "rewards/format_reward": 0.0, "step": 576 }, { "completion_length": 1781.2708740234375, "epoch": 0.7036585365853658, "grad_norm": 0.09229027479887009, "kl": 0.064453125, "learning_rate": 2.456824614426508e-06, "loss": 0.0464, "reward": 0.0833333358168602, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.0833333358168602, "rewards/format_reward": 0.0, "step": 577 }, { "completion_length": 1600.5, "epoch": 0.7048780487804878, "grad_norm": 0.1545463651418686, "kl": 0.0841064453125, "learning_rate": 2.4543636634898398e-06, "loss": 0.0348, "reward": 0.25, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.0, "step": 578 }, { "completion_length": 1550.6458740234375, "epoch": 0.7060975609756097, "grad_norm": 0.32401180267333984, "kl": 0.0504150390625, "learning_rate": 2.4518983890004216e-06, "loss": 0.1461, "reward": 0.3333333432674408, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.3333333432674408, "rewards/format_reward": 0.0, "step": 579 }, { "completion_length": 1803.791748046875, "epoch": 0.7073170731707317, "grad_norm": 0.21760672330856323, "kl": 0.0458984375, "learning_rate": 2.4494288021266825e-06, "loss": 0.1229, "reward": 0.0833333358168602, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.0833333358168602, "rewards/format_reward": 0.0, "step": 580 }, { "completion_length": 2209.4791870117188, "epoch": 0.7085365853658536, "grad_norm": 0.17629334330558777, "kl": 0.0506591796875, "learning_rate": 2.446954914056591e-06, "loss": 0.0404, "reward": 0.2083333395421505, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.2083333395421505, "rewards/format_reward": 0.0, "step": 581 }, { "completion_length": 2085.0001220703125, "epoch": 0.7097560975609756, "grad_norm": 0.0947469025850296, "kl": 0.0426025390625, "learning_rate": 2.444476735997598e-06, "loss": 0.0052, "reward": 0.1041666679084301, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.1041666679084301, "rewards/format_reward": 0.0, "step": 582 }, { "completion_length": 1595.9166870117188, "epoch": 0.7109756097560975, "grad_norm": 0.12223908305168152, "kl": 0.0487060546875, "learning_rate": 2.4419942791765926e-06, "loss": 0.0209, "reward": 0.1666666716337204, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.1666666716337204, "rewards/format_reward": 0.0, "step": 583 }, { "completion_length": 2358.229248046875, "epoch": 0.7121951219512195, "grad_norm": 0.0974886491894722, "kl": 0.0419921875, "learning_rate": 2.439507554839846e-06, "loss": 0.0205, "reward": 0.1041666716337204, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.1041666716337204, "rewards/format_reward": 0.0, "step": 584 }, { "completion_length": 1994.7708740234375, "epoch": 0.7134146341463414, "grad_norm": 0.22642923891544342, "kl": 0.0474853515625, "learning_rate": 2.4370165742529625e-06, "loss": 0.1986, "reward": 0.1250000037252903, "reward_std": 0.18042195588350296, "rewards/accuracy_reward": 0.1250000037252903, "rewards/format_reward": 0.0, "step": 585 }, { "completion_length": 1791.0, "epoch": 0.7146341463414634, "grad_norm": 0.29729291796684265, "kl": 0.0543212890625, "learning_rate": 2.4345213487008296e-06, "loss": 0.0895, "reward": 0.1250000037252903, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.1250000037252903, "rewards/format_reward": 0.0, "step": 586 }, { "completion_length": 1809.5833740234375, "epoch": 0.7158536585365853, "grad_norm": 0.07275247573852539, "kl": 0.048583984375, "learning_rate": 2.4320218894875647e-06, "loss": 0.0191, "reward": 0.2083333358168602, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.2083333358168602, "rewards/format_reward": 0.0, "step": 587 }, { "completion_length": 2023.5000610351562, "epoch": 0.7170731707317073, "grad_norm": 0.23188866674900055, "kl": 0.0506591796875, "learning_rate": 2.4295182079364655e-06, "loss": 0.098, "reward": 0.2916666716337204, "reward_std": 0.18042195588350296, "rewards/accuracy_reward": 0.2916666716337204, "rewards/format_reward": 0.0, "step": 588 }, { "completion_length": 1633.3541870117188, "epoch": 0.7182926829268292, "grad_norm": 0.33854755759239197, "kl": 0.056640625, "learning_rate": 2.427010315389958e-06, "loss": 0.0405, "reward": 0.1041666716337204, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.1041666716337204, "rewards/format_reward": 0.0, "step": 589 }, { "completion_length": 2046.6041870117188, "epoch": 0.7195121951219512, "grad_norm": 0.13508957624435425, "kl": 0.0498046875, "learning_rate": 2.424498223209545e-06, "loss": 0.0571, "reward": 0.06250000186264515, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.06250000186264515, "rewards/format_reward": 0.0, "step": 590 }, { "completion_length": 1559.9375, "epoch": 0.7207317073170731, "grad_norm": 0.24500739574432373, "kl": 0.046630859375, "learning_rate": 2.4219819427757566e-06, "loss": 0.145, "reward": 0.1666666679084301, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.1666666679084301, "rewards/format_reward": 0.0, "step": 591 }, { "completion_length": 2086.6458740234375, "epoch": 0.7219512195121951, "grad_norm": 0.13823281228542328, "kl": 0.0499267578125, "learning_rate": 2.4194614854880937e-06, "loss": 0.0951, "reward": 0.0833333358168602, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.0833333358168602, "rewards/format_reward": 0.0, "step": 592 }, { "completion_length": 2079.0208740234375, "epoch": 0.723170731707317, "grad_norm": 0.058214690536260605, "kl": 0.046630859375, "learning_rate": 2.4169368627649823e-06, "loss": 0.0162, "reward": 0.14583333395421505, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.14583333395421505, "rewards/format_reward": 0.0, "step": 593 }, { "completion_length": 2309.104248046875, "epoch": 0.724390243902439, "grad_norm": 0.05540324002504349, "kl": 0.045654296875, "learning_rate": 2.4144080860437184e-06, "loss": 0.0319, "reward": 0.02083333395421505, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.02083333395421505, "rewards/format_reward": 0.0, "step": 594 }, { "completion_length": 2107.2501220703125, "epoch": 0.725609756097561, "grad_norm": 0.265683650970459, "kl": 0.047119140625, "learning_rate": 2.411875166780416e-06, "loss": 0.2441, "reward": 0.1666666716337204, "reward_std": 0.21650634706020355, "rewards/accuracy_reward": 0.1666666716337204, "rewards/format_reward": 0.0, "step": 595 }, { "completion_length": 1953.75, "epoch": 0.7268292682926829, "grad_norm": 0.07148255407810211, "kl": 0.049560546875, "learning_rate": 2.409338116449957e-06, "loss": 0.0426, "reward": 0.14583333395421505, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.14583333395421505, "rewards/format_reward": 0.0, "step": 596 }, { "completion_length": 2208.8959350585938, "epoch": 0.7280487804878049, "grad_norm": 0.11657347530126572, "kl": 0.0484619140625, "learning_rate": 2.4067969465459383e-06, "loss": 0.0834, "reward": 0.1458333395421505, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.1458333395421505, "rewards/format_reward": 0.0, "step": 597 }, { "completion_length": 2126.14599609375, "epoch": 0.7292682926829268, "grad_norm": 0.0644325315952301, "kl": 0.0489501953125, "learning_rate": 2.404251668580619e-06, "loss": 0.0324, "reward": 0.08333333395421505, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.08333333395421505, "rewards/format_reward": 0.0, "step": 598 }, { "completion_length": 2263.9166870117188, "epoch": 0.7304878048780488, "grad_norm": 5.897913455963135, "kl": 0.101806640625, "learning_rate": 2.4017022940848696e-06, "loss": 0.0462, "reward": 0.10416666977107525, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.10416666977107525, "rewards/format_reward": 0.0, "step": 599 }, { "completion_length": 1729.8541870117188, "epoch": 0.7317073170731707, "grad_norm": 2.2581729888916016, "kl": 0.0657958984375, "learning_rate": 2.3991488346081183e-06, "loss": 0.0042, "reward": 0.08333333395421505, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.08333333395421505, "rewards/format_reward": 0.0, "step": 600 }, { "completion_length": 2596.416748046875, "epoch": 0.7329268292682927, "grad_norm": 0.08442476391792297, "kl": 0.05419921875, "learning_rate": 2.3965913017183006e-06, "loss": 0.0528, "reward": 0.0833333358168602, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.0833333358168602, "rewards/format_reward": 0.0, "step": 601 }, { "completion_length": 2007.4584350585938, "epoch": 0.7341463414634146, "grad_norm": 0.08505202829837799, "kl": 0.056396484375, "learning_rate": 2.3940297070018048e-06, "loss": 0.0303, "reward": 0.08333333395421505, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.08333333395421505, "rewards/format_reward": 0.0, "step": 602 }, { "completion_length": 1821.041748046875, "epoch": 0.7353658536585366, "grad_norm": 0.15689483284950256, "kl": 0.0592041015625, "learning_rate": 2.3914640620634213e-06, "loss": 0.0811, "reward": 0.06250000186264515, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.06250000186264515, "rewards/format_reward": 0.0, "step": 603 }, { "completion_length": 1980.2083740234375, "epoch": 0.7365853658536585, "grad_norm": 0.07433542609214783, "kl": 0.0601806640625, "learning_rate": 2.388894378526288e-06, "loss": 0.0404, "reward": 0.02083333395421505, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.02083333395421505, "rewards/format_reward": 0.0, "step": 604 }, { "completion_length": 1860.666748046875, "epoch": 0.7378048780487805, "grad_norm": 0.11207889765501022, "kl": 0.0584716796875, "learning_rate": 2.386320668031841e-06, "loss": 0.0803, "reward": 0.1250000037252903, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.1250000037252903, "rewards/format_reward": 0.0, "step": 605 }, { "completion_length": 1405.7083740234375, "epoch": 0.7390243902439024, "grad_norm": 0.2864086925983429, "kl": 0.0579833984375, "learning_rate": 2.383742942239757e-06, "loss": 0.147, "reward": 0.1666666679084301, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.1666666679084301, "rewards/format_reward": 0.0, "step": 606 }, { "completion_length": 1762.0625610351562, "epoch": 0.7402439024390244, "grad_norm": 0.1908760368824005, "kl": 0.0751953125, "learning_rate": 2.3811612128279053e-06, "loss": 0.0825, "reward": 0.3125000149011612, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.3125000149011612, "rewards/format_reward": 0.0, "step": 607 }, { "completion_length": 2023.6459350585938, "epoch": 0.7414634146341463, "grad_norm": 0.14952409267425537, "kl": 0.059814453125, "learning_rate": 2.3785754914922923e-06, "loss": -0.034, "reward": 0.0416666679084301, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.0416666679084301, "rewards/format_reward": 0.0, "step": 608 }, { "completion_length": 1398.1875610351562, "epoch": 0.7426829268292683, "grad_norm": 0.11472067981958389, "kl": 0.0570068359375, "learning_rate": 2.375985789947008e-06, "loss": 0.0807, "reward": 0.22916667722165585, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.22916667722165585, "rewards/format_reward": 0.0, "step": 609 }, { "completion_length": 1686.4166870117188, "epoch": 0.7439024390243902, "grad_norm": 0.26014742255210876, "kl": 0.0625, "learning_rate": 2.3733921199241755e-06, "loss": -0.0833, "reward": 0.125, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "step": 610 }, { "completion_length": 1734.4583740234375, "epoch": 0.7451219512195122, "grad_norm": 0.25340884923934937, "kl": 0.06103515625, "learning_rate": 2.370794493173895e-06, "loss": 0.1184, "reward": 0.1458333358168602, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.1458333358168602, "rewards/format_reward": 0.0, "step": 611 }, { "completion_length": 1613.6041870117188, "epoch": 0.7463414634146341, "grad_norm": 0.21497339010238647, "kl": 0.0592041015625, "learning_rate": 2.3681929214641924e-06, "loss": 0.0215, "reward": 0.1666666716337204, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.1666666716337204, "rewards/format_reward": 0.0, "step": 612 }, { "completion_length": 2204.229248046875, "epoch": 0.7475609756097561, "grad_norm": 0.07375478744506836, "kl": 0.068603515625, "learning_rate": 2.365587416580966e-06, "loss": 0.0305, "reward": 0.02083333395421505, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.02083333395421505, "rewards/format_reward": 0.0, "step": 613 }, { "completion_length": 1871.604248046875, "epoch": 0.748780487804878, "grad_norm": 0.19106429815292358, "kl": 0.0665283203125, "learning_rate": 2.362977990327931e-06, "loss": -0.0363, "reward": 0.1666666716337204, "reward_std": 0.14433757215738297, "rewards/accuracy_reward": 0.1666666716337204, "rewards/format_reward": 0.0, "step": 614 }, { "completion_length": 1353.7708740234375, "epoch": 0.75, "grad_norm": 0.20251908898353577, "kl": 0.054931640625, "learning_rate": 2.3603646545265692e-06, "loss": 0.0431, "reward": 0.1875, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.1875, "rewards/format_reward": 0.0, "step": 615 }, { "completion_length": 1778.0416870117188, "epoch": 0.751219512195122, "grad_norm": 0.23813143372535706, "kl": 0.06103515625, "learning_rate": 2.357747421016073e-06, "loss": 0.1611, "reward": 0.2291666716337204, "reward_std": 0.21650634706020355, "rewards/accuracy_reward": 0.2291666716337204, "rewards/format_reward": 0.0, "step": 616 }, { "completion_length": 1520.0625610351562, "epoch": 0.7524390243902439, "grad_norm": 0.28580236434936523, "kl": 0.066162109375, "learning_rate": 2.355126301653293e-06, "loss": 0.1565, "reward": 0.1875, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.1875, "rewards/format_reward": 0.0, "step": 617 }, { "completion_length": 1987.291748046875, "epoch": 0.7536585365853659, "grad_norm": 0.2749394476413727, "kl": 0.0645751953125, "learning_rate": 2.3525013083126835e-06, "loss": 0.1461, "reward": 0.2500000074505806, "reward_std": 0.18042195588350296, "rewards/accuracy_reward": 0.2500000074505806, "rewards/format_reward": 0.0, "step": 618 }, { "completion_length": 1976.604248046875, "epoch": 0.7548780487804878, "grad_norm": 0.11531209945678711, "kl": 0.063232421875, "learning_rate": 2.349872452886249e-06, "loss": 0.0669, "reward": 0.0833333358168602, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.0833333358168602, "rewards/format_reward": 0.0, "step": 619 }, { "completion_length": 2113.6458740234375, "epoch": 0.7560975609756098, "grad_norm": 0.14189231395721436, "kl": 0.059326171875, "learning_rate": 2.34723974728349e-06, "loss": 0.1106, "reward": 0.0416666679084301, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.0416666679084301, "rewards/format_reward": 0.0, "step": 620 }, { "completion_length": 2221.3125610351562, "epoch": 0.7573170731707317, "grad_norm": 0.23576445877552032, "kl": 0.06005859375, "learning_rate": 2.3446032034313518e-06, "loss": 0.0422, "reward": 0.0833333358168602, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.0833333358168602, "rewards/format_reward": 0.0, "step": 621 }, { "completion_length": 1874.4376220703125, "epoch": 0.7585365853658537, "grad_norm": 0.13401518762111664, "kl": 0.0611572265625, "learning_rate": 2.341962833274165e-06, "loss": 0.0028, "reward": 0.10416666977107525, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.10416666977107525, "rewards/format_reward": 0.0, "step": 622 }, { "completion_length": 1937.4583740234375, "epoch": 0.7597560975609756, "grad_norm": 0.2179955244064331, "kl": 0.062255859375, "learning_rate": 2.339318648773596e-06, "loss": 0.1655, "reward": 0.2708333432674408, "reward_std": 0.18042195588350296, "rewards/accuracy_reward": 0.2708333432674408, "rewards/format_reward": 0.0, "step": 623 }, { "completion_length": 2414.25, "epoch": 0.7609756097560976, "grad_norm": 0.08183278888463974, "kl": 0.06591796875, "learning_rate": 2.336670661908592e-06, "loss": 0.0207, "reward": 0.1458333358168602, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.1458333358168602, "rewards/format_reward": 0.0, "step": 624 }, { "completion_length": 2146.666748046875, "epoch": 0.7621951219512195, "grad_norm": 0.10090535879135132, "kl": 0.0673828125, "learning_rate": 2.3340188846753245e-06, "loss": 0.0672, "reward": 0.1458333358168602, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.1458333358168602, "rewards/format_reward": 0.0, "step": 625 }, { "completion_length": 2222.3333740234375, "epoch": 0.7634146341463415, "grad_norm": 0.07684232294559479, "kl": 0.0673828125, "learning_rate": 2.3313633290871373e-06, "loss": 0.0201, "reward": 0.02083333395421505, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.02083333395421505, "rewards/format_reward": 0.0, "step": 626 }, { "completion_length": 2309.229248046875, "epoch": 0.7646341463414634, "grad_norm": 0.0626184344291687, "kl": 0.0643310546875, "learning_rate": 2.328704007174491e-06, "loss": 0.0344, "reward": 0.2083333358168602, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.2083333358168602, "rewards/format_reward": 0.0, "step": 627 }, { "completion_length": 2250.2708740234375, "epoch": 0.7658536585365854, "grad_norm": 0.07159028202295303, "kl": 0.065185546875, "learning_rate": 2.3260409309849103e-06, "loss": 0.0395, "reward": 0.0833333358168602, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.0833333358168602, "rewards/format_reward": 0.0, "step": 628 }, { "completion_length": 1877.75, "epoch": 0.7670731707317073, "grad_norm": 0.0638006404042244, "kl": 0.06640625, "learning_rate": 2.323374112582925e-06, "loss": -0.001, "reward": 0.0416666679084301, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.0416666679084301, "rewards/format_reward": 0.0, "step": 629 }, { "completion_length": 2156.7708740234375, "epoch": 0.7682926829268293, "grad_norm": 0.06523877382278442, "kl": 0.0677490234375, "learning_rate": 2.3207035640500206e-06, "loss": 0.0027, "reward": 0.0625, "reward_std": 0.0, "rewards/accuracy_reward": 0.0625, "rewards/format_reward": 0.0, "step": 630 }, { "completion_length": 1746.0625, "epoch": 0.7695121951219512, "grad_norm": 0.14594532549381256, "kl": 0.07275390625, "learning_rate": 2.3180292974845807e-06, "loss": 0.048, "reward": 0.229166679084301, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.229166679084301, "rewards/format_reward": 0.0, "step": 631 }, { "completion_length": 2084.5625610351562, "epoch": 0.7707317073170732, "grad_norm": 0.11654707044363022, "kl": 0.06298828125, "learning_rate": 2.315351325001832e-06, "loss": 0.0471, "reward": 0.0416666679084301, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.0416666679084301, "rewards/format_reward": 0.0, "step": 632 }, { "completion_length": 2896.6875, "epoch": 0.7719512195121951, "grad_norm": 0.07657080143690109, "kl": 0.070556640625, "learning_rate": 2.3126696587337903e-06, "loss": 0.021, "reward": 0.02083333395421505, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.02083333395421505, "rewards/format_reward": 0.0, "step": 633 }, { "completion_length": 1695.3333740234375, "epoch": 0.7731707317073171, "grad_norm": 0.22362147271633148, "kl": 0.061767578125, "learning_rate": 2.3099843108292062e-06, "loss": 0.0728, "reward": 0.10416666977107525, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.10416666977107525, "rewards/format_reward": 0.0, "step": 634 }, { "completion_length": 1926.479248046875, "epoch": 0.774390243902439, "grad_norm": 0.15397554636001587, "kl": 0.0673828125, "learning_rate": 2.3072952934535087e-06, "loss": -0.0038, "reward": 0.1041666679084301, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.1041666679084301, "rewards/format_reward": 0.0, "step": 635 }, { "completion_length": 2181.916748046875, "epoch": 0.775609756097561, "grad_norm": 0.09441860020160675, "kl": 0.06005859375, "learning_rate": 2.3046026187887498e-06, "loss": 0.0353, "reward": 0.0416666679084301, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.0416666679084301, "rewards/format_reward": 0.0, "step": 636 }, { "completion_length": 2454.8334350585938, "epoch": 0.776829268292683, "grad_norm": 0.06336867064237595, "kl": 0.0673828125, "learning_rate": 2.301906299033552e-06, "loss": 0.0026, "reward": 0.125, "reward_std": 0.0, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "step": 637 }, { "completion_length": 1661.5833740234375, "epoch": 0.7780487804878049, "grad_norm": 0.8363388776779175, "kl": 0.0712890625, "learning_rate": 2.2992063464030482e-06, "loss": -0.0137, "reward": 0.18750000558793545, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.18750000558793545, "rewards/format_reward": 0.0, "step": 638 }, { "completion_length": 1790.104248046875, "epoch": 0.7792682926829269, "grad_norm": 0.12070167809724808, "kl": 0.0692138671875, "learning_rate": 2.2965027731288335e-06, "loss": 0.008, "reward": 0.08333333395421505, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.08333333395421505, "rewards/format_reward": 0.0, "step": 639 }, { "completion_length": 2811.166748046875, "epoch": 0.7804878048780488, "grad_norm": 0.07522193342447281, "kl": 0.056884765625, "learning_rate": 2.293795591458901e-06, "loss": 0.0321, "reward": 0.2291666716337204, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.2291666716337204, "rewards/format_reward": 0.0, "step": 640 }, { "completion_length": 2501.25, "epoch": 0.7817073170731708, "grad_norm": 0.057801488786935806, "kl": 0.068603515625, "learning_rate": 2.291084813657594e-06, "loss": 0.0024, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "step": 641 }, { "completion_length": 1160.2916870117188, "epoch": 0.7829268292682927, "grad_norm": 0.11244219541549683, "kl": 0.061767578125, "learning_rate": 2.288370452005547e-06, "loss": 0.0023, "reward": 0.125, "reward_std": 0.0, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "step": 642 }, { "completion_length": 1767.4583740234375, "epoch": 0.7841463414634147, "grad_norm": 0.16211725771427155, "kl": 0.0780029296875, "learning_rate": 2.2856525187996287e-06, "loss": 0.0043, "reward": 0.2916666716337204, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.2916666716337204, "rewards/format_reward": 0.0, "step": 643 }, { "completion_length": 2484.729248046875, "epoch": 0.7853658536585366, "grad_norm": 0.0806308388710022, "kl": 0.05810546875, "learning_rate": 2.2829310263528907e-06, "loss": 0.03, "reward": 0.1458333432674408, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.1458333432674408, "rewards/format_reward": 0.0, "step": 644 }, { "completion_length": 2105.479248046875, "epoch": 0.7865853658536586, "grad_norm": 0.05736877769231796, "kl": 0.0545654296875, "learning_rate": 2.2802059869945057e-06, "loss": 0.0023, "reward": 0.0625, "reward_std": 0.0, "rewards/accuracy_reward": 0.0625, "rewards/format_reward": 0.0, "step": 645 }, { "completion_length": 1822.3333740234375, "epoch": 0.7878048780487805, "grad_norm": 0.1186266541481018, "kl": 0.0546875, "learning_rate": 2.2774774130697184e-06, "loss": 0.0286, "reward": 0.1041666716337204, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.1041666716337204, "rewards/format_reward": 0.0, "step": 646 }, { "completion_length": 2049.354248046875, "epoch": 0.7890243902439025, "grad_norm": 0.1890617161989212, "kl": 0.0574951171875, "learning_rate": 2.2747453169397835e-06, "loss": 0.1063, "reward": 0.20833333395421505, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.20833333395421505, "rewards/format_reward": 0.0, "step": 647 }, { "completion_length": 2068.7708740234375, "epoch": 0.7902439024390244, "grad_norm": 0.13897459208965302, "kl": 0.05419921875, "learning_rate": 2.2720097109819135e-06, "loss": 0.0086, "reward": 0.0416666679084301, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.0416666679084301, "rewards/format_reward": 0.0, "step": 648 }, { "completion_length": 1861.541748046875, "epoch": 0.7914634146341464, "grad_norm": 0.043004218488931656, "kl": 0.056640625, "learning_rate": 2.269270607589222e-06, "loss": 0.0021, "reward": 0.125, "reward_std": 0.0, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "step": 649 }, { "completion_length": 1756.3958740234375, "epoch": 0.7926829268292683, "grad_norm": 0.2104645073413849, "kl": 0.077880859375, "learning_rate": 2.2665280191706656e-06, "loss": 0.0369, "reward": 0.25, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.0, "step": 650 }, { "completion_length": 1660.3541870117188, "epoch": 0.7939024390243903, "grad_norm": 0.2580127716064453, "kl": 0.0621337890625, "learning_rate": 2.2637819581509906e-06, "loss": 0.1094, "reward": 0.1250000037252903, "reward_std": 0.18042195588350296, "rewards/accuracy_reward": 0.1250000037252903, "rewards/format_reward": 0.0, "step": 651 }, { "completion_length": 1640.1041870117188, "epoch": 0.7951219512195122, "grad_norm": 0.2913719713687897, "kl": 0.0640869140625, "learning_rate": 2.2610324369706735e-06, "loss": 0.0737, "reward": 0.2291666716337204, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.2291666716337204, "rewards/format_reward": 0.0, "step": 652 }, { "completion_length": 2081.6875, "epoch": 0.7963414634146342, "grad_norm": 0.0882386863231659, "kl": 0.06103515625, "learning_rate": 2.258279468085868e-06, "loss": 0.0384, "reward": 0.0416666679084301, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.0416666679084301, "rewards/format_reward": 0.0, "step": 653 }, { "completion_length": 1463.1041870117188, "epoch": 0.7975609756097561, "grad_norm": 0.18673640489578247, "kl": 0.06982421875, "learning_rate": 2.2555230639683464e-06, "loss": 0.1319, "reward": 0.2708333432674408, "reward_std": 0.18042195588350296, "rewards/accuracy_reward": 0.2708333432674408, "rewards/format_reward": 0.0, "step": 654 }, { "completion_length": 1600.8125, "epoch": 0.7987804878048781, "grad_norm": 0.1471596360206604, "kl": 0.067138671875, "learning_rate": 2.252763237105444e-06, "loss": 0.042, "reward": 0.0833333358168602, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.0833333358168602, "rewards/format_reward": 0.0, "step": 655 }, { "completion_length": 1464.5000610351562, "epoch": 0.8, "grad_norm": 0.2611793279647827, "kl": 0.0703125, "learning_rate": 2.25e-06, "loss": 0.0731, "reward": 0.2500000149011612, "reward_std": 0.18042195588350296, "rewards/accuracy_reward": 0.2500000149011612, "rewards/format_reward": 0.0, "step": 656 }, { "completion_length": 1473.8541870117188, "epoch": 0.801219512195122, "grad_norm": 0.19243347644805908, "kl": 0.0574951171875, "learning_rate": 2.247233365170306e-06, "loss": 0.0211, "reward": 0.1666666716337204, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.1666666716337204, "rewards/format_reward": 0.0, "step": 657 }, { "completion_length": 1170.3333740234375, "epoch": 0.802439024390244, "grad_norm": 0.13832615315914154, "kl": 0.05908203125, "learning_rate": 2.2444633451500453e-06, "loss": 0.0616, "reward": 0.2916666716337204, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.2916666716337204, "rewards/format_reward": 0.0, "step": 658 }, { "completion_length": 1458.5625610351562, "epoch": 0.8036585365853659, "grad_norm": 0.13568414747714996, "kl": 0.0616455078125, "learning_rate": 2.2416899524882353e-06, "loss": -0.0263, "reward": 0.2291666716337204, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.2291666716337204, "rewards/format_reward": 0.0, "step": 659 }, { "completion_length": 1494.8125610351562, "epoch": 0.8048780487804879, "grad_norm": 0.3329702317714691, "kl": 0.0589599609375, "learning_rate": 2.2389131997491756e-06, "loss": 0.0679, "reward": 0.16666667722165585, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.16666667722165585, "rewards/format_reward": 0.0, "step": 660 }, { "completion_length": 1057.1250305175781, "epoch": 0.8060975609756098, "grad_norm": 0.1387249231338501, "kl": 0.0638427734375, "learning_rate": 2.236133099512385e-06, "loss": 0.006, "reward": 0.1666666716337204, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.1666666716337204, "rewards/format_reward": 0.0, "step": 661 }, { "completion_length": 1057.3541870117188, "epoch": 0.8073170731707318, "grad_norm": 0.2977414131164551, "kl": 0.0628662109375, "learning_rate": 2.2333496643725505e-06, "loss": 0.1484, "reward": 0.3333333432674408, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.3333333432674408, "rewards/format_reward": 0.0, "step": 662 }, { "completion_length": 1342.875, "epoch": 0.8085365853658537, "grad_norm": 0.2909049391746521, "kl": 0.075439453125, "learning_rate": 2.230562906939464e-06, "loss": 0.1124, "reward": 0.2291666716337204, "reward_std": 0.18042194843292236, "rewards/accuracy_reward": 0.2291666716337204, "rewards/format_reward": 0.0, "step": 663 }, { "completion_length": 1252.1458740234375, "epoch": 0.8097560975609757, "grad_norm": 0.2964651882648468, "kl": 0.0614013671875, "learning_rate": 2.2277728398379705e-06, "loss": 0.0588, "reward": 0.1041666679084301, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.1041666679084301, "rewards/format_reward": 0.0, "step": 664 }, { "completion_length": 1048.2083740234375, "epoch": 0.8109756097560976, "grad_norm": 0.22226032614707947, "kl": 0.068603515625, "learning_rate": 2.2249794757079083e-06, "loss": 0.0523, "reward": 0.2291666716337204, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.2291666716337204, "rewards/format_reward": 0.0, "step": 665 }, { "completion_length": 1329.5208740234375, "epoch": 0.8121951219512196, "grad_norm": 0.26652440428733826, "kl": 0.066162109375, "learning_rate": 2.2221828272040517e-06, "loss": 0.069, "reward": 0.08333333395421505, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.08333333395421505, "rewards/format_reward": 0.0, "step": 666 }, { "completion_length": 1261.6041870117188, "epoch": 0.8134146341463414, "grad_norm": 0.30014967918395996, "kl": 0.0645751953125, "learning_rate": 2.2193829069960556e-06, "loss": 0.0492, "reward": 0.2083333358168602, "reward_std": 0.18042195588350296, "rewards/accuracy_reward": 0.2083333358168602, "rewards/format_reward": 0.0, "step": 667 }, { "completion_length": 1635.7916870117188, "epoch": 0.8146341463414634, "grad_norm": 0.1851644217967987, "kl": 0.0712890625, "learning_rate": 2.2165797277683943e-06, "loss": 0.0642, "reward": 0.3125000149011612, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.3125000149011612, "rewards/format_reward": 0.0, "step": 668 }, { "completion_length": 1444.0625610351562, "epoch": 0.8158536585365853, "grad_norm": 0.13802890479564667, "kl": 0.0550537109375, "learning_rate": 2.213773302220309e-06, "loss": 0.0621, "reward": 0.1875000111758709, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.1875000111758709, "rewards/format_reward": 0.0, "step": 669 }, { "completion_length": 1763.0001220703125, "epoch": 0.8170731707317073, "grad_norm": 0.10036277770996094, "kl": 0.0565185546875, "learning_rate": 2.2109636430657463e-06, "loss": 0.0192, "reward": 0.1458333358168602, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.1458333358168602, "rewards/format_reward": 0.0, "step": 670 }, { "completion_length": 1114.2917175292969, "epoch": 0.8182926829268292, "grad_norm": 0.21276941895484924, "kl": 0.0537109375, "learning_rate": 2.2081507630333016e-06, "loss": 0.0907, "reward": 0.10416666977107525, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.10416666977107525, "rewards/format_reward": 0.0, "step": 671 }, { "completion_length": 1650.6041870117188, "epoch": 0.8195121951219512, "grad_norm": 0.13423870503902435, "kl": 0.058349609375, "learning_rate": 2.2053346748661633e-06, "loss": 0.0191, "reward": 0.2083333432674408, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.2083333432674408, "rewards/format_reward": 0.0, "step": 672 }, { "completion_length": 1857.4791870117188, "epoch": 0.8207317073170731, "grad_norm": 0.24272115528583527, "kl": 0.065185546875, "learning_rate": 2.202515391322052e-06, "loss": 0.0773, "reward": 0.2500000149011612, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.2500000149011612, "rewards/format_reward": 0.0, "step": 673 }, { "completion_length": 2177.229248046875, "epoch": 0.8219512195121951, "grad_norm": 0.23075804114341736, "kl": 0.0579833984375, "learning_rate": 2.1996929251731665e-06, "loss": 0.1943, "reward": 0.229166679084301, "reward_std": 0.18042195588350296, "rewards/accuracy_reward": 0.229166679084301, "rewards/format_reward": 0.0, "step": 674 }, { "completion_length": 2140.6459350585938, "epoch": 0.823170731707317, "grad_norm": 0.138530895113945, "kl": 0.0654296875, "learning_rate": 2.196867289206121e-06, "loss": 0.0861, "reward": 0.2083333395421505, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.2083333395421505, "rewards/format_reward": 0.0, "step": 675 }, { "completion_length": 1775.4166870117188, "epoch": 0.824390243902439, "grad_norm": 0.12044782191514969, "kl": 0.056884765625, "learning_rate": 2.194038496221892e-06, "loss": 0.0336, "reward": 0.10416666977107525, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.10416666977107525, "rewards/format_reward": 0.0, "step": 676 }, { "completion_length": 1687.1458740234375, "epoch": 0.8256097560975609, "grad_norm": 0.1958717405796051, "kl": 0.0533447265625, "learning_rate": 2.1912065590357576e-06, "loss": 0.115, "reward": 0.2708333432674408, "reward_std": 0.18042195588350296, "rewards/accuracy_reward": 0.2708333432674408, "rewards/format_reward": 0.0, "step": 677 }, { "completion_length": 1997.7500610351562, "epoch": 0.8268292682926829, "grad_norm": 0.058854639530181885, "kl": 0.057373046875, "learning_rate": 2.188371490477239e-06, "loss": 0.0025, "reward": 0.1875, "reward_std": 0.0, "rewards/accuracy_reward": 0.1875, "rewards/format_reward": 0.0, "step": 678 }, { "completion_length": 2009.1875610351562, "epoch": 0.8280487804878048, "grad_norm": 0.11259932070970535, "kl": 0.0574951171875, "learning_rate": 2.185533303390046e-06, "loss": 0.0765, "reward": 0.1041666716337204, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.1041666716337204, "rewards/format_reward": 0.0, "step": 679 }, { "completion_length": 2127.6041870117188, "epoch": 0.8292682926829268, "grad_norm": 0.14925095438957214, "kl": 0.0732421875, "learning_rate": 2.182692010632013e-06, "loss": 0.1267, "reward": 0.2291666716337204, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.2291666716337204, "rewards/format_reward": 0.0, "step": 680 }, { "completion_length": 1773.1250610351562, "epoch": 0.8304878048780487, "grad_norm": 0.16422392427921295, "kl": 0.0615234375, "learning_rate": 2.1798476250750473e-06, "loss": 0.0342, "reward": 0.1458333432674408, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.1458333432674408, "rewards/format_reward": 0.0, "step": 681 }, { "completion_length": 1906.0209350585938, "epoch": 0.8317073170731707, "grad_norm": 0.08216903358697891, "kl": 0.07958984375, "learning_rate": 2.177000159605065e-06, "loss": 0.0452, "reward": 0.1666666679084301, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.1666666679084301, "rewards/format_reward": 0.0, "step": 682 }, { "completion_length": 1518.104248046875, "epoch": 0.8329268292682926, "grad_norm": 0.28313207626342773, "kl": 0.0626220703125, "learning_rate": 2.174149627121937e-06, "loss": 0.08, "reward": 0.2708333432674408, "reward_std": 0.18042195588350296, "rewards/accuracy_reward": 0.2708333432674408, "rewards/format_reward": 0.0, "step": 683 }, { "completion_length": 1954.5625, "epoch": 0.8341463414634146, "grad_norm": 0.1914631426334381, "kl": 0.0648193359375, "learning_rate": 2.1712960405394265e-06, "loss": 0.1216, "reward": 0.291666679084301, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.291666679084301, "rewards/format_reward": 0.0, "step": 684 }, { "completion_length": 1862.2291870117188, "epoch": 0.8353658536585366, "grad_norm": 0.22317364811897278, "kl": 0.07275390625, "learning_rate": 2.168439412785135e-06, "loss": 0.1397, "reward": 0.1666666679084301, "reward_std": 0.18042195588350296, "rewards/accuracy_reward": 0.1666666679084301, "rewards/format_reward": 0.0, "step": 685 }, { "completion_length": 1383.2500610351562, "epoch": 0.8365853658536585, "grad_norm": 0.2157326489686966, "kl": 0.07177734375, "learning_rate": 2.1655797568004397e-06, "loss": 0.1789, "reward": 0.1666666716337204, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.1666666716337204, "rewards/format_reward": 0.0, "step": 686 }, { "completion_length": 1613.6666870117188, "epoch": 0.8378048780487805, "grad_norm": 0.11487796902656555, "kl": 0.0606689453125, "learning_rate": 2.1627170855404376e-06, "loss": 0.0164, "reward": 0.1875, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.1875, "rewards/format_reward": 0.0, "step": 687 }, { "completion_length": 2011.9791870117188, "epoch": 0.8390243902439024, "grad_norm": 0.0972389280796051, "kl": 0.0606689453125, "learning_rate": 2.1598514119738853e-06, "loss": 0.0227, "reward": 0.2708333432674408, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.2708333432674408, "rewards/format_reward": 0.0, "step": 688 }, { "completion_length": 1945.5416870117188, "epoch": 0.8402439024390244, "grad_norm": 0.10604876279830933, "kl": 0.070556640625, "learning_rate": 2.1569827490831408e-06, "loss": 0.1001, "reward": 0.1666666716337204, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.1666666716337204, "rewards/format_reward": 0.0, "step": 689 }, { "completion_length": 2262.7083740234375, "epoch": 0.8414634146341463, "grad_norm": 0.15031130611896515, "kl": 0.072265625, "learning_rate": 2.154111109864105e-06, "loss": 0.1073, "reward": 0.10416666977107525, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.10416666977107525, "rewards/format_reward": 0.0, "step": 690 }, { "completion_length": 1429.479248046875, "epoch": 0.8426829268292683, "grad_norm": 0.20031052827835083, "kl": 0.06494140625, "learning_rate": 2.1512365073261617e-06, "loss": 0.0529, "reward": 0.18750000558793545, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.18750000558793545, "rewards/format_reward": 0.0, "step": 691 }, { "completion_length": 1763.0416870117188, "epoch": 0.8439024390243902, "grad_norm": 0.14333292841911316, "kl": 0.074951171875, "learning_rate": 2.1483589544921202e-06, "loss": 0.1244, "reward": 0.1875, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.1875, "rewards/format_reward": 0.0, "step": 692 }, { "completion_length": 1624.4375, "epoch": 0.8451219512195122, "grad_norm": 0.20566749572753906, "kl": 0.0732421875, "learning_rate": 2.145478464398156e-06, "loss": 0.0763, "reward": 0.2083333432674408, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.2083333432674408, "rewards/format_reward": 0.0, "step": 693 }, { "completion_length": 1682.3958740234375, "epoch": 0.8463414634146341, "grad_norm": 0.21135647594928741, "kl": 0.067138671875, "learning_rate": 2.1425950500937493e-06, "loss": 0.0211, "reward": 0.4166666716337204, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.4166666716337204, "rewards/format_reward": 0.0, "step": 694 }, { "completion_length": 1789.604248046875, "epoch": 0.8475609756097561, "grad_norm": 0.20999476313591003, "kl": 0.06982421875, "learning_rate": 2.13970872464163e-06, "loss": 0.1408, "reward": 0.2083333432674408, "reward_std": 0.21650633960962296, "rewards/accuracy_reward": 0.2083333432674408, "rewards/format_reward": 0.0, "step": 695 }, { "completion_length": 2037.1459350585938, "epoch": 0.848780487804878, "grad_norm": 0.15770134329795837, "kl": 0.0625, "learning_rate": 2.1368195011177142e-06, "loss": 0.086, "reward": 0.1250000037252903, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.1250000037252903, "rewards/format_reward": 0.0, "step": 696 }, { "completion_length": 1559.0625, "epoch": 0.85, "grad_norm": 0.22309143841266632, "kl": 0.070068359375, "learning_rate": 2.1339273926110494e-06, "loss": 0.1219, "reward": 0.229166679084301, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.229166679084301, "rewards/format_reward": 0.0, "step": 697 }, { "completion_length": 1766.5, "epoch": 0.8512195121951219, "grad_norm": 0.24526242911815643, "kl": 0.06298828125, "learning_rate": 2.1310324122237512e-06, "loss": -0.0048, "reward": 0.1875, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.1875, "rewards/format_reward": 0.0, "step": 698 }, { "completion_length": 1578.166748046875, "epoch": 0.8524390243902439, "grad_norm": 0.1572403460741043, "kl": 0.069580078125, "learning_rate": 2.128134573070947e-06, "loss": 0.023, "reward": 0.125, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "step": 699 }, { "completion_length": 1516.0416870117188, "epoch": 0.8536585365853658, "grad_norm": 0.08679470419883728, "kl": 0.065185546875, "learning_rate": 2.125233888280715e-06, "loss": 0.0026, "reward": 0.08333333395421505, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.08333333395421505, "rewards/format_reward": 0.0, "step": 700 }, { "completion_length": 1633.9375, "epoch": 0.8548780487804878, "grad_norm": 0.21567928791046143, "kl": 0.078369140625, "learning_rate": 2.1223303709940226e-06, "loss": 0.1611, "reward": 0.1875, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.1875, "rewards/format_reward": 0.0, "step": 701 }, { "completion_length": 2085.25, "epoch": 0.8560975609756097, "grad_norm": 0.15565013885498047, "kl": 0.07080078125, "learning_rate": 2.1194240343646732e-06, "loss": 0.0644, "reward": 0.16666667722165585, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.16666667722165585, "rewards/format_reward": 0.0, "step": 702 }, { "completion_length": 1335.5208740234375, "epoch": 0.8573170731707317, "grad_norm": 0.14207996428012848, "kl": 0.060302734375, "learning_rate": 2.11651489155924e-06, "loss": 0.008, "reward": 0.1458333358168602, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.1458333358168602, "rewards/format_reward": 0.0, "step": 703 }, { "completion_length": 1189.4583740234375, "epoch": 0.8585365853658536, "grad_norm": 0.2305360734462738, "kl": 0.06787109375, "learning_rate": 2.11360295575701e-06, "loss": 0.0487, "reward": 0.1875000074505806, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.1875000074505806, "rewards/format_reward": 0.0, "step": 704 }, { "completion_length": 1884.6875610351562, "epoch": 0.8597560975609756, "grad_norm": 0.1316843330860138, "kl": 0.072021484375, "learning_rate": 2.110688240149923e-06, "loss": -0.01, "reward": 0.2291666716337204, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.2291666716337204, "rewards/format_reward": 0.0, "step": 705 }, { "completion_length": 955.2708435058594, "epoch": 0.8609756097560975, "grad_norm": 0.13576047122478485, "kl": 0.069091796875, "learning_rate": 2.1077707579425114e-06, "loss": 0.0046, "reward": 0.0833333358168602, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.0833333358168602, "rewards/format_reward": 0.0, "step": 706 }, { "completion_length": 1343.4166870117188, "epoch": 0.8621951219512195, "grad_norm": 0.13969150185585022, "kl": 0.072265625, "learning_rate": 2.1048505223518433e-06, "loss": 0.0418, "reward": 0.1041666679084301, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.1041666679084301, "rewards/format_reward": 0.0, "step": 707 }, { "completion_length": 812.9583740234375, "epoch": 0.8634146341463415, "grad_norm": 0.11462213099002838, "kl": 0.0675048828125, "learning_rate": 2.1019275466074585e-06, "loss": 0.0033, "reward": 0.2916666716337204, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.2916666716337204, "rewards/format_reward": 0.0, "step": 708 }, { "completion_length": 984.1666870117188, "epoch": 0.8646341463414634, "grad_norm": 0.23446671664714813, "kl": 0.072509765625, "learning_rate": 2.0990018439513105e-06, "loss": 0.0054, "reward": 0.291666679084301, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.291666679084301, "rewards/format_reward": 0.0, "step": 709 }, { "completion_length": 1313.7916870117188, "epoch": 0.8658536585365854, "grad_norm": 0.1921614557504654, "kl": 0.087646484375, "learning_rate": 2.0960734276377082e-06, "loss": 0.0935, "reward": 0.12500000558793545, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.12500000558793545, "rewards/format_reward": 0.0, "step": 710 }, { "completion_length": 1218.3541870117188, "epoch": 0.8670731707317073, "grad_norm": 0.15616123378276825, "kl": 0.0703125, "learning_rate": 2.093142310933252e-06, "loss": 0.0715, "reward": 0.18750000558793545, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.18750000558793545, "rewards/format_reward": 0.0, "step": 711 }, { "completion_length": 1538.3750610351562, "epoch": 0.8682926829268293, "grad_norm": 0.14647164940834045, "kl": 0.068115234375, "learning_rate": 2.0902085071167774e-06, "loss": 0.0448, "reward": 0.1666666716337204, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.1666666716337204, "rewards/format_reward": 0.0, "step": 712 }, { "completion_length": 1394.9375, "epoch": 0.8695121951219512, "grad_norm": 0.08493389189243317, "kl": 0.0631103515625, "learning_rate": 2.0872720294792936e-06, "loss": 0.0613, "reward": 0.1458333358168602, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.1458333358168602, "rewards/format_reward": 0.0, "step": 713 }, { "completion_length": 1128.2292175292969, "epoch": 0.8707317073170732, "grad_norm": 0.234442338347435, "kl": 0.0667724609375, "learning_rate": 2.0843328913239216e-06, "loss": 0.1049, "reward": 0.2500000149011612, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.2500000149011612, "rewards/format_reward": 0.0, "step": 714 }, { "completion_length": 1354.1875610351562, "epoch": 0.8719512195121951, "grad_norm": 0.1722685992717743, "kl": 0.05224609375, "learning_rate": 2.081391105965836e-06, "loss": -0.0021, "reward": 0.08333333395421505, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.08333333395421505, "rewards/format_reward": 0.0, "step": 715 }, { "completion_length": 1290.8333740234375, "epoch": 0.8731707317073171, "grad_norm": 0.22299982607364655, "kl": 0.056884765625, "learning_rate": 2.0784466867322037e-06, "loss": 0.1172, "reward": 0.1666666679084301, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.1666666679084301, "rewards/format_reward": 0.0, "step": 716 }, { "completion_length": 1671.9166870117188, "epoch": 0.874390243902439, "grad_norm": 0.11774788796901703, "kl": 0.0650634765625, "learning_rate": 2.075499646962125e-06, "loss": 0.0908, "reward": 0.0416666679084301, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.0416666679084301, "rewards/format_reward": 0.0, "step": 717 }, { "completion_length": 1435.7708740234375, "epoch": 0.875609756097561, "grad_norm": 0.22623351216316223, "kl": 0.0654296875, "learning_rate": 2.0725500000065715e-06, "loss": 0.0631, "reward": 0.2500000149011612, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.2500000149011612, "rewards/format_reward": 0.0, "step": 718 }, { "completion_length": 1647.5416870117188, "epoch": 0.8768292682926829, "grad_norm": 0.1137980967760086, "kl": 0.07373046875, "learning_rate": 2.0695977592283246e-06, "loss": 0.0539, "reward": 0.2291666716337204, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.2291666716337204, "rewards/format_reward": 0.0, "step": 719 }, { "completion_length": 1688.7708740234375, "epoch": 0.8780487804878049, "grad_norm": 0.1766272634267807, "kl": 0.0634765625, "learning_rate": 2.0666429380019185e-06, "loss": -0.0131, "reward": 0.1666666716337204, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.1666666716337204, "rewards/format_reward": 0.0, "step": 720 }, { "completion_length": 1692.7500610351562, "epoch": 0.8792682926829268, "grad_norm": 0.10823126137256622, "kl": 0.06005859375, "learning_rate": 2.0636855497135772e-06, "loss": 0.0033, "reward": 0.0625, "reward_std": 0.0, "rewards/accuracy_reward": 0.0625, "rewards/format_reward": 0.0, "step": 721 }, { "completion_length": 1786.229248046875, "epoch": 0.8804878048780488, "grad_norm": 0.29120421409606934, "kl": 0.073486328125, "learning_rate": 2.060725607761153e-06, "loss": 0.213, "reward": 0.1666666716337204, "reward_std": 0.25259073823690414, "rewards/accuracy_reward": 0.1666666716337204, "rewards/format_reward": 0.0, "step": 722 }, { "completion_length": 1498.1250610351562, "epoch": 0.8817073170731707, "grad_norm": 0.19975915551185608, "kl": 0.0521240234375, "learning_rate": 2.0577631255540692e-06, "loss": 0.0748, "reward": 0.2083333432674408, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.2083333432674408, "rewards/format_reward": 0.0, "step": 723 }, { "completion_length": 1925.9166870117188, "epoch": 0.8829268292682927, "grad_norm": 0.08820930868387222, "kl": 0.0682373046875, "learning_rate": 2.0547981165132547e-06, "loss": 0.0359, "reward": 0.02083333395421505, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.02083333395421505, "rewards/format_reward": 0.0, "step": 724 }, { "completion_length": 1776.5208740234375, "epoch": 0.8841463414634146, "grad_norm": 0.3257627487182617, "kl": 0.072998046875, "learning_rate": 2.051830594071088e-06, "loss": 0.0975, "reward": 0.2916666716337204, "reward_std": 0.21650634706020355, "rewards/accuracy_reward": 0.2916666716337204, "rewards/format_reward": 0.0, "step": 725 }, { "completion_length": 1462.916748046875, "epoch": 0.8853658536585366, "grad_norm": 0.23209330439567566, "kl": 0.0714111328125, "learning_rate": 2.048860571671332e-06, "loss": -0.0094, "reward": 0.14583333395421505, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.14583333395421505, "rewards/format_reward": 0.0, "step": 726 }, { "completion_length": 1617.0000610351562, "epoch": 0.8865853658536585, "grad_norm": 0.13382349908351898, "kl": 0.0625, "learning_rate": 2.045888062769077e-06, "loss": 0.022, "reward": 0.0416666679084301, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.0416666679084301, "rewards/format_reward": 0.0, "step": 727 }, { "completion_length": 1737.5625610351562, "epoch": 0.8878048780487805, "grad_norm": 0.16139723360538483, "kl": 0.077880859375, "learning_rate": 2.0429130808306767e-06, "loss": 0.0674, "reward": 0.3125, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.3125, "rewards/format_reward": 0.0, "step": 728 }, { "completion_length": 1834.7083740234375, "epoch": 0.8890243902439025, "grad_norm": 0.15899519622325897, "kl": 0.068603515625, "learning_rate": 2.03993563933369e-06, "loss": 0.0978, "reward": 0.14583333395421505, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.14583333395421505, "rewards/format_reward": 0.0, "step": 729 }, { "completion_length": 1710.7916870117188, "epoch": 0.8902439024390244, "grad_norm": 0.34318220615386963, "kl": 0.072265625, "learning_rate": 2.036955751766815e-06, "loss": 0.0201, "reward": 0.1458333358168602, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.1458333358168602, "rewards/format_reward": 0.0, "step": 730 }, { "completion_length": 2019.1459350585938, "epoch": 0.8914634146341464, "grad_norm": 0.19882003962993622, "kl": 0.066650390625, "learning_rate": 2.033973431629835e-06, "loss": 0.1096, "reward": 0.06250000186264515, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.06250000186264515, "rewards/format_reward": 0.0, "step": 731 }, { "completion_length": 1715.3125610351562, "epoch": 0.8926829268292683, "grad_norm": 0.1826465129852295, "kl": 0.0693359375, "learning_rate": 2.030988692433552e-06, "loss": 0.0888, "reward": 0.1458333358168602, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.1458333358168602, "rewards/format_reward": 0.0, "step": 732 }, { "completion_length": 2469.5001220703125, "epoch": 0.8939024390243903, "grad_norm": 0.08135739713907242, "kl": 0.08056640625, "learning_rate": 2.0280015476997256e-06, "loss": 0.0035, "reward": 0.0625, "reward_std": 0.0, "rewards/accuracy_reward": 0.0625, "rewards/format_reward": 0.0, "step": 733 }, { "completion_length": 2033.0625610351562, "epoch": 0.8951219512195122, "grad_norm": 0.17229749262332916, "kl": 0.074951171875, "learning_rate": 2.0250120109610155e-06, "loss": 0.0347, "reward": 0.06250000186264515, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.06250000186264515, "rewards/format_reward": 0.0, "step": 734 }, { "completion_length": 1606.0208740234375, "epoch": 0.8963414634146342, "grad_norm": 0.058598361909389496, "kl": 0.05908203125, "learning_rate": 2.0220200957609172e-06, "loss": 0.0028, "reward": 0.1875, "reward_std": 0.0, "rewards/accuracy_reward": 0.1875, "rewards/format_reward": 0.0, "step": 735 }, { "completion_length": 1798.1666870117188, "epoch": 0.8975609756097561, "grad_norm": 0.2880723476409912, "kl": 0.06884765625, "learning_rate": 2.019025815653701e-06, "loss": 0.2289, "reward": 0.2916666865348816, "reward_std": 0.32475951313972473, "rewards/accuracy_reward": 0.2916666865348816, "rewards/format_reward": 0.0, "step": 736 }, { "completion_length": 1652.2291870117188, "epoch": 0.8987804878048781, "grad_norm": 0.15141387283802032, "kl": 0.067138671875, "learning_rate": 2.016029184204351e-06, "loss": 0.0597, "reward": 0.2083333358168602, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.2083333358168602, "rewards/format_reward": 0.0, "step": 737 }, { "completion_length": 1509.6875, "epoch": 0.9, "grad_norm": 0.16033804416656494, "kl": 0.061767578125, "learning_rate": 2.0130302149885033e-06, "loss": -0.012, "reward": 0.18750000558793545, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.18750000558793545, "rewards/format_reward": 0.0, "step": 738 }, { "completion_length": 2484.3333740234375, "epoch": 0.901219512195122, "grad_norm": 0.07207747548818588, "kl": 0.0699462890625, "learning_rate": 2.0100289215923856e-06, "loss": 0.035, "reward": 0.08333333395421505, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.08333333395421505, "rewards/format_reward": 0.0, "step": 739 }, { "completion_length": 1837.4791870117188, "epoch": 0.9024390243902439, "grad_norm": 0.27909114956855774, "kl": 0.0643310546875, "learning_rate": 2.007025317612754e-06, "loss": 0.0034, "reward": 0.1875, "reward_std": 0.0, "rewards/accuracy_reward": 0.1875, "rewards/format_reward": 0.0, "step": 740 }, { "completion_length": 2012.5625610351562, "epoch": 0.9036585365853659, "grad_norm": 0.1434146612882614, "kl": 0.0626220703125, "learning_rate": 2.0040194166568337e-06, "loss": -0.0251, "reward": 0.1041666679084301, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.1041666679084301, "rewards/format_reward": 0.0, "step": 741 }, { "completion_length": 1704.6666870117188, "epoch": 0.9048780487804878, "grad_norm": 0.10819385945796967, "kl": 0.05615234375, "learning_rate": 2.001011232342253e-06, "loss": 0.0467, "reward": 0.1458333358168602, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.1458333358168602, "rewards/format_reward": 0.0, "step": 742 }, { "completion_length": 1873.8126220703125, "epoch": 0.9060975609756098, "grad_norm": 0.14213120937347412, "kl": 0.0577392578125, "learning_rate": 1.9980007782969882e-06, "loss": -0.0043, "reward": 0.1250000037252903, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.1250000037252903, "rewards/format_reward": 0.0, "step": 743 }, { "completion_length": 1181.2500610351562, "epoch": 0.9073170731707317, "grad_norm": 0.16152995824813843, "kl": 0.04931640625, "learning_rate": 1.994988068159294e-06, "loss": 0.0798, "reward": 0.1666666679084301, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.1666666679084301, "rewards/format_reward": 0.0, "step": 744 }, { "completion_length": 1583.2708740234375, "epoch": 0.9085365853658537, "grad_norm": 0.256203830242157, "kl": 0.0625, "learning_rate": 1.9919731155776504e-06, "loss": 0.1972, "reward": 0.2083333432674408, "reward_std": 0.18042195588350296, "rewards/accuracy_reward": 0.2083333432674408, "rewards/format_reward": 0.0, "step": 745 }, { "completion_length": 1711.7916870117188, "epoch": 0.9097560975609756, "grad_norm": 1.257775068283081, "kl": 0.069580078125, "learning_rate": 1.9889559342106926e-06, "loss": 0.0613, "reward": 0.16666667722165585, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.16666667722165585, "rewards/format_reward": 0.0, "step": 746 }, { "completion_length": 1768.5208740234375, "epoch": 0.9109756097560976, "grad_norm": 0.19232381880283356, "kl": 0.068359375, "learning_rate": 1.985936537727155e-06, "loss": 0.1317, "reward": 0.1666666716337204, "reward_std": 0.18042195588350296, "rewards/accuracy_reward": 0.1666666716337204, "rewards/format_reward": 0.0, "step": 747 }, { "completion_length": 1997.604248046875, "epoch": 0.9121951219512195, "grad_norm": 0.10197277367115021, "kl": 0.068603515625, "learning_rate": 1.9829149398058068e-06, "loss": 0.068, "reward": 0.3333333432674408, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.3333333432674408, "rewards/format_reward": 0.0, "step": 748 }, { "completion_length": 2067.125, "epoch": 0.9134146341463415, "grad_norm": 0.12657509744167328, "kl": 0.084228515625, "learning_rate": 1.9798911541353882e-06, "loss": 0.1183, "reward": 0.0416666679084301, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.0416666679084301, "rewards/format_reward": 0.0, "step": 749 }, { "completion_length": 2701.7501220703125, "epoch": 0.9146341463414634, "grad_norm": 0.07679648697376251, "kl": 0.072998046875, "learning_rate": 1.976865194414555e-06, "loss": 0.0324, "reward": 0.02083333395421505, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.02083333395421505, "rewards/format_reward": 0.0, "step": 750 }, { "completion_length": 2328.1251220703125, "epoch": 0.9158536585365854, "grad_norm": 0.0738058015704155, "kl": 0.08544921875, "learning_rate": 1.9738370743518076e-06, "loss": 0.0033, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "step": 751 }, { "completion_length": 2699.0, "epoch": 0.9170731707317074, "grad_norm": 0.1525442749261856, "kl": 0.157470703125, "learning_rate": 1.9708068076654364e-06, "loss": -0.0015, "reward": 0.08333333395421505, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.08333333395421505, "rewards/format_reward": 0.0, "step": 752 }, { "completion_length": 2260.916748046875, "epoch": 0.9182926829268293, "grad_norm": 0.08167207986116409, "kl": 0.093994140625, "learning_rate": 1.9677744080834547e-06, "loss": 0.0035, "reward": 0.0625, "reward_std": 0.0, "rewards/accuracy_reward": 0.0625, "rewards/format_reward": 0.0, "step": 753 }, { "completion_length": 3038.6251220703125, "epoch": 0.9195121951219513, "grad_norm": 0.08545702695846558, "kl": 0.0908203125, "learning_rate": 1.9647398893435394e-06, "loss": 0.0035, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "step": 754 }, { "completion_length": 2888.8126220703125, "epoch": 0.9207317073170732, "grad_norm": 0.08751173317432404, "kl": 0.1015625, "learning_rate": 1.9617032651929686e-06, "loss": 0.0037, "reward": 0.1875, "reward_std": 0.0, "rewards/accuracy_reward": 0.1875, "rewards/format_reward": 0.0, "step": 755 }, { "completion_length": 2549.979248046875, "epoch": 0.9219512195121952, "grad_norm": 1.3179752826690674, "kl": 0.119873046875, "learning_rate": 1.9586645493885565e-06, "loss": 0.0825, "reward": 0.1666666716337204, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.1666666716337204, "rewards/format_reward": 0.0, "step": 756 }, { "completion_length": 2620.14599609375, "epoch": 0.9231707317073171, "grad_norm": 0.09866995364427567, "kl": 0.104248046875, "learning_rate": 1.9556237556965955e-06, "loss": 0.0384, "reward": 0.1666666716337204, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.1666666716337204, "rewards/format_reward": 0.0, "step": 757 }, { "completion_length": 2718.8125, "epoch": 0.9243902439024391, "grad_norm": 0.12418857961893082, "kl": 0.088134765625, "learning_rate": 1.9525808978927886e-06, "loss": 0.0365, "reward": 0.1458333358168602, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.1458333358168602, "rewards/format_reward": 0.0, "step": 758 }, { "completion_length": 2632.354248046875, "epoch": 0.925609756097561, "grad_norm": 0.09063011407852173, "kl": 0.084716796875, "learning_rate": 1.9495359897621926e-06, "loss": 0.0231, "reward": 0.10416666977107525, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.10416666977107525, "rewards/format_reward": 0.0, "step": 759 }, { "completion_length": 2732.791748046875, "epoch": 0.926829268292683, "grad_norm": 0.06431277096271515, "kl": 0.078369140625, "learning_rate": 1.946489045099152e-06, "loss": 0.0337, "reward": 0.1458333358168602, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.1458333358168602, "rewards/format_reward": 0.0, "step": 760 }, { "completion_length": 2694.5001220703125, "epoch": 0.9280487804878049, "grad_norm": 0.13366465270519257, "kl": 0.083984375, "learning_rate": 1.9434400777072364e-06, "loss": 0.118, "reward": 0.1458333395421505, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.1458333395421505, "rewards/format_reward": 0.0, "step": 761 }, { "completion_length": 2297.9376220703125, "epoch": 0.9292682926829269, "grad_norm": 0.30349159240722656, "kl": 0.08544921875, "learning_rate": 1.94038910139918e-06, "loss": 0.0263, "reward": 0.0833333358168602, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.0833333358168602, "rewards/format_reward": 0.0, "step": 762 }, { "completion_length": 2242.7708740234375, "epoch": 0.9304878048780488, "grad_norm": 0.19288888573646545, "kl": 0.081787109375, "learning_rate": 1.9373361299968173e-06, "loss": 0.1056, "reward": 0.1666666716337204, "reward_std": 0.18042195588350296, "rewards/accuracy_reward": 0.1666666716337204, "rewards/format_reward": 0.0, "step": 763 }, { "completion_length": 2450.8125, "epoch": 0.9317073170731708, "grad_norm": 0.08333387970924377, "kl": 0.078369140625, "learning_rate": 1.934281177331023e-06, "loss": 0.005, "reward": 0.2083333432674408, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.2083333432674408, "rewards/format_reward": 0.0, "step": 764 }, { "completion_length": 1886.3125610351562, "epoch": 0.9329268292682927, "grad_norm": 0.16903336346149445, "kl": 0.081787109375, "learning_rate": 1.9312242572416446e-06, "loss": 0.1079, "reward": 0.1875000074505806, "reward_std": 0.18042195588350296, "rewards/accuracy_reward": 0.1875000074505806, "rewards/format_reward": 0.0, "step": 765 }, { "completion_length": 2126.2709350585938, "epoch": 0.9341463414634147, "grad_norm": 0.07558081299066544, "kl": 0.077880859375, "learning_rate": 1.928165383577445e-06, "loss": 0.0399, "reward": 0.02083333395421505, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.02083333395421505, "rewards/format_reward": 0.0, "step": 766 }, { "completion_length": 1610.2291870117188, "epoch": 0.9353658536585366, "grad_norm": 0.37747398018836975, "kl": 0.095703125, "learning_rate": 1.925104570196036e-06, "loss": 0.0405, "reward": 0.0833333358168602, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.0833333358168602, "rewards/format_reward": 0.0, "step": 767 }, { "completion_length": 1776.3750610351562, "epoch": 0.9365853658536586, "grad_norm": 1.79806387424469, "kl": 0.09326171875, "learning_rate": 1.9220418309638175e-06, "loss": 0.0086, "reward": 0.08333333395421505, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.08333333395421505, "rewards/format_reward": 0.0, "step": 768 }, { "completion_length": 1367.6458740234375, "epoch": 0.9378048780487804, "grad_norm": 0.1988687515258789, "kl": 0.100830078125, "learning_rate": 1.9189771797559143e-06, "loss": 0.0506, "reward": 0.125, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "step": 769 }, { "completion_length": 1294.3125610351562, "epoch": 0.9390243902439024, "grad_norm": 0.36677107214927673, "kl": 0.072509765625, "learning_rate": 1.915910630456112e-06, "loss": 0.0376, "reward": 0.2083333358168602, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.2083333358168602, "rewards/format_reward": 0.0, "step": 770 }, { "completion_length": 1852.979248046875, "epoch": 0.9402439024390243, "grad_norm": 0.16986016929149628, "kl": 0.081298828125, "learning_rate": 1.9128421969567964e-06, "loss": 0.0482, "reward": 0.1458333358168602, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.1458333358168602, "rewards/format_reward": 0.0, "step": 771 }, { "completion_length": 1887.7501220703125, "epoch": 0.9414634146341463, "grad_norm": 0.09760124981403351, "kl": 0.074951171875, "learning_rate": 1.909771893158889e-06, "loss": 0.0103, "reward": 0.125, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "step": 772 }, { "completion_length": 1744.4791870117188, "epoch": 0.9426829268292682, "grad_norm": 0.12577581405639648, "kl": 0.073486328125, "learning_rate": 1.9066997329717833e-06, "loss": 0.0222, "reward": 0.14583333395421505, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.14583333395421505, "rewards/format_reward": 0.0, "step": 773 }, { "completion_length": 2214.2501220703125, "epoch": 0.9439024390243902, "grad_norm": 0.16476677358150482, "kl": 0.06787109375, "learning_rate": 1.9036257303132843e-06, "loss": 0.0181, "reward": 0.06250000186264515, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.06250000186264515, "rewards/format_reward": 0.0, "step": 774 }, { "completion_length": 2148.0, "epoch": 0.9451219512195121, "grad_norm": 0.12884533405303955, "kl": 0.05908203125, "learning_rate": 1.9005498991095422e-06, "loss": -0.0049, "reward": 0.2916666716337204, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.2916666716337204, "rewards/format_reward": 0.0, "step": 775 }, { "completion_length": 1907.854248046875, "epoch": 0.9463414634146341, "grad_norm": 0.18242411315441132, "kl": 0.0751953125, "learning_rate": 1.8974722532949929e-06, "loss": 0.0264, "reward": 0.1041666716337204, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.1041666716337204, "rewards/format_reward": 0.0, "step": 776 }, { "completion_length": 2112.4583740234375, "epoch": 0.947560975609756, "grad_norm": 1.6324245929718018, "kl": 0.08837890625, "learning_rate": 1.894392806812291e-06, "loss": 0.0489, "reward": 0.0416666679084301, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.0416666679084301, "rewards/format_reward": 0.0, "step": 777 }, { "completion_length": 2197.4375, "epoch": 0.948780487804878, "grad_norm": 0.1106872633099556, "kl": 0.06982421875, "learning_rate": 1.8913115736122519e-06, "loss": -0.0232, "reward": 0.2083333395421505, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.2083333395421505, "rewards/format_reward": 0.0, "step": 778 }, { "completion_length": 2174.979248046875, "epoch": 0.95, "grad_norm": 0.1095220148563385, "kl": 0.09716796875, "learning_rate": 1.888228567653781e-06, "loss": 0.0482, "reward": 0.0833333358168602, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.0833333358168602, "rewards/format_reward": 0.0, "step": 779 }, { "completion_length": 1890.6458740234375, "epoch": 0.9512195121951219, "grad_norm": 0.10288502275943756, "kl": 0.099609375, "learning_rate": 1.8851438029038191e-06, "loss": 0.0029, "reward": 0.0833333358168602, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.0833333358168602, "rewards/format_reward": 0.0, "step": 780 }, { "completion_length": 2760.291748046875, "epoch": 0.9524390243902439, "grad_norm": 0.08262011408805847, "kl": 0.0582275390625, "learning_rate": 1.882057293337271e-06, "loss": 0.0038, "reward": 0.0833333358168602, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.0833333358168602, "rewards/format_reward": 0.0, "step": 781 }, { "completion_length": 1957.0208740234375, "epoch": 0.9536585365853658, "grad_norm": 0.1598406583070755, "kl": 0.0927734375, "learning_rate": 1.8789690529369492e-06, "loss": 0.0693, "reward": 0.1250000037252903, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.1250000037252903, "rewards/format_reward": 0.0, "step": 782 }, { "completion_length": 1926.9166870117188, "epoch": 0.9548780487804878, "grad_norm": 0.1280995011329651, "kl": 0.07763671875, "learning_rate": 1.8758790956935059e-06, "loss": 0.0754, "reward": 0.125, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "step": 783 }, { "completion_length": 2507.20849609375, "epoch": 0.9560975609756097, "grad_norm": 0.22994191944599152, "kl": 0.07080078125, "learning_rate": 1.8727874356053706e-06, "loss": 0.0068, "reward": 0.2083333358168602, "reward_std": 0.18042195588350296, "rewards/accuracy_reward": 0.2083333358168602, "rewards/format_reward": 0.0, "step": 784 }, { "completion_length": 2073.5001220703125, "epoch": 0.9573170731707317, "grad_norm": 0.2451261281967163, "kl": 0.0703125, "learning_rate": 1.869694086678689e-06, "loss": 0.1429, "reward": 0.2708333358168602, "reward_std": 0.18042195588350296, "rewards/accuracy_reward": 0.2708333358168602, "rewards/format_reward": 0.0, "step": 785 }, { "completion_length": 1637.5, "epoch": 0.9585365853658536, "grad_norm": 0.13371232151985168, "kl": 0.08154296875, "learning_rate": 1.8665990629272555e-06, "loss": 0.0038, "reward": 0.229166679084301, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.229166679084301, "rewards/format_reward": 0.0, "step": 786 }, { "completion_length": 1974.7500610351562, "epoch": 0.9597560975609756, "grad_norm": 0.08475237339735031, "kl": 0.076904296875, "learning_rate": 1.863502378372454e-06, "loss": 0.0266, "reward": 0.4166666716337204, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.4166666716337204, "rewards/format_reward": 0.0, "step": 787 }, { "completion_length": 2630.0625, "epoch": 0.9609756097560975, "grad_norm": 0.16697664558887482, "kl": 0.069580078125, "learning_rate": 1.8604040470431908e-06, "loss": 0.0673, "reward": 0.0833333358168602, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.0833333358168602, "rewards/format_reward": 0.0, "step": 788 }, { "completion_length": 2004.8958740234375, "epoch": 0.9621951219512195, "grad_norm": 0.1425103396177292, "kl": 0.06689453125, "learning_rate": 1.857304082975834e-06, "loss": 0.0383, "reward": 0.2083333432674408, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.2083333432674408, "rewards/format_reward": 0.0, "step": 789 }, { "completion_length": 1995.5416870117188, "epoch": 0.9634146341463414, "grad_norm": 0.5255331993103027, "kl": 0.12353515625, "learning_rate": 1.8542025002141474e-06, "loss": 0.0893, "reward": 0.20833333395421505, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.20833333395421505, "rewards/format_reward": 0.0, "step": 790 }, { "completion_length": 2964.875, "epoch": 0.9646341463414634, "grad_norm": 0.137781023979187, "kl": 0.0638427734375, "learning_rate": 1.8510993128092273e-06, "loss": 0.0365, "reward": 0.125, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "step": 791 }, { "completion_length": 2194.25, "epoch": 0.9658536585365853, "grad_norm": 0.17564964294433594, "kl": 0.0718994140625, "learning_rate": 1.8479945348194423e-06, "loss": 0.0106, "reward": 0.3333333432674408, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.3333333432674408, "rewards/format_reward": 0.0, "step": 792 }, { "completion_length": 2555.916748046875, "epoch": 0.9670731707317073, "grad_norm": 0.1287163347005844, "kl": 0.070068359375, "learning_rate": 1.8448881803103637e-06, "loss": 0.0503, "reward": 0.1041666679084301, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.1041666679084301, "rewards/format_reward": 0.0, "step": 793 }, { "completion_length": 2289.5001220703125, "epoch": 0.9682926829268292, "grad_norm": 0.28565070033073425, "kl": 0.077392578125, "learning_rate": 1.8417802633547067e-06, "loss": 0.043, "reward": 0.08333333395421505, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.08333333395421505, "rewards/format_reward": 0.0, "step": 794 }, { "completion_length": 2157.7291870117188, "epoch": 0.9695121951219512, "grad_norm": 0.1937241107225418, "kl": 0.086181640625, "learning_rate": 1.8386707980322637e-06, "loss": 0.0659, "reward": 0.2083333358168602, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.2083333358168602, "rewards/format_reward": 0.0, "step": 795 }, { "completion_length": 2438.0209350585938, "epoch": 0.9707317073170731, "grad_norm": 0.19534894824028015, "kl": 0.066650390625, "learning_rate": 1.8355597984298435e-06, "loss": 0.0454, "reward": 0.229166679084301, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.229166679084301, "rewards/format_reward": 0.0, "step": 796 }, { "completion_length": 2299.1250610351562, "epoch": 0.9719512195121951, "grad_norm": 0.1936422884464264, "kl": 0.068603515625, "learning_rate": 1.8324472786412037e-06, "loss": 0.1529, "reward": 0.16666667722165585, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.16666667722165585, "rewards/format_reward": 0.0, "step": 797 }, { "completion_length": 2028.5834350585938, "epoch": 0.973170731707317, "grad_norm": 0.11078803986310959, "kl": 0.0706787109375, "learning_rate": 1.8293332527669897e-06, "loss": 0.0133, "reward": 0.02083333395421505, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.02083333395421505, "rewards/format_reward": 0.0, "step": 798 }, { "completion_length": 1751.1458740234375, "epoch": 0.974390243902439, "grad_norm": 0.1594752073287964, "kl": 0.06640625, "learning_rate": 1.8262177349146702e-06, "loss": 0.036, "reward": 0.1875000074505806, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.1875000074505806, "rewards/format_reward": 0.0, "step": 799 }, { "completion_length": 2286.8126220703125, "epoch": 0.975609756097561, "grad_norm": 0.104021355509758, "kl": 0.068115234375, "learning_rate": 1.823100739198472e-06, "loss": 0.0666, "reward": 0.3125, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.3125, "rewards/format_reward": 0.0, "step": 800 }, { "completion_length": 2264.1875, "epoch": 0.9768292682926829, "grad_norm": 0.19936567544937134, "kl": 0.064208984375, "learning_rate": 1.8199822797393182e-06, "loss": -0.075, "reward": 0.2083333432674408, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.2083333432674408, "rewards/format_reward": 0.0, "step": 801 }, { "completion_length": 1850.9375610351562, "epoch": 0.9780487804878049, "grad_norm": 0.10595716536045074, "kl": 0.072998046875, "learning_rate": 1.816862370664762e-06, "loss": 0.0429, "reward": 0.1041666716337204, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.1041666716337204, "rewards/format_reward": 0.0, "step": 802 }, { "completion_length": 2270.041748046875, "epoch": 0.9792682926829268, "grad_norm": 0.11459033936262131, "kl": 0.0859375, "learning_rate": 1.8137410261089253e-06, "loss": 0.0425, "reward": 0.1041666716337204, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.1041666716337204, "rewards/format_reward": 0.0, "step": 803 }, { "completion_length": 2035.854248046875, "epoch": 0.9804878048780488, "grad_norm": 0.133082315325737, "kl": 0.08203125, "learning_rate": 1.8106182602124312e-06, "loss": 0.0428, "reward": 0.10416666977107525, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.10416666977107525, "rewards/format_reward": 0.0, "step": 804 }, { "completion_length": 1417.6875610351562, "epoch": 0.9817073170731707, "grad_norm": 0.10599875450134277, "kl": 0.0703125, "learning_rate": 1.8074940871223436e-06, "loss": 0.0345, "reward": 0.1041666679084301, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.1041666679084301, "rewards/format_reward": 0.0, "step": 805 }, { "completion_length": 1934.375, "epoch": 0.9829268292682927, "grad_norm": 0.23281818628311157, "kl": 0.09716796875, "learning_rate": 1.8043685209921002e-06, "loss": 0.073, "reward": 0.08333333395421505, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.08333333395421505, "rewards/format_reward": 0.0, "step": 806 }, { "completion_length": 1447.0208740234375, "epoch": 0.9841463414634146, "grad_norm": 0.23211544752120972, "kl": 0.0810546875, "learning_rate": 1.8012415759814505e-06, "loss": 0.0624, "reward": 0.3125000149011612, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.3125000149011612, "rewards/format_reward": 0.0, "step": 807 }, { "completion_length": 1773.7708740234375, "epoch": 0.9853658536585366, "grad_norm": 0.172435462474823, "kl": 0.075439453125, "learning_rate": 1.7981132662563906e-06, "loss": 0.0911, "reward": 0.229166679084301, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.229166679084301, "rewards/format_reward": 0.0, "step": 808 }, { "completion_length": 1861.0625610351562, "epoch": 0.9865853658536585, "grad_norm": 0.1463833600282669, "kl": 0.06982421875, "learning_rate": 1.794983605989098e-06, "loss": 0.0665, "reward": 0.0416666679084301, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.0416666679084301, "rewards/format_reward": 0.0, "step": 809 }, { "completion_length": 1151.9375305175781, "epoch": 0.9878048780487805, "grad_norm": 0.31820955872535706, "kl": 0.10693359375, "learning_rate": 1.7918526093578702e-06, "loss": -0.0039, "reward": 0.08333333395421505, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.08333333395421505, "rewards/format_reward": 0.0, "step": 810 }, { "completion_length": 1778.4584350585938, "epoch": 0.9890243902439024, "grad_norm": 0.12072424590587616, "kl": 0.07470703125, "learning_rate": 1.7887202905470582e-06, "loss": 0.0661, "reward": 0.06250000186264515, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.06250000186264515, "rewards/format_reward": 0.0, "step": 811 }, { "completion_length": 1501.75, "epoch": 0.9902439024390244, "grad_norm": 0.050069134682416916, "kl": 0.0565185546875, "learning_rate": 1.7855866637470027e-06, "loss": 0.0022, "reward": 0.125, "reward_std": 0.0, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "step": 812 }, { "completion_length": 1543.7083740234375, "epoch": 0.9914634146341463, "grad_norm": 1.3539941310882568, "kl": 0.08251953125, "learning_rate": 1.7824517431539697e-06, "loss": 0.0326, "reward": 0.1875000074505806, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.1875000074505806, "rewards/format_reward": 0.0, "step": 813 }, { "completion_length": 1366.9583740234375, "epoch": 0.9926829268292683, "grad_norm": 0.3993377983570099, "kl": 0.063232421875, "learning_rate": 1.7793155429700868e-06, "loss": 0.0022, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "step": 814 }, { "completion_length": 1753.791748046875, "epoch": 0.9939024390243902, "grad_norm": 0.12188085913658142, "kl": 0.06005859375, "learning_rate": 1.776178077403279e-06, "loss": 0.0256, "reward": 0.229166679084301, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.229166679084301, "rewards/format_reward": 0.0, "step": 815 }, { "completion_length": 1822.9375, "epoch": 0.9951219512195122, "grad_norm": 0.11630331724882126, "kl": 0.0654296875, "learning_rate": 1.7730393606672033e-06, "loss": 0.058, "reward": 0.22916667722165585, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.22916667722165585, "rewards/format_reward": 0.0, "step": 816 }, { "completion_length": 1395.0833740234375, "epoch": 0.9963414634146341, "grad_norm": 0.2967110276222229, "kl": 0.0743408203125, "learning_rate": 1.769899406981185e-06, "loss": 0.0865, "reward": 0.2916666865348816, "reward_std": 0.18042195588350296, "rewards/accuracy_reward": 0.2916666865348816, "rewards/format_reward": 0.0, "step": 817 }, { "completion_length": 1730.4583740234375, "epoch": 0.9975609756097561, "grad_norm": 0.12924818694591522, "kl": 0.058349609375, "learning_rate": 1.7667582305701528e-06, "loss": 0.0431, "reward": 0.2708333432674408, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.2708333432674408, "rewards/format_reward": 0.0, "step": 818 }, { "completion_length": 909.3541870117188, "epoch": 0.998780487804878, "grad_norm": 0.041504696011543274, "kl": 0.059814453125, "learning_rate": 1.7636158456645754e-06, "loss": 0.0023, "reward": 0.125, "reward_std": 0.0, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "step": 819 }, { "completion_length": 1264.15625, "epoch": 1.0, "grad_norm": 0.2563475966453552, "kl": 0.063720703125, "learning_rate": 1.7604722665003958e-06, "loss": 0.0263, "reward": 0.3125, "reward_std": 0.18042195588350296, "rewards/accuracy_reward": 0.3125, "rewards/format_reward": 0.0, "step": 820 }, { "completion_length": 1558.6458740234375, "epoch": 1.001219512195122, "grad_norm": 0.22059719264507294, "kl": 0.08251953125, "learning_rate": 1.7573275073189677e-06, "loss": 0.0754, "reward": 0.3333333432674408, "reward_std": 0.18042195588350296, "rewards/accuracy_reward": 0.3333333432674408, "rewards/format_reward": 0.0, "step": 821 }, { "completion_length": 1349.0208740234375, "epoch": 1.002439024390244, "grad_norm": 0.17566891014575958, "kl": 0.06884765625, "learning_rate": 1.7541815823669903e-06, "loss": -0.0097, "reward": 0.1666666679084301, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.1666666679084301, "rewards/format_reward": 0.0, "step": 822 }, { "completion_length": 1475.1458435058594, "epoch": 1.0036585365853659, "grad_norm": 0.15861055254936218, "kl": 0.05810546875, "learning_rate": 1.7510345058964446e-06, "loss": 0.0699, "reward": 0.229166679084301, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.229166679084301, "rewards/format_reward": 0.0, "step": 823 }, { "completion_length": 1511.3750305175781, "epoch": 1.0048780487804878, "grad_norm": 0.26808956265449524, "kl": 0.0574951171875, "learning_rate": 1.7478862921645273e-06, "loss": -0.01, "reward": 0.25, "reward_std": 0.18042195588350296, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.0, "step": 824 }, { "completion_length": 1735.7708740234375, "epoch": 1.0060975609756098, "grad_norm": 0.08171319961547852, "kl": 0.05126953125, "learning_rate": 1.7447369554335887e-06, "loss": 0.0132, "reward": 0.0833333358168602, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.0833333358168602, "rewards/format_reward": 0.0, "step": 825 }, { "completion_length": 1476.0208740234375, "epoch": 1.0073170731707317, "grad_norm": 0.1476607471704483, "kl": 0.0543212890625, "learning_rate": 1.7415865099710657e-06, "loss": -0.0114, "reward": 0.2083333395421505, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.2083333395421505, "rewards/format_reward": 0.0, "step": 826 }, { "completion_length": 1452.104248046875, "epoch": 1.0085365853658537, "grad_norm": 0.09912052005529404, "kl": 0.05322265625, "learning_rate": 1.7384349700494184e-06, "loss": 0.0486, "reward": 0.08333333395421505, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.08333333395421505, "rewards/format_reward": 0.0, "step": 827 }, { "completion_length": 1773.5833740234375, "epoch": 1.0097560975609756, "grad_norm": 0.2928617596626282, "kl": 0.0679931640625, "learning_rate": 1.735282349946064e-06, "loss": 0.1092, "reward": 0.2291666716337204, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.2291666716337204, "rewards/format_reward": 0.0, "step": 828 }, { "completion_length": 1691.6250610351562, "epoch": 1.0109756097560976, "grad_norm": 0.08107290416955948, "kl": 0.066162109375, "learning_rate": 1.732128663943315e-06, "loss": 0.033, "reward": 0.0833333358168602, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.0833333358168602, "rewards/format_reward": 0.0, "step": 829 }, { "completion_length": 1757.1458740234375, "epoch": 1.0121951219512195, "grad_norm": 0.32985296845436096, "kl": 0.1064453125, "learning_rate": 1.7289739263283118e-06, "loss": 0.184, "reward": 0.1875000074505806, "reward_std": 0.18042195588350296, "rewards/accuracy_reward": 0.1875000074505806, "rewards/format_reward": 0.0, "step": 830 }, { "completion_length": 1252.0208740234375, "epoch": 1.0134146341463415, "grad_norm": 0.3417028784751892, "kl": 0.0484619140625, "learning_rate": 1.7258181513929593e-06, "loss": 0.0045, "reward": 0.1458333358168602, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.1458333358168602, "rewards/format_reward": 0.0, "step": 831 }, { "completion_length": 1539.1666870117188, "epoch": 1.0146341463414634, "grad_norm": 0.19829852879047394, "kl": 0.052978515625, "learning_rate": 1.7226613534338608e-06, "loss": 0.0946, "reward": 0.1458333358168602, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.1458333358168602, "rewards/format_reward": 0.0, "step": 832 }, { "completion_length": 1928.2083740234375, "epoch": 1.0158536585365854, "grad_norm": 0.15048523247241974, "kl": 0.0557861328125, "learning_rate": 1.7195035467522556e-06, "loss": 0.0795, "reward": 0.1666666716337204, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.1666666716337204, "rewards/format_reward": 0.0, "step": 833 }, { "completion_length": 2224.9583740234375, "epoch": 1.0170731707317073, "grad_norm": 0.1385055035352707, "kl": 0.0618896484375, "learning_rate": 1.716344745653952e-06, "loss": 0.0429, "reward": 0.1875000111758709, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.1875000111758709, "rewards/format_reward": 0.0, "step": 834 }, { "completion_length": 1723.5416870117188, "epoch": 1.0182926829268293, "grad_norm": 0.05238031595945358, "kl": 0.048095703125, "learning_rate": 1.7131849644492634e-06, "loss": 0.0022, "reward": 0.125, "reward_std": 0.0, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "step": 835 }, { "completion_length": 1968.7083740234375, "epoch": 1.0195121951219512, "grad_norm": 0.18248939514160156, "kl": 0.0577392578125, "learning_rate": 1.7100242174529439e-06, "loss": 0.087, "reward": 0.1666666716337204, "reward_std": 0.18042196333408356, "rewards/accuracy_reward": 0.1666666716337204, "rewards/format_reward": 0.0, "step": 836 }, { "completion_length": 2262.1459350585938, "epoch": 1.0207317073170732, "grad_norm": 0.07669670879840851, "kl": 0.0628662109375, "learning_rate": 1.7068625189841213e-06, "loss": 0.0229, "reward": 0.14583333395421505, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.14583333395421505, "rewards/format_reward": 0.0, "step": 837 }, { "completion_length": 1986.7291870117188, "epoch": 1.0219512195121951, "grad_norm": 0.18238304555416107, "kl": 0.0521240234375, "learning_rate": 1.7036998833662359e-06, "loss": 0.0207, "reward": 0.1666666679084301, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.1666666679084301, "rewards/format_reward": 0.0, "step": 838 }, { "completion_length": 1957.9583740234375, "epoch": 1.023170731707317, "grad_norm": 0.1262129843235016, "kl": 0.0618896484375, "learning_rate": 1.7005363249269726e-06, "loss": 0.0732, "reward": 0.1875, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.1875, "rewards/format_reward": 0.0, "step": 839 }, { "completion_length": 1967.7084350585938, "epoch": 1.024390243902439, "grad_norm": 0.10862776637077332, "kl": 0.0592041015625, "learning_rate": 1.6973718579981973e-06, "loss": 0.0421, "reward": 0.125, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "step": 840 }, { "completion_length": 1246.3125610351562, "epoch": 1.025609756097561, "grad_norm": 0.3085106909275055, "kl": 0.063232421875, "learning_rate": 1.6942064969158907e-06, "loss": 0.0423, "reward": 0.1875000111758709, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.1875000111758709, "rewards/format_reward": 0.0, "step": 841 }, { "completion_length": 1532.4583740234375, "epoch": 1.026829268292683, "grad_norm": 0.06805000454187393, "kl": 0.0560302734375, "learning_rate": 1.6910402560200854e-06, "loss": 0.0398, "reward": 0.2083333358168602, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.2083333358168602, "rewards/format_reward": 0.0, "step": 842 }, { "completion_length": 1584.9166870117188, "epoch": 1.028048780487805, "grad_norm": 0.16937634348869324, "kl": 0.04736328125, "learning_rate": 1.6878731496547987e-06, "loss": 0.1109, "reward": 0.18750000558793545, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.18750000558793545, "rewards/format_reward": 0.0, "step": 843 }, { "completion_length": 2558.2083740234375, "epoch": 1.0292682926829269, "grad_norm": 0.21832555532455444, "kl": 0.0626220703125, "learning_rate": 1.6847051921679702e-06, "loss": 0.022, "reward": 0.08333333395421505, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.08333333395421505, "rewards/format_reward": 0.0, "step": 844 }, { "completion_length": 1414.7291870117188, "epoch": 1.0304878048780488, "grad_norm": 0.1572750210762024, "kl": 0.0491943359375, "learning_rate": 1.6815363979113947e-06, "loss": 0.0007, "reward": 0.3125, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.3125, "rewards/format_reward": 0.0, "step": 845 }, { "completion_length": 1646.3125, "epoch": 1.0317073170731708, "grad_norm": 0.48570016026496887, "kl": 0.0657958984375, "learning_rate": 1.6783667812406569e-06, "loss": 0.2, "reward": 0.25, "reward_std": 0.18042195588350296, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.0, "step": 846 }, { "completion_length": 1742.1875, "epoch": 1.0329268292682927, "grad_norm": 0.19844897091388702, "kl": 0.0587158203125, "learning_rate": 1.6751963565150682e-06, "loss": 0.0304, "reward": 0.12500000558793545, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.12500000558793545, "rewards/format_reward": 0.0, "step": 847 }, { "completion_length": 1645.7291870117188, "epoch": 1.0341463414634147, "grad_norm": 0.18595775961875916, "kl": 0.0576171875, "learning_rate": 1.672025138097601e-06, "loss": 0.0738, "reward": 0.2083333358168602, "reward_std": 0.18042195588350296, "rewards/accuracy_reward": 0.2083333358168602, "rewards/format_reward": 0.0, "step": 848 }, { "completion_length": 1999.3750610351562, "epoch": 1.0353658536585366, "grad_norm": 0.11137094348669052, "kl": 0.060791015625, "learning_rate": 1.6688531403548222e-06, "loss": 0.0488, "reward": 0.2291666716337204, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.2291666716337204, "rewards/format_reward": 0.0, "step": 849 }, { "completion_length": 1290.7083740234375, "epoch": 1.0365853658536586, "grad_norm": 0.14904099702835083, "kl": 0.05322265625, "learning_rate": 1.6656803776568307e-06, "loss": 0.0828, "reward": 0.0625, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.0625, "rewards/format_reward": 0.0, "step": 850 }, { "completion_length": 1698.5625, "epoch": 1.0378048780487805, "grad_norm": 0.18798424303531647, "kl": 0.0626220703125, "learning_rate": 1.6625068643771898e-06, "loss": 0.0178, "reward": 0.1875000074505806, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.1875000074505806, "rewards/format_reward": 0.0, "step": 851 }, { "completion_length": 1939.1875, "epoch": 1.0390243902439025, "grad_norm": 0.06463862955570221, "kl": 0.0584716796875, "learning_rate": 1.6593326148928643e-06, "loss": 0.0025, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "step": 852 }, { "completion_length": 1173.0208740234375, "epoch": 1.0402439024390244, "grad_norm": 0.2833917438983917, "kl": 0.0623779296875, "learning_rate": 1.6561576435841515e-06, "loss": 0.0036, "reward": 0.1666666716337204, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.1666666716337204, "rewards/format_reward": 0.0, "step": 853 }, { "completion_length": 1896.9375, "epoch": 1.0414634146341464, "grad_norm": 0.13853855431079865, "kl": 0.0684814453125, "learning_rate": 1.652981964834623e-06, "loss": 0.1137, "reward": 0.2708333432674408, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.2708333432674408, "rewards/format_reward": 0.0, "step": 854 }, { "completion_length": 1704.6250610351562, "epoch": 1.0426829268292683, "grad_norm": 0.21089263260364532, "kl": 0.06884765625, "learning_rate": 1.6498055930310522e-06, "loss": 0.0496, "reward": 0.2291666679084301, "reward_std": 0.18042195588350296, "rewards/accuracy_reward": 0.2291666679084301, "rewards/format_reward": 0.0, "step": 855 }, { "completion_length": 1384.7916870117188, "epoch": 1.0439024390243903, "grad_norm": 0.3170362710952759, "kl": 0.0611572265625, "learning_rate": 1.6466285425633527e-06, "loss": 0.1067, "reward": 0.3125000149011612, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.3125000149011612, "rewards/format_reward": 0.0, "step": 856 }, { "completion_length": 1594.979248046875, "epoch": 1.0451219512195122, "grad_norm": 0.2869209349155426, "kl": 0.0694580078125, "learning_rate": 1.6434508278245136e-06, "loss": 0.0857, "reward": 0.2500000149011612, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.2500000149011612, "rewards/format_reward": 0.0, "step": 857 }, { "completion_length": 1287.9375, "epoch": 1.0463414634146342, "grad_norm": 0.11518630385398865, "kl": 0.051513671875, "learning_rate": 1.6402724632105323e-06, "loss": 0.0231, "reward": 0.0625, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.0625, "rewards/format_reward": 0.0, "step": 858 }, { "completion_length": 1737.729248046875, "epoch": 1.0475609756097561, "grad_norm": 0.1519986093044281, "kl": 0.05859375, "learning_rate": 1.6370934631203516e-06, "loss": 0.0992, "reward": 0.1458333358168602, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.1458333358168602, "rewards/format_reward": 0.0, "step": 859 }, { "completion_length": 1418.0833740234375, "epoch": 1.048780487804878, "grad_norm": 0.5857430696487427, "kl": 0.083740234375, "learning_rate": 1.6339138419557916e-06, "loss": 0.0537, "reward": 0.2916666716337204, "reward_std": 0.18042195588350296, "rewards/accuracy_reward": 0.2916666716337204, "rewards/format_reward": 0.0, "step": 860 }, { "completion_length": 1537.25, "epoch": 1.05, "grad_norm": 0.3215422034263611, "kl": 0.06396484375, "learning_rate": 1.6307336141214877e-06, "loss": 0.2012, "reward": 0.2708333432674408, "reward_std": 0.21650633960962296, "rewards/accuracy_reward": 0.2708333432674408, "rewards/format_reward": 0.0, "step": 861 }, { "completion_length": 1814.0416870117188, "epoch": 1.051219512195122, "grad_norm": 0.14509320259094238, "kl": 0.05712890625, "learning_rate": 1.6275527940248218e-06, "loss": 0.0387, "reward": 0.1875, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.1875, "rewards/format_reward": 0.0, "step": 862 }, { "completion_length": 1340.6458740234375, "epoch": 1.052439024390244, "grad_norm": 0.20823921263217926, "kl": 0.0675048828125, "learning_rate": 1.6243713960758608e-06, "loss": 0.0492, "reward": 0.3125, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.3125, "rewards/format_reward": 0.0, "step": 863 }, { "completion_length": 1330.2083740234375, "epoch": 1.053658536585366, "grad_norm": 0.17790457606315613, "kl": 0.060546875, "learning_rate": 1.6211894346872887e-06, "loss": 0.0475, "reward": 0.1458333395421505, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.1458333395421505, "rewards/format_reward": 0.0, "step": 864 }, { "completion_length": 2002.604248046875, "epoch": 1.0548780487804879, "grad_norm": 0.10235309600830078, "kl": 0.068359375, "learning_rate": 1.6180069242743416e-06, "loss": 0.0246, "reward": 0.1458333358168602, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.1458333358168602, "rewards/format_reward": 0.0, "step": 865 }, { "completion_length": 1628.8750610351562, "epoch": 1.0560975609756098, "grad_norm": 0.2700851261615753, "kl": 0.065185546875, "learning_rate": 1.614823879254744e-06, "loss": 0.0491, "reward": 0.3541666865348816, "reward_std": 0.21650633960962296, "rewards/accuracy_reward": 0.3541666865348816, "rewards/format_reward": 0.0, "step": 866 }, { "completion_length": 2068.229248046875, "epoch": 1.0573170731707318, "grad_norm": 0.1257856786251068, "kl": 0.062744140625, "learning_rate": 1.6116403140486397e-06, "loss": 0.0961, "reward": 0.2916666716337204, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.2916666716337204, "rewards/format_reward": 0.0, "step": 867 }, { "completion_length": 1561.0625610351562, "epoch": 1.0585365853658537, "grad_norm": 0.08292888104915619, "kl": 0.07568359375, "learning_rate": 1.6084562430785336e-06, "loss": 0.0007, "reward": 0.1666666679084301, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.1666666679084301, "rewards/format_reward": 0.0, "step": 868 }, { "completion_length": 2177.625, "epoch": 1.0597560975609757, "grad_norm": 0.1228252574801445, "kl": 0.0628662109375, "learning_rate": 1.605271680769217e-06, "loss": 0.0742, "reward": 0.1666666679084301, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.1666666679084301, "rewards/format_reward": 0.0, "step": 869 }, { "completion_length": 1705.854248046875, "epoch": 1.0609756097560976, "grad_norm": 0.08072850108146667, "kl": 0.0601806640625, "learning_rate": 1.6020866415477108e-06, "loss": 0.0238, "reward": 0.08333333395421505, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.08333333395421505, "rewards/format_reward": 0.0, "step": 870 }, { "completion_length": 1352.8125, "epoch": 1.0621951219512196, "grad_norm": 0.1563325673341751, "kl": 0.0543212890625, "learning_rate": 1.5989011398431943e-06, "loss": 0.0843, "reward": 0.0416666679084301, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.0416666679084301, "rewards/format_reward": 0.0, "step": 871 }, { "completion_length": 2121.041748046875, "epoch": 1.0634146341463415, "grad_norm": 0.16216345131397247, "kl": 0.080078125, "learning_rate": 1.5957151900869425e-06, "loss": 0.1096, "reward": 0.1458333395421505, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.1458333395421505, "rewards/format_reward": 0.0, "step": 872 }, { "completion_length": 1848.0625610351562, "epoch": 1.0646341463414635, "grad_norm": 0.24038060009479523, "kl": 0.067626953125, "learning_rate": 1.5925288067122614e-06, "loss": 0.1532, "reward": 0.2708333358168602, "reward_std": 0.18042196333408356, "rewards/accuracy_reward": 0.2708333358168602, "rewards/format_reward": 0.0, "step": 873 }, { "completion_length": 1316.5208740234375, "epoch": 1.0658536585365854, "grad_norm": 0.33268973231315613, "kl": 0.0677490234375, "learning_rate": 1.5893420041544193e-06, "loss": 0.1209, "reward": 0.1875000074505806, "reward_std": 0.18042195588350296, "rewards/accuracy_reward": 0.1875000074505806, "rewards/format_reward": 0.0, "step": 874 }, { "completion_length": 1971.8751220703125, "epoch": 1.0670731707317074, "grad_norm": 0.27953222393989563, "kl": 0.076416015625, "learning_rate": 1.5861547968505853e-06, "loss": 0.1053, "reward": 0.2083333432674408, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.2083333432674408, "rewards/format_reward": 0.0, "step": 875 }, { "completion_length": 1635.6875, "epoch": 1.0682926829268293, "grad_norm": 0.2557886838912964, "kl": 0.065185546875, "learning_rate": 1.582967199239761e-06, "loss": 0.006, "reward": 0.1875, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.1875, "rewards/format_reward": 0.0, "step": 876 }, { "completion_length": 2022.166748046875, "epoch": 1.0695121951219513, "grad_norm": 0.2831270098686218, "kl": 0.07275390625, "learning_rate": 1.5797792257627168e-06, "loss": 0.0326, "reward": 0.14583333395421505, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.14583333395421505, "rewards/format_reward": 0.0, "step": 877 }, { "completion_length": 2250.375, "epoch": 1.0707317073170732, "grad_norm": 0.169064462184906, "kl": 0.074951171875, "learning_rate": 1.5765908908619258e-06, "loss": 0.0343, "reward": 0.1666666716337204, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.1666666716337204, "rewards/format_reward": 0.0, "step": 878 }, { "completion_length": 1444.7083740234375, "epoch": 1.0719512195121952, "grad_norm": 0.1236577257514, "kl": 0.061279296875, "learning_rate": 1.573402208981499e-06, "loss": 0.0314, "reward": 0.2083333358168602, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.2083333358168602, "rewards/format_reward": 0.0, "step": 879 }, { "completion_length": 1834.0833740234375, "epoch": 1.0731707317073171, "grad_norm": 0.23196040093898773, "kl": 0.080078125, "learning_rate": 1.5702131945671182e-06, "loss": 0.1464, "reward": 0.1875000074505806, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.1875000074505806, "rewards/format_reward": 0.0, "step": 880 }, { "completion_length": 1878.9375, "epoch": 1.074390243902439, "grad_norm": 0.1786525994539261, "kl": 0.0692138671875, "learning_rate": 1.5670238620659717e-06, "loss": 0.0935, "reward": 0.1250000037252903, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.1250000037252903, "rewards/format_reward": 0.0, "step": 881 }, { "completion_length": 1908.5833740234375, "epoch": 1.075609756097561, "grad_norm": 0.1423856019973755, "kl": 0.06787109375, "learning_rate": 1.5638342259266904e-06, "loss": 0.0229, "reward": 0.125, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "step": 882 }, { "completion_length": 2048.2708740234375, "epoch": 1.076829268292683, "grad_norm": 0.17558245360851288, "kl": 0.07177734375, "learning_rate": 1.5606443005992789e-06, "loss": 0.0278, "reward": 0.2083333432674408, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.2083333432674408, "rewards/format_reward": 0.0, "step": 883 }, { "completion_length": 2358.7709350585938, "epoch": 1.078048780487805, "grad_norm": 0.07483585923910141, "kl": 0.08203125, "learning_rate": 1.5574541005350532e-06, "loss": 0.0428, "reward": 0.08333333395421505, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.08333333395421505, "rewards/format_reward": 0.0, "step": 884 }, { "completion_length": 1560.3333740234375, "epoch": 1.079268292682927, "grad_norm": 0.12679235637187958, "kl": 0.07373046875, "learning_rate": 1.5542636401865733e-06, "loss": 0.0041, "reward": 0.0416666679084301, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.0416666679084301, "rewards/format_reward": 0.0, "step": 885 }, { "completion_length": 1970.5625, "epoch": 1.0804878048780489, "grad_norm": 0.18568357825279236, "kl": 0.0712890625, "learning_rate": 1.5510729340075781e-06, "loss": 0.1327, "reward": 0.2500000149011612, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.2500000149011612, "rewards/format_reward": 0.0, "step": 886 }, { "completion_length": 1745.0000610351562, "epoch": 1.0817073170731708, "grad_norm": 0.20618446171283722, "kl": 0.066650390625, "learning_rate": 1.5478819964529216e-06, "loss": 0.1388, "reward": 0.0833333358168602, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.0833333358168602, "rewards/format_reward": 0.0, "step": 887 }, { "completion_length": 1843.875, "epoch": 1.0829268292682928, "grad_norm": 0.14224958419799805, "kl": 0.0751953125, "learning_rate": 1.544690841978504e-06, "loss": 0.0554, "reward": 0.3333333432674408, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.3333333432674408, "rewards/format_reward": 0.0, "step": 888 }, { "completion_length": 1947.8750610351562, "epoch": 1.0841463414634147, "grad_norm": 0.09921899437904358, "kl": 0.06591796875, "learning_rate": 1.5414994850412102e-06, "loss": 0.0597, "reward": 0.02083333395421505, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.02083333395421505, "rewards/format_reward": 0.0, "step": 889 }, { "completion_length": 2230.604248046875, "epoch": 1.0853658536585367, "grad_norm": 0.07646988332271576, "kl": 0.066650390625, "learning_rate": 1.5383079400988402e-06, "loss": 0.0028, "reward": 0.0625, "reward_std": 0.0, "rewards/accuracy_reward": 0.0625, "rewards/format_reward": 0.0, "step": 890 }, { "completion_length": 1792.541748046875, "epoch": 1.0865853658536586, "grad_norm": 0.2364712506532669, "kl": 0.0692138671875, "learning_rate": 1.5351162216100473e-06, "loss": 0.0529, "reward": 0.25, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.0, "step": 891 }, { "completion_length": 1300.7500610351562, "epoch": 1.0878048780487806, "grad_norm": 0.19870877265930176, "kl": 0.06787109375, "learning_rate": 1.5319243440342713e-06, "loss": 0.0643, "reward": 0.1875000111758709, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.1875000111758709, "rewards/format_reward": 0.0, "step": 892 }, { "completion_length": 1372.1875610351562, "epoch": 1.0890243902439025, "grad_norm": 0.2156955748796463, "kl": 0.081298828125, "learning_rate": 1.5287323218316713e-06, "loss": 0.0485, "reward": 0.2916666716337204, "reward_std": 0.21650634706020355, "rewards/accuracy_reward": 0.2916666716337204, "rewards/format_reward": 0.0, "step": 893 }, { "completion_length": 1674.6666870117188, "epoch": 1.0902439024390245, "grad_norm": 0.14830811321735382, "kl": 0.069580078125, "learning_rate": 1.5255401694630625e-06, "loss": 0.1327, "reward": 0.2500000111758709, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.2500000111758709, "rewards/format_reward": 0.0, "step": 894 }, { "completion_length": 2112.7916870117188, "epoch": 1.0914634146341464, "grad_norm": 0.14421944320201874, "kl": 0.074462890625, "learning_rate": 1.5223479013898489e-06, "loss": 0.0243, "reward": 0.1875, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.1875, "rewards/format_reward": 0.0, "step": 895 }, { "completion_length": 1717.8541870117188, "epoch": 1.0926829268292684, "grad_norm": 0.14482933282852173, "kl": 0.05712890625, "learning_rate": 1.5191555320739608e-06, "loss": 0.0968, "reward": 0.1875, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.1875, "rewards/format_reward": 0.0, "step": 896 }, { "completion_length": 2355.8125, "epoch": 1.0939024390243903, "grad_norm": 0.12093734741210938, "kl": 0.072265625, "learning_rate": 1.5159630759777845e-06, "loss": 0.0442, "reward": 0.12500000558793545, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.12500000558793545, "rewards/format_reward": 0.0, "step": 897 }, { "completion_length": 2057.3751220703125, "epoch": 1.0951219512195123, "grad_norm": 0.08225837349891663, "kl": 0.0712890625, "learning_rate": 1.5127705475641014e-06, "loss": 0.0315, "reward": 0.0833333358168602, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.0833333358168602, "rewards/format_reward": 0.0, "step": 898 }, { "completion_length": 2534.791748046875, "epoch": 1.0963414634146342, "grad_norm": 0.09056607633829117, "kl": 0.08154296875, "learning_rate": 1.5095779612960189e-06, "loss": 0.0049, "reward": 0.0833333358168602, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.0833333358168602, "rewards/format_reward": 0.0, "step": 899 }, { "completion_length": 2296.791748046875, "epoch": 1.0975609756097562, "grad_norm": 0.1415863037109375, "kl": 0.073486328125, "learning_rate": 1.5063853316369081e-06, "loss": 0.0704, "reward": 0.1458333395421505, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.1458333395421505, "rewards/format_reward": 0.0, "step": 900 }, { "completion_length": 1881.3958740234375, "epoch": 1.0987804878048781, "grad_norm": 0.14481917023658752, "kl": 0.067138671875, "learning_rate": 1.5031926730503356e-06, "loss": 0.0784, "reward": 0.1666666679084301, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.1666666679084301, "rewards/format_reward": 0.0, "step": 901 }, { "completion_length": 1991.5625, "epoch": 1.1, "grad_norm": 0.12288223952054977, "kl": 0.068115234375, "learning_rate": 1.5e-06, "loss": 0.0031, "reward": 0.0625, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.0625, "rewards/format_reward": 0.0, "step": 902 }, { "completion_length": 1837.9166870117188, "epoch": 1.101219512195122, "grad_norm": 0.162868469953537, "kl": 0.06982421875, "learning_rate": 1.4968073269496644e-06, "loss": 0.0653, "reward": 0.1458333358168602, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.1458333358168602, "rewards/format_reward": 0.0, "step": 903 }, { "completion_length": 1570.5625, "epoch": 1.102439024390244, "grad_norm": 0.10922540724277496, "kl": 0.065185546875, "learning_rate": 1.4936146683630921e-06, "loss": 0.0165, "reward": 0.2083333358168602, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.2083333358168602, "rewards/format_reward": 0.0, "step": 904 }, { "completion_length": 1983.5416870117188, "epoch": 1.103658536585366, "grad_norm": 0.1241430938243866, "kl": 0.07373046875, "learning_rate": 1.4904220387039814e-06, "loss": 0.0185, "reward": 0.0416666679084301, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.0416666679084301, "rewards/format_reward": 0.0, "step": 905 }, { "completion_length": 2171.979248046875, "epoch": 1.104878048780488, "grad_norm": 0.13579416275024414, "kl": 0.084716796875, "learning_rate": 1.4872294524358989e-06, "loss": 0.0874, "reward": 0.1666666716337204, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.1666666716337204, "rewards/format_reward": 0.0, "step": 906 }, { "completion_length": 1131.8333740234375, "epoch": 1.1060975609756099, "grad_norm": 0.05733570456504822, "kl": 0.069580078125, "learning_rate": 1.4840369240222158e-06, "loss": 0.0025, "reward": 0.125, "reward_std": 0.0, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "step": 907 }, { "completion_length": 1814.3958740234375, "epoch": 1.1073170731707318, "grad_norm": 0.30615684390068054, "kl": 0.07958984375, "learning_rate": 1.4808444679260396e-06, "loss": 0.1313, "reward": 0.2083333358168602, "reward_std": 0.21650634706020355, "rewards/accuracy_reward": 0.2083333358168602, "rewards/format_reward": 0.0, "step": 908 }, { "completion_length": 1927.1041870117188, "epoch": 1.1085365853658538, "grad_norm": 0.18087086081504822, "kl": 0.058837890625, "learning_rate": 1.4776520986101508e-06, "loss": -0.0177, "reward": 0.08333333395421505, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.08333333395421505, "rewards/format_reward": 0.0, "step": 909 }, { "completion_length": 1527.6250915527344, "epoch": 1.1097560975609757, "grad_norm": 0.24441739916801453, "kl": 0.053955078125, "learning_rate": 1.4744598305369376e-06, "loss": 0.0501, "reward": 0.2083333432674408, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.2083333432674408, "rewards/format_reward": 0.0, "step": 910 }, { "completion_length": 2031.2708740234375, "epoch": 1.1109756097560977, "grad_norm": 0.20526492595672607, "kl": 0.068603515625, "learning_rate": 1.4712676781683288e-06, "loss": -0.0044, "reward": 0.0833333358168602, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.0833333358168602, "rewards/format_reward": 0.0, "step": 911 }, { "completion_length": 2032.3750610351562, "epoch": 1.1121951219512196, "grad_norm": 0.1633402556180954, "kl": 0.072265625, "learning_rate": 1.4680756559657292e-06, "loss": 0.1111, "reward": 0.2708333358168602, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.2708333358168602, "rewards/format_reward": 0.0, "step": 912 }, { "completion_length": 2198.3125610351562, "epoch": 1.1134146341463416, "grad_norm": 0.16198137402534485, "kl": 0.0711669921875, "learning_rate": 1.464883778389953e-06, "loss": 0.0763, "reward": 0.10416666977107525, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.10416666977107525, "rewards/format_reward": 0.0, "step": 913 }, { "completion_length": 1950.666748046875, "epoch": 1.1146341463414635, "grad_norm": 0.14530164003372192, "kl": 0.06396484375, "learning_rate": 1.4616920599011603e-06, "loss": 0.0776, "reward": 0.12500000558793545, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.12500000558793545, "rewards/format_reward": 0.0, "step": 914 }, { "completion_length": 1977.8541870117188, "epoch": 1.1158536585365855, "grad_norm": 0.1390020102262497, "kl": 0.067626953125, "learning_rate": 1.4585005149587903e-06, "loss": -0.0074, "reward": 0.2083333395421505, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.2083333395421505, "rewards/format_reward": 0.0, "step": 915 }, { "completion_length": 1876.2916870117188, "epoch": 1.1170731707317074, "grad_norm": 0.10858600586652756, "kl": 0.055908203125, "learning_rate": 1.4553091580214963e-06, "loss": 0.043, "reward": 0.1666666679084301, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.1666666679084301, "rewards/format_reward": 0.0, "step": 916 }, { "completion_length": 2212.8541870117188, "epoch": 1.1182926829268292, "grad_norm": 0.09590387344360352, "kl": 0.06396484375, "learning_rate": 1.452118003547079e-06, "loss": 0.0448, "reward": 0.1875, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.1875, "rewards/format_reward": 0.0, "step": 917 }, { "completion_length": 1853.041748046875, "epoch": 1.1195121951219513, "grad_norm": 0.11026542633771896, "kl": 0.055419921875, "learning_rate": 1.4489270659924222e-06, "loss": 0.0453, "reward": 0.2083333432674408, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.2083333432674408, "rewards/format_reward": 0.0, "step": 918 }, { "completion_length": 2107.979248046875, "epoch": 1.120731707317073, "grad_norm": 0.08363837003707886, "kl": 0.0634765625, "learning_rate": 1.4457363598134272e-06, "loss": 0.0395, "reward": 0.1458333432674408, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.1458333432674408, "rewards/format_reward": 0.0, "step": 919 }, { "completion_length": 1971.729248046875, "epoch": 1.1219512195121952, "grad_norm": 0.09001056849956512, "kl": 0.0648193359375, "learning_rate": 1.442545899464947e-06, "loss": 0.0588, "reward": 0.229166679084301, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.229166679084301, "rewards/format_reward": 0.0, "step": 920 }, { "completion_length": 1873.2916870117188, "epoch": 1.123170731707317, "grad_norm": 0.08485158532857895, "kl": 0.05810546875, "learning_rate": 1.4393556994007214e-06, "loss": 0.0457, "reward": 0.0416666679084301, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.0416666679084301, "rewards/format_reward": 0.0, "step": 921 }, { "completion_length": 1913.3125610351562, "epoch": 1.1243902439024391, "grad_norm": 0.34638532996177673, "kl": 0.089111328125, "learning_rate": 1.4361657740733103e-06, "loss": 0.2759, "reward": 0.291666679084301, "reward_std": 0.32475952059030533, "rewards/accuracy_reward": 0.291666679084301, "rewards/format_reward": 0.0, "step": 922 }, { "completion_length": 1689.25, "epoch": 1.1256097560975609, "grad_norm": 0.052538514137268066, "kl": 0.0592041015625, "learning_rate": 1.4329761379340283e-06, "loss": 0.0023, "reward": 0.125, "reward_std": 0.0, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "step": 923 }, { "completion_length": 2134.5833740234375, "epoch": 1.126829268292683, "grad_norm": 0.07250283658504486, "kl": 0.0577392578125, "learning_rate": 1.429786805432882e-06, "loss": 0.0028, "reward": 0.125, "reward_std": 0.0, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "step": 924 }, { "completion_length": 1905.3750610351562, "epoch": 1.1280487804878048, "grad_norm": 0.28468990325927734, "kl": 0.0673828125, "learning_rate": 1.4265977910185013e-06, "loss": 0.047, "reward": 0.0416666679084301, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.0416666679084301, "rewards/format_reward": 0.0, "step": 925 }, { "completion_length": 1934.1875, "epoch": 1.129268292682927, "grad_norm": 0.1081048771739006, "kl": 0.0699462890625, "learning_rate": 1.4234091091380743e-06, "loss": 0.0682, "reward": 0.06250000186264515, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.06250000186264515, "rewards/format_reward": 0.0, "step": 926 }, { "completion_length": 2407.8751220703125, "epoch": 1.1304878048780487, "grad_norm": 0.14978399872779846, "kl": 0.06689453125, "learning_rate": 1.420220774237284e-06, "loss": 0.0616, "reward": 0.1041666679084301, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.1041666679084301, "rewards/format_reward": 0.0, "step": 927 }, { "completion_length": 1975.25, "epoch": 1.1317073170731708, "grad_norm": 0.15296722948551178, "kl": 0.07373046875, "learning_rate": 1.4170328007602395e-06, "loss": 0.0282, "reward": 0.20833333395421505, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.20833333395421505, "rewards/format_reward": 0.0, "step": 928 }, { "completion_length": 1257.0625610351562, "epoch": 1.1329268292682926, "grad_norm": 0.2678631544113159, "kl": 0.07421875, "learning_rate": 1.4138452031494152e-06, "loss": 0.0924, "reward": 0.3958333358168602, "reward_std": 0.18042195588350296, "rewards/accuracy_reward": 0.3958333358168602, "rewards/format_reward": 0.0, "step": 929 }, { "completion_length": 1355.8333740234375, "epoch": 1.1341463414634148, "grad_norm": 0.24005337059497833, "kl": 0.067626953125, "learning_rate": 1.4106579958455812e-06, "loss": 0.0, "reward": 0.25, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.0, "step": 930 }, { "completion_length": 1582.2083740234375, "epoch": 1.1353658536585365, "grad_norm": 0.1580352932214737, "kl": 0.055908203125, "learning_rate": 1.4074711932877393e-06, "loss": -0.0031, "reward": 0.10416666977107525, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.10416666977107525, "rewards/format_reward": 0.0, "step": 931 }, { "completion_length": 1456.0, "epoch": 1.1365853658536587, "grad_norm": 0.27881920337677, "kl": 0.064453125, "learning_rate": 1.4042848099130574e-06, "loss": 0.0437, "reward": 0.2500000111758709, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.2500000111758709, "rewards/format_reward": 0.0, "step": 932 }, { "completion_length": 1452.5833435058594, "epoch": 1.1378048780487804, "grad_norm": 0.19155366718769073, "kl": 0.0677490234375, "learning_rate": 1.401098860156806e-06, "loss": 0.0382, "reward": 0.291666679084301, "reward_std": 0.14433757215738297, "rewards/accuracy_reward": 0.291666679084301, "rewards/format_reward": 0.0, "step": 933 }, { "completion_length": 2203.1459350585938, "epoch": 1.1390243902439026, "grad_norm": 0.14792536199092865, "kl": 0.0732421875, "learning_rate": 1.3979133584522893e-06, "loss": 0.0128, "reward": 0.1458333395421505, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.1458333395421505, "rewards/format_reward": 0.0, "step": 934 }, { "completion_length": 1776.0416870117188, "epoch": 1.1402439024390243, "grad_norm": 0.1104043573141098, "kl": 0.0654296875, "learning_rate": 1.3947283192307831e-06, "loss": 0.0232, "reward": 0.06250000186264515, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.06250000186264515, "rewards/format_reward": 0.0, "step": 935 }, { "completion_length": 1734.041748046875, "epoch": 1.1414634146341462, "grad_norm": 0.1387404352426529, "kl": 0.05712890625, "learning_rate": 1.391543756921467e-06, "loss": 0.0847, "reward": 0.06250000186264515, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.06250000186264515, "rewards/format_reward": 0.0, "step": 936 }, { "completion_length": 1762.75, "epoch": 1.1426829268292682, "grad_norm": 0.12016936391592026, "kl": 0.0592041015625, "learning_rate": 1.38835968595136e-06, "loss": 0.0617, "reward": 0.0833333358168602, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.0833333358168602, "rewards/format_reward": 0.0, "step": 937 }, { "completion_length": 1265.625, "epoch": 1.1439024390243901, "grad_norm": 0.23530256748199463, "kl": 0.0657958984375, "learning_rate": 1.3851761207452565e-06, "loss": 0.092, "reward": 0.3125, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.3125, "rewards/format_reward": 0.0, "step": 938 }, { "completion_length": 1543.0833740234375, "epoch": 1.145121951219512, "grad_norm": 0.24685853719711304, "kl": 0.0810546875, "learning_rate": 1.3819930757256585e-06, "loss": 0.2069, "reward": 0.25, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.0, "step": 939 }, { "completion_length": 1149.7083435058594, "epoch": 1.146341463414634, "grad_norm": 0.1647806018590927, "kl": 0.052490234375, "learning_rate": 1.3788105653127118e-06, "loss": 0.0116, "reward": 0.2500000149011612, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.2500000149011612, "rewards/format_reward": 0.0, "step": 940 }, { "completion_length": 1718.8958740234375, "epoch": 1.147560975609756, "grad_norm": 0.05026647076010704, "kl": 0.0635986328125, "learning_rate": 1.3756286039241397e-06, "loss": 0.0023, "reward": 0.125, "reward_std": 0.0, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "step": 941 }, { "completion_length": 1349.8958740234375, "epoch": 1.148780487804878, "grad_norm": 0.32085680961608887, "kl": 0.068115234375, "learning_rate": 1.3724472059751785e-06, "loss": 0.0641, "reward": 0.2083333358168602, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.2083333358168602, "rewards/format_reward": 0.0, "step": 942 }, { "completion_length": 1723.291748046875, "epoch": 1.15, "grad_norm": 0.1573391556739807, "kl": 0.0732421875, "learning_rate": 1.3692663858785126e-06, "loss": 0.1011, "reward": 0.1458333432674408, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.1458333432674408, "rewards/format_reward": 0.0, "step": 943 }, { "completion_length": 1943.6250610351562, "epoch": 1.1512195121951219, "grad_norm": 0.10186924040317535, "kl": 0.069580078125, "learning_rate": 1.3660861580442087e-06, "loss": 0.0807, "reward": 0.0625, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.0625, "rewards/format_reward": 0.0, "step": 944 }, { "completion_length": 1625.666748046875, "epoch": 1.1524390243902438, "grad_norm": 0.1980433315038681, "kl": 0.0654296875, "learning_rate": 1.3629065368796491e-06, "loss": 0.0834, "reward": 0.2708333432674408, "reward_std": 0.18042195588350296, "rewards/accuracy_reward": 0.2708333432674408, "rewards/format_reward": 0.0, "step": 945 }, { "completion_length": 1781.666748046875, "epoch": 1.1536585365853658, "grad_norm": 0.12164116650819778, "kl": 0.0703125, "learning_rate": 1.3597275367894676e-06, "loss": 0.0668, "reward": 0.06250000186264515, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.06250000186264515, "rewards/format_reward": 0.0, "step": 946 }, { "completion_length": 1444.5625610351562, "epoch": 1.1548780487804877, "grad_norm": 0.09009767323732376, "kl": 0.052978515625, "learning_rate": 1.3565491721754867e-06, "loss": 0.0462, "reward": 0.2708333432674408, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.2708333432674408, "rewards/format_reward": 0.0, "step": 947 }, { "completion_length": 2089.8126220703125, "epoch": 1.1560975609756097, "grad_norm": 0.17753665149211884, "kl": 0.073486328125, "learning_rate": 1.3533714574366473e-06, "loss": 0.0563, "reward": 0.1250000037252903, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.1250000037252903, "rewards/format_reward": 0.0, "step": 948 }, { "completion_length": 1314.375, "epoch": 1.1573170731707316, "grad_norm": 0.3049113154411316, "kl": 0.067626953125, "learning_rate": 1.3501944069689483e-06, "loss": 0.0457, "reward": 0.1666666716337204, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.1666666716337204, "rewards/format_reward": 0.0, "step": 949 }, { "completion_length": 1864.541748046875, "epoch": 1.1585365853658536, "grad_norm": 0.24489426612854004, "kl": 0.0654296875, "learning_rate": 1.3470180351653773e-06, "loss": 0.0629, "reward": 0.06250000186264515, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.06250000186264515, "rewards/format_reward": 0.0, "step": 950 }, { "completion_length": 1413.0625610351562, "epoch": 1.1597560975609755, "grad_norm": 0.08154662698507309, "kl": 0.049560546875, "learning_rate": 1.3438423564158484e-06, "loss": 0.0193, "reward": 0.1458333358168602, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.1458333358168602, "rewards/format_reward": 0.0, "step": 951 }, { "completion_length": 1927.9791870117188, "epoch": 1.1609756097560975, "grad_norm": 0.17261144518852234, "kl": 0.071533203125, "learning_rate": 1.3406673851071362e-06, "loss": 0.0681, "reward": 0.0625, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.0625, "rewards/format_reward": 0.0, "step": 952 }, { "completion_length": 1449.6250610351562, "epoch": 1.1621951219512194, "grad_norm": 0.13331378996372223, "kl": 0.0626220703125, "learning_rate": 1.3374931356228103e-06, "loss": 0.0051, "reward": 0.1875000074505806, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.1875000074505806, "rewards/format_reward": 0.0, "step": 953 }, { "completion_length": 1777.8959350585938, "epoch": 1.1634146341463414, "grad_norm": 0.08325041830539703, "kl": 0.0506591796875, "learning_rate": 1.3343196223431698e-06, "loss": 0.0292, "reward": 0.1041666679084301, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.1041666679084301, "rewards/format_reward": 0.0, "step": 954 }, { "completion_length": 2075.7708740234375, "epoch": 1.1646341463414633, "grad_norm": 0.16337166726589203, "kl": 0.070556640625, "learning_rate": 1.3311468596451785e-06, "loss": 0.154, "reward": 0.1666666679084301, "reward_std": 0.18042195588350296, "rewards/accuracy_reward": 0.1666666679084301, "rewards/format_reward": 0.0, "step": 955 }, { "completion_length": 1594.2708740234375, "epoch": 1.1658536585365853, "grad_norm": 0.2501789629459381, "kl": 0.0596923828125, "learning_rate": 1.3279748619023995e-06, "loss": 0.1364, "reward": 0.2708333432674408, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.2708333432674408, "rewards/format_reward": 0.0, "step": 956 }, { "completion_length": 1815.5208740234375, "epoch": 1.1670731707317072, "grad_norm": 0.07239025831222534, "kl": 0.05419921875, "learning_rate": 1.3248036434849319e-06, "loss": 0.026, "reward": 0.08333333395421505, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.08333333395421505, "rewards/format_reward": 0.0, "step": 957 }, { "completion_length": 1846.2083740234375, "epoch": 1.1682926829268292, "grad_norm": 0.24879348278045654, "kl": 0.063720703125, "learning_rate": 1.3216332187593434e-06, "loss": 0.1948, "reward": 0.2083333358168602, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.2083333358168602, "rewards/format_reward": 0.0, "step": 958 }, { "completion_length": 1296.4791870117188, "epoch": 1.1695121951219511, "grad_norm": 0.19259005784988403, "kl": 0.0633544921875, "learning_rate": 1.3184636020886058e-06, "loss": 0.0282, "reward": 0.2708333432674408, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.2708333432674408, "rewards/format_reward": 0.0, "step": 959 }, { "completion_length": 1535.0208740234375, "epoch": 1.170731707317073, "grad_norm": 0.2327294498682022, "kl": 0.0733642578125, "learning_rate": 1.3152948078320297e-06, "loss": 0.0974, "reward": 0.1458333358168602, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.1458333358168602, "rewards/format_reward": 0.0, "step": 960 }, { "completion_length": 1602.6458740234375, "epoch": 1.171951219512195, "grad_norm": 0.04861899092793465, "kl": 0.048828125, "learning_rate": 1.3121268503452014e-06, "loss": 0.0022, "reward": 0.0625, "reward_std": 0.0, "rewards/accuracy_reward": 0.0625, "rewards/format_reward": 0.0, "step": 961 }, { "completion_length": 1709.3750610351562, "epoch": 1.173170731707317, "grad_norm": 0.15221288800239563, "kl": 0.079833984375, "learning_rate": 1.3089597439799151e-06, "loss": -0.0102, "reward": 0.2083333358168602, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.2083333358168602, "rewards/format_reward": 0.0, "step": 962 }, { "completion_length": 1789.6458740234375, "epoch": 1.174390243902439, "grad_norm": 0.10455447435379028, "kl": 0.0743408203125, "learning_rate": 1.3057935030841096e-06, "loss": 0.0297, "reward": 0.08333333395421505, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.08333333395421505, "rewards/format_reward": 0.0, "step": 963 }, { "completion_length": 1864.4583740234375, "epoch": 1.175609756097561, "grad_norm": 0.0651104524731636, "kl": 0.060791015625, "learning_rate": 1.3026281420018034e-06, "loss": 0.0027, "reward": 0.125, "reward_std": 0.0, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "step": 964 }, { "completion_length": 1348.7500610351562, "epoch": 1.1768292682926829, "grad_norm": 0.45649102330207825, "kl": 0.0523681640625, "learning_rate": 1.2994636750730272e-06, "loss": 0.0784, "reward": 0.4166666716337204, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.4166666716337204, "rewards/format_reward": 0.0, "step": 965 }, { "completion_length": 1728.5833740234375, "epoch": 1.1780487804878048, "grad_norm": 0.2447793334722519, "kl": 0.0635986328125, "learning_rate": 1.2963001166337642e-06, "loss": 0.0616, "reward": 0.1250000037252903, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.1250000037252903, "rewards/format_reward": 0.0, "step": 966 }, { "completion_length": 1130.7083740234375, "epoch": 1.1792682926829268, "grad_norm": 0.19312424957752228, "kl": 0.0565185546875, "learning_rate": 1.2931374810158788e-06, "loss": 0.0029, "reward": 0.1666666716337204, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.1666666716337204, "rewards/format_reward": 0.0, "step": 967 }, { "completion_length": 1488.3333740234375, "epoch": 1.1804878048780487, "grad_norm": 0.6297222375869751, "kl": 0.0772705078125, "learning_rate": 1.2899757825470568e-06, "loss": 0.12, "reward": 0.2708333358168602, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.2708333358168602, "rewards/format_reward": 0.0, "step": 968 }, { "completion_length": 1177.229248046875, "epoch": 1.1817073170731707, "grad_norm": 0.3343561589717865, "kl": 0.0682373046875, "learning_rate": 1.2868150355507365e-06, "loss": 0.0889, "reward": 0.25000000558793545, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.25000000558793545, "rewards/format_reward": 0.0, "step": 969 }, { "completion_length": 1707.7083740234375, "epoch": 1.1829268292682926, "grad_norm": 0.3039189577102661, "kl": 0.071533203125, "learning_rate": 1.283655254346048e-06, "loss": -0.0803, "reward": 0.02083333395421505, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.02083333395421505, "rewards/format_reward": 0.0, "step": 970 }, { "completion_length": 1240.9166870117188, "epoch": 1.1841463414634146, "grad_norm": 0.08052528649568558, "kl": 0.054443359375, "learning_rate": 1.2804964532477444e-06, "loss": -0.0011, "reward": 0.14583333395421505, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.14583333395421505, "rewards/format_reward": 0.0, "step": 971 }, { "completion_length": 1953.7083740234375, "epoch": 1.1853658536585365, "grad_norm": 0.12600348889827728, "kl": 0.072509765625, "learning_rate": 1.2773386465661395e-06, "loss": 0.0391, "reward": 0.10416666977107525, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.10416666977107525, "rewards/format_reward": 0.0, "step": 972 }, { "completion_length": 1341.104248046875, "epoch": 1.1865853658536585, "grad_norm": 0.16983240842819214, "kl": 0.07568359375, "learning_rate": 1.2741818486070414e-06, "loss": 0.0405, "reward": 0.0625, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.0625, "rewards/format_reward": 0.0, "step": 973 }, { "completion_length": 2178.2708740234375, "epoch": 1.1878048780487804, "grad_norm": 0.17607451975345612, "kl": 0.0693359375, "learning_rate": 1.2710260736716882e-06, "loss": 0.1453, "reward": 0.1250000037252903, "reward_std": 0.18042195588350296, "rewards/accuracy_reward": 0.1250000037252903, "rewards/format_reward": 0.0, "step": 974 }, { "completion_length": 1656.8125, "epoch": 1.1890243902439024, "grad_norm": 0.14464262127876282, "kl": 0.066650390625, "learning_rate": 1.267871336056685e-06, "loss": 0.0822, "reward": 0.0416666679084301, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.0416666679084301, "rewards/format_reward": 0.0, "step": 975 }, { "completion_length": 1624.2291870117188, "epoch": 1.1902439024390243, "grad_norm": 0.1938604861497879, "kl": 0.064697265625, "learning_rate": 1.264717650053936e-06, "loss": 0.043, "reward": 0.1041666716337204, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.1041666716337204, "rewards/format_reward": 0.0, "step": 976 }, { "completion_length": 1119.3958740234375, "epoch": 1.1914634146341463, "grad_norm": 0.1611449122428894, "kl": 0.07421875, "learning_rate": 1.261565029950582e-06, "loss": 0.0598, "reward": 0.1875000111758709, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.1875000111758709, "rewards/format_reward": 0.0, "step": 977 }, { "completion_length": 1507.3958740234375, "epoch": 1.1926829268292682, "grad_norm": 0.17270319163799286, "kl": 0.0626220703125, "learning_rate": 1.2584134900289346e-06, "loss": 0.0633, "reward": 0.16666667722165585, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.16666667722165585, "rewards/format_reward": 0.0, "step": 978 }, { "completion_length": 1419.2708740234375, "epoch": 1.1939024390243902, "grad_norm": 0.20238234102725983, "kl": 0.0631103515625, "learning_rate": 1.255263044566411e-06, "loss": 0.0462, "reward": 0.2916666716337204, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.2916666716337204, "rewards/format_reward": 0.0, "step": 979 }, { "completion_length": 1453.7916870117188, "epoch": 1.1951219512195121, "grad_norm": 0.0876580998301506, "kl": 0.0594482421875, "learning_rate": 1.2521137078354728e-06, "loss": 0.0499, "reward": 0.08333333395421505, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.08333333395421505, "rewards/format_reward": 0.0, "step": 980 }, { "completion_length": 1724.25, "epoch": 1.196341463414634, "grad_norm": 0.13381439447402954, "kl": 0.079345703125, "learning_rate": 1.248965494103556e-06, "loss": 0.0347, "reward": 0.0625, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.0625, "rewards/format_reward": 0.0, "step": 981 }, { "completion_length": 1328.979248046875, "epoch": 1.197560975609756, "grad_norm": 0.21646268665790558, "kl": 0.063232421875, "learning_rate": 1.2458184176330102e-06, "loss": 0.0725, "reward": 0.1666666716337204, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.1666666716337204, "rewards/format_reward": 0.0, "step": 982 }, { "completion_length": 2082.6875610351562, "epoch": 1.198780487804878, "grad_norm": 0.22613485157489777, "kl": 0.0670166015625, "learning_rate": 1.2426724926810324e-06, "loss": 0.1372, "reward": 0.2916666716337204, "reward_std": 0.21650634706020355, "rewards/accuracy_reward": 0.2916666716337204, "rewards/format_reward": 0.0, "step": 983 }, { "completion_length": 1670.8958740234375, "epoch": 1.2, "grad_norm": 0.0773947536945343, "kl": 0.0517578125, "learning_rate": 1.2395277334996047e-06, "loss": 0.0454, "reward": 0.02083333395421505, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.02083333395421505, "rewards/format_reward": 0.0, "step": 984 }, { "completion_length": 1650.6458740234375, "epoch": 1.201219512195122, "grad_norm": 0.2956589460372925, "kl": 0.072509765625, "learning_rate": 1.2363841543354249e-06, "loss": 0.1364, "reward": 0.1666666716337204, "reward_std": 0.21650634706020355, "rewards/accuracy_reward": 0.1666666716337204, "rewards/format_reward": 0.0, "step": 985 }, { "completion_length": 1104.0, "epoch": 1.2024390243902439, "grad_norm": 0.05441882088780403, "kl": 0.0570068359375, "learning_rate": 1.2332417694298477e-06, "loss": 0.0025, "reward": 0.1875, "reward_std": 0.0, "rewards/accuracy_reward": 0.1875, "rewards/format_reward": 0.0, "step": 986 }, { "completion_length": 1141.2916870117188, "epoch": 1.2036585365853658, "grad_norm": 0.16385211050510406, "kl": 0.0511474609375, "learning_rate": 1.2301005930188156e-06, "loss": 0.0799, "reward": 0.229166679084301, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.229166679084301, "rewards/format_reward": 0.0, "step": 987 }, { "completion_length": 1503.9375, "epoch": 1.2048780487804878, "grad_norm": 0.18123921751976013, "kl": 0.083740234375, "learning_rate": 1.2269606393327968e-06, "loss": 0.0983, "reward": 0.2291666716337204, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.2291666716337204, "rewards/format_reward": 0.0, "step": 988 }, { "completion_length": 2004.0416870117188, "epoch": 1.2060975609756097, "grad_norm": 0.15406692028045654, "kl": 0.072265625, "learning_rate": 1.223821922596721e-06, "loss": 0.0735, "reward": 0.12500000558793545, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.12500000558793545, "rewards/format_reward": 0.0, "step": 989 }, { "completion_length": 1595.0833740234375, "epoch": 1.2073170731707317, "grad_norm": 1.0181471109390259, "kl": 0.0836181640625, "learning_rate": 1.2206844570299133e-06, "loss": 0.0722, "reward": 0.2708333432674408, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.2708333432674408, "rewards/format_reward": 0.0, "step": 990 }, { "completion_length": 1899.2708740234375, "epoch": 1.2085365853658536, "grad_norm": 0.20690324902534485, "kl": 0.07861328125, "learning_rate": 1.2175482568460306e-06, "loss": 0.091, "reward": 0.10416666977107525, "reward_std": 0.18042195588350296, "rewards/accuracy_reward": 0.10416666977107525, "rewards/format_reward": 0.0, "step": 991 }, { "completion_length": 1357.2083740234375, "epoch": 1.2097560975609756, "grad_norm": 0.17652636766433716, "kl": 0.066650390625, "learning_rate": 1.2144133362529974e-06, "loss": 0.0349, "reward": 0.2291666679084301, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.2291666679084301, "rewards/format_reward": 0.0, "step": 992 }, { "completion_length": 1758.3541870117188, "epoch": 1.2109756097560975, "grad_norm": 0.11993435025215149, "kl": 0.08349609375, "learning_rate": 1.2112797094529417e-06, "loss": 0.037, "reward": 0.10416666977107525, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.10416666977107525, "rewards/format_reward": 0.0, "step": 993 }, { "completion_length": 1989.9166870117188, "epoch": 1.2121951219512195, "grad_norm": 0.08298174291849136, "kl": 0.078369140625, "learning_rate": 1.2081473906421298e-06, "loss": 0.0033, "reward": 0.125, "reward_std": 0.0, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "step": 994 }, { "completion_length": 2159.7501220703125, "epoch": 1.2134146341463414, "grad_norm": 0.12719139456748962, "kl": 0.0771484375, "learning_rate": 1.205016394010902e-06, "loss": 0.0892, "reward": 0.08333333395421505, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.08333333395421505, "rewards/format_reward": 0.0, "step": 995 }, { "completion_length": 1851.9375, "epoch": 1.2146341463414634, "grad_norm": 0.1593027561903, "kl": 0.082763671875, "learning_rate": 1.20188673374361e-06, "loss": 0.0596, "reward": 0.0833333358168602, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.0833333358168602, "rewards/format_reward": 0.0, "step": 996 }, { "completion_length": 1873.791748046875, "epoch": 1.2158536585365853, "grad_norm": 0.11296737939119339, "kl": 0.090576171875, "learning_rate": 1.1987584240185492e-06, "loss": 0.0417, "reward": 0.2083333358168602, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.2083333358168602, "rewards/format_reward": 0.0, "step": 997 }, { "completion_length": 1995.5000610351562, "epoch": 1.2170731707317073, "grad_norm": 0.3072722852230072, "kl": 0.101806640625, "learning_rate": 1.1956314790078998e-06, "loss": 0.0852, "reward": 0.0833333358168602, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.0833333358168602, "rewards/format_reward": 0.0, "step": 998 }, { "completion_length": 1843.1875, "epoch": 1.2182926829268292, "grad_norm": 0.4904327094554901, "kl": 0.090087890625, "learning_rate": 1.1925059128776567e-06, "loss": -0.0469, "reward": 0.2291666716337204, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.2291666716337204, "rewards/format_reward": 0.0, "step": 999 }, { "completion_length": 2362.6458740234375, "epoch": 1.2195121951219512, "grad_norm": 0.13770951330661774, "kl": 0.08740234375, "learning_rate": 1.189381739787569e-06, "loss": 0.0506, "reward": 0.0625, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.0625, "rewards/format_reward": 0.0, "step": 1000 }, { "completion_length": 1484.8541870117188, "epoch": 1.2207317073170731, "grad_norm": 0.1202976256608963, "kl": 0.090576171875, "learning_rate": 1.1862589738910754e-06, "loss": 0.0066, "reward": 0.1458333358168602, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.1458333358168602, "rewards/format_reward": 0.0, "step": 1001 }, { "completion_length": 2300.64599609375, "epoch": 1.221951219512195, "grad_norm": 0.1438293159008026, "kl": 0.09375, "learning_rate": 1.1831376293352378e-06, "loss": 0.0807, "reward": 0.06250000186264515, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.06250000186264515, "rewards/format_reward": 0.0, "step": 1002 }, { "completion_length": 1858.5208740234375, "epoch": 1.223170731707317, "grad_norm": 0.14884722232818604, "kl": 0.094482421875, "learning_rate": 1.180017720260682e-06, "loss": 0.0427, "reward": 0.1666666716337204, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.1666666716337204, "rewards/format_reward": 0.0, "step": 1003 }, { "completion_length": 2296.1666870117188, "epoch": 1.224390243902439, "grad_norm": 0.07314486056566238, "kl": 0.0830078125, "learning_rate": 1.176899260801528e-06, "loss": 0.0189, "reward": 0.02083333395421505, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.02083333395421505, "rewards/format_reward": 0.0, "step": 1004 }, { "completion_length": 1876.5000610351562, "epoch": 1.225609756097561, "grad_norm": 0.18909896910190582, "kl": 0.08642578125, "learning_rate": 1.1737822650853301e-06, "loss": 0.0626, "reward": 0.12500000558793545, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.12500000558793545, "rewards/format_reward": 0.0, "step": 1005 }, { "completion_length": 1705.0833740234375, "epoch": 1.226829268292683, "grad_norm": 0.22585119307041168, "kl": 0.087890625, "learning_rate": 1.1706667472330101e-06, "loss": 0.0876, "reward": 0.1458333358168602, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.1458333358168602, "rewards/format_reward": 0.0, "step": 1006 }, { "completion_length": 1782.8333740234375, "epoch": 1.2280487804878049, "grad_norm": 0.11418358981609344, "kl": 0.0859375, "learning_rate": 1.1675527213587963e-06, "loss": 0.0773, "reward": 0.25, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.0, "step": 1007 }, { "completion_length": 2065.7500610351562, "epoch": 1.2292682926829268, "grad_norm": 0.14432381093502045, "kl": 0.08056640625, "learning_rate": 1.1644402015701568e-06, "loss": 0.079, "reward": 0.2083333432674408, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.2083333432674408, "rewards/format_reward": 0.0, "step": 1008 }, { "completion_length": 2061.9584350585938, "epoch": 1.2304878048780488, "grad_norm": 0.09090807288885117, "kl": 0.0849609375, "learning_rate": 1.1613292019677364e-06, "loss": 0.0414, "reward": 0.1666666679084301, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.1666666679084301, "rewards/format_reward": 0.0, "step": 1009 }, { "completion_length": 2231.854248046875, "epoch": 1.2317073170731707, "grad_norm": 0.23062652349472046, "kl": 0.098388671875, "learning_rate": 1.158219736645294e-06, "loss": 0.1195, "reward": 0.2708333432674408, "reward_std": 0.18042195588350296, "rewards/accuracy_reward": 0.2708333432674408, "rewards/format_reward": 0.0, "step": 1010 }, { "completion_length": 2217.604248046875, "epoch": 1.2329268292682927, "grad_norm": 0.18628190457820892, "kl": 0.0751953125, "learning_rate": 1.1551118196896364e-06, "loss": 0.0647, "reward": 0.1666666716337204, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.1666666716337204, "rewards/format_reward": 0.0, "step": 1011 }, { "completion_length": 2098.9584350585938, "epoch": 1.2341463414634146, "grad_norm": 0.2369871586561203, "kl": 0.0693359375, "learning_rate": 1.152005465180558e-06, "loss": 0.0654, "reward": 0.1458333395421505, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.1458333395421505, "rewards/format_reward": 0.0, "step": 1012 }, { "completion_length": 1765.229248046875, "epoch": 1.2353658536585366, "grad_norm": 0.21466252207756042, "kl": 0.100830078125, "learning_rate": 1.1489006871907728e-06, "loss": 0.0611, "reward": 0.2500000074505806, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.2500000074505806, "rewards/format_reward": 0.0, "step": 1013 }, { "completion_length": 2238.541748046875, "epoch": 1.2365853658536585, "grad_norm": 0.1744280606508255, "kl": 0.085205078125, "learning_rate": 1.145797499785853e-06, "loss": 0.0862, "reward": 0.1250000037252903, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.1250000037252903, "rewards/format_reward": 0.0, "step": 1014 }, { "completion_length": 1973.291748046875, "epoch": 1.2378048780487805, "grad_norm": 0.12297386676073074, "kl": 0.076904296875, "learning_rate": 1.1426959170241663e-06, "loss": 0.0355, "reward": 0.2500000074505806, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.2500000074505806, "rewards/format_reward": 0.0, "step": 1015 }, { "completion_length": 1690.6458740234375, "epoch": 1.2390243902439024, "grad_norm": 0.2074672430753708, "kl": 0.070556640625, "learning_rate": 1.1395959529568088e-06, "loss": -0.0172, "reward": 0.0833333358168602, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.0833333358168602, "rewards/format_reward": 0.0, "step": 1016 }, { "completion_length": 2298.729248046875, "epoch": 1.2402439024390244, "grad_norm": 0.19512587785720825, "kl": 0.09228515625, "learning_rate": 1.1364976216275462e-06, "loss": 0.1496, "reward": 0.1458333358168602, "reward_std": 0.14433755725622177, "rewards/accuracy_reward": 0.1458333358168602, "rewards/format_reward": 0.0, "step": 1017 }, { "completion_length": 2004.3125, "epoch": 1.2414634146341463, "grad_norm": 0.09646455943584442, "kl": 0.088134765625, "learning_rate": 1.1334009370727446e-06, "loss": 0.0021, "reward": 0.0416666679084301, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.0416666679084301, "rewards/format_reward": 0.0, "step": 1018 }, { "completion_length": 1802.666748046875, "epoch": 1.2426829268292683, "grad_norm": 0.176173135638237, "kl": 0.0849609375, "learning_rate": 1.1303059133213115e-06, "loss": 0.048, "reward": 0.1250000037252903, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.1250000037252903, "rewards/format_reward": 0.0, "step": 1019 }, { "completion_length": 1923.6250610351562, "epoch": 1.2439024390243902, "grad_norm": 0.19647178053855896, "kl": 0.08837890625, "learning_rate": 1.127212564394629e-06, "loss": 0.0656, "reward": 0.1458333395421505, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.1458333395421505, "rewards/format_reward": 0.0, "step": 1020 }, { "completion_length": 1745.8541870117188, "epoch": 1.2451219512195122, "grad_norm": 0.14779818058013916, "kl": 0.093994140625, "learning_rate": 1.1241209043064944e-06, "loss": 0.0082, "reward": 0.2500000149011612, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.2500000149011612, "rewards/format_reward": 0.0, "step": 1021 }, { "completion_length": 1331.8333740234375, "epoch": 1.2463414634146341, "grad_norm": 0.1114281564950943, "kl": 0.0633544921875, "learning_rate": 1.1210309470630509e-06, "loss": 0.0497, "reward": 0.1041666716337204, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.1041666716337204, "rewards/format_reward": 0.0, "step": 1022 }, { "completion_length": 1473.0833740234375, "epoch": 1.247560975609756, "grad_norm": 0.19259703159332275, "kl": 0.090087890625, "learning_rate": 1.1179427066627292e-06, "loss": 0.0952, "reward": 0.3958333432674408, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.3958333432674408, "rewards/format_reward": 0.0, "step": 1023 }, { "completion_length": 1584.3958740234375, "epoch": 1.248780487804878, "grad_norm": 0.11709049344062805, "kl": 0.075439453125, "learning_rate": 1.1148561970961818e-06, "loss": 0.0269, "reward": 0.10416666977107525, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.10416666977107525, "rewards/format_reward": 0.0, "step": 1024 }, { "completion_length": 1366.7916870117188, "epoch": 1.25, "grad_norm": 0.40091991424560547, "kl": 0.078369140625, "learning_rate": 1.1117714323462188e-06, "loss": 0.2077, "reward": 0.3333333432674408, "reward_std": 0.18042195588350296, "rewards/accuracy_reward": 0.3333333432674408, "rewards/format_reward": 0.0, "step": 1025 }, { "completion_length": 1623.0, "epoch": 1.251219512195122, "grad_norm": 0.5116755962371826, "kl": 0.0791015625, "learning_rate": 1.1086884263877486e-06, "loss": 0.0486, "reward": 0.1666666716337204, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.1666666716337204, "rewards/format_reward": 0.0, "step": 1026 }, { "completion_length": 1503.2500610351562, "epoch": 1.252439024390244, "grad_norm": 0.1446014642715454, "kl": 0.06787109375, "learning_rate": 1.105607193187709e-06, "loss": 0.0505, "reward": 0.3541666865348816, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.3541666865348816, "rewards/format_reward": 0.0, "step": 1027 }, { "completion_length": 2115.2083740234375, "epoch": 1.2536585365853659, "grad_norm": 0.16461709141731262, "kl": 0.084228515625, "learning_rate": 1.1025277467050079e-06, "loss": 0.1069, "reward": 0.2291666679084301, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.2291666679084301, "rewards/format_reward": 0.0, "step": 1028 }, { "completion_length": 2372.0626220703125, "epoch": 1.2548780487804878, "grad_norm": 0.13272832334041595, "kl": 0.08837890625, "learning_rate": 1.0994501008904578e-06, "loss": 0.0822, "reward": 0.2083333358168602, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.2083333358168602, "rewards/format_reward": 0.0, "step": 1029 }, { "completion_length": 1803.625, "epoch": 1.2560975609756098, "grad_norm": 0.11471788585186005, "kl": 0.0621337890625, "learning_rate": 1.0963742696867162e-06, "loss": 0.05, "reward": 0.1875, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.1875, "rewards/format_reward": 0.0, "step": 1030 }, { "completion_length": 1907.666748046875, "epoch": 1.2573170731707317, "grad_norm": 0.06946831941604614, "kl": 0.0606689453125, "learning_rate": 1.093300267028217e-06, "loss": 0.0384, "reward": 0.02083333395421505, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.02083333395421505, "rewards/format_reward": 0.0, "step": 1031 }, { "completion_length": 2037.0208740234375, "epoch": 1.2585365853658537, "grad_norm": 0.11572614312171936, "kl": 0.080322265625, "learning_rate": 1.0902281068411114e-06, "loss": 0.1012, "reward": 0.1458333358168602, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.1458333358168602, "rewards/format_reward": 0.0, "step": 1032 }, { "completion_length": 1929.6041870117188, "epoch": 1.2597560975609756, "grad_norm": 0.2134256511926651, "kl": 0.087158203125, "learning_rate": 1.0871578030432038e-06, "loss": 0.0158, "reward": 0.0416666679084301, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.0416666679084301, "rewards/format_reward": 0.0, "step": 1033 }, { "completion_length": 2304.041748046875, "epoch": 1.2609756097560976, "grad_norm": 0.22080178558826447, "kl": 0.08544921875, "learning_rate": 1.084089369543888e-06, "loss": 0.039, "reward": 0.1458333395421505, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.1458333395421505, "rewards/format_reward": 0.0, "step": 1034 }, { "completion_length": 1821.5000610351562, "epoch": 1.2621951219512195, "grad_norm": 0.13844181597232819, "kl": 0.071044921875, "learning_rate": 1.0810228202440862e-06, "loss": 0.0619, "reward": 0.1875000074505806, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.1875000074505806, "rewards/format_reward": 0.0, "step": 1035 }, { "completion_length": 2222.666748046875, "epoch": 1.2634146341463415, "grad_norm": 0.19903665781021118, "kl": 0.08935546875, "learning_rate": 1.077958169036183e-06, "loss": 0.0926, "reward": 0.1458333395421505, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.1458333395421505, "rewards/format_reward": 0.0, "step": 1036 }, { "completion_length": 2523.354248046875, "epoch": 1.2646341463414634, "grad_norm": 0.10722126066684723, "kl": 0.089599609375, "learning_rate": 1.0748954298039644e-06, "loss": 0.0036, "reward": 0.0416666679084301, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.0416666679084301, "rewards/format_reward": 0.0, "step": 1037 }, { "completion_length": 1969.0000610351562, "epoch": 1.2658536585365854, "grad_norm": 0.3397144079208374, "kl": 0.067138671875, "learning_rate": 1.0718346164225556e-06, "loss": 0.058, "reward": 0.1666666716337204, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.1666666716337204, "rewards/format_reward": 0.0, "step": 1038 }, { "completion_length": 1652.0833740234375, "epoch": 1.2670731707317073, "grad_norm": 0.07621407508850098, "kl": 0.06396484375, "learning_rate": 1.0687757427583553e-06, "loss": 0.0029, "reward": 0.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.0, "step": 1039 }, { "completion_length": 1543.7708740234375, "epoch": 1.2682926829268293, "grad_norm": 0.22013604640960693, "kl": 0.065673828125, "learning_rate": 1.0657188226689772e-06, "loss": 0.0763, "reward": 0.2708333358168602, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.2708333358168602, "rewards/format_reward": 0.0, "step": 1040 }, { "completion_length": 1977.7708740234375, "epoch": 1.2695121951219512, "grad_norm": 0.17711366713047028, "kl": 0.0782470703125, "learning_rate": 1.0626638700031825e-06, "loss": 0.1155, "reward": 0.2083333358168602, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.2083333358168602, "rewards/format_reward": 0.0, "step": 1041 }, { "completion_length": 2069.291748046875, "epoch": 1.2707317073170732, "grad_norm": 0.22869975864887238, "kl": 0.0601806640625, "learning_rate": 1.0596108986008203e-06, "loss": 0.0031, "reward": 0.20833333395421505, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.20833333395421505, "rewards/format_reward": 0.0, "step": 1042 }, { "completion_length": 1787.7709350585938, "epoch": 1.2719512195121951, "grad_norm": 0.10908153653144836, "kl": 0.060302734375, "learning_rate": 1.0565599222927637e-06, "loss": 0.0489, "reward": 0.02083333395421505, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.02083333395421505, "rewards/format_reward": 0.0, "step": 1043 }, { "completion_length": 2007.4584350585938, "epoch": 1.273170731707317, "grad_norm": 0.15364407002925873, "kl": 0.062744140625, "learning_rate": 1.0535109549008482e-06, "loss": 0.1277, "reward": 0.1875000074505806, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.1875000074505806, "rewards/format_reward": 0.0, "step": 1044 }, { "completion_length": 1540.0000610351562, "epoch": 1.274390243902439, "grad_norm": 0.3069708049297333, "kl": 0.0723876953125, "learning_rate": 1.0504640102378075e-06, "loss": 0.0393, "reward": 0.2083333358168602, "reward_std": 0.18042195588350296, "rewards/accuracy_reward": 0.2083333358168602, "rewards/format_reward": 0.0, "step": 1045 }, { "completion_length": 2054.4375610351562, "epoch": 1.275609756097561, "grad_norm": 0.16431839764118195, "kl": 0.086669921875, "learning_rate": 1.0474191021072117e-06, "loss": 0.1164, "reward": 0.1666666716337204, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.1666666716337204, "rewards/format_reward": 0.0, "step": 1046 }, { "completion_length": 1451.1875610351562, "epoch": 1.276829268292683, "grad_norm": 0.17560173571109772, "kl": 0.078125, "learning_rate": 1.0443762443034054e-06, "loss": 0.0601, "reward": 0.3333333544433117, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.3333333544433117, "rewards/format_reward": 0.0, "step": 1047 }, { "completion_length": 1978.291748046875, "epoch": 1.278048780487805, "grad_norm": 39.92226791381836, "kl": 0.48486328125, "learning_rate": 1.0413354506114434e-06, "loss": 0.0442, "reward": 0.1875, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.1875, "rewards/format_reward": 0.0, "step": 1048 }, { "completion_length": 2396.5625, "epoch": 1.2792682926829269, "grad_norm": 0.09802167117595673, "kl": 0.072265625, "learning_rate": 1.0382967348070315e-06, "loss": 0.0046, "reward": 0.0833333358168602, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.0833333358168602, "rewards/format_reward": 0.0, "step": 1049 }, { "completion_length": 1147.1458740234375, "epoch": 1.2804878048780488, "grad_norm": 0.10195109248161316, "kl": 0.085205078125, "learning_rate": 1.0352601106564607e-06, "loss": 0.0376, "reward": 0.3958333432674408, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.3958333432674408, "rewards/format_reward": 0.0, "step": 1050 }, { "completion_length": 2029.75, "epoch": 1.2817073170731708, "grad_norm": 0.2630758285522461, "kl": 0.08642578125, "learning_rate": 1.0322255919165456e-06, "loss": 0.1425, "reward": 0.1875, "reward_std": 0.25259073823690414, "rewards/accuracy_reward": 0.1875, "rewards/format_reward": 0.0, "step": 1051 }, { "completion_length": 1347.8750610351562, "epoch": 1.2829268292682927, "grad_norm": 0.3488813042640686, "kl": 0.081298828125, "learning_rate": 1.0291931923345635e-06, "loss": 0.1615, "reward": 0.229166679084301, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.229166679084301, "rewards/format_reward": 0.0, "step": 1052 }, { "completion_length": 1338.7708740234375, "epoch": 1.2841463414634147, "grad_norm": 0.20774394273757935, "kl": 0.06396484375, "learning_rate": 1.0261629256481923e-06, "loss": 0.0317, "reward": 0.3333333432674408, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.3333333432674408, "rewards/format_reward": 0.0, "step": 1053 }, { "completion_length": 1485.0208740234375, "epoch": 1.2853658536585366, "grad_norm": 0.2633892595767975, "kl": 0.06982421875, "learning_rate": 1.0231348055854452e-06, "loss": 0.025, "reward": 0.1250000037252903, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.1250000037252903, "rewards/format_reward": 0.0, "step": 1054 }, { "completion_length": 1755.8125610351562, "epoch": 1.2865853658536586, "grad_norm": 0.17884060740470886, "kl": 0.0849609375, "learning_rate": 1.0201088458646118e-06, "loss": 0.1126, "reward": 0.1875000074505806, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.1875000074505806, "rewards/format_reward": 0.0, "step": 1055 }, { "completion_length": 1367.5833740234375, "epoch": 1.2878048780487805, "grad_norm": 0.10132281482219696, "kl": 0.0611572265625, "learning_rate": 1.0170850601941937e-06, "loss": 0.0576, "reward": 0.14583333395421505, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.14583333395421505, "rewards/format_reward": 0.0, "step": 1056 }, { "completion_length": 1844.3125610351562, "epoch": 1.2890243902439025, "grad_norm": 0.19846048951148987, "kl": 0.075439453125, "learning_rate": 1.0140634622728447e-06, "loss": 0.1055, "reward": 0.25, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.0, "step": 1057 }, { "completion_length": 1462.6875610351562, "epoch": 1.2902439024390244, "grad_norm": 0.3963718116283417, "kl": 0.0986328125, "learning_rate": 1.0110440657893074e-06, "loss": 0.2671, "reward": 0.2916666716337204, "reward_std": 0.21650634706020355, "rewards/accuracy_reward": 0.2916666716337204, "rewards/format_reward": 0.0, "step": 1058 }, { "completion_length": 1539.416748046875, "epoch": 1.2914634146341464, "grad_norm": 0.13757258653640747, "kl": 0.062744140625, "learning_rate": 1.00802688442235e-06, "loss": 0.0992, "reward": 0.1666666716337204, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.1666666716337204, "rewards/format_reward": 0.0, "step": 1059 }, { "completion_length": 1308.8541870117188, "epoch": 1.2926829268292683, "grad_norm": 0.16897155344486237, "kl": 0.08642578125, "learning_rate": 1.0050119318407061e-06, "loss": 0.012, "reward": 0.0833333358168602, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.0833333358168602, "rewards/format_reward": 0.0, "step": 1060 }, { "completion_length": 1575.2500610351562, "epoch": 1.2939024390243903, "grad_norm": 0.0865950658917427, "kl": 0.075439453125, "learning_rate": 1.0019992217030127e-06, "loss": 0.0035, "reward": 0.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.0, "step": 1061 }, { "completion_length": 1565.416748046875, "epoch": 1.2951219512195122, "grad_norm": 0.09194821119308472, "kl": 0.06787109375, "learning_rate": 9.98988767657747e-07, "loss": 0.0448, "reward": 0.1666666679084301, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.1666666679084301, "rewards/format_reward": 0.0, "step": 1062 }, { "completion_length": 2212.7083740234375, "epoch": 1.2963414634146342, "grad_norm": 0.17093273997306824, "kl": 0.096923828125, "learning_rate": 9.95980583343167e-07, "loss": 0.1133, "reward": 0.2291666716337204, "reward_std": 0.18042195588350296, "rewards/accuracy_reward": 0.2291666716337204, "rewards/format_reward": 0.0, "step": 1063 }, { "completion_length": 1542.416748046875, "epoch": 1.2975609756097561, "grad_norm": 0.24196889996528625, "kl": 0.0828857421875, "learning_rate": 9.929746823872462e-07, "loss": 0.1133, "reward": 0.2708333432674408, "reward_std": 0.25259076058864594, "rewards/accuracy_reward": 0.2708333432674408, "rewards/format_reward": 0.0, "step": 1064 }, { "completion_length": 1419.354248046875, "epoch": 1.298780487804878, "grad_norm": 0.27168744802474976, "kl": 0.0782470703125, "learning_rate": 9.899710784076147e-07, "loss": 0.1474, "reward": 0.0833333358168602, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.0833333358168602, "rewards/format_reward": 0.0, "step": 1065 }, { "completion_length": 1980.7708740234375, "epoch": 1.3, "grad_norm": 0.26821696758270264, "kl": 0.076171875, "learning_rate": 9.86969785011497e-07, "loss": 0.1667, "reward": 0.1875000111758709, "reward_std": 0.18042195588350296, "rewards/accuracy_reward": 0.1875000111758709, "rewards/format_reward": 0.0, "step": 1066 }, { "completion_length": 1196.5625610351562, "epoch": 1.301219512195122, "grad_norm": 0.06358644366264343, "kl": 0.0511474609375, "learning_rate": 9.839708157956493e-07, "loss": 0.0025, "reward": 0.0625, "reward_std": 0.0, "rewards/accuracy_reward": 0.0625, "rewards/format_reward": 0.0, "step": 1067 }, { "completion_length": 1702.9583740234375, "epoch": 1.302439024390244, "grad_norm": 0.20010869204998016, "kl": 0.0687255859375, "learning_rate": 9.809741843462994e-07, "loss": -0.016, "reward": 0.1250000037252903, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.1250000037252903, "rewards/format_reward": 0.0, "step": 1068 }, { "completion_length": 1781.2708740234375, "epoch": 1.303658536585366, "grad_norm": 0.1712142527103424, "kl": 0.075927734375, "learning_rate": 9.779799042390833e-07, "loss": 0.0519, "reward": 0.10416666977107525, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.10416666977107525, "rewards/format_reward": 0.0, "step": 1069 }, { "completion_length": 1473.0625, "epoch": 1.3048780487804879, "grad_norm": 0.21228957176208496, "kl": 0.0660400390625, "learning_rate": 9.749879890389848e-07, "loss": 0.1291, "reward": 0.1666666716337204, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.1666666716337204, "rewards/format_reward": 0.0, "step": 1070 }, { "completion_length": 1565.25, "epoch": 1.3060975609756098, "grad_norm": 0.17412599921226501, "kl": 0.072998046875, "learning_rate": 9.719984523002745e-07, "loss": 0.1064, "reward": 0.1666666679084301, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.1666666679084301, "rewards/format_reward": 0.0, "step": 1071 }, { "completion_length": 1986.8959350585938, "epoch": 1.3073170731707318, "grad_norm": 0.1549442559480667, "kl": 0.09912109375, "learning_rate": 9.690113075664488e-07, "loss": 0.0417, "reward": 0.0625, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.0625, "rewards/format_reward": 0.0, "step": 1072 }, { "completion_length": 1379.1875610351562, "epoch": 1.3085365853658537, "grad_norm": 0.18815653026103973, "kl": 0.114013671875, "learning_rate": 9.660265683701652e-07, "loss": 0.1036, "reward": 0.3125, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.3125, "rewards/format_reward": 0.0, "step": 1073 }, { "completion_length": 1312.9375610351562, "epoch": 1.3097560975609757, "grad_norm": 1.0379447937011719, "kl": 0.10986328125, "learning_rate": 9.630442482331853e-07, "loss": 0.0266, "reward": 0.2916666716337204, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.2916666716337204, "rewards/format_reward": 0.0, "step": 1074 }, { "completion_length": 1759.0208740234375, "epoch": 1.3109756097560976, "grad_norm": 0.1436193436384201, "kl": 0.0869140625, "learning_rate": 9.600643606663104e-07, "loss": 0.0027, "reward": 0.1041666679084301, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.1041666679084301, "rewards/format_reward": 0.0, "step": 1075 }, { "completion_length": 1469.4583740234375, "epoch": 1.3121951219512196, "grad_norm": 0.23053094744682312, "kl": 0.0869140625, "learning_rate": 9.57086919169323e-07, "loss": 0.1141, "reward": 0.2083333432674408, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.2083333432674408, "rewards/format_reward": 0.0, "step": 1076 }, { "completion_length": 1901.1458740234375, "epoch": 1.3134146341463415, "grad_norm": 0.15696841478347778, "kl": 0.1025390625, "learning_rate": 9.541119372309233e-07, "loss": 0.016, "reward": 0.1666666716337204, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.1666666716337204, "rewards/format_reward": 0.0, "step": 1077 }, { "completion_length": 1591.1250610351562, "epoch": 1.3146341463414635, "grad_norm": 0.3517068326473236, "kl": 0.101318359375, "learning_rate": 9.511394283286686e-07, "loss": 0.0476, "reward": 0.08333333395421505, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.08333333395421505, "rewards/format_reward": 0.0, "step": 1078 }, { "completion_length": 1614.5, "epoch": 1.3158536585365854, "grad_norm": 0.14726942777633667, "kl": 0.083984375, "learning_rate": 9.481694059289126e-07, "loss": 0.0617, "reward": 0.1666666716337204, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.1666666716337204, "rewards/format_reward": 0.0, "step": 1079 }, { "completion_length": 1494.6458740234375, "epoch": 1.3170731707317074, "grad_norm": 0.36606818437576294, "kl": 0.111083984375, "learning_rate": 9.452018834867454e-07, "loss": 0.0607, "reward": 0.3125, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.3125, "rewards/format_reward": 0.0, "step": 1080 }, { "completion_length": 1964.5001220703125, "epoch": 1.3182926829268293, "grad_norm": 0.2021106332540512, "kl": 0.10595703125, "learning_rate": 9.422368744459309e-07, "loss": 0.0075, "reward": 0.1666666716337204, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.1666666716337204, "rewards/format_reward": 0.0, "step": 1081 }, { "completion_length": 1491.229248046875, "epoch": 1.3195121951219513, "grad_norm": 0.17890433967113495, "kl": 0.095703125, "learning_rate": 9.392743922388469e-07, "loss": 0.0402, "reward": 0.1666666716337204, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.1666666716337204, "rewards/format_reward": 0.0, "step": 1082 }, { "completion_length": 1896.416748046875, "epoch": 1.3207317073170732, "grad_norm": 0.11118951439857483, "kl": 0.094970703125, "learning_rate": 9.363144502864233e-07, "loss": 0.0042, "reward": 0.125, "reward_std": 0.0, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "step": 1083 }, { "completion_length": 2154.854248046875, "epoch": 1.3219512195121952, "grad_norm": 0.21695645153522491, "kl": 0.109619140625, "learning_rate": 9.333570619980818e-07, "loss": 0.0447, "reward": 0.18750000558793545, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.18750000558793545, "rewards/format_reward": 0.0, "step": 1084 }, { "completion_length": 2111.729248046875, "epoch": 1.3231707317073171, "grad_norm": 0.6912495493888855, "kl": 0.12744140625, "learning_rate": 9.304022407716754e-07, "loss": 0.051, "reward": 0.1250000037252903, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.1250000037252903, "rewards/format_reward": 0.0, "step": 1085 }, { "completion_length": 2149.9375, "epoch": 1.324390243902439, "grad_norm": 0.11633794009685516, "kl": 0.095458984375, "learning_rate": 9.27449999993429e-07, "loss": 0.0043, "reward": 0.0625, "reward_std": 0.0, "rewards/accuracy_reward": 0.0625, "rewards/format_reward": 0.0, "step": 1086 }, { "completion_length": 1870.3541870117188, "epoch": 1.325609756097561, "grad_norm": 0.1971844732761383, "kl": 0.091552734375, "learning_rate": 9.245003530378752e-07, "loss": 0.0271, "reward": 0.16666667722165585, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.16666667722165585, "rewards/format_reward": 0.0, "step": 1087 }, { "completion_length": 1702.9791870117188, "epoch": 1.326829268292683, "grad_norm": 0.13145369291305542, "kl": 0.09375, "learning_rate": 9.215533132677969e-07, "loss": 0.0305, "reward": 0.2916666716337204, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.2916666716337204, "rewards/format_reward": 0.0, "step": 1088 }, { "completion_length": 2143.416748046875, "epoch": 1.328048780487805, "grad_norm": 0.18550920486450195, "kl": 0.0869140625, "learning_rate": 9.186088940341646e-07, "loss": 0.076, "reward": 0.1875, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.1875, "rewards/format_reward": 0.0, "step": 1089 }, { "completion_length": 2141.5833740234375, "epoch": 1.329268292682927, "grad_norm": 0.1265704482793808, "kl": 0.105712890625, "learning_rate": 9.156671086760788e-07, "loss": 0.0046, "reward": 0.02083333395421505, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.02083333395421505, "rewards/format_reward": 0.0, "step": 1090 }, { "completion_length": 1475.0208740234375, "epoch": 1.3304878048780489, "grad_norm": 0.23668980598449707, "kl": 0.091552734375, "learning_rate": 9.127279705207067e-07, "loss": 0.057, "reward": 0.1875000074505806, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.1875000074505806, "rewards/format_reward": 0.0, "step": 1091 }, { "completion_length": 2198.1666870117188, "epoch": 1.3317073170731708, "grad_norm": 0.08822015672922134, "kl": 0.09765625, "learning_rate": 9.097914928832228e-07, "loss": 0.0103, "reward": 0.08333333395421505, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.08333333395421505, "rewards/format_reward": 0.0, "step": 1092 }, { "completion_length": 2122.916748046875, "epoch": 1.3329268292682928, "grad_norm": 0.4342389702796936, "kl": 0.078369140625, "learning_rate": 9.068576890667484e-07, "loss": 0.0189, "reward": 0.02083333395421505, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.02083333395421505, "rewards/format_reward": 0.0, "step": 1093 }, { "completion_length": 1558.9791870117188, "epoch": 1.3341463414634147, "grad_norm": 0.06558932363986969, "kl": 0.079833984375, "learning_rate": 9.039265723622923e-07, "loss": 0.0429, "reward": 0.1458333358168602, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.1458333358168602, "rewards/format_reward": 0.0, "step": 1094 }, { "completion_length": 2063.6041870117188, "epoch": 1.3353658536585367, "grad_norm": 0.07094716280698776, "kl": 0.072509765625, "learning_rate": 9.009981560486894e-07, "loss": 0.0396, "reward": 0.02083333395421505, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.02083333395421505, "rewards/format_reward": 0.0, "step": 1095 }, { "completion_length": 2246.2500610351562, "epoch": 1.3365853658536586, "grad_norm": 0.13799157738685608, "kl": 0.0986328125, "learning_rate": 8.980724533925419e-07, "loss": -0.0011, "reward": 0.1875000074505806, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.1875000074505806, "rewards/format_reward": 0.0, "step": 1096 }, { "completion_length": 2218.8541870117188, "epoch": 1.3378048780487806, "grad_norm": 0.12522904574871063, "kl": 0.092041015625, "learning_rate": 8.95149477648157e-07, "loss": 0.028, "reward": 0.0625, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.0625, "rewards/format_reward": 0.0, "step": 1097 }, { "completion_length": 2047.4376220703125, "epoch": 1.3390243902439025, "grad_norm": 0.10981479287147522, "kl": 0.06884765625, "learning_rate": 8.922292420574888e-07, "loss": 0.0584, "reward": 0.1458333358168602, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.1458333358168602, "rewards/format_reward": 0.0, "step": 1098 }, { "completion_length": 2453.479248046875, "epoch": 1.3402439024390245, "grad_norm": 0.06847599893808365, "kl": 0.08935546875, "learning_rate": 8.893117598500773e-07, "loss": 0.0338, "reward": 0.02083333395421505, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.02083333395421505, "rewards/format_reward": 0.0, "step": 1099 }, { "completion_length": 2051.2709350585938, "epoch": 1.3414634146341464, "grad_norm": 0.13371917605400085, "kl": 0.087646484375, "learning_rate": 8.863970442429902e-07, "loss": 0.0997, "reward": 0.125, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "step": 1100 }, { "completion_length": 1792.5626220703125, "epoch": 1.3426829268292684, "grad_norm": 0.0879015177488327, "kl": 0.06787109375, "learning_rate": 8.834851084407602e-07, "loss": 0.0246, "reward": 0.02083333395421505, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.02083333395421505, "rewards/format_reward": 0.0, "step": 1101 }, { "completion_length": 1889.7916870117188, "epoch": 1.34390243902439, "grad_norm": 0.11839289963245392, "kl": 0.0712890625, "learning_rate": 8.805759656353275e-07, "loss": 0.0607, "reward": 0.16666667722165585, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.16666667722165585, "rewards/format_reward": 0.0, "step": 1102 }, { "completion_length": 1851.4166870117188, "epoch": 1.3451219512195123, "grad_norm": 0.12274506688117981, "kl": 0.082763671875, "learning_rate": 8.776696290059775e-07, "loss": -0.0013, "reward": 0.1041666679084301, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.1041666679084301, "rewards/format_reward": 0.0, "step": 1103 }, { "completion_length": 1644.25, "epoch": 1.346341463414634, "grad_norm": 0.21413542330265045, "kl": 0.07177734375, "learning_rate": 8.74766111719286e-07, "loss": 0.1012, "reward": 0.0833333358168602, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.0833333358168602, "rewards/format_reward": 0.0, "step": 1104 }, { "completion_length": 1746.6250610351562, "epoch": 1.3475609756097562, "grad_norm": 0.2278779298067093, "kl": 0.071533203125, "learning_rate": 8.718654269290535e-07, "loss": 0.1418, "reward": 0.1041666679084301, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.1041666679084301, "rewards/format_reward": 0.0, "step": 1105 }, { "completion_length": 1727.6250610351562, "epoch": 1.348780487804878, "grad_norm": 0.06175125017762184, "kl": 0.070556640625, "learning_rate": 8.689675877762487e-07, "loss": 0.0025, "reward": 0.0625, "reward_std": 0.0, "rewards/accuracy_reward": 0.0625, "rewards/format_reward": 0.0, "step": 1106 }, { "completion_length": 1544.916748046875, "epoch": 1.35, "grad_norm": 0.07626871019601822, "kl": 0.077392578125, "learning_rate": 8.660726073889511e-07, "loss": 0.0236, "reward": 0.08333333395421505, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.08333333395421505, "rewards/format_reward": 0.0, "step": 1107 }, { "completion_length": 1699.5, "epoch": 1.3512195121951218, "grad_norm": 0.16760168969631195, "kl": 0.07861328125, "learning_rate": 8.631804988822859e-07, "loss": -0.0178, "reward": 0.3125000074505806, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.3125000074505806, "rewards/format_reward": 0.0, "step": 1108 }, { "completion_length": 1874.6666870117188, "epoch": 1.352439024390244, "grad_norm": 0.09511823207139969, "kl": 0.07421875, "learning_rate": 8.602912753583704e-07, "loss": 0.033, "reward": 0.1458333432674408, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.1458333432674408, "rewards/format_reward": 0.0, "step": 1109 }, { "completion_length": 1960.3958740234375, "epoch": 1.3536585365853657, "grad_norm": 0.18511831760406494, "kl": 0.06787109375, "learning_rate": 8.574049499062509e-07, "loss": 0.0886, "reward": 0.06250000186264515, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.06250000186264515, "rewards/format_reward": 0.0, "step": 1110 }, { "completion_length": 1369.4166870117188, "epoch": 1.354878048780488, "grad_norm": 0.3482666015625, "kl": 0.077880859375, "learning_rate": 8.545215356018445e-07, "loss": 0.2076, "reward": 0.18750000558793545, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.18750000558793545, "rewards/format_reward": 0.0, "step": 1111 }, { "completion_length": 1444.1875610351562, "epoch": 1.3560975609756096, "grad_norm": 0.22656023502349854, "kl": 0.072998046875, "learning_rate": 8.516410455078793e-07, "loss": 0.1056, "reward": 0.1875000111758709, "reward_std": 0.18042195588350296, "rewards/accuracy_reward": 0.1875000111758709, "rewards/format_reward": 0.0, "step": 1112 }, { "completion_length": 1784.2916870117188, "epoch": 1.3573170731707318, "grad_norm": 0.10262267291545868, "kl": 0.0589599609375, "learning_rate": 8.487634926738385e-07, "loss": 0.0557, "reward": 0.1458333358168602, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.1458333358168602, "rewards/format_reward": 0.0, "step": 1113 }, { "completion_length": 1629.5625610351562, "epoch": 1.3585365853658535, "grad_norm": 0.269161581993103, "kl": 0.0767822265625, "learning_rate": 8.458888901358958e-07, "loss": 0.0358, "reward": 0.229166679084301, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.229166679084301, "rewards/format_reward": 0.0, "step": 1114 }, { "completion_length": 1606.1250610351562, "epoch": 1.3597560975609757, "grad_norm": 0.12729881703853607, "kl": 0.055908203125, "learning_rate": 8.430172509168594e-07, "loss": 0.0062, "reward": 0.08333333395421505, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.08333333395421505, "rewards/format_reward": 0.0, "step": 1115 }, { "completion_length": 2068.8958740234375, "epoch": 1.3609756097560974, "grad_norm": 0.058840710669755936, "kl": 0.0606689453125, "learning_rate": 8.401485880261151e-07, "loss": 0.0339, "reward": 0.1458333358168602, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.1458333358168602, "rewards/format_reward": 0.0, "step": 1116 }, { "completion_length": 1281.1666870117188, "epoch": 1.3621951219512196, "grad_norm": 0.19289402663707733, "kl": 0.080322265625, "learning_rate": 8.372829144595623e-07, "loss": -0.0117, "reward": 0.2916666716337204, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.2916666716337204, "rewards/format_reward": 0.0, "step": 1117 }, { "completion_length": 1635.7916870117188, "epoch": 1.3634146341463413, "grad_norm": 0.05582652613520622, "kl": 0.070068359375, "learning_rate": 8.344202431995604e-07, "loss": 0.0208, "reward": 0.1458333432674408, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.1458333432674408, "rewards/format_reward": 0.0, "step": 1118 }, { "completion_length": 1811.666748046875, "epoch": 1.3646341463414635, "grad_norm": 0.05330521985888481, "kl": 0.0587158203125, "learning_rate": 8.315605872148653e-07, "loss": 0.0023, "reward": 0.0625, "reward_std": 0.0, "rewards/accuracy_reward": 0.0625, "rewards/format_reward": 0.0, "step": 1119 }, { "completion_length": 1958.5001220703125, "epoch": 1.3658536585365852, "grad_norm": 0.2082366794347763, "kl": 0.0628662109375, "learning_rate": 8.287039594605737e-07, "loss": -0.0022, "reward": 0.1250000037252903, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.1250000037252903, "rewards/format_reward": 0.0, "step": 1120 }, { "completion_length": 1453.7083740234375, "epoch": 1.3670731707317074, "grad_norm": 0.27077025175094604, "kl": 0.08642578125, "learning_rate": 8.258503728780638e-07, "loss": 0.0195, "reward": 0.1666666679084301, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.1666666679084301, "rewards/format_reward": 0.0, "step": 1121 }, { "completion_length": 1443.125, "epoch": 1.3682926829268292, "grad_norm": 0.058373741805553436, "kl": 0.0640869140625, "learning_rate": 8.229998403949348e-07, "loss": 0.0023, "reward": 0.125, "reward_std": 0.0, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "step": 1122 }, { "completion_length": 1766.5208740234375, "epoch": 1.3695121951219513, "grad_norm": 0.12421582639217377, "kl": 0.059814453125, "learning_rate": 8.20152374924953e-07, "loss": 0.0694, "reward": 0.1250000037252903, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.1250000037252903, "rewards/format_reward": 0.0, "step": 1123 }, { "completion_length": 1656.6875, "epoch": 1.370731707317073, "grad_norm": 0.12559135258197784, "kl": 0.069091796875, "learning_rate": 8.173079893679873e-07, "loss": 0.0549, "reward": 0.2500000074505806, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.2500000074505806, "rewards/format_reward": 0.0, "step": 1124 }, { "completion_length": 1401.4375610351562, "epoch": 1.3719512195121952, "grad_norm": 0.16029055416584015, "kl": 0.0657958984375, "learning_rate": 8.144666966099543e-07, "loss": 0.0742, "reward": 0.1666666716337204, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.1666666716337204, "rewards/format_reward": 0.0, "step": 1125 }, { "completion_length": 1606.3958740234375, "epoch": 1.373170731707317, "grad_norm": 0.037284355610609055, "kl": 0.0498046875, "learning_rate": 8.116285095227604e-07, "loss": 0.0018, "reward": 0.1875, "reward_std": 0.0, "rewards/accuracy_reward": 0.1875, "rewards/format_reward": 0.0, "step": 1126 }, { "completion_length": 1703.8333740234375, "epoch": 1.3743902439024391, "grad_norm": 0.17504902184009552, "kl": 0.0635986328125, "learning_rate": 8.087934409642426e-07, "loss": 0.0313, "reward": 0.1458333395421505, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.1458333395421505, "rewards/format_reward": 0.0, "step": 1127 }, { "completion_length": 1739.5000610351562, "epoch": 1.3756097560975609, "grad_norm": 0.24940678477287292, "kl": 0.066650390625, "learning_rate": 8.05961503778108e-07, "loss": 0.0211, "reward": 0.1250000037252903, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.1250000037252903, "rewards/format_reward": 0.0, "step": 1128 }, { "completion_length": 1635.5208740234375, "epoch": 1.376829268292683, "grad_norm": 0.3138916492462158, "kl": 0.0704345703125, "learning_rate": 8.03132710793879e-07, "loss": 0.0023, "reward": 0.2083333358168602, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.2083333358168602, "rewards/format_reward": 0.0, "step": 1129 }, { "completion_length": 1496.8958740234375, "epoch": 1.3780487804878048, "grad_norm": 0.08514293283224106, "kl": 0.0672607421875, "learning_rate": 8.003070748268339e-07, "loss": 0.0215, "reward": 0.20833333395421505, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.20833333395421505, "rewards/format_reward": 0.0, "step": 1130 }, { "completion_length": 1226.4166870117188, "epoch": 1.379268292682927, "grad_norm": 0.3350156545639038, "kl": 0.080810546875, "learning_rate": 7.974846086779475e-07, "loss": 0.1582, "reward": 0.2916666716337204, "reward_std": 0.28867512941360474, "rewards/accuracy_reward": 0.2916666716337204, "rewards/format_reward": 0.0, "step": 1131 }, { "completion_length": 1508.2708740234375, "epoch": 1.3804878048780487, "grad_norm": 0.15800786018371582, "kl": 0.0614013671875, "learning_rate": 7.94665325133837e-07, "loss": 0.0544, "reward": 0.1041666716337204, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.1041666716337204, "rewards/format_reward": 0.0, "step": 1132 }, { "completion_length": 1360.3750610351562, "epoch": 1.3817073170731708, "grad_norm": 0.2012512981891632, "kl": 0.054931640625, "learning_rate": 7.918492369666989e-07, "loss": 0.0679, "reward": 0.2083333358168602, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.2083333358168602, "rewards/format_reward": 0.0, "step": 1133 }, { "completion_length": 1226.8958740234375, "epoch": 1.3829268292682926, "grad_norm": 0.2620626389980316, "kl": 0.093017578125, "learning_rate": 7.890363569342539e-07, "loss": 0.0319, "reward": 0.4166666865348816, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.4166666865348816, "rewards/format_reward": 0.0, "step": 1134 }, { "completion_length": 1374.3958740234375, "epoch": 1.3841463414634148, "grad_norm": 0.2990681529045105, "kl": 0.0650634765625, "learning_rate": 7.862266977796907e-07, "loss": 0.1726, "reward": 0.2291666716337204, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.2291666716337204, "rewards/format_reward": 0.0, "step": 1135 }, { "completion_length": 2098.0833740234375, "epoch": 1.3853658536585365, "grad_norm": 0.28708651661872864, "kl": 0.07470703125, "learning_rate": 7.834202722316054e-07, "loss": 0.1021, "reward": 0.1666666716337204, "reward_std": 0.21650634706020355, "rewards/accuracy_reward": 0.1666666716337204, "rewards/format_reward": 0.0, "step": 1136 }, { "completion_length": 1495.479248046875, "epoch": 1.3865853658536587, "grad_norm": 0.056493379175662994, "kl": 0.0557861328125, "learning_rate": 7.806170930039446e-07, "loss": 0.0023, "reward": 0.0625, "reward_std": 0.0, "rewards/accuracy_reward": 0.0625, "rewards/format_reward": 0.0, "step": 1137 }, { "completion_length": 1190.1666870117188, "epoch": 1.3878048780487804, "grad_norm": 0.21429282426834106, "kl": 0.06640625, "learning_rate": 7.778171727959482e-07, "loss": 0.0319, "reward": 0.3541666716337204, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.3541666716337204, "rewards/format_reward": 0.0, "step": 1138 }, { "completion_length": 1729.0625, "epoch": 1.3890243902439026, "grad_norm": 0.20377257466316223, "kl": 0.0640869140625, "learning_rate": 7.750205242920921e-07, "loss": 0.0799, "reward": 0.1041666679084301, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.1041666679084301, "rewards/format_reward": 0.0, "step": 1139 }, { "completion_length": 2013.4583740234375, "epoch": 1.3902439024390243, "grad_norm": 0.0709085762500763, "kl": 0.060791015625, "learning_rate": 7.722271601620293e-07, "loss": 0.0016, "reward": 0.02083333395421505, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.02083333395421505, "rewards/format_reward": 0.0, "step": 1140 }, { "completion_length": 1754.2708740234375, "epoch": 1.3914634146341465, "grad_norm": 0.1399092823266983, "kl": 0.0615234375, "learning_rate": 7.694370930605362e-07, "loss": 0.0598, "reward": 0.0416666679084301, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.0416666679084301, "rewards/format_reward": 0.0, "step": 1141 }, { "completion_length": 1492.4791870117188, "epoch": 1.3926829268292682, "grad_norm": 0.11426208168268204, "kl": 0.0631103515625, "learning_rate": 7.6665033562745e-07, "loss": 0.0727, "reward": 0.0625, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.0625, "rewards/format_reward": 0.0, "step": 1142 }, { "completion_length": 1930.8541870117188, "epoch": 1.3939024390243904, "grad_norm": 0.14284688234329224, "kl": 0.077392578125, "learning_rate": 7.638669004876145e-07, "loss": -0.0025, "reward": 0.0416666679084301, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.0416666679084301, "rewards/format_reward": 0.0, "step": 1143 }, { "completion_length": 1634.2083740234375, "epoch": 1.395121951219512, "grad_norm": 0.27010735869407654, "kl": 0.075439453125, "learning_rate": 7.610868002508248e-07, "loss": 0.0982, "reward": 0.1250000037252903, "reward_std": 0.18042195588350296, "rewards/accuracy_reward": 0.1250000037252903, "rewards/format_reward": 0.0, "step": 1144 }, { "completion_length": 1558.0208740234375, "epoch": 1.3963414634146343, "grad_norm": 0.08260347694158554, "kl": 0.0693359375, "learning_rate": 7.583100475117643e-07, "loss": -0.0008, "reward": 0.2291666716337204, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.2291666716337204, "rewards/format_reward": 0.0, "step": 1145 }, { "completion_length": 1642.6458740234375, "epoch": 1.397560975609756, "grad_norm": 0.1766006350517273, "kl": 0.056396484375, "learning_rate": 7.555366548499551e-07, "loss": 0.0695, "reward": 0.1875000074505806, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.1875000074505806, "rewards/format_reward": 0.0, "step": 1146 }, { "completion_length": 2065.3333740234375, "epoch": 1.3987804878048782, "grad_norm": 0.18940486013889313, "kl": 0.069580078125, "learning_rate": 7.527666348296941e-07, "loss": 0.0211, "reward": 0.2291666716337204, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.2291666716337204, "rewards/format_reward": 0.0, "step": 1147 }, { "completion_length": 1813.6875610351562, "epoch": 1.4, "grad_norm": 0.1439916342496872, "kl": 0.079345703125, "learning_rate": 7.500000000000003e-07, "loss": 0.0462, "reward": 0.1041666679084301, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.1041666679084301, "rewards/format_reward": 0.0, "step": 1148 }, { "completion_length": 1542.875, "epoch": 1.401219512195122, "grad_norm": 0.43714889883995056, "kl": 0.0697021484375, "learning_rate": 7.472367628945564e-07, "loss": -0.0087, "reward": 0.2083333432674408, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.2083333432674408, "rewards/format_reward": 0.0, "step": 1149 }, { "completion_length": 1767.9791870117188, "epoch": 1.4024390243902438, "grad_norm": 0.11057079583406448, "kl": 0.0704345703125, "learning_rate": 7.444769360316534e-07, "loss": 0.0784, "reward": 0.0416666679084301, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.0416666679084301, "rewards/format_reward": 0.0, "step": 1150 }, { "completion_length": 1317.8541870117188, "epoch": 1.403658536585366, "grad_norm": 0.23685288429260254, "kl": 0.06689453125, "learning_rate": 7.417205319141321e-07, "loss": 0.1303, "reward": 0.25, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.0, "step": 1151 }, { "completion_length": 1806.0208740234375, "epoch": 1.4048780487804877, "grad_norm": 0.13277260959148407, "kl": 0.064697265625, "learning_rate": 7.389675630293269e-07, "loss": 0.0254, "reward": 0.2708333432674408, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.2708333432674408, "rewards/format_reward": 0.0, "step": 1152 }, { "completion_length": 1776.5625610351562, "epoch": 1.40609756097561, "grad_norm": 0.22591744363307953, "kl": 0.07080078125, "learning_rate": 7.362180418490099e-07, "loss": 0.1724, "reward": 0.2500000149011612, "reward_std": 0.14433757215738297, "rewards/accuracy_reward": 0.2500000149011612, "rewards/format_reward": 0.0, "step": 1153 }, { "completion_length": 1621.1041870117188, "epoch": 1.4073170731707316, "grad_norm": 0.3094944357872009, "kl": 0.102783203125, "learning_rate": 7.334719808293342e-07, "loss": 0.1684, "reward": 0.229166679084301, "reward_std": 0.25259073823690414, "rewards/accuracy_reward": 0.229166679084301, "rewards/format_reward": 0.0, "step": 1154 }, { "completion_length": 1806.3333740234375, "epoch": 1.4085365853658536, "grad_norm": 0.06356124579906464, "kl": 0.073974609375, "learning_rate": 7.307293924107781e-07, "loss": 0.0394, "reward": 0.0833333358168602, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.0833333358168602, "rewards/format_reward": 0.0, "step": 1155 }, { "completion_length": 1897.9375610351562, "epoch": 1.4097560975609755, "grad_norm": 0.06797239184379578, "kl": 0.056884765625, "learning_rate": 7.279902890180865e-07, "loss": 0.0193, "reward": 0.02083333395421505, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.02083333395421505, "rewards/format_reward": 0.0, "step": 1156 }, { "completion_length": 1702.8125610351562, "epoch": 1.4109756097560975, "grad_norm": 0.2892155647277832, "kl": 0.08154296875, "learning_rate": 7.252546830602171e-07, "loss": 0.0607, "reward": 0.2083333432674408, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.2083333432674408, "rewards/format_reward": 0.0, "step": 1157 }, { "completion_length": 1550.7500610351562, "epoch": 1.4121951219512194, "grad_norm": 0.22689515352249146, "kl": 0.079833984375, "learning_rate": 7.225225869302818e-07, "loss": 0.1623, "reward": 0.16666667722165585, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.16666667722165585, "rewards/format_reward": 0.0, "step": 1158 }, { "completion_length": 1820.4375610351562, "epoch": 1.4134146341463414, "grad_norm": 0.14121899008750916, "kl": 0.06640625, "learning_rate": 7.197940130054943e-07, "loss": 0.102, "reward": 0.0833333358168602, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.0833333358168602, "rewards/format_reward": 0.0, "step": 1159 }, { "completion_length": 1793.8959350585938, "epoch": 1.4146341463414633, "grad_norm": 0.10780225694179535, "kl": 0.070556640625, "learning_rate": 7.1706897364711e-07, "loss": -0.0006, "reward": 0.1458333432674408, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.1458333432674408, "rewards/format_reward": 0.0, "step": 1160 }, { "completion_length": 2098.3958740234375, "epoch": 1.4158536585365853, "grad_norm": 0.13531874120235443, "kl": 0.064208984375, "learning_rate": 7.143474812003715e-07, "loss": -0.0019, "reward": 0.2083333358168602, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.2083333358168602, "rewards/format_reward": 0.0, "step": 1161 }, { "completion_length": 1959.2291870117188, "epoch": 1.4170731707317072, "grad_norm": 0.25021493434906006, "kl": 0.070068359375, "learning_rate": 7.116295479944533e-07, "loss": 0.1144, "reward": 0.4166666716337204, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.4166666716337204, "rewards/format_reward": 0.0, "step": 1162 }, { "completion_length": 1450.0, "epoch": 1.4182926829268292, "grad_norm": 0.22444204986095428, "kl": 0.0556640625, "learning_rate": 7.089151863424061e-07, "loss": 0.0095, "reward": 0.2083333358168602, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.2083333358168602, "rewards/format_reward": 0.0, "step": 1163 }, { "completion_length": 1668.7708740234375, "epoch": 1.4195121951219511, "grad_norm": 0.39352449774742126, "kl": 0.06591796875, "learning_rate": 7.062044085410991e-07, "loss": 0.0149, "reward": 0.0625, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.0625, "rewards/format_reward": 0.0, "step": 1164 }, { "completion_length": 1665.9583740234375, "epoch": 1.420731707317073, "grad_norm": 0.1833227127790451, "kl": 0.078125, "learning_rate": 7.034972268711669e-07, "loss": 0.056, "reward": 0.2708333432674408, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.2708333432674408, "rewards/format_reward": 0.0, "step": 1165 }, { "completion_length": 1833.3959350585938, "epoch": 1.421951219512195, "grad_norm": 0.2379268854856491, "kl": 0.0648193359375, "learning_rate": 7.007936535969516e-07, "loss": -0.0317, "reward": 0.1041666679084301, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.1041666679084301, "rewards/format_reward": 0.0, "step": 1166 }, { "completion_length": 1711.1459350585938, "epoch": 1.423170731707317, "grad_norm": 0.07151176035404205, "kl": 0.055419921875, "learning_rate": 6.980937009664487e-07, "loss": 0.0024, "reward": 0.1875, "reward_std": 0.0, "rewards/accuracy_reward": 0.1875, "rewards/format_reward": 0.0, "step": 1167 }, { "completion_length": 1527.3958740234375, "epoch": 1.424390243902439, "grad_norm": 0.10014871507883072, "kl": 0.058349609375, "learning_rate": 6.9539738121125e-07, "loss": 0.0215, "reward": 0.14583333395421505, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.14583333395421505, "rewards/format_reward": 0.0, "step": 1168 }, { "completion_length": 1608.9375610351562, "epoch": 1.425609756097561, "grad_norm": 0.2312953621149063, "kl": 0.0577392578125, "learning_rate": 6.927047065464915e-07, "loss": 0.102, "reward": 0.1666666716337204, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.1666666716337204, "rewards/format_reward": 0.0, "step": 1169 }, { "completion_length": 1379.104248046875, "epoch": 1.4268292682926829, "grad_norm": 0.20466811954975128, "kl": 0.080810546875, "learning_rate": 6.90015689170794e-07, "loss": 0.0334, "reward": 0.2083333358168602, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.2083333358168602, "rewards/format_reward": 0.0, "step": 1170 }, { "completion_length": 1235.541748046875, "epoch": 1.4280487804878048, "grad_norm": 0.14127695560455322, "kl": 0.0648193359375, "learning_rate": 6.873303412662103e-07, "loss": 0.0659, "reward": 0.3333333358168602, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.3333333358168602, "rewards/format_reward": 0.0, "step": 1171 }, { "completion_length": 2018.125, "epoch": 1.4292682926829268, "grad_norm": 0.15425615012645721, "kl": 0.075439453125, "learning_rate": 6.846486749981684e-07, "loss": 0.0082, "reward": 0.1666666716337204, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.1666666716337204, "rewards/format_reward": 0.0, "step": 1172 }, { "completion_length": 1562.8125, "epoch": 1.4304878048780487, "grad_norm": 0.2830915153026581, "kl": 0.090576171875, "learning_rate": 6.819707025154194e-07, "loss": 0.1371, "reward": 0.2083333358168602, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.2083333358168602, "rewards/format_reward": 0.0, "step": 1173 }, { "completion_length": 1511.4791870117188, "epoch": 1.4317073170731707, "grad_norm": 0.12435463070869446, "kl": 0.05908203125, "learning_rate": 6.792964359499794e-07, "loss": 0.0612, "reward": 0.1666666679084301, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.1666666679084301, "rewards/format_reward": 0.0, "step": 1174 }, { "completion_length": 1867.8959350585938, "epoch": 1.4329268292682926, "grad_norm": 0.23749235272407532, "kl": 0.071533203125, "learning_rate": 6.766258874170752e-07, "loss": -0.0209, "reward": 0.06250000186264515, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.06250000186264515, "rewards/format_reward": 0.0, "step": 1175 }, { "completion_length": 1647.0833740234375, "epoch": 1.4341463414634146, "grad_norm": 0.2187647670507431, "kl": 0.0643310546875, "learning_rate": 6.739590690150903e-07, "loss": 0.145, "reward": 0.1458333358168602, "reward_std": 0.18042195588350296, "rewards/accuracy_reward": 0.1458333358168602, "rewards/format_reward": 0.0, "step": 1176 }, { "completion_length": 1714.5, "epoch": 1.4353658536585365, "grad_norm": 0.16387030482292175, "kl": 0.0543212890625, "learning_rate": 6.712959928255088e-07, "loss": 0.0372, "reward": 0.2916666865348816, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.2916666865348816, "rewards/format_reward": 0.0, "step": 1177 }, { "completion_length": 1758.6250610351562, "epoch": 1.4365853658536585, "grad_norm": 0.13817749917507172, "kl": 0.0711669921875, "learning_rate": 6.686366709128632e-07, "loss": 0.0389, "reward": 0.2500000149011612, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.2500000149011612, "rewards/format_reward": 0.0, "step": 1178 }, { "completion_length": 1406.7291870117188, "epoch": 1.4378048780487804, "grad_norm": 0.24476493895053864, "kl": 0.0704345703125, "learning_rate": 6.65981115324676e-07, "loss": 0.1672, "reward": 0.2708333432674408, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.2708333432674408, "rewards/format_reward": 0.0, "step": 1179 }, { "completion_length": 2014.5209350585938, "epoch": 1.4390243902439024, "grad_norm": 0.15607836842536926, "kl": 0.07275390625, "learning_rate": 6.633293380914087e-07, "loss": 0.0866, "reward": 0.12500000558793545, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.12500000558793545, "rewards/format_reward": 0.0, "step": 1180 }, { "completion_length": 2089.8958740234375, "epoch": 1.4402439024390243, "grad_norm": 0.12969492375850677, "kl": 0.05419921875, "learning_rate": 6.60681351226404e-07, "loss": 0.0372, "reward": 0.1041666679084301, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.1041666679084301, "rewards/format_reward": 0.0, "step": 1181 }, { "completion_length": 1604.8958740234375, "epoch": 1.4414634146341463, "grad_norm": 0.21100108325481415, "kl": 0.083251953125, "learning_rate": 6.580371667258349e-07, "loss": 0.0872, "reward": 0.125, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "step": 1182 }, { "completion_length": 1923.0, "epoch": 1.4426829268292682, "grad_norm": 0.11882030218839645, "kl": 0.076416015625, "learning_rate": 6.553967965686483e-07, "loss": 0.0783, "reward": 0.1250000037252903, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.1250000037252903, "rewards/format_reward": 0.0, "step": 1183 }, { "completion_length": 1794.8125, "epoch": 1.4439024390243902, "grad_norm": 0.08982381224632263, "kl": 0.0692138671875, "learning_rate": 6.527602527165099e-07, "loss": 0.0204, "reward": 0.14583333395421505, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.14583333395421505, "rewards/format_reward": 0.0, "step": 1184 }, { "completion_length": 1736.6875, "epoch": 1.4451219512195121, "grad_norm": 0.263789564371109, "kl": 0.0723876953125, "learning_rate": 6.501275471137518e-07, "loss": 0.1696, "reward": 0.20833333395421505, "reward_std": 0.18042197078466415, "rewards/accuracy_reward": 0.20833333395421505, "rewards/format_reward": 0.0, "step": 1185 }, { "completion_length": 2127.729248046875, "epoch": 1.446341463414634, "grad_norm": 0.1284579485654831, "kl": 0.0709228515625, "learning_rate": 6.474986916873168e-07, "loss": 0.0564, "reward": 0.229166679084301, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.229166679084301, "rewards/format_reward": 0.0, "step": 1186 }, { "completion_length": 1739.5833740234375, "epoch": 1.447560975609756, "grad_norm": 0.20159056782722473, "kl": 0.079833984375, "learning_rate": 6.448736983467072e-07, "loss": 0.1033, "reward": 0.14583333395421505, "reward_std": 0.18042195588350296, "rewards/accuracy_reward": 0.14583333395421505, "rewards/format_reward": 0.0, "step": 1187 }, { "completion_length": 1589.1666870117188, "epoch": 1.448780487804878, "grad_norm": 0.3127121031284332, "kl": 0.070556640625, "learning_rate": 6.422525789839273e-07, "loss": 0.1563, "reward": 0.1666666716337204, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.1666666716337204, "rewards/format_reward": 0.0, "step": 1188 }, { "completion_length": 1351.229248046875, "epoch": 1.45, "grad_norm": 0.21829895675182343, "kl": 0.06640625, "learning_rate": 6.396353454734313e-07, "loss": 0.1039, "reward": 0.31250002048909664, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.31250002048909664, "rewards/format_reward": 0.0, "step": 1189 }, { "completion_length": 1479.166748046875, "epoch": 1.451219512195122, "grad_norm": 0.2148381471633911, "kl": 0.0650634765625, "learning_rate": 6.370220096720692e-07, "loss": 0.0536, "reward": 0.18750000558793545, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.18750000558793545, "rewards/format_reward": 0.0, "step": 1190 }, { "completion_length": 1572.5833740234375, "epoch": 1.4524390243902439, "grad_norm": 0.1671302169561386, "kl": 0.064453125, "learning_rate": 6.344125834190345e-07, "loss": 0.0759, "reward": 0.0416666679084301, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.0416666679084301, "rewards/format_reward": 0.0, "step": 1191 }, { "completion_length": 1476.9166870117188, "epoch": 1.4536585365853658, "grad_norm": 0.29948392510414124, "kl": 0.06103515625, "learning_rate": 6.318070785358074e-07, "loss": 0.1568, "reward": 0.3541666716337204, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.3541666716337204, "rewards/format_reward": 0.0, "step": 1192 }, { "completion_length": 1466.2708740234375, "epoch": 1.4548780487804878, "grad_norm": 0.15255652368068695, "kl": 0.06494140625, "learning_rate": 6.292055068261051e-07, "loss": 0.0553, "reward": 0.125, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "step": 1193 }, { "completion_length": 1959.9584350585938, "epoch": 1.4560975609756097, "grad_norm": 0.30061644315719604, "kl": 0.0732421875, "learning_rate": 6.266078800758249e-07, "loss": 0.2037, "reward": 0.2083333432674408, "reward_std": 0.21650634706020355, "rewards/accuracy_reward": 0.2083333432674408, "rewards/format_reward": 0.0, "step": 1194 }, { "completion_length": 2005.1876220703125, "epoch": 1.4573170731707317, "grad_norm": 0.20964309573173523, "kl": 0.091796875, "learning_rate": 6.240142100529917e-07, "loss": 0.0522, "reward": 0.2291666716337204, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.2291666716337204, "rewards/format_reward": 0.0, "step": 1195 }, { "completion_length": 2117.4791870117188, "epoch": 1.4585365853658536, "grad_norm": 0.13498026132583618, "kl": 0.07421875, "learning_rate": 6.214245085077078e-07, "loss": 0.0218, "reward": 0.1458333358168602, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.1458333358168602, "rewards/format_reward": 0.0, "step": 1196 }, { "completion_length": 2045.416748046875, "epoch": 1.4597560975609756, "grad_norm": 0.1640341877937317, "kl": 0.081787109375, "learning_rate": 6.188387871720946e-07, "loss": 0.0814, "reward": 0.0416666679084301, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.0416666679084301, "rewards/format_reward": 0.0, "step": 1197 }, { "completion_length": 1704.541748046875, "epoch": 1.4609756097560975, "grad_norm": 0.2538154721260071, "kl": 0.07666015625, "learning_rate": 6.162570577602433e-07, "loss": 0.0841, "reward": 0.16666667722165585, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.16666667722165585, "rewards/format_reward": 0.0, "step": 1198 }, { "completion_length": 1978.3125, "epoch": 1.4621951219512195, "grad_norm": 0.14314088225364685, "kl": 0.088134765625, "learning_rate": 6.136793319681598e-07, "loss": 0.0882, "reward": 0.2708333432674408, "reward_std": 0.10825318098068237, "rewards/accuracy_reward": 0.2708333432674408, "rewards/format_reward": 0.0, "step": 1199 }, { "completion_length": 2181.979248046875, "epoch": 1.4634146341463414, "grad_norm": 0.1773487627506256, "kl": 0.079833984375, "learning_rate": 6.11105621473712e-07, "loss": 0.0774, "reward": 0.25, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.0, "step": 1200 }, { "completion_length": 1938.8125, "epoch": 1.4646341463414634, "grad_norm": 0.2243904024362564, "kl": 0.099609375, "learning_rate": 6.085359379365787e-07, "loss": 0.1244, "reward": 0.1666666679084301, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.1666666679084301, "rewards/format_reward": 0.0, "step": 1201 }, { "completion_length": 2197.3125, "epoch": 1.4658536585365853, "grad_norm": 0.15804284811019897, "kl": 0.098876953125, "learning_rate": 6.059702929981952e-07, "loss": 0.124, "reward": 0.1666666716337204, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.1666666716337204, "rewards/format_reward": 0.0, "step": 1202 }, { "completion_length": 2071.2708740234375, "epoch": 1.4670731707317073, "grad_norm": 0.13776305317878723, "kl": 0.0966796875, "learning_rate": 6.034086982816998e-07, "loss": 0.0554, "reward": 0.229166679084301, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.229166679084301, "rewards/format_reward": 0.0, "step": 1203 }, { "completion_length": 1861.8541870117188, "epoch": 1.4682926829268292, "grad_norm": 0.22006545960903168, "kl": 0.093017578125, "learning_rate": 6.008511653918821e-07, "loss": 0.0321, "reward": 0.08333333395421505, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.08333333395421505, "rewards/format_reward": 0.0, "step": 1204 }, { "completion_length": 1740.9375610351562, "epoch": 1.4695121951219512, "grad_norm": 0.24095158278942108, "kl": 0.08447265625, "learning_rate": 5.982977059151307e-07, "loss": 0.0931, "reward": 0.3541666865348816, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.3541666865348816, "rewards/format_reward": 0.0, "step": 1205 }, { "completion_length": 1643.2500610351562, "epoch": 1.4707317073170731, "grad_norm": 0.24007543921470642, "kl": 0.074951171875, "learning_rate": 5.957483314193813e-07, "loss": -0.0216, "reward": 0.1250000037252903, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.1250000037252903, "rewards/format_reward": 0.0, "step": 1206 }, { "completion_length": 1839.2083740234375, "epoch": 1.471951219512195, "grad_norm": 0.29129984974861145, "kl": 0.093505859375, "learning_rate": 5.93203053454062e-07, "loss": 0.1448, "reward": 0.291666679084301, "reward_std": 0.21650634706020355, "rewards/accuracy_reward": 0.291666679084301, "rewards/format_reward": 0.0, "step": 1207 }, { "completion_length": 1503.5000610351562, "epoch": 1.473170731707317, "grad_norm": 0.19293393194675446, "kl": 0.0635986328125, "learning_rate": 5.906618835500434e-07, "loss": 0.0297, "reward": 0.0833333358168602, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.0833333358168602, "rewards/format_reward": 0.0, "step": 1208 }, { "completion_length": 1455.2083740234375, "epoch": 1.474390243902439, "grad_norm": 0.15374787151813507, "kl": 0.089599609375, "learning_rate": 5.881248332195842e-07, "loss": 0.0105, "reward": 0.125, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "step": 1209 }, { "completion_length": 2235.8750610351562, "epoch": 1.475609756097561, "grad_norm": 0.1440083086490631, "kl": 0.1025390625, "learning_rate": 5.855919139562815e-07, "loss": 0.0314, "reward": 0.0416666679084301, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.0416666679084301, "rewards/format_reward": 0.0, "step": 1210 }, { "completion_length": 2529.6458740234375, "epoch": 1.476829268292683, "grad_norm": 0.24022962152957916, "kl": 0.10693359375, "learning_rate": 5.830631372350176e-07, "loss": 0.0849, "reward": 0.125, "reward_std": 0.18042195588350296, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "step": 1211 }, { "completion_length": 1365.9583740234375, "epoch": 1.4780487804878049, "grad_norm": 0.20744113624095917, "kl": 0.060791015625, "learning_rate": 5.805385145119064e-07, "loss": 0.0146, "reward": 0.1875, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.1875, "rewards/format_reward": 0.0, "step": 1212 }, { "completion_length": 1912.166748046875, "epoch": 1.4792682926829268, "grad_norm": 0.2510889172554016, "kl": 0.0830078125, "learning_rate": 5.780180572242438e-07, "loss": -0.001, "reward": 0.16666667722165585, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.16666667722165585, "rewards/format_reward": 0.0, "step": 1213 }, { "completion_length": 1758.2500610351562, "epoch": 1.4804878048780488, "grad_norm": 0.4133623540401459, "kl": 0.090087890625, "learning_rate": 5.755017767904543e-07, "loss": 0.1404, "reward": 0.1666666716337204, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.1666666716337204, "rewards/format_reward": 0.0, "step": 1214 }, { "completion_length": 1874.041748046875, "epoch": 1.4817073170731707, "grad_norm": 0.24060343205928802, "kl": 0.098388671875, "learning_rate": 5.729896846100419e-07, "loss": -0.0133, "reward": 0.1041666716337204, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.1041666716337204, "rewards/format_reward": 0.0, "step": 1215 }, { "completion_length": 1907.104248046875, "epoch": 1.4829268292682927, "grad_norm": 0.21801520884037018, "kl": 0.088134765625, "learning_rate": 5.704817920635348e-07, "loss": 0.0963, "reward": 0.2500000149011612, "reward_std": 0.18042195588350296, "rewards/accuracy_reward": 0.2500000149011612, "rewards/format_reward": 0.0, "step": 1216 }, { "completion_length": 1825.75, "epoch": 1.4841463414634146, "grad_norm": 0.2653990685939789, "kl": 0.080322265625, "learning_rate": 5.679781105124357e-07, "loss": 0.1489, "reward": 0.16666667722165585, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.16666667722165585, "rewards/format_reward": 0.0, "step": 1217 }, { "completion_length": 2042.2708740234375, "epoch": 1.4853658536585366, "grad_norm": 0.19444695115089417, "kl": 0.098388671875, "learning_rate": 5.654786512991705e-07, "loss": 0.098, "reward": 0.18750000558793545, "reward_std": 0.14433757215738297, "rewards/accuracy_reward": 0.18750000558793545, "rewards/format_reward": 0.0, "step": 1218 }, { "completion_length": 1645.2083740234375, "epoch": 1.4865853658536585, "grad_norm": 0.3152191936969757, "kl": 0.0888671875, "learning_rate": 5.629834257470377e-07, "loss": 0.1355, "reward": 0.1875000111758709, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.1875000111758709, "rewards/format_reward": 0.0, "step": 1219 }, { "completion_length": 2115.0, "epoch": 1.4878048780487805, "grad_norm": 0.2038986086845398, "kl": 0.082763671875, "learning_rate": 5.60492445160154e-07, "loss": 0.0225, "reward": 0.06250000186264515, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.06250000186264515, "rewards/format_reward": 0.0, "step": 1220 }, { "completion_length": 1774.75, "epoch": 1.4890243902439024, "grad_norm": 0.11763674020767212, "kl": 0.093017578125, "learning_rate": 5.580057208234074e-07, "loss": 0.0228, "reward": 0.1875000074505806, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.1875000074505806, "rewards/format_reward": 0.0, "step": 1221 }, { "completion_length": 1929.5625, "epoch": 1.4902439024390244, "grad_norm": 0.16115932166576385, "kl": 0.0849609375, "learning_rate": 5.555232640024021e-07, "loss": 0.0316, "reward": 0.0416666679084301, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.0416666679084301, "rewards/format_reward": 0.0, "step": 1222 }, { "completion_length": 1769.6875610351562, "epoch": 1.4914634146341463, "grad_norm": 0.177618145942688, "kl": 0.08349609375, "learning_rate": 5.530450859434092e-07, "loss": 0.0816, "reward": 0.1250000037252903, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.1250000037252903, "rewards/format_reward": 0.0, "step": 1223 }, { "completion_length": 1733.5833740234375, "epoch": 1.4926829268292683, "grad_norm": 0.2510419189929962, "kl": 0.066650390625, "learning_rate": 5.505711978733175e-07, "loss": 0.1434, "reward": 0.2291666716337204, "reward_std": 0.18042195588350296, "rewards/accuracy_reward": 0.2291666716337204, "rewards/format_reward": 0.0, "step": 1224 }, { "completion_length": 1738.3333740234375, "epoch": 1.4939024390243902, "grad_norm": 0.3677607476711273, "kl": 0.075927734375, "learning_rate": 5.48101610999579e-07, "loss": 0.1173, "reward": 0.2708333358168602, "reward_std": 0.25259073823690414, "rewards/accuracy_reward": 0.2708333358168602, "rewards/format_reward": 0.0, "step": 1225 }, { "completion_length": 1491.0000610351562, "epoch": 1.4951219512195122, "grad_norm": 0.27497199177742004, "kl": 0.0721435546875, "learning_rate": 5.456363365101606e-07, "loss": 0.1494, "reward": 0.2500000111758709, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.2500000111758709, "rewards/format_reward": 0.0, "step": 1226 }, { "completion_length": 2003.8958740234375, "epoch": 1.4963414634146341, "grad_norm": 0.2507963478565216, "kl": 0.10205078125, "learning_rate": 5.43175385573493e-07, "loss": 0.0729, "reward": 0.1666666716337204, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.1666666716337204, "rewards/format_reward": 0.0, "step": 1227 }, { "completion_length": 1474.8541870117188, "epoch": 1.497560975609756, "grad_norm": 0.18882271647453308, "kl": 0.08203125, "learning_rate": 5.407187693384191e-07, "loss": -0.0045, "reward": 0.2291666716337204, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.2291666716337204, "rewards/format_reward": 0.0, "step": 1228 }, { "completion_length": 1964.3958740234375, "epoch": 1.498780487804878, "grad_norm": 0.24142873287200928, "kl": 0.078857421875, "learning_rate": 5.382664989341455e-07, "loss": 0.1248, "reward": 0.1875, "reward_std": 0.18042195588350296, "rewards/accuracy_reward": 0.1875, "rewards/format_reward": 0.0, "step": 1229 }, { "completion_length": 1280.875, "epoch": 1.5, "grad_norm": 0.1703704297542572, "kl": 0.06005859375, "learning_rate": 5.358185854701909e-07, "loss": -0.0217, "reward": 0.2083333432674408, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.2083333432674408, "rewards/format_reward": 0.0, "step": 1230 }, { "completion_length": 1768.8333740234375, "epoch": 1.501219512195122, "grad_norm": 0.33142003417015076, "kl": 0.125732421875, "learning_rate": 5.33375040036335e-07, "loss": 0.1828, "reward": 0.4375000149011612, "reward_std": 0.21650634706020355, "rewards/accuracy_reward": 0.4375000149011612, "rewards/format_reward": 0.0, "step": 1231 }, { "completion_length": 2192.3125, "epoch": 1.502439024390244, "grad_norm": 0.24712756276130676, "kl": 0.091796875, "learning_rate": 5.309358737025682e-07, "loss": 0.0946, "reward": 0.1250000037252903, "reward_std": 0.18042195588350296, "rewards/accuracy_reward": 0.1250000037252903, "rewards/format_reward": 0.0, "step": 1232 }, { "completion_length": 1638.6250610351562, "epoch": 1.5036585365853659, "grad_norm": 0.15983736515045166, "kl": 0.08740234375, "learning_rate": 5.285010975190447e-07, "loss": 0.041, "reward": 0.1458333358168602, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.1458333358168602, "rewards/format_reward": 0.0, "step": 1233 }, { "completion_length": 1563.6250610351562, "epoch": 1.5048780487804878, "grad_norm": 0.34291860461235046, "kl": 0.07470703125, "learning_rate": 5.26070722516028e-07, "loss": 0.0849, "reward": 0.1875000074505806, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.1875000074505806, "rewards/format_reward": 0.0, "step": 1234 }, { "completion_length": 1879.2708740234375, "epoch": 1.5060975609756098, "grad_norm": 0.13496096432209015, "kl": 0.081298828125, "learning_rate": 5.236447597038434e-07, "loss": 0.0545, "reward": 0.12500000558793545, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.12500000558793545, "rewards/format_reward": 0.0, "step": 1235 }, { "completion_length": 1573.5000610351562, "epoch": 1.5073170731707317, "grad_norm": 0.12581363320350647, "kl": 0.0546875, "learning_rate": 5.21223220072828e-07, "loss": 0.0022, "reward": 0.0833333358168602, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.0833333358168602, "rewards/format_reward": 0.0, "step": 1236 }, { "completion_length": 1460.7083740234375, "epoch": 1.5085365853658537, "grad_norm": 0.1406300663948059, "kl": 0.05712890625, "learning_rate": 5.188061145932798e-07, "loss": 0.0369, "reward": 0.1458333358168602, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.1458333358168602, "rewards/format_reward": 0.0, "step": 1237 }, { "completion_length": 1281.0625, "epoch": 1.5097560975609756, "grad_norm": 0.28906285762786865, "kl": 0.082275390625, "learning_rate": 5.163934542154106e-07, "loss": 0.1319, "reward": 0.3958333432674408, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.3958333432674408, "rewards/format_reward": 0.0, "step": 1238 }, { "completion_length": 1602.1875610351562, "epoch": 1.5109756097560976, "grad_norm": 0.15002121031284332, "kl": 0.081787109375, "learning_rate": 5.139852498692916e-07, "loss": 0.0031, "reward": 0.1875000111758709, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.1875000111758709, "rewards/format_reward": 0.0, "step": 1239 }, { "completion_length": 1315.729248046875, "epoch": 1.5121951219512195, "grad_norm": 0.34209316968917847, "kl": 0.08349609375, "learning_rate": 5.115815124648103e-07, "loss": 0.0804, "reward": 0.1875000074505806, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.1875000074505806, "rewards/format_reward": 0.0, "step": 1240 }, { "completion_length": 1814.6666870117188, "epoch": 1.5134146341463415, "grad_norm": 0.26898542046546936, "kl": 0.0692138671875, "learning_rate": 5.091822528916151e-07, "loss": 0.0577, "reward": 0.10416666977107525, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.10416666977107525, "rewards/format_reward": 0.0, "step": 1241 }, { "completion_length": 1288.3541870117188, "epoch": 1.5146341463414634, "grad_norm": 0.12481115758419037, "kl": 0.052978515625, "learning_rate": 5.067874820190684e-07, "loss": -0.0081, "reward": 0.1666666716337204, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.1666666716337204, "rewards/format_reward": 0.0, "step": 1242 }, { "completion_length": 2024.8959350585938, "epoch": 1.5158536585365854, "grad_norm": 0.1549234539270401, "kl": 0.08349609375, "learning_rate": 5.043972106961996e-07, "loss": 0.0026, "reward": 0.06250000186264515, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.06250000186264515, "rewards/format_reward": 0.0, "step": 1243 }, { "completion_length": 1471.3958740234375, "epoch": 1.5170731707317073, "grad_norm": 0.26823341846466064, "kl": 0.089111328125, "learning_rate": 5.020114497516521e-07, "loss": 0.0917, "reward": 0.1666666716337204, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.1666666716337204, "rewards/format_reward": 0.0, "step": 1244 }, { "completion_length": 1431.0625610351562, "epoch": 1.5182926829268293, "grad_norm": 0.3262825608253479, "kl": 0.091796875, "learning_rate": 4.996302099936363e-07, "loss": 0.1227, "reward": 0.3541666716337204, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.3541666716337204, "rewards/format_reward": 0.0, "step": 1245 }, { "completion_length": 1459.8958740234375, "epoch": 1.5195121951219512, "grad_norm": 0.21038717031478882, "kl": 0.063720703125, "learning_rate": 4.972535022098795e-07, "loss": 0.0604, "reward": 0.2083333358168602, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.2083333358168602, "rewards/format_reward": 0.0, "step": 1246 }, { "completion_length": 1371.8333740234375, "epoch": 1.5207317073170732, "grad_norm": 0.23414063453674316, "kl": 0.056640625, "learning_rate": 4.948813371675798e-07, "loss": 0.0486, "reward": 0.2083333358168602, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.2083333358168602, "rewards/format_reward": 0.0, "step": 1247 }, { "completion_length": 1960.2500610351562, "epoch": 1.5219512195121951, "grad_norm": 0.11400946974754333, "kl": 0.083251953125, "learning_rate": 4.925137256133533e-07, "loss": 0.0431, "reward": 0.1666666679084301, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.1666666679084301, "rewards/format_reward": 0.0, "step": 1248 }, { "completion_length": 1651.6875, "epoch": 1.523170731707317, "grad_norm": 0.31756478548049927, "kl": 0.07373046875, "learning_rate": 4.901506782731888e-07, "loss": 0.1186, "reward": 0.1458333358168602, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.1458333358168602, "rewards/format_reward": 0.0, "step": 1249 }, { "completion_length": 1899.0000610351562, "epoch": 1.524390243902439, "grad_norm": 0.15580201148986816, "kl": 0.078125, "learning_rate": 4.877922058523971e-07, "loss": 0.0679, "reward": 0.1041666679084301, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.1041666679084301, "rewards/format_reward": 0.0, "step": 1250 }, { "completion_length": 1268.2708435058594, "epoch": 1.525609756097561, "grad_norm": 0.16028733551502228, "kl": 0.08984375, "learning_rate": 4.854383190355629e-07, "loss": 0.0519, "reward": 0.3541666716337204, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.3541666716337204, "rewards/format_reward": 0.0, "step": 1251 }, { "completion_length": 1780.6458740234375, "epoch": 1.526829268292683, "grad_norm": 0.1254788488149643, "kl": 0.07421875, "learning_rate": 4.830890284864985e-07, "loss": 0.0418, "reward": 0.0416666679084301, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.0416666679084301, "rewards/format_reward": 0.0, "step": 1252 }, { "completion_length": 2062.0001220703125, "epoch": 1.528048780487805, "grad_norm": 0.24724438786506653, "kl": 0.082275390625, "learning_rate": 4.807443448481917e-07, "loss": 0.1066, "reward": 0.10416666977107525, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.10416666977107525, "rewards/format_reward": 0.0, "step": 1253 }, { "completion_length": 2147.104248046875, "epoch": 1.5292682926829269, "grad_norm": 0.19106018543243408, "kl": 0.09326171875, "learning_rate": 4.784042787427605e-07, "loss": 0.0047, "reward": 0.1041666679084301, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.1041666679084301, "rewards/format_reward": 0.0, "step": 1254 }, { "completion_length": 1542.1458740234375, "epoch": 1.5304878048780488, "grad_norm": 0.1542035937309265, "kl": 0.0703125, "learning_rate": 4.7606884077140373e-07, "loss": 0.0042, "reward": 0.08333333395421505, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.08333333395421505, "rewards/format_reward": 0.0, "step": 1255 }, { "completion_length": 1452.166748046875, "epoch": 1.5317073170731708, "grad_norm": 0.5524031519889832, "kl": 0.1025390625, "learning_rate": 4.7373804151435456e-07, "loss": 0.17, "reward": 0.2291666716337204, "reward_std": 0.21650634706020355, "rewards/accuracy_reward": 0.2291666716337204, "rewards/format_reward": 0.0, "step": 1256 }, { "completion_length": 1521.3750610351562, "epoch": 1.5329268292682927, "grad_norm": 0.9722011685371399, "kl": 0.089111328125, "learning_rate": 4.714118915308296e-07, "loss": 0.0288, "reward": 0.3333333432674408, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.3333333432674408, "rewards/format_reward": 0.0, "step": 1257 }, { "completion_length": 1524.5625, "epoch": 1.5341463414634147, "grad_norm": 15.484407424926758, "kl": 0.2685546875, "learning_rate": 4.6909040135898463e-07, "loss": -0.0102, "reward": 0.25, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.0, "step": 1258 }, { "completion_length": 1956.5834350585938, "epoch": 1.5353658536585366, "grad_norm": 0.2369980812072754, "kl": 0.119140625, "learning_rate": 4.6677358151586393e-07, "loss": 0.1139, "reward": 0.25, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.0, "step": 1259 }, { "completion_length": 1552.479248046875, "epoch": 1.5365853658536586, "grad_norm": 0.42772334814071655, "kl": 0.0654296875, "learning_rate": 4.6446144249735345e-07, "loss": 0.0628, "reward": 0.3541666865348816, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.3541666865348816, "rewards/format_reward": 0.0, "step": 1260 }, { "completion_length": 1963.6666870117188, "epoch": 1.5378048780487805, "grad_norm": 0.8592334985733032, "kl": 0.11865234375, "learning_rate": 4.6215399477813553e-07, "loss": 0.0728, "reward": 0.12500000558793545, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.12500000558793545, "rewards/format_reward": 0.0, "step": 1261 }, { "completion_length": 1787.3333740234375, "epoch": 1.5390243902439025, "grad_norm": 0.28327101469039917, "kl": 0.089599609375, "learning_rate": 4.598512488116376e-07, "loss": 0.0938, "reward": 0.229166679084301, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.229166679084301, "rewards/format_reward": 0.0, "step": 1262 }, { "completion_length": 2187.0001220703125, "epoch": 1.5402439024390244, "grad_norm": 0.2931613028049469, "kl": 0.128662109375, "learning_rate": 4.5755321502998733e-07, "loss": 0.0434, "reward": 0.06250000186264515, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.06250000186264515, "rewards/format_reward": 0.0, "step": 1263 }, { "completion_length": 2151.3333740234375, "epoch": 1.5414634146341464, "grad_norm": 0.35806915163993835, "kl": 0.1337890625, "learning_rate": 4.552599038439651e-07, "loss": 0.058, "reward": 0.10416666977107525, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.10416666977107525, "rewards/format_reward": 0.0, "step": 1264 }, { "completion_length": 1823.7916870117188, "epoch": 1.5426829268292683, "grad_norm": 0.36212384700775146, "kl": 0.10595703125, "learning_rate": 4.529713256429556e-07, "loss": 0.0336, "reward": 0.2291666716337204, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.2291666716337204, "rewards/format_reward": 0.0, "step": 1265 }, { "completion_length": 1804.5625610351562, "epoch": 1.5439024390243903, "grad_norm": 0.4501515030860901, "kl": 0.12060546875, "learning_rate": 4.506874907949034e-07, "loss": 0.0778, "reward": 0.1458333358168602, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.1458333358168602, "rewards/format_reward": 0.0, "step": 1266 }, { "completion_length": 2046.9376220703125, "epoch": 1.5451219512195122, "grad_norm": 0.48809370398521423, "kl": 0.104248046875, "learning_rate": 4.484084096462623e-07, "loss": 0.0054, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "step": 1267 }, { "completion_length": 2497.1251220703125, "epoch": 1.5463414634146342, "grad_norm": 0.2089737206697464, "kl": 0.12841796875, "learning_rate": 4.461340925219522e-07, "loss": 0.0745, "reward": 0.0416666679084301, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.0416666679084301, "rewards/format_reward": 0.0, "step": 1268 }, { "completion_length": 1802.5625, "epoch": 1.5475609756097561, "grad_norm": 0.32952985167503357, "kl": 0.095947265625, "learning_rate": 4.438645497253088e-07, "loss": 0.0308, "reward": 0.12500000558793545, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.12500000558793545, "rewards/format_reward": 0.0, "step": 1269 }, { "completion_length": 2152.08349609375, "epoch": 1.548780487804878, "grad_norm": 0.15669040381908417, "kl": 0.087646484375, "learning_rate": 4.4159979153804064e-07, "loss": 0.0348, "reward": 0.1458333432674408, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.1458333432674408, "rewards/format_reward": 0.0, "step": 1270 }, { "completion_length": 1815.1458740234375, "epoch": 1.55, "grad_norm": 0.07843388617038727, "kl": 0.060302734375, "learning_rate": 4.3933982822017883e-07, "loss": 0.0028, "reward": 0.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.0, "step": 1271 }, { "completion_length": 1658.3541870117188, "epoch": 1.551219512195122, "grad_norm": 0.4068981409072876, "kl": 0.107421875, "learning_rate": 4.3708467001003305e-07, "loss": 0.1723, "reward": 0.2916666865348816, "reward_std": 0.21650634706020355, "rewards/accuracy_reward": 0.2916666865348816, "rewards/format_reward": 0.0, "step": 1272 }, { "completion_length": 2408.8958740234375, "epoch": 1.552439024390244, "grad_norm": 0.1417243331670761, "kl": 0.083984375, "learning_rate": 4.348343271241441e-07, "loss": 0.0212, "reward": 0.1458333432674408, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.1458333432674408, "rewards/format_reward": 0.0, "step": 1273 }, { "completion_length": 1865.2501220703125, "epoch": 1.553658536585366, "grad_norm": 0.19239379465579987, "kl": 0.083251953125, "learning_rate": 4.3258880975723777e-07, "loss": 0.0402, "reward": 0.2083333358168602, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.2083333358168602, "rewards/format_reward": 0.0, "step": 1274 }, { "completion_length": 1838.6250610351562, "epoch": 1.5548780487804879, "grad_norm": 0.17049676179885864, "kl": 0.08935546875, "learning_rate": 4.3034812808218017e-07, "loss": 0.0467, "reward": 0.2708333432674408, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.2708333432674408, "rewards/format_reward": 0.0, "step": 1275 }, { "completion_length": 1821.7083740234375, "epoch": 1.5560975609756098, "grad_norm": 0.2759491503238678, "kl": 0.069580078125, "learning_rate": 4.2811229224992807e-07, "loss": 0.0096, "reward": 0.1875, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.1875, "rewards/format_reward": 0.0, "step": 1276 }, { "completion_length": 2269.7708740234375, "epoch": 1.5573170731707318, "grad_norm": 0.13659512996673584, "kl": 0.07470703125, "learning_rate": 4.258813123894875e-07, "loss": 0.039, "reward": 0.229166679084301, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.229166679084301, "rewards/format_reward": 0.0, "step": 1277 }, { "completion_length": 2374.3751220703125, "epoch": 1.5585365853658537, "grad_norm": 0.13126373291015625, "kl": 0.098388671875, "learning_rate": 4.2365519860786316e-07, "loss": 0.0043, "reward": 0.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.0, "step": 1278 }, { "completion_length": 1776.9583740234375, "epoch": 1.5597560975609757, "grad_norm": 0.12836222350597382, "kl": 0.07763671875, "learning_rate": 4.2143396099001724e-07, "loss": 0.0629, "reward": 0.2083333358168602, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.2083333358168602, "rewards/format_reward": 0.0, "step": 1279 }, { "completion_length": 2061.3334350585938, "epoch": 1.5609756097560976, "grad_norm": 0.1382802575826645, "kl": 0.0751953125, "learning_rate": 4.192176095988196e-07, "loss": 0.0733, "reward": 0.2500000149011612, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.2500000149011612, "rewards/format_reward": 0.0, "step": 1280 }, { "completion_length": 1851.75, "epoch": 1.5621951219512196, "grad_norm": 0.2611844539642334, "kl": 0.099609375, "learning_rate": 4.170061544750048e-07, "loss": 0.1147, "reward": 0.1875000111758709, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.1875000111758709, "rewards/format_reward": 0.0, "step": 1281 }, { "completion_length": 1947.6875, "epoch": 1.5634146341463415, "grad_norm": 0.16254766285419464, "kl": 0.077880859375, "learning_rate": 4.147996056371258e-07, "loss": 0.0196, "reward": 0.2083333432674408, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.2083333432674408, "rewards/format_reward": 0.0, "step": 1282 }, { "completion_length": 1914.5000610351562, "epoch": 1.5646341463414632, "grad_norm": 0.27330905199050903, "kl": 0.0701904296875, "learning_rate": 4.1259797308150816e-07, "loss": 0.1112, "reward": 0.18750000558793545, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.18750000558793545, "rewards/format_reward": 0.0, "step": 1283 }, { "completion_length": 1781.5626220703125, "epoch": 1.5658536585365854, "grad_norm": 0.1506660282611847, "kl": 0.0751953125, "learning_rate": 4.1040126678220656e-07, "loss": 0.0674, "reward": 0.1458333358168602, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.1458333358168602, "rewards/format_reward": 0.0, "step": 1284 }, { "completion_length": 1732.7291870117188, "epoch": 1.5670731707317072, "grad_norm": 0.08717305213212967, "kl": 0.077880859375, "learning_rate": 4.0820949669095696e-07, "loss": 0.0595, "reward": 0.10416666977107525, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.10416666977107525, "rewards/format_reward": 0.0, "step": 1285 }, { "completion_length": 1650.729248046875, "epoch": 1.5682926829268293, "grad_norm": 0.1694628745317459, "kl": 0.07275390625, "learning_rate": 4.060226727371327e-07, "loss": 0.0572, "reward": 0.125, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "step": 1286 }, { "completion_length": 2443.375, "epoch": 1.569512195121951, "grad_norm": 0.13245034217834473, "kl": 0.07421875, "learning_rate": 4.038408048277009e-07, "loss": 0.0077, "reward": 0.08333333395421505, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.08333333395421505, "rewards/format_reward": 0.0, "step": 1287 }, { "completion_length": 1480.1250610351562, "epoch": 1.5707317073170732, "grad_norm": 0.2667634189128876, "kl": 0.0689697265625, "learning_rate": 4.0166390284717475e-07, "loss": 0.0035, "reward": 0.10416666977107525, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.10416666977107525, "rewards/format_reward": 0.0, "step": 1288 }, { "completion_length": 1399.375, "epoch": 1.571951219512195, "grad_norm": 0.08013737946748734, "kl": 0.064208984375, "learning_rate": 3.994919766575722e-07, "loss": 0.0436, "reward": 0.1458333358168602, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.1458333358168602, "rewards/format_reward": 0.0, "step": 1289 }, { "completion_length": 2364.45849609375, "epoch": 1.5731707317073171, "grad_norm": 0.1908876597881317, "kl": 0.086181640625, "learning_rate": 3.973250360983677e-07, "loss": 0.1474, "reward": 0.1666666716337204, "reward_std": 0.18042195588350296, "rewards/accuracy_reward": 0.1666666716337204, "rewards/format_reward": 0.0, "step": 1290 }, { "completion_length": 1783.4584350585938, "epoch": 1.5743902439024389, "grad_norm": 0.20513461530208588, "kl": 0.066162109375, "learning_rate": 3.9516309098645e-07, "loss": 0.0531, "reward": 0.2083333432674408, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.2083333432674408, "rewards/format_reward": 0.0, "step": 1291 }, { "completion_length": 2203.041748046875, "epoch": 1.575609756097561, "grad_norm": 0.11043775081634521, "kl": 0.075927734375, "learning_rate": 3.930061511160762e-07, "loss": 0.0036, "reward": 0.0625, "reward_std": 0.0, "rewards/accuracy_reward": 0.0625, "rewards/format_reward": 0.0, "step": 1292 }, { "completion_length": 1855.4583740234375, "epoch": 1.5768292682926828, "grad_norm": 0.5188458561897278, "kl": 0.09521484375, "learning_rate": 3.9085422625882983e-07, "loss": 0.0227, "reward": 0.2083333358168602, "reward_std": 0.18042195588350296, "rewards/accuracy_reward": 0.2083333358168602, "rewards/format_reward": 0.0, "step": 1293 }, { "completion_length": 1604.1250610351562, "epoch": 1.578048780487805, "grad_norm": 0.4142929017543793, "kl": 0.06591796875, "learning_rate": 3.8870732616357364e-07, "loss": 0.0242, "reward": 0.2708333432674408, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.2708333432674408, "rewards/format_reward": 0.0, "step": 1294 }, { "completion_length": 2135.354248046875, "epoch": 1.5792682926829267, "grad_norm": 0.09533374756574631, "kl": 0.093505859375, "learning_rate": 3.865654605564065e-07, "loss": 0.0268, "reward": 0.2083333432674408, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.2083333432674408, "rewards/format_reward": 0.0, "step": 1295 }, { "completion_length": 1725.541748046875, "epoch": 1.5804878048780489, "grad_norm": 0.21292603015899658, "kl": 0.0870361328125, "learning_rate": 3.8442863914062065e-07, "loss": 0.0658, "reward": 0.25, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.0, "step": 1296 }, { "completion_length": 1563.729248046875, "epoch": 1.5817073170731706, "grad_norm": 0.3118447959423065, "kl": 0.085693359375, "learning_rate": 3.822968715966555e-07, "loss": 0.0759, "reward": 0.3125000149011612, "reward_std": 0.18042195588350296, "rewards/accuracy_reward": 0.3125000149011612, "rewards/format_reward": 0.0, "step": 1297 }, { "completion_length": 2086.5625610351562, "epoch": 1.5829268292682928, "grad_norm": 0.11957163363695145, "kl": 0.0791015625, "learning_rate": 3.8017016758205597e-07, "loss": 0.0337, "reward": 0.06250000186264515, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.06250000186264515, "rewards/format_reward": 0.0, "step": 1298 }, { "completion_length": 1858.4583740234375, "epoch": 1.5841463414634145, "grad_norm": 0.18817508220672607, "kl": 0.065185546875, "learning_rate": 3.7804853673142704e-07, "loss": 0.0073, "reward": 0.1458333395421505, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.1458333395421505, "rewards/format_reward": 0.0, "step": 1299 }, { "completion_length": 1629.1250610351562, "epoch": 1.5853658536585367, "grad_norm": 0.19607806205749512, "kl": 0.07275390625, "learning_rate": 3.759319886563905e-07, "loss": 0.1136, "reward": 0.125, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "step": 1300 }, { "completion_length": 1442.7083740234375, "epoch": 1.5865853658536584, "grad_norm": 0.19717496633529663, "kl": 0.0703125, "learning_rate": 3.7382053294554163e-07, "loss": 0.0807, "reward": 0.1666666716337204, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.1666666716337204, "rewards/format_reward": 0.0, "step": 1301 }, { "completion_length": 1990.125, "epoch": 1.5878048780487806, "grad_norm": 0.09625036269426346, "kl": 0.0687255859375, "learning_rate": 3.7171417916440714e-07, "loss": 0.003, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "step": 1302 }, { "completion_length": 1855.375, "epoch": 1.5890243902439023, "grad_norm": 0.27812615036964417, "kl": 0.08740234375, "learning_rate": 3.696129368553989e-07, "loss": 0.1147, "reward": 0.1458333358168602, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.1458333358168602, "rewards/format_reward": 0.0, "step": 1303 }, { "completion_length": 1589.416748046875, "epoch": 1.5902439024390245, "grad_norm": 0.1062350794672966, "kl": 0.061279296875, "learning_rate": 3.6751681553777236e-07, "loss": 0.0372, "reward": 0.18750000558793545, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.18750000558793545, "rewards/format_reward": 0.0, "step": 1304 }, { "completion_length": 1445.2916870117188, "epoch": 1.5914634146341462, "grad_norm": 0.33055344223976135, "kl": 0.0599365234375, "learning_rate": 3.6542582470758496e-07, "loss": 0.0431, "reward": 0.1875, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.1875, "rewards/format_reward": 0.0, "step": 1305 }, { "completion_length": 1766.7291870117188, "epoch": 1.5926829268292684, "grad_norm": 0.26322421431541443, "kl": 0.076171875, "learning_rate": 3.633399738376491e-07, "loss": 0.1547, "reward": 0.229166679084301, "reward_std": 0.18042195588350296, "rewards/accuracy_reward": 0.229166679084301, "rewards/format_reward": 0.0, "step": 1306 }, { "completion_length": 2099.9166870117188, "epoch": 1.59390243902439, "grad_norm": 0.08777954429388046, "kl": 0.07275390625, "learning_rate": 3.6125927237749416e-07, "loss": 0.0402, "reward": 0.16666667722165585, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.16666667722165585, "rewards/format_reward": 0.0, "step": 1307 }, { "completion_length": 1679.8541870117188, "epoch": 1.5951219512195123, "grad_norm": 0.1683105230331421, "kl": 0.08154296875, "learning_rate": 3.5918372975331933e-07, "loss": 0.0281, "reward": 0.08333333395421505, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.08333333395421505, "rewards/format_reward": 0.0, "step": 1308 }, { "completion_length": 1457.4375610351562, "epoch": 1.596341463414634, "grad_norm": 0.08131491392850876, "kl": 0.06689453125, "learning_rate": 3.57113355367953e-07, "loss": 0.003, "reward": 0.0625, "reward_std": 0.0, "rewards/accuracy_reward": 0.0625, "rewards/format_reward": 0.0, "step": 1309 }, { "completion_length": 1978.2916870117188, "epoch": 1.5975609756097562, "grad_norm": 0.09362249076366425, "kl": 0.093994140625, "learning_rate": 3.5504815860081056e-07, "loss": 0.0534, "reward": 0.125, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "step": 1310 }, { "completion_length": 1130.2708740234375, "epoch": 1.598780487804878, "grad_norm": 0.35080069303512573, "kl": 0.080078125, "learning_rate": 3.5298814880785015e-07, "loss": 0.1295, "reward": 0.2083333358168602, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.2083333358168602, "rewards/format_reward": 0.0, "step": 1311 }, { "completion_length": 1512.3541870117188, "epoch": 1.6, "grad_norm": 0.12739229202270508, "kl": 0.0587158203125, "learning_rate": 3.5093333532153313e-07, "loss": -0.0104, "reward": 0.2916666716337204, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.2916666716337204, "rewards/format_reward": 0.0, "step": 1312 }, { "completion_length": 1944.229248046875, "epoch": 1.6012195121951218, "grad_norm": 0.15987765789031982, "kl": 0.0771484375, "learning_rate": 3.4888372745077845e-07, "loss": 0.0525, "reward": 0.0833333358168602, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.0833333358168602, "rewards/format_reward": 0.0, "step": 1313 }, { "completion_length": 1672.75, "epoch": 1.602439024390244, "grad_norm": 0.19417889416217804, "kl": 0.06640625, "learning_rate": 3.468393344809222e-07, "loss": -0.0316, "reward": 0.1041666716337204, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.1041666716337204, "rewards/format_reward": 0.0, "step": 1314 }, { "completion_length": 2005.0833740234375, "epoch": 1.6036585365853657, "grad_norm": 0.31984251737594604, "kl": 0.0859375, "learning_rate": 3.448001656736763e-07, "loss": 0.145, "reward": 0.2083333432674408, "reward_std": 0.21650634706020355, "rewards/accuracy_reward": 0.2083333432674408, "rewards/format_reward": 0.0, "step": 1315 }, { "completion_length": 1590.5000610351562, "epoch": 1.604878048780488, "grad_norm": 0.13055923581123352, "kl": 0.067138671875, "learning_rate": 3.4276623026708556e-07, "loss": 0.0688, "reward": 0.25000000558793545, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.25000000558793545, "rewards/format_reward": 0.0, "step": 1316 }, { "completion_length": 1840.416748046875, "epoch": 1.6060975609756096, "grad_norm": 0.09148050844669342, "kl": 0.0888671875, "learning_rate": 3.4073753747548494e-07, "loss": 0.0033, "reward": 0.1875, "reward_std": 0.0, "rewards/accuracy_reward": 0.1875, "rewards/format_reward": 0.0, "step": 1317 }, { "completion_length": 2105.8750610351562, "epoch": 1.6073170731707318, "grad_norm": 0.11954900622367859, "kl": 0.070068359375, "learning_rate": 3.3871409648945955e-07, "loss": 0.0662, "reward": 0.18750000558793545, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.18750000558793545, "rewards/format_reward": 0.0, "step": 1318 }, { "completion_length": 2033.291748046875, "epoch": 1.6085365853658535, "grad_norm": 0.10976627469062805, "kl": 0.06787109375, "learning_rate": 3.3669591647580196e-07, "loss": 0.0166, "reward": 0.02083333395421505, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.02083333395421505, "rewards/format_reward": 0.0, "step": 1319 }, { "completion_length": 1257.2500610351562, "epoch": 1.6097560975609757, "grad_norm": 0.2053261250257492, "kl": 0.0587158203125, "learning_rate": 3.346830065774706e-07, "loss": 0.0108, "reward": 0.2083333358168602, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.2083333358168602, "rewards/format_reward": 0.0, "step": 1320 }, { "completion_length": 1712.1458740234375, "epoch": 1.6109756097560974, "grad_norm": 0.33659738302230835, "kl": 0.0859375, "learning_rate": 3.326753759135503e-07, "loss": 0.0603, "reward": 0.06250000186264515, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.06250000186264515, "rewards/format_reward": 0.0, "step": 1321 }, { "completion_length": 1375.5208740234375, "epoch": 1.6121951219512196, "grad_norm": 0.21279148757457733, "kl": 0.069580078125, "learning_rate": 3.306730335792075e-07, "loss": 0.1013, "reward": 0.1458333358168602, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.1458333358168602, "rewards/format_reward": 0.0, "step": 1322 }, { "completion_length": 1412.5209350585938, "epoch": 1.6134146341463413, "grad_norm": 0.33809521794319153, "kl": 0.070068359375, "learning_rate": 3.286759886456513e-07, "loss": 0.106, "reward": 0.3333333432674408, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.3333333432674408, "rewards/format_reward": 0.0, "step": 1323 }, { "completion_length": 1923.604248046875, "epoch": 1.6146341463414635, "grad_norm": 0.1110396608710289, "kl": 0.07275390625, "learning_rate": 3.266842501600934e-07, "loss": 0.0234, "reward": 0.0416666679084301, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.0416666679084301, "rewards/format_reward": 0.0, "step": 1324 }, { "completion_length": 2032.5208740234375, "epoch": 1.6158536585365852, "grad_norm": 0.11168409138917923, "kl": 0.066162109375, "learning_rate": 3.2469782714570374e-07, "loss": 0.0541, "reward": 0.125, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "step": 1325 }, { "completion_length": 2077.104248046875, "epoch": 1.6170731707317074, "grad_norm": 0.06876803934574127, "kl": 0.0601806640625, "learning_rate": 3.2271672860157324e-07, "loss": 0.0288, "reward": 0.02083333395421505, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.02083333395421505, "rewards/format_reward": 0.0, "step": 1326 }, { "completion_length": 2052.6458740234375, "epoch": 1.6182926829268292, "grad_norm": 0.17562814056873322, "kl": 0.067626953125, "learning_rate": 3.207409635026704e-07, "loss": 0.0582, "reward": 0.18750000558793545, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.18750000558793545, "rewards/format_reward": 0.0, "step": 1327 }, { "completion_length": 2052.104248046875, "epoch": 1.6195121951219513, "grad_norm": 0.10074646025896072, "kl": 0.06787109375, "learning_rate": 3.187705407998018e-07, "loss": 0.0032, "reward": 0.1875, "reward_std": 0.0, "rewards/accuracy_reward": 0.1875, "rewards/format_reward": 0.0, "step": 1328 }, { "completion_length": 2231.9583740234375, "epoch": 1.620731707317073, "grad_norm": 0.07671454548835754, "kl": 0.0751953125, "learning_rate": 3.16805469419571e-07, "loss": 0.0155, "reward": 0.2083333432674408, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.2083333432674408, "rewards/format_reward": 0.0, "step": 1329 }, { "completion_length": 1601.5208740234375, "epoch": 1.6219512195121952, "grad_norm": 0.3808484673500061, "kl": 0.0751953125, "learning_rate": 3.148457582643398e-07, "loss": 0.0145, "reward": 0.229166679084301, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.229166679084301, "rewards/format_reward": 0.0, "step": 1330 }, { "completion_length": 1464.5, "epoch": 1.623170731707317, "grad_norm": 0.20483596622943878, "kl": 0.0501708984375, "learning_rate": 3.1289141621218513e-07, "loss": 0.0937, "reward": 0.2708333432674408, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.2708333432674408, "rewards/format_reward": 0.0, "step": 1331 }, { "completion_length": 2180.9791870117188, "epoch": 1.6243902439024391, "grad_norm": 0.1423572152853012, "kl": 0.07470703125, "learning_rate": 3.1094245211686106e-07, "loss": -0.0091, "reward": 0.1666666716337204, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.1666666716337204, "rewards/format_reward": 0.0, "step": 1332 }, { "completion_length": 1220.6458740234375, "epoch": 1.6256097560975609, "grad_norm": 0.16833433508872986, "kl": 0.0670166015625, "learning_rate": 3.089988748077572e-07, "loss": 0.0054, "reward": 0.2916666716337204, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.2916666716337204, "rewards/format_reward": 0.0, "step": 1333 }, { "completion_length": 1570.041748046875, "epoch": 1.626829268292683, "grad_norm": 0.17097003757953644, "kl": 0.07470703125, "learning_rate": 3.070606930898602e-07, "loss": 0.0818, "reward": 0.1875000074505806, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.1875000074505806, "rewards/format_reward": 0.0, "step": 1334 }, { "completion_length": 1679.875, "epoch": 1.6280487804878048, "grad_norm": 0.1299789994955063, "kl": 0.071533203125, "learning_rate": 3.051279157437132e-07, "loss": 0.0619, "reward": 0.06250000186264515, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.06250000186264515, "rewards/format_reward": 0.0, "step": 1335 }, { "completion_length": 2081.7916870117188, "epoch": 1.629268292682927, "grad_norm": 0.18279516696929932, "kl": 0.0780029296875, "learning_rate": 3.032005515253751e-07, "loss": 0.0154, "reward": 0.1041666716337204, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.1041666716337204, "rewards/format_reward": 0.0, "step": 1336 }, { "completion_length": 2069.354248046875, "epoch": 1.6304878048780487, "grad_norm": 0.11863919347524643, "kl": 0.071044921875, "learning_rate": 3.0127860916638204e-07, "loss": 0.0801, "reward": 0.1666666716337204, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.1666666716337204, "rewards/format_reward": 0.0, "step": 1337 }, { "completion_length": 1561.5416870117188, "epoch": 1.6317073170731708, "grad_norm": 0.2195342481136322, "kl": 0.056396484375, "learning_rate": 2.9936209737370727e-07, "loss": 0.091, "reward": 0.1458333432674408, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.1458333432674408, "rewards/format_reward": 0.0, "step": 1338 }, { "completion_length": 1686.0416870117188, "epoch": 1.6329268292682926, "grad_norm": 0.20518018305301666, "kl": 0.068359375, "learning_rate": 2.97451024829723e-07, "loss": 0.1161, "reward": 0.2500000149011612, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.2500000149011612, "rewards/format_reward": 0.0, "step": 1339 }, { "completion_length": 1740.8125610351562, "epoch": 1.6341463414634148, "grad_norm": 0.060849081724882126, "kl": 0.0596923828125, "learning_rate": 2.955454001921588e-07, "loss": 0.0023, "reward": 0.125, "reward_std": 0.0, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "step": 1340 }, { "completion_length": 1303.4166870117188, "epoch": 1.6353658536585365, "grad_norm": 0.12662336230278015, "kl": 0.061279296875, "learning_rate": 2.9364523209406423e-07, "loss": 0.0082, "reward": 0.1458333358168602, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.1458333358168602, "rewards/format_reward": 0.0, "step": 1341 }, { "completion_length": 1604.729248046875, "epoch": 1.6365853658536587, "grad_norm": 0.12785404920578003, "kl": 0.0673828125, "learning_rate": 2.917505291437683e-07, "loss": 0.0474, "reward": 0.20833333395421505, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.20833333395421505, "rewards/format_reward": 0.0, "step": 1342 }, { "completion_length": 1413.0000610351562, "epoch": 1.6378048780487804, "grad_norm": 0.3041352927684784, "kl": 0.07421875, "learning_rate": 2.8986129992484254e-07, "loss": 0.0979, "reward": 0.1250000037252903, "reward_std": 0.18042195588350296, "rewards/accuracy_reward": 0.1250000037252903, "rewards/format_reward": 0.0, "step": 1343 }, { "completion_length": 1470.7916870117188, "epoch": 1.6390243902439026, "grad_norm": 0.14769724011421204, "kl": 0.0565185546875, "learning_rate": 2.879775529960603e-07, "loss": 0.055, "reward": 0.1458333358168602, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.1458333358168602, "rewards/format_reward": 0.0, "step": 1344 }, { "completion_length": 1368.0208740234375, "epoch": 1.6402439024390243, "grad_norm": 0.09624624252319336, "kl": 0.08203125, "learning_rate": 2.8609929689135833e-07, "loss": 0.0026, "reward": 0.1666666679084301, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.1666666679084301, "rewards/format_reward": 0.0, "step": 1345 }, { "completion_length": 1488.4375, "epoch": 1.6414634146341465, "grad_norm": 0.2585929334163666, "kl": 0.06787109375, "learning_rate": 2.842265401197982e-07, "loss": -0.0043, "reward": 0.1250000037252903, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.1250000037252903, "rewards/format_reward": 0.0, "step": 1346 }, { "completion_length": 1793.6875610351562, "epoch": 1.6426829268292682, "grad_norm": 0.19037379324436188, "kl": 0.0550537109375, "learning_rate": 2.82359291165528e-07, "loss": -0.009, "reward": 0.2708333432674408, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.2708333432674408, "rewards/format_reward": 0.0, "step": 1347 }, { "completion_length": 1599.5416870117188, "epoch": 1.6439024390243904, "grad_norm": 0.25295957922935486, "kl": 0.0643310546875, "learning_rate": 2.8049755848774337e-07, "loss": 0.1249, "reward": 0.1666666716337204, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.1666666716337204, "rewards/format_reward": 0.0, "step": 1348 }, { "completion_length": 1571.2708740234375, "epoch": 1.645121951219512, "grad_norm": 0.06177343800663948, "kl": 0.055908203125, "learning_rate": 2.78641350520651e-07, "loss": 0.0024, "reward": 0.1875, "reward_std": 0.0, "rewards/accuracy_reward": 0.1875, "rewards/format_reward": 0.0, "step": 1349 }, { "completion_length": 1664.5833740234375, "epoch": 1.6463414634146343, "grad_norm": 0.182712584733963, "kl": 0.0574951171875, "learning_rate": 2.7679067567342766e-07, "loss": 0.0327, "reward": 0.08333333395421505, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.08333333395421505, "rewards/format_reward": 0.0, "step": 1350 }, { "completion_length": 1914.6875610351562, "epoch": 1.647560975609756, "grad_norm": 0.16004031896591187, "kl": 0.063720703125, "learning_rate": 2.749455423301829e-07, "loss": 0.0855, "reward": 0.3125, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.3125, "rewards/format_reward": 0.0, "step": 1351 }, { "completion_length": 1573.854248046875, "epoch": 1.6487804878048782, "grad_norm": 0.1886346936225891, "kl": 0.06884765625, "learning_rate": 2.7310595884992354e-07, "loss": 0.0758, "reward": 0.2083333432674408, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.2083333432674408, "rewards/format_reward": 0.0, "step": 1352 }, { "completion_length": 1584.7083740234375, "epoch": 1.65, "grad_norm": 0.26959505677223206, "kl": 0.06689453125, "learning_rate": 2.7127193356651214e-07, "loss": 0.1297, "reward": 0.2291666716337204, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.2291666716337204, "rewards/format_reward": 0.0, "step": 1353 }, { "completion_length": 1470.2708740234375, "epoch": 1.651219512195122, "grad_norm": 0.1921737939119339, "kl": 0.0556640625, "learning_rate": 2.6944347478863226e-07, "loss": 0.072, "reward": 0.2083333432674408, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.2083333432674408, "rewards/format_reward": 0.0, "step": 1354 }, { "completion_length": 1409.0833740234375, "epoch": 1.6524390243902438, "grad_norm": 0.21368928253650665, "kl": 0.072509765625, "learning_rate": 2.676205907997484e-07, "loss": 0.1336, "reward": 0.1666666716337204, "reward_std": 0.18042195588350296, "rewards/accuracy_reward": 0.1666666716337204, "rewards/format_reward": 0.0, "step": 1355 }, { "completion_length": 1239.6458740234375, "epoch": 1.653658536585366, "grad_norm": 0.2336946278810501, "kl": 0.07275390625, "learning_rate": 2.658032898580702e-07, "loss": 0.034, "reward": 0.1041666679084301, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.1041666679084301, "rewards/format_reward": 0.0, "step": 1356 }, { "completion_length": 1589.3125610351562, "epoch": 1.6548780487804877, "grad_norm": 0.08435340970754623, "kl": 0.06201171875, "learning_rate": 2.6399158019651364e-07, "loss": 0.034, "reward": 0.0833333358168602, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.0833333358168602, "rewards/format_reward": 0.0, "step": 1357 }, { "completion_length": 2195.3334350585938, "epoch": 1.65609756097561, "grad_norm": 0.07086175680160522, "kl": 0.0609130859375, "learning_rate": 2.621854700226663e-07, "loss": 0.0391, "reward": 0.02083333395421505, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.02083333395421505, "rewards/format_reward": 0.0, "step": 1358 }, { "completion_length": 1743.5001220703125, "epoch": 1.6573170731707316, "grad_norm": 0.22611019015312195, "kl": 0.080322265625, "learning_rate": 2.603849675187469e-07, "loss": 0.1591, "reward": 0.2500000074505806, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.2500000074505806, "rewards/format_reward": 0.0, "step": 1359 }, { "completion_length": 2180.8333740234375, "epoch": 1.6585365853658538, "grad_norm": 0.17102093994617462, "kl": 0.08642578125, "learning_rate": 2.5859008084156986e-07, "loss": 0.0859, "reward": 0.1250000037252903, "reward_std": 0.18042195588350296, "rewards/accuracy_reward": 0.1250000037252903, "rewards/format_reward": 0.0, "step": 1360 }, { "completion_length": 1721.6875610351562, "epoch": 1.6597560975609755, "grad_norm": 0.2011147439479828, "kl": 0.076171875, "learning_rate": 2.5680081812250825e-07, "loss": 0.1321, "reward": 0.125, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "step": 1361 }, { "completion_length": 2013.291748046875, "epoch": 1.6609756097560977, "grad_norm": 0.1909981518983841, "kl": 0.080322265625, "learning_rate": 2.5501718746745766e-07, "loss": 0.01, "reward": 0.14583333395421505, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.14583333395421505, "rewards/format_reward": 0.0, "step": 1362 }, { "completion_length": 2128.9583740234375, "epoch": 1.6621951219512194, "grad_norm": 0.1600262075662613, "kl": 0.084716796875, "learning_rate": 2.532391969567986e-07, "loss": 0.113, "reward": 0.1458333395421505, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.1458333395421505, "rewards/format_reward": 0.0, "step": 1363 }, { "completion_length": 2350.041748046875, "epoch": 1.6634146341463416, "grad_norm": 0.13399024307727814, "kl": 0.0689697265625, "learning_rate": 2.514668546453592e-07, "loss": -0.0073, "reward": 0.1041666679084301, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.1041666679084301, "rewards/format_reward": 0.0, "step": 1364 }, { "completion_length": 1428.1041870117188, "epoch": 1.6646341463414633, "grad_norm": 0.1101190596818924, "kl": 0.0655517578125, "learning_rate": 2.497001685623802e-07, "loss": 0.0014, "reward": 0.1041666679084301, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.1041666679084301, "rewards/format_reward": 0.0, "step": 1365 }, { "completion_length": 1915.8333740234375, "epoch": 1.6658536585365855, "grad_norm": 0.09417035430669785, "kl": 0.064208984375, "learning_rate": 2.4793914671147745e-07, "loss": 0.0642, "reward": 0.1666666679084301, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.1666666679084301, "rewards/format_reward": 0.0, "step": 1366 }, { "completion_length": 1585.9375610351562, "epoch": 1.6670731707317072, "grad_norm": 0.3500385880470276, "kl": 0.080322265625, "learning_rate": 2.4618379707060703e-07, "loss": 0.2066, "reward": 0.2083333432674408, "reward_std": 0.25259073823690414, "rewards/accuracy_reward": 0.2083333432674408, "rewards/format_reward": 0.0, "step": 1367 }, { "completion_length": 1420.6875, "epoch": 1.6682926829268294, "grad_norm": 0.3389686048030853, "kl": 0.0673828125, "learning_rate": 2.4443412759202745e-07, "loss": 0.1864, "reward": 0.1875000074505806, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.1875000074505806, "rewards/format_reward": 0.0, "step": 1368 }, { "completion_length": 1500.8958740234375, "epoch": 1.6695121951219511, "grad_norm": 0.1681443303823471, "kl": 0.0609130859375, "learning_rate": 2.426901462022645e-07, "loss": 0.0197, "reward": 0.125, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "step": 1369 }, { "completion_length": 2384.0625, "epoch": 1.6707317073170733, "grad_norm": 0.2086997777223587, "kl": 0.082763671875, "learning_rate": 2.4095186080207505e-07, "loss": 0.1638, "reward": 0.1875000111758709, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.1875000111758709, "rewards/format_reward": 0.0, "step": 1370 }, { "completion_length": 1422.0000610351562, "epoch": 1.671951219512195, "grad_norm": 0.1464904397726059, "kl": 0.0540771484375, "learning_rate": 2.392192792664121e-07, "loss": 0.0108, "reward": 0.16666667722165585, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.16666667722165585, "rewards/format_reward": 0.0, "step": 1371 }, { "completion_length": 1383.541748046875, "epoch": 1.6731707317073172, "grad_norm": 0.21518297493457794, "kl": 0.076171875, "learning_rate": 2.3749240944438845e-07, "loss": 0.1397, "reward": 0.1666666679084301, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.1666666679084301, "rewards/format_reward": 0.0, "step": 1372 }, { "completion_length": 1569.5208740234375, "epoch": 1.674390243902439, "grad_norm": 0.1873239427804947, "kl": 0.075927734375, "learning_rate": 2.3577125915924004e-07, "loss": 0.0794, "reward": 0.2291666716337204, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.2291666716337204, "rewards/format_reward": 0.0, "step": 1373 }, { "completion_length": 2065.354248046875, "epoch": 1.6756097560975611, "grad_norm": 0.26460734009742737, "kl": 0.07568359375, "learning_rate": 2.3405583620829268e-07, "loss": 0.1802, "reward": 0.2291666679084301, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.2291666679084301, "rewards/format_reward": 0.0, "step": 1374 }, { "completion_length": 1586.354248046875, "epoch": 1.6768292682926829, "grad_norm": 0.2182699739933014, "kl": 0.064208984375, "learning_rate": 2.3234614836292462e-07, "loss": -0.0146, "reward": 0.2708333395421505, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.2708333395421505, "rewards/format_reward": 0.0, "step": 1375 }, { "completion_length": 1304.9583740234375, "epoch": 1.678048780487805, "grad_norm": 0.06871350854635239, "kl": 0.074462890625, "learning_rate": 2.3064220336853398e-07, "loss": 0.0027, "reward": 0.125, "reward_std": 0.0, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "step": 1376 }, { "completion_length": 2058.5208740234375, "epoch": 1.6792682926829268, "grad_norm": 0.1673513799905777, "kl": 0.0849609375, "learning_rate": 2.289440089445004e-07, "loss": 0.0355, "reward": 0.1458333395421505, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.1458333395421505, "rewards/format_reward": 0.0, "step": 1377 }, { "completion_length": 1487.5208740234375, "epoch": 1.680487804878049, "grad_norm": 0.27194637060165405, "kl": 0.08837890625, "learning_rate": 2.272515727841527e-07, "loss": 0.1337, "reward": 0.12500000558793545, "reward_std": 0.21650633960962296, "rewards/accuracy_reward": 0.12500000558793545, "rewards/format_reward": 0.0, "step": 1378 }, { "completion_length": 1535.875, "epoch": 1.6817073170731707, "grad_norm": 0.3196185231208801, "kl": 0.075439453125, "learning_rate": 2.2556490255473205e-07, "loss": 0.1581, "reward": 0.3333333432674408, "reward_std": 0.21650634706020355, "rewards/accuracy_reward": 0.3333333432674408, "rewards/format_reward": 0.0, "step": 1379 }, { "completion_length": 1874.7916870117188, "epoch": 1.6829268292682928, "grad_norm": 0.19186078011989594, "kl": 0.0791015625, "learning_rate": 2.2388400589735985e-07, "loss": -0.0233, "reward": 0.125, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "step": 1380 }, { "completion_length": 1360.625, "epoch": 1.6841463414634146, "grad_norm": 0.30192944407463074, "kl": 0.0596923828125, "learning_rate": 2.2220889042699976e-07, "loss": 0.1235, "reward": 0.2500000074505806, "reward_std": 0.14433757215738297, "rewards/accuracy_reward": 0.2500000074505806, "rewards/format_reward": 0.0, "step": 1381 }, { "completion_length": 1521.1875, "epoch": 1.6853658536585368, "grad_norm": 0.17477788031101227, "kl": 0.067626953125, "learning_rate": 2.205395637324264e-07, "loss": 0.0009, "reward": 0.1041666716337204, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.1041666716337204, "rewards/format_reward": 0.0, "step": 1382 }, { "completion_length": 1709.6041870117188, "epoch": 1.6865853658536585, "grad_norm": 0.17048640549182892, "kl": 0.07373046875, "learning_rate": 2.188760333761885e-07, "loss": 0.0503, "reward": 0.1666666716337204, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.1666666716337204, "rewards/format_reward": 0.0, "step": 1383 }, { "completion_length": 1321.5625610351562, "epoch": 1.6878048780487804, "grad_norm": 0.26942193508148193, "kl": 0.074462890625, "learning_rate": 2.1721830689457583e-07, "loss": 0.0996, "reward": 0.0416666679084301, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.0416666679084301, "rewards/format_reward": 0.0, "step": 1384 }, { "completion_length": 1709.0626220703125, "epoch": 1.6890243902439024, "grad_norm": 0.5606319904327393, "kl": 0.078857421875, "learning_rate": 2.1556639179758502e-07, "loss": 0.0053, "reward": 0.2291666716337204, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.2291666716337204, "rewards/format_reward": 0.0, "step": 1385 }, { "completion_length": 1391.9166870117188, "epoch": 1.6902439024390243, "grad_norm": 0.08258421719074249, "kl": 0.062255859375, "learning_rate": 2.1392029556888576e-07, "loss": 0.0413, "reward": 0.0833333358168602, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.0833333358168602, "rewards/format_reward": 0.0, "step": 1386 }, { "completion_length": 1680.3333740234375, "epoch": 1.6914634146341463, "grad_norm": 0.2426934689283371, "kl": 0.082275390625, "learning_rate": 2.1228002566578598e-07, "loss": 0.0809, "reward": 0.125, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "step": 1387 }, { "completion_length": 1246.4791870117188, "epoch": 1.6926829268292682, "grad_norm": 0.2584293484687805, "kl": 0.0692138671875, "learning_rate": 2.1064558951919854e-07, "loss": 0.0059, "reward": 0.229166679084301, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.229166679084301, "rewards/format_reward": 0.0, "step": 1388 }, { "completion_length": 1518.4583740234375, "epoch": 1.6939024390243902, "grad_norm": 0.3332042694091797, "kl": 0.095947265625, "learning_rate": 2.0901699453360784e-07, "loss": 0.0264, "reward": 0.14583333395421505, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.14583333395421505, "rewards/format_reward": 0.0, "step": 1389 }, { "completion_length": 1265.7916870117188, "epoch": 1.6951219512195121, "grad_norm": 0.36877796053886414, "kl": 0.065185546875, "learning_rate": 2.0739424808703638e-07, "loss": 0.1154, "reward": 0.2500000149011612, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.2500000149011612, "rewards/format_reward": 0.0, "step": 1390 }, { "completion_length": 1657.3958740234375, "epoch": 1.696341463414634, "grad_norm": 0.19026337563991547, "kl": 0.093505859375, "learning_rate": 2.057773575310109e-07, "loss": 0.0999, "reward": 0.2916666716337204, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.2916666716337204, "rewards/format_reward": 0.0, "step": 1391 }, { "completion_length": 1802.5833740234375, "epoch": 1.697560975609756, "grad_norm": 0.316201388835907, "kl": 0.08154296875, "learning_rate": 2.0416633019052882e-07, "loss": 0.1024, "reward": 0.3541666716337204, "reward_std": 0.21650634706020355, "rewards/accuracy_reward": 0.3541666716337204, "rewards/format_reward": 0.0, "step": 1392 }, { "completion_length": 1957.2500610351562, "epoch": 1.698780487804878, "grad_norm": 0.10019800066947937, "kl": 0.0684814453125, "learning_rate": 2.0256117336402586e-07, "loss": 0.0259, "reward": 0.20833333395421505, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.20833333395421505, "rewards/format_reward": 0.0, "step": 1393 }, { "completion_length": 1585.416748046875, "epoch": 1.7, "grad_norm": 0.18442444503307343, "kl": 0.0791015625, "learning_rate": 2.0096189432334195e-07, "loss": 0.0733, "reward": 0.06250000186264515, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.06250000186264515, "rewards/format_reward": 0.0, "step": 1394 }, { "completion_length": 1682.8958740234375, "epoch": 1.701219512195122, "grad_norm": 0.226985365152359, "kl": 0.075927734375, "learning_rate": 1.9936850031369003e-07, "loss": 0.0904, "reward": 0.1458333358168602, "reward_std": 0.21650634706020355, "rewards/accuracy_reward": 0.1458333358168602, "rewards/format_reward": 0.0, "step": 1395 }, { "completion_length": 1594.6458740234375, "epoch": 1.7024390243902439, "grad_norm": 0.249538853764534, "kl": 0.0888671875, "learning_rate": 1.9778099855362085e-07, "loss": 0.0894, "reward": 0.2083333432674408, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.2083333432674408, "rewards/format_reward": 0.0, "step": 1396 }, { "completion_length": 1465.1458740234375, "epoch": 1.7036585365853658, "grad_norm": 0.19627216458320618, "kl": 0.0672607421875, "learning_rate": 1.9619939623499238e-07, "loss": 0.0715, "reward": 0.3333333432674408, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.3333333432674408, "rewards/format_reward": 0.0, "step": 1397 }, { "completion_length": 1380.5625, "epoch": 1.7048780487804878, "grad_norm": 0.05953046306967735, "kl": 0.065185546875, "learning_rate": 1.9462370052293544e-07, "loss": 0.0025, "reward": 0.0625, "reward_std": 0.0, "rewards/accuracy_reward": 0.0625, "rewards/format_reward": 0.0, "step": 1398 }, { "completion_length": 1659.8125610351562, "epoch": 1.7060975609756097, "grad_norm": 0.30355092883110046, "kl": 0.07568359375, "learning_rate": 1.9305391855582355e-07, "loss": 0.2134, "reward": 0.25, "reward_std": 0.21650634706020355, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.0, "step": 1399 }, { "completion_length": 1571.8541870117188, "epoch": 1.7073170731707317, "grad_norm": 0.11099091917276382, "kl": 0.06982421875, "learning_rate": 1.9149005744523757e-07, "loss": 0.0, "reward": 0.0416666679084301, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.0416666679084301, "rewards/format_reward": 0.0, "step": 1400 }, { "completion_length": 1507.375, "epoch": 1.7085365853658536, "grad_norm": 0.2584591805934906, "kl": 0.072998046875, "learning_rate": 1.8993212427593658e-07, "loss": 0.168, "reward": 0.2916666716337204, "reward_std": 0.18042195588350296, "rewards/accuracy_reward": 0.2916666716337204, "rewards/format_reward": 0.0, "step": 1401 }, { "completion_length": 1526.6666870117188, "epoch": 1.7097560975609756, "grad_norm": 0.296516090631485, "kl": 0.08154296875, "learning_rate": 1.8838012610582356e-07, "loss": 0.1651, "reward": 0.3125000111758709, "reward_std": 0.18042195588350296, "rewards/accuracy_reward": 0.3125000111758709, "rewards/format_reward": 0.0, "step": 1402 }, { "completion_length": 2122.0625, "epoch": 1.7109756097560975, "grad_norm": 0.1855512112379074, "kl": 0.083251953125, "learning_rate": 1.8683406996591373e-07, "loss": 0.1401, "reward": 0.08333333395421505, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.08333333395421505, "rewards/format_reward": 0.0, "step": 1403 }, { "completion_length": 1555.6875610351562, "epoch": 1.7121951219512195, "grad_norm": 0.20343612134456635, "kl": 0.072021484375, "learning_rate": 1.852939628603046e-07, "loss": 0.0984, "reward": 0.3333333432674408, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.3333333432674408, "rewards/format_reward": 0.0, "step": 1404 }, { "completion_length": 1857.0209350585938, "epoch": 1.7134146341463414, "grad_norm": 0.18099896609783173, "kl": 0.08056640625, "learning_rate": 1.8375981176614114e-07, "loss": 0.1204, "reward": 0.1458333395421505, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.1458333395421505, "rewards/format_reward": 0.0, "step": 1405 }, { "completion_length": 1569.3958740234375, "epoch": 1.7146341463414634, "grad_norm": 0.31066232919692993, "kl": 0.084228515625, "learning_rate": 1.822316236335867e-07, "loss": 0.1741, "reward": 0.1875, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.1875, "rewards/format_reward": 0.0, "step": 1406 }, { "completion_length": 1439.1041870117188, "epoch": 1.7158536585365853, "grad_norm": 0.20668961107730865, "kl": 0.061279296875, "learning_rate": 1.8070940538579044e-07, "loss": 0.0263, "reward": 0.1041666716337204, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.1041666716337204, "rewards/format_reward": 0.0, "step": 1407 }, { "completion_length": 2108.1458740234375, "epoch": 1.7170731707317073, "grad_norm": 0.2627232074737549, "kl": 0.08642578125, "learning_rate": 1.7919316391885593e-07, "loss": 0.1596, "reward": 0.1875000074505806, "reward_std": 0.21650634706020355, "rewards/accuracy_reward": 0.1875000074505806, "rewards/format_reward": 0.0, "step": 1408 }, { "completion_length": 1574.6875610351562, "epoch": 1.7182926829268292, "grad_norm": 0.164041668176651, "kl": 0.1005859375, "learning_rate": 1.7768290610181065e-07, "loss": 0.1056, "reward": 0.1250000037252903, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.1250000037252903, "rewards/format_reward": 0.0, "step": 1409 }, { "completion_length": 2014.1458740234375, "epoch": 1.7195121951219512, "grad_norm": 0.1380937546491623, "kl": 0.07470703125, "learning_rate": 1.761786387765743e-07, "loss": 0.073, "reward": 0.1458333395421505, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.1458333395421505, "rewards/format_reward": 0.0, "step": 1410 }, { "completion_length": 1534.9375610351562, "epoch": 1.7207317073170731, "grad_norm": 0.22028258442878723, "kl": 0.08251953125, "learning_rate": 1.7468036875792747e-07, "loss": 0.1283, "reward": 0.14583333395421505, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.14583333395421505, "rewards/format_reward": 0.0, "step": 1411 }, { "completion_length": 1945.0208740234375, "epoch": 1.721951219512195, "grad_norm": 0.2191510945558548, "kl": 0.107177734375, "learning_rate": 1.731881028334808e-07, "loss": 0.113, "reward": 0.1666666716337204, "reward_std": 0.14433755725622177, "rewards/accuracy_reward": 0.1666666716337204, "rewards/format_reward": 0.0, "step": 1412 }, { "completion_length": 2270.375, "epoch": 1.723170731707317, "grad_norm": 0.1634645313024521, "kl": 0.075439453125, "learning_rate": 1.71701847763646e-07, "loss": 0.1076, "reward": 0.1875000111758709, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.1875000111758709, "rewards/format_reward": 0.0, "step": 1413 }, { "completion_length": 1829.8541870117188, "epoch": 1.724390243902439, "grad_norm": 0.27810633182525635, "kl": 0.104248046875, "learning_rate": 1.7022161028160244e-07, "loss": 0.148, "reward": 0.2083333395421505, "reward_std": 0.21650634706020355, "rewards/accuracy_reward": 0.2083333395421505, "rewards/format_reward": 0.0, "step": 1414 }, { "completion_length": 2301.8958740234375, "epoch": 1.725609756097561, "grad_norm": 0.15658597648143768, "kl": 0.095947265625, "learning_rate": 1.6874739709326858e-07, "loss": 0.0692, "reward": 0.125, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "step": 1415 }, { "completion_length": 1359.5833740234375, "epoch": 1.726829268292683, "grad_norm": 0.4897747039794922, "kl": 0.085205078125, "learning_rate": 1.6727921487727095e-07, "loss": -0.0411, "reward": 0.3125000149011612, "reward_std": 0.14433757215738297, "rewards/accuracy_reward": 0.3125000149011612, "rewards/format_reward": 0.0, "step": 1416 }, { "completion_length": 1640.6666870117188, "epoch": 1.7280487804878049, "grad_norm": 0.12924863398075104, "kl": 0.0645751953125, "learning_rate": 1.658170702849135e-07, "loss": 0.0473, "reward": 0.1666666716337204, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.1666666716337204, "rewards/format_reward": 0.0, "step": 1417 }, { "completion_length": 1866.6041870117188, "epoch": 1.7292682926829268, "grad_norm": 0.13372871279716492, "kl": 0.07763671875, "learning_rate": 1.64360969940149e-07, "loss": 0.0657, "reward": 0.10416666977107525, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.10416666977107525, "rewards/format_reward": 0.0, "step": 1418 }, { "completion_length": 1496.9791870117188, "epoch": 1.7304878048780488, "grad_norm": 0.26394447684288025, "kl": 0.073486328125, "learning_rate": 1.6291092043954752e-07, "loss": 0.0975, "reward": 0.3333333432674408, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.3333333432674408, "rewards/format_reward": 0.0, "step": 1419 }, { "completion_length": 1398.2083740234375, "epoch": 1.7317073170731707, "grad_norm": 0.17623132467269897, "kl": 0.07568359375, "learning_rate": 1.6146692835226669e-07, "loss": 0.0797, "reward": 0.20833333395421505, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.20833333395421505, "rewards/format_reward": 0.0, "step": 1420 }, { "completion_length": 1855.4583740234375, "epoch": 1.7329268292682927, "grad_norm": 0.11644536256790161, "kl": 0.091796875, "learning_rate": 1.6002900022002193e-07, "loss": 0.0653, "reward": 0.1250000037252903, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.1250000037252903, "rewards/format_reward": 0.0, "step": 1421 }, { "completion_length": 1852.791748046875, "epoch": 1.7341463414634146, "grad_norm": 0.12358993291854858, "kl": 0.09033203125, "learning_rate": 1.5859714255705843e-07, "loss": 0.0554, "reward": 0.16666667722165585, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.16666667722165585, "rewards/format_reward": 0.0, "step": 1422 }, { "completion_length": 1701.791748046875, "epoch": 1.7353658536585366, "grad_norm": 0.2510251998901367, "kl": 0.086669921875, "learning_rate": 1.57171361850119e-07, "loss": 0.1584, "reward": 0.14583333395421505, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.14583333395421505, "rewards/format_reward": 0.0, "step": 1423 }, { "completion_length": 1819.9583740234375, "epoch": 1.7365853658536585, "grad_norm": 0.2161521166563034, "kl": 0.087158203125, "learning_rate": 1.5575166455841677e-07, "loss": 0.0006, "reward": 0.1250000037252903, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.1250000037252903, "rewards/format_reward": 0.0, "step": 1424 }, { "completion_length": 1383.1458740234375, "epoch": 1.7378048780487805, "grad_norm": 0.47753164172172546, "kl": 0.067626953125, "learning_rate": 1.5433805711360484e-07, "loss": 0.1202, "reward": 0.125, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "step": 1425 }, { "completion_length": 1967.4375610351562, "epoch": 1.7390243902439024, "grad_norm": 0.14851343631744385, "kl": 0.076171875, "learning_rate": 1.5293054591974726e-07, "loss": 0.0039, "reward": 0.125, "reward_std": 0.0, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "step": 1426 }, { "completion_length": 1738.6250610351562, "epoch": 1.7402439024390244, "grad_norm": 0.19428478181362152, "kl": 0.0908203125, "learning_rate": 1.5152913735329128e-07, "loss": 0.0091, "reward": 0.0833333358168602, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.0833333358168602, "rewards/format_reward": 0.0, "step": 1427 }, { "completion_length": 1566.541748046875, "epoch": 1.7414634146341463, "grad_norm": 0.1476166844367981, "kl": 0.06201171875, "learning_rate": 1.501338377630362e-07, "loss": 0.017, "reward": 0.1458333358168602, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.1458333358168602, "rewards/format_reward": 0.0, "step": 1428 }, { "completion_length": 1183.7083740234375, "epoch": 1.7426829268292683, "grad_norm": 0.2785905599594116, "kl": 0.064453125, "learning_rate": 1.4874465347010663e-07, "loss": -0.04, "reward": 0.20833333395421505, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.20833333395421505, "rewards/format_reward": 0.0, "step": 1429 }, { "completion_length": 1857.6666870117188, "epoch": 1.7439024390243902, "grad_norm": 0.15956029295921326, "kl": 0.08740234375, "learning_rate": 1.473615907679229e-07, "loss": 0.0081, "reward": 0.2083333358168602, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.2083333358168602, "rewards/format_reward": 0.0, "step": 1430 }, { "completion_length": 1887.7708740234375, "epoch": 1.7451219512195122, "grad_norm": 0.15829989314079285, "kl": 0.0986328125, "learning_rate": 1.459846559221721e-07, "loss": 0.0047, "reward": 0.125, "reward_std": 0.0, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "step": 1431 }, { "completion_length": 1646.2083740234375, "epoch": 1.7463414634146341, "grad_norm": 0.1425284594297409, "kl": 0.0799560546875, "learning_rate": 1.446138551707814e-07, "loss": 0.0469, "reward": 0.0416666679084301, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.0416666679084301, "rewards/format_reward": 0.0, "step": 1432 }, { "completion_length": 1711.729248046875, "epoch": 1.747560975609756, "grad_norm": 0.20764295756816864, "kl": 0.076171875, "learning_rate": 1.432491947238876e-07, "loss": 0.0418, "reward": 0.2291666716337204, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.2291666716337204, "rewards/format_reward": 0.0, "step": 1433 }, { "completion_length": 1742.8333740234375, "epoch": 1.748780487804878, "grad_norm": 0.28253892064094543, "kl": 0.08642578125, "learning_rate": 1.4189068076381078e-07, "loss": 0.0774, "reward": 0.08333333395421505, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.08333333395421505, "rewards/format_reward": 0.0, "step": 1434 }, { "completion_length": 1602.4375610351562, "epoch": 1.75, "grad_norm": 0.19883811473846436, "kl": 0.09375, "learning_rate": 1.405383194450251e-07, "loss": 0.0289, "reward": 0.10416666977107525, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.10416666977107525, "rewards/format_reward": 0.0, "step": 1435 }, { "completion_length": 1092.4375610351562, "epoch": 1.751219512195122, "grad_norm": 0.10824039578437805, "kl": 0.064453125, "learning_rate": 1.3919211689413207e-07, "loss": 0.0003, "reward": 0.08333333395421505, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.08333333395421505, "rewards/format_reward": 0.0, "step": 1436 }, { "completion_length": 1812.9583740234375, "epoch": 1.752439024390244, "grad_norm": 0.12448608130216599, "kl": 0.095703125, "learning_rate": 1.3785207920983145e-07, "loss": 0.0044, "reward": 0.0625, "reward_std": 0.0, "rewards/accuracy_reward": 0.0625, "rewards/format_reward": 0.0, "step": 1437 }, { "completion_length": 1848.1458740234375, "epoch": 1.7536585365853659, "grad_norm": 0.1668328046798706, "kl": 0.073974609375, "learning_rate": 1.365182124628949e-07, "loss": 0.0261, "reward": 0.0416666679084301, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.0416666679084301, "rewards/format_reward": 0.0, "step": 1438 }, { "completion_length": 1938.9583740234375, "epoch": 1.7548780487804878, "grad_norm": 0.16632770001888275, "kl": 0.078857421875, "learning_rate": 1.3519052269613757e-07, "loss": 0.0451, "reward": 0.1666666679084301, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.1666666679084301, "rewards/format_reward": 0.0, "step": 1439 }, { "completion_length": 1080.7083740234375, "epoch": 1.7560975609756098, "grad_norm": 0.09403663873672485, "kl": 0.060546875, "learning_rate": 1.3386901592439071e-07, "loss": 0.0481, "reward": 0.2083333358168602, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.2083333358168602, "rewards/format_reward": 0.0, "step": 1440 }, { "completion_length": 1812.6875610351562, "epoch": 1.7573170731707317, "grad_norm": 0.15411347150802612, "kl": 0.086181640625, "learning_rate": 1.3255369813447572e-07, "loss": 0.0807, "reward": 0.1666666679084301, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.1666666679084301, "rewards/format_reward": 0.0, "step": 1441 }, { "completion_length": 1777.666748046875, "epoch": 1.7585365853658537, "grad_norm": 0.2594011425971985, "kl": 0.1025390625, "learning_rate": 1.3124457528517503e-07, "loss": 0.1567, "reward": 0.1666666716337204, "reward_std": 0.18042195588350296, "rewards/accuracy_reward": 0.1666666716337204, "rewards/format_reward": 0.0, "step": 1442 }, { "completion_length": 2030.0626220703125, "epoch": 1.7597560975609756, "grad_norm": 0.2035287767648697, "kl": 0.08984375, "learning_rate": 1.2994165330720675e-07, "loss": 0.079, "reward": 0.1041666679084301, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.1041666679084301, "rewards/format_reward": 0.0, "step": 1443 }, { "completion_length": 1300.3333740234375, "epoch": 1.7609756097560976, "grad_norm": 0.13286280632019043, "kl": 0.0606689453125, "learning_rate": 1.2864493810319676e-07, "loss": 0.0218, "reward": 0.1666666679084301, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.1666666679084301, "rewards/format_reward": 0.0, "step": 1444 }, { "completion_length": 1413.4583740234375, "epoch": 1.7621951219512195, "grad_norm": 0.4096165597438812, "kl": 0.074462890625, "learning_rate": 1.2735443554765313e-07, "loss": 0.2109, "reward": 0.3125000149011612, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.3125000149011612, "rewards/format_reward": 0.0, "step": 1445 }, { "completion_length": 1677.1041870117188, "epoch": 1.7634146341463415, "grad_norm": 0.27949607372283936, "kl": 0.088134765625, "learning_rate": 1.260701514869379e-07, "loss": 0.1764, "reward": 0.1666666716337204, "reward_std": 0.18042195588350296, "rewards/accuracy_reward": 0.1666666716337204, "rewards/format_reward": 0.0, "step": 1446 }, { "completion_length": 1744.8125610351562, "epoch": 1.7646341463414634, "grad_norm": 0.4421347677707672, "kl": 0.105712890625, "learning_rate": 1.2479209173924182e-07, "loss": 0.2277, "reward": 0.1875000074505806, "reward_std": 0.28867512941360474, "rewards/accuracy_reward": 0.1875000074505806, "rewards/format_reward": 0.0, "step": 1447 }, { "completion_length": 1317.7291870117188, "epoch": 1.7658536585365854, "grad_norm": 0.35280731320381165, "kl": 0.06591796875, "learning_rate": 1.2352026209455808e-07, "loss": 0.0267, "reward": 0.18750000558793545, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.18750000558793545, "rewards/format_reward": 0.0, "step": 1448 }, { "completion_length": 1116.7500610351562, "epoch": 1.7670731707317073, "grad_norm": 0.2101072371006012, "kl": 0.055908203125, "learning_rate": 1.2225466831465486e-07, "loss": 0.0527, "reward": 0.10416666977107525, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.10416666977107525, "rewards/format_reward": 0.0, "step": 1449 }, { "completion_length": 1257.6042175292969, "epoch": 1.7682926829268293, "grad_norm": 0.08068323135375977, "kl": 0.05810546875, "learning_rate": 1.209953161330507e-07, "loss": 0.003, "reward": 0.0625, "reward_std": 0.0, "rewards/accuracy_reward": 0.0625, "rewards/format_reward": 0.0, "step": 1450 }, { "completion_length": 1773.3958740234375, "epoch": 1.7695121951219512, "grad_norm": 0.1861063838005066, "kl": 0.071044921875, "learning_rate": 1.1974221125498734e-07, "loss": 0.0304, "reward": 0.10416666977107525, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.10416666977107525, "rewards/format_reward": 0.0, "step": 1451 }, { "completion_length": 1888.7501220703125, "epoch": 1.7707317073170732, "grad_norm": 0.1378265619277954, "kl": 0.080810546875, "learning_rate": 1.1849535935740474e-07, "loss": 0.0429, "reward": 0.0416666679084301, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.0416666679084301, "rewards/format_reward": 0.0, "step": 1452 }, { "completion_length": 1788.7708740234375, "epoch": 1.7719512195121951, "grad_norm": 0.34163737297058105, "kl": 0.093994140625, "learning_rate": 1.1725476608891478e-07, "loss": 0.1023, "reward": 0.2708333432674408, "reward_std": 0.21650634706020355, "rewards/accuracy_reward": 0.2708333432674408, "rewards/format_reward": 0.0, "step": 1453 }, { "completion_length": 1680.2708740234375, "epoch": 1.773170731707317, "grad_norm": 0.36342260241508484, "kl": 0.103271484375, "learning_rate": 1.1602043706977538e-07, "loss": 0.0673, "reward": 0.12500000558793545, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.12500000558793545, "rewards/format_reward": 0.0, "step": 1454 }, { "completion_length": 1547.916748046875, "epoch": 1.774390243902439, "grad_norm": 0.23036308586597443, "kl": 0.093017578125, "learning_rate": 1.147923778918667e-07, "loss": 0.0433, "reward": 0.1250000037252903, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.1250000037252903, "rewards/format_reward": 0.0, "step": 1455 }, { "completion_length": 2156.7708740234375, "epoch": 1.775609756097561, "grad_norm": 0.12080541998147964, "kl": 0.087890625, "learning_rate": 1.1357059411866355e-07, "loss": 0.0666, "reward": 0.125, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "step": 1456 }, { "completion_length": 1547.8333740234375, "epoch": 1.776829268292683, "grad_norm": 0.178075909614563, "kl": 0.08935546875, "learning_rate": 1.1235509128521221e-07, "loss": 0.0376, "reward": 0.08333333395421505, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.08333333395421505, "rewards/format_reward": 0.0, "step": 1457 }, { "completion_length": 1893.041748046875, "epoch": 1.778048780487805, "grad_norm": 0.11980780959129333, "kl": 0.091552734375, "learning_rate": 1.1114587489810352e-07, "loss": 0.0039, "reward": 0.125, "reward_std": 0.0, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "step": 1458 }, { "completion_length": 1494.0000610351562, "epoch": 1.7792682926829269, "grad_norm": 0.15189304947853088, "kl": 0.08203125, "learning_rate": 1.0994295043544978e-07, "loss": 0.0521, "reward": 0.1666666679084301, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.1666666679084301, "rewards/format_reward": 0.0, "step": 1459 }, { "completion_length": 1750.979248046875, "epoch": 1.7804878048780488, "grad_norm": 0.21471711993217468, "kl": 0.083740234375, "learning_rate": 1.0874632334685808e-07, "loss": 0.0215, "reward": 0.3750000149011612, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.3750000149011612, "rewards/format_reward": 0.0, "step": 1460 }, { "completion_length": 1990.3333740234375, "epoch": 1.7817073170731708, "grad_norm": 0.1337735503911972, "kl": 0.093017578125, "learning_rate": 1.0755599905340701e-07, "loss": 0.0224, "reward": 0.0416666679084301, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.0416666679084301, "rewards/format_reward": 0.0, "step": 1461 }, { "completion_length": 2160.8125, "epoch": 1.7829268292682927, "grad_norm": 0.10762909054756165, "kl": 0.10302734375, "learning_rate": 1.0637198294762152e-07, "loss": 0.035, "reward": 0.16666667722165585, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.16666667722165585, "rewards/format_reward": 0.0, "step": 1462 }, { "completion_length": 2431.9583740234375, "epoch": 1.7841463414634147, "grad_norm": 0.17587214708328247, "kl": 0.08935546875, "learning_rate": 1.0519428039344836e-07, "loss": 0.046, "reward": 0.25, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.0, "step": 1463 }, { "completion_length": 2378.541748046875, "epoch": 1.7853658536585366, "grad_norm": 0.1693323701620102, "kl": 0.10546875, "learning_rate": 1.0402289672623272e-07, "loss": 0.0995, "reward": 0.1666666679084301, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.1666666679084301, "rewards/format_reward": 0.0, "step": 1464 }, { "completion_length": 1632.0625610351562, "epoch": 1.7865853658536586, "grad_norm": 0.21734333038330078, "kl": 0.0751953125, "learning_rate": 1.0285783725269232e-07, "loss": 0.1061, "reward": 0.125, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "step": 1465 }, { "completion_length": 1681.4375610351562, "epoch": 1.7878048780487805, "grad_norm": 0.2751876711845398, "kl": 0.09619140625, "learning_rate": 1.0169910725089548e-07, "loss": 0.1901, "reward": 0.2291666679084301, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.2291666679084301, "rewards/format_reward": 0.0, "step": 1466 }, { "completion_length": 1615.1458740234375, "epoch": 1.7890243902439025, "grad_norm": 0.19052988290786743, "kl": 0.07177734375, "learning_rate": 1.005467119702353e-07, "loss": 0.0757, "reward": 0.1041666679084301, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.1041666679084301, "rewards/format_reward": 0.0, "step": 1467 }, { "completion_length": 1656.4791870117188, "epoch": 1.7902439024390244, "grad_norm": 0.18948304653167725, "kl": 0.0869140625, "learning_rate": 9.940065663140663e-08, "loss": 0.0407, "reward": 0.0625, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.0625, "rewards/format_reward": 0.0, "step": 1468 }, { "completion_length": 1512.3750610351562, "epoch": 1.7914634146341464, "grad_norm": 0.27637651562690735, "kl": 0.0831298828125, "learning_rate": 9.82609464263835e-08, "loss": 0.0988, "reward": 0.229166679084301, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.229166679084301, "rewards/format_reward": 0.0, "step": 1469 }, { "completion_length": 1361.8333740234375, "epoch": 1.7926829268292683, "grad_norm": 0.17504960298538208, "kl": 0.081298828125, "learning_rate": 9.71275865183936e-08, "loss": 0.0767, "reward": 0.291666679084301, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.291666679084301, "rewards/format_reward": 0.0, "step": 1470 }, { "completion_length": 1753.875, "epoch": 1.7939024390243903, "grad_norm": 0.16409631073474884, "kl": 0.087158203125, "learning_rate": 9.600058204189627e-08, "loss": 0.0694, "reward": 0.1250000037252903, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.1250000037252903, "rewards/format_reward": 0.0, "step": 1471 }, { "completion_length": 1589.3333740234375, "epoch": 1.7951219512195122, "grad_norm": 0.22304117679595947, "kl": 0.085205078125, "learning_rate": 9.487993810255823e-08, "loss": 0.0838, "reward": 0.06250000186264515, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.06250000186264515, "rewards/format_reward": 0.0, "step": 1472 }, { "completion_length": 1593.25, "epoch": 1.7963414634146342, "grad_norm": 0.16661493480205536, "kl": 0.078125, "learning_rate": 9.376565977723229e-08, "loss": 0.0415, "reward": 0.0625, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.0625, "rewards/format_reward": 0.0, "step": 1473 }, { "completion_length": 1645.9166870117188, "epoch": 1.7975609756097561, "grad_norm": 0.1744689792394638, "kl": 0.06884765625, "learning_rate": 9.265775211393224e-08, "loss": 0.0053, "reward": 0.2083333432674408, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.2083333432674408, "rewards/format_reward": 0.0, "step": 1474 }, { "completion_length": 1572.4166870117188, "epoch": 1.798780487804878, "grad_norm": 0.22043155133724213, "kl": 0.0626220703125, "learning_rate": 9.15562201318107e-08, "loss": 0.145, "reward": 0.2916666716337204, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.2916666716337204, "rewards/format_reward": 0.0, "step": 1475 }, { "completion_length": 1458.6458740234375, "epoch": 1.8, "grad_norm": 0.1582263708114624, "kl": 0.056884765625, "learning_rate": 9.046106882113752e-08, "loss": 0.0492, "reward": 0.0833333358168602, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.0833333358168602, "rewards/format_reward": 0.0, "step": 1476 }, { "completion_length": 2193.541748046875, "epoch": 1.801219512195122, "grad_norm": 0.14705690741539001, "kl": 0.077880859375, "learning_rate": 8.937230314327504e-08, "loss": 0.004, "reward": 0.0625, "reward_std": 0.0, "rewards/accuracy_reward": 0.0625, "rewards/format_reward": 0.0, "step": 1477 }, { "completion_length": 1965.8125, "epoch": 1.802439024390244, "grad_norm": 0.242121160030365, "kl": 0.07958984375, "learning_rate": 8.828992803065772e-08, "loss": 0.1537, "reward": 0.1875, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.1875, "rewards/format_reward": 0.0, "step": 1478 }, { "completion_length": 2212.729248046875, "epoch": 1.803658536585366, "grad_norm": 0.2533845901489258, "kl": 0.08837890625, "learning_rate": 8.721394838676816e-08, "loss": 0.0408, "reward": 0.18750000558793545, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.18750000558793545, "rewards/format_reward": 0.0, "step": 1479 }, { "completion_length": 2389.0833740234375, "epoch": 1.8048780487804879, "grad_norm": 0.18107719719409943, "kl": 0.10693359375, "learning_rate": 8.614436908611617e-08, "loss": 0.0099, "reward": 0.02083333395421505, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.02083333395421505, "rewards/format_reward": 0.0, "step": 1480 }, { "completion_length": 2266.6458740234375, "epoch": 1.8060975609756098, "grad_norm": 0.14060205221176147, "kl": 0.092041015625, "learning_rate": 8.508119497421524e-08, "loss": 0.0474, "reward": 0.1250000037252903, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.1250000037252903, "rewards/format_reward": 0.0, "step": 1481 }, { "completion_length": 1527.916748046875, "epoch": 1.8073170731707318, "grad_norm": 0.21066421270370483, "kl": 0.0751953125, "learning_rate": 8.402443086756273e-08, "loss": 0.0589, "reward": 0.2291666679084301, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.2291666679084301, "rewards/format_reward": 0.0, "step": 1482 }, { "completion_length": 1487.1666870117188, "epoch": 1.8085365853658537, "grad_norm": 0.1191595122218132, "kl": 0.068115234375, "learning_rate": 8.297408155361542e-08, "loss": 0.0698, "reward": 0.1875000074505806, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.1875000074505806, "rewards/format_reward": 0.0, "step": 1483 }, { "completion_length": 1575.2916870117188, "epoch": 1.8097560975609757, "grad_norm": 0.1543770283460617, "kl": 0.092529296875, "learning_rate": 8.193015179076996e-08, "loss": 0.0233, "reward": 0.1458333432674408, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.1458333432674408, "rewards/format_reward": 0.0, "step": 1484 }, { "completion_length": 1574.8333740234375, "epoch": 1.8109756097560976, "grad_norm": 0.2664240300655365, "kl": 0.110107421875, "learning_rate": 8.089264630834032e-08, "loss": 0.0346, "reward": 0.1875000074505806, "reward_std": 0.18042196333408356, "rewards/accuracy_reward": 0.1875000074505806, "rewards/format_reward": 0.0, "step": 1485 }, { "completion_length": 1739.3125, "epoch": 1.8121951219512196, "grad_norm": 0.19781696796417236, "kl": 0.087158203125, "learning_rate": 7.986156980653653e-08, "loss": 0.1084, "reward": 0.1458333395421505, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.1458333395421505, "rewards/format_reward": 0.0, "step": 1486 }, { "completion_length": 1476.104248046875, "epoch": 1.8134146341463415, "grad_norm": 0.519705057144165, "kl": 0.09912109375, "learning_rate": 7.883692695644363e-08, "loss": 0.2674, "reward": 0.2500000149011612, "reward_std": 0.3247595429420471, "rewards/accuracy_reward": 0.2500000149011612, "rewards/format_reward": 0.0, "step": 1487 }, { "completion_length": 1751.3125, "epoch": 1.8146341463414632, "grad_norm": 0.22625093162059784, "kl": 0.0732421875, "learning_rate": 7.781872239999993e-08, "loss": 0.0028, "reward": 0.3333333432674408, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.3333333432674408, "rewards/format_reward": 0.0, "step": 1488 }, { "completion_length": 1769.104248046875, "epoch": 1.8158536585365854, "grad_norm": 0.324908584356308, "kl": 0.093994140625, "learning_rate": 7.680696074997645e-08, "loss": 0.1584, "reward": 0.2083333432674408, "reward_std": 0.21650634706020355, "rewards/accuracy_reward": 0.2083333432674408, "rewards/format_reward": 0.0, "step": 1489 }, { "completion_length": 1481.9375610351562, "epoch": 1.8170731707317072, "grad_norm": 0.25108274817466736, "kl": 0.07275390625, "learning_rate": 7.580164658995603e-08, "loss": 0.0728, "reward": 0.1250000037252903, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.1250000037252903, "rewards/format_reward": 0.0, "step": 1490 }, { "completion_length": 1305.8333740234375, "epoch": 1.8182926829268293, "grad_norm": 0.26763343811035156, "kl": 0.0731201171875, "learning_rate": 7.480278447431221e-08, "loss": 0.1035, "reward": 0.2708333432674408, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.2708333432674408, "rewards/format_reward": 0.0, "step": 1491 }, { "completion_length": 1256.5625, "epoch": 1.819512195121951, "grad_norm": 0.23040653765201569, "kl": 0.0703125, "learning_rate": 7.381037892818959e-08, "loss": 0.0809, "reward": 0.229166679084301, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.229166679084301, "rewards/format_reward": 0.0, "step": 1492 }, { "completion_length": 1299.5000610351562, "epoch": 1.8207317073170732, "grad_norm": 0.20258349180221558, "kl": 0.0760498046875, "learning_rate": 7.282443444748149e-08, "loss": 0.0604, "reward": 0.2916666716337204, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.2916666716337204, "rewards/format_reward": 0.0, "step": 1493 }, { "completion_length": 1417.0416870117188, "epoch": 1.821951219512195, "grad_norm": 0.31238001585006714, "kl": 0.086181640625, "learning_rate": 7.184495549881131e-08, "loss": 0.1183, "reward": 0.125, "reward_std": 0.18042195588350296, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "step": 1494 }, { "completion_length": 1355.0208740234375, "epoch": 1.8231707317073171, "grad_norm": 0.24863460659980774, "kl": 0.080810546875, "learning_rate": 7.087194651951157e-08, "loss": 0.0819, "reward": 0.2083333395421505, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.2083333395421505, "rewards/format_reward": 0.0, "step": 1495 }, { "completion_length": 1363.9375610351562, "epoch": 1.8243902439024389, "grad_norm": 0.2733905613422394, "kl": 0.0908203125, "learning_rate": 6.990541191760418e-08, "loss": -0.0189, "reward": 0.14583333395421505, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.14583333395421505, "rewards/format_reward": 0.0, "step": 1496 }, { "completion_length": 1837.354248046875, "epoch": 1.825609756097561, "grad_norm": 0.29813429713249207, "kl": 0.0966796875, "learning_rate": 6.894535607177959e-08, "loss": 0.1719, "reward": 0.2083333432674408, "reward_std": 0.21650635451078415, "rewards/accuracy_reward": 0.2083333432674408, "rewards/format_reward": 0.0, "step": 1497 }, { "completion_length": 1790.5001220703125, "epoch": 1.8268292682926828, "grad_norm": 0.2676183879375458, "kl": 0.090576171875, "learning_rate": 6.799178333137784e-08, "loss": 0.1374, "reward": 0.10416666977107525, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.10416666977107525, "rewards/format_reward": 0.0, "step": 1498 }, { "completion_length": 1153.2292175292969, "epoch": 1.828048780487805, "grad_norm": 0.19397395849227905, "kl": 0.06787109375, "learning_rate": 6.704469801636881e-08, "loss": 0.0058, "reward": 0.1666666679084301, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.1666666679084301, "rewards/format_reward": 0.0, "step": 1499 }, { "completion_length": 1183.9583740234375, "epoch": 1.8292682926829267, "grad_norm": 0.3470856249332428, "kl": 0.06982421875, "learning_rate": 6.610410441733156e-08, "loss": 0.0799, "reward": 0.12500000558793545, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.12500000558793545, "rewards/format_reward": 0.0, "step": 1500 }, { "completion_length": 1873.875, "epoch": 1.8304878048780489, "grad_norm": 0.18581156432628632, "kl": 0.1044921875, "learning_rate": 6.5170006795437e-08, "loss": 0.107, "reward": 0.2083333432674408, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.2083333432674408, "rewards/format_reward": 0.0, "step": 1501 }, { "completion_length": 1708.0208740234375, "epoch": 1.8317073170731706, "grad_norm": 0.2814028561115265, "kl": 0.078125, "learning_rate": 6.424240938242643e-08, "loss": -0.035, "reward": 0.2291666716337204, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.2291666716337204, "rewards/format_reward": 0.0, "step": 1502 }, { "completion_length": 1427.5416870117188, "epoch": 1.8329268292682928, "grad_norm": 0.10806597024202347, "kl": 0.06640625, "learning_rate": 6.332131638059318e-08, "loss": 0.042, "reward": 0.2291666716337204, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.2291666716337204, "rewards/format_reward": 0.0, "step": 1503 }, { "completion_length": 1689.0, "epoch": 1.8341463414634145, "grad_norm": 0.37589481472969055, "kl": 0.081787109375, "learning_rate": 6.24067319627642e-08, "loss": 0.1749, "reward": 0.3750000149011612, "reward_std": 0.21650634706020355, "rewards/accuracy_reward": 0.3750000149011612, "rewards/format_reward": 0.0, "step": 1504 }, { "completion_length": 1530.8958740234375, "epoch": 1.8353658536585367, "grad_norm": 0.2855803966522217, "kl": 0.097900390625, "learning_rate": 6.149866027228046e-08, "loss": 0.081, "reward": 0.2708333432674408, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.2708333432674408, "rewards/format_reward": 0.0, "step": 1505 }, { "completion_length": 1636.3333740234375, "epoch": 1.8365853658536584, "grad_norm": 0.20363250374794006, "kl": 0.07568359375, "learning_rate": 6.059710542297824e-08, "loss": 0.0516, "reward": 0.1666666716337204, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.1666666716337204, "rewards/format_reward": 0.0, "step": 1506 }, { "completion_length": 1617.9166870117188, "epoch": 1.8378048780487806, "grad_norm": 0.19811317324638367, "kl": 0.08740234375, "learning_rate": 5.970207149917062e-08, "loss": 0.0542, "reward": 0.1458333358168602, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.1458333358168602, "rewards/format_reward": 0.0, "step": 1507 }, { "completion_length": 1626.3958740234375, "epoch": 1.8390243902439023, "grad_norm": 0.18653440475463867, "kl": 0.0675048828125, "learning_rate": 5.8813562555628585e-08, "loss": 0.054, "reward": 0.0416666679084301, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.0416666679084301, "rewards/format_reward": 0.0, "step": 1508 }, { "completion_length": 1751.4791870117188, "epoch": 1.8402439024390245, "grad_norm": 0.1762414574623108, "kl": 0.080810546875, "learning_rate": 5.7931582617563316e-08, "loss": 0.0265, "reward": 0.2916666716337204, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.2916666716337204, "rewards/format_reward": 0.0, "step": 1509 }, { "completion_length": 1366.0833740234375, "epoch": 1.8414634146341462, "grad_norm": 0.2419130504131317, "kl": 0.08154296875, "learning_rate": 5.7056135680607965e-08, "loss": 0.1777, "reward": 0.1458333358168602, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.1458333358168602, "rewards/format_reward": 0.0, "step": 1510 }, { "completion_length": 1783.5833740234375, "epoch": 1.8426829268292684, "grad_norm": 0.1566605567932129, "kl": 0.07568359375, "learning_rate": 5.6187225710798704e-08, "loss": 0.0971, "reward": 0.1250000037252903, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.1250000037252903, "rewards/format_reward": 0.0, "step": 1511 }, { "completion_length": 1052.625, "epoch": 1.84390243902439, "grad_norm": 0.29989922046661377, "kl": 0.0599365234375, "learning_rate": 5.532485664455755e-08, "loss": -0.0069, "reward": 0.2500000149011612, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.2500000149011612, "rewards/format_reward": 0.0, "step": 1512 }, { "completion_length": 1870.5625, "epoch": 1.8451219512195123, "grad_norm": 0.16965952515602112, "kl": 0.078857421875, "learning_rate": 5.4469032388674236e-08, "loss": 0.0626, "reward": 0.2083333432674408, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.2083333432674408, "rewards/format_reward": 0.0, "step": 1513 }, { "completion_length": 1709.7708740234375, "epoch": 1.846341463414634, "grad_norm": 0.09151699393987656, "kl": 0.072265625, "learning_rate": 5.3619756820288525e-08, "loss": 0.0032, "reward": 0.1875, "reward_std": 0.0, "rewards/accuracy_reward": 0.1875, "rewards/format_reward": 0.0, "step": 1514 }, { "completion_length": 1388.3541870117188, "epoch": 1.8475609756097562, "grad_norm": 0.3744647800922394, "kl": 0.076904296875, "learning_rate": 5.2777033786872595e-08, "loss": 0.1673, "reward": 0.2291666679084301, "reward_std": 0.18042195588350296, "rewards/accuracy_reward": 0.2291666679084301, "rewards/format_reward": 0.0, "step": 1515 }, { "completion_length": 2006.541748046875, "epoch": 1.848780487804878, "grad_norm": 0.1917242854833603, "kl": 0.0859375, "learning_rate": 5.194086710621404e-08, "loss": 0.0088, "reward": 0.1666666716337204, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.1666666716337204, "rewards/format_reward": 0.0, "step": 1516 }, { "completion_length": 1441.9583740234375, "epoch": 1.85, "grad_norm": 0.20222026109695435, "kl": 0.0682373046875, "learning_rate": 5.11112605663977e-08, "loss": 0.0428, "reward": 0.2500000149011612, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.2500000149011612, "rewards/format_reward": 0.0, "step": 1517 }, { "completion_length": 1881.5833740234375, "epoch": 1.8512195121951218, "grad_norm": 0.27488046884536743, "kl": 0.091796875, "learning_rate": 5.0288217925789025e-08, "loss": 0.0978, "reward": 0.229166679084301, "reward_std": 0.21650634706020355, "rewards/accuracy_reward": 0.229166679084301, "rewards/format_reward": 0.0, "step": 1518 }, { "completion_length": 1497.291748046875, "epoch": 1.852439024390244, "grad_norm": 0.2601536810398102, "kl": 0.0771484375, "learning_rate": 4.947174291301776e-08, "loss": 0.1025, "reward": 0.25, "reward_std": 0.18042195588350296, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.0, "step": 1519 }, { "completion_length": 1789.291748046875, "epoch": 1.8536585365853657, "grad_norm": 0.17641666531562805, "kl": 0.079833984375, "learning_rate": 4.86618392269596e-08, "loss": 0.0018, "reward": 0.0833333358168602, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.0833333358168602, "rewards/format_reward": 0.0, "step": 1520 }, { "completion_length": 1491.666748046875, "epoch": 1.854878048780488, "grad_norm": 0.24217985570430756, "kl": 0.072265625, "learning_rate": 4.785851053672041e-08, "loss": 0.1131, "reward": 0.1458333395421505, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.1458333395421505, "rewards/format_reward": 0.0, "step": 1521 }, { "completion_length": 1143.0208740234375, "epoch": 1.8560975609756096, "grad_norm": 0.18729360401630402, "kl": 0.0728759765625, "learning_rate": 4.70617604816192e-08, "loss": 0.0391, "reward": 0.25, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.0, "step": 1522 }, { "completion_length": 1423.0208740234375, "epoch": 1.8573170731707318, "grad_norm": 0.19250337779521942, "kl": 0.063720703125, "learning_rate": 4.627159267117215e-08, "loss": 0.0473, "reward": 0.3125, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.3125, "rewards/format_reward": 0.0, "step": 1523 }, { "completion_length": 1908.8958740234375, "epoch": 1.8585365853658535, "grad_norm": 0.242966890335083, "kl": 0.08251953125, "learning_rate": 4.54880106850758e-08, "loss": 0.0794, "reward": 0.1875, "reward_std": 0.21650634706020355, "rewards/accuracy_reward": 0.1875, "rewards/format_reward": 0.0, "step": 1524 }, { "completion_length": 2028.0416870117188, "epoch": 1.8597560975609757, "grad_norm": 0.25495606660842896, "kl": 0.085693359375, "learning_rate": 4.471101807319072e-08, "loss": 0.1613, "reward": 0.2291666716337204, "reward_std": 0.21650634706020355, "rewards/accuracy_reward": 0.2291666716337204, "rewards/format_reward": 0.0, "step": 1525 }, { "completion_length": 1622.6666870117188, "epoch": 1.8609756097560974, "grad_norm": 0.30292466282844543, "kl": 0.0718994140625, "learning_rate": 4.394061835552554e-08, "loss": 0.127, "reward": 0.14583333395421505, "reward_std": 0.18042195588350296, "rewards/accuracy_reward": 0.14583333395421505, "rewards/format_reward": 0.0, "step": 1526 }, { "completion_length": 1484.9166870117188, "epoch": 1.8621951219512196, "grad_norm": 0.2254457324743271, "kl": 0.089599609375, "learning_rate": 4.317681502222159e-08, "loss": 0.107, "reward": 0.1458333395421505, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.1458333395421505, "rewards/format_reward": 0.0, "step": 1527 }, { "completion_length": 1774.6666870117188, "epoch": 1.8634146341463413, "grad_norm": 0.11391057819128036, "kl": 0.079345703125, "learning_rate": 4.2419611533536296e-08, "loss": 0.0286, "reward": 0.1458333358168602, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.1458333358168602, "rewards/format_reward": 0.0, "step": 1528 }, { "completion_length": 1796.2291870117188, "epoch": 1.8646341463414635, "grad_norm": 0.3482476472854614, "kl": 0.106201171875, "learning_rate": 4.1669011319827975e-08, "loss": 0.1972, "reward": 0.3541666865348816, "reward_std": 0.18042195588350296, "rewards/accuracy_reward": 0.3541666865348816, "rewards/format_reward": 0.0, "step": 1529 }, { "completion_length": 1198.979248046875, "epoch": 1.8658536585365852, "grad_norm": 0.22851526737213135, "kl": 0.074462890625, "learning_rate": 4.0925017781539896e-08, "loss": 0.0904, "reward": 0.2083333358168602, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.2083333358168602, "rewards/format_reward": 0.0, "step": 1530 }, { "completion_length": 1743.0625610351562, "epoch": 1.8670731707317074, "grad_norm": 0.17961429059505463, "kl": 0.0787353515625, "learning_rate": 4.018763428918509e-08, "loss": 0.0164, "reward": 0.1666666716337204, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.1666666716337204, "rewards/format_reward": 0.0, "step": 1531 }, { "completion_length": 1505.25, "epoch": 1.8682926829268292, "grad_norm": 0.1478978544473648, "kl": 0.0859375, "learning_rate": 3.9456864183331557e-08, "loss": 0.0767, "reward": 0.10416666977107525, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.10416666977107525, "rewards/format_reward": 0.0, "step": 1532 }, { "completion_length": 1589.0416870117188, "epoch": 1.8695121951219513, "grad_norm": 0.27158448100090027, "kl": 0.1162109375, "learning_rate": 3.873271077458607e-08, "loss": 0.0411, "reward": 0.1875, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.1875, "rewards/format_reward": 0.0, "step": 1533 }, { "completion_length": 1770.4375610351562, "epoch": 1.870731707317073, "grad_norm": 0.3546469807624817, "kl": 0.084228515625, "learning_rate": 3.80151773435804e-08, "loss": 0.1149, "reward": 0.2708333432674408, "reward_std": 0.21650634706020355, "rewards/accuracy_reward": 0.2708333432674408, "rewards/format_reward": 0.0, "step": 1534 }, { "completion_length": 1809.6251220703125, "epoch": 1.8719512195121952, "grad_norm": 0.1018819659948349, "kl": 0.0791015625, "learning_rate": 3.7304267140955305e-08, "loss": 0.0347, "reward": 0.08333333395421505, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.08333333395421505, "rewards/format_reward": 0.0, "step": 1535 }, { "completion_length": 1563.4375, "epoch": 1.873170731707317, "grad_norm": 0.11332801729440689, "kl": 0.096435546875, "learning_rate": 3.659998338734671e-08, "loss": 0.0002, "reward": 0.1458333358168602, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.1458333358168602, "rewards/format_reward": 0.0, "step": 1536 }, { "completion_length": 1866.9166870117188, "epoch": 1.8743902439024391, "grad_norm": 0.1634679138660431, "kl": 0.08740234375, "learning_rate": 3.590232927337056e-08, "loss": 0.0716, "reward": 0.3125000074505806, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.3125000074505806, "rewards/format_reward": 0.0, "step": 1537 }, { "completion_length": 1803.3541870117188, "epoch": 1.8756097560975609, "grad_norm": 0.17867393791675568, "kl": 0.082275390625, "learning_rate": 3.5211307959608475e-08, "loss": 0.1041, "reward": 0.12500000558793545, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.12500000558793545, "rewards/format_reward": 0.0, "step": 1538 }, { "completion_length": 1455.4375610351562, "epoch": 1.876829268292683, "grad_norm": 0.1541653722524643, "kl": 0.080322265625, "learning_rate": 3.452692257659379e-08, "loss": 0.0452, "reward": 0.1875, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.1875, "rewards/format_reward": 0.0, "step": 1539 }, { "completion_length": 1353.9375610351562, "epoch": 1.8780487804878048, "grad_norm": 0.285623699426651, "kl": 0.0830078125, "learning_rate": 3.3849176224796884e-08, "loss": 0.1648, "reward": 0.3750000149011612, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.3750000149011612, "rewards/format_reward": 0.0, "step": 1540 }, { "completion_length": 1943.8541870117188, "epoch": 1.879268292682927, "grad_norm": 0.17982155084609985, "kl": 0.091552734375, "learning_rate": 3.317807197461137e-08, "loss": 0.0505, "reward": 0.1666666679084301, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.1666666679084301, "rewards/format_reward": 0.0, "step": 1541 }, { "completion_length": 1920.8541870117188, "epoch": 1.8804878048780487, "grad_norm": 0.17678050696849823, "kl": 0.091796875, "learning_rate": 3.2513612866339916e-08, "loss": 0.0924, "reward": 0.1666666679084301, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.1666666679084301, "rewards/format_reward": 0.0, "step": 1542 }, { "completion_length": 2131.8333740234375, "epoch": 1.8817073170731708, "grad_norm": 0.19584013521671295, "kl": 0.095458984375, "learning_rate": 3.185580191018128e-08, "loss": 0.0843, "reward": 0.18750000558793545, "reward_std": 0.14433757215738297, "rewards/accuracy_reward": 0.18750000558793545, "rewards/format_reward": 0.0, "step": 1543 }, { "completion_length": 1422.2708740234375, "epoch": 1.8829268292682926, "grad_norm": 0.3136843144893646, "kl": 0.07568359375, "learning_rate": 3.1204642086215817e-08, "loss": 0.1935, "reward": 0.2708333432674408, "reward_std": 0.21650633960962296, "rewards/accuracy_reward": 0.2708333432674408, "rewards/format_reward": 0.0, "step": 1544 }, { "completion_length": 1726.0625, "epoch": 1.8841463414634148, "grad_norm": 0.2898077964782715, "kl": 0.073974609375, "learning_rate": 3.056013634439198e-08, "loss": 0.1008, "reward": 0.25, "reward_std": 0.18042195588350296, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.0, "step": 1545 }, { "completion_length": 1523.3125, "epoch": 1.8853658536585365, "grad_norm": 0.26922017335891724, "kl": 0.0830078125, "learning_rate": 2.992228760451349e-08, "loss": 0.1457, "reward": 0.16666667722165585, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.1458333432674408, "rewards/format_reward": 0.02083333395421505, "step": 1546 }, { "completion_length": 1560.4166870117188, "epoch": 1.8865853658536587, "grad_norm": 0.271720290184021, "kl": 0.0830078125, "learning_rate": 2.929109875622621e-08, "loss": 0.0784, "reward": 0.1875000074505806, "reward_std": 0.18042195588350296, "rewards/accuracy_reward": 0.1875000074505806, "rewards/format_reward": 0.0, "step": 1547 }, { "completion_length": 1812.4583740234375, "epoch": 1.8878048780487804, "grad_norm": 0.1532987654209137, "kl": 0.077392578125, "learning_rate": 2.8666572659003965e-08, "loss": 0.1279, "reward": 0.1041666679084301, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.1041666679084301, "rewards/format_reward": 0.0, "step": 1548 }, { "completion_length": 1646.3125610351562, "epoch": 1.8890243902439026, "grad_norm": 0.16763700544834137, "kl": 0.0869140625, "learning_rate": 2.804871214213689e-08, "loss": 0.035, "reward": 0.10416666977107525, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.10416666977107525, "rewards/format_reward": 0.0, "step": 1549 }, { "completion_length": 1498.7083740234375, "epoch": 1.8902439024390243, "grad_norm": 0.21274901926517487, "kl": 0.07421875, "learning_rate": 2.743752000471761e-08, "loss": 0.1138, "reward": 0.06250000186264515, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.06250000186264515, "rewards/format_reward": 0.0, "step": 1550 }, { "completion_length": 1561.0416870117188, "epoch": 1.8914634146341465, "grad_norm": 0.2538447380065918, "kl": 0.065185546875, "learning_rate": 2.6832999015629577e-08, "loss": 0.0851, "reward": 0.08333333395421505, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.08333333395421505, "rewards/format_reward": 0.0, "step": 1551 }, { "completion_length": 1460.0000610351562, "epoch": 1.8926829268292682, "grad_norm": 0.21260160207748413, "kl": 0.060302734375, "learning_rate": 2.6235151913533595e-08, "loss": 0.0841, "reward": 0.1875000074505806, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.1875000074505806, "rewards/format_reward": 0.0, "step": 1552 }, { "completion_length": 1328.916748046875, "epoch": 1.8939024390243904, "grad_norm": 0.2561925947666168, "kl": 0.06884765625, "learning_rate": 2.5643981406855642e-08, "loss": 0.0947, "reward": 0.3958333432674408, "reward_std": 0.18042194843292236, "rewards/accuracy_reward": 0.3958333432674408, "rewards/format_reward": 0.0, "step": 1553 }, { "completion_length": 1512.5833740234375, "epoch": 1.895121951219512, "grad_norm": 0.13209940493106842, "kl": 0.070068359375, "learning_rate": 2.50594901737749e-08, "loss": 0.0525, "reward": 0.16666667722165585, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.16666667722165585, "rewards/format_reward": 0.0, "step": 1554 }, { "completion_length": 2127.2708740234375, "epoch": 1.8963414634146343, "grad_norm": 0.15172114968299866, "kl": 0.095458984375, "learning_rate": 2.4481680862211418e-08, "loss": 0.0047, "reward": 0.0625, "reward_std": 0.0, "rewards/accuracy_reward": 0.0625, "rewards/format_reward": 0.0, "step": 1555 }, { "completion_length": 1334.6875610351562, "epoch": 1.897560975609756, "grad_norm": 0.29449230432510376, "kl": 0.0574951171875, "learning_rate": 2.3910556089814294e-08, "loss": 0.115, "reward": 0.1666666716337204, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.1666666716337204, "rewards/format_reward": 0.0, "step": 1556 }, { "completion_length": 1694.625, "epoch": 1.8987804878048782, "grad_norm": 0.22945699095726013, "kl": 0.080078125, "learning_rate": 2.334611844394935e-08, "loss": 0.0051, "reward": 0.1875000111758709, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.1875000111758709, "rewards/format_reward": 0.0, "step": 1557 }, { "completion_length": 1668.7708740234375, "epoch": 1.9, "grad_norm": 0.1354336142539978, "kl": 0.07568359375, "learning_rate": 2.278837048168797e-08, "loss": 0.0544, "reward": 0.125, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "step": 1558 }, { "completion_length": 1920.5625, "epoch": 1.901219512195122, "grad_norm": 0.2421872615814209, "kl": 0.094482421875, "learning_rate": 2.223731472979512e-08, "loss": 0.0983, "reward": 0.1041666679084301, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.1041666679084301, "rewards/format_reward": 0.0, "step": 1559 }, { "completion_length": 1489.5416870117188, "epoch": 1.9024390243902438, "grad_norm": 0.4394077658653259, "kl": 0.093505859375, "learning_rate": 2.1692953684718187e-08, "loss": 0.1442, "reward": 0.1666666716337204, "reward_std": 0.18042195588350296, "rewards/accuracy_reward": 0.1666666716337204, "rewards/format_reward": 0.0, "step": 1560 }, { "completion_length": 1351.8958740234375, "epoch": 1.903658536585366, "grad_norm": 0.2472190260887146, "kl": 0.085205078125, "learning_rate": 2.1155289812575305e-08, "loss": 0.0129, "reward": 0.2916666679084301, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.2916666679084301, "rewards/format_reward": 0.0, "step": 1561 }, { "completion_length": 1484.5, "epoch": 1.9048780487804877, "grad_norm": 0.16479888558387756, "kl": 0.06689453125, "learning_rate": 2.0624325549144894e-08, "loss": 0.0455, "reward": 0.1666666716337204, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.1666666716337204, "rewards/format_reward": 0.0, "step": 1562 }, { "completion_length": 1996.4584350585938, "epoch": 1.90609756097561, "grad_norm": 0.20834337174892426, "kl": 0.071533203125, "learning_rate": 2.0100063299853645e-08, "loss": -0.0314, "reward": 0.0833333358168602, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.0833333358168602, "rewards/format_reward": 0.0, "step": 1563 }, { "completion_length": 2074.854248046875, "epoch": 1.9073170731707316, "grad_norm": 0.20955395698547363, "kl": 0.111572265625, "learning_rate": 1.9582505439766028e-08, "loss": 0.0718, "reward": 0.125, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "step": 1564 }, { "completion_length": 1382.6250610351562, "epoch": 1.9085365853658538, "grad_norm": 0.21121741831302643, "kl": 0.07177734375, "learning_rate": 1.9071654313574495e-08, "loss": 0.0729, "reward": 0.1458333432674408, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.1458333432674408, "rewards/format_reward": 0.0, "step": 1565 }, { "completion_length": 1462.1875610351562, "epoch": 1.9097560975609755, "grad_norm": 0.32714754343032837, "kl": 0.0704345703125, "learning_rate": 1.856751223558695e-08, "loss": 0.117, "reward": 0.2083333432674408, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.2083333432674408, "rewards/format_reward": 0.0, "step": 1566 }, { "completion_length": 1424.7083740234375, "epoch": 1.9109756097560977, "grad_norm": 0.18917284905910492, "kl": 0.061767578125, "learning_rate": 1.807008148971795e-08, "loss": 0.0662, "reward": 0.2083333358168602, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.2083333358168602, "rewards/format_reward": 0.0, "step": 1567 }, { "completion_length": 1608.8958740234375, "epoch": 1.9121951219512194, "grad_norm": 0.2899203598499298, "kl": 0.06640625, "learning_rate": 1.7579364329477375e-08, "loss": 0.1107, "reward": 0.1875000074505806, "reward_std": 0.18042195588350296, "rewards/accuracy_reward": 0.1875000074505806, "rewards/format_reward": 0.0, "step": 1568 }, { "completion_length": 1354.2500610351562, "epoch": 1.9134146341463416, "grad_norm": 0.1603468358516693, "kl": 0.09033203125, "learning_rate": 1.7095362977960605e-08, "loss": 0.0573, "reward": 0.1458333395421505, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.1458333395421505, "rewards/format_reward": 0.0, "step": 1569 }, { "completion_length": 1490.2916870117188, "epoch": 1.9146341463414633, "grad_norm": 0.3233860731124878, "kl": 0.09130859375, "learning_rate": 1.661807962783851e-08, "loss": 0.1418, "reward": 0.2708333432674408, "reward_std": 0.21650634706020355, "rewards/accuracy_reward": 0.2708333432674408, "rewards/format_reward": 0.0, "step": 1570 }, { "completion_length": 1607.3958740234375, "epoch": 1.9158536585365855, "grad_norm": 0.19245733320713043, "kl": 0.081298828125, "learning_rate": 1.6147516441347822e-08, "loss": 0.0502, "reward": 0.2291666716337204, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.2291666716337204, "rewards/format_reward": 0.0, "step": 1571 }, { "completion_length": 1488.625, "epoch": 1.9170731707317072, "grad_norm": 0.36428895592689514, "kl": 0.072509765625, "learning_rate": 1.5683675550279943e-08, "loss": 0.2118, "reward": 0.16666667722165585, "reward_std": 0.18042197078466415, "rewards/accuracy_reward": 0.16666667722165585, "rewards/format_reward": 0.0, "step": 1572 }, { "completion_length": 1643.2708740234375, "epoch": 1.9182926829268294, "grad_norm": 0.12023729085922241, "kl": 0.072265625, "learning_rate": 1.5226559055972976e-08, "loss": 0.0036, "reward": 0.1875, "reward_std": 0.0, "rewards/accuracy_reward": 0.1875, "rewards/format_reward": 0.0, "step": 1573 }, { "completion_length": 1692.3125610351562, "epoch": 1.9195121951219511, "grad_norm": 0.26583561301231384, "kl": 0.08642578125, "learning_rate": 1.4776169029301234e-08, "loss": 0.1114, "reward": 0.2291666716337204, "reward_std": 0.21650634706020355, "rewards/accuracy_reward": 0.2291666716337204, "rewards/format_reward": 0.0, "step": 1574 }, { "completion_length": 1607.1666870117188, "epoch": 1.9207317073170733, "grad_norm": 0.46342435479164124, "kl": 0.073974609375, "learning_rate": 1.4332507510666393e-08, "loss": 0.0336, "reward": 0.2708333432674408, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.2708333432674408, "rewards/format_reward": 0.0, "step": 1575 }, { "completion_length": 1333.9166870117188, "epoch": 1.921951219512195, "grad_norm": 0.1875724196434021, "kl": 0.071533203125, "learning_rate": 1.3895576509987685e-08, "loss": 0.0819, "reward": 0.25000000558793545, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.25000000558793545, "rewards/format_reward": 0.0, "step": 1576 }, { "completion_length": 1503.1875, "epoch": 1.9231707317073172, "grad_norm": 0.36296355724334717, "kl": 0.079345703125, "learning_rate": 1.346537800669323e-08, "loss": 0.095, "reward": 0.1875000074505806, "reward_std": 0.18042195588350296, "rewards/accuracy_reward": 0.1666666716337204, "rewards/format_reward": 0.02083333395421505, "step": 1577 }, { "completion_length": 1452.9583740234375, "epoch": 1.924390243902439, "grad_norm": 0.16157208383083344, "kl": 0.05810546875, "learning_rate": 1.3041913949710715e-08, "loss": 0.0553, "reward": 0.3541666716337204, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.3541666716337204, "rewards/format_reward": 0.0, "step": 1578 }, { "completion_length": 1682.6250610351562, "epoch": 1.9256097560975611, "grad_norm": 0.28617843985557556, "kl": 0.085205078125, "learning_rate": 1.2625186257459064e-08, "loss": 0.1668, "reward": 0.1666666679084301, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.1666666679084301, "rewards/format_reward": 0.0, "step": 1579 }, { "completion_length": 1681.7083740234375, "epoch": 1.9268292682926829, "grad_norm": 0.30020374059677124, "kl": 0.083740234375, "learning_rate": 1.2215196817839447e-08, "loss": 0.1881, "reward": 0.2291666716337204, "reward_std": 0.21650634706020355, "rewards/accuracy_reward": 0.2291666716337204, "rewards/format_reward": 0.0, "step": 1580 }, { "completion_length": 1715.75, "epoch": 1.928048780487805, "grad_norm": 5.214967250823975, "kl": 0.398681640625, "learning_rate": 1.1811947488226282e-08, "loss": 0.1676, "reward": 0.3333333432674408, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.3333333432674408, "rewards/format_reward": 0.0, "step": 1581 }, { "completion_length": 1255.6458740234375, "epoch": 1.9292682926829268, "grad_norm": 0.0864172950387001, "kl": 0.067626953125, "learning_rate": 1.1415440095460083e-08, "loss": 0.003, "reward": 0.125, "reward_std": 0.0, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "step": 1582 }, { "completion_length": 1603.8125, "epoch": 1.930487804878049, "grad_norm": 0.26917973160743713, "kl": 0.0673828125, "learning_rate": 1.1025676435837296e-08, "loss": 0.1413, "reward": 0.2083333432674408, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.2083333432674408, "rewards/format_reward": 0.0, "step": 1583 }, { "completion_length": 1398.854248046875, "epoch": 1.9317073170731707, "grad_norm": 0.0717015191912651, "kl": 0.0487060546875, "learning_rate": 1.06426582751043e-08, "loss": 0.0026, "reward": 0.1875, "reward_std": 0.0, "rewards/accuracy_reward": 0.1875, "rewards/format_reward": 0.0, "step": 1584 }, { "completion_length": 1726.8958740234375, "epoch": 1.9329268292682928, "grad_norm": 0.3111580014228821, "kl": 0.1005859375, "learning_rate": 1.0266387348447758e-08, "loss": 0.1816, "reward": 0.1458333395421505, "reward_std": 0.21650634706020355, "rewards/accuracy_reward": 0.1458333395421505, "rewards/format_reward": 0.0, "step": 1585 }, { "completion_length": 1688.7291870117188, "epoch": 1.9341463414634146, "grad_norm": 0.22925113141536713, "kl": 0.076171875, "learning_rate": 9.896865360487451e-09, "loss": 0.0663, "reward": 0.1041666679084301, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.1041666679084301, "rewards/format_reward": 0.0, "step": 1586 }, { "completion_length": 1239.8542175292969, "epoch": 1.9353658536585368, "grad_norm": 0.0671822801232338, "kl": 0.068359375, "learning_rate": 9.534093985268444e-09, "loss": 0.0024, "reward": 0.1041666679084301, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.1041666679084301, "rewards/format_reward": 0.0, "step": 1587 }, { "completion_length": 1915.5208740234375, "epoch": 1.9365853658536585, "grad_norm": 0.19842761754989624, "kl": 0.0732421875, "learning_rate": 9.178074866253605e-09, "loss": -0.0099, "reward": 0.0416666679084301, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.0416666679084301, "rewards/format_reward": 0.0, "step": 1588 }, { "completion_length": 1713.7500610351562, "epoch": 1.9378048780487804, "grad_norm": 0.2526528537273407, "kl": 0.069091796875, "learning_rate": 8.82880961631577e-09, "loss": 0.0117, "reward": 0.12500000558793545, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.12500000558793545, "rewards/format_reward": 0.0, "step": 1589 }, { "completion_length": 1601.1458740234375, "epoch": 1.9390243902439024, "grad_norm": 0.2622646987438202, "kl": 0.084716796875, "learning_rate": 8.486299817731412e-09, "loss": 0.0503, "reward": 0.18750000558793545, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.18750000558793545, "rewards/format_reward": 0.0, "step": 1590 }, { "completion_length": 1773.2291870117188, "epoch": 1.9402439024390243, "grad_norm": 0.3941381871700287, "kl": 0.086669921875, "learning_rate": 8.150547022171828e-09, "loss": 0.0019, "reward": 0.06250000186264515, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.0416666679084301, "rewards/format_reward": 0.02083333395421505, "step": 1591 }, { "completion_length": 1638.6041870117188, "epoch": 1.9414634146341463, "grad_norm": 0.3316585123538971, "kl": 0.0657958984375, "learning_rate": 7.821552750697958e-09, "loss": -0.0408, "reward": 0.2083333358168602, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.2083333358168602, "rewards/format_reward": 0.0, "step": 1592 }, { "completion_length": 1504.3125610351562, "epoch": 1.9426829268292682, "grad_norm": 0.2543622851371765, "kl": 0.0760498046875, "learning_rate": 7.499318493751905e-09, "loss": 0.1861, "reward": 0.2500000074505806, "reward_std": 0.18042195588350296, "rewards/accuracy_reward": 0.2500000074505806, "rewards/format_reward": 0.0, "step": 1593 }, { "completion_length": 1519.0000610351562, "epoch": 1.9439024390243902, "grad_norm": 0.3067987561225891, "kl": 0.064697265625, "learning_rate": 7.1838457111516044e-09, "loss": 0.0084, "reward": 0.1666666679084301, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.1666666679084301, "rewards/format_reward": 0.0, "step": 1594 }, { "completion_length": 1091.9583740234375, "epoch": 1.9451219512195121, "grad_norm": 0.388936847448349, "kl": 0.0635986328125, "learning_rate": 6.875135832082657e-09, "loss": 0.0606, "reward": 0.2291666716337204, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.2291666716337204, "rewards/format_reward": 0.0, "step": 1595 }, { "completion_length": 1456.9791870117188, "epoch": 1.946341463414634, "grad_norm": 0.18355830013751984, "kl": 0.09033203125, "learning_rate": 6.573190255093342e-09, "loss": 0.0988, "reward": 0.2916666865348816, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.2916666865348816, "rewards/format_reward": 0.0, "step": 1596 }, { "completion_length": 1886.666748046875, "epoch": 1.947560975609756, "grad_norm": 0.27557137608528137, "kl": 0.082275390625, "learning_rate": 6.278010348087282e-09, "loss": 0.0396, "reward": 0.0833333358168602, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.0833333358168602, "rewards/format_reward": 0.0, "step": 1597 }, { "completion_length": 1928.1876220703125, "epoch": 1.948780487804878, "grad_norm": 0.158290833234787, "kl": 0.079833984375, "learning_rate": 5.989597448317785e-09, "loss": 0.0611, "reward": 0.1666666716337204, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.1666666716337204, "rewards/format_reward": 0.0, "step": 1598 }, { "completion_length": 1291.3958740234375, "epoch": 1.95, "grad_norm": 0.2550250291824341, "kl": 0.066650390625, "learning_rate": 5.707952862381682e-09, "loss": 0.1601, "reward": 0.14583333395421505, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.14583333395421505, "rewards/format_reward": 0.0, "step": 1599 }, { "completion_length": 2106.5001220703125, "epoch": 1.951219512195122, "grad_norm": 0.26331937313079834, "kl": 0.09814453125, "learning_rate": 5.433077866212999e-09, "loss": 0.1503, "reward": 0.16666667722165585, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.16666667722165585, "rewards/format_reward": 0.0, "step": 1600 }, { "completion_length": 1600.3541870117188, "epoch": 1.9524390243902439, "grad_norm": 0.2648146450519562, "kl": 0.063720703125, "learning_rate": 5.164973705077624e-09, "loss": 0.0027, "reward": 0.1041666679084301, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.1041666679084301, "rewards/format_reward": 0.0, "step": 1601 }, { "completion_length": 1986.6041870117188, "epoch": 1.9536585365853658, "grad_norm": 0.10995600372552872, "kl": 0.086669921875, "learning_rate": 4.903641593567654e-09, "loss": 0.0656, "reward": 0.1666666716337204, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.1666666716337204, "rewards/format_reward": 0.0, "step": 1602 }, { "completion_length": 1539.8541870117188, "epoch": 1.9548780487804878, "grad_norm": 0.17011724412441254, "kl": 0.080322265625, "learning_rate": 4.649082715595554e-09, "loss": 0.009, "reward": 0.02083333395421505, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.02083333395421505, "rewards/format_reward": 0.0, "step": 1603 }, { "completion_length": 1689.854248046875, "epoch": 1.9560975609756097, "grad_norm": 0.17988145351409912, "kl": 0.070556640625, "learning_rate": 4.401298224389338e-09, "loss": 0.0136, "reward": 0.229166679084301, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.229166679084301, "rewards/format_reward": 0.0, "step": 1604 }, { "completion_length": 1966.1875610351562, "epoch": 1.9573170731707317, "grad_norm": 0.21440589427947998, "kl": 0.08251953125, "learning_rate": 4.160289242486737e-09, "loss": 0.1013, "reward": 0.2083333432674408, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.2083333432674408, "rewards/format_reward": 0.0, "step": 1605 }, { "completion_length": 1129.8958740234375, "epoch": 1.9585365853658536, "grad_norm": 0.41219407320022583, "kl": 0.081787109375, "learning_rate": 3.926056861730532e-09, "loss": 0.0846, "reward": 0.2291666716337204, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.2083333395421505, "rewards/format_reward": 0.02083333395421505, "step": 1606 }, { "completion_length": 1903.3958740234375, "epoch": 1.9597560975609756, "grad_norm": 2.8246495723724365, "kl": 0.209228515625, "learning_rate": 3.6986021432633967e-09, "loss": 0.1304, "reward": 0.2916666716337204, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.2916666716337204, "rewards/format_reward": 0.0, "step": 1607 }, { "completion_length": 1447.7708740234375, "epoch": 1.9609756097560975, "grad_norm": 0.13706016540527344, "kl": 0.107666015625, "learning_rate": 3.4779261175232334e-09, "loss": 0.004, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "step": 1608 }, { "completion_length": 1748.2916870117188, "epoch": 1.9621951219512195, "grad_norm": 0.18737952411174774, "kl": 0.0927734375, "learning_rate": 3.2640297842385092e-09, "loss": 0.0112, "reward": 0.1041666716337204, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.1041666716337204, "rewards/format_reward": 0.0, "step": 1609 }, { "completion_length": 1413.75, "epoch": 1.9634146341463414, "grad_norm": 0.21151605248451233, "kl": 0.073974609375, "learning_rate": 3.0569141124234256e-09, "loss": 0.0469, "reward": 0.2083333432674408, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.2083333432674408, "rewards/format_reward": 0.0, "step": 1610 }, { "completion_length": 1560.2083740234375, "epoch": 1.9646341463414634, "grad_norm": 0.13386563956737518, "kl": 0.075439453125, "learning_rate": 2.8565800403740906e-09, "loss": 0.04, "reward": 0.1666666679084301, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.1666666679084301, "rewards/format_reward": 0.0, "step": 1611 }, { "completion_length": 1867.4376220703125, "epoch": 1.9658536585365853, "grad_norm": 0.28275448083877563, "kl": 0.094482421875, "learning_rate": 2.6630284756635204e-09, "loss": 0.0541, "reward": 0.3333333432674408, "reward_std": 0.18042196333408356, "rewards/accuracy_reward": 0.3333333432674408, "rewards/format_reward": 0.0, "step": 1612 }, { "completion_length": 1994.0626220703125, "epoch": 1.9670731707317073, "grad_norm": 0.23630951344966888, "kl": 0.08203125, "learning_rate": 2.4762602951383104e-09, "loss": 0.1314, "reward": 0.2916666716337204, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.2916666716337204, "rewards/format_reward": 0.0, "step": 1613 }, { "completion_length": 1835.5416870117188, "epoch": 1.9682926829268292, "grad_norm": 0.1736697554588318, "kl": 0.08544921875, "learning_rate": 2.2962763449141387e-09, "loss": 0.0501, "reward": 0.1875000074505806, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.1875000074505806, "rewards/format_reward": 0.0, "step": 1614 }, { "completion_length": 1318.7708740234375, "epoch": 1.9695121951219512, "grad_norm": 0.24054989218711853, "kl": 0.08447265625, "learning_rate": 2.123077440372101e-09, "loss": 0.0182, "reward": 0.25, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.0, "step": 1615 }, { "completion_length": 1692.9375610351562, "epoch": 1.9707317073170731, "grad_norm": 0.1033225804567337, "kl": 0.07177734375, "learning_rate": 1.9566643661550478e-09, "loss": 0.0035, "reward": 0.125, "reward_std": 0.0, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "step": 1616 }, { "completion_length": 1433.4583740234375, "epoch": 1.971951219512195, "grad_norm": 1.3399320840835571, "kl": 0.101806640625, "learning_rate": 1.7970378761639206e-09, "loss": -0.0413, "reward": 0.1875, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.1666666716337204, "rewards/format_reward": 0.02083333395421505, "step": 1617 }, { "completion_length": 1417.104248046875, "epoch": 1.973170731707317, "grad_norm": 0.1722671240568161, "kl": 0.083984375, "learning_rate": 1.6441986935545884e-09, "loss": 0.0236, "reward": 0.1458333358168602, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.1458333358168602, "rewards/format_reward": 0.0, "step": 1618 }, { "completion_length": 1818.2916870117188, "epoch": 1.974390243902439, "grad_norm": 0.15060457587242126, "kl": 0.078125, "learning_rate": 1.4981475107341825e-09, "loss": 0.0191, "reward": 0.3125, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.3125, "rewards/format_reward": 0.0, "step": 1619 }, { "completion_length": 1104.3958740234375, "epoch": 1.975609756097561, "grad_norm": 0.2602185606956482, "kl": 0.048095703125, "learning_rate": 1.3588849893579336e-09, "loss": -0.0594, "reward": 0.2916666716337204, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.2916666716337204, "rewards/format_reward": 0.0, "step": 1620 }, { "completion_length": 1972.1041870117188, "epoch": 1.976829268292683, "grad_norm": 0.25712689757347107, "kl": 0.082763671875, "learning_rate": 1.226411760327173e-09, "loss": 0.1432, "reward": 0.3333333432674408, "reward_std": 0.18042197078466415, "rewards/accuracy_reward": 0.3333333432674408, "rewards/format_reward": 0.0, "step": 1621 }, { "completion_length": 1682.2916870117188, "epoch": 1.9780487804878049, "grad_norm": 0.2629391849040985, "kl": 0.078369140625, "learning_rate": 1.1007284237850025e-09, "loss": 0.1281, "reward": 0.1875, "reward_std": 0.18042195588350296, "rewards/accuracy_reward": 0.1875, "rewards/format_reward": 0.0, "step": 1622 }, { "completion_length": 1827.6041870117188, "epoch": 1.9792682926829268, "grad_norm": 0.16482365131378174, "kl": 0.083251953125, "learning_rate": 9.818355491144626e-10, "loss": 0.0276, "reward": 0.06250000186264515, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.06250000186264515, "rewards/format_reward": 0.0, "step": 1623 }, { "completion_length": 1699.1875, "epoch": 1.9804878048780488, "grad_norm": 0.35353758931159973, "kl": 0.076416015625, "learning_rate": 8.697336749358687e-10, "loss": 0.084, "reward": 0.1666666716337204, "reward_std": 0.18042196333408356, "rewards/accuracy_reward": 0.1458333395421505, "rewards/format_reward": 0.02083333395421505, "step": 1624 }, { "completion_length": 1986.104248046875, "epoch": 1.9817073170731707, "grad_norm": 0.19637711346149445, "kl": 0.096923828125, "learning_rate": 7.644233091043118e-10, "loss": 0.1318, "reward": 0.291666679084301, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.291666679084301, "rewards/format_reward": 0.0, "step": 1625 }, { "completion_length": 2079.375, "epoch": 1.9829268292682927, "grad_norm": 0.5732306838035583, "kl": 0.101806640625, "learning_rate": 6.659049287071617e-10, "loss": 0.0738, "reward": 0.1041666716337204, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.1041666716337204, "rewards/format_reward": 0.0, "step": 1626 }, { "completion_length": 1584.9375610351562, "epoch": 1.9841463414634146, "grad_norm": 0.2708092927932739, "kl": 0.0692138671875, "learning_rate": 5.741789800622344e-10, "loss": 0.0969, "reward": 0.3125000074505806, "reward_std": 0.18042195588350296, "rewards/accuracy_reward": 0.3125000074505806, "rewards/format_reward": 0.0, "step": 1627 }, { "completion_length": 1467.916748046875, "epoch": 1.9853658536585366, "grad_norm": 0.2703225910663605, "kl": 0.0703125, "learning_rate": 4.892458787154608e-10, "loss": 0.1274, "reward": 0.3125, "reward_std": 0.21650634706020355, "rewards/accuracy_reward": 0.3125, "rewards/format_reward": 0.0, "step": 1628 }, { "completion_length": 1708.0625610351562, "epoch": 1.9865853658536585, "grad_norm": 0.22886954247951508, "kl": 0.076171875, "learning_rate": 4.1110600943905507e-10, "loss": -0.0003, "reward": 0.2083333432674408, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.2083333432674408, "rewards/format_reward": 0.0, "step": 1629 }, { "completion_length": 1154.5208740234375, "epoch": 1.9878048780487805, "grad_norm": 0.34930136799812317, "kl": 0.06640625, "learning_rate": 3.397597262300156e-10, "loss": 0.1308, "reward": 0.16666667722165585, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.16666667722165585, "rewards/format_reward": 0.0, "step": 1630 }, { "completion_length": 1498.666748046875, "epoch": 1.9890243902439024, "grad_norm": 0.28230759501457214, "kl": 0.07666015625, "learning_rate": 2.7520735230845973e-10, "loss": 0.1489, "reward": 0.2291666716337204, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.2291666716337204, "rewards/format_reward": 0.0, "step": 1631 }, { "completion_length": 1845.0834350585938, "epoch": 1.9902439024390244, "grad_norm": 0.16053642332553864, "kl": 0.080322265625, "learning_rate": 2.1744918011595837e-10, "loss": 0.0048, "reward": 0.08333333395421505, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.08333333395421505, "rewards/format_reward": 0.0, "step": 1632 }, { "completion_length": 1551.8750610351562, "epoch": 1.9914634146341463, "grad_norm": 0.2249978631734848, "kl": 0.0660400390625, "learning_rate": 1.664854713142039e-10, "loss": 0.0327, "reward": 0.125, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "step": 1633 }, { "completion_length": 1795.0208740234375, "epoch": 1.9926829268292683, "grad_norm": 0.1317099928855896, "kl": 0.07861328125, "learning_rate": 1.2231645678401072e-10, "loss": 0.0675, "reward": 0.2291666716337204, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.2291666716337204, "rewards/format_reward": 0.0, "step": 1634 }, { "completion_length": 1227.0625610351562, "epoch": 1.9939024390243902, "grad_norm": 0.26589706540107727, "kl": 0.0771484375, "learning_rate": 8.494233662431627e-11, "loss": 0.0804, "reward": 0.1250000037252903, "reward_std": 0.18042195588350296, "rewards/accuracy_reward": 0.1250000037252903, "rewards/format_reward": 0.0, "step": 1635 }, { "completion_length": 1571.6041870117188, "epoch": 1.9951219512195122, "grad_norm": 4.611546516418457, "kl": 0.1064453125, "learning_rate": 5.436328015101522e-11, "loss": 0.1019, "reward": 0.3541666716337204, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.3541666716337204, "rewards/format_reward": 0.0, "step": 1636 }, { "completion_length": 1775.104248046875, "epoch": 1.9963414634146341, "grad_norm": 0.1396946907043457, "kl": 0.0775146484375, "learning_rate": 3.057942589645979e-11, "loss": 0.0413, "reward": 0.1666666716337204, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.1666666716337204, "rewards/format_reward": 0.0, "step": 1637 }, { "completion_length": 1204.9583740234375, "epoch": 1.997560975609756, "grad_norm": 0.22410139441490173, "kl": 0.05712890625, "learning_rate": 1.359088160846067e-11, "loss": 0.0944, "reward": 0.0625, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.0625, "rewards/format_reward": 0.0, "step": 1638 }, { "completion_length": 1376.9583740234375, "epoch": 1.998780487804878, "grad_norm": 0.4389346241950989, "kl": 0.0791015625, "learning_rate": 3.3977242502869487e-12, "loss": 0.0996, "reward": 0.2708333358168602, "reward_std": 0.18042195588350296, "rewards/accuracy_reward": 0.2500000074505806, "rewards/format_reward": 0.02083333395421505, "step": 1639 }, { "completion_length": 1408.625, "epoch": 2.0, "grad_norm": 0.18749462068080902, "kl": 0.0655517578125, "learning_rate": 0.0, "loss": 0.1093, "reward": 0.0416666679084301, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.0416666679084301, "rewards/format_reward": 0.0, "step": 1640 }, { "epoch": 2.0, "step": 1640, "total_flos": 0.0, "train_loss": 0.0569690075825815, "train_runtime": 141636.0417, "train_samples_per_second": 0.185, "train_steps_per_second": 0.012 } ], "logging_steps": 1, "max_steps": 1640, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }