{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9994666666666666, "eval_steps": 10, "global_step": 937, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "completion_length": 859.6666870117188, "epoch": 0.0010666666666666667, "grad_norm": 0.1633366346359253, "kl": 0.0, "learning_rate": 3.191489361702128e-08, "loss": 0.1468, "reward": 0.6666666865348816, "reward_std": 0.38552647083997726, "rewards/accuracy_reward": 0.6666666865348816, "rewards/format_reward": 0.0, "step": 1 }, { "completion_length": 698.1041870117188, "epoch": 0.0021333333333333334, "grad_norm": 0.2470741868019104, "kl": 0.0, "learning_rate": 6.382978723404255e-08, "loss": 0.037, "reward": 0.5000000055879354, "reward_std": 0.36809216812253, "rewards/accuracy_reward": 0.5000000055879354, "rewards/format_reward": 0.0, "step": 2 }, { "completion_length": 923.3542022705078, "epoch": 0.0032, "grad_norm": 0.46142876148223877, "kl": 0.00016450881958007812, "learning_rate": 9.574468085106382e-08, "loss": 0.2867, "reward": 0.6041666939854622, "reward_std": 0.36417656019330025, "rewards/accuracy_reward": 0.6041666939854622, "rewards/format_reward": 0.0, "step": 3 }, { "completion_length": 928.0208740234375, "epoch": 0.004266666666666667, "grad_norm": 0.14440442621707916, "kl": 0.00013947486877441406, "learning_rate": 1.276595744680851e-07, "loss": 0.127, "reward": 0.6666666865348816, "reward_std": 0.4326418787240982, "rewards/accuracy_reward": 0.6666666865348816, "rewards/format_reward": 0.0, "step": 4 }, { "completion_length": 813.9375228881836, "epoch": 0.005333333333333333, "grad_norm": 0.5711305737495422, "kl": 0.00018334388732910156, "learning_rate": 1.5957446808510638e-07, "loss": -0.002, "reward": 0.6875000298023224, "reward_std": 0.40168892219662666, "rewards/accuracy_reward": 0.6875000298023224, "rewards/format_reward": 0.0, "step": 5 }, { "completion_length": 925.6458435058594, "epoch": 0.0064, "grad_norm": 0.34275493025779724, "kl": 0.00015985965728759766, "learning_rate": 1.9148936170212765e-07, "loss": 0.0915, "reward": 0.4583333432674408, "reward_std": 0.3680921643972397, "rewards/accuracy_reward": 0.4583333432674408, "rewards/format_reward": 0.0, "step": 6 }, { "completion_length": 938.5833435058594, "epoch": 0.007466666666666667, "grad_norm": 0.13078002631664276, "kl": 0.0002027750015258789, "learning_rate": 2.2340425531914894e-07, "loss": 0.0871, "reward": 0.604166679084301, "reward_std": 0.2350771203637123, "rewards/accuracy_reward": 0.604166679084301, "rewards/format_reward": 0.0, "step": 7 }, { "completion_length": 828.3125152587891, "epoch": 0.008533333333333334, "grad_norm": 0.1887660175561905, "kl": 0.00016546249389648438, "learning_rate": 2.553191489361702e-07, "loss": 0.1321, "reward": 0.7916666865348816, "reward_std": 0.3506578877568245, "rewards/accuracy_reward": 0.7916666865348816, "rewards/format_reward": 0.0, "step": 8 }, { "completion_length": 649.0625152587891, "epoch": 0.0096, "grad_norm": 0.21328449249267578, "kl": 0.00018727779388427734, "learning_rate": 2.872340425531915e-07, "loss": 0.0493, "reward": 0.708333358168602, "reward_std": 0.3506578654050827, "rewards/accuracy_reward": 0.708333358168602, "rewards/format_reward": 0.0, "step": 9 }, { "completion_length": 736.4375152587891, "epoch": 0.010666666666666666, "grad_norm": 0.21250943839550018, "kl": 0.00017786026000976562, "learning_rate": 3.1914893617021275e-07, "loss": 0.2306, "reward": 0.833333358168602, "reward_std": 0.3707359507679939, "rewards/accuracy_reward": 0.833333358168602, "rewards/format_reward": 0.0, "step": 10 }, { "completion_length": 645.5625228881836, "epoch": 0.011733333333333333, "grad_norm": 0.6602387428283691, "kl": 0.00023448467254638672, "learning_rate": 3.5106382978723405e-07, "loss": -0.1184, "reward": 0.7500000149011612, "reward_std": 0.23116153478622437, "rewards/accuracy_reward": 0.7500000149011612, "rewards/format_reward": 0.0, "step": 11 }, { "completion_length": 613.8125152587891, "epoch": 0.0128, "grad_norm": 0.3021746277809143, "kl": 0.00020456314086914062, "learning_rate": 3.829787234042553e-07, "loss": 0.0905, "reward": 0.7500000149011612, "reward_std": 0.4152076169848442, "rewards/accuracy_reward": 0.7500000149011612, "rewards/format_reward": 0.0, "step": 12 }, { "completion_length": 623.2083511352539, "epoch": 0.013866666666666666, "grad_norm": 0.19089840352535248, "kl": 0.0002193450927734375, "learning_rate": 4.1489361702127664e-07, "loss": 0.0999, "reward": 0.5625000149011612, "reward_std": 0.2525113970041275, "rewards/accuracy_reward": 0.5625000149011612, "rewards/format_reward": 0.0, "step": 13 }, { "completion_length": 773.1041946411133, "epoch": 0.014933333333333333, "grad_norm": 0.1808082014322281, "kl": 0.00021839141845703125, "learning_rate": 4.468085106382979e-07, "loss": 0.1891, "reward": 0.6250000223517418, "reward_std": 0.3776952549815178, "rewards/accuracy_reward": 0.6250000223517418, "rewards/format_reward": 0.0, "step": 14 }, { "completion_length": 954.3750152587891, "epoch": 0.016, "grad_norm": 0.15659639239311218, "kl": 0.0002205371856689453, "learning_rate": 4.787234042553192e-07, "loss": 0.2952, "reward": 0.541666679084301, "reward_std": 0.4326419234275818, "rewards/accuracy_reward": 0.541666679084301, "rewards/format_reward": 0.0, "step": 15 }, { "completion_length": 719.1250152587891, "epoch": 0.017066666666666667, "grad_norm": 0.4306057393550873, "kl": 0.0002903938293457031, "learning_rate": 5.106382978723404e-07, "loss": 0.2142, "reward": 0.6875000223517418, "reward_std": 0.33713920414447784, "rewards/accuracy_reward": 0.6875000223517418, "rewards/format_reward": 0.0, "step": 16 }, { "completion_length": 707.5000152587891, "epoch": 0.018133333333333335, "grad_norm": 0.311917781829834, "kl": 0.0002505779266357422, "learning_rate": 5.425531914893618e-07, "loss": 0.1325, "reward": 0.8125000149011612, "reward_std": 0.33713919669389725, "rewards/accuracy_reward": 0.8125000149011612, "rewards/format_reward": 0.0, "step": 17 }, { "completion_length": 739.6666870117188, "epoch": 0.0192, "grad_norm": 0.7353296279907227, "kl": 0.000469207763671875, "learning_rate": 5.74468085106383e-07, "loss": 0.077, "reward": 0.583333358168602, "reward_std": 0.3506578803062439, "rewards/accuracy_reward": 0.583333358168602, "rewards/format_reward": 0.0, "step": 18 }, { "completion_length": 765.5416870117188, "epoch": 0.020266666666666665, "grad_norm": 0.31223347783088684, "kl": 0.0003113746643066406, "learning_rate": 6.063829787234043e-07, "loss": 0.2174, "reward": 0.645833358168602, "reward_std": 0.41912318766117096, "rewards/accuracy_reward": 0.645833358168602, "rewards/format_reward": 0.0, "step": 19 }, { "completion_length": 789.3750152587891, "epoch": 0.021333333333333333, "grad_norm": 0.2303486168384552, "kl": 0.0003361701965332031, "learning_rate": 6.382978723404255e-07, "loss": 0.1042, "reward": 0.6250000111758709, "reward_std": 0.3131455257534981, "rewards/accuracy_reward": 0.6250000111758709, "rewards/format_reward": 0.0, "step": 20 }, { "completion_length": 1017.3333740234375, "epoch": 0.0224, "grad_norm": 0.2264050990343094, "kl": 0.0006613731384277344, "learning_rate": 6.702127659574468e-07, "loss": 0.1415, "reward": 0.5208333432674408, "reward_std": 0.30922994762659073, "rewards/accuracy_reward": 0.5208333432674408, "rewards/format_reward": 0.0, "step": 21 }, { "completion_length": 538.1875228881836, "epoch": 0.023466666666666667, "grad_norm": 1.0090715885162354, "kl": 0.0003657341003417969, "learning_rate": 7.021276595744681e-07, "loss": -0.0447, "reward": 0.7916667014360428, "reward_std": 0.3602609857916832, "rewards/accuracy_reward": 0.7916667014360428, "rewards/format_reward": 0.0, "step": 22 }, { "completion_length": 646.5625076293945, "epoch": 0.024533333333333334, "grad_norm": 0.25761550664901733, "kl": 0.0011539459228515625, "learning_rate": 7.340425531914893e-07, "loss": 0.1518, "reward": 0.895833358168602, "reward_std": 0.21764283627271652, "rewards/accuracy_reward": 0.895833358168602, "rewards/format_reward": 0.0, "step": 23 }, { "completion_length": 857.6250610351562, "epoch": 0.0256, "grad_norm": 0.19121389091014862, "kl": 0.0004467964172363281, "learning_rate": 7.659574468085106e-07, "loss": 0.2569, "reward": 0.6041666865348816, "reward_std": 0.3266642391681671, "rewards/accuracy_reward": 0.6041666865348816, "rewards/format_reward": 0.0, "step": 24 }, { "completion_length": 752.9583435058594, "epoch": 0.02666666666666667, "grad_norm": 0.4088599979877472, "kl": 0.0013475418090820312, "learning_rate": 7.978723404255319e-07, "loss": 0.1923, "reward": 0.604166679084301, "reward_std": 0.36417657136917114, "rewards/accuracy_reward": 0.604166679084301, "rewards/format_reward": 0.0, "step": 25 }, { "completion_length": 816.2916870117188, "epoch": 0.027733333333333332, "grad_norm": 0.36438021063804626, "kl": 0.00135040283203125, "learning_rate": 8.297872340425533e-07, "loss": 0.1505, "reward": 0.5625000149011612, "reward_std": 0.31970490142703056, "rewards/accuracy_reward": 0.5625000149011612, "rewards/format_reward": 0.0, "step": 26 }, { "completion_length": 678.8958435058594, "epoch": 0.0288, "grad_norm": 0.34434083104133606, "kl": 0.0011892318725585938, "learning_rate": 8.617021276595745e-07, "loss": 0.0864, "reward": 0.7708333432674408, "reward_std": 0.299626849591732, "rewards/accuracy_reward": 0.7708333432674408, "rewards/format_reward": 0.0, "step": 27 }, { "completion_length": 861.5417022705078, "epoch": 0.029866666666666666, "grad_norm": 0.27874091267585754, "kl": 0.0007102489471435547, "learning_rate": 8.936170212765958e-07, "loss": 0.1469, "reward": 0.6875000298023224, "reward_std": 0.42872631549835205, "rewards/accuracy_reward": 0.6875000298023224, "rewards/format_reward": 0.0, "step": 28 }, { "completion_length": 888.2291793823242, "epoch": 0.030933333333333334, "grad_norm": 0.17686326801776886, "kl": 0.000659942626953125, "learning_rate": 9.25531914893617e-07, "loss": 0.0299, "reward": 0.5000000149011612, "reward_std": 0.4152076169848442, "rewards/accuracy_reward": 0.5000000149011612, "rewards/format_reward": 0.0, "step": 29 }, { "completion_length": 901.2292098999023, "epoch": 0.032, "grad_norm": 0.13885818421840668, "kl": 0.0005984306335449219, "learning_rate": 9.574468085106384e-07, "loss": 0.1389, "reward": 0.6875000149011612, "reward_std": 0.235077116638422, "rewards/accuracy_reward": 0.6875000149011612, "rewards/format_reward": 0.0, "step": 30 }, { "completion_length": 617.3125152587891, "epoch": 0.03306666666666667, "grad_norm": 0.47630903124809265, "kl": 0.0006999969482421875, "learning_rate": 9.893617021276595e-07, "loss": 0.1104, "reward": 0.5833333432674408, "reward_std": 0.2957112528383732, "rewards/accuracy_reward": 0.5833333432674408, "rewards/format_reward": 0.0, "step": 31 }, { "completion_length": 1011.6250305175781, "epoch": 0.034133333333333335, "grad_norm": 0.22713471949100494, "kl": 0.0005393028259277344, "learning_rate": 1.0212765957446809e-06, "loss": 0.1717, "reward": 0.3958333432674408, "reward_std": 0.43832940608263016, "rewards/accuracy_reward": 0.3958333432674408, "rewards/format_reward": 0.0, "step": 32 }, { "completion_length": 744.1458435058594, "epoch": 0.0352, "grad_norm": 0.5959047079086304, "kl": 0.0012426376342773438, "learning_rate": 1.0531914893617022e-06, "loss": 0.1088, "reward": 0.6875000149011612, "reward_std": 0.43655748665332794, "rewards/accuracy_reward": 0.6875000149011612, "rewards/format_reward": 0.0, "step": 33 }, { "completion_length": 775.1666870117188, "epoch": 0.03626666666666667, "grad_norm": 0.30156779289245605, "kl": 0.0012769699096679688, "learning_rate": 1.0851063829787236e-06, "loss": 0.3362, "reward": 0.7291666865348816, "reward_std": 0.40168893337249756, "rewards/accuracy_reward": 0.7291666865348816, "rewards/format_reward": 0.0, "step": 34 }, { "completion_length": 688.3125152587891, "epoch": 0.037333333333333336, "grad_norm": 0.3756963014602661, "kl": 0.0014295578002929688, "learning_rate": 1.1170212765957447e-06, "loss": 0.146, "reward": 0.8541666865348816, "reward_std": 0.28219256550073624, "rewards/accuracy_reward": 0.8541666865348816, "rewards/format_reward": 0.0, "step": 35 }, { "completion_length": 639.8125152587891, "epoch": 0.0384, "grad_norm": 1.0038411617279053, "kl": 0.0008306503295898438, "learning_rate": 1.148936170212766e-06, "loss": 0.0944, "reward": 0.8125000298023224, "reward_std": 0.2446802221238613, "rewards/accuracy_reward": 0.8125000298023224, "rewards/format_reward": 0.0, "step": 36 }, { "completion_length": 934.0000457763672, "epoch": 0.039466666666666664, "grad_norm": 0.2201177030801773, "kl": 0.000911712646484375, "learning_rate": 1.1808510638297874e-06, "loss": 0.0708, "reward": 0.43750002048909664, "reward_std": 0.28219255432486534, "rewards/accuracy_reward": 0.43750002048909664, "rewards/format_reward": 0.0, "step": 37 }, { "completion_length": 792.3750152587891, "epoch": 0.04053333333333333, "grad_norm": 0.18807095289230347, "kl": 0.0006055831909179688, "learning_rate": 1.2127659574468085e-06, "loss": 0.2337, "reward": 0.8125000298023224, "reward_std": 0.34674230217933655, "rewards/accuracy_reward": 0.8125000298023224, "rewards/format_reward": 0.0, "step": 38 }, { "completion_length": 992.3333740234375, "epoch": 0.0416, "grad_norm": 0.18009750545024872, "kl": 0.0004706382751464844, "learning_rate": 1.2446808510638299e-06, "loss": 0.0857, "reward": 0.6666666939854622, "reward_std": 0.36809216812253, "rewards/accuracy_reward": 0.6666666939854622, "rewards/format_reward": 0.0, "step": 39 }, { "completion_length": 668.6666870117188, "epoch": 0.042666666666666665, "grad_norm": 0.539271354675293, "kl": 0.0025882720947265625, "learning_rate": 1.276595744680851e-06, "loss": 0.0287, "reward": 0.7916666865348816, "reward_std": 0.3506578914821148, "rewards/accuracy_reward": 0.7916666865348816, "rewards/format_reward": 0.0, "step": 40 }, { "completion_length": 823.8958587646484, "epoch": 0.04373333333333333, "grad_norm": 0.9937110543251038, "kl": 0.0034613609313964844, "learning_rate": 1.3085106382978724e-06, "loss": 0.0637, "reward": 0.5208333507180214, "reward_std": 0.28219257295131683, "rewards/accuracy_reward": 0.5208333507180214, "rewards/format_reward": 0.0, "step": 41 }, { "completion_length": 640.8333435058594, "epoch": 0.0448, "grad_norm": 0.22486519813537598, "kl": 0.0022001266479492188, "learning_rate": 1.3404255319148935e-06, "loss": 0.1832, "reward": 0.833333358168602, "reward_std": 0.23116152361035347, "rewards/accuracy_reward": 0.833333358168602, "rewards/format_reward": 0.0, "step": 42 }, { "completion_length": 724.2916870117188, "epoch": 0.04586666666666667, "grad_norm": 0.4574947953224182, "kl": 0.01588582992553711, "learning_rate": 1.3723404255319149e-06, "loss": -0.0154, "reward": 0.6250000149011612, "reward_std": 0.3236205130815506, "rewards/accuracy_reward": 0.6250000149011612, "rewards/format_reward": 0.0, "step": 43 }, { "completion_length": 734.1250305175781, "epoch": 0.046933333333333334, "grad_norm": 0.30649104714393616, "kl": 0.0010318756103515625, "learning_rate": 1.4042553191489362e-06, "loss": 0.0574, "reward": 0.708333358168602, "reward_std": 0.20412413775920868, "rewards/accuracy_reward": 0.708333358168602, "rewards/format_reward": 0.0, "step": 44 }, { "completion_length": 663.2292022705078, "epoch": 0.048, "grad_norm": 0.07640363276004791, "kl": 0.0008792877197265625, "learning_rate": 1.4361702127659576e-06, "loss": 0.1074, "reward": 0.7291666865348816, "reward_std": 0.1801304928958416, "rewards/accuracy_reward": 0.7291666865348816, "rewards/format_reward": 0.0, "step": 45 }, { "completion_length": 861.2916946411133, "epoch": 0.04906666666666667, "grad_norm": 0.15726080536842346, "kl": 0.0009150505065917969, "learning_rate": 1.4680851063829787e-06, "loss": 0.1497, "reward": 0.5833333395421505, "reward_std": 0.26603008806705475, "rewards/accuracy_reward": 0.5833333395421505, "rewards/format_reward": 0.0, "step": 46 }, { "completion_length": 942.6666870117188, "epoch": 0.050133333333333335, "grad_norm": 0.15566232800483704, "kl": 0.0007233619689941406, "learning_rate": 1.5e-06, "loss": 0.1334, "reward": 0.5625000298023224, "reward_std": 0.38161084800958633, "rewards/accuracy_reward": 0.5625000298023224, "rewards/format_reward": 0.0, "step": 47 }, { "completion_length": 699.3750152587891, "epoch": 0.0512, "grad_norm": 0.0709948018193245, "kl": 0.00090789794921875, "learning_rate": 1.5319148936170212e-06, "loss": 0.0385, "reward": 0.9375000149011612, "reward_std": 0.11558076739311218, "rewards/accuracy_reward": 0.9375000149011612, "rewards/format_reward": 0.0, "step": 48 }, { "completion_length": 904.3958587646484, "epoch": 0.05226666666666667, "grad_norm": 0.1681407243013382, "kl": 0.000804901123046875, "learning_rate": 1.5638297872340427e-06, "loss": 0.1381, "reward": 0.6250000149011612, "reward_std": 0.4230388104915619, "rewards/accuracy_reward": 0.6250000149011612, "rewards/format_reward": 0.0, "step": 49 }, { "completion_length": 662.5833435058594, "epoch": 0.05333333333333334, "grad_norm": 0.1360897421836853, "kl": 0.0031037330627441406, "learning_rate": 1.5957446808510639e-06, "loss": 0.0153, "reward": 0.8541666865348816, "reward_std": 0.21764283254742622, "rewards/accuracy_reward": 0.8541666865348816, "rewards/format_reward": 0.0, "step": 50 }, { "completion_length": 1014.4791870117188, "epoch": 0.0544, "grad_norm": 0.13733625411987305, "kl": 0.0012197494506835938, "learning_rate": 1.627659574468085e-06, "loss": 0.2745, "reward": 0.645833358168602, "reward_std": 0.3842546306550503, "rewards/accuracy_reward": 0.645833358168602, "rewards/format_reward": 0.0, "step": 51 }, { "completion_length": 1132.0625305175781, "epoch": 0.055466666666666664, "grad_norm": 0.17733652889728546, "kl": 0.0011796951293945312, "learning_rate": 1.6595744680851066e-06, "loss": 0.1677, "reward": 0.520833358168602, "reward_std": 0.33713920414447784, "rewards/accuracy_reward": 0.520833358168602, "rewards/format_reward": 0.0, "step": 52 }, { "completion_length": 504.7916717529297, "epoch": 0.05653333333333333, "grad_norm": 0.11918134987354279, "kl": 0.0012044906616210938, "learning_rate": 1.6914893617021277e-06, "loss": 0.0709, "reward": 0.9166666716337204, "reward_std": 0.16661179438233376, "rewards/accuracy_reward": 0.9166666716337204, "rewards/format_reward": 0.0, "step": 53 }, { "completion_length": 588.9375228881836, "epoch": 0.0576, "grad_norm": 0.17483025789260864, "kl": 0.0025815963745117188, "learning_rate": 1.723404255319149e-06, "loss": 0.2063, "reward": 0.8125000149011612, "reward_std": 0.2996268570423126, "rewards/accuracy_reward": 0.8125000149011612, "rewards/format_reward": 0.0, "step": 54 }, { "completion_length": 759.8542175292969, "epoch": 0.058666666666666666, "grad_norm": 0.15940438210964203, "kl": 0.0022678375244140625, "learning_rate": 1.7553191489361702e-06, "loss": 0.043, "reward": 0.7291666716337204, "reward_std": 0.41129201278090477, "rewards/accuracy_reward": 0.7291666716337204, "rewards/format_reward": 0.0, "step": 55 }, { "completion_length": 620.4375152587891, "epoch": 0.05973333333333333, "grad_norm": 0.07168873399496078, "kl": 0.0022439956665039062, "learning_rate": 1.7872340425531915e-06, "loss": 0.0365, "reward": 0.8541666716337204, "reward_std": 0.13301505148410797, "rewards/accuracy_reward": 0.8541666716337204, "rewards/format_reward": 0.0, "step": 56 }, { "completion_length": 846.7917022705078, "epoch": 0.0608, "grad_norm": 0.0883839800953865, "kl": 0.000598907470703125, "learning_rate": 1.819148936170213e-06, "loss": 0.1734, "reward": 0.6666666865348816, "reward_std": 0.3584890849888325, "rewards/accuracy_reward": 0.6666666865348816, "rewards/format_reward": 0.0, "step": 57 }, { "completion_length": 841.9583435058594, "epoch": 0.06186666666666667, "grad_norm": 0.10056422650814056, "kl": 0.0028505325317382812, "learning_rate": 1.851063829787234e-06, "loss": 0.0888, "reward": 0.6666666716337204, "reward_std": 0.11949635669589043, "rewards/accuracy_reward": 0.6666666716337204, "rewards/format_reward": 0.0, "step": 58 }, { "completion_length": 722.6458587646484, "epoch": 0.06293333333333333, "grad_norm": 0.08924444764852524, "kl": 0.0010976791381835938, "learning_rate": 1.8829787234042552e-06, "loss": -0.0281, "reward": 0.6250000111758709, "reward_std": 0.22155842557549477, "rewards/accuracy_reward": 0.6250000111758709, "rewards/format_reward": 0.0, "step": 59 }, { "completion_length": 670.5000152587891, "epoch": 0.064, "grad_norm": 0.12996236979961395, "kl": 0.0016269683837890625, "learning_rate": 1.9148936170212767e-06, "loss": 0.3339, "reward": 0.8750000149011612, "reward_std": 0.2686738669872284, "rewards/accuracy_reward": 0.8750000149011612, "rewards/format_reward": 0.0, "step": 60 }, { "completion_length": 622.1875228881836, "epoch": 0.06506666666666666, "grad_norm": 0.0977378636598587, "kl": 0.004355430603027344, "learning_rate": 1.946808510638298e-06, "loss": 0.194, "reward": 0.7708333507180214, "reward_std": 0.21764284372329712, "rewards/accuracy_reward": 0.7708333507180214, "rewards/format_reward": 0.0, "step": 61 }, { "completion_length": 649.0416870117188, "epoch": 0.06613333333333334, "grad_norm": 0.09003892540931702, "kl": 0.0029087066650390625, "learning_rate": 1.978723404255319e-06, "loss": 0.0244, "reward": 0.3750000149011612, "reward_std": 0.18404607102274895, "rewards/accuracy_reward": 0.3750000149011612, "rewards/format_reward": 0.0, "step": 62 }, { "completion_length": 746.7083587646484, "epoch": 0.0672, "grad_norm": 0.12206298857927322, "kl": 0.004050254821777344, "learning_rate": 2.0106382978723404e-06, "loss": 0.1148, "reward": 0.645833358168602, "reward_std": 0.2996268570423126, "rewards/accuracy_reward": 0.645833358168602, "rewards/format_reward": 0.0, "step": 63 }, { "completion_length": 722.3125228881836, "epoch": 0.06826666666666667, "grad_norm": 0.13454227149486542, "kl": 0.002227783203125, "learning_rate": 2.0425531914893617e-06, "loss": 0.1884, "reward": 0.708333358168602, "reward_std": 0.3332235887646675, "rewards/accuracy_reward": 0.708333358168602, "rewards/format_reward": 0.0, "step": 64 }, { "completion_length": 729.4375152587891, "epoch": 0.06933333333333333, "grad_norm": 0.0882275253534317, "kl": 0.0024290084838867188, "learning_rate": 2.074468085106383e-06, "loss": 0.0764, "reward": 0.6041666716337204, "reward_std": 0.18796167895197868, "rewards/accuracy_reward": 0.6041666716337204, "rewards/format_reward": 0.0, "step": 65 }, { "completion_length": 590.0625305175781, "epoch": 0.0704, "grad_norm": 0.1144060492515564, "kl": 0.0020112991333007812, "learning_rate": 2.1063829787234044e-06, "loss": 0.0087, "reward": 0.6666666716337204, "reward_std": 0.11949635669589043, "rewards/accuracy_reward": 0.6666666716337204, "rewards/format_reward": 0.0, "step": 66 }, { "completion_length": 653.5833740234375, "epoch": 0.07146666666666666, "grad_norm": 0.13921166956424713, "kl": 0.0026102066040039062, "learning_rate": 2.1382978723404253e-06, "loss": 0.0934, "reward": 0.6666666828095913, "reward_std": 0.23116152733564377, "rewards/accuracy_reward": 0.6666666828095913, "rewards/format_reward": 0.0, "step": 67 }, { "completion_length": 637.5208435058594, "epoch": 0.07253333333333334, "grad_norm": 0.07036515325307846, "kl": 0.002750396728515625, "learning_rate": 2.170212765957447e-06, "loss": 0.085, "reward": 0.833333358168602, "reward_std": 0.24859581887722015, "rewards/accuracy_reward": 0.833333358168602, "rewards/format_reward": 0.0, "step": 68 }, { "completion_length": 792.1042098999023, "epoch": 0.0736, "grad_norm": 0.09435410797595978, "kl": 0.0015087127685546875, "learning_rate": 2.202127659574468e-06, "loss": 0.0771, "reward": 0.6458333507180214, "reward_std": 0.2350771203637123, "rewards/accuracy_reward": 0.6458333507180214, "rewards/format_reward": 0.0, "step": 69 }, { "completion_length": 597.5625305175781, "epoch": 0.07466666666666667, "grad_norm": 0.10136708617210388, "kl": 0.0028162002563476562, "learning_rate": 2.2340425531914894e-06, "loss": 0.0167, "reward": 0.7916666716337204, "reward_std": 0.24859581142663956, "rewards/accuracy_reward": 0.7916666716337204, "rewards/format_reward": 0.0, "step": 70 }, { "completion_length": 651.020866394043, "epoch": 0.07573333333333333, "grad_norm": 0.17569655179977417, "kl": 0.009462356567382812, "learning_rate": 2.2659574468085107e-06, "loss": 0.1583, "reward": 0.7916666865348816, "reward_std": 0.3332235924899578, "rewards/accuracy_reward": 0.7916666865348816, "rewards/format_reward": 0.0, "step": 71 }, { "completion_length": 558.4375305175781, "epoch": 0.0768, "grad_norm": 0.1982911080121994, "kl": 0.0030059814453125, "learning_rate": 2.297872340425532e-06, "loss": 0.1395, "reward": 0.7708333432674408, "reward_std": 0.27258946374058723, "rewards/accuracy_reward": 0.7708333432674408, "rewards/format_reward": 0.0, "step": 72 }, { "completion_length": 529.8125076293945, "epoch": 0.07786666666666667, "grad_norm": 0.21401475369930267, "kl": 0.0028095245361328125, "learning_rate": 2.329787234042553e-06, "loss": 0.0041, "reward": 0.7083333432674408, "reward_std": 0.24859581515192986, "rewards/accuracy_reward": 0.7083333432674408, "rewards/format_reward": 0.0, "step": 73 }, { "completion_length": 726.0625228881836, "epoch": 0.07893333333333333, "grad_norm": 12476.7119140625, "kl": 6.251659393310547, "learning_rate": 2.3617021276595748e-06, "loss": 0.4681, "reward": 0.708333358168602, "reward_std": 0.3332236036658287, "rewards/accuracy_reward": 0.708333358168602, "rewards/format_reward": 0.0, "step": 74 }, { "completion_length": 686.0000152587891, "epoch": 0.08, "grad_norm": 1.635213851928711, "kl": 0.018024444580078125, "learning_rate": 2.3936170212765957e-06, "loss": 0.0249, "reward": 0.9375000149011612, "reward_std": 0.11558076366782188, "rewards/accuracy_reward": 0.9375000149011612, "rewards/format_reward": 0.0, "step": 75 }, { "completion_length": 689.2291717529297, "epoch": 0.08106666666666666, "grad_norm": 0.11620714515447617, "kl": 0.0021572113037109375, "learning_rate": 2.425531914893617e-06, "loss": 0.0219, "reward": 0.7916666865348816, "reward_std": 0.16661180183291435, "rewards/accuracy_reward": 0.7916666865348816, "rewards/format_reward": 0.0, "step": 76 }, { "completion_length": 612.9166717529297, "epoch": 0.08213333333333334, "grad_norm": 0.16293609142303467, "kl": 0.002117156982421875, "learning_rate": 2.4574468085106384e-06, "loss": 0.0731, "reward": 0.7291666865348816, "reward_std": 0.2350771203637123, "rewards/accuracy_reward": 0.7291666865348816, "rewards/format_reward": 0.0, "step": 77 }, { "completion_length": 585.3541793823242, "epoch": 0.0832, "grad_norm": 0.07114370167255402, "kl": 0.003589630126953125, "learning_rate": 2.4893617021276598e-06, "loss": 0.0198, "reward": 0.9583333432674408, "reward_std": 0.10206206887960434, "rewards/accuracy_reward": 0.9583333432674408, "rewards/format_reward": 0.0, "step": 78 }, { "completion_length": 518.8125305175781, "epoch": 0.08426666666666667, "grad_norm": 0.24749654531478882, "kl": 0.0082855224609375, "learning_rate": 2.521276595744681e-06, "loss": 0.0594, "reward": 0.8541666865348816, "reward_std": 0.1530931070446968, "rewards/accuracy_reward": 0.8541666865348816, "rewards/format_reward": 0.0, "step": 79 }, { "completion_length": 789.5625152587891, "epoch": 0.08533333333333333, "grad_norm": 0.24383904039859772, "kl": 0.014476776123046875, "learning_rate": 2.553191489361702e-06, "loss": 0.0755, "reward": 0.8541666865348816, "reward_std": 0.27258947491645813, "rewards/accuracy_reward": 0.8541666865348816, "rewards/format_reward": 0.0, "step": 80 }, { "completion_length": 821.1250305175781, "epoch": 0.0864, "grad_norm": 0.15687987208366394, "kl": 0.0060272216796875, "learning_rate": 2.5851063829787234e-06, "loss": 0.0831, "reward": 0.5833333432674408, "reward_std": 0.2686738707125187, "rewards/accuracy_reward": 0.5833333432674408, "rewards/format_reward": 0.0, "step": 81 }, { "completion_length": 620.4166793823242, "epoch": 0.08746666666666666, "grad_norm": 0.10259342193603516, "kl": 0.004123687744140625, "learning_rate": 2.6170212765957447e-06, "loss": 0.0853, "reward": 0.7291666865348816, "reward_std": 0.30922994762659073, "rewards/accuracy_reward": 0.7291666865348816, "rewards/format_reward": 0.0, "step": 82 }, { "completion_length": 633.4166870117188, "epoch": 0.08853333333333334, "grad_norm": 0.10479988157749176, "kl": 0.003204345703125, "learning_rate": 2.648936170212766e-06, "loss": 0.0737, "reward": 0.7708333432674408, "reward_std": 0.1705273911356926, "rewards/accuracy_reward": 0.7708333432674408, "rewards/format_reward": 0.0, "step": 83 }, { "completion_length": 413.3333435058594, "epoch": 0.0896, "grad_norm": 0.09932970255613327, "kl": 0.00691986083984375, "learning_rate": 2.680851063829787e-06, "loss": 0.0675, "reward": 0.9791666716337204, "reward_std": 0.05103103816509247, "rewards/accuracy_reward": 0.9791666716337204, "rewards/format_reward": 0.0, "step": 84 }, { "completion_length": 717.9375152587891, "epoch": 0.09066666666666667, "grad_norm": 0.10688722133636475, "kl": 0.004673004150390625, "learning_rate": 2.7127659574468088e-06, "loss": 0.1328, "reward": 0.7291666865348816, "reward_std": 0.2350771203637123, "rewards/accuracy_reward": 0.7291666865348816, "rewards/format_reward": 0.0, "step": 85 }, { "completion_length": 660.7291793823242, "epoch": 0.09173333333333333, "grad_norm": 0.05062931403517723, "kl": 0.005130767822265625, "learning_rate": 2.7446808510638297e-06, "loss": 0.0075, "reward": 0.5833333432674408, "reward_std": 0.06454972922801971, "rewards/accuracy_reward": 0.5833333432674408, "rewards/format_reward": 0.0, "step": 86 }, { "completion_length": 483.7916793823242, "epoch": 0.0928, "grad_norm": 0.19415028393268585, "kl": 0.0061492919921875, "learning_rate": 2.776595744680851e-06, "loss": 0.1008, "reward": 0.9375000149011612, "reward_std": 0.1530931033194065, "rewards/accuracy_reward": 0.9375000149011612, "rewards/format_reward": 0.0, "step": 87 }, { "completion_length": 535.9583435058594, "epoch": 0.09386666666666667, "grad_norm": 0.1171206533908844, "kl": 0.003173828125, "learning_rate": 2.8085106382978724e-06, "loss": 0.0126, "reward": 0.8750000149011612, "reward_std": 0.18404608964920044, "rewards/accuracy_reward": 0.8750000149011612, "rewards/format_reward": 0.0, "step": 88 }, { "completion_length": 713.5208587646484, "epoch": 0.09493333333333333, "grad_norm": 0.22358191013336182, "kl": 0.0032672882080078125, "learning_rate": 2.8404255319148938e-06, "loss": 0.0927, "reward": 0.5625000074505806, "reward_std": 0.1801304891705513, "rewards/accuracy_reward": 0.5625000074505806, "rewards/format_reward": 0.0, "step": 89 }, { "completion_length": 455.50001525878906, "epoch": 0.096, "grad_norm": 0.048782192170619965, "kl": 0.0037994384765625, "learning_rate": 2.872340425531915e-06, "loss": -0.005, "reward": 0.9583333432674408, "reward_std": 0.06454972922801971, "rewards/accuracy_reward": 0.9583333432674408, "rewards/format_reward": 0.0, "step": 90 }, { "completion_length": 693.1666793823242, "epoch": 0.09706666666666666, "grad_norm": 0.1026625707745552, "kl": 0.00514984130859375, "learning_rate": 2.9042553191489365e-06, "loss": 0.0521, "reward": 0.7500000149011612, "reward_std": 0.31314554065465927, "rewards/accuracy_reward": 0.7500000149011612, "rewards/format_reward": 0.0, "step": 91 }, { "completion_length": 672.6875152587891, "epoch": 0.09813333333333334, "grad_norm": 0.10129702091217041, "kl": 0.0051727294921875, "learning_rate": 2.9361702127659574e-06, "loss": 0.0469, "reward": 0.7500000298023224, "reward_std": 0.24859581887722015, "rewards/accuracy_reward": 0.7500000298023224, "rewards/format_reward": 0.0, "step": 92 }, { "completion_length": 680.7500152587891, "epoch": 0.0992, "grad_norm": 0.05066928640007973, "kl": 0.004344940185546875, "learning_rate": 2.9680851063829787e-06, "loss": 0.0532, "reward": 0.8541666716337204, "reward_std": 0.05103103816509247, "rewards/accuracy_reward": 0.8541666716337204, "rewards/format_reward": 0.0, "step": 93 }, { "completion_length": 706.5000228881836, "epoch": 0.10026666666666667, "grad_norm": 0.5558317303657532, "kl": 0.01628875732421875, "learning_rate": 3e-06, "loss": 0.0262, "reward": 0.833333358168602, "reward_std": 0.16661180555820465, "rewards/accuracy_reward": 0.833333358168602, "rewards/format_reward": 0.0, "step": 94 }, { "completion_length": 567.2500228881836, "epoch": 0.10133333333333333, "grad_norm": 0.1021793931722641, "kl": 0.0043487548828125, "learning_rate": 2.9999895838948146e-06, "loss": -0.006, "reward": 0.9166666716337204, "reward_std": 0.12909945845603943, "rewards/accuracy_reward": 0.9166666716337204, "rewards/format_reward": 0.0, "step": 95 }, { "completion_length": 779.8958435058594, "epoch": 0.1024, "grad_norm": 0.06159654259681702, "kl": 0.003650665283203125, "learning_rate": 2.999958335723919e-06, "loss": 0.0339, "reward": 0.6458333432674408, "reward_std": 0.13301505148410797, "rewards/accuracy_reward": 0.6458333432674408, "rewards/format_reward": 0.0, "step": 96 }, { "completion_length": 661.7500152587891, "epoch": 0.10346666666666667, "grad_norm": 0.08456037938594818, "kl": 0.00357818603515625, "learning_rate": 2.9999062559212913e-06, "loss": 0.0509, "reward": 0.7916666716337204, "reward_std": 0.24859581887722015, "rewards/accuracy_reward": 0.7916666716337204, "rewards/format_reward": 0.0, "step": 97 }, { "completion_length": 661.7083435058594, "epoch": 0.10453333333333334, "grad_norm": 0.09946412593126297, "kl": 0.004486083984375, "learning_rate": 2.9998333452102236e-06, "loss": 0.0121, "reward": 0.7500000149011612, "reward_std": 0.23116152733564377, "rewards/accuracy_reward": 0.7500000149011612, "rewards/format_reward": 0.0, "step": 98 }, { "completion_length": 449.0208435058594, "epoch": 0.1056, "grad_norm": 0.055227383971214294, "kl": 0.004207611083984375, "learning_rate": 2.999739604603311e-06, "loss": 0.0191, "reward": 0.9583333432674408, "reward_std": 0.06454972922801971, "rewards/accuracy_reward": 0.9583333432674408, "rewards/format_reward": 0.0, "step": 99 }, { "completion_length": 507.81251525878906, "epoch": 0.10666666666666667, "grad_norm": 0.13886040449142456, "kl": 0.007358551025390625, "learning_rate": 2.9996250354024346e-06, "loss": 0.0459, "reward": 0.7708333432674408, "reward_std": 0.1801304891705513, "rewards/accuracy_reward": 0.7708333432674408, "rewards/format_reward": 0.0, "step": 100 }, { "completion_length": 712.6250152587891, "epoch": 0.10773333333333333, "grad_norm": 0.08429905027151108, "kl": 0.004486083984375, "learning_rate": 2.9994896391987487e-06, "loss": -0.007, "reward": 0.8125000149011612, "reward_std": 0.1975647658109665, "rewards/accuracy_reward": 0.8125000149011612, "rewards/format_reward": 0.0, "step": 101 }, { "completion_length": 616.4166793823242, "epoch": 0.1088, "grad_norm": 0.055599987506866455, "kl": 0.0048828125, "learning_rate": 2.9993334178726546e-06, "loss": -0.0006, "reward": 0.8750000149011612, "reward_std": 0.10206207260489464, "rewards/accuracy_reward": 0.8750000149011612, "rewards/format_reward": 0.0, "step": 102 }, { "completion_length": 742.0208511352539, "epoch": 0.10986666666666667, "grad_norm": 0.0759417712688446, "kl": 0.00399017333984375, "learning_rate": 2.9991563735937752e-06, "loss": -0.0498, "reward": 0.7916666865348816, "reward_std": 0.16661179810762405, "rewards/accuracy_reward": 0.7916666865348816, "rewards/format_reward": 0.0, "step": 103 }, { "completion_length": 546.3750152587891, "epoch": 0.11093333333333333, "grad_norm": 0.06980567425489426, "kl": 0.0062255859375, "learning_rate": 2.9989585088209272e-06, "loss": -0.0213, "reward": 0.6666666679084301, "reward_std": 0.06454972922801971, "rewards/accuracy_reward": 0.6666666679084301, "rewards/format_reward": 0.0, "step": 104 }, { "completion_length": 457.6041793823242, "epoch": 0.112, "grad_norm": 0.03110707364976406, "kl": 0.003936767578125, "learning_rate": 2.9987398263020837e-06, "loss": 0.0115, "reward": 0.9791666716337204, "reward_std": 0.05103103443980217, "rewards/accuracy_reward": 0.9791666716337204, "rewards/format_reward": 0.0, "step": 105 }, { "completion_length": 781.7083587646484, "epoch": 0.11306666666666666, "grad_norm": 0.13586241006851196, "kl": 0.003814697265625, "learning_rate": 2.998500329074338e-06, "loss": 0.0591, "reward": 0.7916667014360428, "reward_std": 0.23116153106093407, "rewards/accuracy_reward": 0.7916667014360428, "rewards/format_reward": 0.0, "step": 106 }, { "completion_length": 713.2291870117188, "epoch": 0.11413333333333334, "grad_norm": 0.06634881347417831, "kl": 0.004085540771484375, "learning_rate": 2.9982400204638626e-06, "loss": 0.0159, "reward": 0.625, "reward_std": 0.18404608219861984, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 107 }, { "completion_length": 551.7500152587891, "epoch": 0.1152, "grad_norm": 0.0972931832075119, "kl": 0.004058837890625, "learning_rate": 2.9979589040858586e-06, "loss": 0.0379, "reward": 0.8958333432674408, "reward_std": 0.1801304891705513, "rewards/accuracy_reward": 0.8958333432674408, "rewards/format_reward": 0.0, "step": 108 }, { "completion_length": 864.5625305175781, "epoch": 0.11626666666666667, "grad_norm": 0.16979259252548218, "kl": 0.00384521484375, "learning_rate": 2.9976569838445097e-06, "loss": 0.0628, "reward": 0.7916666865348816, "reward_std": 0.23116152733564377, "rewards/accuracy_reward": 0.7916666865348816, "rewards/format_reward": 0.0, "step": 109 }, { "completion_length": 506.3541793823242, "epoch": 0.11733333333333333, "grad_norm": 0.1602563112974167, "kl": 0.005359649658203125, "learning_rate": 2.997334263932927e-06, "loss": 0.0227, "reward": 0.9375000149011612, "reward_std": 0.1530931107699871, "rewards/accuracy_reward": 0.9375000149011612, "rewards/format_reward": 0.0, "step": 110 }, { "completion_length": 725.9375152587891, "epoch": 0.1184, "grad_norm": 0.27847498655319214, "kl": 0.005428314208984375, "learning_rate": 2.9969907488330905e-06, "loss": 0.0661, "reward": 0.7291666716337204, "reward_std": 0.1530931070446968, "rewards/accuracy_reward": 0.7291666716337204, "rewards/format_reward": 0.0, "step": 111 }, { "completion_length": 676.9166870117188, "epoch": 0.11946666666666667, "grad_norm": 0.1168823167681694, "kl": 0.004245758056640625, "learning_rate": 2.996626443315785e-06, "loss": 0.0607, "reward": 0.833333358168602, "reward_std": 0.23116153106093407, "rewards/accuracy_reward": 0.833333358168602, "rewards/format_reward": 0.0, "step": 112 }, { "completion_length": 629.8958511352539, "epoch": 0.12053333333333334, "grad_norm": 0.09983277320861816, "kl": 0.0041351318359375, "learning_rate": 2.996241352440537e-06, "loss": 0.0755, "reward": 0.8125000149011612, "reward_std": 0.1705273911356926, "rewards/accuracy_reward": 0.8125000149011612, "rewards/format_reward": 0.0, "step": 113 }, { "completion_length": 538.7291717529297, "epoch": 0.1216, "grad_norm": 0.10589924454689026, "kl": 0.0055084228515625, "learning_rate": 2.9958354815555427e-06, "loss": 0.0238, "reward": 0.8958333432674408, "reward_std": 0.1705274023115635, "rewards/accuracy_reward": 0.8958333432674408, "rewards/format_reward": 0.0, "step": 114 }, { "completion_length": 740.8750152587891, "epoch": 0.12266666666666666, "grad_norm": 1.6198092699050903, "kl": 0.01663970947265625, "learning_rate": 2.9954088362975936e-06, "loss": 0.031, "reward": 0.6041666865348816, "reward_std": 0.3170611336827278, "rewards/accuracy_reward": 0.6041666865348816, "rewards/format_reward": 0.0, "step": 115 }, { "completion_length": 527.0833435058594, "epoch": 0.12373333333333333, "grad_norm": 0.09877801686525345, "kl": 0.0058746337890625, "learning_rate": 2.994961422591999e-06, "loss": 0.0054, "reward": 0.8750000149011612, "reward_std": 0.10206207260489464, "rewards/accuracy_reward": 0.8750000149011612, "rewards/format_reward": 0.0, "step": 116 }, { "completion_length": 486.3333435058594, "epoch": 0.1248, "grad_norm": 0.30120348930358887, "kl": 0.01122283935546875, "learning_rate": 2.994493246652504e-06, "loss": 0.074, "reward": 0.7291666865348816, "reward_std": 0.1801304966211319, "rewards/accuracy_reward": 0.7291666865348816, "rewards/format_reward": 0.0, "step": 117 }, { "completion_length": 727.2291870117188, "epoch": 0.12586666666666665, "grad_norm": 0.08286140114068985, "kl": 0.0025634765625, "learning_rate": 2.9940043149812002e-06, "loss": 0.0506, "reward": 0.8125000149011612, "reward_std": 0.28219256550073624, "rewards/accuracy_reward": 0.8125000149011612, "rewards/format_reward": 0.0, "step": 118 }, { "completion_length": 673.8125152587891, "epoch": 0.12693333333333334, "grad_norm": 0.26215964555740356, "kl": 0.005458831787109375, "learning_rate": 2.9934946343684403e-06, "loss": 0.1181, "reward": 0.7500000149011612, "reward_std": 0.18404608592391014, "rewards/accuracy_reward": 0.7500000149011612, "rewards/format_reward": 0.0, "step": 119 }, { "completion_length": 466.0208435058594, "epoch": 0.128, "grad_norm": 0.032901108264923096, "kl": 0.004306793212890625, "learning_rate": 2.99296421189274e-06, "loss": 0.0034, "reward": 0.9166666716337204, "reward_std": 0.06454972922801971, "rewards/accuracy_reward": 0.9166666716337204, "rewards/format_reward": 0.0, "step": 120 }, { "completion_length": 817.3750076293945, "epoch": 0.12906666666666666, "grad_norm": 0.08746203780174255, "kl": 0.004161834716796875, "learning_rate": 2.9924130549206804e-06, "loss": 0.0042, "reward": 0.354166679084301, "reward_std": 0.2446802258491516, "rewards/accuracy_reward": 0.354166679084301, "rewards/format_reward": 0.0, "step": 121 }, { "completion_length": 614.0000228881836, "epoch": 0.13013333333333332, "grad_norm": 0.05599169805645943, "kl": 0.0033721923828125, "learning_rate": 2.9918411711068073e-06, "loss": -0.0109, "reward": 0.9166666716337204, "reward_std": 0.11949635669589043, "rewards/accuracy_reward": 0.9166666716337204, "rewards/format_reward": 0.0, "step": 122 }, { "completion_length": 534.7916717529297, "epoch": 0.1312, "grad_norm": 0.1435793787240982, "kl": 0.00457763671875, "learning_rate": 2.991248568393524e-06, "loss": 0.0434, "reward": 0.7708333432674408, "reward_std": 0.13301503658294678, "rewards/accuracy_reward": 0.7708333432674408, "rewards/format_reward": 0.0, "step": 123 }, { "completion_length": 684.2500305175781, "epoch": 0.13226666666666667, "grad_norm": 0.06084440276026726, "kl": 0.0050048828125, "learning_rate": 2.9906352550109787e-06, "loss": 0.018, "reward": 0.8958333432674408, "reward_std": 0.1705274023115635, "rewards/accuracy_reward": 0.8958333432674408, "rewards/format_reward": 0.0, "step": 124 }, { "completion_length": 665.8541717529297, "epoch": 0.13333333333333333, "grad_norm": 0.1194707602262497, "kl": 0.00388336181640625, "learning_rate": 2.9900012394769546e-06, "loss": 0.0599, "reward": 0.8958333432674408, "reward_std": 0.21764283254742622, "rewards/accuracy_reward": 0.8958333432674408, "rewards/format_reward": 0.0, "step": 125 }, { "completion_length": 645.4375152587891, "epoch": 0.1344, "grad_norm": 0.10876529663801193, "kl": 0.0048828125, "learning_rate": 2.989346530596748e-06, "loss": 0.0557, "reward": 0.7708333432674408, "reward_std": 0.11558076366782188, "rewards/accuracy_reward": 0.7708333432674408, "rewards/format_reward": 0.0, "step": 126 }, { "completion_length": 798.3333587646484, "epoch": 0.13546666666666668, "grad_norm": 0.08001799881458282, "kl": 0.003505706787109375, "learning_rate": 2.988671137463048e-06, "loss": 0.0374, "reward": 0.729166679084301, "reward_std": 0.2350771315395832, "rewards/accuracy_reward": 0.729166679084301, "rewards/format_reward": 0.0, "step": 127 }, { "completion_length": 745.1250152587891, "epoch": 0.13653333333333334, "grad_norm": 0.1032245010137558, "kl": 0.0030364990234375, "learning_rate": 2.987975069455809e-06, "loss": 0.052, "reward": 0.6041666865348816, "reward_std": 0.3990451246500015, "rewards/accuracy_reward": 0.6041666865348816, "rewards/format_reward": 0.0, "step": 128 }, { "completion_length": 534.3333435058594, "epoch": 0.1376, "grad_norm": 0.03779646381735802, "kl": 0.004276275634765625, "learning_rate": 2.9872583362421204e-06, "loss": 0.0041, "reward": 0.9791666716337204, "reward_std": 0.05103103816509247, "rewards/accuracy_reward": 0.9791666716337204, "rewards/format_reward": 0.0, "step": 129 }, { "completion_length": 504.9166793823242, "epoch": 0.13866666666666666, "grad_norm": 0.08333314955234528, "kl": 0.00482940673828125, "learning_rate": 2.986520947776075e-06, "loss": -0.0387, "reward": 0.8750000149011612, "reward_std": 0.12909945845603943, "rewards/accuracy_reward": 0.8750000149011612, "rewards/format_reward": 0.0, "step": 130 }, { "completion_length": 543.3125228881836, "epoch": 0.13973333333333332, "grad_norm": 0.05304025486111641, "kl": 0.003147125244140625, "learning_rate": 2.985762914298626e-06, "loss": 0.0163, "reward": 0.9583333432674408, "reward_std": 0.10206207633018494, "rewards/accuracy_reward": 0.9583333432674408, "rewards/format_reward": 0.0, "step": 131 }, { "completion_length": 696.9166870117188, "epoch": 0.1408, "grad_norm": 0.1652069389820099, "kl": 0.0041961669921875, "learning_rate": 2.984984246337449e-06, "loss": 0.0327, "reward": 0.8125000149011612, "reward_std": 0.1530931033194065, "rewards/accuracy_reward": 0.8125000149011612, "rewards/format_reward": 0.0, "step": 132 }, { "completion_length": 394.3958549499512, "epoch": 0.14186666666666667, "grad_norm": 0.2892446517944336, "kl": 0.010234832763671875, "learning_rate": 2.9841849547067944e-06, "loss": 0.0548, "reward": 0.9375000149011612, "reward_std": 0.11558076366782188, "rewards/accuracy_reward": 0.9375000149011612, "rewards/format_reward": 0.0, "step": 133 }, { "completion_length": 520.1666793823242, "epoch": 0.14293333333333333, "grad_norm": 0.10460761189460754, "kl": 0.00528717041015625, "learning_rate": 2.983365050507336e-06, "loss": -0.0131, "reward": 0.9166666865348816, "reward_std": 0.12909945845603943, "rewards/accuracy_reward": 0.9166666865348816, "rewards/format_reward": 0.0, "step": 134 }, { "completion_length": 688.1458435058594, "epoch": 0.144, "grad_norm": 0.13016089797019958, "kl": 0.003635406494140625, "learning_rate": 2.982524545126018e-06, "loss": -0.0054, "reward": 0.6666666865348816, "reward_std": 0.2686738632619381, "rewards/accuracy_reward": 0.6666666865348816, "rewards/format_reward": 0.0, "step": 135 }, { "completion_length": 510.18751525878906, "epoch": 0.14506666666666668, "grad_norm": 0.12998753786087036, "kl": 0.005603790283203125, "learning_rate": 2.9816634502358974e-06, "loss": 0.0502, "reward": 0.9583333432674408, "reward_std": 0.10206206887960434, "rewards/accuracy_reward": 0.9583333432674408, "rewards/format_reward": 0.0, "step": 136 }, { "completion_length": 660.0833587646484, "epoch": 0.14613333333333334, "grad_norm": 0.2562927007675171, "kl": 0.0079803466796875, "learning_rate": 2.980781777795981e-06, "loss": 0.0813, "reward": 0.8125000149011612, "reward_std": 0.11558076366782188, "rewards/accuracy_reward": 0.8125000149011612, "rewards/format_reward": 0.0, "step": 137 }, { "completion_length": 519.1875152587891, "epoch": 0.1472, "grad_norm": 0.08546563237905502, "kl": 0.00726318359375, "learning_rate": 2.979879540051059e-06, "loss": 0.0501, "reward": 0.9791666716337204, "reward_std": 0.05103103816509247, "rewards/accuracy_reward": 0.9791666716337204, "rewards/format_reward": 0.0, "step": 138 }, { "completion_length": 681.125, "epoch": 0.14826666666666666, "grad_norm": 0.045832425355911255, "kl": 0.004791259765625, "learning_rate": 2.978956749531536e-06, "loss": 0.0379, "reward": 0.8958333432674408, "reward_std": 0.11558076366782188, "rewards/accuracy_reward": 0.8958333432674408, "rewards/format_reward": 0.0, "step": 139 }, { "completion_length": 445.1458511352539, "epoch": 0.14933333333333335, "grad_norm": 0.05344467982649803, "kl": 0.008579254150390625, "learning_rate": 2.9780134190532553e-06, "loss": -0.0363, "reward": 0.9583333432674408, "reward_std": 0.06454972922801971, "rewards/accuracy_reward": 0.9583333432674408, "rewards/format_reward": 0.0, "step": 140 }, { "completion_length": 604.2916870117188, "epoch": 0.1504, "grad_norm": 0.14787505567073822, "kl": 0.00617218017578125, "learning_rate": 2.977049561717324e-06, "loss": -0.0179, "reward": 0.9166666716337204, "reward_std": 0.16661179438233376, "rewards/accuracy_reward": 0.9166666716337204, "rewards/format_reward": 0.0, "step": 141 }, { "completion_length": 615.0833435058594, "epoch": 0.15146666666666667, "grad_norm": 0.12652121484279633, "kl": 0.008087158203125, "learning_rate": 2.976065190909927e-06, "loss": 0.0138, "reward": 0.7291667014360428, "reward_std": 0.3266642242670059, "rewards/accuracy_reward": 0.7291667014360428, "rewards/format_reward": 0.0, "step": 142 }, { "completion_length": 1025.6458587646484, "epoch": 0.15253333333333333, "grad_norm": 0.0651557669043541, "kl": 0.00666046142578125, "learning_rate": 2.975060320302145e-06, "loss": -0.0083, "reward": 0.5416666772216558, "reward_std": 0.10206207260489464, "rewards/accuracy_reward": 0.5416666772216558, "rewards/format_reward": 0.0, "step": 143 }, { "completion_length": 548.9166793823242, "epoch": 0.1536, "grad_norm": 0.18320025503635406, "kl": 0.006748199462890625, "learning_rate": 2.9740349638497614e-06, "loss": 0.0382, "reward": 0.8125000149011612, "reward_std": 0.2996268607676029, "rewards/accuracy_reward": 0.8125000149011612, "rewards/format_reward": 0.0, "step": 144 }, { "completion_length": 553.6041717529297, "epoch": 0.15466666666666667, "grad_norm": 0.07292579859495163, "kl": 0.00726318359375, "learning_rate": 2.972989135793071e-06, "loss": -0.0316, "reward": 0.770833358168602, "reward_std": 0.1530931070446968, "rewards/accuracy_reward": 0.770833358168602, "rewards/format_reward": 0.0, "step": 145 }, { "completion_length": 478.2708435058594, "epoch": 0.15573333333333333, "grad_norm": 3.902277708053589, "kl": 0.0879058837890625, "learning_rate": 2.971922850656679e-06, "loss": 0.0371, "reward": 0.9791666716337204, "reward_std": 0.05103103443980217, "rewards/accuracy_reward": 0.9791666716337204, "rewards/format_reward": 0.0, "step": 146 }, { "completion_length": 531.1250076293945, "epoch": 0.1568, "grad_norm": 0.06389074772596359, "kl": 0.00485992431640625, "learning_rate": 2.970836123249305e-06, "loss": -0.0175, "reward": 0.7916666716337204, "reward_std": 0.11949635669589043, "rewards/accuracy_reward": 0.7916666716337204, "rewards/format_reward": 0.0, "step": 147 }, { "completion_length": 770.0416870117188, "epoch": 0.15786666666666666, "grad_norm": 0.2661518156528473, "kl": 0.01258087158203125, "learning_rate": 2.9697289686635704e-06, "loss": 0.0998, "reward": 0.7500000149011612, "reward_std": 0.2686738781630993, "rewards/accuracy_reward": 0.7500000149011612, "rewards/format_reward": 0.0, "step": 148 }, { "completion_length": 666.1250381469727, "epoch": 0.15893333333333334, "grad_norm": 0.11005847901105881, "kl": 0.0070037841796875, "learning_rate": 2.9686014022757936e-06, "loss": -0.0312, "reward": 0.7291667014360428, "reward_std": 0.2446802221238613, "rewards/accuracy_reward": 0.7291667014360428, "rewards/format_reward": 0.0, "step": 149 }, { "completion_length": 563.8750076293945, "epoch": 0.16, "grad_norm": 0.09993892908096313, "kl": 0.011260986328125, "learning_rate": 2.967453439745775e-06, "loss": 0.037, "reward": 0.7708333507180214, "reward_std": 0.21764284372329712, "rewards/accuracy_reward": 0.7708333507180214, "rewards/format_reward": 0.0, "step": 150 }, { "completion_length": 695.5000305175781, "epoch": 0.16106666666666666, "grad_norm": 0.08412571996450424, "kl": 0.0047607421875, "learning_rate": 2.9662850970165785e-06, "loss": 0.0287, "reward": 0.6458333432674408, "reward_std": 0.2350771278142929, "rewards/accuracy_reward": 0.6458333432674408, "rewards/format_reward": 0.0, "step": 151 }, { "completion_length": 702.2708587646484, "epoch": 0.16213333333333332, "grad_norm": 0.11586946994066238, "kl": 0.00862884521484375, "learning_rate": 2.9650963903143124e-06, "loss": 0.0071, "reward": 0.8125000149011612, "reward_std": 0.2350771315395832, "rewards/accuracy_reward": 0.8125000149011612, "rewards/format_reward": 0.0, "step": 152 }, { "completion_length": 515.1458435058594, "epoch": 0.1632, "grad_norm": 0.3182407021522522, "kl": 0.0129852294921875, "learning_rate": 2.9638873361479016e-06, "loss": 0.0728, "reward": 0.8125000149011612, "reward_std": 0.1801304966211319, "rewards/accuracy_reward": 0.8125000149011612, "rewards/format_reward": 0.0, "step": 153 }, { "completion_length": 731.1041717529297, "epoch": 0.16426666666666667, "grad_norm": 0.034731581807136536, "kl": 0.00762176513671875, "learning_rate": 2.9626579513088605e-06, "loss": -0.0045, "reward": 0.7916666716337204, "reward_std": 0.06454972922801971, "rewards/accuracy_reward": 0.7916666716337204, "rewards/format_reward": 0.0, "step": 154 }, { "completion_length": 515.4791717529297, "epoch": 0.16533333333333333, "grad_norm": 0.07761654257774353, "kl": 0.006103515625, "learning_rate": 2.961408252871058e-06, "loss": 0.006, "reward": 0.9791666716337204, "reward_std": 0.05103103816509247, "rewards/accuracy_reward": 0.9791666716337204, "rewards/format_reward": 0.0, "step": 155 }, { "completion_length": 416.3333511352539, "epoch": 0.1664, "grad_norm": 0.05361328274011612, "kl": 0.00830078125, "learning_rate": 2.9601382581904815e-06, "loss": -0.0111, "reward": 0.9791666716337204, "reward_std": 0.05103103443980217, "rewards/accuracy_reward": 0.9791666716337204, "rewards/format_reward": 0.0, "step": 156 }, { "completion_length": 810.0833435058594, "epoch": 0.16746666666666668, "grad_norm": 0.0816839337348938, "kl": 0.00737762451171875, "learning_rate": 2.958847984904994e-06, "loss": 0.041, "reward": 0.6250000149011612, "reward_std": 0.16661179810762405, "rewards/accuracy_reward": 0.6250000149011612, "rewards/format_reward": 0.0, "step": 157 }, { "completion_length": 541.0208511352539, "epoch": 0.16853333333333334, "grad_norm": 0.09771662950515747, "kl": 0.00588226318359375, "learning_rate": 2.9575374509340937e-06, "loss": -0.0047, "reward": 0.6458333432674408, "reward_std": 0.1705273948609829, "rewards/accuracy_reward": 0.6458333432674408, "rewards/format_reward": 0.0, "step": 158 }, { "completion_length": 808.4791870117188, "epoch": 0.1696, "grad_norm": 0.07247886061668396, "kl": 0.00653076171875, "learning_rate": 2.9562066744786588e-06, "loss": 0.0621, "reward": 0.770833358168602, "reward_std": 0.2350771203637123, "rewards/accuracy_reward": 0.770833358168602, "rewards/format_reward": 0.0, "step": 159 }, { "completion_length": 694.2916870117188, "epoch": 0.17066666666666666, "grad_norm": 0.09774583578109741, "kl": 0.009857177734375, "learning_rate": 2.9548556740207e-06, "loss": 0.0452, "reward": 0.708333358168602, "reward_std": 0.16661180183291435, "rewards/accuracy_reward": 0.708333358168602, "rewards/format_reward": 0.0, "step": 160 }, { "completion_length": 655.1666870117188, "epoch": 0.17173333333333332, "grad_norm": 0.12977637350559235, "kl": 0.00725555419921875, "learning_rate": 2.9534844683231005e-06, "loss": -0.0152, "reward": 0.770833358168602, "reward_std": 0.1801304928958416, "rewards/accuracy_reward": 0.770833358168602, "rewards/format_reward": 0.0, "step": 161 }, { "completion_length": 662.3958587646484, "epoch": 0.1728, "grad_norm": 0.3387124538421631, "kl": 0.0087890625, "learning_rate": 2.9520930764293584e-06, "loss": -0.0406, "reward": 0.7916666716337204, "reward_std": 0.16661179438233376, "rewards/accuracy_reward": 0.7916666716337204, "rewards/format_reward": 0.0, "step": 162 }, { "completion_length": 522.9583435058594, "epoch": 0.17386666666666667, "grad_norm": 0.1772523820400238, "kl": 0.01194000244140625, "learning_rate": 2.9506815176633184e-06, "loss": 0.0524, "reward": 0.8750000298023224, "reward_std": 0.23116152733564377, "rewards/accuracy_reward": 0.8750000298023224, "rewards/format_reward": 0.0, "step": 163 }, { "completion_length": 504.8333435058594, "epoch": 0.17493333333333333, "grad_norm": 0.05171928554773331, "kl": 0.008270263671875, "learning_rate": 2.949249811628907e-06, "loss": 0.0002, "reward": 0.7916666716337204, "reward_std": 0.06454972922801971, "rewards/accuracy_reward": 0.7916666716337204, "rewards/format_reward": 0.0, "step": 164 }, { "completion_length": 804.1458587646484, "epoch": 0.176, "grad_norm": 0.10338763892650604, "kl": 0.00841522216796875, "learning_rate": 2.9477979782098592e-06, "loss": 0.02, "reward": 0.5416666865348816, "reward_std": 0.23116152733564377, "rewards/accuracy_reward": 0.5416666865348816, "rewards/format_reward": 0.0, "step": 165 }, { "completion_length": 461.35418701171875, "epoch": 0.17706666666666668, "grad_norm": 0.07222110033035278, "kl": 0.0083160400390625, "learning_rate": 2.94632603756944e-06, "loss": 0.0491, "reward": 0.9791666716337204, "reward_std": 0.05103103443980217, "rewards/accuracy_reward": 0.9791666716337204, "rewards/format_reward": 0.0, "step": 166 }, { "completion_length": 590.9583435058594, "epoch": 0.17813333333333334, "grad_norm": 0.12525935471057892, "kl": 0.00612640380859375, "learning_rate": 2.9448340101501676e-06, "loss": -0.0399, "reward": 0.7500000298023224, "reward_std": 0.2957112565636635, "rewards/accuracy_reward": 0.7500000298023224, "rewards/format_reward": 0.0, "step": 167 }, { "completion_length": 743.4166793823242, "epoch": 0.1792, "grad_norm": 0.0554216243326664, "kl": 0.0072784423828125, "learning_rate": 2.9433219166735286e-06, "loss": -0.0102, "reward": 0.8333333432674408, "reward_std": 0.10206207260489464, "rewards/accuracy_reward": 0.8333333432674408, "rewards/format_reward": 0.0, "step": 168 }, { "completion_length": 633.3125152587891, "epoch": 0.18026666666666666, "grad_norm": 0.08849887549877167, "kl": 0.00717926025390625, "learning_rate": 2.9417897781396884e-06, "loss": -0.0317, "reward": 0.8750000298023224, "reward_std": 0.23116152733564377, "rewards/accuracy_reward": 0.8750000298023224, "rewards/format_reward": 0.0, "step": 169 }, { "completion_length": 829.2917022705078, "epoch": 0.18133333333333335, "grad_norm": 0.12259866297245026, "kl": 0.0110321044921875, "learning_rate": 2.9402376158272022e-06, "loss": 0.0051, "reward": 0.7083333432674408, "reward_std": 0.10206206887960434, "rewards/accuracy_reward": 0.7083333432674408, "rewards/format_reward": 0.0, "step": 170 }, { "completion_length": 726.8125305175781, "epoch": 0.1824, "grad_norm": 0.16716063022613525, "kl": 0.01651763916015625, "learning_rate": 2.938665451292719e-06, "loss": 0.0057, "reward": 0.8750000149011612, "reward_std": 0.12909945845603943, "rewards/accuracy_reward": 0.8750000149011612, "rewards/format_reward": 0.0, "step": 171 }, { "completion_length": 608.9375, "epoch": 0.18346666666666667, "grad_norm": 0.09395790100097656, "kl": 0.0068817138671875, "learning_rate": 2.937073306370679e-06, "loss": -0.0042, "reward": 0.7916666865348816, "reward_std": 0.22155843675136566, "rewards/accuracy_reward": 0.7916666865348816, "rewards/format_reward": 0.0, "step": 172 }, { "completion_length": 838.9583435058594, "epoch": 0.18453333333333333, "grad_norm": 0.0960073322057724, "kl": 0.00701904296875, "learning_rate": 2.9354612031730146e-06, "loss": 0.1132, "reward": 0.7916666865348816, "reward_std": 0.22155843675136566, "rewards/accuracy_reward": 0.7916666865348816, "rewards/format_reward": 0.0, "step": 173 }, { "completion_length": 549.0000152587891, "epoch": 0.1856, "grad_norm": 0.07480474561452866, "kl": 0.00726318359375, "learning_rate": 2.933829164088841e-06, "loss": 0.0363, "reward": 0.9166666865348816, "reward_std": 0.12909945845603943, "rewards/accuracy_reward": 0.9166666865348816, "rewards/format_reward": 0.0, "step": 174 }, { "completion_length": 663.9375152587891, "epoch": 0.18666666666666668, "grad_norm": 0.4338505268096924, "kl": 0.01030731201171875, "learning_rate": 2.9321772117841463e-06, "loss": 0.0485, "reward": 0.8333333432674408, "reward_std": 0.23116151988506317, "rewards/accuracy_reward": 0.8333333432674408, "rewards/format_reward": 0.0, "step": 175 }, { "completion_length": 779.6875305175781, "epoch": 0.18773333333333334, "grad_norm": 0.0840056985616684, "kl": 0.0088348388671875, "learning_rate": 2.9305053692014753e-06, "loss": -0.0265, "reward": 0.7708333432674408, "reward_std": 0.1530931033194065, "rewards/accuracy_reward": 0.7708333432674408, "rewards/format_reward": 0.0, "step": 176 }, { "completion_length": 627.2708511352539, "epoch": 0.1888, "grad_norm": 0.04047759622335434, "kl": 0.00780487060546875, "learning_rate": 2.928813659559612e-06, "loss": 0.0215, "reward": 0.9791666716337204, "reward_std": 0.05103103443980217, "rewards/accuracy_reward": 0.9791666716337204, "rewards/format_reward": 0.0, "step": 177 }, { "completion_length": 454.3333435058594, "epoch": 0.18986666666666666, "grad_norm": 0.015875019133090973, "kl": 0.0098419189453125, "learning_rate": 2.9271021063532586e-06, "loss": 0.0004, "reward": 0.875, "reward_std": 0.0, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 0.0, "step": 178 }, { "completion_length": 480.9166793823242, "epoch": 0.19093333333333334, "grad_norm": 0.05155733600258827, "kl": 0.00673675537109375, "learning_rate": 2.925370733352704e-06, "loss": -0.013, "reward": 0.9583333432674408, "reward_std": 0.10206207633018494, "rewards/accuracy_reward": 0.9583333432674408, "rewards/format_reward": 0.0, "step": 179 }, { "completion_length": 628.0416946411133, "epoch": 0.192, "grad_norm": 0.20751002430915833, "kl": 0.016265869140625, "learning_rate": 2.923619564603501e-06, "loss": -0.0011, "reward": 0.8958333432674408, "reward_std": 0.05103103443980217, "rewards/accuracy_reward": 0.8958333432674408, "rewards/format_reward": 0.0, "step": 180 }, { "completion_length": 551.7083511352539, "epoch": 0.19306666666666666, "grad_norm": 0.09491605311632156, "kl": 0.00775909423828125, "learning_rate": 2.921848624426126e-06, "loss": 0.0053, "reward": 0.833333358168602, "reward_std": 0.23116153106093407, "rewards/accuracy_reward": 0.833333358168602, "rewards/format_reward": 0.0, "step": 181 }, { "completion_length": 689.6250076293945, "epoch": 0.19413333333333332, "grad_norm": 0.1940665990114212, "kl": 0.00716400146484375, "learning_rate": 2.9200579374156446e-06, "loss": -0.0719, "reward": 0.5833333432674408, "reward_std": 0.24859581887722015, "rewards/accuracy_reward": 0.5833333432674408, "rewards/format_reward": 0.0, "step": 182 }, { "completion_length": 592.3333435058594, "epoch": 0.1952, "grad_norm": 0.08693472295999527, "kl": 0.010772705078125, "learning_rate": 2.918247528441369e-06, "loss": -0.0066, "reward": 0.7500000298023224, "reward_std": 0.16661180183291435, "rewards/accuracy_reward": 0.7500000298023224, "rewards/format_reward": 0.0, "step": 183 }, { "completion_length": 469.02085876464844, "epoch": 0.19626666666666667, "grad_norm": 0.35742974281311035, "kl": 0.00875091552734375, "learning_rate": 2.9164174226465136e-06, "loss": -0.0089, "reward": 0.9791666716337204, "reward_std": 0.05103103443980217, "rewards/accuracy_reward": 0.9791666716337204, "rewards/format_reward": 0.0, "step": 184 }, { "completion_length": 584.3333587646484, "epoch": 0.19733333333333333, "grad_norm": 0.12243155390024185, "kl": 0.00921630859375, "learning_rate": 2.9145676454478435e-06, "loss": 0.0449, "reward": 0.7291666865348816, "reward_std": 0.27258947491645813, "rewards/accuracy_reward": 0.7291666865348816, "rewards/format_reward": 0.0, "step": 185 }, { "completion_length": 525.4583511352539, "epoch": 0.1984, "grad_norm": 0.008311565034091473, "kl": 0.0063018798828125, "learning_rate": 2.912698222535324e-06, "loss": 0.0003, "reward": 0.875, "reward_std": 0.0, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 0.0, "step": 186 }, { "completion_length": 591.8750076293945, "epoch": 0.19946666666666665, "grad_norm": 0.14724919199943542, "kl": 0.009857177734375, "learning_rate": 2.9108091798717634e-06, "loss": 0.0836, "reward": 0.666666679084301, "reward_std": 0.16661180183291435, "rewards/accuracy_reward": 0.666666679084301, "rewards/format_reward": 0.0, "step": 187 }, { "completion_length": 615.7708511352539, "epoch": 0.20053333333333334, "grad_norm": 0.0707949548959732, "kl": 0.0063934326171875, "learning_rate": 2.9089005436924505e-06, "loss": 0.0135, "reward": 0.9375000149011612, "reward_std": 0.11558076366782188, "rewards/accuracy_reward": 0.9375000149011612, "rewards/format_reward": 0.0, "step": 188 }, { "completion_length": 681.6458435058594, "epoch": 0.2016, "grad_norm": 0.10560750961303711, "kl": 0.00757598876953125, "learning_rate": 2.9069723405047926e-06, "loss": 0.0842, "reward": 0.6666667014360428, "reward_std": 0.268673874437809, "rewards/accuracy_reward": 0.6666667014360428, "rewards/format_reward": 0.0, "step": 189 }, { "completion_length": 453.7083511352539, "epoch": 0.20266666666666666, "grad_norm": 0.05186254903674126, "kl": 0.00640869140625, "learning_rate": 2.9050245970879456e-06, "loss": 0.0288, "reward": 0.9583333432674408, "reward_std": 0.06454972922801971, "rewards/accuracy_reward": 0.9583333432674408, "rewards/format_reward": 0.0, "step": 190 }, { "completion_length": 465.6458511352539, "epoch": 0.20373333333333332, "grad_norm": 0.006153787951916456, "kl": 0.00669097900390625, "learning_rate": 2.903057340492444e-06, "loss": 0.0003, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward": 1.0, "rewards/format_reward": 0.0, "step": 191 }, { "completion_length": 438.10418701171875, "epoch": 0.2048, "grad_norm": 0.10477295517921448, "kl": 0.0060577392578125, "learning_rate": 2.901070598039822e-06, "loss": 0.033, "reward": 0.9375000149011612, "reward_std": 0.1530931033194065, "rewards/accuracy_reward": 0.9375000149011612, "rewards/format_reward": 0.0, "step": 192 }, { "completion_length": 531.8125305175781, "epoch": 0.20586666666666667, "grad_norm": 2.2601284980773926, "kl": 0.0524749755859375, "learning_rate": 2.8990643973222383e-06, "loss": 0.0722, "reward": 0.9375000149011612, "reward_std": 0.11558076366782188, "rewards/accuracy_reward": 0.9375000149011612, "rewards/format_reward": 0.0, "step": 193 }, { "completion_length": 595.3125152587891, "epoch": 0.20693333333333333, "grad_norm": 0.03044331632554531, "kl": 0.00616455078125, "learning_rate": 2.89703876620209e-06, "loss": -0.0018, "reward": 0.8333333432674408, "reward_std": 0.06454972922801971, "rewards/accuracy_reward": 0.8333333432674408, "rewards/format_reward": 0.0, "step": 194 }, { "completion_length": 741.2500152587891, "epoch": 0.208, "grad_norm": 0.11531760543584824, "kl": 0.00676727294921875, "learning_rate": 2.8949937328116252e-06, "loss": 0.0888, "reward": 0.770833358168602, "reward_std": 0.21764282882213593, "rewards/accuracy_reward": 0.770833358168602, "rewards/format_reward": 0.0, "step": 195 }, { "completion_length": 748.4375228881836, "epoch": 0.20906666666666668, "grad_norm": 19.34681510925293, "kl": 0.5645294189453125, "learning_rate": 2.8929293255525563e-06, "loss": 0.1034, "reward": 0.7083333432674408, "reward_std": 0.10206206887960434, "rewards/accuracy_reward": 0.7083333432674408, "rewards/format_reward": 0.0, "step": 196 }, { "completion_length": 538.8750076293945, "epoch": 0.21013333333333334, "grad_norm": 2.2137532234191895, "kl": 0.1054840087890625, "learning_rate": 2.8908455730956588e-06, "loss": 0.0142, "reward": 0.9583333432674408, "reward_std": 0.10206207260489464, "rewards/accuracy_reward": 0.9583333432674408, "rewards/format_reward": 0.0, "step": 197 }, { "completion_length": 671.2916870117188, "epoch": 0.2112, "grad_norm": 0.09089936316013336, "kl": 0.00830078125, "learning_rate": 2.88874250438038e-06, "loss": 0.0702, "reward": 0.8750000149011612, "reward_std": 0.18404608964920044, "rewards/accuracy_reward": 0.8750000149011612, "rewards/format_reward": 0.0, "step": 198 }, { "completion_length": 417.81251525878906, "epoch": 0.21226666666666666, "grad_norm": 0.08758632093667984, "kl": 0.00824737548828125, "learning_rate": 2.8866201486144333e-06, "loss": -0.0186, "reward": 0.9583333432674408, "reward_std": 0.10206206887960434, "rewards/accuracy_reward": 0.9583333432674408, "rewards/format_reward": 0.0, "step": 199 }, { "completion_length": 426.3333435058594, "epoch": 0.21333333333333335, "grad_norm": 0.1501622349023819, "kl": 0.0216217041015625, "learning_rate": 2.884478535273393e-06, "loss": -0.0602, "reward": 0.895833358168602, "reward_std": 0.21764283627271652, "rewards/accuracy_reward": 0.895833358168602, "rewards/format_reward": 0.0, "step": 200 }, { "completion_length": 530.7291946411133, "epoch": 0.2144, "grad_norm": 0.10818256437778473, "kl": 0.01129150390625, "learning_rate": 2.8823176941002853e-06, "loss": -0.0205, "reward": 0.708333358168602, "reward_std": 0.16661179810762405, "rewards/accuracy_reward": 0.708333358168602, "rewards/format_reward": 0.0, "step": 201 }, { "completion_length": 533.9791870117188, "epoch": 0.21546666666666667, "grad_norm": 0.12741748988628387, "kl": 0.00815582275390625, "learning_rate": 2.880137655105176e-06, "loss": 0.0177, "reward": 0.9166666865348816, "reward_std": 0.16661180555820465, "rewards/accuracy_reward": 0.9166666865348816, "rewards/format_reward": 0.0, "step": 202 }, { "completion_length": 565.8125152587891, "epoch": 0.21653333333333333, "grad_norm": 0.17605474591255188, "kl": 0.024810791015625, "learning_rate": 2.877938448564752e-06, "loss": 0.0557, "reward": 0.9375000149011612, "reward_std": 0.11558076739311218, "rewards/accuracy_reward": 0.9375000149011612, "rewards/format_reward": 0.0, "step": 203 }, { "completion_length": 576.0625228881836, "epoch": 0.2176, "grad_norm": 0.04692959785461426, "kl": 0.00820159912109375, "learning_rate": 2.875720105021903e-06, "loss": 0.0209, "reward": 0.7500000149011612, "reward_std": 0.10206207260489464, "rewards/accuracy_reward": 0.7500000149011612, "rewards/format_reward": 0.0, "step": 204 }, { "completion_length": 505.12500762939453, "epoch": 0.21866666666666668, "grad_norm": 0.07776006311178207, "kl": 0.00800323486328125, "learning_rate": 2.8734826552852934e-06, "loss": 0.018, "reward": 0.8333333432674408, "reward_std": 0.10206206887960434, "rewards/accuracy_reward": 0.8333333432674408, "rewards/format_reward": 0.0, "step": 205 }, { "completion_length": 586.2083435058594, "epoch": 0.21973333333333334, "grad_norm": 0.07713919878005981, "kl": 0.0096893310546875, "learning_rate": 2.8712261304289407e-06, "loss": 0.1099, "reward": 0.833333358168602, "reward_std": 0.19364918768405914, "rewards/accuracy_reward": 0.833333358168602, "rewards/format_reward": 0.0, "step": 206 }, { "completion_length": 680.6041870117188, "epoch": 0.2208, "grad_norm": 0.049553271383047104, "kl": 0.0110321044921875, "learning_rate": 2.868950561791778e-06, "loss": 0.0024, "reward": 0.6875, "reward_std": 0.06846532225608826, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 207 }, { "completion_length": 760.2708435058594, "epoch": 0.22186666666666666, "grad_norm": 0.31244659423828125, "kl": 0.0194244384765625, "learning_rate": 2.8666559809772215e-06, "loss": 0.0703, "reward": 0.6250000204890966, "reward_std": 0.23116152733564377, "rewards/accuracy_reward": 0.6250000204890966, "rewards/format_reward": 0.0, "step": 208 }, { "completion_length": 656.8750305175781, "epoch": 0.22293333333333334, "grad_norm": 0.09441377967596054, "kl": 0.0111083984375, "learning_rate": 2.8643424198527314e-06, "loss": -0.0038, "reward": 0.7916666716337204, "reward_std": 0.16661179438233376, "rewards/accuracy_reward": 0.7916666716337204, "rewards/format_reward": 0.0, "step": 209 }, { "completion_length": 459.4791793823242, "epoch": 0.224, "grad_norm": 0.08553262054920197, "kl": 0.01025390625, "learning_rate": 2.86200991054937e-06, "loss": -0.0185, "reward": 0.8750000149011612, "reward_std": 0.12909945845603943, "rewards/accuracy_reward": 0.8750000149011612, "rewards/format_reward": 0.0, "step": 210 }, { "completion_length": 554.3125305175781, "epoch": 0.22506666666666666, "grad_norm": 0.0791153684258461, "kl": 0.0096282958984375, "learning_rate": 2.8596584854613513e-06, "loss": -0.0007, "reward": 0.7916666716337204, "reward_std": 0.16661179810762405, "rewards/accuracy_reward": 0.7916666716337204, "rewards/format_reward": 0.0, "step": 211 }, { "completion_length": 561.2916946411133, "epoch": 0.22613333333333333, "grad_norm": 0.04319440945982933, "kl": 0.00849151611328125, "learning_rate": 2.8572881772455993e-06, "loss": 0.0114, "reward": 0.9583333432674408, "reward_std": 0.06454972922801971, "rewards/accuracy_reward": 0.9583333432674408, "rewards/format_reward": 0.0, "step": 212 }, { "completion_length": 542.8750152587891, "epoch": 0.2272, "grad_norm": 0.061369989067316055, "kl": 0.0102691650390625, "learning_rate": 2.8548990188212853e-06, "loss": 0.0052, "reward": 0.8541666716337204, "reward_std": 0.05103103443980217, "rewards/accuracy_reward": 0.8541666716337204, "rewards/format_reward": 0.0, "step": 213 }, { "completion_length": 595.3125152587891, "epoch": 0.22826666666666667, "grad_norm": 0.0969829261302948, "kl": 0.00824737548828125, "learning_rate": 2.852491043369377e-06, "loss": -0.029, "reward": 0.6875000149011612, "reward_std": 0.27258947119116783, "rewards/accuracy_reward": 0.6875000149011612, "rewards/format_reward": 0.0, "step": 214 }, { "completion_length": 843.9375305175781, "epoch": 0.22933333333333333, "grad_norm": 0.022766612470149994, "kl": 0.0091094970703125, "learning_rate": 2.850064284332176e-06, "loss": 0.0143, "reward": 0.8125, "reward_std": 0.06846532225608826, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 0.0, "step": 215 }, { "completion_length": 655.8958435058594, "epoch": 0.2304, "grad_norm": 0.10100144892930984, "kl": 0.0084686279296875, "learning_rate": 2.847618775412851e-06, "loss": 0.0493, "reward": 0.7500000149011612, "reward_std": 0.16661180555820465, "rewards/accuracy_reward": 0.7500000149011612, "rewards/format_reward": 0.0, "step": 216 }, { "completion_length": 489.00001525878906, "epoch": 0.23146666666666665, "grad_norm": 0.0977906659245491, "kl": 0.01055145263671875, "learning_rate": 2.845154550574973e-06, "loss": 0.0265, "reward": 0.8750000149011612, "reward_std": 0.12909945845603943, "rewards/accuracy_reward": 0.8750000149011612, "rewards/format_reward": 0.0, "step": 217 }, { "completion_length": 677.0625152587891, "epoch": 0.23253333333333334, "grad_norm": 0.08799886703491211, "kl": 0.009307861328125, "learning_rate": 2.842671644042043e-06, "loss": 0.0675, "reward": 0.7291666865348816, "reward_std": 0.3170611336827278, "rewards/accuracy_reward": 0.7291666865348816, "rewards/format_reward": 0.0, "step": 218 }, { "completion_length": 697.3958435058594, "epoch": 0.2336, "grad_norm": 0.2211955189704895, "kl": 0.01151275634765625, "learning_rate": 2.840170090297014e-06, "loss": 0.0534, "reward": 0.729166679084301, "reward_std": 0.2900237701833248, "rewards/accuracy_reward": 0.729166679084301, "rewards/format_reward": 0.0, "step": 219 }, { "completion_length": 646.0625152587891, "epoch": 0.23466666666666666, "grad_norm": 0.04136532172560692, "kl": 0.0084381103515625, "learning_rate": 2.8376499240818166e-06, "loss": 0.0364, "reward": 0.7291666716337204, "reward_std": 0.05103103816509247, "rewards/accuracy_reward": 0.7291666716337204, "rewards/format_reward": 0.0, "step": 220 }, { "completion_length": 636.4791793823242, "epoch": 0.23573333333333332, "grad_norm": 0.3394457995891571, "kl": 0.01605224609375, "learning_rate": 2.8351111803968714e-06, "loss": 0.0456, "reward": 0.7500000074505806, "reward_std": 0.18404609709978104, "rewards/accuracy_reward": 0.7500000074505806, "rewards/format_reward": 0.0, "step": 221 }, { "completion_length": 681.0000228881836, "epoch": 0.2368, "grad_norm": 0.07565105706453323, "kl": 0.007781982421875, "learning_rate": 2.8325538945006067e-06, "loss": -0.0281, "reward": 0.8125000149011612, "reward_std": 0.21764283254742622, "rewards/accuracy_reward": 0.8125000149011612, "rewards/format_reward": 0.0, "step": 222 }, { "completion_length": 972.5833892822266, "epoch": 0.23786666666666667, "grad_norm": 0.0812593549489975, "kl": 0.011962890625, "learning_rate": 2.829978101908969e-06, "loss": 0.0414, "reward": 0.6041666716337204, "reward_std": 0.1530931107699871, "rewards/accuracy_reward": 0.6041666716337204, "rewards/format_reward": 0.0, "step": 223 }, { "completion_length": 671.8958587646484, "epoch": 0.23893333333333333, "grad_norm": 0.09207544475793839, "kl": 0.00724029541015625, "learning_rate": 2.827383838394926e-06, "loss": 0.0258, "reward": 0.7916666865348816, "reward_std": 0.16661179810762405, "rewards/accuracy_reward": 0.7916666865348816, "rewards/format_reward": 0.0, "step": 224 }, { "completion_length": 506.1458511352539, "epoch": 0.24, "grad_norm": 0.11194305121898651, "kl": 0.0118865966796875, "learning_rate": 2.8247711399879734e-06, "loss": 0.0085, "reward": 0.7500000149011612, "reward_std": 0.20412414520978928, "rewards/accuracy_reward": 0.7500000149011612, "rewards/format_reward": 0.0, "step": 225 }, { "completion_length": 588.2500228881836, "epoch": 0.24106666666666668, "grad_norm": 0.06695137172937393, "kl": 0.008941650390625, "learning_rate": 2.8221400429736333e-06, "loss": 0.0266, "reward": 0.9791666716337204, "reward_std": 0.05103103443980217, "rewards/accuracy_reward": 0.9791666716337204, "rewards/format_reward": 0.0, "step": 226 }, { "completion_length": 558.1458435058594, "epoch": 0.24213333333333334, "grad_norm": 0.10279230773448944, "kl": 0.01178741455078125, "learning_rate": 2.81949058389295e-06, "loss": -0.0118, "reward": 0.9791666716337204, "reward_std": 0.05103103443980217, "rewards/accuracy_reward": 0.9791666716337204, "rewards/format_reward": 0.0, "step": 227 }, { "completion_length": 588.3333511352539, "epoch": 0.2432, "grad_norm": 0.16600674390792847, "kl": 0.00972747802734375, "learning_rate": 2.8168227995419826e-06, "loss": 0.064, "reward": 0.8125000298023224, "reward_std": 0.28219256550073624, "rewards/accuracy_reward": 0.8125000298023224, "rewards/format_reward": 0.0, "step": 228 }, { "completion_length": 547.4791870117188, "epoch": 0.24426666666666666, "grad_norm": 0.15054546296596527, "kl": 0.0087738037109375, "learning_rate": 2.8141367269712943e-06, "loss": 0.0335, "reward": 0.7916666865348816, "reward_std": 0.2686738669872284, "rewards/accuracy_reward": 0.7916666865348816, "rewards/format_reward": 0.0, "step": 229 }, { "completion_length": 630.1875152587891, "epoch": 0.24533333333333332, "grad_norm": 0.5896067023277283, "kl": 0.040771484375, "learning_rate": 2.8114324034854378e-06, "loss": 0.0851, "reward": 0.708333358168602, "reward_std": 0.24859580025076866, "rewards/accuracy_reward": 0.708333358168602, "rewards/format_reward": 0.0, "step": 230 }, { "completion_length": 605.7500152587891, "epoch": 0.2464, "grad_norm": 0.19836215674877167, "kl": 0.02120208740234375, "learning_rate": 2.808709866642437e-06, "loss": 0.0008, "reward": 0.875, "reward_std": 0.0, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 0.0, "step": 231 }, { "completion_length": 482.2708511352539, "epoch": 0.24746666666666667, "grad_norm": 0.08827076107263565, "kl": 0.013824462890625, "learning_rate": 2.8059691542532654e-06, "loss": 0.009, "reward": 0.708333358168602, "reward_std": 0.16661180183291435, "rewards/accuracy_reward": 0.708333358168602, "rewards/format_reward": 0.0, "step": 232 }, { "completion_length": 530.0833511352539, "epoch": 0.24853333333333333, "grad_norm": 0.006477909628301859, "kl": 0.00848388671875, "learning_rate": 2.8032103043813213e-06, "loss": 0.0003, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward": 1.0, "rewards/format_reward": 0.0, "step": 233 }, { "completion_length": 488.8541793823242, "epoch": 0.2496, "grad_norm": 0.08857883512973785, "kl": 0.008758544921875, "learning_rate": 2.800433355341898e-06, "loss": 0.003, "reward": 0.8541666716337204, "reward_std": 0.18796167895197868, "rewards/accuracy_reward": 0.8541666716337204, "rewards/format_reward": 0.0, "step": 234 }, { "completion_length": 505.37501525878906, "epoch": 0.25066666666666665, "grad_norm": 0.35280442237854004, "kl": 0.01116943359375, "learning_rate": 2.7976383457016535e-06, "loss": 0.0274, "reward": 0.7291666716337204, "reward_std": 0.28219255432486534, "rewards/accuracy_reward": 0.7291666716337204, "rewards/format_reward": 0.0, "step": 235 }, { "completion_length": 568.7083435058594, "epoch": 0.2517333333333333, "grad_norm": 0.09588750451803207, "kl": 0.00836181640625, "learning_rate": 2.7948253142780738e-06, "loss": -0.0116, "reward": 0.7708333432674408, "reward_std": 0.1705273948609829, "rewards/accuracy_reward": 0.7708333432674408, "rewards/format_reward": 0.0, "step": 236 }, { "completion_length": 653.0833511352539, "epoch": 0.2528, "grad_norm": 0.09476999193429947, "kl": 0.010833740234375, "learning_rate": 2.791994300138934e-06, "loss": 0.0407, "reward": 0.666666692122817, "reward_std": 0.23116152733564377, "rewards/accuracy_reward": 0.666666692122817, "rewards/format_reward": 0.0, "step": 237 }, { "completion_length": 535.8125228881836, "epoch": 0.2538666666666667, "grad_norm": 0.1287631392478943, "kl": 0.00875091552734375, "learning_rate": 2.789145342601755e-06, "loss": 0.0096, "reward": 0.9166666865348816, "reward_std": 0.20412414893507957, "rewards/accuracy_reward": 0.9166666865348816, "rewards/format_reward": 0.0, "step": 238 }, { "completion_length": 658.6250228881836, "epoch": 0.25493333333333335, "grad_norm": 0.10654311627149582, "kl": 0.01226806640625, "learning_rate": 2.786278481233259e-06, "loss": 0.0541, "reward": 0.7916666865348816, "reward_std": 0.20412414893507957, "rewards/accuracy_reward": 0.7916666865348816, "rewards/format_reward": 0.0, "step": 239 }, { "completion_length": 829.9791870117188, "epoch": 0.256, "grad_norm": 0.08298163115978241, "kl": 0.01019287109375, "learning_rate": 2.7833937558488187e-06, "loss": 0.0521, "reward": 0.5625000149011612, "reward_std": 0.24468021839857101, "rewards/accuracy_reward": 0.5625000149011612, "rewards/format_reward": 0.0, "step": 240 }, { "completion_length": 483.9166717529297, "epoch": 0.25706666666666667, "grad_norm": 0.11503268033266068, "kl": 0.010162353515625, "learning_rate": 2.7804912065119048e-06, "loss": 0.0408, "reward": 0.895833358168602, "reward_std": 0.1801304928958416, "rewards/accuracy_reward": 0.895833358168602, "rewards/format_reward": 0.0, "step": 241 }, { "completion_length": 507.1666793823242, "epoch": 0.2581333333333333, "grad_norm": 0.011149649508297443, "kl": 0.00823974609375, "learning_rate": 2.777570873533529e-06, "loss": 0.0003, "reward": 0.875, "reward_std": 0.0, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 0.0, "step": 242 }, { "completion_length": 552.125, "epoch": 0.2592, "grad_norm": 0.06200383976101875, "kl": 0.0134735107421875, "learning_rate": 2.7746327974716863e-06, "loss": 0.009, "reward": 0.9375000149011612, "reward_std": 0.11558076366782188, "rewards/accuracy_reward": 0.9375000149011612, "rewards/format_reward": 0.0, "step": 243 }, { "completion_length": 539.3333511352539, "epoch": 0.26026666666666665, "grad_norm": 0.15987475216388702, "kl": 0.011199951171875, "learning_rate": 2.7716770191307885e-06, "loss": 0.0598, "reward": 0.7291666828095913, "reward_std": 0.25515518337488174, "rewards/accuracy_reward": 0.7291666828095913, "rewards/format_reward": 0.0, "step": 244 }, { "completion_length": 451.1666717529297, "epoch": 0.2613333333333333, "grad_norm": 0.05804312229156494, "kl": 0.00928497314453125, "learning_rate": 2.7687035795611003e-06, "loss": -0.0137, "reward": 0.9791666716337204, "reward_std": 0.05103103443980217, "rewards/accuracy_reward": 0.9791666716337204, "rewards/format_reward": 0.0, "step": 245 }, { "completion_length": 452.7083435058594, "epoch": 0.2624, "grad_norm": 0.1001739501953125, "kl": 0.0129241943359375, "learning_rate": 2.7657125200581663e-06, "loss": 0.0206, "reward": 0.7291666865348816, "reward_std": 0.1530931070446968, "rewards/accuracy_reward": 0.7291666865348816, "rewards/format_reward": 0.0, "step": 246 }, { "completion_length": 657.1041717529297, "epoch": 0.2634666666666667, "grad_norm": 0.10195165872573853, "kl": 0.01123046875, "learning_rate": 2.7627038821622417e-06, "loss": -0.0313, "reward": 0.6666666865348816, "reward_std": 0.24859580025076866, "rewards/accuracy_reward": 0.6666666865348816, "rewards/format_reward": 0.0, "step": 247 }, { "completion_length": 643.5416870117188, "epoch": 0.26453333333333334, "grad_norm": 0.08988294750452042, "kl": 0.01009368896484375, "learning_rate": 2.7596777076577106e-06, "loss": 0.011, "reward": 0.8125000074505806, "reward_std": 0.11558076739311218, "rewards/accuracy_reward": 0.8125000074505806, "rewards/format_reward": 0.0, "step": 248 }, { "completion_length": 573.9166793823242, "epoch": 0.2656, "grad_norm": 0.19295960664749146, "kl": 0.0142974853515625, "learning_rate": 2.7566340385725087e-06, "loss": 0.0091, "reward": 0.8125000149011612, "reward_std": 0.21764283999800682, "rewards/accuracy_reward": 0.8125000149011612, "rewards/format_reward": 0.0, "step": 249 }, { "completion_length": 757.8125152587891, "epoch": 0.26666666666666666, "grad_norm": 0.10752751678228378, "kl": 0.01110076904296875, "learning_rate": 2.7535729171775408e-06, "loss": -0.0001, "reward": 0.8125000298023224, "reward_std": 0.21764283627271652, "rewards/accuracy_reward": 0.8125000298023224, "rewards/format_reward": 0.0, "step": 250 }, { "completion_length": 509.8125228881836, "epoch": 0.2677333333333333, "grad_norm": 0.17306388914585114, "kl": 0.01340484619140625, "learning_rate": 2.7504943859860883e-06, "loss": -0.0749, "reward": 0.7916666865348816, "reward_std": 0.20412414893507957, "rewards/accuracy_reward": 0.7916666865348816, "rewards/format_reward": 0.0, "step": 251 }, { "completion_length": 691.2500152587891, "epoch": 0.2688, "grad_norm": 1.702752709388733, "kl": 0.0494232177734375, "learning_rate": 2.7473984877532248e-06, "loss": 0.0817, "reward": 0.7708333507180214, "reward_std": 0.27258947864174843, "rewards/accuracy_reward": 0.7708333507180214, "rewards/format_reward": 0.0, "step": 252 }, { "completion_length": 536.5208435058594, "epoch": 0.26986666666666664, "grad_norm": 0.05621562525629997, "kl": 0.009185791015625, "learning_rate": 2.7442852654752197e-06, "loss": -0.0023, "reward": 0.9583333432674408, "reward_std": 0.10206206887960434, "rewards/accuracy_reward": 0.9583333432674408, "rewards/format_reward": 0.0, "step": 253 }, { "completion_length": 464.7291793823242, "epoch": 0.27093333333333336, "grad_norm": 0.07164980471134186, "kl": 0.0111083984375, "learning_rate": 2.74115476238894e-06, "loss": -0.0119, "reward": 0.9791666716337204, "reward_std": 0.05103103816509247, "rewards/accuracy_reward": 0.9791666716337204, "rewards/format_reward": 0.0, "step": 254 }, { "completion_length": 590.1041717529297, "epoch": 0.272, "grad_norm": 0.15541806817054749, "kl": 0.011383056640625, "learning_rate": 2.7380070219712514e-06, "loss": 0.0647, "reward": 0.8958333432674408, "reward_std": 0.11558076366782188, "rewards/accuracy_reward": 0.8958333432674408, "rewards/format_reward": 0.0, "step": 255 }, { "completion_length": 638.5208511352539, "epoch": 0.2730666666666667, "grad_norm": 0.02778136357665062, "kl": 0.010711669921875, "learning_rate": 2.734842087938415e-06, "loss": 0.0098, "reward": 0.8541666716337204, "reward_std": 0.05103103443980217, "rewards/accuracy_reward": 0.8541666716337204, "rewards/format_reward": 0.0, "step": 256 }, { "completion_length": 448.5208435058594, "epoch": 0.27413333333333334, "grad_norm": 0.012363338842988014, "kl": 0.0143585205078125, "learning_rate": 2.731660004245478e-06, "loss": 0.0006, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward": 1.0, "rewards/format_reward": 0.0, "step": 257 }, { "completion_length": 694.6666870117188, "epoch": 0.2752, "grad_norm": 0.18281190097332, "kl": 0.0093536376953125, "learning_rate": 2.728460815085665e-06, "loss": 0.0967, "reward": 0.7083333507180214, "reward_std": 0.2861081548035145, "rewards/accuracy_reward": 0.7083333507180214, "rewards/format_reward": 0.0, "step": 258 }, { "completion_length": 884.3542022705078, "epoch": 0.27626666666666666, "grad_norm": 0.0680878534913063, "kl": 0.013946533203125, "learning_rate": 2.725244564889764e-06, "loss": 0.0013, "reward": 0.729166679084301, "reward_std": 0.1801304928958416, "rewards/accuracy_reward": 0.729166679084301, "rewards/format_reward": 0.0, "step": 259 }, { "completion_length": 722.5416870117188, "epoch": 0.2773333333333333, "grad_norm": 0.104962557554245, "kl": 0.013519287109375, "learning_rate": 2.722011298325509e-06, "loss": 0.1055, "reward": 0.75, "reward_std": 0.18404608219861984, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "step": 260 }, { "completion_length": 461.0000228881836, "epoch": 0.2784, "grad_norm": 3.486393690109253, "kl": 0.05315399169921875, "learning_rate": 2.7187610602969586e-06, "loss": 0.1157, "reward": 0.9583333432674408, "reward_std": 0.10206206887960434, "rewards/accuracy_reward": 0.9583333432674408, "rewards/format_reward": 0.0, "step": 261 }, { "completion_length": 842.0625305175781, "epoch": 0.27946666666666664, "grad_norm": 1.1189333200454712, "kl": 0.064605712890625, "learning_rate": 2.7154938959438756e-06, "loss": -0.0489, "reward": 0.5000000055879354, "reward_std": 0.20412413775920868, "rewards/accuracy_reward": 0.5000000055879354, "rewards/format_reward": 0.0, "step": 262 }, { "completion_length": 606.3333358764648, "epoch": 0.28053333333333336, "grad_norm": 0.4565054774284363, "kl": 0.020965576171875, "learning_rate": 2.7122098506410955e-06, "loss": 0.0139, "reward": 0.7916666865348816, "reward_std": 0.2861081510782242, "rewards/accuracy_reward": 0.7916666865348816, "rewards/format_reward": 0.0, "step": 263 }, { "completion_length": 852.5000305175781, "epoch": 0.2816, "grad_norm": 10.727190971374512, "kl": 1.582427978515625, "learning_rate": 2.7089089699979008e-06, "loss": 0.0973, "reward": 0.6458333432674408, "reward_std": 0.13301505148410797, "rewards/accuracy_reward": 0.6458333432674408, "rewards/format_reward": 0.0, "step": 264 }, { "completion_length": 628.2708435058594, "epoch": 0.2826666666666667, "grad_norm": 0.08597361296415329, "kl": 0.0169677734375, "learning_rate": 2.705591299857385e-06, "loss": 0.0366, "reward": 0.6666666716337204, "reward_std": 0.11949636042118073, "rewards/accuracy_reward": 0.6666666716337204, "rewards/format_reward": 0.0, "step": 265 }, { "completion_length": 591.7291717529297, "epoch": 0.28373333333333334, "grad_norm": 0.1049145758152008, "kl": 0.0169677734375, "learning_rate": 2.7022568862958153e-06, "loss": 0.0417, "reward": 0.791666679084301, "reward_std": 0.22155843675136566, "rewards/accuracy_reward": 0.791666679084301, "rewards/format_reward": 0.0, "step": 266 }, { "completion_length": 933.5000152587891, "epoch": 0.2848, "grad_norm": 0.10108722001314163, "kl": 0.01446533203125, "learning_rate": 2.6989057756219958e-06, "loss": 0.0719, "reward": 0.5833333432674408, "reward_std": 0.24859581142663956, "rewards/accuracy_reward": 0.5833333432674408, "rewards/format_reward": 0.0, "step": 267 }, { "completion_length": 500.7291793823242, "epoch": 0.28586666666666666, "grad_norm": 0.1733587384223938, "kl": 0.0144195556640625, "learning_rate": 2.6955380143766217e-06, "loss": 0.0422, "reward": 0.8541666865348816, "reward_std": 0.21764284372329712, "rewards/accuracy_reward": 0.8541666865348816, "rewards/format_reward": 0.0, "step": 268 }, { "completion_length": 581.2291793823242, "epoch": 0.2869333333333333, "grad_norm": 0.23753900825977325, "kl": 0.0107879638671875, "learning_rate": 2.6921536493316326e-06, "loss": 0.0167, "reward": 0.8333333432674408, "reward_std": 0.23116153106093407, "rewards/accuracy_reward": 0.8333333432674408, "rewards/format_reward": 0.0, "step": 269 }, { "completion_length": 564.2916870117188, "epoch": 0.288, "grad_norm": 0.3392970860004425, "kl": 0.0313720703125, "learning_rate": 2.6887527274895657e-06, "loss": -0.0766, "reward": 0.8541666865348816, "reward_std": 0.27258946374058723, "rewards/accuracy_reward": 0.8541666865348816, "rewards/format_reward": 0.0, "step": 270 }, { "completion_length": 435.4583435058594, "epoch": 0.2890666666666667, "grad_norm": 0.11167998611927032, "kl": 0.013763427734375, "learning_rate": 2.6853352960829e-06, "loss": -0.0062, "reward": 0.7916666865348816, "reward_std": 0.16661179810762405, "rewards/accuracy_reward": 0.7916666865348816, "rewards/format_reward": 0.0, "step": 271 }, { "completion_length": 454.7083511352539, "epoch": 0.29013333333333335, "grad_norm": 0.05223336070775986, "kl": 0.0141143798828125, "learning_rate": 2.6819014025734022e-06, "loss": -0.001, "reward": 0.9583333432674408, "reward_std": 0.06454972922801971, "rewards/accuracy_reward": 0.9583333432674408, "rewards/format_reward": 0.0, "step": 272 }, { "completion_length": 676.4375152587891, "epoch": 0.2912, "grad_norm": 0.21856988966464996, "kl": 0.0127105712890625, "learning_rate": 2.678451094651467e-06, "loss": 0.0121, "reward": 0.8750000298023224, "reward_std": 0.268673874437809, "rewards/accuracy_reward": 0.8750000298023224, "rewards/format_reward": 0.0, "step": 273 }, { "completion_length": 640.3125152587891, "epoch": 0.2922666666666667, "grad_norm": 0.17233766615390778, "kl": 0.02386474609375, "learning_rate": 2.6749844202354553e-06, "loss": -0.0093, "reward": 0.729166679084301, "reward_std": 0.19756478071212769, "rewards/accuracy_reward": 0.729166679084301, "rewards/format_reward": 0.0, "step": 274 }, { "completion_length": 468.2916793823242, "epoch": 0.29333333333333333, "grad_norm": 0.0835915207862854, "kl": 0.0177154541015625, "learning_rate": 2.6715014274710265e-06, "loss": -0.0076, "reward": 0.8958333432674408, "reward_std": 0.13301505148410797, "rewards/accuracy_reward": 0.8958333432674408, "rewards/format_reward": 0.0, "step": 275 }, { "completion_length": 551.7916946411133, "epoch": 0.2944, "grad_norm": 0.3406735956668854, "kl": 0.02288818359375, "learning_rate": 2.6680021647304735e-06, "loss": 0.0198, "reward": 0.8125000298023224, "reward_std": 0.21764284372329712, "rewards/accuracy_reward": 0.8125000298023224, "rewards/format_reward": 0.0, "step": 276 }, { "completion_length": 681.5208587646484, "epoch": 0.29546666666666666, "grad_norm": 0.22284318506717682, "kl": 0.0221405029296875, "learning_rate": 2.6644866806120474e-06, "loss": 0.0398, "reward": 0.5416666865348816, "reward_std": 0.3506578914821148, "rewards/accuracy_reward": 0.5416666865348816, "rewards/format_reward": 0.0, "step": 277 }, { "completion_length": 634.6250305175781, "epoch": 0.2965333333333333, "grad_norm": 0.23726777732372284, "kl": 0.0242919921875, "learning_rate": 2.6609550239392854e-06, "loss": 0.1419, "reward": 0.7500000298023224, "reward_std": 0.3332235999405384, "rewards/accuracy_reward": 0.7500000298023224, "rewards/format_reward": 0.0, "step": 278 }, { "completion_length": 604.083366394043, "epoch": 0.2976, "grad_norm": 0.15768684446811676, "kl": 0.02440643310546875, "learning_rate": 2.65740724376033e-06, "loss": 0.034, "reward": 0.7708333507180214, "reward_std": 0.1801304966211319, "rewards/accuracy_reward": 0.7708333507180214, "rewards/format_reward": 0.0, "step": 279 }, { "completion_length": 670.1667022705078, "epoch": 0.2986666666666667, "grad_norm": 0.15399956703186035, "kl": 0.0207977294921875, "learning_rate": 2.65384338934725e-06, "loss": -0.0011, "reward": 0.7916666716337204, "reward_std": 0.22155842557549477, "rewards/accuracy_reward": 0.7916666716337204, "rewards/format_reward": 0.0, "step": 280 }, { "completion_length": 570.3541793823242, "epoch": 0.29973333333333335, "grad_norm": 20.69408416748047, "kl": 0.437774658203125, "learning_rate": 2.6502635101953553e-06, "loss": 0.0744, "reward": 0.6875000149011612, "reward_std": 0.2350771203637123, "rewards/accuracy_reward": 0.6875000149011612, "rewards/format_reward": 0.0, "step": 281 }, { "completion_length": 594.2916870117188, "epoch": 0.3008, "grad_norm": 0.23112766444683075, "kl": 0.03295135498046875, "learning_rate": 2.64666765602251e-06, "loss": -0.0167, "reward": 0.833333358168602, "reward_std": 0.2686738632619381, "rewards/accuracy_reward": 0.833333358168602, "rewards/format_reward": 0.0, "step": 282 }, { "completion_length": 684.9375076293945, "epoch": 0.30186666666666667, "grad_norm": 2.3511464595794678, "kl": 0.17095947265625, "learning_rate": 2.6430558767684408e-06, "loss": 0.1247, "reward": 0.604166679084301, "reward_std": 0.28219255805015564, "rewards/accuracy_reward": 0.604166679084301, "rewards/format_reward": 0.0, "step": 283 }, { "completion_length": 524.2291793823242, "epoch": 0.30293333333333333, "grad_norm": 0.11116162687540054, "kl": 0.020782470703125, "learning_rate": 2.6394282225940447e-06, "loss": 0.069, "reward": 0.7916666865348816, "reward_std": 0.19364918768405914, "rewards/accuracy_reward": 0.7916666865348816, "rewards/format_reward": 0.0, "step": 284 }, { "completion_length": 634.7500305175781, "epoch": 0.304, "grad_norm": 0.15139205753803253, "kl": 0.036468505859375, "learning_rate": 2.6357847438806916e-06, "loss": 0.0687, "reward": 0.7083333432674408, "reward_std": 0.2861081659793854, "rewards/accuracy_reward": 0.7083333432674408, "rewards/format_reward": 0.0, "step": 285 }, { "completion_length": 721.1666870117188, "epoch": 0.30506666666666665, "grad_norm": 0.09776723384857178, "kl": 0.0411529541015625, "learning_rate": 2.6321254912295243e-06, "loss": 0.0867, "reward": 0.625, "reward_std": 0.10206207633018494, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 286 }, { "completion_length": 836.7500152587891, "epoch": 0.3061333333333333, "grad_norm": 0.43000465631484985, "kl": 0.0511474609375, "learning_rate": 2.628450515460758e-06, "loss": 0.1565, "reward": 0.5833333432674408, "reward_std": 0.3506578840315342, "rewards/accuracy_reward": 0.5833333432674408, "rewards/format_reward": 0.0, "step": 287 }, { "completion_length": 800.2500305175781, "epoch": 0.3072, "grad_norm": 0.15286049246788025, "kl": 0.046630859375, "learning_rate": 2.624759867612971e-06, "loss": 0.0192, "reward": 0.5625000260770321, "reward_std": 0.30922994762659073, "rewards/accuracy_reward": 0.5625000260770321, "rewards/format_reward": 0.0, "step": 288 }, { "completion_length": 589.2083511352539, "epoch": 0.3082666666666667, "grad_norm": 0.38929155468940735, "kl": 0.05108642578125, "learning_rate": 2.621053598942398e-06, "loss": 0.0459, "reward": 0.7708333507180214, "reward_std": 0.21764283627271652, "rewards/accuracy_reward": 0.7708333507180214, "rewards/format_reward": 0.0, "step": 289 }, { "completion_length": 892.7708587646484, "epoch": 0.30933333333333335, "grad_norm": 0.17254580557346344, "kl": 0.0424346923828125, "learning_rate": 2.617331760922218e-06, "loss": 0.1481, "reward": 0.6250000204890966, "reward_std": 0.2861081436276436, "rewards/accuracy_reward": 0.6250000204890966, "rewards/format_reward": 0.0, "step": 290 }, { "completion_length": 647.5833435058594, "epoch": 0.3104, "grad_norm": 0.46308109164237976, "kl": 0.034820556640625, "learning_rate": 2.61359440524184e-06, "loss": 0.1922, "reward": 0.8125000298023224, "reward_std": 0.28219255805015564, "rewards/accuracy_reward": 0.8125000298023224, "rewards/format_reward": 0.0, "step": 291 }, { "completion_length": 843.3333587646484, "epoch": 0.31146666666666667, "grad_norm": 2.414015054702759, "kl": 0.29534912109375, "learning_rate": 2.6098415838061832e-06, "loss": -0.005, "reward": 0.5833333358168602, "reward_std": 0.16661179810762405, "rewards/accuracy_reward": 0.5833333358168602, "rewards/format_reward": 0.0, "step": 292 }, { "completion_length": 909.2083435058594, "epoch": 0.31253333333333333, "grad_norm": 0.19494932889938354, "kl": 0.11865234375, "learning_rate": 2.6060733487349584e-06, "loss": 0.0893, "reward": 0.520833358168602, "reward_std": 0.3842546232044697, "rewards/accuracy_reward": 0.520833358168602, "rewards/format_reward": 0.0, "step": 293 }, { "completion_length": 767.8750152587891, "epoch": 0.3136, "grad_norm": 0.296319842338562, "kl": 0.130615234375, "learning_rate": 2.6022897523619424e-06, "loss": 0.0751, "reward": 0.708333358168602, "reward_std": 0.23116152361035347, "rewards/accuracy_reward": 0.708333358168602, "rewards/format_reward": 0.0, "step": 294 }, { "completion_length": 853.9167022705078, "epoch": 0.31466666666666665, "grad_norm": 0.38666868209838867, "kl": 0.08770751953125, "learning_rate": 2.598490847234253e-06, "loss": 0.0895, "reward": 0.604166692122817, "reward_std": 0.34674229845404625, "rewards/accuracy_reward": 0.604166692122817, "rewards/format_reward": 0.0, "step": 295 }, { "completion_length": 865.1041870117188, "epoch": 0.3157333333333333, "grad_norm": 0.285860151052475, "kl": 0.177001953125, "learning_rate": 2.5946766861116167e-06, "loss": 0.208, "reward": 0.416666679084301, "reward_std": 0.31314554065465927, "rewards/accuracy_reward": 0.416666679084301, "rewards/format_reward": 0.0, "step": 296 }, { "completion_length": 597.4166946411133, "epoch": 0.3168, "grad_norm": 0.39170873165130615, "kl": 0.1065673828125, "learning_rate": 2.5908473219656386e-06, "loss": 0.0312, "reward": 0.7291667014360428, "reward_std": 0.37465155497193336, "rewards/accuracy_reward": 0.7291667014360428, "rewards/format_reward": 0.0, "step": 297 }, { "completion_length": 899.1458587646484, "epoch": 0.3178666666666667, "grad_norm": 0.49817365407943726, "kl": 0.1624755859375, "learning_rate": 2.5870028079790647e-06, "loss": 0.1746, "reward": 0.6250000149011612, "reward_std": 0.4230388030409813, "rewards/accuracy_reward": 0.6250000149011612, "rewards/format_reward": 0.0, "step": 298 }, { "completion_length": 725.7708435058594, "epoch": 0.31893333333333335, "grad_norm": 0.36379411816596985, "kl": 0.190185546875, "learning_rate": 2.583143197545044e-06, "loss": 0.2246, "reward": 0.6041666939854622, "reward_std": 0.33713920786976814, "rewards/accuracy_reward": 0.6041666939854622, "rewards/format_reward": 0.0, "step": 299 }, { "completion_length": 809.2708435058594, "epoch": 0.32, "grad_norm": 0.42334237694740295, "kl": 0.298828125, "learning_rate": 2.5792685442663883e-06, "loss": 0.19, "reward": 0.479166679084301, "reward_std": 0.2446802258491516, "rewards/accuracy_reward": 0.479166679084301, "rewards/format_reward": 0.0, "step": 300 }, { "completion_length": 595.2708435058594, "epoch": 0.32106666666666667, "grad_norm": 0.40623852610588074, "kl": 0.4130859375, "learning_rate": 2.5753789019548255e-06, "loss": 0.0796, "reward": 0.4583333469927311, "reward_std": 0.3680921792984009, "rewards/accuracy_reward": 0.4583333469927311, "rewards/format_reward": 0.0, "step": 301 }, { "completion_length": 528.4791793823242, "epoch": 0.3221333333333333, "grad_norm": 0.8719145655632019, "kl": 0.383056640625, "learning_rate": 2.571474324630253e-06, "loss": 0.0473, "reward": 0.541666679084301, "reward_std": 0.4326418936252594, "rewards/accuracy_reward": 0.541666679084301, "rewards/format_reward": 0.0, "step": 302 }, { "completion_length": 700.3125076293945, "epoch": 0.3232, "grad_norm": 0.4154370129108429, "kl": 0.21954345703125, "learning_rate": 2.567554866519989e-06, "loss": 0.1027, "reward": 0.5833333432674408, "reward_std": 0.23899272456765175, "rewards/accuracy_reward": 0.5833333432674408, "rewards/format_reward": 0.0, "step": 303 }, { "completion_length": 552.5208435058594, "epoch": 0.32426666666666665, "grad_norm": 3.053973913192749, "kl": 0.780517578125, "learning_rate": 2.5636205820580173e-06, "loss": 0.1136, "reward": 0.5833333432674408, "reward_std": 0.3977733179926872, "rewards/accuracy_reward": 0.5833333432674408, "rewards/format_reward": 0.0, "step": 304 }, { "completion_length": 487.4375228881836, "epoch": 0.3253333333333333, "grad_norm": 3204.74267578125, "kl": 38.24029541015625, "learning_rate": 2.559671525884232e-06, "loss": 3.0782, "reward": 0.6250000149011612, "reward_std": 0.18404608592391014, "rewards/accuracy_reward": 0.6250000149011612, "rewards/format_reward": 0.0, "step": 305 }, { "completion_length": 546.0000305175781, "epoch": 0.3264, "grad_norm": 3.2141385078430176, "kl": 0.310882568359375, "learning_rate": 2.5557077528436792e-06, "loss": -0.0677, "reward": 0.6041666679084301, "reward_std": 0.27258947491645813, "rewards/accuracy_reward": 0.6041666679084301, "rewards/format_reward": 0.0, "step": 306 }, { "completion_length": 550.5000228881836, "epoch": 0.3274666666666667, "grad_norm": 0.801479697227478, "kl": 0.307373046875, "learning_rate": 2.551729317985795e-06, "loss": -0.0929, "reward": 0.6041666865348816, "reward_std": 0.44616059213876724, "rewards/accuracy_reward": 0.6041666865348816, "rewards/format_reward": 0.0, "step": 307 }, { "completion_length": 629.8541870117188, "epoch": 0.32853333333333334, "grad_norm": 0.3986165523529053, "kl": 0.132080078125, "learning_rate": 2.5477362765636408e-06, "loss": -0.0122, "reward": 0.645833358168602, "reward_std": 0.3720077574253082, "rewards/accuracy_reward": 0.645833358168602, "rewards/format_reward": 0.0, "step": 308 }, { "completion_length": 435.5625228881836, "epoch": 0.3296, "grad_norm": 1.774492621421814, "kl": 0.191619873046875, "learning_rate": 2.5437286840331353e-06, "loss": -0.023, "reward": 0.6666666865348816, "reward_std": 0.18404607102274895, "rewards/accuracy_reward": 0.6666666865348816, "rewards/format_reward": 0.0, "step": 309 }, { "completion_length": 440.06251525878906, "epoch": 0.33066666666666666, "grad_norm": 0.22383469343185425, "kl": 0.10833740234375, "learning_rate": 2.539706596052286e-06, "loss": -0.04, "reward": 0.770833358168602, "reward_std": 0.2446802221238613, "rewards/accuracy_reward": 0.770833358168602, "rewards/format_reward": 0.0, "step": 310 }, { "completion_length": 511.5833511352539, "epoch": 0.3317333333333333, "grad_norm": 0.31802231073379517, "kl": 0.195068359375, "learning_rate": 2.535670068480414e-06, "loss": -0.0554, "reward": 0.6041666865348816, "reward_std": 0.21764283254742622, "rewards/accuracy_reward": 0.6041666865348816, "rewards/format_reward": 0.0, "step": 311 }, { "completion_length": 473.2708435058594, "epoch": 0.3328, "grad_norm": 0.21668870747089386, "kl": 0.107177734375, "learning_rate": 2.531619157377382e-06, "loss": -0.0125, "reward": 0.7083333507180214, "reward_std": 0.24859582632780075, "rewards/accuracy_reward": 0.7083333507180214, "rewards/format_reward": 0.0, "step": 312 }, { "completion_length": 384.18750762939453, "epoch": 0.33386666666666664, "grad_norm": 0.8326098322868347, "kl": 0.308349609375, "learning_rate": 2.5275539190028104e-06, "loss": -0.1279, "reward": 0.7500000298023224, "reward_std": 0.2957112565636635, "rewards/accuracy_reward": 0.7500000298023224, "rewards/format_reward": 0.0, "step": 313 }, { "completion_length": 358.37501525878906, "epoch": 0.33493333333333336, "grad_norm": 0.9355361461639404, "kl": 0.15020751953125, "learning_rate": 2.5234744098153e-06, "loss": -0.0259, "reward": 0.895833358168602, "reward_std": 0.1801304966211319, "rewards/accuracy_reward": 0.895833358168602, "rewards/format_reward": 0.0, "step": 314 }, { "completion_length": 490.3541946411133, "epoch": 0.336, "grad_norm": 0.28254133462905884, "kl": 0.13427734375, "learning_rate": 2.5193806864716466e-06, "loss": -0.0532, "reward": 0.7291666865348816, "reward_std": 0.1530931070446968, "rewards/accuracy_reward": 0.7291666865348816, "rewards/format_reward": 0.0, "step": 315 }, { "completion_length": 518.3333511352539, "epoch": 0.3370666666666667, "grad_norm": 1.7725809812545776, "kl": 0.271575927734375, "learning_rate": 2.5152728058260543e-06, "loss": -0.0418, "reward": 0.7083333432674408, "reward_std": 0.3776952736079693, "rewards/accuracy_reward": 0.7083333432674408, "rewards/format_reward": 0.0, "step": 316 }, { "completion_length": 519.1666946411133, "epoch": 0.33813333333333334, "grad_norm": 0.4923071563243866, "kl": 0.10223388671875, "learning_rate": 2.5111508249293456e-06, "loss": -0.0763, "reward": 0.770833358168602, "reward_std": 0.1801304928958416, "rewards/accuracy_reward": 0.770833358168602, "rewards/format_reward": 0.0, "step": 317 }, { "completion_length": 409.12501525878906, "epoch": 0.3392, "grad_norm": 1.416187047958374, "kl": 0.29833984375, "learning_rate": 2.507014801028169e-06, "loss": -0.053, "reward": 0.8750000298023224, "reward_std": 0.23116152733564377, "rewards/accuracy_reward": 0.8750000298023224, "rewards/format_reward": 0.0, "step": 318 }, { "completion_length": 653.0000228881836, "epoch": 0.34026666666666666, "grad_norm": 1.7095825672149658, "kl": 0.52587890625, "learning_rate": 2.502864791564205e-06, "loss": 0.0318, "reward": 0.43750001303851604, "reward_std": 0.33713920786976814, "rewards/accuracy_reward": 0.43750001303851604, "rewards/format_reward": 0.0, "step": 319 }, { "completion_length": 557.1458511352539, "epoch": 0.3413333333333333, "grad_norm": 0.5773310661315918, "kl": 0.22314453125, "learning_rate": 2.4987008541733663e-06, "loss": 0.0049, "reward": 0.5833333507180214, "reward_std": 0.3332235924899578, "rewards/accuracy_reward": 0.5833333507180214, "rewards/format_reward": 0.0, "step": 320 }, { "completion_length": 522.8958435058594, "epoch": 0.3424, "grad_norm": 0.7010601162910461, "kl": 0.206787109375, "learning_rate": 2.494523046685e-06, "loss": -0.0008, "reward": 0.7291666716337204, "reward_std": 0.1801304891705513, "rewards/accuracy_reward": 0.7291666716337204, "rewards/format_reward": 0.0, "step": 321 }, { "completion_length": 571.4166870117188, "epoch": 0.34346666666666664, "grad_norm": 0.9590763449668884, "kl": 0.238525390625, "learning_rate": 2.4903314271210824e-06, "loss": 0.0624, "reward": 0.7500000149011612, "reward_std": 0.45271995663642883, "rewards/accuracy_reward": 0.7500000149011612, "rewards/format_reward": 0.0, "step": 322 }, { "completion_length": 560.1250076293945, "epoch": 0.34453333333333336, "grad_norm": 1.436085820198059, "kl": 0.2183837890625, "learning_rate": 2.486126053695414e-06, "loss": -0.0305, "reward": 0.6250000204890966, "reward_std": 0.23116153478622437, "rewards/accuracy_reward": 0.6250000204890966, "rewards/format_reward": 0.0, "step": 323 }, { "completion_length": 668.7916870117188, "epoch": 0.3456, "grad_norm": 1.4011485576629639, "kl": 0.732421875, "learning_rate": 2.48190698481281e-06, "loss": -0.0285, "reward": 0.6250000298023224, "reward_std": 0.3332236036658287, "rewards/accuracy_reward": 0.6250000298023224, "rewards/format_reward": 0.0, "step": 324 }, { "completion_length": 708.6875152587891, "epoch": 0.3466666666666667, "grad_norm": 0.8237535357475281, "kl": 0.524169921875, "learning_rate": 2.477674279068291e-06, "loss": 0.025, "reward": 0.4583333432674408, "reward_std": 0.20148035883903503, "rewards/accuracy_reward": 0.4583333432674408, "rewards/format_reward": 0.0, "step": 325 }, { "completion_length": 504.85418701171875, "epoch": 0.34773333333333334, "grad_norm": 1.6992197036743164, "kl": 0.484130859375, "learning_rate": 2.473427995246269e-06, "loss": -0.1224, "reward": 0.7083333507180214, "reward_std": 0.30354245379567146, "rewards/accuracy_reward": 0.7083333507180214, "rewards/format_reward": 0.0, "step": 326 }, { "completion_length": 521.8750076293945, "epoch": 0.3488, "grad_norm": 1.1015336513519287, "kl": 0.191680908203125, "learning_rate": 2.4691681923197277e-06, "loss": -0.0298, "reward": 0.9166666716337204, "reward_std": 0.11949635669589043, "rewards/accuracy_reward": 0.9166666716337204, "rewards/format_reward": 0.0, "step": 327 }, { "completion_length": 564.8125152587891, "epoch": 0.34986666666666666, "grad_norm": 0.7968987226486206, "kl": 0.4095458984375, "learning_rate": 2.464894929449408e-06, "loss": 0.0019, "reward": 0.6875000074505806, "reward_std": 0.34674228355288506, "rewards/accuracy_reward": 0.6875000074505806, "rewards/format_reward": 0.0, "step": 328 }, { "completion_length": 697.5417022705078, "epoch": 0.3509333333333333, "grad_norm": 0.3374710977077484, "kl": 0.2398681640625, "learning_rate": 2.460608265982985e-06, "loss": -0.0275, "reward": 0.5208333507180214, "reward_std": 0.2350771315395832, "rewards/accuracy_reward": 0.5208333507180214, "rewards/format_reward": 0.0, "step": 329 }, { "completion_length": 564.2916870117188, "epoch": 0.352, "grad_norm": 0.6989172101020813, "kl": 0.3328857421875, "learning_rate": 2.4563082614542412e-06, "loss": -0.0221, "reward": 0.8333333432674408, "reward_std": 0.24859581515192986, "rewards/accuracy_reward": 0.8333333432674408, "rewards/format_reward": 0.0, "step": 330 }, { "completion_length": 730.0625305175781, "epoch": 0.35306666666666664, "grad_norm": 0.15640997886657715, "kl": 0.018341064453125, "learning_rate": 2.4519949755822433e-06, "loss": 0.1364, "reward": 0.8750000149011612, "reward_std": 0.23116152733564377, "rewards/accuracy_reward": 0.8750000149011612, "rewards/format_reward": 0.0, "step": 331 }, { "completion_length": 741.5208435058594, "epoch": 0.35413333333333336, "grad_norm": 0.3140040636062622, "kl": 0.2025909423828125, "learning_rate": 2.447668468270509e-06, "loss": 0.012, "reward": 0.6666666865348816, "reward_std": 0.25819891691207886, "rewards/accuracy_reward": 0.6666666865348816, "rewards/format_reward": 0.0, "step": 332 }, { "completion_length": 596.1041870117188, "epoch": 0.3552, "grad_norm": 1.0607224702835083, "kl": 0.175048828125, "learning_rate": 2.44332879960618e-06, "loss": -0.0819, "reward": 0.7291666716337204, "reward_std": 0.235077116638422, "rewards/accuracy_reward": 0.7291666716337204, "rewards/format_reward": 0.0, "step": 333 }, { "completion_length": 614.8750228881836, "epoch": 0.3562666666666667, "grad_norm": 0.2515202760696411, "kl": 0.0470123291015625, "learning_rate": 2.4389760298591824e-06, "loss": 0.0915, "reward": 0.7500000149011612, "reward_std": 0.31314554810523987, "rewards/accuracy_reward": 0.7500000149011612, "rewards/format_reward": 0.0, "step": 334 }, { "completion_length": 622.3541946411133, "epoch": 0.35733333333333334, "grad_norm": 1.5843651294708252, "kl": 0.643646240234375, "learning_rate": 2.4346102194813937e-06, "loss": 0.0357, "reward": 0.8125000149011612, "reward_std": 0.2900237515568733, "rewards/accuracy_reward": 0.8125000149011612, "rewards/format_reward": 0.0, "step": 335 }, { "completion_length": 478.1250228881836, "epoch": 0.3584, "grad_norm": 1.2630350589752197, "kl": 0.234466552734375, "learning_rate": 2.4302314291058004e-06, "loss": 0.1113, "reward": 0.8333333432674408, "reward_std": 0.10206207260489464, "rewards/accuracy_reward": 0.8333333432674408, "rewards/format_reward": 0.0, "step": 336 }, { "completion_length": 549.5833435058594, "epoch": 0.35946666666666666, "grad_norm": 0.6331356167793274, "kl": 0.1217803955078125, "learning_rate": 2.4258397195456573e-06, "loss": 0.0583, "reward": 0.9583333432674408, "reward_std": 0.10206207260489464, "rewards/accuracy_reward": 0.9583333432674408, "rewards/format_reward": 0.0, "step": 337 }, { "completion_length": 519.9791793823242, "epoch": 0.3605333333333333, "grad_norm": 0.09054915606975555, "kl": 0.03369140625, "learning_rate": 2.4214351517936423e-06, "loss": 0.0036, "reward": 0.9375000149011612, "reward_std": 0.11558076366782188, "rewards/accuracy_reward": 0.9375000149011612, "rewards/format_reward": 0.0, "step": 338 }, { "completion_length": 567.3541870117188, "epoch": 0.3616, "grad_norm": 0.43106403946876526, "kl": 0.0825347900390625, "learning_rate": 2.4170177870210112e-06, "loss": 0.017, "reward": 0.9166666716337204, "reward_std": 0.11949635669589043, "rewards/accuracy_reward": 0.9166666716337204, "rewards/format_reward": 0.0, "step": 339 }, { "completion_length": 514.8333435058594, "epoch": 0.3626666666666667, "grad_norm": 0.07967344671487808, "kl": 0.0357513427734375, "learning_rate": 2.4125876865767443e-06, "loss": -0.045, "reward": 0.8333333432674408, "reward_std": 0.18404608964920044, "rewards/accuracy_reward": 0.8333333432674408, "rewards/format_reward": 0.0, "step": 340 }, { "completion_length": 782.3750457763672, "epoch": 0.36373333333333335, "grad_norm": 0.8787924647331238, "kl": 0.353240966796875, "learning_rate": 2.4081449119866983e-06, "loss": 0.0107, "reward": 0.7500000149011612, "reward_std": 0.23116152361035347, "rewards/accuracy_reward": 0.7500000149011612, "rewards/format_reward": 0.0, "step": 341 }, { "completion_length": 501.31250762939453, "epoch": 0.3648, "grad_norm": 0.42713209986686707, "kl": 0.132110595703125, "learning_rate": 2.40368952495275e-06, "loss": 0.0103, "reward": 0.9375, "reward_std": 0.06846532225608826, "rewards/accuracy_reward": 0.9375, "rewards/format_reward": 0.0, "step": 342 }, { "completion_length": 491.8958435058594, "epoch": 0.3658666666666667, "grad_norm": 0.04763878509402275, "kl": 0.0247039794921875, "learning_rate": 2.399221587351939e-06, "loss": 0.0036, "reward": 0.9791666716337204, "reward_std": 0.05103103816509247, "rewards/accuracy_reward": 0.9791666716337204, "rewards/format_reward": 0.0, "step": 343 }, { "completion_length": 653.9583435058594, "epoch": 0.36693333333333333, "grad_norm": 0.038816187530756, "kl": 0.0173492431640625, "learning_rate": 2.3947411612356092e-06, "loss": 0.0238, "reward": 0.6875000149011612, "reward_std": 0.11558076366782188, "rewards/accuracy_reward": 0.6875000149011612, "rewards/format_reward": 0.0, "step": 344 }, { "completion_length": 679.5000076293945, "epoch": 0.368, "grad_norm": 0.2047126442193985, "kl": 0.049591064453125, "learning_rate": 2.390248308828548e-06, "loss": 0.0004, "reward": 0.8541666716337204, "reward_std": 0.05103103443980217, "rewards/accuracy_reward": 0.8541666716337204, "rewards/format_reward": 0.0, "step": 345 }, { "completion_length": 613.2916870117188, "epoch": 0.36906666666666665, "grad_norm": 0.1377575546503067, "kl": 0.03253173828125, "learning_rate": 2.3857430925281186e-06, "loss": 0.0381, "reward": 0.8750000149011612, "reward_std": 0.16661180183291435, "rewards/accuracy_reward": 0.8750000149011612, "rewards/format_reward": 0.0, "step": 346 }, { "completion_length": 474.8958511352539, "epoch": 0.3701333333333333, "grad_norm": 0.2331426441669464, "kl": 0.051910400390625, "learning_rate": 2.3812255749033975e-06, "loss": 0.0265, "reward": 0.9791666716337204, "reward_std": 0.05103103443980217, "rewards/accuracy_reward": 0.9791666716337204, "rewards/format_reward": 0.0, "step": 347 }, { "completion_length": 552.2500228881836, "epoch": 0.3712, "grad_norm": 3.375819206237793, "kl": 0.715484619140625, "learning_rate": 2.3766958186943022e-06, "loss": 0.0245, "reward": 0.75, "reward_std": 0.18404608219861984, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "step": 348 }, { "completion_length": 547.0625228881836, "epoch": 0.3722666666666667, "grad_norm": 0.21201248466968536, "kl": 0.074920654296875, "learning_rate": 2.3721538868107225e-06, "loss": 0.0594, "reward": 0.833333358168602, "reward_std": 0.3236205093562603, "rewards/accuracy_reward": 0.833333358168602, "rewards/format_reward": 0.0, "step": 349 }, { "completion_length": 577.8541870117188, "epoch": 0.37333333333333335, "grad_norm": 0.29011285305023193, "kl": 0.3007659912109375, "learning_rate": 2.367599842331646e-06, "loss": 0.0067, "reward": 0.9166666716337204, "reward_std": 0.11949635669589043, "rewards/accuracy_reward": 0.9166666716337204, "rewards/format_reward": 0.0, "step": 350 }, { "completion_length": 576.8125228881836, "epoch": 0.3744, "grad_norm": 0.16102126240730286, "kl": 0.026397705078125, "learning_rate": 2.3630337485042807e-06, "loss": -0.0094, "reward": 0.7500000149011612, "reward_std": 0.26603008806705475, "rewards/accuracy_reward": 0.7500000149011612, "rewards/format_reward": 0.0, "step": 351 }, { "completion_length": 764.5625305175781, "epoch": 0.37546666666666667, "grad_norm": 1.7207412719726562, "kl": 0.0829620361328125, "learning_rate": 2.3584556687431787e-06, "loss": 0.0222, "reward": 0.6666666716337204, "reward_std": 0.22155842930078506, "rewards/accuracy_reward": 0.6666666716337204, "rewards/format_reward": 0.0, "step": 352 }, { "completion_length": 557.8958587646484, "epoch": 0.37653333333333333, "grad_norm": 18.228918075561523, "kl": 5.29864501953125, "learning_rate": 2.3538656666293525e-06, "loss": 0.0635, "reward": 0.9166666716337204, "reward_std": 0.11949636042118073, "rewards/accuracy_reward": 0.9166666716337204, "rewards/format_reward": 0.0, "step": 353 }, { "completion_length": 611.4375152587891, "epoch": 0.3776, "grad_norm": 0.3171495497226715, "kl": 0.0928192138671875, "learning_rate": 2.3492638059093957e-06, "loss": -0.0242, "reward": 0.7916666865348816, "reward_std": 0.23116152733564377, "rewards/accuracy_reward": 0.7916666865348816, "rewards/format_reward": 0.0, "step": 354 }, { "completion_length": 548.6458435058594, "epoch": 0.37866666666666665, "grad_norm": 6.29301643371582, "kl": 0.81048583984375, "learning_rate": 2.344650150494596e-06, "loss": 0.1014, "reward": 0.7500000149011612, "reward_std": 0.23116151988506317, "rewards/accuracy_reward": 0.7500000149011612, "rewards/format_reward": 0.0, "step": 355 }, { "completion_length": 625.9583511352539, "epoch": 0.3797333333333333, "grad_norm": 0.07980922609567642, "kl": 0.0118560791015625, "learning_rate": 2.340024764460046e-06, "loss": 0.0533, "reward": 0.9166666716337204, "reward_std": 0.11949636787176132, "rewards/accuracy_reward": 0.9166666716337204, "rewards/format_reward": 0.0, "step": 356 }, { "completion_length": 615.7500228881836, "epoch": 0.3808, "grad_norm": 5.467812538146973, "kl": 0.6986541748046875, "learning_rate": 2.3353877120437565e-06, "loss": 0.1259, "reward": 0.9375000149011612, "reward_std": 0.11558076366782188, "rewards/accuracy_reward": 0.9375000149011612, "rewards/format_reward": 0.0, "step": 357 }, { "completion_length": 799.0000076293945, "epoch": 0.3818666666666667, "grad_norm": 0.43769994378089905, "kl": 0.2205810546875, "learning_rate": 2.330739057645761e-06, "loss": 0.0492, "reward": 0.5416666716337204, "reward_std": 0.12909945845603943, "rewards/accuracy_reward": 0.5416666716337204, "rewards/format_reward": 0.0, "step": 358 }, { "completion_length": 550.0625228881836, "epoch": 0.38293333333333335, "grad_norm": 4.504573345184326, "kl": 0.39459228515625, "learning_rate": 2.3260788658272246e-06, "loss": 0.1499, "reward": 0.8333333432674408, "reward_std": 0.24859581142663956, "rewards/accuracy_reward": 0.8333333432674408, "rewards/format_reward": 0.0, "step": 359 }, { "completion_length": 669.6458587646484, "epoch": 0.384, "grad_norm": 2.0192832946777344, "kl": 0.4339599609375, "learning_rate": 2.3214072013095436e-06, "loss": 0.0573, "reward": 0.6875, "reward_std": 0.29962684214115143, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 360 }, { "completion_length": 636.0625305175781, "epoch": 0.38506666666666667, "grad_norm": 0.13703946769237518, "kl": 0.0513458251953125, "learning_rate": 2.3167241289734514e-06, "loss": -0.0135, "reward": 0.7500000149011612, "reward_std": 0.31314554065465927, "rewards/accuracy_reward": 0.7500000149011612, "rewards/format_reward": 0.0, "step": 361 }, { "completion_length": 537.4583587646484, "epoch": 0.38613333333333333, "grad_norm": 0.5447300672531128, "kl": 0.04064178466796875, "learning_rate": 2.312029713858112e-06, "loss": -0.0077, "reward": 0.8750000149011612, "reward_std": 0.12909945845603943, "rewards/accuracy_reward": 0.8750000149011612, "rewards/format_reward": 0.0, "step": 362 }, { "completion_length": 623.2291793823242, "epoch": 0.3872, "grad_norm": 0.20528902113437653, "kl": 0.067138671875, "learning_rate": 2.307324021160222e-06, "loss": 0.0422, "reward": 0.6458333507180214, "reward_std": 0.2350771240890026, "rewards/accuracy_reward": 0.6458333507180214, "rewards/format_reward": 0.0, "step": 363 }, { "completion_length": 625.8333511352539, "epoch": 0.38826666666666665, "grad_norm": 0.33284875750541687, "kl": 0.130035400390625, "learning_rate": 2.302607116233101e-06, "loss": 0.0349, "reward": 0.708333358168602, "reward_std": 0.20412414520978928, "rewards/accuracy_reward": 0.708333358168602, "rewards/format_reward": 0.0, "step": 364 }, { "completion_length": 549.7916793823242, "epoch": 0.3893333333333333, "grad_norm": 0.2864645719528198, "kl": 0.1048431396484375, "learning_rate": 2.2978790645857867e-06, "loss": 0.0111, "reward": 0.8750000149011612, "reward_std": 0.18404608592391014, "rewards/accuracy_reward": 0.8750000149011612, "rewards/format_reward": 0.0, "step": 365 }, { "completion_length": 592.8333511352539, "epoch": 0.3904, "grad_norm": 6.874788761138916, "kl": 0.919219970703125, "learning_rate": 2.293139931882123e-06, "loss": 0.0664, "reward": 0.7291666865348816, "reward_std": 0.1530931070446968, "rewards/accuracy_reward": 0.7291666865348816, "rewards/format_reward": 0.0, "step": 366 }, { "completion_length": 792.8125305175781, "epoch": 0.3914666666666667, "grad_norm": 0.25127074122428894, "kl": 0.07236480712890625, "learning_rate": 2.28838978393985e-06, "loss": 0.0223, "reward": 0.6666666865348816, "reward_std": 0.23116152733564377, "rewards/accuracy_reward": 0.6666666865348816, "rewards/format_reward": 0.0, "step": 367 }, { "completion_length": 554.8958587646484, "epoch": 0.39253333333333335, "grad_norm": 0.24670687317848206, "kl": 0.0403900146484375, "learning_rate": 2.2836286867296872e-06, "loss": 0.0919, "reward": 0.8958333432674408, "reward_std": 0.1705274060368538, "rewards/accuracy_reward": 0.8958333432674408, "rewards/format_reward": 0.0, "step": 368 }, { "completion_length": 601.8125228881836, "epoch": 0.3936, "grad_norm": 0.2513871490955353, "kl": 0.07169342041015625, "learning_rate": 2.278856706374422e-06, "loss": 0.0226, "reward": 0.6458333358168602, "reward_std": 0.1801304891705513, "rewards/accuracy_reward": 0.6458333358168602, "rewards/format_reward": 0.0, "step": 369 }, { "completion_length": 744.0, "epoch": 0.39466666666666667, "grad_norm": 4.767421245574951, "kl": 1.105499267578125, "learning_rate": 2.274073909147986e-06, "loss": 0.1005, "reward": 0.6458333432674408, "reward_std": 0.1801304966211319, "rewards/accuracy_reward": 0.6458333432674408, "rewards/format_reward": 0.0, "step": 370 }, { "completion_length": 638.5416870117188, "epoch": 0.3957333333333333, "grad_norm": 1.0954631567001343, "kl": 0.4050140380859375, "learning_rate": 2.2692803614745386e-06, "loss": 0.0678, "reward": 0.6875000223517418, "reward_std": 0.3092299550771713, "rewards/accuracy_reward": 0.6875000223517418, "rewards/format_reward": 0.0, "step": 371 }, { "completion_length": 635.5208435058594, "epoch": 0.3968, "grad_norm": 0.6578418612480164, "kl": 0.217529296875, "learning_rate": 2.264476129927541e-06, "loss": 0.0578, "reward": 0.6041666828095913, "reward_std": 0.21764284372329712, "rewards/accuracy_reward": 0.6041666828095913, "rewards/format_reward": 0.0, "step": 372 }, { "completion_length": 589.2708511352539, "epoch": 0.39786666666666665, "grad_norm": 1.2886306047439575, "kl": 0.0250396728515625, "learning_rate": 2.259661281228836e-06, "loss": 0.0638, "reward": 0.8750000298023224, "reward_std": 0.2686738707125187, "rewards/accuracy_reward": 0.8750000298023224, "rewards/format_reward": 0.0, "step": 373 }, { "completion_length": 686.0833435058594, "epoch": 0.3989333333333333, "grad_norm": 0.30835169553756714, "kl": 0.14544677734375, "learning_rate": 2.254835882247716e-06, "loss": -0.0131, "reward": 0.5208333395421505, "reward_std": 0.11558076739311218, "rewards/accuracy_reward": 0.5208333395421505, "rewards/format_reward": 0.0, "step": 374 }, { "completion_length": 655.0208511352539, "epoch": 0.4, "grad_norm": 0.1555933803319931, "kl": 0.052520751953125, "learning_rate": 2.25e-06, "loss": 0.0061, "reward": 0.7291666716337204, "reward_std": 0.2699456959962845, "rewards/accuracy_reward": 0.7291666716337204, "rewards/format_reward": 0.0, "step": 375 }, { "completion_length": 613.2916870117188, "epoch": 0.4010666666666667, "grad_norm": 0.24230921268463135, "kl": 0.0810546875, "learning_rate": 2.245153701647099e-06, "loss": 0.0101, "reward": 0.8125000149011612, "reward_std": 0.11558076366782188, "rewards/accuracy_reward": 0.8125000149011612, "rewards/format_reward": 0.0, "step": 376 }, { "completion_length": 466.75, "epoch": 0.40213333333333334, "grad_norm": 0.3571372330188751, "kl": 0.121002197265625, "learning_rate": 2.2402970544950836e-06, "loss": -0.0626, "reward": 0.833333358168602, "reward_std": 0.23116153106093407, "rewards/accuracy_reward": 0.833333358168602, "rewards/format_reward": 0.0, "step": 377 }, { "completion_length": 493.5208511352539, "epoch": 0.4032, "grad_norm": 0.1839544177055359, "kl": 0.0637664794921875, "learning_rate": 2.23543012599375e-06, "loss": 0.0071, "reward": 0.7500000149011612, "reward_std": 0.18404609709978104, "rewards/accuracy_reward": 0.7500000149011612, "rewards/format_reward": 0.0, "step": 378 }, { "completion_length": 624.6250228881836, "epoch": 0.40426666666666666, "grad_norm": 0.6626420021057129, "kl": 0.041015625, "learning_rate": 2.230552983735686e-06, "loss": 0.0538, "reward": 0.8541666865348816, "reward_std": 0.1530931107699871, "rewards/accuracy_reward": 0.8541666865348816, "rewards/format_reward": 0.0, "step": 379 }, { "completion_length": 677.6875152587891, "epoch": 0.4053333333333333, "grad_norm": 0.8058792352676392, "kl": 0.1177825927734375, "learning_rate": 2.225665695455325e-06, "loss": -0.0416, "reward": 0.729166679084301, "reward_std": 0.1801304928958416, "rewards/accuracy_reward": 0.729166679084301, "rewards/format_reward": 0.0, "step": 380 }, { "completion_length": 517.8333435058594, "epoch": 0.4064, "grad_norm": 0.7754347324371338, "kl": 0.3004150390625, "learning_rate": 2.220768329028013e-06, "loss": 0.0056, "reward": 0.8750000149011612, "reward_std": 0.22155842557549477, "rewards/accuracy_reward": 0.8750000149011612, "rewards/format_reward": 0.0, "step": 381 }, { "completion_length": 459.18750762939453, "epoch": 0.40746666666666664, "grad_norm": 0.17720593512058258, "kl": 0.1490478515625, "learning_rate": 2.2158609524690615e-06, "loss": -0.0009, "reward": 0.8958333432674408, "reward_std": 0.1705273948609829, "rewards/accuracy_reward": 0.8958333432674408, "rewards/format_reward": 0.0, "step": 382 }, { "completion_length": 728.6250152587891, "epoch": 0.40853333333333336, "grad_norm": 0.41954395174980164, "kl": 0.29119873046875, "learning_rate": 2.210943633932805e-06, "loss": -0.0105, "reward": 0.6875000149011612, "reward_std": 0.2900237552821636, "rewards/accuracy_reward": 0.6875000149011612, "rewards/format_reward": 0.0, "step": 383 }, { "completion_length": 518.6041793823242, "epoch": 0.4096, "grad_norm": 0.144319087266922, "kl": 0.0495452880859375, "learning_rate": 2.206016441711652e-06, "loss": 0.0107, "reward": 0.8541666865348816, "reward_std": 0.1530931070446968, "rewards/accuracy_reward": 0.8541666865348816, "rewards/format_reward": 0.0, "step": 384 }, { "completion_length": 447.0416793823242, "epoch": 0.4106666666666667, "grad_norm": 0.27949658036231995, "kl": 0.0768585205078125, "learning_rate": 2.20107944423514e-06, "loss": 0.0077, "reward": 0.791666679084301, "reward_std": 0.22155844420194626, "rewards/accuracy_reward": 0.791666679084301, "rewards/format_reward": 0.0, "step": 385 }, { "completion_length": 423.54168701171875, "epoch": 0.41173333333333334, "grad_norm": 3.5651707649230957, "kl": 0.59869384765625, "learning_rate": 2.1961327100689823e-06, "loss": 0.0638, "reward": 0.8541666716337204, "reward_std": 0.1530931070446968, "rewards/accuracy_reward": 0.8541666716337204, "rewards/format_reward": 0.0, "step": 386 }, { "completion_length": 510.10418701171875, "epoch": 0.4128, "grad_norm": 0.6789741516113281, "kl": 0.30023193359375, "learning_rate": 2.1911763079141163e-06, "loss": 0.0564, "reward": 0.583333358168602, "reward_std": 0.2861081659793854, "rewards/accuracy_reward": 0.583333358168602, "rewards/format_reward": 0.0, "step": 387 }, { "completion_length": 470.45835876464844, "epoch": 0.41386666666666666, "grad_norm": 0.9891305565834045, "kl": 0.154266357421875, "learning_rate": 2.1862103066057508e-06, "loss": -0.0513, "reward": 0.7291666865348816, "reward_std": 0.27258947864174843, "rewards/accuracy_reward": 0.7291666865348816, "rewards/format_reward": 0.0, "step": 388 }, { "completion_length": 542.6875152587891, "epoch": 0.4149333333333333, "grad_norm": 4.347565174102783, "kl": 0.52301025390625, "learning_rate": 2.1812347751124072e-06, "loss": 0.0038, "reward": 0.6458333358168602, "reward_std": 0.13301505148410797, "rewards/accuracy_reward": 0.6458333358168602, "rewards/format_reward": 0.0, "step": 389 }, { "completion_length": 552.125, "epoch": 0.416, "grad_norm": 0.6563839912414551, "kl": 0.2119140625, "learning_rate": 2.1762497825349665e-06, "loss": -0.0101, "reward": 0.6458333544433117, "reward_std": 0.3092299550771713, "rewards/accuracy_reward": 0.6458333544433117, "rewards/format_reward": 0.0, "step": 390 }, { "completion_length": 574.7708587646484, "epoch": 0.41706666666666664, "grad_norm": 0.5116036534309387, "kl": 0.422607421875, "learning_rate": 2.171255398105703e-06, "loss": 0.0538, "reward": 0.27083334140479565, "reward_std": 0.28219255805015564, "rewards/accuracy_reward": 0.27083334140479565, "rewards/format_reward": 0.0, "step": 391 }, { "completion_length": 599.0416717529297, "epoch": 0.41813333333333336, "grad_norm": 6.512560844421387, "kl": 2.5517578125, "learning_rate": 2.166251691187329e-06, "loss": -0.073, "reward": 0.6041666865348816, "reward_std": 0.49327602982521057, "rewards/accuracy_reward": 0.6041666865348816, "rewards/format_reward": 0.0, "step": 392 }, { "completion_length": 559.2083435058594, "epoch": 0.4192, "grad_norm": 1.7538548707962036, "kl": 0.6044921875, "learning_rate": 2.1612387312720286e-06, "loss": -0.111, "reward": 0.5625000223517418, "reward_std": 0.28219255805015564, "rewards/accuracy_reward": 0.5625000223517418, "rewards/format_reward": 0.0, "step": 393 }, { "completion_length": 385.5208511352539, "epoch": 0.4202666666666667, "grad_norm": 2.3734023571014404, "kl": 0.7052001953125, "learning_rate": 2.156216587980491e-06, "loss": 0.0271, "reward": 0.47916669212281704, "reward_std": 0.33713918551802635, "rewards/accuracy_reward": 0.47916669212281704, "rewards/format_reward": 0.0, "step": 394 }, { "completion_length": 476.0625228881836, "epoch": 0.42133333333333334, "grad_norm": 3.187304735183716, "kl": 0.3585205078125, "learning_rate": 2.1511853310609467e-06, "loss": -0.0051, "reward": 0.5625000111758709, "reward_std": 0.2350771240890026, "rewards/accuracy_reward": 0.5625000111758709, "rewards/format_reward": 0.0, "step": 395 }, { "completion_length": 425.43750762939453, "epoch": 0.4224, "grad_norm": 2.695509910583496, "kl": 0.94970703125, "learning_rate": 2.146145030388198e-06, "loss": -0.0234, "reward": 0.625, "reward_std": 0.1369306445121765, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 396 }, { "completion_length": 534.8125076293945, "epoch": 0.42346666666666666, "grad_norm": 1.7001129388809204, "kl": 0.262451171875, "learning_rate": 2.141095755962647e-06, "loss": -0.1495, "reward": 0.6666666716337204, "reward_std": 0.2957112528383732, "rewards/accuracy_reward": 0.6666666716337204, "rewards/format_reward": 0.0, "step": 397 }, { "completion_length": 459.8333511352539, "epoch": 0.4245333333333333, "grad_norm": 0.9032135009765625, "kl": 0.29150390625, "learning_rate": 2.1360375779093257e-06, "loss": -0.0025, "reward": 0.7291666865348816, "reward_std": 0.2350771352648735, "rewards/accuracy_reward": 0.7291666865348816, "rewards/format_reward": 0.0, "step": 398 }, { "completion_length": 505.5833511352539, "epoch": 0.4256, "grad_norm": 0.548815131187439, "kl": 0.3984375, "learning_rate": 2.1309705664769195e-06, "loss": 0.0059, "reward": 0.3958333395421505, "reward_std": 0.21764282882213593, "rewards/accuracy_reward": 0.3958333395421505, "rewards/format_reward": 0.0, "step": 399 }, { "completion_length": 571.0416870117188, "epoch": 0.4266666666666667, "grad_norm": 1.3241275548934937, "kl": 0.587158203125, "learning_rate": 2.1258947920367943e-06, "loss": -0.0431, "reward": 0.29166667349636555, "reward_std": 0.3776952587068081, "rewards/accuracy_reward": 0.29166667349636555, "rewards/format_reward": 0.0, "step": 400 }, { "completion_length": 387.43751525878906, "epoch": 0.42773333333333335, "grad_norm": 69.63048553466797, "kl": 1.656982421875, "learning_rate": 2.120810325082017e-06, "loss": -0.0062, "reward": 0.6875000149011612, "reward_std": 0.38161084800958633, "rewards/accuracy_reward": 0.6875000149011612, "rewards/format_reward": 0.0, "step": 401 }, { "completion_length": 464.18751525878906, "epoch": 0.4288, "grad_norm": 0.7559735774993896, "kl": 0.167724609375, "learning_rate": 2.1157172362263782e-06, "loss": -0.0162, "reward": 0.7916666865348816, "reward_std": 0.23116152733564377, "rewards/accuracy_reward": 0.7916666865348816, "rewards/format_reward": 0.0, "step": 402 }, { "completion_length": 592.3750152587891, "epoch": 0.4298666666666667, "grad_norm": 0.5397282242774963, "kl": 0.20556640625, "learning_rate": 2.1106155962034103e-06, "loss": 0.0643, "reward": 0.479166679084301, "reward_std": 0.2900237739086151, "rewards/accuracy_reward": 0.479166679084301, "rewards/format_reward": 0.0, "step": 403 }, { "completion_length": 582.0416870117188, "epoch": 0.43093333333333333, "grad_norm": 6.082925319671631, "kl": 0.3802490234375, "learning_rate": 2.1055054758654056e-06, "loss": -0.172, "reward": 0.4791666716337204, "reward_std": 0.43655748665332794, "rewards/accuracy_reward": 0.4791666716337204, "rewards/format_reward": 0.0, "step": 404 }, { "completion_length": 392.6041793823242, "epoch": 0.432, "grad_norm": 1.5327900648117065, "kl": 0.3214111328125, "learning_rate": 2.100386946182431e-06, "loss": -0.0483, "reward": 0.5000000111758709, "reward_std": 0.23116152361035347, "rewards/accuracy_reward": 0.5000000111758709, "rewards/format_reward": 0.0, "step": 405 }, { "completion_length": 531.3125076293945, "epoch": 0.43306666666666666, "grad_norm": 6.009570121765137, "kl": 1.4208984375, "learning_rate": 2.0952600782413454e-06, "loss": -0.0214, "reward": 0.375, "reward_std": 0.1369306445121765, "rewards/accuracy_reward": 0.375, "rewards/format_reward": 0.0, "step": 406 }, { "completion_length": 588.1458511352539, "epoch": 0.4341333333333333, "grad_norm": 145.150390625, "kl": 0.2872314453125, "learning_rate": 2.090124943244809e-06, "loss": 0.0099, "reward": 0.479166679084301, "reward_std": 0.29962683469057083, "rewards/accuracy_reward": 0.479166679084301, "rewards/format_reward": 0.0, "step": 407 }, { "completion_length": 435.5416793823242, "epoch": 0.4352, "grad_norm": 8.69697380065918, "kl": 0.403076171875, "learning_rate": 2.084981612510298e-06, "loss": -0.1562, "reward": 0.604166679084301, "reward_std": 0.27258947119116783, "rewards/accuracy_reward": 0.604166679084301, "rewards/format_reward": 0.0, "step": 408 }, { "completion_length": 482.3333511352539, "epoch": 0.4362666666666667, "grad_norm": 4.428235054016113, "kl": 0.616455078125, "learning_rate": 2.0798301574691106e-06, "loss": -0.0322, "reward": 0.7708333432674408, "reward_std": 0.2996268458664417, "rewards/accuracy_reward": 0.7708333432674408, "rewards/format_reward": 0.0, "step": 409 }, { "completion_length": 379.68750762939453, "epoch": 0.43733333333333335, "grad_norm": 7.094764232635498, "kl": 0.794921875, "learning_rate": 2.0746706496653765e-06, "loss": -0.1481, "reward": 0.541666679084301, "reward_std": 0.4326419234275818, "rewards/accuracy_reward": 0.541666679084301, "rewards/format_reward": 0.0, "step": 410 }, { "completion_length": 429.7916793823242, "epoch": 0.4384, "grad_norm": 17.427528381347656, "kl": 0.48663330078125, "learning_rate": 2.069503160755064e-06, "loss": -0.0863, "reward": 0.7291666865348816, "reward_std": 0.33713920786976814, "rewards/accuracy_reward": 0.7291666865348816, "rewards/format_reward": 0.0, "step": 411 }, { "completion_length": 401.89583587646484, "epoch": 0.43946666666666667, "grad_norm": 26150.78515625, "kl": 4.6513671875, "learning_rate": 2.0643277625049832e-06, "loss": -0.0044, "reward": 0.5208333432674408, "reward_std": 0.21764283254742622, "rewards/accuracy_reward": 0.5208333432674408, "rewards/format_reward": 0.0, "step": 412 }, { "completion_length": 593.4791793823242, "epoch": 0.44053333333333333, "grad_norm": 95612.453125, "kl": 92.1591796875, "learning_rate": 2.0591445267917923e-06, "loss": 2.9462, "reward": 0.5000000298023224, "reward_std": 0.2861081473529339, "rewards/accuracy_reward": 0.5000000298023224, "rewards/format_reward": 0.0, "step": 413 }, { "completion_length": 545.2291793823242, "epoch": 0.4416, "grad_norm": 166.13198852539062, "kl": 0.303466796875, "learning_rate": 2.053953525600994e-06, "loss": -0.1041, "reward": 0.5208333656191826, "reward_std": 0.44616056606173515, "rewards/accuracy_reward": 0.5208333656191826, "rewards/format_reward": 0.0, "step": 414 }, { "completion_length": 459.7083435058594, "epoch": 0.44266666666666665, "grad_norm": 11.17627239227295, "kl": 1.50543212890625, "learning_rate": 2.048754831025942e-06, "loss": -0.1983, "reward": 0.6666666865348816, "reward_std": 0.350657869130373, "rewards/accuracy_reward": 0.6666666865348816, "rewards/format_reward": 0.0, "step": 415 }, { "completion_length": 542.1666870117188, "epoch": 0.4437333333333333, "grad_norm": 3.6018333435058594, "kl": 0.34765625, "learning_rate": 2.0435485152668356e-06, "loss": -0.0383, "reward": 0.604166679084301, "reward_std": 0.2350771240890026, "rewards/accuracy_reward": 0.604166679084301, "rewards/format_reward": 0.0, "step": 416 }, { "completion_length": 456.75001525878906, "epoch": 0.4448, "grad_norm": 3.0526721477508545, "kl": 0.200439453125, "learning_rate": 2.038334650629718e-06, "loss": -0.1317, "reward": 0.6458333358168602, "reward_std": 0.3170611336827278, "rewards/accuracy_reward": 0.6458333358168602, "rewards/format_reward": 0.0, "step": 417 }, { "completion_length": 518.6250152587891, "epoch": 0.4458666666666667, "grad_norm": 1.6928656101226807, "kl": 0.33642578125, "learning_rate": 2.033113309525472e-06, "loss": -0.0655, "reward": 0.479166679084301, "reward_std": 0.41912321746349335, "rewards/accuracy_reward": 0.479166679084301, "rewards/format_reward": 0.0, "step": 418 }, { "completion_length": 480.0416793823242, "epoch": 0.44693333333333335, "grad_norm": 3.02109432220459, "kl": 0.3017578125, "learning_rate": 2.027884564468816e-06, "loss": -0.1819, "reward": 0.5625000149011612, "reward_std": 0.48367293179035187, "rewards/accuracy_reward": 0.5625000149011612, "rewards/format_reward": 0.0, "step": 419 }, { "completion_length": 456.8958435058594, "epoch": 0.448, "grad_norm": 0.33562108874320984, "kl": 0.12249755859375, "learning_rate": 2.0226484880772943e-06, "loss": -0.0493, "reward": 0.7916666865348816, "reward_std": 0.22155842557549477, "rewards/accuracy_reward": 0.7916666865348816, "rewards/format_reward": 0.0, "step": 420 }, { "completion_length": 559.0625152587891, "epoch": 0.44906666666666667, "grad_norm": 9.423727035522461, "kl": 0.6015625, "learning_rate": 2.01740515307027e-06, "loss": -0.0007, "reward": 0.3333333544433117, "reward_std": 0.2861081399023533, "rewards/accuracy_reward": 0.3333333544433117, "rewards/format_reward": 0.0, "step": 421 }, { "completion_length": 649.6875152587891, "epoch": 0.45013333333333333, "grad_norm": 0.589624285697937, "kl": 0.210205078125, "learning_rate": 2.012154632267915e-06, "loss": -0.0543, "reward": 0.6458333730697632, "reward_std": 0.44616056233644485, "rewards/accuracy_reward": 0.6458333730697632, "rewards/format_reward": 0.0, "step": 422 }, { "completion_length": 452.9583435058594, "epoch": 0.4512, "grad_norm": 0.716648280620575, "kl": 0.14862060546875, "learning_rate": 2.0068969985901996e-06, "loss": -0.0543, "reward": 0.7291666865348816, "reward_std": 0.34674228355288506, "rewards/accuracy_reward": 0.7291666865348816, "rewards/format_reward": 0.0, "step": 423 }, { "completion_length": 428.1458435058594, "epoch": 0.45226666666666665, "grad_norm": 0.8198894262313843, "kl": 0.30303955078125, "learning_rate": 2.0016323250558765e-06, "loss": -0.1415, "reward": 0.7291666865348816, "reward_std": 0.2446802221238613, "rewards/accuracy_reward": 0.7291666865348816, "rewards/format_reward": 0.0, "step": 424 }, { "completion_length": 504.16668701171875, "epoch": 0.4533333333333333, "grad_norm": 0.5582399964332581, "kl": 0.1332550048828125, "learning_rate": 1.9963606847814702e-06, "loss": -0.0403, "reward": 0.6041666865348816, "reward_std": 0.11558076366782188, "rewards/accuracy_reward": 0.6041666865348816, "rewards/format_reward": 0.0, "step": 425 }, { "completion_length": 519.7708587646484, "epoch": 0.4544, "grad_norm": 1.0627249479293823, "kl": 0.22119140625, "learning_rate": 1.991082150980261e-06, "loss": -0.0354, "reward": 0.6875000298023224, "reward_std": 0.21764283254742622, "rewards/accuracy_reward": 0.6875000298023224, "rewards/format_reward": 0.0, "step": 426 }, { "completion_length": 484.5833511352539, "epoch": 0.4554666666666667, "grad_norm": 0.6926480531692505, "kl": 0.19287109375, "learning_rate": 1.9857967969612654e-06, "loss": -0.0566, "reward": 0.7083333432674408, "reward_std": 0.18404608964920044, "rewards/accuracy_reward": 0.7083333432674408, "rewards/format_reward": 0.0, "step": 427 }, { "completion_length": 381.8958435058594, "epoch": 0.45653333333333335, "grad_norm": 0.941911518573761, "kl": 0.456298828125, "learning_rate": 1.9805046961282226e-06, "loss": -0.1716, "reward": 0.7291666865348816, "reward_std": 0.3170611374080181, "rewards/accuracy_reward": 0.7291666865348816, "rewards/format_reward": 0.0, "step": 428 }, { "completion_length": 381.7291793823242, "epoch": 0.4576, "grad_norm": 4.331463813781738, "kl": 0.6407470703125, "learning_rate": 1.9752059219785703e-06, "loss": -0.1509, "reward": 0.6041666716337204, "reward_std": 0.41129201650619507, "rewards/accuracy_reward": 0.6041666716337204, "rewards/format_reward": 0.0, "step": 429 }, { "completion_length": 482.9166793823242, "epoch": 0.45866666666666667, "grad_norm": 25.180204391479492, "kl": 0.25439453125, "learning_rate": 1.9699005481024273e-06, "loss": -0.0411, "reward": 0.6250000149011612, "reward_std": 0.4797573462128639, "rewards/accuracy_reward": 0.6250000149011612, "rewards/format_reward": 0.0, "step": 430 }, { "completion_length": 505.16668701171875, "epoch": 0.4597333333333333, "grad_norm": 13.052921295166016, "kl": 0.49951171875, "learning_rate": 1.96458864818157e-06, "loss": -0.1229, "reward": 0.6041666716337204, "reward_std": 0.18796168267726898, "rewards/accuracy_reward": 0.6041666716337204, "rewards/format_reward": 0.0, "step": 431 }, { "completion_length": 660.3750152587891, "epoch": 0.4608, "grad_norm": 1.405260682106018, "kl": 0.61328125, "learning_rate": 1.9592702959884095e-06, "loss": -0.0424, "reward": 0.6041666865348816, "reward_std": 0.29962683469057083, "rewards/accuracy_reward": 0.6041666865348816, "rewards/format_reward": 0.0, "step": 432 }, { "completion_length": 544.9375076293945, "epoch": 0.46186666666666665, "grad_norm": 1.421413540840149, "kl": 0.255615234375, "learning_rate": 1.953945565384967e-06, "loss": -0.0098, "reward": 0.5208333432674408, "reward_std": 0.1705274023115635, "rewards/accuracy_reward": 0.5208333432674408, "rewards/format_reward": 0.0, "step": 433 }, { "completion_length": 591.6250305175781, "epoch": 0.4629333333333333, "grad_norm": 1.7305474281311035, "kl": 0.1530914306640625, "learning_rate": 1.948614530321848e-06, "loss": 0.045, "reward": 0.7291666716337204, "reward_std": 0.28219256177544594, "rewards/accuracy_reward": 0.7291666716337204, "rewards/format_reward": 0.0, "step": 434 }, { "completion_length": 453.20833587646484, "epoch": 0.464, "grad_norm": 2.309199333190918, "kl": 0.88330078125, "learning_rate": 1.943277264837214e-06, "loss": -0.0575, "reward": 0.541666679084301, "reward_std": 0.32097674161195755, "rewards/accuracy_reward": 0.541666679084301, "rewards/format_reward": 0.0, "step": 435 }, { "completion_length": 330.9791717529297, "epoch": 0.4650666666666667, "grad_norm": 7.435601711273193, "kl": 3.25732421875, "learning_rate": 1.9379338430557582e-06, "loss": -0.1747, "reward": 0.770833358168602, "reward_std": 0.28219256177544594, "rewards/accuracy_reward": 0.770833358168602, "rewards/format_reward": 0.0, "step": 436 }, { "completion_length": 494.77085876464844, "epoch": 0.46613333333333334, "grad_norm": 31.305166244506836, "kl": 0.71380615234375, "learning_rate": 1.932584339187671e-06, "loss": -0.1046, "reward": 0.7083333432674408, "reward_std": 0.32097672671079636, "rewards/accuracy_reward": 0.7083333432674408, "rewards/format_reward": 0.0, "step": 437 }, { "completion_length": 438.0416793823242, "epoch": 0.4672, "grad_norm": 2.069824695587158, "kl": 0.7882080078125, "learning_rate": 1.927228827527612e-06, "loss": -0.1993, "reward": 0.7500000149011612, "reward_std": 0.4152076207101345, "rewards/accuracy_reward": 0.7500000149011612, "rewards/format_reward": 0.0, "step": 438 }, { "completion_length": 521.5625076293945, "epoch": 0.46826666666666666, "grad_norm": 3.2062323093414307, "kl": 1.4931640625, "learning_rate": 1.921867382453679e-06, "loss": -0.0807, "reward": 0.6041666865348816, "reward_std": 0.41129200905561447, "rewards/accuracy_reward": 0.6041666865348816, "rewards/format_reward": 0.0, "step": 439 }, { "completion_length": 465.31250762939453, "epoch": 0.4693333333333333, "grad_norm": 3.669417381286621, "kl": 0.96484375, "learning_rate": 1.9165000784263734e-06, "loss": -0.1217, "reward": 0.5833333432674408, "reward_std": 0.4152076207101345, "rewards/accuracy_reward": 0.5833333432674408, "rewards/format_reward": 0.0, "step": 440 }, { "completion_length": 475.0416717529297, "epoch": 0.4704, "grad_norm": 4.333220958709717, "kl": 0.75, "learning_rate": 1.911126989987565e-06, "loss": -0.1713, "reward": 0.5625000298023224, "reward_std": 0.4016888998448849, "rewards/accuracy_reward": 0.5625000298023224, "rewards/format_reward": 0.0, "step": 441 }, { "completion_length": 456.7083435058594, "epoch": 0.47146666666666665, "grad_norm": 20.424489974975586, "kl": 0.9342041015625, "learning_rate": 1.9057481917594604e-06, "loss": -0.179, "reward": 0.8541666865348816, "reward_std": 0.28219256550073624, "rewards/accuracy_reward": 0.8541666865348816, "rewards/format_reward": 0.0, "step": 442 }, { "completion_length": 460.68751525878906, "epoch": 0.47253333333333336, "grad_norm": 12.73831844329834, "kl": 0.3369140625, "learning_rate": 1.9003637584435633e-06, "loss": -0.0482, "reward": 0.7083333507180214, "reward_std": 0.2957112602889538, "rewards/accuracy_reward": 0.7083333507180214, "rewards/format_reward": 0.0, "step": 443 }, { "completion_length": 446.1666793823242, "epoch": 0.4736, "grad_norm": 1.1070239543914795, "kl": 0.37908935546875, "learning_rate": 1.8949737648196395e-06, "loss": -0.0933, "reward": 0.770833358168602, "reward_std": 0.33713920041918755, "rewards/accuracy_reward": 0.770833358168602, "rewards/format_reward": 0.0, "step": 444 }, { "completion_length": 648.2916870117188, "epoch": 0.4746666666666667, "grad_norm": 0.8892545104026794, "kl": 0.25146484375, "learning_rate": 1.8895782857446754e-06, "loss": -0.0218, "reward": 0.6458333507180214, "reward_std": 0.1801304966211319, "rewards/accuracy_reward": 0.6458333507180214, "rewards/format_reward": 0.0, "step": 445 }, { "completion_length": 497.31250762939453, "epoch": 0.47573333333333334, "grad_norm": 3.394148111343384, "kl": 2.803955078125, "learning_rate": 1.8841773961518417e-06, "loss": -0.1998, "reward": 0.5625000223517418, "reward_std": 0.47406984120607376, "rewards/accuracy_reward": 0.5625000223517418, "rewards/format_reward": 0.0, "step": 446 }, { "completion_length": 510.75001525878906, "epoch": 0.4768, "grad_norm": 4.031276702880859, "kl": 2.0634765625, "learning_rate": 1.8787711710494509e-06, "loss": -0.1812, "reward": 0.5625000149011612, "reward_std": 0.33713918924331665, "rewards/accuracy_reward": 0.5625000149011612, "rewards/format_reward": 0.0, "step": 447 }, { "completion_length": 472.68750762939453, "epoch": 0.47786666666666666, "grad_norm": 6.448513984680176, "kl": 3.84619140625, "learning_rate": 1.8733596855199147e-06, "loss": -0.0489, "reward": 0.5000000111758709, "reward_std": 0.3602609820663929, "rewards/accuracy_reward": 0.5000000111758709, "rewards/format_reward": 0.0, "step": 448 }, { "completion_length": 616.5625152587891, "epoch": 0.4789333333333333, "grad_norm": 5.1840410232543945, "kl": 0.58978271484375, "learning_rate": 1.8679430147187031e-06, "loss": -0.1038, "reward": 0.666666679084301, "reward_std": 0.3602609783411026, "rewards/accuracy_reward": 0.666666679084301, "rewards/format_reward": 0.0, "step": 449 }, { "completion_length": 602.5416717529297, "epoch": 0.48, "grad_norm": 0.5829527378082275, "kl": 0.26849365234375, "learning_rate": 1.8625212338733005e-06, "loss": -0.0714, "reward": 0.6250000111758709, "reward_std": 0.3602609783411026, "rewards/accuracy_reward": 0.6250000111758709, "rewards/format_reward": 0.0, "step": 450 }, { "completion_length": 540.7708587646484, "epoch": 0.48106666666666664, "grad_norm": 5.727835178375244, "kl": 1.69091796875, "learning_rate": 1.8570944182821588e-06, "loss": -0.0564, "reward": 0.5416666716337204, "reward_std": 0.22155842930078506, "rewards/accuracy_reward": 0.5416666716337204, "rewards/format_reward": 0.0, "step": 451 }, { "completion_length": 438.54168701171875, "epoch": 0.48213333333333336, "grad_norm": 0.21317069232463837, "kl": 0.5336456298828125, "learning_rate": 1.8516626433136547e-06, "loss": -0.0487, "reward": 0.7708333358168602, "reward_std": 0.13301505148410797, "rewards/accuracy_reward": 0.7708333358168602, "rewards/format_reward": 0.0, "step": 452 }, { "completion_length": 481.68750762939453, "epoch": 0.4832, "grad_norm": 1.012858510017395, "kl": 0.4264678955078125, "learning_rate": 1.8462259844050408e-06, "loss": -0.1401, "reward": 0.7291666716337204, "reward_std": 0.317061148583889, "rewards/accuracy_reward": 0.7291666716337204, "rewards/format_reward": 0.0, "step": 453 }, { "completion_length": 580.0208435058594, "epoch": 0.4842666666666667, "grad_norm": 0.2919260263442993, "kl": 0.0543975830078125, "learning_rate": 1.840784517061398e-06, "loss": -0.0052, "reward": 0.8125, "reward_std": 0.06846532225608826, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 0.0, "step": 454 }, { "completion_length": 556.9166793823242, "epoch": 0.48533333333333334, "grad_norm": 0.8342655897140503, "kl": 0.19622802734375, "learning_rate": 1.835338316854588e-06, "loss": -0.0478, "reward": 0.8333333432674408, "reward_std": 0.20412414148449898, "rewards/accuracy_reward": 0.8333333432674408, "rewards/format_reward": 0.0, "step": 455 }, { "completion_length": 543.8750305175781, "epoch": 0.4864, "grad_norm": 0.5063223838806152, "kl": 0.160308837890625, "learning_rate": 1.8298874594222035e-06, "loss": -0.0834, "reward": 0.7708333432674408, "reward_std": 0.35457348451018333, "rewards/accuracy_reward": 0.7708333432674408, "rewards/format_reward": 0.0, "step": 456 }, { "completion_length": 494.56250762939453, "epoch": 0.48746666666666666, "grad_norm": 1.1164201498031616, "kl": 0.17620849609375, "learning_rate": 1.824432020466517e-06, "loss": -0.0571, "reward": 0.6458333432674408, "reward_std": 0.2350771203637123, "rewards/accuracy_reward": 0.6458333432674408, "rewards/format_reward": 0.0, "step": 457 }, { "completion_length": 587.3333511352539, "epoch": 0.4885333333333333, "grad_norm": 1.3672682046890259, "kl": 0.17120361328125, "learning_rate": 1.8189720757534291e-06, "loss": 0.0009, "reward": 0.5833333432674408, "reward_std": 0.18404608219861984, "rewards/accuracy_reward": 0.5833333432674408, "rewards/format_reward": 0.0, "step": 458 }, { "completion_length": 515.8125152587891, "epoch": 0.4896, "grad_norm": 3.731281042098999, "kl": 3.951263427734375, "learning_rate": 1.8135077011114185e-06, "loss": -0.0999, "reward": 0.6875000149011612, "reward_std": 0.28219255805015564, "rewards/accuracy_reward": 0.6875000149011612, "rewards/format_reward": 0.0, "step": 459 }, { "completion_length": 580.0833435058594, "epoch": 0.49066666666666664, "grad_norm": 0.4759094715118408, "kl": 0.283599853515625, "learning_rate": 1.8080389724304863e-06, "loss": -0.0581, "reward": 0.7500000149011612, "reward_std": 0.3707359507679939, "rewards/accuracy_reward": 0.7500000149011612, "rewards/format_reward": 0.0, "step": 460 }, { "completion_length": 470.97918701171875, "epoch": 0.49173333333333336, "grad_norm": 2.1912169456481934, "kl": 0.51611328125, "learning_rate": 1.8025659656611033e-06, "loss": -0.0696, "reward": 0.7500000298023224, "reward_std": 0.20412414148449898, "rewards/accuracy_reward": 0.7500000298023224, "rewards/format_reward": 0.0, "step": 461 }, { "completion_length": 662.6458435058594, "epoch": 0.4928, "grad_norm": 0.315313458442688, "kl": 0.1390380859375, "learning_rate": 1.797088756813155e-06, "loss": -0.0152, "reward": 0.6666666865348816, "reward_std": 0.16661180555820465, "rewards/accuracy_reward": 0.6666666865348816, "rewards/format_reward": 0.0, "step": 462 }, { "completion_length": 487.4375228881836, "epoch": 0.4938666666666667, "grad_norm": 0.6089401245117188, "kl": 0.136962890625, "learning_rate": 1.7916074219548866e-06, "loss": -0.0055, "reward": 0.7916666865348816, "reward_std": 0.12909945845603943, "rewards/accuracy_reward": 0.7916666865348816, "rewards/format_reward": 0.0, "step": 463 }, { "completion_length": 660.7916717529297, "epoch": 0.49493333333333334, "grad_norm": 0.4967513680458069, "kl": 0.23046875, "learning_rate": 1.7861220372118446e-06, "loss": -0.0592, "reward": 0.6250000149011612, "reward_std": 0.16661180183291435, "rewards/accuracy_reward": 0.6250000149011612, "rewards/format_reward": 0.0, "step": 464 }, { "completion_length": 556.5625152587891, "epoch": 0.496, "grad_norm": 0.3465183973312378, "kl": 0.07342529296875, "learning_rate": 1.7806326787658219e-06, "loss": -0.0295, "reward": 0.8958333432674408, "reward_std": 0.05103103443980217, "rewards/accuracy_reward": 0.8958333432674408, "rewards/format_reward": 0.0, "step": 465 }, { "completion_length": 527.9375228881836, "epoch": 0.49706666666666666, "grad_norm": 1.0866291522979736, "kl": 0.419189453125, "learning_rate": 1.7751394228537989e-06, "loss": -0.1516, "reward": 0.6250000223517418, "reward_std": 0.45271996408700943, "rewards/accuracy_reward": 0.6250000223517418, "rewards/format_reward": 0.0, "step": 466 }, { "completion_length": 526.4166793823242, "epoch": 0.4981333333333333, "grad_norm": 0.9034936428070068, "kl": 0.262451171875, "learning_rate": 1.7696423457668832e-06, "loss": -0.0335, "reward": 0.4166666753590107, "reward_std": 0.25819891691207886, "rewards/accuracy_reward": 0.4166666753590107, "rewards/format_reward": 0.0, "step": 467 }, { "completion_length": 492.4166793823242, "epoch": 0.4992, "grad_norm": 1.2580136060714722, "kl": 1.09112548828125, "learning_rate": 1.7641415238492536e-06, "loss": -0.109, "reward": 0.7291666716337204, "reward_std": 0.30922994762659073, "rewards/accuracy_reward": 0.7291666716337204, "rewards/format_reward": 0.0, "step": 468 }, { "completion_length": 462.81251525878906, "epoch": 0.5002666666666666, "grad_norm": 1.6139076948165894, "kl": 0.78790283203125, "learning_rate": 1.7586370334970954e-06, "loss": -0.0591, "reward": 0.6666666865348816, "reward_std": 0.23116152361035347, "rewards/accuracy_reward": 0.6666666865348816, "rewards/format_reward": 0.0, "step": 469 }, { "completion_length": 482.0833435058594, "epoch": 0.5013333333333333, "grad_norm": 0.7503280639648438, "kl": 0.2613525390625, "learning_rate": 1.7531289511575427e-06, "loss": -0.0513, "reward": 0.7083333432674408, "reward_std": 0.23116152733564377, "rewards/accuracy_reward": 0.7083333432674408, "rewards/format_reward": 0.0, "step": 470 }, { "completion_length": 635.9583358764648, "epoch": 0.5024, "grad_norm": 3.3692362308502197, "kl": 1.046875, "learning_rate": 1.747617353327616e-06, "loss": -0.0182, "reward": 0.5625000223517418, "reward_std": 0.28219255805015564, "rewards/accuracy_reward": 0.5625000223517418, "rewards/format_reward": 0.0, "step": 471 }, { "completion_length": 538.9166870117188, "epoch": 0.5034666666666666, "grad_norm": 2.1965389251708984, "kl": 0.74267578125, "learning_rate": 1.7421023165531584e-06, "loss": -0.0296, "reward": 0.4583333507180214, "reward_std": 0.2686738707125187, "rewards/accuracy_reward": 0.4583333507180214, "rewards/format_reward": 0.0, "step": 472 }, { "completion_length": 568.1458435058594, "epoch": 0.5045333333333333, "grad_norm": 2.382479667663574, "kl": 0.66064453125, "learning_rate": 1.7365839174277743e-06, "loss": -0.1249, "reward": 0.5000000111758709, "reward_std": 0.4422449842095375, "rewards/accuracy_reward": 0.5000000111758709, "rewards/format_reward": 0.0, "step": 473 }, { "completion_length": 486.1041717529297, "epoch": 0.5056, "grad_norm": 0.9544143676757812, "kl": 0.715087890625, "learning_rate": 1.7310622325917648e-06, "loss": -0.1045, "reward": 0.5625, "reward_std": 0.3074580430984497, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.0, "step": 474 }, { "completion_length": 600.5625076293945, "epoch": 0.5066666666666667, "grad_norm": 2.5569818019866943, "kl": 0.966064453125, "learning_rate": 1.7255373387310633e-06, "loss": -0.0572, "reward": 0.7500000074505806, "reward_std": 0.23116152733564377, "rewards/accuracy_reward": 0.7500000074505806, "rewards/format_reward": 0.0, "step": 475 }, { "completion_length": 438.2083511352539, "epoch": 0.5077333333333334, "grad_norm": 1.1740267276763916, "kl": 0.278564453125, "learning_rate": 1.7200093125761706e-06, "loss": -0.1605, "reward": 0.6666666716337204, "reward_std": 0.3977733142673969, "rewards/accuracy_reward": 0.6666666716337204, "rewards/format_reward": 0.0, "step": 476 }, { "completion_length": 679.2291870117188, "epoch": 0.5088, "grad_norm": 0.5418806076049805, "kl": 0.25933837890625, "learning_rate": 1.714478230901089e-06, "loss": -0.0693, "reward": 0.5625000223517418, "reward_std": 0.21764283999800682, "rewards/accuracy_reward": 0.5625000223517418, "rewards/format_reward": 0.0, "step": 477 }, { "completion_length": 579.5208435058594, "epoch": 0.5098666666666667, "grad_norm": 0.9323630928993225, "kl": 0.46722412109375, "learning_rate": 1.7089441705222568e-06, "loss": -0.0856, "reward": 0.6875000149011612, "reward_std": 0.35457348451018333, "rewards/accuracy_reward": 0.6875000149011612, "rewards/format_reward": 0.0, "step": 478 }, { "completion_length": 482.3958435058594, "epoch": 0.5109333333333334, "grad_norm": 0.3191574811935425, "kl": 0.040130615234375, "learning_rate": 1.7034072082974805e-06, "loss": 0.01, "reward": 0.8541666716337204, "reward_std": 0.05103103816509247, "rewards/accuracy_reward": 0.8541666716337204, "rewards/format_reward": 0.0, "step": 479 }, { "completion_length": 624.7500305175781, "epoch": 0.512, "grad_norm": 0.6216436624526978, "kl": 0.274658203125, "learning_rate": 1.6978674211248676e-06, "loss": -0.1179, "reward": 0.6250000223517418, "reward_std": 0.24859582632780075, "rewards/accuracy_reward": 0.6250000223517418, "rewards/format_reward": 0.0, "step": 480 }, { "completion_length": 608.7291793823242, "epoch": 0.5130666666666667, "grad_norm": 0.7217719554901123, "kl": 0.10003662109375, "learning_rate": 1.69232488594176e-06, "loss": -0.0325, "reward": 0.7916666865348816, "reward_std": 0.2957112602889538, "rewards/accuracy_reward": 0.7916666865348816, "rewards/format_reward": 0.0, "step": 481 }, { "completion_length": 546.4791793823242, "epoch": 0.5141333333333333, "grad_norm": 0.3178690969944, "kl": 1.89013671875, "learning_rate": 1.6867796797236638e-06, "loss": -0.0097, "reward": 0.708333358168602, "reward_std": 0.24859582632780075, "rewards/accuracy_reward": 0.708333358168602, "rewards/format_reward": 0.0, "step": 482 }, { "completion_length": 433.97918701171875, "epoch": 0.5152, "grad_norm": 0.220798060297966, "kl": 0.095428466796875, "learning_rate": 1.6812318794831804e-06, "loss": -0.0444, "reward": 0.8958333432674408, "reward_std": 0.11558075994253159, "rewards/accuracy_reward": 0.8958333432674408, "rewards/format_reward": 0.0, "step": 483 }, { "completion_length": 643.6666946411133, "epoch": 0.5162666666666667, "grad_norm": 0.3336416780948639, "kl": 0.1563720703125, "learning_rate": 1.6756815622689371e-06, "loss": -0.0022, "reward": 0.41666667722165585, "reward_std": 0.22155843302607536, "rewards/accuracy_reward": 0.41666667722165585, "rewards/format_reward": 0.0, "step": 484 }, { "completion_length": 506.81251525878906, "epoch": 0.5173333333333333, "grad_norm": 0.26856812834739685, "kl": 0.0916900634765625, "learning_rate": 1.6701288051645182e-06, "loss": -0.0386, "reward": 0.7708333507180214, "reward_std": 0.2725894823670387, "rewards/accuracy_reward": 0.7708333507180214, "rewards/format_reward": 0.0, "step": 485 }, { "completion_length": 477.12500762939453, "epoch": 0.5184, "grad_norm": 0.4715536832809448, "kl": 0.228302001953125, "learning_rate": 1.664573685287393e-06, "loss": -0.1128, "reward": 0.8125000149011612, "reward_std": 0.2350771315395832, "rewards/accuracy_reward": 0.8125000149011612, "rewards/format_reward": 0.0, "step": 486 }, { "completion_length": 575.0000228881836, "epoch": 0.5194666666666666, "grad_norm": 0.23960432410240173, "kl": 0.135345458984375, "learning_rate": 1.6590162797878457e-06, "loss": -0.0456, "reward": 0.7500000149011612, "reward_std": 0.3332235924899578, "rewards/accuracy_reward": 0.7500000149011612, "rewards/format_reward": 0.0, "step": 487 }, { "completion_length": 635.0625152587891, "epoch": 0.5205333333333333, "grad_norm": 0.9215066432952881, "kl": 0.444183349609375, "learning_rate": 1.653456665847903e-06, "loss": -0.09, "reward": 0.6041666865348816, "reward_std": 0.36417656019330025, "rewards/accuracy_reward": 0.6041666865348816, "rewards/format_reward": 0.0, "step": 488 }, { "completion_length": 512.7500152587891, "epoch": 0.5216, "grad_norm": 0.3210693299770355, "kl": 0.1290283203125, "learning_rate": 1.6478949206802629e-06, "loss": -0.0998, "reward": 0.7916666865348816, "reward_std": 0.2957112528383732, "rewards/accuracy_reward": 0.7916666865348816, "rewards/format_reward": 0.0, "step": 489 }, { "completion_length": 645.7708587646484, "epoch": 0.5226666666666666, "grad_norm": 0.5099627375602722, "kl": 0.6461181640625, "learning_rate": 1.642331121527223e-06, "loss": -0.0403, "reward": 0.7083333432674408, "reward_std": 0.2957112528383732, "rewards/accuracy_reward": 0.7083333432674408, "rewards/format_reward": 0.0, "step": 490 }, { "completion_length": 522.9791793823242, "epoch": 0.5237333333333334, "grad_norm": 0.2744123041629791, "kl": 0.163848876953125, "learning_rate": 1.6367653456596054e-06, "loss": -0.0852, "reward": 0.8125000149011612, "reward_std": 0.2900237739086151, "rewards/accuracy_reward": 0.8125000149011612, "rewards/format_reward": 0.0, "step": 491 }, { "completion_length": 569.7916831970215, "epoch": 0.5248, "grad_norm": 0.1064935177564621, "kl": 0.1077117919921875, "learning_rate": 1.6311976703756868e-06, "loss": -0.0267, "reward": 0.6875000111758709, "reward_std": 0.11558076366782188, "rewards/accuracy_reward": 0.6875000111758709, "rewards/format_reward": 0.0, "step": 492 }, { "completion_length": 616.0416870117188, "epoch": 0.5258666666666667, "grad_norm": 0.11436577886343002, "kl": 0.0694122314453125, "learning_rate": 1.6256281730001213e-06, "loss": 0.0171, "reward": 0.6041666865348816, "reward_std": 0.11558076366782188, "rewards/accuracy_reward": 0.6041666865348816, "rewards/format_reward": 0.0, "step": 493 }, { "completion_length": 480.2708435058594, "epoch": 0.5269333333333334, "grad_norm": 0.1317841112613678, "kl": 0.3686981201171875, "learning_rate": 1.6200569308828705e-06, "loss": -0.0581, "reward": 0.8333333432674408, "reward_std": 0.18404608592391014, "rewards/accuracy_reward": 0.8333333432674408, "rewards/format_reward": 0.0, "step": 494 }, { "completion_length": 516.4791946411133, "epoch": 0.528, "grad_norm": 0.21842017769813538, "kl": 0.1536865234375, "learning_rate": 1.6144840213981257e-06, "loss": -0.0218, "reward": 0.833333358168602, "reward_std": 0.3332236036658287, "rewards/accuracy_reward": 0.833333358168602, "rewards/format_reward": 0.0, "step": 495 }, { "completion_length": 625.9166793823242, "epoch": 0.5290666666666667, "grad_norm": 0.14914527535438538, "kl": 0.141387939453125, "learning_rate": 1.6089095219432359e-06, "loss": -0.0629, "reward": 0.7500000149011612, "reward_std": 0.18404608592391014, "rewards/accuracy_reward": 0.7500000149011612, "rewards/format_reward": 0.0, "step": 496 }, { "completion_length": 479.06251525878906, "epoch": 0.5301333333333333, "grad_norm": 0.3558216094970703, "kl": 0.26324462890625, "learning_rate": 1.6033335099376315e-06, "loss": -0.0609, "reward": 0.7500000149011612, "reward_std": 0.16661180183291435, "rewards/accuracy_reward": 0.7500000149011612, "rewards/format_reward": 0.0, "step": 497 }, { "completion_length": 566.2708435058594, "epoch": 0.5312, "grad_norm": 0.7444754242897034, "kl": 0.224609375, "learning_rate": 1.5977560628217482e-06, "loss": -0.0226, "reward": 0.708333358168602, "reward_std": 0.30354244261980057, "rewards/accuracy_reward": 0.708333358168602, "rewards/format_reward": 0.0, "step": 498 }, { "completion_length": 658.8125152587891, "epoch": 0.5322666666666667, "grad_norm": 0.17983639240264893, "kl": 0.45703125, "learning_rate": 1.5921772580559549e-06, "loss": -0.1949, "reward": 0.708333358168602, "reward_std": 0.4326418936252594, "rewards/accuracy_reward": 0.708333358168602, "rewards/format_reward": 0.0, "step": 499 }, { "completion_length": 511.8541793823242, "epoch": 0.5333333333333333, "grad_norm": 1.0113751888275146, "kl": 0.490814208984375, "learning_rate": 1.5865971731194738e-06, "loss": -0.061, "reward": 0.770833358168602, "reward_std": 0.1801304928958416, "rewards/accuracy_reward": 0.770833358168602, "rewards/format_reward": 0.0, "step": 500 }, { "completion_length": 459.0833435058594, "epoch": 0.5344, "grad_norm": 0.9630478024482727, "kl": 0.860565185546875, "learning_rate": 1.5810158855093075e-06, "loss": -0.0817, "reward": 0.5000000149011612, "reward_std": 0.33057981729507446, "rewards/accuracy_reward": 0.5000000149011612, "rewards/format_reward": 0.0, "step": 501 }, { "completion_length": 626.9791870117188, "epoch": 0.5354666666666666, "grad_norm": 0.20844507217407227, "kl": 0.58123779296875, "learning_rate": 1.5754334727391613e-06, "loss": -0.1157, "reward": 0.5833333432674408, "reward_std": 0.32274864614009857, "rewards/accuracy_reward": 0.5833333432674408, "rewards/format_reward": 0.0, "step": 502 }, { "completion_length": 605.5833587646484, "epoch": 0.5365333333333333, "grad_norm": 0.504833996295929, "kl": 0.1934814453125, "learning_rate": 1.5698500123383657e-06, "loss": -0.103, "reward": 0.5833333432674408, "reward_std": 0.38552645966410637, "rewards/accuracy_reward": 0.5833333432674408, "rewards/format_reward": 0.0, "step": 503 }, { "completion_length": 375.4166793823242, "epoch": 0.5376, "grad_norm": 0.33515799045562744, "kl": 0.057830810546875, "learning_rate": 1.5642655818508029e-06, "loss": -0.0163, "reward": 0.8750000149011612, "reward_std": 0.18404608964920044, "rewards/accuracy_reward": 0.8750000149011612, "rewards/format_reward": 0.0, "step": 504 }, { "completion_length": 480.93750762939453, "epoch": 0.5386666666666666, "grad_norm": 1.4276330471038818, "kl": 0.33392333984375, "learning_rate": 1.5586802588338262e-06, "loss": -0.0682, "reward": 0.8333333432674408, "reward_std": 0.16661179438233376, "rewards/accuracy_reward": 0.8333333432674408, "rewards/format_reward": 0.0, "step": 505 }, { "completion_length": 646.1041870117188, "epoch": 0.5397333333333333, "grad_norm": 0.413056343793869, "kl": 0.080963134765625, "learning_rate": 1.553094120857185e-06, "loss": -0.0663, "reward": 0.7291666865348816, "reward_std": 0.21764283999800682, "rewards/accuracy_reward": 0.7291666865348816, "rewards/format_reward": 0.0, "step": 506 }, { "completion_length": 494.0000114440918, "epoch": 0.5408, "grad_norm": 0.5526696443557739, "kl": 0.4974365234375, "learning_rate": 1.547507245501947e-06, "loss": -0.0894, "reward": 0.7291666865348816, "reward_std": 0.25515518710017204, "rewards/accuracy_reward": 0.7291666865348816, "rewards/format_reward": 0.0, "step": 507 }, { "completion_length": 503.1041831970215, "epoch": 0.5418666666666667, "grad_norm": 0.9157426357269287, "kl": 0.5810546875, "learning_rate": 1.5419197103594208e-06, "loss": 0.0209, "reward": 0.5208333488553762, "reward_std": 0.28219256177544594, "rewards/accuracy_reward": 0.5208333488553762, "rewards/format_reward": 0.0, "step": 508 }, { "completion_length": 560.9166870117188, "epoch": 0.5429333333333334, "grad_norm": 0.9824774265289307, "kl": 0.694091796875, "learning_rate": 1.5363315930300777e-06, "loss": -0.1535, "reward": 0.8125000149011612, "reward_std": 0.2996268458664417, "rewards/accuracy_reward": 0.8125000149011612, "rewards/format_reward": 0.0, "step": 509 }, { "completion_length": 410.0416793823242, "epoch": 0.544, "grad_norm": 0.9444297552108765, "kl": 1.02587890625, "learning_rate": 1.5307429711224756e-06, "loss": -0.0655, "reward": 0.6041666716337204, "reward_std": 0.2350771203637123, "rewards/accuracy_reward": 0.6041666716337204, "rewards/format_reward": 0.0, "step": 510 }, { "completion_length": 404.0833435058594, "epoch": 0.5450666666666667, "grad_norm": 0.7588927149772644, "kl": 0.496429443359375, "learning_rate": 1.525153922252179e-06, "loss": -0.1785, "reward": 0.7083333358168602, "reward_std": 0.30354245007038116, "rewards/accuracy_reward": 0.7083333358168602, "rewards/format_reward": 0.0, "step": 511 }, { "completion_length": 373.77083587646484, "epoch": 0.5461333333333334, "grad_norm": 0.8721346855163574, "kl": 0.4884033203125, "learning_rate": 1.519564524040682e-06, "loss": -0.2026, "reward": 0.645833358168602, "reward_std": 0.41129200905561447, "rewards/accuracy_reward": 0.645833358168602, "rewards/format_reward": 0.0, "step": 512 }, { "completion_length": 475.3541831970215, "epoch": 0.5472, "grad_norm": 1.548012614250183, "kl": 1.08349609375, "learning_rate": 1.5139748541143317e-06, "loss": -0.1661, "reward": 0.39583333395421505, "reward_std": 0.36417657509446144, "rewards/accuracy_reward": 0.39583333395421505, "rewards/format_reward": 0.0, "step": 513 }, { "completion_length": 514.4583587646484, "epoch": 0.5482666666666667, "grad_norm": 0.8743815422058105, "kl": 0.3927001953125, "learning_rate": 1.5083849901032472e-06, "loss": -0.1145, "reward": 0.4375000149011612, "reward_std": 0.2446802258491516, "rewards/accuracy_reward": 0.4375000149011612, "rewards/format_reward": 0.0, "step": 514 }, { "completion_length": 438.50001525878906, "epoch": 0.5493333333333333, "grad_norm": 0.7772073149681091, "kl": 0.457275390625, "learning_rate": 1.5027950096402447e-06, "loss": -0.1784, "reward": 0.708333358168602, "reward_std": 0.32362050563097, "rewards/accuracy_reward": 0.708333358168602, "rewards/format_reward": 0.0, "step": 515 }, { "completion_length": 404.1666793823242, "epoch": 0.5504, "grad_norm": 0.7086036801338196, "kl": 0.1859130859375, "learning_rate": 1.4972049903597554e-06, "loss": -0.1562, "reward": 0.8125000149011612, "reward_std": 0.33713920041918755, "rewards/accuracy_reward": 0.8125000149011612, "rewards/format_reward": 0.0, "step": 516 }, { "completion_length": 396.64583587646484, "epoch": 0.5514666666666667, "grad_norm": 1.8514900207519531, "kl": 1.1123046875, "learning_rate": 1.4916150098967525e-06, "loss": -0.2544, "reward": 0.708333358168602, "reward_std": 0.3602609783411026, "rewards/accuracy_reward": 0.708333358168602, "rewards/format_reward": 0.0, "step": 517 }, { "completion_length": 466.2708511352539, "epoch": 0.5525333333333333, "grad_norm": 0.8457176685333252, "kl": 0.56640625, "learning_rate": 1.4860251458856683e-06, "loss": -0.1298, "reward": 0.3958333507180214, "reward_std": 0.2621144950389862, "rewards/accuracy_reward": 0.3958333507180214, "rewards/format_reward": 0.0, "step": 518 }, { "completion_length": 531.3333435058594, "epoch": 0.5536, "grad_norm": 0.5000897645950317, "kl": 0.239501953125, "learning_rate": 1.4804354759593176e-06, "loss": -0.1036, "reward": 0.8541666865348816, "reward_std": 0.2350771203637123, "rewards/accuracy_reward": 0.8541666865348816, "rewards/format_reward": 0.0, "step": 519 }, { "completion_length": 590.375, "epoch": 0.5546666666666666, "grad_norm": 3.2117254734039307, "kl": 1.677734375, "learning_rate": 1.474846077747821e-06, "loss": -0.1571, "reward": 0.6250000111758709, "reward_std": 0.3977733254432678, "rewards/accuracy_reward": 0.6250000111758709, "rewards/format_reward": 0.0, "step": 520 }, { "completion_length": 634.9583587646484, "epoch": 0.5557333333333333, "grad_norm": 4.070802688598633, "kl": 1.60479736328125, "learning_rate": 1.4692570288775243e-06, "loss": -0.0762, "reward": 0.39583333767950535, "reward_std": 0.1801304966211319, "rewards/accuracy_reward": 0.39583333767950535, "rewards/format_reward": 0.0, "step": 521 }, { "completion_length": 440.1875228881836, "epoch": 0.5568, "grad_norm": 0.706484854221344, "kl": 0.5311279296875, "learning_rate": 1.4636684069699222e-06, "loss": -0.0906, "reward": 0.4166666753590107, "reward_std": 0.24859582632780075, "rewards/accuracy_reward": 0.4166666753590107, "rewards/format_reward": 0.0, "step": 522 }, { "completion_length": 410.0416831970215, "epoch": 0.5578666666666666, "grad_norm": 0.7914196252822876, "kl": 0.6809539794921875, "learning_rate": 1.4580802896405793e-06, "loss": -0.1672, "reward": 0.6875000260770321, "reward_std": 0.28219256177544594, "rewards/accuracy_reward": 0.6875000260770321, "rewards/format_reward": 0.0, "step": 523 }, { "completion_length": 474.7083435058594, "epoch": 0.5589333333333333, "grad_norm": 1.9133433103561401, "kl": 0.3736724853515625, "learning_rate": 1.452492754498053e-06, "loss": -0.0745, "reward": 0.6875000149011612, "reward_std": 0.2525113970041275, "rewards/accuracy_reward": 0.6875000149011612, "rewards/format_reward": 0.0, "step": 524 }, { "completion_length": 497.9583511352539, "epoch": 0.56, "grad_norm": 1.363409399986267, "kl": 0.601043701171875, "learning_rate": 1.4469058791428154e-06, "loss": -0.1599, "reward": 0.7500000298023224, "reward_std": 0.3977733254432678, "rewards/accuracy_reward": 0.7500000298023224, "rewards/format_reward": 0.0, "step": 525 }, { "completion_length": 445.68751525878906, "epoch": 0.5610666666666667, "grad_norm": 1.4084800481796265, "kl": 0.4372406005859375, "learning_rate": 1.4413197411661739e-06, "loss": -0.0733, "reward": 0.791666679084301, "reward_std": 0.23116152733564377, "rewards/accuracy_reward": 0.791666679084301, "rewards/format_reward": 0.0, "step": 526 }, { "completion_length": 518.7708435058594, "epoch": 0.5621333333333334, "grad_norm": 0.3551030158996582, "kl": 0.697265625, "learning_rate": 1.4357344181491972e-06, "loss": -0.2357, "reward": 0.6458333730697632, "reward_std": 0.3842546306550503, "rewards/accuracy_reward": 0.6458333730697632, "rewards/format_reward": 0.0, "step": 527 }, { "completion_length": 428.0625, "epoch": 0.5632, "grad_norm": 0.3091790974140167, "kl": 0.2898101806640625, "learning_rate": 1.4301499876616344e-06, "loss": -0.1295, "reward": 0.8750000149011612, "reward_std": 0.23116153106093407, "rewards/accuracy_reward": 0.8750000149011612, "rewards/format_reward": 0.0, "step": 528 }, { "completion_length": 491.3958435058594, "epoch": 0.5642666666666667, "grad_norm": 0.32577309012413025, "kl": 0.66302490234375, "learning_rate": 1.4245665272608392e-06, "loss": -0.1218, "reward": 0.6458333432674408, "reward_std": 0.2350771315395832, "rewards/accuracy_reward": 0.6458333432674408, "rewards/format_reward": 0.0, "step": 529 }, { "completion_length": 531.6875152587891, "epoch": 0.5653333333333334, "grad_norm": 1.2284932136535645, "kl": 0.6751708984375, "learning_rate": 1.4189841144906928e-06, "loss": -0.0872, "reward": 0.47916666977107525, "reward_std": 0.11558076366782188, "rewards/accuracy_reward": 0.47916666977107525, "rewards/format_reward": 0.0, "step": 530 }, { "completion_length": 618.2291870117188, "epoch": 0.5664, "grad_norm": 0.10674097388982773, "kl": 0.0190582275390625, "learning_rate": 1.4134028268805265e-06, "loss": 0.0233, "reward": 0.708333358168602, "reward_std": 0.24859581887722015, "rewards/accuracy_reward": 0.708333358168602, "rewards/format_reward": 0.0, "step": 531 }, { "completion_length": 544.8333435058594, "epoch": 0.5674666666666667, "grad_norm": 1.2158888578414917, "kl": 0.20538330078125, "learning_rate": 1.4078227419440454e-06, "loss": -0.1553, "reward": 0.8125000298023224, "reward_std": 0.37465154752135277, "rewards/accuracy_reward": 0.8125000298023224, "rewards/format_reward": 0.0, "step": 532 }, { "completion_length": 456.43751525878906, "epoch": 0.5685333333333333, "grad_norm": 2.0294442176818848, "kl": 0.184234619140625, "learning_rate": 1.402243937178252e-06, "loss": -0.0379, "reward": 0.7500000149011612, "reward_std": 0.4056045301258564, "rewards/accuracy_reward": 0.7500000149011612, "rewards/format_reward": 0.0, "step": 533 }, { "completion_length": 403.8958435058594, "epoch": 0.5696, "grad_norm": 2.342531442642212, "kl": 0.40924072265625, "learning_rate": 1.396666490062369e-06, "loss": -0.0834, "reward": 0.9583333432674408, "reward_std": 0.10206207633018494, "rewards/accuracy_reward": 0.9583333432674408, "rewards/format_reward": 0.0, "step": 534 }, { "completion_length": 437.7500114440918, "epoch": 0.5706666666666667, "grad_norm": 0.480355441570282, "kl": 0.392333984375, "learning_rate": 1.3910904780567642e-06, "loss": -0.014, "reward": 0.8541666865348816, "reward_std": 0.28219255805015564, "rewards/accuracy_reward": 0.8541666865348816, "rewards/format_reward": 0.0, "step": 535 }, { "completion_length": 522.5000152587891, "epoch": 0.5717333333333333, "grad_norm": 2.535088062286377, "kl": 0.900909423828125, "learning_rate": 1.3855159786018744e-06, "loss": -0.1307, "reward": 0.5208333544433117, "reward_std": 0.36417657136917114, "rewards/accuracy_reward": 0.5208333544433117, "rewards/format_reward": 0.0, "step": 536 }, { "completion_length": 498.79168701171875, "epoch": 0.5728, "grad_norm": 0.7282590270042419, "kl": 0.336639404296875, "learning_rate": 1.37994306911713e-06, "loss": -0.0307, "reward": 0.5416666716337204, "reward_std": 0.23116151988506317, "rewards/accuracy_reward": 0.5416666716337204, "rewards/format_reward": 0.0, "step": 537 }, { "completion_length": 431.72918701171875, "epoch": 0.5738666666666666, "grad_norm": 1.5327588319778442, "kl": 0.43017578125, "learning_rate": 1.374371826999879e-06, "loss": -0.1302, "reward": 0.6875000298023224, "reward_std": 0.2621144950389862, "rewards/accuracy_reward": 0.6875000298023224, "rewards/format_reward": 0.0, "step": 538 }, { "completion_length": 499.7291793823242, "epoch": 0.5749333333333333, "grad_norm": 3.617853879928589, "kl": 0.795501708984375, "learning_rate": 1.368802329624314e-06, "loss": -0.1135, "reward": 0.6666666865348816, "reward_std": 0.30354245752096176, "rewards/accuracy_reward": 0.6666666865348816, "rewards/format_reward": 0.0, "step": 539 }, { "completion_length": 498.54168701171875, "epoch": 0.576, "grad_norm": 0.4012615978717804, "kl": 0.18994140625, "learning_rate": 1.3632346543403946e-06, "loss": -0.0101, "reward": 0.5625000111758709, "reward_std": 0.19756478071212769, "rewards/accuracy_reward": 0.5625000111758709, "rewards/format_reward": 0.0, "step": 540 }, { "completion_length": 703.8958587646484, "epoch": 0.5770666666666666, "grad_norm": 0.08834701031446457, "kl": 0.0388031005859375, "learning_rate": 1.3576688784727775e-06, "loss": 0.0127, "reward": 0.5416666865348816, "reward_std": 0.10206206887960434, "rewards/accuracy_reward": 0.5416666865348816, "rewards/format_reward": 0.0, "step": 541 }, { "completion_length": 698.2916870117188, "epoch": 0.5781333333333334, "grad_norm": 0.5507186651229858, "kl": 1.079498291015625, "learning_rate": 1.3521050793197374e-06, "loss": -0.0385, "reward": 0.5416666865348816, "reward_std": 0.22155842930078506, "rewards/accuracy_reward": 0.5416666865348816, "rewards/format_reward": 0.0, "step": 542 }, { "completion_length": 511.7291717529297, "epoch": 0.5792, "grad_norm": 0.47935521602630615, "kl": 0.1450958251953125, "learning_rate": 1.3465433341520975e-06, "loss": 0.0135, "reward": 0.7708333432674408, "reward_std": 0.1705274023115635, "rewards/accuracy_reward": 0.7708333432674408, "rewards/format_reward": 0.0, "step": 543 }, { "completion_length": 497.12501525878906, "epoch": 0.5802666666666667, "grad_norm": 1.5171895027160645, "kl": 1.0719146728515625, "learning_rate": 1.3409837202121548e-06, "loss": -0.1249, "reward": 0.6875, "reward_std": 0.31970490515232086, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 544 }, { "completion_length": 460.0833511352539, "epoch": 0.5813333333333334, "grad_norm": 0.690382182598114, "kl": 0.2831573486328125, "learning_rate": 1.335426314712607e-06, "loss": -0.1279, "reward": 0.8125000149011612, "reward_std": 0.1705273911356926, "rewards/accuracy_reward": 0.8125000149011612, "rewards/format_reward": 0.0, "step": 545 }, { "completion_length": 620.6250228881836, "epoch": 0.5824, "grad_norm": 0.2855072319507599, "kl": 0.39544677734375, "learning_rate": 1.3298711948354818e-06, "loss": -0.0758, "reward": 0.895833358168602, "reward_std": 0.21764283627271652, "rewards/accuracy_reward": 0.895833358168602, "rewards/format_reward": 0.0, "step": 546 }, { "completion_length": 645.3750152587891, "epoch": 0.5834666666666667, "grad_norm": 0.1421433538198471, "kl": 0.0274200439453125, "learning_rate": 1.324318437731063e-06, "loss": -0.0199, "reward": 0.7291666865348816, "reward_std": 0.2996268458664417, "rewards/accuracy_reward": 0.7291666865348816, "rewards/format_reward": 0.0, "step": 547 }, { "completion_length": 615.0833358764648, "epoch": 0.5845333333333333, "grad_norm": 0.5363072752952576, "kl": 0.33673095703125, "learning_rate": 1.3187681205168196e-06, "loss": -0.1186, "reward": 0.7500000149011612, "reward_std": 0.23116153106093407, "rewards/accuracy_reward": 0.7500000149011612, "rewards/format_reward": 0.0, "step": 548 }, { "completion_length": 560.4375152587891, "epoch": 0.5856, "grad_norm": 0.803733229637146, "kl": 0.6334228515625, "learning_rate": 1.313220320276336e-06, "loss": -0.1152, "reward": 0.6666666865348816, "reward_std": 0.20412413775920868, "rewards/accuracy_reward": 0.6666666865348816, "rewards/format_reward": 0.0, "step": 549 }, { "completion_length": 692.8958740234375, "epoch": 0.5866666666666667, "grad_norm": 1.1930656433105469, "kl": 0.1956787109375, "learning_rate": 1.3076751140582396e-06, "loss": -0.037, "reward": 0.5000000111758709, "reward_std": 0.268673874437809, "rewards/accuracy_reward": 0.5000000111758709, "rewards/format_reward": 0.0, "step": 550 }, { "completion_length": 642.7708587646484, "epoch": 0.5877333333333333, "grad_norm": 0.6253939270973206, "kl": 0.35272216796875, "learning_rate": 1.3021325788751322e-06, "loss": -0.1924, "reward": 0.770833358168602, "reward_std": 0.27258947119116783, "rewards/accuracy_reward": 0.770833358168602, "rewards/format_reward": 0.0, "step": 551 }, { "completion_length": 865.7500152587891, "epoch": 0.5888, "grad_norm": 0.20867317914962769, "kl": 0.1536865234375, "learning_rate": 1.2965927917025198e-06, "loss": -0.0589, "reward": 0.43750001303851604, "reward_std": 0.28219256922602654, "rewards/accuracy_reward": 0.43750001303851604, "rewards/format_reward": 0.0, "step": 552 }, { "completion_length": 350.6666793823242, "epoch": 0.5898666666666667, "grad_norm": 2.211613178253174, "kl": 0.926025390625, "learning_rate": 1.2910558294777435e-06, "loss": -0.0247, "reward": 0.8125000149011612, "reward_std": 0.2900237739086151, "rewards/accuracy_reward": 0.8125000149011612, "rewards/format_reward": 0.0, "step": 553 }, { "completion_length": 383.7291793823242, "epoch": 0.5909333333333333, "grad_norm": 0.6421911716461182, "kl": 0.361785888671875, "learning_rate": 1.285521769098911e-06, "loss": -0.1081, "reward": 0.875, "reward_std": 0.18404608219861984, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 0.0, "step": 554 }, { "completion_length": 528.7708435058594, "epoch": 0.592, "grad_norm": 0.4189884066581726, "kl": 1.20361328125, "learning_rate": 1.2799906874238297e-06, "loss": -0.0942, "reward": 0.6458333507180214, "reward_std": 0.3092299550771713, "rewards/accuracy_reward": 0.6458333507180214, "rewards/format_reward": 0.0, "step": 555 }, { "completion_length": 501.12501525878906, "epoch": 0.5930666666666666, "grad_norm": 1.2208458185195923, "kl": 0.434814453125, "learning_rate": 1.2744626612689368e-06, "loss": -0.1021, "reward": 0.7916666865348816, "reward_std": 0.2957112565636635, "rewards/accuracy_reward": 0.7916666865348816, "rewards/format_reward": 0.0, "step": 556 }, { "completion_length": 574.6250076293945, "epoch": 0.5941333333333333, "grad_norm": 0.590181291103363, "kl": 0.53961181640625, "learning_rate": 1.2689377674082355e-06, "loss": -0.0865, "reward": 0.6875000223517418, "reward_std": 0.2621145099401474, "rewards/accuracy_reward": 0.6875000223517418, "rewards/format_reward": 0.0, "step": 557 }, { "completion_length": 589.8750152587891, "epoch": 0.5952, "grad_norm": 0.37682050466537476, "kl": 0.705078125, "learning_rate": 1.263416082572226e-06, "loss": -0.1311, "reward": 0.7291666865348816, "reward_std": 0.28219256550073624, "rewards/accuracy_reward": 0.7291666865348816, "rewards/format_reward": 0.0, "step": 558 }, { "completion_length": 534.3541870117188, "epoch": 0.5962666666666666, "grad_norm": 0.7812970876693726, "kl": 1.1470947265625, "learning_rate": 1.257897683446842e-06, "loss": -0.168, "reward": 0.6875000298023224, "reward_std": 0.36417656019330025, "rewards/accuracy_reward": 0.6875000298023224, "rewards/format_reward": 0.0, "step": 559 }, { "completion_length": 582.0416870117188, "epoch": 0.5973333333333334, "grad_norm": 0.9342625737190247, "kl": 2.1710205078125, "learning_rate": 1.2523826466723843e-06, "loss": -0.1977, "reward": 0.6041666865348816, "reward_std": 0.34674228727817535, "rewards/accuracy_reward": 0.6041666865348816, "rewards/format_reward": 0.0, "step": 560 }, { "completion_length": 476.00001525878906, "epoch": 0.5984, "grad_norm": 4.963130950927734, "kl": 2.7384033203125, "learning_rate": 1.2468710488424574e-06, "loss": -0.1267, "reward": 0.5625000149011612, "reward_std": 0.29962684214115143, "rewards/accuracy_reward": 0.5625000149011612, "rewards/format_reward": 0.0, "step": 561 }, { "completion_length": 486.08335876464844, "epoch": 0.5994666666666667, "grad_norm": 0.5785970091819763, "kl": 1.076416015625, "learning_rate": 1.2413629665029049e-06, "loss": -0.1463, "reward": 0.6041666865348816, "reward_std": 0.38161084800958633, "rewards/accuracy_reward": 0.6041666865348816, "rewards/format_reward": 0.0, "step": 562 }, { "completion_length": 477.89583587646484, "epoch": 0.6005333333333334, "grad_norm": 0.5653138756752014, "kl": 0.508453369140625, "learning_rate": 1.2358584761507467e-06, "loss": -0.0544, "reward": 0.7708333432674408, "reward_std": 0.2996268570423126, "rewards/accuracy_reward": 0.7708333432674408, "rewards/format_reward": 0.0, "step": 563 }, { "completion_length": 453.18751525878906, "epoch": 0.6016, "grad_norm": 1.1444580554962158, "kl": 1.164031982421875, "learning_rate": 1.2303576542331168e-06, "loss": -0.1092, "reward": 0.6250000149011612, "reward_std": 0.286108173429966, "rewards/accuracy_reward": 0.6250000149011612, "rewards/format_reward": 0.0, "step": 564 }, { "completion_length": 552.4791793823242, "epoch": 0.6026666666666667, "grad_norm": 0.4598291218280792, "kl": 0.1890411376953125, "learning_rate": 1.2248605771462016e-06, "loss": -0.1127, "reward": 0.666666679084301, "reward_std": 0.16661179810762405, "rewards/accuracy_reward": 0.666666679084301, "rewards/format_reward": 0.0, "step": 565 }, { "completion_length": 392.75000762939453, "epoch": 0.6037333333333333, "grad_norm": 0.4130389988422394, "kl": 0.337860107421875, "learning_rate": 1.2193673212341784e-06, "loss": -0.037, "reward": 0.6250000223517418, "reward_std": 0.23116152733564377, "rewards/accuracy_reward": 0.6250000223517418, "rewards/format_reward": 0.0, "step": 566 }, { "completion_length": 705.3958511352539, "epoch": 0.6048, "grad_norm": 0.3726612627506256, "kl": 0.2266845703125, "learning_rate": 1.213877962788156e-06, "loss": -0.0718, "reward": 0.6458333507180214, "reward_std": 0.2350771240890026, "rewards/accuracy_reward": 0.6458333507180214, "rewards/format_reward": 0.0, "step": 567 }, { "completion_length": 382.5416793823242, "epoch": 0.6058666666666667, "grad_norm": 0.8625046610832214, "kl": 0.42523193359375, "learning_rate": 1.2083925780451142e-06, "loss": -0.1328, "reward": 0.7291666939854622, "reward_std": 0.2446802221238613, "rewards/accuracy_reward": 0.7291666939854622, "rewards/format_reward": 0.0, "step": 568 }, { "completion_length": 345.5833435058594, "epoch": 0.6069333333333333, "grad_norm": 8.424480438232422, "kl": 3.04296875, "learning_rate": 1.2029112431868455e-06, "loss": 0.0038, "reward": 0.4375000074505806, "reward_std": 0.2525114193558693, "rewards/accuracy_reward": 0.4375000074505806, "rewards/format_reward": 0.0, "step": 569 }, { "completion_length": 499.5416717529297, "epoch": 0.608, "grad_norm": 0.2600148022174835, "kl": 0.2049560546875, "learning_rate": 1.1974340343388974e-06, "loss": -0.0894, "reward": 0.5625000149011612, "reward_std": 0.33713919296860695, "rewards/accuracy_reward": 0.5625000149011612, "rewards/format_reward": 0.0, "step": 570 }, { "completion_length": 365.93751525878906, "epoch": 0.6090666666666666, "grad_norm": 1.744339942932129, "kl": 0.84228515625, "learning_rate": 1.1919610275695144e-06, "loss": -0.1009, "reward": 0.5000000149011612, "reward_std": 0.4230387918651104, "rewards/accuracy_reward": 0.5000000149011612, "rewards/format_reward": 0.0, "step": 571 }, { "completion_length": 467.6041793823242, "epoch": 0.6101333333333333, "grad_norm": 1.045207142829895, "kl": 1.203125, "learning_rate": 1.186492298888582e-06, "loss": -0.1349, "reward": 0.5000000298023224, "reward_std": 0.3977733179926872, "rewards/accuracy_reward": 0.5000000298023224, "rewards/format_reward": 0.0, "step": 572 }, { "completion_length": 553.8541946411133, "epoch": 0.6112, "grad_norm": 0.7255181074142456, "kl": 0.559326171875, "learning_rate": 1.1810279242465714e-06, "loss": -0.1536, "reward": 0.5625000149011612, "reward_std": 0.33713918924331665, "rewards/accuracy_reward": 0.5625000149011612, "rewards/format_reward": 0.0, "step": 573 }, { "completion_length": 298.9583435058594, "epoch": 0.6122666666666666, "grad_norm": 3.8063817024230957, "kl": 3.80615234375, "learning_rate": 1.1755679795334832e-06, "loss": -0.2565, "reward": 0.4166666716337204, "reward_std": 0.3602609932422638, "rewards/accuracy_reward": 0.4166666716337204, "rewards/format_reward": 0.0, "step": 574 }, { "completion_length": 484.75001525878906, "epoch": 0.6133333333333333, "grad_norm": 65.61083984375, "kl": 23.1558837890625, "learning_rate": 1.1701125405777965e-06, "loss": 0.3101, "reward": 0.6666666716337204, "reward_std": 0.3236205205321312, "rewards/accuracy_reward": 0.6666666716337204, "rewards/format_reward": 0.0, "step": 575 }, { "completion_length": 414.31250762939453, "epoch": 0.6144, "grad_norm": 2.7992453575134277, "kl": 2.2578125, "learning_rate": 1.164661683145412e-06, "loss": -0.0322, "reward": 0.5625000074505806, "reward_std": 0.38161084800958633, "rewards/accuracy_reward": 0.5625000074505806, "rewards/format_reward": 0.0, "step": 576 }, { "completion_length": 461.97918701171875, "epoch": 0.6154666666666667, "grad_norm": 12.74209976196289, "kl": 5.1767578125, "learning_rate": 1.1592154829386022e-06, "loss": -0.222, "reward": 0.3125000074505806, "reward_std": 0.3894420489668846, "rewards/accuracy_reward": 0.3125000074505806, "rewards/format_reward": 0.0, "step": 577 }, { "completion_length": 400.0833396911621, "epoch": 0.6165333333333334, "grad_norm": 8.208133697509766, "kl": 2.153076171875, "learning_rate": 1.1537740155949595e-06, "loss": -0.0212, "reward": 0.5208333544433117, "reward_std": 0.2446802221238613, "rewards/accuracy_reward": 0.5208333544433117, "rewards/format_reward": 0.0, "step": 578 }, { "completion_length": 366.00000762939453, "epoch": 0.6176, "grad_norm": 3.278599977493286, "kl": 2.154296875, "learning_rate": 1.1483373566863454e-06, "loss": -0.2361, "reward": 0.3958333507180214, "reward_std": 0.2996268533170223, "rewards/accuracy_reward": 0.3958333507180214, "rewards/format_reward": 0.0, "step": 579 }, { "completion_length": 311.68750762939453, "epoch": 0.6186666666666667, "grad_norm": 2.9980435371398926, "kl": 1.889892578125, "learning_rate": 1.142905581717841e-06, "loss": -0.3507, "reward": 0.5625000149011612, "reward_std": 0.30922994762659073, "rewards/accuracy_reward": 0.5625000149011612, "rewards/format_reward": 0.0, "step": 580 }, { "completion_length": 320.8958435058594, "epoch": 0.6197333333333334, "grad_norm": 1.3429142236709595, "kl": 1.5419921875, "learning_rate": 1.1374787661266998e-06, "loss": -0.1385, "reward": 0.479166679084301, "reward_std": 0.3720077611505985, "rewards/accuracy_reward": 0.479166679084301, "rewards/format_reward": 0.0, "step": 581 }, { "completion_length": 207.85416793823242, "epoch": 0.6208, "grad_norm": 1.7898343801498413, "kl": 1.845703125, "learning_rate": 1.132056985281297e-06, "loss": -0.0357, "reward": 0.3750000111758709, "reward_std": 0.16661180183291435, "rewards/accuracy_reward": 0.3750000111758709, "rewards/format_reward": 0.0, "step": 582 }, { "completion_length": 318.0416793823242, "epoch": 0.6218666666666667, "grad_norm": 2.3739800453186035, "kl": 1.8046875, "learning_rate": 1.1266403144800856e-06, "loss": -0.0319, "reward": 0.3333333432674408, "reward_std": 0.32274864614009857, "rewards/accuracy_reward": 0.3333333432674408, "rewards/format_reward": 0.0, "step": 583 }, { "completion_length": 244.58333778381348, "epoch": 0.6229333333333333, "grad_norm": 3.071056604385376, "kl": 1.923095703125, "learning_rate": 1.1212288289505494e-06, "loss": -0.2843, "reward": 0.43750001676380634, "reward_std": 0.33713918179273605, "rewards/accuracy_reward": 0.43750001676380634, "rewards/format_reward": 0.0, "step": 584 }, { "completion_length": 228.56250762939453, "epoch": 0.624, "grad_norm": 3.125417709350586, "kl": 1.775390625, "learning_rate": 1.1158226038481584e-06, "loss": -0.2054, "reward": 0.22916667349636555, "reward_std": 0.27258946746587753, "rewards/accuracy_reward": 0.22916667349636555, "rewards/format_reward": 0.0, "step": 585 }, { "completion_length": 340.0416717529297, "epoch": 0.6250666666666667, "grad_norm": 1.150846242904663, "kl": 1.05810546875, "learning_rate": 1.1104217142553247e-06, "loss": -0.2216, "reward": 0.4375000111758709, "reward_std": 0.299626849591732, "rewards/accuracy_reward": 0.4375000111758709, "rewards/format_reward": 0.0, "step": 586 }, { "completion_length": 292.62500762939453, "epoch": 0.6261333333333333, "grad_norm": 1.1351382732391357, "kl": 1.12841796875, "learning_rate": 1.105026235180361e-06, "loss": -0.2787, "reward": 0.4166666716337204, "reward_std": 0.30354243889451027, "rewards/accuracy_reward": 0.4166666716337204, "rewards/format_reward": 0.0, "step": 587 }, { "completion_length": 324.6875114440918, "epoch": 0.6272, "grad_norm": 2.3973066806793213, "kl": 1.09466552734375, "learning_rate": 1.099636241556437e-06, "loss": -0.3059, "reward": 0.5208333432674408, "reward_std": 0.40168893337249756, "rewards/accuracy_reward": 0.5208333432674408, "rewards/format_reward": 0.0, "step": 588 }, { "completion_length": 521.6041717529297, "epoch": 0.6282666666666666, "grad_norm": 0.9124022722244263, "kl": 0.46484375, "learning_rate": 1.0942518082405401e-06, "loss": -0.1929, "reward": 0.479166679084301, "reward_std": 0.42872631922364235, "rewards/accuracy_reward": 0.479166679084301, "rewards/format_reward": 0.0, "step": 589 }, { "completion_length": 399.6250228881836, "epoch": 0.6293333333333333, "grad_norm": 0.6999115347862244, "kl": 1.02734375, "learning_rate": 1.0888730100124355e-06, "loss": -0.1487, "reward": 0.3125000149011612, "reward_std": 0.3344953954219818, "rewards/accuracy_reward": 0.3125000149011612, "rewards/format_reward": 0.0, "step": 590 }, { "completion_length": 375.93750762939453, "epoch": 0.6304, "grad_norm": 0.550220251083374, "kl": 0.89697265625, "learning_rate": 1.0834999215736271e-06, "loss": -0.3814, "reward": 0.6458333432674408, "reward_std": 0.43655748665332794, "rewards/accuracy_reward": 0.6458333432674408, "rewards/format_reward": 0.0, "step": 591 }, { "completion_length": 336.3333435058594, "epoch": 0.6314666666666666, "grad_norm": 0.9784324169158936, "kl": 1.52880859375, "learning_rate": 1.0781326175463212e-06, "loss": -0.2741, "reward": 0.2500000111758709, "reward_std": 0.3680921494960785, "rewards/accuracy_reward": 0.2500000111758709, "rewards/format_reward": 0.0, "step": 592 }, { "completion_length": 367.8958435058594, "epoch": 0.6325333333333333, "grad_norm": 5.7101006507873535, "kl": 1.042236328125, "learning_rate": 1.0727711724723881e-06, "loss": -0.0913, "reward": 0.6875000298023224, "reward_std": 0.28219256177544594, "rewards/accuracy_reward": 0.6875000298023224, "rewards/format_reward": 0.0, "step": 593 }, { "completion_length": 419.2291793823242, "epoch": 0.6336, "grad_norm": 0.6156479716300964, "kl": 0.259521484375, "learning_rate": 1.0674156608123294e-06, "loss": -0.1221, "reward": 0.645833358168602, "reward_std": 0.33713920786976814, "rewards/accuracy_reward": 0.645833358168602, "rewards/format_reward": 0.0, "step": 594 }, { "completion_length": 464.5416793823242, "epoch": 0.6346666666666667, "grad_norm": 0.7851059436798096, "kl": 0.341064453125, "learning_rate": 1.062066156944242e-06, "loss": -0.1531, "reward": 0.5208333432674408, "reward_std": 0.27258946001529694, "rewards/accuracy_reward": 0.5208333432674408, "rewards/format_reward": 0.0, "step": 595 }, { "completion_length": 486.9166793823242, "epoch": 0.6357333333333334, "grad_norm": 1.0571407079696655, "kl": 0.4609375, "learning_rate": 1.0567227351627864e-06, "loss": -0.1282, "reward": 0.7916666865348816, "reward_std": 0.20412414148449898, "rewards/accuracy_reward": 0.7916666865348816, "rewards/format_reward": 0.0, "step": 596 }, { "completion_length": 429.54167556762695, "epoch": 0.6368, "grad_norm": 0.7962088584899902, "kl": 1.0166015625, "learning_rate": 1.0513854696781531e-06, "loss": -0.1449, "reward": 0.5625000223517418, "reward_std": 0.37377967685461044, "rewards/accuracy_reward": 0.5625000223517418, "rewards/format_reward": 0.0, "step": 597 }, { "completion_length": 410.56250762939453, "epoch": 0.6378666666666667, "grad_norm": 0.41316255927085876, "kl": 1.74169921875, "learning_rate": 1.0460544346150335e-06, "loss": -0.2235, "reward": 0.479166679084301, "reward_std": 0.3720077648758888, "rewards/accuracy_reward": 0.479166679084301, "rewards/format_reward": 0.0, "step": 598 }, { "completion_length": 490.54168701171875, "epoch": 0.6389333333333334, "grad_norm": 1.0226408243179321, "kl": 0.587158203125, "learning_rate": 1.040729704011591e-06, "loss": -0.156, "reward": 0.416666679084301, "reward_std": 0.2957112565636635, "rewards/accuracy_reward": 0.416666679084301, "rewards/format_reward": 0.0, "step": 599 }, { "completion_length": 335.75000381469727, "epoch": 0.64, "grad_norm": 0.3579675555229187, "kl": 0.5692138671875, "learning_rate": 1.0354113518184304e-06, "loss": -0.0884, "reward": 0.6458333432674408, "reward_std": 0.2621145099401474, "rewards/accuracy_reward": 0.6458333432674408, "rewards/format_reward": 0.0, "step": 600 }, { "completion_length": 404.37500762939453, "epoch": 0.6410666666666667, "grad_norm": 2.026745557785034, "kl": 1.849609375, "learning_rate": 1.0300994518975732e-06, "loss": -0.2567, "reward": 0.4166666716337204, "reward_std": 0.3506578654050827, "rewards/accuracy_reward": 0.4166666716337204, "rewards/format_reward": 0.0, "step": 601 }, { "completion_length": 443.6666717529297, "epoch": 0.6421333333333333, "grad_norm": 0.46400952339172363, "kl": 1.53125, "learning_rate": 1.0247940780214302e-06, "loss": -0.244, "reward": 0.6250000111758709, "reward_std": 0.3602609783411026, "rewards/accuracy_reward": 0.6250000111758709, "rewards/format_reward": 0.0, "step": 602 }, { "completion_length": 316.1666717529297, "epoch": 0.6432, "grad_norm": 1.4741816520690918, "kl": 2.314453125, "learning_rate": 1.0194953038717773e-06, "loss": -0.1987, "reward": 0.6041666865348816, "reward_std": 0.31970490515232086, "rewards/accuracy_reward": 0.6041666865348816, "rewards/format_reward": 0.0, "step": 603 }, { "completion_length": 332.6458435058594, "epoch": 0.6442666666666667, "grad_norm": 4.430507183074951, "kl": 5.9921875, "learning_rate": 1.0142032030387342e-06, "loss": -0.0602, "reward": 0.229166679084301, "reward_std": 0.21764283627271652, "rewards/accuracy_reward": 0.229166679084301, "rewards/format_reward": 0.0, "step": 604 }, { "completion_length": 354.66667556762695, "epoch": 0.6453333333333333, "grad_norm": 0.6781348586082458, "kl": 1.3876953125, "learning_rate": 1.008917849019739e-06, "loss": -0.1885, "reward": 0.5208333507180214, "reward_std": 0.24468021839857101, "rewards/accuracy_reward": 0.5208333507180214, "rewards/format_reward": 0.0, "step": 605 }, { "completion_length": 326.2083435058594, "epoch": 0.6464, "grad_norm": 2.5700299739837646, "kl": 3.1435546875, "learning_rate": 1.0036393152185294e-06, "loss": -0.1541, "reward": 0.45833333395421505, "reward_std": 0.3506578765809536, "rewards/accuracy_reward": 0.45833333395421505, "rewards/format_reward": 0.0, "step": 606 }, { "completion_length": 291.5208435058594, "epoch": 0.6474666666666666, "grad_norm": 2.16619873046875, "kl": 2.74609375, "learning_rate": 9.983676749441236e-07, "loss": 0.0568, "reward": 0.2708333432674408, "reward_std": 0.1975647658109665, "rewards/accuracy_reward": 0.2708333432674408, "rewards/format_reward": 0.0, "step": 607 }, { "completion_length": 377.9583435058594, "epoch": 0.6485333333333333, "grad_norm": 1.0914307832717896, "kl": 1.39453125, "learning_rate": 9.931030014098005e-07, "loss": -0.299, "reward": 0.416666679084301, "reward_std": 0.487588532269001, "rewards/accuracy_reward": 0.416666679084301, "rewards/format_reward": 0.0, "step": 608 }, { "completion_length": 293.1458396911621, "epoch": 0.6496, "grad_norm": 0.5953905582427979, "kl": 1.1201171875, "learning_rate": 9.878453677320847e-07, "loss": -0.3072, "reward": 0.5625000149011612, "reward_std": 0.36417658627033234, "rewards/accuracy_reward": 0.5625000149011612, "rewards/format_reward": 0.0, "step": 609 }, { "completion_length": 329.1041793823242, "epoch": 0.6506666666666666, "grad_norm": 0.9543739557266235, "kl": 1.3505859375, "learning_rate": 9.825948469297303e-07, "loss": -0.1496, "reward": 0.41666667722165585, "reward_std": 0.3506578840315342, "rewards/accuracy_reward": 0.41666667722165585, "rewards/format_reward": 0.0, "step": 610 }, { "completion_length": 442.0833435058594, "epoch": 0.6517333333333334, "grad_norm": 0.5010097622871399, "kl": 0.2734375, "learning_rate": 9.77351511922706e-07, "loss": -0.1495, "reward": 0.708333358168602, "reward_std": 0.23116152733564377, "rewards/accuracy_reward": 0.708333358168602, "rewards/format_reward": 0.0, "step": 611 }, { "completion_length": 432.31250762939453, "epoch": 0.6528, "grad_norm": 14.443533897399902, "kl": 13.607421875, "learning_rate": 9.721154355311845e-07, "loss": -0.1732, "reward": 0.6250000149011612, "reward_std": 0.40296071767807007, "rewards/accuracy_reward": 0.6250000149011612, "rewards/format_reward": 0.0, "step": 612 }, { "completion_length": 326.4375114440918, "epoch": 0.6538666666666667, "grad_norm": 1.9707064628601074, "kl": 2.61474609375, "learning_rate": 9.668866904745284e-07, "loss": -0.0429, "reward": 0.3958333395421505, "reward_std": 0.317061148583889, "rewards/accuracy_reward": 0.3958333395421505, "rewards/format_reward": 0.0, "step": 613 }, { "completion_length": 261.16667556762695, "epoch": 0.6549333333333334, "grad_norm": 1.9508821964263916, "kl": 1.6171875, "learning_rate": 9.616653493702824e-07, "loss": -0.3977, "reward": 0.4583333432674408, "reward_std": 0.4152076095342636, "rewards/accuracy_reward": 0.4583333432674408, "rewards/format_reward": 0.0, "step": 614 }, { "completion_length": 408.2916717529297, "epoch": 0.656, "grad_norm": 0.7465296387672424, "kl": 0.96484375, "learning_rate": 9.564514847331647e-07, "loss": -0.2593, "reward": 0.2708333432674408, "reward_std": 0.3266642242670059, "rewards/accuracy_reward": 0.2708333432674408, "rewards/format_reward": 0.0, "step": 615 }, { "completion_length": 378.3750114440918, "epoch": 0.6570666666666667, "grad_norm": 0.9854875206947327, "kl": 0.828125, "learning_rate": 9.512451689740579e-07, "loss": -0.32, "reward": 0.5000000111758709, "reward_std": 0.44047310948371887, "rewards/accuracy_reward": 0.5000000111758709, "rewards/format_reward": 0.0, "step": 616 }, { "completion_length": 367.25000381469727, "epoch": 0.6581333333333333, "grad_norm": 0.7154622673988342, "kl": 0.5791015625, "learning_rate": 9.460464743990059e-07, "loss": -0.0378, "reward": 0.5416666865348816, "reward_std": 0.16661179810762405, "rewards/accuracy_reward": 0.5416666865348816, "rewards/format_reward": 0.0, "step": 617 }, { "completion_length": 304.00001525878906, "epoch": 0.6592, "grad_norm": 0.9910948872566223, "kl": 1.3984375, "learning_rate": 9.40855473208208e-07, "loss": -0.2609, "reward": 0.416666679084301, "reward_std": 0.3332235999405384, "rewards/accuracy_reward": 0.416666679084301, "rewards/format_reward": 0.0, "step": 618 }, { "completion_length": 300.00000762939453, "epoch": 0.6602666666666667, "grad_norm": 1.6954786777496338, "kl": 1.174560546875, "learning_rate": 9.356722374950166e-07, "loss": -0.1071, "reward": 0.3750000111758709, "reward_std": 0.2861081659793854, "rewards/accuracy_reward": 0.3750000111758709, "rewards/format_reward": 0.0, "step": 619 }, { "completion_length": 376.64583587646484, "epoch": 0.6613333333333333, "grad_norm": 0.9147951602935791, "kl": 1.4188232421875, "learning_rate": 9.304968392449361e-07, "loss": -0.1821, "reward": 0.5833333432674408, "reward_std": 0.24859581515192986, "rewards/accuracy_reward": 0.5833333432674408, "rewards/format_reward": 0.0, "step": 620 }, { "completion_length": 392.7708435058594, "epoch": 0.6624, "grad_norm": 0.5840030312538147, "kl": 1.439453125, "learning_rate": 9.253293503346238e-07, "loss": -0.3963, "reward": 0.4166666716337204, "reward_std": 0.4152076169848442, "rewards/accuracy_reward": 0.4166666716337204, "rewards/format_reward": 0.0, "step": 621 }, { "completion_length": 378.6666793823242, "epoch": 0.6634666666666666, "grad_norm": 1.1304214000701904, "kl": 1.0712890625, "learning_rate": 9.201698425308896e-07, "loss": -0.2635, "reward": 0.3750000074505806, "reward_std": 0.377695269882679, "rewards/accuracy_reward": 0.3750000074505806, "rewards/format_reward": 0.0, "step": 622 }, { "completion_length": 456.0416717529297, "epoch": 0.6645333333333333, "grad_norm": 0.745384156703949, "kl": 0.7978515625, "learning_rate": 9.150183874897021e-07, "loss": -0.286, "reward": 0.6041666716337204, "reward_std": 0.44616060703992844, "rewards/accuracy_reward": 0.6041666716337204, "rewards/format_reward": 0.0, "step": 623 }, { "completion_length": 319.70834732055664, "epoch": 0.6656, "grad_norm": 0.8978861570358276, "kl": 0.9462890625, "learning_rate": 9.098750567551911e-07, "loss": -0.3601, "reward": 0.6666666716337204, "reward_std": 0.4056045114994049, "rewards/accuracy_reward": 0.6666666716337204, "rewards/format_reward": 0.0, "step": 624 }, { "completion_length": 556.7291870117188, "epoch": 0.6666666666666666, "grad_norm": 0.6026521921157837, "kl": 0.529296875, "learning_rate": 9.047399217586552e-07, "loss": -0.2068, "reward": 0.7500000149011612, "reward_std": 0.3332235924899578, "rewards/accuracy_reward": 0.7500000149011612, "rewards/format_reward": 0.0, "step": 625 }, { "completion_length": 428.9791717529297, "epoch": 0.6677333333333333, "grad_norm": 0.624866247177124, "kl": 0.568115234375, "learning_rate": 8.996130538175697e-07, "loss": -0.1938, "reward": 0.583333358168602, "reward_std": 0.385526429861784, "rewards/accuracy_reward": 0.583333358168602, "rewards/format_reward": 0.0, "step": 626 }, { "completion_length": 343.1458435058594, "epoch": 0.6688, "grad_norm": 1.2301315069198608, "kl": 1.01953125, "learning_rate": 8.944945241345953e-07, "loss": -0.3367, "reward": 0.6250000149011612, "reward_std": 0.470154270529747, "rewards/accuracy_reward": 0.6250000149011612, "rewards/format_reward": 0.0, "step": 627 }, { "completion_length": 406.2291793823242, "epoch": 0.6698666666666667, "grad_norm": 0.5766564011573792, "kl": 0.81585693359375, "learning_rate": 8.893844037965898e-07, "loss": -0.1459, "reward": 0.5208333432674408, "reward_std": 0.2350771278142929, "rewards/accuracy_reward": 0.5208333432674408, "rewards/format_reward": 0.0, "step": 628 }, { "completion_length": 602.7291870117188, "epoch": 0.6709333333333334, "grad_norm": 0.6043370962142944, "kl": 0.56298828125, "learning_rate": 8.842827637736218e-07, "loss": -0.1111, "reward": 0.4791666828095913, "reward_std": 0.38161084800958633, "rewards/accuracy_reward": 0.4791666828095913, "rewards/format_reward": 0.0, "step": 629 }, { "completion_length": 305.35417556762695, "epoch": 0.672, "grad_norm": 1.03105628490448, "kl": 1.7607421875, "learning_rate": 8.791896749179831e-07, "loss": -0.0851, "reward": 0.41666667722165585, "reward_std": 0.18404608964920044, "rewards/accuracy_reward": 0.41666667722165585, "rewards/format_reward": 0.0, "step": 630 }, { "completion_length": 369.0416793823242, "epoch": 0.6730666666666667, "grad_norm": 1.4543663263320923, "kl": 1.34326171875, "learning_rate": 8.741052079632063e-07, "loss": -0.2254, "reward": 0.6875000298023224, "reward_std": 0.3572172559797764, "rewards/accuracy_reward": 0.6875000298023224, "rewards/format_reward": 0.0, "step": 631 }, { "completion_length": 458.3333511352539, "epoch": 0.6741333333333334, "grad_norm": 0.7188706398010254, "kl": 1.23974609375, "learning_rate": 8.690294335230808e-07, "loss": -0.186, "reward": 0.5208333507180214, "reward_std": 0.34674229472875595, "rewards/accuracy_reward": 0.5208333507180214, "rewards/format_reward": 0.0, "step": 632 }, { "completion_length": 445.81251525878906, "epoch": 0.6752, "grad_norm": 1.1345467567443848, "kl": 1.521484375, "learning_rate": 8.639624220906747e-07, "loss": -0.0718, "reward": 0.3750000074505806, "reward_std": 0.3776952847838402, "rewards/accuracy_reward": 0.3750000074505806, "rewards/format_reward": 0.0, "step": 633 }, { "completion_length": 420.37500762939453, "epoch": 0.6762666666666667, "grad_norm": 2.2340855598449707, "kl": 2.7783203125, "learning_rate": 8.589042440373532e-07, "loss": 0.0302, "reward": 0.3750000074505806, "reward_std": 0.2957112491130829, "rewards/accuracy_reward": 0.3750000074505806, "rewards/format_reward": 0.0, "step": 634 }, { "completion_length": 463.3125114440918, "epoch": 0.6773333333333333, "grad_norm": 2.391413450241089, "kl": 2.94921875, "learning_rate": 8.538549696118023e-07, "loss": -0.1582, "reward": 0.3750000223517418, "reward_std": 0.22155843675136566, "rewards/accuracy_reward": 0.3750000223517418, "rewards/format_reward": 0.0, "step": 635 }, { "completion_length": 470.5416793823242, "epoch": 0.6784, "grad_norm": 1.6083927154541016, "kl": 1.7421875, "learning_rate": 8.488146689390535e-07, "loss": -0.249, "reward": 0.3958333432674408, "reward_std": 0.36417657509446144, "rewards/accuracy_reward": 0.3958333432674408, "rewards/format_reward": 0.0, "step": 636 }, { "completion_length": 385.2500114440918, "epoch": 0.6794666666666667, "grad_norm": 1.3435205221176147, "kl": 2.263671875, "learning_rate": 8.437834120195094e-07, "loss": -0.2269, "reward": 0.604166679084301, "reward_std": 0.4392012804746628, "rewards/accuracy_reward": 0.604166679084301, "rewards/format_reward": 0.0, "step": 637 }, { "completion_length": 464.22918701171875, "epoch": 0.6805333333333333, "grad_norm": 1.2460261583328247, "kl": 0.95654296875, "learning_rate": 8.387612687279718e-07, "loss": -0.0877, "reward": 0.3958333432674408, "reward_std": 0.3170611187815666, "rewards/accuracy_reward": 0.3958333432674408, "rewards/format_reward": 0.0, "step": 638 }, { "completion_length": 412.93751525878906, "epoch": 0.6816, "grad_norm": 1.326361060142517, "kl": 1.3134765625, "learning_rate": 8.337483088126709e-07, "loss": -0.1159, "reward": 0.5000000149011612, "reward_std": 0.3506578765809536, "rewards/accuracy_reward": 0.5000000149011612, "rewards/format_reward": 0.0, "step": 639 }, { "completion_length": 403.4583435058594, "epoch": 0.6826666666666666, "grad_norm": 4.9673051834106445, "kl": 2.156494140625, "learning_rate": 8.287446018942973e-07, "loss": -0.0823, "reward": 0.6041666865348816, "reward_std": 0.34674228355288506, "rewards/accuracy_reward": 0.6041666865348816, "rewards/format_reward": 0.0, "step": 640 }, { "completion_length": 365.06250762939453, "epoch": 0.6837333333333333, "grad_norm": 1.4079667329788208, "kl": 1.248046875, "learning_rate": 8.237502174650336e-07, "loss": -0.2064, "reward": 0.6041666865348816, "reward_std": 0.43655746430158615, "rewards/accuracy_reward": 0.6041666865348816, "rewards/format_reward": 0.0, "step": 641 }, { "completion_length": 374.8541793823242, "epoch": 0.6848, "grad_norm": 0.9105111360549927, "kl": 0.81201171875, "learning_rate": 8.187652248875924e-07, "loss": -0.1919, "reward": 0.520833358168602, "reward_std": 0.34674229472875595, "rewards/accuracy_reward": 0.520833358168602, "rewards/format_reward": 0.0, "step": 642 }, { "completion_length": 390.3958396911621, "epoch": 0.6858666666666666, "grad_norm": 3.103208065032959, "kl": 1.9169921875, "learning_rate": 8.137896933942495e-07, "loss": -0.1925, "reward": 0.645833358168602, "reward_std": 0.38161086291074753, "rewards/accuracy_reward": 0.645833358168602, "rewards/format_reward": 0.0, "step": 643 }, { "completion_length": 604.2291870117188, "epoch": 0.6869333333333333, "grad_norm": 0.8146976828575134, "kl": 1.3701171875, "learning_rate": 8.088236920858835e-07, "loss": -0.3179, "reward": 0.45833333395421505, "reward_std": 0.4248107150197029, "rewards/accuracy_reward": 0.45833333395421505, "rewards/format_reward": 0.0, "step": 644 }, { "completion_length": 433.9166793823242, "epoch": 0.688, "grad_norm": 0.6227619647979736, "kl": 1.2216796875, "learning_rate": 8.038672899310176e-07, "loss": -0.1728, "reward": 0.4375000149011612, "reward_std": 0.3074580505490303, "rewards/accuracy_reward": 0.4375000149011612, "rewards/format_reward": 0.0, "step": 645 }, { "completion_length": 407.64583587646484, "epoch": 0.6890666666666667, "grad_norm": 5.091135501861572, "kl": 1.703125, "learning_rate": 7.989205557648598e-07, "loss": -0.0914, "reward": 0.4166666865348816, "reward_std": 0.2861081697046757, "rewards/accuracy_reward": 0.4166666865348816, "rewards/format_reward": 0.0, "step": 646 }, { "completion_length": 368.7916793823242, "epoch": 0.6901333333333334, "grad_norm": 1.0875145196914673, "kl": 2.232177734375, "learning_rate": 7.939835582883478e-07, "loss": -0.0804, "reward": 0.4375000260770321, "reward_std": 0.28219256177544594, "rewards/accuracy_reward": 0.4375000260770321, "rewards/format_reward": 0.0, "step": 647 }, { "completion_length": 286.4166793823242, "epoch": 0.6912, "grad_norm": 1.0956978797912598, "kl": 1.4345703125, "learning_rate": 7.890563660671952e-07, "loss": -0.2279, "reward": 0.4791666828095913, "reward_std": 0.3170611336827278, "rewards/accuracy_reward": 0.4791666828095913, "rewards/format_reward": 0.0, "step": 648 }, { "completion_length": 378.50000762939453, "epoch": 0.6922666666666667, "grad_norm": 0.8587469458580017, "kl": 1.2119140625, "learning_rate": 7.841390475309386e-07, "loss": -0.1953, "reward": 0.4166666716337204, "reward_std": 0.3506578840315342, "rewards/accuracy_reward": 0.4166666716337204, "rewards/format_reward": 0.0, "step": 649 }, { "completion_length": 412.37501525878906, "epoch": 0.6933333333333334, "grad_norm": 1.27677321434021, "kl": 1.4302978515625, "learning_rate": 7.792316709719875e-07, "loss": -0.1338, "reward": 0.645833358168602, "reward_std": 0.34674228355288506, "rewards/accuracy_reward": 0.645833358168602, "rewards/format_reward": 0.0, "step": 650 }, { "completion_length": 465.62501525878906, "epoch": 0.6944, "grad_norm": 0.808260977268219, "kl": 0.794921875, "learning_rate": 7.743343045446756e-07, "loss": -0.1961, "reward": 0.5625000149011612, "reward_std": 0.41129201650619507, "rewards/accuracy_reward": 0.5625000149011612, "rewards/format_reward": 0.0, "step": 651 }, { "completion_length": 294.6666793823242, "epoch": 0.6954666666666667, "grad_norm": 1.425496220588684, "kl": 1.849609375, "learning_rate": 7.694470162643147e-07, "loss": -0.2499, "reward": 0.43750000558793545, "reward_std": 0.2525113932788372, "rewards/accuracy_reward": 0.43750000558793545, "rewards/format_reward": 0.0, "step": 652 }, { "completion_length": 373.3958549499512, "epoch": 0.6965333333333333, "grad_norm": 1.9875823259353638, "kl": 1.239959716796875, "learning_rate": 7.6456987400625e-07, "loss": -0.145, "reward": 0.6875000223517418, "reward_std": 0.28219255805015564, "rewards/accuracy_reward": 0.6875000223517418, "rewards/format_reward": 0.0, "step": 653 }, { "completion_length": 299.7291793823242, "epoch": 0.6976, "grad_norm": 1.2214486598968506, "kl": 2.80078125, "learning_rate": 7.59702945504917e-07, "loss": -0.355, "reward": 0.4166666828095913, "reward_std": 0.4326418787240982, "rewards/accuracy_reward": 0.4166666828095913, "rewards/format_reward": 0.0, "step": 654 }, { "completion_length": 338.66667556762695, "epoch": 0.6986666666666667, "grad_norm": 1.592851996421814, "kl": 1.9635009765625, "learning_rate": 7.548462983529016e-07, "loss": -0.0822, "reward": 0.4791666716337204, "reward_std": 0.2699456810951233, "rewards/accuracy_reward": 0.4791666716337204, "rewards/format_reward": 0.0, "step": 655 }, { "completion_length": 483.5000228881836, "epoch": 0.6997333333333333, "grad_norm": 0.57366943359375, "kl": 0.75933837890625, "learning_rate": 7.500000000000003e-07, "loss": -0.1271, "reward": 0.7500000298023224, "reward_std": 0.23116153478622437, "rewards/accuracy_reward": 0.7500000298023224, "rewards/format_reward": 0.0, "step": 656 }, { "completion_length": 398.6458396911621, "epoch": 0.7008, "grad_norm": 0.5850063562393188, "kl": 0.904541015625, "learning_rate": 7.451641177522844e-07, "loss": -0.2188, "reward": 0.541666679084301, "reward_std": 0.23116153478622437, "rewards/accuracy_reward": 0.541666679084301, "rewards/format_reward": 0.0, "step": 657 }, { "completion_length": 392.9791717529297, "epoch": 0.7018666666666666, "grad_norm": 3.8712940216064453, "kl": 6.947265625, "learning_rate": 7.40338718771165e-07, "loss": -0.2333, "reward": 0.6875000149011612, "reward_std": 0.41912322491407394, "rewards/accuracy_reward": 0.6875000149011612, "rewards/format_reward": 0.0, "step": 658 }, { "completion_length": 506.0208511352539, "epoch": 0.7029333333333333, "grad_norm": 0.606567919254303, "kl": 0.7677001953125, "learning_rate": 7.355238700724594e-07, "loss": -0.1482, "reward": 0.4791666865348816, "reward_std": 0.33713917806744576, "rewards/accuracy_reward": 0.4791666865348816, "rewards/format_reward": 0.0, "step": 659 }, { "completion_length": 407.0000190734863, "epoch": 0.704, "grad_norm": 1.817237377166748, "kl": 3.267822265625, "learning_rate": 7.307196385254621e-07, "loss": -0.1994, "reward": 0.4583333507180214, "reward_std": 0.3602609857916832, "rewards/accuracy_reward": 0.4583333507180214, "rewards/format_reward": 0.0, "step": 660 }, { "completion_length": 309.5208511352539, "epoch": 0.7050666666666666, "grad_norm": 1.4466520547866821, "kl": 1.7333984375, "learning_rate": 7.259260908520137e-07, "loss": -0.2772, "reward": 0.4791666939854622, "reward_std": 0.44616056233644485, "rewards/accuracy_reward": 0.4791666939854622, "rewards/format_reward": 0.0, "step": 661 }, { "completion_length": 507.3125228881836, "epoch": 0.7061333333333333, "grad_norm": 0.7477433681488037, "kl": 0.8857421875, "learning_rate": 7.211432936255779e-07, "loss": -0.1608, "reward": 0.5625000074505806, "reward_std": 0.43832940608263016, "rewards/accuracy_reward": 0.5625000074505806, "rewards/format_reward": 0.0, "step": 662 }, { "completion_length": 383.3541717529297, "epoch": 0.7072, "grad_norm": 4.203952312469482, "kl": 1.6162109375, "learning_rate": 7.163713132703127e-07, "loss": -0.1087, "reward": 0.5208333507180214, "reward_std": 0.45663557201623917, "rewards/accuracy_reward": 0.5208333507180214, "rewards/format_reward": 0.0, "step": 663 }, { "completion_length": 366.8541717529297, "epoch": 0.7082666666666667, "grad_norm": 1.4318974018096924, "kl": 2.447265625, "learning_rate": 7.116102160601505e-07, "loss": -0.2281, "reward": 0.354166679084301, "reward_std": 0.4932760149240494, "rewards/accuracy_reward": 0.354166679084301, "rewards/format_reward": 0.0, "step": 664 }, { "completion_length": 386.56251525878906, "epoch": 0.7093333333333334, "grad_norm": 1.772848129272461, "kl": 2.5546875, "learning_rate": 7.068600681178772e-07, "loss": -0.1966, "reward": 0.6666666865348816, "reward_std": 0.3680921532213688, "rewards/accuracy_reward": 0.6666666865348816, "rewards/format_reward": 0.0, "step": 665 }, { "completion_length": 447.0416793823242, "epoch": 0.7104, "grad_norm": 4.727251052856445, "kl": 4.646484375, "learning_rate": 7.021209354142133e-07, "loss": -0.1202, "reward": 0.6666666865348816, "reward_std": 0.3977733142673969, "rewards/accuracy_reward": 0.6666666865348816, "rewards/format_reward": 0.0, "step": 666 }, { "completion_length": 533.7500152587891, "epoch": 0.7114666666666667, "grad_norm": 38.85273742675781, "kl": 1.0753173828125, "learning_rate": 6.97392883766899e-07, "loss": -0.1272, "reward": 0.770833358168602, "reward_std": 0.3170611262321472, "rewards/accuracy_reward": 0.770833358168602, "rewards/format_reward": 0.0, "step": 667 }, { "completion_length": 441.62501525878906, "epoch": 0.7125333333333334, "grad_norm": 2.2018003463745117, "kl": 0.9462890625, "learning_rate": 6.926759788397783e-07, "loss": -0.0792, "reward": 0.5625000149011612, "reward_std": 0.35457348823547363, "rewards/accuracy_reward": 0.5625000149011612, "rewards/format_reward": 0.0, "step": 668 }, { "completion_length": 525.7916870117188, "epoch": 0.7136, "grad_norm": 8.468934059143066, "kl": 1.03369140625, "learning_rate": 6.879702861418883e-07, "loss": 0.1546, "reward": 0.6875000149011612, "reward_std": 0.21764283254742622, "rewards/accuracy_reward": 0.6875000149011612, "rewards/format_reward": 0.0, "step": 669 }, { "completion_length": 601.5000076293945, "epoch": 0.7146666666666667, "grad_norm": 0.3675132691860199, "kl": 0.978515625, "learning_rate": 6.832758710265492e-07, "loss": -0.0634, "reward": 0.354166679084301, "reward_std": 0.2446802258491516, "rewards/accuracy_reward": 0.354166679084301, "rewards/format_reward": 0.0, "step": 670 }, { "completion_length": 508.93751525878906, "epoch": 0.7157333333333333, "grad_norm": 1.1383217573165894, "kl": 1.470703125, "learning_rate": 6.785927986904567e-07, "loss": -0.1695, "reward": 0.5625000074505806, "reward_std": 0.42872631549835205, "rewards/accuracy_reward": 0.5625000074505806, "rewards/format_reward": 0.0, "step": 671 }, { "completion_length": 379.4583435058594, "epoch": 0.7168, "grad_norm": 1.2972238063812256, "kl": 1.58984375, "learning_rate": 6.739211341727761e-07, "loss": -0.1773, "reward": 0.5000000111758709, "reward_std": 0.32274864614009857, "rewards/accuracy_reward": 0.5000000111758709, "rewards/format_reward": 0.0, "step": 672 }, { "completion_length": 416.6666793823242, "epoch": 0.7178666666666667, "grad_norm": 0.41672948002815247, "kl": 0.65234375, "learning_rate": 6.692609423542393e-07, "loss": -0.0909, "reward": 0.5000000149011612, "reward_std": 0.16661179438233376, "rewards/accuracy_reward": 0.5000000149011612, "rewards/format_reward": 0.0, "step": 673 }, { "completion_length": 562.3333435058594, "epoch": 0.7189333333333333, "grad_norm": 0.7178722620010376, "kl": 1.03125, "learning_rate": 6.646122879562435e-07, "loss": -0.08, "reward": 0.3333333395421505, "reward_std": 0.3602609857916832, "rewards/accuracy_reward": 0.3333333395421505, "rewards/format_reward": 0.0, "step": 674 }, { "completion_length": 522.8750228881836, "epoch": 0.72, "grad_norm": 0.7057859897613525, "kl": 0.8994140625, "learning_rate": 6.599752355399538e-07, "loss": -0.062, "reward": 0.6875000149011612, "reward_std": 0.299626849591732, "rewards/accuracy_reward": 0.6875000149011612, "rewards/format_reward": 0.0, "step": 675 }, { "completion_length": 658.2500152587891, "epoch": 0.7210666666666666, "grad_norm": 0.6263889074325562, "kl": 0.932861328125, "learning_rate": 6.55349849505404e-07, "loss": 0.1142, "reward": 0.3541666828095913, "reward_std": 0.21764283254742622, "rewards/accuracy_reward": 0.3541666828095913, "rewards/format_reward": 0.0, "step": 676 }, { "completion_length": 503.31250762939453, "epoch": 0.7221333333333333, "grad_norm": 1.2372171878814697, "kl": 1.172454833984375, "learning_rate": 6.507361940906042e-07, "loss": -0.0732, "reward": 0.479166679084301, "reward_std": 0.48367295414209366, "rewards/accuracy_reward": 0.479166679084301, "rewards/format_reward": 0.0, "step": 677 }, { "completion_length": 439.22918701171875, "epoch": 0.7232, "grad_norm": 1.79371178150177, "kl": 1.51171875, "learning_rate": 6.461343333706476e-07, "loss": -0.1055, "reward": 0.5416666865348816, "reward_std": 0.38817023858428, "rewards/accuracy_reward": 0.5416666865348816, "rewards/format_reward": 0.0, "step": 678 }, { "completion_length": 518.4791793823242, "epoch": 0.7242666666666666, "grad_norm": 0.9538657069206238, "kl": 1.0654296875, "learning_rate": 6.415443312568216e-07, "loss": -0.3215, "reward": 0.6875000298023224, "reward_std": 0.38161084800958633, "rewards/accuracy_reward": 0.6875000298023224, "rewards/format_reward": 0.0, "step": 679 }, { "completion_length": 543.125, "epoch": 0.7253333333333334, "grad_norm": 0.5762695074081421, "kl": 0.546142578125, "learning_rate": 6.369662514957191e-07, "loss": -0.1024, "reward": 0.6875000298023224, "reward_std": 0.36417656391859055, "rewards/accuracy_reward": 0.6875000298023224, "rewards/format_reward": 0.0, "step": 680 }, { "completion_length": 621.3333435058594, "epoch": 0.7264, "grad_norm": 0.4392271339893341, "kl": 0.7666015625, "learning_rate": 6.324001576683539e-07, "loss": -0.103, "reward": 0.3750000149011612, "reward_std": 0.30354244261980057, "rewards/accuracy_reward": 0.3750000149011612, "rewards/format_reward": 0.0, "step": 681 }, { "completion_length": 543.2500305175781, "epoch": 0.7274666666666667, "grad_norm": 0.7209781408309937, "kl": 1.259765625, "learning_rate": 6.278461131892775e-07, "loss": -0.0577, "reward": 0.5000000149011612, "reward_std": 0.24859579652547836, "rewards/accuracy_reward": 0.5000000149011612, "rewards/format_reward": 0.0, "step": 682 }, { "completion_length": 357.2916717529297, "epoch": 0.7285333333333334, "grad_norm": 0.7437350749969482, "kl": 0.86279296875, "learning_rate": 6.233041813056982e-07, "loss": -0.1902, "reward": 0.6666666865348816, "reward_std": 0.2957112528383732, "rewards/accuracy_reward": 0.6666666865348816, "rewards/format_reward": 0.0, "step": 683 }, { "completion_length": 473.8333511352539, "epoch": 0.7296, "grad_norm": 1.4083120822906494, "kl": 0.8643798828125, "learning_rate": 6.187744250966031e-07, "loss": -0.1807, "reward": 0.5625000298023224, "reward_std": 0.43655747920274734, "rewards/accuracy_reward": 0.5625000298023224, "rewards/format_reward": 0.0, "step": 684 }, { "completion_length": 516.5208435058594, "epoch": 0.7306666666666667, "grad_norm": 2.1828339099884033, "kl": 1.04345703125, "learning_rate": 6.142569074718818e-07, "loss": -0.1162, "reward": 0.604166679084301, "reward_std": 0.3266642242670059, "rewards/accuracy_reward": 0.604166679084301, "rewards/format_reward": 0.0, "step": 685 }, { "completion_length": 550.5208435058594, "epoch": 0.7317333333333333, "grad_norm": 0.6842460036277771, "kl": 0.6025390625, "learning_rate": 6.097516911714523e-07, "loss": -0.2038, "reward": 0.5208333395421505, "reward_std": 0.40168891102075577, "rewards/accuracy_reward": 0.5208333395421505, "rewards/format_reward": 0.0, "step": 686 }, { "completion_length": 418.87500762939453, "epoch": 0.7328, "grad_norm": 1.221979022026062, "kl": 1.14453125, "learning_rate": 6.052588387643908e-07, "loss": -0.1391, "reward": 0.37500001303851604, "reward_std": 0.2861081622540951, "rewards/accuracy_reward": 0.37500001303851604, "rewards/format_reward": 0.0, "step": 687 }, { "completion_length": 452.0208435058594, "epoch": 0.7338666666666667, "grad_norm": 1.6487637758255005, "kl": 1.427734375, "learning_rate": 6.007784126480615e-07, "loss": -0.1352, "reward": 0.2708333395421505, "reward_std": 0.25515517964959145, "rewards/accuracy_reward": 0.2708333395421505, "rewards/format_reward": 0.0, "step": 688 }, { "completion_length": 548.2916946411133, "epoch": 0.7349333333333333, "grad_norm": 0.9768587350845337, "kl": 0.576416015625, "learning_rate": 5.963104750472507e-07, "loss": 0.0031, "reward": 0.3958333507180214, "reward_std": 0.34674228355288506, "rewards/accuracy_reward": 0.3958333507180214, "rewards/format_reward": 0.0, "step": 689 }, { "completion_length": 465.6041793823242, "epoch": 0.736, "grad_norm": 0.41289690136909485, "kl": 0.77392578125, "learning_rate": 5.918550880133018e-07, "loss": -0.0799, "reward": 0.7083333432674408, "reward_std": 0.2861081510782242, "rewards/accuracy_reward": 0.7083333432674408, "rewards/format_reward": 0.0, "step": 690 }, { "completion_length": 527.0625076293945, "epoch": 0.7370666666666666, "grad_norm": 0.4476119577884674, "kl": 0.69085693359375, "learning_rate": 5.874123134232558e-07, "loss": -0.0509, "reward": 0.5000000149011612, "reward_std": 0.18404608964920044, "rewards/accuracy_reward": 0.5000000149011612, "rewards/format_reward": 0.0, "step": 691 }, { "completion_length": 397.6666793823242, "epoch": 0.7381333333333333, "grad_norm": 0.6019869446754456, "kl": 0.674591064453125, "learning_rate": 5.829822129789891e-07, "loss": -0.0704, "reward": 0.8125000074505806, "reward_std": 0.1705274023115635, "rewards/accuracy_reward": 0.8125000074505806, "rewards/format_reward": 0.0, "step": 692 }, { "completion_length": 517.8541793823242, "epoch": 0.7392, "grad_norm": 0.8973357677459717, "kl": 1.37640380859375, "learning_rate": 5.785648482063575e-07, "loss": -0.257, "reward": 0.6458333507180214, "reward_std": 0.35457348078489304, "rewards/accuracy_reward": 0.6458333507180214, "rewards/format_reward": 0.0, "step": 693 }, { "completion_length": 484.66668701171875, "epoch": 0.7402666666666666, "grad_norm": 74.59620666503906, "kl": 8.19091796875, "learning_rate": 5.741602804543429e-07, "loss": 0.5877, "reward": 0.7500000149011612, "reward_std": 0.3680921718478203, "rewards/accuracy_reward": 0.7500000149011612, "rewards/format_reward": 0.0, "step": 694 }, { "completion_length": 666.5833511352539, "epoch": 0.7413333333333333, "grad_norm": 0.33290398120880127, "kl": 0.30859375, "learning_rate": 5.697685708941996e-07, "loss": -0.0457, "reward": 0.3958333358168602, "reward_std": 0.13301505148410797, "rewards/accuracy_reward": 0.3958333358168602, "rewards/format_reward": 0.0, "step": 695 }, { "completion_length": 566.7500152587891, "epoch": 0.7424, "grad_norm": 0.5462177991867065, "kl": 0.46270751953125, "learning_rate": 5.653897805186062e-07, "loss": -0.0982, "reward": 0.7916666865348816, "reward_std": 0.20412413775920868, "rewards/accuracy_reward": 0.7916666865348816, "rewards/format_reward": 0.0, "step": 696 }, { "completion_length": 417.0833435058594, "epoch": 0.7434666666666667, "grad_norm": 0.5006996393203735, "kl": 0.36529541015625, "learning_rate": 5.610239701408176e-07, "loss": -0.0623, "reward": 0.7500000149011612, "reward_std": 0.31314554065465927, "rewards/accuracy_reward": 0.7500000149011612, "rewards/format_reward": 0.0, "step": 697 }, { "completion_length": 710.6875152587891, "epoch": 0.7445333333333334, "grad_norm": 0.5701322555541992, "kl": 0.830078125, "learning_rate": 5.566712003938203e-07, "loss": -0.1474, "reward": 0.3541666716337204, "reward_std": 0.2350771278142929, "rewards/accuracy_reward": 0.3541666716337204, "rewards/format_reward": 0.0, "step": 698 }, { "completion_length": 479.33333587646484, "epoch": 0.7456, "grad_norm": 0.4590436816215515, "kl": 0.68878173828125, "learning_rate": 5.52331531729491e-07, "loss": -0.0586, "reward": 0.520833358168602, "reward_std": 0.3170611225068569, "rewards/accuracy_reward": 0.520833358168602, "rewards/format_reward": 0.0, "step": 699 }, { "completion_length": 530.2916793823242, "epoch": 0.7466666666666667, "grad_norm": 0.5251709818840027, "kl": 1.177001953125, "learning_rate": 5.480050244177573e-07, "loss": -0.0989, "reward": 0.6250000223517418, "reward_std": 0.25819891691207886, "rewards/accuracy_reward": 0.6250000223517418, "rewards/format_reward": 0.0, "step": 700 }, { "completion_length": 518.5416793823242, "epoch": 0.7477333333333334, "grad_norm": 0.6059989333152771, "kl": 0.60107421875, "learning_rate": 5.436917385457589e-07, "loss": 0.0242, "reward": 0.6875000149011612, "reward_std": 0.11558076366782188, "rewards/accuracy_reward": 0.6875000149011612, "rewards/format_reward": 0.0, "step": 701 }, { "completion_length": 418.75001525878906, "epoch": 0.7488, "grad_norm": 0.4303725063800812, "kl": 0.51861572265625, "learning_rate": 5.393917340170151e-07, "loss": -0.0373, "reward": 0.708333358168602, "reward_std": 0.24859580025076866, "rewards/accuracy_reward": 0.708333358168602, "rewards/format_reward": 0.0, "step": 702 }, { "completion_length": 481.4791793823242, "epoch": 0.7498666666666667, "grad_norm": 4.108148097991943, "kl": 2.7802734375, "learning_rate": 5.351050705505919e-07, "loss": -0.0646, "reward": 0.6458333432674408, "reward_std": 0.41129200905561447, "rewards/accuracy_reward": 0.6458333432674408, "rewards/format_reward": 0.0, "step": 703 }, { "completion_length": 454.0000228881836, "epoch": 0.7509333333333333, "grad_norm": 0.5140090584754944, "kl": 0.9378662109375, "learning_rate": 5.308318076802728e-07, "loss": -0.0934, "reward": 0.7500000298023224, "reward_std": 0.3680921457707882, "rewards/accuracy_reward": 0.7500000298023224, "rewards/format_reward": 0.0, "step": 704 }, { "completion_length": 552.9375152587891, "epoch": 0.752, "grad_norm": 1.6533596515655518, "kl": 1.04498291015625, "learning_rate": 5.265720047537318e-07, "loss": -0.0528, "reward": 0.6041666716337204, "reward_std": 0.35457347333431244, "rewards/accuracy_reward": 0.6041666716337204, "rewards/format_reward": 0.0, "step": 705 }, { "completion_length": 545.7083435058594, "epoch": 0.7530666666666667, "grad_norm": 0.8393456339836121, "kl": 1.177093505859375, "learning_rate": 5.223257209317092e-07, "loss": -0.1265, "reward": 0.645833358168602, "reward_std": 0.34674228727817535, "rewards/accuracy_reward": 0.645833358168602, "rewards/format_reward": 0.0, "step": 706 }, { "completion_length": 570.5208587646484, "epoch": 0.7541333333333333, "grad_norm": 1.196060299873352, "kl": 1.0955810546875, "learning_rate": 5.180930151871906e-07, "loss": 0.0045, "reward": 0.7916666865348816, "reward_std": 0.23116152733564377, "rewards/accuracy_reward": 0.7916666865348816, "rewards/format_reward": 0.0, "step": 707 }, { "completion_length": 481.0833435058594, "epoch": 0.7552, "grad_norm": 5.495587348937988, "kl": 1.9208984375, "learning_rate": 5.138739463045863e-07, "loss": -0.1908, "reward": 0.7500000149011612, "reward_std": 0.4152076169848442, "rewards/accuracy_reward": 0.7500000149011612, "rewards/format_reward": 0.0, "step": 708 }, { "completion_length": 457.33333587646484, "epoch": 0.7562666666666666, "grad_norm": 0.5725620985031128, "kl": 1.122802734375, "learning_rate": 5.096685728789175e-07, "loss": -0.057, "reward": 0.6458333432674408, "reward_std": 0.2350771278142929, "rewards/accuracy_reward": 0.6458333432674408, "rewards/format_reward": 0.0, "step": 709 }, { "completion_length": 555.7916870117188, "epoch": 0.7573333333333333, "grad_norm": 0.14078089594841003, "kl": 0.0555572509765625, "learning_rate": 5.054769533149999e-07, "loss": 0.0196, "reward": 0.7500000149011612, "reward_std": 0.10206206887960434, "rewards/accuracy_reward": 0.7500000149011612, "rewards/format_reward": 0.0, "step": 710 }, { "completion_length": 501.62501525878906, "epoch": 0.7584, "grad_norm": 0.8129284381866455, "kl": 1.023681640625, "learning_rate": 5.012991458266337e-07, "loss": -0.022, "reward": 0.5833333432674408, "reward_std": 0.24859581887722015, "rewards/accuracy_reward": 0.5833333432674408, "rewards/format_reward": 0.0, "step": 711 }, { "completion_length": 428.93751525878906, "epoch": 0.7594666666666666, "grad_norm": 1.4528568983078003, "kl": 1.968505859375, "learning_rate": 4.971352084357953e-07, "loss": -0.1353, "reward": 0.33333333395421505, "reward_std": 0.24859581515192986, "rewards/accuracy_reward": 0.33333333395421505, "rewards/format_reward": 0.0, "step": 712 }, { "completion_length": 523.1041870117188, "epoch": 0.7605333333333333, "grad_norm": 3.810222864151001, "kl": 1.7374267578125, "learning_rate": 4.92985198971831e-07, "loss": -0.0013, "reward": 0.7708333432674408, "reward_std": 0.05103103443980217, "rewards/accuracy_reward": 0.7708333432674408, "rewards/format_reward": 0.0, "step": 713 }, { "completion_length": 460.00000762939453, "epoch": 0.7616, "grad_norm": 0.7631555795669556, "kl": 0.8095703125, "learning_rate": 4.888491750706547e-07, "loss": -0.1797, "reward": 0.5625000260770321, "reward_std": 0.357217263430357, "rewards/accuracy_reward": 0.5625000260770321, "rewards/format_reward": 0.0, "step": 714 }, { "completion_length": 356.93751525878906, "epoch": 0.7626666666666667, "grad_norm": 0.4716258645057678, "kl": 0.50054931640625, "learning_rate": 4.847271941739458e-07, "loss": -0.0549, "reward": 0.6666666716337204, "reward_std": 0.19364918768405914, "rewards/accuracy_reward": 0.6666666716337204, "rewards/format_reward": 0.0, "step": 715 }, { "completion_length": 409.1041793823242, "epoch": 0.7637333333333334, "grad_norm": 0.954680860042572, "kl": 1.60302734375, "learning_rate": 4.806193135283535e-07, "loss": -0.1878, "reward": 0.6041667014360428, "reward_std": 0.46359483897686005, "rewards/accuracy_reward": 0.6041667014360428, "rewards/format_reward": 0.0, "step": 716 }, { "completion_length": 405.9583435058594, "epoch": 0.7648, "grad_norm": 0.6702188849449158, "kl": 0.9315185546875, "learning_rate": 4.765255901847003e-07, "loss": -0.0437, "reward": 0.6250000149011612, "reward_std": 0.10206206887960434, "rewards/accuracy_reward": 0.6250000149011612, "rewards/format_reward": 0.0, "step": 717 }, { "completion_length": 547.9375152587891, "epoch": 0.7658666666666667, "grad_norm": 1.7199496030807495, "kl": 0.5561065673828125, "learning_rate": 4.7244608099719e-07, "loss": -0.0485, "reward": 0.6458333432674408, "reward_std": 0.2350771315395832, "rewards/accuracy_reward": 0.6458333432674408, "rewards/format_reward": 0.0, "step": 718 }, { "completion_length": 577.5000152587891, "epoch": 0.7669333333333334, "grad_norm": 0.413357138633728, "kl": 0.59478759765625, "learning_rate": 4.6838084262261776e-07, "loss": -0.1147, "reward": 0.7291666828095913, "reward_std": 0.21764283627271652, "rewards/accuracy_reward": 0.7291666828095913, "rewards/format_reward": 0.0, "step": 719 }, { "completion_length": 549.8750152587891, "epoch": 0.768, "grad_norm": 1.4418047666549683, "kl": 0.8048095703125, "learning_rate": 4.643299315195855e-07, "loss": -0.0281, "reward": 0.6666666865348816, "reward_std": 0.3410547971725464, "rewards/accuracy_reward": 0.6666666865348816, "rewards/format_reward": 0.0, "step": 720 }, { "completion_length": 543.6458511352539, "epoch": 0.7690666666666667, "grad_norm": 0.77348393201828, "kl": 0.6099853515625, "learning_rate": 4.6029340394771426e-07, "loss": -0.0122, "reward": 0.5625000149011612, "reward_std": 0.41912320256233215, "rewards/accuracy_reward": 0.5625000149011612, "rewards/format_reward": 0.0, "step": 721 }, { "completion_length": 462.8958511352539, "epoch": 0.7701333333333333, "grad_norm": 1.136657953262329, "kl": 0.72021484375, "learning_rate": 4.562713159668648e-07, "loss": -0.034, "reward": 0.5833333507180214, "reward_std": 0.2957112640142441, "rewards/accuracy_reward": 0.5833333507180214, "rewards/format_reward": 0.0, "step": 722 }, { "completion_length": 526.6666793823242, "epoch": 0.7712, "grad_norm": 0.8422014117240906, "kl": 0.73046875, "learning_rate": 4.522637234363593e-07, "loss": -0.0883, "reward": 0.7291666865348816, "reward_std": 0.28219256550073624, "rewards/accuracy_reward": 0.7291666865348816, "rewards/format_reward": 0.0, "step": 723 }, { "completion_length": 427.6458511352539, "epoch": 0.7722666666666667, "grad_norm": 0.6945573091506958, "kl": 0.75732421875, "learning_rate": 4.4827068201420486e-07, "loss": -0.0169, "reward": 0.7291667014360428, "reward_std": 0.2446802221238613, "rewards/accuracy_reward": 0.7291667014360428, "rewards/format_reward": 0.0, "step": 724 }, { "completion_length": 574.8750152587891, "epoch": 0.7733333333333333, "grad_norm": 0.6600297689437866, "kl": 0.746063232421875, "learning_rate": 4.442922471563205e-07, "loss": -0.0801, "reward": 0.729166679084301, "reward_std": 0.2350771203637123, "rewards/accuracy_reward": 0.729166679084301, "rewards/format_reward": 0.0, "step": 725 }, { "completion_length": 565.7500152587891, "epoch": 0.7744, "grad_norm": 0.7589944005012512, "kl": 0.6602783203125, "learning_rate": 4.4032847411576785e-07, "loss": -0.0616, "reward": 0.33333334140479565, "reward_std": 0.16661179810762405, "rewards/accuracy_reward": 0.33333334140479565, "rewards/format_reward": 0.0, "step": 726 }, { "completion_length": 608.7291870117188, "epoch": 0.7754666666666666, "grad_norm": 1.1247073411941528, "kl": 1.03857421875, "learning_rate": 4.3637941794198264e-07, "loss": -0.0898, "reward": 0.6458333730697632, "reward_std": 0.44616056233644485, "rewards/accuracy_reward": 0.6458333730697632, "rewards/format_reward": 0.0, "step": 727 }, { "completion_length": 502.85418701171875, "epoch": 0.7765333333333333, "grad_norm": 0.532287061214447, "kl": 0.68115234375, "learning_rate": 4.3244513348001104e-07, "loss": -0.0845, "reward": 0.7291666865348816, "reward_std": 0.34674228727817535, "rewards/accuracy_reward": 0.7291666865348816, "rewards/format_reward": 0.0, "step": 728 }, { "completion_length": 449.68750381469727, "epoch": 0.7776, "grad_norm": 1.742480754852295, "kl": 1.7890625, "learning_rate": 4.2852567536974705e-07, "loss": -0.0597, "reward": 0.5416666828095913, "reward_std": 0.31314555555582047, "rewards/accuracy_reward": 0.5416666828095913, "rewards/format_reward": 0.0, "step": 729 }, { "completion_length": 445.7708435058594, "epoch": 0.7786666666666666, "grad_norm": 2.2945778369903564, "kl": 3.47265625, "learning_rate": 4.24621098045175e-07, "loss": -0.1491, "reward": 0.5416666716337204, "reward_std": 0.4326419085264206, "rewards/accuracy_reward": 0.5416666716337204, "rewards/format_reward": 0.0, "step": 730 }, { "completion_length": 699.0833511352539, "epoch": 0.7797333333333333, "grad_norm": 0.8936269879341125, "kl": 0.8231201171875, "learning_rate": 4.2073145573361197e-07, "loss": -0.0557, "reward": 0.6458333656191826, "reward_std": 0.36417656391859055, "rewards/accuracy_reward": 0.6458333656191826, "rewards/format_reward": 0.0, "step": 731 }, { "completion_length": 456.6666717529297, "epoch": 0.7808, "grad_norm": 2.6436824798583984, "kl": 1.1591796875, "learning_rate": 4.168568024549562e-07, "loss": 0.0954, "reward": 0.6666666716337204, "reward_std": 0.11949635669589043, "rewards/accuracy_reward": 0.6666666716337204, "rewards/format_reward": 0.0, "step": 732 }, { "completion_length": 621.4166870117188, "epoch": 0.7818666666666667, "grad_norm": 0.9900630712509155, "kl": 1.2640380859375, "learning_rate": 4.129971920209359e-07, "loss": -0.1447, "reward": 0.4583333469927311, "reward_std": 0.3410547971725464, "rewards/accuracy_reward": 0.4583333469927311, "rewards/format_reward": 0.0, "step": 733 }, { "completion_length": 493.7291717529297, "epoch": 0.7829333333333334, "grad_norm": 0.7805712819099426, "kl": 1.392578125, "learning_rate": 4.0915267803436186e-07, "loss": -0.1085, "reward": 0.5625000298023224, "reward_std": 0.35457347333431244, "rewards/accuracy_reward": 0.5625000298023224, "rewards/format_reward": 0.0, "step": 734 }, { "completion_length": 550.6250305175781, "epoch": 0.784, "grad_norm": 1.0304354429244995, "kl": 0.6483154296875, "learning_rate": 4.053233138883835e-07, "loss": -0.1286, "reward": 0.6666666865348816, "reward_std": 0.3506578952074051, "rewards/accuracy_reward": 0.6666666865348816, "rewards/format_reward": 0.0, "step": 735 }, { "completion_length": 480.0208511352539, "epoch": 0.7850666666666667, "grad_norm": 0.8233749866485596, "kl": 1.53216552734375, "learning_rate": 4.015091527657472e-07, "loss": -0.0865, "reward": 0.6250000074505806, "reward_std": 0.18404609709978104, "rewards/accuracy_reward": 0.6250000074505806, "rewards/format_reward": 0.0, "step": 736 }, { "completion_length": 464.7083511352539, "epoch": 0.7861333333333334, "grad_norm": 1.3681738376617432, "kl": 0.77392578125, "learning_rate": 3.977102476380576e-07, "loss": -0.0859, "reward": 0.5416666716337204, "reward_std": 0.38552645593881607, "rewards/accuracy_reward": 0.5416666716337204, "rewards/format_reward": 0.0, "step": 737 }, { "completion_length": 511.87501525878906, "epoch": 0.7872, "grad_norm": 0.9729070067405701, "kl": 2.478515625, "learning_rate": 3.9392665126504196e-07, "loss": -0.0914, "reward": 0.4583333395421505, "reward_std": 0.4152076318860054, "rewards/accuracy_reward": 0.4583333395421505, "rewards/format_reward": 0.0, "step": 738 }, { "completion_length": 624.4166870117188, "epoch": 0.7882666666666667, "grad_norm": 0.617068886756897, "kl": 1.2333984375, "learning_rate": 3.901584161938172e-07, "loss": -0.0364, "reward": 0.6250000298023224, "reward_std": 0.3506578952074051, "rewards/accuracy_reward": 0.6250000298023224, "rewards/format_reward": 0.0, "step": 739 }, { "completion_length": 664.5625305175781, "epoch": 0.7893333333333333, "grad_norm": 0.38951918482780457, "kl": 0.4439697265625, "learning_rate": 3.864055947581605e-07, "loss": -0.0648, "reward": 0.6250000149011612, "reward_std": 0.23116152361035347, "rewards/accuracy_reward": 0.6250000149011612, "rewards/format_reward": 0.0, "step": 740 }, { "completion_length": 530.7291946411133, "epoch": 0.7904, "grad_norm": 0.7132443189620972, "kl": 1.1328125, "learning_rate": 3.8266823907778244e-07, "loss": -0.1378, "reward": 0.5000000186264515, "reward_std": 0.3506578728556633, "rewards/accuracy_reward": 0.5000000186264515, "rewards/format_reward": 0.0, "step": 741 }, { "completion_length": 408.91668701171875, "epoch": 0.7914666666666667, "grad_norm": 2.50813364982605, "kl": 1.0703125, "learning_rate": 3.7894640105760217e-07, "loss": -0.0268, "reward": 0.6041666865348816, "reward_std": 0.35457348823547363, "rewards/accuracy_reward": 0.6041666865348816, "rewards/format_reward": 0.0, "step": 742 }, { "completion_length": 492.43751525878906, "epoch": 0.7925333333333333, "grad_norm": 1.072017788887024, "kl": 1.3250732421875, "learning_rate": 3.7524013238702907e-07, "loss": -0.2014, "reward": 0.4166666865348816, "reward_std": 0.3680921532213688, "rewards/accuracy_reward": 0.4166666865348816, "rewards/format_reward": 0.0, "step": 743 }, { "completion_length": 428.68750762939453, "epoch": 0.7936, "grad_norm": 0.57767653465271, "kl": 1.490234375, "learning_rate": 3.715494845392418e-07, "loss": -0.1023, "reward": 0.3750000111758709, "reward_std": 0.3131455257534981, "rewards/accuracy_reward": 0.3750000111758709, "rewards/format_reward": 0.0, "step": 744 }, { "completion_length": 640.7500152587891, "epoch": 0.7946666666666666, "grad_norm": 0.4153386950492859, "kl": 0.1949462890625, "learning_rate": 3.6787450877047543e-07, "loss": -0.0705, "reward": 0.7500000298023224, "reward_std": 0.2957112565636635, "rewards/accuracy_reward": 0.7500000298023224, "rewards/format_reward": 0.0, "step": 745 }, { "completion_length": 447.4791793823242, "epoch": 0.7957333333333333, "grad_norm": 0.7878170013427734, "kl": 0.820556640625, "learning_rate": 3.6421525611930873e-07, "loss": -0.1322, "reward": 0.6458333730697632, "reward_std": 0.38161084055900574, "rewards/accuracy_reward": 0.6458333730697632, "rewards/format_reward": 0.0, "step": 746 }, { "completion_length": 529.6666870117188, "epoch": 0.7968, "grad_norm": 0.4066816568374634, "kl": 0.14483642578125, "learning_rate": 3.6057177740595546e-07, "loss": -0.0263, "reward": 0.6666666828095913, "reward_std": 0.3236205093562603, "rewards/accuracy_reward": 0.6666666828095913, "rewards/format_reward": 0.0, "step": 747 }, { "completion_length": 508.75000762939453, "epoch": 0.7978666666666666, "grad_norm": 0.6631574630737305, "kl": 0.7890625, "learning_rate": 3.569441232315594e-07, "loss": -0.1345, "reward": 0.5625000149011612, "reward_std": 0.31970490515232086, "rewards/accuracy_reward": 0.5625000149011612, "rewards/format_reward": 0.0, "step": 748 }, { "completion_length": 392.89584732055664, "epoch": 0.7989333333333334, "grad_norm": 0.7003363966941833, "kl": 0.9915771484375, "learning_rate": 3.5333234397748987e-07, "loss": -0.0325, "reward": 0.6875000074505806, "reward_std": 0.11558076366782188, "rewards/accuracy_reward": 0.6875000074505806, "rewards/format_reward": 0.0, "step": 749 }, { "completion_length": 432.62500762939453, "epoch": 0.8, "grad_norm": 1.4344288110733032, "kl": 0.714599609375, "learning_rate": 3.4973648980464454e-07, "loss": -0.0629, "reward": 0.6875000111758709, "reward_std": 0.28219256550073624, "rewards/accuracy_reward": 0.6875000111758709, "rewards/format_reward": 0.0, "step": 750 }, { "completion_length": 456.7083435058594, "epoch": 0.8010666666666667, "grad_norm": 0.6702500581741333, "kl": 0.812347412109375, "learning_rate": 3.4615661065275007e-07, "loss": -0.1607, "reward": 0.7291666865348816, "reward_std": 0.34674228727817535, "rewards/accuracy_reward": 0.7291666865348816, "rewards/format_reward": 0.0, "step": 751 }, { "completion_length": 478.4791793823242, "epoch": 0.8021333333333334, "grad_norm": 0.7735109925270081, "kl": 0.4871826171875, "learning_rate": 3.425927562396702e-07, "loss": -0.0696, "reward": 0.7708333507180214, "reward_std": 0.2350771315395832, "rewards/accuracy_reward": 0.7708333507180214, "rewards/format_reward": 0.0, "step": 752 }, { "completion_length": 497.43750762939453, "epoch": 0.8032, "grad_norm": 4.989002704620361, "kl": 1.0400390625, "learning_rate": 3.3904497606071473e-07, "loss": -0.0826, "reward": 0.45833334885537624, "reward_std": 0.3680921569466591, "rewards/accuracy_reward": 0.45833334885537624, "rewards/format_reward": 0.0, "step": 753 }, { "completion_length": 427.31250762939453, "epoch": 0.8042666666666667, "grad_norm": 0.9394975900650024, "kl": 0.3402099609375, "learning_rate": 3.3551331938795246e-07, "loss": 0.0133, "reward": 0.6875000149011612, "reward_std": 0.1530931033194065, "rewards/accuracy_reward": 0.6875000149011612, "rewards/format_reward": 0.0, "step": 754 }, { "completion_length": 479.9166793823242, "epoch": 0.8053333333333333, "grad_norm": 0.9788852334022522, "kl": 1.9697265625, "learning_rate": 3.3199783526952656e-07, "loss": -0.1067, "reward": 0.6041667014360428, "reward_std": 0.28219256177544594, "rewards/accuracy_reward": 0.6041667014360428, "rewards/format_reward": 0.0, "step": 755 }, { "completion_length": 450.2083435058594, "epoch": 0.8064, "grad_norm": 1.0135194063186646, "kl": 1.19873046875, "learning_rate": 3.284985725289734e-07, "loss": -0.1256, "reward": 0.645833358168602, "reward_std": 0.42872628942131996, "rewards/accuracy_reward": 0.645833358168602, "rewards/format_reward": 0.0, "step": 756 }, { "completion_length": 390.1041793823242, "epoch": 0.8074666666666667, "grad_norm": 2.793994426727295, "kl": 1.322265625, "learning_rate": 3.25015579764545e-07, "loss": -0.0744, "reward": 0.6666666865348816, "reward_std": 0.3602609857916832, "rewards/accuracy_reward": 0.6666666865348816, "rewards/format_reward": 0.0, "step": 757 }, { "completion_length": 447.37500762939453, "epoch": 0.8085333333333333, "grad_norm": 0.877232015132904, "kl": 0.6070556640625, "learning_rate": 3.2154890534853295e-07, "loss": -0.0194, "reward": 0.6666666865348816, "reward_std": 0.23116152361035347, "rewards/accuracy_reward": 0.6666666865348816, "rewards/format_reward": 0.0, "step": 758 }, { "completion_length": 629.2292022705078, "epoch": 0.8096, "grad_norm": 0.6380199193954468, "kl": 0.2042999267578125, "learning_rate": 3.1809859742659784e-07, "loss": -0.0382, "reward": 0.666666679084301, "reward_std": 0.23116153478622437, "rewards/accuracy_reward": 0.666666679084301, "rewards/format_reward": 0.0, "step": 759 }, { "completion_length": 619.0833511352539, "epoch": 0.8106666666666666, "grad_norm": 1.2432677745819092, "kl": 0.933349609375, "learning_rate": 3.146647039171002e-07, "loss": -0.167, "reward": 0.41666667722165585, "reward_std": 0.37073594704270363, "rewards/accuracy_reward": 0.41666667722165585, "rewards/format_reward": 0.0, "step": 760 }, { "completion_length": 455.6041793823242, "epoch": 0.8117333333333333, "grad_norm": 0.4903915822505951, "kl": 1.07269287109375, "learning_rate": 3.112472725104345e-07, "loss": -0.1323, "reward": 0.6875, "reward_std": 0.3266642391681671, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 761 }, { "completion_length": 438.27083587646484, "epoch": 0.8128, "grad_norm": 0.7331161499023438, "kl": 1.126953125, "learning_rate": 3.078463506683674e-07, "loss": -0.1265, "reward": 0.5833333432674408, "reward_std": 0.3602609820663929, "rewards/accuracy_reward": 0.5833333432674408, "rewards/format_reward": 0.0, "step": 762 }, { "completion_length": 423.9583511352539, "epoch": 0.8138666666666666, "grad_norm": 0.8931428790092468, "kl": 0.6461181640625, "learning_rate": 3.0446198562337857e-07, "loss": -0.1391, "reward": 0.541666679084301, "reward_std": 0.24859580025076866, "rewards/accuracy_reward": 0.541666679084301, "rewards/format_reward": 0.0, "step": 763 }, { "completion_length": 665.1666831970215, "epoch": 0.8149333333333333, "grad_norm": 1.325466275215149, "kl": 1.394775390625, "learning_rate": 3.0109422437800415e-07, "loss": -0.0193, "reward": 0.4791666865348816, "reward_std": 0.3170611336827278, "rewards/accuracy_reward": 0.4791666865348816, "rewards/format_reward": 0.0, "step": 764 }, { "completion_length": 317.3958511352539, "epoch": 0.816, "grad_norm": 3.5130774974823, "kl": 2.91796875, "learning_rate": 2.977431137041848e-07, "loss": -0.1804, "reward": 0.6041666977107525, "reward_std": 0.31970491260290146, "rewards/accuracy_reward": 0.6041666977107525, "rewards/format_reward": 0.0, "step": 765 }, { "completion_length": 403.00000762939453, "epoch": 0.8170666666666667, "grad_norm": 1.156663179397583, "kl": 1.58447265625, "learning_rate": 2.944087001426154e-07, "loss": -0.0921, "reward": 0.7291666939854622, "reward_std": 0.28219257295131683, "rewards/accuracy_reward": 0.7291666939854622, "rewards/format_reward": 0.0, "step": 766 }, { "completion_length": 514.2291870117188, "epoch": 0.8181333333333334, "grad_norm": 0.5595344305038452, "kl": 0.84912109375, "learning_rate": 2.9109103000209945e-07, "loss": -0.1747, "reward": 0.666666679084301, "reward_std": 0.30354245379567146, "rewards/accuracy_reward": 0.666666679084301, "rewards/format_reward": 0.0, "step": 767 }, { "completion_length": 501.62501525878906, "epoch": 0.8192, "grad_norm": 0.6911687850952148, "kl": 1.05224609375, "learning_rate": 2.877901493589048e-07, "loss": -0.0577, "reward": 0.6666666865348816, "reward_std": 0.38552645593881607, "rewards/accuracy_reward": 0.6666666865348816, "rewards/format_reward": 0.0, "step": 768 }, { "completion_length": 483.6458511352539, "epoch": 0.8202666666666667, "grad_norm": 0.71360182762146, "kl": 1.3599853515625, "learning_rate": 2.8450610405612504e-07, "loss": -0.091, "reward": 0.7500000149011612, "reward_std": 0.22155844420194626, "rewards/accuracy_reward": 0.7500000149011612, "rewards/format_reward": 0.0, "step": 769 }, { "completion_length": 544.1875152587891, "epoch": 0.8213333333333334, "grad_norm": 0.445164293050766, "kl": 0.431884765625, "learning_rate": 2.8123893970304154e-07, "loss": -0.1013, "reward": 0.5416666828095913, "reward_std": 0.23116153478622437, "rewards/accuracy_reward": 0.5416666828095913, "rewards/format_reward": 0.0, "step": 770 }, { "completion_length": 399.0833435058594, "epoch": 0.8224, "grad_norm": 2.299436569213867, "kl": 7.3145751953125, "learning_rate": 2.779887016744915e-07, "loss": -0.2016, "reward": 0.833333358168602, "reward_std": 0.2861081659793854, "rewards/accuracy_reward": 0.833333358168602, "rewards/format_reward": 0.0, "step": 771 }, { "completion_length": 490.3333435058594, "epoch": 0.8234666666666667, "grad_norm": 0.996925413608551, "kl": 2.005828857421875, "learning_rate": 2.7475543511023627e-07, "loss": -0.0392, "reward": 0.6875000149011612, "reward_std": 0.1801304891705513, "rewards/accuracy_reward": 0.6875000149011612, "rewards/format_reward": 0.0, "step": 772 }, { "completion_length": 425.0208435058594, "epoch": 0.8245333333333333, "grad_norm": 1.0204027891159058, "kl": 1.387939453125, "learning_rate": 2.715391849143354e-07, "loss": -0.1413, "reward": 0.7708333432674408, "reward_std": 0.31970490515232086, "rewards/accuracy_reward": 0.7708333432674408, "rewards/format_reward": 0.0, "step": 773 }, { "completion_length": 407.18750762939453, "epoch": 0.8256, "grad_norm": 1.0735416412353516, "kl": 0.97332763671875, "learning_rate": 2.6833999575452256e-07, "loss": -0.1301, "reward": 0.5208333432674408, "reward_std": 0.39121395349502563, "rewards/accuracy_reward": 0.5208333432674408, "rewards/format_reward": 0.0, "step": 774 }, { "completion_length": 526.8750076293945, "epoch": 0.8266666666666667, "grad_norm": 1.8516161441802979, "kl": 1.850677490234375, "learning_rate": 2.651579120615855e-07, "loss": -0.0739, "reward": 0.7083333432674408, "reward_std": 0.23116151988506317, "rewards/accuracy_reward": 0.7083333432674408, "rewards/format_reward": 0.0, "step": 775 }, { "completion_length": 476.5416793823242, "epoch": 0.8277333333333333, "grad_norm": 0.9802082777023315, "kl": 1.6435546875, "learning_rate": 2.6199297802874865e-07, "loss": -0.162, "reward": 0.5625000149011612, "reward_std": 0.38161085173487663, "rewards/accuracy_reward": 0.5625000149011612, "rewards/format_reward": 0.0, "step": 776 }, { "completion_length": 411.9791793823242, "epoch": 0.8288, "grad_norm": 0.7443565130233765, "kl": 1.12255859375, "learning_rate": 2.5884523761106026e-07, "loss": -0.0167, "reward": 0.5, "reward_std": 0.18404608219861984, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "step": 777 }, { "completion_length": 406.56251525878906, "epoch": 0.8298666666666666, "grad_norm": 3.1902129650115967, "kl": 2.28125, "learning_rate": 2.5571473452478045e-07, "loss": -0.1441, "reward": 0.4166666865348816, "reward_std": 0.3602609857916832, "rewards/accuracy_reward": 0.4166666865348816, "rewards/format_reward": 0.0, "step": 778 }, { "completion_length": 484.50001525878906, "epoch": 0.8309333333333333, "grad_norm": 2.6404922008514404, "kl": 1.9296875, "learning_rate": 2.526015122467751e-07, "loss": -0.0808, "reward": 0.4583333432674408, "reward_std": 0.23899272456765175, "rewards/accuracy_reward": 0.4583333432674408, "rewards/format_reward": 0.0, "step": 779 }, { "completion_length": 482.81251525878906, "epoch": 0.832, "grad_norm": 1.4247092008590698, "kl": 1.224365234375, "learning_rate": 2.495056140139119e-07, "loss": -0.1248, "reward": 0.7500000149011612, "reward_std": 0.2957112491130829, "rewards/accuracy_reward": 0.7500000149011612, "rewards/format_reward": 0.0, "step": 780 }, { "completion_length": 633.7708435058594, "epoch": 0.8330666666666666, "grad_norm": 0.8521125316619873, "kl": 0.57470703125, "learning_rate": 2.464270828224597e-07, "loss": -0.1166, "reward": 0.583333358168602, "reward_std": 0.30354245379567146, "rewards/accuracy_reward": 0.583333358168602, "rewards/format_reward": 0.0, "step": 781 }, { "completion_length": 677.3541870117188, "epoch": 0.8341333333333333, "grad_norm": 0.7411309480667114, "kl": 1.0498046875, "learning_rate": 2.433659614274909e-07, "loss": -0.2329, "reward": 0.645833358168602, "reward_std": 0.48367292433977127, "rewards/accuracy_reward": 0.645833358168602, "rewards/format_reward": 0.0, "step": 782 }, { "completion_length": 567.2083511352539, "epoch": 0.8352, "grad_norm": 0.7431278228759766, "kl": 0.9560546875, "learning_rate": 2.403222923422895e-07, "loss": -0.1362, "reward": 0.6250000149011612, "reward_std": 0.377695269882679, "rewards/accuracy_reward": 0.6250000149011612, "rewards/format_reward": 0.0, "step": 783 }, { "completion_length": 517.2916793823242, "epoch": 0.8362666666666667, "grad_norm": 0.496969074010849, "kl": 0.2730865478515625, "learning_rate": 2.372961178377585e-07, "loss": -0.029, "reward": 0.6041666716337204, "reward_std": 0.1801304966211319, "rewards/accuracy_reward": 0.6041666716337204, "rewards/format_reward": 0.0, "step": 784 }, { "completion_length": 608.0416793823242, "epoch": 0.8373333333333334, "grad_norm": 1.1124740839004517, "kl": 1.0220947265625, "learning_rate": 2.3428747994183364e-07, "loss": -0.0471, "reward": 0.6666667014360428, "reward_std": 0.3776952587068081, "rewards/accuracy_reward": 0.6666667014360428, "rewards/format_reward": 0.0, "step": 785 }, { "completion_length": 466.56251525878906, "epoch": 0.8384, "grad_norm": 2.8153281211853027, "kl": 3.5576171875, "learning_rate": 2.312964204389e-07, "loss": -0.1158, "reward": 0.5416666865348816, "reward_std": 0.3872983753681183, "rewards/accuracy_reward": 0.5416666865348816, "rewards/format_reward": 0.0, "step": 786 }, { "completion_length": 401.37501525878906, "epoch": 0.8394666666666667, "grad_norm": 1.8699891567230225, "kl": 2.1640625, "learning_rate": 2.2832298086921127e-07, "loss": -0.206, "reward": 0.5833333507180214, "reward_std": 0.3506578952074051, "rewards/accuracy_reward": 0.5833333507180214, "rewards/format_reward": 0.0, "step": 787 }, { "completion_length": 431.8958511352539, "epoch": 0.8405333333333334, "grad_norm": 1.1240679025650024, "kl": 1.984375, "learning_rate": 2.2536720252831367e-07, "loss": -0.059, "reward": 0.4791666716337204, "reward_std": 0.28219255432486534, "rewards/accuracy_reward": 0.4791666716337204, "rewards/format_reward": 0.0, "step": 788 }, { "completion_length": 353.50000381469727, "epoch": 0.8416, "grad_norm": 0.7561388611793518, "kl": 1.725830078125, "learning_rate": 2.2242912646647086e-07, "loss": -0.0747, "reward": 0.4375000037252903, "reward_std": 0.2996268570423126, "rewards/accuracy_reward": 0.4375000037252903, "rewards/format_reward": 0.0, "step": 789 }, { "completion_length": 546.9166793823242, "epoch": 0.8426666666666667, "grad_norm": 0.7471057176589966, "kl": 1.4912109375, "learning_rate": 2.1950879348809548e-07, "loss": -0.1625, "reward": 0.5625000223517418, "reward_std": 0.2996268607676029, "rewards/accuracy_reward": 0.5625000223517418, "rewards/format_reward": 0.0, "step": 790 }, { "completion_length": 625.4583587646484, "epoch": 0.8437333333333333, "grad_norm": 1.7431368827819824, "kl": 1.8984375, "learning_rate": 2.1660624415118158e-07, "loss": -0.1151, "reward": 0.43750002048909664, "reward_std": 0.35457347333431244, "rewards/accuracy_reward": 0.43750002048909664, "rewards/format_reward": 0.0, "step": 791 }, { "completion_length": 456.68751525878906, "epoch": 0.8448, "grad_norm": 2.094015598297119, "kl": 2.337890625, "learning_rate": 2.1372151876674112e-07, "loss": -0.116, "reward": 0.5208333432674408, "reward_std": 0.36417657136917114, "rewards/accuracy_reward": 0.5208333432674408, "rewards/format_reward": 0.0, "step": 792 }, { "completion_length": 444.91668701171875, "epoch": 0.8458666666666667, "grad_norm": 1.9445648193359375, "kl": 2.919921875, "learning_rate": 2.1085465739824516e-07, "loss": -0.0623, "reward": 0.4375000111758709, "reward_std": 0.44616059213876724, "rewards/accuracy_reward": 0.4375000111758709, "rewards/format_reward": 0.0, "step": 793 }, { "completion_length": 418.4166793823242, "epoch": 0.8469333333333333, "grad_norm": 2.066197633743286, "kl": 1.9716796875, "learning_rate": 2.080056998610662e-07, "loss": -0.1738, "reward": 0.5208333432674408, "reward_std": 0.34674230217933655, "rewards/accuracy_reward": 0.5208333432674408, "rewards/format_reward": 0.0, "step": 794 }, { "completion_length": 304.95833587646484, "epoch": 0.848, "grad_norm": 1.8181145191192627, "kl": 2.66015625, "learning_rate": 2.0517468572192632e-07, "loss": -0.1461, "reward": 0.45833334885537624, "reward_std": 0.4230388067662716, "rewards/accuracy_reward": 0.45833334885537624, "rewards/format_reward": 0.0, "step": 795 }, { "completion_length": 388.2291793823242, "epoch": 0.8490666666666666, "grad_norm": 0.9407544732093811, "kl": 1.9212646484375, "learning_rate": 2.023616542983466e-07, "loss": -0.2375, "reward": 0.7291666865348816, "reward_std": 0.35457348078489304, "rewards/accuracy_reward": 0.7291666865348816, "rewards/format_reward": 0.0, "step": 796 }, { "completion_length": 489.20835876464844, "epoch": 0.8501333333333333, "grad_norm": 1.590345859527588, "kl": 3.18359375, "learning_rate": 1.995666446581023e-07, "loss": -0.1702, "reward": 0.6875000149011612, "reward_std": 0.41912322491407394, "rewards/accuracy_reward": 0.6875000149011612, "rewards/format_reward": 0.0, "step": 797 }, { "completion_length": 394.6458435058594, "epoch": 0.8512, "grad_norm": 8.732400894165039, "kl": 4.6259765625, "learning_rate": 1.9678969561867894e-07, "loss": 0.0021, "reward": 0.3958333432674408, "reward_std": 0.1705273985862732, "rewards/accuracy_reward": 0.3958333432674408, "rewards/format_reward": 0.0, "step": 798 }, { "completion_length": 372.12501525878906, "epoch": 0.8522666666666666, "grad_norm": 1.9658806324005127, "kl": 2.35986328125, "learning_rate": 1.9403084574673463e-07, "loss": -0.0757, "reward": 0.3333333432674408, "reward_std": 0.23899272456765175, "rewards/accuracy_reward": 0.3333333432674408, "rewards/format_reward": 0.0, "step": 799 }, { "completion_length": 447.5208435058594, "epoch": 0.8533333333333334, "grad_norm": 1.091248631477356, "kl": 0.8143310546875, "learning_rate": 1.9129013335756317e-07, "loss": -0.0083, "reward": 0.4375000074505806, "reward_std": 0.1975647658109665, "rewards/accuracy_reward": 0.4375000074505806, "rewards/format_reward": 0.0, "step": 800 }, { "completion_length": 402.5208435058594, "epoch": 0.8544, "grad_norm": 0.9836958050727844, "kl": 1.3583984375, "learning_rate": 1.8856759651456234e-07, "loss": -0.2458, "reward": 0.5625000223517418, "reward_std": 0.37377967685461044, "rewards/accuracy_reward": 0.5625000223517418, "rewards/format_reward": 0.0, "step": 801 }, { "completion_length": 349.2291717529297, "epoch": 0.8554666666666667, "grad_norm": 1.1232430934906006, "kl": 0.78759765625, "learning_rate": 1.8586327302870599e-07, "loss": -0.0314, "reward": 0.6458333432674408, "reward_std": 0.27258947864174843, "rewards/accuracy_reward": 0.6458333432674408, "rewards/format_reward": 0.0, "step": 802 }, { "completion_length": 309.1458396911621, "epoch": 0.8565333333333334, "grad_norm": 2.9496724605560303, "kl": 2.416015625, "learning_rate": 1.8317720045801778e-07, "loss": -0.219, "reward": 0.41666666977107525, "reward_std": 0.4152076207101345, "rewards/accuracy_reward": 0.41666666977107525, "rewards/format_reward": 0.0, "step": 803 }, { "completion_length": 488.0416793823242, "epoch": 0.8576, "grad_norm": 4.358962535858154, "kl": 0.713134765625, "learning_rate": 1.8050941610705053e-07, "loss": -0.0855, "reward": 0.708333358168602, "reward_std": 0.2686738707125187, "rewards/accuracy_reward": 0.708333358168602, "rewards/format_reward": 0.0, "step": 804 }, { "completion_length": 531.2708587646484, "epoch": 0.8586666666666667, "grad_norm": 0.6511515974998474, "kl": 0.8094635009765625, "learning_rate": 1.7785995702636698e-07, "loss": -0.1127, "reward": 0.6041666716337204, "reward_std": 0.1530931033194065, "rewards/accuracy_reward": 0.6041666716337204, "rewards/format_reward": 0.0, "step": 805 }, { "completion_length": 378.2083435058594, "epoch": 0.8597333333333333, "grad_norm": 20.179170608520508, "kl": 5.22509765625, "learning_rate": 1.7522886001202687e-07, "loss": -0.08, "reward": 0.5208333507180214, "reward_std": 0.37377968057990074, "rewards/accuracy_reward": 0.5208333507180214, "rewards/format_reward": 0.0, "step": 806 }, { "completion_length": 527.0416793823242, "epoch": 0.8608, "grad_norm": 0.9842159748077393, "kl": 1.6875, "learning_rate": 1.7261616160507403e-07, "loss": -0.2206, "reward": 0.3541666753590107, "reward_std": 0.3816108666360378, "rewards/accuracy_reward": 0.3541666753590107, "rewards/format_reward": 0.0, "step": 807 }, { "completion_length": 442.7291793823242, "epoch": 0.8618666666666667, "grad_norm": 1.5493794679641724, "kl": 0.75390625, "learning_rate": 1.700218980910311e-07, "loss": -0.0974, "reward": 0.7291666865348816, "reward_std": 0.42872628942131996, "rewards/accuracy_reward": 0.7291666865348816, "rewards/format_reward": 0.0, "step": 808 }, { "completion_length": 457.2708511352539, "epoch": 0.8629333333333333, "grad_norm": 2.104104995727539, "kl": 0.9306640625, "learning_rate": 1.6744610549939322e-07, "loss": -0.1643, "reward": 0.7500000298023224, "reward_std": 0.3881702348589897, "rewards/accuracy_reward": 0.7500000298023224, "rewards/format_reward": 0.0, "step": 809 }, { "completion_length": 512.7083587646484, "epoch": 0.864, "grad_norm": 0.521250307559967, "kl": 0.998046875, "learning_rate": 1.64888819603129e-07, "loss": -0.1327, "reward": 0.5000000149011612, "reward_std": 0.23116152361035347, "rewards/accuracy_reward": 0.5000000149011612, "rewards/format_reward": 0.0, "step": 810 }, { "completion_length": 405.08333587646484, "epoch": 0.8650666666666667, "grad_norm": 0.9727557301521301, "kl": 1.14886474609375, "learning_rate": 1.6235007591818385e-07, "loss": -0.1686, "reward": 0.5625000111758709, "reward_std": 0.24468021839857101, "rewards/accuracy_reward": 0.5625000111758709, "rewards/format_reward": 0.0, "step": 811 }, { "completion_length": 510.7916946411133, "epoch": 0.8661333333333333, "grad_norm": 0.6314650177955627, "kl": 0.6416015625, "learning_rate": 1.598299097029859e-07, "loss": -0.0752, "reward": 0.5833333432674408, "reward_std": 0.23899271339178085, "rewards/accuracy_reward": 0.5833333432674408, "rewards/format_reward": 0.0, "step": 812 }, { "completion_length": 415.12500762939453, "epoch": 0.8672, "grad_norm": 0.48967182636260986, "kl": 1.0283203125, "learning_rate": 1.573283559579572e-07, "loss": -0.1185, "reward": 0.583333358168602, "reward_std": 0.3506578952074051, "rewards/accuracy_reward": 0.583333358168602, "rewards/format_reward": 0.0, "step": 813 }, { "completion_length": 318.2083511352539, "epoch": 0.8682666666666666, "grad_norm": 1.1536293029785156, "kl": 1.3525390625, "learning_rate": 1.5484544942502694e-07, "loss": -0.137, "reward": 0.41666667722165585, "reward_std": 0.3332235887646675, "rewards/accuracy_reward": 0.41666667722165585, "rewards/format_reward": 0.0, "step": 814 }, { "completion_length": 346.66667556762695, "epoch": 0.8693333333333333, "grad_norm": 1.4599665403366089, "kl": 1.0927734375, "learning_rate": 1.5238122458714925e-07, "loss": -0.1203, "reward": 0.7083333432674408, "reward_std": 0.31314554437994957, "rewards/accuracy_reward": 0.7083333432674408, "rewards/format_reward": 0.0, "step": 815 }, { "completion_length": 488.7291793823242, "epoch": 0.8704, "grad_norm": 0.794980525970459, "kl": 0.7735595703125, "learning_rate": 1.4993571566782404e-07, "loss": -0.0577, "reward": 0.5416666865348816, "reward_std": 0.3506578914821148, "rewards/accuracy_reward": 0.5416666865348816, "rewards/format_reward": 0.0, "step": 816 }, { "completion_length": 450.20833587646484, "epoch": 0.8714666666666666, "grad_norm": 0.8937171101570129, "kl": 1.19189453125, "learning_rate": 1.475089566306226e-07, "loss": -0.1571, "reward": 0.604166679084301, "reward_std": 0.34674228727817535, "rewards/accuracy_reward": 0.604166679084301, "rewards/format_reward": 0.0, "step": 817 }, { "completion_length": 535.2500076293945, "epoch": 0.8725333333333334, "grad_norm": 0.6431587338447571, "kl": 1.28515625, "learning_rate": 1.4510098117871462e-07, "loss": 0.0158, "reward": 0.4583333432674408, "reward_std": 0.23899272456765175, "rewards/accuracy_reward": 0.4583333432674408, "rewards/format_reward": 0.0, "step": 818 }, { "completion_length": 422.93750762939453, "epoch": 0.8736, "grad_norm": 0.5082134008407593, "kl": 1.353515625, "learning_rate": 1.4271182275440077e-07, "loss": -0.2023, "reward": 0.5208333432674408, "reward_std": 0.31970491260290146, "rewards/accuracy_reward": 0.5208333432674408, "rewards/format_reward": 0.0, "step": 819 }, { "completion_length": 576.8125152587891, "epoch": 0.8746666666666667, "grad_norm": 0.8667093515396118, "kl": 1.12841796875, "learning_rate": 1.4034151453864846e-07, "loss": -0.1293, "reward": 0.4166666865348816, "reward_std": 0.395129531621933, "rewards/accuracy_reward": 0.4166666865348816, "rewards/format_reward": 0.0, "step": 820 }, { "completion_length": 470.2291793823242, "epoch": 0.8757333333333334, "grad_norm": 2.089831590652466, "kl": 1.444580078125, "learning_rate": 1.3799008945063046e-07, "loss": -0.1263, "reward": 0.3958333395421505, "reward_std": 0.33713919296860695, "rewards/accuracy_reward": 0.3958333395421505, "rewards/format_reward": 0.0, "step": 821 }, { "completion_length": 438.87500762939453, "epoch": 0.8768, "grad_norm": 1.0309497117996216, "kl": 1.3544921875, "learning_rate": 1.3565758014726843e-07, "loss": -0.1002, "reward": 0.33333334140479565, "reward_std": 0.3602609857916832, "rewards/accuracy_reward": 0.33333334140479565, "rewards/format_reward": 0.0, "step": 822 }, { "completion_length": 517.0208511352539, "epoch": 0.8778666666666667, "grad_norm": 0.6552605032920837, "kl": 0.8349609375, "learning_rate": 1.3334401902277849e-07, "loss": -0.1531, "reward": 0.708333358168602, "reward_std": 0.3506578914821148, "rewards/accuracy_reward": 0.708333358168602, "rewards/format_reward": 0.0, "step": 823 }, { "completion_length": 508.5416793823242, "epoch": 0.8789333333333333, "grad_norm": 1.5171558856964111, "kl": 0.84130859375, "learning_rate": 1.3104943820822195e-07, "loss": -0.0542, "reward": 0.6458333656191826, "reward_std": 0.4566355496644974, "rewards/accuracy_reward": 0.6458333656191826, "rewards/format_reward": 0.0, "step": 824 }, { "completion_length": 614.2708358764648, "epoch": 0.88, "grad_norm": 0.7536208629608154, "kl": 0.50732421875, "learning_rate": 1.287738695710592e-07, "loss": -0.0141, "reward": 0.6875000298023224, "reward_std": 0.2446802221238613, "rewards/accuracy_reward": 0.6875000298023224, "rewards/format_reward": 0.0, "step": 825 }, { "completion_length": 412.2291793823242, "epoch": 0.8810666666666667, "grad_norm": 1.2087587118148804, "kl": 0.685516357421875, "learning_rate": 1.265173447147064e-07, "loss": -0.1473, "reward": 0.645833358168602, "reward_std": 0.29962683096528053, "rewards/accuracy_reward": 0.645833358168602, "rewards/format_reward": 0.0, "step": 826 }, { "completion_length": 514.5416717529297, "epoch": 0.8821333333333333, "grad_norm": 0.6650320887565613, "kl": 0.8076171875, "learning_rate": 1.2427989497809733e-07, "loss": -0.1263, "reward": 0.4375000149011612, "reward_std": 0.309229951351881, "rewards/accuracy_reward": 0.4375000149011612, "rewards/format_reward": 0.0, "step": 827 }, { "completion_length": 545.0416717529297, "epoch": 0.8832, "grad_norm": 2.4935503005981445, "kl": 1.350341796875, "learning_rate": 1.220615514352479e-07, "loss": -0.0822, "reward": 0.6458333507180214, "reward_std": 0.2900237590074539, "rewards/accuracy_reward": 0.6458333507180214, "rewards/format_reward": 0.0, "step": 828 }, { "completion_length": 536.7916793823242, "epoch": 0.8842666666666666, "grad_norm": 0.6058483719825745, "kl": 0.692108154296875, "learning_rate": 1.19862344894824e-07, "loss": -0.1062, "reward": 0.6250000223517418, "reward_std": 0.4248107150197029, "rewards/accuracy_reward": 0.6250000223517418, "rewards/format_reward": 0.0, "step": 829 }, { "completion_length": 519.5000152587891, "epoch": 0.8853333333333333, "grad_norm": 1.5790636539459229, "kl": 1.1142578125, "learning_rate": 1.1768230589971457e-07, "loss": -0.1849, "reward": 0.416666679084301, "reward_std": 0.38552645593881607, "rewards/accuracy_reward": 0.416666679084301, "rewards/format_reward": 0.0, "step": 830 }, { "completion_length": 455.27083587646484, "epoch": 0.8864, "grad_norm": 47.1816291809082, "kl": 1.60302734375, "learning_rate": 1.1552146472660724e-07, "loss": -0.006, "reward": 0.5208333507180214, "reward_std": 0.42872630059719086, "rewards/accuracy_reward": 0.5208333507180214, "rewards/format_reward": 0.0, "step": 831 }, { "completion_length": 543.1666793823242, "epoch": 0.8874666666666666, "grad_norm": 1.1635318994522095, "kl": 1.310546875, "learning_rate": 1.1337985138556695e-07, "loss": -0.0144, "reward": 0.5208333432674408, "reward_std": 0.3170611187815666, "rewards/accuracy_reward": 0.5208333432674408, "rewards/format_reward": 0.0, "step": 832 }, { "completion_length": 525.6666870117188, "epoch": 0.8885333333333333, "grad_norm": 1.7766854763031006, "kl": 1.04052734375, "learning_rate": 1.1125749561962023e-07, "loss": 0.0107, "reward": 0.43750002048909664, "reward_std": 0.3170611262321472, "rewards/accuracy_reward": 0.43750002048909664, "rewards/format_reward": 0.0, "step": 833 }, { "completion_length": 401.5416793823242, "epoch": 0.8896, "grad_norm": 3.1519179344177246, "kl": 0.61083984375, "learning_rate": 1.0915442690434158e-07, "loss": 0.0379, "reward": 0.7291667014360428, "reward_std": 0.40168892592191696, "rewards/accuracy_reward": 0.7291667014360428, "rewards/format_reward": 0.0, "step": 834 }, { "completion_length": 416.85418701171875, "epoch": 0.8906666666666667, "grad_norm": 1.3824037313461304, "kl": 1.2197265625, "learning_rate": 1.0707067444744439e-07, "loss": -0.122, "reward": 0.6458333432674408, "reward_std": 0.2996268533170223, "rewards/accuracy_reward": 0.6458333432674408, "rewards/format_reward": 0.0, "step": 835 }, { "completion_length": 505.72918701171875, "epoch": 0.8917333333333334, "grad_norm": 1.0924291610717773, "kl": 0.594085693359375, "learning_rate": 1.0500626718837453e-07, "loss": -0.0781, "reward": 0.541666679084301, "reward_std": 0.3131455257534981, "rewards/accuracy_reward": 0.541666679084301, "rewards/format_reward": 0.0, "step": 836 }, { "completion_length": 394.0208435058594, "epoch": 0.8928, "grad_norm": 1.696822166442871, "kl": 1.31494140625, "learning_rate": 1.0296123379791039e-07, "loss": -0.1486, "reward": 0.5833333432674408, "reward_std": 0.3977733142673969, "rewards/accuracy_reward": 0.5833333432674408, "rewards/format_reward": 0.0, "step": 837 }, { "completion_length": 410.7083435058594, "epoch": 0.8938666666666667, "grad_norm": 1.8628230094909668, "kl": 1.70947265625, "learning_rate": 1.009356026777618e-07, "loss": 0.0768, "reward": 0.4583333507180214, "reward_std": 0.4422449879348278, "rewards/accuracy_reward": 0.4583333507180214, "rewards/format_reward": 0.0, "step": 838 }, { "completion_length": 468.50000762939453, "epoch": 0.8949333333333334, "grad_norm": 1.2724510431289673, "kl": 1.55078125, "learning_rate": 9.89294019601783e-08, "loss": -0.1652, "reward": 0.5, "reward_std": 0.43528566509485245, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "step": 839 }, { "completion_length": 476.2083435058594, "epoch": 0.896, "grad_norm": 0.7524177432060242, "kl": 1.11083984375, "learning_rate": 9.69426595075566e-08, "loss": -0.1966, "reward": 0.4583333544433117, "reward_std": 0.3602609783411026, "rewards/accuracy_reward": 0.4583333544433117, "rewards/format_reward": 0.0, "step": 840 }, { "completion_length": 416.7916793823242, "epoch": 0.8970666666666667, "grad_norm": 0.3261817693710327, "kl": 0.5357666015625, "learning_rate": 9.497540291205459e-08, "loss": -0.0755, "reward": 0.5625000149011612, "reward_std": 0.2525114193558693, "rewards/accuracy_reward": 0.5625000149011612, "rewards/format_reward": 0.0, "step": 841 }, { "completion_length": 537.1666717529297, "epoch": 0.8981333333333333, "grad_norm": 1.8056341409683228, "kl": 1.128204345703125, "learning_rate": 9.302765949520765e-08, "loss": -0.1718, "reward": 0.5208333358168602, "reward_std": 0.24468021839857101, "rewards/accuracy_reward": 0.5208333358168602, "rewards/format_reward": 0.0, "step": 842 }, { "completion_length": 506.81250762939453, "epoch": 0.8992, "grad_norm": 4.631187438964844, "kl": 1.144378662109375, "learning_rate": 9.109945630754974e-08, "loss": 0.0272, "reward": 0.6041666865348816, "reward_std": 0.29962683096528053, "rewards/accuracy_reward": 0.6041666865348816, "rewards/format_reward": 0.0, "step": 843 }, { "completion_length": 545.8958511352539, "epoch": 0.9002666666666667, "grad_norm": 1.1916682720184326, "kl": 1.341796875, "learning_rate": 8.919082012823675e-08, "loss": -0.1772, "reward": 0.39583334885537624, "reward_std": 0.40168894082307816, "rewards/accuracy_reward": 0.39583334885537624, "rewards/format_reward": 0.0, "step": 844 }, { "completion_length": 501.81251525878906, "epoch": 0.9013333333333333, "grad_norm": 1.042085886001587, "kl": 1.64453125, "learning_rate": 8.730177746467616e-08, "loss": -0.1432, "reward": 0.3958333432674408, "reward_std": 0.38161085173487663, "rewards/accuracy_reward": 0.3958333432674408, "rewards/format_reward": 0.0, "step": 845 }, { "completion_length": 522.6041870117188, "epoch": 0.9024, "grad_norm": 1.349289894104004, "kl": 1.4052734375, "learning_rate": 8.543235455215687e-08, "loss": -0.0978, "reward": 0.3125000074505806, "reward_std": 0.1975647658109665, "rewards/accuracy_reward": 0.3125000074505806, "rewards/format_reward": 0.0, "step": 846 }, { "completion_length": 506.9791717529297, "epoch": 0.9034666666666666, "grad_norm": 0.9991172552108765, "kl": 1.10693359375, "learning_rate": 8.358257735348695e-08, "loss": -0.0862, "reward": 0.6250000260770321, "reward_std": 0.3506578765809536, "rewards/accuracy_reward": 0.6250000260770321, "rewards/format_reward": 0.0, "step": 847 }, { "completion_length": 514.8125152587891, "epoch": 0.9045333333333333, "grad_norm": 0.6356818079948425, "kl": 0.423828125, "learning_rate": 8.175247155863124e-08, "loss": -0.0733, "reward": 0.7916666716337204, "reward_std": 0.2957112565636635, "rewards/accuracy_reward": 0.7916666716337204, "rewards/format_reward": 0.0, "step": 848 }, { "completion_length": 400.81250762939453, "epoch": 0.9056, "grad_norm": 0.9313225150108337, "kl": 0.56512451171875, "learning_rate": 7.994206258435576e-08, "loss": -0.0325, "reward": 0.7500000149011612, "reward_std": 0.18404607102274895, "rewards/accuracy_reward": 0.7500000149011612, "rewards/format_reward": 0.0, "step": 849 }, { "completion_length": 424.6666793823242, "epoch": 0.9066666666666666, "grad_norm": 1.7935646772384644, "kl": 1.7960205078125, "learning_rate": 7.81513755738742e-08, "loss": -0.0499, "reward": 0.5208333507180214, "reward_std": 0.37377968057990074, "rewards/accuracy_reward": 0.5208333507180214, "rewards/format_reward": 0.0, "step": 850 }, { "completion_length": 519.2083435058594, "epoch": 0.9077333333333333, "grad_norm": 0.5925604104995728, "kl": 0.590576171875, "learning_rate": 7.638043539649897e-08, "loss": -0.0583, "reward": 0.7083333432674408, "reward_std": 0.2861081622540951, "rewards/accuracy_reward": 0.7083333432674408, "rewards/format_reward": 0.0, "step": 851 }, { "completion_length": 543.1875152587891, "epoch": 0.9088, "grad_norm": 0.9502411484718323, "kl": 1.4638671875, "learning_rate": 7.462926664729592e-08, "loss": -0.1052, "reward": 0.41666669212281704, "reward_std": 0.350657869130373, "rewards/accuracy_reward": 0.41666669212281704, "rewards/format_reward": 0.0, "step": 852 }, { "completion_length": 375.93750381469727, "epoch": 0.9098666666666667, "grad_norm": 0.9063624143600464, "kl": 1.1943359375, "learning_rate": 7.289789364674165e-08, "loss": -0.0687, "reward": 0.6458333432674408, "reward_std": 0.38161085173487663, "rewards/accuracy_reward": 0.6458333432674408, "rewards/format_reward": 0.0, "step": 853 }, { "completion_length": 446.6666793823242, "epoch": 0.9109333333333334, "grad_norm": 0.668630063533783, "kl": 1.462890625, "learning_rate": 7.118634044038774e-08, "loss": -0.0607, "reward": 0.5208333414047956, "reward_std": 0.28219255805015564, "rewards/accuracy_reward": 0.5208333414047956, "rewards/format_reward": 0.0, "step": 854 }, { "completion_length": 437.3333511352539, "epoch": 0.912, "grad_norm": 1.8877599239349365, "kl": 0.714599609375, "learning_rate": 6.949463079852491e-08, "loss": -0.0254, "reward": 0.7500000149011612, "reward_std": 0.3680921792984009, "rewards/accuracy_reward": 0.7500000149011612, "rewards/format_reward": 0.0, "step": 855 }, { "completion_length": 420.5416793823242, "epoch": 0.9130666666666667, "grad_norm": 0.8522939085960388, "kl": 1.3408203125, "learning_rate": 6.782278821585386e-08, "loss": -0.1444, "reward": 0.645833358168602, "reward_std": 0.44616056978702545, "rewards/accuracy_reward": 0.645833358168602, "rewards/format_reward": 0.0, "step": 856 }, { "completion_length": 491.0416717529297, "epoch": 0.9141333333333334, "grad_norm": 8.273486137390137, "kl": 5.8466796875, "learning_rate": 6.617083591115897e-08, "loss": -0.004, "reward": 0.6458333432674408, "reward_std": 0.27258946001529694, "rewards/accuracy_reward": 0.6458333432674408, "rewards/format_reward": 0.0, "step": 857 }, { "completion_length": 514.7083511352539, "epoch": 0.9152, "grad_norm": 0.9362492561340332, "kl": 1.1533203125, "learning_rate": 6.453879682698543e-08, "loss": -0.0446, "reward": 0.6041666939854622, "reward_std": 0.33713921159505844, "rewards/accuracy_reward": 0.6041666939854622, "rewards/format_reward": 0.0, "step": 858 }, { "completion_length": 399.60418701171875, "epoch": 0.9162666666666667, "grad_norm": 1.3416930437088013, "kl": 1.3427734375, "learning_rate": 6.292669362932102e-08, "loss": -0.1518, "reward": 0.5625000149011612, "reward_std": 0.4662386253476143, "rewards/accuracy_reward": 0.5625000149011612, "rewards/format_reward": 0.0, "step": 859 }, { "completion_length": 564.5208511352539, "epoch": 0.9173333333333333, "grad_norm": 36.98863983154297, "kl": 6.210693359375, "learning_rate": 6.133454870728111e-08, "loss": -0.0487, "reward": 0.645833358168602, "reward_std": 0.35457347333431244, "rewards/accuracy_reward": 0.645833358168602, "rewards/format_reward": 0.0, "step": 860 }, { "completion_length": 581.8125076293945, "epoch": 0.9184, "grad_norm": 0.8380128741264343, "kl": 0.918121337890625, "learning_rate": 5.97623841727975e-08, "loss": 0.0008, "reward": 0.6458333507180214, "reward_std": 0.2621144950389862, "rewards/accuracy_reward": 0.6458333507180214, "rewards/format_reward": 0.0, "step": 861 }, { "completion_length": 530.7083435058594, "epoch": 0.9194666666666667, "grad_norm": 3.008572816848755, "kl": 1.96234130859375, "learning_rate": 5.8210221860311774e-08, "loss": 0.0682, "reward": 0.3333333432674408, "reward_std": 0.31314554065465927, "rewards/accuracy_reward": 0.3333333432674408, "rewards/format_reward": 0.0, "step": 862 }, { "completion_length": 482.4166793823242, "epoch": 0.9205333333333333, "grad_norm": 2.746826410293579, "kl": 1.723388671875, "learning_rate": 5.6678083326472064e-08, "loss": -0.0118, "reward": 0.5833333432674408, "reward_std": 0.38817023858428, "rewards/accuracy_reward": 0.5833333432674408, "rewards/format_reward": 0.0, "step": 863 }, { "completion_length": 538.1875076293945, "epoch": 0.9216, "grad_norm": 2.4731650352478027, "kl": 2.2513427734375, "learning_rate": 5.516598984983279e-08, "loss": 0.0214, "reward": 0.5625000223517418, "reward_std": 0.34674228355288506, "rewards/accuracy_reward": 0.5625000223517418, "rewards/format_reward": 0.0, "step": 864 }, { "completion_length": 403.06250762939453, "epoch": 0.9226666666666666, "grad_norm": 1.5095337629318237, "kl": 1.823974609375, "learning_rate": 5.367396243056022e-08, "loss": -0.164, "reward": 0.6458333432674408, "reward_std": 0.299626849591732, "rewards/accuracy_reward": 0.6458333432674408, "rewards/format_reward": 0.0, "step": 865 }, { "completion_length": 477.8958511352539, "epoch": 0.9237333333333333, "grad_norm": 1.383907675743103, "kl": 2.1025390625, "learning_rate": 5.2202021790140884e-08, "loss": -0.1396, "reward": 0.6875000298023224, "reward_std": 0.41912319883704185, "rewards/accuracy_reward": 0.6875000298023224, "rewards/format_reward": 0.0, "step": 866 }, { "completion_length": 538.3958587646484, "epoch": 0.9248, "grad_norm": 0.9723994135856628, "kl": 1.3857421875, "learning_rate": 5.075018837109263e-08, "loss": -0.1602, "reward": 0.4791666753590107, "reward_std": 0.3816108778119087, "rewards/accuracy_reward": 0.4791666753590107, "rewards/format_reward": 0.0, "step": 867 }, { "completion_length": 577.3541946411133, "epoch": 0.9258666666666666, "grad_norm": 0.8414086699485779, "kl": 0.5478515625, "learning_rate": 4.9318482336681515e-08, "loss": 0.0067, "reward": 0.6250000149011612, "reward_std": 0.23116152733564377, "rewards/accuracy_reward": 0.6250000149011612, "rewards/format_reward": 0.0, "step": 868 }, { "completion_length": 520.7708435058594, "epoch": 0.9269333333333334, "grad_norm": 1.4330782890319824, "kl": 1.22998046875, "learning_rate": 4.7906923570641695e-08, "loss": -0.0399, "reward": 0.6875000298023224, "reward_std": 0.41912319883704185, "rewards/accuracy_reward": 0.6875000298023224, "rewards/format_reward": 0.0, "step": 869 }, { "completion_length": 587.3333435058594, "epoch": 0.928, "grad_norm": 0.5407471656799316, "kl": 0.624908447265625, "learning_rate": 4.6515531676899316e-08, "loss": -0.0903, "reward": 0.6250000149011612, "reward_std": 0.3680921830236912, "rewards/accuracy_reward": 0.6250000149011612, "rewards/format_reward": 0.0, "step": 870 }, { "completion_length": 496.10418701171875, "epoch": 0.9290666666666667, "grad_norm": 1.6368472576141357, "kl": 2.27734375, "learning_rate": 4.514432597930007e-08, "loss": -0.1408, "reward": 0.5000000074505806, "reward_std": 0.3680921792984009, "rewards/accuracy_reward": 0.5000000074505806, "rewards/format_reward": 0.0, "step": 871 }, { "completion_length": 492.2708435058594, "epoch": 0.9301333333333334, "grad_norm": 1.3034980297088623, "kl": 1.0908203125, "learning_rate": 4.379332552134124e-08, "loss": -0.1238, "reward": 0.5833333507180214, "reward_std": 0.30354246497154236, "rewards/accuracy_reward": 0.5833333507180214, "rewards/format_reward": 0.0, "step": 872 }, { "completion_length": 485.2708511352539, "epoch": 0.9312, "grad_norm": 0.678155243396759, "kl": 1.120819091796875, "learning_rate": 4.246254906590641e-08, "loss": -0.1494, "reward": 0.7500000149011612, "reward_std": 0.18404608592391014, "rewards/accuracy_reward": 0.7500000149011612, "rewards/format_reward": 0.0, "step": 873 }, { "completion_length": 447.5833435058594, "epoch": 0.9322666666666667, "grad_norm": 0.7376736402511597, "kl": 0.75347900390625, "learning_rate": 4.115201509500582e-08, "loss": -0.1739, "reward": 0.583333358168602, "reward_std": 0.4152075946331024, "rewards/accuracy_reward": 0.583333358168602, "rewards/format_reward": 0.0, "step": 874 }, { "completion_length": 481.9583435058594, "epoch": 0.9333333333333333, "grad_norm": 0.7634227871894836, "kl": 0.363037109375, "learning_rate": 3.986174180951896e-08, "loss": -0.1235, "reward": 0.770833358168602, "reward_std": 0.3170611262321472, "rewards/accuracy_reward": 0.770833358168602, "rewards/format_reward": 0.0, "step": 875 }, { "completion_length": 459.37500762939453, "epoch": 0.9344, "grad_norm": 1913.397705078125, "kl": 213.49658203125, "learning_rate": 3.8591747128942033e-08, "loss": 6.7753, "reward": 0.6875000149011612, "reward_std": 0.42872628569602966, "rewards/accuracy_reward": 0.6875000149011612, "rewards/format_reward": 0.0, "step": 876 }, { "completion_length": 483.4166793823242, "epoch": 0.9354666666666667, "grad_norm": 2.145822286605835, "kl": 0.8583984375, "learning_rate": 3.734204869113955e-08, "loss": -0.1165, "reward": 0.708333358168602, "reward_std": 0.4326419234275818, "rewards/accuracy_reward": 0.708333358168602, "rewards/format_reward": 0.0, "step": 877 }, { "completion_length": 474.72918701171875, "epoch": 0.9365333333333333, "grad_norm": 0.713074266910553, "kl": 1.00830078125, "learning_rate": 3.611266385209849e-08, "loss": -0.0961, "reward": 0.5416666828095913, "reward_std": 0.23116152361035347, "rewards/accuracy_reward": 0.5416666828095913, "rewards/format_reward": 0.0, "step": 878 }, { "completion_length": 473.8958511352539, "epoch": 0.9376, "grad_norm": 1.45132577419281, "kl": 1.669921875, "learning_rate": 3.490360968568801e-08, "loss": -0.137, "reward": 0.5416666828095913, "reward_std": 0.2861081659793854, "rewards/accuracy_reward": 0.5416666828095913, "rewards/format_reward": 0.0, "step": 879 }, { "completion_length": 455.3333435058594, "epoch": 0.9386666666666666, "grad_norm": 0.8694736361503601, "kl": 1.328125, "learning_rate": 3.3714902983421944e-08, "loss": -0.0909, "reward": 0.7083333432674408, "reward_std": 0.3506578765809536, "rewards/accuracy_reward": 0.7083333432674408, "rewards/format_reward": 0.0, "step": 880 }, { "completion_length": 506.7083511352539, "epoch": 0.9397333333333333, "grad_norm": 1.3230805397033691, "kl": 0.633544921875, "learning_rate": 3.254656025422553e-08, "loss": -0.0648, "reward": 0.6458333432674408, "reward_std": 0.28219256922602654, "rewards/accuracy_reward": 0.6458333432674408, "rewards/format_reward": 0.0, "step": 881 }, { "completion_length": 612.1041717529297, "epoch": 0.9408, "grad_norm": 1.3860043287277222, "kl": 0.5888671875, "learning_rate": 3.1398597724206555e-08, "loss": 0.0368, "reward": 0.4375000074505806, "reward_std": 0.30745804682374, "rewards/accuracy_reward": 0.4375000074505806, "rewards/format_reward": 0.0, "step": 882 }, { "completion_length": 428.75001525878906, "epoch": 0.9418666666666666, "grad_norm": 1.0901955366134644, "kl": 1.36328125, "learning_rate": 3.027103133642972e-08, "loss": -0.1809, "reward": 0.5833333432674408, "reward_std": 0.2861081585288048, "rewards/accuracy_reward": 0.5833333432674408, "rewards/format_reward": 0.0, "step": 883 }, { "completion_length": 396.16668701171875, "epoch": 0.9429333333333333, "grad_norm": 1.3204232454299927, "kl": 1.77734375, "learning_rate": 2.9163876750694986e-08, "loss": -0.1108, "reward": 0.41666668467223644, "reward_std": 0.4056045189499855, "rewards/accuracy_reward": 0.41666668467223644, "rewards/format_reward": 0.0, "step": 884 }, { "completion_length": 392.08335876464844, "epoch": 0.944, "grad_norm": 1.2917091846466064, "kl": 1.268310546875, "learning_rate": 2.807714934332073e-08, "loss": 0.0309, "reward": 0.6458333432674408, "reward_std": 0.33713919296860695, "rewards/accuracy_reward": 0.6458333432674408, "rewards/format_reward": 0.0, "step": 885 }, { "completion_length": 466.25000762939453, "epoch": 0.9450666666666667, "grad_norm": 0.7749580144882202, "kl": 0.68017578125, "learning_rate": 2.7010864206929443e-08, "loss": -0.0397, "reward": 0.604166679084301, "reward_std": 0.42872631549835205, "rewards/accuracy_reward": 0.604166679084301, "rewards/format_reward": 0.0, "step": 886 }, { "completion_length": 413.18751525878906, "epoch": 0.9461333333333334, "grad_norm": 1.4309486150741577, "kl": 1.228363037109375, "learning_rate": 2.5965036150238706e-08, "loss": -0.1937, "reward": 0.6875000298023224, "reward_std": 0.38161084800958633, "rewards/accuracy_reward": 0.6875000298023224, "rewards/format_reward": 0.0, "step": 887 }, { "completion_length": 540.7291870117188, "epoch": 0.9472, "grad_norm": 0.40669217705726624, "kl": 0.582275390625, "learning_rate": 2.4939679697855212e-08, "loss": -0.0548, "reward": 0.47916667722165585, "reward_std": 0.11558076366782188, "rewards/accuracy_reward": 0.47916667722165585, "rewards/format_reward": 0.0, "step": 888 }, { "completion_length": 399.0625, "epoch": 0.9482666666666667, "grad_norm": 1.3262017965316772, "kl": 2.2470703125, "learning_rate": 2.393480909007306e-08, "loss": -0.1883, "reward": 0.5625000223517418, "reward_std": 0.36417657136917114, "rewards/accuracy_reward": 0.5625000223517418, "rewards/format_reward": 0.0, "step": 889 }, { "completion_length": 503.4166717529297, "epoch": 0.9493333333333334, "grad_norm": 1.6317317485809326, "kl": 0.5078125, "learning_rate": 2.2950438282676455e-08, "loss": -0.0209, "reward": 0.770833358168602, "reward_std": 0.29962683841586113, "rewards/accuracy_reward": 0.770833358168602, "rewards/format_reward": 0.0, "step": 890 }, { "completion_length": 578.6666793823242, "epoch": 0.9504, "grad_norm": 1.166954755783081, "kl": 1.12060546875, "learning_rate": 2.1986580946744993e-08, "loss": -0.0318, "reward": 0.6041666716337204, "reward_std": 0.3720077611505985, "rewards/accuracy_reward": 0.6041666716337204, "rewards/format_reward": 0.0, "step": 891 }, { "completion_length": 573.5625152587891, "epoch": 0.9514666666666667, "grad_norm": 1.2898495197296143, "kl": 0.9755859375, "learning_rate": 2.104325046846467e-08, "loss": -0.0435, "reward": 0.6666666865348816, "reward_std": 0.3332235999405384, "rewards/accuracy_reward": 0.6666666865348816, "rewards/format_reward": 0.0, "step": 892 }, { "completion_length": 351.3333435058594, "epoch": 0.9525333333333333, "grad_norm": 1.3097673654556274, "kl": 1.20166015625, "learning_rate": 2.012045994894135e-08, "loss": 0.002, "reward": 0.5833333432674408, "reward_std": 0.2957112491130829, "rewards/accuracy_reward": 0.5833333432674408, "rewards/format_reward": 0.0, "step": 893 }, { "completion_length": 519.0416870117188, "epoch": 0.9536, "grad_norm": 1.5249446630477905, "kl": 0.9130859375, "learning_rate": 1.9218222204019087e-08, "loss": -0.0491, "reward": 0.5833333507180214, "reward_std": 0.23116153106093407, "rewards/accuracy_reward": 0.5833333507180214, "rewards/format_reward": 0.0, "step": 894 }, { "completion_length": 436.0416717529297, "epoch": 0.9546666666666667, "grad_norm": 1.0876318216323853, "kl": 0.940673828125, "learning_rate": 1.8336549764102594e-08, "loss": -0.1246, "reward": 0.6041666865348816, "reward_std": 0.4662386327981949, "rewards/accuracy_reward": 0.6041666865348816, "rewards/format_reward": 0.0, "step": 895 }, { "completion_length": 386.18750762939453, "epoch": 0.9557333333333333, "grad_norm": 1.198004126548767, "kl": 1.06689453125, "learning_rate": 1.7475454873982057e-08, "loss": -0.1149, "reward": 0.6250000149011612, "reward_std": 0.25819891691207886, "rewards/accuracy_reward": 0.6250000149011612, "rewards/format_reward": 0.0, "step": 896 }, { "completion_length": 472.56251525878906, "epoch": 0.9568, "grad_norm": 0.70694500207901, "kl": 1.1414794921875, "learning_rate": 1.6634949492664253e-08, "loss": -0.0987, "reward": 0.6458333488553762, "reward_std": 0.27258947864174843, "rewards/accuracy_reward": 0.6458333488553762, "rewards/format_reward": 0.0, "step": 897 }, { "completion_length": 548.0416870117188, "epoch": 0.9578666666666666, "grad_norm": 0.5221810340881348, "kl": 0.828826904296875, "learning_rate": 1.5815045293205544e-08, "loss": -0.0768, "reward": 0.5416666772216558, "reward_std": 0.268673874437809, "rewards/accuracy_reward": 0.5416666772216558, "rewards/format_reward": 0.0, "step": 898 }, { "completion_length": 431.8541793823242, "epoch": 0.9589333333333333, "grad_norm": 1.817257285118103, "kl": 0.437255859375, "learning_rate": 1.5015753662550813e-08, "loss": -0.0735, "reward": 0.770833358168602, "reward_std": 0.34674228355288506, "rewards/accuracy_reward": 0.770833358168602, "rewards/format_reward": 0.0, "step": 899 }, { "completion_length": 522.1250076293945, "epoch": 0.96, "grad_norm": 1.2571015357971191, "kl": 0.243408203125, "learning_rate": 1.4237085701374109e-08, "loss": -0.0943, "reward": 0.6666666865348816, "reward_std": 0.18404608592391014, "rewards/accuracy_reward": 0.6666666865348816, "rewards/format_reward": 0.0, "step": 900 }, { "completion_length": 472.81250762939453, "epoch": 0.9610666666666666, "grad_norm": 1.7115795612335205, "kl": 0.849609375, "learning_rate": 1.3479052223925259e-08, "loss": -0.0707, "reward": 0.708333358168602, "reward_std": 0.4422449879348278, "rewards/accuracy_reward": 0.708333358168602, "rewards/format_reward": 0.0, "step": 901 }, { "completion_length": 516.7083511352539, "epoch": 0.9621333333333333, "grad_norm": 1.2389189004898071, "kl": 1.3466796875, "learning_rate": 1.2741663757879496e-08, "loss": 0.0138, "reward": 0.5208333544433117, "reward_std": 0.30922994762659073, "rewards/accuracy_reward": 0.5208333544433117, "rewards/format_reward": 0.0, "step": 902 }, { "completion_length": 348.7083435058594, "epoch": 0.9632, "grad_norm": 1.0608021020889282, "kl": 2.015167236328125, "learning_rate": 1.2024930544191237e-08, "loss": -0.0826, "reward": 0.6666666865348816, "reward_std": 0.3855264410376549, "rewards/accuracy_reward": 0.6666666865348816, "rewards/format_reward": 0.0, "step": 903 }, { "completion_length": 512.3958435058594, "epoch": 0.9642666666666667, "grad_norm": 1.2611281871795654, "kl": 0.51025390625, "learning_rate": 1.1328862536952033e-08, "loss": 0.0165, "reward": 0.5833333544433117, "reward_std": 0.268673874437809, "rewards/accuracy_reward": 0.5833333544433117, "rewards/format_reward": 0.0, "step": 904 }, { "completion_length": 535.1666793823242, "epoch": 0.9653333333333334, "grad_norm": 1.12821626663208, "kl": 1.040283203125, "learning_rate": 1.0653469403252015e-08, "loss": -0.1254, "reward": 0.45833334885537624, "reward_std": 0.3506578654050827, "rewards/accuracy_reward": 0.45833334885537624, "rewards/format_reward": 0.0, "step": 905 }, { "completion_length": 539.2083435058594, "epoch": 0.9664, "grad_norm": 0.47398731112480164, "kl": 0.530029296875, "learning_rate": 9.998760523045492e-09, "loss": -0.1255, "reward": 0.770833358168602, "reward_std": 0.2996268607676029, "rewards/accuracy_reward": 0.770833358168602, "rewards/format_reward": 0.0, "step": 906 }, { "completion_length": 531.5000152587891, "epoch": 0.9674666666666667, "grad_norm": 1.882703185081482, "kl": 0.912109375, "learning_rate": 9.3647449890214e-09, "loss": -0.0896, "reward": 0.6041666716337204, "reward_std": 0.36417657509446144, "rewards/accuracy_reward": 0.6041666716337204, "rewards/format_reward": 0.0, "step": 907 }, { "completion_length": 369.1458511352539, "epoch": 0.9685333333333334, "grad_norm": 1.4364863634109497, "kl": 1.11163330078125, "learning_rate": 8.751431606476234e-09, "loss": 0.0828, "reward": 0.7500000149011612, "reward_std": 0.268673874437809, "rewards/accuracy_reward": 0.7500000149011612, "rewards/format_reward": 0.0, "step": 908 }, { "completion_length": 406.9583435058594, "epoch": 0.9696, "grad_norm": 1.0051575899124146, "kl": 1.28564453125, "learning_rate": 8.158828893192471e-09, "loss": -0.136, "reward": 0.5625000149011612, "reward_std": 0.2525114119052887, "rewards/accuracy_reward": 0.5625000149011612, "rewards/format_reward": 0.0, "step": 909 }, { "completion_length": 640.2500228881836, "epoch": 0.9706666666666667, "grad_norm": 1.0727453231811523, "kl": 1.3828125, "learning_rate": 7.586945079319673e-09, "loss": -0.0026, "reward": 0.3958333432674408, "reward_std": 0.2621145099401474, "rewards/accuracy_reward": 0.3958333432674408, "rewards/format_reward": 0.0, "step": 910 }, { "completion_length": 562.7083587646484, "epoch": 0.9717333333333333, "grad_norm": 0.25173723697662354, "kl": 0.6207275390625, "learning_rate": 7.035788107260244e-09, "loss": -0.081, "reward": 0.8125000149011612, "reward_std": 0.1530931070446968, "rewards/accuracy_reward": 0.8125000149011612, "rewards/format_reward": 0.0, "step": 911 }, { "completion_length": 524.9791946411133, "epoch": 0.9728, "grad_norm": 0.4839745759963989, "kl": 1.01025390625, "learning_rate": 6.5053656315598455e-09, "loss": -0.1506, "reward": 0.5625000223517418, "reward_std": 0.41912321001291275, "rewards/accuracy_reward": 0.5625000223517418, "rewards/format_reward": 0.0, "step": 912 }, { "completion_length": 628.6458435058594, "epoch": 0.9738666666666667, "grad_norm": 2.1724607944488525, "kl": 1.15234375, "learning_rate": 5.9956850187998235e-09, "loss": 0.0061, "reward": 0.5416666716337204, "reward_std": 0.3855264410376549, "rewards/accuracy_reward": 0.5416666716337204, "rewards/format_reward": 0.0, "step": 913 }, { "completion_length": 421.5416717529297, "epoch": 0.9749333333333333, "grad_norm": 0.5961008667945862, "kl": 0.94189453125, "learning_rate": 5.506753347496285e-09, "loss": -0.1688, "reward": 0.520833358168602, "reward_std": 0.42872629314661026, "rewards/accuracy_reward": 0.520833358168602, "rewards/format_reward": 0.0, "step": 914 }, { "completion_length": 475.70835876464844, "epoch": 0.976, "grad_norm": 0.501269519329071, "kl": 1.17578125, "learning_rate": 5.038577408000844e-09, "loss": -0.0503, "reward": 0.6250000149011612, "reward_std": 0.32097674161195755, "rewards/accuracy_reward": 0.6250000149011612, "rewards/format_reward": 0.0, "step": 915 }, { "completion_length": 394.39583587646484, "epoch": 0.9770666666666666, "grad_norm": 1.3275353908538818, "kl": 1.42578125, "learning_rate": 4.591163702406531e-09, "loss": -0.1458, "reward": 0.770833358168602, "reward_std": 0.40168892592191696, "rewards/accuracy_reward": 0.770833358168602, "rewards/format_reward": 0.0, "step": 916 }, { "completion_length": 440.1041717529297, "epoch": 0.9781333333333333, "grad_norm": 3.1325180530548096, "kl": 0.995361328125, "learning_rate": 4.1645184444575325e-09, "loss": -0.1163, "reward": 0.7708333432674408, "reward_std": 0.235077116638422, "rewards/accuracy_reward": 0.7708333432674408, "rewards/format_reward": 0.0, "step": 917 }, { "completion_length": 386.5208396911621, "epoch": 0.9792, "grad_norm": 0.6285461783409119, "kl": 0.828125, "learning_rate": 3.758647559463091e-09, "loss": -0.0844, "reward": 0.666666679084301, "reward_std": 0.16661180183291435, "rewards/accuracy_reward": 0.666666679084301, "rewards/format_reward": 0.0, "step": 918 }, { "completion_length": 465.22918701171875, "epoch": 0.9802666666666666, "grad_norm": 0.7328125238418579, "kl": 1.3427734375, "learning_rate": 3.37355668421524e-09, "loss": -0.0888, "reward": 0.5208333507180214, "reward_std": 0.43655750527977943, "rewards/accuracy_reward": 0.5208333507180214, "rewards/format_reward": 0.0, "step": 919 }, { "completion_length": 658.0416946411133, "epoch": 0.9813333333333333, "grad_norm": 0.7257159948348999, "kl": 0.482177734375, "learning_rate": 3.009251166909699e-09, "loss": -0.018, "reward": 0.5208333507180214, "reward_std": 0.2446802221238613, "rewards/accuracy_reward": 0.5208333507180214, "rewards/format_reward": 0.0, "step": 920 }, { "completion_length": 494.7083435058594, "epoch": 0.9824, "grad_norm": 1.1216107606887817, "kl": 1.72406005859375, "learning_rate": 2.665736067072766e-09, "loss": -0.0329, "reward": 0.6666666865348816, "reward_std": 0.3506578765809536, "rewards/accuracy_reward": 0.6666666865348816, "rewards/format_reward": 0.0, "step": 921 }, { "completion_length": 505.0625228881836, "epoch": 0.9834666666666667, "grad_norm": 4.102955341339111, "kl": 1.9156494140625, "learning_rate": 2.343016155490374e-09, "loss": 0.0661, "reward": 0.3750000037252903, "reward_std": 0.25819891691207886, "rewards/accuracy_reward": 0.3750000037252903, "rewards/format_reward": 0.0, "step": 922 }, { "completion_length": 427.47918701171875, "epoch": 0.9845333333333334, "grad_norm": 0.7036573886871338, "kl": 1.05908203125, "learning_rate": 2.041095914141644e-09, "loss": -0.2307, "reward": 0.6041666865348816, "reward_std": 0.3720077611505985, "rewards/accuracy_reward": 0.6041666865348816, "rewards/format_reward": 0.0, "step": 923 }, { "completion_length": 557.4583435058594, "epoch": 0.9856, "grad_norm": 0.6506845951080322, "kl": 1.345703125, "learning_rate": 1.7599795361376015e-09, "loss": -0.1027, "reward": 0.3125000074505806, "reward_std": 0.2621145099401474, "rewards/accuracy_reward": 0.3125000074505806, "rewards/format_reward": 0.0, "step": 924 }, { "completion_length": 563.4583435058594, "epoch": 0.9866666666666667, "grad_norm": 0.9383055567741394, "kl": 0.392578125, "learning_rate": 1.4996709256617225e-09, "loss": -0.0844, "reward": 0.5208333432674408, "reward_std": 0.3720077611505985, "rewards/accuracy_reward": 0.5208333432674408, "rewards/format_reward": 0.0, "step": 925 }, { "completion_length": 535.1041793823242, "epoch": 0.9877333333333334, "grad_norm": 1.6107115745544434, "kl": 2.4765625, "learning_rate": 1.260173697916478e-09, "loss": -0.1147, "reward": 0.4791666865348816, "reward_std": 0.4932760149240494, "rewards/accuracy_reward": 0.4791666865348816, "rewards/format_reward": 0.0, "step": 926 }, { "completion_length": 477.87500762939453, "epoch": 0.9888, "grad_norm": 0.6815553903579712, "kl": 1.0716552734375, "learning_rate": 1.0414911790730397e-09, "loss": 0.037, "reward": 0.7500000298023224, "reward_std": 0.3332235962152481, "rewards/accuracy_reward": 0.7500000298023224, "rewards/format_reward": 0.0, "step": 927 }, { "completion_length": 525.3541870117188, "epoch": 0.9898666666666667, "grad_norm": 1.9460089206695557, "kl": 0.2510986328125, "learning_rate": 8.436264062248178e-10, "loss": -0.0219, "reward": 0.7708333432674408, "reward_std": 0.2996268458664417, "rewards/accuracy_reward": 0.7708333432674408, "rewards/format_reward": 0.0, "step": 928 }, { "completion_length": 404.9791793823242, "epoch": 0.9909333333333333, "grad_norm": 1.956809163093567, "kl": 2.3310546875, "learning_rate": 6.665821273456607e-10, "loss": -0.0434, "reward": 0.354166679084301, "reward_std": 0.38161084800958633, "rewards/accuracy_reward": 0.354166679084301, "rewards/format_reward": 0.0, "step": 929 }, { "completion_length": 511.54168701171875, "epoch": 0.992, "grad_norm": 0.5832378268241882, "kl": 0.56011962890625, "learning_rate": 5.103608012512195e-10, "loss": -0.0405, "reward": 0.6875000149011612, "reward_std": 0.29962683469057083, "rewards/accuracy_reward": 0.6875000149011612, "rewards/format_reward": 0.0, "step": 930 }, { "completion_length": 406.06251525878906, "epoch": 0.9930666666666667, "grad_norm": 0.7464867234230042, "kl": 0.6728515625, "learning_rate": 3.749645975653082e-10, "loss": -0.1216, "reward": 0.6666666865348816, "reward_std": 0.3776952549815178, "rewards/accuracy_reward": 0.6666666865348816, "rewards/format_reward": 0.0, "step": 931 }, { "completion_length": 515.1666870117188, "epoch": 0.9941333333333333, "grad_norm": 1.1789774894714355, "kl": 1.755859375, "learning_rate": 2.6039539668909486e-10, "loss": -0.0297, "reward": 0.5000000260770321, "reward_std": 0.3506578914821148, "rewards/accuracy_reward": 0.5000000260770321, "rewards/format_reward": 0.0, "step": 932 }, { "completion_length": 492.0833435058594, "epoch": 0.9952, "grad_norm": 1.3584299087524414, "kl": 1.2734375, "learning_rate": 1.666547897761217e-10, "loss": -0.0571, "reward": 0.5416666716337204, "reward_std": 0.18404608219861984, "rewards/accuracy_reward": 0.5416666716337204, "rewards/format_reward": 0.0, "step": 933 }, { "completion_length": 548.9166870117188, "epoch": 0.9962666666666666, "grad_norm": 0.960150957107544, "kl": 1.1982421875, "learning_rate": 9.374407870882396e-11, "loss": -0.0858, "reward": 0.6250000298023224, "reward_std": 0.3881702311336994, "rewards/accuracy_reward": 0.6250000298023224, "rewards/format_reward": 0.0, "step": 934 }, { "completion_length": 635.5625152587891, "epoch": 0.9973333333333333, "grad_norm": 0.761093020439148, "kl": 1.18505859375, "learning_rate": 4.1664276081376796e-11, "loss": -0.0645, "reward": 0.4375000111758709, "reward_std": 0.21764282882213593, "rewards/accuracy_reward": 0.4375000111758709, "rewards/format_reward": 0.0, "step": 935 }, { "completion_length": 483.62501525878906, "epoch": 0.9984, "grad_norm": 21.442607879638672, "kl": 3.529296875, "learning_rate": 1.0416105185373503e-11, "loss": -0.0201, "reward": 0.6875000298023224, "reward_std": 0.4392012506723404, "rewards/accuracy_reward": 0.6875000298023224, "rewards/format_reward": 0.0, "step": 936 }, { "completion_length": 434.7708511352539, "epoch": 0.9994666666666666, "grad_norm": 1.006341814994812, "kl": 1.091796875, "learning_rate": 0.0, "loss": -0.167, "reward": 0.6666666716337204, "reward_std": 0.4422449991106987, "rewards/accuracy_reward": 0.6666666716337204, "rewards/format_reward": 0.0, "step": 937 }, { "epoch": 0.9994666666666666, "eval_completion_length": 469.99496316833495, "eval_kl": 1.3432896240234375, "eval_loss": -0.0576137974858284, "eval_reward": 0.5376333479851484, "eval_reward_std": 0.29796607765555383, "eval_rewards/accuracy_reward": 0.5376333479851484, "eval_rewards/format_reward": 0.0, "eval_runtime": 32298.0146, "eval_samples_per_second": 0.155, "eval_steps_per_second": 0.013, "step": 937 }, { "epoch": 0.9994666666666666, "step": 937, "total_flos": 0.0, "train_loss": -0.02607913586368691, "train_runtime": 106620.2283, "train_samples_per_second": 0.07, "train_steps_per_second": 0.009 } ], "logging_steps": 1, "max_steps": 937, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 3, "trial_name": null, "trial_params": null }