{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 6.392694063926941, "eval_steps": 500, "global_step": 1400, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "completion_length": 101.3359375, "epoch": 0.0045662100456621, "grad_norm": 6.199877738952637, "kl": 0.0, "learning_rate": 9.995433789954337e-07, "loss": -0.0, "reward": 0.6208333522081375, "reward_std": 0.723351925611496, "rewards/accuracy_reward": 0.2614583298563957, "rewards/format_reward": 0.359375, "step": 1 }, { "completion_length": 105.796875, "epoch": 0.0091324200913242, "grad_norm": 13.76264476776123, "kl": 0.0005550384521484375, "learning_rate": 9.990867579908674e-07, "loss": 0.0, "reward": 1.1440449953079224, "reward_std": 0.6297117471694946, "rewards/accuracy_reward": 0.5346700549125671, "rewards/format_reward": 0.609375, "step": 2 }, { "completion_length": 106.609375, "epoch": 0.0136986301369863, "grad_norm": 6.990738868713379, "kl": 0.000965118408203125, "learning_rate": 9.986301369863014e-07, "loss": 0.0, "reward": 0.9957855343818665, "reward_std": 0.726005494594574, "rewards/accuracy_reward": 0.45672303438186646, "rewards/format_reward": 0.5390625, "step": 3 }, { "completion_length": 101.4140625, "epoch": 0.0182648401826484, "grad_norm": 4.9568257331848145, "kl": 0.001346588134765625, "learning_rate": 9.98173515981735e-07, "loss": 0.0001, "reward": 0.991195559501648, "reward_std": 0.7322099208831787, "rewards/accuracy_reward": 0.43650802969932556, "rewards/format_reward": 0.5546875, "step": 4 }, { "completion_length": 106.4453125, "epoch": 0.0228310502283105, "grad_norm": 4.493799209594727, "kl": 0.0035247802734375, "learning_rate": 9.977168949771688e-07, "loss": 0.0001, "reward": 0.9764133095741272, "reward_std": 0.6022496819496155, "rewards/accuracy_reward": 0.421725794672966, "rewards/format_reward": 0.5546875, "step": 5 }, { "completion_length": 125.046875, "epoch": 0.0273972602739726, "grad_norm": 3.48476505279541, "kl": 0.00315093994140625, "learning_rate": 9.972602739726028e-07, "loss": 0.0001, "reward": 1.162934124469757, "reward_std": 0.5378114879131317, "rewards/accuracy_reward": 0.49887165427207947, "rewards/format_reward": 0.6640625, "step": 6 }, { "completion_length": 107.1484375, "epoch": 0.0319634703196347, "grad_norm": 3.6667165756225586, "kl": 0.0055999755859375, "learning_rate": 9.968036529680365e-07, "loss": 0.0002, "reward": 1.1229968070983887, "reward_std": 0.6023176610469818, "rewards/accuracy_reward": 0.4511217921972275, "rewards/format_reward": 0.671875, "step": 7 }, { "completion_length": 76.890625, "epoch": 0.0365296803652968, "grad_norm": 4.247033596038818, "kl": 0.017059326171875, "learning_rate": 9.963470319634703e-07, "loss": 0.0007, "reward": 0.9820911884307861, "reward_std": 0.6595480144023895, "rewards/accuracy_reward": 0.31021616607904434, "rewards/format_reward": 0.671875, "step": 8 }, { "completion_length": 99.359375, "epoch": 0.0410958904109589, "grad_norm": 3.2795424461364746, "kl": 0.018402099609375, "learning_rate": 9.95890410958904e-07, "loss": 0.0007, "reward": 1.3125749230384827, "reward_std": 0.5887171626091003, "rewards/accuracy_reward": 0.5235124826431274, "rewards/format_reward": 0.7890625, "step": 9 }, { "completion_length": 68.734375, "epoch": 0.045662100456621, "grad_norm": 4.997896671295166, "kl": 0.04248046875, "learning_rate": 9.954337899543377e-07, "loss": 0.0017, "reward": 1.2461639046669006, "reward_std": 0.527179092168808, "rewards/accuracy_reward": 0.386788934469223, "rewards/format_reward": 0.859375, "step": 10 }, { "completion_length": 98.9453125, "epoch": 0.0502283105022831, "grad_norm": 7.262642860412598, "kl": 0.023773193359375, "learning_rate": 9.949771689497717e-07, "loss": 0.001, "reward": 1.3204909563064575, "reward_std": 0.4520634114742279, "rewards/accuracy_reward": 0.46892838180065155, "rewards/format_reward": 0.8515625, "step": 11 }, { "completion_length": 102.6953125, "epoch": 0.0547945205479452, "grad_norm": 2.358719825744629, "kl": 0.032470703125, "learning_rate": 9.945205479452054e-07, "loss": 0.0013, "reward": 1.3686427474021912, "reward_std": 0.45033006370067596, "rewards/accuracy_reward": 0.5405177175998688, "rewards/format_reward": 0.828125, "step": 12 }, { "completion_length": 83.1171875, "epoch": 0.0593607305936073, "grad_norm": 3.457000255584717, "kl": 0.044189453125, "learning_rate": 9.940639269406391e-07, "loss": 0.0018, "reward": 1.3782268166542053, "reward_std": 0.4138905107975006, "rewards/accuracy_reward": 0.45635175704956055, "rewards/format_reward": 0.921875, "step": 13 }, { "completion_length": 48.734375, "epoch": 0.0639269406392694, "grad_norm": 3.6976211071014404, "kl": 0.056884765625, "learning_rate": 9.93607305936073e-07, "loss": 0.0023, "reward": 1.340489685535431, "reward_std": 0.37420646846294403, "rewards/accuracy_reward": 0.4029896557331085, "rewards/format_reward": 0.9375, "step": 14 }, { "completion_length": 79.2421875, "epoch": 0.0684931506849315, "grad_norm": 2.8622705936431885, "kl": 0.043212890625, "learning_rate": 9.931506849315068e-07, "loss": 0.0017, "reward": 1.2608134746551514, "reward_std": 0.45702624320983887, "rewards/accuracy_reward": 0.393625944852829, "rewards/format_reward": 0.8671875, "step": 15 }, { "completion_length": 101.453125, "epoch": 0.0730593607305936, "grad_norm": 3.5660035610198975, "kl": 0.034423828125, "learning_rate": 9.926940639269406e-07, "loss": 0.0014, "reward": 1.4559400081634521, "reward_std": 0.3906751722097397, "rewards/accuracy_reward": 0.5418775081634521, "rewards/format_reward": 0.9140625, "step": 16 }, { "completion_length": 58.125, "epoch": 0.0776255707762557, "grad_norm": 3.2858753204345703, "kl": 0.052978515625, "learning_rate": 9.922374429223745e-07, "loss": 0.0021, "reward": 1.3879406452178955, "reward_std": 0.32537831366062164, "rewards/accuracy_reward": 0.4113781452178955, "rewards/format_reward": 0.9765625, "step": 17 }, { "completion_length": 77.3671875, "epoch": 0.0821917808219178, "grad_norm": 2.0785841941833496, "kl": 0.0511474609375, "learning_rate": 9.917808219178082e-07, "loss": 0.002, "reward": 1.5014740824699402, "reward_std": 0.27246882766485214, "rewards/accuracy_reward": 0.540536567568779, "rewards/format_reward": 0.9609375, "step": 18 }, { "completion_length": 70.7890625, "epoch": 0.0867579908675799, "grad_norm": 3.1388633251190186, "kl": 0.0772705078125, "learning_rate": 9.91324200913242e-07, "loss": 0.0031, "reward": 1.4673261046409607, "reward_std": 0.3659689426422119, "rewards/accuracy_reward": 0.5142011493444443, "rewards/format_reward": 0.953125, "step": 19 }, { "completion_length": 73.71875, "epoch": 0.091324200913242, "grad_norm": 3.4941606521606445, "kl": 0.0772705078125, "learning_rate": 9.908675799086757e-07, "loss": 0.0031, "reward": 1.2669085264205933, "reward_std": 0.41668441891670227, "rewards/accuracy_reward": 0.3919084817171097, "rewards/format_reward": 0.875, "step": 20 }, { "completion_length": 86.0390625, "epoch": 0.0958904109589041, "grad_norm": 4.310213565826416, "kl": 0.0654296875, "learning_rate": 9.904109589041094e-07, "loss": 0.0026, "reward": 1.4882500767707825, "reward_std": 0.30715326964855194, "rewards/accuracy_reward": 0.5351250767707825, "rewards/format_reward": 0.953125, "step": 21 }, { "completion_length": 71.8359375, "epoch": 0.1004566210045662, "grad_norm": 3.6013519763946533, "kl": 0.09912109375, "learning_rate": 9.899543378995434e-07, "loss": 0.004, "reward": 1.602288007736206, "reward_std": 0.2894471287727356, "rewards/accuracy_reward": 0.6257254481315613, "rewards/format_reward": 0.9765625, "step": 22 }, { "completion_length": 80.5078125, "epoch": 0.1050228310502283, "grad_norm": 2.9229257106781006, "kl": 0.07373046875, "learning_rate": 9.894977168949771e-07, "loss": 0.0029, "reward": 1.5954613089561462, "reward_std": 0.29795171320438385, "rewards/accuracy_reward": 0.6267113089561462, "rewards/format_reward": 0.96875, "step": 23 }, { "completion_length": 78.6953125, "epoch": 0.1095890410958904, "grad_norm": 3.3861594200134277, "kl": 0.0640869140625, "learning_rate": 9.89041095890411e-07, "loss": 0.0026, "reward": 1.54551100730896, "reward_std": 0.2194862775504589, "rewards/accuracy_reward": 0.5611358880996704, "rewards/format_reward": 0.984375, "step": 24 }, { "completion_length": 73.4453125, "epoch": 0.1141552511415525, "grad_norm": 3.412217140197754, "kl": 0.093994140625, "learning_rate": 9.885844748858448e-07, "loss": 0.0038, "reward": 1.3831676244735718, "reward_std": 0.3298991322517395, "rewards/accuracy_reward": 0.4378551170229912, "rewards/format_reward": 0.9453125, "step": 25 }, { "completion_length": 67.359375, "epoch": 0.1187214611872146, "grad_norm": 2.750807285308838, "kl": 0.0947265625, "learning_rate": 9.881278538812785e-07, "loss": 0.0038, "reward": 1.5090773701667786, "reward_std": 0.30529043078422546, "rewards/accuracy_reward": 0.5559523701667786, "rewards/format_reward": 0.953125, "step": 26 }, { "completion_length": 65.1953125, "epoch": 0.1232876712328767, "grad_norm": 5.965710163116455, "kl": 0.10595703125, "learning_rate": 9.876712328767123e-07, "loss": 0.0042, "reward": 1.5281611680984497, "reward_std": 0.3281934857368469, "rewards/accuracy_reward": 0.5437861680984497, "rewards/format_reward": 0.984375, "step": 27 }, { "completion_length": 76.8671875, "epoch": 0.1278538812785388, "grad_norm": 3.2638497352600098, "kl": 0.07958984375, "learning_rate": 9.87214611872146e-07, "loss": 0.0032, "reward": 1.5927692651748657, "reward_std": 0.24901984632015228, "rewards/accuracy_reward": 0.6318317353725433, "rewards/format_reward": 0.9609375, "step": 28 }, { "completion_length": 66.0234375, "epoch": 0.1324200913242009, "grad_norm": 3.1384220123291016, "kl": 0.106689453125, "learning_rate": 9.867579908675797e-07, "loss": 0.0043, "reward": 1.5447565913200378, "reward_std": 0.284378357231617, "rewards/accuracy_reward": 0.5760065913200378, "rewards/format_reward": 0.96875, "step": 29 }, { "completion_length": 56.90625, "epoch": 0.136986301369863, "grad_norm": 5.531259059906006, "kl": 0.11572265625, "learning_rate": 9.863013698630137e-07, "loss": 0.0046, "reward": 1.6361945867538452, "reward_std": 0.24622034281492233, "rewards/accuracy_reward": 0.6361946165561676, "rewards/format_reward": 1.0, "step": 30 }, { "completion_length": 55.5703125, "epoch": 0.1415525114155251, "grad_norm": 3.289832353591919, "kl": 0.10986328125, "learning_rate": 9.858447488584474e-07, "loss": 0.0044, "reward": 1.4083333611488342, "reward_std": 0.32854069769382477, "rewards/accuracy_reward": 0.45520833134651184, "rewards/format_reward": 0.953125, "step": 31 }, { "completion_length": 76.828125, "epoch": 0.1461187214611872, "grad_norm": 10.165680885314941, "kl": 0.091064453125, "learning_rate": 9.853881278538814e-07, "loss": 0.0036, "reward": 1.5438354015350342, "reward_std": 0.23943377658724785, "rewards/accuracy_reward": 0.5672729313373566, "rewards/format_reward": 0.9765625, "step": 32 }, { "completion_length": 58.3515625, "epoch": 0.1506849315068493, "grad_norm": 5.475100517272949, "kl": 0.108642578125, "learning_rate": 9.84931506849315e-07, "loss": 0.0043, "reward": 1.6295759081840515, "reward_std": 0.25405153632164, "rewards/accuracy_reward": 0.6373884081840515, "rewards/format_reward": 0.9921875, "step": 33 }, { "completion_length": 67.96875, "epoch": 0.1552511415525114, "grad_norm": 2.0600223541259766, "kl": 0.085693359375, "learning_rate": 9.844748858447488e-07, "loss": 0.0034, "reward": 1.5968750715255737, "reward_std": 0.2177756428718567, "rewards/accuracy_reward": 0.5968749821186066, "rewards/format_reward": 1.0, "step": 34 }, { "completion_length": 55.0703125, "epoch": 0.1598173515981735, "grad_norm": 4.03507661819458, "kl": 0.134033203125, "learning_rate": 9.840182648401826e-07, "loss": 0.0054, "reward": 1.5210416316986084, "reward_std": 0.262370266020298, "rewards/accuracy_reward": 0.5288541465997696, "rewards/format_reward": 0.9921875, "step": 35 }, { "completion_length": 69.890625, "epoch": 0.1643835616438356, "grad_norm": 2.3833439350128174, "kl": 0.091796875, "learning_rate": 9.835616438356163e-07, "loss": 0.0037, "reward": 1.6906250715255737, "reward_std": 0.20069602131843567, "rewards/accuracy_reward": 0.6906249821186066, "rewards/format_reward": 1.0, "step": 36 }, { "completion_length": 67.2265625, "epoch": 0.1689497716894977, "grad_norm": 3.298387050628662, "kl": 0.126220703125, "learning_rate": 9.831050228310502e-07, "loss": 0.0051, "reward": 1.5833591222763062, "reward_std": 0.23695440590381622, "rewards/accuracy_reward": 0.5911716222763062, "rewards/format_reward": 0.9921875, "step": 37 }, { "completion_length": 75.1875, "epoch": 0.1735159817351598, "grad_norm": 2.7184557914733887, "kl": 0.126953125, "learning_rate": 9.82648401826484e-07, "loss": 0.0051, "reward": 1.6548610925674438, "reward_std": 0.18883602693676949, "rewards/accuracy_reward": 0.6704860329627991, "rewards/format_reward": 0.984375, "step": 38 }, { "completion_length": 64.015625, "epoch": 0.1780821917808219, "grad_norm": 3.2870519161224365, "kl": 0.117431640625, "learning_rate": 9.821917808219177e-07, "loss": 0.0047, "reward": 1.5351698398590088, "reward_std": 0.267734594643116, "rewards/accuracy_reward": 0.5664198100566864, "rewards/format_reward": 0.96875, "step": 39 }, { "completion_length": 52.1171875, "epoch": 0.182648401826484, "grad_norm": 5.351809024810791, "kl": 0.15234375, "learning_rate": 9.817351598173517e-07, "loss": 0.0061, "reward": 1.443321943283081, "reward_std": 0.32138869166374207, "rewards/accuracy_reward": 0.44332198798656464, "rewards/format_reward": 1.0, "step": 40 }, { "completion_length": 70.34375, "epoch": 0.1872146118721461, "grad_norm": 2.1224160194396973, "kl": 0.126220703125, "learning_rate": 9.812785388127854e-07, "loss": 0.0051, "reward": 1.761244773864746, "reward_std": 0.13502541184425354, "rewards/accuracy_reward": 0.7846822142601013, "rewards/format_reward": 0.9765625, "step": 41 }, { "completion_length": 64.1796875, "epoch": 0.1917808219178082, "grad_norm": 3.5647408962249756, "kl": 0.15234375, "learning_rate": 9.808219178082191e-07, "loss": 0.0061, "reward": 1.4638384580612183, "reward_std": 0.2666083872318268, "rewards/accuracy_reward": 0.47946345806121826, "rewards/format_reward": 0.984375, "step": 42 }, { "completion_length": 51.8515625, "epoch": 0.1963470319634703, "grad_norm": 9.191572189331055, "kl": 0.15185546875, "learning_rate": 9.803652968036529e-07, "loss": 0.0061, "reward": 1.5036438703536987, "reward_std": 0.22346660494804382, "rewards/accuracy_reward": 0.5114563703536987, "rewards/format_reward": 0.9921875, "step": 43 }, { "completion_length": 67.1328125, "epoch": 0.2009132420091324, "grad_norm": 4.356919765472412, "kl": 0.18994140625, "learning_rate": 9.799086757990868e-07, "loss": 0.0076, "reward": 1.448957920074463, "reward_std": 0.2659284919500351, "rewards/accuracy_reward": 0.4645828604698181, "rewards/format_reward": 0.984375, "step": 44 }, { "completion_length": 82.8203125, "epoch": 0.2054794520547945, "grad_norm": 4.919662952423096, "kl": 0.074951171875, "learning_rate": 9.794520547945205e-07, "loss": 0.003, "reward": 1.6562500596046448, "reward_std": 0.1944543570280075, "rewards/accuracy_reward": 0.671875, "rewards/format_reward": 0.984375, "step": 45 }, { "completion_length": 79.3125, "epoch": 0.2100456621004566, "grad_norm": 3.381512403488159, "kl": 0.0947265625, "learning_rate": 9.789954337899543e-07, "loss": 0.0038, "reward": 1.5502474904060364, "reward_std": 0.18939972668886185, "rewards/accuracy_reward": 0.5502474904060364, "rewards/format_reward": 1.0, "step": 46 }, { "completion_length": 63.5390625, "epoch": 0.2146118721461187, "grad_norm": 1.7941492795944214, "kl": 0.101806640625, "learning_rate": 9.78538812785388e-07, "loss": 0.0041, "reward": 1.634374976158142, "reward_std": 0.13939146511256695, "rewards/accuracy_reward": 0.6343749165534973, "rewards/format_reward": 1.0, "step": 47 }, { "completion_length": 54.125, "epoch": 0.2191780821917808, "grad_norm": 3.9591901302337646, "kl": 0.2177734375, "learning_rate": 9.78082191780822e-07, "loss": 0.0087, "reward": 1.4007303714752197, "reward_std": 0.33134153485298157, "rewards/accuracy_reward": 0.4163554012775421, "rewards/format_reward": 0.984375, "step": 48 }, { "completion_length": 31.8984375, "epoch": 0.2237442922374429, "grad_norm": 3.5492911338806152, "kl": 0.15966796875, "learning_rate": 9.776255707762557e-07, "loss": 0.0064, "reward": 1.44140625, "reward_std": 0.32149194180965424, "rewards/accuracy_reward": 0.44140625, "rewards/format_reward": 1.0, "step": 49 }, { "completion_length": 59.5625, "epoch": 0.228310502283105, "grad_norm": 6.129611968994141, "kl": 0.15771484375, "learning_rate": 9.771689497716894e-07, "loss": 0.0063, "reward": 1.6152344346046448, "reward_std": 0.17767168581485748, "rewards/accuracy_reward": 0.623046875, "rewards/format_reward": 0.9921875, "step": 50 }, { "completion_length": 51.4453125, "epoch": 0.2328767123287671, "grad_norm": 1.8558975458145142, "kl": 0.1015625, "learning_rate": 9.767123287671234e-07, "loss": 0.0041, "reward": 1.6468749642372131, "reward_std": 0.16151440143585205, "rewards/accuracy_reward": 0.6468749940395355, "rewards/format_reward": 1.0, "step": 51 }, { "completion_length": 64.2734375, "epoch": 0.2374429223744292, "grad_norm": 1.8877390623092651, "kl": 0.122802734375, "learning_rate": 9.762557077625571e-07, "loss": 0.0049, "reward": 1.557812511920929, "reward_std": 0.1784707009792328, "rewards/accuracy_reward": 0.557812511920929, "rewards/format_reward": 1.0, "step": 52 }, { "completion_length": 67.5625, "epoch": 0.2420091324200913, "grad_norm": 3.6787755489349365, "kl": 0.111328125, "learning_rate": 9.757990867579908e-07, "loss": 0.0045, "reward": 1.6884114742279053, "reward_std": 0.17782026529312134, "rewards/accuracy_reward": 0.6962239742279053, "rewards/format_reward": 0.9921875, "step": 53 }, { "completion_length": 63.3671875, "epoch": 0.2465753424657534, "grad_norm": 2.3778576850891113, "kl": 0.0980224609375, "learning_rate": 9.753424657534246e-07, "loss": 0.0039, "reward": 1.6024739742279053, "reward_std": 0.19950664788484573, "rewards/accuracy_reward": 0.6102864444255829, "rewards/format_reward": 0.9921875, "step": 54 }, { "completion_length": 78.6640625, "epoch": 0.2511415525114155, "grad_norm": 3.8972158432006836, "kl": 0.113037109375, "learning_rate": 9.748858447488583e-07, "loss": 0.0045, "reward": 1.591796875, "reward_std": 0.17978167533874512, "rewards/accuracy_reward": 0.5917968302965164, "rewards/format_reward": 1.0, "step": 55 }, { "completion_length": 84.65625, "epoch": 0.2557077625570776, "grad_norm": 1.890934705734253, "kl": 0.086181640625, "learning_rate": 9.744292237442923e-07, "loss": 0.0035, "reward": 1.5904948115348816, "reward_std": 0.1708931028842926, "rewards/accuracy_reward": 0.5983072519302368, "rewards/format_reward": 0.9921875, "step": 56 }, { "completion_length": 68.1015625, "epoch": 0.2602739726027397, "grad_norm": 2.091691017150879, "kl": 0.110107421875, "learning_rate": 9.73972602739726e-07, "loss": 0.0044, "reward": 1.590624988079071, "reward_std": 0.21159524470567703, "rewards/accuracy_reward": 0.6062500178813934, "rewards/format_reward": 0.984375, "step": 57 }, { "completion_length": 58.8671875, "epoch": 0.2648401826484018, "grad_norm": 2.6050846576690674, "kl": 0.10546875, "learning_rate": 9.735159817351597e-07, "loss": 0.0042, "reward": 1.4429687857627869, "reward_std": 0.3084883987903595, "rewards/accuracy_reward": 0.4585937559604645, "rewards/format_reward": 0.984375, "step": 58 }, { "completion_length": 57.8125, "epoch": 0.2694063926940639, "grad_norm": 6.616756439208984, "kl": 0.104736328125, "learning_rate": 9.730593607305937e-07, "loss": 0.0042, "reward": 1.4373698234558105, "reward_std": 0.3443310409784317, "rewards/accuracy_reward": 0.46861980855464935, "rewards/format_reward": 0.96875, "step": 59 }, { "completion_length": 72.84375, "epoch": 0.273972602739726, "grad_norm": 2.8635833263397217, "kl": 0.115478515625, "learning_rate": 9.726027397260274e-07, "loss": 0.0046, "reward": 1.5471354722976685, "reward_std": 0.25735440850257874, "rewards/accuracy_reward": 0.5549479126930237, "rewards/format_reward": 0.9921875, "step": 60 }, { "completion_length": 89.2578125, "epoch": 0.2785388127853881, "grad_norm": 2.5327181816101074, "kl": 0.0791015625, "learning_rate": 9.721461187214611e-07, "loss": 0.0032, "reward": 1.732812523841858, "reward_std": 0.17859892547130585, "rewards/accuracy_reward": 0.7484374642372131, "rewards/format_reward": 0.984375, "step": 61 }, { "completion_length": 55.9453125, "epoch": 0.2831050228310502, "grad_norm": 2.024601697921753, "kl": 0.15771484375, "learning_rate": 9.716894977168949e-07, "loss": 0.0063, "reward": 1.6088541746139526, "reward_std": 0.28504033386707306, "rewards/accuracy_reward": 0.6401041746139526, "rewards/format_reward": 0.96875, "step": 62 }, { "completion_length": 69.328125, "epoch": 0.2876712328767123, "grad_norm": 4.5766730308532715, "kl": 0.111083984375, "learning_rate": 9.712328767123286e-07, "loss": 0.0044, "reward": 1.7034826278686523, "reward_std": 0.12682656943798065, "rewards/accuracy_reward": 0.719107449054718, "rewards/format_reward": 0.984375, "step": 63 }, { "completion_length": 78.0234375, "epoch": 0.2922374429223744, "grad_norm": 3.555784225463867, "kl": 0.109375, "learning_rate": 9.707762557077626e-07, "loss": 0.0044, "reward": 1.5473958253860474, "reward_std": 0.23826827853918076, "rewards/accuracy_reward": 0.5630208402872086, "rewards/format_reward": 0.984375, "step": 64 }, { "completion_length": 46.015625, "epoch": 0.2968036529680365, "grad_norm": 5.032433032989502, "kl": 0.17724609375, "learning_rate": 9.703196347031963e-07, "loss": 0.0071, "reward": 1.401562511920929, "reward_std": 0.2856694385409355, "rewards/accuracy_reward": 0.40156251192092896, "rewards/format_reward": 1.0, "step": 65 }, { "completion_length": 80.3203125, "epoch": 0.3013698630136986, "grad_norm": 3.9773030281066895, "kl": 0.083740234375, "learning_rate": 9.6986301369863e-07, "loss": 0.0034, "reward": 1.6140625476837158, "reward_std": 0.1680549457669258, "rewards/accuracy_reward": 0.6218749582767487, "rewards/format_reward": 0.9921875, "step": 66 }, { "completion_length": 72.734375, "epoch": 0.3059360730593607, "grad_norm": 4.309236526489258, "kl": 0.091796875, "learning_rate": 9.69406392694064e-07, "loss": 0.0037, "reward": 1.4543966054916382, "reward_std": 0.27315448969602585, "rewards/accuracy_reward": 0.4778340458869934, "rewards/format_reward": 0.9765625, "step": 67 }, { "completion_length": 48.328125, "epoch": 0.3105022831050228, "grad_norm": 2.2709150314331055, "kl": 0.17724609375, "learning_rate": 9.689497716894977e-07, "loss": 0.0071, "reward": 1.6068063974380493, "reward_std": 0.247873917222023, "rewards/accuracy_reward": 0.6380563378334045, "rewards/format_reward": 0.96875, "step": 68 }, { "completion_length": 64.4921875, "epoch": 0.3150684931506849, "grad_norm": 2.840082883834839, "kl": 0.12841796875, "learning_rate": 9.684931506849314e-07, "loss": 0.0051, "reward": 1.65234375, "reward_std": 0.1649593710899353, "rewards/accuracy_reward": 0.6601562201976776, "rewards/format_reward": 0.9921875, "step": 69 }, { "completion_length": 65.5, "epoch": 0.319634703196347, "grad_norm": 2.1552846431732178, "kl": 0.14453125, "learning_rate": 9.680365296803652e-07, "loss": 0.0058, "reward": 1.6424851417541504, "reward_std": 0.14906217902898788, "rewards/accuracy_reward": 0.642485111951828, "rewards/format_reward": 1.0, "step": 70 }, { "completion_length": 66.7265625, "epoch": 0.3242009132420091, "grad_norm": 2.2692887783050537, "kl": 0.1513671875, "learning_rate": 9.675799086757991e-07, "loss": 0.0061, "reward": 1.689843773841858, "reward_std": 0.17708637565374374, "rewards/accuracy_reward": 0.6976562440395355, "rewards/format_reward": 0.9921875, "step": 71 }, { "completion_length": 76.78125, "epoch": 0.3287671232876712, "grad_norm": 2.679795265197754, "kl": 0.121337890625, "learning_rate": 9.671232876712329e-07, "loss": 0.0049, "reward": 1.6899740099906921, "reward_std": 0.1546846330165863, "rewards/accuracy_reward": 0.6977863907814026, "rewards/format_reward": 0.9921875, "step": 72 }, { "completion_length": 44.765625, "epoch": 0.3333333333333333, "grad_norm": 5.212102890014648, "kl": 0.125, "learning_rate": 9.666666666666666e-07, "loss": 0.005, "reward": 1.5778015851974487, "reward_std": 0.2252170294523239, "rewards/accuracy_reward": 0.5778016149997711, "rewards/format_reward": 1.0, "step": 73 }, { "completion_length": 79.1953125, "epoch": 0.3378995433789954, "grad_norm": 2.121600389480591, "kl": 0.102783203125, "learning_rate": 9.662100456621003e-07, "loss": 0.0041, "reward": 1.4783854484558105, "reward_std": 0.2087068259716034, "rewards/accuracy_reward": 0.5018228888511658, "rewards/format_reward": 0.9765625, "step": 74 }, { "completion_length": 67.703125, "epoch": 0.3424657534246575, "grad_norm": 3.5149497985839844, "kl": 0.11279296875, "learning_rate": 9.657534246575343e-07, "loss": 0.0045, "reward": 1.655573308467865, "reward_std": 0.293765589594841, "rewards/accuracy_reward": 0.6868232786655426, "rewards/format_reward": 0.96875, "step": 75 }, { "completion_length": 64.078125, "epoch": 0.3470319634703196, "grad_norm": 3.8966798782348633, "kl": 0.150634765625, "learning_rate": 9.65296803652968e-07, "loss": 0.006, "reward": 1.686715006828308, "reward_std": 0.2304065003991127, "rewards/accuracy_reward": 0.7023399174213409, "rewards/format_reward": 0.984375, "step": 76 }, { "completion_length": 62.2890625, "epoch": 0.3515981735159817, "grad_norm": 3.204103469848633, "kl": 0.12451171875, "learning_rate": 9.648401826484017e-07, "loss": 0.005, "reward": 1.4418154954910278, "reward_std": 0.305108904838562, "rewards/accuracy_reward": 0.5199404954910278, "rewards/format_reward": 0.921875, "step": 77 }, { "completion_length": 56.8125, "epoch": 0.3561643835616438, "grad_norm": 35.58188247680664, "kl": 0.1640625, "learning_rate": 9.643835616438357e-07, "loss": 0.0066, "reward": 1.5536458492279053, "reward_std": 0.34119510650634766, "rewards/accuracy_reward": 0.6161458492279053, "rewards/format_reward": 0.9375, "step": 78 }, { "completion_length": 79.78125, "epoch": 0.3607305936073059, "grad_norm": 2.3489811420440674, "kl": 0.107421875, "learning_rate": 9.639269406392694e-07, "loss": 0.0043, "reward": 1.5434371829032898, "reward_std": 0.2838926613330841, "rewards/accuracy_reward": 0.6059371829032898, "rewards/format_reward": 0.9375, "step": 79 }, { "completion_length": 76.34375, "epoch": 0.365296803652968, "grad_norm": 3.45082426071167, "kl": 0.1591796875, "learning_rate": 9.634703196347032e-07, "loss": 0.0064, "reward": 1.510318636894226, "reward_std": 0.3496459722518921, "rewards/accuracy_reward": 0.5806310772895813, "rewards/format_reward": 0.9296875, "step": 80 }, { "completion_length": 83.40625, "epoch": 0.3698630136986301, "grad_norm": 2.6958389282226562, "kl": 0.13671875, "learning_rate": 9.630136986301369e-07, "loss": 0.0055, "reward": 1.4917969107627869, "reward_std": 0.29826872050762177, "rewards/accuracy_reward": 0.5308593809604645, "rewards/format_reward": 0.9609375, "step": 81 }, { "completion_length": 61.8984375, "epoch": 0.3744292237442922, "grad_norm": 9.85806941986084, "kl": 0.15966796875, "learning_rate": 9.625570776255706e-07, "loss": 0.0064, "reward": 1.2881510257720947, "reward_std": 0.37539002299308777, "rewards/accuracy_reward": 0.3506510257720947, "rewards/format_reward": 0.9375, "step": 82 }, { "completion_length": 87.1640625, "epoch": 0.3789954337899543, "grad_norm": 1.7815834283828735, "kl": 0.11572265625, "learning_rate": 9.621004566210046e-07, "loss": 0.0046, "reward": 1.624854028224945, "reward_std": 0.19279810786247253, "rewards/accuracy_reward": 0.6482914686203003, "rewards/format_reward": 0.9765625, "step": 83 }, { "completion_length": 72.078125, "epoch": 0.3835616438356164, "grad_norm": 3.1070339679718018, "kl": 0.14013671875, "learning_rate": 9.616438356164383e-07, "loss": 0.0056, "reward": 1.5429518222808838, "reward_std": 0.3384602963924408, "rewards/accuracy_reward": 0.5820143222808838, "rewards/format_reward": 0.9609375, "step": 84 }, { "completion_length": 81.640625, "epoch": 0.3881278538812785, "grad_norm": 2.3276116847991943, "kl": 0.14990234375, "learning_rate": 9.61187214611872e-07, "loss": 0.006, "reward": 1.5467524528503418, "reward_std": 0.23365992307662964, "rewards/accuracy_reward": 0.570189893245697, "rewards/format_reward": 0.9765625, "step": 85 }, { "completion_length": 72.9296875, "epoch": 0.3926940639269406, "grad_norm": 13.4576416015625, "kl": 0.111572265625, "learning_rate": 9.60730593607306e-07, "loss": 0.0045, "reward": 1.580468773841858, "reward_std": 0.3186973035335541, "rewards/accuracy_reward": 0.6195312440395355, "rewards/format_reward": 0.9609375, "step": 86 }, { "completion_length": 76.4453125, "epoch": 0.3972602739726027, "grad_norm": 9.36266040802002, "kl": 0.135498046875, "learning_rate": 9.602739726027397e-07, "loss": 0.0054, "reward": 1.459251582622528, "reward_std": 0.3123939037322998, "rewards/accuracy_reward": 0.5061265677213669, "rewards/format_reward": 0.953125, "step": 87 }, { "completion_length": 71.3828125, "epoch": 0.4018264840182648, "grad_norm": 3.115593671798706, "kl": 0.110107421875, "learning_rate": 9.598173515981735e-07, "loss": 0.0044, "reward": 1.513058066368103, "reward_std": 0.2660725861787796, "rewards/accuracy_reward": 0.5521205365657806, "rewards/format_reward": 0.9609375, "step": 88 }, { "completion_length": 64.2109375, "epoch": 0.4063926940639269, "grad_norm": 3.5730538368225098, "kl": 0.169921875, "learning_rate": 9.593607305936072e-07, "loss": 0.0068, "reward": 1.4846374988555908, "reward_std": 0.25958995521068573, "rewards/accuracy_reward": 0.5080749839544296, "rewards/format_reward": 0.9765625, "step": 89 }, { "completion_length": 95.4921875, "epoch": 0.410958904109589, "grad_norm": 2.466034173965454, "kl": 0.11083984375, "learning_rate": 9.58904109589041e-07, "loss": 0.0044, "reward": 1.6667287349700928, "reward_std": 0.21123456954956055, "rewards/accuracy_reward": 0.7057911455631256, "rewards/format_reward": 0.9609375, "step": 90 }, { "completion_length": 68.859375, "epoch": 0.4155251141552511, "grad_norm": 2.8238866329193115, "kl": 0.147216796875, "learning_rate": 9.584474885844749e-07, "loss": 0.0059, "reward": 1.4025428891181946, "reward_std": 0.2600523456931114, "rewards/accuracy_reward": 0.4416054040193558, "rewards/format_reward": 0.9609375, "step": 91 }, { "completion_length": 86.1484375, "epoch": 0.4200913242009132, "grad_norm": 3.0570385456085205, "kl": 0.112548828125, "learning_rate": 9.579908675799086e-07, "loss": 0.0045, "reward": 1.394381046295166, "reward_std": 0.3288656920194626, "rewards/accuracy_reward": 0.44125598669052124, "rewards/format_reward": 0.953125, "step": 92 }, { "completion_length": 61.4609375, "epoch": 0.4246575342465753, "grad_norm": 3.7054836750030518, "kl": 0.154541015625, "learning_rate": 9.575342465753423e-07, "loss": 0.0062, "reward": 1.3927083611488342, "reward_std": 0.3015543594956398, "rewards/accuracy_reward": 0.40052083134651184, "rewards/format_reward": 0.9921875, "step": 93 }, { "completion_length": 75.2578125, "epoch": 0.4292237442922374, "grad_norm": 2.8483920097351074, "kl": 0.123046875, "learning_rate": 9.570776255707763e-07, "loss": 0.0049, "reward": 1.5788614153862, "reward_std": 0.26594626903533936, "rewards/accuracy_reward": 0.6022988557815552, "rewards/format_reward": 0.9765625, "step": 94 }, { "completion_length": 51.71875, "epoch": 0.4337899543378995, "grad_norm": 3.642658233642578, "kl": 0.2294921875, "learning_rate": 9.5662100456621e-07, "loss": 0.0092, "reward": 1.3690476417541504, "reward_std": 0.32933689653873444, "rewards/accuracy_reward": 0.400297611951828, "rewards/format_reward": 0.96875, "step": 95 }, { "completion_length": 81.4765625, "epoch": 0.4383561643835616, "grad_norm": 2.5764758586883545, "kl": 0.14013671875, "learning_rate": 9.561643835616437e-07, "loss": 0.0056, "reward": 1.6177189946174622, "reward_std": 0.20244715362787247, "rewards/accuracy_reward": 0.6333439946174622, "rewards/format_reward": 0.984375, "step": 96 }, { "completion_length": 64.3984375, "epoch": 0.4429223744292237, "grad_norm": 3.5117011070251465, "kl": 0.18408203125, "learning_rate": 9.557077625570777e-07, "loss": 0.0074, "reward": 1.3848958611488342, "reward_std": 0.33436155319213867, "rewards/accuracy_reward": 0.39270834624767303, "rewards/format_reward": 0.9921875, "step": 97 }, { "completion_length": 75.0703125, "epoch": 0.4474885844748858, "grad_norm": 5.965494632720947, "kl": 0.16650390625, "learning_rate": 9.552511415525114e-07, "loss": 0.0067, "reward": 1.490625023841858, "reward_std": 0.3021724224090576, "rewards/accuracy_reward": 0.5140624940395355, "rewards/format_reward": 0.9765625, "step": 98 }, { "completion_length": 81.328125, "epoch": 0.4520547945205479, "grad_norm": 5.425507545471191, "kl": 0.12939453125, "learning_rate": 9.547945205479452e-07, "loss": 0.0052, "reward": 1.489062488079071, "reward_std": 0.30329059064388275, "rewards/accuracy_reward": 0.5046875178813934, "rewards/format_reward": 0.984375, "step": 99 }, { "completion_length": 85.65625, "epoch": 0.45662100456621, "grad_norm": 3.3313958644866943, "kl": 0.124755859375, "learning_rate": 9.54337899543379e-07, "loss": 0.005, "reward": 1.4551078081130981, "reward_std": 0.2936365008354187, "rewards/accuracy_reward": 0.5019826889038086, "rewards/format_reward": 0.953125, "step": 100 }, { "completion_length": 101.9921875, "epoch": 0.4611872146118721, "grad_norm": 7.004181861877441, "kl": 0.100830078125, "learning_rate": 9.538812785388126e-07, "loss": 0.004, "reward": 1.8317708373069763, "reward_std": 0.11421890184283257, "rewards/accuracy_reward": 0.8395832479000092, "rewards/format_reward": 0.9921875, "step": 101 }, { "completion_length": 65.2890625, "epoch": 0.4657534246575342, "grad_norm": 5.00878381729126, "kl": 0.13720703125, "learning_rate": 9.534246575342465e-07, "loss": 0.0055, "reward": 1.4119791984558105, "reward_std": 0.3915850967168808, "rewards/accuracy_reward": 0.45885418355464935, "rewards/format_reward": 0.953125, "step": 102 }, { "completion_length": 66.1875, "epoch": 0.4703196347031963, "grad_norm": 33.22105026245117, "kl": 0.1689453125, "learning_rate": 9.529680365296803e-07, "loss": 0.0068, "reward": 1.5824455618858337, "reward_std": 0.362550288438797, "rewards/accuracy_reward": 0.5980705618858337, "rewards/format_reward": 0.984375, "step": 103 }, { "completion_length": 94.6015625, "epoch": 0.4748858447488584, "grad_norm": 2.8027193546295166, "kl": 0.100341796875, "learning_rate": 9.525114155251142e-07, "loss": 0.004, "reward": 1.573001742362976, "reward_std": 0.2286304533481598, "rewards/accuracy_reward": 0.5886267125606537, "rewards/format_reward": 0.984375, "step": 104 }, { "completion_length": 88.09375, "epoch": 0.4794520547945205, "grad_norm": 3.694000482559204, "kl": 0.1162109375, "learning_rate": 9.520547945205479e-07, "loss": 0.0046, "reward": 1.616619348526001, "reward_std": 0.24134992063045502, "rewards/accuracy_reward": 0.6244317889213562, "rewards/format_reward": 0.9921875, "step": 105 }, { "completion_length": 97.0546875, "epoch": 0.4840182648401826, "grad_norm": 3.0992743968963623, "kl": 0.1025390625, "learning_rate": 9.515981735159817e-07, "loss": 0.0041, "reward": 1.6255208253860474, "reward_std": 0.2588481456041336, "rewards/accuracy_reward": 0.6489583253860474, "rewards/format_reward": 0.9765625, "step": 106 }, { "completion_length": 76.7890625, "epoch": 0.4885844748858447, "grad_norm": 2.923036813735962, "kl": 0.12255859375, "learning_rate": 9.511415525114155e-07, "loss": 0.0049, "reward": 1.6026042103767395, "reward_std": 0.23826471716165543, "rewards/accuracy_reward": 0.6260416805744171, "rewards/format_reward": 0.9765625, "step": 107 }, { "completion_length": 78.6796875, "epoch": 0.4931506849315068, "grad_norm": 2.8310043811798096, "kl": 0.1357421875, "learning_rate": 9.506849315068493e-07, "loss": 0.0054, "reward": 1.4074218273162842, "reward_std": 0.3311483561992645, "rewards/accuracy_reward": 0.44648435711860657, "rewards/format_reward": 0.9609375, "step": 108 }, { "completion_length": 76.40625, "epoch": 0.4977168949771689, "grad_norm": 4.38238525390625, "kl": 0.119873046875, "learning_rate": 9.50228310502283e-07, "loss": 0.0048, "reward": 1.5411389470100403, "reward_std": 0.22866196930408478, "rewards/accuracy_reward": 0.5802014023065567, "rewards/format_reward": 0.9609375, "step": 109 }, { "completion_length": 71.234375, "epoch": 0.502283105022831, "grad_norm": 5.190479755401611, "kl": 0.18994140625, "learning_rate": 9.497716894977168e-07, "loss": 0.0076, "reward": 1.5246779322624207, "reward_std": 0.31538626551628113, "rewards/accuracy_reward": 0.5637404024600983, "rewards/format_reward": 0.9609375, "step": 110 }, { "completion_length": 63.21875, "epoch": 0.5068493150684932, "grad_norm": 7.178065776824951, "kl": 0.111083984375, "learning_rate": 9.493150684931507e-07, "loss": 0.0044, "reward": 1.421093761920929, "reward_std": 0.3210388720035553, "rewards/accuracy_reward": 0.42890626192092896, "rewards/format_reward": 0.9921875, "step": 111 }, { "completion_length": 86.609375, "epoch": 0.5114155251141552, "grad_norm": 3.856398105621338, "kl": 0.115478515625, "learning_rate": 9.488584474885845e-07, "loss": 0.0046, "reward": 1.504786729812622, "reward_std": 0.22957277297973633, "rewards/accuracy_reward": 0.5125992149114609, "rewards/format_reward": 0.9921875, "step": 112 }, { "completion_length": 77.28125, "epoch": 0.5159817351598174, "grad_norm": 8.988385200500488, "kl": 0.15869140625, "learning_rate": 9.484018264840182e-07, "loss": 0.0064, "reward": 1.4809608459472656, "reward_std": 0.3224972039461136, "rewards/accuracy_reward": 0.5043983459472656, "rewards/format_reward": 0.9765625, "step": 113 }, { "completion_length": 84.90625, "epoch": 0.5205479452054794, "grad_norm": 4.3304219245910645, "kl": 0.1025390625, "learning_rate": 9.47945205479452e-07, "loss": 0.0041, "reward": 1.5533854961395264, "reward_std": 0.2838037610054016, "rewards/accuracy_reward": 0.5846354365348816, "rewards/format_reward": 0.96875, "step": 114 }, { "completion_length": 94.1328125, "epoch": 0.5251141552511416, "grad_norm": 2.1755430698394775, "kl": 0.11572265625, "learning_rate": 9.474885844748858e-07, "loss": 0.0046, "reward": 1.6908555626869202, "reward_std": 0.18422479182481766, "rewards/accuracy_reward": 0.7064805030822754, "rewards/format_reward": 0.984375, "step": 115 }, { "completion_length": 87.375, "epoch": 0.5296803652968036, "grad_norm": 2.527374029159546, "kl": 0.09033203125, "learning_rate": 9.470319634703196e-07, "loss": 0.0036, "reward": 1.4536417722702026, "reward_std": 0.23572513461112976, "rewards/accuracy_reward": 0.46926678717136383, "rewards/format_reward": 0.984375, "step": 116 }, { "completion_length": 90.4140625, "epoch": 0.5342465753424658, "grad_norm": 2.9267947673797607, "kl": 0.098388671875, "learning_rate": 9.465753424657534e-07, "loss": 0.0039, "reward": 1.52434903383255, "reward_std": 0.2611450105905533, "rewards/accuracy_reward": 0.5477864444255829, "rewards/format_reward": 0.9765625, "step": 117 }, { "completion_length": 108.2890625, "epoch": 0.5388127853881278, "grad_norm": 4.520658493041992, "kl": 0.1044921875, "learning_rate": 9.461187214611872e-07, "loss": 0.0042, "reward": 1.5627976655960083, "reward_std": 0.2677070200443268, "rewards/accuracy_reward": 0.6252975761890411, "rewards/format_reward": 0.9375, "step": 118 }, { "completion_length": 74.1953125, "epoch": 0.54337899543379, "grad_norm": 4.915615558624268, "kl": 0.1650390625, "learning_rate": 9.45662100456621e-07, "loss": 0.0066, "reward": 1.5641247630119324, "reward_std": 0.22196320444345474, "rewards/accuracy_reward": 0.56412473320961, "rewards/format_reward": 1.0, "step": 119 }, { "completion_length": 76.375, "epoch": 0.547945205479452, "grad_norm": 4.11935567855835, "kl": 0.094482421875, "learning_rate": 9.452054794520548e-07, "loss": 0.0038, "reward": 1.5418658256530762, "reward_std": 0.2516227439045906, "rewards/accuracy_reward": 0.5496782958507538, "rewards/format_reward": 0.9921875, "step": 120 }, { "completion_length": 68.0859375, "epoch": 0.5525114155251142, "grad_norm": 3.256564140319824, "kl": 0.14208984375, "learning_rate": 9.447488584474885e-07, "loss": 0.0057, "reward": 1.520312488079071, "reward_std": 0.2812953442335129, "rewards/accuracy_reward": 0.5281250178813934, "rewards/format_reward": 0.9921875, "step": 121 }, { "completion_length": 92.7109375, "epoch": 0.5570776255707762, "grad_norm": 2.9781503677368164, "kl": 0.10546875, "learning_rate": 9.442922374429223e-07, "loss": 0.0042, "reward": 1.6091517806053162, "reward_std": 0.23979534208774567, "rewards/accuracy_reward": 0.6404017508029938, "rewards/format_reward": 0.96875, "step": 122 }, { "completion_length": 55.8984375, "epoch": 0.5616438356164384, "grad_norm": 4.986601829528809, "kl": 0.146484375, "learning_rate": 9.438356164383561e-07, "loss": 0.0059, "reward": 1.562890648841858, "reward_std": 0.2664823532104492, "rewards/accuracy_reward": 0.5707031190395355, "rewards/format_reward": 0.9921875, "step": 123 }, { "completion_length": 66.03125, "epoch": 0.5662100456621004, "grad_norm": 5.077969551086426, "kl": 0.140625, "learning_rate": 9.4337899543379e-07, "loss": 0.0056, "reward": 1.5604352951049805, "reward_std": 0.2387000024318695, "rewards/accuracy_reward": 0.5604352653026581, "rewards/format_reward": 1.0, "step": 124 }, { "completion_length": 75.796875, "epoch": 0.5707762557077626, "grad_norm": 2.614694356918335, "kl": 0.124755859375, "learning_rate": 9.429223744292237e-07, "loss": 0.005, "reward": 1.7544271349906921, "reward_std": 0.13757340610027313, "rewards/accuracy_reward": 0.7700520753860474, "rewards/format_reward": 0.984375, "step": 125 }, { "completion_length": 63.4765625, "epoch": 0.5753424657534246, "grad_norm": 5.098128795623779, "kl": 0.13330078125, "learning_rate": 9.424657534246575e-07, "loss": 0.0053, "reward": 1.545653522014618, "reward_std": 0.2823333144187927, "rewards/accuracy_reward": 0.5612785220146179, "rewards/format_reward": 0.984375, "step": 126 }, { "completion_length": 81.3046875, "epoch": 0.5799086757990868, "grad_norm": 2.6090292930603027, "kl": 0.072021484375, "learning_rate": 9.420091324200913e-07, "loss": 0.0029, "reward": 1.5642718076705933, "reward_std": 0.16427360475063324, "rewards/accuracy_reward": 0.5642717182636261, "rewards/format_reward": 1.0, "step": 127 }, { "completion_length": 63.546875, "epoch": 0.5844748858447488, "grad_norm": 4.00266170501709, "kl": 0.093505859375, "learning_rate": 9.41552511415525e-07, "loss": 0.0037, "reward": 1.5677236318588257, "reward_std": 0.30948029458522797, "rewards/accuracy_reward": 0.5755361616611481, "rewards/format_reward": 0.9921875, "step": 128 }, { "completion_length": 75.8359375, "epoch": 0.589041095890411, "grad_norm": 3.477132558822632, "kl": 0.099609375, "learning_rate": 9.410958904109588e-07, "loss": 0.004, "reward": 1.5374347567558289, "reward_std": 0.24908459186553955, "rewards/accuracy_reward": 0.5608722269535065, "rewards/format_reward": 0.9765625, "step": 129 }, { "completion_length": 76.46875, "epoch": 0.593607305936073, "grad_norm": 2.5465404987335205, "kl": 0.1416015625, "learning_rate": 9.406392694063926e-07, "loss": 0.0057, "reward": 1.6419846415519714, "reward_std": 0.1617630198597908, "rewards/accuracy_reward": 0.6497970819473267, "rewards/format_reward": 0.9921875, "step": 130 }, { "completion_length": 69.015625, "epoch": 0.5981735159817352, "grad_norm": 4.162069320678711, "kl": 0.14794921875, "learning_rate": 9.401826484018265e-07, "loss": 0.0059, "reward": 1.5735609531402588, "reward_std": 0.24296356737613678, "rewards/accuracy_reward": 0.5735608339309692, "rewards/format_reward": 1.0, "step": 131 }, { "completion_length": 63.578125, "epoch": 0.6027397260273972, "grad_norm": 2.8231759071350098, "kl": 0.158935546875, "learning_rate": 9.397260273972603e-07, "loss": 0.0064, "reward": 1.6688734889030457, "reward_std": 0.20343666523694992, "rewards/accuracy_reward": 0.6766859591007233, "rewards/format_reward": 0.9921875, "step": 132 }, { "completion_length": 58.921875, "epoch": 0.6073059360730594, "grad_norm": 3.4793238639831543, "kl": 0.13818359375, "learning_rate": 9.39269406392694e-07, "loss": 0.0055, "reward": 1.4002604484558105, "reward_std": 0.2559265270829201, "rewards/accuracy_reward": 0.40026040375232697, "rewards/format_reward": 1.0, "step": 133 }, { "completion_length": 56.109375, "epoch": 0.6118721461187214, "grad_norm": 3.3541765213012695, "kl": 0.14404296875, "learning_rate": 9.388127853881278e-07, "loss": 0.0058, "reward": 1.470312476158142, "reward_std": 0.3302301913499832, "rewards/accuracy_reward": 0.4937499612569809, "rewards/format_reward": 0.9765625, "step": 134 }, { "completion_length": 70.4609375, "epoch": 0.6164383561643836, "grad_norm": 2.545574903488159, "kl": 0.104248046875, "learning_rate": 9.383561643835616e-07, "loss": 0.0042, "reward": 1.6333333849906921, "reward_std": 0.16338077187538147, "rewards/accuracy_reward": 0.633333295583725, "rewards/format_reward": 1.0, "step": 135 }, { "completion_length": 47.046875, "epoch": 0.6210045662100456, "grad_norm": 3.1955769062042236, "kl": 0.102783203125, "learning_rate": 9.378995433789953e-07, "loss": 0.0041, "reward": 1.459455132484436, "reward_std": 0.3040963113307953, "rewards/accuracy_reward": 0.47508013248443604, "rewards/format_reward": 0.984375, "step": 136 }, { "completion_length": 67.234375, "epoch": 0.6255707762557078, "grad_norm": 4.687453269958496, "kl": 0.13427734375, "learning_rate": 9.374429223744292e-07, "loss": 0.0054, "reward": 1.600705862045288, "reward_std": 0.20340244472026825, "rewards/accuracy_reward": 0.6085183620452881, "rewards/format_reward": 0.9921875, "step": 137 }, { "completion_length": 68.546875, "epoch": 0.6301369863013698, "grad_norm": 2.4881796836853027, "kl": 0.13037109375, "learning_rate": 9.36986301369863e-07, "loss": 0.0052, "reward": 1.5750191807746887, "reward_std": 0.21577110141515732, "rewards/accuracy_reward": 0.5906441807746887, "rewards/format_reward": 0.984375, "step": 138 }, { "completion_length": 58.6171875, "epoch": 0.634703196347032, "grad_norm": 4.482968807220459, "kl": 0.1357421875, "learning_rate": 9.365296803652968e-07, "loss": 0.0054, "reward": 1.6022320985794067, "reward_std": 0.25604283809661865, "rewards/accuracy_reward": 0.6022321283817291, "rewards/format_reward": 1.0, "step": 139 }, { "completion_length": 59.625, "epoch": 0.639269406392694, "grad_norm": 2.6462514400482178, "kl": 0.144287109375, "learning_rate": 9.360730593607306e-07, "loss": 0.0058, "reward": 1.6302083134651184, "reward_std": 0.2878073900938034, "rewards/accuracy_reward": 0.6458333432674408, "rewards/format_reward": 0.984375, "step": 140 }, { "completion_length": 67.953125, "epoch": 0.6438356164383562, "grad_norm": 14.897021293640137, "kl": 0.10888671875, "learning_rate": 9.356164383561643e-07, "loss": 0.0044, "reward": 1.5768229961395264, "reward_std": 0.26041025668382645, "rewards/accuracy_reward": 0.5846354067325592, "rewards/format_reward": 0.9921875, "step": 141 }, { "completion_length": 62.2421875, "epoch": 0.6484018264840182, "grad_norm": 2.723630428314209, "kl": 0.140625, "learning_rate": 9.351598173515981e-07, "loss": 0.0056, "reward": 1.5526549220085144, "reward_std": 0.2013382911682129, "rewards/accuracy_reward": 0.552654892206192, "rewards/format_reward": 1.0, "step": 142 }, { "completion_length": 62.234375, "epoch": 0.6529680365296804, "grad_norm": 4.193840026855469, "kl": 0.12744140625, "learning_rate": 9.347031963470319e-07, "loss": 0.0051, "reward": 1.5169403553009033, "reward_std": 0.3250080794095993, "rewards/accuracy_reward": 0.5481902956962585, "rewards/format_reward": 0.96875, "step": 143 }, { "completion_length": 54.5078125, "epoch": 0.6575342465753424, "grad_norm": 3.8824496269226074, "kl": 0.1943359375, "learning_rate": 9.342465753424658e-07, "loss": 0.0078, "reward": 1.5293877124786377, "reward_std": 0.20680496841669083, "rewards/accuracy_reward": 0.5293876230716705, "rewards/format_reward": 1.0, "step": 144 }, { "completion_length": 70.8359375, "epoch": 0.6621004566210046, "grad_norm": 2.7752647399902344, "kl": 0.109375, "learning_rate": 9.337899543378995e-07, "loss": 0.0044, "reward": 1.6002604365348816, "reward_std": 0.16751104593276978, "rewards/accuracy_reward": 0.6002604067325592, "rewards/format_reward": 1.0, "step": 145 }, { "completion_length": 43.4921875, "epoch": 0.6666666666666666, "grad_norm": 4.007904529571533, "kl": 0.1455078125, "learning_rate": 9.333333333333333e-07, "loss": 0.0058, "reward": 1.4145089387893677, "reward_std": 0.37413105368614197, "rewards/accuracy_reward": 0.4301339387893677, "rewards/format_reward": 0.984375, "step": 146 }, { "completion_length": 55.015625, "epoch": 0.6712328767123288, "grad_norm": 2.844416618347168, "kl": 0.14501953125, "learning_rate": 9.328767123287671e-07, "loss": 0.0058, "reward": 1.434374988079071, "reward_std": 0.2953774631023407, "rewards/accuracy_reward": 0.44218750298023224, "rewards/format_reward": 0.9921875, "step": 147 }, { "completion_length": 63.953125, "epoch": 0.6757990867579908, "grad_norm": 3.3027069568634033, "kl": 0.11376953125, "learning_rate": 9.324200913242009e-07, "loss": 0.0045, "reward": 1.5338541269302368, "reward_std": 0.2085491269826889, "rewards/accuracy_reward": 0.5494791865348816, "rewards/format_reward": 0.984375, "step": 148 }, { "completion_length": 60.046875, "epoch": 0.680365296803653, "grad_norm": 2.4687771797180176, "kl": 0.150146484375, "learning_rate": 9.319634703196346e-07, "loss": 0.006, "reward": 1.4104894995689392, "reward_std": 0.2745247557759285, "rewards/accuracy_reward": 0.4104894697666168, "rewards/format_reward": 1.0, "step": 149 }, { "completion_length": 70.234375, "epoch": 0.684931506849315, "grad_norm": 3.1265056133270264, "kl": 0.112548828125, "learning_rate": 9.315068493150684e-07, "loss": 0.0045, "reward": 1.5757813453674316, "reward_std": 0.2977932393550873, "rewards/accuracy_reward": 0.5914062261581421, "rewards/format_reward": 0.984375, "step": 150 }, { "completion_length": 55.9609375, "epoch": 0.6894977168949772, "grad_norm": 2.7954444885253906, "kl": 0.138916015625, "learning_rate": 9.310502283105023e-07, "loss": 0.0056, "reward": 1.532336711883545, "reward_std": 0.2634681910276413, "rewards/accuracy_reward": 0.5557741522789001, "rewards/format_reward": 0.9765625, "step": 151 }, { "completion_length": 91.7734375, "epoch": 0.6940639269406392, "grad_norm": 3.276801586151123, "kl": 0.09521484375, "learning_rate": 9.30593607305936e-07, "loss": 0.0038, "reward": 1.8068453073501587, "reward_std": 0.13430681824684143, "rewards/accuracy_reward": 0.8146576881408691, "rewards/format_reward": 0.9921875, "step": 152 }, { "completion_length": 71.9453125, "epoch": 0.6986301369863014, "grad_norm": 5.107734680175781, "kl": 0.126708984375, "learning_rate": 9.301369863013698e-07, "loss": 0.0051, "reward": 1.5772569179534912, "reward_std": 0.3007543087005615, "rewards/accuracy_reward": 0.60069440305233, "rewards/format_reward": 0.9765625, "step": 153 }, { "completion_length": 65.1015625, "epoch": 0.7031963470319634, "grad_norm": 11.66229248046875, "kl": 0.142578125, "learning_rate": 9.296803652968036e-07, "loss": 0.0057, "reward": 1.5702590942382812, "reward_std": 0.2691568061709404, "rewards/accuracy_reward": 0.5780715942382812, "rewards/format_reward": 0.9921875, "step": 154 }, { "completion_length": 70.734375, "epoch": 0.7077625570776256, "grad_norm": 2.577918291091919, "kl": 0.101318359375, "learning_rate": 9.292237442922374e-07, "loss": 0.0041, "reward": 1.706770896911621, "reward_std": 0.17131806910037994, "rewards/accuracy_reward": 0.7145833075046539, "rewards/format_reward": 0.9921875, "step": 155 }, { "completion_length": 60.515625, "epoch": 0.7123287671232876, "grad_norm": 3.4716637134552, "kl": 0.119384765625, "learning_rate": 9.287671232876712e-07, "loss": 0.0048, "reward": 1.534895896911621, "reward_std": 0.2703789845108986, "rewards/accuracy_reward": 0.5817708373069763, "rewards/format_reward": 0.953125, "step": 156 }, { "completion_length": 56.15625, "epoch": 0.7168949771689498, "grad_norm": 3.562089204788208, "kl": 0.119140625, "learning_rate": 9.28310502283105e-07, "loss": 0.0048, "reward": 1.586718738079071, "reward_std": 0.23555129766464233, "rewards/accuracy_reward": 0.5867187678813934, "rewards/format_reward": 1.0, "step": 157 }, { "completion_length": 78.2890625, "epoch": 0.7214611872146118, "grad_norm": 3.0020015239715576, "kl": 0.124755859375, "learning_rate": 9.278538812785388e-07, "loss": 0.005, "reward": 1.6171875596046448, "reward_std": 0.29713982343673706, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.9609375, "step": 158 }, { "completion_length": 79.8203125, "epoch": 0.726027397260274, "grad_norm": 3.9287021160125732, "kl": 0.089599609375, "learning_rate": 9.273972602739726e-07, "loss": 0.0036, "reward": 1.5328125953674316, "reward_std": 0.3052903413772583, "rewards/accuracy_reward": 0.5953125059604645, "rewards/format_reward": 0.9375, "step": 159 }, { "completion_length": 81.546875, "epoch": 0.730593607305936, "grad_norm": 4.853604793548584, "kl": 0.142333984375, "learning_rate": 9.269406392694063e-07, "loss": 0.0057, "reward": 1.6752576231956482, "reward_std": 0.17081036418676376, "rewards/accuracy_reward": 0.6830700039863586, "rewards/format_reward": 0.9921875, "step": 160 }, { "completion_length": 62.828125, "epoch": 0.7351598173515982, "grad_norm": 6.2052903175354, "kl": 0.14013671875, "learning_rate": 9.264840182648401e-07, "loss": 0.0056, "reward": 1.4204427003860474, "reward_std": 0.3042631521821022, "rewards/accuracy_reward": 0.42825521528720856, "rewards/format_reward": 0.9921875, "step": 161 }, { "completion_length": 113.703125, "epoch": 0.7397260273972602, "grad_norm": 0.730038583278656, "kl": 0.066162109375, "learning_rate": 9.260273972602739e-07, "loss": 0.0026, "reward": 1.8767098784446716, "reward_std": 0.03975973278284073, "rewards/accuracy_reward": 0.8845223784446716, "rewards/format_reward": 0.9921875, "step": 162 }, { "completion_length": 77.8515625, "epoch": 0.7442922374429224, "grad_norm": 2.9642155170440674, "kl": 0.10009765625, "learning_rate": 9.255707762557077e-07, "loss": 0.004, "reward": 1.702742099761963, "reward_std": 0.18021715432405472, "rewards/accuracy_reward": 0.7105545401573181, "rewards/format_reward": 0.9921875, "step": 163 }, { "completion_length": 73.984375, "epoch": 0.7488584474885844, "grad_norm": 3.072006940841675, "kl": 0.13330078125, "learning_rate": 9.251141552511416e-07, "loss": 0.0053, "reward": 1.768545389175415, "reward_std": 0.1539178043603897, "rewards/accuracy_reward": 0.768545389175415, "rewards/format_reward": 1.0, "step": 164 }, { "completion_length": 65.953125, "epoch": 0.7534246575342466, "grad_norm": 5.146835803985596, "kl": 0.12841796875, "learning_rate": 9.246575342465753e-07, "loss": 0.0051, "reward": 1.503950297832489, "reward_std": 0.3056825324892998, "rewards/accuracy_reward": 0.5195753127336502, "rewards/format_reward": 0.984375, "step": 165 }, { "completion_length": 91.4921875, "epoch": 0.7579908675799086, "grad_norm": 2.6764745712280273, "kl": 0.1031494140625, "learning_rate": 9.242009132420091e-07, "loss": 0.0041, "reward": 1.5589421391487122, "reward_std": 0.23093140125274658, "rewards/accuracy_reward": 0.5745670646429062, "rewards/format_reward": 0.984375, "step": 166 }, { "completion_length": 81.7734375, "epoch": 0.7625570776255708, "grad_norm": 2.828564405441284, "kl": 0.12109375, "learning_rate": 9.237442922374429e-07, "loss": 0.0048, "reward": 1.5663008093833923, "reward_std": 0.20618148148059845, "rewards/accuracy_reward": 0.5975507497787476, "rewards/format_reward": 0.96875, "step": 167 }, { "completion_length": 76.296875, "epoch": 0.7671232876712328, "grad_norm": 2.9903767108917236, "kl": 0.1298828125, "learning_rate": 9.232876712328766e-07, "loss": 0.0052, "reward": 1.645312488079071, "reward_std": 0.2312253788113594, "rewards/accuracy_reward": 0.6531249731779099, "rewards/format_reward": 0.9921875, "step": 168 }, { "completion_length": 77.71875, "epoch": 0.771689497716895, "grad_norm": 2.3729984760284424, "kl": 0.103759765625, "learning_rate": 9.228310502283104e-07, "loss": 0.0042, "reward": 1.5843749642372131, "reward_std": 0.26058951020240784, "rewards/accuracy_reward": 0.6000000238418579, "rewards/format_reward": 0.984375, "step": 169 }, { "completion_length": 109.0234375, "epoch": 0.776255707762557, "grad_norm": 2.769318103790283, "kl": 0.076416015625, "learning_rate": 9.223744292237442e-07, "loss": 0.0031, "reward": 1.7934027910232544, "reward_std": 0.09004722535610199, "rewards/accuracy_reward": 0.801215261220932, "rewards/format_reward": 0.9921875, "step": 170 }, { "completion_length": 75.8203125, "epoch": 0.7808219178082192, "grad_norm": 2.6014890670776367, "kl": 0.131103515625, "learning_rate": 9.219178082191781e-07, "loss": 0.0052, "reward": 1.5569196343421936, "reward_std": 0.26341303437948227, "rewards/accuracy_reward": 0.580357164144516, "rewards/format_reward": 0.9765625, "step": 171 }, { "completion_length": 97.9375, "epoch": 0.7853881278538812, "grad_norm": 2.9145772457122803, "kl": 0.091064453125, "learning_rate": 9.214611872146119e-07, "loss": 0.0036, "reward": 1.6800222992897034, "reward_std": 0.15495410561561584, "rewards/accuracy_reward": 0.6956472992897034, "rewards/format_reward": 0.984375, "step": 172 }, { "completion_length": 82.0234375, "epoch": 0.7899543378995434, "grad_norm": 5.809459686279297, "kl": 0.27392578125, "learning_rate": 9.210045662100456e-07, "loss": 0.0109, "reward": 1.489843726158142, "reward_std": 0.22895930707454681, "rewards/accuracy_reward": 0.4898437559604645, "rewards/format_reward": 1.0, "step": 173 }, { "completion_length": 76.0859375, "epoch": 0.7945205479452054, "grad_norm": 4.242368698120117, "kl": 0.111572265625, "learning_rate": 9.205479452054794e-07, "loss": 0.0045, "reward": 1.4959192276000977, "reward_std": 0.2452109009027481, "rewards/accuracy_reward": 0.5037316679954529, "rewards/format_reward": 0.9921875, "step": 174 }, { "completion_length": 72.4921875, "epoch": 0.7990867579908676, "grad_norm": 4.817114353179932, "kl": 0.12060546875, "learning_rate": 9.200913242009132e-07, "loss": 0.0048, "reward": 1.607180118560791, "reward_std": 0.2259521484375, "rewards/accuracy_reward": 0.6149925291538239, "rewards/format_reward": 0.9921875, "step": 175 }, { "completion_length": 73.3828125, "epoch": 0.8036529680365296, "grad_norm": 5.523159027099609, "kl": 0.11669921875, "learning_rate": 9.196347031963469e-07, "loss": 0.0047, "reward": 1.4424473643302917, "reward_std": 0.35005757212638855, "rewards/accuracy_reward": 0.46588489413261414, "rewards/format_reward": 0.9765625, "step": 176 }, { "completion_length": 83.7109375, "epoch": 0.8082191780821918, "grad_norm": 6.852357387542725, "kl": 0.101318359375, "learning_rate": 9.191780821917808e-07, "loss": 0.0041, "reward": 1.7020823955535889, "reward_std": 0.1837347373366356, "rewards/accuracy_reward": 0.7020823657512665, "rewards/format_reward": 1.0, "step": 177 }, { "completion_length": 95.171875, "epoch": 0.8127853881278538, "grad_norm": 3.1507697105407715, "kl": 0.0908203125, "learning_rate": 9.187214611872146e-07, "loss": 0.0036, "reward": 1.6673295497894287, "reward_std": 0.14267417788505554, "rewards/accuracy_reward": 0.6673295497894287, "rewards/format_reward": 1.0, "step": 178 }, { "completion_length": 110.1640625, "epoch": 0.817351598173516, "grad_norm": 2.0009899139404297, "kl": 0.0653076171875, "learning_rate": 9.182648401826484e-07, "loss": 0.0026, "reward": 1.7578993439674377, "reward_std": 0.12590338289737701, "rewards/accuracy_reward": 0.765711784362793, "rewards/format_reward": 0.9921875, "step": 179 }, { "completion_length": 83.59375, "epoch": 0.821917808219178, "grad_norm": 2.3591296672821045, "kl": 0.09765625, "learning_rate": 9.178082191780822e-07, "loss": 0.0039, "reward": 1.72604501247406, "reward_std": 0.1832278147339821, "rewards/accuracy_reward": 0.7338575124740601, "rewards/format_reward": 0.9921875, "step": 180 }, { "completion_length": 90.8125, "epoch": 0.8264840182648402, "grad_norm": 4.365949630737305, "kl": 0.1068115234375, "learning_rate": 9.173515981735159e-07, "loss": 0.0043, "reward": 1.7241073250770569, "reward_std": 0.1286808280274272, "rewards/accuracy_reward": 0.7241072654724121, "rewards/format_reward": 1.0, "step": 181 }, { "completion_length": 70.1015625, "epoch": 0.8310502283105022, "grad_norm": 4.564199924468994, "kl": 0.1298828125, "learning_rate": 9.168949771689497e-07, "loss": 0.0052, "reward": 1.4811267852783203, "reward_std": 0.23117025196552277, "rewards/accuracy_reward": 0.4967518150806427, "rewards/format_reward": 0.984375, "step": 182 }, { "completion_length": 70.0859375, "epoch": 0.8356164383561644, "grad_norm": 8.132929801940918, "kl": 0.15234375, "learning_rate": 9.164383561643835e-07, "loss": 0.0061, "reward": 1.513009488582611, "reward_std": 0.31318600475788116, "rewards/accuracy_reward": 0.5208219289779663, "rewards/format_reward": 0.9921875, "step": 183 }, { "completion_length": 87.390625, "epoch": 0.8401826484018264, "grad_norm": 2.4152746200561523, "kl": 0.086669921875, "learning_rate": 9.159817351598174e-07, "loss": 0.0035, "reward": 1.6757907271385193, "reward_std": 0.16663195937871933, "rewards/accuracy_reward": 0.6836032569408417, "rewards/format_reward": 0.9921875, "step": 184 }, { "completion_length": 71.40625, "epoch": 0.8447488584474886, "grad_norm": 3.8499457836151123, "kl": 0.14111328125, "learning_rate": 9.155251141552511e-07, "loss": 0.0057, "reward": 1.418749988079071, "reward_std": 0.27360689640045166, "rewards/accuracy_reward": 0.42656251788139343, "rewards/format_reward": 0.9921875, "step": 185 }, { "completion_length": 100.109375, "epoch": 0.8493150684931506, "grad_norm": 1.6684777736663818, "kl": 0.103271484375, "learning_rate": 9.150684931506849e-07, "loss": 0.0041, "reward": 1.820052146911621, "reward_std": 0.09526955150067806, "rewards/accuracy_reward": 0.8278645575046539, "rewards/format_reward": 0.9921875, "step": 186 }, { "completion_length": 110.109375, "epoch": 0.8538812785388128, "grad_norm": 2.6076531410217285, "kl": 0.0618896484375, "learning_rate": 9.146118721461187e-07, "loss": 0.0025, "reward": 1.6446732878684998, "reward_std": 0.1990083083510399, "rewards/accuracy_reward": 0.6602982878684998, "rewards/format_reward": 0.984375, "step": 187 }, { "completion_length": 73.921875, "epoch": 0.8584474885844748, "grad_norm": 3.0167715549468994, "kl": 0.129150390625, "learning_rate": 9.141552511415525e-07, "loss": 0.0052, "reward": 1.4889777302742004, "reward_std": 0.310699999332428, "rewards/accuracy_reward": 0.520227700471878, "rewards/format_reward": 0.96875, "step": 188 }, { "completion_length": 77.5625, "epoch": 0.863013698630137, "grad_norm": 5.573866844177246, "kl": 0.142578125, "learning_rate": 9.136986301369862e-07, "loss": 0.0057, "reward": 1.5417782664299011, "reward_std": 0.21660251170396805, "rewards/accuracy_reward": 0.5417782664299011, "rewards/format_reward": 1.0, "step": 189 }, { "completion_length": 98.015625, "epoch": 0.867579908675799, "grad_norm": 3.7342417240142822, "kl": 0.12890625, "learning_rate": 9.1324200913242e-07, "loss": 0.0051, "reward": 1.5212674140930176, "reward_std": 0.3198610842227936, "rewards/accuracy_reward": 0.5681423544883728, "rewards/format_reward": 0.953125, "step": 190 }, { "completion_length": 87.9609375, "epoch": 0.8721461187214612, "grad_norm": 5.50727653503418, "kl": 0.09033203125, "learning_rate": 9.127853881278539e-07, "loss": 0.0036, "reward": 1.4584822058677673, "reward_std": 0.2924446016550064, "rewards/accuracy_reward": 0.47410714626312256, "rewards/format_reward": 0.984375, "step": 191 }, { "completion_length": 86.59375, "epoch": 0.8767123287671232, "grad_norm": 4.182096004486084, "kl": 0.110595703125, "learning_rate": 9.123287671232876e-07, "loss": 0.0044, "reward": 1.4973958134651184, "reward_std": 0.27411043643951416, "rewards/accuracy_reward": 0.5130208432674408, "rewards/format_reward": 0.984375, "step": 192 }, { "completion_length": 83.3359375, "epoch": 0.8812785388127854, "grad_norm": 3.033630609512329, "kl": 0.1015625, "learning_rate": 9.118721461187214e-07, "loss": 0.0041, "reward": 1.6757813096046448, "reward_std": 0.18561583012342453, "rewards/accuracy_reward": 0.6835937201976776, "rewards/format_reward": 0.9921875, "step": 193 }, { "completion_length": 86.953125, "epoch": 0.8858447488584474, "grad_norm": 2.5366592407226562, "kl": 0.0810546875, "learning_rate": 9.114155251141552e-07, "loss": 0.0032, "reward": 1.605208396911621, "reward_std": 0.2767683416604996, "rewards/accuracy_reward": 0.6286458075046539, "rewards/format_reward": 0.9765625, "step": 194 }, { "completion_length": 87.75, "epoch": 0.8904109589041096, "grad_norm": 2.646970748901367, "kl": 0.121826171875, "learning_rate": 9.10958904109589e-07, "loss": 0.0049, "reward": 1.6739211678504944, "reward_std": 0.22777877748012543, "rewards/accuracy_reward": 0.6817336082458496, "rewards/format_reward": 0.9921875, "step": 195 }, { "completion_length": 85.375, "epoch": 0.8949771689497716, "grad_norm": 4.142679214477539, "kl": 0.116455078125, "learning_rate": 9.105022831050228e-07, "loss": 0.0047, "reward": 1.5367559790611267, "reward_std": 0.22284159809350967, "rewards/accuracy_reward": 0.5367559492588043, "rewards/format_reward": 1.0, "step": 196 }, { "completion_length": 73.3359375, "epoch": 0.8995433789954338, "grad_norm": 98.02293395996094, "kl": 0.123779296875, "learning_rate": 9.100456621004566e-07, "loss": 0.005, "reward": 1.613690435886383, "reward_std": 0.3185143321752548, "rewards/accuracy_reward": 0.6293154358863831, "rewards/format_reward": 0.984375, "step": 197 }, { "completion_length": 68.6796875, "epoch": 0.9041095890410958, "grad_norm": 3.6209559440612793, "kl": 0.13037109375, "learning_rate": 9.095890410958904e-07, "loss": 0.0052, "reward": 1.440234363079071, "reward_std": 0.31120626628398895, "rewards/accuracy_reward": 0.47148437798023224, "rewards/format_reward": 0.96875, "step": 198 }, { "completion_length": 79.1953125, "epoch": 0.908675799086758, "grad_norm": 5.003497123718262, "kl": 0.1123046875, "learning_rate": 9.091324200913242e-07, "loss": 0.0045, "reward": 1.648740530014038, "reward_std": 0.1829584613442421, "rewards/accuracy_reward": 0.6487405002117157, "rewards/format_reward": 1.0, "step": 199 }, { "completion_length": 102.6171875, "epoch": 0.91324200913242, "grad_norm": 2.219266891479492, "kl": 0.0859375, "learning_rate": 9.08675799086758e-07, "loss": 0.0034, "reward": 1.7168915271759033, "reward_std": 0.12932297587394714, "rewards/accuracy_reward": 0.7247040569782257, "rewards/format_reward": 0.9921875, "step": 200 }, { "completion_length": 72.140625, "epoch": 0.9178082191780822, "grad_norm": 3.0909345149993896, "kl": 0.1162109375, "learning_rate": 9.082191780821917e-07, "loss": 0.0046, "reward": 1.4611505270004272, "reward_std": 0.3023676201701164, "rewards/accuracy_reward": 0.4924005717039108, "rewards/format_reward": 0.96875, "step": 201 }, { "completion_length": 94.140625, "epoch": 0.9223744292237442, "grad_norm": 4.803802967071533, "kl": 0.103759765625, "learning_rate": 9.077625570776255e-07, "loss": 0.0042, "reward": 1.6710938215255737, "reward_std": 0.18428679555654526, "rewards/accuracy_reward": 0.6789062321186066, "rewards/format_reward": 0.9921875, "step": 202 }, { "completion_length": 86.5234375, "epoch": 0.9269406392694064, "grad_norm": 4.511474609375, "kl": 0.078125, "learning_rate": 9.073059360730593e-07, "loss": 0.0031, "reward": 1.6398438215255737, "reward_std": 0.17492686957120895, "rewards/accuracy_reward": 0.6632812321186066, "rewards/format_reward": 0.9765625, "step": 203 }, { "completion_length": 77.4296875, "epoch": 0.9315068493150684, "grad_norm": 3.1800436973571777, "kl": 0.1025390625, "learning_rate": 9.068493150684932e-07, "loss": 0.0041, "reward": 1.5895833373069763, "reward_std": 0.2831973433494568, "rewards/accuracy_reward": 0.6130208373069763, "rewards/format_reward": 0.9765625, "step": 204 }, { "completion_length": 78.21875, "epoch": 0.9360730593607306, "grad_norm": 2.723695993423462, "kl": 0.110107421875, "learning_rate": 9.063926940639269e-07, "loss": 0.0044, "reward": 1.508962869644165, "reward_std": 0.293969988822937, "rewards/accuracy_reward": 0.5324003100395203, "rewards/format_reward": 0.9765625, "step": 205 }, { "completion_length": 71.1953125, "epoch": 0.9406392694063926, "grad_norm": 4.026218414306641, "kl": 0.125244140625, "learning_rate": 9.059360730593607e-07, "loss": 0.005, "reward": 1.475000023841858, "reward_std": 0.31207825243473053, "rewards/accuracy_reward": 0.4828125238418579, "rewards/format_reward": 0.9921875, "step": 206 }, { "completion_length": 91.5546875, "epoch": 0.9452054794520548, "grad_norm": 3.48559832572937, "kl": 0.078125, "learning_rate": 9.054794520547945e-07, "loss": 0.0031, "reward": 1.5565290451049805, "reward_std": 0.17772940546274185, "rewards/accuracy_reward": 0.5721540153026581, "rewards/format_reward": 0.984375, "step": 207 }, { "completion_length": 80.015625, "epoch": 0.9497716894977168, "grad_norm": 2.530951499938965, "kl": 0.103759765625, "learning_rate": 9.050228310502282e-07, "loss": 0.0042, "reward": 1.7421875, "reward_std": 0.17887144908308983, "rewards/accuracy_reward": 0.7499999403953552, "rewards/format_reward": 0.9921875, "step": 208 }, { "completion_length": 85.671875, "epoch": 0.954337899543379, "grad_norm": 3.1076207160949707, "kl": 0.084228515625, "learning_rate": 9.04566210045662e-07, "loss": 0.0034, "reward": 1.6094618439674377, "reward_std": 0.16175774857401848, "rewards/accuracy_reward": 0.617274284362793, "rewards/format_reward": 0.9921875, "step": 209 }, { "completion_length": 84.546875, "epoch": 0.958904109589041, "grad_norm": 2.6074228286743164, "kl": 0.076171875, "learning_rate": 9.041095890410958e-07, "loss": 0.003, "reward": 1.5931640267372131, "reward_std": 0.18021905422210693, "rewards/accuracy_reward": 0.5931640565395355, "rewards/format_reward": 1.0, "step": 210 }, { "completion_length": 74.015625, "epoch": 0.9634703196347032, "grad_norm": 3.9831788539886475, "kl": 0.087890625, "learning_rate": 9.036529680365297e-07, "loss": 0.0035, "reward": 1.5644965171813965, "reward_std": 0.20533857494592667, "rewards/accuracy_reward": 0.5644965171813965, "rewards/format_reward": 1.0, "step": 211 }, { "completion_length": 60.9296875, "epoch": 0.9680365296803652, "grad_norm": 2.8256754875183105, "kl": 0.10986328125, "learning_rate": 9.031963470319635e-07, "loss": 0.0044, "reward": 1.5828006267547607, "reward_std": 0.2151413932442665, "rewards/accuracy_reward": 0.5906131863594055, "rewards/format_reward": 0.9921875, "step": 212 }, { "completion_length": 88.8515625, "epoch": 0.9726027397260274, "grad_norm": 2.615966796875, "kl": 0.113037109375, "learning_rate": 9.027397260273972e-07, "loss": 0.0045, "reward": 1.5743975639343262, "reward_std": 0.24481570720672607, "rewards/accuracy_reward": 0.5822100639343262, "rewards/format_reward": 0.9921875, "step": 213 }, { "completion_length": 66.3359375, "epoch": 0.9771689497716894, "grad_norm": 11.469498634338379, "kl": 0.106201171875, "learning_rate": 9.02283105022831e-07, "loss": 0.0042, "reward": 1.7420889139175415, "reward_std": 0.13549309968948364, "rewards/accuracy_reward": 0.7420888245105743, "rewards/format_reward": 1.0, "step": 214 }, { "completion_length": 74.0390625, "epoch": 0.9817351598173516, "grad_norm": 3.873460292816162, "kl": 0.0693359375, "learning_rate": 9.018264840182648e-07, "loss": 0.0028, "reward": 1.6421875953674316, "reward_std": 0.2022070661187172, "rewards/accuracy_reward": 0.6421874761581421, "rewards/format_reward": 1.0, "step": 215 }, { "completion_length": 93.375, "epoch": 0.9863013698630136, "grad_norm": 2.612290143966675, "kl": 0.09765625, "learning_rate": 9.013698630136985e-07, "loss": 0.0039, "reward": 1.645312488079071, "reward_std": 0.25715554505586624, "rewards/accuracy_reward": 0.676562488079071, "rewards/format_reward": 0.96875, "step": 216 }, { "completion_length": 83.8515625, "epoch": 0.9908675799086758, "grad_norm": 1.9043883085250854, "kl": 0.077880859375, "learning_rate": 9.009132420091324e-07, "loss": 0.0031, "reward": 1.6007813215255737, "reward_std": 0.2224196195602417, "rewards/accuracy_reward": 0.6242187321186066, "rewards/format_reward": 0.9765625, "step": 217 }, { "completion_length": 70.2421875, "epoch": 0.9954337899543378, "grad_norm": 9.268891334533691, "kl": 0.114013671875, "learning_rate": 9.004566210045662e-07, "loss": 0.0046, "reward": 1.487464964389801, "reward_std": 0.3045327961444855, "rewards/accuracy_reward": 0.510902464389801, "rewards/format_reward": 0.9765625, "step": 218 }, { "completion_length": 138.875, "epoch": 1.0, "grad_norm": 2.398301839828491, "kl": 0.1318359375, "learning_rate": 9e-07, "loss": 0.0041, "reward": 1.899999976158142, "reward_std": 0.0, "rewards/accuracy_reward": 0.8999999761581421, "rewards/format_reward": 1.0, "step": 219 }, { "completion_length": 70.625, "epoch": 1.004566210045662, "grad_norm": 3.741870880126953, "kl": 0.12158203125, "learning_rate": 8.995433789954338e-07, "loss": 0.0049, "reward": 1.50390625, "reward_std": 0.26016832888126373, "rewards/accuracy_reward": 0.5039062350988388, "rewards/format_reward": 1.0, "step": 220 }, { "completion_length": 64.859375, "epoch": 1.009132420091324, "grad_norm": 11.450385093688965, "kl": 0.133056640625, "learning_rate": 8.990867579908675e-07, "loss": 0.0053, "reward": 1.7419270873069763, "reward_std": 0.11560981348156929, "rewards/accuracy_reward": 0.7419269979000092, "rewards/format_reward": 1.0, "step": 221 }, { "completion_length": 82.046875, "epoch": 1.0136986301369864, "grad_norm": 3.4675490856170654, "kl": 0.08935546875, "learning_rate": 8.986301369863013e-07, "loss": 0.0036, "reward": 1.6703497171401978, "reward_std": 0.2566395699977875, "rewards/accuracy_reward": 0.6859746873378754, "rewards/format_reward": 0.984375, "step": 222 }, { "completion_length": 88.4375, "epoch": 1.0182648401826484, "grad_norm": 2.4552440643310547, "kl": 0.11328125, "learning_rate": 8.981735159817351e-07, "loss": 0.0045, "reward": 1.646279752254486, "reward_std": 0.1442876234650612, "rewards/accuracy_reward": 0.6462797522544861, "rewards/format_reward": 1.0, "step": 223 }, { "completion_length": 60.6171875, "epoch": 1.0228310502283104, "grad_norm": 2.843845844268799, "kl": 0.144287109375, "learning_rate": 8.97716894977169e-07, "loss": 0.0058, "reward": 1.5498643517494202, "reward_std": 0.2656385153532028, "rewards/accuracy_reward": 0.5498644113540649, "rewards/format_reward": 1.0, "step": 224 }, { "completion_length": 59.640625, "epoch": 1.0273972602739727, "grad_norm": 6.659507751464844, "kl": 0.26123046875, "learning_rate": 8.972602739726027e-07, "loss": 0.0105, "reward": 1.522736370563507, "reward_std": 0.23698078095912933, "rewards/accuracy_reward": 0.5383614003658295, "rewards/format_reward": 0.984375, "step": 225 }, { "completion_length": 76.2578125, "epoch": 1.0319634703196348, "grad_norm": 7.1479926109313965, "kl": 0.11474609375, "learning_rate": 8.968036529680365e-07, "loss": 0.0046, "reward": 1.5441184043884277, "reward_std": 0.1862540915608406, "rewards/accuracy_reward": 0.5519307851791382, "rewards/format_reward": 0.9921875, "step": 226 }, { "completion_length": 52.7109375, "epoch": 1.0365296803652968, "grad_norm": 5.714673042297363, "kl": 0.17578125, "learning_rate": 8.963470319634703e-07, "loss": 0.007, "reward": 1.4614583253860474, "reward_std": 0.3212399482727051, "rewards/accuracy_reward": 0.47708331048488617, "rewards/format_reward": 0.984375, "step": 227 }, { "completion_length": 59.234375, "epoch": 1.0410958904109588, "grad_norm": 2.191619396209717, "kl": 0.143798828125, "learning_rate": 8.958904109589041e-07, "loss": 0.0058, "reward": 1.6684895753860474, "reward_std": 0.1687004156410694, "rewards/accuracy_reward": 0.6763020753860474, "rewards/format_reward": 0.9921875, "step": 228 }, { "completion_length": 64.375, "epoch": 1.045662100456621, "grad_norm": 7.204253196716309, "kl": 0.1484375, "learning_rate": 8.954337899543378e-07, "loss": 0.0059, "reward": 1.6486244797706604, "reward_std": 0.20006748288869858, "rewards/accuracy_reward": 0.6486244797706604, "rewards/format_reward": 1.0, "step": 229 }, { "completion_length": 68.6796875, "epoch": 1.0502283105022832, "grad_norm": 34.476558685302734, "kl": 0.112060546875, "learning_rate": 8.949771689497716e-07, "loss": 0.0045, "reward": 1.484375, "reward_std": 0.2940969169139862, "rewards/accuracy_reward": 0.5078124701976776, "rewards/format_reward": 0.9765625, "step": 230 }, { "completion_length": 60.7421875, "epoch": 1.0547945205479452, "grad_norm": 1.8852620124816895, "kl": 0.15234375, "learning_rate": 8.945205479452055e-07, "loss": 0.0061, "reward": 1.641055941581726, "reward_std": 0.19024673104286194, "rewards/accuracy_reward": 0.6488684415817261, "rewards/format_reward": 0.9921875, "step": 231 }, { "completion_length": 57.875, "epoch": 1.0593607305936072, "grad_norm": 5.363572597503662, "kl": 0.19677734375, "learning_rate": 8.940639269406392e-07, "loss": 0.0079, "reward": 1.5763392448425293, "reward_std": 0.3014761507511139, "rewards/accuracy_reward": 0.5997768044471741, "rewards/format_reward": 0.9765625, "step": 232 }, { "completion_length": 58.1484375, "epoch": 1.0639269406392695, "grad_norm": 2.532191276550293, "kl": 0.22900390625, "learning_rate": 8.93607305936073e-07, "loss": 0.0092, "reward": 1.5628709197044373, "reward_std": 0.1907019466161728, "rewards/accuracy_reward": 0.5706833899021149, "rewards/format_reward": 0.9921875, "step": 233 }, { "completion_length": 66.8046875, "epoch": 1.0684931506849316, "grad_norm": 5.099847793579102, "kl": 0.16259765625, "learning_rate": 8.931506849315068e-07, "loss": 0.0065, "reward": 1.6207798719406128, "reward_std": 0.2609640061855316, "rewards/accuracy_reward": 0.659842312335968, "rewards/format_reward": 0.9609375, "step": 234 }, { "completion_length": 60.0078125, "epoch": 1.0730593607305936, "grad_norm": 4.480944633483887, "kl": 0.22509765625, "learning_rate": 8.926940639269406e-07, "loss": 0.009, "reward": 1.514843761920929, "reward_std": 0.3193802535533905, "rewards/accuracy_reward": 0.5460937023162842, "rewards/format_reward": 0.96875, "step": 235 }, { "completion_length": 59.7109375, "epoch": 1.0776255707762556, "grad_norm": 3.332169532775879, "kl": 0.203125, "learning_rate": 8.922374429223744e-07, "loss": 0.0081, "reward": 1.5903645753860474, "reward_std": 0.2678108364343643, "rewards/accuracy_reward": 0.6216145753860474, "rewards/format_reward": 0.96875, "step": 236 }, { "completion_length": 60.46875, "epoch": 1.0821917808219177, "grad_norm": 2.0340123176574707, "kl": 0.14794921875, "learning_rate": 8.917808219178081e-07, "loss": 0.0059, "reward": 1.5674479603767395, "reward_std": 0.2526354044675827, "rewards/accuracy_reward": 0.5752603709697723, "rewards/format_reward": 0.9921875, "step": 237 }, { "completion_length": 57.0859375, "epoch": 1.08675799086758, "grad_norm": 2.9721603393554688, "kl": 0.18408203125, "learning_rate": 8.91324200913242e-07, "loss": 0.0074, "reward": 1.5576340556144714, "reward_std": 0.17624534666538239, "rewards/accuracy_reward": 0.5576339960098267, "rewards/format_reward": 1.0, "step": 238 }, { "completion_length": 90.328125, "epoch": 1.091324200913242, "grad_norm": 2.5770187377929688, "kl": 0.18798828125, "learning_rate": 8.908675799086758e-07, "loss": 0.0075, "reward": 1.7246136665344238, "reward_std": 0.16096369177103043, "rewards/accuracy_reward": 0.7480511367321014, "rewards/format_reward": 0.9765625, "step": 239 }, { "completion_length": 42.3828125, "epoch": 1.095890410958904, "grad_norm": 1.3982386589050293, "kl": 0.20263671875, "learning_rate": 8.904109589041095e-07, "loss": 0.0081, "reward": 1.5574405193328857, "reward_std": 0.1670553982257843, "rewards/accuracy_reward": 0.5574404895305634, "rewards/format_reward": 1.0, "step": 240 }, { "completion_length": 71.734375, "epoch": 1.1004566210045663, "grad_norm": 3.7010498046875, "kl": 0.16162109375, "learning_rate": 8.899543378995433e-07, "loss": 0.0065, "reward": 1.8125000596046448, "reward_std": 0.12704820185899734, "rewards/accuracy_reward": 0.8124999701976776, "rewards/format_reward": 1.0, "step": 241 }, { "completion_length": 65.9609375, "epoch": 1.1050228310502284, "grad_norm": 2.443540096282959, "kl": 0.1474609375, "learning_rate": 8.894977168949771e-07, "loss": 0.0059, "reward": 1.6270038485527039, "reward_std": 0.2314547374844551, "rewards/accuracy_reward": 0.6426288187503815, "rewards/format_reward": 0.984375, "step": 242 }, { "completion_length": 63.46875, "epoch": 1.1095890410958904, "grad_norm": 2.410088062286377, "kl": 0.1611328125, "learning_rate": 8.890410958904109e-07, "loss": 0.0064, "reward": 1.5, "reward_std": 0.2109457477927208, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 1.0, "step": 243 }, { "completion_length": 61.3671875, "epoch": 1.1141552511415524, "grad_norm": 3.672989845275879, "kl": 0.21826171875, "learning_rate": 8.885844748858448e-07, "loss": 0.0087, "reward": 1.6392003893852234, "reward_std": 0.21675443649291992, "rewards/accuracy_reward": 0.6626378297805786, "rewards/format_reward": 0.9765625, "step": 244 }, { "completion_length": 67.1171875, "epoch": 1.1187214611872145, "grad_norm": 2.5747616291046143, "kl": 0.1923828125, "learning_rate": 8.881278538812785e-07, "loss": 0.0077, "reward": 1.5857422351837158, "reward_std": 0.18916229903697968, "rewards/accuracy_reward": 0.5935547053813934, "rewards/format_reward": 0.9921875, "step": 245 }, { "completion_length": 51.1640625, "epoch": 1.1232876712328768, "grad_norm": 2.2640976905822754, "kl": 0.18017578125, "learning_rate": 8.876712328767123e-07, "loss": 0.0072, "reward": 1.5431283712387085, "reward_std": 0.19479237496852875, "rewards/accuracy_reward": 0.5431284010410309, "rewards/format_reward": 1.0, "step": 246 }, { "completion_length": 70.78125, "epoch": 1.1278538812785388, "grad_norm": 6.029892444610596, "kl": 0.1064453125, "learning_rate": 8.872146118721461e-07, "loss": 0.0042, "reward": 1.6893415451049805, "reward_std": 0.1636102795600891, "rewards/accuracy_reward": 0.6971540153026581, "rewards/format_reward": 0.9921875, "step": 247 }, { "completion_length": 67.9296875, "epoch": 1.1324200913242009, "grad_norm": 9.093193054199219, "kl": 0.1455078125, "learning_rate": 8.867579908675798e-07, "loss": 0.0058, "reward": 1.6471974849700928, "reward_std": 0.1802346110343933, "rewards/accuracy_reward": 0.647197425365448, "rewards/format_reward": 1.0, "step": 248 }, { "completion_length": 86.5234375, "epoch": 1.1369863013698631, "grad_norm": 3.213435411453247, "kl": 0.114501953125, "learning_rate": 8.863013698630136e-07, "loss": 0.0046, "reward": 1.550682783126831, "reward_std": 0.20900916308164597, "rewards/accuracy_reward": 0.5663077533245087, "rewards/format_reward": 0.984375, "step": 249 }, { "completion_length": 79.015625, "epoch": 1.1415525114155252, "grad_norm": 3.8062937259674072, "kl": 0.124267578125, "learning_rate": 8.858447488584474e-07, "loss": 0.005, "reward": 1.5841034650802612, "reward_std": 0.17966121435165405, "rewards/accuracy_reward": 0.5997284352779388, "rewards/format_reward": 0.984375, "step": 250 }, { "completion_length": 67.578125, "epoch": 1.1461187214611872, "grad_norm": 2.4963302612304688, "kl": 0.147705078125, "learning_rate": 8.853881278538813e-07, "loss": 0.0059, "reward": 1.712117850780487, "reward_std": 0.15082692354917526, "rewards/accuracy_reward": 0.7199303209781647, "rewards/format_reward": 0.9921875, "step": 251 }, { "completion_length": 66.7109375, "epoch": 1.1506849315068493, "grad_norm": 4.159963607788086, "kl": 0.132080078125, "learning_rate": 8.849315068493151e-07, "loss": 0.0053, "reward": 1.4834584593772888, "reward_std": 0.2506244257092476, "rewards/accuracy_reward": 0.4834584891796112, "rewards/format_reward": 1.0, "step": 252 }, { "completion_length": 65.9140625, "epoch": 1.1552511415525113, "grad_norm": 3.3972127437591553, "kl": 0.11669921875, "learning_rate": 8.844748858447488e-07, "loss": 0.0047, "reward": 1.7484084367752075, "reward_std": 0.17951766774058342, "rewards/accuracy_reward": 0.748408317565918, "rewards/format_reward": 1.0, "step": 253 }, { "completion_length": 70.4921875, "epoch": 1.1598173515981736, "grad_norm": 5.158849239349365, "kl": 0.105712890625, "learning_rate": 8.840182648401826e-07, "loss": 0.0042, "reward": 1.6956559419631958, "reward_std": 0.19831054285168648, "rewards/accuracy_reward": 0.7034684121608734, "rewards/format_reward": 0.9921875, "step": 254 }, { "completion_length": 65.328125, "epoch": 1.1643835616438356, "grad_norm": 4.302172660827637, "kl": 0.1591796875, "learning_rate": 8.835616438356164e-07, "loss": 0.0064, "reward": 1.5410139560699463, "reward_std": 0.24922513961791992, "rewards/accuracy_reward": 0.5488264262676239, "rewards/format_reward": 0.9921875, "step": 255 }, { "completion_length": 90.1015625, "epoch": 1.1689497716894977, "grad_norm": 4.157711029052734, "kl": 0.07763671875, "learning_rate": 8.831050228310501e-07, "loss": 0.0031, "reward": 1.8432291746139526, "reward_std": 0.1643235646188259, "rewards/accuracy_reward": 0.8588541150093079, "rewards/format_reward": 0.984375, "step": 256 }, { "completion_length": 76.1171875, "epoch": 1.17351598173516, "grad_norm": 8.10622787475586, "kl": 0.111328125, "learning_rate": 8.826484018264839e-07, "loss": 0.0045, "reward": 1.6758702397346497, "reward_std": 0.2606264054775238, "rewards/accuracy_reward": 0.699307769536972, "rewards/format_reward": 0.9765625, "step": 257 }, { "completion_length": 79.9609375, "epoch": 1.178082191780822, "grad_norm": 9.346076011657715, "kl": 0.105224609375, "learning_rate": 8.821917808219178e-07, "loss": 0.0042, "reward": 1.7317607402801514, "reward_std": 0.1373404860496521, "rewards/accuracy_reward": 0.7395730912685394, "rewards/format_reward": 0.9921875, "step": 258 }, { "completion_length": 52.734375, "epoch": 1.182648401826484, "grad_norm": 2.3975396156311035, "kl": 0.13427734375, "learning_rate": 8.817351598173516e-07, "loss": 0.0054, "reward": 1.545721709728241, "reward_std": 0.26958315074443817, "rewards/accuracy_reward": 0.561346709728241, "rewards/format_reward": 0.984375, "step": 259 }, { "completion_length": 59.125, "epoch": 1.187214611872146, "grad_norm": 4.990502834320068, "kl": 0.130126953125, "learning_rate": 8.812785388127854e-07, "loss": 0.0052, "reward": 1.5396197438240051, "reward_std": 0.338210791349411, "rewards/accuracy_reward": 0.5552447736263275, "rewards/format_reward": 0.984375, "step": 260 }, { "completion_length": 92.8984375, "epoch": 1.191780821917808, "grad_norm": 1.891558051109314, "kl": 0.083251953125, "learning_rate": 8.808219178082191e-07, "loss": 0.0033, "reward": 1.6179687976837158, "reward_std": 0.19345303252339363, "rewards/accuracy_reward": 0.6414062082767487, "rewards/format_reward": 0.9765625, "step": 261 }, { "completion_length": 37.671875, "epoch": 1.1963470319634704, "grad_norm": 3.5058436393737793, "kl": 0.12646484375, "learning_rate": 8.803652968036529e-07, "loss": 0.0051, "reward": 1.3005208373069763, "reward_std": 0.35989323258399963, "rewards/accuracy_reward": 0.3317708224058151, "rewards/format_reward": 0.96875, "step": 262 }, { "completion_length": 67.3203125, "epoch": 1.2009132420091324, "grad_norm": 3.314962387084961, "kl": 0.121826171875, "learning_rate": 8.799086757990867e-07, "loss": 0.0049, "reward": 1.6558881998062134, "reward_std": 0.2189551442861557, "rewards/accuracy_reward": 0.655888170003891, "rewards/format_reward": 1.0, "step": 263 }, { "completion_length": 69.8203125, "epoch": 1.2054794520547945, "grad_norm": 3.078131675720215, "kl": 0.14013671875, "learning_rate": 8.794520547945205e-07, "loss": 0.0056, "reward": 1.6604167222976685, "reward_std": 0.2180890440940857, "rewards/accuracy_reward": 0.6760416328907013, "rewards/format_reward": 0.984375, "step": 264 }, { "completion_length": 79.875, "epoch": 1.2100456621004567, "grad_norm": 2.9452457427978516, "kl": 0.086669921875, "learning_rate": 8.789954337899543e-07, "loss": 0.0035, "reward": 1.616637647151947, "reward_std": 0.1805976927280426, "rewards/accuracy_reward": 0.6244500577449799, "rewards/format_reward": 0.9921875, "step": 265 }, { "completion_length": 68.3984375, "epoch": 1.2146118721461188, "grad_norm": 3.070889711380005, "kl": 0.1455078125, "learning_rate": 8.785388127853881e-07, "loss": 0.0058, "reward": 1.6356534361839294, "reward_std": 0.20898383855819702, "rewards/accuracy_reward": 0.6356533765792847, "rewards/format_reward": 1.0, "step": 266 }, { "completion_length": 84.015625, "epoch": 1.2191780821917808, "grad_norm": 4.916512489318848, "kl": 0.090087890625, "learning_rate": 8.780821917808219e-07, "loss": 0.0036, "reward": 1.6764622926712036, "reward_std": 0.22947455942630768, "rewards/accuracy_reward": 0.6998997330665588, "rewards/format_reward": 0.9765625, "step": 267 }, { "completion_length": 56.9296875, "epoch": 1.2237442922374429, "grad_norm": 3.320021629333496, "kl": 0.1376953125, "learning_rate": 8.776255707762557e-07, "loss": 0.0055, "reward": 1.5441592335700989, "reward_std": 0.2436549812555313, "rewards/accuracy_reward": 0.5519717484712601, "rewards/format_reward": 0.9921875, "step": 268 }, { "completion_length": 55.03125, "epoch": 1.228310502283105, "grad_norm": 2.28918194770813, "kl": 0.099609375, "learning_rate": 8.771689497716894e-07, "loss": 0.004, "reward": 1.567187488079071, "reward_std": 0.24649156630039215, "rewards/accuracy_reward": 0.582812488079071, "rewards/format_reward": 0.984375, "step": 269 }, { "completion_length": 65.3203125, "epoch": 1.2328767123287672, "grad_norm": 4.561470985412598, "kl": 0.09423828125, "learning_rate": 8.767123287671232e-07, "loss": 0.0038, "reward": 1.5898438096046448, "reward_std": 0.22158773988485336, "rewards/accuracy_reward": 0.59765625, "rewards/format_reward": 0.9921875, "step": 270 }, { "completion_length": 71.8984375, "epoch": 1.2374429223744292, "grad_norm": 2.199537992477417, "kl": 0.130126953125, "learning_rate": 8.762557077625571e-07, "loss": 0.0052, "reward": 1.6058040857315063, "reward_std": 0.20108795166015625, "rewards/accuracy_reward": 0.6058041155338287, "rewards/format_reward": 1.0, "step": 271 }, { "completion_length": 76.6015625, "epoch": 1.2420091324200913, "grad_norm": 1.9431648254394531, "kl": 0.116455078125, "learning_rate": 8.757990867579908e-07, "loss": 0.0047, "reward": 1.7972594499588013, "reward_std": 0.12338224425911903, "rewards/accuracy_reward": 0.7972594499588013, "rewards/format_reward": 1.0, "step": 272 }, { "completion_length": 69.6015625, "epoch": 1.2465753424657535, "grad_norm": 1.7630584239959717, "kl": 0.103515625, "learning_rate": 8.753424657534246e-07, "loss": 0.0041, "reward": 1.6484375, "reward_std": 0.1649293452501297, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.9921875, "step": 273 }, { "completion_length": 98.6640625, "epoch": 1.2511415525114156, "grad_norm": 3.8383920192718506, "kl": 0.103271484375, "learning_rate": 8.748858447488584e-07, "loss": 0.0041, "reward": 1.765743374824524, "reward_std": 0.13689537346363068, "rewards/accuracy_reward": 0.7813683450222015, "rewards/format_reward": 0.984375, "step": 274 }, { "completion_length": 69.1328125, "epoch": 1.2557077625570776, "grad_norm": 3.83646297454834, "kl": 0.107666015625, "learning_rate": 8.744292237442922e-07, "loss": 0.0043, "reward": 1.5489718914031982, "reward_std": 0.2690936550498009, "rewards/accuracy_reward": 0.5645968914031982, "rewards/format_reward": 0.984375, "step": 275 }, { "completion_length": 74.171875, "epoch": 1.2602739726027397, "grad_norm": 1.6941070556640625, "kl": 0.103759765625, "learning_rate": 8.73972602739726e-07, "loss": 0.0042, "reward": 1.6818824410438538, "reward_std": 0.10927051305770874, "rewards/accuracy_reward": 0.6818824410438538, "rewards/format_reward": 1.0, "step": 276 }, { "completion_length": 68.9375, "epoch": 1.2648401826484017, "grad_norm": 5.96689510345459, "kl": 0.126220703125, "learning_rate": 8.735159817351597e-07, "loss": 0.0051, "reward": 1.4658617973327637, "reward_std": 0.19346562027931213, "rewards/accuracy_reward": 0.4736742377281189, "rewards/format_reward": 0.9921875, "step": 277 }, { "completion_length": 73.265625, "epoch": 1.269406392694064, "grad_norm": 2.1737794876098633, "kl": 0.124267578125, "learning_rate": 8.730593607305936e-07, "loss": 0.005, "reward": 1.6984375715255737, "reward_std": 0.13310657069087029, "rewards/accuracy_reward": 0.7062499523162842, "rewards/format_reward": 0.9921875, "step": 278 }, { "completion_length": 65.4609375, "epoch": 1.273972602739726, "grad_norm": 2.1923165321350098, "kl": 0.1279296875, "learning_rate": 8.726027397260274e-07, "loss": 0.0051, "reward": 1.7382813096046448, "reward_std": 0.1450316607952118, "rewards/accuracy_reward": 0.7460937201976776, "rewards/format_reward": 0.9921875, "step": 279 }, { "completion_length": 69.6328125, "epoch": 1.278538812785388, "grad_norm": 2.4596610069274902, "kl": 0.11279296875, "learning_rate": 8.721461187214611e-07, "loss": 0.0045, "reward": 1.7476563453674316, "reward_std": 0.15961872786283493, "rewards/accuracy_reward": 0.7476562261581421, "rewards/format_reward": 1.0, "step": 280 }, { "completion_length": 74.1640625, "epoch": 1.2831050228310503, "grad_norm": 1.8160244226455688, "kl": 0.08837890625, "learning_rate": 8.716894977168949e-07, "loss": 0.0035, "reward": 1.625745713710785, "reward_std": 0.17970408126711845, "rewards/accuracy_reward": 0.6257456988096237, "rewards/format_reward": 1.0, "step": 281 }, { "completion_length": 56.1875, "epoch": 1.2876712328767124, "grad_norm": 2.5006446838378906, "kl": 0.14306640625, "learning_rate": 8.712328767123287e-07, "loss": 0.0057, "reward": 1.571769893169403, "reward_std": 0.2276684269309044, "rewards/accuracy_reward": 0.5717698335647583, "rewards/format_reward": 1.0, "step": 282 }, { "completion_length": 55.5234375, "epoch": 1.2922374429223744, "grad_norm": 6.536046504974365, "kl": 0.1826171875, "learning_rate": 8.707762557077625e-07, "loss": 0.0073, "reward": 1.6325520277023315, "reward_std": 0.21870127320289612, "rewards/accuracy_reward": 0.6325520575046539, "rewards/format_reward": 1.0, "step": 283 }, { "completion_length": 93.65625, "epoch": 1.2968036529680365, "grad_norm": 10.913942337036133, "kl": 0.0833740234375, "learning_rate": 8.703196347031964e-07, "loss": 0.0033, "reward": 1.6755207777023315, "reward_std": 0.17495759949088097, "rewards/accuracy_reward": 0.6833332777023315, "rewards/format_reward": 0.9921875, "step": 284 }, { "completion_length": 67.125, "epoch": 1.3013698630136985, "grad_norm": 2.0972490310668945, "kl": 0.146484375, "learning_rate": 8.698630136986301e-07, "loss": 0.0059, "reward": 1.4372395873069763, "reward_std": 0.24129608273506165, "rewards/accuracy_reward": 0.4372395873069763, "rewards/format_reward": 1.0, "step": 285 }, { "completion_length": 83.25, "epoch": 1.3059360730593608, "grad_norm": 2.7569093704223633, "kl": 0.1005859375, "learning_rate": 8.694063926940639e-07, "loss": 0.004, "reward": 1.6421875953674316, "reward_std": 0.2471975013613701, "rewards/accuracy_reward": 0.6734374463558197, "rewards/format_reward": 0.96875, "step": 286 }, { "completion_length": 60.7578125, "epoch": 1.3105022831050228, "grad_norm": 4.288987636566162, "kl": 0.10888671875, "learning_rate": 8.689497716894977e-07, "loss": 0.0044, "reward": 1.562853455543518, "reward_std": 0.25185681879520416, "rewards/accuracy_reward": 0.5706659406423569, "rewards/format_reward": 0.9921875, "step": 287 }, { "completion_length": 72.6484375, "epoch": 1.3150684931506849, "grad_norm": 5.571771621704102, "kl": 0.128662109375, "learning_rate": 8.684931506849314e-07, "loss": 0.0052, "reward": 1.57805997133255, "reward_std": 0.23417949676513672, "rewards/accuracy_reward": 0.5858723521232605, "rewards/format_reward": 0.9921875, "step": 288 }, { "completion_length": 84.0390625, "epoch": 1.3196347031963471, "grad_norm": 1.9545791149139404, "kl": 0.085693359375, "learning_rate": 8.680365296803652e-07, "loss": 0.0034, "reward": 1.768750011920929, "reward_std": 0.1173202283680439, "rewards/accuracy_reward": 0.7765624225139618, "rewards/format_reward": 0.9921875, "step": 289 }, { "completion_length": 79.34375, "epoch": 1.3242009132420092, "grad_norm": 2.6059577465057373, "kl": 0.100830078125, "learning_rate": 8.67579908675799e-07, "loss": 0.004, "reward": 1.5458333492279053, "reward_std": 0.31738781929016113, "rewards/accuracy_reward": 0.5848958194255829, "rewards/format_reward": 0.9609375, "step": 290 }, { "completion_length": 77.8828125, "epoch": 1.3287671232876712, "grad_norm": 2.8494393825531006, "kl": 0.126953125, "learning_rate": 8.671232876712329e-07, "loss": 0.0051, "reward": 1.6368772983551025, "reward_std": 0.22899606823921204, "rewards/accuracy_reward": 0.6603147983551025, "rewards/format_reward": 0.9765625, "step": 291 }, { "completion_length": 91.6484375, "epoch": 1.3333333333333333, "grad_norm": 2.535183906555176, "kl": 0.08740234375, "learning_rate": 8.666666666666667e-07, "loss": 0.0035, "reward": 1.6453125476837158, "reward_std": 0.15308689326047897, "rewards/accuracy_reward": 0.6687499582767487, "rewards/format_reward": 0.9765625, "step": 292 }, { "completion_length": 65.921875, "epoch": 1.3378995433789953, "grad_norm": 3.120173692703247, "kl": 0.135009765625, "learning_rate": 8.662100456621004e-07, "loss": 0.0054, "reward": 1.541332721710205, "reward_std": 0.2652948349714279, "rewards/accuracy_reward": 0.5647702217102051, "rewards/format_reward": 0.9765625, "step": 293 }, { "completion_length": 64.9609375, "epoch": 1.3424657534246576, "grad_norm": 3.71287202835083, "kl": 0.096923828125, "learning_rate": 8.657534246575342e-07, "loss": 0.0039, "reward": 1.375390648841858, "reward_std": 0.3063650578260422, "rewards/accuracy_reward": 0.3988281339406967, "rewards/format_reward": 0.9765625, "step": 294 }, { "completion_length": 86.9296875, "epoch": 1.3470319634703196, "grad_norm": 4.879149913787842, "kl": 0.112548828125, "learning_rate": 8.65296803652968e-07, "loss": 0.0045, "reward": 1.7951704263687134, "reward_std": 0.1273602545261383, "rewards/accuracy_reward": 0.795170396566391, "rewards/format_reward": 1.0, "step": 295 }, { "completion_length": 84.3203125, "epoch": 1.3515981735159817, "grad_norm": 2.7764885425567627, "kl": 0.102783203125, "learning_rate": 8.648401826484017e-07, "loss": 0.0041, "reward": 1.7486329078674316, "reward_std": 0.2014031484723091, "rewards/accuracy_reward": 0.7564452290534973, "rewards/format_reward": 0.9921875, "step": 296 }, { "completion_length": 87.6875, "epoch": 1.356164383561644, "grad_norm": 3.286853551864624, "kl": 0.1103515625, "learning_rate": 8.643835616438355e-07, "loss": 0.0044, "reward": 1.5868489742279053, "reward_std": 0.21391719579696655, "rewards/accuracy_reward": 0.6024739146232605, "rewards/format_reward": 0.984375, "step": 297 }, { "completion_length": 72.0, "epoch": 1.360730593607306, "grad_norm": 3.1161341667175293, "kl": 0.130615234375, "learning_rate": 8.639269406392694e-07, "loss": 0.0052, "reward": 1.551562488079071, "reward_std": 0.2892322689294815, "rewards/accuracy_reward": 0.5750000178813934, "rewards/format_reward": 0.9765625, "step": 298 }, { "completion_length": 95.8828125, "epoch": 1.365296803652968, "grad_norm": 2.0366954803466797, "kl": 0.091796875, "learning_rate": 8.634703196347032e-07, "loss": 0.0037, "reward": 1.8364962935447693, "reward_std": 0.09190401062369347, "rewards/accuracy_reward": 0.8443087041378021, "rewards/format_reward": 0.9921875, "step": 299 }, { "completion_length": 101.921875, "epoch": 1.36986301369863, "grad_norm": 9.95484447479248, "kl": 0.080322265625, "learning_rate": 8.63013698630137e-07, "loss": 0.0032, "reward": 1.629079818725586, "reward_std": 0.13987341336905956, "rewards/accuracy_reward": 0.6290798187255859, "rewards/format_reward": 1.0, "step": 300 }, { "completion_length": 79.34375, "epoch": 1.374429223744292, "grad_norm": 19.206026077270508, "kl": 0.117919921875, "learning_rate": 8.625570776255707e-07, "loss": 0.0047, "reward": 1.6438615918159485, "reward_std": 0.21473538875579834, "rewards/accuracy_reward": 0.6516740918159485, "rewards/format_reward": 0.9921875, "step": 301 }, { "completion_length": 86.4609375, "epoch": 1.3789954337899544, "grad_norm": 9.368078231811523, "kl": 0.09228515625, "learning_rate": 8.621004566210045e-07, "loss": 0.0037, "reward": 1.6070312857627869, "reward_std": 0.1797475889325142, "rewards/accuracy_reward": 0.6226562261581421, "rewards/format_reward": 0.984375, "step": 302 }, { "completion_length": 79.0625, "epoch": 1.3835616438356164, "grad_norm": 2.789032220840454, "kl": 0.101806640625, "learning_rate": 8.616438356164383e-07, "loss": 0.0041, "reward": 1.62343031167984, "reward_std": 0.2234785482287407, "rewards/accuracy_reward": 0.6234302222728729, "rewards/format_reward": 1.0, "step": 303 }, { "completion_length": 68.3984375, "epoch": 1.3881278538812785, "grad_norm": 2.6755340099334717, "kl": 0.10009765625, "learning_rate": 8.611872146118721e-07, "loss": 0.004, "reward": 1.5965625047683716, "reward_std": 0.1608435958623886, "rewards/accuracy_reward": 0.6121874451637268, "rewards/format_reward": 0.984375, "step": 304 }, { "completion_length": 79.3203125, "epoch": 1.3926940639269407, "grad_norm": 3.4425978660583496, "kl": 0.1259765625, "learning_rate": 8.607305936073059e-07, "loss": 0.005, "reward": 1.7191716432571411, "reward_std": 0.11866222321987152, "rewards/accuracy_reward": 0.7269841134548187, "rewards/format_reward": 0.9921875, "step": 305 }, { "completion_length": 105.5546875, "epoch": 1.3972602739726028, "grad_norm": 4.509172439575195, "kl": 0.063720703125, "learning_rate": 8.602739726027397e-07, "loss": 0.0025, "reward": 1.7390625476837158, "reward_std": 0.14966704696416855, "rewards/accuracy_reward": 0.739062488079071, "rewards/format_reward": 1.0, "step": 306 }, { "completion_length": 87.1953125, "epoch": 1.4018264840182648, "grad_norm": 2.9753623008728027, "kl": 0.13232421875, "learning_rate": 8.598173515981735e-07, "loss": 0.0053, "reward": 1.6669872403144836, "reward_std": 0.14199939370155334, "rewards/accuracy_reward": 0.6669871509075165, "rewards/format_reward": 1.0, "step": 307 }, { "completion_length": 68.671875, "epoch": 1.4063926940639269, "grad_norm": 3.5725512504577637, "kl": 0.105712890625, "learning_rate": 8.593607305936073e-07, "loss": 0.0042, "reward": 1.5044271349906921, "reward_std": 0.2842589318752289, "rewards/accuracy_reward": 0.5044270902872086, "rewards/format_reward": 1.0, "step": 308 }, { "completion_length": 84.8671875, "epoch": 1.410958904109589, "grad_norm": 2.2308547496795654, "kl": 0.091796875, "learning_rate": 8.58904109589041e-07, "loss": 0.0037, "reward": 1.5451836585998535, "reward_std": 0.23898552358150482, "rewards/accuracy_reward": 0.5608087480068207, "rewards/format_reward": 0.984375, "step": 309 }, { "completion_length": 74.3984375, "epoch": 1.4155251141552512, "grad_norm": 5.253906726837158, "kl": 0.139892578125, "learning_rate": 8.584474885844748e-07, "loss": 0.0056, "reward": 1.5776489973068237, "reward_std": 0.20516303181648254, "rewards/accuracy_reward": 0.5854615569114685, "rewards/format_reward": 0.9921875, "step": 310 }, { "completion_length": 75.515625, "epoch": 1.4200913242009132, "grad_norm": 3.5134174823760986, "kl": 0.0849609375, "learning_rate": 8.579908675799087e-07, "loss": 0.0034, "reward": 1.4553078413009644, "reward_std": 0.26812630146741867, "rewards/accuracy_reward": 0.47093285620212555, "rewards/format_reward": 0.984375, "step": 311 }, { "completion_length": 100.5, "epoch": 1.4246575342465753, "grad_norm": 2.315570116043091, "kl": 0.090087890625, "learning_rate": 8.575342465753424e-07, "loss": 0.0036, "reward": 1.597842276096344, "reward_std": 0.192316435277462, "rewards/accuracy_reward": 0.5978422611951828, "rewards/format_reward": 1.0, "step": 312 }, { "completion_length": 85.46875, "epoch": 1.4292237442922375, "grad_norm": 4.950066566467285, "kl": 0.09375, "learning_rate": 8.570776255707762e-07, "loss": 0.0037, "reward": 1.6005207896232605, "reward_std": 0.23914001137018204, "rewards/accuracy_reward": 0.6083333194255829, "rewards/format_reward": 0.9921875, "step": 313 }, { "completion_length": 73.1328125, "epoch": 1.4337899543378996, "grad_norm": 3.737804651260376, "kl": 0.13037109375, "learning_rate": 8.5662100456621e-07, "loss": 0.0052, "reward": 1.542373538017273, "reward_std": 0.24436407536268234, "rewards/accuracy_reward": 0.542373538017273, "rewards/format_reward": 1.0, "step": 314 }, { "completion_length": 81.2265625, "epoch": 1.4383561643835616, "grad_norm": 2.7811484336853027, "kl": 0.10693359375, "learning_rate": 8.561643835616438e-07, "loss": 0.0043, "reward": 1.7172211408615112, "reward_std": 0.15319041907787323, "rewards/accuracy_reward": 0.7172211408615112, "rewards/format_reward": 1.0, "step": 315 }, { "completion_length": 82.328125, "epoch": 1.4429223744292237, "grad_norm": 5.399558067321777, "kl": 0.10986328125, "learning_rate": 8.557077625570776e-07, "loss": 0.0044, "reward": 1.5121857523918152, "reward_std": 0.28201115131378174, "rewards/accuracy_reward": 0.5199982225894928, "rewards/format_reward": 0.9921875, "step": 316 }, { "completion_length": 103.671875, "epoch": 1.4474885844748857, "grad_norm": 1.9453126192092896, "kl": 0.070068359375, "learning_rate": 8.552511415525113e-07, "loss": 0.0028, "reward": 1.7234273552894592, "reward_std": 0.12736555561423302, "rewards/accuracy_reward": 0.7234272956848145, "rewards/format_reward": 1.0, "step": 317 }, { "completion_length": 91.109375, "epoch": 1.452054794520548, "grad_norm": 4.417510032653809, "kl": 0.09716796875, "learning_rate": 8.547945205479452e-07, "loss": 0.0039, "reward": 1.6735481023788452, "reward_std": 0.17602262273430824, "rewards/accuracy_reward": 0.68136066198349, "rewards/format_reward": 0.9921875, "step": 318 }, { "completion_length": 86.109375, "epoch": 1.45662100456621, "grad_norm": 3.274066925048828, "kl": 0.105712890625, "learning_rate": 8.54337899543379e-07, "loss": 0.0042, "reward": 1.6061198115348816, "reward_std": 0.2346249595284462, "rewards/accuracy_reward": 0.6061197817325592, "rewards/format_reward": 1.0, "step": 319 }, { "completion_length": 103.8828125, "epoch": 1.461187214611872, "grad_norm": 3.0639874935150146, "kl": 0.09375, "learning_rate": 8.538812785388127e-07, "loss": 0.0037, "reward": 1.5671589970588684, "reward_std": 0.2653961777687073, "rewards/accuracy_reward": 0.5984089076519012, "rewards/format_reward": 0.96875, "step": 320 }, { "completion_length": 91.578125, "epoch": 1.4657534246575343, "grad_norm": 3.1881332397460938, "kl": 0.11669921875, "learning_rate": 8.534246575342465e-07, "loss": 0.0047, "reward": 1.7369791269302368, "reward_std": 0.24841733276844025, "rewards/accuracy_reward": 0.7369791567325592, "rewards/format_reward": 1.0, "step": 321 }, { "completion_length": 90.1328125, "epoch": 1.4703196347031964, "grad_norm": 7.73578405380249, "kl": 0.12841796875, "learning_rate": 8.529680365296803e-07, "loss": 0.0051, "reward": 1.5886787176132202, "reward_std": 0.27549922466278076, "rewards/accuracy_reward": 0.6043036431074142, "rewards/format_reward": 0.984375, "step": 322 }, { "completion_length": 105.2578125, "epoch": 1.4748858447488584, "grad_norm": 2.833080768585205, "kl": 0.063720703125, "learning_rate": 8.52511415525114e-07, "loss": 0.0026, "reward": 1.6982238292694092, "reward_std": 0.09853163920342922, "rewards/accuracy_reward": 0.7060362696647644, "rewards/format_reward": 0.9921875, "step": 323 }, { "completion_length": 101.9375, "epoch": 1.4794520547945205, "grad_norm": 1.9237995147705078, "kl": 0.0869140625, "learning_rate": 8.52054794520548e-07, "loss": 0.0035, "reward": 1.6929687857627869, "reward_std": 0.14721976220607758, "rewards/accuracy_reward": 0.7085936963558197, "rewards/format_reward": 0.984375, "step": 324 }, { "completion_length": 93.171875, "epoch": 1.4840182648401825, "grad_norm": 21.809825897216797, "kl": 0.100830078125, "learning_rate": 8.515981735159817e-07, "loss": 0.004, "reward": 1.464756965637207, "reward_std": 0.26955385506153107, "rewards/accuracy_reward": 0.503819465637207, "rewards/format_reward": 0.9609375, "step": 325 }, { "completion_length": 102.03125, "epoch": 1.4885844748858448, "grad_norm": 2.1946544647216797, "kl": 0.075439453125, "learning_rate": 8.511415525114155e-07, "loss": 0.003, "reward": 1.7023438215255737, "reward_std": 0.15592241287231445, "rewards/accuracy_reward": 0.7179687023162842, "rewards/format_reward": 0.984375, "step": 326 }, { "completion_length": 78.6875, "epoch": 1.4931506849315068, "grad_norm": 5.698726654052734, "kl": 0.11376953125, "learning_rate": 8.506849315068493e-07, "loss": 0.0046, "reward": 1.5953141450881958, "reward_std": 0.2819615304470062, "rewards/accuracy_reward": 0.618751734495163, "rewards/format_reward": 0.9765625, "step": 327 }, { "completion_length": 89.71875, "epoch": 1.4977168949771689, "grad_norm": 4.1140007972717285, "kl": 0.080078125, "learning_rate": 8.50228310502283e-07, "loss": 0.0032, "reward": 1.6470133662223816, "reward_std": 0.18788425624370575, "rewards/accuracy_reward": 0.6626383662223816, "rewards/format_reward": 0.984375, "step": 328 }, { "completion_length": 116.9375, "epoch": 1.5022831050228311, "grad_norm": 2.189347743988037, "kl": 0.0545654296875, "learning_rate": 8.497716894977168e-07, "loss": 0.0022, "reward": 1.71875, "reward_std": 0.12179600074887276, "rewards/accuracy_reward": 0.7343749701976776, "rewards/format_reward": 0.984375, "step": 329 }, { "completion_length": 88.703125, "epoch": 1.5068493150684932, "grad_norm": 2.298283815383911, "kl": 0.105712890625, "learning_rate": 8.493150684931506e-07, "loss": 0.0042, "reward": 1.686813473701477, "reward_std": 0.1672440692782402, "rewards/accuracy_reward": 0.6868133842945099, "rewards/format_reward": 1.0, "step": 330 }, { "completion_length": 69.828125, "epoch": 1.5114155251141552, "grad_norm": 6.333926200866699, "kl": 0.13134765625, "learning_rate": 8.488584474885845e-07, "loss": 0.0053, "reward": 1.7103299498558044, "reward_std": 0.1922176629304886, "rewards/accuracy_reward": 0.7103298306465149, "rewards/format_reward": 1.0, "step": 331 }, { "completion_length": 95.6484375, "epoch": 1.5159817351598175, "grad_norm": 1.9884992837905884, "kl": 0.109375, "learning_rate": 8.484018264840183e-07, "loss": 0.0044, "reward": 1.6320313215255737, "reward_std": 0.2171119600534439, "rewards/accuracy_reward": 0.6398437321186066, "rewards/format_reward": 0.9921875, "step": 332 }, { "completion_length": 80.28125, "epoch": 1.5205479452054793, "grad_norm": 3.4122371673583984, "kl": 0.15234375, "learning_rate": 8.47945205479452e-07, "loss": 0.0061, "reward": 1.6989583373069763, "reward_std": 0.22790630161762238, "rewards/accuracy_reward": 0.7067708373069763, "rewards/format_reward": 0.9921875, "step": 333 }, { "completion_length": 82.734375, "epoch": 1.5251141552511416, "grad_norm": 16.75699234008789, "kl": 0.084228515625, "learning_rate": 8.474885844748858e-07, "loss": 0.0034, "reward": 1.6330461502075195, "reward_std": 0.1892491653561592, "rewards/accuracy_reward": 0.6408586502075195, "rewards/format_reward": 0.9921875, "step": 334 }, { "completion_length": 81.421875, "epoch": 1.5296803652968036, "grad_norm": 2.46637225151062, "kl": 0.095703125, "learning_rate": 8.470319634703196e-07, "loss": 0.0038, "reward": 1.6607667207717896, "reward_std": 0.23428593575954437, "rewards/accuracy_reward": 0.6763917207717896, "rewards/format_reward": 0.984375, "step": 335 }, { "completion_length": 87.4375, "epoch": 1.5342465753424657, "grad_norm": 4.196310043334961, "kl": 0.09423828125, "learning_rate": 8.465753424657533e-07, "loss": 0.0038, "reward": 1.5312500596046448, "reward_std": 0.2585080787539482, "rewards/accuracy_reward": 0.5390624850988388, "rewards/format_reward": 0.9921875, "step": 336 }, { "completion_length": 73.046875, "epoch": 1.538812785388128, "grad_norm": 2.5180585384368896, "kl": 0.13671875, "learning_rate": 8.461187214611871e-07, "loss": 0.0055, "reward": 1.630428671836853, "reward_std": 0.20693185180425644, "rewards/accuracy_reward": 0.6538661420345306, "rewards/format_reward": 0.9765625, "step": 337 }, { "completion_length": 83.09375, "epoch": 1.54337899543379, "grad_norm": 4.625184059143066, "kl": 0.107666015625, "learning_rate": 8.45662100456621e-07, "loss": 0.0043, "reward": 1.7304688096046448, "reward_std": 0.16690129786729813, "rewards/accuracy_reward": 0.7304687201976776, "rewards/format_reward": 1.0, "step": 338 }, { "completion_length": 79.3515625, "epoch": 1.547945205479452, "grad_norm": 7.074364185333252, "kl": 0.2109375, "learning_rate": 8.452054794520548e-07, "loss": 0.0084, "reward": 1.6570312976837158, "reward_std": 0.24604861438274384, "rewards/accuracy_reward": 0.680468738079071, "rewards/format_reward": 0.9765625, "step": 339 }, { "completion_length": 79.8125, "epoch": 1.5525114155251143, "grad_norm": 3.3716442584991455, "kl": 0.086669921875, "learning_rate": 8.447488584474886e-07, "loss": 0.0035, "reward": 1.5007859468460083, "reward_std": 0.2506624162197113, "rewards/accuracy_reward": 0.5242233872413635, "rewards/format_reward": 0.9765625, "step": 340 }, { "completion_length": 75.390625, "epoch": 1.5570776255707761, "grad_norm": 3.0893497467041016, "kl": 0.099853515625, "learning_rate": 8.442922374429223e-07, "loss": 0.004, "reward": 1.546875, "reward_std": 0.2782461494207382, "rewards/accuracy_reward": 0.578125, "rewards/format_reward": 0.96875, "step": 341 }, { "completion_length": 80.75, "epoch": 1.5616438356164384, "grad_norm": 1.9481297731399536, "kl": 0.0885009765625, "learning_rate": 8.438356164383561e-07, "loss": 0.0035, "reward": 1.5662733316421509, "reward_std": 0.20162209123373032, "rewards/accuracy_reward": 0.5897107124328613, "rewards/format_reward": 0.9765625, "step": 342 }, { "completion_length": 72.890625, "epoch": 1.5662100456621004, "grad_norm": 2.927980899810791, "kl": 0.112548828125, "learning_rate": 8.433789954337899e-07, "loss": 0.0045, "reward": 1.4613777995109558, "reward_std": 0.34207557141780853, "rewards/accuracy_reward": 0.5473152101039886, "rewards/format_reward": 0.9140625, "step": 343 }, { "completion_length": 56.953125, "epoch": 1.5707762557077625, "grad_norm": 3.1426687240600586, "kl": 0.155517578125, "learning_rate": 8.429223744292237e-07, "loss": 0.0062, "reward": 1.3815755248069763, "reward_std": 0.3664630800485611, "rewards/accuracy_reward": 0.44407549500465393, "rewards/format_reward": 0.9375, "step": 344 }, { "completion_length": 82.2265625, "epoch": 1.5753424657534247, "grad_norm": 2.98221492767334, "kl": 0.126220703125, "learning_rate": 8.424657534246576e-07, "loss": 0.005, "reward": 1.6400888562202454, "reward_std": 0.24476776085793972, "rewards/accuracy_reward": 0.6947763860225677, "rewards/format_reward": 0.9453125, "step": 345 }, { "completion_length": 67.859375, "epoch": 1.5799086757990868, "grad_norm": 2.4362642765045166, "kl": 0.122314453125, "learning_rate": 8.420091324200913e-07, "loss": 0.0049, "reward": 1.4768601059913635, "reward_std": 0.258517824113369, "rewards/accuracy_reward": 0.5159225761890411, "rewards/format_reward": 0.9609375, "step": 346 }, { "completion_length": 70.0859375, "epoch": 1.5844748858447488, "grad_norm": 3.0861399173736572, "kl": 0.166015625, "learning_rate": 8.415525114155251e-07, "loss": 0.0066, "reward": 1.6798083782196045, "reward_std": 0.2793383300304413, "rewards/accuracy_reward": 0.7032458782196045, "rewards/format_reward": 0.9765625, "step": 347 }, { "completion_length": 50.015625, "epoch": 1.589041095890411, "grad_norm": 3.495870351791382, "kl": 0.162109375, "learning_rate": 8.410958904109589e-07, "loss": 0.0065, "reward": 1.582118034362793, "reward_std": 0.21435417234897614, "rewards/accuracy_reward": 0.5821180641651154, "rewards/format_reward": 1.0, "step": 348 }, { "completion_length": 50.2890625, "epoch": 1.593607305936073, "grad_norm": 4.53682804107666, "kl": 0.146240234375, "learning_rate": 8.406392694063926e-07, "loss": 0.0059, "reward": 1.6024181842803955, "reward_std": 0.2329491451382637, "rewards/accuracy_reward": 0.6258556544780731, "rewards/format_reward": 0.9765625, "step": 349 }, { "completion_length": 54.34375, "epoch": 1.5981735159817352, "grad_norm": 2.6551053524017334, "kl": 0.127685546875, "learning_rate": 8.401826484018264e-07, "loss": 0.0051, "reward": 1.5606771111488342, "reward_std": 0.1882794126868248, "rewards/accuracy_reward": 0.568489596247673, "rewards/format_reward": 0.9921875, "step": 350 }, { "completion_length": 66.4140625, "epoch": 1.6027397260273972, "grad_norm": 6.037808418273926, "kl": 0.146484375, "learning_rate": 8.397260273972603e-07, "loss": 0.0059, "reward": 1.6453644037246704, "reward_std": 0.18358591571450233, "rewards/accuracy_reward": 0.660989373922348, "rewards/format_reward": 0.984375, "step": 351 }, { "completion_length": 49.375, "epoch": 1.6073059360730593, "grad_norm": 2.8399972915649414, "kl": 0.1689453125, "learning_rate": 8.39269406392694e-07, "loss": 0.0068, "reward": 1.6010416746139526, "reward_std": 0.2630682438611984, "rewards/accuracy_reward": 0.6088541746139526, "rewards/format_reward": 0.9921875, "step": 352 }, { "completion_length": 58.4921875, "epoch": 1.6118721461187215, "grad_norm": 1.744779109954834, "kl": 0.1396484375, "learning_rate": 8.388127853881279e-07, "loss": 0.0056, "reward": 1.675000011920929, "reward_std": 0.16849348694086075, "rewards/accuracy_reward": 0.675000011920929, "rewards/format_reward": 1.0, "step": 353 }, { "completion_length": 63.8046875, "epoch": 1.6164383561643836, "grad_norm": 4.681619167327881, "kl": 0.14404296875, "learning_rate": 8.383561643835616e-07, "loss": 0.0058, "reward": 1.7920387387275696, "reward_std": 0.14393481612205505, "rewards/accuracy_reward": 0.7920385599136353, "rewards/format_reward": 1.0, "step": 354 }, { "completion_length": 67.0234375, "epoch": 1.6210045662100456, "grad_norm": 3.167783498764038, "kl": 0.1318359375, "learning_rate": 8.378995433789954e-07, "loss": 0.0053, "reward": 1.728416085243225, "reward_std": 0.18405602872371674, "rewards/accuracy_reward": 0.7518534958362579, "rewards/format_reward": 0.9765625, "step": 355 }, { "completion_length": 63.390625, "epoch": 1.625570776255708, "grad_norm": 3.923906087875366, "kl": 0.13330078125, "learning_rate": 8.374429223744292e-07, "loss": 0.0053, "reward": 1.6279487609863281, "reward_std": 0.12273544818162918, "rewards/accuracy_reward": 0.6279487460851669, "rewards/format_reward": 1.0, "step": 356 }, { "completion_length": 61.1796875, "epoch": 1.6301369863013697, "grad_norm": 3.410731077194214, "kl": 0.13427734375, "learning_rate": 8.369863013698629e-07, "loss": 0.0054, "reward": 1.6447545289993286, "reward_std": 0.22194860875606537, "rewards/accuracy_reward": 0.6603794991970062, "rewards/format_reward": 0.984375, "step": 357 }, { "completion_length": 68.53125, "epoch": 1.634703196347032, "grad_norm": 4.652464389801025, "kl": 0.1181640625, "learning_rate": 8.365296803652968e-07, "loss": 0.0047, "reward": 1.6500000953674316, "reward_std": 0.17417245358228683, "rewards/accuracy_reward": 0.6499999761581421, "rewards/format_reward": 1.0, "step": 358 }, { "completion_length": 55.984375, "epoch": 1.639269406392694, "grad_norm": 1.8993587493896484, "kl": 0.1298828125, "learning_rate": 8.360730593607306e-07, "loss": 0.0052, "reward": 1.7149627804756165, "reward_std": 0.11658288538455963, "rewards/accuracy_reward": 0.7149626910686493, "rewards/format_reward": 1.0, "step": 359 }, { "completion_length": 81.7578125, "epoch": 1.643835616438356, "grad_norm": 1.6242355108261108, "kl": 0.145751953125, "learning_rate": 8.356164383561643e-07, "loss": 0.0058, "reward": 1.7650251388549805, "reward_std": 0.07628657668828964, "rewards/accuracy_reward": 0.7650250494480133, "rewards/format_reward": 1.0, "step": 360 }, { "completion_length": 92.7265625, "epoch": 1.6484018264840183, "grad_norm": 1.6242085695266724, "kl": 0.06298828125, "learning_rate": 8.351598173515981e-07, "loss": 0.0025, "reward": 1.75, "reward_std": 0.1409430019557476, "rewards/accuracy_reward": 0.7656249701976776, "rewards/format_reward": 0.984375, "step": 361 }, { "completion_length": 71.1328125, "epoch": 1.6529680365296804, "grad_norm": 3.2496068477630615, "kl": 0.109130859375, "learning_rate": 8.347031963470319e-07, "loss": 0.0044, "reward": 1.614074468612671, "reward_std": 0.19926752150058746, "rewards/accuracy_reward": 0.6296994388103485, "rewards/format_reward": 0.984375, "step": 362 }, { "completion_length": 68.6953125, "epoch": 1.6575342465753424, "grad_norm": 1.9562387466430664, "kl": 0.100830078125, "learning_rate": 8.342465753424657e-07, "loss": 0.004, "reward": 1.7125000953674316, "reward_std": 0.13402405753731728, "rewards/accuracy_reward": 0.7124999761581421, "rewards/format_reward": 1.0, "step": 363 }, { "completion_length": 65.21875, "epoch": 1.6621004566210047, "grad_norm": 2.6374332904815674, "kl": 0.15673828125, "learning_rate": 8.337899543378996e-07, "loss": 0.0063, "reward": 1.653124988079071, "reward_std": 0.1894800141453743, "rewards/accuracy_reward": 0.668749988079071, "rewards/format_reward": 0.984375, "step": 364 }, { "completion_length": 50.03125, "epoch": 1.6666666666666665, "grad_norm": 3.6687989234924316, "kl": 0.1171875, "learning_rate": 8.333333333333333e-07, "loss": 0.0047, "reward": 1.5525545477867126, "reward_std": 0.18541519343852997, "rewards/accuracy_reward": 0.5525545328855515, "rewards/format_reward": 1.0, "step": 365 }, { "completion_length": 50.265625, "epoch": 1.6712328767123288, "grad_norm": 27.93817710876465, "kl": 0.8662109375, "learning_rate": 8.328767123287671e-07, "loss": 0.0347, "reward": 1.6250372529029846, "reward_std": 0.26529867947101593, "rewards/accuracy_reward": 0.6250371932983398, "rewards/format_reward": 1.0, "step": 366 }, { "completion_length": 55.046875, "epoch": 1.6757990867579908, "grad_norm": 1.959330439567566, "kl": 0.128662109375, "learning_rate": 8.324200913242009e-07, "loss": 0.0051, "reward": 1.611718773841858, "reward_std": 0.20705362409353256, "rewards/accuracy_reward": 0.6117187142372131, "rewards/format_reward": 1.0, "step": 367 }, { "completion_length": 73.1796875, "epoch": 1.6803652968036529, "grad_norm": 4.830105781555176, "kl": 0.11083984375, "learning_rate": 8.319634703196346e-07, "loss": 0.0044, "reward": 1.7028512358665466, "reward_std": 0.15199671685695648, "rewards/accuracy_reward": 0.7106637060642242, "rewards/format_reward": 0.9921875, "step": 368 }, { "completion_length": 44.71875, "epoch": 1.6849315068493151, "grad_norm": 2.5528032779693604, "kl": 0.1474609375, "learning_rate": 8.315068493150684e-07, "loss": 0.0059, "reward": 1.5036830306053162, "reward_std": 0.2237851321697235, "rewards/accuracy_reward": 0.5036830604076385, "rewards/format_reward": 1.0, "step": 369 }, { "completion_length": 64.7578125, "epoch": 1.6894977168949772, "grad_norm": 3.1371445655822754, "kl": 0.10693359375, "learning_rate": 8.310502283105022e-07, "loss": 0.0043, "reward": 1.7922433018684387, "reward_std": 0.1861533522605896, "rewards/accuracy_reward": 0.8000558018684387, "rewards/format_reward": 0.9921875, "step": 370 }, { "completion_length": 81.265625, "epoch": 1.6940639269406392, "grad_norm": 3.3165316581726074, "kl": 0.094482421875, "learning_rate": 8.305936073059361e-07, "loss": 0.0038, "reward": 1.5851722359657288, "reward_std": 0.17648599669337273, "rewards/accuracy_reward": 0.5851722061634064, "rewards/format_reward": 1.0, "step": 371 }, { "completion_length": 84.890625, "epoch": 1.6986301369863015, "grad_norm": 3.0761749744415283, "kl": 0.10693359375, "learning_rate": 8.301369863013699e-07, "loss": 0.0043, "reward": 1.7469556331634521, "reward_std": 0.1787155643105507, "rewards/accuracy_reward": 0.770393043756485, "rewards/format_reward": 0.9765625, "step": 372 }, { "completion_length": 50.25, "epoch": 1.7031963470319633, "grad_norm": 3.6592459678649902, "kl": 0.15380859375, "learning_rate": 8.296803652968036e-07, "loss": 0.0062, "reward": 1.6217397451400757, "reward_std": 0.2100653052330017, "rewards/accuracy_reward": 0.6217397451400757, "rewards/format_reward": 1.0, "step": 373 }, { "completion_length": 69.7734375, "epoch": 1.7077625570776256, "grad_norm": 4.063467502593994, "kl": 0.1123046875, "learning_rate": 8.292237442922374e-07, "loss": 0.0045, "reward": 1.6185640096664429, "reward_std": 0.17512290179729462, "rewards/accuracy_reward": 0.6185639351606369, "rewards/format_reward": 1.0, "step": 374 }, { "completion_length": 44.3125, "epoch": 1.7123287671232876, "grad_norm": 2.8335328102111816, "kl": 0.1572265625, "learning_rate": 8.287671232876712e-07, "loss": 0.0063, "reward": 1.528906226158142, "reward_std": 0.238134503364563, "rewards/accuracy_reward": 0.5289062559604645, "rewards/format_reward": 1.0, "step": 375 }, { "completion_length": 47.7578125, "epoch": 1.7168949771689497, "grad_norm": 2.0584352016448975, "kl": 0.14306640625, "learning_rate": 8.283105022831049e-07, "loss": 0.0057, "reward": 1.5130573511123657, "reward_std": 0.23582037538290024, "rewards/accuracy_reward": 0.5130573809146881, "rewards/format_reward": 1.0, "step": 376 }, { "completion_length": 81.203125, "epoch": 1.721461187214612, "grad_norm": 2.3844358921051025, "kl": 0.11669921875, "learning_rate": 8.278538812785387e-07, "loss": 0.0047, "reward": 1.616406261920929, "reward_std": 0.22177018970251083, "rewards/accuracy_reward": 0.6320312321186066, "rewards/format_reward": 0.984375, "step": 377 }, { "completion_length": 70.109375, "epoch": 1.726027397260274, "grad_norm": 4.480605602264404, "kl": 0.093994140625, "learning_rate": 8.273972602739726e-07, "loss": 0.0038, "reward": 1.7281250357627869, "reward_std": 0.13258251547813416, "rewards/accuracy_reward": 0.7359374761581421, "rewards/format_reward": 0.9921875, "step": 378 }, { "completion_length": 57.6015625, "epoch": 1.730593607305936, "grad_norm": 3.5465571880340576, "kl": 0.12109375, "learning_rate": 8.269406392694064e-07, "loss": 0.0048, "reward": 1.6325520873069763, "reward_std": 0.20021257549524307, "rewards/accuracy_reward": 0.6325520575046539, "rewards/format_reward": 1.0, "step": 379 }, { "completion_length": 85.328125, "epoch": 1.7351598173515983, "grad_norm": 2.483445882797241, "kl": 0.0963134765625, "learning_rate": 8.264840182648402e-07, "loss": 0.0038, "reward": 1.5804979801177979, "reward_std": 0.16095227003097534, "rewards/accuracy_reward": 0.5804979428648949, "rewards/format_reward": 1.0, "step": 380 }, { "completion_length": 64.453125, "epoch": 1.7397260273972601, "grad_norm": 2.9765961170196533, "kl": 0.16650390625, "learning_rate": 8.260273972602739e-07, "loss": 0.0067, "reward": 1.6214489340782166, "reward_std": 0.20274285972118378, "rewards/accuracy_reward": 0.6214488744735718, "rewards/format_reward": 1.0, "step": 381 }, { "completion_length": 76.828125, "epoch": 1.7442922374429224, "grad_norm": 3.132500410079956, "kl": 0.1435546875, "learning_rate": 8.255707762557077e-07, "loss": 0.0058, "reward": 1.5574839115142822, "reward_std": 0.22679631412029266, "rewards/accuracy_reward": 0.5652963519096375, "rewards/format_reward": 0.9921875, "step": 382 }, { "completion_length": 70.46875, "epoch": 1.7488584474885844, "grad_norm": 2.3076906204223633, "kl": 0.114501953125, "learning_rate": 8.251141552511415e-07, "loss": 0.0046, "reward": 1.688330888748169, "reward_std": 0.13296211138367653, "rewards/accuracy_reward": 0.6883308291435242, "rewards/format_reward": 1.0, "step": 383 }, { "completion_length": 73.0078125, "epoch": 1.7534246575342465, "grad_norm": 4.062695026397705, "kl": 0.114501953125, "learning_rate": 8.246575342465753e-07, "loss": 0.0046, "reward": 1.5453124642372131, "reward_std": 0.2351284772157669, "rewards/accuracy_reward": 0.5609375238418579, "rewards/format_reward": 0.984375, "step": 384 }, { "completion_length": 48.984375, "epoch": 1.7579908675799087, "grad_norm": 6.5755510330200195, "kl": 0.20068359375, "learning_rate": 8.242009132420092e-07, "loss": 0.008, "reward": 1.5984994769096375, "reward_std": 0.3222559839487076, "rewards/accuracy_reward": 0.5984995067119598, "rewards/format_reward": 1.0, "step": 385 }, { "completion_length": 71.703125, "epoch": 1.7625570776255708, "grad_norm": 13.117231369018555, "kl": 0.083251953125, "learning_rate": 8.237442922374429e-07, "loss": 0.0033, "reward": 1.5032986402511597, "reward_std": 0.21686269342899323, "rewards/accuracy_reward": 0.5032986104488373, "rewards/format_reward": 1.0, "step": 386 }, { "completion_length": 87.375, "epoch": 1.7671232876712328, "grad_norm": 2.233595371246338, "kl": 0.107177734375, "learning_rate": 8.232876712328767e-07, "loss": 0.0043, "reward": 1.701716125011444, "reward_std": 0.11170049756765366, "rewards/accuracy_reward": 0.7017160058021545, "rewards/format_reward": 1.0, "step": 387 }, { "completion_length": 77.515625, "epoch": 1.771689497716895, "grad_norm": 4.436239719390869, "kl": 0.1083984375, "learning_rate": 8.228310502283105e-07, "loss": 0.0043, "reward": 1.6824799180030823, "reward_std": 0.18607579916715622, "rewards/accuracy_reward": 0.6902924478054047, "rewards/format_reward": 0.9921875, "step": 388 }, { "completion_length": 60.2265625, "epoch": 1.776255707762557, "grad_norm": 2.3695435523986816, "kl": 0.1494140625, "learning_rate": 8.223744292237442e-07, "loss": 0.006, "reward": 1.7026600241661072, "reward_std": 0.13800616562366486, "rewards/accuracy_reward": 0.7104724645614624, "rewards/format_reward": 0.9921875, "step": 389 }, { "completion_length": 71.140625, "epoch": 1.7808219178082192, "grad_norm": 4.455196380615234, "kl": 0.15966796875, "learning_rate": 8.21917808219178e-07, "loss": 0.0064, "reward": 1.6663504838943481, "reward_std": 0.16872704774141312, "rewards/accuracy_reward": 0.6663504242897034, "rewards/format_reward": 1.0, "step": 390 }, { "completion_length": 65.8359375, "epoch": 1.7853881278538812, "grad_norm": 15.527393341064453, "kl": 0.148681640625, "learning_rate": 8.214611872146119e-07, "loss": 0.0059, "reward": 1.553125023841858, "reward_std": 0.21595830470323563, "rewards/accuracy_reward": 0.5531250089406967, "rewards/format_reward": 1.0, "step": 391 }, { "completion_length": 73.0625, "epoch": 1.7899543378995433, "grad_norm": 4.076765537261963, "kl": 0.101806640625, "learning_rate": 8.210045662100456e-07, "loss": 0.0041, "reward": 1.6578125357627869, "reward_std": 0.17282497137784958, "rewards/accuracy_reward": 0.6578125059604645, "rewards/format_reward": 1.0, "step": 392 }, { "completion_length": 64.3515625, "epoch": 1.7945205479452055, "grad_norm": 4.910162448883057, "kl": 0.24609375, "learning_rate": 8.205479452054795e-07, "loss": 0.0099, "reward": 1.5686192512512207, "reward_std": 0.20994295924901962, "rewards/accuracy_reward": 0.5764318108558655, "rewards/format_reward": 0.9921875, "step": 393 }, { "completion_length": 71.3359375, "epoch": 1.7990867579908676, "grad_norm": 3.380282402038574, "kl": 0.098876953125, "learning_rate": 8.200913242009132e-07, "loss": 0.004, "reward": 1.6727213859558105, "reward_std": 0.09240220487117767, "rewards/accuracy_reward": 0.672721341252327, "rewards/format_reward": 1.0, "step": 394 }, { "completion_length": 70.625, "epoch": 1.8036529680365296, "grad_norm": 15.939676284790039, "kl": 0.1748046875, "learning_rate": 8.19634703196347e-07, "loss": 0.007, "reward": 1.4804688096046448, "reward_std": 0.22750268876552582, "rewards/accuracy_reward": 0.4882812350988388, "rewards/format_reward": 0.9921875, "step": 395 }, { "completion_length": 76.9453125, "epoch": 1.808219178082192, "grad_norm": 8.642084121704102, "kl": 0.10009765625, "learning_rate": 8.191780821917808e-07, "loss": 0.004, "reward": 1.6074219346046448, "reward_std": 0.21606218069791794, "rewards/accuracy_reward": 0.615234375, "rewards/format_reward": 0.9921875, "step": 396 }, { "completion_length": 71.0546875, "epoch": 1.8127853881278537, "grad_norm": 3.1132235527038574, "kl": 0.1259765625, "learning_rate": 8.187214611872145e-07, "loss": 0.005, "reward": 1.6144831776618958, "reward_std": 0.22148973494768143, "rewards/accuracy_reward": 0.6144831776618958, "rewards/format_reward": 1.0, "step": 397 }, { "completion_length": 64.6484375, "epoch": 1.817351598173516, "grad_norm": 5.543883800506592, "kl": 0.135498046875, "learning_rate": 8.182648401826484e-07, "loss": 0.0054, "reward": 1.5686756372451782, "reward_std": 0.1985800489783287, "rewards/accuracy_reward": 0.5764880925416946, "rewards/format_reward": 0.9921875, "step": 398 }, { "completion_length": 59.421875, "epoch": 1.821917808219178, "grad_norm": 6.449603080749512, "kl": 0.14892578125, "learning_rate": 8.178082191780822e-07, "loss": 0.006, "reward": 1.6453125476837158, "reward_std": 0.1793966293334961, "rewards/accuracy_reward": 0.6531250476837158, "rewards/format_reward": 0.9921875, "step": 399 }, { "completion_length": 65.0859375, "epoch": 1.82648401826484, "grad_norm": 3.558039665222168, "kl": 0.121826171875, "learning_rate": 8.173515981735159e-07, "loss": 0.0049, "reward": 1.5231584310531616, "reward_std": 0.27039487659931183, "rewards/accuracy_reward": 0.5309710204601288, "rewards/format_reward": 0.9921875, "step": 400 }, { "completion_length": 74.875, "epoch": 1.8310502283105023, "grad_norm": 2.6240084171295166, "kl": 0.138427734375, "learning_rate": 8.168949771689498e-07, "loss": 0.0055, "reward": 1.5302269458770752, "reward_std": 0.19186384975910187, "rewards/accuracy_reward": 0.5302269384264946, "rewards/format_reward": 1.0, "step": 401 }, { "completion_length": 72.109375, "epoch": 1.8356164383561644, "grad_norm": 2.054145574569702, "kl": 0.11572265625, "learning_rate": 8.164383561643835e-07, "loss": 0.0046, "reward": 1.7239583730697632, "reward_std": 0.13660814613103867, "rewards/accuracy_reward": 0.723958283662796, "rewards/format_reward": 1.0, "step": 402 }, { "completion_length": 93.1484375, "epoch": 1.8401826484018264, "grad_norm": 2.950429916381836, "kl": 0.108642578125, "learning_rate": 8.159817351598172e-07, "loss": 0.0043, "reward": 1.734375, "reward_std": 0.11295716743916273, "rewards/accuracy_reward": 0.7343749403953552, "rewards/format_reward": 1.0, "step": 403 }, { "completion_length": 68.5546875, "epoch": 1.8447488584474887, "grad_norm": 2.904849052429199, "kl": 0.154541015625, "learning_rate": 8.155251141552512e-07, "loss": 0.0062, "reward": 1.6253038048744202, "reward_std": 0.26257922500371933, "rewards/accuracy_reward": 0.6409288048744202, "rewards/format_reward": 0.984375, "step": 404 }, { "completion_length": 80.0390625, "epoch": 1.8493150684931505, "grad_norm": 2.698310136795044, "kl": 0.10546875, "learning_rate": 8.150684931506849e-07, "loss": 0.0042, "reward": 1.6187500357627869, "reward_std": 0.1505398042500019, "rewards/accuracy_reward": 0.6343750059604645, "rewards/format_reward": 0.984375, "step": 405 }, { "completion_length": 88.8046875, "epoch": 1.8538812785388128, "grad_norm": 3.7120022773742676, "kl": 0.091064453125, "learning_rate": 8.146118721461187e-07, "loss": 0.0036, "reward": 1.6648437976837158, "reward_std": 0.16381803154945374, "rewards/accuracy_reward": 0.6882811784744263, "rewards/format_reward": 0.9765625, "step": 406 }, { "completion_length": 82.1171875, "epoch": 1.8584474885844748, "grad_norm": 17.568010330200195, "kl": 0.09521484375, "learning_rate": 8.141552511415525e-07, "loss": 0.0038, "reward": 1.5686570405960083, "reward_std": 0.27213824540376663, "rewards/accuracy_reward": 0.5920945107936859, "rewards/format_reward": 0.9765625, "step": 407 }, { "completion_length": 72.0390625, "epoch": 1.8630136986301369, "grad_norm": 5.123001575469971, "kl": 0.108642578125, "learning_rate": 8.136986301369862e-07, "loss": 0.0044, "reward": 1.6002225279808044, "reward_std": 0.1988915428519249, "rewards/accuracy_reward": 0.6002225577831268, "rewards/format_reward": 1.0, "step": 408 }, { "completion_length": 65.7890625, "epoch": 1.8675799086757991, "grad_norm": 3.825585126876831, "kl": 0.12646484375, "learning_rate": 8.132420091324201e-07, "loss": 0.0051, "reward": 1.5363582372665405, "reward_std": 0.22431423515081406, "rewards/accuracy_reward": 0.5363581627607346, "rewards/format_reward": 1.0, "step": 409 }, { "completion_length": 84.0859375, "epoch": 1.8721461187214612, "grad_norm": 2.313400983810425, "kl": 0.123779296875, "learning_rate": 8.127853881278538e-07, "loss": 0.005, "reward": 1.6218750476837158, "reward_std": 0.28108011931180954, "rewards/accuracy_reward": 0.7078124582767487, "rewards/format_reward": 0.9140625, "step": 410 }, { "completion_length": 88.03125, "epoch": 1.8767123287671232, "grad_norm": 4.612303256988525, "kl": 0.132568359375, "learning_rate": 8.123287671232877e-07, "loss": 0.0053, "reward": 1.662500023841858, "reward_std": 0.18329600244760513, "rewards/accuracy_reward": 0.6781249642372131, "rewards/format_reward": 0.984375, "step": 411 }, { "completion_length": 60.7265625, "epoch": 1.8812785388127855, "grad_norm": 2.7391302585601807, "kl": 0.14794921875, "learning_rate": 8.118721461187215e-07, "loss": 0.0059, "reward": 1.648708462715149, "reward_std": 0.15135541558265686, "rewards/accuracy_reward": 0.6487084329128265, "rewards/format_reward": 1.0, "step": 412 }, { "completion_length": 103.84375, "epoch": 1.8858447488584473, "grad_norm": 1.170571208000183, "kl": 0.0670166015625, "learning_rate": 8.114155251141552e-07, "loss": 0.0027, "reward": 1.811079502105713, "reward_std": 0.0749700665473938, "rewards/accuracy_reward": 0.8267044425010681, "rewards/format_reward": 0.984375, "step": 413 }, { "completion_length": 70.9921875, "epoch": 1.8904109589041096, "grad_norm": 3.7362253665924072, "kl": 0.14404296875, "learning_rate": 8.10958904109589e-07, "loss": 0.0058, "reward": 1.6984003186225891, "reward_std": 0.19452574849128723, "rewards/accuracy_reward": 0.7062127590179443, "rewards/format_reward": 0.9921875, "step": 414 }, { "completion_length": 66.953125, "epoch": 1.8949771689497716, "grad_norm": 4.104921340942383, "kl": 0.133056640625, "learning_rate": 8.105022831050228e-07, "loss": 0.0053, "reward": 1.5964038372039795, "reward_std": 0.2661294490098953, "rewards/accuracy_reward": 0.6042163074016571, "rewards/format_reward": 0.9921875, "step": 415 }, { "completion_length": 67.984375, "epoch": 1.8995433789954337, "grad_norm": 5.06357479095459, "kl": 0.11669921875, "learning_rate": 8.100456621004565e-07, "loss": 0.0047, "reward": 1.6529513597488403, "reward_std": 0.16535750776529312, "rewards/accuracy_reward": 0.6529513597488403, "rewards/format_reward": 1.0, "step": 416 }, { "completion_length": 72.1640625, "epoch": 1.904109589041096, "grad_norm": 9.87835693359375, "kl": 0.137451171875, "learning_rate": 8.095890410958903e-07, "loss": 0.0055, "reward": 1.6302083134651184, "reward_std": 0.21513652801513672, "rewards/accuracy_reward": 0.6458333134651184, "rewards/format_reward": 0.984375, "step": 417 }, { "completion_length": 73.6015625, "epoch": 1.908675799086758, "grad_norm": 10.113238334655762, "kl": 0.142333984375, "learning_rate": 8.091324200913242e-07, "loss": 0.0057, "reward": 1.6189236044883728, "reward_std": 0.23221635073423386, "rewards/accuracy_reward": 0.6345485746860504, "rewards/format_reward": 0.984375, "step": 418 }, { "completion_length": 62.078125, "epoch": 1.91324200913242, "grad_norm": 3.4071831703186035, "kl": 0.1513671875, "learning_rate": 8.08675799086758e-07, "loss": 0.0061, "reward": 1.356249988079071, "reward_std": 0.3108007460832596, "rewards/accuracy_reward": 0.36406250298023224, "rewards/format_reward": 0.9921875, "step": 419 }, { "completion_length": 83.203125, "epoch": 1.9178082191780823, "grad_norm": 3.302225351333618, "kl": 0.2509765625, "learning_rate": 8.082191780821918e-07, "loss": 0.0101, "reward": 1.6587789058685303, "reward_std": 0.1820889264345169, "rewards/accuracy_reward": 0.6665914356708527, "rewards/format_reward": 0.9921875, "step": 420 }, { "completion_length": 76.1015625, "epoch": 1.9223744292237441, "grad_norm": 4.783244609832764, "kl": 0.1474609375, "learning_rate": 8.077625570776255e-07, "loss": 0.0059, "reward": 1.7332961559295654, "reward_std": 0.11496374011039734, "rewards/accuracy_reward": 0.733296126127243, "rewards/format_reward": 1.0, "step": 421 }, { "completion_length": 94.6484375, "epoch": 1.9269406392694064, "grad_norm": 2.7827274799346924, "kl": 0.09375, "learning_rate": 8.073059360730593e-07, "loss": 0.0038, "reward": 1.8006826043128967, "reward_std": 0.09438200853765011, "rewards/accuracy_reward": 0.800682544708252, "rewards/format_reward": 1.0, "step": 422 }, { "completion_length": 70.4375, "epoch": 1.9315068493150684, "grad_norm": 4.186135768890381, "kl": 0.16455078125, "learning_rate": 8.068493150684931e-07, "loss": 0.0066, "reward": 1.6078130006790161, "reward_std": 0.21306797862052917, "rewards/accuracy_reward": 0.6078130900859833, "rewards/format_reward": 1.0, "step": 423 }, { "completion_length": 77.6953125, "epoch": 1.9360730593607305, "grad_norm": 2.545297861099243, "kl": 0.111083984375, "learning_rate": 8.063926940639269e-07, "loss": 0.0045, "reward": 1.6678841710090637, "reward_std": 0.11376722529530525, "rewards/accuracy_reward": 0.6756967306137085, "rewards/format_reward": 0.9921875, "step": 424 }, { "completion_length": 87.1171875, "epoch": 1.9406392694063928, "grad_norm": 2.4129421710968018, "kl": 0.080078125, "learning_rate": 8.059360730593608e-07, "loss": 0.0032, "reward": 1.6949777007102966, "reward_std": 0.14730913192033768, "rewards/accuracy_reward": 0.7027901709079742, "rewards/format_reward": 0.9921875, "step": 425 }, { "completion_length": 78.53125, "epoch": 1.9452054794520548, "grad_norm": 2.179104804992676, "kl": 0.120361328125, "learning_rate": 8.054794520547945e-07, "loss": 0.0048, "reward": 1.7349414825439453, "reward_std": 0.20185434818267822, "rewards/accuracy_reward": 0.7505663931369781, "rewards/format_reward": 0.984375, "step": 426 }, { "completion_length": 70.2578125, "epoch": 1.9497716894977168, "grad_norm": 3.5189919471740723, "kl": 0.15869140625, "learning_rate": 8.050228310502283e-07, "loss": 0.0064, "reward": 1.6359375715255737, "reward_std": 0.1778659224510193, "rewards/accuracy_reward": 0.6359374523162842, "rewards/format_reward": 1.0, "step": 427 }, { "completion_length": 91.140625, "epoch": 1.954337899543379, "grad_norm": 4.333745002746582, "kl": 0.07861328125, "learning_rate": 8.045662100456621e-07, "loss": 0.0031, "reward": 1.6484509706497192, "reward_std": 0.23022788017988205, "rewards/accuracy_reward": 0.6797009706497192, "rewards/format_reward": 0.96875, "step": 428 }, { "completion_length": 75.453125, "epoch": 1.958904109589041, "grad_norm": 3.482632637023926, "kl": 0.12841796875, "learning_rate": 8.041095890410958e-07, "loss": 0.0051, "reward": 1.6106771230697632, "reward_std": 0.23327118158340454, "rewards/accuracy_reward": 0.626302033662796, "rewards/format_reward": 0.984375, "step": 429 }, { "completion_length": 87.984375, "epoch": 1.9634703196347032, "grad_norm": 1.5310007333755493, "kl": 0.12451171875, "learning_rate": 8.036529680365296e-07, "loss": 0.005, "reward": 1.7687500715255737, "reward_std": 0.1117947231978178, "rewards/accuracy_reward": 0.7765624523162842, "rewards/format_reward": 0.9921875, "step": 430 }, { "completion_length": 74.0625, "epoch": 1.9680365296803652, "grad_norm": 2.0865707397460938, "kl": 0.120849609375, "learning_rate": 8.031963470319635e-07, "loss": 0.0048, "reward": 1.7373016476631165, "reward_std": 0.1270090974867344, "rewards/accuracy_reward": 0.7373015582561493, "rewards/format_reward": 1.0, "step": 431 }, { "completion_length": 74.6953125, "epoch": 1.9726027397260273, "grad_norm": 2.8522982597351074, "kl": 0.104248046875, "learning_rate": 8.027397260273972e-07, "loss": 0.0042, "reward": 1.6342076063156128, "reward_std": 0.18551241606473923, "rewards/accuracy_reward": 0.6498326063156128, "rewards/format_reward": 0.984375, "step": 432 }, { "completion_length": 86.6484375, "epoch": 1.9771689497716896, "grad_norm": 2.4067180156707764, "kl": 0.10009765625, "learning_rate": 8.022831050228311e-07, "loss": 0.004, "reward": 1.6945313215255737, "reward_std": 0.11847387999296188, "rewards/accuracy_reward": 0.694531261920929, "rewards/format_reward": 1.0, "step": 433 }, { "completion_length": 80.296875, "epoch": 1.9817351598173516, "grad_norm": 2.204301357269287, "kl": 0.12890625, "learning_rate": 8.018264840182648e-07, "loss": 0.0052, "reward": 1.6941722631454468, "reward_std": 0.16894984245300293, "rewards/accuracy_reward": 0.7176096439361572, "rewards/format_reward": 0.9765625, "step": 434 }, { "completion_length": 69.578125, "epoch": 1.9863013698630136, "grad_norm": 3.0245296955108643, "kl": 0.127197265625, "learning_rate": 8.013698630136985e-07, "loss": 0.0051, "reward": 1.627698838710785, "reward_std": 0.1703873947262764, "rewards/accuracy_reward": 0.6276988387107849, "rewards/format_reward": 1.0, "step": 435 }, { "completion_length": 59.9140625, "epoch": 1.990867579908676, "grad_norm": 3.451076030731201, "kl": 0.14404296875, "learning_rate": 8.009132420091324e-07, "loss": 0.0058, "reward": 1.5428819060325623, "reward_std": 0.2504672184586525, "rewards/accuracy_reward": 0.542881965637207, "rewards/format_reward": 1.0, "step": 436 }, { "completion_length": 84.5625, "epoch": 1.9954337899543377, "grad_norm": 2.452648639678955, "kl": 0.08056640625, "learning_rate": 8.004566210045661e-07, "loss": 0.0032, "reward": 1.6554688215255737, "reward_std": 0.2031169831752777, "rewards/accuracy_reward": 0.6710937917232513, "rewards/format_reward": 0.984375, "step": 437 }, { "completion_length": 45.5, "epoch": 2.0, "grad_norm": 1.8669296503067017, "kl": 0.126953125, "learning_rate": 8e-07, "loss": 0.0047, "reward": 1.875, "reward_std": 0.43671509623527527, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "step": 438 }, { "completion_length": 79.21875, "epoch": 2.0045662100456623, "grad_norm": 1.4508730173110962, "kl": 0.127685546875, "learning_rate": 7.995433789954338e-07, "loss": 0.0051, "reward": 1.7248697876930237, "reward_std": 0.1105603277683258, "rewards/accuracy_reward": 0.7404947876930237, "rewards/format_reward": 0.984375, "step": 439 }, { "completion_length": 62.09375, "epoch": 2.009132420091324, "grad_norm": 2.7377336025238037, "kl": 0.1376953125, "learning_rate": 7.990867579908675e-07, "loss": 0.0055, "reward": 1.3338541984558105, "reward_std": 0.3162979334592819, "rewards/accuracy_reward": 0.34947916865348816, "rewards/format_reward": 0.984375, "step": 440 }, { "completion_length": 87.34375, "epoch": 2.0136986301369864, "grad_norm": 2.2570443153381348, "kl": 0.12060546875, "learning_rate": 7.986301369863014e-07, "loss": 0.0048, "reward": 1.7250688076019287, "reward_std": 0.08361868560314178, "rewards/accuracy_reward": 0.7250687181949615, "rewards/format_reward": 1.0, "step": 441 }, { "completion_length": 94.140625, "epoch": 2.018264840182648, "grad_norm": 8.999687194824219, "kl": 0.10595703125, "learning_rate": 7.981735159817351e-07, "loss": 0.0042, "reward": 1.6828125715255737, "reward_std": 0.16398613899946213, "rewards/accuracy_reward": 0.7062499523162842, "rewards/format_reward": 0.9765625, "step": 442 }, { "completion_length": 69.984375, "epoch": 2.0228310502283104, "grad_norm": 2.834467649459839, "kl": 0.096923828125, "learning_rate": 7.977168949771688e-07, "loss": 0.0039, "reward": 1.5455728769302368, "reward_std": 0.25657252967357635, "rewards/accuracy_reward": 0.5533854067325592, "rewards/format_reward": 0.9921875, "step": 443 }, { "completion_length": 98.921875, "epoch": 2.0273972602739727, "grad_norm": 2.6678521633148193, "kl": 0.082275390625, "learning_rate": 7.972602739726027e-07, "loss": 0.0033, "reward": 1.663573145866394, "reward_std": 0.1522715613245964, "rewards/accuracy_reward": 0.6635731160640717, "rewards/format_reward": 1.0, "step": 444 }, { "completion_length": 80.84375, "epoch": 2.0319634703196345, "grad_norm": 1.9825059175491333, "kl": 0.09326171875, "learning_rate": 7.968036529680365e-07, "loss": 0.0037, "reward": 1.603124976158142, "reward_std": 0.1767766922712326, "rewards/accuracy_reward": 0.6109374761581421, "rewards/format_reward": 0.9921875, "step": 445 }, { "completion_length": 78.09375, "epoch": 2.036529680365297, "grad_norm": 5.509555816650391, "kl": 0.1341552734375, "learning_rate": 7.963470319634703e-07, "loss": 0.0054, "reward": 1.6062500476837158, "reward_std": 0.24452467262744904, "rewards/accuracy_reward": 0.621874988079071, "rewards/format_reward": 0.984375, "step": 446 }, { "completion_length": 59.6484375, "epoch": 2.041095890410959, "grad_norm": 4.631182670593262, "kl": 0.154296875, "learning_rate": 7.958904109589041e-07, "loss": 0.0062, "reward": 1.6522735357284546, "reward_std": 0.24951402842998505, "rewards/accuracy_reward": 0.667898565530777, "rewards/format_reward": 0.984375, "step": 447 }, { "completion_length": 69.25, "epoch": 2.045662100456621, "grad_norm": 3.0625455379486084, "kl": 0.152099609375, "learning_rate": 7.954337899543378e-07, "loss": 0.0061, "reward": 1.6024305820465088, "reward_std": 0.21730707585811615, "rewards/accuracy_reward": 0.6102430522441864, "rewards/format_reward": 0.9921875, "step": 448 }, { "completion_length": 66.5859375, "epoch": 2.050228310502283, "grad_norm": 7.600069522857666, "kl": 0.118408203125, "learning_rate": 7.949771689497717e-07, "loss": 0.0047, "reward": 1.7647135853767395, "reward_std": 0.16752880066633224, "rewards/accuracy_reward": 0.7725259959697723, "rewards/format_reward": 0.9921875, "step": 449 }, { "completion_length": 83.8828125, "epoch": 2.0547945205479454, "grad_norm": 4.596118450164795, "kl": 0.149658203125, "learning_rate": 7.945205479452054e-07, "loss": 0.006, "reward": 1.6769480109214783, "reward_std": 0.1898738443851471, "rewards/accuracy_reward": 0.684760570526123, "rewards/format_reward": 0.9921875, "step": 450 }, { "completion_length": 48.546875, "epoch": 2.0593607305936072, "grad_norm": 3.093118906021118, "kl": 0.171875, "learning_rate": 7.940639269406393e-07, "loss": 0.0069, "reward": 1.6568829417228699, "reward_std": 0.20424779504537582, "rewards/accuracy_reward": 0.656883031129837, "rewards/format_reward": 1.0, "step": 451 }, { "completion_length": 73.203125, "epoch": 2.0639269406392695, "grad_norm": 3.5054690837860107, "kl": 0.135498046875, "learning_rate": 7.936073059360731e-07, "loss": 0.0054, "reward": 1.6708519458770752, "reward_std": 0.18046213686466217, "rewards/accuracy_reward": 0.6708519458770752, "rewards/format_reward": 1.0, "step": 452 }, { "completion_length": 78.1953125, "epoch": 2.0684931506849313, "grad_norm": 1.87969172000885, "kl": 0.096923828125, "learning_rate": 7.931506849315068e-07, "loss": 0.0039, "reward": 1.6500000953674316, "reward_std": 0.1841355338692665, "rewards/accuracy_reward": 0.6656249761581421, "rewards/format_reward": 0.984375, "step": 453 }, { "completion_length": 66.203125, "epoch": 2.0730593607305936, "grad_norm": 3.03586483001709, "kl": 0.116455078125, "learning_rate": 7.926940639269406e-07, "loss": 0.0047, "reward": 1.5984020233154297, "reward_std": 0.2402767539024353, "rewards/accuracy_reward": 0.6218394637107849, "rewards/format_reward": 0.9765625, "step": 454 }, { "completion_length": 60.4453125, "epoch": 2.077625570776256, "grad_norm": 5.758031368255615, "kl": 0.15185546875, "learning_rate": 7.922374429223744e-07, "loss": 0.0061, "reward": 1.6396695375442505, "reward_std": 0.18095022439956665, "rewards/accuracy_reward": 0.6396694481372833, "rewards/format_reward": 1.0, "step": 455 }, { "completion_length": 80.0703125, "epoch": 2.0821917808219177, "grad_norm": 2.098630905151367, "kl": 0.105712890625, "learning_rate": 7.917808219178081e-07, "loss": 0.0042, "reward": 1.6691096425056458, "reward_std": 0.1658947691321373, "rewards/accuracy_reward": 0.684734582901001, "rewards/format_reward": 0.984375, "step": 456 }, { "completion_length": 72.2734375, "epoch": 2.08675799086758, "grad_norm": 2.7200841903686523, "kl": 0.11083984375, "learning_rate": 7.91324200913242e-07, "loss": 0.0044, "reward": 1.5380051136016846, "reward_std": 0.2077661082148552, "rewards/accuracy_reward": 0.5458175092935562, "rewards/format_reward": 0.9921875, "step": 457 }, { "completion_length": 85.140625, "epoch": 2.091324200913242, "grad_norm": 12.362757682800293, "kl": 0.091552734375, "learning_rate": 7.908675799086758e-07, "loss": 0.0037, "reward": 1.6828125715255737, "reward_std": 0.19778337329626083, "rewards/accuracy_reward": 0.6984374821186066, "rewards/format_reward": 0.984375, "step": 458 }, { "completion_length": 84.1875, "epoch": 2.095890410958904, "grad_norm": 2.426860809326172, "kl": 0.10595703125, "learning_rate": 7.904109589041096e-07, "loss": 0.0042, "reward": 1.6260236501693726, "reward_std": 0.18746323138475418, "rewards/accuracy_reward": 0.6338361203670502, "rewards/format_reward": 0.9921875, "step": 459 }, { "completion_length": 91.34375, "epoch": 2.1004566210045663, "grad_norm": 9.778014183044434, "kl": 0.09765625, "learning_rate": 7.899543378995434e-07, "loss": 0.0039, "reward": 1.660937488079071, "reward_std": 0.19728107750415802, "rewards/accuracy_reward": 0.6921874582767487, "rewards/format_reward": 0.96875, "step": 460 }, { "completion_length": 60.7265625, "epoch": 2.105022831050228, "grad_norm": 12.834918975830078, "kl": 0.32470703125, "learning_rate": 7.894977168949771e-07, "loss": 0.0129, "reward": 1.7225513458251953, "reward_std": 0.1815098226070404, "rewards/accuracy_reward": 0.7225514352321625, "rewards/format_reward": 1.0, "step": 461 }, { "completion_length": 65.0546875, "epoch": 2.1095890410958904, "grad_norm": 3.4787039756774902, "kl": 0.1533203125, "learning_rate": 7.890410958904109e-07, "loss": 0.0061, "reward": 1.6404947638511658, "reward_std": 0.19807539880275726, "rewards/accuracy_reward": 0.6483072936534882, "rewards/format_reward": 0.9921875, "step": 462 }, { "completion_length": 82.46875, "epoch": 2.1141552511415527, "grad_norm": 2.7784857749938965, "kl": 0.1136474609375, "learning_rate": 7.885844748858447e-07, "loss": 0.0045, "reward": 1.6932291984558105, "reward_std": 0.14191660657525063, "rewards/accuracy_reward": 0.6932291388511658, "rewards/format_reward": 1.0, "step": 463 }, { "completion_length": 78.5, "epoch": 2.1187214611872145, "grad_norm": 2.705900192260742, "kl": 0.098388671875, "learning_rate": 7.881278538812784e-07, "loss": 0.0039, "reward": 1.6399182081222534, "reward_std": 0.18248122185468674, "rewards/accuracy_reward": 0.6477306485176086, "rewards/format_reward": 0.9921875, "step": 464 }, { "completion_length": 88.0703125, "epoch": 2.1232876712328768, "grad_norm": 2.8577768802642822, "kl": 0.099609375, "learning_rate": 7.876712328767124e-07, "loss": 0.004, "reward": 1.688281238079071, "reward_std": 0.19159993529319763, "rewards/accuracy_reward": 0.6960937082767487, "rewards/format_reward": 0.9921875, "step": 465 }, { "completion_length": 66.6015625, "epoch": 2.127853881278539, "grad_norm": 2.5234570503234863, "kl": 0.12158203125, "learning_rate": 7.872146118721461e-07, "loss": 0.0049, "reward": 1.5919778943061829, "reward_std": 0.24576038867235184, "rewards/accuracy_reward": 0.6076028943061829, "rewards/format_reward": 0.984375, "step": 466 }, { "completion_length": 75.390625, "epoch": 2.132420091324201, "grad_norm": 3.172330141067505, "kl": 0.1142578125, "learning_rate": 7.867579908675798e-07, "loss": 0.0046, "reward": 1.4447365403175354, "reward_std": 0.22060814499855042, "rewards/accuracy_reward": 0.452549085021019, "rewards/format_reward": 0.9921875, "step": 467 }, { "completion_length": 91.1640625, "epoch": 2.136986301369863, "grad_norm": 2.385173797607422, "kl": 0.0966796875, "learning_rate": 7.863013698630137e-07, "loss": 0.0039, "reward": 1.7272321581840515, "reward_std": 0.13396714627742767, "rewards/accuracy_reward": 0.7350445687770844, "rewards/format_reward": 0.9921875, "step": 468 }, { "completion_length": 75.5234375, "epoch": 2.141552511415525, "grad_norm": 2.3278234004974365, "kl": 0.130126953125, "learning_rate": 7.858447488584474e-07, "loss": 0.0052, "reward": 1.643117606639862, "reward_std": 0.1804744228720665, "rewards/accuracy_reward": 0.6509300172328949, "rewards/format_reward": 0.9921875, "step": 469 }, { "completion_length": 63.9453125, "epoch": 2.146118721461187, "grad_norm": 5.3924360275268555, "kl": 0.16552734375, "learning_rate": 7.853881278538812e-07, "loss": 0.0066, "reward": 1.5500783324241638, "reward_std": 0.25609923899173737, "rewards/accuracy_reward": 0.5735158026218414, "rewards/format_reward": 0.9765625, "step": 470 }, { "completion_length": 86.0625, "epoch": 2.1506849315068495, "grad_norm": 2.030338764190674, "kl": 0.1171875, "learning_rate": 7.849315068493151e-07, "loss": 0.0047, "reward": 1.6851562857627869, "reward_std": 0.14745555073022842, "rewards/accuracy_reward": 0.6929687559604645, "rewards/format_reward": 0.9921875, "step": 471 }, { "completion_length": 66.0, "epoch": 2.1552511415525113, "grad_norm": 18.199661254882812, "kl": 0.13330078125, "learning_rate": 7.844748858447488e-07, "loss": 0.0053, "reward": 1.486718773841858, "reward_std": 0.32600878179073334, "rewards/accuracy_reward": 0.5101562440395355, "rewards/format_reward": 0.9765625, "step": 472 }, { "completion_length": 93.5625, "epoch": 2.1598173515981736, "grad_norm": 3.143573760986328, "kl": 0.07470703125, "learning_rate": 7.840182648401827e-07, "loss": 0.003, "reward": 1.7031250596046448, "reward_std": 0.18717344850301743, "rewards/accuracy_reward": 0.7109374701976776, "rewards/format_reward": 0.9921875, "step": 473 }, { "completion_length": 68.9453125, "epoch": 2.1643835616438354, "grad_norm": 3.1616029739379883, "kl": 0.121337890625, "learning_rate": 7.835616438356164e-07, "loss": 0.0049, "reward": 1.6342764496803284, "reward_std": 0.23029568046331406, "rewards/accuracy_reward": 0.6499014496803284, "rewards/format_reward": 0.984375, "step": 474 }, { "completion_length": 80.5859375, "epoch": 2.1689497716894977, "grad_norm": 2.141812324523926, "kl": 0.11572265625, "learning_rate": 7.831050228310501e-07, "loss": 0.0046, "reward": 1.7305381298065186, "reward_std": 0.19372030347585678, "rewards/accuracy_reward": 0.7383506596088409, "rewards/format_reward": 0.9921875, "step": 475 }, { "completion_length": 89.4921875, "epoch": 2.17351598173516, "grad_norm": 3.011232852935791, "kl": 0.098876953125, "learning_rate": 7.82648401826484e-07, "loss": 0.004, "reward": 1.60247403383255, "reward_std": 0.28417903184890747, "rewards/accuracy_reward": 0.6415364444255829, "rewards/format_reward": 0.9609375, "step": 476 }, { "completion_length": 88.90625, "epoch": 2.1780821917808217, "grad_norm": 1.868679165840149, "kl": 0.101806640625, "learning_rate": 7.821917808219177e-07, "loss": 0.0041, "reward": 1.5997712016105652, "reward_std": 0.1929171234369278, "rewards/accuracy_reward": 0.5997711420059204, "rewards/format_reward": 1.0, "step": 477 }, { "completion_length": 102.1640625, "epoch": 2.182648401826484, "grad_norm": 3.3878626823425293, "kl": 0.11962890625, "learning_rate": 7.817351598173516e-07, "loss": 0.0048, "reward": 1.7378038167953491, "reward_std": 0.13828756287693977, "rewards/accuracy_reward": 0.7456162571907043, "rewards/format_reward": 0.9921875, "step": 478 }, { "completion_length": 73.65625, "epoch": 2.1872146118721463, "grad_norm": 3.133633613586426, "kl": 0.1357421875, "learning_rate": 7.812785388127854e-07, "loss": 0.0054, "reward": 1.7677083611488342, "reward_std": 0.18527808785438538, "rewards/accuracy_reward": 0.7911458313465118, "rewards/format_reward": 0.9765625, "step": 479 }, { "completion_length": 65.0703125, "epoch": 2.191780821917808, "grad_norm": 2.9012463092803955, "kl": 0.14794921875, "learning_rate": 7.808219178082191e-07, "loss": 0.0059, "reward": 1.5422247648239136, "reward_std": 0.2771962434053421, "rewards/accuracy_reward": 0.5500372052192688, "rewards/format_reward": 0.9921875, "step": 480 }, { "completion_length": 80.53125, "epoch": 2.1963470319634704, "grad_norm": 7.046329498291016, "kl": 0.103271484375, "learning_rate": 7.80365296803653e-07, "loss": 0.0041, "reward": 1.66796875, "reward_std": 0.1984855979681015, "rewards/accuracy_reward": 0.6679687798023224, "rewards/format_reward": 1.0, "step": 481 }, { "completion_length": 81.921875, "epoch": 2.2009132420091326, "grad_norm": 2.9581196308135986, "kl": 0.108642578125, "learning_rate": 7.799086757990867e-07, "loss": 0.0043, "reward": 1.6520833373069763, "reward_std": 0.23736542463302612, "rewards/accuracy_reward": 0.6598958075046539, "rewards/format_reward": 0.9921875, "step": 482 }, { "completion_length": 79.78125, "epoch": 2.2054794520547945, "grad_norm": 8.972982406616211, "kl": 0.10693359375, "learning_rate": 7.794520547945204e-07, "loss": 0.0043, "reward": 1.5802912712097168, "reward_std": 0.21900298446416855, "rewards/accuracy_reward": 0.580291211605072, "rewards/format_reward": 1.0, "step": 483 }, { "completion_length": 83.5703125, "epoch": 2.2100456621004567, "grad_norm": 2.1765618324279785, "kl": 0.12548828125, "learning_rate": 7.789954337899543e-07, "loss": 0.005, "reward": 1.6838542222976685, "reward_std": 0.21648824214935303, "rewards/accuracy_reward": 0.6838541924953461, "rewards/format_reward": 1.0, "step": 484 }, { "completion_length": 82.1171875, "epoch": 2.2146118721461185, "grad_norm": 6.6184563636779785, "kl": 0.13916015625, "learning_rate": 7.785388127853881e-07, "loss": 0.0056, "reward": 1.576785683631897, "reward_std": 0.2656140699982643, "rewards/accuracy_reward": 0.615848183631897, "rewards/format_reward": 0.9609375, "step": 485 }, { "completion_length": 79.90625, "epoch": 2.219178082191781, "grad_norm": 3.397468328475952, "kl": 0.12109375, "learning_rate": 7.780821917808219e-07, "loss": 0.0048, "reward": 1.6415550708770752, "reward_std": 0.1549607552587986, "rewards/accuracy_reward": 0.6571800261735916, "rewards/format_reward": 0.984375, "step": 486 }, { "completion_length": 84.609375, "epoch": 2.223744292237443, "grad_norm": 2.6514155864715576, "kl": 0.127685546875, "learning_rate": 7.776255707762557e-07, "loss": 0.0051, "reward": 1.6367188096046448, "reward_std": 0.24639248847961426, "rewards/accuracy_reward": 0.64453125, "rewards/format_reward": 0.9921875, "step": 487 }, { "completion_length": 67.40625, "epoch": 2.228310502283105, "grad_norm": 2.794630289077759, "kl": 0.130859375, "learning_rate": 7.771689497716894e-07, "loss": 0.0052, "reward": 1.7186384201049805, "reward_std": 0.18474777042865753, "rewards/accuracy_reward": 0.7264508605003357, "rewards/format_reward": 0.9921875, "step": 488 }, { "completion_length": 89.7578125, "epoch": 2.232876712328767, "grad_norm": 3.7008538246154785, "kl": 0.115478515625, "learning_rate": 7.767123287671233e-07, "loss": 0.0046, "reward": 1.6265625953674316, "reward_std": 0.28076815605163574, "rewards/accuracy_reward": 0.6656249761581421, "rewards/format_reward": 0.9609375, "step": 489 }, { "completion_length": 66.0, "epoch": 2.237442922374429, "grad_norm": 5.58867883682251, "kl": 0.19970703125, "learning_rate": 7.76255707762557e-07, "loss": 0.008, "reward": 1.714453101158142, "reward_std": 0.2889961302280426, "rewards/accuracy_reward": 0.7378906309604645, "rewards/format_reward": 0.9765625, "step": 490 }, { "completion_length": 71.7265625, "epoch": 2.2420091324200913, "grad_norm": 2.161491870880127, "kl": 0.137451171875, "learning_rate": 7.757990867579909e-07, "loss": 0.0055, "reward": 1.5816146731376648, "reward_std": 0.19107923656702042, "rewards/accuracy_reward": 0.5894270539283752, "rewards/format_reward": 0.9921875, "step": 491 }, { "completion_length": 79.7734375, "epoch": 2.2465753424657535, "grad_norm": 5.536935329437256, "kl": 0.09423828125, "learning_rate": 7.753424657534247e-07, "loss": 0.0038, "reward": 1.5820313096046448, "reward_std": 0.23220208287239075, "rewards/accuracy_reward": 0.5976562649011612, "rewards/format_reward": 0.984375, "step": 492 }, { "completion_length": 105.1640625, "epoch": 2.2511415525114153, "grad_norm": 2.601624011993408, "kl": 0.102294921875, "learning_rate": 7.748858447488584e-07, "loss": 0.0041, "reward": 1.7345969676971436, "reward_std": 0.1583983302116394, "rewards/accuracy_reward": 0.7424094080924988, "rewards/format_reward": 0.9921875, "step": 493 }, { "completion_length": 76.2578125, "epoch": 2.2557077625570776, "grad_norm": 3.9378445148468018, "kl": 0.14208984375, "learning_rate": 7.744292237442922e-07, "loss": 0.0057, "reward": 1.6885417103767395, "reward_std": 0.24918173253536224, "rewards/accuracy_reward": 0.7197916507720947, "rewards/format_reward": 0.96875, "step": 494 }, { "completion_length": 69.578125, "epoch": 2.26027397260274, "grad_norm": 9.057132720947266, "kl": 0.1142578125, "learning_rate": 7.73972602739726e-07, "loss": 0.0046, "reward": 1.7063058018684387, "reward_std": 0.18320050090551376, "rewards/accuracy_reward": 0.7141183018684387, "rewards/format_reward": 0.9921875, "step": 495 }, { "completion_length": 58.1640625, "epoch": 2.2648401826484017, "grad_norm": 3.844257116317749, "kl": 0.14892578125, "learning_rate": 7.735159817351597e-07, "loss": 0.006, "reward": 1.5278646349906921, "reward_std": 0.2481004297733307, "rewards/accuracy_reward": 0.5278645753860474, "rewards/format_reward": 1.0, "step": 496 }, { "completion_length": 88.9140625, "epoch": 2.269406392694064, "grad_norm": 8.18749713897705, "kl": 0.098876953125, "learning_rate": 7.730593607305936e-07, "loss": 0.004, "reward": 1.749678373336792, "reward_std": 0.16354048997163773, "rewards/accuracy_reward": 0.7496782541275024, "rewards/format_reward": 1.0, "step": 497 }, { "completion_length": 93.5078125, "epoch": 2.2739726027397262, "grad_norm": 2.6278417110443115, "kl": 0.077392578125, "learning_rate": 7.726027397260274e-07, "loss": 0.0031, "reward": 1.765897810459137, "reward_std": 0.21192234754562378, "rewards/accuracy_reward": 0.8049602508544922, "rewards/format_reward": 0.9609375, "step": 498 }, { "completion_length": 72.2421875, "epoch": 2.278538812785388, "grad_norm": 2.370776414871216, "kl": 0.112548828125, "learning_rate": 7.721461187214611e-07, "loss": 0.0045, "reward": 1.6037201881408691, "reward_std": 0.22106194496154785, "rewards/accuracy_reward": 0.6193452179431915, "rewards/format_reward": 0.984375, "step": 499 }, { "completion_length": 57.5703125, "epoch": 2.2831050228310503, "grad_norm": 27.69339942932129, "kl": 0.10791015625, "learning_rate": 7.71689497716895e-07, "loss": 0.0043, "reward": 1.6528646349906921, "reward_std": 0.2068563476204872, "rewards/accuracy_reward": 0.652864545583725, "rewards/format_reward": 1.0, "step": 500 }, { "completion_length": 74.9296875, "epoch": 2.287671232876712, "grad_norm": 3.454657793045044, "kl": 0.101318359375, "learning_rate": 7.712328767123287e-07, "loss": 0.0041, "reward": 1.5783380270004272, "reward_std": 0.300783634185791, "rewards/accuracy_reward": 0.6017755568027496, "rewards/format_reward": 0.9765625, "step": 501 }, { "completion_length": 73.5, "epoch": 2.2922374429223744, "grad_norm": 2.893542766571045, "kl": 0.117919921875, "learning_rate": 7.707762557077625e-07, "loss": 0.0047, "reward": 1.6714488863945007, "reward_std": 0.23431023210287094, "rewards/accuracy_reward": 0.6948863565921783, "rewards/format_reward": 0.9765625, "step": 502 }, { "completion_length": 90.9921875, "epoch": 2.2968036529680367, "grad_norm": 4.334531307220459, "kl": 0.075439453125, "learning_rate": 7.703196347031963e-07, "loss": 0.003, "reward": 1.6945313215255737, "reward_std": 0.22123288363218307, "rewards/accuracy_reward": 0.7179687321186066, "rewards/format_reward": 0.9765625, "step": 503 }, { "completion_length": 89.3984375, "epoch": 2.3013698630136985, "grad_norm": 8.32867431640625, "kl": 0.140869140625, "learning_rate": 7.6986301369863e-07, "loss": 0.0056, "reward": 1.6440104246139526, "reward_std": 0.12583958357572556, "rewards/accuracy_reward": 0.644010454416275, "rewards/format_reward": 1.0, "step": 504 }, { "completion_length": 78.046875, "epoch": 2.3059360730593608, "grad_norm": 3.375293493270874, "kl": 0.1474609375, "learning_rate": 7.69406392694064e-07, "loss": 0.0059, "reward": 1.7735260128974915, "reward_std": 0.1295642852783203, "rewards/accuracy_reward": 0.7813384830951691, "rewards/format_reward": 0.9921875, "step": 505 }, { "completion_length": 73.734375, "epoch": 2.3105022831050226, "grad_norm": 4.356123924255371, "kl": 0.114501953125, "learning_rate": 7.689497716894977e-07, "loss": 0.0046, "reward": 1.7072916626930237, "reward_std": 0.1626647561788559, "rewards/accuracy_reward": 0.7072916328907013, "rewards/format_reward": 1.0, "step": 506 }, { "completion_length": 66.375, "epoch": 2.315068493150685, "grad_norm": 2.643831253051758, "kl": 0.1552734375, "learning_rate": 7.684931506849314e-07, "loss": 0.0062, "reward": 1.7758206725120544, "reward_std": 0.17595528066158295, "rewards/accuracy_reward": 0.7914457023143768, "rewards/format_reward": 0.984375, "step": 507 }, { "completion_length": 65.109375, "epoch": 2.319634703196347, "grad_norm": 6.30082368850708, "kl": 0.15869140625, "learning_rate": 7.680365296803653e-07, "loss": 0.0064, "reward": 1.5708190202713013, "reward_std": 0.1858999952673912, "rewards/accuracy_reward": 0.5708190500736237, "rewards/format_reward": 1.0, "step": 508 }, { "completion_length": 76.015625, "epoch": 2.324200913242009, "grad_norm": 3.34980845451355, "kl": 0.1240234375, "learning_rate": 7.67579908675799e-07, "loss": 0.005, "reward": 1.6061203479766846, "reward_std": 0.2387639731168747, "rewards/accuracy_reward": 0.6295577883720398, "rewards/format_reward": 0.9765625, "step": 509 }, { "completion_length": 69.5546875, "epoch": 2.328767123287671, "grad_norm": 2.7794432640075684, "kl": 0.108642578125, "learning_rate": 7.671232876712328e-07, "loss": 0.0043, "reward": 1.628348171710968, "reward_std": 0.22845705598592758, "rewards/accuracy_reward": 0.6361607015132904, "rewards/format_reward": 0.9921875, "step": 510 }, { "completion_length": 77.1171875, "epoch": 2.3333333333333335, "grad_norm": 1.9683984518051147, "kl": 0.10400390625, "learning_rate": 7.666666666666667e-07, "loss": 0.0041, "reward": 1.7567708492279053, "reward_std": 0.1648455262184143, "rewards/accuracy_reward": 0.7723957598209381, "rewards/format_reward": 0.984375, "step": 511 }, { "completion_length": 91.765625, "epoch": 2.3378995433789953, "grad_norm": 7.019916534423828, "kl": 0.120849609375, "learning_rate": 7.662100456621004e-07, "loss": 0.0048, "reward": 1.7004202008247375, "reward_std": 0.13108576089143753, "rewards/accuracy_reward": 0.708232581615448, "rewards/format_reward": 0.9921875, "step": 512 }, { "completion_length": 80.7421875, "epoch": 2.3424657534246576, "grad_norm": 2.83266282081604, "kl": 0.13134765625, "learning_rate": 7.657534246575343e-07, "loss": 0.0053, "reward": 1.658984363079071, "reward_std": 0.20922444760799408, "rewards/accuracy_reward": 0.658984363079071, "rewards/format_reward": 1.0, "step": 513 }, { "completion_length": 71.234375, "epoch": 2.34703196347032, "grad_norm": 1.9871636629104614, "kl": 0.12646484375, "learning_rate": 7.65296803652968e-07, "loss": 0.005, "reward": 1.725781261920929, "reward_std": 0.10922157764434814, "rewards/accuracy_reward": 0.7257812023162842, "rewards/format_reward": 1.0, "step": 514 }, { "completion_length": 73.7265625, "epoch": 2.3515981735159817, "grad_norm": 2.359335422515869, "kl": 0.11279296875, "learning_rate": 7.648401826484017e-07, "loss": 0.0045, "reward": 1.772805094718933, "reward_std": 0.16348526254296303, "rewards/accuracy_reward": 0.7806175947189331, "rewards/format_reward": 0.9921875, "step": 515 }, { "completion_length": 60.3828125, "epoch": 2.356164383561644, "grad_norm": 5.7438812255859375, "kl": 0.14208984375, "learning_rate": 7.643835616438356e-07, "loss": 0.0057, "reward": 1.751901626586914, "reward_std": 0.17701375484466553, "rewards/accuracy_reward": 0.7597140967845917, "rewards/format_reward": 0.9921875, "step": 516 }, { "completion_length": 89.5703125, "epoch": 2.3607305936073057, "grad_norm": 4.794968128204346, "kl": 0.0986328125, "learning_rate": 7.639269406392693e-07, "loss": 0.0039, "reward": 1.7277343273162842, "reward_std": 0.1568085253238678, "rewards/accuracy_reward": 0.7355467975139618, "rewards/format_reward": 0.9921875, "step": 517 }, { "completion_length": 72.1015625, "epoch": 2.365296803652968, "grad_norm": 3.052950143814087, "kl": 0.10009765625, "learning_rate": 7.634703196347032e-07, "loss": 0.004, "reward": 1.7309381365776062, "reward_std": 0.13279738277196884, "rewards/accuracy_reward": 0.7387505769729614, "rewards/format_reward": 0.9921875, "step": 518 }, { "completion_length": 96.421875, "epoch": 2.3698630136986303, "grad_norm": 1.318949818611145, "kl": 0.08251953125, "learning_rate": 7.63013698630137e-07, "loss": 0.0033, "reward": 1.78125, "reward_std": 0.18937532603740692, "rewards/accuracy_reward": 0.8203124403953552, "rewards/format_reward": 0.9609375, "step": 519 }, { "completion_length": 76.15625, "epoch": 2.374429223744292, "grad_norm": 2.7549917697906494, "kl": 0.11669921875, "learning_rate": 7.625570776255707e-07, "loss": 0.0047, "reward": 1.7448863983154297, "reward_std": 0.23332231491804123, "rewards/accuracy_reward": 0.7683238387107849, "rewards/format_reward": 0.9765625, "step": 520 }, { "completion_length": 58.6328125, "epoch": 2.3789954337899544, "grad_norm": 3.747992753982544, "kl": 0.1669921875, "learning_rate": 7.621004566210046e-07, "loss": 0.0067, "reward": 1.6135417222976685, "reward_std": 0.16909091174602509, "rewards/accuracy_reward": 0.6135416924953461, "rewards/format_reward": 1.0, "step": 521 }, { "completion_length": 59.4296875, "epoch": 2.383561643835616, "grad_norm": 5.62481164932251, "kl": 0.14990234375, "learning_rate": 7.616438356164383e-07, "loss": 0.006, "reward": 1.5095030069351196, "reward_std": 0.3347364068031311, "rewards/accuracy_reward": 0.5407529026269913, "rewards/format_reward": 0.96875, "step": 522 }, { "completion_length": 73.7109375, "epoch": 2.3881278538812785, "grad_norm": 2.828101634979248, "kl": 0.15869140625, "learning_rate": 7.61187214611872e-07, "loss": 0.0063, "reward": 1.6775281429290771, "reward_std": 0.15744981169700623, "rewards/accuracy_reward": 0.6775281727313995, "rewards/format_reward": 1.0, "step": 523 }, { "completion_length": 79.359375, "epoch": 2.3926940639269407, "grad_norm": 1.9861787557601929, "kl": 0.1162109375, "learning_rate": 7.607305936073059e-07, "loss": 0.0046, "reward": 1.645312488079071, "reward_std": 0.12342093884944916, "rewards/accuracy_reward": 0.6453125178813934, "rewards/format_reward": 1.0, "step": 524 }, { "completion_length": 78.734375, "epoch": 2.3972602739726026, "grad_norm": 2.850097179412842, "kl": 0.112548828125, "learning_rate": 7.602739726027397e-07, "loss": 0.0045, "reward": 1.6301960349082947, "reward_std": 0.18981467187404633, "rewards/accuracy_reward": 0.6458209455013275, "rewards/format_reward": 0.984375, "step": 525 }, { "completion_length": 62.109375, "epoch": 2.401826484018265, "grad_norm": 2.852185010910034, "kl": 0.16552734375, "learning_rate": 7.598173515981735e-07, "loss": 0.0066, "reward": 1.5752604007720947, "reward_std": 0.20407412946224213, "rewards/accuracy_reward": 0.5908854007720947, "rewards/format_reward": 0.984375, "step": 526 }, { "completion_length": 68.4765625, "epoch": 2.406392694063927, "grad_norm": 2.7563745975494385, "kl": 0.1240234375, "learning_rate": 7.593607305936073e-07, "loss": 0.005, "reward": 1.646093726158142, "reward_std": 0.255212739109993, "rewards/accuracy_reward": 0.6695312261581421, "rewards/format_reward": 0.9765625, "step": 527 }, { "completion_length": 83.1953125, "epoch": 2.410958904109589, "grad_norm": 3.005842924118042, "kl": 0.11328125, "learning_rate": 7.58904109589041e-07, "loss": 0.0045, "reward": 1.4694010615348816, "reward_std": 0.34128230810165405, "rewards/accuracy_reward": 0.516276016831398, "rewards/format_reward": 0.953125, "step": 528 }, { "completion_length": 79.171875, "epoch": 2.415525114155251, "grad_norm": 10.589370727539062, "kl": 0.40234375, "learning_rate": 7.584474885844749e-07, "loss": 0.0161, "reward": 1.7232915163040161, "reward_std": 0.16085164994001389, "rewards/accuracy_reward": 0.7311040163040161, "rewards/format_reward": 0.9921875, "step": 529 }, { "completion_length": 75.6484375, "epoch": 2.4200913242009134, "grad_norm": 2.8637847900390625, "kl": 0.1416015625, "learning_rate": 7.579908675799086e-07, "loss": 0.0057, "reward": 1.5583333373069763, "reward_std": 0.24188697338104248, "rewards/accuracy_reward": 0.5661458522081375, "rewards/format_reward": 0.9921875, "step": 530 }, { "completion_length": 81.7421875, "epoch": 2.4246575342465753, "grad_norm": 2.9744412899017334, "kl": 0.1015625, "learning_rate": 7.575342465753424e-07, "loss": 0.0041, "reward": 1.6184896230697632, "reward_std": 0.22770510613918304, "rewards/accuracy_reward": 0.6341145932674408, "rewards/format_reward": 0.984375, "step": 531 }, { "completion_length": 68.96875, "epoch": 2.4292237442922375, "grad_norm": 5.560794830322266, "kl": 0.1513671875, "learning_rate": 7.570776255707763e-07, "loss": 0.006, "reward": 1.6268364787101746, "reward_std": 0.25046712160110474, "rewards/accuracy_reward": 0.6346489191055298, "rewards/format_reward": 0.9921875, "step": 532 }, { "completion_length": 83.734375, "epoch": 2.4337899543378994, "grad_norm": 1.856094241142273, "kl": 0.103271484375, "learning_rate": 7.5662100456621e-07, "loss": 0.0041, "reward": 1.719410002231598, "reward_std": 0.18102595210075378, "rewards/accuracy_reward": 0.7272224426269531, "rewards/format_reward": 0.9921875, "step": 533 }, { "completion_length": 82.9765625, "epoch": 2.4383561643835616, "grad_norm": 1.8449293375015259, "kl": 0.12255859375, "learning_rate": 7.561643835616438e-07, "loss": 0.0049, "reward": 1.7729809880256653, "reward_std": 0.10142140835523605, "rewards/accuracy_reward": 0.7807934284210205, "rewards/format_reward": 0.9921875, "step": 534 }, { "completion_length": 76.6953125, "epoch": 2.442922374429224, "grad_norm": 4.276492595672607, "kl": 0.13427734375, "learning_rate": 7.557077625570776e-07, "loss": 0.0054, "reward": 1.584375023841858, "reward_std": 0.2730569392442703, "rewards/accuracy_reward": 0.6078124940395355, "rewards/format_reward": 0.9765625, "step": 535 }, { "completion_length": 84.6171875, "epoch": 2.4474885844748857, "grad_norm": 7.224926471710205, "kl": 0.120361328125, "learning_rate": 7.552511415525113e-07, "loss": 0.0048, "reward": 1.7129571437835693, "reward_std": 0.16479767858982086, "rewards/accuracy_reward": 0.7207695543766022, "rewards/format_reward": 0.9921875, "step": 536 }, { "completion_length": 81.859375, "epoch": 2.452054794520548, "grad_norm": 2.366180181503296, "kl": 0.10302734375, "learning_rate": 7.547945205479452e-07, "loss": 0.0041, "reward": 1.7357105612754822, "reward_std": 0.1823057383298874, "rewards/accuracy_reward": 0.7747730314731598, "rewards/format_reward": 0.9609375, "step": 537 }, { "completion_length": 92.140625, "epoch": 2.45662100456621, "grad_norm": 2.225709915161133, "kl": 0.17041015625, "learning_rate": 7.54337899543379e-07, "loss": 0.0068, "reward": 1.7205729484558105, "reward_std": 0.136265367269516, "rewards/accuracy_reward": 0.7205729186534882, "rewards/format_reward": 1.0, "step": 538 }, { "completion_length": 53.6796875, "epoch": 2.461187214611872, "grad_norm": 2.991917133331299, "kl": 0.181640625, "learning_rate": 7.538812785388127e-07, "loss": 0.0073, "reward": 1.531180500984192, "reward_std": 0.2481382116675377, "rewards/accuracy_reward": 0.5389930307865143, "rewards/format_reward": 0.9921875, "step": 539 }, { "completion_length": 72.734375, "epoch": 2.4657534246575343, "grad_norm": 5.530900001525879, "kl": 0.10205078125, "learning_rate": 7.534246575342466e-07, "loss": 0.0041, "reward": 1.705208420753479, "reward_std": 0.24594328552484512, "rewards/accuracy_reward": 0.7286458015441895, "rewards/format_reward": 0.9765625, "step": 540 }, { "completion_length": 69.7734375, "epoch": 2.470319634703196, "grad_norm": 2.618407964706421, "kl": 0.156982421875, "learning_rate": 7.529680365296803e-07, "loss": 0.0063, "reward": 1.7212890982627869, "reward_std": 0.19607724994421005, "rewards/accuracy_reward": 0.7369140684604645, "rewards/format_reward": 0.984375, "step": 541 }, { "completion_length": 66.2109375, "epoch": 2.4748858447488584, "grad_norm": 2.0617170333862305, "kl": 0.14208984375, "learning_rate": 7.525114155251141e-07, "loss": 0.0057, "reward": 1.6648437976837158, "reward_std": 0.1707550622522831, "rewards/accuracy_reward": 0.672656238079071, "rewards/format_reward": 0.9921875, "step": 542 }, { "completion_length": 77.1171875, "epoch": 2.4794520547945207, "grad_norm": 3.923844814300537, "kl": 0.188232421875, "learning_rate": 7.520547945205479e-07, "loss": 0.0076, "reward": 1.7651843428611755, "reward_std": 0.18746302276849747, "rewards/accuracy_reward": 0.7729967534542084, "rewards/format_reward": 0.9921875, "step": 543 }, { "completion_length": 83.75, "epoch": 2.4840182648401825, "grad_norm": 12.723535537719727, "kl": 0.096923828125, "learning_rate": 7.515981735159816e-07, "loss": 0.0039, "reward": 1.7339038252830505, "reward_std": 0.16746822372078896, "rewards/accuracy_reward": 0.7495287358760834, "rewards/format_reward": 0.984375, "step": 544 }, { "completion_length": 63.9921875, "epoch": 2.4885844748858448, "grad_norm": 2.9950413703918457, "kl": 0.130615234375, "learning_rate": 7.511415525114156e-07, "loss": 0.0052, "reward": 1.6598958373069763, "reward_std": 0.2278856635093689, "rewards/accuracy_reward": 0.6833333075046539, "rewards/format_reward": 0.9765625, "step": 545 }, { "completion_length": 82.7890625, "epoch": 2.493150684931507, "grad_norm": 2.3096249103546143, "kl": 0.090576171875, "learning_rate": 7.506849315068493e-07, "loss": 0.0036, "reward": 1.7203125953674316, "reward_std": 0.20905159413814545, "rewards/accuracy_reward": 0.7281249761581421, "rewards/format_reward": 0.9921875, "step": 546 }, { "completion_length": 80.84375, "epoch": 2.497716894977169, "grad_norm": 3.3282742500305176, "kl": 0.093994140625, "learning_rate": 7.50228310502283e-07, "loss": 0.0038, "reward": 1.543817937374115, "reward_std": 0.22718056291341782, "rewards/accuracy_reward": 0.5594429075717926, "rewards/format_reward": 0.984375, "step": 547 }, { "completion_length": 106.3046875, "epoch": 2.502283105022831, "grad_norm": 5.616523742675781, "kl": 0.08251953125, "learning_rate": 7.497716894977169e-07, "loss": 0.0033, "reward": 1.6008946895599365, "reward_std": 0.2659634053707123, "rewards/accuracy_reward": 0.6321446299552917, "rewards/format_reward": 0.96875, "step": 548 }, { "completion_length": 90.3203125, "epoch": 2.506849315068493, "grad_norm": 5.4013166427612305, "kl": 0.0994873046875, "learning_rate": 7.493150684931506e-07, "loss": 0.004, "reward": 1.7019531726837158, "reward_std": 0.17125242203474045, "rewards/accuracy_reward": 0.7175780534744263, "rewards/format_reward": 0.984375, "step": 549 }, { "completion_length": 86.5859375, "epoch": 2.5114155251141552, "grad_norm": 2.318875789642334, "kl": 0.134765625, "learning_rate": 7.488584474885844e-07, "loss": 0.0054, "reward": 1.6225537061691284, "reward_std": 0.2404860332608223, "rewards/accuracy_reward": 0.645991176366806, "rewards/format_reward": 0.9765625, "step": 550 }, { "completion_length": 92.1015625, "epoch": 2.5159817351598175, "grad_norm": 6.498478412628174, "kl": 0.092041015625, "learning_rate": 7.484018264840183e-07, "loss": 0.0037, "reward": 1.6539062857627869, "reward_std": 0.20888085663318634, "rewards/accuracy_reward": 0.6695312559604645, "rewards/format_reward": 0.984375, "step": 551 }, { "completion_length": 98.4375, "epoch": 2.5205479452054793, "grad_norm": 3.217737913131714, "kl": 0.0743408203125, "learning_rate": 7.47945205479452e-07, "loss": 0.003, "reward": 1.8604166507720947, "reward_std": 0.10430474206805229, "rewards/accuracy_reward": 0.8682291209697723, "rewards/format_reward": 0.9921875, "step": 552 }, { "completion_length": 89.34375, "epoch": 2.5251141552511416, "grad_norm": 1.3329360485076904, "kl": 0.12451171875, "learning_rate": 7.474885844748859e-07, "loss": 0.005, "reward": 1.842187523841858, "reward_std": 0.09198738168925047, "rewards/accuracy_reward": 0.8421874642372131, "rewards/format_reward": 1.0, "step": 553 }, { "completion_length": 89.234375, "epoch": 2.5296803652968034, "grad_norm": 3.4690394401550293, "kl": 0.1015625, "learning_rate": 7.470319634703196e-07, "loss": 0.0041, "reward": 1.7106770873069763, "reward_std": 0.19660182297229767, "rewards/accuracy_reward": 0.7184895873069763, "rewards/format_reward": 0.9921875, "step": 554 }, { "completion_length": 82.5859375, "epoch": 2.5342465753424657, "grad_norm": 3.373579740524292, "kl": 0.113037109375, "learning_rate": 7.465753424657533e-07, "loss": 0.0045, "reward": 1.5816163420677185, "reward_std": 0.20616846531629562, "rewards/accuracy_reward": 0.5816163718700409, "rewards/format_reward": 1.0, "step": 555 }, { "completion_length": 93.5078125, "epoch": 2.538812785388128, "grad_norm": 3.093045473098755, "kl": 0.1044921875, "learning_rate": 7.461187214611872e-07, "loss": 0.0042, "reward": 1.608422875404358, "reward_std": 0.21706774830818176, "rewards/accuracy_reward": 0.6240477561950684, "rewards/format_reward": 0.984375, "step": 556 }, { "completion_length": 97.0703125, "epoch": 2.54337899543379, "grad_norm": 2.3710570335388184, "kl": 0.095703125, "learning_rate": 7.456621004566209e-07, "loss": 0.0038, "reward": 1.800067663192749, "reward_std": 0.09500321745872498, "rewards/accuracy_reward": 0.8000677227973938, "rewards/format_reward": 1.0, "step": 557 }, { "completion_length": 71.671875, "epoch": 2.547945205479452, "grad_norm": 2.5310826301574707, "kl": 0.108154296875, "learning_rate": 7.452054794520548e-07, "loss": 0.0043, "reward": 1.5969815254211426, "reward_std": 0.2227228805422783, "rewards/accuracy_reward": 0.5969814956188202, "rewards/format_reward": 1.0, "step": 558 }, { "completion_length": 78.15625, "epoch": 2.5525114155251143, "grad_norm": 2.1528046131134033, "kl": 0.12451171875, "learning_rate": 7.447488584474886e-07, "loss": 0.005, "reward": 1.7443639636039734, "reward_std": 0.10288457944989204, "rewards/accuracy_reward": 0.752176433801651, "rewards/format_reward": 0.9921875, "step": 559 }, { "completion_length": 84.546875, "epoch": 2.557077625570776, "grad_norm": 2.656460762023926, "kl": 0.1171875, "learning_rate": 7.442922374429223e-07, "loss": 0.0047, "reward": 1.7289806604385376, "reward_std": 0.17229026556015015, "rewards/accuracy_reward": 0.7289806008338928, "rewards/format_reward": 1.0, "step": 560 }, { "completion_length": 81.8984375, "epoch": 2.5616438356164384, "grad_norm": 3.7874462604522705, "kl": 0.12548828125, "learning_rate": 7.438356164383562e-07, "loss": 0.005, "reward": 1.7026662230491638, "reward_std": 0.2061307728290558, "rewards/accuracy_reward": 0.7104786336421967, "rewards/format_reward": 0.9921875, "step": 561 }, { "completion_length": 69.5546875, "epoch": 2.5662100456621006, "grad_norm": 2.563668966293335, "kl": 0.1201171875, "learning_rate": 7.433789954337899e-07, "loss": 0.0048, "reward": 1.5755208134651184, "reward_std": 0.207001730799675, "rewards/accuracy_reward": 0.5833333730697632, "rewards/format_reward": 0.9921875, "step": 562 }, { "completion_length": 76.96875, "epoch": 2.5707762557077625, "grad_norm": 2.503657817840576, "kl": 0.098876953125, "learning_rate": 7.429223744292236e-07, "loss": 0.004, "reward": 1.7036305665969849, "reward_std": 0.16530893370509148, "rewards/accuracy_reward": 0.7036304771900177, "rewards/format_reward": 1.0, "step": 563 }, { "completion_length": 86.0078125, "epoch": 2.5753424657534247, "grad_norm": 4.6145734786987305, "kl": 0.087646484375, "learning_rate": 7.424657534246575e-07, "loss": 0.0035, "reward": 1.5936384201049805, "reward_std": 0.24451126903295517, "rewards/accuracy_reward": 0.6248884201049805, "rewards/format_reward": 0.96875, "step": 564 }, { "completion_length": 91.1171875, "epoch": 2.5799086757990866, "grad_norm": 3.2497313022613525, "kl": 0.116455078125, "learning_rate": 7.420091324200913e-07, "loss": 0.0047, "reward": 1.555757462978363, "reward_std": 0.2567907050251961, "rewards/accuracy_reward": 0.5870074331760406, "rewards/format_reward": 0.96875, "step": 565 }, { "completion_length": 89.8984375, "epoch": 2.584474885844749, "grad_norm": 2.177954912185669, "kl": 0.073974609375, "learning_rate": 7.415525114155251e-07, "loss": 0.003, "reward": 1.7282168865203857, "reward_std": 0.18810292333364487, "rewards/accuracy_reward": 0.7516542971134186, "rewards/format_reward": 0.9765625, "step": 566 }, { "completion_length": 59.671875, "epoch": 2.589041095890411, "grad_norm": 2.342167854309082, "kl": 0.12646484375, "learning_rate": 7.410958904109589e-07, "loss": 0.0051, "reward": 1.5524925589561462, "reward_std": 0.19321630895137787, "rewards/accuracy_reward": 0.5524925589561462, "rewards/format_reward": 1.0, "step": 567 }, { "completion_length": 81.0390625, "epoch": 2.593607305936073, "grad_norm": 25.185625076293945, "kl": 0.099609375, "learning_rate": 7.406392694063926e-07, "loss": 0.004, "reward": 1.685937523841858, "reward_std": 0.2477683424949646, "rewards/accuracy_reward": 0.6937499642372131, "rewards/format_reward": 0.9921875, "step": 568 }, { "completion_length": 86.671875, "epoch": 2.598173515981735, "grad_norm": 3.6345832347869873, "kl": 0.095458984375, "learning_rate": 7.401826484018265e-07, "loss": 0.0038, "reward": 1.6493847966194153, "reward_std": 0.20508087426424026, "rewards/accuracy_reward": 0.6493848264217377, "rewards/format_reward": 1.0, "step": 569 }, { "completion_length": 111.0, "epoch": 2.602739726027397, "grad_norm": 1.393759846687317, "kl": 0.06494140625, "learning_rate": 7.397260273972602e-07, "loss": 0.0026, "reward": 1.872656226158142, "reward_std": 0.10857155546545982, "rewards/accuracy_reward": 0.8960936367511749, "rewards/format_reward": 0.9765625, "step": 570 }, { "completion_length": 96.6875, "epoch": 2.6073059360730593, "grad_norm": 2.2273528575897217, "kl": 0.077880859375, "learning_rate": 7.39269406392694e-07, "loss": 0.0031, "reward": 1.8092340230941772, "reward_std": 0.10701124370098114, "rewards/accuracy_reward": 0.8170464634895325, "rewards/format_reward": 0.9921875, "step": 571 }, { "completion_length": 70.09375, "epoch": 2.6118721461187215, "grad_norm": 2.2020483016967773, "kl": 0.108154296875, "learning_rate": 7.388127853881279e-07, "loss": 0.0043, "reward": 1.5493839979171753, "reward_std": 0.19577539712190628, "rewards/accuracy_reward": 0.5493840277194977, "rewards/format_reward": 1.0, "step": 572 }, { "completion_length": 56.6875, "epoch": 2.616438356164384, "grad_norm": 8.005638122558594, "kl": 0.111572265625, "learning_rate": 7.383561643835616e-07, "loss": 0.0045, "reward": 1.457552194595337, "reward_std": 0.28487062454223633, "rewards/accuracy_reward": 0.45755207538604736, "rewards/format_reward": 1.0, "step": 573 }, { "completion_length": 67.3984375, "epoch": 2.6210045662100456, "grad_norm": 7.093777179718018, "kl": 0.1669921875, "learning_rate": 7.378995433789954e-07, "loss": 0.0067, "reward": 1.7374799847602844, "reward_std": 0.1953911855816841, "rewards/accuracy_reward": 0.745292454957962, "rewards/format_reward": 0.9921875, "step": 574 }, { "completion_length": 70.953125, "epoch": 2.625570776255708, "grad_norm": 2.916884183883667, "kl": 0.129150390625, "learning_rate": 7.374429223744292e-07, "loss": 0.0052, "reward": 1.5901844501495361, "reward_std": 0.2463785707950592, "rewards/accuracy_reward": 0.5901843905448914, "rewards/format_reward": 1.0, "step": 575 }, { "completion_length": 79.0859375, "epoch": 2.6301369863013697, "grad_norm": 7.461641311645508, "kl": 0.12890625, "learning_rate": 7.369863013698629e-07, "loss": 0.0052, "reward": 1.666406273841858, "reward_std": 0.24838291853666306, "rewards/accuracy_reward": 0.6742187440395355, "rewards/format_reward": 0.9921875, "step": 576 }, { "completion_length": 71.1796875, "epoch": 2.634703196347032, "grad_norm": 2.474581718444824, "kl": 0.125244140625, "learning_rate": 7.365296803652968e-07, "loss": 0.005, "reward": 1.6498884558677673, "reward_std": 0.15810929238796234, "rewards/accuracy_reward": 0.6498883664608002, "rewards/format_reward": 1.0, "step": 577 }, { "completion_length": 93.484375, "epoch": 2.6392694063926943, "grad_norm": 2.4049859046936035, "kl": 0.068359375, "learning_rate": 7.360730593607306e-07, "loss": 0.0027, "reward": 1.7192708849906921, "reward_std": 0.1887570172548294, "rewards/accuracy_reward": 0.742708295583725, "rewards/format_reward": 0.9765625, "step": 578 }, { "completion_length": 78.71875, "epoch": 2.643835616438356, "grad_norm": 12.206317901611328, "kl": 0.11962890625, "learning_rate": 7.356164383561643e-07, "loss": 0.0048, "reward": 1.6132813096046448, "reward_std": 0.19830159842967987, "rewards/accuracy_reward": 0.62890625, "rewards/format_reward": 0.984375, "step": 579 }, { "completion_length": 84.28125, "epoch": 2.6484018264840183, "grad_norm": 3.1371257305145264, "kl": 0.112060546875, "learning_rate": 7.351598173515982e-07, "loss": 0.0045, "reward": 1.7513157725334167, "reward_std": 0.13612205535173416, "rewards/accuracy_reward": 0.7513157725334167, "rewards/format_reward": 1.0, "step": 580 }, { "completion_length": 74.203125, "epoch": 2.65296803652968, "grad_norm": 4.9632792472839355, "kl": 0.21728515625, "learning_rate": 7.347031963470319e-07, "loss": 0.0087, "reward": 1.5989612340927124, "reward_std": 0.21190468221902847, "rewards/accuracy_reward": 0.5989611893892288, "rewards/format_reward": 1.0, "step": 581 }, { "completion_length": 56.5234375, "epoch": 2.6575342465753424, "grad_norm": 3.400489091873169, "kl": 0.1005859375, "learning_rate": 7.342465753424657e-07, "loss": 0.004, "reward": 1.5473958253860474, "reward_std": 0.194392129778862, "rewards/accuracy_reward": 0.5473958253860474, "rewards/format_reward": 1.0, "step": 582 }, { "completion_length": 56.2421875, "epoch": 2.6621004566210047, "grad_norm": 2.7040834426879883, "kl": 0.19140625, "learning_rate": 7.337899543378995e-07, "loss": 0.0077, "reward": 1.5410130023956299, "reward_std": 0.23740805685520172, "rewards/accuracy_reward": 0.5488254725933075, "rewards/format_reward": 0.9921875, "step": 583 }, { "completion_length": 103.8515625, "epoch": 2.6666666666666665, "grad_norm": 1.5765724182128906, "kl": 0.0721435546875, "learning_rate": 7.333333333333332e-07, "loss": 0.0029, "reward": 1.6796875, "reward_std": 0.134404756128788, "rewards/accuracy_reward": 0.6874999403953552, "rewards/format_reward": 0.9921875, "step": 584 }, { "completion_length": 74.1875, "epoch": 2.671232876712329, "grad_norm": 7.368083477020264, "kl": 0.115234375, "learning_rate": 7.328767123287672e-07, "loss": 0.0046, "reward": 1.6952009201049805, "reward_std": 0.23021817207336426, "rewards/accuracy_reward": 0.7030133903026581, "rewards/format_reward": 0.9921875, "step": 585 }, { "completion_length": 74.0703125, "epoch": 2.6757990867579906, "grad_norm": 4.19453239440918, "kl": 0.112548828125, "learning_rate": 7.324200913242009e-07, "loss": 0.0045, "reward": 1.7542436122894287, "reward_std": 0.16448176465928555, "rewards/accuracy_reward": 0.7620560824871063, "rewards/format_reward": 0.9921875, "step": 586 }, { "completion_length": 83.4453125, "epoch": 2.680365296803653, "grad_norm": 2.116567373275757, "kl": 0.1123046875, "learning_rate": 7.319634703196346e-07, "loss": 0.0045, "reward": 1.8114583492279053, "reward_std": 0.10706461034715176, "rewards/accuracy_reward": 0.8114582598209381, "rewards/format_reward": 1.0, "step": 587 }, { "completion_length": 74.1171875, "epoch": 2.684931506849315, "grad_norm": 3.840576648712158, "kl": 0.106689453125, "learning_rate": 7.315068493150685e-07, "loss": 0.0043, "reward": 1.5600694417953491, "reward_std": 0.20616772770881653, "rewards/accuracy_reward": 0.5600694417953491, "rewards/format_reward": 1.0, "step": 588 }, { "completion_length": 80.515625, "epoch": 2.6894977168949774, "grad_norm": 5.284098148345947, "kl": 0.10986328125, "learning_rate": 7.310502283105022e-07, "loss": 0.0044, "reward": 1.6452972888946533, "reward_std": 0.1739073097705841, "rewards/accuracy_reward": 0.6452972292900085, "rewards/format_reward": 1.0, "step": 589 }, { "completion_length": 80.0078125, "epoch": 2.6940639269406392, "grad_norm": 2.7230966091156006, "kl": 0.1328125, "learning_rate": 7.30593607305936e-07, "loss": 0.0053, "reward": 1.7089230418205261, "reward_std": 0.18773558735847473, "rewards/accuracy_reward": 0.7167355120182037, "rewards/format_reward": 0.9921875, "step": 590 }, { "completion_length": 65.5625, "epoch": 2.6986301369863015, "grad_norm": 3.291010618209839, "kl": 0.1162109375, "learning_rate": 7.301369863013699e-07, "loss": 0.0047, "reward": 1.646875023841858, "reward_std": 0.2957848533987999, "rewards/accuracy_reward": 0.6624999642372131, "rewards/format_reward": 0.984375, "step": 591 }, { "completion_length": 78.953125, "epoch": 2.7031963470319633, "grad_norm": 3.601032257080078, "kl": 0.126953125, "learning_rate": 7.296803652968036e-07, "loss": 0.0051, "reward": 1.6582031846046448, "reward_std": 0.2431168407201767, "rewards/accuracy_reward": 0.6738280951976776, "rewards/format_reward": 0.984375, "step": 592 }, { "completion_length": 75.515625, "epoch": 2.7077625570776256, "grad_norm": 3.7182750701904297, "kl": 0.111572265625, "learning_rate": 7.292237442922375e-07, "loss": 0.0045, "reward": 1.8291015625, "reward_std": 0.15765050053596497, "rewards/accuracy_reward": 0.8291015625, "rewards/format_reward": 1.0, "step": 593 }, { "completion_length": 65.7890625, "epoch": 2.712328767123288, "grad_norm": 3.52632737159729, "kl": 0.15966796875, "learning_rate": 7.287671232876712e-07, "loss": 0.0064, "reward": 1.6682049632072449, "reward_std": 0.22022631764411926, "rewards/accuracy_reward": 0.6682049036026001, "rewards/format_reward": 1.0, "step": 594 }, { "completion_length": 94.90625, "epoch": 2.7168949771689497, "grad_norm": 1.899457335472107, "kl": 0.091796875, "learning_rate": 7.283105022831049e-07, "loss": 0.0037, "reward": 1.6791666746139526, "reward_std": 0.18102534115314484, "rewards/accuracy_reward": 0.7104166448116302, "rewards/format_reward": 0.96875, "step": 595 }, { "completion_length": 80.5390625, "epoch": 2.721461187214612, "grad_norm": 2.4807851314544678, "kl": 0.113525390625, "learning_rate": 7.278538812785388e-07, "loss": 0.0045, "reward": 1.8067708611488342, "reward_std": 0.12291676551103592, "rewards/accuracy_reward": 0.8067708313465118, "rewards/format_reward": 1.0, "step": 596 }, { "completion_length": 83.34375, "epoch": 2.7260273972602738, "grad_norm": 2.2889952659606934, "kl": 0.11669921875, "learning_rate": 7.273972602739725e-07, "loss": 0.0047, "reward": 1.7340867519378662, "reward_std": 0.2301034778356552, "rewards/accuracy_reward": 0.7731491327285767, "rewards/format_reward": 0.9609375, "step": 597 }, { "completion_length": 76.90625, "epoch": 2.730593607305936, "grad_norm": 2.431467056274414, "kl": 0.141845703125, "learning_rate": 7.269406392694064e-07, "loss": 0.0057, "reward": 1.702616572380066, "reward_std": 0.16628245636820793, "rewards/accuracy_reward": 0.7026165425777435, "rewards/format_reward": 1.0, "step": 598 }, { "completion_length": 75.203125, "epoch": 2.7351598173515983, "grad_norm": 2.5654890537261963, "kl": 0.1119384765625, "learning_rate": 7.264840182648402e-07, "loss": 0.0045, "reward": 1.7242187857627869, "reward_std": 0.13443218544125557, "rewards/accuracy_reward": 0.7320312559604645, "rewards/format_reward": 0.9921875, "step": 599 }, { "completion_length": 77.8515625, "epoch": 2.73972602739726, "grad_norm": 2.4375691413879395, "kl": 0.097900390625, "learning_rate": 7.260273972602739e-07, "loss": 0.0039, "reward": 1.7069196701049805, "reward_std": 0.24784404039382935, "rewards/accuracy_reward": 0.7225446105003357, "rewards/format_reward": 0.984375, "step": 600 }, { "completion_length": 73.1953125, "epoch": 2.7442922374429224, "grad_norm": 2.0869359970092773, "kl": 0.1171875, "learning_rate": 7.255707762557078e-07, "loss": 0.0047, "reward": 1.6764508485794067, "reward_std": 0.11909351497888565, "rewards/accuracy_reward": 0.6764508485794067, "rewards/format_reward": 1.0, "step": 601 }, { "completion_length": 59.4765625, "epoch": 2.748858447488584, "grad_norm": 3.2937729358673096, "kl": 0.138427734375, "learning_rate": 7.251141552511415e-07, "loss": 0.0055, "reward": 1.538671851158142, "reward_std": 0.36779990792274475, "rewards/accuracy_reward": 0.5542968809604645, "rewards/format_reward": 0.984375, "step": 602 }, { "completion_length": 92.5703125, "epoch": 2.7534246575342465, "grad_norm": 3.537177085876465, "kl": 0.090087890625, "learning_rate": 7.246575342465752e-07, "loss": 0.0036, "reward": 1.722743034362793, "reward_std": 0.18279560655355453, "rewards/accuracy_reward": 0.730555534362793, "rewards/format_reward": 0.9921875, "step": 603 }, { "completion_length": 93.8046875, "epoch": 2.7579908675799087, "grad_norm": 2.128953218460083, "kl": 0.08056640625, "learning_rate": 7.242009132420091e-07, "loss": 0.0032, "reward": 1.745312511920929, "reward_std": 0.1521657407283783, "rewards/accuracy_reward": 0.7453123927116394, "rewards/format_reward": 1.0, "step": 604 }, { "completion_length": 65.9375, "epoch": 2.762557077625571, "grad_norm": 17.820329666137695, "kl": 0.10205078125, "learning_rate": 7.237442922374429e-07, "loss": 0.0041, "reward": 1.6083807349205017, "reward_std": 0.20848889648914337, "rewards/accuracy_reward": 0.6161931753158569, "rewards/format_reward": 0.9921875, "step": 605 }, { "completion_length": 71.671875, "epoch": 2.767123287671233, "grad_norm": 2.2198660373687744, "kl": 0.10009765625, "learning_rate": 7.232876712328767e-07, "loss": 0.004, "reward": 1.7286458611488342, "reward_std": 0.16050894185900688, "rewards/accuracy_reward": 0.7286458313465118, "rewards/format_reward": 1.0, "step": 606 }, { "completion_length": 84.546875, "epoch": 2.771689497716895, "grad_norm": 2.738107681274414, "kl": 0.11376953125, "learning_rate": 7.228310502283105e-07, "loss": 0.0046, "reward": 1.6790487170219421, "reward_std": 0.19785276055335999, "rewards/accuracy_reward": 0.6946736574172974, "rewards/format_reward": 0.984375, "step": 607 }, { "completion_length": 71.125, "epoch": 2.776255707762557, "grad_norm": 5.066852569580078, "kl": 0.151123046875, "learning_rate": 7.223744292237442e-07, "loss": 0.006, "reward": 1.5531622171401978, "reward_std": 0.3353729024529457, "rewards/accuracy_reward": 0.5609746873378754, "rewards/format_reward": 0.9921875, "step": 608 }, { "completion_length": 83.7890625, "epoch": 2.780821917808219, "grad_norm": 4.337226867675781, "kl": 0.117919921875, "learning_rate": 7.219178082191781e-07, "loss": 0.0047, "reward": 1.7048035860061646, "reward_std": 0.13883494585752487, "rewards/accuracy_reward": 0.7048035860061646, "rewards/format_reward": 1.0, "step": 609 }, { "completion_length": 87.3984375, "epoch": 2.7853881278538815, "grad_norm": 2.790894031524658, "kl": 0.09716796875, "learning_rate": 7.214611872146118e-07, "loss": 0.0039, "reward": 1.6720238327980042, "reward_std": 0.16603849083185196, "rewards/accuracy_reward": 0.6798363327980042, "rewards/format_reward": 0.9921875, "step": 610 }, { "completion_length": 79.4140625, "epoch": 2.7899543378995433, "grad_norm": 4.233171463012695, "kl": 0.1103515625, "learning_rate": 7.210045662100456e-07, "loss": 0.0044, "reward": 1.6883246898651123, "reward_std": 0.21825328469276428, "rewards/accuracy_reward": 0.6961371600627899, "rewards/format_reward": 0.9921875, "step": 611 }, { "completion_length": 63.875, "epoch": 2.7945205479452055, "grad_norm": 4.263243198394775, "kl": 0.138671875, "learning_rate": 7.205479452054795e-07, "loss": 0.0055, "reward": 1.6089910864830017, "reward_std": 0.2958259731531143, "rewards/accuracy_reward": 0.6246160566806793, "rewards/format_reward": 0.984375, "step": 612 }, { "completion_length": 72.21875, "epoch": 2.7990867579908674, "grad_norm": 2.3356006145477295, "kl": 0.110595703125, "learning_rate": 7.200913242009132e-07, "loss": 0.0044, "reward": 1.5526910424232483, "reward_std": 0.24793513119220734, "rewards/accuracy_reward": 0.5526909977197647, "rewards/format_reward": 1.0, "step": 613 }, { "completion_length": 100.8203125, "epoch": 2.8036529680365296, "grad_norm": 1.9555134773254395, "kl": 0.098388671875, "learning_rate": 7.19634703196347e-07, "loss": 0.0039, "reward": 1.7364583611488342, "reward_std": 0.15703274309635162, "rewards/accuracy_reward": 0.7442708015441895, "rewards/format_reward": 0.9921875, "step": 614 }, { "completion_length": 71.8984375, "epoch": 2.808219178082192, "grad_norm": 5.0422539710998535, "kl": 0.16455078125, "learning_rate": 7.191780821917808e-07, "loss": 0.0066, "reward": 1.4983445405960083, "reward_std": 0.2954416871070862, "rewards/accuracy_reward": 0.5061569809913635, "rewards/format_reward": 0.9921875, "step": 615 }, { "completion_length": 81.109375, "epoch": 2.8127853881278537, "grad_norm": 2.615528106689453, "kl": 0.123779296875, "learning_rate": 7.187214611872145e-07, "loss": 0.005, "reward": 1.685937523841858, "reward_std": 0.16778654977679253, "rewards/accuracy_reward": 0.6859375238418579, "rewards/format_reward": 1.0, "step": 616 }, { "completion_length": 78.1171875, "epoch": 2.817351598173516, "grad_norm": 3.2558462619781494, "kl": 0.099609375, "learning_rate": 7.182648401826484e-07, "loss": 0.004, "reward": 1.7515625953674316, "reward_std": 0.1005905494093895, "rewards/accuracy_reward": 0.7593749463558197, "rewards/format_reward": 0.9921875, "step": 617 }, { "completion_length": 85.3671875, "epoch": 2.821917808219178, "grad_norm": 1.791414499282837, "kl": 0.127197265625, "learning_rate": 7.178082191780822e-07, "loss": 0.0051, "reward": 1.6640625, "reward_std": 0.16250330954790115, "rewards/accuracy_reward": 0.6718749403953552, "rewards/format_reward": 0.9921875, "step": 618 }, { "completion_length": 88.2578125, "epoch": 2.82648401826484, "grad_norm": 5.740970134735107, "kl": 0.077392578125, "learning_rate": 7.173515981735159e-07, "loss": 0.0031, "reward": 1.652303397655487, "reward_std": 0.17974259704351425, "rewards/accuracy_reward": 0.6523034274578094, "rewards/format_reward": 1.0, "step": 619 }, { "completion_length": 95.4609375, "epoch": 2.8310502283105023, "grad_norm": 2.585156202316284, "kl": 0.115234375, "learning_rate": 7.168949771689498e-07, "loss": 0.0046, "reward": 1.6767844557762146, "reward_std": 0.14024699479341507, "rewards/accuracy_reward": 0.6845969557762146, "rewards/format_reward": 0.9921875, "step": 620 }, { "completion_length": 80.734375, "epoch": 2.8356164383561646, "grad_norm": 6.572816848754883, "kl": 0.138671875, "learning_rate": 7.164383561643835e-07, "loss": 0.0055, "reward": 1.6025669574737549, "reward_std": 0.22662456333637238, "rewards/accuracy_reward": 0.6181919574737549, "rewards/format_reward": 0.984375, "step": 621 }, { "completion_length": 62.4921875, "epoch": 2.8401826484018264, "grad_norm": 12.138286590576172, "kl": 0.16650390625, "learning_rate": 7.159817351598173e-07, "loss": 0.0067, "reward": 1.7107762694358826, "reward_std": 0.16279632598161697, "rewards/accuracy_reward": 0.7107762694358826, "rewards/format_reward": 1.0, "step": 622 }, { "completion_length": 82.625, "epoch": 2.8447488584474887, "grad_norm": 4.130621910095215, "kl": 0.110595703125, "learning_rate": 7.155251141552511e-07, "loss": 0.0044, "reward": 1.792187511920929, "reward_std": 0.12151552736759186, "rewards/accuracy_reward": 0.7921874523162842, "rewards/format_reward": 1.0, "step": 623 }, { "completion_length": 66.8203125, "epoch": 2.8493150684931505, "grad_norm": 7.18456506729126, "kl": 0.12841796875, "learning_rate": 7.150684931506848e-07, "loss": 0.0051, "reward": 1.5980710983276367, "reward_std": 0.21035503596067429, "rewards/accuracy_reward": 0.6058836281299591, "rewards/format_reward": 0.9921875, "step": 624 }, { "completion_length": 92.1171875, "epoch": 2.853881278538813, "grad_norm": 5.2410759925842285, "kl": 0.119140625, "learning_rate": 7.146118721461188e-07, "loss": 0.0048, "reward": 1.8228118419647217, "reward_std": 0.17064978182315826, "rewards/accuracy_reward": 0.8306242823600769, "rewards/format_reward": 0.9921875, "step": 625 }, { "completion_length": 100.2265625, "epoch": 2.858447488584475, "grad_norm": 12.312285423278809, "kl": 0.086669921875, "learning_rate": 7.141552511415525e-07, "loss": 0.0035, "reward": 1.5916666984558105, "reward_std": 0.2057085707783699, "rewards/accuracy_reward": 0.5994791686534882, "rewards/format_reward": 0.9921875, "step": 626 }, { "completion_length": 94.7109375, "epoch": 2.863013698630137, "grad_norm": 2.4348156452178955, "kl": 0.09423828125, "learning_rate": 7.136986301369862e-07, "loss": 0.0038, "reward": 1.71042400598526, "reward_std": 0.15129226446151733, "rewards/accuracy_reward": 0.7104238867759705, "rewards/format_reward": 1.0, "step": 627 }, { "completion_length": 76.09375, "epoch": 2.867579908675799, "grad_norm": 2.56550931930542, "kl": 0.15283203125, "learning_rate": 7.132420091324201e-07, "loss": 0.0061, "reward": 1.6796875, "reward_std": 0.1692301705479622, "rewards/accuracy_reward": 0.6796875298023224, "rewards/format_reward": 1.0, "step": 628 }, { "completion_length": 63.71875, "epoch": 2.872146118721461, "grad_norm": 3.254136562347412, "kl": 0.1396484375, "learning_rate": 7.127853881278538e-07, "loss": 0.0056, "reward": 1.5118472576141357, "reward_std": 0.22680865228176117, "rewards/accuracy_reward": 0.5118472576141357, "rewards/format_reward": 1.0, "step": 629 }, { "completion_length": 91.984375, "epoch": 2.8767123287671232, "grad_norm": 1.9678096771240234, "kl": 0.118896484375, "learning_rate": 7.123287671232876e-07, "loss": 0.0047, "reward": 1.7608563899993896, "reward_std": 0.10874464362859726, "rewards/accuracy_reward": 0.7608563899993896, "rewards/format_reward": 1.0, "step": 630 }, { "completion_length": 80.421875, "epoch": 2.8812785388127855, "grad_norm": 3.203822374343872, "kl": 0.14111328125, "learning_rate": 7.118721461187215e-07, "loss": 0.0056, "reward": 1.5107174515724182, "reward_std": 0.2804914563894272, "rewards/accuracy_reward": 0.526342436671257, "rewards/format_reward": 0.984375, "step": 631 }, { "completion_length": 69.8671875, "epoch": 2.8858447488584473, "grad_norm": 2.1312613487243652, "kl": 0.1513671875, "learning_rate": 7.114155251141552e-07, "loss": 0.0061, "reward": 1.783835530281067, "reward_std": 0.09552156459540129, "rewards/accuracy_reward": 0.7838355302810669, "rewards/format_reward": 1.0, "step": 632 }, { "completion_length": 83.6796875, "epoch": 2.8904109589041096, "grad_norm": 2.534517765045166, "kl": 0.129638671875, "learning_rate": 7.109589041095891e-07, "loss": 0.0052, "reward": 1.820498526096344, "reward_std": 0.14453133195638657, "rewards/accuracy_reward": 0.8204984366893768, "rewards/format_reward": 1.0, "step": 633 }, { "completion_length": 100.578125, "epoch": 2.8949771689497714, "grad_norm": 4.368940353393555, "kl": 0.137451171875, "learning_rate": 7.105022831050228e-07, "loss": 0.0055, "reward": 1.6578125357627869, "reward_std": 0.2853652313351631, "rewards/accuracy_reward": 0.6890624761581421, "rewards/format_reward": 0.96875, "step": 634 }, { "completion_length": 91.734375, "epoch": 2.8995433789954337, "grad_norm": 2.2129926681518555, "kl": 0.1015625, "learning_rate": 7.100456621004565e-07, "loss": 0.0041, "reward": 1.60442715883255, "reward_std": 0.17540115863084793, "rewards/accuracy_reward": 0.6200520992279053, "rewards/format_reward": 0.984375, "step": 635 }, { "completion_length": 71.65625, "epoch": 2.904109589041096, "grad_norm": 14.674283981323242, "kl": 0.1591796875, "learning_rate": 7.095890410958904e-07, "loss": 0.0064, "reward": 1.5524739027023315, "reward_std": 0.24804671853780746, "rewards/accuracy_reward": 0.5602864325046539, "rewards/format_reward": 0.9921875, "step": 636 }, { "completion_length": 83.1484375, "epoch": 2.908675799086758, "grad_norm": 4.1890034675598145, "kl": 0.119140625, "learning_rate": 7.091324200913241e-07, "loss": 0.0048, "reward": 1.7505208253860474, "reward_std": 0.1413591168820858, "rewards/accuracy_reward": 0.7583333253860474, "rewards/format_reward": 0.9921875, "step": 637 }, { "completion_length": 72.359375, "epoch": 2.91324200913242, "grad_norm": 4.4610443115234375, "kl": 0.1669921875, "learning_rate": 7.08675799086758e-07, "loss": 0.0067, "reward": 1.6082217693328857, "reward_std": 0.3458182215690613, "rewards/accuracy_reward": 0.647284209728241, "rewards/format_reward": 0.9609375, "step": 638 }, { "completion_length": 86.0390625, "epoch": 2.9178082191780823, "grad_norm": 4.549093723297119, "kl": 0.1357421875, "learning_rate": 7.082191780821918e-07, "loss": 0.0054, "reward": 1.6874799728393555, "reward_std": 0.2353600338101387, "rewards/accuracy_reward": 0.6874799132347107, "rewards/format_reward": 1.0, "step": 639 }, { "completion_length": 97.1484375, "epoch": 2.922374429223744, "grad_norm": 2.3335134983062744, "kl": 0.12646484375, "learning_rate": 7.077625570776255e-07, "loss": 0.0051, "reward": 1.7588170766830444, "reward_std": 0.14432461559772491, "rewards/accuracy_reward": 0.7666293978691101, "rewards/format_reward": 0.9921875, "step": 640 }, { "completion_length": 77.0078125, "epoch": 2.9269406392694064, "grad_norm": 3.070441961288452, "kl": 0.126220703125, "learning_rate": 7.073059360730594e-07, "loss": 0.005, "reward": 1.4704504013061523, "reward_std": 0.28115857392549515, "rewards/accuracy_reward": 0.47826285660266876, "rewards/format_reward": 0.9921875, "step": 641 }, { "completion_length": 92.9765625, "epoch": 2.9315068493150687, "grad_norm": 3.367562770843506, "kl": 0.101806640625, "learning_rate": 7.068493150684931e-07, "loss": 0.0041, "reward": 1.729687511920929, "reward_std": 0.12863079458475113, "rewards/accuracy_reward": 0.7374999523162842, "rewards/format_reward": 0.9921875, "step": 642 }, { "completion_length": 96.390625, "epoch": 2.9360730593607305, "grad_norm": 2.8609812259674072, "kl": 0.10302734375, "learning_rate": 7.063926940639268e-07, "loss": 0.0041, "reward": 1.749218761920929, "reward_std": 0.18227346241474152, "rewards/accuracy_reward": 0.7648437321186066, "rewards/format_reward": 0.984375, "step": 643 }, { "completion_length": 89.7734375, "epoch": 2.9406392694063928, "grad_norm": 1.9020359516143799, "kl": 0.090087890625, "learning_rate": 7.059360730593607e-07, "loss": 0.0036, "reward": 1.7804688215255737, "reward_std": 0.11121231690049171, "rewards/accuracy_reward": 0.7804686725139618, "rewards/format_reward": 1.0, "step": 644 }, { "completion_length": 99.03125, "epoch": 2.9452054794520546, "grad_norm": 9.120920181274414, "kl": 0.10595703125, "learning_rate": 7.054794520547945e-07, "loss": 0.0042, "reward": 1.7126150131225586, "reward_std": 0.20574645698070526, "rewards/accuracy_reward": 0.7282399535179138, "rewards/format_reward": 0.984375, "step": 645 }, { "completion_length": 74.2265625, "epoch": 2.949771689497717, "grad_norm": 5.297418594360352, "kl": 0.133056640625, "learning_rate": 7.050228310502283e-07, "loss": 0.0053, "reward": 1.6359771490097046, "reward_std": 0.23064683377742767, "rewards/accuracy_reward": 0.6672270596027374, "rewards/format_reward": 0.96875, "step": 646 }, { "completion_length": 102.6796875, "epoch": 2.954337899543379, "grad_norm": 1.925435185432434, "kl": 0.0947265625, "learning_rate": 7.045662100456621e-07, "loss": 0.0038, "reward": 1.8184895515441895, "reward_std": 0.14697792008519173, "rewards/accuracy_reward": 0.8341145217418671, "rewards/format_reward": 0.984375, "step": 647 }, { "completion_length": 93.6796875, "epoch": 2.958904109589041, "grad_norm": 8.05284309387207, "kl": 0.119140625, "learning_rate": 7.041095890410958e-07, "loss": 0.0048, "reward": 1.7860276699066162, "reward_std": 0.13688677921891212, "rewards/accuracy_reward": 0.793840080499649, "rewards/format_reward": 0.9921875, "step": 648 }, { "completion_length": 89.421875, "epoch": 2.963470319634703, "grad_norm": 2.366201162338257, "kl": 0.150390625, "learning_rate": 7.036529680365297e-07, "loss": 0.006, "reward": 1.6971353888511658, "reward_std": 0.16348732262849808, "rewards/accuracy_reward": 0.7049478888511658, "rewards/format_reward": 0.9921875, "step": 649 }, { "completion_length": 76.46875, "epoch": 2.968036529680365, "grad_norm": 2.8056042194366455, "kl": 0.1259765625, "learning_rate": 7.031963470319634e-07, "loss": 0.0051, "reward": 1.7359544038772583, "reward_std": 0.1912137269973755, "rewards/accuracy_reward": 0.7437668442726135, "rewards/format_reward": 0.9921875, "step": 650 }, { "completion_length": 74.015625, "epoch": 2.9726027397260273, "grad_norm": 2.3145570755004883, "kl": 0.11767578125, "learning_rate": 7.027397260273972e-07, "loss": 0.0047, "reward": 1.5831072330474854, "reward_std": 0.25112421810626984, "rewards/accuracy_reward": 0.5987322330474854, "rewards/format_reward": 0.984375, "step": 651 }, { "completion_length": 93.734375, "epoch": 2.9771689497716896, "grad_norm": 3.0146915912628174, "kl": 0.117431640625, "learning_rate": 7.022831050228311e-07, "loss": 0.0047, "reward": 1.6713745594024658, "reward_std": 0.21526063233613968, "rewards/accuracy_reward": 0.6791869699954987, "rewards/format_reward": 0.9921875, "step": 652 }, { "completion_length": 65.5078125, "epoch": 2.981735159817352, "grad_norm": 2.2300150394439697, "kl": 0.123046875, "learning_rate": 7.018264840182648e-07, "loss": 0.0049, "reward": 1.690625011920929, "reward_std": 0.16151440143585205, "rewards/accuracy_reward": 0.690625011920929, "rewards/format_reward": 1.0, "step": 653 }, { "completion_length": 78.2890625, "epoch": 2.9863013698630136, "grad_norm": 4.648037433624268, "kl": 0.126708984375, "learning_rate": 7.013698630136986e-07, "loss": 0.0051, "reward": 1.744691550731659, "reward_std": 0.14016081020236015, "rewards/accuracy_reward": 0.7603164613246918, "rewards/format_reward": 0.984375, "step": 654 }, { "completion_length": 67.203125, "epoch": 2.990867579908676, "grad_norm": 2.895372152328491, "kl": 0.139404296875, "learning_rate": 7.009132420091324e-07, "loss": 0.0056, "reward": 1.6064826250076294, "reward_std": 0.2562439739704132, "rewards/accuracy_reward": 0.6142950057983398, "rewards/format_reward": 0.9921875, "step": 655 }, { "completion_length": 78.765625, "epoch": 2.9954337899543377, "grad_norm": 3.7892091274261475, "kl": 0.093017578125, "learning_rate": 7.004566210045661e-07, "loss": 0.0037, "reward": 1.754687488079071, "reward_std": 0.1408282183110714, "rewards/accuracy_reward": 0.754687488079071, "rewards/format_reward": 1.0, "step": 656 }, { "completion_length": 58.5, "epoch": 3.0, "grad_norm": 3.69757080078125, "kl": 0.18359375, "learning_rate": 7e-07, "loss": 0.0056, "reward": 1.5, "reward_std": 0.1767766922712326, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 1.0, "step": 657 }, { "completion_length": 73.21875, "epoch": 3.0045662100456623, "grad_norm": 2.8494434356689453, "kl": 0.12744140625, "learning_rate": 6.995433789954338e-07, "loss": 0.0051, "reward": 1.7073844075202942, "reward_std": 0.18213152885437012, "rewards/accuracy_reward": 0.7151968777179718, "rewards/format_reward": 0.9921875, "step": 658 }, { "completion_length": 73.71875, "epoch": 3.009132420091324, "grad_norm": 2.561201810836792, "kl": 0.099609375, "learning_rate": 6.990867579908675e-07, "loss": 0.004, "reward": 1.6927083730697632, "reward_std": 0.17407646402716637, "rewards/accuracy_reward": 0.6927083134651184, "rewards/format_reward": 1.0, "step": 659 }, { "completion_length": 98.328125, "epoch": 3.0136986301369864, "grad_norm": 3.664125442504883, "kl": 0.138671875, "learning_rate": 6.986301369863014e-07, "loss": 0.0055, "reward": 1.6324912905693054, "reward_std": 0.21703079342842102, "rewards/accuracy_reward": 0.6403037309646606, "rewards/format_reward": 0.9921875, "step": 660 }, { "completion_length": 94.46875, "epoch": 3.018264840182648, "grad_norm": 2.527571439743042, "kl": 0.1181640625, "learning_rate": 6.981735159817351e-07, "loss": 0.0047, "reward": 1.7589489221572876, "reward_std": 0.16745695658028126, "rewards/accuracy_reward": 0.7745738625526428, "rewards/format_reward": 0.984375, "step": 661 }, { "completion_length": 82.1953125, "epoch": 3.0228310502283104, "grad_norm": 2.676431894302368, "kl": 0.14599609375, "learning_rate": 6.977168949771689e-07, "loss": 0.0058, "reward": 1.681512713432312, "reward_std": 0.18309402465820312, "rewards/accuracy_reward": 0.6971376836299896, "rewards/format_reward": 0.984375, "step": 662 }, { "completion_length": 97.671875, "epoch": 3.0273972602739727, "grad_norm": 2.549351453781128, "kl": 0.078125, "learning_rate": 6.972602739726027e-07, "loss": 0.0031, "reward": 1.6531250476837158, "reward_std": 0.24489019811153412, "rewards/accuracy_reward": 0.6687500178813934, "rewards/format_reward": 0.984375, "step": 663 }, { "completion_length": 76.59375, "epoch": 3.0319634703196345, "grad_norm": 1.8996092081069946, "kl": 0.147216796875, "learning_rate": 6.968036529680364e-07, "loss": 0.0059, "reward": 1.734002947807312, "reward_std": 0.1543407365679741, "rewards/accuracy_reward": 0.7496279180049896, "rewards/format_reward": 0.984375, "step": 664 }, { "completion_length": 62.4453125, "epoch": 3.036529680365297, "grad_norm": 2.1863508224487305, "kl": 0.1217041015625, "learning_rate": 6.963470319634704e-07, "loss": 0.0049, "reward": 1.771484375, "reward_std": 0.08306973986327648, "rewards/accuracy_reward": 0.7714843153953552, "rewards/format_reward": 1.0, "step": 665 }, { "completion_length": 79.71875, "epoch": 3.041095890410959, "grad_norm": 2.9316976070404053, "kl": 0.129150390625, "learning_rate": 6.958904109589041e-07, "loss": 0.0052, "reward": 1.6214410066604614, "reward_std": 0.2264525145292282, "rewards/accuracy_reward": 0.6370659470558167, "rewards/format_reward": 0.984375, "step": 666 }, { "completion_length": 90.0546875, "epoch": 3.045662100456621, "grad_norm": 2.6280527114868164, "kl": 0.101806640625, "learning_rate": 6.954337899543378e-07, "loss": 0.0041, "reward": 1.7546875476837158, "reward_std": 0.1054728776216507, "rewards/accuracy_reward": 0.7546874284744263, "rewards/format_reward": 1.0, "step": 667 }, { "completion_length": 79.453125, "epoch": 3.050228310502283, "grad_norm": 5.255128383636475, "kl": 0.10546875, "learning_rate": 6.949771689497717e-07, "loss": 0.0042, "reward": 1.5757898688316345, "reward_std": 0.19272886961698532, "rewards/accuracy_reward": 0.5836023390293121, "rewards/format_reward": 0.9921875, "step": 668 }, { "completion_length": 96.90625, "epoch": 3.0547945205479454, "grad_norm": 2.3194217681884766, "kl": 0.077392578125, "learning_rate": 6.945205479452054e-07, "loss": 0.0031, "reward": 1.770312488079071, "reward_std": 0.11202363669872284, "rewards/accuracy_reward": 0.7703124582767487, "rewards/format_reward": 1.0, "step": 669 }, { "completion_length": 80.734375, "epoch": 3.0593607305936072, "grad_norm": 3.282161235809326, "kl": 0.163330078125, "learning_rate": 6.940639269406392e-07, "loss": 0.0065, "reward": 1.7177083492279053, "reward_std": 0.16301878169178963, "rewards/accuracy_reward": 0.7177082896232605, "rewards/format_reward": 1.0, "step": 670 }, { "completion_length": 83.0, "epoch": 3.0639269406392695, "grad_norm": 3.282697916030884, "kl": 0.12744140625, "learning_rate": 6.93607305936073e-07, "loss": 0.0051, "reward": 1.8043155074119568, "reward_std": 0.11800633184611797, "rewards/accuracy_reward": 0.8043154180049896, "rewards/format_reward": 1.0, "step": 671 }, { "completion_length": 80.8046875, "epoch": 3.0684931506849313, "grad_norm": 2.545616388320923, "kl": 0.09814453125, "learning_rate": 6.931506849315068e-07, "loss": 0.0039, "reward": 1.7645833492279053, "reward_std": 0.14983439445495605, "rewards/accuracy_reward": 0.7802082598209381, "rewards/format_reward": 0.984375, "step": 672 }, { "completion_length": 75.546875, "epoch": 3.0730593607305936, "grad_norm": 2.007420778274536, "kl": 0.1240234375, "learning_rate": 6.926940639269407e-07, "loss": 0.005, "reward": 1.5703125596046448, "reward_std": 0.21254336833953857, "rewards/accuracy_reward": 0.5703125, "rewards/format_reward": 1.0, "step": 673 }, { "completion_length": 74.5234375, "epoch": 3.077625570776256, "grad_norm": 3.1193904876708984, "kl": 0.119873046875, "learning_rate": 6.922374429223744e-07, "loss": 0.0048, "reward": 1.6553664803504944, "reward_std": 0.17046189308166504, "rewards/accuracy_reward": 0.6631789207458496, "rewards/format_reward": 0.9921875, "step": 674 }, { "completion_length": 85.1484375, "epoch": 3.0821917808219177, "grad_norm": 2.9060676097869873, "kl": 0.105712890625, "learning_rate": 6.917808219178081e-07, "loss": 0.0042, "reward": 1.5601562857627869, "reward_std": 0.2604144960641861, "rewards/accuracy_reward": 0.5835937559604645, "rewards/format_reward": 0.9765625, "step": 675 }, { "completion_length": 92.7109375, "epoch": 3.08675799086758, "grad_norm": 3.6577494144439697, "kl": 0.0640869140625, "learning_rate": 6.91324200913242e-07, "loss": 0.0026, "reward": 1.6146825551986694, "reward_std": 0.10726364329457283, "rewards/accuracy_reward": 0.614682525396347, "rewards/format_reward": 1.0, "step": 676 }, { "completion_length": 76.6875, "epoch": 3.091324200913242, "grad_norm": 9.413636207580566, "kl": 0.13818359375, "learning_rate": 6.908675799086757e-07, "loss": 0.0055, "reward": 1.6631696820259094, "reward_std": 0.2051173821091652, "rewards/accuracy_reward": 0.6709820926189423, "rewards/format_reward": 0.9921875, "step": 677 }, { "completion_length": 75.2734375, "epoch": 3.095890410958904, "grad_norm": 5.669606685638428, "kl": 0.119140625, "learning_rate": 6.904109589041097e-07, "loss": 0.0048, "reward": 1.5031325817108154, "reward_std": 0.2111596167087555, "rewards/accuracy_reward": 0.5109450221061707, "rewards/format_reward": 0.9921875, "step": 678 }, { "completion_length": 74.1796875, "epoch": 3.1004566210045663, "grad_norm": 2.975776433944702, "kl": 0.1220703125, "learning_rate": 6.899543378995434e-07, "loss": 0.0049, "reward": 1.567187488079071, "reward_std": 0.26324766874313354, "rewards/accuracy_reward": 0.582812488079071, "rewards/format_reward": 0.984375, "step": 679 }, { "completion_length": 84.171875, "epoch": 3.105022831050228, "grad_norm": 2.021631956100464, "kl": 0.09765625, "learning_rate": 6.894977168949771e-07, "loss": 0.0039, "reward": 1.723046898841858, "reward_std": 0.14361856132745743, "rewards/accuracy_reward": 0.7230468392372131, "rewards/format_reward": 1.0, "step": 680 }, { "completion_length": 82.0390625, "epoch": 3.1095890410958904, "grad_norm": 4.33884334564209, "kl": 0.108642578125, "learning_rate": 6.89041095890411e-07, "loss": 0.0043, "reward": 1.6755682229995728, "reward_std": 0.11651190742850304, "rewards/accuracy_reward": 0.6755681037902832, "rewards/format_reward": 1.0, "step": 681 }, { "completion_length": 86.203125, "epoch": 3.1141552511415527, "grad_norm": 1.7685151100158691, "kl": 0.099853515625, "learning_rate": 6.885844748858447e-07, "loss": 0.004, "reward": 1.71484375, "reward_std": 0.15246989578008652, "rewards/accuracy_reward": 0.72265625, "rewards/format_reward": 0.9921875, "step": 682 }, { "completion_length": 74.515625, "epoch": 3.1187214611872145, "grad_norm": 3.724857807159424, "kl": 0.136962890625, "learning_rate": 6.881278538812784e-07, "loss": 0.0055, "reward": 1.6691706776618958, "reward_std": 0.20630130916833878, "rewards/accuracy_reward": 0.6691707074642181, "rewards/format_reward": 1.0, "step": 683 }, { "completion_length": 88.515625, "epoch": 3.1232876712328768, "grad_norm": 1.960091471672058, "kl": 0.08740234375, "learning_rate": 6.876712328767123e-07, "loss": 0.0035, "reward": 1.7096437811851501, "reward_std": 0.1030731052160263, "rewards/accuracy_reward": 0.709643691778183, "rewards/format_reward": 1.0, "step": 684 }, { "completion_length": 69.921875, "epoch": 3.127853881278539, "grad_norm": 4.404357433319092, "kl": 0.11376953125, "learning_rate": 6.872146118721461e-07, "loss": 0.0046, "reward": 1.7164434790611267, "reward_std": 0.16906945407390594, "rewards/accuracy_reward": 0.7164434790611267, "rewards/format_reward": 1.0, "step": 685 }, { "completion_length": 106.78125, "epoch": 3.132420091324201, "grad_norm": 1.5101971626281738, "kl": 0.073974609375, "learning_rate": 6.867579908675799e-07, "loss": 0.003, "reward": 1.798065423965454, "reward_std": 0.0902152806520462, "rewards/accuracy_reward": 0.8058778941631317, "rewards/format_reward": 0.9921875, "step": 686 }, { "completion_length": 76.1953125, "epoch": 3.136986301369863, "grad_norm": 2.8091230392456055, "kl": 0.16064453125, "learning_rate": 6.863013698630137e-07, "loss": 0.0064, "reward": 1.5283854007720947, "reward_std": 0.21707772463560104, "rewards/accuracy_reward": 0.5283854007720947, "rewards/format_reward": 1.0, "step": 687 }, { "completion_length": 87.125, "epoch": 3.141552511415525, "grad_norm": 6.1914591789245605, "kl": 0.078857421875, "learning_rate": 6.858447488584474e-07, "loss": 0.0032, "reward": 1.7078726291656494, "reward_std": 0.15342054888606071, "rewards/accuracy_reward": 0.7156850397586823, "rewards/format_reward": 0.9921875, "step": 688 }, { "completion_length": 101.140625, "epoch": 3.146118721461187, "grad_norm": 2.486093521118164, "kl": 0.076904296875, "learning_rate": 6.853881278538813e-07, "loss": 0.0031, "reward": 1.8033654689788818, "reward_std": 0.08035113476216793, "rewards/accuracy_reward": 0.8033653199672699, "rewards/format_reward": 1.0, "step": 689 }, { "completion_length": 72.546875, "epoch": 3.1506849315068495, "grad_norm": 4.52701473236084, "kl": 0.13427734375, "learning_rate": 6.84931506849315e-07, "loss": 0.0054, "reward": 1.9150173664093018, "reward_std": 0.12145426124334335, "rewards/accuracy_reward": 0.9228298366069794, "rewards/format_reward": 0.9921875, "step": 690 }, { "completion_length": 77.0625, "epoch": 3.1552511415525113, "grad_norm": 2.9900949001312256, "kl": 0.124755859375, "learning_rate": 6.844748858447487e-07, "loss": 0.005, "reward": 1.6368862390518188, "reward_std": 0.17363014817237854, "rewards/accuracy_reward": 0.6446986198425293, "rewards/format_reward": 0.9921875, "step": 691 }, { "completion_length": 91.0859375, "epoch": 3.1598173515981736, "grad_norm": 2.327423095703125, "kl": 0.099609375, "learning_rate": 6.840182648401827e-07, "loss": 0.004, "reward": 1.682031273841858, "reward_std": 0.17397383973002434, "rewards/accuracy_reward": 0.7054687440395355, "rewards/format_reward": 0.9765625, "step": 692 }, { "completion_length": 67.2578125, "epoch": 3.1643835616438354, "grad_norm": 2.4474422931671143, "kl": 0.133544921875, "learning_rate": 6.835616438356164e-07, "loss": 0.0053, "reward": 1.6786458492279053, "reward_std": 0.16388440132141113, "rewards/accuracy_reward": 0.6786458194255829, "rewards/format_reward": 1.0, "step": 693 }, { "completion_length": 83.015625, "epoch": 3.1689497716894977, "grad_norm": 12.002235412597656, "kl": 0.1435546875, "learning_rate": 6.831050228310502e-07, "loss": 0.0057, "reward": 1.6534380912780762, "reward_std": 0.19905856251716614, "rewards/accuracy_reward": 0.6534381806850433, "rewards/format_reward": 1.0, "step": 694 }, { "completion_length": 72.671875, "epoch": 3.17351598173516, "grad_norm": 4.494046211242676, "kl": 0.127197265625, "learning_rate": 6.82648401826484e-07, "loss": 0.0051, "reward": 1.8556300401687622, "reward_std": 0.14538883790373802, "rewards/accuracy_reward": 0.8556298911571503, "rewards/format_reward": 1.0, "step": 695 }, { "completion_length": 86.609375, "epoch": 3.1780821917808217, "grad_norm": 1.9116896390914917, "kl": 0.103759765625, "learning_rate": 6.821917808219177e-07, "loss": 0.0041, "reward": 1.7070313096046448, "reward_std": 0.19209937751293182, "rewards/accuracy_reward": 0.7148437201976776, "rewards/format_reward": 0.9921875, "step": 696 }, { "completion_length": 87.53125, "epoch": 3.182648401826484, "grad_norm": 6.58709192276001, "kl": 0.1005859375, "learning_rate": 6.817351598173516e-07, "loss": 0.004, "reward": 1.7153645753860474, "reward_std": 0.13286828622221947, "rewards/accuracy_reward": 0.7153644859790802, "rewards/format_reward": 1.0, "step": 697 }, { "completion_length": 70.234375, "epoch": 3.1872146118721463, "grad_norm": 2.434600591659546, "kl": 0.145751953125, "learning_rate": 6.812785388127854e-07, "loss": 0.0058, "reward": 1.7907050848007202, "reward_std": 0.12088606879115105, "rewards/accuracy_reward": 0.7907051146030426, "rewards/format_reward": 1.0, "step": 698 }, { "completion_length": 68.5, "epoch": 3.191780821917808, "grad_norm": 26.128162384033203, "kl": 0.126953125, "learning_rate": 6.808219178082191e-07, "loss": 0.0051, "reward": 1.715624988079071, "reward_std": 0.2467075139284134, "rewards/accuracy_reward": 0.715624988079071, "rewards/format_reward": 1.0, "step": 699 }, { "completion_length": 87.078125, "epoch": 3.1963470319634704, "grad_norm": 3.1288247108459473, "kl": 0.14404296875, "learning_rate": 6.80365296803653e-07, "loss": 0.0057, "reward": 1.860937476158142, "reward_std": 0.08299508690834045, "rewards/accuracy_reward": 0.8687499463558197, "rewards/format_reward": 0.9921875, "step": 700 }, { "completion_length": 67.5, "epoch": 3.2009132420091326, "grad_norm": 7.457474708557129, "kl": 0.1416015625, "learning_rate": 6.799086757990867e-07, "loss": 0.0057, "reward": 1.6931147575378418, "reward_std": 0.18138662725687027, "rewards/accuracy_reward": 0.6931147575378418, "rewards/format_reward": 1.0, "step": 701 }, { "completion_length": 78.4921875, "epoch": 3.2054794520547945, "grad_norm": 3.00228214263916, "kl": 0.182861328125, "learning_rate": 6.794520547945205e-07, "loss": 0.0073, "reward": 1.7272436618804932, "reward_std": 0.18743212521076202, "rewards/accuracy_reward": 0.727243572473526, "rewards/format_reward": 1.0, "step": 702 }, { "completion_length": 83.515625, "epoch": 3.2100456621004567, "grad_norm": 2.2364189624786377, "kl": 0.10693359375, "learning_rate": 6.789954337899543e-07, "loss": 0.0043, "reward": 1.7500601410865784, "reward_std": 0.10543964058160782, "rewards/accuracy_reward": 0.7500600218772888, "rewards/format_reward": 1.0, "step": 703 }, { "completion_length": 59.78125, "epoch": 3.2146118721461185, "grad_norm": 15.17089557647705, "kl": 0.64404296875, "learning_rate": 6.78538812785388e-07, "loss": 0.0258, "reward": 1.5231274366378784, "reward_std": 0.3583277612924576, "rewards/accuracy_reward": 0.5309399664402008, "rewards/format_reward": 0.9921875, "step": 704 }, { "completion_length": 85.2734375, "epoch": 3.219178082191781, "grad_norm": 1.857921838760376, "kl": 0.121826171875, "learning_rate": 6.78082191780822e-07, "loss": 0.0049, "reward": 1.8244792222976685, "reward_std": 0.08049174584448338, "rewards/accuracy_reward": 0.8244791626930237, "rewards/format_reward": 1.0, "step": 705 }, { "completion_length": 81.7578125, "epoch": 3.223744292237443, "grad_norm": 6.063529014587402, "kl": 0.11328125, "learning_rate": 6.776255707762557e-07, "loss": 0.0045, "reward": 1.7606770992279053, "reward_std": 0.21317215263843536, "rewards/accuracy_reward": 0.7684895098209381, "rewards/format_reward": 0.9921875, "step": 706 }, { "completion_length": 86.984375, "epoch": 3.228310502283105, "grad_norm": 1.6526107788085938, "kl": 0.1376953125, "learning_rate": 6.771689497716894e-07, "loss": 0.0055, "reward": 1.6617188453674316, "reward_std": 0.175977885723114, "rewards/accuracy_reward": 0.6695312261581421, "rewards/format_reward": 0.9921875, "step": 707 }, { "completion_length": 90.5234375, "epoch": 3.232876712328767, "grad_norm": 3.4753360748291016, "kl": 0.090087890625, "learning_rate": 6.767123287671233e-07, "loss": 0.0036, "reward": 1.7570313215255737, "reward_std": 0.09644587151706219, "rewards/accuracy_reward": 0.7570312023162842, "rewards/format_reward": 1.0, "step": 708 }, { "completion_length": 78.34375, "epoch": 3.237442922374429, "grad_norm": 5.928980350494385, "kl": 0.1376953125, "learning_rate": 6.76255707762557e-07, "loss": 0.0055, "reward": 1.7825521230697632, "reward_std": 0.19306360930204391, "rewards/accuracy_reward": 0.7825520932674408, "rewards/format_reward": 1.0, "step": 709 }, { "completion_length": 78.015625, "epoch": 3.2420091324200913, "grad_norm": 5.787877559661865, "kl": 0.098876953125, "learning_rate": 6.757990867579907e-07, "loss": 0.004, "reward": 1.6989798545837402, "reward_std": 0.22939839959144592, "rewards/accuracy_reward": 0.7146047651767731, "rewards/format_reward": 0.984375, "step": 710 }, { "completion_length": 73.984375, "epoch": 3.2465753424657535, "grad_norm": 1.9611449241638184, "kl": 0.135986328125, "learning_rate": 6.753424657534246e-07, "loss": 0.0054, "reward": 1.7150809168815613, "reward_std": 0.12537125870585442, "rewards/accuracy_reward": 0.7228934466838837, "rewards/format_reward": 0.9921875, "step": 711 }, { "completion_length": 76.640625, "epoch": 3.2511415525114153, "grad_norm": 2.202395439147949, "kl": 0.15673828125, "learning_rate": 6.748858447488584e-07, "loss": 0.0063, "reward": 1.7408854365348816, "reward_std": 0.18467864021658897, "rewards/accuracy_reward": 0.7486979365348816, "rewards/format_reward": 0.9921875, "step": 712 }, { "completion_length": 91.578125, "epoch": 3.2557077625570776, "grad_norm": 2.922081232070923, "kl": 0.075439453125, "learning_rate": 6.744292237442923e-07, "loss": 0.003, "reward": 1.6480501890182495, "reward_std": 0.08936248533427715, "rewards/accuracy_reward": 0.6480501890182495, "rewards/format_reward": 1.0, "step": 713 }, { "completion_length": 97.6953125, "epoch": 3.26027397260274, "grad_norm": 1.8138573169708252, "kl": 0.0810546875, "learning_rate": 6.73972602739726e-07, "loss": 0.0032, "reward": 1.678125023841858, "reward_std": 0.17582245916128159, "rewards/accuracy_reward": 0.7015624642372131, "rewards/format_reward": 0.9765625, "step": 714 }, { "completion_length": 81.609375, "epoch": 3.2648401826484017, "grad_norm": 2.29533052444458, "kl": 0.117919921875, "learning_rate": 6.735159817351597e-07, "loss": 0.0047, "reward": 1.5667868852615356, "reward_std": 0.23160236328840256, "rewards/accuracy_reward": 0.5902243554592133, "rewards/format_reward": 0.9765625, "step": 715 }, { "completion_length": 79.046875, "epoch": 3.269406392694064, "grad_norm": 3.5871529579162598, "kl": 0.1748046875, "learning_rate": 6.730593607305936e-07, "loss": 0.007, "reward": 1.5300781726837158, "reward_std": 0.381600484251976, "rewards/accuracy_reward": 0.592578113079071, "rewards/format_reward": 0.9375, "step": 716 }, { "completion_length": 81.21875, "epoch": 3.2739726027397262, "grad_norm": 7.025841236114502, "kl": 0.177734375, "learning_rate": 6.726027397260273e-07, "loss": 0.0071, "reward": 1.5895833373069763, "reward_std": 0.3286707103252411, "rewards/accuracy_reward": 0.6442708373069763, "rewards/format_reward": 0.9453125, "step": 717 }, { "completion_length": 79.859375, "epoch": 3.278538812785388, "grad_norm": 3.0471503734588623, "kl": 0.099609375, "learning_rate": 6.721461187214613e-07, "loss": 0.004, "reward": 1.6017058491706848, "reward_std": 0.18348699063062668, "rewards/accuracy_reward": 0.6095183193683624, "rewards/format_reward": 0.9921875, "step": 718 }, { "completion_length": 78.8515625, "epoch": 3.2831050228310503, "grad_norm": 2.4444265365600586, "kl": 0.09912109375, "learning_rate": 6.71689497716895e-07, "loss": 0.004, "reward": 1.5035117268562317, "reward_std": 0.2922291085124016, "rewards/accuracy_reward": 0.5191366672515869, "rewards/format_reward": 0.984375, "step": 719 }, { "completion_length": 80.515625, "epoch": 3.287671232876712, "grad_norm": 4.203682899475098, "kl": 0.11962890625, "learning_rate": 6.712328767123287e-07, "loss": 0.0048, "reward": 1.5486140251159668, "reward_std": 0.18936936557292938, "rewards/accuracy_reward": 0.5564264357089996, "rewards/format_reward": 0.9921875, "step": 720 }, { "completion_length": 90.1015625, "epoch": 3.2922374429223744, "grad_norm": 3.4824612140655518, "kl": 0.103759765625, "learning_rate": 6.707762557077626e-07, "loss": 0.0042, "reward": 1.7547819018363953, "reward_std": 0.14520251005887985, "rewards/accuracy_reward": 0.7782192826271057, "rewards/format_reward": 0.9765625, "step": 721 }, { "completion_length": 71.7734375, "epoch": 3.2968036529680367, "grad_norm": 1.9310115575790405, "kl": 0.18310546875, "learning_rate": 6.703196347031963e-07, "loss": 0.0073, "reward": 1.494028091430664, "reward_std": 0.24336419254541397, "rewards/accuracy_reward": 0.5174656212329865, "rewards/format_reward": 0.9765625, "step": 722 }, { "completion_length": 87.65625, "epoch": 3.3013698630136985, "grad_norm": 2.703258991241455, "kl": 0.1064453125, "learning_rate": 6.6986301369863e-07, "loss": 0.0043, "reward": 1.6925916075706482, "reward_std": 0.22123625874519348, "rewards/accuracy_reward": 0.7238415777683258, "rewards/format_reward": 0.96875, "step": 723 }, { "completion_length": 86.796875, "epoch": 3.3059360730593608, "grad_norm": 1.1868500709533691, "kl": 0.10107421875, "learning_rate": 6.694063926940639e-07, "loss": 0.004, "reward": 1.868213951587677, "reward_std": 0.16314804006833583, "rewards/accuracy_reward": 0.8916513323783875, "rewards/format_reward": 0.9765625, "step": 724 }, { "completion_length": 89.765625, "epoch": 3.3105022831050226, "grad_norm": 1.9978954792022705, "kl": 0.09814453125, "learning_rate": 6.689497716894977e-07, "loss": 0.0039, "reward": 1.6983258724212646, "reward_std": 0.1776369959115982, "rewards/accuracy_reward": 0.7139508724212646, "rewards/format_reward": 0.984375, "step": 725 }, { "completion_length": 82.3515625, "epoch": 3.315068493150685, "grad_norm": 2.7241666316986084, "kl": 0.0947265625, "learning_rate": 6.684931506849316e-07, "loss": 0.0038, "reward": 1.6841517686843872, "reward_std": 0.20553293824195862, "rewards/accuracy_reward": 0.6997767686843872, "rewards/format_reward": 0.984375, "step": 726 }, { "completion_length": 66.3515625, "epoch": 3.319634703196347, "grad_norm": 2.9195847511291504, "kl": 0.1630859375, "learning_rate": 6.680365296803653e-07, "loss": 0.0065, "reward": 1.7539063096046448, "reward_std": 0.18009010702371597, "rewards/accuracy_reward": 0.7617186903953552, "rewards/format_reward": 0.9921875, "step": 727 }, { "completion_length": 75.4765625, "epoch": 3.324200913242009, "grad_norm": 2.223017930984497, "kl": 0.091064453125, "learning_rate": 6.67579908675799e-07, "loss": 0.0036, "reward": 1.535528302192688, "reward_std": 0.24083293601870537, "rewards/accuracy_reward": 0.5667782425880432, "rewards/format_reward": 0.96875, "step": 728 }, { "completion_length": 78.046875, "epoch": 3.328767123287671, "grad_norm": 2.5224952697753906, "kl": 0.102783203125, "learning_rate": 6.671232876712329e-07, "loss": 0.0041, "reward": 1.732812523841858, "reward_std": 0.21059568226337433, "rewards/accuracy_reward": 0.7640624642372131, "rewards/format_reward": 0.96875, "step": 729 }, { "completion_length": 71.8203125, "epoch": 3.3333333333333335, "grad_norm": 2.83807110786438, "kl": 0.10498046875, "learning_rate": 6.666666666666666e-07, "loss": 0.0042, "reward": 1.579541265964508, "reward_std": 0.26198120415210724, "rewards/accuracy_reward": 0.5951661765575409, "rewards/format_reward": 0.984375, "step": 730 }, { "completion_length": 63.3125, "epoch": 3.3378995433789953, "grad_norm": 3.6584157943725586, "kl": 0.16015625, "learning_rate": 6.662100456621003e-07, "loss": 0.0064, "reward": 1.7500744462013245, "reward_std": 0.19155671447515488, "rewards/accuracy_reward": 0.7500744163990021, "rewards/format_reward": 1.0, "step": 731 }, { "completion_length": 56.4765625, "epoch": 3.3424657534246576, "grad_norm": 2.3448262214660645, "kl": 0.13818359375, "learning_rate": 6.657534246575343e-07, "loss": 0.0055, "reward": 1.6341642141342163, "reward_std": 0.20584679394960403, "rewards/accuracy_reward": 0.6576017141342163, "rewards/format_reward": 0.9765625, "step": 732 }, { "completion_length": 56.953125, "epoch": 3.34703196347032, "grad_norm": 2.6592612266540527, "kl": 0.1484375, "learning_rate": 6.65296803652968e-07, "loss": 0.0059, "reward": 1.694618046283722, "reward_std": 0.17367403209209442, "rewards/accuracy_reward": 0.6946180760860443, "rewards/format_reward": 1.0, "step": 733 }, { "completion_length": 86.1640625, "epoch": 3.3515981735159817, "grad_norm": 1.7500590085983276, "kl": 0.128662109375, "learning_rate": 6.648401826484019e-07, "loss": 0.0051, "reward": 1.7081771492958069, "reward_std": 0.11748043447732925, "rewards/accuracy_reward": 0.7159895300865173, "rewards/format_reward": 0.9921875, "step": 734 }, { "completion_length": 73.1171875, "epoch": 3.356164383561644, "grad_norm": 2.0890016555786133, "kl": 0.111328125, "learning_rate": 6.643835616438356e-07, "loss": 0.0045, "reward": 1.669720709323883, "reward_std": 0.14257927983999252, "rewards/accuracy_reward": 0.6775331199169159, "rewards/format_reward": 0.9921875, "step": 735 }, { "completion_length": 75.4453125, "epoch": 3.3607305936073057, "grad_norm": 1.7575141191482544, "kl": 0.10302734375, "learning_rate": 6.639269406392693e-07, "loss": 0.0041, "reward": 1.760156273841858, "reward_std": 0.1406225487589836, "rewards/accuracy_reward": 0.7679686844348907, "rewards/format_reward": 0.9921875, "step": 736 }, { "completion_length": 79.0625, "epoch": 3.365296803652968, "grad_norm": 1.9857916831970215, "kl": 0.0750732421875, "learning_rate": 6.634703196347032e-07, "loss": 0.003, "reward": 1.8437398672103882, "reward_std": 0.10023375414311886, "rewards/accuracy_reward": 0.8515522480010986, "rewards/format_reward": 0.9921875, "step": 737 }, { "completion_length": 43.953125, "epoch": 3.3698630136986303, "grad_norm": 1.8931981325149536, "kl": 0.23876953125, "learning_rate": 6.63013698630137e-07, "loss": 0.0096, "reward": 1.777430534362793, "reward_std": 0.2545855790376663, "rewards/accuracy_reward": 0.808680534362793, "rewards/format_reward": 0.96875, "step": 738 }, { "completion_length": 66.671875, "epoch": 3.374429223744292, "grad_norm": 4.250927925109863, "kl": 0.11474609375, "learning_rate": 6.625570776255707e-07, "loss": 0.0046, "reward": 1.5928385257720947, "reward_std": 0.13594963401556015, "rewards/accuracy_reward": 0.5928385257720947, "rewards/format_reward": 1.0, "step": 739 }, { "completion_length": 59.28125, "epoch": 3.3789954337899544, "grad_norm": 1.9088664054870605, "kl": 0.114990234375, "learning_rate": 6.621004566210046e-07, "loss": 0.0046, "reward": 1.5951822996139526, "reward_std": 0.1505616046488285, "rewards/accuracy_reward": 0.5951822698116302, "rewards/format_reward": 1.0, "step": 740 }, { "completion_length": 74.8984375, "epoch": 3.383561643835616, "grad_norm": 2.5536303520202637, "kl": 0.126953125, "learning_rate": 6.616438356164383e-07, "loss": 0.0051, "reward": 1.6819568276405334, "reward_std": 0.1260900031775236, "rewards/accuracy_reward": 0.6819568276405334, "rewards/format_reward": 1.0, "step": 741 }, { "completion_length": 81.7421875, "epoch": 3.3881278538812785, "grad_norm": 1.9142627716064453, "kl": 0.091064453125, "learning_rate": 6.61187214611872e-07, "loss": 0.0036, "reward": 1.7414063215255737, "reward_std": 0.15430963411927223, "rewards/accuracy_reward": 0.7492187023162842, "rewards/format_reward": 0.9921875, "step": 742 }, { "completion_length": 63.8828125, "epoch": 3.3926940639269407, "grad_norm": 2.298887252807617, "kl": 0.15625, "learning_rate": 6.607305936073059e-07, "loss": 0.0063, "reward": 1.6809749007225037, "reward_std": 0.13766025006771088, "rewards/accuracy_reward": 0.6809749007225037, "rewards/format_reward": 1.0, "step": 743 }, { "completion_length": 62.3046875, "epoch": 3.3972602739726026, "grad_norm": 2.9588382244110107, "kl": 0.13916015625, "learning_rate": 6.602739726027396e-07, "loss": 0.0056, "reward": 1.668749988079071, "reward_std": 0.28636179864406586, "rewards/accuracy_reward": 0.692187488079071, "rewards/format_reward": 0.9765625, "step": 744 }, { "completion_length": 58.328125, "epoch": 3.401826484018265, "grad_norm": 2.7415127754211426, "kl": 0.130126953125, "learning_rate": 6.598173515981736e-07, "loss": 0.0052, "reward": 1.755094826221466, "reward_std": 0.2010849490761757, "rewards/accuracy_reward": 0.7707198262214661, "rewards/format_reward": 0.984375, "step": 745 }, { "completion_length": 61.15625, "epoch": 3.406392694063927, "grad_norm": 2.1522672176361084, "kl": 0.138427734375, "learning_rate": 6.593607305936073e-07, "loss": 0.0055, "reward": 1.779687523841858, "reward_std": 0.1337989792227745, "rewards/accuracy_reward": 0.7796874344348907, "rewards/format_reward": 1.0, "step": 746 }, { "completion_length": 93.4296875, "epoch": 3.410958904109589, "grad_norm": 2.299729824066162, "kl": 0.127197265625, "learning_rate": 6.58904109589041e-07, "loss": 0.0051, "reward": 1.787500023841858, "reward_std": 0.12756995856761932, "rewards/accuracy_reward": 0.8031249642372131, "rewards/format_reward": 0.984375, "step": 747 }, { "completion_length": 72.625, "epoch": 3.415525114155251, "grad_norm": 3.2953529357910156, "kl": 0.13623046875, "learning_rate": 6.584474885844749e-07, "loss": 0.0055, "reward": 1.5497395992279053, "reward_std": 0.2357780486345291, "rewards/accuracy_reward": 0.5575520694255829, "rewards/format_reward": 0.9921875, "step": 748 }, { "completion_length": 66.1015625, "epoch": 3.4200913242009134, "grad_norm": 3.811732530593872, "kl": 0.15625, "learning_rate": 6.579908675799086e-07, "loss": 0.0063, "reward": 1.465334177017212, "reward_std": 0.281493678689003, "rewards/accuracy_reward": 0.48095911741256714, "rewards/format_reward": 0.984375, "step": 749 }, { "completion_length": 78.1171875, "epoch": 3.4246575342465753, "grad_norm": 6.028533458709717, "kl": 0.14501953125, "learning_rate": 6.575342465753423e-07, "loss": 0.0058, "reward": 1.612395167350769, "reward_std": 0.1732819825410843, "rewards/accuracy_reward": 0.6202076524496078, "rewards/format_reward": 0.9921875, "step": 750 }, { "completion_length": 80.921875, "epoch": 3.4292237442922375, "grad_norm": 2.363414764404297, "kl": 0.10107421875, "learning_rate": 6.570776255707762e-07, "loss": 0.004, "reward": 1.7257813215255737, "reward_std": 0.15576134249567986, "rewards/accuracy_reward": 0.7257812023162842, "rewards/format_reward": 1.0, "step": 751 }, { "completion_length": 60.625, "epoch": 3.4337899543378994, "grad_norm": 2.295811176300049, "kl": 0.1298828125, "learning_rate": 6.5662100456621e-07, "loss": 0.0052, "reward": 1.8014204502105713, "reward_std": 0.14195573329925537, "rewards/accuracy_reward": 0.8092329502105713, "rewards/format_reward": 0.9921875, "step": 752 }, { "completion_length": 79.6875, "epoch": 3.4383561643835616, "grad_norm": 1.9623810052871704, "kl": 0.09326171875, "learning_rate": 6.561643835616439e-07, "loss": 0.0037, "reward": 1.654836356639862, "reward_std": 0.16391075402498245, "rewards/accuracy_reward": 0.6704612970352173, "rewards/format_reward": 0.984375, "step": 753 }, { "completion_length": 80.84375, "epoch": 3.442922374429224, "grad_norm": 2.4038403034210205, "kl": 0.117919921875, "learning_rate": 6.557077625570776e-07, "loss": 0.0047, "reward": 1.7968750596046448, "reward_std": 0.10437997803092003, "rewards/accuracy_reward": 0.7968749105930328, "rewards/format_reward": 1.0, "step": 754 }, { "completion_length": 72.078125, "epoch": 3.4474885844748857, "grad_norm": 1.999448299407959, "kl": 0.11962890625, "learning_rate": 6.552511415525113e-07, "loss": 0.0048, "reward": 1.829807698726654, "reward_std": 0.07720155641436577, "rewards/accuracy_reward": 0.8298076391220093, "rewards/format_reward": 1.0, "step": 755 }, { "completion_length": 75.8828125, "epoch": 3.452054794520548, "grad_norm": 1.9827911853790283, "kl": 0.0849609375, "learning_rate": 6.547945205479452e-07, "loss": 0.0034, "reward": 1.7932292222976685, "reward_std": 0.14317410439252853, "rewards/accuracy_reward": 0.8088541030883789, "rewards/format_reward": 0.984375, "step": 756 }, { "completion_length": 77.359375, "epoch": 3.45662100456621, "grad_norm": 2.601567268371582, "kl": 0.118896484375, "learning_rate": 6.543378995433789e-07, "loss": 0.0048, "reward": 1.7324219346046448, "reward_std": 0.15861116349697113, "rewards/accuracy_reward": 0.740234375, "rewards/format_reward": 0.9921875, "step": 757 }, { "completion_length": 83.2421875, "epoch": 3.461187214611872, "grad_norm": 2.38321852684021, "kl": 0.133056640625, "learning_rate": 6.538812785388129e-07, "loss": 0.0053, "reward": 1.6742457747459412, "reward_std": 0.18774619698524475, "rewards/accuracy_reward": 0.6820583343505859, "rewards/format_reward": 0.9921875, "step": 758 }, { "completion_length": 80.3671875, "epoch": 3.4657534246575343, "grad_norm": 1.9422838687896729, "kl": 0.128662109375, "learning_rate": 6.534246575342466e-07, "loss": 0.0051, "reward": 1.7861049175262451, "reward_std": 0.09223857149481773, "rewards/accuracy_reward": 0.7861048579216003, "rewards/format_reward": 1.0, "step": 759 }, { "completion_length": 77.8125, "epoch": 3.470319634703196, "grad_norm": 2.641580820083618, "kl": 0.119873046875, "learning_rate": 6.529680365296803e-07, "loss": 0.0048, "reward": 1.6968767046928406, "reward_std": 0.2295239269733429, "rewards/accuracy_reward": 0.7046891748905182, "rewards/format_reward": 0.9921875, "step": 760 }, { "completion_length": 73.4375, "epoch": 3.4748858447488584, "grad_norm": 2.904656171798706, "kl": 0.12744140625, "learning_rate": 6.525114155251142e-07, "loss": 0.0051, "reward": 1.5930989980697632, "reward_std": 0.1885790079832077, "rewards/accuracy_reward": 0.5930989682674408, "rewards/format_reward": 1.0, "step": 761 }, { "completion_length": 78.75, "epoch": 3.4794520547945207, "grad_norm": 2.564729690551758, "kl": 0.1591796875, "learning_rate": 6.520547945205479e-07, "loss": 0.0064, "reward": 1.7561274766921997, "reward_std": 0.1594456396996975, "rewards/accuracy_reward": 0.7639399170875549, "rewards/format_reward": 0.9921875, "step": 762 }, { "completion_length": 85.6953125, "epoch": 3.4840182648401825, "grad_norm": 4.693384170532227, "kl": 0.117919921875, "learning_rate": 6.515981735159816e-07, "loss": 0.0047, "reward": 1.4937500357627869, "reward_std": 0.26740533858537674, "rewards/accuracy_reward": 0.5093749910593033, "rewards/format_reward": 0.984375, "step": 763 }, { "completion_length": 74.28125, "epoch": 3.4885844748858448, "grad_norm": 3.116814136505127, "kl": 0.13037109375, "learning_rate": 6.511415525114155e-07, "loss": 0.0052, "reward": 1.760156273841858, "reward_std": 0.17397862672805786, "rewards/accuracy_reward": 0.7679687440395355, "rewards/format_reward": 0.9921875, "step": 764 }, { "completion_length": 72.8671875, "epoch": 3.493150684931507, "grad_norm": 4.106734752655029, "kl": 0.319091796875, "learning_rate": 6.506849315068493e-07, "loss": 0.0128, "reward": 1.6917535066604614, "reward_std": 0.1726042479276657, "rewards/accuracy_reward": 0.7073784470558167, "rewards/format_reward": 0.984375, "step": 765 }, { "completion_length": 72.4296875, "epoch": 3.497716894977169, "grad_norm": 2.7671029567718506, "kl": 0.11962890625, "learning_rate": 6.502283105022832e-07, "loss": 0.0048, "reward": 1.7573699951171875, "reward_std": 0.12242420390248299, "rewards/accuracy_reward": 0.7573699355125427, "rewards/format_reward": 1.0, "step": 766 }, { "completion_length": 91.5078125, "epoch": 3.502283105022831, "grad_norm": 2.353253126144409, "kl": 0.12158203125, "learning_rate": 6.497716894977169e-07, "loss": 0.0049, "reward": 1.5520833730697632, "reward_std": 0.21783916652202606, "rewards/accuracy_reward": 0.5677083134651184, "rewards/format_reward": 0.984375, "step": 767 }, { "completion_length": 86.0390625, "epoch": 3.506849315068493, "grad_norm": 2.568850040435791, "kl": 0.108642578125, "learning_rate": 6.493150684931506e-07, "loss": 0.0043, "reward": 1.8059896230697632, "reward_std": 0.1456034928560257, "rewards/accuracy_reward": 0.8138020634651184, "rewards/format_reward": 0.9921875, "step": 768 }, { "completion_length": 97.078125, "epoch": 3.5114155251141552, "grad_norm": 9.077414512634277, "kl": 0.084716796875, "learning_rate": 6.488584474885845e-07, "loss": 0.0034, "reward": 1.6987351775169373, "reward_std": 0.2012891098856926, "rewards/accuracy_reward": 0.7221725881099701, "rewards/format_reward": 0.9765625, "step": 769 }, { "completion_length": 90.3828125, "epoch": 3.5159817351598175, "grad_norm": 4.764615058898926, "kl": 0.107421875, "learning_rate": 6.484018264840182e-07, "loss": 0.0043, "reward": 1.6770833730697632, "reward_std": 0.20456601679325104, "rewards/accuracy_reward": 0.6927083432674408, "rewards/format_reward": 0.984375, "step": 770 }, { "completion_length": 91.53125, "epoch": 3.5205479452054793, "grad_norm": 2.192065477371216, "kl": 0.125244140625, "learning_rate": 6.479452054794519e-07, "loss": 0.005, "reward": 1.822656273841858, "reward_std": 0.11972266435623169, "rewards/accuracy_reward": 0.8226562142372131, "rewards/format_reward": 1.0, "step": 771 }, { "completion_length": 66.875, "epoch": 3.5251141552511416, "grad_norm": 2.9831643104553223, "kl": 0.16259765625, "learning_rate": 6.474885844748859e-07, "loss": 0.0065, "reward": 1.6729073524475098, "reward_std": 0.18848184496164322, "rewards/accuracy_reward": 0.6807198524475098, "rewards/format_reward": 0.9921875, "step": 772 }, { "completion_length": 112.515625, "epoch": 3.5296803652968034, "grad_norm": 4.550014495849609, "kl": 0.107177734375, "learning_rate": 6.470319634703196e-07, "loss": 0.0043, "reward": 1.748437523841858, "reward_std": 0.21249166131019592, "rewards/accuracy_reward": 0.7796874046325684, "rewards/format_reward": 0.96875, "step": 773 }, { "completion_length": 86.4453125, "epoch": 3.5342465753424657, "grad_norm": 12.177788734436035, "kl": 0.134521484375, "learning_rate": 6.465753424657535e-07, "loss": 0.0054, "reward": 1.5592572093009949, "reward_std": 0.3570391535758972, "rewards/accuracy_reward": 0.6061321794986725, "rewards/format_reward": 0.953125, "step": 774 }, { "completion_length": 91.90625, "epoch": 3.538812785388128, "grad_norm": 2.1934075355529785, "kl": 0.09423828125, "learning_rate": 6.461187214611872e-07, "loss": 0.0038, "reward": 1.7695313096046448, "reward_std": 0.2720255181193352, "rewards/accuracy_reward": 0.7929687201976776, "rewards/format_reward": 0.9765625, "step": 775 }, { "completion_length": 93.34375, "epoch": 3.54337899543379, "grad_norm": 1.7237935066223145, "kl": 0.10302734375, "learning_rate": 6.456621004566209e-07, "loss": 0.0041, "reward": 1.839062511920929, "reward_std": 0.08532825112342834, "rewards/accuracy_reward": 0.8390624821186066, "rewards/format_reward": 1.0, "step": 776 }, { "completion_length": 68.5234375, "epoch": 3.547945205479452, "grad_norm": 9.828876495361328, "kl": 0.130859375, "learning_rate": 6.452054794520548e-07, "loss": 0.0052, "reward": 1.699999988079071, "reward_std": 0.23911622911691666, "rewards/accuracy_reward": 0.7156249582767487, "rewards/format_reward": 0.984375, "step": 777 }, { "completion_length": 111.7265625, "epoch": 3.5525114155251143, "grad_norm": 2.7527942657470703, "kl": 0.0751953125, "learning_rate": 6.447488584474886e-07, "loss": 0.003, "reward": 1.6765625476837158, "reward_std": 0.17730073630809784, "rewards/accuracy_reward": 0.6921874582767487, "rewards/format_reward": 0.984375, "step": 778 }, { "completion_length": 86.328125, "epoch": 3.557077625570776, "grad_norm": 8.820732116699219, "kl": 0.127685546875, "learning_rate": 6.442922374429223e-07, "loss": 0.0051, "reward": 1.5761924982070923, "reward_std": 0.2583516389131546, "rewards/accuracy_reward": 0.5996299386024475, "rewards/format_reward": 0.9765625, "step": 779 }, { "completion_length": 83.546875, "epoch": 3.5616438356164384, "grad_norm": 4.555209636688232, "kl": 0.11083984375, "learning_rate": 6.438356164383562e-07, "loss": 0.0044, "reward": 1.6642005443572998, "reward_std": 0.2603069022297859, "rewards/accuracy_reward": 0.6876380443572998, "rewards/format_reward": 0.9765625, "step": 780 }, { "completion_length": 88.859375, "epoch": 3.5662100456621006, "grad_norm": 3.635876417160034, "kl": 0.133056640625, "learning_rate": 6.433789954337899e-07, "loss": 0.0053, "reward": 1.6102182865142822, "reward_std": 0.2444395273923874, "rewards/accuracy_reward": 0.6414682567119598, "rewards/format_reward": 0.96875, "step": 781 }, { "completion_length": 83.7734375, "epoch": 3.5707762557077625, "grad_norm": 2.809690237045288, "kl": 0.118896484375, "learning_rate": 6.429223744292238e-07, "loss": 0.0048, "reward": 1.6714159846305847, "reward_std": 0.2755907028913498, "rewards/accuracy_reward": 0.6948534548282623, "rewards/format_reward": 0.9765625, "step": 782 }, { "completion_length": 66.0, "epoch": 3.5753424657534247, "grad_norm": 4.247856140136719, "kl": 0.1552734375, "learning_rate": 6.424657534246575e-07, "loss": 0.0062, "reward": 1.5200520753860474, "reward_std": 0.2615704759955406, "rewards/accuracy_reward": 0.5434895753860474, "rewards/format_reward": 0.9765625, "step": 783 }, { "completion_length": 86.296875, "epoch": 3.5799086757990866, "grad_norm": 3.303392171859741, "kl": 0.17236328125, "learning_rate": 6.420091324200912e-07, "loss": 0.0069, "reward": 1.7383928894996643, "reward_std": 0.20217304676771164, "rewards/accuracy_reward": 0.7618303298950195, "rewards/format_reward": 0.9765625, "step": 784 }, { "completion_length": 94.890625, "epoch": 3.584474885844749, "grad_norm": 2.6589694023132324, "kl": 0.105712890625, "learning_rate": 6.415525114155252e-07, "loss": 0.0042, "reward": 1.710684597492218, "reward_std": 0.20098017156124115, "rewards/accuracy_reward": 0.7341220676898956, "rewards/format_reward": 0.9765625, "step": 785 }, { "completion_length": 90.21875, "epoch": 3.589041095890411, "grad_norm": 2.0707075595855713, "kl": 0.0810546875, "learning_rate": 6.410958904109589e-07, "loss": 0.0032, "reward": 1.70947265625, "reward_std": 0.13764164596796036, "rewards/accuracy_reward": 0.7172850966453552, "rewards/format_reward": 0.9921875, "step": 786 }, { "completion_length": 69.5078125, "epoch": 3.593607305936073, "grad_norm": 2.431682586669922, "kl": 0.197265625, "learning_rate": 6.406392694063926e-07, "loss": 0.0079, "reward": 1.6103981137275696, "reward_std": 0.27858249843120575, "rewards/accuracy_reward": 0.6338355839252472, "rewards/format_reward": 0.9765625, "step": 787 }, { "completion_length": 73.4453125, "epoch": 3.598173515981735, "grad_norm": 2.722654342651367, "kl": 0.13330078125, "learning_rate": 6.401826484018265e-07, "loss": 0.0053, "reward": 1.7416667342185974, "reward_std": 0.18196804821491241, "rewards/accuracy_reward": 0.7651041448116302, "rewards/format_reward": 0.9765625, "step": 788 }, { "completion_length": 80.890625, "epoch": 3.602739726027397, "grad_norm": 5.7324018478393555, "kl": 0.151123046875, "learning_rate": 6.397260273972602e-07, "loss": 0.0061, "reward": 1.7906250357627869, "reward_std": 0.23605135083198547, "rewards/accuracy_reward": 0.8062499463558197, "rewards/format_reward": 0.984375, "step": 789 }, { "completion_length": 72.625, "epoch": 3.6073059360730593, "grad_norm": 2.2109618186950684, "kl": 0.11767578125, "learning_rate": 6.39269406392694e-07, "loss": 0.0047, "reward": 1.5838541984558105, "reward_std": 0.16635090112686157, "rewards/accuracy_reward": 0.583854153752327, "rewards/format_reward": 1.0, "step": 790 }, { "completion_length": 79.53125, "epoch": 3.6118721461187215, "grad_norm": 1.7857418060302734, "kl": 0.142822265625, "learning_rate": 6.388127853881278e-07, "loss": 0.0057, "reward": 1.801552414894104, "reward_std": 0.1699754223227501, "rewards/accuracy_reward": 0.8171773254871368, "rewards/format_reward": 0.984375, "step": 791 }, { "completion_length": 72.3359375, "epoch": 3.616438356164384, "grad_norm": 6.849174499511719, "kl": 0.111572265625, "learning_rate": 6.383561643835616e-07, "loss": 0.0045, "reward": 1.5342634320259094, "reward_std": 0.26482056826353073, "rewards/accuracy_reward": 0.534263402223587, "rewards/format_reward": 1.0, "step": 792 }, { "completion_length": 98.8828125, "epoch": 3.6210045662100456, "grad_norm": 7.422957897186279, "kl": 0.0927734375, "learning_rate": 6.378995433789955e-07, "loss": 0.0037, "reward": 1.7940475940704346, "reward_std": 0.1312719490379095, "rewards/accuracy_reward": 0.8096725642681122, "rewards/format_reward": 0.984375, "step": 793 }, { "completion_length": 81.0546875, "epoch": 3.625570776255708, "grad_norm": 1.819212794303894, "kl": 0.11572265625, "learning_rate": 6.374429223744292e-07, "loss": 0.0046, "reward": 1.664595365524292, "reward_std": 0.1356574185192585, "rewards/accuracy_reward": 0.6802203357219696, "rewards/format_reward": 0.984375, "step": 794 }, { "completion_length": 80.25, "epoch": 3.6301369863013697, "grad_norm": 5.144930362701416, "kl": 0.15966796875, "learning_rate": 6.369863013698629e-07, "loss": 0.0064, "reward": 1.6710898280143738, "reward_std": 0.27560608088970184, "rewards/accuracy_reward": 0.710152268409729, "rewards/format_reward": 0.9609375, "step": 795 }, { "completion_length": 70.6015625, "epoch": 3.634703196347032, "grad_norm": 5.67324686050415, "kl": 0.119140625, "learning_rate": 6.365296803652968e-07, "loss": 0.0048, "reward": 1.5117551684379578, "reward_std": 0.2634401321411133, "rewards/accuracy_reward": 0.5195676535367966, "rewards/format_reward": 0.9921875, "step": 796 }, { "completion_length": 67.71875, "epoch": 3.6392694063926943, "grad_norm": 2.1526761054992676, "kl": 0.14599609375, "learning_rate": 6.360730593607305e-07, "loss": 0.0058, "reward": 1.7081494331359863, "reward_std": 0.15050432085990906, "rewards/accuracy_reward": 0.7237744629383087, "rewards/format_reward": 0.984375, "step": 797 }, { "completion_length": 90.4609375, "epoch": 3.643835616438356, "grad_norm": 7.734349250793457, "kl": 0.107666015625, "learning_rate": 6.356164383561645e-07, "loss": 0.0043, "reward": 1.572656273841858, "reward_std": 0.2658383846282959, "rewards/accuracy_reward": 0.5960937142372131, "rewards/format_reward": 0.9765625, "step": 798 }, { "completion_length": 76.6640625, "epoch": 3.6484018264840183, "grad_norm": 3.576162815093994, "kl": 0.115966796875, "learning_rate": 6.351598173515982e-07, "loss": 0.0046, "reward": 1.6600198149681091, "reward_std": 0.15953533351421356, "rewards/accuracy_reward": 0.6678323149681091, "rewards/format_reward": 0.9921875, "step": 799 }, { "completion_length": 91.1796875, "epoch": 3.65296803652968, "grad_norm": 3.9656198024749756, "kl": 0.107421875, "learning_rate": 6.347031963470319e-07, "loss": 0.0043, "reward": 1.6131696701049805, "reward_std": 0.19991916418075562, "rewards/accuracy_reward": 0.6366070806980133, "rewards/format_reward": 0.9765625, "step": 800 }, { "completion_length": 91.5, "epoch": 3.6575342465753424, "grad_norm": 2.609405755996704, "kl": 0.1591796875, "learning_rate": 6.342465753424658e-07, "loss": 0.0064, "reward": 1.6630208492279053, "reward_std": 0.2102055549621582, "rewards/accuracy_reward": 0.6708333194255829, "rewards/format_reward": 0.9921875, "step": 801 }, { "completion_length": 83.9609375, "epoch": 3.6621004566210047, "grad_norm": 2.384371280670166, "kl": 0.13818359375, "learning_rate": 6.337899543378995e-07, "loss": 0.0055, "reward": 1.7731770873069763, "reward_std": 0.16687272489070892, "rewards/accuracy_reward": 0.7888020277023315, "rewards/format_reward": 0.984375, "step": 802 }, { "completion_length": 83.25, "epoch": 3.6666666666666665, "grad_norm": 2.3074464797973633, "kl": 0.105224609375, "learning_rate": 6.333333333333332e-07, "loss": 0.0042, "reward": 1.6963542103767395, "reward_std": 0.2274407297372818, "rewards/accuracy_reward": 0.7354166209697723, "rewards/format_reward": 0.9609375, "step": 803 }, { "completion_length": 91.296875, "epoch": 3.671232876712329, "grad_norm": 4.80971097946167, "kl": 0.2685546875, "learning_rate": 6.328767123287671e-07, "loss": 0.0107, "reward": 1.8013640642166138, "reward_std": 0.18622903525829315, "rewards/accuracy_reward": 0.8326140344142914, "rewards/format_reward": 0.96875, "step": 804 }, { "completion_length": 76.4375, "epoch": 3.6757990867579906, "grad_norm": 2.313264846801758, "kl": 0.13623046875, "learning_rate": 6.324200913242009e-07, "loss": 0.0054, "reward": 1.573582112789154, "reward_std": 0.28307729959487915, "rewards/accuracy_reward": 0.5892071425914764, "rewards/format_reward": 0.984375, "step": 805 }, { "completion_length": 71.53125, "epoch": 3.680365296803653, "grad_norm": 5.89130163192749, "kl": 0.135986328125, "learning_rate": 6.319634703196348e-07, "loss": 0.0054, "reward": 1.6453125476837158, "reward_std": 0.2522100582718849, "rewards/accuracy_reward": 0.6687499582767487, "rewards/format_reward": 0.9765625, "step": 806 }, { "completion_length": 84.84375, "epoch": 3.684931506849315, "grad_norm": 2.150968074798584, "kl": 0.103759765625, "learning_rate": 6.315068493150685e-07, "loss": 0.0042, "reward": 1.6665269136428833, "reward_std": 0.17805734649300575, "rewards/accuracy_reward": 0.6743394434452057, "rewards/format_reward": 0.9921875, "step": 807 }, { "completion_length": 118.5390625, "epoch": 3.6894977168949774, "grad_norm": 1.7179328203201294, "kl": 0.0628662109375, "learning_rate": 6.310502283105022e-07, "loss": 0.0025, "reward": 1.6565104722976685, "reward_std": 0.31547820568084717, "rewards/accuracy_reward": 0.7268228828907013, "rewards/format_reward": 0.9296875, "step": 808 }, { "completion_length": 86.203125, "epoch": 3.6940639269406392, "grad_norm": 4.058598518371582, "kl": 0.12353515625, "learning_rate": 6.305936073059361e-07, "loss": 0.0049, "reward": 1.7383049130439758, "reward_std": 0.2762962728738785, "rewards/accuracy_reward": 0.7617424130439758, "rewards/format_reward": 0.9765625, "step": 809 }, { "completion_length": 72.484375, "epoch": 3.6986301369863015, "grad_norm": 6.153299331665039, "kl": 0.1328125, "learning_rate": 6.301369863013698e-07, "loss": 0.0053, "reward": 1.607812523841858, "reward_std": 0.26409636437892914, "rewards/accuracy_reward": 0.6312500238418579, "rewards/format_reward": 0.9765625, "step": 810 }, { "completion_length": 73.5078125, "epoch": 3.7031963470319633, "grad_norm": 4.83391809463501, "kl": 0.115478515625, "learning_rate": 6.296803652968035e-07, "loss": 0.0046, "reward": 1.6350895166397095, "reward_std": 0.3858235031366348, "rewards/accuracy_reward": 0.6897769868373871, "rewards/format_reward": 0.9453125, "step": 811 }, { "completion_length": 79.734375, "epoch": 3.7077625570776256, "grad_norm": 7.388575553894043, "kl": 0.121826171875, "learning_rate": 6.292237442922375e-07, "loss": 0.0049, "reward": 1.6204612851142883, "reward_std": 0.2048889473080635, "rewards/accuracy_reward": 0.6360863149166107, "rewards/format_reward": 0.984375, "step": 812 }, { "completion_length": 87.6875, "epoch": 3.712328767123288, "grad_norm": 2.1843342781066895, "kl": 0.091064453125, "learning_rate": 6.287671232876712e-07, "loss": 0.0036, "reward": 1.7203125357627869, "reward_std": 0.14389308542013168, "rewards/accuracy_reward": 0.7281249761581421, "rewards/format_reward": 0.9921875, "step": 813 }, { "completion_length": 87.6484375, "epoch": 3.7168949771689497, "grad_norm": 6.269219875335693, "kl": 0.100341796875, "learning_rate": 6.283105022831051e-07, "loss": 0.004, "reward": 1.6382812857627869, "reward_std": 0.17056220024824142, "rewards/accuracy_reward": 0.6460937112569809, "rewards/format_reward": 0.9921875, "step": 814 }, { "completion_length": 113.0, "epoch": 3.721461187214612, "grad_norm": 7.404258728027344, "kl": 0.0799560546875, "learning_rate": 6.278538812785388e-07, "loss": 0.0032, "reward": 1.7445932626724243, "reward_std": 0.27295994758605957, "rewards/accuracy_reward": 0.8149056732654572, "rewards/format_reward": 0.9296875, "step": 815 }, { "completion_length": 83.03125, "epoch": 3.7260273972602738, "grad_norm": 2.5489566326141357, "kl": 0.095703125, "learning_rate": 6.273972602739725e-07, "loss": 0.0038, "reward": 1.6650923490524292, "reward_std": 0.2622094973921776, "rewards/accuracy_reward": 0.7041547894477844, "rewards/format_reward": 0.9609375, "step": 816 }, { "completion_length": 82.0859375, "epoch": 3.730593607305936, "grad_norm": 9.064318656921387, "kl": 0.10693359375, "learning_rate": 6.269406392694064e-07, "loss": 0.0043, "reward": 1.565638542175293, "reward_std": 0.20368661731481552, "rewards/accuracy_reward": 0.589076042175293, "rewards/format_reward": 0.9765625, "step": 817 }, { "completion_length": 85.734375, "epoch": 3.7351598173515983, "grad_norm": 3.814204692840576, "kl": 0.0927734375, "learning_rate": 6.264840182648402e-07, "loss": 0.0037, "reward": 1.7833616733551025, "reward_std": 0.1723785549402237, "rewards/accuracy_reward": 0.7911740839481354, "rewards/format_reward": 0.9921875, "step": 818 }, { "completion_length": 59.921875, "epoch": 3.73972602739726, "grad_norm": 3.6849331855773926, "kl": 0.127685546875, "learning_rate": 6.260273972602739e-07, "loss": 0.0051, "reward": 1.6573927998542786, "reward_std": 0.23803135752677917, "rewards/accuracy_reward": 0.6808302700519562, "rewards/format_reward": 0.9765625, "step": 819 }, { "completion_length": 73.921875, "epoch": 3.7442922374429224, "grad_norm": 2.6054327487945557, "kl": 0.10498046875, "learning_rate": 6.255707762557078e-07, "loss": 0.0042, "reward": 1.5656526684761047, "reward_std": 0.2571340575814247, "rewards/accuracy_reward": 0.5890901386737823, "rewards/format_reward": 0.9765625, "step": 820 }, { "completion_length": 74.3046875, "epoch": 3.748858447488584, "grad_norm": 8.960927963256836, "kl": 0.413818359375, "learning_rate": 6.251141552511415e-07, "loss": 0.0166, "reward": 1.6491714119911194, "reward_std": 0.190296471118927, "rewards/accuracy_reward": 0.664796382188797, "rewards/format_reward": 0.984375, "step": 821 }, { "completion_length": 80.5859375, "epoch": 3.7534246575342465, "grad_norm": 4.059770107269287, "kl": 0.15185546875, "learning_rate": 6.246575342465754e-07, "loss": 0.0061, "reward": 1.7381510734558105, "reward_std": 0.23838761448860168, "rewards/accuracy_reward": 0.7615885138511658, "rewards/format_reward": 0.9765625, "step": 822 }, { "completion_length": 81.765625, "epoch": 3.7579908675799087, "grad_norm": 3.5536246299743652, "kl": 0.244140625, "learning_rate": 6.242009132420091e-07, "loss": 0.0098, "reward": 1.6796875, "reward_std": 0.19722937047481537, "rewards/accuracy_reward": 0.7031249403953552, "rewards/format_reward": 0.9765625, "step": 823 }, { "completion_length": 67.1015625, "epoch": 3.762557077625571, "grad_norm": 2.649221658706665, "kl": 0.148681640625, "learning_rate": 6.237442922374428e-07, "loss": 0.006, "reward": 1.5904948711395264, "reward_std": 0.26418011635541916, "rewards/accuracy_reward": 0.6217447817325592, "rewards/format_reward": 0.96875, "step": 824 }, { "completion_length": 67.453125, "epoch": 3.767123287671233, "grad_norm": 4.655685901641846, "kl": 0.13623046875, "learning_rate": 6.232876712328768e-07, "loss": 0.0054, "reward": 1.611718773841858, "reward_std": 0.17845439538359642, "rewards/accuracy_reward": 0.6195312142372131, "rewards/format_reward": 0.9921875, "step": 825 }, { "completion_length": 75.90625, "epoch": 3.771689497716895, "grad_norm": 1.6092960834503174, "kl": 0.127685546875, "learning_rate": 6.228310502283105e-07, "loss": 0.0051, "reward": 1.71875, "reward_std": 0.16304787248373032, "rewards/accuracy_reward": 0.7343750298023224, "rewards/format_reward": 0.984375, "step": 826 }, { "completion_length": 74.7890625, "epoch": 3.776255707762557, "grad_norm": 1.904418706893921, "kl": 0.090087890625, "learning_rate": 6.223744292237442e-07, "loss": 0.0036, "reward": 1.7890625596046448, "reward_std": 0.16059691458940506, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 0.9765625, "step": 827 }, { "completion_length": 78.609375, "epoch": 3.780821917808219, "grad_norm": 2.7213988304138184, "kl": 0.13037109375, "learning_rate": 6.219178082191781e-07, "loss": 0.0052, "reward": 1.6396701335906982, "reward_std": 0.23586497455835342, "rewards/accuracy_reward": 0.6787325739860535, "rewards/format_reward": 0.9609375, "step": 828 }, { "completion_length": 87.4140625, "epoch": 3.7853881278538815, "grad_norm": 3.867840051651001, "kl": 0.1123046875, "learning_rate": 6.214611872146118e-07, "loss": 0.0045, "reward": 1.6255208849906921, "reward_std": 0.19572605937719345, "rewards/accuracy_reward": 0.6489582806825638, "rewards/format_reward": 0.9765625, "step": 829 }, { "completion_length": 80.6953125, "epoch": 3.7899543378995433, "grad_norm": 5.212503910064697, "kl": 0.177490234375, "learning_rate": 6.210045662100457e-07, "loss": 0.0071, "reward": 1.7282792925834656, "reward_std": 0.31921522319316864, "rewards/accuracy_reward": 0.7673417627811432, "rewards/format_reward": 0.9609375, "step": 830 }, { "completion_length": 87.03125, "epoch": 3.7945205479452055, "grad_norm": 7.442432880401611, "kl": 0.093017578125, "learning_rate": 6.205479452054794e-07, "loss": 0.0037, "reward": 1.6146034002304077, "reward_std": 0.196245439350605, "rewards/accuracy_reward": 0.6224157810211182, "rewards/format_reward": 0.9921875, "step": 831 }, { "completion_length": 77.1640625, "epoch": 3.7990867579908674, "grad_norm": 4.140781402587891, "kl": 0.2216796875, "learning_rate": 6.200913242009132e-07, "loss": 0.0089, "reward": 1.4750909209251404, "reward_std": 0.26224930584430695, "rewards/accuracy_reward": 0.5141534507274628, "rewards/format_reward": 0.9609375, "step": 832 }, { "completion_length": 73.5859375, "epoch": 3.8036529680365296, "grad_norm": 2.893556594848633, "kl": 0.105224609375, "learning_rate": 6.196347031963471e-07, "loss": 0.0042, "reward": 1.6718750596046448, "reward_std": 0.25111906230449677, "rewards/accuracy_reward": 0.6953124701976776, "rewards/format_reward": 0.9765625, "step": 833 }, { "completion_length": 68.46875, "epoch": 3.808219178082192, "grad_norm": 4.676414489746094, "kl": 0.20361328125, "learning_rate": 6.191780821917808e-07, "loss": 0.0081, "reward": 1.682812511920929, "reward_std": 0.20322410762310028, "rewards/accuracy_reward": 0.6984374821186066, "rewards/format_reward": 0.984375, "step": 834 }, { "completion_length": 78.9375, "epoch": 3.8127853881278537, "grad_norm": 1.6920398473739624, "kl": 0.134765625, "learning_rate": 6.187214611872145e-07, "loss": 0.0054, "reward": 1.8067708611488342, "reward_std": 0.14878704398870468, "rewards/accuracy_reward": 0.8223958313465118, "rewards/format_reward": 0.984375, "step": 835 }, { "completion_length": 73.34375, "epoch": 3.817351598173516, "grad_norm": 3.9718282222747803, "kl": 0.1552734375, "learning_rate": 6.182648401826484e-07, "loss": 0.0062, "reward": 1.586328148841858, "reward_std": 0.2995697557926178, "rewards/accuracy_reward": 0.6410156190395355, "rewards/format_reward": 0.9453125, "step": 836 }, { "completion_length": 74.859375, "epoch": 3.821917808219178, "grad_norm": 2.3701136112213135, "kl": 0.146240234375, "learning_rate": 6.178082191780821e-07, "loss": 0.0059, "reward": 1.6562500596046448, "reward_std": 0.1812673956155777, "rewards/accuracy_reward": 0.6640624701976776, "rewards/format_reward": 0.9921875, "step": 837 }, { "completion_length": 63.671875, "epoch": 3.82648401826484, "grad_norm": 3.7064878940582275, "kl": 0.1220703125, "learning_rate": 6.173515981735161e-07, "loss": 0.0049, "reward": 1.574496567249298, "reward_std": 0.24655399471521378, "rewards/accuracy_reward": 0.5979340374469757, "rewards/format_reward": 0.9765625, "step": 838 }, { "completion_length": 69.6171875, "epoch": 3.8310502283105023, "grad_norm": 2.966919422149658, "kl": 0.119384765625, "learning_rate": 6.168949771689498e-07, "loss": 0.0048, "reward": 1.5951017141342163, "reward_std": 0.22802505642175674, "rewards/accuracy_reward": 0.6263516843318939, "rewards/format_reward": 0.96875, "step": 839 }, { "completion_length": 67.4921875, "epoch": 3.8356164383561646, "grad_norm": 5.754858493804932, "kl": 0.125732421875, "learning_rate": 6.164383561643835e-07, "loss": 0.005, "reward": 1.60744047164917, "reward_std": 0.2320682480931282, "rewards/accuracy_reward": 0.6230654716491699, "rewards/format_reward": 0.984375, "step": 840 }, { "completion_length": 94.8828125, "epoch": 3.8401826484018264, "grad_norm": 3.788088321685791, "kl": 0.09912109375, "learning_rate": 6.159817351598174e-07, "loss": 0.004, "reward": 1.65234375, "reward_std": 0.1578691005706787, "rewards/accuracy_reward": 0.65234375, "rewards/format_reward": 1.0, "step": 841 }, { "completion_length": 82.28125, "epoch": 3.8447488584474887, "grad_norm": 3.032823085784912, "kl": 0.103271484375, "learning_rate": 6.155251141552511e-07, "loss": 0.0041, "reward": 1.538119375705719, "reward_std": 0.2931046634912491, "rewards/accuracy_reward": 0.569369375705719, "rewards/format_reward": 0.96875, "step": 842 }, { "completion_length": 88.265625, "epoch": 3.8493150684931505, "grad_norm": 2.753324031829834, "kl": 0.08544921875, "learning_rate": 6.150684931506848e-07, "loss": 0.0034, "reward": 1.6361016035079956, "reward_std": 0.2543141394853592, "rewards/accuracy_reward": 0.6673516035079956, "rewards/format_reward": 0.96875, "step": 843 }, { "completion_length": 88.3203125, "epoch": 3.853881278538813, "grad_norm": 1.9302741289138794, "kl": 0.116455078125, "learning_rate": 6.146118721461187e-07, "loss": 0.0047, "reward": 1.723133623600006, "reward_std": 0.12216833233833313, "rewards/accuracy_reward": 0.7231336236000061, "rewards/format_reward": 1.0, "step": 844 }, { "completion_length": 68.3046875, "epoch": 3.858447488584475, "grad_norm": 2.5496842861175537, "kl": 0.115234375, "learning_rate": 6.141552511415525e-07, "loss": 0.0046, "reward": 1.63571435213089, "reward_std": 0.22781573235988617, "rewards/accuracy_reward": 0.6513392478227615, "rewards/format_reward": 0.984375, "step": 845 }, { "completion_length": 98.5546875, "epoch": 3.863013698630137, "grad_norm": 1.8447142839431763, "kl": 0.098388671875, "learning_rate": 6.136986301369864e-07, "loss": 0.0039, "reward": 1.7750434279441833, "reward_std": 0.16683020442724228, "rewards/accuracy_reward": 0.798480898141861, "rewards/format_reward": 0.9765625, "step": 846 }, { "completion_length": 86.2265625, "epoch": 3.867579908675799, "grad_norm": 4.997533798217773, "kl": 0.123779296875, "learning_rate": 6.132420091324201e-07, "loss": 0.0049, "reward": 1.6639086604118347, "reward_std": 0.298796147108078, "rewards/accuracy_reward": 0.7185961008071899, "rewards/format_reward": 0.9453125, "step": 847 }, { "completion_length": 76.2578125, "epoch": 3.872146118721461, "grad_norm": 4.215785503387451, "kl": 0.130859375, "learning_rate": 6.127853881278538e-07, "loss": 0.0052, "reward": 1.5344713926315308, "reward_std": 0.3186282366514206, "rewards/accuracy_reward": 0.5735338628292084, "rewards/format_reward": 0.9609375, "step": 848 }, { "completion_length": 76.5546875, "epoch": 3.8767123287671232, "grad_norm": 3.0258474349975586, "kl": 0.11962890625, "learning_rate": 6.123287671232877e-07, "loss": 0.0048, "reward": 1.5302554368972778, "reward_std": 0.21649178117513657, "rewards/accuracy_reward": 0.5458804070949554, "rewards/format_reward": 0.984375, "step": 849 }, { "completion_length": 88.828125, "epoch": 3.8812785388127855, "grad_norm": 3.9653830528259277, "kl": 0.134765625, "learning_rate": 6.118721461187214e-07, "loss": 0.0054, "reward": 1.7723276019096375, "reward_std": 0.27654044330120087, "rewards/accuracy_reward": 0.8035775721073151, "rewards/format_reward": 0.96875, "step": 850 }, { "completion_length": 70.6796875, "epoch": 3.8858447488584473, "grad_norm": 13.911476135253906, "kl": 0.1279296875, "learning_rate": 6.114155251141551e-07, "loss": 0.0051, "reward": 1.5849445462226868, "reward_std": 0.2708437442779541, "rewards/accuracy_reward": 0.5849444717168808, "rewards/format_reward": 1.0, "step": 851 }, { "completion_length": 84.46875, "epoch": 3.8904109589041096, "grad_norm": 2.8287229537963867, "kl": 0.122802734375, "learning_rate": 6.109589041095891e-07, "loss": 0.0049, "reward": 1.6496233344078064, "reward_std": 0.1977412924170494, "rewards/accuracy_reward": 0.6730607748031616, "rewards/format_reward": 0.9765625, "step": 852 }, { "completion_length": 84.328125, "epoch": 3.8949771689497714, "grad_norm": 3.292459487915039, "kl": 0.110107421875, "learning_rate": 6.105022831050228e-07, "loss": 0.0044, "reward": 1.560290813446045, "reward_std": 0.2603389769792557, "rewards/accuracy_reward": 0.5681032538414001, "rewards/format_reward": 0.9921875, "step": 853 }, { "completion_length": 70.046875, "epoch": 3.8995433789954337, "grad_norm": 3.479142189025879, "kl": 0.13818359375, "learning_rate": 6.100456621004567e-07, "loss": 0.0055, "reward": 1.5651041865348816, "reward_std": 0.327400267124176, "rewards/accuracy_reward": 0.6041666567325592, "rewards/format_reward": 0.9609375, "step": 854 }, { "completion_length": 78.3046875, "epoch": 3.904109589041096, "grad_norm": 2.2671470642089844, "kl": 0.103515625, "learning_rate": 6.095890410958904e-07, "loss": 0.0041, "reward": 1.6139509677886963, "reward_std": 0.221808023750782, "rewards/accuracy_reward": 0.6608258783817291, "rewards/format_reward": 0.953125, "step": 855 }, { "completion_length": 79.015625, "epoch": 3.908675799086758, "grad_norm": 3.359975814819336, "kl": 0.097900390625, "learning_rate": 6.091324200913241e-07, "loss": 0.0039, "reward": 1.7910323739051819, "reward_std": 0.16605094820261002, "rewards/accuracy_reward": 0.80665722489357, "rewards/format_reward": 0.984375, "step": 856 }, { "completion_length": 108.4375, "epoch": 3.91324200913242, "grad_norm": 2.9608707427978516, "kl": 0.064208984375, "learning_rate": 6.08675799086758e-07, "loss": 0.0026, "reward": 1.7960938215255737, "reward_std": 0.15246989950537682, "rewards/accuracy_reward": 0.8195312321186066, "rewards/format_reward": 0.9765625, "step": 857 }, { "completion_length": 70.484375, "epoch": 3.9178082191780823, "grad_norm": 1.9071540832519531, "kl": 0.115234375, "learning_rate": 6.082191780821918e-07, "loss": 0.0046, "reward": 1.8630208373069763, "reward_std": 0.18414238840341568, "rewards/accuracy_reward": 0.8864583075046539, "rewards/format_reward": 0.9765625, "step": 858 }, { "completion_length": 86.25, "epoch": 3.922374429223744, "grad_norm": 1.7425415515899658, "kl": 0.0791015625, "learning_rate": 6.077625570776255e-07, "loss": 0.0032, "reward": 1.7500391602516174, "reward_std": 0.1361438985913992, "rewards/accuracy_reward": 0.757851630449295, "rewards/format_reward": 0.9921875, "step": 859 }, { "completion_length": 83.4765625, "epoch": 3.9269406392694064, "grad_norm": 2.0878822803497314, "kl": 0.132080078125, "learning_rate": 6.073059360730594e-07, "loss": 0.0053, "reward": 1.7227915525436401, "reward_std": 0.2049795687198639, "rewards/accuracy_reward": 0.7384164929389954, "rewards/format_reward": 0.984375, "step": 860 }, { "completion_length": 82.6328125, "epoch": 3.9315068493150687, "grad_norm": 2.1125905513763428, "kl": 0.10400390625, "learning_rate": 6.068493150684931e-07, "loss": 0.0041, "reward": 1.7052381038665771, "reward_std": 0.19698219001293182, "rewards/accuracy_reward": 0.7208629250526428, "rewards/format_reward": 0.984375, "step": 861 }, { "completion_length": 70.6171875, "epoch": 3.9360730593607305, "grad_norm": 3.4557483196258545, "kl": 0.125244140625, "learning_rate": 6.06392694063927e-07, "loss": 0.005, "reward": 1.6386160850524902, "reward_std": 0.2512796074151993, "rewards/accuracy_reward": 0.6464285254478455, "rewards/format_reward": 0.9921875, "step": 862 }, { "completion_length": 60.2890625, "epoch": 3.9406392694063928, "grad_norm": 2.9261093139648438, "kl": 0.197265625, "learning_rate": 6.059360730593607e-07, "loss": 0.0079, "reward": 1.6639323234558105, "reward_std": 0.18012882769107819, "rewards/accuracy_reward": 0.6717447936534882, "rewards/format_reward": 0.9921875, "step": 863 }, { "completion_length": 68.984375, "epoch": 3.9452054794520546, "grad_norm": 8.949965476989746, "kl": 0.16455078125, "learning_rate": 6.054794520547944e-07, "loss": 0.0066, "reward": 1.634996235370636, "reward_std": 0.30956215411424637, "rewards/accuracy_reward": 0.658433735370636, "rewards/format_reward": 0.9765625, "step": 864 }, { "completion_length": 94.5078125, "epoch": 3.949771689497717, "grad_norm": 1.5857025384902954, "kl": 0.07958984375, "learning_rate": 6.050228310502284e-07, "loss": 0.0032, "reward": 1.7937500476837158, "reward_std": 0.08982988260686398, "rewards/accuracy_reward": 0.793749988079071, "rewards/format_reward": 1.0, "step": 865 }, { "completion_length": 72.328125, "epoch": 3.954337899543379, "grad_norm": 3.955540418624878, "kl": 0.144775390625, "learning_rate": 6.045662100456621e-07, "loss": 0.0058, "reward": 1.5606706142425537, "reward_std": 0.3175853192806244, "rewards/accuracy_reward": 0.6075455844402313, "rewards/format_reward": 0.953125, "step": 866 }, { "completion_length": 56.71875, "epoch": 3.958904109589041, "grad_norm": 3.506572723388672, "kl": 0.142822265625, "learning_rate": 6.041095890410958e-07, "loss": 0.0057, "reward": 1.7238853573799133, "reward_std": 0.22640568763017654, "rewards/accuracy_reward": 0.7316978275775909, "rewards/format_reward": 0.9921875, "step": 867 }, { "completion_length": 69.9921875, "epoch": 3.963470319634703, "grad_norm": 6.0887770652771, "kl": 0.0931396484375, "learning_rate": 6.036529680365297e-07, "loss": 0.0037, "reward": 1.851125419139862, "reward_std": 0.1146822888404131, "rewards/accuracy_reward": 0.8511254191398621, "rewards/format_reward": 1.0, "step": 868 }, { "completion_length": 78.6875, "epoch": 3.968036529680365, "grad_norm": 2.3538548946380615, "kl": 0.134765625, "learning_rate": 6.031963470319634e-07, "loss": 0.0054, "reward": 1.6927083730697632, "reward_std": 0.20856676995754242, "rewards/accuracy_reward": 0.7083333134651184, "rewards/format_reward": 0.984375, "step": 869 }, { "completion_length": 67.84375, "epoch": 3.9726027397260273, "grad_norm": 3.3203752040863037, "kl": 0.19482421875, "learning_rate": 6.027397260273972e-07, "loss": 0.0078, "reward": 1.682466745376587, "reward_std": 0.20432885736227036, "rewards/accuracy_reward": 0.6824667155742645, "rewards/format_reward": 1.0, "step": 870 }, { "completion_length": 60.8046875, "epoch": 3.9771689497716896, "grad_norm": 4.509641647338867, "kl": 0.12548828125, "learning_rate": 6.02283105022831e-07, "loss": 0.005, "reward": 1.5753461122512817, "reward_std": 0.23474501818418503, "rewards/accuracy_reward": 0.5753461122512817, "rewards/format_reward": 1.0, "step": 871 }, { "completion_length": 89.40625, "epoch": 3.981735159817352, "grad_norm": 2.3428871631622314, "kl": 0.113037109375, "learning_rate": 6.018264840182648e-07, "loss": 0.0045, "reward": 1.8376488089561462, "reward_std": 0.12626906298100948, "rewards/accuracy_reward": 0.8454612195491791, "rewards/format_reward": 0.9921875, "step": 872 }, { "completion_length": 70.9921875, "epoch": 3.9863013698630136, "grad_norm": 1.8384939432144165, "kl": 0.12890625, "learning_rate": 6.013698630136987e-07, "loss": 0.0052, "reward": 1.6695202589035034, "reward_std": 0.13698631152510643, "rewards/accuracy_reward": 0.6695202589035034, "rewards/format_reward": 1.0, "step": 873 }, { "completion_length": 72.8828125, "epoch": 3.990867579908676, "grad_norm": 2.790123224258423, "kl": 0.132568359375, "learning_rate": 6.009132420091324e-07, "loss": 0.0053, "reward": 1.5515252947807312, "reward_std": 0.24770487844944, "rewards/accuracy_reward": 0.5749628096818924, "rewards/format_reward": 0.9765625, "step": 874 }, { "completion_length": 70.765625, "epoch": 3.9954337899543377, "grad_norm": 3.470076084136963, "kl": 0.09423828125, "learning_rate": 6.004566210045661e-07, "loss": 0.0038, "reward": 1.8020795583724976, "reward_std": 0.08546407520771027, "rewards/accuracy_reward": 0.8098919987678528, "rewards/format_reward": 0.9921875, "step": 875 }, { "completion_length": 37.0, "epoch": 4.0, "grad_norm": 3.645047903060913, "kl": 0.158203125, "learning_rate": 6e-07, "loss": 0.0061, "reward": 1.5, "reward_std": 0.1767766922712326, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 1.0, "step": 876 }, { "completion_length": 63.9921875, "epoch": 4.004566210045662, "grad_norm": 8.070154190063477, "kl": 0.143310546875, "learning_rate": 5.995433789954337e-07, "loss": 0.0057, "reward": 1.726302146911621, "reward_std": 0.1457817256450653, "rewards/accuracy_reward": 0.7263020873069763, "rewards/format_reward": 1.0, "step": 877 }, { "completion_length": 80.28125, "epoch": 4.0091324200913245, "grad_norm": 3.771700620651245, "kl": 0.1318359375, "learning_rate": 5.990867579908675e-07, "loss": 0.0053, "reward": 1.7184895873069763, "reward_std": 0.14457330107688904, "rewards/accuracy_reward": 0.7263020873069763, "rewards/format_reward": 0.9921875, "step": 878 }, { "completion_length": 55.3828125, "epoch": 4.013698630136986, "grad_norm": 3.114535093307495, "kl": 0.193359375, "learning_rate": 5.986301369863014e-07, "loss": 0.0077, "reward": 1.699999988079071, "reward_std": 0.24036270380020142, "rewards/accuracy_reward": 0.715624988079071, "rewards/format_reward": 0.984375, "step": 879 }, { "completion_length": 68.9921875, "epoch": 4.018264840182648, "grad_norm": 5.49289608001709, "kl": 0.1513671875, "learning_rate": 5.981735159817351e-07, "loss": 0.006, "reward": 1.7083333730697632, "reward_std": 0.14341074973344803, "rewards/accuracy_reward": 0.7083333134651184, "rewards/format_reward": 1.0, "step": 880 }, { "completion_length": 68.1484375, "epoch": 4.0228310502283104, "grad_norm": 8.557650566101074, "kl": 0.47802734375, "learning_rate": 5.97716894977169e-07, "loss": 0.0191, "reward": 1.7641276121139526, "reward_std": 0.13008537888526917, "rewards/accuracy_reward": 0.7641275823116302, "rewards/format_reward": 1.0, "step": 881 }, { "completion_length": 74.2265625, "epoch": 4.027397260273973, "grad_norm": 1.5429102182388306, "kl": 0.1396484375, "learning_rate": 5.972602739726027e-07, "loss": 0.0056, "reward": 1.7580729722976685, "reward_std": 0.1651972383260727, "rewards/accuracy_reward": 0.7736978530883789, "rewards/format_reward": 0.984375, "step": 882 }, { "completion_length": 60.5703125, "epoch": 4.031963470319635, "grad_norm": 1.6412632465362549, "kl": 0.123779296875, "learning_rate": 5.968036529680364e-07, "loss": 0.005, "reward": 1.7804688215255737, "reward_std": 0.1583872102200985, "rewards/accuracy_reward": 0.7882812321186066, "rewards/format_reward": 0.9921875, "step": 883 }, { "completion_length": 74.703125, "epoch": 4.036529680365296, "grad_norm": 5.128458499908447, "kl": 0.2080078125, "learning_rate": 5.963470319634703e-07, "loss": 0.0083, "reward": 1.706798791885376, "reward_std": 0.19644346833229065, "rewards/accuracy_reward": 0.7302362024784088, "rewards/format_reward": 0.9765625, "step": 884 }, { "completion_length": 60.921875, "epoch": 4.041095890410959, "grad_norm": 2.6405813694000244, "kl": 0.14892578125, "learning_rate": 5.958904109589041e-07, "loss": 0.006, "reward": 1.7202391624450684, "reward_std": 0.218642920255661, "rewards/accuracy_reward": 0.7202391028404236, "rewards/format_reward": 1.0, "step": 885 }, { "completion_length": 66.59375, "epoch": 4.045662100456621, "grad_norm": 2.522268533706665, "kl": 0.1220703125, "learning_rate": 5.95433789954338e-07, "loss": 0.0049, "reward": 1.5917280912399292, "reward_std": 0.18826671689748764, "rewards/accuracy_reward": 0.6073530614376068, "rewards/format_reward": 0.984375, "step": 886 }, { "completion_length": 51.8828125, "epoch": 4.050228310502283, "grad_norm": 2.2267160415649414, "kl": 0.173828125, "learning_rate": 5.949771689497717e-07, "loss": 0.007, "reward": 1.6658853888511658, "reward_std": 0.22664503753185272, "rewards/accuracy_reward": 0.6815103888511658, "rewards/format_reward": 0.984375, "step": 887 }, { "completion_length": 51.3203125, "epoch": 4.054794520547945, "grad_norm": 1.9868104457855225, "kl": 0.1552734375, "learning_rate": 5.945205479452054e-07, "loss": 0.0062, "reward": 1.7429263591766357, "reward_std": 0.18497039377689362, "rewards/accuracy_reward": 0.7741763293743134, "rewards/format_reward": 0.96875, "step": 888 }, { "completion_length": 56.0390625, "epoch": 4.059360730593608, "grad_norm": 2.1909918785095215, "kl": 0.140869140625, "learning_rate": 5.940639269406393e-07, "loss": 0.0056, "reward": 1.726171851158142, "reward_std": 0.20413171127438545, "rewards/accuracy_reward": 0.7339843213558197, "rewards/format_reward": 0.9921875, "step": 889 }, { "completion_length": 69.4609375, "epoch": 4.063926940639269, "grad_norm": 1.6129564046859741, "kl": 0.125732421875, "learning_rate": 5.93607305936073e-07, "loss": 0.005, "reward": 1.7187500596046448, "reward_std": 0.1436246931552887, "rewards/accuracy_reward": 0.734375, "rewards/format_reward": 0.984375, "step": 890 }, { "completion_length": 67.9921875, "epoch": 4.068493150684931, "grad_norm": 3.2070353031158447, "kl": 0.119140625, "learning_rate": 5.931506849315067e-07, "loss": 0.0048, "reward": 1.685523271560669, "reward_std": 0.18283560872077942, "rewards/accuracy_reward": 0.6933356523513794, "rewards/format_reward": 0.9921875, "step": 891 }, { "completion_length": 74.7578125, "epoch": 4.073059360730594, "grad_norm": 2.196234703063965, "kl": 0.092041015625, "learning_rate": 5.926940639269407e-07, "loss": 0.0037, "reward": 1.7648438215255737, "reward_std": 0.10456175357103348, "rewards/accuracy_reward": 0.7648437023162842, "rewards/format_reward": 1.0, "step": 892 }, { "completion_length": 62.0, "epoch": 4.077625570776256, "grad_norm": 3.306205987930298, "kl": 0.12451171875, "learning_rate": 5.922374429223744e-07, "loss": 0.005, "reward": 1.7713721990585327, "reward_std": 0.16517749428749084, "rewards/accuracy_reward": 0.7713720798492432, "rewards/format_reward": 1.0, "step": 893 }, { "completion_length": 59.890625, "epoch": 4.082191780821918, "grad_norm": 2.852937698364258, "kl": 0.13525390625, "learning_rate": 5.917808219178083e-07, "loss": 0.0054, "reward": 1.6403688192367554, "reward_std": 0.19380497932434082, "rewards/accuracy_reward": 0.648181289434433, "rewards/format_reward": 0.9921875, "step": 894 }, { "completion_length": 80.5546875, "epoch": 4.0867579908675795, "grad_norm": 4.440982818603516, "kl": 0.110107421875, "learning_rate": 5.91324200913242e-07, "loss": 0.0044, "reward": 1.7101563215255737, "reward_std": 0.15725971013307571, "rewards/accuracy_reward": 0.7179687321186066, "rewards/format_reward": 0.9921875, "step": 895 }, { "completion_length": 76.8046875, "epoch": 4.091324200913242, "grad_norm": 1.442449927330017, "kl": 0.130859375, "learning_rate": 5.908675799086757e-07, "loss": 0.0052, "reward": 1.831250011920929, "reward_std": 0.0731260534375906, "rewards/accuracy_reward": 0.8312499225139618, "rewards/format_reward": 1.0, "step": 896 }, { "completion_length": 43.0, "epoch": 4.095890410958904, "grad_norm": 5.812748908996582, "kl": 0.21533203125, "learning_rate": 5.904109589041096e-07, "loss": 0.0086, "reward": 1.7768229246139526, "reward_std": 0.22991649061441422, "rewards/accuracy_reward": 0.7768228948116302, "rewards/format_reward": 1.0, "step": 897 }, { "completion_length": 87.3828125, "epoch": 4.100456621004566, "grad_norm": 2.7623848915100098, "kl": 0.093505859375, "learning_rate": 5.899543378995433e-07, "loss": 0.0037, "reward": 1.8036272525787354, "reward_std": 0.11107254587113857, "rewards/accuracy_reward": 0.8114396631717682, "rewards/format_reward": 0.9921875, "step": 898 }, { "completion_length": 91.8125, "epoch": 4.105022831050229, "grad_norm": 2.0738606452941895, "kl": 0.133056640625, "learning_rate": 5.894977168949771e-07, "loss": 0.0053, "reward": 1.7905096411705017, "reward_std": 0.12370277941226959, "rewards/accuracy_reward": 0.8061346113681793, "rewards/format_reward": 0.984375, "step": 899 }, { "completion_length": 76.5546875, "epoch": 4.109589041095891, "grad_norm": 3.7335944175720215, "kl": 0.130859375, "learning_rate": 5.89041095890411e-07, "loss": 0.0052, "reward": 1.6451637148857117, "reward_std": 0.1962077133357525, "rewards/accuracy_reward": 0.6529761403799057, "rewards/format_reward": 0.9921875, "step": 900 }, { "completion_length": 63.0625, "epoch": 4.114155251141552, "grad_norm": 7.811534881591797, "kl": 0.117431640625, "learning_rate": 5.885844748858447e-07, "loss": 0.0047, "reward": 1.6516927480697632, "reward_std": 0.20568183064460754, "rewards/accuracy_reward": 0.651692658662796, "rewards/format_reward": 1.0, "step": 901 }, { "completion_length": 75.40625, "epoch": 4.1187214611872145, "grad_norm": 4.964561462402344, "kl": 0.11474609375, "learning_rate": 5.881278538812785e-07, "loss": 0.0046, "reward": 1.767968773841858, "reward_std": 0.17296937853097916, "rewards/accuracy_reward": 0.7757811844348907, "rewards/format_reward": 0.9921875, "step": 902 }, { "completion_length": 78.671875, "epoch": 4.123287671232877, "grad_norm": 7.2277679443359375, "kl": 0.111083984375, "learning_rate": 5.876712328767123e-07, "loss": 0.0044, "reward": 1.8033853769302368, "reward_std": 0.12047793343663216, "rewards/accuracy_reward": 0.8111978769302368, "rewards/format_reward": 0.9921875, "step": 903 }, { "completion_length": 76.0546875, "epoch": 4.127853881278539, "grad_norm": 3.666255474090576, "kl": 0.14697265625, "learning_rate": 5.87214611872146e-07, "loss": 0.0059, "reward": 1.7567708492279053, "reward_std": 0.15414869785308838, "rewards/accuracy_reward": 0.7645833492279053, "rewards/format_reward": 0.9921875, "step": 904 }, { "completion_length": 64.28125, "epoch": 4.132420091324201, "grad_norm": 6.144541263580322, "kl": 0.15673828125, "learning_rate": 5.8675799086758e-07, "loss": 0.0063, "reward": 1.6296875476837158, "reward_std": 0.23411936312913895, "rewards/accuracy_reward": 0.637499988079071, "rewards/format_reward": 0.9921875, "step": 905 }, { "completion_length": 96.265625, "epoch": 4.136986301369863, "grad_norm": 4.049592018127441, "kl": 0.103759765625, "learning_rate": 5.863013698630137e-07, "loss": 0.0041, "reward": 1.807031273841858, "reward_std": 0.16107044368982315, "rewards/accuracy_reward": 0.8070311546325684, "rewards/format_reward": 1.0, "step": 906 }, { "completion_length": 73.2578125, "epoch": 4.141552511415525, "grad_norm": 1.5114738941192627, "kl": 0.085693359375, "learning_rate": 5.858447488584474e-07, "loss": 0.0034, "reward": 1.6850818991661072, "reward_std": 0.16721044853329659, "rewards/accuracy_reward": 0.69289430975914, "rewards/format_reward": 0.9921875, "step": 907 }, { "completion_length": 56.8515625, "epoch": 4.146118721461187, "grad_norm": 2.3427228927612305, "kl": 0.12890625, "learning_rate": 5.853881278538813e-07, "loss": 0.0052, "reward": 1.756416380405426, "reward_std": 0.1625683754682541, "rewards/accuracy_reward": 0.7642288506031036, "rewards/format_reward": 0.9921875, "step": 908 }, { "completion_length": 85.671875, "epoch": 4.1506849315068495, "grad_norm": 2.933675765991211, "kl": 0.092529296875, "learning_rate": 5.84931506849315e-07, "loss": 0.0037, "reward": 1.723825216293335, "reward_std": 0.14209723100066185, "rewards/accuracy_reward": 0.7238251268863678, "rewards/format_reward": 1.0, "step": 909 }, { "completion_length": 67.4921875, "epoch": 4.155251141552512, "grad_norm": 2.615705966949463, "kl": 0.11328125, "learning_rate": 5.844748858447488e-07, "loss": 0.0045, "reward": 1.6570913791656494, "reward_std": 0.1796240657567978, "rewards/accuracy_reward": 0.664903849363327, "rewards/format_reward": 0.9921875, "step": 910 }, { "completion_length": 76.96875, "epoch": 4.159817351598173, "grad_norm": 1.9582023620605469, "kl": 0.090087890625, "learning_rate": 5.840182648401826e-07, "loss": 0.0036, "reward": 1.7256065011024475, "reward_std": 0.17783351242542267, "rewards/accuracy_reward": 0.7334189414978027, "rewards/format_reward": 0.9921875, "step": 911 }, { "completion_length": 60.40625, "epoch": 4.164383561643835, "grad_norm": 4.850559711456299, "kl": 0.103271484375, "learning_rate": 5.835616438356164e-07, "loss": 0.0041, "reward": 1.524218738079071, "reward_std": 0.2731492444872856, "rewards/accuracy_reward": 0.5320312678813934, "rewards/format_reward": 0.9921875, "step": 912 }, { "completion_length": 50.765625, "epoch": 4.168949771689498, "grad_norm": 2.815920352935791, "kl": 0.18408203125, "learning_rate": 5.831050228310503e-07, "loss": 0.0074, "reward": 1.653542160987854, "reward_std": 0.23784886300563812, "rewards/accuracy_reward": 0.6691671311855316, "rewards/format_reward": 0.984375, "step": 913 }, { "completion_length": 76.1875, "epoch": 4.17351598173516, "grad_norm": 2.3140416145324707, "kl": 0.14208984375, "learning_rate": 5.82648401826484e-07, "loss": 0.0057, "reward": 1.6864583492279053, "reward_std": 0.1315075010061264, "rewards/accuracy_reward": 0.6864583194255829, "rewards/format_reward": 1.0, "step": 914 }, { "completion_length": 73.3515625, "epoch": 4.178082191780822, "grad_norm": 3.9995479583740234, "kl": 0.12451171875, "learning_rate": 5.821917808219177e-07, "loss": 0.005, "reward": 1.7815169095993042, "reward_std": 0.08393021672964096, "rewards/accuracy_reward": 0.7815168499946594, "rewards/format_reward": 1.0, "step": 915 }, { "completion_length": 66.3515625, "epoch": 4.182648401826484, "grad_norm": 1.871342420578003, "kl": 0.12353515625, "learning_rate": 5.817351598173516e-07, "loss": 0.0049, "reward": 1.7460670471191406, "reward_std": 0.15686815977096558, "rewards/accuracy_reward": 0.7538795471191406, "rewards/format_reward": 0.9921875, "step": 916 }, { "completion_length": 77.828125, "epoch": 4.187214611872146, "grad_norm": 1.733489751815796, "kl": 0.095458984375, "learning_rate": 5.812785388127853e-07, "loss": 0.0038, "reward": 1.7660456895828247, "reward_std": 0.16200437024235725, "rewards/accuracy_reward": 0.7816706299781799, "rewards/format_reward": 0.984375, "step": 917 }, { "completion_length": 65.046875, "epoch": 4.191780821917808, "grad_norm": 3.412703275680542, "kl": 0.13232421875, "learning_rate": 5.808219178082191e-07, "loss": 0.0053, "reward": 1.6960819363594055, "reward_std": 0.19595444947481155, "rewards/accuracy_reward": 0.7117068469524384, "rewards/format_reward": 0.984375, "step": 918 }, { "completion_length": 83.4140625, "epoch": 4.19634703196347, "grad_norm": 1.8262677192687988, "kl": 0.120849609375, "learning_rate": 5.80365296803653e-07, "loss": 0.0048, "reward": 1.7575623989105225, "reward_std": 0.1326010897755623, "rewards/accuracy_reward": 0.7653749287128448, "rewards/format_reward": 0.9921875, "step": 919 }, { "completion_length": 86.09375, "epoch": 4.200913242009133, "grad_norm": 4.315945625305176, "kl": 0.10107421875, "learning_rate": 5.799086757990867e-07, "loss": 0.004, "reward": 1.701339304447174, "reward_std": 0.14829950034618378, "rewards/accuracy_reward": 0.7013393044471741, "rewards/format_reward": 1.0, "step": 920 }, { "completion_length": 74.46875, "epoch": 4.205479452054795, "grad_norm": 1.9493387937545776, "kl": 0.08251953125, "learning_rate": 5.794520547945206e-07, "loss": 0.0033, "reward": 1.8070870637893677, "reward_std": 0.036666832864284515, "rewards/accuracy_reward": 0.8070869743824005, "rewards/format_reward": 1.0, "step": 921 }, { "completion_length": 77.328125, "epoch": 4.210045662100456, "grad_norm": 1.4725087881088257, "kl": 0.1162109375, "learning_rate": 5.789954337899543e-07, "loss": 0.0047, "reward": 1.8280134201049805, "reward_std": 0.10532564483582973, "rewards/accuracy_reward": 0.8358259201049805, "rewards/format_reward": 0.9921875, "step": 922 }, { "completion_length": 72.9921875, "epoch": 4.2146118721461185, "grad_norm": 4.775530815124512, "kl": 0.13037109375, "learning_rate": 5.78538812785388e-07, "loss": 0.0052, "reward": 1.7100632786750793, "reward_std": 0.17319176718592644, "rewards/accuracy_reward": 0.7178757190704346, "rewards/format_reward": 0.9921875, "step": 923 }, { "completion_length": 70.015625, "epoch": 4.219178082191781, "grad_norm": 2.908025026321411, "kl": 0.162109375, "learning_rate": 5.780821917808219e-07, "loss": 0.0065, "reward": 1.6276227831840515, "reward_std": 0.22536901384592056, "rewards/accuracy_reward": 0.6432477831840515, "rewards/format_reward": 0.984375, "step": 924 }, { "completion_length": 59.578125, "epoch": 4.223744292237443, "grad_norm": 5.317495346069336, "kl": 0.10595703125, "learning_rate": 5.776255707762557e-07, "loss": 0.0042, "reward": 1.6319011449813843, "reward_std": 0.18694934993982315, "rewards/accuracy_reward": 0.6397135257720947, "rewards/format_reward": 0.9921875, "step": 925 }, { "completion_length": 81.40625, "epoch": 4.228310502283105, "grad_norm": 2.3484511375427246, "kl": 0.091796875, "learning_rate": 5.771689497716896e-07, "loss": 0.0037, "reward": 1.7912667393684387, "reward_std": 0.08569350093603134, "rewards/accuracy_reward": 0.7912667095661163, "rewards/format_reward": 1.0, "step": 926 }, { "completion_length": 68.578125, "epoch": 4.232876712328767, "grad_norm": 2.1061465740203857, "kl": 0.125244140625, "learning_rate": 5.767123287671233e-07, "loss": 0.005, "reward": 1.8626301884651184, "reward_std": 0.10864401236176491, "rewards/accuracy_reward": 0.8626301884651184, "rewards/format_reward": 1.0, "step": 927 }, { "completion_length": 74.9765625, "epoch": 4.237442922374429, "grad_norm": 3.035919427871704, "kl": 0.13134765625, "learning_rate": 5.76255707762557e-07, "loss": 0.0053, "reward": 1.6922495365142822, "reward_std": 0.15072567015886307, "rewards/accuracy_reward": 0.7156869769096375, "rewards/format_reward": 0.9765625, "step": 928 }, { "completion_length": 58.78125, "epoch": 4.242009132420091, "grad_norm": 2.8555891513824463, "kl": 0.16064453125, "learning_rate": 5.757990867579909e-07, "loss": 0.0064, "reward": 1.8009114861488342, "reward_std": 0.18833597749471664, "rewards/accuracy_reward": 0.8009114265441895, "rewards/format_reward": 1.0, "step": 929 }, { "completion_length": 79.625, "epoch": 4.2465753424657535, "grad_norm": 4.95534086227417, "kl": 0.113037109375, "learning_rate": 5.753424657534246e-07, "loss": 0.0045, "reward": 1.668163239955902, "reward_std": 0.20845329016447067, "rewards/accuracy_reward": 0.6759756505489349, "rewards/format_reward": 0.9921875, "step": 930 }, { "completion_length": 57.59375, "epoch": 4.251141552511416, "grad_norm": 3.5745315551757812, "kl": 0.158203125, "learning_rate": 5.748858447488583e-07, "loss": 0.0063, "reward": 1.7814725637435913, "reward_std": 0.1947070211172104, "rewards/accuracy_reward": 0.7814724445343018, "rewards/format_reward": 1.0, "step": 931 }, { "completion_length": 69.296875, "epoch": 4.255707762557078, "grad_norm": 2.9775948524475098, "kl": 0.129638671875, "learning_rate": 5.744292237442923e-07, "loss": 0.0052, "reward": 1.5528324842453003, "reward_std": 0.23214496672153473, "rewards/accuracy_reward": 0.5684574842453003, "rewards/format_reward": 0.984375, "step": 932 }, { "completion_length": 85.0859375, "epoch": 4.260273972602739, "grad_norm": 2.191046714782715, "kl": 0.13232421875, "learning_rate": 5.73972602739726e-07, "loss": 0.0053, "reward": 1.666010558605194, "reward_std": 0.16855772212147713, "rewards/accuracy_reward": 0.6816355586051941, "rewards/format_reward": 0.984375, "step": 933 }, { "completion_length": 84.921875, "epoch": 4.264840182648402, "grad_norm": 1.6789612770080566, "kl": 0.0821533203125, "learning_rate": 5.735159817351598e-07, "loss": 0.0033, "reward": 1.749913215637207, "reward_std": 0.07257125526666641, "rewards/accuracy_reward": 0.7499131858348846, "rewards/format_reward": 1.0, "step": 934 }, { "completion_length": 78.6171875, "epoch": 4.269406392694064, "grad_norm": 4.655454158782959, "kl": 0.116455078125, "learning_rate": 5.730593607305936e-07, "loss": 0.0047, "reward": 1.7203125953674316, "reward_std": 0.14768873527646065, "rewards/accuracy_reward": 0.7203124463558197, "rewards/format_reward": 1.0, "step": 935 }, { "completion_length": 79.765625, "epoch": 4.273972602739726, "grad_norm": 2.1534454822540283, "kl": 0.093017578125, "learning_rate": 5.726027397260273e-07, "loss": 0.0037, "reward": 1.7255195379257202, "reward_std": 0.18480905890464783, "rewards/accuracy_reward": 0.7489570677280426, "rewards/format_reward": 0.9765625, "step": 936 }, { "completion_length": 82.1015625, "epoch": 4.2785388127853885, "grad_norm": 4.468496799468994, "kl": 0.081298828125, "learning_rate": 5.721461187214612e-07, "loss": 0.0032, "reward": 1.8500558733940125, "reward_std": 0.13352815061807632, "rewards/accuracy_reward": 0.8578682541847229, "rewards/format_reward": 0.9921875, "step": 937 }, { "completion_length": 55.5078125, "epoch": 4.28310502283105, "grad_norm": 2.743320941925049, "kl": 0.18212890625, "learning_rate": 5.716894977168949e-07, "loss": 0.0073, "reward": 1.5885499119758606, "reward_std": 0.20670751482248306, "rewards/accuracy_reward": 0.6041748821735382, "rewards/format_reward": 0.984375, "step": 938 }, { "completion_length": 77.984375, "epoch": 4.287671232876712, "grad_norm": 3.1169564723968506, "kl": 0.102783203125, "learning_rate": 5.712328767123287e-07, "loss": 0.0041, "reward": 1.6959820985794067, "reward_std": 0.1572049930691719, "rewards/accuracy_reward": 0.7037945985794067, "rewards/format_reward": 0.9921875, "step": 939 }, { "completion_length": 78.359375, "epoch": 4.292237442922374, "grad_norm": 1.5095194578170776, "kl": 0.11083984375, "learning_rate": 5.707762557077626e-07, "loss": 0.0044, "reward": 1.7596355080604553, "reward_std": 0.1378481425344944, "rewards/accuracy_reward": 0.7830728888511658, "rewards/format_reward": 0.9765625, "step": 940 }, { "completion_length": 61.1171875, "epoch": 4.296803652968037, "grad_norm": 2.8175227642059326, "kl": 0.153076171875, "learning_rate": 5.703196347031963e-07, "loss": 0.0061, "reward": 1.6882672905921936, "reward_std": 0.1907612383365631, "rewards/accuracy_reward": 0.696079820394516, "rewards/format_reward": 0.9921875, "step": 941 }, { "completion_length": 77.0546875, "epoch": 4.301369863013699, "grad_norm": 2.724752426147461, "kl": 0.14306640625, "learning_rate": 5.698630136986301e-07, "loss": 0.0057, "reward": 1.6338477730751038, "reward_std": 0.1553211621940136, "rewards/accuracy_reward": 0.633847713470459, "rewards/format_reward": 1.0, "step": 942 }, { "completion_length": 72.0625, "epoch": 4.30593607305936, "grad_norm": 1.5134310722351074, "kl": 0.109375, "learning_rate": 5.694063926940639e-07, "loss": 0.0044, "reward": 1.8307477235794067, "reward_std": 0.10607551783323288, "rewards/accuracy_reward": 0.8307477533817291, "rewards/format_reward": 1.0, "step": 943 }, { "completion_length": 84.6796875, "epoch": 4.310502283105023, "grad_norm": 1.9289695024490356, "kl": 0.120849609375, "learning_rate": 5.689497716894976e-07, "loss": 0.0048, "reward": 1.7625186443328857, "reward_std": 0.14161107502877712, "rewards/accuracy_reward": 0.7703310549259186, "rewards/format_reward": 0.9921875, "step": 944 }, { "completion_length": 68.6171875, "epoch": 4.315068493150685, "grad_norm": 2.0734076499938965, "kl": 0.149169921875, "learning_rate": 5.684931506849316e-07, "loss": 0.006, "reward": 1.6850000619888306, "reward_std": 0.14654473960399628, "rewards/accuracy_reward": 0.6928124725818634, "rewards/format_reward": 0.9921875, "step": 945 }, { "completion_length": 68.4765625, "epoch": 4.319634703196347, "grad_norm": 2.146794080734253, "kl": 0.11572265625, "learning_rate": 5.680365296803653e-07, "loss": 0.0046, "reward": 1.70016747713089, "reward_std": 0.20281969010829926, "rewards/accuracy_reward": 0.7157924175262451, "rewards/format_reward": 0.984375, "step": 946 }, { "completion_length": 68.03125, "epoch": 4.324200913242009, "grad_norm": 2.0439021587371826, "kl": 0.126953125, "learning_rate": 5.67579908675799e-07, "loss": 0.0051, "reward": 1.8492187857627869, "reward_std": 0.09047675505280495, "rewards/accuracy_reward": 0.8492186665534973, "rewards/format_reward": 1.0, "step": 947 }, { "completion_length": 68.6953125, "epoch": 4.328767123287671, "grad_norm": 5.651437759399414, "kl": 0.129638671875, "learning_rate": 5.671232876712329e-07, "loss": 0.0052, "reward": 1.7400199174880981, "reward_std": 0.19576279073953629, "rewards/accuracy_reward": 0.7478323876857758, "rewards/format_reward": 0.9921875, "step": 948 }, { "completion_length": 92.3828125, "epoch": 4.333333333333333, "grad_norm": 2.3988163471221924, "kl": 0.107666015625, "learning_rate": 5.666666666666666e-07, "loss": 0.0043, "reward": 1.7797867059707642, "reward_std": 0.091391421854496, "rewards/accuracy_reward": 0.7797866761684418, "rewards/format_reward": 1.0, "step": 949 }, { "completion_length": 65.796875, "epoch": 4.337899543378995, "grad_norm": 4.604662895202637, "kl": 0.14306640625, "learning_rate": 5.662100456621004e-07, "loss": 0.0057, "reward": 1.5967974662780762, "reward_std": 0.19326772540807724, "rewards/accuracy_reward": 0.596797525882721, "rewards/format_reward": 1.0, "step": 950 }, { "completion_length": 65.34375, "epoch": 4.342465753424658, "grad_norm": 2.6304314136505127, "kl": 0.194091796875, "learning_rate": 5.657534246575342e-07, "loss": 0.0078, "reward": 1.6250000596046448, "reward_std": 0.22805283963680267, "rewards/accuracy_reward": 0.6484375, "rewards/format_reward": 0.9765625, "step": 951 }, { "completion_length": 79.5390625, "epoch": 4.34703196347032, "grad_norm": 26.240947723388672, "kl": 0.1016845703125, "learning_rate": 5.65296803652968e-07, "loss": 0.0041, "reward": 1.7042073011398315, "reward_std": 0.1676829382777214, "rewards/accuracy_reward": 0.7120197415351868, "rewards/format_reward": 0.9921875, "step": 952 }, { "completion_length": 74.0390625, "epoch": 4.351598173515982, "grad_norm": 2.3484628200531006, "kl": 0.178466796875, "learning_rate": 5.648401826484019e-07, "loss": 0.0072, "reward": 1.7171673774719238, "reward_std": 0.16094867885112762, "rewards/accuracy_reward": 0.7249797880649567, "rewards/format_reward": 0.9921875, "step": 953 }, { "completion_length": 74.578125, "epoch": 4.3561643835616435, "grad_norm": 42.42313766479492, "kl": 0.120361328125, "learning_rate": 5.643835616438356e-07, "loss": 0.0048, "reward": 1.6389508843421936, "reward_std": 0.1658247858285904, "rewards/accuracy_reward": 0.638950914144516, "rewards/format_reward": 1.0, "step": 954 }, { "completion_length": 76.90625, "epoch": 4.360730593607306, "grad_norm": 12.59420108795166, "kl": 0.126220703125, "learning_rate": 5.639269406392693e-07, "loss": 0.005, "reward": 1.704687476158142, "reward_std": 0.18361148238182068, "rewards/accuracy_reward": 0.7046875059604645, "rewards/format_reward": 1.0, "step": 955 }, { "completion_length": 84.125, "epoch": 4.365296803652968, "grad_norm": 2.1863152980804443, "kl": 0.107177734375, "learning_rate": 5.634703196347032e-07, "loss": 0.0043, "reward": 1.6656250953674316, "reward_std": 0.1505398079752922, "rewards/accuracy_reward": 0.6812499761581421, "rewards/format_reward": 0.984375, "step": 956 }, { "completion_length": 66.25, "epoch": 4.36986301369863, "grad_norm": 5.098891735076904, "kl": 0.134033203125, "learning_rate": 5.630136986301369e-07, "loss": 0.0054, "reward": 1.7317472100257874, "reward_std": 0.13505896925926208, "rewards/accuracy_reward": 0.7395596504211426, "rewards/format_reward": 0.9921875, "step": 957 }, { "completion_length": 66.5078125, "epoch": 4.3744292237442925, "grad_norm": 10.807661056518555, "kl": 0.119873046875, "learning_rate": 5.625570776255707e-07, "loss": 0.0048, "reward": 1.7822917103767395, "reward_std": 0.11853177845478058, "rewards/accuracy_reward": 0.7822916507720947, "rewards/format_reward": 1.0, "step": 958 }, { "completion_length": 67.375, "epoch": 4.378995433789954, "grad_norm": 1.9763253927230835, "kl": 0.16064453125, "learning_rate": 5.621004566210046e-07, "loss": 0.0064, "reward": 1.6397321224212646, "reward_std": 0.2425994575023651, "rewards/accuracy_reward": 0.6631696224212646, "rewards/format_reward": 0.9765625, "step": 959 }, { "completion_length": 73.609375, "epoch": 4.383561643835616, "grad_norm": 11.640439987182617, "kl": 0.12841796875, "learning_rate": 5.616438356164383e-07, "loss": 0.0051, "reward": 1.6920552849769592, "reward_std": 0.23473482113331556, "rewards/accuracy_reward": 0.7154927253723145, "rewards/format_reward": 0.9765625, "step": 960 }, { "completion_length": 67.1640625, "epoch": 4.3881278538812785, "grad_norm": 4.730476379394531, "kl": 0.091796875, "learning_rate": 5.611872146118722e-07, "loss": 0.0037, "reward": 1.8379933834075928, "reward_std": 0.07210628129541874, "rewards/accuracy_reward": 0.8379934132099152, "rewards/format_reward": 1.0, "step": 961 }, { "completion_length": 95.25, "epoch": 4.392694063926941, "grad_norm": 1.63775634765625, "kl": 0.069091796875, "learning_rate": 5.607305936073059e-07, "loss": 0.0028, "reward": 1.818750023841858, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.8343749344348907, "rewards/format_reward": 0.984375, "step": 962 }, { "completion_length": 58.0078125, "epoch": 4.397260273972603, "grad_norm": 2.0879337787628174, "kl": 0.162109375, "learning_rate": 5.602739726027396e-07, "loss": 0.0065, "reward": 1.810954213142395, "reward_std": 0.18426654487848282, "rewards/accuracy_reward": 0.818766713142395, "rewards/format_reward": 0.9921875, "step": 963 }, { "completion_length": 60.8125, "epoch": 4.401826484018265, "grad_norm": 4.00584077835083, "kl": 0.1513671875, "learning_rate": 5.598173515981735e-07, "loss": 0.0061, "reward": 1.765178620815277, "reward_std": 0.20918096601963043, "rewards/accuracy_reward": 0.7729910910129547, "rewards/format_reward": 0.9921875, "step": 964 }, { "completion_length": 62.6171875, "epoch": 4.406392694063927, "grad_norm": 2.43243408203125, "kl": 0.134521484375, "learning_rate": 5.593607305936073e-07, "loss": 0.0054, "reward": 1.7512852549552917, "reward_std": 0.20323559269309044, "rewards/accuracy_reward": 0.7512852847576141, "rewards/format_reward": 1.0, "step": 965 }, { "completion_length": 87.6875, "epoch": 4.410958904109589, "grad_norm": 2.643324375152588, "kl": 0.10693359375, "learning_rate": 5.589041095890411e-07, "loss": 0.0043, "reward": 1.7709821462631226, "reward_std": 0.12626906856894493, "rewards/accuracy_reward": 0.7787946164608002, "rewards/format_reward": 0.9921875, "step": 966 }, { "completion_length": 73.7421875, "epoch": 4.415525114155251, "grad_norm": 3.2244210243225098, "kl": 0.10791015625, "learning_rate": 5.584474885844749e-07, "loss": 0.0043, "reward": 1.76171875, "reward_std": 0.18859816156327724, "rewards/accuracy_reward": 0.7695312201976776, "rewards/format_reward": 0.9921875, "step": 967 }, { "completion_length": 77.84375, "epoch": 4.420091324200913, "grad_norm": 4.4358625411987305, "kl": 0.11962890625, "learning_rate": 5.579908675799086e-07, "loss": 0.0048, "reward": 1.8305882215499878, "reward_std": 0.09982336312532425, "rewards/accuracy_reward": 0.8305881917476654, "rewards/format_reward": 1.0, "step": 968 }, { "completion_length": 62.171875, "epoch": 4.424657534246576, "grad_norm": 4.14423942565918, "kl": 0.133056640625, "learning_rate": 5.575342465753425e-07, "loss": 0.0053, "reward": 1.6299851536750793, "reward_std": 0.19754792749881744, "rewards/accuracy_reward": 0.637797623872757, "rewards/format_reward": 0.9921875, "step": 969 }, { "completion_length": 83.1953125, "epoch": 4.429223744292237, "grad_norm": 1.427676796913147, "kl": 0.087158203125, "learning_rate": 5.570776255707762e-07, "loss": 0.0035, "reward": 1.8171875476837158, "reward_std": 0.11048543266952038, "rewards/accuracy_reward": 0.8249998986721039, "rewards/format_reward": 0.9921875, "step": 970 }, { "completion_length": 83.6484375, "epoch": 4.433789954337899, "grad_norm": 2.3894028663635254, "kl": 0.098388671875, "learning_rate": 5.566210045662099e-07, "loss": 0.0039, "reward": 1.7342448234558105, "reward_std": 0.24424592405557632, "rewards/accuracy_reward": 0.7576822340488434, "rewards/format_reward": 0.9765625, "step": 971 }, { "completion_length": 86.203125, "epoch": 4.438356164383562, "grad_norm": 2.410252571105957, "kl": 0.103515625, "learning_rate": 5.561643835616439e-07, "loss": 0.0041, "reward": 1.8436079621315002, "reward_std": 0.08567275106906891, "rewards/accuracy_reward": 0.8514204323291779, "rewards/format_reward": 0.9921875, "step": 972 }, { "completion_length": 67.296875, "epoch": 4.442922374429224, "grad_norm": 3.1359169483184814, "kl": 0.156982421875, "learning_rate": 5.557077625570776e-07, "loss": 0.0063, "reward": 1.6325520277023315, "reward_std": 0.1258198693394661, "rewards/accuracy_reward": 0.6325520575046539, "rewards/format_reward": 1.0, "step": 973 }, { "completion_length": 75.625, "epoch": 4.447488584474886, "grad_norm": 3.9389309883117676, "kl": 0.17529296875, "learning_rate": 5.552511415525114e-07, "loss": 0.007, "reward": 1.6369017958641052, "reward_std": 0.21986636519432068, "rewards/accuracy_reward": 0.6447142958641052, "rewards/format_reward": 0.9921875, "step": 974 }, { "completion_length": 75.4765625, "epoch": 4.4520547945205475, "grad_norm": 19.899776458740234, "kl": 0.638916015625, "learning_rate": 5.547945205479452e-07, "loss": 0.0255, "reward": 1.819531261920929, "reward_std": 0.07878133933991194, "rewards/accuracy_reward": 0.819531261920929, "rewards/format_reward": 1.0, "step": 975 }, { "completion_length": 73.5625, "epoch": 4.45662100456621, "grad_norm": 3.0659143924713135, "kl": 0.095703125, "learning_rate": 5.543378995433789e-07, "loss": 0.0038, "reward": 1.6691592335700989, "reward_std": 0.19636988639831543, "rewards/accuracy_reward": 0.6847842335700989, "rewards/format_reward": 0.984375, "step": 976 }, { "completion_length": 82.0, "epoch": 4.461187214611872, "grad_norm": 2.006531000137329, "kl": 0.101318359375, "learning_rate": 5.538812785388128e-07, "loss": 0.0041, "reward": 1.8178571462631226, "reward_std": 0.1673773005604744, "rewards/accuracy_reward": 0.8334820866584778, "rewards/format_reward": 0.984375, "step": 977 }, { "completion_length": 55.5234375, "epoch": 4.465753424657534, "grad_norm": 2.1843132972717285, "kl": 0.158203125, "learning_rate": 5.534246575342465e-07, "loss": 0.0063, "reward": 1.770518183708191, "reward_std": 0.19590065628290176, "rewards/accuracy_reward": 0.7783306241035461, "rewards/format_reward": 0.9921875, "step": 978 }, { "completion_length": 54.3984375, "epoch": 4.470319634703197, "grad_norm": 5.7283830642700195, "kl": 0.16845703125, "learning_rate": 5.529680365296803e-07, "loss": 0.0067, "reward": 1.692187488079071, "reward_std": 0.193861223757267, "rewards/accuracy_reward": 0.6999999582767487, "rewards/format_reward": 0.9921875, "step": 979 }, { "completion_length": 73.6171875, "epoch": 4.474885844748858, "grad_norm": 4.302793979644775, "kl": 0.139404296875, "learning_rate": 5.525114155251142e-07, "loss": 0.0056, "reward": 1.7293124198913574, "reward_std": 0.14934544544667006, "rewards/accuracy_reward": 0.7293124198913574, "rewards/format_reward": 1.0, "step": 980 }, { "completion_length": 87.0625, "epoch": 4.47945205479452, "grad_norm": 2.7895452976226807, "kl": 0.115966796875, "learning_rate": 5.520547945205479e-07, "loss": 0.0046, "reward": 1.7789062857627869, "reward_std": 0.09265873953700066, "rewards/accuracy_reward": 0.7789061963558197, "rewards/format_reward": 1.0, "step": 981 }, { "completion_length": 78.3984375, "epoch": 4.4840182648401825, "grad_norm": 3.9894137382507324, "kl": 0.1279296875, "learning_rate": 5.515981735159817e-07, "loss": 0.0051, "reward": 1.6846325397491455, "reward_std": 0.18365756422281265, "rewards/accuracy_reward": 0.7002575993537903, "rewards/format_reward": 0.984375, "step": 982 }, { "completion_length": 76.875, "epoch": 4.488584474885845, "grad_norm": 3.384289503097534, "kl": 0.150634765625, "learning_rate": 5.511415525114155e-07, "loss": 0.006, "reward": 1.7085938453674316, "reward_std": 0.20954116433858871, "rewards/accuracy_reward": 0.7320311665534973, "rewards/format_reward": 0.9765625, "step": 983 }, { "completion_length": 49.8671875, "epoch": 4.493150684931507, "grad_norm": 5.41481876373291, "kl": 0.14111328125, "learning_rate": 5.506849315068492e-07, "loss": 0.0056, "reward": 1.5635416507720947, "reward_std": 0.23787462711334229, "rewards/accuracy_reward": 0.5635416507720947, "rewards/format_reward": 1.0, "step": 984 }, { "completion_length": 87.890625, "epoch": 4.497716894977169, "grad_norm": 1.2277454137802124, "kl": 0.0751953125, "learning_rate": 5.502283105022832e-07, "loss": 0.003, "reward": 1.7507812976837158, "reward_std": 0.05524272099137306, "rewards/accuracy_reward": 0.7507811486721039, "rewards/format_reward": 1.0, "step": 985 }, { "completion_length": 61.9453125, "epoch": 4.502283105022831, "grad_norm": 5.256927490234375, "kl": 0.1865234375, "learning_rate": 5.497716894977169e-07, "loss": 0.0075, "reward": 1.610156238079071, "reward_std": 0.2878893092274666, "rewards/accuracy_reward": 0.625781238079071, "rewards/format_reward": 0.984375, "step": 986 }, { "completion_length": 82.71875, "epoch": 4.506849315068493, "grad_norm": 3.515986919403076, "kl": 0.18603515625, "learning_rate": 5.493150684931506e-07, "loss": 0.0074, "reward": 1.7588542103767395, "reward_std": 0.1455376148223877, "rewards/accuracy_reward": 0.7588541209697723, "rewards/format_reward": 1.0, "step": 987 }, { "completion_length": 67.7578125, "epoch": 4.511415525114155, "grad_norm": 2.741415023803711, "kl": 0.14306640625, "learning_rate": 5.488584474885845e-07, "loss": 0.0057, "reward": 1.5901537537574768, "reward_std": 0.15988682955503464, "rewards/accuracy_reward": 0.5901537537574768, "rewards/format_reward": 1.0, "step": 988 }, { "completion_length": 66.84375, "epoch": 4.5159817351598175, "grad_norm": 2.9208884239196777, "kl": 0.12060546875, "learning_rate": 5.484018264840182e-07, "loss": 0.0048, "reward": 1.7678571939468384, "reward_std": 0.14058196544647217, "rewards/accuracy_reward": 0.7678571343421936, "rewards/format_reward": 1.0, "step": 989 }, { "completion_length": 67.2734375, "epoch": 4.52054794520548, "grad_norm": 3.776416540145874, "kl": 0.126708984375, "learning_rate": 5.47945205479452e-07, "loss": 0.0051, "reward": 1.6418346166610718, "reward_std": 0.1818918213248253, "rewards/accuracy_reward": 0.6418345868587494, "rewards/format_reward": 1.0, "step": 990 }, { "completion_length": 72.625, "epoch": 4.525114155251142, "grad_norm": 2.1685075759887695, "kl": 0.138916015625, "learning_rate": 5.474885844748858e-07, "loss": 0.0056, "reward": 1.7085193395614624, "reward_std": 0.22063866257667542, "rewards/accuracy_reward": 0.72414430975914, "rewards/format_reward": 0.984375, "step": 991 }, { "completion_length": 66.28125, "epoch": 4.529680365296803, "grad_norm": 14.555278778076172, "kl": 0.1513671875, "learning_rate": 5.470319634703196e-07, "loss": 0.006, "reward": 1.6950520873069763, "reward_std": 0.19789891690015793, "rewards/accuracy_reward": 0.6950520873069763, "rewards/format_reward": 1.0, "step": 992 }, { "completion_length": 68.71875, "epoch": 4.534246575342466, "grad_norm": 4.036056995391846, "kl": 0.15283203125, "learning_rate": 5.465753424657535e-07, "loss": 0.0061, "reward": 1.5639322996139526, "reward_std": 0.2465272918343544, "rewards/accuracy_reward": 0.5795572698116302, "rewards/format_reward": 0.984375, "step": 993 }, { "completion_length": 72.296875, "epoch": 4.538812785388128, "grad_norm": 4.413112163543701, "kl": 0.2080078125, "learning_rate": 5.461187214611872e-07, "loss": 0.0083, "reward": 1.6929687857627869, "reward_std": 0.26582426577806473, "rewards/accuracy_reward": 0.7242187261581421, "rewards/format_reward": 0.96875, "step": 994 }, { "completion_length": 71.1171875, "epoch": 4.54337899543379, "grad_norm": 2.5498805046081543, "kl": 0.13623046875, "learning_rate": 5.456621004566209e-07, "loss": 0.0055, "reward": 1.696093738079071, "reward_std": 0.20334278792142868, "rewards/accuracy_reward": 0.7039062678813934, "rewards/format_reward": 0.9921875, "step": 995 }, { "completion_length": 80.1796875, "epoch": 4.5479452054794525, "grad_norm": 1.7603377103805542, "kl": 0.117919921875, "learning_rate": 5.452054794520548e-07, "loss": 0.0047, "reward": 1.6623343229293823, "reward_std": 0.15708915889263153, "rewards/accuracy_reward": 0.6779592037200928, "rewards/format_reward": 0.984375, "step": 996 }, { "completion_length": 56.375, "epoch": 4.552511415525114, "grad_norm": 4.723056316375732, "kl": 0.17236328125, "learning_rate": 5.447488584474885e-07, "loss": 0.0069, "reward": 1.7257593870162964, "reward_std": 0.21335439383983612, "rewards/accuracy_reward": 0.7257594168186188, "rewards/format_reward": 1.0, "step": 997 }, { "completion_length": 59.25, "epoch": 4.557077625570776, "grad_norm": 2.3055455684661865, "kl": 0.1591796875, "learning_rate": 5.442922374429223e-07, "loss": 0.0064, "reward": 1.7297247648239136, "reward_std": 0.17667409405112267, "rewards/accuracy_reward": 0.7375371754169464, "rewards/format_reward": 0.9921875, "step": 998 }, { "completion_length": 55.4296875, "epoch": 4.561643835616438, "grad_norm": 10.311870574951172, "kl": 0.135009765625, "learning_rate": 5.438356164383562e-07, "loss": 0.0054, "reward": 1.48444002866745, "reward_std": 0.28679582476615906, "rewards/accuracy_reward": 0.49225252866744995, "rewards/format_reward": 0.9921875, "step": 999 }, { "completion_length": 60.46875, "epoch": 4.566210045662101, "grad_norm": 3.404649019241333, "kl": 0.1982421875, "learning_rate": 5.433789954337899e-07, "loss": 0.0079, "reward": 1.5433160066604614, "reward_std": 0.2616465389728546, "rewards/accuracy_reward": 0.574565976858139, "rewards/format_reward": 0.96875, "step": 1000 }, { "completion_length": 73.15625, "epoch": 4.570776255707763, "grad_norm": 1.8293238878250122, "kl": 0.118408203125, "learning_rate": 5.429223744292238e-07, "loss": 0.0047, "reward": 1.8298460245132446, "reward_std": 0.1409841626882553, "rewards/accuracy_reward": 0.8454709947109222, "rewards/format_reward": 0.984375, "step": 1001 }, { "completion_length": 76.0703125, "epoch": 4.575342465753424, "grad_norm": 1.9321002960205078, "kl": 0.111328125, "learning_rate": 5.424657534246575e-07, "loss": 0.0044, "reward": 1.725000023841858, "reward_std": 0.15813970565795898, "rewards/accuracy_reward": 0.7328124344348907, "rewards/format_reward": 0.9921875, "step": 1002 }, { "completion_length": 65.5703125, "epoch": 4.579908675799087, "grad_norm": 2.1403932571411133, "kl": 0.133056640625, "learning_rate": 5.420091324200912e-07, "loss": 0.0053, "reward": 1.7848958373069763, "reward_std": 0.21307425945997238, "rewards/accuracy_reward": 0.8083333373069763, "rewards/format_reward": 0.9765625, "step": 1003 }, { "completion_length": 108.484375, "epoch": 4.584474885844749, "grad_norm": 1.755354642868042, "kl": 0.10107421875, "learning_rate": 5.415525114155251e-07, "loss": 0.004, "reward": 1.8961884379386902, "reward_std": 0.060667259618639946, "rewards/accuracy_reward": 0.896188348531723, "rewards/format_reward": 1.0, "step": 1004 }, { "completion_length": 48.4140625, "epoch": 4.589041095890411, "grad_norm": 1.387018084526062, "kl": 0.1494140625, "learning_rate": 5.410958904109589e-07, "loss": 0.006, "reward": 1.8950520753860474, "reward_std": 0.12889967486262321, "rewards/accuracy_reward": 0.9028645753860474, "rewards/format_reward": 0.9921875, "step": 1005 }, { "completion_length": 62.9140625, "epoch": 4.593607305936073, "grad_norm": 3.6895904541015625, "kl": 0.20751953125, "learning_rate": 5.406392694063927e-07, "loss": 0.0083, "reward": 1.6447916626930237, "reward_std": 0.25781603902578354, "rewards/accuracy_reward": 0.6526041626930237, "rewards/format_reward": 0.9921875, "step": 1006 }, { "completion_length": 73.59375, "epoch": 4.598173515981735, "grad_norm": 5.844861030578613, "kl": 0.118896484375, "learning_rate": 5.401826484018265e-07, "loss": 0.0048, "reward": 1.6738625168800354, "reward_std": 0.16072557866573334, "rewards/accuracy_reward": 0.6894874572753906, "rewards/format_reward": 0.984375, "step": 1007 }, { "completion_length": 62.390625, "epoch": 4.602739726027397, "grad_norm": 9.675552368164062, "kl": 0.143798828125, "learning_rate": 5.397260273972602e-07, "loss": 0.0058, "reward": 1.6939173936843872, "reward_std": 0.1647581309080124, "rewards/accuracy_reward": 0.6939173638820648, "rewards/format_reward": 1.0, "step": 1008 }, { "completion_length": 67.1015625, "epoch": 4.607305936073059, "grad_norm": 5.169093132019043, "kl": 0.130859375, "learning_rate": 5.392694063926941e-07, "loss": 0.0052, "reward": 1.6269097924232483, "reward_std": 0.23041004687547684, "rewards/accuracy_reward": 0.6425347030162811, "rewards/format_reward": 0.984375, "step": 1009 }, { "completion_length": 66.6328125, "epoch": 4.6118721461187215, "grad_norm": 1.9441876411437988, "kl": 0.106201171875, "learning_rate": 5.388127853881278e-07, "loss": 0.0043, "reward": 1.5781250596046448, "reward_std": 0.14283225312829018, "rewards/accuracy_reward": 0.578125, "rewards/format_reward": 1.0, "step": 1010 }, { "completion_length": 62.9296875, "epoch": 4.616438356164384, "grad_norm": 3.9866504669189453, "kl": 0.20361328125, "learning_rate": 5.383561643835615e-07, "loss": 0.0081, "reward": 1.718553066253662, "reward_std": 0.1621505692601204, "rewards/accuracy_reward": 0.7185530364513397, "rewards/format_reward": 1.0, "step": 1011 }, { "completion_length": 54.625, "epoch": 4.621004566210045, "grad_norm": 3.620591402053833, "kl": 0.1474609375, "learning_rate": 5.378995433789955e-07, "loss": 0.0059, "reward": 1.813281238079071, "reward_std": 0.07733980193734169, "rewards/accuracy_reward": 0.813281238079071, "rewards/format_reward": 1.0, "step": 1012 }, { "completion_length": 71.109375, "epoch": 4.6255707762557075, "grad_norm": 3.377457857131958, "kl": 0.14208984375, "learning_rate": 5.374429223744292e-07, "loss": 0.0057, "reward": 1.74702388048172, "reward_std": 0.1983274295926094, "rewards/accuracy_reward": 0.7470237612724304, "rewards/format_reward": 1.0, "step": 1013 }, { "completion_length": 59.6171875, "epoch": 4.63013698630137, "grad_norm": 3.1818978786468506, "kl": 0.16064453125, "learning_rate": 5.36986301369863e-07, "loss": 0.0064, "reward": 1.667509913444519, "reward_std": 0.17691579461097717, "rewards/accuracy_reward": 0.6675098836421967, "rewards/format_reward": 1.0, "step": 1014 }, { "completion_length": 62.40625, "epoch": 4.634703196347032, "grad_norm": 2.588076114654541, "kl": 0.198974609375, "learning_rate": 5.365296803652968e-07, "loss": 0.008, "reward": 1.7416791319847107, "reward_std": 0.19379056990146637, "rewards/accuracy_reward": 0.7573040723800659, "rewards/format_reward": 0.984375, "step": 1015 }, { "completion_length": 64.6171875, "epoch": 4.639269406392694, "grad_norm": 2.7476561069488525, "kl": 0.1728515625, "learning_rate": 5.360730593607305e-07, "loss": 0.0069, "reward": 1.8008184432983398, "reward_std": 0.1534968689084053, "rewards/accuracy_reward": 0.8008184134960175, "rewards/format_reward": 1.0, "step": 1016 }, { "completion_length": 71.171875, "epoch": 4.6438356164383565, "grad_norm": 3.0133135318756104, "kl": 0.121826171875, "learning_rate": 5.356164383561644e-07, "loss": 0.0049, "reward": 1.820052146911621, "reward_std": 0.1514080874621868, "rewards/accuracy_reward": 0.8278645575046539, "rewards/format_reward": 0.9921875, "step": 1017 }, { "completion_length": 80.9375, "epoch": 4.648401826484018, "grad_norm": 55.512779235839844, "kl": 0.109375, "learning_rate": 5.351598173515981e-07, "loss": 0.0044, "reward": 1.7374999523162842, "reward_std": 0.1173202246427536, "rewards/accuracy_reward": 0.7374999821186066, "rewards/format_reward": 1.0, "step": 1018 }, { "completion_length": 73.75, "epoch": 4.65296803652968, "grad_norm": 3.2106876373291016, "kl": 0.150390625, "learning_rate": 5.347031963470319e-07, "loss": 0.006, "reward": 1.7260440587997437, "reward_std": 0.22125811874866486, "rewards/accuracy_reward": 0.7338564693927765, "rewards/format_reward": 0.9921875, "step": 1019 }, { "completion_length": 53.90625, "epoch": 4.657534246575342, "grad_norm": 3.0299935340881348, "kl": 0.133056640625, "learning_rate": 5.342465753424658e-07, "loss": 0.0053, "reward": 1.7598958611488342, "reward_std": 0.14248863141983747, "rewards/accuracy_reward": 0.7598957717418671, "rewards/format_reward": 1.0, "step": 1020 }, { "completion_length": 44.4296875, "epoch": 4.662100456621005, "grad_norm": 17.613971710205078, "kl": 0.751953125, "learning_rate": 5.337899543378995e-07, "loss": 0.03, "reward": 1.843323826789856, "reward_std": 0.22472677379846573, "rewards/accuracy_reward": 0.866761326789856, "rewards/format_reward": 0.9765625, "step": 1021 }, { "completion_length": 68.1640625, "epoch": 4.666666666666667, "grad_norm": 3.197855234146118, "kl": 0.171875, "learning_rate": 5.333333333333333e-07, "loss": 0.0069, "reward": 1.805405616760254, "reward_std": 0.15048640966415405, "rewards/accuracy_reward": 0.8054055869579315, "rewards/format_reward": 1.0, "step": 1022 }, { "completion_length": 66.9921875, "epoch": 4.671232876712329, "grad_norm": 3.1693930625915527, "kl": 0.126708984375, "learning_rate": 5.328767123287671e-07, "loss": 0.0051, "reward": 1.6846354603767395, "reward_std": 0.12612807750701904, "rewards/accuracy_reward": 0.6846353709697723, "rewards/format_reward": 1.0, "step": 1023 }, { "completion_length": 81.1484375, "epoch": 4.675799086757991, "grad_norm": 2.464825391769409, "kl": 0.106689453125, "learning_rate": 5.324200913242008e-07, "loss": 0.0043, "reward": 1.8835937976837158, "reward_std": 0.08758953772485256, "rewards/accuracy_reward": 0.8835937082767487, "rewards/format_reward": 1.0, "step": 1024 }, { "completion_length": 64.78125, "epoch": 4.680365296803653, "grad_norm": 3.6387760639190674, "kl": 0.17333984375, "learning_rate": 5.319634703196348e-07, "loss": 0.0069, "reward": 1.7731905579566956, "reward_std": 0.16570382565259933, "rewards/accuracy_reward": 0.7731905579566956, "rewards/format_reward": 1.0, "step": 1025 }, { "completion_length": 60.9296875, "epoch": 4.684931506849315, "grad_norm": 20.164772033691406, "kl": 0.13232421875, "learning_rate": 5.315068493150685e-07, "loss": 0.0053, "reward": 1.689843773841858, "reward_std": 0.17715102434158325, "rewards/accuracy_reward": 0.6976562738418579, "rewards/format_reward": 0.9921875, "step": 1026 }, { "completion_length": 75.0, "epoch": 4.689497716894977, "grad_norm": 2.0931997299194336, "kl": 0.1484375, "learning_rate": 5.310502283105022e-07, "loss": 0.0059, "reward": 1.630468726158142, "reward_std": 0.16156607866287231, "rewards/accuracy_reward": 0.6460937261581421, "rewards/format_reward": 0.984375, "step": 1027 }, { "completion_length": 49.234375, "epoch": 4.69406392694064, "grad_norm": 5.067066192626953, "kl": 0.1962890625, "learning_rate": 5.305936073059361e-07, "loss": 0.0078, "reward": 1.8552082777023315, "reward_std": 0.15139273926615715, "rewards/accuracy_reward": 0.8552083373069763, "rewards/format_reward": 1.0, "step": 1028 }, { "completion_length": 80.9453125, "epoch": 4.698630136986301, "grad_norm": 4.103106498718262, "kl": 0.126708984375, "learning_rate": 5.301369863013698e-07, "loss": 0.0051, "reward": 1.678125023841858, "reward_std": 0.17818758636713028, "rewards/accuracy_reward": 0.6859374642372131, "rewards/format_reward": 0.9921875, "step": 1029 }, { "completion_length": 75.2734375, "epoch": 4.703196347031963, "grad_norm": 4.427400588989258, "kl": 0.25732421875, "learning_rate": 5.296803652968036e-07, "loss": 0.0103, "reward": 1.696877896785736, "reward_std": 0.16482967138290405, "rewards/accuracy_reward": 0.7046903669834137, "rewards/format_reward": 0.9921875, "step": 1030 }, { "completion_length": 56.8515625, "epoch": 4.707762557077626, "grad_norm": 5.5338921546936035, "kl": 0.22607421875, "learning_rate": 5.292237442922374e-07, "loss": 0.0091, "reward": 1.6015625, "reward_std": 0.25800998508930206, "rewards/accuracy_reward": 0.6249999403953552, "rewards/format_reward": 0.9765625, "step": 1031 }, { "completion_length": 68.96875, "epoch": 4.712328767123288, "grad_norm": 2.648360013961792, "kl": 0.138427734375, "learning_rate": 5.287671232876712e-07, "loss": 0.0055, "reward": 1.7213541865348816, "reward_std": 0.1301564909517765, "rewards/accuracy_reward": 0.7291666567325592, "rewards/format_reward": 0.9921875, "step": 1032 }, { "completion_length": 76.53125, "epoch": 4.71689497716895, "grad_norm": 3.0798656940460205, "kl": 0.126953125, "learning_rate": 5.283105022831051e-07, "loss": 0.0051, "reward": 1.7125211358070374, "reward_std": 0.09494294971227646, "rewards/accuracy_reward": 0.7125210464000702, "rewards/format_reward": 1.0, "step": 1033 }, { "completion_length": 68.859375, "epoch": 4.7214611872146115, "grad_norm": 5.309957981109619, "kl": 0.154296875, "learning_rate": 5.278538812785388e-07, "loss": 0.0062, "reward": 1.609375, "reward_std": 0.20264848321676254, "rewards/accuracy_reward": 0.6093749701976776, "rewards/format_reward": 1.0, "step": 1034 }, { "completion_length": 77.921875, "epoch": 4.726027397260274, "grad_norm": 4.624930381774902, "kl": 0.13427734375, "learning_rate": 5.273972602739725e-07, "loss": 0.0054, "reward": 1.7170308232307434, "reward_std": 0.19927022606134415, "rewards/accuracy_reward": 0.7248433232307434, "rewards/format_reward": 0.9921875, "step": 1035 }, { "completion_length": 69.5234375, "epoch": 4.730593607305936, "grad_norm": 3.3374686241149902, "kl": 0.116943359375, "learning_rate": 5.269406392694064e-07, "loss": 0.0047, "reward": 1.6639309525489807, "reward_std": 0.1389038860797882, "rewards/accuracy_reward": 0.6717434823513031, "rewards/format_reward": 0.9921875, "step": 1036 }, { "completion_length": 65.140625, "epoch": 4.735159817351598, "grad_norm": 3.858757495880127, "kl": 0.1416015625, "learning_rate": 5.264840182648401e-07, "loss": 0.0057, "reward": 1.6567708253860474, "reward_std": 0.16906771063804626, "rewards/accuracy_reward": 0.6723958551883698, "rewards/format_reward": 0.984375, "step": 1037 }, { "completion_length": 70.0859375, "epoch": 4.739726027397261, "grad_norm": 3.435810089111328, "kl": 0.156005859375, "learning_rate": 5.260273972602739e-07, "loss": 0.0063, "reward": 1.6902902126312256, "reward_std": 0.19441108405590057, "rewards/accuracy_reward": 0.698102593421936, "rewards/format_reward": 0.9921875, "step": 1038 }, { "completion_length": 60.0859375, "epoch": 4.744292237442922, "grad_norm": 4.343775272369385, "kl": 0.20263671875, "learning_rate": 5.255707762557078e-07, "loss": 0.0081, "reward": 1.8015338778495789, "reward_std": 0.20425771176815033, "rewards/accuracy_reward": 0.8015338182449341, "rewards/format_reward": 1.0, "step": 1039 }, { "completion_length": 67.5, "epoch": 4.748858447488584, "grad_norm": 2.354308605194092, "kl": 0.1474609375, "learning_rate": 5.251141552511415e-07, "loss": 0.0059, "reward": 1.8274368047714233, "reward_std": 0.13800114393234253, "rewards/accuracy_reward": 0.8352491855621338, "rewards/format_reward": 0.9921875, "step": 1040 }, { "completion_length": 65.109375, "epoch": 4.7534246575342465, "grad_norm": 4.133329391479492, "kl": 0.119140625, "learning_rate": 5.246575342465754e-07, "loss": 0.0048, "reward": 1.7390583753585815, "reward_std": 0.1775147169828415, "rewards/accuracy_reward": 0.7390583753585815, "rewards/format_reward": 1.0, "step": 1041 }, { "completion_length": 96.8125, "epoch": 4.757990867579909, "grad_norm": 2.921767473220825, "kl": 0.090576171875, "learning_rate": 5.242009132420091e-07, "loss": 0.0036, "reward": 1.8279520869255066, "reward_std": 0.11337075009942055, "rewards/accuracy_reward": 0.843576967716217, "rewards/format_reward": 0.984375, "step": 1042 }, { "completion_length": 76.25, "epoch": 4.762557077625571, "grad_norm": 1.6646754741668701, "kl": 0.0908203125, "learning_rate": 5.237442922374428e-07, "loss": 0.0036, "reward": 1.761111080646515, "reward_std": 0.08823190443217754, "rewards/accuracy_reward": 0.7611111104488373, "rewards/format_reward": 1.0, "step": 1043 }, { "completion_length": 68.90625, "epoch": 4.767123287671232, "grad_norm": 6.794203758239746, "kl": 0.12890625, "learning_rate": 5.232876712328767e-07, "loss": 0.0052, "reward": 1.6395359635353088, "reward_std": 0.1700442135334015, "rewards/accuracy_reward": 0.6473484635353088, "rewards/format_reward": 0.9921875, "step": 1044 }, { "completion_length": 64.5625, "epoch": 4.771689497716895, "grad_norm": 1.9526876211166382, "kl": 0.160888671875, "learning_rate": 5.228310502283105e-07, "loss": 0.0064, "reward": 1.818750023841858, "reward_std": 0.19044626876711845, "rewards/accuracy_reward": 0.8265624642372131, "rewards/format_reward": 0.9921875, "step": 1045 }, { "completion_length": 71.8359375, "epoch": 4.776255707762557, "grad_norm": 5.581969261169434, "kl": 0.121337890625, "learning_rate": 5.223744292237443e-07, "loss": 0.0049, "reward": 1.7421875596046448, "reward_std": 0.1561211347579956, "rewards/accuracy_reward": 0.7421874701976776, "rewards/format_reward": 1.0, "step": 1046 }, { "completion_length": 65.2109375, "epoch": 4.780821917808219, "grad_norm": 3.522632360458374, "kl": 0.139892578125, "learning_rate": 5.219178082191781e-07, "loss": 0.0056, "reward": 1.8170573115348816, "reward_std": 0.1570434384047985, "rewards/accuracy_reward": 0.840494692325592, "rewards/format_reward": 0.9765625, "step": 1047 }, { "completion_length": 82.296875, "epoch": 4.7853881278538815, "grad_norm": 8.847128868103027, "kl": 0.110107421875, "learning_rate": 5.214611872146118e-07, "loss": 0.0044, "reward": 1.8859375715255737, "reward_std": 0.0849014800041914, "rewards/accuracy_reward": 0.8859374523162842, "rewards/format_reward": 1.0, "step": 1048 }, { "completion_length": 73.2578125, "epoch": 4.789954337899544, "grad_norm": 9.131427764892578, "kl": 0.13525390625, "learning_rate": 5.210045662100457e-07, "loss": 0.0054, "reward": 1.6504226922988892, "reward_std": 0.18509591370821, "rewards/accuracy_reward": 0.6582351326942444, "rewards/format_reward": 0.9921875, "step": 1049 }, { "completion_length": 79.2265625, "epoch": 4.794520547945205, "grad_norm": 3.1302144527435303, "kl": 0.123779296875, "learning_rate": 5.205479452054794e-07, "loss": 0.005, "reward": 1.6324219703674316, "reward_std": 0.22072409093379974, "rewards/accuracy_reward": 0.6480468809604645, "rewards/format_reward": 0.984375, "step": 1050 }, { "completion_length": 68.59375, "epoch": 4.799086757990867, "grad_norm": 1.5062321424484253, "kl": 0.13525390625, "learning_rate": 5.200913242009131e-07, "loss": 0.0054, "reward": 1.7230769991874695, "reward_std": 0.10901044122874737, "rewards/accuracy_reward": 0.7387019097805023, "rewards/format_reward": 0.984375, "step": 1051 }, { "completion_length": 57.203125, "epoch": 4.80365296803653, "grad_norm": 2.9574577808380127, "kl": 0.18505859375, "learning_rate": 5.196347031963471e-07, "loss": 0.0074, "reward": 1.6598585844039917, "reward_std": 0.14134880900382996, "rewards/accuracy_reward": 0.6676710844039917, "rewards/format_reward": 0.9921875, "step": 1052 }, { "completion_length": 76.7734375, "epoch": 4.808219178082192, "grad_norm": 2.4928853511810303, "kl": 0.1630859375, "learning_rate": 5.191780821917808e-07, "loss": 0.0065, "reward": 1.7294270992279053, "reward_std": 0.2240969017148018, "rewards/accuracy_reward": 0.7528644800186157, "rewards/format_reward": 0.9765625, "step": 1053 }, { "completion_length": 84.828125, "epoch": 4.812785388127854, "grad_norm": 4.220333576202393, "kl": 0.114013671875, "learning_rate": 5.187214611872146e-07, "loss": 0.0046, "reward": 1.5883237719535828, "reward_std": 0.18970628082752228, "rewards/accuracy_reward": 0.5961362421512604, "rewards/format_reward": 0.9921875, "step": 1054 }, { "completion_length": 80.0546875, "epoch": 4.817351598173516, "grad_norm": 2.9225761890411377, "kl": 0.084716796875, "learning_rate": 5.182648401826484e-07, "loss": 0.0034, "reward": 1.8234375715255737, "reward_std": 0.1099486481398344, "rewards/accuracy_reward": 0.8234374225139618, "rewards/format_reward": 1.0, "step": 1055 }, { "completion_length": 68.84375, "epoch": 4.821917808219178, "grad_norm": 15.956554412841797, "kl": 0.126953125, "learning_rate": 5.178082191780821e-07, "loss": 0.0051, "reward": 1.707698404788971, "reward_std": 0.12136031687259674, "rewards/accuracy_reward": 0.7155108153820038, "rewards/format_reward": 0.9921875, "step": 1056 }, { "completion_length": 61.59375, "epoch": 4.82648401826484, "grad_norm": 3.7795159816741943, "kl": 0.19775390625, "learning_rate": 5.17351598173516e-07, "loss": 0.0079, "reward": 1.7173488140106201, "reward_std": 0.15133387595415115, "rewards/accuracy_reward": 0.7251611948013306, "rewards/format_reward": 0.9921875, "step": 1057 }, { "completion_length": 85.015625, "epoch": 4.831050228310502, "grad_norm": 4.373653888702393, "kl": 0.104736328125, "learning_rate": 5.168949771689497e-07, "loss": 0.0042, "reward": 1.8429688215255737, "reward_std": 0.08631845097988844, "rewards/accuracy_reward": 0.8507812023162842, "rewards/format_reward": 0.9921875, "step": 1058 }, { "completion_length": 80.9296875, "epoch": 4.835616438356165, "grad_norm": 2.2139017581939697, "kl": 0.091552734375, "learning_rate": 5.164383561643836e-07, "loss": 0.0037, "reward": 1.7554687857627869, "reward_std": 0.12153397500514984, "rewards/accuracy_reward": 0.7632811963558197, "rewards/format_reward": 0.9921875, "step": 1059 }, { "completion_length": 69.9296875, "epoch": 4.840182648401827, "grad_norm": 2.391106128692627, "kl": 0.113525390625, "learning_rate": 5.159817351598174e-07, "loss": 0.0045, "reward": 1.6293052434921265, "reward_std": 0.18556885421276093, "rewards/accuracy_reward": 0.6371176838874817, "rewards/format_reward": 0.9921875, "step": 1060 }, { "completion_length": 75.5078125, "epoch": 4.844748858447488, "grad_norm": 3.7311413288116455, "kl": 0.1337890625, "learning_rate": 5.155251141552511e-07, "loss": 0.0053, "reward": 1.8738667964935303, "reward_std": 0.09744009375572205, "rewards/accuracy_reward": 0.8738666772842407, "rewards/format_reward": 1.0, "step": 1061 }, { "completion_length": 88.328125, "epoch": 4.8493150684931505, "grad_norm": 2.399247884750366, "kl": 0.146484375, "learning_rate": 5.150684931506849e-07, "loss": 0.0058, "reward": 1.706250011920929, "reward_std": 0.21963772177696228, "rewards/accuracy_reward": 0.7140624821186066, "rewards/format_reward": 0.9921875, "step": 1062 }, { "completion_length": 73.6171875, "epoch": 4.853881278538813, "grad_norm": 2.581521511077881, "kl": 0.1435546875, "learning_rate": 5.146118721461187e-07, "loss": 0.0058, "reward": 1.7601500153541565, "reward_std": 0.14986789226531982, "rewards/accuracy_reward": 0.7679624557495117, "rewards/format_reward": 0.9921875, "step": 1063 }, { "completion_length": 72.1484375, "epoch": 4.858447488584475, "grad_norm": 8.218365669250488, "kl": 0.1533203125, "learning_rate": 5.141552511415524e-07, "loss": 0.0061, "reward": 1.7895833253860474, "reward_std": 0.18269683420658112, "rewards/accuracy_reward": 0.8052083253860474, "rewards/format_reward": 0.984375, "step": 1064 }, { "completion_length": 75.6875, "epoch": 4.863013698630137, "grad_norm": 6.358737468719482, "kl": 0.130126953125, "learning_rate": 5.136986301369864e-07, "loss": 0.0052, "reward": 1.8111705780029297, "reward_std": 0.11504914239048958, "rewards/accuracy_reward": 0.8111704587936401, "rewards/format_reward": 1.0, "step": 1065 }, { "completion_length": 72.0859375, "epoch": 4.867579908675799, "grad_norm": 1.565542221069336, "kl": 0.123779296875, "learning_rate": 5.132420091324201e-07, "loss": 0.005, "reward": 1.7661458849906921, "reward_std": 0.13723178207874298, "rewards/accuracy_reward": 0.773958295583725, "rewards/format_reward": 0.9921875, "step": 1066 }, { "completion_length": 81.34375, "epoch": 4.872146118721461, "grad_norm": 1.8809258937835693, "kl": 0.123291015625, "learning_rate": 5.127853881278538e-07, "loss": 0.0049, "reward": 1.8238808512687683, "reward_std": 0.09778433851897717, "rewards/accuracy_reward": 0.8238807916641235, "rewards/format_reward": 1.0, "step": 1067 }, { "completion_length": 80.703125, "epoch": 4.876712328767123, "grad_norm": 1.7312757968902588, "kl": 0.0810546875, "learning_rate": 5.123287671232877e-07, "loss": 0.0032, "reward": 1.624392330646515, "reward_std": 0.09445247054100037, "rewards/accuracy_reward": 0.6243923306465149, "rewards/format_reward": 1.0, "step": 1068 }, { "completion_length": 73.265625, "epoch": 4.8812785388127855, "grad_norm": 2.2705366611480713, "kl": 0.134033203125, "learning_rate": 5.118721461187214e-07, "loss": 0.0054, "reward": 1.6665404438972473, "reward_std": 0.17886501550674438, "rewards/accuracy_reward": 0.6821653842926025, "rewards/format_reward": 0.984375, "step": 1069 }, { "completion_length": 74.671875, "epoch": 4.885844748858448, "grad_norm": 2.8938543796539307, "kl": 0.12646484375, "learning_rate": 5.114155251141552e-07, "loss": 0.0051, "reward": 1.8554381728172302, "reward_std": 0.18266908079385757, "rewards/accuracy_reward": 0.8866880834102631, "rewards/format_reward": 0.96875, "step": 1070 }, { "completion_length": 84.8203125, "epoch": 4.890410958904109, "grad_norm": 6.559085845947266, "kl": 0.0888671875, "learning_rate": 5.10958904109589e-07, "loss": 0.0036, "reward": 1.6242188215255737, "reward_std": 0.14865797758102417, "rewards/accuracy_reward": 0.6320312321186066, "rewards/format_reward": 0.9921875, "step": 1071 }, { "completion_length": 72.203125, "epoch": 4.894977168949771, "grad_norm": 2.1274454593658447, "kl": 0.126708984375, "learning_rate": 5.105022831050228e-07, "loss": 0.0051, "reward": 1.7550346851348877, "reward_std": 0.15320852398872375, "rewards/accuracy_reward": 0.7628472149372101, "rewards/format_reward": 0.9921875, "step": 1072 }, { "completion_length": 73.6484375, "epoch": 4.899543378995434, "grad_norm": 2.215421676635742, "kl": 0.134765625, "learning_rate": 5.100456621004567e-07, "loss": 0.0054, "reward": 1.7328497171401978, "reward_std": 0.17783771082758904, "rewards/accuracy_reward": 0.756287157535553, "rewards/format_reward": 0.9765625, "step": 1073 }, { "completion_length": 82.7109375, "epoch": 4.904109589041096, "grad_norm": 7.915944576263428, "kl": 0.1328125, "learning_rate": 5.095890410958904e-07, "loss": 0.0053, "reward": 1.7984375953674316, "reward_std": 0.14389308542013168, "rewards/accuracy_reward": 0.8062499761581421, "rewards/format_reward": 0.9921875, "step": 1074 }, { "completion_length": 56.046875, "epoch": 4.908675799086758, "grad_norm": 6.48905086517334, "kl": 0.142822265625, "learning_rate": 5.091324200913241e-07, "loss": 0.0057, "reward": 1.7908854484558105, "reward_std": 0.16380437463521957, "rewards/accuracy_reward": 0.7986978888511658, "rewards/format_reward": 0.9921875, "step": 1075 }, { "completion_length": 76.328125, "epoch": 4.91324200913242, "grad_norm": 15.81816577911377, "kl": 0.103759765625, "learning_rate": 5.08675799086758e-07, "loss": 0.0042, "reward": 1.774218738079071, "reward_std": 0.14345712214708328, "rewards/accuracy_reward": 0.782031238079071, "rewards/format_reward": 0.9921875, "step": 1076 }, { "completion_length": 65.71875, "epoch": 4.917808219178082, "grad_norm": 2.252168655395508, "kl": 0.138916015625, "learning_rate": 5.082191780821917e-07, "loss": 0.0056, "reward": 1.7870659828186035, "reward_std": 0.14891928434371948, "rewards/accuracy_reward": 0.7870660126209259, "rewards/format_reward": 1.0, "step": 1077 }, { "completion_length": 84.4921875, "epoch": 4.922374429223744, "grad_norm": 6.789572715759277, "kl": 0.1016845703125, "learning_rate": 5.077625570776255e-07, "loss": 0.0041, "reward": 1.780468761920929, "reward_std": 0.12099719420075417, "rewards/accuracy_reward": 0.7882812023162842, "rewards/format_reward": 0.9921875, "step": 1078 }, { "completion_length": 93.4765625, "epoch": 4.926940639269406, "grad_norm": 1.4697335958480835, "kl": 0.0927734375, "learning_rate": 5.073059360730594e-07, "loss": 0.0037, "reward": 1.7980054020881653, "reward_std": 0.06987884640693665, "rewards/accuracy_reward": 0.7980053424835205, "rewards/format_reward": 1.0, "step": 1079 }, { "completion_length": 67.8125, "epoch": 4.931506849315069, "grad_norm": 1.78167724609375, "kl": 0.12353515625, "learning_rate": 5.068493150684931e-07, "loss": 0.005, "reward": 1.8179687857627869, "reward_std": 0.09306978806853294, "rewards/accuracy_reward": 0.8257812857627869, "rewards/format_reward": 0.9921875, "step": 1080 }, { "completion_length": 71.4453125, "epoch": 4.936073059360731, "grad_norm": 2.6702702045440674, "kl": 0.146484375, "learning_rate": 5.06392694063927e-07, "loss": 0.0059, "reward": 1.8533853888511658, "reward_std": 0.27950893342494965, "rewards/accuracy_reward": 0.884635329246521, "rewards/format_reward": 0.96875, "step": 1081 }, { "completion_length": 98.71875, "epoch": 4.940639269406392, "grad_norm": 2.6240620613098145, "kl": 0.06982421875, "learning_rate": 5.059360730593607e-07, "loss": 0.0028, "reward": 1.7433823347091675, "reward_std": 0.1912398487329483, "rewards/accuracy_reward": 0.7746323049068451, "rewards/format_reward": 0.96875, "step": 1082 }, { "completion_length": 66.03125, "epoch": 4.945205479452055, "grad_norm": 5.760696887969971, "kl": 0.15234375, "learning_rate": 5.054794520547944e-07, "loss": 0.0061, "reward": 1.7298898100852966, "reward_std": 0.21766673773527145, "rewards/accuracy_reward": 0.7377023696899414, "rewards/format_reward": 0.9921875, "step": 1083 }, { "completion_length": 68.59375, "epoch": 4.949771689497717, "grad_norm": 5.8692216873168945, "kl": 0.16357421875, "learning_rate": 5.050228310502283e-07, "loss": 0.0065, "reward": 1.7146621942520142, "reward_std": 0.10843057557940483, "rewards/accuracy_reward": 0.7146620750427246, "rewards/format_reward": 1.0, "step": 1084 }, { "completion_length": 92.359375, "epoch": 4.954337899543379, "grad_norm": 3.167354106903076, "kl": 0.1181640625, "learning_rate": 5.045662100456621e-07, "loss": 0.0047, "reward": 1.6780134439468384, "reward_std": 0.2429228127002716, "rewards/accuracy_reward": 0.6936383247375488, "rewards/format_reward": 0.984375, "step": 1085 }, { "completion_length": 79.984375, "epoch": 4.958904109589041, "grad_norm": 2.3840832710266113, "kl": 0.113037109375, "learning_rate": 5.041095890410959e-07, "loss": 0.0045, "reward": 1.7828125357627869, "reward_std": 0.19722937047481537, "rewards/accuracy_reward": 0.8140624761581421, "rewards/format_reward": 0.96875, "step": 1086 }, { "completion_length": 60.84375, "epoch": 4.963470319634704, "grad_norm": 2.938784599304199, "kl": 0.140380859375, "learning_rate": 5.036529680365297e-07, "loss": 0.0056, "reward": 1.6975947618484497, "reward_std": 0.22429338097572327, "rewards/accuracy_reward": 0.7210322618484497, "rewards/format_reward": 0.9765625, "step": 1087 }, { "completion_length": 73.8125, "epoch": 4.968036529680365, "grad_norm": 3.95701265335083, "kl": 0.11474609375, "learning_rate": 5.031963470319634e-07, "loss": 0.0046, "reward": 1.6440104246139526, "reward_std": 0.1992247775197029, "rewards/accuracy_reward": 0.651822954416275, "rewards/format_reward": 0.9921875, "step": 1088 }, { "completion_length": 71.96875, "epoch": 4.972602739726027, "grad_norm": 7.810092926025391, "kl": 0.1279296875, "learning_rate": 5.027397260273973e-07, "loss": 0.0051, "reward": 1.7612414360046387, "reward_std": 0.17399008572101593, "rewards/accuracy_reward": 0.7690538763999939, "rewards/format_reward": 0.9921875, "step": 1089 }, { "completion_length": 95.4765625, "epoch": 4.9771689497716896, "grad_norm": 2.2332069873809814, "kl": 0.106201171875, "learning_rate": 5.02283105022831e-07, "loss": 0.0042, "reward": 1.6498697996139526, "reward_std": 0.23457543551921844, "rewards/accuracy_reward": 0.6811197698116302, "rewards/format_reward": 0.96875, "step": 1090 }, { "completion_length": 67.2421875, "epoch": 4.981735159817352, "grad_norm": 2.379009485244751, "kl": 0.138916015625, "learning_rate": 5.018264840182647e-07, "loss": 0.0056, "reward": 1.758593738079071, "reward_std": 0.15046585351228714, "rewards/accuracy_reward": 0.766406238079071, "rewards/format_reward": 0.9921875, "step": 1091 }, { "completion_length": 80.3515625, "epoch": 4.986301369863014, "grad_norm": 1.9919111728668213, "kl": 0.100830078125, "learning_rate": 5.013698630136987e-07, "loss": 0.004, "reward": 1.7437500357627869, "reward_std": 0.16439745388925076, "rewards/accuracy_reward": 0.7671874463558197, "rewards/format_reward": 0.9765625, "step": 1092 }, { "completion_length": 87.265625, "epoch": 4.9908675799086755, "grad_norm": 1.678598403930664, "kl": 0.108642578125, "learning_rate": 5.009132420091324e-07, "loss": 0.0043, "reward": 1.6603811383247375, "reward_std": 0.1863521747291088, "rewards/accuracy_reward": 0.6838186085224152, "rewards/format_reward": 0.9765625, "step": 1093 }, { "completion_length": 86.515625, "epoch": 4.995433789954338, "grad_norm": 1.237178921699524, "kl": 0.105224609375, "learning_rate": 5.004566210045662e-07, "loss": 0.0042, "reward": 1.8344122171401978, "reward_std": 0.09498458355665207, "rewards/accuracy_reward": 0.8422246873378754, "rewards/format_reward": 0.9921875, "step": 1094 }, { "completion_length": 49.625, "epoch": 5.0, "grad_norm": 3.034471273422241, "kl": 0.10205078125, "learning_rate": 5e-07, "loss": 0.0039, "reward": 1.1875, "reward_std": 0.36278264224529266, "rewards/accuracy_reward": 0.1875, "rewards/format_reward": 1.0, "step": 1095 }, { "completion_length": 73.484375, "epoch": 5.004566210045662, "grad_norm": 4.169987678527832, "kl": 0.15771484375, "learning_rate": 4.995433789954337e-07, "loss": 0.0063, "reward": 1.7950236201286316, "reward_std": 0.1954728439450264, "rewards/accuracy_reward": 0.818461149930954, "rewards/format_reward": 0.9765625, "step": 1096 }, { "completion_length": 53.1484375, "epoch": 5.0091324200913245, "grad_norm": 2.2528958320617676, "kl": 0.18115234375, "learning_rate": 4.990867579908676e-07, "loss": 0.0073, "reward": 1.6565104126930237, "reward_std": 0.2434339076280594, "rewards/accuracy_reward": 0.6799479424953461, "rewards/format_reward": 0.9765625, "step": 1097 }, { "completion_length": 77.0390625, "epoch": 5.013698630136986, "grad_norm": 2.312629461288452, "kl": 0.150634765625, "learning_rate": 4.986301369863014e-07, "loss": 0.006, "reward": 1.6930060386657715, "reward_std": 0.15761591121554375, "rewards/accuracy_reward": 0.7008183896541595, "rewards/format_reward": 0.9921875, "step": 1098 }, { "completion_length": 68.0625, "epoch": 5.018264840182648, "grad_norm": 4.447121620178223, "kl": 0.180908203125, "learning_rate": 4.981735159817351e-07, "loss": 0.0072, "reward": 1.689843773841858, "reward_std": 0.19052022695541382, "rewards/accuracy_reward": 0.7054686844348907, "rewards/format_reward": 0.984375, "step": 1099 }, { "completion_length": 63.734375, "epoch": 5.0228310502283104, "grad_norm": 5.222453594207764, "kl": 0.1005859375, "learning_rate": 4.977168949771689e-07, "loss": 0.004, "reward": 1.8026909828186035, "reward_std": 0.11854784563183784, "rewards/accuracy_reward": 0.8105034232139587, "rewards/format_reward": 0.9921875, "step": 1100 }, { "completion_length": 97.75, "epoch": 5.027397260273973, "grad_norm": 2.315014362335205, "kl": 0.0711669921875, "learning_rate": 4.972602739726027e-07, "loss": 0.0028, "reward": 1.7503806352615356, "reward_std": 0.11963648349046707, "rewards/accuracy_reward": 0.7503806054592133, "rewards/format_reward": 1.0, "step": 1101 }, { "completion_length": 69.828125, "epoch": 5.031963470319635, "grad_norm": 2.1413135528564453, "kl": 0.13818359375, "learning_rate": 4.968036529680365e-07, "loss": 0.0055, "reward": 1.8750000596046448, "reward_std": 0.10205792635679245, "rewards/accuracy_reward": 0.8749999105930328, "rewards/format_reward": 1.0, "step": 1102 }, { "completion_length": 95.1328125, "epoch": 5.036529680365296, "grad_norm": 1.4922230243682861, "kl": 0.1259765625, "learning_rate": 4.963470319634703e-07, "loss": 0.005, "reward": 1.7955846786499023, "reward_std": 0.07053190469741821, "rewards/accuracy_reward": 0.79558464884758, "rewards/format_reward": 1.0, "step": 1103 }, { "completion_length": 75.15625, "epoch": 5.041095890410959, "grad_norm": 4.347703456878662, "kl": 0.125244140625, "learning_rate": 4.958904109589041e-07, "loss": 0.005, "reward": 1.6125000715255737, "reward_std": 0.16528953611850739, "rewards/accuracy_reward": 0.612500011920929, "rewards/format_reward": 1.0, "step": 1104 }, { "completion_length": 83.9453125, "epoch": 5.045662100456621, "grad_norm": 1.9074816703796387, "kl": 0.109375, "learning_rate": 4.954337899543379e-07, "loss": 0.0044, "reward": 1.810937523841858, "reward_std": 0.16448542103171349, "rewards/accuracy_reward": 0.8265624642372131, "rewards/format_reward": 0.984375, "step": 1105 }, { "completion_length": 66.875, "epoch": 5.050228310502283, "grad_norm": 2.969620943069458, "kl": 0.12841796875, "learning_rate": 4.949771689497717e-07, "loss": 0.0051, "reward": 1.6987723112106323, "reward_std": 0.19702571630477905, "rewards/accuracy_reward": 0.6987722814083099, "rewards/format_reward": 1.0, "step": 1106 }, { "completion_length": 97.59375, "epoch": 5.054794520547945, "grad_norm": 2.6807308197021484, "kl": 0.086669921875, "learning_rate": 4.945205479452055e-07, "loss": 0.0035, "reward": 1.841210961341858, "reward_std": 0.1313929297029972, "rewards/accuracy_reward": 0.8490233421325684, "rewards/format_reward": 0.9921875, "step": 1107 }, { "completion_length": 67.1015625, "epoch": 5.059360730593608, "grad_norm": 1.8944909572601318, "kl": 0.135986328125, "learning_rate": 4.940639269406393e-07, "loss": 0.0054, "reward": 1.8903645873069763, "reward_std": 0.09690769761800766, "rewards/accuracy_reward": 0.8903645575046539, "rewards/format_reward": 1.0, "step": 1108 }, { "completion_length": 77.6015625, "epoch": 5.063926940639269, "grad_norm": 3.560889959335327, "kl": 0.17138671875, "learning_rate": 4.93607305936073e-07, "loss": 0.0069, "reward": 1.6163216829299927, "reward_std": 0.2479196935892105, "rewards/accuracy_reward": 0.6319466531276703, "rewards/format_reward": 0.984375, "step": 1109 }, { "completion_length": 64.453125, "epoch": 5.068493150684931, "grad_norm": 1.8171228170394897, "kl": 0.158203125, "learning_rate": 4.931506849315068e-07, "loss": 0.0063, "reward": 1.7542577981948853, "reward_std": 0.1399587020277977, "rewards/accuracy_reward": 0.7542578279972076, "rewards/format_reward": 1.0, "step": 1110 }, { "completion_length": 86.5078125, "epoch": 5.073059360730594, "grad_norm": 2.2326784133911133, "kl": 0.106689453125, "learning_rate": 4.926940639269407e-07, "loss": 0.0043, "reward": 1.7170758247375488, "reward_std": 0.146156445145607, "rewards/accuracy_reward": 0.7248883247375488, "rewards/format_reward": 0.9921875, "step": 1111 }, { "completion_length": 60.7109375, "epoch": 5.077625570776256, "grad_norm": 2.7463958263397217, "kl": 0.16796875, "learning_rate": 4.922374429223744e-07, "loss": 0.0067, "reward": 1.6381410360336304, "reward_std": 0.25498080253601074, "rewards/accuracy_reward": 0.6615785360336304, "rewards/format_reward": 0.9765625, "step": 1112 }, { "completion_length": 72.3046875, "epoch": 5.082191780821918, "grad_norm": 1.4025739431381226, "kl": 0.10888671875, "learning_rate": 4.917808219178081e-07, "loss": 0.0044, "reward": 1.7656250596046448, "reward_std": 0.07576144114136696, "rewards/accuracy_reward": 0.7734374403953552, "rewards/format_reward": 0.9921875, "step": 1113 }, { "completion_length": 53.21875, "epoch": 5.0867579908675795, "grad_norm": 3.9440958499908447, "kl": 0.142822265625, "learning_rate": 4.91324200913242e-07, "loss": 0.0057, "reward": 1.8531250357627869, "reward_std": 0.14943039789795876, "rewards/accuracy_reward": 0.8609374761581421, "rewards/format_reward": 0.9921875, "step": 1114 }, { "completion_length": 71.0859375, "epoch": 5.091324200913242, "grad_norm": 2.5945069789886475, "kl": 0.101318359375, "learning_rate": 4.908675799086758e-07, "loss": 0.0041, "reward": 1.7131696939468384, "reward_std": 0.09270836971700191, "rewards/accuracy_reward": 0.7131696045398712, "rewards/format_reward": 1.0, "step": 1115 }, { "completion_length": 65.1796875, "epoch": 5.095890410958904, "grad_norm": 1.469778060913086, "kl": 0.121337890625, "learning_rate": 4.904109589041096e-07, "loss": 0.0048, "reward": 1.7162946462631226, "reward_std": 0.12621085345745087, "rewards/accuracy_reward": 0.7241071164608002, "rewards/format_reward": 0.9921875, "step": 1116 }, { "completion_length": 80.140625, "epoch": 5.100456621004566, "grad_norm": 2.706517457962036, "kl": 0.099365234375, "learning_rate": 4.899543378995434e-07, "loss": 0.004, "reward": 1.8068211078643799, "reward_std": 0.12114018388092518, "rewards/accuracy_reward": 0.8224460184574127, "rewards/format_reward": 0.984375, "step": 1117 }, { "completion_length": 58.5078125, "epoch": 5.105022831050229, "grad_norm": 4.029208183288574, "kl": 0.17138671875, "learning_rate": 4.894977168949771e-07, "loss": 0.0069, "reward": 1.7942708134651184, "reward_std": 0.09960613120347261, "rewards/accuracy_reward": 0.7942708432674408, "rewards/format_reward": 1.0, "step": 1118 }, { "completion_length": 67.359375, "epoch": 5.109589041095891, "grad_norm": 2.30452823638916, "kl": 0.15185546875, "learning_rate": 4.89041095890411e-07, "loss": 0.0061, "reward": 1.7510417103767395, "reward_std": 0.18200254440307617, "rewards/accuracy_reward": 0.7666666209697723, "rewards/format_reward": 0.984375, "step": 1119 }, { "completion_length": 70.984375, "epoch": 5.114155251141552, "grad_norm": 1.813476324081421, "kl": 0.1337890625, "learning_rate": 4.885844748858447e-07, "loss": 0.0054, "reward": 1.7627604007720947, "reward_std": 0.09784993343055248, "rewards/accuracy_reward": 0.76276034116745, "rewards/format_reward": 1.0, "step": 1120 }, { "completion_length": 67.1171875, "epoch": 5.1187214611872145, "grad_norm": 2.7639548778533936, "kl": 0.158203125, "learning_rate": 4.881278538812786e-07, "loss": 0.0063, "reward": 1.6960286498069763, "reward_std": 0.1810276135802269, "rewards/accuracy_reward": 0.6960286498069763, "rewards/format_reward": 1.0, "step": 1121 }, { "completion_length": 64.8984375, "epoch": 5.123287671232877, "grad_norm": 1.0785552263259888, "kl": 0.116943359375, "learning_rate": 4.876712328767123e-07, "loss": 0.0047, "reward": 1.7366800904273987, "reward_std": 0.12058132514357567, "rewards/accuracy_reward": 0.7523050308227539, "rewards/format_reward": 0.984375, "step": 1122 }, { "completion_length": 70.203125, "epoch": 5.127853881278539, "grad_norm": 1.4450359344482422, "kl": 0.165771484375, "learning_rate": 4.872146118721461e-07, "loss": 0.0066, "reward": 1.8294271230697632, "reward_std": 0.1284763477742672, "rewards/accuracy_reward": 0.8450520634651184, "rewards/format_reward": 0.984375, "step": 1123 }, { "completion_length": 73.5390625, "epoch": 5.132420091324201, "grad_norm": 1.4733330011367798, "kl": 0.126220703125, "learning_rate": 4.867579908675799e-07, "loss": 0.0051, "reward": 1.8328125476837158, "reward_std": 0.09086007624864578, "rewards/accuracy_reward": 0.8328124284744263, "rewards/format_reward": 1.0, "step": 1124 }, { "completion_length": 62.890625, "epoch": 5.136986301369863, "grad_norm": 5.508318901062012, "kl": 0.16796875, "learning_rate": 4.863013698630137e-07, "loss": 0.0067, "reward": 1.7619792222976685, "reward_std": 0.18441661447286606, "rewards/accuracy_reward": 0.7619791626930237, "rewards/format_reward": 1.0, "step": 1125 }, { "completion_length": 68.4453125, "epoch": 5.141552511415525, "grad_norm": 2.6522018909454346, "kl": 0.16748046875, "learning_rate": 4.858447488584474e-07, "loss": 0.0067, "reward": 1.705004334449768, "reward_std": 0.19281967729330063, "rewards/accuracy_reward": 0.7206293344497681, "rewards/format_reward": 0.984375, "step": 1126 }, { "completion_length": 69.0, "epoch": 5.146118721461187, "grad_norm": 2.304168224334717, "kl": 0.143310546875, "learning_rate": 4.853881278538813e-07, "loss": 0.0057, "reward": 1.7598021030426025, "reward_std": 0.18482757359743118, "rewards/accuracy_reward": 0.7832395136356354, "rewards/format_reward": 0.9765625, "step": 1127 }, { "completion_length": 92.8203125, "epoch": 5.1506849315068495, "grad_norm": 2.8171322345733643, "kl": 0.082763671875, "learning_rate": 4.84931506849315e-07, "loss": 0.0033, "reward": 1.7391226291656494, "reward_std": 0.10928737744688988, "rewards/accuracy_reward": 0.7469350695610046, "rewards/format_reward": 0.9921875, "step": 1128 }, { "completion_length": 66.7578125, "epoch": 5.155251141552512, "grad_norm": 3.0008230209350586, "kl": 0.22119140625, "learning_rate": 4.844748858447489e-07, "loss": 0.0088, "reward": 1.7692708373069763, "reward_std": 0.20399662107229233, "rewards/accuracy_reward": 0.7927083671092987, "rewards/format_reward": 0.9765625, "step": 1129 }, { "completion_length": 71.7734375, "epoch": 5.159817351598173, "grad_norm": 3.2780561447143555, "kl": 0.15673828125, "learning_rate": 4.840182648401826e-07, "loss": 0.0063, "reward": 1.8493314981460571, "reward_std": 0.12171986699104309, "rewards/accuracy_reward": 0.8493313789367676, "rewards/format_reward": 1.0, "step": 1130 }, { "completion_length": 72.875, "epoch": 5.164383561643835, "grad_norm": 2.125765800476074, "kl": 0.1103515625, "learning_rate": 4.835616438356164e-07, "loss": 0.0044, "reward": 1.8101563453674316, "reward_std": 0.09943688660860062, "rewards/accuracy_reward": 0.8179686963558197, "rewards/format_reward": 0.9921875, "step": 1131 }, { "completion_length": 70.375, "epoch": 5.168949771689498, "grad_norm": 8.532958984375, "kl": 0.138671875, "learning_rate": 4.831050228310502e-07, "loss": 0.0055, "reward": 1.7585145235061646, "reward_std": 0.1326666846871376, "rewards/accuracy_reward": 0.7663269340991974, "rewards/format_reward": 0.9921875, "step": 1132 }, { "completion_length": 86.296875, "epoch": 5.17351598173516, "grad_norm": 1.4057005643844604, "kl": 0.105224609375, "learning_rate": 4.82648401826484e-07, "loss": 0.0042, "reward": 1.8115254044532776, "reward_std": 0.059396788477897644, "rewards/accuracy_reward": 0.8115253746509552, "rewards/format_reward": 1.0, "step": 1133 }, { "completion_length": 50.203125, "epoch": 5.178082191780822, "grad_norm": 4.812795162200928, "kl": 0.13623046875, "learning_rate": 4.821917808219178e-07, "loss": 0.0055, "reward": 1.5721354484558105, "reward_std": 0.19010350108146667, "rewards/accuracy_reward": 0.5721354186534882, "rewards/format_reward": 1.0, "step": 1134 }, { "completion_length": 62.796875, "epoch": 5.182648401826484, "grad_norm": 5.795816421508789, "kl": 0.14892578125, "learning_rate": 4.817351598173516e-07, "loss": 0.006, "reward": 1.839672565460205, "reward_std": 0.16954216361045837, "rewards/accuracy_reward": 0.8631100654602051, "rewards/format_reward": 0.9765625, "step": 1135 }, { "completion_length": 65.6015625, "epoch": 5.187214611872146, "grad_norm": 4.185856342315674, "kl": 0.14453125, "learning_rate": 4.812785388127853e-07, "loss": 0.0058, "reward": 1.727814257144928, "reward_std": 0.19888605177402496, "rewards/accuracy_reward": 0.7512516975402832, "rewards/format_reward": 0.9765625, "step": 1136 }, { "completion_length": 74.3046875, "epoch": 5.191780821917808, "grad_norm": 1.525724172592163, "kl": 0.113525390625, "learning_rate": 4.808219178082192e-07, "loss": 0.0045, "reward": 1.7773438096046448, "reward_std": 0.12639044970273972, "rewards/accuracy_reward": 0.77734375, "rewards/format_reward": 1.0, "step": 1137 }, { "completion_length": 54.1875, "epoch": 5.19634703196347, "grad_norm": 2.832690954208374, "kl": 0.21484375, "learning_rate": 4.80365296803653e-07, "loss": 0.0086, "reward": 1.666732907295227, "reward_std": 0.19027332961559296, "rewards/accuracy_reward": 0.6745454370975494, "rewards/format_reward": 0.9921875, "step": 1138 }, { "completion_length": 58.1953125, "epoch": 5.200913242009133, "grad_norm": 2.4609923362731934, "kl": 0.17724609375, "learning_rate": 4.799086757990867e-07, "loss": 0.0071, "reward": 1.7091332077980042, "reward_std": 0.24910558015108109, "rewards/accuracy_reward": 0.7247581779956818, "rewards/format_reward": 0.984375, "step": 1139 }, { "completion_length": 94.75, "epoch": 5.205479452054795, "grad_norm": 11.776135444641113, "kl": 0.2222900390625, "learning_rate": 4.794520547945205e-07, "loss": 0.0089, "reward": 1.803125023841858, "reward_std": 0.09865947626531124, "rewards/accuracy_reward": 0.8109373450279236, "rewards/format_reward": 0.9921875, "step": 1140 }, { "completion_length": 75.5546875, "epoch": 5.210045662100456, "grad_norm": 1.514672875404358, "kl": 0.13330078125, "learning_rate": 4.789954337899543e-07, "loss": 0.0053, "reward": 1.8759238719940186, "reward_std": 0.10365894064307213, "rewards/accuracy_reward": 0.8915487825870514, "rewards/format_reward": 0.984375, "step": 1141 }, { "completion_length": 65.953125, "epoch": 5.2146118721461185, "grad_norm": 2.5782454013824463, "kl": 0.110595703125, "learning_rate": 4.785388127853881e-07, "loss": 0.0044, "reward": 1.6188101172447205, "reward_std": 0.24680721014738083, "rewards/accuracy_reward": 0.6422475874423981, "rewards/format_reward": 0.9765625, "step": 1142 }, { "completion_length": 54.1875, "epoch": 5.219178082191781, "grad_norm": 6.618932723999023, "kl": 0.1689453125, "learning_rate": 4.780821917808219e-07, "loss": 0.0068, "reward": 1.6172246932983398, "reward_std": 0.23022934794425964, "rewards/accuracy_reward": 0.6250371932983398, "rewards/format_reward": 0.9921875, "step": 1143 }, { "completion_length": 75.8203125, "epoch": 5.223744292237443, "grad_norm": 3.0787389278411865, "kl": 0.10595703125, "learning_rate": 4.776255707762557e-07, "loss": 0.0042, "reward": 1.806249976158142, "reward_std": 0.10205793380737305, "rewards/accuracy_reward": 0.8062499165534973, "rewards/format_reward": 1.0, "step": 1144 }, { "completion_length": 65.4453125, "epoch": 5.228310502283105, "grad_norm": 2.9733009338378906, "kl": 0.1591796875, "learning_rate": 4.771689497716894e-07, "loss": 0.0064, "reward": 1.7895833849906921, "reward_std": 0.13740738481283188, "rewards/accuracy_reward": 0.7973958253860474, "rewards/format_reward": 0.9921875, "step": 1145 }, { "completion_length": 68.8828125, "epoch": 5.232876712328767, "grad_norm": 3.3826847076416016, "kl": 0.1767578125, "learning_rate": 4.7671232876712324e-07, "loss": 0.0071, "reward": 1.7227678894996643, "reward_std": 0.2944817692041397, "rewards/accuracy_reward": 0.7540178298950195, "rewards/format_reward": 0.96875, "step": 1146 }, { "completion_length": 104.4140625, "epoch": 5.237442922374429, "grad_norm": 2.469144105911255, "kl": 0.07373046875, "learning_rate": 4.762557077625571e-07, "loss": 0.0029, "reward": 1.896093726158142, "reward_std": 0.07282309047877789, "rewards/accuracy_reward": 0.9039061367511749, "rewards/format_reward": 0.9921875, "step": 1147 }, { "completion_length": 73.578125, "epoch": 5.242009132420091, "grad_norm": 2.969399929046631, "kl": 0.139892578125, "learning_rate": 4.7579908675799086e-07, "loss": 0.0056, "reward": 1.8597139716148376, "reward_std": 0.13000112399458885, "rewards/accuracy_reward": 0.8597138822078705, "rewards/format_reward": 1.0, "step": 1148 }, { "completion_length": 77.1953125, "epoch": 5.2465753424657535, "grad_norm": 1.6237159967422485, "kl": 0.115478515625, "learning_rate": 4.7534246575342465e-07, "loss": 0.0046, "reward": 1.7437500953674316, "reward_std": 0.1552036553621292, "rewards/accuracy_reward": 0.7593749761581421, "rewards/format_reward": 0.984375, "step": 1149 }, { "completion_length": 73.2578125, "epoch": 5.251141552511416, "grad_norm": 3.891481637954712, "kl": 0.108154296875, "learning_rate": 4.748858447488584e-07, "loss": 0.0043, "reward": 1.792336344718933, "reward_std": 0.23970390856266022, "rewards/accuracy_reward": 0.8157738149166107, "rewards/format_reward": 0.9765625, "step": 1150 }, { "completion_length": 92.109375, "epoch": 5.255707762557078, "grad_norm": 2.1832501888275146, "kl": 0.074951171875, "learning_rate": 4.744292237442922e-07, "loss": 0.003, "reward": 1.7013021111488342, "reward_std": 0.13120698183774948, "rewards/accuracy_reward": 0.7091145515441895, "rewards/format_reward": 0.9921875, "step": 1151 }, { "completion_length": 79.3359375, "epoch": 5.260273972602739, "grad_norm": 2.02133846282959, "kl": 0.15185546875, "learning_rate": 4.73972602739726e-07, "loss": 0.0061, "reward": 1.7348758578300476, "reward_std": 0.2100473716855049, "rewards/accuracy_reward": 0.7661258280277252, "rewards/format_reward": 0.96875, "step": 1152 }, { "completion_length": 69.28125, "epoch": 5.264840182648402, "grad_norm": 1.7159175872802734, "kl": 0.146484375, "learning_rate": 4.735159817351598e-07, "loss": 0.0058, "reward": 1.6989583373069763, "reward_std": 0.17543572187423706, "rewards/accuracy_reward": 0.7223958075046539, "rewards/format_reward": 0.9765625, "step": 1153 }, { "completion_length": 62.1328125, "epoch": 5.269406392694064, "grad_norm": 3.155850887298584, "kl": 0.1455078125, "learning_rate": 4.730593607305936e-07, "loss": 0.0058, "reward": 1.734188973903656, "reward_std": 0.2661764770746231, "rewards/accuracy_reward": 0.7576265037059784, "rewards/format_reward": 0.9765625, "step": 1154 }, { "completion_length": 64.859375, "epoch": 5.273972602739726, "grad_norm": 2.906141757965088, "kl": 0.18798828125, "learning_rate": 4.726027397260274e-07, "loss": 0.0075, "reward": 1.7283979058265686, "reward_std": 0.15935904532670975, "rewards/accuracy_reward": 0.7362103760242462, "rewards/format_reward": 0.9921875, "step": 1155 }, { "completion_length": 68.3984375, "epoch": 5.2785388127853885, "grad_norm": 3.6288256645202637, "kl": 0.1611328125, "learning_rate": 4.7214611872146116e-07, "loss": 0.0065, "reward": 1.6093750596046448, "reward_std": 0.2415616661310196, "rewards/accuracy_reward": 0.6328125298023224, "rewards/format_reward": 0.9765625, "step": 1156 }, { "completion_length": 65.546875, "epoch": 5.28310502283105, "grad_norm": 5.221689701080322, "kl": 0.21435546875, "learning_rate": 4.71689497716895e-07, "loss": 0.0086, "reward": 1.7025855779647827, "reward_std": 0.11607009917497635, "rewards/accuracy_reward": 0.7025855779647827, "rewards/format_reward": 1.0, "step": 1157 }, { "completion_length": 66.5546875, "epoch": 5.287671232876712, "grad_norm": 1.9122740030288696, "kl": 0.1474609375, "learning_rate": 4.7123287671232874e-07, "loss": 0.0059, "reward": 1.8352110981941223, "reward_std": 0.097220653668046, "rewards/accuracy_reward": 0.8430235981941223, "rewards/format_reward": 0.9921875, "step": 1158 }, { "completion_length": 70.34375, "epoch": 5.292237442922374, "grad_norm": 10.378006935119629, "kl": 0.1640625, "learning_rate": 4.707762557077625e-07, "loss": 0.0066, "reward": 1.8343608975410461, "reward_std": 0.10324066504836082, "rewards/accuracy_reward": 0.8343608975410461, "rewards/format_reward": 1.0, "step": 1159 }, { "completion_length": 67.8828125, "epoch": 5.296803652968037, "grad_norm": 6.8480119705200195, "kl": 0.14599609375, "learning_rate": 4.703196347031963e-07, "loss": 0.0058, "reward": 1.7857979536056519, "reward_std": 0.13422124087810516, "rewards/accuracy_reward": 0.7936104536056519, "rewards/format_reward": 0.9921875, "step": 1160 }, { "completion_length": 100.640625, "epoch": 5.301369863013699, "grad_norm": 3.1650795936584473, "kl": 0.0848388671875, "learning_rate": 4.6986301369863015e-07, "loss": 0.0034, "reward": 1.803321123123169, "reward_std": 0.17942239344120026, "rewards/accuracy_reward": 0.8345710933208466, "rewards/format_reward": 0.96875, "step": 1161 }, { "completion_length": 53.546875, "epoch": 5.30593607305936, "grad_norm": 4.188801288604736, "kl": 0.20947265625, "learning_rate": 4.694063926940639e-07, "loss": 0.0084, "reward": 1.7130786776542664, "reward_std": 0.21442808210849762, "rewards/accuracy_reward": 0.7287036776542664, "rewards/format_reward": 0.984375, "step": 1162 }, { "completion_length": 80.5625, "epoch": 5.310502283105023, "grad_norm": 2.222104787826538, "kl": 0.08837890625, "learning_rate": 4.689497716894977e-07, "loss": 0.0035, "reward": 1.7203125953674316, "reward_std": 0.16751586645841599, "rewards/accuracy_reward": 0.7437499463558197, "rewards/format_reward": 0.9765625, "step": 1163 }, { "completion_length": 54.359375, "epoch": 5.315068493150685, "grad_norm": 2.051166296005249, "kl": 0.15380859375, "learning_rate": 4.684931506849315e-07, "loss": 0.0062, "reward": 1.7621361017227173, "reward_std": 0.11842145770788193, "rewards/accuracy_reward": 0.7699486315250397, "rewards/format_reward": 0.9921875, "step": 1164 }, { "completion_length": 64.0546875, "epoch": 5.319634703196347, "grad_norm": 3.0098822116851807, "kl": 0.177734375, "learning_rate": 4.680365296803653e-07, "loss": 0.0071, "reward": 1.6876825094223022, "reward_std": 0.1372067779302597, "rewards/accuracy_reward": 0.6876825392246246, "rewards/format_reward": 1.0, "step": 1165 }, { "completion_length": 80.75, "epoch": 5.324200913242009, "grad_norm": 2.739140033721924, "kl": 0.0966796875, "learning_rate": 4.6757990867579904e-07, "loss": 0.0039, "reward": 1.740625023841858, "reward_std": 0.14044751226902008, "rewards/accuracy_reward": 0.7484374046325684, "rewards/format_reward": 0.9921875, "step": 1166 }, { "completion_length": 67.6875, "epoch": 5.328767123287671, "grad_norm": 2.510779857635498, "kl": 0.103271484375, "learning_rate": 4.671232876712329e-07, "loss": 0.0041, "reward": 1.607812523841858, "reward_std": 0.1767766922712326, "rewards/accuracy_reward": 0.6078124940395355, "rewards/format_reward": 1.0, "step": 1167 }, { "completion_length": 78.734375, "epoch": 5.333333333333333, "grad_norm": 2.7751314640045166, "kl": 0.126708984375, "learning_rate": 4.6666666666666666e-07, "loss": 0.0051, "reward": 1.853255271911621, "reward_std": 0.19070542603731155, "rewards/accuracy_reward": 0.8688801825046539, "rewards/format_reward": 0.984375, "step": 1168 }, { "completion_length": 67.75, "epoch": 5.337899543378995, "grad_norm": 1.8172931671142578, "kl": 0.146240234375, "learning_rate": 4.6621004566210045e-07, "loss": 0.0059, "reward": 1.7744792103767395, "reward_std": 0.09880542568862438, "rewards/accuracy_reward": 0.78229159116745, "rewards/format_reward": 0.9921875, "step": 1169 }, { "completion_length": 96.1640625, "epoch": 5.342465753424658, "grad_norm": 2.242799758911133, "kl": 0.11083984375, "learning_rate": 4.657534246575342e-07, "loss": 0.0044, "reward": 1.6858011484146118, "reward_std": 0.16090433299541473, "rewards/accuracy_reward": 0.7014259994029999, "rewards/format_reward": 0.984375, "step": 1170 }, { "completion_length": 94.90625, "epoch": 5.34703196347032, "grad_norm": 1.8737952709197998, "kl": 0.102783203125, "learning_rate": 4.65296803652968e-07, "loss": 0.0041, "reward": 1.8359375, "reward_std": 0.1631505787372589, "rewards/accuracy_reward": 0.8515624403953552, "rewards/format_reward": 0.984375, "step": 1171 }, { "completion_length": 108.46875, "epoch": 5.351598173515982, "grad_norm": 1.0832995176315308, "kl": 0.1112060546875, "learning_rate": 4.648401826484018e-07, "loss": 0.0044, "reward": 1.8958333730697632, "reward_std": 0.05308555904775858, "rewards/accuracy_reward": 0.8958332538604736, "rewards/format_reward": 1.0, "step": 1172 }, { "completion_length": 89.1875, "epoch": 5.3561643835616435, "grad_norm": 2.971842050552368, "kl": 0.1611328125, "learning_rate": 4.643835616438356e-07, "loss": 0.0065, "reward": 1.6929687857627869, "reward_std": 0.22123289108276367, "rewards/accuracy_reward": 0.7164062261581421, "rewards/format_reward": 0.9765625, "step": 1173 }, { "completion_length": 79.7109375, "epoch": 5.360730593607306, "grad_norm": 3.3845016956329346, "kl": 0.107177734375, "learning_rate": 4.639269406392694e-07, "loss": 0.0043, "reward": 1.7539063096046448, "reward_std": 0.1683424860239029, "rewards/accuracy_reward": 0.75390625, "rewards/format_reward": 1.0, "step": 1174 }, { "completion_length": 88.7265625, "epoch": 5.365296803652968, "grad_norm": 1.8644884824752808, "kl": 0.115478515625, "learning_rate": 4.634703196347032e-07, "loss": 0.0046, "reward": 1.635702133178711, "reward_std": 0.2014181986451149, "rewards/accuracy_reward": 0.6591395735740662, "rewards/format_reward": 0.9765625, "step": 1175 }, { "completion_length": 79.828125, "epoch": 5.36986301369863, "grad_norm": 11.902091026306152, "kl": 0.126953125, "learning_rate": 4.6301369863013696e-07, "loss": 0.0051, "reward": 1.7371652126312256, "reward_std": 0.1940011829137802, "rewards/accuracy_reward": 0.7527901232242584, "rewards/format_reward": 0.984375, "step": 1176 }, { "completion_length": 66.2734375, "epoch": 5.3744292237442925, "grad_norm": 2.139374017715454, "kl": 0.11083984375, "learning_rate": 4.625570776255708e-07, "loss": 0.0044, "reward": 1.8981026411056519, "reward_std": 0.12666313955560327, "rewards/accuracy_reward": 0.9137276411056519, "rewards/format_reward": 0.984375, "step": 1177 }, { "completion_length": 68.4921875, "epoch": 5.378995433789954, "grad_norm": 1.7856290340423584, "kl": 0.12744140625, "learning_rate": 4.6210045662100454e-07, "loss": 0.0051, "reward": 1.6984019875526428, "reward_std": 0.18527702055871487, "rewards/accuracy_reward": 0.7140269875526428, "rewards/format_reward": 0.984375, "step": 1178 }, { "completion_length": 85.734375, "epoch": 5.383561643835616, "grad_norm": 3.6187846660614014, "kl": 0.090576171875, "learning_rate": 4.616438356164383e-07, "loss": 0.0036, "reward": 1.8394480347633362, "reward_std": 0.08028085343539715, "rewards/accuracy_reward": 0.8394480347633362, "rewards/format_reward": 1.0, "step": 1179 }, { "completion_length": 73.796875, "epoch": 5.3881278538812785, "grad_norm": 2.5453333854675293, "kl": 0.17138671875, "learning_rate": 4.611872146118721e-07, "loss": 0.0068, "reward": 1.810937523841858, "reward_std": 0.19586525857448578, "rewards/accuracy_reward": 0.8343749344348907, "rewards/format_reward": 0.9765625, "step": 1180 }, { "completion_length": 75.5546875, "epoch": 5.392694063926941, "grad_norm": 2.8857522010803223, "kl": 0.107177734375, "learning_rate": 4.6073059360730595e-07, "loss": 0.0043, "reward": 1.685937523841858, "reward_std": 0.14966705068945885, "rewards/accuracy_reward": 0.6859375238418579, "rewards/format_reward": 1.0, "step": 1181 }, { "completion_length": 64.7890625, "epoch": 5.397260273972603, "grad_norm": 2.9432222843170166, "kl": 0.14306640625, "learning_rate": 4.602739726027397e-07, "loss": 0.0057, "reward": 1.8203496932983398, "reward_std": 0.20706991106271744, "rewards/accuracy_reward": 0.8281622231006622, "rewards/format_reward": 0.9921875, "step": 1182 }, { "completion_length": 88.6328125, "epoch": 5.401826484018265, "grad_norm": 2.689836025238037, "kl": 0.0927734375, "learning_rate": 4.5981735159817347e-07, "loss": 0.0037, "reward": 1.772569477558136, "reward_std": 0.2539152055978775, "rewards/accuracy_reward": 0.8116318881511688, "rewards/format_reward": 0.9609375, "step": 1183 }, { "completion_length": 74.28125, "epoch": 5.406392694063927, "grad_norm": 3.8888282775878906, "kl": 0.14453125, "learning_rate": 4.593607305936073e-07, "loss": 0.0058, "reward": 1.6562862396240234, "reward_std": 0.22701606899499893, "rewards/accuracy_reward": 0.679723709821701, "rewards/format_reward": 0.9765625, "step": 1184 }, { "completion_length": 90.3671875, "epoch": 5.410958904109589, "grad_norm": 5.0972981452941895, "kl": 0.13427734375, "learning_rate": 4.589041095890411e-07, "loss": 0.0054, "reward": 1.7076823115348816, "reward_std": 0.1942017376422882, "rewards/accuracy_reward": 0.7154947817325592, "rewards/format_reward": 0.9921875, "step": 1185 }, { "completion_length": 94.515625, "epoch": 5.415525114155251, "grad_norm": 2.337641716003418, "kl": 0.112548828125, "learning_rate": 4.5844748858447483e-07, "loss": 0.0045, "reward": 1.8321614265441895, "reward_std": 0.08036978542804718, "rewards/accuracy_reward": 0.8321613669395447, "rewards/format_reward": 1.0, "step": 1186 }, { "completion_length": 70.9296875, "epoch": 5.420091324200913, "grad_norm": 1.6933238506317139, "kl": 0.099853515625, "learning_rate": 4.579908675799087e-07, "loss": 0.004, "reward": 1.6846354007720947, "reward_std": 0.20877984166145325, "rewards/accuracy_reward": 0.7002603709697723, "rewards/format_reward": 0.984375, "step": 1187 }, { "completion_length": 86.8359375, "epoch": 5.424657534246576, "grad_norm": 9.663613319396973, "kl": 0.094970703125, "learning_rate": 4.5753424657534246e-07, "loss": 0.0038, "reward": 1.784375011920929, "reward_std": 0.10888781771063805, "rewards/accuracy_reward": 0.7843749225139618, "rewards/format_reward": 1.0, "step": 1188 }, { "completion_length": 90.921875, "epoch": 5.429223744292237, "grad_norm": 3.4232490062713623, "kl": 0.1181640625, "learning_rate": 4.5707762557077625e-07, "loss": 0.0047, "reward": 1.7392657399177551, "reward_std": 0.12979238480329514, "rewards/accuracy_reward": 0.7470781803131104, "rewards/format_reward": 0.9921875, "step": 1189 }, { "completion_length": 74.96875, "epoch": 5.433789954337899, "grad_norm": 2.6465795040130615, "kl": 0.132080078125, "learning_rate": 4.5662100456621e-07, "loss": 0.0053, "reward": 1.7466145753860474, "reward_std": 0.129929106682539, "rewards/accuracy_reward": 0.7544271051883698, "rewards/format_reward": 0.9921875, "step": 1190 }, { "completion_length": 77.8828125, "epoch": 5.438356164383562, "grad_norm": 4.823699951171875, "kl": 0.12158203125, "learning_rate": 4.561643835616438e-07, "loss": 0.0049, "reward": 1.679805874824524, "reward_std": 0.12897202000021935, "rewards/accuracy_reward": 0.6798058748245239, "rewards/format_reward": 1.0, "step": 1191 }, { "completion_length": 61.109375, "epoch": 5.442922374429224, "grad_norm": 1.3143653869628906, "kl": 0.13623046875, "learning_rate": 4.557077625570776e-07, "loss": 0.0055, "reward": 1.6895833611488342, "reward_std": 0.10995453409850597, "rewards/accuracy_reward": 0.6895833015441895, "rewards/format_reward": 1.0, "step": 1192 }, { "completion_length": 94.640625, "epoch": 5.447488584474886, "grad_norm": 26.47618293762207, "kl": 0.086181640625, "learning_rate": 4.552511415525114e-07, "loss": 0.0035, "reward": 1.8638021349906921, "reward_std": 0.14757593348622322, "rewards/accuracy_reward": 0.8872395753860474, "rewards/format_reward": 0.9765625, "step": 1193 }, { "completion_length": 52.4609375, "epoch": 5.4520547945205475, "grad_norm": 5.206370830535889, "kl": 0.17138671875, "learning_rate": 4.547945205479452e-07, "loss": 0.0069, "reward": 1.6548038721084595, "reward_std": 0.23970913887023926, "rewards/accuracy_reward": 0.6704288721084595, "rewards/format_reward": 0.984375, "step": 1194 }, { "completion_length": 61.78125, "epoch": 5.45662100456621, "grad_norm": 4.111104488372803, "kl": 0.15966796875, "learning_rate": 4.54337899543379e-07, "loss": 0.0064, "reward": 1.7300130128860474, "reward_std": 0.17958877980709076, "rewards/accuracy_reward": 0.7456380128860474, "rewards/format_reward": 0.984375, "step": 1195 }, { "completion_length": 79.1796875, "epoch": 5.461187214611872, "grad_norm": 1.4285087585449219, "kl": 0.10009765625, "learning_rate": 4.5388127853881276e-07, "loss": 0.004, "reward": 1.8193080425262451, "reward_std": 0.11568646691739559, "rewards/accuracy_reward": 0.827120453119278, "rewards/format_reward": 0.9921875, "step": 1196 }, { "completion_length": 80.234375, "epoch": 5.465753424657534, "grad_norm": 24.182233810424805, "kl": 0.119384765625, "learning_rate": 4.534246575342466e-07, "loss": 0.0048, "reward": 1.6295573711395264, "reward_std": 0.16487130522727966, "rewards/accuracy_reward": 0.6451822966337204, "rewards/format_reward": 0.984375, "step": 1197 }, { "completion_length": 73.296875, "epoch": 5.470319634703197, "grad_norm": 1.8759607076644897, "kl": 0.115234375, "learning_rate": 4.5296803652968034e-07, "loss": 0.0046, "reward": 1.742903709411621, "reward_std": 0.14027476869523525, "rewards/accuracy_reward": 0.7585286498069763, "rewards/format_reward": 0.984375, "step": 1198 }, { "completion_length": 56.65625, "epoch": 5.474885844748858, "grad_norm": 2.124757766723633, "kl": 0.129638671875, "learning_rate": 4.525114155251141e-07, "loss": 0.0052, "reward": 1.766406238079071, "reward_std": 0.11525032296776772, "rewards/accuracy_reward": 0.7664062678813934, "rewards/format_reward": 1.0, "step": 1199 }, { "completion_length": 71.984375, "epoch": 5.47945205479452, "grad_norm": 3.025282382965088, "kl": 0.1591796875, "learning_rate": 4.520547945205479e-07, "loss": 0.0064, "reward": 1.565625011920929, "reward_std": 0.25334832072257996, "rewards/accuracy_reward": 0.5812499821186066, "rewards/format_reward": 0.984375, "step": 1200 }, { "completion_length": 71.421875, "epoch": 5.4840182648401825, "grad_norm": 3.4319229125976562, "kl": 0.141357421875, "learning_rate": 4.5159817351598175e-07, "loss": 0.0057, "reward": 1.8551432490348816, "reward_std": 0.10183484852313995, "rewards/accuracy_reward": 0.8551432490348816, "rewards/format_reward": 1.0, "step": 1201 }, { "completion_length": 85.296875, "epoch": 5.488584474885845, "grad_norm": 2.546898365020752, "kl": 0.114501953125, "learning_rate": 4.511415525114155e-07, "loss": 0.0046, "reward": 1.6248489022254944, "reward_std": 0.21220972388982773, "rewards/accuracy_reward": 0.6404739022254944, "rewards/format_reward": 0.984375, "step": 1202 }, { "completion_length": 107.4453125, "epoch": 5.493150684931507, "grad_norm": 2.221569299697876, "kl": 0.12939453125, "learning_rate": 4.5068493150684927e-07, "loss": 0.0052, "reward": 1.7825521230697632, "reward_std": 0.11523282900452614, "rewards/accuracy_reward": 0.7903645038604736, "rewards/format_reward": 0.9921875, "step": 1203 }, { "completion_length": 68.296875, "epoch": 5.497716894977169, "grad_norm": 5.981943130493164, "kl": 0.162109375, "learning_rate": 4.502283105022831e-07, "loss": 0.0065, "reward": 1.6208333373069763, "reward_std": 0.18434154987335205, "rewards/accuracy_reward": 0.6208333373069763, "rewards/format_reward": 1.0, "step": 1204 }, { "completion_length": 82.796875, "epoch": 5.502283105022831, "grad_norm": 6.735628128051758, "kl": 0.10888671875, "learning_rate": 4.497716894977169e-07, "loss": 0.0044, "reward": 1.7694801092147827, "reward_std": 0.08905210345983505, "rewards/accuracy_reward": 0.7694801688194275, "rewards/format_reward": 1.0, "step": 1205 }, { "completion_length": 48.140625, "epoch": 5.506849315068493, "grad_norm": 4.568028926849365, "kl": 0.12255859375, "learning_rate": 4.4931506849315063e-07, "loss": 0.0049, "reward": 1.8573929071426392, "reward_std": 0.12051096558570862, "rewards/accuracy_reward": 0.8573929369449615, "rewards/format_reward": 1.0, "step": 1206 }, { "completion_length": 79.2265625, "epoch": 5.511415525114155, "grad_norm": 2.16815447807312, "kl": 0.114990234375, "learning_rate": 4.488584474885845e-07, "loss": 0.0046, "reward": 1.8097842931747437, "reward_std": 0.13499605283141136, "rewards/accuracy_reward": 0.8175966441631317, "rewards/format_reward": 0.9921875, "step": 1207 }, { "completion_length": 83.3828125, "epoch": 5.5159817351598175, "grad_norm": 9.177694320678711, "kl": 0.100830078125, "learning_rate": 4.4840182648401826e-07, "loss": 0.004, "reward": 1.8088541626930237, "reward_std": 0.14230135083198547, "rewards/accuracy_reward": 0.8166666626930237, "rewards/format_reward": 0.9921875, "step": 1208 }, { "completion_length": 66.96875, "epoch": 5.52054794520548, "grad_norm": 5.906530380249023, "kl": 0.109375, "learning_rate": 4.4794520547945205e-07, "loss": 0.0044, "reward": 1.6549479365348816, "reward_std": 0.16808292269706726, "rewards/accuracy_reward": 0.6549479067325592, "rewards/format_reward": 1.0, "step": 1209 }, { "completion_length": 71.90625, "epoch": 5.525114155251142, "grad_norm": 2.249753713607788, "kl": 0.14599609375, "learning_rate": 4.474885844748858e-07, "loss": 0.0058, "reward": 1.7879971265792847, "reward_std": 0.13272903114557266, "rewards/accuracy_reward": 0.8036221563816071, "rewards/format_reward": 0.984375, "step": 1210 }, { "completion_length": 71.3359375, "epoch": 5.529680365296803, "grad_norm": 3.3971409797668457, "kl": 0.142578125, "learning_rate": 4.470319634703196e-07, "loss": 0.0057, "reward": 1.7578125596046448, "reward_std": 0.14966705441474915, "rewards/accuracy_reward": 0.7578124403953552, "rewards/format_reward": 1.0, "step": 1211 }, { "completion_length": 82.359375, "epoch": 5.534246575342466, "grad_norm": 4.350315093994141, "kl": 0.116455078125, "learning_rate": 4.465753424657534e-07, "loss": 0.0047, "reward": 1.66940575838089, "reward_std": 0.14690347015857697, "rewards/accuracy_reward": 0.6850306689739227, "rewards/format_reward": 0.984375, "step": 1212 }, { "completion_length": 81.0234375, "epoch": 5.538812785388128, "grad_norm": 2.2065749168395996, "kl": 0.094970703125, "learning_rate": 4.461187214611872e-07, "loss": 0.0038, "reward": 1.7763858437538147, "reward_std": 0.07247142866253853, "rewards/accuracy_reward": 0.7763858437538147, "rewards/format_reward": 1.0, "step": 1213 }, { "completion_length": 50.5703125, "epoch": 5.54337899543379, "grad_norm": 1.8587976694107056, "kl": 0.17333984375, "learning_rate": 4.45662100456621e-07, "loss": 0.0069, "reward": 1.8041483163833618, "reward_std": 0.1315086344256997, "rewards/accuracy_reward": 0.8119607865810394, "rewards/format_reward": 0.9921875, "step": 1214 }, { "completion_length": 72.6953125, "epoch": 5.5479452054794525, "grad_norm": 2.4681146144866943, "kl": 0.10400390625, "learning_rate": 4.4520547945205477e-07, "loss": 0.0042, "reward": 1.7903646230697632, "reward_std": 0.08470549248158932, "rewards/accuracy_reward": 0.7903645634651184, "rewards/format_reward": 1.0, "step": 1215 }, { "completion_length": 85.7890625, "epoch": 5.552511415525114, "grad_norm": 3.0611839294433594, "kl": 0.077392578125, "learning_rate": 4.4474885844748856e-07, "loss": 0.0031, "reward": 1.7632812857627869, "reward_std": 0.20296207815408707, "rewards/accuracy_reward": 0.7789061665534973, "rewards/format_reward": 0.984375, "step": 1216 }, { "completion_length": 68.2421875, "epoch": 5.557077625570776, "grad_norm": 2.2732932567596436, "kl": 0.120849609375, "learning_rate": 4.442922374429224e-07, "loss": 0.0048, "reward": 1.663699746131897, "reward_std": 0.1902095228433609, "rewards/accuracy_reward": 0.6715123057365417, "rewards/format_reward": 0.9921875, "step": 1217 }, { "completion_length": 87.59375, "epoch": 5.561643835616438, "grad_norm": 1.4389771223068237, "kl": 0.117431640625, "learning_rate": 4.4383561643835613e-07, "loss": 0.0047, "reward": 1.8174665570259094, "reward_std": 0.0971344392746687, "rewards/accuracy_reward": 0.825279027223587, "rewards/format_reward": 0.9921875, "step": 1218 }, { "completion_length": 98.3046875, "epoch": 5.566210045662101, "grad_norm": 1.4271727800369263, "kl": 0.12255859375, "learning_rate": 4.433789954337899e-07, "loss": 0.0049, "reward": 1.8162059783935547, "reward_std": 0.11401430889964104, "rewards/accuracy_reward": 0.8240183591842651, "rewards/format_reward": 0.9921875, "step": 1219 }, { "completion_length": 73.3046875, "epoch": 5.570776255707763, "grad_norm": 2.519207239151001, "kl": 0.117919921875, "learning_rate": 4.429223744292237e-07, "loss": 0.0047, "reward": 1.7786458730697632, "reward_std": 0.13994821161031723, "rewards/accuracy_reward": 0.7786457538604736, "rewards/format_reward": 1.0, "step": 1220 }, { "completion_length": 68.9453125, "epoch": 5.575342465753424, "grad_norm": 2.901245594024658, "kl": 0.16357421875, "learning_rate": 4.4246575342465755e-07, "loss": 0.0065, "reward": 1.639657735824585, "reward_std": 0.24836767464876175, "rewards/accuracy_reward": 0.655282735824585, "rewards/format_reward": 0.984375, "step": 1221 }, { "completion_length": 96.234375, "epoch": 5.579908675799087, "grad_norm": 1.3380907773971558, "kl": 0.1065673828125, "learning_rate": 4.420091324200913e-07, "loss": 0.0043, "reward": 1.8123116493225098, "reward_std": 0.06037373095750809, "rewards/accuracy_reward": 0.8123115301132202, "rewards/format_reward": 1.0, "step": 1222 }, { "completion_length": 78.203125, "epoch": 5.584474885844749, "grad_norm": 2.1692612171173096, "kl": 0.11328125, "learning_rate": 4.4155251141552507e-07, "loss": 0.0045, "reward": 1.852519154548645, "reward_std": 0.08084761165082455, "rewards/accuracy_reward": 0.8603315949440002, "rewards/format_reward": 0.9921875, "step": 1223 }, { "completion_length": 76.1015625, "epoch": 5.589041095890411, "grad_norm": 1.5047615766525269, "kl": 0.13916015625, "learning_rate": 4.410958904109589e-07, "loss": 0.0056, "reward": 1.8007813096046448, "reward_std": 0.13284454122185707, "rewards/accuracy_reward": 0.8164061903953552, "rewards/format_reward": 0.984375, "step": 1224 }, { "completion_length": 95.3671875, "epoch": 5.593607305936073, "grad_norm": 5.50695276260376, "kl": 0.0765380859375, "learning_rate": 4.406392694063927e-07, "loss": 0.0031, "reward": 1.821587860584259, "reward_std": 0.11784346960484982, "rewards/accuracy_reward": 0.8294003307819366, "rewards/format_reward": 0.9921875, "step": 1225 }, { "completion_length": 79.359375, "epoch": 5.598173515981735, "grad_norm": 1.8532605171203613, "kl": 0.123046875, "learning_rate": 4.4018264840182643e-07, "loss": 0.0049, "reward": 1.672764003276825, "reward_std": 0.1454983726143837, "rewards/accuracy_reward": 0.672764003276825, "rewards/format_reward": 1.0, "step": 1226 }, { "completion_length": 49.9296875, "epoch": 5.602739726027397, "grad_norm": 4.534260272979736, "kl": 0.18798828125, "learning_rate": 4.397260273972603e-07, "loss": 0.0075, "reward": 1.7486504912376404, "reward_std": 0.23138123750686646, "rewards/accuracy_reward": 0.7642754018306732, "rewards/format_reward": 0.984375, "step": 1227 }, { "completion_length": 79.5390625, "epoch": 5.607305936073059, "grad_norm": 2.4813179969787598, "kl": 0.113525390625, "learning_rate": 4.3926940639269406e-07, "loss": 0.0045, "reward": 1.854801058769226, "reward_std": 0.15426605194807053, "rewards/accuracy_reward": 0.8782385289669037, "rewards/format_reward": 0.9765625, "step": 1228 }, { "completion_length": 65.8984375, "epoch": 5.6118721461187215, "grad_norm": 1.7068575620651245, "kl": 0.123291015625, "learning_rate": 4.3881278538812785e-07, "loss": 0.0049, "reward": 1.8518972992897034, "reward_std": 0.11522487178444862, "rewards/accuracy_reward": 0.8675222992897034, "rewards/format_reward": 0.984375, "step": 1229 }, { "completion_length": 73.5390625, "epoch": 5.616438356164384, "grad_norm": 2.039097309112549, "kl": 0.150390625, "learning_rate": 4.383561643835616e-07, "loss": 0.006, "reward": 1.8411458730697632, "reward_std": 0.12557167932391167, "rewards/accuracy_reward": 0.8411458134651184, "rewards/format_reward": 1.0, "step": 1230 }, { "completion_length": 62.3515625, "epoch": 5.621004566210045, "grad_norm": 1.1969468593597412, "kl": 0.14990234375, "learning_rate": 4.378995433789954e-07, "loss": 0.006, "reward": 1.891406238079071, "reward_std": 0.055242715403437614, "rewards/accuracy_reward": 0.8914062678813934, "rewards/format_reward": 1.0, "step": 1231 }, { "completion_length": 65.2421875, "epoch": 5.6255707762557075, "grad_norm": 0.9585725665092468, "kl": 0.095703125, "learning_rate": 4.374429223744292e-07, "loss": 0.0038, "reward": 1.806380271911621, "reward_std": 0.053401291370391846, "rewards/accuracy_reward": 0.8063801527023315, "rewards/format_reward": 1.0, "step": 1232 }, { "completion_length": 94.8828125, "epoch": 5.63013698630137, "grad_norm": 1.548374891281128, "kl": 0.121826171875, "learning_rate": 4.36986301369863e-07, "loss": 0.0049, "reward": 1.8385416865348816, "reward_std": 0.07087302953004837, "rewards/accuracy_reward": 0.8463541269302368, "rewards/format_reward": 0.9921875, "step": 1233 }, { "completion_length": 65.015625, "epoch": 5.634703196347032, "grad_norm": 3.663956880569458, "kl": 0.10302734375, "learning_rate": 4.365296803652968e-07, "loss": 0.0041, "reward": 1.701562523841858, "reward_std": 0.11350465193390846, "rewards/accuracy_reward": 0.7015624940395355, "rewards/format_reward": 1.0, "step": 1234 }, { "completion_length": 76.859375, "epoch": 5.639269406392694, "grad_norm": 4.082939624786377, "kl": 0.123046875, "learning_rate": 4.3607305936073057e-07, "loss": 0.0049, "reward": 1.7964844107627869, "reward_std": 0.1577010676264763, "rewards/accuracy_reward": 0.7964843213558197, "rewards/format_reward": 1.0, "step": 1235 }, { "completion_length": 72.140625, "epoch": 5.6438356164383565, "grad_norm": 2.4908390045166016, "kl": 0.15625, "learning_rate": 4.3561643835616436e-07, "loss": 0.0062, "reward": 1.7734375596046448, "reward_std": 0.13743899390101433, "rewards/accuracy_reward": 0.7734374701976776, "rewards/format_reward": 1.0, "step": 1236 }, { "completion_length": 73.1953125, "epoch": 5.648401826484018, "grad_norm": 2.1884684562683105, "kl": 0.122314453125, "learning_rate": 4.351598173515982e-07, "loss": 0.0049, "reward": 1.7967329621315002, "reward_std": 0.22378800809383392, "rewards/accuracy_reward": 0.8123579323291779, "rewards/format_reward": 0.984375, "step": 1237 }, { "completion_length": 74.21875, "epoch": 5.65296803652968, "grad_norm": 2.7931785583496094, "kl": 0.149169921875, "learning_rate": 4.3470319634703193e-07, "loss": 0.006, "reward": 1.7240886092185974, "reward_std": 0.15288061648607254, "rewards/accuracy_reward": 0.7240885198116302, "rewards/format_reward": 1.0, "step": 1238 }, { "completion_length": 74.875, "epoch": 5.657534246575342, "grad_norm": 7.83119535446167, "kl": 0.398681640625, "learning_rate": 4.342465753424657e-07, "loss": 0.0159, "reward": 1.7018229365348816, "reward_std": 0.20128536224365234, "rewards/accuracy_reward": 0.7252604067325592, "rewards/format_reward": 0.9765625, "step": 1239 }, { "completion_length": 85.15625, "epoch": 5.662100456621005, "grad_norm": 2.2747838497161865, "kl": 0.084716796875, "learning_rate": 4.337899543378995e-07, "loss": 0.0034, "reward": 1.7470728158950806, "reward_std": 0.09693595767021179, "rewards/accuracy_reward": 0.7470727562904358, "rewards/format_reward": 1.0, "step": 1240 }, { "completion_length": 62.78125, "epoch": 5.666666666666667, "grad_norm": 3.267308235168457, "kl": 0.15283203125, "learning_rate": 4.3333333333333335e-07, "loss": 0.0061, "reward": 1.7454678416252136, "reward_std": 0.1368042230606079, "rewards/accuracy_reward": 0.7532803416252136, "rewards/format_reward": 0.9921875, "step": 1241 }, { "completion_length": 88.375, "epoch": 5.671232876712329, "grad_norm": 2.6403214931488037, "kl": 0.1396484375, "learning_rate": 4.328767123287671e-07, "loss": 0.0056, "reward": 1.7575656175613403, "reward_std": 0.20803096145391464, "rewards/accuracy_reward": 0.7810031175613403, "rewards/format_reward": 0.9765625, "step": 1242 }, { "completion_length": 79.96875, "epoch": 5.675799086757991, "grad_norm": 3.1201863288879395, "kl": 0.122314453125, "learning_rate": 4.3242009132420087e-07, "loss": 0.0049, "reward": 1.7832190990447998, "reward_std": 0.16100692749023438, "rewards/accuracy_reward": 0.7988439798355103, "rewards/format_reward": 0.984375, "step": 1243 }, { "completion_length": 81.8515625, "epoch": 5.680365296803653, "grad_norm": 2.047901153564453, "kl": 0.121337890625, "learning_rate": 4.319634703196347e-07, "loss": 0.0049, "reward": 1.760881781578064, "reward_std": 0.11731128394603729, "rewards/accuracy_reward": 0.7608817219734192, "rewards/format_reward": 1.0, "step": 1244 }, { "completion_length": 73.046875, "epoch": 5.684931506849315, "grad_norm": 1.7335350513458252, "kl": 0.147705078125, "learning_rate": 4.315068493150685e-07, "loss": 0.0059, "reward": 1.7863653898239136, "reward_std": 0.10918881744146347, "rewards/accuracy_reward": 0.8019903302192688, "rewards/format_reward": 0.984375, "step": 1245 }, { "completion_length": 71.8125, "epoch": 5.689497716894977, "grad_norm": 2.161808967590332, "kl": 0.149169921875, "learning_rate": 4.3105022831050223e-07, "loss": 0.006, "reward": 1.7140624523162842, "reward_std": 0.14955899119377136, "rewards/accuracy_reward": 0.7140624821186066, "rewards/format_reward": 1.0, "step": 1246 }, { "completion_length": 70.53125, "epoch": 5.69406392694064, "grad_norm": 1.7596287727355957, "kl": 0.13671875, "learning_rate": 4.3059360730593607e-07, "loss": 0.0055, "reward": 1.6859374642372131, "reward_std": 0.13941731676459312, "rewards/accuracy_reward": 0.6859374940395355, "rewards/format_reward": 1.0, "step": 1247 }, { "completion_length": 78.921875, "epoch": 5.698630136986301, "grad_norm": 1.425408959388733, "kl": 0.093994140625, "learning_rate": 4.3013698630136986e-07, "loss": 0.0038, "reward": 1.8960938453674316, "reward_std": 0.10968662425875664, "rewards/accuracy_reward": 0.9039062261581421, "rewards/format_reward": 0.9921875, "step": 1248 }, { "completion_length": 82.5625, "epoch": 5.703196347031963, "grad_norm": 3.0568716526031494, "kl": 0.115234375, "learning_rate": 4.2968036529680365e-07, "loss": 0.0046, "reward": 1.6471354365348816, "reward_std": 0.22037852555513382, "rewards/accuracy_reward": 0.6549479067325592, "rewards/format_reward": 0.9921875, "step": 1249 }, { "completion_length": 92.828125, "epoch": 5.707762557077626, "grad_norm": 27.536405563354492, "kl": 0.10205078125, "learning_rate": 4.292237442922374e-07, "loss": 0.0041, "reward": 1.75323486328125, "reward_std": 0.09919268637895584, "rewards/accuracy_reward": 0.76104736328125, "rewards/format_reward": 0.9921875, "step": 1250 }, { "completion_length": 75.4609375, "epoch": 5.712328767123288, "grad_norm": 4.183087348937988, "kl": 0.100830078125, "learning_rate": 4.287671232876712e-07, "loss": 0.004, "reward": 1.625, "reward_std": 0.13258251547813416, "rewards/accuracy_reward": 0.6249999403953552, "rewards/format_reward": 1.0, "step": 1251 }, { "completion_length": 74.15625, "epoch": 5.71689497716895, "grad_norm": 7.003372669219971, "kl": 0.138671875, "learning_rate": 4.28310502283105e-07, "loss": 0.0055, "reward": 1.631250023841858, "reward_std": 0.23694030940532684, "rewards/accuracy_reward": 0.6390624642372131, "rewards/format_reward": 0.9921875, "step": 1252 }, { "completion_length": 71.2265625, "epoch": 5.7214611872146115, "grad_norm": 4.267425060272217, "kl": 0.159912109375, "learning_rate": 4.278538812785388e-07, "loss": 0.0064, "reward": 1.7780134677886963, "reward_std": 0.16748925298452377, "rewards/accuracy_reward": 0.7858259081840515, "rewards/format_reward": 0.9921875, "step": 1253 }, { "completion_length": 76.8984375, "epoch": 5.726027397260274, "grad_norm": 2.602168083190918, "kl": 0.140625, "learning_rate": 4.273972602739726e-07, "loss": 0.0056, "reward": 1.7774969339370728, "reward_std": 0.1920681744813919, "rewards/accuracy_reward": 0.7853094637393951, "rewards/format_reward": 0.9921875, "step": 1254 }, { "completion_length": 80.0703125, "epoch": 5.730593607305936, "grad_norm": 3.621494770050049, "kl": 0.14501953125, "learning_rate": 4.2694063926940637e-07, "loss": 0.0058, "reward": 1.7632812857627869, "reward_std": 0.18639566004276276, "rewards/accuracy_reward": 0.7710936963558197, "rewards/format_reward": 0.9921875, "step": 1255 }, { "completion_length": 81.65625, "epoch": 5.735159817351598, "grad_norm": 2.0993213653564453, "kl": 0.143310546875, "learning_rate": 4.2648401826484016e-07, "loss": 0.0057, "reward": 1.765625, "reward_std": 0.1767766885459423, "rewards/accuracy_reward": 0.7812499403953552, "rewards/format_reward": 0.984375, "step": 1256 }, { "completion_length": 63.765625, "epoch": 5.739726027397261, "grad_norm": 2.3051700592041016, "kl": 0.19140625, "learning_rate": 4.26027397260274e-07, "loss": 0.0077, "reward": 1.800000011920929, "reward_std": 0.11946406960487366, "rewards/accuracy_reward": 0.8078124821186066, "rewards/format_reward": 0.9921875, "step": 1257 }, { "completion_length": 90.7265625, "epoch": 5.744292237442922, "grad_norm": 5.680175304412842, "kl": 0.13720703125, "learning_rate": 4.2557077625570773e-07, "loss": 0.0055, "reward": 1.7391927242279053, "reward_std": 0.1934903860092163, "rewards/accuracy_reward": 0.7548176646232605, "rewards/format_reward": 0.984375, "step": 1258 }, { "completion_length": 67.1171875, "epoch": 5.748858447488584, "grad_norm": 3.6025280952453613, "kl": 0.1533203125, "learning_rate": 4.251141552511415e-07, "loss": 0.0061, "reward": 1.6899954080581665, "reward_std": 0.177649587392807, "rewards/accuracy_reward": 0.6899954080581665, "rewards/format_reward": 1.0, "step": 1259 }, { "completion_length": 78.46875, "epoch": 5.7534246575342465, "grad_norm": 2.8901851177215576, "kl": 0.150390625, "learning_rate": 4.246575342465753e-07, "loss": 0.006, "reward": 1.65723317861557, "reward_std": 0.15903639048337936, "rewards/accuracy_reward": 0.6728581488132477, "rewards/format_reward": 0.984375, "step": 1260 }, { "completion_length": 90.2890625, "epoch": 5.757990867579909, "grad_norm": 30.16987419128418, "kl": 0.11328125, "learning_rate": 4.2420091324200915e-07, "loss": 0.0045, "reward": 1.80078125, "reward_std": 0.13798470050096512, "rewards/accuracy_reward": 0.8007811605930328, "rewards/format_reward": 1.0, "step": 1261 }, { "completion_length": 69.9375, "epoch": 5.762557077625571, "grad_norm": 4.615162372589111, "kl": 0.2236328125, "learning_rate": 4.237442922374429e-07, "loss": 0.0089, "reward": 1.7741782069206238, "reward_std": 0.1786958873271942, "rewards/accuracy_reward": 0.7819906771183014, "rewards/format_reward": 0.9921875, "step": 1262 }, { "completion_length": 85.2421875, "epoch": 5.767123287671232, "grad_norm": 2.68100905418396, "kl": 0.10888671875, "learning_rate": 4.2328767123287667e-07, "loss": 0.0044, "reward": 1.7717448472976685, "reward_std": 0.1444082222878933, "rewards/accuracy_reward": 0.7795572876930237, "rewards/format_reward": 0.9921875, "step": 1263 }, { "completion_length": 89.0390625, "epoch": 5.771689497716895, "grad_norm": 3.867462158203125, "kl": 0.141845703125, "learning_rate": 4.228310502283105e-07, "loss": 0.0057, "reward": 1.6650173664093018, "reward_std": 0.21590976417064667, "rewards/accuracy_reward": 0.6806423366069794, "rewards/format_reward": 0.984375, "step": 1264 }, { "completion_length": 72.515625, "epoch": 5.776255707762557, "grad_norm": 9.51895523071289, "kl": 0.103271484375, "learning_rate": 4.223744292237443e-07, "loss": 0.0041, "reward": 1.7899226546287537, "reward_std": 0.1994732916355133, "rewards/accuracy_reward": 0.8133600950241089, "rewards/format_reward": 0.9765625, "step": 1265 }, { "completion_length": 87.1328125, "epoch": 5.780821917808219, "grad_norm": 3.7063732147216797, "kl": 0.142578125, "learning_rate": 4.2191780821917803e-07, "loss": 0.0057, "reward": 1.7083333730697632, "reward_std": 0.19097717106342316, "rewards/accuracy_reward": 0.7161458134651184, "rewards/format_reward": 0.9921875, "step": 1266 }, { "completion_length": 85.3828125, "epoch": 5.7853881278538815, "grad_norm": 2.703822374343872, "kl": 0.13037109375, "learning_rate": 4.2146118721461187e-07, "loss": 0.0052, "reward": 1.5992187857627869, "reward_std": 0.2338416427373886, "rewards/accuracy_reward": 0.6148437559604645, "rewards/format_reward": 0.984375, "step": 1267 }, { "completion_length": 70.9453125, "epoch": 5.789954337899544, "grad_norm": 4.01165246963501, "kl": 0.17236328125, "learning_rate": 4.2100456621004566e-07, "loss": 0.0069, "reward": 1.794851839542389, "reward_std": 0.17533704824745655, "rewards/accuracy_reward": 0.8104767203330994, "rewards/format_reward": 0.984375, "step": 1268 }, { "completion_length": 78.515625, "epoch": 5.794520547945205, "grad_norm": 4.9043684005737305, "kl": 0.137451171875, "learning_rate": 4.2054794520547945e-07, "loss": 0.0055, "reward": 1.6109731793403625, "reward_std": 0.2282901182770729, "rewards/accuracy_reward": 0.6187856197357178, "rewards/format_reward": 0.9921875, "step": 1269 }, { "completion_length": 89.8671875, "epoch": 5.799086757990867, "grad_norm": 2.3333468437194824, "kl": 0.099853515625, "learning_rate": 4.200913242009132e-07, "loss": 0.004, "reward": 1.7177083492279053, "reward_std": 0.17229503020644188, "rewards/accuracy_reward": 0.7255208194255829, "rewards/format_reward": 0.9921875, "step": 1270 }, { "completion_length": 71.6171875, "epoch": 5.80365296803653, "grad_norm": 2.1021461486816406, "kl": 0.153564453125, "learning_rate": 4.19634703196347e-07, "loss": 0.0061, "reward": 1.6888020634651184, "reward_std": 0.30169400572776794, "rewards/accuracy_reward": 0.7200521230697632, "rewards/format_reward": 0.96875, "step": 1271 }, { "completion_length": 84.703125, "epoch": 5.808219178082192, "grad_norm": 4.3470282554626465, "kl": 0.148193359375, "learning_rate": 4.191780821917808e-07, "loss": 0.0059, "reward": 1.7307292222976685, "reward_std": 0.1593589335680008, "rewards/accuracy_reward": 0.7463541030883789, "rewards/format_reward": 0.984375, "step": 1272 }, { "completion_length": 63.3203125, "epoch": 5.812785388127854, "grad_norm": 5.333644390106201, "kl": 0.159912109375, "learning_rate": 4.187214611872146e-07, "loss": 0.0064, "reward": 1.6519964933395386, "reward_std": 0.18434231728315353, "rewards/accuracy_reward": 0.6519964933395386, "rewards/format_reward": 1.0, "step": 1273 }, { "completion_length": 67.6953125, "epoch": 5.817351598173516, "grad_norm": 2.9642715454101562, "kl": 0.1298828125, "learning_rate": 4.182648401826484e-07, "loss": 0.0052, "reward": 1.6983258724212646, "reward_std": 0.15258603543043137, "rewards/accuracy_reward": 0.698325902223587, "rewards/format_reward": 1.0, "step": 1274 }, { "completion_length": 74.75, "epoch": 5.821917808219178, "grad_norm": 2.8370203971862793, "kl": 0.1337890625, "learning_rate": 4.1780821917808217e-07, "loss": 0.0053, "reward": 1.757942795753479, "reward_std": 0.24083544313907623, "rewards/accuracy_reward": 0.7813802063465118, "rewards/format_reward": 0.9765625, "step": 1275 }, { "completion_length": 90.6640625, "epoch": 5.82648401826484, "grad_norm": 1.1865882873535156, "kl": 0.076171875, "learning_rate": 4.1735159817351596e-07, "loss": 0.0031, "reward": 1.8952009081840515, "reward_std": 0.09130230359733105, "rewards/accuracy_reward": 0.9030132591724396, "rewards/format_reward": 0.9921875, "step": 1276 }, { "completion_length": 89.4375, "epoch": 5.831050228310502, "grad_norm": 2.1936404705047607, "kl": 0.094482421875, "learning_rate": 4.168949771689498e-07, "loss": 0.0038, "reward": 1.672282099723816, "reward_std": 0.1674654446542263, "rewards/accuracy_reward": 0.6957195997238159, "rewards/format_reward": 0.9765625, "step": 1277 }, { "completion_length": 88.984375, "epoch": 5.835616438356165, "grad_norm": 2.0885391235351562, "kl": 0.12646484375, "learning_rate": 4.1643835616438353e-07, "loss": 0.0051, "reward": 1.6463323831558228, "reward_std": 0.18611476570367813, "rewards/accuracy_reward": 0.6619572937488556, "rewards/format_reward": 0.984375, "step": 1278 }, { "completion_length": 75.7109375, "epoch": 5.840182648401827, "grad_norm": 6.192373275756836, "kl": 0.135009765625, "learning_rate": 4.159817351598173e-07, "loss": 0.0054, "reward": 1.634151816368103, "reward_std": 0.11692540347576141, "rewards/accuracy_reward": 0.6419642567634583, "rewards/format_reward": 0.9921875, "step": 1279 }, { "completion_length": 80.0078125, "epoch": 5.844748858447488, "grad_norm": 2.136561155319214, "kl": 0.16015625, "learning_rate": 4.155251141552511e-07, "loss": 0.0064, "reward": 1.7792754769325256, "reward_std": 0.09816346131265163, "rewards/accuracy_reward": 0.7870878875255585, "rewards/format_reward": 0.9921875, "step": 1280 }, { "completion_length": 99.0078125, "epoch": 5.8493150684931505, "grad_norm": 1.984879732131958, "kl": 0.08544921875, "learning_rate": 4.1506849315068495e-07, "loss": 0.0034, "reward": 1.7765625715255737, "reward_std": 0.06629125960171223, "rewards/accuracy_reward": 0.7843749523162842, "rewards/format_reward": 0.9921875, "step": 1281 }, { "completion_length": 82.71875, "epoch": 5.853881278538813, "grad_norm": 2.8610336780548096, "kl": 0.11669921875, "learning_rate": 4.146118721461187e-07, "loss": 0.0047, "reward": 1.854315459728241, "reward_std": 0.11991548165678978, "rewards/accuracy_reward": 0.8543154299259186, "rewards/format_reward": 1.0, "step": 1282 }, { "completion_length": 98.296875, "epoch": 5.858447488584475, "grad_norm": 1.845438003540039, "kl": 0.101318359375, "learning_rate": 4.1415525114155247e-07, "loss": 0.004, "reward": 1.8150991797447205, "reward_std": 0.07342393416911364, "rewards/accuracy_reward": 0.8150991201400757, "rewards/format_reward": 1.0, "step": 1283 }, { "completion_length": 67.1953125, "epoch": 5.863013698630137, "grad_norm": 5.036915302276611, "kl": 0.145751953125, "learning_rate": 4.136986301369863e-07, "loss": 0.0058, "reward": 1.7412946820259094, "reward_std": 0.175389152020216, "rewards/accuracy_reward": 0.7491071224212646, "rewards/format_reward": 0.9921875, "step": 1284 }, { "completion_length": 67.8828125, "epoch": 5.867579908675799, "grad_norm": 5.918455123901367, "kl": 0.150390625, "learning_rate": 4.132420091324201e-07, "loss": 0.006, "reward": 1.7437500357627869, "reward_std": 0.2584189176559448, "rewards/accuracy_reward": 0.7749999761581421, "rewards/format_reward": 0.96875, "step": 1285 }, { "completion_length": 74.6015625, "epoch": 5.872146118721461, "grad_norm": 1.850948691368103, "kl": 0.132568359375, "learning_rate": 4.1278538812785383e-07, "loss": 0.0053, "reward": 1.9023438096046448, "reward_std": 0.08417459111660719, "rewards/accuracy_reward": 0.9023437201976776, "rewards/format_reward": 1.0, "step": 1286 }, { "completion_length": 68.390625, "epoch": 5.876712328767123, "grad_norm": 4.7030534744262695, "kl": 0.166015625, "learning_rate": 4.1232876712328767e-07, "loss": 0.0067, "reward": 1.8146693706512451, "reward_std": 0.1410301998257637, "rewards/accuracy_reward": 0.8146693110466003, "rewards/format_reward": 1.0, "step": 1287 }, { "completion_length": 83.390625, "epoch": 5.8812785388127855, "grad_norm": 2.299928665161133, "kl": 0.114501953125, "learning_rate": 4.1187214611872146e-07, "loss": 0.0046, "reward": 1.7009549140930176, "reward_std": 0.18842098861932755, "rewards/accuracy_reward": 0.7087673544883728, "rewards/format_reward": 0.9921875, "step": 1288 }, { "completion_length": 77.59375, "epoch": 5.885844748858448, "grad_norm": 2.9478793144226074, "kl": 0.1025390625, "learning_rate": 4.1141552511415525e-07, "loss": 0.0041, "reward": 1.748046875, "reward_std": 0.15785659104585648, "rewards/accuracy_reward": 0.7480468451976776, "rewards/format_reward": 1.0, "step": 1289 }, { "completion_length": 100.3359375, "epoch": 5.890410958904109, "grad_norm": 2.3754220008850098, "kl": 0.112060546875, "learning_rate": 4.10958904109589e-07, "loss": 0.0045, "reward": 1.7859375476837158, "reward_std": 0.17908401414752007, "rewards/accuracy_reward": 0.8015624284744263, "rewards/format_reward": 0.984375, "step": 1290 }, { "completion_length": 69.46875, "epoch": 5.894977168949771, "grad_norm": 3.575843334197998, "kl": 0.134765625, "learning_rate": 4.105022831050228e-07, "loss": 0.0054, "reward": 1.845781147480011, "reward_std": 0.08029642701148987, "rewards/accuracy_reward": 0.8457811176776886, "rewards/format_reward": 1.0, "step": 1291 }, { "completion_length": 94.5234375, "epoch": 5.899543378995434, "grad_norm": 7.622490406036377, "kl": 0.0859375, "learning_rate": 4.100456621004566e-07, "loss": 0.0034, "reward": 1.796093761920929, "reward_std": 0.1538807898759842, "rewards/accuracy_reward": 0.803906261920929, "rewards/format_reward": 0.9921875, "step": 1292 }, { "completion_length": 82.234375, "epoch": 5.904109589041096, "grad_norm": 7.203249931335449, "kl": 0.11962890625, "learning_rate": 4.095890410958904e-07, "loss": 0.0048, "reward": 1.737942636013031, "reward_std": 0.16788282990455627, "rewards/accuracy_reward": 0.7457550466060638, "rewards/format_reward": 0.9921875, "step": 1293 }, { "completion_length": 82.7578125, "epoch": 5.908675799086758, "grad_norm": 2.2500412464141846, "kl": 0.101806640625, "learning_rate": 4.091324200913242e-07, "loss": 0.0041, "reward": 1.7422269582748413, "reward_std": 0.13895701617002487, "rewards/accuracy_reward": 0.7500394582748413, "rewards/format_reward": 0.9921875, "step": 1294 }, { "completion_length": 93.71875, "epoch": 5.91324200913242, "grad_norm": 2.051222324371338, "kl": 0.122314453125, "learning_rate": 4.0867579908675797e-07, "loss": 0.0049, "reward": 1.8802083730697632, "reward_std": 0.08680902794003487, "rewards/accuracy_reward": 0.8880207538604736, "rewards/format_reward": 0.9921875, "step": 1295 }, { "completion_length": 98.2421875, "epoch": 5.917808219178082, "grad_norm": 1.9777852296829224, "kl": 0.10009765625, "learning_rate": 4.0821917808219176e-07, "loss": 0.004, "reward": 1.805654764175415, "reward_std": 0.10066970996558666, "rewards/accuracy_reward": 0.8134672045707703, "rewards/format_reward": 0.9921875, "step": 1296 }, { "completion_length": 73.6875, "epoch": 5.922374429223744, "grad_norm": 2.501546859741211, "kl": 0.1171875, "learning_rate": 4.077625570776256e-07, "loss": 0.0047, "reward": 1.7920072674751282, "reward_std": 0.11444094032049179, "rewards/accuracy_reward": 0.7920072078704834, "rewards/format_reward": 1.0, "step": 1297 }, { "completion_length": 113.390625, "epoch": 5.926940639269406, "grad_norm": 1.9746527671813965, "kl": 0.05908203125, "learning_rate": 4.0730593607305933e-07, "loss": 0.0024, "reward": 1.830468773841858, "reward_std": 0.17702769488096237, "rewards/accuracy_reward": 0.8617186546325684, "rewards/format_reward": 0.96875, "step": 1298 }, { "completion_length": 89.3203125, "epoch": 5.931506849315069, "grad_norm": 1.95270574092865, "kl": 0.118896484375, "learning_rate": 4.068493150684931e-07, "loss": 0.0048, "reward": 1.784375011920929, "reward_std": 0.11048543453216553, "rewards/accuracy_reward": 0.784375011920929, "rewards/format_reward": 1.0, "step": 1299 }, { "completion_length": 80.3359375, "epoch": 5.936073059360731, "grad_norm": 2.471496820449829, "kl": 0.116943359375, "learning_rate": 4.063926940639269e-07, "loss": 0.0047, "reward": 1.7593750357627869, "reward_std": 0.11229449138045311, "rewards/accuracy_reward": 0.7671874761581421, "rewards/format_reward": 0.9921875, "step": 1300 }, { "completion_length": 82.578125, "epoch": 5.940639269406392, "grad_norm": 2.451826572418213, "kl": 0.15087890625, "learning_rate": 4.0593607305936075e-07, "loss": 0.006, "reward": 1.713808834552765, "reward_std": 0.16193728893995285, "rewards/accuracy_reward": 0.7216213047504425, "rewards/format_reward": 0.9921875, "step": 1301 }, { "completion_length": 68.0234375, "epoch": 5.945205479452055, "grad_norm": 1.9107226133346558, "kl": 0.087890625, "learning_rate": 4.054794520547945e-07, "loss": 0.0035, "reward": 1.7034350037574768, "reward_std": 0.09215648844838142, "rewards/accuracy_reward": 0.7034350335597992, "rewards/format_reward": 1.0, "step": 1302 }, { "completion_length": 87.0234375, "epoch": 5.949771689497717, "grad_norm": 1.9171605110168457, "kl": 0.104736328125, "learning_rate": 4.0502283105022827e-07, "loss": 0.0042, "reward": 1.7768229246139526, "reward_std": 0.09278659522533417, "rewards/accuracy_reward": 0.7768228650093079, "rewards/format_reward": 1.0, "step": 1303 }, { "completion_length": 65.109375, "epoch": 5.954337899543379, "grad_norm": 2.9926059246063232, "kl": 0.1611328125, "learning_rate": 4.045662100456621e-07, "loss": 0.0064, "reward": 1.6825549602508545, "reward_std": 0.27830731868743896, "rewards/accuracy_reward": 0.7216174304485321, "rewards/format_reward": 0.9609375, "step": 1304 }, { "completion_length": 73.109375, "epoch": 5.958904109589041, "grad_norm": 7.7803120613098145, "kl": 0.1435546875, "learning_rate": 4.041095890410959e-07, "loss": 0.0057, "reward": 1.7353760600090027, "reward_std": 0.16425937414169312, "rewards/accuracy_reward": 0.7431885600090027, "rewards/format_reward": 0.9921875, "step": 1305 }, { "completion_length": 83.65625, "epoch": 5.963470319634704, "grad_norm": 2.513974905014038, "kl": 0.105712890625, "learning_rate": 4.0365296803652963e-07, "loss": 0.0042, "reward": 1.8269480466842651, "reward_std": 0.1296938955783844, "rewards/accuracy_reward": 0.8347605168819427, "rewards/format_reward": 0.9921875, "step": 1306 }, { "completion_length": 61.1328125, "epoch": 5.968036529680365, "grad_norm": 3.1066365242004395, "kl": 0.169921875, "learning_rate": 4.0319634703196347e-07, "loss": 0.0068, "reward": 1.8063979148864746, "reward_std": 0.19071058928966522, "rewards/accuracy_reward": 0.8220228552818298, "rewards/format_reward": 0.984375, "step": 1307 }, { "completion_length": 81.75, "epoch": 5.972602739726027, "grad_norm": 6.998978614807129, "kl": 0.154296875, "learning_rate": 4.0273972602739726e-07, "loss": 0.0062, "reward": 1.7838542461395264, "reward_std": 0.13565516471862793, "rewards/accuracy_reward": 0.7916666567325592, "rewards/format_reward": 0.9921875, "step": 1308 }, { "completion_length": 65.1953125, "epoch": 5.9771689497716896, "grad_norm": 2.565167188644409, "kl": 0.11279296875, "learning_rate": 4.0228310502283105e-07, "loss": 0.0045, "reward": 1.685937523841858, "reward_std": 0.15467960759997368, "rewards/accuracy_reward": 0.6937499940395355, "rewards/format_reward": 0.9921875, "step": 1309 }, { "completion_length": 84.0703125, "epoch": 5.981735159817352, "grad_norm": 4.1508073806762695, "kl": 0.099365234375, "learning_rate": 4.018264840182648e-07, "loss": 0.004, "reward": 1.706423580646515, "reward_std": 0.1783130094408989, "rewards/accuracy_reward": 0.7220486104488373, "rewards/format_reward": 0.984375, "step": 1310 }, { "completion_length": 74.0703125, "epoch": 5.986301369863014, "grad_norm": 4.801496505737305, "kl": 0.17236328125, "learning_rate": 4.013698630136986e-07, "loss": 0.0069, "reward": 1.8074793815612793, "reward_std": 0.14287326484918594, "rewards/accuracy_reward": 0.8152918219566345, "rewards/format_reward": 0.9921875, "step": 1311 }, { "completion_length": 101.0390625, "epoch": 5.9908675799086755, "grad_norm": 3.006030797958374, "kl": 0.08642578125, "learning_rate": 4.009132420091324e-07, "loss": 0.0035, "reward": 1.8205991387367249, "reward_std": 0.14634042605757713, "rewards/accuracy_reward": 0.8284116387367249, "rewards/format_reward": 0.9921875, "step": 1312 }, { "completion_length": 70.0078125, "epoch": 5.995433789954338, "grad_norm": 3.449769973754883, "kl": 0.13720703125, "learning_rate": 4.004566210045662e-07, "loss": 0.0055, "reward": 1.651562511920929, "reward_std": 0.21917617321014404, "rewards/accuracy_reward": 0.659375011920929, "rewards/format_reward": 0.9921875, "step": 1313 }, { "completion_length": 44.0, "epoch": 6.0, "grad_norm": 3.673351287841797, "kl": 0.1494140625, "learning_rate": 4e-07, "loss": 0.0045, "reward": 1.625, "reward_std": 0.408231720328331, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 1.0, "step": 1314 }, { "completion_length": 84.4453125, "epoch": 6.004566210045662, "grad_norm": 1.5486712455749512, "kl": 0.10546875, "learning_rate": 3.9954337899543377e-07, "loss": 0.0042, "reward": 1.8225446343421936, "reward_std": 0.06181819178164005, "rewards/accuracy_reward": 0.8225446343421936, "rewards/format_reward": 1.0, "step": 1315 }, { "completion_length": 87.6484375, "epoch": 6.0091324200913245, "grad_norm": 2.9046413898468018, "kl": 0.106201171875, "learning_rate": 3.9908675799086756e-07, "loss": 0.0042, "reward": 1.8345487117767334, "reward_std": 0.1487816534936428, "rewards/accuracy_reward": 0.8501735925674438, "rewards/format_reward": 0.984375, "step": 1316 }, { "completion_length": 91.9453125, "epoch": 6.013698630136986, "grad_norm": 3.910123348236084, "kl": 0.083251953125, "learning_rate": 3.9863013698630134e-07, "loss": 0.0033, "reward": 1.6695313453674316, "reward_std": 0.2122296392917633, "rewards/accuracy_reward": 0.6851562261581421, "rewards/format_reward": 0.984375, "step": 1317 }, { "completion_length": 71.890625, "epoch": 6.018264840182648, "grad_norm": 3.065825939178467, "kl": 0.124755859375, "learning_rate": 3.9817351598173513e-07, "loss": 0.005, "reward": 1.6242188215255737, "reward_std": 0.14951937273144722, "rewards/accuracy_reward": 0.624218761920929, "rewards/format_reward": 1.0, "step": 1318 }, { "completion_length": 97.1875, "epoch": 6.0228310502283104, "grad_norm": 1.6381162405014038, "kl": 0.10986328125, "learning_rate": 3.977168949771689e-07, "loss": 0.0044, "reward": 1.8406250476837158, "reward_std": 0.14283224940299988, "rewards/accuracy_reward": 0.8562499284744263, "rewards/format_reward": 0.984375, "step": 1319 }, { "completion_length": 74.859375, "epoch": 6.027397260273973, "grad_norm": 4.058730125427246, "kl": 0.13037109375, "learning_rate": 3.972602739726027e-07, "loss": 0.0052, "reward": 1.7753472328186035, "reward_std": 0.11618576943874359, "rewards/accuracy_reward": 0.7909722030162811, "rewards/format_reward": 0.984375, "step": 1320 }, { "completion_length": 75.2109375, "epoch": 6.031963470319635, "grad_norm": 2.673145055770874, "kl": 0.103271484375, "learning_rate": 3.9680365296803655e-07, "loss": 0.0041, "reward": 1.8099148869514465, "reward_std": 0.0990208312869072, "rewards/accuracy_reward": 0.8177272975444794, "rewards/format_reward": 0.9921875, "step": 1321 }, { "completion_length": 70.84375, "epoch": 6.036529680365296, "grad_norm": 4.576772689819336, "kl": 0.12890625, "learning_rate": 3.963470319634703e-07, "loss": 0.0052, "reward": 1.7658482193946838, "reward_std": 0.1474735364317894, "rewards/accuracy_reward": 0.7814731895923615, "rewards/format_reward": 0.984375, "step": 1322 }, { "completion_length": 75.84375, "epoch": 6.041095890410959, "grad_norm": 3.1631548404693604, "kl": 0.1611328125, "learning_rate": 3.9589041095890407e-07, "loss": 0.0064, "reward": 1.6473042964935303, "reward_std": 0.17740929126739502, "rewards/accuracy_reward": 0.6473042666912079, "rewards/format_reward": 1.0, "step": 1323 }, { "completion_length": 71.1640625, "epoch": 6.045662100456621, "grad_norm": 2.176661968231201, "kl": 0.1416015625, "learning_rate": 3.954337899543379e-07, "loss": 0.0057, "reward": 1.8772332668304443, "reward_std": 0.07372352294623852, "rewards/accuracy_reward": 0.8772332966327667, "rewards/format_reward": 1.0, "step": 1324 }, { "completion_length": 80.1328125, "epoch": 6.050228310502283, "grad_norm": 2.169936180114746, "kl": 0.12109375, "learning_rate": 3.949771689497717e-07, "loss": 0.0048, "reward": 1.6571251153945923, "reward_std": 0.13837599009275436, "rewards/accuracy_reward": 0.6571250259876251, "rewards/format_reward": 1.0, "step": 1325 }, { "completion_length": 83.4375, "epoch": 6.054794520547945, "grad_norm": 2.906425714492798, "kl": 0.15966796875, "learning_rate": 3.9452054794520543e-07, "loss": 0.0064, "reward": 1.87109375, "reward_std": 0.17256292700767517, "rewards/accuracy_reward": 0.88671875, "rewards/format_reward": 0.984375, "step": 1326 }, { "completion_length": 69.375, "epoch": 6.059360730593608, "grad_norm": 2.7249135971069336, "kl": 0.197998046875, "learning_rate": 3.940639269406392e-07, "loss": 0.0079, "reward": 1.780989646911621, "reward_std": 0.19701316952705383, "rewards/accuracy_reward": 0.7888020575046539, "rewards/format_reward": 0.9921875, "step": 1327 }, { "completion_length": 85.78125, "epoch": 6.063926940639269, "grad_norm": 2.412091016769409, "kl": 0.125732421875, "learning_rate": 3.9360730593607306e-07, "loss": 0.005, "reward": 1.6776041984558105, "reward_std": 0.15093514323234558, "rewards/accuracy_reward": 0.6932291090488434, "rewards/format_reward": 0.984375, "step": 1328 }, { "completion_length": 67.96875, "epoch": 6.068493150684931, "grad_norm": 2.928151845932007, "kl": 0.14501953125, "learning_rate": 3.9315068493150684e-07, "loss": 0.0058, "reward": 1.7759548425674438, "reward_std": 0.15794897824525833, "rewards/accuracy_reward": 0.7759548723697662, "rewards/format_reward": 1.0, "step": 1329 }, { "completion_length": 94.921875, "epoch": 6.073059360730594, "grad_norm": 1.4851278066635132, "kl": 0.093994140625, "learning_rate": 3.926940639269406e-07, "loss": 0.0038, "reward": 1.8491500616073608, "reward_std": 0.14507145062088966, "rewards/accuracy_reward": 0.8725875020027161, "rewards/format_reward": 0.9765625, "step": 1330 }, { "completion_length": 78.6484375, "epoch": 6.077625570776256, "grad_norm": 2.765091896057129, "kl": 0.112548828125, "learning_rate": 3.922374429223744e-07, "loss": 0.0045, "reward": 1.8410881161689758, "reward_std": 0.14643773436546326, "rewards/accuracy_reward": 0.864525556564331, "rewards/format_reward": 0.9765625, "step": 1331 }, { "completion_length": 86.4375, "epoch": 6.082191780821918, "grad_norm": 3.7591657638549805, "kl": 0.1103515625, "learning_rate": 3.917808219178082e-07, "loss": 0.0044, "reward": 1.7199653387069702, "reward_std": 0.1599731780588627, "rewards/accuracy_reward": 0.7355902194976807, "rewards/format_reward": 0.984375, "step": 1332 }, { "completion_length": 88.1015625, "epoch": 6.0867579908675795, "grad_norm": 2.042999267578125, "kl": 0.102783203125, "learning_rate": 3.91324200913242e-07, "loss": 0.0041, "reward": 1.8294270634651184, "reward_std": 0.09024603478610516, "rewards/accuracy_reward": 0.8372394740581512, "rewards/format_reward": 0.9921875, "step": 1333 }, { "completion_length": 96.703125, "epoch": 6.091324200913242, "grad_norm": 1.6703046560287476, "kl": 0.14990234375, "learning_rate": 3.908675799086758e-07, "loss": 0.006, "reward": 1.795138955116272, "reward_std": 0.09230764210224152, "rewards/accuracy_reward": 0.8029513657093048, "rewards/format_reward": 0.9921875, "step": 1334 }, { "completion_length": 84.5078125, "epoch": 6.095890410958904, "grad_norm": 1.742505669593811, "kl": 0.10205078125, "learning_rate": 3.9041095890410957e-07, "loss": 0.0041, "reward": 1.6895833611488342, "reward_std": 0.2131306305527687, "rewards/accuracy_reward": 0.7130208313465118, "rewards/format_reward": 0.9765625, "step": 1335 }, { "completion_length": 86.1484375, "epoch": 6.100456621004566, "grad_norm": 12.497916221618652, "kl": 0.13671875, "learning_rate": 3.8995433789954336e-07, "loss": 0.0055, "reward": 1.6923341751098633, "reward_std": 0.17974907904863358, "rewards/accuracy_reward": 0.7001466453075409, "rewards/format_reward": 0.9921875, "step": 1336 }, { "completion_length": 77.3359375, "epoch": 6.105022831050229, "grad_norm": 4.783447742462158, "kl": 0.1591796875, "learning_rate": 3.8949771689497714e-07, "loss": 0.0064, "reward": 1.6346808671951294, "reward_std": 0.1741974987089634, "rewards/accuracy_reward": 0.642493337392807, "rewards/format_reward": 0.9921875, "step": 1337 }, { "completion_length": 85.03125, "epoch": 6.109589041095891, "grad_norm": 3.4984941482543945, "kl": 0.140625, "learning_rate": 3.8904109589041093e-07, "loss": 0.0056, "reward": 1.6837969422340393, "reward_std": 0.16627999395132065, "rewards/accuracy_reward": 0.6916094422340393, "rewards/format_reward": 0.9921875, "step": 1338 }, { "completion_length": 95.484375, "epoch": 6.114155251141552, "grad_norm": 3.724400520324707, "kl": 0.084228515625, "learning_rate": 3.885844748858447e-07, "loss": 0.0034, "reward": 1.7506882548332214, "reward_std": 0.16492830961942673, "rewards/accuracy_reward": 0.7741256952285767, "rewards/format_reward": 0.9765625, "step": 1339 }, { "completion_length": 84.8359375, "epoch": 6.1187214611872145, "grad_norm": 3.958569049835205, "kl": 0.1142578125, "learning_rate": 3.881278538812785e-07, "loss": 0.0046, "reward": 1.8411458730697632, "reward_std": 0.12668996676802635, "rewards/accuracy_reward": 0.8489583134651184, "rewards/format_reward": 0.9921875, "step": 1340 }, { "completion_length": 64.8828125, "epoch": 6.123287671232877, "grad_norm": 7.010223388671875, "kl": 0.1708984375, "learning_rate": 3.8767123287671235e-07, "loss": 0.0068, "reward": 1.796571135520935, "reward_std": 0.147329680621624, "rewards/accuracy_reward": 0.8043836355209351, "rewards/format_reward": 0.9921875, "step": 1341 }, { "completion_length": 51.1953125, "epoch": 6.127853881278539, "grad_norm": 1.703456163406372, "kl": 0.18798828125, "learning_rate": 3.872146118721461e-07, "loss": 0.0075, "reward": 1.7231026887893677, "reward_std": 0.15178490430116653, "rewards/accuracy_reward": 0.7231026887893677, "rewards/format_reward": 1.0, "step": 1342 }, { "completion_length": 83.125, "epoch": 6.132420091324201, "grad_norm": 2.8786849975585938, "kl": 0.12744140625, "learning_rate": 3.8675799086757987e-07, "loss": 0.0051, "reward": 1.6835938096046448, "reward_std": 0.15046585351228714, "rewards/accuracy_reward": 0.6835937201976776, "rewards/format_reward": 1.0, "step": 1343 }, { "completion_length": 84.7109375, "epoch": 6.136986301369863, "grad_norm": 3.438633680343628, "kl": 0.1298828125, "learning_rate": 3.863013698630137e-07, "loss": 0.0052, "reward": 1.8140625357627869, "reward_std": 0.15467960387468338, "rewards/accuracy_reward": 0.8140624463558197, "rewards/format_reward": 1.0, "step": 1344 }, { "completion_length": 76.953125, "epoch": 6.141552511415525, "grad_norm": 5.209409236907959, "kl": 0.1064453125, "learning_rate": 3.858447488584475e-07, "loss": 0.0043, "reward": 1.6888335943222046, "reward_std": 0.1942080445587635, "rewards/accuracy_reward": 0.696646124124527, "rewards/format_reward": 0.9921875, "step": 1345 }, { "completion_length": 75.7578125, "epoch": 6.146118721461187, "grad_norm": 1.846853494644165, "kl": 0.115966796875, "learning_rate": 3.8538812785388123e-07, "loss": 0.0046, "reward": 1.7150331735610962, "reward_std": 0.19362305849790573, "rewards/accuracy_reward": 0.7306581139564514, "rewards/format_reward": 0.984375, "step": 1346 }, { "completion_length": 91.0390625, "epoch": 6.1506849315068495, "grad_norm": 1.0356731414794922, "kl": 0.11962890625, "learning_rate": 3.84931506849315e-07, "loss": 0.0048, "reward": 1.8451389074325562, "reward_std": 0.0736747458577156, "rewards/accuracy_reward": 0.852951318025589, "rewards/format_reward": 0.9921875, "step": 1347 }, { "completion_length": 69.703125, "epoch": 6.155251141552512, "grad_norm": 1.3840466737747192, "kl": 0.126953125, "learning_rate": 3.8447488584474886e-07, "loss": 0.0051, "reward": 1.8907551765441895, "reward_std": 0.06240340322256088, "rewards/accuracy_reward": 0.8907552063465118, "rewards/format_reward": 1.0, "step": 1348 }, { "completion_length": 90.1171875, "epoch": 6.159817351598173, "grad_norm": 1.7931628227233887, "kl": 0.091064453125, "learning_rate": 3.8401826484018264e-07, "loss": 0.0036, "reward": 1.792187511920929, "reward_std": 0.10482378304004669, "rewards/accuracy_reward": 0.7999999225139618, "rewards/format_reward": 0.9921875, "step": 1349 }, { "completion_length": 93.625, "epoch": 6.164383561643835, "grad_norm": 2.3507261276245117, "kl": 0.12060546875, "learning_rate": 3.835616438356164e-07, "loss": 0.0048, "reward": 1.7338955998420715, "reward_std": 0.16754615679383278, "rewards/accuracy_reward": 0.7573330104351044, "rewards/format_reward": 0.9765625, "step": 1350 }, { "completion_length": 62.859375, "epoch": 6.168949771689498, "grad_norm": 2.7926764488220215, "kl": 0.1259765625, "learning_rate": 3.831050228310502e-07, "loss": 0.005, "reward": 1.92203950881958, "reward_std": 0.05836756294593215, "rewards/accuracy_reward": 0.9220394492149353, "rewards/format_reward": 1.0, "step": 1351 }, { "completion_length": 72.953125, "epoch": 6.17351598173516, "grad_norm": 1.8602731227874756, "kl": 0.120849609375, "learning_rate": 3.82648401826484e-07, "loss": 0.0048, "reward": 1.8530598878860474, "reward_std": 0.07507604733109474, "rewards/accuracy_reward": 0.860872358083725, "rewards/format_reward": 0.9921875, "step": 1352 }, { "completion_length": 89.671875, "epoch": 6.178082191780822, "grad_norm": 1.9812475442886353, "kl": 0.107421875, "learning_rate": 3.821917808219178e-07, "loss": 0.0043, "reward": 1.8095996975898743, "reward_std": 0.07891392894089222, "rewards/accuracy_reward": 0.8095996379852295, "rewards/format_reward": 1.0, "step": 1353 }, { "completion_length": 84.40625, "epoch": 6.182648401826484, "grad_norm": 2.6219146251678467, "kl": 0.110107421875, "learning_rate": 3.817351598173516e-07, "loss": 0.0044, "reward": 1.7649739980697632, "reward_std": 0.13717181608080864, "rewards/accuracy_reward": 0.7805989384651184, "rewards/format_reward": 0.984375, "step": 1354 }, { "completion_length": 85.2265625, "epoch": 6.187214611872146, "grad_norm": 1.6997660398483276, "kl": 0.078857421875, "learning_rate": 3.8127853881278537e-07, "loss": 0.0031, "reward": 1.7958333492279053, "reward_std": 0.07365695387125015, "rewards/accuracy_reward": 0.7958332598209381, "rewards/format_reward": 1.0, "step": 1355 }, { "completion_length": 68.3828125, "epoch": 6.191780821917808, "grad_norm": 2.4636871814727783, "kl": 0.140380859375, "learning_rate": 3.8082191780821916e-07, "loss": 0.0056, "reward": 1.638918399810791, "reward_std": 0.1541249379515648, "rewards/accuracy_reward": 0.6467308402061462, "rewards/format_reward": 0.9921875, "step": 1356 }, { "completion_length": 96.0078125, "epoch": 6.19634703196347, "grad_norm": 2.066955804824829, "kl": 0.08056640625, "learning_rate": 3.8036529680365294e-07, "loss": 0.0032, "reward": 1.7703125476837158, "reward_std": 0.11048543080687523, "rewards/accuracy_reward": 0.7781248986721039, "rewards/format_reward": 0.9921875, "step": 1357 }, { "completion_length": 87.1640625, "epoch": 6.200913242009133, "grad_norm": 1.8385370969772339, "kl": 0.125, "learning_rate": 3.7990867579908673e-07, "loss": 0.005, "reward": 1.7934895753860474, "reward_std": 0.16230732202529907, "rewards/accuracy_reward": 0.809114545583725, "rewards/format_reward": 0.984375, "step": 1358 }, { "completion_length": 79.4140625, "epoch": 6.205479452054795, "grad_norm": 1.4888020753860474, "kl": 0.1396484375, "learning_rate": 3.794520547945205e-07, "loss": 0.0056, "reward": 1.8671875596046448, "reward_std": 0.08838834427297115, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 0.9921875, "step": 1359 }, { "completion_length": 81.5859375, "epoch": 6.210045662100456, "grad_norm": 3.295711040496826, "kl": 0.159423828125, "learning_rate": 3.789954337899543e-07, "loss": 0.0064, "reward": 1.7882669568061829, "reward_std": 0.14686734601855278, "rewards/accuracy_reward": 0.7960793972015381, "rewards/format_reward": 0.9921875, "step": 1360 }, { "completion_length": 70.3828125, "epoch": 6.2146118721461185, "grad_norm": 4.181000709533691, "kl": 0.1220703125, "learning_rate": 3.7853881278538814e-07, "loss": 0.0049, "reward": 1.6776198744773865, "reward_std": 0.20163647830486298, "rewards/accuracy_reward": 0.6854324042797089, "rewards/format_reward": 0.9921875, "step": 1361 }, { "completion_length": 74.453125, "epoch": 6.219178082191781, "grad_norm": 2.4070448875427246, "kl": 0.123779296875, "learning_rate": 3.780821917808219e-07, "loss": 0.0049, "reward": 1.810937523841858, "reward_std": 0.1541428193449974, "rewards/accuracy_reward": 0.8265624046325684, "rewards/format_reward": 0.984375, "step": 1362 }, { "completion_length": 73.96875, "epoch": 6.223744292237443, "grad_norm": 4.166317939758301, "kl": 0.16796875, "learning_rate": 3.7762557077625567e-07, "loss": 0.0067, "reward": 1.6138020753860474, "reward_std": 0.25969336926937103, "rewards/accuracy_reward": 0.6294271051883698, "rewards/format_reward": 0.984375, "step": 1363 }, { "completion_length": 74.0859375, "epoch": 6.228310502283105, "grad_norm": 9.133631706237793, "kl": 0.107421875, "learning_rate": 3.771689497716895e-07, "loss": 0.0043, "reward": 1.8240530490875244, "reward_std": 0.12234986200928688, "rewards/accuracy_reward": 0.8318654298782349, "rewards/format_reward": 0.9921875, "step": 1364 }, { "completion_length": 86.6328125, "epoch": 6.232876712328767, "grad_norm": 2.3225855827331543, "kl": 0.19580078125, "learning_rate": 3.767123287671233e-07, "loss": 0.0078, "reward": 1.7432359457015991, "reward_std": 0.20243436098098755, "rewards/accuracy_reward": 0.7666734158992767, "rewards/format_reward": 0.9765625, "step": 1365 }, { "completion_length": 75.84375, "epoch": 6.237442922374429, "grad_norm": 4.120067596435547, "kl": 0.1038818359375, "learning_rate": 3.7625570776255703e-07, "loss": 0.0042, "reward": 1.7382813096046448, "reward_std": 0.1538807973265648, "rewards/accuracy_reward": 0.74609375, "rewards/format_reward": 0.9921875, "step": 1366 }, { "completion_length": 71.75, "epoch": 6.242009132420091, "grad_norm": 2.138282537460327, "kl": 0.1630859375, "learning_rate": 3.757990867579908e-07, "loss": 0.0065, "reward": 1.7145547270774841, "reward_std": 0.18507731705904007, "rewards/accuracy_reward": 0.722367137670517, "rewards/format_reward": 0.9921875, "step": 1367 }, { "completion_length": 66.390625, "epoch": 6.2465753424657535, "grad_norm": 2.4669651985168457, "kl": 0.20654296875, "learning_rate": 3.7534246575342466e-07, "loss": 0.0083, "reward": 1.818272590637207, "reward_std": 0.11263852939009666, "rewards/accuracy_reward": 0.8260850310325623, "rewards/format_reward": 0.9921875, "step": 1368 }, { "completion_length": 100.6875, "epoch": 6.251141552511416, "grad_norm": 1.6667773723602295, "kl": 0.113037109375, "learning_rate": 3.7488584474885844e-07, "loss": 0.0045, "reward": 1.682812511920929, "reward_std": 0.14389308914542198, "rewards/accuracy_reward": 0.7062499523162842, "rewards/format_reward": 0.9765625, "step": 1369 }, { "completion_length": 58.890625, "epoch": 6.255707762557078, "grad_norm": 4.199071884155273, "kl": 0.122314453125, "learning_rate": 3.744292237442922e-07, "loss": 0.0049, "reward": 1.8065972328186035, "reward_std": 0.10062464885413647, "rewards/accuracy_reward": 0.8065972328186035, "rewards/format_reward": 1.0, "step": 1370 }, { "completion_length": 86.0625, "epoch": 6.260273972602739, "grad_norm": 4.296463966369629, "kl": 0.129150390625, "learning_rate": 3.73972602739726e-07, "loss": 0.0052, "reward": 1.7294270992279053, "reward_std": 0.1873747929930687, "rewards/accuracy_reward": 0.7372395694255829, "rewards/format_reward": 0.9921875, "step": 1371 }, { "completion_length": 70.609375, "epoch": 6.264840182648402, "grad_norm": 2.9196784496307373, "kl": 0.138671875, "learning_rate": 3.735159817351598e-07, "loss": 0.0055, "reward": 1.7803341746330261, "reward_std": 0.09776799008250237, "rewards/accuracy_reward": 0.7803341746330261, "rewards/format_reward": 1.0, "step": 1372 }, { "completion_length": 89.875, "epoch": 6.269406392694064, "grad_norm": 2.074902296066284, "kl": 0.11474609375, "learning_rate": 3.730593607305936e-07, "loss": 0.0046, "reward": 1.7255208492279053, "reward_std": 0.18392051756381989, "rewards/accuracy_reward": 0.7411458194255829, "rewards/format_reward": 0.984375, "step": 1373 }, { "completion_length": 75.140625, "epoch": 6.273972602739726, "grad_norm": 4.484436511993408, "kl": 0.156005859375, "learning_rate": 3.726027397260274e-07, "loss": 0.0062, "reward": 1.7850632667541504, "reward_std": 0.14479004591703415, "rewards/accuracy_reward": 0.792875736951828, "rewards/format_reward": 0.9921875, "step": 1374 }, { "completion_length": 92.6640625, "epoch": 6.2785388127853885, "grad_norm": 3.3195595741271973, "kl": 0.1070556640625, "learning_rate": 3.7214611872146117e-07, "loss": 0.0043, "reward": 1.80859375, "reward_std": 0.1387247210368514, "rewards/accuracy_reward": 0.8164062201976776, "rewards/format_reward": 0.9921875, "step": 1375 }, { "completion_length": 87.1875, "epoch": 6.28310502283105, "grad_norm": 4.265213966369629, "kl": 0.153564453125, "learning_rate": 3.7168949771689495e-07, "loss": 0.0061, "reward": 1.7935296893119812, "reward_std": 0.1823706291615963, "rewards/accuracy_reward": 0.8325920403003693, "rewards/format_reward": 0.9609375, "step": 1376 }, { "completion_length": 84.71875, "epoch": 6.287671232876712, "grad_norm": 1.7530524730682373, "kl": 0.115234375, "learning_rate": 3.7123287671232874e-07, "loss": 0.0046, "reward": 1.7466146349906921, "reward_std": 0.1300597358494997, "rewards/accuracy_reward": 0.7622395753860474, "rewards/format_reward": 0.984375, "step": 1377 }, { "completion_length": 92.078125, "epoch": 6.292237442922374, "grad_norm": 2.0412070751190186, "kl": 0.1259765625, "learning_rate": 3.7077625570776253e-07, "loss": 0.005, "reward": 1.8295753002166748, "reward_std": 0.17172623425722122, "rewards/accuracy_reward": 0.8608251810073853, "rewards/format_reward": 0.96875, "step": 1378 }, { "completion_length": 53.4375, "epoch": 6.296803652968037, "grad_norm": 2.757150888442993, "kl": 0.140625, "learning_rate": 3.703196347031963e-07, "loss": 0.0056, "reward": 1.701302170753479, "reward_std": 0.29356005787849426, "rewards/accuracy_reward": 0.7403645813465118, "rewards/format_reward": 0.9609375, "step": 1379 }, { "completion_length": 98.2109375, "epoch": 6.301369863013699, "grad_norm": 2.7139008045196533, "kl": 0.07958984375, "learning_rate": 3.698630136986301e-07, "loss": 0.0032, "reward": 1.783593773841858, "reward_std": 0.09553372114896774, "rewards/accuracy_reward": 0.7835937440395355, "rewards/format_reward": 1.0, "step": 1380 }, { "completion_length": 90.40625, "epoch": 6.30593607305936, "grad_norm": 3.6404201984405518, "kl": 0.105712890625, "learning_rate": 3.6940639269406394e-07, "loss": 0.0042, "reward": 1.746897280216217, "reward_std": 0.18012897670269012, "rewards/accuracy_reward": 0.7625222206115723, "rewards/format_reward": 0.984375, "step": 1381 }, { "completion_length": 70.71875, "epoch": 6.310502283105023, "grad_norm": 2.44050931930542, "kl": 0.1953125, "learning_rate": 3.689497716894977e-07, "loss": 0.0078, "reward": 1.7049107551574707, "reward_std": 0.14146586507558823, "rewards/accuracy_reward": 0.7127232253551483, "rewards/format_reward": 0.9921875, "step": 1382 }, { "completion_length": 84.0859375, "epoch": 6.315068493150685, "grad_norm": 2.908999443054199, "kl": 0.129150390625, "learning_rate": 3.6849315068493147e-07, "loss": 0.0052, "reward": 1.866501271724701, "reward_std": 0.14921535179018974, "rewards/accuracy_reward": 0.8743137121200562, "rewards/format_reward": 0.9921875, "step": 1383 }, { "completion_length": 66.9609375, "epoch": 6.319634703196347, "grad_norm": 7.363396644592285, "kl": 0.155517578125, "learning_rate": 3.680365296803653e-07, "loss": 0.0062, "reward": 1.73046875, "reward_std": 0.14824316650629044, "rewards/accuracy_reward": 0.7304687201976776, "rewards/format_reward": 1.0, "step": 1384 }, { "completion_length": 63.6875, "epoch": 6.324200913242009, "grad_norm": 2.700493097305298, "kl": 0.153564453125, "learning_rate": 3.675799086757991e-07, "loss": 0.0061, "reward": 1.7351128458976746, "reward_std": 0.1287803202867508, "rewards/accuracy_reward": 0.7429253458976746, "rewards/format_reward": 0.9921875, "step": 1385 }, { "completion_length": 77.515625, "epoch": 6.328767123287671, "grad_norm": 4.386509895324707, "kl": 0.140869140625, "learning_rate": 3.6712328767123283e-07, "loss": 0.0056, "reward": 1.6953125596046448, "reward_std": 0.20576493442058563, "rewards/accuracy_reward": 0.7187499701976776, "rewards/format_reward": 0.9765625, "step": 1386 }, { "completion_length": 69.765625, "epoch": 6.333333333333333, "grad_norm": 2.0167596340179443, "kl": 0.117431640625, "learning_rate": 3.666666666666666e-07, "loss": 0.0047, "reward": 1.7052951455116272, "reward_std": 0.12893803417682648, "rewards/accuracy_reward": 0.7052951157093048, "rewards/format_reward": 1.0, "step": 1387 }, { "completion_length": 90.6875, "epoch": 6.337899543378995, "grad_norm": 4.069055557250977, "kl": 0.095458984375, "learning_rate": 3.6621004566210046e-07, "loss": 0.0038, "reward": 1.753125011920929, "reward_std": 0.1462521031498909, "rewards/accuracy_reward": 0.7531249523162842, "rewards/format_reward": 1.0, "step": 1388 }, { "completion_length": 85.3671875, "epoch": 6.342465753424658, "grad_norm": 1.5775866508483887, "kl": 0.1103515625, "learning_rate": 3.6575342465753424e-07, "loss": 0.0044, "reward": 1.7893466353416443, "reward_std": 0.13156647235155106, "rewards/accuracy_reward": 0.8127840459346771, "rewards/format_reward": 0.9765625, "step": 1389 }, { "completion_length": 71.953125, "epoch": 6.34703196347032, "grad_norm": 4.919250965118408, "kl": 0.16259765625, "learning_rate": 3.65296803652968e-07, "loss": 0.0065, "reward": 1.816933274269104, "reward_std": 0.18086620420217514, "rewards/accuracy_reward": 0.816933274269104, "rewards/format_reward": 1.0, "step": 1390 }, { "completion_length": 75.359375, "epoch": 6.351598173515982, "grad_norm": 1.6152794361114502, "kl": 0.12841796875, "learning_rate": 3.648401826484018e-07, "loss": 0.0051, "reward": 1.8754695653915405, "reward_std": 0.05770116671919823, "rewards/accuracy_reward": 0.8754695355892181, "rewards/format_reward": 1.0, "step": 1391 }, { "completion_length": 97.0625, "epoch": 6.3561643835616435, "grad_norm": 3.0562081336975098, "kl": 0.0654296875, "learning_rate": 3.643835616438356e-07, "loss": 0.0026, "reward": 1.8640625476837158, "reward_std": 0.11048543266952038, "rewards/accuracy_reward": 0.8874999284744263, "rewards/format_reward": 0.9765625, "step": 1392 }, { "completion_length": 91.1015625, "epoch": 6.360730593607306, "grad_norm": 11.379284858703613, "kl": 0.123046875, "learning_rate": 3.639269406392694e-07, "loss": 0.0049, "reward": 1.7335938215255737, "reward_std": 0.13041038066148758, "rewards/accuracy_reward": 0.741406261920929, "rewards/format_reward": 0.9921875, "step": 1393 }, { "completion_length": 88.609375, "epoch": 6.365296803652968, "grad_norm": 2.2893126010894775, "kl": 0.120849609375, "learning_rate": 3.634703196347032e-07, "loss": 0.0048, "reward": 1.7554688453674316, "reward_std": 0.14954836666584015, "rewards/accuracy_reward": 0.7710936963558197, "rewards/format_reward": 0.984375, "step": 1394 }, { "completion_length": 86.0390625, "epoch": 6.36986301369863, "grad_norm": 2.2495548725128174, "kl": 0.10888671875, "learning_rate": 3.6301369863013697e-07, "loss": 0.0043, "reward": 1.8598958253860474, "reward_std": 0.10234630480408669, "rewards/accuracy_reward": 0.8677082657814026, "rewards/format_reward": 0.9921875, "step": 1395 }, { "completion_length": 75.4609375, "epoch": 6.3744292237442925, "grad_norm": 1.356154203414917, "kl": 0.095947265625, "learning_rate": 3.6255707762557075e-07, "loss": 0.0038, "reward": 1.6979167461395264, "reward_std": 0.11066011898219585, "rewards/accuracy_reward": 0.6979166567325592, "rewards/format_reward": 1.0, "step": 1396 }, { "completion_length": 76.453125, "epoch": 6.378995433789954, "grad_norm": 2.3890748023986816, "kl": 0.202880859375, "learning_rate": 3.6210045662100454e-07, "loss": 0.0081, "reward": 1.8653646111488342, "reward_std": 0.15285574202425778, "rewards/accuracy_reward": 0.8966145813465118, "rewards/format_reward": 0.96875, "step": 1397 }, { "completion_length": 66.3359375, "epoch": 6.383561643835616, "grad_norm": 1.8695800304412842, "kl": 0.1298828125, "learning_rate": 3.6164383561643833e-07, "loss": 0.0052, "reward": 1.8802322745323181, "reward_std": 0.046260682400316, "rewards/accuracy_reward": 0.8802323341369629, "rewards/format_reward": 1.0, "step": 1398 }, { "completion_length": 89.1796875, "epoch": 6.3881278538812785, "grad_norm": 1.6961324214935303, "kl": 0.103271484375, "learning_rate": 3.611872146118721e-07, "loss": 0.0041, "reward": 1.870312511920929, "reward_std": 0.10539799928665161, "rewards/accuracy_reward": 0.8781249523162842, "rewards/format_reward": 0.9921875, "step": 1399 }, { "completion_length": 90.0859375, "epoch": 6.392694063926941, "grad_norm": 8.425874710083008, "kl": 0.180419921875, "learning_rate": 3.607305936073059e-07, "loss": 0.0072, "reward": 1.6842634081840515, "reward_std": 0.16779197752475739, "rewards/accuracy_reward": 0.6842633485794067, "rewards/format_reward": 1.0, "step": 1400 } ], "logging_steps": 1.0, "max_steps": 2190, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }