diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,18234 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 6.392694063926941, + "eval_steps": 500, + "global_step": 1400, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "completion_length": 101.3359375, + "epoch": 0.0045662100456621, + "grad_norm": 6.199877738952637, + "kl": 0.0, + "learning_rate": 9.995433789954337e-07, + "loss": -0.0, + "reward": 0.6208333522081375, + "reward_std": 0.723351925611496, + "rewards/accuracy_reward": 0.2614583298563957, + "rewards/format_reward": 0.359375, + "step": 1 + }, + { + "completion_length": 105.796875, + "epoch": 0.0091324200913242, + "grad_norm": 13.76264476776123, + "kl": 0.0005550384521484375, + "learning_rate": 9.990867579908674e-07, + "loss": 0.0, + "reward": 1.1440449953079224, + "reward_std": 0.6297117471694946, + "rewards/accuracy_reward": 0.5346700549125671, + "rewards/format_reward": 0.609375, + "step": 2 + }, + { + "completion_length": 106.609375, + "epoch": 0.0136986301369863, + "grad_norm": 6.990738868713379, + "kl": 0.000965118408203125, + "learning_rate": 9.986301369863014e-07, + "loss": 0.0, + "reward": 0.9957855343818665, + "reward_std": 0.726005494594574, + "rewards/accuracy_reward": 0.45672303438186646, + "rewards/format_reward": 0.5390625, + "step": 3 + }, + { + "completion_length": 101.4140625, + "epoch": 0.0182648401826484, + "grad_norm": 4.9568257331848145, + "kl": 0.001346588134765625, + "learning_rate": 9.98173515981735e-07, + "loss": 0.0001, + "reward": 0.991195559501648, + "reward_std": 0.7322099208831787, + "rewards/accuracy_reward": 0.43650802969932556, + "rewards/format_reward": 0.5546875, + "step": 4 + }, + { + "completion_length": 106.4453125, + "epoch": 0.0228310502283105, + "grad_norm": 4.493799209594727, + "kl": 0.0035247802734375, + "learning_rate": 9.977168949771688e-07, + "loss": 0.0001, + "reward": 0.9764133095741272, + "reward_std": 0.6022496819496155, + "rewards/accuracy_reward": 0.421725794672966, + "rewards/format_reward": 0.5546875, + "step": 5 + }, + { + "completion_length": 125.046875, + "epoch": 0.0273972602739726, + "grad_norm": 3.48476505279541, + "kl": 0.00315093994140625, + "learning_rate": 9.972602739726028e-07, + "loss": 0.0001, + "reward": 1.162934124469757, + "reward_std": 0.5378114879131317, + "rewards/accuracy_reward": 0.49887165427207947, + "rewards/format_reward": 0.6640625, + "step": 6 + }, + { + "completion_length": 107.1484375, + "epoch": 0.0319634703196347, + "grad_norm": 3.6667165756225586, + "kl": 0.0055999755859375, + "learning_rate": 9.968036529680365e-07, + "loss": 0.0002, + "reward": 1.1229968070983887, + "reward_std": 0.6023176610469818, + "rewards/accuracy_reward": 0.4511217921972275, + "rewards/format_reward": 0.671875, + "step": 7 + }, + { + "completion_length": 76.890625, + "epoch": 0.0365296803652968, + "grad_norm": 4.247033596038818, + "kl": 0.017059326171875, + "learning_rate": 9.963470319634703e-07, + "loss": 0.0007, + "reward": 0.9820911884307861, + "reward_std": 0.6595480144023895, + "rewards/accuracy_reward": 0.31021616607904434, + "rewards/format_reward": 0.671875, + "step": 8 + }, + { + "completion_length": 99.359375, + "epoch": 0.0410958904109589, + "grad_norm": 3.2795424461364746, + "kl": 0.018402099609375, + "learning_rate": 9.95890410958904e-07, + "loss": 0.0007, + "reward": 1.3125749230384827, + "reward_std": 0.5887171626091003, + "rewards/accuracy_reward": 0.5235124826431274, + "rewards/format_reward": 0.7890625, + "step": 9 + }, + { + "completion_length": 68.734375, + "epoch": 0.045662100456621, + "grad_norm": 4.997896671295166, + "kl": 0.04248046875, + "learning_rate": 9.954337899543377e-07, + "loss": 0.0017, + "reward": 1.2461639046669006, + "reward_std": 0.527179092168808, + "rewards/accuracy_reward": 0.386788934469223, + "rewards/format_reward": 0.859375, + "step": 10 + }, + { + "completion_length": 98.9453125, + "epoch": 0.0502283105022831, + "grad_norm": 7.262642860412598, + "kl": 0.023773193359375, + "learning_rate": 9.949771689497717e-07, + "loss": 0.001, + "reward": 1.3204909563064575, + "reward_std": 0.4520634114742279, + "rewards/accuracy_reward": 0.46892838180065155, + "rewards/format_reward": 0.8515625, + "step": 11 + }, + { + "completion_length": 102.6953125, + "epoch": 0.0547945205479452, + "grad_norm": 2.358719825744629, + "kl": 0.032470703125, + "learning_rate": 9.945205479452054e-07, + "loss": 0.0013, + "reward": 1.3686427474021912, + "reward_std": 0.45033006370067596, + "rewards/accuracy_reward": 0.5405177175998688, + "rewards/format_reward": 0.828125, + "step": 12 + }, + { + "completion_length": 83.1171875, + "epoch": 0.0593607305936073, + "grad_norm": 3.457000255584717, + "kl": 0.044189453125, + "learning_rate": 9.940639269406391e-07, + "loss": 0.0018, + "reward": 1.3782268166542053, + "reward_std": 0.4138905107975006, + "rewards/accuracy_reward": 0.45635175704956055, + "rewards/format_reward": 0.921875, + "step": 13 + }, + { + "completion_length": 48.734375, + "epoch": 0.0639269406392694, + "grad_norm": 3.6976211071014404, + "kl": 0.056884765625, + "learning_rate": 9.93607305936073e-07, + "loss": 0.0023, + "reward": 1.340489685535431, + "reward_std": 0.37420646846294403, + "rewards/accuracy_reward": 0.4029896557331085, + "rewards/format_reward": 0.9375, + "step": 14 + }, + { + "completion_length": 79.2421875, + "epoch": 0.0684931506849315, + "grad_norm": 2.8622705936431885, + "kl": 0.043212890625, + "learning_rate": 9.931506849315068e-07, + "loss": 0.0017, + "reward": 1.2608134746551514, + "reward_std": 0.45702624320983887, + "rewards/accuracy_reward": 0.393625944852829, + "rewards/format_reward": 0.8671875, + "step": 15 + }, + { + "completion_length": 101.453125, + "epoch": 0.0730593607305936, + "grad_norm": 3.5660035610198975, + "kl": 0.034423828125, + "learning_rate": 9.926940639269406e-07, + "loss": 0.0014, + "reward": 1.4559400081634521, + "reward_std": 0.3906751722097397, + "rewards/accuracy_reward": 0.5418775081634521, + "rewards/format_reward": 0.9140625, + "step": 16 + }, + { + "completion_length": 58.125, + "epoch": 0.0776255707762557, + "grad_norm": 3.2858753204345703, + "kl": 0.052978515625, + "learning_rate": 9.922374429223745e-07, + "loss": 0.0021, + "reward": 1.3879406452178955, + "reward_std": 0.32537831366062164, + "rewards/accuracy_reward": 0.4113781452178955, + "rewards/format_reward": 0.9765625, + "step": 17 + }, + { + "completion_length": 77.3671875, + "epoch": 0.0821917808219178, + "grad_norm": 2.0785841941833496, + "kl": 0.0511474609375, + "learning_rate": 9.917808219178082e-07, + "loss": 0.002, + "reward": 1.5014740824699402, + "reward_std": 0.27246882766485214, + "rewards/accuracy_reward": 0.540536567568779, + "rewards/format_reward": 0.9609375, + "step": 18 + }, + { + "completion_length": 70.7890625, + "epoch": 0.0867579908675799, + "grad_norm": 3.1388633251190186, + "kl": 0.0772705078125, + "learning_rate": 9.91324200913242e-07, + "loss": 0.0031, + "reward": 1.4673261046409607, + "reward_std": 0.3659689426422119, + "rewards/accuracy_reward": 0.5142011493444443, + "rewards/format_reward": 0.953125, + "step": 19 + }, + { + "completion_length": 73.71875, + "epoch": 0.091324200913242, + "grad_norm": 3.4941606521606445, + "kl": 0.0772705078125, + "learning_rate": 9.908675799086757e-07, + "loss": 0.0031, + "reward": 1.2669085264205933, + "reward_std": 0.41668441891670227, + "rewards/accuracy_reward": 0.3919084817171097, + "rewards/format_reward": 0.875, + "step": 20 + }, + { + "completion_length": 86.0390625, + "epoch": 0.0958904109589041, + "grad_norm": 4.310213565826416, + "kl": 0.0654296875, + "learning_rate": 9.904109589041094e-07, + "loss": 0.0026, + "reward": 1.4882500767707825, + "reward_std": 0.30715326964855194, + "rewards/accuracy_reward": 0.5351250767707825, + "rewards/format_reward": 0.953125, + "step": 21 + }, + { + "completion_length": 71.8359375, + "epoch": 0.1004566210045662, + "grad_norm": 3.6013519763946533, + "kl": 0.09912109375, + "learning_rate": 9.899543378995434e-07, + "loss": 0.004, + "reward": 1.602288007736206, + "reward_std": 0.2894471287727356, + "rewards/accuracy_reward": 0.6257254481315613, + "rewards/format_reward": 0.9765625, + "step": 22 + }, + { + "completion_length": 80.5078125, + "epoch": 0.1050228310502283, + "grad_norm": 2.9229257106781006, + "kl": 0.07373046875, + "learning_rate": 9.894977168949771e-07, + "loss": 0.0029, + "reward": 1.5954613089561462, + "reward_std": 0.29795171320438385, + "rewards/accuracy_reward": 0.6267113089561462, + "rewards/format_reward": 0.96875, + "step": 23 + }, + { + "completion_length": 78.6953125, + "epoch": 0.1095890410958904, + "grad_norm": 3.3861594200134277, + "kl": 0.0640869140625, + "learning_rate": 9.89041095890411e-07, + "loss": 0.0026, + "reward": 1.54551100730896, + "reward_std": 0.2194862775504589, + "rewards/accuracy_reward": 0.5611358880996704, + "rewards/format_reward": 0.984375, + "step": 24 + }, + { + "completion_length": 73.4453125, + "epoch": 0.1141552511415525, + "grad_norm": 3.412217140197754, + "kl": 0.093994140625, + "learning_rate": 9.885844748858448e-07, + "loss": 0.0038, + "reward": 1.3831676244735718, + "reward_std": 0.3298991322517395, + "rewards/accuracy_reward": 0.4378551170229912, + "rewards/format_reward": 0.9453125, + "step": 25 + }, + { + "completion_length": 67.359375, + "epoch": 0.1187214611872146, + "grad_norm": 2.750807285308838, + "kl": 0.0947265625, + "learning_rate": 9.881278538812785e-07, + "loss": 0.0038, + "reward": 1.5090773701667786, + "reward_std": 0.30529043078422546, + "rewards/accuracy_reward": 0.5559523701667786, + "rewards/format_reward": 0.953125, + "step": 26 + }, + { + "completion_length": 65.1953125, + "epoch": 0.1232876712328767, + "grad_norm": 5.965710163116455, + "kl": 0.10595703125, + "learning_rate": 9.876712328767123e-07, + "loss": 0.0042, + "reward": 1.5281611680984497, + "reward_std": 0.3281934857368469, + "rewards/accuracy_reward": 0.5437861680984497, + "rewards/format_reward": 0.984375, + "step": 27 + }, + { + "completion_length": 76.8671875, + "epoch": 0.1278538812785388, + "grad_norm": 3.2638497352600098, + "kl": 0.07958984375, + "learning_rate": 9.87214611872146e-07, + "loss": 0.0032, + "reward": 1.5927692651748657, + "reward_std": 0.24901984632015228, + "rewards/accuracy_reward": 0.6318317353725433, + "rewards/format_reward": 0.9609375, + "step": 28 + }, + { + "completion_length": 66.0234375, + "epoch": 0.1324200913242009, + "grad_norm": 3.1384220123291016, + "kl": 0.106689453125, + "learning_rate": 9.867579908675797e-07, + "loss": 0.0043, + "reward": 1.5447565913200378, + "reward_std": 0.284378357231617, + "rewards/accuracy_reward": 0.5760065913200378, + "rewards/format_reward": 0.96875, + "step": 29 + }, + { + "completion_length": 56.90625, + "epoch": 0.136986301369863, + "grad_norm": 5.531259059906006, + "kl": 0.11572265625, + "learning_rate": 9.863013698630137e-07, + "loss": 0.0046, + "reward": 1.6361945867538452, + "reward_std": 0.24622034281492233, + "rewards/accuracy_reward": 0.6361946165561676, + "rewards/format_reward": 1.0, + "step": 30 + }, + { + "completion_length": 55.5703125, + "epoch": 0.1415525114155251, + "grad_norm": 3.289832353591919, + "kl": 0.10986328125, + "learning_rate": 9.858447488584474e-07, + "loss": 0.0044, + "reward": 1.4083333611488342, + "reward_std": 0.32854069769382477, + "rewards/accuracy_reward": 0.45520833134651184, + "rewards/format_reward": 0.953125, + "step": 31 + }, + { + "completion_length": 76.828125, + "epoch": 0.1461187214611872, + "grad_norm": 10.165680885314941, + "kl": 0.091064453125, + "learning_rate": 9.853881278538814e-07, + "loss": 0.0036, + "reward": 1.5438354015350342, + "reward_std": 0.23943377658724785, + "rewards/accuracy_reward": 0.5672729313373566, + "rewards/format_reward": 0.9765625, + "step": 32 + }, + { + "completion_length": 58.3515625, + "epoch": 0.1506849315068493, + "grad_norm": 5.475100517272949, + "kl": 0.108642578125, + "learning_rate": 9.84931506849315e-07, + "loss": 0.0043, + "reward": 1.6295759081840515, + "reward_std": 0.25405153632164, + "rewards/accuracy_reward": 0.6373884081840515, + "rewards/format_reward": 0.9921875, + "step": 33 + }, + { + "completion_length": 67.96875, + "epoch": 0.1552511415525114, + "grad_norm": 2.0600223541259766, + "kl": 0.085693359375, + "learning_rate": 9.844748858447488e-07, + "loss": 0.0034, + "reward": 1.5968750715255737, + "reward_std": 0.2177756428718567, + "rewards/accuracy_reward": 0.5968749821186066, + "rewards/format_reward": 1.0, + "step": 34 + }, + { + "completion_length": 55.0703125, + "epoch": 0.1598173515981735, + "grad_norm": 4.03507661819458, + "kl": 0.134033203125, + "learning_rate": 9.840182648401826e-07, + "loss": 0.0054, + "reward": 1.5210416316986084, + "reward_std": 0.262370266020298, + "rewards/accuracy_reward": 0.5288541465997696, + "rewards/format_reward": 0.9921875, + "step": 35 + }, + { + "completion_length": 69.890625, + "epoch": 0.1643835616438356, + "grad_norm": 2.3833439350128174, + "kl": 0.091796875, + "learning_rate": 9.835616438356163e-07, + "loss": 0.0037, + "reward": 1.6906250715255737, + "reward_std": 0.20069602131843567, + "rewards/accuracy_reward": 0.6906249821186066, + "rewards/format_reward": 1.0, + "step": 36 + }, + { + "completion_length": 67.2265625, + "epoch": 0.1689497716894977, + "grad_norm": 3.298387050628662, + "kl": 0.126220703125, + "learning_rate": 9.831050228310502e-07, + "loss": 0.0051, + "reward": 1.5833591222763062, + "reward_std": 0.23695440590381622, + "rewards/accuracy_reward": 0.5911716222763062, + "rewards/format_reward": 0.9921875, + "step": 37 + }, + { + "completion_length": 75.1875, + "epoch": 0.1735159817351598, + "grad_norm": 2.7184557914733887, + "kl": 0.126953125, + "learning_rate": 9.82648401826484e-07, + "loss": 0.0051, + "reward": 1.6548610925674438, + "reward_std": 0.18883602693676949, + "rewards/accuracy_reward": 0.6704860329627991, + "rewards/format_reward": 0.984375, + "step": 38 + }, + { + "completion_length": 64.015625, + "epoch": 0.1780821917808219, + "grad_norm": 3.2870519161224365, + "kl": 0.117431640625, + "learning_rate": 9.821917808219177e-07, + "loss": 0.0047, + "reward": 1.5351698398590088, + "reward_std": 0.267734594643116, + "rewards/accuracy_reward": 0.5664198100566864, + "rewards/format_reward": 0.96875, + "step": 39 + }, + { + "completion_length": 52.1171875, + "epoch": 0.182648401826484, + "grad_norm": 5.351809024810791, + "kl": 0.15234375, + "learning_rate": 9.817351598173517e-07, + "loss": 0.0061, + "reward": 1.443321943283081, + "reward_std": 0.32138869166374207, + "rewards/accuracy_reward": 0.44332198798656464, + "rewards/format_reward": 1.0, + "step": 40 + }, + { + "completion_length": 70.34375, + "epoch": 0.1872146118721461, + "grad_norm": 2.1224160194396973, + "kl": 0.126220703125, + "learning_rate": 9.812785388127854e-07, + "loss": 0.0051, + "reward": 1.761244773864746, + "reward_std": 0.13502541184425354, + "rewards/accuracy_reward": 0.7846822142601013, + "rewards/format_reward": 0.9765625, + "step": 41 + }, + { + "completion_length": 64.1796875, + "epoch": 0.1917808219178082, + "grad_norm": 3.5647408962249756, + "kl": 0.15234375, + "learning_rate": 9.808219178082191e-07, + "loss": 0.0061, + "reward": 1.4638384580612183, + "reward_std": 0.2666083872318268, + "rewards/accuracy_reward": 0.47946345806121826, + "rewards/format_reward": 0.984375, + "step": 42 + }, + { + "completion_length": 51.8515625, + "epoch": 0.1963470319634703, + "grad_norm": 9.191572189331055, + "kl": 0.15185546875, + "learning_rate": 9.803652968036529e-07, + "loss": 0.0061, + "reward": 1.5036438703536987, + "reward_std": 0.22346660494804382, + "rewards/accuracy_reward": 0.5114563703536987, + "rewards/format_reward": 0.9921875, + "step": 43 + }, + { + "completion_length": 67.1328125, + "epoch": 0.2009132420091324, + "grad_norm": 4.356919765472412, + "kl": 0.18994140625, + "learning_rate": 9.799086757990868e-07, + "loss": 0.0076, + "reward": 1.448957920074463, + "reward_std": 0.2659284919500351, + "rewards/accuracy_reward": 0.4645828604698181, + "rewards/format_reward": 0.984375, + "step": 44 + }, + { + "completion_length": 82.8203125, + "epoch": 0.2054794520547945, + "grad_norm": 4.919662952423096, + "kl": 0.074951171875, + "learning_rate": 9.794520547945205e-07, + "loss": 0.003, + "reward": 1.6562500596046448, + "reward_std": 0.1944543570280075, + "rewards/accuracy_reward": 0.671875, + "rewards/format_reward": 0.984375, + "step": 45 + }, + { + "completion_length": 79.3125, + "epoch": 0.2100456621004566, + "grad_norm": 3.381512403488159, + "kl": 0.0947265625, + "learning_rate": 9.789954337899543e-07, + "loss": 0.0038, + "reward": 1.5502474904060364, + "reward_std": 0.18939972668886185, + "rewards/accuracy_reward": 0.5502474904060364, + "rewards/format_reward": 1.0, + "step": 46 + }, + { + "completion_length": 63.5390625, + "epoch": 0.2146118721461187, + "grad_norm": 1.7941492795944214, + "kl": 0.101806640625, + "learning_rate": 9.78538812785388e-07, + "loss": 0.0041, + "reward": 1.634374976158142, + "reward_std": 0.13939146511256695, + "rewards/accuracy_reward": 0.6343749165534973, + "rewards/format_reward": 1.0, + "step": 47 + }, + { + "completion_length": 54.125, + "epoch": 0.2191780821917808, + "grad_norm": 3.9591901302337646, + "kl": 0.2177734375, + "learning_rate": 9.78082191780822e-07, + "loss": 0.0087, + "reward": 1.4007303714752197, + "reward_std": 0.33134153485298157, + "rewards/accuracy_reward": 0.4163554012775421, + "rewards/format_reward": 0.984375, + "step": 48 + }, + { + "completion_length": 31.8984375, + "epoch": 0.2237442922374429, + "grad_norm": 3.5492911338806152, + "kl": 0.15966796875, + "learning_rate": 9.776255707762557e-07, + "loss": 0.0064, + "reward": 1.44140625, + "reward_std": 0.32149194180965424, + "rewards/accuracy_reward": 0.44140625, + "rewards/format_reward": 1.0, + "step": 49 + }, + { + "completion_length": 59.5625, + "epoch": 0.228310502283105, + "grad_norm": 6.129611968994141, + "kl": 0.15771484375, + "learning_rate": 9.771689497716894e-07, + "loss": 0.0063, + "reward": 1.6152344346046448, + "reward_std": 0.17767168581485748, + "rewards/accuracy_reward": 0.623046875, + "rewards/format_reward": 0.9921875, + "step": 50 + }, + { + "completion_length": 51.4453125, + "epoch": 0.2328767123287671, + "grad_norm": 1.8558975458145142, + "kl": 0.1015625, + "learning_rate": 9.767123287671234e-07, + "loss": 0.0041, + "reward": 1.6468749642372131, + "reward_std": 0.16151440143585205, + "rewards/accuracy_reward": 0.6468749940395355, + "rewards/format_reward": 1.0, + "step": 51 + }, + { + "completion_length": 64.2734375, + "epoch": 0.2374429223744292, + "grad_norm": 1.8877390623092651, + "kl": 0.122802734375, + "learning_rate": 9.762557077625571e-07, + "loss": 0.0049, + "reward": 1.557812511920929, + "reward_std": 0.1784707009792328, + "rewards/accuracy_reward": 0.557812511920929, + "rewards/format_reward": 1.0, + "step": 52 + }, + { + "completion_length": 67.5625, + "epoch": 0.2420091324200913, + "grad_norm": 3.6787755489349365, + "kl": 0.111328125, + "learning_rate": 9.757990867579908e-07, + "loss": 0.0045, + "reward": 1.6884114742279053, + "reward_std": 0.17782026529312134, + "rewards/accuracy_reward": 0.6962239742279053, + "rewards/format_reward": 0.9921875, + "step": 53 + }, + { + "completion_length": 63.3671875, + "epoch": 0.2465753424657534, + "grad_norm": 2.3778576850891113, + "kl": 0.0980224609375, + "learning_rate": 9.753424657534246e-07, + "loss": 0.0039, + "reward": 1.6024739742279053, + "reward_std": 0.19950664788484573, + "rewards/accuracy_reward": 0.6102864444255829, + "rewards/format_reward": 0.9921875, + "step": 54 + }, + { + "completion_length": 78.6640625, + "epoch": 0.2511415525114155, + "grad_norm": 3.8972158432006836, + "kl": 0.113037109375, + "learning_rate": 9.748858447488583e-07, + "loss": 0.0045, + "reward": 1.591796875, + "reward_std": 0.17978167533874512, + "rewards/accuracy_reward": 0.5917968302965164, + "rewards/format_reward": 1.0, + "step": 55 + }, + { + "completion_length": 84.65625, + "epoch": 0.2557077625570776, + "grad_norm": 1.890934705734253, + "kl": 0.086181640625, + "learning_rate": 9.744292237442923e-07, + "loss": 0.0035, + "reward": 1.5904948115348816, + "reward_std": 0.1708931028842926, + "rewards/accuracy_reward": 0.5983072519302368, + "rewards/format_reward": 0.9921875, + "step": 56 + }, + { + "completion_length": 68.1015625, + "epoch": 0.2602739726027397, + "grad_norm": 2.091691017150879, + "kl": 0.110107421875, + "learning_rate": 9.73972602739726e-07, + "loss": 0.0044, + "reward": 1.590624988079071, + "reward_std": 0.21159524470567703, + "rewards/accuracy_reward": 0.6062500178813934, + "rewards/format_reward": 0.984375, + "step": 57 + }, + { + "completion_length": 58.8671875, + "epoch": 0.2648401826484018, + "grad_norm": 2.6050846576690674, + "kl": 0.10546875, + "learning_rate": 9.735159817351597e-07, + "loss": 0.0042, + "reward": 1.4429687857627869, + "reward_std": 0.3084883987903595, + "rewards/accuracy_reward": 0.4585937559604645, + "rewards/format_reward": 0.984375, + "step": 58 + }, + { + "completion_length": 57.8125, + "epoch": 0.2694063926940639, + "grad_norm": 6.616756439208984, + "kl": 0.104736328125, + "learning_rate": 9.730593607305937e-07, + "loss": 0.0042, + "reward": 1.4373698234558105, + "reward_std": 0.3443310409784317, + "rewards/accuracy_reward": 0.46861980855464935, + "rewards/format_reward": 0.96875, + "step": 59 + }, + { + "completion_length": 72.84375, + "epoch": 0.273972602739726, + "grad_norm": 2.8635833263397217, + "kl": 0.115478515625, + "learning_rate": 9.726027397260274e-07, + "loss": 0.0046, + "reward": 1.5471354722976685, + "reward_std": 0.25735440850257874, + "rewards/accuracy_reward": 0.5549479126930237, + "rewards/format_reward": 0.9921875, + "step": 60 + }, + { + "completion_length": 89.2578125, + "epoch": 0.2785388127853881, + "grad_norm": 2.5327181816101074, + "kl": 0.0791015625, + "learning_rate": 9.721461187214611e-07, + "loss": 0.0032, + "reward": 1.732812523841858, + "reward_std": 0.17859892547130585, + "rewards/accuracy_reward": 0.7484374642372131, + "rewards/format_reward": 0.984375, + "step": 61 + }, + { + "completion_length": 55.9453125, + "epoch": 0.2831050228310502, + "grad_norm": 2.024601697921753, + "kl": 0.15771484375, + "learning_rate": 9.716894977168949e-07, + "loss": 0.0063, + "reward": 1.6088541746139526, + "reward_std": 0.28504033386707306, + "rewards/accuracy_reward": 0.6401041746139526, + "rewards/format_reward": 0.96875, + "step": 62 + }, + { + "completion_length": 69.328125, + "epoch": 0.2876712328767123, + "grad_norm": 4.5766730308532715, + "kl": 0.111083984375, + "learning_rate": 9.712328767123286e-07, + "loss": 0.0044, + "reward": 1.7034826278686523, + "reward_std": 0.12682656943798065, + "rewards/accuracy_reward": 0.719107449054718, + "rewards/format_reward": 0.984375, + "step": 63 + }, + { + "completion_length": 78.0234375, + "epoch": 0.2922374429223744, + "grad_norm": 3.555784225463867, + "kl": 0.109375, + "learning_rate": 9.707762557077626e-07, + "loss": 0.0044, + "reward": 1.5473958253860474, + "reward_std": 0.23826827853918076, + "rewards/accuracy_reward": 0.5630208402872086, + "rewards/format_reward": 0.984375, + "step": 64 + }, + { + "completion_length": 46.015625, + "epoch": 0.2968036529680365, + "grad_norm": 5.032433032989502, + "kl": 0.17724609375, + "learning_rate": 9.703196347031963e-07, + "loss": 0.0071, + "reward": 1.401562511920929, + "reward_std": 0.2856694385409355, + "rewards/accuracy_reward": 0.40156251192092896, + "rewards/format_reward": 1.0, + "step": 65 + }, + { + "completion_length": 80.3203125, + "epoch": 0.3013698630136986, + "grad_norm": 3.9773030281066895, + "kl": 0.083740234375, + "learning_rate": 9.6986301369863e-07, + "loss": 0.0034, + "reward": 1.6140625476837158, + "reward_std": 0.1680549457669258, + "rewards/accuracy_reward": 0.6218749582767487, + "rewards/format_reward": 0.9921875, + "step": 66 + }, + { + "completion_length": 72.734375, + "epoch": 0.3059360730593607, + "grad_norm": 4.309236526489258, + "kl": 0.091796875, + "learning_rate": 9.69406392694064e-07, + "loss": 0.0037, + "reward": 1.4543966054916382, + "reward_std": 0.27315448969602585, + "rewards/accuracy_reward": 0.4778340458869934, + "rewards/format_reward": 0.9765625, + "step": 67 + }, + { + "completion_length": 48.328125, + "epoch": 0.3105022831050228, + "grad_norm": 2.2709150314331055, + "kl": 0.17724609375, + "learning_rate": 9.689497716894977e-07, + "loss": 0.0071, + "reward": 1.6068063974380493, + "reward_std": 0.247873917222023, + "rewards/accuracy_reward": 0.6380563378334045, + "rewards/format_reward": 0.96875, + "step": 68 + }, + { + "completion_length": 64.4921875, + "epoch": 0.3150684931506849, + "grad_norm": 2.840082883834839, + "kl": 0.12841796875, + "learning_rate": 9.684931506849314e-07, + "loss": 0.0051, + "reward": 1.65234375, + "reward_std": 0.1649593710899353, + "rewards/accuracy_reward": 0.6601562201976776, + "rewards/format_reward": 0.9921875, + "step": 69 + }, + { + "completion_length": 65.5, + "epoch": 0.319634703196347, + "grad_norm": 2.1552846431732178, + "kl": 0.14453125, + "learning_rate": 9.680365296803652e-07, + "loss": 0.0058, + "reward": 1.6424851417541504, + "reward_std": 0.14906217902898788, + "rewards/accuracy_reward": 0.642485111951828, + "rewards/format_reward": 1.0, + "step": 70 + }, + { + "completion_length": 66.7265625, + "epoch": 0.3242009132420091, + "grad_norm": 2.2692887783050537, + "kl": 0.1513671875, + "learning_rate": 9.675799086757991e-07, + "loss": 0.0061, + "reward": 1.689843773841858, + "reward_std": 0.17708637565374374, + "rewards/accuracy_reward": 0.6976562440395355, + "rewards/format_reward": 0.9921875, + "step": 71 + }, + { + "completion_length": 76.78125, + "epoch": 0.3287671232876712, + "grad_norm": 2.679795265197754, + "kl": 0.121337890625, + "learning_rate": 9.671232876712329e-07, + "loss": 0.0049, + "reward": 1.6899740099906921, + "reward_std": 0.1546846330165863, + "rewards/accuracy_reward": 0.6977863907814026, + "rewards/format_reward": 0.9921875, + "step": 72 + }, + { + "completion_length": 44.765625, + "epoch": 0.3333333333333333, + "grad_norm": 5.212102890014648, + "kl": 0.125, + "learning_rate": 9.666666666666666e-07, + "loss": 0.005, + "reward": 1.5778015851974487, + "reward_std": 0.2252170294523239, + "rewards/accuracy_reward": 0.5778016149997711, + "rewards/format_reward": 1.0, + "step": 73 + }, + { + "completion_length": 79.1953125, + "epoch": 0.3378995433789954, + "grad_norm": 2.121600389480591, + "kl": 0.102783203125, + "learning_rate": 9.662100456621003e-07, + "loss": 0.0041, + "reward": 1.4783854484558105, + "reward_std": 0.2087068259716034, + "rewards/accuracy_reward": 0.5018228888511658, + "rewards/format_reward": 0.9765625, + "step": 74 + }, + { + "completion_length": 67.703125, + "epoch": 0.3424657534246575, + "grad_norm": 3.5149497985839844, + "kl": 0.11279296875, + "learning_rate": 9.657534246575343e-07, + "loss": 0.0045, + "reward": 1.655573308467865, + "reward_std": 0.293765589594841, + "rewards/accuracy_reward": 0.6868232786655426, + "rewards/format_reward": 0.96875, + "step": 75 + }, + { + "completion_length": 64.078125, + "epoch": 0.3470319634703196, + "grad_norm": 3.8966798782348633, + "kl": 0.150634765625, + "learning_rate": 9.65296803652968e-07, + "loss": 0.006, + "reward": 1.686715006828308, + "reward_std": 0.2304065003991127, + "rewards/accuracy_reward": 0.7023399174213409, + "rewards/format_reward": 0.984375, + "step": 76 + }, + { + "completion_length": 62.2890625, + "epoch": 0.3515981735159817, + "grad_norm": 3.204103469848633, + "kl": 0.12451171875, + "learning_rate": 9.648401826484017e-07, + "loss": 0.005, + "reward": 1.4418154954910278, + "reward_std": 0.305108904838562, + "rewards/accuracy_reward": 0.5199404954910278, + "rewards/format_reward": 0.921875, + "step": 77 + }, + { + "completion_length": 56.8125, + "epoch": 0.3561643835616438, + "grad_norm": 35.58188247680664, + "kl": 0.1640625, + "learning_rate": 9.643835616438357e-07, + "loss": 0.0066, + "reward": 1.5536458492279053, + "reward_std": 0.34119510650634766, + "rewards/accuracy_reward": 0.6161458492279053, + "rewards/format_reward": 0.9375, + "step": 78 + }, + { + "completion_length": 79.78125, + "epoch": 0.3607305936073059, + "grad_norm": 2.3489811420440674, + "kl": 0.107421875, + "learning_rate": 9.639269406392694e-07, + "loss": 0.0043, + "reward": 1.5434371829032898, + "reward_std": 0.2838926613330841, + "rewards/accuracy_reward": 0.6059371829032898, + "rewards/format_reward": 0.9375, + "step": 79 + }, + { + "completion_length": 76.34375, + "epoch": 0.365296803652968, + "grad_norm": 3.45082426071167, + "kl": 0.1591796875, + "learning_rate": 9.634703196347032e-07, + "loss": 0.0064, + "reward": 1.510318636894226, + "reward_std": 0.3496459722518921, + "rewards/accuracy_reward": 0.5806310772895813, + "rewards/format_reward": 0.9296875, + "step": 80 + }, + { + "completion_length": 83.40625, + "epoch": 0.3698630136986301, + "grad_norm": 2.6958389282226562, + "kl": 0.13671875, + "learning_rate": 9.630136986301369e-07, + "loss": 0.0055, + "reward": 1.4917969107627869, + "reward_std": 0.29826872050762177, + "rewards/accuracy_reward": 0.5308593809604645, + "rewards/format_reward": 0.9609375, + "step": 81 + }, + { + "completion_length": 61.8984375, + "epoch": 0.3744292237442922, + "grad_norm": 9.85806941986084, + "kl": 0.15966796875, + "learning_rate": 9.625570776255706e-07, + "loss": 0.0064, + "reward": 1.2881510257720947, + "reward_std": 0.37539002299308777, + "rewards/accuracy_reward": 0.3506510257720947, + "rewards/format_reward": 0.9375, + "step": 82 + }, + { + "completion_length": 87.1640625, + "epoch": 0.3789954337899543, + "grad_norm": 1.7815834283828735, + "kl": 0.11572265625, + "learning_rate": 9.621004566210046e-07, + "loss": 0.0046, + "reward": 1.624854028224945, + "reward_std": 0.19279810786247253, + "rewards/accuracy_reward": 0.6482914686203003, + "rewards/format_reward": 0.9765625, + "step": 83 + }, + { + "completion_length": 72.078125, + "epoch": 0.3835616438356164, + "grad_norm": 3.1070339679718018, + "kl": 0.14013671875, + "learning_rate": 9.616438356164383e-07, + "loss": 0.0056, + "reward": 1.5429518222808838, + "reward_std": 0.3384602963924408, + "rewards/accuracy_reward": 0.5820143222808838, + "rewards/format_reward": 0.9609375, + "step": 84 + }, + { + "completion_length": 81.640625, + "epoch": 0.3881278538812785, + "grad_norm": 2.3276116847991943, + "kl": 0.14990234375, + "learning_rate": 9.61187214611872e-07, + "loss": 0.006, + "reward": 1.5467524528503418, + "reward_std": 0.23365992307662964, + "rewards/accuracy_reward": 0.570189893245697, + "rewards/format_reward": 0.9765625, + "step": 85 + }, + { + "completion_length": 72.9296875, + "epoch": 0.3926940639269406, + "grad_norm": 13.4576416015625, + "kl": 0.111572265625, + "learning_rate": 9.60730593607306e-07, + "loss": 0.0045, + "reward": 1.580468773841858, + "reward_std": 0.3186973035335541, + "rewards/accuracy_reward": 0.6195312440395355, + "rewards/format_reward": 0.9609375, + "step": 86 + }, + { + "completion_length": 76.4453125, + "epoch": 0.3972602739726027, + "grad_norm": 9.36266040802002, + "kl": 0.135498046875, + "learning_rate": 9.602739726027397e-07, + "loss": 0.0054, + "reward": 1.459251582622528, + "reward_std": 0.3123939037322998, + "rewards/accuracy_reward": 0.5061265677213669, + "rewards/format_reward": 0.953125, + "step": 87 + }, + { + "completion_length": 71.3828125, + "epoch": 0.4018264840182648, + "grad_norm": 3.115593671798706, + "kl": 0.110107421875, + "learning_rate": 9.598173515981735e-07, + "loss": 0.0044, + "reward": 1.513058066368103, + "reward_std": 0.2660725861787796, + "rewards/accuracy_reward": 0.5521205365657806, + "rewards/format_reward": 0.9609375, + "step": 88 + }, + { + "completion_length": 64.2109375, + "epoch": 0.4063926940639269, + "grad_norm": 3.5730538368225098, + "kl": 0.169921875, + "learning_rate": 9.593607305936072e-07, + "loss": 0.0068, + "reward": 1.4846374988555908, + "reward_std": 0.25958995521068573, + "rewards/accuracy_reward": 0.5080749839544296, + "rewards/format_reward": 0.9765625, + "step": 89 + }, + { + "completion_length": 95.4921875, + "epoch": 0.410958904109589, + "grad_norm": 2.466034173965454, + "kl": 0.11083984375, + "learning_rate": 9.58904109589041e-07, + "loss": 0.0044, + "reward": 1.6667287349700928, + "reward_std": 0.21123456954956055, + "rewards/accuracy_reward": 0.7057911455631256, + "rewards/format_reward": 0.9609375, + "step": 90 + }, + { + "completion_length": 68.859375, + "epoch": 0.4155251141552511, + "grad_norm": 2.8238866329193115, + "kl": 0.147216796875, + "learning_rate": 9.584474885844749e-07, + "loss": 0.0059, + "reward": 1.4025428891181946, + "reward_std": 0.2600523456931114, + "rewards/accuracy_reward": 0.4416054040193558, + "rewards/format_reward": 0.9609375, + "step": 91 + }, + { + "completion_length": 86.1484375, + "epoch": 0.4200913242009132, + "grad_norm": 3.0570385456085205, + "kl": 0.112548828125, + "learning_rate": 9.579908675799086e-07, + "loss": 0.0045, + "reward": 1.394381046295166, + "reward_std": 0.3288656920194626, + "rewards/accuracy_reward": 0.44125598669052124, + "rewards/format_reward": 0.953125, + "step": 92 + }, + { + "completion_length": 61.4609375, + "epoch": 0.4246575342465753, + "grad_norm": 3.7054836750030518, + "kl": 0.154541015625, + "learning_rate": 9.575342465753423e-07, + "loss": 0.0062, + "reward": 1.3927083611488342, + "reward_std": 0.3015543594956398, + "rewards/accuracy_reward": 0.40052083134651184, + "rewards/format_reward": 0.9921875, + "step": 93 + }, + { + "completion_length": 75.2578125, + "epoch": 0.4292237442922374, + "grad_norm": 2.8483920097351074, + "kl": 0.123046875, + "learning_rate": 9.570776255707763e-07, + "loss": 0.0049, + "reward": 1.5788614153862, + "reward_std": 0.26594626903533936, + "rewards/accuracy_reward": 0.6022988557815552, + "rewards/format_reward": 0.9765625, + "step": 94 + }, + { + "completion_length": 51.71875, + "epoch": 0.4337899543378995, + "grad_norm": 3.642658233642578, + "kl": 0.2294921875, + "learning_rate": 9.5662100456621e-07, + "loss": 0.0092, + "reward": 1.3690476417541504, + "reward_std": 0.32933689653873444, + "rewards/accuracy_reward": 0.400297611951828, + "rewards/format_reward": 0.96875, + "step": 95 + }, + { + "completion_length": 81.4765625, + "epoch": 0.4383561643835616, + "grad_norm": 2.5764758586883545, + "kl": 0.14013671875, + "learning_rate": 9.561643835616437e-07, + "loss": 0.0056, + "reward": 1.6177189946174622, + "reward_std": 0.20244715362787247, + "rewards/accuracy_reward": 0.6333439946174622, + "rewards/format_reward": 0.984375, + "step": 96 + }, + { + "completion_length": 64.3984375, + "epoch": 0.4429223744292237, + "grad_norm": 3.5117011070251465, + "kl": 0.18408203125, + "learning_rate": 9.557077625570777e-07, + "loss": 0.0074, + "reward": 1.3848958611488342, + "reward_std": 0.33436155319213867, + "rewards/accuracy_reward": 0.39270834624767303, + "rewards/format_reward": 0.9921875, + "step": 97 + }, + { + "completion_length": 75.0703125, + "epoch": 0.4474885844748858, + "grad_norm": 5.965494632720947, + "kl": 0.16650390625, + "learning_rate": 9.552511415525114e-07, + "loss": 0.0067, + "reward": 1.490625023841858, + "reward_std": 0.3021724224090576, + "rewards/accuracy_reward": 0.5140624940395355, + "rewards/format_reward": 0.9765625, + "step": 98 + }, + { + "completion_length": 81.328125, + "epoch": 0.4520547945205479, + "grad_norm": 5.425507545471191, + "kl": 0.12939453125, + "learning_rate": 9.547945205479452e-07, + "loss": 0.0052, + "reward": 1.489062488079071, + "reward_std": 0.30329059064388275, + "rewards/accuracy_reward": 0.5046875178813934, + "rewards/format_reward": 0.984375, + "step": 99 + }, + { + "completion_length": 85.65625, + "epoch": 0.45662100456621, + "grad_norm": 3.3313958644866943, + "kl": 0.124755859375, + "learning_rate": 9.54337899543379e-07, + "loss": 0.005, + "reward": 1.4551078081130981, + "reward_std": 0.2936365008354187, + "rewards/accuracy_reward": 0.5019826889038086, + "rewards/format_reward": 0.953125, + "step": 100 + }, + { + "completion_length": 101.9921875, + "epoch": 0.4611872146118721, + "grad_norm": 7.004181861877441, + "kl": 0.100830078125, + "learning_rate": 9.538812785388126e-07, + "loss": 0.004, + "reward": 1.8317708373069763, + "reward_std": 0.11421890184283257, + "rewards/accuracy_reward": 0.8395832479000092, + "rewards/format_reward": 0.9921875, + "step": 101 + }, + { + "completion_length": 65.2890625, + "epoch": 0.4657534246575342, + "grad_norm": 5.00878381729126, + "kl": 0.13720703125, + "learning_rate": 9.534246575342465e-07, + "loss": 0.0055, + "reward": 1.4119791984558105, + "reward_std": 0.3915850967168808, + "rewards/accuracy_reward": 0.45885418355464935, + "rewards/format_reward": 0.953125, + "step": 102 + }, + { + "completion_length": 66.1875, + "epoch": 0.4703196347031963, + "grad_norm": 33.22105026245117, + "kl": 0.1689453125, + "learning_rate": 9.529680365296803e-07, + "loss": 0.0068, + "reward": 1.5824455618858337, + "reward_std": 0.362550288438797, + "rewards/accuracy_reward": 0.5980705618858337, + "rewards/format_reward": 0.984375, + "step": 103 + }, + { + "completion_length": 94.6015625, + "epoch": 0.4748858447488584, + "grad_norm": 2.8027193546295166, + "kl": 0.100341796875, + "learning_rate": 9.525114155251142e-07, + "loss": 0.004, + "reward": 1.573001742362976, + "reward_std": 0.2286304533481598, + "rewards/accuracy_reward": 0.5886267125606537, + "rewards/format_reward": 0.984375, + "step": 104 + }, + { + "completion_length": 88.09375, + "epoch": 0.4794520547945205, + "grad_norm": 3.694000482559204, + "kl": 0.1162109375, + "learning_rate": 9.520547945205479e-07, + "loss": 0.0046, + "reward": 1.616619348526001, + "reward_std": 0.24134992063045502, + "rewards/accuracy_reward": 0.6244317889213562, + "rewards/format_reward": 0.9921875, + "step": 105 + }, + { + "completion_length": 97.0546875, + "epoch": 0.4840182648401826, + "grad_norm": 3.0992743968963623, + "kl": 0.1025390625, + "learning_rate": 9.515981735159817e-07, + "loss": 0.0041, + "reward": 1.6255208253860474, + "reward_std": 0.2588481456041336, + "rewards/accuracy_reward": 0.6489583253860474, + "rewards/format_reward": 0.9765625, + "step": 106 + }, + { + "completion_length": 76.7890625, + "epoch": 0.4885844748858447, + "grad_norm": 2.923036813735962, + "kl": 0.12255859375, + "learning_rate": 9.511415525114155e-07, + "loss": 0.0049, + "reward": 1.6026042103767395, + "reward_std": 0.23826471716165543, + "rewards/accuracy_reward": 0.6260416805744171, + "rewards/format_reward": 0.9765625, + "step": 107 + }, + { + "completion_length": 78.6796875, + "epoch": 0.4931506849315068, + "grad_norm": 2.8310043811798096, + "kl": 0.1357421875, + "learning_rate": 9.506849315068493e-07, + "loss": 0.0054, + "reward": 1.4074218273162842, + "reward_std": 0.3311483561992645, + "rewards/accuracy_reward": 0.44648435711860657, + "rewards/format_reward": 0.9609375, + "step": 108 + }, + { + "completion_length": 76.40625, + "epoch": 0.4977168949771689, + "grad_norm": 4.38238525390625, + "kl": 0.119873046875, + "learning_rate": 9.50228310502283e-07, + "loss": 0.0048, + "reward": 1.5411389470100403, + "reward_std": 0.22866196930408478, + "rewards/accuracy_reward": 0.5802014023065567, + "rewards/format_reward": 0.9609375, + "step": 109 + }, + { + "completion_length": 71.234375, + "epoch": 0.502283105022831, + "grad_norm": 5.190479755401611, + "kl": 0.18994140625, + "learning_rate": 9.497716894977168e-07, + "loss": 0.0076, + "reward": 1.5246779322624207, + "reward_std": 0.31538626551628113, + "rewards/accuracy_reward": 0.5637404024600983, + "rewards/format_reward": 0.9609375, + "step": 110 + }, + { + "completion_length": 63.21875, + "epoch": 0.5068493150684932, + "grad_norm": 7.178065776824951, + "kl": 0.111083984375, + "learning_rate": 9.493150684931507e-07, + "loss": 0.0044, + "reward": 1.421093761920929, + "reward_std": 0.3210388720035553, + "rewards/accuracy_reward": 0.42890626192092896, + "rewards/format_reward": 0.9921875, + "step": 111 + }, + { + "completion_length": 86.609375, + "epoch": 0.5114155251141552, + "grad_norm": 3.856398105621338, + "kl": 0.115478515625, + "learning_rate": 9.488584474885845e-07, + "loss": 0.0046, + "reward": 1.504786729812622, + "reward_std": 0.22957277297973633, + "rewards/accuracy_reward": 0.5125992149114609, + "rewards/format_reward": 0.9921875, + "step": 112 + }, + { + "completion_length": 77.28125, + "epoch": 0.5159817351598174, + "grad_norm": 8.988385200500488, + "kl": 0.15869140625, + "learning_rate": 9.484018264840182e-07, + "loss": 0.0064, + "reward": 1.4809608459472656, + "reward_std": 0.3224972039461136, + "rewards/accuracy_reward": 0.5043983459472656, + "rewards/format_reward": 0.9765625, + "step": 113 + }, + { + "completion_length": 84.90625, + "epoch": 0.5205479452054794, + "grad_norm": 4.3304219245910645, + "kl": 0.1025390625, + "learning_rate": 9.47945205479452e-07, + "loss": 0.0041, + "reward": 1.5533854961395264, + "reward_std": 0.2838037610054016, + "rewards/accuracy_reward": 0.5846354365348816, + "rewards/format_reward": 0.96875, + "step": 114 + }, + { + "completion_length": 94.1328125, + "epoch": 0.5251141552511416, + "grad_norm": 2.1755430698394775, + "kl": 0.11572265625, + "learning_rate": 9.474885844748858e-07, + "loss": 0.0046, + "reward": 1.6908555626869202, + "reward_std": 0.18422479182481766, + "rewards/accuracy_reward": 0.7064805030822754, + "rewards/format_reward": 0.984375, + "step": 115 + }, + { + "completion_length": 87.375, + "epoch": 0.5296803652968036, + "grad_norm": 2.527374029159546, + "kl": 0.09033203125, + "learning_rate": 9.470319634703196e-07, + "loss": 0.0036, + "reward": 1.4536417722702026, + "reward_std": 0.23572513461112976, + "rewards/accuracy_reward": 0.46926678717136383, + "rewards/format_reward": 0.984375, + "step": 116 + }, + { + "completion_length": 90.4140625, + "epoch": 0.5342465753424658, + "grad_norm": 2.9267947673797607, + "kl": 0.098388671875, + "learning_rate": 9.465753424657534e-07, + "loss": 0.0039, + "reward": 1.52434903383255, + "reward_std": 0.2611450105905533, + "rewards/accuracy_reward": 0.5477864444255829, + "rewards/format_reward": 0.9765625, + "step": 117 + }, + { + "completion_length": 108.2890625, + "epoch": 0.5388127853881278, + "grad_norm": 4.520658493041992, + "kl": 0.1044921875, + "learning_rate": 9.461187214611872e-07, + "loss": 0.0042, + "reward": 1.5627976655960083, + "reward_std": 0.2677070200443268, + "rewards/accuracy_reward": 0.6252975761890411, + "rewards/format_reward": 0.9375, + "step": 118 + }, + { + "completion_length": 74.1953125, + "epoch": 0.54337899543379, + "grad_norm": 4.915615558624268, + "kl": 0.1650390625, + "learning_rate": 9.45662100456621e-07, + "loss": 0.0066, + "reward": 1.5641247630119324, + "reward_std": 0.22196320444345474, + "rewards/accuracy_reward": 0.56412473320961, + "rewards/format_reward": 1.0, + "step": 119 + }, + { + "completion_length": 76.375, + "epoch": 0.547945205479452, + "grad_norm": 4.11935567855835, + "kl": 0.094482421875, + "learning_rate": 9.452054794520548e-07, + "loss": 0.0038, + "reward": 1.5418658256530762, + "reward_std": 0.2516227439045906, + "rewards/accuracy_reward": 0.5496782958507538, + "rewards/format_reward": 0.9921875, + "step": 120 + }, + { + "completion_length": 68.0859375, + "epoch": 0.5525114155251142, + "grad_norm": 3.256564140319824, + "kl": 0.14208984375, + "learning_rate": 9.447488584474885e-07, + "loss": 0.0057, + "reward": 1.520312488079071, + "reward_std": 0.2812953442335129, + "rewards/accuracy_reward": 0.5281250178813934, + "rewards/format_reward": 0.9921875, + "step": 121 + }, + { + "completion_length": 92.7109375, + "epoch": 0.5570776255707762, + "grad_norm": 2.9781503677368164, + "kl": 0.10546875, + "learning_rate": 9.442922374429223e-07, + "loss": 0.0042, + "reward": 1.6091517806053162, + "reward_std": 0.23979534208774567, + "rewards/accuracy_reward": 0.6404017508029938, + "rewards/format_reward": 0.96875, + "step": 122 + }, + { + "completion_length": 55.8984375, + "epoch": 0.5616438356164384, + "grad_norm": 4.986601829528809, + "kl": 0.146484375, + "learning_rate": 9.438356164383561e-07, + "loss": 0.0059, + "reward": 1.562890648841858, + "reward_std": 0.2664823532104492, + "rewards/accuracy_reward": 0.5707031190395355, + "rewards/format_reward": 0.9921875, + "step": 123 + }, + { + "completion_length": 66.03125, + "epoch": 0.5662100456621004, + "grad_norm": 5.077969551086426, + "kl": 0.140625, + "learning_rate": 9.4337899543379e-07, + "loss": 0.0056, + "reward": 1.5604352951049805, + "reward_std": 0.2387000024318695, + "rewards/accuracy_reward": 0.5604352653026581, + "rewards/format_reward": 1.0, + "step": 124 + }, + { + "completion_length": 75.796875, + "epoch": 0.5707762557077626, + "grad_norm": 2.614694356918335, + "kl": 0.124755859375, + "learning_rate": 9.429223744292237e-07, + "loss": 0.005, + "reward": 1.7544271349906921, + "reward_std": 0.13757340610027313, + "rewards/accuracy_reward": 0.7700520753860474, + "rewards/format_reward": 0.984375, + "step": 125 + }, + { + "completion_length": 63.4765625, + "epoch": 0.5753424657534246, + "grad_norm": 5.098128795623779, + "kl": 0.13330078125, + "learning_rate": 9.424657534246575e-07, + "loss": 0.0053, + "reward": 1.545653522014618, + "reward_std": 0.2823333144187927, + "rewards/accuracy_reward": 0.5612785220146179, + "rewards/format_reward": 0.984375, + "step": 126 + }, + { + "completion_length": 81.3046875, + "epoch": 0.5799086757990868, + "grad_norm": 2.6090292930603027, + "kl": 0.072021484375, + "learning_rate": 9.420091324200913e-07, + "loss": 0.0029, + "reward": 1.5642718076705933, + "reward_std": 0.16427360475063324, + "rewards/accuracy_reward": 0.5642717182636261, + "rewards/format_reward": 1.0, + "step": 127 + }, + { + "completion_length": 63.546875, + "epoch": 0.5844748858447488, + "grad_norm": 4.00266170501709, + "kl": 0.093505859375, + "learning_rate": 9.41552511415525e-07, + "loss": 0.0037, + "reward": 1.5677236318588257, + "reward_std": 0.30948029458522797, + "rewards/accuracy_reward": 0.5755361616611481, + "rewards/format_reward": 0.9921875, + "step": 128 + }, + { + "completion_length": 75.8359375, + "epoch": 0.589041095890411, + "grad_norm": 3.477132558822632, + "kl": 0.099609375, + "learning_rate": 9.410958904109588e-07, + "loss": 0.004, + "reward": 1.5374347567558289, + "reward_std": 0.24908459186553955, + "rewards/accuracy_reward": 0.5608722269535065, + "rewards/format_reward": 0.9765625, + "step": 129 + }, + { + "completion_length": 76.46875, + "epoch": 0.593607305936073, + "grad_norm": 2.5465404987335205, + "kl": 0.1416015625, + "learning_rate": 9.406392694063926e-07, + "loss": 0.0057, + "reward": 1.6419846415519714, + "reward_std": 0.1617630198597908, + "rewards/accuracy_reward": 0.6497970819473267, + "rewards/format_reward": 0.9921875, + "step": 130 + }, + { + "completion_length": 69.015625, + "epoch": 0.5981735159817352, + "grad_norm": 4.162069320678711, + "kl": 0.14794921875, + "learning_rate": 9.401826484018265e-07, + "loss": 0.0059, + "reward": 1.5735609531402588, + "reward_std": 0.24296356737613678, + "rewards/accuracy_reward": 0.5735608339309692, + "rewards/format_reward": 1.0, + "step": 131 + }, + { + "completion_length": 63.578125, + "epoch": 0.6027397260273972, + "grad_norm": 2.8231759071350098, + "kl": 0.158935546875, + "learning_rate": 9.397260273972603e-07, + "loss": 0.0064, + "reward": 1.6688734889030457, + "reward_std": 0.20343666523694992, + "rewards/accuracy_reward": 0.6766859591007233, + "rewards/format_reward": 0.9921875, + "step": 132 + }, + { + "completion_length": 58.921875, + "epoch": 0.6073059360730594, + "grad_norm": 3.4793238639831543, + "kl": 0.13818359375, + "learning_rate": 9.39269406392694e-07, + "loss": 0.0055, + "reward": 1.4002604484558105, + "reward_std": 0.2559265270829201, + "rewards/accuracy_reward": 0.40026040375232697, + "rewards/format_reward": 1.0, + "step": 133 + }, + { + "completion_length": 56.109375, + "epoch": 0.6118721461187214, + "grad_norm": 3.3541765213012695, + "kl": 0.14404296875, + "learning_rate": 9.388127853881278e-07, + "loss": 0.0058, + "reward": 1.470312476158142, + "reward_std": 0.3302301913499832, + "rewards/accuracy_reward": 0.4937499612569809, + "rewards/format_reward": 0.9765625, + "step": 134 + }, + { + "completion_length": 70.4609375, + "epoch": 0.6164383561643836, + "grad_norm": 2.545574903488159, + "kl": 0.104248046875, + "learning_rate": 9.383561643835616e-07, + "loss": 0.0042, + "reward": 1.6333333849906921, + "reward_std": 0.16338077187538147, + "rewards/accuracy_reward": 0.633333295583725, + "rewards/format_reward": 1.0, + "step": 135 + }, + { + "completion_length": 47.046875, + "epoch": 0.6210045662100456, + "grad_norm": 3.1955769062042236, + "kl": 0.102783203125, + "learning_rate": 9.378995433789953e-07, + "loss": 0.0041, + "reward": 1.459455132484436, + "reward_std": 0.3040963113307953, + "rewards/accuracy_reward": 0.47508013248443604, + "rewards/format_reward": 0.984375, + "step": 136 + }, + { + "completion_length": 67.234375, + "epoch": 0.6255707762557078, + "grad_norm": 4.687453269958496, + "kl": 0.13427734375, + "learning_rate": 9.374429223744292e-07, + "loss": 0.0054, + "reward": 1.600705862045288, + "reward_std": 0.20340244472026825, + "rewards/accuracy_reward": 0.6085183620452881, + "rewards/format_reward": 0.9921875, + "step": 137 + }, + { + "completion_length": 68.546875, + "epoch": 0.6301369863013698, + "grad_norm": 2.4881796836853027, + "kl": 0.13037109375, + "learning_rate": 9.36986301369863e-07, + "loss": 0.0052, + "reward": 1.5750191807746887, + "reward_std": 0.21577110141515732, + "rewards/accuracy_reward": 0.5906441807746887, + "rewards/format_reward": 0.984375, + "step": 138 + }, + { + "completion_length": 58.6171875, + "epoch": 0.634703196347032, + "grad_norm": 4.482968807220459, + "kl": 0.1357421875, + "learning_rate": 9.365296803652968e-07, + "loss": 0.0054, + "reward": 1.6022320985794067, + "reward_std": 0.25604283809661865, + "rewards/accuracy_reward": 0.6022321283817291, + "rewards/format_reward": 1.0, + "step": 139 + }, + { + "completion_length": 59.625, + "epoch": 0.639269406392694, + "grad_norm": 2.6462514400482178, + "kl": 0.144287109375, + "learning_rate": 9.360730593607306e-07, + "loss": 0.0058, + "reward": 1.6302083134651184, + "reward_std": 0.2878073900938034, + "rewards/accuracy_reward": 0.6458333432674408, + "rewards/format_reward": 0.984375, + "step": 140 + }, + { + "completion_length": 67.953125, + "epoch": 0.6438356164383562, + "grad_norm": 14.897021293640137, + "kl": 0.10888671875, + "learning_rate": 9.356164383561643e-07, + "loss": 0.0044, + "reward": 1.5768229961395264, + "reward_std": 0.26041025668382645, + "rewards/accuracy_reward": 0.5846354067325592, + "rewards/format_reward": 0.9921875, + "step": 141 + }, + { + "completion_length": 62.2421875, + "epoch": 0.6484018264840182, + "grad_norm": 2.723630428314209, + "kl": 0.140625, + "learning_rate": 9.351598173515981e-07, + "loss": 0.0056, + "reward": 1.5526549220085144, + "reward_std": 0.2013382911682129, + "rewards/accuracy_reward": 0.552654892206192, + "rewards/format_reward": 1.0, + "step": 142 + }, + { + "completion_length": 62.234375, + "epoch": 0.6529680365296804, + "grad_norm": 4.193840026855469, + "kl": 0.12744140625, + "learning_rate": 9.347031963470319e-07, + "loss": 0.0051, + "reward": 1.5169403553009033, + "reward_std": 0.3250080794095993, + "rewards/accuracy_reward": 0.5481902956962585, + "rewards/format_reward": 0.96875, + "step": 143 + }, + { + "completion_length": 54.5078125, + "epoch": 0.6575342465753424, + "grad_norm": 3.8824496269226074, + "kl": 0.1943359375, + "learning_rate": 9.342465753424658e-07, + "loss": 0.0078, + "reward": 1.5293877124786377, + "reward_std": 0.20680496841669083, + "rewards/accuracy_reward": 0.5293876230716705, + "rewards/format_reward": 1.0, + "step": 144 + }, + { + "completion_length": 70.8359375, + "epoch": 0.6621004566210046, + "grad_norm": 2.7752647399902344, + "kl": 0.109375, + "learning_rate": 9.337899543378995e-07, + "loss": 0.0044, + "reward": 1.6002604365348816, + "reward_std": 0.16751104593276978, + "rewards/accuracy_reward": 0.6002604067325592, + "rewards/format_reward": 1.0, + "step": 145 + }, + { + "completion_length": 43.4921875, + "epoch": 0.6666666666666666, + "grad_norm": 4.007904529571533, + "kl": 0.1455078125, + "learning_rate": 9.333333333333333e-07, + "loss": 0.0058, + "reward": 1.4145089387893677, + "reward_std": 0.37413105368614197, + "rewards/accuracy_reward": 0.4301339387893677, + "rewards/format_reward": 0.984375, + "step": 146 + }, + { + "completion_length": 55.015625, + "epoch": 0.6712328767123288, + "grad_norm": 2.844416618347168, + "kl": 0.14501953125, + "learning_rate": 9.328767123287671e-07, + "loss": 0.0058, + "reward": 1.434374988079071, + "reward_std": 0.2953774631023407, + "rewards/accuracy_reward": 0.44218750298023224, + "rewards/format_reward": 0.9921875, + "step": 147 + }, + { + "completion_length": 63.953125, + "epoch": 0.6757990867579908, + "grad_norm": 3.3027069568634033, + "kl": 0.11376953125, + "learning_rate": 9.324200913242009e-07, + "loss": 0.0045, + "reward": 1.5338541269302368, + "reward_std": 0.2085491269826889, + "rewards/accuracy_reward": 0.5494791865348816, + "rewards/format_reward": 0.984375, + "step": 148 + }, + { + "completion_length": 60.046875, + "epoch": 0.680365296803653, + "grad_norm": 2.4687771797180176, + "kl": 0.150146484375, + "learning_rate": 9.319634703196346e-07, + "loss": 0.006, + "reward": 1.4104894995689392, + "reward_std": 0.2745247557759285, + "rewards/accuracy_reward": 0.4104894697666168, + "rewards/format_reward": 1.0, + "step": 149 + }, + { + "completion_length": 70.234375, + "epoch": 0.684931506849315, + "grad_norm": 3.1265056133270264, + "kl": 0.112548828125, + "learning_rate": 9.315068493150684e-07, + "loss": 0.0045, + "reward": 1.5757813453674316, + "reward_std": 0.2977932393550873, + "rewards/accuracy_reward": 0.5914062261581421, + "rewards/format_reward": 0.984375, + "step": 150 + }, + { + "completion_length": 55.9609375, + "epoch": 0.6894977168949772, + "grad_norm": 2.7954444885253906, + "kl": 0.138916015625, + "learning_rate": 9.310502283105023e-07, + "loss": 0.0056, + "reward": 1.532336711883545, + "reward_std": 0.2634681910276413, + "rewards/accuracy_reward": 0.5557741522789001, + "rewards/format_reward": 0.9765625, + "step": 151 + }, + { + "completion_length": 91.7734375, + "epoch": 0.6940639269406392, + "grad_norm": 3.276801586151123, + "kl": 0.09521484375, + "learning_rate": 9.30593607305936e-07, + "loss": 0.0038, + "reward": 1.8068453073501587, + "reward_std": 0.13430681824684143, + "rewards/accuracy_reward": 0.8146576881408691, + "rewards/format_reward": 0.9921875, + "step": 152 + }, + { + "completion_length": 71.9453125, + "epoch": 0.6986301369863014, + "grad_norm": 5.107734680175781, + "kl": 0.126708984375, + "learning_rate": 9.301369863013698e-07, + "loss": 0.0051, + "reward": 1.5772569179534912, + "reward_std": 0.3007543087005615, + "rewards/accuracy_reward": 0.60069440305233, + "rewards/format_reward": 0.9765625, + "step": 153 + }, + { + "completion_length": 65.1015625, + "epoch": 0.7031963470319634, + "grad_norm": 11.66229248046875, + "kl": 0.142578125, + "learning_rate": 9.296803652968036e-07, + "loss": 0.0057, + "reward": 1.5702590942382812, + "reward_std": 0.2691568061709404, + "rewards/accuracy_reward": 0.5780715942382812, + "rewards/format_reward": 0.9921875, + "step": 154 + }, + { + "completion_length": 70.734375, + "epoch": 0.7077625570776256, + "grad_norm": 2.577918291091919, + "kl": 0.101318359375, + "learning_rate": 9.292237442922374e-07, + "loss": 0.0041, + "reward": 1.706770896911621, + "reward_std": 0.17131806910037994, + "rewards/accuracy_reward": 0.7145833075046539, + "rewards/format_reward": 0.9921875, + "step": 155 + }, + { + "completion_length": 60.515625, + "epoch": 0.7123287671232876, + "grad_norm": 3.4716637134552, + "kl": 0.119384765625, + "learning_rate": 9.287671232876712e-07, + "loss": 0.0048, + "reward": 1.534895896911621, + "reward_std": 0.2703789845108986, + "rewards/accuracy_reward": 0.5817708373069763, + "rewards/format_reward": 0.953125, + "step": 156 + }, + { + "completion_length": 56.15625, + "epoch": 0.7168949771689498, + "grad_norm": 3.562089204788208, + "kl": 0.119140625, + "learning_rate": 9.28310502283105e-07, + "loss": 0.0048, + "reward": 1.586718738079071, + "reward_std": 0.23555129766464233, + "rewards/accuracy_reward": 0.5867187678813934, + "rewards/format_reward": 1.0, + "step": 157 + }, + { + "completion_length": 78.2890625, + "epoch": 0.7214611872146118, + "grad_norm": 3.0020015239715576, + "kl": 0.124755859375, + "learning_rate": 9.278538812785388e-07, + "loss": 0.005, + "reward": 1.6171875596046448, + "reward_std": 0.29713982343673706, + "rewards/accuracy_reward": 0.65625, + "rewards/format_reward": 0.9609375, + "step": 158 + }, + { + "completion_length": 79.8203125, + "epoch": 0.726027397260274, + "grad_norm": 3.9287021160125732, + "kl": 0.089599609375, + "learning_rate": 9.273972602739726e-07, + "loss": 0.0036, + "reward": 1.5328125953674316, + "reward_std": 0.3052903413772583, + "rewards/accuracy_reward": 0.5953125059604645, + "rewards/format_reward": 0.9375, + "step": 159 + }, + { + "completion_length": 81.546875, + "epoch": 0.730593607305936, + "grad_norm": 4.853604793548584, + "kl": 0.142333984375, + "learning_rate": 9.269406392694063e-07, + "loss": 0.0057, + "reward": 1.6752576231956482, + "reward_std": 0.17081036418676376, + "rewards/accuracy_reward": 0.6830700039863586, + "rewards/format_reward": 0.9921875, + "step": 160 + }, + { + "completion_length": 62.828125, + "epoch": 0.7351598173515982, + "grad_norm": 6.2052903175354, + "kl": 0.14013671875, + "learning_rate": 9.264840182648401e-07, + "loss": 0.0056, + "reward": 1.4204427003860474, + "reward_std": 0.3042631521821022, + "rewards/accuracy_reward": 0.42825521528720856, + "rewards/format_reward": 0.9921875, + "step": 161 + }, + { + "completion_length": 113.703125, + "epoch": 0.7397260273972602, + "grad_norm": 0.730038583278656, + "kl": 0.066162109375, + "learning_rate": 9.260273972602739e-07, + "loss": 0.0026, + "reward": 1.8767098784446716, + "reward_std": 0.03975973278284073, + "rewards/accuracy_reward": 0.8845223784446716, + "rewards/format_reward": 0.9921875, + "step": 162 + }, + { + "completion_length": 77.8515625, + "epoch": 0.7442922374429224, + "grad_norm": 2.9642155170440674, + "kl": 0.10009765625, + "learning_rate": 9.255707762557077e-07, + "loss": 0.004, + "reward": 1.702742099761963, + "reward_std": 0.18021715432405472, + "rewards/accuracy_reward": 0.7105545401573181, + "rewards/format_reward": 0.9921875, + "step": 163 + }, + { + "completion_length": 73.984375, + "epoch": 0.7488584474885844, + "grad_norm": 3.072006940841675, + "kl": 0.13330078125, + "learning_rate": 9.251141552511416e-07, + "loss": 0.0053, + "reward": 1.768545389175415, + "reward_std": 0.1539178043603897, + "rewards/accuracy_reward": 0.768545389175415, + "rewards/format_reward": 1.0, + "step": 164 + }, + { + "completion_length": 65.953125, + "epoch": 0.7534246575342466, + "grad_norm": 5.146835803985596, + "kl": 0.12841796875, + "learning_rate": 9.246575342465753e-07, + "loss": 0.0051, + "reward": 1.503950297832489, + "reward_std": 0.3056825324892998, + "rewards/accuracy_reward": 0.5195753127336502, + "rewards/format_reward": 0.984375, + "step": 165 + }, + { + "completion_length": 91.4921875, + "epoch": 0.7579908675799086, + "grad_norm": 2.6764745712280273, + "kl": 0.1031494140625, + "learning_rate": 9.242009132420091e-07, + "loss": 0.0041, + "reward": 1.5589421391487122, + "reward_std": 0.23093140125274658, + "rewards/accuracy_reward": 0.5745670646429062, + "rewards/format_reward": 0.984375, + "step": 166 + }, + { + "completion_length": 81.7734375, + "epoch": 0.7625570776255708, + "grad_norm": 2.828564405441284, + "kl": 0.12109375, + "learning_rate": 9.237442922374429e-07, + "loss": 0.0048, + "reward": 1.5663008093833923, + "reward_std": 0.20618148148059845, + "rewards/accuracy_reward": 0.5975507497787476, + "rewards/format_reward": 0.96875, + "step": 167 + }, + { + "completion_length": 76.296875, + "epoch": 0.7671232876712328, + "grad_norm": 2.9903767108917236, + "kl": 0.1298828125, + "learning_rate": 9.232876712328766e-07, + "loss": 0.0052, + "reward": 1.645312488079071, + "reward_std": 0.2312253788113594, + "rewards/accuracy_reward": 0.6531249731779099, + "rewards/format_reward": 0.9921875, + "step": 168 + }, + { + "completion_length": 77.71875, + "epoch": 0.771689497716895, + "grad_norm": 2.3729984760284424, + "kl": 0.103759765625, + "learning_rate": 9.228310502283104e-07, + "loss": 0.0042, + "reward": 1.5843749642372131, + "reward_std": 0.26058951020240784, + "rewards/accuracy_reward": 0.6000000238418579, + "rewards/format_reward": 0.984375, + "step": 169 + }, + { + "completion_length": 109.0234375, + "epoch": 0.776255707762557, + "grad_norm": 2.769318103790283, + "kl": 0.076416015625, + "learning_rate": 9.223744292237442e-07, + "loss": 0.0031, + "reward": 1.7934027910232544, + "reward_std": 0.09004722535610199, + "rewards/accuracy_reward": 0.801215261220932, + "rewards/format_reward": 0.9921875, + "step": 170 + }, + { + "completion_length": 75.8203125, + "epoch": 0.7808219178082192, + "grad_norm": 2.6014890670776367, + "kl": 0.131103515625, + "learning_rate": 9.219178082191781e-07, + "loss": 0.0052, + "reward": 1.5569196343421936, + "reward_std": 0.26341303437948227, + "rewards/accuracy_reward": 0.580357164144516, + "rewards/format_reward": 0.9765625, + "step": 171 + }, + { + "completion_length": 97.9375, + "epoch": 0.7853881278538812, + "grad_norm": 2.9145772457122803, + "kl": 0.091064453125, + "learning_rate": 9.214611872146119e-07, + "loss": 0.0036, + "reward": 1.6800222992897034, + "reward_std": 0.15495410561561584, + "rewards/accuracy_reward": 0.6956472992897034, + "rewards/format_reward": 0.984375, + "step": 172 + }, + { + "completion_length": 82.0234375, + "epoch": 0.7899543378995434, + "grad_norm": 5.809459686279297, + "kl": 0.27392578125, + "learning_rate": 9.210045662100456e-07, + "loss": 0.0109, + "reward": 1.489843726158142, + "reward_std": 0.22895930707454681, + "rewards/accuracy_reward": 0.4898437559604645, + "rewards/format_reward": 1.0, + "step": 173 + }, + { + "completion_length": 76.0859375, + "epoch": 0.7945205479452054, + "grad_norm": 4.242368698120117, + "kl": 0.111572265625, + "learning_rate": 9.205479452054794e-07, + "loss": 0.0045, + "reward": 1.4959192276000977, + "reward_std": 0.2452109009027481, + "rewards/accuracy_reward": 0.5037316679954529, + "rewards/format_reward": 0.9921875, + "step": 174 + }, + { + "completion_length": 72.4921875, + "epoch": 0.7990867579908676, + "grad_norm": 4.817114353179932, + "kl": 0.12060546875, + "learning_rate": 9.200913242009132e-07, + "loss": 0.0048, + "reward": 1.607180118560791, + "reward_std": 0.2259521484375, + "rewards/accuracy_reward": 0.6149925291538239, + "rewards/format_reward": 0.9921875, + "step": 175 + }, + { + "completion_length": 73.3828125, + "epoch": 0.8036529680365296, + "grad_norm": 5.523159027099609, + "kl": 0.11669921875, + "learning_rate": 9.196347031963469e-07, + "loss": 0.0047, + "reward": 1.4424473643302917, + "reward_std": 0.35005757212638855, + "rewards/accuracy_reward": 0.46588489413261414, + "rewards/format_reward": 0.9765625, + "step": 176 + }, + { + "completion_length": 83.7109375, + "epoch": 0.8082191780821918, + "grad_norm": 6.852357387542725, + "kl": 0.101318359375, + "learning_rate": 9.191780821917808e-07, + "loss": 0.0041, + "reward": 1.7020823955535889, + "reward_std": 0.1837347373366356, + "rewards/accuracy_reward": 0.7020823657512665, + "rewards/format_reward": 1.0, + "step": 177 + }, + { + "completion_length": 95.171875, + "epoch": 0.8127853881278538, + "grad_norm": 3.1507697105407715, + "kl": 0.0908203125, + "learning_rate": 9.187214611872146e-07, + "loss": 0.0036, + "reward": 1.6673295497894287, + "reward_std": 0.14267417788505554, + "rewards/accuracy_reward": 0.6673295497894287, + "rewards/format_reward": 1.0, + "step": 178 + }, + { + "completion_length": 110.1640625, + "epoch": 0.817351598173516, + "grad_norm": 2.0009899139404297, + "kl": 0.0653076171875, + "learning_rate": 9.182648401826484e-07, + "loss": 0.0026, + "reward": 1.7578993439674377, + "reward_std": 0.12590338289737701, + "rewards/accuracy_reward": 0.765711784362793, + "rewards/format_reward": 0.9921875, + "step": 179 + }, + { + "completion_length": 83.59375, + "epoch": 0.821917808219178, + "grad_norm": 2.3591296672821045, + "kl": 0.09765625, + "learning_rate": 9.178082191780822e-07, + "loss": 0.0039, + "reward": 1.72604501247406, + "reward_std": 0.1832278147339821, + "rewards/accuracy_reward": 0.7338575124740601, + "rewards/format_reward": 0.9921875, + "step": 180 + }, + { + "completion_length": 90.8125, + "epoch": 0.8264840182648402, + "grad_norm": 4.365949630737305, + "kl": 0.1068115234375, + "learning_rate": 9.173515981735159e-07, + "loss": 0.0043, + "reward": 1.7241073250770569, + "reward_std": 0.1286808280274272, + "rewards/accuracy_reward": 0.7241072654724121, + "rewards/format_reward": 1.0, + "step": 181 + }, + { + "completion_length": 70.1015625, + "epoch": 0.8310502283105022, + "grad_norm": 4.564199924468994, + "kl": 0.1298828125, + "learning_rate": 9.168949771689497e-07, + "loss": 0.0052, + "reward": 1.4811267852783203, + "reward_std": 0.23117025196552277, + "rewards/accuracy_reward": 0.4967518150806427, + "rewards/format_reward": 0.984375, + "step": 182 + }, + { + "completion_length": 70.0859375, + "epoch": 0.8356164383561644, + "grad_norm": 8.132929801940918, + "kl": 0.15234375, + "learning_rate": 9.164383561643835e-07, + "loss": 0.0061, + "reward": 1.513009488582611, + "reward_std": 0.31318600475788116, + "rewards/accuracy_reward": 0.5208219289779663, + "rewards/format_reward": 0.9921875, + "step": 183 + }, + { + "completion_length": 87.390625, + "epoch": 0.8401826484018264, + "grad_norm": 2.4152746200561523, + "kl": 0.086669921875, + "learning_rate": 9.159817351598174e-07, + "loss": 0.0035, + "reward": 1.6757907271385193, + "reward_std": 0.16663195937871933, + "rewards/accuracy_reward": 0.6836032569408417, + "rewards/format_reward": 0.9921875, + "step": 184 + }, + { + "completion_length": 71.40625, + "epoch": 0.8447488584474886, + "grad_norm": 3.8499457836151123, + "kl": 0.14111328125, + "learning_rate": 9.155251141552511e-07, + "loss": 0.0057, + "reward": 1.418749988079071, + "reward_std": 0.27360689640045166, + "rewards/accuracy_reward": 0.42656251788139343, + "rewards/format_reward": 0.9921875, + "step": 185 + }, + { + "completion_length": 100.109375, + "epoch": 0.8493150684931506, + "grad_norm": 1.6684777736663818, + "kl": 0.103271484375, + "learning_rate": 9.150684931506849e-07, + "loss": 0.0041, + "reward": 1.820052146911621, + "reward_std": 0.09526955150067806, + "rewards/accuracy_reward": 0.8278645575046539, + "rewards/format_reward": 0.9921875, + "step": 186 + }, + { + "completion_length": 110.109375, + "epoch": 0.8538812785388128, + "grad_norm": 2.6076531410217285, + "kl": 0.0618896484375, + "learning_rate": 9.146118721461187e-07, + "loss": 0.0025, + "reward": 1.6446732878684998, + "reward_std": 0.1990083083510399, + "rewards/accuracy_reward": 0.6602982878684998, + "rewards/format_reward": 0.984375, + "step": 187 + }, + { + "completion_length": 73.921875, + "epoch": 0.8584474885844748, + "grad_norm": 3.0167715549468994, + "kl": 0.129150390625, + "learning_rate": 9.141552511415525e-07, + "loss": 0.0052, + "reward": 1.4889777302742004, + "reward_std": 0.310699999332428, + "rewards/accuracy_reward": 0.520227700471878, + "rewards/format_reward": 0.96875, + "step": 188 + }, + { + "completion_length": 77.5625, + "epoch": 0.863013698630137, + "grad_norm": 5.573866844177246, + "kl": 0.142578125, + "learning_rate": 9.136986301369862e-07, + "loss": 0.0057, + "reward": 1.5417782664299011, + "reward_std": 0.21660251170396805, + "rewards/accuracy_reward": 0.5417782664299011, + "rewards/format_reward": 1.0, + "step": 189 + }, + { + "completion_length": 98.015625, + "epoch": 0.867579908675799, + "grad_norm": 3.7342417240142822, + "kl": 0.12890625, + "learning_rate": 9.1324200913242e-07, + "loss": 0.0051, + "reward": 1.5212674140930176, + "reward_std": 0.3198610842227936, + "rewards/accuracy_reward": 0.5681423544883728, + "rewards/format_reward": 0.953125, + "step": 190 + }, + { + "completion_length": 87.9609375, + "epoch": 0.8721461187214612, + "grad_norm": 5.50727653503418, + "kl": 0.09033203125, + "learning_rate": 9.127853881278539e-07, + "loss": 0.0036, + "reward": 1.4584822058677673, + "reward_std": 0.2924446016550064, + "rewards/accuracy_reward": 0.47410714626312256, + "rewards/format_reward": 0.984375, + "step": 191 + }, + { + "completion_length": 86.59375, + "epoch": 0.8767123287671232, + "grad_norm": 4.182096004486084, + "kl": 0.110595703125, + "learning_rate": 9.123287671232876e-07, + "loss": 0.0044, + "reward": 1.4973958134651184, + "reward_std": 0.27411043643951416, + "rewards/accuracy_reward": 0.5130208432674408, + "rewards/format_reward": 0.984375, + "step": 192 + }, + { + "completion_length": 83.3359375, + "epoch": 0.8812785388127854, + "grad_norm": 3.033630609512329, + "kl": 0.1015625, + "learning_rate": 9.118721461187214e-07, + "loss": 0.0041, + "reward": 1.6757813096046448, + "reward_std": 0.18561583012342453, + "rewards/accuracy_reward": 0.6835937201976776, + "rewards/format_reward": 0.9921875, + "step": 193 + }, + { + "completion_length": 86.953125, + "epoch": 0.8858447488584474, + "grad_norm": 2.5366592407226562, + "kl": 0.0810546875, + "learning_rate": 9.114155251141552e-07, + "loss": 0.0032, + "reward": 1.605208396911621, + "reward_std": 0.2767683416604996, + "rewards/accuracy_reward": 0.6286458075046539, + "rewards/format_reward": 0.9765625, + "step": 194 + }, + { + "completion_length": 87.75, + "epoch": 0.8904109589041096, + "grad_norm": 2.646970748901367, + "kl": 0.121826171875, + "learning_rate": 9.10958904109589e-07, + "loss": 0.0049, + "reward": 1.6739211678504944, + "reward_std": 0.22777877748012543, + "rewards/accuracy_reward": 0.6817336082458496, + "rewards/format_reward": 0.9921875, + "step": 195 + }, + { + "completion_length": 85.375, + "epoch": 0.8949771689497716, + "grad_norm": 4.142679214477539, + "kl": 0.116455078125, + "learning_rate": 9.105022831050228e-07, + "loss": 0.0047, + "reward": 1.5367559790611267, + "reward_std": 0.22284159809350967, + "rewards/accuracy_reward": 0.5367559492588043, + "rewards/format_reward": 1.0, + "step": 196 + }, + { + "completion_length": 73.3359375, + "epoch": 0.8995433789954338, + "grad_norm": 98.02293395996094, + "kl": 0.123779296875, + "learning_rate": 9.100456621004566e-07, + "loss": 0.005, + "reward": 1.613690435886383, + "reward_std": 0.3185143321752548, + "rewards/accuracy_reward": 0.6293154358863831, + "rewards/format_reward": 0.984375, + "step": 197 + }, + { + "completion_length": 68.6796875, + "epoch": 0.9041095890410958, + "grad_norm": 3.6209559440612793, + "kl": 0.13037109375, + "learning_rate": 9.095890410958904e-07, + "loss": 0.0052, + "reward": 1.440234363079071, + "reward_std": 0.31120626628398895, + "rewards/accuracy_reward": 0.47148437798023224, + "rewards/format_reward": 0.96875, + "step": 198 + }, + { + "completion_length": 79.1953125, + "epoch": 0.908675799086758, + "grad_norm": 5.003497123718262, + "kl": 0.1123046875, + "learning_rate": 9.091324200913242e-07, + "loss": 0.0045, + "reward": 1.648740530014038, + "reward_std": 0.1829584613442421, + "rewards/accuracy_reward": 0.6487405002117157, + "rewards/format_reward": 1.0, + "step": 199 + }, + { + "completion_length": 102.6171875, + "epoch": 0.91324200913242, + "grad_norm": 2.219266891479492, + "kl": 0.0859375, + "learning_rate": 9.08675799086758e-07, + "loss": 0.0034, + "reward": 1.7168915271759033, + "reward_std": 0.12932297587394714, + "rewards/accuracy_reward": 0.7247040569782257, + "rewards/format_reward": 0.9921875, + "step": 200 + }, + { + "completion_length": 72.140625, + "epoch": 0.9178082191780822, + "grad_norm": 3.0909345149993896, + "kl": 0.1162109375, + "learning_rate": 9.082191780821917e-07, + "loss": 0.0046, + "reward": 1.4611505270004272, + "reward_std": 0.3023676201701164, + "rewards/accuracy_reward": 0.4924005717039108, + "rewards/format_reward": 0.96875, + "step": 201 + }, + { + "completion_length": 94.140625, + "epoch": 0.9223744292237442, + "grad_norm": 4.803802967071533, + "kl": 0.103759765625, + "learning_rate": 9.077625570776255e-07, + "loss": 0.0042, + "reward": 1.6710938215255737, + "reward_std": 0.18428679555654526, + "rewards/accuracy_reward": 0.6789062321186066, + "rewards/format_reward": 0.9921875, + "step": 202 + }, + { + "completion_length": 86.5234375, + "epoch": 0.9269406392694064, + "grad_norm": 4.511474609375, + "kl": 0.078125, + "learning_rate": 9.073059360730593e-07, + "loss": 0.0031, + "reward": 1.6398438215255737, + "reward_std": 0.17492686957120895, + "rewards/accuracy_reward": 0.6632812321186066, + "rewards/format_reward": 0.9765625, + "step": 203 + }, + { + "completion_length": 77.4296875, + "epoch": 0.9315068493150684, + "grad_norm": 3.1800436973571777, + "kl": 0.1025390625, + "learning_rate": 9.068493150684932e-07, + "loss": 0.0041, + "reward": 1.5895833373069763, + "reward_std": 0.2831973433494568, + "rewards/accuracy_reward": 0.6130208373069763, + "rewards/format_reward": 0.9765625, + "step": 204 + }, + { + "completion_length": 78.21875, + "epoch": 0.9360730593607306, + "grad_norm": 2.723695993423462, + "kl": 0.110107421875, + "learning_rate": 9.063926940639269e-07, + "loss": 0.0044, + "reward": 1.508962869644165, + "reward_std": 0.293969988822937, + "rewards/accuracy_reward": 0.5324003100395203, + "rewards/format_reward": 0.9765625, + "step": 205 + }, + { + "completion_length": 71.1953125, + "epoch": 0.9406392694063926, + "grad_norm": 4.026218414306641, + "kl": 0.125244140625, + "learning_rate": 9.059360730593607e-07, + "loss": 0.005, + "reward": 1.475000023841858, + "reward_std": 0.31207825243473053, + "rewards/accuracy_reward": 0.4828125238418579, + "rewards/format_reward": 0.9921875, + "step": 206 + }, + { + "completion_length": 91.5546875, + "epoch": 0.9452054794520548, + "grad_norm": 3.48559832572937, + "kl": 0.078125, + "learning_rate": 9.054794520547945e-07, + "loss": 0.0031, + "reward": 1.5565290451049805, + "reward_std": 0.17772940546274185, + "rewards/accuracy_reward": 0.5721540153026581, + "rewards/format_reward": 0.984375, + "step": 207 + }, + { + "completion_length": 80.015625, + "epoch": 0.9497716894977168, + "grad_norm": 2.530951499938965, + "kl": 0.103759765625, + "learning_rate": 9.050228310502282e-07, + "loss": 0.0042, + "reward": 1.7421875, + "reward_std": 0.17887144908308983, + "rewards/accuracy_reward": 0.7499999403953552, + "rewards/format_reward": 0.9921875, + "step": 208 + }, + { + "completion_length": 85.671875, + "epoch": 0.954337899543379, + "grad_norm": 3.1076207160949707, + "kl": 0.084228515625, + "learning_rate": 9.04566210045662e-07, + "loss": 0.0034, + "reward": 1.6094618439674377, + "reward_std": 0.16175774857401848, + "rewards/accuracy_reward": 0.617274284362793, + "rewards/format_reward": 0.9921875, + "step": 209 + }, + { + "completion_length": 84.546875, + "epoch": 0.958904109589041, + "grad_norm": 2.6074228286743164, + "kl": 0.076171875, + "learning_rate": 9.041095890410958e-07, + "loss": 0.003, + "reward": 1.5931640267372131, + "reward_std": 0.18021905422210693, + "rewards/accuracy_reward": 0.5931640565395355, + "rewards/format_reward": 1.0, + "step": 210 + }, + { + "completion_length": 74.015625, + "epoch": 0.9634703196347032, + "grad_norm": 3.9831788539886475, + "kl": 0.087890625, + "learning_rate": 9.036529680365297e-07, + "loss": 0.0035, + "reward": 1.5644965171813965, + "reward_std": 0.20533857494592667, + "rewards/accuracy_reward": 0.5644965171813965, + "rewards/format_reward": 1.0, + "step": 211 + }, + { + "completion_length": 60.9296875, + "epoch": 0.9680365296803652, + "grad_norm": 2.8256754875183105, + "kl": 0.10986328125, + "learning_rate": 9.031963470319635e-07, + "loss": 0.0044, + "reward": 1.5828006267547607, + "reward_std": 0.2151413932442665, + "rewards/accuracy_reward": 0.5906131863594055, + "rewards/format_reward": 0.9921875, + "step": 212 + }, + { + "completion_length": 88.8515625, + "epoch": 0.9726027397260274, + "grad_norm": 2.615966796875, + "kl": 0.113037109375, + "learning_rate": 9.027397260273972e-07, + "loss": 0.0045, + "reward": 1.5743975639343262, + "reward_std": 0.24481570720672607, + "rewards/accuracy_reward": 0.5822100639343262, + "rewards/format_reward": 0.9921875, + "step": 213 + }, + { + "completion_length": 66.3359375, + "epoch": 0.9771689497716894, + "grad_norm": 11.469498634338379, + "kl": 0.106201171875, + "learning_rate": 9.02283105022831e-07, + "loss": 0.0042, + "reward": 1.7420889139175415, + "reward_std": 0.13549309968948364, + "rewards/accuracy_reward": 0.7420888245105743, + "rewards/format_reward": 1.0, + "step": 214 + }, + { + "completion_length": 74.0390625, + "epoch": 0.9817351598173516, + "grad_norm": 3.873460292816162, + "kl": 0.0693359375, + "learning_rate": 9.018264840182648e-07, + "loss": 0.0028, + "reward": 1.6421875953674316, + "reward_std": 0.2022070661187172, + "rewards/accuracy_reward": 0.6421874761581421, + "rewards/format_reward": 1.0, + "step": 215 + }, + { + "completion_length": 93.375, + "epoch": 0.9863013698630136, + "grad_norm": 2.612290143966675, + "kl": 0.09765625, + "learning_rate": 9.013698630136985e-07, + "loss": 0.0039, + "reward": 1.645312488079071, + "reward_std": 0.25715554505586624, + "rewards/accuracy_reward": 0.676562488079071, + "rewards/format_reward": 0.96875, + "step": 216 + }, + { + "completion_length": 83.8515625, + "epoch": 0.9908675799086758, + "grad_norm": 1.9043883085250854, + "kl": 0.077880859375, + "learning_rate": 9.009132420091324e-07, + "loss": 0.0031, + "reward": 1.6007813215255737, + "reward_std": 0.2224196195602417, + "rewards/accuracy_reward": 0.6242187321186066, + "rewards/format_reward": 0.9765625, + "step": 217 + }, + { + "completion_length": 70.2421875, + "epoch": 0.9954337899543378, + "grad_norm": 9.268891334533691, + "kl": 0.114013671875, + "learning_rate": 9.004566210045662e-07, + "loss": 0.0046, + "reward": 1.487464964389801, + "reward_std": 0.3045327961444855, + "rewards/accuracy_reward": 0.510902464389801, + "rewards/format_reward": 0.9765625, + "step": 218 + }, + { + "completion_length": 138.875, + "epoch": 1.0, + "grad_norm": 2.398301839828491, + "kl": 0.1318359375, + "learning_rate": 9e-07, + "loss": 0.0041, + "reward": 1.899999976158142, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.8999999761581421, + "rewards/format_reward": 1.0, + "step": 219 + }, + { + "completion_length": 70.625, + "epoch": 1.004566210045662, + "grad_norm": 3.741870880126953, + "kl": 0.12158203125, + "learning_rate": 8.995433789954338e-07, + "loss": 0.0049, + "reward": 1.50390625, + "reward_std": 0.26016832888126373, + "rewards/accuracy_reward": 0.5039062350988388, + "rewards/format_reward": 1.0, + "step": 220 + }, + { + "completion_length": 64.859375, + "epoch": 1.009132420091324, + "grad_norm": 11.450385093688965, + "kl": 0.133056640625, + "learning_rate": 8.990867579908675e-07, + "loss": 0.0053, + "reward": 1.7419270873069763, + "reward_std": 0.11560981348156929, + "rewards/accuracy_reward": 0.7419269979000092, + "rewards/format_reward": 1.0, + "step": 221 + }, + { + "completion_length": 82.046875, + "epoch": 1.0136986301369864, + "grad_norm": 3.4675490856170654, + "kl": 0.08935546875, + "learning_rate": 8.986301369863013e-07, + "loss": 0.0036, + "reward": 1.6703497171401978, + "reward_std": 0.2566395699977875, + "rewards/accuracy_reward": 0.6859746873378754, + "rewards/format_reward": 0.984375, + "step": 222 + }, + { + "completion_length": 88.4375, + "epoch": 1.0182648401826484, + "grad_norm": 2.4552440643310547, + "kl": 0.11328125, + "learning_rate": 8.981735159817351e-07, + "loss": 0.0045, + "reward": 1.646279752254486, + "reward_std": 0.1442876234650612, + "rewards/accuracy_reward": 0.6462797522544861, + "rewards/format_reward": 1.0, + "step": 223 + }, + { + "completion_length": 60.6171875, + "epoch": 1.0228310502283104, + "grad_norm": 2.843845844268799, + "kl": 0.144287109375, + "learning_rate": 8.97716894977169e-07, + "loss": 0.0058, + "reward": 1.5498643517494202, + "reward_std": 0.2656385153532028, + "rewards/accuracy_reward": 0.5498644113540649, + "rewards/format_reward": 1.0, + "step": 224 + }, + { + "completion_length": 59.640625, + "epoch": 1.0273972602739727, + "grad_norm": 6.659507751464844, + "kl": 0.26123046875, + "learning_rate": 8.972602739726027e-07, + "loss": 0.0105, + "reward": 1.522736370563507, + "reward_std": 0.23698078095912933, + "rewards/accuracy_reward": 0.5383614003658295, + "rewards/format_reward": 0.984375, + "step": 225 + }, + { + "completion_length": 76.2578125, + "epoch": 1.0319634703196348, + "grad_norm": 7.1479926109313965, + "kl": 0.11474609375, + "learning_rate": 8.968036529680365e-07, + "loss": 0.0046, + "reward": 1.5441184043884277, + "reward_std": 0.1862540915608406, + "rewards/accuracy_reward": 0.5519307851791382, + "rewards/format_reward": 0.9921875, + "step": 226 + }, + { + "completion_length": 52.7109375, + "epoch": 1.0365296803652968, + "grad_norm": 5.714673042297363, + "kl": 0.17578125, + "learning_rate": 8.963470319634703e-07, + "loss": 0.007, + "reward": 1.4614583253860474, + "reward_std": 0.3212399482727051, + "rewards/accuracy_reward": 0.47708331048488617, + "rewards/format_reward": 0.984375, + "step": 227 + }, + { + "completion_length": 59.234375, + "epoch": 1.0410958904109588, + "grad_norm": 2.191619396209717, + "kl": 0.143798828125, + "learning_rate": 8.958904109589041e-07, + "loss": 0.0058, + "reward": 1.6684895753860474, + "reward_std": 0.1687004156410694, + "rewards/accuracy_reward": 0.6763020753860474, + "rewards/format_reward": 0.9921875, + "step": 228 + }, + { + "completion_length": 64.375, + "epoch": 1.045662100456621, + "grad_norm": 7.204253196716309, + "kl": 0.1484375, + "learning_rate": 8.954337899543378e-07, + "loss": 0.0059, + "reward": 1.6486244797706604, + "reward_std": 0.20006748288869858, + "rewards/accuracy_reward": 0.6486244797706604, + "rewards/format_reward": 1.0, + "step": 229 + }, + { + "completion_length": 68.6796875, + "epoch": 1.0502283105022832, + "grad_norm": 34.476558685302734, + "kl": 0.112060546875, + "learning_rate": 8.949771689497716e-07, + "loss": 0.0045, + "reward": 1.484375, + "reward_std": 0.2940969169139862, + "rewards/accuracy_reward": 0.5078124701976776, + "rewards/format_reward": 0.9765625, + "step": 230 + }, + { + "completion_length": 60.7421875, + "epoch": 1.0547945205479452, + "grad_norm": 1.8852620124816895, + "kl": 0.15234375, + "learning_rate": 8.945205479452055e-07, + "loss": 0.0061, + "reward": 1.641055941581726, + "reward_std": 0.19024673104286194, + "rewards/accuracy_reward": 0.6488684415817261, + "rewards/format_reward": 0.9921875, + "step": 231 + }, + { + "completion_length": 57.875, + "epoch": 1.0593607305936072, + "grad_norm": 5.363572597503662, + "kl": 0.19677734375, + "learning_rate": 8.940639269406392e-07, + "loss": 0.0079, + "reward": 1.5763392448425293, + "reward_std": 0.3014761507511139, + "rewards/accuracy_reward": 0.5997768044471741, + "rewards/format_reward": 0.9765625, + "step": 232 + }, + { + "completion_length": 58.1484375, + "epoch": 1.0639269406392695, + "grad_norm": 2.532191276550293, + "kl": 0.22900390625, + "learning_rate": 8.93607305936073e-07, + "loss": 0.0092, + "reward": 1.5628709197044373, + "reward_std": 0.1907019466161728, + "rewards/accuracy_reward": 0.5706833899021149, + "rewards/format_reward": 0.9921875, + "step": 233 + }, + { + "completion_length": 66.8046875, + "epoch": 1.0684931506849316, + "grad_norm": 5.099847793579102, + "kl": 0.16259765625, + "learning_rate": 8.931506849315068e-07, + "loss": 0.0065, + "reward": 1.6207798719406128, + "reward_std": 0.2609640061855316, + "rewards/accuracy_reward": 0.659842312335968, + "rewards/format_reward": 0.9609375, + "step": 234 + }, + { + "completion_length": 60.0078125, + "epoch": 1.0730593607305936, + "grad_norm": 4.480944633483887, + "kl": 0.22509765625, + "learning_rate": 8.926940639269406e-07, + "loss": 0.009, + "reward": 1.514843761920929, + "reward_std": 0.3193802535533905, + "rewards/accuracy_reward": 0.5460937023162842, + "rewards/format_reward": 0.96875, + "step": 235 + }, + { + "completion_length": 59.7109375, + "epoch": 1.0776255707762556, + "grad_norm": 3.332169532775879, + "kl": 0.203125, + "learning_rate": 8.922374429223744e-07, + "loss": 0.0081, + "reward": 1.5903645753860474, + "reward_std": 0.2678108364343643, + "rewards/accuracy_reward": 0.6216145753860474, + "rewards/format_reward": 0.96875, + "step": 236 + }, + { + "completion_length": 60.46875, + "epoch": 1.0821917808219177, + "grad_norm": 2.0340123176574707, + "kl": 0.14794921875, + "learning_rate": 8.917808219178081e-07, + "loss": 0.0059, + "reward": 1.5674479603767395, + "reward_std": 0.2526354044675827, + "rewards/accuracy_reward": 0.5752603709697723, + "rewards/format_reward": 0.9921875, + "step": 237 + }, + { + "completion_length": 57.0859375, + "epoch": 1.08675799086758, + "grad_norm": 2.9721603393554688, + "kl": 0.18408203125, + "learning_rate": 8.91324200913242e-07, + "loss": 0.0074, + "reward": 1.5576340556144714, + "reward_std": 0.17624534666538239, + "rewards/accuracy_reward": 0.5576339960098267, + "rewards/format_reward": 1.0, + "step": 238 + }, + { + "completion_length": 90.328125, + "epoch": 1.091324200913242, + "grad_norm": 2.5770187377929688, + "kl": 0.18798828125, + "learning_rate": 8.908675799086758e-07, + "loss": 0.0075, + "reward": 1.7246136665344238, + "reward_std": 0.16096369177103043, + "rewards/accuracy_reward": 0.7480511367321014, + "rewards/format_reward": 0.9765625, + "step": 239 + }, + { + "completion_length": 42.3828125, + "epoch": 1.095890410958904, + "grad_norm": 1.3982386589050293, + "kl": 0.20263671875, + "learning_rate": 8.904109589041095e-07, + "loss": 0.0081, + "reward": 1.5574405193328857, + "reward_std": 0.1670553982257843, + "rewards/accuracy_reward": 0.5574404895305634, + "rewards/format_reward": 1.0, + "step": 240 + }, + { + "completion_length": 71.734375, + "epoch": 1.1004566210045663, + "grad_norm": 3.7010498046875, + "kl": 0.16162109375, + "learning_rate": 8.899543378995433e-07, + "loss": 0.0065, + "reward": 1.8125000596046448, + "reward_std": 0.12704820185899734, + "rewards/accuracy_reward": 0.8124999701976776, + "rewards/format_reward": 1.0, + "step": 241 + }, + { + "completion_length": 65.9609375, + "epoch": 1.1050228310502284, + "grad_norm": 2.443540096282959, + "kl": 0.1474609375, + "learning_rate": 8.894977168949771e-07, + "loss": 0.0059, + "reward": 1.6270038485527039, + "reward_std": 0.2314547374844551, + "rewards/accuracy_reward": 0.6426288187503815, + "rewards/format_reward": 0.984375, + "step": 242 + }, + { + "completion_length": 63.46875, + "epoch": 1.1095890410958904, + "grad_norm": 2.410088062286377, + "kl": 0.1611328125, + "learning_rate": 8.890410958904109e-07, + "loss": 0.0064, + "reward": 1.5, + "reward_std": 0.2109457477927208, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 243 + }, + { + "completion_length": 61.3671875, + "epoch": 1.1141552511415524, + "grad_norm": 3.672989845275879, + "kl": 0.21826171875, + "learning_rate": 8.885844748858448e-07, + "loss": 0.0087, + "reward": 1.6392003893852234, + "reward_std": 0.21675443649291992, + "rewards/accuracy_reward": 0.6626378297805786, + "rewards/format_reward": 0.9765625, + "step": 244 + }, + { + "completion_length": 67.1171875, + "epoch": 1.1187214611872145, + "grad_norm": 2.5747616291046143, + "kl": 0.1923828125, + "learning_rate": 8.881278538812785e-07, + "loss": 0.0077, + "reward": 1.5857422351837158, + "reward_std": 0.18916229903697968, + "rewards/accuracy_reward": 0.5935547053813934, + "rewards/format_reward": 0.9921875, + "step": 245 + }, + { + "completion_length": 51.1640625, + "epoch": 1.1232876712328768, + "grad_norm": 2.2640976905822754, + "kl": 0.18017578125, + "learning_rate": 8.876712328767123e-07, + "loss": 0.0072, + "reward": 1.5431283712387085, + "reward_std": 0.19479237496852875, + "rewards/accuracy_reward": 0.5431284010410309, + "rewards/format_reward": 1.0, + "step": 246 + }, + { + "completion_length": 70.78125, + "epoch": 1.1278538812785388, + "grad_norm": 6.029892444610596, + "kl": 0.1064453125, + "learning_rate": 8.872146118721461e-07, + "loss": 0.0042, + "reward": 1.6893415451049805, + "reward_std": 0.1636102795600891, + "rewards/accuracy_reward": 0.6971540153026581, + "rewards/format_reward": 0.9921875, + "step": 247 + }, + { + "completion_length": 67.9296875, + "epoch": 1.1324200913242009, + "grad_norm": 9.093193054199219, + "kl": 0.1455078125, + "learning_rate": 8.867579908675798e-07, + "loss": 0.0058, + "reward": 1.6471974849700928, + "reward_std": 0.1802346110343933, + "rewards/accuracy_reward": 0.647197425365448, + "rewards/format_reward": 1.0, + "step": 248 + }, + { + "completion_length": 86.5234375, + "epoch": 1.1369863013698631, + "grad_norm": 3.213435411453247, + "kl": 0.114501953125, + "learning_rate": 8.863013698630136e-07, + "loss": 0.0046, + "reward": 1.550682783126831, + "reward_std": 0.20900916308164597, + "rewards/accuracy_reward": 0.5663077533245087, + "rewards/format_reward": 0.984375, + "step": 249 + }, + { + "completion_length": 79.015625, + "epoch": 1.1415525114155252, + "grad_norm": 3.8062937259674072, + "kl": 0.124267578125, + "learning_rate": 8.858447488584474e-07, + "loss": 0.005, + "reward": 1.5841034650802612, + "reward_std": 0.17966121435165405, + "rewards/accuracy_reward": 0.5997284352779388, + "rewards/format_reward": 0.984375, + "step": 250 + }, + { + "completion_length": 67.578125, + "epoch": 1.1461187214611872, + "grad_norm": 2.4963302612304688, + "kl": 0.147705078125, + "learning_rate": 8.853881278538813e-07, + "loss": 0.0059, + "reward": 1.712117850780487, + "reward_std": 0.15082692354917526, + "rewards/accuracy_reward": 0.7199303209781647, + "rewards/format_reward": 0.9921875, + "step": 251 + }, + { + "completion_length": 66.7109375, + "epoch": 1.1506849315068493, + "grad_norm": 4.159963607788086, + "kl": 0.132080078125, + "learning_rate": 8.849315068493151e-07, + "loss": 0.0053, + "reward": 1.4834584593772888, + "reward_std": 0.2506244257092476, + "rewards/accuracy_reward": 0.4834584891796112, + "rewards/format_reward": 1.0, + "step": 252 + }, + { + "completion_length": 65.9140625, + "epoch": 1.1552511415525113, + "grad_norm": 3.3972127437591553, + "kl": 0.11669921875, + "learning_rate": 8.844748858447488e-07, + "loss": 0.0047, + "reward": 1.7484084367752075, + "reward_std": 0.17951766774058342, + "rewards/accuracy_reward": 0.748408317565918, + "rewards/format_reward": 1.0, + "step": 253 + }, + { + "completion_length": 70.4921875, + "epoch": 1.1598173515981736, + "grad_norm": 5.158849239349365, + "kl": 0.105712890625, + "learning_rate": 8.840182648401826e-07, + "loss": 0.0042, + "reward": 1.6956559419631958, + "reward_std": 0.19831054285168648, + "rewards/accuracy_reward": 0.7034684121608734, + "rewards/format_reward": 0.9921875, + "step": 254 + }, + { + "completion_length": 65.328125, + "epoch": 1.1643835616438356, + "grad_norm": 4.302172660827637, + "kl": 0.1591796875, + "learning_rate": 8.835616438356164e-07, + "loss": 0.0064, + "reward": 1.5410139560699463, + "reward_std": 0.24922513961791992, + "rewards/accuracy_reward": 0.5488264262676239, + "rewards/format_reward": 0.9921875, + "step": 255 + }, + { + "completion_length": 90.1015625, + "epoch": 1.1689497716894977, + "grad_norm": 4.157711029052734, + "kl": 0.07763671875, + "learning_rate": 8.831050228310501e-07, + "loss": 0.0031, + "reward": 1.8432291746139526, + "reward_std": 0.1643235646188259, + "rewards/accuracy_reward": 0.8588541150093079, + "rewards/format_reward": 0.984375, + "step": 256 + }, + { + "completion_length": 76.1171875, + "epoch": 1.17351598173516, + "grad_norm": 8.10622787475586, + "kl": 0.111328125, + "learning_rate": 8.826484018264839e-07, + "loss": 0.0045, + "reward": 1.6758702397346497, + "reward_std": 0.2606264054775238, + "rewards/accuracy_reward": 0.699307769536972, + "rewards/format_reward": 0.9765625, + "step": 257 + }, + { + "completion_length": 79.9609375, + "epoch": 1.178082191780822, + "grad_norm": 9.346076011657715, + "kl": 0.105224609375, + "learning_rate": 8.821917808219178e-07, + "loss": 0.0042, + "reward": 1.7317607402801514, + "reward_std": 0.1373404860496521, + "rewards/accuracy_reward": 0.7395730912685394, + "rewards/format_reward": 0.9921875, + "step": 258 + }, + { + "completion_length": 52.734375, + "epoch": 1.182648401826484, + "grad_norm": 2.3975396156311035, + "kl": 0.13427734375, + "learning_rate": 8.817351598173516e-07, + "loss": 0.0054, + "reward": 1.545721709728241, + "reward_std": 0.26958315074443817, + "rewards/accuracy_reward": 0.561346709728241, + "rewards/format_reward": 0.984375, + "step": 259 + }, + { + "completion_length": 59.125, + "epoch": 1.187214611872146, + "grad_norm": 4.990502834320068, + "kl": 0.130126953125, + "learning_rate": 8.812785388127854e-07, + "loss": 0.0052, + "reward": 1.5396197438240051, + "reward_std": 0.338210791349411, + "rewards/accuracy_reward": 0.5552447736263275, + "rewards/format_reward": 0.984375, + "step": 260 + }, + { + "completion_length": 92.8984375, + "epoch": 1.191780821917808, + "grad_norm": 1.891558051109314, + "kl": 0.083251953125, + "learning_rate": 8.808219178082191e-07, + "loss": 0.0033, + "reward": 1.6179687976837158, + "reward_std": 0.19345303252339363, + "rewards/accuracy_reward": 0.6414062082767487, + "rewards/format_reward": 0.9765625, + "step": 261 + }, + { + "completion_length": 37.671875, + "epoch": 1.1963470319634704, + "grad_norm": 3.5058436393737793, + "kl": 0.12646484375, + "learning_rate": 8.803652968036529e-07, + "loss": 0.0051, + "reward": 1.3005208373069763, + "reward_std": 0.35989323258399963, + "rewards/accuracy_reward": 0.3317708224058151, + "rewards/format_reward": 0.96875, + "step": 262 + }, + { + "completion_length": 67.3203125, + "epoch": 1.2009132420091324, + "grad_norm": 3.314962387084961, + "kl": 0.121826171875, + "learning_rate": 8.799086757990867e-07, + "loss": 0.0049, + "reward": 1.6558881998062134, + "reward_std": 0.2189551442861557, + "rewards/accuracy_reward": 0.655888170003891, + "rewards/format_reward": 1.0, + "step": 263 + }, + { + "completion_length": 69.8203125, + "epoch": 1.2054794520547945, + "grad_norm": 3.078131675720215, + "kl": 0.14013671875, + "learning_rate": 8.794520547945205e-07, + "loss": 0.0056, + "reward": 1.6604167222976685, + "reward_std": 0.2180890440940857, + "rewards/accuracy_reward": 0.6760416328907013, + "rewards/format_reward": 0.984375, + "step": 264 + }, + { + "completion_length": 79.875, + "epoch": 1.2100456621004567, + "grad_norm": 2.9452457427978516, + "kl": 0.086669921875, + "learning_rate": 8.789954337899543e-07, + "loss": 0.0035, + "reward": 1.616637647151947, + "reward_std": 0.1805976927280426, + "rewards/accuracy_reward": 0.6244500577449799, + "rewards/format_reward": 0.9921875, + "step": 265 + }, + { + "completion_length": 68.3984375, + "epoch": 1.2146118721461188, + "grad_norm": 3.070889711380005, + "kl": 0.1455078125, + "learning_rate": 8.785388127853881e-07, + "loss": 0.0058, + "reward": 1.6356534361839294, + "reward_std": 0.20898383855819702, + "rewards/accuracy_reward": 0.6356533765792847, + "rewards/format_reward": 1.0, + "step": 266 + }, + { + "completion_length": 84.015625, + "epoch": 1.2191780821917808, + "grad_norm": 4.916512489318848, + "kl": 0.090087890625, + "learning_rate": 8.780821917808219e-07, + "loss": 0.0036, + "reward": 1.6764622926712036, + "reward_std": 0.22947455942630768, + "rewards/accuracy_reward": 0.6998997330665588, + "rewards/format_reward": 0.9765625, + "step": 267 + }, + { + "completion_length": 56.9296875, + "epoch": 1.2237442922374429, + "grad_norm": 3.320021629333496, + "kl": 0.1376953125, + "learning_rate": 8.776255707762557e-07, + "loss": 0.0055, + "reward": 1.5441592335700989, + "reward_std": 0.2436549812555313, + "rewards/accuracy_reward": 0.5519717484712601, + "rewards/format_reward": 0.9921875, + "step": 268 + }, + { + "completion_length": 55.03125, + "epoch": 1.228310502283105, + "grad_norm": 2.28918194770813, + "kl": 0.099609375, + "learning_rate": 8.771689497716894e-07, + "loss": 0.004, + "reward": 1.567187488079071, + "reward_std": 0.24649156630039215, + "rewards/accuracy_reward": 0.582812488079071, + "rewards/format_reward": 0.984375, + "step": 269 + }, + { + "completion_length": 65.3203125, + "epoch": 1.2328767123287672, + "grad_norm": 4.561470985412598, + "kl": 0.09423828125, + "learning_rate": 8.767123287671232e-07, + "loss": 0.0038, + "reward": 1.5898438096046448, + "reward_std": 0.22158773988485336, + "rewards/accuracy_reward": 0.59765625, + "rewards/format_reward": 0.9921875, + "step": 270 + }, + { + "completion_length": 71.8984375, + "epoch": 1.2374429223744292, + "grad_norm": 2.199537992477417, + "kl": 0.130126953125, + "learning_rate": 8.762557077625571e-07, + "loss": 0.0052, + "reward": 1.6058040857315063, + "reward_std": 0.20108795166015625, + "rewards/accuracy_reward": 0.6058041155338287, + "rewards/format_reward": 1.0, + "step": 271 + }, + { + "completion_length": 76.6015625, + "epoch": 1.2420091324200913, + "grad_norm": 1.9431648254394531, + "kl": 0.116455078125, + "learning_rate": 8.757990867579908e-07, + "loss": 0.0047, + "reward": 1.7972594499588013, + "reward_std": 0.12338224425911903, + "rewards/accuracy_reward": 0.7972594499588013, + "rewards/format_reward": 1.0, + "step": 272 + }, + { + "completion_length": 69.6015625, + "epoch": 1.2465753424657535, + "grad_norm": 1.7630584239959717, + "kl": 0.103515625, + "learning_rate": 8.753424657534246e-07, + "loss": 0.0041, + "reward": 1.6484375, + "reward_std": 0.1649293452501297, + "rewards/accuracy_reward": 0.65625, + "rewards/format_reward": 0.9921875, + "step": 273 + }, + { + "completion_length": 98.6640625, + "epoch": 1.2511415525114156, + "grad_norm": 3.8383920192718506, + "kl": 0.103271484375, + "learning_rate": 8.748858447488584e-07, + "loss": 0.0041, + "reward": 1.765743374824524, + "reward_std": 0.13689537346363068, + "rewards/accuracy_reward": 0.7813683450222015, + "rewards/format_reward": 0.984375, + "step": 274 + }, + { + "completion_length": 69.1328125, + "epoch": 1.2557077625570776, + "grad_norm": 3.83646297454834, + "kl": 0.107666015625, + "learning_rate": 8.744292237442922e-07, + "loss": 0.0043, + "reward": 1.5489718914031982, + "reward_std": 0.2690936550498009, + "rewards/accuracy_reward": 0.5645968914031982, + "rewards/format_reward": 0.984375, + "step": 275 + }, + { + "completion_length": 74.171875, + "epoch": 1.2602739726027397, + "grad_norm": 1.6941070556640625, + "kl": 0.103759765625, + "learning_rate": 8.73972602739726e-07, + "loss": 0.0042, + "reward": 1.6818824410438538, + "reward_std": 0.10927051305770874, + "rewards/accuracy_reward": 0.6818824410438538, + "rewards/format_reward": 1.0, + "step": 276 + }, + { + "completion_length": 68.9375, + "epoch": 1.2648401826484017, + "grad_norm": 5.96689510345459, + "kl": 0.126220703125, + "learning_rate": 8.735159817351597e-07, + "loss": 0.0051, + "reward": 1.4658617973327637, + "reward_std": 0.19346562027931213, + "rewards/accuracy_reward": 0.4736742377281189, + "rewards/format_reward": 0.9921875, + "step": 277 + }, + { + "completion_length": 73.265625, + "epoch": 1.269406392694064, + "grad_norm": 2.1737794876098633, + "kl": 0.124267578125, + "learning_rate": 8.730593607305936e-07, + "loss": 0.005, + "reward": 1.6984375715255737, + "reward_std": 0.13310657069087029, + "rewards/accuracy_reward": 0.7062499523162842, + "rewards/format_reward": 0.9921875, + "step": 278 + }, + { + "completion_length": 65.4609375, + "epoch": 1.273972602739726, + "grad_norm": 2.1923165321350098, + "kl": 0.1279296875, + "learning_rate": 8.726027397260274e-07, + "loss": 0.0051, + "reward": 1.7382813096046448, + "reward_std": 0.1450316607952118, + "rewards/accuracy_reward": 0.7460937201976776, + "rewards/format_reward": 0.9921875, + "step": 279 + }, + { + "completion_length": 69.6328125, + "epoch": 1.278538812785388, + "grad_norm": 2.4596610069274902, + "kl": 0.11279296875, + "learning_rate": 8.721461187214611e-07, + "loss": 0.0045, + "reward": 1.7476563453674316, + "reward_std": 0.15961872786283493, + "rewards/accuracy_reward": 0.7476562261581421, + "rewards/format_reward": 1.0, + "step": 280 + }, + { + "completion_length": 74.1640625, + "epoch": 1.2831050228310503, + "grad_norm": 1.8160244226455688, + "kl": 0.08837890625, + "learning_rate": 8.716894977168949e-07, + "loss": 0.0035, + "reward": 1.625745713710785, + "reward_std": 0.17970408126711845, + "rewards/accuracy_reward": 0.6257456988096237, + "rewards/format_reward": 1.0, + "step": 281 + }, + { + "completion_length": 56.1875, + "epoch": 1.2876712328767124, + "grad_norm": 2.5006446838378906, + "kl": 0.14306640625, + "learning_rate": 8.712328767123287e-07, + "loss": 0.0057, + "reward": 1.571769893169403, + "reward_std": 0.2276684269309044, + "rewards/accuracy_reward": 0.5717698335647583, + "rewards/format_reward": 1.0, + "step": 282 + }, + { + "completion_length": 55.5234375, + "epoch": 1.2922374429223744, + "grad_norm": 6.536046504974365, + "kl": 0.1826171875, + "learning_rate": 8.707762557077625e-07, + "loss": 0.0073, + "reward": 1.6325520277023315, + "reward_std": 0.21870127320289612, + "rewards/accuracy_reward": 0.6325520575046539, + "rewards/format_reward": 1.0, + "step": 283 + }, + { + "completion_length": 93.65625, + "epoch": 1.2968036529680365, + "grad_norm": 10.913942337036133, + "kl": 0.0833740234375, + "learning_rate": 8.703196347031964e-07, + "loss": 0.0033, + "reward": 1.6755207777023315, + "reward_std": 0.17495759949088097, + "rewards/accuracy_reward": 0.6833332777023315, + "rewards/format_reward": 0.9921875, + "step": 284 + }, + { + "completion_length": 67.125, + "epoch": 1.3013698630136985, + "grad_norm": 2.0972490310668945, + "kl": 0.146484375, + "learning_rate": 8.698630136986301e-07, + "loss": 0.0059, + "reward": 1.4372395873069763, + "reward_std": 0.24129608273506165, + "rewards/accuracy_reward": 0.4372395873069763, + "rewards/format_reward": 1.0, + "step": 285 + }, + { + "completion_length": 83.25, + "epoch": 1.3059360730593608, + "grad_norm": 2.7569093704223633, + "kl": 0.1005859375, + "learning_rate": 8.694063926940639e-07, + "loss": 0.004, + "reward": 1.6421875953674316, + "reward_std": 0.2471975013613701, + "rewards/accuracy_reward": 0.6734374463558197, + "rewards/format_reward": 0.96875, + "step": 286 + }, + { + "completion_length": 60.7578125, + "epoch": 1.3105022831050228, + "grad_norm": 4.288987636566162, + "kl": 0.10888671875, + "learning_rate": 8.689497716894977e-07, + "loss": 0.0044, + "reward": 1.562853455543518, + "reward_std": 0.25185681879520416, + "rewards/accuracy_reward": 0.5706659406423569, + "rewards/format_reward": 0.9921875, + "step": 287 + }, + { + "completion_length": 72.6484375, + "epoch": 1.3150684931506849, + "grad_norm": 5.571771621704102, + "kl": 0.128662109375, + "learning_rate": 8.684931506849314e-07, + "loss": 0.0052, + "reward": 1.57805997133255, + "reward_std": 0.23417949676513672, + "rewards/accuracy_reward": 0.5858723521232605, + "rewards/format_reward": 0.9921875, + "step": 288 + }, + { + "completion_length": 84.0390625, + "epoch": 1.3196347031963471, + "grad_norm": 1.9545791149139404, + "kl": 0.085693359375, + "learning_rate": 8.680365296803652e-07, + "loss": 0.0034, + "reward": 1.768750011920929, + "reward_std": 0.1173202283680439, + "rewards/accuracy_reward": 0.7765624225139618, + "rewards/format_reward": 0.9921875, + "step": 289 + }, + { + "completion_length": 79.34375, + "epoch": 1.3242009132420092, + "grad_norm": 2.6059577465057373, + "kl": 0.100830078125, + "learning_rate": 8.67579908675799e-07, + "loss": 0.004, + "reward": 1.5458333492279053, + "reward_std": 0.31738781929016113, + "rewards/accuracy_reward": 0.5848958194255829, + "rewards/format_reward": 0.9609375, + "step": 290 + }, + { + "completion_length": 77.8828125, + "epoch": 1.3287671232876712, + "grad_norm": 2.8494393825531006, + "kl": 0.126953125, + "learning_rate": 8.671232876712329e-07, + "loss": 0.0051, + "reward": 1.6368772983551025, + "reward_std": 0.22899606823921204, + "rewards/accuracy_reward": 0.6603147983551025, + "rewards/format_reward": 0.9765625, + "step": 291 + }, + { + "completion_length": 91.6484375, + "epoch": 1.3333333333333333, + "grad_norm": 2.535183906555176, + "kl": 0.08740234375, + "learning_rate": 8.666666666666667e-07, + "loss": 0.0035, + "reward": 1.6453125476837158, + "reward_std": 0.15308689326047897, + "rewards/accuracy_reward": 0.6687499582767487, + "rewards/format_reward": 0.9765625, + "step": 292 + }, + { + "completion_length": 65.921875, + "epoch": 1.3378995433789953, + "grad_norm": 3.120173692703247, + "kl": 0.135009765625, + "learning_rate": 8.662100456621004e-07, + "loss": 0.0054, + "reward": 1.541332721710205, + "reward_std": 0.2652948349714279, + "rewards/accuracy_reward": 0.5647702217102051, + "rewards/format_reward": 0.9765625, + "step": 293 + }, + { + "completion_length": 64.9609375, + "epoch": 1.3424657534246576, + "grad_norm": 3.71287202835083, + "kl": 0.096923828125, + "learning_rate": 8.657534246575342e-07, + "loss": 0.0039, + "reward": 1.375390648841858, + "reward_std": 0.3063650578260422, + "rewards/accuracy_reward": 0.3988281339406967, + "rewards/format_reward": 0.9765625, + "step": 294 + }, + { + "completion_length": 86.9296875, + "epoch": 1.3470319634703196, + "grad_norm": 4.879149913787842, + "kl": 0.112548828125, + "learning_rate": 8.65296803652968e-07, + "loss": 0.0045, + "reward": 1.7951704263687134, + "reward_std": 0.1273602545261383, + "rewards/accuracy_reward": 0.795170396566391, + "rewards/format_reward": 1.0, + "step": 295 + }, + { + "completion_length": 84.3203125, + "epoch": 1.3515981735159817, + "grad_norm": 2.7764885425567627, + "kl": 0.102783203125, + "learning_rate": 8.648401826484017e-07, + "loss": 0.0041, + "reward": 1.7486329078674316, + "reward_std": 0.2014031484723091, + "rewards/accuracy_reward": 0.7564452290534973, + "rewards/format_reward": 0.9921875, + "step": 296 + }, + { + "completion_length": 87.6875, + "epoch": 1.356164383561644, + "grad_norm": 3.286853551864624, + "kl": 0.1103515625, + "learning_rate": 8.643835616438355e-07, + "loss": 0.0044, + "reward": 1.5868489742279053, + "reward_std": 0.21391719579696655, + "rewards/accuracy_reward": 0.6024739146232605, + "rewards/format_reward": 0.984375, + "step": 297 + }, + { + "completion_length": 72.0, + "epoch": 1.360730593607306, + "grad_norm": 3.1161341667175293, + "kl": 0.130615234375, + "learning_rate": 8.639269406392694e-07, + "loss": 0.0052, + "reward": 1.551562488079071, + "reward_std": 0.2892322689294815, + "rewards/accuracy_reward": 0.5750000178813934, + "rewards/format_reward": 0.9765625, + "step": 298 + }, + { + "completion_length": 95.8828125, + "epoch": 1.365296803652968, + "grad_norm": 2.0366954803466797, + "kl": 0.091796875, + "learning_rate": 8.634703196347032e-07, + "loss": 0.0037, + "reward": 1.8364962935447693, + "reward_std": 0.09190401062369347, + "rewards/accuracy_reward": 0.8443087041378021, + "rewards/format_reward": 0.9921875, + "step": 299 + }, + { + "completion_length": 101.921875, + "epoch": 1.36986301369863, + "grad_norm": 9.95484447479248, + "kl": 0.080322265625, + "learning_rate": 8.63013698630137e-07, + "loss": 0.0032, + "reward": 1.629079818725586, + "reward_std": 0.13987341336905956, + "rewards/accuracy_reward": 0.6290798187255859, + "rewards/format_reward": 1.0, + "step": 300 + }, + { + "completion_length": 79.34375, + "epoch": 1.374429223744292, + "grad_norm": 19.206026077270508, + "kl": 0.117919921875, + "learning_rate": 8.625570776255707e-07, + "loss": 0.0047, + "reward": 1.6438615918159485, + "reward_std": 0.21473538875579834, + "rewards/accuracy_reward": 0.6516740918159485, + "rewards/format_reward": 0.9921875, + "step": 301 + }, + { + "completion_length": 86.4609375, + "epoch": 1.3789954337899544, + "grad_norm": 9.368078231811523, + "kl": 0.09228515625, + "learning_rate": 8.621004566210045e-07, + "loss": 0.0037, + "reward": 1.6070312857627869, + "reward_std": 0.1797475889325142, + "rewards/accuracy_reward": 0.6226562261581421, + "rewards/format_reward": 0.984375, + "step": 302 + }, + { + "completion_length": 79.0625, + "epoch": 1.3835616438356164, + "grad_norm": 2.789032220840454, + "kl": 0.101806640625, + "learning_rate": 8.616438356164383e-07, + "loss": 0.0041, + "reward": 1.62343031167984, + "reward_std": 0.2234785482287407, + "rewards/accuracy_reward": 0.6234302222728729, + "rewards/format_reward": 1.0, + "step": 303 + }, + { + "completion_length": 68.3984375, + "epoch": 1.3881278538812785, + "grad_norm": 2.6755340099334717, + "kl": 0.10009765625, + "learning_rate": 8.611872146118721e-07, + "loss": 0.004, + "reward": 1.5965625047683716, + "reward_std": 0.1608435958623886, + "rewards/accuracy_reward": 0.6121874451637268, + "rewards/format_reward": 0.984375, + "step": 304 + }, + { + "completion_length": 79.3203125, + "epoch": 1.3926940639269407, + "grad_norm": 3.4425978660583496, + "kl": 0.1259765625, + "learning_rate": 8.607305936073059e-07, + "loss": 0.005, + "reward": 1.7191716432571411, + "reward_std": 0.11866222321987152, + "rewards/accuracy_reward": 0.7269841134548187, + "rewards/format_reward": 0.9921875, + "step": 305 + }, + { + "completion_length": 105.5546875, + "epoch": 1.3972602739726028, + "grad_norm": 4.509172439575195, + "kl": 0.063720703125, + "learning_rate": 8.602739726027397e-07, + "loss": 0.0025, + "reward": 1.7390625476837158, + "reward_std": 0.14966704696416855, + "rewards/accuracy_reward": 0.739062488079071, + "rewards/format_reward": 1.0, + "step": 306 + }, + { + "completion_length": 87.1953125, + "epoch": 1.4018264840182648, + "grad_norm": 2.9753623008728027, + "kl": 0.13232421875, + "learning_rate": 8.598173515981735e-07, + "loss": 0.0053, + "reward": 1.6669872403144836, + "reward_std": 0.14199939370155334, + "rewards/accuracy_reward": 0.6669871509075165, + "rewards/format_reward": 1.0, + "step": 307 + }, + { + "completion_length": 68.671875, + "epoch": 1.4063926940639269, + "grad_norm": 3.5725512504577637, + "kl": 0.105712890625, + "learning_rate": 8.593607305936073e-07, + "loss": 0.0042, + "reward": 1.5044271349906921, + "reward_std": 0.2842589318752289, + "rewards/accuracy_reward": 0.5044270902872086, + "rewards/format_reward": 1.0, + "step": 308 + }, + { + "completion_length": 84.8671875, + "epoch": 1.410958904109589, + "grad_norm": 2.2308547496795654, + "kl": 0.091796875, + "learning_rate": 8.58904109589041e-07, + "loss": 0.0037, + "reward": 1.5451836585998535, + "reward_std": 0.23898552358150482, + "rewards/accuracy_reward": 0.5608087480068207, + "rewards/format_reward": 0.984375, + "step": 309 + }, + { + "completion_length": 74.3984375, + "epoch": 1.4155251141552512, + "grad_norm": 5.253906726837158, + "kl": 0.139892578125, + "learning_rate": 8.584474885844748e-07, + "loss": 0.0056, + "reward": 1.5776489973068237, + "reward_std": 0.20516303181648254, + "rewards/accuracy_reward": 0.5854615569114685, + "rewards/format_reward": 0.9921875, + "step": 310 + }, + { + "completion_length": 75.515625, + "epoch": 1.4200913242009132, + "grad_norm": 3.5134174823760986, + "kl": 0.0849609375, + "learning_rate": 8.579908675799087e-07, + "loss": 0.0034, + "reward": 1.4553078413009644, + "reward_std": 0.26812630146741867, + "rewards/accuracy_reward": 0.47093285620212555, + "rewards/format_reward": 0.984375, + "step": 311 + }, + { + "completion_length": 100.5, + "epoch": 1.4246575342465753, + "grad_norm": 2.315570116043091, + "kl": 0.090087890625, + "learning_rate": 8.575342465753424e-07, + "loss": 0.0036, + "reward": 1.597842276096344, + "reward_std": 0.192316435277462, + "rewards/accuracy_reward": 0.5978422611951828, + "rewards/format_reward": 1.0, + "step": 312 + }, + { + "completion_length": 85.46875, + "epoch": 1.4292237442922375, + "grad_norm": 4.950066566467285, + "kl": 0.09375, + "learning_rate": 8.570776255707762e-07, + "loss": 0.0037, + "reward": 1.6005207896232605, + "reward_std": 0.23914001137018204, + "rewards/accuracy_reward": 0.6083333194255829, + "rewards/format_reward": 0.9921875, + "step": 313 + }, + { + "completion_length": 73.1328125, + "epoch": 1.4337899543378996, + "grad_norm": 3.737804651260376, + "kl": 0.13037109375, + "learning_rate": 8.5662100456621e-07, + "loss": 0.0052, + "reward": 1.542373538017273, + "reward_std": 0.24436407536268234, + "rewards/accuracy_reward": 0.542373538017273, + "rewards/format_reward": 1.0, + "step": 314 + }, + { + "completion_length": 81.2265625, + "epoch": 1.4383561643835616, + "grad_norm": 2.7811484336853027, + "kl": 0.10693359375, + "learning_rate": 8.561643835616438e-07, + "loss": 0.0043, + "reward": 1.7172211408615112, + "reward_std": 0.15319041907787323, + "rewards/accuracy_reward": 0.7172211408615112, + "rewards/format_reward": 1.0, + "step": 315 + }, + { + "completion_length": 82.328125, + "epoch": 1.4429223744292237, + "grad_norm": 5.399558067321777, + "kl": 0.10986328125, + "learning_rate": 8.557077625570776e-07, + "loss": 0.0044, + "reward": 1.5121857523918152, + "reward_std": 0.28201115131378174, + "rewards/accuracy_reward": 0.5199982225894928, + "rewards/format_reward": 0.9921875, + "step": 316 + }, + { + "completion_length": 103.671875, + "epoch": 1.4474885844748857, + "grad_norm": 1.9453126192092896, + "kl": 0.070068359375, + "learning_rate": 8.552511415525113e-07, + "loss": 0.0028, + "reward": 1.7234273552894592, + "reward_std": 0.12736555561423302, + "rewards/accuracy_reward": 0.7234272956848145, + "rewards/format_reward": 1.0, + "step": 317 + }, + { + "completion_length": 91.109375, + "epoch": 1.452054794520548, + "grad_norm": 4.417510032653809, + "kl": 0.09716796875, + "learning_rate": 8.547945205479452e-07, + "loss": 0.0039, + "reward": 1.6735481023788452, + "reward_std": 0.17602262273430824, + "rewards/accuracy_reward": 0.68136066198349, + "rewards/format_reward": 0.9921875, + "step": 318 + }, + { + "completion_length": 86.109375, + "epoch": 1.45662100456621, + "grad_norm": 3.274066925048828, + "kl": 0.105712890625, + "learning_rate": 8.54337899543379e-07, + "loss": 0.0042, + "reward": 1.6061198115348816, + "reward_std": 0.2346249595284462, + "rewards/accuracy_reward": 0.6061197817325592, + "rewards/format_reward": 1.0, + "step": 319 + }, + { + "completion_length": 103.8828125, + "epoch": 1.461187214611872, + "grad_norm": 3.0639874935150146, + "kl": 0.09375, + "learning_rate": 8.538812785388127e-07, + "loss": 0.0037, + "reward": 1.5671589970588684, + "reward_std": 0.2653961777687073, + "rewards/accuracy_reward": 0.5984089076519012, + "rewards/format_reward": 0.96875, + "step": 320 + }, + { + "completion_length": 91.578125, + "epoch": 1.4657534246575343, + "grad_norm": 3.1881332397460938, + "kl": 0.11669921875, + "learning_rate": 8.534246575342465e-07, + "loss": 0.0047, + "reward": 1.7369791269302368, + "reward_std": 0.24841733276844025, + "rewards/accuracy_reward": 0.7369791567325592, + "rewards/format_reward": 1.0, + "step": 321 + }, + { + "completion_length": 90.1328125, + "epoch": 1.4703196347031964, + "grad_norm": 7.73578405380249, + "kl": 0.12841796875, + "learning_rate": 8.529680365296803e-07, + "loss": 0.0051, + "reward": 1.5886787176132202, + "reward_std": 0.27549922466278076, + "rewards/accuracy_reward": 0.6043036431074142, + "rewards/format_reward": 0.984375, + "step": 322 + }, + { + "completion_length": 105.2578125, + "epoch": 1.4748858447488584, + "grad_norm": 2.833080768585205, + "kl": 0.063720703125, + "learning_rate": 8.52511415525114e-07, + "loss": 0.0026, + "reward": 1.6982238292694092, + "reward_std": 0.09853163920342922, + "rewards/accuracy_reward": 0.7060362696647644, + "rewards/format_reward": 0.9921875, + "step": 323 + }, + { + "completion_length": 101.9375, + "epoch": 1.4794520547945205, + "grad_norm": 1.9237995147705078, + "kl": 0.0869140625, + "learning_rate": 8.52054794520548e-07, + "loss": 0.0035, + "reward": 1.6929687857627869, + "reward_std": 0.14721976220607758, + "rewards/accuracy_reward": 0.7085936963558197, + "rewards/format_reward": 0.984375, + "step": 324 + }, + { + "completion_length": 93.171875, + "epoch": 1.4840182648401825, + "grad_norm": 21.809825897216797, + "kl": 0.100830078125, + "learning_rate": 8.515981735159817e-07, + "loss": 0.004, + "reward": 1.464756965637207, + "reward_std": 0.26955385506153107, + "rewards/accuracy_reward": 0.503819465637207, + "rewards/format_reward": 0.9609375, + "step": 325 + }, + { + "completion_length": 102.03125, + "epoch": 1.4885844748858448, + "grad_norm": 2.1946544647216797, + "kl": 0.075439453125, + "learning_rate": 8.511415525114155e-07, + "loss": 0.003, + "reward": 1.7023438215255737, + "reward_std": 0.15592241287231445, + "rewards/accuracy_reward": 0.7179687023162842, + "rewards/format_reward": 0.984375, + "step": 326 + }, + { + "completion_length": 78.6875, + "epoch": 1.4931506849315068, + "grad_norm": 5.698726654052734, + "kl": 0.11376953125, + "learning_rate": 8.506849315068493e-07, + "loss": 0.0046, + "reward": 1.5953141450881958, + "reward_std": 0.2819615304470062, + "rewards/accuracy_reward": 0.618751734495163, + "rewards/format_reward": 0.9765625, + "step": 327 + }, + { + "completion_length": 89.71875, + "epoch": 1.4977168949771689, + "grad_norm": 4.1140007972717285, + "kl": 0.080078125, + "learning_rate": 8.50228310502283e-07, + "loss": 0.0032, + "reward": 1.6470133662223816, + "reward_std": 0.18788425624370575, + "rewards/accuracy_reward": 0.6626383662223816, + "rewards/format_reward": 0.984375, + "step": 328 + }, + { + "completion_length": 116.9375, + "epoch": 1.5022831050228311, + "grad_norm": 2.189347743988037, + "kl": 0.0545654296875, + "learning_rate": 8.497716894977168e-07, + "loss": 0.0022, + "reward": 1.71875, + "reward_std": 0.12179600074887276, + "rewards/accuracy_reward": 0.7343749701976776, + "rewards/format_reward": 0.984375, + "step": 329 + }, + { + "completion_length": 88.703125, + "epoch": 1.5068493150684932, + "grad_norm": 2.298283815383911, + "kl": 0.105712890625, + "learning_rate": 8.493150684931506e-07, + "loss": 0.0042, + "reward": 1.686813473701477, + "reward_std": 0.1672440692782402, + "rewards/accuracy_reward": 0.6868133842945099, + "rewards/format_reward": 1.0, + "step": 330 + }, + { + "completion_length": 69.828125, + "epoch": 1.5114155251141552, + "grad_norm": 6.333926200866699, + "kl": 0.13134765625, + "learning_rate": 8.488584474885845e-07, + "loss": 0.0053, + "reward": 1.7103299498558044, + "reward_std": 0.1922176629304886, + "rewards/accuracy_reward": 0.7103298306465149, + "rewards/format_reward": 1.0, + "step": 331 + }, + { + "completion_length": 95.6484375, + "epoch": 1.5159817351598175, + "grad_norm": 1.9884992837905884, + "kl": 0.109375, + "learning_rate": 8.484018264840183e-07, + "loss": 0.0044, + "reward": 1.6320313215255737, + "reward_std": 0.2171119600534439, + "rewards/accuracy_reward": 0.6398437321186066, + "rewards/format_reward": 0.9921875, + "step": 332 + }, + { + "completion_length": 80.28125, + "epoch": 1.5205479452054793, + "grad_norm": 3.4122371673583984, + "kl": 0.15234375, + "learning_rate": 8.47945205479452e-07, + "loss": 0.0061, + "reward": 1.6989583373069763, + "reward_std": 0.22790630161762238, + "rewards/accuracy_reward": 0.7067708373069763, + "rewards/format_reward": 0.9921875, + "step": 333 + }, + { + "completion_length": 82.734375, + "epoch": 1.5251141552511416, + "grad_norm": 16.75699234008789, + "kl": 0.084228515625, + "learning_rate": 8.474885844748858e-07, + "loss": 0.0034, + "reward": 1.6330461502075195, + "reward_std": 0.1892491653561592, + "rewards/accuracy_reward": 0.6408586502075195, + "rewards/format_reward": 0.9921875, + "step": 334 + }, + { + "completion_length": 81.421875, + "epoch": 1.5296803652968036, + "grad_norm": 2.46637225151062, + "kl": 0.095703125, + "learning_rate": 8.470319634703196e-07, + "loss": 0.0038, + "reward": 1.6607667207717896, + "reward_std": 0.23428593575954437, + "rewards/accuracy_reward": 0.6763917207717896, + "rewards/format_reward": 0.984375, + "step": 335 + }, + { + "completion_length": 87.4375, + "epoch": 1.5342465753424657, + "grad_norm": 4.196310043334961, + "kl": 0.09423828125, + "learning_rate": 8.465753424657533e-07, + "loss": 0.0038, + "reward": 1.5312500596046448, + "reward_std": 0.2585080787539482, + "rewards/accuracy_reward": 0.5390624850988388, + "rewards/format_reward": 0.9921875, + "step": 336 + }, + { + "completion_length": 73.046875, + "epoch": 1.538812785388128, + "grad_norm": 2.5180585384368896, + "kl": 0.13671875, + "learning_rate": 8.461187214611871e-07, + "loss": 0.0055, + "reward": 1.630428671836853, + "reward_std": 0.20693185180425644, + "rewards/accuracy_reward": 0.6538661420345306, + "rewards/format_reward": 0.9765625, + "step": 337 + }, + { + "completion_length": 83.09375, + "epoch": 1.54337899543379, + "grad_norm": 4.625184059143066, + "kl": 0.107666015625, + "learning_rate": 8.45662100456621e-07, + "loss": 0.0043, + "reward": 1.7304688096046448, + "reward_std": 0.16690129786729813, + "rewards/accuracy_reward": 0.7304687201976776, + "rewards/format_reward": 1.0, + "step": 338 + }, + { + "completion_length": 79.3515625, + "epoch": 1.547945205479452, + "grad_norm": 7.074364185333252, + "kl": 0.2109375, + "learning_rate": 8.452054794520548e-07, + "loss": 0.0084, + "reward": 1.6570312976837158, + "reward_std": 0.24604861438274384, + "rewards/accuracy_reward": 0.680468738079071, + "rewards/format_reward": 0.9765625, + "step": 339 + }, + { + "completion_length": 79.8125, + "epoch": 1.5525114155251143, + "grad_norm": 3.3716442584991455, + "kl": 0.086669921875, + "learning_rate": 8.447488584474886e-07, + "loss": 0.0035, + "reward": 1.5007859468460083, + "reward_std": 0.2506624162197113, + "rewards/accuracy_reward": 0.5242233872413635, + "rewards/format_reward": 0.9765625, + "step": 340 + }, + { + "completion_length": 75.390625, + "epoch": 1.5570776255707761, + "grad_norm": 3.0893497467041016, + "kl": 0.099853515625, + "learning_rate": 8.442922374429223e-07, + "loss": 0.004, + "reward": 1.546875, + "reward_std": 0.2782461494207382, + "rewards/accuracy_reward": 0.578125, + "rewards/format_reward": 0.96875, + "step": 341 + }, + { + "completion_length": 80.75, + "epoch": 1.5616438356164384, + "grad_norm": 1.9481297731399536, + "kl": 0.0885009765625, + "learning_rate": 8.438356164383561e-07, + "loss": 0.0035, + "reward": 1.5662733316421509, + "reward_std": 0.20162209123373032, + "rewards/accuracy_reward": 0.5897107124328613, + "rewards/format_reward": 0.9765625, + "step": 342 + }, + { + "completion_length": 72.890625, + "epoch": 1.5662100456621004, + "grad_norm": 2.927980899810791, + "kl": 0.112548828125, + "learning_rate": 8.433789954337899e-07, + "loss": 0.0045, + "reward": 1.4613777995109558, + "reward_std": 0.34207557141780853, + "rewards/accuracy_reward": 0.5473152101039886, + "rewards/format_reward": 0.9140625, + "step": 343 + }, + { + "completion_length": 56.953125, + "epoch": 1.5707762557077625, + "grad_norm": 3.1426687240600586, + "kl": 0.155517578125, + "learning_rate": 8.429223744292237e-07, + "loss": 0.0062, + "reward": 1.3815755248069763, + "reward_std": 0.3664630800485611, + "rewards/accuracy_reward": 0.44407549500465393, + "rewards/format_reward": 0.9375, + "step": 344 + }, + { + "completion_length": 82.2265625, + "epoch": 1.5753424657534247, + "grad_norm": 2.98221492767334, + "kl": 0.126220703125, + "learning_rate": 8.424657534246576e-07, + "loss": 0.005, + "reward": 1.6400888562202454, + "reward_std": 0.24476776085793972, + "rewards/accuracy_reward": 0.6947763860225677, + "rewards/format_reward": 0.9453125, + "step": 345 + }, + { + "completion_length": 67.859375, + "epoch": 1.5799086757990868, + "grad_norm": 2.4362642765045166, + "kl": 0.122314453125, + "learning_rate": 8.420091324200913e-07, + "loss": 0.0049, + "reward": 1.4768601059913635, + "reward_std": 0.258517824113369, + "rewards/accuracy_reward": 0.5159225761890411, + "rewards/format_reward": 0.9609375, + "step": 346 + }, + { + "completion_length": 70.0859375, + "epoch": 1.5844748858447488, + "grad_norm": 3.0861399173736572, + "kl": 0.166015625, + "learning_rate": 8.415525114155251e-07, + "loss": 0.0066, + "reward": 1.6798083782196045, + "reward_std": 0.2793383300304413, + "rewards/accuracy_reward": 0.7032458782196045, + "rewards/format_reward": 0.9765625, + "step": 347 + }, + { + "completion_length": 50.015625, + "epoch": 1.589041095890411, + "grad_norm": 3.495870351791382, + "kl": 0.162109375, + "learning_rate": 8.410958904109589e-07, + "loss": 0.0065, + "reward": 1.582118034362793, + "reward_std": 0.21435417234897614, + "rewards/accuracy_reward": 0.5821180641651154, + "rewards/format_reward": 1.0, + "step": 348 + }, + { + "completion_length": 50.2890625, + "epoch": 1.593607305936073, + "grad_norm": 4.53682804107666, + "kl": 0.146240234375, + "learning_rate": 8.406392694063926e-07, + "loss": 0.0059, + "reward": 1.6024181842803955, + "reward_std": 0.2329491451382637, + "rewards/accuracy_reward": 0.6258556544780731, + "rewards/format_reward": 0.9765625, + "step": 349 + }, + { + "completion_length": 54.34375, + "epoch": 1.5981735159817352, + "grad_norm": 2.6551053524017334, + "kl": 0.127685546875, + "learning_rate": 8.401826484018264e-07, + "loss": 0.0051, + "reward": 1.5606771111488342, + "reward_std": 0.1882794126868248, + "rewards/accuracy_reward": 0.568489596247673, + "rewards/format_reward": 0.9921875, + "step": 350 + }, + { + "completion_length": 66.4140625, + "epoch": 1.6027397260273972, + "grad_norm": 6.037808418273926, + "kl": 0.146484375, + "learning_rate": 8.397260273972603e-07, + "loss": 0.0059, + "reward": 1.6453644037246704, + "reward_std": 0.18358591571450233, + "rewards/accuracy_reward": 0.660989373922348, + "rewards/format_reward": 0.984375, + "step": 351 + }, + { + "completion_length": 49.375, + "epoch": 1.6073059360730593, + "grad_norm": 2.8399972915649414, + "kl": 0.1689453125, + "learning_rate": 8.39269406392694e-07, + "loss": 0.0068, + "reward": 1.6010416746139526, + "reward_std": 0.2630682438611984, + "rewards/accuracy_reward": 0.6088541746139526, + "rewards/format_reward": 0.9921875, + "step": 352 + }, + { + "completion_length": 58.4921875, + "epoch": 1.6118721461187215, + "grad_norm": 1.744779109954834, + "kl": 0.1396484375, + "learning_rate": 8.388127853881279e-07, + "loss": 0.0056, + "reward": 1.675000011920929, + "reward_std": 0.16849348694086075, + "rewards/accuracy_reward": 0.675000011920929, + "rewards/format_reward": 1.0, + "step": 353 + }, + { + "completion_length": 63.8046875, + "epoch": 1.6164383561643836, + "grad_norm": 4.681619167327881, + "kl": 0.14404296875, + "learning_rate": 8.383561643835616e-07, + "loss": 0.0058, + "reward": 1.7920387387275696, + "reward_std": 0.14393481612205505, + "rewards/accuracy_reward": 0.7920385599136353, + "rewards/format_reward": 1.0, + "step": 354 + }, + { + "completion_length": 67.0234375, + "epoch": 1.6210045662100456, + "grad_norm": 3.167783498764038, + "kl": 0.1318359375, + "learning_rate": 8.378995433789954e-07, + "loss": 0.0053, + "reward": 1.728416085243225, + "reward_std": 0.18405602872371674, + "rewards/accuracy_reward": 0.7518534958362579, + "rewards/format_reward": 0.9765625, + "step": 355 + }, + { + "completion_length": 63.390625, + "epoch": 1.625570776255708, + "grad_norm": 3.923906087875366, + "kl": 0.13330078125, + "learning_rate": 8.374429223744292e-07, + "loss": 0.0053, + "reward": 1.6279487609863281, + "reward_std": 0.12273544818162918, + "rewards/accuracy_reward": 0.6279487460851669, + "rewards/format_reward": 1.0, + "step": 356 + }, + { + "completion_length": 61.1796875, + "epoch": 1.6301369863013697, + "grad_norm": 3.410731077194214, + "kl": 0.13427734375, + "learning_rate": 8.369863013698629e-07, + "loss": 0.0054, + "reward": 1.6447545289993286, + "reward_std": 0.22194860875606537, + "rewards/accuracy_reward": 0.6603794991970062, + "rewards/format_reward": 0.984375, + "step": 357 + }, + { + "completion_length": 68.53125, + "epoch": 1.634703196347032, + "grad_norm": 4.652464389801025, + "kl": 0.1181640625, + "learning_rate": 8.365296803652968e-07, + "loss": 0.0047, + "reward": 1.6500000953674316, + "reward_std": 0.17417245358228683, + "rewards/accuracy_reward": 0.6499999761581421, + "rewards/format_reward": 1.0, + "step": 358 + }, + { + "completion_length": 55.984375, + "epoch": 1.639269406392694, + "grad_norm": 1.8993587493896484, + "kl": 0.1298828125, + "learning_rate": 8.360730593607306e-07, + "loss": 0.0052, + "reward": 1.7149627804756165, + "reward_std": 0.11658288538455963, + "rewards/accuracy_reward": 0.7149626910686493, + "rewards/format_reward": 1.0, + "step": 359 + }, + { + "completion_length": 81.7578125, + "epoch": 1.643835616438356, + "grad_norm": 1.6242355108261108, + "kl": 0.145751953125, + "learning_rate": 8.356164383561643e-07, + "loss": 0.0058, + "reward": 1.7650251388549805, + "reward_std": 0.07628657668828964, + "rewards/accuracy_reward": 0.7650250494480133, + "rewards/format_reward": 1.0, + "step": 360 + }, + { + "completion_length": 92.7265625, + "epoch": 1.6484018264840183, + "grad_norm": 1.6242085695266724, + "kl": 0.06298828125, + "learning_rate": 8.351598173515981e-07, + "loss": 0.0025, + "reward": 1.75, + "reward_std": 0.1409430019557476, + "rewards/accuracy_reward": 0.7656249701976776, + "rewards/format_reward": 0.984375, + "step": 361 + }, + { + "completion_length": 71.1328125, + "epoch": 1.6529680365296804, + "grad_norm": 3.2496068477630615, + "kl": 0.109130859375, + "learning_rate": 8.347031963470319e-07, + "loss": 0.0044, + "reward": 1.614074468612671, + "reward_std": 0.19926752150058746, + "rewards/accuracy_reward": 0.6296994388103485, + "rewards/format_reward": 0.984375, + "step": 362 + }, + { + "completion_length": 68.6953125, + "epoch": 1.6575342465753424, + "grad_norm": 1.9562387466430664, + "kl": 0.100830078125, + "learning_rate": 8.342465753424657e-07, + "loss": 0.004, + "reward": 1.7125000953674316, + "reward_std": 0.13402405753731728, + "rewards/accuracy_reward": 0.7124999761581421, + "rewards/format_reward": 1.0, + "step": 363 + }, + { + "completion_length": 65.21875, + "epoch": 1.6621004566210047, + "grad_norm": 2.6374332904815674, + "kl": 0.15673828125, + "learning_rate": 8.337899543378996e-07, + "loss": 0.0063, + "reward": 1.653124988079071, + "reward_std": 0.1894800141453743, + "rewards/accuracy_reward": 0.668749988079071, + "rewards/format_reward": 0.984375, + "step": 364 + }, + { + "completion_length": 50.03125, + "epoch": 1.6666666666666665, + "grad_norm": 3.6687989234924316, + "kl": 0.1171875, + "learning_rate": 8.333333333333333e-07, + "loss": 0.0047, + "reward": 1.5525545477867126, + "reward_std": 0.18541519343852997, + "rewards/accuracy_reward": 0.5525545328855515, + "rewards/format_reward": 1.0, + "step": 365 + }, + { + "completion_length": 50.265625, + "epoch": 1.6712328767123288, + "grad_norm": 27.93817710876465, + "kl": 0.8662109375, + "learning_rate": 8.328767123287671e-07, + "loss": 0.0347, + "reward": 1.6250372529029846, + "reward_std": 0.26529867947101593, + "rewards/accuracy_reward": 0.6250371932983398, + "rewards/format_reward": 1.0, + "step": 366 + }, + { + "completion_length": 55.046875, + "epoch": 1.6757990867579908, + "grad_norm": 1.959330439567566, + "kl": 0.128662109375, + "learning_rate": 8.324200913242009e-07, + "loss": 0.0051, + "reward": 1.611718773841858, + "reward_std": 0.20705362409353256, + "rewards/accuracy_reward": 0.6117187142372131, + "rewards/format_reward": 1.0, + "step": 367 + }, + { + "completion_length": 73.1796875, + "epoch": 1.6803652968036529, + "grad_norm": 4.830105781555176, + "kl": 0.11083984375, + "learning_rate": 8.319634703196346e-07, + "loss": 0.0044, + "reward": 1.7028512358665466, + "reward_std": 0.15199671685695648, + "rewards/accuracy_reward": 0.7106637060642242, + "rewards/format_reward": 0.9921875, + "step": 368 + }, + { + "completion_length": 44.71875, + "epoch": 1.6849315068493151, + "grad_norm": 2.5528032779693604, + "kl": 0.1474609375, + "learning_rate": 8.315068493150684e-07, + "loss": 0.0059, + "reward": 1.5036830306053162, + "reward_std": 0.2237851321697235, + "rewards/accuracy_reward": 0.5036830604076385, + "rewards/format_reward": 1.0, + "step": 369 + }, + { + "completion_length": 64.7578125, + "epoch": 1.6894977168949772, + "grad_norm": 3.1371445655822754, + "kl": 0.10693359375, + "learning_rate": 8.310502283105022e-07, + "loss": 0.0043, + "reward": 1.7922433018684387, + "reward_std": 0.1861533522605896, + "rewards/accuracy_reward": 0.8000558018684387, + "rewards/format_reward": 0.9921875, + "step": 370 + }, + { + "completion_length": 81.265625, + "epoch": 1.6940639269406392, + "grad_norm": 3.3165316581726074, + "kl": 0.094482421875, + "learning_rate": 8.305936073059361e-07, + "loss": 0.0038, + "reward": 1.5851722359657288, + "reward_std": 0.17648599669337273, + "rewards/accuracy_reward": 0.5851722061634064, + "rewards/format_reward": 1.0, + "step": 371 + }, + { + "completion_length": 84.890625, + "epoch": 1.6986301369863015, + "grad_norm": 3.0761749744415283, + "kl": 0.10693359375, + "learning_rate": 8.301369863013699e-07, + "loss": 0.0043, + "reward": 1.7469556331634521, + "reward_std": 0.1787155643105507, + "rewards/accuracy_reward": 0.770393043756485, + "rewards/format_reward": 0.9765625, + "step": 372 + }, + { + "completion_length": 50.25, + "epoch": 1.7031963470319633, + "grad_norm": 3.6592459678649902, + "kl": 0.15380859375, + "learning_rate": 8.296803652968036e-07, + "loss": 0.0062, + "reward": 1.6217397451400757, + "reward_std": 0.2100653052330017, + "rewards/accuracy_reward": 0.6217397451400757, + "rewards/format_reward": 1.0, + "step": 373 + }, + { + "completion_length": 69.7734375, + "epoch": 1.7077625570776256, + "grad_norm": 4.063467502593994, + "kl": 0.1123046875, + "learning_rate": 8.292237442922374e-07, + "loss": 0.0045, + "reward": 1.6185640096664429, + "reward_std": 0.17512290179729462, + "rewards/accuracy_reward": 0.6185639351606369, + "rewards/format_reward": 1.0, + "step": 374 + }, + { + "completion_length": 44.3125, + "epoch": 1.7123287671232876, + "grad_norm": 2.8335328102111816, + "kl": 0.1572265625, + "learning_rate": 8.287671232876712e-07, + "loss": 0.0063, + "reward": 1.528906226158142, + "reward_std": 0.238134503364563, + "rewards/accuracy_reward": 0.5289062559604645, + "rewards/format_reward": 1.0, + "step": 375 + }, + { + "completion_length": 47.7578125, + "epoch": 1.7168949771689497, + "grad_norm": 2.0584352016448975, + "kl": 0.14306640625, + "learning_rate": 8.283105022831049e-07, + "loss": 0.0057, + "reward": 1.5130573511123657, + "reward_std": 0.23582037538290024, + "rewards/accuracy_reward": 0.5130573809146881, + "rewards/format_reward": 1.0, + "step": 376 + }, + { + "completion_length": 81.203125, + "epoch": 1.721461187214612, + "grad_norm": 2.3844358921051025, + "kl": 0.11669921875, + "learning_rate": 8.278538812785387e-07, + "loss": 0.0047, + "reward": 1.616406261920929, + "reward_std": 0.22177018970251083, + "rewards/accuracy_reward": 0.6320312321186066, + "rewards/format_reward": 0.984375, + "step": 377 + }, + { + "completion_length": 70.109375, + "epoch": 1.726027397260274, + "grad_norm": 4.480605602264404, + "kl": 0.093994140625, + "learning_rate": 8.273972602739726e-07, + "loss": 0.0038, + "reward": 1.7281250357627869, + "reward_std": 0.13258251547813416, + "rewards/accuracy_reward": 0.7359374761581421, + "rewards/format_reward": 0.9921875, + "step": 378 + }, + { + "completion_length": 57.6015625, + "epoch": 1.730593607305936, + "grad_norm": 3.5465571880340576, + "kl": 0.12109375, + "learning_rate": 8.269406392694064e-07, + "loss": 0.0048, + "reward": 1.6325520873069763, + "reward_std": 0.20021257549524307, + "rewards/accuracy_reward": 0.6325520575046539, + "rewards/format_reward": 1.0, + "step": 379 + }, + { + "completion_length": 85.328125, + "epoch": 1.7351598173515983, + "grad_norm": 2.483445882797241, + "kl": 0.0963134765625, + "learning_rate": 8.264840182648402e-07, + "loss": 0.0038, + "reward": 1.5804979801177979, + "reward_std": 0.16095227003097534, + "rewards/accuracy_reward": 0.5804979428648949, + "rewards/format_reward": 1.0, + "step": 380 + }, + { + "completion_length": 64.453125, + "epoch": 1.7397260273972601, + "grad_norm": 2.9765961170196533, + "kl": 0.16650390625, + "learning_rate": 8.260273972602739e-07, + "loss": 0.0067, + "reward": 1.6214489340782166, + "reward_std": 0.20274285972118378, + "rewards/accuracy_reward": 0.6214488744735718, + "rewards/format_reward": 1.0, + "step": 381 + }, + { + "completion_length": 76.828125, + "epoch": 1.7442922374429224, + "grad_norm": 3.132500410079956, + "kl": 0.1435546875, + "learning_rate": 8.255707762557077e-07, + "loss": 0.0058, + "reward": 1.5574839115142822, + "reward_std": 0.22679631412029266, + "rewards/accuracy_reward": 0.5652963519096375, + "rewards/format_reward": 0.9921875, + "step": 382 + }, + { + "completion_length": 70.46875, + "epoch": 1.7488584474885844, + "grad_norm": 2.3076906204223633, + "kl": 0.114501953125, + "learning_rate": 8.251141552511415e-07, + "loss": 0.0046, + "reward": 1.688330888748169, + "reward_std": 0.13296211138367653, + "rewards/accuracy_reward": 0.6883308291435242, + "rewards/format_reward": 1.0, + "step": 383 + }, + { + "completion_length": 73.0078125, + "epoch": 1.7534246575342465, + "grad_norm": 4.062695026397705, + "kl": 0.114501953125, + "learning_rate": 8.246575342465753e-07, + "loss": 0.0046, + "reward": 1.5453124642372131, + "reward_std": 0.2351284772157669, + "rewards/accuracy_reward": 0.5609375238418579, + "rewards/format_reward": 0.984375, + "step": 384 + }, + { + "completion_length": 48.984375, + "epoch": 1.7579908675799087, + "grad_norm": 6.5755510330200195, + "kl": 0.20068359375, + "learning_rate": 8.242009132420092e-07, + "loss": 0.008, + "reward": 1.5984994769096375, + "reward_std": 0.3222559839487076, + "rewards/accuracy_reward": 0.5984995067119598, + "rewards/format_reward": 1.0, + "step": 385 + }, + { + "completion_length": 71.703125, + "epoch": 1.7625570776255708, + "grad_norm": 13.117231369018555, + "kl": 0.083251953125, + "learning_rate": 8.237442922374429e-07, + "loss": 0.0033, + "reward": 1.5032986402511597, + "reward_std": 0.21686269342899323, + "rewards/accuracy_reward": 0.5032986104488373, + "rewards/format_reward": 1.0, + "step": 386 + }, + { + "completion_length": 87.375, + "epoch": 1.7671232876712328, + "grad_norm": 2.233595371246338, + "kl": 0.107177734375, + "learning_rate": 8.232876712328767e-07, + "loss": 0.0043, + "reward": 1.701716125011444, + "reward_std": 0.11170049756765366, + "rewards/accuracy_reward": 0.7017160058021545, + "rewards/format_reward": 1.0, + "step": 387 + }, + { + "completion_length": 77.515625, + "epoch": 1.771689497716895, + "grad_norm": 4.436239719390869, + "kl": 0.1083984375, + "learning_rate": 8.228310502283105e-07, + "loss": 0.0043, + "reward": 1.6824799180030823, + "reward_std": 0.18607579916715622, + "rewards/accuracy_reward": 0.6902924478054047, + "rewards/format_reward": 0.9921875, + "step": 388 + }, + { + "completion_length": 60.2265625, + "epoch": 1.776255707762557, + "grad_norm": 2.3695435523986816, + "kl": 0.1494140625, + "learning_rate": 8.223744292237442e-07, + "loss": 0.006, + "reward": 1.7026600241661072, + "reward_std": 0.13800616562366486, + "rewards/accuracy_reward": 0.7104724645614624, + "rewards/format_reward": 0.9921875, + "step": 389 + }, + { + "completion_length": 71.140625, + "epoch": 1.7808219178082192, + "grad_norm": 4.455196380615234, + "kl": 0.15966796875, + "learning_rate": 8.21917808219178e-07, + "loss": 0.0064, + "reward": 1.6663504838943481, + "reward_std": 0.16872704774141312, + "rewards/accuracy_reward": 0.6663504242897034, + "rewards/format_reward": 1.0, + "step": 390 + }, + { + "completion_length": 65.8359375, + "epoch": 1.7853881278538812, + "grad_norm": 15.527393341064453, + "kl": 0.148681640625, + "learning_rate": 8.214611872146119e-07, + "loss": 0.0059, + "reward": 1.553125023841858, + "reward_std": 0.21595830470323563, + "rewards/accuracy_reward": 0.5531250089406967, + "rewards/format_reward": 1.0, + "step": 391 + }, + { + "completion_length": 73.0625, + "epoch": 1.7899543378995433, + "grad_norm": 4.076765537261963, + "kl": 0.101806640625, + "learning_rate": 8.210045662100456e-07, + "loss": 0.0041, + "reward": 1.6578125357627869, + "reward_std": 0.17282497137784958, + "rewards/accuracy_reward": 0.6578125059604645, + "rewards/format_reward": 1.0, + "step": 392 + }, + { + "completion_length": 64.3515625, + "epoch": 1.7945205479452055, + "grad_norm": 4.910162448883057, + "kl": 0.24609375, + "learning_rate": 8.205479452054795e-07, + "loss": 0.0099, + "reward": 1.5686192512512207, + "reward_std": 0.20994295924901962, + "rewards/accuracy_reward": 0.5764318108558655, + "rewards/format_reward": 0.9921875, + "step": 393 + }, + { + "completion_length": 71.3359375, + "epoch": 1.7990867579908676, + "grad_norm": 3.380282402038574, + "kl": 0.098876953125, + "learning_rate": 8.200913242009132e-07, + "loss": 0.004, + "reward": 1.6727213859558105, + "reward_std": 0.09240220487117767, + "rewards/accuracy_reward": 0.672721341252327, + "rewards/format_reward": 1.0, + "step": 394 + }, + { + "completion_length": 70.625, + "epoch": 1.8036529680365296, + "grad_norm": 15.939676284790039, + "kl": 0.1748046875, + "learning_rate": 8.19634703196347e-07, + "loss": 0.007, + "reward": 1.4804688096046448, + "reward_std": 0.22750268876552582, + "rewards/accuracy_reward": 0.4882812350988388, + "rewards/format_reward": 0.9921875, + "step": 395 + }, + { + "completion_length": 76.9453125, + "epoch": 1.808219178082192, + "grad_norm": 8.642084121704102, + "kl": 0.10009765625, + "learning_rate": 8.191780821917808e-07, + "loss": 0.004, + "reward": 1.6074219346046448, + "reward_std": 0.21606218069791794, + "rewards/accuracy_reward": 0.615234375, + "rewards/format_reward": 0.9921875, + "step": 396 + }, + { + "completion_length": 71.0546875, + "epoch": 1.8127853881278537, + "grad_norm": 3.1132235527038574, + "kl": 0.1259765625, + "learning_rate": 8.187214611872145e-07, + "loss": 0.005, + "reward": 1.6144831776618958, + "reward_std": 0.22148973494768143, + "rewards/accuracy_reward": 0.6144831776618958, + "rewards/format_reward": 1.0, + "step": 397 + }, + { + "completion_length": 64.6484375, + "epoch": 1.817351598173516, + "grad_norm": 5.543883800506592, + "kl": 0.135498046875, + "learning_rate": 8.182648401826484e-07, + "loss": 0.0054, + "reward": 1.5686756372451782, + "reward_std": 0.1985800489783287, + "rewards/accuracy_reward": 0.5764880925416946, + "rewards/format_reward": 0.9921875, + "step": 398 + }, + { + "completion_length": 59.421875, + "epoch": 1.821917808219178, + "grad_norm": 6.449603080749512, + "kl": 0.14892578125, + "learning_rate": 8.178082191780822e-07, + "loss": 0.006, + "reward": 1.6453125476837158, + "reward_std": 0.1793966293334961, + "rewards/accuracy_reward": 0.6531250476837158, + "rewards/format_reward": 0.9921875, + "step": 399 + }, + { + "completion_length": 65.0859375, + "epoch": 1.82648401826484, + "grad_norm": 3.558039665222168, + "kl": 0.121826171875, + "learning_rate": 8.173515981735159e-07, + "loss": 0.0049, + "reward": 1.5231584310531616, + "reward_std": 0.27039487659931183, + "rewards/accuracy_reward": 0.5309710204601288, + "rewards/format_reward": 0.9921875, + "step": 400 + }, + { + "completion_length": 74.875, + "epoch": 1.8310502283105023, + "grad_norm": 2.6240084171295166, + "kl": 0.138427734375, + "learning_rate": 8.168949771689498e-07, + "loss": 0.0055, + "reward": 1.5302269458770752, + "reward_std": 0.19186384975910187, + "rewards/accuracy_reward": 0.5302269384264946, + "rewards/format_reward": 1.0, + "step": 401 + }, + { + "completion_length": 72.109375, + "epoch": 1.8356164383561644, + "grad_norm": 2.054145574569702, + "kl": 0.11572265625, + "learning_rate": 8.164383561643835e-07, + "loss": 0.0046, + "reward": 1.7239583730697632, + "reward_std": 0.13660814613103867, + "rewards/accuracy_reward": 0.723958283662796, + "rewards/format_reward": 1.0, + "step": 402 + }, + { + "completion_length": 93.1484375, + "epoch": 1.8401826484018264, + "grad_norm": 2.950429916381836, + "kl": 0.108642578125, + "learning_rate": 8.159817351598172e-07, + "loss": 0.0043, + "reward": 1.734375, + "reward_std": 0.11295716743916273, + "rewards/accuracy_reward": 0.7343749403953552, + "rewards/format_reward": 1.0, + "step": 403 + }, + { + "completion_length": 68.5546875, + "epoch": 1.8447488584474887, + "grad_norm": 2.904849052429199, + "kl": 0.154541015625, + "learning_rate": 8.155251141552512e-07, + "loss": 0.0062, + "reward": 1.6253038048744202, + "reward_std": 0.26257922500371933, + "rewards/accuracy_reward": 0.6409288048744202, + "rewards/format_reward": 0.984375, + "step": 404 + }, + { + "completion_length": 80.0390625, + "epoch": 1.8493150684931505, + "grad_norm": 2.698310136795044, + "kl": 0.10546875, + "learning_rate": 8.150684931506849e-07, + "loss": 0.0042, + "reward": 1.6187500357627869, + "reward_std": 0.1505398042500019, + "rewards/accuracy_reward": 0.6343750059604645, + "rewards/format_reward": 0.984375, + "step": 405 + }, + { + "completion_length": 88.8046875, + "epoch": 1.8538812785388128, + "grad_norm": 3.7120022773742676, + "kl": 0.091064453125, + "learning_rate": 8.146118721461187e-07, + "loss": 0.0036, + "reward": 1.6648437976837158, + "reward_std": 0.16381803154945374, + "rewards/accuracy_reward": 0.6882811784744263, + "rewards/format_reward": 0.9765625, + "step": 406 + }, + { + "completion_length": 82.1171875, + "epoch": 1.8584474885844748, + "grad_norm": 17.568010330200195, + "kl": 0.09521484375, + "learning_rate": 8.141552511415525e-07, + "loss": 0.0038, + "reward": 1.5686570405960083, + "reward_std": 0.27213824540376663, + "rewards/accuracy_reward": 0.5920945107936859, + "rewards/format_reward": 0.9765625, + "step": 407 + }, + { + "completion_length": 72.0390625, + "epoch": 1.8630136986301369, + "grad_norm": 5.123001575469971, + "kl": 0.108642578125, + "learning_rate": 8.136986301369862e-07, + "loss": 0.0044, + "reward": 1.6002225279808044, + "reward_std": 0.1988915428519249, + "rewards/accuracy_reward": 0.6002225577831268, + "rewards/format_reward": 1.0, + "step": 408 + }, + { + "completion_length": 65.7890625, + "epoch": 1.8675799086757991, + "grad_norm": 3.825585126876831, + "kl": 0.12646484375, + "learning_rate": 8.132420091324201e-07, + "loss": 0.0051, + "reward": 1.5363582372665405, + "reward_std": 0.22431423515081406, + "rewards/accuracy_reward": 0.5363581627607346, + "rewards/format_reward": 1.0, + "step": 409 + }, + { + "completion_length": 84.0859375, + "epoch": 1.8721461187214612, + "grad_norm": 2.313400983810425, + "kl": 0.123779296875, + "learning_rate": 8.127853881278538e-07, + "loss": 0.005, + "reward": 1.6218750476837158, + "reward_std": 0.28108011931180954, + "rewards/accuracy_reward": 0.7078124582767487, + "rewards/format_reward": 0.9140625, + "step": 410 + }, + { + "completion_length": 88.03125, + "epoch": 1.8767123287671232, + "grad_norm": 4.612303256988525, + "kl": 0.132568359375, + "learning_rate": 8.123287671232877e-07, + "loss": 0.0053, + "reward": 1.662500023841858, + "reward_std": 0.18329600244760513, + "rewards/accuracy_reward": 0.6781249642372131, + "rewards/format_reward": 0.984375, + "step": 411 + }, + { + "completion_length": 60.7265625, + "epoch": 1.8812785388127855, + "grad_norm": 2.7391302585601807, + "kl": 0.14794921875, + "learning_rate": 8.118721461187215e-07, + "loss": 0.0059, + "reward": 1.648708462715149, + "reward_std": 0.15135541558265686, + "rewards/accuracy_reward": 0.6487084329128265, + "rewards/format_reward": 1.0, + "step": 412 + }, + { + "completion_length": 103.84375, + "epoch": 1.8858447488584473, + "grad_norm": 1.170571208000183, + "kl": 0.0670166015625, + "learning_rate": 8.114155251141552e-07, + "loss": 0.0027, + "reward": 1.811079502105713, + "reward_std": 0.0749700665473938, + "rewards/accuracy_reward": 0.8267044425010681, + "rewards/format_reward": 0.984375, + "step": 413 + }, + { + "completion_length": 70.9921875, + "epoch": 1.8904109589041096, + "grad_norm": 3.7362253665924072, + "kl": 0.14404296875, + "learning_rate": 8.10958904109589e-07, + "loss": 0.0058, + "reward": 1.6984003186225891, + "reward_std": 0.19452574849128723, + "rewards/accuracy_reward": 0.7062127590179443, + "rewards/format_reward": 0.9921875, + "step": 414 + }, + { + "completion_length": 66.953125, + "epoch": 1.8949771689497716, + "grad_norm": 4.104921340942383, + "kl": 0.133056640625, + "learning_rate": 8.105022831050228e-07, + "loss": 0.0053, + "reward": 1.5964038372039795, + "reward_std": 0.2661294490098953, + "rewards/accuracy_reward": 0.6042163074016571, + "rewards/format_reward": 0.9921875, + "step": 415 + }, + { + "completion_length": 67.984375, + "epoch": 1.8995433789954337, + "grad_norm": 5.06357479095459, + "kl": 0.11669921875, + "learning_rate": 8.100456621004565e-07, + "loss": 0.0047, + "reward": 1.6529513597488403, + "reward_std": 0.16535750776529312, + "rewards/accuracy_reward": 0.6529513597488403, + "rewards/format_reward": 1.0, + "step": 416 + }, + { + "completion_length": 72.1640625, + "epoch": 1.904109589041096, + "grad_norm": 9.87835693359375, + "kl": 0.137451171875, + "learning_rate": 8.095890410958903e-07, + "loss": 0.0055, + "reward": 1.6302083134651184, + "reward_std": 0.21513652801513672, + "rewards/accuracy_reward": 0.6458333134651184, + "rewards/format_reward": 0.984375, + "step": 417 + }, + { + "completion_length": 73.6015625, + "epoch": 1.908675799086758, + "grad_norm": 10.113238334655762, + "kl": 0.142333984375, + "learning_rate": 8.091324200913242e-07, + "loss": 0.0057, + "reward": 1.6189236044883728, + "reward_std": 0.23221635073423386, + "rewards/accuracy_reward": 0.6345485746860504, + "rewards/format_reward": 0.984375, + "step": 418 + }, + { + "completion_length": 62.078125, + "epoch": 1.91324200913242, + "grad_norm": 3.4071831703186035, + "kl": 0.1513671875, + "learning_rate": 8.08675799086758e-07, + "loss": 0.0061, + "reward": 1.356249988079071, + "reward_std": 0.3108007460832596, + "rewards/accuracy_reward": 0.36406250298023224, + "rewards/format_reward": 0.9921875, + "step": 419 + }, + { + "completion_length": 83.203125, + "epoch": 1.9178082191780823, + "grad_norm": 3.302225351333618, + "kl": 0.2509765625, + "learning_rate": 8.082191780821918e-07, + "loss": 0.0101, + "reward": 1.6587789058685303, + "reward_std": 0.1820889264345169, + "rewards/accuracy_reward": 0.6665914356708527, + "rewards/format_reward": 0.9921875, + "step": 420 + }, + { + "completion_length": 76.1015625, + "epoch": 1.9223744292237441, + "grad_norm": 4.783244609832764, + "kl": 0.1474609375, + "learning_rate": 8.077625570776255e-07, + "loss": 0.0059, + "reward": 1.7332961559295654, + "reward_std": 0.11496374011039734, + "rewards/accuracy_reward": 0.733296126127243, + "rewards/format_reward": 1.0, + "step": 421 + }, + { + "completion_length": 94.6484375, + "epoch": 1.9269406392694064, + "grad_norm": 2.7827274799346924, + "kl": 0.09375, + "learning_rate": 8.073059360730593e-07, + "loss": 0.0038, + "reward": 1.8006826043128967, + "reward_std": 0.09438200853765011, + "rewards/accuracy_reward": 0.800682544708252, + "rewards/format_reward": 1.0, + "step": 422 + }, + { + "completion_length": 70.4375, + "epoch": 1.9315068493150684, + "grad_norm": 4.186135768890381, + "kl": 0.16455078125, + "learning_rate": 8.068493150684931e-07, + "loss": 0.0066, + "reward": 1.6078130006790161, + "reward_std": 0.21306797862052917, + "rewards/accuracy_reward": 0.6078130900859833, + "rewards/format_reward": 1.0, + "step": 423 + }, + { + "completion_length": 77.6953125, + "epoch": 1.9360730593607305, + "grad_norm": 2.545297861099243, + "kl": 0.111083984375, + "learning_rate": 8.063926940639269e-07, + "loss": 0.0045, + "reward": 1.6678841710090637, + "reward_std": 0.11376722529530525, + "rewards/accuracy_reward": 0.6756967306137085, + "rewards/format_reward": 0.9921875, + "step": 424 + }, + { + "completion_length": 87.1171875, + "epoch": 1.9406392694063928, + "grad_norm": 2.4129421710968018, + "kl": 0.080078125, + "learning_rate": 8.059360730593608e-07, + "loss": 0.0032, + "reward": 1.6949777007102966, + "reward_std": 0.14730913192033768, + "rewards/accuracy_reward": 0.7027901709079742, + "rewards/format_reward": 0.9921875, + "step": 425 + }, + { + "completion_length": 78.53125, + "epoch": 1.9452054794520548, + "grad_norm": 2.179104804992676, + "kl": 0.120361328125, + "learning_rate": 8.054794520547945e-07, + "loss": 0.0048, + "reward": 1.7349414825439453, + "reward_std": 0.20185434818267822, + "rewards/accuracy_reward": 0.7505663931369781, + "rewards/format_reward": 0.984375, + "step": 426 + }, + { + "completion_length": 70.2578125, + "epoch": 1.9497716894977168, + "grad_norm": 3.5189919471740723, + "kl": 0.15869140625, + "learning_rate": 8.050228310502283e-07, + "loss": 0.0064, + "reward": 1.6359375715255737, + "reward_std": 0.1778659224510193, + "rewards/accuracy_reward": 0.6359374523162842, + "rewards/format_reward": 1.0, + "step": 427 + }, + { + "completion_length": 91.140625, + "epoch": 1.954337899543379, + "grad_norm": 4.333745002746582, + "kl": 0.07861328125, + "learning_rate": 8.045662100456621e-07, + "loss": 0.0031, + "reward": 1.6484509706497192, + "reward_std": 0.23022788017988205, + "rewards/accuracy_reward": 0.6797009706497192, + "rewards/format_reward": 0.96875, + "step": 428 + }, + { + "completion_length": 75.453125, + "epoch": 1.958904109589041, + "grad_norm": 3.482632637023926, + "kl": 0.12841796875, + "learning_rate": 8.041095890410958e-07, + "loss": 0.0051, + "reward": 1.6106771230697632, + "reward_std": 0.23327118158340454, + "rewards/accuracy_reward": 0.626302033662796, + "rewards/format_reward": 0.984375, + "step": 429 + }, + { + "completion_length": 87.984375, + "epoch": 1.9634703196347032, + "grad_norm": 1.5310007333755493, + "kl": 0.12451171875, + "learning_rate": 8.036529680365296e-07, + "loss": 0.005, + "reward": 1.7687500715255737, + "reward_std": 0.1117947231978178, + "rewards/accuracy_reward": 0.7765624523162842, + "rewards/format_reward": 0.9921875, + "step": 430 + }, + { + "completion_length": 74.0625, + "epoch": 1.9680365296803652, + "grad_norm": 2.0865707397460938, + "kl": 0.120849609375, + "learning_rate": 8.031963470319635e-07, + "loss": 0.0048, + "reward": 1.7373016476631165, + "reward_std": 0.1270090974867344, + "rewards/accuracy_reward": 0.7373015582561493, + "rewards/format_reward": 1.0, + "step": 431 + }, + { + "completion_length": 74.6953125, + "epoch": 1.9726027397260273, + "grad_norm": 2.8522982597351074, + "kl": 0.104248046875, + "learning_rate": 8.027397260273972e-07, + "loss": 0.0042, + "reward": 1.6342076063156128, + "reward_std": 0.18551241606473923, + "rewards/accuracy_reward": 0.6498326063156128, + "rewards/format_reward": 0.984375, + "step": 432 + }, + { + "completion_length": 86.6484375, + "epoch": 1.9771689497716896, + "grad_norm": 2.4067180156707764, + "kl": 0.10009765625, + "learning_rate": 8.022831050228311e-07, + "loss": 0.004, + "reward": 1.6945313215255737, + "reward_std": 0.11847387999296188, + "rewards/accuracy_reward": 0.694531261920929, + "rewards/format_reward": 1.0, + "step": 433 + }, + { + "completion_length": 80.296875, + "epoch": 1.9817351598173516, + "grad_norm": 2.204301357269287, + "kl": 0.12890625, + "learning_rate": 8.018264840182648e-07, + "loss": 0.0052, + "reward": 1.6941722631454468, + "reward_std": 0.16894984245300293, + "rewards/accuracy_reward": 0.7176096439361572, + "rewards/format_reward": 0.9765625, + "step": 434 + }, + { + "completion_length": 69.578125, + "epoch": 1.9863013698630136, + "grad_norm": 3.0245296955108643, + "kl": 0.127197265625, + "learning_rate": 8.013698630136985e-07, + "loss": 0.0051, + "reward": 1.627698838710785, + "reward_std": 0.1703873947262764, + "rewards/accuracy_reward": 0.6276988387107849, + "rewards/format_reward": 1.0, + "step": 435 + }, + { + "completion_length": 59.9140625, + "epoch": 1.990867579908676, + "grad_norm": 3.451076030731201, + "kl": 0.14404296875, + "learning_rate": 8.009132420091324e-07, + "loss": 0.0058, + "reward": 1.5428819060325623, + "reward_std": 0.2504672184586525, + "rewards/accuracy_reward": 0.542881965637207, + "rewards/format_reward": 1.0, + "step": 436 + }, + { + "completion_length": 84.5625, + "epoch": 1.9954337899543377, + "grad_norm": 2.452648639678955, + "kl": 0.08056640625, + "learning_rate": 8.004566210045661e-07, + "loss": 0.0032, + "reward": 1.6554688215255737, + "reward_std": 0.2031169831752777, + "rewards/accuracy_reward": 0.6710937917232513, + "rewards/format_reward": 0.984375, + "step": 437 + }, + { + "completion_length": 45.5, + "epoch": 2.0, + "grad_norm": 1.8669296503067017, + "kl": 0.126953125, + "learning_rate": 8e-07, + "loss": 0.0047, + "reward": 1.875, + "reward_std": 0.43671509623527527, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 438 + }, + { + "completion_length": 79.21875, + "epoch": 2.0045662100456623, + "grad_norm": 1.4508730173110962, + "kl": 0.127685546875, + "learning_rate": 7.995433789954338e-07, + "loss": 0.0051, + "reward": 1.7248697876930237, + "reward_std": 0.1105603277683258, + "rewards/accuracy_reward": 0.7404947876930237, + "rewards/format_reward": 0.984375, + "step": 439 + }, + { + "completion_length": 62.09375, + "epoch": 2.009132420091324, + "grad_norm": 2.7377336025238037, + "kl": 0.1376953125, + "learning_rate": 7.990867579908675e-07, + "loss": 0.0055, + "reward": 1.3338541984558105, + "reward_std": 0.3162979334592819, + "rewards/accuracy_reward": 0.34947916865348816, + "rewards/format_reward": 0.984375, + "step": 440 + }, + { + "completion_length": 87.34375, + "epoch": 2.0136986301369864, + "grad_norm": 2.2570443153381348, + "kl": 0.12060546875, + "learning_rate": 7.986301369863014e-07, + "loss": 0.0048, + "reward": 1.7250688076019287, + "reward_std": 0.08361868560314178, + "rewards/accuracy_reward": 0.7250687181949615, + "rewards/format_reward": 1.0, + "step": 441 + }, + { + "completion_length": 94.140625, + "epoch": 2.018264840182648, + "grad_norm": 8.999687194824219, + "kl": 0.10595703125, + "learning_rate": 7.981735159817351e-07, + "loss": 0.0042, + "reward": 1.6828125715255737, + "reward_std": 0.16398613899946213, + "rewards/accuracy_reward": 0.7062499523162842, + "rewards/format_reward": 0.9765625, + "step": 442 + }, + { + "completion_length": 69.984375, + "epoch": 2.0228310502283104, + "grad_norm": 2.834467649459839, + "kl": 0.096923828125, + "learning_rate": 7.977168949771688e-07, + "loss": 0.0039, + "reward": 1.5455728769302368, + "reward_std": 0.25657252967357635, + "rewards/accuracy_reward": 0.5533854067325592, + "rewards/format_reward": 0.9921875, + "step": 443 + }, + { + "completion_length": 98.921875, + "epoch": 2.0273972602739727, + "grad_norm": 2.6678521633148193, + "kl": 0.082275390625, + "learning_rate": 7.972602739726027e-07, + "loss": 0.0033, + "reward": 1.663573145866394, + "reward_std": 0.1522715613245964, + "rewards/accuracy_reward": 0.6635731160640717, + "rewards/format_reward": 1.0, + "step": 444 + }, + { + "completion_length": 80.84375, + "epoch": 2.0319634703196345, + "grad_norm": 1.9825059175491333, + "kl": 0.09326171875, + "learning_rate": 7.968036529680365e-07, + "loss": 0.0037, + "reward": 1.603124976158142, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.6109374761581421, + "rewards/format_reward": 0.9921875, + "step": 445 + }, + { + "completion_length": 78.09375, + "epoch": 2.036529680365297, + "grad_norm": 5.509555816650391, + "kl": 0.1341552734375, + "learning_rate": 7.963470319634703e-07, + "loss": 0.0054, + "reward": 1.6062500476837158, + "reward_std": 0.24452467262744904, + "rewards/accuracy_reward": 0.621874988079071, + "rewards/format_reward": 0.984375, + "step": 446 + }, + { + "completion_length": 59.6484375, + "epoch": 2.041095890410959, + "grad_norm": 4.631182670593262, + "kl": 0.154296875, + "learning_rate": 7.958904109589041e-07, + "loss": 0.0062, + "reward": 1.6522735357284546, + "reward_std": 0.24951402842998505, + "rewards/accuracy_reward": 0.667898565530777, + "rewards/format_reward": 0.984375, + "step": 447 + }, + { + "completion_length": 69.25, + "epoch": 2.045662100456621, + "grad_norm": 3.0625455379486084, + "kl": 0.152099609375, + "learning_rate": 7.954337899543378e-07, + "loss": 0.0061, + "reward": 1.6024305820465088, + "reward_std": 0.21730707585811615, + "rewards/accuracy_reward": 0.6102430522441864, + "rewards/format_reward": 0.9921875, + "step": 448 + }, + { + "completion_length": 66.5859375, + "epoch": 2.050228310502283, + "grad_norm": 7.600069522857666, + "kl": 0.118408203125, + "learning_rate": 7.949771689497717e-07, + "loss": 0.0047, + "reward": 1.7647135853767395, + "reward_std": 0.16752880066633224, + "rewards/accuracy_reward": 0.7725259959697723, + "rewards/format_reward": 0.9921875, + "step": 449 + }, + { + "completion_length": 83.8828125, + "epoch": 2.0547945205479454, + "grad_norm": 4.596118450164795, + "kl": 0.149658203125, + "learning_rate": 7.945205479452054e-07, + "loss": 0.006, + "reward": 1.6769480109214783, + "reward_std": 0.1898738443851471, + "rewards/accuracy_reward": 0.684760570526123, + "rewards/format_reward": 0.9921875, + "step": 450 + }, + { + "completion_length": 48.546875, + "epoch": 2.0593607305936072, + "grad_norm": 3.093118906021118, + "kl": 0.171875, + "learning_rate": 7.940639269406393e-07, + "loss": 0.0069, + "reward": 1.6568829417228699, + "reward_std": 0.20424779504537582, + "rewards/accuracy_reward": 0.656883031129837, + "rewards/format_reward": 1.0, + "step": 451 + }, + { + "completion_length": 73.203125, + "epoch": 2.0639269406392695, + "grad_norm": 3.5054690837860107, + "kl": 0.135498046875, + "learning_rate": 7.936073059360731e-07, + "loss": 0.0054, + "reward": 1.6708519458770752, + "reward_std": 0.18046213686466217, + "rewards/accuracy_reward": 0.6708519458770752, + "rewards/format_reward": 1.0, + "step": 452 + }, + { + "completion_length": 78.1953125, + "epoch": 2.0684931506849313, + "grad_norm": 1.87969172000885, + "kl": 0.096923828125, + "learning_rate": 7.931506849315068e-07, + "loss": 0.0039, + "reward": 1.6500000953674316, + "reward_std": 0.1841355338692665, + "rewards/accuracy_reward": 0.6656249761581421, + "rewards/format_reward": 0.984375, + "step": 453 + }, + { + "completion_length": 66.203125, + "epoch": 2.0730593607305936, + "grad_norm": 3.03586483001709, + "kl": 0.116455078125, + "learning_rate": 7.926940639269406e-07, + "loss": 0.0047, + "reward": 1.5984020233154297, + "reward_std": 0.2402767539024353, + "rewards/accuracy_reward": 0.6218394637107849, + "rewards/format_reward": 0.9765625, + "step": 454 + }, + { + "completion_length": 60.4453125, + "epoch": 2.077625570776256, + "grad_norm": 5.758031368255615, + "kl": 0.15185546875, + "learning_rate": 7.922374429223744e-07, + "loss": 0.0061, + "reward": 1.6396695375442505, + "reward_std": 0.18095022439956665, + "rewards/accuracy_reward": 0.6396694481372833, + "rewards/format_reward": 1.0, + "step": 455 + }, + { + "completion_length": 80.0703125, + "epoch": 2.0821917808219177, + "grad_norm": 2.098630905151367, + "kl": 0.105712890625, + "learning_rate": 7.917808219178081e-07, + "loss": 0.0042, + "reward": 1.6691096425056458, + "reward_std": 0.1658947691321373, + "rewards/accuracy_reward": 0.684734582901001, + "rewards/format_reward": 0.984375, + "step": 456 + }, + { + "completion_length": 72.2734375, + "epoch": 2.08675799086758, + "grad_norm": 2.7200841903686523, + "kl": 0.11083984375, + "learning_rate": 7.91324200913242e-07, + "loss": 0.0044, + "reward": 1.5380051136016846, + "reward_std": 0.2077661082148552, + "rewards/accuracy_reward": 0.5458175092935562, + "rewards/format_reward": 0.9921875, + "step": 457 + }, + { + "completion_length": 85.140625, + "epoch": 2.091324200913242, + "grad_norm": 12.362757682800293, + "kl": 0.091552734375, + "learning_rate": 7.908675799086758e-07, + "loss": 0.0037, + "reward": 1.6828125715255737, + "reward_std": 0.19778337329626083, + "rewards/accuracy_reward": 0.6984374821186066, + "rewards/format_reward": 0.984375, + "step": 458 + }, + { + "completion_length": 84.1875, + "epoch": 2.095890410958904, + "grad_norm": 2.426860809326172, + "kl": 0.10595703125, + "learning_rate": 7.904109589041096e-07, + "loss": 0.0042, + "reward": 1.6260236501693726, + "reward_std": 0.18746323138475418, + "rewards/accuracy_reward": 0.6338361203670502, + "rewards/format_reward": 0.9921875, + "step": 459 + }, + { + "completion_length": 91.34375, + "epoch": 2.1004566210045663, + "grad_norm": 9.778014183044434, + "kl": 0.09765625, + "learning_rate": 7.899543378995434e-07, + "loss": 0.0039, + "reward": 1.660937488079071, + "reward_std": 0.19728107750415802, + "rewards/accuracy_reward": 0.6921874582767487, + "rewards/format_reward": 0.96875, + "step": 460 + }, + { + "completion_length": 60.7265625, + "epoch": 2.105022831050228, + "grad_norm": 12.834918975830078, + "kl": 0.32470703125, + "learning_rate": 7.894977168949771e-07, + "loss": 0.0129, + "reward": 1.7225513458251953, + "reward_std": 0.1815098226070404, + "rewards/accuracy_reward": 0.7225514352321625, + "rewards/format_reward": 1.0, + "step": 461 + }, + { + "completion_length": 65.0546875, + "epoch": 2.1095890410958904, + "grad_norm": 3.4787039756774902, + "kl": 0.1533203125, + "learning_rate": 7.890410958904109e-07, + "loss": 0.0061, + "reward": 1.6404947638511658, + "reward_std": 0.19807539880275726, + "rewards/accuracy_reward": 0.6483072936534882, + "rewards/format_reward": 0.9921875, + "step": 462 + }, + { + "completion_length": 82.46875, + "epoch": 2.1141552511415527, + "grad_norm": 2.7784857749938965, + "kl": 0.1136474609375, + "learning_rate": 7.885844748858447e-07, + "loss": 0.0045, + "reward": 1.6932291984558105, + "reward_std": 0.14191660657525063, + "rewards/accuracy_reward": 0.6932291388511658, + "rewards/format_reward": 1.0, + "step": 463 + }, + { + "completion_length": 78.5, + "epoch": 2.1187214611872145, + "grad_norm": 2.705900192260742, + "kl": 0.098388671875, + "learning_rate": 7.881278538812784e-07, + "loss": 0.0039, + "reward": 1.6399182081222534, + "reward_std": 0.18248122185468674, + "rewards/accuracy_reward": 0.6477306485176086, + "rewards/format_reward": 0.9921875, + "step": 464 + }, + { + "completion_length": 88.0703125, + "epoch": 2.1232876712328768, + "grad_norm": 2.8577768802642822, + "kl": 0.099609375, + "learning_rate": 7.876712328767124e-07, + "loss": 0.004, + "reward": 1.688281238079071, + "reward_std": 0.19159993529319763, + "rewards/accuracy_reward": 0.6960937082767487, + "rewards/format_reward": 0.9921875, + "step": 465 + }, + { + "completion_length": 66.6015625, + "epoch": 2.127853881278539, + "grad_norm": 2.5234570503234863, + "kl": 0.12158203125, + "learning_rate": 7.872146118721461e-07, + "loss": 0.0049, + "reward": 1.5919778943061829, + "reward_std": 0.24576038867235184, + "rewards/accuracy_reward": 0.6076028943061829, + "rewards/format_reward": 0.984375, + "step": 466 + }, + { + "completion_length": 75.390625, + "epoch": 2.132420091324201, + "grad_norm": 3.172330141067505, + "kl": 0.1142578125, + "learning_rate": 7.867579908675798e-07, + "loss": 0.0046, + "reward": 1.4447365403175354, + "reward_std": 0.22060814499855042, + "rewards/accuracy_reward": 0.452549085021019, + "rewards/format_reward": 0.9921875, + "step": 467 + }, + { + "completion_length": 91.1640625, + "epoch": 2.136986301369863, + "grad_norm": 2.385173797607422, + "kl": 0.0966796875, + "learning_rate": 7.863013698630137e-07, + "loss": 0.0039, + "reward": 1.7272321581840515, + "reward_std": 0.13396714627742767, + "rewards/accuracy_reward": 0.7350445687770844, + "rewards/format_reward": 0.9921875, + "step": 468 + }, + { + "completion_length": 75.5234375, + "epoch": 2.141552511415525, + "grad_norm": 2.3278234004974365, + "kl": 0.130126953125, + "learning_rate": 7.858447488584474e-07, + "loss": 0.0052, + "reward": 1.643117606639862, + "reward_std": 0.1804744228720665, + "rewards/accuracy_reward": 0.6509300172328949, + "rewards/format_reward": 0.9921875, + "step": 469 + }, + { + "completion_length": 63.9453125, + "epoch": 2.146118721461187, + "grad_norm": 5.3924360275268555, + "kl": 0.16552734375, + "learning_rate": 7.853881278538812e-07, + "loss": 0.0066, + "reward": 1.5500783324241638, + "reward_std": 0.25609923899173737, + "rewards/accuracy_reward": 0.5735158026218414, + "rewards/format_reward": 0.9765625, + "step": 470 + }, + { + "completion_length": 86.0625, + "epoch": 2.1506849315068495, + "grad_norm": 2.030338764190674, + "kl": 0.1171875, + "learning_rate": 7.849315068493151e-07, + "loss": 0.0047, + "reward": 1.6851562857627869, + "reward_std": 0.14745555073022842, + "rewards/accuracy_reward": 0.6929687559604645, + "rewards/format_reward": 0.9921875, + "step": 471 + }, + { + "completion_length": 66.0, + "epoch": 2.1552511415525113, + "grad_norm": 18.199661254882812, + "kl": 0.13330078125, + "learning_rate": 7.844748858447488e-07, + "loss": 0.0053, + "reward": 1.486718773841858, + "reward_std": 0.32600878179073334, + "rewards/accuracy_reward": 0.5101562440395355, + "rewards/format_reward": 0.9765625, + "step": 472 + }, + { + "completion_length": 93.5625, + "epoch": 2.1598173515981736, + "grad_norm": 3.143573760986328, + "kl": 0.07470703125, + "learning_rate": 7.840182648401827e-07, + "loss": 0.003, + "reward": 1.7031250596046448, + "reward_std": 0.18717344850301743, + "rewards/accuracy_reward": 0.7109374701976776, + "rewards/format_reward": 0.9921875, + "step": 473 + }, + { + "completion_length": 68.9453125, + "epoch": 2.1643835616438354, + "grad_norm": 3.1616029739379883, + "kl": 0.121337890625, + "learning_rate": 7.835616438356164e-07, + "loss": 0.0049, + "reward": 1.6342764496803284, + "reward_std": 0.23029568046331406, + "rewards/accuracy_reward": 0.6499014496803284, + "rewards/format_reward": 0.984375, + "step": 474 + }, + { + "completion_length": 80.5859375, + "epoch": 2.1689497716894977, + "grad_norm": 2.141812324523926, + "kl": 0.11572265625, + "learning_rate": 7.831050228310501e-07, + "loss": 0.0046, + "reward": 1.7305381298065186, + "reward_std": 0.19372030347585678, + "rewards/accuracy_reward": 0.7383506596088409, + "rewards/format_reward": 0.9921875, + "step": 475 + }, + { + "completion_length": 89.4921875, + "epoch": 2.17351598173516, + "grad_norm": 3.011232852935791, + "kl": 0.098876953125, + "learning_rate": 7.82648401826484e-07, + "loss": 0.004, + "reward": 1.60247403383255, + "reward_std": 0.28417903184890747, + "rewards/accuracy_reward": 0.6415364444255829, + "rewards/format_reward": 0.9609375, + "step": 476 + }, + { + "completion_length": 88.90625, + "epoch": 2.1780821917808217, + "grad_norm": 1.868679165840149, + "kl": 0.101806640625, + "learning_rate": 7.821917808219177e-07, + "loss": 0.0041, + "reward": 1.5997712016105652, + "reward_std": 0.1929171234369278, + "rewards/accuracy_reward": 0.5997711420059204, + "rewards/format_reward": 1.0, + "step": 477 + }, + { + "completion_length": 102.1640625, + "epoch": 2.182648401826484, + "grad_norm": 3.3878626823425293, + "kl": 0.11962890625, + "learning_rate": 7.817351598173516e-07, + "loss": 0.0048, + "reward": 1.7378038167953491, + "reward_std": 0.13828756287693977, + "rewards/accuracy_reward": 0.7456162571907043, + "rewards/format_reward": 0.9921875, + "step": 478 + }, + { + "completion_length": 73.65625, + "epoch": 2.1872146118721463, + "grad_norm": 3.133633613586426, + "kl": 0.1357421875, + "learning_rate": 7.812785388127854e-07, + "loss": 0.0054, + "reward": 1.7677083611488342, + "reward_std": 0.18527808785438538, + "rewards/accuracy_reward": 0.7911458313465118, + "rewards/format_reward": 0.9765625, + "step": 479 + }, + { + "completion_length": 65.0703125, + "epoch": 2.191780821917808, + "grad_norm": 2.9012463092803955, + "kl": 0.14794921875, + "learning_rate": 7.808219178082191e-07, + "loss": 0.0059, + "reward": 1.5422247648239136, + "reward_std": 0.2771962434053421, + "rewards/accuracy_reward": 0.5500372052192688, + "rewards/format_reward": 0.9921875, + "step": 480 + }, + { + "completion_length": 80.53125, + "epoch": 2.1963470319634704, + "grad_norm": 7.046329498291016, + "kl": 0.103271484375, + "learning_rate": 7.80365296803653e-07, + "loss": 0.0041, + "reward": 1.66796875, + "reward_std": 0.1984855979681015, + "rewards/accuracy_reward": 0.6679687798023224, + "rewards/format_reward": 1.0, + "step": 481 + }, + { + "completion_length": 81.921875, + "epoch": 2.2009132420091326, + "grad_norm": 2.9581196308135986, + "kl": 0.108642578125, + "learning_rate": 7.799086757990867e-07, + "loss": 0.0043, + "reward": 1.6520833373069763, + "reward_std": 0.23736542463302612, + "rewards/accuracy_reward": 0.6598958075046539, + "rewards/format_reward": 0.9921875, + "step": 482 + }, + { + "completion_length": 79.78125, + "epoch": 2.2054794520547945, + "grad_norm": 8.972982406616211, + "kl": 0.10693359375, + "learning_rate": 7.794520547945204e-07, + "loss": 0.0043, + "reward": 1.5802912712097168, + "reward_std": 0.21900298446416855, + "rewards/accuracy_reward": 0.580291211605072, + "rewards/format_reward": 1.0, + "step": 483 + }, + { + "completion_length": 83.5703125, + "epoch": 2.2100456621004567, + "grad_norm": 2.1765618324279785, + "kl": 0.12548828125, + "learning_rate": 7.789954337899543e-07, + "loss": 0.005, + "reward": 1.6838542222976685, + "reward_std": 0.21648824214935303, + "rewards/accuracy_reward": 0.6838541924953461, + "rewards/format_reward": 1.0, + "step": 484 + }, + { + "completion_length": 82.1171875, + "epoch": 2.2146118721461185, + "grad_norm": 6.6184563636779785, + "kl": 0.13916015625, + "learning_rate": 7.785388127853881e-07, + "loss": 0.0056, + "reward": 1.576785683631897, + "reward_std": 0.2656140699982643, + "rewards/accuracy_reward": 0.615848183631897, + "rewards/format_reward": 0.9609375, + "step": 485 + }, + { + "completion_length": 79.90625, + "epoch": 2.219178082191781, + "grad_norm": 3.397468328475952, + "kl": 0.12109375, + "learning_rate": 7.780821917808219e-07, + "loss": 0.0048, + "reward": 1.6415550708770752, + "reward_std": 0.1549607552587986, + "rewards/accuracy_reward": 0.6571800261735916, + "rewards/format_reward": 0.984375, + "step": 486 + }, + { + "completion_length": 84.609375, + "epoch": 2.223744292237443, + "grad_norm": 2.6514155864715576, + "kl": 0.127685546875, + "learning_rate": 7.776255707762557e-07, + "loss": 0.0051, + "reward": 1.6367188096046448, + "reward_std": 0.24639248847961426, + "rewards/accuracy_reward": 0.64453125, + "rewards/format_reward": 0.9921875, + "step": 487 + }, + { + "completion_length": 67.40625, + "epoch": 2.228310502283105, + "grad_norm": 2.794630289077759, + "kl": 0.130859375, + "learning_rate": 7.771689497716894e-07, + "loss": 0.0052, + "reward": 1.7186384201049805, + "reward_std": 0.18474777042865753, + "rewards/accuracy_reward": 0.7264508605003357, + "rewards/format_reward": 0.9921875, + "step": 488 + }, + { + "completion_length": 89.7578125, + "epoch": 2.232876712328767, + "grad_norm": 3.7008538246154785, + "kl": 0.115478515625, + "learning_rate": 7.767123287671233e-07, + "loss": 0.0046, + "reward": 1.6265625953674316, + "reward_std": 0.28076815605163574, + "rewards/accuracy_reward": 0.6656249761581421, + "rewards/format_reward": 0.9609375, + "step": 489 + }, + { + "completion_length": 66.0, + "epoch": 2.237442922374429, + "grad_norm": 5.58867883682251, + "kl": 0.19970703125, + "learning_rate": 7.76255707762557e-07, + "loss": 0.008, + "reward": 1.714453101158142, + "reward_std": 0.2889961302280426, + "rewards/accuracy_reward": 0.7378906309604645, + "rewards/format_reward": 0.9765625, + "step": 490 + }, + { + "completion_length": 71.7265625, + "epoch": 2.2420091324200913, + "grad_norm": 2.161491870880127, + "kl": 0.137451171875, + "learning_rate": 7.757990867579909e-07, + "loss": 0.0055, + "reward": 1.5816146731376648, + "reward_std": 0.19107923656702042, + "rewards/accuracy_reward": 0.5894270539283752, + "rewards/format_reward": 0.9921875, + "step": 491 + }, + { + "completion_length": 79.7734375, + "epoch": 2.2465753424657535, + "grad_norm": 5.536935329437256, + "kl": 0.09423828125, + "learning_rate": 7.753424657534247e-07, + "loss": 0.0038, + "reward": 1.5820313096046448, + "reward_std": 0.23220208287239075, + "rewards/accuracy_reward": 0.5976562649011612, + "rewards/format_reward": 0.984375, + "step": 492 + }, + { + "completion_length": 105.1640625, + "epoch": 2.2511415525114153, + "grad_norm": 2.601624011993408, + "kl": 0.102294921875, + "learning_rate": 7.748858447488584e-07, + "loss": 0.0041, + "reward": 1.7345969676971436, + "reward_std": 0.1583983302116394, + "rewards/accuracy_reward": 0.7424094080924988, + "rewards/format_reward": 0.9921875, + "step": 493 + }, + { + "completion_length": 76.2578125, + "epoch": 2.2557077625570776, + "grad_norm": 3.9378445148468018, + "kl": 0.14208984375, + "learning_rate": 7.744292237442922e-07, + "loss": 0.0057, + "reward": 1.6885417103767395, + "reward_std": 0.24918173253536224, + "rewards/accuracy_reward": 0.7197916507720947, + "rewards/format_reward": 0.96875, + "step": 494 + }, + { + "completion_length": 69.578125, + "epoch": 2.26027397260274, + "grad_norm": 9.057132720947266, + "kl": 0.1142578125, + "learning_rate": 7.73972602739726e-07, + "loss": 0.0046, + "reward": 1.7063058018684387, + "reward_std": 0.18320050090551376, + "rewards/accuracy_reward": 0.7141183018684387, + "rewards/format_reward": 0.9921875, + "step": 495 + }, + { + "completion_length": 58.1640625, + "epoch": 2.2648401826484017, + "grad_norm": 3.844257116317749, + "kl": 0.14892578125, + "learning_rate": 7.735159817351597e-07, + "loss": 0.006, + "reward": 1.5278646349906921, + "reward_std": 0.2481004297733307, + "rewards/accuracy_reward": 0.5278645753860474, + "rewards/format_reward": 1.0, + "step": 496 + }, + { + "completion_length": 88.9140625, + "epoch": 2.269406392694064, + "grad_norm": 8.18749713897705, + "kl": 0.098876953125, + "learning_rate": 7.730593607305936e-07, + "loss": 0.004, + "reward": 1.749678373336792, + "reward_std": 0.16354048997163773, + "rewards/accuracy_reward": 0.7496782541275024, + "rewards/format_reward": 1.0, + "step": 497 + }, + { + "completion_length": 93.5078125, + "epoch": 2.2739726027397262, + "grad_norm": 2.6278417110443115, + "kl": 0.077392578125, + "learning_rate": 7.726027397260274e-07, + "loss": 0.0031, + "reward": 1.765897810459137, + "reward_std": 0.21192234754562378, + "rewards/accuracy_reward": 0.8049602508544922, + "rewards/format_reward": 0.9609375, + "step": 498 + }, + { + "completion_length": 72.2421875, + "epoch": 2.278538812785388, + "grad_norm": 2.370776414871216, + "kl": 0.112548828125, + "learning_rate": 7.721461187214611e-07, + "loss": 0.0045, + "reward": 1.6037201881408691, + "reward_std": 0.22106194496154785, + "rewards/accuracy_reward": 0.6193452179431915, + "rewards/format_reward": 0.984375, + "step": 499 + }, + { + "completion_length": 57.5703125, + "epoch": 2.2831050228310503, + "grad_norm": 27.69339942932129, + "kl": 0.10791015625, + "learning_rate": 7.71689497716895e-07, + "loss": 0.0043, + "reward": 1.6528646349906921, + "reward_std": 0.2068563476204872, + "rewards/accuracy_reward": 0.652864545583725, + "rewards/format_reward": 1.0, + "step": 500 + }, + { + "completion_length": 74.9296875, + "epoch": 2.287671232876712, + "grad_norm": 3.454657793045044, + "kl": 0.101318359375, + "learning_rate": 7.712328767123287e-07, + "loss": 0.0041, + "reward": 1.5783380270004272, + "reward_std": 0.300783634185791, + "rewards/accuracy_reward": 0.6017755568027496, + "rewards/format_reward": 0.9765625, + "step": 501 + }, + { + "completion_length": 73.5, + "epoch": 2.2922374429223744, + "grad_norm": 2.893542766571045, + "kl": 0.117919921875, + "learning_rate": 7.707762557077625e-07, + "loss": 0.0047, + "reward": 1.6714488863945007, + "reward_std": 0.23431023210287094, + "rewards/accuracy_reward": 0.6948863565921783, + "rewards/format_reward": 0.9765625, + "step": 502 + }, + { + "completion_length": 90.9921875, + "epoch": 2.2968036529680367, + "grad_norm": 4.334531307220459, + "kl": 0.075439453125, + "learning_rate": 7.703196347031963e-07, + "loss": 0.003, + "reward": 1.6945313215255737, + "reward_std": 0.22123288363218307, + "rewards/accuracy_reward": 0.7179687321186066, + "rewards/format_reward": 0.9765625, + "step": 503 + }, + { + "completion_length": 89.3984375, + "epoch": 2.3013698630136985, + "grad_norm": 8.32867431640625, + "kl": 0.140869140625, + "learning_rate": 7.6986301369863e-07, + "loss": 0.0056, + "reward": 1.6440104246139526, + "reward_std": 0.12583958357572556, + "rewards/accuracy_reward": 0.644010454416275, + "rewards/format_reward": 1.0, + "step": 504 + }, + { + "completion_length": 78.046875, + "epoch": 2.3059360730593608, + "grad_norm": 3.375293493270874, + "kl": 0.1474609375, + "learning_rate": 7.69406392694064e-07, + "loss": 0.0059, + "reward": 1.7735260128974915, + "reward_std": 0.1295642852783203, + "rewards/accuracy_reward": 0.7813384830951691, + "rewards/format_reward": 0.9921875, + "step": 505 + }, + { + "completion_length": 73.734375, + "epoch": 2.3105022831050226, + "grad_norm": 4.356123924255371, + "kl": 0.114501953125, + "learning_rate": 7.689497716894977e-07, + "loss": 0.0046, + "reward": 1.7072916626930237, + "reward_std": 0.1626647561788559, + "rewards/accuracy_reward": 0.7072916328907013, + "rewards/format_reward": 1.0, + "step": 506 + }, + { + "completion_length": 66.375, + "epoch": 2.315068493150685, + "grad_norm": 2.643831253051758, + "kl": 0.1552734375, + "learning_rate": 7.684931506849314e-07, + "loss": 0.0062, + "reward": 1.7758206725120544, + "reward_std": 0.17595528066158295, + "rewards/accuracy_reward": 0.7914457023143768, + "rewards/format_reward": 0.984375, + "step": 507 + }, + { + "completion_length": 65.109375, + "epoch": 2.319634703196347, + "grad_norm": 6.30082368850708, + "kl": 0.15869140625, + "learning_rate": 7.680365296803653e-07, + "loss": 0.0064, + "reward": 1.5708190202713013, + "reward_std": 0.1858999952673912, + "rewards/accuracy_reward": 0.5708190500736237, + "rewards/format_reward": 1.0, + "step": 508 + }, + { + "completion_length": 76.015625, + "epoch": 2.324200913242009, + "grad_norm": 3.34980845451355, + "kl": 0.1240234375, + "learning_rate": 7.67579908675799e-07, + "loss": 0.005, + "reward": 1.6061203479766846, + "reward_std": 0.2387639731168747, + "rewards/accuracy_reward": 0.6295577883720398, + "rewards/format_reward": 0.9765625, + "step": 509 + }, + { + "completion_length": 69.5546875, + "epoch": 2.328767123287671, + "grad_norm": 2.7794432640075684, + "kl": 0.108642578125, + "learning_rate": 7.671232876712328e-07, + "loss": 0.0043, + "reward": 1.628348171710968, + "reward_std": 0.22845705598592758, + "rewards/accuracy_reward": 0.6361607015132904, + "rewards/format_reward": 0.9921875, + "step": 510 + }, + { + "completion_length": 77.1171875, + "epoch": 2.3333333333333335, + "grad_norm": 1.9683984518051147, + "kl": 0.10400390625, + "learning_rate": 7.666666666666667e-07, + "loss": 0.0041, + "reward": 1.7567708492279053, + "reward_std": 0.1648455262184143, + "rewards/accuracy_reward": 0.7723957598209381, + "rewards/format_reward": 0.984375, + "step": 511 + }, + { + "completion_length": 91.765625, + "epoch": 2.3378995433789953, + "grad_norm": 7.019916534423828, + "kl": 0.120849609375, + "learning_rate": 7.662100456621004e-07, + "loss": 0.0048, + "reward": 1.7004202008247375, + "reward_std": 0.13108576089143753, + "rewards/accuracy_reward": 0.708232581615448, + "rewards/format_reward": 0.9921875, + "step": 512 + }, + { + "completion_length": 80.7421875, + "epoch": 2.3424657534246576, + "grad_norm": 2.83266282081604, + "kl": 0.13134765625, + "learning_rate": 7.657534246575343e-07, + "loss": 0.0053, + "reward": 1.658984363079071, + "reward_std": 0.20922444760799408, + "rewards/accuracy_reward": 0.658984363079071, + "rewards/format_reward": 1.0, + "step": 513 + }, + { + "completion_length": 71.234375, + "epoch": 2.34703196347032, + "grad_norm": 1.9871636629104614, + "kl": 0.12646484375, + "learning_rate": 7.65296803652968e-07, + "loss": 0.005, + "reward": 1.725781261920929, + "reward_std": 0.10922157764434814, + "rewards/accuracy_reward": 0.7257812023162842, + "rewards/format_reward": 1.0, + "step": 514 + }, + { + "completion_length": 73.7265625, + "epoch": 2.3515981735159817, + "grad_norm": 2.359335422515869, + "kl": 0.11279296875, + "learning_rate": 7.648401826484017e-07, + "loss": 0.0045, + "reward": 1.772805094718933, + "reward_std": 0.16348526254296303, + "rewards/accuracy_reward": 0.7806175947189331, + "rewards/format_reward": 0.9921875, + "step": 515 + }, + { + "completion_length": 60.3828125, + "epoch": 2.356164383561644, + "grad_norm": 5.7438812255859375, + "kl": 0.14208984375, + "learning_rate": 7.643835616438356e-07, + "loss": 0.0057, + "reward": 1.751901626586914, + "reward_std": 0.17701375484466553, + "rewards/accuracy_reward": 0.7597140967845917, + "rewards/format_reward": 0.9921875, + "step": 516 + }, + { + "completion_length": 89.5703125, + "epoch": 2.3607305936073057, + "grad_norm": 4.794968128204346, + "kl": 0.0986328125, + "learning_rate": 7.639269406392693e-07, + "loss": 0.0039, + "reward": 1.7277343273162842, + "reward_std": 0.1568085253238678, + "rewards/accuracy_reward": 0.7355467975139618, + "rewards/format_reward": 0.9921875, + "step": 517 + }, + { + "completion_length": 72.1015625, + "epoch": 2.365296803652968, + "grad_norm": 3.052950143814087, + "kl": 0.10009765625, + "learning_rate": 7.634703196347032e-07, + "loss": 0.004, + "reward": 1.7309381365776062, + "reward_std": 0.13279738277196884, + "rewards/accuracy_reward": 0.7387505769729614, + "rewards/format_reward": 0.9921875, + "step": 518 + }, + { + "completion_length": 96.421875, + "epoch": 2.3698630136986303, + "grad_norm": 1.318949818611145, + "kl": 0.08251953125, + "learning_rate": 7.63013698630137e-07, + "loss": 0.0033, + "reward": 1.78125, + "reward_std": 0.18937532603740692, + "rewards/accuracy_reward": 0.8203124403953552, + "rewards/format_reward": 0.9609375, + "step": 519 + }, + { + "completion_length": 76.15625, + "epoch": 2.374429223744292, + "grad_norm": 2.7549917697906494, + "kl": 0.11669921875, + "learning_rate": 7.625570776255707e-07, + "loss": 0.0047, + "reward": 1.7448863983154297, + "reward_std": 0.23332231491804123, + "rewards/accuracy_reward": 0.7683238387107849, + "rewards/format_reward": 0.9765625, + "step": 520 + }, + { + "completion_length": 58.6328125, + "epoch": 2.3789954337899544, + "grad_norm": 3.747992753982544, + "kl": 0.1669921875, + "learning_rate": 7.621004566210046e-07, + "loss": 0.0067, + "reward": 1.6135417222976685, + "reward_std": 0.16909091174602509, + "rewards/accuracy_reward": 0.6135416924953461, + "rewards/format_reward": 1.0, + "step": 521 + }, + { + "completion_length": 59.4296875, + "epoch": 2.383561643835616, + "grad_norm": 5.62481164932251, + "kl": 0.14990234375, + "learning_rate": 7.616438356164383e-07, + "loss": 0.006, + "reward": 1.5095030069351196, + "reward_std": 0.3347364068031311, + "rewards/accuracy_reward": 0.5407529026269913, + "rewards/format_reward": 0.96875, + "step": 522 + }, + { + "completion_length": 73.7109375, + "epoch": 2.3881278538812785, + "grad_norm": 2.828101634979248, + "kl": 0.15869140625, + "learning_rate": 7.61187214611872e-07, + "loss": 0.0063, + "reward": 1.6775281429290771, + "reward_std": 0.15744981169700623, + "rewards/accuracy_reward": 0.6775281727313995, + "rewards/format_reward": 1.0, + "step": 523 + }, + { + "completion_length": 79.359375, + "epoch": 2.3926940639269407, + "grad_norm": 1.9861787557601929, + "kl": 0.1162109375, + "learning_rate": 7.607305936073059e-07, + "loss": 0.0046, + "reward": 1.645312488079071, + "reward_std": 0.12342093884944916, + "rewards/accuracy_reward": 0.6453125178813934, + "rewards/format_reward": 1.0, + "step": 524 + }, + { + "completion_length": 78.734375, + "epoch": 2.3972602739726026, + "grad_norm": 2.850097179412842, + "kl": 0.112548828125, + "learning_rate": 7.602739726027397e-07, + "loss": 0.0045, + "reward": 1.6301960349082947, + "reward_std": 0.18981467187404633, + "rewards/accuracy_reward": 0.6458209455013275, + "rewards/format_reward": 0.984375, + "step": 525 + }, + { + "completion_length": 62.109375, + "epoch": 2.401826484018265, + "grad_norm": 2.852185010910034, + "kl": 0.16552734375, + "learning_rate": 7.598173515981735e-07, + "loss": 0.0066, + "reward": 1.5752604007720947, + "reward_std": 0.20407412946224213, + "rewards/accuracy_reward": 0.5908854007720947, + "rewards/format_reward": 0.984375, + "step": 526 + }, + { + "completion_length": 68.4765625, + "epoch": 2.406392694063927, + "grad_norm": 2.7563745975494385, + "kl": 0.1240234375, + "learning_rate": 7.593607305936073e-07, + "loss": 0.005, + "reward": 1.646093726158142, + "reward_std": 0.255212739109993, + "rewards/accuracy_reward": 0.6695312261581421, + "rewards/format_reward": 0.9765625, + "step": 527 + }, + { + "completion_length": 83.1953125, + "epoch": 2.410958904109589, + "grad_norm": 3.005842924118042, + "kl": 0.11328125, + "learning_rate": 7.58904109589041e-07, + "loss": 0.0045, + "reward": 1.4694010615348816, + "reward_std": 0.34128230810165405, + "rewards/accuracy_reward": 0.516276016831398, + "rewards/format_reward": 0.953125, + "step": 528 + }, + { + "completion_length": 79.171875, + "epoch": 2.415525114155251, + "grad_norm": 10.589370727539062, + "kl": 0.40234375, + "learning_rate": 7.584474885844749e-07, + "loss": 0.0161, + "reward": 1.7232915163040161, + "reward_std": 0.16085164994001389, + "rewards/accuracy_reward": 0.7311040163040161, + "rewards/format_reward": 0.9921875, + "step": 529 + }, + { + "completion_length": 75.6484375, + "epoch": 2.4200913242009134, + "grad_norm": 2.8637847900390625, + "kl": 0.1416015625, + "learning_rate": 7.579908675799086e-07, + "loss": 0.0057, + "reward": 1.5583333373069763, + "reward_std": 0.24188697338104248, + "rewards/accuracy_reward": 0.5661458522081375, + "rewards/format_reward": 0.9921875, + "step": 530 + }, + { + "completion_length": 81.7421875, + "epoch": 2.4246575342465753, + "grad_norm": 2.9744412899017334, + "kl": 0.1015625, + "learning_rate": 7.575342465753424e-07, + "loss": 0.0041, + "reward": 1.6184896230697632, + "reward_std": 0.22770510613918304, + "rewards/accuracy_reward": 0.6341145932674408, + "rewards/format_reward": 0.984375, + "step": 531 + }, + { + "completion_length": 68.96875, + "epoch": 2.4292237442922375, + "grad_norm": 5.560794830322266, + "kl": 0.1513671875, + "learning_rate": 7.570776255707763e-07, + "loss": 0.006, + "reward": 1.6268364787101746, + "reward_std": 0.25046712160110474, + "rewards/accuracy_reward": 0.6346489191055298, + "rewards/format_reward": 0.9921875, + "step": 532 + }, + { + "completion_length": 83.734375, + "epoch": 2.4337899543378994, + "grad_norm": 1.856094241142273, + "kl": 0.103271484375, + "learning_rate": 7.5662100456621e-07, + "loss": 0.0041, + "reward": 1.719410002231598, + "reward_std": 0.18102595210075378, + "rewards/accuracy_reward": 0.7272224426269531, + "rewards/format_reward": 0.9921875, + "step": 533 + }, + { + "completion_length": 82.9765625, + "epoch": 2.4383561643835616, + "grad_norm": 1.8449293375015259, + "kl": 0.12255859375, + "learning_rate": 7.561643835616438e-07, + "loss": 0.0049, + "reward": 1.7729809880256653, + "reward_std": 0.10142140835523605, + "rewards/accuracy_reward": 0.7807934284210205, + "rewards/format_reward": 0.9921875, + "step": 534 + }, + { + "completion_length": 76.6953125, + "epoch": 2.442922374429224, + "grad_norm": 4.276492595672607, + "kl": 0.13427734375, + "learning_rate": 7.557077625570776e-07, + "loss": 0.0054, + "reward": 1.584375023841858, + "reward_std": 0.2730569392442703, + "rewards/accuracy_reward": 0.6078124940395355, + "rewards/format_reward": 0.9765625, + "step": 535 + }, + { + "completion_length": 84.6171875, + "epoch": 2.4474885844748857, + "grad_norm": 7.224926471710205, + "kl": 0.120361328125, + "learning_rate": 7.552511415525113e-07, + "loss": 0.0048, + "reward": 1.7129571437835693, + "reward_std": 0.16479767858982086, + "rewards/accuracy_reward": 0.7207695543766022, + "rewards/format_reward": 0.9921875, + "step": 536 + }, + { + "completion_length": 81.859375, + "epoch": 2.452054794520548, + "grad_norm": 2.366180181503296, + "kl": 0.10302734375, + "learning_rate": 7.547945205479452e-07, + "loss": 0.0041, + "reward": 1.7357105612754822, + "reward_std": 0.1823057383298874, + "rewards/accuracy_reward": 0.7747730314731598, + "rewards/format_reward": 0.9609375, + "step": 537 + }, + { + "completion_length": 92.140625, + "epoch": 2.45662100456621, + "grad_norm": 2.225709915161133, + "kl": 0.17041015625, + "learning_rate": 7.54337899543379e-07, + "loss": 0.0068, + "reward": 1.7205729484558105, + "reward_std": 0.136265367269516, + "rewards/accuracy_reward": 0.7205729186534882, + "rewards/format_reward": 1.0, + "step": 538 + }, + { + "completion_length": 53.6796875, + "epoch": 2.461187214611872, + "grad_norm": 2.991917133331299, + "kl": 0.181640625, + "learning_rate": 7.538812785388127e-07, + "loss": 0.0073, + "reward": 1.531180500984192, + "reward_std": 0.2481382116675377, + "rewards/accuracy_reward": 0.5389930307865143, + "rewards/format_reward": 0.9921875, + "step": 539 + }, + { + "completion_length": 72.734375, + "epoch": 2.4657534246575343, + "grad_norm": 5.530900001525879, + "kl": 0.10205078125, + "learning_rate": 7.534246575342466e-07, + "loss": 0.0041, + "reward": 1.705208420753479, + "reward_std": 0.24594328552484512, + "rewards/accuracy_reward": 0.7286458015441895, + "rewards/format_reward": 0.9765625, + "step": 540 + }, + { + "completion_length": 69.7734375, + "epoch": 2.470319634703196, + "grad_norm": 2.618407964706421, + "kl": 0.156982421875, + "learning_rate": 7.529680365296803e-07, + "loss": 0.0063, + "reward": 1.7212890982627869, + "reward_std": 0.19607724994421005, + "rewards/accuracy_reward": 0.7369140684604645, + "rewards/format_reward": 0.984375, + "step": 541 + }, + { + "completion_length": 66.2109375, + "epoch": 2.4748858447488584, + "grad_norm": 2.0617170333862305, + "kl": 0.14208984375, + "learning_rate": 7.525114155251141e-07, + "loss": 0.0057, + "reward": 1.6648437976837158, + "reward_std": 0.1707550622522831, + "rewards/accuracy_reward": 0.672656238079071, + "rewards/format_reward": 0.9921875, + "step": 542 + }, + { + "completion_length": 77.1171875, + "epoch": 2.4794520547945207, + "grad_norm": 3.923844814300537, + "kl": 0.188232421875, + "learning_rate": 7.520547945205479e-07, + "loss": 0.0076, + "reward": 1.7651843428611755, + "reward_std": 0.18746302276849747, + "rewards/accuracy_reward": 0.7729967534542084, + "rewards/format_reward": 0.9921875, + "step": 543 + }, + { + "completion_length": 83.75, + "epoch": 2.4840182648401825, + "grad_norm": 12.723535537719727, + "kl": 0.096923828125, + "learning_rate": 7.515981735159816e-07, + "loss": 0.0039, + "reward": 1.7339038252830505, + "reward_std": 0.16746822372078896, + "rewards/accuracy_reward": 0.7495287358760834, + "rewards/format_reward": 0.984375, + "step": 544 + }, + { + "completion_length": 63.9921875, + "epoch": 2.4885844748858448, + "grad_norm": 2.9950413703918457, + "kl": 0.130615234375, + "learning_rate": 7.511415525114156e-07, + "loss": 0.0052, + "reward": 1.6598958373069763, + "reward_std": 0.2278856635093689, + "rewards/accuracy_reward": 0.6833333075046539, + "rewards/format_reward": 0.9765625, + "step": 545 + }, + { + "completion_length": 82.7890625, + "epoch": 2.493150684931507, + "grad_norm": 2.3096249103546143, + "kl": 0.090576171875, + "learning_rate": 7.506849315068493e-07, + "loss": 0.0036, + "reward": 1.7203125953674316, + "reward_std": 0.20905159413814545, + "rewards/accuracy_reward": 0.7281249761581421, + "rewards/format_reward": 0.9921875, + "step": 546 + }, + { + "completion_length": 80.84375, + "epoch": 2.497716894977169, + "grad_norm": 3.3282742500305176, + "kl": 0.093994140625, + "learning_rate": 7.50228310502283e-07, + "loss": 0.0038, + "reward": 1.543817937374115, + "reward_std": 0.22718056291341782, + "rewards/accuracy_reward": 0.5594429075717926, + "rewards/format_reward": 0.984375, + "step": 547 + }, + { + "completion_length": 106.3046875, + "epoch": 2.502283105022831, + "grad_norm": 5.616523742675781, + "kl": 0.08251953125, + "learning_rate": 7.497716894977169e-07, + "loss": 0.0033, + "reward": 1.6008946895599365, + "reward_std": 0.2659634053707123, + "rewards/accuracy_reward": 0.6321446299552917, + "rewards/format_reward": 0.96875, + "step": 548 + }, + { + "completion_length": 90.3203125, + "epoch": 2.506849315068493, + "grad_norm": 5.4013166427612305, + "kl": 0.0994873046875, + "learning_rate": 7.493150684931506e-07, + "loss": 0.004, + "reward": 1.7019531726837158, + "reward_std": 0.17125242203474045, + "rewards/accuracy_reward": 0.7175780534744263, + "rewards/format_reward": 0.984375, + "step": 549 + }, + { + "completion_length": 86.5859375, + "epoch": 2.5114155251141552, + "grad_norm": 2.318875789642334, + "kl": 0.134765625, + "learning_rate": 7.488584474885844e-07, + "loss": 0.0054, + "reward": 1.6225537061691284, + "reward_std": 0.2404860332608223, + "rewards/accuracy_reward": 0.645991176366806, + "rewards/format_reward": 0.9765625, + "step": 550 + }, + { + "completion_length": 92.1015625, + "epoch": 2.5159817351598175, + "grad_norm": 6.498478412628174, + "kl": 0.092041015625, + "learning_rate": 7.484018264840183e-07, + "loss": 0.0037, + "reward": 1.6539062857627869, + "reward_std": 0.20888085663318634, + "rewards/accuracy_reward": 0.6695312559604645, + "rewards/format_reward": 0.984375, + "step": 551 + }, + { + "completion_length": 98.4375, + "epoch": 2.5205479452054793, + "grad_norm": 3.217737913131714, + "kl": 0.0743408203125, + "learning_rate": 7.47945205479452e-07, + "loss": 0.003, + "reward": 1.8604166507720947, + "reward_std": 0.10430474206805229, + "rewards/accuracy_reward": 0.8682291209697723, + "rewards/format_reward": 0.9921875, + "step": 552 + }, + { + "completion_length": 89.34375, + "epoch": 2.5251141552511416, + "grad_norm": 1.3329360485076904, + "kl": 0.12451171875, + "learning_rate": 7.474885844748859e-07, + "loss": 0.005, + "reward": 1.842187523841858, + "reward_std": 0.09198738168925047, + "rewards/accuracy_reward": 0.8421874642372131, + "rewards/format_reward": 1.0, + "step": 553 + }, + { + "completion_length": 89.234375, + "epoch": 2.5296803652968034, + "grad_norm": 3.4690394401550293, + "kl": 0.1015625, + "learning_rate": 7.470319634703196e-07, + "loss": 0.0041, + "reward": 1.7106770873069763, + "reward_std": 0.19660182297229767, + "rewards/accuracy_reward": 0.7184895873069763, + "rewards/format_reward": 0.9921875, + "step": 554 + }, + { + "completion_length": 82.5859375, + "epoch": 2.5342465753424657, + "grad_norm": 3.373579740524292, + "kl": 0.113037109375, + "learning_rate": 7.465753424657533e-07, + "loss": 0.0045, + "reward": 1.5816163420677185, + "reward_std": 0.20616846531629562, + "rewards/accuracy_reward": 0.5816163718700409, + "rewards/format_reward": 1.0, + "step": 555 + }, + { + "completion_length": 93.5078125, + "epoch": 2.538812785388128, + "grad_norm": 3.093045473098755, + "kl": 0.1044921875, + "learning_rate": 7.461187214611872e-07, + "loss": 0.0042, + "reward": 1.608422875404358, + "reward_std": 0.21706774830818176, + "rewards/accuracy_reward": 0.6240477561950684, + "rewards/format_reward": 0.984375, + "step": 556 + }, + { + "completion_length": 97.0703125, + "epoch": 2.54337899543379, + "grad_norm": 2.3710570335388184, + "kl": 0.095703125, + "learning_rate": 7.456621004566209e-07, + "loss": 0.0038, + "reward": 1.800067663192749, + "reward_std": 0.09500321745872498, + "rewards/accuracy_reward": 0.8000677227973938, + "rewards/format_reward": 1.0, + "step": 557 + }, + { + "completion_length": 71.671875, + "epoch": 2.547945205479452, + "grad_norm": 2.5310826301574707, + "kl": 0.108154296875, + "learning_rate": 7.452054794520548e-07, + "loss": 0.0043, + "reward": 1.5969815254211426, + "reward_std": 0.2227228805422783, + "rewards/accuracy_reward": 0.5969814956188202, + "rewards/format_reward": 1.0, + "step": 558 + }, + { + "completion_length": 78.15625, + "epoch": 2.5525114155251143, + "grad_norm": 2.1528046131134033, + "kl": 0.12451171875, + "learning_rate": 7.447488584474886e-07, + "loss": 0.005, + "reward": 1.7443639636039734, + "reward_std": 0.10288457944989204, + "rewards/accuracy_reward": 0.752176433801651, + "rewards/format_reward": 0.9921875, + "step": 559 + }, + { + "completion_length": 84.546875, + "epoch": 2.557077625570776, + "grad_norm": 2.656460762023926, + "kl": 0.1171875, + "learning_rate": 7.442922374429223e-07, + "loss": 0.0047, + "reward": 1.7289806604385376, + "reward_std": 0.17229026556015015, + "rewards/accuracy_reward": 0.7289806008338928, + "rewards/format_reward": 1.0, + "step": 560 + }, + { + "completion_length": 81.8984375, + "epoch": 2.5616438356164384, + "grad_norm": 3.7874462604522705, + "kl": 0.12548828125, + "learning_rate": 7.438356164383562e-07, + "loss": 0.005, + "reward": 1.7026662230491638, + "reward_std": 0.2061307728290558, + "rewards/accuracy_reward": 0.7104786336421967, + "rewards/format_reward": 0.9921875, + "step": 561 + }, + { + "completion_length": 69.5546875, + "epoch": 2.5662100456621006, + "grad_norm": 2.563668966293335, + "kl": 0.1201171875, + "learning_rate": 7.433789954337899e-07, + "loss": 0.0048, + "reward": 1.5755208134651184, + "reward_std": 0.207001730799675, + "rewards/accuracy_reward": 0.5833333730697632, + "rewards/format_reward": 0.9921875, + "step": 562 + }, + { + "completion_length": 76.96875, + "epoch": 2.5707762557077625, + "grad_norm": 2.503657817840576, + "kl": 0.098876953125, + "learning_rate": 7.429223744292236e-07, + "loss": 0.004, + "reward": 1.7036305665969849, + "reward_std": 0.16530893370509148, + "rewards/accuracy_reward": 0.7036304771900177, + "rewards/format_reward": 1.0, + "step": 563 + }, + { + "completion_length": 86.0078125, + "epoch": 2.5753424657534247, + "grad_norm": 4.6145734786987305, + "kl": 0.087646484375, + "learning_rate": 7.424657534246575e-07, + "loss": 0.0035, + "reward": 1.5936384201049805, + "reward_std": 0.24451126903295517, + "rewards/accuracy_reward": 0.6248884201049805, + "rewards/format_reward": 0.96875, + "step": 564 + }, + { + "completion_length": 91.1171875, + "epoch": 2.5799086757990866, + "grad_norm": 3.2497313022613525, + "kl": 0.116455078125, + "learning_rate": 7.420091324200913e-07, + "loss": 0.0047, + "reward": 1.555757462978363, + "reward_std": 0.2567907050251961, + "rewards/accuracy_reward": 0.5870074331760406, + "rewards/format_reward": 0.96875, + "step": 565 + }, + { + "completion_length": 89.8984375, + "epoch": 2.584474885844749, + "grad_norm": 2.177954912185669, + "kl": 0.073974609375, + "learning_rate": 7.415525114155251e-07, + "loss": 0.003, + "reward": 1.7282168865203857, + "reward_std": 0.18810292333364487, + "rewards/accuracy_reward": 0.7516542971134186, + "rewards/format_reward": 0.9765625, + "step": 566 + }, + { + "completion_length": 59.671875, + "epoch": 2.589041095890411, + "grad_norm": 2.342167854309082, + "kl": 0.12646484375, + "learning_rate": 7.410958904109589e-07, + "loss": 0.0051, + "reward": 1.5524925589561462, + "reward_std": 0.19321630895137787, + "rewards/accuracy_reward": 0.5524925589561462, + "rewards/format_reward": 1.0, + "step": 567 + }, + { + "completion_length": 81.0390625, + "epoch": 2.593607305936073, + "grad_norm": 25.185625076293945, + "kl": 0.099609375, + "learning_rate": 7.406392694063926e-07, + "loss": 0.004, + "reward": 1.685937523841858, + "reward_std": 0.2477683424949646, + "rewards/accuracy_reward": 0.6937499642372131, + "rewards/format_reward": 0.9921875, + "step": 568 + }, + { + "completion_length": 86.671875, + "epoch": 2.598173515981735, + "grad_norm": 3.6345832347869873, + "kl": 0.095458984375, + "learning_rate": 7.401826484018265e-07, + "loss": 0.0038, + "reward": 1.6493847966194153, + "reward_std": 0.20508087426424026, + "rewards/accuracy_reward": 0.6493848264217377, + "rewards/format_reward": 1.0, + "step": 569 + }, + { + "completion_length": 111.0, + "epoch": 2.602739726027397, + "grad_norm": 1.393759846687317, + "kl": 0.06494140625, + "learning_rate": 7.397260273972602e-07, + "loss": 0.0026, + "reward": 1.872656226158142, + "reward_std": 0.10857155546545982, + "rewards/accuracy_reward": 0.8960936367511749, + "rewards/format_reward": 0.9765625, + "step": 570 + }, + { + "completion_length": 96.6875, + "epoch": 2.6073059360730593, + "grad_norm": 2.2273528575897217, + "kl": 0.077880859375, + "learning_rate": 7.39269406392694e-07, + "loss": 0.0031, + "reward": 1.8092340230941772, + "reward_std": 0.10701124370098114, + "rewards/accuracy_reward": 0.8170464634895325, + "rewards/format_reward": 0.9921875, + "step": 571 + }, + { + "completion_length": 70.09375, + "epoch": 2.6118721461187215, + "grad_norm": 2.2020483016967773, + "kl": 0.108154296875, + "learning_rate": 7.388127853881279e-07, + "loss": 0.0043, + "reward": 1.5493839979171753, + "reward_std": 0.19577539712190628, + "rewards/accuracy_reward": 0.5493840277194977, + "rewards/format_reward": 1.0, + "step": 572 + }, + { + "completion_length": 56.6875, + "epoch": 2.616438356164384, + "grad_norm": 8.005638122558594, + "kl": 0.111572265625, + "learning_rate": 7.383561643835616e-07, + "loss": 0.0045, + "reward": 1.457552194595337, + "reward_std": 0.28487062454223633, + "rewards/accuracy_reward": 0.45755207538604736, + "rewards/format_reward": 1.0, + "step": 573 + }, + { + "completion_length": 67.3984375, + "epoch": 2.6210045662100456, + "grad_norm": 7.093777179718018, + "kl": 0.1669921875, + "learning_rate": 7.378995433789954e-07, + "loss": 0.0067, + "reward": 1.7374799847602844, + "reward_std": 0.1953911855816841, + "rewards/accuracy_reward": 0.745292454957962, + "rewards/format_reward": 0.9921875, + "step": 574 + }, + { + "completion_length": 70.953125, + "epoch": 2.625570776255708, + "grad_norm": 2.916884183883667, + "kl": 0.129150390625, + "learning_rate": 7.374429223744292e-07, + "loss": 0.0052, + "reward": 1.5901844501495361, + "reward_std": 0.2463785707950592, + "rewards/accuracy_reward": 0.5901843905448914, + "rewards/format_reward": 1.0, + "step": 575 + }, + { + "completion_length": 79.0859375, + "epoch": 2.6301369863013697, + "grad_norm": 7.461641311645508, + "kl": 0.12890625, + "learning_rate": 7.369863013698629e-07, + "loss": 0.0052, + "reward": 1.666406273841858, + "reward_std": 0.24838291853666306, + "rewards/accuracy_reward": 0.6742187440395355, + "rewards/format_reward": 0.9921875, + "step": 576 + }, + { + "completion_length": 71.1796875, + "epoch": 2.634703196347032, + "grad_norm": 2.474581718444824, + "kl": 0.125244140625, + "learning_rate": 7.365296803652968e-07, + "loss": 0.005, + "reward": 1.6498884558677673, + "reward_std": 0.15810929238796234, + "rewards/accuracy_reward": 0.6498883664608002, + "rewards/format_reward": 1.0, + "step": 577 + }, + { + "completion_length": 93.484375, + "epoch": 2.6392694063926943, + "grad_norm": 2.4049859046936035, + "kl": 0.068359375, + "learning_rate": 7.360730593607306e-07, + "loss": 0.0027, + "reward": 1.7192708849906921, + "reward_std": 0.1887570172548294, + "rewards/accuracy_reward": 0.742708295583725, + "rewards/format_reward": 0.9765625, + "step": 578 + }, + { + "completion_length": 78.71875, + "epoch": 2.643835616438356, + "grad_norm": 12.206317901611328, + "kl": 0.11962890625, + "learning_rate": 7.356164383561643e-07, + "loss": 0.0048, + "reward": 1.6132813096046448, + "reward_std": 0.19830159842967987, + "rewards/accuracy_reward": 0.62890625, + "rewards/format_reward": 0.984375, + "step": 579 + }, + { + "completion_length": 84.28125, + "epoch": 2.6484018264840183, + "grad_norm": 3.1371257305145264, + "kl": 0.112060546875, + "learning_rate": 7.351598173515982e-07, + "loss": 0.0045, + "reward": 1.7513157725334167, + "reward_std": 0.13612205535173416, + "rewards/accuracy_reward": 0.7513157725334167, + "rewards/format_reward": 1.0, + "step": 580 + }, + { + "completion_length": 74.203125, + "epoch": 2.65296803652968, + "grad_norm": 4.9632792472839355, + "kl": 0.21728515625, + "learning_rate": 7.347031963470319e-07, + "loss": 0.0087, + "reward": 1.5989612340927124, + "reward_std": 0.21190468221902847, + "rewards/accuracy_reward": 0.5989611893892288, + "rewards/format_reward": 1.0, + "step": 581 + }, + { + "completion_length": 56.5234375, + "epoch": 2.6575342465753424, + "grad_norm": 3.400489091873169, + "kl": 0.1005859375, + "learning_rate": 7.342465753424657e-07, + "loss": 0.004, + "reward": 1.5473958253860474, + "reward_std": 0.194392129778862, + "rewards/accuracy_reward": 0.5473958253860474, + "rewards/format_reward": 1.0, + "step": 582 + }, + { + "completion_length": 56.2421875, + "epoch": 2.6621004566210047, + "grad_norm": 2.7040834426879883, + "kl": 0.19140625, + "learning_rate": 7.337899543378995e-07, + "loss": 0.0077, + "reward": 1.5410130023956299, + "reward_std": 0.23740805685520172, + "rewards/accuracy_reward": 0.5488254725933075, + "rewards/format_reward": 0.9921875, + "step": 583 + }, + { + "completion_length": 103.8515625, + "epoch": 2.6666666666666665, + "grad_norm": 1.5765724182128906, + "kl": 0.0721435546875, + "learning_rate": 7.333333333333332e-07, + "loss": 0.0029, + "reward": 1.6796875, + "reward_std": 0.134404756128788, + "rewards/accuracy_reward": 0.6874999403953552, + "rewards/format_reward": 0.9921875, + "step": 584 + }, + { + "completion_length": 74.1875, + "epoch": 2.671232876712329, + "grad_norm": 7.368083477020264, + "kl": 0.115234375, + "learning_rate": 7.328767123287672e-07, + "loss": 0.0046, + "reward": 1.6952009201049805, + "reward_std": 0.23021817207336426, + "rewards/accuracy_reward": 0.7030133903026581, + "rewards/format_reward": 0.9921875, + "step": 585 + }, + { + "completion_length": 74.0703125, + "epoch": 2.6757990867579906, + "grad_norm": 4.19453239440918, + "kl": 0.112548828125, + "learning_rate": 7.324200913242009e-07, + "loss": 0.0045, + "reward": 1.7542436122894287, + "reward_std": 0.16448176465928555, + "rewards/accuracy_reward": 0.7620560824871063, + "rewards/format_reward": 0.9921875, + "step": 586 + }, + { + "completion_length": 83.4453125, + "epoch": 2.680365296803653, + "grad_norm": 2.116567373275757, + "kl": 0.1123046875, + "learning_rate": 7.319634703196346e-07, + "loss": 0.0045, + "reward": 1.8114583492279053, + "reward_std": 0.10706461034715176, + "rewards/accuracy_reward": 0.8114582598209381, + "rewards/format_reward": 1.0, + "step": 587 + }, + { + "completion_length": 74.1171875, + "epoch": 2.684931506849315, + "grad_norm": 3.840576648712158, + "kl": 0.106689453125, + "learning_rate": 7.315068493150685e-07, + "loss": 0.0043, + "reward": 1.5600694417953491, + "reward_std": 0.20616772770881653, + "rewards/accuracy_reward": 0.5600694417953491, + "rewards/format_reward": 1.0, + "step": 588 + }, + { + "completion_length": 80.515625, + "epoch": 2.6894977168949774, + "grad_norm": 5.284098148345947, + "kl": 0.10986328125, + "learning_rate": 7.310502283105022e-07, + "loss": 0.0044, + "reward": 1.6452972888946533, + "reward_std": 0.1739073097705841, + "rewards/accuracy_reward": 0.6452972292900085, + "rewards/format_reward": 1.0, + "step": 589 + }, + { + "completion_length": 80.0078125, + "epoch": 2.6940639269406392, + "grad_norm": 2.7230966091156006, + "kl": 0.1328125, + "learning_rate": 7.30593607305936e-07, + "loss": 0.0053, + "reward": 1.7089230418205261, + "reward_std": 0.18773558735847473, + "rewards/accuracy_reward": 0.7167355120182037, + "rewards/format_reward": 0.9921875, + "step": 590 + }, + { + "completion_length": 65.5625, + "epoch": 2.6986301369863015, + "grad_norm": 3.291010618209839, + "kl": 0.1162109375, + "learning_rate": 7.301369863013699e-07, + "loss": 0.0047, + "reward": 1.646875023841858, + "reward_std": 0.2957848533987999, + "rewards/accuracy_reward": 0.6624999642372131, + "rewards/format_reward": 0.984375, + "step": 591 + }, + { + "completion_length": 78.953125, + "epoch": 2.7031963470319633, + "grad_norm": 3.601032257080078, + "kl": 0.126953125, + "learning_rate": 7.296803652968036e-07, + "loss": 0.0051, + "reward": 1.6582031846046448, + "reward_std": 0.2431168407201767, + "rewards/accuracy_reward": 0.6738280951976776, + "rewards/format_reward": 0.984375, + "step": 592 + }, + { + "completion_length": 75.515625, + "epoch": 2.7077625570776256, + "grad_norm": 3.7182750701904297, + "kl": 0.111572265625, + "learning_rate": 7.292237442922375e-07, + "loss": 0.0045, + "reward": 1.8291015625, + "reward_std": 0.15765050053596497, + "rewards/accuracy_reward": 0.8291015625, + "rewards/format_reward": 1.0, + "step": 593 + }, + { + "completion_length": 65.7890625, + "epoch": 2.712328767123288, + "grad_norm": 3.52632737159729, + "kl": 0.15966796875, + "learning_rate": 7.287671232876712e-07, + "loss": 0.0064, + "reward": 1.6682049632072449, + "reward_std": 0.22022631764411926, + "rewards/accuracy_reward": 0.6682049036026001, + "rewards/format_reward": 1.0, + "step": 594 + }, + { + "completion_length": 94.90625, + "epoch": 2.7168949771689497, + "grad_norm": 1.899457335472107, + "kl": 0.091796875, + "learning_rate": 7.283105022831049e-07, + "loss": 0.0037, + "reward": 1.6791666746139526, + "reward_std": 0.18102534115314484, + "rewards/accuracy_reward": 0.7104166448116302, + "rewards/format_reward": 0.96875, + "step": 595 + }, + { + "completion_length": 80.5390625, + "epoch": 2.721461187214612, + "grad_norm": 2.4807851314544678, + "kl": 0.113525390625, + "learning_rate": 7.278538812785388e-07, + "loss": 0.0045, + "reward": 1.8067708611488342, + "reward_std": 0.12291676551103592, + "rewards/accuracy_reward": 0.8067708313465118, + "rewards/format_reward": 1.0, + "step": 596 + }, + { + "completion_length": 83.34375, + "epoch": 2.7260273972602738, + "grad_norm": 2.2889952659606934, + "kl": 0.11669921875, + "learning_rate": 7.273972602739725e-07, + "loss": 0.0047, + "reward": 1.7340867519378662, + "reward_std": 0.2301034778356552, + "rewards/accuracy_reward": 0.7731491327285767, + "rewards/format_reward": 0.9609375, + "step": 597 + }, + { + "completion_length": 76.90625, + "epoch": 2.730593607305936, + "grad_norm": 2.431467056274414, + "kl": 0.141845703125, + "learning_rate": 7.269406392694064e-07, + "loss": 0.0057, + "reward": 1.702616572380066, + "reward_std": 0.16628245636820793, + "rewards/accuracy_reward": 0.7026165425777435, + "rewards/format_reward": 1.0, + "step": 598 + }, + { + "completion_length": 75.203125, + "epoch": 2.7351598173515983, + "grad_norm": 2.5654890537261963, + "kl": 0.1119384765625, + "learning_rate": 7.264840182648402e-07, + "loss": 0.0045, + "reward": 1.7242187857627869, + "reward_std": 0.13443218544125557, + "rewards/accuracy_reward": 0.7320312559604645, + "rewards/format_reward": 0.9921875, + "step": 599 + }, + { + "completion_length": 77.8515625, + "epoch": 2.73972602739726, + "grad_norm": 2.4375691413879395, + "kl": 0.097900390625, + "learning_rate": 7.260273972602739e-07, + "loss": 0.0039, + "reward": 1.7069196701049805, + "reward_std": 0.24784404039382935, + "rewards/accuracy_reward": 0.7225446105003357, + "rewards/format_reward": 0.984375, + "step": 600 + }, + { + "completion_length": 73.1953125, + "epoch": 2.7442922374429224, + "grad_norm": 2.0869359970092773, + "kl": 0.1171875, + "learning_rate": 7.255707762557078e-07, + "loss": 0.0047, + "reward": 1.6764508485794067, + "reward_std": 0.11909351497888565, + "rewards/accuracy_reward": 0.6764508485794067, + "rewards/format_reward": 1.0, + "step": 601 + }, + { + "completion_length": 59.4765625, + "epoch": 2.748858447488584, + "grad_norm": 3.2937729358673096, + "kl": 0.138427734375, + "learning_rate": 7.251141552511415e-07, + "loss": 0.0055, + "reward": 1.538671851158142, + "reward_std": 0.36779990792274475, + "rewards/accuracy_reward": 0.5542968809604645, + "rewards/format_reward": 0.984375, + "step": 602 + }, + { + "completion_length": 92.5703125, + "epoch": 2.7534246575342465, + "grad_norm": 3.537177085876465, + "kl": 0.090087890625, + "learning_rate": 7.246575342465752e-07, + "loss": 0.0036, + "reward": 1.722743034362793, + "reward_std": 0.18279560655355453, + "rewards/accuracy_reward": 0.730555534362793, + "rewards/format_reward": 0.9921875, + "step": 603 + }, + { + "completion_length": 93.8046875, + "epoch": 2.7579908675799087, + "grad_norm": 2.128953218460083, + "kl": 0.08056640625, + "learning_rate": 7.242009132420091e-07, + "loss": 0.0032, + "reward": 1.745312511920929, + "reward_std": 0.1521657407283783, + "rewards/accuracy_reward": 0.7453123927116394, + "rewards/format_reward": 1.0, + "step": 604 + }, + { + "completion_length": 65.9375, + "epoch": 2.762557077625571, + "grad_norm": 17.820329666137695, + "kl": 0.10205078125, + "learning_rate": 7.237442922374429e-07, + "loss": 0.0041, + "reward": 1.6083807349205017, + "reward_std": 0.20848889648914337, + "rewards/accuracy_reward": 0.6161931753158569, + "rewards/format_reward": 0.9921875, + "step": 605 + }, + { + "completion_length": 71.671875, + "epoch": 2.767123287671233, + "grad_norm": 2.2198660373687744, + "kl": 0.10009765625, + "learning_rate": 7.232876712328767e-07, + "loss": 0.004, + "reward": 1.7286458611488342, + "reward_std": 0.16050894185900688, + "rewards/accuracy_reward": 0.7286458313465118, + "rewards/format_reward": 1.0, + "step": 606 + }, + { + "completion_length": 84.546875, + "epoch": 2.771689497716895, + "grad_norm": 2.738107681274414, + "kl": 0.11376953125, + "learning_rate": 7.228310502283105e-07, + "loss": 0.0046, + "reward": 1.6790487170219421, + "reward_std": 0.19785276055335999, + "rewards/accuracy_reward": 0.6946736574172974, + "rewards/format_reward": 0.984375, + "step": 607 + }, + { + "completion_length": 71.125, + "epoch": 2.776255707762557, + "grad_norm": 5.066852569580078, + "kl": 0.151123046875, + "learning_rate": 7.223744292237442e-07, + "loss": 0.006, + "reward": 1.5531622171401978, + "reward_std": 0.3353729024529457, + "rewards/accuracy_reward": 0.5609746873378754, + "rewards/format_reward": 0.9921875, + "step": 608 + }, + { + "completion_length": 83.7890625, + "epoch": 2.780821917808219, + "grad_norm": 4.337226867675781, + "kl": 0.117919921875, + "learning_rate": 7.219178082191781e-07, + "loss": 0.0047, + "reward": 1.7048035860061646, + "reward_std": 0.13883494585752487, + "rewards/accuracy_reward": 0.7048035860061646, + "rewards/format_reward": 1.0, + "step": 609 + }, + { + "completion_length": 87.3984375, + "epoch": 2.7853881278538815, + "grad_norm": 2.790894031524658, + "kl": 0.09716796875, + "learning_rate": 7.214611872146118e-07, + "loss": 0.0039, + "reward": 1.6720238327980042, + "reward_std": 0.16603849083185196, + "rewards/accuracy_reward": 0.6798363327980042, + "rewards/format_reward": 0.9921875, + "step": 610 + }, + { + "completion_length": 79.4140625, + "epoch": 2.7899543378995433, + "grad_norm": 4.233171463012695, + "kl": 0.1103515625, + "learning_rate": 7.210045662100456e-07, + "loss": 0.0044, + "reward": 1.6883246898651123, + "reward_std": 0.21825328469276428, + "rewards/accuracy_reward": 0.6961371600627899, + "rewards/format_reward": 0.9921875, + "step": 611 + }, + { + "completion_length": 63.875, + "epoch": 2.7945205479452055, + "grad_norm": 4.263243198394775, + "kl": 0.138671875, + "learning_rate": 7.205479452054795e-07, + "loss": 0.0055, + "reward": 1.6089910864830017, + "reward_std": 0.2958259731531143, + "rewards/accuracy_reward": 0.6246160566806793, + "rewards/format_reward": 0.984375, + "step": 612 + }, + { + "completion_length": 72.21875, + "epoch": 2.7990867579908674, + "grad_norm": 2.3356006145477295, + "kl": 0.110595703125, + "learning_rate": 7.200913242009132e-07, + "loss": 0.0044, + "reward": 1.5526910424232483, + "reward_std": 0.24793513119220734, + "rewards/accuracy_reward": 0.5526909977197647, + "rewards/format_reward": 1.0, + "step": 613 + }, + { + "completion_length": 100.8203125, + "epoch": 2.8036529680365296, + "grad_norm": 1.9555134773254395, + "kl": 0.098388671875, + "learning_rate": 7.19634703196347e-07, + "loss": 0.0039, + "reward": 1.7364583611488342, + "reward_std": 0.15703274309635162, + "rewards/accuracy_reward": 0.7442708015441895, + "rewards/format_reward": 0.9921875, + "step": 614 + }, + { + "completion_length": 71.8984375, + "epoch": 2.808219178082192, + "grad_norm": 5.0422539710998535, + "kl": 0.16455078125, + "learning_rate": 7.191780821917808e-07, + "loss": 0.0066, + "reward": 1.4983445405960083, + "reward_std": 0.2954416871070862, + "rewards/accuracy_reward": 0.5061569809913635, + "rewards/format_reward": 0.9921875, + "step": 615 + }, + { + "completion_length": 81.109375, + "epoch": 2.8127853881278537, + "grad_norm": 2.615528106689453, + "kl": 0.123779296875, + "learning_rate": 7.187214611872145e-07, + "loss": 0.005, + "reward": 1.685937523841858, + "reward_std": 0.16778654977679253, + "rewards/accuracy_reward": 0.6859375238418579, + "rewards/format_reward": 1.0, + "step": 616 + }, + { + "completion_length": 78.1171875, + "epoch": 2.817351598173516, + "grad_norm": 3.2558462619781494, + "kl": 0.099609375, + "learning_rate": 7.182648401826484e-07, + "loss": 0.004, + "reward": 1.7515625953674316, + "reward_std": 0.1005905494093895, + "rewards/accuracy_reward": 0.7593749463558197, + "rewards/format_reward": 0.9921875, + "step": 617 + }, + { + "completion_length": 85.3671875, + "epoch": 2.821917808219178, + "grad_norm": 1.791414499282837, + "kl": 0.127197265625, + "learning_rate": 7.178082191780822e-07, + "loss": 0.0051, + "reward": 1.6640625, + "reward_std": 0.16250330954790115, + "rewards/accuracy_reward": 0.6718749403953552, + "rewards/format_reward": 0.9921875, + "step": 618 + }, + { + "completion_length": 88.2578125, + "epoch": 2.82648401826484, + "grad_norm": 5.740970134735107, + "kl": 0.077392578125, + "learning_rate": 7.173515981735159e-07, + "loss": 0.0031, + "reward": 1.652303397655487, + "reward_std": 0.17974259704351425, + "rewards/accuracy_reward": 0.6523034274578094, + "rewards/format_reward": 1.0, + "step": 619 + }, + { + "completion_length": 95.4609375, + "epoch": 2.8310502283105023, + "grad_norm": 2.585156202316284, + "kl": 0.115234375, + "learning_rate": 7.168949771689498e-07, + "loss": 0.0046, + "reward": 1.6767844557762146, + "reward_std": 0.14024699479341507, + "rewards/accuracy_reward": 0.6845969557762146, + "rewards/format_reward": 0.9921875, + "step": 620 + }, + { + "completion_length": 80.734375, + "epoch": 2.8356164383561646, + "grad_norm": 6.572816848754883, + "kl": 0.138671875, + "learning_rate": 7.164383561643835e-07, + "loss": 0.0055, + "reward": 1.6025669574737549, + "reward_std": 0.22662456333637238, + "rewards/accuracy_reward": 0.6181919574737549, + "rewards/format_reward": 0.984375, + "step": 621 + }, + { + "completion_length": 62.4921875, + "epoch": 2.8401826484018264, + "grad_norm": 12.138286590576172, + "kl": 0.16650390625, + "learning_rate": 7.159817351598173e-07, + "loss": 0.0067, + "reward": 1.7107762694358826, + "reward_std": 0.16279632598161697, + "rewards/accuracy_reward": 0.7107762694358826, + "rewards/format_reward": 1.0, + "step": 622 + }, + { + "completion_length": 82.625, + "epoch": 2.8447488584474887, + "grad_norm": 4.130621910095215, + "kl": 0.110595703125, + "learning_rate": 7.155251141552511e-07, + "loss": 0.0044, + "reward": 1.792187511920929, + "reward_std": 0.12151552736759186, + "rewards/accuracy_reward": 0.7921874523162842, + "rewards/format_reward": 1.0, + "step": 623 + }, + { + "completion_length": 66.8203125, + "epoch": 2.8493150684931505, + "grad_norm": 7.18456506729126, + "kl": 0.12841796875, + "learning_rate": 7.150684931506848e-07, + "loss": 0.0051, + "reward": 1.5980710983276367, + "reward_std": 0.21035503596067429, + "rewards/accuracy_reward": 0.6058836281299591, + "rewards/format_reward": 0.9921875, + "step": 624 + }, + { + "completion_length": 92.1171875, + "epoch": 2.853881278538813, + "grad_norm": 5.2410759925842285, + "kl": 0.119140625, + "learning_rate": 7.146118721461188e-07, + "loss": 0.0048, + "reward": 1.8228118419647217, + "reward_std": 0.17064978182315826, + "rewards/accuracy_reward": 0.8306242823600769, + "rewards/format_reward": 0.9921875, + "step": 625 + }, + { + "completion_length": 100.2265625, + "epoch": 2.858447488584475, + "grad_norm": 12.312285423278809, + "kl": 0.086669921875, + "learning_rate": 7.141552511415525e-07, + "loss": 0.0035, + "reward": 1.5916666984558105, + "reward_std": 0.2057085707783699, + "rewards/accuracy_reward": 0.5994791686534882, + "rewards/format_reward": 0.9921875, + "step": 626 + }, + { + "completion_length": 94.7109375, + "epoch": 2.863013698630137, + "grad_norm": 2.4348156452178955, + "kl": 0.09423828125, + "learning_rate": 7.136986301369862e-07, + "loss": 0.0038, + "reward": 1.71042400598526, + "reward_std": 0.15129226446151733, + "rewards/accuracy_reward": 0.7104238867759705, + "rewards/format_reward": 1.0, + "step": 627 + }, + { + "completion_length": 76.09375, + "epoch": 2.867579908675799, + "grad_norm": 2.56550931930542, + "kl": 0.15283203125, + "learning_rate": 7.132420091324201e-07, + "loss": 0.0061, + "reward": 1.6796875, + "reward_std": 0.1692301705479622, + "rewards/accuracy_reward": 0.6796875298023224, + "rewards/format_reward": 1.0, + "step": 628 + }, + { + "completion_length": 63.71875, + "epoch": 2.872146118721461, + "grad_norm": 3.254136562347412, + "kl": 0.1396484375, + "learning_rate": 7.127853881278538e-07, + "loss": 0.0056, + "reward": 1.5118472576141357, + "reward_std": 0.22680865228176117, + "rewards/accuracy_reward": 0.5118472576141357, + "rewards/format_reward": 1.0, + "step": 629 + }, + { + "completion_length": 91.984375, + "epoch": 2.8767123287671232, + "grad_norm": 1.9678096771240234, + "kl": 0.118896484375, + "learning_rate": 7.123287671232876e-07, + "loss": 0.0047, + "reward": 1.7608563899993896, + "reward_std": 0.10874464362859726, + "rewards/accuracy_reward": 0.7608563899993896, + "rewards/format_reward": 1.0, + "step": 630 + }, + { + "completion_length": 80.421875, + "epoch": 2.8812785388127855, + "grad_norm": 3.203822374343872, + "kl": 0.14111328125, + "learning_rate": 7.118721461187215e-07, + "loss": 0.0056, + "reward": 1.5107174515724182, + "reward_std": 0.2804914563894272, + "rewards/accuracy_reward": 0.526342436671257, + "rewards/format_reward": 0.984375, + "step": 631 + }, + { + "completion_length": 69.8671875, + "epoch": 2.8858447488584473, + "grad_norm": 2.1312613487243652, + "kl": 0.1513671875, + "learning_rate": 7.114155251141552e-07, + "loss": 0.0061, + "reward": 1.783835530281067, + "reward_std": 0.09552156459540129, + "rewards/accuracy_reward": 0.7838355302810669, + "rewards/format_reward": 1.0, + "step": 632 + }, + { + "completion_length": 83.6796875, + "epoch": 2.8904109589041096, + "grad_norm": 2.534517765045166, + "kl": 0.129638671875, + "learning_rate": 7.109589041095891e-07, + "loss": 0.0052, + "reward": 1.820498526096344, + "reward_std": 0.14453133195638657, + "rewards/accuracy_reward": 0.8204984366893768, + "rewards/format_reward": 1.0, + "step": 633 + }, + { + "completion_length": 100.578125, + "epoch": 2.8949771689497714, + "grad_norm": 4.368940353393555, + "kl": 0.137451171875, + "learning_rate": 7.105022831050228e-07, + "loss": 0.0055, + "reward": 1.6578125357627869, + "reward_std": 0.2853652313351631, + "rewards/accuracy_reward": 0.6890624761581421, + "rewards/format_reward": 0.96875, + "step": 634 + }, + { + "completion_length": 91.734375, + "epoch": 2.8995433789954337, + "grad_norm": 2.2129926681518555, + "kl": 0.1015625, + "learning_rate": 7.100456621004565e-07, + "loss": 0.0041, + "reward": 1.60442715883255, + "reward_std": 0.17540115863084793, + "rewards/accuracy_reward": 0.6200520992279053, + "rewards/format_reward": 0.984375, + "step": 635 + }, + { + "completion_length": 71.65625, + "epoch": 2.904109589041096, + "grad_norm": 14.674283981323242, + "kl": 0.1591796875, + "learning_rate": 7.095890410958904e-07, + "loss": 0.0064, + "reward": 1.5524739027023315, + "reward_std": 0.24804671853780746, + "rewards/accuracy_reward": 0.5602864325046539, + "rewards/format_reward": 0.9921875, + "step": 636 + }, + { + "completion_length": 83.1484375, + "epoch": 2.908675799086758, + "grad_norm": 4.1890034675598145, + "kl": 0.119140625, + "learning_rate": 7.091324200913241e-07, + "loss": 0.0048, + "reward": 1.7505208253860474, + "reward_std": 0.1413591168820858, + "rewards/accuracy_reward": 0.7583333253860474, + "rewards/format_reward": 0.9921875, + "step": 637 + }, + { + "completion_length": 72.359375, + "epoch": 2.91324200913242, + "grad_norm": 4.4610443115234375, + "kl": 0.1669921875, + "learning_rate": 7.08675799086758e-07, + "loss": 0.0067, + "reward": 1.6082217693328857, + "reward_std": 0.3458182215690613, + "rewards/accuracy_reward": 0.647284209728241, + "rewards/format_reward": 0.9609375, + "step": 638 + }, + { + "completion_length": 86.0390625, + "epoch": 2.9178082191780823, + "grad_norm": 4.549093723297119, + "kl": 0.1357421875, + "learning_rate": 7.082191780821918e-07, + "loss": 0.0054, + "reward": 1.6874799728393555, + "reward_std": 0.2353600338101387, + "rewards/accuracy_reward": 0.6874799132347107, + "rewards/format_reward": 1.0, + "step": 639 + }, + { + "completion_length": 97.1484375, + "epoch": 2.922374429223744, + "grad_norm": 2.3335134983062744, + "kl": 0.12646484375, + "learning_rate": 7.077625570776255e-07, + "loss": 0.0051, + "reward": 1.7588170766830444, + "reward_std": 0.14432461559772491, + "rewards/accuracy_reward": 0.7666293978691101, + "rewards/format_reward": 0.9921875, + "step": 640 + }, + { + "completion_length": 77.0078125, + "epoch": 2.9269406392694064, + "grad_norm": 3.070441961288452, + "kl": 0.126220703125, + "learning_rate": 7.073059360730594e-07, + "loss": 0.005, + "reward": 1.4704504013061523, + "reward_std": 0.28115857392549515, + "rewards/accuracy_reward": 0.47826285660266876, + "rewards/format_reward": 0.9921875, + "step": 641 + }, + { + "completion_length": 92.9765625, + "epoch": 2.9315068493150687, + "grad_norm": 3.367562770843506, + "kl": 0.101806640625, + "learning_rate": 7.068493150684931e-07, + "loss": 0.0041, + "reward": 1.729687511920929, + "reward_std": 0.12863079458475113, + "rewards/accuracy_reward": 0.7374999523162842, + "rewards/format_reward": 0.9921875, + "step": 642 + }, + { + "completion_length": 96.390625, + "epoch": 2.9360730593607305, + "grad_norm": 2.8609812259674072, + "kl": 0.10302734375, + "learning_rate": 7.063926940639268e-07, + "loss": 0.0041, + "reward": 1.749218761920929, + "reward_std": 0.18227346241474152, + "rewards/accuracy_reward": 0.7648437321186066, + "rewards/format_reward": 0.984375, + "step": 643 + }, + { + "completion_length": 89.7734375, + "epoch": 2.9406392694063928, + "grad_norm": 1.9020359516143799, + "kl": 0.090087890625, + "learning_rate": 7.059360730593607e-07, + "loss": 0.0036, + "reward": 1.7804688215255737, + "reward_std": 0.11121231690049171, + "rewards/accuracy_reward": 0.7804686725139618, + "rewards/format_reward": 1.0, + "step": 644 + }, + { + "completion_length": 99.03125, + "epoch": 2.9452054794520546, + "grad_norm": 9.120920181274414, + "kl": 0.10595703125, + "learning_rate": 7.054794520547945e-07, + "loss": 0.0042, + "reward": 1.7126150131225586, + "reward_std": 0.20574645698070526, + "rewards/accuracy_reward": 0.7282399535179138, + "rewards/format_reward": 0.984375, + "step": 645 + }, + { + "completion_length": 74.2265625, + "epoch": 2.949771689497717, + "grad_norm": 5.297418594360352, + "kl": 0.133056640625, + "learning_rate": 7.050228310502283e-07, + "loss": 0.0053, + "reward": 1.6359771490097046, + "reward_std": 0.23064683377742767, + "rewards/accuracy_reward": 0.6672270596027374, + "rewards/format_reward": 0.96875, + "step": 646 + }, + { + "completion_length": 102.6796875, + "epoch": 2.954337899543379, + "grad_norm": 1.925435185432434, + "kl": 0.0947265625, + "learning_rate": 7.045662100456621e-07, + "loss": 0.0038, + "reward": 1.8184895515441895, + "reward_std": 0.14697792008519173, + "rewards/accuracy_reward": 0.8341145217418671, + "rewards/format_reward": 0.984375, + "step": 647 + }, + { + "completion_length": 93.6796875, + "epoch": 2.958904109589041, + "grad_norm": 8.05284309387207, + "kl": 0.119140625, + "learning_rate": 7.041095890410958e-07, + "loss": 0.0048, + "reward": 1.7860276699066162, + "reward_std": 0.13688677921891212, + "rewards/accuracy_reward": 0.793840080499649, + "rewards/format_reward": 0.9921875, + "step": 648 + }, + { + "completion_length": 89.421875, + "epoch": 2.963470319634703, + "grad_norm": 2.366201162338257, + "kl": 0.150390625, + "learning_rate": 7.036529680365297e-07, + "loss": 0.006, + "reward": 1.6971353888511658, + "reward_std": 0.16348732262849808, + "rewards/accuracy_reward": 0.7049478888511658, + "rewards/format_reward": 0.9921875, + "step": 649 + }, + { + "completion_length": 76.46875, + "epoch": 2.968036529680365, + "grad_norm": 2.8056042194366455, + "kl": 0.1259765625, + "learning_rate": 7.031963470319634e-07, + "loss": 0.0051, + "reward": 1.7359544038772583, + "reward_std": 0.1912137269973755, + "rewards/accuracy_reward": 0.7437668442726135, + "rewards/format_reward": 0.9921875, + "step": 650 + }, + { + "completion_length": 74.015625, + "epoch": 2.9726027397260273, + "grad_norm": 2.3145570755004883, + "kl": 0.11767578125, + "learning_rate": 7.027397260273972e-07, + "loss": 0.0047, + "reward": 1.5831072330474854, + "reward_std": 0.25112421810626984, + "rewards/accuracy_reward": 0.5987322330474854, + "rewards/format_reward": 0.984375, + "step": 651 + }, + { + "completion_length": 93.734375, + "epoch": 2.9771689497716896, + "grad_norm": 3.0146915912628174, + "kl": 0.117431640625, + "learning_rate": 7.022831050228311e-07, + "loss": 0.0047, + "reward": 1.6713745594024658, + "reward_std": 0.21526063233613968, + "rewards/accuracy_reward": 0.6791869699954987, + "rewards/format_reward": 0.9921875, + "step": 652 + }, + { + "completion_length": 65.5078125, + "epoch": 2.981735159817352, + "grad_norm": 2.2300150394439697, + "kl": 0.123046875, + "learning_rate": 7.018264840182648e-07, + "loss": 0.0049, + "reward": 1.690625011920929, + "reward_std": 0.16151440143585205, + "rewards/accuracy_reward": 0.690625011920929, + "rewards/format_reward": 1.0, + "step": 653 + }, + { + "completion_length": 78.2890625, + "epoch": 2.9863013698630136, + "grad_norm": 4.648037433624268, + "kl": 0.126708984375, + "learning_rate": 7.013698630136986e-07, + "loss": 0.0051, + "reward": 1.744691550731659, + "reward_std": 0.14016081020236015, + "rewards/accuracy_reward": 0.7603164613246918, + "rewards/format_reward": 0.984375, + "step": 654 + }, + { + "completion_length": 67.203125, + "epoch": 2.990867579908676, + "grad_norm": 2.895372152328491, + "kl": 0.139404296875, + "learning_rate": 7.009132420091324e-07, + "loss": 0.0056, + "reward": 1.6064826250076294, + "reward_std": 0.2562439739704132, + "rewards/accuracy_reward": 0.6142950057983398, + "rewards/format_reward": 0.9921875, + "step": 655 + }, + { + "completion_length": 78.765625, + "epoch": 2.9954337899543377, + "grad_norm": 3.7892091274261475, + "kl": 0.093017578125, + "learning_rate": 7.004566210045661e-07, + "loss": 0.0037, + "reward": 1.754687488079071, + "reward_std": 0.1408282183110714, + "rewards/accuracy_reward": 0.754687488079071, + "rewards/format_reward": 1.0, + "step": 656 + }, + { + "completion_length": 58.5, + "epoch": 3.0, + "grad_norm": 3.69757080078125, + "kl": 0.18359375, + "learning_rate": 7e-07, + "loss": 0.0056, + "reward": 1.5, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 657 + }, + { + "completion_length": 73.21875, + "epoch": 3.0045662100456623, + "grad_norm": 2.8494434356689453, + "kl": 0.12744140625, + "learning_rate": 6.995433789954338e-07, + "loss": 0.0051, + "reward": 1.7073844075202942, + "reward_std": 0.18213152885437012, + "rewards/accuracy_reward": 0.7151968777179718, + "rewards/format_reward": 0.9921875, + "step": 658 + }, + { + "completion_length": 73.71875, + "epoch": 3.009132420091324, + "grad_norm": 2.561201810836792, + "kl": 0.099609375, + "learning_rate": 6.990867579908675e-07, + "loss": 0.004, + "reward": 1.6927083730697632, + "reward_std": 0.17407646402716637, + "rewards/accuracy_reward": 0.6927083134651184, + "rewards/format_reward": 1.0, + "step": 659 + }, + { + "completion_length": 98.328125, + "epoch": 3.0136986301369864, + "grad_norm": 3.664125442504883, + "kl": 0.138671875, + "learning_rate": 6.986301369863014e-07, + "loss": 0.0055, + "reward": 1.6324912905693054, + "reward_std": 0.21703079342842102, + "rewards/accuracy_reward": 0.6403037309646606, + "rewards/format_reward": 0.9921875, + "step": 660 + }, + { + "completion_length": 94.46875, + "epoch": 3.018264840182648, + "grad_norm": 2.527571439743042, + "kl": 0.1181640625, + "learning_rate": 6.981735159817351e-07, + "loss": 0.0047, + "reward": 1.7589489221572876, + "reward_std": 0.16745695658028126, + "rewards/accuracy_reward": 0.7745738625526428, + "rewards/format_reward": 0.984375, + "step": 661 + }, + { + "completion_length": 82.1953125, + "epoch": 3.0228310502283104, + "grad_norm": 2.676431894302368, + "kl": 0.14599609375, + "learning_rate": 6.977168949771689e-07, + "loss": 0.0058, + "reward": 1.681512713432312, + "reward_std": 0.18309402465820312, + "rewards/accuracy_reward": 0.6971376836299896, + "rewards/format_reward": 0.984375, + "step": 662 + }, + { + "completion_length": 97.671875, + "epoch": 3.0273972602739727, + "grad_norm": 2.549351453781128, + "kl": 0.078125, + "learning_rate": 6.972602739726027e-07, + "loss": 0.0031, + "reward": 1.6531250476837158, + "reward_std": 0.24489019811153412, + "rewards/accuracy_reward": 0.6687500178813934, + "rewards/format_reward": 0.984375, + "step": 663 + }, + { + "completion_length": 76.59375, + "epoch": 3.0319634703196345, + "grad_norm": 1.8996092081069946, + "kl": 0.147216796875, + "learning_rate": 6.968036529680364e-07, + "loss": 0.0059, + "reward": 1.734002947807312, + "reward_std": 0.1543407365679741, + "rewards/accuracy_reward": 0.7496279180049896, + "rewards/format_reward": 0.984375, + "step": 664 + }, + { + "completion_length": 62.4453125, + "epoch": 3.036529680365297, + "grad_norm": 2.1863508224487305, + "kl": 0.1217041015625, + "learning_rate": 6.963470319634704e-07, + "loss": 0.0049, + "reward": 1.771484375, + "reward_std": 0.08306973986327648, + "rewards/accuracy_reward": 0.7714843153953552, + "rewards/format_reward": 1.0, + "step": 665 + }, + { + "completion_length": 79.71875, + "epoch": 3.041095890410959, + "grad_norm": 2.9316976070404053, + "kl": 0.129150390625, + "learning_rate": 6.958904109589041e-07, + "loss": 0.0052, + "reward": 1.6214410066604614, + "reward_std": 0.2264525145292282, + "rewards/accuracy_reward": 0.6370659470558167, + "rewards/format_reward": 0.984375, + "step": 666 + }, + { + "completion_length": 90.0546875, + "epoch": 3.045662100456621, + "grad_norm": 2.6280527114868164, + "kl": 0.101806640625, + "learning_rate": 6.954337899543378e-07, + "loss": 0.0041, + "reward": 1.7546875476837158, + "reward_std": 0.1054728776216507, + "rewards/accuracy_reward": 0.7546874284744263, + "rewards/format_reward": 1.0, + "step": 667 + }, + { + "completion_length": 79.453125, + "epoch": 3.050228310502283, + "grad_norm": 5.255128383636475, + "kl": 0.10546875, + "learning_rate": 6.949771689497717e-07, + "loss": 0.0042, + "reward": 1.5757898688316345, + "reward_std": 0.19272886961698532, + "rewards/accuracy_reward": 0.5836023390293121, + "rewards/format_reward": 0.9921875, + "step": 668 + }, + { + "completion_length": 96.90625, + "epoch": 3.0547945205479454, + "grad_norm": 2.3194217681884766, + "kl": 0.077392578125, + "learning_rate": 6.945205479452054e-07, + "loss": 0.0031, + "reward": 1.770312488079071, + "reward_std": 0.11202363669872284, + "rewards/accuracy_reward": 0.7703124582767487, + "rewards/format_reward": 1.0, + "step": 669 + }, + { + "completion_length": 80.734375, + "epoch": 3.0593607305936072, + "grad_norm": 3.282161235809326, + "kl": 0.163330078125, + "learning_rate": 6.940639269406392e-07, + "loss": 0.0065, + "reward": 1.7177083492279053, + "reward_std": 0.16301878169178963, + "rewards/accuracy_reward": 0.7177082896232605, + "rewards/format_reward": 1.0, + "step": 670 + }, + { + "completion_length": 83.0, + "epoch": 3.0639269406392695, + "grad_norm": 3.282697916030884, + "kl": 0.12744140625, + "learning_rate": 6.93607305936073e-07, + "loss": 0.0051, + "reward": 1.8043155074119568, + "reward_std": 0.11800633184611797, + "rewards/accuracy_reward": 0.8043154180049896, + "rewards/format_reward": 1.0, + "step": 671 + }, + { + "completion_length": 80.8046875, + "epoch": 3.0684931506849313, + "grad_norm": 2.545616388320923, + "kl": 0.09814453125, + "learning_rate": 6.931506849315068e-07, + "loss": 0.0039, + "reward": 1.7645833492279053, + "reward_std": 0.14983439445495605, + "rewards/accuracy_reward": 0.7802082598209381, + "rewards/format_reward": 0.984375, + "step": 672 + }, + { + "completion_length": 75.546875, + "epoch": 3.0730593607305936, + "grad_norm": 2.007420778274536, + "kl": 0.1240234375, + "learning_rate": 6.926940639269407e-07, + "loss": 0.005, + "reward": 1.5703125596046448, + "reward_std": 0.21254336833953857, + "rewards/accuracy_reward": 0.5703125, + "rewards/format_reward": 1.0, + "step": 673 + }, + { + "completion_length": 74.5234375, + "epoch": 3.077625570776256, + "grad_norm": 3.1193904876708984, + "kl": 0.119873046875, + "learning_rate": 6.922374429223744e-07, + "loss": 0.0048, + "reward": 1.6553664803504944, + "reward_std": 0.17046189308166504, + "rewards/accuracy_reward": 0.6631789207458496, + "rewards/format_reward": 0.9921875, + "step": 674 + }, + { + "completion_length": 85.1484375, + "epoch": 3.0821917808219177, + "grad_norm": 2.9060676097869873, + "kl": 0.105712890625, + "learning_rate": 6.917808219178081e-07, + "loss": 0.0042, + "reward": 1.5601562857627869, + "reward_std": 0.2604144960641861, + "rewards/accuracy_reward": 0.5835937559604645, + "rewards/format_reward": 0.9765625, + "step": 675 + }, + { + "completion_length": 92.7109375, + "epoch": 3.08675799086758, + "grad_norm": 3.6577494144439697, + "kl": 0.0640869140625, + "learning_rate": 6.91324200913242e-07, + "loss": 0.0026, + "reward": 1.6146825551986694, + "reward_std": 0.10726364329457283, + "rewards/accuracy_reward": 0.614682525396347, + "rewards/format_reward": 1.0, + "step": 676 + }, + { + "completion_length": 76.6875, + "epoch": 3.091324200913242, + "grad_norm": 9.413636207580566, + "kl": 0.13818359375, + "learning_rate": 6.908675799086757e-07, + "loss": 0.0055, + "reward": 1.6631696820259094, + "reward_std": 0.2051173821091652, + "rewards/accuracy_reward": 0.6709820926189423, + "rewards/format_reward": 0.9921875, + "step": 677 + }, + { + "completion_length": 75.2734375, + "epoch": 3.095890410958904, + "grad_norm": 5.669606685638428, + "kl": 0.119140625, + "learning_rate": 6.904109589041097e-07, + "loss": 0.0048, + "reward": 1.5031325817108154, + "reward_std": 0.2111596167087555, + "rewards/accuracy_reward": 0.5109450221061707, + "rewards/format_reward": 0.9921875, + "step": 678 + }, + { + "completion_length": 74.1796875, + "epoch": 3.1004566210045663, + "grad_norm": 2.975776433944702, + "kl": 0.1220703125, + "learning_rate": 6.899543378995434e-07, + "loss": 0.0049, + "reward": 1.567187488079071, + "reward_std": 0.26324766874313354, + "rewards/accuracy_reward": 0.582812488079071, + "rewards/format_reward": 0.984375, + "step": 679 + }, + { + "completion_length": 84.171875, + "epoch": 3.105022831050228, + "grad_norm": 2.021631956100464, + "kl": 0.09765625, + "learning_rate": 6.894977168949771e-07, + "loss": 0.0039, + "reward": 1.723046898841858, + "reward_std": 0.14361856132745743, + "rewards/accuracy_reward": 0.7230468392372131, + "rewards/format_reward": 1.0, + "step": 680 + }, + { + "completion_length": 82.0390625, + "epoch": 3.1095890410958904, + "grad_norm": 4.33884334564209, + "kl": 0.108642578125, + "learning_rate": 6.89041095890411e-07, + "loss": 0.0043, + "reward": 1.6755682229995728, + "reward_std": 0.11651190742850304, + "rewards/accuracy_reward": 0.6755681037902832, + "rewards/format_reward": 1.0, + "step": 681 + }, + { + "completion_length": 86.203125, + "epoch": 3.1141552511415527, + "grad_norm": 1.7685151100158691, + "kl": 0.099853515625, + "learning_rate": 6.885844748858447e-07, + "loss": 0.004, + "reward": 1.71484375, + "reward_std": 0.15246989578008652, + "rewards/accuracy_reward": 0.72265625, + "rewards/format_reward": 0.9921875, + "step": 682 + }, + { + "completion_length": 74.515625, + "epoch": 3.1187214611872145, + "grad_norm": 3.724857807159424, + "kl": 0.136962890625, + "learning_rate": 6.881278538812784e-07, + "loss": 0.0055, + "reward": 1.6691706776618958, + "reward_std": 0.20630130916833878, + "rewards/accuracy_reward": 0.6691707074642181, + "rewards/format_reward": 1.0, + "step": 683 + }, + { + "completion_length": 88.515625, + "epoch": 3.1232876712328768, + "grad_norm": 1.960091471672058, + "kl": 0.08740234375, + "learning_rate": 6.876712328767123e-07, + "loss": 0.0035, + "reward": 1.7096437811851501, + "reward_std": 0.1030731052160263, + "rewards/accuracy_reward": 0.709643691778183, + "rewards/format_reward": 1.0, + "step": 684 + }, + { + "completion_length": 69.921875, + "epoch": 3.127853881278539, + "grad_norm": 4.404357433319092, + "kl": 0.11376953125, + "learning_rate": 6.872146118721461e-07, + "loss": 0.0046, + "reward": 1.7164434790611267, + "reward_std": 0.16906945407390594, + "rewards/accuracy_reward": 0.7164434790611267, + "rewards/format_reward": 1.0, + "step": 685 + }, + { + "completion_length": 106.78125, + "epoch": 3.132420091324201, + "grad_norm": 1.5101971626281738, + "kl": 0.073974609375, + "learning_rate": 6.867579908675799e-07, + "loss": 0.003, + "reward": 1.798065423965454, + "reward_std": 0.0902152806520462, + "rewards/accuracy_reward": 0.8058778941631317, + "rewards/format_reward": 0.9921875, + "step": 686 + }, + { + "completion_length": 76.1953125, + "epoch": 3.136986301369863, + "grad_norm": 2.8091230392456055, + "kl": 0.16064453125, + "learning_rate": 6.863013698630137e-07, + "loss": 0.0064, + "reward": 1.5283854007720947, + "reward_std": 0.21707772463560104, + "rewards/accuracy_reward": 0.5283854007720947, + "rewards/format_reward": 1.0, + "step": 687 + }, + { + "completion_length": 87.125, + "epoch": 3.141552511415525, + "grad_norm": 6.1914591789245605, + "kl": 0.078857421875, + "learning_rate": 6.858447488584474e-07, + "loss": 0.0032, + "reward": 1.7078726291656494, + "reward_std": 0.15342054888606071, + "rewards/accuracy_reward": 0.7156850397586823, + "rewards/format_reward": 0.9921875, + "step": 688 + }, + { + "completion_length": 101.140625, + "epoch": 3.146118721461187, + "grad_norm": 2.486093521118164, + "kl": 0.076904296875, + "learning_rate": 6.853881278538813e-07, + "loss": 0.0031, + "reward": 1.8033654689788818, + "reward_std": 0.08035113476216793, + "rewards/accuracy_reward": 0.8033653199672699, + "rewards/format_reward": 1.0, + "step": 689 + }, + { + "completion_length": 72.546875, + "epoch": 3.1506849315068495, + "grad_norm": 4.52701473236084, + "kl": 0.13427734375, + "learning_rate": 6.84931506849315e-07, + "loss": 0.0054, + "reward": 1.9150173664093018, + "reward_std": 0.12145426124334335, + "rewards/accuracy_reward": 0.9228298366069794, + "rewards/format_reward": 0.9921875, + "step": 690 + }, + { + "completion_length": 77.0625, + "epoch": 3.1552511415525113, + "grad_norm": 2.9900949001312256, + "kl": 0.124755859375, + "learning_rate": 6.844748858447487e-07, + "loss": 0.005, + "reward": 1.6368862390518188, + "reward_std": 0.17363014817237854, + "rewards/accuracy_reward": 0.6446986198425293, + "rewards/format_reward": 0.9921875, + "step": 691 + }, + { + "completion_length": 91.0859375, + "epoch": 3.1598173515981736, + "grad_norm": 2.327423095703125, + "kl": 0.099609375, + "learning_rate": 6.840182648401827e-07, + "loss": 0.004, + "reward": 1.682031273841858, + "reward_std": 0.17397383973002434, + "rewards/accuracy_reward": 0.7054687440395355, + "rewards/format_reward": 0.9765625, + "step": 692 + }, + { + "completion_length": 67.2578125, + "epoch": 3.1643835616438354, + "grad_norm": 2.4474422931671143, + "kl": 0.133544921875, + "learning_rate": 6.835616438356164e-07, + "loss": 0.0053, + "reward": 1.6786458492279053, + "reward_std": 0.16388440132141113, + "rewards/accuracy_reward": 0.6786458194255829, + "rewards/format_reward": 1.0, + "step": 693 + }, + { + "completion_length": 83.015625, + "epoch": 3.1689497716894977, + "grad_norm": 12.002235412597656, + "kl": 0.1435546875, + "learning_rate": 6.831050228310502e-07, + "loss": 0.0057, + "reward": 1.6534380912780762, + "reward_std": 0.19905856251716614, + "rewards/accuracy_reward": 0.6534381806850433, + "rewards/format_reward": 1.0, + "step": 694 + }, + { + "completion_length": 72.671875, + "epoch": 3.17351598173516, + "grad_norm": 4.494046211242676, + "kl": 0.127197265625, + "learning_rate": 6.82648401826484e-07, + "loss": 0.0051, + "reward": 1.8556300401687622, + "reward_std": 0.14538883790373802, + "rewards/accuracy_reward": 0.8556298911571503, + "rewards/format_reward": 1.0, + "step": 695 + }, + { + "completion_length": 86.609375, + "epoch": 3.1780821917808217, + "grad_norm": 1.9116896390914917, + "kl": 0.103759765625, + "learning_rate": 6.821917808219177e-07, + "loss": 0.0041, + "reward": 1.7070313096046448, + "reward_std": 0.19209937751293182, + "rewards/accuracy_reward": 0.7148437201976776, + "rewards/format_reward": 0.9921875, + "step": 696 + }, + { + "completion_length": 87.53125, + "epoch": 3.182648401826484, + "grad_norm": 6.58709192276001, + "kl": 0.1005859375, + "learning_rate": 6.817351598173516e-07, + "loss": 0.004, + "reward": 1.7153645753860474, + "reward_std": 0.13286828622221947, + "rewards/accuracy_reward": 0.7153644859790802, + "rewards/format_reward": 1.0, + "step": 697 + }, + { + "completion_length": 70.234375, + "epoch": 3.1872146118721463, + "grad_norm": 2.434600591659546, + "kl": 0.145751953125, + "learning_rate": 6.812785388127854e-07, + "loss": 0.0058, + "reward": 1.7907050848007202, + "reward_std": 0.12088606879115105, + "rewards/accuracy_reward": 0.7907051146030426, + "rewards/format_reward": 1.0, + "step": 698 + }, + { + "completion_length": 68.5, + "epoch": 3.191780821917808, + "grad_norm": 26.128162384033203, + "kl": 0.126953125, + "learning_rate": 6.808219178082191e-07, + "loss": 0.0051, + "reward": 1.715624988079071, + "reward_std": 0.2467075139284134, + "rewards/accuracy_reward": 0.715624988079071, + "rewards/format_reward": 1.0, + "step": 699 + }, + { + "completion_length": 87.078125, + "epoch": 3.1963470319634704, + "grad_norm": 3.1288247108459473, + "kl": 0.14404296875, + "learning_rate": 6.80365296803653e-07, + "loss": 0.0057, + "reward": 1.860937476158142, + "reward_std": 0.08299508690834045, + "rewards/accuracy_reward": 0.8687499463558197, + "rewards/format_reward": 0.9921875, + "step": 700 + }, + { + "completion_length": 67.5, + "epoch": 3.2009132420091326, + "grad_norm": 7.457474708557129, + "kl": 0.1416015625, + "learning_rate": 6.799086757990867e-07, + "loss": 0.0057, + "reward": 1.6931147575378418, + "reward_std": 0.18138662725687027, + "rewards/accuracy_reward": 0.6931147575378418, + "rewards/format_reward": 1.0, + "step": 701 + }, + { + "completion_length": 78.4921875, + "epoch": 3.2054794520547945, + "grad_norm": 3.00228214263916, + "kl": 0.182861328125, + "learning_rate": 6.794520547945205e-07, + "loss": 0.0073, + "reward": 1.7272436618804932, + "reward_std": 0.18743212521076202, + "rewards/accuracy_reward": 0.727243572473526, + "rewards/format_reward": 1.0, + "step": 702 + }, + { + "completion_length": 83.515625, + "epoch": 3.2100456621004567, + "grad_norm": 2.2364189624786377, + "kl": 0.10693359375, + "learning_rate": 6.789954337899543e-07, + "loss": 0.0043, + "reward": 1.7500601410865784, + "reward_std": 0.10543964058160782, + "rewards/accuracy_reward": 0.7500600218772888, + "rewards/format_reward": 1.0, + "step": 703 + }, + { + "completion_length": 59.78125, + "epoch": 3.2146118721461185, + "grad_norm": 15.17089557647705, + "kl": 0.64404296875, + "learning_rate": 6.78538812785388e-07, + "loss": 0.0258, + "reward": 1.5231274366378784, + "reward_std": 0.3583277612924576, + "rewards/accuracy_reward": 0.5309399664402008, + "rewards/format_reward": 0.9921875, + "step": 704 + }, + { + "completion_length": 85.2734375, + "epoch": 3.219178082191781, + "grad_norm": 1.857921838760376, + "kl": 0.121826171875, + "learning_rate": 6.78082191780822e-07, + "loss": 0.0049, + "reward": 1.8244792222976685, + "reward_std": 0.08049174584448338, + "rewards/accuracy_reward": 0.8244791626930237, + "rewards/format_reward": 1.0, + "step": 705 + }, + { + "completion_length": 81.7578125, + "epoch": 3.223744292237443, + "grad_norm": 6.063529014587402, + "kl": 0.11328125, + "learning_rate": 6.776255707762557e-07, + "loss": 0.0045, + "reward": 1.7606770992279053, + "reward_std": 0.21317215263843536, + "rewards/accuracy_reward": 0.7684895098209381, + "rewards/format_reward": 0.9921875, + "step": 706 + }, + { + "completion_length": 86.984375, + "epoch": 3.228310502283105, + "grad_norm": 1.6526107788085938, + "kl": 0.1376953125, + "learning_rate": 6.771689497716894e-07, + "loss": 0.0055, + "reward": 1.6617188453674316, + "reward_std": 0.175977885723114, + "rewards/accuracy_reward": 0.6695312261581421, + "rewards/format_reward": 0.9921875, + "step": 707 + }, + { + "completion_length": 90.5234375, + "epoch": 3.232876712328767, + "grad_norm": 3.4753360748291016, + "kl": 0.090087890625, + "learning_rate": 6.767123287671233e-07, + "loss": 0.0036, + "reward": 1.7570313215255737, + "reward_std": 0.09644587151706219, + "rewards/accuracy_reward": 0.7570312023162842, + "rewards/format_reward": 1.0, + "step": 708 + }, + { + "completion_length": 78.34375, + "epoch": 3.237442922374429, + "grad_norm": 5.928980350494385, + "kl": 0.1376953125, + "learning_rate": 6.76255707762557e-07, + "loss": 0.0055, + "reward": 1.7825521230697632, + "reward_std": 0.19306360930204391, + "rewards/accuracy_reward": 0.7825520932674408, + "rewards/format_reward": 1.0, + "step": 709 + }, + { + "completion_length": 78.015625, + "epoch": 3.2420091324200913, + "grad_norm": 5.787877559661865, + "kl": 0.098876953125, + "learning_rate": 6.757990867579907e-07, + "loss": 0.004, + "reward": 1.6989798545837402, + "reward_std": 0.22939839959144592, + "rewards/accuracy_reward": 0.7146047651767731, + "rewards/format_reward": 0.984375, + "step": 710 + }, + { + "completion_length": 73.984375, + "epoch": 3.2465753424657535, + "grad_norm": 1.9611449241638184, + "kl": 0.135986328125, + "learning_rate": 6.753424657534246e-07, + "loss": 0.0054, + "reward": 1.7150809168815613, + "reward_std": 0.12537125870585442, + "rewards/accuracy_reward": 0.7228934466838837, + "rewards/format_reward": 0.9921875, + "step": 711 + }, + { + "completion_length": 76.640625, + "epoch": 3.2511415525114153, + "grad_norm": 2.202395439147949, + "kl": 0.15673828125, + "learning_rate": 6.748858447488584e-07, + "loss": 0.0063, + "reward": 1.7408854365348816, + "reward_std": 0.18467864021658897, + "rewards/accuracy_reward": 0.7486979365348816, + "rewards/format_reward": 0.9921875, + "step": 712 + }, + { + "completion_length": 91.578125, + "epoch": 3.2557077625570776, + "grad_norm": 2.922081232070923, + "kl": 0.075439453125, + "learning_rate": 6.744292237442923e-07, + "loss": 0.003, + "reward": 1.6480501890182495, + "reward_std": 0.08936248533427715, + "rewards/accuracy_reward": 0.6480501890182495, + "rewards/format_reward": 1.0, + "step": 713 + }, + { + "completion_length": 97.6953125, + "epoch": 3.26027397260274, + "grad_norm": 1.8138573169708252, + "kl": 0.0810546875, + "learning_rate": 6.73972602739726e-07, + "loss": 0.0032, + "reward": 1.678125023841858, + "reward_std": 0.17582245916128159, + "rewards/accuracy_reward": 0.7015624642372131, + "rewards/format_reward": 0.9765625, + "step": 714 + }, + { + "completion_length": 81.609375, + "epoch": 3.2648401826484017, + "grad_norm": 2.29533052444458, + "kl": 0.117919921875, + "learning_rate": 6.735159817351597e-07, + "loss": 0.0047, + "reward": 1.5667868852615356, + "reward_std": 0.23160236328840256, + "rewards/accuracy_reward": 0.5902243554592133, + "rewards/format_reward": 0.9765625, + "step": 715 + }, + { + "completion_length": 79.046875, + "epoch": 3.269406392694064, + "grad_norm": 3.5871529579162598, + "kl": 0.1748046875, + "learning_rate": 6.730593607305936e-07, + "loss": 0.007, + "reward": 1.5300781726837158, + "reward_std": 0.381600484251976, + "rewards/accuracy_reward": 0.592578113079071, + "rewards/format_reward": 0.9375, + "step": 716 + }, + { + "completion_length": 81.21875, + "epoch": 3.2739726027397262, + "grad_norm": 7.025841236114502, + "kl": 0.177734375, + "learning_rate": 6.726027397260273e-07, + "loss": 0.0071, + "reward": 1.5895833373069763, + "reward_std": 0.3286707103252411, + "rewards/accuracy_reward": 0.6442708373069763, + "rewards/format_reward": 0.9453125, + "step": 717 + }, + { + "completion_length": 79.859375, + "epoch": 3.278538812785388, + "grad_norm": 3.0471503734588623, + "kl": 0.099609375, + "learning_rate": 6.721461187214613e-07, + "loss": 0.004, + "reward": 1.6017058491706848, + "reward_std": 0.18348699063062668, + "rewards/accuracy_reward": 0.6095183193683624, + "rewards/format_reward": 0.9921875, + "step": 718 + }, + { + "completion_length": 78.8515625, + "epoch": 3.2831050228310503, + "grad_norm": 2.4444265365600586, + "kl": 0.09912109375, + "learning_rate": 6.71689497716895e-07, + "loss": 0.004, + "reward": 1.5035117268562317, + "reward_std": 0.2922291085124016, + "rewards/accuracy_reward": 0.5191366672515869, + "rewards/format_reward": 0.984375, + "step": 719 + }, + { + "completion_length": 80.515625, + "epoch": 3.287671232876712, + "grad_norm": 4.203682899475098, + "kl": 0.11962890625, + "learning_rate": 6.712328767123287e-07, + "loss": 0.0048, + "reward": 1.5486140251159668, + "reward_std": 0.18936936557292938, + "rewards/accuracy_reward": 0.5564264357089996, + "rewards/format_reward": 0.9921875, + "step": 720 + }, + { + "completion_length": 90.1015625, + "epoch": 3.2922374429223744, + "grad_norm": 3.4824612140655518, + "kl": 0.103759765625, + "learning_rate": 6.707762557077626e-07, + "loss": 0.0042, + "reward": 1.7547819018363953, + "reward_std": 0.14520251005887985, + "rewards/accuracy_reward": 0.7782192826271057, + "rewards/format_reward": 0.9765625, + "step": 721 + }, + { + "completion_length": 71.7734375, + "epoch": 3.2968036529680367, + "grad_norm": 1.9310115575790405, + "kl": 0.18310546875, + "learning_rate": 6.703196347031963e-07, + "loss": 0.0073, + "reward": 1.494028091430664, + "reward_std": 0.24336419254541397, + "rewards/accuracy_reward": 0.5174656212329865, + "rewards/format_reward": 0.9765625, + "step": 722 + }, + { + "completion_length": 87.65625, + "epoch": 3.3013698630136985, + "grad_norm": 2.703258991241455, + "kl": 0.1064453125, + "learning_rate": 6.6986301369863e-07, + "loss": 0.0043, + "reward": 1.6925916075706482, + "reward_std": 0.22123625874519348, + "rewards/accuracy_reward": 0.7238415777683258, + "rewards/format_reward": 0.96875, + "step": 723 + }, + { + "completion_length": 86.796875, + "epoch": 3.3059360730593608, + "grad_norm": 1.1868500709533691, + "kl": 0.10107421875, + "learning_rate": 6.694063926940639e-07, + "loss": 0.004, + "reward": 1.868213951587677, + "reward_std": 0.16314804006833583, + "rewards/accuracy_reward": 0.8916513323783875, + "rewards/format_reward": 0.9765625, + "step": 724 + }, + { + "completion_length": 89.765625, + "epoch": 3.3105022831050226, + "grad_norm": 1.9978954792022705, + "kl": 0.09814453125, + "learning_rate": 6.689497716894977e-07, + "loss": 0.0039, + "reward": 1.6983258724212646, + "reward_std": 0.1776369959115982, + "rewards/accuracy_reward": 0.7139508724212646, + "rewards/format_reward": 0.984375, + "step": 725 + }, + { + "completion_length": 82.3515625, + "epoch": 3.315068493150685, + "grad_norm": 2.7241666316986084, + "kl": 0.0947265625, + "learning_rate": 6.684931506849316e-07, + "loss": 0.0038, + "reward": 1.6841517686843872, + "reward_std": 0.20553293824195862, + "rewards/accuracy_reward": 0.6997767686843872, + "rewards/format_reward": 0.984375, + "step": 726 + }, + { + "completion_length": 66.3515625, + "epoch": 3.319634703196347, + "grad_norm": 2.9195847511291504, + "kl": 0.1630859375, + "learning_rate": 6.680365296803653e-07, + "loss": 0.0065, + "reward": 1.7539063096046448, + "reward_std": 0.18009010702371597, + "rewards/accuracy_reward": 0.7617186903953552, + "rewards/format_reward": 0.9921875, + "step": 727 + }, + { + "completion_length": 75.4765625, + "epoch": 3.324200913242009, + "grad_norm": 2.223017930984497, + "kl": 0.091064453125, + "learning_rate": 6.67579908675799e-07, + "loss": 0.0036, + "reward": 1.535528302192688, + "reward_std": 0.24083293601870537, + "rewards/accuracy_reward": 0.5667782425880432, + "rewards/format_reward": 0.96875, + "step": 728 + }, + { + "completion_length": 78.046875, + "epoch": 3.328767123287671, + "grad_norm": 2.5224952697753906, + "kl": 0.102783203125, + "learning_rate": 6.671232876712329e-07, + "loss": 0.0041, + "reward": 1.732812523841858, + "reward_std": 0.21059568226337433, + "rewards/accuracy_reward": 0.7640624642372131, + "rewards/format_reward": 0.96875, + "step": 729 + }, + { + "completion_length": 71.8203125, + "epoch": 3.3333333333333335, + "grad_norm": 2.83807110786438, + "kl": 0.10498046875, + "learning_rate": 6.666666666666666e-07, + "loss": 0.0042, + "reward": 1.579541265964508, + "reward_std": 0.26198120415210724, + "rewards/accuracy_reward": 0.5951661765575409, + "rewards/format_reward": 0.984375, + "step": 730 + }, + { + "completion_length": 63.3125, + "epoch": 3.3378995433789953, + "grad_norm": 3.6584157943725586, + "kl": 0.16015625, + "learning_rate": 6.662100456621003e-07, + "loss": 0.0064, + "reward": 1.7500744462013245, + "reward_std": 0.19155671447515488, + "rewards/accuracy_reward": 0.7500744163990021, + "rewards/format_reward": 1.0, + "step": 731 + }, + { + "completion_length": 56.4765625, + "epoch": 3.3424657534246576, + "grad_norm": 2.3448262214660645, + "kl": 0.13818359375, + "learning_rate": 6.657534246575343e-07, + "loss": 0.0055, + "reward": 1.6341642141342163, + "reward_std": 0.20584679394960403, + "rewards/accuracy_reward": 0.6576017141342163, + "rewards/format_reward": 0.9765625, + "step": 732 + }, + { + "completion_length": 56.953125, + "epoch": 3.34703196347032, + "grad_norm": 2.6592612266540527, + "kl": 0.1484375, + "learning_rate": 6.65296803652968e-07, + "loss": 0.0059, + "reward": 1.694618046283722, + "reward_std": 0.17367403209209442, + "rewards/accuracy_reward": 0.6946180760860443, + "rewards/format_reward": 1.0, + "step": 733 + }, + { + "completion_length": 86.1640625, + "epoch": 3.3515981735159817, + "grad_norm": 1.7500590085983276, + "kl": 0.128662109375, + "learning_rate": 6.648401826484019e-07, + "loss": 0.0051, + "reward": 1.7081771492958069, + "reward_std": 0.11748043447732925, + "rewards/accuracy_reward": 0.7159895300865173, + "rewards/format_reward": 0.9921875, + "step": 734 + }, + { + "completion_length": 73.1171875, + "epoch": 3.356164383561644, + "grad_norm": 2.0890016555786133, + "kl": 0.111328125, + "learning_rate": 6.643835616438356e-07, + "loss": 0.0045, + "reward": 1.669720709323883, + "reward_std": 0.14257927983999252, + "rewards/accuracy_reward": 0.6775331199169159, + "rewards/format_reward": 0.9921875, + "step": 735 + }, + { + "completion_length": 75.4453125, + "epoch": 3.3607305936073057, + "grad_norm": 1.7575141191482544, + "kl": 0.10302734375, + "learning_rate": 6.639269406392693e-07, + "loss": 0.0041, + "reward": 1.760156273841858, + "reward_std": 0.1406225487589836, + "rewards/accuracy_reward": 0.7679686844348907, + "rewards/format_reward": 0.9921875, + "step": 736 + }, + { + "completion_length": 79.0625, + "epoch": 3.365296803652968, + "grad_norm": 1.9857916831970215, + "kl": 0.0750732421875, + "learning_rate": 6.634703196347032e-07, + "loss": 0.003, + "reward": 1.8437398672103882, + "reward_std": 0.10023375414311886, + "rewards/accuracy_reward": 0.8515522480010986, + "rewards/format_reward": 0.9921875, + "step": 737 + }, + { + "completion_length": 43.953125, + "epoch": 3.3698630136986303, + "grad_norm": 1.8931981325149536, + "kl": 0.23876953125, + "learning_rate": 6.63013698630137e-07, + "loss": 0.0096, + "reward": 1.777430534362793, + "reward_std": 0.2545855790376663, + "rewards/accuracy_reward": 0.808680534362793, + "rewards/format_reward": 0.96875, + "step": 738 + }, + { + "completion_length": 66.671875, + "epoch": 3.374429223744292, + "grad_norm": 4.250927925109863, + "kl": 0.11474609375, + "learning_rate": 6.625570776255707e-07, + "loss": 0.0046, + "reward": 1.5928385257720947, + "reward_std": 0.13594963401556015, + "rewards/accuracy_reward": 0.5928385257720947, + "rewards/format_reward": 1.0, + "step": 739 + }, + { + "completion_length": 59.28125, + "epoch": 3.3789954337899544, + "grad_norm": 1.9088664054870605, + "kl": 0.114990234375, + "learning_rate": 6.621004566210046e-07, + "loss": 0.0046, + "reward": 1.5951822996139526, + "reward_std": 0.1505616046488285, + "rewards/accuracy_reward": 0.5951822698116302, + "rewards/format_reward": 1.0, + "step": 740 + }, + { + "completion_length": 74.8984375, + "epoch": 3.383561643835616, + "grad_norm": 2.5536303520202637, + "kl": 0.126953125, + "learning_rate": 6.616438356164383e-07, + "loss": 0.0051, + "reward": 1.6819568276405334, + "reward_std": 0.1260900031775236, + "rewards/accuracy_reward": 0.6819568276405334, + "rewards/format_reward": 1.0, + "step": 741 + }, + { + "completion_length": 81.7421875, + "epoch": 3.3881278538812785, + "grad_norm": 1.9142627716064453, + "kl": 0.091064453125, + "learning_rate": 6.61187214611872e-07, + "loss": 0.0036, + "reward": 1.7414063215255737, + "reward_std": 0.15430963411927223, + "rewards/accuracy_reward": 0.7492187023162842, + "rewards/format_reward": 0.9921875, + "step": 742 + }, + { + "completion_length": 63.8828125, + "epoch": 3.3926940639269407, + "grad_norm": 2.298887252807617, + "kl": 0.15625, + "learning_rate": 6.607305936073059e-07, + "loss": 0.0063, + "reward": 1.6809749007225037, + "reward_std": 0.13766025006771088, + "rewards/accuracy_reward": 0.6809749007225037, + "rewards/format_reward": 1.0, + "step": 743 + }, + { + "completion_length": 62.3046875, + "epoch": 3.3972602739726026, + "grad_norm": 2.9588382244110107, + "kl": 0.13916015625, + "learning_rate": 6.602739726027396e-07, + "loss": 0.0056, + "reward": 1.668749988079071, + "reward_std": 0.28636179864406586, + "rewards/accuracy_reward": 0.692187488079071, + "rewards/format_reward": 0.9765625, + "step": 744 + }, + { + "completion_length": 58.328125, + "epoch": 3.401826484018265, + "grad_norm": 2.7415127754211426, + "kl": 0.130126953125, + "learning_rate": 6.598173515981736e-07, + "loss": 0.0052, + "reward": 1.755094826221466, + "reward_std": 0.2010849490761757, + "rewards/accuracy_reward": 0.7707198262214661, + "rewards/format_reward": 0.984375, + "step": 745 + }, + { + "completion_length": 61.15625, + "epoch": 3.406392694063927, + "grad_norm": 2.1522672176361084, + "kl": 0.138427734375, + "learning_rate": 6.593607305936073e-07, + "loss": 0.0055, + "reward": 1.779687523841858, + "reward_std": 0.1337989792227745, + "rewards/accuracy_reward": 0.7796874344348907, + "rewards/format_reward": 1.0, + "step": 746 + }, + { + "completion_length": 93.4296875, + "epoch": 3.410958904109589, + "grad_norm": 2.299729824066162, + "kl": 0.127197265625, + "learning_rate": 6.58904109589041e-07, + "loss": 0.0051, + "reward": 1.787500023841858, + "reward_std": 0.12756995856761932, + "rewards/accuracy_reward": 0.8031249642372131, + "rewards/format_reward": 0.984375, + "step": 747 + }, + { + "completion_length": 72.625, + "epoch": 3.415525114155251, + "grad_norm": 3.2953529357910156, + "kl": 0.13623046875, + "learning_rate": 6.584474885844749e-07, + "loss": 0.0055, + "reward": 1.5497395992279053, + "reward_std": 0.2357780486345291, + "rewards/accuracy_reward": 0.5575520694255829, + "rewards/format_reward": 0.9921875, + "step": 748 + }, + { + "completion_length": 66.1015625, + "epoch": 3.4200913242009134, + "grad_norm": 3.811732530593872, + "kl": 0.15625, + "learning_rate": 6.579908675799086e-07, + "loss": 0.0063, + "reward": 1.465334177017212, + "reward_std": 0.281493678689003, + "rewards/accuracy_reward": 0.48095911741256714, + "rewards/format_reward": 0.984375, + "step": 749 + }, + { + "completion_length": 78.1171875, + "epoch": 3.4246575342465753, + "grad_norm": 6.028533458709717, + "kl": 0.14501953125, + "learning_rate": 6.575342465753423e-07, + "loss": 0.0058, + "reward": 1.612395167350769, + "reward_std": 0.1732819825410843, + "rewards/accuracy_reward": 0.6202076524496078, + "rewards/format_reward": 0.9921875, + "step": 750 + }, + { + "completion_length": 80.921875, + "epoch": 3.4292237442922375, + "grad_norm": 2.363414764404297, + "kl": 0.10107421875, + "learning_rate": 6.570776255707762e-07, + "loss": 0.004, + "reward": 1.7257813215255737, + "reward_std": 0.15576134249567986, + "rewards/accuracy_reward": 0.7257812023162842, + "rewards/format_reward": 1.0, + "step": 751 + }, + { + "completion_length": 60.625, + "epoch": 3.4337899543378994, + "grad_norm": 2.295811176300049, + "kl": 0.1298828125, + "learning_rate": 6.5662100456621e-07, + "loss": 0.0052, + "reward": 1.8014204502105713, + "reward_std": 0.14195573329925537, + "rewards/accuracy_reward": 0.8092329502105713, + "rewards/format_reward": 0.9921875, + "step": 752 + }, + { + "completion_length": 79.6875, + "epoch": 3.4383561643835616, + "grad_norm": 1.9623810052871704, + "kl": 0.09326171875, + "learning_rate": 6.561643835616439e-07, + "loss": 0.0037, + "reward": 1.654836356639862, + "reward_std": 0.16391075402498245, + "rewards/accuracy_reward": 0.6704612970352173, + "rewards/format_reward": 0.984375, + "step": 753 + }, + { + "completion_length": 80.84375, + "epoch": 3.442922374429224, + "grad_norm": 2.4038403034210205, + "kl": 0.117919921875, + "learning_rate": 6.557077625570776e-07, + "loss": 0.0047, + "reward": 1.7968750596046448, + "reward_std": 0.10437997803092003, + "rewards/accuracy_reward": 0.7968749105930328, + "rewards/format_reward": 1.0, + "step": 754 + }, + { + "completion_length": 72.078125, + "epoch": 3.4474885844748857, + "grad_norm": 1.999448299407959, + "kl": 0.11962890625, + "learning_rate": 6.552511415525113e-07, + "loss": 0.0048, + "reward": 1.829807698726654, + "reward_std": 0.07720155641436577, + "rewards/accuracy_reward": 0.8298076391220093, + "rewards/format_reward": 1.0, + "step": 755 + }, + { + "completion_length": 75.8828125, + "epoch": 3.452054794520548, + "grad_norm": 1.9827911853790283, + "kl": 0.0849609375, + "learning_rate": 6.547945205479452e-07, + "loss": 0.0034, + "reward": 1.7932292222976685, + "reward_std": 0.14317410439252853, + "rewards/accuracy_reward": 0.8088541030883789, + "rewards/format_reward": 0.984375, + "step": 756 + }, + { + "completion_length": 77.359375, + "epoch": 3.45662100456621, + "grad_norm": 2.601567268371582, + "kl": 0.118896484375, + "learning_rate": 6.543378995433789e-07, + "loss": 0.0048, + "reward": 1.7324219346046448, + "reward_std": 0.15861116349697113, + "rewards/accuracy_reward": 0.740234375, + "rewards/format_reward": 0.9921875, + "step": 757 + }, + { + "completion_length": 83.2421875, + "epoch": 3.461187214611872, + "grad_norm": 2.38321852684021, + "kl": 0.133056640625, + "learning_rate": 6.538812785388129e-07, + "loss": 0.0053, + "reward": 1.6742457747459412, + "reward_std": 0.18774619698524475, + "rewards/accuracy_reward": 0.6820583343505859, + "rewards/format_reward": 0.9921875, + "step": 758 + }, + { + "completion_length": 80.3671875, + "epoch": 3.4657534246575343, + "grad_norm": 1.9422838687896729, + "kl": 0.128662109375, + "learning_rate": 6.534246575342466e-07, + "loss": 0.0051, + "reward": 1.7861049175262451, + "reward_std": 0.09223857149481773, + "rewards/accuracy_reward": 0.7861048579216003, + "rewards/format_reward": 1.0, + "step": 759 + }, + { + "completion_length": 77.8125, + "epoch": 3.470319634703196, + "grad_norm": 2.641580820083618, + "kl": 0.119873046875, + "learning_rate": 6.529680365296803e-07, + "loss": 0.0048, + "reward": 1.6968767046928406, + "reward_std": 0.2295239269733429, + "rewards/accuracy_reward": 0.7046891748905182, + "rewards/format_reward": 0.9921875, + "step": 760 + }, + { + "completion_length": 73.4375, + "epoch": 3.4748858447488584, + "grad_norm": 2.904656171798706, + "kl": 0.12744140625, + "learning_rate": 6.525114155251142e-07, + "loss": 0.0051, + "reward": 1.5930989980697632, + "reward_std": 0.1885790079832077, + "rewards/accuracy_reward": 0.5930989682674408, + "rewards/format_reward": 1.0, + "step": 761 + }, + { + "completion_length": 78.75, + "epoch": 3.4794520547945207, + "grad_norm": 2.564729690551758, + "kl": 0.1591796875, + "learning_rate": 6.520547945205479e-07, + "loss": 0.0064, + "reward": 1.7561274766921997, + "reward_std": 0.1594456396996975, + "rewards/accuracy_reward": 0.7639399170875549, + "rewards/format_reward": 0.9921875, + "step": 762 + }, + { + "completion_length": 85.6953125, + "epoch": 3.4840182648401825, + "grad_norm": 4.693384170532227, + "kl": 0.117919921875, + "learning_rate": 6.515981735159816e-07, + "loss": 0.0047, + "reward": 1.4937500357627869, + "reward_std": 0.26740533858537674, + "rewards/accuracy_reward": 0.5093749910593033, + "rewards/format_reward": 0.984375, + "step": 763 + }, + { + "completion_length": 74.28125, + "epoch": 3.4885844748858448, + "grad_norm": 3.116814136505127, + "kl": 0.13037109375, + "learning_rate": 6.511415525114155e-07, + "loss": 0.0052, + "reward": 1.760156273841858, + "reward_std": 0.17397862672805786, + "rewards/accuracy_reward": 0.7679687440395355, + "rewards/format_reward": 0.9921875, + "step": 764 + }, + { + "completion_length": 72.8671875, + "epoch": 3.493150684931507, + "grad_norm": 4.106734752655029, + "kl": 0.319091796875, + "learning_rate": 6.506849315068493e-07, + "loss": 0.0128, + "reward": 1.6917535066604614, + "reward_std": 0.1726042479276657, + "rewards/accuracy_reward": 0.7073784470558167, + "rewards/format_reward": 0.984375, + "step": 765 + }, + { + "completion_length": 72.4296875, + "epoch": 3.497716894977169, + "grad_norm": 2.7671029567718506, + "kl": 0.11962890625, + "learning_rate": 6.502283105022832e-07, + "loss": 0.0048, + "reward": 1.7573699951171875, + "reward_std": 0.12242420390248299, + "rewards/accuracy_reward": 0.7573699355125427, + "rewards/format_reward": 1.0, + "step": 766 + }, + { + "completion_length": 91.5078125, + "epoch": 3.502283105022831, + "grad_norm": 2.353253126144409, + "kl": 0.12158203125, + "learning_rate": 6.497716894977169e-07, + "loss": 0.0049, + "reward": 1.5520833730697632, + "reward_std": 0.21783916652202606, + "rewards/accuracy_reward": 0.5677083134651184, + "rewards/format_reward": 0.984375, + "step": 767 + }, + { + "completion_length": 86.0390625, + "epoch": 3.506849315068493, + "grad_norm": 2.568850040435791, + "kl": 0.108642578125, + "learning_rate": 6.493150684931506e-07, + "loss": 0.0043, + "reward": 1.8059896230697632, + "reward_std": 0.1456034928560257, + "rewards/accuracy_reward": 0.8138020634651184, + "rewards/format_reward": 0.9921875, + "step": 768 + }, + { + "completion_length": 97.078125, + "epoch": 3.5114155251141552, + "grad_norm": 9.077414512634277, + "kl": 0.084716796875, + "learning_rate": 6.488584474885845e-07, + "loss": 0.0034, + "reward": 1.6987351775169373, + "reward_std": 0.2012891098856926, + "rewards/accuracy_reward": 0.7221725881099701, + "rewards/format_reward": 0.9765625, + "step": 769 + }, + { + "completion_length": 90.3828125, + "epoch": 3.5159817351598175, + "grad_norm": 4.764615058898926, + "kl": 0.107421875, + "learning_rate": 6.484018264840182e-07, + "loss": 0.0043, + "reward": 1.6770833730697632, + "reward_std": 0.20456601679325104, + "rewards/accuracy_reward": 0.6927083432674408, + "rewards/format_reward": 0.984375, + "step": 770 + }, + { + "completion_length": 91.53125, + "epoch": 3.5205479452054793, + "grad_norm": 2.192065477371216, + "kl": 0.125244140625, + "learning_rate": 6.479452054794519e-07, + "loss": 0.005, + "reward": 1.822656273841858, + "reward_std": 0.11972266435623169, + "rewards/accuracy_reward": 0.8226562142372131, + "rewards/format_reward": 1.0, + "step": 771 + }, + { + "completion_length": 66.875, + "epoch": 3.5251141552511416, + "grad_norm": 2.9831643104553223, + "kl": 0.16259765625, + "learning_rate": 6.474885844748859e-07, + "loss": 0.0065, + "reward": 1.6729073524475098, + "reward_std": 0.18848184496164322, + "rewards/accuracy_reward": 0.6807198524475098, + "rewards/format_reward": 0.9921875, + "step": 772 + }, + { + "completion_length": 112.515625, + "epoch": 3.5296803652968034, + "grad_norm": 4.550014495849609, + "kl": 0.107177734375, + "learning_rate": 6.470319634703196e-07, + "loss": 0.0043, + "reward": 1.748437523841858, + "reward_std": 0.21249166131019592, + "rewards/accuracy_reward": 0.7796874046325684, + "rewards/format_reward": 0.96875, + "step": 773 + }, + { + "completion_length": 86.4453125, + "epoch": 3.5342465753424657, + "grad_norm": 12.177788734436035, + "kl": 0.134521484375, + "learning_rate": 6.465753424657535e-07, + "loss": 0.0054, + "reward": 1.5592572093009949, + "reward_std": 0.3570391535758972, + "rewards/accuracy_reward": 0.6061321794986725, + "rewards/format_reward": 0.953125, + "step": 774 + }, + { + "completion_length": 91.90625, + "epoch": 3.538812785388128, + "grad_norm": 2.1934075355529785, + "kl": 0.09423828125, + "learning_rate": 6.461187214611872e-07, + "loss": 0.0038, + "reward": 1.7695313096046448, + "reward_std": 0.2720255181193352, + "rewards/accuracy_reward": 0.7929687201976776, + "rewards/format_reward": 0.9765625, + "step": 775 + }, + { + "completion_length": 93.34375, + "epoch": 3.54337899543379, + "grad_norm": 1.7237935066223145, + "kl": 0.10302734375, + "learning_rate": 6.456621004566209e-07, + "loss": 0.0041, + "reward": 1.839062511920929, + "reward_std": 0.08532825112342834, + "rewards/accuracy_reward": 0.8390624821186066, + "rewards/format_reward": 1.0, + "step": 776 + }, + { + "completion_length": 68.5234375, + "epoch": 3.547945205479452, + "grad_norm": 9.828876495361328, + "kl": 0.130859375, + "learning_rate": 6.452054794520548e-07, + "loss": 0.0052, + "reward": 1.699999988079071, + "reward_std": 0.23911622911691666, + "rewards/accuracy_reward": 0.7156249582767487, + "rewards/format_reward": 0.984375, + "step": 777 + }, + { + "completion_length": 111.7265625, + "epoch": 3.5525114155251143, + "grad_norm": 2.7527942657470703, + "kl": 0.0751953125, + "learning_rate": 6.447488584474886e-07, + "loss": 0.003, + "reward": 1.6765625476837158, + "reward_std": 0.17730073630809784, + "rewards/accuracy_reward": 0.6921874582767487, + "rewards/format_reward": 0.984375, + "step": 778 + }, + { + "completion_length": 86.328125, + "epoch": 3.557077625570776, + "grad_norm": 8.820732116699219, + "kl": 0.127685546875, + "learning_rate": 6.442922374429223e-07, + "loss": 0.0051, + "reward": 1.5761924982070923, + "reward_std": 0.2583516389131546, + "rewards/accuracy_reward": 0.5996299386024475, + "rewards/format_reward": 0.9765625, + "step": 779 + }, + { + "completion_length": 83.546875, + "epoch": 3.5616438356164384, + "grad_norm": 4.555209636688232, + "kl": 0.11083984375, + "learning_rate": 6.438356164383562e-07, + "loss": 0.0044, + "reward": 1.6642005443572998, + "reward_std": 0.2603069022297859, + "rewards/accuracy_reward": 0.6876380443572998, + "rewards/format_reward": 0.9765625, + "step": 780 + }, + { + "completion_length": 88.859375, + "epoch": 3.5662100456621006, + "grad_norm": 3.635876417160034, + "kl": 0.133056640625, + "learning_rate": 6.433789954337899e-07, + "loss": 0.0053, + "reward": 1.6102182865142822, + "reward_std": 0.2444395273923874, + "rewards/accuracy_reward": 0.6414682567119598, + "rewards/format_reward": 0.96875, + "step": 781 + }, + { + "completion_length": 83.7734375, + "epoch": 3.5707762557077625, + "grad_norm": 2.809690237045288, + "kl": 0.118896484375, + "learning_rate": 6.429223744292238e-07, + "loss": 0.0048, + "reward": 1.6714159846305847, + "reward_std": 0.2755907028913498, + "rewards/accuracy_reward": 0.6948534548282623, + "rewards/format_reward": 0.9765625, + "step": 782 + }, + { + "completion_length": 66.0, + "epoch": 3.5753424657534247, + "grad_norm": 4.247856140136719, + "kl": 0.1552734375, + "learning_rate": 6.424657534246575e-07, + "loss": 0.0062, + "reward": 1.5200520753860474, + "reward_std": 0.2615704759955406, + "rewards/accuracy_reward": 0.5434895753860474, + "rewards/format_reward": 0.9765625, + "step": 783 + }, + { + "completion_length": 86.296875, + "epoch": 3.5799086757990866, + "grad_norm": 3.303392171859741, + "kl": 0.17236328125, + "learning_rate": 6.420091324200912e-07, + "loss": 0.0069, + "reward": 1.7383928894996643, + "reward_std": 0.20217304676771164, + "rewards/accuracy_reward": 0.7618303298950195, + "rewards/format_reward": 0.9765625, + "step": 784 + }, + { + "completion_length": 94.890625, + "epoch": 3.584474885844749, + "grad_norm": 2.6589694023132324, + "kl": 0.105712890625, + "learning_rate": 6.415525114155252e-07, + "loss": 0.0042, + "reward": 1.710684597492218, + "reward_std": 0.20098017156124115, + "rewards/accuracy_reward": 0.7341220676898956, + "rewards/format_reward": 0.9765625, + "step": 785 + }, + { + "completion_length": 90.21875, + "epoch": 3.589041095890411, + "grad_norm": 2.0707075595855713, + "kl": 0.0810546875, + "learning_rate": 6.410958904109589e-07, + "loss": 0.0032, + "reward": 1.70947265625, + "reward_std": 0.13764164596796036, + "rewards/accuracy_reward": 0.7172850966453552, + "rewards/format_reward": 0.9921875, + "step": 786 + }, + { + "completion_length": 69.5078125, + "epoch": 3.593607305936073, + "grad_norm": 2.431682586669922, + "kl": 0.197265625, + "learning_rate": 6.406392694063926e-07, + "loss": 0.0079, + "reward": 1.6103981137275696, + "reward_std": 0.27858249843120575, + "rewards/accuracy_reward": 0.6338355839252472, + "rewards/format_reward": 0.9765625, + "step": 787 + }, + { + "completion_length": 73.4453125, + "epoch": 3.598173515981735, + "grad_norm": 2.722654342651367, + "kl": 0.13330078125, + "learning_rate": 6.401826484018265e-07, + "loss": 0.0053, + "reward": 1.7416667342185974, + "reward_std": 0.18196804821491241, + "rewards/accuracy_reward": 0.7651041448116302, + "rewards/format_reward": 0.9765625, + "step": 788 + }, + { + "completion_length": 80.890625, + "epoch": 3.602739726027397, + "grad_norm": 5.7324018478393555, + "kl": 0.151123046875, + "learning_rate": 6.397260273972602e-07, + "loss": 0.0061, + "reward": 1.7906250357627869, + "reward_std": 0.23605135083198547, + "rewards/accuracy_reward": 0.8062499463558197, + "rewards/format_reward": 0.984375, + "step": 789 + }, + { + "completion_length": 72.625, + "epoch": 3.6073059360730593, + "grad_norm": 2.2109618186950684, + "kl": 0.11767578125, + "learning_rate": 6.39269406392694e-07, + "loss": 0.0047, + "reward": 1.5838541984558105, + "reward_std": 0.16635090112686157, + "rewards/accuracy_reward": 0.583854153752327, + "rewards/format_reward": 1.0, + "step": 790 + }, + { + "completion_length": 79.53125, + "epoch": 3.6118721461187215, + "grad_norm": 1.7857418060302734, + "kl": 0.142822265625, + "learning_rate": 6.388127853881278e-07, + "loss": 0.0057, + "reward": 1.801552414894104, + "reward_std": 0.1699754223227501, + "rewards/accuracy_reward": 0.8171773254871368, + "rewards/format_reward": 0.984375, + "step": 791 + }, + { + "completion_length": 72.3359375, + "epoch": 3.616438356164384, + "grad_norm": 6.849174499511719, + "kl": 0.111572265625, + "learning_rate": 6.383561643835616e-07, + "loss": 0.0045, + "reward": 1.5342634320259094, + "reward_std": 0.26482056826353073, + "rewards/accuracy_reward": 0.534263402223587, + "rewards/format_reward": 1.0, + "step": 792 + }, + { + "completion_length": 98.8828125, + "epoch": 3.6210045662100456, + "grad_norm": 7.422957897186279, + "kl": 0.0927734375, + "learning_rate": 6.378995433789955e-07, + "loss": 0.0037, + "reward": 1.7940475940704346, + "reward_std": 0.1312719490379095, + "rewards/accuracy_reward": 0.8096725642681122, + "rewards/format_reward": 0.984375, + "step": 793 + }, + { + "completion_length": 81.0546875, + "epoch": 3.625570776255708, + "grad_norm": 1.819212794303894, + "kl": 0.11572265625, + "learning_rate": 6.374429223744292e-07, + "loss": 0.0046, + "reward": 1.664595365524292, + "reward_std": 0.1356574185192585, + "rewards/accuracy_reward": 0.6802203357219696, + "rewards/format_reward": 0.984375, + "step": 794 + }, + { + "completion_length": 80.25, + "epoch": 3.6301369863013697, + "grad_norm": 5.144930362701416, + "kl": 0.15966796875, + "learning_rate": 6.369863013698629e-07, + "loss": 0.0064, + "reward": 1.6710898280143738, + "reward_std": 0.27560608088970184, + "rewards/accuracy_reward": 0.710152268409729, + "rewards/format_reward": 0.9609375, + "step": 795 + }, + { + "completion_length": 70.6015625, + "epoch": 3.634703196347032, + "grad_norm": 5.67324686050415, + "kl": 0.119140625, + "learning_rate": 6.365296803652968e-07, + "loss": 0.0048, + "reward": 1.5117551684379578, + "reward_std": 0.2634401321411133, + "rewards/accuracy_reward": 0.5195676535367966, + "rewards/format_reward": 0.9921875, + "step": 796 + }, + { + "completion_length": 67.71875, + "epoch": 3.6392694063926943, + "grad_norm": 2.1526761054992676, + "kl": 0.14599609375, + "learning_rate": 6.360730593607305e-07, + "loss": 0.0058, + "reward": 1.7081494331359863, + "reward_std": 0.15050432085990906, + "rewards/accuracy_reward": 0.7237744629383087, + "rewards/format_reward": 0.984375, + "step": 797 + }, + { + "completion_length": 90.4609375, + "epoch": 3.643835616438356, + "grad_norm": 7.734349250793457, + "kl": 0.107666015625, + "learning_rate": 6.356164383561645e-07, + "loss": 0.0043, + "reward": 1.572656273841858, + "reward_std": 0.2658383846282959, + "rewards/accuracy_reward": 0.5960937142372131, + "rewards/format_reward": 0.9765625, + "step": 798 + }, + { + "completion_length": 76.6640625, + "epoch": 3.6484018264840183, + "grad_norm": 3.576162815093994, + "kl": 0.115966796875, + "learning_rate": 6.351598173515982e-07, + "loss": 0.0046, + "reward": 1.6600198149681091, + "reward_std": 0.15953533351421356, + "rewards/accuracy_reward": 0.6678323149681091, + "rewards/format_reward": 0.9921875, + "step": 799 + }, + { + "completion_length": 91.1796875, + "epoch": 3.65296803652968, + "grad_norm": 3.9656198024749756, + "kl": 0.107421875, + "learning_rate": 6.347031963470319e-07, + "loss": 0.0043, + "reward": 1.6131696701049805, + "reward_std": 0.19991916418075562, + "rewards/accuracy_reward": 0.6366070806980133, + "rewards/format_reward": 0.9765625, + "step": 800 + }, + { + "completion_length": 91.5, + "epoch": 3.6575342465753424, + "grad_norm": 2.609405755996704, + "kl": 0.1591796875, + "learning_rate": 6.342465753424658e-07, + "loss": 0.0064, + "reward": 1.6630208492279053, + "reward_std": 0.2102055549621582, + "rewards/accuracy_reward": 0.6708333194255829, + "rewards/format_reward": 0.9921875, + "step": 801 + }, + { + "completion_length": 83.9609375, + "epoch": 3.6621004566210047, + "grad_norm": 2.384371280670166, + "kl": 0.13818359375, + "learning_rate": 6.337899543378995e-07, + "loss": 0.0055, + "reward": 1.7731770873069763, + "reward_std": 0.16687272489070892, + "rewards/accuracy_reward": 0.7888020277023315, + "rewards/format_reward": 0.984375, + "step": 802 + }, + { + "completion_length": 83.25, + "epoch": 3.6666666666666665, + "grad_norm": 2.3074464797973633, + "kl": 0.105224609375, + "learning_rate": 6.333333333333332e-07, + "loss": 0.0042, + "reward": 1.6963542103767395, + "reward_std": 0.2274407297372818, + "rewards/accuracy_reward": 0.7354166209697723, + "rewards/format_reward": 0.9609375, + "step": 803 + }, + { + "completion_length": 91.296875, + "epoch": 3.671232876712329, + "grad_norm": 4.80971097946167, + "kl": 0.2685546875, + "learning_rate": 6.328767123287671e-07, + "loss": 0.0107, + "reward": 1.8013640642166138, + "reward_std": 0.18622903525829315, + "rewards/accuracy_reward": 0.8326140344142914, + "rewards/format_reward": 0.96875, + "step": 804 + }, + { + "completion_length": 76.4375, + "epoch": 3.6757990867579906, + "grad_norm": 2.313264846801758, + "kl": 0.13623046875, + "learning_rate": 6.324200913242009e-07, + "loss": 0.0054, + "reward": 1.573582112789154, + "reward_std": 0.28307729959487915, + "rewards/accuracy_reward": 0.5892071425914764, + "rewards/format_reward": 0.984375, + "step": 805 + }, + { + "completion_length": 71.53125, + "epoch": 3.680365296803653, + "grad_norm": 5.89130163192749, + "kl": 0.135986328125, + "learning_rate": 6.319634703196348e-07, + "loss": 0.0054, + "reward": 1.6453125476837158, + "reward_std": 0.2522100582718849, + "rewards/accuracy_reward": 0.6687499582767487, + "rewards/format_reward": 0.9765625, + "step": 806 + }, + { + "completion_length": 84.84375, + "epoch": 3.684931506849315, + "grad_norm": 2.150968074798584, + "kl": 0.103759765625, + "learning_rate": 6.315068493150685e-07, + "loss": 0.0042, + "reward": 1.6665269136428833, + "reward_std": 0.17805734649300575, + "rewards/accuracy_reward": 0.6743394434452057, + "rewards/format_reward": 0.9921875, + "step": 807 + }, + { + "completion_length": 118.5390625, + "epoch": 3.6894977168949774, + "grad_norm": 1.7179328203201294, + "kl": 0.0628662109375, + "learning_rate": 6.310502283105022e-07, + "loss": 0.0025, + "reward": 1.6565104722976685, + "reward_std": 0.31547820568084717, + "rewards/accuracy_reward": 0.7268228828907013, + "rewards/format_reward": 0.9296875, + "step": 808 + }, + { + "completion_length": 86.203125, + "epoch": 3.6940639269406392, + "grad_norm": 4.058598518371582, + "kl": 0.12353515625, + "learning_rate": 6.305936073059361e-07, + "loss": 0.0049, + "reward": 1.7383049130439758, + "reward_std": 0.2762962728738785, + "rewards/accuracy_reward": 0.7617424130439758, + "rewards/format_reward": 0.9765625, + "step": 809 + }, + { + "completion_length": 72.484375, + "epoch": 3.6986301369863015, + "grad_norm": 6.153299331665039, + "kl": 0.1328125, + "learning_rate": 6.301369863013698e-07, + "loss": 0.0053, + "reward": 1.607812523841858, + "reward_std": 0.26409636437892914, + "rewards/accuracy_reward": 0.6312500238418579, + "rewards/format_reward": 0.9765625, + "step": 810 + }, + { + "completion_length": 73.5078125, + "epoch": 3.7031963470319633, + "grad_norm": 4.83391809463501, + "kl": 0.115478515625, + "learning_rate": 6.296803652968035e-07, + "loss": 0.0046, + "reward": 1.6350895166397095, + "reward_std": 0.3858235031366348, + "rewards/accuracy_reward": 0.6897769868373871, + "rewards/format_reward": 0.9453125, + "step": 811 + }, + { + "completion_length": 79.734375, + "epoch": 3.7077625570776256, + "grad_norm": 7.388575553894043, + "kl": 0.121826171875, + "learning_rate": 6.292237442922375e-07, + "loss": 0.0049, + "reward": 1.6204612851142883, + "reward_std": 0.2048889473080635, + "rewards/accuracy_reward": 0.6360863149166107, + "rewards/format_reward": 0.984375, + "step": 812 + }, + { + "completion_length": 87.6875, + "epoch": 3.712328767123288, + "grad_norm": 2.1843342781066895, + "kl": 0.091064453125, + "learning_rate": 6.287671232876712e-07, + "loss": 0.0036, + "reward": 1.7203125357627869, + "reward_std": 0.14389308542013168, + "rewards/accuracy_reward": 0.7281249761581421, + "rewards/format_reward": 0.9921875, + "step": 813 + }, + { + "completion_length": 87.6484375, + "epoch": 3.7168949771689497, + "grad_norm": 6.269219875335693, + "kl": 0.100341796875, + "learning_rate": 6.283105022831051e-07, + "loss": 0.004, + "reward": 1.6382812857627869, + "reward_std": 0.17056220024824142, + "rewards/accuracy_reward": 0.6460937112569809, + "rewards/format_reward": 0.9921875, + "step": 814 + }, + { + "completion_length": 113.0, + "epoch": 3.721461187214612, + "grad_norm": 7.404258728027344, + "kl": 0.0799560546875, + "learning_rate": 6.278538812785388e-07, + "loss": 0.0032, + "reward": 1.7445932626724243, + "reward_std": 0.27295994758605957, + "rewards/accuracy_reward": 0.8149056732654572, + "rewards/format_reward": 0.9296875, + "step": 815 + }, + { + "completion_length": 83.03125, + "epoch": 3.7260273972602738, + "grad_norm": 2.5489566326141357, + "kl": 0.095703125, + "learning_rate": 6.273972602739725e-07, + "loss": 0.0038, + "reward": 1.6650923490524292, + "reward_std": 0.2622094973921776, + "rewards/accuracy_reward": 0.7041547894477844, + "rewards/format_reward": 0.9609375, + "step": 816 + }, + { + "completion_length": 82.0859375, + "epoch": 3.730593607305936, + "grad_norm": 9.064318656921387, + "kl": 0.10693359375, + "learning_rate": 6.269406392694064e-07, + "loss": 0.0043, + "reward": 1.565638542175293, + "reward_std": 0.20368661731481552, + "rewards/accuracy_reward": 0.589076042175293, + "rewards/format_reward": 0.9765625, + "step": 817 + }, + { + "completion_length": 85.734375, + "epoch": 3.7351598173515983, + "grad_norm": 3.814204692840576, + "kl": 0.0927734375, + "learning_rate": 6.264840182648402e-07, + "loss": 0.0037, + "reward": 1.7833616733551025, + "reward_std": 0.1723785549402237, + "rewards/accuracy_reward": 0.7911740839481354, + "rewards/format_reward": 0.9921875, + "step": 818 + }, + { + "completion_length": 59.921875, + "epoch": 3.73972602739726, + "grad_norm": 3.6849331855773926, + "kl": 0.127685546875, + "learning_rate": 6.260273972602739e-07, + "loss": 0.0051, + "reward": 1.6573927998542786, + "reward_std": 0.23803135752677917, + "rewards/accuracy_reward": 0.6808302700519562, + "rewards/format_reward": 0.9765625, + "step": 819 + }, + { + "completion_length": 73.921875, + "epoch": 3.7442922374429224, + "grad_norm": 2.6054327487945557, + "kl": 0.10498046875, + "learning_rate": 6.255707762557078e-07, + "loss": 0.0042, + "reward": 1.5656526684761047, + "reward_std": 0.2571340575814247, + "rewards/accuracy_reward": 0.5890901386737823, + "rewards/format_reward": 0.9765625, + "step": 820 + }, + { + "completion_length": 74.3046875, + "epoch": 3.748858447488584, + "grad_norm": 8.960927963256836, + "kl": 0.413818359375, + "learning_rate": 6.251141552511415e-07, + "loss": 0.0166, + "reward": 1.6491714119911194, + "reward_std": 0.190296471118927, + "rewards/accuracy_reward": 0.664796382188797, + "rewards/format_reward": 0.984375, + "step": 821 + }, + { + "completion_length": 80.5859375, + "epoch": 3.7534246575342465, + "grad_norm": 4.059770107269287, + "kl": 0.15185546875, + "learning_rate": 6.246575342465754e-07, + "loss": 0.0061, + "reward": 1.7381510734558105, + "reward_std": 0.23838761448860168, + "rewards/accuracy_reward": 0.7615885138511658, + "rewards/format_reward": 0.9765625, + "step": 822 + }, + { + "completion_length": 81.765625, + "epoch": 3.7579908675799087, + "grad_norm": 3.5536246299743652, + "kl": 0.244140625, + "learning_rate": 6.242009132420091e-07, + "loss": 0.0098, + "reward": 1.6796875, + "reward_std": 0.19722937047481537, + "rewards/accuracy_reward": 0.7031249403953552, + "rewards/format_reward": 0.9765625, + "step": 823 + }, + { + "completion_length": 67.1015625, + "epoch": 3.762557077625571, + "grad_norm": 2.649221658706665, + "kl": 0.148681640625, + "learning_rate": 6.237442922374428e-07, + "loss": 0.006, + "reward": 1.5904948711395264, + "reward_std": 0.26418011635541916, + "rewards/accuracy_reward": 0.6217447817325592, + "rewards/format_reward": 0.96875, + "step": 824 + }, + { + "completion_length": 67.453125, + "epoch": 3.767123287671233, + "grad_norm": 4.655685901641846, + "kl": 0.13623046875, + "learning_rate": 6.232876712328768e-07, + "loss": 0.0054, + "reward": 1.611718773841858, + "reward_std": 0.17845439538359642, + "rewards/accuracy_reward": 0.6195312142372131, + "rewards/format_reward": 0.9921875, + "step": 825 + }, + { + "completion_length": 75.90625, + "epoch": 3.771689497716895, + "grad_norm": 1.6092960834503174, + "kl": 0.127685546875, + "learning_rate": 6.228310502283105e-07, + "loss": 0.0051, + "reward": 1.71875, + "reward_std": 0.16304787248373032, + "rewards/accuracy_reward": 0.7343750298023224, + "rewards/format_reward": 0.984375, + "step": 826 + }, + { + "completion_length": 74.7890625, + "epoch": 3.776255707762557, + "grad_norm": 1.904418706893921, + "kl": 0.090087890625, + "learning_rate": 6.223744292237442e-07, + "loss": 0.0036, + "reward": 1.7890625596046448, + "reward_std": 0.16059691458940506, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 0.9765625, + "step": 827 + }, + { + "completion_length": 78.609375, + "epoch": 3.780821917808219, + "grad_norm": 2.7213988304138184, + "kl": 0.13037109375, + "learning_rate": 6.219178082191781e-07, + "loss": 0.0052, + "reward": 1.6396701335906982, + "reward_std": 0.23586497455835342, + "rewards/accuracy_reward": 0.6787325739860535, + "rewards/format_reward": 0.9609375, + "step": 828 + }, + { + "completion_length": 87.4140625, + "epoch": 3.7853881278538815, + "grad_norm": 3.867840051651001, + "kl": 0.1123046875, + "learning_rate": 6.214611872146118e-07, + "loss": 0.0045, + "reward": 1.6255208849906921, + "reward_std": 0.19572605937719345, + "rewards/accuracy_reward": 0.6489582806825638, + "rewards/format_reward": 0.9765625, + "step": 829 + }, + { + "completion_length": 80.6953125, + "epoch": 3.7899543378995433, + "grad_norm": 5.212503910064697, + "kl": 0.177490234375, + "learning_rate": 6.210045662100457e-07, + "loss": 0.0071, + "reward": 1.7282792925834656, + "reward_std": 0.31921522319316864, + "rewards/accuracy_reward": 0.7673417627811432, + "rewards/format_reward": 0.9609375, + "step": 830 + }, + { + "completion_length": 87.03125, + "epoch": 3.7945205479452055, + "grad_norm": 7.442432880401611, + "kl": 0.093017578125, + "learning_rate": 6.205479452054794e-07, + "loss": 0.0037, + "reward": 1.6146034002304077, + "reward_std": 0.196245439350605, + "rewards/accuracy_reward": 0.6224157810211182, + "rewards/format_reward": 0.9921875, + "step": 831 + }, + { + "completion_length": 77.1640625, + "epoch": 3.7990867579908674, + "grad_norm": 4.140781402587891, + "kl": 0.2216796875, + "learning_rate": 6.200913242009132e-07, + "loss": 0.0089, + "reward": 1.4750909209251404, + "reward_std": 0.26224930584430695, + "rewards/accuracy_reward": 0.5141534507274628, + "rewards/format_reward": 0.9609375, + "step": 832 + }, + { + "completion_length": 73.5859375, + "epoch": 3.8036529680365296, + "grad_norm": 2.893556594848633, + "kl": 0.105224609375, + "learning_rate": 6.196347031963471e-07, + "loss": 0.0042, + "reward": 1.6718750596046448, + "reward_std": 0.25111906230449677, + "rewards/accuracy_reward": 0.6953124701976776, + "rewards/format_reward": 0.9765625, + "step": 833 + }, + { + "completion_length": 68.46875, + "epoch": 3.808219178082192, + "grad_norm": 4.676414489746094, + "kl": 0.20361328125, + "learning_rate": 6.191780821917808e-07, + "loss": 0.0081, + "reward": 1.682812511920929, + "reward_std": 0.20322410762310028, + "rewards/accuracy_reward": 0.6984374821186066, + "rewards/format_reward": 0.984375, + "step": 834 + }, + { + "completion_length": 78.9375, + "epoch": 3.8127853881278537, + "grad_norm": 1.6920398473739624, + "kl": 0.134765625, + "learning_rate": 6.187214611872145e-07, + "loss": 0.0054, + "reward": 1.8067708611488342, + "reward_std": 0.14878704398870468, + "rewards/accuracy_reward": 0.8223958313465118, + "rewards/format_reward": 0.984375, + "step": 835 + }, + { + "completion_length": 73.34375, + "epoch": 3.817351598173516, + "grad_norm": 3.9718282222747803, + "kl": 0.1552734375, + "learning_rate": 6.182648401826484e-07, + "loss": 0.0062, + "reward": 1.586328148841858, + "reward_std": 0.2995697557926178, + "rewards/accuracy_reward": 0.6410156190395355, + "rewards/format_reward": 0.9453125, + "step": 836 + }, + { + "completion_length": 74.859375, + "epoch": 3.821917808219178, + "grad_norm": 2.3701136112213135, + "kl": 0.146240234375, + "learning_rate": 6.178082191780821e-07, + "loss": 0.0059, + "reward": 1.6562500596046448, + "reward_std": 0.1812673956155777, + "rewards/accuracy_reward": 0.6640624701976776, + "rewards/format_reward": 0.9921875, + "step": 837 + }, + { + "completion_length": 63.671875, + "epoch": 3.82648401826484, + "grad_norm": 3.7064878940582275, + "kl": 0.1220703125, + "learning_rate": 6.173515981735161e-07, + "loss": 0.0049, + "reward": 1.574496567249298, + "reward_std": 0.24655399471521378, + "rewards/accuracy_reward": 0.5979340374469757, + "rewards/format_reward": 0.9765625, + "step": 838 + }, + { + "completion_length": 69.6171875, + "epoch": 3.8310502283105023, + "grad_norm": 2.966919422149658, + "kl": 0.119384765625, + "learning_rate": 6.168949771689498e-07, + "loss": 0.0048, + "reward": 1.5951017141342163, + "reward_std": 0.22802505642175674, + "rewards/accuracy_reward": 0.6263516843318939, + "rewards/format_reward": 0.96875, + "step": 839 + }, + { + "completion_length": 67.4921875, + "epoch": 3.8356164383561646, + "grad_norm": 5.754858493804932, + "kl": 0.125732421875, + "learning_rate": 6.164383561643835e-07, + "loss": 0.005, + "reward": 1.60744047164917, + "reward_std": 0.2320682480931282, + "rewards/accuracy_reward": 0.6230654716491699, + "rewards/format_reward": 0.984375, + "step": 840 + }, + { + "completion_length": 94.8828125, + "epoch": 3.8401826484018264, + "grad_norm": 3.788088321685791, + "kl": 0.09912109375, + "learning_rate": 6.159817351598174e-07, + "loss": 0.004, + "reward": 1.65234375, + "reward_std": 0.1578691005706787, + "rewards/accuracy_reward": 0.65234375, + "rewards/format_reward": 1.0, + "step": 841 + }, + { + "completion_length": 82.28125, + "epoch": 3.8447488584474887, + "grad_norm": 3.032823085784912, + "kl": 0.103271484375, + "learning_rate": 6.155251141552511e-07, + "loss": 0.0041, + "reward": 1.538119375705719, + "reward_std": 0.2931046634912491, + "rewards/accuracy_reward": 0.569369375705719, + "rewards/format_reward": 0.96875, + "step": 842 + }, + { + "completion_length": 88.265625, + "epoch": 3.8493150684931505, + "grad_norm": 2.753324031829834, + "kl": 0.08544921875, + "learning_rate": 6.150684931506848e-07, + "loss": 0.0034, + "reward": 1.6361016035079956, + "reward_std": 0.2543141394853592, + "rewards/accuracy_reward": 0.6673516035079956, + "rewards/format_reward": 0.96875, + "step": 843 + }, + { + "completion_length": 88.3203125, + "epoch": 3.853881278538813, + "grad_norm": 1.9302741289138794, + "kl": 0.116455078125, + "learning_rate": 6.146118721461187e-07, + "loss": 0.0047, + "reward": 1.723133623600006, + "reward_std": 0.12216833233833313, + "rewards/accuracy_reward": 0.7231336236000061, + "rewards/format_reward": 1.0, + "step": 844 + }, + { + "completion_length": 68.3046875, + "epoch": 3.858447488584475, + "grad_norm": 2.5496842861175537, + "kl": 0.115234375, + "learning_rate": 6.141552511415525e-07, + "loss": 0.0046, + "reward": 1.63571435213089, + "reward_std": 0.22781573235988617, + "rewards/accuracy_reward": 0.6513392478227615, + "rewards/format_reward": 0.984375, + "step": 845 + }, + { + "completion_length": 98.5546875, + "epoch": 3.863013698630137, + "grad_norm": 1.8447142839431763, + "kl": 0.098388671875, + "learning_rate": 6.136986301369864e-07, + "loss": 0.0039, + "reward": 1.7750434279441833, + "reward_std": 0.16683020442724228, + "rewards/accuracy_reward": 0.798480898141861, + "rewards/format_reward": 0.9765625, + "step": 846 + }, + { + "completion_length": 86.2265625, + "epoch": 3.867579908675799, + "grad_norm": 4.997533798217773, + "kl": 0.123779296875, + "learning_rate": 6.132420091324201e-07, + "loss": 0.0049, + "reward": 1.6639086604118347, + "reward_std": 0.298796147108078, + "rewards/accuracy_reward": 0.7185961008071899, + "rewards/format_reward": 0.9453125, + "step": 847 + }, + { + "completion_length": 76.2578125, + "epoch": 3.872146118721461, + "grad_norm": 4.215785503387451, + "kl": 0.130859375, + "learning_rate": 6.127853881278538e-07, + "loss": 0.0052, + "reward": 1.5344713926315308, + "reward_std": 0.3186282366514206, + "rewards/accuracy_reward": 0.5735338628292084, + "rewards/format_reward": 0.9609375, + "step": 848 + }, + { + "completion_length": 76.5546875, + "epoch": 3.8767123287671232, + "grad_norm": 3.0258474349975586, + "kl": 0.11962890625, + "learning_rate": 6.123287671232877e-07, + "loss": 0.0048, + "reward": 1.5302554368972778, + "reward_std": 0.21649178117513657, + "rewards/accuracy_reward": 0.5458804070949554, + "rewards/format_reward": 0.984375, + "step": 849 + }, + { + "completion_length": 88.828125, + "epoch": 3.8812785388127855, + "grad_norm": 3.9653830528259277, + "kl": 0.134765625, + "learning_rate": 6.118721461187214e-07, + "loss": 0.0054, + "reward": 1.7723276019096375, + "reward_std": 0.27654044330120087, + "rewards/accuracy_reward": 0.8035775721073151, + "rewards/format_reward": 0.96875, + "step": 850 + }, + { + "completion_length": 70.6796875, + "epoch": 3.8858447488584473, + "grad_norm": 13.911476135253906, + "kl": 0.1279296875, + "learning_rate": 6.114155251141551e-07, + "loss": 0.0051, + "reward": 1.5849445462226868, + "reward_std": 0.2708437442779541, + "rewards/accuracy_reward": 0.5849444717168808, + "rewards/format_reward": 1.0, + "step": 851 + }, + { + "completion_length": 84.46875, + "epoch": 3.8904109589041096, + "grad_norm": 2.8287229537963867, + "kl": 0.122802734375, + "learning_rate": 6.109589041095891e-07, + "loss": 0.0049, + "reward": 1.6496233344078064, + "reward_std": 0.1977412924170494, + "rewards/accuracy_reward": 0.6730607748031616, + "rewards/format_reward": 0.9765625, + "step": 852 + }, + { + "completion_length": 84.328125, + "epoch": 3.8949771689497714, + "grad_norm": 3.292459487915039, + "kl": 0.110107421875, + "learning_rate": 6.105022831050228e-07, + "loss": 0.0044, + "reward": 1.560290813446045, + "reward_std": 0.2603389769792557, + "rewards/accuracy_reward": 0.5681032538414001, + "rewards/format_reward": 0.9921875, + "step": 853 + }, + { + "completion_length": 70.046875, + "epoch": 3.8995433789954337, + "grad_norm": 3.479142189025879, + "kl": 0.13818359375, + "learning_rate": 6.100456621004567e-07, + "loss": 0.0055, + "reward": 1.5651041865348816, + "reward_std": 0.327400267124176, + "rewards/accuracy_reward": 0.6041666567325592, + "rewards/format_reward": 0.9609375, + "step": 854 + }, + { + "completion_length": 78.3046875, + "epoch": 3.904109589041096, + "grad_norm": 2.2671470642089844, + "kl": 0.103515625, + "learning_rate": 6.095890410958904e-07, + "loss": 0.0041, + "reward": 1.6139509677886963, + "reward_std": 0.221808023750782, + "rewards/accuracy_reward": 0.6608258783817291, + "rewards/format_reward": 0.953125, + "step": 855 + }, + { + "completion_length": 79.015625, + "epoch": 3.908675799086758, + "grad_norm": 3.359975814819336, + "kl": 0.097900390625, + "learning_rate": 6.091324200913241e-07, + "loss": 0.0039, + "reward": 1.7910323739051819, + "reward_std": 0.16605094820261002, + "rewards/accuracy_reward": 0.80665722489357, + "rewards/format_reward": 0.984375, + "step": 856 + }, + { + "completion_length": 108.4375, + "epoch": 3.91324200913242, + "grad_norm": 2.9608707427978516, + "kl": 0.064208984375, + "learning_rate": 6.08675799086758e-07, + "loss": 0.0026, + "reward": 1.7960938215255737, + "reward_std": 0.15246989950537682, + "rewards/accuracy_reward": 0.8195312321186066, + "rewards/format_reward": 0.9765625, + "step": 857 + }, + { + "completion_length": 70.484375, + "epoch": 3.9178082191780823, + "grad_norm": 1.9071540832519531, + "kl": 0.115234375, + "learning_rate": 6.082191780821918e-07, + "loss": 0.0046, + "reward": 1.8630208373069763, + "reward_std": 0.18414238840341568, + "rewards/accuracy_reward": 0.8864583075046539, + "rewards/format_reward": 0.9765625, + "step": 858 + }, + { + "completion_length": 86.25, + "epoch": 3.922374429223744, + "grad_norm": 1.7425415515899658, + "kl": 0.0791015625, + "learning_rate": 6.077625570776255e-07, + "loss": 0.0032, + "reward": 1.7500391602516174, + "reward_std": 0.1361438985913992, + "rewards/accuracy_reward": 0.757851630449295, + "rewards/format_reward": 0.9921875, + "step": 859 + }, + { + "completion_length": 83.4765625, + "epoch": 3.9269406392694064, + "grad_norm": 2.0878822803497314, + "kl": 0.132080078125, + "learning_rate": 6.073059360730594e-07, + "loss": 0.0053, + "reward": 1.7227915525436401, + "reward_std": 0.2049795687198639, + "rewards/accuracy_reward": 0.7384164929389954, + "rewards/format_reward": 0.984375, + "step": 860 + }, + { + "completion_length": 82.6328125, + "epoch": 3.9315068493150687, + "grad_norm": 2.1125905513763428, + "kl": 0.10400390625, + "learning_rate": 6.068493150684931e-07, + "loss": 0.0041, + "reward": 1.7052381038665771, + "reward_std": 0.19698219001293182, + "rewards/accuracy_reward": 0.7208629250526428, + "rewards/format_reward": 0.984375, + "step": 861 + }, + { + "completion_length": 70.6171875, + "epoch": 3.9360730593607305, + "grad_norm": 3.4557483196258545, + "kl": 0.125244140625, + "learning_rate": 6.06392694063927e-07, + "loss": 0.005, + "reward": 1.6386160850524902, + "reward_std": 0.2512796074151993, + "rewards/accuracy_reward": 0.6464285254478455, + "rewards/format_reward": 0.9921875, + "step": 862 + }, + { + "completion_length": 60.2890625, + "epoch": 3.9406392694063928, + "grad_norm": 2.9261093139648438, + "kl": 0.197265625, + "learning_rate": 6.059360730593607e-07, + "loss": 0.0079, + "reward": 1.6639323234558105, + "reward_std": 0.18012882769107819, + "rewards/accuracy_reward": 0.6717447936534882, + "rewards/format_reward": 0.9921875, + "step": 863 + }, + { + "completion_length": 68.984375, + "epoch": 3.9452054794520546, + "grad_norm": 8.949965476989746, + "kl": 0.16455078125, + "learning_rate": 6.054794520547944e-07, + "loss": 0.0066, + "reward": 1.634996235370636, + "reward_std": 0.30956215411424637, + "rewards/accuracy_reward": 0.658433735370636, + "rewards/format_reward": 0.9765625, + "step": 864 + }, + { + "completion_length": 94.5078125, + "epoch": 3.949771689497717, + "grad_norm": 1.5857025384902954, + "kl": 0.07958984375, + "learning_rate": 6.050228310502284e-07, + "loss": 0.0032, + "reward": 1.7937500476837158, + "reward_std": 0.08982988260686398, + "rewards/accuracy_reward": 0.793749988079071, + "rewards/format_reward": 1.0, + "step": 865 + }, + { + "completion_length": 72.328125, + "epoch": 3.954337899543379, + "grad_norm": 3.955540418624878, + "kl": 0.144775390625, + "learning_rate": 6.045662100456621e-07, + "loss": 0.0058, + "reward": 1.5606706142425537, + "reward_std": 0.3175853192806244, + "rewards/accuracy_reward": 0.6075455844402313, + "rewards/format_reward": 0.953125, + "step": 866 + }, + { + "completion_length": 56.71875, + "epoch": 3.958904109589041, + "grad_norm": 3.506572723388672, + "kl": 0.142822265625, + "learning_rate": 6.041095890410958e-07, + "loss": 0.0057, + "reward": 1.7238853573799133, + "reward_std": 0.22640568763017654, + "rewards/accuracy_reward": 0.7316978275775909, + "rewards/format_reward": 0.9921875, + "step": 867 + }, + { + "completion_length": 69.9921875, + "epoch": 3.963470319634703, + "grad_norm": 6.0887770652771, + "kl": 0.0931396484375, + "learning_rate": 6.036529680365297e-07, + "loss": 0.0037, + "reward": 1.851125419139862, + "reward_std": 0.1146822888404131, + "rewards/accuracy_reward": 0.8511254191398621, + "rewards/format_reward": 1.0, + "step": 868 + }, + { + "completion_length": 78.6875, + "epoch": 3.968036529680365, + "grad_norm": 2.3538548946380615, + "kl": 0.134765625, + "learning_rate": 6.031963470319634e-07, + "loss": 0.0054, + "reward": 1.6927083730697632, + "reward_std": 0.20856676995754242, + "rewards/accuracy_reward": 0.7083333134651184, + "rewards/format_reward": 0.984375, + "step": 869 + }, + { + "completion_length": 67.84375, + "epoch": 3.9726027397260273, + "grad_norm": 3.3203752040863037, + "kl": 0.19482421875, + "learning_rate": 6.027397260273972e-07, + "loss": 0.0078, + "reward": 1.682466745376587, + "reward_std": 0.20432885736227036, + "rewards/accuracy_reward": 0.6824667155742645, + "rewards/format_reward": 1.0, + "step": 870 + }, + { + "completion_length": 60.8046875, + "epoch": 3.9771689497716896, + "grad_norm": 4.509641647338867, + "kl": 0.12548828125, + "learning_rate": 6.02283105022831e-07, + "loss": 0.005, + "reward": 1.5753461122512817, + "reward_std": 0.23474501818418503, + "rewards/accuracy_reward": 0.5753461122512817, + "rewards/format_reward": 1.0, + "step": 871 + }, + { + "completion_length": 89.40625, + "epoch": 3.981735159817352, + "grad_norm": 2.3428871631622314, + "kl": 0.113037109375, + "learning_rate": 6.018264840182648e-07, + "loss": 0.0045, + "reward": 1.8376488089561462, + "reward_std": 0.12626906298100948, + "rewards/accuracy_reward": 0.8454612195491791, + "rewards/format_reward": 0.9921875, + "step": 872 + }, + { + "completion_length": 70.9921875, + "epoch": 3.9863013698630136, + "grad_norm": 1.8384939432144165, + "kl": 0.12890625, + "learning_rate": 6.013698630136987e-07, + "loss": 0.0052, + "reward": 1.6695202589035034, + "reward_std": 0.13698631152510643, + "rewards/accuracy_reward": 0.6695202589035034, + "rewards/format_reward": 1.0, + "step": 873 + }, + { + "completion_length": 72.8828125, + "epoch": 3.990867579908676, + "grad_norm": 2.790123224258423, + "kl": 0.132568359375, + "learning_rate": 6.009132420091324e-07, + "loss": 0.0053, + "reward": 1.5515252947807312, + "reward_std": 0.24770487844944, + "rewards/accuracy_reward": 0.5749628096818924, + "rewards/format_reward": 0.9765625, + "step": 874 + }, + { + "completion_length": 70.765625, + "epoch": 3.9954337899543377, + "grad_norm": 3.470076084136963, + "kl": 0.09423828125, + "learning_rate": 6.004566210045661e-07, + "loss": 0.0038, + "reward": 1.8020795583724976, + "reward_std": 0.08546407520771027, + "rewards/accuracy_reward": 0.8098919987678528, + "rewards/format_reward": 0.9921875, + "step": 875 + }, + { + "completion_length": 37.0, + "epoch": 4.0, + "grad_norm": 3.645047903060913, + "kl": 0.158203125, + "learning_rate": 6e-07, + "loss": 0.0061, + "reward": 1.5, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 876 + }, + { + "completion_length": 63.9921875, + "epoch": 4.004566210045662, + "grad_norm": 8.070154190063477, + "kl": 0.143310546875, + "learning_rate": 5.995433789954337e-07, + "loss": 0.0057, + "reward": 1.726302146911621, + "reward_std": 0.1457817256450653, + "rewards/accuracy_reward": 0.7263020873069763, + "rewards/format_reward": 1.0, + "step": 877 + }, + { + "completion_length": 80.28125, + "epoch": 4.0091324200913245, + "grad_norm": 3.771700620651245, + "kl": 0.1318359375, + "learning_rate": 5.990867579908675e-07, + "loss": 0.0053, + "reward": 1.7184895873069763, + "reward_std": 0.14457330107688904, + "rewards/accuracy_reward": 0.7263020873069763, + "rewards/format_reward": 0.9921875, + "step": 878 + }, + { + "completion_length": 55.3828125, + "epoch": 4.013698630136986, + "grad_norm": 3.114535093307495, + "kl": 0.193359375, + "learning_rate": 5.986301369863014e-07, + "loss": 0.0077, + "reward": 1.699999988079071, + "reward_std": 0.24036270380020142, + "rewards/accuracy_reward": 0.715624988079071, + "rewards/format_reward": 0.984375, + "step": 879 + }, + { + "completion_length": 68.9921875, + "epoch": 4.018264840182648, + "grad_norm": 5.49289608001709, + "kl": 0.1513671875, + "learning_rate": 5.981735159817351e-07, + "loss": 0.006, + "reward": 1.7083333730697632, + "reward_std": 0.14341074973344803, + "rewards/accuracy_reward": 0.7083333134651184, + "rewards/format_reward": 1.0, + "step": 880 + }, + { + "completion_length": 68.1484375, + "epoch": 4.0228310502283104, + "grad_norm": 8.557650566101074, + "kl": 0.47802734375, + "learning_rate": 5.97716894977169e-07, + "loss": 0.0191, + "reward": 1.7641276121139526, + "reward_std": 0.13008537888526917, + "rewards/accuracy_reward": 0.7641275823116302, + "rewards/format_reward": 1.0, + "step": 881 + }, + { + "completion_length": 74.2265625, + "epoch": 4.027397260273973, + "grad_norm": 1.5429102182388306, + "kl": 0.1396484375, + "learning_rate": 5.972602739726027e-07, + "loss": 0.0056, + "reward": 1.7580729722976685, + "reward_std": 0.1651972383260727, + "rewards/accuracy_reward": 0.7736978530883789, + "rewards/format_reward": 0.984375, + "step": 882 + }, + { + "completion_length": 60.5703125, + "epoch": 4.031963470319635, + "grad_norm": 1.6412632465362549, + "kl": 0.123779296875, + "learning_rate": 5.968036529680364e-07, + "loss": 0.005, + "reward": 1.7804688215255737, + "reward_std": 0.1583872102200985, + "rewards/accuracy_reward": 0.7882812321186066, + "rewards/format_reward": 0.9921875, + "step": 883 + }, + { + "completion_length": 74.703125, + "epoch": 4.036529680365296, + "grad_norm": 5.128458499908447, + "kl": 0.2080078125, + "learning_rate": 5.963470319634703e-07, + "loss": 0.0083, + "reward": 1.706798791885376, + "reward_std": 0.19644346833229065, + "rewards/accuracy_reward": 0.7302362024784088, + "rewards/format_reward": 0.9765625, + "step": 884 + }, + { + "completion_length": 60.921875, + "epoch": 4.041095890410959, + "grad_norm": 2.6405813694000244, + "kl": 0.14892578125, + "learning_rate": 5.958904109589041e-07, + "loss": 0.006, + "reward": 1.7202391624450684, + "reward_std": 0.218642920255661, + "rewards/accuracy_reward": 0.7202391028404236, + "rewards/format_reward": 1.0, + "step": 885 + }, + { + "completion_length": 66.59375, + "epoch": 4.045662100456621, + "grad_norm": 2.522268533706665, + "kl": 0.1220703125, + "learning_rate": 5.95433789954338e-07, + "loss": 0.0049, + "reward": 1.5917280912399292, + "reward_std": 0.18826671689748764, + "rewards/accuracy_reward": 0.6073530614376068, + "rewards/format_reward": 0.984375, + "step": 886 + }, + { + "completion_length": 51.8828125, + "epoch": 4.050228310502283, + "grad_norm": 2.2267160415649414, + "kl": 0.173828125, + "learning_rate": 5.949771689497717e-07, + "loss": 0.007, + "reward": 1.6658853888511658, + "reward_std": 0.22664503753185272, + "rewards/accuracy_reward": 0.6815103888511658, + "rewards/format_reward": 0.984375, + "step": 887 + }, + { + "completion_length": 51.3203125, + "epoch": 4.054794520547945, + "grad_norm": 1.9868104457855225, + "kl": 0.1552734375, + "learning_rate": 5.945205479452054e-07, + "loss": 0.0062, + "reward": 1.7429263591766357, + "reward_std": 0.18497039377689362, + "rewards/accuracy_reward": 0.7741763293743134, + "rewards/format_reward": 0.96875, + "step": 888 + }, + { + "completion_length": 56.0390625, + "epoch": 4.059360730593608, + "grad_norm": 2.1909918785095215, + "kl": 0.140869140625, + "learning_rate": 5.940639269406393e-07, + "loss": 0.0056, + "reward": 1.726171851158142, + "reward_std": 0.20413171127438545, + "rewards/accuracy_reward": 0.7339843213558197, + "rewards/format_reward": 0.9921875, + "step": 889 + }, + { + "completion_length": 69.4609375, + "epoch": 4.063926940639269, + "grad_norm": 1.6129564046859741, + "kl": 0.125732421875, + "learning_rate": 5.93607305936073e-07, + "loss": 0.005, + "reward": 1.7187500596046448, + "reward_std": 0.1436246931552887, + "rewards/accuracy_reward": 0.734375, + "rewards/format_reward": 0.984375, + "step": 890 + }, + { + "completion_length": 67.9921875, + "epoch": 4.068493150684931, + "grad_norm": 3.2070353031158447, + "kl": 0.119140625, + "learning_rate": 5.931506849315067e-07, + "loss": 0.0048, + "reward": 1.685523271560669, + "reward_std": 0.18283560872077942, + "rewards/accuracy_reward": 0.6933356523513794, + "rewards/format_reward": 0.9921875, + "step": 891 + }, + { + "completion_length": 74.7578125, + "epoch": 4.073059360730594, + "grad_norm": 2.196234703063965, + "kl": 0.092041015625, + "learning_rate": 5.926940639269407e-07, + "loss": 0.0037, + "reward": 1.7648438215255737, + "reward_std": 0.10456175357103348, + "rewards/accuracy_reward": 0.7648437023162842, + "rewards/format_reward": 1.0, + "step": 892 + }, + { + "completion_length": 62.0, + "epoch": 4.077625570776256, + "grad_norm": 3.306205987930298, + "kl": 0.12451171875, + "learning_rate": 5.922374429223744e-07, + "loss": 0.005, + "reward": 1.7713721990585327, + "reward_std": 0.16517749428749084, + "rewards/accuracy_reward": 0.7713720798492432, + "rewards/format_reward": 1.0, + "step": 893 + }, + { + "completion_length": 59.890625, + "epoch": 4.082191780821918, + "grad_norm": 2.852937698364258, + "kl": 0.13525390625, + "learning_rate": 5.917808219178083e-07, + "loss": 0.0054, + "reward": 1.6403688192367554, + "reward_std": 0.19380497932434082, + "rewards/accuracy_reward": 0.648181289434433, + "rewards/format_reward": 0.9921875, + "step": 894 + }, + { + "completion_length": 80.5546875, + "epoch": 4.0867579908675795, + "grad_norm": 4.440982818603516, + "kl": 0.110107421875, + "learning_rate": 5.91324200913242e-07, + "loss": 0.0044, + "reward": 1.7101563215255737, + "reward_std": 0.15725971013307571, + "rewards/accuracy_reward": 0.7179687321186066, + "rewards/format_reward": 0.9921875, + "step": 895 + }, + { + "completion_length": 76.8046875, + "epoch": 4.091324200913242, + "grad_norm": 1.442449927330017, + "kl": 0.130859375, + "learning_rate": 5.908675799086757e-07, + "loss": 0.0052, + "reward": 1.831250011920929, + "reward_std": 0.0731260534375906, + "rewards/accuracy_reward": 0.8312499225139618, + "rewards/format_reward": 1.0, + "step": 896 + }, + { + "completion_length": 43.0, + "epoch": 4.095890410958904, + "grad_norm": 5.812748908996582, + "kl": 0.21533203125, + "learning_rate": 5.904109589041096e-07, + "loss": 0.0086, + "reward": 1.7768229246139526, + "reward_std": 0.22991649061441422, + "rewards/accuracy_reward": 0.7768228948116302, + "rewards/format_reward": 1.0, + "step": 897 + }, + { + "completion_length": 87.3828125, + "epoch": 4.100456621004566, + "grad_norm": 2.7623848915100098, + "kl": 0.093505859375, + "learning_rate": 5.899543378995433e-07, + "loss": 0.0037, + "reward": 1.8036272525787354, + "reward_std": 0.11107254587113857, + "rewards/accuracy_reward": 0.8114396631717682, + "rewards/format_reward": 0.9921875, + "step": 898 + }, + { + "completion_length": 91.8125, + "epoch": 4.105022831050229, + "grad_norm": 2.0738606452941895, + "kl": 0.133056640625, + "learning_rate": 5.894977168949771e-07, + "loss": 0.0053, + "reward": 1.7905096411705017, + "reward_std": 0.12370277941226959, + "rewards/accuracy_reward": 0.8061346113681793, + "rewards/format_reward": 0.984375, + "step": 899 + }, + { + "completion_length": 76.5546875, + "epoch": 4.109589041095891, + "grad_norm": 3.7335944175720215, + "kl": 0.130859375, + "learning_rate": 5.89041095890411e-07, + "loss": 0.0052, + "reward": 1.6451637148857117, + "reward_std": 0.1962077133357525, + "rewards/accuracy_reward": 0.6529761403799057, + "rewards/format_reward": 0.9921875, + "step": 900 + }, + { + "completion_length": 63.0625, + "epoch": 4.114155251141552, + "grad_norm": 7.811534881591797, + "kl": 0.117431640625, + "learning_rate": 5.885844748858447e-07, + "loss": 0.0047, + "reward": 1.6516927480697632, + "reward_std": 0.20568183064460754, + "rewards/accuracy_reward": 0.651692658662796, + "rewards/format_reward": 1.0, + "step": 901 + }, + { + "completion_length": 75.40625, + "epoch": 4.1187214611872145, + "grad_norm": 4.964561462402344, + "kl": 0.11474609375, + "learning_rate": 5.881278538812785e-07, + "loss": 0.0046, + "reward": 1.767968773841858, + "reward_std": 0.17296937853097916, + "rewards/accuracy_reward": 0.7757811844348907, + "rewards/format_reward": 0.9921875, + "step": 902 + }, + { + "completion_length": 78.671875, + "epoch": 4.123287671232877, + "grad_norm": 7.2277679443359375, + "kl": 0.111083984375, + "learning_rate": 5.876712328767123e-07, + "loss": 0.0044, + "reward": 1.8033853769302368, + "reward_std": 0.12047793343663216, + "rewards/accuracy_reward": 0.8111978769302368, + "rewards/format_reward": 0.9921875, + "step": 903 + }, + { + "completion_length": 76.0546875, + "epoch": 4.127853881278539, + "grad_norm": 3.666255474090576, + "kl": 0.14697265625, + "learning_rate": 5.87214611872146e-07, + "loss": 0.0059, + "reward": 1.7567708492279053, + "reward_std": 0.15414869785308838, + "rewards/accuracy_reward": 0.7645833492279053, + "rewards/format_reward": 0.9921875, + "step": 904 + }, + { + "completion_length": 64.28125, + "epoch": 4.132420091324201, + "grad_norm": 6.144541263580322, + "kl": 0.15673828125, + "learning_rate": 5.8675799086758e-07, + "loss": 0.0063, + "reward": 1.6296875476837158, + "reward_std": 0.23411936312913895, + "rewards/accuracy_reward": 0.637499988079071, + "rewards/format_reward": 0.9921875, + "step": 905 + }, + { + "completion_length": 96.265625, + "epoch": 4.136986301369863, + "grad_norm": 4.049592018127441, + "kl": 0.103759765625, + "learning_rate": 5.863013698630137e-07, + "loss": 0.0041, + "reward": 1.807031273841858, + "reward_std": 0.16107044368982315, + "rewards/accuracy_reward": 0.8070311546325684, + "rewards/format_reward": 1.0, + "step": 906 + }, + { + "completion_length": 73.2578125, + "epoch": 4.141552511415525, + "grad_norm": 1.5114738941192627, + "kl": 0.085693359375, + "learning_rate": 5.858447488584474e-07, + "loss": 0.0034, + "reward": 1.6850818991661072, + "reward_std": 0.16721044853329659, + "rewards/accuracy_reward": 0.69289430975914, + "rewards/format_reward": 0.9921875, + "step": 907 + }, + { + "completion_length": 56.8515625, + "epoch": 4.146118721461187, + "grad_norm": 2.3427228927612305, + "kl": 0.12890625, + "learning_rate": 5.853881278538813e-07, + "loss": 0.0052, + "reward": 1.756416380405426, + "reward_std": 0.1625683754682541, + "rewards/accuracy_reward": 0.7642288506031036, + "rewards/format_reward": 0.9921875, + "step": 908 + }, + { + "completion_length": 85.671875, + "epoch": 4.1506849315068495, + "grad_norm": 2.933675765991211, + "kl": 0.092529296875, + "learning_rate": 5.84931506849315e-07, + "loss": 0.0037, + "reward": 1.723825216293335, + "reward_std": 0.14209723100066185, + "rewards/accuracy_reward": 0.7238251268863678, + "rewards/format_reward": 1.0, + "step": 909 + }, + { + "completion_length": 67.4921875, + "epoch": 4.155251141552512, + "grad_norm": 2.615705966949463, + "kl": 0.11328125, + "learning_rate": 5.844748858447488e-07, + "loss": 0.0045, + "reward": 1.6570913791656494, + "reward_std": 0.1796240657567978, + "rewards/accuracy_reward": 0.664903849363327, + "rewards/format_reward": 0.9921875, + "step": 910 + }, + { + "completion_length": 76.96875, + "epoch": 4.159817351598173, + "grad_norm": 1.9582023620605469, + "kl": 0.090087890625, + "learning_rate": 5.840182648401826e-07, + "loss": 0.0036, + "reward": 1.7256065011024475, + "reward_std": 0.17783351242542267, + "rewards/accuracy_reward": 0.7334189414978027, + "rewards/format_reward": 0.9921875, + "step": 911 + }, + { + "completion_length": 60.40625, + "epoch": 4.164383561643835, + "grad_norm": 4.850559711456299, + "kl": 0.103271484375, + "learning_rate": 5.835616438356164e-07, + "loss": 0.0041, + "reward": 1.524218738079071, + "reward_std": 0.2731492444872856, + "rewards/accuracy_reward": 0.5320312678813934, + "rewards/format_reward": 0.9921875, + "step": 912 + }, + { + "completion_length": 50.765625, + "epoch": 4.168949771689498, + "grad_norm": 2.815920352935791, + "kl": 0.18408203125, + "learning_rate": 5.831050228310503e-07, + "loss": 0.0074, + "reward": 1.653542160987854, + "reward_std": 0.23784886300563812, + "rewards/accuracy_reward": 0.6691671311855316, + "rewards/format_reward": 0.984375, + "step": 913 + }, + { + "completion_length": 76.1875, + "epoch": 4.17351598173516, + "grad_norm": 2.3140416145324707, + "kl": 0.14208984375, + "learning_rate": 5.82648401826484e-07, + "loss": 0.0057, + "reward": 1.6864583492279053, + "reward_std": 0.1315075010061264, + "rewards/accuracy_reward": 0.6864583194255829, + "rewards/format_reward": 1.0, + "step": 914 + }, + { + "completion_length": 73.3515625, + "epoch": 4.178082191780822, + "grad_norm": 3.9995479583740234, + "kl": 0.12451171875, + "learning_rate": 5.821917808219177e-07, + "loss": 0.005, + "reward": 1.7815169095993042, + "reward_std": 0.08393021672964096, + "rewards/accuracy_reward": 0.7815168499946594, + "rewards/format_reward": 1.0, + "step": 915 + }, + { + "completion_length": 66.3515625, + "epoch": 4.182648401826484, + "grad_norm": 1.871342420578003, + "kl": 0.12353515625, + "learning_rate": 5.817351598173516e-07, + "loss": 0.0049, + "reward": 1.7460670471191406, + "reward_std": 0.15686815977096558, + "rewards/accuracy_reward": 0.7538795471191406, + "rewards/format_reward": 0.9921875, + "step": 916 + }, + { + "completion_length": 77.828125, + "epoch": 4.187214611872146, + "grad_norm": 1.733489751815796, + "kl": 0.095458984375, + "learning_rate": 5.812785388127853e-07, + "loss": 0.0038, + "reward": 1.7660456895828247, + "reward_std": 0.16200437024235725, + "rewards/accuracy_reward": 0.7816706299781799, + "rewards/format_reward": 0.984375, + "step": 917 + }, + { + "completion_length": 65.046875, + "epoch": 4.191780821917808, + "grad_norm": 3.412703275680542, + "kl": 0.13232421875, + "learning_rate": 5.808219178082191e-07, + "loss": 0.0053, + "reward": 1.6960819363594055, + "reward_std": 0.19595444947481155, + "rewards/accuracy_reward": 0.7117068469524384, + "rewards/format_reward": 0.984375, + "step": 918 + }, + { + "completion_length": 83.4140625, + "epoch": 4.19634703196347, + "grad_norm": 1.8262677192687988, + "kl": 0.120849609375, + "learning_rate": 5.80365296803653e-07, + "loss": 0.0048, + "reward": 1.7575623989105225, + "reward_std": 0.1326010897755623, + "rewards/accuracy_reward": 0.7653749287128448, + "rewards/format_reward": 0.9921875, + "step": 919 + }, + { + "completion_length": 86.09375, + "epoch": 4.200913242009133, + "grad_norm": 4.315945625305176, + "kl": 0.10107421875, + "learning_rate": 5.799086757990867e-07, + "loss": 0.004, + "reward": 1.701339304447174, + "reward_std": 0.14829950034618378, + "rewards/accuracy_reward": 0.7013393044471741, + "rewards/format_reward": 1.0, + "step": 920 + }, + { + "completion_length": 74.46875, + "epoch": 4.205479452054795, + "grad_norm": 1.9493387937545776, + "kl": 0.08251953125, + "learning_rate": 5.794520547945206e-07, + "loss": 0.0033, + "reward": 1.8070870637893677, + "reward_std": 0.036666832864284515, + "rewards/accuracy_reward": 0.8070869743824005, + "rewards/format_reward": 1.0, + "step": 921 + }, + { + "completion_length": 77.328125, + "epoch": 4.210045662100456, + "grad_norm": 1.4725087881088257, + "kl": 0.1162109375, + "learning_rate": 5.789954337899543e-07, + "loss": 0.0047, + "reward": 1.8280134201049805, + "reward_std": 0.10532564483582973, + "rewards/accuracy_reward": 0.8358259201049805, + "rewards/format_reward": 0.9921875, + "step": 922 + }, + { + "completion_length": 72.9921875, + "epoch": 4.2146118721461185, + "grad_norm": 4.775530815124512, + "kl": 0.13037109375, + "learning_rate": 5.78538812785388e-07, + "loss": 0.0052, + "reward": 1.7100632786750793, + "reward_std": 0.17319176718592644, + "rewards/accuracy_reward": 0.7178757190704346, + "rewards/format_reward": 0.9921875, + "step": 923 + }, + { + "completion_length": 70.015625, + "epoch": 4.219178082191781, + "grad_norm": 2.908025026321411, + "kl": 0.162109375, + "learning_rate": 5.780821917808219e-07, + "loss": 0.0065, + "reward": 1.6276227831840515, + "reward_std": 0.22536901384592056, + "rewards/accuracy_reward": 0.6432477831840515, + "rewards/format_reward": 0.984375, + "step": 924 + }, + { + "completion_length": 59.578125, + "epoch": 4.223744292237443, + "grad_norm": 5.317495346069336, + "kl": 0.10595703125, + "learning_rate": 5.776255707762557e-07, + "loss": 0.0042, + "reward": 1.6319011449813843, + "reward_std": 0.18694934993982315, + "rewards/accuracy_reward": 0.6397135257720947, + "rewards/format_reward": 0.9921875, + "step": 925 + }, + { + "completion_length": 81.40625, + "epoch": 4.228310502283105, + "grad_norm": 2.3484511375427246, + "kl": 0.091796875, + "learning_rate": 5.771689497716896e-07, + "loss": 0.0037, + "reward": 1.7912667393684387, + "reward_std": 0.08569350093603134, + "rewards/accuracy_reward": 0.7912667095661163, + "rewards/format_reward": 1.0, + "step": 926 + }, + { + "completion_length": 68.578125, + "epoch": 4.232876712328767, + "grad_norm": 2.1061465740203857, + "kl": 0.125244140625, + "learning_rate": 5.767123287671233e-07, + "loss": 0.005, + "reward": 1.8626301884651184, + "reward_std": 0.10864401236176491, + "rewards/accuracy_reward": 0.8626301884651184, + "rewards/format_reward": 1.0, + "step": 927 + }, + { + "completion_length": 74.9765625, + "epoch": 4.237442922374429, + "grad_norm": 3.035919427871704, + "kl": 0.13134765625, + "learning_rate": 5.76255707762557e-07, + "loss": 0.0053, + "reward": 1.6922495365142822, + "reward_std": 0.15072567015886307, + "rewards/accuracy_reward": 0.7156869769096375, + "rewards/format_reward": 0.9765625, + "step": 928 + }, + { + "completion_length": 58.78125, + "epoch": 4.242009132420091, + "grad_norm": 2.8555891513824463, + "kl": 0.16064453125, + "learning_rate": 5.757990867579909e-07, + "loss": 0.0064, + "reward": 1.8009114861488342, + "reward_std": 0.18833597749471664, + "rewards/accuracy_reward": 0.8009114265441895, + "rewards/format_reward": 1.0, + "step": 929 + }, + { + "completion_length": 79.625, + "epoch": 4.2465753424657535, + "grad_norm": 4.95534086227417, + "kl": 0.113037109375, + "learning_rate": 5.753424657534246e-07, + "loss": 0.0045, + "reward": 1.668163239955902, + "reward_std": 0.20845329016447067, + "rewards/accuracy_reward": 0.6759756505489349, + "rewards/format_reward": 0.9921875, + "step": 930 + }, + { + "completion_length": 57.59375, + "epoch": 4.251141552511416, + "grad_norm": 3.5745315551757812, + "kl": 0.158203125, + "learning_rate": 5.748858447488583e-07, + "loss": 0.0063, + "reward": 1.7814725637435913, + "reward_std": 0.1947070211172104, + "rewards/accuracy_reward": 0.7814724445343018, + "rewards/format_reward": 1.0, + "step": 931 + }, + { + "completion_length": 69.296875, + "epoch": 4.255707762557078, + "grad_norm": 2.9775948524475098, + "kl": 0.129638671875, + "learning_rate": 5.744292237442923e-07, + "loss": 0.0052, + "reward": 1.5528324842453003, + "reward_std": 0.23214496672153473, + "rewards/accuracy_reward": 0.5684574842453003, + "rewards/format_reward": 0.984375, + "step": 932 + }, + { + "completion_length": 85.0859375, + "epoch": 4.260273972602739, + "grad_norm": 2.191046714782715, + "kl": 0.13232421875, + "learning_rate": 5.73972602739726e-07, + "loss": 0.0053, + "reward": 1.666010558605194, + "reward_std": 0.16855772212147713, + "rewards/accuracy_reward": 0.6816355586051941, + "rewards/format_reward": 0.984375, + "step": 933 + }, + { + "completion_length": 84.921875, + "epoch": 4.264840182648402, + "grad_norm": 1.6789612770080566, + "kl": 0.0821533203125, + "learning_rate": 5.735159817351598e-07, + "loss": 0.0033, + "reward": 1.749913215637207, + "reward_std": 0.07257125526666641, + "rewards/accuracy_reward": 0.7499131858348846, + "rewards/format_reward": 1.0, + "step": 934 + }, + { + "completion_length": 78.6171875, + "epoch": 4.269406392694064, + "grad_norm": 4.655454158782959, + "kl": 0.116455078125, + "learning_rate": 5.730593607305936e-07, + "loss": 0.0047, + "reward": 1.7203125953674316, + "reward_std": 0.14768873527646065, + "rewards/accuracy_reward": 0.7203124463558197, + "rewards/format_reward": 1.0, + "step": 935 + }, + { + "completion_length": 79.765625, + "epoch": 4.273972602739726, + "grad_norm": 2.1534454822540283, + "kl": 0.093017578125, + "learning_rate": 5.726027397260273e-07, + "loss": 0.0037, + "reward": 1.7255195379257202, + "reward_std": 0.18480905890464783, + "rewards/accuracy_reward": 0.7489570677280426, + "rewards/format_reward": 0.9765625, + "step": 936 + }, + { + "completion_length": 82.1015625, + "epoch": 4.2785388127853885, + "grad_norm": 4.468496799468994, + "kl": 0.081298828125, + "learning_rate": 5.721461187214612e-07, + "loss": 0.0032, + "reward": 1.8500558733940125, + "reward_std": 0.13352815061807632, + "rewards/accuracy_reward": 0.8578682541847229, + "rewards/format_reward": 0.9921875, + "step": 937 + }, + { + "completion_length": 55.5078125, + "epoch": 4.28310502283105, + "grad_norm": 2.743320941925049, + "kl": 0.18212890625, + "learning_rate": 5.716894977168949e-07, + "loss": 0.0073, + "reward": 1.5885499119758606, + "reward_std": 0.20670751482248306, + "rewards/accuracy_reward": 0.6041748821735382, + "rewards/format_reward": 0.984375, + "step": 938 + }, + { + "completion_length": 77.984375, + "epoch": 4.287671232876712, + "grad_norm": 3.1169564723968506, + "kl": 0.102783203125, + "learning_rate": 5.712328767123287e-07, + "loss": 0.0041, + "reward": 1.6959820985794067, + "reward_std": 0.1572049930691719, + "rewards/accuracy_reward": 0.7037945985794067, + "rewards/format_reward": 0.9921875, + "step": 939 + }, + { + "completion_length": 78.359375, + "epoch": 4.292237442922374, + "grad_norm": 1.5095194578170776, + "kl": 0.11083984375, + "learning_rate": 5.707762557077626e-07, + "loss": 0.0044, + "reward": 1.7596355080604553, + "reward_std": 0.1378481425344944, + "rewards/accuracy_reward": 0.7830728888511658, + "rewards/format_reward": 0.9765625, + "step": 940 + }, + { + "completion_length": 61.1171875, + "epoch": 4.296803652968037, + "grad_norm": 2.8175227642059326, + "kl": 0.153076171875, + "learning_rate": 5.703196347031963e-07, + "loss": 0.0061, + "reward": 1.6882672905921936, + "reward_std": 0.1907612383365631, + "rewards/accuracy_reward": 0.696079820394516, + "rewards/format_reward": 0.9921875, + "step": 941 + }, + { + "completion_length": 77.0546875, + "epoch": 4.301369863013699, + "grad_norm": 2.724752426147461, + "kl": 0.14306640625, + "learning_rate": 5.698630136986301e-07, + "loss": 0.0057, + "reward": 1.6338477730751038, + "reward_std": 0.1553211621940136, + "rewards/accuracy_reward": 0.633847713470459, + "rewards/format_reward": 1.0, + "step": 942 + }, + { + "completion_length": 72.0625, + "epoch": 4.30593607305936, + "grad_norm": 1.5134310722351074, + "kl": 0.109375, + "learning_rate": 5.694063926940639e-07, + "loss": 0.0044, + "reward": 1.8307477235794067, + "reward_std": 0.10607551783323288, + "rewards/accuracy_reward": 0.8307477533817291, + "rewards/format_reward": 1.0, + "step": 943 + }, + { + "completion_length": 84.6796875, + "epoch": 4.310502283105023, + "grad_norm": 1.9289695024490356, + "kl": 0.120849609375, + "learning_rate": 5.689497716894976e-07, + "loss": 0.0048, + "reward": 1.7625186443328857, + "reward_std": 0.14161107502877712, + "rewards/accuracy_reward": 0.7703310549259186, + "rewards/format_reward": 0.9921875, + "step": 944 + }, + { + "completion_length": 68.6171875, + "epoch": 4.315068493150685, + "grad_norm": 2.0734076499938965, + "kl": 0.149169921875, + "learning_rate": 5.684931506849316e-07, + "loss": 0.006, + "reward": 1.6850000619888306, + "reward_std": 0.14654473960399628, + "rewards/accuracy_reward": 0.6928124725818634, + "rewards/format_reward": 0.9921875, + "step": 945 + }, + { + "completion_length": 68.4765625, + "epoch": 4.319634703196347, + "grad_norm": 2.146794080734253, + "kl": 0.11572265625, + "learning_rate": 5.680365296803653e-07, + "loss": 0.0046, + "reward": 1.70016747713089, + "reward_std": 0.20281969010829926, + "rewards/accuracy_reward": 0.7157924175262451, + "rewards/format_reward": 0.984375, + "step": 946 + }, + { + "completion_length": 68.03125, + "epoch": 4.324200913242009, + "grad_norm": 2.0439021587371826, + "kl": 0.126953125, + "learning_rate": 5.67579908675799e-07, + "loss": 0.0051, + "reward": 1.8492187857627869, + "reward_std": 0.09047675505280495, + "rewards/accuracy_reward": 0.8492186665534973, + "rewards/format_reward": 1.0, + "step": 947 + }, + { + "completion_length": 68.6953125, + "epoch": 4.328767123287671, + "grad_norm": 5.651437759399414, + "kl": 0.129638671875, + "learning_rate": 5.671232876712329e-07, + "loss": 0.0052, + "reward": 1.7400199174880981, + "reward_std": 0.19576279073953629, + "rewards/accuracy_reward": 0.7478323876857758, + "rewards/format_reward": 0.9921875, + "step": 948 + }, + { + "completion_length": 92.3828125, + "epoch": 4.333333333333333, + "grad_norm": 2.3988163471221924, + "kl": 0.107666015625, + "learning_rate": 5.666666666666666e-07, + "loss": 0.0043, + "reward": 1.7797867059707642, + "reward_std": 0.091391421854496, + "rewards/accuracy_reward": 0.7797866761684418, + "rewards/format_reward": 1.0, + "step": 949 + }, + { + "completion_length": 65.796875, + "epoch": 4.337899543378995, + "grad_norm": 4.604662895202637, + "kl": 0.14306640625, + "learning_rate": 5.662100456621004e-07, + "loss": 0.0057, + "reward": 1.5967974662780762, + "reward_std": 0.19326772540807724, + "rewards/accuracy_reward": 0.596797525882721, + "rewards/format_reward": 1.0, + "step": 950 + }, + { + "completion_length": 65.34375, + "epoch": 4.342465753424658, + "grad_norm": 2.6304314136505127, + "kl": 0.194091796875, + "learning_rate": 5.657534246575342e-07, + "loss": 0.0078, + "reward": 1.6250000596046448, + "reward_std": 0.22805283963680267, + "rewards/accuracy_reward": 0.6484375, + "rewards/format_reward": 0.9765625, + "step": 951 + }, + { + "completion_length": 79.5390625, + "epoch": 4.34703196347032, + "grad_norm": 26.240947723388672, + "kl": 0.1016845703125, + "learning_rate": 5.65296803652968e-07, + "loss": 0.0041, + "reward": 1.7042073011398315, + "reward_std": 0.1676829382777214, + "rewards/accuracy_reward": 0.7120197415351868, + "rewards/format_reward": 0.9921875, + "step": 952 + }, + { + "completion_length": 74.0390625, + "epoch": 4.351598173515982, + "grad_norm": 2.3484628200531006, + "kl": 0.178466796875, + "learning_rate": 5.648401826484019e-07, + "loss": 0.0072, + "reward": 1.7171673774719238, + "reward_std": 0.16094867885112762, + "rewards/accuracy_reward": 0.7249797880649567, + "rewards/format_reward": 0.9921875, + "step": 953 + }, + { + "completion_length": 74.578125, + "epoch": 4.3561643835616435, + "grad_norm": 42.42313766479492, + "kl": 0.120361328125, + "learning_rate": 5.643835616438356e-07, + "loss": 0.0048, + "reward": 1.6389508843421936, + "reward_std": 0.1658247858285904, + "rewards/accuracy_reward": 0.638950914144516, + "rewards/format_reward": 1.0, + "step": 954 + }, + { + "completion_length": 76.90625, + "epoch": 4.360730593607306, + "grad_norm": 12.59420108795166, + "kl": 0.126220703125, + "learning_rate": 5.639269406392693e-07, + "loss": 0.005, + "reward": 1.704687476158142, + "reward_std": 0.18361148238182068, + "rewards/accuracy_reward": 0.7046875059604645, + "rewards/format_reward": 1.0, + "step": 955 + }, + { + "completion_length": 84.125, + "epoch": 4.365296803652968, + "grad_norm": 2.1863152980804443, + "kl": 0.107177734375, + "learning_rate": 5.634703196347032e-07, + "loss": 0.0043, + "reward": 1.6656250953674316, + "reward_std": 0.1505398079752922, + "rewards/accuracy_reward": 0.6812499761581421, + "rewards/format_reward": 0.984375, + "step": 956 + }, + { + "completion_length": 66.25, + "epoch": 4.36986301369863, + "grad_norm": 5.098891735076904, + "kl": 0.134033203125, + "learning_rate": 5.630136986301369e-07, + "loss": 0.0054, + "reward": 1.7317472100257874, + "reward_std": 0.13505896925926208, + "rewards/accuracy_reward": 0.7395596504211426, + "rewards/format_reward": 0.9921875, + "step": 957 + }, + { + "completion_length": 66.5078125, + "epoch": 4.3744292237442925, + "grad_norm": 10.807661056518555, + "kl": 0.119873046875, + "learning_rate": 5.625570776255707e-07, + "loss": 0.0048, + "reward": 1.7822917103767395, + "reward_std": 0.11853177845478058, + "rewards/accuracy_reward": 0.7822916507720947, + "rewards/format_reward": 1.0, + "step": 958 + }, + { + "completion_length": 67.375, + "epoch": 4.378995433789954, + "grad_norm": 1.9763253927230835, + "kl": 0.16064453125, + "learning_rate": 5.621004566210046e-07, + "loss": 0.0064, + "reward": 1.6397321224212646, + "reward_std": 0.2425994575023651, + "rewards/accuracy_reward": 0.6631696224212646, + "rewards/format_reward": 0.9765625, + "step": 959 + }, + { + "completion_length": 73.609375, + "epoch": 4.383561643835616, + "grad_norm": 11.640439987182617, + "kl": 0.12841796875, + "learning_rate": 5.616438356164383e-07, + "loss": 0.0051, + "reward": 1.6920552849769592, + "reward_std": 0.23473482113331556, + "rewards/accuracy_reward": 0.7154927253723145, + "rewards/format_reward": 0.9765625, + "step": 960 + }, + { + "completion_length": 67.1640625, + "epoch": 4.3881278538812785, + "grad_norm": 4.730476379394531, + "kl": 0.091796875, + "learning_rate": 5.611872146118722e-07, + "loss": 0.0037, + "reward": 1.8379933834075928, + "reward_std": 0.07210628129541874, + "rewards/accuracy_reward": 0.8379934132099152, + "rewards/format_reward": 1.0, + "step": 961 + }, + { + "completion_length": 95.25, + "epoch": 4.392694063926941, + "grad_norm": 1.63775634765625, + "kl": 0.069091796875, + "learning_rate": 5.607305936073059e-07, + "loss": 0.0028, + "reward": 1.818750023841858, + "reward_std": 0.0883883461356163, + "rewards/accuracy_reward": 0.8343749344348907, + "rewards/format_reward": 0.984375, + "step": 962 + }, + { + "completion_length": 58.0078125, + "epoch": 4.397260273972603, + "grad_norm": 2.0879337787628174, + "kl": 0.162109375, + "learning_rate": 5.602739726027396e-07, + "loss": 0.0065, + "reward": 1.810954213142395, + "reward_std": 0.18426654487848282, + "rewards/accuracy_reward": 0.818766713142395, + "rewards/format_reward": 0.9921875, + "step": 963 + }, + { + "completion_length": 60.8125, + "epoch": 4.401826484018265, + "grad_norm": 4.00584077835083, + "kl": 0.1513671875, + "learning_rate": 5.598173515981735e-07, + "loss": 0.0061, + "reward": 1.765178620815277, + "reward_std": 0.20918096601963043, + "rewards/accuracy_reward": 0.7729910910129547, + "rewards/format_reward": 0.9921875, + "step": 964 + }, + { + "completion_length": 62.6171875, + "epoch": 4.406392694063927, + "grad_norm": 2.43243408203125, + "kl": 0.134521484375, + "learning_rate": 5.593607305936073e-07, + "loss": 0.0054, + "reward": 1.7512852549552917, + "reward_std": 0.20323559269309044, + "rewards/accuracy_reward": 0.7512852847576141, + "rewards/format_reward": 1.0, + "step": 965 + }, + { + "completion_length": 87.6875, + "epoch": 4.410958904109589, + "grad_norm": 2.643324375152588, + "kl": 0.10693359375, + "learning_rate": 5.589041095890411e-07, + "loss": 0.0043, + "reward": 1.7709821462631226, + "reward_std": 0.12626906856894493, + "rewards/accuracy_reward": 0.7787946164608002, + "rewards/format_reward": 0.9921875, + "step": 966 + }, + { + "completion_length": 73.7421875, + "epoch": 4.415525114155251, + "grad_norm": 3.2244210243225098, + "kl": 0.10791015625, + "learning_rate": 5.584474885844749e-07, + "loss": 0.0043, + "reward": 1.76171875, + "reward_std": 0.18859816156327724, + "rewards/accuracy_reward": 0.7695312201976776, + "rewards/format_reward": 0.9921875, + "step": 967 + }, + { + "completion_length": 77.84375, + "epoch": 4.420091324200913, + "grad_norm": 4.4358625411987305, + "kl": 0.11962890625, + "learning_rate": 5.579908675799086e-07, + "loss": 0.0048, + "reward": 1.8305882215499878, + "reward_std": 0.09982336312532425, + "rewards/accuracy_reward": 0.8305881917476654, + "rewards/format_reward": 1.0, + "step": 968 + }, + { + "completion_length": 62.171875, + "epoch": 4.424657534246576, + "grad_norm": 4.14423942565918, + "kl": 0.133056640625, + "learning_rate": 5.575342465753425e-07, + "loss": 0.0053, + "reward": 1.6299851536750793, + "reward_std": 0.19754792749881744, + "rewards/accuracy_reward": 0.637797623872757, + "rewards/format_reward": 0.9921875, + "step": 969 + }, + { + "completion_length": 83.1953125, + "epoch": 4.429223744292237, + "grad_norm": 1.427676796913147, + "kl": 0.087158203125, + "learning_rate": 5.570776255707762e-07, + "loss": 0.0035, + "reward": 1.8171875476837158, + "reward_std": 0.11048543266952038, + "rewards/accuracy_reward": 0.8249998986721039, + "rewards/format_reward": 0.9921875, + "step": 970 + }, + { + "completion_length": 83.6484375, + "epoch": 4.433789954337899, + "grad_norm": 2.3894028663635254, + "kl": 0.098388671875, + "learning_rate": 5.566210045662099e-07, + "loss": 0.0039, + "reward": 1.7342448234558105, + "reward_std": 0.24424592405557632, + "rewards/accuracy_reward": 0.7576822340488434, + "rewards/format_reward": 0.9765625, + "step": 971 + }, + { + "completion_length": 86.203125, + "epoch": 4.438356164383562, + "grad_norm": 2.410252571105957, + "kl": 0.103515625, + "learning_rate": 5.561643835616439e-07, + "loss": 0.0041, + "reward": 1.8436079621315002, + "reward_std": 0.08567275106906891, + "rewards/accuracy_reward": 0.8514204323291779, + "rewards/format_reward": 0.9921875, + "step": 972 + }, + { + "completion_length": 67.296875, + "epoch": 4.442922374429224, + "grad_norm": 3.1359169483184814, + "kl": 0.156982421875, + "learning_rate": 5.557077625570776e-07, + "loss": 0.0063, + "reward": 1.6325520277023315, + "reward_std": 0.1258198693394661, + "rewards/accuracy_reward": 0.6325520575046539, + "rewards/format_reward": 1.0, + "step": 973 + }, + { + "completion_length": 75.625, + "epoch": 4.447488584474886, + "grad_norm": 3.9389309883117676, + "kl": 0.17529296875, + "learning_rate": 5.552511415525114e-07, + "loss": 0.007, + "reward": 1.6369017958641052, + "reward_std": 0.21986636519432068, + "rewards/accuracy_reward": 0.6447142958641052, + "rewards/format_reward": 0.9921875, + "step": 974 + }, + { + "completion_length": 75.4765625, + "epoch": 4.4520547945205475, + "grad_norm": 19.899776458740234, + "kl": 0.638916015625, + "learning_rate": 5.547945205479452e-07, + "loss": 0.0255, + "reward": 1.819531261920929, + "reward_std": 0.07878133933991194, + "rewards/accuracy_reward": 0.819531261920929, + "rewards/format_reward": 1.0, + "step": 975 + }, + { + "completion_length": 73.5625, + "epoch": 4.45662100456621, + "grad_norm": 3.0659143924713135, + "kl": 0.095703125, + "learning_rate": 5.543378995433789e-07, + "loss": 0.0038, + "reward": 1.6691592335700989, + "reward_std": 0.19636988639831543, + "rewards/accuracy_reward": 0.6847842335700989, + "rewards/format_reward": 0.984375, + "step": 976 + }, + { + "completion_length": 82.0, + "epoch": 4.461187214611872, + "grad_norm": 2.006531000137329, + "kl": 0.101318359375, + "learning_rate": 5.538812785388128e-07, + "loss": 0.0041, + "reward": 1.8178571462631226, + "reward_std": 0.1673773005604744, + "rewards/accuracy_reward": 0.8334820866584778, + "rewards/format_reward": 0.984375, + "step": 977 + }, + { + "completion_length": 55.5234375, + "epoch": 4.465753424657534, + "grad_norm": 2.1843132972717285, + "kl": 0.158203125, + "learning_rate": 5.534246575342465e-07, + "loss": 0.0063, + "reward": 1.770518183708191, + "reward_std": 0.19590065628290176, + "rewards/accuracy_reward": 0.7783306241035461, + "rewards/format_reward": 0.9921875, + "step": 978 + }, + { + "completion_length": 54.3984375, + "epoch": 4.470319634703197, + "grad_norm": 5.7283830642700195, + "kl": 0.16845703125, + "learning_rate": 5.529680365296803e-07, + "loss": 0.0067, + "reward": 1.692187488079071, + "reward_std": 0.193861223757267, + "rewards/accuracy_reward": 0.6999999582767487, + "rewards/format_reward": 0.9921875, + "step": 979 + }, + { + "completion_length": 73.6171875, + "epoch": 4.474885844748858, + "grad_norm": 4.302793979644775, + "kl": 0.139404296875, + "learning_rate": 5.525114155251142e-07, + "loss": 0.0056, + "reward": 1.7293124198913574, + "reward_std": 0.14934544544667006, + "rewards/accuracy_reward": 0.7293124198913574, + "rewards/format_reward": 1.0, + "step": 980 + }, + { + "completion_length": 87.0625, + "epoch": 4.47945205479452, + "grad_norm": 2.7895452976226807, + "kl": 0.115966796875, + "learning_rate": 5.520547945205479e-07, + "loss": 0.0046, + "reward": 1.7789062857627869, + "reward_std": 0.09265873953700066, + "rewards/accuracy_reward": 0.7789061963558197, + "rewards/format_reward": 1.0, + "step": 981 + }, + { + "completion_length": 78.3984375, + "epoch": 4.4840182648401825, + "grad_norm": 3.9894137382507324, + "kl": 0.1279296875, + "learning_rate": 5.515981735159817e-07, + "loss": 0.0051, + "reward": 1.6846325397491455, + "reward_std": 0.18365756422281265, + "rewards/accuracy_reward": 0.7002575993537903, + "rewards/format_reward": 0.984375, + "step": 982 + }, + { + "completion_length": 76.875, + "epoch": 4.488584474885845, + "grad_norm": 3.384289503097534, + "kl": 0.150634765625, + "learning_rate": 5.511415525114155e-07, + "loss": 0.006, + "reward": 1.7085938453674316, + "reward_std": 0.20954116433858871, + "rewards/accuracy_reward": 0.7320311665534973, + "rewards/format_reward": 0.9765625, + "step": 983 + }, + { + "completion_length": 49.8671875, + "epoch": 4.493150684931507, + "grad_norm": 5.41481876373291, + "kl": 0.14111328125, + "learning_rate": 5.506849315068492e-07, + "loss": 0.0056, + "reward": 1.5635416507720947, + "reward_std": 0.23787462711334229, + "rewards/accuracy_reward": 0.5635416507720947, + "rewards/format_reward": 1.0, + "step": 984 + }, + { + "completion_length": 87.890625, + "epoch": 4.497716894977169, + "grad_norm": 1.2277454137802124, + "kl": 0.0751953125, + "learning_rate": 5.502283105022832e-07, + "loss": 0.003, + "reward": 1.7507812976837158, + "reward_std": 0.05524272099137306, + "rewards/accuracy_reward": 0.7507811486721039, + "rewards/format_reward": 1.0, + "step": 985 + }, + { + "completion_length": 61.9453125, + "epoch": 4.502283105022831, + "grad_norm": 5.256927490234375, + "kl": 0.1865234375, + "learning_rate": 5.497716894977169e-07, + "loss": 0.0075, + "reward": 1.610156238079071, + "reward_std": 0.2878893092274666, + "rewards/accuracy_reward": 0.625781238079071, + "rewards/format_reward": 0.984375, + "step": 986 + }, + { + "completion_length": 82.71875, + "epoch": 4.506849315068493, + "grad_norm": 3.515986919403076, + "kl": 0.18603515625, + "learning_rate": 5.493150684931506e-07, + "loss": 0.0074, + "reward": 1.7588542103767395, + "reward_std": 0.1455376148223877, + "rewards/accuracy_reward": 0.7588541209697723, + "rewards/format_reward": 1.0, + "step": 987 + }, + { + "completion_length": 67.7578125, + "epoch": 4.511415525114155, + "grad_norm": 2.741415023803711, + "kl": 0.14306640625, + "learning_rate": 5.488584474885845e-07, + "loss": 0.0057, + "reward": 1.5901537537574768, + "reward_std": 0.15988682955503464, + "rewards/accuracy_reward": 0.5901537537574768, + "rewards/format_reward": 1.0, + "step": 988 + }, + { + "completion_length": 66.84375, + "epoch": 4.5159817351598175, + "grad_norm": 2.9208884239196777, + "kl": 0.12060546875, + "learning_rate": 5.484018264840182e-07, + "loss": 0.0048, + "reward": 1.7678571939468384, + "reward_std": 0.14058196544647217, + "rewards/accuracy_reward": 0.7678571343421936, + "rewards/format_reward": 1.0, + "step": 989 + }, + { + "completion_length": 67.2734375, + "epoch": 4.52054794520548, + "grad_norm": 3.776416540145874, + "kl": 0.126708984375, + "learning_rate": 5.47945205479452e-07, + "loss": 0.0051, + "reward": 1.6418346166610718, + "reward_std": 0.1818918213248253, + "rewards/accuracy_reward": 0.6418345868587494, + "rewards/format_reward": 1.0, + "step": 990 + }, + { + "completion_length": 72.625, + "epoch": 4.525114155251142, + "grad_norm": 2.1685075759887695, + "kl": 0.138916015625, + "learning_rate": 5.474885844748858e-07, + "loss": 0.0056, + "reward": 1.7085193395614624, + "reward_std": 0.22063866257667542, + "rewards/accuracy_reward": 0.72414430975914, + "rewards/format_reward": 0.984375, + "step": 991 + }, + { + "completion_length": 66.28125, + "epoch": 4.529680365296803, + "grad_norm": 14.555278778076172, + "kl": 0.1513671875, + "learning_rate": 5.470319634703196e-07, + "loss": 0.006, + "reward": 1.6950520873069763, + "reward_std": 0.19789891690015793, + "rewards/accuracy_reward": 0.6950520873069763, + "rewards/format_reward": 1.0, + "step": 992 + }, + { + "completion_length": 68.71875, + "epoch": 4.534246575342466, + "grad_norm": 4.036056995391846, + "kl": 0.15283203125, + "learning_rate": 5.465753424657535e-07, + "loss": 0.0061, + "reward": 1.5639322996139526, + "reward_std": 0.2465272918343544, + "rewards/accuracy_reward": 0.5795572698116302, + "rewards/format_reward": 0.984375, + "step": 993 + }, + { + "completion_length": 72.296875, + "epoch": 4.538812785388128, + "grad_norm": 4.413112163543701, + "kl": 0.2080078125, + "learning_rate": 5.461187214611872e-07, + "loss": 0.0083, + "reward": 1.6929687857627869, + "reward_std": 0.26582426577806473, + "rewards/accuracy_reward": 0.7242187261581421, + "rewards/format_reward": 0.96875, + "step": 994 + }, + { + "completion_length": 71.1171875, + "epoch": 4.54337899543379, + "grad_norm": 2.5498805046081543, + "kl": 0.13623046875, + "learning_rate": 5.456621004566209e-07, + "loss": 0.0055, + "reward": 1.696093738079071, + "reward_std": 0.20334278792142868, + "rewards/accuracy_reward": 0.7039062678813934, + "rewards/format_reward": 0.9921875, + "step": 995 + }, + { + "completion_length": 80.1796875, + "epoch": 4.5479452054794525, + "grad_norm": 1.7603377103805542, + "kl": 0.117919921875, + "learning_rate": 5.452054794520548e-07, + "loss": 0.0047, + "reward": 1.6623343229293823, + "reward_std": 0.15708915889263153, + "rewards/accuracy_reward": 0.6779592037200928, + "rewards/format_reward": 0.984375, + "step": 996 + }, + { + "completion_length": 56.375, + "epoch": 4.552511415525114, + "grad_norm": 4.723056316375732, + "kl": 0.17236328125, + "learning_rate": 5.447488584474885e-07, + "loss": 0.0069, + "reward": 1.7257593870162964, + "reward_std": 0.21335439383983612, + "rewards/accuracy_reward": 0.7257594168186188, + "rewards/format_reward": 1.0, + "step": 997 + }, + { + "completion_length": 59.25, + "epoch": 4.557077625570776, + "grad_norm": 2.3055455684661865, + "kl": 0.1591796875, + "learning_rate": 5.442922374429223e-07, + "loss": 0.0064, + "reward": 1.7297247648239136, + "reward_std": 0.17667409405112267, + "rewards/accuracy_reward": 0.7375371754169464, + "rewards/format_reward": 0.9921875, + "step": 998 + }, + { + "completion_length": 55.4296875, + "epoch": 4.561643835616438, + "grad_norm": 10.311870574951172, + "kl": 0.135009765625, + "learning_rate": 5.438356164383562e-07, + "loss": 0.0054, + "reward": 1.48444002866745, + "reward_std": 0.28679582476615906, + "rewards/accuracy_reward": 0.49225252866744995, + "rewards/format_reward": 0.9921875, + "step": 999 + }, + { + "completion_length": 60.46875, + "epoch": 4.566210045662101, + "grad_norm": 3.404649019241333, + "kl": 0.1982421875, + "learning_rate": 5.433789954337899e-07, + "loss": 0.0079, + "reward": 1.5433160066604614, + "reward_std": 0.2616465389728546, + "rewards/accuracy_reward": 0.574565976858139, + "rewards/format_reward": 0.96875, + "step": 1000 + }, + { + "completion_length": 73.15625, + "epoch": 4.570776255707763, + "grad_norm": 1.8293238878250122, + "kl": 0.118408203125, + "learning_rate": 5.429223744292238e-07, + "loss": 0.0047, + "reward": 1.8298460245132446, + "reward_std": 0.1409841626882553, + "rewards/accuracy_reward": 0.8454709947109222, + "rewards/format_reward": 0.984375, + "step": 1001 + }, + { + "completion_length": 76.0703125, + "epoch": 4.575342465753424, + "grad_norm": 1.9321002960205078, + "kl": 0.111328125, + "learning_rate": 5.424657534246575e-07, + "loss": 0.0044, + "reward": 1.725000023841858, + "reward_std": 0.15813970565795898, + "rewards/accuracy_reward": 0.7328124344348907, + "rewards/format_reward": 0.9921875, + "step": 1002 + }, + { + "completion_length": 65.5703125, + "epoch": 4.579908675799087, + "grad_norm": 2.1403932571411133, + "kl": 0.133056640625, + "learning_rate": 5.420091324200912e-07, + "loss": 0.0053, + "reward": 1.7848958373069763, + "reward_std": 0.21307425945997238, + "rewards/accuracy_reward": 0.8083333373069763, + "rewards/format_reward": 0.9765625, + "step": 1003 + }, + { + "completion_length": 108.484375, + "epoch": 4.584474885844749, + "grad_norm": 1.755354642868042, + "kl": 0.10107421875, + "learning_rate": 5.415525114155251e-07, + "loss": 0.004, + "reward": 1.8961884379386902, + "reward_std": 0.060667259618639946, + "rewards/accuracy_reward": 0.896188348531723, + "rewards/format_reward": 1.0, + "step": 1004 + }, + { + "completion_length": 48.4140625, + "epoch": 4.589041095890411, + "grad_norm": 1.387018084526062, + "kl": 0.1494140625, + "learning_rate": 5.410958904109589e-07, + "loss": 0.006, + "reward": 1.8950520753860474, + "reward_std": 0.12889967486262321, + "rewards/accuracy_reward": 0.9028645753860474, + "rewards/format_reward": 0.9921875, + "step": 1005 + }, + { + "completion_length": 62.9140625, + "epoch": 4.593607305936073, + "grad_norm": 3.6895904541015625, + "kl": 0.20751953125, + "learning_rate": 5.406392694063927e-07, + "loss": 0.0083, + "reward": 1.6447916626930237, + "reward_std": 0.25781603902578354, + "rewards/accuracy_reward": 0.6526041626930237, + "rewards/format_reward": 0.9921875, + "step": 1006 + }, + { + "completion_length": 73.59375, + "epoch": 4.598173515981735, + "grad_norm": 5.844861030578613, + "kl": 0.118896484375, + "learning_rate": 5.401826484018265e-07, + "loss": 0.0048, + "reward": 1.6738625168800354, + "reward_std": 0.16072557866573334, + "rewards/accuracy_reward": 0.6894874572753906, + "rewards/format_reward": 0.984375, + "step": 1007 + }, + { + "completion_length": 62.390625, + "epoch": 4.602739726027397, + "grad_norm": 9.675552368164062, + "kl": 0.143798828125, + "learning_rate": 5.397260273972602e-07, + "loss": 0.0058, + "reward": 1.6939173936843872, + "reward_std": 0.1647581309080124, + "rewards/accuracy_reward": 0.6939173638820648, + "rewards/format_reward": 1.0, + "step": 1008 + }, + { + "completion_length": 67.1015625, + "epoch": 4.607305936073059, + "grad_norm": 5.169093132019043, + "kl": 0.130859375, + "learning_rate": 5.392694063926941e-07, + "loss": 0.0052, + "reward": 1.6269097924232483, + "reward_std": 0.23041004687547684, + "rewards/accuracy_reward": 0.6425347030162811, + "rewards/format_reward": 0.984375, + "step": 1009 + }, + { + "completion_length": 66.6328125, + "epoch": 4.6118721461187215, + "grad_norm": 1.9441876411437988, + "kl": 0.106201171875, + "learning_rate": 5.388127853881278e-07, + "loss": 0.0043, + "reward": 1.5781250596046448, + "reward_std": 0.14283225312829018, + "rewards/accuracy_reward": 0.578125, + "rewards/format_reward": 1.0, + "step": 1010 + }, + { + "completion_length": 62.9296875, + "epoch": 4.616438356164384, + "grad_norm": 3.9866504669189453, + "kl": 0.20361328125, + "learning_rate": 5.383561643835615e-07, + "loss": 0.0081, + "reward": 1.718553066253662, + "reward_std": 0.1621505692601204, + "rewards/accuracy_reward": 0.7185530364513397, + "rewards/format_reward": 1.0, + "step": 1011 + }, + { + "completion_length": 54.625, + "epoch": 4.621004566210045, + "grad_norm": 3.620591402053833, + "kl": 0.1474609375, + "learning_rate": 5.378995433789955e-07, + "loss": 0.0059, + "reward": 1.813281238079071, + "reward_std": 0.07733980193734169, + "rewards/accuracy_reward": 0.813281238079071, + "rewards/format_reward": 1.0, + "step": 1012 + }, + { + "completion_length": 71.109375, + "epoch": 4.6255707762557075, + "grad_norm": 3.377457857131958, + "kl": 0.14208984375, + "learning_rate": 5.374429223744292e-07, + "loss": 0.0057, + "reward": 1.74702388048172, + "reward_std": 0.1983274295926094, + "rewards/accuracy_reward": 0.7470237612724304, + "rewards/format_reward": 1.0, + "step": 1013 + }, + { + "completion_length": 59.6171875, + "epoch": 4.63013698630137, + "grad_norm": 3.1818978786468506, + "kl": 0.16064453125, + "learning_rate": 5.36986301369863e-07, + "loss": 0.0064, + "reward": 1.667509913444519, + "reward_std": 0.17691579461097717, + "rewards/accuracy_reward": 0.6675098836421967, + "rewards/format_reward": 1.0, + "step": 1014 + }, + { + "completion_length": 62.40625, + "epoch": 4.634703196347032, + "grad_norm": 2.588076114654541, + "kl": 0.198974609375, + "learning_rate": 5.365296803652968e-07, + "loss": 0.008, + "reward": 1.7416791319847107, + "reward_std": 0.19379056990146637, + "rewards/accuracy_reward": 0.7573040723800659, + "rewards/format_reward": 0.984375, + "step": 1015 + }, + { + "completion_length": 64.6171875, + "epoch": 4.639269406392694, + "grad_norm": 2.7476561069488525, + "kl": 0.1728515625, + "learning_rate": 5.360730593607305e-07, + "loss": 0.0069, + "reward": 1.8008184432983398, + "reward_std": 0.1534968689084053, + "rewards/accuracy_reward": 0.8008184134960175, + "rewards/format_reward": 1.0, + "step": 1016 + }, + { + "completion_length": 71.171875, + "epoch": 4.6438356164383565, + "grad_norm": 3.0133135318756104, + "kl": 0.121826171875, + "learning_rate": 5.356164383561644e-07, + "loss": 0.0049, + "reward": 1.820052146911621, + "reward_std": 0.1514080874621868, + "rewards/accuracy_reward": 0.8278645575046539, + "rewards/format_reward": 0.9921875, + "step": 1017 + }, + { + "completion_length": 80.9375, + "epoch": 4.648401826484018, + "grad_norm": 55.512779235839844, + "kl": 0.109375, + "learning_rate": 5.351598173515981e-07, + "loss": 0.0044, + "reward": 1.7374999523162842, + "reward_std": 0.1173202246427536, + "rewards/accuracy_reward": 0.7374999821186066, + "rewards/format_reward": 1.0, + "step": 1018 + }, + { + "completion_length": 73.75, + "epoch": 4.65296803652968, + "grad_norm": 3.2106876373291016, + "kl": 0.150390625, + "learning_rate": 5.347031963470319e-07, + "loss": 0.006, + "reward": 1.7260440587997437, + "reward_std": 0.22125811874866486, + "rewards/accuracy_reward": 0.7338564693927765, + "rewards/format_reward": 0.9921875, + "step": 1019 + }, + { + "completion_length": 53.90625, + "epoch": 4.657534246575342, + "grad_norm": 3.0299935340881348, + "kl": 0.133056640625, + "learning_rate": 5.342465753424658e-07, + "loss": 0.0053, + "reward": 1.7598958611488342, + "reward_std": 0.14248863141983747, + "rewards/accuracy_reward": 0.7598957717418671, + "rewards/format_reward": 1.0, + "step": 1020 + }, + { + "completion_length": 44.4296875, + "epoch": 4.662100456621005, + "grad_norm": 17.613971710205078, + "kl": 0.751953125, + "learning_rate": 5.337899543378995e-07, + "loss": 0.03, + "reward": 1.843323826789856, + "reward_std": 0.22472677379846573, + "rewards/accuracy_reward": 0.866761326789856, + "rewards/format_reward": 0.9765625, + "step": 1021 + }, + { + "completion_length": 68.1640625, + "epoch": 4.666666666666667, + "grad_norm": 3.197855234146118, + "kl": 0.171875, + "learning_rate": 5.333333333333333e-07, + "loss": 0.0069, + "reward": 1.805405616760254, + "reward_std": 0.15048640966415405, + "rewards/accuracy_reward": 0.8054055869579315, + "rewards/format_reward": 1.0, + "step": 1022 + }, + { + "completion_length": 66.9921875, + "epoch": 4.671232876712329, + "grad_norm": 3.1693930625915527, + "kl": 0.126708984375, + "learning_rate": 5.328767123287671e-07, + "loss": 0.0051, + "reward": 1.6846354603767395, + "reward_std": 0.12612807750701904, + "rewards/accuracy_reward": 0.6846353709697723, + "rewards/format_reward": 1.0, + "step": 1023 + }, + { + "completion_length": 81.1484375, + "epoch": 4.675799086757991, + "grad_norm": 2.464825391769409, + "kl": 0.106689453125, + "learning_rate": 5.324200913242008e-07, + "loss": 0.0043, + "reward": 1.8835937976837158, + "reward_std": 0.08758953772485256, + "rewards/accuracy_reward": 0.8835937082767487, + "rewards/format_reward": 1.0, + "step": 1024 + }, + { + "completion_length": 64.78125, + "epoch": 4.680365296803653, + "grad_norm": 3.6387760639190674, + "kl": 0.17333984375, + "learning_rate": 5.319634703196348e-07, + "loss": 0.0069, + "reward": 1.7731905579566956, + "reward_std": 0.16570382565259933, + "rewards/accuracy_reward": 0.7731905579566956, + "rewards/format_reward": 1.0, + "step": 1025 + }, + { + "completion_length": 60.9296875, + "epoch": 4.684931506849315, + "grad_norm": 20.164772033691406, + "kl": 0.13232421875, + "learning_rate": 5.315068493150685e-07, + "loss": 0.0053, + "reward": 1.689843773841858, + "reward_std": 0.17715102434158325, + "rewards/accuracy_reward": 0.6976562738418579, + "rewards/format_reward": 0.9921875, + "step": 1026 + }, + { + "completion_length": 75.0, + "epoch": 4.689497716894977, + "grad_norm": 2.0931997299194336, + "kl": 0.1484375, + "learning_rate": 5.310502283105022e-07, + "loss": 0.0059, + "reward": 1.630468726158142, + "reward_std": 0.16156607866287231, + "rewards/accuracy_reward": 0.6460937261581421, + "rewards/format_reward": 0.984375, + "step": 1027 + }, + { + "completion_length": 49.234375, + "epoch": 4.69406392694064, + "grad_norm": 5.067066192626953, + "kl": 0.1962890625, + "learning_rate": 5.305936073059361e-07, + "loss": 0.0078, + "reward": 1.8552082777023315, + "reward_std": 0.15139273926615715, + "rewards/accuracy_reward": 0.8552083373069763, + "rewards/format_reward": 1.0, + "step": 1028 + }, + { + "completion_length": 80.9453125, + "epoch": 4.698630136986301, + "grad_norm": 4.103106498718262, + "kl": 0.126708984375, + "learning_rate": 5.301369863013698e-07, + "loss": 0.0051, + "reward": 1.678125023841858, + "reward_std": 0.17818758636713028, + "rewards/accuracy_reward": 0.6859374642372131, + "rewards/format_reward": 0.9921875, + "step": 1029 + }, + { + "completion_length": 75.2734375, + "epoch": 4.703196347031963, + "grad_norm": 4.427400588989258, + "kl": 0.25732421875, + "learning_rate": 5.296803652968036e-07, + "loss": 0.0103, + "reward": 1.696877896785736, + "reward_std": 0.16482967138290405, + "rewards/accuracy_reward": 0.7046903669834137, + "rewards/format_reward": 0.9921875, + "step": 1030 + }, + { + "completion_length": 56.8515625, + "epoch": 4.707762557077626, + "grad_norm": 5.5338921546936035, + "kl": 0.22607421875, + "learning_rate": 5.292237442922374e-07, + "loss": 0.0091, + "reward": 1.6015625, + "reward_std": 0.25800998508930206, + "rewards/accuracy_reward": 0.6249999403953552, + "rewards/format_reward": 0.9765625, + "step": 1031 + }, + { + "completion_length": 68.96875, + "epoch": 4.712328767123288, + "grad_norm": 2.648360013961792, + "kl": 0.138427734375, + "learning_rate": 5.287671232876712e-07, + "loss": 0.0055, + "reward": 1.7213541865348816, + "reward_std": 0.1301564909517765, + "rewards/accuracy_reward": 0.7291666567325592, + "rewards/format_reward": 0.9921875, + "step": 1032 + }, + { + "completion_length": 76.53125, + "epoch": 4.71689497716895, + "grad_norm": 3.0798656940460205, + "kl": 0.126953125, + "learning_rate": 5.283105022831051e-07, + "loss": 0.0051, + "reward": 1.7125211358070374, + "reward_std": 0.09494294971227646, + "rewards/accuracy_reward": 0.7125210464000702, + "rewards/format_reward": 1.0, + "step": 1033 + }, + { + "completion_length": 68.859375, + "epoch": 4.7214611872146115, + "grad_norm": 5.309957981109619, + "kl": 0.154296875, + "learning_rate": 5.278538812785388e-07, + "loss": 0.0062, + "reward": 1.609375, + "reward_std": 0.20264848321676254, + "rewards/accuracy_reward": 0.6093749701976776, + "rewards/format_reward": 1.0, + "step": 1034 + }, + { + "completion_length": 77.921875, + "epoch": 4.726027397260274, + "grad_norm": 4.624930381774902, + "kl": 0.13427734375, + "learning_rate": 5.273972602739725e-07, + "loss": 0.0054, + "reward": 1.7170308232307434, + "reward_std": 0.19927022606134415, + "rewards/accuracy_reward": 0.7248433232307434, + "rewards/format_reward": 0.9921875, + "step": 1035 + }, + { + "completion_length": 69.5234375, + "epoch": 4.730593607305936, + "grad_norm": 3.3374686241149902, + "kl": 0.116943359375, + "learning_rate": 5.269406392694064e-07, + "loss": 0.0047, + "reward": 1.6639309525489807, + "reward_std": 0.1389038860797882, + "rewards/accuracy_reward": 0.6717434823513031, + "rewards/format_reward": 0.9921875, + "step": 1036 + }, + { + "completion_length": 65.140625, + "epoch": 4.735159817351598, + "grad_norm": 3.858757495880127, + "kl": 0.1416015625, + "learning_rate": 5.264840182648401e-07, + "loss": 0.0057, + "reward": 1.6567708253860474, + "reward_std": 0.16906771063804626, + "rewards/accuracy_reward": 0.6723958551883698, + "rewards/format_reward": 0.984375, + "step": 1037 + }, + { + "completion_length": 70.0859375, + "epoch": 4.739726027397261, + "grad_norm": 3.435810089111328, + "kl": 0.156005859375, + "learning_rate": 5.260273972602739e-07, + "loss": 0.0063, + "reward": 1.6902902126312256, + "reward_std": 0.19441108405590057, + "rewards/accuracy_reward": 0.698102593421936, + "rewards/format_reward": 0.9921875, + "step": 1038 + }, + { + "completion_length": 60.0859375, + "epoch": 4.744292237442922, + "grad_norm": 4.343775272369385, + "kl": 0.20263671875, + "learning_rate": 5.255707762557078e-07, + "loss": 0.0081, + "reward": 1.8015338778495789, + "reward_std": 0.20425771176815033, + "rewards/accuracy_reward": 0.8015338182449341, + "rewards/format_reward": 1.0, + "step": 1039 + }, + { + "completion_length": 67.5, + "epoch": 4.748858447488584, + "grad_norm": 2.354308605194092, + "kl": 0.1474609375, + "learning_rate": 5.251141552511415e-07, + "loss": 0.0059, + "reward": 1.8274368047714233, + "reward_std": 0.13800114393234253, + "rewards/accuracy_reward": 0.8352491855621338, + "rewards/format_reward": 0.9921875, + "step": 1040 + }, + { + "completion_length": 65.109375, + "epoch": 4.7534246575342465, + "grad_norm": 4.133329391479492, + "kl": 0.119140625, + "learning_rate": 5.246575342465754e-07, + "loss": 0.0048, + "reward": 1.7390583753585815, + "reward_std": 0.1775147169828415, + "rewards/accuracy_reward": 0.7390583753585815, + "rewards/format_reward": 1.0, + "step": 1041 + }, + { + "completion_length": 96.8125, + "epoch": 4.757990867579909, + "grad_norm": 2.921767473220825, + "kl": 0.090576171875, + "learning_rate": 5.242009132420091e-07, + "loss": 0.0036, + "reward": 1.8279520869255066, + "reward_std": 0.11337075009942055, + "rewards/accuracy_reward": 0.843576967716217, + "rewards/format_reward": 0.984375, + "step": 1042 + }, + { + "completion_length": 76.25, + "epoch": 4.762557077625571, + "grad_norm": 1.6646754741668701, + "kl": 0.0908203125, + "learning_rate": 5.237442922374428e-07, + "loss": 0.0036, + "reward": 1.761111080646515, + "reward_std": 0.08823190443217754, + "rewards/accuracy_reward": 0.7611111104488373, + "rewards/format_reward": 1.0, + "step": 1043 + }, + { + "completion_length": 68.90625, + "epoch": 4.767123287671232, + "grad_norm": 6.794203758239746, + "kl": 0.12890625, + "learning_rate": 5.232876712328767e-07, + "loss": 0.0052, + "reward": 1.6395359635353088, + "reward_std": 0.1700442135334015, + "rewards/accuracy_reward": 0.6473484635353088, + "rewards/format_reward": 0.9921875, + "step": 1044 + }, + { + "completion_length": 64.5625, + "epoch": 4.771689497716895, + "grad_norm": 1.9526876211166382, + "kl": 0.160888671875, + "learning_rate": 5.228310502283105e-07, + "loss": 0.0064, + "reward": 1.818750023841858, + "reward_std": 0.19044626876711845, + "rewards/accuracy_reward": 0.8265624642372131, + "rewards/format_reward": 0.9921875, + "step": 1045 + }, + { + "completion_length": 71.8359375, + "epoch": 4.776255707762557, + "grad_norm": 5.581969261169434, + "kl": 0.121337890625, + "learning_rate": 5.223744292237443e-07, + "loss": 0.0049, + "reward": 1.7421875596046448, + "reward_std": 0.1561211347579956, + "rewards/accuracy_reward": 0.7421874701976776, + "rewards/format_reward": 1.0, + "step": 1046 + }, + { + "completion_length": 65.2109375, + "epoch": 4.780821917808219, + "grad_norm": 3.522632360458374, + "kl": 0.139892578125, + "learning_rate": 5.219178082191781e-07, + "loss": 0.0056, + "reward": 1.8170573115348816, + "reward_std": 0.1570434384047985, + "rewards/accuracy_reward": 0.840494692325592, + "rewards/format_reward": 0.9765625, + "step": 1047 + }, + { + "completion_length": 82.296875, + "epoch": 4.7853881278538815, + "grad_norm": 8.847128868103027, + "kl": 0.110107421875, + "learning_rate": 5.214611872146118e-07, + "loss": 0.0044, + "reward": 1.8859375715255737, + "reward_std": 0.0849014800041914, + "rewards/accuracy_reward": 0.8859374523162842, + "rewards/format_reward": 1.0, + "step": 1048 + }, + { + "completion_length": 73.2578125, + "epoch": 4.789954337899544, + "grad_norm": 9.131427764892578, + "kl": 0.13525390625, + "learning_rate": 5.210045662100457e-07, + "loss": 0.0054, + "reward": 1.6504226922988892, + "reward_std": 0.18509591370821, + "rewards/accuracy_reward": 0.6582351326942444, + "rewards/format_reward": 0.9921875, + "step": 1049 + }, + { + "completion_length": 79.2265625, + "epoch": 4.794520547945205, + "grad_norm": 3.1302144527435303, + "kl": 0.123779296875, + "learning_rate": 5.205479452054794e-07, + "loss": 0.005, + "reward": 1.6324219703674316, + "reward_std": 0.22072409093379974, + "rewards/accuracy_reward": 0.6480468809604645, + "rewards/format_reward": 0.984375, + "step": 1050 + }, + { + "completion_length": 68.59375, + "epoch": 4.799086757990867, + "grad_norm": 1.5062321424484253, + "kl": 0.13525390625, + "learning_rate": 5.200913242009131e-07, + "loss": 0.0054, + "reward": 1.7230769991874695, + "reward_std": 0.10901044122874737, + "rewards/accuracy_reward": 0.7387019097805023, + "rewards/format_reward": 0.984375, + "step": 1051 + }, + { + "completion_length": 57.203125, + "epoch": 4.80365296803653, + "grad_norm": 2.9574577808380127, + "kl": 0.18505859375, + "learning_rate": 5.196347031963471e-07, + "loss": 0.0074, + "reward": 1.6598585844039917, + "reward_std": 0.14134880900382996, + "rewards/accuracy_reward": 0.6676710844039917, + "rewards/format_reward": 0.9921875, + "step": 1052 + }, + { + "completion_length": 76.7734375, + "epoch": 4.808219178082192, + "grad_norm": 2.4928853511810303, + "kl": 0.1630859375, + "learning_rate": 5.191780821917808e-07, + "loss": 0.0065, + "reward": 1.7294270992279053, + "reward_std": 0.2240969017148018, + "rewards/accuracy_reward": 0.7528644800186157, + "rewards/format_reward": 0.9765625, + "step": 1053 + }, + { + "completion_length": 84.828125, + "epoch": 4.812785388127854, + "grad_norm": 4.220333576202393, + "kl": 0.114013671875, + "learning_rate": 5.187214611872146e-07, + "loss": 0.0046, + "reward": 1.5883237719535828, + "reward_std": 0.18970628082752228, + "rewards/accuracy_reward": 0.5961362421512604, + "rewards/format_reward": 0.9921875, + "step": 1054 + }, + { + "completion_length": 80.0546875, + "epoch": 4.817351598173516, + "grad_norm": 2.9225761890411377, + "kl": 0.084716796875, + "learning_rate": 5.182648401826484e-07, + "loss": 0.0034, + "reward": 1.8234375715255737, + "reward_std": 0.1099486481398344, + "rewards/accuracy_reward": 0.8234374225139618, + "rewards/format_reward": 1.0, + "step": 1055 + }, + { + "completion_length": 68.84375, + "epoch": 4.821917808219178, + "grad_norm": 15.956554412841797, + "kl": 0.126953125, + "learning_rate": 5.178082191780821e-07, + "loss": 0.0051, + "reward": 1.707698404788971, + "reward_std": 0.12136031687259674, + "rewards/accuracy_reward": 0.7155108153820038, + "rewards/format_reward": 0.9921875, + "step": 1056 + }, + { + "completion_length": 61.59375, + "epoch": 4.82648401826484, + "grad_norm": 3.7795159816741943, + "kl": 0.19775390625, + "learning_rate": 5.17351598173516e-07, + "loss": 0.0079, + "reward": 1.7173488140106201, + "reward_std": 0.15133387595415115, + "rewards/accuracy_reward": 0.7251611948013306, + "rewards/format_reward": 0.9921875, + "step": 1057 + }, + { + "completion_length": 85.015625, + "epoch": 4.831050228310502, + "grad_norm": 4.373653888702393, + "kl": 0.104736328125, + "learning_rate": 5.168949771689497e-07, + "loss": 0.0042, + "reward": 1.8429688215255737, + "reward_std": 0.08631845097988844, + "rewards/accuracy_reward": 0.8507812023162842, + "rewards/format_reward": 0.9921875, + "step": 1058 + }, + { + "completion_length": 80.9296875, + "epoch": 4.835616438356165, + "grad_norm": 2.2139017581939697, + "kl": 0.091552734375, + "learning_rate": 5.164383561643836e-07, + "loss": 0.0037, + "reward": 1.7554687857627869, + "reward_std": 0.12153397500514984, + "rewards/accuracy_reward": 0.7632811963558197, + "rewards/format_reward": 0.9921875, + "step": 1059 + }, + { + "completion_length": 69.9296875, + "epoch": 4.840182648401827, + "grad_norm": 2.391106128692627, + "kl": 0.113525390625, + "learning_rate": 5.159817351598174e-07, + "loss": 0.0045, + "reward": 1.6293052434921265, + "reward_std": 0.18556885421276093, + "rewards/accuracy_reward": 0.6371176838874817, + "rewards/format_reward": 0.9921875, + "step": 1060 + }, + { + "completion_length": 75.5078125, + "epoch": 4.844748858447488, + "grad_norm": 3.7311413288116455, + "kl": 0.1337890625, + "learning_rate": 5.155251141552511e-07, + "loss": 0.0053, + "reward": 1.8738667964935303, + "reward_std": 0.09744009375572205, + "rewards/accuracy_reward": 0.8738666772842407, + "rewards/format_reward": 1.0, + "step": 1061 + }, + { + "completion_length": 88.328125, + "epoch": 4.8493150684931505, + "grad_norm": 2.399247884750366, + "kl": 0.146484375, + "learning_rate": 5.150684931506849e-07, + "loss": 0.0058, + "reward": 1.706250011920929, + "reward_std": 0.21963772177696228, + "rewards/accuracy_reward": 0.7140624821186066, + "rewards/format_reward": 0.9921875, + "step": 1062 + }, + { + "completion_length": 73.6171875, + "epoch": 4.853881278538813, + "grad_norm": 2.581521511077881, + "kl": 0.1435546875, + "learning_rate": 5.146118721461187e-07, + "loss": 0.0058, + "reward": 1.7601500153541565, + "reward_std": 0.14986789226531982, + "rewards/accuracy_reward": 0.7679624557495117, + "rewards/format_reward": 0.9921875, + "step": 1063 + }, + { + "completion_length": 72.1484375, + "epoch": 4.858447488584475, + "grad_norm": 8.218365669250488, + "kl": 0.1533203125, + "learning_rate": 5.141552511415524e-07, + "loss": 0.0061, + "reward": 1.7895833253860474, + "reward_std": 0.18269683420658112, + "rewards/accuracy_reward": 0.8052083253860474, + "rewards/format_reward": 0.984375, + "step": 1064 + }, + { + "completion_length": 75.6875, + "epoch": 4.863013698630137, + "grad_norm": 6.358737468719482, + "kl": 0.130126953125, + "learning_rate": 5.136986301369864e-07, + "loss": 0.0052, + "reward": 1.8111705780029297, + "reward_std": 0.11504914239048958, + "rewards/accuracy_reward": 0.8111704587936401, + "rewards/format_reward": 1.0, + "step": 1065 + }, + { + "completion_length": 72.0859375, + "epoch": 4.867579908675799, + "grad_norm": 1.565542221069336, + "kl": 0.123779296875, + "learning_rate": 5.132420091324201e-07, + "loss": 0.005, + "reward": 1.7661458849906921, + "reward_std": 0.13723178207874298, + "rewards/accuracy_reward": 0.773958295583725, + "rewards/format_reward": 0.9921875, + "step": 1066 + }, + { + "completion_length": 81.34375, + "epoch": 4.872146118721461, + "grad_norm": 1.8809258937835693, + "kl": 0.123291015625, + "learning_rate": 5.127853881278538e-07, + "loss": 0.0049, + "reward": 1.8238808512687683, + "reward_std": 0.09778433851897717, + "rewards/accuracy_reward": 0.8238807916641235, + "rewards/format_reward": 1.0, + "step": 1067 + }, + { + "completion_length": 80.703125, + "epoch": 4.876712328767123, + "grad_norm": 1.7312757968902588, + "kl": 0.0810546875, + "learning_rate": 5.123287671232877e-07, + "loss": 0.0032, + "reward": 1.624392330646515, + "reward_std": 0.09445247054100037, + "rewards/accuracy_reward": 0.6243923306465149, + "rewards/format_reward": 1.0, + "step": 1068 + }, + { + "completion_length": 73.265625, + "epoch": 4.8812785388127855, + "grad_norm": 2.2705366611480713, + "kl": 0.134033203125, + "learning_rate": 5.118721461187214e-07, + "loss": 0.0054, + "reward": 1.6665404438972473, + "reward_std": 0.17886501550674438, + "rewards/accuracy_reward": 0.6821653842926025, + "rewards/format_reward": 0.984375, + "step": 1069 + }, + { + "completion_length": 74.671875, + "epoch": 4.885844748858448, + "grad_norm": 2.8938543796539307, + "kl": 0.12646484375, + "learning_rate": 5.114155251141552e-07, + "loss": 0.0051, + "reward": 1.8554381728172302, + "reward_std": 0.18266908079385757, + "rewards/accuracy_reward": 0.8866880834102631, + "rewards/format_reward": 0.96875, + "step": 1070 + }, + { + "completion_length": 84.8203125, + "epoch": 4.890410958904109, + "grad_norm": 6.559085845947266, + "kl": 0.0888671875, + "learning_rate": 5.10958904109589e-07, + "loss": 0.0036, + "reward": 1.6242188215255737, + "reward_std": 0.14865797758102417, + "rewards/accuracy_reward": 0.6320312321186066, + "rewards/format_reward": 0.9921875, + "step": 1071 + }, + { + "completion_length": 72.203125, + "epoch": 4.894977168949771, + "grad_norm": 2.1274454593658447, + "kl": 0.126708984375, + "learning_rate": 5.105022831050228e-07, + "loss": 0.0051, + "reward": 1.7550346851348877, + "reward_std": 0.15320852398872375, + "rewards/accuracy_reward": 0.7628472149372101, + "rewards/format_reward": 0.9921875, + "step": 1072 + }, + { + "completion_length": 73.6484375, + "epoch": 4.899543378995434, + "grad_norm": 2.215421676635742, + "kl": 0.134765625, + "learning_rate": 5.100456621004567e-07, + "loss": 0.0054, + "reward": 1.7328497171401978, + "reward_std": 0.17783771082758904, + "rewards/accuracy_reward": 0.756287157535553, + "rewards/format_reward": 0.9765625, + "step": 1073 + }, + { + "completion_length": 82.7109375, + "epoch": 4.904109589041096, + "grad_norm": 7.915944576263428, + "kl": 0.1328125, + "learning_rate": 5.095890410958904e-07, + "loss": 0.0053, + "reward": 1.7984375953674316, + "reward_std": 0.14389308542013168, + "rewards/accuracy_reward": 0.8062499761581421, + "rewards/format_reward": 0.9921875, + "step": 1074 + }, + { + "completion_length": 56.046875, + "epoch": 4.908675799086758, + "grad_norm": 6.48905086517334, + "kl": 0.142822265625, + "learning_rate": 5.091324200913241e-07, + "loss": 0.0057, + "reward": 1.7908854484558105, + "reward_std": 0.16380437463521957, + "rewards/accuracy_reward": 0.7986978888511658, + "rewards/format_reward": 0.9921875, + "step": 1075 + }, + { + "completion_length": 76.328125, + "epoch": 4.91324200913242, + "grad_norm": 15.81816577911377, + "kl": 0.103759765625, + "learning_rate": 5.08675799086758e-07, + "loss": 0.0042, + "reward": 1.774218738079071, + "reward_std": 0.14345712214708328, + "rewards/accuracy_reward": 0.782031238079071, + "rewards/format_reward": 0.9921875, + "step": 1076 + }, + { + "completion_length": 65.71875, + "epoch": 4.917808219178082, + "grad_norm": 2.252168655395508, + "kl": 0.138916015625, + "learning_rate": 5.082191780821917e-07, + "loss": 0.0056, + "reward": 1.7870659828186035, + "reward_std": 0.14891928434371948, + "rewards/accuracy_reward": 0.7870660126209259, + "rewards/format_reward": 1.0, + "step": 1077 + }, + { + "completion_length": 84.4921875, + "epoch": 4.922374429223744, + "grad_norm": 6.789572715759277, + "kl": 0.1016845703125, + "learning_rate": 5.077625570776255e-07, + "loss": 0.0041, + "reward": 1.780468761920929, + "reward_std": 0.12099719420075417, + "rewards/accuracy_reward": 0.7882812023162842, + "rewards/format_reward": 0.9921875, + "step": 1078 + }, + { + "completion_length": 93.4765625, + "epoch": 4.926940639269406, + "grad_norm": 1.4697335958480835, + "kl": 0.0927734375, + "learning_rate": 5.073059360730594e-07, + "loss": 0.0037, + "reward": 1.7980054020881653, + "reward_std": 0.06987884640693665, + "rewards/accuracy_reward": 0.7980053424835205, + "rewards/format_reward": 1.0, + "step": 1079 + }, + { + "completion_length": 67.8125, + "epoch": 4.931506849315069, + "grad_norm": 1.78167724609375, + "kl": 0.12353515625, + "learning_rate": 5.068493150684931e-07, + "loss": 0.005, + "reward": 1.8179687857627869, + "reward_std": 0.09306978806853294, + "rewards/accuracy_reward": 0.8257812857627869, + "rewards/format_reward": 0.9921875, + "step": 1080 + }, + { + "completion_length": 71.4453125, + "epoch": 4.936073059360731, + "grad_norm": 2.6702702045440674, + "kl": 0.146484375, + "learning_rate": 5.06392694063927e-07, + "loss": 0.0059, + "reward": 1.8533853888511658, + "reward_std": 0.27950893342494965, + "rewards/accuracy_reward": 0.884635329246521, + "rewards/format_reward": 0.96875, + "step": 1081 + }, + { + "completion_length": 98.71875, + "epoch": 4.940639269406392, + "grad_norm": 2.6240620613098145, + "kl": 0.06982421875, + "learning_rate": 5.059360730593607e-07, + "loss": 0.0028, + "reward": 1.7433823347091675, + "reward_std": 0.1912398487329483, + "rewards/accuracy_reward": 0.7746323049068451, + "rewards/format_reward": 0.96875, + "step": 1082 + }, + { + "completion_length": 66.03125, + "epoch": 4.945205479452055, + "grad_norm": 5.760696887969971, + "kl": 0.15234375, + "learning_rate": 5.054794520547944e-07, + "loss": 0.0061, + "reward": 1.7298898100852966, + "reward_std": 0.21766673773527145, + "rewards/accuracy_reward": 0.7377023696899414, + "rewards/format_reward": 0.9921875, + "step": 1083 + }, + { + "completion_length": 68.59375, + "epoch": 4.949771689497717, + "grad_norm": 5.8692216873168945, + "kl": 0.16357421875, + "learning_rate": 5.050228310502283e-07, + "loss": 0.0065, + "reward": 1.7146621942520142, + "reward_std": 0.10843057557940483, + "rewards/accuracy_reward": 0.7146620750427246, + "rewards/format_reward": 1.0, + "step": 1084 + }, + { + "completion_length": 92.359375, + "epoch": 4.954337899543379, + "grad_norm": 3.167354106903076, + "kl": 0.1181640625, + "learning_rate": 5.045662100456621e-07, + "loss": 0.0047, + "reward": 1.6780134439468384, + "reward_std": 0.2429228127002716, + "rewards/accuracy_reward": 0.6936383247375488, + "rewards/format_reward": 0.984375, + "step": 1085 + }, + { + "completion_length": 79.984375, + "epoch": 4.958904109589041, + "grad_norm": 2.3840832710266113, + "kl": 0.113037109375, + "learning_rate": 5.041095890410959e-07, + "loss": 0.0045, + "reward": 1.7828125357627869, + "reward_std": 0.19722937047481537, + "rewards/accuracy_reward": 0.8140624761581421, + "rewards/format_reward": 0.96875, + "step": 1086 + }, + { + "completion_length": 60.84375, + "epoch": 4.963470319634704, + "grad_norm": 2.938784599304199, + "kl": 0.140380859375, + "learning_rate": 5.036529680365297e-07, + "loss": 0.0056, + "reward": 1.6975947618484497, + "reward_std": 0.22429338097572327, + "rewards/accuracy_reward": 0.7210322618484497, + "rewards/format_reward": 0.9765625, + "step": 1087 + }, + { + "completion_length": 73.8125, + "epoch": 4.968036529680365, + "grad_norm": 3.95701265335083, + "kl": 0.11474609375, + "learning_rate": 5.031963470319634e-07, + "loss": 0.0046, + "reward": 1.6440104246139526, + "reward_std": 0.1992247775197029, + "rewards/accuracy_reward": 0.651822954416275, + "rewards/format_reward": 0.9921875, + "step": 1088 + }, + { + "completion_length": 71.96875, + "epoch": 4.972602739726027, + "grad_norm": 7.810092926025391, + "kl": 0.1279296875, + "learning_rate": 5.027397260273973e-07, + "loss": 0.0051, + "reward": 1.7612414360046387, + "reward_std": 0.17399008572101593, + "rewards/accuracy_reward": 0.7690538763999939, + "rewards/format_reward": 0.9921875, + "step": 1089 + }, + { + "completion_length": 95.4765625, + "epoch": 4.9771689497716896, + "grad_norm": 2.2332069873809814, + "kl": 0.106201171875, + "learning_rate": 5.02283105022831e-07, + "loss": 0.0042, + "reward": 1.6498697996139526, + "reward_std": 0.23457543551921844, + "rewards/accuracy_reward": 0.6811197698116302, + "rewards/format_reward": 0.96875, + "step": 1090 + }, + { + "completion_length": 67.2421875, + "epoch": 4.981735159817352, + "grad_norm": 2.379009485244751, + "kl": 0.138916015625, + "learning_rate": 5.018264840182647e-07, + "loss": 0.0056, + "reward": 1.758593738079071, + "reward_std": 0.15046585351228714, + "rewards/accuracy_reward": 0.766406238079071, + "rewards/format_reward": 0.9921875, + "step": 1091 + }, + { + "completion_length": 80.3515625, + "epoch": 4.986301369863014, + "grad_norm": 1.9919111728668213, + "kl": 0.100830078125, + "learning_rate": 5.013698630136987e-07, + "loss": 0.004, + "reward": 1.7437500357627869, + "reward_std": 0.16439745388925076, + "rewards/accuracy_reward": 0.7671874463558197, + "rewards/format_reward": 0.9765625, + "step": 1092 + }, + { + "completion_length": 87.265625, + "epoch": 4.9908675799086755, + "grad_norm": 1.678598403930664, + "kl": 0.108642578125, + "learning_rate": 5.009132420091324e-07, + "loss": 0.0043, + "reward": 1.6603811383247375, + "reward_std": 0.1863521747291088, + "rewards/accuracy_reward": 0.6838186085224152, + "rewards/format_reward": 0.9765625, + "step": 1093 + }, + { + "completion_length": 86.515625, + "epoch": 4.995433789954338, + "grad_norm": 1.237178921699524, + "kl": 0.105224609375, + "learning_rate": 5.004566210045662e-07, + "loss": 0.0042, + "reward": 1.8344122171401978, + "reward_std": 0.09498458355665207, + "rewards/accuracy_reward": 0.8422246873378754, + "rewards/format_reward": 0.9921875, + "step": 1094 + }, + { + "completion_length": 49.625, + "epoch": 5.0, + "grad_norm": 3.034471273422241, + "kl": 0.10205078125, + "learning_rate": 5e-07, + "loss": 0.0039, + "reward": 1.1875, + "reward_std": 0.36278264224529266, + "rewards/accuracy_reward": 0.1875, + "rewards/format_reward": 1.0, + "step": 1095 + }, + { + "completion_length": 73.484375, + "epoch": 5.004566210045662, + "grad_norm": 4.169987678527832, + "kl": 0.15771484375, + "learning_rate": 4.995433789954337e-07, + "loss": 0.0063, + "reward": 1.7950236201286316, + "reward_std": 0.1954728439450264, + "rewards/accuracy_reward": 0.818461149930954, + "rewards/format_reward": 0.9765625, + "step": 1096 + }, + { + "completion_length": 53.1484375, + "epoch": 5.0091324200913245, + "grad_norm": 2.2528958320617676, + "kl": 0.18115234375, + "learning_rate": 4.990867579908676e-07, + "loss": 0.0073, + "reward": 1.6565104126930237, + "reward_std": 0.2434339076280594, + "rewards/accuracy_reward": 0.6799479424953461, + "rewards/format_reward": 0.9765625, + "step": 1097 + }, + { + "completion_length": 77.0390625, + "epoch": 5.013698630136986, + "grad_norm": 2.312629461288452, + "kl": 0.150634765625, + "learning_rate": 4.986301369863014e-07, + "loss": 0.006, + "reward": 1.6930060386657715, + "reward_std": 0.15761591121554375, + "rewards/accuracy_reward": 0.7008183896541595, + "rewards/format_reward": 0.9921875, + "step": 1098 + }, + { + "completion_length": 68.0625, + "epoch": 5.018264840182648, + "grad_norm": 4.447121620178223, + "kl": 0.180908203125, + "learning_rate": 4.981735159817351e-07, + "loss": 0.0072, + "reward": 1.689843773841858, + "reward_std": 0.19052022695541382, + "rewards/accuracy_reward": 0.7054686844348907, + "rewards/format_reward": 0.984375, + "step": 1099 + }, + { + "completion_length": 63.734375, + "epoch": 5.0228310502283104, + "grad_norm": 5.222453594207764, + "kl": 0.1005859375, + "learning_rate": 4.977168949771689e-07, + "loss": 0.004, + "reward": 1.8026909828186035, + "reward_std": 0.11854784563183784, + "rewards/accuracy_reward": 0.8105034232139587, + "rewards/format_reward": 0.9921875, + "step": 1100 + }, + { + "completion_length": 97.75, + "epoch": 5.027397260273973, + "grad_norm": 2.315014362335205, + "kl": 0.0711669921875, + "learning_rate": 4.972602739726027e-07, + "loss": 0.0028, + "reward": 1.7503806352615356, + "reward_std": 0.11963648349046707, + "rewards/accuracy_reward": 0.7503806054592133, + "rewards/format_reward": 1.0, + "step": 1101 + }, + { + "completion_length": 69.828125, + "epoch": 5.031963470319635, + "grad_norm": 2.1413135528564453, + "kl": 0.13818359375, + "learning_rate": 4.968036529680365e-07, + "loss": 0.0055, + "reward": 1.8750000596046448, + "reward_std": 0.10205792635679245, + "rewards/accuracy_reward": 0.8749999105930328, + "rewards/format_reward": 1.0, + "step": 1102 + }, + { + "completion_length": 95.1328125, + "epoch": 5.036529680365296, + "grad_norm": 1.4922230243682861, + "kl": 0.1259765625, + "learning_rate": 4.963470319634703e-07, + "loss": 0.005, + "reward": 1.7955846786499023, + "reward_std": 0.07053190469741821, + "rewards/accuracy_reward": 0.79558464884758, + "rewards/format_reward": 1.0, + "step": 1103 + }, + { + "completion_length": 75.15625, + "epoch": 5.041095890410959, + "grad_norm": 4.347703456878662, + "kl": 0.125244140625, + "learning_rate": 4.958904109589041e-07, + "loss": 0.005, + "reward": 1.6125000715255737, + "reward_std": 0.16528953611850739, + "rewards/accuracy_reward": 0.612500011920929, + "rewards/format_reward": 1.0, + "step": 1104 + }, + { + "completion_length": 83.9453125, + "epoch": 5.045662100456621, + "grad_norm": 1.9074816703796387, + "kl": 0.109375, + "learning_rate": 4.954337899543379e-07, + "loss": 0.0044, + "reward": 1.810937523841858, + "reward_std": 0.16448542103171349, + "rewards/accuracy_reward": 0.8265624642372131, + "rewards/format_reward": 0.984375, + "step": 1105 + }, + { + "completion_length": 66.875, + "epoch": 5.050228310502283, + "grad_norm": 2.969620943069458, + "kl": 0.12841796875, + "learning_rate": 4.949771689497717e-07, + "loss": 0.0051, + "reward": 1.6987723112106323, + "reward_std": 0.19702571630477905, + "rewards/accuracy_reward": 0.6987722814083099, + "rewards/format_reward": 1.0, + "step": 1106 + }, + { + "completion_length": 97.59375, + "epoch": 5.054794520547945, + "grad_norm": 2.6807308197021484, + "kl": 0.086669921875, + "learning_rate": 4.945205479452055e-07, + "loss": 0.0035, + "reward": 1.841210961341858, + "reward_std": 0.1313929297029972, + "rewards/accuracy_reward": 0.8490233421325684, + "rewards/format_reward": 0.9921875, + "step": 1107 + }, + { + "completion_length": 67.1015625, + "epoch": 5.059360730593608, + "grad_norm": 1.8944909572601318, + "kl": 0.135986328125, + "learning_rate": 4.940639269406393e-07, + "loss": 0.0054, + "reward": 1.8903645873069763, + "reward_std": 0.09690769761800766, + "rewards/accuracy_reward": 0.8903645575046539, + "rewards/format_reward": 1.0, + "step": 1108 + }, + { + "completion_length": 77.6015625, + "epoch": 5.063926940639269, + "grad_norm": 3.560889959335327, + "kl": 0.17138671875, + "learning_rate": 4.93607305936073e-07, + "loss": 0.0069, + "reward": 1.6163216829299927, + "reward_std": 0.2479196935892105, + "rewards/accuracy_reward": 0.6319466531276703, + "rewards/format_reward": 0.984375, + "step": 1109 + }, + { + "completion_length": 64.453125, + "epoch": 5.068493150684931, + "grad_norm": 1.8171228170394897, + "kl": 0.158203125, + "learning_rate": 4.931506849315068e-07, + "loss": 0.0063, + "reward": 1.7542577981948853, + "reward_std": 0.1399587020277977, + "rewards/accuracy_reward": 0.7542578279972076, + "rewards/format_reward": 1.0, + "step": 1110 + }, + { + "completion_length": 86.5078125, + "epoch": 5.073059360730594, + "grad_norm": 2.2326784133911133, + "kl": 0.106689453125, + "learning_rate": 4.926940639269407e-07, + "loss": 0.0043, + "reward": 1.7170758247375488, + "reward_std": 0.146156445145607, + "rewards/accuracy_reward": 0.7248883247375488, + "rewards/format_reward": 0.9921875, + "step": 1111 + }, + { + "completion_length": 60.7109375, + "epoch": 5.077625570776256, + "grad_norm": 2.7463958263397217, + "kl": 0.16796875, + "learning_rate": 4.922374429223744e-07, + "loss": 0.0067, + "reward": 1.6381410360336304, + "reward_std": 0.25498080253601074, + "rewards/accuracy_reward": 0.6615785360336304, + "rewards/format_reward": 0.9765625, + "step": 1112 + }, + { + "completion_length": 72.3046875, + "epoch": 5.082191780821918, + "grad_norm": 1.4025739431381226, + "kl": 0.10888671875, + "learning_rate": 4.917808219178081e-07, + "loss": 0.0044, + "reward": 1.7656250596046448, + "reward_std": 0.07576144114136696, + "rewards/accuracy_reward": 0.7734374403953552, + "rewards/format_reward": 0.9921875, + "step": 1113 + }, + { + "completion_length": 53.21875, + "epoch": 5.0867579908675795, + "grad_norm": 3.9440958499908447, + "kl": 0.142822265625, + "learning_rate": 4.91324200913242e-07, + "loss": 0.0057, + "reward": 1.8531250357627869, + "reward_std": 0.14943039789795876, + "rewards/accuracy_reward": 0.8609374761581421, + "rewards/format_reward": 0.9921875, + "step": 1114 + }, + { + "completion_length": 71.0859375, + "epoch": 5.091324200913242, + "grad_norm": 2.5945069789886475, + "kl": 0.101318359375, + "learning_rate": 4.908675799086758e-07, + "loss": 0.0041, + "reward": 1.7131696939468384, + "reward_std": 0.09270836971700191, + "rewards/accuracy_reward": 0.7131696045398712, + "rewards/format_reward": 1.0, + "step": 1115 + }, + { + "completion_length": 65.1796875, + "epoch": 5.095890410958904, + "grad_norm": 1.469778060913086, + "kl": 0.121337890625, + "learning_rate": 4.904109589041096e-07, + "loss": 0.0048, + "reward": 1.7162946462631226, + "reward_std": 0.12621085345745087, + "rewards/accuracy_reward": 0.7241071164608002, + "rewards/format_reward": 0.9921875, + "step": 1116 + }, + { + "completion_length": 80.140625, + "epoch": 5.100456621004566, + "grad_norm": 2.706517457962036, + "kl": 0.099365234375, + "learning_rate": 4.899543378995434e-07, + "loss": 0.004, + "reward": 1.8068211078643799, + "reward_std": 0.12114018388092518, + "rewards/accuracy_reward": 0.8224460184574127, + "rewards/format_reward": 0.984375, + "step": 1117 + }, + { + "completion_length": 58.5078125, + "epoch": 5.105022831050229, + "grad_norm": 4.029208183288574, + "kl": 0.17138671875, + "learning_rate": 4.894977168949771e-07, + "loss": 0.0069, + "reward": 1.7942708134651184, + "reward_std": 0.09960613120347261, + "rewards/accuracy_reward": 0.7942708432674408, + "rewards/format_reward": 1.0, + "step": 1118 + }, + { + "completion_length": 67.359375, + "epoch": 5.109589041095891, + "grad_norm": 2.30452823638916, + "kl": 0.15185546875, + "learning_rate": 4.89041095890411e-07, + "loss": 0.0061, + "reward": 1.7510417103767395, + "reward_std": 0.18200254440307617, + "rewards/accuracy_reward": 0.7666666209697723, + "rewards/format_reward": 0.984375, + "step": 1119 + }, + { + "completion_length": 70.984375, + "epoch": 5.114155251141552, + "grad_norm": 1.813476324081421, + "kl": 0.1337890625, + "learning_rate": 4.885844748858447e-07, + "loss": 0.0054, + "reward": 1.7627604007720947, + "reward_std": 0.09784993343055248, + "rewards/accuracy_reward": 0.76276034116745, + "rewards/format_reward": 1.0, + "step": 1120 + }, + { + "completion_length": 67.1171875, + "epoch": 5.1187214611872145, + "grad_norm": 2.7639548778533936, + "kl": 0.158203125, + "learning_rate": 4.881278538812786e-07, + "loss": 0.0063, + "reward": 1.6960286498069763, + "reward_std": 0.1810276135802269, + "rewards/accuracy_reward": 0.6960286498069763, + "rewards/format_reward": 1.0, + "step": 1121 + }, + { + "completion_length": 64.8984375, + "epoch": 5.123287671232877, + "grad_norm": 1.0785552263259888, + "kl": 0.116943359375, + "learning_rate": 4.876712328767123e-07, + "loss": 0.0047, + "reward": 1.7366800904273987, + "reward_std": 0.12058132514357567, + "rewards/accuracy_reward": 0.7523050308227539, + "rewards/format_reward": 0.984375, + "step": 1122 + }, + { + "completion_length": 70.203125, + "epoch": 5.127853881278539, + "grad_norm": 1.4450359344482422, + "kl": 0.165771484375, + "learning_rate": 4.872146118721461e-07, + "loss": 0.0066, + "reward": 1.8294271230697632, + "reward_std": 0.1284763477742672, + "rewards/accuracy_reward": 0.8450520634651184, + "rewards/format_reward": 0.984375, + "step": 1123 + }, + { + "completion_length": 73.5390625, + "epoch": 5.132420091324201, + "grad_norm": 1.4733330011367798, + "kl": 0.126220703125, + "learning_rate": 4.867579908675799e-07, + "loss": 0.0051, + "reward": 1.8328125476837158, + "reward_std": 0.09086007624864578, + "rewards/accuracy_reward": 0.8328124284744263, + "rewards/format_reward": 1.0, + "step": 1124 + }, + { + "completion_length": 62.890625, + "epoch": 5.136986301369863, + "grad_norm": 5.508318901062012, + "kl": 0.16796875, + "learning_rate": 4.863013698630137e-07, + "loss": 0.0067, + "reward": 1.7619792222976685, + "reward_std": 0.18441661447286606, + "rewards/accuracy_reward": 0.7619791626930237, + "rewards/format_reward": 1.0, + "step": 1125 + }, + { + "completion_length": 68.4453125, + "epoch": 5.141552511415525, + "grad_norm": 2.6522018909454346, + "kl": 0.16748046875, + "learning_rate": 4.858447488584474e-07, + "loss": 0.0067, + "reward": 1.705004334449768, + "reward_std": 0.19281967729330063, + "rewards/accuracy_reward": 0.7206293344497681, + "rewards/format_reward": 0.984375, + "step": 1126 + }, + { + "completion_length": 69.0, + "epoch": 5.146118721461187, + "grad_norm": 2.304168224334717, + "kl": 0.143310546875, + "learning_rate": 4.853881278538813e-07, + "loss": 0.0057, + "reward": 1.7598021030426025, + "reward_std": 0.18482757359743118, + "rewards/accuracy_reward": 0.7832395136356354, + "rewards/format_reward": 0.9765625, + "step": 1127 + }, + { + "completion_length": 92.8203125, + "epoch": 5.1506849315068495, + "grad_norm": 2.8171322345733643, + "kl": 0.082763671875, + "learning_rate": 4.84931506849315e-07, + "loss": 0.0033, + "reward": 1.7391226291656494, + "reward_std": 0.10928737744688988, + "rewards/accuracy_reward": 0.7469350695610046, + "rewards/format_reward": 0.9921875, + "step": 1128 + }, + { + "completion_length": 66.7578125, + "epoch": 5.155251141552512, + "grad_norm": 3.0008230209350586, + "kl": 0.22119140625, + "learning_rate": 4.844748858447489e-07, + "loss": 0.0088, + "reward": 1.7692708373069763, + "reward_std": 0.20399662107229233, + "rewards/accuracy_reward": 0.7927083671092987, + "rewards/format_reward": 0.9765625, + "step": 1129 + }, + { + "completion_length": 71.7734375, + "epoch": 5.159817351598173, + "grad_norm": 3.2780561447143555, + "kl": 0.15673828125, + "learning_rate": 4.840182648401826e-07, + "loss": 0.0063, + "reward": 1.8493314981460571, + "reward_std": 0.12171986699104309, + "rewards/accuracy_reward": 0.8493313789367676, + "rewards/format_reward": 1.0, + "step": 1130 + }, + { + "completion_length": 72.875, + "epoch": 5.164383561643835, + "grad_norm": 2.125765800476074, + "kl": 0.1103515625, + "learning_rate": 4.835616438356164e-07, + "loss": 0.0044, + "reward": 1.8101563453674316, + "reward_std": 0.09943688660860062, + "rewards/accuracy_reward": 0.8179686963558197, + "rewards/format_reward": 0.9921875, + "step": 1131 + }, + { + "completion_length": 70.375, + "epoch": 5.168949771689498, + "grad_norm": 8.532958984375, + "kl": 0.138671875, + "learning_rate": 4.831050228310502e-07, + "loss": 0.0055, + "reward": 1.7585145235061646, + "reward_std": 0.1326666846871376, + "rewards/accuracy_reward": 0.7663269340991974, + "rewards/format_reward": 0.9921875, + "step": 1132 + }, + { + "completion_length": 86.296875, + "epoch": 5.17351598173516, + "grad_norm": 1.4057005643844604, + "kl": 0.105224609375, + "learning_rate": 4.82648401826484e-07, + "loss": 0.0042, + "reward": 1.8115254044532776, + "reward_std": 0.059396788477897644, + "rewards/accuracy_reward": 0.8115253746509552, + "rewards/format_reward": 1.0, + "step": 1133 + }, + { + "completion_length": 50.203125, + "epoch": 5.178082191780822, + "grad_norm": 4.812795162200928, + "kl": 0.13623046875, + "learning_rate": 4.821917808219178e-07, + "loss": 0.0055, + "reward": 1.5721354484558105, + "reward_std": 0.19010350108146667, + "rewards/accuracy_reward": 0.5721354186534882, + "rewards/format_reward": 1.0, + "step": 1134 + }, + { + "completion_length": 62.796875, + "epoch": 5.182648401826484, + "grad_norm": 5.795816421508789, + "kl": 0.14892578125, + "learning_rate": 4.817351598173516e-07, + "loss": 0.006, + "reward": 1.839672565460205, + "reward_std": 0.16954216361045837, + "rewards/accuracy_reward": 0.8631100654602051, + "rewards/format_reward": 0.9765625, + "step": 1135 + }, + { + "completion_length": 65.6015625, + "epoch": 5.187214611872146, + "grad_norm": 4.185856342315674, + "kl": 0.14453125, + "learning_rate": 4.812785388127853e-07, + "loss": 0.0058, + "reward": 1.727814257144928, + "reward_std": 0.19888605177402496, + "rewards/accuracy_reward": 0.7512516975402832, + "rewards/format_reward": 0.9765625, + "step": 1136 + }, + { + "completion_length": 74.3046875, + "epoch": 5.191780821917808, + "grad_norm": 1.525724172592163, + "kl": 0.113525390625, + "learning_rate": 4.808219178082192e-07, + "loss": 0.0045, + "reward": 1.7773438096046448, + "reward_std": 0.12639044970273972, + "rewards/accuracy_reward": 0.77734375, + "rewards/format_reward": 1.0, + "step": 1137 + }, + { + "completion_length": 54.1875, + "epoch": 5.19634703196347, + "grad_norm": 2.832690954208374, + "kl": 0.21484375, + "learning_rate": 4.80365296803653e-07, + "loss": 0.0086, + "reward": 1.666732907295227, + "reward_std": 0.19027332961559296, + "rewards/accuracy_reward": 0.6745454370975494, + "rewards/format_reward": 0.9921875, + "step": 1138 + }, + { + "completion_length": 58.1953125, + "epoch": 5.200913242009133, + "grad_norm": 2.4609923362731934, + "kl": 0.17724609375, + "learning_rate": 4.799086757990867e-07, + "loss": 0.0071, + "reward": 1.7091332077980042, + "reward_std": 0.24910558015108109, + "rewards/accuracy_reward": 0.7247581779956818, + "rewards/format_reward": 0.984375, + "step": 1139 + }, + { + "completion_length": 94.75, + "epoch": 5.205479452054795, + "grad_norm": 11.776135444641113, + "kl": 0.2222900390625, + "learning_rate": 4.794520547945205e-07, + "loss": 0.0089, + "reward": 1.803125023841858, + "reward_std": 0.09865947626531124, + "rewards/accuracy_reward": 0.8109373450279236, + "rewards/format_reward": 0.9921875, + "step": 1140 + }, + { + "completion_length": 75.5546875, + "epoch": 5.210045662100456, + "grad_norm": 1.514672875404358, + "kl": 0.13330078125, + "learning_rate": 4.789954337899543e-07, + "loss": 0.0053, + "reward": 1.8759238719940186, + "reward_std": 0.10365894064307213, + "rewards/accuracy_reward": 0.8915487825870514, + "rewards/format_reward": 0.984375, + "step": 1141 + }, + { + "completion_length": 65.953125, + "epoch": 5.2146118721461185, + "grad_norm": 2.5782454013824463, + "kl": 0.110595703125, + "learning_rate": 4.785388127853881e-07, + "loss": 0.0044, + "reward": 1.6188101172447205, + "reward_std": 0.24680721014738083, + "rewards/accuracy_reward": 0.6422475874423981, + "rewards/format_reward": 0.9765625, + "step": 1142 + }, + { + "completion_length": 54.1875, + "epoch": 5.219178082191781, + "grad_norm": 6.618932723999023, + "kl": 0.1689453125, + "learning_rate": 4.780821917808219e-07, + "loss": 0.0068, + "reward": 1.6172246932983398, + "reward_std": 0.23022934794425964, + "rewards/accuracy_reward": 0.6250371932983398, + "rewards/format_reward": 0.9921875, + "step": 1143 + }, + { + "completion_length": 75.8203125, + "epoch": 5.223744292237443, + "grad_norm": 3.0787389278411865, + "kl": 0.10595703125, + "learning_rate": 4.776255707762557e-07, + "loss": 0.0042, + "reward": 1.806249976158142, + "reward_std": 0.10205793380737305, + "rewards/accuracy_reward": 0.8062499165534973, + "rewards/format_reward": 1.0, + "step": 1144 + }, + { + "completion_length": 65.4453125, + "epoch": 5.228310502283105, + "grad_norm": 2.9733009338378906, + "kl": 0.1591796875, + "learning_rate": 4.771689497716894e-07, + "loss": 0.0064, + "reward": 1.7895833849906921, + "reward_std": 0.13740738481283188, + "rewards/accuracy_reward": 0.7973958253860474, + "rewards/format_reward": 0.9921875, + "step": 1145 + }, + { + "completion_length": 68.8828125, + "epoch": 5.232876712328767, + "grad_norm": 3.3826847076416016, + "kl": 0.1767578125, + "learning_rate": 4.7671232876712324e-07, + "loss": 0.0071, + "reward": 1.7227678894996643, + "reward_std": 0.2944817692041397, + "rewards/accuracy_reward": 0.7540178298950195, + "rewards/format_reward": 0.96875, + "step": 1146 + }, + { + "completion_length": 104.4140625, + "epoch": 5.237442922374429, + "grad_norm": 2.469144105911255, + "kl": 0.07373046875, + "learning_rate": 4.762557077625571e-07, + "loss": 0.0029, + "reward": 1.896093726158142, + "reward_std": 0.07282309047877789, + "rewards/accuracy_reward": 0.9039061367511749, + "rewards/format_reward": 0.9921875, + "step": 1147 + }, + { + "completion_length": 73.578125, + "epoch": 5.242009132420091, + "grad_norm": 2.969399929046631, + "kl": 0.139892578125, + "learning_rate": 4.7579908675799086e-07, + "loss": 0.0056, + "reward": 1.8597139716148376, + "reward_std": 0.13000112399458885, + "rewards/accuracy_reward": 0.8597138822078705, + "rewards/format_reward": 1.0, + "step": 1148 + }, + { + "completion_length": 77.1953125, + "epoch": 5.2465753424657535, + "grad_norm": 1.6237159967422485, + "kl": 0.115478515625, + "learning_rate": 4.7534246575342465e-07, + "loss": 0.0046, + "reward": 1.7437500953674316, + "reward_std": 0.1552036553621292, + "rewards/accuracy_reward": 0.7593749761581421, + "rewards/format_reward": 0.984375, + "step": 1149 + }, + { + "completion_length": 73.2578125, + "epoch": 5.251141552511416, + "grad_norm": 3.891481637954712, + "kl": 0.108154296875, + "learning_rate": 4.748858447488584e-07, + "loss": 0.0043, + "reward": 1.792336344718933, + "reward_std": 0.23970390856266022, + "rewards/accuracy_reward": 0.8157738149166107, + "rewards/format_reward": 0.9765625, + "step": 1150 + }, + { + "completion_length": 92.109375, + "epoch": 5.255707762557078, + "grad_norm": 2.1832501888275146, + "kl": 0.074951171875, + "learning_rate": 4.744292237442922e-07, + "loss": 0.003, + "reward": 1.7013021111488342, + "reward_std": 0.13120698183774948, + "rewards/accuracy_reward": 0.7091145515441895, + "rewards/format_reward": 0.9921875, + "step": 1151 + }, + { + "completion_length": 79.3359375, + "epoch": 5.260273972602739, + "grad_norm": 2.02133846282959, + "kl": 0.15185546875, + "learning_rate": 4.73972602739726e-07, + "loss": 0.0061, + "reward": 1.7348758578300476, + "reward_std": 0.2100473716855049, + "rewards/accuracy_reward": 0.7661258280277252, + "rewards/format_reward": 0.96875, + "step": 1152 + }, + { + "completion_length": 69.28125, + "epoch": 5.264840182648402, + "grad_norm": 1.7159175872802734, + "kl": 0.146484375, + "learning_rate": 4.735159817351598e-07, + "loss": 0.0058, + "reward": 1.6989583373069763, + "reward_std": 0.17543572187423706, + "rewards/accuracy_reward": 0.7223958075046539, + "rewards/format_reward": 0.9765625, + "step": 1153 + }, + { + "completion_length": 62.1328125, + "epoch": 5.269406392694064, + "grad_norm": 3.155850887298584, + "kl": 0.1455078125, + "learning_rate": 4.730593607305936e-07, + "loss": 0.0058, + "reward": 1.734188973903656, + "reward_std": 0.2661764770746231, + "rewards/accuracy_reward": 0.7576265037059784, + "rewards/format_reward": 0.9765625, + "step": 1154 + }, + { + "completion_length": 64.859375, + "epoch": 5.273972602739726, + "grad_norm": 2.906141757965088, + "kl": 0.18798828125, + "learning_rate": 4.726027397260274e-07, + "loss": 0.0075, + "reward": 1.7283979058265686, + "reward_std": 0.15935904532670975, + "rewards/accuracy_reward": 0.7362103760242462, + "rewards/format_reward": 0.9921875, + "step": 1155 + }, + { + "completion_length": 68.3984375, + "epoch": 5.2785388127853885, + "grad_norm": 3.6288256645202637, + "kl": 0.1611328125, + "learning_rate": 4.7214611872146116e-07, + "loss": 0.0065, + "reward": 1.6093750596046448, + "reward_std": 0.2415616661310196, + "rewards/accuracy_reward": 0.6328125298023224, + "rewards/format_reward": 0.9765625, + "step": 1156 + }, + { + "completion_length": 65.546875, + "epoch": 5.28310502283105, + "grad_norm": 5.221689701080322, + "kl": 0.21435546875, + "learning_rate": 4.71689497716895e-07, + "loss": 0.0086, + "reward": 1.7025855779647827, + "reward_std": 0.11607009917497635, + "rewards/accuracy_reward": 0.7025855779647827, + "rewards/format_reward": 1.0, + "step": 1157 + }, + { + "completion_length": 66.5546875, + "epoch": 5.287671232876712, + "grad_norm": 1.9122740030288696, + "kl": 0.1474609375, + "learning_rate": 4.7123287671232874e-07, + "loss": 0.0059, + "reward": 1.8352110981941223, + "reward_std": 0.097220653668046, + "rewards/accuracy_reward": 0.8430235981941223, + "rewards/format_reward": 0.9921875, + "step": 1158 + }, + { + "completion_length": 70.34375, + "epoch": 5.292237442922374, + "grad_norm": 10.378006935119629, + "kl": 0.1640625, + "learning_rate": 4.707762557077625e-07, + "loss": 0.0066, + "reward": 1.8343608975410461, + "reward_std": 0.10324066504836082, + "rewards/accuracy_reward": 0.8343608975410461, + "rewards/format_reward": 1.0, + "step": 1159 + }, + { + "completion_length": 67.8828125, + "epoch": 5.296803652968037, + "grad_norm": 6.8480119705200195, + "kl": 0.14599609375, + "learning_rate": 4.703196347031963e-07, + "loss": 0.0058, + "reward": 1.7857979536056519, + "reward_std": 0.13422124087810516, + "rewards/accuracy_reward": 0.7936104536056519, + "rewards/format_reward": 0.9921875, + "step": 1160 + }, + { + "completion_length": 100.640625, + "epoch": 5.301369863013699, + "grad_norm": 3.1650795936584473, + "kl": 0.0848388671875, + "learning_rate": 4.6986301369863015e-07, + "loss": 0.0034, + "reward": 1.803321123123169, + "reward_std": 0.17942239344120026, + "rewards/accuracy_reward": 0.8345710933208466, + "rewards/format_reward": 0.96875, + "step": 1161 + }, + { + "completion_length": 53.546875, + "epoch": 5.30593607305936, + "grad_norm": 4.188801288604736, + "kl": 0.20947265625, + "learning_rate": 4.694063926940639e-07, + "loss": 0.0084, + "reward": 1.7130786776542664, + "reward_std": 0.21442808210849762, + "rewards/accuracy_reward": 0.7287036776542664, + "rewards/format_reward": 0.984375, + "step": 1162 + }, + { + "completion_length": 80.5625, + "epoch": 5.310502283105023, + "grad_norm": 2.222104787826538, + "kl": 0.08837890625, + "learning_rate": 4.689497716894977e-07, + "loss": 0.0035, + "reward": 1.7203125953674316, + "reward_std": 0.16751586645841599, + "rewards/accuracy_reward": 0.7437499463558197, + "rewards/format_reward": 0.9765625, + "step": 1163 + }, + { + "completion_length": 54.359375, + "epoch": 5.315068493150685, + "grad_norm": 2.051166296005249, + "kl": 0.15380859375, + "learning_rate": 4.684931506849315e-07, + "loss": 0.0062, + "reward": 1.7621361017227173, + "reward_std": 0.11842145770788193, + "rewards/accuracy_reward": 0.7699486315250397, + "rewards/format_reward": 0.9921875, + "step": 1164 + }, + { + "completion_length": 64.0546875, + "epoch": 5.319634703196347, + "grad_norm": 3.0098822116851807, + "kl": 0.177734375, + "learning_rate": 4.680365296803653e-07, + "loss": 0.0071, + "reward": 1.6876825094223022, + "reward_std": 0.1372067779302597, + "rewards/accuracy_reward": 0.6876825392246246, + "rewards/format_reward": 1.0, + "step": 1165 + }, + { + "completion_length": 80.75, + "epoch": 5.324200913242009, + "grad_norm": 2.739140033721924, + "kl": 0.0966796875, + "learning_rate": 4.6757990867579904e-07, + "loss": 0.0039, + "reward": 1.740625023841858, + "reward_std": 0.14044751226902008, + "rewards/accuracy_reward": 0.7484374046325684, + "rewards/format_reward": 0.9921875, + "step": 1166 + }, + { + "completion_length": 67.6875, + "epoch": 5.328767123287671, + "grad_norm": 2.510779857635498, + "kl": 0.103271484375, + "learning_rate": 4.671232876712329e-07, + "loss": 0.0041, + "reward": 1.607812523841858, + "reward_std": 0.1767766922712326, + "rewards/accuracy_reward": 0.6078124940395355, + "rewards/format_reward": 1.0, + "step": 1167 + }, + { + "completion_length": 78.734375, + "epoch": 5.333333333333333, + "grad_norm": 2.7751314640045166, + "kl": 0.126708984375, + "learning_rate": 4.6666666666666666e-07, + "loss": 0.0051, + "reward": 1.853255271911621, + "reward_std": 0.19070542603731155, + "rewards/accuracy_reward": 0.8688801825046539, + "rewards/format_reward": 0.984375, + "step": 1168 + }, + { + "completion_length": 67.75, + "epoch": 5.337899543378995, + "grad_norm": 1.8172931671142578, + "kl": 0.146240234375, + "learning_rate": 4.6621004566210045e-07, + "loss": 0.0059, + "reward": 1.7744792103767395, + "reward_std": 0.09880542568862438, + "rewards/accuracy_reward": 0.78229159116745, + "rewards/format_reward": 0.9921875, + "step": 1169 + }, + { + "completion_length": 96.1640625, + "epoch": 5.342465753424658, + "grad_norm": 2.242799758911133, + "kl": 0.11083984375, + "learning_rate": 4.657534246575342e-07, + "loss": 0.0044, + "reward": 1.6858011484146118, + "reward_std": 0.16090433299541473, + "rewards/accuracy_reward": 0.7014259994029999, + "rewards/format_reward": 0.984375, + "step": 1170 + }, + { + "completion_length": 94.90625, + "epoch": 5.34703196347032, + "grad_norm": 1.8737952709197998, + "kl": 0.102783203125, + "learning_rate": 4.65296803652968e-07, + "loss": 0.0041, + "reward": 1.8359375, + "reward_std": 0.1631505787372589, + "rewards/accuracy_reward": 0.8515624403953552, + "rewards/format_reward": 0.984375, + "step": 1171 + }, + { + "completion_length": 108.46875, + "epoch": 5.351598173515982, + "grad_norm": 1.0832995176315308, + "kl": 0.1112060546875, + "learning_rate": 4.648401826484018e-07, + "loss": 0.0044, + "reward": 1.8958333730697632, + "reward_std": 0.05308555904775858, + "rewards/accuracy_reward": 0.8958332538604736, + "rewards/format_reward": 1.0, + "step": 1172 + }, + { + "completion_length": 89.1875, + "epoch": 5.3561643835616435, + "grad_norm": 2.971842050552368, + "kl": 0.1611328125, + "learning_rate": 4.643835616438356e-07, + "loss": 0.0065, + "reward": 1.6929687857627869, + "reward_std": 0.22123289108276367, + "rewards/accuracy_reward": 0.7164062261581421, + "rewards/format_reward": 0.9765625, + "step": 1173 + }, + { + "completion_length": 79.7109375, + "epoch": 5.360730593607306, + "grad_norm": 3.3845016956329346, + "kl": 0.107177734375, + "learning_rate": 4.639269406392694e-07, + "loss": 0.0043, + "reward": 1.7539063096046448, + "reward_std": 0.1683424860239029, + "rewards/accuracy_reward": 0.75390625, + "rewards/format_reward": 1.0, + "step": 1174 + }, + { + "completion_length": 88.7265625, + "epoch": 5.365296803652968, + "grad_norm": 1.8644884824752808, + "kl": 0.115478515625, + "learning_rate": 4.634703196347032e-07, + "loss": 0.0046, + "reward": 1.635702133178711, + "reward_std": 0.2014181986451149, + "rewards/accuracy_reward": 0.6591395735740662, + "rewards/format_reward": 0.9765625, + "step": 1175 + }, + { + "completion_length": 79.828125, + "epoch": 5.36986301369863, + "grad_norm": 11.902091026306152, + "kl": 0.126953125, + "learning_rate": 4.6301369863013696e-07, + "loss": 0.0051, + "reward": 1.7371652126312256, + "reward_std": 0.1940011829137802, + "rewards/accuracy_reward": 0.7527901232242584, + "rewards/format_reward": 0.984375, + "step": 1176 + }, + { + "completion_length": 66.2734375, + "epoch": 5.3744292237442925, + "grad_norm": 2.139374017715454, + "kl": 0.11083984375, + "learning_rate": 4.625570776255708e-07, + "loss": 0.0044, + "reward": 1.8981026411056519, + "reward_std": 0.12666313955560327, + "rewards/accuracy_reward": 0.9137276411056519, + "rewards/format_reward": 0.984375, + "step": 1177 + }, + { + "completion_length": 68.4921875, + "epoch": 5.378995433789954, + "grad_norm": 1.7856290340423584, + "kl": 0.12744140625, + "learning_rate": 4.6210045662100454e-07, + "loss": 0.0051, + "reward": 1.6984019875526428, + "reward_std": 0.18527702055871487, + "rewards/accuracy_reward": 0.7140269875526428, + "rewards/format_reward": 0.984375, + "step": 1178 + }, + { + "completion_length": 85.734375, + "epoch": 5.383561643835616, + "grad_norm": 3.6187846660614014, + "kl": 0.090576171875, + "learning_rate": 4.616438356164383e-07, + "loss": 0.0036, + "reward": 1.8394480347633362, + "reward_std": 0.08028085343539715, + "rewards/accuracy_reward": 0.8394480347633362, + "rewards/format_reward": 1.0, + "step": 1179 + }, + { + "completion_length": 73.796875, + "epoch": 5.3881278538812785, + "grad_norm": 2.5453333854675293, + "kl": 0.17138671875, + "learning_rate": 4.611872146118721e-07, + "loss": 0.0068, + "reward": 1.810937523841858, + "reward_std": 0.19586525857448578, + "rewards/accuracy_reward": 0.8343749344348907, + "rewards/format_reward": 0.9765625, + "step": 1180 + }, + { + "completion_length": 75.5546875, + "epoch": 5.392694063926941, + "grad_norm": 2.8857522010803223, + "kl": 0.107177734375, + "learning_rate": 4.6073059360730595e-07, + "loss": 0.0043, + "reward": 1.685937523841858, + "reward_std": 0.14966705068945885, + "rewards/accuracy_reward": 0.6859375238418579, + "rewards/format_reward": 1.0, + "step": 1181 + }, + { + "completion_length": 64.7890625, + "epoch": 5.397260273972603, + "grad_norm": 2.9432222843170166, + "kl": 0.14306640625, + "learning_rate": 4.602739726027397e-07, + "loss": 0.0057, + "reward": 1.8203496932983398, + "reward_std": 0.20706991106271744, + "rewards/accuracy_reward": 0.8281622231006622, + "rewards/format_reward": 0.9921875, + "step": 1182 + }, + { + "completion_length": 88.6328125, + "epoch": 5.401826484018265, + "grad_norm": 2.689836025238037, + "kl": 0.0927734375, + "learning_rate": 4.5981735159817347e-07, + "loss": 0.0037, + "reward": 1.772569477558136, + "reward_std": 0.2539152055978775, + "rewards/accuracy_reward": 0.8116318881511688, + "rewards/format_reward": 0.9609375, + "step": 1183 + }, + { + "completion_length": 74.28125, + "epoch": 5.406392694063927, + "grad_norm": 3.8888282775878906, + "kl": 0.14453125, + "learning_rate": 4.593607305936073e-07, + "loss": 0.0058, + "reward": 1.6562862396240234, + "reward_std": 0.22701606899499893, + "rewards/accuracy_reward": 0.679723709821701, + "rewards/format_reward": 0.9765625, + "step": 1184 + }, + { + "completion_length": 90.3671875, + "epoch": 5.410958904109589, + "grad_norm": 5.0972981452941895, + "kl": 0.13427734375, + "learning_rate": 4.589041095890411e-07, + "loss": 0.0054, + "reward": 1.7076823115348816, + "reward_std": 0.1942017376422882, + "rewards/accuracy_reward": 0.7154947817325592, + "rewards/format_reward": 0.9921875, + "step": 1185 + }, + { + "completion_length": 94.515625, + "epoch": 5.415525114155251, + "grad_norm": 2.337641716003418, + "kl": 0.112548828125, + "learning_rate": 4.5844748858447483e-07, + "loss": 0.0045, + "reward": 1.8321614265441895, + "reward_std": 0.08036978542804718, + "rewards/accuracy_reward": 0.8321613669395447, + "rewards/format_reward": 1.0, + "step": 1186 + }, + { + "completion_length": 70.9296875, + "epoch": 5.420091324200913, + "grad_norm": 1.6933238506317139, + "kl": 0.099853515625, + "learning_rate": 4.579908675799087e-07, + "loss": 0.004, + "reward": 1.6846354007720947, + "reward_std": 0.20877984166145325, + "rewards/accuracy_reward": 0.7002603709697723, + "rewards/format_reward": 0.984375, + "step": 1187 + }, + { + "completion_length": 86.8359375, + "epoch": 5.424657534246576, + "grad_norm": 9.663613319396973, + "kl": 0.094970703125, + "learning_rate": 4.5753424657534246e-07, + "loss": 0.0038, + "reward": 1.784375011920929, + "reward_std": 0.10888781771063805, + "rewards/accuracy_reward": 0.7843749225139618, + "rewards/format_reward": 1.0, + "step": 1188 + }, + { + "completion_length": 90.921875, + "epoch": 5.429223744292237, + "grad_norm": 3.4232490062713623, + "kl": 0.1181640625, + "learning_rate": 4.5707762557077625e-07, + "loss": 0.0047, + "reward": 1.7392657399177551, + "reward_std": 0.12979238480329514, + "rewards/accuracy_reward": 0.7470781803131104, + "rewards/format_reward": 0.9921875, + "step": 1189 + }, + { + "completion_length": 74.96875, + "epoch": 5.433789954337899, + "grad_norm": 2.6465795040130615, + "kl": 0.132080078125, + "learning_rate": 4.5662100456621e-07, + "loss": 0.0053, + "reward": 1.7466145753860474, + "reward_std": 0.129929106682539, + "rewards/accuracy_reward": 0.7544271051883698, + "rewards/format_reward": 0.9921875, + "step": 1190 + }, + { + "completion_length": 77.8828125, + "epoch": 5.438356164383562, + "grad_norm": 4.823699951171875, + "kl": 0.12158203125, + "learning_rate": 4.561643835616438e-07, + "loss": 0.0049, + "reward": 1.679805874824524, + "reward_std": 0.12897202000021935, + "rewards/accuracy_reward": 0.6798058748245239, + "rewards/format_reward": 1.0, + "step": 1191 + }, + { + "completion_length": 61.109375, + "epoch": 5.442922374429224, + "grad_norm": 1.3143653869628906, + "kl": 0.13623046875, + "learning_rate": 4.557077625570776e-07, + "loss": 0.0055, + "reward": 1.6895833611488342, + "reward_std": 0.10995453409850597, + "rewards/accuracy_reward": 0.6895833015441895, + "rewards/format_reward": 1.0, + "step": 1192 + }, + { + "completion_length": 94.640625, + "epoch": 5.447488584474886, + "grad_norm": 26.47618293762207, + "kl": 0.086181640625, + "learning_rate": 4.552511415525114e-07, + "loss": 0.0035, + "reward": 1.8638021349906921, + "reward_std": 0.14757593348622322, + "rewards/accuracy_reward": 0.8872395753860474, + "rewards/format_reward": 0.9765625, + "step": 1193 + }, + { + "completion_length": 52.4609375, + "epoch": 5.4520547945205475, + "grad_norm": 5.206370830535889, + "kl": 0.17138671875, + "learning_rate": 4.547945205479452e-07, + "loss": 0.0069, + "reward": 1.6548038721084595, + "reward_std": 0.23970913887023926, + "rewards/accuracy_reward": 0.6704288721084595, + "rewards/format_reward": 0.984375, + "step": 1194 + }, + { + "completion_length": 61.78125, + "epoch": 5.45662100456621, + "grad_norm": 4.111104488372803, + "kl": 0.15966796875, + "learning_rate": 4.54337899543379e-07, + "loss": 0.0064, + "reward": 1.7300130128860474, + "reward_std": 0.17958877980709076, + "rewards/accuracy_reward": 0.7456380128860474, + "rewards/format_reward": 0.984375, + "step": 1195 + }, + { + "completion_length": 79.1796875, + "epoch": 5.461187214611872, + "grad_norm": 1.4285087585449219, + "kl": 0.10009765625, + "learning_rate": 4.5388127853881276e-07, + "loss": 0.004, + "reward": 1.8193080425262451, + "reward_std": 0.11568646691739559, + "rewards/accuracy_reward": 0.827120453119278, + "rewards/format_reward": 0.9921875, + "step": 1196 + }, + { + "completion_length": 80.234375, + "epoch": 5.465753424657534, + "grad_norm": 24.182233810424805, + "kl": 0.119384765625, + "learning_rate": 4.534246575342466e-07, + "loss": 0.0048, + "reward": 1.6295573711395264, + "reward_std": 0.16487130522727966, + "rewards/accuracy_reward": 0.6451822966337204, + "rewards/format_reward": 0.984375, + "step": 1197 + }, + { + "completion_length": 73.296875, + "epoch": 5.470319634703197, + "grad_norm": 1.8759607076644897, + "kl": 0.115234375, + "learning_rate": 4.5296803652968034e-07, + "loss": 0.0046, + "reward": 1.742903709411621, + "reward_std": 0.14027476869523525, + "rewards/accuracy_reward": 0.7585286498069763, + "rewards/format_reward": 0.984375, + "step": 1198 + }, + { + "completion_length": 56.65625, + "epoch": 5.474885844748858, + "grad_norm": 2.124757766723633, + "kl": 0.129638671875, + "learning_rate": 4.525114155251141e-07, + "loss": 0.0052, + "reward": 1.766406238079071, + "reward_std": 0.11525032296776772, + "rewards/accuracy_reward": 0.7664062678813934, + "rewards/format_reward": 1.0, + "step": 1199 + }, + { + "completion_length": 71.984375, + "epoch": 5.47945205479452, + "grad_norm": 3.025282382965088, + "kl": 0.1591796875, + "learning_rate": 4.520547945205479e-07, + "loss": 0.0064, + "reward": 1.565625011920929, + "reward_std": 0.25334832072257996, + "rewards/accuracy_reward": 0.5812499821186066, + "rewards/format_reward": 0.984375, + "step": 1200 + }, + { + "completion_length": 71.421875, + "epoch": 5.4840182648401825, + "grad_norm": 3.4319229125976562, + "kl": 0.141357421875, + "learning_rate": 4.5159817351598175e-07, + "loss": 0.0057, + "reward": 1.8551432490348816, + "reward_std": 0.10183484852313995, + "rewards/accuracy_reward": 0.8551432490348816, + "rewards/format_reward": 1.0, + "step": 1201 + }, + { + "completion_length": 85.296875, + "epoch": 5.488584474885845, + "grad_norm": 2.546898365020752, + "kl": 0.114501953125, + "learning_rate": 4.511415525114155e-07, + "loss": 0.0046, + "reward": 1.6248489022254944, + "reward_std": 0.21220972388982773, + "rewards/accuracy_reward": 0.6404739022254944, + "rewards/format_reward": 0.984375, + "step": 1202 + }, + { + "completion_length": 107.4453125, + "epoch": 5.493150684931507, + "grad_norm": 2.221569299697876, + "kl": 0.12939453125, + "learning_rate": 4.5068493150684927e-07, + "loss": 0.0052, + "reward": 1.7825521230697632, + "reward_std": 0.11523282900452614, + "rewards/accuracy_reward": 0.7903645038604736, + "rewards/format_reward": 0.9921875, + "step": 1203 + }, + { + "completion_length": 68.296875, + "epoch": 5.497716894977169, + "grad_norm": 5.981943130493164, + "kl": 0.162109375, + "learning_rate": 4.502283105022831e-07, + "loss": 0.0065, + "reward": 1.6208333373069763, + "reward_std": 0.18434154987335205, + "rewards/accuracy_reward": 0.6208333373069763, + "rewards/format_reward": 1.0, + "step": 1204 + }, + { + "completion_length": 82.796875, + "epoch": 5.502283105022831, + "grad_norm": 6.735628128051758, + "kl": 0.10888671875, + "learning_rate": 4.497716894977169e-07, + "loss": 0.0044, + "reward": 1.7694801092147827, + "reward_std": 0.08905210345983505, + "rewards/accuracy_reward": 0.7694801688194275, + "rewards/format_reward": 1.0, + "step": 1205 + }, + { + "completion_length": 48.140625, + "epoch": 5.506849315068493, + "grad_norm": 4.568028926849365, + "kl": 0.12255859375, + "learning_rate": 4.4931506849315063e-07, + "loss": 0.0049, + "reward": 1.8573929071426392, + "reward_std": 0.12051096558570862, + "rewards/accuracy_reward": 0.8573929369449615, + "rewards/format_reward": 1.0, + "step": 1206 + }, + { + "completion_length": 79.2265625, + "epoch": 5.511415525114155, + "grad_norm": 2.16815447807312, + "kl": 0.114990234375, + "learning_rate": 4.488584474885845e-07, + "loss": 0.0046, + "reward": 1.8097842931747437, + "reward_std": 0.13499605283141136, + "rewards/accuracy_reward": 0.8175966441631317, + "rewards/format_reward": 0.9921875, + "step": 1207 + }, + { + "completion_length": 83.3828125, + "epoch": 5.5159817351598175, + "grad_norm": 9.177694320678711, + "kl": 0.100830078125, + "learning_rate": 4.4840182648401826e-07, + "loss": 0.004, + "reward": 1.8088541626930237, + "reward_std": 0.14230135083198547, + "rewards/accuracy_reward": 0.8166666626930237, + "rewards/format_reward": 0.9921875, + "step": 1208 + }, + { + "completion_length": 66.96875, + "epoch": 5.52054794520548, + "grad_norm": 5.906530380249023, + "kl": 0.109375, + "learning_rate": 4.4794520547945205e-07, + "loss": 0.0044, + "reward": 1.6549479365348816, + "reward_std": 0.16808292269706726, + "rewards/accuracy_reward": 0.6549479067325592, + "rewards/format_reward": 1.0, + "step": 1209 + }, + { + "completion_length": 71.90625, + "epoch": 5.525114155251142, + "grad_norm": 2.249753713607788, + "kl": 0.14599609375, + "learning_rate": 4.474885844748858e-07, + "loss": 0.0058, + "reward": 1.7879971265792847, + "reward_std": 0.13272903114557266, + "rewards/accuracy_reward": 0.8036221563816071, + "rewards/format_reward": 0.984375, + "step": 1210 + }, + { + "completion_length": 71.3359375, + "epoch": 5.529680365296803, + "grad_norm": 3.3971409797668457, + "kl": 0.142578125, + "learning_rate": 4.470319634703196e-07, + "loss": 0.0057, + "reward": 1.7578125596046448, + "reward_std": 0.14966705441474915, + "rewards/accuracy_reward": 0.7578124403953552, + "rewards/format_reward": 1.0, + "step": 1211 + }, + { + "completion_length": 82.359375, + "epoch": 5.534246575342466, + "grad_norm": 4.350315093994141, + "kl": 0.116455078125, + "learning_rate": 4.465753424657534e-07, + "loss": 0.0047, + "reward": 1.66940575838089, + "reward_std": 0.14690347015857697, + "rewards/accuracy_reward": 0.6850306689739227, + "rewards/format_reward": 0.984375, + "step": 1212 + }, + { + "completion_length": 81.0234375, + "epoch": 5.538812785388128, + "grad_norm": 2.2065749168395996, + "kl": 0.094970703125, + "learning_rate": 4.461187214611872e-07, + "loss": 0.0038, + "reward": 1.7763858437538147, + "reward_std": 0.07247142866253853, + "rewards/accuracy_reward": 0.7763858437538147, + "rewards/format_reward": 1.0, + "step": 1213 + }, + { + "completion_length": 50.5703125, + "epoch": 5.54337899543379, + "grad_norm": 1.8587976694107056, + "kl": 0.17333984375, + "learning_rate": 4.45662100456621e-07, + "loss": 0.0069, + "reward": 1.8041483163833618, + "reward_std": 0.1315086344256997, + "rewards/accuracy_reward": 0.8119607865810394, + "rewards/format_reward": 0.9921875, + "step": 1214 + }, + { + "completion_length": 72.6953125, + "epoch": 5.5479452054794525, + "grad_norm": 2.4681146144866943, + "kl": 0.10400390625, + "learning_rate": 4.4520547945205477e-07, + "loss": 0.0042, + "reward": 1.7903646230697632, + "reward_std": 0.08470549248158932, + "rewards/accuracy_reward": 0.7903645634651184, + "rewards/format_reward": 1.0, + "step": 1215 + }, + { + "completion_length": 85.7890625, + "epoch": 5.552511415525114, + "grad_norm": 3.0611839294433594, + "kl": 0.077392578125, + "learning_rate": 4.4474885844748856e-07, + "loss": 0.0031, + "reward": 1.7632812857627869, + "reward_std": 0.20296207815408707, + "rewards/accuracy_reward": 0.7789061665534973, + "rewards/format_reward": 0.984375, + "step": 1216 + }, + { + "completion_length": 68.2421875, + "epoch": 5.557077625570776, + "grad_norm": 2.2732932567596436, + "kl": 0.120849609375, + "learning_rate": 4.442922374429224e-07, + "loss": 0.0048, + "reward": 1.663699746131897, + "reward_std": 0.1902095228433609, + "rewards/accuracy_reward": 0.6715123057365417, + "rewards/format_reward": 0.9921875, + "step": 1217 + }, + { + "completion_length": 87.59375, + "epoch": 5.561643835616438, + "grad_norm": 1.4389771223068237, + "kl": 0.117431640625, + "learning_rate": 4.4383561643835613e-07, + "loss": 0.0047, + "reward": 1.8174665570259094, + "reward_std": 0.0971344392746687, + "rewards/accuracy_reward": 0.825279027223587, + "rewards/format_reward": 0.9921875, + "step": 1218 + }, + { + "completion_length": 98.3046875, + "epoch": 5.566210045662101, + "grad_norm": 1.4271727800369263, + "kl": 0.12255859375, + "learning_rate": 4.433789954337899e-07, + "loss": 0.0049, + "reward": 1.8162059783935547, + "reward_std": 0.11401430889964104, + "rewards/accuracy_reward": 0.8240183591842651, + "rewards/format_reward": 0.9921875, + "step": 1219 + }, + { + "completion_length": 73.3046875, + "epoch": 5.570776255707763, + "grad_norm": 2.519207239151001, + "kl": 0.117919921875, + "learning_rate": 4.429223744292237e-07, + "loss": 0.0047, + "reward": 1.7786458730697632, + "reward_std": 0.13994821161031723, + "rewards/accuracy_reward": 0.7786457538604736, + "rewards/format_reward": 1.0, + "step": 1220 + }, + { + "completion_length": 68.9453125, + "epoch": 5.575342465753424, + "grad_norm": 2.901245594024658, + "kl": 0.16357421875, + "learning_rate": 4.4246575342465755e-07, + "loss": 0.0065, + "reward": 1.639657735824585, + "reward_std": 0.24836767464876175, + "rewards/accuracy_reward": 0.655282735824585, + "rewards/format_reward": 0.984375, + "step": 1221 + }, + { + "completion_length": 96.234375, + "epoch": 5.579908675799087, + "grad_norm": 1.3380907773971558, + "kl": 0.1065673828125, + "learning_rate": 4.420091324200913e-07, + "loss": 0.0043, + "reward": 1.8123116493225098, + "reward_std": 0.06037373095750809, + "rewards/accuracy_reward": 0.8123115301132202, + "rewards/format_reward": 1.0, + "step": 1222 + }, + { + "completion_length": 78.203125, + "epoch": 5.584474885844749, + "grad_norm": 2.1692612171173096, + "kl": 0.11328125, + "learning_rate": 4.4155251141552507e-07, + "loss": 0.0045, + "reward": 1.852519154548645, + "reward_std": 0.08084761165082455, + "rewards/accuracy_reward": 0.8603315949440002, + "rewards/format_reward": 0.9921875, + "step": 1223 + }, + { + "completion_length": 76.1015625, + "epoch": 5.589041095890411, + "grad_norm": 1.5047615766525269, + "kl": 0.13916015625, + "learning_rate": 4.410958904109589e-07, + "loss": 0.0056, + "reward": 1.8007813096046448, + "reward_std": 0.13284454122185707, + "rewards/accuracy_reward": 0.8164061903953552, + "rewards/format_reward": 0.984375, + "step": 1224 + }, + { + "completion_length": 95.3671875, + "epoch": 5.593607305936073, + "grad_norm": 5.50695276260376, + "kl": 0.0765380859375, + "learning_rate": 4.406392694063927e-07, + "loss": 0.0031, + "reward": 1.821587860584259, + "reward_std": 0.11784346960484982, + "rewards/accuracy_reward": 0.8294003307819366, + "rewards/format_reward": 0.9921875, + "step": 1225 + }, + { + "completion_length": 79.359375, + "epoch": 5.598173515981735, + "grad_norm": 1.8532605171203613, + "kl": 0.123046875, + "learning_rate": 4.4018264840182643e-07, + "loss": 0.0049, + "reward": 1.672764003276825, + "reward_std": 0.1454983726143837, + "rewards/accuracy_reward": 0.672764003276825, + "rewards/format_reward": 1.0, + "step": 1226 + }, + { + "completion_length": 49.9296875, + "epoch": 5.602739726027397, + "grad_norm": 4.534260272979736, + "kl": 0.18798828125, + "learning_rate": 4.397260273972603e-07, + "loss": 0.0075, + "reward": 1.7486504912376404, + "reward_std": 0.23138123750686646, + "rewards/accuracy_reward": 0.7642754018306732, + "rewards/format_reward": 0.984375, + "step": 1227 + }, + { + "completion_length": 79.5390625, + "epoch": 5.607305936073059, + "grad_norm": 2.4813179969787598, + "kl": 0.113525390625, + "learning_rate": 4.3926940639269406e-07, + "loss": 0.0045, + "reward": 1.854801058769226, + "reward_std": 0.15426605194807053, + "rewards/accuracy_reward": 0.8782385289669037, + "rewards/format_reward": 0.9765625, + "step": 1228 + }, + { + "completion_length": 65.8984375, + "epoch": 5.6118721461187215, + "grad_norm": 1.7068575620651245, + "kl": 0.123291015625, + "learning_rate": 4.3881278538812785e-07, + "loss": 0.0049, + "reward": 1.8518972992897034, + "reward_std": 0.11522487178444862, + "rewards/accuracy_reward": 0.8675222992897034, + "rewards/format_reward": 0.984375, + "step": 1229 + }, + { + "completion_length": 73.5390625, + "epoch": 5.616438356164384, + "grad_norm": 2.039097309112549, + "kl": 0.150390625, + "learning_rate": 4.383561643835616e-07, + "loss": 0.006, + "reward": 1.8411458730697632, + "reward_std": 0.12557167932391167, + "rewards/accuracy_reward": 0.8411458134651184, + "rewards/format_reward": 1.0, + "step": 1230 + }, + { + "completion_length": 62.3515625, + "epoch": 5.621004566210045, + "grad_norm": 1.1969468593597412, + "kl": 0.14990234375, + "learning_rate": 4.378995433789954e-07, + "loss": 0.006, + "reward": 1.891406238079071, + "reward_std": 0.055242715403437614, + "rewards/accuracy_reward": 0.8914062678813934, + "rewards/format_reward": 1.0, + "step": 1231 + }, + { + "completion_length": 65.2421875, + "epoch": 5.6255707762557075, + "grad_norm": 0.9585725665092468, + "kl": 0.095703125, + "learning_rate": 4.374429223744292e-07, + "loss": 0.0038, + "reward": 1.806380271911621, + "reward_std": 0.053401291370391846, + "rewards/accuracy_reward": 0.8063801527023315, + "rewards/format_reward": 1.0, + "step": 1232 + }, + { + "completion_length": 94.8828125, + "epoch": 5.63013698630137, + "grad_norm": 1.548374891281128, + "kl": 0.121826171875, + "learning_rate": 4.36986301369863e-07, + "loss": 0.0049, + "reward": 1.8385416865348816, + "reward_std": 0.07087302953004837, + "rewards/accuracy_reward": 0.8463541269302368, + "rewards/format_reward": 0.9921875, + "step": 1233 + }, + { + "completion_length": 65.015625, + "epoch": 5.634703196347032, + "grad_norm": 3.663956880569458, + "kl": 0.10302734375, + "learning_rate": 4.365296803652968e-07, + "loss": 0.0041, + "reward": 1.701562523841858, + "reward_std": 0.11350465193390846, + "rewards/accuracy_reward": 0.7015624940395355, + "rewards/format_reward": 1.0, + "step": 1234 + }, + { + "completion_length": 76.859375, + "epoch": 5.639269406392694, + "grad_norm": 4.082939624786377, + "kl": 0.123046875, + "learning_rate": 4.3607305936073057e-07, + "loss": 0.0049, + "reward": 1.7964844107627869, + "reward_std": 0.1577010676264763, + "rewards/accuracy_reward": 0.7964843213558197, + "rewards/format_reward": 1.0, + "step": 1235 + }, + { + "completion_length": 72.140625, + "epoch": 5.6438356164383565, + "grad_norm": 2.4908390045166016, + "kl": 0.15625, + "learning_rate": 4.3561643835616436e-07, + "loss": 0.0062, + "reward": 1.7734375596046448, + "reward_std": 0.13743899390101433, + "rewards/accuracy_reward": 0.7734374701976776, + "rewards/format_reward": 1.0, + "step": 1236 + }, + { + "completion_length": 73.1953125, + "epoch": 5.648401826484018, + "grad_norm": 2.1884684562683105, + "kl": 0.122314453125, + "learning_rate": 4.351598173515982e-07, + "loss": 0.0049, + "reward": 1.7967329621315002, + "reward_std": 0.22378800809383392, + "rewards/accuracy_reward": 0.8123579323291779, + "rewards/format_reward": 0.984375, + "step": 1237 + }, + { + "completion_length": 74.21875, + "epoch": 5.65296803652968, + "grad_norm": 2.7931785583496094, + "kl": 0.149169921875, + "learning_rate": 4.3470319634703193e-07, + "loss": 0.006, + "reward": 1.7240886092185974, + "reward_std": 0.15288061648607254, + "rewards/accuracy_reward": 0.7240885198116302, + "rewards/format_reward": 1.0, + "step": 1238 + }, + { + "completion_length": 74.875, + "epoch": 5.657534246575342, + "grad_norm": 7.83119535446167, + "kl": 0.398681640625, + "learning_rate": 4.342465753424657e-07, + "loss": 0.0159, + "reward": 1.7018229365348816, + "reward_std": 0.20128536224365234, + "rewards/accuracy_reward": 0.7252604067325592, + "rewards/format_reward": 0.9765625, + "step": 1239 + }, + { + "completion_length": 85.15625, + "epoch": 5.662100456621005, + "grad_norm": 2.2747838497161865, + "kl": 0.084716796875, + "learning_rate": 4.337899543378995e-07, + "loss": 0.0034, + "reward": 1.7470728158950806, + "reward_std": 0.09693595767021179, + "rewards/accuracy_reward": 0.7470727562904358, + "rewards/format_reward": 1.0, + "step": 1240 + }, + { + "completion_length": 62.78125, + "epoch": 5.666666666666667, + "grad_norm": 3.267308235168457, + "kl": 0.15283203125, + "learning_rate": 4.3333333333333335e-07, + "loss": 0.0061, + "reward": 1.7454678416252136, + "reward_std": 0.1368042230606079, + "rewards/accuracy_reward": 0.7532803416252136, + "rewards/format_reward": 0.9921875, + "step": 1241 + }, + { + "completion_length": 88.375, + "epoch": 5.671232876712329, + "grad_norm": 2.6403214931488037, + "kl": 0.1396484375, + "learning_rate": 4.328767123287671e-07, + "loss": 0.0056, + "reward": 1.7575656175613403, + "reward_std": 0.20803096145391464, + "rewards/accuracy_reward": 0.7810031175613403, + "rewards/format_reward": 0.9765625, + "step": 1242 + }, + { + "completion_length": 79.96875, + "epoch": 5.675799086757991, + "grad_norm": 3.1201863288879395, + "kl": 0.122314453125, + "learning_rate": 4.3242009132420087e-07, + "loss": 0.0049, + "reward": 1.7832190990447998, + "reward_std": 0.16100692749023438, + "rewards/accuracy_reward": 0.7988439798355103, + "rewards/format_reward": 0.984375, + "step": 1243 + }, + { + "completion_length": 81.8515625, + "epoch": 5.680365296803653, + "grad_norm": 2.047901153564453, + "kl": 0.121337890625, + "learning_rate": 4.319634703196347e-07, + "loss": 0.0049, + "reward": 1.760881781578064, + "reward_std": 0.11731128394603729, + "rewards/accuracy_reward": 0.7608817219734192, + "rewards/format_reward": 1.0, + "step": 1244 + }, + { + "completion_length": 73.046875, + "epoch": 5.684931506849315, + "grad_norm": 1.7335350513458252, + "kl": 0.147705078125, + "learning_rate": 4.315068493150685e-07, + "loss": 0.0059, + "reward": 1.7863653898239136, + "reward_std": 0.10918881744146347, + "rewards/accuracy_reward": 0.8019903302192688, + "rewards/format_reward": 0.984375, + "step": 1245 + }, + { + "completion_length": 71.8125, + "epoch": 5.689497716894977, + "grad_norm": 2.161808967590332, + "kl": 0.149169921875, + "learning_rate": 4.3105022831050223e-07, + "loss": 0.006, + "reward": 1.7140624523162842, + "reward_std": 0.14955899119377136, + "rewards/accuracy_reward": 0.7140624821186066, + "rewards/format_reward": 1.0, + "step": 1246 + }, + { + "completion_length": 70.53125, + "epoch": 5.69406392694064, + "grad_norm": 1.7596287727355957, + "kl": 0.13671875, + "learning_rate": 4.3059360730593607e-07, + "loss": 0.0055, + "reward": 1.6859374642372131, + "reward_std": 0.13941731676459312, + "rewards/accuracy_reward": 0.6859374940395355, + "rewards/format_reward": 1.0, + "step": 1247 + }, + { + "completion_length": 78.921875, + "epoch": 5.698630136986301, + "grad_norm": 1.425408959388733, + "kl": 0.093994140625, + "learning_rate": 4.3013698630136986e-07, + "loss": 0.0038, + "reward": 1.8960938453674316, + "reward_std": 0.10968662425875664, + "rewards/accuracy_reward": 0.9039062261581421, + "rewards/format_reward": 0.9921875, + "step": 1248 + }, + { + "completion_length": 82.5625, + "epoch": 5.703196347031963, + "grad_norm": 3.0568716526031494, + "kl": 0.115234375, + "learning_rate": 4.2968036529680365e-07, + "loss": 0.0046, + "reward": 1.6471354365348816, + "reward_std": 0.22037852555513382, + "rewards/accuracy_reward": 0.6549479067325592, + "rewards/format_reward": 0.9921875, + "step": 1249 + }, + { + "completion_length": 92.828125, + "epoch": 5.707762557077626, + "grad_norm": 27.536405563354492, + "kl": 0.10205078125, + "learning_rate": 4.292237442922374e-07, + "loss": 0.0041, + "reward": 1.75323486328125, + "reward_std": 0.09919268637895584, + "rewards/accuracy_reward": 0.76104736328125, + "rewards/format_reward": 0.9921875, + "step": 1250 + }, + { + "completion_length": 75.4609375, + "epoch": 5.712328767123288, + "grad_norm": 4.183087348937988, + "kl": 0.100830078125, + "learning_rate": 4.287671232876712e-07, + "loss": 0.004, + "reward": 1.625, + "reward_std": 0.13258251547813416, + "rewards/accuracy_reward": 0.6249999403953552, + "rewards/format_reward": 1.0, + "step": 1251 + }, + { + "completion_length": 74.15625, + "epoch": 5.71689497716895, + "grad_norm": 7.003372669219971, + "kl": 0.138671875, + "learning_rate": 4.28310502283105e-07, + "loss": 0.0055, + "reward": 1.631250023841858, + "reward_std": 0.23694030940532684, + "rewards/accuracy_reward": 0.6390624642372131, + "rewards/format_reward": 0.9921875, + "step": 1252 + }, + { + "completion_length": 71.2265625, + "epoch": 5.7214611872146115, + "grad_norm": 4.267425060272217, + "kl": 0.159912109375, + "learning_rate": 4.278538812785388e-07, + "loss": 0.0064, + "reward": 1.7780134677886963, + "reward_std": 0.16748925298452377, + "rewards/accuracy_reward": 0.7858259081840515, + "rewards/format_reward": 0.9921875, + "step": 1253 + }, + { + "completion_length": 76.8984375, + "epoch": 5.726027397260274, + "grad_norm": 2.602168083190918, + "kl": 0.140625, + "learning_rate": 4.273972602739726e-07, + "loss": 0.0056, + "reward": 1.7774969339370728, + "reward_std": 0.1920681744813919, + "rewards/accuracy_reward": 0.7853094637393951, + "rewards/format_reward": 0.9921875, + "step": 1254 + }, + { + "completion_length": 80.0703125, + "epoch": 5.730593607305936, + "grad_norm": 3.621494770050049, + "kl": 0.14501953125, + "learning_rate": 4.2694063926940637e-07, + "loss": 0.0058, + "reward": 1.7632812857627869, + "reward_std": 0.18639566004276276, + "rewards/accuracy_reward": 0.7710936963558197, + "rewards/format_reward": 0.9921875, + "step": 1255 + }, + { + "completion_length": 81.65625, + "epoch": 5.735159817351598, + "grad_norm": 2.0993213653564453, + "kl": 0.143310546875, + "learning_rate": 4.2648401826484016e-07, + "loss": 0.0057, + "reward": 1.765625, + "reward_std": 0.1767766885459423, + "rewards/accuracy_reward": 0.7812499403953552, + "rewards/format_reward": 0.984375, + "step": 1256 + }, + { + "completion_length": 63.765625, + "epoch": 5.739726027397261, + "grad_norm": 2.3051700592041016, + "kl": 0.19140625, + "learning_rate": 4.26027397260274e-07, + "loss": 0.0077, + "reward": 1.800000011920929, + "reward_std": 0.11946406960487366, + "rewards/accuracy_reward": 0.8078124821186066, + "rewards/format_reward": 0.9921875, + "step": 1257 + }, + { + "completion_length": 90.7265625, + "epoch": 5.744292237442922, + "grad_norm": 5.680175304412842, + "kl": 0.13720703125, + "learning_rate": 4.2557077625570773e-07, + "loss": 0.0055, + "reward": 1.7391927242279053, + "reward_std": 0.1934903860092163, + "rewards/accuracy_reward": 0.7548176646232605, + "rewards/format_reward": 0.984375, + "step": 1258 + }, + { + "completion_length": 67.1171875, + "epoch": 5.748858447488584, + "grad_norm": 3.6025280952453613, + "kl": 0.1533203125, + "learning_rate": 4.251141552511415e-07, + "loss": 0.0061, + "reward": 1.6899954080581665, + "reward_std": 0.177649587392807, + "rewards/accuracy_reward": 0.6899954080581665, + "rewards/format_reward": 1.0, + "step": 1259 + }, + { + "completion_length": 78.46875, + "epoch": 5.7534246575342465, + "grad_norm": 2.8901851177215576, + "kl": 0.150390625, + "learning_rate": 4.246575342465753e-07, + "loss": 0.006, + "reward": 1.65723317861557, + "reward_std": 0.15903639048337936, + "rewards/accuracy_reward": 0.6728581488132477, + "rewards/format_reward": 0.984375, + "step": 1260 + }, + { + "completion_length": 90.2890625, + "epoch": 5.757990867579909, + "grad_norm": 30.16987419128418, + "kl": 0.11328125, + "learning_rate": 4.2420091324200915e-07, + "loss": 0.0045, + "reward": 1.80078125, + "reward_std": 0.13798470050096512, + "rewards/accuracy_reward": 0.8007811605930328, + "rewards/format_reward": 1.0, + "step": 1261 + }, + { + "completion_length": 69.9375, + "epoch": 5.762557077625571, + "grad_norm": 4.615162372589111, + "kl": 0.2236328125, + "learning_rate": 4.237442922374429e-07, + "loss": 0.0089, + "reward": 1.7741782069206238, + "reward_std": 0.1786958873271942, + "rewards/accuracy_reward": 0.7819906771183014, + "rewards/format_reward": 0.9921875, + "step": 1262 + }, + { + "completion_length": 85.2421875, + "epoch": 5.767123287671232, + "grad_norm": 2.68100905418396, + "kl": 0.10888671875, + "learning_rate": 4.2328767123287667e-07, + "loss": 0.0044, + "reward": 1.7717448472976685, + "reward_std": 0.1444082222878933, + "rewards/accuracy_reward": 0.7795572876930237, + "rewards/format_reward": 0.9921875, + "step": 1263 + }, + { + "completion_length": 89.0390625, + "epoch": 5.771689497716895, + "grad_norm": 3.867462158203125, + "kl": 0.141845703125, + "learning_rate": 4.228310502283105e-07, + "loss": 0.0057, + "reward": 1.6650173664093018, + "reward_std": 0.21590976417064667, + "rewards/accuracy_reward": 0.6806423366069794, + "rewards/format_reward": 0.984375, + "step": 1264 + }, + { + "completion_length": 72.515625, + "epoch": 5.776255707762557, + "grad_norm": 9.51895523071289, + "kl": 0.103271484375, + "learning_rate": 4.223744292237443e-07, + "loss": 0.0041, + "reward": 1.7899226546287537, + "reward_std": 0.1994732916355133, + "rewards/accuracy_reward": 0.8133600950241089, + "rewards/format_reward": 0.9765625, + "step": 1265 + }, + { + "completion_length": 87.1328125, + "epoch": 5.780821917808219, + "grad_norm": 3.7063732147216797, + "kl": 0.142578125, + "learning_rate": 4.2191780821917803e-07, + "loss": 0.0057, + "reward": 1.7083333730697632, + "reward_std": 0.19097717106342316, + "rewards/accuracy_reward": 0.7161458134651184, + "rewards/format_reward": 0.9921875, + "step": 1266 + }, + { + "completion_length": 85.3828125, + "epoch": 5.7853881278538815, + "grad_norm": 2.703822374343872, + "kl": 0.13037109375, + "learning_rate": 4.2146118721461187e-07, + "loss": 0.0052, + "reward": 1.5992187857627869, + "reward_std": 0.2338416427373886, + "rewards/accuracy_reward": 0.6148437559604645, + "rewards/format_reward": 0.984375, + "step": 1267 + }, + { + "completion_length": 70.9453125, + "epoch": 5.789954337899544, + "grad_norm": 4.01165246963501, + "kl": 0.17236328125, + "learning_rate": 4.2100456621004566e-07, + "loss": 0.0069, + "reward": 1.794851839542389, + "reward_std": 0.17533704824745655, + "rewards/accuracy_reward": 0.8104767203330994, + "rewards/format_reward": 0.984375, + "step": 1268 + }, + { + "completion_length": 78.515625, + "epoch": 5.794520547945205, + "grad_norm": 4.9043684005737305, + "kl": 0.137451171875, + "learning_rate": 4.2054794520547945e-07, + "loss": 0.0055, + "reward": 1.6109731793403625, + "reward_std": 0.2282901182770729, + "rewards/accuracy_reward": 0.6187856197357178, + "rewards/format_reward": 0.9921875, + "step": 1269 + }, + { + "completion_length": 89.8671875, + "epoch": 5.799086757990867, + "grad_norm": 2.3333468437194824, + "kl": 0.099853515625, + "learning_rate": 4.200913242009132e-07, + "loss": 0.004, + "reward": 1.7177083492279053, + "reward_std": 0.17229503020644188, + "rewards/accuracy_reward": 0.7255208194255829, + "rewards/format_reward": 0.9921875, + "step": 1270 + }, + { + "completion_length": 71.6171875, + "epoch": 5.80365296803653, + "grad_norm": 2.1021461486816406, + "kl": 0.153564453125, + "learning_rate": 4.19634703196347e-07, + "loss": 0.0061, + "reward": 1.6888020634651184, + "reward_std": 0.30169400572776794, + "rewards/accuracy_reward": 0.7200521230697632, + "rewards/format_reward": 0.96875, + "step": 1271 + }, + { + "completion_length": 84.703125, + "epoch": 5.808219178082192, + "grad_norm": 4.3470282554626465, + "kl": 0.148193359375, + "learning_rate": 4.191780821917808e-07, + "loss": 0.0059, + "reward": 1.7307292222976685, + "reward_std": 0.1593589335680008, + "rewards/accuracy_reward": 0.7463541030883789, + "rewards/format_reward": 0.984375, + "step": 1272 + }, + { + "completion_length": 63.3203125, + "epoch": 5.812785388127854, + "grad_norm": 5.333644390106201, + "kl": 0.159912109375, + "learning_rate": 4.187214611872146e-07, + "loss": 0.0064, + "reward": 1.6519964933395386, + "reward_std": 0.18434231728315353, + "rewards/accuracy_reward": 0.6519964933395386, + "rewards/format_reward": 1.0, + "step": 1273 + }, + { + "completion_length": 67.6953125, + "epoch": 5.817351598173516, + "grad_norm": 2.9642715454101562, + "kl": 0.1298828125, + "learning_rate": 4.182648401826484e-07, + "loss": 0.0052, + "reward": 1.6983258724212646, + "reward_std": 0.15258603543043137, + "rewards/accuracy_reward": 0.698325902223587, + "rewards/format_reward": 1.0, + "step": 1274 + }, + { + "completion_length": 74.75, + "epoch": 5.821917808219178, + "grad_norm": 2.8370203971862793, + "kl": 0.1337890625, + "learning_rate": 4.1780821917808217e-07, + "loss": 0.0053, + "reward": 1.757942795753479, + "reward_std": 0.24083544313907623, + "rewards/accuracy_reward": 0.7813802063465118, + "rewards/format_reward": 0.9765625, + "step": 1275 + }, + { + "completion_length": 90.6640625, + "epoch": 5.82648401826484, + "grad_norm": 1.1865882873535156, + "kl": 0.076171875, + "learning_rate": 4.1735159817351596e-07, + "loss": 0.0031, + "reward": 1.8952009081840515, + "reward_std": 0.09130230359733105, + "rewards/accuracy_reward": 0.9030132591724396, + "rewards/format_reward": 0.9921875, + "step": 1276 + }, + { + "completion_length": 89.4375, + "epoch": 5.831050228310502, + "grad_norm": 2.1936404705047607, + "kl": 0.094482421875, + "learning_rate": 4.168949771689498e-07, + "loss": 0.0038, + "reward": 1.672282099723816, + "reward_std": 0.1674654446542263, + "rewards/accuracy_reward": 0.6957195997238159, + "rewards/format_reward": 0.9765625, + "step": 1277 + }, + { + "completion_length": 88.984375, + "epoch": 5.835616438356165, + "grad_norm": 2.0885391235351562, + "kl": 0.12646484375, + "learning_rate": 4.1643835616438353e-07, + "loss": 0.0051, + "reward": 1.6463323831558228, + "reward_std": 0.18611476570367813, + "rewards/accuracy_reward": 0.6619572937488556, + "rewards/format_reward": 0.984375, + "step": 1278 + }, + { + "completion_length": 75.7109375, + "epoch": 5.840182648401827, + "grad_norm": 6.192373275756836, + "kl": 0.135009765625, + "learning_rate": 4.159817351598173e-07, + "loss": 0.0054, + "reward": 1.634151816368103, + "reward_std": 0.11692540347576141, + "rewards/accuracy_reward": 0.6419642567634583, + "rewards/format_reward": 0.9921875, + "step": 1279 + }, + { + "completion_length": 80.0078125, + "epoch": 5.844748858447488, + "grad_norm": 2.136561155319214, + "kl": 0.16015625, + "learning_rate": 4.155251141552511e-07, + "loss": 0.0064, + "reward": 1.7792754769325256, + "reward_std": 0.09816346131265163, + "rewards/accuracy_reward": 0.7870878875255585, + "rewards/format_reward": 0.9921875, + "step": 1280 + }, + { + "completion_length": 99.0078125, + "epoch": 5.8493150684931505, + "grad_norm": 1.984879732131958, + "kl": 0.08544921875, + "learning_rate": 4.1506849315068495e-07, + "loss": 0.0034, + "reward": 1.7765625715255737, + "reward_std": 0.06629125960171223, + "rewards/accuracy_reward": 0.7843749523162842, + "rewards/format_reward": 0.9921875, + "step": 1281 + }, + { + "completion_length": 82.71875, + "epoch": 5.853881278538813, + "grad_norm": 2.8610336780548096, + "kl": 0.11669921875, + "learning_rate": 4.146118721461187e-07, + "loss": 0.0047, + "reward": 1.854315459728241, + "reward_std": 0.11991548165678978, + "rewards/accuracy_reward": 0.8543154299259186, + "rewards/format_reward": 1.0, + "step": 1282 + }, + { + "completion_length": 98.296875, + "epoch": 5.858447488584475, + "grad_norm": 1.845438003540039, + "kl": 0.101318359375, + "learning_rate": 4.1415525114155247e-07, + "loss": 0.004, + "reward": 1.8150991797447205, + "reward_std": 0.07342393416911364, + "rewards/accuracy_reward": 0.8150991201400757, + "rewards/format_reward": 1.0, + "step": 1283 + }, + { + "completion_length": 67.1953125, + "epoch": 5.863013698630137, + "grad_norm": 5.036915302276611, + "kl": 0.145751953125, + "learning_rate": 4.136986301369863e-07, + "loss": 0.0058, + "reward": 1.7412946820259094, + "reward_std": 0.175389152020216, + "rewards/accuracy_reward": 0.7491071224212646, + "rewards/format_reward": 0.9921875, + "step": 1284 + }, + { + "completion_length": 67.8828125, + "epoch": 5.867579908675799, + "grad_norm": 5.918455123901367, + "kl": 0.150390625, + "learning_rate": 4.132420091324201e-07, + "loss": 0.006, + "reward": 1.7437500357627869, + "reward_std": 0.2584189176559448, + "rewards/accuracy_reward": 0.7749999761581421, + "rewards/format_reward": 0.96875, + "step": 1285 + }, + { + "completion_length": 74.6015625, + "epoch": 5.872146118721461, + "grad_norm": 1.850948691368103, + "kl": 0.132568359375, + "learning_rate": 4.1278538812785383e-07, + "loss": 0.0053, + "reward": 1.9023438096046448, + "reward_std": 0.08417459111660719, + "rewards/accuracy_reward": 0.9023437201976776, + "rewards/format_reward": 1.0, + "step": 1286 + }, + { + "completion_length": 68.390625, + "epoch": 5.876712328767123, + "grad_norm": 4.7030534744262695, + "kl": 0.166015625, + "learning_rate": 4.1232876712328767e-07, + "loss": 0.0067, + "reward": 1.8146693706512451, + "reward_std": 0.1410301998257637, + "rewards/accuracy_reward": 0.8146693110466003, + "rewards/format_reward": 1.0, + "step": 1287 + }, + { + "completion_length": 83.390625, + "epoch": 5.8812785388127855, + "grad_norm": 2.299928665161133, + "kl": 0.114501953125, + "learning_rate": 4.1187214611872146e-07, + "loss": 0.0046, + "reward": 1.7009549140930176, + "reward_std": 0.18842098861932755, + "rewards/accuracy_reward": 0.7087673544883728, + "rewards/format_reward": 0.9921875, + "step": 1288 + }, + { + "completion_length": 77.59375, + "epoch": 5.885844748858448, + "grad_norm": 2.9478793144226074, + "kl": 0.1025390625, + "learning_rate": 4.1141552511415525e-07, + "loss": 0.0041, + "reward": 1.748046875, + "reward_std": 0.15785659104585648, + "rewards/accuracy_reward": 0.7480468451976776, + "rewards/format_reward": 1.0, + "step": 1289 + }, + { + "completion_length": 100.3359375, + "epoch": 5.890410958904109, + "grad_norm": 2.3754220008850098, + "kl": 0.112060546875, + "learning_rate": 4.10958904109589e-07, + "loss": 0.0045, + "reward": 1.7859375476837158, + "reward_std": 0.17908401414752007, + "rewards/accuracy_reward": 0.8015624284744263, + "rewards/format_reward": 0.984375, + "step": 1290 + }, + { + "completion_length": 69.46875, + "epoch": 5.894977168949771, + "grad_norm": 3.575843334197998, + "kl": 0.134765625, + "learning_rate": 4.105022831050228e-07, + "loss": 0.0054, + "reward": 1.845781147480011, + "reward_std": 0.08029642701148987, + "rewards/accuracy_reward": 0.8457811176776886, + "rewards/format_reward": 1.0, + "step": 1291 + }, + { + "completion_length": 94.5234375, + "epoch": 5.899543378995434, + "grad_norm": 7.622490406036377, + "kl": 0.0859375, + "learning_rate": 4.100456621004566e-07, + "loss": 0.0034, + "reward": 1.796093761920929, + "reward_std": 0.1538807898759842, + "rewards/accuracy_reward": 0.803906261920929, + "rewards/format_reward": 0.9921875, + "step": 1292 + }, + { + "completion_length": 82.234375, + "epoch": 5.904109589041096, + "grad_norm": 7.203249931335449, + "kl": 0.11962890625, + "learning_rate": 4.095890410958904e-07, + "loss": 0.0048, + "reward": 1.737942636013031, + "reward_std": 0.16788282990455627, + "rewards/accuracy_reward": 0.7457550466060638, + "rewards/format_reward": 0.9921875, + "step": 1293 + }, + { + "completion_length": 82.7578125, + "epoch": 5.908675799086758, + "grad_norm": 2.2500412464141846, + "kl": 0.101806640625, + "learning_rate": 4.091324200913242e-07, + "loss": 0.0041, + "reward": 1.7422269582748413, + "reward_std": 0.13895701617002487, + "rewards/accuracy_reward": 0.7500394582748413, + "rewards/format_reward": 0.9921875, + "step": 1294 + }, + { + "completion_length": 93.71875, + "epoch": 5.91324200913242, + "grad_norm": 2.051222324371338, + "kl": 0.122314453125, + "learning_rate": 4.0867579908675797e-07, + "loss": 0.0049, + "reward": 1.8802083730697632, + "reward_std": 0.08680902794003487, + "rewards/accuracy_reward": 0.8880207538604736, + "rewards/format_reward": 0.9921875, + "step": 1295 + }, + { + "completion_length": 98.2421875, + "epoch": 5.917808219178082, + "grad_norm": 1.9777852296829224, + "kl": 0.10009765625, + "learning_rate": 4.0821917808219176e-07, + "loss": 0.004, + "reward": 1.805654764175415, + "reward_std": 0.10066970996558666, + "rewards/accuracy_reward": 0.8134672045707703, + "rewards/format_reward": 0.9921875, + "step": 1296 + }, + { + "completion_length": 73.6875, + "epoch": 5.922374429223744, + "grad_norm": 2.501546859741211, + "kl": 0.1171875, + "learning_rate": 4.077625570776256e-07, + "loss": 0.0047, + "reward": 1.7920072674751282, + "reward_std": 0.11444094032049179, + "rewards/accuracy_reward": 0.7920072078704834, + "rewards/format_reward": 1.0, + "step": 1297 + }, + { + "completion_length": 113.390625, + "epoch": 5.926940639269406, + "grad_norm": 1.9746527671813965, + "kl": 0.05908203125, + "learning_rate": 4.0730593607305933e-07, + "loss": 0.0024, + "reward": 1.830468773841858, + "reward_std": 0.17702769488096237, + "rewards/accuracy_reward": 0.8617186546325684, + "rewards/format_reward": 0.96875, + "step": 1298 + }, + { + "completion_length": 89.3203125, + "epoch": 5.931506849315069, + "grad_norm": 1.95270574092865, + "kl": 0.118896484375, + "learning_rate": 4.068493150684931e-07, + "loss": 0.0048, + "reward": 1.784375011920929, + "reward_std": 0.11048543453216553, + "rewards/accuracy_reward": 0.784375011920929, + "rewards/format_reward": 1.0, + "step": 1299 + }, + { + "completion_length": 80.3359375, + "epoch": 5.936073059360731, + "grad_norm": 2.471496820449829, + "kl": 0.116943359375, + "learning_rate": 4.063926940639269e-07, + "loss": 0.0047, + "reward": 1.7593750357627869, + "reward_std": 0.11229449138045311, + "rewards/accuracy_reward": 0.7671874761581421, + "rewards/format_reward": 0.9921875, + "step": 1300 + }, + { + "completion_length": 82.578125, + "epoch": 5.940639269406392, + "grad_norm": 2.451826572418213, + "kl": 0.15087890625, + "learning_rate": 4.0593607305936075e-07, + "loss": 0.006, + "reward": 1.713808834552765, + "reward_std": 0.16193728893995285, + "rewards/accuracy_reward": 0.7216213047504425, + "rewards/format_reward": 0.9921875, + "step": 1301 + }, + { + "completion_length": 68.0234375, + "epoch": 5.945205479452055, + "grad_norm": 1.9107226133346558, + "kl": 0.087890625, + "learning_rate": 4.054794520547945e-07, + "loss": 0.0035, + "reward": 1.7034350037574768, + "reward_std": 0.09215648844838142, + "rewards/accuracy_reward": 0.7034350335597992, + "rewards/format_reward": 1.0, + "step": 1302 + }, + { + "completion_length": 87.0234375, + "epoch": 5.949771689497717, + "grad_norm": 1.9171605110168457, + "kl": 0.104736328125, + "learning_rate": 4.0502283105022827e-07, + "loss": 0.0042, + "reward": 1.7768229246139526, + "reward_std": 0.09278659522533417, + "rewards/accuracy_reward": 0.7768228650093079, + "rewards/format_reward": 1.0, + "step": 1303 + }, + { + "completion_length": 65.109375, + "epoch": 5.954337899543379, + "grad_norm": 2.9926059246063232, + "kl": 0.1611328125, + "learning_rate": 4.045662100456621e-07, + "loss": 0.0064, + "reward": 1.6825549602508545, + "reward_std": 0.27830731868743896, + "rewards/accuracy_reward": 0.7216174304485321, + "rewards/format_reward": 0.9609375, + "step": 1304 + }, + { + "completion_length": 73.109375, + "epoch": 5.958904109589041, + "grad_norm": 7.7803120613098145, + "kl": 0.1435546875, + "learning_rate": 4.041095890410959e-07, + "loss": 0.0057, + "reward": 1.7353760600090027, + "reward_std": 0.16425937414169312, + "rewards/accuracy_reward": 0.7431885600090027, + "rewards/format_reward": 0.9921875, + "step": 1305 + }, + { + "completion_length": 83.65625, + "epoch": 5.963470319634704, + "grad_norm": 2.513974905014038, + "kl": 0.105712890625, + "learning_rate": 4.0365296803652963e-07, + "loss": 0.0042, + "reward": 1.8269480466842651, + "reward_std": 0.1296938955783844, + "rewards/accuracy_reward": 0.8347605168819427, + "rewards/format_reward": 0.9921875, + "step": 1306 + }, + { + "completion_length": 61.1328125, + "epoch": 5.968036529680365, + "grad_norm": 3.1066365242004395, + "kl": 0.169921875, + "learning_rate": 4.0319634703196347e-07, + "loss": 0.0068, + "reward": 1.8063979148864746, + "reward_std": 0.19071058928966522, + "rewards/accuracy_reward": 0.8220228552818298, + "rewards/format_reward": 0.984375, + "step": 1307 + }, + { + "completion_length": 81.75, + "epoch": 5.972602739726027, + "grad_norm": 6.998978614807129, + "kl": 0.154296875, + "learning_rate": 4.0273972602739726e-07, + "loss": 0.0062, + "reward": 1.7838542461395264, + "reward_std": 0.13565516471862793, + "rewards/accuracy_reward": 0.7916666567325592, + "rewards/format_reward": 0.9921875, + "step": 1308 + }, + { + "completion_length": 65.1953125, + "epoch": 5.9771689497716896, + "grad_norm": 2.565167188644409, + "kl": 0.11279296875, + "learning_rate": 4.0228310502283105e-07, + "loss": 0.0045, + "reward": 1.685937523841858, + "reward_std": 0.15467960759997368, + "rewards/accuracy_reward": 0.6937499940395355, + "rewards/format_reward": 0.9921875, + "step": 1309 + }, + { + "completion_length": 84.0703125, + "epoch": 5.981735159817352, + "grad_norm": 4.1508073806762695, + "kl": 0.099365234375, + "learning_rate": 4.018264840182648e-07, + "loss": 0.004, + "reward": 1.706423580646515, + "reward_std": 0.1783130094408989, + "rewards/accuracy_reward": 0.7220486104488373, + "rewards/format_reward": 0.984375, + "step": 1310 + }, + { + "completion_length": 74.0703125, + "epoch": 5.986301369863014, + "grad_norm": 4.801496505737305, + "kl": 0.17236328125, + "learning_rate": 4.013698630136986e-07, + "loss": 0.0069, + "reward": 1.8074793815612793, + "reward_std": 0.14287326484918594, + "rewards/accuracy_reward": 0.8152918219566345, + "rewards/format_reward": 0.9921875, + "step": 1311 + }, + { + "completion_length": 101.0390625, + "epoch": 5.9908675799086755, + "grad_norm": 3.006030797958374, + "kl": 0.08642578125, + "learning_rate": 4.009132420091324e-07, + "loss": 0.0035, + "reward": 1.8205991387367249, + "reward_std": 0.14634042605757713, + "rewards/accuracy_reward": 0.8284116387367249, + "rewards/format_reward": 0.9921875, + "step": 1312 + }, + { + "completion_length": 70.0078125, + "epoch": 5.995433789954338, + "grad_norm": 3.449769973754883, + "kl": 0.13720703125, + "learning_rate": 4.004566210045662e-07, + "loss": 0.0055, + "reward": 1.651562511920929, + "reward_std": 0.21917617321014404, + "rewards/accuracy_reward": 0.659375011920929, + "rewards/format_reward": 0.9921875, + "step": 1313 + }, + { + "completion_length": 44.0, + "epoch": 6.0, + "grad_norm": 3.673351287841797, + "kl": 0.1494140625, + "learning_rate": 4e-07, + "loss": 0.0045, + "reward": 1.625, + "reward_std": 0.408231720328331, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 1314 + }, + { + "completion_length": 84.4453125, + "epoch": 6.004566210045662, + "grad_norm": 1.5486712455749512, + "kl": 0.10546875, + "learning_rate": 3.9954337899543377e-07, + "loss": 0.0042, + "reward": 1.8225446343421936, + "reward_std": 0.06181819178164005, + "rewards/accuracy_reward": 0.8225446343421936, + "rewards/format_reward": 1.0, + "step": 1315 + }, + { + "completion_length": 87.6484375, + "epoch": 6.0091324200913245, + "grad_norm": 2.9046413898468018, + "kl": 0.106201171875, + "learning_rate": 3.9908675799086756e-07, + "loss": 0.0042, + "reward": 1.8345487117767334, + "reward_std": 0.1487816534936428, + "rewards/accuracy_reward": 0.8501735925674438, + "rewards/format_reward": 0.984375, + "step": 1316 + }, + { + "completion_length": 91.9453125, + "epoch": 6.013698630136986, + "grad_norm": 3.910123348236084, + "kl": 0.083251953125, + "learning_rate": 3.9863013698630134e-07, + "loss": 0.0033, + "reward": 1.6695313453674316, + "reward_std": 0.2122296392917633, + "rewards/accuracy_reward": 0.6851562261581421, + "rewards/format_reward": 0.984375, + "step": 1317 + }, + { + "completion_length": 71.890625, + "epoch": 6.018264840182648, + "grad_norm": 3.065825939178467, + "kl": 0.124755859375, + "learning_rate": 3.9817351598173513e-07, + "loss": 0.005, + "reward": 1.6242188215255737, + "reward_std": 0.14951937273144722, + "rewards/accuracy_reward": 0.624218761920929, + "rewards/format_reward": 1.0, + "step": 1318 + }, + { + "completion_length": 97.1875, + "epoch": 6.0228310502283104, + "grad_norm": 1.6381162405014038, + "kl": 0.10986328125, + "learning_rate": 3.977168949771689e-07, + "loss": 0.0044, + "reward": 1.8406250476837158, + "reward_std": 0.14283224940299988, + "rewards/accuracy_reward": 0.8562499284744263, + "rewards/format_reward": 0.984375, + "step": 1319 + }, + { + "completion_length": 74.859375, + "epoch": 6.027397260273973, + "grad_norm": 4.058730125427246, + "kl": 0.13037109375, + "learning_rate": 3.972602739726027e-07, + "loss": 0.0052, + "reward": 1.7753472328186035, + "reward_std": 0.11618576943874359, + "rewards/accuracy_reward": 0.7909722030162811, + "rewards/format_reward": 0.984375, + "step": 1320 + }, + { + "completion_length": 75.2109375, + "epoch": 6.031963470319635, + "grad_norm": 2.673145055770874, + "kl": 0.103271484375, + "learning_rate": 3.9680365296803655e-07, + "loss": 0.0041, + "reward": 1.8099148869514465, + "reward_std": 0.0990208312869072, + "rewards/accuracy_reward": 0.8177272975444794, + "rewards/format_reward": 0.9921875, + "step": 1321 + }, + { + "completion_length": 70.84375, + "epoch": 6.036529680365296, + "grad_norm": 4.576772689819336, + "kl": 0.12890625, + "learning_rate": 3.963470319634703e-07, + "loss": 0.0052, + "reward": 1.7658482193946838, + "reward_std": 0.1474735364317894, + "rewards/accuracy_reward": 0.7814731895923615, + "rewards/format_reward": 0.984375, + "step": 1322 + }, + { + "completion_length": 75.84375, + "epoch": 6.041095890410959, + "grad_norm": 3.1631548404693604, + "kl": 0.1611328125, + "learning_rate": 3.9589041095890407e-07, + "loss": 0.0064, + "reward": 1.6473042964935303, + "reward_std": 0.17740929126739502, + "rewards/accuracy_reward": 0.6473042666912079, + "rewards/format_reward": 1.0, + "step": 1323 + }, + { + "completion_length": 71.1640625, + "epoch": 6.045662100456621, + "grad_norm": 2.176661968231201, + "kl": 0.1416015625, + "learning_rate": 3.954337899543379e-07, + "loss": 0.0057, + "reward": 1.8772332668304443, + "reward_std": 0.07372352294623852, + "rewards/accuracy_reward": 0.8772332966327667, + "rewards/format_reward": 1.0, + "step": 1324 + }, + { + "completion_length": 80.1328125, + "epoch": 6.050228310502283, + "grad_norm": 2.169936180114746, + "kl": 0.12109375, + "learning_rate": 3.949771689497717e-07, + "loss": 0.0048, + "reward": 1.6571251153945923, + "reward_std": 0.13837599009275436, + "rewards/accuracy_reward": 0.6571250259876251, + "rewards/format_reward": 1.0, + "step": 1325 + }, + { + "completion_length": 83.4375, + "epoch": 6.054794520547945, + "grad_norm": 2.906425714492798, + "kl": 0.15966796875, + "learning_rate": 3.9452054794520543e-07, + "loss": 0.0064, + "reward": 1.87109375, + "reward_std": 0.17256292700767517, + "rewards/accuracy_reward": 0.88671875, + "rewards/format_reward": 0.984375, + "step": 1326 + }, + { + "completion_length": 69.375, + "epoch": 6.059360730593608, + "grad_norm": 2.7249135971069336, + "kl": 0.197998046875, + "learning_rate": 3.940639269406392e-07, + "loss": 0.0079, + "reward": 1.780989646911621, + "reward_std": 0.19701316952705383, + "rewards/accuracy_reward": 0.7888020575046539, + "rewards/format_reward": 0.9921875, + "step": 1327 + }, + { + "completion_length": 85.78125, + "epoch": 6.063926940639269, + "grad_norm": 2.412091016769409, + "kl": 0.125732421875, + "learning_rate": 3.9360730593607306e-07, + "loss": 0.005, + "reward": 1.6776041984558105, + "reward_std": 0.15093514323234558, + "rewards/accuracy_reward": 0.6932291090488434, + "rewards/format_reward": 0.984375, + "step": 1328 + }, + { + "completion_length": 67.96875, + "epoch": 6.068493150684931, + "grad_norm": 2.928151845932007, + "kl": 0.14501953125, + "learning_rate": 3.9315068493150684e-07, + "loss": 0.0058, + "reward": 1.7759548425674438, + "reward_std": 0.15794897824525833, + "rewards/accuracy_reward": 0.7759548723697662, + "rewards/format_reward": 1.0, + "step": 1329 + }, + { + "completion_length": 94.921875, + "epoch": 6.073059360730594, + "grad_norm": 1.4851278066635132, + "kl": 0.093994140625, + "learning_rate": 3.926940639269406e-07, + "loss": 0.0038, + "reward": 1.8491500616073608, + "reward_std": 0.14507145062088966, + "rewards/accuracy_reward": 0.8725875020027161, + "rewards/format_reward": 0.9765625, + "step": 1330 + }, + { + "completion_length": 78.6484375, + "epoch": 6.077625570776256, + "grad_norm": 2.765091896057129, + "kl": 0.112548828125, + "learning_rate": 3.922374429223744e-07, + "loss": 0.0045, + "reward": 1.8410881161689758, + "reward_std": 0.14643773436546326, + "rewards/accuracy_reward": 0.864525556564331, + "rewards/format_reward": 0.9765625, + "step": 1331 + }, + { + "completion_length": 86.4375, + "epoch": 6.082191780821918, + "grad_norm": 3.7591657638549805, + "kl": 0.1103515625, + "learning_rate": 3.917808219178082e-07, + "loss": 0.0044, + "reward": 1.7199653387069702, + "reward_std": 0.1599731780588627, + "rewards/accuracy_reward": 0.7355902194976807, + "rewards/format_reward": 0.984375, + "step": 1332 + }, + { + "completion_length": 88.1015625, + "epoch": 6.0867579908675795, + "grad_norm": 2.042999267578125, + "kl": 0.102783203125, + "learning_rate": 3.91324200913242e-07, + "loss": 0.0041, + "reward": 1.8294270634651184, + "reward_std": 0.09024603478610516, + "rewards/accuracy_reward": 0.8372394740581512, + "rewards/format_reward": 0.9921875, + "step": 1333 + }, + { + "completion_length": 96.703125, + "epoch": 6.091324200913242, + "grad_norm": 1.6703046560287476, + "kl": 0.14990234375, + "learning_rate": 3.908675799086758e-07, + "loss": 0.006, + "reward": 1.795138955116272, + "reward_std": 0.09230764210224152, + "rewards/accuracy_reward": 0.8029513657093048, + "rewards/format_reward": 0.9921875, + "step": 1334 + }, + { + "completion_length": 84.5078125, + "epoch": 6.095890410958904, + "grad_norm": 1.742505669593811, + "kl": 0.10205078125, + "learning_rate": 3.9041095890410957e-07, + "loss": 0.0041, + "reward": 1.6895833611488342, + "reward_std": 0.2131306305527687, + "rewards/accuracy_reward": 0.7130208313465118, + "rewards/format_reward": 0.9765625, + "step": 1335 + }, + { + "completion_length": 86.1484375, + "epoch": 6.100456621004566, + "grad_norm": 12.497916221618652, + "kl": 0.13671875, + "learning_rate": 3.8995433789954336e-07, + "loss": 0.0055, + "reward": 1.6923341751098633, + "reward_std": 0.17974907904863358, + "rewards/accuracy_reward": 0.7001466453075409, + "rewards/format_reward": 0.9921875, + "step": 1336 + }, + { + "completion_length": 77.3359375, + "epoch": 6.105022831050229, + "grad_norm": 4.783447742462158, + "kl": 0.1591796875, + "learning_rate": 3.8949771689497714e-07, + "loss": 0.0064, + "reward": 1.6346808671951294, + "reward_std": 0.1741974987089634, + "rewards/accuracy_reward": 0.642493337392807, + "rewards/format_reward": 0.9921875, + "step": 1337 + }, + { + "completion_length": 85.03125, + "epoch": 6.109589041095891, + "grad_norm": 3.4984941482543945, + "kl": 0.140625, + "learning_rate": 3.8904109589041093e-07, + "loss": 0.0056, + "reward": 1.6837969422340393, + "reward_std": 0.16627999395132065, + "rewards/accuracy_reward": 0.6916094422340393, + "rewards/format_reward": 0.9921875, + "step": 1338 + }, + { + "completion_length": 95.484375, + "epoch": 6.114155251141552, + "grad_norm": 3.724400520324707, + "kl": 0.084228515625, + "learning_rate": 3.885844748858447e-07, + "loss": 0.0034, + "reward": 1.7506882548332214, + "reward_std": 0.16492830961942673, + "rewards/accuracy_reward": 0.7741256952285767, + "rewards/format_reward": 0.9765625, + "step": 1339 + }, + { + "completion_length": 84.8359375, + "epoch": 6.1187214611872145, + "grad_norm": 3.958569049835205, + "kl": 0.1142578125, + "learning_rate": 3.881278538812785e-07, + "loss": 0.0046, + "reward": 1.8411458730697632, + "reward_std": 0.12668996676802635, + "rewards/accuracy_reward": 0.8489583134651184, + "rewards/format_reward": 0.9921875, + "step": 1340 + }, + { + "completion_length": 64.8828125, + "epoch": 6.123287671232877, + "grad_norm": 7.010223388671875, + "kl": 0.1708984375, + "learning_rate": 3.8767123287671235e-07, + "loss": 0.0068, + "reward": 1.796571135520935, + "reward_std": 0.147329680621624, + "rewards/accuracy_reward": 0.8043836355209351, + "rewards/format_reward": 0.9921875, + "step": 1341 + }, + { + "completion_length": 51.1953125, + "epoch": 6.127853881278539, + "grad_norm": 1.703456163406372, + "kl": 0.18798828125, + "learning_rate": 3.872146118721461e-07, + "loss": 0.0075, + "reward": 1.7231026887893677, + "reward_std": 0.15178490430116653, + "rewards/accuracy_reward": 0.7231026887893677, + "rewards/format_reward": 1.0, + "step": 1342 + }, + { + "completion_length": 83.125, + "epoch": 6.132420091324201, + "grad_norm": 2.8786849975585938, + "kl": 0.12744140625, + "learning_rate": 3.8675799086757987e-07, + "loss": 0.0051, + "reward": 1.6835938096046448, + "reward_std": 0.15046585351228714, + "rewards/accuracy_reward": 0.6835937201976776, + "rewards/format_reward": 1.0, + "step": 1343 + }, + { + "completion_length": 84.7109375, + "epoch": 6.136986301369863, + "grad_norm": 3.438633680343628, + "kl": 0.1298828125, + "learning_rate": 3.863013698630137e-07, + "loss": 0.0052, + "reward": 1.8140625357627869, + "reward_std": 0.15467960387468338, + "rewards/accuracy_reward": 0.8140624463558197, + "rewards/format_reward": 1.0, + "step": 1344 + }, + { + "completion_length": 76.953125, + "epoch": 6.141552511415525, + "grad_norm": 5.209409236907959, + "kl": 0.1064453125, + "learning_rate": 3.858447488584475e-07, + "loss": 0.0043, + "reward": 1.6888335943222046, + "reward_std": 0.1942080445587635, + "rewards/accuracy_reward": 0.696646124124527, + "rewards/format_reward": 0.9921875, + "step": 1345 + }, + { + "completion_length": 75.7578125, + "epoch": 6.146118721461187, + "grad_norm": 1.846853494644165, + "kl": 0.115966796875, + "learning_rate": 3.8538812785388123e-07, + "loss": 0.0046, + "reward": 1.7150331735610962, + "reward_std": 0.19362305849790573, + "rewards/accuracy_reward": 0.7306581139564514, + "rewards/format_reward": 0.984375, + "step": 1346 + }, + { + "completion_length": 91.0390625, + "epoch": 6.1506849315068495, + "grad_norm": 1.0356731414794922, + "kl": 0.11962890625, + "learning_rate": 3.84931506849315e-07, + "loss": 0.0048, + "reward": 1.8451389074325562, + "reward_std": 0.0736747458577156, + "rewards/accuracy_reward": 0.852951318025589, + "rewards/format_reward": 0.9921875, + "step": 1347 + }, + { + "completion_length": 69.703125, + "epoch": 6.155251141552512, + "grad_norm": 1.3840466737747192, + "kl": 0.126953125, + "learning_rate": 3.8447488584474886e-07, + "loss": 0.0051, + "reward": 1.8907551765441895, + "reward_std": 0.06240340322256088, + "rewards/accuracy_reward": 0.8907552063465118, + "rewards/format_reward": 1.0, + "step": 1348 + }, + { + "completion_length": 90.1171875, + "epoch": 6.159817351598173, + "grad_norm": 1.7931628227233887, + "kl": 0.091064453125, + "learning_rate": 3.8401826484018264e-07, + "loss": 0.0036, + "reward": 1.792187511920929, + "reward_std": 0.10482378304004669, + "rewards/accuracy_reward": 0.7999999225139618, + "rewards/format_reward": 0.9921875, + "step": 1349 + }, + { + "completion_length": 93.625, + "epoch": 6.164383561643835, + "grad_norm": 2.3507261276245117, + "kl": 0.12060546875, + "learning_rate": 3.835616438356164e-07, + "loss": 0.0048, + "reward": 1.7338955998420715, + "reward_std": 0.16754615679383278, + "rewards/accuracy_reward": 0.7573330104351044, + "rewards/format_reward": 0.9765625, + "step": 1350 + }, + { + "completion_length": 62.859375, + "epoch": 6.168949771689498, + "grad_norm": 2.7926764488220215, + "kl": 0.1259765625, + "learning_rate": 3.831050228310502e-07, + "loss": 0.005, + "reward": 1.92203950881958, + "reward_std": 0.05836756294593215, + "rewards/accuracy_reward": 0.9220394492149353, + "rewards/format_reward": 1.0, + "step": 1351 + }, + { + "completion_length": 72.953125, + "epoch": 6.17351598173516, + "grad_norm": 1.8602731227874756, + "kl": 0.120849609375, + "learning_rate": 3.82648401826484e-07, + "loss": 0.0048, + "reward": 1.8530598878860474, + "reward_std": 0.07507604733109474, + "rewards/accuracy_reward": 0.860872358083725, + "rewards/format_reward": 0.9921875, + "step": 1352 + }, + { + "completion_length": 89.671875, + "epoch": 6.178082191780822, + "grad_norm": 1.9812475442886353, + "kl": 0.107421875, + "learning_rate": 3.821917808219178e-07, + "loss": 0.0043, + "reward": 1.8095996975898743, + "reward_std": 0.07891392894089222, + "rewards/accuracy_reward": 0.8095996379852295, + "rewards/format_reward": 1.0, + "step": 1353 + }, + { + "completion_length": 84.40625, + "epoch": 6.182648401826484, + "grad_norm": 2.6219146251678467, + "kl": 0.110107421875, + "learning_rate": 3.817351598173516e-07, + "loss": 0.0044, + "reward": 1.7649739980697632, + "reward_std": 0.13717181608080864, + "rewards/accuracy_reward": 0.7805989384651184, + "rewards/format_reward": 0.984375, + "step": 1354 + }, + { + "completion_length": 85.2265625, + "epoch": 6.187214611872146, + "grad_norm": 1.6997660398483276, + "kl": 0.078857421875, + "learning_rate": 3.8127853881278537e-07, + "loss": 0.0031, + "reward": 1.7958333492279053, + "reward_std": 0.07365695387125015, + "rewards/accuracy_reward": 0.7958332598209381, + "rewards/format_reward": 1.0, + "step": 1355 + }, + { + "completion_length": 68.3828125, + "epoch": 6.191780821917808, + "grad_norm": 2.4636871814727783, + "kl": 0.140380859375, + "learning_rate": 3.8082191780821916e-07, + "loss": 0.0056, + "reward": 1.638918399810791, + "reward_std": 0.1541249379515648, + "rewards/accuracy_reward": 0.6467308402061462, + "rewards/format_reward": 0.9921875, + "step": 1356 + }, + { + "completion_length": 96.0078125, + "epoch": 6.19634703196347, + "grad_norm": 2.066955804824829, + "kl": 0.08056640625, + "learning_rate": 3.8036529680365294e-07, + "loss": 0.0032, + "reward": 1.7703125476837158, + "reward_std": 0.11048543080687523, + "rewards/accuracy_reward": 0.7781248986721039, + "rewards/format_reward": 0.9921875, + "step": 1357 + }, + { + "completion_length": 87.1640625, + "epoch": 6.200913242009133, + "grad_norm": 1.8385370969772339, + "kl": 0.125, + "learning_rate": 3.7990867579908673e-07, + "loss": 0.005, + "reward": 1.7934895753860474, + "reward_std": 0.16230732202529907, + "rewards/accuracy_reward": 0.809114545583725, + "rewards/format_reward": 0.984375, + "step": 1358 + }, + { + "completion_length": 79.4140625, + "epoch": 6.205479452054795, + "grad_norm": 1.4888020753860474, + "kl": 0.1396484375, + "learning_rate": 3.794520547945205e-07, + "loss": 0.0056, + "reward": 1.8671875596046448, + "reward_std": 0.08838834427297115, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 0.9921875, + "step": 1359 + }, + { + "completion_length": 81.5859375, + "epoch": 6.210045662100456, + "grad_norm": 3.295711040496826, + "kl": 0.159423828125, + "learning_rate": 3.789954337899543e-07, + "loss": 0.0064, + "reward": 1.7882669568061829, + "reward_std": 0.14686734601855278, + "rewards/accuracy_reward": 0.7960793972015381, + "rewards/format_reward": 0.9921875, + "step": 1360 + }, + { + "completion_length": 70.3828125, + "epoch": 6.2146118721461185, + "grad_norm": 4.181000709533691, + "kl": 0.1220703125, + "learning_rate": 3.7853881278538814e-07, + "loss": 0.0049, + "reward": 1.6776198744773865, + "reward_std": 0.20163647830486298, + "rewards/accuracy_reward": 0.6854324042797089, + "rewards/format_reward": 0.9921875, + "step": 1361 + }, + { + "completion_length": 74.453125, + "epoch": 6.219178082191781, + "grad_norm": 2.4070448875427246, + "kl": 0.123779296875, + "learning_rate": 3.780821917808219e-07, + "loss": 0.0049, + "reward": 1.810937523841858, + "reward_std": 0.1541428193449974, + "rewards/accuracy_reward": 0.8265624046325684, + "rewards/format_reward": 0.984375, + "step": 1362 + }, + { + "completion_length": 73.96875, + "epoch": 6.223744292237443, + "grad_norm": 4.166317939758301, + "kl": 0.16796875, + "learning_rate": 3.7762557077625567e-07, + "loss": 0.0067, + "reward": 1.6138020753860474, + "reward_std": 0.25969336926937103, + "rewards/accuracy_reward": 0.6294271051883698, + "rewards/format_reward": 0.984375, + "step": 1363 + }, + { + "completion_length": 74.0859375, + "epoch": 6.228310502283105, + "grad_norm": 9.133631706237793, + "kl": 0.107421875, + "learning_rate": 3.771689497716895e-07, + "loss": 0.0043, + "reward": 1.8240530490875244, + "reward_std": 0.12234986200928688, + "rewards/accuracy_reward": 0.8318654298782349, + "rewards/format_reward": 0.9921875, + "step": 1364 + }, + { + "completion_length": 86.6328125, + "epoch": 6.232876712328767, + "grad_norm": 2.3225855827331543, + "kl": 0.19580078125, + "learning_rate": 3.767123287671233e-07, + "loss": 0.0078, + "reward": 1.7432359457015991, + "reward_std": 0.20243436098098755, + "rewards/accuracy_reward": 0.7666734158992767, + "rewards/format_reward": 0.9765625, + "step": 1365 + }, + { + "completion_length": 75.84375, + "epoch": 6.237442922374429, + "grad_norm": 4.120067596435547, + "kl": 0.1038818359375, + "learning_rate": 3.7625570776255703e-07, + "loss": 0.0042, + "reward": 1.7382813096046448, + "reward_std": 0.1538807973265648, + "rewards/accuracy_reward": 0.74609375, + "rewards/format_reward": 0.9921875, + "step": 1366 + }, + { + "completion_length": 71.75, + "epoch": 6.242009132420091, + "grad_norm": 2.138282537460327, + "kl": 0.1630859375, + "learning_rate": 3.757990867579908e-07, + "loss": 0.0065, + "reward": 1.7145547270774841, + "reward_std": 0.18507731705904007, + "rewards/accuracy_reward": 0.722367137670517, + "rewards/format_reward": 0.9921875, + "step": 1367 + }, + { + "completion_length": 66.390625, + "epoch": 6.2465753424657535, + "grad_norm": 2.4669651985168457, + "kl": 0.20654296875, + "learning_rate": 3.7534246575342466e-07, + "loss": 0.0083, + "reward": 1.818272590637207, + "reward_std": 0.11263852939009666, + "rewards/accuracy_reward": 0.8260850310325623, + "rewards/format_reward": 0.9921875, + "step": 1368 + }, + { + "completion_length": 100.6875, + "epoch": 6.251141552511416, + "grad_norm": 1.6667773723602295, + "kl": 0.113037109375, + "learning_rate": 3.7488584474885844e-07, + "loss": 0.0045, + "reward": 1.682812511920929, + "reward_std": 0.14389308914542198, + "rewards/accuracy_reward": 0.7062499523162842, + "rewards/format_reward": 0.9765625, + "step": 1369 + }, + { + "completion_length": 58.890625, + "epoch": 6.255707762557078, + "grad_norm": 4.199071884155273, + "kl": 0.122314453125, + "learning_rate": 3.744292237442922e-07, + "loss": 0.0049, + "reward": 1.8065972328186035, + "reward_std": 0.10062464885413647, + "rewards/accuracy_reward": 0.8065972328186035, + "rewards/format_reward": 1.0, + "step": 1370 + }, + { + "completion_length": 86.0625, + "epoch": 6.260273972602739, + "grad_norm": 4.296463966369629, + "kl": 0.129150390625, + "learning_rate": 3.73972602739726e-07, + "loss": 0.0052, + "reward": 1.7294270992279053, + "reward_std": 0.1873747929930687, + "rewards/accuracy_reward": 0.7372395694255829, + "rewards/format_reward": 0.9921875, + "step": 1371 + }, + { + "completion_length": 70.609375, + "epoch": 6.264840182648402, + "grad_norm": 2.9196784496307373, + "kl": 0.138671875, + "learning_rate": 3.735159817351598e-07, + "loss": 0.0055, + "reward": 1.7803341746330261, + "reward_std": 0.09776799008250237, + "rewards/accuracy_reward": 0.7803341746330261, + "rewards/format_reward": 1.0, + "step": 1372 + }, + { + "completion_length": 89.875, + "epoch": 6.269406392694064, + "grad_norm": 2.074902296066284, + "kl": 0.11474609375, + "learning_rate": 3.730593607305936e-07, + "loss": 0.0046, + "reward": 1.7255208492279053, + "reward_std": 0.18392051756381989, + "rewards/accuracy_reward": 0.7411458194255829, + "rewards/format_reward": 0.984375, + "step": 1373 + }, + { + "completion_length": 75.140625, + "epoch": 6.273972602739726, + "grad_norm": 4.484436511993408, + "kl": 0.156005859375, + "learning_rate": 3.726027397260274e-07, + "loss": 0.0062, + "reward": 1.7850632667541504, + "reward_std": 0.14479004591703415, + "rewards/accuracy_reward": 0.792875736951828, + "rewards/format_reward": 0.9921875, + "step": 1374 + }, + { + "completion_length": 92.6640625, + "epoch": 6.2785388127853885, + "grad_norm": 3.3195595741271973, + "kl": 0.1070556640625, + "learning_rate": 3.7214611872146117e-07, + "loss": 0.0043, + "reward": 1.80859375, + "reward_std": 0.1387247210368514, + "rewards/accuracy_reward": 0.8164062201976776, + "rewards/format_reward": 0.9921875, + "step": 1375 + }, + { + "completion_length": 87.1875, + "epoch": 6.28310502283105, + "grad_norm": 4.265213966369629, + "kl": 0.153564453125, + "learning_rate": 3.7168949771689495e-07, + "loss": 0.0061, + "reward": 1.7935296893119812, + "reward_std": 0.1823706291615963, + "rewards/accuracy_reward": 0.8325920403003693, + "rewards/format_reward": 0.9609375, + "step": 1376 + }, + { + "completion_length": 84.71875, + "epoch": 6.287671232876712, + "grad_norm": 1.7530524730682373, + "kl": 0.115234375, + "learning_rate": 3.7123287671232874e-07, + "loss": 0.0046, + "reward": 1.7466146349906921, + "reward_std": 0.1300597358494997, + "rewards/accuracy_reward": 0.7622395753860474, + "rewards/format_reward": 0.984375, + "step": 1377 + }, + { + "completion_length": 92.078125, + "epoch": 6.292237442922374, + "grad_norm": 2.0412070751190186, + "kl": 0.1259765625, + "learning_rate": 3.7077625570776253e-07, + "loss": 0.005, + "reward": 1.8295753002166748, + "reward_std": 0.17172623425722122, + "rewards/accuracy_reward": 0.8608251810073853, + "rewards/format_reward": 0.96875, + "step": 1378 + }, + { + "completion_length": 53.4375, + "epoch": 6.296803652968037, + "grad_norm": 2.757150888442993, + "kl": 0.140625, + "learning_rate": 3.703196347031963e-07, + "loss": 0.0056, + "reward": 1.701302170753479, + "reward_std": 0.29356005787849426, + "rewards/accuracy_reward": 0.7403645813465118, + "rewards/format_reward": 0.9609375, + "step": 1379 + }, + { + "completion_length": 98.2109375, + "epoch": 6.301369863013699, + "grad_norm": 2.7139008045196533, + "kl": 0.07958984375, + "learning_rate": 3.698630136986301e-07, + "loss": 0.0032, + "reward": 1.783593773841858, + "reward_std": 0.09553372114896774, + "rewards/accuracy_reward": 0.7835937440395355, + "rewards/format_reward": 1.0, + "step": 1380 + }, + { + "completion_length": 90.40625, + "epoch": 6.30593607305936, + "grad_norm": 3.6404201984405518, + "kl": 0.105712890625, + "learning_rate": 3.6940639269406394e-07, + "loss": 0.0042, + "reward": 1.746897280216217, + "reward_std": 0.18012897670269012, + "rewards/accuracy_reward": 0.7625222206115723, + "rewards/format_reward": 0.984375, + "step": 1381 + }, + { + "completion_length": 70.71875, + "epoch": 6.310502283105023, + "grad_norm": 2.44050931930542, + "kl": 0.1953125, + "learning_rate": 3.689497716894977e-07, + "loss": 0.0078, + "reward": 1.7049107551574707, + "reward_std": 0.14146586507558823, + "rewards/accuracy_reward": 0.7127232253551483, + "rewards/format_reward": 0.9921875, + "step": 1382 + }, + { + "completion_length": 84.0859375, + "epoch": 6.315068493150685, + "grad_norm": 2.908999443054199, + "kl": 0.129150390625, + "learning_rate": 3.6849315068493147e-07, + "loss": 0.0052, + "reward": 1.866501271724701, + "reward_std": 0.14921535179018974, + "rewards/accuracy_reward": 0.8743137121200562, + "rewards/format_reward": 0.9921875, + "step": 1383 + }, + { + "completion_length": 66.9609375, + "epoch": 6.319634703196347, + "grad_norm": 7.363396644592285, + "kl": 0.155517578125, + "learning_rate": 3.680365296803653e-07, + "loss": 0.0062, + "reward": 1.73046875, + "reward_std": 0.14824316650629044, + "rewards/accuracy_reward": 0.7304687201976776, + "rewards/format_reward": 1.0, + "step": 1384 + }, + { + "completion_length": 63.6875, + "epoch": 6.324200913242009, + "grad_norm": 2.700493097305298, + "kl": 0.153564453125, + "learning_rate": 3.675799086757991e-07, + "loss": 0.0061, + "reward": 1.7351128458976746, + "reward_std": 0.1287803202867508, + "rewards/accuracy_reward": 0.7429253458976746, + "rewards/format_reward": 0.9921875, + "step": 1385 + }, + { + "completion_length": 77.515625, + "epoch": 6.328767123287671, + "grad_norm": 4.386509895324707, + "kl": 0.140869140625, + "learning_rate": 3.6712328767123283e-07, + "loss": 0.0056, + "reward": 1.6953125596046448, + "reward_std": 0.20576493442058563, + "rewards/accuracy_reward": 0.7187499701976776, + "rewards/format_reward": 0.9765625, + "step": 1386 + }, + { + "completion_length": 69.765625, + "epoch": 6.333333333333333, + "grad_norm": 2.0167596340179443, + "kl": 0.117431640625, + "learning_rate": 3.666666666666666e-07, + "loss": 0.0047, + "reward": 1.7052951455116272, + "reward_std": 0.12893803417682648, + "rewards/accuracy_reward": 0.7052951157093048, + "rewards/format_reward": 1.0, + "step": 1387 + }, + { + "completion_length": 90.6875, + "epoch": 6.337899543378995, + "grad_norm": 4.069055557250977, + "kl": 0.095458984375, + "learning_rate": 3.6621004566210046e-07, + "loss": 0.0038, + "reward": 1.753125011920929, + "reward_std": 0.1462521031498909, + "rewards/accuracy_reward": 0.7531249523162842, + "rewards/format_reward": 1.0, + "step": 1388 + }, + { + "completion_length": 85.3671875, + "epoch": 6.342465753424658, + "grad_norm": 1.5775866508483887, + "kl": 0.1103515625, + "learning_rate": 3.6575342465753424e-07, + "loss": 0.0044, + "reward": 1.7893466353416443, + "reward_std": 0.13156647235155106, + "rewards/accuracy_reward": 0.8127840459346771, + "rewards/format_reward": 0.9765625, + "step": 1389 + }, + { + "completion_length": 71.953125, + "epoch": 6.34703196347032, + "grad_norm": 4.919250965118408, + "kl": 0.16259765625, + "learning_rate": 3.65296803652968e-07, + "loss": 0.0065, + "reward": 1.816933274269104, + "reward_std": 0.18086620420217514, + "rewards/accuracy_reward": 0.816933274269104, + "rewards/format_reward": 1.0, + "step": 1390 + }, + { + "completion_length": 75.359375, + "epoch": 6.351598173515982, + "grad_norm": 1.6152794361114502, + "kl": 0.12841796875, + "learning_rate": 3.648401826484018e-07, + "loss": 0.0051, + "reward": 1.8754695653915405, + "reward_std": 0.05770116671919823, + "rewards/accuracy_reward": 0.8754695355892181, + "rewards/format_reward": 1.0, + "step": 1391 + }, + { + "completion_length": 97.0625, + "epoch": 6.3561643835616435, + "grad_norm": 3.0562081336975098, + "kl": 0.0654296875, + "learning_rate": 3.643835616438356e-07, + "loss": 0.0026, + "reward": 1.8640625476837158, + "reward_std": 0.11048543266952038, + "rewards/accuracy_reward": 0.8874999284744263, + "rewards/format_reward": 0.9765625, + "step": 1392 + }, + { + "completion_length": 91.1015625, + "epoch": 6.360730593607306, + "grad_norm": 11.379284858703613, + "kl": 0.123046875, + "learning_rate": 3.639269406392694e-07, + "loss": 0.0049, + "reward": 1.7335938215255737, + "reward_std": 0.13041038066148758, + "rewards/accuracy_reward": 0.741406261920929, + "rewards/format_reward": 0.9921875, + "step": 1393 + }, + { + "completion_length": 88.609375, + "epoch": 6.365296803652968, + "grad_norm": 2.2893126010894775, + "kl": 0.120849609375, + "learning_rate": 3.634703196347032e-07, + "loss": 0.0048, + "reward": 1.7554688453674316, + "reward_std": 0.14954836666584015, + "rewards/accuracy_reward": 0.7710936963558197, + "rewards/format_reward": 0.984375, + "step": 1394 + }, + { + "completion_length": 86.0390625, + "epoch": 6.36986301369863, + "grad_norm": 2.2495548725128174, + "kl": 0.10888671875, + "learning_rate": 3.6301369863013697e-07, + "loss": 0.0043, + "reward": 1.8598958253860474, + "reward_std": 0.10234630480408669, + "rewards/accuracy_reward": 0.8677082657814026, + "rewards/format_reward": 0.9921875, + "step": 1395 + }, + { + "completion_length": 75.4609375, + "epoch": 6.3744292237442925, + "grad_norm": 1.356154203414917, + "kl": 0.095947265625, + "learning_rate": 3.6255707762557075e-07, + "loss": 0.0038, + "reward": 1.6979167461395264, + "reward_std": 0.11066011898219585, + "rewards/accuracy_reward": 0.6979166567325592, + "rewards/format_reward": 1.0, + "step": 1396 + }, + { + "completion_length": 76.453125, + "epoch": 6.378995433789954, + "grad_norm": 2.3890748023986816, + "kl": 0.202880859375, + "learning_rate": 3.6210045662100454e-07, + "loss": 0.0081, + "reward": 1.8653646111488342, + "reward_std": 0.15285574202425778, + "rewards/accuracy_reward": 0.8966145813465118, + "rewards/format_reward": 0.96875, + "step": 1397 + }, + { + "completion_length": 66.3359375, + "epoch": 6.383561643835616, + "grad_norm": 1.8695800304412842, + "kl": 0.1298828125, + "learning_rate": 3.6164383561643833e-07, + "loss": 0.0052, + "reward": 1.8802322745323181, + "reward_std": 0.046260682400316, + "rewards/accuracy_reward": 0.8802323341369629, + "rewards/format_reward": 1.0, + "step": 1398 + }, + { + "completion_length": 89.1796875, + "epoch": 6.3881278538812785, + "grad_norm": 1.6961324214935303, + "kl": 0.103271484375, + "learning_rate": 3.611872146118721e-07, + "loss": 0.0041, + "reward": 1.870312511920929, + "reward_std": 0.10539799928665161, + "rewards/accuracy_reward": 0.8781249523162842, + "rewards/format_reward": 0.9921875, + "step": 1399 + }, + { + "completion_length": 90.0859375, + "epoch": 6.392694063926941, + "grad_norm": 8.425874710083008, + "kl": 0.180419921875, + "learning_rate": 3.607305936073059e-07, + "loss": 0.0072, + "reward": 1.6842634081840515, + "reward_std": 0.16779197752475739, + "rewards/accuracy_reward": 0.6842633485794067, + "rewards/format_reward": 1.0, + "step": 1400 + } + ], + "logging_steps": 1.0, + "max_steps": 2190, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 100, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}