diff --git "a/trainer_state.json" "b/trainer_state.json" --- "a/trainer_state.json" +++ "b/trainer_state.json" @@ -1,1298 +1,2837 @@ { "best_metric": null, "best_model_checkpoint": null, - "epoch": 0.9984, + "epoch": 2.0, "eval_steps": 100, - "global_step": 468, + "global_step": 2144, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { - "completion_length": 84.06964683532715, - "epoch": 0.010666666666666666, - "grad_norm": 2.5746846199035645, - "kl": 0.0005696296691894531, - "learning_rate": 2.1276595744680853e-06, - "loss": 0.0234, - "reward": 0.9341583102941513, - "reward_std": 0.4086078226566315, - "rewards/accuracy_reward": 0.10469399155117572, - "rewards/format_reward": 0.8294643282890319, - "step": 5 - }, - { - "completion_length": 70.6178602218628, - "epoch": 0.021333333333333333, - "grad_norm": 2.298146963119507, - "kl": 0.2245147705078125, - "learning_rate": 4.255319148936171e-06, - "loss": 0.0507, - "reward": 1.1630470275878906, - "reward_std": 0.34340456649661066, - "rewards/accuracy_reward": 0.22018984369933606, - "rewards/format_reward": 0.9428571790456772, + "completion_length": 80.76250381469727, + "epoch": 0.009328358208955223, + "grad_norm": 3.5554158687591553, + "kl": 0.0003086090087890625, + "learning_rate": 4.6511627906976744e-08, + "loss": 0.0233, + "reward": 0.702067643404007, + "reward_std": 0.8366336703300477, + "rewards/accuracy_reward": 0.1699247632175684, + "rewards/format_reward": 0.5321428745985031, "step": 10 }, { - "completion_length": 58.76250247955322, - "epoch": 0.032, - "grad_norm": 7.35014533996582, - "kl": 0.1055908203125, - "learning_rate": 6.382978723404256e-06, - "loss": 0.0215, - "reward": 1.3287883758544923, - "reward_std": 0.24974945709109306, - "rewards/accuracy_reward": 0.34753834586590526, - "rewards/format_reward": 0.9812500387430191, - "step": 15 - }, - { - "completion_length": 56.804466247558594, - "epoch": 0.042666666666666665, - "grad_norm": 1.0536774396896362, - "kl": 0.12276611328125, - "learning_rate": 8.510638297872341e-06, - "loss": 0.0126, - "reward": 1.2951765954494476, - "reward_std": 0.19136211089789867, - "rewards/accuracy_reward": 0.3219622537493706, - "rewards/format_reward": 0.9732143193483352, + "completion_length": 84.78214569091797, + "epoch": 0.018656716417910446, + "grad_norm": 3.275822877883911, + "kl": 0.0003536224365234375, + "learning_rate": 9.302325581395349e-08, + "loss": 0.0161, + "reward": 0.7687230795621872, + "reward_std": 0.8754230916500092, + "rewards/accuracy_reward": 0.23300875946879387, + "rewards/format_reward": 0.5357143133878708, "step": 20 }, { - "completion_length": 56.332145309448244, - "epoch": 0.05333333333333334, - "grad_norm": 1.1471902132034302, - "kl": 0.110009765625, - "learning_rate": 1.0638297872340426e-05, - "loss": 0.0158, - "reward": 1.3868436932563781, - "reward_std": 0.15888289231806993, - "rewards/accuracy_reward": 0.4073793478310108, - "rewards/format_reward": 0.979464316368103, - "step": 25 - }, - { - "completion_length": 66.11339569091797, - "epoch": 0.064, - "grad_norm": 0.5500189065933228, - "kl": 0.135498046875, - "learning_rate": 1.2765957446808513e-05, - "loss": 0.0227, - "reward": 1.4447153329849243, - "reward_std": 0.12236948367208242, - "rewards/accuracy_reward": 0.44917954206466676, - "rewards/format_reward": 0.9955357313156128, + "completion_length": 86.25000381469727, + "epoch": 0.027985074626865673, + "grad_norm": 3.5378522872924805, + "kl": 0.0003940582275390625, + "learning_rate": 1.3953488372093021e-07, + "loss": 0.0422, + "reward": 0.6747060805559159, + "reward_std": 0.8393354952335358, + "rewards/accuracy_reward": 0.1568488895893097, + "rewards/format_reward": 0.5178571730852127, "step": 30 }, { - "completion_length": 76.48571739196777, - "epoch": 0.07466666666666667, - "grad_norm": 0.9380918145179749, - "kl": 0.176904296875, - "learning_rate": 1.4893617021276596e-05, - "loss": 0.0008, - "reward": 1.3599235594272614, - "reward_std": 0.11256115352734923, - "rewards/accuracy_reward": 0.35992350801825523, - "rewards/format_reward": 1.0, - "step": 35 - }, - { - "completion_length": 88.6205394744873, - "epoch": 0.08533333333333333, - "grad_norm": 0.7145365476608276, - "kl": 0.1474609375, - "learning_rate": 1.7021276595744682e-05, - "loss": 0.0214, - "reward": 1.385309648513794, - "reward_std": 0.10408242754638194, - "rewards/accuracy_reward": 0.38798815086483956, - "rewards/format_reward": 0.9973214387893676, + "completion_length": 82.15893173217773, + "epoch": 0.03731343283582089, + "grad_norm": 2.7074437141418457, + "kl": 0.000814056396484375, + "learning_rate": 1.8604651162790698e-07, + "loss": 0.0118, + "reward": 0.7970994591712952, + "reward_std": 0.7348823547363281, + "rewards/accuracy_reward": 0.18995655626058577, + "rewards/format_reward": 0.6071428924798965, "step": 40 }, { - "completion_length": 69.52857475280761, - "epoch": 0.096, - "grad_norm": 1.020272135734558, - "kl": 0.19755859375, - "learning_rate": 1.914893617021277e-05, - "loss": 0.0125, - "reward": 1.3666836261749267, - "reward_std": 0.11790415998548269, - "rewards/accuracy_reward": 0.36757640447467566, - "rewards/format_reward": 0.9991071462631226, - "step": 45 - }, - { - "completion_length": 66.01071739196777, - "epoch": 0.10666666666666667, - "grad_norm": 0.8179575800895691, - "kl": 0.24091796875, - "learning_rate": 1.999749429505675e-05, - "loss": 0.0168, - "reward": 1.4537438571453094, - "reward_std": 0.10831094486638904, - "rewards/accuracy_reward": 0.45642236769199374, - "rewards/format_reward": 0.9973214328289032, + "completion_length": 87.1785758972168, + "epoch": 0.04664179104477612, + "grad_norm": 2.62431263923645, + "kl": 0.00384521484375, + "learning_rate": 2.3255813953488372e-07, + "loss": 0.0565, + "reward": 0.7967314809560776, + "reward_std": 0.7611366331577301, + "rewards/accuracy_reward": 0.18244571574032306, + "rewards/format_reward": 0.6142857372760773, "step": 50 }, { - "completion_length": 72.62768211364747, - "epoch": 0.11733333333333333, - "grad_norm": 0.8046705722808838, - "kl": 0.27734375, - "learning_rate": 1.9982186200932964e-05, - "loss": 0.023, - "reward": 1.3337069928646088, - "reward_std": 0.11116362968459725, - "rewards/accuracy_reward": 0.3354926247149706, - "rewards/format_reward": 0.9982142925262452, - "step": 55 - }, - { - "completion_length": 73.03303890228271, - "epoch": 0.128, - "grad_norm": 0.623573362827301, - "kl": 0.265869140625, - "learning_rate": 1.9952983353325358e-05, - "loss": 0.0102, - "reward": 1.3988360345363617, - "reward_std": 0.10178781538270414, - "rewards/accuracy_reward": 0.40151453409343957, - "rewards/format_reward": 0.9973214387893676, + "completion_length": 78.98393173217774, + "epoch": 0.055970149253731345, + "grad_norm": 2.3845767974853516, + "kl": 0.01090850830078125, + "learning_rate": 2.7906976744186043e-07, + "loss": 0.0843, + "reward": 0.9905084490776062, + "reward_std": 0.6616127550601959, + "rewards/accuracy_reward": 0.24407987296581268, + "rewards/format_reward": 0.7464286148548126, "step": 60 }, { - "completion_length": 73.4758955001831, - "epoch": 0.13866666666666666, - "grad_norm": 1.859560251235962, - "kl": 0.262158203125, - "learning_rate": 1.990992640128218e-05, - "loss": 0.0151, - "reward": 1.4210947692394256, - "reward_std": 0.09910206436179578, - "rewards/accuracy_reward": 0.4237732715904713, - "rewards/format_reward": 0.9973214387893676, - "step": 65 - }, - { - "completion_length": 77.1178596496582, - "epoch": 0.14933333333333335, - "grad_norm": 0.7892802357673645, - "kl": 0.246630859375, - "learning_rate": 1.9853075278140913e-05, - "loss": 0.0143, - "reward": 1.3541903257369996, - "reward_std": 0.10066047627478839, - "rewards/accuracy_reward": 0.35686881598085163, - "rewards/format_reward": 0.9973214387893676, + "completion_length": 75.72143096923828, + "epoch": 0.06529850746268656, + "grad_norm": 2.440654754638672, + "kl": 0.025201416015625, + "learning_rate": 3.2558139534883724e-07, + "loss": 0.0567, + "reward": 1.0874994218349456, + "reward_std": 0.7452466487884521, + "rewards/accuracy_reward": 0.31249938160181046, + "rewards/format_reward": 0.7750000476837158, "step": 70 }, { - "completion_length": 61.362502098083496, - "epoch": 0.16, - "grad_norm": 0.828670859336853, - "kl": 0.261474609375, - "learning_rate": 1.9782509118103773e-05, - "loss": 0.0193, - "reward": 1.3949908137321472, - "reward_std": 0.08575579300522804, - "rewards/accuracy_reward": 0.3958836041390896, - "rewards/format_reward": 0.9991071462631226, - "step": 75 - }, - { - "completion_length": 56.22678833007812, - "epoch": 0.17066666666666666, - "grad_norm": 0.6337359547615051, - "kl": 0.27822265625, - "learning_rate": 1.9698326146086446e-05, - "loss": 0.0117, - "reward": 1.3398247182369232, - "reward_std": 0.07482532951980829, - "rewards/accuracy_reward": 0.34161034375429156, - "rewards/format_reward": 0.9982142925262452, + "completion_length": 74.28928871154785, + "epoch": 0.07462686567164178, + "grad_norm": 3.0141847133636475, + "kl": 0.03741455078125, + "learning_rate": 3.7209302325581396e-07, + "loss": 0.0712, + "reward": 1.129212111234665, + "reward_std": 0.6488306671380997, + "rewards/accuracy_reward": 0.3292120784521103, + "rewards/format_reward": 0.8000000417232513, "step": 80 }, { - "completion_length": 54.415181159973145, - "epoch": 0.18133333333333335, - "grad_norm": 0.882876992225647, - "kl": 0.3263671875, - "learning_rate": 1.9600643540993453e-05, - "loss": 0.0207, - "reward": 1.4424349546432496, - "reward_std": 0.10577646959573031, - "rewards/accuracy_reward": 0.44511346630752086, - "rewards/format_reward": 0.9973214387893676, - "step": 85 - }, - { - "completion_length": 64.11875247955322, - "epoch": 0.192, - "grad_norm": 0.7268953919410706, - "kl": 0.305859375, - "learning_rate": 1.9489597272610377e-05, - "loss": 0.0406, - "reward": 1.417532241344452, - "reward_std": 0.10051665157079696, - "rewards/accuracy_reward": 0.4228893216699362, - "rewards/format_reward": 0.9946428716182709, + "completion_length": 76.96786117553711, + "epoch": 0.08395522388059702, + "grad_norm": 6.2834792137146, + "kl": 0.0553955078125, + "learning_rate": 4.186046511627907e-07, + "loss": 0.0345, + "reward": 1.2077421128749848, + "reward_std": 0.7286256492137909, + "rewards/accuracy_reward": 0.3827420711517334, + "rewards/format_reward": 0.8250000476837158, "step": 90 }, { - "completion_length": 77.16607456207275, - "epoch": 0.20266666666666666, - "grad_norm": 0.6748376488685608, - "kl": 0.31484375, - "learning_rate": 1.936534191234006e-05, - "loss": 0.0475, - "reward": 1.4355210840702057, - "reward_std": 0.10607277518138289, - "rewards/accuracy_reward": 0.4444495804607868, - "rewards/format_reward": 0.9910714566707611, - "step": 95 - }, - { - "completion_length": 101.09107627868653, - "epoch": 0.21333333333333335, - "grad_norm": 2.3864388465881348, - "kl": 0.29169921875, - "learning_rate": 1.922805041804617e-05, - "loss": 0.2129, - "reward": 1.4017220735549927, - "reward_std": 0.1426205663010478, - "rewards/accuracy_reward": 0.4365434356033802, - "rewards/format_reward": 0.9651785999536514, - "step": 100 - }, - { - "epoch": 0.21333333333333335, - "eval_completion_length": 176.14483883506372, - "eval_kl": 0.33562911184210525, - "eval_loss": 0.6023176312446594, - "eval_reward": 1.1016724376302016, - "eval_reward_std": 0.3640861515151827, - "eval_rewards/accuracy_reward": 0.23043179681132497, - "eval_rewards/format_reward": 0.8712406393728758, - "eval_runtime": 245.8906, - "eval_samples_per_second": 1.22, - "eval_steps_per_second": 0.024, + "completion_length": 67.60178909301757, + "epoch": 0.09328358208955224, + "grad_norm": 2.595285415649414, + "kl": 0.05562744140625, + "learning_rate": 4.6511627906976743e-07, + "loss": 0.0324, + "reward": 1.2687552094459533, + "reward_std": 0.5548081547021866, + "rewards/accuracy_reward": 0.3866123020648956, + "rewards/format_reward": 0.8821429014205933, "step": 100 }, { - "completion_length": 155.50804328918457, - "epoch": 0.224, - "grad_norm": 536.5433349609375, - "kl": 4583.8103515625, - "learning_rate": 1.907791389330363e-05, - "loss": 466.9073, - "reward": 1.2200158536434174, - "reward_std": 0.39980794712901113, - "rewards/accuracy_reward": 0.38251580521464346, - "rewards/format_reward": 0.8375000417232513, - "step": 105 - }, - { - "completion_length": 135.80357933044434, - "epoch": 0.23466666666666666, - "grad_norm": 20.82430076599121, - "kl": 12.468359375, - "learning_rate": 1.8915141321391083e-05, - "loss": 1.7726, - "reward": 0.9558229982852936, - "reward_std": 0.5767658948898315, - "rewards/accuracy_reward": 0.2888586811721325, - "rewards/format_reward": 0.6669643104076386, + "completion_length": 71.0285743713379, + "epoch": 0.10261194029850747, + "grad_norm": 3.1827375888824463, + "kl": 0.0405517578125, + "learning_rate": 5.116279069767442e-07, + "loss": 0.0416, + "reward": 1.2789674162864686, + "reward_std": 0.741091251373291, + "rewards/accuracy_reward": 0.4361102759838104, + "rewards/format_reward": 0.8428571820259094, "step": 110 }, { - "completion_length": 36.20535850524902, - "epoch": 0.24533333333333332, - "grad_norm": 0.9738772511482239, - "kl": 3.57275390625, - "learning_rate": 1.873995927439555e-05, - "loss": 0.1528, - "reward": 1.3529624402523042, - "reward_std": 0.18142059126403182, - "rewards/accuracy_reward": 0.41367666572332384, - "rewards/format_reward": 0.9392857402563095, - "step": 115 - }, - { - "completion_length": 44.45625190734863, - "epoch": 0.256, - "grad_norm": 55.59278106689453, - "kl": 0.7744140625, - "learning_rate": 1.855261159783432e-05, - "loss": 0.0309, - "reward": 1.3545026361942292, - "reward_std": 0.09652781123295426, - "rewards/accuracy_reward": 0.3607525654137135, - "rewards/format_reward": 0.993750023841858, + "completion_length": 74.42678985595703, + "epoch": 0.11194029850746269, + "grad_norm": 2.532541036605835, + "kl": 0.0537353515625, + "learning_rate": 5.581395348837209e-07, + "loss": 0.0759, + "reward": 1.4309274792671203, + "reward_std": 0.6709185004234314, + "rewards/accuracy_reward": 0.538070285320282, + "rewards/format_reward": 0.8928571820259095, "step": 120 }, { - "completion_length": 88.1910753250122, - "epoch": 0.26666666666666666, - "grad_norm": 2.580606460571289, - "kl": 0.45029296875, - "learning_rate": 1.8353359071232954e-05, - "loss": 0.1271, - "reward": 1.4032364308834075, - "reward_std": 0.15598073843866586, - "rewards/accuracy_reward": 0.4335935153067112, - "rewards/format_reward": 0.9696428835391998, - "step": 125 - }, - { - "completion_length": 205.1491153717041, - "epoch": 0.2773333333333333, - "grad_norm": 19.771793365478516, - "kl": 3.16484375, - "learning_rate": 1.8142479045131956e-05, - "loss": 0.6248, - "reward": 1.1805975049734116, - "reward_std": 0.36486576311290264, - "rewards/accuracy_reward": 0.3922045972198248, - "rewards/format_reward": 0.7883928850293159, + "completion_length": 68.81071815490722, + "epoch": 0.12126865671641791, + "grad_norm": 1.8233903646469116, + "kl": 0.08544921875, + "learning_rate": 6.046511627906976e-07, + "loss": 0.0235, + "reward": 1.3438993215560913, + "reward_std": 0.5705544888973236, + "rewards/accuracy_reward": 0.44389927908778193, + "rewards/format_reward": 0.9000000417232513, "step": 130 }, { - "completion_length": 224.54107971191405, - "epoch": 0.288, - "grad_norm": 37.31096649169922, - "kl": 2.1166015625, - "learning_rate": 1.7920265055027285e-05, - "loss": 0.6933, - "reward": 1.0551446676254272, - "reward_std": 0.456743398308754, - "rewards/accuracy_reward": 0.28996606133878233, - "rewards/format_reward": 0.7651786148548126, - "step": 135 - }, - { - "completion_length": 152.92589988708497, - "epoch": 0.2986666666666667, - "grad_norm": 7.781869888305664, - "kl": 1.4671875, - "learning_rate": 1.76870264127822e-05, - "loss": 0.54, - "reward": 1.1673295348882675, - "reward_std": 0.4515230402350426, - "rewards/accuracy_reward": 0.34411517679691317, - "rewards/format_reward": 0.8232143253087998, + "completion_length": 68.21250534057617, + "epoch": 0.13059701492537312, + "grad_norm": 2.290971040725708, + "kl": 0.0460205078125, + "learning_rate": 6.511627906976745e-07, + "loss": 0.0242, + "reward": 1.3966900944709777, + "reward_std": 0.6165425658226014, + "rewards/accuracy_reward": 0.4966900646686554, + "rewards/format_reward": 0.9000000417232513, "step": 140 }, { - "completion_length": 90.89553909301758, - "epoch": 0.30933333333333335, - "grad_norm": 7.331554412841797, - "kl": 1.12890625, - "learning_rate": 1.7443087776079068e-05, - "loss": 0.2504, - "reward": 1.2346946597099304, - "reward_std": 0.3830049060285091, - "rewards/accuracy_reward": 0.37130174338817595, - "rewards/format_reward": 0.8633929073810578, - "step": 145 - }, - { - "completion_length": 144.13929195404052, - "epoch": 0.32, - "grad_norm": 3.7630207538604736, - "kl": 1.991015625, - "learning_rate": 1.7188788696510477e-05, - "loss": 0.4815, - "reward": 1.2255327105522156, - "reward_std": 0.29441717453300953, - "rewards/accuracy_reward": 0.35321122482419015, - "rewards/format_reward": 0.8723214626312256, + "completion_length": 66.60893173217774, + "epoch": 0.13992537313432835, + "grad_norm": 2.318082094192505, + "kl": 0.0470947265625, + "learning_rate": 6.976744186046511e-07, + "loss": 0.0443, + "reward": 1.3636222004890441, + "reward_std": 0.5036075174808502, + "rewards/accuracy_reward": 0.4671935737133026, + "rewards/format_reward": 0.8964286088943482, "step": 150 }, { - "completion_length": 127.27768478393554, - "epoch": 0.33066666666666666, - "grad_norm": 20.417327880859375, - "kl": 1.51181640625, - "learning_rate": 1.6924483146938756e-05, - "loss": 0.6594, - "reward": 1.293435949087143, - "reward_std": 0.33014910146594045, - "rewards/accuracy_reward": 0.3898644786328077, - "rewards/format_reward": 0.90357146859169, - "step": 155 - }, - { - "completion_length": 88.7991117477417, - "epoch": 0.3413333333333333, - "grad_norm": 3.8288986682891846, - "kl": 1.37109375, - "learning_rate": 1.665053902878167e-05, - "loss": 0.5809, - "reward": 1.3068266212940216, - "reward_std": 0.23101849630475044, - "rewards/accuracy_reward": 0.3612908452749252, - "rewards/format_reward": 0.9455357432365418, + "completion_length": 68.63750267028809, + "epoch": 0.14925373134328357, + "grad_norm": 2.1847078800201416, + "kl": 0.05283203125, + "learning_rate": 7.441860465116279e-07, + "loss": 0.0244, + "reward": 1.3687038898468018, + "reward_std": 0.5109428346157074, + "rewards/accuracy_reward": 0.4687038689851761, + "rewards/format_reward": 0.9000000417232513, "step": 160 }, { - "completion_length": 93.8723258972168, - "epoch": 0.352, - "grad_norm": 37.39503479003906, - "kl": 2.13388671875, - "learning_rate": 1.6367337659910223e-05, - "loss": 0.6612, - "reward": 1.2681213736534118, - "reward_std": 0.23815324977040292, - "rewards/accuracy_reward": 0.3306213151663542, - "rewards/format_reward": 0.9375000417232513, - "step": 165 - }, - { - "completion_length": 85.18661117553711, - "epoch": 0.3626666666666667, - "grad_norm": 143.8899688720703, - "kl": 4.0654296875, - "learning_rate": 1.607527324387137e-05, - "loss": 1.1039, - "reward": 1.2971380084753037, - "reward_std": 0.24464343562722207, - "rewards/accuracy_reward": 0.35070938505232335, - "rewards/format_reward": 0.9464285999536515, + "completion_length": 70.4517894744873, + "epoch": 0.15858208955223882, + "grad_norm": 1.2067421674728394, + "kl": 0.04818115234375, + "learning_rate": 7.906976744186046e-07, + "loss": 0.0364, + "reward": 1.4024234175682069, + "reward_std": 0.4660025998950005, + "rewards/accuracy_reward": 0.46670911014080046, + "rewards/format_reward": 0.9357143104076385, "step": 170 }, { - "completion_length": 117.47143363952637, - "epoch": 0.37333333333333335, - "grad_norm": 209.26617431640625, - "kl": 5.5390625, - "learning_rate": 1.5774752321174428e-05, - "loss": 1.3565, - "reward": 1.2448502421379088, - "reward_std": 0.3966982074081898, - "rewards/accuracy_reward": 0.3591358933597803, - "rewards/format_reward": 0.885714328289032, - "step": 175 - }, - { - "completion_length": 126.08304214477539, - "epoch": 0.384, - "grad_norm": 1086.019775390625, - "kl": 20.592578125, - "learning_rate": 1.5466193203405017e-05, - "loss": 2.4344, - "reward": 1.1619371354579926, - "reward_std": 0.39342261999845507, - "rewards/accuracy_reward": 0.30211565755307673, - "rewards/format_reward": 0.8598214656114578, + "completion_length": 68.050004196167, + "epoch": 0.16791044776119404, + "grad_norm": 1.5013313293457031, + "kl": 0.086083984375, + "learning_rate": 8.372093023255814e-07, + "loss": 0.0206, + "reward": 1.4320929646492004, + "reward_std": 0.4674407035112381, + "rewards/accuracy_reward": 0.5035214573144913, + "rewards/format_reward": 0.9285714626312256, "step": 180 }, { - "completion_length": 111.61429061889649, - "epoch": 0.39466666666666667, - "grad_norm": 27.37205696105957, - "kl": 10.1076171875, - "learning_rate": 1.5150025390954153e-05, - "loss": 1.7053, - "reward": 1.1650864064693451, - "reward_std": 0.41547583043575287, - "rewards/accuracy_reward": 0.3106220416724682, - "rewards/format_reward": 0.8544643312692642, - "step": 185 - }, - { - "completion_length": 126.89732666015625, - "epoch": 0.4053333333333333, - "grad_norm": 233.82647705078125, - "kl": 19.816015625, - "learning_rate": 1.4826688975173085e-05, - "loss": 3.0584, - "reward": 1.2155672758817673, - "reward_std": 0.39352322220802305, - "rewards/accuracy_reward": 0.3459243529476225, - "rewards/format_reward": 0.8696429014205933, + "completion_length": 69.41786155700683, + "epoch": 0.17723880597014927, + "grad_norm": 2.496340036392212, + "kl": 0.0986083984375, + "learning_rate": 8.837209302325581e-07, + "loss": 0.0273, + "reward": 1.497808289527893, + "reward_std": 0.48698129057884215, + "rewards/accuracy_reward": 0.5728082910180092, + "rewards/format_reward": 0.9250000298023224, "step": 190 }, { - "completion_length": 116.58572120666504, - "epoch": 0.416, - "grad_norm": 41.37778091430664, - "kl": 1.791796875, - "learning_rate": 1.4496634025785938e-05, - "loss": 0.741, - "reward": 1.239576244354248, - "reward_std": 0.363012520223856, - "rewards/accuracy_reward": 0.34582619145512583, - "rewards/format_reward": 0.8937500447034836, - "step": 195 - }, - { - "completion_length": 89.40893306732178, - "epoch": 0.4266666666666667, - "grad_norm": 8.469895362854004, - "kl": 2.60439453125, - "learning_rate": 1.4160319964412943e-05, - "loss": 0.7316, - "reward": 1.3309306919574737, - "reward_std": 0.24600692018866538, - "rewards/accuracy_reward": 0.38896636143326757, - "rewards/format_reward": 0.9419643193483352, + "completion_length": 71.72500381469726, + "epoch": 0.1865671641791045, + "grad_norm": 1.7499533891677856, + "kl": 0.063671875, + "learning_rate": 9.302325581395349e-07, + "loss": 0.029, + "reward": 1.4671559572219848, + "reward_std": 0.4771651789546013, + "rewards/accuracy_reward": 0.552870225906372, + "rewards/format_reward": 0.9142857491970062, "step": 200 }, { - "epoch": 0.4266666666666667, - "eval_completion_length": 72.73731492695056, - "eval_kl": 0.5432771381578947, - "eval_loss": 0.32045310735702515, - "eval_reward": 1.181434901137101, - "eval_reward_std": 0.2893000698011172, - "eval_rewards/accuracy_reward": 0.24440478111960387, - "eval_rewards/format_reward": 0.9370301130570864, - "eval_runtime": 133.7856, - "eval_samples_per_second": 2.242, - "eval_steps_per_second": 0.045, - "step": 200 - }, - { - "completion_length": 71.51428909301758, - "epoch": 0.43733333333333335, - "grad_norm": 2.6252498626708984, - "kl": 0.64296875, - "learning_rate": 1.3818214925076226e-05, - "loss": 0.3148, - "reward": 1.3910367608070373, - "reward_std": 0.2396655511111021, - "rewards/accuracy_reward": 0.444608124345541, - "rewards/format_reward": 0.9464286118745804, - "step": 205 - }, - { - "completion_length": 100.40625457763672, - "epoch": 0.448, - "grad_norm": 3.6516802310943604, - "kl": 2.57763671875, - "learning_rate": 1.3470795102578358e-05, - "loss": 0.6842, - "reward": 1.4023015439510345, - "reward_std": 0.2554322887212038, - "rewards/accuracy_reward": 0.4630157709121704, - "rewards/format_reward": 0.9392857491970062, + "completion_length": 70.13036041259765, + "epoch": 0.1958955223880597, + "grad_norm": 2.2533020973205566, + "kl": 0.06893310546875, + "learning_rate": 9.767441860465115e-07, + "loss": 0.0039, + "reward": 1.6629591584205627, + "reward_std": 0.3428287610411644, + "rewards/accuracy_reward": 0.7129591181874275, + "rewards/format_reward": 0.9500000238418579, "step": 210 }, { - "completion_length": 77.99553966522217, - "epoch": 0.45866666666666667, - "grad_norm": 1.8451030254364014, - "kl": 0.4015625, - "learning_rate": 1.3118544089660635e-05, - "loss": 0.3353, - "reward": 1.4199142813682557, - "reward_std": 0.17975129522383212, - "rewards/accuracy_reward": 0.4547356490045786, - "rewards/format_reward": 0.9651786029338837, - "step": 215 - }, - { - "completion_length": 67.60982398986816, - "epoch": 0.4693333333333333, - "grad_norm": 0.8835315108299255, - "kl": 2.78828125, - "learning_rate": 1.2761952203863759e-05, - "loss": 0.38, - "reward": 1.3431052803993224, - "reward_std": 0.1588349466212094, - "rewards/accuracy_reward": 0.3689980745315552, - "rewards/format_reward": 0.974107176065445, + "completion_length": 68.38393211364746, + "epoch": 0.20522388059701493, + "grad_norm": 1.5270919799804688, + "kl": 0.0611572265625, + "learning_rate": 9.999834227339474e-07, + "loss": 0.0227, + "reward": 1.565323531627655, + "reward_std": 0.47197969257831573, + "rewards/accuracy_reward": 0.6153234869241715, + "rewards/format_reward": 0.9500000238418579, "step": 220 }, { - "completion_length": 57.28928813934326, - "epoch": 0.48, - "grad_norm": 0.847754955291748, - "kl": 0.40869140625, - "learning_rate": 1.2401515805027924e-05, - "loss": 0.153, - "reward": 1.4158245086669923, - "reward_std": 0.14693537133280188, - "rewards/accuracy_reward": 0.43011016920208933, - "rewards/format_reward": 0.9857143074274063, - "step": 225 - }, - { - "completion_length": 57.68482418060303, - "epoch": 0.49066666666666664, - "grad_norm": 1.268998622894287, - "kl": 0.358349609375, - "learning_rate": 1.2037736604382279e-05, - "loss": 0.0702, - "reward": 1.3331839978694915, - "reward_std": 0.09491205215454102, - "rewards/accuracy_reward": 0.3438982125837356, - "rewards/format_reward": 0.989285734295845, + "completion_length": 69.24107437133789, + "epoch": 0.21455223880597016, + "grad_norm": 1.6075689792633057, + "kl": 0.065380859375, + "learning_rate": 9.998508112007925e-07, + "loss": 0.0179, + "reward": 1.5958670258522034, + "reward_std": 0.4164800032973289, + "rewards/accuracy_reward": 0.6637241214513778, + "rewards/format_reward": 0.9321428894996643, "step": 230 }, { - "completion_length": 88.89018306732177, - "epoch": 0.5013333333333333, - "grad_norm": 0.8988625407218933, - "kl": 0.39384765625, - "learning_rate": 1.1671120966185486e-05, - "loss": 0.3525, - "reward": 1.3964234709739685, - "reward_std": 0.19650729335844516, - "rewards/accuracy_reward": 0.438387706130743, - "rewards/format_reward": 0.9580357521772385, - "step": 235 - }, - { - "completion_length": 92.7276819229126, - "epoch": 0.512, - "grad_norm": 0.740756094455719, - "kl": 0.60595703125, - "learning_rate": 1.1302179202889505e-05, - "loss": 0.3643, - "reward": 1.4265296638011933, - "reward_std": 0.19172351472079754, - "rewards/accuracy_reward": 0.47206532061100004, - "rewards/format_reward": 0.954464316368103, + "completion_length": 71.3910743713379, + "epoch": 0.22388059701492538, + "grad_norm": 1.9208016395568848, + "kl": 0.04765625, + "learning_rate": 9.995856233072862e-07, + "loss": 0.0218, + "reward": 1.6363189816474915, + "reward_std": 0.34947128742933276, + "rewards/accuracy_reward": 0.6720332384109498, + "rewards/format_reward": 0.9642857313156128, "step": 240 }, { - "completion_length": 71.25178890228271, - "epoch": 0.5226666666666666, - "grad_norm": 0.5437249541282654, - "kl": 0.362109375, - "learning_rate": 1.0931424864807624e-05, - "loss": 0.1706, - "reward": 1.4180180430412292, - "reward_std": 0.10797093212604522, - "rewards/accuracy_reward": 0.4340894088149071, - "rewards/format_reward": 0.9839286059141159, - "step": 245 - }, - { - "completion_length": 75.95000343322754, - "epoch": 0.5333333333333333, - "grad_norm": 949.4736938476562, - "kl": 9.5091796875, - "learning_rate": 1.0559374025275597e-05, - "loss": 0.5647, - "reward": 1.3797377407550813, - "reward_std": 0.14358507767319678, - "rewards/accuracy_reward": 0.41009481325745584, - "rewards/format_reward": 0.9696428835391998, + "completion_length": 74.13928833007813, + "epoch": 0.2332089552238806, + "grad_norm": 2.320880889892578, + "kl": 0.04034423828125, + "learning_rate": 9.991879293897066e-07, + "loss": 0.0257, + "reward": 1.5270272970199585, + "reward_std": 0.4767302840948105, + "rewards/accuracy_reward": 0.594884404540062, + "rewards/format_reward": 0.9321428894996643, "step": 250 }, { - "completion_length": 77.59732494354247, - "epoch": 0.544, - "grad_norm": 0.7238170504570007, - "kl": 0.32158203125, - "learning_rate": 1.0186544562300766e-05, - "loss": 0.2572, - "reward": 1.3099871039390565, - "reward_std": 0.1657089657150209, - "rewards/accuracy_reward": 0.33945133732631805, - "rewards/format_reward": 0.970535746216774, - "step": 255 - }, - { - "completion_length": 97.20089836120606, - "epoch": 0.5546666666666666, - "grad_norm": 3.450988292694092, - "kl": 0.63017578125, - "learning_rate": 9.813455437699238e-06, - "loss": 0.5045, - "reward": 1.350427383184433, - "reward_std": 0.24149130024015902, - "rewards/accuracy_reward": 0.40310589894652366, - "rewards/format_reward": 0.9473214596509933, + "completion_length": 75.13571853637696, + "epoch": 0.24253731343283583, + "grad_norm": 1.547238826751709, + "kl": 0.24193115234375, + "learning_rate": 9.986578349291513e-07, + "loss": 0.0391, + "reward": 1.5423155069351195, + "reward_std": 0.3424570709466934, + "rewards/accuracy_reward": 0.6030297234654427, + "rewards/format_reward": 0.9392857432365418, "step": 260 }, { - "completion_length": 77.62411079406738, - "epoch": 0.5653333333333334, - "grad_norm": 5.708080768585205, - "kl": 0.60048828125, - "learning_rate": 9.440625974724408e-06, - "loss": 0.364, - "reward": 1.3411081552505493, - "reward_std": 0.1925573032349348, - "rewards/accuracy_reward": 0.3795009471476078, - "rewards/format_reward": 0.9616071909666062, - "step": 265 - }, - { - "completion_length": 53.72053852081299, - "epoch": 0.576, - "grad_norm": 1.22533118724823, - "kl": 0.39970703125, - "learning_rate": 9.068575135192377e-06, - "loss": 0.0964, - "reward": 1.4191315472126007, - "reward_std": 0.08616148820146918, - "rewards/accuracy_reward": 0.42716721072793007, - "rewards/format_reward": 0.9919643104076385, + "completion_length": 71.19643096923828, + "epoch": 0.251865671641791, + "grad_norm": 2.154519557952881, + "kl": 0.0890625, + "learning_rate": 9.979954805235599e-07, + "loss": 0.0103, + "reward": 1.5991590857505797, + "reward_std": 0.4609019085764885, + "rewards/accuracy_reward": 0.6670161545276642, + "rewards/format_reward": 0.9321428894996643, "step": 270 }, { - "completion_length": 56.51428775787353, - "epoch": 0.5866666666666667, - "grad_norm": 0.7483408451080322, - "kl": 0.53984375, - "learning_rate": 8.697820797110499e-06, - "loss": 0.1011, - "reward": 1.364069801568985, - "reward_std": 0.10476710051298141, - "rewards/accuracy_reward": 0.3729983098804951, - "rewards/format_reward": 0.9910714536905288, - "step": 275 - }, - { - "completion_length": 61.34464511871338, - "epoch": 0.5973333333333334, - "grad_norm": 0.9969340562820435, - "kl": 0.3724609375, - "learning_rate": 8.328879033814516e-06, - "loss": 0.1033, - "reward": 1.4192217588424683, - "reward_std": 0.08946678219363094, - "rewards/accuracy_reward": 0.42904310415033253, - "rewards/format_reward": 0.9901785969734191, + "completion_length": 69.06071701049805, + "epoch": 0.26119402985074625, + "grad_norm": 1.3340944051742554, + "kl": 0.0639404296875, + "learning_rate": 9.972010418504234e-07, + "loss": 0.0256, + "reward": 1.436481499671936, + "reward_std": 0.4502955049276352, + "rewards/accuracy_reward": 0.5079100087285042, + "rewards/format_reward": 0.9285714566707611, "step": 280 }, { - "completion_length": 73.94821739196777, - "epoch": 0.608, - "grad_norm": 46.924434661865234, - "kl": 0.70830078125, - "learning_rate": 7.962263395617724e-06, - "loss": 0.3018, - "reward": 1.4005569756031035, - "reward_std": 0.1211006199941039, - "rewards/accuracy_reward": 0.4228783592581749, - "rewards/format_reward": 0.9776786029338836, - "step": 285 - }, - { - "completion_length": 93.46786251068116, - "epoch": 0.6186666666666667, - "grad_norm": 55.51490783691406, - "kl": 2.4828125, - "learning_rate": 7.598484194972076e-06, - "loss": 0.8693, - "reward": 1.4277709782123567, - "reward_std": 0.1801470622420311, - "rewards/accuracy_reward": 0.4706280644983053, - "rewards/format_reward": 0.9571428954601288, + "completion_length": 67.31607513427734, + "epoch": 0.27052238805970147, + "grad_norm": 2.049099922180176, + "kl": 0.053271484375, + "learning_rate": 9.96274729620189e-07, + "loss": 0.0232, + "reward": 1.6535230159759522, + "reward_std": 0.4071090579032898, + "rewards/accuracy_reward": 0.6963800877332688, + "rewards/format_reward": 0.9571428775787354, "step": 290 }, { - "completion_length": 85.74196815490723, - "epoch": 0.6293333333333333, - "grad_norm": 7.867539405822754, - "kl": 1.81318359375, - "learning_rate": 7.238047796136247e-06, - "loss": 0.6622, - "reward": 1.4081061065196991, - "reward_std": 0.17619385970756413, - "rewards/accuracy_reward": 0.44560605213046073, - "rewards/format_reward": 0.9625000298023224, - "step": 295 - }, - { - "completion_length": 85.03036060333253, - "epoch": 0.64, - "grad_norm": 13.451857566833496, - "kl": 1.27861328125, - "learning_rate": 6.881455910339369e-06, - "loss": 0.462, - "reward": 1.4045335412025453, - "reward_std": 0.1680739532224834, - "rewards/accuracy_reward": 0.44292634129524233, - "rewards/format_reward": 0.9616071701049804, + "completion_length": 63.45714645385742, + "epoch": 0.2798507462686567, + "grad_norm": 1.6050524711608887, + "kl": 0.0631591796875, + "learning_rate": 9.95216789520372e-07, + "loss": 0.0271, + "reward": 1.6196397304534913, + "reward_std": 0.39271264970302583, + "rewards/accuracy_reward": 0.6946396440267563, + "rewards/format_reward": 0.9250000357627869, "step": 300 }, { - "epoch": 0.64, - "eval_completion_length": 91.3207751826236, - "eval_kl": 5.6284950657894735, - "eval_loss": 1.060723066329956, - "eval_reward": 1.2168061639133252, - "eval_reward_std": 0.16671450046430292, - "eval_rewards/accuracy_reward": 0.2562798003322984, - "eval_rewards/format_reward": 0.9605263455917961, - "eval_runtime": 180.899, - "eval_samples_per_second": 1.658, - "eval_steps_per_second": 0.033, - "step": 300 - }, - { - "completion_length": 74.95536079406739, - "epoch": 0.6506666666666666, - "grad_norm": 2.4690773487091064, - "kl": 0.80390625, - "learning_rate": 6.529204897421644e-06, - "loss": 0.354, - "reward": 1.4394736528396606, - "reward_std": 0.16046528667211532, - "rewards/accuracy_reward": 0.4742950351908803, - "rewards/format_reward": 0.9651786118745804, - "step": 305 - }, - { - "completion_length": 81.43750381469727, - "epoch": 0.6613333333333333, - "grad_norm": 4.2583770751953125, - "kl": 32.16533203125, - "learning_rate": 6.181785074923778e-06, - "loss": 4.7213, - "reward": 1.348479652404785, - "reward_std": 0.2050942711532116, - "rewards/accuracy_reward": 0.392229587584734, - "rewards/format_reward": 0.9562500327825546, + "completion_length": 63.45536079406738, + "epoch": 0.2891791044776119, + "grad_norm": 1.266472578048706, + "kl": 0.04791259765625, + "learning_rate": 9.94027502150393e-07, + "loss": 0.0252, + "reward": 1.6061357140541077, + "reward_std": 0.22739854846149682, + "rewards/accuracy_reward": 0.6204214036464691, + "rewards/format_reward": 0.9857142925262451, "step": 310 }, { - "completion_length": 63.128573989868165, - "epoch": 0.672, - "grad_norm": 0.9833464622497559, - "kl": 0.66259765625, - "learning_rate": 5.839680035587061e-06, - "loss": 0.2127, - "reward": 1.3414492011070251, - "reward_std": 0.1257700956426561, - "rewards/accuracy_reward": 0.363770566880703, - "rewards/format_reward": 0.9776786029338836, - "step": 315 - }, - { - "completion_length": 68.11607437133789, - "epoch": 0.6826666666666666, - "grad_norm": 7.631124496459961, - "kl": 1.7080078125, - "learning_rate": 5.503365974214059e-06, - "loss": 0.4633, - "reward": 1.432806944847107, - "reward_std": 0.1586287125945091, - "rewards/accuracy_reward": 0.4649497330188751, - "rewards/format_reward": 0.9678571701049805, + "completion_length": 61.62143135070801, + "epoch": 0.29850746268656714, + "grad_norm": 1.3716604709625244, + "kl": 0.0652587890625, + "learning_rate": 9.92707182947153e-07, + "loss": 0.004, + "reward": 1.6570343613624572, + "reward_std": 0.30255255550146104, + "rewards/accuracy_reward": 0.6856057047843933, + "rewards/format_reward": 0.9714285850524902, "step": 320 }, { - "completion_length": 58.20446720123291, - "epoch": 0.6933333333333334, - "grad_norm": 1.8391218185424805, - "kl": 0.525537109375, - "learning_rate": 5.173311024826916e-06, - "loss": 0.1086, - "reward": 1.3823483526706695, - "reward_std": 0.0876714764162898, - "rewards/accuracy_reward": 0.3957411389797926, - "rewards/format_reward": 0.9866071581840515, - "step": 325 - }, - { - "completion_length": 59.60357398986817, - "epoch": 0.704, - "grad_norm": 0.7015554308891296, - "kl": 0.5265625, - "learning_rate": 4.849974609045849e-06, - "loss": 0.1281, - "reward": 1.4378543078899384, - "reward_std": 0.07850976921617984, - "rewards/accuracy_reward": 0.4458899348974228, - "rewards/format_reward": 0.9919643044471741, + "completion_length": 61.96428909301758, + "epoch": 0.30783582089552236, + "grad_norm": 1.6377546787261963, + "kl": 0.061328125, + "learning_rate": 9.912561821013702e-07, + "loss": 0.0088, + "reward": 1.6331152200698853, + "reward_std": 0.27604506313800814, + "rewards/accuracy_reward": 0.6652580350637436, + "rewards/format_reward": 0.9678571581840515, "step": 330 }, { - "completion_length": 59.381252479553225, - "epoch": 0.7146666666666667, - "grad_norm": 0.9814135432243347, - "kl": 0.4943359375, - "learning_rate": 4.533806796594989e-06, - "loss": 0.1291, - "reward": 1.4527435183525086, - "reward_std": 0.08425388187170028, - "rewards/accuracy_reward": 0.4616720281541348, - "rewards/format_reward": 0.9910714507102967, - "step": 335 - }, - { - "completion_length": 63.149109840393066, - "epoch": 0.7253333333333334, - "grad_norm": 5.584976673126221, - "kl": 0.45751953125, - "learning_rate": 4.2252476788255735e-06, - "loss": 0.1251, - "reward": 1.4177719175815582, - "reward_std": 0.1328437015414238, - "rewards/accuracy_reward": 0.4320575550198555, - "rewards/format_reward": 0.9857143104076386, + "completion_length": 67.14464683532715, + "epoch": 0.31716417910447764, + "grad_norm": 1.6840460300445557, + "kl": 0.0669677734375, + "learning_rate": 9.896748844646973e-07, + "loss": 0.0098, + "reward": 1.5535342574119568, + "reward_std": 0.3371937394142151, + "rewards/accuracy_reward": 0.5785342127084732, + "rewards/format_reward": 0.975000011920929, "step": 340 }, { - "completion_length": 63.60714550018311, - "epoch": 0.736, - "grad_norm": 1.5182867050170898, - "kl": 0.6908203125, - "learning_rate": 3.924726756128632e-06, - "loss": 0.1723, - "reward": 1.4673471570014953, - "reward_std": 0.11950606107711792, - "rewards/accuracy_reward": 0.47895423248410224, - "rewards/format_reward": 0.9883928805589676, - "step": 345 - }, - { - "completion_length": 64.90714607238769, - "epoch": 0.7466666666666667, - "grad_norm": 3.1805872917175293, - "kl": 0.72119140625, - "learning_rate": 3.6326623400897797e-06, - "loss": 0.1866, - "reward": 1.4130909383296966, - "reward_std": 0.1138463762239553, - "rewards/accuracy_reward": 0.4300551608204842, - "rewards/format_reward": 0.9830357372760773, + "completion_length": 67.35714607238769, + "epoch": 0.32649253731343286, + "grad_norm": 1.7342702150344849, + "kl": 0.0959228515625, + "learning_rate": 9.87963709447648e-07, + "loss": 0.0064, + "reward": 1.604421865940094, + "reward_std": 0.40422345995903014, + "rewards/accuracy_reward": 0.6472789824008942, + "rewards/format_reward": 0.9571428775787354, "step": 350 }, { - "completion_length": 62.03928833007812, - "epoch": 0.7573333333333333, - "grad_norm": 2.188754081726074, - "kl": 0.65986328125, - "learning_rate": 3.3494609712183323e-06, - "loss": 0.1627, - "reward": 1.4096969187259674, - "reward_std": 0.1287734190002084, - "rewards/accuracy_reward": 0.42666114717721937, - "rewards/format_reward": 0.9830357521772385, - "step": 355 - }, - { - "completion_length": 64.46964530944824, - "epoch": 0.768, - "grad_norm": 1.6737561225891113, - "kl": 0.46240234375, - "learning_rate": 3.0755168530612444e-06, - "loss": 0.151, - "reward": 1.4208829939365386, - "reward_std": 0.133208098821342, - "rewards/accuracy_reward": 0.4414186552166939, - "rewards/format_reward": 0.9794643223285675, + "completion_length": 69.5875030517578, + "epoch": 0.3358208955223881, + "grad_norm": 1.869360089302063, + "kl": 0.05755615234375, + "learning_rate": 9.86123110908355e-07, + "loss": 0.0305, + "reward": 1.583025586605072, + "reward_std": 0.3722677960991859, + "rewards/accuracy_reward": 0.6258826851844788, + "rewards/format_reward": 0.9571428775787354, "step": 360 }, { - "completion_length": 53.77232398986816, - "epoch": 0.7786666666666666, - "grad_norm": 0.9620945453643799, - "kl": 0.52314453125, - "learning_rate": 2.8112113034895273e-06, - "loss": 0.072, - "reward": 1.4342491328716278, - "reward_std": 0.1155846749432385, - "rewards/accuracy_reward": 0.44496337226592003, - "rewards/format_reward": 0.9892857372760773, - "step": 365 - }, - { - "completion_length": 56.451788139343265, - "epoch": 0.7893333333333333, - "grad_norm": 2.126991033554077, - "kl": 0.45625, - "learning_rate": 2.5569122239209366e-06, - "loss": 0.1048, - "reward": 1.4586413204669952, - "reward_std": 0.1040911391377449, - "rewards/accuracy_reward": 0.47024838626384735, - "rewards/format_reward": 0.9883928805589676, + "completion_length": 64.60536041259766, + "epoch": 0.3451492537313433, + "grad_norm": 2.2044732570648193, + "kl": 0.065771484375, + "learning_rate": 9.841535770321918e-07, + "loss": 0.0147, + "reward": 1.6211875200271606, + "reward_std": 0.316946816444397, + "rewards/accuracy_reward": 0.6604732155799866, + "rewards/format_reward": 0.9607143044471741, "step": 370 }, { - "completion_length": 56.86071720123291, - "epoch": 0.8, - "grad_norm": 7.045074939727783, - "kl": 0.62421875, - "learning_rate": 2.312973587217798e-06, - "loss": 0.0941, - "reward": 1.4210815012454987, - "reward_std": 0.09894018331542612, - "rewards/accuracy_reward": 0.4300100013613701, - "rewards/format_reward": 0.9910714507102967, - "step": 375 - }, - { - "completion_length": 62.136609649658205, - "epoch": 0.8106666666666666, - "grad_norm": 11.092679023742676, - "kl": 0.87412109375, - "learning_rate": 2.079734944972717e-06, - "loss": 0.2099, - "reward": 1.4036639988422395, - "reward_std": 0.12907396703958512, - "rewards/accuracy_reward": 0.4179496355354786, - "rewards/format_reward": 0.9857143223285675, + "completion_length": 70.96607475280761, + "epoch": 0.35447761194029853, + "grad_norm": 2.0380122661590576, + "kl": 0.058251953125, + "learning_rate": 9.820556302022914e-07, + "loss": 0.0245, + "reward": 1.502309012413025, + "reward_std": 0.3732792004942894, + "rewards/accuracy_reward": 0.5380232602357864, + "rewards/format_reward": 0.9642857313156128, "step": 380 }, { - "completion_length": 60.57321701049805, - "epoch": 0.8213333333333334, - "grad_norm": 1.1720848083496094, - "kl": 0.6015625, - "learning_rate": 1.8575209548680472e-06, - "loss": 0.1551, - "reward": 1.463521283864975, - "reward_std": 0.14698517890647053, - "rewards/accuracy_reward": 0.4778069369494915, - "rewards/format_reward": 0.9857143193483353, - "step": 385 - }, - { - "completion_length": 59.10446681976318, - "epoch": 0.832, - "grad_norm": 2.895113706588745, - "kl": 0.48857421875, - "learning_rate": 1.646640928767047e-06, - "loss": 0.1194, - "reward": 1.4841475129127502, - "reward_std": 0.12819948960095645, - "rewards/accuracy_reward": 0.4957545682787895, - "rewards/format_reward": 0.9883928805589676, + "completion_length": 71.89107513427734, + "epoch": 0.36380597014925375, + "grad_norm": 1.5634608268737793, + "kl": 0.04359130859375, + "learning_rate": 9.798298268609945e-07, + "loss": 0.0108, + "reward": 1.6344878315925597, + "reward_std": 0.2746693938970566, + "rewards/accuracy_reward": 0.6630592048168182, + "rewards/format_reward": 0.9714285850524902, "step": 390 }, { - "completion_length": 59.55178813934326, - "epoch": 0.8426666666666667, - "grad_norm": 5.176278591156006, - "kl": 0.54697265625, - "learning_rate": 1.447388402165686e-06, - "loss": 0.1119, - "reward": 1.4193052053451538, - "reward_std": 0.12172278184443712, - "rewards/accuracy_reward": 0.42912655863910915, - "rewards/format_reward": 0.9901785910129547, - "step": 395 - }, - { - "completion_length": 58.429466819763185, - "epoch": 0.8533333333333334, - "grad_norm": 0.599294900894165, - "kl": 0.41142578125, - "learning_rate": 1.2600407256044544e-06, - "loss": 0.0648, - "reward": 1.3848988234996795, - "reward_std": 0.09859151560813188, - "rewards/accuracy_reward": 0.3947202034294605, - "rewards/format_reward": 0.990178593993187, + "completion_length": 67.97678909301757, + "epoch": 0.373134328358209, + "grad_norm": 1.5492973327636719, + "kl": 0.049169921875, + "learning_rate": 9.774767573622604e-07, + "loss": -0.0056, + "reward": 1.6418705940246583, + "reward_std": 0.40805205404758454, + "rewards/accuracy_reward": 0.6775848090648651, + "rewards/format_reward": 0.9642857313156128, "step": 400 }, { - "epoch": 0.8533333333333334, - "eval_completion_length": 59.8646643789191, - "eval_kl": 0.4040655838815789, - "eval_loss": 0.05607712268829346, - "eval_reward": 1.257983226525156, - "eval_reward_std": 0.16123126015851372, - "eval_rewards/accuracy_reward": 0.2655019748367761, - "eval_rewards/format_reward": 0.9924812254152799, - "eval_runtime": 74.3306, - "eval_samples_per_second": 4.036, - "eval_steps_per_second": 0.081, - "step": 400 - }, - { - "completion_length": 56.616073799133304, - "epoch": 0.864, - "grad_norm": 2.1417887210845947, - "kl": 0.401025390625, - "learning_rate": 1.084858678608922e-06, - "loss": 0.0531, - "reward": 1.4679090082645416, - "reward_std": 0.1119827002286911, - "rewards/accuracy_reward": 0.47415893375873563, - "rewards/format_reward": 0.9937500178813934, - "step": 405 - }, - { - "completion_length": 57.13214511871338, - "epoch": 0.8746666666666667, - "grad_norm": 0.820955216884613, - "kl": 0.420703125, - "learning_rate": 9.220861066963715e-07, - "loss": 0.0668, - "reward": 1.4469731509685517, - "reward_std": 0.09708367697894574, - "rewards/accuracy_reward": 0.4532231085002422, - "rewards/format_reward": 0.9937500178813934, + "completion_length": 67.27143173217773, + "epoch": 0.3824626865671642, + "grad_norm": 1.479768991470337, + "kl": 0.04423828125, + "learning_rate": 9.749970458150892e-07, + "loss": 0.0044, + "reward": 1.6326549172401428, + "reward_std": 0.2752505071461201, + "rewards/accuracy_reward": 0.6647977441549301, + "rewards/format_reward": 0.9678571581840515, "step": 410 }, { - "completion_length": 57.99910984039307, - "epoch": 0.8853333333333333, - "grad_norm": 1.4979448318481445, - "kl": 0.424609375, - "learning_rate": 7.719495819538325e-07, - "loss": 0.0467, - "reward": 1.402563601732254, - "reward_std": 0.10260953474789858, - "rewards/accuracy_reward": 0.40702785216271875, - "rewards/format_reward": 0.9955357313156128, - "step": 415 - }, - { - "completion_length": 56.56785945892334, - "epoch": 0.896, - "grad_norm": 0.8355128765106201, - "kl": 0.41708984375, - "learning_rate": 6.346580876599395e-07, - "loss": 0.0486, - "reward": 1.4388800382614135, - "reward_std": 0.09468872109428048, - "rewards/accuracy_reward": 0.4424514189362526, - "rewards/format_reward": 0.9964285731315613, + "completion_length": 67.4732177734375, + "epoch": 0.3917910447761194, + "grad_norm": 1.4039720296859741, + "kl": 0.0602783203125, + "learning_rate": 9.723913499179864e-07, + "loss": -0.0015, + "reward": 1.6066879868507384, + "reward_std": 0.25821444392204285, + "rewards/accuracy_reward": 0.6424022316932678, + "rewards/format_reward": 0.9642857313156128, "step": 420 }, { - "completion_length": 55.70357437133789, - "epoch": 0.9066666666666666, - "grad_norm": 1.6477917432785034, - "kl": 0.376025390625, - "learning_rate": 5.104027273896239e-07, - "loss": 0.0342, - "reward": 1.4677242636680603, - "reward_std": 0.09204176338389516, - "rewards/accuracy_reward": 0.4712956376373768, - "rewards/format_reward": 0.9964285850524902, - "step": 425 - }, - { - "completion_length": 56.08839569091797, - "epoch": 0.9173333333333333, - "grad_norm": 0.8006694912910461, - "kl": 0.366943359375, - "learning_rate": 3.9935645900654906e-07, - "loss": 0.0466, - "reward": 1.4908641874790192, - "reward_std": 0.10689534619450569, - "rewards/accuracy_reward": 0.49711409360170367, - "rewards/format_reward": 0.993750023841858, + "completion_length": 69.50357513427734, + "epoch": 0.40111940298507465, + "grad_norm": 1.6743965148925781, + "kl": 0.0521240234375, + "learning_rate": 9.696603607845213e-07, + "loss": 0.0049, + "reward": 1.6788877964019775, + "reward_std": 0.21843514144420623, + "rewards/accuracy_reward": 0.696744903922081, + "rewards/format_reward": 0.9821428656578064, "step": 430 }, { - "completion_length": 62.13571701049805, - "epoch": 0.928, - "grad_norm": 1.3103991746902466, - "kl": 0.3810546875, - "learning_rate": 3.016738539135566e-07, - "loss": 0.0875, - "reward": 1.4480026721954347, - "reward_std": 0.12071084063500166, - "rewards/accuracy_reward": 0.45960976406931875, - "rewards/format_reward": 0.9883928894996643, - "step": 435 - }, - { - "completion_length": 59.050002098083496, - "epoch": 0.9386666666666666, - "grad_norm": 2.8908112049102783, - "kl": 0.422265625, - "learning_rate": 2.1749088189622848e-07, - "loss": 0.08, - "reward": 1.5175962269306182, - "reward_std": 0.13130544871091843, - "rewards/accuracy_reward": 0.5300961822271347, - "rewards/format_reward": 0.9875000357627869, + "completion_length": 73.39286003112792, + "epoch": 0.41044776119402987, + "grad_norm": 2.3262836933135986, + "kl": 0.0503662109375, + "learning_rate": 9.668048027600215e-07, + "loss": 0.0105, + "reward": 1.6075750589370728, + "reward_std": 0.2825287111103535, + "rewards/accuracy_reward": 0.6432893007993699, + "rewards/format_reward": 0.9642857313156128, "step": 440 }, { - "completion_length": 56.333037757873534, - "epoch": 0.9493333333333334, - "grad_norm": 0.7191564440727234, - "kl": 0.3669921875, - "learning_rate": 1.4692472185908635e-07, - "loss": 0.0393, - "reward": 1.4232155978679657, - "reward_std": 0.09858108786866068, - "rewards/accuracy_reward": 0.43125124275684357, - "rewards/format_reward": 0.991964316368103, - "step": 445 - }, - { - "completion_length": 60.06160945892334, - "epoch": 0.96, - "grad_norm": 2.495954990386963, - "kl": 0.41845703125, - "learning_rate": 9.00735987178214e-08, - "loss": 0.097, - "reward": 1.4348016560077668, - "reward_std": 0.12248040870763362, - "rewards/accuracy_reward": 0.4455158442258835, - "rewards/format_reward": 0.9892857432365417, + "completion_length": 72.74286041259765, + "epoch": 0.4197761194029851, + "grad_norm": 1.7251629829406738, + "kl": 0.0970703125, + "learning_rate": 9.63825433229453e-07, + "loss": 0.0266, + "reward": 1.518755042552948, + "reward_std": 0.33103890269994735, + "rewards/accuracy_reward": 0.5580406993627548, + "rewards/format_reward": 0.9607143044471741, "step": 450 }, { - "completion_length": 61.350003242492676, - "epoch": 0.9706666666666667, - "grad_norm": 1.4091302156448364, - "kl": 0.43271484375, - "learning_rate": 4.701664667464245e-08, - "loss": 0.0853, - "reward": 1.4209991097450256, - "reward_std": 0.1221369774080813, - "rewards/accuracy_reward": 0.43171332255005834, - "rewards/format_reward": 0.989285734295845, - "step": 455 - }, - { - "completion_length": 57.74553813934326, - "epoch": 0.9813333333333333, - "grad_norm": 1.536041498184204, - "kl": 0.43828125, - "learning_rate": 1.781379906703573e-08, - "loss": 0.0521, - "reward": 1.384695702791214, - "reward_std": 0.11102860439568758, - "rewards/accuracy_reward": 0.39451706781983376, - "rewards/format_reward": 0.9901785969734191, + "completion_length": 66.66250305175781, + "epoch": 0.4291044776119403, + "grad_norm": 1.6775908470153809, + "kl": 0.0539306640625, + "learning_rate": 9.607230424165377e-07, + "loss": 0.0261, + "reward": 1.7472254872322082, + "reward_std": 0.31838269531726837, + "rewards/accuracy_reward": 0.7757968813180923, + "rewards/format_reward": 0.9714285850524902, "step": 460 }, { - "completion_length": 61.55000305175781, - "epoch": 0.992, - "grad_norm": 1.4777039289474487, - "kl": 0.41337890625, - "learning_rate": 2.5057049432519744e-09, - "loss": 0.1045, - "reward": 1.4856867432594298, - "reward_std": 0.1303960378281772, - "rewards/accuracy_reward": 0.4981866620481014, - "rewards/format_reward": 0.9875000298023224, - "step": 465 - }, - { - "completion_length": 63.61012268066406, - "epoch": 0.9984, - "kl": 0.4064127604166667, - "reward": 1.4594595730304718, - "reward_std": 0.11608642246574163, - "rewards/accuracy_reward": 0.4713642696539561, - "rewards/format_reward": 0.9880952537059784, - "step": 468, + "completion_length": 67.56786041259765, + "epoch": 0.43843283582089554, + "grad_norm": 1.6923627853393555, + "kl": 0.077734375, + "learning_rate": 9.574984531741611e-07, + "loss": 0.0047, + "reward": 1.7525209188461304, + "reward_std": 0.3508969932794571, + "rewards/accuracy_reward": 0.7810923129320144, + "rewards/format_reward": 0.9714285850524902, + "step": 470 + }, + { + "completion_length": 65.32321662902832, + "epoch": 0.44776119402985076, + "grad_norm": 1.5190156698226929, + "kl": 0.06884765625, + "learning_rate": 9.54152520766126e-07, + "loss": 0.0059, + "reward": 1.766723918914795, + "reward_std": 0.37145484536886214, + "rewards/accuracy_reward": 0.7952953040599823, + "rewards/format_reward": 0.9714285850524902, + "step": 480 + }, + { + "completion_length": 69.62678909301758, + "epoch": 0.457089552238806, + "grad_norm": 1.7992936372756958, + "kl": 0.056884765625, + "learning_rate": 9.506861326403078e-07, + "loss": 0.0076, + "reward": 1.7028631448745728, + "reward_std": 0.2720485903322697, + "rewards/accuracy_reward": 0.7350059449672699, + "rewards/format_reward": 0.9678571581840515, + "step": 490 + }, + { + "completion_length": 69.01786041259766, + "epoch": 0.4664179104477612, + "grad_norm": 1.5572190284729004, + "kl": 0.0785400390625, + "learning_rate": 9.471002081932765e-07, + "loss": -0.0034, + "reward": 1.6342598915100097, + "reward_std": 0.3042344495654106, + "rewards/accuracy_reward": 0.6664026975631714, + "rewards/format_reward": 0.9678571581840515, + "step": 500 + }, + { + "completion_length": 67.47857551574707, + "epoch": 0.47574626865671643, + "grad_norm": 1.720417857170105, + "kl": 0.0673583984375, + "learning_rate": 9.433956985264429e-07, + "loss": 0.0443, + "reward": 1.6883746266365052, + "reward_std": 0.3526159428060055, + "rewards/accuracy_reward": 0.7348031461238861, + "rewards/format_reward": 0.9535714507102966, + "step": 510 + }, + { + "completion_length": 67.19643173217773, + "epoch": 0.48507462686567165, + "grad_norm": 2.2320525646209717, + "kl": 0.0647216796875, + "learning_rate": 9.395735861937961e-07, + "loss": 0.0205, + "reward": 1.5947609901428224, + "reward_std": 0.36155851259827615, + "rewards/accuracy_reward": 0.6376180469989776, + "rewards/format_reward": 0.9571428775787354, + "step": 520 + }, + { + "completion_length": 67.32678909301758, + "epoch": 0.4944029850746269, + "grad_norm": 1.879781723022461, + "kl": 0.0639404296875, + "learning_rate": 9.35634884941299e-07, + "loss": 0.0091, + "reward": 1.6876210808753966, + "reward_std": 0.3087269324809313, + "rewards/accuracy_reward": 0.705478173494339, + "rewards/format_reward": 0.9821428656578064, + "step": 530 + }, + { + "completion_length": 63.63214683532715, + "epoch": 0.503731343283582, + "grad_norm": 1.3112099170684814, + "kl": 0.0567626953125, + "learning_rate": 9.315806394380106e-07, + "loss": 0.0068, + "reward": 1.6719667196273804, + "reward_std": 0.33818312287330626, + "rewards/accuracy_reward": 0.7041095197200775, + "rewards/format_reward": 0.9678571581840515, + "step": 540 + }, + { + "completion_length": 64.35714569091797, + "epoch": 0.5130597014925373, + "grad_norm": 1.8375056982040405, + "kl": 0.0653076171875, + "learning_rate": 9.274119249990065e-07, + "loss": 0.0127, + "reward": 1.7698135852813721, + "reward_std": 0.2920295000076294, + "rewards/accuracy_reward": 0.8019563972949981, + "rewards/format_reward": 0.9678571581840515, + "step": 550 + }, + { + "completion_length": 68.70536003112792, + "epoch": 0.5223880597014925, + "grad_norm": 1.9574058055877686, + "kl": 0.05888671875, + "learning_rate": 9.231298473001708e-07, + "loss": 0.0036, + "reward": 1.7129918217658997, + "reward_std": 0.34599815756082536, + "rewards/accuracy_reward": 0.7344203501939773, + "rewards/format_reward": 0.9785714387893677, + "step": 560 + }, + { + "completion_length": 69.03214416503906, + "epoch": 0.5317164179104478, + "grad_norm": 1.8017107248306274, + "kl": 0.061181640625, + "learning_rate": 9.187355420849361e-07, + "loss": 0.0119, + "reward": 1.6503841280937195, + "reward_std": 0.3259289041161537, + "rewards/accuracy_reward": 0.693241173028946, + "rewards/format_reward": 0.9571428775787354, + "step": 570 + }, + { + "completion_length": 69.47857475280762, + "epoch": 0.5410447761194029, + "grad_norm": 2.017598867416382, + "kl": 0.10400390625, + "learning_rate": 9.142301748630477e-07, + "loss": 0.0123, + "reward": 1.6612332701683044, + "reward_std": 0.4310059420764446, + "rewards/accuracy_reward": 0.7255189776420593, + "rewards/format_reward": 0.935714316368103, + "step": 580 + }, + { + "completion_length": 68.25000305175782, + "epoch": 0.5503731343283582, + "grad_norm": 2.1093478202819824, + "kl": 0.0540283203125, + "learning_rate": 9.096149406014339e-07, + "loss": 0.0209, + "reward": 1.61421457529068, + "reward_std": 0.3157083109021187, + "rewards/accuracy_reward": 0.6356431007385254, + "rewards/format_reward": 0.9785714387893677, + "step": 590 + }, + { + "completion_length": 68.48750343322754, + "epoch": 0.5597014925373134, + "grad_norm": 2.3081631660461426, + "kl": 0.0652099609375, + "learning_rate": 9.048910634072616e-07, + "loss": 0.0296, + "reward": 1.7730529069900514, + "reward_std": 0.4064901053905487, + "rewards/accuracy_reward": 0.8051956534385681, + "rewards/format_reward": 0.9678571581840515, + "step": 600 + }, + { + "completion_length": 69.62857398986816, + "epoch": 0.5690298507462687, + "grad_norm": 1.806151270866394, + "kl": 0.0667724609375, + "learning_rate": 9.000597962032655e-07, + "loss": 0.0022, + "reward": 1.6536051154136657, + "reward_std": 0.34955830723047254, + "rewards/accuracy_reward": 0.6678907722234726, + "rewards/format_reward": 0.9857142925262451, + "step": 610 + }, + { + "completion_length": 70.45714492797852, + "epoch": 0.5783582089552238, + "grad_norm": 1.78762948513031, + "kl": 0.0666015625, + "learning_rate": 8.951224203954319e-07, + "loss": 0.0182, + "reward": 1.6154707312583922, + "reward_std": 0.39499356150627135, + "rewards/accuracy_reward": 0.6547564685344696, + "rewards/format_reward": 0.9607143044471741, + "step": 620 + }, + { + "completion_length": 68.52500228881836, + "epoch": 0.5876865671641791, + "grad_norm": 1.8373074531555176, + "kl": 0.0646240234375, + "learning_rate": 8.900802455331295e-07, + "loss": -0.0014, + "reward": 1.6453839540481567, + "reward_std": 0.2968112088739872, + "rewards/accuracy_reward": 0.6739553868770599, + "rewards/format_reward": 0.9714285850524902, + "step": 630 + }, + { + "completion_length": 71.13928833007813, + "epoch": 0.5970149253731343, + "grad_norm": 2.3494443893432617, + "kl": 0.065283203125, + "learning_rate": 8.849346089617754e-07, + "loss": 0.0255, + "reward": 1.6130225896835326, + "reward_std": 0.30624193586409093, + "rewards/accuracy_reward": 0.6630225509405137, + "rewards/format_reward": 0.9500000238418579, + "step": 640 + }, + { + "completion_length": 72.21786003112793, + "epoch": 0.6063432835820896, + "grad_norm": 2.3844680786132812, + "kl": 0.0684326171875, + "learning_rate": 8.79686875468128e-07, + "loss": 0.0046, + "reward": 1.7178043603897095, + "reward_std": 0.35469576716423035, + "rewards/accuracy_reward": 0.7392328917980194, + "rewards/format_reward": 0.9785714387893677, + "step": 650 + }, + { + "completion_length": 72.70536041259766, + "epoch": 0.6156716417910447, + "grad_norm": 1.4608559608459473, + "kl": 0.0635986328125, + "learning_rate": 8.74338436918302e-07, + "loss": 0.0023, + "reward": 1.6720715284347534, + "reward_std": 0.36648082435131074, + "rewards/accuracy_reward": 0.6970714896917343, + "rewards/format_reward": 0.975000011920929, + "step": 660 + }, + { + "completion_length": 75.94464569091797, + "epoch": 0.625, + "grad_norm": 2.298766851425171, + "kl": 0.085693359375, + "learning_rate": 8.688907118886022e-07, + "loss": 0.0401, + "reward": 1.6139689564704895, + "reward_std": 0.47850009202957156, + "rewards/accuracy_reward": 0.6818261086940766, + "rewards/format_reward": 0.9321428894996643, + "step": 670 + }, + { + "completion_length": 73.01786041259766, + "epoch": 0.6343283582089553, + "grad_norm": 1.7089123725891113, + "kl": 0.070751953125, + "learning_rate": 8.633451452892706e-07, + "loss": 0.0188, + "reward": 1.705644690990448, + "reward_std": 0.33403306007385253, + "rewards/accuracy_reward": 0.7449303567409515, + "rewards/format_reward": 0.9607143044471741, + "step": 680 + }, + { + "completion_length": 71.15535964965821, + "epoch": 0.6436567164179104, + "grad_norm": 1.5410668849945068, + "kl": 0.1378173828125, + "learning_rate": 8.577032079812508e-07, + "loss": 0.0244, + "reward": 1.7658620357513428, + "reward_std": 0.3162283442914486, + "rewards/accuracy_reward": 0.7944333791732788, + "rewards/format_reward": 0.9714285850524902, + "step": 690 + }, + { + "completion_length": 70.04107437133788, + "epoch": 0.6529850746268657, + "grad_norm": 1.481061577796936, + "kl": 0.0783447265625, + "learning_rate": 8.519663963860686e-07, + "loss": 0.0155, + "reward": 1.8003361344337463, + "reward_std": 0.34957138299942014, + "rewards/accuracy_reward": 0.8467646658420562, + "rewards/format_reward": 0.9535714507102966, + "step": 700 + }, + { + "completion_length": 75.01786041259766, + "epoch": 0.6623134328358209, + "grad_norm": 1.845693826675415, + "kl": 0.067138671875, + "learning_rate": 8.461362320889337e-07, + "loss": 0.035, + "reward": 1.5683961391448975, + "reward_std": 0.3354021891951561, + "rewards/accuracy_reward": 0.6326818078756332, + "rewards/format_reward": 0.935714316368103, + "step": 710 + }, + { + "completion_length": 69.06786041259765, + "epoch": 0.6716417910447762, + "grad_norm": 2.0775063037872314, + "kl": 0.0607421875, + "learning_rate": 8.402142614351664e-07, + "loss": 0.0032, + "reward": 1.591350018978119, + "reward_std": 0.2729812040925026, + "rewards/accuracy_reward": 0.6199213981628418, + "rewards/format_reward": 0.9714285850524902, + "step": 720 + }, + { + "completion_length": 67.36786003112793, + "epoch": 0.6809701492537313, + "grad_norm": 1.322515606880188, + "kl": 0.07421875, + "learning_rate": 8.342020551200577e-07, + "loss": 0.0268, + "reward": 1.8004352688789367, + "reward_std": 0.2811557039618492, + "rewards/accuracy_reward": 0.8147209256887435, + "rewards/format_reward": 0.9857142925262451, + "step": 730 + }, + { + "completion_length": 69.08035926818847, + "epoch": 0.6902985074626866, + "grad_norm": 1.9179843664169312, + "kl": 8.136181640625, + "learning_rate": 8.281012077722711e-07, + "loss": 0.5182, + "reward": 1.6437391877174377, + "reward_std": 0.3600235849618912, + "rewards/accuracy_reward": 0.6865962952375412, + "rewards/format_reward": 0.9571428775787354, + "step": 740 + }, + { + "completion_length": 70.78393173217773, + "epoch": 0.6996268656716418, + "grad_norm": 0.5337070226669312, + "kl": 0.0772705078125, + "learning_rate": 8.219133375308958e-07, + "loss": 0.0034, + "reward": 1.654793643951416, + "reward_std": 0.28940345235168935, + "rewards/accuracy_reward": 0.6833649843931198, + "rewards/format_reward": 0.9714285850524902, + "step": 750 + }, + { + "completion_length": 72.35714569091797, + "epoch": 0.7089552238805971, + "grad_norm": 1.4123231172561646, + "kl": 0.0892822265625, + "learning_rate": 8.156400856162649e-07, + "loss": 0.025, + "reward": 1.8219229459762574, + "reward_std": 0.35138517022132876, + "rewards/accuracy_reward": 0.85763720870018, + "rewards/format_reward": 0.9642857313156128, + "step": 760 + }, + { + "completion_length": 69.29643096923829, + "epoch": 0.7182835820895522, + "grad_norm": 1.3444018363952637, + "kl": 0.076171875, + "learning_rate": 8.09283115894652e-07, + "loss": 0.0271, + "reward": 1.7166542887687684, + "reward_std": 0.30117630362510683, + "rewards/accuracy_reward": 0.7345113605260849, + "rewards/format_reward": 0.9821428656578064, + "step": 770 + }, + { + "completion_length": 72.51250381469727, + "epoch": 0.7276119402985075, + "grad_norm": 1.8944647312164307, + "kl": 0.09345703125, + "learning_rate": 8.028441144369595e-07, + "loss": 0.0448, + "reward": 1.6563441157341003, + "reward_std": 0.43048029839992524, + "rewards/accuracy_reward": 0.6992011934518814, + "rewards/format_reward": 0.9571428775787354, + "step": 780 + }, + { + "completion_length": 70.47321739196778, + "epoch": 0.7369402985074627, + "grad_norm": 1.7745001316070557, + "kl": 0.0919921875, + "learning_rate": 7.963247890715207e-07, + "loss": 0.0444, + "reward": 1.7815419554710388, + "reward_std": 0.3929228842258453, + "rewards/accuracy_reward": 0.8315418988466263, + "rewards/format_reward": 0.9500000238418579, + "step": 790 + }, + { + "completion_length": 72.16964607238769, + "epoch": 0.746268656716418, + "grad_norm": 1.974408745765686, + "kl": 0.0925048828125, + "learning_rate": 7.897268689311277e-07, + "loss": 0.0183, + "reward": 1.7186100244522096, + "reward_std": 0.3842825159430504, + "rewards/accuracy_reward": 0.7757528483867645, + "rewards/format_reward": 0.9428571701049805, + "step": 800 + }, + { + "completion_length": 67.99464569091796, + "epoch": 0.7555970149253731, + "grad_norm": 2.2044825553894043, + "kl": 0.0998046875, + "learning_rate": 7.830521039944111e-07, + "loss": 0.022, + "reward": 1.7123480796813966, + "reward_std": 0.41690561175346375, + "rewards/accuracy_reward": 0.7516337633132935, + "rewards/format_reward": 0.9607143044471741, + "step": 810 + }, + { + "completion_length": 68.9446460723877, + "epoch": 0.7649253731343284, + "grad_norm": 2.256535530090332, + "kl": 0.117919921875, + "learning_rate": 7.763022646216898e-07, + "loss": 0.0425, + "reward": 1.6642390131950378, + "reward_std": 0.40519198030233383, + "rewards/accuracy_reward": 0.717810434103012, + "rewards/format_reward": 0.9464285969734192, + "step": 820 + }, + { + "completion_length": 65.02678985595703, + "epoch": 0.7742537313432836, + "grad_norm": 3.2883059978485107, + "kl": 0.1132568359375, + "learning_rate": 7.69479141085415e-07, + "loss": 0.0365, + "reward": 1.7119892239570618, + "reward_std": 0.42224171608686445, + "rewards/accuracy_reward": 0.7691320240497589, + "rewards/format_reward": 0.9428571701049805, + "step": 830 + }, + { + "completion_length": 61.841073989868164, + "epoch": 0.7835820895522388, + "grad_norm": 1.8810310363769531, + "kl": 0.1488525390625, + "learning_rate": 7.62584543095333e-07, + "loss": 0.0248, + "reward": 1.7832204461097718, + "reward_std": 0.3654919177293777, + "rewards/accuracy_reward": 0.8117918312549591, + "rewards/format_reward": 0.9714285850524902, + "step": 840 + }, + { + "completion_length": 61.98393096923828, + "epoch": 0.792910447761194, + "grad_norm": 1.9467182159423828, + "kl": 0.1082763671875, + "learning_rate": 7.556202993184919e-07, + "loss": 0.0256, + "reward": 1.7194927215576172, + "reward_std": 0.3814562723040581, + "rewards/accuracy_reward": 0.7552069336175918, + "rewards/format_reward": 0.9642857313156128, + "step": 850 + }, + { + "completion_length": 66.74464569091796, + "epoch": 0.8022388059701493, + "grad_norm": 2.014716863632202, + "kl": 0.08974609375, + "learning_rate": 7.485882568942221e-07, + "loss": 0.0456, + "reward": 1.6012683391571045, + "reward_std": 0.40750244110822675, + "rewards/accuracy_reward": 0.6476968169212342, + "rewards/format_reward": 0.9535714507102966, + "step": 860 + }, + { + "completion_length": 66.23214721679688, + "epoch": 0.8115671641791045, + "grad_norm": 2.129955768585205, + "kl": 0.093798828125, + "learning_rate": 7.414902809442152e-07, + "loss": 0.0281, + "reward": 1.6679976344108582, + "reward_std": 0.34511433243751527, + "rewards/accuracy_reward": 0.7144261717796325, + "rewards/format_reward": 0.9535714507102966, + "step": 870 + }, + { + "completion_length": 67.16071701049805, + "epoch": 0.8208955223880597, + "grad_norm": 2.186460494995117, + "kl": 0.0970703125, + "learning_rate": 7.34328254077834e-07, + "loss": 0.0361, + "reward": 1.6634240865707397, + "reward_std": 0.37089474499225616, + "rewards/accuracy_reward": 0.7205668687820435, + "rewards/format_reward": 0.9428571701049805, + "step": 880 + }, + { + "completion_length": 65.0785743713379, + "epoch": 0.8302238805970149, + "grad_norm": 1.7805360555648804, + "kl": 0.110400390625, + "learning_rate": 7.271040758927851e-07, + "loss": 0.0441, + "reward": 1.8284477829933166, + "reward_std": 0.4117472216486931, + "rewards/accuracy_reward": 0.8641619980335236, + "rewards/format_reward": 0.9642857313156128, + "step": 890 + }, + { + "completion_length": 70.82321891784667, + "epoch": 0.8395522388059702, + "grad_norm": 2.1508734226226807, + "kl": 0.1005615234375, + "learning_rate": 7.198196624712854e-07, + "loss": 0.016, + "reward": 1.8012970447540284, + "reward_std": 0.3139400020241737, + "rewards/accuracy_reward": 0.8227256000041961, + "rewards/format_reward": 0.9785714387893677, + "step": 900 + }, + { + "completion_length": 74.9428611755371, + "epoch": 0.8488805970149254, + "grad_norm": 2.10746693611145, + "kl": 0.08759765625, + "learning_rate": 7.124769458718553e-07, + "loss": 0.056, + "reward": 1.5954893708229065, + "reward_std": 0.34964061081409453, + "rewards/accuracy_reward": 0.6454893425107002, + "rewards/format_reward": 0.9500000238418579, + "step": 910 + }, + { + "completion_length": 75.12500267028808, + "epoch": 0.8582089552238806, + "grad_norm": 2.46114444732666, + "kl": 0.127734375, + "learning_rate": 7.050778736168757e-07, + "loss": 0.0763, + "reward": 1.6531456112861633, + "reward_std": 0.43803721368312837, + "rewards/accuracy_reward": 0.7067169964313507, + "rewards/format_reward": 0.9464285969734192, + "step": 920 + }, + { + "completion_length": 71.69286041259765, + "epoch": 0.8675373134328358, + "grad_norm": 1.798766851425171, + "kl": 0.0821044921875, + "learning_rate": 6.976244081760421e-07, + "loss": 0.0107, + "reward": 1.6478505611419678, + "reward_std": 0.30902823358774184, + "rewards/accuracy_reward": 0.6799933612346649, + "rewards/format_reward": 0.9678571581840515, + "step": 930 + }, + { + "completion_length": 74.23929138183594, + "epoch": 0.8768656716417911, + "grad_norm": 1.387108564376831, + "kl": 0.082666015625, + "learning_rate": 6.90118526445857e-07, + "loss": 0.0423, + "reward": 1.7309996485710144, + "reward_std": 0.32146010994911195, + "rewards/accuracy_reward": 0.759571023285389, + "rewards/format_reward": 0.9714285850524902, + "step": 940 + }, + { + "completion_length": 68.40893211364747, + "epoch": 0.8861940298507462, + "grad_norm": 2.0120604038238525, + "kl": 0.0880615234375, + "learning_rate": 6.825622192252921e-07, + "loss": 0.0292, + "reward": 1.8234087228775024, + "reward_std": 0.3823896735906601, + "rewards/accuracy_reward": 0.855551540851593, + "rewards/format_reward": 0.9678571581840515, + "step": 950 + }, + { + "completion_length": 73.71428909301758, + "epoch": 0.8955223880597015, + "grad_norm": 1.7282947301864624, + "kl": 0.102880859375, + "learning_rate": 6.749574906877657e-07, + "loss": 0.0738, + "reward": 1.68308025598526, + "reward_std": 0.30713263638317584, + "rewards/accuracy_reward": 0.7152230381965637, + "rewards/format_reward": 0.9678571581840515, + "step": 960 + }, + { + "completion_length": 70.13393173217773, + "epoch": 0.9048507462686567, + "grad_norm": 2.129154920578003, + "kl": 0.0845703125, + "learning_rate": 6.673063578495724e-07, + "loss": 0.0577, + "reward": 1.7846791386604308, + "reward_std": 0.3723163902759552, + "rewards/accuracy_reward": 0.8168219566345215, + "rewards/format_reward": 0.9678571581840515, + "step": 970 + }, + { + "completion_length": 69.42321739196777, + "epoch": 0.914179104477612, + "grad_norm": 1.6752643585205078, + "kl": 0.112060546875, + "learning_rate": 6.596108500349054e-07, + "loss": 0.041, + "reward": 1.7457309246063233, + "reward_std": 0.31748437383212147, + "rewards/accuracy_reward": 0.777873745560646, + "rewards/format_reward": 0.9678571581840515, + "step": 980 + }, + { + "completion_length": 73.28928833007812, + "epoch": 0.9235074626865671, + "grad_norm": 2.0198373794555664, + "kl": 0.093896484375, + "learning_rate": 6.518730083376159e-07, + "loss": 0.0537, + "reward": 1.7607192754745484, + "reward_std": 0.34001109227538107, + "rewards/accuracy_reward": 0.8000049650669098, + "rewards/format_reward": 0.9607143044471741, + "step": 990 + }, + { + "completion_length": 69.72500381469726, + "epoch": 0.9328358208955224, + "grad_norm": 2.3777832984924316, + "kl": 0.09541015625, + "learning_rate": 6.440948850798489e-07, + "loss": 0.0244, + "reward": 1.8093934774398803, + "reward_std": 0.38601240515708923, + "rewards/accuracy_reward": 0.852250587940216, + "rewards/format_reward": 0.9571428775787354, + "step": 1000 + }, + { + "completion_length": 67.48928871154786, + "epoch": 0.9421641791044776, + "grad_norm": 2.5734946727752686, + "kl": 0.120947265625, + "learning_rate": 6.362785432677031e-07, + "loss": 0.0295, + "reward": 1.7467341661453246, + "reward_std": 0.3131743848323822, + "rewards/accuracy_reward": 0.7717340975999832, + "rewards/format_reward": 0.975000011920929, + "step": 1010 + }, + { + "completion_length": 70.03214569091797, + "epoch": 0.9514925373134329, + "grad_norm": 1.714362621307373, + "kl": 0.70224609375, + "learning_rate": 6.284260560440555e-07, + "loss": 0.1081, + "reward": 1.706698751449585, + "reward_std": 0.3540080994367599, + "rewards/accuracy_reward": 0.7531272619962692, + "rewards/format_reward": 0.9535714507102966, + "step": 1020 + }, + { + "completion_length": 72.99643173217774, + "epoch": 0.960820895522388, + "grad_norm": 2.4071664810180664, + "kl": 0.10888671875, + "learning_rate": 6.205395061386977e-07, + "loss": 0.0534, + "reward": 1.7199920177459718, + "reward_std": 0.3448140546679497, + "rewards/accuracy_reward": 0.7557062715291977, + "rewards/format_reward": 0.9642857313156128, + "step": 1030 + }, + { + "completion_length": 76.91786041259766, + "epoch": 0.9701492537313433, + "grad_norm": 1.9239792823791504, + "kl": 0.10830078125, + "learning_rate": 6.126209853159292e-07, + "loss": 0.0593, + "reward": 1.6530640482902528, + "reward_std": 0.3546563595533371, + "rewards/accuracy_reward": 0.7102068483829498, + "rewards/format_reward": 0.9428571701049805, + "step": 1040 + }, + { + "completion_length": 70.87143211364746, + "epoch": 0.9794776119402985, + "grad_norm": 1.4151670932769775, + "kl": 0.119775390625, + "learning_rate": 6.046725938197562e-07, + "loss": 0.0386, + "reward": 1.5671744227409363, + "reward_std": 0.28503450006246567, + "rewards/accuracy_reward": 0.6171743661165238, + "rewards/format_reward": 0.9500000238418579, + "step": 1050 + }, + { + "completion_length": 66.54107513427735, + "epoch": 0.9888059701492538, + "grad_norm": 1.8404815196990967, + "kl": 0.17099609375, + "learning_rate": 5.966964398168388e-07, + "loss": 0.0115, + "reward": 1.7105038046836853, + "reward_std": 0.3218017935752869, + "rewards/accuracy_reward": 0.7497894197702408, + "rewards/format_reward": 0.9607143044471741, + "step": 1060 + }, + { + "completion_length": 70.21071739196778, + "epoch": 0.9981343283582089, + "grad_norm": 2.08766770362854, + "kl": 0.127783203125, + "learning_rate": 5.886946388373387e-07, + "loss": 0.0594, + "reward": 1.7187183141708373, + "reward_std": 0.3648382157087326, + "rewards/accuracy_reward": 0.7544325530529022, + "rewards/format_reward": 0.9642857313156128, + "step": 1070 + }, + { + "epoch": 1.0, + "eval_completion_length": 75.61289332463191, + "eval_kl": 0.09996619591346154, + "eval_loss": 0.06692659854888916, + "eval_reward": 1.4602800424282367, + "eval_reward_std": 0.42169225674409133, + "eval_rewards/accuracy_reward": 0.5014887635524456, + "eval_rewards/format_reward": 0.9587912467809824, + "eval_runtime": 47.6825, + "eval_samples_per_second": 6.292, + "eval_steps_per_second": 0.063, + "step": 1072 + }, + { + "completion_length": 70.44866466522217, + "epoch": 1.007462686567164, + "grad_norm": 2.2260494232177734, + "kl": 0.1055908203125, + "learning_rate": 5.806693132138119e-07, + "loss": 0.0211, + "reward": 1.602527216076851, + "reward_std": 0.34793712198734283, + "rewards/accuracy_reward": 0.6516343206167221, + "rewards/format_reward": 0.9508928805589676, + "step": 1080 + }, + { + "completion_length": 75.92857513427734, + "epoch": 1.0167910447761195, + "grad_norm": 3.099503755569458, + "kl": 0.157080078125, + "learning_rate": 5.72622591518301e-07, + "loss": 0.0704, + "reward": 1.6265796184539796, + "reward_std": 0.4228915572166443, + "rewards/accuracy_reward": 0.6872938483953476, + "rewards/format_reward": 0.9392857432365418, + "step": 1090 + }, + { + "completion_length": 79.1178596496582, + "epoch": 1.0261194029850746, + "grad_norm": 1.8064438104629517, + "kl": 0.111572265625, + "learning_rate": 5.645566079977671e-07, + "loss": 0.0929, + "reward": 1.6500928401947021, + "reward_std": 0.3845163181424141, + "rewards/accuracy_reward": 0.710807067155838, + "rewards/format_reward": 0.9392857432365418, + "step": 1100 + }, + { + "completion_length": 69.94464645385742, + "epoch": 1.0354477611940298, + "grad_norm": 2.362318515777588, + "kl": 0.160107421875, + "learning_rate": 5.564735020080223e-07, + "loss": 0.0267, + "reward": 1.580272126197815, + "reward_std": 0.3825854122638702, + "rewards/accuracy_reward": 0.6267006665468215, + "rewards/format_reward": 0.9535714507102966, + "step": 1110 + }, + { + "completion_length": 76.14643249511718, + "epoch": 1.044776119402985, + "grad_norm": 2.1877493858337402, + "kl": 0.1645751953125, + "learning_rate": 5.48375417446302e-07, + "loss": 0.0467, + "reward": 1.584252393245697, + "reward_std": 0.3610908523201942, + "rewards/accuracy_reward": 0.6413952112197876, + "rewards/format_reward": 0.9428571701049805, + "step": 1120 + }, + { + "completion_length": 70.25536041259765, + "epoch": 1.0541044776119404, + "grad_norm": 2.7232489585876465, + "kl": 0.125830078125, + "learning_rate": 5.402645021826366e-07, + "loss": 0.0578, + "reward": 1.772194254398346, + "reward_std": 0.40680772215127947, + "rewards/accuracy_reward": 0.822194242477417, + "rewards/format_reward": 0.9500000238418579, + "step": 1130 + }, + { + "completion_length": 65.6017879486084, + "epoch": 1.0634328358208955, + "grad_norm": 2.4742653369903564, + "kl": 0.117236328125, + "learning_rate": 5.321429074901677e-07, + "loss": 0.033, + "reward": 1.6377050042152406, + "reward_std": 0.4377037823200226, + "rewards/accuracy_reward": 0.71627636551857, + "rewards/format_reward": 0.9214286088943482, + "step": 1140 + }, + { + "completion_length": 70.41428909301757, + "epoch": 1.0727611940298507, + "grad_norm": 1.614649772644043, + "kl": 0.133837890625, + "learning_rate": 5.240127874745607e-07, + "loss": 0.0438, + "reward": 1.5852539539337158, + "reward_std": 0.39130024015903475, + "rewards/accuracy_reward": 0.6495396479964256, + "rewards/format_reward": 0.9357143104076385, + "step": 1150 + }, + { + "completion_length": 77.19286155700684, + "epoch": 1.0820895522388059, + "grad_norm": 1.8157048225402832, + "kl": 0.21376953125, + "learning_rate": 5.158762985026694e-07, + "loss": 0.0551, + "reward": 1.6831493973731995, + "reward_std": 0.41253395304083823, + "rewards/accuracy_reward": 0.7402921438217163, + "rewards/format_reward": 0.9428571701049805, + "step": 1160 + }, + { + "completion_length": 76.68214645385743, + "epoch": 1.0914179104477613, + "grad_norm": 2.1081531047821045, + "kl": 0.168408203125, + "learning_rate": 5.077355986305972e-07, + "loss": 0.0945, + "reward": 1.6710915088653564, + "reward_std": 0.3878686547279358, + "rewards/accuracy_reward": 0.7246629059314728, + "rewards/format_reward": 0.9464285969734192, + "step": 1170 + }, + { + "completion_length": 77.65536041259766, + "epoch": 1.1007462686567164, + "grad_norm": 6341.7412109375, + "kl": 12.6751953125, + "learning_rate": 4.995928470313124e-07, + "loss": 0.3487, + "reward": 1.6495628237724305, + "reward_std": 0.3766570270061493, + "rewards/accuracy_reward": 0.7209913581609726, + "rewards/format_reward": 0.9285714566707611, + "step": 1180 + }, + { + "completion_length": 74.65714645385742, + "epoch": 1.1100746268656716, + "grad_norm": 4.411093711853027, + "kl": 0.16279296875, + "learning_rate": 4.914502034219666e-07, + "loss": 0.0718, + "reward": 1.7520782113075257, + "reward_std": 0.44092509597539903, + "rewards/accuracy_reward": 0.8342210054397583, + "rewards/format_reward": 0.9178571760654449, + "step": 1190 + }, + { + "completion_length": 73.26071701049804, + "epoch": 1.1194029850746268, + "grad_norm": 2.536661148071289, + "kl": 0.17431640625, + "learning_rate": 4.833098274910694e-07, + "loss": 0.0623, + "reward": 1.7323001742362976, + "reward_std": 0.43993830531835554, + "rewards/accuracy_reward": 0.7894430190324784, + "rewards/format_reward": 0.9428571701049805, + "step": 1200 + }, + { + "completion_length": 75.08928985595703, + "epoch": 1.1287313432835822, + "grad_norm": 2.4216952323913574, + "kl": 0.14658203125, + "learning_rate": 4.7517387832566974e-07, + "loss": 0.0496, + "reward": 1.5848352193832398, + "reward_std": 0.41373440325260163, + "rewards/accuracy_reward": 0.6455494791269303, + "rewards/format_reward": 0.9392857372760772, + "step": 1210 + }, + { + "completion_length": 74.93214645385743, + "epoch": 1.1380597014925373, + "grad_norm": 1.8166048526763916, + "kl": 0.13095703125, + "learning_rate": 4.6704451383869697e-07, + "loss": 0.048, + "reward": 1.6902888536453247, + "reward_std": 0.3345677748322487, + "rewards/accuracy_reward": 0.726003086566925, + "rewards/format_reward": 0.9642857253551483, + "step": 1220 + }, + { + "completion_length": 81.00536041259765, + "epoch": 1.1473880597014925, + "grad_norm": 8.36291217803955, + "kl": 0.1490234375, + "learning_rate": 4.589238901966142e-07, + "loss": 0.0925, + "reward": 1.6237655639648438, + "reward_std": 0.3611886277794838, + "rewards/accuracy_reward": 0.6809083729982376, + "rewards/format_reward": 0.9428571701049805, + "step": 1230 + }, + { + "completion_length": 73.41786117553711, + "epoch": 1.1567164179104479, + "grad_norm": 2.9101083278656006, + "kl": 0.3484375, + "learning_rate": 4.5081416124753437e-07, + "loss": 0.0762, + "reward": 1.6043060779571534, + "reward_std": 0.4248610377311707, + "rewards/accuracy_reward": 0.6900202602148056, + "rewards/format_reward": 0.9142857551574707, + "step": 1240 + }, + { + "completion_length": 72.5267894744873, + "epoch": 1.166044776119403, + "grad_norm": 2.438911199569702, + "kl": 0.18359375, + "learning_rate": 4.4271747794994966e-07, + "loss": 0.0714, + "reward": 1.7026634693145752, + "reward_std": 0.4447076439857483, + "rewards/accuracy_reward": 0.7812348544597626, + "rewards/format_reward": 0.9214286029338836, + "step": 1250 + }, + { + "completion_length": 79.20535926818847, + "epoch": 1.1753731343283582, + "grad_norm": 8.309577941894531, + "kl": 0.183251953125, + "learning_rate": 4.3463598780223076e-07, + "loss": 0.073, + "reward": 1.5920994281768799, + "reward_std": 0.4000379353761673, + "rewards/accuracy_reward": 0.6528136610984803, + "rewards/format_reward": 0.9392857372760772, + "step": 1260 + }, + { + "completion_length": 69.58571662902833, + "epoch": 1.1847014925373134, + "grad_norm": 1.868142008781433, + "kl": 0.199267578125, + "learning_rate": 4.265718342730409e-07, + "loss": 0.012, + "reward": 1.71357262134552, + "reward_std": 0.44742541313171386, + "rewards/accuracy_reward": 0.7850010991096497, + "rewards/format_reward": 0.9285714566707611, + "step": 1270 + }, + { + "completion_length": 68.57857513427734, + "epoch": 1.1940298507462686, + "grad_norm": 2.8732476234436035, + "kl": 0.225830078125, + "learning_rate": 4.185271562328193e-07, + "loss": 0.0114, + "reward": 1.7408075332641602, + "reward_std": 0.3936370387673378, + "rewards/accuracy_reward": 0.7908074587583542, + "rewards/format_reward": 0.9500000238418579, + "step": 1280 + }, + { + "completion_length": 69.23571701049805, + "epoch": 1.203358208955224, + "grad_norm": 1.9602234363555908, + "kl": 0.15458984375, + "learning_rate": 4.1050408738648534e-07, + "loss": 0.0206, + "reward": 1.7514087915420533, + "reward_std": 0.3404985681176186, + "rewards/accuracy_reward": 0.8121230438351631, + "rewards/format_reward": 0.9392857372760772, + "step": 1290 + }, + { + "completion_length": 74.0803596496582, + "epoch": 1.212686567164179, + "grad_norm": 1.8836650848388672, + "kl": 0.152587890625, + "learning_rate": 4.025047557075116e-07, + "loss": 0.042, + "reward": 1.73599853515625, + "reward_std": 0.4292837709188461, + "rewards/accuracy_reward": 0.7967127650976181, + "rewards/format_reward": 0.9392857432365418, + "step": 1300 + }, + { + "completion_length": 72.14821701049804, + "epoch": 1.2220149253731343, + "grad_norm": 2.230212688446045, + "kl": 0.2654296875, + "learning_rate": 3.945312828735179e-07, + "loss": 0.0592, + "reward": 1.5891595482826233, + "reward_std": 0.4225751027464867, + "rewards/accuracy_reward": 0.660588052868843, + "rewards/format_reward": 0.9285714626312256, + "step": 1310 + }, + { + "completion_length": 68.49643135070801, + "epoch": 1.2313432835820897, + "grad_norm": 2.6835989952087402, + "kl": 0.22177734375, + "learning_rate": 3.8658578370353384e-07, + "loss": 0.0292, + "reward": 1.7525344610214233, + "reward_std": 0.34538685381412504, + "rewards/accuracy_reward": 0.8132486999034881, + "rewards/format_reward": 0.9392857372760772, + "step": 1320 + }, + { + "completion_length": 68.90893211364747, + "epoch": 1.2406716417910448, + "grad_norm": 3.3443515300750732, + "kl": 0.206689453125, + "learning_rate": 3.7867036559708257e-07, + "loss": 0.039, + "reward": 1.6924095749855042, + "reward_std": 0.40744485706090927, + "rewards/accuracy_reward": 0.7638381004333497, + "rewards/format_reward": 0.9285714626312256, + "step": 1330 + }, + { + "completion_length": 69.20714645385742, + "epoch": 1.25, + "grad_norm": 4.8876471519470215, + "kl": 0.1783203125, + "learning_rate": 3.7078712797523086e-07, + "loss": 0.0464, + "reward": 1.5992920517921447, + "reward_std": 0.42251739650964737, + "rewards/accuracy_reward": 0.6492919966578483, + "rewards/format_reward": 0.9500000238418579, + "step": 1340 + }, + { + "completion_length": 68.11250305175781, + "epoch": 1.2593283582089552, + "grad_norm": 2.7070319652557373, + "kl": 0.220703125, + "learning_rate": 3.6293816172375555e-07, + "loss": 0.0187, + "reward": 1.653515362739563, + "reward_std": 0.42918106317520144, + "rewards/accuracy_reward": 0.7499439001083374, + "rewards/format_reward": 0.9035714745521546, + "step": 1350 + }, + { + "completion_length": 68.8857177734375, + "epoch": 1.2686567164179103, + "grad_norm": 1.7205545902252197, + "kl": 0.180078125, + "learning_rate": 3.5512554863857425e-07, + "loss": 0.0178, + "reward": 1.6903465867042542, + "reward_std": 0.42783099561929705, + "rewards/accuracy_reward": 0.743917989730835, + "rewards/format_reward": 0.9464285910129547, + "step": 1360 + }, + { + "completion_length": 73.38214645385742, + "epoch": 1.2779850746268657, + "grad_norm": 8.320770263671875, + "kl": 0.21845703125, + "learning_rate": 3.4735136087358643e-07, + "loss": 0.058, + "reward": 1.621816062927246, + "reward_std": 0.44447104036808016, + "rewards/accuracy_reward": 0.6825302928686142, + "rewards/format_reward": 0.9392857432365418, + "step": 1370 + }, + { + "completion_length": 70.06428871154785, + "epoch": 1.287313432835821, + "grad_norm": 3.1990509033203125, + "kl": 0.27470703125, + "learning_rate": 3.3961766039107257e-07, + "loss": 0.0647, + "reward": 1.6193347573280334, + "reward_std": 0.44890994131565093, + "rewards/accuracy_reward": 0.6871918082237244, + "rewards/format_reward": 0.9321428894996643, + "step": 1380 + }, + { + "completion_length": 68.82857322692871, + "epoch": 1.296641791044776, + "grad_norm": 2.818559408187866, + "kl": 1.778125, + "learning_rate": 3.3192649841479636e-07, + "loss": 0.1511, + "reward": 1.691490936279297, + "reward_std": 0.4614251464605331, + "rewards/accuracy_reward": 0.7557766169309617, + "rewards/format_reward": 0.935714316368103, + "step": 1390 + }, + { + "completion_length": 73.2642894744873, + "epoch": 1.3059701492537314, + "grad_norm": 3.4755916595458984, + "kl": 0.253173828125, + "learning_rate": 3.242799148859533e-07, + "loss": 0.0903, + "reward": 1.665787649154663, + "reward_std": 0.46672678291797637, + "rewards/accuracy_reward": 0.7479305237531662, + "rewards/format_reward": 0.9178571820259094, + "step": 1400 + }, + { + "completion_length": 70.29464569091797, + "epoch": 1.3152985074626866, + "grad_norm": 2.2736079692840576, + "kl": 0.23642578125, + "learning_rate": 3.1667993792211547e-07, + "loss": 0.0266, + "reward": 1.6500344276428223, + "reward_std": 0.3108953431248665, + "rewards/accuracy_reward": 0.6928915083408356, + "rewards/format_reward": 0.9571428716182708, + "step": 1410 + }, + { + "completion_length": 77.02321701049804, + "epoch": 1.3246268656716418, + "grad_norm": 3.1085715293884277, + "kl": 0.33759765625, + "learning_rate": 3.091285832793082e-07, + "loss": 0.0969, + "reward": 1.6022907137870788, + "reward_std": 0.5451263844966888, + "rewards/accuracy_reward": 0.7130049914121628, + "rewards/format_reward": 0.8892857491970062, + "step": 1420 + }, + { + "completion_length": 71.59464607238769, + "epoch": 1.333955223880597, + "grad_norm": 21.11083221435547, + "kl": 0.3099609375, + "learning_rate": 3.016278538173689e-07, + "loss": 0.0348, + "reward": 1.6516450881958007, + "reward_std": 0.43751112520694735, + "rewards/accuracy_reward": 0.7302164614200592, + "rewards/format_reward": 0.9214286029338836, + "step": 1430 + }, + { + "completion_length": 65.63393096923828, + "epoch": 1.3432835820895521, + "grad_norm": 1.5812572240829468, + "kl": 0.18408203125, + "learning_rate": 2.9417973896872337e-07, + "loss": 0.0258, + "reward": 1.7273393750190735, + "reward_std": 0.41720412373542787, + "rewards/accuracy_reward": 0.7844821512699127, + "rewards/format_reward": 0.9428571701049805, + "step": 1440 + }, + { + "completion_length": 76.08035926818847, + "epoch": 1.3526119402985075, + "grad_norm": 2.308365821838379, + "kl": 0.2029296875, + "learning_rate": 2.8678621421072625e-07, + "loss": 0.0963, + "reward": 1.7020103931427002, + "reward_std": 0.4564393892884254, + "rewards/accuracy_reward": 0.76986745595932, + "rewards/format_reward": 0.9321428894996643, + "step": 1450 + }, + { + "completion_length": 64.87857437133789, + "epoch": 1.3619402985074627, + "grad_norm": 2.462268829345703, + "kl": 0.1779296875, + "learning_rate": 2.7944924054170084e-07, + "loss": 0.0098, + "reward": 1.7308134078979491, + "reward_std": 0.30605203956365584, + "rewards/accuracy_reward": 0.7879562079906464, + "rewards/format_reward": 0.9428571701049805, + "step": 1460 + }, + { + "completion_length": 69.03214683532715, + "epoch": 1.3712686567164178, + "grad_norm": 6.558331489562988, + "kl": 0.149462890625, + "learning_rate": 2.7217076396081927e-07, + "loss": 0.026, + "reward": 1.738177752494812, + "reward_std": 0.401779568195343, + "rewards/accuracy_reward": 0.7881776750087738, + "rewards/format_reward": 0.9500000238418579, + "step": 1470 + }, + { + "completion_length": 73.4142894744873, + "epoch": 1.3805970149253732, + "grad_norm": 2.197303533554077, + "kl": 0.21044921875, + "learning_rate": 2.64952714951963e-07, + "loss": 0.1136, + "reward": 1.7457318425178527, + "reward_std": 0.46812490820884706, + "rewards/accuracy_reward": 0.8278746277093887, + "rewards/format_reward": 0.9178571820259094, + "step": 1480 + }, + { + "completion_length": 69.76071701049804, + "epoch": 1.3899253731343284, + "grad_norm": 4.07311487197876, + "kl": 0.340771484375, + "learning_rate": 2.5779700797169586e-07, + "loss": 0.0531, + "reward": 1.632162046432495, + "reward_std": 0.41271309554576874, + "rewards/accuracy_reward": 0.7035905659198761, + "rewards/format_reward": 0.9285714626312256, + "step": 1490 + }, + { + "completion_length": 78.0785758972168, + "epoch": 1.3992537313432836, + "grad_norm": 1.6365816593170166, + "kl": 0.516943359375, + "learning_rate": 2.5070554094148916e-07, + "loss": 0.1584, + "reward": 1.590348196029663, + "reward_std": 0.3931408829987049, + "rewards/accuracy_reward": 0.6510624587535858, + "rewards/format_reward": 0.9392857372760772, + "step": 1500 + }, + { + "completion_length": 68.17500381469726, + "epoch": 1.4085820895522387, + "grad_norm": 1.9180026054382324, + "kl": 0.14326171875, + "learning_rate": 2.436801947443335e-07, + "loss": 0.0294, + "reward": 1.7068129420280456, + "reward_std": 0.39681439399719237, + "rewards/accuracy_reward": 0.7675271093845367, + "rewards/format_reward": 0.9392857432365418, + "step": 1510 + }, + { + "completion_length": 76.95178985595703, + "epoch": 1.417910447761194, + "grad_norm": 7.032691478729248, + "kl": 0.202197265625, + "learning_rate": 2.367228327258674e-07, + "loss": 0.1028, + "reward": 1.7151259303092956, + "reward_std": 0.3991871036589146, + "rewards/accuracy_reward": 0.7686973065137863, + "rewards/format_reward": 0.9464285969734192, + "step": 1520 + }, + { + "completion_length": 69.95714569091797, + "epoch": 1.4272388059701493, + "grad_norm": 2.3910961151123047, + "kl": 0.24638671875, + "learning_rate": 2.2983530020015933e-07, + "loss": 0.0588, + "reward": 1.7471950054168701, + "reward_std": 0.37923727482557296, + "rewards/accuracy_reward": 0.7757663518190384, + "rewards/format_reward": 0.9714285850524902, + "step": 1530 + }, + { + "completion_length": 68.0160743713379, + "epoch": 1.4365671641791045, + "grad_norm": 4.41068696975708, + "kl": 0.273046875, + "learning_rate": 2.230194239602717e-07, + "loss": 0.069, + "reward": 1.839265477657318, + "reward_std": 0.3157119289040565, + "rewards/accuracy_reward": 0.882122528553009, + "rewards/format_reward": 0.9571428775787354, + "step": 1540 + }, + { + "completion_length": 72.88571662902832, + "epoch": 1.4458955223880596, + "grad_norm": 2.147146224975586, + "kl": 0.236962890625, + "learning_rate": 2.1627701179373643e-07, + "loss": 0.0707, + "reward": 1.662415087223053, + "reward_std": 0.4311062529683113, + "rewards/accuracy_reward": 0.7338435798883438, + "rewards/format_reward": 0.9285714626312256, + "step": 1550 + }, + { + "completion_length": 68.96607513427735, + "epoch": 1.455223880597015, + "grad_norm": 2.620600938796997, + "kl": 0.234619140625, + "learning_rate": 2.0960985200307312e-07, + "loss": 0.0194, + "reward": 1.6952961206436157, + "reward_std": 0.4336378961801529, + "rewards/accuracy_reward": 0.7417246133089066, + "rewards/format_reward": 0.9535714507102966, + "step": 1560 + }, + { + "completion_length": 70.6339324951172, + "epoch": 1.4645522388059702, + "grad_norm": 2.1037166118621826, + "kl": 0.260498046875, + "learning_rate": 2.0301971293147352e-07, + "loss": 0.0591, + "reward": 1.7625232696533204, + "reward_std": 0.37615562081336973, + "rewards/accuracy_reward": 0.8125232458114624, + "rewards/format_reward": 0.9500000238418579, + "step": 1570 + }, + { + "completion_length": 72.0053596496582, + "epoch": 1.4738805970149254, + "grad_norm": 2.6927168369293213, + "kl": 0.322705078125, + "learning_rate": 1.9650834249378124e-07, + "loss": 0.0975, + "reward": 1.7478647828102112, + "reward_std": 0.42380866408348083, + "rewards/accuracy_reward": 0.8121504783630371, + "rewards/format_reward": 0.9357143104076385, + "step": 1580 + }, + { + "completion_length": 68.50893135070801, + "epoch": 1.4832089552238805, + "grad_norm": 2.638317823410034, + "kl": 0.203271484375, + "learning_rate": 1.9007746771288806e-07, + "loss": 0.0314, + "reward": 1.659421944618225, + "reward_std": 0.39078644961118697, + "rewards/accuracy_reward": 0.7165647685527802, + "rewards/format_reward": 0.9428571701049805, + "step": 1590 + }, + { + "completion_length": 71.92500457763671, + "epoch": 1.4925373134328357, + "grad_norm": 2.5060784816741943, + "kl": 0.14453125, + "learning_rate": 1.8372879426167332e-07, + "loss": 0.0556, + "reward": 1.5733938455581664, + "reward_std": 0.36534764170646666, + "rewards/accuracy_reward": 0.6126794695854187, + "rewards/format_reward": 0.9607143044471741, + "step": 1600 + }, + { + "completion_length": 69.96786003112793, + "epoch": 1.501865671641791, + "grad_norm": 4.351828575134277, + "kl": 0.168701171875, + "learning_rate": 1.7746400601060475e-07, + "loss": 0.0385, + "reward": 1.6393524765968324, + "reward_std": 0.3634603455662727, + "rewards/accuracy_reward": 0.68220956325531, + "rewards/format_reward": 0.9571428775787354, + "step": 1610 + }, + { + "completion_length": 71.86786117553712, + "epoch": 1.5111940298507462, + "grad_norm": 3.9751698970794678, + "kl": 0.318359375, + "learning_rate": 1.7128476458112207e-07, + "loss": 0.07, + "reward": 1.654927670955658, + "reward_std": 0.4259908489882946, + "rewards/accuracy_reward": 0.7192133754491806, + "rewards/format_reward": 0.935714316368103, + "step": 1620 + }, + { + "completion_length": 73.06786003112794, + "epoch": 1.5205223880597014, + "grad_norm": 4.052099704742432, + "kl": 0.48408203125, + "learning_rate": 1.6519270890492282e-07, + "loss": 0.0817, + "reward": 1.8346825242042542, + "reward_std": 0.43689552545547483, + "rewards/accuracy_reward": 0.8918253421783447, + "rewards/format_reward": 0.9428571701049805, + "step": 1630 + }, + { + "completion_length": 70.95178909301758, + "epoch": 1.5298507462686568, + "grad_norm": 7.291477680206299, + "kl": 0.26201171875, + "learning_rate": 1.5918945478926481e-07, + "loss": 0.0384, + "reward": 1.6705774188041687, + "reward_std": 0.29296426847577095, + "rewards/accuracy_reward": 0.6955773562192917, + "rewards/format_reward": 0.975000011920929, + "step": 1640 + }, + { + "completion_length": 70.5696460723877, + "epoch": 1.539179104477612, + "grad_norm": 3.3498435020446777, + "kl": 0.213671875, + "learning_rate": 1.5327659448840313e-07, + "loss": 0.0533, + "reward": 1.752088689804077, + "reward_std": 0.3687770262360573, + "rewards/accuracy_reward": 0.7985171914100647, + "rewards/format_reward": 0.9535714507102966, + "step": 1650 + }, + { + "completion_length": 76.42143173217774, + "epoch": 1.5485074626865671, + "grad_norm": 4.063335418701172, + "kl": 0.35205078125, + "learning_rate": 1.474556962812729e-07, + "loss": 0.1094, + "reward": 1.6160147190093994, + "reward_std": 0.4147659420967102, + "rewards/accuracy_reward": 0.6981575042009354, + "rewards/format_reward": 0.9178571820259094, + "step": 1660 + }, + { + "completion_length": 75.91250343322754, + "epoch": 1.5578358208955225, + "grad_norm": 1.601982593536377, + "kl": 0.3421875, + "learning_rate": 1.4172830405553214e-07, + "loss": 0.0972, + "reward": 1.6295251965522766, + "reward_std": 0.4272765040397644, + "rewards/accuracy_reward": 0.6973822951316834, + "rewards/format_reward": 0.9321428894996643, + "step": 1670 + }, + { + "completion_length": 70.08750381469727, + "epoch": 1.5671641791044775, + "grad_norm": 4.637321949005127, + "kl": 0.28974609375, + "learning_rate": 1.3609593689807352e-07, + "loss": 0.0654, + "reward": 1.7085243463516235, + "reward_std": 0.4388610184192657, + "rewards/accuracy_reward": 0.7728100091218948, + "rewards/format_reward": 0.935714316368103, + "step": 1680 + }, + { + "completion_length": 75.6321460723877, + "epoch": 1.5764925373134329, + "grad_norm": 2.443453073501587, + "kl": 0.3375, + "learning_rate": 1.3056008869211427e-07, + "loss": 0.1288, + "reward": 1.7601337909698487, + "reward_std": 0.3984776630997658, + "rewards/accuracy_reward": 0.8172765672206879, + "rewards/format_reward": 0.9428571701049805, + "step": 1690 + }, + { + "completion_length": 71.43928985595703, + "epoch": 1.585820895522388, + "grad_norm": 2.7298126220703125, + "kl": 0.2486328125, + "learning_rate": 1.251222277209702e-07, + "loss": 0.0587, + "reward": 1.6708721756935119, + "reward_std": 0.3407215870916843, + "rewards/accuracy_reward": 0.7137292563915253, + "rewards/format_reward": 0.9571428775787354, + "step": 1700 + }, + { + "completion_length": 72.47678909301757, + "epoch": 1.5951492537313432, + "grad_norm": 2.588491439819336, + "kl": 0.21982421875, + "learning_rate": 1.1978379627862088e-07, + "loss": 0.0587, + "reward": 1.5847897171974181, + "reward_std": 0.362374846637249, + "rewards/accuracy_reward": 0.638361094892025, + "rewards/format_reward": 0.9464285969734192, + "step": 1710 + }, + { + "completion_length": 70.4357177734375, + "epoch": 1.6044776119402986, + "grad_norm": 2.0158324241638184, + "kl": 0.24248046875, + "learning_rate": 1.1454621028716694e-07, + "loss": 0.0595, + "reward": 1.7178019404411315, + "reward_std": 0.3643430769443512, + "rewards/accuracy_reward": 0.7678018808364868, + "rewards/format_reward": 0.9500000238418579, + "step": 1720 + }, + { + "completion_length": 71.73571739196777, + "epoch": 1.6138059701492538, + "grad_norm": 1.9249950647354126, + "kl": 0.2228515625, + "learning_rate": 1.0941085892128272e-07, + "loss": 0.0771, + "reward": 1.810201096534729, + "reward_std": 0.3912685692310333, + "rewards/accuracy_reward": 0.8744867205619812, + "rewards/format_reward": 0.935714316368103, + "step": 1730 + }, + { + "completion_length": 67.61964569091796, + "epoch": 1.623134328358209, + "grad_norm": 2.0906991958618164, + "kl": 0.2560546875, + "learning_rate": 1.0437910423976237e-07, + "loss": 0.0393, + "reward": 1.6972728729248048, + "reward_std": 0.3199976682662964, + "rewards/accuracy_reward": 0.7365585595369339, + "rewards/format_reward": 0.9607143044471741, + "step": 1740 + }, + { + "completion_length": 68.35000190734863, + "epoch": 1.6324626865671643, + "grad_norm": 2.7774672508239746, + "kl": 0.203515625, + "learning_rate": 9.945228082425894e-08, + "loss": 0.054, + "reward": 1.6996734976768493, + "reward_std": 0.35243515819311144, + "rewards/accuracy_reward": 0.7532448709011078, + "rewards/format_reward": 0.9464285969734192, + "step": 1750 + }, + { + "completion_length": 82.38571891784667, + "epoch": 1.6417910447761193, + "grad_norm": 11.614768981933594, + "kl": 0.30517578125, + "learning_rate": 9.463169542531057e-08, + "loss": 0.1471, + "reward": 1.8141414523124695, + "reward_std": 0.3870817393064499, + "rewards/accuracy_reward": 0.8891413986682892, + "rewards/format_reward": 0.9250000357627869, + "step": 1760 + }, + { + "completion_length": 71.69643211364746, + "epoch": 1.6511194029850746, + "grad_norm": 3.386569023132324, + "kl": 0.31064453125, + "learning_rate": 8.99186266157485e-08, + "loss": 0.1075, + "reward": 1.7770594358444214, + "reward_std": 0.42672722935676577, + "rewards/accuracy_reward": 0.8413451433181762, + "rewards/format_reward": 0.935714316368103, + "step": 1770 + }, + { + "completion_length": 74.81250305175782, + "epoch": 1.6604477611940298, + "grad_norm": 3.187945604324341, + "kl": 0.289111328125, + "learning_rate": 8.531432445157921e-08, + "loss": 0.0863, + "reward": 1.6912956357002258, + "reward_std": 0.36863686591386796, + "rewards/accuracy_reward": 0.7555812746286392, + "rewards/format_reward": 0.9357143104076385, + "step": 1780 + }, + { + "completion_length": 69.46786003112793, + "epoch": 1.669776119402985, + "grad_norm": 4.341835975646973, + "kl": 0.2396484375, + "learning_rate": 8.082001014042944e-08, + "loss": 0.0597, + "reward": 1.576663601398468, + "reward_std": 0.36128911972045896, + "rewards/accuracy_reward": 0.6338064014911652, + "rewards/format_reward": 0.942857164144516, + "step": 1790 + }, + { + "completion_length": 68.04107513427735, + "epoch": 1.6791044776119404, + "grad_norm": 3.1412479877471924, + "kl": 0.219384765625, + "learning_rate": 7.643687571764329e-08, + "loss": 0.0468, + "reward": 1.7867911338806153, + "reward_std": 0.35844905152916906, + "rewards/accuracy_reward": 0.8332196563482285, + "rewards/format_reward": 0.9535714507102966, + "step": 1800 + }, + { + "completion_length": 69.81607551574707, + "epoch": 1.6884328358208955, + "grad_norm": 4.341177463531494, + "kl": 0.27998046875, + "learning_rate": 7.2166083730116e-08, + "loss": 0.0501, + "reward": 1.7094895005226136, + "reward_std": 0.4133823424577713, + "rewards/accuracy_reward": 0.7380608856678009, + "rewards/format_reward": 0.9714285850524902, + "step": 1810 + }, + { + "completion_length": 76.75893287658691, + "epoch": 1.6977611940298507, + "grad_norm": 2.1134347915649414, + "kl": 0.197900390625, + "learning_rate": 6.800876692794993e-08, + "loss": 0.1271, + "reward": 1.724982464313507, + "reward_std": 0.42608653008937836, + "rewards/accuracy_reward": 0.7821252822875977, + "rewards/format_reward": 0.942857164144516, + "step": 1820 + }, + { + "completion_length": 74.59464721679687, + "epoch": 1.707089552238806, + "grad_norm": 3.5184261798858643, + "kl": 0.280078125, + "learning_rate": 6.39660279640129e-08, + "loss": 0.0977, + "reward": 1.6460832118988038, + "reward_std": 0.4460886180400848, + "rewards/accuracy_reward": 0.7175117880105972, + "rewards/format_reward": 0.9285714626312256, + "step": 1830 + }, + { + "completion_length": 66.23036003112793, + "epoch": 1.716417910447761, + "grad_norm": 3.080709457397461, + "kl": 0.27099609375, + "learning_rate": 6.003893910147967e-08, + "loss": 0.0398, + "reward": 1.7503175139427185, + "reward_std": 0.29155280590057375, + "rewards/accuracy_reward": 0.786031711101532, + "rewards/format_reward": 0.9642857313156128, + "step": 1840 + }, + { + "completion_length": 71.16607513427735, + "epoch": 1.7257462686567164, + "grad_norm": 3.831777572631836, + "kl": 0.28515625, + "learning_rate": 5.622854192943316e-08, + "loss": 0.0768, + "reward": 1.732144832611084, + "reward_std": 0.467063182592392, + "rewards/accuracy_reward": 0.8321447849273682, + "rewards/format_reward": 0.9000000357627869, + "step": 1850 + }, + { + "completion_length": 67.84643249511718, + "epoch": 1.7350746268656716, + "grad_norm": 2.302868127822876, + "kl": 0.266845703125, + "learning_rate": 5.2535847086602636e-08, + "loss": 0.0761, + "reward": 1.8191146731376648, + "reward_std": 0.3647874310612679, + "rewards/accuracy_reward": 0.8619718044996262, + "rewards/format_reward": 0.9571428775787354, + "step": 1860 + }, + { + "completion_length": 69.80357475280762, + "epoch": 1.7444029850746268, + "grad_norm": 2.980681896209717, + "kl": 0.268798828125, + "learning_rate": 4.89618339933095e-08, + "loss": 0.0669, + "reward": 1.7465506196022034, + "reward_std": 0.28898321464657784, + "rewards/accuracy_reward": 0.7894077003002167, + "rewards/format_reward": 0.9571428775787354, + "step": 1870 + }, + { + "completion_length": 68.23214607238769, + "epoch": 1.7537313432835822, + "grad_norm": 2.6691131591796875, + "kl": 0.3056640625, + "learning_rate": 4.5507450591693975e-08, + "loss": 0.0533, + "reward": 1.7299192309379579, + "reward_std": 0.2962379239499569, + "rewards/accuracy_reward": 0.7620620489120483, + "rewards/format_reward": 0.9678571581840515, + "step": 1880 + }, + { + "completion_length": 65.82500381469727, + "epoch": 1.7630597014925373, + "grad_norm": 3.187333345413208, + "kl": 0.26552734375, + "learning_rate": 4.2173613094290626e-08, + "loss": 0.0457, + "reward": 1.7060747623443604, + "reward_std": 0.4388273775577545, + "rewards/accuracy_reward": 0.7596461176872253, + "rewards/format_reward": 0.9464285969734192, + "step": 1890 + }, + { + "completion_length": 73.71786003112793, + "epoch": 1.7723880597014925, + "grad_norm": 3.8043019771575928, + "kl": 0.31396484375, + "learning_rate": 3.896120574101969e-08, + "loss": 0.1125, + "reward": 1.6053473830223084, + "reward_std": 0.46198561042547226, + "rewards/accuracy_reward": 0.6803473174571991, + "rewards/format_reward": 0.9250000357627869, + "step": 1900 + }, + { + "completion_length": 71.06428909301758, + "epoch": 1.7817164179104479, + "grad_norm": 12.418741226196289, + "kl": 0.27548828125, + "learning_rate": 3.5871080564658265e-08, + "loss": 0.0729, + "reward": 1.5581155657768249, + "reward_std": 0.39694445878267287, + "rewards/accuracy_reward": 0.6009726643562316, + "rewards/format_reward": 0.9571428775787354, + "step": 1910 + }, + { + "completion_length": 69.58750343322754, + "epoch": 1.7910447761194028, + "grad_norm": 2.281094551086426, + "kl": 0.27919921875, + "learning_rate": 3.290405716485456e-08, + "loss": 0.0532, + "reward": 1.7076811790466309, + "reward_std": 0.43241163045167924, + "rewards/accuracy_reward": 0.7719668626785279, + "rewards/format_reward": 0.935714316368103, + "step": 1920 + }, + { + "completion_length": 72.5892894744873, + "epoch": 1.8003731343283582, + "grad_norm": 2.05045223236084, + "kl": 0.1744140625, + "learning_rate": 3.00609224907431e-08, + "loss": 0.0677, + "reward": 1.6015358567237854, + "reward_std": 0.30856244415044787, + "rewards/accuracy_reward": 0.6408215403556824, + "rewards/format_reward": 0.9607143044471741, + "step": 1930 + }, + { + "completion_length": 73.64107322692871, + "epoch": 1.8097014925373134, + "grad_norm": 3.3584227561950684, + "kl": 0.26279296875, + "learning_rate": 2.73424306322218e-08, + "loss": 0.0763, + "reward": 1.7042565584182738, + "reward_std": 0.36942601650953294, + "rewards/accuracy_reward": 0.7756850928068161, + "rewards/format_reward": 0.9285714626312256, + "step": 1940 + }, + { + "completion_length": 75.22321853637695, + "epoch": 1.8190298507462686, + "grad_norm": 2.7679641246795654, + "kl": 0.25546875, + "learning_rate": 2.474930261994257e-08, + "loss": 0.1194, + "reward": 1.7792153120040894, + "reward_std": 0.49641439616680144, + "rewards/accuracy_reward": 0.8435010492801667, + "rewards/format_reward": 0.935714316368103, + "step": 1950 + }, + { + "completion_length": 66.86607475280762, + "epoch": 1.828358208955224, + "grad_norm": 4.81545352935791, + "kl": 0.270458984375, + "learning_rate": 2.228222623407111e-08, + "loss": 0.0559, + "reward": 1.719323456287384, + "reward_std": 0.40858686715364456, + "rewards/accuracy_reward": 0.7764662265777588, + "rewards/format_reward": 0.9428571701049805, + "step": 1960 + }, + { + "completion_length": 70.97500267028809, + "epoch": 1.837686567164179, + "grad_norm": 7.122328758239746, + "kl": 0.255859375, + "learning_rate": 1.9941855821865915e-08, + "loss": 0.0747, + "reward": 1.5889485955238343, + "reward_std": 0.3792146876454353, + "rewards/accuracy_reward": 0.6460913956165314, + "rewards/format_reward": 0.9428571701049805, + "step": 1970 + }, + { + "completion_length": 73.52321853637696, + "epoch": 1.8470149253731343, + "grad_norm": 21.022397994995117, + "kl": 0.33388671875, + "learning_rate": 1.772881212412436e-08, + "loss": 0.0969, + "reward": 1.8433643937110902, + "reward_std": 0.3912557512521744, + "rewards/accuracy_reward": 0.9005071640014648, + "rewards/format_reward": 0.9428571701049805, + "step": 1980 + }, + { + "completion_length": 65.22321701049805, + "epoch": 1.8563432835820897, + "grad_norm": 6.194070816040039, + "kl": 0.2392578125, + "learning_rate": 1.5643682110542623e-08, + "loss": 0.0117, + "reward": 1.6194544792175294, + "reward_std": 0.2789491511881351, + "rewards/accuracy_reward": 0.6587401002645492, + "rewards/format_reward": 0.9607143044471741, + "step": 1990 + }, + { + "completion_length": 66.95893173217773, + "epoch": 1.8656716417910446, + "grad_norm": 2.9723830223083496, + "kl": 0.25517578125, + "learning_rate": 1.3687018824032336e-08, + "loss": 0.0558, + "reward": 1.8543817281723023, + "reward_std": 0.3739734962582588, + "rewards/accuracy_reward": 0.9150959610939026, + "rewards/format_reward": 0.9392857432365418, + "step": 2000 + }, + { + "completion_length": 69.1857177734375, + "epoch": 1.875, + "grad_norm": 2.838501453399658, + "kl": 0.33935546875, + "learning_rate": 1.1859341234036203e-08, + "loss": 0.1099, + "reward": 1.8656662106513977, + "reward_std": 0.39905201345682145, + "rewards/accuracy_reward": 0.9120947599411011, + "rewards/format_reward": 0.9535714507102966, + "step": 2010 + }, + { + "completion_length": 79.23036117553711, + "epoch": 1.8843283582089554, + "grad_norm": 2.2912213802337646, + "kl": 0.25498046875, + "learning_rate": 1.0161134098880974e-08, + "loss": 0.1158, + "reward": 1.6939324378967284, + "reward_std": 0.5166901051998138, + "rewards/accuracy_reward": 0.7832180857658386, + "rewards/format_reward": 0.910714328289032, + "step": 2020 + }, + { + "completion_length": 76.20178909301758, + "epoch": 1.8936567164179103, + "grad_norm": 5.580301761627197, + "kl": 0.3548828125, + "learning_rate": 8.592847837203653e-09, + "loss": 0.1194, + "reward": 1.732421338558197, + "reward_std": 0.39300083220005033, + "rewards/accuracy_reward": 0.7824212819337845, + "rewards/format_reward": 0.9500000178813934, + "step": 2030 + }, + { + "completion_length": 69.75893211364746, + "epoch": 1.9029850746268657, + "grad_norm": 2.2989277839660645, + "kl": 0.26123046875, + "learning_rate": 7.154898408486321e-09, + "loss": 0.0587, + "reward": 1.9577446222305297, + "reward_std": 0.3280174434185028, + "rewards/accuracy_reward": 0.9898874044418335, + "rewards/format_reward": 0.9678571581840515, + "step": 2040 + }, + { + "completion_length": 66.70714530944824, + "epoch": 1.912313432835821, + "grad_norm": 2.700030565261841, + "kl": 0.2146484375, + "learning_rate": 5.847667202730444e-09, + "loss": 0.0336, + "reward": 1.701856541633606, + "reward_std": 0.26435703188180926, + "rewards/accuracy_reward": 0.7268565416336059, + "rewards/format_reward": 0.975000011920929, + "step": 2050 + }, + { + "completion_length": 77.875004196167, + "epoch": 1.921641791044776, + "grad_norm": 3.0810651779174805, + "kl": 0.34150390625, + "learning_rate": 4.671500939300133e-09, + "loss": 0.0934, + "reward": 1.650016152858734, + "reward_std": 0.4673632770776749, + "rewards/accuracy_reward": 0.7071589350700378, + "rewards/format_reward": 0.9428571701049805, + "step": 2060 + }, + { + "completion_length": 68.88928871154785, + "epoch": 1.9309701492537314, + "grad_norm": 3.2456414699554443, + "kl": 0.2634765625, + "learning_rate": 3.6267115749610277e-09, + "loss": 0.0575, + "reward": 1.7053727746009826, + "reward_std": 0.33721098080277445, + "rewards/accuracy_reward": 0.7625156164169311, + "rewards/format_reward": 0.9428571701049805, + "step": 2070 + }, + { + "completion_length": 64.33750267028809, + "epoch": 1.9402985074626866, + "grad_norm": 2.983149290084839, + "kl": 0.22578125, + "learning_rate": 2.7135762211394907e-09, + "loss": 0.0079, + "reward": 1.7429541110992433, + "reward_std": 0.39028028324246405, + "rewards/accuracy_reward": 0.8000969350337982, + "rewards/format_reward": 0.9428571701049805, + "step": 2080 + }, + { + "completion_length": 66.84821701049805, + "epoch": 1.9496268656716418, + "grad_norm": 3.759716749191284, + "kl": 0.26376953125, + "learning_rate": 1.9323370704238394e-09, + "loss": 0.0618, + "reward": 1.6058704495429992, + "reward_std": 0.34787444174289706, + "rewards/accuracy_reward": 0.663013243675232, + "rewards/format_reward": 0.9428571701049805, + "step": 2090 + }, + { + "completion_length": 69.08750228881836, + "epoch": 1.9589552238805972, + "grad_norm": 3.2065060138702393, + "kl": 0.24267578125, + "learning_rate": 1.2832013323270663e-09, + "loss": 0.0404, + "reward": 1.6871079802513123, + "reward_std": 0.3759960770606995, + "rewards/accuracy_reward": 0.7263936132192612, + "rewards/format_reward": 0.9607143044471741, + "step": 2100 + }, + { + "completion_length": 70.07143173217773, + "epoch": 1.9682835820895521, + "grad_norm": 4.975913047790527, + "kl": 0.44765625, + "learning_rate": 7.663411783283558e-10, + "loss": 0.084, + "reward": 1.711716628074646, + "reward_std": 0.36439805179834367, + "rewards/accuracy_reward": 0.7617165893316269, + "rewards/format_reward": 0.9500000178813934, + "step": 2110 + }, + { + "completion_length": 68.29821815490723, + "epoch": 1.9776119402985075, + "grad_norm": 3.3940882682800293, + "kl": 0.2744140625, + "learning_rate": 3.8189369620761357e-10, + "loss": 0.0525, + "reward": 1.7731546998023986, + "reward_std": 0.42709719240665434, + "rewards/accuracy_reward": 0.8231546580791473, + "rewards/format_reward": 0.9500000238418579, + "step": 2120 + }, + { + "completion_length": 67.21250267028809, + "epoch": 1.9869402985074627, + "grad_norm": 4.463015556335449, + "kl": 0.2267578125, + "learning_rate": 1.2996085368566224e-10, + "loss": 0.0227, + "reward": 1.697384738922119, + "reward_std": 0.34280937165021896, + "rewards/accuracy_reward": 0.7366703957319259, + "rewards/format_reward": 0.9607142984867096, + "step": 2130 + }, + { + "completion_length": 67.09286003112793, + "epoch": 1.9962686567164178, + "grad_norm": 3.4289767742156982, + "kl": 0.22529296875, + "learning_rate": 1.0609471378875135e-11, + "loss": 0.0325, + "reward": 1.7102928757667542, + "reward_std": 0.42841927111148836, + "rewards/accuracy_reward": 0.7602928072214127, + "rewards/format_reward": 0.9500000238418579, + "step": 2140 + }, + { + "epoch": 2.0, + "eval_completion_length": 71.8117197672526, + "eval_kl": 0.2966796875, + "eval_loss": 0.052891816943883896, + "eval_reward": 1.4605671564737956, + "eval_reward_std": 0.3952869375546773, + "eval_rewards/accuracy_reward": 0.512948073943456, + "eval_rewards/format_reward": 0.9476190686225892, + "eval_runtime": 40.5993, + "eval_samples_per_second": 7.389, + "eval_steps_per_second": 0.074, + "step": 2144 + }, + { + "epoch": 2.0, + "step": 2144, "total_flos": 0.0, - "train_loss": 5.353166522324368, - "train_runtime": 7842.24, - "train_samples_per_second": 1.913, - "train_steps_per_second": 0.06 + "train_loss": 0.0492804956890872, + "train_runtime": 5735.3452, + "train_samples_per_second": 5.231, + "train_steps_per_second": 0.374 } ], - "logging_steps": 5, - "max_steps": 468, + "logging_steps": 10, + "max_steps": 2144, "num_input_tokens_seen": 0, - "num_train_epochs": 1, + "num_train_epochs": 2, "save_steps": 500, "stateful_callbacks": { "TrainerControl": {