{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9992131527726011, "eval_steps": 100, "global_step": 1131, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "completion_length": 894.0125112533569, "epoch": 0.008834775886583565, "grad_norm": 0.10339409232367645, "kl": 6.553309503942728e-05, "learning_rate": 1.7543859649122807e-06, "loss": 0.0, "reward": 0.20572917186655104, "reward_std": 0.22462533507496119, "rewards/accuracy_reward": 0.20572917186655104, "rewards/format_reward": 0.0, "step": 10 }, { "completion_length": 871.2338674545288, "epoch": 0.01766955177316713, "grad_norm": 0.1191837579508088, "kl": 0.0008328911615535616, "learning_rate": 3.5087719298245615e-06, "loss": 0.0, "reward": 0.250000006519258, "reward_std": 0.275143482722342, "rewards/accuracy_reward": 0.250000006519258, "rewards/format_reward": 0.0, "step": 20 }, { "completion_length": 805.9927246570587, "epoch": 0.026504327659750693, "grad_norm": 0.11427842333813355, "kl": 0.006137973070144654, "learning_rate": 5.263157894736842e-06, "loss": 0.0002, "reward": 0.3583333409391344, "reward_std": 0.3040109956637025, "rewards/accuracy_reward": 0.3583333409391344, "rewards/format_reward": 0.0, "step": 30 }, { "completion_length": 815.7588698387146, "epoch": 0.03533910354633426, "grad_norm": 0.1094331129364983, "kl": 0.011934018135070801, "learning_rate": 7.017543859649123e-06, "loss": 0.0005, "reward": 0.3635416746605188, "reward_std": 0.31483631301671267, "rewards/accuracy_reward": 0.3635416746605188, "rewards/format_reward": 0.0, "step": 40 }, { "completion_length": 811.1395996570587, "epoch": 0.044173879432917826, "grad_norm": 0.11064742736511399, "kl": 0.027907085418701173, "learning_rate": 8.771929824561405e-06, "loss": 0.0011, "reward": 0.38958334126509725, "reward_std": 0.31934686191380024, "rewards/accuracy_reward": 0.38958334126509725, "rewards/format_reward": 0.0, "step": 50 }, { "completion_length": 729.52032995224, "epoch": 0.053008655319501385, "grad_norm": 0.5600590072843962, "kl": 0.11420574188232421, "learning_rate": 1.0526315789473684e-05, "loss": 0.0046, "reward": 0.47083334238268437, "reward_std": 0.34550804551690817, "rewards/accuracy_reward": 0.47083334238268437, "rewards/format_reward": 0.0, "step": 60 }, { "completion_length": 748.378140592575, "epoch": 0.06184343120608495, "grad_norm": 0.49897174304455477, "kl": 0.4907238006591797, "learning_rate": 1.2280701754385966e-05, "loss": 0.0196, "reward": 0.35729167410172524, "reward_std": 0.3094236543402076, "rewards/accuracy_reward": 0.35729167410172524, "rewards/format_reward": 0.0, "step": 70 }, { "completion_length": 731.9463713169098, "epoch": 0.07067820709266852, "grad_norm": 2.1010371186603622, "kl": 0.5549324035644532, "learning_rate": 1.4035087719298246e-05, "loss": 0.0222, "reward": 0.39947917480021716, "reward_std": 0.3283679597079754, "rewards/accuracy_reward": 0.39947917480021716, "rewards/format_reward": 0.0, "step": 80 }, { "completion_length": 636.9864743709564, "epoch": 0.07951298297925208, "grad_norm": 0.3320296791157868, "kl": 0.8359504699707031, "learning_rate": 1.578947368421053e-05, "loss": 0.0334, "reward": 0.35625000754371283, "reward_std": 0.30491310544312, "rewards/accuracy_reward": 0.35625000754371283, "rewards/format_reward": 0.0, "step": 90 }, { "completion_length": 752.3406419754028, "epoch": 0.08834775886583565, "grad_norm": 13.222864026304222, "kl": 1.2244148254394531, "learning_rate": 1.754385964912281e-05, "loss": 0.049, "reward": 0.3458333408460021, "reward_std": 0.2922835685312748, "rewards/accuracy_reward": 0.3458333408460021, "rewards/format_reward": 0.0, "step": 100 }, { "epoch": 0.08834775886583565, "eval_completion_length": 806.9528756189828, "eval_kl": 0.7132556029040404, "eval_loss": 0.028531787917017937, "eval_reward": 0.3063973137224563, "eval_reward_std": 0.3090865022004253, "eval_rewards/accuracy_reward": 0.3063973137224563, "eval_rewards/format_reward": 0.0, "eval_runtime": 538.1844, "eval_samples_per_second": 0.184, "eval_steps_per_second": 0.061, "step": 100 }, { "completion_length": 788.5630361557007, "epoch": 0.09718253475241921, "grad_norm": 1.9841707275878488, "kl": 1.9315879821777344, "learning_rate": 1.929824561403509e-05, "loss": 0.0773, "reward": 0.3375000071246177, "reward_std": 0.28867512941360474, "rewards/accuracy_reward": 0.3375000071246177, "rewards/format_reward": 0.0, "step": 110 }, { "completion_length": 782.2432436466217, "epoch": 0.10601731063900277, "grad_norm": 1.6220755341742679, "kl": 1.2291488647460938, "learning_rate": 1.9998282416292057e-05, "loss": 0.0492, "reward": 0.29166667312383654, "reward_std": 0.2625139458104968, "rewards/accuracy_reward": 0.29166667312383654, "rewards/format_reward": 0.0, "step": 120 }, { "completion_length": 855.3958479881287, "epoch": 0.11485208652558634, "grad_norm": 1.2506380617356658, "kl": 0.8863227844238282, "learning_rate": 1.9987788208027496e-05, "loss": 0.0355, "reward": 0.3015625067986548, "reward_std": 0.2796540316194296, "rewards/accuracy_reward": 0.3015625067986548, "rewards/format_reward": 0.0, "step": 130 }, { "completion_length": 923.2276175498962, "epoch": 0.1236868624121699, "grad_norm": 0.7947305780961375, "kl": 0.8832084655761718, "learning_rate": 1.9967764005730785e-05, "loss": 0.0353, "reward": 0.17343750461004676, "reward_std": 0.2128979079425335, "rewards/accuracy_reward": 0.17343750461004676, "rewards/format_reward": 0.0, "step": 140 }, { "completion_length": 867.6395987987519, "epoch": 0.13252163829875346, "grad_norm": 0.497580876778096, "kl": 0.5946540832519531, "learning_rate": 1.993822891578708e-05, "loss": 0.0238, "reward": 0.18281250479631125, "reward_std": 0.2192126763984561, "rewards/accuracy_reward": 0.18281250479631125, "rewards/format_reward": 0.0, "step": 150 }, { "completion_length": 718.904184293747, "epoch": 0.14135641418533704, "grad_norm": 1.857046714456291, "kl": 0.6682144165039062, "learning_rate": 1.9899211119533938e-05, "loss": 0.0267, "reward": 0.3848958414513618, "reward_std": 0.33107428904622793, "rewards/accuracy_reward": 0.3848958414513618, "rewards/format_reward": 0.0, "step": 160 }, { "completion_length": 918.8239655733108, "epoch": 0.1501911900719206, "grad_norm": 0.20248662305659806, "kl": 0.5040359497070312, "learning_rate": 1.985074784637167e-05, "loss": 0.0202, "reward": 0.09479166907258332, "reward_std": 0.09923207573592663, "rewards/accuracy_reward": 0.09479166907258332, "rewards/format_reward": 0.0, "step": 170 }, { "completion_length": 827.5974071979523, "epoch": 0.15902596595850416, "grad_norm": 0.19897870444761726, "kl": 0.17742691040039063, "learning_rate": 1.9792885338240375e-05, "loss": 0.0071, "reward": 0.17291667088866233, "reward_std": 0.17771562654525042, "rewards/accuracy_reward": 0.17291667088866233, "rewards/format_reward": 0.0, "step": 180 }, { "completion_length": 696.4333508253097, "epoch": 0.16786074184508773, "grad_norm": 0.4541804472147627, "kl": 0.3155979156494141, "learning_rate": 1.9725678805497507e-05, "loss": 0.0126, "reward": 0.42395834233611823, "reward_std": 0.3536270335316658, "rewards/accuracy_reward": 0.42395834233611823, "rewards/format_reward": 0.0, "step": 190 }, { "completion_length": 709.8849118709564, "epoch": 0.1766955177316713, "grad_norm": 0.5097645308657522, "kl": 1.0620399475097657, "learning_rate": 1.964919237423812e-05, "loss": 0.0425, "reward": 0.2687500062398612, "reward_std": 0.2625139458104968, "rewards/accuracy_reward": 0.2687500062398612, "rewards/format_reward": 0.0, "step": 200 }, { "epoch": 0.1766955177316713, "eval_completion_length": 699.2357113000119, "eval_kl": 0.9798473011363636, "eval_loss": 0.039183806627988815, "eval_reward": 0.3872053963367385, "eval_reward_std": 0.34990924777406635, "eval_rewards/accuracy_reward": 0.3872053963367385, "eval_rewards/format_reward": 0.0, "eval_runtime": 513.8181, "eval_samples_per_second": 0.193, "eval_steps_per_second": 0.064, "step": 200 }, { "completion_length": 786.7765783786774, "epoch": 0.18553029361825485, "grad_norm": 0.15936983650030578, "kl": 1.27841796875, "learning_rate": 1.9563499025108e-05, "loss": 0.0511, "reward": 0.25937500610016284, "reward_std": 0.25980761647224426, "rewards/accuracy_reward": 0.25937500610016284, "rewards/format_reward": 0.0, "step": 210 }, { "completion_length": 909.2531366825103, "epoch": 0.19436506950483842, "grad_norm": 0.15108105876463188, "kl": 0.6172576904296875, "learning_rate": 1.9468680523668136e-05, "loss": 0.0247, "reward": 0.08593750246800483, "reward_std": 0.11998060066252947, "rewards/accuracy_reward": 0.08593750246800483, "rewards/format_reward": 0.0, "step": 220 }, { "completion_length": 847.0453273773194, "epoch": 0.203199845391422, "grad_norm": 0.4781092711117684, "kl": 0.719610595703125, "learning_rate": 1.936482734237689e-05, "loss": 0.0288, "reward": 0.1854166716337204, "reward_std": 0.2201147861778736, "rewards/accuracy_reward": 0.1854166716337204, "rewards/format_reward": 0.0, "step": 230 }, { "completion_length": 844.9718894481659, "epoch": 0.21203462127800554, "grad_norm": 0.6125806510450937, "kl": 0.6312515258789062, "learning_rate": 1.9252038574264403e-05, "loss": 0.0253, "reward": 0.1963541720993817, "reward_std": 0.25168862845748663, "rewards/accuracy_reward": 0.1963541720993817, "rewards/format_reward": 0.0, "step": 240 }, { "completion_length": 716.1927267074585, "epoch": 0.22086939716458912, "grad_norm": 0.42158257729983767, "kl": 1.013921356201172, "learning_rate": 1.913042183838153e-05, "loss": 0.0406, "reward": 0.3276041746605188, "reward_std": 0.3418996063992381, "rewards/accuracy_reward": 0.3276041746605188, "rewards/format_reward": 0.0, "step": 250 }, { "completion_length": 772.1078284740448, "epoch": 0.2297041730511727, "grad_norm": 0.49721786332420803, "kl": 1.1279647827148438, "learning_rate": 1.9000093177113524e-05, "loss": 0.0451, "reward": 0.25833333977498113, "reward_std": 0.27604559250175953, "rewards/accuracy_reward": 0.25833333977498113, "rewards/format_reward": 0.0, "step": 260 }, { "completion_length": 798.2672049999237, "epoch": 0.23853894893775623, "grad_norm": 0.6009045781426623, "kl": 0.8998489379882812, "learning_rate": 1.8861176945456542e-05, "loss": 0.036, "reward": 0.26093750656582415, "reward_std": 0.2931856783106923, "rewards/accuracy_reward": 0.26093750656582415, "rewards/format_reward": 0.0, "step": 270 }, { "completion_length": 739.2422043561935, "epoch": 0.2473737248243398, "grad_norm": 0.9064888423502573, "kl": 1.204058837890625, "learning_rate": 1.8713805692362458e-05, "loss": 0.0482, "reward": 0.31979167396202685, "reward_std": 0.30761943478137255, "rewards/accuracy_reward": 0.31979167396202685, "rewards/format_reward": 0.0, "step": 280 }, { "completion_length": 823.738035774231, "epoch": 0.2562085007109234, "grad_norm": 0.4395412639205522, "kl": 0.8689178466796875, "learning_rate": 1.8558120034265396e-05, "loss": 0.0348, "reward": 0.24218750628642738, "reward_std": 0.275143482722342, "rewards/accuracy_reward": 0.24218750628642738, "rewards/format_reward": 0.0, "step": 290 }, { "completion_length": 866.1130366325378, "epoch": 0.2650432765975069, "grad_norm": 0.4910227951322757, "kl": 0.7059478759765625, "learning_rate": 1.8394268520910467e-05, "loss": 0.0282, "reward": 0.236458339728415, "reward_std": 0.27153504360467196, "rewards/accuracy_reward": 0.236458339728415, "rewards/format_reward": 0.0, "step": 300 }, { "epoch": 0.2650432765975069, "eval_completion_length": 855.1784671051333, "eval_kl": 0.9380622632575758, "eval_loss": 0.03754038363695145, "eval_reward": 0.2558922623143052, "eval_reward_std": 0.2857592190154875, "eval_rewards/accuracy_reward": 0.2558922623143052, "eval_rewards/format_reward": 0.0, "eval_runtime": 552.4722, "eval_samples_per_second": 0.179, "eval_steps_per_second": 0.06, "step": 300 }, { "completion_length": 812.8906393527984, "epoch": 0.27387805248409053, "grad_norm": 0.8576089815755503, "kl": 1.0579757690429688, "learning_rate": 1.8222407493612878e-05, "loss": 0.0423, "reward": 0.2651041731238365, "reward_std": 0.27153504360467196, "rewards/accuracy_reward": 0.2651041731238365, "rewards/format_reward": 0.0, "step": 310 }, { "completion_length": 733.2239722013474, "epoch": 0.2827128283706741, "grad_norm": 0.2427855401581468, "kl": 0.7223701477050781, "learning_rate": 1.8042700936082574e-05, "loss": 0.0289, "reward": 0.29479167317040267, "reward_std": 0.27604559250175953, "rewards/accuracy_reward": 0.29479167317040267, "rewards/format_reward": 0.0, "step": 320 }, { "completion_length": 711.7026183843612, "epoch": 0.2915476042572576, "grad_norm": 1.9778007104616155, "kl": 0.8535140991210938, "learning_rate": 1.7855320317956785e-05, "loss": 0.0341, "reward": 0.3192708408460021, "reward_std": 0.303108885884285, "rewards/accuracy_reward": 0.3192708408460021, "rewards/format_reward": 0.0, "step": 330 }, { "completion_length": 728.3005360603332, "epoch": 0.3003823801438412, "grad_norm": 0.38171472488642083, "kl": 0.7892318725585937, "learning_rate": 1.766044443118978e-05, "loss": 0.0316, "reward": 0.3526041739620268, "reward_std": 0.3112278738990426, "rewards/accuracy_reward": 0.3526041739620268, "rewards/format_reward": 0.0, "step": 340 }, { "completion_length": 767.1573046445847, "epoch": 0.30921715603042477, "grad_norm": 0.24577197070273166, "kl": 0.5293319702148438, "learning_rate": 1.7458259219455896e-05, "loss": 0.0212, "reward": 0.3062500067986548, "reward_std": 0.2814582511782646, "rewards/accuracy_reward": 0.3062500067986548, "rewards/format_reward": 0.0, "step": 350 }, { "completion_length": 743.670326089859, "epoch": 0.3180519319170083, "grad_norm": 2.0103657238013652, "kl": 0.5427993774414063, "learning_rate": 1.7248957600728664e-05, "loss": 0.0217, "reward": 0.3505208409391344, "reward_std": 0.3175426423549652, "rewards/accuracy_reward": 0.3505208409391344, "rewards/format_reward": 0.0, "step": 360 }, { "completion_length": 840.4734511613846, "epoch": 0.3268867078035919, "grad_norm": 1.0323054081356446, "kl": 1.1146942138671876, "learning_rate": 1.7032739283205324e-05, "loss": 0.0446, "reward": 0.2598958395421505, "reward_std": 0.25890550669282675, "rewards/accuracy_reward": 0.2598958395421505, "rewards/format_reward": 0.0, "step": 370 }, { "completion_length": 839.7666805267334, "epoch": 0.33572148369017546, "grad_norm": 0.5119633048650775, "kl": 1.2632476806640625, "learning_rate": 1.6809810574752316e-05, "loss": 0.0505, "reward": 0.2333333392161876, "reward_std": 0.25168862845748663, "rewards/accuracy_reward": 0.2333333392161876, "rewards/format_reward": 0.0, "step": 380 }, { "completion_length": 733.7547018527985, "epoch": 0.344556259576759, "grad_norm": 1.5656576378617317, "kl": 1.1475967407226562, "learning_rate": 1.658038418605361e-05, "loss": 0.0459, "reward": 0.2567708396818489, "reward_std": 0.2634160555899143, "rewards/accuracy_reward": 0.2567708396818489, "rewards/format_reward": 0.0, "step": 390 }, { "completion_length": 885.3140710353852, "epoch": 0.3533910354633426, "grad_norm": 1.4058117586591972, "kl": 1.0219383239746094, "learning_rate": 1.6344679027649726e-05, "loss": 0.0409, "reward": 0.247916672937572, "reward_std": 0.2688287142664194, "rewards/accuracy_reward": 0.247916672937572, "rewards/format_reward": 0.0, "step": 400 }, { "epoch": 0.3533910354633426, "eval_completion_length": 985.952867296007, "eval_kl": 1.9370610400883839, "eval_loss": 0.07748492807149887, "eval_reward": 0.26262626924900095, "eval_reward_std": 0.27992739821925305, "eval_rewards/accuracy_reward": 0.26262626924900095, "eval_rewards/format_reward": 0.0, "eval_runtime": 572.8755, "eval_samples_per_second": 0.173, "eval_steps_per_second": 0.058, "step": 400 }, { "completion_length": 956.2359453678131, "epoch": 0.36222581134992615, "grad_norm": 0.42438124420657664, "kl": 1.3301658630371094, "learning_rate": 1.6102920001061003e-05, "loss": 0.0532, "reward": 0.20677083879709243, "reward_std": 0.2381569817662239, "rewards/accuracy_reward": 0.20677083879709243, "rewards/format_reward": 0.0, "step": 410 }, { "completion_length": 952.6323000907898, "epoch": 0.3710605872365097, "grad_norm": 0.44079559643052724, "kl": 1.332806396484375, "learning_rate": 1.5855337784194576e-05, "loss": 0.0533, "reward": 0.19427083847112953, "reward_std": 0.2309401035308838, "rewards/accuracy_reward": 0.19427083847112953, "rewards/format_reward": 0.0, "step": 420 }, { "completion_length": 976.3510466098785, "epoch": 0.3798953631230933, "grad_norm": 1.0796755649934018, "kl": 1.4209449768066407, "learning_rate": 1.560216861123964e-05, "loss": 0.0568, "reward": 0.22760417237877845, "reward_std": 0.2498844088986516, "rewards/accuracy_reward": 0.22760417237877845, "rewards/format_reward": 0.0, "step": 430 }, { "completion_length": 959.9578190803528, "epoch": 0.38873013900967684, "grad_norm": 0.46008757265822564, "kl": 1.3368026733398437, "learning_rate": 1.534365404726116e-05, "loss": 0.0535, "reward": 0.20885417223908007, "reward_std": 0.24808018933981657, "rewards/accuracy_reward": 0.20885417223908007, "rewards/format_reward": 0.0, "step": 440 }, { "completion_length": 917.8979267597199, "epoch": 0.3975649148962604, "grad_norm": 1.329880644661546, "kl": 1.244671630859375, "learning_rate": 1.5080040757707045e-05, "loss": 0.0498, "reward": 0.20833333865739406, "reward_std": 0.22462533507496119, "rewards/accuracy_reward": 0.20833333865739406, "rewards/format_reward": 0.0, "step": 450 }, { "completion_length": 877.7073036193848, "epoch": 0.406399690782844, "grad_norm": 2.63876569492226, "kl": 1.6055191040039063, "learning_rate": 1.4811580273048707e-05, "loss": 0.0642, "reward": 0.21770833879709245, "reward_std": 0.23635276220738888, "rewards/accuracy_reward": 0.21770833879709245, "rewards/format_reward": 0.0, "step": 460 }, { "completion_length": 842.9104291439056, "epoch": 0.41523446666942754, "grad_norm": 2.9897920260252797, "kl": 1.5962646484375, "learning_rate": 1.4538528748779561e-05, "loss": 0.0638, "reward": 0.27239584024064245, "reward_std": 0.2976962272077799, "rewards/accuracy_reward": 0.27239584024064245, "rewards/format_reward": 0.0, "step": 470 }, { "completion_length": 846.857304239273, "epoch": 0.4240692425560111, "grad_norm": 0.4405188552245265, "kl": 2.0165779113769533, "learning_rate": 1.4261146721000554e-05, "loss": 0.0807, "reward": 0.26614583996124563, "reward_std": 0.275143482722342, "rewards/accuracy_reward": 0.26614583996124563, "rewards/format_reward": 0.0, "step": 480 }, { "completion_length": 788.802097249031, "epoch": 0.4329040184425947, "grad_norm": 0.9995458801565597, "kl": 2.145621490478516, "learning_rate": 1.3979698857825816e-05, "loss": 0.0858, "reward": 0.2781250068452209, "reward_std": 0.28416458051651716, "rewards/accuracy_reward": 0.2781250068452209, "rewards/format_reward": 0.0, "step": 490 }, { "completion_length": 716.1062657117843, "epoch": 0.44173879432917823, "grad_norm": 0.7288658015008603, "kl": 1.736468505859375, "learning_rate": 1.3694453706845725e-05, "loss": 0.0695, "reward": 0.273437506519258, "reward_std": 0.2787519218400121, "rewards/accuracy_reward": 0.273437506519258, "rewards/format_reward": 0.0, "step": 500 }, { "epoch": 0.44173879432917823, "eval_completion_length": 691.8889026834507, "eval_kl": 1.6545336174242424, "eval_loss": 0.06618843972682953, "eval_reward": 0.2828282897520547, "eval_reward_std": 0.29159103981172196, "eval_rewards/accuracy_reward": 0.2828282897520547, "eval_rewards/format_reward": 0.0, "eval_runtime": 537.4204, "eval_samples_per_second": 0.184, "eval_steps_per_second": 0.061, "step": 500 }, { "completion_length": 708.6135584831238, "epoch": 0.4505735702157618, "grad_norm": 0.3575580002721733, "kl": 1.9233924865722656, "learning_rate": 1.3405683438888281e-05, "loss": 0.0769, "reward": 0.23697917256504297, "reward_std": 0.2571012871339917, "rewards/accuracy_reward": 0.23697917256504297, "rewards/format_reward": 0.0, "step": 510 }, { "completion_length": 768.7807433485984, "epoch": 0.4594083461023454, "grad_norm": 0.3623016043344235, "kl": 1.5948562622070312, "learning_rate": 1.3113663588323356e-05, "loss": 0.0638, "reward": 0.2958333403803408, "reward_std": 0.29679411742836237, "rewards/accuracy_reward": 0.2958333403803408, "rewards/format_reward": 0.0, "step": 520 }, { "completion_length": 761.9812644720078, "epoch": 0.4682431219889289, "grad_norm": 1.1199341162268286, "kl": 2.285528564453125, "learning_rate": 1.2818672790157543e-05, "loss": 0.0914, "reward": 0.3104166739620268, "reward_std": 0.3112278738990426, "rewards/accuracy_reward": 0.3104166739620268, "rewards/format_reward": 0.0, "step": 530 }, { "completion_length": 801.8776177227497, "epoch": 0.47707789787551247, "grad_norm": 2.033721333586972, "kl": 1.8555000305175782, "learning_rate": 1.252099251417048e-05, "loss": 0.0742, "reward": 0.26406250600703063, "reward_std": 0.2625139458104968, "rewards/accuracy_reward": 0.26406250600703063, "rewards/format_reward": 0.0, "step": 540 }, { "completion_length": 772.0724115848541, "epoch": 0.48591267376209607, "grad_norm": 0.7488074779014002, "kl": 1.9588623046875, "learning_rate": 1.2220906796346375e-05, "loss": 0.0784, "reward": 0.28177083991467955, "reward_std": 0.2796540316194296, "rewards/accuracy_reward": 0.28177083991467955, "rewards/format_reward": 0.0, "step": 550 }, { "completion_length": 853.2635561227798, "epoch": 0.4947474496486796, "grad_norm": 0.8443715089551579, "kl": 1.8191993713378907, "learning_rate": 1.1918701967856892e-05, "loss": 0.0728, "reward": 0.3036458405200392, "reward_std": 0.29679411742836237, "rewards/accuracy_reward": 0.3036458405200392, "rewards/format_reward": 0.0, "step": 560 }, { "completion_length": 858.300532579422, "epoch": 0.5035822255352632, "grad_norm": 1.2444227038825535, "kl": 2.2291900634765627, "learning_rate": 1.1614666381854107e-05, "loss": 0.0892, "reward": 0.26041667321696876, "reward_std": 0.2805561413988471, "rewards/accuracy_reward": 0.26041667321696876, "rewards/format_reward": 0.0, "step": 570 }, { "completion_length": 960.0770899295807, "epoch": 0.5124170014218468, "grad_norm": 0.5125806971801138, "kl": 1.9308998107910156, "learning_rate": 1.1309090138334112e-05, "loss": 0.0772, "reward": 0.3078125073108822, "reward_std": 0.30040255654603243, "rewards/accuracy_reward": 0.3078125073108822, "rewards/format_reward": 0.0, "step": 580 }, { "completion_length": 884.6036564350128, "epoch": 0.5212517773084303, "grad_norm": 0.6680675746939368, "kl": 1.9921707153320312, "learning_rate": 1.100226480733388e-05, "loss": 0.0797, "reward": 0.24947917312383652, "reward_std": 0.28867512941360474, "rewards/accuracy_reward": 0.24947917312383652, "rewards/format_reward": 0.0, "step": 590 }, { "completion_length": 799.9510572910309, "epoch": 0.5300865531950139, "grad_norm": 3.062049979351596, "kl": 2.207135009765625, "learning_rate": 1.0694483150725458e-05, "loss": 0.0883, "reward": 0.29583334028720853, "reward_std": 0.29498989786952734, "rewards/accuracy_reward": 0.29583334028720853, "rewards/format_reward": 0.0, "step": 600 }, { "epoch": 0.5300865531950139, "eval_completion_length": 775.0841884420375, "eval_kl": 2.8066208964646466, "eval_loss": 0.11227121949195862, "eval_reward": 0.28619529352043616, "eval_reward_std": 0.3207501437928941, "eval_rewards/accuracy_reward": 0.28619529352043616, "eval_rewards/format_reward": 0.0, "eval_runtime": 554.5488, "eval_samples_per_second": 0.179, "eval_steps_per_second": 0.06, "step": 600 }, { "completion_length": 862.4078252792358, "epoch": 0.5389213290815974, "grad_norm": 0.4132033561721883, "kl": 1.8186492919921875, "learning_rate": 1.038603884287294e-05, "loss": 0.0728, "reward": 0.15156250400468707, "reward_std": 0.17861773632466793, "rewards/accuracy_reward": 0.15156250400468707, "rewards/format_reward": 0.0, "step": 610 }, { "completion_length": 623.4411586642266, "epoch": 0.5477561049681811, "grad_norm": 0.5984776310180936, "kl": 0.5929832458496094, "learning_rate": 1.0077226190418783e-05, "loss": 0.0237, "reward": 0.1755208382382989, "reward_std": 0.2255274448543787, "rewards/accuracy_reward": 0.1755208382382989, "rewards/format_reward": 0.0, "step": 620 }, { "completion_length": 675.0708496570587, "epoch": 0.5565908808547646, "grad_norm": 0.4802015784335563, "kl": 0.47046356201171874, "learning_rate": 9.768339851466818e-06, "loss": 0.0188, "reward": 0.2515625067520887, "reward_std": 0.2940877880901098, "rewards/accuracy_reward": 0.2515625067520887, "rewards/format_reward": 0.0, "step": 630 }, { "completion_length": 768.3067860364914, "epoch": 0.5654256567413481, "grad_norm": 2.4930383667378413, "kl": 1.0879661560058593, "learning_rate": 9.45967455442995e-06, "loss": 0.0435, "reward": 0.34322917480021714, "reward_std": 0.3301721792668104, "rewards/accuracy_reward": 0.34322917480021714, "rewards/format_reward": 0.0, "step": 640 }, { "completion_length": 789.8026188373566, "epoch": 0.5742604326279317, "grad_norm": 1.7100954361428433, "kl": 2.7097198486328127, "learning_rate": 9.151524816810686e-06, "loss": 0.1084, "reward": 0.31666667400859294, "reward_std": 0.30671732500195503, "rewards/accuracy_reward": 0.31666667400859294, "rewards/format_reward": 0.0, "step": 650 }, { "completion_length": 707.0859542965889, "epoch": 0.5830952085145152, "grad_norm": 1.57666179442439, "kl": 1.7728652954101562, "learning_rate": 8.844184664182993e-06, "loss": 0.0709, "reward": 0.35000000791624186, "reward_std": 0.33107428904622793, "rewards/accuracy_reward": 0.35000000791624186, "rewards/format_reward": 0.0, "step": 660 }, { "completion_length": 773.0073058724404, "epoch": 0.5919299844010988, "grad_norm": 0.9807231122231896, "kl": 2.1020263671875, "learning_rate": 8.537947349643493e-06, "loss": 0.0841, "reward": 0.29739584047347306, "reward_std": 0.3112278738990426, "rewards/accuracy_reward": 0.29739584047347306, "rewards/format_reward": 0.0, "step": 670 }, { "completion_length": 777.6401177406311, "epoch": 0.6007647602876824, "grad_norm": 0.5403220713078971, "kl": 1.6934829711914063, "learning_rate": 8.23310507399973e-06, "loss": 0.0678, "reward": 0.2744791732635349, "reward_std": 0.2805561413988471, "rewards/accuracy_reward": 0.2744791732635349, "rewards/format_reward": 0.0, "step": 680 }, { "completion_length": 767.1880352020264, "epoch": 0.609599536174266, "grad_norm": 0.6884159799218071, "kl": 2.134033203125, "learning_rate": 7.929948706962508e-06, "loss": 0.0854, "reward": 0.28958334047347306, "reward_std": 0.30671732500195503, "rewards/accuracy_reward": 0.28958334047347306, "rewards/format_reward": 0.0, "step": 690 }, { "completion_length": 716.2224100291729, "epoch": 0.6184343120608495, "grad_norm": 0.9853920893844104, "kl": 1.9078399658203125, "learning_rate": 7.628767509608304e-06, "loss": 0.0763, "reward": 0.31041667447425425, "reward_std": 0.32566163036972284, "rewards/accuracy_reward": 0.31041667447425425, "rewards/format_reward": 0.0, "step": 700 }, { "epoch": 0.6184343120608495, "eval_completion_length": 676.0437842475044, "eval_kl": 2.118035827020202, "eval_loss": 0.08472807705402374, "eval_reward": 0.30976431749083777, "eval_reward_std": 0.3382456061815975, "eval_rewards/accuracy_reward": 0.30976431749083777, "eval_rewards/format_reward": 0.0, "eval_runtime": 527.3635, "eval_samples_per_second": 0.188, "eval_steps_per_second": 0.063, "step": 700 }, { "completion_length": 693.3250136733055, "epoch": 0.6272690879474331, "grad_norm": 0.6986064715250526, "kl": 2.01304931640625, "learning_rate": 7.329848858376585e-06, "loss": 0.0805, "reward": 0.31510417414829134, "reward_std": 0.3166405325755477, "rewards/accuracy_reward": 0.31510417414829134, "rewards/format_reward": 0.0, "step": 710 }, { "completion_length": 737.9067856788636, "epoch": 0.6361038638340166, "grad_norm": 2.3133745632209393, "kl": 2.0740203857421875, "learning_rate": 7.033477970865381e-06, "loss": 0.083, "reward": 0.32343750772997737, "reward_std": 0.3175426423549652, "rewards/accuracy_reward": 0.32343750772997737, "rewards/format_reward": 0.0, "step": 720 }, { "completion_length": 747.067202681303, "epoch": 0.6449386397206002, "grad_norm": 0.7455664324085425, "kl": 2.0231887817382814, "learning_rate": 6.73993763368675e-06, "loss": 0.0809, "reward": 0.3057291737757623, "reward_std": 0.30040255654603243, "rewards/accuracy_reward": 0.3057291737757623, "rewards/format_reward": 0.0, "step": 730 }, { "completion_length": 730.435950744152, "epoch": 0.6537734156071838, "grad_norm": 1.9312009716198788, "kl": 2.154399108886719, "learning_rate": 6.449507932641796e-06, "loss": 0.0862, "reward": 0.2927083405200392, "reward_std": 0.3040109956637025, "rewards/accuracy_reward": 0.2927083405200392, "rewards/format_reward": 0.0, "step": 740 }, { "completion_length": 707.3302231192589, "epoch": 0.6626081914937674, "grad_norm": 0.6668502719545971, "kl": 1.8355056762695312, "learning_rate": 6.16246598547271e-06, "loss": 0.0734, "reward": 0.2916666741017252, "reward_std": 0.32115108147263527, "rewards/accuracy_reward": 0.2916666741017252, "rewards/format_reward": 0.0, "step": 750 }, { "completion_length": 740.6020975530148, "epoch": 0.6714429673803509, "grad_norm": 0.7705413022710031, "kl": 2.15982666015625, "learning_rate": 5.8790856774468385e-06, "loss": 0.0864, "reward": 0.3046875072643161, "reward_std": 0.30491310544312, "rewards/accuracy_reward": 0.3046875072643161, "rewards/format_reward": 0.0, "step": 760 }, { "completion_length": 739.286992508173, "epoch": 0.6802777432669345, "grad_norm": 0.5391118755792965, "kl": 2.0162887573242188, "learning_rate": 5.599637400025036e-06, "loss": 0.0807, "reward": 0.31458334061317145, "reward_std": 0.3112278738990426, "rewards/accuracy_reward": 0.31458334061317145, "rewards/format_reward": 0.0, "step": 770 }, { "completion_length": 748.7869940280914, "epoch": 0.689112519153518, "grad_norm": 0.7122982868192613, "kl": 2.226336669921875, "learning_rate": 5.324387792863719e-06, "loss": 0.0891, "reward": 0.3234375074971467, "reward_std": 0.3058152152225375, "rewards/accuracy_reward": 0.3234375074971467, "rewards/format_reward": 0.0, "step": 780 }, { "completion_length": 737.4474101424217, "epoch": 0.6979472950401016, "grad_norm": 0.6757655195430946, "kl": 2.05509033203125, "learning_rate": 5.053599489396732e-06, "loss": 0.0822, "reward": 0.31250000759027896, "reward_std": 0.31393420323729515, "rewards/accuracy_reward": 0.31250000759027896, "rewards/format_reward": 0.0, "step": 790 }, { "completion_length": 711.7630351662635, "epoch": 0.7067820709266852, "grad_norm": 0.5761504098242444, "kl": 1.904425048828125, "learning_rate": 4.787530866239826e-06, "loss": 0.0762, "reward": 0.30729167386889455, "reward_std": 0.303108885884285, "rewards/accuracy_reward": 0.30729167386889455, "rewards/format_reward": 0.0, "step": 800 }, { "epoch": 0.7067820709266852, "eval_completion_length": 720.3198793199327, "eval_kl": 1.9343237058080809, "eval_loss": 0.07739535719156265, "eval_reward": 0.32996633739182446, "eval_reward_std": 0.3149183229966597, "eval_rewards/accuracy_reward": 0.32996633739182446, "eval_rewards/format_reward": 0.0, "eval_runtime": 534.7287, "eval_samples_per_second": 0.185, "eval_steps_per_second": 0.062, "step": 800 }, { "completion_length": 715.5786605343222, "epoch": 0.7156168468132688, "grad_norm": 0.7254446582873234, "kl": 2.05001220703125, "learning_rate": 4.52643579665683e-06, "loss": 0.082, "reward": 0.3187500074040145, "reward_std": 0.31303209345787764, "rewards/accuracy_reward": 0.3187500074040145, "rewards/format_reward": 0.0, "step": 810 }, { "completion_length": 729.5484510660172, "epoch": 0.7244516226998523, "grad_norm": 0.8829844749096067, "kl": 2.499810791015625, "learning_rate": 4.270563408322772e-06, "loss": 0.1, "reward": 0.3192708410322666, "reward_std": 0.3184447521343827, "rewards/accuracy_reward": 0.3192708410322666, "rewards/format_reward": 0.0, "step": 820 }, { "completion_length": 717.5286597132683, "epoch": 0.7332863985864359, "grad_norm": 1.7173630228550354, "kl": 2.062689208984375, "learning_rate": 4.020157845615075e-06, "loss": 0.0825, "reward": 0.33333334093913436, "reward_std": 0.3094236543402076, "rewards/accuracy_reward": 0.33333334093913436, "rewards/format_reward": 0.0, "step": 830 }, { "completion_length": 731.0494927078486, "epoch": 0.7421211744730194, "grad_norm": 0.829005223012386, "kl": 2.804341125488281, "learning_rate": 3.7754580366596116e-06, "loss": 0.1122, "reward": 0.29739584065973756, "reward_std": 0.3220531912520528, "rewards/accuracy_reward": 0.29739584065973756, "rewards/format_reward": 0.0, "step": 840 }, { "completion_length": 706.3677220225334, "epoch": 0.7509559503596029, "grad_norm": 1.6348693110104273, "kl": 1.9862686157226563, "learning_rate": 3.5366974653539653e-06, "loss": 0.0795, "reward": 0.30312500717118385, "reward_std": 0.30761943478137255, "rewards/accuracy_reward": 0.30312500717118385, "rewards/format_reward": 0.0, "step": 850 }, { "completion_length": 698.5177225530148, "epoch": 0.7597907262461866, "grad_norm": 0.9159549909583196, "kl": 2.294660949707031, "learning_rate": 3.304103948585341e-06, "loss": 0.0918, "reward": 0.3046875072643161, "reward_std": 0.30130466632544994, "rewards/accuracy_reward": 0.3046875072643161, "rewards/format_reward": 0.0, "step": 860 }, { "completion_length": 672.9390772372484, "epoch": 0.7686255021327701, "grad_norm": 2.1045299190913833, "kl": 2.3146575927734374, "learning_rate": 3.0778994188557722e-06, "loss": 0.0926, "reward": 0.315625007962808, "reward_std": 0.32475952059030533, "rewards/accuracy_reward": 0.315625007962808, "rewards/format_reward": 0.0, "step": 870 }, { "completion_length": 674.6109511375428, "epoch": 0.7774602780193537, "grad_norm": 0.7983792148121029, "kl": 2.302967834472656, "learning_rate": 2.8582997125219604e-06, "loss": 0.0921, "reward": 0.30260417428798975, "reward_std": 0.32566163036972284, "rewards/accuracy_reward": 0.30260417428798975, "rewards/format_reward": 0.0, "step": 880 }, { "completion_length": 656.9838689267635, "epoch": 0.7862950539059372, "grad_norm": 0.669988664196779, "kl": 2.0931640625, "learning_rate": 2.645514363851874e-06, "loss": 0.0837, "reward": 0.3010416736826301, "reward_std": 0.2931856783106923, "rewards/accuracy_reward": 0.3010416736826301, "rewards/format_reward": 0.0, "step": 890 }, { "completion_length": 652.4843894541264, "epoch": 0.7951298297925208, "grad_norm": 1.3222110289859523, "kl": 1.9164962768554688, "learning_rate": 2.4397464050945753e-06, "loss": 0.0767, "reward": 0.2838541739154607, "reward_std": 0.31393420323729515, "rewards/accuracy_reward": 0.2838541739154607, "rewards/format_reward": 0.0, "step": 900 }, { "epoch": 0.7951298297925208, "eval_completion_length": 622.8788027522539, "eval_kl": 2.0288036616161618, "eval_loss": 0.08118358999490738, "eval_reward": 0.3131313209581857, "eval_reward_std": 0.3265819645891286, "eval_rewards/accuracy_reward": 0.3131313209581857, "eval_rewards/format_reward": 0.0, "eval_runtime": 506.0213, "eval_samples_per_second": 0.196, "eval_steps_per_second": 0.065, "step": 900 }, { "completion_length": 651.8578262448311, "epoch": 0.8039646056791043, "grad_norm": 1.0220083602716086, "kl": 2.2107833862304687, "learning_rate": 2.24119217275401e-06, "loss": 0.0884, "reward": 0.2739583405200392, "reward_std": 0.30671732500195503, "rewards/accuracy_reward": 0.2739583405200392, "rewards/format_reward": 0.0, "step": 910 }, { "completion_length": 630.3770972907544, "epoch": 0.812799381565688, "grad_norm": 1.140106665270527, "kl": 2.0034912109375, "learning_rate": 2.0500411202516814e-06, "loss": 0.0802, "reward": 0.29166667340323327, "reward_std": 0.2850666902959347, "rewards/accuracy_reward": 0.29166667340323327, "rewards/format_reward": 0.0, "step": 920 }, { "completion_length": 633.9614718899131, "epoch": 0.8216341574522715, "grad_norm": 1.1778599052603038, "kl": 1.8928955078125, "learning_rate": 1.8664756371568981e-06, "loss": 0.0757, "reward": 0.2963541735429317, "reward_std": 0.29498989786952734, "rewards/accuracy_reward": 0.2963541735429317, "rewards/format_reward": 0.0, "step": 930 }, { "completion_length": 658.522930726409, "epoch": 0.8304689333388551, "grad_norm": 0.6577212329055425, "kl": 2.294940185546875, "learning_rate": 1.6906708751570955e-06, "loss": 0.0918, "reward": 0.31093750675208864, "reward_std": 0.30220677610486746, "rewards/accuracy_reward": 0.31093750675208864, "rewards/format_reward": 0.0, "step": 940 }, { "completion_length": 677.2802217006683, "epoch": 0.8393037092254386, "grad_norm": 0.7422568973111673, "kl": 2.5144363403320313, "learning_rate": 1.5227945809342992e-06, "loss": 0.1006, "reward": 0.3151041742414236, "reward_std": 0.3166405325755477, "rewards/accuracy_reward": 0.3151041742414236, "rewards/format_reward": 0.0, "step": 950 }, { "completion_length": 652.5901178598403, "epoch": 0.8481384851120222, "grad_norm": 0.810421906439625, "kl": 2.262736511230469, "learning_rate": 1.363006936107183e-06, "loss": 0.0905, "reward": 0.34635417480021713, "reward_std": 0.336486947722733, "rewards/accuracy_reward": 0.34635417480021713, "rewards/format_reward": 0.0, "step": 960 }, { "completion_length": 670.2057422459126, "epoch": 0.8569732609986057, "grad_norm": 0.7920465762778155, "kl": 2.5366302490234376, "learning_rate": 1.2114604043914225e-06, "loss": 0.1015, "reward": 0.32031250768341124, "reward_std": 0.3175426423549652, "rewards/accuracy_reward": 0.32031250768341124, "rewards/format_reward": 0.0, "step": 970 }, { "completion_length": 682.0869930744171, "epoch": 0.8658080368851894, "grad_norm": 0.6071556277616226, "kl": 2.4378128051757812, "learning_rate": 1.068299586124224e-06, "loss": 0.0975, "reward": 0.33072917396202683, "reward_std": 0.2995004467666149, "rewards/accuracy_reward": 0.33072917396202683, "rewards/format_reward": 0.0, "step": 980 }, { "completion_length": 663.8849090188742, "epoch": 0.8746428127717729, "grad_norm": 0.7004418244247146, "kl": 2.3257492065429686, "learning_rate": 9.336610802918044e-07, "loss": 0.093, "reward": 0.32447917433455586, "reward_std": 0.3094236543402076, "rewards/accuracy_reward": 0.32447917433455586, "rewards/format_reward": 0.0, "step": 990 }, { "completion_length": 678.5744934767484, "epoch": 0.8834775886583565, "grad_norm": 1.5130552859053217, "kl": 2.3184799194335937, "learning_rate": 8.076733541914617e-07, "loss": 0.0927, "reward": 0.33281250740401447, "reward_std": 0.30491310544312, "rewards/accuracy_reward": 0.33281250740401447, "rewards/format_reward": 0.0, "step": 1000 }, { "epoch": 0.8834775886583565, "eval_completion_length": 664.8687005476518, "eval_kl": 2.323409880050505, "eval_loss": 0.0929863303899765, "eval_reward": 0.3535353619642932, "eval_reward_std": 0.3440774269778319, "eval_rewards/accuracy_reward": 0.3535353619642932, "eval_rewards/format_reward": 0.0, "eval_runtime": 516.2096, "eval_samples_per_second": 0.192, "eval_steps_per_second": 0.064, "step": 1000 }, { "completion_length": 702.9177223086357, "epoch": 0.89231236454494, "grad_norm": 1.529025375905959, "kl": 2.152081298828125, "learning_rate": 6.90456620852632e-07, "loss": 0.0861, "reward": 0.3427083408460021, "reward_std": 0.31393420323729515, "rewards/accuracy_reward": 0.3427083408460021, "rewards/format_reward": 0.0, "step": 1010 }, { "completion_length": 693.2552229389548, "epoch": 0.9011471404315236, "grad_norm": 0.6357248703500725, "kl": 2.1104583740234375, "learning_rate": 5.821227243338712e-07, "loss": 0.0844, "reward": 0.32656250763684513, "reward_std": 0.3157384227961302, "rewards/accuracy_reward": 0.32656250763684513, "rewards/format_reward": 0.0, "step": 1020 }, { "completion_length": 669.4468886733055, "epoch": 0.9099819163181071, "grad_norm": 2.2284285054958324, "kl": 2.1510406494140626, "learning_rate": 4.827750330052117e-07, "loss": 0.0861, "reward": 0.3536458413582295, "reward_std": 0.32566163036972284, "rewards/accuracy_reward": 0.3536458413582295, "rewards/format_reward": 0.0, "step": 1030 }, { "completion_length": 683.547409978509, "epoch": 0.9188166922046908, "grad_norm": 0.6632003611968932, "kl": 2.2973922729492187, "learning_rate": 3.925083409177266e-07, "loss": 0.0919, "reward": 0.34010417442768814, "reward_std": 0.31934686191380024, "rewards/accuracy_reward": 0.34010417442768814, "rewards/format_reward": 0.0, "step": 1040 }, { "completion_length": 679.4802215665579, "epoch": 0.9276514680912743, "grad_norm": 0.8599780179720123, "kl": 2.3347732543945314, "learning_rate": 3.114087773543939e-07, "loss": 0.0934, "reward": 0.3500000080559403, "reward_std": 0.33107428904622793, "rewards/accuracy_reward": 0.3500000080559403, "rewards/format_reward": 0.0, "step": 1050 }, { "completion_length": 703.7713681519032, "epoch": 0.9364862439778578, "grad_norm": 1.174851381620777, "kl": 2.408406066894531, "learning_rate": 2.395537246485846e-07, "loss": 0.0963, "reward": 0.329166674753651, "reward_std": 0.34370382595807314, "rewards/accuracy_reward": 0.329166674753651, "rewards/format_reward": 0.0, "step": 1060 }, { "completion_length": 678.3260560303927, "epoch": 0.9453210198644414, "grad_norm": 1.54743742951263, "kl": 2.394914245605469, "learning_rate": 1.7701174434858193e-07, "loss": 0.0958, "reward": 0.33802084121853115, "reward_std": 0.32024897169321775, "rewards/accuracy_reward": 0.33802084121853115, "rewards/format_reward": 0.0, "step": 1070 }, { "completion_length": 664.8302224695683, "epoch": 0.9541557957510249, "grad_norm": 1.5666079028942566, "kl": 2.3258026123046873, "learning_rate": 1.2384251179857642e-07, "loss": 0.093, "reward": 0.3578125084284693, "reward_std": 0.35001859441399574, "rewards/accuracy_reward": 0.3578125084284693, "rewards/format_reward": 0.0, "step": 1080 }, { "completion_length": 682.4229293212295, "epoch": 0.9629905716376085, "grad_norm": 0.6943507484399737, "kl": 2.487345886230469, "learning_rate": 8.009675919856574e-08, "loss": 0.0995, "reward": 0.32239584033377466, "reward_std": 0.29679411742836237, "rewards/accuracy_reward": 0.32239584033377466, "rewards/format_reward": 0.0, "step": 1090 }, { "completion_length": 689.0625125810504, "epoch": 0.9718253475241921, "grad_norm": 0.6276023210613672, "kl": 2.3767745971679686, "learning_rate": 4.581622719748269e-08, "loss": 0.0951, "reward": 0.3395833406597376, "reward_std": 0.2985983369871974, "rewards/accuracy_reward": 0.3395833406597376, "rewards/format_reward": 0.0, "step": 1100 }, { "epoch": 0.9718253475241921, "eval_completion_length": 665.5656705528799, "eval_kl": 2.368923611111111, "eval_loss": 0.09475857019424438, "eval_reward": 0.3804713891010092, "eval_reward_std": 0.3382456061815975, "eval_rewards/accuracy_reward": 0.3804713891010092, "eval_rewards/format_reward": 0.0, "eval_runtime": 510.5875, "eval_samples_per_second": 0.194, "eval_steps_per_second": 0.065, "step": 1100 }, { "completion_length": 687.5755343332887, "epoch": 0.9806601234107757, "grad_norm": 0.5833877177508657, "kl": 2.3449798583984376, "learning_rate": 2.1033625065747244e-08, "loss": 0.0938, "reward": 0.3359375076368451, "reward_std": 0.31393420323729515, "rewards/accuracy_reward": 0.3359375076368451, "rewards/format_reward": 0.0, "step": 1110 }, { "completion_length": 686.5161595344543, "epoch": 0.9894948992973592, "grad_norm": 0.8177663703595698, "kl": 2.952043151855469, "learning_rate": 5.772599485236452e-09, "loss": 0.118, "reward": 0.3333333409857005, "reward_std": 0.3238574108108878, "rewards/accuracy_reward": 0.3333333409857005, "rewards/format_reward": 0.0, "step": 1120 }, { "completion_length": 679.1244929388165, "epoch": 0.9983296751839428, "grad_norm": 1.027763278137717, "kl": 2.3051712036132814, "learning_rate": 4.7711986460585725e-11, "loss": 0.0922, "reward": 0.3552083419635892, "reward_std": 0.34370382595807314, "rewards/accuracy_reward": 0.3552083419635892, "rewards/format_reward": 0.0, "step": 1130 }, { "completion_length": 695.1406378149986, "epoch": 0.9992131527726011, "kl": 2.40301513671875, "reward": 0.3125000074505806, "reward_std": 0.32475952059030533, "rewards/accuracy_reward": 0.3125000074505806, "rewards/format_reward": 0.0, "step": 1131, "total_flos": 0.0, "train_loss": 0.06294877494734573, "train_runtime": 428958.1793, "train_samples_per_second": 0.169, "train_steps_per_second": 0.003 } ], "logging_steps": 10, "max_steps": 1131, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 200, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }