| { | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 0.9992131527726011, | |
| "eval_steps": 100, | |
| "global_step": 1131, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "completion_length": 894.0125112533569, | |
| "epoch": 0.008834775886583565, | |
| "grad_norm": 0.10339409232367645, | |
| "kl": 6.553309503942728e-05, | |
| "learning_rate": 1.7543859649122807e-06, | |
| "loss": 0.0, | |
| "reward": 0.20572917186655104, | |
| "reward_std": 0.22462533507496119, | |
| "rewards/accuracy_reward": 0.20572917186655104, | |
| "rewards/format_reward": 0.0, | |
| "step": 10 | |
| }, | |
| { | |
| "completion_length": 871.2338674545288, | |
| "epoch": 0.01766955177316713, | |
| "grad_norm": 0.1191837579508088, | |
| "kl": 0.0008328911615535616, | |
| "learning_rate": 3.5087719298245615e-06, | |
| "loss": 0.0, | |
| "reward": 0.250000006519258, | |
| "reward_std": 0.275143482722342, | |
| "rewards/accuracy_reward": 0.250000006519258, | |
| "rewards/format_reward": 0.0, | |
| "step": 20 | |
| }, | |
| { | |
| "completion_length": 805.9927246570587, | |
| "epoch": 0.026504327659750693, | |
| "grad_norm": 0.11427842333813355, | |
| "kl": 0.006137973070144654, | |
| "learning_rate": 5.263157894736842e-06, | |
| "loss": 0.0002, | |
| "reward": 0.3583333409391344, | |
| "reward_std": 0.3040109956637025, | |
| "rewards/accuracy_reward": 0.3583333409391344, | |
| "rewards/format_reward": 0.0, | |
| "step": 30 | |
| }, | |
| { | |
| "completion_length": 815.7588698387146, | |
| "epoch": 0.03533910354633426, | |
| "grad_norm": 0.1094331129364983, | |
| "kl": 0.011934018135070801, | |
| "learning_rate": 7.017543859649123e-06, | |
| "loss": 0.0005, | |
| "reward": 0.3635416746605188, | |
| "reward_std": 0.31483631301671267, | |
| "rewards/accuracy_reward": 0.3635416746605188, | |
| "rewards/format_reward": 0.0, | |
| "step": 40 | |
| }, | |
| { | |
| "completion_length": 811.1395996570587, | |
| "epoch": 0.044173879432917826, | |
| "grad_norm": 0.11064742736511399, | |
| "kl": 0.027907085418701173, | |
| "learning_rate": 8.771929824561405e-06, | |
| "loss": 0.0011, | |
| "reward": 0.38958334126509725, | |
| "reward_std": 0.31934686191380024, | |
| "rewards/accuracy_reward": 0.38958334126509725, | |
| "rewards/format_reward": 0.0, | |
| "step": 50 | |
| }, | |
| { | |
| "completion_length": 729.52032995224, | |
| "epoch": 0.053008655319501385, | |
| "grad_norm": 0.5600590072843962, | |
| "kl": 0.11420574188232421, | |
| "learning_rate": 1.0526315789473684e-05, | |
| "loss": 0.0046, | |
| "reward": 0.47083334238268437, | |
| "reward_std": 0.34550804551690817, | |
| "rewards/accuracy_reward": 0.47083334238268437, | |
| "rewards/format_reward": 0.0, | |
| "step": 60 | |
| }, | |
| { | |
| "completion_length": 748.378140592575, | |
| "epoch": 0.06184343120608495, | |
| "grad_norm": 0.49897174304455477, | |
| "kl": 0.4907238006591797, | |
| "learning_rate": 1.2280701754385966e-05, | |
| "loss": 0.0196, | |
| "reward": 0.35729167410172524, | |
| "reward_std": 0.3094236543402076, | |
| "rewards/accuracy_reward": 0.35729167410172524, | |
| "rewards/format_reward": 0.0, | |
| "step": 70 | |
| }, | |
| { | |
| "completion_length": 731.9463713169098, | |
| "epoch": 0.07067820709266852, | |
| "grad_norm": 2.1010371186603622, | |
| "kl": 0.5549324035644532, | |
| "learning_rate": 1.4035087719298246e-05, | |
| "loss": 0.0222, | |
| "reward": 0.39947917480021716, | |
| "reward_std": 0.3283679597079754, | |
| "rewards/accuracy_reward": 0.39947917480021716, | |
| "rewards/format_reward": 0.0, | |
| "step": 80 | |
| }, | |
| { | |
| "completion_length": 636.9864743709564, | |
| "epoch": 0.07951298297925208, | |
| "grad_norm": 0.3320296791157868, | |
| "kl": 0.8359504699707031, | |
| "learning_rate": 1.578947368421053e-05, | |
| "loss": 0.0334, | |
| "reward": 0.35625000754371283, | |
| "reward_std": 0.30491310544312, | |
| "rewards/accuracy_reward": 0.35625000754371283, | |
| "rewards/format_reward": 0.0, | |
| "step": 90 | |
| }, | |
| { | |
| "completion_length": 752.3406419754028, | |
| "epoch": 0.08834775886583565, | |
| "grad_norm": 13.222864026304222, | |
| "kl": 1.2244148254394531, | |
| "learning_rate": 1.754385964912281e-05, | |
| "loss": 0.049, | |
| "reward": 0.3458333408460021, | |
| "reward_std": 0.2922835685312748, | |
| "rewards/accuracy_reward": 0.3458333408460021, | |
| "rewards/format_reward": 0.0, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.08834775886583565, | |
| "eval_completion_length": 806.9528756189828, | |
| "eval_kl": 0.7132556029040404, | |
| "eval_loss": 0.028531787917017937, | |
| "eval_reward": 0.3063973137224563, | |
| "eval_reward_std": 0.3090865022004253, | |
| "eval_rewards/accuracy_reward": 0.3063973137224563, | |
| "eval_rewards/format_reward": 0.0, | |
| "eval_runtime": 538.1844, | |
| "eval_samples_per_second": 0.184, | |
| "eval_steps_per_second": 0.061, | |
| "step": 100 | |
| }, | |
| { | |
| "completion_length": 788.5630361557007, | |
| "epoch": 0.09718253475241921, | |
| "grad_norm": 1.9841707275878488, | |
| "kl": 1.9315879821777344, | |
| "learning_rate": 1.929824561403509e-05, | |
| "loss": 0.0773, | |
| "reward": 0.3375000071246177, | |
| "reward_std": 0.28867512941360474, | |
| "rewards/accuracy_reward": 0.3375000071246177, | |
| "rewards/format_reward": 0.0, | |
| "step": 110 | |
| }, | |
| { | |
| "completion_length": 782.2432436466217, | |
| "epoch": 0.10601731063900277, | |
| "grad_norm": 1.6220755341742679, | |
| "kl": 1.2291488647460938, | |
| "learning_rate": 1.9998282416292057e-05, | |
| "loss": 0.0492, | |
| "reward": 0.29166667312383654, | |
| "reward_std": 0.2625139458104968, | |
| "rewards/accuracy_reward": 0.29166667312383654, | |
| "rewards/format_reward": 0.0, | |
| "step": 120 | |
| }, | |
| { | |
| "completion_length": 855.3958479881287, | |
| "epoch": 0.11485208652558634, | |
| "grad_norm": 1.2506380617356658, | |
| "kl": 0.8863227844238282, | |
| "learning_rate": 1.9987788208027496e-05, | |
| "loss": 0.0355, | |
| "reward": 0.3015625067986548, | |
| "reward_std": 0.2796540316194296, | |
| "rewards/accuracy_reward": 0.3015625067986548, | |
| "rewards/format_reward": 0.0, | |
| "step": 130 | |
| }, | |
| { | |
| "completion_length": 923.2276175498962, | |
| "epoch": 0.1236868624121699, | |
| "grad_norm": 0.7947305780961375, | |
| "kl": 0.8832084655761718, | |
| "learning_rate": 1.9967764005730785e-05, | |
| "loss": 0.0353, | |
| "reward": 0.17343750461004676, | |
| "reward_std": 0.2128979079425335, | |
| "rewards/accuracy_reward": 0.17343750461004676, | |
| "rewards/format_reward": 0.0, | |
| "step": 140 | |
| }, | |
| { | |
| "completion_length": 867.6395987987519, | |
| "epoch": 0.13252163829875346, | |
| "grad_norm": 0.497580876778096, | |
| "kl": 0.5946540832519531, | |
| "learning_rate": 1.993822891578708e-05, | |
| "loss": 0.0238, | |
| "reward": 0.18281250479631125, | |
| "reward_std": 0.2192126763984561, | |
| "rewards/accuracy_reward": 0.18281250479631125, | |
| "rewards/format_reward": 0.0, | |
| "step": 150 | |
| }, | |
| { | |
| "completion_length": 718.904184293747, | |
| "epoch": 0.14135641418533704, | |
| "grad_norm": 1.857046714456291, | |
| "kl": 0.6682144165039062, | |
| "learning_rate": 1.9899211119533938e-05, | |
| "loss": 0.0267, | |
| "reward": 0.3848958414513618, | |
| "reward_std": 0.33107428904622793, | |
| "rewards/accuracy_reward": 0.3848958414513618, | |
| "rewards/format_reward": 0.0, | |
| "step": 160 | |
| }, | |
| { | |
| "completion_length": 918.8239655733108, | |
| "epoch": 0.1501911900719206, | |
| "grad_norm": 0.20248662305659806, | |
| "kl": 0.5040359497070312, | |
| "learning_rate": 1.985074784637167e-05, | |
| "loss": 0.0202, | |
| "reward": 0.09479166907258332, | |
| "reward_std": 0.09923207573592663, | |
| "rewards/accuracy_reward": 0.09479166907258332, | |
| "rewards/format_reward": 0.0, | |
| "step": 170 | |
| }, | |
| { | |
| "completion_length": 827.5974071979523, | |
| "epoch": 0.15902596595850416, | |
| "grad_norm": 0.19897870444761726, | |
| "kl": 0.17742691040039063, | |
| "learning_rate": 1.9792885338240375e-05, | |
| "loss": 0.0071, | |
| "reward": 0.17291667088866233, | |
| "reward_std": 0.17771562654525042, | |
| "rewards/accuracy_reward": 0.17291667088866233, | |
| "rewards/format_reward": 0.0, | |
| "step": 180 | |
| }, | |
| { | |
| "completion_length": 696.4333508253097, | |
| "epoch": 0.16786074184508773, | |
| "grad_norm": 0.4541804472147627, | |
| "kl": 0.3155979156494141, | |
| "learning_rate": 1.9725678805497507e-05, | |
| "loss": 0.0126, | |
| "reward": 0.42395834233611823, | |
| "reward_std": 0.3536270335316658, | |
| "rewards/accuracy_reward": 0.42395834233611823, | |
| "rewards/format_reward": 0.0, | |
| "step": 190 | |
| }, | |
| { | |
| "completion_length": 709.8849118709564, | |
| "epoch": 0.1766955177316713, | |
| "grad_norm": 0.5097645308657522, | |
| "kl": 1.0620399475097657, | |
| "learning_rate": 1.964919237423812e-05, | |
| "loss": 0.0425, | |
| "reward": 0.2687500062398612, | |
| "reward_std": 0.2625139458104968, | |
| "rewards/accuracy_reward": 0.2687500062398612, | |
| "rewards/format_reward": 0.0, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.1766955177316713, | |
| "eval_completion_length": 699.2357113000119, | |
| "eval_kl": 0.9798473011363636, | |
| "eval_loss": 0.039183806627988815, | |
| "eval_reward": 0.3872053963367385, | |
| "eval_reward_std": 0.34990924777406635, | |
| "eval_rewards/accuracy_reward": 0.3872053963367385, | |
| "eval_rewards/format_reward": 0.0, | |
| "eval_runtime": 513.8181, | |
| "eval_samples_per_second": 0.193, | |
| "eval_steps_per_second": 0.064, | |
| "step": 200 | |
| }, | |
| { | |
| "completion_length": 786.7765783786774, | |
| "epoch": 0.18553029361825485, | |
| "grad_norm": 0.15936983650030578, | |
| "kl": 1.27841796875, | |
| "learning_rate": 1.9563499025108e-05, | |
| "loss": 0.0511, | |
| "reward": 0.25937500610016284, | |
| "reward_std": 0.25980761647224426, | |
| "rewards/accuracy_reward": 0.25937500610016284, | |
| "rewards/format_reward": 0.0, | |
| "step": 210 | |
| }, | |
| { | |
| "completion_length": 909.2531366825103, | |
| "epoch": 0.19436506950483842, | |
| "grad_norm": 0.15108105876463188, | |
| "kl": 0.6172576904296875, | |
| "learning_rate": 1.9468680523668136e-05, | |
| "loss": 0.0247, | |
| "reward": 0.08593750246800483, | |
| "reward_std": 0.11998060066252947, | |
| "rewards/accuracy_reward": 0.08593750246800483, | |
| "rewards/format_reward": 0.0, | |
| "step": 220 | |
| }, | |
| { | |
| "completion_length": 847.0453273773194, | |
| "epoch": 0.203199845391422, | |
| "grad_norm": 0.4781092711117684, | |
| "kl": 0.719610595703125, | |
| "learning_rate": 1.936482734237689e-05, | |
| "loss": 0.0288, | |
| "reward": 0.1854166716337204, | |
| "reward_std": 0.2201147861778736, | |
| "rewards/accuracy_reward": 0.1854166716337204, | |
| "rewards/format_reward": 0.0, | |
| "step": 230 | |
| }, | |
| { | |
| "completion_length": 844.9718894481659, | |
| "epoch": 0.21203462127800554, | |
| "grad_norm": 0.6125806510450937, | |
| "kl": 0.6312515258789062, | |
| "learning_rate": 1.9252038574264403e-05, | |
| "loss": 0.0253, | |
| "reward": 0.1963541720993817, | |
| "reward_std": 0.25168862845748663, | |
| "rewards/accuracy_reward": 0.1963541720993817, | |
| "rewards/format_reward": 0.0, | |
| "step": 240 | |
| }, | |
| { | |
| "completion_length": 716.1927267074585, | |
| "epoch": 0.22086939716458912, | |
| "grad_norm": 0.42158257729983767, | |
| "kl": 1.013921356201172, | |
| "learning_rate": 1.913042183838153e-05, | |
| "loss": 0.0406, | |
| "reward": 0.3276041746605188, | |
| "reward_std": 0.3418996063992381, | |
| "rewards/accuracy_reward": 0.3276041746605188, | |
| "rewards/format_reward": 0.0, | |
| "step": 250 | |
| }, | |
| { | |
| "completion_length": 772.1078284740448, | |
| "epoch": 0.2297041730511727, | |
| "grad_norm": 0.49721786332420803, | |
| "kl": 1.1279647827148438, | |
| "learning_rate": 1.9000093177113524e-05, | |
| "loss": 0.0451, | |
| "reward": 0.25833333977498113, | |
| "reward_std": 0.27604559250175953, | |
| "rewards/accuracy_reward": 0.25833333977498113, | |
| "rewards/format_reward": 0.0, | |
| "step": 260 | |
| }, | |
| { | |
| "completion_length": 798.2672049999237, | |
| "epoch": 0.23853894893775623, | |
| "grad_norm": 0.6009045781426623, | |
| "kl": 0.8998489379882812, | |
| "learning_rate": 1.8861176945456542e-05, | |
| "loss": 0.036, | |
| "reward": 0.26093750656582415, | |
| "reward_std": 0.2931856783106923, | |
| "rewards/accuracy_reward": 0.26093750656582415, | |
| "rewards/format_reward": 0.0, | |
| "step": 270 | |
| }, | |
| { | |
| "completion_length": 739.2422043561935, | |
| "epoch": 0.2473737248243398, | |
| "grad_norm": 0.9064888423502573, | |
| "kl": 1.204058837890625, | |
| "learning_rate": 1.8713805692362458e-05, | |
| "loss": 0.0482, | |
| "reward": 0.31979167396202685, | |
| "reward_std": 0.30761943478137255, | |
| "rewards/accuracy_reward": 0.31979167396202685, | |
| "rewards/format_reward": 0.0, | |
| "step": 280 | |
| }, | |
| { | |
| "completion_length": 823.738035774231, | |
| "epoch": 0.2562085007109234, | |
| "grad_norm": 0.4395412639205522, | |
| "kl": 0.8689178466796875, | |
| "learning_rate": 1.8558120034265396e-05, | |
| "loss": 0.0348, | |
| "reward": 0.24218750628642738, | |
| "reward_std": 0.275143482722342, | |
| "rewards/accuracy_reward": 0.24218750628642738, | |
| "rewards/format_reward": 0.0, | |
| "step": 290 | |
| }, | |
| { | |
| "completion_length": 866.1130366325378, | |
| "epoch": 0.2650432765975069, | |
| "grad_norm": 0.4910227951322757, | |
| "kl": 0.7059478759765625, | |
| "learning_rate": 1.8394268520910467e-05, | |
| "loss": 0.0282, | |
| "reward": 0.236458339728415, | |
| "reward_std": 0.27153504360467196, | |
| "rewards/accuracy_reward": 0.236458339728415, | |
| "rewards/format_reward": 0.0, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.2650432765975069, | |
| "eval_completion_length": 855.1784671051333, | |
| "eval_kl": 0.9380622632575758, | |
| "eval_loss": 0.03754038363695145, | |
| "eval_reward": 0.2558922623143052, | |
| "eval_reward_std": 0.2857592190154875, | |
| "eval_rewards/accuracy_reward": 0.2558922623143052, | |
| "eval_rewards/format_reward": 0.0, | |
| "eval_runtime": 552.4722, | |
| "eval_samples_per_second": 0.179, | |
| "eval_steps_per_second": 0.06, | |
| "step": 300 | |
| }, | |
| { | |
| "completion_length": 812.8906393527984, | |
| "epoch": 0.27387805248409053, | |
| "grad_norm": 0.8576089815755503, | |
| "kl": 1.0579757690429688, | |
| "learning_rate": 1.8222407493612878e-05, | |
| "loss": 0.0423, | |
| "reward": 0.2651041731238365, | |
| "reward_std": 0.27153504360467196, | |
| "rewards/accuracy_reward": 0.2651041731238365, | |
| "rewards/format_reward": 0.0, | |
| "step": 310 | |
| }, | |
| { | |
| "completion_length": 733.2239722013474, | |
| "epoch": 0.2827128283706741, | |
| "grad_norm": 0.2427855401581468, | |
| "kl": 0.7223701477050781, | |
| "learning_rate": 1.8042700936082574e-05, | |
| "loss": 0.0289, | |
| "reward": 0.29479167317040267, | |
| "reward_std": 0.27604559250175953, | |
| "rewards/accuracy_reward": 0.29479167317040267, | |
| "rewards/format_reward": 0.0, | |
| "step": 320 | |
| }, | |
| { | |
| "completion_length": 711.7026183843612, | |
| "epoch": 0.2915476042572576, | |
| "grad_norm": 1.9778007104616155, | |
| "kl": 0.8535140991210938, | |
| "learning_rate": 1.7855320317956785e-05, | |
| "loss": 0.0341, | |
| "reward": 0.3192708408460021, | |
| "reward_std": 0.303108885884285, | |
| "rewards/accuracy_reward": 0.3192708408460021, | |
| "rewards/format_reward": 0.0, | |
| "step": 330 | |
| }, | |
| { | |
| "completion_length": 728.3005360603332, | |
| "epoch": 0.3003823801438412, | |
| "grad_norm": 0.38171472488642083, | |
| "kl": 0.7892318725585937, | |
| "learning_rate": 1.766044443118978e-05, | |
| "loss": 0.0316, | |
| "reward": 0.3526041739620268, | |
| "reward_std": 0.3112278738990426, | |
| "rewards/accuracy_reward": 0.3526041739620268, | |
| "rewards/format_reward": 0.0, | |
| "step": 340 | |
| }, | |
| { | |
| "completion_length": 767.1573046445847, | |
| "epoch": 0.30921715603042477, | |
| "grad_norm": 0.24577197070273166, | |
| "kl": 0.5293319702148438, | |
| "learning_rate": 1.7458259219455896e-05, | |
| "loss": 0.0212, | |
| "reward": 0.3062500067986548, | |
| "reward_std": 0.2814582511782646, | |
| "rewards/accuracy_reward": 0.3062500067986548, | |
| "rewards/format_reward": 0.0, | |
| "step": 350 | |
| }, | |
| { | |
| "completion_length": 743.670326089859, | |
| "epoch": 0.3180519319170083, | |
| "grad_norm": 2.0103657238013652, | |
| "kl": 0.5427993774414063, | |
| "learning_rate": 1.7248957600728664e-05, | |
| "loss": 0.0217, | |
| "reward": 0.3505208409391344, | |
| "reward_std": 0.3175426423549652, | |
| "rewards/accuracy_reward": 0.3505208409391344, | |
| "rewards/format_reward": 0.0, | |
| "step": 360 | |
| }, | |
| { | |
| "completion_length": 840.4734511613846, | |
| "epoch": 0.3268867078035919, | |
| "grad_norm": 1.0323054081356446, | |
| "kl": 1.1146942138671876, | |
| "learning_rate": 1.7032739283205324e-05, | |
| "loss": 0.0446, | |
| "reward": 0.2598958395421505, | |
| "reward_std": 0.25890550669282675, | |
| "rewards/accuracy_reward": 0.2598958395421505, | |
| "rewards/format_reward": 0.0, | |
| "step": 370 | |
| }, | |
| { | |
| "completion_length": 839.7666805267334, | |
| "epoch": 0.33572148369017546, | |
| "grad_norm": 0.5119633048650775, | |
| "kl": 1.2632476806640625, | |
| "learning_rate": 1.6809810574752316e-05, | |
| "loss": 0.0505, | |
| "reward": 0.2333333392161876, | |
| "reward_std": 0.25168862845748663, | |
| "rewards/accuracy_reward": 0.2333333392161876, | |
| "rewards/format_reward": 0.0, | |
| "step": 380 | |
| }, | |
| { | |
| "completion_length": 733.7547018527985, | |
| "epoch": 0.344556259576759, | |
| "grad_norm": 1.5656576378617317, | |
| "kl": 1.1475967407226562, | |
| "learning_rate": 1.658038418605361e-05, | |
| "loss": 0.0459, | |
| "reward": 0.2567708396818489, | |
| "reward_std": 0.2634160555899143, | |
| "rewards/accuracy_reward": 0.2567708396818489, | |
| "rewards/format_reward": 0.0, | |
| "step": 390 | |
| }, | |
| { | |
| "completion_length": 885.3140710353852, | |
| "epoch": 0.3533910354633426, | |
| "grad_norm": 1.4058117586591972, | |
| "kl": 1.0219383239746094, | |
| "learning_rate": 1.6344679027649726e-05, | |
| "loss": 0.0409, | |
| "reward": 0.247916672937572, | |
| "reward_std": 0.2688287142664194, | |
| "rewards/accuracy_reward": 0.247916672937572, | |
| "rewards/format_reward": 0.0, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.3533910354633426, | |
| "eval_completion_length": 985.952867296007, | |
| "eval_kl": 1.9370610400883839, | |
| "eval_loss": 0.07748492807149887, | |
| "eval_reward": 0.26262626924900095, | |
| "eval_reward_std": 0.27992739821925305, | |
| "eval_rewards/accuracy_reward": 0.26262626924900095, | |
| "eval_rewards/format_reward": 0.0, | |
| "eval_runtime": 572.8755, | |
| "eval_samples_per_second": 0.173, | |
| "eval_steps_per_second": 0.058, | |
| "step": 400 | |
| }, | |
| { | |
| "completion_length": 956.2359453678131, | |
| "epoch": 0.36222581134992615, | |
| "grad_norm": 0.42438124420657664, | |
| "kl": 1.3301658630371094, | |
| "learning_rate": 1.6102920001061003e-05, | |
| "loss": 0.0532, | |
| "reward": 0.20677083879709243, | |
| "reward_std": 0.2381569817662239, | |
| "rewards/accuracy_reward": 0.20677083879709243, | |
| "rewards/format_reward": 0.0, | |
| "step": 410 | |
| }, | |
| { | |
| "completion_length": 952.6323000907898, | |
| "epoch": 0.3710605872365097, | |
| "grad_norm": 0.44079559643052724, | |
| "kl": 1.332806396484375, | |
| "learning_rate": 1.5855337784194576e-05, | |
| "loss": 0.0533, | |
| "reward": 0.19427083847112953, | |
| "reward_std": 0.2309401035308838, | |
| "rewards/accuracy_reward": 0.19427083847112953, | |
| "rewards/format_reward": 0.0, | |
| "step": 420 | |
| }, | |
| { | |
| "completion_length": 976.3510466098785, | |
| "epoch": 0.3798953631230933, | |
| "grad_norm": 1.0796755649934018, | |
| "kl": 1.4209449768066407, | |
| "learning_rate": 1.560216861123964e-05, | |
| "loss": 0.0568, | |
| "reward": 0.22760417237877845, | |
| "reward_std": 0.2498844088986516, | |
| "rewards/accuracy_reward": 0.22760417237877845, | |
| "rewards/format_reward": 0.0, | |
| "step": 430 | |
| }, | |
| { | |
| "completion_length": 959.9578190803528, | |
| "epoch": 0.38873013900967684, | |
| "grad_norm": 0.46008757265822564, | |
| "kl": 1.3368026733398437, | |
| "learning_rate": 1.534365404726116e-05, | |
| "loss": 0.0535, | |
| "reward": 0.20885417223908007, | |
| "reward_std": 0.24808018933981657, | |
| "rewards/accuracy_reward": 0.20885417223908007, | |
| "rewards/format_reward": 0.0, | |
| "step": 440 | |
| }, | |
| { | |
| "completion_length": 917.8979267597199, | |
| "epoch": 0.3975649148962604, | |
| "grad_norm": 1.329880644661546, | |
| "kl": 1.244671630859375, | |
| "learning_rate": 1.5080040757707045e-05, | |
| "loss": 0.0498, | |
| "reward": 0.20833333865739406, | |
| "reward_std": 0.22462533507496119, | |
| "rewards/accuracy_reward": 0.20833333865739406, | |
| "rewards/format_reward": 0.0, | |
| "step": 450 | |
| }, | |
| { | |
| "completion_length": 877.7073036193848, | |
| "epoch": 0.406399690782844, | |
| "grad_norm": 2.63876569492226, | |
| "kl": 1.6055191040039063, | |
| "learning_rate": 1.4811580273048707e-05, | |
| "loss": 0.0642, | |
| "reward": 0.21770833879709245, | |
| "reward_std": 0.23635276220738888, | |
| "rewards/accuracy_reward": 0.21770833879709245, | |
| "rewards/format_reward": 0.0, | |
| "step": 460 | |
| }, | |
| { | |
| "completion_length": 842.9104291439056, | |
| "epoch": 0.41523446666942754, | |
| "grad_norm": 2.9897920260252797, | |
| "kl": 1.5962646484375, | |
| "learning_rate": 1.4538528748779561e-05, | |
| "loss": 0.0638, | |
| "reward": 0.27239584024064245, | |
| "reward_std": 0.2976962272077799, | |
| "rewards/accuracy_reward": 0.27239584024064245, | |
| "rewards/format_reward": 0.0, | |
| "step": 470 | |
| }, | |
| { | |
| "completion_length": 846.857304239273, | |
| "epoch": 0.4240692425560111, | |
| "grad_norm": 0.4405188552245265, | |
| "kl": 2.0165779113769533, | |
| "learning_rate": 1.4261146721000554e-05, | |
| "loss": 0.0807, | |
| "reward": 0.26614583996124563, | |
| "reward_std": 0.275143482722342, | |
| "rewards/accuracy_reward": 0.26614583996124563, | |
| "rewards/format_reward": 0.0, | |
| "step": 480 | |
| }, | |
| { | |
| "completion_length": 788.802097249031, | |
| "epoch": 0.4329040184425947, | |
| "grad_norm": 0.9995458801565597, | |
| "kl": 2.145621490478516, | |
| "learning_rate": 1.3979698857825816e-05, | |
| "loss": 0.0858, | |
| "reward": 0.2781250068452209, | |
| "reward_std": 0.28416458051651716, | |
| "rewards/accuracy_reward": 0.2781250068452209, | |
| "rewards/format_reward": 0.0, | |
| "step": 490 | |
| }, | |
| { | |
| "completion_length": 716.1062657117843, | |
| "epoch": 0.44173879432917823, | |
| "grad_norm": 0.7288658015008603, | |
| "kl": 1.736468505859375, | |
| "learning_rate": 1.3694453706845725e-05, | |
| "loss": 0.0695, | |
| "reward": 0.273437506519258, | |
| "reward_std": 0.2787519218400121, | |
| "rewards/accuracy_reward": 0.273437506519258, | |
| "rewards/format_reward": 0.0, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.44173879432917823, | |
| "eval_completion_length": 691.8889026834507, | |
| "eval_kl": 1.6545336174242424, | |
| "eval_loss": 0.06618843972682953, | |
| "eval_reward": 0.2828282897520547, | |
| "eval_reward_std": 0.29159103981172196, | |
| "eval_rewards/accuracy_reward": 0.2828282897520547, | |
| "eval_rewards/format_reward": 0.0, | |
| "eval_runtime": 537.4204, | |
| "eval_samples_per_second": 0.184, | |
| "eval_steps_per_second": 0.061, | |
| "step": 500 | |
| }, | |
| { | |
| "completion_length": 708.6135584831238, | |
| "epoch": 0.4505735702157618, | |
| "grad_norm": 0.3575580002721733, | |
| "kl": 1.9233924865722656, | |
| "learning_rate": 1.3405683438888281e-05, | |
| "loss": 0.0769, | |
| "reward": 0.23697917256504297, | |
| "reward_std": 0.2571012871339917, | |
| "rewards/accuracy_reward": 0.23697917256504297, | |
| "rewards/format_reward": 0.0, | |
| "step": 510 | |
| }, | |
| { | |
| "completion_length": 768.7807433485984, | |
| "epoch": 0.4594083461023454, | |
| "grad_norm": 0.3623016043344235, | |
| "kl": 1.5948562622070312, | |
| "learning_rate": 1.3113663588323356e-05, | |
| "loss": 0.0638, | |
| "reward": 0.2958333403803408, | |
| "reward_std": 0.29679411742836237, | |
| "rewards/accuracy_reward": 0.2958333403803408, | |
| "rewards/format_reward": 0.0, | |
| "step": 520 | |
| }, | |
| { | |
| "completion_length": 761.9812644720078, | |
| "epoch": 0.4682431219889289, | |
| "grad_norm": 1.1199341162268286, | |
| "kl": 2.285528564453125, | |
| "learning_rate": 1.2818672790157543e-05, | |
| "loss": 0.0914, | |
| "reward": 0.3104166739620268, | |
| "reward_std": 0.3112278738990426, | |
| "rewards/accuracy_reward": 0.3104166739620268, | |
| "rewards/format_reward": 0.0, | |
| "step": 530 | |
| }, | |
| { | |
| "completion_length": 801.8776177227497, | |
| "epoch": 0.47707789787551247, | |
| "grad_norm": 2.033721333586972, | |
| "kl": 1.8555000305175782, | |
| "learning_rate": 1.252099251417048e-05, | |
| "loss": 0.0742, | |
| "reward": 0.26406250600703063, | |
| "reward_std": 0.2625139458104968, | |
| "rewards/accuracy_reward": 0.26406250600703063, | |
| "rewards/format_reward": 0.0, | |
| "step": 540 | |
| }, | |
| { | |
| "completion_length": 772.0724115848541, | |
| "epoch": 0.48591267376209607, | |
| "grad_norm": 0.7488074779014002, | |
| "kl": 1.9588623046875, | |
| "learning_rate": 1.2220906796346375e-05, | |
| "loss": 0.0784, | |
| "reward": 0.28177083991467955, | |
| "reward_std": 0.2796540316194296, | |
| "rewards/accuracy_reward": 0.28177083991467955, | |
| "rewards/format_reward": 0.0, | |
| "step": 550 | |
| }, | |
| { | |
| "completion_length": 853.2635561227798, | |
| "epoch": 0.4947474496486796, | |
| "grad_norm": 0.8443715089551579, | |
| "kl": 1.8191993713378907, | |
| "learning_rate": 1.1918701967856892e-05, | |
| "loss": 0.0728, | |
| "reward": 0.3036458405200392, | |
| "reward_std": 0.29679411742836237, | |
| "rewards/accuracy_reward": 0.3036458405200392, | |
| "rewards/format_reward": 0.0, | |
| "step": 560 | |
| }, | |
| { | |
| "completion_length": 858.300532579422, | |
| "epoch": 0.5035822255352632, | |
| "grad_norm": 1.2444227038825535, | |
| "kl": 2.2291900634765627, | |
| "learning_rate": 1.1614666381854107e-05, | |
| "loss": 0.0892, | |
| "reward": 0.26041667321696876, | |
| "reward_std": 0.2805561413988471, | |
| "rewards/accuracy_reward": 0.26041667321696876, | |
| "rewards/format_reward": 0.0, | |
| "step": 570 | |
| }, | |
| { | |
| "completion_length": 960.0770899295807, | |
| "epoch": 0.5124170014218468, | |
| "grad_norm": 0.5125806971801138, | |
| "kl": 1.9308998107910156, | |
| "learning_rate": 1.1309090138334112e-05, | |
| "loss": 0.0772, | |
| "reward": 0.3078125073108822, | |
| "reward_std": 0.30040255654603243, | |
| "rewards/accuracy_reward": 0.3078125073108822, | |
| "rewards/format_reward": 0.0, | |
| "step": 580 | |
| }, | |
| { | |
| "completion_length": 884.6036564350128, | |
| "epoch": 0.5212517773084303, | |
| "grad_norm": 0.6680675746939368, | |
| "kl": 1.9921707153320312, | |
| "learning_rate": 1.100226480733388e-05, | |
| "loss": 0.0797, | |
| "reward": 0.24947917312383652, | |
| "reward_std": 0.28867512941360474, | |
| "rewards/accuracy_reward": 0.24947917312383652, | |
| "rewards/format_reward": 0.0, | |
| "step": 590 | |
| }, | |
| { | |
| "completion_length": 799.9510572910309, | |
| "epoch": 0.5300865531950139, | |
| "grad_norm": 3.062049979351596, | |
| "kl": 2.207135009765625, | |
| "learning_rate": 1.0694483150725458e-05, | |
| "loss": 0.0883, | |
| "reward": 0.29583334028720853, | |
| "reward_std": 0.29498989786952734, | |
| "rewards/accuracy_reward": 0.29583334028720853, | |
| "rewards/format_reward": 0.0, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 0.5300865531950139, | |
| "eval_completion_length": 775.0841884420375, | |
| "eval_kl": 2.8066208964646466, | |
| "eval_loss": 0.11227121949195862, | |
| "eval_reward": 0.28619529352043616, | |
| "eval_reward_std": 0.3207501437928941, | |
| "eval_rewards/accuracy_reward": 0.28619529352043616, | |
| "eval_rewards/format_reward": 0.0, | |
| "eval_runtime": 554.5488, | |
| "eval_samples_per_second": 0.179, | |
| "eval_steps_per_second": 0.06, | |
| "step": 600 | |
| }, | |
| { | |
| "completion_length": 862.4078252792358, | |
| "epoch": 0.5389213290815974, | |
| "grad_norm": 0.4132033561721883, | |
| "kl": 1.8186492919921875, | |
| "learning_rate": 1.038603884287294e-05, | |
| "loss": 0.0728, | |
| "reward": 0.15156250400468707, | |
| "reward_std": 0.17861773632466793, | |
| "rewards/accuracy_reward": 0.15156250400468707, | |
| "rewards/format_reward": 0.0, | |
| "step": 610 | |
| }, | |
| { | |
| "completion_length": 623.4411586642266, | |
| "epoch": 0.5477561049681811, | |
| "grad_norm": 0.5984776310180936, | |
| "kl": 0.5929832458496094, | |
| "learning_rate": 1.0077226190418783e-05, | |
| "loss": 0.0237, | |
| "reward": 0.1755208382382989, | |
| "reward_std": 0.2255274448543787, | |
| "rewards/accuracy_reward": 0.1755208382382989, | |
| "rewards/format_reward": 0.0, | |
| "step": 620 | |
| }, | |
| { | |
| "completion_length": 675.0708496570587, | |
| "epoch": 0.5565908808547646, | |
| "grad_norm": 0.4802015784335563, | |
| "kl": 0.47046356201171874, | |
| "learning_rate": 9.768339851466818e-06, | |
| "loss": 0.0188, | |
| "reward": 0.2515625067520887, | |
| "reward_std": 0.2940877880901098, | |
| "rewards/accuracy_reward": 0.2515625067520887, | |
| "rewards/format_reward": 0.0, | |
| "step": 630 | |
| }, | |
| { | |
| "completion_length": 768.3067860364914, | |
| "epoch": 0.5654256567413481, | |
| "grad_norm": 2.4930383667378413, | |
| "kl": 1.0879661560058593, | |
| "learning_rate": 9.45967455442995e-06, | |
| "loss": 0.0435, | |
| "reward": 0.34322917480021714, | |
| "reward_std": 0.3301721792668104, | |
| "rewards/accuracy_reward": 0.34322917480021714, | |
| "rewards/format_reward": 0.0, | |
| "step": 640 | |
| }, | |
| { | |
| "completion_length": 789.8026188373566, | |
| "epoch": 0.5742604326279317, | |
| "grad_norm": 1.7100954361428433, | |
| "kl": 2.7097198486328127, | |
| "learning_rate": 9.151524816810686e-06, | |
| "loss": 0.1084, | |
| "reward": 0.31666667400859294, | |
| "reward_std": 0.30671732500195503, | |
| "rewards/accuracy_reward": 0.31666667400859294, | |
| "rewards/format_reward": 0.0, | |
| "step": 650 | |
| }, | |
| { | |
| "completion_length": 707.0859542965889, | |
| "epoch": 0.5830952085145152, | |
| "grad_norm": 1.57666179442439, | |
| "kl": 1.7728652954101562, | |
| "learning_rate": 8.844184664182993e-06, | |
| "loss": 0.0709, | |
| "reward": 0.35000000791624186, | |
| "reward_std": 0.33107428904622793, | |
| "rewards/accuracy_reward": 0.35000000791624186, | |
| "rewards/format_reward": 0.0, | |
| "step": 660 | |
| }, | |
| { | |
| "completion_length": 773.0073058724404, | |
| "epoch": 0.5919299844010988, | |
| "grad_norm": 0.9807231122231896, | |
| "kl": 2.1020263671875, | |
| "learning_rate": 8.537947349643493e-06, | |
| "loss": 0.0841, | |
| "reward": 0.29739584047347306, | |
| "reward_std": 0.3112278738990426, | |
| "rewards/accuracy_reward": 0.29739584047347306, | |
| "rewards/format_reward": 0.0, | |
| "step": 670 | |
| }, | |
| { | |
| "completion_length": 777.6401177406311, | |
| "epoch": 0.6007647602876824, | |
| "grad_norm": 0.5403220713078971, | |
| "kl": 1.6934829711914063, | |
| "learning_rate": 8.23310507399973e-06, | |
| "loss": 0.0678, | |
| "reward": 0.2744791732635349, | |
| "reward_std": 0.2805561413988471, | |
| "rewards/accuracy_reward": 0.2744791732635349, | |
| "rewards/format_reward": 0.0, | |
| "step": 680 | |
| }, | |
| { | |
| "completion_length": 767.1880352020264, | |
| "epoch": 0.609599536174266, | |
| "grad_norm": 0.6884159799218071, | |
| "kl": 2.134033203125, | |
| "learning_rate": 7.929948706962508e-06, | |
| "loss": 0.0854, | |
| "reward": 0.28958334047347306, | |
| "reward_std": 0.30671732500195503, | |
| "rewards/accuracy_reward": 0.28958334047347306, | |
| "rewards/format_reward": 0.0, | |
| "step": 690 | |
| }, | |
| { | |
| "completion_length": 716.2224100291729, | |
| "epoch": 0.6184343120608495, | |
| "grad_norm": 0.9853920893844104, | |
| "kl": 1.9078399658203125, | |
| "learning_rate": 7.628767509608304e-06, | |
| "loss": 0.0763, | |
| "reward": 0.31041667447425425, | |
| "reward_std": 0.32566163036972284, | |
| "rewards/accuracy_reward": 0.31041667447425425, | |
| "rewards/format_reward": 0.0, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 0.6184343120608495, | |
| "eval_completion_length": 676.0437842475044, | |
| "eval_kl": 2.118035827020202, | |
| "eval_loss": 0.08472807705402374, | |
| "eval_reward": 0.30976431749083777, | |
| "eval_reward_std": 0.3382456061815975, | |
| "eval_rewards/accuracy_reward": 0.30976431749083777, | |
| "eval_rewards/format_reward": 0.0, | |
| "eval_runtime": 527.3635, | |
| "eval_samples_per_second": 0.188, | |
| "eval_steps_per_second": 0.063, | |
| "step": 700 | |
| }, | |
| { | |
| "completion_length": 693.3250136733055, | |
| "epoch": 0.6272690879474331, | |
| "grad_norm": 0.6986064715250526, | |
| "kl": 2.01304931640625, | |
| "learning_rate": 7.329848858376585e-06, | |
| "loss": 0.0805, | |
| "reward": 0.31510417414829134, | |
| "reward_std": 0.3166405325755477, | |
| "rewards/accuracy_reward": 0.31510417414829134, | |
| "rewards/format_reward": 0.0, | |
| "step": 710 | |
| }, | |
| { | |
| "completion_length": 737.9067856788636, | |
| "epoch": 0.6361038638340166, | |
| "grad_norm": 2.3133745632209393, | |
| "kl": 2.0740203857421875, | |
| "learning_rate": 7.033477970865381e-06, | |
| "loss": 0.083, | |
| "reward": 0.32343750772997737, | |
| "reward_std": 0.3175426423549652, | |
| "rewards/accuracy_reward": 0.32343750772997737, | |
| "rewards/format_reward": 0.0, | |
| "step": 720 | |
| }, | |
| { | |
| "completion_length": 747.067202681303, | |
| "epoch": 0.6449386397206002, | |
| "grad_norm": 0.7455664324085425, | |
| "kl": 2.0231887817382814, | |
| "learning_rate": 6.73993763368675e-06, | |
| "loss": 0.0809, | |
| "reward": 0.3057291737757623, | |
| "reward_std": 0.30040255654603243, | |
| "rewards/accuracy_reward": 0.3057291737757623, | |
| "rewards/format_reward": 0.0, | |
| "step": 730 | |
| }, | |
| { | |
| "completion_length": 730.435950744152, | |
| "epoch": 0.6537734156071838, | |
| "grad_norm": 1.9312009716198788, | |
| "kl": 2.154399108886719, | |
| "learning_rate": 6.449507932641796e-06, | |
| "loss": 0.0862, | |
| "reward": 0.2927083405200392, | |
| "reward_std": 0.3040109956637025, | |
| "rewards/accuracy_reward": 0.2927083405200392, | |
| "rewards/format_reward": 0.0, | |
| "step": 740 | |
| }, | |
| { | |
| "completion_length": 707.3302231192589, | |
| "epoch": 0.6626081914937674, | |
| "grad_norm": 0.6668502719545971, | |
| "kl": 1.8355056762695312, | |
| "learning_rate": 6.16246598547271e-06, | |
| "loss": 0.0734, | |
| "reward": 0.2916666741017252, | |
| "reward_std": 0.32115108147263527, | |
| "rewards/accuracy_reward": 0.2916666741017252, | |
| "rewards/format_reward": 0.0, | |
| "step": 750 | |
| }, | |
| { | |
| "completion_length": 740.6020975530148, | |
| "epoch": 0.6714429673803509, | |
| "grad_norm": 0.7705413022710031, | |
| "kl": 2.15982666015625, | |
| "learning_rate": 5.8790856774468385e-06, | |
| "loss": 0.0864, | |
| "reward": 0.3046875072643161, | |
| "reward_std": 0.30491310544312, | |
| "rewards/accuracy_reward": 0.3046875072643161, | |
| "rewards/format_reward": 0.0, | |
| "step": 760 | |
| }, | |
| { | |
| "completion_length": 739.286992508173, | |
| "epoch": 0.6802777432669345, | |
| "grad_norm": 0.5391118755792965, | |
| "kl": 2.0162887573242188, | |
| "learning_rate": 5.599637400025036e-06, | |
| "loss": 0.0807, | |
| "reward": 0.31458334061317145, | |
| "reward_std": 0.3112278738990426, | |
| "rewards/accuracy_reward": 0.31458334061317145, | |
| "rewards/format_reward": 0.0, | |
| "step": 770 | |
| }, | |
| { | |
| "completion_length": 748.7869940280914, | |
| "epoch": 0.689112519153518, | |
| "grad_norm": 0.7122982868192613, | |
| "kl": 2.226336669921875, | |
| "learning_rate": 5.324387792863719e-06, | |
| "loss": 0.0891, | |
| "reward": 0.3234375074971467, | |
| "reward_std": 0.3058152152225375, | |
| "rewards/accuracy_reward": 0.3234375074971467, | |
| "rewards/format_reward": 0.0, | |
| "step": 780 | |
| }, | |
| { | |
| "completion_length": 737.4474101424217, | |
| "epoch": 0.6979472950401016, | |
| "grad_norm": 0.6757655195430946, | |
| "kl": 2.05509033203125, | |
| "learning_rate": 5.053599489396732e-06, | |
| "loss": 0.0822, | |
| "reward": 0.31250000759027896, | |
| "reward_std": 0.31393420323729515, | |
| "rewards/accuracy_reward": 0.31250000759027896, | |
| "rewards/format_reward": 0.0, | |
| "step": 790 | |
| }, | |
| { | |
| "completion_length": 711.7630351662635, | |
| "epoch": 0.7067820709266852, | |
| "grad_norm": 0.5761504098242444, | |
| "kl": 1.904425048828125, | |
| "learning_rate": 4.787530866239826e-06, | |
| "loss": 0.0762, | |
| "reward": 0.30729167386889455, | |
| "reward_std": 0.303108885884285, | |
| "rewards/accuracy_reward": 0.30729167386889455, | |
| "rewards/format_reward": 0.0, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 0.7067820709266852, | |
| "eval_completion_length": 720.3198793199327, | |
| "eval_kl": 1.9343237058080809, | |
| "eval_loss": 0.07739535719156265, | |
| "eval_reward": 0.32996633739182446, | |
| "eval_reward_std": 0.3149183229966597, | |
| "eval_rewards/accuracy_reward": 0.32996633739182446, | |
| "eval_rewards/format_reward": 0.0, | |
| "eval_runtime": 534.7287, | |
| "eval_samples_per_second": 0.185, | |
| "eval_steps_per_second": 0.062, | |
| "step": 800 | |
| }, | |
| { | |
| "completion_length": 715.5786605343222, | |
| "epoch": 0.7156168468132688, | |
| "grad_norm": 0.7254446582873234, | |
| "kl": 2.05001220703125, | |
| "learning_rate": 4.52643579665683e-06, | |
| "loss": 0.082, | |
| "reward": 0.3187500074040145, | |
| "reward_std": 0.31303209345787764, | |
| "rewards/accuracy_reward": 0.3187500074040145, | |
| "rewards/format_reward": 0.0, | |
| "step": 810 | |
| }, | |
| { | |
| "completion_length": 729.5484510660172, | |
| "epoch": 0.7244516226998523, | |
| "grad_norm": 0.8829844749096067, | |
| "kl": 2.499810791015625, | |
| "learning_rate": 4.270563408322772e-06, | |
| "loss": 0.1, | |
| "reward": 0.3192708410322666, | |
| "reward_std": 0.3184447521343827, | |
| "rewards/accuracy_reward": 0.3192708410322666, | |
| "rewards/format_reward": 0.0, | |
| "step": 820 | |
| }, | |
| { | |
| "completion_length": 717.5286597132683, | |
| "epoch": 0.7332863985864359, | |
| "grad_norm": 1.7173630228550354, | |
| "kl": 2.062689208984375, | |
| "learning_rate": 4.020157845615075e-06, | |
| "loss": 0.0825, | |
| "reward": 0.33333334093913436, | |
| "reward_std": 0.3094236543402076, | |
| "rewards/accuracy_reward": 0.33333334093913436, | |
| "rewards/format_reward": 0.0, | |
| "step": 830 | |
| }, | |
| { | |
| "completion_length": 731.0494927078486, | |
| "epoch": 0.7421211744730194, | |
| "grad_norm": 0.829005223012386, | |
| "kl": 2.804341125488281, | |
| "learning_rate": 3.7754580366596116e-06, | |
| "loss": 0.1122, | |
| "reward": 0.29739584065973756, | |
| "reward_std": 0.3220531912520528, | |
| "rewards/accuracy_reward": 0.29739584065973756, | |
| "rewards/format_reward": 0.0, | |
| "step": 840 | |
| }, | |
| { | |
| "completion_length": 706.3677220225334, | |
| "epoch": 0.7509559503596029, | |
| "grad_norm": 1.6348693110104273, | |
| "kl": 1.9862686157226563, | |
| "learning_rate": 3.5366974653539653e-06, | |
| "loss": 0.0795, | |
| "reward": 0.30312500717118385, | |
| "reward_std": 0.30761943478137255, | |
| "rewards/accuracy_reward": 0.30312500717118385, | |
| "rewards/format_reward": 0.0, | |
| "step": 850 | |
| }, | |
| { | |
| "completion_length": 698.5177225530148, | |
| "epoch": 0.7597907262461866, | |
| "grad_norm": 0.9159549909583196, | |
| "kl": 2.294660949707031, | |
| "learning_rate": 3.304103948585341e-06, | |
| "loss": 0.0918, | |
| "reward": 0.3046875072643161, | |
| "reward_std": 0.30130466632544994, | |
| "rewards/accuracy_reward": 0.3046875072643161, | |
| "rewards/format_reward": 0.0, | |
| "step": 860 | |
| }, | |
| { | |
| "completion_length": 672.9390772372484, | |
| "epoch": 0.7686255021327701, | |
| "grad_norm": 2.1045299190913833, | |
| "kl": 2.3146575927734374, | |
| "learning_rate": 3.0778994188557722e-06, | |
| "loss": 0.0926, | |
| "reward": 0.315625007962808, | |
| "reward_std": 0.32475952059030533, | |
| "rewards/accuracy_reward": 0.315625007962808, | |
| "rewards/format_reward": 0.0, | |
| "step": 870 | |
| }, | |
| { | |
| "completion_length": 674.6109511375428, | |
| "epoch": 0.7774602780193537, | |
| "grad_norm": 0.7983792148121029, | |
| "kl": 2.302967834472656, | |
| "learning_rate": 2.8582997125219604e-06, | |
| "loss": 0.0921, | |
| "reward": 0.30260417428798975, | |
| "reward_std": 0.32566163036972284, | |
| "rewards/accuracy_reward": 0.30260417428798975, | |
| "rewards/format_reward": 0.0, | |
| "step": 880 | |
| }, | |
| { | |
| "completion_length": 656.9838689267635, | |
| "epoch": 0.7862950539059372, | |
| "grad_norm": 0.669988664196779, | |
| "kl": 2.0931640625, | |
| "learning_rate": 2.645514363851874e-06, | |
| "loss": 0.0837, | |
| "reward": 0.3010416736826301, | |
| "reward_std": 0.2931856783106923, | |
| "rewards/accuracy_reward": 0.3010416736826301, | |
| "rewards/format_reward": 0.0, | |
| "step": 890 | |
| }, | |
| { | |
| "completion_length": 652.4843894541264, | |
| "epoch": 0.7951298297925208, | |
| "grad_norm": 1.3222110289859523, | |
| "kl": 1.9164962768554688, | |
| "learning_rate": 2.4397464050945753e-06, | |
| "loss": 0.0767, | |
| "reward": 0.2838541739154607, | |
| "reward_std": 0.31393420323729515, | |
| "rewards/accuracy_reward": 0.2838541739154607, | |
| "rewards/format_reward": 0.0, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 0.7951298297925208, | |
| "eval_completion_length": 622.8788027522539, | |
| "eval_kl": 2.0288036616161618, | |
| "eval_loss": 0.08118358999490738, | |
| "eval_reward": 0.3131313209581857, | |
| "eval_reward_std": 0.3265819645891286, | |
| "eval_rewards/accuracy_reward": 0.3131313209581857, | |
| "eval_rewards/format_reward": 0.0, | |
| "eval_runtime": 506.0213, | |
| "eval_samples_per_second": 0.196, | |
| "eval_steps_per_second": 0.065, | |
| "step": 900 | |
| }, | |
| { | |
| "completion_length": 651.8578262448311, | |
| "epoch": 0.8039646056791043, | |
| "grad_norm": 1.0220083602716086, | |
| "kl": 2.2107833862304687, | |
| "learning_rate": 2.24119217275401e-06, | |
| "loss": 0.0884, | |
| "reward": 0.2739583405200392, | |
| "reward_std": 0.30671732500195503, | |
| "rewards/accuracy_reward": 0.2739583405200392, | |
| "rewards/format_reward": 0.0, | |
| "step": 910 | |
| }, | |
| { | |
| "completion_length": 630.3770972907544, | |
| "epoch": 0.812799381565688, | |
| "grad_norm": 1.140106665270527, | |
| "kl": 2.0034912109375, | |
| "learning_rate": 2.0500411202516814e-06, | |
| "loss": 0.0802, | |
| "reward": 0.29166667340323327, | |
| "reward_std": 0.2850666902959347, | |
| "rewards/accuracy_reward": 0.29166667340323327, | |
| "rewards/format_reward": 0.0, | |
| "step": 920 | |
| }, | |
| { | |
| "completion_length": 633.9614718899131, | |
| "epoch": 0.8216341574522715, | |
| "grad_norm": 1.1778599052603038, | |
| "kl": 1.8928955078125, | |
| "learning_rate": 1.8664756371568981e-06, | |
| "loss": 0.0757, | |
| "reward": 0.2963541735429317, | |
| "reward_std": 0.29498989786952734, | |
| "rewards/accuracy_reward": 0.2963541735429317, | |
| "rewards/format_reward": 0.0, | |
| "step": 930 | |
| }, | |
| { | |
| "completion_length": 658.522930726409, | |
| "epoch": 0.8304689333388551, | |
| "grad_norm": 0.6577212329055425, | |
| "kl": 2.294940185546875, | |
| "learning_rate": 1.6906708751570955e-06, | |
| "loss": 0.0918, | |
| "reward": 0.31093750675208864, | |
| "reward_std": 0.30220677610486746, | |
| "rewards/accuracy_reward": 0.31093750675208864, | |
| "rewards/format_reward": 0.0, | |
| "step": 940 | |
| }, | |
| { | |
| "completion_length": 677.2802217006683, | |
| "epoch": 0.8393037092254386, | |
| "grad_norm": 0.7422568973111673, | |
| "kl": 2.5144363403320313, | |
| "learning_rate": 1.5227945809342992e-06, | |
| "loss": 0.1006, | |
| "reward": 0.3151041742414236, | |
| "reward_std": 0.3166405325755477, | |
| "rewards/accuracy_reward": 0.3151041742414236, | |
| "rewards/format_reward": 0.0, | |
| "step": 950 | |
| }, | |
| { | |
| "completion_length": 652.5901178598403, | |
| "epoch": 0.8481384851120222, | |
| "grad_norm": 0.810421906439625, | |
| "kl": 2.262736511230469, | |
| "learning_rate": 1.363006936107183e-06, | |
| "loss": 0.0905, | |
| "reward": 0.34635417480021713, | |
| "reward_std": 0.336486947722733, | |
| "rewards/accuracy_reward": 0.34635417480021713, | |
| "rewards/format_reward": 0.0, | |
| "step": 960 | |
| }, | |
| { | |
| "completion_length": 670.2057422459126, | |
| "epoch": 0.8569732609986057, | |
| "grad_norm": 0.7920465762778155, | |
| "kl": 2.5366302490234376, | |
| "learning_rate": 1.2114604043914225e-06, | |
| "loss": 0.1015, | |
| "reward": 0.32031250768341124, | |
| "reward_std": 0.3175426423549652, | |
| "rewards/accuracy_reward": 0.32031250768341124, | |
| "rewards/format_reward": 0.0, | |
| "step": 970 | |
| }, | |
| { | |
| "completion_length": 682.0869930744171, | |
| "epoch": 0.8658080368851894, | |
| "grad_norm": 0.6071556277616226, | |
| "kl": 2.4378128051757812, | |
| "learning_rate": 1.068299586124224e-06, | |
| "loss": 0.0975, | |
| "reward": 0.33072917396202683, | |
| "reward_std": 0.2995004467666149, | |
| "rewards/accuracy_reward": 0.33072917396202683, | |
| "rewards/format_reward": 0.0, | |
| "step": 980 | |
| }, | |
| { | |
| "completion_length": 663.8849090188742, | |
| "epoch": 0.8746428127717729, | |
| "grad_norm": 0.7004418244247146, | |
| "kl": 2.3257492065429686, | |
| "learning_rate": 9.336610802918044e-07, | |
| "loss": 0.093, | |
| "reward": 0.32447917433455586, | |
| "reward_std": 0.3094236543402076, | |
| "rewards/accuracy_reward": 0.32447917433455586, | |
| "rewards/format_reward": 0.0, | |
| "step": 990 | |
| }, | |
| { | |
| "completion_length": 678.5744934767484, | |
| "epoch": 0.8834775886583565, | |
| "grad_norm": 1.5130552859053217, | |
| "kl": 2.3184799194335937, | |
| "learning_rate": 8.076733541914617e-07, | |
| "loss": 0.0927, | |
| "reward": 0.33281250740401447, | |
| "reward_std": 0.30491310544312, | |
| "rewards/accuracy_reward": 0.33281250740401447, | |
| "rewards/format_reward": 0.0, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.8834775886583565, | |
| "eval_completion_length": 664.8687005476518, | |
| "eval_kl": 2.323409880050505, | |
| "eval_loss": 0.0929863303899765, | |
| "eval_reward": 0.3535353619642932, | |
| "eval_reward_std": 0.3440774269778319, | |
| "eval_rewards/accuracy_reward": 0.3535353619642932, | |
| "eval_rewards/format_reward": 0.0, | |
| "eval_runtime": 516.2096, | |
| "eval_samples_per_second": 0.192, | |
| "eval_steps_per_second": 0.064, | |
| "step": 1000 | |
| }, | |
| { | |
| "completion_length": 702.9177223086357, | |
| "epoch": 0.89231236454494, | |
| "grad_norm": 1.529025375905959, | |
| "kl": 2.152081298828125, | |
| "learning_rate": 6.90456620852632e-07, | |
| "loss": 0.0861, | |
| "reward": 0.3427083408460021, | |
| "reward_std": 0.31393420323729515, | |
| "rewards/accuracy_reward": 0.3427083408460021, | |
| "rewards/format_reward": 0.0, | |
| "step": 1010 | |
| }, | |
| { | |
| "completion_length": 693.2552229389548, | |
| "epoch": 0.9011471404315236, | |
| "grad_norm": 0.6357248703500725, | |
| "kl": 2.1104583740234375, | |
| "learning_rate": 5.821227243338712e-07, | |
| "loss": 0.0844, | |
| "reward": 0.32656250763684513, | |
| "reward_std": 0.3157384227961302, | |
| "rewards/accuracy_reward": 0.32656250763684513, | |
| "rewards/format_reward": 0.0, | |
| "step": 1020 | |
| }, | |
| { | |
| "completion_length": 669.4468886733055, | |
| "epoch": 0.9099819163181071, | |
| "grad_norm": 2.2284285054958324, | |
| "kl": 2.1510406494140626, | |
| "learning_rate": 4.827750330052117e-07, | |
| "loss": 0.0861, | |
| "reward": 0.3536458413582295, | |
| "reward_std": 0.32566163036972284, | |
| "rewards/accuracy_reward": 0.3536458413582295, | |
| "rewards/format_reward": 0.0, | |
| "step": 1030 | |
| }, | |
| { | |
| "completion_length": 683.547409978509, | |
| "epoch": 0.9188166922046908, | |
| "grad_norm": 0.6632003611968932, | |
| "kl": 2.2973922729492187, | |
| "learning_rate": 3.925083409177266e-07, | |
| "loss": 0.0919, | |
| "reward": 0.34010417442768814, | |
| "reward_std": 0.31934686191380024, | |
| "rewards/accuracy_reward": 0.34010417442768814, | |
| "rewards/format_reward": 0.0, | |
| "step": 1040 | |
| }, | |
| { | |
| "completion_length": 679.4802215665579, | |
| "epoch": 0.9276514680912743, | |
| "grad_norm": 0.8599780179720123, | |
| "kl": 2.3347732543945314, | |
| "learning_rate": 3.114087773543939e-07, | |
| "loss": 0.0934, | |
| "reward": 0.3500000080559403, | |
| "reward_std": 0.33107428904622793, | |
| "rewards/accuracy_reward": 0.3500000080559403, | |
| "rewards/format_reward": 0.0, | |
| "step": 1050 | |
| }, | |
| { | |
| "completion_length": 703.7713681519032, | |
| "epoch": 0.9364862439778578, | |
| "grad_norm": 1.174851381620777, | |
| "kl": 2.408406066894531, | |
| "learning_rate": 2.395537246485846e-07, | |
| "loss": 0.0963, | |
| "reward": 0.329166674753651, | |
| "reward_std": 0.34370382595807314, | |
| "rewards/accuracy_reward": 0.329166674753651, | |
| "rewards/format_reward": 0.0, | |
| "step": 1060 | |
| }, | |
| { | |
| "completion_length": 678.3260560303927, | |
| "epoch": 0.9453210198644414, | |
| "grad_norm": 1.54743742951263, | |
| "kl": 2.394914245605469, | |
| "learning_rate": 1.7701174434858193e-07, | |
| "loss": 0.0958, | |
| "reward": 0.33802084121853115, | |
| "reward_std": 0.32024897169321775, | |
| "rewards/accuracy_reward": 0.33802084121853115, | |
| "rewards/format_reward": 0.0, | |
| "step": 1070 | |
| }, | |
| { | |
| "completion_length": 664.8302224695683, | |
| "epoch": 0.9541557957510249, | |
| "grad_norm": 1.5666079028942566, | |
| "kl": 2.3258026123046873, | |
| "learning_rate": 1.2384251179857642e-07, | |
| "loss": 0.093, | |
| "reward": 0.3578125084284693, | |
| "reward_std": 0.35001859441399574, | |
| "rewards/accuracy_reward": 0.3578125084284693, | |
| "rewards/format_reward": 0.0, | |
| "step": 1080 | |
| }, | |
| { | |
| "completion_length": 682.4229293212295, | |
| "epoch": 0.9629905716376085, | |
| "grad_norm": 0.6943507484399737, | |
| "kl": 2.487345886230469, | |
| "learning_rate": 8.009675919856574e-08, | |
| "loss": 0.0995, | |
| "reward": 0.32239584033377466, | |
| "reward_std": 0.29679411742836237, | |
| "rewards/accuracy_reward": 0.32239584033377466, | |
| "rewards/format_reward": 0.0, | |
| "step": 1090 | |
| }, | |
| { | |
| "completion_length": 689.0625125810504, | |
| "epoch": 0.9718253475241921, | |
| "grad_norm": 0.6276023210613672, | |
| "kl": 2.3767745971679686, | |
| "learning_rate": 4.581622719748269e-08, | |
| "loss": 0.0951, | |
| "reward": 0.3395833406597376, | |
| "reward_std": 0.2985983369871974, | |
| "rewards/accuracy_reward": 0.3395833406597376, | |
| "rewards/format_reward": 0.0, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 0.9718253475241921, | |
| "eval_completion_length": 665.5656705528799, | |
| "eval_kl": 2.368923611111111, | |
| "eval_loss": 0.09475857019424438, | |
| "eval_reward": 0.3804713891010092, | |
| "eval_reward_std": 0.3382456061815975, | |
| "eval_rewards/accuracy_reward": 0.3804713891010092, | |
| "eval_rewards/format_reward": 0.0, | |
| "eval_runtime": 510.5875, | |
| "eval_samples_per_second": 0.194, | |
| "eval_steps_per_second": 0.065, | |
| "step": 1100 | |
| }, | |
| { | |
| "completion_length": 687.5755343332887, | |
| "epoch": 0.9806601234107757, | |
| "grad_norm": 0.5833877177508657, | |
| "kl": 2.3449798583984376, | |
| "learning_rate": 2.1033625065747244e-08, | |
| "loss": 0.0938, | |
| "reward": 0.3359375076368451, | |
| "reward_std": 0.31393420323729515, | |
| "rewards/accuracy_reward": 0.3359375076368451, | |
| "rewards/format_reward": 0.0, | |
| "step": 1110 | |
| }, | |
| { | |
| "completion_length": 686.5161595344543, | |
| "epoch": 0.9894948992973592, | |
| "grad_norm": 0.8177663703595698, | |
| "kl": 2.952043151855469, | |
| "learning_rate": 5.772599485236452e-09, | |
| "loss": 0.118, | |
| "reward": 0.3333333409857005, | |
| "reward_std": 0.3238574108108878, | |
| "rewards/accuracy_reward": 0.3333333409857005, | |
| "rewards/format_reward": 0.0, | |
| "step": 1120 | |
| }, | |
| { | |
| "completion_length": 679.1244929388165, | |
| "epoch": 0.9983296751839428, | |
| "grad_norm": 1.027763278137717, | |
| "kl": 2.3051712036132814, | |
| "learning_rate": 4.7711986460585725e-11, | |
| "loss": 0.0922, | |
| "reward": 0.3552083419635892, | |
| "reward_std": 0.34370382595807314, | |
| "rewards/accuracy_reward": 0.3552083419635892, | |
| "rewards/format_reward": 0.0, | |
| "step": 1130 | |
| }, | |
| { | |
| "completion_length": 695.1406378149986, | |
| "epoch": 0.9992131527726011, | |
| "kl": 2.40301513671875, | |
| "reward": 0.3125000074505806, | |
| "reward_std": 0.32475952059030533, | |
| "rewards/accuracy_reward": 0.3125000074505806, | |
| "rewards/format_reward": 0.0, | |
| "step": 1131, | |
| "total_flos": 0.0, | |
| "train_loss": 0.06294877494734573, | |
| "train_runtime": 428958.1793, | |
| "train_samples_per_second": 0.169, | |
| "train_steps_per_second": 0.003 | |
| } | |
| ], | |
| "logging_steps": 10, | |
| "max_steps": 1131, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 1, | |
| "save_steps": 200, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 0.0, | |
| "train_batch_size": 1, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |