Qwen2.5-1.5B-Open-R1-GRPO / trainer_state.json
May811's picture
Model save
9afce01 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.9992131527726011,
"eval_steps": 100,
"global_step": 1131,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"completion_length": 894.0125112533569,
"epoch": 0.008834775886583565,
"grad_norm": 0.10339409232367645,
"kl": 6.553309503942728e-05,
"learning_rate": 1.7543859649122807e-06,
"loss": 0.0,
"reward": 0.20572917186655104,
"reward_std": 0.22462533507496119,
"rewards/accuracy_reward": 0.20572917186655104,
"rewards/format_reward": 0.0,
"step": 10
},
{
"completion_length": 871.2338674545288,
"epoch": 0.01766955177316713,
"grad_norm": 0.1191837579508088,
"kl": 0.0008328911615535616,
"learning_rate": 3.5087719298245615e-06,
"loss": 0.0,
"reward": 0.250000006519258,
"reward_std": 0.275143482722342,
"rewards/accuracy_reward": 0.250000006519258,
"rewards/format_reward": 0.0,
"step": 20
},
{
"completion_length": 805.9927246570587,
"epoch": 0.026504327659750693,
"grad_norm": 0.11427842333813355,
"kl": 0.006137973070144654,
"learning_rate": 5.263157894736842e-06,
"loss": 0.0002,
"reward": 0.3583333409391344,
"reward_std": 0.3040109956637025,
"rewards/accuracy_reward": 0.3583333409391344,
"rewards/format_reward": 0.0,
"step": 30
},
{
"completion_length": 815.7588698387146,
"epoch": 0.03533910354633426,
"grad_norm": 0.1094331129364983,
"kl": 0.011934018135070801,
"learning_rate": 7.017543859649123e-06,
"loss": 0.0005,
"reward": 0.3635416746605188,
"reward_std": 0.31483631301671267,
"rewards/accuracy_reward": 0.3635416746605188,
"rewards/format_reward": 0.0,
"step": 40
},
{
"completion_length": 811.1395996570587,
"epoch": 0.044173879432917826,
"grad_norm": 0.11064742736511399,
"kl": 0.027907085418701173,
"learning_rate": 8.771929824561405e-06,
"loss": 0.0011,
"reward": 0.38958334126509725,
"reward_std": 0.31934686191380024,
"rewards/accuracy_reward": 0.38958334126509725,
"rewards/format_reward": 0.0,
"step": 50
},
{
"completion_length": 729.52032995224,
"epoch": 0.053008655319501385,
"grad_norm": 0.5600590072843962,
"kl": 0.11420574188232421,
"learning_rate": 1.0526315789473684e-05,
"loss": 0.0046,
"reward": 0.47083334238268437,
"reward_std": 0.34550804551690817,
"rewards/accuracy_reward": 0.47083334238268437,
"rewards/format_reward": 0.0,
"step": 60
},
{
"completion_length": 748.378140592575,
"epoch": 0.06184343120608495,
"grad_norm": 0.49897174304455477,
"kl": 0.4907238006591797,
"learning_rate": 1.2280701754385966e-05,
"loss": 0.0196,
"reward": 0.35729167410172524,
"reward_std": 0.3094236543402076,
"rewards/accuracy_reward": 0.35729167410172524,
"rewards/format_reward": 0.0,
"step": 70
},
{
"completion_length": 731.9463713169098,
"epoch": 0.07067820709266852,
"grad_norm": 2.1010371186603622,
"kl": 0.5549324035644532,
"learning_rate": 1.4035087719298246e-05,
"loss": 0.0222,
"reward": 0.39947917480021716,
"reward_std": 0.3283679597079754,
"rewards/accuracy_reward": 0.39947917480021716,
"rewards/format_reward": 0.0,
"step": 80
},
{
"completion_length": 636.9864743709564,
"epoch": 0.07951298297925208,
"grad_norm": 0.3320296791157868,
"kl": 0.8359504699707031,
"learning_rate": 1.578947368421053e-05,
"loss": 0.0334,
"reward": 0.35625000754371283,
"reward_std": 0.30491310544312,
"rewards/accuracy_reward": 0.35625000754371283,
"rewards/format_reward": 0.0,
"step": 90
},
{
"completion_length": 752.3406419754028,
"epoch": 0.08834775886583565,
"grad_norm": 13.222864026304222,
"kl": 1.2244148254394531,
"learning_rate": 1.754385964912281e-05,
"loss": 0.049,
"reward": 0.3458333408460021,
"reward_std": 0.2922835685312748,
"rewards/accuracy_reward": 0.3458333408460021,
"rewards/format_reward": 0.0,
"step": 100
},
{
"epoch": 0.08834775886583565,
"eval_completion_length": 806.9528756189828,
"eval_kl": 0.7132556029040404,
"eval_loss": 0.028531787917017937,
"eval_reward": 0.3063973137224563,
"eval_reward_std": 0.3090865022004253,
"eval_rewards/accuracy_reward": 0.3063973137224563,
"eval_rewards/format_reward": 0.0,
"eval_runtime": 538.1844,
"eval_samples_per_second": 0.184,
"eval_steps_per_second": 0.061,
"step": 100
},
{
"completion_length": 788.5630361557007,
"epoch": 0.09718253475241921,
"grad_norm": 1.9841707275878488,
"kl": 1.9315879821777344,
"learning_rate": 1.929824561403509e-05,
"loss": 0.0773,
"reward": 0.3375000071246177,
"reward_std": 0.28867512941360474,
"rewards/accuracy_reward": 0.3375000071246177,
"rewards/format_reward": 0.0,
"step": 110
},
{
"completion_length": 782.2432436466217,
"epoch": 0.10601731063900277,
"grad_norm": 1.6220755341742679,
"kl": 1.2291488647460938,
"learning_rate": 1.9998282416292057e-05,
"loss": 0.0492,
"reward": 0.29166667312383654,
"reward_std": 0.2625139458104968,
"rewards/accuracy_reward": 0.29166667312383654,
"rewards/format_reward": 0.0,
"step": 120
},
{
"completion_length": 855.3958479881287,
"epoch": 0.11485208652558634,
"grad_norm": 1.2506380617356658,
"kl": 0.8863227844238282,
"learning_rate": 1.9987788208027496e-05,
"loss": 0.0355,
"reward": 0.3015625067986548,
"reward_std": 0.2796540316194296,
"rewards/accuracy_reward": 0.3015625067986548,
"rewards/format_reward": 0.0,
"step": 130
},
{
"completion_length": 923.2276175498962,
"epoch": 0.1236868624121699,
"grad_norm": 0.7947305780961375,
"kl": 0.8832084655761718,
"learning_rate": 1.9967764005730785e-05,
"loss": 0.0353,
"reward": 0.17343750461004676,
"reward_std": 0.2128979079425335,
"rewards/accuracy_reward": 0.17343750461004676,
"rewards/format_reward": 0.0,
"step": 140
},
{
"completion_length": 867.6395987987519,
"epoch": 0.13252163829875346,
"grad_norm": 0.497580876778096,
"kl": 0.5946540832519531,
"learning_rate": 1.993822891578708e-05,
"loss": 0.0238,
"reward": 0.18281250479631125,
"reward_std": 0.2192126763984561,
"rewards/accuracy_reward": 0.18281250479631125,
"rewards/format_reward": 0.0,
"step": 150
},
{
"completion_length": 718.904184293747,
"epoch": 0.14135641418533704,
"grad_norm": 1.857046714456291,
"kl": 0.6682144165039062,
"learning_rate": 1.9899211119533938e-05,
"loss": 0.0267,
"reward": 0.3848958414513618,
"reward_std": 0.33107428904622793,
"rewards/accuracy_reward": 0.3848958414513618,
"rewards/format_reward": 0.0,
"step": 160
},
{
"completion_length": 918.8239655733108,
"epoch": 0.1501911900719206,
"grad_norm": 0.20248662305659806,
"kl": 0.5040359497070312,
"learning_rate": 1.985074784637167e-05,
"loss": 0.0202,
"reward": 0.09479166907258332,
"reward_std": 0.09923207573592663,
"rewards/accuracy_reward": 0.09479166907258332,
"rewards/format_reward": 0.0,
"step": 170
},
{
"completion_length": 827.5974071979523,
"epoch": 0.15902596595850416,
"grad_norm": 0.19897870444761726,
"kl": 0.17742691040039063,
"learning_rate": 1.9792885338240375e-05,
"loss": 0.0071,
"reward": 0.17291667088866233,
"reward_std": 0.17771562654525042,
"rewards/accuracy_reward": 0.17291667088866233,
"rewards/format_reward": 0.0,
"step": 180
},
{
"completion_length": 696.4333508253097,
"epoch": 0.16786074184508773,
"grad_norm": 0.4541804472147627,
"kl": 0.3155979156494141,
"learning_rate": 1.9725678805497507e-05,
"loss": 0.0126,
"reward": 0.42395834233611823,
"reward_std": 0.3536270335316658,
"rewards/accuracy_reward": 0.42395834233611823,
"rewards/format_reward": 0.0,
"step": 190
},
{
"completion_length": 709.8849118709564,
"epoch": 0.1766955177316713,
"grad_norm": 0.5097645308657522,
"kl": 1.0620399475097657,
"learning_rate": 1.964919237423812e-05,
"loss": 0.0425,
"reward": 0.2687500062398612,
"reward_std": 0.2625139458104968,
"rewards/accuracy_reward": 0.2687500062398612,
"rewards/format_reward": 0.0,
"step": 200
},
{
"epoch": 0.1766955177316713,
"eval_completion_length": 699.2357113000119,
"eval_kl": 0.9798473011363636,
"eval_loss": 0.039183806627988815,
"eval_reward": 0.3872053963367385,
"eval_reward_std": 0.34990924777406635,
"eval_rewards/accuracy_reward": 0.3872053963367385,
"eval_rewards/format_reward": 0.0,
"eval_runtime": 513.8181,
"eval_samples_per_second": 0.193,
"eval_steps_per_second": 0.064,
"step": 200
},
{
"completion_length": 786.7765783786774,
"epoch": 0.18553029361825485,
"grad_norm": 0.15936983650030578,
"kl": 1.27841796875,
"learning_rate": 1.9563499025108e-05,
"loss": 0.0511,
"reward": 0.25937500610016284,
"reward_std": 0.25980761647224426,
"rewards/accuracy_reward": 0.25937500610016284,
"rewards/format_reward": 0.0,
"step": 210
},
{
"completion_length": 909.2531366825103,
"epoch": 0.19436506950483842,
"grad_norm": 0.15108105876463188,
"kl": 0.6172576904296875,
"learning_rate": 1.9468680523668136e-05,
"loss": 0.0247,
"reward": 0.08593750246800483,
"reward_std": 0.11998060066252947,
"rewards/accuracy_reward": 0.08593750246800483,
"rewards/format_reward": 0.0,
"step": 220
},
{
"completion_length": 847.0453273773194,
"epoch": 0.203199845391422,
"grad_norm": 0.4781092711117684,
"kl": 0.719610595703125,
"learning_rate": 1.936482734237689e-05,
"loss": 0.0288,
"reward": 0.1854166716337204,
"reward_std": 0.2201147861778736,
"rewards/accuracy_reward": 0.1854166716337204,
"rewards/format_reward": 0.0,
"step": 230
},
{
"completion_length": 844.9718894481659,
"epoch": 0.21203462127800554,
"grad_norm": 0.6125806510450937,
"kl": 0.6312515258789062,
"learning_rate": 1.9252038574264403e-05,
"loss": 0.0253,
"reward": 0.1963541720993817,
"reward_std": 0.25168862845748663,
"rewards/accuracy_reward": 0.1963541720993817,
"rewards/format_reward": 0.0,
"step": 240
},
{
"completion_length": 716.1927267074585,
"epoch": 0.22086939716458912,
"grad_norm": 0.42158257729983767,
"kl": 1.013921356201172,
"learning_rate": 1.913042183838153e-05,
"loss": 0.0406,
"reward": 0.3276041746605188,
"reward_std": 0.3418996063992381,
"rewards/accuracy_reward": 0.3276041746605188,
"rewards/format_reward": 0.0,
"step": 250
},
{
"completion_length": 772.1078284740448,
"epoch": 0.2297041730511727,
"grad_norm": 0.49721786332420803,
"kl": 1.1279647827148438,
"learning_rate": 1.9000093177113524e-05,
"loss": 0.0451,
"reward": 0.25833333977498113,
"reward_std": 0.27604559250175953,
"rewards/accuracy_reward": 0.25833333977498113,
"rewards/format_reward": 0.0,
"step": 260
},
{
"completion_length": 798.2672049999237,
"epoch": 0.23853894893775623,
"grad_norm": 0.6009045781426623,
"kl": 0.8998489379882812,
"learning_rate": 1.8861176945456542e-05,
"loss": 0.036,
"reward": 0.26093750656582415,
"reward_std": 0.2931856783106923,
"rewards/accuracy_reward": 0.26093750656582415,
"rewards/format_reward": 0.0,
"step": 270
},
{
"completion_length": 739.2422043561935,
"epoch": 0.2473737248243398,
"grad_norm": 0.9064888423502573,
"kl": 1.204058837890625,
"learning_rate": 1.8713805692362458e-05,
"loss": 0.0482,
"reward": 0.31979167396202685,
"reward_std": 0.30761943478137255,
"rewards/accuracy_reward": 0.31979167396202685,
"rewards/format_reward": 0.0,
"step": 280
},
{
"completion_length": 823.738035774231,
"epoch": 0.2562085007109234,
"grad_norm": 0.4395412639205522,
"kl": 0.8689178466796875,
"learning_rate": 1.8558120034265396e-05,
"loss": 0.0348,
"reward": 0.24218750628642738,
"reward_std": 0.275143482722342,
"rewards/accuracy_reward": 0.24218750628642738,
"rewards/format_reward": 0.0,
"step": 290
},
{
"completion_length": 866.1130366325378,
"epoch": 0.2650432765975069,
"grad_norm": 0.4910227951322757,
"kl": 0.7059478759765625,
"learning_rate": 1.8394268520910467e-05,
"loss": 0.0282,
"reward": 0.236458339728415,
"reward_std": 0.27153504360467196,
"rewards/accuracy_reward": 0.236458339728415,
"rewards/format_reward": 0.0,
"step": 300
},
{
"epoch": 0.2650432765975069,
"eval_completion_length": 855.1784671051333,
"eval_kl": 0.9380622632575758,
"eval_loss": 0.03754038363695145,
"eval_reward": 0.2558922623143052,
"eval_reward_std": 0.2857592190154875,
"eval_rewards/accuracy_reward": 0.2558922623143052,
"eval_rewards/format_reward": 0.0,
"eval_runtime": 552.4722,
"eval_samples_per_second": 0.179,
"eval_steps_per_second": 0.06,
"step": 300
},
{
"completion_length": 812.8906393527984,
"epoch": 0.27387805248409053,
"grad_norm": 0.8576089815755503,
"kl": 1.0579757690429688,
"learning_rate": 1.8222407493612878e-05,
"loss": 0.0423,
"reward": 0.2651041731238365,
"reward_std": 0.27153504360467196,
"rewards/accuracy_reward": 0.2651041731238365,
"rewards/format_reward": 0.0,
"step": 310
},
{
"completion_length": 733.2239722013474,
"epoch": 0.2827128283706741,
"grad_norm": 0.2427855401581468,
"kl": 0.7223701477050781,
"learning_rate": 1.8042700936082574e-05,
"loss": 0.0289,
"reward": 0.29479167317040267,
"reward_std": 0.27604559250175953,
"rewards/accuracy_reward": 0.29479167317040267,
"rewards/format_reward": 0.0,
"step": 320
},
{
"completion_length": 711.7026183843612,
"epoch": 0.2915476042572576,
"grad_norm": 1.9778007104616155,
"kl": 0.8535140991210938,
"learning_rate": 1.7855320317956785e-05,
"loss": 0.0341,
"reward": 0.3192708408460021,
"reward_std": 0.303108885884285,
"rewards/accuracy_reward": 0.3192708408460021,
"rewards/format_reward": 0.0,
"step": 330
},
{
"completion_length": 728.3005360603332,
"epoch": 0.3003823801438412,
"grad_norm": 0.38171472488642083,
"kl": 0.7892318725585937,
"learning_rate": 1.766044443118978e-05,
"loss": 0.0316,
"reward": 0.3526041739620268,
"reward_std": 0.3112278738990426,
"rewards/accuracy_reward": 0.3526041739620268,
"rewards/format_reward": 0.0,
"step": 340
},
{
"completion_length": 767.1573046445847,
"epoch": 0.30921715603042477,
"grad_norm": 0.24577197070273166,
"kl": 0.5293319702148438,
"learning_rate": 1.7458259219455896e-05,
"loss": 0.0212,
"reward": 0.3062500067986548,
"reward_std": 0.2814582511782646,
"rewards/accuracy_reward": 0.3062500067986548,
"rewards/format_reward": 0.0,
"step": 350
},
{
"completion_length": 743.670326089859,
"epoch": 0.3180519319170083,
"grad_norm": 2.0103657238013652,
"kl": 0.5427993774414063,
"learning_rate": 1.7248957600728664e-05,
"loss": 0.0217,
"reward": 0.3505208409391344,
"reward_std": 0.3175426423549652,
"rewards/accuracy_reward": 0.3505208409391344,
"rewards/format_reward": 0.0,
"step": 360
},
{
"completion_length": 840.4734511613846,
"epoch": 0.3268867078035919,
"grad_norm": 1.0323054081356446,
"kl": 1.1146942138671876,
"learning_rate": 1.7032739283205324e-05,
"loss": 0.0446,
"reward": 0.2598958395421505,
"reward_std": 0.25890550669282675,
"rewards/accuracy_reward": 0.2598958395421505,
"rewards/format_reward": 0.0,
"step": 370
},
{
"completion_length": 839.7666805267334,
"epoch": 0.33572148369017546,
"grad_norm": 0.5119633048650775,
"kl": 1.2632476806640625,
"learning_rate": 1.6809810574752316e-05,
"loss": 0.0505,
"reward": 0.2333333392161876,
"reward_std": 0.25168862845748663,
"rewards/accuracy_reward": 0.2333333392161876,
"rewards/format_reward": 0.0,
"step": 380
},
{
"completion_length": 733.7547018527985,
"epoch": 0.344556259576759,
"grad_norm": 1.5656576378617317,
"kl": 1.1475967407226562,
"learning_rate": 1.658038418605361e-05,
"loss": 0.0459,
"reward": 0.2567708396818489,
"reward_std": 0.2634160555899143,
"rewards/accuracy_reward": 0.2567708396818489,
"rewards/format_reward": 0.0,
"step": 390
},
{
"completion_length": 885.3140710353852,
"epoch": 0.3533910354633426,
"grad_norm": 1.4058117586591972,
"kl": 1.0219383239746094,
"learning_rate": 1.6344679027649726e-05,
"loss": 0.0409,
"reward": 0.247916672937572,
"reward_std": 0.2688287142664194,
"rewards/accuracy_reward": 0.247916672937572,
"rewards/format_reward": 0.0,
"step": 400
},
{
"epoch": 0.3533910354633426,
"eval_completion_length": 985.952867296007,
"eval_kl": 1.9370610400883839,
"eval_loss": 0.07748492807149887,
"eval_reward": 0.26262626924900095,
"eval_reward_std": 0.27992739821925305,
"eval_rewards/accuracy_reward": 0.26262626924900095,
"eval_rewards/format_reward": 0.0,
"eval_runtime": 572.8755,
"eval_samples_per_second": 0.173,
"eval_steps_per_second": 0.058,
"step": 400
},
{
"completion_length": 956.2359453678131,
"epoch": 0.36222581134992615,
"grad_norm": 0.42438124420657664,
"kl": 1.3301658630371094,
"learning_rate": 1.6102920001061003e-05,
"loss": 0.0532,
"reward": 0.20677083879709243,
"reward_std": 0.2381569817662239,
"rewards/accuracy_reward": 0.20677083879709243,
"rewards/format_reward": 0.0,
"step": 410
},
{
"completion_length": 952.6323000907898,
"epoch": 0.3710605872365097,
"grad_norm": 0.44079559643052724,
"kl": 1.332806396484375,
"learning_rate": 1.5855337784194576e-05,
"loss": 0.0533,
"reward": 0.19427083847112953,
"reward_std": 0.2309401035308838,
"rewards/accuracy_reward": 0.19427083847112953,
"rewards/format_reward": 0.0,
"step": 420
},
{
"completion_length": 976.3510466098785,
"epoch": 0.3798953631230933,
"grad_norm": 1.0796755649934018,
"kl": 1.4209449768066407,
"learning_rate": 1.560216861123964e-05,
"loss": 0.0568,
"reward": 0.22760417237877845,
"reward_std": 0.2498844088986516,
"rewards/accuracy_reward": 0.22760417237877845,
"rewards/format_reward": 0.0,
"step": 430
},
{
"completion_length": 959.9578190803528,
"epoch": 0.38873013900967684,
"grad_norm": 0.46008757265822564,
"kl": 1.3368026733398437,
"learning_rate": 1.534365404726116e-05,
"loss": 0.0535,
"reward": 0.20885417223908007,
"reward_std": 0.24808018933981657,
"rewards/accuracy_reward": 0.20885417223908007,
"rewards/format_reward": 0.0,
"step": 440
},
{
"completion_length": 917.8979267597199,
"epoch": 0.3975649148962604,
"grad_norm": 1.329880644661546,
"kl": 1.244671630859375,
"learning_rate": 1.5080040757707045e-05,
"loss": 0.0498,
"reward": 0.20833333865739406,
"reward_std": 0.22462533507496119,
"rewards/accuracy_reward": 0.20833333865739406,
"rewards/format_reward": 0.0,
"step": 450
},
{
"completion_length": 877.7073036193848,
"epoch": 0.406399690782844,
"grad_norm": 2.63876569492226,
"kl": 1.6055191040039063,
"learning_rate": 1.4811580273048707e-05,
"loss": 0.0642,
"reward": 0.21770833879709245,
"reward_std": 0.23635276220738888,
"rewards/accuracy_reward": 0.21770833879709245,
"rewards/format_reward": 0.0,
"step": 460
},
{
"completion_length": 842.9104291439056,
"epoch": 0.41523446666942754,
"grad_norm": 2.9897920260252797,
"kl": 1.5962646484375,
"learning_rate": 1.4538528748779561e-05,
"loss": 0.0638,
"reward": 0.27239584024064245,
"reward_std": 0.2976962272077799,
"rewards/accuracy_reward": 0.27239584024064245,
"rewards/format_reward": 0.0,
"step": 470
},
{
"completion_length": 846.857304239273,
"epoch": 0.4240692425560111,
"grad_norm": 0.4405188552245265,
"kl": 2.0165779113769533,
"learning_rate": 1.4261146721000554e-05,
"loss": 0.0807,
"reward": 0.26614583996124563,
"reward_std": 0.275143482722342,
"rewards/accuracy_reward": 0.26614583996124563,
"rewards/format_reward": 0.0,
"step": 480
},
{
"completion_length": 788.802097249031,
"epoch": 0.4329040184425947,
"grad_norm": 0.9995458801565597,
"kl": 2.145621490478516,
"learning_rate": 1.3979698857825816e-05,
"loss": 0.0858,
"reward": 0.2781250068452209,
"reward_std": 0.28416458051651716,
"rewards/accuracy_reward": 0.2781250068452209,
"rewards/format_reward": 0.0,
"step": 490
},
{
"completion_length": 716.1062657117843,
"epoch": 0.44173879432917823,
"grad_norm": 0.7288658015008603,
"kl": 1.736468505859375,
"learning_rate": 1.3694453706845725e-05,
"loss": 0.0695,
"reward": 0.273437506519258,
"reward_std": 0.2787519218400121,
"rewards/accuracy_reward": 0.273437506519258,
"rewards/format_reward": 0.0,
"step": 500
},
{
"epoch": 0.44173879432917823,
"eval_completion_length": 691.8889026834507,
"eval_kl": 1.6545336174242424,
"eval_loss": 0.06618843972682953,
"eval_reward": 0.2828282897520547,
"eval_reward_std": 0.29159103981172196,
"eval_rewards/accuracy_reward": 0.2828282897520547,
"eval_rewards/format_reward": 0.0,
"eval_runtime": 537.4204,
"eval_samples_per_second": 0.184,
"eval_steps_per_second": 0.061,
"step": 500
},
{
"completion_length": 708.6135584831238,
"epoch": 0.4505735702157618,
"grad_norm": 0.3575580002721733,
"kl": 1.9233924865722656,
"learning_rate": 1.3405683438888281e-05,
"loss": 0.0769,
"reward": 0.23697917256504297,
"reward_std": 0.2571012871339917,
"rewards/accuracy_reward": 0.23697917256504297,
"rewards/format_reward": 0.0,
"step": 510
},
{
"completion_length": 768.7807433485984,
"epoch": 0.4594083461023454,
"grad_norm": 0.3623016043344235,
"kl": 1.5948562622070312,
"learning_rate": 1.3113663588323356e-05,
"loss": 0.0638,
"reward": 0.2958333403803408,
"reward_std": 0.29679411742836237,
"rewards/accuracy_reward": 0.2958333403803408,
"rewards/format_reward": 0.0,
"step": 520
},
{
"completion_length": 761.9812644720078,
"epoch": 0.4682431219889289,
"grad_norm": 1.1199341162268286,
"kl": 2.285528564453125,
"learning_rate": 1.2818672790157543e-05,
"loss": 0.0914,
"reward": 0.3104166739620268,
"reward_std": 0.3112278738990426,
"rewards/accuracy_reward": 0.3104166739620268,
"rewards/format_reward": 0.0,
"step": 530
},
{
"completion_length": 801.8776177227497,
"epoch": 0.47707789787551247,
"grad_norm": 2.033721333586972,
"kl": 1.8555000305175782,
"learning_rate": 1.252099251417048e-05,
"loss": 0.0742,
"reward": 0.26406250600703063,
"reward_std": 0.2625139458104968,
"rewards/accuracy_reward": 0.26406250600703063,
"rewards/format_reward": 0.0,
"step": 540
},
{
"completion_length": 772.0724115848541,
"epoch": 0.48591267376209607,
"grad_norm": 0.7488074779014002,
"kl": 1.9588623046875,
"learning_rate": 1.2220906796346375e-05,
"loss": 0.0784,
"reward": 0.28177083991467955,
"reward_std": 0.2796540316194296,
"rewards/accuracy_reward": 0.28177083991467955,
"rewards/format_reward": 0.0,
"step": 550
},
{
"completion_length": 853.2635561227798,
"epoch": 0.4947474496486796,
"grad_norm": 0.8443715089551579,
"kl": 1.8191993713378907,
"learning_rate": 1.1918701967856892e-05,
"loss": 0.0728,
"reward": 0.3036458405200392,
"reward_std": 0.29679411742836237,
"rewards/accuracy_reward": 0.3036458405200392,
"rewards/format_reward": 0.0,
"step": 560
},
{
"completion_length": 858.300532579422,
"epoch": 0.5035822255352632,
"grad_norm": 1.2444227038825535,
"kl": 2.2291900634765627,
"learning_rate": 1.1614666381854107e-05,
"loss": 0.0892,
"reward": 0.26041667321696876,
"reward_std": 0.2805561413988471,
"rewards/accuracy_reward": 0.26041667321696876,
"rewards/format_reward": 0.0,
"step": 570
},
{
"completion_length": 960.0770899295807,
"epoch": 0.5124170014218468,
"grad_norm": 0.5125806971801138,
"kl": 1.9308998107910156,
"learning_rate": 1.1309090138334112e-05,
"loss": 0.0772,
"reward": 0.3078125073108822,
"reward_std": 0.30040255654603243,
"rewards/accuracy_reward": 0.3078125073108822,
"rewards/format_reward": 0.0,
"step": 580
},
{
"completion_length": 884.6036564350128,
"epoch": 0.5212517773084303,
"grad_norm": 0.6680675746939368,
"kl": 1.9921707153320312,
"learning_rate": 1.100226480733388e-05,
"loss": 0.0797,
"reward": 0.24947917312383652,
"reward_std": 0.28867512941360474,
"rewards/accuracy_reward": 0.24947917312383652,
"rewards/format_reward": 0.0,
"step": 590
},
{
"completion_length": 799.9510572910309,
"epoch": 0.5300865531950139,
"grad_norm": 3.062049979351596,
"kl": 2.207135009765625,
"learning_rate": 1.0694483150725458e-05,
"loss": 0.0883,
"reward": 0.29583334028720853,
"reward_std": 0.29498989786952734,
"rewards/accuracy_reward": 0.29583334028720853,
"rewards/format_reward": 0.0,
"step": 600
},
{
"epoch": 0.5300865531950139,
"eval_completion_length": 775.0841884420375,
"eval_kl": 2.8066208964646466,
"eval_loss": 0.11227121949195862,
"eval_reward": 0.28619529352043616,
"eval_reward_std": 0.3207501437928941,
"eval_rewards/accuracy_reward": 0.28619529352043616,
"eval_rewards/format_reward": 0.0,
"eval_runtime": 554.5488,
"eval_samples_per_second": 0.179,
"eval_steps_per_second": 0.06,
"step": 600
},
{
"completion_length": 862.4078252792358,
"epoch": 0.5389213290815974,
"grad_norm": 0.4132033561721883,
"kl": 1.8186492919921875,
"learning_rate": 1.038603884287294e-05,
"loss": 0.0728,
"reward": 0.15156250400468707,
"reward_std": 0.17861773632466793,
"rewards/accuracy_reward": 0.15156250400468707,
"rewards/format_reward": 0.0,
"step": 610
},
{
"completion_length": 623.4411586642266,
"epoch": 0.5477561049681811,
"grad_norm": 0.5984776310180936,
"kl": 0.5929832458496094,
"learning_rate": 1.0077226190418783e-05,
"loss": 0.0237,
"reward": 0.1755208382382989,
"reward_std": 0.2255274448543787,
"rewards/accuracy_reward": 0.1755208382382989,
"rewards/format_reward": 0.0,
"step": 620
},
{
"completion_length": 675.0708496570587,
"epoch": 0.5565908808547646,
"grad_norm": 0.4802015784335563,
"kl": 0.47046356201171874,
"learning_rate": 9.768339851466818e-06,
"loss": 0.0188,
"reward": 0.2515625067520887,
"reward_std": 0.2940877880901098,
"rewards/accuracy_reward": 0.2515625067520887,
"rewards/format_reward": 0.0,
"step": 630
},
{
"completion_length": 768.3067860364914,
"epoch": 0.5654256567413481,
"grad_norm": 2.4930383667378413,
"kl": 1.0879661560058593,
"learning_rate": 9.45967455442995e-06,
"loss": 0.0435,
"reward": 0.34322917480021714,
"reward_std": 0.3301721792668104,
"rewards/accuracy_reward": 0.34322917480021714,
"rewards/format_reward": 0.0,
"step": 640
},
{
"completion_length": 789.8026188373566,
"epoch": 0.5742604326279317,
"grad_norm": 1.7100954361428433,
"kl": 2.7097198486328127,
"learning_rate": 9.151524816810686e-06,
"loss": 0.1084,
"reward": 0.31666667400859294,
"reward_std": 0.30671732500195503,
"rewards/accuracy_reward": 0.31666667400859294,
"rewards/format_reward": 0.0,
"step": 650
},
{
"completion_length": 707.0859542965889,
"epoch": 0.5830952085145152,
"grad_norm": 1.57666179442439,
"kl": 1.7728652954101562,
"learning_rate": 8.844184664182993e-06,
"loss": 0.0709,
"reward": 0.35000000791624186,
"reward_std": 0.33107428904622793,
"rewards/accuracy_reward": 0.35000000791624186,
"rewards/format_reward": 0.0,
"step": 660
},
{
"completion_length": 773.0073058724404,
"epoch": 0.5919299844010988,
"grad_norm": 0.9807231122231896,
"kl": 2.1020263671875,
"learning_rate": 8.537947349643493e-06,
"loss": 0.0841,
"reward": 0.29739584047347306,
"reward_std": 0.3112278738990426,
"rewards/accuracy_reward": 0.29739584047347306,
"rewards/format_reward": 0.0,
"step": 670
},
{
"completion_length": 777.6401177406311,
"epoch": 0.6007647602876824,
"grad_norm": 0.5403220713078971,
"kl": 1.6934829711914063,
"learning_rate": 8.23310507399973e-06,
"loss": 0.0678,
"reward": 0.2744791732635349,
"reward_std": 0.2805561413988471,
"rewards/accuracy_reward": 0.2744791732635349,
"rewards/format_reward": 0.0,
"step": 680
},
{
"completion_length": 767.1880352020264,
"epoch": 0.609599536174266,
"grad_norm": 0.6884159799218071,
"kl": 2.134033203125,
"learning_rate": 7.929948706962508e-06,
"loss": 0.0854,
"reward": 0.28958334047347306,
"reward_std": 0.30671732500195503,
"rewards/accuracy_reward": 0.28958334047347306,
"rewards/format_reward": 0.0,
"step": 690
},
{
"completion_length": 716.2224100291729,
"epoch": 0.6184343120608495,
"grad_norm": 0.9853920893844104,
"kl": 1.9078399658203125,
"learning_rate": 7.628767509608304e-06,
"loss": 0.0763,
"reward": 0.31041667447425425,
"reward_std": 0.32566163036972284,
"rewards/accuracy_reward": 0.31041667447425425,
"rewards/format_reward": 0.0,
"step": 700
},
{
"epoch": 0.6184343120608495,
"eval_completion_length": 676.0437842475044,
"eval_kl": 2.118035827020202,
"eval_loss": 0.08472807705402374,
"eval_reward": 0.30976431749083777,
"eval_reward_std": 0.3382456061815975,
"eval_rewards/accuracy_reward": 0.30976431749083777,
"eval_rewards/format_reward": 0.0,
"eval_runtime": 527.3635,
"eval_samples_per_second": 0.188,
"eval_steps_per_second": 0.063,
"step": 700
},
{
"completion_length": 693.3250136733055,
"epoch": 0.6272690879474331,
"grad_norm": 0.6986064715250526,
"kl": 2.01304931640625,
"learning_rate": 7.329848858376585e-06,
"loss": 0.0805,
"reward": 0.31510417414829134,
"reward_std": 0.3166405325755477,
"rewards/accuracy_reward": 0.31510417414829134,
"rewards/format_reward": 0.0,
"step": 710
},
{
"completion_length": 737.9067856788636,
"epoch": 0.6361038638340166,
"grad_norm": 2.3133745632209393,
"kl": 2.0740203857421875,
"learning_rate": 7.033477970865381e-06,
"loss": 0.083,
"reward": 0.32343750772997737,
"reward_std": 0.3175426423549652,
"rewards/accuracy_reward": 0.32343750772997737,
"rewards/format_reward": 0.0,
"step": 720
},
{
"completion_length": 747.067202681303,
"epoch": 0.6449386397206002,
"grad_norm": 0.7455664324085425,
"kl": 2.0231887817382814,
"learning_rate": 6.73993763368675e-06,
"loss": 0.0809,
"reward": 0.3057291737757623,
"reward_std": 0.30040255654603243,
"rewards/accuracy_reward": 0.3057291737757623,
"rewards/format_reward": 0.0,
"step": 730
},
{
"completion_length": 730.435950744152,
"epoch": 0.6537734156071838,
"grad_norm": 1.9312009716198788,
"kl": 2.154399108886719,
"learning_rate": 6.449507932641796e-06,
"loss": 0.0862,
"reward": 0.2927083405200392,
"reward_std": 0.3040109956637025,
"rewards/accuracy_reward": 0.2927083405200392,
"rewards/format_reward": 0.0,
"step": 740
},
{
"completion_length": 707.3302231192589,
"epoch": 0.6626081914937674,
"grad_norm": 0.6668502719545971,
"kl": 1.8355056762695312,
"learning_rate": 6.16246598547271e-06,
"loss": 0.0734,
"reward": 0.2916666741017252,
"reward_std": 0.32115108147263527,
"rewards/accuracy_reward": 0.2916666741017252,
"rewards/format_reward": 0.0,
"step": 750
},
{
"completion_length": 740.6020975530148,
"epoch": 0.6714429673803509,
"grad_norm": 0.7705413022710031,
"kl": 2.15982666015625,
"learning_rate": 5.8790856774468385e-06,
"loss": 0.0864,
"reward": 0.3046875072643161,
"reward_std": 0.30491310544312,
"rewards/accuracy_reward": 0.3046875072643161,
"rewards/format_reward": 0.0,
"step": 760
},
{
"completion_length": 739.286992508173,
"epoch": 0.6802777432669345,
"grad_norm": 0.5391118755792965,
"kl": 2.0162887573242188,
"learning_rate": 5.599637400025036e-06,
"loss": 0.0807,
"reward": 0.31458334061317145,
"reward_std": 0.3112278738990426,
"rewards/accuracy_reward": 0.31458334061317145,
"rewards/format_reward": 0.0,
"step": 770
},
{
"completion_length": 748.7869940280914,
"epoch": 0.689112519153518,
"grad_norm": 0.7122982868192613,
"kl": 2.226336669921875,
"learning_rate": 5.324387792863719e-06,
"loss": 0.0891,
"reward": 0.3234375074971467,
"reward_std": 0.3058152152225375,
"rewards/accuracy_reward": 0.3234375074971467,
"rewards/format_reward": 0.0,
"step": 780
},
{
"completion_length": 737.4474101424217,
"epoch": 0.6979472950401016,
"grad_norm": 0.6757655195430946,
"kl": 2.05509033203125,
"learning_rate": 5.053599489396732e-06,
"loss": 0.0822,
"reward": 0.31250000759027896,
"reward_std": 0.31393420323729515,
"rewards/accuracy_reward": 0.31250000759027896,
"rewards/format_reward": 0.0,
"step": 790
},
{
"completion_length": 711.7630351662635,
"epoch": 0.7067820709266852,
"grad_norm": 0.5761504098242444,
"kl": 1.904425048828125,
"learning_rate": 4.787530866239826e-06,
"loss": 0.0762,
"reward": 0.30729167386889455,
"reward_std": 0.303108885884285,
"rewards/accuracy_reward": 0.30729167386889455,
"rewards/format_reward": 0.0,
"step": 800
},
{
"epoch": 0.7067820709266852,
"eval_completion_length": 720.3198793199327,
"eval_kl": 1.9343237058080809,
"eval_loss": 0.07739535719156265,
"eval_reward": 0.32996633739182446,
"eval_reward_std": 0.3149183229966597,
"eval_rewards/accuracy_reward": 0.32996633739182446,
"eval_rewards/format_reward": 0.0,
"eval_runtime": 534.7287,
"eval_samples_per_second": 0.185,
"eval_steps_per_second": 0.062,
"step": 800
},
{
"completion_length": 715.5786605343222,
"epoch": 0.7156168468132688,
"grad_norm": 0.7254446582873234,
"kl": 2.05001220703125,
"learning_rate": 4.52643579665683e-06,
"loss": 0.082,
"reward": 0.3187500074040145,
"reward_std": 0.31303209345787764,
"rewards/accuracy_reward": 0.3187500074040145,
"rewards/format_reward": 0.0,
"step": 810
},
{
"completion_length": 729.5484510660172,
"epoch": 0.7244516226998523,
"grad_norm": 0.8829844749096067,
"kl": 2.499810791015625,
"learning_rate": 4.270563408322772e-06,
"loss": 0.1,
"reward": 0.3192708410322666,
"reward_std": 0.3184447521343827,
"rewards/accuracy_reward": 0.3192708410322666,
"rewards/format_reward": 0.0,
"step": 820
},
{
"completion_length": 717.5286597132683,
"epoch": 0.7332863985864359,
"grad_norm": 1.7173630228550354,
"kl": 2.062689208984375,
"learning_rate": 4.020157845615075e-06,
"loss": 0.0825,
"reward": 0.33333334093913436,
"reward_std": 0.3094236543402076,
"rewards/accuracy_reward": 0.33333334093913436,
"rewards/format_reward": 0.0,
"step": 830
},
{
"completion_length": 731.0494927078486,
"epoch": 0.7421211744730194,
"grad_norm": 0.829005223012386,
"kl": 2.804341125488281,
"learning_rate": 3.7754580366596116e-06,
"loss": 0.1122,
"reward": 0.29739584065973756,
"reward_std": 0.3220531912520528,
"rewards/accuracy_reward": 0.29739584065973756,
"rewards/format_reward": 0.0,
"step": 840
},
{
"completion_length": 706.3677220225334,
"epoch": 0.7509559503596029,
"grad_norm": 1.6348693110104273,
"kl": 1.9862686157226563,
"learning_rate": 3.5366974653539653e-06,
"loss": 0.0795,
"reward": 0.30312500717118385,
"reward_std": 0.30761943478137255,
"rewards/accuracy_reward": 0.30312500717118385,
"rewards/format_reward": 0.0,
"step": 850
},
{
"completion_length": 698.5177225530148,
"epoch": 0.7597907262461866,
"grad_norm": 0.9159549909583196,
"kl": 2.294660949707031,
"learning_rate": 3.304103948585341e-06,
"loss": 0.0918,
"reward": 0.3046875072643161,
"reward_std": 0.30130466632544994,
"rewards/accuracy_reward": 0.3046875072643161,
"rewards/format_reward": 0.0,
"step": 860
},
{
"completion_length": 672.9390772372484,
"epoch": 0.7686255021327701,
"grad_norm": 2.1045299190913833,
"kl": 2.3146575927734374,
"learning_rate": 3.0778994188557722e-06,
"loss": 0.0926,
"reward": 0.315625007962808,
"reward_std": 0.32475952059030533,
"rewards/accuracy_reward": 0.315625007962808,
"rewards/format_reward": 0.0,
"step": 870
},
{
"completion_length": 674.6109511375428,
"epoch": 0.7774602780193537,
"grad_norm": 0.7983792148121029,
"kl": 2.302967834472656,
"learning_rate": 2.8582997125219604e-06,
"loss": 0.0921,
"reward": 0.30260417428798975,
"reward_std": 0.32566163036972284,
"rewards/accuracy_reward": 0.30260417428798975,
"rewards/format_reward": 0.0,
"step": 880
},
{
"completion_length": 656.9838689267635,
"epoch": 0.7862950539059372,
"grad_norm": 0.669988664196779,
"kl": 2.0931640625,
"learning_rate": 2.645514363851874e-06,
"loss": 0.0837,
"reward": 0.3010416736826301,
"reward_std": 0.2931856783106923,
"rewards/accuracy_reward": 0.3010416736826301,
"rewards/format_reward": 0.0,
"step": 890
},
{
"completion_length": 652.4843894541264,
"epoch": 0.7951298297925208,
"grad_norm": 1.3222110289859523,
"kl": 1.9164962768554688,
"learning_rate": 2.4397464050945753e-06,
"loss": 0.0767,
"reward": 0.2838541739154607,
"reward_std": 0.31393420323729515,
"rewards/accuracy_reward": 0.2838541739154607,
"rewards/format_reward": 0.0,
"step": 900
},
{
"epoch": 0.7951298297925208,
"eval_completion_length": 622.8788027522539,
"eval_kl": 2.0288036616161618,
"eval_loss": 0.08118358999490738,
"eval_reward": 0.3131313209581857,
"eval_reward_std": 0.3265819645891286,
"eval_rewards/accuracy_reward": 0.3131313209581857,
"eval_rewards/format_reward": 0.0,
"eval_runtime": 506.0213,
"eval_samples_per_second": 0.196,
"eval_steps_per_second": 0.065,
"step": 900
},
{
"completion_length": 651.8578262448311,
"epoch": 0.8039646056791043,
"grad_norm": 1.0220083602716086,
"kl": 2.2107833862304687,
"learning_rate": 2.24119217275401e-06,
"loss": 0.0884,
"reward": 0.2739583405200392,
"reward_std": 0.30671732500195503,
"rewards/accuracy_reward": 0.2739583405200392,
"rewards/format_reward": 0.0,
"step": 910
},
{
"completion_length": 630.3770972907544,
"epoch": 0.812799381565688,
"grad_norm": 1.140106665270527,
"kl": 2.0034912109375,
"learning_rate": 2.0500411202516814e-06,
"loss": 0.0802,
"reward": 0.29166667340323327,
"reward_std": 0.2850666902959347,
"rewards/accuracy_reward": 0.29166667340323327,
"rewards/format_reward": 0.0,
"step": 920
},
{
"completion_length": 633.9614718899131,
"epoch": 0.8216341574522715,
"grad_norm": 1.1778599052603038,
"kl": 1.8928955078125,
"learning_rate": 1.8664756371568981e-06,
"loss": 0.0757,
"reward": 0.2963541735429317,
"reward_std": 0.29498989786952734,
"rewards/accuracy_reward": 0.2963541735429317,
"rewards/format_reward": 0.0,
"step": 930
},
{
"completion_length": 658.522930726409,
"epoch": 0.8304689333388551,
"grad_norm": 0.6577212329055425,
"kl": 2.294940185546875,
"learning_rate": 1.6906708751570955e-06,
"loss": 0.0918,
"reward": 0.31093750675208864,
"reward_std": 0.30220677610486746,
"rewards/accuracy_reward": 0.31093750675208864,
"rewards/format_reward": 0.0,
"step": 940
},
{
"completion_length": 677.2802217006683,
"epoch": 0.8393037092254386,
"grad_norm": 0.7422568973111673,
"kl": 2.5144363403320313,
"learning_rate": 1.5227945809342992e-06,
"loss": 0.1006,
"reward": 0.3151041742414236,
"reward_std": 0.3166405325755477,
"rewards/accuracy_reward": 0.3151041742414236,
"rewards/format_reward": 0.0,
"step": 950
},
{
"completion_length": 652.5901178598403,
"epoch": 0.8481384851120222,
"grad_norm": 0.810421906439625,
"kl": 2.262736511230469,
"learning_rate": 1.363006936107183e-06,
"loss": 0.0905,
"reward": 0.34635417480021713,
"reward_std": 0.336486947722733,
"rewards/accuracy_reward": 0.34635417480021713,
"rewards/format_reward": 0.0,
"step": 960
},
{
"completion_length": 670.2057422459126,
"epoch": 0.8569732609986057,
"grad_norm": 0.7920465762778155,
"kl": 2.5366302490234376,
"learning_rate": 1.2114604043914225e-06,
"loss": 0.1015,
"reward": 0.32031250768341124,
"reward_std": 0.3175426423549652,
"rewards/accuracy_reward": 0.32031250768341124,
"rewards/format_reward": 0.0,
"step": 970
},
{
"completion_length": 682.0869930744171,
"epoch": 0.8658080368851894,
"grad_norm": 0.6071556277616226,
"kl": 2.4378128051757812,
"learning_rate": 1.068299586124224e-06,
"loss": 0.0975,
"reward": 0.33072917396202683,
"reward_std": 0.2995004467666149,
"rewards/accuracy_reward": 0.33072917396202683,
"rewards/format_reward": 0.0,
"step": 980
},
{
"completion_length": 663.8849090188742,
"epoch": 0.8746428127717729,
"grad_norm": 0.7004418244247146,
"kl": 2.3257492065429686,
"learning_rate": 9.336610802918044e-07,
"loss": 0.093,
"reward": 0.32447917433455586,
"reward_std": 0.3094236543402076,
"rewards/accuracy_reward": 0.32447917433455586,
"rewards/format_reward": 0.0,
"step": 990
},
{
"completion_length": 678.5744934767484,
"epoch": 0.8834775886583565,
"grad_norm": 1.5130552859053217,
"kl": 2.3184799194335937,
"learning_rate": 8.076733541914617e-07,
"loss": 0.0927,
"reward": 0.33281250740401447,
"reward_std": 0.30491310544312,
"rewards/accuracy_reward": 0.33281250740401447,
"rewards/format_reward": 0.0,
"step": 1000
},
{
"epoch": 0.8834775886583565,
"eval_completion_length": 664.8687005476518,
"eval_kl": 2.323409880050505,
"eval_loss": 0.0929863303899765,
"eval_reward": 0.3535353619642932,
"eval_reward_std": 0.3440774269778319,
"eval_rewards/accuracy_reward": 0.3535353619642932,
"eval_rewards/format_reward": 0.0,
"eval_runtime": 516.2096,
"eval_samples_per_second": 0.192,
"eval_steps_per_second": 0.064,
"step": 1000
},
{
"completion_length": 702.9177223086357,
"epoch": 0.89231236454494,
"grad_norm": 1.529025375905959,
"kl": 2.152081298828125,
"learning_rate": 6.90456620852632e-07,
"loss": 0.0861,
"reward": 0.3427083408460021,
"reward_std": 0.31393420323729515,
"rewards/accuracy_reward": 0.3427083408460021,
"rewards/format_reward": 0.0,
"step": 1010
},
{
"completion_length": 693.2552229389548,
"epoch": 0.9011471404315236,
"grad_norm": 0.6357248703500725,
"kl": 2.1104583740234375,
"learning_rate": 5.821227243338712e-07,
"loss": 0.0844,
"reward": 0.32656250763684513,
"reward_std": 0.3157384227961302,
"rewards/accuracy_reward": 0.32656250763684513,
"rewards/format_reward": 0.0,
"step": 1020
},
{
"completion_length": 669.4468886733055,
"epoch": 0.9099819163181071,
"grad_norm": 2.2284285054958324,
"kl": 2.1510406494140626,
"learning_rate": 4.827750330052117e-07,
"loss": 0.0861,
"reward": 0.3536458413582295,
"reward_std": 0.32566163036972284,
"rewards/accuracy_reward": 0.3536458413582295,
"rewards/format_reward": 0.0,
"step": 1030
},
{
"completion_length": 683.547409978509,
"epoch": 0.9188166922046908,
"grad_norm": 0.6632003611968932,
"kl": 2.2973922729492187,
"learning_rate": 3.925083409177266e-07,
"loss": 0.0919,
"reward": 0.34010417442768814,
"reward_std": 0.31934686191380024,
"rewards/accuracy_reward": 0.34010417442768814,
"rewards/format_reward": 0.0,
"step": 1040
},
{
"completion_length": 679.4802215665579,
"epoch": 0.9276514680912743,
"grad_norm": 0.8599780179720123,
"kl": 2.3347732543945314,
"learning_rate": 3.114087773543939e-07,
"loss": 0.0934,
"reward": 0.3500000080559403,
"reward_std": 0.33107428904622793,
"rewards/accuracy_reward": 0.3500000080559403,
"rewards/format_reward": 0.0,
"step": 1050
},
{
"completion_length": 703.7713681519032,
"epoch": 0.9364862439778578,
"grad_norm": 1.174851381620777,
"kl": 2.408406066894531,
"learning_rate": 2.395537246485846e-07,
"loss": 0.0963,
"reward": 0.329166674753651,
"reward_std": 0.34370382595807314,
"rewards/accuracy_reward": 0.329166674753651,
"rewards/format_reward": 0.0,
"step": 1060
},
{
"completion_length": 678.3260560303927,
"epoch": 0.9453210198644414,
"grad_norm": 1.54743742951263,
"kl": 2.394914245605469,
"learning_rate": 1.7701174434858193e-07,
"loss": 0.0958,
"reward": 0.33802084121853115,
"reward_std": 0.32024897169321775,
"rewards/accuracy_reward": 0.33802084121853115,
"rewards/format_reward": 0.0,
"step": 1070
},
{
"completion_length": 664.8302224695683,
"epoch": 0.9541557957510249,
"grad_norm": 1.5666079028942566,
"kl": 2.3258026123046873,
"learning_rate": 1.2384251179857642e-07,
"loss": 0.093,
"reward": 0.3578125084284693,
"reward_std": 0.35001859441399574,
"rewards/accuracy_reward": 0.3578125084284693,
"rewards/format_reward": 0.0,
"step": 1080
},
{
"completion_length": 682.4229293212295,
"epoch": 0.9629905716376085,
"grad_norm": 0.6943507484399737,
"kl": 2.487345886230469,
"learning_rate": 8.009675919856574e-08,
"loss": 0.0995,
"reward": 0.32239584033377466,
"reward_std": 0.29679411742836237,
"rewards/accuracy_reward": 0.32239584033377466,
"rewards/format_reward": 0.0,
"step": 1090
},
{
"completion_length": 689.0625125810504,
"epoch": 0.9718253475241921,
"grad_norm": 0.6276023210613672,
"kl": 2.3767745971679686,
"learning_rate": 4.581622719748269e-08,
"loss": 0.0951,
"reward": 0.3395833406597376,
"reward_std": 0.2985983369871974,
"rewards/accuracy_reward": 0.3395833406597376,
"rewards/format_reward": 0.0,
"step": 1100
},
{
"epoch": 0.9718253475241921,
"eval_completion_length": 665.5656705528799,
"eval_kl": 2.368923611111111,
"eval_loss": 0.09475857019424438,
"eval_reward": 0.3804713891010092,
"eval_reward_std": 0.3382456061815975,
"eval_rewards/accuracy_reward": 0.3804713891010092,
"eval_rewards/format_reward": 0.0,
"eval_runtime": 510.5875,
"eval_samples_per_second": 0.194,
"eval_steps_per_second": 0.065,
"step": 1100
},
{
"completion_length": 687.5755343332887,
"epoch": 0.9806601234107757,
"grad_norm": 0.5833877177508657,
"kl": 2.3449798583984376,
"learning_rate": 2.1033625065747244e-08,
"loss": 0.0938,
"reward": 0.3359375076368451,
"reward_std": 0.31393420323729515,
"rewards/accuracy_reward": 0.3359375076368451,
"rewards/format_reward": 0.0,
"step": 1110
},
{
"completion_length": 686.5161595344543,
"epoch": 0.9894948992973592,
"grad_norm": 0.8177663703595698,
"kl": 2.952043151855469,
"learning_rate": 5.772599485236452e-09,
"loss": 0.118,
"reward": 0.3333333409857005,
"reward_std": 0.3238574108108878,
"rewards/accuracy_reward": 0.3333333409857005,
"rewards/format_reward": 0.0,
"step": 1120
},
{
"completion_length": 679.1244929388165,
"epoch": 0.9983296751839428,
"grad_norm": 1.027763278137717,
"kl": 2.3051712036132814,
"learning_rate": 4.7711986460585725e-11,
"loss": 0.0922,
"reward": 0.3552083419635892,
"reward_std": 0.34370382595807314,
"rewards/accuracy_reward": 0.3552083419635892,
"rewards/format_reward": 0.0,
"step": 1130
},
{
"completion_length": 695.1406378149986,
"epoch": 0.9992131527726011,
"kl": 2.40301513671875,
"reward": 0.3125000074505806,
"reward_std": 0.32475952059030533,
"rewards/accuracy_reward": 0.3125000074505806,
"rewards/format_reward": 0.0,
"step": 1131,
"total_flos": 0.0,
"train_loss": 0.06294877494734573,
"train_runtime": 428958.1793,
"train_samples_per_second": 0.169,
"train_steps_per_second": 0.003
}
],
"logging_steps": 10,
"max_steps": 1131,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 200,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}