{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.997867803837953, "eval_steps": 116, "global_step": 468, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio": 0.0, "completion_length": 625.9241256713867, "epoch": 0.008528784648187633, "grad_norm": 0.20551569759845734, "kl": 0.0, "learning_rate": 2.127659574468085e-08, "loss": 0.0447, "reward": 0.7433036118745804, "reward_std": 0.190913749858737, "rewards/accuracy_reward": 0.7299107536673546, "rewards/format_reward": 0.013392857741564512, "step": 1 }, { "clip_ratio": 0.0, "completion_length": 583.8616371154785, "epoch": 0.042643923240938165, "grad_norm": 0.5290549397468567, "kl": 7.251650094985962e-05, "learning_rate": 1.0638297872340425e-07, "loss": 0.054, "reward": 0.7550223553553224, "reward_std": 0.237873874604702, "rewards/accuracy_reward": 0.7466518199071288, "rewards/format_reward": 0.008370536146685481, "step": 5 }, { "clip_ratio": 0.0, "completion_length": 599.1326179504395, "epoch": 0.08528784648187633, "grad_norm": 0.28203362226486206, "kl": 8.721351623535157e-05, "learning_rate": 2.127659574468085e-07, "loss": 0.0351, "reward": 0.727232176065445, "reward_std": 0.2079640648327768, "rewards/accuracy_reward": 0.7214286044239998, "rewards/format_reward": 0.005803571781143546, "step": 10 }, { "clip_ratio": 0.0, "completion_length": 575.5317222595215, "epoch": 0.1279317697228145, "grad_norm": 0.22249764204025269, "kl": 0.00016361474990844727, "learning_rate": 3.1914893617021275e-07, "loss": 0.03, "reward": 0.7473214611411094, "reward_std": 0.22900055218487977, "rewards/accuracy_reward": 0.7375000298023224, "rewards/format_reward": 0.009821429150179029, "step": 15 }, { "clip_ratio": 0.0, "completion_length": 582.1107414245605, "epoch": 0.17057569296375266, "grad_norm": 0.3668869137763977, "kl": 0.00010156631469726562, "learning_rate": 4.25531914893617e-07, "loss": 0.0314, "reward": 0.7700893200933934, "reward_std": 0.21237293258309364, "rewards/accuracy_reward": 0.7607143215835095, "rewards/format_reward": 0.009375000512227416, "step": 20 }, { "clip_ratio": 0.0, "completion_length": 565.0393089294433, "epoch": 0.21321961620469082, "grad_norm": 0.27896830439567566, "kl": 0.00015020370483398438, "learning_rate": 5.319148936170212e-07, "loss": 0.036, "reward": 0.7875000402331352, "reward_std": 0.21531264819204807, "rewards/accuracy_reward": 0.7767857536673546, "rewards/format_reward": 0.010714286286383868, "step": 25 }, { "clip_ratio": 0.0, "completion_length": 576.9236869812012, "epoch": 0.255863539445629, "grad_norm": 0.41902390122413635, "kl": 0.0003345251083374023, "learning_rate": 6.382978723404255e-07, "loss": 0.0403, "reward": 0.775000037252903, "reward_std": 0.23194959200918674, "rewards/accuracy_reward": 0.7656250312924385, "rewards/format_reward": 0.00937500041909516, "step": 30 }, { "clip_ratio": 0.0, "completion_length": 569.61029586792, "epoch": 0.29850746268656714, "grad_norm": 0.16671252250671387, "kl": 0.13303523063659667, "learning_rate": 7.446808510638297e-07, "loss": 0.0383, "reward": 0.7870536059141159, "reward_std": 0.21664966912940145, "rewards/accuracy_reward": 0.7705357536673546, "rewards/format_reward": 0.0165178578812629, "step": 35 }, { "clip_ratio": 0.0, "completion_length": 577.1330627441406, "epoch": 0.3411513859275053, "grad_norm": 0.4918578565120697, "kl": 0.0008090019226074218, "learning_rate": 8.51063829787234e-07, "loss": 0.0388, "reward": 0.8004464715719223, "reward_std": 0.23178436178714037, "rewards/accuracy_reward": 0.771428607404232, "rewards/format_reward": 0.02901785881258547, "step": 40 }, { "clip_ratio": 0.0, "completion_length": 591.5464546203614, "epoch": 0.3837953091684435, "grad_norm": 0.5081108808517456, "kl": 0.005326557159423828, "learning_rate": 9.574468085106384e-07, "loss": 0.0432, "reward": 0.8508928954601288, "reward_std": 0.2962774943560362, "rewards/accuracy_reward": 0.7718750327825546, "rewards/format_reward": 0.07901786002330483, "step": 45 }, { "clip_ratio": 0.0, "completion_length": 590.6241355895996, "epoch": 0.42643923240938164, "grad_norm": 1.225230097770691, "kl": 0.02999114990234375, "learning_rate": 9.998747147528373e-07, "loss": 0.0284, "reward": 0.9977679073810577, "reward_std": 0.4272202838212252, "rewards/accuracy_reward": 0.701339316368103, "rewards/format_reward": 0.2964285858441144, "step": 50 }, { "clip_ratio": 0.0, "completion_length": 584.9955612182617, "epoch": 0.4690831556503198, "grad_norm": 1.62312912940979, "kl": 0.04905548095703125, "learning_rate": 9.991093100466482e-07, "loss": 0.052, "reward": 1.2584822058677674, "reward_std": 0.4769852660596371, "rewards/accuracy_reward": 0.7388393208384514, "rewards/format_reward": 0.5196428842842579, "step": 55 }, { "clip_ratio": 0.0, "completion_length": 619.11029586792, "epoch": 0.511727078891258, "grad_norm": 0.35674989223480225, "kl": 0.02668609619140625, "learning_rate": 9.976491676662678e-07, "loss": 0.0348, "reward": 1.3142857804894448, "reward_std": 0.4189721491187811, "rewards/accuracy_reward": 0.7379464611411095, "rewards/format_reward": 0.5763393118977547, "step": 60 }, { "clip_ratio": 0.0, "completion_length": 606.5446723937988, "epoch": 0.5543710021321961, "grad_norm": 0.7614251375198364, "kl": 0.1965301513671875, "learning_rate": 9.95496320064109e-07, "loss": 0.0326, "reward": 1.4531250655651093, "reward_std": 0.39104298427700995, "rewards/accuracy_reward": 0.7379464611411095, "rewards/format_reward": 0.7151786029338837, "step": 65 }, { "clip_ratio": 0.0, "completion_length": 619.4513656616211, "epoch": 0.5970149253731343, "grad_norm": 0.5526299476623535, "kl": 0.0309814453125, "learning_rate": 9.926537639070456e-07, "loss": 0.0332, "reward": 1.5433036401867866, "reward_std": 0.345558512583375, "rewards/accuracy_reward": 0.7343750335276127, "rewards/format_reward": 0.8089286044239998, "step": 70 }, { "clip_ratio": 0.0, "completion_length": 601.5808296203613, "epoch": 0.6396588486140725, "grad_norm": 0.3345666825771332, "kl": 0.02752227783203125, "learning_rate": 9.891254559051884e-07, "loss": 0.0323, "reward": 1.6361607968807221, "reward_std": 0.3074555268511176, "rewards/accuracy_reward": 0.764285746216774, "rewards/format_reward": 0.8718750342726708, "step": 75 }, { "clip_ratio": 0.0, "completion_length": 611.288419342041, "epoch": 0.6823027718550106, "grad_norm": 0.35215917229652405, "kl": 0.02867279052734375, "learning_rate": 9.849163073043223e-07, "loss": 0.0428, "reward": 1.6526786476373672, "reward_std": 0.28676611334085467, "rewards/accuracy_reward": 0.7589286133646965, "rewards/format_reward": 0.8937500417232513, "step": 80 }, { "clip_ratio": 0.0, "completion_length": 626.8102943420411, "epoch": 0.7249466950959488, "grad_norm": 0.3329945206642151, "kl": 0.01969757080078125, "learning_rate": 9.800321770496724e-07, "loss": 0.028, "reward": 1.6875000774860383, "reward_std": 0.2533166547305882, "rewards/accuracy_reward": 0.7848214626312255, "rewards/format_reward": 0.9026786088943481, "step": 85 }, { "clip_ratio": 0.0, "completion_length": 606.9535957336426, "epoch": 0.767590618336887, "grad_norm": 0.29067689180374146, "kl": 0.0251251220703125, "learning_rate": 9.744798636305187e-07, "loss": 0.024, "reward": 1.662500074505806, "reward_std": 0.2734585601836443, "rewards/accuracy_reward": 0.7611607506871223, "rewards/format_reward": 0.9013393223285675, "step": 90 }, { "clip_ratio": 0.0, "completion_length": 608.546459197998, "epoch": 0.8102345415778252, "grad_norm": 0.33922263979911804, "kl": 0.02018585205078125, "learning_rate": 9.68267095617003e-07, "loss": 0.0242, "reward": 1.6745536416769027, "reward_std": 0.24658216908574104, "rewards/accuracy_reward": 0.7522321805357933, "rewards/format_reward": 0.9223214715719223, "step": 95 }, { "clip_ratio": 0.0, "completion_length": 609.9174369812011, "epoch": 0.8528784648187633, "grad_norm": 0.5688201785087585, "kl": 0.02324676513671875, "learning_rate": 9.614025209023083e-07, "loss": 0.0296, "reward": 1.6991072326898575, "reward_std": 0.24431310119107366, "rewards/accuracy_reward": 0.781250037252903, "rewards/format_reward": 0.9178571790456772, "step": 100 }, { "clip_ratio": 0.0, "completion_length": 589.7357414245605, "epoch": 0.8955223880597015, "grad_norm": 0.3856689929962158, "kl": 0.02101287841796875, "learning_rate": 9.538956946651815e-07, "loss": 0.0288, "reward": 1.7343750864267349, "reward_std": 0.2345518351532519, "rewards/accuracy_reward": 0.8107143267989159, "rewards/format_reward": 0.9236607521772384, "step": 105 }, { "clip_ratio": 0.0, "completion_length": 600.2009185791015, "epoch": 0.9381663113006397, "grad_norm": 0.3187831938266754, "kl": 0.02591400146484375, "learning_rate": 9.457570660695539e-07, "loss": 0.0116, "reward": 1.734821507334709, "reward_std": 0.22256441051140427, "rewards/accuracy_reward": 0.8138393193483353, "rewards/format_reward": 0.9209821775555611, "step": 110 }, { "clip_ratio": 0.0, "completion_length": 577.4808288574219, "epoch": 0.9808102345415778, "grad_norm": 0.19666177034378052, "kl": 0.0201080322265625, "learning_rate": 9.369979637197774e-07, "loss": 0.0232, "reward": 1.7446429431438446, "reward_std": 0.2030067172832787, "rewards/accuracy_reward": 0.7982143253087998, "rewards/format_reward": 0.9464286059141159, "step": 115 }, { "epoch": 0.9893390191897654, "eval_clip_ratio": 0.0, "eval_completion_length": 591.1813688732329, "eval_kl": 0.027180020771329364, "eval_loss": 0.00365327182225883, "eval_reward": 1.684807332735213, "eval_reward_std": 0.22303236411913993, "eval_rewards/accuracy_reward": 0.7500000307484279, "eval_rewards/format_reward": 0.9348072892143613, "eval_runtime": 686.4197, "eval_samples_per_second": 0.728, "eval_steps_per_second": 0.013, "step": 116 }, { "clip_ratio": 0.0, "completion_length": 598.285604095459, "epoch": 1.0255863539445629, "grad_norm": 0.17957638204097748, "kl": 0.0184295654296875, "learning_rate": 9.276305798917158e-07, "loss": 0.0077, "reward": 1.723214367032051, "reward_std": 0.22227218970656396, "rewards/accuracy_reward": 0.7964286059141159, "rewards/format_reward": 0.9267857506871223, "step": 120 }, { "clip_ratio": 0.0, "completion_length": 582.7888671875, "epoch": 1.068230277185501, "grad_norm": 0.35526904463768005, "kl": 0.0201629638671875, "learning_rate": 9.176679535616476e-07, "loss": 0.0216, "reward": 1.751785784959793, "reward_std": 0.2007270947098732, "rewards/accuracy_reward": 0.8142857477068901, "rewards/format_reward": 0.9375000387430191, "step": 125 }, { "clip_ratio": 0.0, "completion_length": 572.4165481567383, "epoch": 1.1108742004264391, "grad_norm": 0.44200047850608826, "kl": 0.362432861328125, "learning_rate": 9.071239522565976e-07, "loss": 0.021, "reward": 1.732142946124077, "reward_std": 0.21233755089342593, "rewards/accuracy_reward": 0.8000000350177288, "rewards/format_reward": 0.9321428954601287, "step": 130 }, { "clip_ratio": 0.0, "completion_length": 551.3214492797852, "epoch": 1.1535181236673775, "grad_norm": 0.1597866415977478, "kl": 0.0220245361328125, "learning_rate": 8.960132527513642e-07, "loss": 0.0171, "reward": 1.7830357879400254, "reward_std": 0.1866126311942935, "rewards/accuracy_reward": 0.8316964685916901, "rewards/format_reward": 0.9513393223285675, "step": 135 }, { "clip_ratio": 0.0, "completion_length": 581.4509185791015, "epoch": 1.1961620469083156, "grad_norm": 0.24776776134967804, "kl": 0.0202850341796875, "learning_rate": 8.8435132063911e-07, "loss": 0.0073, "reward": 1.7156250864267348, "reward_std": 0.18961388804018497, "rewards/accuracy_reward": 0.77857146859169, "rewards/format_reward": 0.9370535984635353, "step": 140 }, { "clip_ratio": 0.0, "completion_length": 600.3785957336426, "epoch": 1.2388059701492538, "grad_norm": 0.38918188214302063, "kl": 0.0415618896484375, "learning_rate": 8.721543888039532e-07, "loss": 0.0098, "reward": 1.7325893640518188, "reward_std": 0.20605442952364683, "rewards/accuracy_reward": 0.7897321820259094, "rewards/format_reward": 0.942857176065445, "step": 145 }, { "clip_ratio": 0.0, "completion_length": 586.5723472595215, "epoch": 1.2814498933901919, "grad_norm": 0.39274245500564575, "kl": 0.0493682861328125, "learning_rate": 8.594394348255237e-07, "loss": 0.0246, "reward": 1.7558036506175996, "reward_std": 0.2199950136244297, "rewards/accuracy_reward": 0.8058036044239998, "rewards/format_reward": 0.9500000298023223, "step": 150 }, { "clip_ratio": 0.0, "completion_length": 564.6044891357421, "epoch": 1.32409381663113, "grad_norm": 0.7922531962394714, "kl": 0.0613037109375, "learning_rate": 8.462241573469377e-07, "loss": 0.0263, "reward": 1.7375000715255737, "reward_std": 0.17584939412772654, "rewards/accuracy_reward": 0.8004464656114578, "rewards/format_reward": 0.9370536029338836, "step": 155 }, { "clip_ratio": 0.0, "completion_length": 575.3759223937989, "epoch": 1.3667377398720681, "grad_norm": 10.02066707611084, "kl": 0.46389617919921877, "learning_rate": 8.325269514390834e-07, "loss": 0.0185, "reward": 1.7901786595582962, "reward_std": 0.17941874554380774, "rewards/accuracy_reward": 0.8223214641213417, "rewards/format_reward": 0.9678571701049805, "step": 160 }, { "clip_ratio": 0.0, "completion_length": 572.3335052490235, "epoch": 1.4093816631130065, "grad_norm": 0.1462014764547348, "kl": 0.05237274169921875, "learning_rate": 8.183668829955111e-07, "loss": 0.0162, "reward": 1.7723215103149415, "reward_std": 0.1762597480788827, "rewards/accuracy_reward": 0.816071467101574, "rewards/format_reward": 0.9562500298023224, "step": 165 }, { "clip_ratio": 0.0, "completion_length": 575.169223022461, "epoch": 1.4520255863539446, "grad_norm": 1.8751811981201172, "kl": 0.18811492919921874, "learning_rate": 8.037636621935684e-07, "loss": 0.0151, "reward": 1.7419643700122833, "reward_std": 0.1959962229244411, "rewards/accuracy_reward": 0.7982143238186836, "rewards/format_reward": 0.9437500312924385, "step": 170 }, { "clip_ratio": 0.0, "completion_length": 588.2317245483398, "epoch": 1.4946695095948828, "grad_norm": 0.17714911699295044, "kl": 0.132525634765625, "learning_rate": 7.887376160587213e-07, "loss": 0.0172, "reward": 1.7156250804662705, "reward_std": 0.19640195239335298, "rewards/accuracy_reward": 0.7674107566475868, "rewards/format_reward": 0.948214316368103, "step": 175 }, { "clip_ratio": 0.0, "completion_length": 559.4638603210449, "epoch": 1.537313432835821, "grad_norm": 0.21344700455665588, "kl": 0.0337677001953125, "learning_rate": 7.733096601702507e-07, "loss": 0.0098, "reward": 1.788839367032051, "reward_std": 0.1706329697743058, "rewards/accuracy_reward": 0.8178571820259094, "rewards/format_reward": 0.9709821701049804, "step": 180 }, { "clip_ratio": 0.0, "completion_length": 574.9826164245605, "epoch": 1.579957356076759, "grad_norm": 0.3246748745441437, "kl": 0.0662689208984375, "learning_rate": 7.575012695477076e-07, "loss": 0.0171, "reward": 1.764285796880722, "reward_std": 0.18098030481487512, "rewards/accuracy_reward": 0.8044643223285675, "rewards/format_reward": 0.9598214566707611, "step": 185 }, { "clip_ratio": 0.0, "completion_length": 591.3893157958985, "epoch": 1.6226012793176974, "grad_norm": 0.47960391640663147, "kl": 0.056695556640625, "learning_rate": 7.413344487586542e-07, "loss": 0.0212, "reward": 1.7665179401636124, "reward_std": 0.22513661198318005, "rewards/accuracy_reward": 0.8071428954601287, "rewards/format_reward": 0.9593750357627868, "step": 190 }, { "clip_ratio": 0.0, "completion_length": 576.8236846923828, "epoch": 1.6652452025586353, "grad_norm": 0.9285232424736023, "kl": 0.2149566650390625, "learning_rate": 7.248317012892968e-07, "loss": 0.0264, "reward": 1.7383929401636125, "reward_std": 0.21417219610884786, "rewards/accuracy_reward": 0.7852678969502449, "rewards/format_reward": 0.9531250342726707, "step": 195 }, { "clip_ratio": 0.0, "completion_length": 569.6236854553223, "epoch": 1.7078891257995736, "grad_norm": 19.19864845275879, "kl": 0.493951416015625, "learning_rate": 7.08015998220647e-07, "loss": 0.025, "reward": 1.7593750774860382, "reward_std": 0.1970167408697307, "rewards/accuracy_reward": 0.7933036029338837, "rewards/format_reward": 0.9660714626312256, "step": 200 }, { "clip_ratio": 0.0, "completion_length": 579.9201164245605, "epoch": 1.7505330490405118, "grad_norm": 8.568023681640625, "kl": 0.36058349609375, "learning_rate": 6.909107462538111e-07, "loss": 0.0273, "reward": 1.7156250894069671, "reward_std": 0.2314098752103746, "rewards/accuracy_reward": 0.762946467101574, "rewards/format_reward": 0.9526785984635353, "step": 205 }, { "clip_ratio": 0.0, "completion_length": 585.3098495483398, "epoch": 1.79317697228145, "grad_norm": 1.4387701749801636, "kl": 0.8639892578125, "learning_rate": 6.735397551289178e-07, "loss": 0.0267, "reward": 1.6991072207689286, "reward_std": 0.25122642405331136, "rewards/accuracy_reward": 0.740625037252903, "rewards/format_reward": 0.9584821745753288, "step": 210 }, { "clip_ratio": 0.0, "completion_length": 573.5219017028809, "epoch": 1.835820895522388, "grad_norm": 8.508292198181152, "kl": 1.6532470703125, "learning_rate": 6.559272044830316e-07, "loss": 0.0335, "reward": 1.7223215013742448, "reward_std": 0.23952382281422616, "rewards/accuracy_reward": 0.7700893215835094, "rewards/format_reward": 0.9522321805357933, "step": 215 }, { "clip_ratio": 0.0, "completion_length": 577.850471496582, "epoch": 1.8784648187633262, "grad_norm": 3.9072465896606445, "kl": 0.9760009765625, "learning_rate": 6.380976101931879e-07, "loss": 0.0349, "reward": 1.6732143580913543, "reward_std": 0.2966056760400534, "rewards/accuracy_reward": 0.7321428880095482, "rewards/format_reward": 0.9410714641213417, "step": 220 }, { "clip_ratio": 0.0, "completion_length": 571.9951133728027, "epoch": 1.9211087420042645, "grad_norm": 13.385315895080566, "kl": 2.62431640625, "learning_rate": 6.200757902513962e-07, "loss": 0.0609, "reward": 1.6495536506175994, "reward_std": 0.29212585240602496, "rewards/accuracy_reward": 0.7040178939700127, "rewards/format_reward": 0.9455357491970062, "step": 225 }, { "clip_ratio": 0.0, "completion_length": 602.8174324035645, "epoch": 1.9637526652452024, "grad_norm": 3.510239362716675, "kl": 1.172021484375, "learning_rate": 6.018868302191139e-07, "loss": 0.0426, "reward": 1.5660715043544768, "reward_std": 0.343078551068902, "rewards/accuracy_reward": 0.6276786014437675, "rewards/format_reward": 0.9383928880095482, "step": 230 }, { "epoch": 1.9808102345415777, "eval_clip_ratio": 0.0, "eval_completion_length": 590.3601776607453, "eval_kl": 2.955357142857143, "eval_loss": 0.06222715228796005, "eval_reward": 1.3279479032471067, "eval_reward_std": 0.4392576685973576, "eval_rewards/accuracy_reward": 0.46938777679488775, "eval_rewards/format_reward": 0.858560131655799, "eval_runtime": 674.3732, "eval_samples_per_second": 0.741, "eval_steps_per_second": 0.013, "step": 232 }, { "clip_ratio": 0.0, "completion_length": 605.6790473937988, "epoch": 2.008528784648188, "grad_norm": 6.303433418273926, "kl": 3.3275390625, "learning_rate": 5.835560483092742e-07, "loss": 0.0882, "reward": 1.3517857775092126, "reward_std": 0.4619227208197117, "rewards/accuracy_reward": 0.5165178820490837, "rewards/format_reward": 0.8352678924798965, "step": 235 }, { "clip_ratio": 0.0, "completion_length": 588.4384216308594, "epoch": 2.0511727078891258, "grad_norm": 20.91950225830078, "kl": 7.1640625, "learning_rate": 5.651089601444752e-07, "loss": 0.1247, "reward": 1.1812500566244126, "reward_std": 0.5094705298542976, "rewards/accuracy_reward": 0.43125002160668374, "rewards/format_reward": 0.7500000283122062, "step": 240 }, { "clip_ratio": 0.0, "completion_length": 582.0741310119629, "epoch": 2.093816631130064, "grad_norm": 13.203470230102539, "kl": 6.3, "learning_rate": 5.465712432403811e-07, "loss": 0.1256, "reward": 1.2410714894533157, "reward_std": 0.5110540725290775, "rewards/accuracy_reward": 0.46830358877778056, "rewards/format_reward": 0.7727678924798965, "step": 245 }, { "clip_ratio": 0.0, "completion_length": 590.8152046203613, "epoch": 2.136460554371002, "grad_norm": 88.20843505859375, "kl": 11.7703125, "learning_rate": 5.279687012637798e-07, "loss": 0.2042, "reward": 1.3339286342263221, "reward_std": 0.5208067961037159, "rewards/accuracy_reward": 0.5129464477300644, "rewards/format_reward": 0.8209821820259094, "step": 250 }, { "clip_ratio": 0.0, "completion_length": 582.5169929504394, "epoch": 2.1791044776119404, "grad_norm": 19.759809494018555, "kl": 4.6005859375, "learning_rate": 5.093272281150382e-07, "loss": 0.0949, "reward": 1.3361607685685157, "reward_std": 0.5283136948943138, "rewards/accuracy_reward": 0.5258928835391998, "rewards/format_reward": 0.8102678999304771, "step": 255 }, { "clip_ratio": 0.0, "completion_length": 574.399136352539, "epoch": 2.2217484008528783, "grad_norm": 73.8835678100586, "kl": 13.4765625, "learning_rate": 4.906727718849618e-07, "loss": 0.209, "reward": 1.168750050663948, "reward_std": 0.5519715771079063, "rewards/accuracy_reward": 0.4321428779512644, "rewards/format_reward": 0.7366071745753289, "step": 260 }, { "clip_ratio": 0.0, "completion_length": 569.409400177002, "epoch": 2.2643923240938166, "grad_norm": 16.285621643066406, "kl": 15.2921875, "learning_rate": 4.7203129873622036e-07, "loss": 0.2319, "reward": 1.1008929148316384, "reward_std": 0.5826808042824269, "rewards/accuracy_reward": 0.39241073541343213, "rewards/format_reward": 0.7084821775555611, "step": 265 }, { "clip_ratio": 0.0, "completion_length": 563.786190032959, "epoch": 2.307036247334755, "grad_norm": 5.511695861816406, "kl": 3.4419921875, "learning_rate": 4.534287567596188e-07, "loss": 0.0542, "reward": 1.3165179178118707, "reward_std": 0.49827431738376615, "rewards/accuracy_reward": 0.5075893081724644, "rewards/format_reward": 0.8089286148548126, "step": 270 }, { "clip_ratio": 0.0, "completion_length": 558.287523651123, "epoch": 2.349680170575693, "grad_norm": 7.120125770568848, "kl": 5.0576171875, "learning_rate": 4.348910398555249e-07, "loss": 0.0723, "reward": 1.3750000685453414, "reward_std": 0.5270605705678463, "rewards/accuracy_reward": 0.5455357402563095, "rewards/format_reward": 0.8294643267989159, "step": 275 }, { "clip_ratio": 0.0, "completion_length": 573.1196708679199, "epoch": 2.3923240938166312, "grad_norm": 9.884531021118164, "kl": 10.0318359375, "learning_rate": 4.1644395169072575e-07, "loss": 0.1729, "reward": 1.2901786297559739, "reward_std": 0.5422291226685048, "rewards/accuracy_reward": 0.48660716265439985, "rewards/format_reward": 0.8035714611411094, "step": 280 }, { "clip_ratio": 0.0, "completion_length": 557.1683288574219, "epoch": 2.434968017057569, "grad_norm": 8.612386703491211, "kl": 4.2802734375, "learning_rate": 3.9811316978088615e-07, "loss": 0.0639, "reward": 1.4187500596046447, "reward_std": 0.4759579010307789, "rewards/accuracy_reward": 0.5776785999536515, "rewards/format_reward": 0.8410714656114578, "step": 285 }, { "clip_ratio": 0.0, "completion_length": 546.3219017028808, "epoch": 2.4776119402985075, "grad_norm": 31.60814094543457, "kl": 9.052734375, "learning_rate": 3.799242097486038e-07, "loss": 0.1517, "reward": 1.3687500596046447, "reward_std": 0.5219749353826046, "rewards/accuracy_reward": 0.5669643111526966, "rewards/format_reward": 0.8017857521772385, "step": 290 }, { "clip_ratio": 0.0, "completion_length": 548.1094017028809, "epoch": 2.520255863539446, "grad_norm": 59.2335090637207, "kl": 10.06796875, "learning_rate": 3.619023898068123e-07, "loss": 0.1374, "reward": 1.2995536252856255, "reward_std": 0.51812051422894, "rewards/accuracy_reward": 0.5620535910129547, "rewards/format_reward": 0.7375000342726707, "step": 295 }, { "clip_ratio": 0.0, "completion_length": 558.5031517028808, "epoch": 2.5628997867803838, "grad_norm": 31.365585327148438, "kl": 3.1970703125, "learning_rate": 3.4407279551696846e-07, "loss": 0.0461, "reward": 1.2352679088711738, "reward_std": 0.531840232014656, "rewards/accuracy_reward": 0.5383928880095482, "rewards/format_reward": 0.6968750298023224, "step": 300 }, { "clip_ratio": 0.0, "completion_length": 548.0366317749024, "epoch": 2.605543710021322, "grad_norm": 13.923192977905273, "kl": 6.5228515625, "learning_rate": 3.2646024487108213e-07, "loss": 0.0853, "reward": 1.2236607685685157, "reward_std": 0.5477135334163904, "rewards/accuracy_reward": 0.5044643051922322, "rewards/format_reward": 0.7191964611411095, "step": 305 }, { "clip_ratio": 0.0, "completion_length": 517.9973487854004, "epoch": 2.64818763326226, "grad_norm": 11.7457914352417, "kl": 11.935546875, "learning_rate": 3.0908925374618887e-07, "loss": 0.1622, "reward": 1.2687500640749931, "reward_std": 0.5634565785527229, "rewards/accuracy_reward": 0.5196428790688514, "rewards/format_reward": 0.7491071805357933, "step": 310 }, { "clip_ratio": 0.0, "completion_length": 542.1607376098633, "epoch": 2.6908315565031984, "grad_norm": 10.802907943725586, "kl": 7.883203125, "learning_rate": 2.91984001779353e-07, "loss": 0.1125, "reward": 1.2767857626080512, "reward_std": 0.5758342906832695, "rewards/accuracy_reward": 0.5187500260770321, "rewards/format_reward": 0.7580357491970062, "step": 315 }, { "clip_ratio": 0.0, "completion_length": 536.2406478881836, "epoch": 2.7334754797441363, "grad_norm": 9.723703384399414, "kl": 6.530859375, "learning_rate": 2.751682987107029e-07, "loss": 0.0812, "reward": 1.3321429282426833, "reward_std": 0.526002112776041, "rewards/accuracy_reward": 0.5491071671247483, "rewards/format_reward": 0.7830357521772384, "step": 320 }, { "clip_ratio": 0.0, "completion_length": 521.1303764343262, "epoch": 2.7761194029850746, "grad_norm": 18.712772369384766, "kl": 10.8078125, "learning_rate": 2.5866555124134577e-07, "loss": 0.1505, "reward": 1.2674107685685159, "reward_std": 0.5784162662923336, "rewards/accuracy_reward": 0.5366071693599224, "rewards/format_reward": 0.730803607404232, "step": 325 }, { "clip_ratio": 0.0, "completion_length": 535.4544891357422, "epoch": 2.818763326226013, "grad_norm": 7.585498809814453, "kl": 9.569921875, "learning_rate": 2.424987304522924e-07, "loss": 0.1261, "reward": 1.19464291036129, "reward_std": 0.5549623288214207, "rewards/accuracy_reward": 0.5209821693599224, "rewards/format_reward": 0.6736607439815998, "step": 330 }, { "clip_ratio": 0.0, "completion_length": 528.2076141357422, "epoch": 2.861407249466951, "grad_norm": 9.224991798400879, "kl": 6.6015625, "learning_rate": 2.2669033982974944e-07, "loss": 0.0755, "reward": 1.228125052154064, "reward_std": 0.5089043751358986, "rewards/accuracy_reward": 0.550000024586916, "rewards/format_reward": 0.6781250283122062, "step": 335 }, { "clip_ratio": 0.0, "completion_length": 530.958950805664, "epoch": 2.9040511727078893, "grad_norm": 24.710325241088867, "kl": 7.4828125, "learning_rate": 2.1126238394127867e-07, "loss": 0.114, "reward": 1.2035714864730835, "reward_std": 0.5295904573053122, "rewards/accuracy_reward": 0.5526785984635353, "rewards/format_reward": 0.6508928887546063, "step": 340 }, { "clip_ratio": 0.0, "completion_length": 529.0553779602051, "epoch": 2.946695095948827, "grad_norm": 36.00743865966797, "kl": 11.071875, "learning_rate": 1.9623633780643155e-07, "loss": 0.188, "reward": 1.220535770058632, "reward_std": 0.5231191631406545, "rewards/accuracy_reward": 0.5361607357859611, "rewards/format_reward": 0.6843750298023223, "step": 345 }, { "epoch": 2.9722814498933903, "eval_clip_ratio": 0.0, "eval_completion_length": 531.799803234282, "eval_kl": 9.749503968253968, "eval_loss": 0.13944962620735168, "eval_reward": 1.1026077540147872, "eval_reward_std": 0.4790610531492839, "eval_rewards/accuracy_reward": 0.4600340352644996, "eval_rewards/format_reward": 0.642573726082605, "eval_runtime": 734.1918, "eval_samples_per_second": 0.681, "eval_steps_per_second": 0.012, "step": 348 }, { "clip_ratio": 0.0, "completion_length": 531.3727928161621, "epoch": 2.9893390191897655, "grad_norm": 17.88933753967285, "kl": 10.34296875, "learning_rate": 1.8163311700448898e-07, "loss": 0.1236, "reward": 1.1388393327593804, "reward_std": 0.4919752091169357, "rewards/accuracy_reward": 0.49821431189775467, "rewards/format_reward": 0.6406250275671482, "step": 350 }, { "clip_ratio": 0.0, "completion_length": 524.6839851379394, "epoch": 3.0341151385927505, "grad_norm": 8.39860725402832, "kl": 7.53515625, "learning_rate": 1.674730485609166e-07, "loss": 0.099, "reward": 1.140625049173832, "reward_std": 0.5006550896912814, "rewards/accuracy_reward": 0.5415178820490837, "rewards/format_reward": 0.5991071693599224, "step": 355 }, { "clip_ratio": 0.0, "completion_length": 531.0768104553223, "epoch": 3.076759061833689, "grad_norm": 18.39265251159668, "kl": 8.1181640625, "learning_rate": 1.537758426530622e-07, "loss": 0.1106, "reward": 1.1013393431901932, "reward_std": 0.4775242738425732, "rewards/accuracy_reward": 0.5392857365310192, "rewards/format_reward": 0.5620535992085933, "step": 360 }, { "clip_ratio": 0.0, "completion_length": 534.2736846923829, "epoch": 3.1194029850746268, "grad_norm": 11.617506980895996, "kl": 7.2947265625, "learning_rate": 1.4056056517447634e-07, "loss": 0.0915, "reward": 1.0933036252856254, "reward_std": 0.4881337985396385, "rewards/accuracy_reward": 0.532589315623045, "rewards/format_reward": 0.5607143111526967, "step": 365 }, { "clip_ratio": 0.0, "completion_length": 538.0062713623047, "epoch": 3.162046908315565, "grad_norm": 11.465629577636719, "kl": 7.7173828125, "learning_rate": 1.2784561119604682e-07, "loss": 0.0985, "reward": 1.10089291036129, "reward_std": 0.4965208202600479, "rewards/accuracy_reward": 0.5200893104076385, "rewards/format_reward": 0.5808035977184772, "step": 370 }, { "clip_ratio": 0.0, "completion_length": 531.9433242797852, "epoch": 3.204690831556503, "grad_norm": 23.9652156829834, "kl": 9.834765625, "learning_rate": 1.156486793608899e-07, "loss": 0.1229, "reward": 1.101339338719845, "reward_std": 0.452479437738657, "rewards/accuracy_reward": 0.5160714529454709, "rewards/format_reward": 0.5852678865194321, "step": 375 }, { "clip_ratio": 0.0, "completion_length": 534.8468994140625, "epoch": 3.2473347547974414, "grad_norm": 18.34585189819336, "kl": 10.09140625, "learning_rate": 1.0398674724863581e-07, "loss": 0.1464, "reward": 1.1111607685685159, "reward_std": 0.5048069790005684, "rewards/accuracy_reward": 0.5276785962283611, "rewards/format_reward": 0.5834821693599224, "step": 380 }, { "clip_ratio": 0.0, "completion_length": 536.6924369812011, "epoch": 3.2899786780383797, "grad_norm": 14.767237663269043, "kl": 9.25234375, "learning_rate": 9.287604774340235e-08, "loss": 0.1232, "reward": 1.101339329779148, "reward_std": 0.49512304849922656, "rewards/accuracy_reward": 0.5138393111526967, "rewards/format_reward": 0.5875000298023224, "step": 385 }, { "clip_ratio": 0.0, "completion_length": 532.2884185791015, "epoch": 3.3326226012793176, "grad_norm": 9.985774993896484, "kl": 7.86640625, "learning_rate": 8.233204643835234e-08, "loss": 0.1138, "reward": 1.075446480512619, "reward_std": 0.46411947570741174, "rewards/accuracy_reward": 0.5026785962283611, "rewards/format_reward": 0.5727678835391998, "step": 390 }, { "clip_ratio": 0.0, "completion_length": 527.2410995483399, "epoch": 3.375266524520256, "grad_norm": 10.612527847290039, "kl": 8.09375, "learning_rate": 7.236942010828429e-08, "loss": 0.0785, "reward": 1.1446428999304772, "reward_std": 0.48770338781177996, "rewards/accuracy_reward": 0.5575893074274063, "rewards/format_reward": 0.5870535977184772, "step": 395 }, { "clip_ratio": 0.0, "completion_length": 519.1937744140625, "epoch": 3.417910447761194, "grad_norm": 14.665472984313965, "kl": 11.00234375, "learning_rate": 6.300203628022271e-08, "loss": 0.152, "reward": 1.1830357760190964, "reward_std": 0.5015905275940895, "rewards/accuracy_reward": 0.5790178842842579, "rewards/format_reward": 0.6040178872644901, "step": 400 }, { "clip_ratio": 0.0, "completion_length": 531.4053771972656, "epoch": 3.4605543710021323, "grad_norm": 8.385228157043457, "kl": 9.28125, "learning_rate": 5.42429339304461e-08, "loss": 0.1379, "reward": 1.1437500521540642, "reward_std": 0.46195379123091695, "rewards/accuracy_reward": 0.5531250216066838, "rewards/format_reward": 0.5906250216066837, "step": 405 }, { "clip_ratio": 0.0, "completion_length": 531.0544860839843, "epoch": 3.50319829424307, "grad_norm": 18.6485652923584, "kl": 8.909765625, "learning_rate": 4.610430533481857e-08, "loss": 0.1119, "reward": 1.1084821969270706, "reward_std": 0.4929712563753128, "rewards/accuracy_reward": 0.5491071715950966, "rewards/format_reward": 0.5593750216066837, "step": 410 }, { "clip_ratio": 0.0, "completion_length": 533.8643081665039, "epoch": 3.5458422174840085, "grad_norm": 84.65238189697266, "kl": 8.594140625, "learning_rate": 3.859747909769162e-08, "loss": 0.1078, "reward": 1.0660714849829673, "reward_std": 0.473931773006916, "rewards/accuracy_reward": 0.5361607417464256, "rewards/format_reward": 0.5299107395112514, "step": 415 }, { "clip_ratio": 0.0, "completion_length": 536.6745796203613, "epoch": 3.588486140724947, "grad_norm": 19.54568862915039, "kl": 9.418359375, "learning_rate": 3.173290438299697e-08, "loss": 0.1327, "reward": 1.0656250432133674, "reward_std": 0.4773729760199785, "rewards/accuracy_reward": 0.5245535988360643, "rewards/format_reward": 0.5410714514553547, "step": 420 }, { "clip_ratio": 0.0, "completion_length": 517.5031486511231, "epoch": 3.631130063965885, "grad_norm": 22.7406005859375, "kl": 8.5650390625, "learning_rate": 2.5520136369481194e-08, "loss": 0.1112, "reward": 1.1593750432133674, "reward_std": 0.45310505069792273, "rewards/accuracy_reward": 0.5647321693599224, "rewards/format_reward": 0.5946428835391998, "step": 425 }, { "clip_ratio": 0.0, "completion_length": 513.7027023315429, "epoch": 3.673773987206823, "grad_norm": 25.179290771484375, "kl": 8.64296875, "learning_rate": 1.996782295032745e-08, "loss": 0.1274, "reward": 1.1517857566475869, "reward_std": 0.4889927223324776, "rewards/accuracy_reward": 0.5678571730852127, "rewards/format_reward": 0.583928594738245, "step": 430 }, { "clip_ratio": 0.0, "completion_length": 527.8647552490235, "epoch": 3.716417910447761, "grad_norm": 14.159469604492188, "kl": 7.8265625, "learning_rate": 1.508369269567783e-08, "loss": 0.1046, "reward": 1.1281250417232513, "reward_std": 0.5101183526217937, "rewards/accuracy_reward": 0.546428595483303, "rewards/format_reward": 0.5816964566707611, "step": 435 }, { "clip_ratio": 0.0, "completion_length": 523.8451080322266, "epoch": 3.7590618336886994, "grad_norm": 18.611404418945312, "kl": 9.108984375, "learning_rate": 1.0874544094811422e-08, "loss": 0.1173, "reward": 1.0482143327593803, "reward_std": 0.45587412640452385, "rewards/accuracy_reward": 0.5294643141329288, "rewards/format_reward": 0.518750024214387, "step": 440 }, { "clip_ratio": 0.0, "completion_length": 528.406273651123, "epoch": 3.8017057569296373, "grad_norm": 9.024343490600586, "kl": 8.72421875, "learning_rate": 7.346236092954316e-09, "loss": 0.103, "reward": 1.0665179088711738, "reward_std": 0.47449378967285155, "rewards/accuracy_reward": 0.5200893059372902, "rewards/format_reward": 0.5464285977184773, "step": 445 }, { "clip_ratio": 0.0, "completion_length": 522.9732391357422, "epoch": 3.8443496801705757, "grad_norm": 15.007638931274414, "kl": 7.5888671875, "learning_rate": 4.50367993589107e-09, "loss": 0.1024, "reward": 1.1848214849829675, "reward_std": 0.4590866263955832, "rewards/accuracy_reward": 0.5825893096625805, "rewards/format_reward": 0.6022321686148644, "step": 450 }, { "clip_ratio": 0.0, "completion_length": 539.4236862182618, "epoch": 3.886993603411514, "grad_norm": 11.34084701538086, "kl": 8.90546875, "learning_rate": 2.3508323337321224e-09, "loss": 0.1158, "reward": 1.0486607655882836, "reward_std": 0.4737320654094219, "rewards/accuracy_reward": 0.5111607365310192, "rewards/format_reward": 0.5375000245869159, "step": 455 }, { "clip_ratio": 0.0, "completion_length": 534.0035942077636, "epoch": 3.929637526652452, "grad_norm": 18.918825149536133, "kl": 7.205078125, "learning_rate": 8.906899533517864e-10, "loss": 0.0902, "reward": 1.1607143417000771, "reward_std": 0.46907868683338166, "rewards/accuracy_reward": 0.5776785992085933, "rewards/format_reward": 0.5830357417464256, "step": 460 }, { "epoch": 3.9637526652452024, "eval_clip_ratio": 0.0, "eval_completion_length": 529.8555946955605, "eval_kl": 8.018105158730158, "eval_loss": 0.10982762277126312, "eval_reward": 1.0456349707785106, "eval_reward_std": 0.43395746865915874, "eval_rewards/accuracy_reward": 0.48384356072970797, "eval_rewards/format_reward": 0.561791407683539, "eval_runtime": 659.4598, "eval_samples_per_second": 0.758, "eval_steps_per_second": 0.014, "step": 464 }, { "clip_ratio": 0.0, "completion_length": 526.7964500427246, "epoch": 3.9722814498933903, "grad_norm": 13.803497314453125, "kl": 7.684765625, "learning_rate": 1.252852471625987e-10, "loss": 0.0773, "reward": 1.129464340209961, "reward_std": 0.445505191385746, "rewards/accuracy_reward": 0.5602678842842579, "rewards/format_reward": 0.5691964529454708, "step": 465 }, { "clip_ratio": 0.0, "completion_length": 529.8702189127604, "epoch": 3.997867803837953, "kl": 8.166666666666666, "reward": 1.1056548183163006, "reward_std": 0.48472560321291286, "rewards/accuracy_reward": 0.5610119315485159, "rewards/format_reward": 0.5446428805589676, "step": 468, "total_flos": 0.0, "train_loss": 0.07395310898940279, "train_runtime": 53156.2352, "train_samples_per_second": 0.564, "train_steps_per_second": 0.009 } ], "logging_steps": 5, "max_steps": 468, "num_input_tokens_seen": 0, "num_train_epochs": 4, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }