| { | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 3.997867803837953, | |
| "eval_steps": 116, | |
| "global_step": 468, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 625.9241256713867, | |
| "epoch": 0.008528784648187633, | |
| "grad_norm": 0.20551569759845734, | |
| "kl": 0.0, | |
| "learning_rate": 2.127659574468085e-08, | |
| "loss": 0.0447, | |
| "reward": 0.7433036118745804, | |
| "reward_std": 0.190913749858737, | |
| "rewards/accuracy_reward": 0.7299107536673546, | |
| "rewards/format_reward": 0.013392857741564512, | |
| "step": 1 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 583.8616371154785, | |
| "epoch": 0.042643923240938165, | |
| "grad_norm": 0.5290549397468567, | |
| "kl": 7.251650094985962e-05, | |
| "learning_rate": 1.0638297872340425e-07, | |
| "loss": 0.054, | |
| "reward": 0.7550223553553224, | |
| "reward_std": 0.237873874604702, | |
| "rewards/accuracy_reward": 0.7466518199071288, | |
| "rewards/format_reward": 0.008370536146685481, | |
| "step": 5 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 599.1326179504395, | |
| "epoch": 0.08528784648187633, | |
| "grad_norm": 0.28203362226486206, | |
| "kl": 8.721351623535157e-05, | |
| "learning_rate": 2.127659574468085e-07, | |
| "loss": 0.0351, | |
| "reward": 0.727232176065445, | |
| "reward_std": 0.2079640648327768, | |
| "rewards/accuracy_reward": 0.7214286044239998, | |
| "rewards/format_reward": 0.005803571781143546, | |
| "step": 10 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 575.5317222595215, | |
| "epoch": 0.1279317697228145, | |
| "grad_norm": 0.22249764204025269, | |
| "kl": 0.00016361474990844727, | |
| "learning_rate": 3.1914893617021275e-07, | |
| "loss": 0.03, | |
| "reward": 0.7473214611411094, | |
| "reward_std": 0.22900055218487977, | |
| "rewards/accuracy_reward": 0.7375000298023224, | |
| "rewards/format_reward": 0.009821429150179029, | |
| "step": 15 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 582.1107414245605, | |
| "epoch": 0.17057569296375266, | |
| "grad_norm": 0.3668869137763977, | |
| "kl": 0.00010156631469726562, | |
| "learning_rate": 4.25531914893617e-07, | |
| "loss": 0.0314, | |
| "reward": 0.7700893200933934, | |
| "reward_std": 0.21237293258309364, | |
| "rewards/accuracy_reward": 0.7607143215835095, | |
| "rewards/format_reward": 0.009375000512227416, | |
| "step": 20 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 565.0393089294433, | |
| "epoch": 0.21321961620469082, | |
| "grad_norm": 0.27896830439567566, | |
| "kl": 0.00015020370483398438, | |
| "learning_rate": 5.319148936170212e-07, | |
| "loss": 0.036, | |
| "reward": 0.7875000402331352, | |
| "reward_std": 0.21531264819204807, | |
| "rewards/accuracy_reward": 0.7767857536673546, | |
| "rewards/format_reward": 0.010714286286383868, | |
| "step": 25 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 576.9236869812012, | |
| "epoch": 0.255863539445629, | |
| "grad_norm": 0.41902390122413635, | |
| "kl": 0.0003345251083374023, | |
| "learning_rate": 6.382978723404255e-07, | |
| "loss": 0.0403, | |
| "reward": 0.775000037252903, | |
| "reward_std": 0.23194959200918674, | |
| "rewards/accuracy_reward": 0.7656250312924385, | |
| "rewards/format_reward": 0.00937500041909516, | |
| "step": 30 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 569.61029586792, | |
| "epoch": 0.29850746268656714, | |
| "grad_norm": 0.16671252250671387, | |
| "kl": 0.13303523063659667, | |
| "learning_rate": 7.446808510638297e-07, | |
| "loss": 0.0383, | |
| "reward": 0.7870536059141159, | |
| "reward_std": 0.21664966912940145, | |
| "rewards/accuracy_reward": 0.7705357536673546, | |
| "rewards/format_reward": 0.0165178578812629, | |
| "step": 35 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 577.1330627441406, | |
| "epoch": 0.3411513859275053, | |
| "grad_norm": 0.4918578565120697, | |
| "kl": 0.0008090019226074218, | |
| "learning_rate": 8.51063829787234e-07, | |
| "loss": 0.0388, | |
| "reward": 0.8004464715719223, | |
| "reward_std": 0.23178436178714037, | |
| "rewards/accuracy_reward": 0.771428607404232, | |
| "rewards/format_reward": 0.02901785881258547, | |
| "step": 40 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 591.5464546203614, | |
| "epoch": 0.3837953091684435, | |
| "grad_norm": 0.5081108808517456, | |
| "kl": 0.005326557159423828, | |
| "learning_rate": 9.574468085106384e-07, | |
| "loss": 0.0432, | |
| "reward": 0.8508928954601288, | |
| "reward_std": 0.2962774943560362, | |
| "rewards/accuracy_reward": 0.7718750327825546, | |
| "rewards/format_reward": 0.07901786002330483, | |
| "step": 45 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 590.6241355895996, | |
| "epoch": 0.42643923240938164, | |
| "grad_norm": 1.225230097770691, | |
| "kl": 0.02999114990234375, | |
| "learning_rate": 9.998747147528373e-07, | |
| "loss": 0.0284, | |
| "reward": 0.9977679073810577, | |
| "reward_std": 0.4272202838212252, | |
| "rewards/accuracy_reward": 0.701339316368103, | |
| "rewards/format_reward": 0.2964285858441144, | |
| "step": 50 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 584.9955612182617, | |
| "epoch": 0.4690831556503198, | |
| "grad_norm": 1.62312912940979, | |
| "kl": 0.04905548095703125, | |
| "learning_rate": 9.991093100466482e-07, | |
| "loss": 0.052, | |
| "reward": 1.2584822058677674, | |
| "reward_std": 0.4769852660596371, | |
| "rewards/accuracy_reward": 0.7388393208384514, | |
| "rewards/format_reward": 0.5196428842842579, | |
| "step": 55 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 619.11029586792, | |
| "epoch": 0.511727078891258, | |
| "grad_norm": 0.35674989223480225, | |
| "kl": 0.02668609619140625, | |
| "learning_rate": 9.976491676662678e-07, | |
| "loss": 0.0348, | |
| "reward": 1.3142857804894448, | |
| "reward_std": 0.4189721491187811, | |
| "rewards/accuracy_reward": 0.7379464611411095, | |
| "rewards/format_reward": 0.5763393118977547, | |
| "step": 60 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 606.5446723937988, | |
| "epoch": 0.5543710021321961, | |
| "grad_norm": 0.7614251375198364, | |
| "kl": 0.1965301513671875, | |
| "learning_rate": 9.95496320064109e-07, | |
| "loss": 0.0326, | |
| "reward": 1.4531250655651093, | |
| "reward_std": 0.39104298427700995, | |
| "rewards/accuracy_reward": 0.7379464611411095, | |
| "rewards/format_reward": 0.7151786029338837, | |
| "step": 65 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 619.4513656616211, | |
| "epoch": 0.5970149253731343, | |
| "grad_norm": 0.5526299476623535, | |
| "kl": 0.0309814453125, | |
| "learning_rate": 9.926537639070456e-07, | |
| "loss": 0.0332, | |
| "reward": 1.5433036401867866, | |
| "reward_std": 0.345558512583375, | |
| "rewards/accuracy_reward": 0.7343750335276127, | |
| "rewards/format_reward": 0.8089286044239998, | |
| "step": 70 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 601.5808296203613, | |
| "epoch": 0.6396588486140725, | |
| "grad_norm": 0.3345666825771332, | |
| "kl": 0.02752227783203125, | |
| "learning_rate": 9.891254559051884e-07, | |
| "loss": 0.0323, | |
| "reward": 1.6361607968807221, | |
| "reward_std": 0.3074555268511176, | |
| "rewards/accuracy_reward": 0.764285746216774, | |
| "rewards/format_reward": 0.8718750342726708, | |
| "step": 75 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 611.288419342041, | |
| "epoch": 0.6823027718550106, | |
| "grad_norm": 0.35215917229652405, | |
| "kl": 0.02867279052734375, | |
| "learning_rate": 9.849163073043223e-07, | |
| "loss": 0.0428, | |
| "reward": 1.6526786476373672, | |
| "reward_std": 0.28676611334085467, | |
| "rewards/accuracy_reward": 0.7589286133646965, | |
| "rewards/format_reward": 0.8937500417232513, | |
| "step": 80 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 626.8102943420411, | |
| "epoch": 0.7249466950959488, | |
| "grad_norm": 0.3329945206642151, | |
| "kl": 0.01969757080078125, | |
| "learning_rate": 9.800321770496724e-07, | |
| "loss": 0.028, | |
| "reward": 1.6875000774860383, | |
| "reward_std": 0.2533166547305882, | |
| "rewards/accuracy_reward": 0.7848214626312255, | |
| "rewards/format_reward": 0.9026786088943481, | |
| "step": 85 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 606.9535957336426, | |
| "epoch": 0.767590618336887, | |
| "grad_norm": 0.29067689180374146, | |
| "kl": 0.0251251220703125, | |
| "learning_rate": 9.744798636305187e-07, | |
| "loss": 0.024, | |
| "reward": 1.662500074505806, | |
| "reward_std": 0.2734585601836443, | |
| "rewards/accuracy_reward": 0.7611607506871223, | |
| "rewards/format_reward": 0.9013393223285675, | |
| "step": 90 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 608.546459197998, | |
| "epoch": 0.8102345415778252, | |
| "grad_norm": 0.33922263979911804, | |
| "kl": 0.02018585205078125, | |
| "learning_rate": 9.68267095617003e-07, | |
| "loss": 0.0242, | |
| "reward": 1.6745536416769027, | |
| "reward_std": 0.24658216908574104, | |
| "rewards/accuracy_reward": 0.7522321805357933, | |
| "rewards/format_reward": 0.9223214715719223, | |
| "step": 95 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 609.9174369812011, | |
| "epoch": 0.8528784648187633, | |
| "grad_norm": 0.5688201785087585, | |
| "kl": 0.02324676513671875, | |
| "learning_rate": 9.614025209023083e-07, | |
| "loss": 0.0296, | |
| "reward": 1.6991072326898575, | |
| "reward_std": 0.24431310119107366, | |
| "rewards/accuracy_reward": 0.781250037252903, | |
| "rewards/format_reward": 0.9178571790456772, | |
| "step": 100 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 589.7357414245605, | |
| "epoch": 0.8955223880597015, | |
| "grad_norm": 0.3856689929962158, | |
| "kl": 0.02101287841796875, | |
| "learning_rate": 9.538956946651815e-07, | |
| "loss": 0.0288, | |
| "reward": 1.7343750864267349, | |
| "reward_std": 0.2345518351532519, | |
| "rewards/accuracy_reward": 0.8107143267989159, | |
| "rewards/format_reward": 0.9236607521772384, | |
| "step": 105 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 600.2009185791015, | |
| "epoch": 0.9381663113006397, | |
| "grad_norm": 0.3187831938266754, | |
| "kl": 0.02591400146484375, | |
| "learning_rate": 9.457570660695539e-07, | |
| "loss": 0.0116, | |
| "reward": 1.734821507334709, | |
| "reward_std": 0.22256441051140427, | |
| "rewards/accuracy_reward": 0.8138393193483353, | |
| "rewards/format_reward": 0.9209821775555611, | |
| "step": 110 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 577.4808288574219, | |
| "epoch": 0.9808102345415778, | |
| "grad_norm": 0.19666177034378052, | |
| "kl": 0.0201080322265625, | |
| "learning_rate": 9.369979637197774e-07, | |
| "loss": 0.0232, | |
| "reward": 1.7446429431438446, | |
| "reward_std": 0.2030067172832787, | |
| "rewards/accuracy_reward": 0.7982143253087998, | |
| "rewards/format_reward": 0.9464286059141159, | |
| "step": 115 | |
| }, | |
| { | |
| "epoch": 0.9893390191897654, | |
| "eval_clip_ratio": 0.0, | |
| "eval_completion_length": 591.1813688732329, | |
| "eval_kl": 0.027180020771329364, | |
| "eval_loss": 0.00365327182225883, | |
| "eval_reward": 1.684807332735213, | |
| "eval_reward_std": 0.22303236411913993, | |
| "eval_rewards/accuracy_reward": 0.7500000307484279, | |
| "eval_rewards/format_reward": 0.9348072892143613, | |
| "eval_runtime": 686.4197, | |
| "eval_samples_per_second": 0.728, | |
| "eval_steps_per_second": 0.013, | |
| "step": 116 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 598.285604095459, | |
| "epoch": 1.0255863539445629, | |
| "grad_norm": 0.17957638204097748, | |
| "kl": 0.0184295654296875, | |
| "learning_rate": 9.276305798917158e-07, | |
| "loss": 0.0077, | |
| "reward": 1.723214367032051, | |
| "reward_std": 0.22227218970656396, | |
| "rewards/accuracy_reward": 0.7964286059141159, | |
| "rewards/format_reward": 0.9267857506871223, | |
| "step": 120 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 582.7888671875, | |
| "epoch": 1.068230277185501, | |
| "grad_norm": 0.35526904463768005, | |
| "kl": 0.0201629638671875, | |
| "learning_rate": 9.176679535616476e-07, | |
| "loss": 0.0216, | |
| "reward": 1.751785784959793, | |
| "reward_std": 0.2007270947098732, | |
| "rewards/accuracy_reward": 0.8142857477068901, | |
| "rewards/format_reward": 0.9375000387430191, | |
| "step": 125 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 572.4165481567383, | |
| "epoch": 1.1108742004264391, | |
| "grad_norm": 0.44200047850608826, | |
| "kl": 0.362432861328125, | |
| "learning_rate": 9.071239522565976e-07, | |
| "loss": 0.021, | |
| "reward": 1.732142946124077, | |
| "reward_std": 0.21233755089342593, | |
| "rewards/accuracy_reward": 0.8000000350177288, | |
| "rewards/format_reward": 0.9321428954601287, | |
| "step": 130 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 551.3214492797852, | |
| "epoch": 1.1535181236673775, | |
| "grad_norm": 0.1597866415977478, | |
| "kl": 0.0220245361328125, | |
| "learning_rate": 8.960132527513642e-07, | |
| "loss": 0.0171, | |
| "reward": 1.7830357879400254, | |
| "reward_std": 0.1866126311942935, | |
| "rewards/accuracy_reward": 0.8316964685916901, | |
| "rewards/format_reward": 0.9513393223285675, | |
| "step": 135 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 581.4509185791015, | |
| "epoch": 1.1961620469083156, | |
| "grad_norm": 0.24776776134967804, | |
| "kl": 0.0202850341796875, | |
| "learning_rate": 8.8435132063911e-07, | |
| "loss": 0.0073, | |
| "reward": 1.7156250864267348, | |
| "reward_std": 0.18961388804018497, | |
| "rewards/accuracy_reward": 0.77857146859169, | |
| "rewards/format_reward": 0.9370535984635353, | |
| "step": 140 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 600.3785957336426, | |
| "epoch": 1.2388059701492538, | |
| "grad_norm": 0.38918188214302063, | |
| "kl": 0.0415618896484375, | |
| "learning_rate": 8.721543888039532e-07, | |
| "loss": 0.0098, | |
| "reward": 1.7325893640518188, | |
| "reward_std": 0.20605442952364683, | |
| "rewards/accuracy_reward": 0.7897321820259094, | |
| "rewards/format_reward": 0.942857176065445, | |
| "step": 145 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 586.5723472595215, | |
| "epoch": 1.2814498933901919, | |
| "grad_norm": 0.39274245500564575, | |
| "kl": 0.0493682861328125, | |
| "learning_rate": 8.594394348255237e-07, | |
| "loss": 0.0246, | |
| "reward": 1.7558036506175996, | |
| "reward_std": 0.2199950136244297, | |
| "rewards/accuracy_reward": 0.8058036044239998, | |
| "rewards/format_reward": 0.9500000298023223, | |
| "step": 150 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 564.6044891357421, | |
| "epoch": 1.32409381663113, | |
| "grad_norm": 0.7922531962394714, | |
| "kl": 0.0613037109375, | |
| "learning_rate": 8.462241573469377e-07, | |
| "loss": 0.0263, | |
| "reward": 1.7375000715255737, | |
| "reward_std": 0.17584939412772654, | |
| "rewards/accuracy_reward": 0.8004464656114578, | |
| "rewards/format_reward": 0.9370536029338836, | |
| "step": 155 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 575.3759223937989, | |
| "epoch": 1.3667377398720681, | |
| "grad_norm": 10.02066707611084, | |
| "kl": 0.46389617919921877, | |
| "learning_rate": 8.325269514390834e-07, | |
| "loss": 0.0185, | |
| "reward": 1.7901786595582962, | |
| "reward_std": 0.17941874554380774, | |
| "rewards/accuracy_reward": 0.8223214641213417, | |
| "rewards/format_reward": 0.9678571701049805, | |
| "step": 160 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 572.3335052490235, | |
| "epoch": 1.4093816631130065, | |
| "grad_norm": 0.1462014764547348, | |
| "kl": 0.05237274169921875, | |
| "learning_rate": 8.183668829955111e-07, | |
| "loss": 0.0162, | |
| "reward": 1.7723215103149415, | |
| "reward_std": 0.1762597480788827, | |
| "rewards/accuracy_reward": 0.816071467101574, | |
| "rewards/format_reward": 0.9562500298023224, | |
| "step": 165 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 575.169223022461, | |
| "epoch": 1.4520255863539446, | |
| "grad_norm": 1.8751811981201172, | |
| "kl": 0.18811492919921874, | |
| "learning_rate": 8.037636621935684e-07, | |
| "loss": 0.0151, | |
| "reward": 1.7419643700122833, | |
| "reward_std": 0.1959962229244411, | |
| "rewards/accuracy_reward": 0.7982143238186836, | |
| "rewards/format_reward": 0.9437500312924385, | |
| "step": 170 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 588.2317245483398, | |
| "epoch": 1.4946695095948828, | |
| "grad_norm": 0.17714911699295044, | |
| "kl": 0.132525634765625, | |
| "learning_rate": 7.887376160587213e-07, | |
| "loss": 0.0172, | |
| "reward": 1.7156250804662705, | |
| "reward_std": 0.19640195239335298, | |
| "rewards/accuracy_reward": 0.7674107566475868, | |
| "rewards/format_reward": 0.948214316368103, | |
| "step": 175 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 559.4638603210449, | |
| "epoch": 1.537313432835821, | |
| "grad_norm": 0.21344700455665588, | |
| "kl": 0.0337677001953125, | |
| "learning_rate": 7.733096601702507e-07, | |
| "loss": 0.0098, | |
| "reward": 1.788839367032051, | |
| "reward_std": 0.1706329697743058, | |
| "rewards/accuracy_reward": 0.8178571820259094, | |
| "rewards/format_reward": 0.9709821701049804, | |
| "step": 180 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 574.9826164245605, | |
| "epoch": 1.579957356076759, | |
| "grad_norm": 0.3246748745441437, | |
| "kl": 0.0662689208984375, | |
| "learning_rate": 7.575012695477076e-07, | |
| "loss": 0.0171, | |
| "reward": 1.764285796880722, | |
| "reward_std": 0.18098030481487512, | |
| "rewards/accuracy_reward": 0.8044643223285675, | |
| "rewards/format_reward": 0.9598214566707611, | |
| "step": 185 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 591.3893157958985, | |
| "epoch": 1.6226012793176974, | |
| "grad_norm": 0.47960391640663147, | |
| "kl": 0.056695556640625, | |
| "learning_rate": 7.413344487586542e-07, | |
| "loss": 0.0212, | |
| "reward": 1.7665179401636124, | |
| "reward_std": 0.22513661198318005, | |
| "rewards/accuracy_reward": 0.8071428954601287, | |
| "rewards/format_reward": 0.9593750357627868, | |
| "step": 190 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 576.8236846923828, | |
| "epoch": 1.6652452025586353, | |
| "grad_norm": 0.9285232424736023, | |
| "kl": 0.2149566650390625, | |
| "learning_rate": 7.248317012892968e-07, | |
| "loss": 0.0264, | |
| "reward": 1.7383929401636125, | |
| "reward_std": 0.21417219610884786, | |
| "rewards/accuracy_reward": 0.7852678969502449, | |
| "rewards/format_reward": 0.9531250342726707, | |
| "step": 195 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 569.6236854553223, | |
| "epoch": 1.7078891257995736, | |
| "grad_norm": 19.19864845275879, | |
| "kl": 0.493951416015625, | |
| "learning_rate": 7.08015998220647e-07, | |
| "loss": 0.025, | |
| "reward": 1.7593750774860382, | |
| "reward_std": 0.1970167408697307, | |
| "rewards/accuracy_reward": 0.7933036029338837, | |
| "rewards/format_reward": 0.9660714626312256, | |
| "step": 200 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 579.9201164245605, | |
| "epoch": 1.7505330490405118, | |
| "grad_norm": 8.568023681640625, | |
| "kl": 0.36058349609375, | |
| "learning_rate": 6.909107462538111e-07, | |
| "loss": 0.0273, | |
| "reward": 1.7156250894069671, | |
| "reward_std": 0.2314098752103746, | |
| "rewards/accuracy_reward": 0.762946467101574, | |
| "rewards/format_reward": 0.9526785984635353, | |
| "step": 205 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 585.3098495483398, | |
| "epoch": 1.79317697228145, | |
| "grad_norm": 1.4387701749801636, | |
| "kl": 0.8639892578125, | |
| "learning_rate": 6.735397551289178e-07, | |
| "loss": 0.0267, | |
| "reward": 1.6991072207689286, | |
| "reward_std": 0.25122642405331136, | |
| "rewards/accuracy_reward": 0.740625037252903, | |
| "rewards/format_reward": 0.9584821745753288, | |
| "step": 210 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 573.5219017028809, | |
| "epoch": 1.835820895522388, | |
| "grad_norm": 8.508292198181152, | |
| "kl": 1.6532470703125, | |
| "learning_rate": 6.559272044830316e-07, | |
| "loss": 0.0335, | |
| "reward": 1.7223215013742448, | |
| "reward_std": 0.23952382281422616, | |
| "rewards/accuracy_reward": 0.7700893215835094, | |
| "rewards/format_reward": 0.9522321805357933, | |
| "step": 215 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 577.850471496582, | |
| "epoch": 1.8784648187633262, | |
| "grad_norm": 3.9072465896606445, | |
| "kl": 0.9760009765625, | |
| "learning_rate": 6.380976101931879e-07, | |
| "loss": 0.0349, | |
| "reward": 1.6732143580913543, | |
| "reward_std": 0.2966056760400534, | |
| "rewards/accuracy_reward": 0.7321428880095482, | |
| "rewards/format_reward": 0.9410714641213417, | |
| "step": 220 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 571.9951133728027, | |
| "epoch": 1.9211087420042645, | |
| "grad_norm": 13.385315895080566, | |
| "kl": 2.62431640625, | |
| "learning_rate": 6.200757902513962e-07, | |
| "loss": 0.0609, | |
| "reward": 1.6495536506175994, | |
| "reward_std": 0.29212585240602496, | |
| "rewards/accuracy_reward": 0.7040178939700127, | |
| "rewards/format_reward": 0.9455357491970062, | |
| "step": 225 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 602.8174324035645, | |
| "epoch": 1.9637526652452024, | |
| "grad_norm": 3.510239362716675, | |
| "kl": 1.172021484375, | |
| "learning_rate": 6.018868302191139e-07, | |
| "loss": 0.0426, | |
| "reward": 1.5660715043544768, | |
| "reward_std": 0.343078551068902, | |
| "rewards/accuracy_reward": 0.6276786014437675, | |
| "rewards/format_reward": 0.9383928880095482, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 1.9808102345415777, | |
| "eval_clip_ratio": 0.0, | |
| "eval_completion_length": 590.3601776607453, | |
| "eval_kl": 2.955357142857143, | |
| "eval_loss": 0.06222715228796005, | |
| "eval_reward": 1.3279479032471067, | |
| "eval_reward_std": 0.4392576685973576, | |
| "eval_rewards/accuracy_reward": 0.46938777679488775, | |
| "eval_rewards/format_reward": 0.858560131655799, | |
| "eval_runtime": 674.3732, | |
| "eval_samples_per_second": 0.741, | |
| "eval_steps_per_second": 0.013, | |
| "step": 232 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 605.6790473937988, | |
| "epoch": 2.008528784648188, | |
| "grad_norm": 6.303433418273926, | |
| "kl": 3.3275390625, | |
| "learning_rate": 5.835560483092742e-07, | |
| "loss": 0.0882, | |
| "reward": 1.3517857775092126, | |
| "reward_std": 0.4619227208197117, | |
| "rewards/accuracy_reward": 0.5165178820490837, | |
| "rewards/format_reward": 0.8352678924798965, | |
| "step": 235 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 588.4384216308594, | |
| "epoch": 2.0511727078891258, | |
| "grad_norm": 20.91950225830078, | |
| "kl": 7.1640625, | |
| "learning_rate": 5.651089601444752e-07, | |
| "loss": 0.1247, | |
| "reward": 1.1812500566244126, | |
| "reward_std": 0.5094705298542976, | |
| "rewards/accuracy_reward": 0.43125002160668374, | |
| "rewards/format_reward": 0.7500000283122062, | |
| "step": 240 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 582.0741310119629, | |
| "epoch": 2.093816631130064, | |
| "grad_norm": 13.203470230102539, | |
| "kl": 6.3, | |
| "learning_rate": 5.465712432403811e-07, | |
| "loss": 0.1256, | |
| "reward": 1.2410714894533157, | |
| "reward_std": 0.5110540725290775, | |
| "rewards/accuracy_reward": 0.46830358877778056, | |
| "rewards/format_reward": 0.7727678924798965, | |
| "step": 245 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 590.8152046203613, | |
| "epoch": 2.136460554371002, | |
| "grad_norm": 88.20843505859375, | |
| "kl": 11.7703125, | |
| "learning_rate": 5.279687012637798e-07, | |
| "loss": 0.2042, | |
| "reward": 1.3339286342263221, | |
| "reward_std": 0.5208067961037159, | |
| "rewards/accuracy_reward": 0.5129464477300644, | |
| "rewards/format_reward": 0.8209821820259094, | |
| "step": 250 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 582.5169929504394, | |
| "epoch": 2.1791044776119404, | |
| "grad_norm": 19.759809494018555, | |
| "kl": 4.6005859375, | |
| "learning_rate": 5.093272281150382e-07, | |
| "loss": 0.0949, | |
| "reward": 1.3361607685685157, | |
| "reward_std": 0.5283136948943138, | |
| "rewards/accuracy_reward": 0.5258928835391998, | |
| "rewards/format_reward": 0.8102678999304771, | |
| "step": 255 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 574.399136352539, | |
| "epoch": 2.2217484008528783, | |
| "grad_norm": 73.8835678100586, | |
| "kl": 13.4765625, | |
| "learning_rate": 4.906727718849618e-07, | |
| "loss": 0.209, | |
| "reward": 1.168750050663948, | |
| "reward_std": 0.5519715771079063, | |
| "rewards/accuracy_reward": 0.4321428779512644, | |
| "rewards/format_reward": 0.7366071745753289, | |
| "step": 260 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 569.409400177002, | |
| "epoch": 2.2643923240938166, | |
| "grad_norm": 16.285621643066406, | |
| "kl": 15.2921875, | |
| "learning_rate": 4.7203129873622036e-07, | |
| "loss": 0.2319, | |
| "reward": 1.1008929148316384, | |
| "reward_std": 0.5826808042824269, | |
| "rewards/accuracy_reward": 0.39241073541343213, | |
| "rewards/format_reward": 0.7084821775555611, | |
| "step": 265 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 563.786190032959, | |
| "epoch": 2.307036247334755, | |
| "grad_norm": 5.511695861816406, | |
| "kl": 3.4419921875, | |
| "learning_rate": 4.534287567596188e-07, | |
| "loss": 0.0542, | |
| "reward": 1.3165179178118707, | |
| "reward_std": 0.49827431738376615, | |
| "rewards/accuracy_reward": 0.5075893081724644, | |
| "rewards/format_reward": 0.8089286148548126, | |
| "step": 270 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 558.287523651123, | |
| "epoch": 2.349680170575693, | |
| "grad_norm": 7.120125770568848, | |
| "kl": 5.0576171875, | |
| "learning_rate": 4.348910398555249e-07, | |
| "loss": 0.0723, | |
| "reward": 1.3750000685453414, | |
| "reward_std": 0.5270605705678463, | |
| "rewards/accuracy_reward": 0.5455357402563095, | |
| "rewards/format_reward": 0.8294643267989159, | |
| "step": 275 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 573.1196708679199, | |
| "epoch": 2.3923240938166312, | |
| "grad_norm": 9.884531021118164, | |
| "kl": 10.0318359375, | |
| "learning_rate": 4.1644395169072575e-07, | |
| "loss": 0.1729, | |
| "reward": 1.2901786297559739, | |
| "reward_std": 0.5422291226685048, | |
| "rewards/accuracy_reward": 0.48660716265439985, | |
| "rewards/format_reward": 0.8035714611411094, | |
| "step": 280 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 557.1683288574219, | |
| "epoch": 2.434968017057569, | |
| "grad_norm": 8.612386703491211, | |
| "kl": 4.2802734375, | |
| "learning_rate": 3.9811316978088615e-07, | |
| "loss": 0.0639, | |
| "reward": 1.4187500596046447, | |
| "reward_std": 0.4759579010307789, | |
| "rewards/accuracy_reward": 0.5776785999536515, | |
| "rewards/format_reward": 0.8410714656114578, | |
| "step": 285 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 546.3219017028808, | |
| "epoch": 2.4776119402985075, | |
| "grad_norm": 31.60814094543457, | |
| "kl": 9.052734375, | |
| "learning_rate": 3.799242097486038e-07, | |
| "loss": 0.1517, | |
| "reward": 1.3687500596046447, | |
| "reward_std": 0.5219749353826046, | |
| "rewards/accuracy_reward": 0.5669643111526966, | |
| "rewards/format_reward": 0.8017857521772385, | |
| "step": 290 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 548.1094017028809, | |
| "epoch": 2.520255863539446, | |
| "grad_norm": 59.2335090637207, | |
| "kl": 10.06796875, | |
| "learning_rate": 3.619023898068123e-07, | |
| "loss": 0.1374, | |
| "reward": 1.2995536252856255, | |
| "reward_std": 0.51812051422894, | |
| "rewards/accuracy_reward": 0.5620535910129547, | |
| "rewards/format_reward": 0.7375000342726707, | |
| "step": 295 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 558.5031517028808, | |
| "epoch": 2.5628997867803838, | |
| "grad_norm": 31.365585327148438, | |
| "kl": 3.1970703125, | |
| "learning_rate": 3.4407279551696846e-07, | |
| "loss": 0.0461, | |
| "reward": 1.2352679088711738, | |
| "reward_std": 0.531840232014656, | |
| "rewards/accuracy_reward": 0.5383928880095482, | |
| "rewards/format_reward": 0.6968750298023224, | |
| "step": 300 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 548.0366317749024, | |
| "epoch": 2.605543710021322, | |
| "grad_norm": 13.923192977905273, | |
| "kl": 6.5228515625, | |
| "learning_rate": 3.2646024487108213e-07, | |
| "loss": 0.0853, | |
| "reward": 1.2236607685685157, | |
| "reward_std": 0.5477135334163904, | |
| "rewards/accuracy_reward": 0.5044643051922322, | |
| "rewards/format_reward": 0.7191964611411095, | |
| "step": 305 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 517.9973487854004, | |
| "epoch": 2.64818763326226, | |
| "grad_norm": 11.7457914352417, | |
| "kl": 11.935546875, | |
| "learning_rate": 3.0908925374618887e-07, | |
| "loss": 0.1622, | |
| "reward": 1.2687500640749931, | |
| "reward_std": 0.5634565785527229, | |
| "rewards/accuracy_reward": 0.5196428790688514, | |
| "rewards/format_reward": 0.7491071805357933, | |
| "step": 310 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 542.1607376098633, | |
| "epoch": 2.6908315565031984, | |
| "grad_norm": 10.802907943725586, | |
| "kl": 7.883203125, | |
| "learning_rate": 2.91984001779353e-07, | |
| "loss": 0.1125, | |
| "reward": 1.2767857626080512, | |
| "reward_std": 0.5758342906832695, | |
| "rewards/accuracy_reward": 0.5187500260770321, | |
| "rewards/format_reward": 0.7580357491970062, | |
| "step": 315 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 536.2406478881836, | |
| "epoch": 2.7334754797441363, | |
| "grad_norm": 9.723703384399414, | |
| "kl": 6.530859375, | |
| "learning_rate": 2.751682987107029e-07, | |
| "loss": 0.0812, | |
| "reward": 1.3321429282426833, | |
| "reward_std": 0.526002112776041, | |
| "rewards/accuracy_reward": 0.5491071671247483, | |
| "rewards/format_reward": 0.7830357521772384, | |
| "step": 320 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 521.1303764343262, | |
| "epoch": 2.7761194029850746, | |
| "grad_norm": 18.712772369384766, | |
| "kl": 10.8078125, | |
| "learning_rate": 2.5866555124134577e-07, | |
| "loss": 0.1505, | |
| "reward": 1.2674107685685159, | |
| "reward_std": 0.5784162662923336, | |
| "rewards/accuracy_reward": 0.5366071693599224, | |
| "rewards/format_reward": 0.730803607404232, | |
| "step": 325 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 535.4544891357422, | |
| "epoch": 2.818763326226013, | |
| "grad_norm": 7.585498809814453, | |
| "kl": 9.569921875, | |
| "learning_rate": 2.424987304522924e-07, | |
| "loss": 0.1261, | |
| "reward": 1.19464291036129, | |
| "reward_std": 0.5549623288214207, | |
| "rewards/accuracy_reward": 0.5209821693599224, | |
| "rewards/format_reward": 0.6736607439815998, | |
| "step": 330 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 528.2076141357422, | |
| "epoch": 2.861407249466951, | |
| "grad_norm": 9.224991798400879, | |
| "kl": 6.6015625, | |
| "learning_rate": 2.2669033982974944e-07, | |
| "loss": 0.0755, | |
| "reward": 1.228125052154064, | |
| "reward_std": 0.5089043751358986, | |
| "rewards/accuracy_reward": 0.550000024586916, | |
| "rewards/format_reward": 0.6781250283122062, | |
| "step": 335 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 530.958950805664, | |
| "epoch": 2.9040511727078893, | |
| "grad_norm": 24.710325241088867, | |
| "kl": 7.4828125, | |
| "learning_rate": 2.1126238394127867e-07, | |
| "loss": 0.114, | |
| "reward": 1.2035714864730835, | |
| "reward_std": 0.5295904573053122, | |
| "rewards/accuracy_reward": 0.5526785984635353, | |
| "rewards/format_reward": 0.6508928887546063, | |
| "step": 340 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 529.0553779602051, | |
| "epoch": 2.946695095948827, | |
| "grad_norm": 36.00743865966797, | |
| "kl": 11.071875, | |
| "learning_rate": 1.9623633780643155e-07, | |
| "loss": 0.188, | |
| "reward": 1.220535770058632, | |
| "reward_std": 0.5231191631406545, | |
| "rewards/accuracy_reward": 0.5361607357859611, | |
| "rewards/format_reward": 0.6843750298023223, | |
| "step": 345 | |
| }, | |
| { | |
| "epoch": 2.9722814498933903, | |
| "eval_clip_ratio": 0.0, | |
| "eval_completion_length": 531.799803234282, | |
| "eval_kl": 9.749503968253968, | |
| "eval_loss": 0.13944962620735168, | |
| "eval_reward": 1.1026077540147872, | |
| "eval_reward_std": 0.4790610531492839, | |
| "eval_rewards/accuracy_reward": 0.4600340352644996, | |
| "eval_rewards/format_reward": 0.642573726082605, | |
| "eval_runtime": 734.1918, | |
| "eval_samples_per_second": 0.681, | |
| "eval_steps_per_second": 0.012, | |
| "step": 348 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 531.3727928161621, | |
| "epoch": 2.9893390191897655, | |
| "grad_norm": 17.88933753967285, | |
| "kl": 10.34296875, | |
| "learning_rate": 1.8163311700448898e-07, | |
| "loss": 0.1236, | |
| "reward": 1.1388393327593804, | |
| "reward_std": 0.4919752091169357, | |
| "rewards/accuracy_reward": 0.49821431189775467, | |
| "rewards/format_reward": 0.6406250275671482, | |
| "step": 350 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 524.6839851379394, | |
| "epoch": 3.0341151385927505, | |
| "grad_norm": 8.39860725402832, | |
| "kl": 7.53515625, | |
| "learning_rate": 1.674730485609166e-07, | |
| "loss": 0.099, | |
| "reward": 1.140625049173832, | |
| "reward_std": 0.5006550896912814, | |
| "rewards/accuracy_reward": 0.5415178820490837, | |
| "rewards/format_reward": 0.5991071693599224, | |
| "step": 355 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 531.0768104553223, | |
| "epoch": 3.076759061833689, | |
| "grad_norm": 18.39265251159668, | |
| "kl": 8.1181640625, | |
| "learning_rate": 1.537758426530622e-07, | |
| "loss": 0.1106, | |
| "reward": 1.1013393431901932, | |
| "reward_std": 0.4775242738425732, | |
| "rewards/accuracy_reward": 0.5392857365310192, | |
| "rewards/format_reward": 0.5620535992085933, | |
| "step": 360 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 534.2736846923829, | |
| "epoch": 3.1194029850746268, | |
| "grad_norm": 11.617506980895996, | |
| "kl": 7.2947265625, | |
| "learning_rate": 1.4056056517447634e-07, | |
| "loss": 0.0915, | |
| "reward": 1.0933036252856254, | |
| "reward_std": 0.4881337985396385, | |
| "rewards/accuracy_reward": 0.532589315623045, | |
| "rewards/format_reward": 0.5607143111526967, | |
| "step": 365 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 538.0062713623047, | |
| "epoch": 3.162046908315565, | |
| "grad_norm": 11.465629577636719, | |
| "kl": 7.7173828125, | |
| "learning_rate": 1.2784561119604682e-07, | |
| "loss": 0.0985, | |
| "reward": 1.10089291036129, | |
| "reward_std": 0.4965208202600479, | |
| "rewards/accuracy_reward": 0.5200893104076385, | |
| "rewards/format_reward": 0.5808035977184772, | |
| "step": 370 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 531.9433242797852, | |
| "epoch": 3.204690831556503, | |
| "grad_norm": 23.9652156829834, | |
| "kl": 9.834765625, | |
| "learning_rate": 1.156486793608899e-07, | |
| "loss": 0.1229, | |
| "reward": 1.101339338719845, | |
| "reward_std": 0.452479437738657, | |
| "rewards/accuracy_reward": 0.5160714529454709, | |
| "rewards/format_reward": 0.5852678865194321, | |
| "step": 375 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 534.8468994140625, | |
| "epoch": 3.2473347547974414, | |
| "grad_norm": 18.34585189819336, | |
| "kl": 10.09140625, | |
| "learning_rate": 1.0398674724863581e-07, | |
| "loss": 0.1464, | |
| "reward": 1.1111607685685159, | |
| "reward_std": 0.5048069790005684, | |
| "rewards/accuracy_reward": 0.5276785962283611, | |
| "rewards/format_reward": 0.5834821693599224, | |
| "step": 380 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 536.6924369812011, | |
| "epoch": 3.2899786780383797, | |
| "grad_norm": 14.767237663269043, | |
| "kl": 9.25234375, | |
| "learning_rate": 9.287604774340235e-08, | |
| "loss": 0.1232, | |
| "reward": 1.101339329779148, | |
| "reward_std": 0.49512304849922656, | |
| "rewards/accuracy_reward": 0.5138393111526967, | |
| "rewards/format_reward": 0.5875000298023224, | |
| "step": 385 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 532.2884185791015, | |
| "epoch": 3.3326226012793176, | |
| "grad_norm": 9.985774993896484, | |
| "kl": 7.86640625, | |
| "learning_rate": 8.233204643835234e-08, | |
| "loss": 0.1138, | |
| "reward": 1.075446480512619, | |
| "reward_std": 0.46411947570741174, | |
| "rewards/accuracy_reward": 0.5026785962283611, | |
| "rewards/format_reward": 0.5727678835391998, | |
| "step": 390 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 527.2410995483399, | |
| "epoch": 3.375266524520256, | |
| "grad_norm": 10.612527847290039, | |
| "kl": 8.09375, | |
| "learning_rate": 7.236942010828429e-08, | |
| "loss": 0.0785, | |
| "reward": 1.1446428999304772, | |
| "reward_std": 0.48770338781177996, | |
| "rewards/accuracy_reward": 0.5575893074274063, | |
| "rewards/format_reward": 0.5870535977184772, | |
| "step": 395 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 519.1937744140625, | |
| "epoch": 3.417910447761194, | |
| "grad_norm": 14.665472984313965, | |
| "kl": 11.00234375, | |
| "learning_rate": 6.300203628022271e-08, | |
| "loss": 0.152, | |
| "reward": 1.1830357760190964, | |
| "reward_std": 0.5015905275940895, | |
| "rewards/accuracy_reward": 0.5790178842842579, | |
| "rewards/format_reward": 0.6040178872644901, | |
| "step": 400 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 531.4053771972656, | |
| "epoch": 3.4605543710021323, | |
| "grad_norm": 8.385228157043457, | |
| "kl": 9.28125, | |
| "learning_rate": 5.42429339304461e-08, | |
| "loss": 0.1379, | |
| "reward": 1.1437500521540642, | |
| "reward_std": 0.46195379123091695, | |
| "rewards/accuracy_reward": 0.5531250216066838, | |
| "rewards/format_reward": 0.5906250216066837, | |
| "step": 405 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 531.0544860839843, | |
| "epoch": 3.50319829424307, | |
| "grad_norm": 18.6485652923584, | |
| "kl": 8.909765625, | |
| "learning_rate": 4.610430533481857e-08, | |
| "loss": 0.1119, | |
| "reward": 1.1084821969270706, | |
| "reward_std": 0.4929712563753128, | |
| "rewards/accuracy_reward": 0.5491071715950966, | |
| "rewards/format_reward": 0.5593750216066837, | |
| "step": 410 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 533.8643081665039, | |
| "epoch": 3.5458422174840085, | |
| "grad_norm": 84.65238189697266, | |
| "kl": 8.594140625, | |
| "learning_rate": 3.859747909769162e-08, | |
| "loss": 0.1078, | |
| "reward": 1.0660714849829673, | |
| "reward_std": 0.473931773006916, | |
| "rewards/accuracy_reward": 0.5361607417464256, | |
| "rewards/format_reward": 0.5299107395112514, | |
| "step": 415 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 536.6745796203613, | |
| "epoch": 3.588486140724947, | |
| "grad_norm": 19.54568862915039, | |
| "kl": 9.418359375, | |
| "learning_rate": 3.173290438299697e-08, | |
| "loss": 0.1327, | |
| "reward": 1.0656250432133674, | |
| "reward_std": 0.4773729760199785, | |
| "rewards/accuracy_reward": 0.5245535988360643, | |
| "rewards/format_reward": 0.5410714514553547, | |
| "step": 420 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 517.5031486511231, | |
| "epoch": 3.631130063965885, | |
| "grad_norm": 22.7406005859375, | |
| "kl": 8.5650390625, | |
| "learning_rate": 2.5520136369481194e-08, | |
| "loss": 0.1112, | |
| "reward": 1.1593750432133674, | |
| "reward_std": 0.45310505069792273, | |
| "rewards/accuracy_reward": 0.5647321693599224, | |
| "rewards/format_reward": 0.5946428835391998, | |
| "step": 425 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 513.7027023315429, | |
| "epoch": 3.673773987206823, | |
| "grad_norm": 25.179290771484375, | |
| "kl": 8.64296875, | |
| "learning_rate": 1.996782295032745e-08, | |
| "loss": 0.1274, | |
| "reward": 1.1517857566475869, | |
| "reward_std": 0.4889927223324776, | |
| "rewards/accuracy_reward": 0.5678571730852127, | |
| "rewards/format_reward": 0.583928594738245, | |
| "step": 430 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 527.8647552490235, | |
| "epoch": 3.716417910447761, | |
| "grad_norm": 14.159469604492188, | |
| "kl": 7.8265625, | |
| "learning_rate": 1.508369269567783e-08, | |
| "loss": 0.1046, | |
| "reward": 1.1281250417232513, | |
| "reward_std": 0.5101183526217937, | |
| "rewards/accuracy_reward": 0.546428595483303, | |
| "rewards/format_reward": 0.5816964566707611, | |
| "step": 435 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 523.8451080322266, | |
| "epoch": 3.7590618336886994, | |
| "grad_norm": 18.611404418945312, | |
| "kl": 9.108984375, | |
| "learning_rate": 1.0874544094811422e-08, | |
| "loss": 0.1173, | |
| "reward": 1.0482143327593803, | |
| "reward_std": 0.45587412640452385, | |
| "rewards/accuracy_reward": 0.5294643141329288, | |
| "rewards/format_reward": 0.518750024214387, | |
| "step": 440 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 528.406273651123, | |
| "epoch": 3.8017057569296373, | |
| "grad_norm": 9.024343490600586, | |
| "kl": 8.72421875, | |
| "learning_rate": 7.346236092954316e-09, | |
| "loss": 0.103, | |
| "reward": 1.0665179088711738, | |
| "reward_std": 0.47449378967285155, | |
| "rewards/accuracy_reward": 0.5200893059372902, | |
| "rewards/format_reward": 0.5464285977184773, | |
| "step": 445 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 522.9732391357422, | |
| "epoch": 3.8443496801705757, | |
| "grad_norm": 15.007638931274414, | |
| "kl": 7.5888671875, | |
| "learning_rate": 4.50367993589107e-09, | |
| "loss": 0.1024, | |
| "reward": 1.1848214849829675, | |
| "reward_std": 0.4590866263955832, | |
| "rewards/accuracy_reward": 0.5825893096625805, | |
| "rewards/format_reward": 0.6022321686148644, | |
| "step": 450 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 539.4236862182618, | |
| "epoch": 3.886993603411514, | |
| "grad_norm": 11.34084701538086, | |
| "kl": 8.90546875, | |
| "learning_rate": 2.3508323337321224e-09, | |
| "loss": 0.1158, | |
| "reward": 1.0486607655882836, | |
| "reward_std": 0.4737320654094219, | |
| "rewards/accuracy_reward": 0.5111607365310192, | |
| "rewards/format_reward": 0.5375000245869159, | |
| "step": 455 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 534.0035942077636, | |
| "epoch": 3.929637526652452, | |
| "grad_norm": 18.918825149536133, | |
| "kl": 7.205078125, | |
| "learning_rate": 8.906899533517864e-10, | |
| "loss": 0.0902, | |
| "reward": 1.1607143417000771, | |
| "reward_std": 0.46907868683338166, | |
| "rewards/accuracy_reward": 0.5776785992085933, | |
| "rewards/format_reward": 0.5830357417464256, | |
| "step": 460 | |
| }, | |
| { | |
| "epoch": 3.9637526652452024, | |
| "eval_clip_ratio": 0.0, | |
| "eval_completion_length": 529.8555946955605, | |
| "eval_kl": 8.018105158730158, | |
| "eval_loss": 0.10982762277126312, | |
| "eval_reward": 1.0456349707785106, | |
| "eval_reward_std": 0.43395746865915874, | |
| "eval_rewards/accuracy_reward": 0.48384356072970797, | |
| "eval_rewards/format_reward": 0.561791407683539, | |
| "eval_runtime": 659.4598, | |
| "eval_samples_per_second": 0.758, | |
| "eval_steps_per_second": 0.014, | |
| "step": 464 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 526.7964500427246, | |
| "epoch": 3.9722814498933903, | |
| "grad_norm": 13.803497314453125, | |
| "kl": 7.684765625, | |
| "learning_rate": 1.252852471625987e-10, | |
| "loss": 0.0773, | |
| "reward": 1.129464340209961, | |
| "reward_std": 0.445505191385746, | |
| "rewards/accuracy_reward": 0.5602678842842579, | |
| "rewards/format_reward": 0.5691964529454708, | |
| "step": 465 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 529.8702189127604, | |
| "epoch": 3.997867803837953, | |
| "kl": 8.166666666666666, | |
| "reward": 1.1056548183163006, | |
| "reward_std": 0.48472560321291286, | |
| "rewards/accuracy_reward": 0.5610119315485159, | |
| "rewards/format_reward": 0.5446428805589676, | |
| "step": 468, | |
| "total_flos": 0.0, | |
| "train_loss": 0.07395310898940279, | |
| "train_runtime": 53156.2352, | |
| "train_samples_per_second": 0.564, | |
| "train_steps_per_second": 0.009 | |
| } | |
| ], | |
| "logging_steps": 5, | |
| "max_steps": 468, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 4, | |
| "save_steps": 500, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 0.0, | |
| "train_batch_size": 8, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |