diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,6232 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.9996410921840921, + "eval_steps": 100, + "global_step": 2263, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "completion_length": 400.17890625, + "epoch": 0.002208663482510146, + "grad_norm": 1.4726563692092896, + "kl": 0.0001227259635925293, + "learning_rate": 4.405286343612335e-07, + "loss": 0.0, + "reward": 0.57421875, + "reward_std": 0.458265589363873, + "rewards/accuracy_reward": 0.15078125, + "rewards/format_reward": 0.4234375, + "step": 5 + }, + { + "completion_length": 379.875, + "epoch": 0.004417326965020292, + "grad_norm": 1.1294347047805786, + "kl": 0.00025554299354553224, + "learning_rate": 8.81057268722467e-07, + "loss": 0.0, + "reward": 0.6515625, + "reward_std": 0.4448237407952547, + "rewards/accuracy_reward": 0.146875, + "rewards/format_reward": 0.5046875, + "step": 10 + }, + { + "completion_length": 349.56640625, + "epoch": 0.006625990447530438, + "grad_norm": 0.7087352871894836, + "kl": 0.003926074504852295, + "learning_rate": 1.3215859030837006e-06, + "loss": 0.0002, + "reward": 0.7, + "reward_std": 0.4231703171506524, + "rewards/accuracy_reward": 0.1078125, + "rewards/format_reward": 0.5921875, + "step": 15 + }, + { + "completion_length": 259.32421875, + "epoch": 0.008834653930040584, + "grad_norm": 6.263583660125732, + "kl": 0.08897933959960938, + "learning_rate": 1.762114537444934e-06, + "loss": 0.0036, + "reward": 0.9296875, + "reward_std": 0.34432896580547095, + "rewards/accuracy_reward": 0.09453125, + "rewards/format_reward": 0.83515625, + "step": 20 + }, + { + "completion_length": 258.88203125, + "epoch": 0.01104331741255073, + "grad_norm": 0.662657618522644, + "kl": 0.041219329833984374, + "learning_rate": 2.2026431718061673e-06, + "loss": 0.0016, + "reward": 0.93359375, + "reward_std": 0.3294501030817628, + "rewards/accuracy_reward": 0.08828125, + "rewards/format_reward": 0.8453125, + "step": 25 + }, + { + "completion_length": 283.37578125, + "epoch": 0.013251980895060876, + "grad_norm": 0.8266062140464783, + "kl": 0.03238487243652344, + "learning_rate": 2.643171806167401e-06, + "loss": 0.0013, + "reward": 0.88671875, + "reward_std": 0.38558061737567184, + "rewards/accuracy_reward": 0.1015625, + "rewards/format_reward": 0.78515625, + "step": 30 + }, + { + "completion_length": 228.65625, + "epoch": 0.015460644377571022, + "grad_norm": 0.9381304383277893, + "kl": 0.023354721069335938, + "learning_rate": 3.0837004405286347e-06, + "loss": 0.0009, + "reward": 0.9203125, + "reward_std": 0.34575226698070766, + "rewards/accuracy_reward": 0.09296875, + "rewards/format_reward": 0.82734375, + "step": 35 + }, + { + "completion_length": 175.81953125, + "epoch": 0.017669307860081168, + "grad_norm": 0.654951274394989, + "kl": 0.04830093383789062, + "learning_rate": 3.524229074889868e-06, + "loss": 0.0019, + "reward": 0.99765625, + "reward_std": 0.3096940713003278, + "rewards/accuracy_reward": 0.11015625, + "rewards/format_reward": 0.8875, + "step": 40 + }, + { + "completion_length": 179.19765625, + "epoch": 0.019877971342591314, + "grad_norm": 0.46008017659187317, + "kl": 0.0401031494140625, + "learning_rate": 3.964757709251102e-06, + "loss": 0.0016, + "reward": 1.0421875, + "reward_std": 0.22858326323330402, + "rewards/accuracy_reward": 0.09453125, + "rewards/format_reward": 0.94765625, + "step": 45 + }, + { + "completion_length": 219.9734375, + "epoch": 0.02208663482510146, + "grad_norm": 0.6333373188972473, + "kl": 0.05000381469726563, + "learning_rate": 4.405286343612335e-06, + "loss": 0.002, + "reward": 1.11640625, + "reward_std": 0.2830535739660263, + "rewards/accuracy_reward": 0.16875, + "rewards/format_reward": 0.94765625, + "step": 50 + }, + { + "completion_length": 206.340625, + "epoch": 0.024295298307611606, + "grad_norm": 0.444731205701828, + "kl": 0.07784347534179688, + "learning_rate": 4.8458149779735685e-06, + "loss": 0.0031, + "reward": 1.08671875, + "reward_std": 0.24719008896499872, + "rewards/accuracy_reward": 0.1296875, + "rewards/format_reward": 0.95703125, + "step": 55 + }, + { + "completion_length": 217.25546875, + "epoch": 0.026503961790121752, + "grad_norm": 0.49777764081954956, + "kl": 0.0484649658203125, + "learning_rate": 5.286343612334802e-06, + "loss": 0.0019, + "reward": 1.096875, + "reward_std": 0.2545401843264699, + "rewards/accuracy_reward": 0.1390625, + "rewards/format_reward": 0.9578125, + "step": 60 + }, + { + "completion_length": 206.72265625, + "epoch": 0.028712625272631898, + "grad_norm": 0.5592818260192871, + "kl": 0.05699615478515625, + "learning_rate": 5.7268722466960354e-06, + "loss": 0.0023, + "reward": 1.1234375, + "reward_std": 0.25865183435380457, + "rewards/accuracy_reward": 0.16640625, + "rewards/format_reward": 0.95703125, + "step": 65 + }, + { + "completion_length": 178.8796875, + "epoch": 0.030921288755142044, + "grad_norm": 0.7485532164573669, + "kl": 0.06690826416015624, + "learning_rate": 6.167400881057269e-06, + "loss": 0.0027, + "reward": 1.15859375, + "reward_std": 0.2730751080438495, + "rewards/accuracy_reward": 0.1859375, + "rewards/format_reward": 0.97265625, + "step": 70 + }, + { + "completion_length": 206.44375, + "epoch": 0.033129952237652194, + "grad_norm": 0.5321747064590454, + "kl": 0.05711669921875, + "learning_rate": 6.607929515418503e-06, + "loss": 0.0023, + "reward": 1.14140625, + "reward_std": 0.2609355779364705, + "rewards/accuracy_reward": 0.17578125, + "rewards/format_reward": 0.965625, + "step": 75 + }, + { + "completion_length": 180.4796875, + "epoch": 0.035338615720162336, + "grad_norm": 0.7185708284378052, + "kl": 0.079705810546875, + "learning_rate": 7.048458149779736e-06, + "loss": 0.0032, + "reward": 1.20859375, + "reward_std": 0.2991209041327238, + "rewards/accuracy_reward": 0.240625, + "rewards/format_reward": 0.96796875, + "step": 80 + }, + { + "completion_length": 205.42890625, + "epoch": 0.037547279202672486, + "grad_norm": 0.6930840611457825, + "kl": 0.079119873046875, + "learning_rate": 7.48898678414097e-06, + "loss": 0.0032, + "reward": 1.2125, + "reward_std": 0.29966206308454274, + "rewards/accuracy_reward": 0.24296875, + "rewards/format_reward": 0.96953125, + "step": 85 + }, + { + "completion_length": 232.1125, + "epoch": 0.03975594268518263, + "grad_norm": 0.6082009673118591, + "kl": 0.078369140625, + "learning_rate": 7.929515418502203e-06, + "loss": 0.0031, + "reward": 1.2390625, + "reward_std": 0.3367170764133334, + "rewards/accuracy_reward": 0.2796875, + "rewards/format_reward": 0.959375, + "step": 90 + }, + { + "completion_length": 234.2796875, + "epoch": 0.04196460616769278, + "grad_norm": 0.5688744783401489, + "kl": 0.0821014404296875, + "learning_rate": 8.370044052863436e-06, + "loss": 0.0033, + "reward": 1.2484375, + "reward_std": 0.3449540941044688, + "rewards/accuracy_reward": 0.28515625, + "rewards/format_reward": 0.96328125, + "step": 95 + }, + { + "completion_length": 247.909375, + "epoch": 0.04417326965020292, + "grad_norm": 0.4896390438079834, + "kl": 0.093121337890625, + "learning_rate": 8.81057268722467e-06, + "loss": 0.0037, + "reward": 1.271875, + "reward_std": 0.29160809628665446, + "rewards/accuracy_reward": 0.2953125, + "rewards/format_reward": 0.9765625, + "step": 100 + }, + { + "epoch": 0.04417326965020292, + "eval_completion_length": 264.7725, + "eval_kl": 0.09, + "eval_loss": 0.003620876930654049, + "eval_reward": 1.2708333349227905, + "eval_reward_std": 0.30468439966440203, + "eval_rewards/accuracy_reward": 0.29583333343267443, + "eval_rewards/format_reward": 0.975, + "eval_runtime": 145.932, + "eval_samples_per_second": 0.678, + "eval_steps_per_second": 0.027, + "step": 100 + }, + { + "completion_length": 292.07734375, + "epoch": 0.04638193313271307, + "grad_norm": 0.5129627585411072, + "kl": 0.099822998046875, + "learning_rate": 9.251101321585904e-06, + "loss": 0.004, + "reward": 1.2453125, + "reward_std": 0.3150330139324069, + "rewards/accuracy_reward": 0.28671875, + "rewards/format_reward": 0.95859375, + "step": 105 + }, + { + "completion_length": 264.43125, + "epoch": 0.04859059661522321, + "grad_norm": 0.604174017906189, + "kl": 0.103936767578125, + "learning_rate": 9.691629955947137e-06, + "loss": 0.0042, + "reward": 1.26015625, + "reward_std": 0.26412205342203376, + "rewards/accuracy_reward": 0.290625, + "rewards/format_reward": 0.96953125, + "step": 110 + }, + { + "completion_length": 238.78203125, + "epoch": 0.05079926009773336, + "grad_norm": 0.5758931040763855, + "kl": 5888.534802246094, + "learning_rate": 1.0132158590308372e-05, + "loss": 234.88, + "reward": 1.30703125, + "reward_std": 0.3428795490413904, + "rewards/accuracy_reward": 0.35078125, + "rewards/format_reward": 0.95625, + "step": 115 + }, + { + "completion_length": 227.2875, + "epoch": 0.053007923580243504, + "grad_norm": 0.6410739421844482, + "kl": 0.21376953125, + "learning_rate": 1.0572687224669605e-05, + "loss": 0.0085, + "reward": 1.2140625, + "reward_std": 0.39241575095802544, + "rewards/accuracy_reward": 0.29765625, + "rewards/format_reward": 0.91640625, + "step": 120 + }, + { + "completion_length": 208.746875, + "epoch": 0.055216587062753654, + "grad_norm": 118.43889617919922, + "kl": 9.3187255859375, + "learning_rate": 1.1013215859030836e-05, + "loss": 0.3719, + "reward": 1.1359375, + "reward_std": 0.34939223267138003, + "rewards/accuracy_reward": 0.2296875, + "rewards/format_reward": 0.90625, + "step": 125 + }, + { + "completion_length": 164.7515625, + "epoch": 0.057425250545263797, + "grad_norm": 0.6973806023597717, + "kl": 1.569244384765625, + "learning_rate": 1.1453744493392071e-05, + "loss": 0.0628, + "reward": 1.20390625, + "reward_std": 0.3942227842286229, + "rewards/accuracy_reward": 0.29140625, + "rewards/format_reward": 0.9125, + "step": 130 + }, + { + "completion_length": 188.734375, + "epoch": 0.059633914027773946, + "grad_norm": 54.8037223815918, + "kl": 9.47294921875, + "learning_rate": 1.1894273127753304e-05, + "loss": 0.3796, + "reward": 1.20234375, + "reward_std": 0.32416225373744967, + "rewards/accuracy_reward": 0.27734375, + "rewards/format_reward": 0.925, + "step": 135 + }, + { + "completion_length": 150.5953125, + "epoch": 0.06184257751028409, + "grad_norm": 33.95077133178711, + "kl": 1.503021240234375, + "learning_rate": 1.2334801762114539e-05, + "loss": 0.0602, + "reward": 1.23671875, + "reward_std": 0.30887170899659394, + "rewards/accuracy_reward": 0.28203125, + "rewards/format_reward": 0.9546875, + "step": 140 + }, + { + "completion_length": 179.94453125, + "epoch": 0.06405124099279423, + "grad_norm": 2.577629804611206, + "kl": 1.558148193359375, + "learning_rate": 1.2775330396475772e-05, + "loss": 0.0623, + "reward": 1.11953125, + "reward_std": 0.4326841413974762, + "rewards/accuracy_reward": 0.27265625, + "rewards/format_reward": 0.846875, + "step": 145 + }, + { + "completion_length": 218.13203125, + "epoch": 0.06625990447530439, + "grad_norm": 4.710011005401611, + "kl": 2.9638153076171876, + "learning_rate": 1.3215859030837006e-05, + "loss": 0.1188, + "reward": 1.196875, + "reward_std": 0.4242498528212309, + "rewards/accuracy_reward": 0.32421875, + "rewards/format_reward": 0.87265625, + "step": 150 + }, + { + "completion_length": 283.83828125, + "epoch": 0.06846856795781453, + "grad_norm": 1.0956288576126099, + "kl": 10.248779296875, + "learning_rate": 1.3656387665198238e-05, + "loss": 0.4114, + "reward": 1.19453125, + "reward_std": 0.4564665203914046, + "rewards/accuracy_reward": 0.34921875, + "rewards/format_reward": 0.8453125, + "step": 155 + }, + { + "completion_length": 209.16328125, + "epoch": 0.07067723144032467, + "grad_norm": 0.584996223449707, + "kl": 0.1743133544921875, + "learning_rate": 1.4096916299559472e-05, + "loss": 0.007, + "reward": 1.36953125, + "reward_std": 0.3628161208704114, + "rewards/accuracy_reward": 0.40859375, + "rewards/format_reward": 0.9609375, + "step": 160 + }, + { + "completion_length": 196.24453125, + "epoch": 0.07288589492283482, + "grad_norm": 0.8083503246307373, + "kl": 0.168878173828125, + "learning_rate": 1.4537444933920706e-05, + "loss": 0.0068, + "reward": 1.290625, + "reward_std": 0.31575766000896693, + "rewards/accuracy_reward": 0.33515625, + "rewards/format_reward": 0.95546875, + "step": 165 + }, + { + "completion_length": 190.34453125, + "epoch": 0.07509455840534497, + "grad_norm": 0.6046306490898132, + "kl": 0.1955078125, + "learning_rate": 1.497797356828194e-05, + "loss": 0.0078, + "reward": 1.1984375, + "reward_std": 0.37014698795974255, + "rewards/accuracy_reward": 0.2765625, + "rewards/format_reward": 0.921875, + "step": 170 + }, + { + "completion_length": 148.55, + "epoch": 0.07730322188785511, + "grad_norm": 0.509132444858551, + "kl": 0.1782470703125, + "learning_rate": 1.5418502202643173e-05, + "loss": 0.0071, + "reward": 1.23125, + "reward_std": 0.23883652742952108, + "rewards/accuracy_reward": 0.246875, + "rewards/format_reward": 0.984375, + "step": 175 + }, + { + "completion_length": 307.5828125, + "epoch": 0.07951188537036526, + "grad_norm": 115.14045715332031, + "kl": 2.41651611328125, + "learning_rate": 1.5859030837004406e-05, + "loss": 0.0967, + "reward": 1.1296875, + "reward_std": 0.4080469489097595, + "rewards/accuracy_reward": 0.28125, + "rewards/format_reward": 0.8484375, + "step": 180 + }, + { + "completion_length": 745.15546875, + "epoch": 0.0817205488528754, + "grad_norm": 1.7153384685516357, + "kl": 1.31116943359375, + "learning_rate": 1.629955947136564e-05, + "loss": 0.0524, + "reward": 0.30546875, + "reward_std": 0.46172712091356516, + "rewards/accuracy_reward": 0.0890625, + "rewards/format_reward": 0.21640625, + "step": 185 + }, + { + "completion_length": 366.26796875, + "epoch": 0.08392921233538556, + "grad_norm": 2.2445106506347656, + "kl": 4.1861083984375, + "learning_rate": 1.6740088105726872e-05, + "loss": 0.1674, + "reward": 0.69453125, + "reward_std": 0.5489674057811499, + "rewards/accuracy_reward": 0.225, + "rewards/format_reward": 0.46953125, + "step": 190 + }, + { + "completion_length": 276.47890625, + "epoch": 0.0861378758178957, + "grad_norm": 1.3458584547042847, + "kl": 0.4182861328125, + "learning_rate": 1.718061674008811e-05, + "loss": 0.0167, + "reward": 1.2015625, + "reward_std": 0.47771220188587904, + "rewards/accuracy_reward": 0.35546875, + "rewards/format_reward": 0.84609375, + "step": 195 + }, + { + "completion_length": 330.8, + "epoch": 0.08834653930040584, + "grad_norm": 5.644898414611816, + "kl": 3.825439453125, + "learning_rate": 1.762114537444934e-05, + "loss": 0.153, + "reward": 1.13203125, + "reward_std": 0.5322479158639908, + "rewards/accuracy_reward": 0.33125, + "rewards/format_reward": 0.80078125, + "step": 200 + }, + { + "epoch": 0.08834653930040584, + "eval_completion_length": 356.7441674804688, + "eval_kl": 3.136875, + "eval_loss": 0.12226903438568115, + "eval_reward": 1.0791666674613953, + "eval_reward_std": 0.5326099014282226, + "eval_rewards/accuracy_reward": 0.3129166668653488, + "eval_rewards/format_reward": 0.76625, + "eval_runtime": 294.9089, + "eval_samples_per_second": 0.336, + "eval_steps_per_second": 0.014, + "step": 200 + }, + { + "completion_length": 272.7140625, + "epoch": 0.09055520278291598, + "grad_norm": 3.581615924835205, + "kl": 1.26063232421875, + "learning_rate": 1.8061674008810575e-05, + "loss": 0.0504, + "reward": 1.13828125, + "reward_std": 0.35043725427240136, + "rewards/accuracy_reward": 0.24296875, + "rewards/format_reward": 0.8953125, + "step": 205 + }, + { + "completion_length": 215.88984375, + "epoch": 0.09276386626542614, + "grad_norm": 4.773531436920166, + "kl": 0.9517578125, + "learning_rate": 1.8502202643171808e-05, + "loss": 0.0381, + "reward": 1.23828125, + "reward_std": 0.30075737833976746, + "rewards/accuracy_reward": 0.30703125, + "rewards/format_reward": 0.93125, + "step": 210 + }, + { + "completion_length": 225.04765625, + "epoch": 0.09497252974793628, + "grad_norm": 2.473970890045166, + "kl": 0.9461669921875, + "learning_rate": 1.894273127753304e-05, + "loss": 0.0378, + "reward": 1.09140625, + "reward_std": 0.4519174795597792, + "rewards/accuracy_reward": 0.2453125, + "rewards/format_reward": 0.84609375, + "step": 215 + }, + { + "completion_length": 297.01875, + "epoch": 0.09718119323044642, + "grad_norm": 2.2808244228363037, + "kl": 21.850811767578126, + "learning_rate": 1.9383259911894274e-05, + "loss": 0.8722, + "reward": 1.08046875, + "reward_std": 0.46868909504264594, + "rewards/accuracy_reward": 0.27734375, + "rewards/format_reward": 0.803125, + "step": 220 + }, + { + "completion_length": 228.60234375, + "epoch": 0.09938985671295657, + "grad_norm": 1.9014919996261597, + "kl": 0.230517578125, + "learning_rate": 1.982378854625551e-05, + "loss": 0.0092, + "reward": 1.23046875, + "reward_std": 0.3573141796514392, + "rewards/accuracy_reward": 0.30234375, + "rewards/format_reward": 0.928125, + "step": 225 + }, + { + "completion_length": 389.16640625, + "epoch": 0.10159852019546672, + "grad_norm": 7.406689643859863, + "kl": 1.709527587890625, + "learning_rate": 1.99998928589406e-05, + "loss": 0.0684, + "reward": 1.0375, + "reward_std": 0.49771256893873217, + "rewards/accuracy_reward": 0.26875, + "rewards/format_reward": 0.76875, + "step": 230 + }, + { + "completion_length": 524.803125, + "epoch": 0.10380718367797687, + "grad_norm": 10.62135124206543, + "kl": 5.1268310546875, + "learning_rate": 1.999923811633618e-05, + "loss": 0.2051, + "reward": 0.834375, + "reward_std": 0.5984118554741145, + "rewards/accuracy_reward": 0.21015625, + "rewards/format_reward": 0.62421875, + "step": 235 + }, + { + "completion_length": 332.025, + "epoch": 0.10601584716048701, + "grad_norm": 1.785484790802002, + "kl": 8.025537109375, + "learning_rate": 1.999798819286288e-05, + "loss": 0.3211, + "reward": 0.48359375, + "reward_std": 0.4500583238899708, + "rewards/accuracy_reward": 0.11484375, + "rewards/format_reward": 0.36875, + "step": 240 + }, + { + "completion_length": 292.31328125, + "epoch": 0.10822451064299715, + "grad_norm": 3.578373432159424, + "kl": 0.82308349609375, + "learning_rate": 1.9996143162919416e-05, + "loss": 0.0329, + "reward": 0.8859375, + "reward_std": 0.5004101138561964, + "rewards/accuracy_reward": 0.1703125, + "rewards/format_reward": 0.715625, + "step": 245 + }, + { + "completion_length": 230.86953125, + "epoch": 0.11043317412550731, + "grad_norm": 3.702038288116455, + "kl": 11.077984619140626, + "learning_rate": 1.9993703136326808e-05, + "loss": 0.4433, + "reward": 1.26875, + "reward_std": 0.3013193493708968, + "rewards/accuracy_reward": 0.30390625, + "rewards/format_reward": 0.96484375, + "step": 250 + }, + { + "completion_length": 218.70078125, + "epoch": 0.11264183760801745, + "grad_norm": 7.172746658325195, + "kl": 0.32041015625, + "learning_rate": 1.999066825832184e-05, + "loss": 0.0128, + "reward": 1.26015625, + "reward_std": 0.23753905296325684, + "rewards/accuracy_reward": 0.26875, + "rewards/format_reward": 0.99140625, + "step": 255 + }, + { + "completion_length": 253.38984375, + "epoch": 0.11485050109052759, + "grad_norm": 1.3221828937530518, + "kl": 8.120147705078125, + "learning_rate": 1.9987038709548408e-05, + "loss": 0.3232, + "reward": 1.24609375, + "reward_std": 0.2821945507079363, + "rewards/accuracy_reward": 0.27734375, + "rewards/format_reward": 0.96875, + "step": 260 + }, + { + "completion_length": 288.26796875, + "epoch": 0.11705916457303774, + "grad_norm": 0.6487714052200317, + "kl": 0.57447509765625, + "learning_rate": 1.9982814706046766e-05, + "loss": 0.023, + "reward": 1.1859375, + "reward_std": 0.3182327225804329, + "rewards/accuracy_reward": 0.2390625, + "rewards/format_reward": 0.946875, + "step": 265 + }, + { + "completion_length": 361.62265625, + "epoch": 0.11926782805554789, + "grad_norm": 3.041498899459839, + "kl": 1.428155517578125, + "learning_rate": 1.997799649924068e-05, + "loss": 0.0572, + "reward": 1.06640625, + "reward_std": 0.3401893651112914, + "rewards/accuracy_reward": 0.1796875, + "rewards/format_reward": 0.88671875, + "step": 270 + }, + { + "completion_length": 322.9, + "epoch": 0.12147649153805803, + "grad_norm": 1.5550798177719116, + "kl": 0.65260009765625, + "learning_rate": 1.9972584375922453e-05, + "loss": 0.0261, + "reward": 1.13984375, + "reward_std": 0.37677015643566847, + "rewards/accuracy_reward": 0.23984375, + "rewards/format_reward": 0.9, + "step": 275 + }, + { + "completion_length": 319.37578125, + "epoch": 0.12368515502056818, + "grad_norm": 114126.546875, + "kl": 4326.556237792969, + "learning_rate": 1.996657865823585e-05, + "loss": 172.9404, + "reward": 1.23828125, + "reward_std": 0.33014066983014345, + "rewards/accuracy_reward": 0.30078125, + "rewards/format_reward": 0.9375, + "step": 280 + }, + { + "completion_length": 312.85546875, + "epoch": 0.12589381850307832, + "grad_norm": 1.4805908203125, + "kl": 0.228668212890625, + "learning_rate": 1.995997970365694e-05, + "loss": 0.0091, + "reward": 1.21171875, + "reward_std": 0.3828111669048667, + "rewards/accuracy_reward": 0.2890625, + "rewards/format_reward": 0.92265625, + "step": 285 + }, + { + "completion_length": 354.6484375, + "epoch": 0.12810248198558846, + "grad_norm": 53.09714889526367, + "kl": 4.07802734375, + "learning_rate": 1.9952787904972794e-05, + "loss": 0.1632, + "reward": 1.053125, + "reward_std": 0.4529764140024781, + "rewards/accuracy_reward": 0.21640625, + "rewards/format_reward": 0.83671875, + "step": 290 + }, + { + "completion_length": 285.98125, + "epoch": 0.1303111454680986, + "grad_norm": 0.581076443195343, + "kl": 0.92733154296875, + "learning_rate": 1.9945003690258127e-05, + "loss": 0.0371, + "reward": 1.1953125, + "reward_std": 0.31984729822725055, + "rewards/accuracy_reward": 0.259375, + "rewards/format_reward": 0.9359375, + "step": 295 + }, + { + "completion_length": 269.2640625, + "epoch": 0.13251980895060878, + "grad_norm": 0.33922237157821655, + "kl": 0.34735107421875, + "learning_rate": 1.993662752284981e-05, + "loss": 0.0139, + "reward": 1.2796875, + "reward_std": 0.257637638784945, + "rewards/accuracy_reward": 0.30390625, + "rewards/format_reward": 0.97578125, + "step": 300 + }, + { + "epoch": 0.13251980895060878, + "eval_completion_length": 312.8345849609375, + "eval_kl": 0.2959375, + "eval_loss": 0.012029927223920822, + "eval_reward": 1.2270833349227905, + "eval_reward_std": 0.3163718029856682, + "eval_rewards/accuracy_reward": 0.2704166667163372, + "eval_rewards/format_reward": 0.9566666674613953, + "eval_runtime": 158.7274, + "eval_samples_per_second": 0.624, + "eval_steps_per_second": 0.025, + "step": 300 + }, + { + "completion_length": 344.0703125, + "epoch": 0.13472847243311892, + "grad_norm": 15.648075103759766, + "kl": 0.812335205078125, + "learning_rate": 1.9927659901319292e-05, + "loss": 0.0324, + "reward": 1.18515625, + "reward_std": 0.36495272126048806, + "rewards/accuracy_reward": 0.24765625, + "rewards/format_reward": 0.9375, + "step": 305 + }, + { + "completion_length": 351.78984375, + "epoch": 0.13693713591562906, + "grad_norm": 0.3627087473869324, + "kl": 0.196575927734375, + "learning_rate": 1.9918101359442908e-05, + "loss": 0.0079, + "reward": 1.1203125, + "reward_std": 0.3219464411959052, + "rewards/accuracy_reward": 0.20234375, + "rewards/format_reward": 0.91796875, + "step": 310 + }, + { + "completion_length": 289.5125, + "epoch": 0.1391457993981392, + "grad_norm": 0.41371065378189087, + "kl": 0.208282470703125, + "learning_rate": 1.990795246617014e-05, + "loss": 0.0083, + "reward": 1.15234375, + "reward_std": 0.30122786965221166, + "rewards/accuracy_reward": 0.21328125, + "rewards/format_reward": 0.9390625, + "step": 315 + }, + { + "completion_length": 243.4625, + "epoch": 0.14135446288064935, + "grad_norm": 0.3589507043361664, + "kl": 0.216851806640625, + "learning_rate": 1.989721382558972e-05, + "loss": 0.0087, + "reward": 1.2484375, + "reward_std": 0.3238747540861368, + "rewards/accuracy_reward": 0.28515625, + "rewards/format_reward": 0.96328125, + "step": 320 + }, + { + "completion_length": 243.1734375, + "epoch": 0.1435631263631595, + "grad_norm": 0.38298115134239197, + "kl": 0.201153564453125, + "learning_rate": 1.988588607689369e-05, + "loss": 0.008, + "reward": 1.2078125, + "reward_std": 0.2437373088672757, + "rewards/accuracy_reward": 0.23515625, + "rewards/format_reward": 0.97265625, + "step": 325 + }, + { + "completion_length": 278.3796875, + "epoch": 0.14577178984566963, + "grad_norm": 0.7377725839614868, + "kl": 0.229010009765625, + "learning_rate": 1.987396989433935e-05, + "loss": 0.0092, + "reward": 1.153125, + "reward_std": 0.36884579751640556, + "rewards/accuracy_reward": 0.2328125, + "rewards/format_reward": 0.9203125, + "step": 330 + }, + { + "completion_length": 280.9796875, + "epoch": 0.14798045332817977, + "grad_norm": 3.068225622177124, + "kl": 0.5145751953125, + "learning_rate": 1.986146598720913e-05, + "loss": 0.0205, + "reward": 1.0796875, + "reward_std": 0.3447397375479341, + "rewards/accuracy_reward": 0.17734375, + "rewards/format_reward": 0.90234375, + "step": 335 + }, + { + "completion_length": 231.14296875, + "epoch": 0.15018911681068994, + "grad_norm": 1.5392614603042603, + "kl": 1.3296142578125, + "learning_rate": 1.984837509976837e-05, + "loss": 0.0532, + "reward": 1.14375, + "reward_std": 0.3358943074941635, + "rewards/accuracy_reward": 0.215625, + "rewards/format_reward": 0.928125, + "step": 340 + }, + { + "completion_length": 248.0734375, + "epoch": 0.15239778029320009, + "grad_norm": 0.6375353336334229, + "kl": 1.9455322265625, + "learning_rate": 1.9834698011221008e-05, + "loss": 0.0778, + "reward": 1.13203125, + "reward_std": 0.38343740683048966, + "rewards/accuracy_reward": 0.2203125, + "rewards/format_reward": 0.91171875, + "step": 345 + }, + { + "completion_length": 261.58046875, + "epoch": 0.15460644377571023, + "grad_norm": 2110458.0, + "kl": 12810.955236816406, + "learning_rate": 1.982043553566321e-05, + "loss": 512.4465, + "reward": 1.06953125, + "reward_std": 0.38607826493680475, + "rewards/accuracy_reward": 0.18046875, + "rewards/format_reward": 0.8890625, + "step": 350 + }, + { + "completion_length": 325.5890625, + "epoch": 0.15681510725822037, + "grad_norm": 7.603396415710449, + "kl": 20.8695068359375, + "learning_rate": 1.980558852203492e-05, + "loss": 0.8364, + "reward": 1.02265625, + "reward_std": 0.46340725645422937, + "rewards/accuracy_reward": 0.209375, + "rewards/format_reward": 0.81328125, + "step": 355 + }, + { + "completion_length": 344.390625, + "epoch": 0.1590237707407305, + "grad_norm": 1.510093331336975, + "kl": 1.6796875, + "learning_rate": 1.979015785406931e-05, + "loss": 0.0672, + "reward": 1.0, + "reward_std": 0.470697814039886, + "rewards/accuracy_reward": 0.19921875, + "rewards/format_reward": 0.80078125, + "step": 360 + }, + { + "completion_length": 243.0875, + "epoch": 0.16123243422324066, + "grad_norm": 7.704539775848389, + "kl": 4.3294921875, + "learning_rate": 1.97741444502402e-05, + "loss": 0.1733, + "reward": 1.16328125, + "reward_std": 0.3427719760686159, + "rewards/accuracy_reward": 0.2390625, + "rewards/format_reward": 0.92421875, + "step": 365 + }, + { + "completion_length": 207.48046875, + "epoch": 0.1634410977057508, + "grad_norm": 5.120603561401367, + "kl": 2.038916015625, + "learning_rate": 1.9757549263707366e-05, + "loss": 0.0816, + "reward": 1.19296875, + "reward_std": 0.35142498891800644, + "rewards/accuracy_reward": 0.25625, + "rewards/format_reward": 0.93671875, + "step": 370 + }, + { + "completion_length": 229.01953125, + "epoch": 0.16564976118826094, + "grad_norm": 7.258485317230225, + "kl": 2.72215576171875, + "learning_rate": 1.974037328225982e-05, + "loss": 0.1089, + "reward": 1.14921875, + "reward_std": 0.3493613565340638, + "rewards/accuracy_reward": 0.22421875, + "rewards/format_reward": 0.925, + "step": 375 + }, + { + "completion_length": 217.7515625, + "epoch": 0.1678584246707711, + "grad_norm": 7.898167133331299, + "kl": 2.309033203125, + "learning_rate": 1.972261752825701e-05, + "loss": 0.0924, + "reward": 1.14453125, + "reward_std": 0.300789905525744, + "rewards/accuracy_reward": 0.20625, + "rewards/format_reward": 0.93828125, + "step": 380 + }, + { + "completion_length": 168.23515625, + "epoch": 0.17006708815328125, + "grad_norm": 0.5930284261703491, + "kl": 1.09581298828125, + "learning_rate": 1.9704283058567972e-05, + "loss": 0.0439, + "reward": 1.17421875, + "reward_std": 0.2850266819819808, + "rewards/accuracy_reward": 0.2140625, + "rewards/format_reward": 0.96015625, + "step": 385 + }, + { + "completion_length": 156.07421875, + "epoch": 0.1722757516357914, + "grad_norm": 2.664320945739746, + "kl": 1.3061279296875, + "learning_rate": 1.968537096450841e-05, + "loss": 0.0523, + "reward": 1.1390625, + "reward_std": 0.28086008559912445, + "rewards/accuracy_reward": 0.18046875, + "rewards/format_reward": 0.95859375, + "step": 390 + }, + { + "completion_length": 204.665625, + "epoch": 0.17448441511830154, + "grad_norm": 1.541552186012268, + "kl": 1.1947509765625, + "learning_rate": 1.9665882371775735e-05, + "loss": 0.0478, + "reward": 1.1328125, + "reward_std": 0.26257925033569335, + "rewards/accuracy_reward": 0.175, + "rewards/format_reward": 0.9578125, + "step": 395 + }, + { + "completion_length": 227.2359375, + "epoch": 0.17669307860081168, + "grad_norm": 0.41995060443878174, + "kl": 0.6468994140625, + "learning_rate": 1.9645818440382096e-05, + "loss": 0.0259, + "reward": 1.215625, + "reward_std": 0.3341550791636109, + "rewards/accuracy_reward": 0.2703125, + "rewards/format_reward": 0.9453125, + "step": 400 + }, + { + "epoch": 0.17669307860081168, + "eval_completion_length": 243.36083374023437, + "eval_kl": 0.38421875, + "eval_loss": 0.015584616921842098, + "eval_reward": 1.2075, + "eval_reward_std": 0.30897092461586, + "eval_rewards/accuracy_reward": 0.2583333334326744, + "eval_rewards/format_reward": 0.9491666674613952, + "eval_runtime": 159.9641, + "eval_samples_per_second": 0.619, + "eval_steps_per_second": 0.025, + "step": 400 + }, + { + "completion_length": 266.815625, + "epoch": 0.17890174208332182, + "grad_norm": 0.7400681376457214, + "kl": 0.40008544921875, + "learning_rate": 1.962518036458529e-05, + "loss": 0.016, + "reward": 1.190625, + "reward_std": 0.31112865209579466, + "rewards/accuracy_reward": 0.23828125, + "rewards/format_reward": 0.95234375, + "step": 405 + }, + { + "completion_length": 280.94921875, + "epoch": 0.18111040556583197, + "grad_norm": 0.30900517106056213, + "kl": 0.38648681640625, + "learning_rate": 1.9603969372817695e-05, + "loss": 0.0155, + "reward": 1.240625, + "reward_std": 0.294854056276381, + "rewards/accuracy_reward": 0.28828125, + "rewards/format_reward": 0.95234375, + "step": 410 + }, + { + "completion_length": 358.7671875, + "epoch": 0.1833190690483421, + "grad_norm": 1.0198228359222412, + "kl": 1.0703125, + "learning_rate": 1.9582186727613152e-05, + "loss": 0.0428, + "reward": 1.01796875, + "reward_std": 0.4371380554512143, + "rewards/accuracy_reward": 0.2234375, + "rewards/format_reward": 0.79453125, + "step": 415 + }, + { + "completion_length": 286.39140625, + "epoch": 0.18552773253085228, + "grad_norm": 1.2672603130340576, + "kl": 0.32391357421875, + "learning_rate": 1.955983372553182e-05, + "loss": 0.013, + "reward": 0.946875, + "reward_std": 0.4627113614231348, + "rewards/accuracy_reward": 0.171875, + "rewards/format_reward": 0.775, + "step": 420 + }, + { + "completion_length": 207.659375, + "epoch": 0.18773639601336242, + "grad_norm": 0.4993106424808502, + "kl": 0.32420654296875, + "learning_rate": 1.953691169708298e-05, + "loss": 0.013, + "reward": 1.05859375, + "reward_std": 0.3334008002653718, + "rewards/accuracy_reward": 0.15546875, + "rewards/format_reward": 0.903125, + "step": 425 + }, + { + "completion_length": 167.09296875, + "epoch": 0.18994505949587256, + "grad_norm": 0.7137377858161926, + "kl": 0.3811279296875, + "learning_rate": 1.9513422006645867e-05, + "loss": 0.0152, + "reward": 1.12265625, + "reward_std": 0.3202515557408333, + "rewards/accuracy_reward": 0.1984375, + "rewards/format_reward": 0.92421875, + "step": 430 + }, + { + "completion_length": 259.22109375, + "epoch": 0.1921537229783827, + "grad_norm": 0.4750295877456665, + "kl": 0.30950927734375, + "learning_rate": 1.9489366052388443e-05, + "loss": 0.0124, + "reward": 1.121875, + "reward_std": 0.4058058561757207, + "rewards/accuracy_reward": 0.225, + "rewards/format_reward": 0.896875, + "step": 435 + }, + { + "completion_length": 234.15546875, + "epoch": 0.19436238646089285, + "grad_norm": 0.5221619009971619, + "kl": 0.27547607421875, + "learning_rate": 1.9464745266184173e-05, + "loss": 0.011, + "reward": 1.1921875, + "reward_std": 0.3065523250028491, + "rewards/accuracy_reward": 0.25546875, + "rewards/format_reward": 0.93671875, + "step": 440 + }, + { + "completion_length": 150.1078125, + "epoch": 0.196571049943403, + "grad_norm": 7.660337924957275, + "kl": 0.38460693359375, + "learning_rate": 1.9439561113526802e-05, + "loss": 0.0154, + "reward": 1.16328125, + "reward_std": 0.2454235328361392, + "rewards/accuracy_reward": 0.2015625, + "rewards/format_reward": 0.96171875, + "step": 445 + }, + { + "completion_length": 145.90859375, + "epoch": 0.19877971342591313, + "grad_norm": 0.47105056047439575, + "kl": 0.276025390625, + "learning_rate": 1.9413815093443128e-05, + "loss": 0.011, + "reward": 1.14140625, + "reward_std": 0.21770920380949974, + "rewards/accuracy_reward": 0.1765625, + "rewards/format_reward": 0.96484375, + "step": 450 + }, + { + "completion_length": 162.709375, + "epoch": 0.20098837690842328, + "grad_norm": 0.4455668032169342, + "kl": 0.29571533203125, + "learning_rate": 1.938750873840377e-05, + "loss": 0.0118, + "reward": 1.1203125, + "reward_std": 0.2609225707128644, + "rewards/accuracy_reward": 0.16171875, + "rewards/format_reward": 0.95859375, + "step": 455 + }, + { + "completion_length": 185.5453125, + "epoch": 0.20319704039093345, + "grad_norm": 1.09501051902771, + "kl": 0.34464111328125, + "learning_rate": 1.9360643614231942e-05, + "loss": 0.0138, + "reward": 1.08125, + "reward_std": 0.2819837937131524, + "rewards/accuracy_reward": 0.1375, + "rewards/format_reward": 0.94375, + "step": 460 + }, + { + "completion_length": 178.6546875, + "epoch": 0.2054057038734436, + "grad_norm": 0.501656174659729, + "kl": 0.2873779296875, + "learning_rate": 1.9333221320010275e-05, + "loss": 0.0115, + "reward": 1.1515625, + "reward_std": 0.2634758483618498, + "rewards/accuracy_reward": 0.19453125, + "rewards/format_reward": 0.95703125, + "step": 465 + }, + { + "completion_length": 190.03671875, + "epoch": 0.20761436735595373, + "grad_norm": 1.7912346124649048, + "kl": 0.30848388671875, + "learning_rate": 1.930524348798562e-05, + "loss": 0.0123, + "reward": 1.14609375, + "reward_std": 0.31838786210864783, + "rewards/accuracy_reward": 0.215625, + "rewards/format_reward": 0.93046875, + "step": 470 + }, + { + "completion_length": 174.62109375, + "epoch": 0.20982303083846388, + "grad_norm": 0.4804052412509918, + "kl": 0.32216796875, + "learning_rate": 1.9276711783471888e-05, + "loss": 0.0129, + "reward": 1.096875, + "reward_std": 0.2830366240814328, + "rewards/accuracy_reward": 0.165625, + "rewards/format_reward": 0.93125, + "step": 475 + }, + { + "completion_length": 144.003125, + "epoch": 0.21203169432097402, + "grad_norm": 0.47488105297088623, + "kl": 0.29373779296875, + "learning_rate": 1.9247627904750937e-05, + "loss": 0.0117, + "reward": 1.22109375, + "reward_std": 0.2334042889997363, + "rewards/accuracy_reward": 0.2328125, + "rewards/format_reward": 0.98828125, + "step": 480 + }, + { + "completion_length": 225.4140625, + "epoch": 0.21424035780348416, + "grad_norm": 0.6563146114349365, + "kl": 0.37607421875, + "learning_rate": 1.9217993582971485e-05, + "loss": 0.015, + "reward": 1.12109375, + "reward_std": 0.35556240323930977, + "rewards/accuracy_reward": 0.21015625, + "rewards/format_reward": 0.9109375, + "step": 485 + }, + { + "completion_length": 148.5078125, + "epoch": 0.2164490212859943, + "grad_norm": 0.6001664400100708, + "kl": 0.50482177734375, + "learning_rate": 1.9187810582046056e-05, + "loss": 0.0202, + "reward": 1.16796875, + "reward_std": 0.2445027854293585, + "rewards/accuracy_reward": 0.196875, + "rewards/format_reward": 0.97109375, + "step": 490 + }, + { + "completion_length": 175.846875, + "epoch": 0.21865768476850445, + "grad_norm": 0.48493316769599915, + "kl": 0.33309326171875, + "learning_rate": 1.9157080698546e-05, + "loss": 0.0133, + "reward": 1.134375, + "reward_std": 0.2596387291327119, + "rewards/accuracy_reward": 0.1765625, + "rewards/format_reward": 0.9578125, + "step": 495 + }, + { + "completion_length": 172.95859375, + "epoch": 0.22086634825101462, + "grad_norm": 0.34802621603012085, + "kl": 0.29521484375, + "learning_rate": 1.9125805761594553e-05, + "loss": 0.0118, + "reward": 1.13125, + "reward_std": 0.2513675343245268, + "rewards/accuracy_reward": 0.1703125, + "rewards/format_reward": 0.9609375, + "step": 500 + }, + { + "epoch": 0.22086634825101462, + "eval_completion_length": 168.7541668701172, + "eval_kl": 0.3077734375, + "eval_loss": 0.012391554191708565, + "eval_reward": 1.204166669845581, + "eval_reward_std": 0.2949218952655792, + "eval_rewards/accuracy_reward": 0.2508333337306976, + "eval_rewards/format_reward": 0.9533333349227905, + "eval_runtime": 143.7617, + "eval_samples_per_second": 0.689, + "eval_steps_per_second": 0.028, + "step": 500 + }, + { + "completion_length": 160.86171875, + "epoch": 0.22307501173352476, + "grad_norm": 0.750469446182251, + "kl": 0.33531494140625, + "learning_rate": 1.9093987632757957e-05, + "loss": 0.0134, + "reward": 1.13828125, + "reward_std": 0.2916230414062738, + "rewards/accuracy_reward": 0.18828125, + "rewards/format_reward": 0.95, + "step": 505 + }, + { + "completion_length": 158.5875, + "epoch": 0.2252836752160349, + "grad_norm": 0.4148224890232086, + "kl": 0.35068359375, + "learning_rate": 1.9061628205934662e-05, + "loss": 0.014, + "reward": 1.12265625, + "reward_std": 0.263315293751657, + "rewards/accuracy_reward": 0.16953125, + "rewards/format_reward": 0.953125, + "step": 510 + }, + { + "completion_length": 182.6203125, + "epoch": 0.22749233869854504, + "grad_norm": 0.3992038667201996, + "kl": 0.2927001953125, + "learning_rate": 1.9028729407242598e-05, + "loss": 0.0117, + "reward": 1.1828125, + "reward_std": 0.3094627659767866, + "rewards/accuracy_reward": 0.23125, + "rewards/format_reward": 0.9515625, + "step": 515 + }, + { + "completion_length": 191.246875, + "epoch": 0.22970100218105519, + "grad_norm": 0.5499962568283081, + "kl": 0.318408203125, + "learning_rate": 1.8995293194904512e-05, + "loss": 0.0127, + "reward": 1.12734375, + "reward_std": 0.2894813433289528, + "rewards/accuracy_reward": 0.17890625, + "rewards/format_reward": 0.9484375, + "step": 520 + }, + { + "completion_length": 167.7296875, + "epoch": 0.23190966566356533, + "grad_norm": 0.38229528069496155, + "kl": 0.31822509765625, + "learning_rate": 1.896132155913143e-05, + "loss": 0.0127, + "reward": 1.11484375, + "reward_std": 0.29848874974995854, + "rewards/accuracy_reward": 0.178125, + "rewards/format_reward": 0.93671875, + "step": 525 + }, + { + "completion_length": 142.56015625, + "epoch": 0.23411832914607547, + "grad_norm": 0.5173822641372681, + "kl": 0.33304443359375, + "learning_rate": 1.892681652200418e-05, + "loss": 0.0133, + "reward": 1.08984375, + "reward_std": 0.2929655512794852, + "rewards/accuracy_reward": 0.15234375, + "rewards/format_reward": 0.9375, + "step": 530 + }, + { + "completion_length": 151.20078125, + "epoch": 0.2363269926285856, + "grad_norm": 0.521994411945343, + "kl": 0.344140625, + "learning_rate": 1.8891780137353036e-05, + "loss": 0.0138, + "reward": 1.0921875, + "reward_std": 0.2893120773136616, + "rewards/accuracy_reward": 0.1578125, + "rewards/format_reward": 0.934375, + "step": 535 + }, + { + "completion_length": 132.7984375, + "epoch": 0.23853565611109578, + "grad_norm": 0.5721232295036316, + "kl": 0.36300048828125, + "learning_rate": 1.885621449063547e-05, + "loss": 0.0145, + "reward": 1.128125, + "reward_std": 0.24814818538725375, + "rewards/accuracy_reward": 0.1671875, + "rewards/format_reward": 0.9609375, + "step": 540 + }, + { + "completion_length": 144.96875, + "epoch": 0.24074431959360593, + "grad_norm": 0.3831787407398224, + "kl": 0.34149169921875, + "learning_rate": 1.8820121698812028e-05, + "loss": 0.0137, + "reward": 1.14140625, + "reward_std": 0.22749478761106728, + "rewards/accuracy_reward": 0.1765625, + "rewards/format_reward": 0.96484375, + "step": 545 + }, + { + "completion_length": 169.89296875, + "epoch": 0.24295298307611607, + "grad_norm": 0.5316097140312195, + "kl": 0.32564697265625, + "learning_rate": 1.8783503910220296e-05, + "loss": 0.013, + "reward": 1.1546875, + "reward_std": 0.2780306525528431, + "rewards/accuracy_reward": 0.2046875, + "rewards/format_reward": 0.95, + "step": 550 + }, + { + "completion_length": 149.83984375, + "epoch": 0.2451616465586262, + "grad_norm": 0.4427882432937622, + "kl": 0.3146484375, + "learning_rate": 1.8746363304447073e-05, + "loss": 0.0126, + "reward": 1.1578125, + "reward_std": 0.22796925920993089, + "rewards/accuracy_reward": 0.1859375, + "rewards/format_reward": 0.971875, + "step": 555 + }, + { + "completion_length": 165.7453125, + "epoch": 0.24737031004113635, + "grad_norm": 0.47105562686920166, + "kl": 0.298486328125, + "learning_rate": 1.8708702092198576e-05, + "loss": 0.0119, + "reward": 1.17578125, + "reward_std": 0.2685113290324807, + "rewards/accuracy_reward": 0.2125, + "rewards/format_reward": 0.96328125, + "step": 560 + }, + { + "completion_length": 168.32265625, + "epoch": 0.2495789735236465, + "grad_norm": 0.3255383372306824, + "kl": 0.3055908203125, + "learning_rate": 1.867052251516891e-05, + "loss": 0.0122, + "reward": 1.1390625, + "reward_std": 0.24939600769430398, + "rewards/accuracy_reward": 0.1703125, + "rewards/format_reward": 0.96875, + "step": 565 + }, + { + "completion_length": 184.50546875, + "epoch": 0.25178763700615664, + "grad_norm": 0.4039280116558075, + "kl": 0.31488037109375, + "learning_rate": 1.8631826845906588e-05, + "loss": 0.0126, + "reward": 1.14140625, + "reward_std": 0.2917328651994467, + "rewards/accuracy_reward": 0.1984375, + "rewards/format_reward": 0.94296875, + "step": 570 + }, + { + "completion_length": 187.24296875, + "epoch": 0.2539963004886668, + "grad_norm": 0.48246172070503235, + "kl": 0.33466796875, + "learning_rate": 1.8592617387679304e-05, + "loss": 0.0134, + "reward": 1.15625, + "reward_std": 0.32329851035028695, + "rewards/accuracy_reward": 0.21640625, + "rewards/format_reward": 0.93984375, + "step": 575 + }, + { + "completion_length": 157.41796875, + "epoch": 0.2562049639711769, + "grad_norm": 0.5687190294265747, + "kl": 0.365576171875, + "learning_rate": 1.8552896474336816e-05, + "loss": 0.0146, + "reward": 1.1390625, + "reward_std": 0.25018255431205033, + "rewards/accuracy_reward": 0.17890625, + "rewards/format_reward": 0.96015625, + "step": 580 + }, + { + "completion_length": 126.7265625, + "epoch": 0.25841362745368707, + "grad_norm": 0.4837645888328552, + "kl": 0.494287109375, + "learning_rate": 1.8512666470172024e-05, + "loss": 0.0198, + "reward": 1.13515625, + "reward_std": 0.25846064239740374, + "rewards/accuracy_reward": 0.1828125, + "rewards/format_reward": 0.95234375, + "step": 585 + }, + { + "completion_length": 117.5125, + "epoch": 0.2606222909361972, + "grad_norm": 0.5099273324012756, + "kl": 0.46514892578125, + "learning_rate": 1.8471929769780247e-05, + "loss": 0.0186, + "reward": 1.1375, + "reward_std": 0.27797329761087897, + "rewards/accuracy_reward": 0.18671875, + "rewards/format_reward": 0.95078125, + "step": 590 + }, + { + "completion_length": 129.4875, + "epoch": 0.26283095441870735, + "grad_norm": 0.48087364435195923, + "kl": 23.66878662109375, + "learning_rate": 1.8430688797916702e-05, + "loss": 0.9494, + "reward": 1.10234375, + "reward_std": 0.26711587999016045, + "rewards/accuracy_reward": 0.14609375, + "rewards/format_reward": 0.95625, + "step": 595 + }, + { + "completion_length": 137.51171875, + "epoch": 0.26503961790121755, + "grad_norm": 0.5267772674560547, + "kl": 0.357568359375, + "learning_rate": 1.8388946009352157e-05, + "loss": 0.0143, + "reward": 1.15625, + "reward_std": 0.2736740421503782, + "rewards/accuracy_reward": 0.2046875, + "rewards/format_reward": 0.9515625, + "step": 600 + }, + { + "epoch": 0.26503961790121755, + "eval_completion_length": 110.09, + "eval_kl": 0.377890625, + "eval_loss": 0.01513399463146925, + "eval_reward": 1.1708333349227906, + "eval_reward_std": 0.2645035409927368, + "eval_rewards/accuracy_reward": 0.20208333373069765, + "eval_rewards/format_reward": 0.96875, + "eval_runtime": 129.4495, + "eval_samples_per_second": 0.765, + "eval_steps_per_second": 0.031, + "step": 600 + }, + { + "completion_length": 103.6046875, + "epoch": 0.2672482813837277, + "grad_norm": 0.5802999138832092, + "kl": 0.42120361328125, + "learning_rate": 1.8346703888726833e-05, + "loss": 0.0168, + "reward": 1.1421875, + "reward_std": 0.22200765572488307, + "rewards/accuracy_reward": 0.1765625, + "rewards/format_reward": 0.965625, + "step": 605 + }, + { + "completion_length": 141.5125, + "epoch": 0.26945694486623784, + "grad_norm": 0.595243513584137, + "kl": 0.3810546875, + "learning_rate": 1.8303964950402498e-05, + "loss": 0.0152, + "reward": 1.12265625, + "reward_std": 0.29428734816610813, + "rewards/accuracy_reward": 0.184375, + "rewards/format_reward": 0.93828125, + "step": 610 + }, + { + "completion_length": 138.7, + "epoch": 0.271665608348748, + "grad_norm": 0.43679195642471313, + "kl": 0.38223876953125, + "learning_rate": 1.8260731738312817e-05, + "loss": 0.0153, + "reward": 1.10078125, + "reward_std": 0.23885549493134023, + "rewards/accuracy_reward": 0.1484375, + "rewards/format_reward": 0.95234375, + "step": 615 + }, + { + "completion_length": 104.80703125, + "epoch": 0.2738742718312581, + "grad_norm": 0.5839166045188904, + "kl": 0.4041748046875, + "learning_rate": 1.8217006825811924e-05, + "loss": 0.0162, + "reward": 1.12890625, + "reward_std": 0.19053069781512022, + "rewards/accuracy_reward": 0.14609375, + "rewards/format_reward": 0.9828125, + "step": 620 + }, + { + "completion_length": 132.15859375, + "epoch": 0.27608293531376826, + "grad_norm": 1.1488077640533447, + "kl": 0.4154296875, + "learning_rate": 1.8172792815521246e-05, + "loss": 0.0166, + "reward": 1.12890625, + "reward_std": 0.24578131809830667, + "rewards/accuracy_reward": 0.1703125, + "rewards/format_reward": 0.95859375, + "step": 625 + }, + { + "completion_length": 203.21328125, + "epoch": 0.2782915987962784, + "grad_norm": 0.9778295159339905, + "kl": 1.12738037109375, + "learning_rate": 1.81280923391746e-05, + "loss": 0.0452, + "reward": 0.99453125, + "reward_std": 0.3527091216295958, + "rewards/accuracy_reward": 0.1265625, + "rewards/format_reward": 0.86796875, + "step": 630 + }, + { + "completion_length": 107.22421875, + "epoch": 0.28050026227878855, + "grad_norm": 0.4877747595310211, + "kl": 0.5244873046875, + "learning_rate": 1.8082908057461534e-05, + "loss": 0.021, + "reward": 1.0953125, + "reward_std": 0.23968660701066255, + "rewards/accuracy_reward": 0.1484375, + "rewards/format_reward": 0.946875, + "step": 635 + }, + { + "completion_length": 81.0328125, + "epoch": 0.2827089257612987, + "grad_norm": 0.7603225708007812, + "kl": 0.5284912109375, + "learning_rate": 1.8037242659868958e-05, + "loss": 0.0211, + "reward": 1.12109375, + "reward_std": 0.21915814336389303, + "rewards/accuracy_reward": 0.14453125, + "rewards/format_reward": 0.9765625, + "step": 640 + }, + { + "completion_length": 118.26171875, + "epoch": 0.28491758924380883, + "grad_norm": 0.565264880657196, + "kl": 0.44853515625, + "learning_rate": 1.7991098864521066e-05, + "loss": 0.018, + "reward": 1.12578125, + "reward_std": 0.2820789096876979, + "rewards/accuracy_reward": 0.17734375, + "rewards/format_reward": 0.9484375, + "step": 645 + }, + { + "completion_length": 92.51484375, + "epoch": 0.287126252726319, + "grad_norm": 0.5516146421432495, + "kl": 0.4996337890625, + "learning_rate": 1.794447941801754e-05, + "loss": 0.02, + "reward": 1.13515625, + "reward_std": 0.21679833866655826, + "rewards/accuracy_reward": 0.16328125, + "rewards/format_reward": 0.971875, + "step": 650 + }, + { + "completion_length": 126.5140625, + "epoch": 0.2893349162088291, + "grad_norm": 0.5977817177772522, + "kl": 0.4461181640625, + "learning_rate": 1.7897387095270058e-05, + "loss": 0.0178, + "reward": 1.10546875, + "reward_std": 0.24418613854795695, + "rewards/accuracy_reward": 0.14296875, + "rewards/format_reward": 0.9625, + "step": 655 + }, + { + "completion_length": 153.0734375, + "epoch": 0.29154357969133926, + "grad_norm": 0.4894627630710602, + "kl": 0.37508544921875, + "learning_rate": 1.7849824699337143e-05, + "loss": 0.015, + "reward": 1.1125, + "reward_std": 0.22963083293288947, + "rewards/accuracy_reward": 0.1515625, + "rewards/format_reward": 0.9609375, + "step": 660 + }, + { + "completion_length": 103.91953125, + "epoch": 0.2937522431738494, + "grad_norm": 0.6808644533157349, + "kl": 0.456884765625, + "learning_rate": 1.7801795061257293e-05, + "loss": 0.0183, + "reward": 1.121875, + "reward_std": 0.21878602355718613, + "rewards/accuracy_reward": 0.1484375, + "rewards/format_reward": 0.9734375, + "step": 665 + }, + { + "completion_length": 90.246875, + "epoch": 0.29596090665635955, + "grad_norm": 0.4546065330505371, + "kl": 0.46005859375, + "learning_rate": 1.77533010398805e-05, + "loss": 0.0184, + "reward": 1.0796875, + "reward_std": 0.19561193585395814, + "rewards/accuracy_reward": 0.109375, + "rewards/format_reward": 0.9703125, + "step": 670 + }, + { + "completion_length": 108.37578125, + "epoch": 0.29816957013886974, + "grad_norm": 0.4939492344856262, + "kl": 0.40859375, + "learning_rate": 1.7704345521698057e-05, + "loss": 0.0163, + "reward": 1.0984375, + "reward_std": 0.2110065519809723, + "rewards/accuracy_reward": 0.13515625, + "rewards/format_reward": 0.96328125, + "step": 675 + }, + { + "completion_length": 116.196875, + "epoch": 0.3003782336213799, + "grad_norm": 0.4660269021987915, + "kl": 0.397998046875, + "learning_rate": 1.765493142067076e-05, + "loss": 0.0159, + "reward": 1.14140625, + "reward_std": 0.23255243562161923, + "rewards/accuracy_reward": 0.1765625, + "rewards/format_reward": 0.96484375, + "step": 680 + }, + { + "completion_length": 104.7328125, + "epoch": 0.30258689710389003, + "grad_norm": 0.5599631071090698, + "kl": 0.39521484375, + "learning_rate": 1.7605061678055453e-05, + "loss": 0.0158, + "reward": 1.11953125, + "reward_std": 0.17798166144639255, + "rewards/accuracy_reward": 0.1359375, + "rewards/format_reward": 0.98359375, + "step": 685 + }, + { + "completion_length": 129.27890625, + "epoch": 0.30479556058640017, + "grad_norm": 0.4298873543739319, + "kl": 0.3538818359375, + "learning_rate": 1.7554739262229965e-05, + "loss": 0.0142, + "reward": 1.12265625, + "reward_std": 0.25020663160830736, + "rewards/accuracy_reward": 0.16328125, + "rewards/format_reward": 0.959375, + "step": 690 + }, + { + "completion_length": 126.02890625, + "epoch": 0.3070042240689103, + "grad_norm": 0.4924304485321045, + "kl": 0.373681640625, + "learning_rate": 1.7503967168516426e-05, + "loss": 0.015, + "reward": 1.11953125, + "reward_std": 0.2316643577069044, + "rewards/accuracy_reward": 0.159375, + "rewards/format_reward": 0.96015625, + "step": 695 + }, + { + "completion_length": 112.05, + "epoch": 0.30921288755142046, + "grad_norm": 0.5005078315734863, + "kl": 0.364013671875, + "learning_rate": 1.7452748419002968e-05, + "loss": 0.0146, + "reward": 1.14296875, + "reward_std": 0.20688416287302971, + "rewards/accuracy_reward": 0.17109375, + "rewards/format_reward": 0.971875, + "step": 700 + }, + { + "epoch": 0.30921288755142046, + "eval_completion_length": 114.4858334350586, + "eval_kl": 0.38703125, + "eval_loss": 0.015563694760203362, + "eval_reward": 1.1183333349227906, + "eval_reward_std": 0.22717599272727967, + "eval_rewards/accuracy_reward": 0.15583333373069763, + "eval_rewards/format_reward": 0.9625, + "eval_runtime": 118.9296, + "eval_samples_per_second": 0.832, + "eval_steps_per_second": 0.034, + "step": 700 + }, + { + "completion_length": 116.096875, + "epoch": 0.3114215510339306, + "grad_norm": 0.4761113226413727, + "kl": 0.3679443359375, + "learning_rate": 1.740108606236385e-05, + "loss": 0.0147, + "reward": 1.1265625, + "reward_std": 0.20904745440930128, + "rewards/accuracy_reward": 0.16015625, + "rewards/format_reward": 0.96640625, + "step": 705 + }, + { + "completion_length": 134.46640625, + "epoch": 0.31363021451644074, + "grad_norm": 0.7244411110877991, + "kl": 0.38353271484375, + "learning_rate": 1.7348983173677986e-05, + "loss": 0.0153, + "reward": 1.0765625, + "reward_std": 0.23576183728873729, + "rewards/accuracy_reward": 0.13515625, + "rewards/format_reward": 0.94140625, + "step": 710 + }, + { + "completion_length": 99.53828125, + "epoch": 0.3158388779989509, + "grad_norm": 0.4390712380409241, + "kl": 0.4088623046875, + "learning_rate": 1.7296442854245915e-05, + "loss": 0.0164, + "reward": 1.1109375, + "reward_std": 0.1941352991387248, + "rewards/accuracy_reward": 0.14453125, + "rewards/format_reward": 0.96640625, + "step": 715 + }, + { + "completion_length": 84.16640625, + "epoch": 0.318047541481461, + "grad_norm": 0.8809035420417786, + "kl": 0.4468994140625, + "learning_rate": 1.72434682314052e-05, + "loss": 0.0179, + "reward": 1.15390625, + "reward_std": 0.1831468353047967, + "rewards/accuracy_reward": 0.1765625, + "rewards/format_reward": 0.97734375, + "step": 720 + }, + { + "completion_length": 82.83203125, + "epoch": 0.32025620496397117, + "grad_norm": 0.7408865690231323, + "kl": 0.4578857421875, + "learning_rate": 1.719006245834429e-05, + "loss": 0.0183, + "reward": 1.11328125, + "reward_std": 0.16447940673679112, + "rewards/accuracy_reward": 0.13828125, + "rewards/format_reward": 0.975, + "step": 725 + }, + { + "completion_length": 124.11171875, + "epoch": 0.3224648684464813, + "grad_norm": 0.4459853172302246, + "kl": 803.6734375, + "learning_rate": 1.7136228713914805e-05, + "loss": 32.0277, + "reward": 1.0515625, + "reward_std": 0.2270077530294657, + "rewards/accuracy_reward": 0.10703125, + "rewards/format_reward": 0.94453125, + "step": 730 + }, + { + "completion_length": 116.0078125, + "epoch": 0.32467353192899145, + "grad_norm": 0.8283806443214417, + "kl": 0.4873046875, + "learning_rate": 1.7081970202442363e-05, + "loss": 0.0195, + "reward": 1.10625, + "reward_std": 0.24761096592992543, + "rewards/accuracy_reward": 0.15390625, + "rewards/format_reward": 0.95234375, + "step": 735 + }, + { + "completion_length": 75.5578125, + "epoch": 0.3268821954115016, + "grad_norm": 0.5588904023170471, + "kl": 0.44139404296875, + "learning_rate": 1.7027290153535826e-05, + "loss": 0.0177, + "reward": 1.16015625, + "reward_std": 0.16179091222584246, + "rewards/accuracy_reward": 0.165625, + "rewards/format_reward": 0.99453125, + "step": 740 + }, + { + "completion_length": 114.0390625, + "epoch": 0.32909085889401174, + "grad_norm": 0.43830907344818115, + "kl": 0.3618896484375, + "learning_rate": 1.6972191821895065e-05, + "loss": 0.0145, + "reward": 1.10390625, + "reward_std": 0.14213568177074193, + "rewards/accuracy_reward": 0.11953125, + "rewards/format_reward": 0.984375, + "step": 745 + }, + { + "completion_length": 154.39921875, + "epoch": 0.3312995223765219, + "grad_norm": 0.34231725335121155, + "kl": 0.31685791015625, + "learning_rate": 1.691667848711723e-05, + "loss": 0.0127, + "reward": 1.10546875, + "reward_std": 0.19847314581274986, + "rewards/accuracy_reward": 0.128125, + "rewards/format_reward": 0.97734375, + "step": 750 + }, + { + "completion_length": 178.4203125, + "epoch": 0.3335081858590321, + "grad_norm": 0.524643063545227, + "kl": 3.5463134765625, + "learning_rate": 1.686075345350156e-05, + "loss": 0.1422, + "reward": 1.0859375, + "reward_std": 0.2875434797257185, + "rewards/accuracy_reward": 0.14609375, + "rewards/format_reward": 0.93984375, + "step": 755 + }, + { + "completion_length": 147.73203125, + "epoch": 0.3357168493415422, + "grad_norm": 0.3619137704372406, + "kl": 0.3319091796875, + "learning_rate": 1.6804420049852676e-05, + "loss": 0.0133, + "reward": 1.1296875, + "reward_std": 0.23654117435216904, + "rewards/accuracy_reward": 0.165625, + "rewards/format_reward": 0.9640625, + "step": 760 + }, + { + "completion_length": 121.4109375, + "epoch": 0.33792551282405237, + "grad_norm": 0.36858057975769043, + "kl": 0.32564697265625, + "learning_rate": 1.6747681629282468e-05, + "loss": 0.013, + "reward": 1.16875, + "reward_std": 0.2164825988933444, + "rewards/accuracy_reward": 0.18671875, + "rewards/format_reward": 0.98203125, + "step": 765 + }, + { + "completion_length": 108.2828125, + "epoch": 0.3401341763065625, + "grad_norm": 0.4734199047088623, + "kl": 0.35654296875, + "learning_rate": 1.6690541569010474e-05, + "loss": 0.0143, + "reward": 1.13828125, + "reward_std": 0.20721396785229446, + "rewards/accuracy_reward": 0.15859375, + "rewards/format_reward": 0.9796875, + "step": 770 + }, + { + "completion_length": 126.0984375, + "epoch": 0.34234283978907265, + "grad_norm": 0.47700235247612, + "kl": 0.3507568359375, + "learning_rate": 1.6633003270162903e-05, + "loss": 0.014, + "reward": 1.1484375, + "reward_std": 0.20858664382249117, + "rewards/accuracy_reward": 0.1734375, + "rewards/format_reward": 0.975, + "step": 775 + }, + { + "completion_length": 166.13125, + "epoch": 0.3445515032715828, + "grad_norm": 0.39217832684516907, + "kl": 0.35064697265625, + "learning_rate": 1.6575070157570152e-05, + "loss": 0.014, + "reward": 1.13515625, + "reward_std": 0.2673689084127545, + "rewards/accuracy_reward": 0.18359375, + "rewards/format_reward": 0.9515625, + "step": 780 + }, + { + "completion_length": 169.01328125, + "epoch": 0.34676016675409294, + "grad_norm": 0.39597201347351074, + "kl": 0.34754638671875, + "learning_rate": 1.6516745679562977e-05, + "loss": 0.0139, + "reward": 1.065625, + "reward_std": 0.2720937805250287, + "rewards/accuracy_reward": 0.1328125, + "rewards/format_reward": 0.9328125, + "step": 785 + }, + { + "completion_length": 182.8359375, + "epoch": 0.3489688302366031, + "grad_norm": 0.47019919753074646, + "kl": 0.34156494140625, + "learning_rate": 1.6458033307767217e-05, + "loss": 0.0137, + "reward": 1.08046875, + "reward_std": 0.3094723552465439, + "rewards/accuracy_reward": 0.16640625, + "rewards/format_reward": 0.9140625, + "step": 790 + }, + { + "completion_length": 129.996875, + "epoch": 0.3511774937191132, + "grad_norm": 1.1053619384765625, + "kl": 0.59439697265625, + "learning_rate": 1.6398936536897182e-05, + "loss": 0.0238, + "reward": 1.09609375, + "reward_std": 0.27469405010342596, + "rewards/accuracy_reward": 0.1703125, + "rewards/format_reward": 0.92578125, + "step": 795 + }, + { + "completion_length": 56.90234375, + "epoch": 0.35338615720162336, + "grad_norm": 0.5199185609817505, + "kl": 0.6048828125, + "learning_rate": 1.6339458884547613e-05, + "loss": 0.0242, + "reward": 1.14140625, + "reward_std": 0.15660744477063418, + "rewards/accuracy_reward": 0.15390625, + "rewards/format_reward": 0.9875, + "step": 800 + }, + { + "epoch": 0.35338615720162336, + "eval_completion_length": 43.742083358764646, + "eval_kl": 0.65765625, + "eval_loss": 0.026582278311252594, + "eval_reward": 1.1675, + "eval_reward_std": 0.16831182479858398, + "eval_rewards/accuracy_reward": 0.1725, + "eval_rewards/format_reward": 0.995, + "eval_runtime": 49.7219, + "eval_samples_per_second": 1.991, + "eval_steps_per_second": 0.08, + "step": 800 + }, + { + "completion_length": 64.53046875, + "epoch": 0.3555948206841335, + "grad_norm": 0.4935765266418457, + "kl": 0.5750732421875, + "learning_rate": 1.6279603890984315e-05, + "loss": 0.023, + "reward": 1.14765625, + "reward_std": 0.20158261395990848, + "rewards/accuracy_reward": 0.16015625, + "rewards/format_reward": 0.9875, + "step": 805 + }, + { + "completion_length": 93.0140625, + "epoch": 0.35780348416664365, + "grad_norm": 0.5734583139419556, + "kl": 0.4998291015625, + "learning_rate": 1.6219375118933442e-05, + "loss": 0.02, + "reward": 1.14765625, + "reward_std": 0.2238040953874588, + "rewards/accuracy_reward": 0.17890625, + "rewards/format_reward": 0.96875, + "step": 810 + }, + { + "completion_length": 134.42578125, + "epoch": 0.3600121476491538, + "grad_norm": 0.49560049176216125, + "kl": 0.4668212890625, + "learning_rate": 1.6158776153369406e-05, + "loss": 0.0187, + "reward": 1.0765625, + "reward_std": 0.29757872987538575, + "rewards/accuracy_reward": 0.14453125, + "rewards/format_reward": 0.93203125, + "step": 815 + }, + { + "completion_length": 158.89296875, + "epoch": 0.36222081113166393, + "grad_norm": 0.5122950077056885, + "kl": 0.46318359375, + "learning_rate": 1.609781060130152e-05, + "loss": 0.0185, + "reward": 1.01171875, + "reward_std": 0.29846451599150897, + "rewards/accuracy_reward": 0.1046875, + "rewards/format_reward": 0.90703125, + "step": 820 + }, + { + "completion_length": 116.4859375, + "epoch": 0.3644294746141741, + "grad_norm": 0.5021364092826843, + "kl": 0.4975830078125, + "learning_rate": 1.6036482091559287e-05, + "loss": 0.0199, + "reward": 1.12109375, + "reward_std": 0.2664882358163595, + "rewards/accuracy_reward": 0.165625, + "rewards/format_reward": 0.95546875, + "step": 825 + }, + { + "completion_length": 131.8703125, + "epoch": 0.3666381380966842, + "grad_norm": 0.6027432680130005, + "kl": 0.4765380859375, + "learning_rate": 1.5974794274576394e-05, + "loss": 0.0191, + "reward": 1.12578125, + "reward_std": 0.28144511561840774, + "rewards/accuracy_reward": 0.17265625, + "rewards/format_reward": 0.953125, + "step": 830 + }, + { + "completion_length": 134.915625, + "epoch": 0.3688468015791944, + "grad_norm": 0.49983125925064087, + "kl": 0.4760986328125, + "learning_rate": 1.5912750822173446e-05, + "loss": 0.019, + "reward": 1.134375, + "reward_std": 0.29985770154744384, + "rewards/accuracy_reward": 0.20078125, + "rewards/format_reward": 0.93359375, + "step": 835 + }, + { + "completion_length": 91.075, + "epoch": 0.37105546506170456, + "grad_norm": 0.4414427876472473, + "kl": 0.6185791015625, + "learning_rate": 1.5850355427339398e-05, + "loss": 0.0247, + "reward": 1.1, + "reward_std": 0.2404505180194974, + "rewards/accuracy_reward": 0.14375, + "rewards/format_reward": 0.95625, + "step": 840 + }, + { + "completion_length": 61.1109375, + "epoch": 0.3732641285442147, + "grad_norm": 0.46862563490867615, + "kl": 0.6706787109375, + "learning_rate": 1.5787611804011735e-05, + "loss": 0.0268, + "reward": 1.16875, + "reward_std": 0.17206176780164242, + "rewards/accuracy_reward": 0.18828125, + "rewards/format_reward": 0.98046875, + "step": 845 + }, + { + "completion_length": 78.2640625, + "epoch": 0.37547279202672484, + "grad_norm": 0.7195978164672852, + "kl": 0.6672119140625, + "learning_rate": 1.5724523686855423e-05, + "loss": 0.0267, + "reward": 1.13203125, + "reward_std": 0.1878314608708024, + "rewards/accuracy_reward": 0.16328125, + "rewards/format_reward": 0.96875, + "step": 850 + }, + { + "completion_length": 103.509375, + "epoch": 0.377681455509235, + "grad_norm": 0.5233339071273804, + "kl": 0.6544677734375, + "learning_rate": 1.56610948310406e-05, + "loss": 0.0262, + "reward": 1.13359375, + "reward_std": 0.24245705269277096, + "rewards/accuracy_reward": 0.1734375, + "rewards/format_reward": 0.96015625, + "step": 855 + }, + { + "completion_length": 97.459375, + "epoch": 0.37989011899174513, + "grad_norm": 0.4953531324863434, + "kl": 0.575927734375, + "learning_rate": 1.5597329012019065e-05, + "loss": 0.023, + "reward": 1.15859375, + "reward_std": 0.2319757068529725, + "rewards/accuracy_reward": 0.19609375, + "rewards/format_reward": 0.9625, + "step": 860 + }, + { + "completion_length": 126.03515625, + "epoch": 0.38209878247425527, + "grad_norm": 1.7415945529937744, + "kl": 0.5650390625, + "learning_rate": 1.5533230025299547e-05, + "loss": 0.0226, + "reward": 1.0765625, + "reward_std": 0.25536851994693277, + "rewards/accuracy_reward": 0.13828125, + "rewards/format_reward": 0.93828125, + "step": 865 + }, + { + "completion_length": 92.45625, + "epoch": 0.3843074459567654, + "grad_norm": 0.5169078707695007, + "kl": 0.5216796875, + "learning_rate": 1.5468801686221793e-05, + "loss": 0.0209, + "reward": 1.14765625, + "reward_std": 0.21833606492727994, + "rewards/accuracy_reward": 0.18203125, + "rewards/format_reward": 0.965625, + "step": 870 + }, + { + "completion_length": 111.06875, + "epoch": 0.38651610943927556, + "grad_norm": 1.1891313791275024, + "kl": 0.5312255859375, + "learning_rate": 1.540404782972946e-05, + "loss": 0.0213, + "reward": 1.08828125, + "reward_std": 0.2383767468854785, + "rewards/accuracy_reward": 0.1390625, + "rewards/format_reward": 0.94921875, + "step": 875 + }, + { + "completion_length": 96.92109375, + "epoch": 0.3887247729217857, + "grad_norm": 0.49358442425727844, + "kl": 0.54716796875, + "learning_rate": 1.5338972310141863e-05, + "loss": 0.0219, + "reward": 1.14296875, + "reward_std": 0.2417955880984664, + "rewards/accuracy_reward": 0.18359375, + "rewards/format_reward": 0.959375, + "step": 880 + }, + { + "completion_length": 103.10234375, + "epoch": 0.39093343640429584, + "grad_norm": 0.45550188422203064, + "kl": 0.5120849609375, + "learning_rate": 1.5273579000924545e-05, + "loss": 0.0205, + "reward": 1.12265625, + "reward_std": 0.230152091011405, + "rewards/accuracy_reward": 0.1609375, + "rewards/format_reward": 0.96171875, + "step": 885 + }, + { + "completion_length": 79.07578125, + "epoch": 0.393142099886806, + "grad_norm": 0.4740158021450043, + "kl": 0.751025390625, + "learning_rate": 1.5207871794458715e-05, + "loss": 0.03, + "reward": 1.17578125, + "reward_std": 0.19955000430345535, + "rewards/accuracy_reward": 0.19375, + "rewards/format_reward": 0.98203125, + "step": 890 + }, + { + "completion_length": 105.76171875, + "epoch": 0.3953507633693161, + "grad_norm": 0.37959805130958557, + "kl": 0.492724609375, + "learning_rate": 1.5141854601809583e-05, + "loss": 0.0197, + "reward": 1.15, + "reward_std": 0.21266062185168266, + "rewards/accuracy_reward": 0.18125, + "rewards/format_reward": 0.96875, + "step": 895 + }, + { + "completion_length": 106.47109375, + "epoch": 0.39755942685182627, + "grad_norm": 0.4141887128353119, + "kl": 0.47958984375, + "learning_rate": 1.5075531352493528e-05, + "loss": 0.0192, + "reward": 1.125, + "reward_std": 0.23161781765520573, + "rewards/accuracy_reward": 0.1609375, + "rewards/format_reward": 0.9640625, + "step": 900 + }, + { + "epoch": 0.39755942685182627, + "eval_completion_length": 128.19291748046874, + "eval_kl": 0.500859375, + "eval_loss": 0.02018117904663086, + "eval_reward": 1.1125, + "eval_reward_std": 0.26222177892923354, + "eval_rewards/accuracy_reward": 0.1604166667163372, + "eval_rewards/format_reward": 0.9520833349227905, + "eval_runtime": 156.2795, + "eval_samples_per_second": 0.633, + "eval_steps_per_second": 0.026, + "step": 900 + }, + { + "completion_length": 115.94921875, + "epoch": 0.3997680903343364, + "grad_norm": 0.4243628680706024, + "kl": 0.49537353515625, + "learning_rate": 1.5008905994244255e-05, + "loss": 0.0198, + "reward": 1.10234375, + "reward_std": 0.225167977437377, + "rewards/accuracy_reward": 0.15078125, + "rewards/format_reward": 0.9515625, + "step": 905 + }, + { + "completion_length": 72.07578125, + "epoch": 0.40197675381684655, + "grad_norm": 0.4959592819213867, + "kl": 0.5396484375, + "learning_rate": 1.4941982492777749e-05, + "loss": 0.0216, + "reward": 1.1703125, + "reward_std": 0.1914088014513254, + "rewards/accuracy_reward": 0.18984375, + "rewards/format_reward": 0.98046875, + "step": 910 + }, + { + "completion_length": 84.55390625, + "epoch": 0.40418541729935675, + "grad_norm": 0.6311984062194824, + "kl": 0.56240234375, + "learning_rate": 1.4874764831556285e-05, + "loss": 0.0225, + "reward": 1.15625, + "reward_std": 0.20792635306715965, + "rewards/accuracy_reward": 0.1890625, + "rewards/format_reward": 0.9671875, + "step": 915 + }, + { + "completion_length": 115.203125, + "epoch": 0.4063940807818669, + "grad_norm": 0.5944263935089111, + "kl": 0.4927734375, + "learning_rate": 1.4807257011551297e-05, + "loss": 0.0197, + "reward": 1.15703125, + "reward_std": 0.26277261301875116, + "rewards/accuracy_reward": 0.21015625, + "rewards/format_reward": 0.946875, + "step": 920 + }, + { + "completion_length": 111.07578125, + "epoch": 0.40860274426437704, + "grad_norm": 0.7743093967437744, + "kl": 0.488720703125, + "learning_rate": 1.4739463051005221e-05, + "loss": 0.0196, + "reward": 1.103125, + "reward_std": 0.2299284663051367, + "rewards/accuracy_reward": 0.14453125, + "rewards/format_reward": 0.95859375, + "step": 925 + }, + { + "completion_length": 95.52109375, + "epoch": 0.4108114077468872, + "grad_norm": 0.7759421467781067, + "kl": 0.5315185546875, + "learning_rate": 1.4671386985192327e-05, + "loss": 0.0213, + "reward": 1.1671875, + "reward_std": 0.1855922631919384, + "rewards/accuracy_reward": 0.17890625, + "rewards/format_reward": 0.98828125, + "step": 930 + }, + { + "completion_length": 108.65546875, + "epoch": 0.4130200712293973, + "grad_norm": 0.4987127482891083, + "kl": 0.45135498046875, + "learning_rate": 1.460303286617854e-05, + "loss": 0.0181, + "reward": 1.1625, + "reward_std": 0.17659219540655613, + "rewards/accuracy_reward": 0.16640625, + "rewards/format_reward": 0.99609375, + "step": 935 + }, + { + "completion_length": 144.80078125, + "epoch": 0.41522873471190747, + "grad_norm": 0.5061682462692261, + "kl": 5.969921875, + "learning_rate": 1.4534404762580239e-05, + "loss": 0.2394, + "reward": 1.16484375, + "reward_std": 0.21013734135776757, + "rewards/accuracy_reward": 0.17890625, + "rewards/format_reward": 0.9859375, + "step": 940 + }, + { + "completion_length": 113.9171875, + "epoch": 0.4174373981944176, + "grad_norm": 0.35856854915618896, + "kl": 0.4087646484375, + "learning_rate": 1.4465506759322074e-05, + "loss": 0.0164, + "reward": 1.196875, + "reward_std": 0.16952291671186687, + "rewards/accuracy_reward": 0.2015625, + "rewards/format_reward": 0.9953125, + "step": 945 + }, + { + "completion_length": 112.64765625, + "epoch": 0.41964606167692775, + "grad_norm": 0.3706095218658447, + "kl": 0.4105224609375, + "learning_rate": 1.4396342957393844e-05, + "loss": 0.0164, + "reward": 1.17265625, + "reward_std": 0.1865989552810788, + "rewards/accuracy_reward": 0.18203125, + "rewards/format_reward": 0.990625, + "step": 950 + }, + { + "completion_length": 137.5375, + "epoch": 0.4218547251594379, + "grad_norm": 1.2025071382522583, + "kl": 0.38111572265625, + "learning_rate": 1.4326917473606368e-05, + "loss": 0.0152, + "reward": 1.09921875, + "reward_std": 0.18654303345829248, + "rewards/accuracy_reward": 0.121875, + "rewards/format_reward": 0.97734375, + "step": 955 + }, + { + "completion_length": 160.10234375, + "epoch": 0.42406338864194804, + "grad_norm": 3.2986419200897217, + "kl": 0.59927978515625, + "learning_rate": 1.4257234440346469e-05, + "loss": 0.024, + "reward": 1.1609375, + "reward_std": 0.23215112816542388, + "rewards/accuracy_reward": 0.18828125, + "rewards/format_reward": 0.97265625, + "step": 960 + }, + { + "completion_length": 167.0, + "epoch": 0.4262720521244582, + "grad_norm": 0.610000729560852, + "kl": 0.558154296875, + "learning_rate": 1.4187298005330976e-05, + "loss": 0.0223, + "reward": 1.1421875, + "reward_std": 0.31437007896602154, + "rewards/accuracy_reward": 0.2078125, + "rewards/format_reward": 0.934375, + "step": 965 + }, + { + "completion_length": 104.16484375, + "epoch": 0.4284807156069683, + "grad_norm": 8.364782333374023, + "kl": 2.54658203125, + "learning_rate": 1.4117112331359865e-05, + "loss": 0.1018, + "reward": 1.115625, + "reward_std": 0.22522333543747663, + "rewards/accuracy_reward": 0.1671875, + "rewards/format_reward": 0.9484375, + "step": 970 + }, + { + "completion_length": 63.9890625, + "epoch": 0.43068937908947846, + "grad_norm": 0.708032488822937, + "kl": 1.1080322265625, + "learning_rate": 1.4046681596068468e-05, + "loss": 0.0444, + "reward": 1.171875, + "reward_std": 0.16694873906672, + "rewards/accuracy_reward": 0.19609375, + "rewards/format_reward": 0.97578125, + "step": 975 + }, + { + "completion_length": 60.2453125, + "epoch": 0.4328980425719886, + "grad_norm": 1.7320284843444824, + "kl": 1.096923828125, + "learning_rate": 1.3976009991678803e-05, + "loss": 0.0439, + "reward": 1.1546875, + "reward_std": 0.17392283789813517, + "rewards/accuracy_reward": 0.17578125, + "rewards/format_reward": 0.97890625, + "step": 980 + }, + { + "completion_length": 68.43359375, + "epoch": 0.43510670605449875, + "grad_norm": 0.7800001502037048, + "kl": 1.552880859375, + "learning_rate": 1.390510172475005e-05, + "loss": 0.0621, + "reward": 1.16875, + "reward_std": 0.20654550790786744, + "rewards/accuracy_reward": 0.1921875, + "rewards/format_reward": 0.9765625, + "step": 985 + }, + { + "completion_length": 57.4203125, + "epoch": 0.4373153695370089, + "grad_norm": 0.8300907015800476, + "kl": 0.723046875, + "learning_rate": 1.383396101592817e-05, + "loss": 0.0289, + "reward": 1.13984375, + "reward_std": 0.17493642698973416, + "rewards/accuracy_reward": 0.1546875, + "rewards/format_reward": 0.98515625, + "step": 990 + }, + { + "completion_length": 73.23671875, + "epoch": 0.4395240330195191, + "grad_norm": 2.696807384490967, + "kl": 1.3876220703125, + "learning_rate": 1.3762592099694666e-05, + "loss": 0.0555, + "reward": 1.1109375, + "reward_std": 0.2104167841374874, + "rewards/accuracy_reward": 0.14140625, + "rewards/format_reward": 0.96953125, + "step": 995 + }, + { + "completion_length": 132.46328125, + "epoch": 0.44173269650202923, + "grad_norm": 3.0761334896087646, + "kl": 3.6865478515625, + "learning_rate": 1.3690999224114547e-05, + "loss": 0.1477, + "reward": 1.0703125, + "reward_std": 0.2970853915438056, + "rewards/accuracy_reward": 0.1515625, + "rewards/format_reward": 0.91875, + "step": 1000 + }, + { + "epoch": 0.44173269650202923, + "eval_completion_length": 104.09083335876466, + "eval_kl": 1.039375, + "eval_loss": 0.03946812078356743, + "eval_reward": 1.1433333349227905, + "eval_reward_std": 0.2386816355586052, + "eval_rewards/accuracy_reward": 0.1854166667163372, + "eval_rewards/format_reward": 0.9579166674613953, + "eval_runtime": 111.3439, + "eval_samples_per_second": 0.889, + "eval_steps_per_second": 0.036, + "step": 1000 + }, + { + "completion_length": 103.2375, + "epoch": 0.4439413599845394, + "grad_norm": 1.7991042137145996, + "kl": 1.09813232421875, + "learning_rate": 1.361918665058348e-05, + "loss": 0.0439, + "reward": 1.13125, + "reward_std": 0.21907511353492737, + "rewards/accuracy_reward": 0.16015625, + "rewards/format_reward": 0.97109375, + "step": 1005 + }, + { + "completion_length": 106.32421875, + "epoch": 0.4461500234670495, + "grad_norm": 1.2199746370315552, + "kl": 0.9680908203125, + "learning_rate": 1.354715865357411e-05, + "loss": 0.0388, + "reward": 1.15390625, + "reward_std": 0.2513147694990039, + "rewards/accuracy_reward": 0.1828125, + "rewards/format_reward": 0.97109375, + "step": 1010 + }, + { + "completion_length": 84.5375, + "epoch": 0.44835868694955966, + "grad_norm": 0.8461725115776062, + "kl": 0.64466552734375, + "learning_rate": 1.3474919520381673e-05, + "loss": 0.0258, + "reward": 1.1921875, + "reward_std": 0.20436475947499275, + "rewards/accuracy_reward": 0.20234375, + "rewards/format_reward": 0.98984375, + "step": 1015 + }, + { + "completion_length": 95.0921875, + "epoch": 0.4505673504320698, + "grad_norm": 0.5356422066688538, + "kl": 0.674560546875, + "learning_rate": 1.3402473550868769e-05, + "loss": 0.027, + "reward": 1.178125, + "reward_std": 0.23662711773067713, + "rewards/accuracy_reward": 0.2046875, + "rewards/format_reward": 0.9734375, + "step": 1020 + }, + { + "completion_length": 126.2546875, + "epoch": 0.45277601391457994, + "grad_norm": 0.44370850920677185, + "kl": 0.50389404296875, + "learning_rate": 1.3329825057209446e-05, + "loss": 0.0202, + "reward": 1.1453125, + "reward_std": 0.2518596975132823, + "rewards/accuracy_reward": 0.20078125, + "rewards/format_reward": 0.94453125, + "step": 1025 + }, + { + "completion_length": 100.384375, + "epoch": 0.4549846773970901, + "grad_norm": 0.30460554361343384, + "kl": 0.5373046875, + "learning_rate": 1.3256978363632515e-05, + "loss": 0.0215, + "reward": 1.18984375, + "reward_std": 0.21483363024890423, + "rewards/accuracy_reward": 0.228125, + "rewards/format_reward": 0.96171875, + "step": 1030 + }, + { + "completion_length": 106.22578125, + "epoch": 0.45719334087960023, + "grad_norm": 0.5371220707893372, + "kl": 0.5292236328125, + "learning_rate": 1.3183937806164174e-05, + "loss": 0.0212, + "reward": 1.18671875, + "reward_std": 0.264132690615952, + "rewards/accuracy_reward": 0.23125, + "rewards/format_reward": 0.95546875, + "step": 1035 + }, + { + "completion_length": 90.2515625, + "epoch": 0.45940200436211037, + "grad_norm": 0.6759688854217529, + "kl": 0.541845703125, + "learning_rate": 1.3110707732369896e-05, + "loss": 0.0217, + "reward": 1.1765625, + "reward_std": 0.22657935097813606, + "rewards/accuracy_reward": 0.2109375, + "rewards/format_reward": 0.965625, + "step": 1040 + }, + { + "completion_length": 129.69609375, + "epoch": 0.4616106678446205, + "grad_norm": 0.425448477268219, + "kl": 0.549853515625, + "learning_rate": 1.3037292501095674e-05, + "loss": 0.022, + "reward": 1.128125, + "reward_std": 0.24908192362636328, + "rewards/accuracy_reward": 0.1953125, + "rewards/format_reward": 0.9328125, + "step": 1045 + }, + { + "completion_length": 100.40859375, + "epoch": 0.46381933132713066, + "grad_norm": 0.36143267154693604, + "kl": 0.5515869140625, + "learning_rate": 1.2963696482208552e-05, + "loss": 0.0221, + "reward": 1.16484375, + "reward_std": 0.22955130971968174, + "rewards/accuracy_reward": 0.1984375, + "rewards/format_reward": 0.96640625, + "step": 1050 + }, + { + "completion_length": 86.01484375, + "epoch": 0.4660279948096408, + "grad_norm": 0.3947383463382721, + "kl": 0.59249267578125, + "learning_rate": 1.2889924056336531e-05, + "loss": 0.0237, + "reward": 1.15703125, + "reward_std": 0.17950487434864043, + "rewards/accuracy_reward": 0.1875, + "rewards/format_reward": 0.96953125, + "step": 1055 + }, + { + "completion_length": 82.04375, + "epoch": 0.46823665829215094, + "grad_norm": 0.6859544515609741, + "kl": 0.5910888671875, + "learning_rate": 1.2815979614607818e-05, + "loss": 0.0236, + "reward": 1.1953125, + "reward_std": 0.20693482737988234, + "rewards/accuracy_reward": 0.21953125, + "rewards/format_reward": 0.97578125, + "step": 1060 + }, + { + "completion_length": 95.7609375, + "epoch": 0.4704453217746611, + "grad_norm": 0.36446619033813477, + "kl": 0.541748046875, + "learning_rate": 1.274186755838945e-05, + "loss": 0.0217, + "reward": 1.19375, + "reward_std": 0.2092124553397298, + "rewards/accuracy_reward": 0.2203125, + "rewards/format_reward": 0.9734375, + "step": 1065 + }, + { + "completion_length": 139.6734375, + "epoch": 0.4726539852571712, + "grad_norm": 0.5893868803977966, + "kl": 0.471240234375, + "learning_rate": 1.2667592299025331e-05, + "loss": 0.0188, + "reward": 1.1484375, + "reward_std": 0.23675706721842288, + "rewards/accuracy_reward": 0.19453125, + "rewards/format_reward": 0.95390625, + "step": 1070 + }, + { + "completion_length": 116.73828125, + "epoch": 0.4748626487396814, + "grad_norm": 0.4791277348995209, + "kl": 0.50758056640625, + "learning_rate": 1.259315825757362e-05, + "loss": 0.0203, + "reward": 1.11796875, + "reward_std": 0.20144069343805313, + "rewards/accuracy_reward": 0.15, + "rewards/format_reward": 0.96796875, + "step": 1075 + }, + { + "completion_length": 115.9921875, + "epoch": 0.47707131222219157, + "grad_norm": 0.5381553173065186, + "kl": 0.49412841796875, + "learning_rate": 1.251856986454363e-05, + "loss": 0.0198, + "reward": 1.19765625, + "reward_std": 0.24683814216405153, + "rewards/accuracy_reward": 0.23359375, + "rewards/format_reward": 0.9640625, + "step": 1080 + }, + { + "completion_length": 122.90390625, + "epoch": 0.4792799757047017, + "grad_norm": 0.5010456442832947, + "kl": 0.4677001953125, + "learning_rate": 1.2443831559632065e-05, + "loss": 0.0187, + "reward": 1.15546875, + "reward_std": 0.2554084826260805, + "rewards/accuracy_reward": 0.1984375, + "rewards/format_reward": 0.95703125, + "step": 1085 + }, + { + "completion_length": 151.96953125, + "epoch": 0.48148863918721185, + "grad_norm": 0.6068748235702515, + "kl": 0.54783935546875, + "learning_rate": 1.2368947791458785e-05, + "loss": 0.0219, + "reward": 1.14296875, + "reward_std": 0.31096592992544175, + "rewards/accuracy_reward": 0.21015625, + "rewards/format_reward": 0.9328125, + "step": 1090 + }, + { + "completion_length": 125.5328125, + "epoch": 0.483697302669722, + "grad_norm": 0.37912383675575256, + "kl": 0.45006103515625, + "learning_rate": 1.2293923017302004e-05, + "loss": 0.018, + "reward": 1.1578125, + "reward_std": 0.2355531807988882, + "rewards/accuracy_reward": 0.20859375, + "rewards/format_reward": 0.94921875, + "step": 1095 + }, + { + "completion_length": 93.94375, + "epoch": 0.48590596615223214, + "grad_norm": 0.45108431577682495, + "kl": 0.535595703125, + "learning_rate": 1.221876170283298e-05, + "loss": 0.0214, + "reward": 1.16640625, + "reward_std": 0.1948365481570363, + "rewards/accuracy_reward": 0.19609375, + "rewards/format_reward": 0.9703125, + "step": 1100 + }, + { + "epoch": 0.48590596615223214, + "eval_completion_length": 87.3554167175293, + "eval_kl": 0.49109375, + "eval_loss": 0.019712308421730995, + "eval_reward": 1.206666669845581, + "eval_reward_std": 0.194391932785511, + "eval_rewards/accuracy_reward": 0.2316666667163372, + "eval_rewards/format_reward": 0.975, + "eval_runtime": 107.5828, + "eval_samples_per_second": 0.92, + "eval_steps_per_second": 0.037, + "step": 1100 + }, + { + "completion_length": 93.66171875, + "epoch": 0.4881146296347423, + "grad_norm": 0.3893759548664093, + "kl": 0.50633544921875, + "learning_rate": 1.214346832185021e-05, + "loss": 0.0203, + "reward": 1.18046875, + "reward_std": 0.19928432293236256, + "rewards/accuracy_reward": 0.20625, + "rewards/format_reward": 0.97421875, + "step": 1105 + }, + { + "completion_length": 113.6171875, + "epoch": 0.4903232931172524, + "grad_norm": 0.5400230288505554, + "kl": 0.63984375, + "learning_rate": 1.2068047356013136e-05, + "loss": 0.0256, + "reward": 1.13359375, + "reward_std": 0.24417215697467326, + "rewards/accuracy_reward": 0.18203125, + "rewards/format_reward": 0.9515625, + "step": 1110 + }, + { + "completion_length": 112.42421875, + "epoch": 0.49253195659976257, + "grad_norm": 0.4314129054546356, + "kl": 0.5706787109375, + "learning_rate": 1.1992503294575385e-05, + "loss": 0.0228, + "reward": 1.16171875, + "reward_std": 0.21959545239806175, + "rewards/accuracy_reward": 0.203125, + "rewards/format_reward": 0.95859375, + "step": 1115 + }, + { + "completion_length": 116.0015625, + "epoch": 0.4947406200822727, + "grad_norm": 1.4012691974639893, + "kl": 0.51864013671875, + "learning_rate": 1.1916840634117555e-05, + "loss": 0.0207, + "reward": 1.1296875, + "reward_std": 0.2404359621927142, + "rewards/accuracy_reward": 0.17421875, + "rewards/format_reward": 0.95546875, + "step": 1120 + }, + { + "completion_length": 119.28984375, + "epoch": 0.49694928356478285, + "grad_norm": 0.530860960483551, + "kl": 0.5022705078125, + "learning_rate": 1.1841063878279572e-05, + "loss": 0.0201, + "reward": 1.18203125, + "reward_std": 0.25134353432804346, + "rewards/accuracy_reward": 0.2265625, + "rewards/format_reward": 0.95546875, + "step": 1125 + }, + { + "completion_length": 108.6421875, + "epoch": 0.499157947047293, + "grad_norm": 0.4978342056274414, + "kl": 0.541162109375, + "learning_rate": 1.1765177537492616e-05, + "loss": 0.0216, + "reward": 1.18359375, + "reward_std": 0.24454718120396138, + "rewards/accuracy_reward": 0.225, + "rewards/format_reward": 0.95859375, + "step": 1130 + }, + { + "completion_length": 85.57890625, + "epoch": 0.5013666105298031, + "grad_norm": 0.4762633144855499, + "kl": 0.542431640625, + "learning_rate": 1.1689186128710654e-05, + "loss": 0.0217, + "reward": 1.19609375, + "reward_std": 0.1994084009900689, + "rewards/accuracy_reward": 0.21796875, + "rewards/format_reward": 0.978125, + "step": 1135 + }, + { + "completion_length": 98.32109375, + "epoch": 0.5035752740123133, + "grad_norm": 0.4662761092185974, + "kl": 0.5237060546875, + "learning_rate": 1.1613094175141568e-05, + "loss": 0.0209, + "reward": 1.17734375, + "reward_std": 0.1960198676213622, + "rewards/accuracy_reward": 0.2, + "rewards/format_reward": 0.97734375, + "step": 1140 + }, + { + "completion_length": 129.81640625, + "epoch": 0.5057839374948234, + "grad_norm": 0.6282123923301697, + "kl": 0.52607421875, + "learning_rate": 1.1536906205977936e-05, + "loss": 0.021, + "reward": 1.2, + "reward_std": 0.24621726330369711, + "rewards/accuracy_reward": 0.2453125, + "rewards/format_reward": 0.9546875, + "step": 1145 + }, + { + "completion_length": 144.84921875, + "epoch": 0.5079926009773336, + "grad_norm": 0.8076626658439636, + "kl": 0.6271484375, + "learning_rate": 1.1460626756127431e-05, + "loss": 0.0251, + "reward": 1.2, + "reward_std": 0.2839036539196968, + "rewards/accuracy_reward": 0.25546875, + "rewards/format_reward": 0.94453125, + "step": 1150 + }, + { + "completion_length": 133.01328125, + "epoch": 0.5102012644598437, + "grad_norm": 0.7102019190788269, + "kl": 0.682958984375, + "learning_rate": 1.1384260365942905e-05, + "loss": 0.0273, + "reward": 1.14140625, + "reward_std": 0.22996564749628307, + "rewards/accuracy_reward": 0.184375, + "rewards/format_reward": 0.95703125, + "step": 1155 + }, + { + "completion_length": 118.7953125, + "epoch": 0.5124099279423538, + "grad_norm": 0.6336015462875366, + "kl": 0.6332275390625, + "learning_rate": 1.1307811580952113e-05, + "loss": 0.0253, + "reward": 1.17734375, + "reward_std": 0.2275516463443637, + "rewards/accuracy_reward": 0.2140625, + "rewards/format_reward": 0.96328125, + "step": 1160 + }, + { + "completion_length": 125.0515625, + "epoch": 0.514618591424864, + "grad_norm": 0.5651206374168396, + "kl": 0.5558349609375, + "learning_rate": 1.123128495158718e-05, + "loss": 0.0222, + "reward": 1.2, + "reward_std": 0.22535169757902623, + "rewards/accuracy_reward": 0.234375, + "rewards/format_reward": 0.965625, + "step": 1165 + }, + { + "completion_length": 150.834375, + "epoch": 0.5168272549073741, + "grad_norm": 0.5256941914558411, + "kl": 0.49346923828125, + "learning_rate": 1.1154685032913719e-05, + "loss": 0.0197, + "reward": 1.13046875, + "reward_std": 0.22142891082912683, + "rewards/accuracy_reward": 0.17109375, + "rewards/format_reward": 0.959375, + "step": 1170 + }, + { + "completion_length": 147.46875, + "epoch": 0.5190359183898843, + "grad_norm": 1.1057101488113403, + "kl": 0.4740234375, + "learning_rate": 1.1078016384359725e-05, + "loss": 0.019, + "reward": 1.17734375, + "reward_std": 0.2350118851289153, + "rewards/accuracy_reward": 0.21484375, + "rewards/format_reward": 0.9625, + "step": 1175 + }, + { + "completion_length": 165.5859375, + "epoch": 0.5212445818723944, + "grad_norm": 0.5781757235527039, + "kl": 0.5138427734375, + "learning_rate": 1.100128356944417e-05, + "loss": 0.0206, + "reward": 1.15703125, + "reward_std": 0.3072576764971018, + "rewards/accuracy_reward": 0.23515625, + "rewards/format_reward": 0.921875, + "step": 1180 + }, + { + "completion_length": 133.359375, + "epoch": 0.5234532453549046, + "grad_norm": 0.5776464343070984, + "kl": 0.7113037109375, + "learning_rate": 1.0924491155505375e-05, + "loss": 0.0285, + "reward": 1.121875, + "reward_std": 0.3050299068912864, + "rewards/accuracy_reward": 0.19765625, + "rewards/format_reward": 0.92421875, + "step": 1185 + }, + { + "completion_length": 119.27421875, + "epoch": 0.5256619088374147, + "grad_norm": 0.7527931928634644, + "kl": 0.5626953125, + "learning_rate": 1.0847643713429155e-05, + "loss": 0.0225, + "reward": 1.16015625, + "reward_std": 0.2789567396044731, + "rewards/accuracy_reward": 0.2171875, + "rewards/format_reward": 0.94296875, + "step": 1190 + }, + { + "completion_length": 75.74609375, + "epoch": 0.527870572319925, + "grad_norm": 0.5353002548217773, + "kl": 0.5966064453125, + "learning_rate": 1.0770745817376741e-05, + "loss": 0.0239, + "reward": 1.171875, + "reward_std": 0.1676252031698823, + "rewards/accuracy_reward": 0.19140625, + "rewards/format_reward": 0.98046875, + "step": 1195 + }, + { + "completion_length": 74.70078125, + "epoch": 0.5300792358024351, + "grad_norm": 0.7278808951377869, + "kl": 0.5812255859375, + "learning_rate": 1.0693802044512525e-05, + "loss": 0.0233, + "reward": 1.17734375, + "reward_std": 0.16255829595029353, + "rewards/accuracy_reward": 0.1890625, + "rewards/format_reward": 0.98828125, + "step": 1200 + }, + { + "epoch": 0.5300792358024351, + "eval_completion_length": 84.3741668701172, + "eval_kl": 0.55484375, + "eval_loss": 0.022250505164265633, + "eval_reward": 1.195, + "eval_reward_std": 0.15836685180664062, + "eval_rewards/accuracy_reward": 0.2025, + "eval_rewards/format_reward": 0.9925, + "eval_runtime": 102.7549, + "eval_samples_per_second": 0.963, + "eval_steps_per_second": 0.039, + "step": 1200 + }, + { + "completion_length": 89.65546875, + "epoch": 0.5322878992849452, + "grad_norm": 0.8118963241577148, + "kl": 0.56036376953125, + "learning_rate": 1.061681697473159e-05, + "loss": 0.0224, + "reward": 1.23828125, + "reward_std": 0.16189471799880267, + "rewards/accuracy_reward": 0.24609375, + "rewards/format_reward": 0.9921875, + "step": 1205 + }, + { + "completion_length": 171.05078125, + "epoch": 0.5344965627674554, + "grad_norm": 1.0046565532684326, + "kl": 1.409912109375, + "learning_rate": 1.0539795190387141e-05, + "loss": 0.0564, + "reward": 1.14140625, + "reward_std": 0.23411216996610165, + "rewards/accuracy_reward": 0.17578125, + "rewards/format_reward": 0.965625, + "step": 1210 + }, + { + "completion_length": 244.696875, + "epoch": 0.5367052262499655, + "grad_norm": 1.4813671112060547, + "kl": 1.9112060546875, + "learning_rate": 1.0462741276017711e-05, + "loss": 0.0765, + "reward": 1.12265625, + "reward_std": 0.312072067707777, + "rewards/accuracy_reward": 0.203125, + "rewards/format_reward": 0.91953125, + "step": 1215 + }, + { + "completion_length": 137.07109375, + "epoch": 0.5389138897324757, + "grad_norm": 0.44882670044898987, + "kl": 1.1479248046875, + "learning_rate": 1.038565981807431e-05, + "loss": 0.0459, + "reward": 1.165625, + "reward_std": 0.24441679026931523, + "rewards/accuracy_reward": 0.2, + "rewards/format_reward": 0.965625, + "step": 1220 + }, + { + "completion_length": 112.965625, + "epoch": 0.5411225532149858, + "grad_norm": 1.2693246603012085, + "kl": 0.73072509765625, + "learning_rate": 1.0308555404647407e-05, + "loss": 0.0292, + "reward": 1.190625, + "reward_std": 0.2190632749348879, + "rewards/accuracy_reward": 0.21875, + "rewards/format_reward": 0.971875, + "step": 1225 + }, + { + "completion_length": 109.203125, + "epoch": 0.543331216697496, + "grad_norm": 0.6727134585380554, + "kl": 0.66895751953125, + "learning_rate": 1.0231432625193842e-05, + "loss": 0.0267, + "reward": 1.23359375, + "reward_std": 0.2255825974047184, + "rewards/accuracy_reward": 0.25546875, + "rewards/format_reward": 0.978125, + "step": 1230 + }, + { + "completion_length": 114.903125, + "epoch": 0.5455398801800061, + "grad_norm": 0.5706783533096313, + "kl": 0.6170654296875, + "learning_rate": 1.0154296070263649e-05, + "loss": 0.0247, + "reward": 1.1875, + "reward_std": 0.2010749163106084, + "rewards/accuracy_reward": 0.2125, + "rewards/format_reward": 0.975, + "step": 1235 + }, + { + "completion_length": 121.5609375, + "epoch": 0.5477485436625162, + "grad_norm": 0.6042254567146301, + "kl": 0.58348388671875, + "learning_rate": 1.0077150331226822e-05, + "loss": 0.0233, + "reward": 1.25546875, + "reward_std": 0.22948720771819353, + "rewards/accuracy_reward": 0.2875, + "rewards/format_reward": 0.96796875, + "step": 1240 + }, + { + "completion_length": 130.03984375, + "epoch": 0.5499572071450264, + "grad_norm": 0.6111878752708435, + "kl": 0.63865966796875, + "learning_rate": 1e-05, + "loss": 0.0256, + "reward": 1.19375, + "reward_std": 0.25420499257743356, + "rewards/accuracy_reward": 0.234375, + "rewards/format_reward": 0.959375, + "step": 1245 + }, + { + "completion_length": 154.44453125, + "epoch": 0.5521658706275365, + "grad_norm": 2.512364387512207, + "kl": 0.91964111328125, + "learning_rate": 9.922849668773181e-06, + "loss": 0.0368, + "reward": 1.12890625, + "reward_std": 0.29293579459190366, + "rewards/accuracy_reward": 0.203125, + "rewards/format_reward": 0.92578125, + "step": 1250 + }, + { + "completion_length": 129.0, + "epoch": 0.5543745341100467, + "grad_norm": 5.317835807800293, + "kl": 1.59818115234375, + "learning_rate": 9.845703929736351e-06, + "loss": 0.0639, + "reward": 1.18984375, + "reward_std": 0.26693961266428234, + "rewards/accuracy_reward": 0.23984375, + "rewards/format_reward": 0.95, + "step": 1255 + }, + { + "completion_length": 108.20859375, + "epoch": 0.5565831975925568, + "grad_norm": 0.5913572311401367, + "kl": 1.3113525390625, + "learning_rate": 9.768567374806163e-06, + "loss": 0.0524, + "reward": 1.153125, + "reward_std": 0.2126995487138629, + "rewards/accuracy_reward": 0.18359375, + "rewards/format_reward": 0.96953125, + "step": 1260 + }, + { + "completion_length": 107.97890625, + "epoch": 0.558791861075067, + "grad_norm": 1.2350952625274658, + "kl": 0.59656982421875, + "learning_rate": 9.691444595352596e-06, + "loss": 0.0239, + "reward": 1.23359375, + "reward_std": 0.21166725642979145, + "rewards/accuracy_reward": 0.2640625, + "rewards/format_reward": 0.96953125, + "step": 1265 + }, + { + "completion_length": 145.5171875, + "epoch": 0.5610005245575771, + "grad_norm": 0.784472644329071, + "kl": 0.96728515625, + "learning_rate": 9.614340181925692e-06, + "loss": 0.0387, + "reward": 1.15625, + "reward_std": 0.2822375038638711, + "rewards/accuracy_reward": 0.21328125, + "rewards/format_reward": 0.94296875, + "step": 1270 + }, + { + "completion_length": 147.246875, + "epoch": 0.5632091880400872, + "grad_norm": 0.712846577167511, + "kl": 0.7251220703125, + "learning_rate": 9.53725872398229e-06, + "loss": 0.029, + "reward": 1.11328125, + "reward_std": 0.28186143897473814, + "rewards/accuracy_reward": 0.18984375, + "rewards/format_reward": 0.9234375, + "step": 1275 + }, + { + "completion_length": 83.80546875, + "epoch": 0.5654178515225974, + "grad_norm": 0.5126023888587952, + "kl": 0.632373046875, + "learning_rate": 9.460204809612864e-06, + "loss": 0.0253, + "reward": 1.12734375, + "reward_std": 0.193264627084136, + "rewards/accuracy_reward": 0.14609375, + "rewards/format_reward": 0.98125, + "step": 1280 + }, + { + "completion_length": 62.38515625, + "epoch": 0.5676265150051075, + "grad_norm": 0.3859677314758301, + "kl": 0.5292724609375, + "learning_rate": 9.383183025268411e-06, + "loss": 0.0212, + "reward": 1.21328125, + "reward_std": 0.15019516460597515, + "rewards/accuracy_reward": 0.21796875, + "rewards/format_reward": 0.9953125, + "step": 1285 + }, + { + "completion_length": 69.62578125, + "epoch": 0.5698351784876177, + "grad_norm": 0.4441126883029938, + "kl": 0.4895263671875, + "learning_rate": 9.306197955487479e-06, + "loss": 0.0196, + "reward": 1.22265625, + "reward_std": 0.1509174121543765, + "rewards/accuracy_reward": 0.22578125, + "rewards/format_reward": 0.996875, + "step": 1290 + }, + { + "completion_length": 90.8921875, + "epoch": 0.5720438419701278, + "grad_norm": 0.24060453474521637, + "kl": 0.4620361328125, + "learning_rate": 9.22925418262326e-06, + "loss": 0.0185, + "reward": 1.21015625, + "reward_std": 0.15958714820444583, + "rewards/accuracy_reward": 0.215625, + "rewards/format_reward": 0.99453125, + "step": 1295 + }, + { + "completion_length": 113.90625, + "epoch": 0.574252505452638, + "grad_norm": 0.5410070419311523, + "kl": 0.451025390625, + "learning_rate": 9.15235628657085e-06, + "loss": 0.018, + "reward": 1.253125, + "reward_std": 0.21325785480439663, + "rewards/accuracy_reward": 0.26953125, + "rewards/format_reward": 0.98359375, + "step": 1300 + }, + { + "epoch": 0.574252505452638, + "eval_completion_length": 121.16583343505859, + "eval_kl": 0.428828125, + "eval_loss": 0.017305398359894753, + "eval_reward": 1.22625, + "eval_reward_std": 0.23555977791547775, + "eval_rewards/accuracy_reward": 0.2533333334326744, + "eval_rewards/format_reward": 0.9729166674613953, + "eval_runtime": 139.5802, + "eval_samples_per_second": 0.709, + "eval_steps_per_second": 0.029, + "step": 1300 + }, + { + "completion_length": 123.81796875, + "epoch": 0.5764611689351481, + "grad_norm": 0.5413645505905151, + "kl": 0.43470458984375, + "learning_rate": 9.07550884449463e-06, + "loss": 0.0174, + "reward": 1.178125, + "reward_std": 0.20707119330763818, + "rewards/accuracy_reward": 0.209375, + "rewards/format_reward": 0.96875, + "step": 1305 + }, + { + "completion_length": 143.82578125, + "epoch": 0.5786698324176582, + "grad_norm": 0.4805835783481598, + "kl": 0.44522705078125, + "learning_rate": 8.998716430555832e-06, + "loss": 0.0178, + "reward": 1.128125, + "reward_std": 0.2421926449984312, + "rewards/accuracy_reward": 0.17890625, + "rewards/format_reward": 0.94921875, + "step": 1310 + }, + { + "completion_length": 146.4421875, + "epoch": 0.5808784959001684, + "grad_norm": 0.36650240421295166, + "kl": 0.50875244140625, + "learning_rate": 8.921983615640277e-06, + "loss": 0.0203, + "reward": 1.13515625, + "reward_std": 0.2266262538731098, + "rewards/accuracy_reward": 0.1765625, + "rewards/format_reward": 0.95859375, + "step": 1315 + }, + { + "completion_length": 125.47421875, + "epoch": 0.5830871593826785, + "grad_norm": 0.431325227022171, + "kl": 0.4361328125, + "learning_rate": 8.845314967086281e-06, + "loss": 0.0174, + "reward": 1.13984375, + "reward_std": 0.16203333698213102, + "rewards/accuracy_reward": 0.1609375, + "rewards/format_reward": 0.97890625, + "step": 1320 + }, + { + "completion_length": 116.61015625, + "epoch": 0.5852958228651887, + "grad_norm": 0.44710680842399597, + "kl": 0.41380615234375, + "learning_rate": 8.768715048412823e-06, + "loss": 0.0166, + "reward": 1.215625, + "reward_std": 0.19275038037449121, + "rewards/accuracy_reward": 0.228125, + "rewards/format_reward": 0.9875, + "step": 1325 + }, + { + "completion_length": 119.11171875, + "epoch": 0.5875044863476988, + "grad_norm": 0.49566569924354553, + "kl": 0.420166015625, + "learning_rate": 8.692188419047889e-06, + "loss": 0.0168, + "reward": 1.19375, + "reward_std": 0.1878614580258727, + "rewards/accuracy_reward": 0.20625, + "rewards/format_reward": 0.9875, + "step": 1330 + }, + { + "completion_length": 132.55390625, + "epoch": 0.589713149830209, + "grad_norm": 0.5140964388847351, + "kl": 0.447314453125, + "learning_rate": 8.615739634057098e-06, + "loss": 0.0179, + "reward": 1.21171875, + "reward_std": 0.19049166329205036, + "rewards/accuracy_reward": 0.2328125, + "rewards/format_reward": 0.97890625, + "step": 1335 + }, + { + "completion_length": 147.45390625, + "epoch": 0.5919218133127191, + "grad_norm": 0.5098828673362732, + "kl": 0.456298828125, + "learning_rate": 8.539373243872569e-06, + "loss": 0.0182, + "reward": 1.18359375, + "reward_std": 0.20728036612272263, + "rewards/accuracy_reward": 0.21171875, + "rewards/format_reward": 0.971875, + "step": 1340 + }, + { + "completion_length": 148.99296875, + "epoch": 0.5941304767952292, + "grad_norm": 0.5452316999435425, + "kl": 0.48438720703125, + "learning_rate": 8.463093794022069e-06, + "loss": 0.0194, + "reward": 1.2, + "reward_std": 0.26944386642426255, + "rewards/accuracy_reward": 0.24140625, + "rewards/format_reward": 0.95859375, + "step": 1345 + }, + { + "completion_length": 146.82578125, + "epoch": 0.5963391402777395, + "grad_norm": 0.3637787103652954, + "kl": 0.42889404296875, + "learning_rate": 8.386905824858436e-06, + "loss": 0.0172, + "reward": 1.165625, + "reward_std": 0.18749849069863558, + "rewards/accuracy_reward": 0.1921875, + "rewards/format_reward": 0.9734375, + "step": 1350 + }, + { + "completion_length": 163.79296875, + "epoch": 0.5985478037602496, + "grad_norm": 0.5319222807884216, + "kl": 0.41142578125, + "learning_rate": 8.310813871289349e-06, + "loss": 0.0165, + "reward": 1.1984375, + "reward_std": 0.25758711863309147, + "rewards/accuracy_reward": 0.2375, + "rewards/format_reward": 0.9609375, + "step": 1355 + }, + { + "completion_length": 158.078125, + "epoch": 0.6007564672427598, + "grad_norm": 0.4595475494861603, + "kl": 0.4110107421875, + "learning_rate": 8.234822462507384e-06, + "loss": 0.0164, + "reward": 1.2328125, + "reward_std": 0.21721374820917844, + "rewards/accuracy_reward": 0.25625, + "rewards/format_reward": 0.9765625, + "step": 1360 + }, + { + "completion_length": 145.98046875, + "epoch": 0.6029651307252699, + "grad_norm": 0.49845319986343384, + "kl": 0.41668701171875, + "learning_rate": 8.158936121720433e-06, + "loss": 0.0167, + "reward": 1.24609375, + "reward_std": 0.22847788464277982, + "rewards/accuracy_reward": 0.2765625, + "rewards/format_reward": 0.96953125, + "step": 1365 + }, + { + "completion_length": 171.2109375, + "epoch": 0.6051737942077801, + "grad_norm": 0.5033155083656311, + "kl": 0.4380126953125, + "learning_rate": 8.08315936588245e-06, + "loss": 0.0175, + "reward": 1.20390625, + "reward_std": 0.22734466083347799, + "rewards/accuracy_reward": 0.24765625, + "rewards/format_reward": 0.95625, + "step": 1370 + }, + { + "completion_length": 154.93125, + "epoch": 0.6073824576902902, + "grad_norm": 0.5040455460548401, + "kl": 0.41199951171875, + "learning_rate": 8.00749670542462e-06, + "loss": 0.0165, + "reward": 1.22734375, + "reward_std": 0.21946511473506689, + "rewards/accuracy_reward": 0.246875, + "rewards/format_reward": 0.98046875, + "step": 1375 + }, + { + "completion_length": 174.39609375, + "epoch": 0.6095911211728003, + "grad_norm": 0.6203243136405945, + "kl": 0.4758056640625, + "learning_rate": 7.931952643986866e-06, + "loss": 0.019, + "reward": 1.225, + "reward_std": 0.25668525900691747, + "rewards/accuracy_reward": 0.259375, + "rewards/format_reward": 0.965625, + "step": 1380 + }, + { + "completion_length": 148.69375, + "epoch": 0.6117997846553105, + "grad_norm": 3.571584939956665, + "kl": 0.6333251953125, + "learning_rate": 7.856531678149792e-06, + "loss": 0.0253, + "reward": 1.21015625, + "reward_std": 0.19479831736534833, + "rewards/accuracy_reward": 0.23359375, + "rewards/format_reward": 0.9765625, + "step": 1385 + }, + { + "completion_length": 163.81796875, + "epoch": 0.6140084481378206, + "grad_norm": 1.1779475212097168, + "kl": 0.9492919921875, + "learning_rate": 7.781238297167025e-06, + "loss": 0.0379, + "reward": 1.1703125, + "reward_std": 0.22253222949802876, + "rewards/accuracy_reward": 0.2109375, + "rewards/format_reward": 0.959375, + "step": 1390 + }, + { + "completion_length": 160.65625, + "epoch": 0.6162171116203308, + "grad_norm": 1.4942560195922852, + "kl": 1.04031982421875, + "learning_rate": 7.706076982698e-06, + "loss": 0.0416, + "reward": 1.2109375, + "reward_std": 0.25693559013307093, + "rewards/accuracy_reward": 0.2578125, + "rewards/format_reward": 0.953125, + "step": 1395 + }, + { + "completion_length": 142.009375, + "epoch": 0.6184257751028409, + "grad_norm": 0.8941807746887207, + "kl": 1.5090576171875, + "learning_rate": 7.631052208541217e-06, + "loss": 0.0605, + "reward": 1.24453125, + "reward_std": 0.24907068870961666, + "rewards/accuracy_reward": 0.27109375, + "rewards/format_reward": 0.9734375, + "step": 1400 + }, + { + "epoch": 0.6184257751028409, + "eval_completion_length": 126.1554168701172, + "eval_kl": 0.51203125, + "eval_loss": 0.020548321306705475, + "eval_reward": 1.2670833349227906, + "eval_reward_std": 0.2361640551686287, + "eval_rewards/accuracy_reward": 0.2795833334326744, + "eval_rewards/format_reward": 0.9875, + "eval_runtime": 108.4132, + "eval_samples_per_second": 0.913, + "eval_steps_per_second": 0.037, + "step": 1400 + }, + { + "completion_length": 129.46015625, + "epoch": 0.6206344385853511, + "grad_norm": 0.5738782286643982, + "kl": 0.52000732421875, + "learning_rate": 7.5561684403679355e-06, + "loss": 0.0208, + "reward": 1.159375, + "reward_std": 0.18879235051572324, + "rewards/accuracy_reward": 0.1828125, + "rewards/format_reward": 0.9765625, + "step": 1405 + }, + { + "completion_length": 160.28828125, + "epoch": 0.6228431020678612, + "grad_norm": 0.8489373326301575, + "kl": 0.78538818359375, + "learning_rate": 7.4814301354563735e-06, + "loss": 0.0314, + "reward": 1.1765625, + "reward_std": 0.2816866671666503, + "rewards/accuracy_reward": 0.23046875, + "rewards/format_reward": 0.94609375, + "step": 1410 + }, + { + "completion_length": 130.85, + "epoch": 0.6250517655503713, + "grad_norm": 0.4323514997959137, + "kl": 0.4590087890625, + "learning_rate": 7.40684174242638e-06, + "loss": 0.0184, + "reward": 1.1921875, + "reward_std": 0.20537027660757304, + "rewards/accuracy_reward": 0.21796875, + "rewards/format_reward": 0.97421875, + "step": 1415 + }, + { + "completion_length": 129.54921875, + "epoch": 0.6272604290328815, + "grad_norm": 0.4422205686569214, + "kl": 0.44378662109375, + "learning_rate": 7.332407700974673e-06, + "loss": 0.0178, + "reward": 1.2109375, + "reward_std": 0.22267536614090205, + "rewards/accuracy_reward": 0.23125, + "rewards/format_reward": 0.9796875, + "step": 1420 + }, + { + "completion_length": 156.784375, + "epoch": 0.6294690925153916, + "grad_norm": 0.6342864632606506, + "kl": 0.4815185546875, + "learning_rate": 7.258132441610548e-06, + "loss": 0.0193, + "reward": 1.225, + "reward_std": 0.27565329764038327, + "rewards/accuracy_reward": 0.2703125, + "rewards/format_reward": 0.9546875, + "step": 1425 + }, + { + "completion_length": 189.24375, + "epoch": 0.6316777559979018, + "grad_norm": 0.5509055256843567, + "kl": 0.6362548828125, + "learning_rate": 7.184020385392186e-06, + "loss": 0.0254, + "reward": 1.1234375, + "reward_std": 0.2952466538175941, + "rewards/accuracy_reward": 0.19765625, + "rewards/format_reward": 0.92578125, + "step": 1430 + }, + { + "completion_length": 125.340625, + "epoch": 0.6338864194804119, + "grad_norm": 0.44184309244155884, + "kl": 0.415966796875, + "learning_rate": 7.110075943663473e-06, + "loss": 0.0166, + "reward": 1.25625, + "reward_std": 0.20973300114274024, + "rewards/accuracy_reward": 0.2796875, + "rewards/format_reward": 0.9765625, + "step": 1435 + }, + { + "completion_length": 110.86875, + "epoch": 0.636095082962922, + "grad_norm": 0.393185019493103, + "kl": 0.45828857421875, + "learning_rate": 7.0363035177914505e-06, + "loss": 0.0183, + "reward": 1.196875, + "reward_std": 0.18387279994785785, + "rewards/accuracy_reward": 0.2203125, + "rewards/format_reward": 0.9765625, + "step": 1440 + }, + { + "completion_length": 121.4484375, + "epoch": 0.6383037464454322, + "grad_norm": 0.791002094745636, + "kl": 0.54757080078125, + "learning_rate": 6.962707498904331e-06, + "loss": 0.0219, + "reward": 1.21328125, + "reward_std": 0.2349924026057124, + "rewards/accuracy_reward": 0.2453125, + "rewards/format_reward": 0.96796875, + "step": 1445 + }, + { + "completion_length": 145.55703125, + "epoch": 0.6405124099279423, + "grad_norm": 0.44890421628952026, + "kl": 0.5147216796875, + "learning_rate": 6.889292267630106e-06, + "loss": 0.0206, + "reward": 1.19375, + "reward_std": 0.24552099388092757, + "rewards/accuracy_reward": 0.22890625, + "rewards/format_reward": 0.96484375, + "step": 1450 + }, + { + "completion_length": 142.5640625, + "epoch": 0.6427210734104525, + "grad_norm": 0.47677525877952576, + "kl": 0.44825439453125, + "learning_rate": 6.81606219383583e-06, + "loss": 0.0179, + "reward": 1.22109375, + "reward_std": 0.18264568988233804, + "rewards/accuracy_reward": 0.2375, + "rewards/format_reward": 0.98359375, + "step": 1455 + }, + { + "completion_length": 141.29765625, + "epoch": 0.6449297368929626, + "grad_norm": 0.5547141432762146, + "kl": 0.45455322265625, + "learning_rate": 6.743021636367488e-06, + "loss": 0.0182, + "reward": 1.24296875, + "reward_std": 0.20987400207668544, + "rewards/accuracy_reward": 0.2703125, + "rewards/format_reward": 0.97265625, + "step": 1460 + }, + { + "completion_length": 141.034375, + "epoch": 0.6471384003754728, + "grad_norm": 0.43943309783935547, + "kl": 0.45238037109375, + "learning_rate": 6.670174942790557e-06, + "loss": 0.0181, + "reward": 1.2390625, + "reward_std": 0.2042137583717704, + "rewards/accuracy_reward": 0.26484375, + "rewards/format_reward": 0.97421875, + "step": 1465 + }, + { + "completion_length": 159.996875, + "epoch": 0.6493470638579829, + "grad_norm": 0.5421745181083679, + "kl": 0.55458984375, + "learning_rate": 6.597526449131232e-06, + "loss": 0.0222, + "reward": 1.26875, + "reward_std": 0.25485040955245497, + "rewards/accuracy_reward": 0.30703125, + "rewards/format_reward": 0.96171875, + "step": 1470 + }, + { + "completion_length": 152.93984375, + "epoch": 0.651555727340493, + "grad_norm": 0.49404215812683105, + "kl": 0.4667724609375, + "learning_rate": 6.525080479618331e-06, + "loss": 0.0187, + "reward": 1.1671875, + "reward_std": 0.22337155733257533, + "rewards/accuracy_reward": 0.1984375, + "rewards/format_reward": 0.96875, + "step": 1475 + }, + { + "completion_length": 142.815625, + "epoch": 0.6537643908230032, + "grad_norm": 0.5135136842727661, + "kl": 0.438818359375, + "learning_rate": 6.452841346425891e-06, + "loss": 0.0176, + "reward": 1.2640625, + "reward_std": 0.24292335454374553, + "rewards/accuracy_reward": 0.2875, + "rewards/format_reward": 0.9765625, + "step": 1480 + }, + { + "completion_length": 133.865625, + "epoch": 0.6559730543055133, + "grad_norm": 0.5439188480377197, + "kl": 0.40977783203125, + "learning_rate": 6.380813349416523e-06, + "loss": 0.0164, + "reward": 1.2625, + "reward_std": 0.21160587538033723, + "rewards/accuracy_reward": 0.27578125, + "rewards/format_reward": 0.98671875, + "step": 1485 + }, + { + "completion_length": 141.23671875, + "epoch": 0.6581817177880235, + "grad_norm": 0.5219649076461792, + "kl": 0.4312255859375, + "learning_rate": 6.309000775885452e-06, + "loss": 0.0172, + "reward": 1.21796875, + "reward_std": 0.21080582737922668, + "rewards/accuracy_reward": 0.24375, + "rewards/format_reward": 0.97421875, + "step": 1490 + }, + { + "completion_length": 156.421875, + "epoch": 0.6603903812705336, + "grad_norm": 0.6208651065826416, + "kl": 0.45198974609375, + "learning_rate": 6.237407900305334e-06, + "loss": 0.0181, + "reward": 1.21328125, + "reward_std": 0.21545952204614877, + "rewards/accuracy_reward": 0.2453125, + "rewards/format_reward": 0.96796875, + "step": 1495 + }, + { + "completion_length": 157.92578125, + "epoch": 0.6625990447530438, + "grad_norm": 0.514742910861969, + "kl": 0.445703125, + "learning_rate": 6.166038984071833e-06, + "loss": 0.0178, + "reward": 1.20859375, + "reward_std": 0.24552375469356774, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 0.95859375, + "step": 1500 + }, + { + "epoch": 0.6625990447530438, + "eval_completion_length": 150.59583343505858, + "eval_kl": 0.510390625, + "eval_loss": 0.020548084750771523, + "eval_reward": 1.2470833349227906, + "eval_reward_std": 0.23129627466201783, + "eval_rewards/accuracy_reward": 0.2833333334326744, + "eval_rewards/format_reward": 0.96375, + "eval_runtime": 130.7052, + "eval_samples_per_second": 0.757, + "eval_steps_per_second": 0.031, + "step": 1500 + }, + { + "completion_length": 146.13515625, + "epoch": 0.6648077082355539, + "grad_norm": 0.46206313371658325, + "kl": 0.51572265625, + "learning_rate": 6.094898275249952e-06, + "loss": 0.0206, + "reward": 1.2828125, + "reward_std": 0.23776858411729335, + "rewards/accuracy_reward": 0.3203125, + "rewards/format_reward": 0.9625, + "step": 1505 + }, + { + "completion_length": 115.98671875, + "epoch": 0.6670163717180642, + "grad_norm": 0.7776006460189819, + "kl": 0.4756591796875, + "learning_rate": 6.023990008321199e-06, + "loss": 0.019, + "reward": 1.24765625, + "reward_std": 0.21786664836108685, + "rewards/accuracy_reward": 0.26953125, + "rewards/format_reward": 0.978125, + "step": 1510 + }, + { + "completion_length": 125.35625, + "epoch": 0.6692250352005743, + "grad_norm": 0.4395429193973541, + "kl": 0.5373779296875, + "learning_rate": 5.953318403931533e-06, + "loss": 0.0215, + "reward": 1.20703125, + "reward_std": 0.22481790594756604, + "rewards/accuracy_reward": 0.24765625, + "rewards/format_reward": 0.959375, + "step": 1515 + }, + { + "completion_length": 99.33359375, + "epoch": 0.6714336986830844, + "grad_norm": 0.45579713582992554, + "kl": 0.524560546875, + "learning_rate": 5.882887668640138e-06, + "loss": 0.021, + "reward": 1.18828125, + "reward_std": 0.16194322612136602, + "rewards/accuracy_reward": 0.1984375, + "rewards/format_reward": 0.98984375, + "step": 1520 + }, + { + "completion_length": 126.44296875, + "epoch": 0.6736423621655946, + "grad_norm": 0.42683523893356323, + "kl": 0.4496826171875, + "learning_rate": 5.812701994669028e-06, + "loss": 0.018, + "reward": 1.221875, + "reward_std": 0.24250736236572265, + "rewards/accuracy_reward": 0.24921875, + "rewards/format_reward": 0.97265625, + "step": 1525 + }, + { + "completion_length": 179.19921875, + "epoch": 0.6758510256481047, + "grad_norm": 0.9905825257301331, + "kl": 0.49107666015625, + "learning_rate": 5.742765559653537e-06, + "loss": 0.0197, + "reward": 1.16875, + "reward_std": 0.25706158187240363, + "rewards/accuracy_reward": 0.2171875, + "rewards/format_reward": 0.9515625, + "step": 1530 + }, + { + "completion_length": 223.625, + "epoch": 0.6780596891306149, + "grad_norm": 0.45996785163879395, + "kl": 0.45472412109375, + "learning_rate": 5.673082526393634e-06, + "loss": 0.0182, + "reward": 1.1953125, + "reward_std": 0.2674042139202356, + "rewards/accuracy_reward": 0.2484375, + "rewards/format_reward": 0.946875, + "step": 1535 + }, + { + "completion_length": 198.6390625, + "epoch": 0.680268352613125, + "grad_norm": 0.37783390283584595, + "kl": 0.353515625, + "learning_rate": 5.603657042606163e-06, + "loss": 0.0141, + "reward": 1.178125, + "reward_std": 0.21960081458091735, + "rewards/accuracy_reward": 0.21171875, + "rewards/format_reward": 0.96640625, + "step": 1540 + }, + { + "completion_length": 197.028125, + "epoch": 0.6824770160956352, + "grad_norm": 0.6316084265708923, + "kl": 0.368896484375, + "learning_rate": 5.53449324067793e-06, + "loss": 0.0148, + "reward": 1.171875, + "reward_std": 0.2550745034590364, + "rewards/accuracy_reward": 0.21640625, + "rewards/format_reward": 0.95546875, + "step": 1545 + }, + { + "completion_length": 172.515625, + "epoch": 0.6846856795781453, + "grad_norm": 0.5714349746704102, + "kl": 0.41336669921875, + "learning_rate": 5.465595237419768e-06, + "loss": 0.0165, + "reward": 1.265625, + "reward_std": 0.23817113135010004, + "rewards/accuracy_reward": 0.3140625, + "rewards/format_reward": 0.9515625, + "step": 1550 + }, + { + "completion_length": 164.86171875, + "epoch": 0.6868943430606554, + "grad_norm": 0.31497815251350403, + "kl": 0.40863037109375, + "learning_rate": 5.396967133821461e-06, + "loss": 0.0164, + "reward": 1.24296875, + "reward_std": 0.22363899704068899, + "rewards/accuracy_reward": 0.278125, + "rewards/format_reward": 0.96484375, + "step": 1555 + }, + { + "completion_length": 144.21171875, + "epoch": 0.6891030065431656, + "grad_norm": 0.3973537087440491, + "kl": 0.39749755859375, + "learning_rate": 5.3286130148076765e-06, + "loss": 0.0159, + "reward": 1.2203125, + "reward_std": 0.20798758920282126, + "rewards/accuracy_reward": 0.24375, + "rewards/format_reward": 0.9765625, + "step": 1560 + }, + { + "completion_length": 126.52421875, + "epoch": 0.6913116700256757, + "grad_norm": 0.5487494468688965, + "kl": 0.41744384765625, + "learning_rate": 5.260536948994786e-06, + "loss": 0.0167, + "reward": 1.246875, + "reward_std": 0.2289178878068924, + "rewards/accuracy_reward": 0.26640625, + "rewards/format_reward": 0.98046875, + "step": 1565 + }, + { + "completion_length": 153.5671875, + "epoch": 0.6935203335081859, + "grad_norm": 1.368481993675232, + "kl": 0.47470703125, + "learning_rate": 5.192742988448707e-06, + "loss": 0.019, + "reward": 1.22890625, + "reward_std": 0.24065108597278595, + "rewards/accuracy_reward": 0.2828125, + "rewards/format_reward": 0.94609375, + "step": 1570 + }, + { + "completion_length": 152.64296875, + "epoch": 0.695728996990696, + "grad_norm": 0.460273414850235, + "kl": 0.53028564453125, + "learning_rate": 5.125235168443714e-06, + "loss": 0.0212, + "reward": 1.22734375, + "reward_std": 0.26983388569206, + "rewards/accuracy_reward": 0.28125, + "rewards/format_reward": 0.94609375, + "step": 1575 + }, + { + "completion_length": 121.12890625, + "epoch": 0.6979376604732062, + "grad_norm": 0.5840798020362854, + "kl": 0.4253173828125, + "learning_rate": 5.058017507222254e-06, + "loss": 0.017, + "reward": 1.215625, + "reward_std": 0.20102577321231366, + "rewards/accuracy_reward": 0.240625, + "rewards/format_reward": 0.975, + "step": 1580 + }, + { + "completion_length": 110.19375, + "epoch": 0.7001463239557163, + "grad_norm": 0.48586878180503845, + "kl": 0.4316650390625, + "learning_rate": 4.99109400575575e-06, + "loss": 0.0173, + "reward": 1.28984375, + "reward_std": 0.19885436855256558, + "rewards/accuracy_reward": 0.30546875, + "rewards/format_reward": 0.984375, + "step": 1585 + }, + { + "completion_length": 128.54609375, + "epoch": 0.7023549874382264, + "grad_norm": 1.0986440181732178, + "kl": 0.42666015625, + "learning_rate": 4.924468647506473e-06, + "loss": 0.0171, + "reward": 1.22421875, + "reward_std": 0.23164935149252414, + "rewards/accuracy_reward": 0.2515625, + "rewards/format_reward": 0.97265625, + "step": 1590 + }, + { + "completion_length": 149.9328125, + "epoch": 0.7045636509207366, + "grad_norm": 0.595619797706604, + "kl": 0.46484375, + "learning_rate": 4.8581453981904205e-06, + "loss": 0.0186, + "reward": 1.25078125, + "reward_std": 0.2856699053198099, + "rewards/accuracy_reward": 0.2984375, + "rewards/format_reward": 0.95234375, + "step": 1595 + }, + { + "completion_length": 156.828125, + "epoch": 0.7067723144032467, + "grad_norm": 0.4311577081680298, + "kl": 0.40791015625, + "learning_rate": 4.792128205541286e-06, + "loss": 0.0163, + "reward": 1.18359375, + "reward_std": 0.2468837944790721, + "rewards/accuracy_reward": 0.2296875, + "rewards/format_reward": 0.95390625, + "step": 1600 + }, + { + "epoch": 0.7067723144032467, + "eval_completion_length": 135.73458335876464, + "eval_kl": 0.39359375, + "eval_loss": 0.015719087794423103, + "eval_reward": 1.2620833349227905, + "eval_reward_std": 0.2296227565407753, + "eval_rewards/accuracy_reward": 0.2908333334326744, + "eval_rewards/format_reward": 0.97125, + "eval_runtime": 135.2265, + "eval_samples_per_second": 0.732, + "eval_steps_per_second": 0.03, + "step": 1600 + }, + { + "completion_length": 158.7234375, + "epoch": 0.7089809778857569, + "grad_norm": 0.485534131526947, + "kl": 0.39541015625, + "learning_rate": 4.7264209990754594e-06, + "loss": 0.0158, + "reward": 1.265625, + "reward_std": 0.2561331497505307, + "rewards/accuracy_reward": 0.30703125, + "rewards/format_reward": 0.95859375, + "step": 1605 + }, + { + "completion_length": 163.1359375, + "epoch": 0.711189641368267, + "grad_norm": 0.38874199986457825, + "kl": 0.3953125, + "learning_rate": 4.661027689858142e-06, + "loss": 0.0158, + "reward": 1.15859375, + "reward_std": 0.23531383704394102, + "rewards/accuracy_reward": 0.2046875, + "rewards/format_reward": 0.95390625, + "step": 1610 + }, + { + "completion_length": 131.1453125, + "epoch": 0.7133983048507772, + "grad_norm": 0.7737115025520325, + "kl": 0.432568359375, + "learning_rate": 4.595952170270542e-06, + "loss": 0.0173, + "reward": 1.23828125, + "reward_std": 0.24948414210230113, + "rewards/accuracy_reward": 0.26953125, + "rewards/format_reward": 0.96875, + "step": 1615 + }, + { + "completion_length": 129.2015625, + "epoch": 0.7156069683332873, + "grad_norm": 0.44618454575538635, + "kl": 0.44493408203125, + "learning_rate": 4.5311983137782116e-06, + "loss": 0.0178, + "reward": 1.19453125, + "reward_std": 0.18757406566292048, + "rewards/accuracy_reward": 0.221875, + "rewards/format_reward": 0.97265625, + "step": 1620 + }, + { + "completion_length": 128.13671875, + "epoch": 0.7178156318157974, + "grad_norm": 0.4870763421058655, + "kl": 0.44984130859375, + "learning_rate": 4.4667699747004555e-06, + "loss": 0.018, + "reward": 1.22734375, + "reward_std": 0.2222797654569149, + "rewards/accuracy_reward": 0.2640625, + "rewards/format_reward": 0.96328125, + "step": 1625 + }, + { + "completion_length": 128.1109375, + "epoch": 0.7200242952983076, + "grad_norm": 0.830633819103241, + "kl": 0.471240234375, + "learning_rate": 4.402670987980938e-06, + "loss": 0.0189, + "reward": 1.21796875, + "reward_std": 0.2280710056424141, + "rewards/accuracy_reward": 0.24921875, + "rewards/format_reward": 0.96875, + "step": 1630 + }, + { + "completion_length": 116.93515625, + "epoch": 0.7222329587808177, + "grad_norm": 0.5954543948173523, + "kl": 0.46390380859375, + "learning_rate": 4.3389051689594e-06, + "loss": 0.0186, + "reward": 1.2171875, + "reward_std": 0.21377347223460674, + "rewards/accuracy_reward": 0.24296875, + "rewards/format_reward": 0.97421875, + "step": 1635 + }, + { + "completion_length": 118.27734375, + "epoch": 0.7244416222633279, + "grad_norm": 0.42961063981056213, + "kl": 0.41763916015625, + "learning_rate": 4.275476313144578e-06, + "loss": 0.0167, + "reward": 1.2125, + "reward_std": 0.21657640542834997, + "rewards/accuracy_reward": 0.23828125, + "rewards/format_reward": 0.97421875, + "step": 1640 + }, + { + "completion_length": 113.2859375, + "epoch": 0.726650285745838, + "grad_norm": 0.4778992831707001, + "kl": 0.46826171875, + "learning_rate": 4.212388195988267e-06, + "loss": 0.0187, + "reward": 1.221875, + "reward_std": 0.2040594968944788, + "rewards/accuracy_reward": 0.240625, + "rewards/format_reward": 0.98125, + "step": 1645 + }, + { + "completion_length": 134.99296875, + "epoch": 0.7288589492283482, + "grad_norm": 0.868116021156311, + "kl": 0.4645263671875, + "learning_rate": 4.1496445726606064e-06, + "loss": 0.0186, + "reward": 1.246875, + "reward_std": 0.20862858258187772, + "rewards/accuracy_reward": 0.2734375, + "rewards/format_reward": 0.9734375, + "step": 1650 + }, + { + "completion_length": 167.1515625, + "epoch": 0.7310676127108583, + "grad_norm": 0.5153465867042542, + "kl": 0.4582275390625, + "learning_rate": 4.087249177826553e-06, + "loss": 0.0183, + "reward": 1.26953125, + "reward_std": 0.29226357098668815, + "rewards/accuracy_reward": 0.3171875, + "rewards/format_reward": 0.95234375, + "step": 1655 + }, + { + "completion_length": 167.3953125, + "epoch": 0.7332762761933684, + "grad_norm": 0.9335393905639648, + "kl": 0.4902099609375, + "learning_rate": 4.025205725423607e-06, + "loss": 0.0196, + "reward": 1.2296875, + "reward_std": 0.3044658374041319, + "rewards/accuracy_reward": 0.2875, + "rewards/format_reward": 0.9421875, + "step": 1660 + }, + { + "completion_length": 124.8671875, + "epoch": 0.7354849396758786, + "grad_norm": 0.4761280119419098, + "kl": 0.4821533203125, + "learning_rate": 3.963517908440716e-06, + "loss": 0.0193, + "reward": 1.22578125, + "reward_std": 0.19983983058482407, + "rewards/accuracy_reward": 0.246875, + "rewards/format_reward": 0.97890625, + "step": 1665 + }, + { + "completion_length": 127.8421875, + "epoch": 0.7376936031583888, + "grad_norm": 0.5155916810035706, + "kl": 0.43746337890625, + "learning_rate": 3.902189398698482e-06, + "loss": 0.0175, + "reward": 1.18515625, + "reward_std": 0.17090727612376214, + "rewards/accuracy_reward": 0.2078125, + "rewards/format_reward": 0.97734375, + "step": 1670 + }, + { + "completion_length": 146.315625, + "epoch": 0.739902266640899, + "grad_norm": 0.3942101001739502, + "kl": 0.41949462890625, + "learning_rate": 3.841223846630599e-06, + "loss": 0.0168, + "reward": 1.21328125, + "reward_std": 0.19338544998317958, + "rewards/accuracy_reward": 0.23671875, + "rewards/format_reward": 0.9765625, + "step": 1675 + }, + { + "completion_length": 170.7625, + "epoch": 0.7421109301234091, + "grad_norm": 0.4603167176246643, + "kl": 0.39854736328125, + "learning_rate": 3.7806248810665613e-06, + "loss": 0.0159, + "reward": 1.25078125, + "reward_std": 0.22261980101466178, + "rewards/accuracy_reward": 0.2734375, + "rewards/format_reward": 0.97734375, + "step": 1680 + }, + { + "completion_length": 195.67265625, + "epoch": 0.7443195936059193, + "grad_norm": 0.527126669883728, + "kl": 0.41444091796875, + "learning_rate": 3.720396109015686e-06, + "loss": 0.0166, + "reward": 1.2375, + "reward_std": 0.24860016535967588, + "rewards/accuracy_reward": 0.275, + "rewards/format_reward": 0.9625, + "step": 1685 + }, + { + "completion_length": 178.1625, + "epoch": 0.7465282570884294, + "grad_norm": 0.4369182884693146, + "kl": 0.37342529296875, + "learning_rate": 3.6605411154523885e-06, + "loss": 0.0149, + "reward": 1.24140625, + "reward_std": 0.22675297893583773, + "rewards/accuracy_reward": 0.26953125, + "rewards/format_reward": 0.971875, + "step": 1690 + }, + { + "completion_length": 168.7265625, + "epoch": 0.7487369205709395, + "grad_norm": 0.5954079031944275, + "kl": 0.381005859375, + "learning_rate": 3.601063463102823e-06, + "loss": 0.0152, + "reward": 1.21015625, + "reward_std": 0.25745327677577734, + "rewards/accuracy_reward": 0.24296875, + "rewards/format_reward": 0.9671875, + "step": 1695 + }, + { + "completion_length": 176.64375, + "epoch": 0.7509455840534497, + "grad_norm": 0.4956624507904053, + "kl": 0.423779296875, + "learning_rate": 3.5419666922327854e-06, + "loss": 0.017, + "reward": 1.24609375, + "reward_std": 0.25288699120283126, + "rewards/accuracy_reward": 0.28125, + "rewards/format_reward": 0.96484375, + "step": 1700 + }, + { + "epoch": 0.7509455840534497, + "eval_completion_length": 163.3483334350586, + "eval_kl": 0.376171875, + "eval_loss": 0.015068257227540016, + "eval_reward": 1.247916669845581, + "eval_reward_std": 0.2429444035887718, + "eval_rewards/accuracy_reward": 0.2904166667163372, + "eval_rewards/format_reward": 0.9575, + "eval_runtime": 148.8095, + "eval_samples_per_second": 0.665, + "eval_steps_per_second": 0.027, + "step": 1700 + }, + { + "completion_length": 165.946875, + "epoch": 0.7531542475359598, + "grad_norm": 0.4972885251045227, + "kl": 0.40225830078125, + "learning_rate": 3.4832543204370284e-06, + "loss": 0.0161, + "reward": 1.203125, + "reward_std": 0.26637718454003334, + "rewards/accuracy_reward": 0.2375, + "rewards/format_reward": 0.965625, + "step": 1705 + }, + { + "completion_length": 173.74609375, + "epoch": 0.75536291101847, + "grad_norm": 0.6974431872367859, + "kl": 0.40240478515625, + "learning_rate": 3.424929842429848e-06, + "loss": 0.0161, + "reward": 1.17734375, + "reward_std": 0.2468060377985239, + "rewards/accuracy_reward": 0.22109375, + "rewards/format_reward": 0.95625, + "step": 1710 + }, + { + "completion_length": 154.61953125, + "epoch": 0.7575715745009801, + "grad_norm": 0.43418002128601074, + "kl": 0.3802001953125, + "learning_rate": 3.366996729837102e-06, + "loss": 0.0152, + "reward": 1.2546875, + "reward_std": 0.26053862273693085, + "rewards/accuracy_reward": 0.29140625, + "rewards/format_reward": 0.96328125, + "step": 1715 + }, + { + "completion_length": 141.70625, + "epoch": 0.7597802379834903, + "grad_norm": 0.5373135805130005, + "kl": 0.39423828125, + "learning_rate": 3.309458430989527e-06, + "loss": 0.0158, + "reward": 1.2203125, + "reward_std": 0.216771724447608, + "rewards/accuracy_reward": 0.24765625, + "rewards/format_reward": 0.97265625, + "step": 1720 + }, + { + "completion_length": 142.44765625, + "epoch": 0.7619889014660004, + "grad_norm": 0.5140719413757324, + "kl": 0.40374755859375, + "learning_rate": 3.2523183707175366e-06, + "loss": 0.0161, + "reward": 1.234375, + "reward_std": 0.21065853331238032, + "rewards/accuracy_reward": 0.2671875, + "rewards/format_reward": 0.9671875, + "step": 1725 + }, + { + "completion_length": 158.240625, + "epoch": 0.7641975649485105, + "grad_norm": 0.6687895655632019, + "kl": 0.46646728515625, + "learning_rate": 3.1955799501473226e-06, + "loss": 0.0187, + "reward": 1.18515625, + "reward_std": 0.3053234376013279, + "rewards/accuracy_reward": 0.24453125, + "rewards/format_reward": 0.940625, + "step": 1730 + }, + { + "completion_length": 169.2078125, + "epoch": 0.7664062284310207, + "grad_norm": 0.4844968616962433, + "kl": 0.48839111328125, + "learning_rate": 3.1392465464984455e-06, + "loss": 0.0195, + "reward": 1.196875, + "reward_std": 0.29177020620554683, + "rewards/accuracy_reward": 0.26640625, + "rewards/format_reward": 0.93046875, + "step": 1735 + }, + { + "completion_length": 152.02890625, + "epoch": 0.7686148919135308, + "grad_norm": 0.5372802019119263, + "kl": 0.461181640625, + "learning_rate": 3.083321512882773e-06, + "loss": 0.0184, + "reward": 1.17421875, + "reward_std": 0.2651050504297018, + "rewards/accuracy_reward": 0.22421875, + "rewards/format_reward": 0.95, + "step": 1740 + }, + { + "completion_length": 129.86171875, + "epoch": 0.770823555396041, + "grad_norm": 0.781815767288208, + "kl": 1.108740234375, + "learning_rate": 3.0278081781049405e-06, + "loss": 0.0444, + "reward": 1.23125, + "reward_std": 0.25756825953722, + "rewards/accuracy_reward": 0.271875, + "rewards/format_reward": 0.959375, + "step": 1745 + }, + { + "completion_length": 134.19296875, + "epoch": 0.7730322188785511, + "grad_norm": 0.5345208644866943, + "kl": 0.44124755859375, + "learning_rate": 2.9727098464641735e-06, + "loss": 0.0177, + "reward": 1.26875, + "reward_std": 0.23357822820544244, + "rewards/accuracy_reward": 0.30703125, + "rewards/format_reward": 0.96171875, + "step": 1750 + }, + { + "completion_length": 136.47734375, + "epoch": 0.7752408823610613, + "grad_norm": 0.6541038155555725, + "kl": 0.44271240234375, + "learning_rate": 2.9180297975576368e-06, + "loss": 0.0177, + "reward": 1.2328125, + "reward_std": 0.22248574066907167, + "rewards/accuracy_reward": 0.271875, + "rewards/format_reward": 0.9609375, + "step": 1755 + }, + { + "completion_length": 120.7203125, + "epoch": 0.7774495458435714, + "grad_norm": 0.4700789749622345, + "kl": 0.449462890625, + "learning_rate": 2.8637712860851974e-06, + "loss": 0.018, + "reward": 1.24765625, + "reward_std": 0.23170709386467933, + "rewards/accuracy_reward": 0.28125, + "rewards/format_reward": 0.96640625, + "step": 1760 + }, + { + "completion_length": 123.3, + "epoch": 0.7796582093260815, + "grad_norm": 0.6256026029586792, + "kl": 0.42320556640625, + "learning_rate": 2.8099375416557163e-06, + "loss": 0.0169, + "reward": 1.2609375, + "reward_std": 0.22770290337502958, + "rewards/accuracy_reward": 0.29140625, + "rewards/format_reward": 0.96953125, + "step": 1765 + }, + { + "completion_length": 115.96328125, + "epoch": 0.7818668728085917, + "grad_norm": 0.5066558718681335, + "kl": 0.43795166015625, + "learning_rate": 2.7565317685948e-06, + "loss": 0.0175, + "reward": 1.2984375, + "reward_std": 0.22580576539039612, + "rewards/accuracy_reward": 0.325, + "rewards/format_reward": 0.9734375, + "step": 1770 + }, + { + "completion_length": 131.2125, + "epoch": 0.7840755362911018, + "grad_norm": 0.8898307681083679, + "kl": 0.4900634765625, + "learning_rate": 2.7035571457540865e-06, + "loss": 0.0196, + "reward": 1.23203125, + "reward_std": 0.23165916074067355, + "rewards/accuracy_reward": 0.2671875, + "rewards/format_reward": 0.96484375, + "step": 1775 + }, + { + "completion_length": 136.24453125, + "epoch": 0.786284199773612, + "grad_norm": 0.49983325600624084, + "kl": 0.4647705078125, + "learning_rate": 2.651016826322017e-06, + "loss": 0.0186, + "reward": 1.26015625, + "reward_std": 0.23627216089516878, + "rewards/accuracy_reward": 0.3046875, + "rewards/format_reward": 0.95546875, + "step": 1780 + }, + { + "completion_length": 120.8796875, + "epoch": 0.7884928632561221, + "grad_norm": 0.6942005753517151, + "kl": 0.459130859375, + "learning_rate": 2.598913937636153e-06, + "loss": 0.0184, + "reward": 1.24375, + "reward_std": 0.2460821120068431, + "rewards/accuracy_reward": 0.27578125, + "rewards/format_reward": 0.96796875, + "step": 1785 + }, + { + "completion_length": 112.2453125, + "epoch": 0.7907015267386323, + "grad_norm": 0.48447439074516296, + "kl": 0.4265380859375, + "learning_rate": 2.5472515809970343e-06, + "loss": 0.0171, + "reward": 1.2421875, + "reward_std": 0.19503602739423515, + "rewards/accuracy_reward": 0.27109375, + "rewards/format_reward": 0.97109375, + "step": 1790 + }, + { + "completion_length": 105.003125, + "epoch": 0.7929101902211424, + "grad_norm": 0.9765954613685608, + "kl": 0.48828125, + "learning_rate": 2.4960328314835746e-06, + "loss": 0.0195, + "reward": 1.21796875, + "reward_std": 0.1995122255757451, + "rewards/accuracy_reward": 0.240625, + "rewards/format_reward": 0.97734375, + "step": 1795 + }, + { + "completion_length": 110.82421875, + "epoch": 0.7951188537036525, + "grad_norm": 1.8787330389022827, + "kl": 0.458447265625, + "learning_rate": 2.4452607377700367e-06, + "loss": 0.0183, + "reward": 1.25625, + "reward_std": 0.23729459717869758, + "rewards/accuracy_reward": 0.2828125, + "rewards/format_reward": 0.9734375, + "step": 1800 + }, + { + "epoch": 0.7951188537036525, + "eval_completion_length": 112.79791687011719, + "eval_kl": 0.497265625, + "eval_loss": 0.0199314896017313, + "eval_reward": 1.2658333349227906, + "eval_reward_std": 0.2067297151684761, + "eval_rewards/accuracy_reward": 0.2920833334326744, + "eval_rewards/format_reward": 0.97375, + "eval_runtime": 119.8153, + "eval_samples_per_second": 0.826, + "eval_steps_per_second": 0.033, + "step": 1800 + }, + { + "completion_length": 109.19375, + "epoch": 0.7973275171861627, + "grad_norm": 2.065337657928467, + "kl": 0.4661865234375, + "learning_rate": 2.394938321944551e-06, + "loss": 0.0187, + "reward": 1.2140625, + "reward_std": 0.22883423641324044, + "rewards/accuracy_reward": 0.2421875, + "rewards/format_reward": 0.971875, + "step": 1805 + }, + { + "completion_length": 108.35, + "epoch": 0.7995361806686728, + "grad_norm": 0.6966126561164856, + "kl": 0.7103515625, + "learning_rate": 2.3450685793292437e-06, + "loss": 0.0284, + "reward": 1.17265625, + "reward_std": 0.17452798802405595, + "rewards/accuracy_reward": 0.2015625, + "rewards/format_reward": 0.97109375, + "step": 1810 + }, + { + "completion_length": 113.3828125, + "epoch": 0.801744844151183, + "grad_norm": 0.8000837564468384, + "kl": 0.7013427734375, + "learning_rate": 2.295654478301942e-06, + "loss": 0.0281, + "reward": 1.234375, + "reward_std": 0.21197393592447042, + "rewards/accuracy_reward": 0.2640625, + "rewards/format_reward": 0.9703125, + "step": 1815 + }, + { + "completion_length": 116.340625, + "epoch": 0.8039535076336931, + "grad_norm": 0.9716657996177673, + "kl": 0.9371826171875, + "learning_rate": 2.246698960119499e-06, + "loss": 0.0375, + "reward": 1.221875, + "reward_std": 0.23381243012845515, + "rewards/accuracy_reward": 0.246875, + "rewards/format_reward": 0.975, + "step": 1820 + }, + { + "completion_length": 143.3609375, + "epoch": 0.8061621711162033, + "grad_norm": 3.171027898788452, + "kl": 0.77176513671875, + "learning_rate": 2.198204938742707e-06, + "loss": 0.0309, + "reward": 1.20234375, + "reward_std": 0.25977810826152564, + "rewards/accuracy_reward": 0.246875, + "rewards/format_reward": 0.95546875, + "step": 1825 + }, + { + "completion_length": 135.51484375, + "epoch": 0.8083708345987135, + "grad_norm": 1.4249086380004883, + "kl": 0.80992431640625, + "learning_rate": 2.150175300662862e-06, + "loss": 0.0324, + "reward": 1.18671875, + "reward_std": 0.24862184505909682, + "rewards/accuracy_reward": 0.228125, + "rewards/format_reward": 0.95859375, + "step": 1830 + }, + { + "completion_length": 151.6921875, + "epoch": 0.8105794980812236, + "grad_norm": 1.0902801752090454, + "kl": 0.86148681640625, + "learning_rate": 2.1026129047299436e-06, + "loss": 0.0345, + "reward": 1.2, + "reward_std": 0.260273445956409, + "rewards/accuracy_reward": 0.25703125, + "rewards/format_reward": 0.94296875, + "step": 1835 + }, + { + "completion_length": 140.75859375, + "epoch": 0.8127881615637338, + "grad_norm": 0.5658231973648071, + "kl": 0.635693359375, + "learning_rate": 2.055520581982463e-06, + "loss": 0.0254, + "reward": 1.26171875, + "reward_std": 0.26789135448634627, + "rewards/accuracy_reward": 0.30546875, + "rewards/format_reward": 0.95625, + "step": 1840 + }, + { + "completion_length": 108.0703125, + "epoch": 0.8149968250462439, + "grad_norm": 0.7409191727638245, + "kl": 0.51619873046875, + "learning_rate": 2.0089011354789357e-06, + "loss": 0.0206, + "reward": 1.2359375, + "reward_std": 0.21879921518266202, + "rewards/accuracy_reward": 0.26171875, + "rewards/format_reward": 0.97421875, + "step": 1845 + }, + { + "completion_length": 131.575, + "epoch": 0.8172054885287541, + "grad_norm": 0.933627724647522, + "kl": 0.5877197265625, + "learning_rate": 1.9627573401310452e-06, + "loss": 0.0235, + "reward": 1.23203125, + "reward_std": 0.26072712801396847, + "rewards/accuracy_reward": 0.275, + "rewards/format_reward": 0.95703125, + "step": 1850 + }, + { + "completion_length": 116.653125, + "epoch": 0.8194141520112642, + "grad_norm": 0.5518223643302917, + "kl": 0.523291015625, + "learning_rate": 1.9170919425384695e-06, + "loss": 0.0209, + "reward": 1.296875, + "reward_std": 0.23366234563291072, + "rewards/accuracy_reward": 0.32265625, + "rewards/format_reward": 0.97421875, + "step": 1855 + }, + { + "completion_length": 121.140625, + "epoch": 0.8216228154937744, + "grad_norm": 0.5761392712593079, + "kl": 0.49658203125, + "learning_rate": 1.8719076608254028e-06, + "loss": 0.0199, + "reward": 1.225, + "reward_std": 0.21435881238430737, + "rewards/accuracy_reward": 0.25390625, + "rewards/format_reward": 0.97109375, + "step": 1860 + }, + { + "completion_length": 136.73359375, + "epoch": 0.8238314789762845, + "grad_norm": 0.7133124470710754, + "kl": 0.52952880859375, + "learning_rate": 1.8272071844787575e-06, + "loss": 0.0212, + "reward": 1.1703125, + "reward_std": 0.2595834471285343, + "rewards/accuracy_reward": 0.209375, + "rewards/format_reward": 0.9609375, + "step": 1865 + }, + { + "completion_length": 141.15390625, + "epoch": 0.8260401424587946, + "grad_norm": 0.9754716157913208, + "kl": 0.5518310546875, + "learning_rate": 1.7829931741880802e-06, + "loss": 0.0221, + "reward": 1.2625, + "reward_std": 0.28494130074977875, + "rewards/accuracy_reward": 0.31328125, + "rewards/format_reward": 0.94921875, + "step": 1870 + }, + { + "completion_length": 123.50234375, + "epoch": 0.8282488059413048, + "grad_norm": 0.912441611289978, + "kl": 0.601611328125, + "learning_rate": 1.7392682616871836e-06, + "loss": 0.0241, + "reward": 1.225, + "reward_std": 0.2650785157456994, + "rewards/accuracy_reward": 0.2609375, + "rewards/format_reward": 0.9640625, + "step": 1875 + }, + { + "completion_length": 116.25625, + "epoch": 0.8304574694238149, + "grad_norm": 0.719677209854126, + "kl": 0.52056884765625, + "learning_rate": 1.696035049597503e-06, + "loss": 0.0208, + "reward": 1.24453125, + "reward_std": 0.22317184396088124, + "rewards/accuracy_reward": 0.26796875, + "rewards/format_reward": 0.9765625, + "step": 1880 + }, + { + "completion_length": 118.2171875, + "epoch": 0.8326661329063251, + "grad_norm": 0.5467638969421387, + "kl": 0.5210205078125, + "learning_rate": 1.6532961112731672e-06, + "loss": 0.0208, + "reward": 1.23828125, + "reward_std": 0.19823672361671923, + "rewards/accuracy_reward": 0.26171875, + "rewards/format_reward": 0.9765625, + "step": 1885 + }, + { + "completion_length": 113.9359375, + "epoch": 0.8348747963888352, + "grad_norm": 0.6019476652145386, + "kl": 0.444091796875, + "learning_rate": 1.6110539906478463e-06, + "loss": 0.0178, + "reward": 1.3015625, + "reward_std": 0.2479660578072071, + "rewards/accuracy_reward": 0.32421875, + "rewards/format_reward": 0.97734375, + "step": 1890 + }, + { + "completion_length": 122.75546875, + "epoch": 0.8370834598713454, + "grad_norm": 0.9671128988265991, + "kl": 0.5489501953125, + "learning_rate": 1.5693112020833012e-06, + "loss": 0.022, + "reward": 1.2703125, + "reward_std": 0.2313979933038354, + "rewards/accuracy_reward": 0.296875, + "rewards/format_reward": 0.9734375, + "step": 1895 + }, + { + "completion_length": 118.28046875, + "epoch": 0.8392921233538555, + "grad_norm": 0.9875019788742065, + "kl": 0.4894287109375, + "learning_rate": 1.528070230219756e-06, + "loss": 0.0196, + "reward": 1.2171875, + "reward_std": 0.21601561345160009, + "rewards/accuracy_reward": 0.24140625, + "rewards/format_reward": 0.97578125, + "step": 1900 + }, + { + "epoch": 0.8392921233538555, + "eval_completion_length": 123.54375, + "eval_kl": 0.558125, + "eval_loss": 0.022584721446037292, + "eval_reward": 1.295, + "eval_reward_std": 0.21910995721817017, + "eval_rewards/accuracy_reward": 0.32416666686534884, + "eval_rewards/format_reward": 0.9708333349227906, + "eval_runtime": 124.5976, + "eval_samples_per_second": 0.795, + "eval_steps_per_second": 0.032, + "step": 1900 + }, + { + "completion_length": 136.87578125, + "epoch": 0.8415007868363656, + "grad_norm": 0.6465866565704346, + "kl": 0.58070068359375, + "learning_rate": 1.4873335298279801e-06, + "loss": 0.0232, + "reward": 1.27265625, + "reward_std": 0.22537416350096465, + "rewards/accuracy_reward": 0.3078125, + "rewards/format_reward": 0.96484375, + "step": 1905 + }, + { + "completion_length": 152.965625, + "epoch": 0.8437094503188758, + "grad_norm": 0.5781317353248596, + "kl": 0.5817138671875, + "learning_rate": 1.447103525663186e-06, + "loss": 0.0233, + "reward": 1.20234375, + "reward_std": 0.23281266931444405, + "rewards/accuracy_reward": 0.24375, + "rewards/format_reward": 0.95859375, + "step": 1910 + }, + { + "completion_length": 153.8953125, + "epoch": 0.8459181138013859, + "grad_norm": 0.6810880303382874, + "kl": 0.59539794921875, + "learning_rate": 1.4073826123206946e-06, + "loss": 0.0238, + "reward": 1.259375, + "reward_std": 0.2544757820665836, + "rewards/accuracy_reward": 0.29609375, + "rewards/format_reward": 0.96328125, + "step": 1915 + }, + { + "completion_length": 143.8140625, + "epoch": 0.8481267772838961, + "grad_norm": 0.6009095311164856, + "kl": 0.5434326171875, + "learning_rate": 1.368173154093414e-06, + "loss": 0.0217, + "reward": 1.2453125, + "reward_std": 0.2513797411695123, + "rewards/accuracy_reward": 0.2828125, + "rewards/format_reward": 0.9625, + "step": 1920 + }, + { + "completion_length": 159.8171875, + "epoch": 0.8503354407664062, + "grad_norm": 0.6377544403076172, + "kl": 0.572900390625, + "learning_rate": 1.3294774848310954e-06, + "loss": 0.0229, + "reward": 1.23046875, + "reward_std": 0.27456119302660226, + "rewards/accuracy_reward": 0.27421875, + "rewards/format_reward": 0.95625, + "step": 1925 + }, + { + "completion_length": 155.94453125, + "epoch": 0.8525441042489164, + "grad_norm": 0.5800438523292542, + "kl": 0.6190185546875, + "learning_rate": 1.2912979078014242e-06, + "loss": 0.0248, + "reward": 1.20703125, + "reward_std": 0.23998625949025154, + "rewards/accuracy_reward": 0.253125, + "rewards/format_reward": 0.95390625, + "step": 1930 + }, + { + "completion_length": 159.3046875, + "epoch": 0.8547527677314265, + "grad_norm": 0.9404852986335754, + "kl": 0.56507568359375, + "learning_rate": 1.253636695552931e-06, + "loss": 0.0226, + "reward": 1.259375, + "reward_std": 0.2761839430779219, + "rewards/accuracy_reward": 0.30703125, + "rewards/format_reward": 0.95234375, + "step": 1935 + }, + { + "completion_length": 154.96953125, + "epoch": 0.8569614312139366, + "grad_norm": 0.9105063676834106, + "kl": 0.64293212890625, + "learning_rate": 1.216496089779703e-06, + "loss": 0.0257, + "reward": 1.22578125, + "reward_std": 0.27103531677275894, + "rewards/accuracy_reward": 0.271875, + "rewards/format_reward": 0.95390625, + "step": 1940 + }, + { + "completion_length": 147.2546875, + "epoch": 0.8591700946964468, + "grad_norm": 0.6244191527366638, + "kl": 0.49593505859375, + "learning_rate": 1.1798783011879766e-06, + "loss": 0.0198, + "reward": 1.26484375, + "reward_std": 0.2632339050993323, + "rewards/accuracy_reward": 0.30546875, + "rewards/format_reward": 0.959375, + "step": 1945 + }, + { + "completion_length": 150.15234375, + "epoch": 0.8613787581789569, + "grad_norm": 0.9317097663879395, + "kl": 0.51068115234375, + "learning_rate": 1.14378550936453e-06, + "loss": 0.0204, + "reward": 1.2453125, + "reward_std": 0.2504005776718259, + "rewards/accuracy_reward": 0.28828125, + "rewards/format_reward": 0.95703125, + "step": 1950 + }, + { + "completion_length": 136.7046875, + "epoch": 0.8635874216614671, + "grad_norm": 0.5654709935188293, + "kl": 0.496484375, + "learning_rate": 1.1082198626469687e-06, + "loss": 0.0199, + "reward": 1.22421875, + "reward_std": 0.23155678305774927, + "rewards/accuracy_reward": 0.25625, + "rewards/format_reward": 0.96796875, + "step": 1955 + }, + { + "completion_length": 135.80859375, + "epoch": 0.8657960851439772, + "grad_norm": 0.4000113904476166, + "kl": 0.499267578125, + "learning_rate": 1.0731834779958217e-06, + "loss": 0.02, + "reward": 1.253125, + "reward_std": 0.2156506871804595, + "rewards/accuracy_reward": 0.290625, + "rewards/format_reward": 0.9625, + "step": 1960 + }, + { + "completion_length": 140.81640625, + "epoch": 0.8680047486264874, + "grad_norm": 0.41182902455329895, + "kl": 0.44923095703125, + "learning_rate": 1.0386784408685713e-06, + "loss": 0.018, + "reward": 1.1765625, + "reward_std": 0.21103496849536896, + "rewards/accuracy_reward": 0.2125, + "rewards/format_reward": 0.9640625, + "step": 1965 + }, + { + "completion_length": 144.734375, + "epoch": 0.8702134121089975, + "grad_norm": 0.6124417781829834, + "kl": 0.5717529296875, + "learning_rate": 1.0047068050954868e-06, + "loss": 0.0229, + "reward": 1.19140625, + "reward_std": 0.2501412840560079, + "rewards/accuracy_reward": 0.2359375, + "rewards/format_reward": 0.95546875, + "step": 1970 + }, + { + "completion_length": 137.81015625, + "epoch": 0.8724220755915076, + "grad_norm": 0.7430810928344727, + "kl": 0.51724853515625, + "learning_rate": 9.71270592757404e-07, + "loss": 0.0207, + "reward": 1.25234375, + "reward_std": 0.2686174543574452, + "rewards/accuracy_reward": 0.28359375, + "rewards/format_reward": 0.96875, + "step": 1975 + }, + { + "completion_length": 117.8140625, + "epoch": 0.8746307390740178, + "grad_norm": 0.48936066031455994, + "kl": 0.59530029296875, + "learning_rate": 9.38371794065337e-07, + "loss": 0.0238, + "reward": 1.2453125, + "reward_std": 0.21264754123985768, + "rewards/accuracy_reward": 0.27109375, + "rewards/format_reward": 0.97421875, + "step": 1980 + }, + { + "completion_length": 136.915625, + "epoch": 0.876839402556528, + "grad_norm": 0.5388721823692322, + "kl": 0.4608642578125, + "learning_rate": 9.060123672420451e-07, + "loss": 0.0184, + "reward": 1.228125, + "reward_std": 0.22003105469048023, + "rewards/accuracy_reward": 0.25859375, + "rewards/format_reward": 0.96953125, + "step": 1985 + }, + { + "completion_length": 123.7546875, + "epoch": 0.8790480660390382, + "grad_norm": 0.7182089686393738, + "kl": 0.4820068359375, + "learning_rate": 8.741942384054481e-07, + "loss": 0.0193, + "reward": 1.2609375, + "reward_std": 0.23840244263410568, + "rewards/accuracy_reward": 0.28828125, + "rewards/format_reward": 0.97265625, + "step": 1990 + }, + { + "completion_length": 124.53359375, + "epoch": 0.8812567295215483, + "grad_norm": 0.524861752986908, + "kl": 0.452880859375, + "learning_rate": 8.429193014540015e-07, + "loss": 0.0181, + "reward": 1.253125, + "reward_std": 0.2055924626067281, + "rewards/accuracy_reward": 0.275, + "rewards/format_reward": 0.978125, + "step": 1995 + }, + { + "completion_length": 130.221875, + "epoch": 0.8834653930040585, + "grad_norm": 0.49339717626571655, + "kl": 0.484423828125, + "learning_rate": 8.121894179539469e-07, + "loss": 0.0194, + "reward": 1.225, + "reward_std": 0.22272255159914495, + "rewards/accuracy_reward": 0.25625, + "rewards/format_reward": 0.96875, + "step": 2000 + }, + { + "epoch": 0.8834653930040585, + "eval_completion_length": 142.34375, + "eval_kl": 0.450234375, + "eval_loss": 0.018087182193994522, + "eval_reward": 1.279166669845581, + "eval_reward_std": 0.2484509229660034, + "eval_rewards/accuracy_reward": 0.31458333373069763, + "eval_rewards/format_reward": 0.9645833349227906, + "eval_runtime": 145.7929, + "eval_samples_per_second": 0.679, + "eval_steps_per_second": 0.027, + "step": 2000 + }, + { + "completion_length": 130.49140625, + "epoch": 0.8856740564865686, + "grad_norm": 0.5030075907707214, + "kl": 0.4587646484375, + "learning_rate": 7.82006417028518e-07, + "loss": 0.0183, + "reward": 1.2390625, + "reward_std": 0.20883522126823664, + "rewards/accuracy_reward": 0.26015625, + "rewards/format_reward": 0.97890625, + "step": 2005 + }, + { + "completion_length": 145.6078125, + "epoch": 0.8878827199690787, + "grad_norm": 0.5313246250152588, + "kl": 0.45802001953125, + "learning_rate": 7.523720952490631e-07, + "loss": 0.0183, + "reward": 1.2578125, + "reward_std": 0.2512391902506351, + "rewards/accuracy_reward": 0.28828125, + "rewards/format_reward": 0.96953125, + "step": 2010 + }, + { + "completion_length": 157.45546875, + "epoch": 0.8900913834515889, + "grad_norm": 0.5215573310852051, + "kl": 0.49078369140625, + "learning_rate": 7.232882165281141e-07, + "loss": 0.0196, + "reward": 1.1890625, + "reward_std": 0.22680971212685108, + "rewards/accuracy_reward": 0.228125, + "rewards/format_reward": 0.9609375, + "step": 2015 + }, + { + "completion_length": 146.7203125, + "epoch": 0.892300046934099, + "grad_norm": 0.6293109059333801, + "kl": 0.49932861328125, + "learning_rate": 6.947565120143828e-07, + "loss": 0.02, + "reward": 1.2375, + "reward_std": 0.2244907196611166, + "rewards/accuracy_reward": 0.27265625, + "rewards/format_reward": 0.96484375, + "step": 2020 + }, + { + "completion_length": 152.1328125, + "epoch": 0.8945087104166092, + "grad_norm": 0.6269906759262085, + "kl": 0.4867919921875, + "learning_rate": 6.667786799897269e-07, + "loss": 0.0195, + "reward": 1.23828125, + "reward_std": 0.2209881154820323, + "rewards/accuracy_reward": 0.275, + "rewards/format_reward": 0.96328125, + "step": 2025 + }, + { + "completion_length": 143.7203125, + "epoch": 0.8967173738991193, + "grad_norm": 0.4811406433582306, + "kl": 0.45821533203125, + "learning_rate": 6.393563857680596e-07, + "loss": 0.0183, + "reward": 1.26953125, + "reward_std": 0.22661811783909797, + "rewards/accuracy_reward": 0.3015625, + "rewards/format_reward": 0.96796875, + "step": 2030 + }, + { + "completion_length": 145.00390625, + "epoch": 0.8989260373816295, + "grad_norm": 0.6537109017372131, + "kl": 0.49169921875, + "learning_rate": 6.124912615962341e-07, + "loss": 0.0197, + "reward": 1.24765625, + "reward_std": 0.22691688518971204, + "rewards/accuracy_reward": 0.2765625, + "rewards/format_reward": 0.97109375, + "step": 2035 + }, + { + "completion_length": 159.30390625, + "epoch": 0.9011347008641396, + "grad_norm": 0.5987099409103394, + "kl": 0.4700439453125, + "learning_rate": 5.861849065568726e-07, + "loss": 0.0188, + "reward": 1.2921875, + "reward_std": 0.26790456287562847, + "rewards/accuracy_reward": 0.3328125, + "rewards/format_reward": 0.959375, + "step": 2040 + }, + { + "completion_length": 157.98203125, + "epoch": 0.9033433643466497, + "grad_norm": 1.6861835718154907, + "kl": 0.5018310546875, + "learning_rate": 5.604388864732002e-07, + "loss": 0.0201, + "reward": 1.2171875, + "reward_std": 0.23706249240785837, + "rewards/accuracy_reward": 0.25703125, + "rewards/format_reward": 0.96015625, + "step": 2045 + }, + { + "completion_length": 157.39921875, + "epoch": 0.9055520278291599, + "grad_norm": 0.623263955116272, + "kl": 0.61142578125, + "learning_rate": 5.352547338158309e-07, + "loss": 0.0245, + "reward": 1.19453125, + "reward_std": 0.2746475737541914, + "rewards/accuracy_reward": 0.23828125, + "rewards/format_reward": 0.95625, + "step": 2050 + }, + { + "completion_length": 153.7953125, + "epoch": 0.90776069131167, + "grad_norm": 0.6804496049880981, + "kl": 0.4615234375, + "learning_rate": 5.106339476115596e-07, + "loss": 0.0185, + "reward": 1.23359375, + "reward_std": 0.2757456684485078, + "rewards/accuracy_reward": 0.271875, + "rewards/format_reward": 0.96171875, + "step": 2055 + }, + { + "completion_length": 160.09140625, + "epoch": 0.9099693547941802, + "grad_norm": 0.8526637554168701, + "kl": 0.48623046875, + "learning_rate": 4.865779933541348e-07, + "loss": 0.0194, + "reward": 1.253125, + "reward_std": 0.27613792307674884, + "rewards/accuracy_reward": 0.30078125, + "rewards/format_reward": 0.95234375, + "step": 2060 + }, + { + "completion_length": 161.521875, + "epoch": 0.9121780182766903, + "grad_norm": 0.6661585569381714, + "kl": 0.49755859375, + "learning_rate": 4.63088302917023e-07, + "loss": 0.0199, + "reward": 1.24375, + "reward_std": 0.24981417022645475, + "rewards/accuracy_reward": 0.2890625, + "rewards/format_reward": 0.9546875, + "step": 2065 + }, + { + "completion_length": 157.80546875, + "epoch": 0.9143866817592005, + "grad_norm": 0.7502483129501343, + "kl": 0.4491455078125, + "learning_rate": 4.401662744681845e-07, + "loss": 0.018, + "reward": 1.2625, + "reward_std": 0.23363354597240688, + "rewards/accuracy_reward": 0.29765625, + "rewards/format_reward": 0.96484375, + "step": 2070 + }, + { + "completion_length": 141.709375, + "epoch": 0.9165953452417106, + "grad_norm": 1.0072883367538452, + "kl": 0.440625, + "learning_rate": 4.1781327238684775e-07, + "loss": 0.0176, + "reward": 1.271875, + "reward_std": 0.23114844579249622, + "rewards/accuracy_reward": 0.29921875, + "rewards/format_reward": 0.97265625, + "step": 2075 + }, + { + "completion_length": 143.68515625, + "epoch": 0.9188040087242207, + "grad_norm": 0.7586592435836792, + "kl": 0.49891357421875, + "learning_rate": 3.9603062718230667e-07, + "loss": 0.02, + "reward": 1.27734375, + "reward_std": 0.2479051820933819, + "rewards/accuracy_reward": 0.30703125, + "rewards/format_reward": 0.9703125, + "step": 2080 + }, + { + "completion_length": 163.2359375, + "epoch": 0.9210126722067309, + "grad_norm": 0.7740212678909302, + "kl": 0.4748779296875, + "learning_rate": 3.748196354147127e-07, + "loss": 0.019, + "reward": 1.253125, + "reward_std": 0.2910253481939435, + "rewards/accuracy_reward": 0.3046875, + "rewards/format_reward": 0.9484375, + "step": 2085 + }, + { + "completion_length": 161.70703125, + "epoch": 0.923221335689241, + "grad_norm": 0.5001750588417053, + "kl": 0.51661376953125, + "learning_rate": 3.5418155961790546e-07, + "loss": 0.0207, + "reward": 1.18984375, + "reward_std": 0.24765556119382381, + "rewards/accuracy_reward": 0.23984375, + "rewards/format_reward": 0.95, + "step": 2090 + }, + { + "completion_length": 149.140625, + "epoch": 0.9254299991717512, + "grad_norm": 0.6595699787139893, + "kl": 0.459619140625, + "learning_rate": 3.341176282242653e-07, + "loss": 0.0184, + "reward": 1.25546875, + "reward_std": 0.23822309002280234, + "rewards/accuracy_reward": 0.29140625, + "rewards/format_reward": 0.9640625, + "step": 2095 + }, + { + "completion_length": 152.50390625, + "epoch": 0.9276386626542613, + "grad_norm": 1.5634217262268066, + "kl": 0.5083740234375, + "learning_rate": 3.1462903549159484e-07, + "loss": 0.0203, + "reward": 1.2578125, + "reward_std": 0.2591968797147274, + "rewards/accuracy_reward": 0.3015625, + "rewards/format_reward": 0.95625, + "step": 2100 + }, + { + "epoch": 0.9276386626542613, + "eval_completion_length": 141.9604168701172, + "eval_kl": 0.46765625, + "eval_loss": 0.01848418451845646, + "eval_reward": 1.28125, + "eval_reward_std": 0.24666063576936723, + "eval_rewards/accuracy_reward": 0.31916666686534884, + "eval_rewards/format_reward": 0.9620833349227905, + "eval_runtime": 156.3759, + "eval_samples_per_second": 0.633, + "eval_steps_per_second": 0.026, + "step": 2100 + }, + { + "completion_length": 148.68515625, + "epoch": 0.9298473261367715, + "grad_norm": 0.7756811380386353, + "kl": 0.4727783203125, + "learning_rate": 2.9571694143202934e-07, + "loss": 0.0189, + "reward": 1.2359375, + "reward_std": 0.2811011435464025, + "rewards/accuracy_reward": 0.27578125, + "rewards/format_reward": 0.96015625, + "step": 2105 + }, + { + "completion_length": 170.4796875, + "epoch": 0.9320559896192816, + "grad_norm": 0.5152100324630737, + "kl": 0.523291015625, + "learning_rate": 2.773824717429907e-07, + "loss": 0.0209, + "reward": 1.18046875, + "reward_std": 0.28240158669650556, + "rewards/accuracy_reward": 0.2328125, + "rewards/format_reward": 0.94765625, + "step": 2110 + }, + { + "completion_length": 138.71171875, + "epoch": 0.9342646531017917, + "grad_norm": 0.6858031153678894, + "kl": 0.481396484375, + "learning_rate": 2.5962671774018234e-07, + "loss": 0.0193, + "reward": 1.2265625, + "reward_std": 0.2497631970793009, + "rewards/accuracy_reward": 0.2625, + "rewards/format_reward": 0.9640625, + "step": 2115 + }, + { + "completion_length": 150.840625, + "epoch": 0.9364733165843019, + "grad_norm": 0.7537331581115723, + "kl": 0.49796142578125, + "learning_rate": 2.424507362926376e-07, + "loss": 0.0199, + "reward": 1.2484375, + "reward_std": 0.2752187229692936, + "rewards/accuracy_reward": 0.2875, + "rewards/format_reward": 0.9609375, + "step": 2120 + }, + { + "completion_length": 145.603125, + "epoch": 0.938681980066812, + "grad_norm": 0.854280412197113, + "kl": 0.453173828125, + "learning_rate": 2.2585554975980252e-07, + "loss": 0.0181, + "reward": 1.240625, + "reward_std": 0.24026636723428965, + "rewards/accuracy_reward": 0.278125, + "rewards/format_reward": 0.9625, + "step": 2125 + }, + { + "completion_length": 146.646875, + "epoch": 0.9408906435493222, + "grad_norm": 0.6566202640533447, + "kl": 0.47532958984375, + "learning_rate": 2.0984214593069318e-07, + "loss": 0.019, + "reward": 1.2546875, + "reward_std": 0.2486549686640501, + "rewards/accuracy_reward": 0.28671875, + "rewards/format_reward": 0.96796875, + "step": 2130 + }, + { + "completion_length": 134.93125, + "epoch": 0.9430993070318323, + "grad_norm": 0.6418664455413818, + "kl": 0.70113525390625, + "learning_rate": 1.9441147796508408e-07, + "loss": 0.028, + "reward": 1.28046875, + "reward_std": 0.270217102766037, + "rewards/accuracy_reward": 0.3140625, + "rewards/format_reward": 0.96640625, + "step": 2135 + }, + { + "completion_length": 153.1296875, + "epoch": 0.9453079705143425, + "grad_norm": 0.5165784358978271, + "kl": 0.51217041015625, + "learning_rate": 1.795644643367922e-07, + "loss": 0.0205, + "reward": 1.23984375, + "reward_std": 0.2561708649620414, + "rewards/accuracy_reward": 0.28125, + "rewards/format_reward": 0.95859375, + "step": 2140 + }, + { + "completion_length": 143.43046875, + "epoch": 0.9475166339968527, + "grad_norm": 0.6096455454826355, + "kl": 0.474267578125, + "learning_rate": 1.6530198877899417e-07, + "loss": 0.019, + "reward": 1.265625, + "reward_std": 0.24631664287298918, + "rewards/accuracy_reward": 0.296875, + "rewards/format_reward": 0.96875, + "step": 2145 + }, + { + "completion_length": 149.7796875, + "epoch": 0.9497252974793629, + "grad_norm": 0.6443772912025452, + "kl": 0.46708984375, + "learning_rate": 1.5162490023163057e-07, + "loss": 0.0187, + "reward": 1.23828125, + "reward_std": 0.23290605265647174, + "rewards/accuracy_reward": 0.275, + "rewards/format_reward": 0.96328125, + "step": 2150 + }, + { + "completion_length": 152.20546875, + "epoch": 0.951933960961873, + "grad_norm": 0.7591469883918762, + "kl": 0.5279296875, + "learning_rate": 1.3853401279086853e-07, + "loss": 0.0211, + "reward": 1.20859375, + "reward_std": 0.23095191065222026, + "rewards/accuracy_reward": 0.24921875, + "rewards/format_reward": 0.959375, + "step": 2155 + }, + { + "completion_length": 143.9375, + "epoch": 0.9541426244443831, + "grad_norm": 0.4750344753265381, + "kl": 0.47156982421875, + "learning_rate": 1.2603010566065055e-07, + "loss": 0.0189, + "reward": 1.240625, + "reward_std": 0.2516822377219796, + "rewards/accuracy_reward": 0.27734375, + "rewards/format_reward": 0.96328125, + "step": 2160 + }, + { + "completion_length": 147.12578125, + "epoch": 0.9563512879268933, + "grad_norm": 0.6196463704109192, + "kl": 0.50130615234375, + "learning_rate": 1.1411392310631153e-07, + "loss": 0.0201, + "reward": 1.2375, + "reward_std": 0.2511793440207839, + "rewards/accuracy_reward": 0.28125, + "rewards/format_reward": 0.95625, + "step": 2165 + }, + { + "completion_length": 130.6046875, + "epoch": 0.9585599514094034, + "grad_norm": 0.519320547580719, + "kl": 0.55948486328125, + "learning_rate": 1.0278617441028205e-07, + "loss": 0.0224, + "reward": 1.2484375, + "reward_std": 0.24660416580736638, + "rewards/accuracy_reward": 0.28125, + "rewards/format_reward": 0.9671875, + "step": 2170 + }, + { + "completion_length": 139.35078125, + "epoch": 0.9607686148919136, + "grad_norm": 0.5927404761314392, + "kl": 0.45987548828125, + "learning_rate": 9.204753382986097e-08, + "loss": 0.0184, + "reward": 1.240625, + "reward_std": 0.19637434519827365, + "rewards/accuracy_reward": 0.27265625, + "rewards/format_reward": 0.96796875, + "step": 2175 + }, + { + "completion_length": 139.8921875, + "epoch": 0.9629772783744237, + "grad_norm": 0.4677460491657257, + "kl": 0.47265625, + "learning_rate": 8.189864055709206e-08, + "loss": 0.0189, + "reward": 1.2296875, + "reward_std": 0.2277947474271059, + "rewards/accuracy_reward": 0.2625, + "rewards/format_reward": 0.9671875, + "step": 2180 + }, + { + "completion_length": 143.153125, + "epoch": 0.9651859418569338, + "grad_norm": 0.9342114925384521, + "kl": 0.5101318359375, + "learning_rate": 7.23400986807099e-08, + "loss": 0.0204, + "reward": 1.25078125, + "reward_std": 0.26373773720115423, + "rewards/accuracy_reward": 0.28984375, + "rewards/format_reward": 0.9609375, + "step": 2185 + }, + { + "completion_length": 148.3703125, + "epoch": 0.967394605339444, + "grad_norm": 0.46142810583114624, + "kl": 0.48370361328125, + "learning_rate": 6.337247715018869e-08, + "loss": 0.0194, + "reward": 1.2203125, + "reward_std": 0.23893490042537452, + "rewards/accuracy_reward": 0.2578125, + "rewards/format_reward": 0.9625, + "step": 2190 + }, + { + "completion_length": 139.8171875, + "epoch": 0.9696032688219541, + "grad_norm": 0.5866816639900208, + "kl": 0.4761962890625, + "learning_rate": 5.4996309741873755e-08, + "loss": 0.019, + "reward": 1.275, + "reward_std": 0.23191295862197875, + "rewards/accuracy_reward": 0.30546875, + "rewards/format_reward": 0.96953125, + "step": 2195 + }, + { + "completion_length": 150.57109375, + "epoch": 0.9718119323044643, + "grad_norm": 0.540817141532898, + "kl": 0.44088134765625, + "learning_rate": 4.7212095027209246e-08, + "loss": 0.0176, + "reward": 1.29765625, + "reward_std": 0.2451239839196205, + "rewards/accuracy_reward": 0.33046875, + "rewards/format_reward": 0.9671875, + "step": 2200 + }, + { + "epoch": 0.9718119323044643, + "eval_completion_length": 168.58166748046875, + "eval_kl": 0.486328125, + "eval_loss": 0.019449135288596153, + "eval_reward": 1.2825, + "eval_reward_std": 0.28413535237312315, + "eval_rewards/accuracy_reward": 0.3333333337306976, + "eval_rewards/format_reward": 0.9491666674613952, + "eval_runtime": 159.9487, + "eval_samples_per_second": 0.619, + "eval_steps_per_second": 0.025, + "step": 2200 + }, + { + "completion_length": 153.36015625, + "epoch": 0.9740205957869744, + "grad_norm": 0.4312755763530731, + "kl": 0.52962646484375, + "learning_rate": 4.0020296343065144e-08, + "loss": 0.0212, + "reward": 1.2046875, + "reward_std": 0.2571037333458662, + "rewards/accuracy_reward": 0.24609375, + "rewards/format_reward": 0.95859375, + "step": 2205 + }, + { + "completion_length": 146.95, + "epoch": 0.9762292592694846, + "grad_norm": 0.8880886435508728, + "kl": 0.478515625, + "learning_rate": 3.3421341764152684e-08, + "loss": 0.0191, + "reward": 1.23359375, + "reward_std": 0.2560200056061149, + "rewards/accuracy_reward": 0.27109375, + "rewards/format_reward": 0.9625, + "step": 2210 + }, + { + "completion_length": 144.825, + "epoch": 0.9784379227519947, + "grad_norm": 0.788774311542511, + "kl": 0.512060546875, + "learning_rate": 2.7415624077551383e-08, + "loss": 0.0205, + "reward": 1.2359375, + "reward_std": 0.26023210752755405, + "rewards/accuracy_reward": 0.2765625, + "rewards/format_reward": 0.959375, + "step": 2215 + }, + { + "completion_length": 133.7578125, + "epoch": 0.9806465862345048, + "grad_norm": 0.5918937921524048, + "kl": 0.51031494140625, + "learning_rate": 2.2003500759322228e-08, + "loss": 0.0204, + "reward": 1.2203125, + "reward_std": 0.24810067620128393, + "rewards/accuracy_reward": 0.2515625, + "rewards/format_reward": 0.96875, + "step": 2220 + }, + { + "completion_length": 131.171875, + "epoch": 0.982855249717015, + "grad_norm": 0.4942164421081543, + "kl": 0.49373779296875, + "learning_rate": 1.718529395323687e-08, + "loss": 0.0198, + "reward": 1.234375, + "reward_std": 0.22725256606936456, + "rewards/accuracy_reward": 0.265625, + "rewards/format_reward": 0.96875, + "step": 2225 + }, + { + "completion_length": 151.55703125, + "epoch": 0.9850639131995251, + "grad_norm": 0.4727592468261719, + "kl": 0.46812744140625, + "learning_rate": 1.2961290451594111e-08, + "loss": 0.0187, + "reward": 1.2390625, + "reward_std": 0.2715959116816521, + "rewards/accuracy_reward": 0.2796875, + "rewards/format_reward": 0.959375, + "step": 2230 + }, + { + "completion_length": 149.5328125, + "epoch": 0.9872725766820353, + "grad_norm": 0.7650235891342163, + "kl": 0.46435546875, + "learning_rate": 9.3317416781602e-09, + "loss": 0.0186, + "reward": 1.26875, + "reward_std": 0.2793737856671214, + "rewards/accuracy_reward": 0.3109375, + "rewards/format_reward": 0.9578125, + "step": 2235 + }, + { + "completion_length": 148.59296875, + "epoch": 0.9894812401645454, + "grad_norm": 0.9975623488426208, + "kl": 0.50648193359375, + "learning_rate": 6.296863673191933e-09, + "loss": 0.0203, + "reward": 1.26796875, + "reward_std": 0.270522028952837, + "rewards/accuracy_reward": 0.3109375, + "rewards/format_reward": 0.95703125, + "step": 2240 + }, + { + "completion_length": 148.49765625, + "epoch": 0.9916899036470556, + "grad_norm": 0.8689573407173157, + "kl": 0.4764892578125, + "learning_rate": 3.856837080585818e-09, + "loss": 0.0191, + "reward": 1.25, + "reward_std": 0.23697545174509288, + "rewards/accuracy_reward": 0.2828125, + "rewards/format_reward": 0.9671875, + "step": 2245 + }, + { + "completion_length": 149.41484375, + "epoch": 0.9938985671295657, + "grad_norm": 0.9692167639732361, + "kl": 0.488427734375, + "learning_rate": 2.0118071371211244e-09, + "loss": 0.0195, + "reward": 1.2078125, + "reward_std": 0.25148440394550564, + "rewards/accuracy_reward": 0.2484375, + "rewards/format_reward": 0.959375, + "step": 2250 + }, + { + "completion_length": 139.27109375, + "epoch": 0.9961072306120758, + "grad_norm": 13.9436674118042, + "kl": 0.557568359375, + "learning_rate": 7.618836638190186e-10, + "loss": 0.0223, + "reward": 1.26953125, + "reward_std": 0.2697257066145539, + "rewards/accuracy_reward": 0.30625, + "rewards/format_reward": 0.96328125, + "step": 2255 + }, + { + "completion_length": 156.4703125, + "epoch": 0.998315894094586, + "grad_norm": 0.910273015499115, + "kl": 0.52550048828125, + "learning_rate": 1.0714105940001773e-10, + "loss": 0.021, + "reward": 1.190625, + "reward_std": 0.2517782935872674, + "rewards/accuracy_reward": 0.23984375, + "rewards/format_reward": 0.95078125, + "step": 2260 + }, + { + "completion_length": 164.65104166666666, + "epoch": 0.9996410921840921, + "kl": 0.5259602864583334, + "reward": 1.20703125, + "reward_std": 0.2667766287922859, + "rewards/accuracy_reward": 0.2578125, + "rewards/format_reward": 0.94921875, + "step": 2263, + "total_flos": 0.0, + "train_loss": 2.13883109888834, + "train_runtime": 166892.9358, + "train_samples_per_second": 0.434, + "train_steps_per_second": 0.014 + } + ], + "logging_steps": 5, + "max_steps": 2263, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": false, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 16, + "trial_name": null, + "trial_params": null +}