{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9996410921840921, "eval_steps": 100, "global_step": 2263, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "completion_length": 400.17890625, "epoch": 0.002208663482510146, "grad_norm": 1.4726563692092896, "kl": 0.0001227259635925293, "learning_rate": 4.405286343612335e-07, "loss": 0.0, "reward": 0.57421875, "reward_std": 0.458265589363873, "rewards/accuracy_reward": 0.15078125, "rewards/format_reward": 0.4234375, "step": 5 }, { "completion_length": 379.875, "epoch": 0.004417326965020292, "grad_norm": 1.1294347047805786, "kl": 0.00025554299354553224, "learning_rate": 8.81057268722467e-07, "loss": 0.0, "reward": 0.6515625, "reward_std": 0.4448237407952547, "rewards/accuracy_reward": 0.146875, "rewards/format_reward": 0.5046875, "step": 10 }, { "completion_length": 349.56640625, "epoch": 0.006625990447530438, "grad_norm": 0.7087352871894836, "kl": 0.003926074504852295, "learning_rate": 1.3215859030837006e-06, "loss": 0.0002, "reward": 0.7, "reward_std": 0.4231703171506524, "rewards/accuracy_reward": 0.1078125, "rewards/format_reward": 0.5921875, "step": 15 }, { "completion_length": 259.32421875, "epoch": 0.008834653930040584, "grad_norm": 6.263583660125732, "kl": 0.08897933959960938, "learning_rate": 1.762114537444934e-06, "loss": 0.0036, "reward": 0.9296875, "reward_std": 0.34432896580547095, "rewards/accuracy_reward": 0.09453125, "rewards/format_reward": 0.83515625, "step": 20 }, { "completion_length": 258.88203125, "epoch": 0.01104331741255073, "grad_norm": 0.662657618522644, "kl": 0.041219329833984374, "learning_rate": 2.2026431718061673e-06, "loss": 0.0016, "reward": 0.93359375, "reward_std": 0.3294501030817628, "rewards/accuracy_reward": 0.08828125, "rewards/format_reward": 0.8453125, "step": 25 }, { "completion_length": 283.37578125, "epoch": 0.013251980895060876, "grad_norm": 0.8266062140464783, "kl": 0.03238487243652344, "learning_rate": 2.643171806167401e-06, "loss": 0.0013, "reward": 0.88671875, "reward_std": 0.38558061737567184, "rewards/accuracy_reward": 0.1015625, "rewards/format_reward": 0.78515625, "step": 30 }, { "completion_length": 228.65625, "epoch": 0.015460644377571022, "grad_norm": 0.9381304383277893, "kl": 0.023354721069335938, "learning_rate": 3.0837004405286347e-06, "loss": 0.0009, "reward": 0.9203125, "reward_std": 0.34575226698070766, "rewards/accuracy_reward": 0.09296875, "rewards/format_reward": 0.82734375, "step": 35 }, { "completion_length": 175.81953125, "epoch": 0.017669307860081168, "grad_norm": 0.654951274394989, "kl": 0.04830093383789062, "learning_rate": 3.524229074889868e-06, "loss": 0.0019, "reward": 0.99765625, "reward_std": 0.3096940713003278, "rewards/accuracy_reward": 0.11015625, "rewards/format_reward": 0.8875, "step": 40 }, { "completion_length": 179.19765625, "epoch": 0.019877971342591314, "grad_norm": 0.46008017659187317, "kl": 0.0401031494140625, "learning_rate": 3.964757709251102e-06, "loss": 0.0016, "reward": 1.0421875, "reward_std": 0.22858326323330402, "rewards/accuracy_reward": 0.09453125, "rewards/format_reward": 0.94765625, "step": 45 }, { "completion_length": 219.9734375, "epoch": 0.02208663482510146, "grad_norm": 0.6333373188972473, "kl": 0.05000381469726563, "learning_rate": 4.405286343612335e-06, "loss": 0.002, "reward": 1.11640625, "reward_std": 0.2830535739660263, "rewards/accuracy_reward": 0.16875, "rewards/format_reward": 0.94765625, "step": 50 }, { "completion_length": 206.340625, "epoch": 0.024295298307611606, "grad_norm": 0.444731205701828, "kl": 0.07784347534179688, "learning_rate": 4.8458149779735685e-06, "loss": 0.0031, "reward": 1.08671875, "reward_std": 0.24719008896499872, "rewards/accuracy_reward": 0.1296875, "rewards/format_reward": 0.95703125, "step": 55 }, { "completion_length": 217.25546875, "epoch": 0.026503961790121752, "grad_norm": 0.49777764081954956, "kl": 0.0484649658203125, "learning_rate": 5.286343612334802e-06, "loss": 0.0019, "reward": 1.096875, "reward_std": 0.2545401843264699, "rewards/accuracy_reward": 0.1390625, "rewards/format_reward": 0.9578125, "step": 60 }, { "completion_length": 206.72265625, "epoch": 0.028712625272631898, "grad_norm": 0.5592818260192871, "kl": 0.05699615478515625, "learning_rate": 5.7268722466960354e-06, "loss": 0.0023, "reward": 1.1234375, "reward_std": 0.25865183435380457, "rewards/accuracy_reward": 0.16640625, "rewards/format_reward": 0.95703125, "step": 65 }, { "completion_length": 178.8796875, "epoch": 0.030921288755142044, "grad_norm": 0.7485532164573669, "kl": 0.06690826416015624, "learning_rate": 6.167400881057269e-06, "loss": 0.0027, "reward": 1.15859375, "reward_std": 0.2730751080438495, "rewards/accuracy_reward": 0.1859375, "rewards/format_reward": 0.97265625, "step": 70 }, { "completion_length": 206.44375, "epoch": 0.033129952237652194, "grad_norm": 0.5321747064590454, "kl": 0.05711669921875, "learning_rate": 6.607929515418503e-06, "loss": 0.0023, "reward": 1.14140625, "reward_std": 0.2609355779364705, "rewards/accuracy_reward": 0.17578125, "rewards/format_reward": 0.965625, "step": 75 }, { "completion_length": 180.4796875, "epoch": 0.035338615720162336, "grad_norm": 0.7185708284378052, "kl": 0.079705810546875, "learning_rate": 7.048458149779736e-06, "loss": 0.0032, "reward": 1.20859375, "reward_std": 0.2991209041327238, "rewards/accuracy_reward": 0.240625, "rewards/format_reward": 0.96796875, "step": 80 }, { "completion_length": 205.42890625, "epoch": 0.037547279202672486, "grad_norm": 0.6930840611457825, "kl": 0.079119873046875, "learning_rate": 7.48898678414097e-06, "loss": 0.0032, "reward": 1.2125, "reward_std": 0.29966206308454274, "rewards/accuracy_reward": 0.24296875, "rewards/format_reward": 0.96953125, "step": 85 }, { "completion_length": 232.1125, "epoch": 0.03975594268518263, "grad_norm": 0.6082009673118591, "kl": 0.078369140625, "learning_rate": 7.929515418502203e-06, "loss": 0.0031, "reward": 1.2390625, "reward_std": 0.3367170764133334, "rewards/accuracy_reward": 0.2796875, "rewards/format_reward": 0.959375, "step": 90 }, { "completion_length": 234.2796875, "epoch": 0.04196460616769278, "grad_norm": 0.5688744783401489, "kl": 0.0821014404296875, "learning_rate": 8.370044052863436e-06, "loss": 0.0033, "reward": 1.2484375, "reward_std": 0.3449540941044688, "rewards/accuracy_reward": 0.28515625, "rewards/format_reward": 0.96328125, "step": 95 }, { "completion_length": 247.909375, "epoch": 0.04417326965020292, "grad_norm": 0.4896390438079834, "kl": 0.093121337890625, "learning_rate": 8.81057268722467e-06, "loss": 0.0037, "reward": 1.271875, "reward_std": 0.29160809628665446, "rewards/accuracy_reward": 0.2953125, "rewards/format_reward": 0.9765625, "step": 100 }, { "epoch": 0.04417326965020292, "eval_completion_length": 264.7725, "eval_kl": 0.09, "eval_loss": 0.003620876930654049, "eval_reward": 1.2708333349227905, "eval_reward_std": 0.30468439966440203, "eval_rewards/accuracy_reward": 0.29583333343267443, "eval_rewards/format_reward": 0.975, "eval_runtime": 145.932, "eval_samples_per_second": 0.678, "eval_steps_per_second": 0.027, "step": 100 }, { "completion_length": 292.07734375, "epoch": 0.04638193313271307, "grad_norm": 0.5129627585411072, "kl": 0.099822998046875, "learning_rate": 9.251101321585904e-06, "loss": 0.004, "reward": 1.2453125, "reward_std": 0.3150330139324069, "rewards/accuracy_reward": 0.28671875, "rewards/format_reward": 0.95859375, "step": 105 }, { "completion_length": 264.43125, "epoch": 0.04859059661522321, "grad_norm": 0.604174017906189, "kl": 0.103936767578125, "learning_rate": 9.691629955947137e-06, "loss": 0.0042, "reward": 1.26015625, "reward_std": 0.26412205342203376, "rewards/accuracy_reward": 0.290625, "rewards/format_reward": 0.96953125, "step": 110 }, { "completion_length": 238.78203125, "epoch": 0.05079926009773336, "grad_norm": 0.5758931040763855, "kl": 5888.534802246094, "learning_rate": 1.0132158590308372e-05, "loss": 234.88, "reward": 1.30703125, "reward_std": 0.3428795490413904, "rewards/accuracy_reward": 0.35078125, "rewards/format_reward": 0.95625, "step": 115 }, { "completion_length": 227.2875, "epoch": 0.053007923580243504, "grad_norm": 0.6410739421844482, "kl": 0.21376953125, "learning_rate": 1.0572687224669605e-05, "loss": 0.0085, "reward": 1.2140625, "reward_std": 0.39241575095802544, "rewards/accuracy_reward": 0.29765625, "rewards/format_reward": 0.91640625, "step": 120 }, { "completion_length": 208.746875, "epoch": 0.055216587062753654, "grad_norm": 118.43889617919922, "kl": 9.3187255859375, "learning_rate": 1.1013215859030836e-05, "loss": 0.3719, "reward": 1.1359375, "reward_std": 0.34939223267138003, "rewards/accuracy_reward": 0.2296875, "rewards/format_reward": 0.90625, "step": 125 }, { "completion_length": 164.7515625, "epoch": 0.057425250545263797, "grad_norm": 0.6973806023597717, "kl": 1.569244384765625, "learning_rate": 1.1453744493392071e-05, "loss": 0.0628, "reward": 1.20390625, "reward_std": 0.3942227842286229, "rewards/accuracy_reward": 0.29140625, "rewards/format_reward": 0.9125, "step": 130 }, { "completion_length": 188.734375, "epoch": 0.059633914027773946, "grad_norm": 54.8037223815918, "kl": 9.47294921875, "learning_rate": 1.1894273127753304e-05, "loss": 0.3796, "reward": 1.20234375, "reward_std": 0.32416225373744967, "rewards/accuracy_reward": 0.27734375, "rewards/format_reward": 0.925, "step": 135 }, { "completion_length": 150.5953125, "epoch": 0.06184257751028409, "grad_norm": 33.95077133178711, "kl": 1.503021240234375, "learning_rate": 1.2334801762114539e-05, "loss": 0.0602, "reward": 1.23671875, "reward_std": 0.30887170899659394, "rewards/accuracy_reward": 0.28203125, "rewards/format_reward": 0.9546875, "step": 140 }, { "completion_length": 179.94453125, "epoch": 0.06405124099279423, "grad_norm": 2.577629804611206, "kl": 1.558148193359375, "learning_rate": 1.2775330396475772e-05, "loss": 0.0623, "reward": 1.11953125, "reward_std": 0.4326841413974762, "rewards/accuracy_reward": 0.27265625, "rewards/format_reward": 0.846875, "step": 145 }, { "completion_length": 218.13203125, "epoch": 0.06625990447530439, "grad_norm": 4.710011005401611, "kl": 2.9638153076171876, "learning_rate": 1.3215859030837006e-05, "loss": 0.1188, "reward": 1.196875, "reward_std": 0.4242498528212309, "rewards/accuracy_reward": 0.32421875, "rewards/format_reward": 0.87265625, "step": 150 }, { "completion_length": 283.83828125, "epoch": 0.06846856795781453, "grad_norm": 1.0956288576126099, "kl": 10.248779296875, "learning_rate": 1.3656387665198238e-05, "loss": 0.4114, "reward": 1.19453125, "reward_std": 0.4564665203914046, "rewards/accuracy_reward": 0.34921875, "rewards/format_reward": 0.8453125, "step": 155 }, { "completion_length": 209.16328125, "epoch": 0.07067723144032467, "grad_norm": 0.584996223449707, "kl": 0.1743133544921875, "learning_rate": 1.4096916299559472e-05, "loss": 0.007, "reward": 1.36953125, "reward_std": 0.3628161208704114, "rewards/accuracy_reward": 0.40859375, "rewards/format_reward": 0.9609375, "step": 160 }, { "completion_length": 196.24453125, "epoch": 0.07288589492283482, "grad_norm": 0.8083503246307373, "kl": 0.168878173828125, "learning_rate": 1.4537444933920706e-05, "loss": 0.0068, "reward": 1.290625, "reward_std": 0.31575766000896693, "rewards/accuracy_reward": 0.33515625, "rewards/format_reward": 0.95546875, "step": 165 }, { "completion_length": 190.34453125, "epoch": 0.07509455840534497, "grad_norm": 0.6046306490898132, "kl": 0.1955078125, "learning_rate": 1.497797356828194e-05, "loss": 0.0078, "reward": 1.1984375, "reward_std": 0.37014698795974255, "rewards/accuracy_reward": 0.2765625, "rewards/format_reward": 0.921875, "step": 170 }, { "completion_length": 148.55, "epoch": 0.07730322188785511, "grad_norm": 0.509132444858551, "kl": 0.1782470703125, "learning_rate": 1.5418502202643173e-05, "loss": 0.0071, "reward": 1.23125, "reward_std": 0.23883652742952108, "rewards/accuracy_reward": 0.246875, "rewards/format_reward": 0.984375, "step": 175 }, { "completion_length": 307.5828125, "epoch": 0.07951188537036526, "grad_norm": 115.14045715332031, "kl": 2.41651611328125, "learning_rate": 1.5859030837004406e-05, "loss": 0.0967, "reward": 1.1296875, "reward_std": 0.4080469489097595, "rewards/accuracy_reward": 0.28125, "rewards/format_reward": 0.8484375, "step": 180 }, { "completion_length": 745.15546875, "epoch": 0.0817205488528754, "grad_norm": 1.7153384685516357, "kl": 1.31116943359375, "learning_rate": 1.629955947136564e-05, "loss": 0.0524, "reward": 0.30546875, "reward_std": 0.46172712091356516, "rewards/accuracy_reward": 0.0890625, "rewards/format_reward": 0.21640625, "step": 185 }, { "completion_length": 366.26796875, "epoch": 0.08392921233538556, "grad_norm": 2.2445106506347656, "kl": 4.1861083984375, "learning_rate": 1.6740088105726872e-05, "loss": 0.1674, "reward": 0.69453125, "reward_std": 0.5489674057811499, "rewards/accuracy_reward": 0.225, "rewards/format_reward": 0.46953125, "step": 190 }, { "completion_length": 276.47890625, "epoch": 0.0861378758178957, "grad_norm": 1.3458584547042847, "kl": 0.4182861328125, "learning_rate": 1.718061674008811e-05, "loss": 0.0167, "reward": 1.2015625, "reward_std": 0.47771220188587904, "rewards/accuracy_reward": 0.35546875, "rewards/format_reward": 0.84609375, "step": 195 }, { "completion_length": 330.8, "epoch": 0.08834653930040584, "grad_norm": 5.644898414611816, "kl": 3.825439453125, "learning_rate": 1.762114537444934e-05, "loss": 0.153, "reward": 1.13203125, "reward_std": 0.5322479158639908, "rewards/accuracy_reward": 0.33125, "rewards/format_reward": 0.80078125, "step": 200 }, { "epoch": 0.08834653930040584, "eval_completion_length": 356.7441674804688, "eval_kl": 3.136875, "eval_loss": 0.12226903438568115, "eval_reward": 1.0791666674613953, "eval_reward_std": 0.5326099014282226, "eval_rewards/accuracy_reward": 0.3129166668653488, "eval_rewards/format_reward": 0.76625, "eval_runtime": 294.9089, "eval_samples_per_second": 0.336, "eval_steps_per_second": 0.014, "step": 200 }, { "completion_length": 272.7140625, "epoch": 0.09055520278291598, "grad_norm": 3.581615924835205, "kl": 1.26063232421875, "learning_rate": 1.8061674008810575e-05, "loss": 0.0504, "reward": 1.13828125, "reward_std": 0.35043725427240136, "rewards/accuracy_reward": 0.24296875, "rewards/format_reward": 0.8953125, "step": 205 }, { "completion_length": 215.88984375, "epoch": 0.09276386626542614, "grad_norm": 4.773531436920166, "kl": 0.9517578125, "learning_rate": 1.8502202643171808e-05, "loss": 0.0381, "reward": 1.23828125, "reward_std": 0.30075737833976746, "rewards/accuracy_reward": 0.30703125, "rewards/format_reward": 0.93125, "step": 210 }, { "completion_length": 225.04765625, "epoch": 0.09497252974793628, "grad_norm": 2.473970890045166, "kl": 0.9461669921875, "learning_rate": 1.894273127753304e-05, "loss": 0.0378, "reward": 1.09140625, "reward_std": 0.4519174795597792, "rewards/accuracy_reward": 0.2453125, "rewards/format_reward": 0.84609375, "step": 215 }, { "completion_length": 297.01875, "epoch": 0.09718119323044642, "grad_norm": 2.2808244228363037, "kl": 21.850811767578126, "learning_rate": 1.9383259911894274e-05, "loss": 0.8722, "reward": 1.08046875, "reward_std": 0.46868909504264594, "rewards/accuracy_reward": 0.27734375, "rewards/format_reward": 0.803125, "step": 220 }, { "completion_length": 228.60234375, "epoch": 0.09938985671295657, "grad_norm": 1.9014919996261597, "kl": 0.230517578125, "learning_rate": 1.982378854625551e-05, "loss": 0.0092, "reward": 1.23046875, "reward_std": 0.3573141796514392, "rewards/accuracy_reward": 0.30234375, "rewards/format_reward": 0.928125, "step": 225 }, { "completion_length": 389.16640625, "epoch": 0.10159852019546672, "grad_norm": 7.406689643859863, "kl": 1.709527587890625, "learning_rate": 1.99998928589406e-05, "loss": 0.0684, "reward": 1.0375, "reward_std": 0.49771256893873217, "rewards/accuracy_reward": 0.26875, "rewards/format_reward": 0.76875, "step": 230 }, { "completion_length": 524.803125, "epoch": 0.10380718367797687, "grad_norm": 10.62135124206543, "kl": 5.1268310546875, "learning_rate": 1.999923811633618e-05, "loss": 0.2051, "reward": 0.834375, "reward_std": 0.5984118554741145, "rewards/accuracy_reward": 0.21015625, "rewards/format_reward": 0.62421875, "step": 235 }, { "completion_length": 332.025, "epoch": 0.10601584716048701, "grad_norm": 1.785484790802002, "kl": 8.025537109375, "learning_rate": 1.999798819286288e-05, "loss": 0.3211, "reward": 0.48359375, "reward_std": 0.4500583238899708, "rewards/accuracy_reward": 0.11484375, "rewards/format_reward": 0.36875, "step": 240 }, { "completion_length": 292.31328125, "epoch": 0.10822451064299715, "grad_norm": 3.578373432159424, "kl": 0.82308349609375, "learning_rate": 1.9996143162919416e-05, "loss": 0.0329, "reward": 0.8859375, "reward_std": 0.5004101138561964, "rewards/accuracy_reward": 0.1703125, "rewards/format_reward": 0.715625, "step": 245 }, { "completion_length": 230.86953125, "epoch": 0.11043317412550731, "grad_norm": 3.702038288116455, "kl": 11.077984619140626, "learning_rate": 1.9993703136326808e-05, "loss": 0.4433, "reward": 1.26875, "reward_std": 0.3013193493708968, "rewards/accuracy_reward": 0.30390625, "rewards/format_reward": 0.96484375, "step": 250 }, { "completion_length": 218.70078125, "epoch": 0.11264183760801745, "grad_norm": 7.172746658325195, "kl": 0.32041015625, "learning_rate": 1.999066825832184e-05, "loss": 0.0128, "reward": 1.26015625, "reward_std": 0.23753905296325684, "rewards/accuracy_reward": 0.26875, "rewards/format_reward": 0.99140625, "step": 255 }, { "completion_length": 253.38984375, "epoch": 0.11485050109052759, "grad_norm": 1.3221828937530518, "kl": 8.120147705078125, "learning_rate": 1.9987038709548408e-05, "loss": 0.3232, "reward": 1.24609375, "reward_std": 0.2821945507079363, "rewards/accuracy_reward": 0.27734375, "rewards/format_reward": 0.96875, "step": 260 }, { "completion_length": 288.26796875, "epoch": 0.11705916457303774, "grad_norm": 0.6487714052200317, "kl": 0.57447509765625, "learning_rate": 1.9982814706046766e-05, "loss": 0.023, "reward": 1.1859375, "reward_std": 0.3182327225804329, "rewards/accuracy_reward": 0.2390625, "rewards/format_reward": 0.946875, "step": 265 }, { "completion_length": 361.62265625, "epoch": 0.11926782805554789, "grad_norm": 3.041498899459839, "kl": 1.428155517578125, "learning_rate": 1.997799649924068e-05, "loss": 0.0572, "reward": 1.06640625, "reward_std": 0.3401893651112914, "rewards/accuracy_reward": 0.1796875, "rewards/format_reward": 0.88671875, "step": 270 }, { "completion_length": 322.9, "epoch": 0.12147649153805803, "grad_norm": 1.5550798177719116, "kl": 0.65260009765625, "learning_rate": 1.9972584375922453e-05, "loss": 0.0261, "reward": 1.13984375, "reward_std": 0.37677015643566847, "rewards/accuracy_reward": 0.23984375, "rewards/format_reward": 0.9, "step": 275 }, { "completion_length": 319.37578125, "epoch": 0.12368515502056818, "grad_norm": 114126.546875, "kl": 4326.556237792969, "learning_rate": 1.996657865823585e-05, "loss": 172.9404, "reward": 1.23828125, "reward_std": 0.33014066983014345, "rewards/accuracy_reward": 0.30078125, "rewards/format_reward": 0.9375, "step": 280 }, { "completion_length": 312.85546875, "epoch": 0.12589381850307832, "grad_norm": 1.4805908203125, "kl": 0.228668212890625, "learning_rate": 1.995997970365694e-05, "loss": 0.0091, "reward": 1.21171875, "reward_std": 0.3828111669048667, "rewards/accuracy_reward": 0.2890625, "rewards/format_reward": 0.92265625, "step": 285 }, { "completion_length": 354.6484375, "epoch": 0.12810248198558846, "grad_norm": 53.09714889526367, "kl": 4.07802734375, "learning_rate": 1.9952787904972794e-05, "loss": 0.1632, "reward": 1.053125, "reward_std": 0.4529764140024781, "rewards/accuracy_reward": 0.21640625, "rewards/format_reward": 0.83671875, "step": 290 }, { "completion_length": 285.98125, "epoch": 0.1303111454680986, "grad_norm": 0.581076443195343, "kl": 0.92733154296875, "learning_rate": 1.9945003690258127e-05, "loss": 0.0371, "reward": 1.1953125, "reward_std": 0.31984729822725055, "rewards/accuracy_reward": 0.259375, "rewards/format_reward": 0.9359375, "step": 295 }, { "completion_length": 269.2640625, "epoch": 0.13251980895060878, "grad_norm": 0.33922237157821655, "kl": 0.34735107421875, "learning_rate": 1.993662752284981e-05, "loss": 0.0139, "reward": 1.2796875, "reward_std": 0.257637638784945, "rewards/accuracy_reward": 0.30390625, "rewards/format_reward": 0.97578125, "step": 300 }, { "epoch": 0.13251980895060878, "eval_completion_length": 312.8345849609375, "eval_kl": 0.2959375, "eval_loss": 0.012029927223920822, "eval_reward": 1.2270833349227905, "eval_reward_std": 0.3163718029856682, "eval_rewards/accuracy_reward": 0.2704166667163372, "eval_rewards/format_reward": 0.9566666674613953, "eval_runtime": 158.7274, "eval_samples_per_second": 0.624, "eval_steps_per_second": 0.025, "step": 300 }, { "completion_length": 344.0703125, "epoch": 0.13472847243311892, "grad_norm": 15.648075103759766, "kl": 0.812335205078125, "learning_rate": 1.9927659901319292e-05, "loss": 0.0324, "reward": 1.18515625, "reward_std": 0.36495272126048806, "rewards/accuracy_reward": 0.24765625, "rewards/format_reward": 0.9375, "step": 305 }, { "completion_length": 351.78984375, "epoch": 0.13693713591562906, "grad_norm": 0.3627087473869324, "kl": 0.196575927734375, "learning_rate": 1.9918101359442908e-05, "loss": 0.0079, "reward": 1.1203125, "reward_std": 0.3219464411959052, "rewards/accuracy_reward": 0.20234375, "rewards/format_reward": 0.91796875, "step": 310 }, { "completion_length": 289.5125, "epoch": 0.1391457993981392, "grad_norm": 0.41371065378189087, "kl": 0.208282470703125, "learning_rate": 1.990795246617014e-05, "loss": 0.0083, "reward": 1.15234375, "reward_std": 0.30122786965221166, "rewards/accuracy_reward": 0.21328125, "rewards/format_reward": 0.9390625, "step": 315 }, { "completion_length": 243.4625, "epoch": 0.14135446288064935, "grad_norm": 0.3589507043361664, "kl": 0.216851806640625, "learning_rate": 1.989721382558972e-05, "loss": 0.0087, "reward": 1.2484375, "reward_std": 0.3238747540861368, "rewards/accuracy_reward": 0.28515625, "rewards/format_reward": 0.96328125, "step": 320 }, { "completion_length": 243.1734375, "epoch": 0.1435631263631595, "grad_norm": 0.38298115134239197, "kl": 0.201153564453125, "learning_rate": 1.988588607689369e-05, "loss": 0.008, "reward": 1.2078125, "reward_std": 0.2437373088672757, "rewards/accuracy_reward": 0.23515625, "rewards/format_reward": 0.97265625, "step": 325 }, { "completion_length": 278.3796875, "epoch": 0.14577178984566963, "grad_norm": 0.7377725839614868, "kl": 0.229010009765625, "learning_rate": 1.987396989433935e-05, "loss": 0.0092, "reward": 1.153125, "reward_std": 0.36884579751640556, "rewards/accuracy_reward": 0.2328125, "rewards/format_reward": 0.9203125, "step": 330 }, { "completion_length": 280.9796875, "epoch": 0.14798045332817977, "grad_norm": 3.068225622177124, "kl": 0.5145751953125, "learning_rate": 1.986146598720913e-05, "loss": 0.0205, "reward": 1.0796875, "reward_std": 0.3447397375479341, "rewards/accuracy_reward": 0.17734375, "rewards/format_reward": 0.90234375, "step": 335 }, { "completion_length": 231.14296875, "epoch": 0.15018911681068994, "grad_norm": 1.5392614603042603, "kl": 1.3296142578125, "learning_rate": 1.984837509976837e-05, "loss": 0.0532, "reward": 1.14375, "reward_std": 0.3358943074941635, "rewards/accuracy_reward": 0.215625, "rewards/format_reward": 0.928125, "step": 340 }, { "completion_length": 248.0734375, "epoch": 0.15239778029320009, "grad_norm": 0.6375353336334229, "kl": 1.9455322265625, "learning_rate": 1.9834698011221008e-05, "loss": 0.0778, "reward": 1.13203125, "reward_std": 0.38343740683048966, "rewards/accuracy_reward": 0.2203125, "rewards/format_reward": 0.91171875, "step": 345 }, { "completion_length": 261.58046875, "epoch": 0.15460644377571023, "grad_norm": 2110458.0, "kl": 12810.955236816406, "learning_rate": 1.982043553566321e-05, "loss": 512.4465, "reward": 1.06953125, "reward_std": 0.38607826493680475, "rewards/accuracy_reward": 0.18046875, "rewards/format_reward": 0.8890625, "step": 350 }, { "completion_length": 325.5890625, "epoch": 0.15681510725822037, "grad_norm": 7.603396415710449, "kl": 20.8695068359375, "learning_rate": 1.980558852203492e-05, "loss": 0.8364, "reward": 1.02265625, "reward_std": 0.46340725645422937, "rewards/accuracy_reward": 0.209375, "rewards/format_reward": 0.81328125, "step": 355 }, { "completion_length": 344.390625, "epoch": 0.1590237707407305, "grad_norm": 1.510093331336975, "kl": 1.6796875, "learning_rate": 1.979015785406931e-05, "loss": 0.0672, "reward": 1.0, "reward_std": 0.470697814039886, "rewards/accuracy_reward": 0.19921875, "rewards/format_reward": 0.80078125, "step": 360 }, { "completion_length": 243.0875, "epoch": 0.16123243422324066, "grad_norm": 7.704539775848389, "kl": 4.3294921875, "learning_rate": 1.97741444502402e-05, "loss": 0.1733, "reward": 1.16328125, "reward_std": 0.3427719760686159, "rewards/accuracy_reward": 0.2390625, "rewards/format_reward": 0.92421875, "step": 365 }, { "completion_length": 207.48046875, "epoch": 0.1634410977057508, "grad_norm": 5.120603561401367, "kl": 2.038916015625, "learning_rate": 1.9757549263707366e-05, "loss": 0.0816, "reward": 1.19296875, "reward_std": 0.35142498891800644, "rewards/accuracy_reward": 0.25625, "rewards/format_reward": 0.93671875, "step": 370 }, { "completion_length": 229.01953125, "epoch": 0.16564976118826094, "grad_norm": 7.258485317230225, "kl": 2.72215576171875, "learning_rate": 1.974037328225982e-05, "loss": 0.1089, "reward": 1.14921875, "reward_std": 0.3493613565340638, "rewards/accuracy_reward": 0.22421875, "rewards/format_reward": 0.925, "step": 375 }, { "completion_length": 217.7515625, "epoch": 0.1678584246707711, "grad_norm": 7.898167133331299, "kl": 2.309033203125, "learning_rate": 1.972261752825701e-05, "loss": 0.0924, "reward": 1.14453125, "reward_std": 0.300789905525744, "rewards/accuracy_reward": 0.20625, "rewards/format_reward": 0.93828125, "step": 380 }, { "completion_length": 168.23515625, "epoch": 0.17006708815328125, "grad_norm": 0.5930284261703491, "kl": 1.09581298828125, "learning_rate": 1.9704283058567972e-05, "loss": 0.0439, "reward": 1.17421875, "reward_std": 0.2850266819819808, "rewards/accuracy_reward": 0.2140625, "rewards/format_reward": 0.96015625, "step": 385 }, { "completion_length": 156.07421875, "epoch": 0.1722757516357914, "grad_norm": 2.664320945739746, "kl": 1.3061279296875, "learning_rate": 1.968537096450841e-05, "loss": 0.0523, "reward": 1.1390625, "reward_std": 0.28086008559912445, "rewards/accuracy_reward": 0.18046875, "rewards/format_reward": 0.95859375, "step": 390 }, { "completion_length": 204.665625, "epoch": 0.17448441511830154, "grad_norm": 1.541552186012268, "kl": 1.1947509765625, "learning_rate": 1.9665882371775735e-05, "loss": 0.0478, "reward": 1.1328125, "reward_std": 0.26257925033569335, "rewards/accuracy_reward": 0.175, "rewards/format_reward": 0.9578125, "step": 395 }, { "completion_length": 227.2359375, "epoch": 0.17669307860081168, "grad_norm": 0.41995060443878174, "kl": 0.6468994140625, "learning_rate": 1.9645818440382096e-05, "loss": 0.0259, "reward": 1.215625, "reward_std": 0.3341550791636109, "rewards/accuracy_reward": 0.2703125, "rewards/format_reward": 0.9453125, "step": 400 }, { "epoch": 0.17669307860081168, "eval_completion_length": 243.36083374023437, "eval_kl": 0.38421875, "eval_loss": 0.015584616921842098, "eval_reward": 1.2075, "eval_reward_std": 0.30897092461586, "eval_rewards/accuracy_reward": 0.2583333334326744, "eval_rewards/format_reward": 0.9491666674613952, "eval_runtime": 159.9641, "eval_samples_per_second": 0.619, "eval_steps_per_second": 0.025, "step": 400 }, { "completion_length": 266.815625, "epoch": 0.17890174208332182, "grad_norm": 0.7400681376457214, "kl": 0.40008544921875, "learning_rate": 1.962518036458529e-05, "loss": 0.016, "reward": 1.190625, "reward_std": 0.31112865209579466, "rewards/accuracy_reward": 0.23828125, "rewards/format_reward": 0.95234375, "step": 405 }, { "completion_length": 280.94921875, "epoch": 0.18111040556583197, "grad_norm": 0.30900517106056213, "kl": 0.38648681640625, "learning_rate": 1.9603969372817695e-05, "loss": 0.0155, "reward": 1.240625, "reward_std": 0.294854056276381, "rewards/accuracy_reward": 0.28828125, "rewards/format_reward": 0.95234375, "step": 410 }, { "completion_length": 358.7671875, "epoch": 0.1833190690483421, "grad_norm": 1.0198228359222412, "kl": 1.0703125, "learning_rate": 1.9582186727613152e-05, "loss": 0.0428, "reward": 1.01796875, "reward_std": 0.4371380554512143, "rewards/accuracy_reward": 0.2234375, "rewards/format_reward": 0.79453125, "step": 415 }, { "completion_length": 286.39140625, "epoch": 0.18552773253085228, "grad_norm": 1.2672603130340576, "kl": 0.32391357421875, "learning_rate": 1.955983372553182e-05, "loss": 0.013, "reward": 0.946875, "reward_std": 0.4627113614231348, "rewards/accuracy_reward": 0.171875, "rewards/format_reward": 0.775, "step": 420 }, { "completion_length": 207.659375, "epoch": 0.18773639601336242, "grad_norm": 0.4993106424808502, "kl": 0.32420654296875, "learning_rate": 1.953691169708298e-05, "loss": 0.013, "reward": 1.05859375, "reward_std": 0.3334008002653718, "rewards/accuracy_reward": 0.15546875, "rewards/format_reward": 0.903125, "step": 425 }, { "completion_length": 167.09296875, "epoch": 0.18994505949587256, "grad_norm": 0.7137377858161926, "kl": 0.3811279296875, "learning_rate": 1.9513422006645867e-05, "loss": 0.0152, "reward": 1.12265625, "reward_std": 0.3202515557408333, "rewards/accuracy_reward": 0.1984375, "rewards/format_reward": 0.92421875, "step": 430 }, { "completion_length": 259.22109375, "epoch": 0.1921537229783827, "grad_norm": 0.4750295877456665, "kl": 0.30950927734375, "learning_rate": 1.9489366052388443e-05, "loss": 0.0124, "reward": 1.121875, "reward_std": 0.4058058561757207, "rewards/accuracy_reward": 0.225, "rewards/format_reward": 0.896875, "step": 435 }, { "completion_length": 234.15546875, "epoch": 0.19436238646089285, "grad_norm": 0.5221619009971619, "kl": 0.27547607421875, "learning_rate": 1.9464745266184173e-05, "loss": 0.011, "reward": 1.1921875, "reward_std": 0.3065523250028491, "rewards/accuracy_reward": 0.25546875, "rewards/format_reward": 0.93671875, "step": 440 }, { "completion_length": 150.1078125, "epoch": 0.196571049943403, "grad_norm": 7.660337924957275, "kl": 0.38460693359375, "learning_rate": 1.9439561113526802e-05, "loss": 0.0154, "reward": 1.16328125, "reward_std": 0.2454235328361392, "rewards/accuracy_reward": 0.2015625, "rewards/format_reward": 0.96171875, "step": 445 }, { "completion_length": 145.90859375, "epoch": 0.19877971342591313, "grad_norm": 0.47105056047439575, "kl": 0.276025390625, "learning_rate": 1.9413815093443128e-05, "loss": 0.011, "reward": 1.14140625, "reward_std": 0.21770920380949974, "rewards/accuracy_reward": 0.1765625, "rewards/format_reward": 0.96484375, "step": 450 }, { "completion_length": 162.709375, "epoch": 0.20098837690842328, "grad_norm": 0.4455668032169342, "kl": 0.29571533203125, "learning_rate": 1.938750873840377e-05, "loss": 0.0118, "reward": 1.1203125, "reward_std": 0.2609225707128644, "rewards/accuracy_reward": 0.16171875, "rewards/format_reward": 0.95859375, "step": 455 }, { "completion_length": 185.5453125, "epoch": 0.20319704039093345, "grad_norm": 1.09501051902771, "kl": 0.34464111328125, "learning_rate": 1.9360643614231942e-05, "loss": 0.0138, "reward": 1.08125, "reward_std": 0.2819837937131524, "rewards/accuracy_reward": 0.1375, "rewards/format_reward": 0.94375, "step": 460 }, { "completion_length": 178.6546875, "epoch": 0.2054057038734436, "grad_norm": 0.501656174659729, "kl": 0.2873779296875, "learning_rate": 1.9333221320010275e-05, "loss": 0.0115, "reward": 1.1515625, "reward_std": 0.2634758483618498, "rewards/accuracy_reward": 0.19453125, "rewards/format_reward": 0.95703125, "step": 465 }, { "completion_length": 190.03671875, "epoch": 0.20761436735595373, "grad_norm": 1.7912346124649048, "kl": 0.30848388671875, "learning_rate": 1.930524348798562e-05, "loss": 0.0123, "reward": 1.14609375, "reward_std": 0.31838786210864783, "rewards/accuracy_reward": 0.215625, "rewards/format_reward": 0.93046875, "step": 470 }, { "completion_length": 174.62109375, "epoch": 0.20982303083846388, "grad_norm": 0.4804052412509918, "kl": 0.32216796875, "learning_rate": 1.9276711783471888e-05, "loss": 0.0129, "reward": 1.096875, "reward_std": 0.2830366240814328, "rewards/accuracy_reward": 0.165625, "rewards/format_reward": 0.93125, "step": 475 }, { "completion_length": 144.003125, "epoch": 0.21203169432097402, "grad_norm": 0.47488105297088623, "kl": 0.29373779296875, "learning_rate": 1.9247627904750937e-05, "loss": 0.0117, "reward": 1.22109375, "reward_std": 0.2334042889997363, "rewards/accuracy_reward": 0.2328125, "rewards/format_reward": 0.98828125, "step": 480 }, { "completion_length": 225.4140625, "epoch": 0.21424035780348416, "grad_norm": 0.6563146114349365, "kl": 0.37607421875, "learning_rate": 1.9217993582971485e-05, "loss": 0.015, "reward": 1.12109375, "reward_std": 0.35556240323930977, "rewards/accuracy_reward": 0.21015625, "rewards/format_reward": 0.9109375, "step": 485 }, { "completion_length": 148.5078125, "epoch": 0.2164490212859943, "grad_norm": 0.6001664400100708, "kl": 0.50482177734375, "learning_rate": 1.9187810582046056e-05, "loss": 0.0202, "reward": 1.16796875, "reward_std": 0.2445027854293585, "rewards/accuracy_reward": 0.196875, "rewards/format_reward": 0.97109375, "step": 490 }, { "completion_length": 175.846875, "epoch": 0.21865768476850445, "grad_norm": 0.48493316769599915, "kl": 0.33309326171875, "learning_rate": 1.9157080698546e-05, "loss": 0.0133, "reward": 1.134375, "reward_std": 0.2596387291327119, "rewards/accuracy_reward": 0.1765625, "rewards/format_reward": 0.9578125, "step": 495 }, { "completion_length": 172.95859375, "epoch": 0.22086634825101462, "grad_norm": 0.34802621603012085, "kl": 0.29521484375, "learning_rate": 1.9125805761594553e-05, "loss": 0.0118, "reward": 1.13125, "reward_std": 0.2513675343245268, "rewards/accuracy_reward": 0.1703125, "rewards/format_reward": 0.9609375, "step": 500 }, { "epoch": 0.22086634825101462, "eval_completion_length": 168.7541668701172, "eval_kl": 0.3077734375, "eval_loss": 0.012391554191708565, "eval_reward": 1.204166669845581, "eval_reward_std": 0.2949218952655792, "eval_rewards/accuracy_reward": 0.2508333337306976, "eval_rewards/format_reward": 0.9533333349227905, "eval_runtime": 143.7617, "eval_samples_per_second": 0.689, "eval_steps_per_second": 0.028, "step": 500 }, { "completion_length": 160.86171875, "epoch": 0.22307501173352476, "grad_norm": 0.750469446182251, "kl": 0.33531494140625, "learning_rate": 1.9093987632757957e-05, "loss": 0.0134, "reward": 1.13828125, "reward_std": 0.2916230414062738, "rewards/accuracy_reward": 0.18828125, "rewards/format_reward": 0.95, "step": 505 }, { "completion_length": 158.5875, "epoch": 0.2252836752160349, "grad_norm": 0.4148224890232086, "kl": 0.35068359375, "learning_rate": 1.9061628205934662e-05, "loss": 0.014, "reward": 1.12265625, "reward_std": 0.263315293751657, "rewards/accuracy_reward": 0.16953125, "rewards/format_reward": 0.953125, "step": 510 }, { "completion_length": 182.6203125, "epoch": 0.22749233869854504, "grad_norm": 0.3992038667201996, "kl": 0.2927001953125, "learning_rate": 1.9028729407242598e-05, "loss": 0.0117, "reward": 1.1828125, "reward_std": 0.3094627659767866, "rewards/accuracy_reward": 0.23125, "rewards/format_reward": 0.9515625, "step": 515 }, { "completion_length": 191.246875, "epoch": 0.22970100218105519, "grad_norm": 0.5499962568283081, "kl": 0.318408203125, "learning_rate": 1.8995293194904512e-05, "loss": 0.0127, "reward": 1.12734375, "reward_std": 0.2894813433289528, "rewards/accuracy_reward": 0.17890625, "rewards/format_reward": 0.9484375, "step": 520 }, { "completion_length": 167.7296875, "epoch": 0.23190966566356533, "grad_norm": 0.38229528069496155, "kl": 0.31822509765625, "learning_rate": 1.896132155913143e-05, "loss": 0.0127, "reward": 1.11484375, "reward_std": 0.29848874974995854, "rewards/accuracy_reward": 0.178125, "rewards/format_reward": 0.93671875, "step": 525 }, { "completion_length": 142.56015625, "epoch": 0.23411832914607547, "grad_norm": 0.5173822641372681, "kl": 0.33304443359375, "learning_rate": 1.892681652200418e-05, "loss": 0.0133, "reward": 1.08984375, "reward_std": 0.2929655512794852, "rewards/accuracy_reward": 0.15234375, "rewards/format_reward": 0.9375, "step": 530 }, { "completion_length": 151.20078125, "epoch": 0.2363269926285856, "grad_norm": 0.521994411945343, "kl": 0.344140625, "learning_rate": 1.8891780137353036e-05, "loss": 0.0138, "reward": 1.0921875, "reward_std": 0.2893120773136616, "rewards/accuracy_reward": 0.1578125, "rewards/format_reward": 0.934375, "step": 535 }, { "completion_length": 132.7984375, "epoch": 0.23853565611109578, "grad_norm": 0.5721232295036316, "kl": 0.36300048828125, "learning_rate": 1.885621449063547e-05, "loss": 0.0145, "reward": 1.128125, "reward_std": 0.24814818538725375, "rewards/accuracy_reward": 0.1671875, "rewards/format_reward": 0.9609375, "step": 540 }, { "completion_length": 144.96875, "epoch": 0.24074431959360593, "grad_norm": 0.3831787407398224, "kl": 0.34149169921875, "learning_rate": 1.8820121698812028e-05, "loss": 0.0137, "reward": 1.14140625, "reward_std": 0.22749478761106728, "rewards/accuracy_reward": 0.1765625, "rewards/format_reward": 0.96484375, "step": 545 }, { "completion_length": 169.89296875, "epoch": 0.24295298307611607, "grad_norm": 0.5316097140312195, "kl": 0.32564697265625, "learning_rate": 1.8783503910220296e-05, "loss": 0.013, "reward": 1.1546875, "reward_std": 0.2780306525528431, "rewards/accuracy_reward": 0.2046875, "rewards/format_reward": 0.95, "step": 550 }, { "completion_length": 149.83984375, "epoch": 0.2451616465586262, "grad_norm": 0.4427882432937622, "kl": 0.3146484375, "learning_rate": 1.8746363304447073e-05, "loss": 0.0126, "reward": 1.1578125, "reward_std": 0.22796925920993089, "rewards/accuracy_reward": 0.1859375, "rewards/format_reward": 0.971875, "step": 555 }, { "completion_length": 165.7453125, "epoch": 0.24737031004113635, "grad_norm": 0.47105562686920166, "kl": 0.298486328125, "learning_rate": 1.8708702092198576e-05, "loss": 0.0119, "reward": 1.17578125, "reward_std": 0.2685113290324807, "rewards/accuracy_reward": 0.2125, "rewards/format_reward": 0.96328125, "step": 560 }, { "completion_length": 168.32265625, "epoch": 0.2495789735236465, "grad_norm": 0.3255383372306824, "kl": 0.3055908203125, "learning_rate": 1.867052251516891e-05, "loss": 0.0122, "reward": 1.1390625, "reward_std": 0.24939600769430398, "rewards/accuracy_reward": 0.1703125, "rewards/format_reward": 0.96875, "step": 565 }, { "completion_length": 184.50546875, "epoch": 0.25178763700615664, "grad_norm": 0.4039280116558075, "kl": 0.31488037109375, "learning_rate": 1.8631826845906588e-05, "loss": 0.0126, "reward": 1.14140625, "reward_std": 0.2917328651994467, "rewards/accuracy_reward": 0.1984375, "rewards/format_reward": 0.94296875, "step": 570 }, { "completion_length": 187.24296875, "epoch": 0.2539963004886668, "grad_norm": 0.48246172070503235, "kl": 0.33466796875, "learning_rate": 1.8592617387679304e-05, "loss": 0.0134, "reward": 1.15625, "reward_std": 0.32329851035028695, "rewards/accuracy_reward": 0.21640625, "rewards/format_reward": 0.93984375, "step": 575 }, { "completion_length": 157.41796875, "epoch": 0.2562049639711769, "grad_norm": 0.5687190294265747, "kl": 0.365576171875, "learning_rate": 1.8552896474336816e-05, "loss": 0.0146, "reward": 1.1390625, "reward_std": 0.25018255431205033, "rewards/accuracy_reward": 0.17890625, "rewards/format_reward": 0.96015625, "step": 580 }, { "completion_length": 126.7265625, "epoch": 0.25841362745368707, "grad_norm": 0.4837645888328552, "kl": 0.494287109375, "learning_rate": 1.8512666470172024e-05, "loss": 0.0198, "reward": 1.13515625, "reward_std": 0.25846064239740374, "rewards/accuracy_reward": 0.1828125, "rewards/format_reward": 0.95234375, "step": 585 }, { "completion_length": 117.5125, "epoch": 0.2606222909361972, "grad_norm": 0.5099273324012756, "kl": 0.46514892578125, "learning_rate": 1.8471929769780247e-05, "loss": 0.0186, "reward": 1.1375, "reward_std": 0.27797329761087897, "rewards/accuracy_reward": 0.18671875, "rewards/format_reward": 0.95078125, "step": 590 }, { "completion_length": 129.4875, "epoch": 0.26283095441870735, "grad_norm": 0.48087364435195923, "kl": 23.66878662109375, "learning_rate": 1.8430688797916702e-05, "loss": 0.9494, "reward": 1.10234375, "reward_std": 0.26711587999016045, "rewards/accuracy_reward": 0.14609375, "rewards/format_reward": 0.95625, "step": 595 }, { "completion_length": 137.51171875, "epoch": 0.26503961790121755, "grad_norm": 0.5267772674560547, "kl": 0.357568359375, "learning_rate": 1.8388946009352157e-05, "loss": 0.0143, "reward": 1.15625, "reward_std": 0.2736740421503782, "rewards/accuracy_reward": 0.2046875, "rewards/format_reward": 0.9515625, "step": 600 }, { "epoch": 0.26503961790121755, "eval_completion_length": 110.09, "eval_kl": 0.377890625, "eval_loss": 0.01513399463146925, "eval_reward": 1.1708333349227906, "eval_reward_std": 0.2645035409927368, "eval_rewards/accuracy_reward": 0.20208333373069765, "eval_rewards/format_reward": 0.96875, "eval_runtime": 129.4495, "eval_samples_per_second": 0.765, "eval_steps_per_second": 0.031, "step": 600 }, { "completion_length": 103.6046875, "epoch": 0.2672482813837277, "grad_norm": 0.5802999138832092, "kl": 0.42120361328125, "learning_rate": 1.8346703888726833e-05, "loss": 0.0168, "reward": 1.1421875, "reward_std": 0.22200765572488307, "rewards/accuracy_reward": 0.1765625, "rewards/format_reward": 0.965625, "step": 605 }, { "completion_length": 141.5125, "epoch": 0.26945694486623784, "grad_norm": 0.595243513584137, "kl": 0.3810546875, "learning_rate": 1.8303964950402498e-05, "loss": 0.0152, "reward": 1.12265625, "reward_std": 0.29428734816610813, "rewards/accuracy_reward": 0.184375, "rewards/format_reward": 0.93828125, "step": 610 }, { "completion_length": 138.7, "epoch": 0.271665608348748, "grad_norm": 0.43679195642471313, "kl": 0.38223876953125, "learning_rate": 1.8260731738312817e-05, "loss": 0.0153, "reward": 1.10078125, "reward_std": 0.23885549493134023, "rewards/accuracy_reward": 0.1484375, "rewards/format_reward": 0.95234375, "step": 615 }, { "completion_length": 104.80703125, "epoch": 0.2738742718312581, "grad_norm": 0.5839166045188904, "kl": 0.4041748046875, "learning_rate": 1.8217006825811924e-05, "loss": 0.0162, "reward": 1.12890625, "reward_std": 0.19053069781512022, "rewards/accuracy_reward": 0.14609375, "rewards/format_reward": 0.9828125, "step": 620 }, { "completion_length": 132.15859375, "epoch": 0.27608293531376826, "grad_norm": 1.1488077640533447, "kl": 0.4154296875, "learning_rate": 1.8172792815521246e-05, "loss": 0.0166, "reward": 1.12890625, "reward_std": 0.24578131809830667, "rewards/accuracy_reward": 0.1703125, "rewards/format_reward": 0.95859375, "step": 625 }, { "completion_length": 203.21328125, "epoch": 0.2782915987962784, "grad_norm": 0.9778295159339905, "kl": 1.12738037109375, "learning_rate": 1.81280923391746e-05, "loss": 0.0452, "reward": 0.99453125, "reward_std": 0.3527091216295958, "rewards/accuracy_reward": 0.1265625, "rewards/format_reward": 0.86796875, "step": 630 }, { "completion_length": 107.22421875, "epoch": 0.28050026227878855, "grad_norm": 0.4877747595310211, "kl": 0.5244873046875, "learning_rate": 1.8082908057461534e-05, "loss": 0.021, "reward": 1.0953125, "reward_std": 0.23968660701066255, "rewards/accuracy_reward": 0.1484375, "rewards/format_reward": 0.946875, "step": 635 }, { "completion_length": 81.0328125, "epoch": 0.2827089257612987, "grad_norm": 0.7603225708007812, "kl": 0.5284912109375, "learning_rate": 1.8037242659868958e-05, "loss": 0.0211, "reward": 1.12109375, "reward_std": 0.21915814336389303, "rewards/accuracy_reward": 0.14453125, "rewards/format_reward": 0.9765625, "step": 640 }, { "completion_length": 118.26171875, "epoch": 0.28491758924380883, "grad_norm": 0.565264880657196, "kl": 0.44853515625, "learning_rate": 1.7991098864521066e-05, "loss": 0.018, "reward": 1.12578125, "reward_std": 0.2820789096876979, "rewards/accuracy_reward": 0.17734375, "rewards/format_reward": 0.9484375, "step": 645 }, { "completion_length": 92.51484375, "epoch": 0.287126252726319, "grad_norm": 0.5516146421432495, "kl": 0.4996337890625, "learning_rate": 1.794447941801754e-05, "loss": 0.02, "reward": 1.13515625, "reward_std": 0.21679833866655826, "rewards/accuracy_reward": 0.16328125, "rewards/format_reward": 0.971875, "step": 650 }, { "completion_length": 126.5140625, "epoch": 0.2893349162088291, "grad_norm": 0.5977817177772522, "kl": 0.4461181640625, "learning_rate": 1.7897387095270058e-05, "loss": 0.0178, "reward": 1.10546875, "reward_std": 0.24418613854795695, "rewards/accuracy_reward": 0.14296875, "rewards/format_reward": 0.9625, "step": 655 }, { "completion_length": 153.0734375, "epoch": 0.29154357969133926, "grad_norm": 0.4894627630710602, "kl": 0.37508544921875, "learning_rate": 1.7849824699337143e-05, "loss": 0.015, "reward": 1.1125, "reward_std": 0.22963083293288947, "rewards/accuracy_reward": 0.1515625, "rewards/format_reward": 0.9609375, "step": 660 }, { "completion_length": 103.91953125, "epoch": 0.2937522431738494, "grad_norm": 0.6808644533157349, "kl": 0.456884765625, "learning_rate": 1.7801795061257293e-05, "loss": 0.0183, "reward": 1.121875, "reward_std": 0.21878602355718613, "rewards/accuracy_reward": 0.1484375, "rewards/format_reward": 0.9734375, "step": 665 }, { "completion_length": 90.246875, "epoch": 0.29596090665635955, "grad_norm": 0.4546065330505371, "kl": 0.46005859375, "learning_rate": 1.77533010398805e-05, "loss": 0.0184, "reward": 1.0796875, "reward_std": 0.19561193585395814, "rewards/accuracy_reward": 0.109375, "rewards/format_reward": 0.9703125, "step": 670 }, { "completion_length": 108.37578125, "epoch": 0.29816957013886974, "grad_norm": 0.4939492344856262, "kl": 0.40859375, "learning_rate": 1.7704345521698057e-05, "loss": 0.0163, "reward": 1.0984375, "reward_std": 0.2110065519809723, "rewards/accuracy_reward": 0.13515625, "rewards/format_reward": 0.96328125, "step": 675 }, { "completion_length": 116.196875, "epoch": 0.3003782336213799, "grad_norm": 0.4660269021987915, "kl": 0.397998046875, "learning_rate": 1.765493142067076e-05, "loss": 0.0159, "reward": 1.14140625, "reward_std": 0.23255243562161923, "rewards/accuracy_reward": 0.1765625, "rewards/format_reward": 0.96484375, "step": 680 }, { "completion_length": 104.7328125, "epoch": 0.30258689710389003, "grad_norm": 0.5599631071090698, "kl": 0.39521484375, "learning_rate": 1.7605061678055453e-05, "loss": 0.0158, "reward": 1.11953125, "reward_std": 0.17798166144639255, "rewards/accuracy_reward": 0.1359375, "rewards/format_reward": 0.98359375, "step": 685 }, { "completion_length": 129.27890625, "epoch": 0.30479556058640017, "grad_norm": 0.4298873543739319, "kl": 0.3538818359375, "learning_rate": 1.7554739262229965e-05, "loss": 0.0142, "reward": 1.12265625, "reward_std": 0.25020663160830736, "rewards/accuracy_reward": 0.16328125, "rewards/format_reward": 0.959375, "step": 690 }, { "completion_length": 126.02890625, "epoch": 0.3070042240689103, "grad_norm": 0.4924304485321045, "kl": 0.373681640625, "learning_rate": 1.7503967168516426e-05, "loss": 0.015, "reward": 1.11953125, "reward_std": 0.2316643577069044, "rewards/accuracy_reward": 0.159375, "rewards/format_reward": 0.96015625, "step": 695 }, { "completion_length": 112.05, "epoch": 0.30921288755142046, "grad_norm": 0.5005078315734863, "kl": 0.364013671875, "learning_rate": 1.7452748419002968e-05, "loss": 0.0146, "reward": 1.14296875, "reward_std": 0.20688416287302971, "rewards/accuracy_reward": 0.17109375, "rewards/format_reward": 0.971875, "step": 700 }, { "epoch": 0.30921288755142046, "eval_completion_length": 114.4858334350586, "eval_kl": 0.38703125, "eval_loss": 0.015563694760203362, "eval_reward": 1.1183333349227906, "eval_reward_std": 0.22717599272727967, "eval_rewards/accuracy_reward": 0.15583333373069763, "eval_rewards/format_reward": 0.9625, "eval_runtime": 118.9296, "eval_samples_per_second": 0.832, "eval_steps_per_second": 0.034, "step": 700 }, { "completion_length": 116.096875, "epoch": 0.3114215510339306, "grad_norm": 0.4761113226413727, "kl": 0.3679443359375, "learning_rate": 1.740108606236385e-05, "loss": 0.0147, "reward": 1.1265625, "reward_std": 0.20904745440930128, "rewards/accuracy_reward": 0.16015625, "rewards/format_reward": 0.96640625, "step": 705 }, { "completion_length": 134.46640625, "epoch": 0.31363021451644074, "grad_norm": 0.7244411110877991, "kl": 0.38353271484375, "learning_rate": 1.7348983173677986e-05, "loss": 0.0153, "reward": 1.0765625, "reward_std": 0.23576183728873729, "rewards/accuracy_reward": 0.13515625, "rewards/format_reward": 0.94140625, "step": 710 }, { "completion_length": 99.53828125, "epoch": 0.3158388779989509, "grad_norm": 0.4390712380409241, "kl": 0.4088623046875, "learning_rate": 1.7296442854245915e-05, "loss": 0.0164, "reward": 1.1109375, "reward_std": 0.1941352991387248, "rewards/accuracy_reward": 0.14453125, "rewards/format_reward": 0.96640625, "step": 715 }, { "completion_length": 84.16640625, "epoch": 0.318047541481461, "grad_norm": 0.8809035420417786, "kl": 0.4468994140625, "learning_rate": 1.72434682314052e-05, "loss": 0.0179, "reward": 1.15390625, "reward_std": 0.1831468353047967, "rewards/accuracy_reward": 0.1765625, "rewards/format_reward": 0.97734375, "step": 720 }, { "completion_length": 82.83203125, "epoch": 0.32025620496397117, "grad_norm": 0.7408865690231323, "kl": 0.4578857421875, "learning_rate": 1.719006245834429e-05, "loss": 0.0183, "reward": 1.11328125, "reward_std": 0.16447940673679112, "rewards/accuracy_reward": 0.13828125, "rewards/format_reward": 0.975, "step": 725 }, { "completion_length": 124.11171875, "epoch": 0.3224648684464813, "grad_norm": 0.4459853172302246, "kl": 803.6734375, "learning_rate": 1.7136228713914805e-05, "loss": 32.0277, "reward": 1.0515625, "reward_std": 0.2270077530294657, "rewards/accuracy_reward": 0.10703125, "rewards/format_reward": 0.94453125, "step": 730 }, { "completion_length": 116.0078125, "epoch": 0.32467353192899145, "grad_norm": 0.8283806443214417, "kl": 0.4873046875, "learning_rate": 1.7081970202442363e-05, "loss": 0.0195, "reward": 1.10625, "reward_std": 0.24761096592992543, "rewards/accuracy_reward": 0.15390625, "rewards/format_reward": 0.95234375, "step": 735 }, { "completion_length": 75.5578125, "epoch": 0.3268821954115016, "grad_norm": 0.5588904023170471, "kl": 0.44139404296875, "learning_rate": 1.7027290153535826e-05, "loss": 0.0177, "reward": 1.16015625, "reward_std": 0.16179091222584246, "rewards/accuracy_reward": 0.165625, "rewards/format_reward": 0.99453125, "step": 740 }, { "completion_length": 114.0390625, "epoch": 0.32909085889401174, "grad_norm": 0.43830907344818115, "kl": 0.3618896484375, "learning_rate": 1.6972191821895065e-05, "loss": 0.0145, "reward": 1.10390625, "reward_std": 0.14213568177074193, "rewards/accuracy_reward": 0.11953125, "rewards/format_reward": 0.984375, "step": 745 }, { "completion_length": 154.39921875, "epoch": 0.3312995223765219, "grad_norm": 0.34231725335121155, "kl": 0.31685791015625, "learning_rate": 1.691667848711723e-05, "loss": 0.0127, "reward": 1.10546875, "reward_std": 0.19847314581274986, "rewards/accuracy_reward": 0.128125, "rewards/format_reward": 0.97734375, "step": 750 }, { "completion_length": 178.4203125, "epoch": 0.3335081858590321, "grad_norm": 0.524643063545227, "kl": 3.5463134765625, "learning_rate": 1.686075345350156e-05, "loss": 0.1422, "reward": 1.0859375, "reward_std": 0.2875434797257185, "rewards/accuracy_reward": 0.14609375, "rewards/format_reward": 0.93984375, "step": 755 }, { "completion_length": 147.73203125, "epoch": 0.3357168493415422, "grad_norm": 0.3619137704372406, "kl": 0.3319091796875, "learning_rate": 1.6804420049852676e-05, "loss": 0.0133, "reward": 1.1296875, "reward_std": 0.23654117435216904, "rewards/accuracy_reward": 0.165625, "rewards/format_reward": 0.9640625, "step": 760 }, { "completion_length": 121.4109375, "epoch": 0.33792551282405237, "grad_norm": 0.36858057975769043, "kl": 0.32564697265625, "learning_rate": 1.6747681629282468e-05, "loss": 0.013, "reward": 1.16875, "reward_std": 0.2164825988933444, "rewards/accuracy_reward": 0.18671875, "rewards/format_reward": 0.98203125, "step": 765 }, { "completion_length": 108.2828125, "epoch": 0.3401341763065625, "grad_norm": 0.4734199047088623, "kl": 0.35654296875, "learning_rate": 1.6690541569010474e-05, "loss": 0.0143, "reward": 1.13828125, "reward_std": 0.20721396785229446, "rewards/accuracy_reward": 0.15859375, "rewards/format_reward": 0.9796875, "step": 770 }, { "completion_length": 126.0984375, "epoch": 0.34234283978907265, "grad_norm": 0.47700235247612, "kl": 0.3507568359375, "learning_rate": 1.6633003270162903e-05, "loss": 0.014, "reward": 1.1484375, "reward_std": 0.20858664382249117, "rewards/accuracy_reward": 0.1734375, "rewards/format_reward": 0.975, "step": 775 }, { "completion_length": 166.13125, "epoch": 0.3445515032715828, "grad_norm": 0.39217832684516907, "kl": 0.35064697265625, "learning_rate": 1.6575070157570152e-05, "loss": 0.014, "reward": 1.13515625, "reward_std": 0.2673689084127545, "rewards/accuracy_reward": 0.18359375, "rewards/format_reward": 0.9515625, "step": 780 }, { "completion_length": 169.01328125, "epoch": 0.34676016675409294, "grad_norm": 0.39597201347351074, "kl": 0.34754638671875, "learning_rate": 1.6516745679562977e-05, "loss": 0.0139, "reward": 1.065625, "reward_std": 0.2720937805250287, "rewards/accuracy_reward": 0.1328125, "rewards/format_reward": 0.9328125, "step": 785 }, { "completion_length": 182.8359375, "epoch": 0.3489688302366031, "grad_norm": 0.47019919753074646, "kl": 0.34156494140625, "learning_rate": 1.6458033307767217e-05, "loss": 0.0137, "reward": 1.08046875, "reward_std": 0.3094723552465439, "rewards/accuracy_reward": 0.16640625, "rewards/format_reward": 0.9140625, "step": 790 }, { "completion_length": 129.996875, "epoch": 0.3511774937191132, "grad_norm": 1.1053619384765625, "kl": 0.59439697265625, "learning_rate": 1.6398936536897182e-05, "loss": 0.0238, "reward": 1.09609375, "reward_std": 0.27469405010342596, "rewards/accuracy_reward": 0.1703125, "rewards/format_reward": 0.92578125, "step": 795 }, { "completion_length": 56.90234375, "epoch": 0.35338615720162336, "grad_norm": 0.5199185609817505, "kl": 0.6048828125, "learning_rate": 1.6339458884547613e-05, "loss": 0.0242, "reward": 1.14140625, "reward_std": 0.15660744477063418, "rewards/accuracy_reward": 0.15390625, "rewards/format_reward": 0.9875, "step": 800 }, { "epoch": 0.35338615720162336, "eval_completion_length": 43.742083358764646, "eval_kl": 0.65765625, "eval_loss": 0.026582278311252594, "eval_reward": 1.1675, "eval_reward_std": 0.16831182479858398, "eval_rewards/accuracy_reward": 0.1725, "eval_rewards/format_reward": 0.995, "eval_runtime": 49.7219, "eval_samples_per_second": 1.991, "eval_steps_per_second": 0.08, "step": 800 }, { "completion_length": 64.53046875, "epoch": 0.3555948206841335, "grad_norm": 0.4935765266418457, "kl": 0.5750732421875, "learning_rate": 1.6279603890984315e-05, "loss": 0.023, "reward": 1.14765625, "reward_std": 0.20158261395990848, "rewards/accuracy_reward": 0.16015625, "rewards/format_reward": 0.9875, "step": 805 }, { "completion_length": 93.0140625, "epoch": 0.35780348416664365, "grad_norm": 0.5734583139419556, "kl": 0.4998291015625, "learning_rate": 1.6219375118933442e-05, "loss": 0.02, "reward": 1.14765625, "reward_std": 0.2238040953874588, "rewards/accuracy_reward": 0.17890625, "rewards/format_reward": 0.96875, "step": 810 }, { "completion_length": 134.42578125, "epoch": 0.3600121476491538, "grad_norm": 0.49560049176216125, "kl": 0.4668212890625, "learning_rate": 1.6158776153369406e-05, "loss": 0.0187, "reward": 1.0765625, "reward_std": 0.29757872987538575, "rewards/accuracy_reward": 0.14453125, "rewards/format_reward": 0.93203125, "step": 815 }, { "completion_length": 158.89296875, "epoch": 0.36222081113166393, "grad_norm": 0.5122950077056885, "kl": 0.46318359375, "learning_rate": 1.609781060130152e-05, "loss": 0.0185, "reward": 1.01171875, "reward_std": 0.29846451599150897, "rewards/accuracy_reward": 0.1046875, "rewards/format_reward": 0.90703125, "step": 820 }, { "completion_length": 116.4859375, "epoch": 0.3644294746141741, "grad_norm": 0.5021364092826843, "kl": 0.4975830078125, "learning_rate": 1.6036482091559287e-05, "loss": 0.0199, "reward": 1.12109375, "reward_std": 0.2664882358163595, "rewards/accuracy_reward": 0.165625, "rewards/format_reward": 0.95546875, "step": 825 }, { "completion_length": 131.8703125, "epoch": 0.3666381380966842, "grad_norm": 0.6027432680130005, "kl": 0.4765380859375, "learning_rate": 1.5974794274576394e-05, "loss": 0.0191, "reward": 1.12578125, "reward_std": 0.28144511561840774, "rewards/accuracy_reward": 0.17265625, "rewards/format_reward": 0.953125, "step": 830 }, { "completion_length": 134.915625, "epoch": 0.3688468015791944, "grad_norm": 0.49983125925064087, "kl": 0.4760986328125, "learning_rate": 1.5912750822173446e-05, "loss": 0.019, "reward": 1.134375, "reward_std": 0.29985770154744384, "rewards/accuracy_reward": 0.20078125, "rewards/format_reward": 0.93359375, "step": 835 }, { "completion_length": 91.075, "epoch": 0.37105546506170456, "grad_norm": 0.4414427876472473, "kl": 0.6185791015625, "learning_rate": 1.5850355427339398e-05, "loss": 0.0247, "reward": 1.1, "reward_std": 0.2404505180194974, "rewards/accuracy_reward": 0.14375, "rewards/format_reward": 0.95625, "step": 840 }, { "completion_length": 61.1109375, "epoch": 0.3732641285442147, "grad_norm": 0.46862563490867615, "kl": 0.6706787109375, "learning_rate": 1.5787611804011735e-05, "loss": 0.0268, "reward": 1.16875, "reward_std": 0.17206176780164242, "rewards/accuracy_reward": 0.18828125, "rewards/format_reward": 0.98046875, "step": 845 }, { "completion_length": 78.2640625, "epoch": 0.37547279202672484, "grad_norm": 0.7195978164672852, "kl": 0.6672119140625, "learning_rate": 1.5724523686855423e-05, "loss": 0.0267, "reward": 1.13203125, "reward_std": 0.1878314608708024, "rewards/accuracy_reward": 0.16328125, "rewards/format_reward": 0.96875, "step": 850 }, { "completion_length": 103.509375, "epoch": 0.377681455509235, "grad_norm": 0.5233339071273804, "kl": 0.6544677734375, "learning_rate": 1.56610948310406e-05, "loss": 0.0262, "reward": 1.13359375, "reward_std": 0.24245705269277096, "rewards/accuracy_reward": 0.1734375, "rewards/format_reward": 0.96015625, "step": 855 }, { "completion_length": 97.459375, "epoch": 0.37989011899174513, "grad_norm": 0.4953531324863434, "kl": 0.575927734375, "learning_rate": 1.5597329012019065e-05, "loss": 0.023, "reward": 1.15859375, "reward_std": 0.2319757068529725, "rewards/accuracy_reward": 0.19609375, "rewards/format_reward": 0.9625, "step": 860 }, { "completion_length": 126.03515625, "epoch": 0.38209878247425527, "grad_norm": 1.7415945529937744, "kl": 0.5650390625, "learning_rate": 1.5533230025299547e-05, "loss": 0.0226, "reward": 1.0765625, "reward_std": 0.25536851994693277, "rewards/accuracy_reward": 0.13828125, "rewards/format_reward": 0.93828125, "step": 865 }, { "completion_length": 92.45625, "epoch": 0.3843074459567654, "grad_norm": 0.5169078707695007, "kl": 0.5216796875, "learning_rate": 1.5468801686221793e-05, "loss": 0.0209, "reward": 1.14765625, "reward_std": 0.21833606492727994, "rewards/accuracy_reward": 0.18203125, "rewards/format_reward": 0.965625, "step": 870 }, { "completion_length": 111.06875, "epoch": 0.38651610943927556, "grad_norm": 1.1891313791275024, "kl": 0.5312255859375, "learning_rate": 1.540404782972946e-05, "loss": 0.0213, "reward": 1.08828125, "reward_std": 0.2383767468854785, "rewards/accuracy_reward": 0.1390625, "rewards/format_reward": 0.94921875, "step": 875 }, { "completion_length": 96.92109375, "epoch": 0.3887247729217857, "grad_norm": 0.49358442425727844, "kl": 0.54716796875, "learning_rate": 1.5338972310141863e-05, "loss": 0.0219, "reward": 1.14296875, "reward_std": 0.2417955880984664, "rewards/accuracy_reward": 0.18359375, "rewards/format_reward": 0.959375, "step": 880 }, { "completion_length": 103.10234375, "epoch": 0.39093343640429584, "grad_norm": 0.45550188422203064, "kl": 0.5120849609375, "learning_rate": 1.5273579000924545e-05, "loss": 0.0205, "reward": 1.12265625, "reward_std": 0.230152091011405, "rewards/accuracy_reward": 0.1609375, "rewards/format_reward": 0.96171875, "step": 885 }, { "completion_length": 79.07578125, "epoch": 0.393142099886806, "grad_norm": 0.4740158021450043, "kl": 0.751025390625, "learning_rate": 1.5207871794458715e-05, "loss": 0.03, "reward": 1.17578125, "reward_std": 0.19955000430345535, "rewards/accuracy_reward": 0.19375, "rewards/format_reward": 0.98203125, "step": 890 }, { "completion_length": 105.76171875, "epoch": 0.3953507633693161, "grad_norm": 0.37959805130958557, "kl": 0.492724609375, "learning_rate": 1.5141854601809583e-05, "loss": 0.0197, "reward": 1.15, "reward_std": 0.21266062185168266, "rewards/accuracy_reward": 0.18125, "rewards/format_reward": 0.96875, "step": 895 }, { "completion_length": 106.47109375, "epoch": 0.39755942685182627, "grad_norm": 0.4141887128353119, "kl": 0.47958984375, "learning_rate": 1.5075531352493528e-05, "loss": 0.0192, "reward": 1.125, "reward_std": 0.23161781765520573, "rewards/accuracy_reward": 0.1609375, "rewards/format_reward": 0.9640625, "step": 900 }, { "epoch": 0.39755942685182627, "eval_completion_length": 128.19291748046874, "eval_kl": 0.500859375, "eval_loss": 0.02018117904663086, "eval_reward": 1.1125, "eval_reward_std": 0.26222177892923354, "eval_rewards/accuracy_reward": 0.1604166667163372, "eval_rewards/format_reward": 0.9520833349227905, "eval_runtime": 156.2795, "eval_samples_per_second": 0.633, "eval_steps_per_second": 0.026, "step": 900 }, { "completion_length": 115.94921875, "epoch": 0.3997680903343364, "grad_norm": 0.4243628680706024, "kl": 0.49537353515625, "learning_rate": 1.5008905994244255e-05, "loss": 0.0198, "reward": 1.10234375, "reward_std": 0.225167977437377, "rewards/accuracy_reward": 0.15078125, "rewards/format_reward": 0.9515625, "step": 905 }, { "completion_length": 72.07578125, "epoch": 0.40197675381684655, "grad_norm": 0.4959592819213867, "kl": 0.5396484375, "learning_rate": 1.4941982492777749e-05, "loss": 0.0216, "reward": 1.1703125, "reward_std": 0.1914088014513254, "rewards/accuracy_reward": 0.18984375, "rewards/format_reward": 0.98046875, "step": 910 }, { "completion_length": 84.55390625, "epoch": 0.40418541729935675, "grad_norm": 0.6311984062194824, "kl": 0.56240234375, "learning_rate": 1.4874764831556285e-05, "loss": 0.0225, "reward": 1.15625, "reward_std": 0.20792635306715965, "rewards/accuracy_reward": 0.1890625, "rewards/format_reward": 0.9671875, "step": 915 }, { "completion_length": 115.203125, "epoch": 0.4063940807818669, "grad_norm": 0.5944263935089111, "kl": 0.4927734375, "learning_rate": 1.4807257011551297e-05, "loss": 0.0197, "reward": 1.15703125, "reward_std": 0.26277261301875116, "rewards/accuracy_reward": 0.21015625, "rewards/format_reward": 0.946875, "step": 920 }, { "completion_length": 111.07578125, "epoch": 0.40860274426437704, "grad_norm": 0.7743093967437744, "kl": 0.488720703125, "learning_rate": 1.4739463051005221e-05, "loss": 0.0196, "reward": 1.103125, "reward_std": 0.2299284663051367, "rewards/accuracy_reward": 0.14453125, "rewards/format_reward": 0.95859375, "step": 925 }, { "completion_length": 95.52109375, "epoch": 0.4108114077468872, "grad_norm": 0.7759421467781067, "kl": 0.5315185546875, "learning_rate": 1.4671386985192327e-05, "loss": 0.0213, "reward": 1.1671875, "reward_std": 0.1855922631919384, "rewards/accuracy_reward": 0.17890625, "rewards/format_reward": 0.98828125, "step": 930 }, { "completion_length": 108.65546875, "epoch": 0.4130200712293973, "grad_norm": 0.4987127482891083, "kl": 0.45135498046875, "learning_rate": 1.460303286617854e-05, "loss": 0.0181, "reward": 1.1625, "reward_std": 0.17659219540655613, "rewards/accuracy_reward": 0.16640625, "rewards/format_reward": 0.99609375, "step": 935 }, { "completion_length": 144.80078125, "epoch": 0.41522873471190747, "grad_norm": 0.5061682462692261, "kl": 5.969921875, "learning_rate": 1.4534404762580239e-05, "loss": 0.2394, "reward": 1.16484375, "reward_std": 0.21013734135776757, "rewards/accuracy_reward": 0.17890625, "rewards/format_reward": 0.9859375, "step": 940 }, { "completion_length": 113.9171875, "epoch": 0.4174373981944176, "grad_norm": 0.35856854915618896, "kl": 0.4087646484375, "learning_rate": 1.4465506759322074e-05, "loss": 0.0164, "reward": 1.196875, "reward_std": 0.16952291671186687, "rewards/accuracy_reward": 0.2015625, "rewards/format_reward": 0.9953125, "step": 945 }, { "completion_length": 112.64765625, "epoch": 0.41964606167692775, "grad_norm": 0.3706095218658447, "kl": 0.4105224609375, "learning_rate": 1.4396342957393844e-05, "loss": 0.0164, "reward": 1.17265625, "reward_std": 0.1865989552810788, "rewards/accuracy_reward": 0.18203125, "rewards/format_reward": 0.990625, "step": 950 }, { "completion_length": 137.5375, "epoch": 0.4218547251594379, "grad_norm": 1.2025071382522583, "kl": 0.38111572265625, "learning_rate": 1.4326917473606368e-05, "loss": 0.0152, "reward": 1.09921875, "reward_std": 0.18654303345829248, "rewards/accuracy_reward": 0.121875, "rewards/format_reward": 0.97734375, "step": 955 }, { "completion_length": 160.10234375, "epoch": 0.42406338864194804, "grad_norm": 3.2986419200897217, "kl": 0.59927978515625, "learning_rate": 1.4257234440346469e-05, "loss": 0.024, "reward": 1.1609375, "reward_std": 0.23215112816542388, "rewards/accuracy_reward": 0.18828125, "rewards/format_reward": 0.97265625, "step": 960 }, { "completion_length": 167.0, "epoch": 0.4262720521244582, "grad_norm": 0.610000729560852, "kl": 0.558154296875, "learning_rate": 1.4187298005330976e-05, "loss": 0.0223, "reward": 1.1421875, "reward_std": 0.31437007896602154, "rewards/accuracy_reward": 0.2078125, "rewards/format_reward": 0.934375, "step": 965 }, { "completion_length": 104.16484375, "epoch": 0.4284807156069683, "grad_norm": 8.364782333374023, "kl": 2.54658203125, "learning_rate": 1.4117112331359865e-05, "loss": 0.1018, "reward": 1.115625, "reward_std": 0.22522333543747663, "rewards/accuracy_reward": 0.1671875, "rewards/format_reward": 0.9484375, "step": 970 }, { "completion_length": 63.9890625, "epoch": 0.43068937908947846, "grad_norm": 0.708032488822937, "kl": 1.1080322265625, "learning_rate": 1.4046681596068468e-05, "loss": 0.0444, "reward": 1.171875, "reward_std": 0.16694873906672, "rewards/accuracy_reward": 0.19609375, "rewards/format_reward": 0.97578125, "step": 975 }, { "completion_length": 60.2453125, "epoch": 0.4328980425719886, "grad_norm": 1.7320284843444824, "kl": 1.096923828125, "learning_rate": 1.3976009991678803e-05, "loss": 0.0439, "reward": 1.1546875, "reward_std": 0.17392283789813517, "rewards/accuracy_reward": 0.17578125, "rewards/format_reward": 0.97890625, "step": 980 }, { "completion_length": 68.43359375, "epoch": 0.43510670605449875, "grad_norm": 0.7800001502037048, "kl": 1.552880859375, "learning_rate": 1.390510172475005e-05, "loss": 0.0621, "reward": 1.16875, "reward_std": 0.20654550790786744, "rewards/accuracy_reward": 0.1921875, "rewards/format_reward": 0.9765625, "step": 985 }, { "completion_length": 57.4203125, "epoch": 0.4373153695370089, "grad_norm": 0.8300907015800476, "kl": 0.723046875, "learning_rate": 1.383396101592817e-05, "loss": 0.0289, "reward": 1.13984375, "reward_std": 0.17493642698973416, "rewards/accuracy_reward": 0.1546875, "rewards/format_reward": 0.98515625, "step": 990 }, { "completion_length": 73.23671875, "epoch": 0.4395240330195191, "grad_norm": 2.696807384490967, "kl": 1.3876220703125, "learning_rate": 1.3762592099694666e-05, "loss": 0.0555, "reward": 1.1109375, "reward_std": 0.2104167841374874, "rewards/accuracy_reward": 0.14140625, "rewards/format_reward": 0.96953125, "step": 995 }, { "completion_length": 132.46328125, "epoch": 0.44173269650202923, "grad_norm": 3.0761334896087646, "kl": 3.6865478515625, "learning_rate": 1.3690999224114547e-05, "loss": 0.1477, "reward": 1.0703125, "reward_std": 0.2970853915438056, "rewards/accuracy_reward": 0.1515625, "rewards/format_reward": 0.91875, "step": 1000 }, { "epoch": 0.44173269650202923, "eval_completion_length": 104.09083335876466, "eval_kl": 1.039375, "eval_loss": 0.03946812078356743, "eval_reward": 1.1433333349227905, "eval_reward_std": 0.2386816355586052, "eval_rewards/accuracy_reward": 0.1854166667163372, "eval_rewards/format_reward": 0.9579166674613953, "eval_runtime": 111.3439, "eval_samples_per_second": 0.889, "eval_steps_per_second": 0.036, "step": 1000 }, { "completion_length": 103.2375, "epoch": 0.4439413599845394, "grad_norm": 1.7991042137145996, "kl": 1.09813232421875, "learning_rate": 1.361918665058348e-05, "loss": 0.0439, "reward": 1.13125, "reward_std": 0.21907511353492737, "rewards/accuracy_reward": 0.16015625, "rewards/format_reward": 0.97109375, "step": 1005 }, { "completion_length": 106.32421875, "epoch": 0.4461500234670495, "grad_norm": 1.2199746370315552, "kl": 0.9680908203125, "learning_rate": 1.354715865357411e-05, "loss": 0.0388, "reward": 1.15390625, "reward_std": 0.2513147694990039, "rewards/accuracy_reward": 0.1828125, "rewards/format_reward": 0.97109375, "step": 1010 }, { "completion_length": 84.5375, "epoch": 0.44835868694955966, "grad_norm": 0.8461725115776062, "kl": 0.64466552734375, "learning_rate": 1.3474919520381673e-05, "loss": 0.0258, "reward": 1.1921875, "reward_std": 0.20436475947499275, "rewards/accuracy_reward": 0.20234375, "rewards/format_reward": 0.98984375, "step": 1015 }, { "completion_length": 95.0921875, "epoch": 0.4505673504320698, "grad_norm": 0.5356422066688538, "kl": 0.674560546875, "learning_rate": 1.3402473550868769e-05, "loss": 0.027, "reward": 1.178125, "reward_std": 0.23662711773067713, "rewards/accuracy_reward": 0.2046875, "rewards/format_reward": 0.9734375, "step": 1020 }, { "completion_length": 126.2546875, "epoch": 0.45277601391457994, "grad_norm": 0.44370850920677185, "kl": 0.50389404296875, "learning_rate": 1.3329825057209446e-05, "loss": 0.0202, "reward": 1.1453125, "reward_std": 0.2518596975132823, "rewards/accuracy_reward": 0.20078125, "rewards/format_reward": 0.94453125, "step": 1025 }, { "completion_length": 100.384375, "epoch": 0.4549846773970901, "grad_norm": 0.30460554361343384, "kl": 0.5373046875, "learning_rate": 1.3256978363632515e-05, "loss": 0.0215, "reward": 1.18984375, "reward_std": 0.21483363024890423, "rewards/accuracy_reward": 0.228125, "rewards/format_reward": 0.96171875, "step": 1030 }, { "completion_length": 106.22578125, "epoch": 0.45719334087960023, "grad_norm": 0.5371220707893372, "kl": 0.5292236328125, "learning_rate": 1.3183937806164174e-05, "loss": 0.0212, "reward": 1.18671875, "reward_std": 0.264132690615952, "rewards/accuracy_reward": 0.23125, "rewards/format_reward": 0.95546875, "step": 1035 }, { "completion_length": 90.2515625, "epoch": 0.45940200436211037, "grad_norm": 0.6759688854217529, "kl": 0.541845703125, "learning_rate": 1.3110707732369896e-05, "loss": 0.0217, "reward": 1.1765625, "reward_std": 0.22657935097813606, "rewards/accuracy_reward": 0.2109375, "rewards/format_reward": 0.965625, "step": 1040 }, { "completion_length": 129.69609375, "epoch": 0.4616106678446205, "grad_norm": 0.425448477268219, "kl": 0.549853515625, "learning_rate": 1.3037292501095674e-05, "loss": 0.022, "reward": 1.128125, "reward_std": 0.24908192362636328, "rewards/accuracy_reward": 0.1953125, "rewards/format_reward": 0.9328125, "step": 1045 }, { "completion_length": 100.40859375, "epoch": 0.46381933132713066, "grad_norm": 0.36143267154693604, "kl": 0.5515869140625, "learning_rate": 1.2963696482208552e-05, "loss": 0.0221, "reward": 1.16484375, "reward_std": 0.22955130971968174, "rewards/accuracy_reward": 0.1984375, "rewards/format_reward": 0.96640625, "step": 1050 }, { "completion_length": 86.01484375, "epoch": 0.4660279948096408, "grad_norm": 0.3947383463382721, "kl": 0.59249267578125, "learning_rate": 1.2889924056336531e-05, "loss": 0.0237, "reward": 1.15703125, "reward_std": 0.17950487434864043, "rewards/accuracy_reward": 0.1875, "rewards/format_reward": 0.96953125, "step": 1055 }, { "completion_length": 82.04375, "epoch": 0.46823665829215094, "grad_norm": 0.6859544515609741, "kl": 0.5910888671875, "learning_rate": 1.2815979614607818e-05, "loss": 0.0236, "reward": 1.1953125, "reward_std": 0.20693482737988234, "rewards/accuracy_reward": 0.21953125, "rewards/format_reward": 0.97578125, "step": 1060 }, { "completion_length": 95.7609375, "epoch": 0.4704453217746611, "grad_norm": 0.36446619033813477, "kl": 0.541748046875, "learning_rate": 1.274186755838945e-05, "loss": 0.0217, "reward": 1.19375, "reward_std": 0.2092124553397298, "rewards/accuracy_reward": 0.2203125, "rewards/format_reward": 0.9734375, "step": 1065 }, { "completion_length": 139.6734375, "epoch": 0.4726539852571712, "grad_norm": 0.5893868803977966, "kl": 0.471240234375, "learning_rate": 1.2667592299025331e-05, "loss": 0.0188, "reward": 1.1484375, "reward_std": 0.23675706721842288, "rewards/accuracy_reward": 0.19453125, "rewards/format_reward": 0.95390625, "step": 1070 }, { "completion_length": 116.73828125, "epoch": 0.4748626487396814, "grad_norm": 0.4791277348995209, "kl": 0.50758056640625, "learning_rate": 1.259315825757362e-05, "loss": 0.0203, "reward": 1.11796875, "reward_std": 0.20144069343805313, "rewards/accuracy_reward": 0.15, "rewards/format_reward": 0.96796875, "step": 1075 }, { "completion_length": 115.9921875, "epoch": 0.47707131222219157, "grad_norm": 0.5381553173065186, "kl": 0.49412841796875, "learning_rate": 1.251856986454363e-05, "loss": 0.0198, "reward": 1.19765625, "reward_std": 0.24683814216405153, "rewards/accuracy_reward": 0.23359375, "rewards/format_reward": 0.9640625, "step": 1080 }, { "completion_length": 122.90390625, "epoch": 0.4792799757047017, "grad_norm": 0.5010456442832947, "kl": 0.4677001953125, "learning_rate": 1.2443831559632065e-05, "loss": 0.0187, "reward": 1.15546875, "reward_std": 0.2554084826260805, "rewards/accuracy_reward": 0.1984375, "rewards/format_reward": 0.95703125, "step": 1085 }, { "completion_length": 151.96953125, "epoch": 0.48148863918721185, "grad_norm": 0.6068748235702515, "kl": 0.54783935546875, "learning_rate": 1.2368947791458785e-05, "loss": 0.0219, "reward": 1.14296875, "reward_std": 0.31096592992544175, "rewards/accuracy_reward": 0.21015625, "rewards/format_reward": 0.9328125, "step": 1090 }, { "completion_length": 125.5328125, "epoch": 0.483697302669722, "grad_norm": 0.37912383675575256, "kl": 0.45006103515625, "learning_rate": 1.2293923017302004e-05, "loss": 0.018, "reward": 1.1578125, "reward_std": 0.2355531807988882, "rewards/accuracy_reward": 0.20859375, "rewards/format_reward": 0.94921875, "step": 1095 }, { "completion_length": 93.94375, "epoch": 0.48590596615223214, "grad_norm": 0.45108431577682495, "kl": 0.535595703125, "learning_rate": 1.221876170283298e-05, "loss": 0.0214, "reward": 1.16640625, "reward_std": 0.1948365481570363, "rewards/accuracy_reward": 0.19609375, "rewards/format_reward": 0.9703125, "step": 1100 }, { "epoch": 0.48590596615223214, "eval_completion_length": 87.3554167175293, "eval_kl": 0.49109375, "eval_loss": 0.019712308421730995, "eval_reward": 1.206666669845581, "eval_reward_std": 0.194391932785511, "eval_rewards/accuracy_reward": 0.2316666667163372, "eval_rewards/format_reward": 0.975, "eval_runtime": 107.5828, "eval_samples_per_second": 0.92, "eval_steps_per_second": 0.037, "step": 1100 }, { "completion_length": 93.66171875, "epoch": 0.4881146296347423, "grad_norm": 0.3893759548664093, "kl": 0.50633544921875, "learning_rate": 1.214346832185021e-05, "loss": 0.0203, "reward": 1.18046875, "reward_std": 0.19928432293236256, "rewards/accuracy_reward": 0.20625, "rewards/format_reward": 0.97421875, "step": 1105 }, { "completion_length": 113.6171875, "epoch": 0.4903232931172524, "grad_norm": 0.5400230288505554, "kl": 0.63984375, "learning_rate": 1.2068047356013136e-05, "loss": 0.0256, "reward": 1.13359375, "reward_std": 0.24417215697467326, "rewards/accuracy_reward": 0.18203125, "rewards/format_reward": 0.9515625, "step": 1110 }, { "completion_length": 112.42421875, "epoch": 0.49253195659976257, "grad_norm": 0.4314129054546356, "kl": 0.5706787109375, "learning_rate": 1.1992503294575385e-05, "loss": 0.0228, "reward": 1.16171875, "reward_std": 0.21959545239806175, "rewards/accuracy_reward": 0.203125, "rewards/format_reward": 0.95859375, "step": 1115 }, { "completion_length": 116.0015625, "epoch": 0.4947406200822727, "grad_norm": 1.4012691974639893, "kl": 0.51864013671875, "learning_rate": 1.1916840634117555e-05, "loss": 0.0207, "reward": 1.1296875, "reward_std": 0.2404359621927142, "rewards/accuracy_reward": 0.17421875, "rewards/format_reward": 0.95546875, "step": 1120 }, { "completion_length": 119.28984375, "epoch": 0.49694928356478285, "grad_norm": 0.530860960483551, "kl": 0.5022705078125, "learning_rate": 1.1841063878279572e-05, "loss": 0.0201, "reward": 1.18203125, "reward_std": 0.25134353432804346, "rewards/accuracy_reward": 0.2265625, "rewards/format_reward": 0.95546875, "step": 1125 }, { "completion_length": 108.6421875, "epoch": 0.499157947047293, "grad_norm": 0.4978342056274414, "kl": 0.541162109375, "learning_rate": 1.1765177537492616e-05, "loss": 0.0216, "reward": 1.18359375, "reward_std": 0.24454718120396138, "rewards/accuracy_reward": 0.225, "rewards/format_reward": 0.95859375, "step": 1130 }, { "completion_length": 85.57890625, "epoch": 0.5013666105298031, "grad_norm": 0.4762633144855499, "kl": 0.542431640625, "learning_rate": 1.1689186128710654e-05, "loss": 0.0217, "reward": 1.19609375, "reward_std": 0.1994084009900689, "rewards/accuracy_reward": 0.21796875, "rewards/format_reward": 0.978125, "step": 1135 }, { "completion_length": 98.32109375, "epoch": 0.5035752740123133, "grad_norm": 0.4662761092185974, "kl": 0.5237060546875, "learning_rate": 1.1613094175141568e-05, "loss": 0.0209, "reward": 1.17734375, "reward_std": 0.1960198676213622, "rewards/accuracy_reward": 0.2, "rewards/format_reward": 0.97734375, "step": 1140 }, { "completion_length": 129.81640625, "epoch": 0.5057839374948234, "grad_norm": 0.6282123923301697, "kl": 0.52607421875, "learning_rate": 1.1536906205977936e-05, "loss": 0.021, "reward": 1.2, "reward_std": 0.24621726330369711, "rewards/accuracy_reward": 0.2453125, "rewards/format_reward": 0.9546875, "step": 1145 }, { "completion_length": 144.84921875, "epoch": 0.5079926009773336, "grad_norm": 0.8076626658439636, "kl": 0.6271484375, "learning_rate": 1.1460626756127431e-05, "loss": 0.0251, "reward": 1.2, "reward_std": 0.2839036539196968, "rewards/accuracy_reward": 0.25546875, "rewards/format_reward": 0.94453125, "step": 1150 }, { "completion_length": 133.01328125, "epoch": 0.5102012644598437, "grad_norm": 0.7102019190788269, "kl": 0.682958984375, "learning_rate": 1.1384260365942905e-05, "loss": 0.0273, "reward": 1.14140625, "reward_std": 0.22996564749628307, "rewards/accuracy_reward": 0.184375, "rewards/format_reward": 0.95703125, "step": 1155 }, { "completion_length": 118.7953125, "epoch": 0.5124099279423538, "grad_norm": 0.6336015462875366, "kl": 0.6332275390625, "learning_rate": 1.1307811580952113e-05, "loss": 0.0253, "reward": 1.17734375, "reward_std": 0.2275516463443637, "rewards/accuracy_reward": 0.2140625, "rewards/format_reward": 0.96328125, "step": 1160 }, { "completion_length": 125.0515625, "epoch": 0.514618591424864, "grad_norm": 0.5651206374168396, "kl": 0.5558349609375, "learning_rate": 1.123128495158718e-05, "loss": 0.0222, "reward": 1.2, "reward_std": 0.22535169757902623, "rewards/accuracy_reward": 0.234375, "rewards/format_reward": 0.965625, "step": 1165 }, { "completion_length": 150.834375, "epoch": 0.5168272549073741, "grad_norm": 0.5256941914558411, "kl": 0.49346923828125, "learning_rate": 1.1154685032913719e-05, "loss": 0.0197, "reward": 1.13046875, "reward_std": 0.22142891082912683, "rewards/accuracy_reward": 0.17109375, "rewards/format_reward": 0.959375, "step": 1170 }, { "completion_length": 147.46875, "epoch": 0.5190359183898843, "grad_norm": 1.1057101488113403, "kl": 0.4740234375, "learning_rate": 1.1078016384359725e-05, "loss": 0.019, "reward": 1.17734375, "reward_std": 0.2350118851289153, "rewards/accuracy_reward": 0.21484375, "rewards/format_reward": 0.9625, "step": 1175 }, { "completion_length": 165.5859375, "epoch": 0.5212445818723944, "grad_norm": 0.5781757235527039, "kl": 0.5138427734375, "learning_rate": 1.100128356944417e-05, "loss": 0.0206, "reward": 1.15703125, "reward_std": 0.3072576764971018, "rewards/accuracy_reward": 0.23515625, "rewards/format_reward": 0.921875, "step": 1180 }, { "completion_length": 133.359375, "epoch": 0.5234532453549046, "grad_norm": 0.5776464343070984, "kl": 0.7113037109375, "learning_rate": 1.0924491155505375e-05, "loss": 0.0285, "reward": 1.121875, "reward_std": 0.3050299068912864, "rewards/accuracy_reward": 0.19765625, "rewards/format_reward": 0.92421875, "step": 1185 }, { "completion_length": 119.27421875, "epoch": 0.5256619088374147, "grad_norm": 0.7527931928634644, "kl": 0.5626953125, "learning_rate": 1.0847643713429155e-05, "loss": 0.0225, "reward": 1.16015625, "reward_std": 0.2789567396044731, "rewards/accuracy_reward": 0.2171875, "rewards/format_reward": 0.94296875, "step": 1190 }, { "completion_length": 75.74609375, "epoch": 0.527870572319925, "grad_norm": 0.5353002548217773, "kl": 0.5966064453125, "learning_rate": 1.0770745817376741e-05, "loss": 0.0239, "reward": 1.171875, "reward_std": 0.1676252031698823, "rewards/accuracy_reward": 0.19140625, "rewards/format_reward": 0.98046875, "step": 1195 }, { "completion_length": 74.70078125, "epoch": 0.5300792358024351, "grad_norm": 0.7278808951377869, "kl": 0.5812255859375, "learning_rate": 1.0693802044512525e-05, "loss": 0.0233, "reward": 1.17734375, "reward_std": 0.16255829595029353, "rewards/accuracy_reward": 0.1890625, "rewards/format_reward": 0.98828125, "step": 1200 }, { "epoch": 0.5300792358024351, "eval_completion_length": 84.3741668701172, "eval_kl": 0.55484375, "eval_loss": 0.022250505164265633, "eval_reward": 1.195, "eval_reward_std": 0.15836685180664062, "eval_rewards/accuracy_reward": 0.2025, "eval_rewards/format_reward": 0.9925, "eval_runtime": 102.7549, "eval_samples_per_second": 0.963, "eval_steps_per_second": 0.039, "step": 1200 }, { "completion_length": 89.65546875, "epoch": 0.5322878992849452, "grad_norm": 0.8118963241577148, "kl": 0.56036376953125, "learning_rate": 1.061681697473159e-05, "loss": 0.0224, "reward": 1.23828125, "reward_std": 0.16189471799880267, "rewards/accuracy_reward": 0.24609375, "rewards/format_reward": 0.9921875, "step": 1205 }, { "completion_length": 171.05078125, "epoch": 0.5344965627674554, "grad_norm": 1.0046565532684326, "kl": 1.409912109375, "learning_rate": 1.0539795190387141e-05, "loss": 0.0564, "reward": 1.14140625, "reward_std": 0.23411216996610165, "rewards/accuracy_reward": 0.17578125, "rewards/format_reward": 0.965625, "step": 1210 }, { "completion_length": 244.696875, "epoch": 0.5367052262499655, "grad_norm": 1.4813671112060547, "kl": 1.9112060546875, "learning_rate": 1.0462741276017711e-05, "loss": 0.0765, "reward": 1.12265625, "reward_std": 0.312072067707777, "rewards/accuracy_reward": 0.203125, "rewards/format_reward": 0.91953125, "step": 1215 }, { "completion_length": 137.07109375, "epoch": 0.5389138897324757, "grad_norm": 0.44882670044898987, "kl": 1.1479248046875, "learning_rate": 1.038565981807431e-05, "loss": 0.0459, "reward": 1.165625, "reward_std": 0.24441679026931523, "rewards/accuracy_reward": 0.2, "rewards/format_reward": 0.965625, "step": 1220 }, { "completion_length": 112.965625, "epoch": 0.5411225532149858, "grad_norm": 1.2693246603012085, "kl": 0.73072509765625, "learning_rate": 1.0308555404647407e-05, "loss": 0.0292, "reward": 1.190625, "reward_std": 0.2190632749348879, "rewards/accuracy_reward": 0.21875, "rewards/format_reward": 0.971875, "step": 1225 }, { "completion_length": 109.203125, "epoch": 0.543331216697496, "grad_norm": 0.6727134585380554, "kl": 0.66895751953125, "learning_rate": 1.0231432625193842e-05, "loss": 0.0267, "reward": 1.23359375, "reward_std": 0.2255825974047184, "rewards/accuracy_reward": 0.25546875, "rewards/format_reward": 0.978125, "step": 1230 }, { "completion_length": 114.903125, "epoch": 0.5455398801800061, "grad_norm": 0.5706783533096313, "kl": 0.6170654296875, "learning_rate": 1.0154296070263649e-05, "loss": 0.0247, "reward": 1.1875, "reward_std": 0.2010749163106084, "rewards/accuracy_reward": 0.2125, "rewards/format_reward": 0.975, "step": 1235 }, { "completion_length": 121.5609375, "epoch": 0.5477485436625162, "grad_norm": 0.6042254567146301, "kl": 0.58348388671875, "learning_rate": 1.0077150331226822e-05, "loss": 0.0233, "reward": 1.25546875, "reward_std": 0.22948720771819353, "rewards/accuracy_reward": 0.2875, "rewards/format_reward": 0.96796875, "step": 1240 }, { "completion_length": 130.03984375, "epoch": 0.5499572071450264, "grad_norm": 0.6111878752708435, "kl": 0.63865966796875, "learning_rate": 1e-05, "loss": 0.0256, "reward": 1.19375, "reward_std": 0.25420499257743356, "rewards/accuracy_reward": 0.234375, "rewards/format_reward": 0.959375, "step": 1245 }, { "completion_length": 154.44453125, "epoch": 0.5521658706275365, "grad_norm": 2.512364387512207, "kl": 0.91964111328125, "learning_rate": 9.922849668773181e-06, "loss": 0.0368, "reward": 1.12890625, "reward_std": 0.29293579459190366, "rewards/accuracy_reward": 0.203125, "rewards/format_reward": 0.92578125, "step": 1250 }, { "completion_length": 129.0, "epoch": 0.5543745341100467, "grad_norm": 5.317835807800293, "kl": 1.59818115234375, "learning_rate": 9.845703929736351e-06, "loss": 0.0639, "reward": 1.18984375, "reward_std": 0.26693961266428234, "rewards/accuracy_reward": 0.23984375, "rewards/format_reward": 0.95, "step": 1255 }, { "completion_length": 108.20859375, "epoch": 0.5565831975925568, "grad_norm": 0.5913572311401367, "kl": 1.3113525390625, "learning_rate": 9.768567374806163e-06, "loss": 0.0524, "reward": 1.153125, "reward_std": 0.2126995487138629, "rewards/accuracy_reward": 0.18359375, "rewards/format_reward": 0.96953125, "step": 1260 }, { "completion_length": 107.97890625, "epoch": 0.558791861075067, "grad_norm": 1.2350952625274658, "kl": 0.59656982421875, "learning_rate": 9.691444595352596e-06, "loss": 0.0239, "reward": 1.23359375, "reward_std": 0.21166725642979145, "rewards/accuracy_reward": 0.2640625, "rewards/format_reward": 0.96953125, "step": 1265 }, { "completion_length": 145.5171875, "epoch": 0.5610005245575771, "grad_norm": 0.784472644329071, "kl": 0.96728515625, "learning_rate": 9.614340181925692e-06, "loss": 0.0387, "reward": 1.15625, "reward_std": 0.2822375038638711, "rewards/accuracy_reward": 0.21328125, "rewards/format_reward": 0.94296875, "step": 1270 }, { "completion_length": 147.246875, "epoch": 0.5632091880400872, "grad_norm": 0.712846577167511, "kl": 0.7251220703125, "learning_rate": 9.53725872398229e-06, "loss": 0.029, "reward": 1.11328125, "reward_std": 0.28186143897473814, "rewards/accuracy_reward": 0.18984375, "rewards/format_reward": 0.9234375, "step": 1275 }, { "completion_length": 83.80546875, "epoch": 0.5654178515225974, "grad_norm": 0.5126023888587952, "kl": 0.632373046875, "learning_rate": 9.460204809612864e-06, "loss": 0.0253, "reward": 1.12734375, "reward_std": 0.193264627084136, "rewards/accuracy_reward": 0.14609375, "rewards/format_reward": 0.98125, "step": 1280 }, { "completion_length": 62.38515625, "epoch": 0.5676265150051075, "grad_norm": 0.3859677314758301, "kl": 0.5292724609375, "learning_rate": 9.383183025268411e-06, "loss": 0.0212, "reward": 1.21328125, "reward_std": 0.15019516460597515, "rewards/accuracy_reward": 0.21796875, "rewards/format_reward": 0.9953125, "step": 1285 }, { "completion_length": 69.62578125, "epoch": 0.5698351784876177, "grad_norm": 0.4441126883029938, "kl": 0.4895263671875, "learning_rate": 9.306197955487479e-06, "loss": 0.0196, "reward": 1.22265625, "reward_std": 0.1509174121543765, "rewards/accuracy_reward": 0.22578125, "rewards/format_reward": 0.996875, "step": 1290 }, { "completion_length": 90.8921875, "epoch": 0.5720438419701278, "grad_norm": 0.24060453474521637, "kl": 0.4620361328125, "learning_rate": 9.22925418262326e-06, "loss": 0.0185, "reward": 1.21015625, "reward_std": 0.15958714820444583, "rewards/accuracy_reward": 0.215625, "rewards/format_reward": 0.99453125, "step": 1295 }, { "completion_length": 113.90625, "epoch": 0.574252505452638, "grad_norm": 0.5410070419311523, "kl": 0.451025390625, "learning_rate": 9.15235628657085e-06, "loss": 0.018, "reward": 1.253125, "reward_std": 0.21325785480439663, "rewards/accuracy_reward": 0.26953125, "rewards/format_reward": 0.98359375, "step": 1300 }, { "epoch": 0.574252505452638, "eval_completion_length": 121.16583343505859, "eval_kl": 0.428828125, "eval_loss": 0.017305398359894753, "eval_reward": 1.22625, "eval_reward_std": 0.23555977791547775, "eval_rewards/accuracy_reward": 0.2533333334326744, "eval_rewards/format_reward": 0.9729166674613953, "eval_runtime": 139.5802, "eval_samples_per_second": 0.709, "eval_steps_per_second": 0.029, "step": 1300 }, { "completion_length": 123.81796875, "epoch": 0.5764611689351481, "grad_norm": 0.5413645505905151, "kl": 0.43470458984375, "learning_rate": 9.07550884449463e-06, "loss": 0.0174, "reward": 1.178125, "reward_std": 0.20707119330763818, "rewards/accuracy_reward": 0.209375, "rewards/format_reward": 0.96875, "step": 1305 }, { "completion_length": 143.82578125, "epoch": 0.5786698324176582, "grad_norm": 0.4805835783481598, "kl": 0.44522705078125, "learning_rate": 8.998716430555832e-06, "loss": 0.0178, "reward": 1.128125, "reward_std": 0.2421926449984312, "rewards/accuracy_reward": 0.17890625, "rewards/format_reward": 0.94921875, "step": 1310 }, { "completion_length": 146.4421875, "epoch": 0.5808784959001684, "grad_norm": 0.36650240421295166, "kl": 0.50875244140625, "learning_rate": 8.921983615640277e-06, "loss": 0.0203, "reward": 1.13515625, "reward_std": 0.2266262538731098, "rewards/accuracy_reward": 0.1765625, "rewards/format_reward": 0.95859375, "step": 1315 }, { "completion_length": 125.47421875, "epoch": 0.5830871593826785, "grad_norm": 0.431325227022171, "kl": 0.4361328125, "learning_rate": 8.845314967086281e-06, "loss": 0.0174, "reward": 1.13984375, "reward_std": 0.16203333698213102, "rewards/accuracy_reward": 0.1609375, "rewards/format_reward": 0.97890625, "step": 1320 }, { "completion_length": 116.61015625, "epoch": 0.5852958228651887, "grad_norm": 0.44710680842399597, "kl": 0.41380615234375, "learning_rate": 8.768715048412823e-06, "loss": 0.0166, "reward": 1.215625, "reward_std": 0.19275038037449121, "rewards/accuracy_reward": 0.228125, "rewards/format_reward": 0.9875, "step": 1325 }, { "completion_length": 119.11171875, "epoch": 0.5875044863476988, "grad_norm": 0.49566569924354553, "kl": 0.420166015625, "learning_rate": 8.692188419047889e-06, "loss": 0.0168, "reward": 1.19375, "reward_std": 0.1878614580258727, "rewards/accuracy_reward": 0.20625, "rewards/format_reward": 0.9875, "step": 1330 }, { "completion_length": 132.55390625, "epoch": 0.589713149830209, "grad_norm": 0.5140964388847351, "kl": 0.447314453125, "learning_rate": 8.615739634057098e-06, "loss": 0.0179, "reward": 1.21171875, "reward_std": 0.19049166329205036, "rewards/accuracy_reward": 0.2328125, "rewards/format_reward": 0.97890625, "step": 1335 }, { "completion_length": 147.45390625, "epoch": 0.5919218133127191, "grad_norm": 0.5098828673362732, "kl": 0.456298828125, "learning_rate": 8.539373243872569e-06, "loss": 0.0182, "reward": 1.18359375, "reward_std": 0.20728036612272263, "rewards/accuracy_reward": 0.21171875, "rewards/format_reward": 0.971875, "step": 1340 }, { "completion_length": 148.99296875, "epoch": 0.5941304767952292, "grad_norm": 0.5452316999435425, "kl": 0.48438720703125, "learning_rate": 8.463093794022069e-06, "loss": 0.0194, "reward": 1.2, "reward_std": 0.26944386642426255, "rewards/accuracy_reward": 0.24140625, "rewards/format_reward": 0.95859375, "step": 1345 }, { "completion_length": 146.82578125, "epoch": 0.5963391402777395, "grad_norm": 0.3637787103652954, "kl": 0.42889404296875, "learning_rate": 8.386905824858436e-06, "loss": 0.0172, "reward": 1.165625, "reward_std": 0.18749849069863558, "rewards/accuracy_reward": 0.1921875, "rewards/format_reward": 0.9734375, "step": 1350 }, { "completion_length": 163.79296875, "epoch": 0.5985478037602496, "grad_norm": 0.5319222807884216, "kl": 0.41142578125, "learning_rate": 8.310813871289349e-06, "loss": 0.0165, "reward": 1.1984375, "reward_std": 0.25758711863309147, "rewards/accuracy_reward": 0.2375, "rewards/format_reward": 0.9609375, "step": 1355 }, { "completion_length": 158.078125, "epoch": 0.6007564672427598, "grad_norm": 0.4595475494861603, "kl": 0.4110107421875, "learning_rate": 8.234822462507384e-06, "loss": 0.0164, "reward": 1.2328125, "reward_std": 0.21721374820917844, "rewards/accuracy_reward": 0.25625, "rewards/format_reward": 0.9765625, "step": 1360 }, { "completion_length": 145.98046875, "epoch": 0.6029651307252699, "grad_norm": 0.49845319986343384, "kl": 0.41668701171875, "learning_rate": 8.158936121720433e-06, "loss": 0.0167, "reward": 1.24609375, "reward_std": 0.22847788464277982, "rewards/accuracy_reward": 0.2765625, "rewards/format_reward": 0.96953125, "step": 1365 }, { "completion_length": 171.2109375, "epoch": 0.6051737942077801, "grad_norm": 0.5033155083656311, "kl": 0.4380126953125, "learning_rate": 8.08315936588245e-06, "loss": 0.0175, "reward": 1.20390625, "reward_std": 0.22734466083347799, "rewards/accuracy_reward": 0.24765625, "rewards/format_reward": 0.95625, "step": 1370 }, { "completion_length": 154.93125, "epoch": 0.6073824576902902, "grad_norm": 0.5040455460548401, "kl": 0.41199951171875, "learning_rate": 8.00749670542462e-06, "loss": 0.0165, "reward": 1.22734375, "reward_std": 0.21946511473506689, "rewards/accuracy_reward": 0.246875, "rewards/format_reward": 0.98046875, "step": 1375 }, { "completion_length": 174.39609375, "epoch": 0.6095911211728003, "grad_norm": 0.6203243136405945, "kl": 0.4758056640625, "learning_rate": 7.931952643986866e-06, "loss": 0.019, "reward": 1.225, "reward_std": 0.25668525900691747, "rewards/accuracy_reward": 0.259375, "rewards/format_reward": 0.965625, "step": 1380 }, { "completion_length": 148.69375, "epoch": 0.6117997846553105, "grad_norm": 3.571584939956665, "kl": 0.6333251953125, "learning_rate": 7.856531678149792e-06, "loss": 0.0253, "reward": 1.21015625, "reward_std": 0.19479831736534833, "rewards/accuracy_reward": 0.23359375, "rewards/format_reward": 0.9765625, "step": 1385 }, { "completion_length": 163.81796875, "epoch": 0.6140084481378206, "grad_norm": 1.1779475212097168, "kl": 0.9492919921875, "learning_rate": 7.781238297167025e-06, "loss": 0.0379, "reward": 1.1703125, "reward_std": 0.22253222949802876, "rewards/accuracy_reward": 0.2109375, "rewards/format_reward": 0.959375, "step": 1390 }, { "completion_length": 160.65625, "epoch": 0.6162171116203308, "grad_norm": 1.4942560195922852, "kl": 1.04031982421875, "learning_rate": 7.706076982698e-06, "loss": 0.0416, "reward": 1.2109375, "reward_std": 0.25693559013307093, "rewards/accuracy_reward": 0.2578125, "rewards/format_reward": 0.953125, "step": 1395 }, { "completion_length": 142.009375, "epoch": 0.6184257751028409, "grad_norm": 0.8941807746887207, "kl": 1.5090576171875, "learning_rate": 7.631052208541217e-06, "loss": 0.0605, "reward": 1.24453125, "reward_std": 0.24907068870961666, "rewards/accuracy_reward": 0.27109375, "rewards/format_reward": 0.9734375, "step": 1400 }, { "epoch": 0.6184257751028409, "eval_completion_length": 126.1554168701172, "eval_kl": 0.51203125, "eval_loss": 0.020548321306705475, "eval_reward": 1.2670833349227906, "eval_reward_std": 0.2361640551686287, "eval_rewards/accuracy_reward": 0.2795833334326744, "eval_rewards/format_reward": 0.9875, "eval_runtime": 108.4132, "eval_samples_per_second": 0.913, "eval_steps_per_second": 0.037, "step": 1400 }, { "completion_length": 129.46015625, "epoch": 0.6206344385853511, "grad_norm": 0.5738782286643982, "kl": 0.52000732421875, "learning_rate": 7.5561684403679355e-06, "loss": 0.0208, "reward": 1.159375, "reward_std": 0.18879235051572324, "rewards/accuracy_reward": 0.1828125, "rewards/format_reward": 0.9765625, "step": 1405 }, { "completion_length": 160.28828125, "epoch": 0.6228431020678612, "grad_norm": 0.8489373326301575, "kl": 0.78538818359375, "learning_rate": 7.4814301354563735e-06, "loss": 0.0314, "reward": 1.1765625, "reward_std": 0.2816866671666503, "rewards/accuracy_reward": 0.23046875, "rewards/format_reward": 0.94609375, "step": 1410 }, { "completion_length": 130.85, "epoch": 0.6250517655503713, "grad_norm": 0.4323514997959137, "kl": 0.4590087890625, "learning_rate": 7.40684174242638e-06, "loss": 0.0184, "reward": 1.1921875, "reward_std": 0.20537027660757304, "rewards/accuracy_reward": 0.21796875, "rewards/format_reward": 0.97421875, "step": 1415 }, { "completion_length": 129.54921875, "epoch": 0.6272604290328815, "grad_norm": 0.4422205686569214, "kl": 0.44378662109375, "learning_rate": 7.332407700974673e-06, "loss": 0.0178, "reward": 1.2109375, "reward_std": 0.22267536614090205, "rewards/accuracy_reward": 0.23125, "rewards/format_reward": 0.9796875, "step": 1420 }, { "completion_length": 156.784375, "epoch": 0.6294690925153916, "grad_norm": 0.6342864632606506, "kl": 0.4815185546875, "learning_rate": 7.258132441610548e-06, "loss": 0.0193, "reward": 1.225, "reward_std": 0.27565329764038327, "rewards/accuracy_reward": 0.2703125, "rewards/format_reward": 0.9546875, "step": 1425 }, { "completion_length": 189.24375, "epoch": 0.6316777559979018, "grad_norm": 0.5509055256843567, "kl": 0.6362548828125, "learning_rate": 7.184020385392186e-06, "loss": 0.0254, "reward": 1.1234375, "reward_std": 0.2952466538175941, "rewards/accuracy_reward": 0.19765625, "rewards/format_reward": 0.92578125, "step": 1430 }, { "completion_length": 125.340625, "epoch": 0.6338864194804119, "grad_norm": 0.44184309244155884, "kl": 0.415966796875, "learning_rate": 7.110075943663473e-06, "loss": 0.0166, "reward": 1.25625, "reward_std": 0.20973300114274024, "rewards/accuracy_reward": 0.2796875, "rewards/format_reward": 0.9765625, "step": 1435 }, { "completion_length": 110.86875, "epoch": 0.636095082962922, "grad_norm": 0.393185019493103, "kl": 0.45828857421875, "learning_rate": 7.0363035177914505e-06, "loss": 0.0183, "reward": 1.196875, "reward_std": 0.18387279994785785, "rewards/accuracy_reward": 0.2203125, "rewards/format_reward": 0.9765625, "step": 1440 }, { "completion_length": 121.4484375, "epoch": 0.6383037464454322, "grad_norm": 0.791002094745636, "kl": 0.54757080078125, "learning_rate": 6.962707498904331e-06, "loss": 0.0219, "reward": 1.21328125, "reward_std": 0.2349924026057124, "rewards/accuracy_reward": 0.2453125, "rewards/format_reward": 0.96796875, "step": 1445 }, { "completion_length": 145.55703125, "epoch": 0.6405124099279423, "grad_norm": 0.44890421628952026, "kl": 0.5147216796875, "learning_rate": 6.889292267630106e-06, "loss": 0.0206, "reward": 1.19375, "reward_std": 0.24552099388092757, "rewards/accuracy_reward": 0.22890625, "rewards/format_reward": 0.96484375, "step": 1450 }, { "completion_length": 142.5640625, "epoch": 0.6427210734104525, "grad_norm": 0.47677525877952576, "kl": 0.44825439453125, "learning_rate": 6.81606219383583e-06, "loss": 0.0179, "reward": 1.22109375, "reward_std": 0.18264568988233804, "rewards/accuracy_reward": 0.2375, "rewards/format_reward": 0.98359375, "step": 1455 }, { "completion_length": 141.29765625, "epoch": 0.6449297368929626, "grad_norm": 0.5547141432762146, "kl": 0.45455322265625, "learning_rate": 6.743021636367488e-06, "loss": 0.0182, "reward": 1.24296875, "reward_std": 0.20987400207668544, "rewards/accuracy_reward": 0.2703125, "rewards/format_reward": 0.97265625, "step": 1460 }, { "completion_length": 141.034375, "epoch": 0.6471384003754728, "grad_norm": 0.43943309783935547, "kl": 0.45238037109375, "learning_rate": 6.670174942790557e-06, "loss": 0.0181, "reward": 1.2390625, "reward_std": 0.2042137583717704, "rewards/accuracy_reward": 0.26484375, "rewards/format_reward": 0.97421875, "step": 1465 }, { "completion_length": 159.996875, "epoch": 0.6493470638579829, "grad_norm": 0.5421745181083679, "kl": 0.55458984375, "learning_rate": 6.597526449131232e-06, "loss": 0.0222, "reward": 1.26875, "reward_std": 0.25485040955245497, "rewards/accuracy_reward": 0.30703125, "rewards/format_reward": 0.96171875, "step": 1470 }, { "completion_length": 152.93984375, "epoch": 0.651555727340493, "grad_norm": 0.49404215812683105, "kl": 0.4667724609375, "learning_rate": 6.525080479618331e-06, "loss": 0.0187, "reward": 1.1671875, "reward_std": 0.22337155733257533, "rewards/accuracy_reward": 0.1984375, "rewards/format_reward": 0.96875, "step": 1475 }, { "completion_length": 142.815625, "epoch": 0.6537643908230032, "grad_norm": 0.5135136842727661, "kl": 0.438818359375, "learning_rate": 6.452841346425891e-06, "loss": 0.0176, "reward": 1.2640625, "reward_std": 0.24292335454374553, "rewards/accuracy_reward": 0.2875, "rewards/format_reward": 0.9765625, "step": 1480 }, { "completion_length": 133.865625, "epoch": 0.6559730543055133, "grad_norm": 0.5439188480377197, "kl": 0.40977783203125, "learning_rate": 6.380813349416523e-06, "loss": 0.0164, "reward": 1.2625, "reward_std": 0.21160587538033723, "rewards/accuracy_reward": 0.27578125, "rewards/format_reward": 0.98671875, "step": 1485 }, { "completion_length": 141.23671875, "epoch": 0.6581817177880235, "grad_norm": 0.5219649076461792, "kl": 0.4312255859375, "learning_rate": 6.309000775885452e-06, "loss": 0.0172, "reward": 1.21796875, "reward_std": 0.21080582737922668, "rewards/accuracy_reward": 0.24375, "rewards/format_reward": 0.97421875, "step": 1490 }, { "completion_length": 156.421875, "epoch": 0.6603903812705336, "grad_norm": 0.6208651065826416, "kl": 0.45198974609375, "learning_rate": 6.237407900305334e-06, "loss": 0.0181, "reward": 1.21328125, "reward_std": 0.21545952204614877, "rewards/accuracy_reward": 0.2453125, "rewards/format_reward": 0.96796875, "step": 1495 }, { "completion_length": 157.92578125, "epoch": 0.6625990447530438, "grad_norm": 0.514742910861969, "kl": 0.445703125, "learning_rate": 6.166038984071833e-06, "loss": 0.0178, "reward": 1.20859375, "reward_std": 0.24552375469356774, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.95859375, "step": 1500 }, { "epoch": 0.6625990447530438, "eval_completion_length": 150.59583343505858, "eval_kl": 0.510390625, "eval_loss": 0.020548084750771523, "eval_reward": 1.2470833349227906, "eval_reward_std": 0.23129627466201783, "eval_rewards/accuracy_reward": 0.2833333334326744, "eval_rewards/format_reward": 0.96375, "eval_runtime": 130.7052, "eval_samples_per_second": 0.757, "eval_steps_per_second": 0.031, "step": 1500 }, { "completion_length": 146.13515625, "epoch": 0.6648077082355539, "grad_norm": 0.46206313371658325, "kl": 0.51572265625, "learning_rate": 6.094898275249952e-06, "loss": 0.0206, "reward": 1.2828125, "reward_std": 0.23776858411729335, "rewards/accuracy_reward": 0.3203125, "rewards/format_reward": 0.9625, "step": 1505 }, { "completion_length": 115.98671875, "epoch": 0.6670163717180642, "grad_norm": 0.7776006460189819, "kl": 0.4756591796875, "learning_rate": 6.023990008321199e-06, "loss": 0.019, "reward": 1.24765625, "reward_std": 0.21786664836108685, "rewards/accuracy_reward": 0.26953125, "rewards/format_reward": 0.978125, "step": 1510 }, { "completion_length": 125.35625, "epoch": 0.6692250352005743, "grad_norm": 0.4395429193973541, "kl": 0.5373779296875, "learning_rate": 5.953318403931533e-06, "loss": 0.0215, "reward": 1.20703125, "reward_std": 0.22481790594756604, "rewards/accuracy_reward": 0.24765625, "rewards/format_reward": 0.959375, "step": 1515 }, { "completion_length": 99.33359375, "epoch": 0.6714336986830844, "grad_norm": 0.45579713582992554, "kl": 0.524560546875, "learning_rate": 5.882887668640138e-06, "loss": 0.021, "reward": 1.18828125, "reward_std": 0.16194322612136602, "rewards/accuracy_reward": 0.1984375, "rewards/format_reward": 0.98984375, "step": 1520 }, { "completion_length": 126.44296875, "epoch": 0.6736423621655946, "grad_norm": 0.42683523893356323, "kl": 0.4496826171875, "learning_rate": 5.812701994669028e-06, "loss": 0.018, "reward": 1.221875, "reward_std": 0.24250736236572265, "rewards/accuracy_reward": 0.24921875, "rewards/format_reward": 0.97265625, "step": 1525 }, { "completion_length": 179.19921875, "epoch": 0.6758510256481047, "grad_norm": 0.9905825257301331, "kl": 0.49107666015625, "learning_rate": 5.742765559653537e-06, "loss": 0.0197, "reward": 1.16875, "reward_std": 0.25706158187240363, "rewards/accuracy_reward": 0.2171875, "rewards/format_reward": 0.9515625, "step": 1530 }, { "completion_length": 223.625, "epoch": 0.6780596891306149, "grad_norm": 0.45996785163879395, "kl": 0.45472412109375, "learning_rate": 5.673082526393634e-06, "loss": 0.0182, "reward": 1.1953125, "reward_std": 0.2674042139202356, "rewards/accuracy_reward": 0.2484375, "rewards/format_reward": 0.946875, "step": 1535 }, { "completion_length": 198.6390625, "epoch": 0.680268352613125, "grad_norm": 0.37783390283584595, "kl": 0.353515625, "learning_rate": 5.603657042606163e-06, "loss": 0.0141, "reward": 1.178125, "reward_std": 0.21960081458091735, "rewards/accuracy_reward": 0.21171875, "rewards/format_reward": 0.96640625, "step": 1540 }, { "completion_length": 197.028125, "epoch": 0.6824770160956352, "grad_norm": 0.6316084265708923, "kl": 0.368896484375, "learning_rate": 5.53449324067793e-06, "loss": 0.0148, "reward": 1.171875, "reward_std": 0.2550745034590364, "rewards/accuracy_reward": 0.21640625, "rewards/format_reward": 0.95546875, "step": 1545 }, { "completion_length": 172.515625, "epoch": 0.6846856795781453, "grad_norm": 0.5714349746704102, "kl": 0.41336669921875, "learning_rate": 5.465595237419768e-06, "loss": 0.0165, "reward": 1.265625, "reward_std": 0.23817113135010004, "rewards/accuracy_reward": 0.3140625, "rewards/format_reward": 0.9515625, "step": 1550 }, { "completion_length": 164.86171875, "epoch": 0.6868943430606554, "grad_norm": 0.31497815251350403, "kl": 0.40863037109375, "learning_rate": 5.396967133821461e-06, "loss": 0.0164, "reward": 1.24296875, "reward_std": 0.22363899704068899, "rewards/accuracy_reward": 0.278125, "rewards/format_reward": 0.96484375, "step": 1555 }, { "completion_length": 144.21171875, "epoch": 0.6891030065431656, "grad_norm": 0.3973537087440491, "kl": 0.39749755859375, "learning_rate": 5.3286130148076765e-06, "loss": 0.0159, "reward": 1.2203125, "reward_std": 0.20798758920282126, "rewards/accuracy_reward": 0.24375, "rewards/format_reward": 0.9765625, "step": 1560 }, { "completion_length": 126.52421875, "epoch": 0.6913116700256757, "grad_norm": 0.5487494468688965, "kl": 0.41744384765625, "learning_rate": 5.260536948994786e-06, "loss": 0.0167, "reward": 1.246875, "reward_std": 0.2289178878068924, "rewards/accuracy_reward": 0.26640625, "rewards/format_reward": 0.98046875, "step": 1565 }, { "completion_length": 153.5671875, "epoch": 0.6935203335081859, "grad_norm": 1.368481993675232, "kl": 0.47470703125, "learning_rate": 5.192742988448707e-06, "loss": 0.019, "reward": 1.22890625, "reward_std": 0.24065108597278595, "rewards/accuracy_reward": 0.2828125, "rewards/format_reward": 0.94609375, "step": 1570 }, { "completion_length": 152.64296875, "epoch": 0.695728996990696, "grad_norm": 0.460273414850235, "kl": 0.53028564453125, "learning_rate": 5.125235168443714e-06, "loss": 0.0212, "reward": 1.22734375, "reward_std": 0.26983388569206, "rewards/accuracy_reward": 0.28125, "rewards/format_reward": 0.94609375, "step": 1575 }, { "completion_length": 121.12890625, "epoch": 0.6979376604732062, "grad_norm": 0.5840798020362854, "kl": 0.4253173828125, "learning_rate": 5.058017507222254e-06, "loss": 0.017, "reward": 1.215625, "reward_std": 0.20102577321231366, "rewards/accuracy_reward": 0.240625, "rewards/format_reward": 0.975, "step": 1580 }, { "completion_length": 110.19375, "epoch": 0.7001463239557163, "grad_norm": 0.48586878180503845, "kl": 0.4316650390625, "learning_rate": 4.99109400575575e-06, "loss": 0.0173, "reward": 1.28984375, "reward_std": 0.19885436855256558, "rewards/accuracy_reward": 0.30546875, "rewards/format_reward": 0.984375, "step": 1585 }, { "completion_length": 128.54609375, "epoch": 0.7023549874382264, "grad_norm": 1.0986440181732178, "kl": 0.42666015625, "learning_rate": 4.924468647506473e-06, "loss": 0.0171, "reward": 1.22421875, "reward_std": 0.23164935149252414, "rewards/accuracy_reward": 0.2515625, "rewards/format_reward": 0.97265625, "step": 1590 }, { "completion_length": 149.9328125, "epoch": 0.7045636509207366, "grad_norm": 0.595619797706604, "kl": 0.46484375, "learning_rate": 4.8581453981904205e-06, "loss": 0.0186, "reward": 1.25078125, "reward_std": 0.2856699053198099, "rewards/accuracy_reward": 0.2984375, "rewards/format_reward": 0.95234375, "step": 1595 }, { "completion_length": 156.828125, "epoch": 0.7067723144032467, "grad_norm": 0.4311577081680298, "kl": 0.40791015625, "learning_rate": 4.792128205541286e-06, "loss": 0.0163, "reward": 1.18359375, "reward_std": 0.2468837944790721, "rewards/accuracy_reward": 0.2296875, "rewards/format_reward": 0.95390625, "step": 1600 }, { "epoch": 0.7067723144032467, "eval_completion_length": 135.73458335876464, "eval_kl": 0.39359375, "eval_loss": 0.015719087794423103, "eval_reward": 1.2620833349227905, "eval_reward_std": 0.2296227565407753, "eval_rewards/accuracy_reward": 0.2908333334326744, "eval_rewards/format_reward": 0.97125, "eval_runtime": 135.2265, "eval_samples_per_second": 0.732, "eval_steps_per_second": 0.03, "step": 1600 }, { "completion_length": 158.7234375, "epoch": 0.7089809778857569, "grad_norm": 0.485534131526947, "kl": 0.39541015625, "learning_rate": 4.7264209990754594e-06, "loss": 0.0158, "reward": 1.265625, "reward_std": 0.2561331497505307, "rewards/accuracy_reward": 0.30703125, "rewards/format_reward": 0.95859375, "step": 1605 }, { "completion_length": 163.1359375, "epoch": 0.711189641368267, "grad_norm": 0.38874199986457825, "kl": 0.3953125, "learning_rate": 4.661027689858142e-06, "loss": 0.0158, "reward": 1.15859375, "reward_std": 0.23531383704394102, "rewards/accuracy_reward": 0.2046875, "rewards/format_reward": 0.95390625, "step": 1610 }, { "completion_length": 131.1453125, "epoch": 0.7133983048507772, "grad_norm": 0.7737115025520325, "kl": 0.432568359375, "learning_rate": 4.595952170270542e-06, "loss": 0.0173, "reward": 1.23828125, "reward_std": 0.24948414210230113, "rewards/accuracy_reward": 0.26953125, "rewards/format_reward": 0.96875, "step": 1615 }, { "completion_length": 129.2015625, "epoch": 0.7156069683332873, "grad_norm": 0.44618454575538635, "kl": 0.44493408203125, "learning_rate": 4.5311983137782116e-06, "loss": 0.0178, "reward": 1.19453125, "reward_std": 0.18757406566292048, "rewards/accuracy_reward": 0.221875, "rewards/format_reward": 0.97265625, "step": 1620 }, { "completion_length": 128.13671875, "epoch": 0.7178156318157974, "grad_norm": 0.4870763421058655, "kl": 0.44984130859375, "learning_rate": 4.4667699747004555e-06, "loss": 0.018, "reward": 1.22734375, "reward_std": 0.2222797654569149, "rewards/accuracy_reward": 0.2640625, "rewards/format_reward": 0.96328125, "step": 1625 }, { "completion_length": 128.1109375, "epoch": 0.7200242952983076, "grad_norm": 0.830633819103241, "kl": 0.471240234375, "learning_rate": 4.402670987980938e-06, "loss": 0.0189, "reward": 1.21796875, "reward_std": 0.2280710056424141, "rewards/accuracy_reward": 0.24921875, "rewards/format_reward": 0.96875, "step": 1630 }, { "completion_length": 116.93515625, "epoch": 0.7222329587808177, "grad_norm": 0.5954543948173523, "kl": 0.46390380859375, "learning_rate": 4.3389051689594e-06, "loss": 0.0186, "reward": 1.2171875, "reward_std": 0.21377347223460674, "rewards/accuracy_reward": 0.24296875, "rewards/format_reward": 0.97421875, "step": 1635 }, { "completion_length": 118.27734375, "epoch": 0.7244416222633279, "grad_norm": 0.42961063981056213, "kl": 0.41763916015625, "learning_rate": 4.275476313144578e-06, "loss": 0.0167, "reward": 1.2125, "reward_std": 0.21657640542834997, "rewards/accuracy_reward": 0.23828125, "rewards/format_reward": 0.97421875, "step": 1640 }, { "completion_length": 113.2859375, "epoch": 0.726650285745838, "grad_norm": 0.4778992831707001, "kl": 0.46826171875, "learning_rate": 4.212388195988267e-06, "loss": 0.0187, "reward": 1.221875, "reward_std": 0.2040594968944788, "rewards/accuracy_reward": 0.240625, "rewards/format_reward": 0.98125, "step": 1645 }, { "completion_length": 134.99296875, "epoch": 0.7288589492283482, "grad_norm": 0.868116021156311, "kl": 0.4645263671875, "learning_rate": 4.1496445726606064e-06, "loss": 0.0186, "reward": 1.246875, "reward_std": 0.20862858258187772, "rewards/accuracy_reward": 0.2734375, "rewards/format_reward": 0.9734375, "step": 1650 }, { "completion_length": 167.1515625, "epoch": 0.7310676127108583, "grad_norm": 0.5153465867042542, "kl": 0.4582275390625, "learning_rate": 4.087249177826553e-06, "loss": 0.0183, "reward": 1.26953125, "reward_std": 0.29226357098668815, "rewards/accuracy_reward": 0.3171875, "rewards/format_reward": 0.95234375, "step": 1655 }, { "completion_length": 167.3953125, "epoch": 0.7332762761933684, "grad_norm": 0.9335393905639648, "kl": 0.4902099609375, "learning_rate": 4.025205725423607e-06, "loss": 0.0196, "reward": 1.2296875, "reward_std": 0.3044658374041319, "rewards/accuracy_reward": 0.2875, "rewards/format_reward": 0.9421875, "step": 1660 }, { "completion_length": 124.8671875, "epoch": 0.7354849396758786, "grad_norm": 0.4761280119419098, "kl": 0.4821533203125, "learning_rate": 3.963517908440716e-06, "loss": 0.0193, "reward": 1.22578125, "reward_std": 0.19983983058482407, "rewards/accuracy_reward": 0.246875, "rewards/format_reward": 0.97890625, "step": 1665 }, { "completion_length": 127.8421875, "epoch": 0.7376936031583888, "grad_norm": 0.5155916810035706, "kl": 0.43746337890625, "learning_rate": 3.902189398698482e-06, "loss": 0.0175, "reward": 1.18515625, "reward_std": 0.17090727612376214, "rewards/accuracy_reward": 0.2078125, "rewards/format_reward": 0.97734375, "step": 1670 }, { "completion_length": 146.315625, "epoch": 0.739902266640899, "grad_norm": 0.3942101001739502, "kl": 0.41949462890625, "learning_rate": 3.841223846630599e-06, "loss": 0.0168, "reward": 1.21328125, "reward_std": 0.19338544998317958, "rewards/accuracy_reward": 0.23671875, "rewards/format_reward": 0.9765625, "step": 1675 }, { "completion_length": 170.7625, "epoch": 0.7421109301234091, "grad_norm": 0.4603167176246643, "kl": 0.39854736328125, "learning_rate": 3.7806248810665613e-06, "loss": 0.0159, "reward": 1.25078125, "reward_std": 0.22261980101466178, "rewards/accuracy_reward": 0.2734375, "rewards/format_reward": 0.97734375, "step": 1680 }, { "completion_length": 195.67265625, "epoch": 0.7443195936059193, "grad_norm": 0.527126669883728, "kl": 0.41444091796875, "learning_rate": 3.720396109015686e-06, "loss": 0.0166, "reward": 1.2375, "reward_std": 0.24860016535967588, "rewards/accuracy_reward": 0.275, "rewards/format_reward": 0.9625, "step": 1685 }, { "completion_length": 178.1625, "epoch": 0.7465282570884294, "grad_norm": 0.4369182884693146, "kl": 0.37342529296875, "learning_rate": 3.6605411154523885e-06, "loss": 0.0149, "reward": 1.24140625, "reward_std": 0.22675297893583773, "rewards/accuracy_reward": 0.26953125, "rewards/format_reward": 0.971875, "step": 1690 }, { "completion_length": 168.7265625, "epoch": 0.7487369205709395, "grad_norm": 0.5954079031944275, "kl": 0.381005859375, "learning_rate": 3.601063463102823e-06, "loss": 0.0152, "reward": 1.21015625, "reward_std": 0.25745327677577734, "rewards/accuracy_reward": 0.24296875, "rewards/format_reward": 0.9671875, "step": 1695 }, { "completion_length": 176.64375, "epoch": 0.7509455840534497, "grad_norm": 0.4956624507904053, "kl": 0.423779296875, "learning_rate": 3.5419666922327854e-06, "loss": 0.017, "reward": 1.24609375, "reward_std": 0.25288699120283126, "rewards/accuracy_reward": 0.28125, "rewards/format_reward": 0.96484375, "step": 1700 }, { "epoch": 0.7509455840534497, "eval_completion_length": 163.3483334350586, "eval_kl": 0.376171875, "eval_loss": 0.015068257227540016, "eval_reward": 1.247916669845581, "eval_reward_std": 0.2429444035887718, "eval_rewards/accuracy_reward": 0.2904166667163372, "eval_rewards/format_reward": 0.9575, "eval_runtime": 148.8095, "eval_samples_per_second": 0.665, "eval_steps_per_second": 0.027, "step": 1700 }, { "completion_length": 165.946875, "epoch": 0.7531542475359598, "grad_norm": 0.4972885251045227, "kl": 0.40225830078125, "learning_rate": 3.4832543204370284e-06, "loss": 0.0161, "reward": 1.203125, "reward_std": 0.26637718454003334, "rewards/accuracy_reward": 0.2375, "rewards/format_reward": 0.965625, "step": 1705 }, { "completion_length": 173.74609375, "epoch": 0.75536291101847, "grad_norm": 0.6974431872367859, "kl": 0.40240478515625, "learning_rate": 3.424929842429848e-06, "loss": 0.0161, "reward": 1.17734375, "reward_std": 0.2468060377985239, "rewards/accuracy_reward": 0.22109375, "rewards/format_reward": 0.95625, "step": 1710 }, { "completion_length": 154.61953125, "epoch": 0.7575715745009801, "grad_norm": 0.43418002128601074, "kl": 0.3802001953125, "learning_rate": 3.366996729837102e-06, "loss": 0.0152, "reward": 1.2546875, "reward_std": 0.26053862273693085, "rewards/accuracy_reward": 0.29140625, "rewards/format_reward": 0.96328125, "step": 1715 }, { "completion_length": 141.70625, "epoch": 0.7597802379834903, "grad_norm": 0.5373135805130005, "kl": 0.39423828125, "learning_rate": 3.309458430989527e-06, "loss": 0.0158, "reward": 1.2203125, "reward_std": 0.216771724447608, "rewards/accuracy_reward": 0.24765625, "rewards/format_reward": 0.97265625, "step": 1720 }, { "completion_length": 142.44765625, "epoch": 0.7619889014660004, "grad_norm": 0.5140719413757324, "kl": 0.40374755859375, "learning_rate": 3.2523183707175366e-06, "loss": 0.0161, "reward": 1.234375, "reward_std": 0.21065853331238032, "rewards/accuracy_reward": 0.2671875, "rewards/format_reward": 0.9671875, "step": 1725 }, { "completion_length": 158.240625, "epoch": 0.7641975649485105, "grad_norm": 0.6687895655632019, "kl": 0.46646728515625, "learning_rate": 3.1955799501473226e-06, "loss": 0.0187, "reward": 1.18515625, "reward_std": 0.3053234376013279, "rewards/accuracy_reward": 0.24453125, "rewards/format_reward": 0.940625, "step": 1730 }, { "completion_length": 169.2078125, "epoch": 0.7664062284310207, "grad_norm": 0.4844968616962433, "kl": 0.48839111328125, "learning_rate": 3.1392465464984455e-06, "loss": 0.0195, "reward": 1.196875, "reward_std": 0.29177020620554683, "rewards/accuracy_reward": 0.26640625, "rewards/format_reward": 0.93046875, "step": 1735 }, { "completion_length": 152.02890625, "epoch": 0.7686148919135308, "grad_norm": 0.5372802019119263, "kl": 0.461181640625, "learning_rate": 3.083321512882773e-06, "loss": 0.0184, "reward": 1.17421875, "reward_std": 0.2651050504297018, "rewards/accuracy_reward": 0.22421875, "rewards/format_reward": 0.95, "step": 1740 }, { "completion_length": 129.86171875, "epoch": 0.770823555396041, "grad_norm": 0.781815767288208, "kl": 1.108740234375, "learning_rate": 3.0278081781049405e-06, "loss": 0.0444, "reward": 1.23125, "reward_std": 0.25756825953722, "rewards/accuracy_reward": 0.271875, "rewards/format_reward": 0.959375, "step": 1745 }, { "completion_length": 134.19296875, "epoch": 0.7730322188785511, "grad_norm": 0.5345208644866943, "kl": 0.44124755859375, "learning_rate": 2.9727098464641735e-06, "loss": 0.0177, "reward": 1.26875, "reward_std": 0.23357822820544244, "rewards/accuracy_reward": 0.30703125, "rewards/format_reward": 0.96171875, "step": 1750 }, { "completion_length": 136.47734375, "epoch": 0.7752408823610613, "grad_norm": 0.6541038155555725, "kl": 0.44271240234375, "learning_rate": 2.9180297975576368e-06, "loss": 0.0177, "reward": 1.2328125, "reward_std": 0.22248574066907167, "rewards/accuracy_reward": 0.271875, "rewards/format_reward": 0.9609375, "step": 1755 }, { "completion_length": 120.7203125, "epoch": 0.7774495458435714, "grad_norm": 0.4700789749622345, "kl": 0.449462890625, "learning_rate": 2.8637712860851974e-06, "loss": 0.018, "reward": 1.24765625, "reward_std": 0.23170709386467933, "rewards/accuracy_reward": 0.28125, "rewards/format_reward": 0.96640625, "step": 1760 }, { "completion_length": 123.3, "epoch": 0.7796582093260815, "grad_norm": 0.6256026029586792, "kl": 0.42320556640625, "learning_rate": 2.8099375416557163e-06, "loss": 0.0169, "reward": 1.2609375, "reward_std": 0.22770290337502958, "rewards/accuracy_reward": 0.29140625, "rewards/format_reward": 0.96953125, "step": 1765 }, { "completion_length": 115.96328125, "epoch": 0.7818668728085917, "grad_norm": 0.5066558718681335, "kl": 0.43795166015625, "learning_rate": 2.7565317685948e-06, "loss": 0.0175, "reward": 1.2984375, "reward_std": 0.22580576539039612, "rewards/accuracy_reward": 0.325, "rewards/format_reward": 0.9734375, "step": 1770 }, { "completion_length": 131.2125, "epoch": 0.7840755362911018, "grad_norm": 0.8898307681083679, "kl": 0.4900634765625, "learning_rate": 2.7035571457540865e-06, "loss": 0.0196, "reward": 1.23203125, "reward_std": 0.23165916074067355, "rewards/accuracy_reward": 0.2671875, "rewards/format_reward": 0.96484375, "step": 1775 }, { "completion_length": 136.24453125, "epoch": 0.786284199773612, "grad_norm": 0.49983325600624084, "kl": 0.4647705078125, "learning_rate": 2.651016826322017e-06, "loss": 0.0186, "reward": 1.26015625, "reward_std": 0.23627216089516878, "rewards/accuracy_reward": 0.3046875, "rewards/format_reward": 0.95546875, "step": 1780 }, { "completion_length": 120.8796875, "epoch": 0.7884928632561221, "grad_norm": 0.6942005753517151, "kl": 0.459130859375, "learning_rate": 2.598913937636153e-06, "loss": 0.0184, "reward": 1.24375, "reward_std": 0.2460821120068431, "rewards/accuracy_reward": 0.27578125, "rewards/format_reward": 0.96796875, "step": 1785 }, { "completion_length": 112.2453125, "epoch": 0.7907015267386323, "grad_norm": 0.48447439074516296, "kl": 0.4265380859375, "learning_rate": 2.5472515809970343e-06, "loss": 0.0171, "reward": 1.2421875, "reward_std": 0.19503602739423515, "rewards/accuracy_reward": 0.27109375, "rewards/format_reward": 0.97109375, "step": 1790 }, { "completion_length": 105.003125, "epoch": 0.7929101902211424, "grad_norm": 0.9765954613685608, "kl": 0.48828125, "learning_rate": 2.4960328314835746e-06, "loss": 0.0195, "reward": 1.21796875, "reward_std": 0.1995122255757451, "rewards/accuracy_reward": 0.240625, "rewards/format_reward": 0.97734375, "step": 1795 }, { "completion_length": 110.82421875, "epoch": 0.7951188537036525, "grad_norm": 1.8787330389022827, "kl": 0.458447265625, "learning_rate": 2.4452607377700367e-06, "loss": 0.0183, "reward": 1.25625, "reward_std": 0.23729459717869758, "rewards/accuracy_reward": 0.2828125, "rewards/format_reward": 0.9734375, "step": 1800 }, { "epoch": 0.7951188537036525, "eval_completion_length": 112.79791687011719, "eval_kl": 0.497265625, "eval_loss": 0.0199314896017313, "eval_reward": 1.2658333349227906, "eval_reward_std": 0.2067297151684761, "eval_rewards/accuracy_reward": 0.2920833334326744, "eval_rewards/format_reward": 0.97375, "eval_runtime": 119.8153, "eval_samples_per_second": 0.826, "eval_steps_per_second": 0.033, "step": 1800 }, { "completion_length": 109.19375, "epoch": 0.7973275171861627, "grad_norm": 2.065337657928467, "kl": 0.4661865234375, "learning_rate": 2.394938321944551e-06, "loss": 0.0187, "reward": 1.2140625, "reward_std": 0.22883423641324044, "rewards/accuracy_reward": 0.2421875, "rewards/format_reward": 0.971875, "step": 1805 }, { "completion_length": 108.35, "epoch": 0.7995361806686728, "grad_norm": 0.6966126561164856, "kl": 0.7103515625, "learning_rate": 2.3450685793292437e-06, "loss": 0.0284, "reward": 1.17265625, "reward_std": 0.17452798802405595, "rewards/accuracy_reward": 0.2015625, "rewards/format_reward": 0.97109375, "step": 1810 }, { "completion_length": 113.3828125, "epoch": 0.801744844151183, "grad_norm": 0.8000837564468384, "kl": 0.7013427734375, "learning_rate": 2.295654478301942e-06, "loss": 0.0281, "reward": 1.234375, "reward_std": 0.21197393592447042, "rewards/accuracy_reward": 0.2640625, "rewards/format_reward": 0.9703125, "step": 1815 }, { "completion_length": 116.340625, "epoch": 0.8039535076336931, "grad_norm": 0.9716657996177673, "kl": 0.9371826171875, "learning_rate": 2.246698960119499e-06, "loss": 0.0375, "reward": 1.221875, "reward_std": 0.23381243012845515, "rewards/accuracy_reward": 0.246875, "rewards/format_reward": 0.975, "step": 1820 }, { "completion_length": 143.3609375, "epoch": 0.8061621711162033, "grad_norm": 3.171027898788452, "kl": 0.77176513671875, "learning_rate": 2.198204938742707e-06, "loss": 0.0309, "reward": 1.20234375, "reward_std": 0.25977810826152564, "rewards/accuracy_reward": 0.246875, "rewards/format_reward": 0.95546875, "step": 1825 }, { "completion_length": 135.51484375, "epoch": 0.8083708345987135, "grad_norm": 1.4249086380004883, "kl": 0.80992431640625, "learning_rate": 2.150175300662862e-06, "loss": 0.0324, "reward": 1.18671875, "reward_std": 0.24862184505909682, "rewards/accuracy_reward": 0.228125, "rewards/format_reward": 0.95859375, "step": 1830 }, { "completion_length": 151.6921875, "epoch": 0.8105794980812236, "grad_norm": 1.0902801752090454, "kl": 0.86148681640625, "learning_rate": 2.1026129047299436e-06, "loss": 0.0345, "reward": 1.2, "reward_std": 0.260273445956409, "rewards/accuracy_reward": 0.25703125, "rewards/format_reward": 0.94296875, "step": 1835 }, { "completion_length": 140.75859375, "epoch": 0.8127881615637338, "grad_norm": 0.5658231973648071, "kl": 0.635693359375, "learning_rate": 2.055520581982463e-06, "loss": 0.0254, "reward": 1.26171875, "reward_std": 0.26789135448634627, "rewards/accuracy_reward": 0.30546875, "rewards/format_reward": 0.95625, "step": 1840 }, { "completion_length": 108.0703125, "epoch": 0.8149968250462439, "grad_norm": 0.7409191727638245, "kl": 0.51619873046875, "learning_rate": 2.0089011354789357e-06, "loss": 0.0206, "reward": 1.2359375, "reward_std": 0.21879921518266202, "rewards/accuracy_reward": 0.26171875, "rewards/format_reward": 0.97421875, "step": 1845 }, { "completion_length": 131.575, "epoch": 0.8172054885287541, "grad_norm": 0.933627724647522, "kl": 0.5877197265625, "learning_rate": 1.9627573401310452e-06, "loss": 0.0235, "reward": 1.23203125, "reward_std": 0.26072712801396847, "rewards/accuracy_reward": 0.275, "rewards/format_reward": 0.95703125, "step": 1850 }, { "completion_length": 116.653125, "epoch": 0.8194141520112642, "grad_norm": 0.5518223643302917, "kl": 0.523291015625, "learning_rate": 1.9170919425384695e-06, "loss": 0.0209, "reward": 1.296875, "reward_std": 0.23366234563291072, "rewards/accuracy_reward": 0.32265625, "rewards/format_reward": 0.97421875, "step": 1855 }, { "completion_length": 121.140625, "epoch": 0.8216228154937744, "grad_norm": 0.5761392712593079, "kl": 0.49658203125, "learning_rate": 1.8719076608254028e-06, "loss": 0.0199, "reward": 1.225, "reward_std": 0.21435881238430737, "rewards/accuracy_reward": 0.25390625, "rewards/format_reward": 0.97109375, "step": 1860 }, { "completion_length": 136.73359375, "epoch": 0.8238314789762845, "grad_norm": 0.7133124470710754, "kl": 0.52952880859375, "learning_rate": 1.8272071844787575e-06, "loss": 0.0212, "reward": 1.1703125, "reward_std": 0.2595834471285343, "rewards/accuracy_reward": 0.209375, "rewards/format_reward": 0.9609375, "step": 1865 }, { "completion_length": 141.15390625, "epoch": 0.8260401424587946, "grad_norm": 0.9754716157913208, "kl": 0.5518310546875, "learning_rate": 1.7829931741880802e-06, "loss": 0.0221, "reward": 1.2625, "reward_std": 0.28494130074977875, "rewards/accuracy_reward": 0.31328125, "rewards/format_reward": 0.94921875, "step": 1870 }, { "completion_length": 123.50234375, "epoch": 0.8282488059413048, "grad_norm": 0.912441611289978, "kl": 0.601611328125, "learning_rate": 1.7392682616871836e-06, "loss": 0.0241, "reward": 1.225, "reward_std": 0.2650785157456994, "rewards/accuracy_reward": 0.2609375, "rewards/format_reward": 0.9640625, "step": 1875 }, { "completion_length": 116.25625, "epoch": 0.8304574694238149, "grad_norm": 0.719677209854126, "kl": 0.52056884765625, "learning_rate": 1.696035049597503e-06, "loss": 0.0208, "reward": 1.24453125, "reward_std": 0.22317184396088124, "rewards/accuracy_reward": 0.26796875, "rewards/format_reward": 0.9765625, "step": 1880 }, { "completion_length": 118.2171875, "epoch": 0.8326661329063251, "grad_norm": 0.5467638969421387, "kl": 0.5210205078125, "learning_rate": 1.6532961112731672e-06, "loss": 0.0208, "reward": 1.23828125, "reward_std": 0.19823672361671923, "rewards/accuracy_reward": 0.26171875, "rewards/format_reward": 0.9765625, "step": 1885 }, { "completion_length": 113.9359375, "epoch": 0.8348747963888352, "grad_norm": 0.6019476652145386, "kl": 0.444091796875, "learning_rate": 1.6110539906478463e-06, "loss": 0.0178, "reward": 1.3015625, "reward_std": 0.2479660578072071, "rewards/accuracy_reward": 0.32421875, "rewards/format_reward": 0.97734375, "step": 1890 }, { "completion_length": 122.75546875, "epoch": 0.8370834598713454, "grad_norm": 0.9671128988265991, "kl": 0.5489501953125, "learning_rate": 1.5693112020833012e-06, "loss": 0.022, "reward": 1.2703125, "reward_std": 0.2313979933038354, "rewards/accuracy_reward": 0.296875, "rewards/format_reward": 0.9734375, "step": 1895 }, { "completion_length": 118.28046875, "epoch": 0.8392921233538555, "grad_norm": 0.9875019788742065, "kl": 0.4894287109375, "learning_rate": 1.528070230219756e-06, "loss": 0.0196, "reward": 1.2171875, "reward_std": 0.21601561345160009, "rewards/accuracy_reward": 0.24140625, "rewards/format_reward": 0.97578125, "step": 1900 }, { "epoch": 0.8392921233538555, "eval_completion_length": 123.54375, "eval_kl": 0.558125, "eval_loss": 0.022584721446037292, "eval_reward": 1.295, "eval_reward_std": 0.21910995721817017, "eval_rewards/accuracy_reward": 0.32416666686534884, "eval_rewards/format_reward": 0.9708333349227906, "eval_runtime": 124.5976, "eval_samples_per_second": 0.795, "eval_steps_per_second": 0.032, "step": 1900 }, { "completion_length": 136.87578125, "epoch": 0.8415007868363656, "grad_norm": 0.6465866565704346, "kl": 0.58070068359375, "learning_rate": 1.4873335298279801e-06, "loss": 0.0232, "reward": 1.27265625, "reward_std": 0.22537416350096465, "rewards/accuracy_reward": 0.3078125, "rewards/format_reward": 0.96484375, "step": 1905 }, { "completion_length": 152.965625, "epoch": 0.8437094503188758, "grad_norm": 0.5781317353248596, "kl": 0.5817138671875, "learning_rate": 1.447103525663186e-06, "loss": 0.0233, "reward": 1.20234375, "reward_std": 0.23281266931444405, "rewards/accuracy_reward": 0.24375, "rewards/format_reward": 0.95859375, "step": 1910 }, { "completion_length": 153.8953125, "epoch": 0.8459181138013859, "grad_norm": 0.6810880303382874, "kl": 0.59539794921875, "learning_rate": 1.4073826123206946e-06, "loss": 0.0238, "reward": 1.259375, "reward_std": 0.2544757820665836, "rewards/accuracy_reward": 0.29609375, "rewards/format_reward": 0.96328125, "step": 1915 }, { "completion_length": 143.8140625, "epoch": 0.8481267772838961, "grad_norm": 0.6009095311164856, "kl": 0.5434326171875, "learning_rate": 1.368173154093414e-06, "loss": 0.0217, "reward": 1.2453125, "reward_std": 0.2513797411695123, "rewards/accuracy_reward": 0.2828125, "rewards/format_reward": 0.9625, "step": 1920 }, { "completion_length": 159.8171875, "epoch": 0.8503354407664062, "grad_norm": 0.6377544403076172, "kl": 0.572900390625, "learning_rate": 1.3294774848310954e-06, "loss": 0.0229, "reward": 1.23046875, "reward_std": 0.27456119302660226, "rewards/accuracy_reward": 0.27421875, "rewards/format_reward": 0.95625, "step": 1925 }, { "completion_length": 155.94453125, "epoch": 0.8525441042489164, "grad_norm": 0.5800438523292542, "kl": 0.6190185546875, "learning_rate": 1.2912979078014242e-06, "loss": 0.0248, "reward": 1.20703125, "reward_std": 0.23998625949025154, "rewards/accuracy_reward": 0.253125, "rewards/format_reward": 0.95390625, "step": 1930 }, { "completion_length": 159.3046875, "epoch": 0.8547527677314265, "grad_norm": 0.9404852986335754, "kl": 0.56507568359375, "learning_rate": 1.253636695552931e-06, "loss": 0.0226, "reward": 1.259375, "reward_std": 0.2761839430779219, "rewards/accuracy_reward": 0.30703125, "rewards/format_reward": 0.95234375, "step": 1935 }, { "completion_length": 154.96953125, "epoch": 0.8569614312139366, "grad_norm": 0.9105063676834106, "kl": 0.64293212890625, "learning_rate": 1.216496089779703e-06, "loss": 0.0257, "reward": 1.22578125, "reward_std": 0.27103531677275894, "rewards/accuracy_reward": 0.271875, "rewards/format_reward": 0.95390625, "step": 1940 }, { "completion_length": 147.2546875, "epoch": 0.8591700946964468, "grad_norm": 0.6244191527366638, "kl": 0.49593505859375, "learning_rate": 1.1798783011879766e-06, "loss": 0.0198, "reward": 1.26484375, "reward_std": 0.2632339050993323, "rewards/accuracy_reward": 0.30546875, "rewards/format_reward": 0.959375, "step": 1945 }, { "completion_length": 150.15234375, "epoch": 0.8613787581789569, "grad_norm": 0.9317097663879395, "kl": 0.51068115234375, "learning_rate": 1.14378550936453e-06, "loss": 0.0204, "reward": 1.2453125, "reward_std": 0.2504005776718259, "rewards/accuracy_reward": 0.28828125, "rewards/format_reward": 0.95703125, "step": 1950 }, { "completion_length": 136.7046875, "epoch": 0.8635874216614671, "grad_norm": 0.5654709935188293, "kl": 0.496484375, "learning_rate": 1.1082198626469687e-06, "loss": 0.0199, "reward": 1.22421875, "reward_std": 0.23155678305774927, "rewards/accuracy_reward": 0.25625, "rewards/format_reward": 0.96796875, "step": 1955 }, { "completion_length": 135.80859375, "epoch": 0.8657960851439772, "grad_norm": 0.4000113904476166, "kl": 0.499267578125, "learning_rate": 1.0731834779958217e-06, "loss": 0.02, "reward": 1.253125, "reward_std": 0.2156506871804595, "rewards/accuracy_reward": 0.290625, "rewards/format_reward": 0.9625, "step": 1960 }, { "completion_length": 140.81640625, "epoch": 0.8680047486264874, "grad_norm": 0.41182902455329895, "kl": 0.44923095703125, "learning_rate": 1.0386784408685713e-06, "loss": 0.018, "reward": 1.1765625, "reward_std": 0.21103496849536896, "rewards/accuracy_reward": 0.2125, "rewards/format_reward": 0.9640625, "step": 1965 }, { "completion_length": 144.734375, "epoch": 0.8702134121089975, "grad_norm": 0.6124417781829834, "kl": 0.5717529296875, "learning_rate": 1.0047068050954868e-06, "loss": 0.0229, "reward": 1.19140625, "reward_std": 0.2501412840560079, "rewards/accuracy_reward": 0.2359375, "rewards/format_reward": 0.95546875, "step": 1970 }, { "completion_length": 137.81015625, "epoch": 0.8724220755915076, "grad_norm": 0.7430810928344727, "kl": 0.51724853515625, "learning_rate": 9.71270592757404e-07, "loss": 0.0207, "reward": 1.25234375, "reward_std": 0.2686174543574452, "rewards/accuracy_reward": 0.28359375, "rewards/format_reward": 0.96875, "step": 1975 }, { "completion_length": 117.8140625, "epoch": 0.8746307390740178, "grad_norm": 0.48936066031455994, "kl": 0.59530029296875, "learning_rate": 9.38371794065337e-07, "loss": 0.0238, "reward": 1.2453125, "reward_std": 0.21264754123985768, "rewards/accuracy_reward": 0.27109375, "rewards/format_reward": 0.97421875, "step": 1980 }, { "completion_length": 136.915625, "epoch": 0.876839402556528, "grad_norm": 0.5388721823692322, "kl": 0.4608642578125, "learning_rate": 9.060123672420451e-07, "loss": 0.0184, "reward": 1.228125, "reward_std": 0.22003105469048023, "rewards/accuracy_reward": 0.25859375, "rewards/format_reward": 0.96953125, "step": 1985 }, { "completion_length": 123.7546875, "epoch": 0.8790480660390382, "grad_norm": 0.7182089686393738, "kl": 0.4820068359375, "learning_rate": 8.741942384054481e-07, "loss": 0.0193, "reward": 1.2609375, "reward_std": 0.23840244263410568, "rewards/accuracy_reward": 0.28828125, "rewards/format_reward": 0.97265625, "step": 1990 }, { "completion_length": 124.53359375, "epoch": 0.8812567295215483, "grad_norm": 0.524861752986908, "kl": 0.452880859375, "learning_rate": 8.429193014540015e-07, "loss": 0.0181, "reward": 1.253125, "reward_std": 0.2055924626067281, "rewards/accuracy_reward": 0.275, "rewards/format_reward": 0.978125, "step": 1995 }, { "completion_length": 130.221875, "epoch": 0.8834653930040585, "grad_norm": 0.49339717626571655, "kl": 0.484423828125, "learning_rate": 8.121894179539469e-07, "loss": 0.0194, "reward": 1.225, "reward_std": 0.22272255159914495, "rewards/accuracy_reward": 0.25625, "rewards/format_reward": 0.96875, "step": 2000 }, { "epoch": 0.8834653930040585, "eval_completion_length": 142.34375, "eval_kl": 0.450234375, "eval_loss": 0.018087182193994522, "eval_reward": 1.279166669845581, "eval_reward_std": 0.2484509229660034, "eval_rewards/accuracy_reward": 0.31458333373069763, "eval_rewards/format_reward": 0.9645833349227906, "eval_runtime": 145.7929, "eval_samples_per_second": 0.679, "eval_steps_per_second": 0.027, "step": 2000 }, { "completion_length": 130.49140625, "epoch": 0.8856740564865686, "grad_norm": 0.5030075907707214, "kl": 0.4587646484375, "learning_rate": 7.82006417028518e-07, "loss": 0.0183, "reward": 1.2390625, "reward_std": 0.20883522126823664, "rewards/accuracy_reward": 0.26015625, "rewards/format_reward": 0.97890625, "step": 2005 }, { "completion_length": 145.6078125, "epoch": 0.8878827199690787, "grad_norm": 0.5313246250152588, "kl": 0.45802001953125, "learning_rate": 7.523720952490631e-07, "loss": 0.0183, "reward": 1.2578125, "reward_std": 0.2512391902506351, "rewards/accuracy_reward": 0.28828125, "rewards/format_reward": 0.96953125, "step": 2010 }, { "completion_length": 157.45546875, "epoch": 0.8900913834515889, "grad_norm": 0.5215573310852051, "kl": 0.49078369140625, "learning_rate": 7.232882165281141e-07, "loss": 0.0196, "reward": 1.1890625, "reward_std": 0.22680971212685108, "rewards/accuracy_reward": 0.228125, "rewards/format_reward": 0.9609375, "step": 2015 }, { "completion_length": 146.7203125, "epoch": 0.892300046934099, "grad_norm": 0.6293109059333801, "kl": 0.49932861328125, "learning_rate": 6.947565120143828e-07, "loss": 0.02, "reward": 1.2375, "reward_std": 0.2244907196611166, "rewards/accuracy_reward": 0.27265625, "rewards/format_reward": 0.96484375, "step": 2020 }, { "completion_length": 152.1328125, "epoch": 0.8945087104166092, "grad_norm": 0.6269906759262085, "kl": 0.4867919921875, "learning_rate": 6.667786799897269e-07, "loss": 0.0195, "reward": 1.23828125, "reward_std": 0.2209881154820323, "rewards/accuracy_reward": 0.275, "rewards/format_reward": 0.96328125, "step": 2025 }, { "completion_length": 143.7203125, "epoch": 0.8967173738991193, "grad_norm": 0.4811406433582306, "kl": 0.45821533203125, "learning_rate": 6.393563857680596e-07, "loss": 0.0183, "reward": 1.26953125, "reward_std": 0.22661811783909797, "rewards/accuracy_reward": 0.3015625, "rewards/format_reward": 0.96796875, "step": 2030 }, { "completion_length": 145.00390625, "epoch": 0.8989260373816295, "grad_norm": 0.6537109017372131, "kl": 0.49169921875, "learning_rate": 6.124912615962341e-07, "loss": 0.0197, "reward": 1.24765625, "reward_std": 0.22691688518971204, "rewards/accuracy_reward": 0.2765625, "rewards/format_reward": 0.97109375, "step": 2035 }, { "completion_length": 159.30390625, "epoch": 0.9011347008641396, "grad_norm": 0.5987099409103394, "kl": 0.4700439453125, "learning_rate": 5.861849065568726e-07, "loss": 0.0188, "reward": 1.2921875, "reward_std": 0.26790456287562847, "rewards/accuracy_reward": 0.3328125, "rewards/format_reward": 0.959375, "step": 2040 }, { "completion_length": 157.98203125, "epoch": 0.9033433643466497, "grad_norm": 1.6861835718154907, "kl": 0.5018310546875, "learning_rate": 5.604388864732002e-07, "loss": 0.0201, "reward": 1.2171875, "reward_std": 0.23706249240785837, "rewards/accuracy_reward": 0.25703125, "rewards/format_reward": 0.96015625, "step": 2045 }, { "completion_length": 157.39921875, "epoch": 0.9055520278291599, "grad_norm": 0.623263955116272, "kl": 0.61142578125, "learning_rate": 5.352547338158309e-07, "loss": 0.0245, "reward": 1.19453125, "reward_std": 0.2746475737541914, "rewards/accuracy_reward": 0.23828125, "rewards/format_reward": 0.95625, "step": 2050 }, { "completion_length": 153.7953125, "epoch": 0.90776069131167, "grad_norm": 0.6804496049880981, "kl": 0.4615234375, "learning_rate": 5.106339476115596e-07, "loss": 0.0185, "reward": 1.23359375, "reward_std": 0.2757456684485078, "rewards/accuracy_reward": 0.271875, "rewards/format_reward": 0.96171875, "step": 2055 }, { "completion_length": 160.09140625, "epoch": 0.9099693547941802, "grad_norm": 0.8526637554168701, "kl": 0.48623046875, "learning_rate": 4.865779933541348e-07, "loss": 0.0194, "reward": 1.253125, "reward_std": 0.27613792307674884, "rewards/accuracy_reward": 0.30078125, "rewards/format_reward": 0.95234375, "step": 2060 }, { "completion_length": 161.521875, "epoch": 0.9121780182766903, "grad_norm": 0.6661585569381714, "kl": 0.49755859375, "learning_rate": 4.63088302917023e-07, "loss": 0.0199, "reward": 1.24375, "reward_std": 0.24981417022645475, "rewards/accuracy_reward": 0.2890625, "rewards/format_reward": 0.9546875, "step": 2065 }, { "completion_length": 157.80546875, "epoch": 0.9143866817592005, "grad_norm": 0.7502483129501343, "kl": 0.4491455078125, "learning_rate": 4.401662744681845e-07, "loss": 0.018, "reward": 1.2625, "reward_std": 0.23363354597240688, "rewards/accuracy_reward": 0.29765625, "rewards/format_reward": 0.96484375, "step": 2070 }, { "completion_length": 141.709375, "epoch": 0.9165953452417106, "grad_norm": 1.0072883367538452, "kl": 0.440625, "learning_rate": 4.1781327238684775e-07, "loss": 0.0176, "reward": 1.271875, "reward_std": 0.23114844579249622, "rewards/accuracy_reward": 0.29921875, "rewards/format_reward": 0.97265625, "step": 2075 }, { "completion_length": 143.68515625, "epoch": 0.9188040087242207, "grad_norm": 0.7586592435836792, "kl": 0.49891357421875, "learning_rate": 3.9603062718230667e-07, "loss": 0.02, "reward": 1.27734375, "reward_std": 0.2479051820933819, "rewards/accuracy_reward": 0.30703125, "rewards/format_reward": 0.9703125, "step": 2080 }, { "completion_length": 163.2359375, "epoch": 0.9210126722067309, "grad_norm": 0.7740212678909302, "kl": 0.4748779296875, "learning_rate": 3.748196354147127e-07, "loss": 0.019, "reward": 1.253125, "reward_std": 0.2910253481939435, "rewards/accuracy_reward": 0.3046875, "rewards/format_reward": 0.9484375, "step": 2085 }, { "completion_length": 161.70703125, "epoch": 0.923221335689241, "grad_norm": 0.5001750588417053, "kl": 0.51661376953125, "learning_rate": 3.5418155961790546e-07, "loss": 0.0207, "reward": 1.18984375, "reward_std": 0.24765556119382381, "rewards/accuracy_reward": 0.23984375, "rewards/format_reward": 0.95, "step": 2090 }, { "completion_length": 149.140625, "epoch": 0.9254299991717512, "grad_norm": 0.6595699787139893, "kl": 0.459619140625, "learning_rate": 3.341176282242653e-07, "loss": 0.0184, "reward": 1.25546875, "reward_std": 0.23822309002280234, "rewards/accuracy_reward": 0.29140625, "rewards/format_reward": 0.9640625, "step": 2095 }, { "completion_length": 152.50390625, "epoch": 0.9276386626542613, "grad_norm": 1.5634217262268066, "kl": 0.5083740234375, "learning_rate": 3.1462903549159484e-07, "loss": 0.0203, "reward": 1.2578125, "reward_std": 0.2591968797147274, "rewards/accuracy_reward": 0.3015625, "rewards/format_reward": 0.95625, "step": 2100 }, { "epoch": 0.9276386626542613, "eval_completion_length": 141.9604168701172, "eval_kl": 0.46765625, "eval_loss": 0.01848418451845646, "eval_reward": 1.28125, "eval_reward_std": 0.24666063576936723, "eval_rewards/accuracy_reward": 0.31916666686534884, "eval_rewards/format_reward": 0.9620833349227905, "eval_runtime": 156.3759, "eval_samples_per_second": 0.633, "eval_steps_per_second": 0.026, "step": 2100 }, { "completion_length": 148.68515625, "epoch": 0.9298473261367715, "grad_norm": 0.7756811380386353, "kl": 0.4727783203125, "learning_rate": 2.9571694143202934e-07, "loss": 0.0189, "reward": 1.2359375, "reward_std": 0.2811011435464025, "rewards/accuracy_reward": 0.27578125, "rewards/format_reward": 0.96015625, "step": 2105 }, { "completion_length": 170.4796875, "epoch": 0.9320559896192816, "grad_norm": 0.5152100324630737, "kl": 0.523291015625, "learning_rate": 2.773824717429907e-07, "loss": 0.0209, "reward": 1.18046875, "reward_std": 0.28240158669650556, "rewards/accuracy_reward": 0.2328125, "rewards/format_reward": 0.94765625, "step": 2110 }, { "completion_length": 138.71171875, "epoch": 0.9342646531017917, "grad_norm": 0.6858031153678894, "kl": 0.481396484375, "learning_rate": 2.5962671774018234e-07, "loss": 0.0193, "reward": 1.2265625, "reward_std": 0.2497631970793009, "rewards/accuracy_reward": 0.2625, "rewards/format_reward": 0.9640625, "step": 2115 }, { "completion_length": 150.840625, "epoch": 0.9364733165843019, "grad_norm": 0.7537331581115723, "kl": 0.49796142578125, "learning_rate": 2.424507362926376e-07, "loss": 0.0199, "reward": 1.2484375, "reward_std": 0.2752187229692936, "rewards/accuracy_reward": 0.2875, "rewards/format_reward": 0.9609375, "step": 2120 }, { "completion_length": 145.603125, "epoch": 0.938681980066812, "grad_norm": 0.854280412197113, "kl": 0.453173828125, "learning_rate": 2.2585554975980252e-07, "loss": 0.0181, "reward": 1.240625, "reward_std": 0.24026636723428965, "rewards/accuracy_reward": 0.278125, "rewards/format_reward": 0.9625, "step": 2125 }, { "completion_length": 146.646875, "epoch": 0.9408906435493222, "grad_norm": 0.6566202640533447, "kl": 0.47532958984375, "learning_rate": 2.0984214593069318e-07, "loss": 0.019, "reward": 1.2546875, "reward_std": 0.2486549686640501, "rewards/accuracy_reward": 0.28671875, "rewards/format_reward": 0.96796875, "step": 2130 }, { "completion_length": 134.93125, "epoch": 0.9430993070318323, "grad_norm": 0.6418664455413818, "kl": 0.70113525390625, "learning_rate": 1.9441147796508408e-07, "loss": 0.028, "reward": 1.28046875, "reward_std": 0.270217102766037, "rewards/accuracy_reward": 0.3140625, "rewards/format_reward": 0.96640625, "step": 2135 }, { "completion_length": 153.1296875, "epoch": 0.9453079705143425, "grad_norm": 0.5165784358978271, "kl": 0.51217041015625, "learning_rate": 1.795644643367922e-07, "loss": 0.0205, "reward": 1.23984375, "reward_std": 0.2561708649620414, "rewards/accuracy_reward": 0.28125, "rewards/format_reward": 0.95859375, "step": 2140 }, { "completion_length": 143.43046875, "epoch": 0.9475166339968527, "grad_norm": 0.6096455454826355, "kl": 0.474267578125, "learning_rate": 1.6530198877899417e-07, "loss": 0.019, "reward": 1.265625, "reward_std": 0.24631664287298918, "rewards/accuracy_reward": 0.296875, "rewards/format_reward": 0.96875, "step": 2145 }, { "completion_length": 149.7796875, "epoch": 0.9497252974793629, "grad_norm": 0.6443772912025452, "kl": 0.46708984375, "learning_rate": 1.5162490023163057e-07, "loss": 0.0187, "reward": 1.23828125, "reward_std": 0.23290605265647174, "rewards/accuracy_reward": 0.275, "rewards/format_reward": 0.96328125, "step": 2150 }, { "completion_length": 152.20546875, "epoch": 0.951933960961873, "grad_norm": 0.7591469883918762, "kl": 0.5279296875, "learning_rate": 1.3853401279086853e-07, "loss": 0.0211, "reward": 1.20859375, "reward_std": 0.23095191065222026, "rewards/accuracy_reward": 0.24921875, "rewards/format_reward": 0.959375, "step": 2155 }, { "completion_length": 143.9375, "epoch": 0.9541426244443831, "grad_norm": 0.4750344753265381, "kl": 0.47156982421875, "learning_rate": 1.2603010566065055e-07, "loss": 0.0189, "reward": 1.240625, "reward_std": 0.2516822377219796, "rewards/accuracy_reward": 0.27734375, "rewards/format_reward": 0.96328125, "step": 2160 }, { "completion_length": 147.12578125, "epoch": 0.9563512879268933, "grad_norm": 0.6196463704109192, "kl": 0.50130615234375, "learning_rate": 1.1411392310631153e-07, "loss": 0.0201, "reward": 1.2375, "reward_std": 0.2511793440207839, "rewards/accuracy_reward": 0.28125, "rewards/format_reward": 0.95625, "step": 2165 }, { "completion_length": 130.6046875, "epoch": 0.9585599514094034, "grad_norm": 0.519320547580719, "kl": 0.55948486328125, "learning_rate": 1.0278617441028205e-07, "loss": 0.0224, "reward": 1.2484375, "reward_std": 0.24660416580736638, "rewards/accuracy_reward": 0.28125, "rewards/format_reward": 0.9671875, "step": 2170 }, { "completion_length": 139.35078125, "epoch": 0.9607686148919136, "grad_norm": 0.5927404761314392, "kl": 0.45987548828125, "learning_rate": 9.204753382986097e-08, "loss": 0.0184, "reward": 1.240625, "reward_std": 0.19637434519827365, "rewards/accuracy_reward": 0.27265625, "rewards/format_reward": 0.96796875, "step": 2175 }, { "completion_length": 139.8921875, "epoch": 0.9629772783744237, "grad_norm": 0.4677460491657257, "kl": 0.47265625, "learning_rate": 8.189864055709206e-08, "loss": 0.0189, "reward": 1.2296875, "reward_std": 0.2277947474271059, "rewards/accuracy_reward": 0.2625, "rewards/format_reward": 0.9671875, "step": 2180 }, { "completion_length": 143.153125, "epoch": 0.9651859418569338, "grad_norm": 0.9342114925384521, "kl": 0.5101318359375, "learning_rate": 7.23400986807099e-08, "loss": 0.0204, "reward": 1.25078125, "reward_std": 0.26373773720115423, "rewards/accuracy_reward": 0.28984375, "rewards/format_reward": 0.9609375, "step": 2185 }, { "completion_length": 148.3703125, "epoch": 0.967394605339444, "grad_norm": 0.46142810583114624, "kl": 0.48370361328125, "learning_rate": 6.337247715018869e-08, "loss": 0.0194, "reward": 1.2203125, "reward_std": 0.23893490042537452, "rewards/accuracy_reward": 0.2578125, "rewards/format_reward": 0.9625, "step": 2190 }, { "completion_length": 139.8171875, "epoch": 0.9696032688219541, "grad_norm": 0.5866816639900208, "kl": 0.4761962890625, "learning_rate": 5.4996309741873755e-08, "loss": 0.019, "reward": 1.275, "reward_std": 0.23191295862197875, "rewards/accuracy_reward": 0.30546875, "rewards/format_reward": 0.96953125, "step": 2195 }, { "completion_length": 150.57109375, "epoch": 0.9718119323044643, "grad_norm": 0.540817141532898, "kl": 0.44088134765625, "learning_rate": 4.7212095027209246e-08, "loss": 0.0176, "reward": 1.29765625, "reward_std": 0.2451239839196205, "rewards/accuracy_reward": 0.33046875, "rewards/format_reward": 0.9671875, "step": 2200 }, { "epoch": 0.9718119323044643, "eval_completion_length": 168.58166748046875, "eval_kl": 0.486328125, "eval_loss": 0.019449135288596153, "eval_reward": 1.2825, "eval_reward_std": 0.28413535237312315, "eval_rewards/accuracy_reward": 0.3333333337306976, "eval_rewards/format_reward": 0.9491666674613952, "eval_runtime": 159.9487, "eval_samples_per_second": 0.619, "eval_steps_per_second": 0.025, "step": 2200 }, { "completion_length": 153.36015625, "epoch": 0.9740205957869744, "grad_norm": 0.4312755763530731, "kl": 0.52962646484375, "learning_rate": 4.0020296343065144e-08, "loss": 0.0212, "reward": 1.2046875, "reward_std": 0.2571037333458662, "rewards/accuracy_reward": 0.24609375, "rewards/format_reward": 0.95859375, "step": 2205 }, { "completion_length": 146.95, "epoch": 0.9762292592694846, "grad_norm": 0.8880886435508728, "kl": 0.478515625, "learning_rate": 3.3421341764152684e-08, "loss": 0.0191, "reward": 1.23359375, "reward_std": 0.2560200056061149, "rewards/accuracy_reward": 0.27109375, "rewards/format_reward": 0.9625, "step": 2210 }, { "completion_length": 144.825, "epoch": 0.9784379227519947, "grad_norm": 0.788774311542511, "kl": 0.512060546875, "learning_rate": 2.7415624077551383e-08, "loss": 0.0205, "reward": 1.2359375, "reward_std": 0.26023210752755405, "rewards/accuracy_reward": 0.2765625, "rewards/format_reward": 0.959375, "step": 2215 }, { "completion_length": 133.7578125, "epoch": 0.9806465862345048, "grad_norm": 0.5918937921524048, "kl": 0.51031494140625, "learning_rate": 2.2003500759322228e-08, "loss": 0.0204, "reward": 1.2203125, "reward_std": 0.24810067620128393, "rewards/accuracy_reward": 0.2515625, "rewards/format_reward": 0.96875, "step": 2220 }, { "completion_length": 131.171875, "epoch": 0.982855249717015, "grad_norm": 0.4942164421081543, "kl": 0.49373779296875, "learning_rate": 1.718529395323687e-08, "loss": 0.0198, "reward": 1.234375, "reward_std": 0.22725256606936456, "rewards/accuracy_reward": 0.265625, "rewards/format_reward": 0.96875, "step": 2225 }, { "completion_length": 151.55703125, "epoch": 0.9850639131995251, "grad_norm": 0.4727592468261719, "kl": 0.46812744140625, "learning_rate": 1.2961290451594111e-08, "loss": 0.0187, "reward": 1.2390625, "reward_std": 0.2715959116816521, "rewards/accuracy_reward": 0.2796875, "rewards/format_reward": 0.959375, "step": 2230 }, { "completion_length": 149.5328125, "epoch": 0.9872725766820353, "grad_norm": 0.7650235891342163, "kl": 0.46435546875, "learning_rate": 9.3317416781602e-09, "loss": 0.0186, "reward": 1.26875, "reward_std": 0.2793737856671214, "rewards/accuracy_reward": 0.3109375, "rewards/format_reward": 0.9578125, "step": 2235 }, { "completion_length": 148.59296875, "epoch": 0.9894812401645454, "grad_norm": 0.9975623488426208, "kl": 0.50648193359375, "learning_rate": 6.296863673191933e-09, "loss": 0.0203, "reward": 1.26796875, "reward_std": 0.270522028952837, "rewards/accuracy_reward": 0.3109375, "rewards/format_reward": 0.95703125, "step": 2240 }, { "completion_length": 148.49765625, "epoch": 0.9916899036470556, "grad_norm": 0.8689573407173157, "kl": 0.4764892578125, "learning_rate": 3.856837080585818e-09, "loss": 0.0191, "reward": 1.25, "reward_std": 0.23697545174509288, "rewards/accuracy_reward": 0.2828125, "rewards/format_reward": 0.9671875, "step": 2245 }, { "completion_length": 149.41484375, "epoch": 0.9938985671295657, "grad_norm": 0.9692167639732361, "kl": 0.488427734375, "learning_rate": 2.0118071371211244e-09, "loss": 0.0195, "reward": 1.2078125, "reward_std": 0.25148440394550564, "rewards/accuracy_reward": 0.2484375, "rewards/format_reward": 0.959375, "step": 2250 }, { "completion_length": 139.27109375, "epoch": 0.9961072306120758, "grad_norm": 13.9436674118042, "kl": 0.557568359375, "learning_rate": 7.618836638190186e-10, "loss": 0.0223, "reward": 1.26953125, "reward_std": 0.2697257066145539, "rewards/accuracy_reward": 0.30625, "rewards/format_reward": 0.96328125, "step": 2255 }, { "completion_length": 156.4703125, "epoch": 0.998315894094586, "grad_norm": 0.910273015499115, "kl": 0.52550048828125, "learning_rate": 1.0714105940001773e-10, "loss": 0.021, "reward": 1.190625, "reward_std": 0.2517782935872674, "rewards/accuracy_reward": 0.23984375, "rewards/format_reward": 0.95078125, "step": 2260 }, { "completion_length": 164.65104166666666, "epoch": 0.9996410921840921, "kl": 0.5259602864583334, "reward": 1.20703125, "reward_std": 0.2667766287922859, "rewards/accuracy_reward": 0.2578125, "rewards/format_reward": 0.94921875, "step": 2263, "total_flos": 0.0, "train_loss": 2.13883109888834, "train_runtime": 166892.9358, "train_samples_per_second": 0.434, "train_steps_per_second": 0.014 } ], "logging_steps": 5, "max_steps": 2263, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": false, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 16, "trial_name": null, "trial_params": null }