| { | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 0.9996410921840921, | |
| "eval_steps": 100, | |
| "global_step": 2263, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "completion_length": 400.17890625, | |
| "epoch": 0.002208663482510146, | |
| "grad_norm": 1.4726563692092896, | |
| "kl": 0.0001227259635925293, | |
| "learning_rate": 4.405286343612335e-07, | |
| "loss": 0.0, | |
| "reward": 0.57421875, | |
| "reward_std": 0.458265589363873, | |
| "rewards/accuracy_reward": 0.15078125, | |
| "rewards/format_reward": 0.4234375, | |
| "step": 5 | |
| }, | |
| { | |
| "completion_length": 379.875, | |
| "epoch": 0.004417326965020292, | |
| "grad_norm": 1.1294347047805786, | |
| "kl": 0.00025554299354553224, | |
| "learning_rate": 8.81057268722467e-07, | |
| "loss": 0.0, | |
| "reward": 0.6515625, | |
| "reward_std": 0.4448237407952547, | |
| "rewards/accuracy_reward": 0.146875, | |
| "rewards/format_reward": 0.5046875, | |
| "step": 10 | |
| }, | |
| { | |
| "completion_length": 349.56640625, | |
| "epoch": 0.006625990447530438, | |
| "grad_norm": 0.7087352871894836, | |
| "kl": 0.003926074504852295, | |
| "learning_rate": 1.3215859030837006e-06, | |
| "loss": 0.0002, | |
| "reward": 0.7, | |
| "reward_std": 0.4231703171506524, | |
| "rewards/accuracy_reward": 0.1078125, | |
| "rewards/format_reward": 0.5921875, | |
| "step": 15 | |
| }, | |
| { | |
| "completion_length": 259.32421875, | |
| "epoch": 0.008834653930040584, | |
| "grad_norm": 6.263583660125732, | |
| "kl": 0.08897933959960938, | |
| "learning_rate": 1.762114537444934e-06, | |
| "loss": 0.0036, | |
| "reward": 0.9296875, | |
| "reward_std": 0.34432896580547095, | |
| "rewards/accuracy_reward": 0.09453125, | |
| "rewards/format_reward": 0.83515625, | |
| "step": 20 | |
| }, | |
| { | |
| "completion_length": 258.88203125, | |
| "epoch": 0.01104331741255073, | |
| "grad_norm": 0.662657618522644, | |
| "kl": 0.041219329833984374, | |
| "learning_rate": 2.2026431718061673e-06, | |
| "loss": 0.0016, | |
| "reward": 0.93359375, | |
| "reward_std": 0.3294501030817628, | |
| "rewards/accuracy_reward": 0.08828125, | |
| "rewards/format_reward": 0.8453125, | |
| "step": 25 | |
| }, | |
| { | |
| "completion_length": 283.37578125, | |
| "epoch": 0.013251980895060876, | |
| "grad_norm": 0.8266062140464783, | |
| "kl": 0.03238487243652344, | |
| "learning_rate": 2.643171806167401e-06, | |
| "loss": 0.0013, | |
| "reward": 0.88671875, | |
| "reward_std": 0.38558061737567184, | |
| "rewards/accuracy_reward": 0.1015625, | |
| "rewards/format_reward": 0.78515625, | |
| "step": 30 | |
| }, | |
| { | |
| "completion_length": 228.65625, | |
| "epoch": 0.015460644377571022, | |
| "grad_norm": 0.9381304383277893, | |
| "kl": 0.023354721069335938, | |
| "learning_rate": 3.0837004405286347e-06, | |
| "loss": 0.0009, | |
| "reward": 0.9203125, | |
| "reward_std": 0.34575226698070766, | |
| "rewards/accuracy_reward": 0.09296875, | |
| "rewards/format_reward": 0.82734375, | |
| "step": 35 | |
| }, | |
| { | |
| "completion_length": 175.81953125, | |
| "epoch": 0.017669307860081168, | |
| "grad_norm": 0.654951274394989, | |
| "kl": 0.04830093383789062, | |
| "learning_rate": 3.524229074889868e-06, | |
| "loss": 0.0019, | |
| "reward": 0.99765625, | |
| "reward_std": 0.3096940713003278, | |
| "rewards/accuracy_reward": 0.11015625, | |
| "rewards/format_reward": 0.8875, | |
| "step": 40 | |
| }, | |
| { | |
| "completion_length": 179.19765625, | |
| "epoch": 0.019877971342591314, | |
| "grad_norm": 0.46008017659187317, | |
| "kl": 0.0401031494140625, | |
| "learning_rate": 3.964757709251102e-06, | |
| "loss": 0.0016, | |
| "reward": 1.0421875, | |
| "reward_std": 0.22858326323330402, | |
| "rewards/accuracy_reward": 0.09453125, | |
| "rewards/format_reward": 0.94765625, | |
| "step": 45 | |
| }, | |
| { | |
| "completion_length": 219.9734375, | |
| "epoch": 0.02208663482510146, | |
| "grad_norm": 0.6333373188972473, | |
| "kl": 0.05000381469726563, | |
| "learning_rate": 4.405286343612335e-06, | |
| "loss": 0.002, | |
| "reward": 1.11640625, | |
| "reward_std": 0.2830535739660263, | |
| "rewards/accuracy_reward": 0.16875, | |
| "rewards/format_reward": 0.94765625, | |
| "step": 50 | |
| }, | |
| { | |
| "completion_length": 206.340625, | |
| "epoch": 0.024295298307611606, | |
| "grad_norm": 0.444731205701828, | |
| "kl": 0.07784347534179688, | |
| "learning_rate": 4.8458149779735685e-06, | |
| "loss": 0.0031, | |
| "reward": 1.08671875, | |
| "reward_std": 0.24719008896499872, | |
| "rewards/accuracy_reward": 0.1296875, | |
| "rewards/format_reward": 0.95703125, | |
| "step": 55 | |
| }, | |
| { | |
| "completion_length": 217.25546875, | |
| "epoch": 0.026503961790121752, | |
| "grad_norm": 0.49777764081954956, | |
| "kl": 0.0484649658203125, | |
| "learning_rate": 5.286343612334802e-06, | |
| "loss": 0.0019, | |
| "reward": 1.096875, | |
| "reward_std": 0.2545401843264699, | |
| "rewards/accuracy_reward": 0.1390625, | |
| "rewards/format_reward": 0.9578125, | |
| "step": 60 | |
| }, | |
| { | |
| "completion_length": 206.72265625, | |
| "epoch": 0.028712625272631898, | |
| "grad_norm": 0.5592818260192871, | |
| "kl": 0.05699615478515625, | |
| "learning_rate": 5.7268722466960354e-06, | |
| "loss": 0.0023, | |
| "reward": 1.1234375, | |
| "reward_std": 0.25865183435380457, | |
| "rewards/accuracy_reward": 0.16640625, | |
| "rewards/format_reward": 0.95703125, | |
| "step": 65 | |
| }, | |
| { | |
| "completion_length": 178.8796875, | |
| "epoch": 0.030921288755142044, | |
| "grad_norm": 0.7485532164573669, | |
| "kl": 0.06690826416015624, | |
| "learning_rate": 6.167400881057269e-06, | |
| "loss": 0.0027, | |
| "reward": 1.15859375, | |
| "reward_std": 0.2730751080438495, | |
| "rewards/accuracy_reward": 0.1859375, | |
| "rewards/format_reward": 0.97265625, | |
| "step": 70 | |
| }, | |
| { | |
| "completion_length": 206.44375, | |
| "epoch": 0.033129952237652194, | |
| "grad_norm": 0.5321747064590454, | |
| "kl": 0.05711669921875, | |
| "learning_rate": 6.607929515418503e-06, | |
| "loss": 0.0023, | |
| "reward": 1.14140625, | |
| "reward_std": 0.2609355779364705, | |
| "rewards/accuracy_reward": 0.17578125, | |
| "rewards/format_reward": 0.965625, | |
| "step": 75 | |
| }, | |
| { | |
| "completion_length": 180.4796875, | |
| "epoch": 0.035338615720162336, | |
| "grad_norm": 0.7185708284378052, | |
| "kl": 0.079705810546875, | |
| "learning_rate": 7.048458149779736e-06, | |
| "loss": 0.0032, | |
| "reward": 1.20859375, | |
| "reward_std": 0.2991209041327238, | |
| "rewards/accuracy_reward": 0.240625, | |
| "rewards/format_reward": 0.96796875, | |
| "step": 80 | |
| }, | |
| { | |
| "completion_length": 205.42890625, | |
| "epoch": 0.037547279202672486, | |
| "grad_norm": 0.6930840611457825, | |
| "kl": 0.079119873046875, | |
| "learning_rate": 7.48898678414097e-06, | |
| "loss": 0.0032, | |
| "reward": 1.2125, | |
| "reward_std": 0.29966206308454274, | |
| "rewards/accuracy_reward": 0.24296875, | |
| "rewards/format_reward": 0.96953125, | |
| "step": 85 | |
| }, | |
| { | |
| "completion_length": 232.1125, | |
| "epoch": 0.03975594268518263, | |
| "grad_norm": 0.6082009673118591, | |
| "kl": 0.078369140625, | |
| "learning_rate": 7.929515418502203e-06, | |
| "loss": 0.0031, | |
| "reward": 1.2390625, | |
| "reward_std": 0.3367170764133334, | |
| "rewards/accuracy_reward": 0.2796875, | |
| "rewards/format_reward": 0.959375, | |
| "step": 90 | |
| }, | |
| { | |
| "completion_length": 234.2796875, | |
| "epoch": 0.04196460616769278, | |
| "grad_norm": 0.5688744783401489, | |
| "kl": 0.0821014404296875, | |
| "learning_rate": 8.370044052863436e-06, | |
| "loss": 0.0033, | |
| "reward": 1.2484375, | |
| "reward_std": 0.3449540941044688, | |
| "rewards/accuracy_reward": 0.28515625, | |
| "rewards/format_reward": 0.96328125, | |
| "step": 95 | |
| }, | |
| { | |
| "completion_length": 247.909375, | |
| "epoch": 0.04417326965020292, | |
| "grad_norm": 0.4896390438079834, | |
| "kl": 0.093121337890625, | |
| "learning_rate": 8.81057268722467e-06, | |
| "loss": 0.0037, | |
| "reward": 1.271875, | |
| "reward_std": 0.29160809628665446, | |
| "rewards/accuracy_reward": 0.2953125, | |
| "rewards/format_reward": 0.9765625, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.04417326965020292, | |
| "eval_completion_length": 264.7725, | |
| "eval_kl": 0.09, | |
| "eval_loss": 0.003620876930654049, | |
| "eval_reward": 1.2708333349227905, | |
| "eval_reward_std": 0.30468439966440203, | |
| "eval_rewards/accuracy_reward": 0.29583333343267443, | |
| "eval_rewards/format_reward": 0.975, | |
| "eval_runtime": 145.932, | |
| "eval_samples_per_second": 0.678, | |
| "eval_steps_per_second": 0.027, | |
| "step": 100 | |
| }, | |
| { | |
| "completion_length": 292.07734375, | |
| "epoch": 0.04638193313271307, | |
| "grad_norm": 0.5129627585411072, | |
| "kl": 0.099822998046875, | |
| "learning_rate": 9.251101321585904e-06, | |
| "loss": 0.004, | |
| "reward": 1.2453125, | |
| "reward_std": 0.3150330139324069, | |
| "rewards/accuracy_reward": 0.28671875, | |
| "rewards/format_reward": 0.95859375, | |
| "step": 105 | |
| }, | |
| { | |
| "completion_length": 264.43125, | |
| "epoch": 0.04859059661522321, | |
| "grad_norm": 0.604174017906189, | |
| "kl": 0.103936767578125, | |
| "learning_rate": 9.691629955947137e-06, | |
| "loss": 0.0042, | |
| "reward": 1.26015625, | |
| "reward_std": 0.26412205342203376, | |
| "rewards/accuracy_reward": 0.290625, | |
| "rewards/format_reward": 0.96953125, | |
| "step": 110 | |
| }, | |
| { | |
| "completion_length": 238.78203125, | |
| "epoch": 0.05079926009773336, | |
| "grad_norm": 0.5758931040763855, | |
| "kl": 5888.534802246094, | |
| "learning_rate": 1.0132158590308372e-05, | |
| "loss": 234.88, | |
| "reward": 1.30703125, | |
| "reward_std": 0.3428795490413904, | |
| "rewards/accuracy_reward": 0.35078125, | |
| "rewards/format_reward": 0.95625, | |
| "step": 115 | |
| }, | |
| { | |
| "completion_length": 227.2875, | |
| "epoch": 0.053007923580243504, | |
| "grad_norm": 0.6410739421844482, | |
| "kl": 0.21376953125, | |
| "learning_rate": 1.0572687224669605e-05, | |
| "loss": 0.0085, | |
| "reward": 1.2140625, | |
| "reward_std": 0.39241575095802544, | |
| "rewards/accuracy_reward": 0.29765625, | |
| "rewards/format_reward": 0.91640625, | |
| "step": 120 | |
| }, | |
| { | |
| "completion_length": 208.746875, | |
| "epoch": 0.055216587062753654, | |
| "grad_norm": 118.43889617919922, | |
| "kl": 9.3187255859375, | |
| "learning_rate": 1.1013215859030836e-05, | |
| "loss": 0.3719, | |
| "reward": 1.1359375, | |
| "reward_std": 0.34939223267138003, | |
| "rewards/accuracy_reward": 0.2296875, | |
| "rewards/format_reward": 0.90625, | |
| "step": 125 | |
| }, | |
| { | |
| "completion_length": 164.7515625, | |
| "epoch": 0.057425250545263797, | |
| "grad_norm": 0.6973806023597717, | |
| "kl": 1.569244384765625, | |
| "learning_rate": 1.1453744493392071e-05, | |
| "loss": 0.0628, | |
| "reward": 1.20390625, | |
| "reward_std": 0.3942227842286229, | |
| "rewards/accuracy_reward": 0.29140625, | |
| "rewards/format_reward": 0.9125, | |
| "step": 130 | |
| }, | |
| { | |
| "completion_length": 188.734375, | |
| "epoch": 0.059633914027773946, | |
| "grad_norm": 54.8037223815918, | |
| "kl": 9.47294921875, | |
| "learning_rate": 1.1894273127753304e-05, | |
| "loss": 0.3796, | |
| "reward": 1.20234375, | |
| "reward_std": 0.32416225373744967, | |
| "rewards/accuracy_reward": 0.27734375, | |
| "rewards/format_reward": 0.925, | |
| "step": 135 | |
| }, | |
| { | |
| "completion_length": 150.5953125, | |
| "epoch": 0.06184257751028409, | |
| "grad_norm": 33.95077133178711, | |
| "kl": 1.503021240234375, | |
| "learning_rate": 1.2334801762114539e-05, | |
| "loss": 0.0602, | |
| "reward": 1.23671875, | |
| "reward_std": 0.30887170899659394, | |
| "rewards/accuracy_reward": 0.28203125, | |
| "rewards/format_reward": 0.9546875, | |
| "step": 140 | |
| }, | |
| { | |
| "completion_length": 179.94453125, | |
| "epoch": 0.06405124099279423, | |
| "grad_norm": 2.577629804611206, | |
| "kl": 1.558148193359375, | |
| "learning_rate": 1.2775330396475772e-05, | |
| "loss": 0.0623, | |
| "reward": 1.11953125, | |
| "reward_std": 0.4326841413974762, | |
| "rewards/accuracy_reward": 0.27265625, | |
| "rewards/format_reward": 0.846875, | |
| "step": 145 | |
| }, | |
| { | |
| "completion_length": 218.13203125, | |
| "epoch": 0.06625990447530439, | |
| "grad_norm": 4.710011005401611, | |
| "kl": 2.9638153076171876, | |
| "learning_rate": 1.3215859030837006e-05, | |
| "loss": 0.1188, | |
| "reward": 1.196875, | |
| "reward_std": 0.4242498528212309, | |
| "rewards/accuracy_reward": 0.32421875, | |
| "rewards/format_reward": 0.87265625, | |
| "step": 150 | |
| }, | |
| { | |
| "completion_length": 283.83828125, | |
| "epoch": 0.06846856795781453, | |
| "grad_norm": 1.0956288576126099, | |
| "kl": 10.248779296875, | |
| "learning_rate": 1.3656387665198238e-05, | |
| "loss": 0.4114, | |
| "reward": 1.19453125, | |
| "reward_std": 0.4564665203914046, | |
| "rewards/accuracy_reward": 0.34921875, | |
| "rewards/format_reward": 0.8453125, | |
| "step": 155 | |
| }, | |
| { | |
| "completion_length": 209.16328125, | |
| "epoch": 0.07067723144032467, | |
| "grad_norm": 0.584996223449707, | |
| "kl": 0.1743133544921875, | |
| "learning_rate": 1.4096916299559472e-05, | |
| "loss": 0.007, | |
| "reward": 1.36953125, | |
| "reward_std": 0.3628161208704114, | |
| "rewards/accuracy_reward": 0.40859375, | |
| "rewards/format_reward": 0.9609375, | |
| "step": 160 | |
| }, | |
| { | |
| "completion_length": 196.24453125, | |
| "epoch": 0.07288589492283482, | |
| "grad_norm": 0.8083503246307373, | |
| "kl": 0.168878173828125, | |
| "learning_rate": 1.4537444933920706e-05, | |
| "loss": 0.0068, | |
| "reward": 1.290625, | |
| "reward_std": 0.31575766000896693, | |
| "rewards/accuracy_reward": 0.33515625, | |
| "rewards/format_reward": 0.95546875, | |
| "step": 165 | |
| }, | |
| { | |
| "completion_length": 190.34453125, | |
| "epoch": 0.07509455840534497, | |
| "grad_norm": 0.6046306490898132, | |
| "kl": 0.1955078125, | |
| "learning_rate": 1.497797356828194e-05, | |
| "loss": 0.0078, | |
| "reward": 1.1984375, | |
| "reward_std": 0.37014698795974255, | |
| "rewards/accuracy_reward": 0.2765625, | |
| "rewards/format_reward": 0.921875, | |
| "step": 170 | |
| }, | |
| { | |
| "completion_length": 148.55, | |
| "epoch": 0.07730322188785511, | |
| "grad_norm": 0.509132444858551, | |
| "kl": 0.1782470703125, | |
| "learning_rate": 1.5418502202643173e-05, | |
| "loss": 0.0071, | |
| "reward": 1.23125, | |
| "reward_std": 0.23883652742952108, | |
| "rewards/accuracy_reward": 0.246875, | |
| "rewards/format_reward": 0.984375, | |
| "step": 175 | |
| }, | |
| { | |
| "completion_length": 307.5828125, | |
| "epoch": 0.07951188537036526, | |
| "grad_norm": 115.14045715332031, | |
| "kl": 2.41651611328125, | |
| "learning_rate": 1.5859030837004406e-05, | |
| "loss": 0.0967, | |
| "reward": 1.1296875, | |
| "reward_std": 0.4080469489097595, | |
| "rewards/accuracy_reward": 0.28125, | |
| "rewards/format_reward": 0.8484375, | |
| "step": 180 | |
| }, | |
| { | |
| "completion_length": 745.15546875, | |
| "epoch": 0.0817205488528754, | |
| "grad_norm": 1.7153384685516357, | |
| "kl": 1.31116943359375, | |
| "learning_rate": 1.629955947136564e-05, | |
| "loss": 0.0524, | |
| "reward": 0.30546875, | |
| "reward_std": 0.46172712091356516, | |
| "rewards/accuracy_reward": 0.0890625, | |
| "rewards/format_reward": 0.21640625, | |
| "step": 185 | |
| }, | |
| { | |
| "completion_length": 366.26796875, | |
| "epoch": 0.08392921233538556, | |
| "grad_norm": 2.2445106506347656, | |
| "kl": 4.1861083984375, | |
| "learning_rate": 1.6740088105726872e-05, | |
| "loss": 0.1674, | |
| "reward": 0.69453125, | |
| "reward_std": 0.5489674057811499, | |
| "rewards/accuracy_reward": 0.225, | |
| "rewards/format_reward": 0.46953125, | |
| "step": 190 | |
| }, | |
| { | |
| "completion_length": 276.47890625, | |
| "epoch": 0.0861378758178957, | |
| "grad_norm": 1.3458584547042847, | |
| "kl": 0.4182861328125, | |
| "learning_rate": 1.718061674008811e-05, | |
| "loss": 0.0167, | |
| "reward": 1.2015625, | |
| "reward_std": 0.47771220188587904, | |
| "rewards/accuracy_reward": 0.35546875, | |
| "rewards/format_reward": 0.84609375, | |
| "step": 195 | |
| }, | |
| { | |
| "completion_length": 330.8, | |
| "epoch": 0.08834653930040584, | |
| "grad_norm": 5.644898414611816, | |
| "kl": 3.825439453125, | |
| "learning_rate": 1.762114537444934e-05, | |
| "loss": 0.153, | |
| "reward": 1.13203125, | |
| "reward_std": 0.5322479158639908, | |
| "rewards/accuracy_reward": 0.33125, | |
| "rewards/format_reward": 0.80078125, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.08834653930040584, | |
| "eval_completion_length": 356.7441674804688, | |
| "eval_kl": 3.136875, | |
| "eval_loss": 0.12226903438568115, | |
| "eval_reward": 1.0791666674613953, | |
| "eval_reward_std": 0.5326099014282226, | |
| "eval_rewards/accuracy_reward": 0.3129166668653488, | |
| "eval_rewards/format_reward": 0.76625, | |
| "eval_runtime": 294.9089, | |
| "eval_samples_per_second": 0.336, | |
| "eval_steps_per_second": 0.014, | |
| "step": 200 | |
| }, | |
| { | |
| "completion_length": 272.7140625, | |
| "epoch": 0.09055520278291598, | |
| "grad_norm": 3.581615924835205, | |
| "kl": 1.26063232421875, | |
| "learning_rate": 1.8061674008810575e-05, | |
| "loss": 0.0504, | |
| "reward": 1.13828125, | |
| "reward_std": 0.35043725427240136, | |
| "rewards/accuracy_reward": 0.24296875, | |
| "rewards/format_reward": 0.8953125, | |
| "step": 205 | |
| }, | |
| { | |
| "completion_length": 215.88984375, | |
| "epoch": 0.09276386626542614, | |
| "grad_norm": 4.773531436920166, | |
| "kl": 0.9517578125, | |
| "learning_rate": 1.8502202643171808e-05, | |
| "loss": 0.0381, | |
| "reward": 1.23828125, | |
| "reward_std": 0.30075737833976746, | |
| "rewards/accuracy_reward": 0.30703125, | |
| "rewards/format_reward": 0.93125, | |
| "step": 210 | |
| }, | |
| { | |
| "completion_length": 225.04765625, | |
| "epoch": 0.09497252974793628, | |
| "grad_norm": 2.473970890045166, | |
| "kl": 0.9461669921875, | |
| "learning_rate": 1.894273127753304e-05, | |
| "loss": 0.0378, | |
| "reward": 1.09140625, | |
| "reward_std": 0.4519174795597792, | |
| "rewards/accuracy_reward": 0.2453125, | |
| "rewards/format_reward": 0.84609375, | |
| "step": 215 | |
| }, | |
| { | |
| "completion_length": 297.01875, | |
| "epoch": 0.09718119323044642, | |
| "grad_norm": 2.2808244228363037, | |
| "kl": 21.850811767578126, | |
| "learning_rate": 1.9383259911894274e-05, | |
| "loss": 0.8722, | |
| "reward": 1.08046875, | |
| "reward_std": 0.46868909504264594, | |
| "rewards/accuracy_reward": 0.27734375, | |
| "rewards/format_reward": 0.803125, | |
| "step": 220 | |
| }, | |
| { | |
| "completion_length": 228.60234375, | |
| "epoch": 0.09938985671295657, | |
| "grad_norm": 1.9014919996261597, | |
| "kl": 0.230517578125, | |
| "learning_rate": 1.982378854625551e-05, | |
| "loss": 0.0092, | |
| "reward": 1.23046875, | |
| "reward_std": 0.3573141796514392, | |
| "rewards/accuracy_reward": 0.30234375, | |
| "rewards/format_reward": 0.928125, | |
| "step": 225 | |
| }, | |
| { | |
| "completion_length": 389.16640625, | |
| "epoch": 0.10159852019546672, | |
| "grad_norm": 7.406689643859863, | |
| "kl": 1.709527587890625, | |
| "learning_rate": 1.99998928589406e-05, | |
| "loss": 0.0684, | |
| "reward": 1.0375, | |
| "reward_std": 0.49771256893873217, | |
| "rewards/accuracy_reward": 0.26875, | |
| "rewards/format_reward": 0.76875, | |
| "step": 230 | |
| }, | |
| { | |
| "completion_length": 524.803125, | |
| "epoch": 0.10380718367797687, | |
| "grad_norm": 10.62135124206543, | |
| "kl": 5.1268310546875, | |
| "learning_rate": 1.999923811633618e-05, | |
| "loss": 0.2051, | |
| "reward": 0.834375, | |
| "reward_std": 0.5984118554741145, | |
| "rewards/accuracy_reward": 0.21015625, | |
| "rewards/format_reward": 0.62421875, | |
| "step": 235 | |
| }, | |
| { | |
| "completion_length": 332.025, | |
| "epoch": 0.10601584716048701, | |
| "grad_norm": 1.785484790802002, | |
| "kl": 8.025537109375, | |
| "learning_rate": 1.999798819286288e-05, | |
| "loss": 0.3211, | |
| "reward": 0.48359375, | |
| "reward_std": 0.4500583238899708, | |
| "rewards/accuracy_reward": 0.11484375, | |
| "rewards/format_reward": 0.36875, | |
| "step": 240 | |
| }, | |
| { | |
| "completion_length": 292.31328125, | |
| "epoch": 0.10822451064299715, | |
| "grad_norm": 3.578373432159424, | |
| "kl": 0.82308349609375, | |
| "learning_rate": 1.9996143162919416e-05, | |
| "loss": 0.0329, | |
| "reward": 0.8859375, | |
| "reward_std": 0.5004101138561964, | |
| "rewards/accuracy_reward": 0.1703125, | |
| "rewards/format_reward": 0.715625, | |
| "step": 245 | |
| }, | |
| { | |
| "completion_length": 230.86953125, | |
| "epoch": 0.11043317412550731, | |
| "grad_norm": 3.702038288116455, | |
| "kl": 11.077984619140626, | |
| "learning_rate": 1.9993703136326808e-05, | |
| "loss": 0.4433, | |
| "reward": 1.26875, | |
| "reward_std": 0.3013193493708968, | |
| "rewards/accuracy_reward": 0.30390625, | |
| "rewards/format_reward": 0.96484375, | |
| "step": 250 | |
| }, | |
| { | |
| "completion_length": 218.70078125, | |
| "epoch": 0.11264183760801745, | |
| "grad_norm": 7.172746658325195, | |
| "kl": 0.32041015625, | |
| "learning_rate": 1.999066825832184e-05, | |
| "loss": 0.0128, | |
| "reward": 1.26015625, | |
| "reward_std": 0.23753905296325684, | |
| "rewards/accuracy_reward": 0.26875, | |
| "rewards/format_reward": 0.99140625, | |
| "step": 255 | |
| }, | |
| { | |
| "completion_length": 253.38984375, | |
| "epoch": 0.11485050109052759, | |
| "grad_norm": 1.3221828937530518, | |
| "kl": 8.120147705078125, | |
| "learning_rate": 1.9987038709548408e-05, | |
| "loss": 0.3232, | |
| "reward": 1.24609375, | |
| "reward_std": 0.2821945507079363, | |
| "rewards/accuracy_reward": 0.27734375, | |
| "rewards/format_reward": 0.96875, | |
| "step": 260 | |
| }, | |
| { | |
| "completion_length": 288.26796875, | |
| "epoch": 0.11705916457303774, | |
| "grad_norm": 0.6487714052200317, | |
| "kl": 0.57447509765625, | |
| "learning_rate": 1.9982814706046766e-05, | |
| "loss": 0.023, | |
| "reward": 1.1859375, | |
| "reward_std": 0.3182327225804329, | |
| "rewards/accuracy_reward": 0.2390625, | |
| "rewards/format_reward": 0.946875, | |
| "step": 265 | |
| }, | |
| { | |
| "completion_length": 361.62265625, | |
| "epoch": 0.11926782805554789, | |
| "grad_norm": 3.041498899459839, | |
| "kl": 1.428155517578125, | |
| "learning_rate": 1.997799649924068e-05, | |
| "loss": 0.0572, | |
| "reward": 1.06640625, | |
| "reward_std": 0.3401893651112914, | |
| "rewards/accuracy_reward": 0.1796875, | |
| "rewards/format_reward": 0.88671875, | |
| "step": 270 | |
| }, | |
| { | |
| "completion_length": 322.9, | |
| "epoch": 0.12147649153805803, | |
| "grad_norm": 1.5550798177719116, | |
| "kl": 0.65260009765625, | |
| "learning_rate": 1.9972584375922453e-05, | |
| "loss": 0.0261, | |
| "reward": 1.13984375, | |
| "reward_std": 0.37677015643566847, | |
| "rewards/accuracy_reward": 0.23984375, | |
| "rewards/format_reward": 0.9, | |
| "step": 275 | |
| }, | |
| { | |
| "completion_length": 319.37578125, | |
| "epoch": 0.12368515502056818, | |
| "grad_norm": 114126.546875, | |
| "kl": 4326.556237792969, | |
| "learning_rate": 1.996657865823585e-05, | |
| "loss": 172.9404, | |
| "reward": 1.23828125, | |
| "reward_std": 0.33014066983014345, | |
| "rewards/accuracy_reward": 0.30078125, | |
| "rewards/format_reward": 0.9375, | |
| "step": 280 | |
| }, | |
| { | |
| "completion_length": 312.85546875, | |
| "epoch": 0.12589381850307832, | |
| "grad_norm": 1.4805908203125, | |
| "kl": 0.228668212890625, | |
| "learning_rate": 1.995997970365694e-05, | |
| "loss": 0.0091, | |
| "reward": 1.21171875, | |
| "reward_std": 0.3828111669048667, | |
| "rewards/accuracy_reward": 0.2890625, | |
| "rewards/format_reward": 0.92265625, | |
| "step": 285 | |
| }, | |
| { | |
| "completion_length": 354.6484375, | |
| "epoch": 0.12810248198558846, | |
| "grad_norm": 53.09714889526367, | |
| "kl": 4.07802734375, | |
| "learning_rate": 1.9952787904972794e-05, | |
| "loss": 0.1632, | |
| "reward": 1.053125, | |
| "reward_std": 0.4529764140024781, | |
| "rewards/accuracy_reward": 0.21640625, | |
| "rewards/format_reward": 0.83671875, | |
| "step": 290 | |
| }, | |
| { | |
| "completion_length": 285.98125, | |
| "epoch": 0.1303111454680986, | |
| "grad_norm": 0.581076443195343, | |
| "kl": 0.92733154296875, | |
| "learning_rate": 1.9945003690258127e-05, | |
| "loss": 0.0371, | |
| "reward": 1.1953125, | |
| "reward_std": 0.31984729822725055, | |
| "rewards/accuracy_reward": 0.259375, | |
| "rewards/format_reward": 0.9359375, | |
| "step": 295 | |
| }, | |
| { | |
| "completion_length": 269.2640625, | |
| "epoch": 0.13251980895060878, | |
| "grad_norm": 0.33922237157821655, | |
| "kl": 0.34735107421875, | |
| "learning_rate": 1.993662752284981e-05, | |
| "loss": 0.0139, | |
| "reward": 1.2796875, | |
| "reward_std": 0.257637638784945, | |
| "rewards/accuracy_reward": 0.30390625, | |
| "rewards/format_reward": 0.97578125, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.13251980895060878, | |
| "eval_completion_length": 312.8345849609375, | |
| "eval_kl": 0.2959375, | |
| "eval_loss": 0.012029927223920822, | |
| "eval_reward": 1.2270833349227905, | |
| "eval_reward_std": 0.3163718029856682, | |
| "eval_rewards/accuracy_reward": 0.2704166667163372, | |
| "eval_rewards/format_reward": 0.9566666674613953, | |
| "eval_runtime": 158.7274, | |
| "eval_samples_per_second": 0.624, | |
| "eval_steps_per_second": 0.025, | |
| "step": 300 | |
| }, | |
| { | |
| "completion_length": 344.0703125, | |
| "epoch": 0.13472847243311892, | |
| "grad_norm": 15.648075103759766, | |
| "kl": 0.812335205078125, | |
| "learning_rate": 1.9927659901319292e-05, | |
| "loss": 0.0324, | |
| "reward": 1.18515625, | |
| "reward_std": 0.36495272126048806, | |
| "rewards/accuracy_reward": 0.24765625, | |
| "rewards/format_reward": 0.9375, | |
| "step": 305 | |
| }, | |
| { | |
| "completion_length": 351.78984375, | |
| "epoch": 0.13693713591562906, | |
| "grad_norm": 0.3627087473869324, | |
| "kl": 0.196575927734375, | |
| "learning_rate": 1.9918101359442908e-05, | |
| "loss": 0.0079, | |
| "reward": 1.1203125, | |
| "reward_std": 0.3219464411959052, | |
| "rewards/accuracy_reward": 0.20234375, | |
| "rewards/format_reward": 0.91796875, | |
| "step": 310 | |
| }, | |
| { | |
| "completion_length": 289.5125, | |
| "epoch": 0.1391457993981392, | |
| "grad_norm": 0.41371065378189087, | |
| "kl": 0.208282470703125, | |
| "learning_rate": 1.990795246617014e-05, | |
| "loss": 0.0083, | |
| "reward": 1.15234375, | |
| "reward_std": 0.30122786965221166, | |
| "rewards/accuracy_reward": 0.21328125, | |
| "rewards/format_reward": 0.9390625, | |
| "step": 315 | |
| }, | |
| { | |
| "completion_length": 243.4625, | |
| "epoch": 0.14135446288064935, | |
| "grad_norm": 0.3589507043361664, | |
| "kl": 0.216851806640625, | |
| "learning_rate": 1.989721382558972e-05, | |
| "loss": 0.0087, | |
| "reward": 1.2484375, | |
| "reward_std": 0.3238747540861368, | |
| "rewards/accuracy_reward": 0.28515625, | |
| "rewards/format_reward": 0.96328125, | |
| "step": 320 | |
| }, | |
| { | |
| "completion_length": 243.1734375, | |
| "epoch": 0.1435631263631595, | |
| "grad_norm": 0.38298115134239197, | |
| "kl": 0.201153564453125, | |
| "learning_rate": 1.988588607689369e-05, | |
| "loss": 0.008, | |
| "reward": 1.2078125, | |
| "reward_std": 0.2437373088672757, | |
| "rewards/accuracy_reward": 0.23515625, | |
| "rewards/format_reward": 0.97265625, | |
| "step": 325 | |
| }, | |
| { | |
| "completion_length": 278.3796875, | |
| "epoch": 0.14577178984566963, | |
| "grad_norm": 0.7377725839614868, | |
| "kl": 0.229010009765625, | |
| "learning_rate": 1.987396989433935e-05, | |
| "loss": 0.0092, | |
| "reward": 1.153125, | |
| "reward_std": 0.36884579751640556, | |
| "rewards/accuracy_reward": 0.2328125, | |
| "rewards/format_reward": 0.9203125, | |
| "step": 330 | |
| }, | |
| { | |
| "completion_length": 280.9796875, | |
| "epoch": 0.14798045332817977, | |
| "grad_norm": 3.068225622177124, | |
| "kl": 0.5145751953125, | |
| "learning_rate": 1.986146598720913e-05, | |
| "loss": 0.0205, | |
| "reward": 1.0796875, | |
| "reward_std": 0.3447397375479341, | |
| "rewards/accuracy_reward": 0.17734375, | |
| "rewards/format_reward": 0.90234375, | |
| "step": 335 | |
| }, | |
| { | |
| "completion_length": 231.14296875, | |
| "epoch": 0.15018911681068994, | |
| "grad_norm": 1.5392614603042603, | |
| "kl": 1.3296142578125, | |
| "learning_rate": 1.984837509976837e-05, | |
| "loss": 0.0532, | |
| "reward": 1.14375, | |
| "reward_std": 0.3358943074941635, | |
| "rewards/accuracy_reward": 0.215625, | |
| "rewards/format_reward": 0.928125, | |
| "step": 340 | |
| }, | |
| { | |
| "completion_length": 248.0734375, | |
| "epoch": 0.15239778029320009, | |
| "grad_norm": 0.6375353336334229, | |
| "kl": 1.9455322265625, | |
| "learning_rate": 1.9834698011221008e-05, | |
| "loss": 0.0778, | |
| "reward": 1.13203125, | |
| "reward_std": 0.38343740683048966, | |
| "rewards/accuracy_reward": 0.2203125, | |
| "rewards/format_reward": 0.91171875, | |
| "step": 345 | |
| }, | |
| { | |
| "completion_length": 261.58046875, | |
| "epoch": 0.15460644377571023, | |
| "grad_norm": 2110458.0, | |
| "kl": 12810.955236816406, | |
| "learning_rate": 1.982043553566321e-05, | |
| "loss": 512.4465, | |
| "reward": 1.06953125, | |
| "reward_std": 0.38607826493680475, | |
| "rewards/accuracy_reward": 0.18046875, | |
| "rewards/format_reward": 0.8890625, | |
| "step": 350 | |
| }, | |
| { | |
| "completion_length": 325.5890625, | |
| "epoch": 0.15681510725822037, | |
| "grad_norm": 7.603396415710449, | |
| "kl": 20.8695068359375, | |
| "learning_rate": 1.980558852203492e-05, | |
| "loss": 0.8364, | |
| "reward": 1.02265625, | |
| "reward_std": 0.46340725645422937, | |
| "rewards/accuracy_reward": 0.209375, | |
| "rewards/format_reward": 0.81328125, | |
| "step": 355 | |
| }, | |
| { | |
| "completion_length": 344.390625, | |
| "epoch": 0.1590237707407305, | |
| "grad_norm": 1.510093331336975, | |
| "kl": 1.6796875, | |
| "learning_rate": 1.979015785406931e-05, | |
| "loss": 0.0672, | |
| "reward": 1.0, | |
| "reward_std": 0.470697814039886, | |
| "rewards/accuracy_reward": 0.19921875, | |
| "rewards/format_reward": 0.80078125, | |
| "step": 360 | |
| }, | |
| { | |
| "completion_length": 243.0875, | |
| "epoch": 0.16123243422324066, | |
| "grad_norm": 7.704539775848389, | |
| "kl": 4.3294921875, | |
| "learning_rate": 1.97741444502402e-05, | |
| "loss": 0.1733, | |
| "reward": 1.16328125, | |
| "reward_std": 0.3427719760686159, | |
| "rewards/accuracy_reward": 0.2390625, | |
| "rewards/format_reward": 0.92421875, | |
| "step": 365 | |
| }, | |
| { | |
| "completion_length": 207.48046875, | |
| "epoch": 0.1634410977057508, | |
| "grad_norm": 5.120603561401367, | |
| "kl": 2.038916015625, | |
| "learning_rate": 1.9757549263707366e-05, | |
| "loss": 0.0816, | |
| "reward": 1.19296875, | |
| "reward_std": 0.35142498891800644, | |
| "rewards/accuracy_reward": 0.25625, | |
| "rewards/format_reward": 0.93671875, | |
| "step": 370 | |
| }, | |
| { | |
| "completion_length": 229.01953125, | |
| "epoch": 0.16564976118826094, | |
| "grad_norm": 7.258485317230225, | |
| "kl": 2.72215576171875, | |
| "learning_rate": 1.974037328225982e-05, | |
| "loss": 0.1089, | |
| "reward": 1.14921875, | |
| "reward_std": 0.3493613565340638, | |
| "rewards/accuracy_reward": 0.22421875, | |
| "rewards/format_reward": 0.925, | |
| "step": 375 | |
| }, | |
| { | |
| "completion_length": 217.7515625, | |
| "epoch": 0.1678584246707711, | |
| "grad_norm": 7.898167133331299, | |
| "kl": 2.309033203125, | |
| "learning_rate": 1.972261752825701e-05, | |
| "loss": 0.0924, | |
| "reward": 1.14453125, | |
| "reward_std": 0.300789905525744, | |
| "rewards/accuracy_reward": 0.20625, | |
| "rewards/format_reward": 0.93828125, | |
| "step": 380 | |
| }, | |
| { | |
| "completion_length": 168.23515625, | |
| "epoch": 0.17006708815328125, | |
| "grad_norm": 0.5930284261703491, | |
| "kl": 1.09581298828125, | |
| "learning_rate": 1.9704283058567972e-05, | |
| "loss": 0.0439, | |
| "reward": 1.17421875, | |
| "reward_std": 0.2850266819819808, | |
| "rewards/accuracy_reward": 0.2140625, | |
| "rewards/format_reward": 0.96015625, | |
| "step": 385 | |
| }, | |
| { | |
| "completion_length": 156.07421875, | |
| "epoch": 0.1722757516357914, | |
| "grad_norm": 2.664320945739746, | |
| "kl": 1.3061279296875, | |
| "learning_rate": 1.968537096450841e-05, | |
| "loss": 0.0523, | |
| "reward": 1.1390625, | |
| "reward_std": 0.28086008559912445, | |
| "rewards/accuracy_reward": 0.18046875, | |
| "rewards/format_reward": 0.95859375, | |
| "step": 390 | |
| }, | |
| { | |
| "completion_length": 204.665625, | |
| "epoch": 0.17448441511830154, | |
| "grad_norm": 1.541552186012268, | |
| "kl": 1.1947509765625, | |
| "learning_rate": 1.9665882371775735e-05, | |
| "loss": 0.0478, | |
| "reward": 1.1328125, | |
| "reward_std": 0.26257925033569335, | |
| "rewards/accuracy_reward": 0.175, | |
| "rewards/format_reward": 0.9578125, | |
| "step": 395 | |
| }, | |
| { | |
| "completion_length": 227.2359375, | |
| "epoch": 0.17669307860081168, | |
| "grad_norm": 0.41995060443878174, | |
| "kl": 0.6468994140625, | |
| "learning_rate": 1.9645818440382096e-05, | |
| "loss": 0.0259, | |
| "reward": 1.215625, | |
| "reward_std": 0.3341550791636109, | |
| "rewards/accuracy_reward": 0.2703125, | |
| "rewards/format_reward": 0.9453125, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.17669307860081168, | |
| "eval_completion_length": 243.36083374023437, | |
| "eval_kl": 0.38421875, | |
| "eval_loss": 0.015584616921842098, | |
| "eval_reward": 1.2075, | |
| "eval_reward_std": 0.30897092461586, | |
| "eval_rewards/accuracy_reward": 0.2583333334326744, | |
| "eval_rewards/format_reward": 0.9491666674613952, | |
| "eval_runtime": 159.9641, | |
| "eval_samples_per_second": 0.619, | |
| "eval_steps_per_second": 0.025, | |
| "step": 400 | |
| }, | |
| { | |
| "completion_length": 266.815625, | |
| "epoch": 0.17890174208332182, | |
| "grad_norm": 0.7400681376457214, | |
| "kl": 0.40008544921875, | |
| "learning_rate": 1.962518036458529e-05, | |
| "loss": 0.016, | |
| "reward": 1.190625, | |
| "reward_std": 0.31112865209579466, | |
| "rewards/accuracy_reward": 0.23828125, | |
| "rewards/format_reward": 0.95234375, | |
| "step": 405 | |
| }, | |
| { | |
| "completion_length": 280.94921875, | |
| "epoch": 0.18111040556583197, | |
| "grad_norm": 0.30900517106056213, | |
| "kl": 0.38648681640625, | |
| "learning_rate": 1.9603969372817695e-05, | |
| "loss": 0.0155, | |
| "reward": 1.240625, | |
| "reward_std": 0.294854056276381, | |
| "rewards/accuracy_reward": 0.28828125, | |
| "rewards/format_reward": 0.95234375, | |
| "step": 410 | |
| }, | |
| { | |
| "completion_length": 358.7671875, | |
| "epoch": 0.1833190690483421, | |
| "grad_norm": 1.0198228359222412, | |
| "kl": 1.0703125, | |
| "learning_rate": 1.9582186727613152e-05, | |
| "loss": 0.0428, | |
| "reward": 1.01796875, | |
| "reward_std": 0.4371380554512143, | |
| "rewards/accuracy_reward": 0.2234375, | |
| "rewards/format_reward": 0.79453125, | |
| "step": 415 | |
| }, | |
| { | |
| "completion_length": 286.39140625, | |
| "epoch": 0.18552773253085228, | |
| "grad_norm": 1.2672603130340576, | |
| "kl": 0.32391357421875, | |
| "learning_rate": 1.955983372553182e-05, | |
| "loss": 0.013, | |
| "reward": 0.946875, | |
| "reward_std": 0.4627113614231348, | |
| "rewards/accuracy_reward": 0.171875, | |
| "rewards/format_reward": 0.775, | |
| "step": 420 | |
| }, | |
| { | |
| "completion_length": 207.659375, | |
| "epoch": 0.18773639601336242, | |
| "grad_norm": 0.4993106424808502, | |
| "kl": 0.32420654296875, | |
| "learning_rate": 1.953691169708298e-05, | |
| "loss": 0.013, | |
| "reward": 1.05859375, | |
| "reward_std": 0.3334008002653718, | |
| "rewards/accuracy_reward": 0.15546875, | |
| "rewards/format_reward": 0.903125, | |
| "step": 425 | |
| }, | |
| { | |
| "completion_length": 167.09296875, | |
| "epoch": 0.18994505949587256, | |
| "grad_norm": 0.7137377858161926, | |
| "kl": 0.3811279296875, | |
| "learning_rate": 1.9513422006645867e-05, | |
| "loss": 0.0152, | |
| "reward": 1.12265625, | |
| "reward_std": 0.3202515557408333, | |
| "rewards/accuracy_reward": 0.1984375, | |
| "rewards/format_reward": 0.92421875, | |
| "step": 430 | |
| }, | |
| { | |
| "completion_length": 259.22109375, | |
| "epoch": 0.1921537229783827, | |
| "grad_norm": 0.4750295877456665, | |
| "kl": 0.30950927734375, | |
| "learning_rate": 1.9489366052388443e-05, | |
| "loss": 0.0124, | |
| "reward": 1.121875, | |
| "reward_std": 0.4058058561757207, | |
| "rewards/accuracy_reward": 0.225, | |
| "rewards/format_reward": 0.896875, | |
| "step": 435 | |
| }, | |
| { | |
| "completion_length": 234.15546875, | |
| "epoch": 0.19436238646089285, | |
| "grad_norm": 0.5221619009971619, | |
| "kl": 0.27547607421875, | |
| "learning_rate": 1.9464745266184173e-05, | |
| "loss": 0.011, | |
| "reward": 1.1921875, | |
| "reward_std": 0.3065523250028491, | |
| "rewards/accuracy_reward": 0.25546875, | |
| "rewards/format_reward": 0.93671875, | |
| "step": 440 | |
| }, | |
| { | |
| "completion_length": 150.1078125, | |
| "epoch": 0.196571049943403, | |
| "grad_norm": 7.660337924957275, | |
| "kl": 0.38460693359375, | |
| "learning_rate": 1.9439561113526802e-05, | |
| "loss": 0.0154, | |
| "reward": 1.16328125, | |
| "reward_std": 0.2454235328361392, | |
| "rewards/accuracy_reward": 0.2015625, | |
| "rewards/format_reward": 0.96171875, | |
| "step": 445 | |
| }, | |
| { | |
| "completion_length": 145.90859375, | |
| "epoch": 0.19877971342591313, | |
| "grad_norm": 0.47105056047439575, | |
| "kl": 0.276025390625, | |
| "learning_rate": 1.9413815093443128e-05, | |
| "loss": 0.011, | |
| "reward": 1.14140625, | |
| "reward_std": 0.21770920380949974, | |
| "rewards/accuracy_reward": 0.1765625, | |
| "rewards/format_reward": 0.96484375, | |
| "step": 450 | |
| }, | |
| { | |
| "completion_length": 162.709375, | |
| "epoch": 0.20098837690842328, | |
| "grad_norm": 0.4455668032169342, | |
| "kl": 0.29571533203125, | |
| "learning_rate": 1.938750873840377e-05, | |
| "loss": 0.0118, | |
| "reward": 1.1203125, | |
| "reward_std": 0.2609225707128644, | |
| "rewards/accuracy_reward": 0.16171875, | |
| "rewards/format_reward": 0.95859375, | |
| "step": 455 | |
| }, | |
| { | |
| "completion_length": 185.5453125, | |
| "epoch": 0.20319704039093345, | |
| "grad_norm": 1.09501051902771, | |
| "kl": 0.34464111328125, | |
| "learning_rate": 1.9360643614231942e-05, | |
| "loss": 0.0138, | |
| "reward": 1.08125, | |
| "reward_std": 0.2819837937131524, | |
| "rewards/accuracy_reward": 0.1375, | |
| "rewards/format_reward": 0.94375, | |
| "step": 460 | |
| }, | |
| { | |
| "completion_length": 178.6546875, | |
| "epoch": 0.2054057038734436, | |
| "grad_norm": 0.501656174659729, | |
| "kl": 0.2873779296875, | |
| "learning_rate": 1.9333221320010275e-05, | |
| "loss": 0.0115, | |
| "reward": 1.1515625, | |
| "reward_std": 0.2634758483618498, | |
| "rewards/accuracy_reward": 0.19453125, | |
| "rewards/format_reward": 0.95703125, | |
| "step": 465 | |
| }, | |
| { | |
| "completion_length": 190.03671875, | |
| "epoch": 0.20761436735595373, | |
| "grad_norm": 1.7912346124649048, | |
| "kl": 0.30848388671875, | |
| "learning_rate": 1.930524348798562e-05, | |
| "loss": 0.0123, | |
| "reward": 1.14609375, | |
| "reward_std": 0.31838786210864783, | |
| "rewards/accuracy_reward": 0.215625, | |
| "rewards/format_reward": 0.93046875, | |
| "step": 470 | |
| }, | |
| { | |
| "completion_length": 174.62109375, | |
| "epoch": 0.20982303083846388, | |
| "grad_norm": 0.4804052412509918, | |
| "kl": 0.32216796875, | |
| "learning_rate": 1.9276711783471888e-05, | |
| "loss": 0.0129, | |
| "reward": 1.096875, | |
| "reward_std": 0.2830366240814328, | |
| "rewards/accuracy_reward": 0.165625, | |
| "rewards/format_reward": 0.93125, | |
| "step": 475 | |
| }, | |
| { | |
| "completion_length": 144.003125, | |
| "epoch": 0.21203169432097402, | |
| "grad_norm": 0.47488105297088623, | |
| "kl": 0.29373779296875, | |
| "learning_rate": 1.9247627904750937e-05, | |
| "loss": 0.0117, | |
| "reward": 1.22109375, | |
| "reward_std": 0.2334042889997363, | |
| "rewards/accuracy_reward": 0.2328125, | |
| "rewards/format_reward": 0.98828125, | |
| "step": 480 | |
| }, | |
| { | |
| "completion_length": 225.4140625, | |
| "epoch": 0.21424035780348416, | |
| "grad_norm": 0.6563146114349365, | |
| "kl": 0.37607421875, | |
| "learning_rate": 1.9217993582971485e-05, | |
| "loss": 0.015, | |
| "reward": 1.12109375, | |
| "reward_std": 0.35556240323930977, | |
| "rewards/accuracy_reward": 0.21015625, | |
| "rewards/format_reward": 0.9109375, | |
| "step": 485 | |
| }, | |
| { | |
| "completion_length": 148.5078125, | |
| "epoch": 0.2164490212859943, | |
| "grad_norm": 0.6001664400100708, | |
| "kl": 0.50482177734375, | |
| "learning_rate": 1.9187810582046056e-05, | |
| "loss": 0.0202, | |
| "reward": 1.16796875, | |
| "reward_std": 0.2445027854293585, | |
| "rewards/accuracy_reward": 0.196875, | |
| "rewards/format_reward": 0.97109375, | |
| "step": 490 | |
| }, | |
| { | |
| "completion_length": 175.846875, | |
| "epoch": 0.21865768476850445, | |
| "grad_norm": 0.48493316769599915, | |
| "kl": 0.33309326171875, | |
| "learning_rate": 1.9157080698546e-05, | |
| "loss": 0.0133, | |
| "reward": 1.134375, | |
| "reward_std": 0.2596387291327119, | |
| "rewards/accuracy_reward": 0.1765625, | |
| "rewards/format_reward": 0.9578125, | |
| "step": 495 | |
| }, | |
| { | |
| "completion_length": 172.95859375, | |
| "epoch": 0.22086634825101462, | |
| "grad_norm": 0.34802621603012085, | |
| "kl": 0.29521484375, | |
| "learning_rate": 1.9125805761594553e-05, | |
| "loss": 0.0118, | |
| "reward": 1.13125, | |
| "reward_std": 0.2513675343245268, | |
| "rewards/accuracy_reward": 0.1703125, | |
| "rewards/format_reward": 0.9609375, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.22086634825101462, | |
| "eval_completion_length": 168.7541668701172, | |
| "eval_kl": 0.3077734375, | |
| "eval_loss": 0.012391554191708565, | |
| "eval_reward": 1.204166669845581, | |
| "eval_reward_std": 0.2949218952655792, | |
| "eval_rewards/accuracy_reward": 0.2508333337306976, | |
| "eval_rewards/format_reward": 0.9533333349227905, | |
| "eval_runtime": 143.7617, | |
| "eval_samples_per_second": 0.689, | |
| "eval_steps_per_second": 0.028, | |
| "step": 500 | |
| }, | |
| { | |
| "completion_length": 160.86171875, | |
| "epoch": 0.22307501173352476, | |
| "grad_norm": 0.750469446182251, | |
| "kl": 0.33531494140625, | |
| "learning_rate": 1.9093987632757957e-05, | |
| "loss": 0.0134, | |
| "reward": 1.13828125, | |
| "reward_std": 0.2916230414062738, | |
| "rewards/accuracy_reward": 0.18828125, | |
| "rewards/format_reward": 0.95, | |
| "step": 505 | |
| }, | |
| { | |
| "completion_length": 158.5875, | |
| "epoch": 0.2252836752160349, | |
| "grad_norm": 0.4148224890232086, | |
| "kl": 0.35068359375, | |
| "learning_rate": 1.9061628205934662e-05, | |
| "loss": 0.014, | |
| "reward": 1.12265625, | |
| "reward_std": 0.263315293751657, | |
| "rewards/accuracy_reward": 0.16953125, | |
| "rewards/format_reward": 0.953125, | |
| "step": 510 | |
| }, | |
| { | |
| "completion_length": 182.6203125, | |
| "epoch": 0.22749233869854504, | |
| "grad_norm": 0.3992038667201996, | |
| "kl": 0.2927001953125, | |
| "learning_rate": 1.9028729407242598e-05, | |
| "loss": 0.0117, | |
| "reward": 1.1828125, | |
| "reward_std": 0.3094627659767866, | |
| "rewards/accuracy_reward": 0.23125, | |
| "rewards/format_reward": 0.9515625, | |
| "step": 515 | |
| }, | |
| { | |
| "completion_length": 191.246875, | |
| "epoch": 0.22970100218105519, | |
| "grad_norm": 0.5499962568283081, | |
| "kl": 0.318408203125, | |
| "learning_rate": 1.8995293194904512e-05, | |
| "loss": 0.0127, | |
| "reward": 1.12734375, | |
| "reward_std": 0.2894813433289528, | |
| "rewards/accuracy_reward": 0.17890625, | |
| "rewards/format_reward": 0.9484375, | |
| "step": 520 | |
| }, | |
| { | |
| "completion_length": 167.7296875, | |
| "epoch": 0.23190966566356533, | |
| "grad_norm": 0.38229528069496155, | |
| "kl": 0.31822509765625, | |
| "learning_rate": 1.896132155913143e-05, | |
| "loss": 0.0127, | |
| "reward": 1.11484375, | |
| "reward_std": 0.29848874974995854, | |
| "rewards/accuracy_reward": 0.178125, | |
| "rewards/format_reward": 0.93671875, | |
| "step": 525 | |
| }, | |
| { | |
| "completion_length": 142.56015625, | |
| "epoch": 0.23411832914607547, | |
| "grad_norm": 0.5173822641372681, | |
| "kl": 0.33304443359375, | |
| "learning_rate": 1.892681652200418e-05, | |
| "loss": 0.0133, | |
| "reward": 1.08984375, | |
| "reward_std": 0.2929655512794852, | |
| "rewards/accuracy_reward": 0.15234375, | |
| "rewards/format_reward": 0.9375, | |
| "step": 530 | |
| }, | |
| { | |
| "completion_length": 151.20078125, | |
| "epoch": 0.2363269926285856, | |
| "grad_norm": 0.521994411945343, | |
| "kl": 0.344140625, | |
| "learning_rate": 1.8891780137353036e-05, | |
| "loss": 0.0138, | |
| "reward": 1.0921875, | |
| "reward_std": 0.2893120773136616, | |
| "rewards/accuracy_reward": 0.1578125, | |
| "rewards/format_reward": 0.934375, | |
| "step": 535 | |
| }, | |
| { | |
| "completion_length": 132.7984375, | |
| "epoch": 0.23853565611109578, | |
| "grad_norm": 0.5721232295036316, | |
| "kl": 0.36300048828125, | |
| "learning_rate": 1.885621449063547e-05, | |
| "loss": 0.0145, | |
| "reward": 1.128125, | |
| "reward_std": 0.24814818538725375, | |
| "rewards/accuracy_reward": 0.1671875, | |
| "rewards/format_reward": 0.9609375, | |
| "step": 540 | |
| }, | |
| { | |
| "completion_length": 144.96875, | |
| "epoch": 0.24074431959360593, | |
| "grad_norm": 0.3831787407398224, | |
| "kl": 0.34149169921875, | |
| "learning_rate": 1.8820121698812028e-05, | |
| "loss": 0.0137, | |
| "reward": 1.14140625, | |
| "reward_std": 0.22749478761106728, | |
| "rewards/accuracy_reward": 0.1765625, | |
| "rewards/format_reward": 0.96484375, | |
| "step": 545 | |
| }, | |
| { | |
| "completion_length": 169.89296875, | |
| "epoch": 0.24295298307611607, | |
| "grad_norm": 0.5316097140312195, | |
| "kl": 0.32564697265625, | |
| "learning_rate": 1.8783503910220296e-05, | |
| "loss": 0.013, | |
| "reward": 1.1546875, | |
| "reward_std": 0.2780306525528431, | |
| "rewards/accuracy_reward": 0.2046875, | |
| "rewards/format_reward": 0.95, | |
| "step": 550 | |
| }, | |
| { | |
| "completion_length": 149.83984375, | |
| "epoch": 0.2451616465586262, | |
| "grad_norm": 0.4427882432937622, | |
| "kl": 0.3146484375, | |
| "learning_rate": 1.8746363304447073e-05, | |
| "loss": 0.0126, | |
| "reward": 1.1578125, | |
| "reward_std": 0.22796925920993089, | |
| "rewards/accuracy_reward": 0.1859375, | |
| "rewards/format_reward": 0.971875, | |
| "step": 555 | |
| }, | |
| { | |
| "completion_length": 165.7453125, | |
| "epoch": 0.24737031004113635, | |
| "grad_norm": 0.47105562686920166, | |
| "kl": 0.298486328125, | |
| "learning_rate": 1.8708702092198576e-05, | |
| "loss": 0.0119, | |
| "reward": 1.17578125, | |
| "reward_std": 0.2685113290324807, | |
| "rewards/accuracy_reward": 0.2125, | |
| "rewards/format_reward": 0.96328125, | |
| "step": 560 | |
| }, | |
| { | |
| "completion_length": 168.32265625, | |
| "epoch": 0.2495789735236465, | |
| "grad_norm": 0.3255383372306824, | |
| "kl": 0.3055908203125, | |
| "learning_rate": 1.867052251516891e-05, | |
| "loss": 0.0122, | |
| "reward": 1.1390625, | |
| "reward_std": 0.24939600769430398, | |
| "rewards/accuracy_reward": 0.1703125, | |
| "rewards/format_reward": 0.96875, | |
| "step": 565 | |
| }, | |
| { | |
| "completion_length": 184.50546875, | |
| "epoch": 0.25178763700615664, | |
| "grad_norm": 0.4039280116558075, | |
| "kl": 0.31488037109375, | |
| "learning_rate": 1.8631826845906588e-05, | |
| "loss": 0.0126, | |
| "reward": 1.14140625, | |
| "reward_std": 0.2917328651994467, | |
| "rewards/accuracy_reward": 0.1984375, | |
| "rewards/format_reward": 0.94296875, | |
| "step": 570 | |
| }, | |
| { | |
| "completion_length": 187.24296875, | |
| "epoch": 0.2539963004886668, | |
| "grad_norm": 0.48246172070503235, | |
| "kl": 0.33466796875, | |
| "learning_rate": 1.8592617387679304e-05, | |
| "loss": 0.0134, | |
| "reward": 1.15625, | |
| "reward_std": 0.32329851035028695, | |
| "rewards/accuracy_reward": 0.21640625, | |
| "rewards/format_reward": 0.93984375, | |
| "step": 575 | |
| }, | |
| { | |
| "completion_length": 157.41796875, | |
| "epoch": 0.2562049639711769, | |
| "grad_norm": 0.5687190294265747, | |
| "kl": 0.365576171875, | |
| "learning_rate": 1.8552896474336816e-05, | |
| "loss": 0.0146, | |
| "reward": 1.1390625, | |
| "reward_std": 0.25018255431205033, | |
| "rewards/accuracy_reward": 0.17890625, | |
| "rewards/format_reward": 0.96015625, | |
| "step": 580 | |
| }, | |
| { | |
| "completion_length": 126.7265625, | |
| "epoch": 0.25841362745368707, | |
| "grad_norm": 0.4837645888328552, | |
| "kl": 0.494287109375, | |
| "learning_rate": 1.8512666470172024e-05, | |
| "loss": 0.0198, | |
| "reward": 1.13515625, | |
| "reward_std": 0.25846064239740374, | |
| "rewards/accuracy_reward": 0.1828125, | |
| "rewards/format_reward": 0.95234375, | |
| "step": 585 | |
| }, | |
| { | |
| "completion_length": 117.5125, | |
| "epoch": 0.2606222909361972, | |
| "grad_norm": 0.5099273324012756, | |
| "kl": 0.46514892578125, | |
| "learning_rate": 1.8471929769780247e-05, | |
| "loss": 0.0186, | |
| "reward": 1.1375, | |
| "reward_std": 0.27797329761087897, | |
| "rewards/accuracy_reward": 0.18671875, | |
| "rewards/format_reward": 0.95078125, | |
| "step": 590 | |
| }, | |
| { | |
| "completion_length": 129.4875, | |
| "epoch": 0.26283095441870735, | |
| "grad_norm": 0.48087364435195923, | |
| "kl": 23.66878662109375, | |
| "learning_rate": 1.8430688797916702e-05, | |
| "loss": 0.9494, | |
| "reward": 1.10234375, | |
| "reward_std": 0.26711587999016045, | |
| "rewards/accuracy_reward": 0.14609375, | |
| "rewards/format_reward": 0.95625, | |
| "step": 595 | |
| }, | |
| { | |
| "completion_length": 137.51171875, | |
| "epoch": 0.26503961790121755, | |
| "grad_norm": 0.5267772674560547, | |
| "kl": 0.357568359375, | |
| "learning_rate": 1.8388946009352157e-05, | |
| "loss": 0.0143, | |
| "reward": 1.15625, | |
| "reward_std": 0.2736740421503782, | |
| "rewards/accuracy_reward": 0.2046875, | |
| "rewards/format_reward": 0.9515625, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 0.26503961790121755, | |
| "eval_completion_length": 110.09, | |
| "eval_kl": 0.377890625, | |
| "eval_loss": 0.01513399463146925, | |
| "eval_reward": 1.1708333349227906, | |
| "eval_reward_std": 0.2645035409927368, | |
| "eval_rewards/accuracy_reward": 0.20208333373069765, | |
| "eval_rewards/format_reward": 0.96875, | |
| "eval_runtime": 129.4495, | |
| "eval_samples_per_second": 0.765, | |
| "eval_steps_per_second": 0.031, | |
| "step": 600 | |
| }, | |
| { | |
| "completion_length": 103.6046875, | |
| "epoch": 0.2672482813837277, | |
| "grad_norm": 0.5802999138832092, | |
| "kl": 0.42120361328125, | |
| "learning_rate": 1.8346703888726833e-05, | |
| "loss": 0.0168, | |
| "reward": 1.1421875, | |
| "reward_std": 0.22200765572488307, | |
| "rewards/accuracy_reward": 0.1765625, | |
| "rewards/format_reward": 0.965625, | |
| "step": 605 | |
| }, | |
| { | |
| "completion_length": 141.5125, | |
| "epoch": 0.26945694486623784, | |
| "grad_norm": 0.595243513584137, | |
| "kl": 0.3810546875, | |
| "learning_rate": 1.8303964950402498e-05, | |
| "loss": 0.0152, | |
| "reward": 1.12265625, | |
| "reward_std": 0.29428734816610813, | |
| "rewards/accuracy_reward": 0.184375, | |
| "rewards/format_reward": 0.93828125, | |
| "step": 610 | |
| }, | |
| { | |
| "completion_length": 138.7, | |
| "epoch": 0.271665608348748, | |
| "grad_norm": 0.43679195642471313, | |
| "kl": 0.38223876953125, | |
| "learning_rate": 1.8260731738312817e-05, | |
| "loss": 0.0153, | |
| "reward": 1.10078125, | |
| "reward_std": 0.23885549493134023, | |
| "rewards/accuracy_reward": 0.1484375, | |
| "rewards/format_reward": 0.95234375, | |
| "step": 615 | |
| }, | |
| { | |
| "completion_length": 104.80703125, | |
| "epoch": 0.2738742718312581, | |
| "grad_norm": 0.5839166045188904, | |
| "kl": 0.4041748046875, | |
| "learning_rate": 1.8217006825811924e-05, | |
| "loss": 0.0162, | |
| "reward": 1.12890625, | |
| "reward_std": 0.19053069781512022, | |
| "rewards/accuracy_reward": 0.14609375, | |
| "rewards/format_reward": 0.9828125, | |
| "step": 620 | |
| }, | |
| { | |
| "completion_length": 132.15859375, | |
| "epoch": 0.27608293531376826, | |
| "grad_norm": 1.1488077640533447, | |
| "kl": 0.4154296875, | |
| "learning_rate": 1.8172792815521246e-05, | |
| "loss": 0.0166, | |
| "reward": 1.12890625, | |
| "reward_std": 0.24578131809830667, | |
| "rewards/accuracy_reward": 0.1703125, | |
| "rewards/format_reward": 0.95859375, | |
| "step": 625 | |
| }, | |
| { | |
| "completion_length": 203.21328125, | |
| "epoch": 0.2782915987962784, | |
| "grad_norm": 0.9778295159339905, | |
| "kl": 1.12738037109375, | |
| "learning_rate": 1.81280923391746e-05, | |
| "loss": 0.0452, | |
| "reward": 0.99453125, | |
| "reward_std": 0.3527091216295958, | |
| "rewards/accuracy_reward": 0.1265625, | |
| "rewards/format_reward": 0.86796875, | |
| "step": 630 | |
| }, | |
| { | |
| "completion_length": 107.22421875, | |
| "epoch": 0.28050026227878855, | |
| "grad_norm": 0.4877747595310211, | |
| "kl": 0.5244873046875, | |
| "learning_rate": 1.8082908057461534e-05, | |
| "loss": 0.021, | |
| "reward": 1.0953125, | |
| "reward_std": 0.23968660701066255, | |
| "rewards/accuracy_reward": 0.1484375, | |
| "rewards/format_reward": 0.946875, | |
| "step": 635 | |
| }, | |
| { | |
| "completion_length": 81.0328125, | |
| "epoch": 0.2827089257612987, | |
| "grad_norm": 0.7603225708007812, | |
| "kl": 0.5284912109375, | |
| "learning_rate": 1.8037242659868958e-05, | |
| "loss": 0.0211, | |
| "reward": 1.12109375, | |
| "reward_std": 0.21915814336389303, | |
| "rewards/accuracy_reward": 0.14453125, | |
| "rewards/format_reward": 0.9765625, | |
| "step": 640 | |
| }, | |
| { | |
| "completion_length": 118.26171875, | |
| "epoch": 0.28491758924380883, | |
| "grad_norm": 0.565264880657196, | |
| "kl": 0.44853515625, | |
| "learning_rate": 1.7991098864521066e-05, | |
| "loss": 0.018, | |
| "reward": 1.12578125, | |
| "reward_std": 0.2820789096876979, | |
| "rewards/accuracy_reward": 0.17734375, | |
| "rewards/format_reward": 0.9484375, | |
| "step": 645 | |
| }, | |
| { | |
| "completion_length": 92.51484375, | |
| "epoch": 0.287126252726319, | |
| "grad_norm": 0.5516146421432495, | |
| "kl": 0.4996337890625, | |
| "learning_rate": 1.794447941801754e-05, | |
| "loss": 0.02, | |
| "reward": 1.13515625, | |
| "reward_std": 0.21679833866655826, | |
| "rewards/accuracy_reward": 0.16328125, | |
| "rewards/format_reward": 0.971875, | |
| "step": 650 | |
| }, | |
| { | |
| "completion_length": 126.5140625, | |
| "epoch": 0.2893349162088291, | |
| "grad_norm": 0.5977817177772522, | |
| "kl": 0.4461181640625, | |
| "learning_rate": 1.7897387095270058e-05, | |
| "loss": 0.0178, | |
| "reward": 1.10546875, | |
| "reward_std": 0.24418613854795695, | |
| "rewards/accuracy_reward": 0.14296875, | |
| "rewards/format_reward": 0.9625, | |
| "step": 655 | |
| }, | |
| { | |
| "completion_length": 153.0734375, | |
| "epoch": 0.29154357969133926, | |
| "grad_norm": 0.4894627630710602, | |
| "kl": 0.37508544921875, | |
| "learning_rate": 1.7849824699337143e-05, | |
| "loss": 0.015, | |
| "reward": 1.1125, | |
| "reward_std": 0.22963083293288947, | |
| "rewards/accuracy_reward": 0.1515625, | |
| "rewards/format_reward": 0.9609375, | |
| "step": 660 | |
| }, | |
| { | |
| "completion_length": 103.91953125, | |
| "epoch": 0.2937522431738494, | |
| "grad_norm": 0.6808644533157349, | |
| "kl": 0.456884765625, | |
| "learning_rate": 1.7801795061257293e-05, | |
| "loss": 0.0183, | |
| "reward": 1.121875, | |
| "reward_std": 0.21878602355718613, | |
| "rewards/accuracy_reward": 0.1484375, | |
| "rewards/format_reward": 0.9734375, | |
| "step": 665 | |
| }, | |
| { | |
| "completion_length": 90.246875, | |
| "epoch": 0.29596090665635955, | |
| "grad_norm": 0.4546065330505371, | |
| "kl": 0.46005859375, | |
| "learning_rate": 1.77533010398805e-05, | |
| "loss": 0.0184, | |
| "reward": 1.0796875, | |
| "reward_std": 0.19561193585395814, | |
| "rewards/accuracy_reward": 0.109375, | |
| "rewards/format_reward": 0.9703125, | |
| "step": 670 | |
| }, | |
| { | |
| "completion_length": 108.37578125, | |
| "epoch": 0.29816957013886974, | |
| "grad_norm": 0.4939492344856262, | |
| "kl": 0.40859375, | |
| "learning_rate": 1.7704345521698057e-05, | |
| "loss": 0.0163, | |
| "reward": 1.0984375, | |
| "reward_std": 0.2110065519809723, | |
| "rewards/accuracy_reward": 0.13515625, | |
| "rewards/format_reward": 0.96328125, | |
| "step": 675 | |
| }, | |
| { | |
| "completion_length": 116.196875, | |
| "epoch": 0.3003782336213799, | |
| "grad_norm": 0.4660269021987915, | |
| "kl": 0.397998046875, | |
| "learning_rate": 1.765493142067076e-05, | |
| "loss": 0.0159, | |
| "reward": 1.14140625, | |
| "reward_std": 0.23255243562161923, | |
| "rewards/accuracy_reward": 0.1765625, | |
| "rewards/format_reward": 0.96484375, | |
| "step": 680 | |
| }, | |
| { | |
| "completion_length": 104.7328125, | |
| "epoch": 0.30258689710389003, | |
| "grad_norm": 0.5599631071090698, | |
| "kl": 0.39521484375, | |
| "learning_rate": 1.7605061678055453e-05, | |
| "loss": 0.0158, | |
| "reward": 1.11953125, | |
| "reward_std": 0.17798166144639255, | |
| "rewards/accuracy_reward": 0.1359375, | |
| "rewards/format_reward": 0.98359375, | |
| "step": 685 | |
| }, | |
| { | |
| "completion_length": 129.27890625, | |
| "epoch": 0.30479556058640017, | |
| "grad_norm": 0.4298873543739319, | |
| "kl": 0.3538818359375, | |
| "learning_rate": 1.7554739262229965e-05, | |
| "loss": 0.0142, | |
| "reward": 1.12265625, | |
| "reward_std": 0.25020663160830736, | |
| "rewards/accuracy_reward": 0.16328125, | |
| "rewards/format_reward": 0.959375, | |
| "step": 690 | |
| }, | |
| { | |
| "completion_length": 126.02890625, | |
| "epoch": 0.3070042240689103, | |
| "grad_norm": 0.4924304485321045, | |
| "kl": 0.373681640625, | |
| "learning_rate": 1.7503967168516426e-05, | |
| "loss": 0.015, | |
| "reward": 1.11953125, | |
| "reward_std": 0.2316643577069044, | |
| "rewards/accuracy_reward": 0.159375, | |
| "rewards/format_reward": 0.96015625, | |
| "step": 695 | |
| }, | |
| { | |
| "completion_length": 112.05, | |
| "epoch": 0.30921288755142046, | |
| "grad_norm": 0.5005078315734863, | |
| "kl": 0.364013671875, | |
| "learning_rate": 1.7452748419002968e-05, | |
| "loss": 0.0146, | |
| "reward": 1.14296875, | |
| "reward_std": 0.20688416287302971, | |
| "rewards/accuracy_reward": 0.17109375, | |
| "rewards/format_reward": 0.971875, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 0.30921288755142046, | |
| "eval_completion_length": 114.4858334350586, | |
| "eval_kl": 0.38703125, | |
| "eval_loss": 0.015563694760203362, | |
| "eval_reward": 1.1183333349227906, | |
| "eval_reward_std": 0.22717599272727967, | |
| "eval_rewards/accuracy_reward": 0.15583333373069763, | |
| "eval_rewards/format_reward": 0.9625, | |
| "eval_runtime": 118.9296, | |
| "eval_samples_per_second": 0.832, | |
| "eval_steps_per_second": 0.034, | |
| "step": 700 | |
| }, | |
| { | |
| "completion_length": 116.096875, | |
| "epoch": 0.3114215510339306, | |
| "grad_norm": 0.4761113226413727, | |
| "kl": 0.3679443359375, | |
| "learning_rate": 1.740108606236385e-05, | |
| "loss": 0.0147, | |
| "reward": 1.1265625, | |
| "reward_std": 0.20904745440930128, | |
| "rewards/accuracy_reward": 0.16015625, | |
| "rewards/format_reward": 0.96640625, | |
| "step": 705 | |
| }, | |
| { | |
| "completion_length": 134.46640625, | |
| "epoch": 0.31363021451644074, | |
| "grad_norm": 0.7244411110877991, | |
| "kl": 0.38353271484375, | |
| "learning_rate": 1.7348983173677986e-05, | |
| "loss": 0.0153, | |
| "reward": 1.0765625, | |
| "reward_std": 0.23576183728873729, | |
| "rewards/accuracy_reward": 0.13515625, | |
| "rewards/format_reward": 0.94140625, | |
| "step": 710 | |
| }, | |
| { | |
| "completion_length": 99.53828125, | |
| "epoch": 0.3158388779989509, | |
| "grad_norm": 0.4390712380409241, | |
| "kl": 0.4088623046875, | |
| "learning_rate": 1.7296442854245915e-05, | |
| "loss": 0.0164, | |
| "reward": 1.1109375, | |
| "reward_std": 0.1941352991387248, | |
| "rewards/accuracy_reward": 0.14453125, | |
| "rewards/format_reward": 0.96640625, | |
| "step": 715 | |
| }, | |
| { | |
| "completion_length": 84.16640625, | |
| "epoch": 0.318047541481461, | |
| "grad_norm": 0.8809035420417786, | |
| "kl": 0.4468994140625, | |
| "learning_rate": 1.72434682314052e-05, | |
| "loss": 0.0179, | |
| "reward": 1.15390625, | |
| "reward_std": 0.1831468353047967, | |
| "rewards/accuracy_reward": 0.1765625, | |
| "rewards/format_reward": 0.97734375, | |
| "step": 720 | |
| }, | |
| { | |
| "completion_length": 82.83203125, | |
| "epoch": 0.32025620496397117, | |
| "grad_norm": 0.7408865690231323, | |
| "kl": 0.4578857421875, | |
| "learning_rate": 1.719006245834429e-05, | |
| "loss": 0.0183, | |
| "reward": 1.11328125, | |
| "reward_std": 0.16447940673679112, | |
| "rewards/accuracy_reward": 0.13828125, | |
| "rewards/format_reward": 0.975, | |
| "step": 725 | |
| }, | |
| { | |
| "completion_length": 124.11171875, | |
| "epoch": 0.3224648684464813, | |
| "grad_norm": 0.4459853172302246, | |
| "kl": 803.6734375, | |
| "learning_rate": 1.7136228713914805e-05, | |
| "loss": 32.0277, | |
| "reward": 1.0515625, | |
| "reward_std": 0.2270077530294657, | |
| "rewards/accuracy_reward": 0.10703125, | |
| "rewards/format_reward": 0.94453125, | |
| "step": 730 | |
| }, | |
| { | |
| "completion_length": 116.0078125, | |
| "epoch": 0.32467353192899145, | |
| "grad_norm": 0.8283806443214417, | |
| "kl": 0.4873046875, | |
| "learning_rate": 1.7081970202442363e-05, | |
| "loss": 0.0195, | |
| "reward": 1.10625, | |
| "reward_std": 0.24761096592992543, | |
| "rewards/accuracy_reward": 0.15390625, | |
| "rewards/format_reward": 0.95234375, | |
| "step": 735 | |
| }, | |
| { | |
| "completion_length": 75.5578125, | |
| "epoch": 0.3268821954115016, | |
| "grad_norm": 0.5588904023170471, | |
| "kl": 0.44139404296875, | |
| "learning_rate": 1.7027290153535826e-05, | |
| "loss": 0.0177, | |
| "reward": 1.16015625, | |
| "reward_std": 0.16179091222584246, | |
| "rewards/accuracy_reward": 0.165625, | |
| "rewards/format_reward": 0.99453125, | |
| "step": 740 | |
| }, | |
| { | |
| "completion_length": 114.0390625, | |
| "epoch": 0.32909085889401174, | |
| "grad_norm": 0.43830907344818115, | |
| "kl": 0.3618896484375, | |
| "learning_rate": 1.6972191821895065e-05, | |
| "loss": 0.0145, | |
| "reward": 1.10390625, | |
| "reward_std": 0.14213568177074193, | |
| "rewards/accuracy_reward": 0.11953125, | |
| "rewards/format_reward": 0.984375, | |
| "step": 745 | |
| }, | |
| { | |
| "completion_length": 154.39921875, | |
| "epoch": 0.3312995223765219, | |
| "grad_norm": 0.34231725335121155, | |
| "kl": 0.31685791015625, | |
| "learning_rate": 1.691667848711723e-05, | |
| "loss": 0.0127, | |
| "reward": 1.10546875, | |
| "reward_std": 0.19847314581274986, | |
| "rewards/accuracy_reward": 0.128125, | |
| "rewards/format_reward": 0.97734375, | |
| "step": 750 | |
| }, | |
| { | |
| "completion_length": 178.4203125, | |
| "epoch": 0.3335081858590321, | |
| "grad_norm": 0.524643063545227, | |
| "kl": 3.5463134765625, | |
| "learning_rate": 1.686075345350156e-05, | |
| "loss": 0.1422, | |
| "reward": 1.0859375, | |
| "reward_std": 0.2875434797257185, | |
| "rewards/accuracy_reward": 0.14609375, | |
| "rewards/format_reward": 0.93984375, | |
| "step": 755 | |
| }, | |
| { | |
| "completion_length": 147.73203125, | |
| "epoch": 0.3357168493415422, | |
| "grad_norm": 0.3619137704372406, | |
| "kl": 0.3319091796875, | |
| "learning_rate": 1.6804420049852676e-05, | |
| "loss": 0.0133, | |
| "reward": 1.1296875, | |
| "reward_std": 0.23654117435216904, | |
| "rewards/accuracy_reward": 0.165625, | |
| "rewards/format_reward": 0.9640625, | |
| "step": 760 | |
| }, | |
| { | |
| "completion_length": 121.4109375, | |
| "epoch": 0.33792551282405237, | |
| "grad_norm": 0.36858057975769043, | |
| "kl": 0.32564697265625, | |
| "learning_rate": 1.6747681629282468e-05, | |
| "loss": 0.013, | |
| "reward": 1.16875, | |
| "reward_std": 0.2164825988933444, | |
| "rewards/accuracy_reward": 0.18671875, | |
| "rewards/format_reward": 0.98203125, | |
| "step": 765 | |
| }, | |
| { | |
| "completion_length": 108.2828125, | |
| "epoch": 0.3401341763065625, | |
| "grad_norm": 0.4734199047088623, | |
| "kl": 0.35654296875, | |
| "learning_rate": 1.6690541569010474e-05, | |
| "loss": 0.0143, | |
| "reward": 1.13828125, | |
| "reward_std": 0.20721396785229446, | |
| "rewards/accuracy_reward": 0.15859375, | |
| "rewards/format_reward": 0.9796875, | |
| "step": 770 | |
| }, | |
| { | |
| "completion_length": 126.0984375, | |
| "epoch": 0.34234283978907265, | |
| "grad_norm": 0.47700235247612, | |
| "kl": 0.3507568359375, | |
| "learning_rate": 1.6633003270162903e-05, | |
| "loss": 0.014, | |
| "reward": 1.1484375, | |
| "reward_std": 0.20858664382249117, | |
| "rewards/accuracy_reward": 0.1734375, | |
| "rewards/format_reward": 0.975, | |
| "step": 775 | |
| }, | |
| { | |
| "completion_length": 166.13125, | |
| "epoch": 0.3445515032715828, | |
| "grad_norm": 0.39217832684516907, | |
| "kl": 0.35064697265625, | |
| "learning_rate": 1.6575070157570152e-05, | |
| "loss": 0.014, | |
| "reward": 1.13515625, | |
| "reward_std": 0.2673689084127545, | |
| "rewards/accuracy_reward": 0.18359375, | |
| "rewards/format_reward": 0.9515625, | |
| "step": 780 | |
| }, | |
| { | |
| "completion_length": 169.01328125, | |
| "epoch": 0.34676016675409294, | |
| "grad_norm": 0.39597201347351074, | |
| "kl": 0.34754638671875, | |
| "learning_rate": 1.6516745679562977e-05, | |
| "loss": 0.0139, | |
| "reward": 1.065625, | |
| "reward_std": 0.2720937805250287, | |
| "rewards/accuracy_reward": 0.1328125, | |
| "rewards/format_reward": 0.9328125, | |
| "step": 785 | |
| }, | |
| { | |
| "completion_length": 182.8359375, | |
| "epoch": 0.3489688302366031, | |
| "grad_norm": 0.47019919753074646, | |
| "kl": 0.34156494140625, | |
| "learning_rate": 1.6458033307767217e-05, | |
| "loss": 0.0137, | |
| "reward": 1.08046875, | |
| "reward_std": 0.3094723552465439, | |
| "rewards/accuracy_reward": 0.16640625, | |
| "rewards/format_reward": 0.9140625, | |
| "step": 790 | |
| }, | |
| { | |
| "completion_length": 129.996875, | |
| "epoch": 0.3511774937191132, | |
| "grad_norm": 1.1053619384765625, | |
| "kl": 0.59439697265625, | |
| "learning_rate": 1.6398936536897182e-05, | |
| "loss": 0.0238, | |
| "reward": 1.09609375, | |
| "reward_std": 0.27469405010342596, | |
| "rewards/accuracy_reward": 0.1703125, | |
| "rewards/format_reward": 0.92578125, | |
| "step": 795 | |
| }, | |
| { | |
| "completion_length": 56.90234375, | |
| "epoch": 0.35338615720162336, | |
| "grad_norm": 0.5199185609817505, | |
| "kl": 0.6048828125, | |
| "learning_rate": 1.6339458884547613e-05, | |
| "loss": 0.0242, | |
| "reward": 1.14140625, | |
| "reward_std": 0.15660744477063418, | |
| "rewards/accuracy_reward": 0.15390625, | |
| "rewards/format_reward": 0.9875, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 0.35338615720162336, | |
| "eval_completion_length": 43.742083358764646, | |
| "eval_kl": 0.65765625, | |
| "eval_loss": 0.026582278311252594, | |
| "eval_reward": 1.1675, | |
| "eval_reward_std": 0.16831182479858398, | |
| "eval_rewards/accuracy_reward": 0.1725, | |
| "eval_rewards/format_reward": 0.995, | |
| "eval_runtime": 49.7219, | |
| "eval_samples_per_second": 1.991, | |
| "eval_steps_per_second": 0.08, | |
| "step": 800 | |
| }, | |
| { | |
| "completion_length": 64.53046875, | |
| "epoch": 0.3555948206841335, | |
| "grad_norm": 0.4935765266418457, | |
| "kl": 0.5750732421875, | |
| "learning_rate": 1.6279603890984315e-05, | |
| "loss": 0.023, | |
| "reward": 1.14765625, | |
| "reward_std": 0.20158261395990848, | |
| "rewards/accuracy_reward": 0.16015625, | |
| "rewards/format_reward": 0.9875, | |
| "step": 805 | |
| }, | |
| { | |
| "completion_length": 93.0140625, | |
| "epoch": 0.35780348416664365, | |
| "grad_norm": 0.5734583139419556, | |
| "kl": 0.4998291015625, | |
| "learning_rate": 1.6219375118933442e-05, | |
| "loss": 0.02, | |
| "reward": 1.14765625, | |
| "reward_std": 0.2238040953874588, | |
| "rewards/accuracy_reward": 0.17890625, | |
| "rewards/format_reward": 0.96875, | |
| "step": 810 | |
| }, | |
| { | |
| "completion_length": 134.42578125, | |
| "epoch": 0.3600121476491538, | |
| "grad_norm": 0.49560049176216125, | |
| "kl": 0.4668212890625, | |
| "learning_rate": 1.6158776153369406e-05, | |
| "loss": 0.0187, | |
| "reward": 1.0765625, | |
| "reward_std": 0.29757872987538575, | |
| "rewards/accuracy_reward": 0.14453125, | |
| "rewards/format_reward": 0.93203125, | |
| "step": 815 | |
| }, | |
| { | |
| "completion_length": 158.89296875, | |
| "epoch": 0.36222081113166393, | |
| "grad_norm": 0.5122950077056885, | |
| "kl": 0.46318359375, | |
| "learning_rate": 1.609781060130152e-05, | |
| "loss": 0.0185, | |
| "reward": 1.01171875, | |
| "reward_std": 0.29846451599150897, | |
| "rewards/accuracy_reward": 0.1046875, | |
| "rewards/format_reward": 0.90703125, | |
| "step": 820 | |
| }, | |
| { | |
| "completion_length": 116.4859375, | |
| "epoch": 0.3644294746141741, | |
| "grad_norm": 0.5021364092826843, | |
| "kl": 0.4975830078125, | |
| "learning_rate": 1.6036482091559287e-05, | |
| "loss": 0.0199, | |
| "reward": 1.12109375, | |
| "reward_std": 0.2664882358163595, | |
| "rewards/accuracy_reward": 0.165625, | |
| "rewards/format_reward": 0.95546875, | |
| "step": 825 | |
| }, | |
| { | |
| "completion_length": 131.8703125, | |
| "epoch": 0.3666381380966842, | |
| "grad_norm": 0.6027432680130005, | |
| "kl": 0.4765380859375, | |
| "learning_rate": 1.5974794274576394e-05, | |
| "loss": 0.0191, | |
| "reward": 1.12578125, | |
| "reward_std": 0.28144511561840774, | |
| "rewards/accuracy_reward": 0.17265625, | |
| "rewards/format_reward": 0.953125, | |
| "step": 830 | |
| }, | |
| { | |
| "completion_length": 134.915625, | |
| "epoch": 0.3688468015791944, | |
| "grad_norm": 0.49983125925064087, | |
| "kl": 0.4760986328125, | |
| "learning_rate": 1.5912750822173446e-05, | |
| "loss": 0.019, | |
| "reward": 1.134375, | |
| "reward_std": 0.29985770154744384, | |
| "rewards/accuracy_reward": 0.20078125, | |
| "rewards/format_reward": 0.93359375, | |
| "step": 835 | |
| }, | |
| { | |
| "completion_length": 91.075, | |
| "epoch": 0.37105546506170456, | |
| "grad_norm": 0.4414427876472473, | |
| "kl": 0.6185791015625, | |
| "learning_rate": 1.5850355427339398e-05, | |
| "loss": 0.0247, | |
| "reward": 1.1, | |
| "reward_std": 0.2404505180194974, | |
| "rewards/accuracy_reward": 0.14375, | |
| "rewards/format_reward": 0.95625, | |
| "step": 840 | |
| }, | |
| { | |
| "completion_length": 61.1109375, | |
| "epoch": 0.3732641285442147, | |
| "grad_norm": 0.46862563490867615, | |
| "kl": 0.6706787109375, | |
| "learning_rate": 1.5787611804011735e-05, | |
| "loss": 0.0268, | |
| "reward": 1.16875, | |
| "reward_std": 0.17206176780164242, | |
| "rewards/accuracy_reward": 0.18828125, | |
| "rewards/format_reward": 0.98046875, | |
| "step": 845 | |
| }, | |
| { | |
| "completion_length": 78.2640625, | |
| "epoch": 0.37547279202672484, | |
| "grad_norm": 0.7195978164672852, | |
| "kl": 0.6672119140625, | |
| "learning_rate": 1.5724523686855423e-05, | |
| "loss": 0.0267, | |
| "reward": 1.13203125, | |
| "reward_std": 0.1878314608708024, | |
| "rewards/accuracy_reward": 0.16328125, | |
| "rewards/format_reward": 0.96875, | |
| "step": 850 | |
| }, | |
| { | |
| "completion_length": 103.509375, | |
| "epoch": 0.377681455509235, | |
| "grad_norm": 0.5233339071273804, | |
| "kl": 0.6544677734375, | |
| "learning_rate": 1.56610948310406e-05, | |
| "loss": 0.0262, | |
| "reward": 1.13359375, | |
| "reward_std": 0.24245705269277096, | |
| "rewards/accuracy_reward": 0.1734375, | |
| "rewards/format_reward": 0.96015625, | |
| "step": 855 | |
| }, | |
| { | |
| "completion_length": 97.459375, | |
| "epoch": 0.37989011899174513, | |
| "grad_norm": 0.4953531324863434, | |
| "kl": 0.575927734375, | |
| "learning_rate": 1.5597329012019065e-05, | |
| "loss": 0.023, | |
| "reward": 1.15859375, | |
| "reward_std": 0.2319757068529725, | |
| "rewards/accuracy_reward": 0.19609375, | |
| "rewards/format_reward": 0.9625, | |
| "step": 860 | |
| }, | |
| { | |
| "completion_length": 126.03515625, | |
| "epoch": 0.38209878247425527, | |
| "grad_norm": 1.7415945529937744, | |
| "kl": 0.5650390625, | |
| "learning_rate": 1.5533230025299547e-05, | |
| "loss": 0.0226, | |
| "reward": 1.0765625, | |
| "reward_std": 0.25536851994693277, | |
| "rewards/accuracy_reward": 0.13828125, | |
| "rewards/format_reward": 0.93828125, | |
| "step": 865 | |
| }, | |
| { | |
| "completion_length": 92.45625, | |
| "epoch": 0.3843074459567654, | |
| "grad_norm": 0.5169078707695007, | |
| "kl": 0.5216796875, | |
| "learning_rate": 1.5468801686221793e-05, | |
| "loss": 0.0209, | |
| "reward": 1.14765625, | |
| "reward_std": 0.21833606492727994, | |
| "rewards/accuracy_reward": 0.18203125, | |
| "rewards/format_reward": 0.965625, | |
| "step": 870 | |
| }, | |
| { | |
| "completion_length": 111.06875, | |
| "epoch": 0.38651610943927556, | |
| "grad_norm": 1.1891313791275024, | |
| "kl": 0.5312255859375, | |
| "learning_rate": 1.540404782972946e-05, | |
| "loss": 0.0213, | |
| "reward": 1.08828125, | |
| "reward_std": 0.2383767468854785, | |
| "rewards/accuracy_reward": 0.1390625, | |
| "rewards/format_reward": 0.94921875, | |
| "step": 875 | |
| }, | |
| { | |
| "completion_length": 96.92109375, | |
| "epoch": 0.3887247729217857, | |
| "grad_norm": 0.49358442425727844, | |
| "kl": 0.54716796875, | |
| "learning_rate": 1.5338972310141863e-05, | |
| "loss": 0.0219, | |
| "reward": 1.14296875, | |
| "reward_std": 0.2417955880984664, | |
| "rewards/accuracy_reward": 0.18359375, | |
| "rewards/format_reward": 0.959375, | |
| "step": 880 | |
| }, | |
| { | |
| "completion_length": 103.10234375, | |
| "epoch": 0.39093343640429584, | |
| "grad_norm": 0.45550188422203064, | |
| "kl": 0.5120849609375, | |
| "learning_rate": 1.5273579000924545e-05, | |
| "loss": 0.0205, | |
| "reward": 1.12265625, | |
| "reward_std": 0.230152091011405, | |
| "rewards/accuracy_reward": 0.1609375, | |
| "rewards/format_reward": 0.96171875, | |
| "step": 885 | |
| }, | |
| { | |
| "completion_length": 79.07578125, | |
| "epoch": 0.393142099886806, | |
| "grad_norm": 0.4740158021450043, | |
| "kl": 0.751025390625, | |
| "learning_rate": 1.5207871794458715e-05, | |
| "loss": 0.03, | |
| "reward": 1.17578125, | |
| "reward_std": 0.19955000430345535, | |
| "rewards/accuracy_reward": 0.19375, | |
| "rewards/format_reward": 0.98203125, | |
| "step": 890 | |
| }, | |
| { | |
| "completion_length": 105.76171875, | |
| "epoch": 0.3953507633693161, | |
| "grad_norm": 0.37959805130958557, | |
| "kl": 0.492724609375, | |
| "learning_rate": 1.5141854601809583e-05, | |
| "loss": 0.0197, | |
| "reward": 1.15, | |
| "reward_std": 0.21266062185168266, | |
| "rewards/accuracy_reward": 0.18125, | |
| "rewards/format_reward": 0.96875, | |
| "step": 895 | |
| }, | |
| { | |
| "completion_length": 106.47109375, | |
| "epoch": 0.39755942685182627, | |
| "grad_norm": 0.4141887128353119, | |
| "kl": 0.47958984375, | |
| "learning_rate": 1.5075531352493528e-05, | |
| "loss": 0.0192, | |
| "reward": 1.125, | |
| "reward_std": 0.23161781765520573, | |
| "rewards/accuracy_reward": 0.1609375, | |
| "rewards/format_reward": 0.9640625, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 0.39755942685182627, | |
| "eval_completion_length": 128.19291748046874, | |
| "eval_kl": 0.500859375, | |
| "eval_loss": 0.02018117904663086, | |
| "eval_reward": 1.1125, | |
| "eval_reward_std": 0.26222177892923354, | |
| "eval_rewards/accuracy_reward": 0.1604166667163372, | |
| "eval_rewards/format_reward": 0.9520833349227905, | |
| "eval_runtime": 156.2795, | |
| "eval_samples_per_second": 0.633, | |
| "eval_steps_per_second": 0.026, | |
| "step": 900 | |
| }, | |
| { | |
| "completion_length": 115.94921875, | |
| "epoch": 0.3997680903343364, | |
| "grad_norm": 0.4243628680706024, | |
| "kl": 0.49537353515625, | |
| "learning_rate": 1.5008905994244255e-05, | |
| "loss": 0.0198, | |
| "reward": 1.10234375, | |
| "reward_std": 0.225167977437377, | |
| "rewards/accuracy_reward": 0.15078125, | |
| "rewards/format_reward": 0.9515625, | |
| "step": 905 | |
| }, | |
| { | |
| "completion_length": 72.07578125, | |
| "epoch": 0.40197675381684655, | |
| "grad_norm": 0.4959592819213867, | |
| "kl": 0.5396484375, | |
| "learning_rate": 1.4941982492777749e-05, | |
| "loss": 0.0216, | |
| "reward": 1.1703125, | |
| "reward_std": 0.1914088014513254, | |
| "rewards/accuracy_reward": 0.18984375, | |
| "rewards/format_reward": 0.98046875, | |
| "step": 910 | |
| }, | |
| { | |
| "completion_length": 84.55390625, | |
| "epoch": 0.40418541729935675, | |
| "grad_norm": 0.6311984062194824, | |
| "kl": 0.56240234375, | |
| "learning_rate": 1.4874764831556285e-05, | |
| "loss": 0.0225, | |
| "reward": 1.15625, | |
| "reward_std": 0.20792635306715965, | |
| "rewards/accuracy_reward": 0.1890625, | |
| "rewards/format_reward": 0.9671875, | |
| "step": 915 | |
| }, | |
| { | |
| "completion_length": 115.203125, | |
| "epoch": 0.4063940807818669, | |
| "grad_norm": 0.5944263935089111, | |
| "kl": 0.4927734375, | |
| "learning_rate": 1.4807257011551297e-05, | |
| "loss": 0.0197, | |
| "reward": 1.15703125, | |
| "reward_std": 0.26277261301875116, | |
| "rewards/accuracy_reward": 0.21015625, | |
| "rewards/format_reward": 0.946875, | |
| "step": 920 | |
| }, | |
| { | |
| "completion_length": 111.07578125, | |
| "epoch": 0.40860274426437704, | |
| "grad_norm": 0.7743093967437744, | |
| "kl": 0.488720703125, | |
| "learning_rate": 1.4739463051005221e-05, | |
| "loss": 0.0196, | |
| "reward": 1.103125, | |
| "reward_std": 0.2299284663051367, | |
| "rewards/accuracy_reward": 0.14453125, | |
| "rewards/format_reward": 0.95859375, | |
| "step": 925 | |
| }, | |
| { | |
| "completion_length": 95.52109375, | |
| "epoch": 0.4108114077468872, | |
| "grad_norm": 0.7759421467781067, | |
| "kl": 0.5315185546875, | |
| "learning_rate": 1.4671386985192327e-05, | |
| "loss": 0.0213, | |
| "reward": 1.1671875, | |
| "reward_std": 0.1855922631919384, | |
| "rewards/accuracy_reward": 0.17890625, | |
| "rewards/format_reward": 0.98828125, | |
| "step": 930 | |
| }, | |
| { | |
| "completion_length": 108.65546875, | |
| "epoch": 0.4130200712293973, | |
| "grad_norm": 0.4987127482891083, | |
| "kl": 0.45135498046875, | |
| "learning_rate": 1.460303286617854e-05, | |
| "loss": 0.0181, | |
| "reward": 1.1625, | |
| "reward_std": 0.17659219540655613, | |
| "rewards/accuracy_reward": 0.16640625, | |
| "rewards/format_reward": 0.99609375, | |
| "step": 935 | |
| }, | |
| { | |
| "completion_length": 144.80078125, | |
| "epoch": 0.41522873471190747, | |
| "grad_norm": 0.5061682462692261, | |
| "kl": 5.969921875, | |
| "learning_rate": 1.4534404762580239e-05, | |
| "loss": 0.2394, | |
| "reward": 1.16484375, | |
| "reward_std": 0.21013734135776757, | |
| "rewards/accuracy_reward": 0.17890625, | |
| "rewards/format_reward": 0.9859375, | |
| "step": 940 | |
| }, | |
| { | |
| "completion_length": 113.9171875, | |
| "epoch": 0.4174373981944176, | |
| "grad_norm": 0.35856854915618896, | |
| "kl": 0.4087646484375, | |
| "learning_rate": 1.4465506759322074e-05, | |
| "loss": 0.0164, | |
| "reward": 1.196875, | |
| "reward_std": 0.16952291671186687, | |
| "rewards/accuracy_reward": 0.2015625, | |
| "rewards/format_reward": 0.9953125, | |
| "step": 945 | |
| }, | |
| { | |
| "completion_length": 112.64765625, | |
| "epoch": 0.41964606167692775, | |
| "grad_norm": 0.3706095218658447, | |
| "kl": 0.4105224609375, | |
| "learning_rate": 1.4396342957393844e-05, | |
| "loss": 0.0164, | |
| "reward": 1.17265625, | |
| "reward_std": 0.1865989552810788, | |
| "rewards/accuracy_reward": 0.18203125, | |
| "rewards/format_reward": 0.990625, | |
| "step": 950 | |
| }, | |
| { | |
| "completion_length": 137.5375, | |
| "epoch": 0.4218547251594379, | |
| "grad_norm": 1.2025071382522583, | |
| "kl": 0.38111572265625, | |
| "learning_rate": 1.4326917473606368e-05, | |
| "loss": 0.0152, | |
| "reward": 1.09921875, | |
| "reward_std": 0.18654303345829248, | |
| "rewards/accuracy_reward": 0.121875, | |
| "rewards/format_reward": 0.97734375, | |
| "step": 955 | |
| }, | |
| { | |
| "completion_length": 160.10234375, | |
| "epoch": 0.42406338864194804, | |
| "grad_norm": 3.2986419200897217, | |
| "kl": 0.59927978515625, | |
| "learning_rate": 1.4257234440346469e-05, | |
| "loss": 0.024, | |
| "reward": 1.1609375, | |
| "reward_std": 0.23215112816542388, | |
| "rewards/accuracy_reward": 0.18828125, | |
| "rewards/format_reward": 0.97265625, | |
| "step": 960 | |
| }, | |
| { | |
| "completion_length": 167.0, | |
| "epoch": 0.4262720521244582, | |
| "grad_norm": 0.610000729560852, | |
| "kl": 0.558154296875, | |
| "learning_rate": 1.4187298005330976e-05, | |
| "loss": 0.0223, | |
| "reward": 1.1421875, | |
| "reward_std": 0.31437007896602154, | |
| "rewards/accuracy_reward": 0.2078125, | |
| "rewards/format_reward": 0.934375, | |
| "step": 965 | |
| }, | |
| { | |
| "completion_length": 104.16484375, | |
| "epoch": 0.4284807156069683, | |
| "grad_norm": 8.364782333374023, | |
| "kl": 2.54658203125, | |
| "learning_rate": 1.4117112331359865e-05, | |
| "loss": 0.1018, | |
| "reward": 1.115625, | |
| "reward_std": 0.22522333543747663, | |
| "rewards/accuracy_reward": 0.1671875, | |
| "rewards/format_reward": 0.9484375, | |
| "step": 970 | |
| }, | |
| { | |
| "completion_length": 63.9890625, | |
| "epoch": 0.43068937908947846, | |
| "grad_norm": 0.708032488822937, | |
| "kl": 1.1080322265625, | |
| "learning_rate": 1.4046681596068468e-05, | |
| "loss": 0.0444, | |
| "reward": 1.171875, | |
| "reward_std": 0.16694873906672, | |
| "rewards/accuracy_reward": 0.19609375, | |
| "rewards/format_reward": 0.97578125, | |
| "step": 975 | |
| }, | |
| { | |
| "completion_length": 60.2453125, | |
| "epoch": 0.4328980425719886, | |
| "grad_norm": 1.7320284843444824, | |
| "kl": 1.096923828125, | |
| "learning_rate": 1.3976009991678803e-05, | |
| "loss": 0.0439, | |
| "reward": 1.1546875, | |
| "reward_std": 0.17392283789813517, | |
| "rewards/accuracy_reward": 0.17578125, | |
| "rewards/format_reward": 0.97890625, | |
| "step": 980 | |
| }, | |
| { | |
| "completion_length": 68.43359375, | |
| "epoch": 0.43510670605449875, | |
| "grad_norm": 0.7800001502037048, | |
| "kl": 1.552880859375, | |
| "learning_rate": 1.390510172475005e-05, | |
| "loss": 0.0621, | |
| "reward": 1.16875, | |
| "reward_std": 0.20654550790786744, | |
| "rewards/accuracy_reward": 0.1921875, | |
| "rewards/format_reward": 0.9765625, | |
| "step": 985 | |
| }, | |
| { | |
| "completion_length": 57.4203125, | |
| "epoch": 0.4373153695370089, | |
| "grad_norm": 0.8300907015800476, | |
| "kl": 0.723046875, | |
| "learning_rate": 1.383396101592817e-05, | |
| "loss": 0.0289, | |
| "reward": 1.13984375, | |
| "reward_std": 0.17493642698973416, | |
| "rewards/accuracy_reward": 0.1546875, | |
| "rewards/format_reward": 0.98515625, | |
| "step": 990 | |
| }, | |
| { | |
| "completion_length": 73.23671875, | |
| "epoch": 0.4395240330195191, | |
| "grad_norm": 2.696807384490967, | |
| "kl": 1.3876220703125, | |
| "learning_rate": 1.3762592099694666e-05, | |
| "loss": 0.0555, | |
| "reward": 1.1109375, | |
| "reward_std": 0.2104167841374874, | |
| "rewards/accuracy_reward": 0.14140625, | |
| "rewards/format_reward": 0.96953125, | |
| "step": 995 | |
| }, | |
| { | |
| "completion_length": 132.46328125, | |
| "epoch": 0.44173269650202923, | |
| "grad_norm": 3.0761334896087646, | |
| "kl": 3.6865478515625, | |
| "learning_rate": 1.3690999224114547e-05, | |
| "loss": 0.1477, | |
| "reward": 1.0703125, | |
| "reward_std": 0.2970853915438056, | |
| "rewards/accuracy_reward": 0.1515625, | |
| "rewards/format_reward": 0.91875, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.44173269650202923, | |
| "eval_completion_length": 104.09083335876466, | |
| "eval_kl": 1.039375, | |
| "eval_loss": 0.03946812078356743, | |
| "eval_reward": 1.1433333349227905, | |
| "eval_reward_std": 0.2386816355586052, | |
| "eval_rewards/accuracy_reward": 0.1854166667163372, | |
| "eval_rewards/format_reward": 0.9579166674613953, | |
| "eval_runtime": 111.3439, | |
| "eval_samples_per_second": 0.889, | |
| "eval_steps_per_second": 0.036, | |
| "step": 1000 | |
| }, | |
| { | |
| "completion_length": 103.2375, | |
| "epoch": 0.4439413599845394, | |
| "grad_norm": 1.7991042137145996, | |
| "kl": 1.09813232421875, | |
| "learning_rate": 1.361918665058348e-05, | |
| "loss": 0.0439, | |
| "reward": 1.13125, | |
| "reward_std": 0.21907511353492737, | |
| "rewards/accuracy_reward": 0.16015625, | |
| "rewards/format_reward": 0.97109375, | |
| "step": 1005 | |
| }, | |
| { | |
| "completion_length": 106.32421875, | |
| "epoch": 0.4461500234670495, | |
| "grad_norm": 1.2199746370315552, | |
| "kl": 0.9680908203125, | |
| "learning_rate": 1.354715865357411e-05, | |
| "loss": 0.0388, | |
| "reward": 1.15390625, | |
| "reward_std": 0.2513147694990039, | |
| "rewards/accuracy_reward": 0.1828125, | |
| "rewards/format_reward": 0.97109375, | |
| "step": 1010 | |
| }, | |
| { | |
| "completion_length": 84.5375, | |
| "epoch": 0.44835868694955966, | |
| "grad_norm": 0.8461725115776062, | |
| "kl": 0.64466552734375, | |
| "learning_rate": 1.3474919520381673e-05, | |
| "loss": 0.0258, | |
| "reward": 1.1921875, | |
| "reward_std": 0.20436475947499275, | |
| "rewards/accuracy_reward": 0.20234375, | |
| "rewards/format_reward": 0.98984375, | |
| "step": 1015 | |
| }, | |
| { | |
| "completion_length": 95.0921875, | |
| "epoch": 0.4505673504320698, | |
| "grad_norm": 0.5356422066688538, | |
| "kl": 0.674560546875, | |
| "learning_rate": 1.3402473550868769e-05, | |
| "loss": 0.027, | |
| "reward": 1.178125, | |
| "reward_std": 0.23662711773067713, | |
| "rewards/accuracy_reward": 0.2046875, | |
| "rewards/format_reward": 0.9734375, | |
| "step": 1020 | |
| }, | |
| { | |
| "completion_length": 126.2546875, | |
| "epoch": 0.45277601391457994, | |
| "grad_norm": 0.44370850920677185, | |
| "kl": 0.50389404296875, | |
| "learning_rate": 1.3329825057209446e-05, | |
| "loss": 0.0202, | |
| "reward": 1.1453125, | |
| "reward_std": 0.2518596975132823, | |
| "rewards/accuracy_reward": 0.20078125, | |
| "rewards/format_reward": 0.94453125, | |
| "step": 1025 | |
| }, | |
| { | |
| "completion_length": 100.384375, | |
| "epoch": 0.4549846773970901, | |
| "grad_norm": 0.30460554361343384, | |
| "kl": 0.5373046875, | |
| "learning_rate": 1.3256978363632515e-05, | |
| "loss": 0.0215, | |
| "reward": 1.18984375, | |
| "reward_std": 0.21483363024890423, | |
| "rewards/accuracy_reward": 0.228125, | |
| "rewards/format_reward": 0.96171875, | |
| "step": 1030 | |
| }, | |
| { | |
| "completion_length": 106.22578125, | |
| "epoch": 0.45719334087960023, | |
| "grad_norm": 0.5371220707893372, | |
| "kl": 0.5292236328125, | |
| "learning_rate": 1.3183937806164174e-05, | |
| "loss": 0.0212, | |
| "reward": 1.18671875, | |
| "reward_std": 0.264132690615952, | |
| "rewards/accuracy_reward": 0.23125, | |
| "rewards/format_reward": 0.95546875, | |
| "step": 1035 | |
| }, | |
| { | |
| "completion_length": 90.2515625, | |
| "epoch": 0.45940200436211037, | |
| "grad_norm": 0.6759688854217529, | |
| "kl": 0.541845703125, | |
| "learning_rate": 1.3110707732369896e-05, | |
| "loss": 0.0217, | |
| "reward": 1.1765625, | |
| "reward_std": 0.22657935097813606, | |
| "rewards/accuracy_reward": 0.2109375, | |
| "rewards/format_reward": 0.965625, | |
| "step": 1040 | |
| }, | |
| { | |
| "completion_length": 129.69609375, | |
| "epoch": 0.4616106678446205, | |
| "grad_norm": 0.425448477268219, | |
| "kl": 0.549853515625, | |
| "learning_rate": 1.3037292501095674e-05, | |
| "loss": 0.022, | |
| "reward": 1.128125, | |
| "reward_std": 0.24908192362636328, | |
| "rewards/accuracy_reward": 0.1953125, | |
| "rewards/format_reward": 0.9328125, | |
| "step": 1045 | |
| }, | |
| { | |
| "completion_length": 100.40859375, | |
| "epoch": 0.46381933132713066, | |
| "grad_norm": 0.36143267154693604, | |
| "kl": 0.5515869140625, | |
| "learning_rate": 1.2963696482208552e-05, | |
| "loss": 0.0221, | |
| "reward": 1.16484375, | |
| "reward_std": 0.22955130971968174, | |
| "rewards/accuracy_reward": 0.1984375, | |
| "rewards/format_reward": 0.96640625, | |
| "step": 1050 | |
| }, | |
| { | |
| "completion_length": 86.01484375, | |
| "epoch": 0.4660279948096408, | |
| "grad_norm": 0.3947383463382721, | |
| "kl": 0.59249267578125, | |
| "learning_rate": 1.2889924056336531e-05, | |
| "loss": 0.0237, | |
| "reward": 1.15703125, | |
| "reward_std": 0.17950487434864043, | |
| "rewards/accuracy_reward": 0.1875, | |
| "rewards/format_reward": 0.96953125, | |
| "step": 1055 | |
| }, | |
| { | |
| "completion_length": 82.04375, | |
| "epoch": 0.46823665829215094, | |
| "grad_norm": 0.6859544515609741, | |
| "kl": 0.5910888671875, | |
| "learning_rate": 1.2815979614607818e-05, | |
| "loss": 0.0236, | |
| "reward": 1.1953125, | |
| "reward_std": 0.20693482737988234, | |
| "rewards/accuracy_reward": 0.21953125, | |
| "rewards/format_reward": 0.97578125, | |
| "step": 1060 | |
| }, | |
| { | |
| "completion_length": 95.7609375, | |
| "epoch": 0.4704453217746611, | |
| "grad_norm": 0.36446619033813477, | |
| "kl": 0.541748046875, | |
| "learning_rate": 1.274186755838945e-05, | |
| "loss": 0.0217, | |
| "reward": 1.19375, | |
| "reward_std": 0.2092124553397298, | |
| "rewards/accuracy_reward": 0.2203125, | |
| "rewards/format_reward": 0.9734375, | |
| "step": 1065 | |
| }, | |
| { | |
| "completion_length": 139.6734375, | |
| "epoch": 0.4726539852571712, | |
| "grad_norm": 0.5893868803977966, | |
| "kl": 0.471240234375, | |
| "learning_rate": 1.2667592299025331e-05, | |
| "loss": 0.0188, | |
| "reward": 1.1484375, | |
| "reward_std": 0.23675706721842288, | |
| "rewards/accuracy_reward": 0.19453125, | |
| "rewards/format_reward": 0.95390625, | |
| "step": 1070 | |
| }, | |
| { | |
| "completion_length": 116.73828125, | |
| "epoch": 0.4748626487396814, | |
| "grad_norm": 0.4791277348995209, | |
| "kl": 0.50758056640625, | |
| "learning_rate": 1.259315825757362e-05, | |
| "loss": 0.0203, | |
| "reward": 1.11796875, | |
| "reward_std": 0.20144069343805313, | |
| "rewards/accuracy_reward": 0.15, | |
| "rewards/format_reward": 0.96796875, | |
| "step": 1075 | |
| }, | |
| { | |
| "completion_length": 115.9921875, | |
| "epoch": 0.47707131222219157, | |
| "grad_norm": 0.5381553173065186, | |
| "kl": 0.49412841796875, | |
| "learning_rate": 1.251856986454363e-05, | |
| "loss": 0.0198, | |
| "reward": 1.19765625, | |
| "reward_std": 0.24683814216405153, | |
| "rewards/accuracy_reward": 0.23359375, | |
| "rewards/format_reward": 0.9640625, | |
| "step": 1080 | |
| }, | |
| { | |
| "completion_length": 122.90390625, | |
| "epoch": 0.4792799757047017, | |
| "grad_norm": 0.5010456442832947, | |
| "kl": 0.4677001953125, | |
| "learning_rate": 1.2443831559632065e-05, | |
| "loss": 0.0187, | |
| "reward": 1.15546875, | |
| "reward_std": 0.2554084826260805, | |
| "rewards/accuracy_reward": 0.1984375, | |
| "rewards/format_reward": 0.95703125, | |
| "step": 1085 | |
| }, | |
| { | |
| "completion_length": 151.96953125, | |
| "epoch": 0.48148863918721185, | |
| "grad_norm": 0.6068748235702515, | |
| "kl": 0.54783935546875, | |
| "learning_rate": 1.2368947791458785e-05, | |
| "loss": 0.0219, | |
| "reward": 1.14296875, | |
| "reward_std": 0.31096592992544175, | |
| "rewards/accuracy_reward": 0.21015625, | |
| "rewards/format_reward": 0.9328125, | |
| "step": 1090 | |
| }, | |
| { | |
| "completion_length": 125.5328125, | |
| "epoch": 0.483697302669722, | |
| "grad_norm": 0.37912383675575256, | |
| "kl": 0.45006103515625, | |
| "learning_rate": 1.2293923017302004e-05, | |
| "loss": 0.018, | |
| "reward": 1.1578125, | |
| "reward_std": 0.2355531807988882, | |
| "rewards/accuracy_reward": 0.20859375, | |
| "rewards/format_reward": 0.94921875, | |
| "step": 1095 | |
| }, | |
| { | |
| "completion_length": 93.94375, | |
| "epoch": 0.48590596615223214, | |
| "grad_norm": 0.45108431577682495, | |
| "kl": 0.535595703125, | |
| "learning_rate": 1.221876170283298e-05, | |
| "loss": 0.0214, | |
| "reward": 1.16640625, | |
| "reward_std": 0.1948365481570363, | |
| "rewards/accuracy_reward": 0.19609375, | |
| "rewards/format_reward": 0.9703125, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 0.48590596615223214, | |
| "eval_completion_length": 87.3554167175293, | |
| "eval_kl": 0.49109375, | |
| "eval_loss": 0.019712308421730995, | |
| "eval_reward": 1.206666669845581, | |
| "eval_reward_std": 0.194391932785511, | |
| "eval_rewards/accuracy_reward": 0.2316666667163372, | |
| "eval_rewards/format_reward": 0.975, | |
| "eval_runtime": 107.5828, | |
| "eval_samples_per_second": 0.92, | |
| "eval_steps_per_second": 0.037, | |
| "step": 1100 | |
| }, | |
| { | |
| "completion_length": 93.66171875, | |
| "epoch": 0.4881146296347423, | |
| "grad_norm": 0.3893759548664093, | |
| "kl": 0.50633544921875, | |
| "learning_rate": 1.214346832185021e-05, | |
| "loss": 0.0203, | |
| "reward": 1.18046875, | |
| "reward_std": 0.19928432293236256, | |
| "rewards/accuracy_reward": 0.20625, | |
| "rewards/format_reward": 0.97421875, | |
| "step": 1105 | |
| }, | |
| { | |
| "completion_length": 113.6171875, | |
| "epoch": 0.4903232931172524, | |
| "grad_norm": 0.5400230288505554, | |
| "kl": 0.63984375, | |
| "learning_rate": 1.2068047356013136e-05, | |
| "loss": 0.0256, | |
| "reward": 1.13359375, | |
| "reward_std": 0.24417215697467326, | |
| "rewards/accuracy_reward": 0.18203125, | |
| "rewards/format_reward": 0.9515625, | |
| "step": 1110 | |
| }, | |
| { | |
| "completion_length": 112.42421875, | |
| "epoch": 0.49253195659976257, | |
| "grad_norm": 0.4314129054546356, | |
| "kl": 0.5706787109375, | |
| "learning_rate": 1.1992503294575385e-05, | |
| "loss": 0.0228, | |
| "reward": 1.16171875, | |
| "reward_std": 0.21959545239806175, | |
| "rewards/accuracy_reward": 0.203125, | |
| "rewards/format_reward": 0.95859375, | |
| "step": 1115 | |
| }, | |
| { | |
| "completion_length": 116.0015625, | |
| "epoch": 0.4947406200822727, | |
| "grad_norm": 1.4012691974639893, | |
| "kl": 0.51864013671875, | |
| "learning_rate": 1.1916840634117555e-05, | |
| "loss": 0.0207, | |
| "reward": 1.1296875, | |
| "reward_std": 0.2404359621927142, | |
| "rewards/accuracy_reward": 0.17421875, | |
| "rewards/format_reward": 0.95546875, | |
| "step": 1120 | |
| }, | |
| { | |
| "completion_length": 119.28984375, | |
| "epoch": 0.49694928356478285, | |
| "grad_norm": 0.530860960483551, | |
| "kl": 0.5022705078125, | |
| "learning_rate": 1.1841063878279572e-05, | |
| "loss": 0.0201, | |
| "reward": 1.18203125, | |
| "reward_std": 0.25134353432804346, | |
| "rewards/accuracy_reward": 0.2265625, | |
| "rewards/format_reward": 0.95546875, | |
| "step": 1125 | |
| }, | |
| { | |
| "completion_length": 108.6421875, | |
| "epoch": 0.499157947047293, | |
| "grad_norm": 0.4978342056274414, | |
| "kl": 0.541162109375, | |
| "learning_rate": 1.1765177537492616e-05, | |
| "loss": 0.0216, | |
| "reward": 1.18359375, | |
| "reward_std": 0.24454718120396138, | |
| "rewards/accuracy_reward": 0.225, | |
| "rewards/format_reward": 0.95859375, | |
| "step": 1130 | |
| }, | |
| { | |
| "completion_length": 85.57890625, | |
| "epoch": 0.5013666105298031, | |
| "grad_norm": 0.4762633144855499, | |
| "kl": 0.542431640625, | |
| "learning_rate": 1.1689186128710654e-05, | |
| "loss": 0.0217, | |
| "reward": 1.19609375, | |
| "reward_std": 0.1994084009900689, | |
| "rewards/accuracy_reward": 0.21796875, | |
| "rewards/format_reward": 0.978125, | |
| "step": 1135 | |
| }, | |
| { | |
| "completion_length": 98.32109375, | |
| "epoch": 0.5035752740123133, | |
| "grad_norm": 0.4662761092185974, | |
| "kl": 0.5237060546875, | |
| "learning_rate": 1.1613094175141568e-05, | |
| "loss": 0.0209, | |
| "reward": 1.17734375, | |
| "reward_std": 0.1960198676213622, | |
| "rewards/accuracy_reward": 0.2, | |
| "rewards/format_reward": 0.97734375, | |
| "step": 1140 | |
| }, | |
| { | |
| "completion_length": 129.81640625, | |
| "epoch": 0.5057839374948234, | |
| "grad_norm": 0.6282123923301697, | |
| "kl": 0.52607421875, | |
| "learning_rate": 1.1536906205977936e-05, | |
| "loss": 0.021, | |
| "reward": 1.2, | |
| "reward_std": 0.24621726330369711, | |
| "rewards/accuracy_reward": 0.2453125, | |
| "rewards/format_reward": 0.9546875, | |
| "step": 1145 | |
| }, | |
| { | |
| "completion_length": 144.84921875, | |
| "epoch": 0.5079926009773336, | |
| "grad_norm": 0.8076626658439636, | |
| "kl": 0.6271484375, | |
| "learning_rate": 1.1460626756127431e-05, | |
| "loss": 0.0251, | |
| "reward": 1.2, | |
| "reward_std": 0.2839036539196968, | |
| "rewards/accuracy_reward": 0.25546875, | |
| "rewards/format_reward": 0.94453125, | |
| "step": 1150 | |
| }, | |
| { | |
| "completion_length": 133.01328125, | |
| "epoch": 0.5102012644598437, | |
| "grad_norm": 0.7102019190788269, | |
| "kl": 0.682958984375, | |
| "learning_rate": 1.1384260365942905e-05, | |
| "loss": 0.0273, | |
| "reward": 1.14140625, | |
| "reward_std": 0.22996564749628307, | |
| "rewards/accuracy_reward": 0.184375, | |
| "rewards/format_reward": 0.95703125, | |
| "step": 1155 | |
| }, | |
| { | |
| "completion_length": 118.7953125, | |
| "epoch": 0.5124099279423538, | |
| "grad_norm": 0.6336015462875366, | |
| "kl": 0.6332275390625, | |
| "learning_rate": 1.1307811580952113e-05, | |
| "loss": 0.0253, | |
| "reward": 1.17734375, | |
| "reward_std": 0.2275516463443637, | |
| "rewards/accuracy_reward": 0.2140625, | |
| "rewards/format_reward": 0.96328125, | |
| "step": 1160 | |
| }, | |
| { | |
| "completion_length": 125.0515625, | |
| "epoch": 0.514618591424864, | |
| "grad_norm": 0.5651206374168396, | |
| "kl": 0.5558349609375, | |
| "learning_rate": 1.123128495158718e-05, | |
| "loss": 0.0222, | |
| "reward": 1.2, | |
| "reward_std": 0.22535169757902623, | |
| "rewards/accuracy_reward": 0.234375, | |
| "rewards/format_reward": 0.965625, | |
| "step": 1165 | |
| }, | |
| { | |
| "completion_length": 150.834375, | |
| "epoch": 0.5168272549073741, | |
| "grad_norm": 0.5256941914558411, | |
| "kl": 0.49346923828125, | |
| "learning_rate": 1.1154685032913719e-05, | |
| "loss": 0.0197, | |
| "reward": 1.13046875, | |
| "reward_std": 0.22142891082912683, | |
| "rewards/accuracy_reward": 0.17109375, | |
| "rewards/format_reward": 0.959375, | |
| "step": 1170 | |
| }, | |
| { | |
| "completion_length": 147.46875, | |
| "epoch": 0.5190359183898843, | |
| "grad_norm": 1.1057101488113403, | |
| "kl": 0.4740234375, | |
| "learning_rate": 1.1078016384359725e-05, | |
| "loss": 0.019, | |
| "reward": 1.17734375, | |
| "reward_std": 0.2350118851289153, | |
| "rewards/accuracy_reward": 0.21484375, | |
| "rewards/format_reward": 0.9625, | |
| "step": 1175 | |
| }, | |
| { | |
| "completion_length": 165.5859375, | |
| "epoch": 0.5212445818723944, | |
| "grad_norm": 0.5781757235527039, | |
| "kl": 0.5138427734375, | |
| "learning_rate": 1.100128356944417e-05, | |
| "loss": 0.0206, | |
| "reward": 1.15703125, | |
| "reward_std": 0.3072576764971018, | |
| "rewards/accuracy_reward": 0.23515625, | |
| "rewards/format_reward": 0.921875, | |
| "step": 1180 | |
| }, | |
| { | |
| "completion_length": 133.359375, | |
| "epoch": 0.5234532453549046, | |
| "grad_norm": 0.5776464343070984, | |
| "kl": 0.7113037109375, | |
| "learning_rate": 1.0924491155505375e-05, | |
| "loss": 0.0285, | |
| "reward": 1.121875, | |
| "reward_std": 0.3050299068912864, | |
| "rewards/accuracy_reward": 0.19765625, | |
| "rewards/format_reward": 0.92421875, | |
| "step": 1185 | |
| }, | |
| { | |
| "completion_length": 119.27421875, | |
| "epoch": 0.5256619088374147, | |
| "grad_norm": 0.7527931928634644, | |
| "kl": 0.5626953125, | |
| "learning_rate": 1.0847643713429155e-05, | |
| "loss": 0.0225, | |
| "reward": 1.16015625, | |
| "reward_std": 0.2789567396044731, | |
| "rewards/accuracy_reward": 0.2171875, | |
| "rewards/format_reward": 0.94296875, | |
| "step": 1190 | |
| }, | |
| { | |
| "completion_length": 75.74609375, | |
| "epoch": 0.527870572319925, | |
| "grad_norm": 0.5353002548217773, | |
| "kl": 0.5966064453125, | |
| "learning_rate": 1.0770745817376741e-05, | |
| "loss": 0.0239, | |
| "reward": 1.171875, | |
| "reward_std": 0.1676252031698823, | |
| "rewards/accuracy_reward": 0.19140625, | |
| "rewards/format_reward": 0.98046875, | |
| "step": 1195 | |
| }, | |
| { | |
| "completion_length": 74.70078125, | |
| "epoch": 0.5300792358024351, | |
| "grad_norm": 0.7278808951377869, | |
| "kl": 0.5812255859375, | |
| "learning_rate": 1.0693802044512525e-05, | |
| "loss": 0.0233, | |
| "reward": 1.17734375, | |
| "reward_std": 0.16255829595029353, | |
| "rewards/accuracy_reward": 0.1890625, | |
| "rewards/format_reward": 0.98828125, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 0.5300792358024351, | |
| "eval_completion_length": 84.3741668701172, | |
| "eval_kl": 0.55484375, | |
| "eval_loss": 0.022250505164265633, | |
| "eval_reward": 1.195, | |
| "eval_reward_std": 0.15836685180664062, | |
| "eval_rewards/accuracy_reward": 0.2025, | |
| "eval_rewards/format_reward": 0.9925, | |
| "eval_runtime": 102.7549, | |
| "eval_samples_per_second": 0.963, | |
| "eval_steps_per_second": 0.039, | |
| "step": 1200 | |
| }, | |
| { | |
| "completion_length": 89.65546875, | |
| "epoch": 0.5322878992849452, | |
| "grad_norm": 0.8118963241577148, | |
| "kl": 0.56036376953125, | |
| "learning_rate": 1.061681697473159e-05, | |
| "loss": 0.0224, | |
| "reward": 1.23828125, | |
| "reward_std": 0.16189471799880267, | |
| "rewards/accuracy_reward": 0.24609375, | |
| "rewards/format_reward": 0.9921875, | |
| "step": 1205 | |
| }, | |
| { | |
| "completion_length": 171.05078125, | |
| "epoch": 0.5344965627674554, | |
| "grad_norm": 1.0046565532684326, | |
| "kl": 1.409912109375, | |
| "learning_rate": 1.0539795190387141e-05, | |
| "loss": 0.0564, | |
| "reward": 1.14140625, | |
| "reward_std": 0.23411216996610165, | |
| "rewards/accuracy_reward": 0.17578125, | |
| "rewards/format_reward": 0.965625, | |
| "step": 1210 | |
| }, | |
| { | |
| "completion_length": 244.696875, | |
| "epoch": 0.5367052262499655, | |
| "grad_norm": 1.4813671112060547, | |
| "kl": 1.9112060546875, | |
| "learning_rate": 1.0462741276017711e-05, | |
| "loss": 0.0765, | |
| "reward": 1.12265625, | |
| "reward_std": 0.312072067707777, | |
| "rewards/accuracy_reward": 0.203125, | |
| "rewards/format_reward": 0.91953125, | |
| "step": 1215 | |
| }, | |
| { | |
| "completion_length": 137.07109375, | |
| "epoch": 0.5389138897324757, | |
| "grad_norm": 0.44882670044898987, | |
| "kl": 1.1479248046875, | |
| "learning_rate": 1.038565981807431e-05, | |
| "loss": 0.0459, | |
| "reward": 1.165625, | |
| "reward_std": 0.24441679026931523, | |
| "rewards/accuracy_reward": 0.2, | |
| "rewards/format_reward": 0.965625, | |
| "step": 1220 | |
| }, | |
| { | |
| "completion_length": 112.965625, | |
| "epoch": 0.5411225532149858, | |
| "grad_norm": 1.2693246603012085, | |
| "kl": 0.73072509765625, | |
| "learning_rate": 1.0308555404647407e-05, | |
| "loss": 0.0292, | |
| "reward": 1.190625, | |
| "reward_std": 0.2190632749348879, | |
| "rewards/accuracy_reward": 0.21875, | |
| "rewards/format_reward": 0.971875, | |
| "step": 1225 | |
| }, | |
| { | |
| "completion_length": 109.203125, | |
| "epoch": 0.543331216697496, | |
| "grad_norm": 0.6727134585380554, | |
| "kl": 0.66895751953125, | |
| "learning_rate": 1.0231432625193842e-05, | |
| "loss": 0.0267, | |
| "reward": 1.23359375, | |
| "reward_std": 0.2255825974047184, | |
| "rewards/accuracy_reward": 0.25546875, | |
| "rewards/format_reward": 0.978125, | |
| "step": 1230 | |
| }, | |
| { | |
| "completion_length": 114.903125, | |
| "epoch": 0.5455398801800061, | |
| "grad_norm": 0.5706783533096313, | |
| "kl": 0.6170654296875, | |
| "learning_rate": 1.0154296070263649e-05, | |
| "loss": 0.0247, | |
| "reward": 1.1875, | |
| "reward_std": 0.2010749163106084, | |
| "rewards/accuracy_reward": 0.2125, | |
| "rewards/format_reward": 0.975, | |
| "step": 1235 | |
| }, | |
| { | |
| "completion_length": 121.5609375, | |
| "epoch": 0.5477485436625162, | |
| "grad_norm": 0.6042254567146301, | |
| "kl": 0.58348388671875, | |
| "learning_rate": 1.0077150331226822e-05, | |
| "loss": 0.0233, | |
| "reward": 1.25546875, | |
| "reward_std": 0.22948720771819353, | |
| "rewards/accuracy_reward": 0.2875, | |
| "rewards/format_reward": 0.96796875, | |
| "step": 1240 | |
| }, | |
| { | |
| "completion_length": 130.03984375, | |
| "epoch": 0.5499572071450264, | |
| "grad_norm": 0.6111878752708435, | |
| "kl": 0.63865966796875, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0256, | |
| "reward": 1.19375, | |
| "reward_std": 0.25420499257743356, | |
| "rewards/accuracy_reward": 0.234375, | |
| "rewards/format_reward": 0.959375, | |
| "step": 1245 | |
| }, | |
| { | |
| "completion_length": 154.44453125, | |
| "epoch": 0.5521658706275365, | |
| "grad_norm": 2.512364387512207, | |
| "kl": 0.91964111328125, | |
| "learning_rate": 9.922849668773181e-06, | |
| "loss": 0.0368, | |
| "reward": 1.12890625, | |
| "reward_std": 0.29293579459190366, | |
| "rewards/accuracy_reward": 0.203125, | |
| "rewards/format_reward": 0.92578125, | |
| "step": 1250 | |
| }, | |
| { | |
| "completion_length": 129.0, | |
| "epoch": 0.5543745341100467, | |
| "grad_norm": 5.317835807800293, | |
| "kl": 1.59818115234375, | |
| "learning_rate": 9.845703929736351e-06, | |
| "loss": 0.0639, | |
| "reward": 1.18984375, | |
| "reward_std": 0.26693961266428234, | |
| "rewards/accuracy_reward": 0.23984375, | |
| "rewards/format_reward": 0.95, | |
| "step": 1255 | |
| }, | |
| { | |
| "completion_length": 108.20859375, | |
| "epoch": 0.5565831975925568, | |
| "grad_norm": 0.5913572311401367, | |
| "kl": 1.3113525390625, | |
| "learning_rate": 9.768567374806163e-06, | |
| "loss": 0.0524, | |
| "reward": 1.153125, | |
| "reward_std": 0.2126995487138629, | |
| "rewards/accuracy_reward": 0.18359375, | |
| "rewards/format_reward": 0.96953125, | |
| "step": 1260 | |
| }, | |
| { | |
| "completion_length": 107.97890625, | |
| "epoch": 0.558791861075067, | |
| "grad_norm": 1.2350952625274658, | |
| "kl": 0.59656982421875, | |
| "learning_rate": 9.691444595352596e-06, | |
| "loss": 0.0239, | |
| "reward": 1.23359375, | |
| "reward_std": 0.21166725642979145, | |
| "rewards/accuracy_reward": 0.2640625, | |
| "rewards/format_reward": 0.96953125, | |
| "step": 1265 | |
| }, | |
| { | |
| "completion_length": 145.5171875, | |
| "epoch": 0.5610005245575771, | |
| "grad_norm": 0.784472644329071, | |
| "kl": 0.96728515625, | |
| "learning_rate": 9.614340181925692e-06, | |
| "loss": 0.0387, | |
| "reward": 1.15625, | |
| "reward_std": 0.2822375038638711, | |
| "rewards/accuracy_reward": 0.21328125, | |
| "rewards/format_reward": 0.94296875, | |
| "step": 1270 | |
| }, | |
| { | |
| "completion_length": 147.246875, | |
| "epoch": 0.5632091880400872, | |
| "grad_norm": 0.712846577167511, | |
| "kl": 0.7251220703125, | |
| "learning_rate": 9.53725872398229e-06, | |
| "loss": 0.029, | |
| "reward": 1.11328125, | |
| "reward_std": 0.28186143897473814, | |
| "rewards/accuracy_reward": 0.18984375, | |
| "rewards/format_reward": 0.9234375, | |
| "step": 1275 | |
| }, | |
| { | |
| "completion_length": 83.80546875, | |
| "epoch": 0.5654178515225974, | |
| "grad_norm": 0.5126023888587952, | |
| "kl": 0.632373046875, | |
| "learning_rate": 9.460204809612864e-06, | |
| "loss": 0.0253, | |
| "reward": 1.12734375, | |
| "reward_std": 0.193264627084136, | |
| "rewards/accuracy_reward": 0.14609375, | |
| "rewards/format_reward": 0.98125, | |
| "step": 1280 | |
| }, | |
| { | |
| "completion_length": 62.38515625, | |
| "epoch": 0.5676265150051075, | |
| "grad_norm": 0.3859677314758301, | |
| "kl": 0.5292724609375, | |
| "learning_rate": 9.383183025268411e-06, | |
| "loss": 0.0212, | |
| "reward": 1.21328125, | |
| "reward_std": 0.15019516460597515, | |
| "rewards/accuracy_reward": 0.21796875, | |
| "rewards/format_reward": 0.9953125, | |
| "step": 1285 | |
| }, | |
| { | |
| "completion_length": 69.62578125, | |
| "epoch": 0.5698351784876177, | |
| "grad_norm": 0.4441126883029938, | |
| "kl": 0.4895263671875, | |
| "learning_rate": 9.306197955487479e-06, | |
| "loss": 0.0196, | |
| "reward": 1.22265625, | |
| "reward_std": 0.1509174121543765, | |
| "rewards/accuracy_reward": 0.22578125, | |
| "rewards/format_reward": 0.996875, | |
| "step": 1290 | |
| }, | |
| { | |
| "completion_length": 90.8921875, | |
| "epoch": 0.5720438419701278, | |
| "grad_norm": 0.24060453474521637, | |
| "kl": 0.4620361328125, | |
| "learning_rate": 9.22925418262326e-06, | |
| "loss": 0.0185, | |
| "reward": 1.21015625, | |
| "reward_std": 0.15958714820444583, | |
| "rewards/accuracy_reward": 0.215625, | |
| "rewards/format_reward": 0.99453125, | |
| "step": 1295 | |
| }, | |
| { | |
| "completion_length": 113.90625, | |
| "epoch": 0.574252505452638, | |
| "grad_norm": 0.5410070419311523, | |
| "kl": 0.451025390625, | |
| "learning_rate": 9.15235628657085e-06, | |
| "loss": 0.018, | |
| "reward": 1.253125, | |
| "reward_std": 0.21325785480439663, | |
| "rewards/accuracy_reward": 0.26953125, | |
| "rewards/format_reward": 0.98359375, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 0.574252505452638, | |
| "eval_completion_length": 121.16583343505859, | |
| "eval_kl": 0.428828125, | |
| "eval_loss": 0.017305398359894753, | |
| "eval_reward": 1.22625, | |
| "eval_reward_std": 0.23555977791547775, | |
| "eval_rewards/accuracy_reward": 0.2533333334326744, | |
| "eval_rewards/format_reward": 0.9729166674613953, | |
| "eval_runtime": 139.5802, | |
| "eval_samples_per_second": 0.709, | |
| "eval_steps_per_second": 0.029, | |
| "step": 1300 | |
| }, | |
| { | |
| "completion_length": 123.81796875, | |
| "epoch": 0.5764611689351481, | |
| "grad_norm": 0.5413645505905151, | |
| "kl": 0.43470458984375, | |
| "learning_rate": 9.07550884449463e-06, | |
| "loss": 0.0174, | |
| "reward": 1.178125, | |
| "reward_std": 0.20707119330763818, | |
| "rewards/accuracy_reward": 0.209375, | |
| "rewards/format_reward": 0.96875, | |
| "step": 1305 | |
| }, | |
| { | |
| "completion_length": 143.82578125, | |
| "epoch": 0.5786698324176582, | |
| "grad_norm": 0.4805835783481598, | |
| "kl": 0.44522705078125, | |
| "learning_rate": 8.998716430555832e-06, | |
| "loss": 0.0178, | |
| "reward": 1.128125, | |
| "reward_std": 0.2421926449984312, | |
| "rewards/accuracy_reward": 0.17890625, | |
| "rewards/format_reward": 0.94921875, | |
| "step": 1310 | |
| }, | |
| { | |
| "completion_length": 146.4421875, | |
| "epoch": 0.5808784959001684, | |
| "grad_norm": 0.36650240421295166, | |
| "kl": 0.50875244140625, | |
| "learning_rate": 8.921983615640277e-06, | |
| "loss": 0.0203, | |
| "reward": 1.13515625, | |
| "reward_std": 0.2266262538731098, | |
| "rewards/accuracy_reward": 0.1765625, | |
| "rewards/format_reward": 0.95859375, | |
| "step": 1315 | |
| }, | |
| { | |
| "completion_length": 125.47421875, | |
| "epoch": 0.5830871593826785, | |
| "grad_norm": 0.431325227022171, | |
| "kl": 0.4361328125, | |
| "learning_rate": 8.845314967086281e-06, | |
| "loss": 0.0174, | |
| "reward": 1.13984375, | |
| "reward_std": 0.16203333698213102, | |
| "rewards/accuracy_reward": 0.1609375, | |
| "rewards/format_reward": 0.97890625, | |
| "step": 1320 | |
| }, | |
| { | |
| "completion_length": 116.61015625, | |
| "epoch": 0.5852958228651887, | |
| "grad_norm": 0.44710680842399597, | |
| "kl": 0.41380615234375, | |
| "learning_rate": 8.768715048412823e-06, | |
| "loss": 0.0166, | |
| "reward": 1.215625, | |
| "reward_std": 0.19275038037449121, | |
| "rewards/accuracy_reward": 0.228125, | |
| "rewards/format_reward": 0.9875, | |
| "step": 1325 | |
| }, | |
| { | |
| "completion_length": 119.11171875, | |
| "epoch": 0.5875044863476988, | |
| "grad_norm": 0.49566569924354553, | |
| "kl": 0.420166015625, | |
| "learning_rate": 8.692188419047889e-06, | |
| "loss": 0.0168, | |
| "reward": 1.19375, | |
| "reward_std": 0.1878614580258727, | |
| "rewards/accuracy_reward": 0.20625, | |
| "rewards/format_reward": 0.9875, | |
| "step": 1330 | |
| }, | |
| { | |
| "completion_length": 132.55390625, | |
| "epoch": 0.589713149830209, | |
| "grad_norm": 0.5140964388847351, | |
| "kl": 0.447314453125, | |
| "learning_rate": 8.615739634057098e-06, | |
| "loss": 0.0179, | |
| "reward": 1.21171875, | |
| "reward_std": 0.19049166329205036, | |
| "rewards/accuracy_reward": 0.2328125, | |
| "rewards/format_reward": 0.97890625, | |
| "step": 1335 | |
| }, | |
| { | |
| "completion_length": 147.45390625, | |
| "epoch": 0.5919218133127191, | |
| "grad_norm": 0.5098828673362732, | |
| "kl": 0.456298828125, | |
| "learning_rate": 8.539373243872569e-06, | |
| "loss": 0.0182, | |
| "reward": 1.18359375, | |
| "reward_std": 0.20728036612272263, | |
| "rewards/accuracy_reward": 0.21171875, | |
| "rewards/format_reward": 0.971875, | |
| "step": 1340 | |
| }, | |
| { | |
| "completion_length": 148.99296875, | |
| "epoch": 0.5941304767952292, | |
| "grad_norm": 0.5452316999435425, | |
| "kl": 0.48438720703125, | |
| "learning_rate": 8.463093794022069e-06, | |
| "loss": 0.0194, | |
| "reward": 1.2, | |
| "reward_std": 0.26944386642426255, | |
| "rewards/accuracy_reward": 0.24140625, | |
| "rewards/format_reward": 0.95859375, | |
| "step": 1345 | |
| }, | |
| { | |
| "completion_length": 146.82578125, | |
| "epoch": 0.5963391402777395, | |
| "grad_norm": 0.3637787103652954, | |
| "kl": 0.42889404296875, | |
| "learning_rate": 8.386905824858436e-06, | |
| "loss": 0.0172, | |
| "reward": 1.165625, | |
| "reward_std": 0.18749849069863558, | |
| "rewards/accuracy_reward": 0.1921875, | |
| "rewards/format_reward": 0.9734375, | |
| "step": 1350 | |
| }, | |
| { | |
| "completion_length": 163.79296875, | |
| "epoch": 0.5985478037602496, | |
| "grad_norm": 0.5319222807884216, | |
| "kl": 0.41142578125, | |
| "learning_rate": 8.310813871289349e-06, | |
| "loss": 0.0165, | |
| "reward": 1.1984375, | |
| "reward_std": 0.25758711863309147, | |
| "rewards/accuracy_reward": 0.2375, | |
| "rewards/format_reward": 0.9609375, | |
| "step": 1355 | |
| }, | |
| { | |
| "completion_length": 158.078125, | |
| "epoch": 0.6007564672427598, | |
| "grad_norm": 0.4595475494861603, | |
| "kl": 0.4110107421875, | |
| "learning_rate": 8.234822462507384e-06, | |
| "loss": 0.0164, | |
| "reward": 1.2328125, | |
| "reward_std": 0.21721374820917844, | |
| "rewards/accuracy_reward": 0.25625, | |
| "rewards/format_reward": 0.9765625, | |
| "step": 1360 | |
| }, | |
| { | |
| "completion_length": 145.98046875, | |
| "epoch": 0.6029651307252699, | |
| "grad_norm": 0.49845319986343384, | |
| "kl": 0.41668701171875, | |
| "learning_rate": 8.158936121720433e-06, | |
| "loss": 0.0167, | |
| "reward": 1.24609375, | |
| "reward_std": 0.22847788464277982, | |
| "rewards/accuracy_reward": 0.2765625, | |
| "rewards/format_reward": 0.96953125, | |
| "step": 1365 | |
| }, | |
| { | |
| "completion_length": 171.2109375, | |
| "epoch": 0.6051737942077801, | |
| "grad_norm": 0.5033155083656311, | |
| "kl": 0.4380126953125, | |
| "learning_rate": 8.08315936588245e-06, | |
| "loss": 0.0175, | |
| "reward": 1.20390625, | |
| "reward_std": 0.22734466083347799, | |
| "rewards/accuracy_reward": 0.24765625, | |
| "rewards/format_reward": 0.95625, | |
| "step": 1370 | |
| }, | |
| { | |
| "completion_length": 154.93125, | |
| "epoch": 0.6073824576902902, | |
| "grad_norm": 0.5040455460548401, | |
| "kl": 0.41199951171875, | |
| "learning_rate": 8.00749670542462e-06, | |
| "loss": 0.0165, | |
| "reward": 1.22734375, | |
| "reward_std": 0.21946511473506689, | |
| "rewards/accuracy_reward": 0.246875, | |
| "rewards/format_reward": 0.98046875, | |
| "step": 1375 | |
| }, | |
| { | |
| "completion_length": 174.39609375, | |
| "epoch": 0.6095911211728003, | |
| "grad_norm": 0.6203243136405945, | |
| "kl": 0.4758056640625, | |
| "learning_rate": 7.931952643986866e-06, | |
| "loss": 0.019, | |
| "reward": 1.225, | |
| "reward_std": 0.25668525900691747, | |
| "rewards/accuracy_reward": 0.259375, | |
| "rewards/format_reward": 0.965625, | |
| "step": 1380 | |
| }, | |
| { | |
| "completion_length": 148.69375, | |
| "epoch": 0.6117997846553105, | |
| "grad_norm": 3.571584939956665, | |
| "kl": 0.6333251953125, | |
| "learning_rate": 7.856531678149792e-06, | |
| "loss": 0.0253, | |
| "reward": 1.21015625, | |
| "reward_std": 0.19479831736534833, | |
| "rewards/accuracy_reward": 0.23359375, | |
| "rewards/format_reward": 0.9765625, | |
| "step": 1385 | |
| }, | |
| { | |
| "completion_length": 163.81796875, | |
| "epoch": 0.6140084481378206, | |
| "grad_norm": 1.1779475212097168, | |
| "kl": 0.9492919921875, | |
| "learning_rate": 7.781238297167025e-06, | |
| "loss": 0.0379, | |
| "reward": 1.1703125, | |
| "reward_std": 0.22253222949802876, | |
| "rewards/accuracy_reward": 0.2109375, | |
| "rewards/format_reward": 0.959375, | |
| "step": 1390 | |
| }, | |
| { | |
| "completion_length": 160.65625, | |
| "epoch": 0.6162171116203308, | |
| "grad_norm": 1.4942560195922852, | |
| "kl": 1.04031982421875, | |
| "learning_rate": 7.706076982698e-06, | |
| "loss": 0.0416, | |
| "reward": 1.2109375, | |
| "reward_std": 0.25693559013307093, | |
| "rewards/accuracy_reward": 0.2578125, | |
| "rewards/format_reward": 0.953125, | |
| "step": 1395 | |
| }, | |
| { | |
| "completion_length": 142.009375, | |
| "epoch": 0.6184257751028409, | |
| "grad_norm": 0.8941807746887207, | |
| "kl": 1.5090576171875, | |
| "learning_rate": 7.631052208541217e-06, | |
| "loss": 0.0605, | |
| "reward": 1.24453125, | |
| "reward_std": 0.24907068870961666, | |
| "rewards/accuracy_reward": 0.27109375, | |
| "rewards/format_reward": 0.9734375, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 0.6184257751028409, | |
| "eval_completion_length": 126.1554168701172, | |
| "eval_kl": 0.51203125, | |
| "eval_loss": 0.020548321306705475, | |
| "eval_reward": 1.2670833349227906, | |
| "eval_reward_std": 0.2361640551686287, | |
| "eval_rewards/accuracy_reward": 0.2795833334326744, | |
| "eval_rewards/format_reward": 0.9875, | |
| "eval_runtime": 108.4132, | |
| "eval_samples_per_second": 0.913, | |
| "eval_steps_per_second": 0.037, | |
| "step": 1400 | |
| }, | |
| { | |
| "completion_length": 129.46015625, | |
| "epoch": 0.6206344385853511, | |
| "grad_norm": 0.5738782286643982, | |
| "kl": 0.52000732421875, | |
| "learning_rate": 7.5561684403679355e-06, | |
| "loss": 0.0208, | |
| "reward": 1.159375, | |
| "reward_std": 0.18879235051572324, | |
| "rewards/accuracy_reward": 0.1828125, | |
| "rewards/format_reward": 0.9765625, | |
| "step": 1405 | |
| }, | |
| { | |
| "completion_length": 160.28828125, | |
| "epoch": 0.6228431020678612, | |
| "grad_norm": 0.8489373326301575, | |
| "kl": 0.78538818359375, | |
| "learning_rate": 7.4814301354563735e-06, | |
| "loss": 0.0314, | |
| "reward": 1.1765625, | |
| "reward_std": 0.2816866671666503, | |
| "rewards/accuracy_reward": 0.23046875, | |
| "rewards/format_reward": 0.94609375, | |
| "step": 1410 | |
| }, | |
| { | |
| "completion_length": 130.85, | |
| "epoch": 0.6250517655503713, | |
| "grad_norm": 0.4323514997959137, | |
| "kl": 0.4590087890625, | |
| "learning_rate": 7.40684174242638e-06, | |
| "loss": 0.0184, | |
| "reward": 1.1921875, | |
| "reward_std": 0.20537027660757304, | |
| "rewards/accuracy_reward": 0.21796875, | |
| "rewards/format_reward": 0.97421875, | |
| "step": 1415 | |
| }, | |
| { | |
| "completion_length": 129.54921875, | |
| "epoch": 0.6272604290328815, | |
| "grad_norm": 0.4422205686569214, | |
| "kl": 0.44378662109375, | |
| "learning_rate": 7.332407700974673e-06, | |
| "loss": 0.0178, | |
| "reward": 1.2109375, | |
| "reward_std": 0.22267536614090205, | |
| "rewards/accuracy_reward": 0.23125, | |
| "rewards/format_reward": 0.9796875, | |
| "step": 1420 | |
| }, | |
| { | |
| "completion_length": 156.784375, | |
| "epoch": 0.6294690925153916, | |
| "grad_norm": 0.6342864632606506, | |
| "kl": 0.4815185546875, | |
| "learning_rate": 7.258132441610548e-06, | |
| "loss": 0.0193, | |
| "reward": 1.225, | |
| "reward_std": 0.27565329764038327, | |
| "rewards/accuracy_reward": 0.2703125, | |
| "rewards/format_reward": 0.9546875, | |
| "step": 1425 | |
| }, | |
| { | |
| "completion_length": 189.24375, | |
| "epoch": 0.6316777559979018, | |
| "grad_norm": 0.5509055256843567, | |
| "kl": 0.6362548828125, | |
| "learning_rate": 7.184020385392186e-06, | |
| "loss": 0.0254, | |
| "reward": 1.1234375, | |
| "reward_std": 0.2952466538175941, | |
| "rewards/accuracy_reward": 0.19765625, | |
| "rewards/format_reward": 0.92578125, | |
| "step": 1430 | |
| }, | |
| { | |
| "completion_length": 125.340625, | |
| "epoch": 0.6338864194804119, | |
| "grad_norm": 0.44184309244155884, | |
| "kl": 0.415966796875, | |
| "learning_rate": 7.110075943663473e-06, | |
| "loss": 0.0166, | |
| "reward": 1.25625, | |
| "reward_std": 0.20973300114274024, | |
| "rewards/accuracy_reward": 0.2796875, | |
| "rewards/format_reward": 0.9765625, | |
| "step": 1435 | |
| }, | |
| { | |
| "completion_length": 110.86875, | |
| "epoch": 0.636095082962922, | |
| "grad_norm": 0.393185019493103, | |
| "kl": 0.45828857421875, | |
| "learning_rate": 7.0363035177914505e-06, | |
| "loss": 0.0183, | |
| "reward": 1.196875, | |
| "reward_std": 0.18387279994785785, | |
| "rewards/accuracy_reward": 0.2203125, | |
| "rewards/format_reward": 0.9765625, | |
| "step": 1440 | |
| }, | |
| { | |
| "completion_length": 121.4484375, | |
| "epoch": 0.6383037464454322, | |
| "grad_norm": 0.791002094745636, | |
| "kl": 0.54757080078125, | |
| "learning_rate": 6.962707498904331e-06, | |
| "loss": 0.0219, | |
| "reward": 1.21328125, | |
| "reward_std": 0.2349924026057124, | |
| "rewards/accuracy_reward": 0.2453125, | |
| "rewards/format_reward": 0.96796875, | |
| "step": 1445 | |
| }, | |
| { | |
| "completion_length": 145.55703125, | |
| "epoch": 0.6405124099279423, | |
| "grad_norm": 0.44890421628952026, | |
| "kl": 0.5147216796875, | |
| "learning_rate": 6.889292267630106e-06, | |
| "loss": 0.0206, | |
| "reward": 1.19375, | |
| "reward_std": 0.24552099388092757, | |
| "rewards/accuracy_reward": 0.22890625, | |
| "rewards/format_reward": 0.96484375, | |
| "step": 1450 | |
| }, | |
| { | |
| "completion_length": 142.5640625, | |
| "epoch": 0.6427210734104525, | |
| "grad_norm": 0.47677525877952576, | |
| "kl": 0.44825439453125, | |
| "learning_rate": 6.81606219383583e-06, | |
| "loss": 0.0179, | |
| "reward": 1.22109375, | |
| "reward_std": 0.18264568988233804, | |
| "rewards/accuracy_reward": 0.2375, | |
| "rewards/format_reward": 0.98359375, | |
| "step": 1455 | |
| }, | |
| { | |
| "completion_length": 141.29765625, | |
| "epoch": 0.6449297368929626, | |
| "grad_norm": 0.5547141432762146, | |
| "kl": 0.45455322265625, | |
| "learning_rate": 6.743021636367488e-06, | |
| "loss": 0.0182, | |
| "reward": 1.24296875, | |
| "reward_std": 0.20987400207668544, | |
| "rewards/accuracy_reward": 0.2703125, | |
| "rewards/format_reward": 0.97265625, | |
| "step": 1460 | |
| }, | |
| { | |
| "completion_length": 141.034375, | |
| "epoch": 0.6471384003754728, | |
| "grad_norm": 0.43943309783935547, | |
| "kl": 0.45238037109375, | |
| "learning_rate": 6.670174942790557e-06, | |
| "loss": 0.0181, | |
| "reward": 1.2390625, | |
| "reward_std": 0.2042137583717704, | |
| "rewards/accuracy_reward": 0.26484375, | |
| "rewards/format_reward": 0.97421875, | |
| "step": 1465 | |
| }, | |
| { | |
| "completion_length": 159.996875, | |
| "epoch": 0.6493470638579829, | |
| "grad_norm": 0.5421745181083679, | |
| "kl": 0.55458984375, | |
| "learning_rate": 6.597526449131232e-06, | |
| "loss": 0.0222, | |
| "reward": 1.26875, | |
| "reward_std": 0.25485040955245497, | |
| "rewards/accuracy_reward": 0.30703125, | |
| "rewards/format_reward": 0.96171875, | |
| "step": 1470 | |
| }, | |
| { | |
| "completion_length": 152.93984375, | |
| "epoch": 0.651555727340493, | |
| "grad_norm": 0.49404215812683105, | |
| "kl": 0.4667724609375, | |
| "learning_rate": 6.525080479618331e-06, | |
| "loss": 0.0187, | |
| "reward": 1.1671875, | |
| "reward_std": 0.22337155733257533, | |
| "rewards/accuracy_reward": 0.1984375, | |
| "rewards/format_reward": 0.96875, | |
| "step": 1475 | |
| }, | |
| { | |
| "completion_length": 142.815625, | |
| "epoch": 0.6537643908230032, | |
| "grad_norm": 0.5135136842727661, | |
| "kl": 0.438818359375, | |
| "learning_rate": 6.452841346425891e-06, | |
| "loss": 0.0176, | |
| "reward": 1.2640625, | |
| "reward_std": 0.24292335454374553, | |
| "rewards/accuracy_reward": 0.2875, | |
| "rewards/format_reward": 0.9765625, | |
| "step": 1480 | |
| }, | |
| { | |
| "completion_length": 133.865625, | |
| "epoch": 0.6559730543055133, | |
| "grad_norm": 0.5439188480377197, | |
| "kl": 0.40977783203125, | |
| "learning_rate": 6.380813349416523e-06, | |
| "loss": 0.0164, | |
| "reward": 1.2625, | |
| "reward_std": 0.21160587538033723, | |
| "rewards/accuracy_reward": 0.27578125, | |
| "rewards/format_reward": 0.98671875, | |
| "step": 1485 | |
| }, | |
| { | |
| "completion_length": 141.23671875, | |
| "epoch": 0.6581817177880235, | |
| "grad_norm": 0.5219649076461792, | |
| "kl": 0.4312255859375, | |
| "learning_rate": 6.309000775885452e-06, | |
| "loss": 0.0172, | |
| "reward": 1.21796875, | |
| "reward_std": 0.21080582737922668, | |
| "rewards/accuracy_reward": 0.24375, | |
| "rewards/format_reward": 0.97421875, | |
| "step": 1490 | |
| }, | |
| { | |
| "completion_length": 156.421875, | |
| "epoch": 0.6603903812705336, | |
| "grad_norm": 0.6208651065826416, | |
| "kl": 0.45198974609375, | |
| "learning_rate": 6.237407900305334e-06, | |
| "loss": 0.0181, | |
| "reward": 1.21328125, | |
| "reward_std": 0.21545952204614877, | |
| "rewards/accuracy_reward": 0.2453125, | |
| "rewards/format_reward": 0.96796875, | |
| "step": 1495 | |
| }, | |
| { | |
| "completion_length": 157.92578125, | |
| "epoch": 0.6625990447530438, | |
| "grad_norm": 0.514742910861969, | |
| "kl": 0.445703125, | |
| "learning_rate": 6.166038984071833e-06, | |
| "loss": 0.0178, | |
| "reward": 1.20859375, | |
| "reward_std": 0.24552375469356774, | |
| "rewards/accuracy_reward": 0.25, | |
| "rewards/format_reward": 0.95859375, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 0.6625990447530438, | |
| "eval_completion_length": 150.59583343505858, | |
| "eval_kl": 0.510390625, | |
| "eval_loss": 0.020548084750771523, | |
| "eval_reward": 1.2470833349227906, | |
| "eval_reward_std": 0.23129627466201783, | |
| "eval_rewards/accuracy_reward": 0.2833333334326744, | |
| "eval_rewards/format_reward": 0.96375, | |
| "eval_runtime": 130.7052, | |
| "eval_samples_per_second": 0.757, | |
| "eval_steps_per_second": 0.031, | |
| "step": 1500 | |
| }, | |
| { | |
| "completion_length": 146.13515625, | |
| "epoch": 0.6648077082355539, | |
| "grad_norm": 0.46206313371658325, | |
| "kl": 0.51572265625, | |
| "learning_rate": 6.094898275249952e-06, | |
| "loss": 0.0206, | |
| "reward": 1.2828125, | |
| "reward_std": 0.23776858411729335, | |
| "rewards/accuracy_reward": 0.3203125, | |
| "rewards/format_reward": 0.9625, | |
| "step": 1505 | |
| }, | |
| { | |
| "completion_length": 115.98671875, | |
| "epoch": 0.6670163717180642, | |
| "grad_norm": 0.7776006460189819, | |
| "kl": 0.4756591796875, | |
| "learning_rate": 6.023990008321199e-06, | |
| "loss": 0.019, | |
| "reward": 1.24765625, | |
| "reward_std": 0.21786664836108685, | |
| "rewards/accuracy_reward": 0.26953125, | |
| "rewards/format_reward": 0.978125, | |
| "step": 1510 | |
| }, | |
| { | |
| "completion_length": 125.35625, | |
| "epoch": 0.6692250352005743, | |
| "grad_norm": 0.4395429193973541, | |
| "kl": 0.5373779296875, | |
| "learning_rate": 5.953318403931533e-06, | |
| "loss": 0.0215, | |
| "reward": 1.20703125, | |
| "reward_std": 0.22481790594756604, | |
| "rewards/accuracy_reward": 0.24765625, | |
| "rewards/format_reward": 0.959375, | |
| "step": 1515 | |
| }, | |
| { | |
| "completion_length": 99.33359375, | |
| "epoch": 0.6714336986830844, | |
| "grad_norm": 0.45579713582992554, | |
| "kl": 0.524560546875, | |
| "learning_rate": 5.882887668640138e-06, | |
| "loss": 0.021, | |
| "reward": 1.18828125, | |
| "reward_std": 0.16194322612136602, | |
| "rewards/accuracy_reward": 0.1984375, | |
| "rewards/format_reward": 0.98984375, | |
| "step": 1520 | |
| }, | |
| { | |
| "completion_length": 126.44296875, | |
| "epoch": 0.6736423621655946, | |
| "grad_norm": 0.42683523893356323, | |
| "kl": 0.4496826171875, | |
| "learning_rate": 5.812701994669028e-06, | |
| "loss": 0.018, | |
| "reward": 1.221875, | |
| "reward_std": 0.24250736236572265, | |
| "rewards/accuracy_reward": 0.24921875, | |
| "rewards/format_reward": 0.97265625, | |
| "step": 1525 | |
| }, | |
| { | |
| "completion_length": 179.19921875, | |
| "epoch": 0.6758510256481047, | |
| "grad_norm": 0.9905825257301331, | |
| "kl": 0.49107666015625, | |
| "learning_rate": 5.742765559653537e-06, | |
| "loss": 0.0197, | |
| "reward": 1.16875, | |
| "reward_std": 0.25706158187240363, | |
| "rewards/accuracy_reward": 0.2171875, | |
| "rewards/format_reward": 0.9515625, | |
| "step": 1530 | |
| }, | |
| { | |
| "completion_length": 223.625, | |
| "epoch": 0.6780596891306149, | |
| "grad_norm": 0.45996785163879395, | |
| "kl": 0.45472412109375, | |
| "learning_rate": 5.673082526393634e-06, | |
| "loss": 0.0182, | |
| "reward": 1.1953125, | |
| "reward_std": 0.2674042139202356, | |
| "rewards/accuracy_reward": 0.2484375, | |
| "rewards/format_reward": 0.946875, | |
| "step": 1535 | |
| }, | |
| { | |
| "completion_length": 198.6390625, | |
| "epoch": 0.680268352613125, | |
| "grad_norm": 0.37783390283584595, | |
| "kl": 0.353515625, | |
| "learning_rate": 5.603657042606163e-06, | |
| "loss": 0.0141, | |
| "reward": 1.178125, | |
| "reward_std": 0.21960081458091735, | |
| "rewards/accuracy_reward": 0.21171875, | |
| "rewards/format_reward": 0.96640625, | |
| "step": 1540 | |
| }, | |
| { | |
| "completion_length": 197.028125, | |
| "epoch": 0.6824770160956352, | |
| "grad_norm": 0.6316084265708923, | |
| "kl": 0.368896484375, | |
| "learning_rate": 5.53449324067793e-06, | |
| "loss": 0.0148, | |
| "reward": 1.171875, | |
| "reward_std": 0.2550745034590364, | |
| "rewards/accuracy_reward": 0.21640625, | |
| "rewards/format_reward": 0.95546875, | |
| "step": 1545 | |
| }, | |
| { | |
| "completion_length": 172.515625, | |
| "epoch": 0.6846856795781453, | |
| "grad_norm": 0.5714349746704102, | |
| "kl": 0.41336669921875, | |
| "learning_rate": 5.465595237419768e-06, | |
| "loss": 0.0165, | |
| "reward": 1.265625, | |
| "reward_std": 0.23817113135010004, | |
| "rewards/accuracy_reward": 0.3140625, | |
| "rewards/format_reward": 0.9515625, | |
| "step": 1550 | |
| }, | |
| { | |
| "completion_length": 164.86171875, | |
| "epoch": 0.6868943430606554, | |
| "grad_norm": 0.31497815251350403, | |
| "kl": 0.40863037109375, | |
| "learning_rate": 5.396967133821461e-06, | |
| "loss": 0.0164, | |
| "reward": 1.24296875, | |
| "reward_std": 0.22363899704068899, | |
| "rewards/accuracy_reward": 0.278125, | |
| "rewards/format_reward": 0.96484375, | |
| "step": 1555 | |
| }, | |
| { | |
| "completion_length": 144.21171875, | |
| "epoch": 0.6891030065431656, | |
| "grad_norm": 0.3973537087440491, | |
| "kl": 0.39749755859375, | |
| "learning_rate": 5.3286130148076765e-06, | |
| "loss": 0.0159, | |
| "reward": 1.2203125, | |
| "reward_std": 0.20798758920282126, | |
| "rewards/accuracy_reward": 0.24375, | |
| "rewards/format_reward": 0.9765625, | |
| "step": 1560 | |
| }, | |
| { | |
| "completion_length": 126.52421875, | |
| "epoch": 0.6913116700256757, | |
| "grad_norm": 0.5487494468688965, | |
| "kl": 0.41744384765625, | |
| "learning_rate": 5.260536948994786e-06, | |
| "loss": 0.0167, | |
| "reward": 1.246875, | |
| "reward_std": 0.2289178878068924, | |
| "rewards/accuracy_reward": 0.26640625, | |
| "rewards/format_reward": 0.98046875, | |
| "step": 1565 | |
| }, | |
| { | |
| "completion_length": 153.5671875, | |
| "epoch": 0.6935203335081859, | |
| "grad_norm": 1.368481993675232, | |
| "kl": 0.47470703125, | |
| "learning_rate": 5.192742988448707e-06, | |
| "loss": 0.019, | |
| "reward": 1.22890625, | |
| "reward_std": 0.24065108597278595, | |
| "rewards/accuracy_reward": 0.2828125, | |
| "rewards/format_reward": 0.94609375, | |
| "step": 1570 | |
| }, | |
| { | |
| "completion_length": 152.64296875, | |
| "epoch": 0.695728996990696, | |
| "grad_norm": 0.460273414850235, | |
| "kl": 0.53028564453125, | |
| "learning_rate": 5.125235168443714e-06, | |
| "loss": 0.0212, | |
| "reward": 1.22734375, | |
| "reward_std": 0.26983388569206, | |
| "rewards/accuracy_reward": 0.28125, | |
| "rewards/format_reward": 0.94609375, | |
| "step": 1575 | |
| }, | |
| { | |
| "completion_length": 121.12890625, | |
| "epoch": 0.6979376604732062, | |
| "grad_norm": 0.5840798020362854, | |
| "kl": 0.4253173828125, | |
| "learning_rate": 5.058017507222254e-06, | |
| "loss": 0.017, | |
| "reward": 1.215625, | |
| "reward_std": 0.20102577321231366, | |
| "rewards/accuracy_reward": 0.240625, | |
| "rewards/format_reward": 0.975, | |
| "step": 1580 | |
| }, | |
| { | |
| "completion_length": 110.19375, | |
| "epoch": 0.7001463239557163, | |
| "grad_norm": 0.48586878180503845, | |
| "kl": 0.4316650390625, | |
| "learning_rate": 4.99109400575575e-06, | |
| "loss": 0.0173, | |
| "reward": 1.28984375, | |
| "reward_std": 0.19885436855256558, | |
| "rewards/accuracy_reward": 0.30546875, | |
| "rewards/format_reward": 0.984375, | |
| "step": 1585 | |
| }, | |
| { | |
| "completion_length": 128.54609375, | |
| "epoch": 0.7023549874382264, | |
| "grad_norm": 1.0986440181732178, | |
| "kl": 0.42666015625, | |
| "learning_rate": 4.924468647506473e-06, | |
| "loss": 0.0171, | |
| "reward": 1.22421875, | |
| "reward_std": 0.23164935149252414, | |
| "rewards/accuracy_reward": 0.2515625, | |
| "rewards/format_reward": 0.97265625, | |
| "step": 1590 | |
| }, | |
| { | |
| "completion_length": 149.9328125, | |
| "epoch": 0.7045636509207366, | |
| "grad_norm": 0.595619797706604, | |
| "kl": 0.46484375, | |
| "learning_rate": 4.8581453981904205e-06, | |
| "loss": 0.0186, | |
| "reward": 1.25078125, | |
| "reward_std": 0.2856699053198099, | |
| "rewards/accuracy_reward": 0.2984375, | |
| "rewards/format_reward": 0.95234375, | |
| "step": 1595 | |
| }, | |
| { | |
| "completion_length": 156.828125, | |
| "epoch": 0.7067723144032467, | |
| "grad_norm": 0.4311577081680298, | |
| "kl": 0.40791015625, | |
| "learning_rate": 4.792128205541286e-06, | |
| "loss": 0.0163, | |
| "reward": 1.18359375, | |
| "reward_std": 0.2468837944790721, | |
| "rewards/accuracy_reward": 0.2296875, | |
| "rewards/format_reward": 0.95390625, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 0.7067723144032467, | |
| "eval_completion_length": 135.73458335876464, | |
| "eval_kl": 0.39359375, | |
| "eval_loss": 0.015719087794423103, | |
| "eval_reward": 1.2620833349227905, | |
| "eval_reward_std": 0.2296227565407753, | |
| "eval_rewards/accuracy_reward": 0.2908333334326744, | |
| "eval_rewards/format_reward": 0.97125, | |
| "eval_runtime": 135.2265, | |
| "eval_samples_per_second": 0.732, | |
| "eval_steps_per_second": 0.03, | |
| "step": 1600 | |
| }, | |
| { | |
| "completion_length": 158.7234375, | |
| "epoch": 0.7089809778857569, | |
| "grad_norm": 0.485534131526947, | |
| "kl": 0.39541015625, | |
| "learning_rate": 4.7264209990754594e-06, | |
| "loss": 0.0158, | |
| "reward": 1.265625, | |
| "reward_std": 0.2561331497505307, | |
| "rewards/accuracy_reward": 0.30703125, | |
| "rewards/format_reward": 0.95859375, | |
| "step": 1605 | |
| }, | |
| { | |
| "completion_length": 163.1359375, | |
| "epoch": 0.711189641368267, | |
| "grad_norm": 0.38874199986457825, | |
| "kl": 0.3953125, | |
| "learning_rate": 4.661027689858142e-06, | |
| "loss": 0.0158, | |
| "reward": 1.15859375, | |
| "reward_std": 0.23531383704394102, | |
| "rewards/accuracy_reward": 0.2046875, | |
| "rewards/format_reward": 0.95390625, | |
| "step": 1610 | |
| }, | |
| { | |
| "completion_length": 131.1453125, | |
| "epoch": 0.7133983048507772, | |
| "grad_norm": 0.7737115025520325, | |
| "kl": 0.432568359375, | |
| "learning_rate": 4.595952170270542e-06, | |
| "loss": 0.0173, | |
| "reward": 1.23828125, | |
| "reward_std": 0.24948414210230113, | |
| "rewards/accuracy_reward": 0.26953125, | |
| "rewards/format_reward": 0.96875, | |
| "step": 1615 | |
| }, | |
| { | |
| "completion_length": 129.2015625, | |
| "epoch": 0.7156069683332873, | |
| "grad_norm": 0.44618454575538635, | |
| "kl": 0.44493408203125, | |
| "learning_rate": 4.5311983137782116e-06, | |
| "loss": 0.0178, | |
| "reward": 1.19453125, | |
| "reward_std": 0.18757406566292048, | |
| "rewards/accuracy_reward": 0.221875, | |
| "rewards/format_reward": 0.97265625, | |
| "step": 1620 | |
| }, | |
| { | |
| "completion_length": 128.13671875, | |
| "epoch": 0.7178156318157974, | |
| "grad_norm": 0.4870763421058655, | |
| "kl": 0.44984130859375, | |
| "learning_rate": 4.4667699747004555e-06, | |
| "loss": 0.018, | |
| "reward": 1.22734375, | |
| "reward_std": 0.2222797654569149, | |
| "rewards/accuracy_reward": 0.2640625, | |
| "rewards/format_reward": 0.96328125, | |
| "step": 1625 | |
| }, | |
| { | |
| "completion_length": 128.1109375, | |
| "epoch": 0.7200242952983076, | |
| "grad_norm": 0.830633819103241, | |
| "kl": 0.471240234375, | |
| "learning_rate": 4.402670987980938e-06, | |
| "loss": 0.0189, | |
| "reward": 1.21796875, | |
| "reward_std": 0.2280710056424141, | |
| "rewards/accuracy_reward": 0.24921875, | |
| "rewards/format_reward": 0.96875, | |
| "step": 1630 | |
| }, | |
| { | |
| "completion_length": 116.93515625, | |
| "epoch": 0.7222329587808177, | |
| "grad_norm": 0.5954543948173523, | |
| "kl": 0.46390380859375, | |
| "learning_rate": 4.3389051689594e-06, | |
| "loss": 0.0186, | |
| "reward": 1.2171875, | |
| "reward_std": 0.21377347223460674, | |
| "rewards/accuracy_reward": 0.24296875, | |
| "rewards/format_reward": 0.97421875, | |
| "step": 1635 | |
| }, | |
| { | |
| "completion_length": 118.27734375, | |
| "epoch": 0.7244416222633279, | |
| "grad_norm": 0.42961063981056213, | |
| "kl": 0.41763916015625, | |
| "learning_rate": 4.275476313144578e-06, | |
| "loss": 0.0167, | |
| "reward": 1.2125, | |
| "reward_std": 0.21657640542834997, | |
| "rewards/accuracy_reward": 0.23828125, | |
| "rewards/format_reward": 0.97421875, | |
| "step": 1640 | |
| }, | |
| { | |
| "completion_length": 113.2859375, | |
| "epoch": 0.726650285745838, | |
| "grad_norm": 0.4778992831707001, | |
| "kl": 0.46826171875, | |
| "learning_rate": 4.212388195988267e-06, | |
| "loss": 0.0187, | |
| "reward": 1.221875, | |
| "reward_std": 0.2040594968944788, | |
| "rewards/accuracy_reward": 0.240625, | |
| "rewards/format_reward": 0.98125, | |
| "step": 1645 | |
| }, | |
| { | |
| "completion_length": 134.99296875, | |
| "epoch": 0.7288589492283482, | |
| "grad_norm": 0.868116021156311, | |
| "kl": 0.4645263671875, | |
| "learning_rate": 4.1496445726606064e-06, | |
| "loss": 0.0186, | |
| "reward": 1.246875, | |
| "reward_std": 0.20862858258187772, | |
| "rewards/accuracy_reward": 0.2734375, | |
| "rewards/format_reward": 0.9734375, | |
| "step": 1650 | |
| }, | |
| { | |
| "completion_length": 167.1515625, | |
| "epoch": 0.7310676127108583, | |
| "grad_norm": 0.5153465867042542, | |
| "kl": 0.4582275390625, | |
| "learning_rate": 4.087249177826553e-06, | |
| "loss": 0.0183, | |
| "reward": 1.26953125, | |
| "reward_std": 0.29226357098668815, | |
| "rewards/accuracy_reward": 0.3171875, | |
| "rewards/format_reward": 0.95234375, | |
| "step": 1655 | |
| }, | |
| { | |
| "completion_length": 167.3953125, | |
| "epoch": 0.7332762761933684, | |
| "grad_norm": 0.9335393905639648, | |
| "kl": 0.4902099609375, | |
| "learning_rate": 4.025205725423607e-06, | |
| "loss": 0.0196, | |
| "reward": 1.2296875, | |
| "reward_std": 0.3044658374041319, | |
| "rewards/accuracy_reward": 0.2875, | |
| "rewards/format_reward": 0.9421875, | |
| "step": 1660 | |
| }, | |
| { | |
| "completion_length": 124.8671875, | |
| "epoch": 0.7354849396758786, | |
| "grad_norm": 0.4761280119419098, | |
| "kl": 0.4821533203125, | |
| "learning_rate": 3.963517908440716e-06, | |
| "loss": 0.0193, | |
| "reward": 1.22578125, | |
| "reward_std": 0.19983983058482407, | |
| "rewards/accuracy_reward": 0.246875, | |
| "rewards/format_reward": 0.97890625, | |
| "step": 1665 | |
| }, | |
| { | |
| "completion_length": 127.8421875, | |
| "epoch": 0.7376936031583888, | |
| "grad_norm": 0.5155916810035706, | |
| "kl": 0.43746337890625, | |
| "learning_rate": 3.902189398698482e-06, | |
| "loss": 0.0175, | |
| "reward": 1.18515625, | |
| "reward_std": 0.17090727612376214, | |
| "rewards/accuracy_reward": 0.2078125, | |
| "rewards/format_reward": 0.97734375, | |
| "step": 1670 | |
| }, | |
| { | |
| "completion_length": 146.315625, | |
| "epoch": 0.739902266640899, | |
| "grad_norm": 0.3942101001739502, | |
| "kl": 0.41949462890625, | |
| "learning_rate": 3.841223846630599e-06, | |
| "loss": 0.0168, | |
| "reward": 1.21328125, | |
| "reward_std": 0.19338544998317958, | |
| "rewards/accuracy_reward": 0.23671875, | |
| "rewards/format_reward": 0.9765625, | |
| "step": 1675 | |
| }, | |
| { | |
| "completion_length": 170.7625, | |
| "epoch": 0.7421109301234091, | |
| "grad_norm": 0.4603167176246643, | |
| "kl": 0.39854736328125, | |
| "learning_rate": 3.7806248810665613e-06, | |
| "loss": 0.0159, | |
| "reward": 1.25078125, | |
| "reward_std": 0.22261980101466178, | |
| "rewards/accuracy_reward": 0.2734375, | |
| "rewards/format_reward": 0.97734375, | |
| "step": 1680 | |
| }, | |
| { | |
| "completion_length": 195.67265625, | |
| "epoch": 0.7443195936059193, | |
| "grad_norm": 0.527126669883728, | |
| "kl": 0.41444091796875, | |
| "learning_rate": 3.720396109015686e-06, | |
| "loss": 0.0166, | |
| "reward": 1.2375, | |
| "reward_std": 0.24860016535967588, | |
| "rewards/accuracy_reward": 0.275, | |
| "rewards/format_reward": 0.9625, | |
| "step": 1685 | |
| }, | |
| { | |
| "completion_length": 178.1625, | |
| "epoch": 0.7465282570884294, | |
| "grad_norm": 0.4369182884693146, | |
| "kl": 0.37342529296875, | |
| "learning_rate": 3.6605411154523885e-06, | |
| "loss": 0.0149, | |
| "reward": 1.24140625, | |
| "reward_std": 0.22675297893583773, | |
| "rewards/accuracy_reward": 0.26953125, | |
| "rewards/format_reward": 0.971875, | |
| "step": 1690 | |
| }, | |
| { | |
| "completion_length": 168.7265625, | |
| "epoch": 0.7487369205709395, | |
| "grad_norm": 0.5954079031944275, | |
| "kl": 0.381005859375, | |
| "learning_rate": 3.601063463102823e-06, | |
| "loss": 0.0152, | |
| "reward": 1.21015625, | |
| "reward_std": 0.25745327677577734, | |
| "rewards/accuracy_reward": 0.24296875, | |
| "rewards/format_reward": 0.9671875, | |
| "step": 1695 | |
| }, | |
| { | |
| "completion_length": 176.64375, | |
| "epoch": 0.7509455840534497, | |
| "grad_norm": 0.4956624507904053, | |
| "kl": 0.423779296875, | |
| "learning_rate": 3.5419666922327854e-06, | |
| "loss": 0.017, | |
| "reward": 1.24609375, | |
| "reward_std": 0.25288699120283126, | |
| "rewards/accuracy_reward": 0.28125, | |
| "rewards/format_reward": 0.96484375, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 0.7509455840534497, | |
| "eval_completion_length": 163.3483334350586, | |
| "eval_kl": 0.376171875, | |
| "eval_loss": 0.015068257227540016, | |
| "eval_reward": 1.247916669845581, | |
| "eval_reward_std": 0.2429444035887718, | |
| "eval_rewards/accuracy_reward": 0.2904166667163372, | |
| "eval_rewards/format_reward": 0.9575, | |
| "eval_runtime": 148.8095, | |
| "eval_samples_per_second": 0.665, | |
| "eval_steps_per_second": 0.027, | |
| "step": 1700 | |
| }, | |
| { | |
| "completion_length": 165.946875, | |
| "epoch": 0.7531542475359598, | |
| "grad_norm": 0.4972885251045227, | |
| "kl": 0.40225830078125, | |
| "learning_rate": 3.4832543204370284e-06, | |
| "loss": 0.0161, | |
| "reward": 1.203125, | |
| "reward_std": 0.26637718454003334, | |
| "rewards/accuracy_reward": 0.2375, | |
| "rewards/format_reward": 0.965625, | |
| "step": 1705 | |
| }, | |
| { | |
| "completion_length": 173.74609375, | |
| "epoch": 0.75536291101847, | |
| "grad_norm": 0.6974431872367859, | |
| "kl": 0.40240478515625, | |
| "learning_rate": 3.424929842429848e-06, | |
| "loss": 0.0161, | |
| "reward": 1.17734375, | |
| "reward_std": 0.2468060377985239, | |
| "rewards/accuracy_reward": 0.22109375, | |
| "rewards/format_reward": 0.95625, | |
| "step": 1710 | |
| }, | |
| { | |
| "completion_length": 154.61953125, | |
| "epoch": 0.7575715745009801, | |
| "grad_norm": 0.43418002128601074, | |
| "kl": 0.3802001953125, | |
| "learning_rate": 3.366996729837102e-06, | |
| "loss": 0.0152, | |
| "reward": 1.2546875, | |
| "reward_std": 0.26053862273693085, | |
| "rewards/accuracy_reward": 0.29140625, | |
| "rewards/format_reward": 0.96328125, | |
| "step": 1715 | |
| }, | |
| { | |
| "completion_length": 141.70625, | |
| "epoch": 0.7597802379834903, | |
| "grad_norm": 0.5373135805130005, | |
| "kl": 0.39423828125, | |
| "learning_rate": 3.309458430989527e-06, | |
| "loss": 0.0158, | |
| "reward": 1.2203125, | |
| "reward_std": 0.216771724447608, | |
| "rewards/accuracy_reward": 0.24765625, | |
| "rewards/format_reward": 0.97265625, | |
| "step": 1720 | |
| }, | |
| { | |
| "completion_length": 142.44765625, | |
| "epoch": 0.7619889014660004, | |
| "grad_norm": 0.5140719413757324, | |
| "kl": 0.40374755859375, | |
| "learning_rate": 3.2523183707175366e-06, | |
| "loss": 0.0161, | |
| "reward": 1.234375, | |
| "reward_std": 0.21065853331238032, | |
| "rewards/accuracy_reward": 0.2671875, | |
| "rewards/format_reward": 0.9671875, | |
| "step": 1725 | |
| }, | |
| { | |
| "completion_length": 158.240625, | |
| "epoch": 0.7641975649485105, | |
| "grad_norm": 0.6687895655632019, | |
| "kl": 0.46646728515625, | |
| "learning_rate": 3.1955799501473226e-06, | |
| "loss": 0.0187, | |
| "reward": 1.18515625, | |
| "reward_std": 0.3053234376013279, | |
| "rewards/accuracy_reward": 0.24453125, | |
| "rewards/format_reward": 0.940625, | |
| "step": 1730 | |
| }, | |
| { | |
| "completion_length": 169.2078125, | |
| "epoch": 0.7664062284310207, | |
| "grad_norm": 0.4844968616962433, | |
| "kl": 0.48839111328125, | |
| "learning_rate": 3.1392465464984455e-06, | |
| "loss": 0.0195, | |
| "reward": 1.196875, | |
| "reward_std": 0.29177020620554683, | |
| "rewards/accuracy_reward": 0.26640625, | |
| "rewards/format_reward": 0.93046875, | |
| "step": 1735 | |
| }, | |
| { | |
| "completion_length": 152.02890625, | |
| "epoch": 0.7686148919135308, | |
| "grad_norm": 0.5372802019119263, | |
| "kl": 0.461181640625, | |
| "learning_rate": 3.083321512882773e-06, | |
| "loss": 0.0184, | |
| "reward": 1.17421875, | |
| "reward_std": 0.2651050504297018, | |
| "rewards/accuracy_reward": 0.22421875, | |
| "rewards/format_reward": 0.95, | |
| "step": 1740 | |
| }, | |
| { | |
| "completion_length": 129.86171875, | |
| "epoch": 0.770823555396041, | |
| "grad_norm": 0.781815767288208, | |
| "kl": 1.108740234375, | |
| "learning_rate": 3.0278081781049405e-06, | |
| "loss": 0.0444, | |
| "reward": 1.23125, | |
| "reward_std": 0.25756825953722, | |
| "rewards/accuracy_reward": 0.271875, | |
| "rewards/format_reward": 0.959375, | |
| "step": 1745 | |
| }, | |
| { | |
| "completion_length": 134.19296875, | |
| "epoch": 0.7730322188785511, | |
| "grad_norm": 0.5345208644866943, | |
| "kl": 0.44124755859375, | |
| "learning_rate": 2.9727098464641735e-06, | |
| "loss": 0.0177, | |
| "reward": 1.26875, | |
| "reward_std": 0.23357822820544244, | |
| "rewards/accuracy_reward": 0.30703125, | |
| "rewards/format_reward": 0.96171875, | |
| "step": 1750 | |
| }, | |
| { | |
| "completion_length": 136.47734375, | |
| "epoch": 0.7752408823610613, | |
| "grad_norm": 0.6541038155555725, | |
| "kl": 0.44271240234375, | |
| "learning_rate": 2.9180297975576368e-06, | |
| "loss": 0.0177, | |
| "reward": 1.2328125, | |
| "reward_std": 0.22248574066907167, | |
| "rewards/accuracy_reward": 0.271875, | |
| "rewards/format_reward": 0.9609375, | |
| "step": 1755 | |
| }, | |
| { | |
| "completion_length": 120.7203125, | |
| "epoch": 0.7774495458435714, | |
| "grad_norm": 0.4700789749622345, | |
| "kl": 0.449462890625, | |
| "learning_rate": 2.8637712860851974e-06, | |
| "loss": 0.018, | |
| "reward": 1.24765625, | |
| "reward_std": 0.23170709386467933, | |
| "rewards/accuracy_reward": 0.28125, | |
| "rewards/format_reward": 0.96640625, | |
| "step": 1760 | |
| }, | |
| { | |
| "completion_length": 123.3, | |
| "epoch": 0.7796582093260815, | |
| "grad_norm": 0.6256026029586792, | |
| "kl": 0.42320556640625, | |
| "learning_rate": 2.8099375416557163e-06, | |
| "loss": 0.0169, | |
| "reward": 1.2609375, | |
| "reward_std": 0.22770290337502958, | |
| "rewards/accuracy_reward": 0.29140625, | |
| "rewards/format_reward": 0.96953125, | |
| "step": 1765 | |
| }, | |
| { | |
| "completion_length": 115.96328125, | |
| "epoch": 0.7818668728085917, | |
| "grad_norm": 0.5066558718681335, | |
| "kl": 0.43795166015625, | |
| "learning_rate": 2.7565317685948e-06, | |
| "loss": 0.0175, | |
| "reward": 1.2984375, | |
| "reward_std": 0.22580576539039612, | |
| "rewards/accuracy_reward": 0.325, | |
| "rewards/format_reward": 0.9734375, | |
| "step": 1770 | |
| }, | |
| { | |
| "completion_length": 131.2125, | |
| "epoch": 0.7840755362911018, | |
| "grad_norm": 0.8898307681083679, | |
| "kl": 0.4900634765625, | |
| "learning_rate": 2.7035571457540865e-06, | |
| "loss": 0.0196, | |
| "reward": 1.23203125, | |
| "reward_std": 0.23165916074067355, | |
| "rewards/accuracy_reward": 0.2671875, | |
| "rewards/format_reward": 0.96484375, | |
| "step": 1775 | |
| }, | |
| { | |
| "completion_length": 136.24453125, | |
| "epoch": 0.786284199773612, | |
| "grad_norm": 0.49983325600624084, | |
| "kl": 0.4647705078125, | |
| "learning_rate": 2.651016826322017e-06, | |
| "loss": 0.0186, | |
| "reward": 1.26015625, | |
| "reward_std": 0.23627216089516878, | |
| "rewards/accuracy_reward": 0.3046875, | |
| "rewards/format_reward": 0.95546875, | |
| "step": 1780 | |
| }, | |
| { | |
| "completion_length": 120.8796875, | |
| "epoch": 0.7884928632561221, | |
| "grad_norm": 0.6942005753517151, | |
| "kl": 0.459130859375, | |
| "learning_rate": 2.598913937636153e-06, | |
| "loss": 0.0184, | |
| "reward": 1.24375, | |
| "reward_std": 0.2460821120068431, | |
| "rewards/accuracy_reward": 0.27578125, | |
| "rewards/format_reward": 0.96796875, | |
| "step": 1785 | |
| }, | |
| { | |
| "completion_length": 112.2453125, | |
| "epoch": 0.7907015267386323, | |
| "grad_norm": 0.48447439074516296, | |
| "kl": 0.4265380859375, | |
| "learning_rate": 2.5472515809970343e-06, | |
| "loss": 0.0171, | |
| "reward": 1.2421875, | |
| "reward_std": 0.19503602739423515, | |
| "rewards/accuracy_reward": 0.27109375, | |
| "rewards/format_reward": 0.97109375, | |
| "step": 1790 | |
| }, | |
| { | |
| "completion_length": 105.003125, | |
| "epoch": 0.7929101902211424, | |
| "grad_norm": 0.9765954613685608, | |
| "kl": 0.48828125, | |
| "learning_rate": 2.4960328314835746e-06, | |
| "loss": 0.0195, | |
| "reward": 1.21796875, | |
| "reward_std": 0.1995122255757451, | |
| "rewards/accuracy_reward": 0.240625, | |
| "rewards/format_reward": 0.97734375, | |
| "step": 1795 | |
| }, | |
| { | |
| "completion_length": 110.82421875, | |
| "epoch": 0.7951188537036525, | |
| "grad_norm": 1.8787330389022827, | |
| "kl": 0.458447265625, | |
| "learning_rate": 2.4452607377700367e-06, | |
| "loss": 0.0183, | |
| "reward": 1.25625, | |
| "reward_std": 0.23729459717869758, | |
| "rewards/accuracy_reward": 0.2828125, | |
| "rewards/format_reward": 0.9734375, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 0.7951188537036525, | |
| "eval_completion_length": 112.79791687011719, | |
| "eval_kl": 0.497265625, | |
| "eval_loss": 0.0199314896017313, | |
| "eval_reward": 1.2658333349227906, | |
| "eval_reward_std": 0.2067297151684761, | |
| "eval_rewards/accuracy_reward": 0.2920833334326744, | |
| "eval_rewards/format_reward": 0.97375, | |
| "eval_runtime": 119.8153, | |
| "eval_samples_per_second": 0.826, | |
| "eval_steps_per_second": 0.033, | |
| "step": 1800 | |
| }, | |
| { | |
| "completion_length": 109.19375, | |
| "epoch": 0.7973275171861627, | |
| "grad_norm": 2.065337657928467, | |
| "kl": 0.4661865234375, | |
| "learning_rate": 2.394938321944551e-06, | |
| "loss": 0.0187, | |
| "reward": 1.2140625, | |
| "reward_std": 0.22883423641324044, | |
| "rewards/accuracy_reward": 0.2421875, | |
| "rewards/format_reward": 0.971875, | |
| "step": 1805 | |
| }, | |
| { | |
| "completion_length": 108.35, | |
| "epoch": 0.7995361806686728, | |
| "grad_norm": 0.6966126561164856, | |
| "kl": 0.7103515625, | |
| "learning_rate": 2.3450685793292437e-06, | |
| "loss": 0.0284, | |
| "reward": 1.17265625, | |
| "reward_std": 0.17452798802405595, | |
| "rewards/accuracy_reward": 0.2015625, | |
| "rewards/format_reward": 0.97109375, | |
| "step": 1810 | |
| }, | |
| { | |
| "completion_length": 113.3828125, | |
| "epoch": 0.801744844151183, | |
| "grad_norm": 0.8000837564468384, | |
| "kl": 0.7013427734375, | |
| "learning_rate": 2.295654478301942e-06, | |
| "loss": 0.0281, | |
| "reward": 1.234375, | |
| "reward_std": 0.21197393592447042, | |
| "rewards/accuracy_reward": 0.2640625, | |
| "rewards/format_reward": 0.9703125, | |
| "step": 1815 | |
| }, | |
| { | |
| "completion_length": 116.340625, | |
| "epoch": 0.8039535076336931, | |
| "grad_norm": 0.9716657996177673, | |
| "kl": 0.9371826171875, | |
| "learning_rate": 2.246698960119499e-06, | |
| "loss": 0.0375, | |
| "reward": 1.221875, | |
| "reward_std": 0.23381243012845515, | |
| "rewards/accuracy_reward": 0.246875, | |
| "rewards/format_reward": 0.975, | |
| "step": 1820 | |
| }, | |
| { | |
| "completion_length": 143.3609375, | |
| "epoch": 0.8061621711162033, | |
| "grad_norm": 3.171027898788452, | |
| "kl": 0.77176513671875, | |
| "learning_rate": 2.198204938742707e-06, | |
| "loss": 0.0309, | |
| "reward": 1.20234375, | |
| "reward_std": 0.25977810826152564, | |
| "rewards/accuracy_reward": 0.246875, | |
| "rewards/format_reward": 0.95546875, | |
| "step": 1825 | |
| }, | |
| { | |
| "completion_length": 135.51484375, | |
| "epoch": 0.8083708345987135, | |
| "grad_norm": 1.4249086380004883, | |
| "kl": 0.80992431640625, | |
| "learning_rate": 2.150175300662862e-06, | |
| "loss": 0.0324, | |
| "reward": 1.18671875, | |
| "reward_std": 0.24862184505909682, | |
| "rewards/accuracy_reward": 0.228125, | |
| "rewards/format_reward": 0.95859375, | |
| "step": 1830 | |
| }, | |
| { | |
| "completion_length": 151.6921875, | |
| "epoch": 0.8105794980812236, | |
| "grad_norm": 1.0902801752090454, | |
| "kl": 0.86148681640625, | |
| "learning_rate": 2.1026129047299436e-06, | |
| "loss": 0.0345, | |
| "reward": 1.2, | |
| "reward_std": 0.260273445956409, | |
| "rewards/accuracy_reward": 0.25703125, | |
| "rewards/format_reward": 0.94296875, | |
| "step": 1835 | |
| }, | |
| { | |
| "completion_length": 140.75859375, | |
| "epoch": 0.8127881615637338, | |
| "grad_norm": 0.5658231973648071, | |
| "kl": 0.635693359375, | |
| "learning_rate": 2.055520581982463e-06, | |
| "loss": 0.0254, | |
| "reward": 1.26171875, | |
| "reward_std": 0.26789135448634627, | |
| "rewards/accuracy_reward": 0.30546875, | |
| "rewards/format_reward": 0.95625, | |
| "step": 1840 | |
| }, | |
| { | |
| "completion_length": 108.0703125, | |
| "epoch": 0.8149968250462439, | |
| "grad_norm": 0.7409191727638245, | |
| "kl": 0.51619873046875, | |
| "learning_rate": 2.0089011354789357e-06, | |
| "loss": 0.0206, | |
| "reward": 1.2359375, | |
| "reward_std": 0.21879921518266202, | |
| "rewards/accuracy_reward": 0.26171875, | |
| "rewards/format_reward": 0.97421875, | |
| "step": 1845 | |
| }, | |
| { | |
| "completion_length": 131.575, | |
| "epoch": 0.8172054885287541, | |
| "grad_norm": 0.933627724647522, | |
| "kl": 0.5877197265625, | |
| "learning_rate": 1.9627573401310452e-06, | |
| "loss": 0.0235, | |
| "reward": 1.23203125, | |
| "reward_std": 0.26072712801396847, | |
| "rewards/accuracy_reward": 0.275, | |
| "rewards/format_reward": 0.95703125, | |
| "step": 1850 | |
| }, | |
| { | |
| "completion_length": 116.653125, | |
| "epoch": 0.8194141520112642, | |
| "grad_norm": 0.5518223643302917, | |
| "kl": 0.523291015625, | |
| "learning_rate": 1.9170919425384695e-06, | |
| "loss": 0.0209, | |
| "reward": 1.296875, | |
| "reward_std": 0.23366234563291072, | |
| "rewards/accuracy_reward": 0.32265625, | |
| "rewards/format_reward": 0.97421875, | |
| "step": 1855 | |
| }, | |
| { | |
| "completion_length": 121.140625, | |
| "epoch": 0.8216228154937744, | |
| "grad_norm": 0.5761392712593079, | |
| "kl": 0.49658203125, | |
| "learning_rate": 1.8719076608254028e-06, | |
| "loss": 0.0199, | |
| "reward": 1.225, | |
| "reward_std": 0.21435881238430737, | |
| "rewards/accuracy_reward": 0.25390625, | |
| "rewards/format_reward": 0.97109375, | |
| "step": 1860 | |
| }, | |
| { | |
| "completion_length": 136.73359375, | |
| "epoch": 0.8238314789762845, | |
| "grad_norm": 0.7133124470710754, | |
| "kl": 0.52952880859375, | |
| "learning_rate": 1.8272071844787575e-06, | |
| "loss": 0.0212, | |
| "reward": 1.1703125, | |
| "reward_std": 0.2595834471285343, | |
| "rewards/accuracy_reward": 0.209375, | |
| "rewards/format_reward": 0.9609375, | |
| "step": 1865 | |
| }, | |
| { | |
| "completion_length": 141.15390625, | |
| "epoch": 0.8260401424587946, | |
| "grad_norm": 0.9754716157913208, | |
| "kl": 0.5518310546875, | |
| "learning_rate": 1.7829931741880802e-06, | |
| "loss": 0.0221, | |
| "reward": 1.2625, | |
| "reward_std": 0.28494130074977875, | |
| "rewards/accuracy_reward": 0.31328125, | |
| "rewards/format_reward": 0.94921875, | |
| "step": 1870 | |
| }, | |
| { | |
| "completion_length": 123.50234375, | |
| "epoch": 0.8282488059413048, | |
| "grad_norm": 0.912441611289978, | |
| "kl": 0.601611328125, | |
| "learning_rate": 1.7392682616871836e-06, | |
| "loss": 0.0241, | |
| "reward": 1.225, | |
| "reward_std": 0.2650785157456994, | |
| "rewards/accuracy_reward": 0.2609375, | |
| "rewards/format_reward": 0.9640625, | |
| "step": 1875 | |
| }, | |
| { | |
| "completion_length": 116.25625, | |
| "epoch": 0.8304574694238149, | |
| "grad_norm": 0.719677209854126, | |
| "kl": 0.52056884765625, | |
| "learning_rate": 1.696035049597503e-06, | |
| "loss": 0.0208, | |
| "reward": 1.24453125, | |
| "reward_std": 0.22317184396088124, | |
| "rewards/accuracy_reward": 0.26796875, | |
| "rewards/format_reward": 0.9765625, | |
| "step": 1880 | |
| }, | |
| { | |
| "completion_length": 118.2171875, | |
| "epoch": 0.8326661329063251, | |
| "grad_norm": 0.5467638969421387, | |
| "kl": 0.5210205078125, | |
| "learning_rate": 1.6532961112731672e-06, | |
| "loss": 0.0208, | |
| "reward": 1.23828125, | |
| "reward_std": 0.19823672361671923, | |
| "rewards/accuracy_reward": 0.26171875, | |
| "rewards/format_reward": 0.9765625, | |
| "step": 1885 | |
| }, | |
| { | |
| "completion_length": 113.9359375, | |
| "epoch": 0.8348747963888352, | |
| "grad_norm": 0.6019476652145386, | |
| "kl": 0.444091796875, | |
| "learning_rate": 1.6110539906478463e-06, | |
| "loss": 0.0178, | |
| "reward": 1.3015625, | |
| "reward_std": 0.2479660578072071, | |
| "rewards/accuracy_reward": 0.32421875, | |
| "rewards/format_reward": 0.97734375, | |
| "step": 1890 | |
| }, | |
| { | |
| "completion_length": 122.75546875, | |
| "epoch": 0.8370834598713454, | |
| "grad_norm": 0.9671128988265991, | |
| "kl": 0.5489501953125, | |
| "learning_rate": 1.5693112020833012e-06, | |
| "loss": 0.022, | |
| "reward": 1.2703125, | |
| "reward_std": 0.2313979933038354, | |
| "rewards/accuracy_reward": 0.296875, | |
| "rewards/format_reward": 0.9734375, | |
| "step": 1895 | |
| }, | |
| { | |
| "completion_length": 118.28046875, | |
| "epoch": 0.8392921233538555, | |
| "grad_norm": 0.9875019788742065, | |
| "kl": 0.4894287109375, | |
| "learning_rate": 1.528070230219756e-06, | |
| "loss": 0.0196, | |
| "reward": 1.2171875, | |
| "reward_std": 0.21601561345160009, | |
| "rewards/accuracy_reward": 0.24140625, | |
| "rewards/format_reward": 0.97578125, | |
| "step": 1900 | |
| }, | |
| { | |
| "epoch": 0.8392921233538555, | |
| "eval_completion_length": 123.54375, | |
| "eval_kl": 0.558125, | |
| "eval_loss": 0.022584721446037292, | |
| "eval_reward": 1.295, | |
| "eval_reward_std": 0.21910995721817017, | |
| "eval_rewards/accuracy_reward": 0.32416666686534884, | |
| "eval_rewards/format_reward": 0.9708333349227906, | |
| "eval_runtime": 124.5976, | |
| "eval_samples_per_second": 0.795, | |
| "eval_steps_per_second": 0.032, | |
| "step": 1900 | |
| }, | |
| { | |
| "completion_length": 136.87578125, | |
| "epoch": 0.8415007868363656, | |
| "grad_norm": 0.6465866565704346, | |
| "kl": 0.58070068359375, | |
| "learning_rate": 1.4873335298279801e-06, | |
| "loss": 0.0232, | |
| "reward": 1.27265625, | |
| "reward_std": 0.22537416350096465, | |
| "rewards/accuracy_reward": 0.3078125, | |
| "rewards/format_reward": 0.96484375, | |
| "step": 1905 | |
| }, | |
| { | |
| "completion_length": 152.965625, | |
| "epoch": 0.8437094503188758, | |
| "grad_norm": 0.5781317353248596, | |
| "kl": 0.5817138671875, | |
| "learning_rate": 1.447103525663186e-06, | |
| "loss": 0.0233, | |
| "reward": 1.20234375, | |
| "reward_std": 0.23281266931444405, | |
| "rewards/accuracy_reward": 0.24375, | |
| "rewards/format_reward": 0.95859375, | |
| "step": 1910 | |
| }, | |
| { | |
| "completion_length": 153.8953125, | |
| "epoch": 0.8459181138013859, | |
| "grad_norm": 0.6810880303382874, | |
| "kl": 0.59539794921875, | |
| "learning_rate": 1.4073826123206946e-06, | |
| "loss": 0.0238, | |
| "reward": 1.259375, | |
| "reward_std": 0.2544757820665836, | |
| "rewards/accuracy_reward": 0.29609375, | |
| "rewards/format_reward": 0.96328125, | |
| "step": 1915 | |
| }, | |
| { | |
| "completion_length": 143.8140625, | |
| "epoch": 0.8481267772838961, | |
| "grad_norm": 0.6009095311164856, | |
| "kl": 0.5434326171875, | |
| "learning_rate": 1.368173154093414e-06, | |
| "loss": 0.0217, | |
| "reward": 1.2453125, | |
| "reward_std": 0.2513797411695123, | |
| "rewards/accuracy_reward": 0.2828125, | |
| "rewards/format_reward": 0.9625, | |
| "step": 1920 | |
| }, | |
| { | |
| "completion_length": 159.8171875, | |
| "epoch": 0.8503354407664062, | |
| "grad_norm": 0.6377544403076172, | |
| "kl": 0.572900390625, | |
| "learning_rate": 1.3294774848310954e-06, | |
| "loss": 0.0229, | |
| "reward": 1.23046875, | |
| "reward_std": 0.27456119302660226, | |
| "rewards/accuracy_reward": 0.27421875, | |
| "rewards/format_reward": 0.95625, | |
| "step": 1925 | |
| }, | |
| { | |
| "completion_length": 155.94453125, | |
| "epoch": 0.8525441042489164, | |
| "grad_norm": 0.5800438523292542, | |
| "kl": 0.6190185546875, | |
| "learning_rate": 1.2912979078014242e-06, | |
| "loss": 0.0248, | |
| "reward": 1.20703125, | |
| "reward_std": 0.23998625949025154, | |
| "rewards/accuracy_reward": 0.253125, | |
| "rewards/format_reward": 0.95390625, | |
| "step": 1930 | |
| }, | |
| { | |
| "completion_length": 159.3046875, | |
| "epoch": 0.8547527677314265, | |
| "grad_norm": 0.9404852986335754, | |
| "kl": 0.56507568359375, | |
| "learning_rate": 1.253636695552931e-06, | |
| "loss": 0.0226, | |
| "reward": 1.259375, | |
| "reward_std": 0.2761839430779219, | |
| "rewards/accuracy_reward": 0.30703125, | |
| "rewards/format_reward": 0.95234375, | |
| "step": 1935 | |
| }, | |
| { | |
| "completion_length": 154.96953125, | |
| "epoch": 0.8569614312139366, | |
| "grad_norm": 0.9105063676834106, | |
| "kl": 0.64293212890625, | |
| "learning_rate": 1.216496089779703e-06, | |
| "loss": 0.0257, | |
| "reward": 1.22578125, | |
| "reward_std": 0.27103531677275894, | |
| "rewards/accuracy_reward": 0.271875, | |
| "rewards/format_reward": 0.95390625, | |
| "step": 1940 | |
| }, | |
| { | |
| "completion_length": 147.2546875, | |
| "epoch": 0.8591700946964468, | |
| "grad_norm": 0.6244191527366638, | |
| "kl": 0.49593505859375, | |
| "learning_rate": 1.1798783011879766e-06, | |
| "loss": 0.0198, | |
| "reward": 1.26484375, | |
| "reward_std": 0.2632339050993323, | |
| "rewards/accuracy_reward": 0.30546875, | |
| "rewards/format_reward": 0.959375, | |
| "step": 1945 | |
| }, | |
| { | |
| "completion_length": 150.15234375, | |
| "epoch": 0.8613787581789569, | |
| "grad_norm": 0.9317097663879395, | |
| "kl": 0.51068115234375, | |
| "learning_rate": 1.14378550936453e-06, | |
| "loss": 0.0204, | |
| "reward": 1.2453125, | |
| "reward_std": 0.2504005776718259, | |
| "rewards/accuracy_reward": 0.28828125, | |
| "rewards/format_reward": 0.95703125, | |
| "step": 1950 | |
| }, | |
| { | |
| "completion_length": 136.7046875, | |
| "epoch": 0.8635874216614671, | |
| "grad_norm": 0.5654709935188293, | |
| "kl": 0.496484375, | |
| "learning_rate": 1.1082198626469687e-06, | |
| "loss": 0.0199, | |
| "reward": 1.22421875, | |
| "reward_std": 0.23155678305774927, | |
| "rewards/accuracy_reward": 0.25625, | |
| "rewards/format_reward": 0.96796875, | |
| "step": 1955 | |
| }, | |
| { | |
| "completion_length": 135.80859375, | |
| "epoch": 0.8657960851439772, | |
| "grad_norm": 0.4000113904476166, | |
| "kl": 0.499267578125, | |
| "learning_rate": 1.0731834779958217e-06, | |
| "loss": 0.02, | |
| "reward": 1.253125, | |
| "reward_std": 0.2156506871804595, | |
| "rewards/accuracy_reward": 0.290625, | |
| "rewards/format_reward": 0.9625, | |
| "step": 1960 | |
| }, | |
| { | |
| "completion_length": 140.81640625, | |
| "epoch": 0.8680047486264874, | |
| "grad_norm": 0.41182902455329895, | |
| "kl": 0.44923095703125, | |
| "learning_rate": 1.0386784408685713e-06, | |
| "loss": 0.018, | |
| "reward": 1.1765625, | |
| "reward_std": 0.21103496849536896, | |
| "rewards/accuracy_reward": 0.2125, | |
| "rewards/format_reward": 0.9640625, | |
| "step": 1965 | |
| }, | |
| { | |
| "completion_length": 144.734375, | |
| "epoch": 0.8702134121089975, | |
| "grad_norm": 0.6124417781829834, | |
| "kl": 0.5717529296875, | |
| "learning_rate": 1.0047068050954868e-06, | |
| "loss": 0.0229, | |
| "reward": 1.19140625, | |
| "reward_std": 0.2501412840560079, | |
| "rewards/accuracy_reward": 0.2359375, | |
| "rewards/format_reward": 0.95546875, | |
| "step": 1970 | |
| }, | |
| { | |
| "completion_length": 137.81015625, | |
| "epoch": 0.8724220755915076, | |
| "grad_norm": 0.7430810928344727, | |
| "kl": 0.51724853515625, | |
| "learning_rate": 9.71270592757404e-07, | |
| "loss": 0.0207, | |
| "reward": 1.25234375, | |
| "reward_std": 0.2686174543574452, | |
| "rewards/accuracy_reward": 0.28359375, | |
| "rewards/format_reward": 0.96875, | |
| "step": 1975 | |
| }, | |
| { | |
| "completion_length": 117.8140625, | |
| "epoch": 0.8746307390740178, | |
| "grad_norm": 0.48936066031455994, | |
| "kl": 0.59530029296875, | |
| "learning_rate": 9.38371794065337e-07, | |
| "loss": 0.0238, | |
| "reward": 1.2453125, | |
| "reward_std": 0.21264754123985768, | |
| "rewards/accuracy_reward": 0.27109375, | |
| "rewards/format_reward": 0.97421875, | |
| "step": 1980 | |
| }, | |
| { | |
| "completion_length": 136.915625, | |
| "epoch": 0.876839402556528, | |
| "grad_norm": 0.5388721823692322, | |
| "kl": 0.4608642578125, | |
| "learning_rate": 9.060123672420451e-07, | |
| "loss": 0.0184, | |
| "reward": 1.228125, | |
| "reward_std": 0.22003105469048023, | |
| "rewards/accuracy_reward": 0.25859375, | |
| "rewards/format_reward": 0.96953125, | |
| "step": 1985 | |
| }, | |
| { | |
| "completion_length": 123.7546875, | |
| "epoch": 0.8790480660390382, | |
| "grad_norm": 0.7182089686393738, | |
| "kl": 0.4820068359375, | |
| "learning_rate": 8.741942384054481e-07, | |
| "loss": 0.0193, | |
| "reward": 1.2609375, | |
| "reward_std": 0.23840244263410568, | |
| "rewards/accuracy_reward": 0.28828125, | |
| "rewards/format_reward": 0.97265625, | |
| "step": 1990 | |
| }, | |
| { | |
| "completion_length": 124.53359375, | |
| "epoch": 0.8812567295215483, | |
| "grad_norm": 0.524861752986908, | |
| "kl": 0.452880859375, | |
| "learning_rate": 8.429193014540015e-07, | |
| "loss": 0.0181, | |
| "reward": 1.253125, | |
| "reward_std": 0.2055924626067281, | |
| "rewards/accuracy_reward": 0.275, | |
| "rewards/format_reward": 0.978125, | |
| "step": 1995 | |
| }, | |
| { | |
| "completion_length": 130.221875, | |
| "epoch": 0.8834653930040585, | |
| "grad_norm": 0.49339717626571655, | |
| "kl": 0.484423828125, | |
| "learning_rate": 8.121894179539469e-07, | |
| "loss": 0.0194, | |
| "reward": 1.225, | |
| "reward_std": 0.22272255159914495, | |
| "rewards/accuracy_reward": 0.25625, | |
| "rewards/format_reward": 0.96875, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 0.8834653930040585, | |
| "eval_completion_length": 142.34375, | |
| "eval_kl": 0.450234375, | |
| "eval_loss": 0.018087182193994522, | |
| "eval_reward": 1.279166669845581, | |
| "eval_reward_std": 0.2484509229660034, | |
| "eval_rewards/accuracy_reward": 0.31458333373069763, | |
| "eval_rewards/format_reward": 0.9645833349227906, | |
| "eval_runtime": 145.7929, | |
| "eval_samples_per_second": 0.679, | |
| "eval_steps_per_second": 0.027, | |
| "step": 2000 | |
| }, | |
| { | |
| "completion_length": 130.49140625, | |
| "epoch": 0.8856740564865686, | |
| "grad_norm": 0.5030075907707214, | |
| "kl": 0.4587646484375, | |
| "learning_rate": 7.82006417028518e-07, | |
| "loss": 0.0183, | |
| "reward": 1.2390625, | |
| "reward_std": 0.20883522126823664, | |
| "rewards/accuracy_reward": 0.26015625, | |
| "rewards/format_reward": 0.97890625, | |
| "step": 2005 | |
| }, | |
| { | |
| "completion_length": 145.6078125, | |
| "epoch": 0.8878827199690787, | |
| "grad_norm": 0.5313246250152588, | |
| "kl": 0.45802001953125, | |
| "learning_rate": 7.523720952490631e-07, | |
| "loss": 0.0183, | |
| "reward": 1.2578125, | |
| "reward_std": 0.2512391902506351, | |
| "rewards/accuracy_reward": 0.28828125, | |
| "rewards/format_reward": 0.96953125, | |
| "step": 2010 | |
| }, | |
| { | |
| "completion_length": 157.45546875, | |
| "epoch": 0.8900913834515889, | |
| "grad_norm": 0.5215573310852051, | |
| "kl": 0.49078369140625, | |
| "learning_rate": 7.232882165281141e-07, | |
| "loss": 0.0196, | |
| "reward": 1.1890625, | |
| "reward_std": 0.22680971212685108, | |
| "rewards/accuracy_reward": 0.228125, | |
| "rewards/format_reward": 0.9609375, | |
| "step": 2015 | |
| }, | |
| { | |
| "completion_length": 146.7203125, | |
| "epoch": 0.892300046934099, | |
| "grad_norm": 0.6293109059333801, | |
| "kl": 0.49932861328125, | |
| "learning_rate": 6.947565120143828e-07, | |
| "loss": 0.02, | |
| "reward": 1.2375, | |
| "reward_std": 0.2244907196611166, | |
| "rewards/accuracy_reward": 0.27265625, | |
| "rewards/format_reward": 0.96484375, | |
| "step": 2020 | |
| }, | |
| { | |
| "completion_length": 152.1328125, | |
| "epoch": 0.8945087104166092, | |
| "grad_norm": 0.6269906759262085, | |
| "kl": 0.4867919921875, | |
| "learning_rate": 6.667786799897269e-07, | |
| "loss": 0.0195, | |
| "reward": 1.23828125, | |
| "reward_std": 0.2209881154820323, | |
| "rewards/accuracy_reward": 0.275, | |
| "rewards/format_reward": 0.96328125, | |
| "step": 2025 | |
| }, | |
| { | |
| "completion_length": 143.7203125, | |
| "epoch": 0.8967173738991193, | |
| "grad_norm": 0.4811406433582306, | |
| "kl": 0.45821533203125, | |
| "learning_rate": 6.393563857680596e-07, | |
| "loss": 0.0183, | |
| "reward": 1.26953125, | |
| "reward_std": 0.22661811783909797, | |
| "rewards/accuracy_reward": 0.3015625, | |
| "rewards/format_reward": 0.96796875, | |
| "step": 2030 | |
| }, | |
| { | |
| "completion_length": 145.00390625, | |
| "epoch": 0.8989260373816295, | |
| "grad_norm": 0.6537109017372131, | |
| "kl": 0.49169921875, | |
| "learning_rate": 6.124912615962341e-07, | |
| "loss": 0.0197, | |
| "reward": 1.24765625, | |
| "reward_std": 0.22691688518971204, | |
| "rewards/accuracy_reward": 0.2765625, | |
| "rewards/format_reward": 0.97109375, | |
| "step": 2035 | |
| }, | |
| { | |
| "completion_length": 159.30390625, | |
| "epoch": 0.9011347008641396, | |
| "grad_norm": 0.5987099409103394, | |
| "kl": 0.4700439453125, | |
| "learning_rate": 5.861849065568726e-07, | |
| "loss": 0.0188, | |
| "reward": 1.2921875, | |
| "reward_std": 0.26790456287562847, | |
| "rewards/accuracy_reward": 0.3328125, | |
| "rewards/format_reward": 0.959375, | |
| "step": 2040 | |
| }, | |
| { | |
| "completion_length": 157.98203125, | |
| "epoch": 0.9033433643466497, | |
| "grad_norm": 1.6861835718154907, | |
| "kl": 0.5018310546875, | |
| "learning_rate": 5.604388864732002e-07, | |
| "loss": 0.0201, | |
| "reward": 1.2171875, | |
| "reward_std": 0.23706249240785837, | |
| "rewards/accuracy_reward": 0.25703125, | |
| "rewards/format_reward": 0.96015625, | |
| "step": 2045 | |
| }, | |
| { | |
| "completion_length": 157.39921875, | |
| "epoch": 0.9055520278291599, | |
| "grad_norm": 0.623263955116272, | |
| "kl": 0.61142578125, | |
| "learning_rate": 5.352547338158309e-07, | |
| "loss": 0.0245, | |
| "reward": 1.19453125, | |
| "reward_std": 0.2746475737541914, | |
| "rewards/accuracy_reward": 0.23828125, | |
| "rewards/format_reward": 0.95625, | |
| "step": 2050 | |
| }, | |
| { | |
| "completion_length": 153.7953125, | |
| "epoch": 0.90776069131167, | |
| "grad_norm": 0.6804496049880981, | |
| "kl": 0.4615234375, | |
| "learning_rate": 5.106339476115596e-07, | |
| "loss": 0.0185, | |
| "reward": 1.23359375, | |
| "reward_std": 0.2757456684485078, | |
| "rewards/accuracy_reward": 0.271875, | |
| "rewards/format_reward": 0.96171875, | |
| "step": 2055 | |
| }, | |
| { | |
| "completion_length": 160.09140625, | |
| "epoch": 0.9099693547941802, | |
| "grad_norm": 0.8526637554168701, | |
| "kl": 0.48623046875, | |
| "learning_rate": 4.865779933541348e-07, | |
| "loss": 0.0194, | |
| "reward": 1.253125, | |
| "reward_std": 0.27613792307674884, | |
| "rewards/accuracy_reward": 0.30078125, | |
| "rewards/format_reward": 0.95234375, | |
| "step": 2060 | |
| }, | |
| { | |
| "completion_length": 161.521875, | |
| "epoch": 0.9121780182766903, | |
| "grad_norm": 0.6661585569381714, | |
| "kl": 0.49755859375, | |
| "learning_rate": 4.63088302917023e-07, | |
| "loss": 0.0199, | |
| "reward": 1.24375, | |
| "reward_std": 0.24981417022645475, | |
| "rewards/accuracy_reward": 0.2890625, | |
| "rewards/format_reward": 0.9546875, | |
| "step": 2065 | |
| }, | |
| { | |
| "completion_length": 157.80546875, | |
| "epoch": 0.9143866817592005, | |
| "grad_norm": 0.7502483129501343, | |
| "kl": 0.4491455078125, | |
| "learning_rate": 4.401662744681845e-07, | |
| "loss": 0.018, | |
| "reward": 1.2625, | |
| "reward_std": 0.23363354597240688, | |
| "rewards/accuracy_reward": 0.29765625, | |
| "rewards/format_reward": 0.96484375, | |
| "step": 2070 | |
| }, | |
| { | |
| "completion_length": 141.709375, | |
| "epoch": 0.9165953452417106, | |
| "grad_norm": 1.0072883367538452, | |
| "kl": 0.440625, | |
| "learning_rate": 4.1781327238684775e-07, | |
| "loss": 0.0176, | |
| "reward": 1.271875, | |
| "reward_std": 0.23114844579249622, | |
| "rewards/accuracy_reward": 0.29921875, | |
| "rewards/format_reward": 0.97265625, | |
| "step": 2075 | |
| }, | |
| { | |
| "completion_length": 143.68515625, | |
| "epoch": 0.9188040087242207, | |
| "grad_norm": 0.7586592435836792, | |
| "kl": 0.49891357421875, | |
| "learning_rate": 3.9603062718230667e-07, | |
| "loss": 0.02, | |
| "reward": 1.27734375, | |
| "reward_std": 0.2479051820933819, | |
| "rewards/accuracy_reward": 0.30703125, | |
| "rewards/format_reward": 0.9703125, | |
| "step": 2080 | |
| }, | |
| { | |
| "completion_length": 163.2359375, | |
| "epoch": 0.9210126722067309, | |
| "grad_norm": 0.7740212678909302, | |
| "kl": 0.4748779296875, | |
| "learning_rate": 3.748196354147127e-07, | |
| "loss": 0.019, | |
| "reward": 1.253125, | |
| "reward_std": 0.2910253481939435, | |
| "rewards/accuracy_reward": 0.3046875, | |
| "rewards/format_reward": 0.9484375, | |
| "step": 2085 | |
| }, | |
| { | |
| "completion_length": 161.70703125, | |
| "epoch": 0.923221335689241, | |
| "grad_norm": 0.5001750588417053, | |
| "kl": 0.51661376953125, | |
| "learning_rate": 3.5418155961790546e-07, | |
| "loss": 0.0207, | |
| "reward": 1.18984375, | |
| "reward_std": 0.24765556119382381, | |
| "rewards/accuracy_reward": 0.23984375, | |
| "rewards/format_reward": 0.95, | |
| "step": 2090 | |
| }, | |
| { | |
| "completion_length": 149.140625, | |
| "epoch": 0.9254299991717512, | |
| "grad_norm": 0.6595699787139893, | |
| "kl": 0.459619140625, | |
| "learning_rate": 3.341176282242653e-07, | |
| "loss": 0.0184, | |
| "reward": 1.25546875, | |
| "reward_std": 0.23822309002280234, | |
| "rewards/accuracy_reward": 0.29140625, | |
| "rewards/format_reward": 0.9640625, | |
| "step": 2095 | |
| }, | |
| { | |
| "completion_length": 152.50390625, | |
| "epoch": 0.9276386626542613, | |
| "grad_norm": 1.5634217262268066, | |
| "kl": 0.5083740234375, | |
| "learning_rate": 3.1462903549159484e-07, | |
| "loss": 0.0203, | |
| "reward": 1.2578125, | |
| "reward_std": 0.2591968797147274, | |
| "rewards/accuracy_reward": 0.3015625, | |
| "rewards/format_reward": 0.95625, | |
| "step": 2100 | |
| }, | |
| { | |
| "epoch": 0.9276386626542613, | |
| "eval_completion_length": 141.9604168701172, | |
| "eval_kl": 0.46765625, | |
| "eval_loss": 0.01848418451845646, | |
| "eval_reward": 1.28125, | |
| "eval_reward_std": 0.24666063576936723, | |
| "eval_rewards/accuracy_reward": 0.31916666686534884, | |
| "eval_rewards/format_reward": 0.9620833349227905, | |
| "eval_runtime": 156.3759, | |
| "eval_samples_per_second": 0.633, | |
| "eval_steps_per_second": 0.026, | |
| "step": 2100 | |
| }, | |
| { | |
| "completion_length": 148.68515625, | |
| "epoch": 0.9298473261367715, | |
| "grad_norm": 0.7756811380386353, | |
| "kl": 0.4727783203125, | |
| "learning_rate": 2.9571694143202934e-07, | |
| "loss": 0.0189, | |
| "reward": 1.2359375, | |
| "reward_std": 0.2811011435464025, | |
| "rewards/accuracy_reward": 0.27578125, | |
| "rewards/format_reward": 0.96015625, | |
| "step": 2105 | |
| }, | |
| { | |
| "completion_length": 170.4796875, | |
| "epoch": 0.9320559896192816, | |
| "grad_norm": 0.5152100324630737, | |
| "kl": 0.523291015625, | |
| "learning_rate": 2.773824717429907e-07, | |
| "loss": 0.0209, | |
| "reward": 1.18046875, | |
| "reward_std": 0.28240158669650556, | |
| "rewards/accuracy_reward": 0.2328125, | |
| "rewards/format_reward": 0.94765625, | |
| "step": 2110 | |
| }, | |
| { | |
| "completion_length": 138.71171875, | |
| "epoch": 0.9342646531017917, | |
| "grad_norm": 0.6858031153678894, | |
| "kl": 0.481396484375, | |
| "learning_rate": 2.5962671774018234e-07, | |
| "loss": 0.0193, | |
| "reward": 1.2265625, | |
| "reward_std": 0.2497631970793009, | |
| "rewards/accuracy_reward": 0.2625, | |
| "rewards/format_reward": 0.9640625, | |
| "step": 2115 | |
| }, | |
| { | |
| "completion_length": 150.840625, | |
| "epoch": 0.9364733165843019, | |
| "grad_norm": 0.7537331581115723, | |
| "kl": 0.49796142578125, | |
| "learning_rate": 2.424507362926376e-07, | |
| "loss": 0.0199, | |
| "reward": 1.2484375, | |
| "reward_std": 0.2752187229692936, | |
| "rewards/accuracy_reward": 0.2875, | |
| "rewards/format_reward": 0.9609375, | |
| "step": 2120 | |
| }, | |
| { | |
| "completion_length": 145.603125, | |
| "epoch": 0.938681980066812, | |
| "grad_norm": 0.854280412197113, | |
| "kl": 0.453173828125, | |
| "learning_rate": 2.2585554975980252e-07, | |
| "loss": 0.0181, | |
| "reward": 1.240625, | |
| "reward_std": 0.24026636723428965, | |
| "rewards/accuracy_reward": 0.278125, | |
| "rewards/format_reward": 0.9625, | |
| "step": 2125 | |
| }, | |
| { | |
| "completion_length": 146.646875, | |
| "epoch": 0.9408906435493222, | |
| "grad_norm": 0.6566202640533447, | |
| "kl": 0.47532958984375, | |
| "learning_rate": 2.0984214593069318e-07, | |
| "loss": 0.019, | |
| "reward": 1.2546875, | |
| "reward_std": 0.2486549686640501, | |
| "rewards/accuracy_reward": 0.28671875, | |
| "rewards/format_reward": 0.96796875, | |
| "step": 2130 | |
| }, | |
| { | |
| "completion_length": 134.93125, | |
| "epoch": 0.9430993070318323, | |
| "grad_norm": 0.6418664455413818, | |
| "kl": 0.70113525390625, | |
| "learning_rate": 1.9441147796508408e-07, | |
| "loss": 0.028, | |
| "reward": 1.28046875, | |
| "reward_std": 0.270217102766037, | |
| "rewards/accuracy_reward": 0.3140625, | |
| "rewards/format_reward": 0.96640625, | |
| "step": 2135 | |
| }, | |
| { | |
| "completion_length": 153.1296875, | |
| "epoch": 0.9453079705143425, | |
| "grad_norm": 0.5165784358978271, | |
| "kl": 0.51217041015625, | |
| "learning_rate": 1.795644643367922e-07, | |
| "loss": 0.0205, | |
| "reward": 1.23984375, | |
| "reward_std": 0.2561708649620414, | |
| "rewards/accuracy_reward": 0.28125, | |
| "rewards/format_reward": 0.95859375, | |
| "step": 2140 | |
| }, | |
| { | |
| "completion_length": 143.43046875, | |
| "epoch": 0.9475166339968527, | |
| "grad_norm": 0.6096455454826355, | |
| "kl": 0.474267578125, | |
| "learning_rate": 1.6530198877899417e-07, | |
| "loss": 0.019, | |
| "reward": 1.265625, | |
| "reward_std": 0.24631664287298918, | |
| "rewards/accuracy_reward": 0.296875, | |
| "rewards/format_reward": 0.96875, | |
| "step": 2145 | |
| }, | |
| { | |
| "completion_length": 149.7796875, | |
| "epoch": 0.9497252974793629, | |
| "grad_norm": 0.6443772912025452, | |
| "kl": 0.46708984375, | |
| "learning_rate": 1.5162490023163057e-07, | |
| "loss": 0.0187, | |
| "reward": 1.23828125, | |
| "reward_std": 0.23290605265647174, | |
| "rewards/accuracy_reward": 0.275, | |
| "rewards/format_reward": 0.96328125, | |
| "step": 2150 | |
| }, | |
| { | |
| "completion_length": 152.20546875, | |
| "epoch": 0.951933960961873, | |
| "grad_norm": 0.7591469883918762, | |
| "kl": 0.5279296875, | |
| "learning_rate": 1.3853401279086853e-07, | |
| "loss": 0.0211, | |
| "reward": 1.20859375, | |
| "reward_std": 0.23095191065222026, | |
| "rewards/accuracy_reward": 0.24921875, | |
| "rewards/format_reward": 0.959375, | |
| "step": 2155 | |
| }, | |
| { | |
| "completion_length": 143.9375, | |
| "epoch": 0.9541426244443831, | |
| "grad_norm": 0.4750344753265381, | |
| "kl": 0.47156982421875, | |
| "learning_rate": 1.2603010566065055e-07, | |
| "loss": 0.0189, | |
| "reward": 1.240625, | |
| "reward_std": 0.2516822377219796, | |
| "rewards/accuracy_reward": 0.27734375, | |
| "rewards/format_reward": 0.96328125, | |
| "step": 2160 | |
| }, | |
| { | |
| "completion_length": 147.12578125, | |
| "epoch": 0.9563512879268933, | |
| "grad_norm": 0.6196463704109192, | |
| "kl": 0.50130615234375, | |
| "learning_rate": 1.1411392310631153e-07, | |
| "loss": 0.0201, | |
| "reward": 1.2375, | |
| "reward_std": 0.2511793440207839, | |
| "rewards/accuracy_reward": 0.28125, | |
| "rewards/format_reward": 0.95625, | |
| "step": 2165 | |
| }, | |
| { | |
| "completion_length": 130.6046875, | |
| "epoch": 0.9585599514094034, | |
| "grad_norm": 0.519320547580719, | |
| "kl": 0.55948486328125, | |
| "learning_rate": 1.0278617441028205e-07, | |
| "loss": 0.0224, | |
| "reward": 1.2484375, | |
| "reward_std": 0.24660416580736638, | |
| "rewards/accuracy_reward": 0.28125, | |
| "rewards/format_reward": 0.9671875, | |
| "step": 2170 | |
| }, | |
| { | |
| "completion_length": 139.35078125, | |
| "epoch": 0.9607686148919136, | |
| "grad_norm": 0.5927404761314392, | |
| "kl": 0.45987548828125, | |
| "learning_rate": 9.204753382986097e-08, | |
| "loss": 0.0184, | |
| "reward": 1.240625, | |
| "reward_std": 0.19637434519827365, | |
| "rewards/accuracy_reward": 0.27265625, | |
| "rewards/format_reward": 0.96796875, | |
| "step": 2175 | |
| }, | |
| { | |
| "completion_length": 139.8921875, | |
| "epoch": 0.9629772783744237, | |
| "grad_norm": 0.4677460491657257, | |
| "kl": 0.47265625, | |
| "learning_rate": 8.189864055709206e-08, | |
| "loss": 0.0189, | |
| "reward": 1.2296875, | |
| "reward_std": 0.2277947474271059, | |
| "rewards/accuracy_reward": 0.2625, | |
| "rewards/format_reward": 0.9671875, | |
| "step": 2180 | |
| }, | |
| { | |
| "completion_length": 143.153125, | |
| "epoch": 0.9651859418569338, | |
| "grad_norm": 0.9342114925384521, | |
| "kl": 0.5101318359375, | |
| "learning_rate": 7.23400986807099e-08, | |
| "loss": 0.0204, | |
| "reward": 1.25078125, | |
| "reward_std": 0.26373773720115423, | |
| "rewards/accuracy_reward": 0.28984375, | |
| "rewards/format_reward": 0.9609375, | |
| "step": 2185 | |
| }, | |
| { | |
| "completion_length": 148.3703125, | |
| "epoch": 0.967394605339444, | |
| "grad_norm": 0.46142810583114624, | |
| "kl": 0.48370361328125, | |
| "learning_rate": 6.337247715018869e-08, | |
| "loss": 0.0194, | |
| "reward": 1.2203125, | |
| "reward_std": 0.23893490042537452, | |
| "rewards/accuracy_reward": 0.2578125, | |
| "rewards/format_reward": 0.9625, | |
| "step": 2190 | |
| }, | |
| { | |
| "completion_length": 139.8171875, | |
| "epoch": 0.9696032688219541, | |
| "grad_norm": 0.5866816639900208, | |
| "kl": 0.4761962890625, | |
| "learning_rate": 5.4996309741873755e-08, | |
| "loss": 0.019, | |
| "reward": 1.275, | |
| "reward_std": 0.23191295862197875, | |
| "rewards/accuracy_reward": 0.30546875, | |
| "rewards/format_reward": 0.96953125, | |
| "step": 2195 | |
| }, | |
| { | |
| "completion_length": 150.57109375, | |
| "epoch": 0.9718119323044643, | |
| "grad_norm": 0.540817141532898, | |
| "kl": 0.44088134765625, | |
| "learning_rate": 4.7212095027209246e-08, | |
| "loss": 0.0176, | |
| "reward": 1.29765625, | |
| "reward_std": 0.2451239839196205, | |
| "rewards/accuracy_reward": 0.33046875, | |
| "rewards/format_reward": 0.9671875, | |
| "step": 2200 | |
| }, | |
| { | |
| "epoch": 0.9718119323044643, | |
| "eval_completion_length": 168.58166748046875, | |
| "eval_kl": 0.486328125, | |
| "eval_loss": 0.019449135288596153, | |
| "eval_reward": 1.2825, | |
| "eval_reward_std": 0.28413535237312315, | |
| "eval_rewards/accuracy_reward": 0.3333333337306976, | |
| "eval_rewards/format_reward": 0.9491666674613952, | |
| "eval_runtime": 159.9487, | |
| "eval_samples_per_second": 0.619, | |
| "eval_steps_per_second": 0.025, | |
| "step": 2200 | |
| }, | |
| { | |
| "completion_length": 153.36015625, | |
| "epoch": 0.9740205957869744, | |
| "grad_norm": 0.4312755763530731, | |
| "kl": 0.52962646484375, | |
| "learning_rate": 4.0020296343065144e-08, | |
| "loss": 0.0212, | |
| "reward": 1.2046875, | |
| "reward_std": 0.2571037333458662, | |
| "rewards/accuracy_reward": 0.24609375, | |
| "rewards/format_reward": 0.95859375, | |
| "step": 2205 | |
| }, | |
| { | |
| "completion_length": 146.95, | |
| "epoch": 0.9762292592694846, | |
| "grad_norm": 0.8880886435508728, | |
| "kl": 0.478515625, | |
| "learning_rate": 3.3421341764152684e-08, | |
| "loss": 0.0191, | |
| "reward": 1.23359375, | |
| "reward_std": 0.2560200056061149, | |
| "rewards/accuracy_reward": 0.27109375, | |
| "rewards/format_reward": 0.9625, | |
| "step": 2210 | |
| }, | |
| { | |
| "completion_length": 144.825, | |
| "epoch": 0.9784379227519947, | |
| "grad_norm": 0.788774311542511, | |
| "kl": 0.512060546875, | |
| "learning_rate": 2.7415624077551383e-08, | |
| "loss": 0.0205, | |
| "reward": 1.2359375, | |
| "reward_std": 0.26023210752755405, | |
| "rewards/accuracy_reward": 0.2765625, | |
| "rewards/format_reward": 0.959375, | |
| "step": 2215 | |
| }, | |
| { | |
| "completion_length": 133.7578125, | |
| "epoch": 0.9806465862345048, | |
| "grad_norm": 0.5918937921524048, | |
| "kl": 0.51031494140625, | |
| "learning_rate": 2.2003500759322228e-08, | |
| "loss": 0.0204, | |
| "reward": 1.2203125, | |
| "reward_std": 0.24810067620128393, | |
| "rewards/accuracy_reward": 0.2515625, | |
| "rewards/format_reward": 0.96875, | |
| "step": 2220 | |
| }, | |
| { | |
| "completion_length": 131.171875, | |
| "epoch": 0.982855249717015, | |
| "grad_norm": 0.4942164421081543, | |
| "kl": 0.49373779296875, | |
| "learning_rate": 1.718529395323687e-08, | |
| "loss": 0.0198, | |
| "reward": 1.234375, | |
| "reward_std": 0.22725256606936456, | |
| "rewards/accuracy_reward": 0.265625, | |
| "rewards/format_reward": 0.96875, | |
| "step": 2225 | |
| }, | |
| { | |
| "completion_length": 151.55703125, | |
| "epoch": 0.9850639131995251, | |
| "grad_norm": 0.4727592468261719, | |
| "kl": 0.46812744140625, | |
| "learning_rate": 1.2961290451594111e-08, | |
| "loss": 0.0187, | |
| "reward": 1.2390625, | |
| "reward_std": 0.2715959116816521, | |
| "rewards/accuracy_reward": 0.2796875, | |
| "rewards/format_reward": 0.959375, | |
| "step": 2230 | |
| }, | |
| { | |
| "completion_length": 149.5328125, | |
| "epoch": 0.9872725766820353, | |
| "grad_norm": 0.7650235891342163, | |
| "kl": 0.46435546875, | |
| "learning_rate": 9.3317416781602e-09, | |
| "loss": 0.0186, | |
| "reward": 1.26875, | |
| "reward_std": 0.2793737856671214, | |
| "rewards/accuracy_reward": 0.3109375, | |
| "rewards/format_reward": 0.9578125, | |
| "step": 2235 | |
| }, | |
| { | |
| "completion_length": 148.59296875, | |
| "epoch": 0.9894812401645454, | |
| "grad_norm": 0.9975623488426208, | |
| "kl": 0.50648193359375, | |
| "learning_rate": 6.296863673191933e-09, | |
| "loss": 0.0203, | |
| "reward": 1.26796875, | |
| "reward_std": 0.270522028952837, | |
| "rewards/accuracy_reward": 0.3109375, | |
| "rewards/format_reward": 0.95703125, | |
| "step": 2240 | |
| }, | |
| { | |
| "completion_length": 148.49765625, | |
| "epoch": 0.9916899036470556, | |
| "grad_norm": 0.8689573407173157, | |
| "kl": 0.4764892578125, | |
| "learning_rate": 3.856837080585818e-09, | |
| "loss": 0.0191, | |
| "reward": 1.25, | |
| "reward_std": 0.23697545174509288, | |
| "rewards/accuracy_reward": 0.2828125, | |
| "rewards/format_reward": 0.9671875, | |
| "step": 2245 | |
| }, | |
| { | |
| "completion_length": 149.41484375, | |
| "epoch": 0.9938985671295657, | |
| "grad_norm": 0.9692167639732361, | |
| "kl": 0.488427734375, | |
| "learning_rate": 2.0118071371211244e-09, | |
| "loss": 0.0195, | |
| "reward": 1.2078125, | |
| "reward_std": 0.25148440394550564, | |
| "rewards/accuracy_reward": 0.2484375, | |
| "rewards/format_reward": 0.959375, | |
| "step": 2250 | |
| }, | |
| { | |
| "completion_length": 139.27109375, | |
| "epoch": 0.9961072306120758, | |
| "grad_norm": 13.9436674118042, | |
| "kl": 0.557568359375, | |
| "learning_rate": 7.618836638190186e-10, | |
| "loss": 0.0223, | |
| "reward": 1.26953125, | |
| "reward_std": 0.2697257066145539, | |
| "rewards/accuracy_reward": 0.30625, | |
| "rewards/format_reward": 0.96328125, | |
| "step": 2255 | |
| }, | |
| { | |
| "completion_length": 156.4703125, | |
| "epoch": 0.998315894094586, | |
| "grad_norm": 0.910273015499115, | |
| "kl": 0.52550048828125, | |
| "learning_rate": 1.0714105940001773e-10, | |
| "loss": 0.021, | |
| "reward": 1.190625, | |
| "reward_std": 0.2517782935872674, | |
| "rewards/accuracy_reward": 0.23984375, | |
| "rewards/format_reward": 0.95078125, | |
| "step": 2260 | |
| }, | |
| { | |
| "completion_length": 164.65104166666666, | |
| "epoch": 0.9996410921840921, | |
| "kl": 0.5259602864583334, | |
| "reward": 1.20703125, | |
| "reward_std": 0.2667766287922859, | |
| "rewards/accuracy_reward": 0.2578125, | |
| "rewards/format_reward": 0.94921875, | |
| "step": 2263, | |
| "total_flos": 0.0, | |
| "train_loss": 2.13883109888834, | |
| "train_runtime": 166892.9358, | |
| "train_samples_per_second": 0.434, | |
| "train_steps_per_second": 0.014 | |
| } | |
| ], | |
| "logging_steps": 5, | |
| "max_steps": 2263, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 1, | |
| "save_steps": 500, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": false, | |
| "should_training_stop": false | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 0.0, | |
| "train_batch_size": 16, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |