| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 3.997134670487106, |
| "eval_steps": 500, |
| "global_step": 696, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "advantages/mean": -1.862645149230957e-09, |
| "advantages/snr": 4.005929201951151e-09, |
| "advantages/std": 0.46497204899787903, |
| "advantages/var": 0.21619900634928602, |
| "completions/clipped_ratio": -1.890625, |
| "epoch": 0.0057306590257879654, |
| "grad_norm": 591.0469885189827, |
| "learning_rate": 2e-06, |
| "loss": -34.4309, |
| "num_tokens": 188198.0, |
| "residual_var": 0.060805998742580414, |
| "reward": 0.43359375, |
| "reward_std": 0.4025588035583496, |
| "rewards/drgrpo_math_reward/mean": 0.43359375, |
| "rewards/drgrpo_math_reward/std": 0.4965413510799408, |
| "rho2": 0.7187498807907104, |
| "step": 1 |
| }, |
| { |
| "advantages/mean": 4.889443516731262e-09, |
| "advantages/snr": 1.0609876673533912e-08, |
| "advantages/std": 0.46083885431289673, |
| "advantages/var": 0.21237244964442326, |
| "completions/clipped_ratio": -1.578125, |
| "epoch": 0.011461318051575931, |
| "grad_norm": 995.3395033100987, |
| "learning_rate": 1.9999899289920054e-06, |
| "loss": -38.2217, |
| "num_tokens": 388582.0, |
| "residual_var": 0.04645651578903198, |
| "reward": 0.37890625, |
| "reward_std": 0.3851699233055115, |
| "rewards/drgrpo_math_reward/mean": 0.37890625, |
| "rewards/drgrpo_math_reward/std": 0.4860650300979614, |
| "rho2": 0.7812498211860657, |
| "step": 2 |
| }, |
| { |
| "advantages/mean": -1.0477378964424133e-09, |
| "advantages/snr": 2.3413202815254037e-09, |
| "advantages/std": 0.44749873876571655, |
| "advantages/var": 0.20025512119690703, |
| "completions/clipped_ratio": -1.5625, |
| "epoch": 0.017191977077363897, |
| "grad_norm": 261.62193963816037, |
| "learning_rate": 1.999959716170871e-06, |
| "loss": -35.6328, |
| "num_tokens": 602957.0, |
| "residual_var": 0.056321777403354645, |
| "reward": 0.4375, |
| "reward_std": 0.3725636303424835, |
| "rewards/drgrpo_math_reward/mean": 0.4375, |
| "rewards/drgrpo_math_reward/std": 0.49705013632774353, |
| "rho2": 0.7187498807907104, |
| "step": 3 |
| }, |
| { |
| "advantages/mean": -1.3969838619232178e-09, |
| "advantages/snr": 3.0971986497173363e-09, |
| "advantages/std": 0.45104753971099854, |
| "advantages/var": 0.2034438830793448, |
| "completions/clipped_ratio": -1.8125, |
| "epoch": 0.022922636103151862, |
| "grad_norm": 388.3029257567336, |
| "learning_rate": 1.999909362145145e-06, |
| "loss": -26.3566, |
| "num_tokens": 816817.0, |
| "residual_var": 0.06993386149406433, |
| "reward": 0.58984375, |
| "reward_std": 0.37533193826675415, |
| "rewards/drgrpo_math_reward/mean": 0.58984375, |
| "rewards/drgrpo_math_reward/std": 0.49282538890838623, |
| "rho2": 0.6562498807907104, |
| "step": 4 |
| }, |
| { |
| "advantages/mean": -3.3760443329811096e-09, |
| "advantages/snr": 7.850225838136565e-09, |
| "advantages/std": 0.43005695939064026, |
| "advantages/var": 0.1849489883203228, |
| "completions/clipped_ratio": -1.421875, |
| "epoch": 0.02865329512893983, |
| "grad_norm": 179.31864687689426, |
| "learning_rate": 1.999838867929058e-06, |
| "loss": -18.2458, |
| "num_tokens": 1042032.0, |
| "residual_var": 0.07513555139303207, |
| "reward": 0.484375, |
| "reward_std": 0.33824339509010315, |
| "rewards/drgrpo_math_reward/mean": 0.484375, |
| "rewards/drgrpo_math_reward/std": 0.5007347464561462, |
| "rho2": 0.5937498807907104, |
| "step": 5 |
| }, |
| { |
| "advantages/mean": 1.862645149230957e-09, |
| "advantages/snr": 4.522330689365846e-09, |
| "advantages/std": 0.41187724471092224, |
| "advantages/var": 0.16964286471066092, |
| "completions/clipped_ratio": -2.03125, |
| "epoch": 0.034383954154727794, |
| "grad_norm": 233.82793486575466, |
| "learning_rate": 1.9997482349425066e-06, |
| "loss": -11.5251, |
| "num_tokens": 1238234.0, |
| "residual_var": 0.03180807828903198, |
| "reward": 0.703125, |
| "reward_std": 0.34193065762519836, |
| "rewards/drgrpo_math_reward/mean": 0.703125, |
| "rewards/drgrpo_math_reward/std": 0.45777595043182373, |
| "rho2": 0.8124997615814209, |
| "step": 6 |
| }, |
| { |
| "advantages/mean": 1.7462298274040222e-09, |
| "advantages/snr": 3.972400980534068e-09, |
| "advantages/std": 0.4395905137062073, |
| "advantages/var": 0.1932398197404872, |
| "completions/clipped_ratio": -1.859375, |
| "epoch": 0.04011461318051576, |
| "grad_norm": 229.54028598418301, |
| "learning_rate": 1.999637465011021e-06, |
| "loss": -23.6846, |
| "num_tokens": 1443163.0, |
| "residual_var": 0.03019375540316105, |
| "reward": 0.62890625, |
| "reward_std": 0.3734835982322693, |
| "rewards/drgrpo_math_reward/mean": 0.62890625, |
| "rewards/drgrpo_math_reward/std": 0.48404383659362793, |
| "rho2": 0.8437498211860657, |
| "step": 7 |
| }, |
| { |
| "advantages/mean": -3.4924596548080444e-10, |
| "advantages/snr": 8.945516905539432e-10, |
| "advantages/std": 0.3904145061969757, |
| "advantages/var": 0.15242348664902838, |
| "completions/clipped_ratio": -1.875, |
| "epoch": 0.045845272206303724, |
| "grad_norm": 186.0838781990229, |
| "learning_rate": 1.9995065603657316e-06, |
| "loss": -18.2365, |
| "num_tokens": 1656784.0, |
| "residual_var": 0.05715882405638695, |
| "reward": 0.64453125, |
| "reward_std": 0.2987997531890869, |
| "rewards/drgrpo_math_reward/mean": 0.64453125, |
| "rewards/drgrpo_math_reward/std": 0.4795927405357361, |
| "rho2": 0.6249998807907104, |
| "step": 8 |
| }, |
| { |
| "advantages/mean": 4.656612873077393e-10, |
| "advantages/snr": 1.2028437765240612e-09, |
| "advantages/std": 0.3871336281299591, |
| "advantages/var": 0.14987244602906546, |
| "completions/clipped_ratio": -1.921875, |
| "epoch": 0.05157593123209169, |
| "grad_norm": 101.5457998948026, |
| "learning_rate": 1.999355523643321e-06, |
| "loss": -10.5425, |
| "num_tokens": 1858387.0, |
| "residual_var": 0.0608857125043869, |
| "reward": 0.71484375, |
| "reward_std": 0.28300461173057556, |
| "rewards/drgrpo_math_reward/mean": 0.71484375, |
| "rewards/drgrpo_math_reward/std": 0.4523732364177704, |
| "rho2": 0.5937498807907104, |
| "step": 9 |
| }, |
| { |
| "advantages/mean": -2.9103830456733704e-09, |
| "advantages/snr": 8.10862952743916e-09, |
| "advantages/std": 0.35892415046691895, |
| "advantages/var": 0.12882654578839947, |
| "completions/clipped_ratio": -1.640625, |
| "epoch": 0.05730659025787966, |
| "grad_norm": 213.60109250542604, |
| "learning_rate": 1.9991843578859746e-06, |
| "loss": -19.1132, |
| "num_tokens": 2082512.0, |
| "residual_var": 0.05636163428425789, |
| "reward": 0.5, |
| "reward_std": 0.2613256275653839, |
| "rewards/drgrpo_math_reward/mean": 0.5, |
| "rewards/drgrpo_math_reward/std": 0.5009794235229492, |
| "rho2": 0.5624998807907104, |
| "step": 10 |
| }, |
| { |
| "advantages/mean": -1.5133991837501526e-09, |
| "advantages/snr": 4.125582353770003e-09, |
| "advantages/std": 0.3668328523635864, |
| "advantages/var": 0.1345663415732048, |
| "completions/clipped_ratio": -1.984375, |
| "epoch": 0.06303724928366762, |
| "grad_norm": 115.78635926271141, |
| "learning_rate": 1.9989930665413145e-06, |
| "loss": -11.9284, |
| "num_tokens": 2269571.0, |
| "residual_var": 0.06728318333625793, |
| "reward": 0.69921875, |
| "reward_std": 0.2610597014427185, |
| "rewards/drgrpo_math_reward/mean": 0.69921875, |
| "rewards/drgrpo_math_reward/std": 0.45949608087539673, |
| "rho2": 0.49999991059303284, |
| "step": 11 |
| }, |
| { |
| "advantages/mean": -9.313225746154785e-10, |
| "advantages/snr": 2.503474956315218e-09, |
| "advantages/std": 0.3720119297504425, |
| "advantages/var": 0.13839287587664817, |
| "completions/clipped_ratio": -1.46875, |
| "epoch": 0.06876790830945559, |
| "grad_norm": 86.54669725830207, |
| "learning_rate": 1.998781653462335e-06, |
| "loss": -5.6964, |
| "num_tokens": 2475242.0, |
| "residual_var": 0.06487166881561279, |
| "reward": 0.52734375, |
| "reward_std": 0.2703958749771118, |
| "rewards/drgrpo_math_reward/mean": 0.52734375, |
| "rewards/drgrpo_math_reward/std": 0.5002297759056091, |
| "rho2": 0.5312498807907104, |
| "step": 12 |
| }, |
| { |
| "advantages/mean": 1.6298145055770874e-09, |
| "advantages/snr": 4.4965306713937905e-09, |
| "advantages/std": 0.3624604344367981, |
| "advantages/var": 0.1313775665321124, |
| "completions/clipped_ratio": -2.03125, |
| "epoch": 0.07449856733524356, |
| "grad_norm": 248.72480558278457, |
| "learning_rate": 1.998550122907321e-06, |
| "loss": -6.6781, |
| "num_tokens": 2676337.0, |
| "residual_var": 0.04926660656929016, |
| "reward": 0.703125, |
| "reward_std": 0.27658790349960327, |
| "rewards/drgrpo_math_reward/mean": 0.703125, |
| "rewards/drgrpo_math_reward/std": 0.45777595043182373, |
| "rho2": 0.6249998807907104, |
| "step": 13 |
| }, |
| { |
| "advantages/mean": -9.313225746154785e-10, |
| "advantages/snr": 3.0520855732441985e-09, |
| "advantages/std": 0.30514299869537354, |
| "advantages/var": 0.09311224965280473, |
| "completions/clipped_ratio": -2.125, |
| "epoch": 0.08022922636103152, |
| "grad_norm": 167.19083795315274, |
| "learning_rate": 1.9982984795397646e-06, |
| "loss": -4.0752, |
| "num_tokens": 2865669.0, |
| "residual_var": 0.05528540536761284, |
| "reward": 0.734375, |
| "reward_std": 0.18596167862415314, |
| "rewards/drgrpo_math_reward/mean": 0.734375, |
| "rewards/drgrpo_math_reward/std": 0.4425306022167206, |
| "rho2": 0.4062499403953552, |
| "step": 14 |
| }, |
| { |
| "advantages/mean": 2.3283064365386963e-10, |
| "advantages/snr": 6.815341083570872e-10, |
| "advantages/std": 0.3416272699832916, |
| "advantages/var": 0.11670919159623683, |
| "completions/clipped_ratio": -1.71875, |
| "epoch": 0.08595988538681948, |
| "grad_norm": 88.29950202682701, |
| "learning_rate": 1.9980267284282714e-06, |
| "loss": -3.5993, |
| "num_tokens": 3093146.0, |
| "residual_var": 0.06929609924554825, |
| "reward": 0.58203125, |
| "reward_std": 0.21633264422416687, |
| "rewards/drgrpo_math_reward/mean": 0.58203125, |
| "rewards/drgrpo_math_reward/std": 0.49419113993644714, |
| "rho2": 0.40624991059303284, |
| "step": 15 |
| }, |
| { |
| "advantages/mean": 2.2118911147117615e-09, |
| "advantages/snr": 5.775276936058749e-09, |
| "advantages/std": 0.38299307227134705, |
| "advantages/var": 0.14668369340784526, |
| "completions/clipped_ratio": -1.609375, |
| "epoch": 0.09169054441260745, |
| "grad_norm": 127.5442814323443, |
| "learning_rate": 1.997734875046456e-06, |
| "loss": -5.593, |
| "num_tokens": 3312316.0, |
| "residual_var": 0.06417413055896759, |
| "reward": 0.6015625, |
| "reward_std": 0.28657418489456177, |
| "rewards/drgrpo_math_reward/mean": 0.6015625, |
| "rewards/drgrpo_math_reward/std": 0.4905354380607605, |
| "rho2": 0.5624998807907104, |
| "step": 16 |
| }, |
| { |
| "advantages/mean": 5.820766091346741e-10, |
| "advantages/snr": 1.550447277389203e-09, |
| "advantages/std": 0.37542495131492615, |
| "advantages/var": 0.14094389406981467, |
| "completions/clipped_ratio": -2.234375, |
| "epoch": 0.09742120343839542, |
| "grad_norm": 268.45005256485683, |
| "learning_rate": 1.997422925272834e-06, |
| "loss": -5.6212, |
| "num_tokens": 3497926.0, |
| "residual_var": 0.05285397917032242, |
| "reward": 0.73046875, |
| "reward_std": 0.2803860306739807, |
| "rewards/drgrpo_math_reward/mean": 0.73046875, |
| "rewards/drgrpo_math_reward/std": 0.44458550214767456, |
| "rho2": 0.6249998807907104, |
| "step": 17 |
| }, |
| { |
| "advantages/mean": 2.561137080192566e-09, |
| "advantages/snr": 7.559090868705042e-09, |
| "advantages/std": 0.33881548047065735, |
| "advantages/var": 0.11479592980656239, |
| "completions/clipped_ratio": -2.34375, |
| "epoch": 0.10315186246418338, |
| "grad_norm": 149.2003080189042, |
| "learning_rate": 1.9970908853907026e-06, |
| "loss": -3.6572, |
| "num_tokens": 3674901.0, |
| "residual_var": 0.05381061136722565, |
| "reward": 0.75, |
| "reward_std": 0.23357081413269043, |
| "rewards/drgrpo_math_reward/mean": 0.75, |
| "rewards/drgrpo_math_reward/std": 0.4338609278202057, |
| "rho2": 0.5312498807907104, |
| "step": 18 |
| }, |
| { |
| "advantages/mean": -2.444721758365631e-09, |
| "advantages/snr": 7.060306184456525e-09, |
| "advantages/std": 0.3462628424167633, |
| "advantages/var": 0.11989795603853626, |
| "completions/clipped_ratio": -2.140625, |
| "epoch": 0.10888252148997135, |
| "grad_norm": 77.94130251951155, |
| "learning_rate": 1.9967387620880144e-06, |
| "loss": -4.1568, |
| "num_tokens": 3868267.0, |
| "residual_var": 0.07118944078683853, |
| "reward": 0.7265625, |
| "reward_std": 0.22620412707328796, |
| "rewards/drgrpo_math_reward/mean": 0.7265625, |
| "rewards/drgrpo_math_reward/std": 0.446596622467041, |
| "rho2": 0.40624991059303284, |
| "step": 19 |
| }, |
| { |
| "advantages/mean": -6.984919309616089e-10, |
| "advantages/snr": 1.8042655258901198e-09, |
| "advantages/std": 0.3871336579322815, |
| "advantages/var": 0.14987246910402874, |
| "completions/clipped_ratio": -2.5, |
| "epoch": 0.11461318051575932, |
| "grad_norm": 95.62080008887207, |
| "learning_rate": 1.996366562457243e-06, |
| "loss": -4.9432, |
| "num_tokens": 4056543.0, |
| "residual_var": 0.0608857087790966, |
| "reward": 0.73828125, |
| "reward_std": 0.289458692073822, |
| "rewards/drgrpo_math_reward/mean": 0.73828125, |
| "rewards/drgrpo_math_reward/std": 0.4404313564300537, |
| "rho2": 0.5937498807907104, |
| "step": 20 |
| }, |
| { |
| "advantages/mean": 1.3969838619232178e-09, |
| "advantages/snr": 4.3869819386401945e-09, |
| "advantages/std": 0.31843847036361694, |
| "advantages/var": 0.10140305940752015, |
| "completions/clipped_ratio": -2.375, |
| "epoch": 0.12034383954154727, |
| "grad_norm": 94.44241989116894, |
| "learning_rate": 1.995974293995239e-06, |
| "loss": -3.5967, |
| "num_tokens": 4239399.0, |
| "residual_var": 0.06020808964967728, |
| "reward": 0.78515625, |
| "reward_std": 0.19450394809246063, |
| "rewards/drgrpo_math_reward/mean": 0.78515625, |
| "rewards/drgrpo_math_reward/std": 0.4115184545516968, |
| "rho2": 0.40624991059303284, |
| "step": 21 |
| }, |
| { |
| "advantages/mean": 2.6775524020195007e-09, |
| "advantages/snr": 7.052721734123111e-09, |
| "advantages/std": 0.3796480894088745, |
| "advantages/var": 0.14413267179180878, |
| "completions/clipped_ratio": -1.765625, |
| "epoch": 0.12607449856733524, |
| "grad_norm": 109.37752818286293, |
| "learning_rate": 1.99556196460308e-06, |
| "loss": -4.1523, |
| "num_tokens": 4456912.0, |
| "residual_var": 0.0540497712790966, |
| "reward": 0.5859375, |
| "reward_std": 0.2902575135231018, |
| "rewards/drgrpo_math_reward/mean": 0.5859375, |
| "rewards/drgrpo_math_reward/std": 0.4935242533683777, |
| "rho2": 0.6249998807907104, |
| "step": 22 |
| }, |
| { |
| "advantages/mean": -4.656612873077393e-10, |
| "advantages/snr": 1.3412577461348454e-09, |
| "advantages/std": 0.3471825420856476, |
| "advantages/var": 0.12053571752905246, |
| "completions/clipped_ratio": -2.40625, |
| "epoch": 0.1318051575931232, |
| "grad_norm": 103.74041156571671, |
| "learning_rate": 1.995129582585911e-06, |
| "loss": -3.7992, |
| "num_tokens": 4641229.0, |
| "residual_var": 0.05650113895535469, |
| "reward": 0.70703125, |
| "reward_std": 0.24686214327812195, |
| "rewards/drgrpo_math_reward/mean": 0.70703125, |
| "rewards/drgrpo_math_reward/std": 0.45601576566696167, |
| "rho2": 0.5312498807907104, |
| "step": 23 |
| }, |
| { |
| "advantages/mean": 1.6298145055770874e-09, |
| "advantages/snr": 4.732108640838661e-09, |
| "advantages/std": 0.3444161117076874, |
| "advantages/var": 0.11862245800384219, |
| "completions/clipped_ratio": -2.09375, |
| "epoch": 0.13753581661891118, |
| "grad_norm": 79.54553022424207, |
| "learning_rate": 1.994677156652778e-06, |
| "loss": -4.236, |
| "num_tokens": 4859649.0, |
| "residual_var": 0.05189733952283859, |
| "reward": 0.6953125, |
| "reward_std": 0.23816770315170288, |
| "rewards/drgrpo_math_reward/mean": 0.6953125, |
| "rewards/drgrpo_math_reward/std": 0.4611765742301941, |
| "rho2": 0.5624998807907104, |
| "step": 24 |
| }, |
| { |
| "advantages/mean": -1.0477378964424133e-09, |
| "advantages/snr": 2.784512456412912e-09, |
| "advantages/std": 0.37627336382865906, |
| "advantages/var": 0.14158164432693443, |
| "completions/clipped_ratio": -1.953125, |
| "epoch": 0.14326647564469913, |
| "grad_norm": 154.38060766574566, |
| "learning_rate": 1.994204695916451e-06, |
| "loss": -4.3948, |
| "num_tokens": 5063750.0, |
| "residual_var": 0.05751756578683853, |
| "reward": 0.65625, |
| "reward_std": 0.27446234226226807, |
| "rewards/drgrpo_math_reward/mean": 0.65625, |
| "rewards/drgrpo_math_reward/std": 0.47588926553726196, |
| "rho2": 0.5937498807907104, |
| "step": 25 |
| }, |
| { |
| "advantages/mean": -1.0477378964424133e-09, |
| "advantages/snr": 3.290236453980146e-09, |
| "advantages/std": 0.31843847036361694, |
| "advantages/var": 0.10140305940752015, |
| "completions/clipped_ratio": -2.359375, |
| "epoch": 0.1489971346704871, |
| "grad_norm": 90.32609776484996, |
| "learning_rate": 1.9937122098932426e-06, |
| "loss": -3.7122, |
| "num_tokens": 5244742.0, |
| "residual_var": 0.066545769572258, |
| "reward": 0.72265625, |
| "reward_std": 0.2003089338541031, |
| "rewards/drgrpo_math_reward/mean": 0.72265625, |
| "rewards/drgrpo_math_reward/std": 0.4485645890235901, |
| "rho2": 0.3437499403953552, |
| "step": 26 |
| }, |
| { |
| "advantages/mean": 3.026798367500305e-09, |
| "advantages/snr": 8.539322219708342e-09, |
| "advantages/std": 0.3544541597366333, |
| "advantages/var": 0.12563775135460276, |
| "completions/clipped_ratio": -2.0, |
| "epoch": 0.15472779369627507, |
| "grad_norm": 73.45824601580452, |
| "learning_rate": 1.9931997085028128e-06, |
| "loss": -4.1696, |
| "num_tokens": 5438925.0, |
| "residual_var": 0.0667450800538063, |
| "reward": 0.71484375, |
| "reward_std": 0.2384297400712967, |
| "rewards/drgrpo_math_reward/mean": 0.71484375, |
| "rewards/drgrpo_math_reward/std": 0.4523732364177704, |
| "rho2": 0.46874991059303284, |
| "step": 27 |
| }, |
| { |
| "advantages/mean": 0.0, |
| "advantages/snr": 0.0, |
| "advantages/std": 0.29558831453323364, |
| "advantages/var": 0.08737245168859786, |
| "completions/clipped_ratio": -2.3125, |
| "epoch": 0.16045845272206305, |
| "grad_norm": 72.70898121193805, |
| "learning_rate": 1.9926672020679734e-06, |
| "loss": -2.3686, |
| "num_tokens": 5615770.0, |
| "residual_var": 0.05733818560838699, |
| "reward": 0.66015625, |
| "reward_std": 0.1737360805273056, |
| "rewards/drgrpo_math_reward/mean": 0.66015625, |
| "rewards/drgrpo_math_reward/std": 0.47458380460739136, |
| "rho2": 0.3437499403953552, |
| "step": 28 |
| }, |
| { |
| "advantages/mean": -1.1641532182693481e-09, |
| "advantages/snr": 3.2760502000971653e-09, |
| "advantages/std": 0.3553526699542999, |
| "advantages/var": 0.12627552004364961, |
| "completions/clipped_ratio": -1.96875, |
| "epoch": 0.166189111747851, |
| "grad_norm": 79.31434150094832, |
| "learning_rate": 1.9921147013144777e-06, |
| "loss": -1.8823, |
| "num_tokens": 5833569.0, |
| "residual_var": 0.06708388775587082, |
| "reward": 0.640625, |
| "reward_std": 0.24488137662410736, |
| "rewards/drgrpo_math_reward/mean": 0.640625, |
| "rewards/drgrpo_math_reward/std": 0.4807571768760681, |
| "rho2": 0.46874988079071045, |
| "step": 29 |
| }, |
| { |
| "advantages/mean": -1.6298145055770874e-09, |
| "advantages/snr": 4.422026899202607e-09, |
| "advantages/std": 0.3685672879219055, |
| "advantages/var": 0.1358418457261088, |
| "completions/clipped_ratio": -2.296875, |
| "epoch": 0.17191977077363896, |
| "grad_norm": 75.57215984106453, |
| "learning_rate": 1.9915422173708044e-06, |
| "loss": -3.8191, |
| "num_tokens": 6026380.0, |
| "residual_var": 0.05943083018064499, |
| "reward": 0.72265625, |
| "reward_std": 0.26170387864112854, |
| "rewards/drgrpo_math_reward/mean": 0.72265625, |
| "rewards/drgrpo_math_reward/std": 0.4485645890235901, |
| "rho2": 0.5624998807907104, |
| "step": 30 |
| }, |
| { |
| "advantages/mean": -1.3969838619232178e-09, |
| "advantages/snr": 4.280614987975828e-09, |
| "advantages/std": 0.32635119557380676, |
| "advantages/var": 0.10650510285245307, |
| "completions/clipped_ratio": -2.21875, |
| "epoch": 0.17765042979942694, |
| "grad_norm": 60.21761972546305, |
| "learning_rate": 1.9909497617679347e-06, |
| "loss": -2.0778, |
| "num_tokens": 6220330.0, |
| "residual_var": 0.07322227954864502, |
| "reward": 0.64453125, |
| "reward_std": 0.19951260089874268, |
| "rewards/drgrpo_math_reward/mean": 0.64453125, |
| "rewards/drgrpo_math_reward/std": 0.4795927405357361, |
| "rho2": 0.3124999403953552, |
| "step": 31 |
| }, |
| { |
| "advantages/mean": 2.2118911147117615e-09, |
| "advantages/snr": 6.256172755954382e-09, |
| "advantages/std": 0.3535533845424652, |
| "advantages/var": 0.12499999572143228, |
| "completions/clipped_ratio": -2.671875, |
| "epoch": 0.1833810888252149, |
| "grad_norm": 75.90161991383617, |
| "learning_rate": 1.9903373464391184e-06, |
| "loss": -2.4459, |
| "num_tokens": 6388484.0, |
| "residual_var": 0.06250002980232239, |
| "reward": 0.8203125, |
| "reward_std": 0.24435339868068695, |
| "rewards/drgrpo_math_reward/mean": 0.8203125, |
| "rewards/drgrpo_math_reward/std": 0.38467901945114136, |
| "rho2": 0.49999991059303284, |
| "step": 32 |
| }, |
| { |
| "advantages/mean": -3.4924596548080444e-10, |
| "advantages/snr": 9.302683664335218e-10, |
| "advantages/std": 0.37542495131492615, |
| "advantages/var": 0.14094389406981467, |
| "completions/clipped_ratio": -1.953125, |
| "epoch": 0.18911174785100288, |
| "grad_norm": 88.08278791565704, |
| "learning_rate": 1.9897049837196347e-06, |
| "loss": -4.1498, |
| "num_tokens": 6608514.0, |
| "residual_var": 0.06606747210025787, |
| "reward": 0.58984375, |
| "reward_std": 0.2673616409301758, |
| "rewards/drgrpo_math_reward/mean": 0.58984375, |
| "rewards/drgrpo_math_reward/std": 0.49282538890838623, |
| "rho2": 0.5312498807907104, |
| "step": 33 |
| }, |
| { |
| "advantages/mean": -5.820766091346741e-10, |
| "advantages/snr": 1.9764402082422215e-09, |
| "advantages/std": 0.29450756311416626, |
| "advantages/var": 0.08673470473144462, |
| "completions/clipped_ratio": -2.640625, |
| "epoch": 0.19484240687679083, |
| "grad_norm": 158.9426980296641, |
| "learning_rate": 1.9890526863465443e-06, |
| "loss": -1.9826, |
| "num_tokens": 6787702.0, |
| "residual_var": 0.046077825129032135, |
| "reward": 0.8203125, |
| "reward_std": 0.18623007833957672, |
| "rewards/drgrpo_math_reward/mean": 0.8203125, |
| "rewards/drgrpo_math_reward/std": 0.38467901945114136, |
| "rho2": 0.46874988079071045, |
| "step": 34 |
| }, |
| { |
| "advantages/mean": 1.1641532182693481e-09, |
| "advantages/snr": 3.763894853665204e-09, |
| "advantages/std": 0.30929481983184814, |
| "advantages/var": 0.0956632855748154, |
| "completions/clipped_ratio": -2.5, |
| "epoch": 0.20057306590257878, |
| "grad_norm": 54.19371702022302, |
| "learning_rate": 1.9883804674584306e-06, |
| "loss": 0.1586, |
| "num_tokens": 6974380.0, |
| "residual_var": 0.05381060764193535, |
| "reward": 0.7578125, |
| "reward_std": 0.19530275464057922, |
| "rewards/drgrpo_math_reward/mean": 0.7578125, |
| "rewards/drgrpo_math_reward/std": 0.4292463958263397, |
| "rho2": 0.43749991059303284, |
| "step": 35 |
| }, |
| { |
| "advantages/mean": -2.444721758365631e-09, |
| "advantages/snr": 7.381391359252789e-09, |
| "advantages/std": 0.33120065927505493, |
| "advantages/var": 0.10969387670423103, |
| "completions/clipped_ratio": -2.359375, |
| "epoch": 0.20630372492836677, |
| "grad_norm": 93.4601212855401, |
| "learning_rate": 1.9876883405951377e-06, |
| "loss": -3.3799, |
| "num_tokens": 7168721.0, |
| "residual_var": 0.06513076275587082, |
| "reward": 0.765625, |
| "reward_std": 0.2095002979040146, |
| "rewards/drgrpo_math_reward/mean": 0.765625, |
| "rewards/drgrpo_math_reward/std": 0.42443734407424927, |
| "rho2": 0.4062499403953552, |
| "step": 36 |
| }, |
| { |
| "advantages/mean": 0.0, |
| "advantages/snr": 0.0, |
| "advantages/std": 0.3499271273612976, |
| "advantages/var": 0.1224489944633298, |
| "completions/clipped_ratio": -2.375, |
| "epoch": 0.21203438395415472, |
| "grad_norm": 153.14054844478008, |
| "learning_rate": 1.9869763196974956e-06, |
| "loss": -2.5922, |
| "num_tokens": 7366495.0, |
| "residual_var": 0.06887757778167725, |
| "reward": 0.6484375, |
| "reward_std": 0.2290911078453064, |
| "rewards/drgrpo_math_reward/mean": 0.6484375, |
| "rewards/drgrpo_math_reward/std": 0.47839346528053284, |
| "rho2": 0.43749988079071045, |
| "step": 37 |
| }, |
| { |
| "advantages/mean": 1.6298145055770874e-09, |
| "advantages/snr": 4.5749320956588125e-09, |
| "advantages/std": 0.3562488853931427, |
| "advantages/var": 0.12691326834385652, |
| "completions/clipped_ratio": -2.421875, |
| "epoch": 0.2177650429799427, |
| "grad_norm": 73.9873748933185, |
| "learning_rate": 1.986244419107041e-06, |
| "loss": -1.17, |
| "num_tokens": 7563588.0, |
| "residual_var": 0.06345665454864502, |
| "reward": 0.68359375, |
| "reward_std": 0.24606089293956757, |
| "rewards/drgrpo_math_reward/mean": 0.68359375, |
| "rewards/drgrpo_math_reward/std": 0.4659844934940338, |
| "rho2": 0.49999991059303284, |
| "step": 38 |
| }, |
| { |
| "advantages/mean": 1.1641532182693481e-09, |
| "advantages/snr": 4.1231406671042415e-09, |
| "advantages/std": 0.2823462188243866, |
| "advantages/var": 0.0797193872844284, |
| "completions/clipped_ratio": -2.171875, |
| "epoch": 0.22349570200573066, |
| "grad_norm": 77.56288158012738, |
| "learning_rate": 1.9854926535657268e-06, |
| "loss": -1.8093, |
| "num_tokens": 7755300.0, |
| "residual_var": 0.05978955700993538, |
| "reward": 0.59765625, |
| "reward_std": 0.1521669626235962, |
| "rewards/drgrpo_math_reward/mean": 0.59765625, |
| "rewards/drgrpo_math_reward/std": 0.4913311004638672, |
| "rho2": 0.24999994039535522, |
| "step": 39 |
| }, |
| { |
| "advantages/mean": -4.656612873077393e-10, |
| "advantages/snr": 1.4311629836649927e-09, |
| "advantages/std": 0.3253726363182068, |
| "advantages/var": 0.10586735246466006, |
| "completions/clipped_ratio": -2.671875, |
| "epoch": 0.22922636103151864, |
| "grad_norm": 87.24551220149847, |
| "learning_rate": 1.9847210382156263e-06, |
| "loss": -1.523, |
| "num_tokens": 7934446.0, |
| "residual_var": 0.06616710126399994, |
| "reward": 0.7109375, |
| "reward_std": 0.20490340888500214, |
| "rewards/drgrpo_math_reward/mean": 0.7109375, |
| "rewards/drgrpo_math_reward/std": 0.45421501994132996, |
| "rho2": 0.3749999403953552, |
| "step": 40 |
| }, |
| { |
| "advantages/mean": -1.862645149230957e-09, |
| "advantages/snr": 5.9243035139399444e-09, |
| "advantages/std": 0.31440743803977966, |
| "advantages/var": 0.09885203709473789, |
| "completions/clipped_ratio": -2.328125, |
| "epoch": 0.2349570200573066, |
| "grad_norm": 55.094477304288226, |
| "learning_rate": 1.9839295885986295e-06, |
| "loss": -0.5665, |
| "num_tokens": 8125659.0, |
| "residual_var": 0.05560428649187088, |
| "reward": 0.71484375, |
| "reward_std": 0.1922660768032074, |
| "rewards/drgrpo_math_reward/mean": 0.71484375, |
| "rewards/drgrpo_math_reward/std": 0.4523732364177704, |
| "rho2": 0.4374999403953552, |
| "step": 41 |
| }, |
| { |
| "advantages/mean": 1.1641532182693481e-10, |
| "advantages/snr": 3.417019487087845e-10, |
| "advantages/std": 0.34069257974624634, |
| "advantages/var": 0.11607143389415242, |
| "completions/clipped_ratio": -2.21875, |
| "epoch": 0.24068767908309455, |
| "grad_norm": 82.4610385566399, |
| "learning_rate": 1.983118320656126e-06, |
| "loss": -2.4495, |
| "num_tokens": 8321968.0, |
| "residual_var": 0.06529020518064499, |
| "reward": 0.65625, |
| "reward_std": 0.22225631773471832, |
| "rewards/drgrpo_math_reward/mean": 0.65625, |
| "rewards/drgrpo_math_reward/std": 0.47588926553726196, |
| "rho2": 0.43749991059303284, |
| "step": 42 |
| }, |
| { |
| "advantages/mean": 0.0, |
| "advantages/snr": 0.0, |
| "advantages/std": 0.3154200315475464, |
| "advantages/var": 0.09948979630145516, |
| "completions/clipped_ratio": -2.4375, |
| "epoch": 0.24641833810888253, |
| "grad_norm": 74.27035735922438, |
| "learning_rate": 1.9822872507286887e-06, |
| "loss": -3.659, |
| "num_tokens": 8503101.0, |
| "residual_var": 0.05907208472490311, |
| "reward": 0.78125, |
| "reward_std": 0.1927964836359024, |
| "rewards/drgrpo_math_reward/mean": 0.78125, |
| "rewards/drgrpo_math_reward/std": 0.41420844197273254, |
| "rho2": 0.4062499403953552, |
| "step": 43 |
| }, |
| { |
| "advantages/mean": -4.656612873077393e-10, |
| "advantages/snr": 1.5639963792566626e-09, |
| "advantages/std": 0.29773807525634766, |
| "advantages/var": 0.08864796145735454, |
| "completions/clipped_ratio": -2.546875, |
| "epoch": 0.2521489971346705, |
| "grad_norm": 124.26993483415545, |
| "learning_rate": 1.98143639555574e-06, |
| "loss": -4.6213, |
| "num_tokens": 8678007.0, |
| "residual_var": 0.055404990911483765, |
| "reward": 0.76171875, |
| "reward_std": 0.17491313815116882, |
| "rewards/drgrpo_math_reward/mean": 0.76171875, |
| "rewards/drgrpo_math_reward/std": 0.4268665909767151, |
| "rho2": 0.3749999403953552, |
| "step": 44 |
| }, |
| { |
| "advantages/mean": 3.4924596548080444e-10, |
| "advantages/snr": 1.2129215590738813e-09, |
| "advantages/std": 0.28793779015541077, |
| "advantages/var": 0.08290817099958137, |
| "completions/clipped_ratio": -2.421875, |
| "epoch": 0.25787965616045844, |
| "grad_norm": 54.50555477439546, |
| "learning_rate": 1.98056577227522e-06, |
| "loss": -2.9954, |
| "num_tokens": 8867225.0, |
| "residual_var": 0.05440850183367729, |
| "reward": 0.6796875, |
| "reward_std": 0.1621546745300293, |
| "rewards/drgrpo_math_reward/mean": 0.6796875, |
| "rewards/drgrpo_math_reward/std": 0.4675106406211853, |
| "rho2": 0.34374991059303284, |
| "step": 45 |
| }, |
| { |
| "advantages/mean": 1.6298145055770874e-09, |
| "advantages/snr": 5.47398732739832e-09, |
| "advantages/std": 0.29773807525634766, |
| "advantages/var": 0.08864796145735454, |
| "completions/clipped_ratio": -2.390625, |
| "epoch": 0.2636103151862464, |
| "grad_norm": 50.70189605191493, |
| "learning_rate": 1.9796753984232355e-06, |
| "loss": -3.1074, |
| "num_tokens": 9076637.0, |
| "residual_var": 0.06371574103832245, |
| "reward": 0.66015625, |
| "reward_std": 0.16834282875061035, |
| "rewards/drgrpo_math_reward/mean": 0.66015625, |
| "rewards/drgrpo_math_reward/std": 0.47458380460739136, |
| "rho2": 0.2812499403953552, |
| "step": 46 |
| }, |
| { |
| "advantages/mean": 2.3283064365386963e-10, |
| "advantages/snr": 9.509314693234178e-10, |
| "advantages/std": 0.24484480917453766, |
| "advantages/var": 0.05994898057971576, |
| "completions/clipped_ratio": -2.671875, |
| "epoch": 0.2693409742120344, |
| "grad_norm": 43.609323731133514, |
| "learning_rate": 1.9787652919337115e-06, |
| "loss": -1.4678, |
| "num_tokens": 9242656.0, |
| "residual_var": 0.04683515429496765, |
| "reward": 0.859375, |
| "reward_std": 0.11283563077449799, |
| "rewards/drgrpo_math_reward/mean": 0.859375, |
| "rewards/drgrpo_math_reward/std": 0.3483152687549591, |
| "rho2": 0.21874994039535522, |
| "step": 47 |
| }, |
| { |
| "advantages/mean": -1.3969838619232178e-09, |
| "advantages/snr": 4.5166742596055914e-09, |
| "advantages/std": 0.30929479002952576, |
| "advantages/var": 0.09566326713940843, |
| "completions/clipped_ratio": -2.421875, |
| "epoch": 0.27507163323782235, |
| "grad_norm": 138.18146615794635, |
| "learning_rate": 1.977835471138027e-06, |
| "loss": -1.2657, |
| "num_tokens": 9419647.0, |
| "residual_var": 0.05381060019135475, |
| "reward": 0.78125, |
| "reward_std": 0.19530275464057922, |
| "rewards/drgrpo_math_reward/mean": 0.78125, |
| "rewards/drgrpo_math_reward/std": 0.41420844197273254, |
| "rho2": 0.4374999403953552, |
| "step": 48 |
| }, |
| { |
| "advantages/mean": -2.3283064365386963e-10, |
| "advantages/snr": 8.830797509141478e-10, |
| "advantages/std": 0.2636575400829315, |
| "advantages/var": 0.06951529844258264, |
| "completions/clipped_ratio": -2.40625, |
| "epoch": 0.2808022922636103, |
| "grad_norm": 37.49999329733934, |
| "learning_rate": 1.9768859547646473e-06, |
| "loss": -0.3458, |
| "num_tokens": 9592645.0, |
| "residual_var": 0.04779179021716118, |
| "reward": 0.79296875, |
| "reward_std": 0.14256632328033447, |
| "rewards/drgrpo_math_reward/mean": 0.79296875, |
| "rewards/drgrpo_math_reward/std": 0.40597182512283325, |
| "rho2": 0.3124999403953552, |
| "step": 49 |
| }, |
| { |
| "advantages/mean": -2.7939677238464355e-09, |
| "advantages/snr": 8.510420944439486e-09, |
| "advantages/std": 0.3282995820045471, |
| "advantages/var": 0.10778061554436036, |
| "completions/clipped_ratio": -2.53125, |
| "epoch": 0.28653295128939826, |
| "grad_norm": 58.64737177653492, |
| "learning_rate": 1.9759167619387473e-06, |
| "loss": -2.9043, |
| "num_tokens": 9765343.0, |
| "residual_var": 0.06399475783109665, |
| "reward": 0.72265625, |
| "reward_std": 0.20779283344745636, |
| "rewards/drgrpo_math_reward/mean": 0.72265625, |
| "rewards/drgrpo_math_reward/std": 0.4485645890235901, |
| "rho2": 0.4062499403953552, |
| "step": 50 |
| }, |
| { |
| "advantages/mean": 8.149072527885437e-10, |
| "advantages/snr": 2.2320711875154694e-09, |
| "advantages/std": 0.36509016156196594, |
| "advantages/var": 0.1332908260693424, |
| "completions/clipped_ratio": -2.453125, |
| "epoch": 0.2922636103151863, |
| "grad_norm": 89.34276653392745, |
| "learning_rate": 1.9749279121818236e-06, |
| "loss": -1.9981, |
| "num_tokens": 9950969.0, |
| "residual_var": 0.05831475183367729, |
| "reward": 0.67578125, |
| "reward_std": 0.26645296812057495, |
| "rewards/drgrpo_math_reward/mean": 0.67578125, |
| "rewards/drgrpo_math_reward/std": 0.46899911761283875, |
| "rho2": 0.5624998807907104, |
| "step": 51 |
| }, |
| { |
| "advantages/mean": -1.1641532182693481e-10, |
| "advantages/snr": 3.9972132486104327e-10, |
| "advantages/std": 0.29124119877815247, |
| "advantages/var": 0.08482143586573532, |
| "completions/clipped_ratio": -2.6875, |
| "epoch": 0.2979942693409742, |
| "grad_norm": 54.58559855508494, |
| "learning_rate": 1.973919425411304e-06, |
| "loss": -1.13, |
| "num_tokens": 10121127.0, |
| "residual_var": 0.06361608952283859, |
| "reward": 0.77734375, |
| "reward_std": 0.15729182958602905, |
| "rewards/drgrpo_math_reward/mean": 0.77734375, |
| "rewards/drgrpo_math_reward/std": 0.41684433817863464, |
| "rho2": 0.24999994039535522, |
| "step": 52 |
| }, |
| { |
| "advantages/mean": 4.656612873077393e-10, |
| "advantages/snr": 1.8081168879392927e-09, |
| "advantages/std": 0.2575393617153168, |
| "advantages/var": 0.06632652283273277, |
| "completions/clipped_ratio": -2.703125, |
| "epoch": 0.3037249283667622, |
| "grad_norm": 39.35070508430935, |
| "learning_rate": 1.9728913219401447e-06, |
| "loss": -1.8596, |
| "num_tokens": 10285728.0, |
| "residual_var": 0.045599501579999924, |
| "reward": 0.78125, |
| "reward_std": 0.13914892077445984, |
| "rewards/drgrpo_math_reward/mean": 0.78125, |
| "rewards/drgrpo_math_reward/std": 0.41420844197273254, |
| "rho2": 0.3124999403953552, |
| "step": 53 |
| }, |
| { |
| "advantages/mean": 0.0, |
| "advantages/snr": 0.0, |
| "advantages/std": 0.253797709941864, |
| "advantages/var": 0.06441327757173454, |
| "completions/clipped_ratio": -2.59375, |
| "epoch": 0.30945558739255014, |
| "grad_norm": 136.99193873954198, |
| "learning_rate": 1.971843622476423e-06, |
| "loss": -1.4447, |
| "num_tokens": 10451827.0, |
| "residual_var": 0.04830996319651604, |
| "reward": 0.83203125, |
| "reward_std": 0.1238841786980629, |
| "rewards/drgrpo_math_reward/mean": 0.83203125, |
| "rewards/drgrpo_math_reward/std": 0.3745708465576172, |
| "rho2": 0.24999994039535522, |
| "step": 54 |
| }, |
| { |
| "advantages/mean": 1.280568540096283e-09, |
| "advantages/snr": 3.811440982290487e-09, |
| "advantages/std": 0.3359801471233368, |
| "advantages/var": 0.11288265926101904, |
| "completions/clipped_ratio": -2.46875, |
| "epoch": 0.3151862464183381, |
| "grad_norm": 87.22776016485538, |
| "learning_rate": 1.970776348122918e-06, |
| "loss": -2.9245, |
| "num_tokens": 10635142.0, |
| "residual_var": 0.07407926768064499, |
| "reward": 0.69140625, |
| "reward_std": 0.1993604302406311, |
| "rewards/drgrpo_math_reward/mean": 0.69140625, |
| "rewards/drgrpo_math_reward/std": 0.46281787753105164, |
| "rho2": 0.34374991059303284, |
| "step": 55 |
| }, |
| { |
| "advantages/mean": 6.984919309616089e-10, |
| "advantages/snr": 2.3376010956411484e-09, |
| "advantages/std": 0.29880714416503906, |
| "advantages/var": 0.08928570940406644, |
| "completions/clipped_ratio": -2.65625, |
| "epoch": 0.3209169054441261, |
| "grad_norm": 64.42194097152215, |
| "learning_rate": 1.9696895203766866e-06, |
| "loss": -2.6551, |
| "num_tokens": 10816137.0, |
| "residual_var": 0.055803585797548294, |
| "reward": 0.765625, |
| "reward_std": 0.18781886994838715, |
| "rewards/drgrpo_math_reward/mean": 0.765625, |
| "rewards/drgrpo_math_reward/std": 0.42443734407424927, |
| "rho2": 0.3749999403953552, |
| "step": 56 |
| }, |
| { |
| "advantages/mean": 1.1641532182693481e-10, |
| "advantages/snr": 3.136579162365215e-10, |
| "advantages/std": 0.3711537718772888, |
| "advantages/var": 0.13775512237873855, |
| "completions/clipped_ratio": -2.703125, |
| "epoch": 0.32664756446991405, |
| "grad_norm": 70.92817738441774, |
| "learning_rate": 1.968583161128631e-06, |
| "loss": -1.7118, |
| "num_tokens": 10987350.0, |
| "residual_var": 0.06457272171974182, |
| "reward": 0.6328125, |
| "reward_std": 0.25103604793548584, |
| "rewards/drgrpo_math_reward/mean": 0.6328125, |
| "rewards/drgrpo_math_reward/std": 0.48298248648643494, |
| "rho2": 0.5312498807907104, |
| "step": 57 |
| }, |
| { |
| "advantages/mean": -1.1641532182693481e-09, |
| "advantages/snr": 5.3587966720360335e-09, |
| "advantages/std": 0.21724152565002441, |
| "advantages/var": 0.047193880466750215, |
| "completions/clipped_ratio": -2.6875, |
| "epoch": 0.332378223495702, |
| "grad_norm": 47.37141263170206, |
| "learning_rate": 1.9674572926630564e-06, |
| "loss": -0.5569, |
| "num_tokens": 11138208.0, |
| "residual_var": 0.03539542108774185, |
| "reward": 0.859375, |
| "reward_std": 0.10034801065921783, |
| "rewards/drgrpo_math_reward/mean": 0.859375, |
| "rewards/drgrpo_math_reward/std": 0.3483152687549591, |
| "rho2": 0.24999994039535522, |
| "step": 58 |
| }, |
| { |
| "advantages/mean": 2.3283064365386963e-10, |
| "advantages/snr": 1.0059432951138956e-09, |
| "advantages/std": 0.2314550280570984, |
| "advantages/var": 0.0535714300129122, |
| "completions/clipped_ratio": -2.703125, |
| "epoch": 0.33810888252148996, |
| "grad_norm": 38.885603895596766, |
| "learning_rate": 1.966311937657224e-06, |
| "loss": -1.2637, |
| "num_tokens": 11306846.0, |
| "residual_var": 0.043526798486709595, |
| "reward": 0.8828125, |
| "reward_std": 0.10007961094379425, |
| "rewards/drgrpo_math_reward/mean": 0.8828125, |
| "rewards/drgrpo_math_reward/std": 0.3222736418247223, |
| "rho2": 0.1874999701976776, |
| "step": 59 |
| }, |
| { |
| "advantages/mean": -4.656612873077393e-10, |
| "advantages/snr": 1.2488633010539937e-09, |
| "advantages/std": 0.3728680908679962, |
| "advantages/var": 0.13903061318754428, |
| "completions/clipped_ratio": -2.546875, |
| "epoch": 0.3438395415472779, |
| "grad_norm": 156.58855155725036, |
| "learning_rate": 1.9651471191808923e-06, |
| "loss": -4.2495, |
| "num_tokens": 11494798.0, |
| "residual_var": 0.06082591786980629, |
| "reward": 0.703125, |
| "reward_std": 0.2780294418334961, |
| "rewards/drgrpo_math_reward/mean": 0.703125, |
| "rewards/drgrpo_math_reward/std": 0.45777595043182373, |
| "rho2": 0.5624998807907104, |
| "step": 60 |
| }, |
| { |
| "advantages/mean": -1.6298145055770874e-09, |
| "advantages/snr": 4.7448812031710745e-09, |
| "advantages/std": 0.34348899126052856, |
| "advantages/var": 0.11798468711717547, |
| "completions/clipped_ratio": -2.4375, |
| "epoch": 0.3495702005730659, |
| "grad_norm": 63.90428061896672, |
| "learning_rate": 1.9639628606958534e-06, |
| "loss": -3.3587, |
| "num_tokens": 11692303.0, |
| "residual_var": 0.07374045252799988, |
| "reward": 0.60546875, |
| "reward_std": 0.2168606072664261, |
| "rewards/drgrpo_math_reward/mean": 0.60546875, |
| "rewards/drgrpo_math_reward/std": 0.48970720171928406, |
| "rho2": 0.37499988079071045, |
| "step": 61 |
| }, |
| { |
| "advantages/mean": -1.5133991837501526e-09, |
| "advantages/snr": 4.959639056521822e-09, |
| "advantages/std": 0.30514299869537354, |
| "advantages/var": 0.09311224965280473, |
| "completions/clipped_ratio": -2.34375, |
| "epoch": 0.3553008595988539, |
| "grad_norm": 87.18799338980655, |
| "learning_rate": 1.962759186055461e-06, |
| "loss": -1.0981, |
| "num_tokens": 11866779.0, |
| "residual_var": 0.06110493093729019, |
| "reward": 0.6953125, |
| "reward_std": 0.17885848879814148, |
| "rewards/drgrpo_math_reward/mean": 0.6953125, |
| "rewards/drgrpo_math_reward/std": 0.4611765742301941, |
| "rho2": 0.34374991059303284, |
| "step": 62 |
| }, |
| { |
| "advantages/mean": 1.862645149230957e-09, |
| "advantages/snr": 6.104171146488397e-09, |
| "advantages/std": 0.30514299869537354, |
| "advantages/var": 0.09311224965280473, |
| "completions/clipped_ratio": -2.5, |
| "epoch": 0.36103151862464183, |
| "grad_norm": 76.80663904599737, |
| "learning_rate": 1.961536119504149e-06, |
| "loss": -4.3722, |
| "num_tokens": 12044597.0, |
| "residual_var": 0.04655614122748375, |
| "reward": 0.765625, |
| "reward_std": 0.19898608326911926, |
| "rewards/drgrpo_math_reward/mean": 0.765625, |
| "rewards/drgrpo_math_reward/std": 0.42443734407424927, |
| "rho2": 0.49999991059303284, |
| "step": 63 |
| }, |
| { |
| "advantages/mean": -6.984919309616089e-10, |
| "advantages/snr": 2.6864667746454993e-09, |
| "advantages/std": 0.260003924369812, |
| "advantages/var": 0.06760204068770292, |
| "completions/clipped_ratio": -2.40625, |
| "epoch": 0.3667621776504298, |
| "grad_norm": 50.8446775500655, |
| "learning_rate": 1.960293685676943e-06, |
| "loss": -2.5164, |
| "num_tokens": 12235035.0, |
| "residual_var": 0.05070154368877411, |
| "reward": 0.75, |
| "reward_std": 0.12730157375335693, |
| "rewards/drgrpo_math_reward/mean": 0.75, |
| "rewards/drgrpo_math_reward/std": 0.4338609278202057, |
| "rho2": 0.24999995529651642, |
| "step": 64 |
| }, |
| { |
| "advantages/mean": -1.7462298274040222e-09, |
| "advantages/snr": 5.907641413650253e-09, |
| "advantages/std": 0.29558831453323364, |
| "advantages/var": 0.08737245168859786, |
| "completions/clipped_ratio": -2.15625, |
| "epoch": 0.37249283667621774, |
| "grad_norm": 55.661046948184854, |
| "learning_rate": 1.9590319095989656e-06, |
| "loss": -2.5047, |
| "num_tokens": 12433204.0, |
| "residual_var": 0.05187740921974182, |
| "reward": 0.68359375, |
| "reward_std": 0.1743851751089096, |
| "rewards/drgrpo_math_reward/mean": 0.68359375, |
| "rewards/drgrpo_math_reward/std": 0.4659844934940338, |
| "rho2": 0.4062499403953552, |
| "step": 65 |
| }, |
| { |
| "advantages/mean": 2.0954757928848267e-09, |
| "advantages/snr": 7.875790433379715e-09, |
| "advantages/std": 0.26606544852256775, |
| "advantages/var": 0.07079082289751515, |
| "completions/clipped_ratio": -2.796875, |
| "epoch": 0.37822349570200575, |
| "grad_norm": 66.03759288988809, |
| "learning_rate": 1.9577508166849303e-06, |
| "loss": -2.0897, |
| "num_tokens": 12591123.0, |
| "residual_var": 0.04645648971199989, |
| "reward": 0.83203125, |
| "reward_std": 0.1437433809041977, |
| "rewards/drgrpo_math_reward/mean": 0.83203125, |
| "rewards/drgrpo_math_reward/std": 0.3745708465576172, |
| "rho2": 0.34374991059303284, |
| "step": 66 |
| }, |
| { |
| "advantages/mean": 5.820766091346741e-10, |
| "advantages/snr": 1.7998288379265802e-09, |
| "advantages/std": 0.3234066367149353, |
| "advantages/var": 0.10459185267126614, |
| "completions/clipped_ratio": -2.4375, |
| "epoch": 0.3839541547277937, |
| "grad_norm": 62.754101502938425, |
| "learning_rate": 1.9564504327386314e-06, |
| "loss": -1.9607, |
| "num_tokens": 12770302.0, |
| "residual_var": 0.062101420015096664, |
| "reward": 0.6953125, |
| "reward_std": 0.1914672553539276, |
| "rewards/drgrpo_math_reward/mean": 0.6953125, |
| "rewards/drgrpo_math_reward/std": 0.4611765742301941, |
| "rho2": 0.40624991059303284, |
| "step": 67 |
| }, |
| { |
| "advantages/mean": 2.3283064365386963e-10, |
| "advantages/snr": 7.905760832968887e-10, |
| "advantages/std": 0.29450756311416626, |
| "advantages/var": 0.08673470473144462, |
| "completions/clipped_ratio": -2.5625, |
| "epoch": 0.38968481375358166, |
| "grad_norm": 60.793677536940486, |
| "learning_rate": 1.955130783952423e-06, |
| "loss": -0.3413, |
| "num_tokens": 12959733.0, |
| "residual_var": 0.05420919880270958, |
| "reward": 0.7421875, |
| "reward_std": 0.1732056736946106, |
| "rewards/drgrpo_math_reward/mean": 0.7421875, |
| "rewards/drgrpo_math_reward/std": 0.4382871091365814, |
| "rho2": 0.3749999403953552, |
| "step": 68 |
| }, |
| { |
| "advantages/mean": -6.984919309616089e-10, |
| "advantages/snr": 2.6252634777932388e-09, |
| "advantages/std": 0.26606544852256775, |
| "advantages/var": 0.07079082289751515, |
| "completions/clipped_ratio": -2.65625, |
| "epoch": 0.3954154727793696, |
| "grad_norm": 48.873217225599404, |
| "learning_rate": 1.953791896906692e-06, |
| "loss": -1.5571, |
| "num_tokens": 13113127.0, |
| "residual_var": 0.05309312045574188, |
| "reward": 0.83984375, |
| "reward_std": 0.13717305660247803, |
| "rewards/drgrpo_math_reward/mean": 0.83984375, |
| "rewards/drgrpo_math_reward/std": 0.36746934056282043, |
| "rho2": 0.2499999701976776, |
| "step": 69 |
| }, |
| { |
| "advantages/mean": 0.0, |
| "advantages/snr": 0.0, |
| "advantages/std": 0.23007319867610931, |
| "advantages/var": 0.05293367674905647, |
| "completions/clipped_ratio": -2.78125, |
| "epoch": 0.40114613180515757, |
| "grad_norm": 49.36381459017057, |
| "learning_rate": 1.9524337985693227e-06, |
| "loss": -1.7489, |
| "num_tokens": 13276750.0, |
| "residual_var": 0.036391910165548325, |
| "reward": 0.80859375, |
| "reward_std": 0.11902770400047302, |
| "rewards/drgrpo_math_reward/mean": 0.80859375, |
| "rewards/drgrpo_math_reward/std": 0.39417871832847595, |
| "rho2": 0.3124999403953552, |
| "step": 70 |
| }, |
| { |
| "advantages/mean": 6.984919309616089e-10, |
| "advantages/snr": 2.899437187779043e-09, |
| "advantages/std": 0.2409060299396515, |
| "advantages/var": 0.05803571526128426, |
| "completions/clipped_ratio": -2.625, |
| "epoch": 0.4068767908309456, |
| "grad_norm": 36.88997932012926, |
| "learning_rate": 1.9510565162951534e-06, |
| "loss": -1.3753, |
| "num_tokens": 13442695.0, |
| "residual_var": 0.041713181883096695, |
| "reward": 0.83984375, |
| "reward_std": 0.11823134124279022, |
| "rewards/drgrpo_math_reward/mean": 0.83984375, |
| "rewards/drgrpo_math_reward/std": 0.36746934056282043, |
| "rho2": 0.2812499403953552, |
| "step": 71 |
| }, |
| { |
| "advantages/mean": -2.444721758365631e-09, |
| "advantages/snr": 8.331736741469324e-09, |
| "advantages/std": 0.2934228181838989, |
| "advantages/var": 0.0860969502309814, |
| "completions/clipped_ratio": -2.78125, |
| "epoch": 0.41260744985673353, |
| "grad_norm": 69.03881568139195, |
| "learning_rate": 1.949660077825426e-06, |
| "loss": 0.1049, |
| "num_tokens": 13623554.0, |
| "residual_var": 0.06457272171974182, |
| "reward": 0.85546875, |
| "reward_std": 0.15900176763534546, |
| "rewards/drgrpo_math_reward/mean": 0.85546875, |
| "rewards/drgrpo_math_reward/std": 0.35231640934944153, |
| "rho2": 0.24999995529651642, |
| "step": 72 |
| }, |
| { |
| "advantages/mean": -6.984919309616089e-10, |
| "advantages/snr": 2.2074180229024673e-09, |
| "advantages/std": 0.31642937660217285, |
| "advantages/var": 0.10012755037683974, |
| "completions/clipped_ratio": -2.71875, |
| "epoch": 0.4183381088825215, |
| "grad_norm": 75.3655355836356, |
| "learning_rate": 1.948244511287226e-06, |
| "loss": -0.4727, |
| "num_tokens": 13799558.0, |
| "residual_var": 0.0688377097249031, |
| "reward": 0.75390625, |
| "reward_std": 0.18622371554374695, |
| "rewards/drgrpo_math_reward/mean": 0.75390625, |
| "rewards/drgrpo_math_reward/std": 0.43157756328582764, |
| "rho2": 0.3124999403953552, |
| "step": 73 |
| }, |
| { |
| "advantages/mean": -9.313225746154785e-10, |
| "advantages/snr": 2.8623259673299855e-09, |
| "advantages/std": 0.3253726363182068, |
| "advantages/var": 0.10586735246466006, |
| "completions/clipped_ratio": -2.53125, |
| "epoch": 0.42406876790830944, |
| "grad_norm": 50.55349729785912, |
| "learning_rate": 1.946809845192918e-06, |
| "loss": -0.6276, |
| "num_tokens": 13979108.0, |
| "residual_var": 0.052933696657419205, |
| "reward": 0.7578125, |
| "reward_std": 0.21910977363586426, |
| "rewards/drgrpo_math_reward/mean": 0.7578125, |
| "rewards/drgrpo_math_reward/std": 0.4292463958263397, |
| "rho2": 0.49999991059303284, |
| "step": 74 |
| }, |
| { |
| "advantages/mean": 1.979060471057892e-09, |
| "advantages/snr": 7.124253269284449e-09, |
| "advantages/std": 0.27779197692871094, |
| "advantages/var": 0.07716838244596147, |
| "completions/clipped_ratio": -2.8125, |
| "epoch": 0.4297994269340974, |
| "grad_norm": 149.96300323315998, |
| "learning_rate": 1.9453561084395687e-06, |
| "loss": -3.9114, |
| "num_tokens": 14147162.0, |
| "residual_var": 0.04823024198412895, |
| "reward": 0.75390625, |
| "reward_std": 0.1629534810781479, |
| "rewards/drgrpo_math_reward/mean": 0.75390625, |
| "rewards/drgrpo_math_reward/std": 0.43157756328582764, |
| "rho2": 0.37499988079071045, |
| "step": 75 |
| }, |
| { |
| "advantages/mean": 0.0, |
| "advantages/snr": 0.0, |
| "advantages/std": 0.31743553280830383, |
| "advantages/var": 0.10076531748929174, |
| "completions/clipped_ratio": -2.453125, |
| "epoch": 0.4355300859598854, |
| "grad_norm": 63.727237228202135, |
| "learning_rate": 1.9438833303083674e-06, |
| "loss": -3.3417, |
| "num_tokens": 14317167.0, |
| "residual_var": 0.056680500507354736, |
| "reward": 0.65625, |
| "reward_std": 0.20688170194625854, |
| "rewards/drgrpo_math_reward/mean": 0.65625, |
| "rewards/drgrpo_math_reward/std": 0.47588926553726196, |
| "rho2": 0.4374999403953552, |
| "step": 76 |
| }, |
| { |
| "advantages/mean": -2.3283064365386963e-09, |
| "advantages/snr": 7.709835570839547e-09, |
| "advantages/std": 0.30199170112609863, |
| "advantages/var": 0.09119898754903488, |
| "completions/clipped_ratio": -2.578125, |
| "epoch": 0.44126074498567336, |
| "grad_norm": 56.35814893945447, |
| "learning_rate": 1.9423915404640348e-06, |
| "loss": -0.9106, |
| "num_tokens": 14492451.0, |
| "residual_var": 0.05984935164451599, |
| "reward": 0.66796875, |
| "reward_std": 0.1706969439983368, |
| "rewards/drgrpo_math_reward/mean": 0.66796875, |
| "rewards/drgrpo_math_reward/std": 0.4718646705150604, |
| "rho2": 0.34374988079071045, |
| "step": 77 |
| }, |
| { |
| "advantages/mean": 1.280568540096283e-09, |
| "advantages/snr": 5.1486095650116935e-09, |
| "advantages/std": 0.24872122704982758, |
| "advantages/var": 0.06186224878517188, |
| "completions/clipped_ratio": -2.640625, |
| "epoch": 0.4469914040114613, |
| "grad_norm": 36.08436726472045, |
| "learning_rate": 1.9408807689542254e-06, |
| "loss": -1.3103, |
| "num_tokens": 14657029.0, |
| "residual_var": 0.044463496655225754, |
| "reward": 0.69140625, |
| "reward_std": 0.12164628505706787, |
| "rewards/drgrpo_math_reward/mean": 0.69140625, |
| "rewards/drgrpo_math_reward/std": 0.46281787753105164, |
| "rho2": 0.2812499403953552, |
| "step": 78 |
| }, |
| { |
| "advantages/mean": 2.3283064365386963e-10, |
| "advantages/snr": 8.416325633651013e-10, |
| "advantages/std": 0.2766416668891907, |
| "advantages/var": 0.07653061185922994, |
| "completions/clipped_ratio": -2.78125, |
| "epoch": 0.45272206303724927, |
| "grad_norm": 58.90239262695802, |
| "learning_rate": 1.9393510462089237e-06, |
| "loss": -1.5106, |
| "num_tokens": 14822142.0, |
| "residual_var": 0.05739796906709671, |
| "reward": 0.8359375, |
| "reward_std": 0.14874956011772156, |
| "rewards/drgrpo_math_reward/mean": 0.8359375, |
| "rewards/drgrpo_math_reward/std": 0.3710577189922333, |
| "rho2": 0.24999994039535522, |
| "step": 79 |
| }, |
| { |
| "advantages/mean": 0.0, |
| "advantages/snr": 0.0, |
| "advantages/std": 0.3061862289905548, |
| "advantages/var": 0.09375000682345647, |
| "completions/clipped_ratio": -2.515625, |
| "epoch": 0.4584527220630373, |
| "grad_norm": 52.44279292628102, |
| "learning_rate": 1.937802403039829e-06, |
| "loss": -0.0152, |
| "num_tokens": 15004930.0, |
| "residual_var": 0.05859377235174179, |
| "reward": 0.67578125, |
| "reward_std": 0.18003800511360168, |
| "rewards/drgrpo_math_reward/mean": 0.67578125, |
| "rewards/drgrpo_math_reward/std": 0.46899911761283875, |
| "rho2": 0.37499991059303284, |
| "step": 80 |
| }, |
| { |
| "advantages/mean": 5.820766091346741e-10, |
| "advantages/snr": 1.7085097435439224e-09, |
| "advantages/std": 0.34069257974624634, |
| "advantages/var": 0.11607143389415242, |
| "completions/clipped_ratio": -2.5625, |
| "epoch": 0.46418338108882523, |
| "grad_norm": 69.59869023850752, |
| "learning_rate": 1.936234870639737e-06, |
| "loss": -2.8108, |
| "num_tokens": 15191177.0, |
| "residual_var": 0.07254466414451599, |
| "reward": 0.6640625, |
| "reward_std": 0.22160722315311432, |
| "rewards/drgrpo_math_reward/mean": 0.6640625, |
| "rewards/drgrpo_math_reward/std": 0.4732423722743988, |
| "rho2": 0.3749999403953552, |
| "step": 81 |
| }, |
| { |
| "advantages/mean": 1.6298145055770874e-09, |
| "advantages/snr": 5.575178862375007e-09, |
| "advantages/std": 0.29233402013778687, |
| "advantages/var": 0.08545917932991998, |
| "completions/clipped_ratio": -2.65625, |
| "epoch": 0.4699140401146132, |
| "grad_norm": 59.051506077119946, |
| "learning_rate": 1.934648480581911e-06, |
| "loss": -1.8163, |
| "num_tokens": 15362117.0, |
| "residual_var": 0.04807080700993538, |
| "reward": 0.734375, |
| "reward_std": 0.17859891057014465, |
| "rewards/drgrpo_math_reward/mean": 0.734375, |
| "rewards/drgrpo_math_reward/std": 0.4425306022167206, |
| "rho2": 0.43749991059303284, |
| "step": 82 |
| }, |
| { |
| "advantages/mean": 2.3283064365386963e-10, |
| "advantages/snr": 8.830796510959388e-10, |
| "advantages/std": 0.2636575698852539, |
| "advantages/var": 0.06951531415779755, |
| "completions/clipped_ratio": -2.359375, |
| "epoch": 0.47564469914040114, |
| "grad_norm": 39.306186581687186, |
| "learning_rate": 1.933043264819444e-06, |
| "loss": -0.6807, |
| "num_tokens": 15550143.0, |
| "residual_var": 0.05213649198412895, |
| "reward": 0.75390625, |
| "reward_std": 0.12900903820991516, |
| "rewards/drgrpo_math_reward/mean": 0.75390625, |
| "rewards/drgrpo_math_reward/std": 0.43157756328582764, |
| "rho2": 0.24999994039535522, |
| "step": 83 |
| }, |
| { |
| "advantages/mean": -6.984919309616089e-10, |
| "advantages/snr": 2.0968132639044306e-09, |
| "advantages/std": 0.3331207036972046, |
| "advantages/var": 0.11096940323172078, |
| "completions/clipped_ratio": -2.578125, |
| "epoch": 0.4813753581661891, |
| "grad_norm": 71.96477285108665, |
| "learning_rate": 1.931419255684618e-06, |
| "loss": -0.9075, |
| "num_tokens": 15727115.0, |
| "residual_var": 0.062420301139354706, |
| "reward": 0.7421875, |
| "reward_std": 0.21713145077228546, |
| "rewards/drgrpo_math_reward/mean": 0.7421875, |
| "rewards/drgrpo_math_reward/std": 0.4382871091365814, |
| "rho2": 0.43749991059303284, |
| "step": 84 |
| }, |
| { |
| "advantages/mean": 9.313225746154785e-10, |
| "advantages/snr": 3.06259193017924e-09, |
| "advantages/std": 0.30409619212150574, |
| "advantages/var": 0.09247449406279973, |
| "completions/clipped_ratio": -2.75, |
| "epoch": 0.4871060171919771, |
| "grad_norm": 47.61110860557143, |
| "learning_rate": 1.929776485888251e-06, |
| "loss": -1.8949, |
| "num_tokens": 15886432.0, |
| "residual_var": 0.057796575129032135, |
| "reward": 0.76953125, |
| "reward_std": 0.17832808196544647, |
| "rewards/drgrpo_math_reward/mean": 0.76953125, |
| "rewards/drgrpo_math_reward/std": 0.4219578504562378, |
| "rho2": 0.37499991059303284, |
| "step": 85 |
| }, |
| { |
| "advantages/mean": 2.3283064365386963e-09, |
| "advantages/snr": 7.819981896283313e-09, |
| "advantages/std": 0.29773807525634766, |
| "advantages/var": 0.08864796145735454, |
| "completions/clipped_ratio": -2.640625, |
| "epoch": 0.49283667621776506, |
| "grad_norm": 48.589294451635624, |
| "learning_rate": 1.928114988519039e-06, |
| "loss": -1.5481, |
| "num_tokens": 16054808.0, |
| "residual_var": 0.0692562386393547, |
| "reward": 0.80859375, |
| "reward_std": 0.1612396389245987, |
| "rewards/drgrpo_math_reward/mean": 0.80859375, |
| "rewards/drgrpo_math_reward/std": 0.39417871832847595, |
| "rho2": 0.21874994039535522, |
| "step": 86 |
| }, |
| { |
| "advantages/mean": 1.1641532182693481e-09, |
| "advantages/snr": 3.72681136151475e-09, |
| "advantages/std": 0.31237244606018066, |
| "advantages/var": 0.09757654505762048, |
| "completions/clipped_ratio": -2.5, |
| "epoch": 0.498567335243553, |
| "grad_norm": 77.61472809779868, |
| "learning_rate": 1.926434797042887e-06, |
| "loss": -1.1408, |
| "num_tokens": 16222572.0, |
| "residual_var": 0.06403461843729019, |
| "reward": 0.76953125, |
| "reward_std": 0.17753173410892487, |
| "rewards/drgrpo_math_reward/mean": 0.76953125, |
| "rewards/drgrpo_math_reward/std": 0.4219578504562378, |
| "rho2": 0.34374991059303284, |
| "step": 87 |
| }, |
| { |
| "advantages/mean": 2.3283064365386963e-10, |
| "advantages/snr": 6.706288730674227e-10, |
| "advantages/std": 0.3471825420856476, |
| "advantages/var": 0.12053571752905246, |
| "completions/clipped_ratio": -2.5625, |
| "epoch": 0.504297994269341, |
| "grad_norm": 58.93647688904723, |
| "learning_rate": 1.9247359453022406e-06, |
| "loss": -0.6762, |
| "num_tokens": 16402078.0, |
| "residual_var": 0.06780136376619339, |
| "reward": 0.76171875, |
| "reward_std": 0.22738362848758698, |
| "rewards/drgrpo_math_reward/mean": 0.76171875, |
| "rewards/drgrpo_math_reward/std": 0.4268665909767151, |
| "rho2": 0.4374999403953552, |
| "step": 88 |
| }, |
| { |
| "advantages/mean": -1.6298145055770874e-09, |
| "advantages/snr": 5.575178862375007e-09, |
| "advantages/std": 0.29233402013778687, |
| "advantages/var": 0.08545917932991998, |
| "completions/clipped_ratio": -2.65625, |
| "epoch": 0.5100286532951289, |
| "grad_norm": 60.11029385266424, |
| "learning_rate": 1.9230184675153973e-06, |
| "loss": -1.8015, |
| "num_tokens": 16564467.0, |
| "residual_var": 0.05341200530529022, |
| "reward": 0.84375, |
| "reward_std": 0.1714957356452942, |
| "rewards/drgrpo_math_reward/mean": 0.84375, |
| "rewards/drgrpo_math_reward/std": 0.3638034462928772, |
| "rho2": 0.37499991059303284, |
| "step": 89 |
| }, |
| { |
| "advantages/mean": 1.1641532182693481e-09, |
| "advantages/snr": 4.7295666564139676e-09, |
| "advantages/std": 0.24614372849464417, |
| "advantages/var": 0.0605867350772451, |
| "completions/clipped_ratio": -2.6875, |
| "epoch": 0.5157593123209169, |
| "grad_norm": 32.80821455872672, |
| "learning_rate": 1.9212823982758223e-06, |
| "loss": -0.388, |
| "num_tokens": 16735491.0, |
| "residual_var": 0.049226731061935425, |
| "reward": 0.74609375, |
| "reward_std": 0.11336605250835419, |
| "rewards/drgrpo_math_reward/mean": 0.74609375, |
| "rewards/drgrpo_math_reward/std": 0.4360972046852112, |
| "rho2": 0.1874999701976776, |
| "step": 90 |
| }, |
| { |
| "advantages/mean": 2.0954757928848267e-09, |
| "advantages/snr": 7.115184749671998e-09, |
| "advantages/std": 0.29450756311416626, |
| "advantages/var": 0.08673470473144462, |
| "completions/clipped_ratio": -2.515625, |
| "epoch": 0.5214899713467048, |
| "grad_norm": 126.43847336761601, |
| "learning_rate": 1.9195277725514506e-06, |
| "loss": -1.8816, |
| "num_tokens": 16921769.0, |
| "residual_var": 0.05149874463677406, |
| "reward": 0.6796875, |
| "reward_std": 0.17912688851356506, |
| "rewards/drgrpo_math_reward/mean": 0.6796875, |
| "rewards/drgrpo_math_reward/std": 0.4675106406211853, |
| "rho2": 0.40624988079071045, |
| "step": 91 |
| }, |
| { |
| "advantages/mean": 0.0, |
| "advantages/snr": 0.0, |
| "advantages/std": 0.2562982141971588, |
| "advantages/var": 0.0656887746006527, |
| "completions/clipped_ratio": -2.625, |
| "epoch": 0.5272206303724928, |
| "grad_norm": 44.145512065978785, |
| "learning_rate": 1.917754625683981e-06, |
| "loss": -1.4745, |
| "num_tokens": 17083984.0, |
| "residual_var": 0.043108273297548294, |
| "reward": 0.80078125, |
| "reward_std": 0.13861849904060364, |
| "rewards/drgrpo_math_reward/mean": 0.80078125, |
| "rewards/drgrpo_math_reward/std": 0.40019527077674866, |
| "rho2": 0.3437499403953552, |
| "step": 92 |
| }, |
| { |
| "advantages/mean": 4.656612873077393e-10, |
| "advantages/snr": 1.7194685024217942e-09, |
| "advantages/std": 0.27081698179244995, |
| "advantages/var": 0.07334183762717217, |
| "completions/clipped_ratio": -2.65625, |
| "epoch": 0.5329512893982808, |
| "grad_norm": 38.3146731634114, |
| "learning_rate": 1.9159629933881667e-06, |
| "loss": -0.3711, |
| "num_tokens": 17245004.0, |
| "residual_var": 0.05042252317070961, |
| "reward": 0.73828125, |
| "reward_std": 0.15243536233901978, |
| "rewards/drgrpo_math_reward/mean": 0.73828125, |
| "rewards/drgrpo_math_reward/std": 0.4404313564300537, |
| "rho2": 0.3124999403953552, |
| "step": 93 |
| }, |
| { |
| "advantages/mean": -2.3283064365386963e-10, |
| "advantages/snr": 7.709835570839548e-10, |
| "advantages/std": 0.30199170112609863, |
| "advantages/var": 0.09119898754903488, |
| "completions/clipped_ratio": -2.609375, |
| "epoch": 0.5386819484240688, |
| "grad_norm": 51.034802200564776, |
| "learning_rate": 1.914152911751093e-06, |
| "loss": -2.5489, |
| "num_tokens": 17418162.0, |
| "residual_var": 0.05984934791922569, |
| "reward": 0.80859375, |
| "reward_std": 0.17715103924274445, |
| "rewards/drgrpo_math_reward/mean": 0.80859375, |
| "rewards/drgrpo_math_reward/std": 0.39417871832847595, |
| "rho2": 0.3437499403953552, |
| "step": 94 |
| }, |
| { |
| "advantages/mean": 3.4924596548080444e-10, |
| "advantages/snr": 1.318584786741314e-09, |
| "advantages/std": 0.2648642361164093, |
| "advantages/var": 0.07015306357352902, |
| "completions/clipped_ratio": -2.328125, |
| "epoch": 0.5444126074498568, |
| "grad_norm": 39.90106380343523, |
| "learning_rate": 1.912324417231454e-06, |
| "loss": -1.5377, |
| "num_tokens": 17609294.0, |
| "residual_var": 0.05042252317070961, |
| "reward": 0.625, |
| "reward_std": 0.14309674501419067, |
| "rewards/drgrpo_math_reward/mean": 0.625, |
| "rewards/drgrpo_math_reward/std": 0.4850712716579437, |
| "rho2": 0.2812499403953552, |
| "step": 95 |
| }, |
| { |
| "advantages/mean": 2.0954757928848267e-09, |
| "advantages/snr": 8.513219981545141e-09, |
| "advantages/std": 0.24614372849464417, |
| "advantages/var": 0.0605867350772451, |
| "completions/clipped_ratio": -2.9375, |
| "epoch": 0.5501432664756447, |
| "grad_norm": 39.1391593416744, |
| "learning_rate": 1.9104775466588157e-06, |
| "loss": -1.0478, |
| "num_tokens": 17758949.0, |
| "residual_var": 0.04354672506451607, |
| "reward": 0.87109375, |
| "reward_std": 0.11993636190891266, |
| "rewards/drgrpo_math_reward/mean": 0.87109375, |
| "rewards/drgrpo_math_reward/std": 0.33575257658958435, |
| "rho2": 0.2812499403953552, |
| "step": 96 |
| }, |
| { |
| "advantages/mean": 2.3283064365386963e-10, |
| "advantages/snr": 7.155814918324964e-10, |
| "advantages/std": 0.3253726363182068, |
| "advantages/var": 0.10586735246466006, |
| "completions/clipped_ratio": -2.75, |
| "epoch": 0.5558739255014327, |
| "grad_norm": 52.65706152505506, |
| "learning_rate": 1.9086123372328743e-06, |
| "loss": -1.9894, |
| "num_tokens": 17931749.0, |
| "residual_var": 0.059550393372774124, |
| "reward": 0.7890625, |
| "reward_std": 0.1990984082221985, |
| "rewards/drgrpo_math_reward/mean": 0.7890625, |
| "rewards/drgrpo_math_reward/std": 0.4087733030319214, |
| "rho2": 0.4374999403953552, |
| "step": 97 |
| }, |
| { |
| "advantages/mean": -9.313225746154785e-10, |
| "advantages/snr": 3.352589773780917e-09, |
| "advantages/std": 0.27779197692871094, |
| "advantages/var": 0.07716838244596147, |
| "completions/clipped_ratio": -2.53125, |
| "epoch": 0.5616045845272206, |
| "grad_norm": 42.36508110920937, |
| "learning_rate": 1.906728826522708e-06, |
| "loss": -0.8938, |
| "num_tokens": 18100959.0, |
| "residual_var": 0.050641756504774094, |
| "reward": 0.69140625, |
| "reward_std": 0.15057817101478577, |
| "rewards/drgrpo_math_reward/mean": 0.69140625, |
| "rewards/drgrpo_math_reward/std": 0.46281787753105164, |
| "rho2": 0.3437499403953552, |
| "step": 98 |
| }, |
| { |
| "advantages/mean": -1.0477378964424133e-09, |
| "advantages/snr": 3.902891199784966e-09, |
| "advantages/std": 0.2684517204761505, |
| "advantages/var": 0.07206632622660525, |
| "completions/clipped_ratio": -2.765625, |
| "epoch": 0.5673352435530086, |
| "grad_norm": 49.233447686472736, |
| "learning_rate": 1.9048270524660196e-06, |
| "loss": -0.2706, |
| "num_tokens": 18256522.0, |
| "residual_var": 0.05404975265264511, |
| "reward": 0.86328125, |
| "reward_std": 0.13888299465179443, |
| "rewards/drgrpo_math_reward/mean": 0.86328125, |
| "rewards/drgrpo_math_reward/std": 0.34422317147254944, |
| "rho2": 0.2499999701976776, |
| "step": 99 |
| }, |
| { |
| "advantages/mean": 2.3283064365386963e-09, |
| "advantages/snr": 8.086143727159208e-09, |
| "advantages/std": 0.28793779015541077, |
| "advantages/var": 0.08290817099958137, |
| "completions/clipped_ratio": -2.75, |
| "epoch": 0.5730659025787965, |
| "grad_norm": 45.12521046013055, |
| "learning_rate": 1.9029070533683722e-06, |
| "loss": -1.2174, |
| "num_tokens": 18421106.0, |
| "residual_var": 0.04922673851251602, |
| "reward": 0.78125, |
| "reward_std": 0.16925784945487976, |
| "rewards/drgrpo_math_reward/mean": 0.78125, |
| "rewards/drgrpo_math_reward/std": 0.41420844197273254, |
| "rho2": 0.40624991059303284, |
| "step": 100 |
| }, |
| { |
| "advantages/mean": -9.313225746154785e-10, |
| "advantages/snr": 3.783653096074668e-09, |
| "advantages/std": 0.24614374339580536, |
| "advantages/var": 0.060586742412900074, |
| "completions/clipped_ratio": -2.5, |
| "epoch": 0.5787965616045845, |
| "grad_norm": 38.66496875469516, |
| "learning_rate": 1.9009688679024189e-06, |
| "loss": -0.7118, |
| "num_tokens": 18589613.0, |
| "residual_var": 0.04544006660580635, |
| "reward": 0.69140625, |
| "reward_std": 0.12046922743320465, |
| "rewards/drgrpo_math_reward/mean": 0.69140625, |
| "rewards/drgrpo_math_reward/std": 0.46281787753105164, |
| "rho2": 0.24999995529651642, |
| "step": 101 |
| }, |
| { |
| "advantages/mean": -1.1641532182693481e-09, |
| "advantages/snr": 4.5421821470515416e-09, |
| "advantages/std": 0.2562982141971588, |
| "advantages/var": 0.0656887746006527, |
| "completions/clipped_ratio": -2.859375, |
| "epoch": 0.5845272206303725, |
| "grad_norm": 60.63497224407915, |
| "learning_rate": 1.8990125351071223e-06, |
| "loss": -1.0793, |
| "num_tokens": 18733932.0, |
| "residual_var": 0.049266595393419266, |
| "reward": 0.90234375, |
| "reward_std": 0.13204818964004517, |
| "rewards/drgrpo_math_reward/mean": 0.90234375, |
| "rewards/drgrpo_math_reward/std": 0.29743078351020813, |
| "rho2": 0.24999995529651642, |
| "step": 102 |
| }, |
| { |
| "advantages/mean": 0.0, |
| "advantages/snr": 0.0, |
| "advantages/std": 0.24222607910633087, |
| "advantages/var": 0.05867347339922646, |
| "completions/clipped_ratio": -2.421875, |
| "epoch": 0.5902578796561605, |
| "grad_norm": 98.73285501942651, |
| "learning_rate": 1.8970380943869686e-06, |
| "loss": -1.4782, |
| "num_tokens": 18904374.0, |
| "residual_var": 0.04217156767845154, |
| "reward": 0.6953125, |
| "reward_std": 0.11822889000177383, |
| "rewards/drgrpo_math_reward/mean": 0.6953125, |
| "rewards/drgrpo_math_reward/std": 0.4611765742301941, |
| "rho2": 0.2812499403953552, |
| "step": 103 |
| }, |
| { |
| "advantages/mean": -1.3969838619232178e-09, |
| "advantages/snr": 4.255210472219743e-09, |
| "advantages/std": 0.3282995820045471, |
| "advantages/var": 0.10778061554436036, |
| "completions/clipped_ratio": -2.546875, |
| "epoch": 0.5959885386819485, |
| "grad_norm": 42.68933358778393, |
| "learning_rate": 1.8950455855111742e-06, |
| "loss": -1.2247, |
| "num_tokens": 19072246.0, |
| "residual_var": 0.07073105126619339, |
| "reward": 0.64453125, |
| "reward_std": 0.20714375376701355, |
| "rewards/drgrpo_math_reward/mean": 0.64453125, |
| "rewards/drgrpo_math_reward/std": 0.4795927405357361, |
| "rho2": 0.3437499403953552, |
| "step": 104 |
| }, |
| { |
| "advantages/mean": -2.3283064365386963e-10, |
| "advantages/snr": 7.630213187892461e-10, |
| "advantages/std": 0.3051430284976959, |
| "advantages/var": 0.09311226784074567, |
| "completions/clipped_ratio": -2.75, |
| "epoch": 0.6017191977077364, |
| "grad_norm": 41.03832126155318, |
| "learning_rate": 1.8930350486128855e-06, |
| "loss": -0.5834, |
| "num_tokens": 19240780.0, |
| "residual_var": 0.058195166289806366, |
| "reward": 0.75, |
| "reward_std": 0.18649454414844513, |
| "rewards/drgrpo_math_reward/mean": 0.75, |
| "rewards/drgrpo_math_reward/std": 0.4338609278202057, |
| "rho2": 0.3749999403953552, |
| "step": 105 |
| }, |
| { |
| "advantages/mean": -8.149072527885437e-10, |
| "advantages/snr": 3.802900033317195e-09, |
| "advantages/std": 0.2142857313156128, |
| "advantages/var": 0.045918374645467, |
| "completions/clipped_ratio": -2.890625, |
| "epoch": 0.6074498567335244, |
| "grad_norm": 36.27068845842204, |
| "learning_rate": 1.8910065241883678e-06, |
| "loss": -1.19, |
| "num_tokens": 19390230.0, |
| "residual_var": 0.035873737186193466, |
| "reward": 0.8984375, |
| "reward_std": 0.09271685779094696, |
| "rewards/drgrpo_math_reward/mean": 0.8984375, |
| "rewards/drgrpo_math_reward/std": 0.3026638329029083, |
| "rho2": 0.2187499701976776, |
| "step": 106 |
| }, |
| { |
| "advantages/mean": -1.6298145055770874e-09, |
| "advantages/snr": 6.840946068269286e-09, |
| "advantages/std": 0.23824401199817657, |
| "advantages/var": 0.056760209252987304, |
| "completions/clipped_ratio": -2.90625, |
| "epoch": 0.6131805157593123, |
| "grad_norm": 34.31275723668857, |
| "learning_rate": 1.8889600530961932e-06, |
| "loss": -0.404, |
| "num_tokens": 19542572.0, |
| "residual_var": 0.04079640656709671, |
| "reward": 0.74609375, |
| "reward_std": 0.11652141809463501, |
| "rewards/drgrpo_math_reward/mean": 0.74609375, |
| "rewards/drgrpo_math_reward/std": 0.4360972046852112, |
| "rho2": 0.2812499403953552, |
| "step": 107 |
| }, |
| { |
| "advantages/mean": -9.313225746154785e-10, |
| "advantages/snr": 2.627483538992809e-09, |
| "advantages/std": 0.3544541895389557, |
| "advantages/var": 0.12563777248171792, |
| "completions/clipped_ratio": -2.625, |
| "epoch": 0.6189111747851003, |
| "grad_norm": 62.78402318125484, |
| "learning_rate": 1.8868956765564148e-06, |
| "loss": -1.6898, |
| "num_tokens": 19717110.0, |
| "residual_var": 0.07852361351251602, |
| "reward": 0.69140625, |
| "reward_std": 0.23831351101398468, |
| "rewards/drgrpo_math_reward/mean": 0.69140625, |
| "rewards/drgrpo_math_reward/std": 0.46281787753105164, |
| "rho2": 0.3749999403953552, |
| "step": 108 |
| }, |
| { |
| "advantages/mean": -2.3283064365386963e-10, |
| "advantages/snr": 7.764322892357765e-10, |
| "advantages/std": 0.29987242817878723, |
| "advantages/var": 0.0899234731818419, |
| "completions/clipped_ratio": -2.640625, |
| "epoch": 0.6246418338108882, |
| "grad_norm": 47.45129141101553, |
| "learning_rate": 1.8848134361497382e-06, |
| "loss": -0.4754, |
| "num_tokens": 19873742.0, |
| "residual_var": 0.061822403222322464, |
| "reward": 0.81640625, |
| "reward_std": 0.17597398161888123, |
| "rewards/drgrpo_math_reward/mean": 0.81640625, |
| "rewards/drgrpo_math_reward/std": 0.387910932302475, |
| "rho2": 0.3124999403953552, |
| "step": 109 |
| }, |
| { |
| "advantages/mean": 1.862645149230957e-09, |
| "advantages/snr": 6.395541197776692e-09, |
| "advantages/std": 0.29124119877815247, |
| "advantages/var": 0.08482143586573532, |
| "completions/clipped_ratio": -2.796875, |
| "epoch": 0.6303724928366762, |
| "grad_norm": 45.06513373893126, |
| "learning_rate": 1.882713373816683e-06, |
| "loss": -0.4579, |
| "num_tokens": 20030959.0, |
| "residual_var": 0.055664073675870895, |
| "reward": 0.82421875, |
| "reward_std": 0.17031623423099518, |
| "rewards/drgrpo_math_reward/mean": 0.82421875, |
| "rewards/drgrpo_math_reward/std": 0.3813795745372772, |
| "rho2": 0.34374991059303284, |
| "step": 110 |
| }, |
| { |
| "advantages/mean": -9.313225746154785e-10, |
| "advantages/snr": 3.06259193017924e-09, |
| "advantages/std": 0.30409619212150574, |
| "advantages/var": 0.09247449406279973, |
| "completions/clipped_ratio": -2.859375, |
| "epoch": 0.6361031518624641, |
| "grad_norm": 46.493358792765605, |
| "learning_rate": 1.8805955318567379e-06, |
| "loss": -2.3481, |
| "num_tokens": 20194007.0, |
| "residual_var": 0.05201692134141922, |
| "reward": 0.86328125, |
| "reward_std": 0.18543127179145813, |
| "rewards/drgrpo_math_reward/mean": 0.86328125, |
| "rewards/drgrpo_math_reward/std": 0.34422317147254944, |
| "rho2": 0.43749991059303284, |
| "step": 111 |
| }, |
| { |
| "advantages/mean": 1.1641532182693481e-09, |
| "advantages/snr": 6.160119582101702e-09, |
| "advantages/std": 0.18898223340511322, |
| "advantages/var": 0.03571428454278469, |
| "completions/clipped_ratio": -2.953125, |
| "epoch": 0.6418338108882522, |
| "grad_norm": 32.10619667409936, |
| "learning_rate": 1.8784599529275099e-06, |
| "loss": -0.5848, |
| "num_tokens": 20331794.0, |
| "residual_var": 0.030133940279483795, |
| "reward": 0.9140625, |
| "reward_std": 0.07536393404006958, |
| "rewards/drgrpo_math_reward/mean": 0.9140625, |
| "rewards/drgrpo_math_reward/std": 0.28082075715065, |
| "rho2": 0.1562499701976776, |
| "step": 112 |
| }, |
| { |
| "advantages/mean": 0.0, |
| "advantages/snr": 0.0, |
| "advantages/std": 0.23555190861225128, |
| "advantages/var": 0.05548470165087438, |
| "completions/clipped_ratio": -2.78125, |
| "epoch": 0.6475644699140402, |
| "grad_norm": 39.33915422125142, |
| "learning_rate": 1.8763066800438634e-06, |
| "loss": -0.9856, |
| "num_tokens": 20487738.0, |
| "residual_var": 0.04161353409290314, |
| "reward": 0.83984375, |
| "reward_std": 0.11534436047077179, |
| "rewards/drgrpo_math_reward/mean": 0.83984375, |
| "rewards/drgrpo_math_reward/std": 0.36746934056282043, |
| "rho2": 0.24999995529651642, |
| "step": 113 |
| }, |
| { |
| "advantages/mean": 0.0, |
| "advantages/snr": 0.0, |
| "advantages/std": 0.2624453604221344, |
| "advantages/var": 0.06887756720710403, |
| "completions/clipped_ratio": -2.5625, |
| "epoch": 0.6532951289398281, |
| "grad_norm": 40.44236478817312, |
| "learning_rate": 1.874135756577056e-06, |
| "loss": -1.4464, |
| "num_tokens": 20659930.0, |
| "residual_var": 0.055963024497032166, |
| "reward": 0.7578125, |
| "reward_std": 0.12190830707550049, |
| "rewards/drgrpo_math_reward/mean": 0.7578125, |
| "rewards/drgrpo_math_reward/std": 0.4292463958263397, |
| "rho2": 0.18749995529651642, |
| "step": 114 |
| }, |
| { |
| "advantages/mean": -4.656612873077393e-10, |
| "advantages/snr": 1.7581130489884187e-09, |
| "advantages/std": 0.2648642361164093, |
| "advantages/var": 0.07015306357352902, |
| "completions/clipped_ratio": -2.875, |
| "epoch": 0.6590257879656161, |
| "grad_norm": 45.149902363692064, |
| "learning_rate": 1.8719472262538622e-06, |
| "loss": -3.3805, |
| "num_tokens": 20803640.0, |
| "residual_var": 0.043845679610967636, |
| "reward": 0.8671875, |
| "reward_std": 0.14966705441474915, |
| "rewards/drgrpo_math_reward/mean": 0.8671875, |
| "rewards/drgrpo_math_reward/std": 0.3400367796421051, |
| "rho2": 0.37499991059303284, |
| "step": 115 |
| }, |
| { |
| "advantages/mean": 9.313225746154785e-10, |
| "advantages/snr": 3.1507420872801353e-09, |
| "advantages/std": 0.29558831453323364, |
| "advantages/var": 0.08737245168859786, |
| "completions/clipped_ratio": -2.78125, |
| "epoch": 0.664756446991404, |
| "grad_norm": 56.38374910767192, |
| "learning_rate": 1.8697411331556953e-06, |
| "loss": -2.1825, |
| "num_tokens": 20956942.0, |
| "residual_var": 0.06279896944761276, |
| "reward": 0.79296875, |
| "reward_std": 0.16663289070129395, |
| "rewards/drgrpo_math_reward/mean": 0.79296875, |
| "rewards/drgrpo_math_reward/std": 0.40597182512283325, |
| "rho2": 0.2812499403953552, |
| "step": 116 |
| }, |
| { |
| "advantages/mean": 2.561137080192566e-09, |
| "advantages/snr": 8.633090704662692e-09, |
| "advantages/std": 0.29666513204574585, |
| "advantages/var": 0.08801020057171982, |
| "completions/clipped_ratio": -2.78125, |
| "epoch": 0.670487106017192, |
| "grad_norm": 272.8870600231476, |
| "learning_rate": 1.8675175217177175e-06, |
| "loss": -1.3051, |
| "num_tokens": 21116292.0, |
| "residual_var": 0.052256081253290176, |
| "reward": 0.7578125, |
| "reward_std": 0.17438271641731262, |
| "rewards/drgrpo_math_reward/mean": 0.7578125, |
| "rewards/drgrpo_math_reward/std": 0.4292463958263397, |
| "rho2": 0.40624991059303284, |
| "step": 117 |
| }, |
| { |
| "advantages/mean": 2.3283064365386963e-09, |
| "advantages/snr": 8.055221268003356e-09, |
| "advantages/std": 0.289043128490448, |
| "advantages/var": 0.08354593012754563, |
| "completions/clipped_ratio": -2.71875, |
| "epoch": 0.6762177650429799, |
| "grad_norm": 49.4537830676488, |
| "learning_rate": 1.8652764367279459e-06, |
| "loss": -1.0499, |
| "num_tokens": 21280178.0, |
| "residual_var": 0.05743783712387085, |
| "reward": 0.79296875, |
| "reward_std": 0.1626850813627243, |
| "rewards/drgrpo_math_reward/mean": 0.79296875, |
| "rewards/drgrpo_math_reward/std": 0.40597182512283325, |
| "rho2": 0.3124999403953552, |
| "step": 118 |
| }, |
| { |
| "advantages/mean": 1.7462298274040222e-09, |
| "advantages/snr": 7.0208312250159456e-09, |
| "advantages/std": 0.24872122704982758, |
| "advantages/var": 0.06186224878517188, |
| "completions/clipped_ratio": -2.71875, |
| "epoch": 0.6819484240687679, |
| "grad_norm": 37.031634565478385, |
| "learning_rate": 1.86301792332635e-06, |
| "loss": -0.4774, |
| "num_tokens": 21426617.0, |
| "residual_var": 0.04446350410580635, |
| "reward": 0.84765625, |
| "reward_std": 0.12810038030147552, |
| "rewards/drgrpo_math_reward/mean": 0.84765625, |
| "rewards/drgrpo_math_reward/std": 0.3600577116012573, |
| "rho2": 0.2812499403953552, |
| "step": 119 |
| }, |
| { |
| "advantages/mean": -2.3283064365386963e-10, |
| "advantages/snr": 1.0865429422188867e-09, |
| "advantages/std": 0.2142857164144516, |
| "advantages/var": 0.04591836825925477, |
| "completions/clipped_ratio": -2.84375, |
| "epoch": 0.6876790830945558, |
| "grad_norm": 57.8559807139232, |
| "learning_rate": 1.8607420270039435e-06, |
| "loss": -0.8314, |
| "num_tokens": 21575376.0, |
| "residual_var": 0.034438785165548325, |
| "reward": 0.875, |
| "reward_std": 0.09863808751106262, |
| "rewards/drgrpo_math_reward/mean": 0.875, |
| "rewards/drgrpo_math_reward/std": 0.33136674761772156, |
| "rho2": 0.24999994039535522, |
| "step": 120 |
| }, |
| { |
| "advantages/mean": -4.656612873077393e-10, |
| "advantages/snr": 1.4577503818942396e-09, |
| "advantages/std": 0.31943827867507935, |
| "advantages/var": 0.10204081388289765, |
| "completions/clipped_ratio": -2.578125, |
| "epoch": 0.6934097421203438, |
| "grad_norm": 47.86157687219279, |
| "learning_rate": 1.858448793601866e-06, |
| "loss": -0.0483, |
| "num_tokens": 21740794.0, |
| "residual_var": 0.06377552449703217, |
| "reward": 0.7265625, |
| "reward_std": 0.18858027458190918, |
| "rewards/drgrpo_math_reward/mean": 0.7265625, |
| "rewards/drgrpo_math_reward/std": 0.446596622467041, |
| "rho2": 0.3749999403953552, |
| "step": 121 |
| }, |
| { |
| "advantages/mean": 1.5133991837501526e-09, |
| "advantages/snr": 5.0290080186388524e-09, |
| "advantages/std": 0.30093392729759216, |
| "advantages/var": 0.09056122859875249, |
| "completions/clipped_ratio": -2.5625, |
| "epoch": 0.6991404011461319, |
| "grad_norm": 72.79748126385374, |
| "learning_rate": 1.8561382693104614e-06, |
| "loss": -1.7789, |
| "num_tokens": 21917447.0, |
| "residual_var": 0.06509089469909668, |
| "reward": 0.71875, |
| "reward_std": 0.17005029320716858, |
| "rewards/drgrpo_math_reward/mean": 0.71875, |
| "rewards/drgrpo_math_reward/std": 0.45048993825912476, |
| "rho2": 0.2812499403953552, |
| "step": 122 |
| }, |
| { |
| "advantages/mean": -5.820766091346741e-10, |
| "advantages/snr": 2.3165174831568894e-09, |
| "advantages/std": 0.2512722611427307, |
| "advantages/var": 0.06313774921978066, |
| "completions/clipped_ratio": -2.75, |
| "epoch": 0.7048710601719198, |
| "grad_norm": 44.88746539099347, |
| "learning_rate": 1.853810500668347e-06, |
| "loss": -0.7919, |
| "num_tokens": 22094461.0, |
| "residual_var": 0.04538027569651604, |
| "reward": 0.78515625, |
| "reward_std": 0.12981030344963074, |
| "rewards/drgrpo_math_reward/mean": 0.78515625, |
| "rewards/drgrpo_math_reward/std": 0.4115184545516968, |
| "rho2": 0.2812499403953552, |
| "step": 123 |
| }, |
| { |
| "advantages/mean": 3.4924596548080444e-10, |
| "advantages/snr": 1.2896013768163457e-09, |
| "advantages/std": 0.27081698179244995, |
| "advantages/var": 0.07334183762717217, |
| "completions/clipped_ratio": -2.53125, |
| "epoch": 0.7106017191977078, |
| "grad_norm": 48.80423262194112, |
| "learning_rate": 1.8514655345614762e-06, |
| "loss": -0.8237, |
| "num_tokens": 22268217.0, |
| "residual_var": 0.05271446332335472, |
| "reward": 0.75390625, |
| "reward_std": 0.14006003737449646, |
| "rewards/drgrpo_math_reward/mean": 0.75390625, |
| "rewards/drgrpo_math_reward/std": 0.43157756328582764, |
| "rho2": 0.2812499403953552, |
| "step": 124 |
| }, |
| { |
| "advantages/mean": 2.3283064365386963e-10, |
| "advantages/snr": 9.040583393524847e-10, |
| "advantages/std": 0.25753939151763916, |
| "advantages/var": 0.06632653818327583, |
| "completions/clipped_ratio": -2.8125, |
| "epoch": 0.7163323782234957, |
| "grad_norm": 46.02844560487019, |
| "learning_rate": 1.8491034182221936e-06, |
| "loss": -0.9501, |
| "num_tokens": 22430925.0, |
| "residual_var": 0.04767220839858055, |
| "reward": 0.7890625, |
| "reward_std": 0.12677361071109772, |
| "rewards/drgrpo_math_reward/mean": 0.7890625, |
| "rewards/drgrpo_math_reward/std": 0.4087733030319214, |
| "rho2": 0.2812499403953552, |
| "step": 125 |
| }, |
| { |
| "advantages/mean": 1.3969838619232178e-09, |
| "advantages/snr": 5.203854933046621e-09, |
| "advantages/std": 0.2684517204761505, |
| "advantages/var": 0.07206632622660525, |
| "completions/clipped_ratio": -2.84375, |
| "epoch": 0.7220630372492837, |
| "grad_norm": 60.67743324938953, |
| "learning_rate": 1.8467241992282841e-06, |
| "loss": -1.0718, |
| "num_tokens": 22583031.0, |
| "residual_var": 0.045041464269161224, |
| "reward": 0.87109375, |
| "reward_std": 0.15782861411571503, |
| "rewards/drgrpo_math_reward/mean": 0.87109375, |
| "rewards/drgrpo_math_reward/std": 0.33575257658958435, |
| "rho2": 0.3749999403953552, |
| "step": 126 |
| }, |
| { |
| "advantages/mean": -8.149072527885437e-10, |
| "advantages/snr": 2.3227503455075525e-09, |
| "advantages/std": 0.35083720088005066, |
| "advantages/var": 0.12308674152134902, |
| "completions/clipped_ratio": -2.65625, |
| "epoch": 0.7277936962750716, |
| "grad_norm": 56.280257142687866, |
| "learning_rate": 1.844327925502015e-06, |
| "loss": -1.0202, |
| "num_tokens": 22755738.0, |
| "residual_var": 0.06154339015483856, |
| "reward": 0.75390625, |
| "reward_std": 0.22973774373531342, |
| "rewards/drgrpo_math_reward/mean": 0.75390625, |
| "rewards/drgrpo_math_reward/std": 0.43157756328582764, |
| "rho2": 0.49999988079071045, |
| "step": 127 |
| }, |
| { |
| "advantages/mean": -1.1641532182693481e-10, |
| "advantages/snr": 4.395282127917263e-10, |
| "advantages/std": 0.2648642659187317, |
| "advantages/var": 0.07015307936066861, |
| "completions/clipped_ratio": -2.671875, |
| "epoch": 0.7335243553008596, |
| "grad_norm": 54.82075946899719, |
| "learning_rate": 1.8419146453091702e-06, |
| "loss": -1.0377, |
| "num_tokens": 22910639.0, |
| "residual_var": 0.052614808082580566, |
| "reward": 0.765625, |
| "reward_std": 0.13072142004966736, |
| "rewards/drgrpo_math_reward/mean": 0.765625, |
| "rewards/drgrpo_math_reward/std": 0.42443734407424927, |
| "rho2": 0.2499999701976776, |
| "step": 128 |
| }, |
| { |
| "advantages/mean": 1.280568540096283e-09, |
| "advantages/snr": 5.741541728836773e-09, |
| "advantages/std": 0.22303564846515656, |
| "advantages/var": 0.04974490048627289, |
| "completions/clipped_ratio": -2.890625, |
| "epoch": 0.7392550143266475, |
| "grad_norm": 38.05363017417547, |
| "learning_rate": 1.8394844072580772e-06, |
| "loss": -0.7448, |
| "num_tokens": 23065211.0, |
| "residual_var": 0.03886321187019348, |
| "reward": 0.8828125, |
| "reward_std": 0.10258589684963226, |
| "rewards/drgrpo_math_reward/mean": 0.8828125, |
| "rewards/drgrpo_math_reward/std": 0.3222736418247223, |
| "rho2": 0.2187499701976776, |
| "step": 129 |
| }, |
| { |
| "advantages/mean": -4.656612873077393e-10, |
| "advantages/snr": 1.774317220522933e-09, |
| "advantages/std": 0.262445330619812, |
| "advantages/var": 0.06887755156414244, |
| "completions/clipped_ratio": -2.921875, |
| "epoch": 0.7449856733524355, |
| "grad_norm": 51.16139356760249, |
| "learning_rate": 1.8370372602986302e-06, |
| "loss": -1.3307, |
| "num_tokens": 23214742.0, |
| "residual_var": 0.049505751579999924, |
| "reward": 0.7890625, |
| "reward_std": 0.1349327266216278, |
| "rewards/drgrpo_math_reward/mean": 0.7890625, |
| "rewards/drgrpo_math_reward/std": 0.4087733030319214, |
| "rho2": 0.2812499403953552, |
| "step": 130 |
| }, |
| { |
| "advantages/mean": 1.1641532182693481e-09, |
| "advantages/snr": 4.656612686812885e-09, |
| "advantages/std": 0.25, |
| "advantages/var": 0.0625, |
| "completions/clipped_ratio": -2.703125, |
| "epoch": 0.7507163323782235, |
| "grad_norm": 93.65015154291969, |
| "learning_rate": 1.8345732537213026e-06, |
| "loss": -0.0264, |
| "num_tokens": 23380824.0, |
| "residual_var": 0.042968764901161194, |
| "reward": 0.75, |
| "reward_std": 0.12927988171577454, |
| "rewards/drgrpo_math_reward/mean": 0.75, |
| "rewards/drgrpo_math_reward/std": 0.4338609278202057, |
| "rho2": 0.3124999403953552, |
| "step": 131 |
| }, |
| { |
| "advantages/mean": 2.444721758365631e-09, |
| "advantages/snr": 7.424683779959137e-09, |
| "advantages/std": 0.3292694687843323, |
| "advantages/var": 0.10841838307351637, |
| "completions/clipped_ratio": -2.828125, |
| "epoch": 0.7564469914040115, |
| "grad_norm": 61.13746296372967, |
| "learning_rate": 1.832092437156154e-06, |
| "loss": -0.4033, |
| "num_tokens": 23542656.0, |
| "residual_var": 0.0643734335899353, |
| "reward": 0.8515625, |
| "reward_std": 0.20133627951145172, |
| "rewards/drgrpo_math_reward/mean": 0.8515625, |
| "rewards/drgrpo_math_reward/std": 0.3562295734882355, |
| "rho2": 0.40624988079071045, |
| "step": 132 |
| }, |
| { |
| "advantages/mean": -3.4924596548080444e-10, |
| "advantages/snr": 1.152452789710391e-09, |
| "advantages/std": 0.30304577946662903, |
| "advantages/var": 0.09183674445253676, |
| "completions/clipped_ratio": -2.78125, |
| "epoch": 0.7621776504297995, |
| "grad_norm": 82.25821857438845, |
| "learning_rate": 1.8295948605718311e-06, |
| "loss": -0.9542, |
| "num_tokens": 23699606.0, |
| "residual_var": 0.05739797279238701, |
| "reward": 0.8515625, |
| "reward_std": 0.1718764305114746, |
| "rewards/drgrpo_math_reward/mean": 0.8515625, |
| "rewards/drgrpo_math_reward/std": 0.3562295734882355, |
| "rho2": 0.3749999403953552, |
| "step": 133 |
| }, |
| { |
| "advantages/mean": -4.656612873077393e-10, |
| "advantages/snr": 1.872221660004252e-09, |
| "advantages/std": 0.24872122704982758, |
| "advantages/var": 0.06186224878517188, |
| "completions/clipped_ratio": -2.65625, |
| "epoch": 0.7679083094555874, |
| "grad_norm": 61.86934388754127, |
| "learning_rate": 1.8270805742745616e-06, |
| "loss": -0.4728, |
| "num_tokens": 23857410.0, |
| "residual_var": 0.04446350410580635, |
| "reward": 0.80859375, |
| "reward_std": 0.12164628505706787, |
| "rewards/drgrpo_math_reward/mean": 0.80859375, |
| "rewards/drgrpo_math_reward/std": 0.39417871832847595, |
| "rho2": 0.2812499403953552, |
| "step": 134 |
| }, |
| { |
| "advantages/mean": -4.656612873077393e-10, |
| "advantages/snr": 1.6426984546755946e-09, |
| "advantages/std": 0.2834733724594116, |
| "advantages/var": 0.08035715289351231, |
| "completions/clipped_ratio": -2.859375, |
| "epoch": 0.7736389684813754, |
| "grad_norm": 71.10537320866999, |
| "learning_rate": 1.8245496289071406e-06, |
| "loss": -0.9753, |
| "num_tokens": 24019095.0, |
| "residual_var": 0.057756710797548294, |
| "reward": 0.8828125, |
| "reward_std": 0.1533464789390564, |
| "rewards/drgrpo_math_reward/mean": 0.8828125, |
| "rewards/drgrpo_math_reward/std": 0.3222736418247223, |
| "rho2": 0.2812499403953552, |
| "step": 135 |
| }, |
| { |
| "advantages/mean": 8.149072527885437e-10, |
| "advantages/snr": 3.0491036849611125e-09, |
| "advantages/std": 0.26726123690605164, |
| "advantages/var": 0.07142856875255266, |
| "completions/clipped_ratio": -2.75, |
| "epoch": 0.7793696275071633, |
| "grad_norm": 62.37237735361797, |
| "learning_rate": 1.82200207544791e-06, |
| "loss": -0.9455, |
| "num_tokens": 24175665.0, |
| "residual_var": 0.049107152968645096, |
| "reward": 0.828125, |
| "reward_std": 0.1442737877368927, |
| "rewards/drgrpo_math_reward/mean": 0.828125, |
| "rewards/drgrpo_math_reward/std": 0.3780108094215393, |
| "rho2": 0.3124999403953552, |
| "step": 136 |
| }, |
| { |
| "advantages/mean": -2.3283064365386963e-10, |
| "advantages/snr": 1.011985032910295e-09, |
| "advantages/std": 0.23007319867610931, |
| "advantages/var": 0.05293367674905647, |
| "completions/clipped_ratio": -2.796875, |
| "epoch": 0.7851002865329513, |
| "grad_norm": 54.84248499224107, |
| "learning_rate": 1.8194379652097318e-06, |
| "loss": -0.6355, |
| "num_tokens": 24321485.0, |
| "residual_var": 0.04135444760322571, |
| "reward": 0.83203125, |
| "reward_std": 0.1060032919049263, |
| "rewards/drgrpo_math_reward/mean": 0.83203125, |
| "rewards/drgrpo_math_reward/std": 0.3745708465576172, |
| "rho2": 0.21874994039535522, |
| "step": 137 |
| }, |
| { |
| "advantages/mean": -6.984919309616089e-10, |
| "advantages/snr": 2.2004213037461316e-09, |
| "advantages/std": 0.31743553280830383, |
| "advantages/var": 0.10076531748929174, |
| "completions/clipped_ratio": -2.671875, |
| "epoch": 0.7908309455587392, |
| "grad_norm": 113.83761802134484, |
| "learning_rate": 1.8168573498389562e-06, |
| "loss": -0.4863, |
| "num_tokens": 24494928.0, |
| "residual_var": 0.05668049678206444, |
| "reward": 0.6875, |
| "reward_std": 0.20042762160301208, |
| "rewards/drgrpo_math_reward/mean": 0.6875, |
| "rewards/drgrpo_math_reward/std": 0.4644203782081604, |
| "rho2": 0.4374999403953552, |
| "step": 138 |
| }, |
| { |
| "advantages/mean": 1.1641532182693481e-09, |
| "advantages/snr": 4.261771990579164e-09, |
| "advantages/std": 0.27316176891326904, |
| "advantages/var": 0.0746173519958262, |
| "completions/clipped_ratio": -2.546875, |
| "epoch": 0.7965616045845272, |
| "grad_norm": 58.86024131658044, |
| "learning_rate": 1.8142602813143784e-06, |
| "loss": -0.3733, |
| "num_tokens": 24664246.0, |
| "residual_var": 0.05596302077174187, |
| "reward": 0.76171875, |
| "reward_std": 0.14058800041675568, |
| "rewards/drgrpo_math_reward/mean": 0.76171875, |
| "rewards/drgrpo_math_reward/std": 0.4268665909767151, |
| "rho2": 0.24999995529651642, |
| "step": 139 |
| }, |
| { |
| "advantages/mean": 8.149072527885437e-10, |
| "advantages/snr": 2.151245094721418e-09, |
| "advantages/std": 0.3788072466850281, |
| "advantages/var": 0.14349493014109171, |
| "completions/clipped_ratio": -2.671875, |
| "epoch": 0.8022922636103151, |
| "grad_norm": 85.49564799929581, |
| "learning_rate": 1.8116468119461951e-06, |
| "loss": -1.2034, |
| "num_tokens": 24833688.0, |
| "residual_var": 0.07174746692180634, |
| "reward": 0.68359375, |
| "reward_std": 0.2695994973182678, |
| "rewards/drgrpo_math_reward/mean": 0.68359375, |
| "rewards/drgrpo_math_reward/std": 0.4659844934940338, |
| "rho2": 0.49999988079071045, |
| "step": 140 |
| }, |
| { |
| "advantages/mean": 3.259629011154175e-09, |
| "advantages/snr": 1.0756226037296983e-08, |
| "advantages/std": 0.30304577946662903, |
| "advantages/var": 0.09183674445253676, |
| "completions/clipped_ratio": -2.609375, |
| "epoch": 0.8080229226361032, |
| "grad_norm": 46.26865155218474, |
| "learning_rate": 1.8090169943749474e-06, |
| "loss": -0.5296, |
| "num_tokens": 24994931.0, |
| "residual_var": 0.06313777714967728, |
| "reward": 0.765625, |
| "reward_std": 0.16477325558662415, |
| "rewards/drgrpo_math_reward/mean": 0.765625, |
| "rewards/drgrpo_math_reward/std": 0.42443734407424927, |
| "rho2": 0.31249991059303284, |
| "step": 141 |
| }, |
| { |
| "advantages/mean": 5.820766091346741e-10, |
| "advantages/snr": 2.5299625822757374e-09, |
| "advantages/std": 0.23007319867610931, |
| "advantages/var": 0.05293367674905647, |
| "completions/clipped_ratio": -2.78125, |
| "epoch": 0.8137535816618912, |
| "grad_norm": 58.97870578609364, |
| "learning_rate": 1.806370881570463e-06, |
| "loss": 0.0156, |
| "num_tokens": 25160661.0, |
| "residual_var": 0.036391910165548325, |
| "reward": 0.80859375, |
| "reward_std": 0.11902769654989243, |
| "rewards/drgrpo_math_reward/mean": 0.80859375, |
| "rewards/drgrpo_math_reward/std": 0.39417871832847595, |
| "rho2": 0.3124999403953552, |
| "step": 142 |
| }, |
| { |
| "advantages/mean": 4.656612873077393e-10, |
| "advantages/snr": 1.7825890840358e-09, |
| "advantages/std": 0.26122748851776123, |
| "advantages/var": 0.06823980075729708, |
| "completions/clipped_ratio": -2.8125, |
| "epoch": 0.8194842406876791, |
| "grad_norm": 79.08516224088073, |
| "learning_rate": 1.8037085268307885e-06, |
| "loss": -0.8249, |
| "num_tokens": 25315126.0, |
| "residual_var": 0.04691487178206444, |
| "reward": 0.89453125, |
| "reward_std": 0.14085638523101807, |
| "rewards/drgrpo_math_reward/mean": 0.89453125, |
| "rewards/drgrpo_math_reward/std": 0.3077581524848938, |
| "rho2": 0.3124999403953552, |
| "step": 143 |
| }, |
| { |
| "advantages/mean": -3.4924596548080444e-10, |
| "advantages/snr": 1.4418181244196808e-09, |
| "advantages/std": 0.24222607910633087, |
| "advantages/var": 0.05867347339922646, |
| "completions/clipped_ratio": -2.734375, |
| "epoch": 0.8252148997134671, |
| "grad_norm": 41.798700207046984, |
| "learning_rate": 1.8010299837811158e-06, |
| "loss": -0.783, |
| "num_tokens": 25478148.0, |
| "residual_var": 0.04400511458516121, |
| "reward": 0.71875, |
| "reward_std": 0.11876175552606583, |
| "rewards/drgrpo_math_reward/mean": 0.71875, |
| "rewards/drgrpo_math_reward/std": 0.45048993825912476, |
| "rho2": 0.2499999701976776, |
| "step": 144 |
| }, |
| { |
| "advantages/mean": -2.3283064365386963e-10, |
| "advantages/snr": 9.509314693234178e-10, |
| "advantages/std": 0.24484480917453766, |
| "advantages/var": 0.05994898057971576, |
| "completions/clipped_ratio": -2.84375, |
| "epoch": 0.830945558739255, |
| "grad_norm": 73.56406215605747, |
| "learning_rate": 1.7983353063727014e-06, |
| "loss": -0.3336, |
| "num_tokens": 25630661.0, |
| "residual_var": 0.04121493920683861, |
| "reward": 0.765625, |
| "reward_std": 0.13231413066387177, |
| "rewards/drgrpo_math_reward/mean": 0.765625, |
| "rewards/drgrpo_math_reward/std": 0.42443734407424927, |
| "rho2": 0.3124999403953552, |
| "step": 145 |
| }, |
| { |
| "advantages/mean": 4.656612873077393e-10, |
| "advantages/snr": 1.7994859396734798e-09, |
| "advantages/std": 0.25877460837364197, |
| "advantages/var": 0.06696429793893177, |
| "completions/clipped_ratio": -2.921875, |
| "epoch": 0.836676217765043, |
| "grad_norm": 49.186265655217355, |
| "learning_rate": 1.795624548881781e-06, |
| "loss": -0.6319, |
| "num_tokens": 25776933.0, |
| "residual_var": 0.04813059791922569, |
| "reward": 0.83203125, |
| "reward_std": 0.12677115201950073, |
| "rewards/drgrpo_math_reward/mean": 0.83203125, |
| "rewards/drgrpo_math_reward/std": 0.3745708465576172, |
| "rho2": 0.28124991059303284, |
| "step": 146 |
| }, |
| { |
| "advantages/mean": -6.984919309616089e-10, |
| "advantages/snr": 2.022616832722391e-09, |
| "advantages/std": 0.34534069895744324, |
| "advantages/var": 0.11926019835641544, |
| "completions/clipped_ratio": -2.75, |
| "epoch": 0.8424068767908309, |
| "grad_norm": 110.13488773783708, |
| "learning_rate": 1.792897765908475e-06, |
| "loss": -1.0988, |
| "num_tokens": 25946196.0, |
| "residual_var": 0.06708388030529022, |
| "reward": 0.76953125, |
| "reward_std": 0.23212778568267822, |
| "rewards/drgrpo_math_reward/mean": 0.76953125, |
| "rewards/drgrpo_math_reward/std": 0.4219578504562378, |
| "rho2": 0.4374999403953552, |
| "step": 147 |
| }, |
| { |
| "advantages/mean": -2.561137080192566e-09, |
| "advantages/snr": 1.0041665435540562e-08, |
| "advantages/std": 0.25505101680755615, |
| "advantages/var": 0.06505102117456829, |
| "completions/clipped_ratio": -2.890625, |
| "epoch": 0.8481375358166189, |
| "grad_norm": 42.935601661756145, |
| "learning_rate": 1.7901550123756903e-06, |
| "loss": -0.9533, |
| "num_tokens": 26099008.0, |
| "residual_var": 0.046755433082580566, |
| "reward": 0.8203125, |
| "reward_std": 0.1250636875629425, |
| "rewards/drgrpo_math_reward/mean": 0.8203125, |
| "rewards/drgrpo_math_reward/std": 0.38467901945114136, |
| "rho2": 0.2812499403953552, |
| "step": 148 |
| }, |
| { |
| "advantages/mean": 1.3969838619232178e-09, |
| "advantages/snr": 4.578128359866298e-09, |
| "advantages/std": 0.30514299869537354, |
| "advantages/var": 0.09311224965280473, |
| "completions/clipped_ratio": -2.890625, |
| "epoch": 0.8538681948424068, |
| "grad_norm": 79.02183185379518, |
| "learning_rate": 1.787396343528012e-06, |
| "loss": -0.5308, |
| "num_tokens": 26245967.0, |
| "residual_var": 0.06401468068361282, |
| "reward": 0.875, |
| "reward_std": 0.172937273979187, |
| "rewards/drgrpo_math_reward/mean": 0.875, |
| "rewards/drgrpo_math_reward/std": 0.33136674761772156, |
| "rho2": 0.3124999403953552, |
| "step": 149 |
| }, |
| { |
| "advantages/mean": 1.6298145055770874e-09, |
| "advantages/snr": 7.71368947201975e-09, |
| "advantages/std": 0.21128857135772705, |
| "advantages/var": 0.044642860386389316, |
| "completions/clipped_ratio": -2.828125, |
| "epoch": 0.8595988538681948, |
| "grad_norm": 33.21391170215605, |
| "learning_rate": 1.7846218149305919e-06, |
| "loss": -0.7593, |
| "num_tokens": 26394941.0, |
| "residual_var": 0.034877244383096695, |
| "reward": 0.8203125, |
| "reward_std": 0.09100693464279175, |
| "rewards/drgrpo_math_reward/mean": 0.8203125, |
| "rewards/drgrpo_math_reward/std": 0.38467901945114136, |
| "rho2": 0.21874995529651642, |
| "step": 150 |
| }, |
| { |
| "advantages/mean": 0.0, |
| "advantages/snr": 0.0, |
| "advantages/std": 0.2187044471502304, |
| "advantages/var": 0.047831635203287926, |
| "completions/clipped_ratio": -2.8125, |
| "epoch": 0.8653295128939829, |
| "grad_norm": 52.69766460617733, |
| "learning_rate": 1.7818314824680298e-06, |
| "loss": -1.2605, |
| "num_tokens": 26563921.0, |
| "residual_var": 0.037368472665548325, |
| "reward": 0.77734375, |
| "reward_std": 0.10087841749191284, |
| "rewards/drgrpo_math_reward/mean": 0.77734375, |
| "rewards/drgrpo_math_reward/std": 0.41684433817863464, |
| "rho2": 0.2187499701976776, |
| "step": 151 |
| }, |
| { |
| "advantages/mean": -3.4924596548080444e-10, |
| "advantages/snr": 1.1688005478205742e-09, |
| "advantages/std": 0.29880714416503906, |
| "advantages/var": 0.08928570940406644, |
| "completions/clipped_ratio": -2.84375, |
| "epoch": 0.8710601719197708, |
| "grad_norm": 86.09764168588923, |
| "learning_rate": 1.7790254023432464e-06, |
| "loss": 0.0834, |
| "num_tokens": 26726648.0, |
| "residual_var": 0.05859377235174179, |
| "reward": 0.7734375, |
| "reward_std": 0.16898946464061737, |
| "rewards/drgrpo_math_reward/mean": 0.7734375, |
| "rewards/drgrpo_math_reward/std": 0.41942715644836426, |
| "rho2": 0.34374991059303284, |
| "step": 152 |
| }, |
| { |
| "advantages/mean": 6.984919309616089e-10, |
| "advantages/snr": 2.6864667746454993e-09, |
| "advantages/std": 0.260003924369812, |
| "advantages/var": 0.06760204068770292, |
| "completions/clipped_ratio": -2.765625, |
| "epoch": 0.8767908309455588, |
| "grad_norm": 36.9976857893275, |
| "learning_rate": 1.776203631076353e-06, |
| "loss": -0.3414, |
| "num_tokens": 26875356.0, |
| "residual_var": 0.044363852590322495, |
| "reward": 0.8203125, |
| "reward_std": 0.1467800736427307, |
| "rewards/drgrpo_math_reward/mean": 0.8203125, |
| "rewards/drgrpo_math_reward/std": 0.38467901945114136, |
| "rho2": 0.3437499403953552, |
| "step": 153 |
| }, |
| { |
| "advantages/mean": 2.3283064365386963e-10, |
| "advantages/snr": 8.086143727159208e-10, |
| "advantages/std": 0.28793779015541077, |
| "advantages/var": 0.08290817099958137, |
| "completions/clipped_ratio": -2.78125, |
| "epoch": 0.8825214899713467, |
| "grad_norm": 53.3898001272296, |
| "learning_rate": 1.7733662255035111e-06, |
| "loss": -0.6895, |
| "num_tokens": 27042351.0, |
| "residual_var": 0.04922674223780632, |
| "reward": 0.734375, |
| "reward_std": 0.16925786435604095, |
| "rewards/drgrpo_math_reward/mean": 0.734375, |
| "rewards/drgrpo_math_reward/std": 0.4425306022167206, |
| "rho2": 0.40624991059303284, |
| "step": 154 |
| }, |
| { |
| "advantages/mean": -1.1641532182693481e-10, |
| "advantages/snr": 4.68055415001063e-10, |
| "advantages/std": 0.24872122704982758, |
| "advantages/var": 0.06186224878517188, |
| "completions/clipped_ratio": -2.90625, |
| "epoch": 0.8882521489971347, |
| "grad_norm": 33.30659799512382, |
| "learning_rate": 1.7705132427757892e-06, |
| "loss": -1.0276, |
| "num_tokens": 27187580.0, |
| "residual_var": 0.04446350410580635, |
| "reward": 0.82421875, |
| "reward_std": 0.12164628505706787, |
| "rewards/drgrpo_math_reward/mean": 0.82421875, |
| "rewards/drgrpo_math_reward/std": 0.3813795745372772, |
| "rho2": 0.2812499403953552, |
| "step": 155 |
| }, |
| { |
| "advantages/mean": 0.0, |
| "advantages/snr": 0.0, |
| "advantages/std": 0.30409619212150574, |
| "advantages/var": 0.09247449406279973, |
| "completions/clipped_ratio": -2.90625, |
| "epoch": 0.8939828080229226, |
| "grad_norm": 53.760508839254754, |
| "learning_rate": 1.767644740358011e-06, |
| "loss": -0.8033, |
| "num_tokens": 27339935.0, |
| "residual_var": 0.05201692134141922, |
| "reward": 0.80859375, |
| "reward_std": 0.18543127179145813, |
| "rewards/drgrpo_math_reward/mean": 0.80859375, |
| "rewards/drgrpo_math_reward/std": 0.39417871832847595, |
| "rho2": 0.43749988079071045, |
| "step": 156 |
| }, |
| { |
| "advantages/mean": -1.1641532182693481e-10, |
| "advantages/snr": 3.5355637047424465e-10, |
| "advantages/std": 0.3292694687843323, |
| "advantages/var": 0.10841838307351637, |
| "completions/clipped_ratio": -2.78125, |
| "epoch": 0.8997134670487106, |
| "grad_norm": 110.32996992700565, |
| "learning_rate": 1.7647607760275985e-06, |
| "loss": -0.5867, |
| "num_tokens": 27499321.0, |
| "residual_var": 0.0643734335899353, |
| "reward": 0.6875, |
| "reward_std": 0.20779038965702057, |
| "rewards/drgrpo_math_reward/mean": 0.6875, |
| "rewards/drgrpo_math_reward/std": 0.4644203782081604, |
| "rho2": 0.40624988079071045, |
| "step": 157 |
| }, |
| { |
| "advantages/mean": -1.1641532182693481e-09, |
| "advantages/snr": 3.6908025963194066e-09, |
| "advantages/std": 0.3154200613498688, |
| "advantages/var": 0.09948981510195498, |
| "completions/clipped_ratio": -2.78125, |
| "epoch": 0.9054441260744985, |
| "grad_norm": 51.3684103060375, |
| "learning_rate": 1.7618614078734067e-06, |
| "loss": -0.1811, |
| "num_tokens": 27664718.0, |
| "residual_var": 0.0652901902794838, |
| "reward": 0.78125, |
| "reward_std": 0.18569329380989075, |
| "rewards/drgrpo_math_reward/mean": 0.78125, |
| "rewards/drgrpo_math_reward/std": 0.41420844197273254, |
| "rho2": 0.3437499403953552, |
| "step": 158 |
| }, |
| { |
| "advantages/mean": -1.1641532182693481e-09, |
| "advantages/snr": 4.173527128124609e-09, |
| "advantages/std": 0.2789374887943268, |
| "advantages/var": 0.07780612265488518, |
| "completions/clipped_ratio": -2.765625, |
| "epoch": 0.9111747851002865, |
| "grad_norm": 41.1640018865697, |
| "learning_rate": 1.7589466942945555e-06, |
| "loss": -0.4552, |
| "num_tokens": 27819289.0, |
| "residual_var": 0.051060281693935394, |
| "reward": 0.7890625, |
| "reward_std": 0.15702980756759644, |
| "rewards/drgrpo_math_reward/mean": 0.7890625, |
| "rewards/drgrpo_math_reward/std": 0.4087733030319214, |
| "rho2": 0.34374991059303284, |
| "step": 159 |
| }, |
| { |
| "advantages/mean": -4.656612873077393e-10, |
| "advantages/snr": 1.6694108512498434e-09, |
| "advantages/std": 0.2789374887943268, |
| "advantages/var": 0.07780612265488518, |
| "completions/clipped_ratio": -2.84375, |
| "epoch": 0.9169054441260746, |
| "grad_norm": 51.077607649603536, |
| "learning_rate": 1.7560166939992527e-06, |
| "loss": -0.892, |
| "num_tokens": 27963753.0, |
| "residual_var": 0.05835460126399994, |
| "reward": 0.828125, |
| "reward_std": 0.13755130767822266, |
| "rewards/drgrpo_math_reward/mean": 0.828125, |
| "rewards/drgrpo_math_reward/std": 0.3780108094215393, |
| "rho2": 0.24999994039535522, |
| "step": 160 |
| }, |
| { |
| "advantages/mean": 1.1641532182693481e-10, |
| "advantages/snr": 3.326844631279246e-10, |
| "advantages/std": 0.3499271273612976, |
| "advantages/var": 0.1224489944633298, |
| "completions/clipped_ratio": -2.890625, |
| "epoch": 0.9226361031518625, |
| "grad_norm": 48.10192863656198, |
| "learning_rate": 1.753071466003611e-06, |
| "loss": -1.1203, |
| "num_tokens": 28122012.0, |
| "residual_var": 0.06887757033109665, |
| "reward": 0.8046875, |
| "reward_std": 0.22263701260089874, |
| "rewards/drgrpo_math_reward/mean": 0.8046875, |
| "rewards/drgrpo_math_reward/std": 0.39721766114234924, |
| "rho2": 0.4374999403953552, |
| "step": 161 |
| }, |
| { |
| "advantages/mean": -1.1641532182693481e-09, |
| "advantages/snr": 4.520292219848232e-09, |
| "advantages/std": 0.2575393617153168, |
| "advantages/var": 0.06632652283273277, |
| "completions/clipped_ratio": -2.859375, |
| "epoch": 0.9283667621776505, |
| "grad_norm": 34.72294908019458, |
| "learning_rate": 1.7501110696304595e-06, |
| "loss": -0.1205, |
| "num_tokens": 28284283.0, |
| "residual_var": 0.045599501579999924, |
| "reward": 0.7734375, |
| "reward_std": 0.13269484043121338, |
| "rewards/drgrpo_math_reward/mean": 0.7734375, |
| "rewards/drgrpo_math_reward/std": 0.41942715644836426, |
| "rho2": 0.3124999403953552, |
| "step": 162 |
| }, |
| { |
| "advantages/mean": -6.984919309616089e-10, |
| "advantages/snr": 3.2372256189616873e-09, |
| "advantages/std": 0.21576867997646332, |
| "advantages/var": 0.04655612325878544, |
| "completions/clipped_ratio": -2.859375, |
| "epoch": 0.9340974212034384, |
| "grad_norm": 55.63584185341244, |
| "learning_rate": 1.7471355645081495e-06, |
| "loss": -0.2935, |
| "num_tokens": 28425609.0, |
| "residual_var": 0.037826862186193466, |
| "reward": 0.83984375, |
| "reward_std": 0.09324727952480316, |
| "rewards/drgrpo_math_reward/mean": 0.83984375, |
| "rewards/drgrpo_math_reward/std": 0.36746934056282043, |
| "rho2": 0.1874999701976776, |
| "step": 163 |
| }, |
| { |
| "advantages/mean": -6.984919309616089e-10, |
| "advantages/snr": 2.6614758307843997e-09, |
| "advantages/std": 0.262445330619812, |
| "advantages/var": 0.06887755156414244, |
| "completions/clipped_ratio": -2.921875, |
| "epoch": 0.9398280802292264, |
| "grad_norm": 47.70746569664927, |
| "learning_rate": 1.7441450105693529e-06, |
| "loss": 0.0682, |
| "num_tokens": 28568538.0, |
| "residual_var": 0.049505751579999924, |
| "reward": 0.8515625, |
| "reward_std": 0.1349327117204666, |
| "rewards/drgrpo_math_reward/mean": 0.8515625, |
| "rewards/drgrpo_math_reward/std": 0.3562295734882355, |
| "rho2": 0.2812499403953552, |
| "step": 164 |
| }, |
| { |
| "advantages/mean": -1.1641532182693481e-09, |
| "advantages/snr": 4.633034966313779e-09, |
| "advantages/std": 0.2512722611427307, |
| "advantages/var": 0.06313774921978066, |
| "completions/clipped_ratio": -2.703125, |
| "epoch": 0.9455587392550143, |
| "grad_norm": 42.28262395990937, |
| "learning_rate": 1.7411394680498548e-06, |
| "loss": -0.5688, |
| "num_tokens": 28720699.0, |
| "residual_var": 0.04932638630270958, |
| "reward": 0.78515625, |
| "reward_std": 0.12270711362361908, |
| "rewards/drgrpo_math_reward/mean": 0.78515625, |
| "rewards/drgrpo_math_reward/std": 0.4115184545516968, |
| "rho2": 0.21874995529651642, |
| "step": 165 |
| }, |
| { |
| "advantages/mean": -1.1641532182693481e-10, |
| "advantages/snr": 4.208162363484517e-10, |
| "advantages/std": 0.27664169669151306, |
| "advantages/var": 0.07653062834835911, |
| "completions/clipped_ratio": -2.921875, |
| "epoch": 0.9512893982808023, |
| "grad_norm": 41.67501812101098, |
| "learning_rate": 1.7381189974873407e-06, |
| "loss": -0.7094, |
| "num_tokens": 28860305.0, |
| "residual_var": 0.052614811807870865, |
| "reward": 0.8828125, |
| "reward_std": 0.14939865469932556, |
| "rewards/drgrpo_math_reward/mean": 0.8828125, |
| "rewards/drgrpo_math_reward/std": 0.3222736418247223, |
| "rho2": 0.3124999403953552, |
| "step": 166 |
| }, |
| { |
| "advantages/mean": -9.313225746154785e-10, |
| "advantages/snr": 5.322951711271804e-09, |
| "advantages/std": 0.1749635487794876, |
| "advantages/var": 0.03061224340151214, |
| "completions/clipped_ratio": -2.953125, |
| "epoch": 0.9570200573065902, |
| "grad_norm": 29.567632786804115, |
| "learning_rate": 1.7350836597201765e-06, |
| "loss": 0.1182, |
| "num_tokens": 28991869.0, |
| "residual_var": 0.025829091668128967, |
| "reward": 0.9609375, |
| "reward_std": 0.06378497928380966, |
| "rewards/drgrpo_math_reward/mean": 0.9609375, |
| "rewards/drgrpo_math_reward/std": 0.19412322342395782, |
| "rho2": 0.1562499701976776, |
| "step": 167 |
| }, |
| { |
| "advantages/mean": 1.1641532182693481e-09, |
| "advantages/snr": 3.407670541785436e-09, |
| "advantages/std": 0.3416272699832916, |
| "advantages/var": 0.11670919159623683, |
| "completions/clipped_ratio": -2.625, |
| "epoch": 0.9627507163323782, |
| "grad_norm": 58.85091702310497, |
| "learning_rate": 1.7320335158861852e-06, |
| "loss": -1.8019, |
| "num_tokens": 29158280.0, |
| "residual_var": 0.06564892828464508, |
| "reward": 0.75390625, |
| "reward_std": 0.21751460433006287, |
| "rewards/drgrpo_math_reward/mean": 0.75390625, |
| "rewards/drgrpo_math_reward/std": 0.43157756328582764, |
| "rho2": 0.4374999403953552, |
| "step": 168 |
| }, |
| { |
| "advantages/mean": -9.313225746154785e-10, |
| "advantages/snr": 4.175666432902228e-09, |
| "advantages/std": 0.22303566336631775, |
| "advantages/var": 0.04974490713325341, |
| "completions/clipped_ratio": -2.453125, |
| "epoch": 0.9684813753581661, |
| "grad_norm": 39.61307892613523, |
| "learning_rate": 1.7289686274214115e-06, |
| "loss": -0.2184, |
| "num_tokens": 29321178.0, |
| "residual_var": 0.04041774198412895, |
| "reward": 0.71875, |
| "reward_std": 0.0966646671295166, |
| "rewards/drgrpo_math_reward/mean": 0.71875, |
| "rewards/drgrpo_math_reward/std": 0.45048993825912476, |
| "rho2": 0.1874999701976776, |
| "step": 169 |
| }, |
| { |
| "advantages/mean": -6.984919309616089e-10, |
| "advantages/snr": 3.5707444382625662e-09, |
| "advantages/std": 0.19561520218849182, |
| "advantages/var": 0.038265307327244535, |
| "completions/clipped_ratio": -2.6875, |
| "epoch": 0.9742120343839542, |
| "grad_norm": 38.590165781283964, |
| "learning_rate": 1.7258890560588887e-06, |
| "loss": -0.2002, |
| "num_tokens": 29473074.0, |
| "residual_var": 0.031090570613741875, |
| "reward": 0.7265625, |
| "reward_std": 0.07825091481208801, |
| "rewards/drgrpo_math_reward/mean": 0.7265625, |
| "rewards/drgrpo_math_reward/std": 0.446596622467041, |
| "rho2": 0.1874999701976776, |
| "step": 170 |
| }, |
| { |
| "advantages/mean": -1.3969838619232178e-09, |
| "advantages/snr": 6.758136794548967e-09, |
| "advantages/std": 0.20671138167381287, |
| "advantages/var": 0.04272959531349674, |
| "completions/clipped_ratio": -2.9375, |
| "epoch": 0.9799426934097422, |
| "grad_norm": 30.278598266227252, |
| "learning_rate": 1.7227948638273915e-06, |
| "loss": -0.2199, |
| "num_tokens": 29611466.0, |
| "residual_var": 0.03338250517845154, |
| "reward": 0.91796875, |
| "reward_std": 0.08929946273565292, |
| "rewards/drgrpo_math_reward/mean": 0.91796875, |
| "rewards/drgrpo_math_reward/std": 0.2749498784542084, |
| "rho2": 0.21874995529651642, |
| "step": 171 |
| }, |
| { |
| "advantages/mean": 1.979060471057892e-09, |
| "advantages/snr": 6.377399533485238e-09, |
| "advantages/std": 0.3103240430355072, |
| "advantages/var": 0.09630101168590333, |
| "completions/clipped_ratio": -2.84375, |
| "epoch": 0.9856733524355301, |
| "grad_norm": 46.96748798734129, |
| "learning_rate": 1.7196861130501902e-06, |
| "loss": -0.2326, |
| "num_tokens": 29761366.0, |
| "residual_var": 0.06319756805896759, |
| "reward": 0.76953125, |
| "reward_std": 0.1822758913040161, |
| "rewards/drgrpo_math_reward/mean": 0.76953125, |
| "rewards/drgrpo_math_reward/std": 0.4219578504562378, |
| "rho2": 0.34374991059303284, |
| "step": 172 |
| }, |
| { |
| "advantages/mean": 1.0477378964424133e-09, |
| "advantages/snr": 4.638532964616805e-09, |
| "advantages/std": 0.22587698698043823, |
| "advantages/var": 0.05102041324736106, |
| "completions/clipped_ratio": -2.609375, |
| "epoch": 0.9914040114613181, |
| "grad_norm": 82.39543338312782, |
| "learning_rate": 1.716562866343792e-06, |
| "loss": -0.063, |
| "num_tokens": 29936989.0, |
| "residual_var": 0.03985970467329025, |
| "reward": 0.6171875, |
| "reward_std": 0.09784172475337982, |
| "rewards/drgrpo_math_reward/mean": 0.6171875, |
| "rewards/drgrpo_math_reward/std": 0.48702529072761536, |
| "rho2": 0.21874995529651642, |
| "step": 173 |
| }, |
| { |
| "advantages/mean": 4.656612873077393e-10, |
| "advantages/snr": 3.116801356546741e-09, |
| "advantages/std": 0.14940357208251953, |
| "advantages/var": 0.02232142735101661, |
| "completions/clipped_ratio": -2.78125, |
| "epoch": 0.997134670487106, |
| "grad_norm": 25.91850427167227, |
| "learning_rate": 1.7134251866166828e-06, |
| "loss": -0.1145, |
| "num_tokens": 30091098.0, |
| "residual_var": 0.018833715468645096, |
| "reward": 0.80078125, |
| "reward_std": 0.055242717266082764, |
| "rewards/drgrpo_math_reward/mean": 0.80078125, |
| "rewards/drgrpo_math_reward/std": 0.40019527077674866, |
| "rho2": 0.1562499701976776, |
| "step": 174 |
| }, |
| { |
| "advantages/mean": -6.984919309616089e-10, |
| "advantages/snr": 2.4447215635128062e-09, |
| "advantages/std": 0.2857142984867096, |
| "advantages/var": 0.08163266035975258, |
| "completions/clipped_ratio": -2.828125, |
| "epoch": 1.005730659025788, |
| "grad_norm": 52.003556279462764, |
| "learning_rate": 1.710273137068057e-06, |
| "loss": -0.5332, |
| "num_tokens": 30238663.0, |
| "residual_var": 0.05612246319651604, |
| "reward": 0.8125, |
| "reward_std": 0.15452352166175842, |
| "rewards/drgrpo_math_reward/mean": 0.8125, |
| "rewards/drgrpo_math_reward/std": 0.3910769522190094, |
| "rho2": 0.3124999403953552, |
| "step": 175 |
| }, |
| { |
| "advantages/mean": -3.4924596548080444e-10, |
| "advantages/snr": 1.3626546441154624e-09, |
| "advantages/std": 0.2562982141971588, |
| "advantages/var": 0.0656887746006527, |
| "completions/clipped_ratio": -2.9375, |
| "epoch": 1.011461318051576, |
| "grad_norm": 49.08232252807409, |
| "learning_rate": 1.7071067811865474e-06, |
| "loss": 0.2621, |
| "num_tokens": 30386461.0, |
| "residual_var": 0.05337214469909668, |
| "reward": 0.79296875, |
| "reward_std": 0.11849091202020645, |
| "rewards/drgrpo_math_reward/mean": 0.79296875, |
| "rewards/drgrpo_math_reward/std": 0.40597182512283325, |
| "rho2": 0.18749994039535522, |
| "step": 176 |
| }, |
| { |
| "advantages/mean": -2.3283064365386963e-09, |
| "advantages/snr": 8.181092293916676e-09, |
| "advantages/std": 0.2845960259437561, |
| "advantages/var": 0.0809948979829791, |
| "completions/clipped_ratio": -2.515625, |
| "epoch": 1.0171919770773639, |
| "grad_norm": 43.315763175106994, |
| "learning_rate": 1.7039261827489448e-06, |
| "loss": -0.7983, |
| "num_tokens": 30540223.0, |
| "residual_var": 0.05821509286761284, |
| "reward": 0.72265625, |
| "reward_std": 0.1533440202474594, |
| "rewards/drgrpo_math_reward/mean": 0.72265625, |
| "rewards/drgrpo_math_reward/std": 0.4485645890235901, |
| "rho2": 0.2812499403953552, |
| "step": 177 |
| }, |
| { |
| "advantages/mean": -1.6298145055770874e-09, |
| "advantages/snr": 5.638654887602349e-09, |
| "advantages/std": 0.289043128490448, |
| "advantages/var": 0.08354593012754563, |
| "completions/clipped_ratio": -2.828125, |
| "epoch": 1.0229226361031518, |
| "grad_norm": 47.223138020041155, |
| "learning_rate": 1.7007314058189138e-06, |
| "loss": -0.4208, |
| "num_tokens": 30701218.0, |
| "residual_var": 0.06004864722490311, |
| "reward": 0.74609375, |
| "reward_std": 0.15030977129936218, |
| "rewards/drgrpo_math_reward/mean": 0.74609375, |
| "rewards/drgrpo_math_reward/std": 0.4360972046852112, |
| "rho2": 0.2812499403953552, |
| "step": 178 |
| }, |
| { |
| "advantages/mean": -1.1641532182693481e-09, |
| "advantages/snr": 3.997213248610433e-09, |
| "advantages/std": 0.29124119877815247, |
| "advantages/var": 0.08482143586573532, |
| "completions/clipped_ratio": -2.953125, |
| "epoch": 1.0286532951289398, |
| "grad_norm": 50.17432938470871, |
| "learning_rate": 1.6975225147457024e-06, |
| "loss": -0.3989, |
| "num_tokens": 30849451.0, |
| "residual_var": 0.06361608952283859, |
| "reward": 0.82421875, |
| "reward_std": 0.15729182958602905, |
| "rewards/drgrpo_math_reward/mean": 0.82421875, |
| "rewards/drgrpo_math_reward/std": 0.3813795745372772, |
| "rho2": 0.24999994039535522, |
| "step": 179 |
| }, |
| { |
| "advantages/mean": -6.984919309616089e-10, |
| "advantages/snr": 3.017830079630706e-09, |
| "advantages/std": 0.2314550131559372, |
| "advantages/var": 0.05357142311501506, |
| "completions/clipped_ratio": -2.96875, |
| "epoch": 1.0343839541547277, |
| "grad_norm": 37.273134893984825, |
| "learning_rate": 1.6942995741628456e-06, |
| "loss": -0.4196, |
| "num_tokens": 30982416.0, |
| "residual_var": 0.043526798486709595, |
| "reward": 0.8984375, |
| "reward_std": 0.10007961094379425, |
| "rewards/drgrpo_math_reward/mean": 0.8984375, |
| "rewards/drgrpo_math_reward/std": 0.3026638329029083, |
| "rho2": 0.18749995529651642, |
| "step": 180 |
| }, |
| { |
| "advantages/mean": -1.5133991837501526e-09, |
| "advantages/snr": 5.8483293039388095e-09, |
| "advantages/std": 0.25877460837364197, |
| "advantages/var": 0.06696429793893177, |
| "completions/clipped_ratio": -2.84375, |
| "epoch": 1.0401146131805157, |
| "grad_norm": 61.99506107492152, |
| "learning_rate": 1.6910626489868648e-06, |
| "loss": -0.4872, |
| "num_tokens": 31136161.0, |
| "residual_var": 0.05022323131561279, |
| "reward": 0.76171875, |
| "reward_std": 0.12730401754379272, |
| "rewards/drgrpo_math_reward/mean": 0.76171875, |
| "rewards/drgrpo_math_reward/std": 0.4268665909767151, |
| "rho2": 0.2499999701976776, |
| "step": 181 |
| }, |
| { |
| "advantages/mean": 2.3283064365386963e-09, |
| "advantages/snr": 7.819981896283313e-09, |
| "advantages/std": 0.29773807525634766, |
| "advantages/var": 0.08864796145735454, |
| "completions/clipped_ratio": -2.65625, |
| "epoch": 1.0458452722063036, |
| "grad_norm": 113.48047316941651, |
| "learning_rate": 1.6878118044159578e-06, |
| "loss": -0.2918, |
| "num_tokens": 31287067.0, |
| "residual_var": 0.06371574103832245, |
| "reward": 0.77734375, |
| "reward_std": 0.15543463826179504, |
| "rewards/drgrpo_math_reward/mean": 0.77734375, |
| "rewards/drgrpo_math_reward/std": 0.41684433817863464, |
| "rho2": 0.2812499403953552, |
| "step": 182 |
| }, |
| { |
| "advantages/mean": -8.149072527885437e-10, |
| "advantages/snr": 3.062807733827676e-09, |
| "advantages/std": 0.26606541872024536, |
| "advantages/var": 0.07079080703877949, |
| "completions/clipped_ratio": -2.75, |
| "epoch": 1.0515759312320916, |
| "grad_norm": 40.02864895138621, |
| "learning_rate": 1.6845471059286886e-06, |
| "loss": -0.1182, |
| "num_tokens": 31439199.0, |
| "residual_var": 0.05309312418103218, |
| "reward": 0.77734375, |
| "reward_std": 0.13717305660247803, |
| "rewards/drgrpo_math_reward/mean": 0.77734375, |
| "rewards/drgrpo_math_reward/std": 0.41684433817863464, |
| "rho2": 0.24999995529651642, |
| "step": 183 |
| }, |
| { |
| "advantages/mean": 1.280568540096283e-09, |
| "advantages/snr": 4.856938081027663e-09, |
| "advantages/std": 0.2636575698852539, |
| "advantages/var": 0.06951531415779755, |
| "completions/clipped_ratio": -2.84375, |
| "epoch": 1.0573065902578798, |
| "grad_norm": 40.549121661009465, |
| "learning_rate": 1.6812686192826662e-06, |
| "loss": -0.7632, |
| "num_tokens": 31597985.0, |
| "residual_var": 0.04779178649187088, |
| "reward": 0.91015625, |
| "reward_std": 0.14256632328033447, |
| "rewards/drgrpo_math_reward/mean": 0.91015625, |
| "rewards/drgrpo_math_reward/std": 0.2865179479122162, |
| "rho2": 0.3124999403953552, |
| "step": 184 |
| }, |
| { |
| "advantages/mean": 1.1641532182693481e-10, |
| "advantages/snr": 5.186443199129546e-10, |
| "advantages/std": 0.22446079552173615, |
| "advantages/var": 0.050382648726250645, |
| "completions/clipped_ratio": -2.78125, |
| "epoch": 1.0630372492836677, |
| "grad_norm": 39.5470013856504, |
| "learning_rate": 1.677976410513221e-06, |
| "loss": -0.3753, |
| "num_tokens": 31745502.0, |
| "residual_var": 0.04093591496348381, |
| "reward": 0.83203125, |
| "reward_std": 0.10311630368232727, |
| "rewards/drgrpo_math_reward/mean": 0.83203125, |
| "rewards/drgrpo_math_reward/std": 0.3745708465576172, |
| "rho2": 0.1874999701976776, |
| "step": 185 |
| }, |
| { |
| "advantages/mean": 2.3283064365386963e-10, |
| "advantages/snr": 8.560204468385481e-10, |
| "advantages/std": 0.27199190855026245, |
| "advantages/var": 0.07397959831681433, |
| "completions/clipped_ratio": -2.828125, |
| "epoch": 1.0687679083094557, |
| "grad_norm": 44.057693695852585, |
| "learning_rate": 1.6746705459320744e-06, |
| "loss": 0.0624, |
| "num_tokens": 31905760.0, |
| "residual_var": 0.053172849118709564, |
| "reward": 0.796875, |
| "reward_std": 0.14005759358406067, |
| "rewards/drgrpo_math_reward/mean": 0.796875, |
| "rewards/drgrpo_math_reward/std": 0.40311288833618164, |
| "rho2": 0.2812499403953552, |
| "step": 186 |
| }, |
| { |
| "advantages/mean": -3.4924596548080444e-10, |
| "advantages/snr": 1.5559327531530887e-09, |
| "advantages/std": 0.22446082532405853, |
| "advantages/var": 0.050382662105157516, |
| "completions/clipped_ratio": -2.859375, |
| "epoch": 1.0744985673352436, |
| "grad_norm": 34.399246117778546, |
| "learning_rate": 1.6713510921260038e-06, |
| "loss": -0.4038, |
| "num_tokens": 32059438.0, |
| "residual_var": 0.037786997854709625, |
| "reward": 0.89453125, |
| "reward_std": 0.10376540571451187, |
| "rewards/drgrpo_math_reward/mean": 0.89453125, |
| "rewards/drgrpo_math_reward/std": 0.3077581524848938, |
| "rho2": 0.2499999701976776, |
| "step": 187 |
| }, |
| { |
| "advantages/mean": 2.3283064365386963e-09, |
| "advantages/snr": 8.024651453540351e-09, |
| "advantages/std": 0.2901442348957062, |
| "advantages/var": 0.08418367704321472, |
| "completions/clipped_ratio": -2.8125, |
| "epoch": 1.0802292263610316, |
| "grad_norm": 75.92610469530256, |
| "learning_rate": 1.6680181159555011e-06, |
| "loss": -0.3991, |
| "num_tokens": 32202032.0, |
| "residual_var": 0.06050703302025795, |
| "reward": 0.8046875, |
| "reward_std": 0.15676140785217285, |
| "rewards/drgrpo_math_reward/mean": 0.8046875, |
| "rewards/drgrpo_math_reward/std": 0.39721766114234924, |
| "rho2": 0.28124991059303284, |
| "step": 188 |
| }, |
| { |
| "advantages/mean": 4.656612873077393e-10, |
| "advantages/snr": 3.116801356546741e-09, |
| "advantages/std": 0.14940357208251953, |
| "advantages/var": 0.02232142735101661, |
| "completions/clipped_ratio": -2.96875, |
| "epoch": 1.0859598853868195, |
| "grad_norm": 29.435908660503742, |
| "learning_rate": 1.6646716845534256e-06, |
| "loss": -0.0744, |
| "num_tokens": 32334668.0, |
| "residual_var": 0.018833715468645096, |
| "reward": 0.86328125, |
| "reward_std": 0.055242717266082764, |
| "rewards/drgrpo_math_reward/mean": 0.86328125, |
| "rewards/drgrpo_math_reward/std": 0.34422317147254944, |
| "rho2": 0.1562499701976776, |
| "step": 189 |
| }, |
| { |
| "advantages/mean": 2.3283064365386963e-10, |
| "advantages/snr": 8.790565244942093e-10, |
| "advantages/std": 0.2648642361164093, |
| "advantages/var": 0.07015306357352902, |
| "completions/clipped_ratio": -2.859375, |
| "epoch": 1.0916905444126075, |
| "grad_norm": 89.04585523798126, |
| "learning_rate": 1.6613118653236517e-06, |
| "loss": -0.3858, |
| "num_tokens": 32483123.0, |
| "residual_var": 0.043845679610967636, |
| "reward": 0.75, |
| "reward_std": 0.14966705441474915, |
| "rewards/drgrpo_math_reward/mean": 0.75, |
| "rewards/drgrpo_math_reward/std": 0.4338609278202057, |
| "rho2": 0.3749999403953552, |
| "step": 190 |
| }, |
| { |
| "advantages/mean": 1.3969838619232178e-09, |
| "advantages/snr": 5.298477906575633e-09, |
| "advantages/std": 0.2636575698852539, |
| "advantages/var": 0.06951531415779755, |
| "completions/clipped_ratio": -2.953125, |
| "epoch": 1.0974212034383954, |
| "grad_norm": 45.88711988000871, |
| "learning_rate": 1.6579387259397126e-06, |
| "loss": 0.4409, |
| "num_tokens": 32628123.0, |
| "residual_var": 0.04779178649187088, |
| "reward": 0.77734375, |
| "reward_std": 0.13611222803592682, |
| "rewards/drgrpo_math_reward/mean": 0.77734375, |
| "rewards/drgrpo_math_reward/std": 0.41684433817863464, |
| "rho2": 0.3124999403953552, |
| "step": 191 |
| }, |
| { |
| "advantages/mean": -5.820766091346741e-10, |
| "advantages/snr": 2.3773286733085444e-09, |
| "advantages/std": 0.24484480917453766, |
| "advantages/var": 0.05994898057971576, |
| "completions/clipped_ratio": -2.796875, |
| "epoch": 1.1031518624641834, |
| "grad_norm": 40.04913866471509, |
| "learning_rate": 1.6545523343434353e-06, |
| "loss": -1.5379, |
| "num_tokens": 32774588.0, |
| "residual_var": 0.046835146844387054, |
| "reward": 0.828125, |
| "reward_std": 0.11928971856832504, |
| "rewards/drgrpo_math_reward/mean": 0.828125, |
| "rewards/drgrpo_math_reward/std": 0.3780108094215393, |
| "rho2": 0.2187499701976776, |
| "step": 192 |
| }, |
| { |
| "advantages/mean": -4.656612873077393e-10, |
| "advantages/snr": 1.8439244876346257e-09, |
| "advantages/std": 0.25253814458847046, |
| "advantages/var": 0.06377551447218721, |
| "completions/clipped_ratio": -2.84375, |
| "epoch": 1.1088825214899714, |
| "grad_norm": 49.81062329564691, |
| "learning_rate": 1.6511527587435735e-06, |
| "loss": -0.4636, |
| "num_tokens": 32922532.0, |
| "residual_var": 0.047831643372774124, |
| "reward": 0.8828125, |
| "reward_std": 0.12388662993907928, |
| "rewards/drgrpo_math_reward/mean": 0.8828125, |
| "rewards/drgrpo_math_reward/std": 0.3222736418247223, |
| "rho2": 0.2499999701976776, |
| "step": 193 |
| }, |
| { |
| "advantages/mean": 9.313225746154785e-10, |
| "advantages/snr": 3.259628751350408e-09, |
| "advantages/std": 0.2857142984867096, |
| "advantages/var": 0.08163266035975258, |
| "completions/clipped_ratio": -2.875, |
| "epoch": 1.1146131805157593, |
| "grad_norm": 45.511791262701145, |
| "learning_rate": 1.6477400676144333e-06, |
| "loss": 0.2143, |
| "num_tokens": 33077704.0, |
| "residual_var": 0.05612245947122574, |
| "reward": 0.8125, |
| "reward_std": 0.14806942641735077, |
| "rewards/drgrpo_math_reward/mean": 0.8125, |
| "rewards/drgrpo_math_reward/std": 0.3910769522190094, |
| "rho2": 0.3124999403953552, |
| "step": 194 |
| }, |
| { |
| "advantages/mean": 4.656612873077393e-10, |
| "advantages/snr": 2.1291806000454375e-09, |
| "advantages/std": 0.2187044471502304, |
| "advantages/var": 0.047831635203287926, |
| "completions/clipped_ratio": -2.859375, |
| "epoch": 1.1203438395415473, |
| "grad_norm": 39.835124741875426, |
| "learning_rate": 1.6443143296944945e-06, |
| "loss": -1.345, |
| "num_tokens": 33220605.0, |
| "residual_var": 0.03736847639083862, |
| "reward": 0.81640625, |
| "reward_std": 0.10087842494249344, |
| "rewards/drgrpo_math_reward/mean": 0.81640625, |
| "rewards/drgrpo_math_reward/std": 0.387910932302475, |
| "rho2": 0.21874995529651642, |
| "step": 195 |
| }, |
| { |
| "advantages/mean": -1.0477378964424133e-09, |
| "advantages/snr": 4.12824047321452e-09, |
| "advantages/std": 0.2537976801395416, |
| "advantages/var": 0.06441326244421308, |
| "completions/clipped_ratio": -2.890625, |
| "epoch": 1.1260744985673352, |
| "grad_norm": 36.741125549960515, |
| "learning_rate": 1.640875613985024e-06, |
| "loss": -0.4177, |
| "num_tokens": 33372791.0, |
| "residual_var": 0.044284138828516006, |
| "reward": 0.80859375, |
| "reward_std": 0.1374414563179016, |
| "rewards/drgrpo_math_reward/mean": 0.80859375, |
| "rewards/drgrpo_math_reward/std": 0.39417871832847595, |
| "rho2": 0.3124999403953552, |
| "step": 196 |
| }, |
| { |
| "advantages/mean": -1.280568540096283e-09, |
| "advantages/snr": 4.4473790499375645e-09, |
| "advantages/std": 0.28793779015541077, |
| "advantages/var": 0.08290817099958137, |
| "completions/clipped_ratio": -2.84375, |
| "epoch": 1.1318051575931232, |
| "grad_norm": 47.67398766795382, |
| "learning_rate": 1.6374239897486897e-06, |
| "loss": -0.1512, |
| "num_tokens": 33524940.0, |
| "residual_var": 0.06218114122748375, |
| "reward": 0.765625, |
| "reward_std": 0.15558436512947083, |
| "rewards/drgrpo_math_reward/mean": 0.765625, |
| "rewards/drgrpo_math_reward/std": 0.42443734407424927, |
| "rho2": 0.24999994039535522, |
| "step": 197 |
| }, |
| { |
| "advantages/mean": -9.313225746154785e-10, |
| "advantages/snr": 3.548634441045866e-09, |
| "advantages/std": 0.262445330619812, |
| "advantages/var": 0.06887755156414244, |
| "completions/clipped_ratio": -2.703125, |
| "epoch": 1.1375358166189111, |
| "grad_norm": 48.57034087855925, |
| "learning_rate": 1.6339595265081616e-06, |
| "loss": -0.6361, |
| "num_tokens": 33686111.0, |
| "residual_var": 0.055963024497032166, |
| "reward": 0.7890625, |
| "reward_std": 0.12836240231990814, |
| "rewards/drgrpo_math_reward/mean": 0.7890625, |
| "rewards/drgrpo_math_reward/std": 0.4087733030319214, |
| "rho2": 0.18749995529651642, |
| "step": 198 |
| }, |
| { |
| "advantages/mean": 1.1641532182693481e-09, |
| "advantages/snr": 6.653689639089756e-09, |
| "advantages/std": 0.1749635487794876, |
| "advantages/var": 0.03061224340151214, |
| "completions/clipped_ratio": -2.8125, |
| "epoch": 1.143266475644699, |
| "grad_norm": 25.524074525646057, |
| "learning_rate": 1.6304822940447136e-06, |
| "loss": -0.1862, |
| "num_tokens": 33837046.0, |
| "residual_var": 0.025829095393419266, |
| "reward": 0.7421875, |
| "reward_std": 0.06378497928380966, |
| "rewards/drgrpo_math_reward/mean": 0.7421875, |
| "rewards/drgrpo_math_reward/std": 0.4382871091365814, |
| "rho2": 0.15624995529651642, |
| "step": 199 |
| }, |
| { |
| "advantages/mean": 0.0, |
| "advantages/snr": 0.0, |
| "advantages/std": 0.29233402013778687, |
| "advantages/var": 0.08545917932991998, |
| "completions/clipped_ratio": -2.75, |
| "epoch": 1.148997134670487, |
| "grad_norm": 66.74123021504873, |
| "learning_rate": 1.6269923623968179e-06, |
| "loss": -0.1777, |
| "num_tokens": 33982336.0, |
| "residual_var": 0.04807081073522568, |
| "reward": 0.8359375, |
| "reward_std": 0.17859894037246704, |
| "rewards/drgrpo_math_reward/mean": 0.8359375, |
| "rewards/drgrpo_math_reward/std": 0.3710577189922333, |
| "rho2": 0.43749991059303284, |
| "step": 200 |
| }, |
| { |
| "advantages/mean": 8.149072527885437e-10, |
| "advantages/snr": 3.00906987923814e-09, |
| "advantages/std": 0.27081698179244995, |
| "advantages/var": 0.07334183762717217, |
| "completions/clipped_ratio": -2.5625, |
| "epoch": 1.154727793696275, |
| "grad_norm": 60.27379111957229, |
| "learning_rate": 1.6234898018587336e-06, |
| "loss": -2.2732, |
| "num_tokens": 34146966.0, |
| "residual_var": 0.04583865776658058, |
| "reward": 0.76171875, |
| "reward_std": 0.15308445692062378, |
| "rewards/drgrpo_math_reward/mean": 0.76171875, |
| "rewards/drgrpo_math_reward/std": 0.4268665909767151, |
| "rho2": 0.3749999403953552, |
| "step": 201 |
| }, |
| { |
| "advantages/mean": -1.6298145055770874e-09, |
| "advantages/snr": 6.552775417430944e-09, |
| "advantages/std": 0.24872124195098877, |
| "advantages/var": 0.061862256197642296, |
| "completions/clipped_ratio": -2.828125, |
| "epoch": 1.1604584527220632, |
| "grad_norm": 49.46313200862665, |
| "learning_rate": 1.6199746829790905e-06, |
| "loss": 0.0609, |
| "num_tokens": 34305411.0, |
| "residual_var": 0.04639669507741928, |
| "reward": 0.80859375, |
| "reward_std": 0.1157250627875328, |
| "rewards/drgrpo_math_reward/mean": 0.80859375, |
| "rewards/drgrpo_math_reward/std": 0.39417871832847595, |
| "rho2": 0.2499999701976776, |
| "step": 202 |
| }, |
| { |
| "advantages/mean": 4.656612873077393e-10, |
| "advantages/snr": 1.7994861469150812e-09, |
| "advantages/std": 0.2587745785713196, |
| "advantages/var": 0.06696428251476405, |
| "completions/clipped_ratio": -2.78125, |
| "epoch": 1.1661891117478511, |
| "grad_norm": 107.60304176172177, |
| "learning_rate": 1.6164470765594697e-06, |
| "loss": -0.9408, |
| "num_tokens": 34446676.0, |
| "residual_var": 0.048130594193935394, |
| "reward": 0.78515625, |
| "reward_std": 0.1332252472639084, |
| "rewards/drgrpo_math_reward/mean": 0.78515625, |
| "rewards/drgrpo_math_reward/std": 0.4115184545516968, |
| "rho2": 0.2812499403953552, |
| "step": 203 |
| }, |
| { |
| "advantages/mean": -1.6298145055770874e-09, |
| "advantages/snr": 8.331737022612655e-09, |
| "advantages/std": 0.19561520218849182, |
| "advantages/var": 0.038265307327244535, |
| "completions/clipped_ratio": -2.8125, |
| "epoch": 1.171919770773639, |
| "grad_norm": 44.088290605449814, |
| "learning_rate": 1.6129070536529765e-06, |
| "loss": -0.2678, |
| "num_tokens": 34593589.0, |
| "residual_var": 0.031090570613741875, |
| "reward": 0.84375, |
| "reward_std": 0.07825092226266861, |
| "rewards/drgrpo_math_reward/mean": 0.84375, |
| "rewards/drgrpo_math_reward/std": 0.3638034462928772, |
| "rho2": 0.1874999701976776, |
| "step": 204 |
| }, |
| { |
| "advantages/mean": 5.820766091346741e-10, |
| "advantages/snr": 2.3165172084047557e-09, |
| "advantages/std": 0.2512722909450531, |
| "advantages/var": 0.06313776419677541, |
| "completions/clipped_ratio": -2.90625, |
| "epoch": 1.177650429799427, |
| "grad_norm": 47.83310352768681, |
| "learning_rate": 1.6093546855628081e-06, |
| "loss": -0.3666, |
| "num_tokens": 34744310.0, |
| "residual_var": 0.04932638630270958, |
| "reward": 0.88671875, |
| "reward_std": 0.11625302582979202, |
| "rewards/drgrpo_math_reward/mean": 0.88671875, |
| "rewards/drgrpo_math_reward/std": 0.31755712628364563, |
| "rho2": 0.21874995529651642, |
| "step": 205 |
| }, |
| { |
| "advantages/mean": 1.1641532182693481e-09, |
| "advantages/snr": 5.549560778490846e-09, |
| "advantages/std": 0.2097739279270172, |
| "advantages/var": 0.04400510083792941, |
| "completions/clipped_ratio": -2.671875, |
| "epoch": 1.183381088825215, |
| "grad_norm": 35.1700095212735, |
| "learning_rate": 1.6057900438408199e-06, |
| "loss": -0.2259, |
| "num_tokens": 34898568.0, |
| "residual_var": 0.037129320204257965, |
| "reward": 0.76953125, |
| "reward_std": 0.08390620350837708, |
| "rewards/drgrpo_math_reward/mean": 0.76953125, |
| "rewards/drgrpo_math_reward/std": 0.4219578504562378, |
| "rho2": 0.1562499701976776, |
| "step": 206 |
| }, |
| { |
| "advantages/mean": 1.6298145055770874e-09, |
| "advantages/snr": 5.3781130186484915e-09, |
| "advantages/std": 0.30304577946662903, |
| "advantages/var": 0.09183674445253676, |
| "completions/clipped_ratio": -2.921875, |
| "epoch": 1.189111747851003, |
| "grad_norm": 63.771394008210834, |
| "learning_rate": 1.6022132002860821e-06, |
| "loss": -0.101, |
| "num_tokens": 35049921.0, |
| "residual_var": 0.05739797279238701, |
| "reward": 0.8359375, |
| "reward_std": 0.17833054065704346, |
| "rewards/drgrpo_math_reward/mean": 0.8359375, |
| "rewards/drgrpo_math_reward/std": 0.3710577189922333, |
| "rho2": 0.3749999403953552, |
| "step": 207 |
| }, |
| { |
| "advantages/mean": 6.984919309616089e-10, |
| "advantages/snr": 3.1118655063061774e-09, |
| "advantages/std": 0.22446082532405853, |
| "advantages/var": 0.050382662105157516, |
| "completions/clipped_ratio": -2.859375, |
| "epoch": 1.1948424068767909, |
| "grad_norm": 51.11574479359161, |
| "learning_rate": 1.598624226943435e-06, |
| "loss": 0.1908, |
| "num_tokens": 35184799.0, |
| "residual_var": 0.04093591868877411, |
| "reward": 0.85546875, |
| "reward_std": 0.09666222333908081, |
| "rewards/drgrpo_math_reward/mean": 0.85546875, |
| "rewards/drgrpo_math_reward/std": 0.35231640934944153, |
| "rho2": 0.18749994039535522, |
| "step": 208 |
| }, |
| { |
| "advantages/mean": -1.280568540096283e-09, |
| "advantages/snr": 4.6483877879022735e-09, |
| "advantages/std": 0.2754865884780884, |
| "advantages/var": 0.07589286043129562, |
| "completions/clipped_ratio": -2.796875, |
| "epoch": 1.2005730659025788, |
| "grad_norm": 54.22474322398488, |
| "learning_rate": 1.595023196102037e-06, |
| "loss": -0.0642, |
| "num_tokens": 35331541.0, |
| "residual_var": 0.05691966041922569, |
| "reward": 0.69921875, |
| "reward_std": 0.1422979235649109, |
| "rewards/drgrpo_math_reward/mean": 0.69921875, |
| "rewards/drgrpo_math_reward/std": 0.45949608087539673, |
| "rho2": 0.24999995529651642, |
| "step": 209 |
| }, |
| { |
| "advantages/mean": 4.656612873077393e-10, |
| "advantages/snr": 1.569652855393217e-09, |
| "advantages/std": 0.29666513204574585, |
| "advantages/var": 0.08801020057171982, |
| "completions/clipped_ratio": -2.859375, |
| "epoch": 1.2063037249283668, |
| "grad_norm": 52.08833318906159, |
| "learning_rate": 1.5914101802939088e-06, |
| "loss": -0.5718, |
| "num_tokens": 35486393.0, |
| "residual_var": 0.06600767374038696, |
| "reward": 0.765625, |
| "reward_std": 0.1607092320919037, |
| "rewards/drgrpo_math_reward/mean": 0.765625, |
| "rewards/drgrpo_math_reward/std": 0.42443734407424927, |
| "rho2": 0.24999994039535522, |
| "step": 210 |
| }, |
| { |
| "advantages/mean": 1.1641532182693481e-09, |
| "advantages/snr": 4.139732864518914e-09, |
| "advantages/std": 0.28121456503868103, |
| "advantages/var": 0.07908163158989456, |
| "completions/clipped_ratio": -2.90625, |
| "epoch": 1.2120343839541547, |
| "grad_norm": 48.84084160737796, |
| "learning_rate": 1.587785252292473e-06, |
| "loss": -0.0827, |
| "num_tokens": 35632888.0, |
| "residual_var": 0.051897335797548294, |
| "reward": 0.8359375, |
| "reward_std": 0.152285635471344, |
| "rewards/drgrpo_math_reward/mean": 0.8359375, |
| "rewards/drgrpo_math_reward/std": 0.3710577189922333, |
| "rho2": 0.34374991059303284, |
| "step": 211 |
| }, |
| { |
| "advantages/mean": 9.313225746154785e-10, |
| "advantages/snr": 4.31630082528225e-09, |
| "advantages/std": 0.21576867997646332, |
| "advantages/var": 0.04655612325878544, |
| "completions/clipped_ratio": -2.84375, |
| "epoch": 1.2177650429799427, |
| "grad_norm": 31.030637833339267, |
| "learning_rate": 1.584148485111087e-06, |
| "loss": -0.1616, |
| "num_tokens": 35768842.0, |
| "residual_var": 0.03637198358774185, |
| "reward": 0.85546875, |
| "reward_std": 0.09916850179433823, |
| "rewards/drgrpo_math_reward/mean": 0.85546875, |
| "rewards/drgrpo_math_reward/std": 0.35231640934944153, |
| "rho2": 0.21874994039535522, |
| "step": 212 |
| }, |
| { |
| "advantages/mean": 2.3283064365386963e-10, |
| "advantages/snr": 1.0244025692171684e-09, |
| "advantages/std": 0.22728431224822998, |
| "advantages/var": 0.051658158594150905, |
| "completions/clipped_ratio": -2.703125, |
| "epoch": 1.2234957020057307, |
| "grad_norm": 50.84114456820672, |
| "learning_rate": 1.5804999520015733e-06, |
| "loss": -0.6298, |
| "num_tokens": 35930457.0, |
| "residual_var": 0.04035795107483864, |
| "reward": 0.75390625, |
| "reward_std": 0.10429336875677109, |
| "rewards/drgrpo_math_reward/mean": 0.75390625, |
| "rewards/drgrpo_math_reward/std": 0.43157756328582764, |
| "rho2": 0.21874995529651642, |
| "step": 213 |
| }, |
| { |
| "advantages/mean": 1.280568540096283e-09, |
| "advantages/snr": 4.770199825726274e-09, |
| "advantages/std": 0.2684517502784729, |
| "advantages/var": 0.07206634222757557, |
| "completions/clipped_ratio": -2.828125, |
| "epoch": 1.2292263610315186, |
| "grad_norm": 43.68379202104907, |
| "learning_rate": 1.5768397264527446e-06, |
| "loss": 0.3753, |
| "num_tokens": 36081065.0, |
| "residual_var": 0.047293536365032196, |
| "reward": 0.78515625, |
| "reward_std": 0.1454533040523529, |
| "rewards/drgrpo_math_reward/mean": 0.78515625, |
| "rewards/drgrpo_math_reward/std": 0.4115184545516968, |
| "rho2": 0.3437499403953552, |
| "step": 214 |
| }, |
| { |
| "advantages/mean": 5.820766091346741e-10, |
| "advantages/snr": 2.5932214274099493e-09, |
| "advantages/std": 0.22446081042289734, |
| "advantages/var": 0.05038265541570386, |
| "completions/clipped_ratio": -2.78125, |
| "epoch": 1.2349570200573066, |
| "grad_norm": 31.309381344591714, |
| "learning_rate": 1.5731678821889222e-06, |
| "loss": 0.0674, |
| "num_tokens": 36228838.0, |
| "residual_var": 0.04093592241406441, |
| "reward": 0.83984375, |
| "reward_std": 0.10311631113290787, |
| "rewards/drgrpo_math_reward/mean": 0.83984375, |
| "rewards/drgrpo_math_reward/std": 0.36746934056282043, |
| "rho2": 0.18749995529651642, |
| "step": 215 |
| }, |
| { |
| "advantages/mean": -4.656612873077393e-10, |
| "advantages/snr": 2.252712264849656e-09, |
| "advantages/std": 0.20671138167381287, |
| "advantages/var": 0.04272959531349674, |
| "completions/clipped_ratio": -2.9375, |
| "epoch": 1.2406876790830945, |
| "grad_norm": 33.93200973085216, |
| "learning_rate": 1.569484493168452e-06, |
| "loss": -0.4791, |
| "num_tokens": 36370678.0, |
| "residual_var": 0.03338250517845154, |
| "reward": 0.80859375, |
| "reward_std": 0.08929947018623352, |
| "rewards/drgrpo_math_reward/mean": 0.80859375, |
| "rewards/drgrpo_math_reward/std": 0.39417871832847595, |
| "rho2": 0.2187499701976776, |
| "step": 216 |
| }, |
| { |
| "advantages/mean": 8.149072527885437e-10, |
| "advantages/snr": 3.1950753658538154e-09, |
| "advantages/std": 0.25505101680755615, |
| "advantages/var": 0.06505102117456829, |
| "completions/clipped_ratio": -2.890625, |
| "epoch": 1.2464183381088825, |
| "grad_norm": 43.01139157018457, |
| "learning_rate": 1.5657896335822145e-06, |
| "loss": -0.1037, |
| "num_tokens": 36514432.0, |
| "residual_var": 0.046755433082580566, |
| "reward": 0.796875, |
| "reward_std": 0.13151776790618896, |
| "rewards/drgrpo_math_reward/mean": 0.796875, |
| "rewards/drgrpo_math_reward/std": 0.40311288833618164, |
| "rho2": 0.2812499403953552, |
| "step": 217 |
| }, |
| { |
| "advantages/mean": -6.984919309616089e-10, |
| "advantages/snr": 2.822921321434942e-09, |
| "advantages/std": 0.24743583798408508, |
| "advantages/var": 0.0612244939188864, |
| "completions/clipped_ratio": -2.890625, |
| "epoch": 1.2521489971346704, |
| "grad_norm": 52.736686618922654, |
| "learning_rate": 1.5620833778521306e-06, |
| "loss": -0.3578, |
| "num_tokens": 36655270.0, |
| "residual_var": 0.04209185019135475, |
| "reward": 0.9140625, |
| "reward_std": 0.13402405381202698, |
| "rewards/drgrpo_math_reward/mean": 0.9140625, |
| "rewards/drgrpo_math_reward/std": 0.28082075715065, |
| "rho2": 0.3124999403953552, |
| "step": 218 |
| }, |
| { |
| "advantages/mean": 1.3969838619232178e-09, |
| "advantages/snr": 5.1361226810312885e-09, |
| "advantages/std": 0.27199190855026245, |
| "advantages/var": 0.07397959831681433, |
| "completions/clipped_ratio": -2.84375, |
| "epoch": 1.2578796561604584, |
| "grad_norm": 66.86254250830923, |
| "learning_rate": 1.5583658006296623e-06, |
| "loss": 0.1719, |
| "num_tokens": 36825958.0, |
| "residual_var": 0.053172849118709564, |
| "reward": 0.65625, |
| "reward_std": 0.14651167392730713, |
| "rewards/drgrpo_math_reward/mean": 0.65625, |
| "rewards/drgrpo_math_reward/std": 0.47588926553726196, |
| "rho2": 0.2812499403953552, |
| "step": 219 |
| }, |
| { |
| "advantages/mean": 8.149072527885437e-10, |
| "advantages/snr": 3.179527502936079e-09, |
| "advantages/std": 0.2562982141971588, |
| "advantages/var": 0.0656887746006527, |
| "completions/clipped_ratio": -2.8125, |
| "epoch": 1.2636103151862463, |
| "grad_norm": 54.50502110716351, |
| "learning_rate": 1.5546369767943102e-06, |
| "loss": -0.5218, |
| "num_tokens": 36985406.0, |
| "residual_var": 0.043108273297548294, |
| "reward": 0.84765625, |
| "reward_std": 0.13861849904060364, |
| "rewards/drgrpo_math_reward/mean": 0.84765625, |
| "rewards/drgrpo_math_reward/std": 0.3600577116012573, |
| "rho2": 0.3437499403953552, |
| "step": 220 |
| }, |
| { |
| "advantages/mean": -2.3283064365386963e-10, |
| "advantages/snr": 8.560204468385481e-10, |
| "advantages/std": 0.27199190855026245, |
| "advantages/var": 0.07397959831681433, |
| "completions/clipped_ratio": -2.890625, |
| "epoch": 1.2693409742120343, |
| "grad_norm": 40.739286290569964, |
| "learning_rate": 1.5508969814521024e-06, |
| "loss": 0.0063, |
| "num_tokens": 37116884.0, |
| "residual_var": 0.060108430683612823, |
| "reward": 0.8984375, |
| "reward_std": 0.1334872543811798, |
| "rewards/drgrpo_math_reward/mean": 0.8984375, |
| "rewards/drgrpo_math_reward/std": 0.3026638329029083, |
| "rho2": 0.18749995529651642, |
| "step": 221 |
| }, |
| { |
| "advantages/mean": -3.4924596548080444e-10, |
| "advantages/snr": 1.2731026598319146e-09, |
| "advantages/std": 0.2743266224861145, |
| "advantages/var": 0.07525509580463918, |
| "completions/clipped_ratio": -2.90625, |
| "epoch": 1.2750716332378222, |
| "grad_norm": 58.428618246393256, |
| "learning_rate": 1.5471458899340858e-06, |
| "loss": -1.2012, |
| "num_tokens": 37269959.0, |
| "residual_var": 0.05408962070941925, |
| "reward": 0.7734375, |
| "reward_std": 0.14822159707546234, |
| "rewards/drgrpo_math_reward/mean": 0.7734375, |
| "rewards/drgrpo_math_reward/std": 0.41942715644836426, |
| "rho2": 0.2812499403953552, |
| "step": 222 |
| }, |
| { |
| "advantages/mean": -1.1641532182693481e-09, |
| "advantages/snr": 4.656612686812885e-09, |
| "advantages/std": 0.25, |
| "advantages/var": 0.0625, |
| "completions/clipped_ratio": -2.65625, |
| "epoch": 1.2808022922636102, |
| "grad_norm": 48.94953357605194, |
| "learning_rate": 1.5433837777948058e-06, |
| "loss": -0.4937, |
| "num_tokens": 37434350.0, |
| "residual_var": 0.046875014901161194, |
| "reward": 0.765625, |
| "reward_std": 0.11572261154651642, |
| "rewards/drgrpo_math_reward/mean": 0.765625, |
| "rewards/drgrpo_math_reward/std": 0.42443734407424927, |
| "rho2": 0.24999994039535522, |
| "step": 223 |
| }, |
| { |
| "advantages/mean": -1.5133991837501526e-09, |
| "advantages/snr": 5.4479589668666796e-09, |
| "advantages/std": 0.27779194712638855, |
| "advantages/var": 0.07716836588827025, |
| "completions/clipped_ratio": -2.828125, |
| "epoch": 1.2865329512893982, |
| "grad_norm": 48.425063051432105, |
| "learning_rate": 1.5396107208107846e-06, |
| "loss": -0.4051, |
| "num_tokens": 37593087.0, |
| "residual_var": 0.05546478554606438, |
| "reward": 0.82421875, |
| "reward_std": 0.1434749811887741, |
| "rewards/drgrpo_math_reward/mean": 0.82421875, |
| "rewards/drgrpo_math_reward/std": 0.3813795745372772, |
| "rho2": 0.2812499403953552, |
| "step": 224 |
| }, |
| { |
| "advantages/mean": -1.7462298274040222e-09, |
| "advantages/snr": 8.781698823081943e-09, |
| "advantages/std": 0.19884873926639557, |
| "advantages/var": 0.03954082110783497, |
| "completions/clipped_ratio": -2.953125, |
| "epoch": 1.2922636103151863, |
| "grad_norm": 34.77580496494329, |
| "learning_rate": 1.5358267949789964e-06, |
| "loss": -0.0698, |
| "num_tokens": 37731294.0, |
| "residual_var": 0.032126929610967636, |
| "reward": 0.90625, |
| "reward_std": 0.07996084541082382, |
| "rewards/drgrpo_math_reward/mean": 0.90625, |
| "rewards/drgrpo_math_reward/std": 0.2920515835285187, |
| "rho2": 0.1874999701976776, |
| "step": 225 |
| }, |
| { |
| "advantages/mean": 0.0, |
| "advantages/snr": 0.0, |
| "advantages/std": 0.25253814458847046, |
| "advantages/var": 0.06377551447218721, |
| "completions/clipped_ratio": -2.765625, |
| "epoch": 1.2979942693409743, |
| "grad_norm": 41.08991075758347, |
| "learning_rate": 1.5320320765153365e-06, |
| "loss": -0.1379, |
| "num_tokens": 37881528.0, |
| "residual_var": 0.04583866521716118, |
| "reward": 0.828125, |
| "reward_std": 0.1233537495136261, |
| "rewards/drgrpo_math_reward/mean": 0.828125, |
| "rewards/drgrpo_math_reward/std": 0.3780108094215393, |
| "rho2": 0.28124991059303284, |
| "step": 226 |
| }, |
| { |
| "advantages/mean": -4.656612873077393e-10, |
| "advantages/snr": 1.7194685024217942e-09, |
| "advantages/std": 0.27081698179244995, |
| "advantages/var": 0.07334183762717217, |
| "completions/clipped_ratio": -2.890625, |
| "epoch": 1.3037249283667622, |
| "grad_norm": 53.21476933729365, |
| "learning_rate": 1.5282266418530844e-06, |
| "loss": 0.1193, |
| "num_tokens": 38027798.0, |
| "residual_var": 0.05042253062129021, |
| "reward": 0.82421875, |
| "reward_std": 0.15243536233901978, |
| "rewards/drgrpo_math_reward/mean": 0.82421875, |
| "rewards/drgrpo_math_reward/std": 0.3813795745372772, |
| "rho2": 0.3124999403953552, |
| "step": 227 |
| }, |
| { |
| "advantages/mean": 1.1641532182693481e-10, |
| "advantages/snr": 5.717766645206049e-10, |
| "advantages/std": 0.20360277593135834, |
| "advantages/var": 0.04145409036695491, |
| "completions/clipped_ratio": -2.796875, |
| "epoch": 1.3094555873925502, |
| "grad_norm": 32.614018221273795, |
| "learning_rate": 1.5244105676413656e-06, |
| "loss": -0.5882, |
| "num_tokens": 38181529.0, |
| "residual_var": 0.03368145227432251, |
| "reward": 0.84765625, |
| "reward_std": 0.08166831731796265, |
| "rewards/drgrpo_math_reward/mean": 0.84765625, |
| "rewards/drgrpo_math_reward/std": 0.3600577116012573, |
| "rho2": 0.1874999701976776, |
| "step": 228 |
| }, |
| { |
| "advantages/mean": 1.862645149230957e-09, |
| "advantages/snr": 6.877874009687177e-09, |
| "advantages/std": 0.27081698179244995, |
| "advantages/var": 0.07334183762717217, |
| "completions/clipped_ratio": -2.859375, |
| "epoch": 1.3151862464183381, |
| "grad_norm": 57.591177765790256, |
| "learning_rate": 1.5205839307436086e-06, |
| "loss": -1.1811, |
| "num_tokens": 38330487.0, |
| "residual_var": 0.05042252689599991, |
| "reward": 0.78515625, |
| "reward_std": 0.14598126709461212, |
| "rewards/drgrpo_math_reward/mean": 0.78515625, |
| "rewards/drgrpo_math_reward/std": 0.4115184545516968, |
| "rho2": 0.31249991059303284, |
| "step": 229 |
| }, |
| { |
| "advantages/mean": 1.1641532182693481e-10, |
| "advantages/snr": 4.058712560030977e-10, |
| "advantages/std": 0.2868281900882721, |
| "advantages/var": 0.08227041062931395, |
| "completions/clipped_ratio": -2.828125, |
| "epoch": 1.320916905444126, |
| "grad_norm": 63.47812761517252, |
| "learning_rate": 1.5167468082359944e-06, |
| "loss": -0.6156, |
| "num_tokens": 38482705.0, |
| "residual_var": 0.05398997291922569, |
| "reward": 0.80078125, |
| "reward_std": 0.16861121356487274, |
| "rewards/drgrpo_math_reward/mean": 0.80078125, |
| "rewards/drgrpo_math_reward/std": 0.40019527077674866, |
| "rho2": 0.3437499403953552, |
| "step": 230 |
| }, |
| { |
| "advantages/mean": 2.3283064365386963e-09, |
| "advantages/snr": 8.56020446838548e-09, |
| "advantages/std": 0.27199190855026245, |
| "advantages/var": 0.07397959831681433, |
| "completions/clipped_ratio": -2.796875, |
| "epoch": 1.326647564469914, |
| "grad_norm": 64.11487366448154, |
| "learning_rate": 1.5128992774059062e-06, |
| "loss": -1.467, |
| "num_tokens": 38625275.0, |
| "residual_var": 0.041613537818193436, |
| "reward": 0.8828125, |
| "reward_std": 0.1666392683982849, |
| "rewards/drgrpo_math_reward/mean": 0.8828125, |
| "rewards/drgrpo_math_reward/std": 0.3222736418247223, |
| "rho2": 0.43749991059303284, |
| "step": 231 |
| }, |
| { |
| "advantages/mean": -1.862645149230957e-09, |
| "advantages/snr": 6.25598551702665e-09, |
| "advantages/std": 0.29773807525634766, |
| "advantages/var": 0.08864796145735454, |
| "completions/clipped_ratio": -2.890625, |
| "epoch": 1.332378223495702, |
| "grad_norm": 49.53825848796451, |
| "learning_rate": 1.5090414157503713e-06, |
| "loss": -0.8094, |
| "num_tokens": 38774409.0, |
| "residual_var": 0.06371574103832245, |
| "reward": 0.80078125, |
| "reward_std": 0.1618887335062027, |
| "rewards/drgrpo_math_reward/mean": 0.80078125, |
| "rewards/drgrpo_math_reward/std": 0.40019527077674866, |
| "rho2": 0.2812499403953552, |
| "step": 232 |
| }, |
| { |
| "advantages/mean": 6.984919309616089e-10, |
| "advantages/snr": 3.1937709000681565e-09, |
| "advantages/std": 0.2187044471502304, |
| "advantages/var": 0.047831635203287926, |
| "completions/clipped_ratio": -2.90625, |
| "epoch": 1.33810888252149, |
| "grad_norm": 32.156107327334965, |
| "learning_rate": 1.5051733009745012e-06, |
| "loss": -0.1481, |
| "num_tokens": 38923354.0, |
| "residual_var": 0.037368472665548325, |
| "reward": 0.87890625, |
| "reward_std": 0.09442433714866638, |
| "rewards/drgrpo_math_reward/mean": 0.87890625, |
| "rewards/drgrpo_math_reward/std": 0.3268752694129944, |
| "rho2": 0.21874994039535522, |
| "step": 233 |
| }, |
| { |
| "advantages/mean": 4.656612873077393e-10, |
| "advantages/snr": 2.3417863528218515e-09, |
| "advantages/std": 0.19884873926639557, |
| "advantages/var": 0.03954082110783497, |
| "completions/clipped_ratio": -2.859375, |
| "epoch": 1.343839541547278, |
| "grad_norm": 53.25849122067783, |
| "learning_rate": 1.501295010989925e-06, |
| "loss": -0.1868, |
| "num_tokens": 39081935.0, |
| "residual_var": 0.034598227590322495, |
| "reward": 0.875, |
| "reward_std": 0.07285766303539276, |
| "rewards/drgrpo_math_reward/mean": 0.875, |
| "rewards/drgrpo_math_reward/std": 0.33136674761772156, |
| "rho2": 0.12499997019767761, |
| "step": 234 |
| }, |
| { |
| "advantages/mean": -9.313225746154785e-10, |
| "advantages/snr": 4.258361200090875e-09, |
| "advantages/std": 0.2187044471502304, |
| "advantages/var": 0.047831635203287926, |
| "completions/clipped_ratio": -2.953125, |
| "epoch": 1.3495702005730659, |
| "grad_norm": 41.785440666032066, |
| "learning_rate": 1.4974066239132218e-06, |
| "loss": -0.5715, |
| "num_tokens": 39224346.0, |
| "residual_var": 0.037368472665548325, |
| "reward": 0.88671875, |
| "reward_std": 0.09442433714866638, |
| "rewards/drgrpo_math_reward/mean": 0.88671875, |
| "rewards/drgrpo_math_reward/std": 0.31755712628364563, |
| "rho2": 0.2187499701976776, |
| "step": 235 |
| }, |
| { |
| "advantages/mean": -1.0477378964424133e-09, |
| "advantages/snr": 4.148830097177908e-09, |
| "advantages/std": 0.25253814458847046, |
| "advantages/var": 0.06377551447218721, |
| "completions/clipped_ratio": -2.921875, |
| "epoch": 1.3553008595988538, |
| "grad_norm": 53.85641702669578, |
| "learning_rate": 1.4935082180643467e-06, |
| "loss": -0.4736, |
| "num_tokens": 39366574.0, |
| "residual_var": 0.04783164709806442, |
| "reward": 0.875, |
| "reward_std": 0.12388662993907928, |
| "rewards/drgrpo_math_reward/mean": 0.875, |
| "rewards/drgrpo_math_reward/std": 0.33136674761772156, |
| "rho2": 0.2499999701976776, |
| "step": 236 |
| }, |
| { |
| "advantages/mean": -1.6298145055770874e-09, |
| "advantages/snr": 6.125614781517557e-09, |
| "advantages/std": 0.26606544852256775, |
| "advantages/var": 0.07079082289751515, |
| "completions/clipped_ratio": -2.609375, |
| "epoch": 1.3610315186246418, |
| "grad_norm": 56.90528111968534, |
| "learning_rate": 1.4895998719650523e-06, |
| "loss": -0.7502, |
| "num_tokens": 39526191.0, |
| "residual_var": 0.048668697476387024, |
| "reward": 0.72265625, |
| "reward_std": 0.14427624642848969, |
| "rewards/drgrpo_math_reward/mean": 0.72265625, |
| "rewards/drgrpo_math_reward/std": 0.4485645890235901, |
| "rho2": 0.3124999701976776, |
| "step": 237 |
| }, |
| { |
| "advantages/mean": -4.656612873077393e-10, |
| "advantages/snr": 1.690322831964463e-09, |
| "advantages/std": 0.2754865884780884, |
| "advantages/var": 0.07589286043129562, |
| "completions/clipped_ratio": -2.84375, |
| "epoch": 1.3667621776504297, |
| "grad_norm": 43.19070198809019, |
| "learning_rate": 1.4856816643373082e-06, |
| "loss": -0.9103, |
| "num_tokens": 39683090.0, |
| "residual_var": 0.04980470612645149, |
| "reward": 0.76953125, |
| "reward_std": 0.15532232820987701, |
| "rewards/drgrpo_math_reward/mean": 0.76953125, |
| "rewards/drgrpo_math_reward/std": 0.4219578504562378, |
| "rho2": 0.34374991059303284, |
| "step": 238 |
| }, |
| { |
| "advantages/mean": 1.280568540096283e-09, |
| "advantages/snr": 5.669318067864984e-09, |
| "advantages/std": 0.22587698698043823, |
| "advantages/var": 0.05102041324736106, |
| "completions/clipped_ratio": -2.75, |
| "epoch": 1.3724928366762177, |
| "grad_norm": 43.06595444024102, |
| "learning_rate": 1.4817536741017151e-06, |
| "loss": -0.0039, |
| "num_tokens": 39836659.0, |
| "residual_var": 0.03985970467329025, |
| "reward": 0.8125, |
| "reward_std": 0.09784172475337982, |
| "rewards/drgrpo_math_reward/mean": 0.8125, |
| "rewards/drgrpo_math_reward/std": 0.3910769522190094, |
| "rho2": 0.2187499701976776, |
| "step": 239 |
| }, |
| { |
| "advantages/mean": -9.313225746154785e-10, |
| "advantages/snr": 4.972694864437645e-09, |
| "advantages/std": 0.18728728592395782, |
| "advantages/var": 0.03507652746876233, |
| "completions/clipped_ratio": -2.96875, |
| "epoch": 1.3782234957020059, |
| "grad_norm": 48.141601296719074, |
| "learning_rate": 1.4778159803759156e-06, |
| "loss": -0.1759, |
| "num_tokens": 39974954.0, |
| "residual_var": 0.02849968895316124, |
| "reward": 0.87890625, |
| "reward_std": 0.07483352720737457, |
| "rewards/drgrpo_math_reward/mean": 0.87890625, |
| "rewards/drgrpo_math_reward/std": 0.3268752694129944, |
| "rho2": 0.1874999701976776, |
| "step": 240 |
| }, |
| { |
| "advantages/mean": 4.656612873077393e-10, |
| "advantages/snr": 1.9545560195055106e-09, |
| "advantages/std": 0.23824401199817657, |
| "advantages/var": 0.056760209252987304, |
| "completions/clipped_ratio": -2.75, |
| "epoch": 1.3839541547277938, |
| "grad_norm": 48.60932568929768, |
| "learning_rate": 1.4738686624729987e-06, |
| "loss": -0.2783, |
| "num_tokens": 40125785.0, |
| "residual_var": 0.04079641401767731, |
| "reward": 0.81640625, |
| "reward_std": 0.12297550588846207, |
| "rewards/drgrpo_math_reward/mean": 0.81640625, |
| "rewards/drgrpo_math_reward/std": 0.387910932302475, |
| "rho2": 0.2812499403953552, |
| "step": 241 |
| }, |
| { |
| "advantages/mean": -6.984919309616089e-10, |
| "advantages/snr": 3.6635067535109087e-09, |
| "advantages/std": 0.19066210091114044, |
| "advantages/var": 0.0363520367238499, |
| "completions/clipped_ratio": -2.84375, |
| "epoch": 1.3896848137535818, |
| "grad_norm": 62.29778777727231, |
| "learning_rate": 1.4699117998999054e-06, |
| "loss": -0.6888, |
| "num_tokens": 40284183.0, |
| "residual_var": 0.02840004302561283, |
| "reward": 0.76953125, |
| "reward_std": 0.08246467262506485, |
| "rewards/drgrpo_math_reward/mean": 0.76953125, |
| "rewards/drgrpo_math_reward/std": 0.4219578504562378, |
| "rho2": 0.21874994039535522, |
| "step": 242 |
| }, |
| { |
| "advantages/mean": 1.6298145055770874e-09, |
| "advantages/snr": 5.91612991187562e-09, |
| "advantages/std": 0.2754865884780884, |
| "advantages/var": 0.07589286043129562, |
| "completions/clipped_ratio": -2.859375, |
| "epoch": 1.3954154727793697, |
| "grad_norm": 64.60278694418795, |
| "learning_rate": 1.4659454723558246e-06, |
| "loss": -0.3987, |
| "num_tokens": 40437955.0, |
| "residual_var": 0.049804698675870895, |
| "reward": 0.79296875, |
| "reward_std": 0.15532232820987701, |
| "rewards/drgrpo_math_reward/mean": 0.79296875, |
| "rewards/drgrpo_math_reward/std": 0.40597182512283325, |
| "rho2": 0.3437499403953552, |
| "step": 243 |
| }, |
| { |
| "advantages/mean": 1.0477378964424133e-09, |
| "advantages/snr": 3.937895216689858e-09, |
| "advantages/std": 0.26606544852256775, |
| "advantages/var": 0.07079082289751515, |
| "completions/clipped_ratio": -2.828125, |
| "epoch": 1.4011461318051577, |
| "grad_norm": 96.87862275971443, |
| "learning_rate": 1.4619697597305899e-06, |
| "loss": -0.3258, |
| "num_tokens": 40588989.0, |
| "residual_var": 0.05309312418103218, |
| "reward": 0.86328125, |
| "reward_std": 0.13071896135807037, |
| "rewards/drgrpo_math_reward/mean": 0.86328125, |
| "rewards/drgrpo_math_reward/std": 0.34422317147254944, |
| "rho2": 0.24999994039535522, |
| "step": 244 |
| }, |
| { |
| "advantages/mean": -1.7462298274040222e-09, |
| "advantages/snr": 6.28610582583922e-09, |
| "advantages/std": 0.27779197692871094, |
| "advantages/var": 0.07716838244596147, |
| "completions/clipped_ratio": -2.90625, |
| "epoch": 1.4068767908309456, |
| "grad_norm": 53.57336970473912, |
| "learning_rate": 1.4579847421030676e-06, |
| "loss": -0.1779, |
| "num_tokens": 40729537.0, |
| "residual_var": 0.050641756504774094, |
| "reward": 0.87890625, |
| "reward_std": 0.15057817101478577, |
| "rewards/drgrpo_math_reward/mean": 0.87890625, |
| "rewards/drgrpo_math_reward/std": 0.3268752694129944, |
| "rho2": 0.3437499403953552, |
| "step": 245 |
| }, |
| { |
| "advantages/mean": 2.0954757928848267e-09, |
| "advantages/snr": 6.867192539799447e-09, |
| "advantages/std": 0.30514299869537354, |
| "advantages/var": 0.09311224965280473, |
| "completions/clipped_ratio": -2.75, |
| "epoch": 1.4126074498567336, |
| "grad_norm": 76.51726864020476, |
| "learning_rate": 1.4539904997395467e-06, |
| "loss": -0.1959, |
| "num_tokens": 40887865.0, |
| "residual_var": 0.055285416543483734, |
| "reward": 0.7421875, |
| "reward_std": 0.18596169352531433, |
| "rewards/drgrpo_math_reward/mean": 0.7421875, |
| "rewards/drgrpo_math_reward/std": 0.4382871091365814, |
| "rho2": 0.40624991059303284, |
| "step": 246 |
| }, |
| { |
| "advantages/mean": 4.656612873077393e-10, |
| "advantages/snr": 1.750175847901529e-09, |
| "advantages/std": 0.26606541872024536, |
| "advantages/var": 0.07079080703877949, |
| "completions/clipped_ratio": -2.8125, |
| "epoch": 1.4183381088825215, |
| "grad_norm": 36.59307165492083, |
| "learning_rate": 1.449987113092121e-06, |
| "loss": -0.3474, |
| "num_tokens": 41031181.0, |
| "residual_var": 0.05309312045574188, |
| "reward": 0.83984375, |
| "reward_std": 0.13717305660247803, |
| "rewards/drgrpo_math_reward/mean": 0.83984375, |
| "rewards/drgrpo_math_reward/std": 0.36746934056282043, |
| "rho2": 0.2499999701976776, |
| "step": 247 |
| }, |
| { |
| "advantages/mean": 1.3969838619232178e-09, |
| "advantages/snr": 5.675479987696761e-09, |
| "advantages/std": 0.24614372849464417, |
| "advantages/var": 0.0605867350772451, |
| "completions/clipped_ratio": -2.875, |
| "epoch": 1.4240687679083095, |
| "grad_norm": 68.05512469299691, |
| "learning_rate": 1.4459746627970684e-06, |
| "loss": -0.2955, |
| "num_tokens": 41184203.0, |
| "residual_var": 0.043546728789806366, |
| "reward": 0.81640625, |
| "reward_std": 0.1263904571533203, |
| "rewards/drgrpo_math_reward/mean": 0.81640625, |
| "rewards/drgrpo_math_reward/std": 0.387910932302475, |
| "rho2": 0.2812499403953552, |
| "step": 248 |
| }, |
| { |
| "advantages/mean": -9.313225746154785e-10, |
| "advantages/snr": 3.4539869611748964e-09, |
| "advantages/std": 0.26963695883750916, |
| "advantages/var": 0.0727040895711406, |
| "completions/clipped_ratio": -2.84375, |
| "epoch": 1.4297994269340975, |
| "grad_norm": 59.493421784362106, |
| "learning_rate": 1.4419532296732268e-06, |
| "loss": -1.7344, |
| "num_tokens": 41343611.0, |
| "residual_var": 0.047712069004774094, |
| "reward": 0.8125, |
| "reward_std": 0.14545084536075592, |
| "rewards/drgrpo_math_reward/mean": 0.8125, |
| "rewards/drgrpo_math_reward/std": 0.3910769522190094, |
| "rho2": 0.34374991059303284, |
| "step": 249 |
| }, |
| { |
| "advantages/mean": -2.3283064365386963e-10, |
| "advantages/snr": 8.997430734575406e-10, |
| "advantages/std": 0.2587745785713196, |
| "advantages/var": 0.06696428251476405, |
| "completions/clipped_ratio": -2.90625, |
| "epoch": 1.4355300859598854, |
| "grad_norm": 65.74542653499171, |
| "learning_rate": 1.4379228947203662e-06, |
| "loss": -0.2735, |
| "num_tokens": 41495636.0, |
| "residual_var": 0.048130594193935394, |
| "reward": 0.82421875, |
| "reward_std": 0.1332252472639084, |
| "rewards/drgrpo_math_reward/mean": 0.82421875, |
| "rewards/drgrpo_math_reward/std": 0.3813795745372772, |
| "rho2": 0.2812499403953552, |
| "step": 250 |
| }, |
| { |
| "advantages/mean": 1.1641532182693481e-09, |
| "advantages/snr": 4.28010223419274e-09, |
| "advantages/std": 0.27199190855026245, |
| "advantages/var": 0.07397959831681433, |
| "completions/clipped_ratio": -2.90625, |
| "epoch": 1.4412607449856734, |
| "grad_norm": 80.0417918069496, |
| "learning_rate": 1.433883739117558e-06, |
| "loss": -0.8315, |
| "num_tokens": 41634633.0, |
| "residual_var": 0.046237263828516006, |
| "reward": 0.828125, |
| "reward_std": 0.1530819833278656, |
| "rewards/drgrpo_math_reward/mean": 0.828125, |
| "rewards/drgrpo_math_reward/std": 0.3780108094215393, |
| "rho2": 0.37499991059303284, |
| "step": 251 |
| }, |
| { |
| "advantages/mean": 4.656612873077393e-10, |
| "advantages/snr": 2.158150412641125e-09, |
| "advantages/std": 0.21576867997646332, |
| "advantages/var": 0.04655612325878544, |
| "completions/clipped_ratio": -2.8125, |
| "epoch": 1.4469914040114613, |
| "grad_norm": 64.1441250560586, |
| "learning_rate": 1.4298358442215388e-06, |
| "loss": -0.2455, |
| "num_tokens": 41781254.0, |
| "residual_var": 0.03346222639083862, |
| "reward": 0.82421875, |
| "reward_std": 0.10627168416976929, |
| "rewards/drgrpo_math_reward/mean": 0.82421875, |
| "rewards/drgrpo_math_reward/std": 0.3813795745372772, |
| "rho2": 0.2812499403953552, |
| "step": 252 |
| }, |
| { |
| "advantages/mean": 2.3283064365386963e-09, |
| "advantages/snr": 7.33473836777306e-09, |
| "advantages/std": 0.31743550300598145, |
| "advantages/var": 0.10076529856866046, |
| "completions/clipped_ratio": -2.84375, |
| "epoch": 1.4527220630372493, |
| "grad_norm": 68.26001489650739, |
| "learning_rate": 1.4257792915650725e-06, |
| "loss": -0.7108, |
| "num_tokens": 41924107.0, |
| "residual_var": 0.06297832727432251, |
| "reward": 0.8828125, |
| "reward_std": 0.18687033653259277, |
| "rewards/drgrpo_math_reward/mean": 0.8828125, |
| "rewards/drgrpo_math_reward/std": 0.3222736418247223, |
| "rho2": 0.3749999403953552, |
| "step": 253 |
| }, |
| { |
| "advantages/mean": -8.149072527885437e-10, |
| "advantages/snr": 2.970572550223795e-09, |
| "advantages/std": 0.2743266522884369, |
| "advantages/var": 0.07525511215578096, |
| "completions/clipped_ratio": -2.890625, |
| "epoch": 1.4584527220630372, |
| "grad_norm": 46.95429888318211, |
| "learning_rate": 1.4217141628553076e-06, |
| "loss": -1.0476, |
| "num_tokens": 42074555.0, |
| "residual_var": 0.05408961698412895, |
| "reward": 0.8203125, |
| "reward_std": 0.1417675018310547, |
| "rewards/drgrpo_math_reward/mean": 0.8203125, |
| "rewards/drgrpo_math_reward/std": 0.38467901945114136, |
| "rho2": 0.2812499403953552, |
| "step": 254 |
| }, |
| { |
| "advantages/mean": 4.656612873077393e-10, |
| "advantages/snr": 3.616233635463797e-09, |
| "advantages/std": 0.1287696808576584, |
| "advantages/var": 0.016581630708183193, |
| "completions/clipped_ratio": -2.96875, |
| "epoch": 1.4641833810888252, |
| "grad_norm": 22.725984301890314, |
| "learning_rate": 1.417640539972131e-06, |
| "loss": 0.0385, |
| "num_tokens": 42202249.0, |
| "residual_var": 0.015027116052806377, |
| "reward": 0.90625, |
| "reward_std": 0.036563027650117874, |
| "rewards/drgrpo_math_reward/mean": 0.90625, |
| "rewards/drgrpo_math_reward/std": 0.2920515835285187, |
| "rho2": 0.0937499850988388, |
| "step": 255 |
| }, |
| { |
| "advantages/mean": 6.984919309616089e-10, |
| "advantages/snr": 2.982532443970627e-09, |
| "advantages/std": 0.23419423401355743, |
| "advantages/var": 0.0548469392451969, |
| "completions/clipped_ratio": -2.90625, |
| "epoch": 1.4699140401146131, |
| "grad_norm": 44.22865864057851, |
| "learning_rate": 1.4135585049665206e-06, |
| "loss": -0.1639, |
| "num_tokens": 42353064.0, |
| "residual_var": 0.042849186807870865, |
| "reward": 0.765625, |
| "reward_std": 0.10771076381206512, |
| "rewards/drgrpo_math_reward/mean": 0.765625, |
| "rewards/drgrpo_math_reward/std": 0.42443734407424927, |
| "rho2": 0.21874994039535522, |
| "step": 256 |
| }, |
| { |
| "advantages/mean": -1.0477378964424133e-09, |
| "advantages/snr": 4.0488433642653295e-09, |
| "advantages/std": 0.25877460837364197, |
| "advantages/var": 0.06696429793893177, |
| "completions/clipped_ratio": -2.90625, |
| "epoch": 1.475644699140401, |
| "grad_norm": 48.71604701518212, |
| "learning_rate": 1.4094681400588907e-06, |
| "loss": -0.6364, |
| "num_tokens": 42500406.0, |
| "residual_var": 0.050223227590322495, |
| "reward": 0.84765625, |
| "reward_std": 0.12730401754379272, |
| "rewards/drgrpo_math_reward/mean": 0.84765625, |
| "rewards/drgrpo_math_reward/std": 0.3600577116012573, |
| "rho2": 0.24999995529651642, |
| "step": 257 |
| }, |
| { |
| "advantages/mean": 8.149072527885437e-10, |
| "advantages/snr": 3.1050551359151326e-09, |
| "advantages/std": 0.262445330619812, |
| "advantages/var": 0.06887755156414244, |
| "completions/clipped_ratio": -2.765625, |
| "epoch": 1.481375358166189, |
| "grad_norm": 62.11118022578198, |
| "learning_rate": 1.405369527637436e-06, |
| "loss": 0.3678, |
| "num_tokens": 42648437.0, |
| "residual_var": 0.049505751579999924, |
| "reward": 0.765625, |
| "reward_std": 0.1349327266216278, |
| "rewards/drgrpo_math_reward/mean": 0.765625, |
| "rewards/drgrpo_math_reward/std": 0.42443734407424927, |
| "rho2": 0.2812499403953552, |
| "step": 258 |
| }, |
| { |
| "advantages/mean": -2.3283064365386963e-10, |
| "advantages/snr": 8.523543981158328e-10, |
| "advantages/std": 0.27316176891326904, |
| "advantages/var": 0.0746173519958262, |
| "completions/clipped_ratio": -2.78125, |
| "epoch": 1.487106017191977, |
| "grad_norm": 67.65573090786955, |
| "learning_rate": 1.4012627502564742e-06, |
| "loss": -1.1556, |
| "num_tokens": 42789425.0, |
| "residual_var": 0.05129944160580635, |
| "reward": 0.76953125, |
| "reward_std": 0.14769119024276733, |
| "rewards/drgrpo_math_reward/mean": 0.76953125, |
| "rewards/drgrpo_math_reward/std": 0.4219578504562378, |
| "rho2": 0.3124999403953552, |
| "step": 259 |
| }, |
| { |
| "advantages/mean": -1.0477378964424133e-09, |
| "advantages/snr": 3.696071523020088e-09, |
| "advantages/std": 0.2834733724594116, |
| "advantages/var": 0.08035715289351231, |
| "completions/clipped_ratio": -2.859375, |
| "epoch": 1.492836676217765, |
| "grad_norm": 62.192696949199856, |
| "learning_rate": 1.3971478906347805e-06, |
| "loss": -0.9984, |
| "num_tokens": 42957416.0, |
| "residual_var": 0.05022323131561279, |
| "reward": 0.75, |
| "reward_std": 0.16637086868286133, |
| "rewards/drgrpo_math_reward/mean": 0.75, |
| "rewards/drgrpo_math_reward/std": 0.4338609278202057, |
| "rho2": 0.37499988079071045, |
| "step": 260 |
| }, |
| { |
| "advantages/mean": 6.984919309616089e-10, |
| "advantages/snr": 2.9318340292582657e-09, |
| "advantages/std": 0.23824401199817657, |
| "advantages/var": 0.056760209252987304, |
| "completions/clipped_ratio": -2.890625, |
| "epoch": 1.498567335243553, |
| "grad_norm": 33.8876635168633, |
| "learning_rate": 1.3930250316539235e-06, |
| "loss": -0.0272, |
| "num_tokens": 43096508.0, |
| "residual_var": 0.04434392228722572, |
| "reward": 0.87890625, |
| "reward_std": 0.115872323513031, |
| "rewards/drgrpo_math_reward/mean": 0.87890625, |
| "rewards/drgrpo_math_reward/std": 0.3268752694129944, |
| "rho2": 0.21874995529651642, |
| "step": 261 |
| }, |
| { |
| "advantages/mean": 1.1641532182693481e-09, |
| "advantages/snr": 3.714691764834188e-09, |
| "advantages/std": 0.3133915960788727, |
| "advantages/var": 0.09821429249286329, |
| "completions/clipped_ratio": -2.734375, |
| "epoch": 1.5042979942693409, |
| "grad_norm": 56.81531691666041, |
| "learning_rate": 1.3888942563565948e-06, |
| "loss": 0.0313, |
| "num_tokens": 43261550.0, |
| "residual_var": 0.05524555593729019, |
| "reward": 0.796875, |
| "reward_std": 0.19700777530670166, |
| "rewards/drgrpo_math_reward/mean": 0.796875, |
| "rewards/drgrpo_math_reward/std": 0.40311288833618164, |
| "rho2": 0.43749988079071045, |
| "step": 262 |
| }, |
| { |
| "advantages/mean": 5.820766091346741e-10, |
| "advantages/snr": 1.7677820123739347e-09, |
| "advantages/std": 0.3292694389820099, |
| "advantages/var": 0.10841836344752753, |
| "completions/clipped_ratio": -2.828125, |
| "epoch": 1.5100286532951288, |
| "grad_norm": 49.90809038754078, |
| "learning_rate": 1.384755647944936e-06, |
| "loss": -0.3604, |
| "num_tokens": 43411190.0, |
| "residual_var": 0.0643734261393547, |
| "reward": 0.8046875, |
| "reward_std": 0.20779037475585938, |
| "rewards/drgrpo_math_reward/mean": 0.8046875, |
| "rewards/drgrpo_math_reward/std": 0.39721766114234924, |
| "rho2": 0.40624991059303284, |
| "step": 263 |
| }, |
| { |
| "advantages/mean": 1.0477378964424133e-09, |
| "advantages/snr": 5.64584279348239e-09, |
| "advantages/std": 0.18557687103748322, |
| "advantages/var": 0.03443877506406268, |
| "completions/clipped_ratio": -2.84375, |
| "epoch": 1.5157593123209168, |
| "grad_norm": 72.71898678230077, |
| "learning_rate": 1.3806092897788643e-06, |
| "loss": -0.2531, |
| "num_tokens": 43553476.0, |
| "residual_var": 0.026905305683612823, |
| "reward": 0.859375, |
| "reward_std": 0.08075720071792603, |
| "rewards/drgrpo_math_reward/mean": 0.859375, |
| "rewards/drgrpo_math_reward/std": 0.3483152687549591, |
| "rho2": 0.21874995529651642, |
| "step": 264 |
| }, |
| { |
| "advantages/mean": 2.3283064365386963e-10, |
| "advantages/snr": 1.2910053593143641e-09, |
| "advantages/std": 0.18034830689430237, |
| "advantages/var": 0.03252551179964147, |
| "completions/clipped_ratio": -2.953125, |
| "epoch": 1.5214899713467047, |
| "grad_norm": 29.822109608396552, |
| "learning_rate": 1.3764552653743919e-06, |
| "loss": -0.3537, |
| "num_tokens": 43701138.0, |
| "residual_var": 0.027443410828709602, |
| "reward": 0.85546875, |
| "reward_std": 0.07194654643535614, |
| "rewards/drgrpo_math_reward/mean": 0.85546875, |
| "rewards/drgrpo_math_reward/std": 0.35231640934944153, |
| "rho2": 0.1562499701976776, |
| "step": 265 |
| }, |
| { |
| "advantages/mean": 2.3283064365386963e-10, |
| "advantages/snr": 1.1348570685336994e-09, |
| "advantages/std": 0.20516295731067657, |
| "advantages/var": 0.0420918390524625, |
| "completions/clipped_ratio": -2.859375, |
| "epoch": 1.5272206303724927, |
| "grad_norm": 59.49985738560282, |
| "learning_rate": 1.3722936584019451e-06, |
| "loss": -0.2906, |
| "num_tokens": 43851563.0, |
| "residual_var": 0.03551499918103218, |
| "reward": 0.828125, |
| "reward_std": 0.08219873160123825, |
| "rewards/drgrpo_math_reward/mean": 0.828125, |
| "rewards/drgrpo_math_reward/std": 0.3780108094215393, |
| "rho2": 0.1562499701976776, |
| "step": 266 |
| }, |
| { |
| "advantages/mean": 2.3283064365386963e-10, |
| "advantages/snr": 8.560204468385481e-10, |
| "advantages/std": 0.27199190855026245, |
| "advantages/var": 0.07397959831681433, |
| "completions/clipped_ratio": -2.890625, |
| "epoch": 1.5329512893982808, |
| "grad_norm": 46.34469421593609, |
| "learning_rate": 1.3681245526846781e-06, |
| "loss": -0.4533, |
| "num_tokens": 43987630.0, |
| "residual_var": 0.060108426958322525, |
| "reward": 0.8125, |
| "reward_std": 0.133487269282341, |
| "rewards/drgrpo_math_reward/mean": 0.8125, |
| "rewards/drgrpo_math_reward/std": 0.3910769522190094, |
| "rho2": 0.1874999701976776, |
| "step": 267 |
| }, |
| { |
| "advantages/mean": 6.984919309616089e-10, |
| "advantages/snr": 3.1726896131954143e-09, |
| "advantages/std": 0.220157653093338, |
| "advantages/var": 0.048469392215566565, |
| "completions/clipped_ratio": -2.8125, |
| "epoch": 1.5386819484240688, |
| "grad_norm": 39.76207826773463, |
| "learning_rate": 1.3639480321967845e-06, |
| "loss": 0.1273, |
| "num_tokens": 44132623.0, |
| "residual_var": 0.03938139230012894, |
| "reward": 0.8359375, |
| "reward_std": 0.09495474398136139, |
| "rewards/drgrpo_math_reward/mean": 0.8359375, |
| "rewards/drgrpo_math_reward/std": 0.3710577189922333, |
| "rho2": 0.1874999701976776, |
| "step": 268 |
| }, |
| { |
| "advantages/mean": 9.313225746154785e-10, |
| "advantages/snr": 4.287037337628827e-09, |
| "advantages/std": 0.21724152565002441, |
| "advantages/var": 0.047193880466750215, |
| "completions/clipped_ratio": -2.953125, |
| "epoch": 1.5444126074498568, |
| "grad_norm": 35.89001157588973, |
| "learning_rate": 1.359764181061807e-06, |
| "loss": -0.2152, |
| "num_tokens": 44277413.0, |
| "residual_var": 0.03834503889083862, |
| "reward": 0.84375, |
| "reward_std": 0.08679073303937912, |
| "rewards/drgrpo_math_reward/mean": 0.84375, |
| "rewards/drgrpo_math_reward/std": 0.3638034462928772, |
| "rho2": 0.18749994039535522, |
| "step": 269 |
| }, |
| { |
| "advantages/mean": -6.984919309616089e-10, |
| "advantages/snr": 2.6992289095102196e-09, |
| "advantages/std": 0.25877460837364197, |
| "advantages/var": 0.06696429793893177, |
| "completions/clipped_ratio": -2.84375, |
| "epoch": 1.5501432664756447, |
| "grad_norm": 53.21944819552593, |
| "learning_rate": 1.3555730835509419e-06, |
| "loss": -0.1514, |
| "num_tokens": 44429618.0, |
| "residual_var": 0.05022323504090309, |
| "reward": 0.79296875, |
| "reward_std": 0.12730401754379272, |
| "rewards/drgrpo_math_reward/mean": 0.79296875, |
| "rewards/drgrpo_math_reward/std": 0.40597182512283325, |
| "rho2": 0.24999995529651642, |
| "step": 270 |
| }, |
| { |
| "advantages/mean": -1.3969838619232178e-09, |
| "advantages/snr": 4.79665589833252e-09, |
| "advantages/std": 0.29124119877815247, |
| "advantages/var": 0.08482143586573532, |
| "completions/clipped_ratio": -2.84375, |
| "epoch": 1.5558739255014327, |
| "grad_norm": 61.803069074199236, |
| "learning_rate": 1.3513748240813427e-06, |
| "loss": 0.1271, |
| "num_tokens": 44583681.0, |
| "residual_var": 0.05301341041922569, |
| "reward": 0.79296875, |
| "reward_std": 0.16504409909248352, |
| "rewards/drgrpo_math_reward/mean": 0.79296875, |
| "rewards/drgrpo_math_reward/std": 0.40597182512283325, |
| "rho2": 0.3749999403953552, |
| "step": 271 |
| }, |
| { |
| "advantages/mean": 0.0, |
| "advantages/snr": 0.0, |
| "advantages/std": 0.21576867997646332, |
| "advantages/var": 0.04655612325878544, |
| "completions/clipped_ratio": -2.921875, |
| "epoch": 1.5616045845272206, |
| "grad_norm": 34.187999091685725, |
| "learning_rate": 1.3471694872144185e-06, |
| "loss": -0.2637, |
| "num_tokens": 44727554.0, |
| "residual_var": 0.03346222639083862, |
| "reward": 0.86328125, |
| "reward_std": 0.10627167671918869, |
| "rewards/drgrpo_math_reward/mean": 0.86328125, |
| "rewards/drgrpo_math_reward/std": 0.34422317147254944, |
| "rho2": 0.2812499403953552, |
| "step": 272 |
| }, |
| { |
| "advantages/mean": 4.656612873077393e-10, |
| "advantages/snr": 2.1730858844377734e-09, |
| "advantages/std": 0.2142857164144516, |
| "advantages/var": 0.04591836825925477, |
| "completions/clipped_ratio": -2.9375, |
| "epoch": 1.5673352435530086, |
| "grad_norm": 48.52107737550291, |
| "learning_rate": 1.3429571576541314e-06, |
| "loss": -1.3899, |
| "num_tokens": 44876865.0, |
| "residual_var": 0.035873737186193466, |
| "reward": 0.875, |
| "reward_std": 0.09271685779094696, |
| "rewards/drgrpo_math_reward/mean": 0.875, |
| "rewards/drgrpo_math_reward/std": 0.33136674761772156, |
| "rho2": 0.2187499701976776, |
| "step": 273 |
| }, |
| { |
| "advantages/mean": 0.0, |
| "advantages/snr": 0.0, |
| "advantages/std": 0.22728431224822998, |
| "advantages/var": 0.051658158594150905, |
| "completions/clipped_ratio": -2.8125, |
| "epoch": 1.5730659025787965, |
| "grad_norm": 54.31265956931611, |
| "learning_rate": 1.3387379202452915e-06, |
| "loss": -0.7988, |
| "num_tokens": 45039696.0, |
| "residual_var": 0.037129323929548264, |
| "reward": 0.80078125, |
| "reward_std": 0.11139655113220215, |
| "rewards/drgrpo_math_reward/mean": 0.80078125, |
| "rewards/drgrpo_math_reward/std": 0.40019527077674866, |
| "rho2": 0.2812499403953552, |
| "step": 274 |
| }, |
| { |
| "advantages/mean": 0.0, |
| "advantages/snr": 0.0, |
| "advantages/std": 0.24222607910633087, |
| "advantages/var": 0.05867347339922646, |
| "completions/clipped_ratio": -2.75, |
| "epoch": 1.5787965616045845, |
| "grad_norm": 48.08288648494765, |
| "learning_rate": 1.3345118599718454e-06, |
| "loss": -0.2865, |
| "num_tokens": 45201572.0, |
| "residual_var": 0.04217156767845154, |
| "reward": 0.7578125, |
| "reward_std": 0.11822889000177383, |
| "rewards/drgrpo_math_reward/mean": 0.7578125, |
| "rewards/drgrpo_math_reward/std": 0.4292463958263397, |
| "rho2": 0.2812499403953552, |
| "step": 275 |
| }, |
| { |
| "advantages/mean": -9.313225746154785e-10, |
| "advantages/snr": 2.6545718234372027e-09, |
| "advantages/std": 0.35083720088005066, |
| "advantages/var": 0.12308674152134902, |
| "completions/clipped_ratio": -2.859375, |
| "epoch": 1.5845272206303727, |
| "grad_norm": 55.04932654737737, |
| "learning_rate": 1.3302790619551672e-06, |
| "loss": -0.1071, |
| "num_tokens": 45370041.0, |
| "residual_var": 0.06923630088567734, |
| "reward": 0.80859375, |
| "reward_std": 0.2290886491537094, |
| "rewards/drgrpo_math_reward/mean": 0.80859375, |
| "rewards/drgrpo_math_reward/std": 0.39417871832847595, |
| "rho2": 0.4374999403953552, |
| "step": 276 |
| }, |
| { |
| "advantages/mean": 0.0, |
| "advantages/snr": 0.0, |
| "advantages/std": 0.22587698698043823, |
| "advantages/var": 0.05102041324736106, |
| "completions/clipped_ratio": -2.84375, |
| "epoch": 1.5902578796561606, |
| "grad_norm": 42.25464681628162, |
| "learning_rate": 1.3260396114523417e-06, |
| "loss": -0.4081, |
| "num_tokens": 45526026.0, |
| "residual_var": 0.04304848238825798, |
| "reward": 0.75, |
| "reward_std": 0.09073854237794876, |
| "rewards/drgrpo_math_reward/mean": 0.75, |
| "rewards/drgrpo_math_reward/std": 0.4338609278202057, |
| "rho2": 0.15624995529651642, |
| "step": 277 |
| }, |
| { |
| "advantages/mean": -8.149072527885437e-10, |
| "advantages/snr": 2.983240393405415e-09, |
| "advantages/std": 0.27316176891326904, |
| "advantages/var": 0.0746173519958262, |
| "completions/clipped_ratio": -2.890625, |
| "epoch": 1.5959885386819486, |
| "grad_norm": 46.40080500539711, |
| "learning_rate": 1.3217935938544495e-06, |
| "loss": -0.1385, |
| "num_tokens": 45672381.0, |
| "residual_var": 0.058294814079999924, |
| "reward": 0.83984375, |
| "reward_std": 0.1346667855978012, |
| "rewards/drgrpo_math_reward/mean": 0.83984375, |
| "rewards/drgrpo_math_reward/std": 0.36746934056282043, |
| "rho2": 0.2187499701976776, |
| "step": 278 |
| }, |
| { |
| "advantages/mean": -6.984919309616089e-10, |
| "advantages/snr": 2.504116276874765e-09, |
| "advantages/std": 0.2789374887943268, |
| "advantages/var": 0.07780612265488518, |
| "completions/clipped_ratio": -2.859375, |
| "epoch": 1.6017191977077365, |
| "grad_norm": 52.843715111367665, |
| "learning_rate": 1.3175410946848444e-06, |
| "loss": -0.3251, |
| "num_tokens": 45821622.0, |
| "residual_var": 0.046197403222322464, |
| "reward": 0.8203125, |
| "reward_std": 0.1641329824924469, |
| "rewards/drgrpo_math_reward/mean": 0.8203125, |
| "rewards/drgrpo_math_reward/std": 0.38467901945114136, |
| "rho2": 0.40624991059303284, |
| "step": 279 |
| }, |
| { |
| "advantages/mean": -5.820766091346741e-10, |
| "advantages/snr": 2.3773285286252477e-09, |
| "advantages/std": 0.24484482407569885, |
| "advantages/var": 0.05994898787665992, |
| "completions/clipped_ratio": -2.875, |
| "epoch": 1.6074498567335245, |
| "grad_norm": 46.228824312012414, |
| "learning_rate": 1.3132821995974326e-06, |
| "loss": 0.2804, |
| "num_tokens": 45973378.0, |
| "residual_var": 0.048708558082580566, |
| "reward": 0.875, |
| "reward_std": 0.10691440105438232, |
| "rewards/drgrpo_math_reward/mean": 0.875, |
| "rewards/drgrpo_math_reward/std": 0.33136674761772156, |
| "rho2": 0.1874999701976776, |
| "step": 280 |
| }, |
| { |
| "advantages/mean": -2.3283064365386963e-10, |
| "advantages/snr": 8.523543981158328e-10, |
| "advantages/std": 0.27316176891326904, |
| "advantages/var": 0.0746173519958262, |
| "completions/clipped_ratio": -2.875, |
| "epoch": 1.6131805157593124, |
| "grad_norm": 55.11002748041064, |
| "learning_rate": 1.3090169943749473e-06, |
| "loss": -0.463, |
| "num_tokens": 46117341.0, |
| "residual_var": 0.05129943788051605, |
| "reward": 0.84765625, |
| "reward_std": 0.14123709499835968, |
| "rewards/drgrpo_math_reward/mean": 0.84765625, |
| "rewards/drgrpo_math_reward/std": 0.3600577116012573, |
| "rho2": 0.3124999403953552, |
| "step": 281 |
| }, |
| { |
| "advantages/mean": 4.656612873077393e-10, |
| "advantages/snr": 1.9329581251860285e-09, |
| "advantages/std": 0.2409060299396515, |
| "advantages/var": 0.05803571526128426, |
| "completions/clipped_ratio": -2.84375, |
| "epoch": 1.6189111747851004, |
| "grad_norm": 46.93471833357915, |
| "learning_rate": 1.3047455649272206e-06, |
| "loss": -0.504, |
| "num_tokens": 46278368.0, |
| "residual_var": 0.0453404076397419, |
| "reward": 0.82421875, |
| "reward_std": 0.11112816631793976, |
| "rewards/drgrpo_math_reward/mean": 0.82421875, |
| "rewards/drgrpo_math_reward/std": 0.3813795745372772, |
| "rho2": 0.2187499701976776, |
| "step": 282 |
| }, |
| { |
| "advantages/mean": 1.862645149230957e-09, |
| "advantages/snr": 7.303029407665864e-09, |
| "advantages/std": 0.25505101680755615, |
| "advantages/var": 0.06505102117456829, |
| "completions/clipped_ratio": -2.9375, |
| "epoch": 1.6246418338108883, |
| "grad_norm": 42.613912161685086, |
| "learning_rate": 1.3004679972894518e-06, |
| "loss": -0.4131, |
| "num_tokens": 46421038.0, |
| "residual_var": 0.050821125507354736, |
| "reward": 0.796875, |
| "reward_std": 0.1244145929813385, |
| "rewards/drgrpo_math_reward/mean": 0.796875, |
| "rewards/drgrpo_math_reward/std": 0.40311288833618164, |
| "rho2": 0.21874995529651642, |
| "step": 283 |
| }, |
| { |
| "advantages/mean": 1.5133991837501526e-09, |
| "advantages/snr": 5.766530966699532e-09, |
| "advantages/std": 0.262445330619812, |
| "advantages/var": 0.06887755156414244, |
| "completions/clipped_ratio": -2.8125, |
| "epoch": 1.6303724928366763, |
| "grad_norm": 55.21201691395928, |
| "learning_rate": 1.2961843776204776e-06, |
| "loss": -0.2435, |
| "num_tokens": 46580804.0, |
| "residual_var": 0.045200902968645096, |
| "reward": 0.8125, |
| "reward_std": 0.14203590154647827, |
| "rewards/drgrpo_math_reward/mean": 0.8125, |
| "rewards/drgrpo_math_reward/std": 0.3910769522190094, |
| "rho2": 0.3437499403953552, |
| "step": 284 |
| }, |
| { |
| "advantages/mean": -1.3969838619232178e-09, |
| "advantages/snr": 5.675479987696761e-09, |
| "advantages/std": 0.24614372849464417, |
| "advantages/var": 0.0605867350772451, |
| "completions/clipped_ratio": -2.875, |
| "epoch": 1.6361031518624642, |
| "grad_norm": 50.79647099010151, |
| "learning_rate": 1.2918947922010336e-06, |
| "loss": -0.1687, |
| "num_tokens": 46743701.0, |
| "residual_var": 0.045440055429935455, |
| "reward": 0.83984375, |
| "reward_std": 0.12046923488378525, |
| "rewards/drgrpo_math_reward/mean": 0.83984375, |
| "rewards/drgrpo_math_reward/std": 0.36746934056282043, |
| "rho2": 0.2499999701976776, |
| "step": 285 |
| }, |
| { |
| "advantages/mean": 6.984919309616089e-10, |
| "advantages/snr": 2.9155005520288713e-09, |
| "advantages/std": 0.2395787239074707, |
| "advantages/var": 0.057397964949132074, |
| "completions/clipped_ratio": -2.765625, |
| "epoch": 1.6418338108882522, |
| "grad_norm": 74.3888925274376, |
| "learning_rate": 1.2875993274320173e-06, |
| "loss": -0.9076, |
| "num_tokens": 46908808.0, |
| "residual_var": 0.04663585498929024, |
| "reward": 0.828125, |
| "reward_std": 0.10994865000247955, |
| "rewards/drgrpo_math_reward/mean": 0.828125, |
| "rewards/drgrpo_math_reward/std": 0.3780108094215393, |
| "rho2": 0.18749995529651642, |
| "step": 286 |
| }, |
| { |
| "advantages/mean": -2.3283064365386963e-10, |
| "advantages/snr": 1.126356132424828e-09, |
| "advantages/std": 0.20671138167381287, |
| "advantages/var": 0.04272959531349674, |
| "completions/clipped_ratio": -2.828125, |
| "epoch": 1.6475644699140402, |
| "grad_norm": 49.41706074564681, |
| "learning_rate": 1.2832980698327494e-06, |
| "loss": -0.1523, |
| "num_tokens": 47070033.0, |
| "residual_var": 0.03338250517845154, |
| "reward": 0.74609375, |
| "reward_std": 0.08929947018623352, |
| "rewards/drgrpo_math_reward/mean": 0.74609375, |
| "rewards/drgrpo_math_reward/std": 0.4360972046852112, |
| "rho2": 0.21874995529651642, |
| "step": 287 |
| }, |
| { |
| "advantages/mean": 0.0, |
| "advantages/snr": 0.0, |
| "advantages/std": 0.20044593513011932, |
| "advantages/var": 0.040178572910188004, |
| "completions/clipped_ratio": -2.984375, |
| "epoch": 1.653295128939828, |
| "grad_norm": 45.268262312968005, |
| "learning_rate": 1.2789911060392294e-06, |
| "loss": -0.4863, |
| "num_tokens": 47209548.0, |
| "residual_var": 0.032645102590322495, |
| "reward": 0.89453125, |
| "reward_std": 0.08641248196363449, |
| "rewards/drgrpo_math_reward/mean": 0.89453125, |
| "rewards/drgrpo_math_reward/std": 0.3077581524848938, |
| "rho2": 0.1874999701976776, |
| "step": 288 |
| }, |
| { |
| "advantages/mean": 0.0, |
| "advantages/snr": 0.0, |
| "advantages/std": 0.2409060299396515, |
| "advantages/var": 0.05803571526128426, |
| "completions/clipped_ratio": -2.875, |
| "epoch": 1.659025787965616, |
| "grad_norm": 72.12409051313657, |
| "learning_rate": 1.2746785228023901e-06, |
| "loss": -0.2533, |
| "num_tokens": 47355458.0, |
| "residual_var": 0.04534041881561279, |
| "reward": 0.84765625, |
| "reward_std": 0.11112815141677856, |
| "rewards/drgrpo_math_reward/mean": 0.84765625, |
| "rewards/drgrpo_math_reward/std": 0.3600577116012573, |
| "rho2": 0.21874995529651642, |
| "step": 289 |
| }, |
| { |
| "advantages/mean": -6.984919309616089e-10, |
| "advantages/snr": 2.4073954360621055e-09, |
| "advantages/std": 0.2901442348957062, |
| "advantages/var": 0.08418367704321472, |
| "completions/clipped_ratio": -2.828125, |
| "epoch": 1.664756446991404, |
| "grad_norm": 81.65094517437481, |
| "learning_rate": 1.2703604069863528e-06, |
| "loss": -0.5033, |
| "num_tokens": 47508301.0, |
| "residual_var": 0.06313777714967728, |
| "reward": 0.6796875, |
| "reward_std": 0.15729427337646484, |
| "rewards/drgrpo_math_reward/mean": 0.6796875, |
| "rewards/drgrpo_math_reward/std": 0.4675106406211853, |
| "rho2": 0.24999995529651642, |
| "step": 290 |
| }, |
| { |
| "advantages/mean": -2.3283064365386963e-10, |
| "advantages/snr": 9.828150668068317e-10, |
| "advantages/std": 0.23690177500247955, |
| "advantages/var": 0.056122450999325446, |
| "completions/clipped_ratio": -2.84375, |
| "epoch": 1.670487106017192, |
| "grad_norm": 40.0774805833318, |
| "learning_rate": 1.266036845566675e-06, |
| "loss": -0.1413, |
| "num_tokens": 47662898.0, |
| "residual_var": 0.04384567216038704, |
| "reward": 0.78125, |
| "reward_std": 0.10942068696022034, |
| "rewards/drgrpo_math_reward/mean": 0.78125, |
| "rewards/drgrpo_math_reward/std": 0.41420844197273254, |
| "rho2": 0.2187499701976776, |
| "step": 291 |
| }, |
| { |
| "advantages/mean": 1.1641532182693481e-10, |
| "advantages/snr": 5.807816254470197e-10, |
| "advantages/std": 0.20044593513011932, |
| "advantages/var": 0.040178572910188004, |
| "completions/clipped_ratio": -2.96875, |
| "epoch": 1.67621776504298, |
| "grad_norm": 61.83270346354945, |
| "learning_rate": 1.2617079256286e-06, |
| "loss": 0.3358, |
| "num_tokens": 47801263.0, |
| "residual_var": 0.032645102590322495, |
| "reward": 0.83984375, |
| "reward_std": 0.07995839416980743, |
| "rewards/drgrpo_math_reward/mean": 0.83984375, |
| "rewards/drgrpo_math_reward/std": 0.36746934056282043, |
| "rho2": 0.1874999701976776, |
| "step": 292 |
| }, |
| { |
| "advantages/mean": -9.313225746154785e-10, |
| "advantages/snr": 4.439648622792677e-09, |
| "advantages/std": 0.2097739279270172, |
| "advantages/var": 0.04400510083792941, |
| "completions/clipped_ratio": -2.890625, |
| "epoch": 1.6819484240687679, |
| "grad_norm": 115.93567638341011, |
| "learning_rate": 1.2573737343653023e-06, |
| "loss": -0.8953, |
| "num_tokens": 47949713.0, |
| "residual_var": 0.03300383687019348, |
| "reward": 0.86328125, |
| "reward_std": 0.0969306081533432, |
| "rewards/drgrpo_math_reward/mean": 0.86328125, |
| "rewards/drgrpo_math_reward/std": 0.34422317147254944, |
| "rho2": 0.24999994039535522, |
| "step": 293 |
| }, |
| { |
| "advantages/mean": 4.656612873077393e-10, |
| "advantages/snr": 1.9768946035727382e-09, |
| "advantages/std": 0.2355518937110901, |
| "advantages/var": 0.05548469463088068, |
| "completions/clipped_ratio": -2.9375, |
| "epoch": 1.6876790830945558, |
| "grad_norm": 62.65175714195031, |
| "learning_rate": 1.2530343590761317e-06, |
| "loss": -0.4164, |
| "num_tokens": 48098928.0, |
| "residual_var": 0.04508132487535477, |
| "reward": 0.83984375, |
| "reward_std": 0.10824117064476013, |
| "rewards/drgrpo_math_reward/mean": 0.83984375, |
| "rewards/drgrpo_math_reward/std": 0.36746934056282043, |
| "rho2": 0.1874999701976776, |
| "step": 294 |
| }, |
| { |
| "advantages/mean": -8.149072527885437e-10, |
| "advantages/snr": 2.8301503045057226e-09, |
| "advantages/std": 0.28793779015541077, |
| "advantages/var": 0.08290817099958137, |
| "completions/clipped_ratio": -2.953125, |
| "epoch": 1.6934097421203438, |
| "grad_norm": 75.52026681703008, |
| "learning_rate": 1.2486898871648551e-06, |
| "loss": -0.2693, |
| "num_tokens": 48232546.0, |
| "residual_var": 0.05699937418103218, |
| "reward": 0.8671875, |
| "reward_std": 0.15623344480991364, |
| "rewards/drgrpo_math_reward/mean": 0.8671875, |
| "rewards/drgrpo_math_reward/std": 0.3400367796421051, |
| "rho2": 0.3124999403953552, |
| "step": 295 |
| }, |
| { |
| "advantages/mean": -9.313225746154785e-10, |
| "advantages/snr": 3.6695466564034195e-09, |
| "advantages/std": 0.253797709941864, |
| "advantages/var": 0.06441327757173454, |
| "completions/clipped_ratio": -2.859375, |
| "epoch": 1.6991404011461317, |
| "grad_norm": 62.035253493008476, |
| "learning_rate": 1.244340406137894e-06, |
| "loss": -0.5796, |
| "num_tokens": 48387262.0, |
| "residual_var": 0.04428413510322571, |
| "reward": 0.80859375, |
| "reward_std": 0.1374414563179016, |
| "rewards/drgrpo_math_reward/mean": 0.80859375, |
| "rewards/drgrpo_math_reward/std": 0.39417871832847595, |
| "rho2": 0.3124999403953552, |
| "step": 296 |
| }, |
| { |
| "advantages/mean": 1.1641532182693481e-09, |
| "advantages/snr": 4.806060414732269e-09, |
| "advantages/std": 0.24222607910633087, |
| "advantages/var": 0.05867347339922646, |
| "completions/clipped_ratio": -2.90625, |
| "epoch": 1.7048710601719197, |
| "grad_norm": 46.16600507678362, |
| "learning_rate": 1.2399860036025658e-06, |
| "loss": 0.1704, |
| "num_tokens": 48517720.0, |
| "residual_var": 0.04217156767845154, |
| "reward": 0.8671875, |
| "reward_std": 0.12468297779560089, |
| "rewards/drgrpo_math_reward/mean": 0.8671875, |
| "rewards/drgrpo_math_reward/std": 0.3400367796421051, |
| "rho2": 0.2812499403953552, |
| "step": 297 |
| }, |
| { |
| "advantages/mean": -1.6298145055770874e-09, |
| "advantages/snr": 8.702216012765879e-09, |
| "advantages/std": 0.18728728592395782, |
| "advantages/var": 0.03507652746876233, |
| "completions/clipped_ratio": -2.84375, |
| "epoch": 1.7106017191977076, |
| "grad_norm": 43.965060130880126, |
| "learning_rate": 1.235626767265316e-06, |
| "loss": -0.1406, |
| "num_tokens": 48671527.0, |
| "residual_var": 0.028499694541096687, |
| "reward": 0.87109375, |
| "reward_std": 0.07483352720737457, |
| "rewards/drgrpo_math_reward/mean": 0.87109375, |
| "rewards/drgrpo_math_reward/std": 0.33575257658958435, |
| "rho2": 0.18749994039535522, |
| "step": 298 |
| }, |
| { |
| "advantages/mean": -1.6298145055770874e-09, |
| "advantages/snr": 7.769385089887185e-09, |
| "advantages/std": 0.2097739279270172, |
| "advantages/var": 0.04400510083792941, |
| "completions/clipped_ratio": -2.90625, |
| "epoch": 1.7163323782234956, |
| "grad_norm": 47.8375889530657, |
| "learning_rate": 1.2312627849299522e-06, |
| "loss": 0.0195, |
| "num_tokens": 48805831.0, |
| "residual_var": 0.03712931647896767, |
| "reward": 0.85546875, |
| "reward_std": 0.08390620350837708, |
| "rewards/drgrpo_math_reward/mean": 0.85546875, |
| "rewards/drgrpo_math_reward/std": 0.35231640934944153, |
| "rho2": 0.1562499701976776, |
| "step": 299 |
| }, |
| { |
| "advantages/mean": -2.3283064365386963e-10, |
| "advantages/snr": 9.36110830002126e-10, |
| "advantages/std": 0.24872122704982758, |
| "advantages/var": 0.06186224878517188, |
| "completions/clipped_ratio": -2.75, |
| "epoch": 1.7220630372492836, |
| "grad_norm": 99.10893071959802, |
| "learning_rate": 1.2268941444958764e-06, |
| "loss": -0.8685, |
| "num_tokens": 48949622.0, |
| "residual_var": 0.04446350038051605, |
| "reward": 0.79296875, |
| "reward_std": 0.12164628505706787, |
| "rewards/drgrpo_math_reward/mean": 0.79296875, |
| "rewards/drgrpo_math_reward/std": 0.40597182512283325, |
| "rho2": 0.2812499403953552, |
| "step": 300 |
| }, |
| { |
| "advantages/mean": 1.3969838619232178e-09, |
| "advantages/snr": 4.59388789526886e-09, |
| "advantages/std": 0.30409619212150574, |
| "advantages/var": 0.09247449406279973, |
| "completions/clipped_ratio": -2.9375, |
| "epoch": 1.7277936962750715, |
| "grad_norm": 44.15970985324081, |
| "learning_rate": 1.2225209339563143e-06, |
| "loss": -0.5217, |
| "num_tokens": 49094796.0, |
| "residual_var": 0.057796575129032135, |
| "reward": 0.85546875, |
| "reward_std": 0.17832809686660767, |
| "rewards/drgrpo_math_reward/mean": 0.85546875, |
| "rewards/drgrpo_math_reward/std": 0.35231640934944153, |
| "rho2": 0.37499991059303284, |
| "step": 301 |
| }, |
| { |
| "advantages/mean": 1.3969838619232178e-09, |
| "advantages/snr": 5.645842642869884e-09, |
| "advantages/std": 0.24743583798408508, |
| "advantages/var": 0.0612244939188864, |
| "completions/clipped_ratio": -2.875, |
| "epoch": 1.7335243553008595, |
| "grad_norm": 50.98950387379492, |
| "learning_rate": 1.2181432413965426e-06, |
| "loss": -0.2709, |
| "num_tokens": 49228796.0, |
| "residual_var": 0.045918382704257965, |
| "reward": 0.890625, |
| "reward_std": 0.12046678364276886, |
| "rewards/drgrpo_math_reward/mean": 0.890625, |
| "rewards/drgrpo_math_reward/std": 0.31272050738334656, |
| "rho2": 0.24999994039535522, |
| "step": 302 |
| }, |
| { |
| "advantages/mean": 6.984919309616089e-10, |
| "advantages/snr": 2.4838397187113485e-09, |
| "advantages/std": 0.28121456503868103, |
| "advantages/var": 0.07908163158989456, |
| "completions/clipped_ratio": -2.859375, |
| "epoch": 1.7392550143266474, |
| "grad_norm": 67.15690274376962, |
| "learning_rate": 1.2137611549921145e-06, |
| "loss": -0.1553, |
| "num_tokens": 49380594.0, |
| "residual_var": 0.051897335797548294, |
| "reward": 0.8046875, |
| "reward_std": 0.15873973071575165, |
| "rewards/drgrpo_math_reward/mean": 0.8046875, |
| "rewards/drgrpo_math_reward/std": 0.39721766114234924, |
| "rho2": 0.3437499403953552, |
| "step": 303 |
| }, |
| { |
| "advantages/mean": -2.3283064365386963e-10, |
| "advantages/snr": 9.084364294103083e-10, |
| "advantages/std": 0.2562982141971588, |
| "advantages/var": 0.0656887746006527, |
| "completions/clipped_ratio": -2.921875, |
| "epoch": 1.7449856733524354, |
| "grad_norm": 49.955242749828415, |
| "learning_rate": 1.2093747630070842e-06, |
| "loss": -0.7495, |
| "num_tokens": 49529014.0, |
| "residual_var": 0.04721382260322571, |
| "reward": 0.81640625, |
| "reward_std": 0.13151532411575317, |
| "rewards/drgrpo_math_reward/mean": 0.81640625, |
| "rewards/drgrpo_math_reward/std": 0.387910932302475, |
| "rho2": 0.2812499403953552, |
| "step": 304 |
| }, |
| { |
| "advantages/mean": -1.6298145055770874e-09, |
| "advantages/snr": 6.0181390962042404e-09, |
| "advantages/std": 0.27081701159477234, |
| "advantages/var": 0.07334185376912306, |
| "completions/clipped_ratio": -2.671875, |
| "epoch": 1.7507163323782235, |
| "grad_norm": 60.730178677804794, |
| "learning_rate": 1.2049841537922305e-06, |
| "loss": 0.0502, |
| "num_tokens": 49679831.0, |
| "residual_var": 0.05271446332335472, |
| "reward": 0.76171875, |
| "reward_std": 0.1465141326189041, |
| "rewards/drgrpo_math_reward/mean": 0.76171875, |
| "rewards/drgrpo_math_reward/std": 0.4268665909767151, |
| "rho2": 0.2812499403953552, |
| "step": 305 |
| }, |
| { |
| "advantages/mean": -4.656612873077393e-10, |
| "advantages/snr": 1.7194683132012115e-09, |
| "advantages/std": 0.27081701159477234, |
| "advantages/var": 0.07334185376912306, |
| "completions/clipped_ratio": -2.90625, |
| "epoch": 1.7564469914040115, |
| "grad_norm": 59.49589899349533, |
| "learning_rate": 1.2005894157832728e-06, |
| "loss": -0.8913, |
| "num_tokens": 49828981.0, |
| "residual_var": 0.048130594193935394, |
| "reward": 0.85546875, |
| "reward_std": 0.14716322720050812, |
| "rewards/drgrpo_math_reward/mean": 0.85546875, |
| "rewards/drgrpo_math_reward/std": 0.35231640934944153, |
| "rho2": 0.3437499403953552, |
| "step": 306 |
| }, |
| { |
| "advantages/mean": -2.3283064365386963e-10, |
| "advantages/snr": 9.219622438173128e-10, |
| "advantages/std": 0.25253814458847046, |
| "advantages/var": 0.06377551447218721, |
| "completions/clipped_ratio": -2.828125, |
| "epoch": 1.7621776504297995, |
| "grad_norm": 55.76930921253595, |
| "learning_rate": 1.196190637499095e-06, |
| "loss": -1.2713, |
| "num_tokens": 49971360.0, |
| "residual_var": 0.04583866149187088, |
| "reward": 0.890625, |
| "reward_std": 0.12980785965919495, |
| "rewards/drgrpo_math_reward/mean": 0.890625, |
| "rewards/drgrpo_math_reward/std": 0.31272050738334656, |
| "rho2": 0.2812499403953552, |
| "step": 307 |
| }, |
| { |
| "advantages/mean": -5.820766091346741e-10, |
| "advantages/snr": 1.8882523284691135e-09, |
| "advantages/std": 0.30826207995414734, |
| "advantages/var": 0.09502550993765713, |
| "completions/clipped_ratio": -2.890625, |
| "epoch": 1.7679083094555874, |
| "grad_norm": 55.11636955178904, |
| "learning_rate": 1.19178790753996e-06, |
| "loss": -2.6177, |
| "num_tokens": 50126326.0, |
| "residual_var": 0.05939095839858055, |
| "reward": 0.81640625, |
| "reward_std": 0.18820202350616455, |
| "rewards/drgrpo_math_reward/mean": 0.81640625, |
| "rewards/drgrpo_math_reward/std": 0.387910932302475, |
| "rho2": 0.3749999403953552, |
| "step": 308 |
| }, |
| { |
| "advantages/mean": 1.5133991837501526e-09, |
| "advantages/snr": 6.352307063392909e-09, |
| "advantages/std": 0.23824401199817657, |
| "advantages/var": 0.056760209252987304, |
| "completions/clipped_ratio": -2.921875, |
| "epoch": 1.7736389684813754, |
| "grad_norm": 76.34588377569375, |
| "learning_rate": 1.1873813145857248e-06, |
| "loss": -0.0001, |
| "num_tokens": 50260527.0, |
| "residual_var": 0.04079640656709671, |
| "reward": 0.91015625, |
| "reward_std": 0.1165214255452156, |
| "rewards/drgrpo_math_reward/mean": 0.91015625, |
| "rewards/drgrpo_math_reward/std": 0.2865179479122162, |
| "rho2": 0.2812499403953552, |
| "step": 309 |
| }, |
| { |
| "advantages/mean": 1.3969838619232178e-09, |
| "advantages/snr": 5.8310011040577425e-09, |
| "advantages/std": 0.2395787239074707, |
| "advantages/var": 0.057397964949132074, |
| "completions/clipped_ratio": -2.71875, |
| "epoch": 1.7793696275071633, |
| "grad_norm": 49.25562543554959, |
| "learning_rate": 1.1829709473940547e-06, |
| "loss": -0.1656, |
| "num_tokens": 50416781.0, |
| "residual_var": 0.04663585498929024, |
| "reward": 0.734375, |
| "reward_std": 0.1034945547580719, |
| "rewards/drgrpo_math_reward/mean": 0.734375, |
| "rewards/drgrpo_math_reward/std": 0.4425306022167206, |
| "rho2": 0.18749994039535522, |
| "step": 310 |
| }, |
| { |
| "advantages/mean": -6.984919309616089e-10, |
| "advantages/snr": 2.1213382228454678e-09, |
| "advantages/std": 0.3292694687843323, |
| "advantages/var": 0.10841838307351637, |
| "completions/clipped_ratio": -2.53125, |
| "epoch": 1.7851002865329513, |
| "grad_norm": 145.86003387133044, |
| "learning_rate": 1.1785568947986366e-06, |
| "loss": -0.9573, |
| "num_tokens": 50579721.0, |
| "residual_var": 0.06437341868877411, |
| "reward": 0.765625, |
| "reward_std": 0.20779037475585938, |
| "rewards/drgrpo_math_reward/mean": 0.765625, |
| "rewards/drgrpo_math_reward/std": 0.42443734407424927, |
| "rho2": 0.4062499403953552, |
| "step": 311 |
| }, |
| { |
| "advantages/mean": 0.0, |
| "advantages/snr": 0.0, |
| "advantages/std": 0.29450756311416626, |
| "advantages/var": 0.08673470473144462, |
| "completions/clipped_ratio": -2.78125, |
| "epoch": 1.7908309455587392, |
| "grad_norm": 56.44350194766821, |
| "learning_rate": 1.1741392457073886e-06, |
| "loss": -0.5022, |
| "num_tokens": 50736415.0, |
| "residual_var": 0.05963011458516121, |
| "reward": 0.7421875, |
| "reward_std": 0.15964838862419128, |
| "rewards/drgrpo_math_reward/mean": 0.7421875, |
| "rewards/drgrpo_math_reward/std": 0.4382871091365814, |
| "rho2": 0.3124999403953552, |
| "step": 312 |
| }, |
| { |
| "advantages/mean": 1.1641532182693481e-10, |
| "advantages/snr": 4.754657346617089e-10, |
| "advantages/std": 0.24484480917453766, |
| "advantages/var": 0.05994898057971576, |
| "completions/clipped_ratio": -2.984375, |
| "epoch": 1.7965616045845272, |
| "grad_norm": 51.79635444923773, |
| "learning_rate": 1.1697180891006689e-06, |
| "loss": -0.3783, |
| "num_tokens": 50883741.0, |
| "residual_var": 0.04683515429496765, |
| "reward": 0.8359375, |
| "reward_std": 0.11928971856832504, |
| "rewards/drgrpo_math_reward/mean": 0.8359375, |
| "rewards/drgrpo_math_reward/std": 0.3710577189922333, |
| "rho2": 0.21874995529651642, |
| "step": 313 |
| }, |
| { |
| "advantages/mean": 1.862645149230957e-09, |
| "advantages/snr": 6.597025067366786e-09, |
| "advantages/std": 0.2823462188243866, |
| "advantages/var": 0.0797193872844284, |
| "completions/clipped_ratio": -2.765625, |
| "epoch": 1.8022922636103151, |
| "grad_norm": 68.241969257841, |
| "learning_rate": 1.165293514029485e-06, |
| "loss": -0.6181, |
| "num_tokens": 51050995.0, |
| "residual_var": 0.05978954955935478, |
| "reward": 0.73046875, |
| "reward_std": 0.14571286737918854, |
| "rewards/drgrpo_math_reward/mean": 0.73046875, |
| "rewards/drgrpo_math_reward/std": 0.44458550214767456, |
| "rho2": 0.24999994039535522, |
| "step": 314 |
| }, |
| { |
| "advantages/mean": 0.0, |
| "advantages/snr": 0.0, |
| "advantages/std": 0.21128857135772705, |
| "advantages/var": 0.044642860386389316, |
| "completions/clipped_ratio": -2.953125, |
| "epoch": 1.8080229226361033, |
| "grad_norm": 48.580294172921796, |
| "learning_rate": 1.1608656096136983e-06, |
| "loss": -0.3171, |
| "num_tokens": 51192523.0, |
| "residual_var": 0.03487724810838699, |
| "reward": 0.8984375, |
| "reward_std": 0.0974610298871994, |
| "rewards/drgrpo_math_reward/mean": 0.8984375, |
| "rewards/drgrpo_math_reward/std": 0.3026638329029083, |
| "rho2": 0.21874995529651642, |
| "step": 315 |
| }, |
| { |
| "advantages/mean": -2.3283064365386963e-10, |
| "advantages/snr": 9.173867718254488e-10, |
| "advantages/std": 0.2537976801395416, |
| "advantages/var": 0.06441326244421308, |
| "completions/clipped_ratio": -2.84375, |
| "epoch": 1.8137535816618913, |
| "grad_norm": 47.621003392183994, |
| "learning_rate": 1.156434465040231e-06, |
| "loss": -0.1354, |
| "num_tokens": 51346027.0, |
| "residual_var": 0.04428413510322571, |
| "reward": 0.69921875, |
| "reward_std": 0.13098736107349396, |
| "rewards/drgrpo_math_reward/mean": 0.69921875, |
| "rewards/drgrpo_math_reward/std": 0.45949608087539673, |
| "rho2": 0.3124999403953552, |
| "step": 316 |
| }, |
| { |
| "advantages/mean": -2.3283064365386963e-10, |
| "advantages/snr": 9.941774813235425e-10, |
| "advantages/std": 0.23419423401355743, |
| "advantages/var": 0.0548469392451969, |
| "completions/clipped_ratio": -2.921875, |
| "epoch": 1.8194842406876792, |
| "grad_norm": 57.02857353398555, |
| "learning_rate": 1.1520001695612673e-06, |
| "loss": -0.8921, |
| "num_tokens": 51477023.0, |
| "residual_var": 0.04284917935729027, |
| "reward": 0.890625, |
| "reward_std": 0.10125666856765747, |
| "rewards/drgrpo_math_reward/mean": 0.890625, |
| "rewards/drgrpo_math_reward/std": 0.31272050738334656, |
| "rho2": 0.21874994039535522, |
| "step": 317 |
| }, |
| { |
| "advantages/mean": -6.984919309616089e-10, |
| "advantages/snr": 2.6992292203726215e-09, |
| "advantages/std": 0.2587745785713196, |
| "advantages/var": 0.06696428251476405, |
| "completions/clipped_ratio": -2.875, |
| "epoch": 1.8252148997134672, |
| "grad_norm": 59.283326258454025, |
| "learning_rate": 1.1475628124924578e-06, |
| "loss": -0.4742, |
| "num_tokens": 51629827.0, |
| "residual_var": 0.048130594193935394, |
| "reward": 0.83203125, |
| "reward_std": 0.1332252472639084, |
| "rewards/drgrpo_math_reward/mean": 0.83203125, |
| "rewards/drgrpo_math_reward/std": 0.3745708465576172, |
| "rho2": 0.2812499403953552, |
| "step": 318 |
| }, |
| { |
| "advantages/mean": 1.280568540096283e-09, |
| "advantages/snr": 4.300990042955823e-09, |
| "advantages/std": 0.29773807525634766, |
| "advantages/var": 0.08864796145735454, |
| "completions/clipped_ratio": -2.75, |
| "epoch": 1.8309455587392551, |
| "grad_norm": 74.49197012171977, |
| "learning_rate": 1.1431224832111194e-06, |
| "loss": -0.9767, |
| "num_tokens": 51782044.0, |
| "residual_var": 0.05540499463677406, |
| "reward": 0.76953125, |
| "reward_std": 0.17491313815116882, |
| "rewards/drgrpo_math_reward/mean": 0.76953125, |
| "rewards/drgrpo_math_reward/std": 0.4219578504562378, |
| "rho2": 0.37499991059303284, |
| "step": 319 |
| }, |
| { |
| "advantages/mean": -2.3283064365386963e-10, |
| "advantages/snr": 8.830796510959388e-10, |
| "advantages/std": 0.2636575698852539, |
| "advantages/var": 0.06951531415779755, |
| "completions/clipped_ratio": -2.75, |
| "epoch": 1.836676217765043, |
| "grad_norm": 95.72259182532687, |
| "learning_rate": 1.138679271154436e-06, |
| "loss": 0.1021, |
| "num_tokens": 51927336.0, |
| "residual_var": 0.05430884286761284, |
| "reward": 0.80078125, |
| "reward_std": 0.1230878233909607, |
| "rewards/drgrpo_math_reward/mean": 0.80078125, |
| "rewards/drgrpo_math_reward/std": 0.40019527077674866, |
| "rho2": 0.2187499701976776, |
| "step": 320 |
| }, |
| { |
| "advantages/mean": 2.3283064365386963e-10, |
| "advantages/snr": 9.12878675958233e-10, |
| "advantages/std": 0.25505101680755615, |
| "advantages/var": 0.06505102117456829, |
| "completions/clipped_ratio": -2.921875, |
| "epoch": 1.842406876790831, |
| "grad_norm": 59.64753369437884, |
| "learning_rate": 1.1342332658176555e-06, |
| "loss": -0.0949, |
| "num_tokens": 52069866.0, |
| "residual_var": 0.056919656693935394, |
| "reward": 0.8984375, |
| "reward_std": 0.11784427613019943, |
| "rewards/drgrpo_math_reward/mean": 0.8984375, |
| "rewards/drgrpo_math_reward/std": 0.3026638329029083, |
| "rho2": 0.1249999850988388, |
| "step": 321 |
| }, |
| { |
| "advantages/mean": -2.3283064365386963e-10, |
| "advantages/snr": 8.997430734575406e-10, |
| "advantages/std": 0.2587745785713196, |
| "advantages/var": 0.06696428251476405, |
| "completions/clipped_ratio": -2.859375, |
| "epoch": 1.848137535816619, |
| "grad_norm": 61.94070407305759, |
| "learning_rate": 1.1297845567522886e-06, |
| "loss": -0.5812, |
| "num_tokens": 52224671.0, |
| "residual_var": 0.043945323675870895, |
| "reward": 0.73828125, |
| "reward_std": 0.14032843708992004, |
| "rewards/drgrpo_math_reward/mean": 0.73828125, |
| "rewards/drgrpo_math_reward/std": 0.4404313564300537, |
| "rho2": 0.3437499403953552, |
| "step": 322 |
| }, |
| { |
| "advantages/mean": 4.656612873077393e-10, |
| "advantages/snr": 1.6558931458075657e-09, |
| "advantages/std": 0.28121456503868103, |
| "advantages/var": 0.07908163158989456, |
| "completions/clipped_ratio": -2.796875, |
| "epoch": 1.853868194842407, |
| "grad_norm": 55.218722365267084, |
| "learning_rate": 1.1253332335643042e-06, |
| "loss": -0.5945, |
| "num_tokens": 52372889.0, |
| "residual_var": 0.05683993920683861, |
| "reward": 0.765625, |
| "reward_std": 0.15163654088974, |
| "rewards/drgrpo_math_reward/mean": 0.765625, |
| "rewards/drgrpo_math_reward/std": 0.42443734407424927, |
| "rho2": 0.2812499403953552, |
| "step": 323 |
| }, |
| { |
| "advantages/mean": -8.149072527885437e-10, |
| "advantages/snr": 3.210853701389071e-09, |
| "advantages/std": 0.2537976801395416, |
| "advantages/var": 0.06441326244421308, |
| "completions/clipped_ratio": -2.875, |
| "epoch": 1.859598853868195, |
| "grad_norm": 69.06598539631601, |
| "learning_rate": 1.1208793859123256e-06, |
| "loss": -0.5763, |
| "num_tokens": 52514332.0, |
| "residual_var": 0.04830996319651604, |
| "reward": 0.85546875, |
| "reward_std": 0.1238841786980629, |
| "rewards/drgrpo_math_reward/mean": 0.85546875, |
| "rewards/drgrpo_math_reward/std": 0.35231640934944153, |
| "rho2": 0.24999994039535522, |
| "step": 324 |
| }, |
| { |
| "advantages/mean": -4.656612873077393e-10, |
| "advantages/snr": 1.6974700286993113e-09, |
| "advantages/std": 0.2743266522884369, |
| "advantages/var": 0.07525511215578096, |
| "completions/clipped_ratio": -2.875, |
| "epoch": 1.8653295128939829, |
| "grad_norm": 52.17395719662457, |
| "learning_rate": 1.1164231035058227e-06, |
| "loss": -0.5038, |
| "num_tokens": 52655435.0, |
| "residual_var": 0.05408961698412895, |
| "reward": 0.8671875, |
| "reward_std": 0.13531342148780823, |
| "rewards/drgrpo_math_reward/mean": 0.8671875, |
| "rewards/drgrpo_math_reward/std": 0.3400367796421051, |
| "rho2": 0.2812499403953552, |
| "step": 325 |
| }, |
| { |
| "advantages/mean": -4.656612873077393e-10, |
| "advantages/snr": 1.988354962647085e-09, |
| "advantages/std": 0.23419423401355743, |
| "advantages/var": 0.0548469392451969, |
| "completions/clipped_ratio": -2.859375, |
| "epoch": 1.8710601719197708, |
| "grad_norm": 54.14916950149179, |
| "learning_rate": 1.1119644761033077e-06, |
| "loss": -0.3666, |
| "num_tokens": 52817959.0, |
| "residual_var": 0.03770728409290314, |
| "reward": 0.828125, |
| "reward_std": 0.12073516845703125, |
| "rewards/drgrpo_math_reward/mean": 0.828125, |
| "rewards/drgrpo_math_reward/std": 0.3780108094215393, |
| "rho2": 0.31249991059303284, |
| "step": 326 |
| }, |
| { |
| "advantages/mean": -4.656612873077393e-10, |
| "advantages/snr": 1.922424165892908e-09, |
| "advantages/std": 0.24222607910633087, |
| "advantages/var": 0.05867347339922646, |
| "completions/clipped_ratio": -2.84375, |
| "epoch": 1.8767908309455588, |
| "grad_norm": 58.06906202221281, |
| "learning_rate": 1.107503593510525e-06, |
| "loss": -0.5308, |
| "num_tokens": 52960527.0, |
| "residual_var": 0.047672200947999954, |
| "reward": 0.8671875, |
| "reward_std": 0.10520448535680771, |
| "rewards/drgrpo_math_reward/mean": 0.8671875, |
| "rewards/drgrpo_math_reward/std": 0.3400367796421051, |
| "rho2": 0.1874999701976776, |
| "step": 327 |
| }, |
| { |
| "advantages/mean": 1.3969838619232178e-09, |
| "advantages/snr": 5.114126388694997e-09, |
| "advantages/std": 0.27316176891326904, |
| "advantages/var": 0.0746173519958262, |
| "completions/clipped_ratio": -2.9375, |
| "epoch": 1.8825214899713467, |
| "grad_norm": 55.880601623517, |
| "learning_rate": 1.1030405455786424e-06, |
| "loss": -0.406, |
| "num_tokens": 53097288.0, |
| "residual_var": 0.05129944160580635, |
| "reward": 0.85546875, |
| "reward_std": 0.14123709499835968, |
| "rewards/drgrpo_math_reward/mean": 0.85546875, |
| "rewards/drgrpo_math_reward/std": 0.35231640934944153, |
| "rho2": 0.3124999403953552, |
| "step": 328 |
| }, |
| { |
| "advantages/mean": -9.313225746154785e-10, |
| "advantages/snr": 3.865916250372057e-09, |
| "advantages/std": 0.2409060299396515, |
| "advantages/var": 0.05803571526128426, |
| "completions/clipped_ratio": -2.90625, |
| "epoch": 1.8882521489971347, |
| "grad_norm": 100.89430686833502, |
| "learning_rate": 1.0985754222024436e-06, |
| "loss": -0.1128, |
| "num_tokens": 53231633.0, |
| "residual_var": 0.041713181883096695, |
| "reward": 0.85546875, |
| "reward_std": 0.11823134869337082, |
| "rewards/drgrpo_math_reward/mean": 0.85546875, |
| "rewards/drgrpo_math_reward/std": 0.35231640934944153, |
| "rho2": 0.2812499403953552, |
| "step": 329 |
| }, |
| { |
| "advantages/mean": -6.984919309616089e-10, |
| "advantages/snr": 3.729520851595926e-09, |
| "advantages/std": 0.18728730082511902, |
| "advantages/var": 0.03507653305035863, |
| "completions/clipped_ratio": -2.984375, |
| "epoch": 1.8939828080229226, |
| "grad_norm": 39.60488962547746, |
| "learning_rate": 1.0941083133185145e-06, |
| "loss": -1.4171, |
| "num_tokens": 53357002.0, |
| "residual_var": 0.028499694541096687, |
| "reward": 0.92578125, |
| "reward_std": 0.07483352720737457, |
| "rewards/drgrpo_math_reward/mean": 0.92578125, |
| "rewards/drgrpo_math_reward/std": 0.2626400291919708, |
| "rho2": 0.18749995529651642, |
| "step": 330 |
| }, |
| { |
| "advantages/mean": 1.1641532182693481e-09, |
| "advantages/snr": 3.967493686413963e-09, |
| "advantages/std": 0.2934228181838989, |
| "advantages/var": 0.0860969502309814, |
| "completions/clipped_ratio": -2.734375, |
| "epoch": 1.8997134670487106, |
| "grad_norm": 49.7244276230108, |
| "learning_rate": 1.0896393089034335e-06, |
| "loss": -0.1195, |
| "num_tokens": 53534358.0, |
| "residual_var": 0.05650113523006439, |
| "reward": 0.69140625, |
| "reward_std": 0.16557207703590393, |
| "rewards/drgrpo_math_reward/mean": 0.69140625, |
| "rewards/drgrpo_math_reward/std": 0.46281787753105164, |
| "rho2": 0.34374991059303284, |
| "step": 331 |
| }, |
| { |
| "advantages/mean": -1.5133991837501526e-09, |
| "advantages/snr": 6.24787853915195e-09, |
| "advantages/std": 0.24222607910633087, |
| "advantages/var": 0.05867347339922646, |
| "completions/clipped_ratio": -2.8125, |
| "epoch": 1.9054441260744985, |
| "grad_norm": 57.8377677490507, |
| "learning_rate": 1.0851684989719594e-06, |
| "loss": -0.0041, |
| "num_tokens": 53691398.0, |
| "residual_var": 0.04400511458516121, |
| "reward": 0.8203125, |
| "reward_std": 0.11876175552606583, |
| "rewards/drgrpo_math_reward/mean": 0.8203125, |
| "rewards/drgrpo_math_reward/std": 0.38467901945114136, |
| "rho2": 0.2499999701976776, |
| "step": 332 |
| }, |
| { |
| "advantages/mean": 6.984919309616089e-10, |
| "advantages/snr": 3.799237303606776e-09, |
| "advantages/std": 0.18385055661201477, |
| "advantages/var": 0.03380102716654765, |
| "completions/clipped_ratio": -2.9375, |
| "epoch": 1.9111747851002865, |
| "grad_norm": 44.490624674036994, |
| "learning_rate": 1.0806959735752173e-06, |
| "loss": -0.0637, |
| "num_tokens": 53837722.0, |
| "residual_var": 0.02851962298154831, |
| "reward": 0.76171875, |
| "reward_std": 0.0672023743391037, |
| "rewards/drgrpo_math_reward/mean": 0.76171875, |
| "rewards/drgrpo_math_reward/std": 0.4268665909767151, |
| "rho2": 0.1562499701976776, |
| "step": 333 |
| }, |
| { |
| "advantages/mean": -9.313225746154785e-10, |
| "advantages/snr": 3.338821702499687e-09, |
| "advantages/std": 0.2789374887943268, |
| "advantages/var": 0.07780612265488518, |
| "completions/clipped_ratio": -2.65625, |
| "epoch": 1.9169054441260744, |
| "grad_norm": 49.97152951305063, |
| "learning_rate": 1.076221822798885e-06, |
| "loss": -0.7242, |
| "num_tokens": 54010772.0, |
| "residual_var": 0.053491730242967606, |
| "reward": 0.7578125, |
| "reward_std": 0.15110857784748077, |
| "rewards/drgrpo_math_reward/mean": 0.7578125, |
| "rewards/drgrpo_math_reward/std": 0.4292463958263397, |
| "rho2": 0.3124999403953552, |
| "step": 334 |
| }, |
| { |
| "advantages/mean": 0.0, |
| "advantages/snr": 0.0, |
| "advantages/std": 0.18385054171085358, |
| "advantages/var": 0.03380102168737431, |
| "completions/clipped_ratio": -2.78125, |
| "epoch": 1.9226361031518624, |
| "grad_norm": 36.61290451397358, |
| "learning_rate": 1.0717461367613792e-06, |
| "loss": 0.1276, |
| "num_tokens": 54156350.0, |
| "residual_var": 0.03063218668103218, |
| "reward": 0.89453125, |
| "reward_std": 0.0665532797574997, |
| "rewards/drgrpo_math_reward/mean": 0.89453125, |
| "rewards/drgrpo_math_reward/std": 0.3077581524848938, |
| "rho2": 0.09374997764825821, |
| "step": 335 |
| }, |
| { |
| "advantages/mean": -4.656612873077393e-10, |
| "advantages/snr": 1.598885299444173e-09, |
| "advantages/std": 0.29124119877815247, |
| "advantages/var": 0.08482143586573532, |
| "completions/clipped_ratio": -2.96875, |
| "epoch": 1.9283667621776504, |
| "grad_norm": 67.7518786607019, |
| "learning_rate": 1.0672690056120398e-06, |
| "loss": -0.6978, |
| "num_tokens": 54296370.0, |
| "residual_var": 0.05831475183367729, |
| "reward": 0.80078125, |
| "reward_std": 0.16439500451087952, |
| "rewards/drgrpo_math_reward/mean": 0.80078125, |
| "rewards/drgrpo_math_reward/std": 0.40019527077674866, |
| "rho2": 0.3124999403953552, |
| "step": 336 |
| }, |
| { |
| "advantages/mean": -1.6298145055770874e-09, |
| "advantages/snr": 6.210110271830265e-09, |
| "advantages/std": 0.262445330619812, |
| "advantages/var": 0.06887755156414244, |
| "completions/clipped_ratio": -2.828125, |
| "epoch": 1.9340974212034383, |
| "grad_norm": 47.49343597302498, |
| "learning_rate": 1.0627905195293135e-06, |
| "loss": -0.8715, |
| "num_tokens": 54450420.0, |
| "residual_var": 0.049505751579999924, |
| "reward": 0.78125, |
| "reward_std": 0.1349327266216278, |
| "rewards/drgrpo_math_reward/mean": 0.78125, |
| "rewards/drgrpo_math_reward/std": 0.41420844197273254, |
| "rho2": 0.2812499403953552, |
| "step": 337 |
| }, |
| { |
| "advantages/mean": -2.3283064365386963e-10, |
| "advantages/snr": 1.4577503362594367e-09, |
| "advantages/std": 0.15971913933753967, |
| "advantages/var": 0.025510203470724413, |
| "completions/clipped_ratio": -2.953125, |
| "epoch": 1.9398280802292263, |
| "grad_norm": 41.621221294628306, |
| "learning_rate": 1.0583107687189387e-06, |
| "loss": -1.5985, |
| "num_tokens": 54581042.0, |
| "residual_var": 0.02152424491941929, |
| "reward": 0.890625, |
| "reward_std": 0.0586601123213768, |
| "rewards/drgrpo_math_reward/mean": 0.890625, |
| "rewards/drgrpo_math_reward/std": 0.31272050738334656, |
| "rho2": 0.1562499701976776, |
| "step": 338 |
| }, |
| { |
| "advantages/mean": -9.313225746154785e-10, |
| "advantages/snr": 3.3117862916151314e-09, |
| "advantages/std": 0.28121456503868103, |
| "advantages/var": 0.07908163158989456, |
| "completions/clipped_ratio": -2.859375, |
| "epoch": 1.9455587392550142, |
| "grad_norm": 71.06222944609438, |
| "learning_rate": 1.0538298434121282e-06, |
| "loss": -1.9821, |
| "num_tokens": 54737524.0, |
| "residual_var": 0.05189734697341919, |
| "reward": 0.84375, |
| "reward_std": 0.15873973071575165, |
| "rewards/drgrpo_math_reward/mean": 0.84375, |
| "rewards/drgrpo_math_reward/std": 0.3638034462928772, |
| "rho2": 0.34374991059303284, |
| "step": 339 |
| }, |
| { |
| "advantages/mean": -1.1641532182693481e-10, |
| "advantages/snr": 4.1735271281246086e-10, |
| "advantages/std": 0.2789374887943268, |
| "advantages/var": 0.07780612265488518, |
| "completions/clipped_ratio": -2.953125, |
| "epoch": 1.9512893982808022, |
| "grad_norm": 94.66915132544855, |
| "learning_rate": 1.049347833863751e-06, |
| "loss": -0.3612, |
| "num_tokens": 54890793.0, |
| "residual_var": 0.058354608714580536, |
| "reward": 0.7578125, |
| "reward_std": 0.14400538802146912, |
| "rewards/drgrpo_math_reward/mean": 0.7578125, |
| "rewards/drgrpo_math_reward/std": 0.4292463958263397, |
| "rho2": 0.24999994039535522, |
| "step": 340 |
| }, |
| { |
| "advantages/mean": 0.0, |
| "advantages/snr": 0.0, |
| "advantages/std": 0.24222607910633087, |
| "advantages/var": 0.05867347339922646, |
| "completions/clipped_ratio": -2.890625, |
| "epoch": 1.9570200573065901, |
| "grad_norm": 38.99157475481538, |
| "learning_rate": 1.044864830350515e-06, |
| "loss": -0.1946, |
| "num_tokens": 55030026.0, |
| "residual_var": 0.04217156767845154, |
| "reward": 0.78125, |
| "reward_std": 0.12468298524618149, |
| "rewards/drgrpo_math_reward/mean": 0.78125, |
| "rewards/drgrpo_math_reward/std": 0.41420844197273254, |
| "rho2": 0.2812499403953552, |
| "step": 341 |
| }, |
| { |
| "advantages/mean": 2.3283064365386963e-10, |
| "advantages/snr": 8.790565244942093e-10, |
| "advantages/std": 0.2648642361164093, |
| "advantages/var": 0.07015306357352902, |
| "completions/clipped_ratio": -2.796875, |
| "epoch": 1.962750716332378, |
| "grad_norm": 68.6227790006013, |
| "learning_rate": 1.0403809231691484e-06, |
| "loss": -1.0464, |
| "num_tokens": 55180828.0, |
| "residual_var": 0.05042253062129021, |
| "reward": 0.8671875, |
| "reward_std": 0.13664263486862183, |
| "rewards/drgrpo_math_reward/mean": 0.8671875, |
| "rewards/drgrpo_math_reward/std": 0.3400367796421051, |
| "rho2": 0.2812499403953552, |
| "step": 342 |
| }, |
| { |
| "advantages/mean": 1.1641532182693481e-10, |
| "advantages/snr": 6.105843634782741e-10, |
| "advantages/std": 0.19066213071346283, |
| "advantages/var": 0.036352048088197586, |
| "completions/clipped_ratio": -2.90625, |
| "epoch": 1.968481375358166, |
| "grad_norm": 67.90463567427477, |
| "learning_rate": 1.0358962026345824e-06, |
| "loss": -0.154, |
| "num_tokens": 55332364.0, |
| "residual_var": 0.031808048486709595, |
| "reward": 0.87109375, |
| "reward_std": 0.06944026052951813, |
| "rewards/drgrpo_math_reward/mean": 0.87109375, |
| "rewards/drgrpo_math_reward/std": 0.33575257658958435, |
| "rho2": 0.12499997019767761, |
| "step": 343 |
| }, |
| { |
| "advantages/mean": -8.149072527885437e-10, |
| "advantages/snr": 3.3461059648989908e-09, |
| "advantages/std": 0.2435389757156372, |
| "advantages/var": 0.05931123269262173, |
| "completions/clipped_ratio": -2.828125, |
| "epoch": 1.9742120343839542, |
| "grad_norm": 62.318007127153344, |
| "learning_rate": 1.0314107590781281e-06, |
| "loss": -0.621, |
| "num_tokens": 55476400.0, |
| "residual_var": 0.05004385486245155, |
| "reward": 0.91796875, |
| "reward_std": 0.10573489964008331, |
| "rewards/drgrpo_math_reward/mean": 0.91796875, |
| "rewards/drgrpo_math_reward/std": 0.2749498784542084, |
| "rho2": 0.1562499701976776, |
| "step": 344 |
| }, |
| { |
| "advantages/mean": -4.656612873077393e-10, |
| "advantages/snr": 2.9155006725188733e-09, |
| "advantages/std": 0.15971913933753967, |
| "advantages/var": 0.025510203470724413, |
| "completions/clipped_ratio": -2.90625, |
| "epoch": 1.9799426934097422, |
| "grad_norm": 35.042547615301785, |
| "learning_rate": 1.026924682845663e-06, |
| "loss": -0.0466, |
| "num_tokens": 55603276.0, |
| "residual_var": 0.021524248644709587, |
| "reward": 0.859375, |
| "reward_std": 0.0586601160466671, |
| "rewards/drgrpo_math_reward/mean": 0.859375, |
| "rewards/drgrpo_math_reward/std": 0.3483152687549591, |
| "rho2": 0.15624995529651642, |
| "step": 345 |
| }, |
| { |
| "advantages/mean": -1.3969838619232178e-09, |
| "advantages/snr": 7.823109002158462e-09, |
| "advantages/std": 0.1785714328289032, |
| "advantages/var": 0.03188775662256749, |
| "completions/clipped_ratio": -2.78125, |
| "epoch": 1.9856733524355301, |
| "grad_norm": 39.869269652084334, |
| "learning_rate": 1.022438064295805e-06, |
| "loss": -0.2044, |
| "num_tokens": 55750009.0, |
| "residual_var": 0.026905305683612823, |
| "reward": 0.9140625, |
| "reward_std": 0.06549490988254547, |
| "rewards/drgrpo_math_reward/mean": 0.9140625, |
| "rewards/drgrpo_math_reward/std": 0.28082075715065, |
| "rho2": 0.1562499701976776, |
| "step": 346 |
| }, |
| { |
| "advantages/mean": -2.3283064365386963e-10, |
| "advantages/snr": 8.246281334208482e-10, |
| "advantages/std": 0.2823462188243866, |
| "advantages/var": 0.0797193872844284, |
| "completions/clipped_ratio": -2.890625, |
| "epoch": 1.991404011461318, |
| "grad_norm": 58.782506623819536, |
| "learning_rate": 1.0179509937980971e-06, |
| "loss": -0.0113, |
| "num_tokens": 55876382.0, |
| "residual_var": 0.05978954955935478, |
| "reward": 0.81640625, |
| "reward_std": 0.1521669626235962, |
| "rewards/drgrpo_math_reward/mean": 0.81640625, |
| "rewards/drgrpo_math_reward/std": 0.387910932302475, |
| "rho2": 0.24999994039535522, |
| "step": 347 |
| }, |
| { |
| "advantages/mean": 2.3283064365386963e-10, |
| "advantages/snr": 9.772780097527553e-10, |
| "advantages/std": 0.23824401199817657, |
| "advantages/var": 0.056760209252987304, |
| "completions/clipped_ratio": -2.921875, |
| "epoch": 1.997134670487106, |
| "grad_norm": 58.5206879967775, |
| "learning_rate": 1.0134635617311853e-06, |
| "loss": 0.0692, |
| "num_tokens": 56017420.0, |
| "residual_var": 0.04611767828464508, |
| "reward": 0.80859375, |
| "reward_std": 0.10995110124349594, |
| "rewards/drgrpo_math_reward/mean": 0.80859375, |
| "rewards/drgrpo_math_reward/std": 0.39417871832847595, |
| "rho2": 0.1874999701976776, |
| "step": 348 |
| }, |
| { |
| "advantages/mean": 1.0477378964424133e-09, |
| "advantages/snr": 4.638532964616805e-09, |
| "advantages/std": 0.22587698698043823, |
| "advantages/var": 0.05102041324736106, |
| "completions/clipped_ratio": -2.890625, |
| "epoch": 2.005730659025788, |
| "grad_norm": 40.990040303252684, |
| "learning_rate": 1.0089758584809977e-06, |
| "loss": -0.3943, |
| "num_tokens": 56163905.0, |
| "residual_var": 0.03985970839858055, |
| "reward": 0.90625, |
| "reward_std": 0.09784172475337982, |
| "rewards/drgrpo_math_reward/mean": 0.90625, |
| "rewards/drgrpo_math_reward/std": 0.2920515835285187, |
| "rho2": 0.21874995529651642, |
| "step": 349 |
| }, |
| { |
| "advantages/mean": 6.984919309616089e-10, |
| "advantages/snr": 3.092355513747993e-09, |
| "advantages/std": 0.22587697207927704, |
| "advantages/var": 0.0510204065157025, |
| "completions/clipped_ratio": -2.90625, |
| "epoch": 2.011461318051576, |
| "grad_norm": 45.769842359441355, |
| "learning_rate": 1.0044879744389256e-06, |
| "loss": -0.2645, |
| "num_tokens": 56300771.0, |
| "residual_var": 0.03507654368877411, |
| "reward": 0.859375, |
| "reward_std": 0.1173202320933342, |
| "rewards/drgrpo_math_reward/mean": 0.859375, |
| "rewards/drgrpo_math_reward/std": 0.3483152687549591, |
| "rho2": 0.3124999403953552, |
| "step": 350 |
| }, |
| { |
| "advantages/mean": -4.656612873077393e-10, |
| "advantages/snr": 2.0745771419279592e-09, |
| "advantages/std": 0.22446081042289734, |
| "advantages/var": 0.05038265541570386, |
| "completions/clipped_ratio": -2.921875, |
| "epoch": 2.017191977077364, |
| "grad_norm": 55.073488092850695, |
| "learning_rate": 1e-06, |
| "loss": 0.3208, |
| "num_tokens": 56434972.0, |
| "residual_var": 0.04093591496348381, |
| "reward": 0.82421875, |
| "reward_std": 0.09666221588850021, |
| "rewards/drgrpo_math_reward/mean": 0.82421875, |
| "rewards/drgrpo_math_reward/std": 0.3813795745372772, |
| "rho2": 0.1874999701976776, |
| "step": 351 |
| }, |
| { |
| "advantages/mean": 1.3969838619232178e-09, |
| "advantages/snr": 7.902533784003918e-09, |
| "advantages/std": 0.1767766922712326, |
| "advantages/var": 0.03124999893035807, |
| "completions/clipped_ratio": -2.859375, |
| "epoch": 2.022922636103152, |
| "grad_norm": 35.999997406726365, |
| "learning_rate": 9.955120255610746e-07, |
| "loss": -0.2176, |
| "num_tokens": 56580608.0, |
| "residual_var": 0.027343759313225746, |
| "reward": 0.80078125, |
| "reward_std": 0.05786130577325821, |
| "rewards/drgrpo_math_reward/mean": 0.80078125, |
| "rewards/drgrpo_math_reward/std": 0.40019527077674866, |
| "rho2": 0.12499997019767761, |
| "step": 352 |
| }, |
| { |
| "advantages/mean": 2.3283064365386963e-10, |
| "advantages/snr": 1.1708932641543454e-09, |
| "advantages/std": 0.19884872436523438, |
| "advantages/var": 0.039540815181680955, |
| "completions/clipped_ratio": -2.984375, |
| "epoch": 2.0286532951289398, |
| "grad_norm": 41.84499982504666, |
| "learning_rate": 9.91024141519002e-07, |
| "loss": -0.0258, |
| "num_tokens": 56727775.0, |
| "residual_var": 0.03089127317070961, |
| "reward": 0.78125, |
| "reward_std": 0.08588206768035889, |
| "rewards/drgrpo_math_reward/mean": 0.78125, |
| "rewards/drgrpo_math_reward/std": 0.41420844197273254, |
| "rho2": 0.21874995529651642, |
| "step": 353 |
| }, |
| { |
| "advantages/mean": -1.0477378964424133e-09, |
| "advantages/snr": 4.422667522444229e-09, |
| "advantages/std": 0.23690178990364075, |
| "advantages/var": 0.05612245805954874, |
| "completions/clipped_ratio": -2.90625, |
| "epoch": 2.0343839541547277, |
| "grad_norm": 40.89486934381943, |
| "learning_rate": 9.865364382688144e-07, |
| "loss": -0.2472, |
| "num_tokens": 56872331.0, |
| "residual_var": 0.04384567588567734, |
| "reward": 0.90625, |
| "reward_std": 0.10942068696022034, |
| "rewards/drgrpo_math_reward/mean": 0.90625, |
| "rewards/drgrpo_math_reward/std": 0.2920515835285187, |
| "rho2": 0.21874995529651642, |
| "step": 354 |
| }, |
| { |
| "advantages/mean": 1.1641532182693481e-09, |
| "advantages/snr": 3.66736918388653e-09, |
| "advantages/std": 0.31743550300598145, |
| "advantages/var": 0.10076529856866046, |
| "completions/clipped_ratio": -2.828125, |
| "epoch": 2.0401146131805157, |
| "grad_norm": 62.92783490689025, |
| "learning_rate": 9.82049006201903e-07, |
| "loss": -1.0709, |
| "num_tokens": 57029960.0, |
| "residual_var": 0.0629783347249031, |
| "reward": 0.703125, |
| "reward_std": 0.19977852702140808, |
| "rewards/drgrpo_math_reward/mean": 0.703125, |
| "rewards/drgrpo_math_reward/std": 0.45777595043182373, |
| "rho2": 0.37499991059303284, |
| "step": 355 |
| }, |
| { |
| "advantages/mean": 0.0, |
| "advantages/snr": 0.0, |
| "advantages/std": 0.25, |
| "advantages/var": 0.0625, |
| "completions/clipped_ratio": -2.859375, |
| "epoch": 2.0458452722063036, |
| "grad_norm": 41.13574343425845, |
| "learning_rate": 9.77561935704195e-07, |
| "loss": -0.4015, |
| "num_tokens": 57182250.0, |
| "residual_var": 0.050781264901161194, |
| "reward": 0.8046875, |
| "reward_std": 0.11507351696491241, |
| "rewards/drgrpo_math_reward/mean": 0.8046875, |
| "rewards/drgrpo_math_reward/std": 0.39721766114234924, |
| "rho2": 0.18749995529651642, |
| "step": 356 |
| }, |
| { |
| "advantages/mean": 9.313225746154785e-10, |
| "advantages/snr": 4.407822555439857e-09, |
| "advantages/std": 0.21128857135772705, |
| "advantages/var": 0.044642860386389316, |
| "completions/clipped_ratio": -2.84375, |
| "epoch": 2.0515759312320916, |
| "grad_norm": 47.1257480404097, |
| "learning_rate": 9.730753171543374e-07, |
| "loss": -0.7046, |
| "num_tokens": 57321794.0, |
| "residual_var": 0.034877244383096695, |
| "reward": 0.84375, |
| "reward_std": 0.09100693464279175, |
| "rewards/drgrpo_math_reward/mean": 0.84375, |
| "rewards/drgrpo_math_reward/std": 0.3638034462928772, |
| "rho2": 0.21874994039535522, |
| "step": 357 |
| }, |
| { |
| "advantages/mean": -1.280568540096283e-09, |
| "advantages/snr": 5.175355444292141e-09, |
| "advantages/std": 0.24743585288524628, |
| "advantages/var": 0.06122450129304924, |
| "completions/clipped_ratio": -2.90625, |
| "epoch": 2.0573065902578795, |
| "grad_norm": 39.598904124881315, |
| "learning_rate": 9.685892409218718e-07, |
| "loss": -0.4665, |
| "num_tokens": 57481414.0, |
| "residual_var": 0.047831643372774124, |
| "reward": 0.8125, |
| "reward_std": 0.1145455539226532, |
| "rewards/drgrpo_math_reward/mean": 0.8125, |
| "rewards/drgrpo_math_reward/std": 0.3910769522190094, |
| "rho2": 0.2187499701976776, |
| "step": 358 |
| }, |
| { |
| "advantages/mean": -4.656612873077393e-10, |
| "advantages/snr": 1.6172287454318415e-09, |
| "advantages/std": 0.28793779015541077, |
| "advantages/var": 0.08290817099958137, |
| "completions/clipped_ratio": -2.9375, |
| "epoch": 2.0630372492836675, |
| "grad_norm": 49.79710645310716, |
| "learning_rate": 9.641037973654178e-07, |
| "loss": -0.4794, |
| "num_tokens": 57643547.0, |
| "residual_var": 0.054408494383096695, |
| "reward": 0.890625, |
| "reward_std": 0.1621546745300293, |
| "rewards/drgrpo_math_reward/mean": 0.890625, |
| "rewards/drgrpo_math_reward/std": 0.31272050738334656, |
| "rho2": 0.3437499403953552, |
| "step": 359 |
| }, |
| { |
| "advantages/mean": 0.0, |
| "advantages/snr": 0.0, |
| "advantages/std": 0.22303566336631775, |
| "advantages/var": 0.04974490713325341, |
| "completions/clipped_ratio": -2.9375, |
| "epoch": 2.0687679083094554, |
| "grad_norm": 50.82435735502694, |
| "learning_rate": 9.596190768308513e-07, |
| "loss": -0.4087, |
| "num_tokens": 57784823.0, |
| "residual_var": 0.03575415909290314, |
| "reward": 0.8828125, |
| "reward_std": 0.10968907922506332, |
| "rewards/drgrpo_math_reward/mean": 0.8828125, |
| "rewards/drgrpo_math_reward/std": 0.3222736418247223, |
| "rho2": 0.2812499403953552, |
| "step": 360 |
| }, |
| { |
| "advantages/mean": 5.820766091346741e-10, |
| "advantages/snr": 2.3283063434064427e-09, |
| "advantages/std": 0.25, |
| "advantages/var": 0.0625, |
| "completions/clipped_ratio": -2.703125, |
| "epoch": 2.0744985673352434, |
| "grad_norm": 45.40379831308186, |
| "learning_rate": 9.551351696494853e-07, |
| "loss": -1.0983, |
| "num_tokens": 57940022.0, |
| "residual_var": 0.046875014901161194, |
| "reward": 0.7734375, |
| "reward_std": 0.12217670679092407, |
| "rewards/drgrpo_math_reward/mean": 0.7734375, |
| "rewards/drgrpo_math_reward/std": 0.41942715644836426, |
| "rho2": 0.24999995529651642, |
| "step": 361 |
| }, |
| { |
| "advantages/mean": 4.656612873077393e-10, |
| "advantages/snr": 3.3665301317676013e-09, |
| "advantages/std": 0.13832083344459534, |
| "advantages/var": 0.019132652964807484, |
| "completions/clipped_ratio": -2.984375, |
| "epoch": 2.0802292263610314, |
| "grad_norm": 29.094653129208076, |
| "learning_rate": 9.506521661362492e-07, |
| "loss": -0.188, |
| "num_tokens": 58079358.0, |
| "residual_var": 0.01793687231838703, |
| "reward": 0.8828125, |
| "reward_std": 0.0388009138405323, |
| "rewards/drgrpo_math_reward/mean": 0.8828125, |
| "rewards/drgrpo_math_reward/std": 0.3222736418247223, |
| "rho2": 0.062499985098838806, |
| "step": 362 |
| }, |
| { |
| "advantages/mean": -2.3283064365386963e-10, |
| "advantages/snr": 9.664790625930143e-10, |
| "advantages/std": 0.2409060299396515, |
| "advantages/var": 0.05803571526128426, |
| "completions/clipped_ratio": -2.921875, |
| "epoch": 2.0859598853868193, |
| "grad_norm": 83.4007947773079, |
| "learning_rate": 9.461701565878718e-07, |
| "loss": -0.2678, |
| "num_tokens": 58219170.0, |
| "residual_var": 0.045340411365032196, |
| "reward": 0.76171875, |
| "reward_std": 0.11112815886735916, |
| "rewards/drgrpo_math_reward/mean": 0.76171875, |
| "rewards/drgrpo_math_reward/std": 0.4268665909767151, |
| "rho2": 0.2187499701976776, |
| "step": 363 |
| }, |
| { |
| "advantages/mean": 4.656612873077393e-10, |
| "advantages/snr": 1.8439244876346257e-09, |
| "advantages/std": 0.25253814458847046, |
| "advantages/var": 0.06377551447218721, |
| "completions/clipped_ratio": -2.90625, |
| "epoch": 2.0916905444126073, |
| "grad_norm": 61.00925066972484, |
| "learning_rate": 9.416892312810613e-07, |
| "loss": -0.1656, |
| "num_tokens": 58371207.0, |
| "residual_var": 0.041852690279483795, |
| "reward": 0.8203125, |
| "reward_std": 0.1369110345840454, |
| "rewards/drgrpo_math_reward/mean": 0.8203125, |
| "rewards/drgrpo_math_reward/std": 0.38467901945114136, |
| "rho2": 0.3437499403953552, |
| "step": 364 |
| }, |
| { |
| "advantages/mean": 1.1641532182693481e-10, |
| "advantages/snr": 4.5202922198482317e-10, |
| "advantages/std": 0.2575393617153168, |
| "advantages/var": 0.06632652283273277, |
| "completions/clipped_ratio": -2.8125, |
| "epoch": 2.097421203438395, |
| "grad_norm": 50.15075310712616, |
| "learning_rate": 9.372094804706866e-07, |
| "loss": -1.0044, |
| "num_tokens": 58515929.0, |
| "residual_var": 0.045599501579999924, |
| "reward": 0.78125, |
| "reward_std": 0.13269482553005219, |
| "rewards/drgrpo_math_reward/mean": 0.78125, |
| "rewards/drgrpo_math_reward/std": 0.41420844197273254, |
| "rho2": 0.3124999403953552, |
| "step": 365 |
| }, |
| { |
| "advantages/mean": 4.656612873077393e-10, |
| "advantages/snr": 1.6558931458075657e-09, |
| "advantages/std": 0.28121456503868103, |
| "advantages/var": 0.07908163158989456, |
| "completions/clipped_ratio": -2.9375, |
| "epoch": 2.103151862464183, |
| "grad_norm": 111.89824195115196, |
| "learning_rate": 9.327309943879603e-07, |
| "loss": -0.8244, |
| "num_tokens": 58657277.0, |
| "residual_var": 0.05683993175625801, |
| "reward": 0.7890625, |
| "reward_std": 0.14518246054649353, |
| "rewards/drgrpo_math_reward/mean": 0.7890625, |
| "rewards/drgrpo_math_reward/std": 0.4087733030319214, |
| "rho2": 0.2812499403953552, |
| "step": 366 |
| }, |
| { |
| "advantages/mean": 2.0954757928848267e-09, |
| "advantages/snr": 8.947597331911881e-09, |
| "advantages/std": 0.23419423401355743, |
| "advantages/var": 0.0548469392451969, |
| "completions/clipped_ratio": -2.90625, |
| "epoch": 2.1088825214899716, |
| "grad_norm": 102.01837387407974, |
| "learning_rate": 9.282538632386206e-07, |
| "loss": -0.7782, |
| "num_tokens": 58803731.0, |
| "residual_var": 0.042849186807870865, |
| "reward": 0.890625, |
| "reward_std": 0.10771076381206512, |
| "rewards/drgrpo_math_reward/mean": 0.890625, |
| "rewards/drgrpo_math_reward/std": 0.31272050738334656, |
| "rho2": 0.21874995529651642, |
| "step": 367 |
| }, |
| { |
| "advantages/mean": -2.3283064365386963e-10, |
| "advantages/snr": 7.604216454059908e-10, |
| "advantages/std": 0.3061862289905548, |
| "advantages/var": 0.09375000682345647, |
| "completions/clipped_ratio": -2.71875, |
| "epoch": 2.1146131805157595, |
| "grad_norm": 55.21812239129645, |
| "learning_rate": 9.237781772011151e-07, |
| "loss": -0.2983, |
| "num_tokens": 58969432.0, |
| "residual_var": 0.0644531399011612, |
| "reward": 0.79296875, |
| "reward_std": 0.17938891053199768, |
| "rewards/drgrpo_math_reward/mean": 0.79296875, |
| "rewards/drgrpo_math_reward/std": 0.40597182512283325, |
| "rho2": 0.3124999403953552, |
| "step": 368 |
| }, |
| { |
| "advantages/mean": -1.1641532182693481e-10, |
| "advantages/snr": 5.717767063674629e-10, |
| "advantages/std": 0.20360276103019714, |
| "advantages/var": 0.041454084299119565, |
| "completions/clipped_ratio": -2.921875, |
| "epoch": 2.1203438395415475, |
| "grad_norm": 99.8146176812901, |
| "learning_rate": 9.193040264247828e-07, |
| "loss": -0.6521, |
| "num_tokens": 59107832.0, |
| "residual_var": 0.03368145227432251, |
| "reward": 0.92578125, |
| "reward_std": 0.08166831731796265, |
| "rewards/drgrpo_math_reward/mean": 0.92578125, |
| "rewards/drgrpo_math_reward/std": 0.2626400291919708, |
| "rho2": 0.1874999701976776, |
| "step": 369 |
| }, |
| { |
| "advantages/mean": -2.3283064365386963e-10, |
| "advantages/snr": 1.1708931764109258e-09, |
| "advantages/std": 0.19884873926639557, |
| "advantages/var": 0.03954082110783497, |
| "completions/clipped_ratio": -2.96875, |
| "epoch": 2.1260744985673354, |
| "grad_norm": 36.18387046219496, |
| "learning_rate": 9.148315010280407e-07, |
| "loss": -0.7438, |
| "num_tokens": 59241040.0, |
| "residual_var": 0.03089127317070961, |
| "reward": 0.921875, |
| "reward_std": 0.08588206768035889, |
| "rewards/drgrpo_math_reward/mean": 0.921875, |
| "rewards/drgrpo_math_reward/std": 0.26889389753341675, |
| "rho2": 0.21874994039535522, |
| "step": 370 |
| }, |
| { |
| "advantages/mean": -3.4924596548080444e-10, |
| "advantages/snr": 1.4188699969241902e-09, |
| "advantages/std": 0.24614372849464417, |
| "advantages/var": 0.0605867350772451, |
| "completions/clipped_ratio": -2.9375, |
| "epoch": 2.1318051575931234, |
| "grad_norm": 58.45422371851887, |
| "learning_rate": 9.103606910965665e-07, |
| "loss": -0.3998, |
| "num_tokens": 59396683.0, |
| "residual_var": 0.04544006660580635, |
| "reward": 0.75390625, |
| "reward_std": 0.12046922743320465, |
| "rewards/drgrpo_math_reward/mean": 0.75390625, |
| "rewards/drgrpo_math_reward/std": 0.43157756328582764, |
| "rho2": 0.24999995529651642, |
| "step": 371 |
| }, |
| { |
| "advantages/mean": 8.149072527885437e-10, |
| "advantages/snr": 3.70147121539465e-09, |
| "advantages/std": 0.220157653093338, |
| "advantages/var": 0.048469392215566565, |
| "completions/clipped_ratio": -2.96875, |
| "epoch": 2.1375358166189113, |
| "grad_norm": 43.47332393270564, |
| "learning_rate": 9.058916866814856e-07, |
| "loss": 0.0326, |
| "num_tokens": 59519814.0, |
| "residual_var": 0.03938138857483864, |
| "reward": 0.859375, |
| "reward_std": 0.09495474398136139, |
| "rewards/drgrpo_math_reward/mean": 0.859375, |
| "rewards/drgrpo_math_reward/std": 0.3483152687549591, |
| "rho2": 0.1874999701976776, |
| "step": 372 |
| }, |
| { |
| "advantages/mean": 2.561137080192566e-09, |
| "advantages/snr": 7.919246886876952e-09, |
| "advantages/std": 0.3234066367149353, |
| "advantages/var": 0.10459185267126614, |
| "completions/clipped_ratio": -2.890625, |
| "epoch": 2.1432664756446993, |
| "grad_norm": 68.3814059604225, |
| "learning_rate": 9.014245777975564e-07, |
| "loss": -0.8138, |
| "num_tokens": 59688958.0, |
| "residual_var": 0.05883292481303215, |
| "reward": 0.796875, |
| "reward_std": 0.20384256541728973, |
| "rewards/drgrpo_math_reward/mean": 0.796875, |
| "rewards/drgrpo_math_reward/std": 0.40311288833618164, |
| "rho2": 0.43749988079071045, |
| "step": 373 |
| }, |
| { |
| "advantages/mean": 9.313225746154785e-10, |
| "advantages/snr": 3.651514703832932e-09, |
| "advantages/std": 0.25505101680755615, |
| "advantages/var": 0.06505102117456829, |
| "completions/clipped_ratio": -2.828125, |
| "epoch": 2.1489971346704873, |
| "grad_norm": 55.37713615201918, |
| "learning_rate": 8.969594544213577e-07, |
| "loss": -0.2838, |
| "num_tokens": 59844959.0, |
| "residual_var": 0.050821125507354736, |
| "reward": 0.8046875, |
| "reward_std": 0.1244145929813385, |
| "rewards/drgrpo_math_reward/mean": 0.8046875, |
| "rewards/drgrpo_math_reward/std": 0.39721766114234924, |
| "rho2": 0.21874995529651642, |
| "step": 374 |
| }, |
| { |
| "advantages/mean": -6.984919309616089e-10, |
| "advantages/snr": 3.0178298853416863e-09, |
| "advantages/std": 0.2314550280570984, |
| "advantages/var": 0.0535714300129122, |
| "completions/clipped_ratio": -2.828125, |
| "epoch": 2.154727793696275, |
| "grad_norm": 75.81852215411352, |
| "learning_rate": 8.924964064894753e-07, |
| "loss": -0.397, |
| "num_tokens": 60014526.0, |
| "residual_var": 0.043526798486709595, |
| "reward": 0.8359375, |
| "reward_std": 0.10007961839437485, |
| "rewards/drgrpo_math_reward/mean": 0.8359375, |
| "rewards/drgrpo_math_reward/std": 0.3710577189922333, |
| "rho2": 0.18749995529651642, |
| "step": 375 |
| }, |
| { |
| "advantages/mean": 4.656612873077393e-10, |
| "advantages/snr": 2.158150412641125e-09, |
| "advantages/std": 0.21576867997646332, |
| "advantages/var": 0.04655612325878544, |
| "completions/clipped_ratio": -2.984375, |
| "epoch": 2.160458452722063, |
| "grad_norm": 36.36413325512361, |
| "learning_rate": 8.880355238966921e-07, |
| "loss": -0.1531, |
| "num_tokens": 60145596.0, |
| "residual_var": 0.03637198358774185, |
| "reward": 0.92578125, |
| "reward_std": 0.09916849434375763, |
| "rewards/drgrpo_math_reward/mean": 0.92578125, |
| "rewards/drgrpo_math_reward/std": 0.2626400291919708, |
| "rho2": 0.21874994039535522, |
| "step": 376 |
| }, |
| { |
| "advantages/mean": -8.149072527885437e-10, |
| "advantages/snr": 3.541947615186032e-09, |
| "advantages/std": 0.23007319867610931, |
| "advantages/var": 0.05293367674905647, |
| "completions/clipped_ratio": -2.921875, |
| "epoch": 2.166189111747851, |
| "grad_norm": 36.5323392935581, |
| "learning_rate": 8.835768964941772e-07, |
| "loss": -0.5071, |
| "num_tokens": 60302043.0, |
| "residual_var": 0.04135444387793541, |
| "reward": 0.87109375, |
| "reward_std": 0.1060032844543457, |
| "rewards/drgrpo_math_reward/mean": 0.87109375, |
| "rewards/drgrpo_math_reward/std": 0.33575257658958435, |
| "rho2": 0.21874994039535522, |
| "step": 377 |
| }, |
| { |
| "advantages/mean": 2.3283064365386963e-10, |
| "advantages/snr": 1.2785315388989055e-09, |
| "advantages/std": 0.1821078509092331, |
| "advantages/var": 0.03316326936277947, |
| "completions/clipped_ratio": -2.9375, |
| "epoch": 2.171919770773639, |
| "grad_norm": 34.55263411580291, |
| "learning_rate": 8.791206140876745e-07, |
| "loss": -0.0678, |
| "num_tokens": 60442295.0, |
| "residual_var": 0.026945164427161217, |
| "reward": 0.953125, |
| "reward_std": 0.07312604784965515, |
| "rewards/drgrpo_math_reward/mean": 0.953125, |
| "rewards/drgrpo_math_reward/std": 0.21178513765335083, |
| "rho2": 0.1874999701976776, |
| "step": 378 |
| }, |
| { |
| "advantages/mean": 2.3283064365386963e-10, |
| "advantages/snr": 8.416324726969034e-10, |
| "advantages/std": 0.27664169669151306, |
| "advantages/var": 0.07653062834835911, |
| "completions/clipped_ratio": -2.9375, |
| "epoch": 2.177650429799427, |
| "grad_norm": 49.5081416172195, |
| "learning_rate": 8.746667664356955e-07, |
| "loss": -0.5504, |
| "num_tokens": 60578216.0, |
| "residual_var": 0.05978955700993538, |
| "reward": 0.859375, |
| "reward_std": 0.1428283452987671, |
| "rewards/drgrpo_math_reward/mean": 0.859375, |
| "rewards/drgrpo_math_reward/std": 0.3483152687549591, |
| "rho2": 0.21874995529651642, |
| "step": 379 |
| }, |
| { |
| "advantages/mean": -2.3283064365386963e-10, |
| "advantages/snr": 1.1348570685336994e-09, |
| "advantages/std": 0.20516295731067657, |
| "advantages/var": 0.0420918390524625, |
| "completions/clipped_ratio": -2.953125, |
| "epoch": 2.183381088825215, |
| "grad_norm": 43.20035315786397, |
| "learning_rate": 8.702154432477115e-07, |
| "loss": -0.2085, |
| "num_tokens": 60712266.0, |
| "residual_var": 0.03551500290632248, |
| "reward": 0.90625, |
| "reward_std": 0.08219873160123825, |
| "rewards/drgrpo_math_reward/mean": 0.90625, |
| "rewards/drgrpo_math_reward/std": 0.2920515835285187, |
| "rho2": 0.1562499701976776, |
| "step": 380 |
| }, |
| { |
| "advantages/mean": 2.3283064365386963e-10, |
| "advantages/snr": 2.0118866328303662e-09, |
| "advantages/std": 0.1157275065779686, |
| "advantages/var": 0.013392855778753765, |
| "completions/clipped_ratio": -2.765625, |
| "epoch": 2.189111747851003, |
| "grad_norm": 18.632259572410177, |
| "learning_rate": 8.657667341823448e-07, |
| "loss": -0.1763, |
| "num_tokens": 60867807.0, |
| "residual_var": 0.012137286365032196, |
| "reward": 0.83984375, |
| "reward_std": 0.03314562886953354, |
| "rewards/drgrpo_math_reward/mean": 0.83984375, |
| "rewards/drgrpo_math_reward/std": 0.36746934056282043, |
| "rho2": 0.09374997764825821, |
| "step": 381 |
| }, |
| { |
| "advantages/mean": -1.7462298274040222e-09, |
| "advantages/snr": 7.094349984620951e-09, |
| "advantages/std": 0.24614372849464417, |
| "advantages/var": 0.0605867350772451, |
| "completions/clipped_ratio": -2.921875, |
| "epoch": 2.194842406876791, |
| "grad_norm": 47.23394786445544, |
| "learning_rate": 8.613207288455641e-07, |
| "loss": -0.4775, |
| "num_tokens": 61004135.0, |
| "residual_var": 0.04544006288051605, |
| "reward": 0.81640625, |
| "reward_std": 0.12046922743320465, |
| "rewards/drgrpo_math_reward/mean": 0.81640625, |
| "rewards/drgrpo_math_reward/std": 0.387910932302475, |
| "rho2": 0.2499999701976776, |
| "step": 382 |
| }, |
| { |
| "advantages/mean": 9.313225746154785e-10, |
| "advantages/snr": 3.5819556995273322e-09, |
| "advantages/std": 0.260003924369812, |
| "advantages/var": 0.06760204068770292, |
| "completions/clipped_ratio": -2.859375, |
| "epoch": 2.200573065902579, |
| "grad_norm": 54.11461392640346, |
| "learning_rate": 8.568775167888805e-07, |
| "loss": -0.2017, |
| "num_tokens": 61168740.0, |
| "residual_var": 0.05070154368877411, |
| "reward": 0.8515625, |
| "reward_std": 0.1337556540966034, |
| "rewards/drgrpo_math_reward/mean": 0.8515625, |
| "rewards/drgrpo_math_reward/std": 0.3562295734882355, |
| "rho2": 0.24999994039535522, |
| "step": 383 |
| }, |
| { |
| "advantages/mean": 1.1641532182693481e-10, |
| "advantages/snr": 5.000042345610263e-10, |
| "advantages/std": 0.23282866179943085, |
| "advantages/var": 0.05420918575531375, |
| "completions/clipped_ratio": -2.921875, |
| "epoch": 2.206303724928367, |
| "grad_norm": 43.4870952376505, |
| "learning_rate": 8.524371875075424e-07, |
| "loss": -0.3357, |
| "num_tokens": 61306482.0, |
| "residual_var": 0.04235094040632248, |
| "reward": 0.82421875, |
| "reward_std": 0.10125912725925446, |
| "rewards/drgrpo_math_reward/mean": 0.82421875, |
| "rewards/drgrpo_math_reward/std": 0.3813795745372772, |
| "rho2": 0.21874995529651642, |
| "step": 384 |
| }, |
| { |
| "advantages/mean": -6.984919309616089e-10, |
| "advantages/snr": 3.25962882665666e-09, |
| "advantages/std": 0.2142857164144516, |
| "advantages/var": 0.04591836825925477, |
| "completions/clipped_ratio": -2.890625, |
| "epoch": 2.2120343839541547, |
| "grad_norm": 42.735990238221774, |
| "learning_rate": 8.479998304387328e-07, |
| "loss": -0.162, |
| "num_tokens": 61459774.0, |
| "residual_var": 0.035873740911483765, |
| "reward": 0.8359375, |
| "reward_std": 0.09271685779094696, |
| "rewards/drgrpo_math_reward/mean": 0.8359375, |
| "rewards/drgrpo_math_reward/std": 0.3710577189922333, |
| "rho2": 0.21874995529651642, |
| "step": 385 |
| }, |
| { |
| "advantages/mean": -4.656612873077393e-10, |
| "advantages/snr": 2.464047832840681e-09, |
| "advantages/std": 0.18898223340511322, |
| "advantages/var": 0.03571428454278469, |
| "completions/clipped_ratio": -2.84375, |
| "epoch": 2.2177650429799427, |
| "grad_norm": 39.815277212712225, |
| "learning_rate": 8.435655349597689e-07, |
| "loss": -0.1584, |
| "num_tokens": 61591873.0, |
| "residual_var": 0.030133936554193497, |
| "reward": 0.859375, |
| "reward_std": 0.06890985369682312, |
| "rewards/drgrpo_math_reward/mean": 0.859375, |
| "rewards/drgrpo_math_reward/std": 0.3483152687549591, |
| "rho2": 0.1562499701976776, |
| "step": 386 |
| }, |
| { |
| "advantages/mean": 1.1641532182693481e-09, |
| "advantages/snr": 4.355861921364802e-09, |
| "advantages/std": 0.267261266708374, |
| "advantages/var": 0.07142858468256463, |
| "completions/clipped_ratio": -2.9375, |
| "epoch": 2.2234957020057307, |
| "grad_norm": 61.97166050903191, |
| "learning_rate": 8.391343903863017e-07, |
| "loss": -0.4498, |
| "num_tokens": 61739668.0, |
| "residual_var": 0.049107152968645096, |
| "reward": 0.8359375, |
| "reward_std": 0.1442738026380539, |
| "rewards/drgrpo_math_reward/mean": 0.8359375, |
| "rewards/drgrpo_math_reward/std": 0.3710577189922333, |
| "rho2": 0.3124999403953552, |
| "step": 387 |
| }, |
| { |
| "advantages/mean": 0.0, |
| "advantages/snr": 0.0, |
| "advantages/std": 0.2082482874393463, |
| "advantages/var": 0.043367349221420604, |
| "completions/clipped_ratio": -2.765625, |
| "epoch": 2.2292263610315186, |
| "grad_norm": 46.20210801677951, |
| "learning_rate": 8.347064859705152e-07, |
| "loss": -0.8056, |
| "num_tokens": 61898279.0, |
| "residual_var": 0.037946440279483795, |
| "reward": 0.8203125, |
| "reward_std": 0.08272669464349747, |
| "rewards/drgrpo_math_reward/mean": 0.8203125, |
| "rewards/drgrpo_math_reward/std": 0.38467901945114136, |
| "rho2": 0.12499997019767761, |
| "step": 388 |
| }, |
| { |
| "advantages/mean": -4.656612873077393e-10, |
| "advantages/snr": 3.209860470786324e-09, |
| "advantages/std": 0.1450721174478531, |
| "advantages/var": 0.02104591926080368, |
| "completions/clipped_ratio": -2.90625, |
| "epoch": 2.2349570200573066, |
| "grad_norm": 35.73079034504171, |
| "learning_rate": 8.302819108993311e-07, |
| "loss": -0.3412, |
| "num_tokens": 62034899.0, |
| "residual_var": 0.018415190279483795, |
| "reward": 0.89453125, |
| "reward_std": 0.04761157184839249, |
| "rewards/drgrpo_math_reward/mean": 0.89453125, |
| "rewards/drgrpo_math_reward/std": 0.3077581524848938, |
| "rho2": 0.12499997764825821, |
| "step": 389 |
| }, |
| { |
| "advantages/mean": -2.3283064365386963e-10, |
| "advantages/snr": 1.2664125371788293e-09, |
| "advantages/std": 0.18385054171085358, |
| "advantages/var": 0.03380102168737431, |
| "completions/clipped_ratio": -2.953125, |
| "epoch": 2.2406876790830945, |
| "grad_norm": 45.68840941118221, |
| "learning_rate": 8.258607542926114e-07, |
| "loss": -0.2069, |
| "num_tokens": 62159693.0, |
| "residual_var": 0.02851962298154831, |
| "reward": 0.96484375, |
| "reward_std": 0.0672023743391037, |
| "rewards/drgrpo_math_reward/mean": 0.96484375, |
| "rewards/drgrpo_math_reward/std": 0.18453538417816162, |
| "rho2": 0.1562499701976776, |
| "step": 390 |
| }, |
| { |
| "advantages/mean": 9.313225746154785e-10, |
| "advantages/snr": 3.116801460854865e-09, |
| "advantages/std": 0.29880714416503906, |
| "advantages/var": 0.08928570940406644, |
| "completions/clipped_ratio": -2.859375, |
| "epoch": 2.2464183381088825, |
| "grad_norm": 54.218033325007504, |
| "learning_rate": 8.214431052013634e-07, |
| "loss": -0.6774, |
| "num_tokens": 62285721.0, |
| "residual_var": 0.0641741156578064, |
| "reward": 0.8671875, |
| "reward_std": 0.16834035515785217, |
| "rewards/drgrpo_math_reward/mean": 0.8671875, |
| "rewards/drgrpo_math_reward/std": 0.3400367796421051, |
| "rho2": 0.2812499403953552, |
| "step": 391 |
| }, |
| { |
| "advantages/mean": -1.1641532182693481e-10, |
| "advantages/snr": 5.059925164551475e-10, |
| "advantages/std": 0.23007319867610931, |
| "advantages/var": 0.05293367674905647, |
| "completions/clipped_ratio": -2.859375, |
| "epoch": 2.2521489971346704, |
| "grad_norm": 79.24952958588219, |
| "learning_rate": 8.170290526059452e-07, |
| "loss": -0.2492, |
| "num_tokens": 62431924.0, |
| "residual_var": 0.03804609179496765, |
| "reward": 0.87109375, |
| "reward_std": 0.11310647428035736, |
| "rewards/drgrpo_math_reward/mean": 0.87109375, |
| "rewards/drgrpo_math_reward/std": 0.33575257658958435, |
| "rho2": 0.2812499403953552, |
| "step": 392 |
| }, |
| { |
| "advantages/mean": 6.984919309616089e-10, |
| "advantages/snr": 3.3790683972744836e-09, |
| "advantages/std": 0.20671138167381287, |
| "advantages/var": 0.04272959531349674, |
| "completions/clipped_ratio": -2.734375, |
| "epoch": 2.2578796561604584, |
| "grad_norm": 51.74546036901536, |
| "learning_rate": 8.126186854142751e-07, |
| "loss": 0.0164, |
| "num_tokens": 62567990.0, |
| "residual_var": 0.03338250517845154, |
| "reward": 0.74609375, |
| "reward_std": 0.08929947018623352, |
| "rewards/drgrpo_math_reward/mean": 0.74609375, |
| "rewards/drgrpo_math_reward/std": 0.4360972046852112, |
| "rho2": 0.2187499701976776, |
| "step": 393 |
| }, |
| { |
| "advantages/mean": -1.1641532182693481e-09, |
| "advantages/snr": 6.332062685894146e-09, |
| "advantages/std": 0.18385054171085358, |
| "advantages/var": 0.03380102168737431, |
| "completions/clipped_ratio": -2.9375, |
| "epoch": 2.2636103151862463, |
| "grad_norm": 103.33108063231454, |
| "learning_rate": 8.0821209246004e-07, |
| "loss": 0.045, |
| "num_tokens": 62709253.0, |
| "residual_var": 0.02851962298154831, |
| "reward": 0.89453125, |
| "reward_std": 0.0672023743391037, |
| "rewards/drgrpo_math_reward/mean": 0.89453125, |
| "rewards/drgrpo_math_reward/std": 0.3077581524848938, |
| "rho2": 0.1562499701976776, |
| "step": 394 |
| }, |
| { |
| "advantages/mean": -2.3283064365386963e-10, |
| "advantages/snr": 8.279465729037828e-10, |
| "advantages/std": 0.28121456503868103, |
| "advantages/var": 0.07908163158989456, |
| "completions/clipped_ratio": -2.796875, |
| "epoch": 2.2693409742120343, |
| "grad_norm": 49.10069181374784, |
| "learning_rate": 8.03809362500905e-07, |
| "loss": -0.7296, |
| "num_tokens": 62864754.0, |
| "residual_var": 0.05683993920683861, |
| "reward": 0.75, |
| "reward_std": 0.14518244564533234, |
| "rewards/drgrpo_math_reward/mean": 0.75, |
| "rewards/drgrpo_math_reward/std": 0.4338609278202057, |
| "rho2": 0.2812499403953552, |
| "step": 395 |
| }, |
| { |
| "advantages/mean": 6.984919309616089e-10, |
| "advantages/snr": 2.4258431181477625e-09, |
| "advantages/std": 0.28793779015541077, |
| "advantages/var": 0.08290817099958137, |
| "completions/clipped_ratio": -2.578125, |
| "epoch": 2.2750716332378222, |
| "grad_norm": 60.20448218570951, |
| "learning_rate": 7.994105842167272e-07, |
| "loss": -0.8891, |
| "num_tokens": 63030352.0, |
| "residual_var": 0.05699938163161278, |
| "reward": 0.75, |
| "reward_std": 0.1626875400543213, |
| "rewards/drgrpo_math_reward/mean": 0.75, |
| "rewards/drgrpo_math_reward/std": 0.4338609278202057, |
| "rho2": 0.3124999403953552, |
| "step": 396 |
| }, |
| { |
| "advantages/mean": 4.656612873077393e-10, |
| "advantages/snr": 2.5328250743576586e-09, |
| "advantages/std": 0.18385054171085358, |
| "advantages/var": 0.03380102168737431, |
| "completions/clipped_ratio": -2.859375, |
| "epoch": 2.28080229226361, |
| "grad_norm": 40.465213721376465, |
| "learning_rate": 7.950158462077697e-07, |
| "loss": -0.189, |
| "num_tokens": 63171710.0, |
| "residual_var": 0.02851962298154831, |
| "reward": 0.80078125, |
| "reward_std": 0.0672023743391037, |
| "rewards/drgrpo_math_reward/mean": 0.80078125, |
| "rewards/drgrpo_math_reward/std": 0.40019527077674866, |
| "rho2": 0.1562499701976776, |
| "step": 397 |
| }, |
| { |
| "advantages/mean": 0.0, |
| "advantages/snr": 0.0, |
| "advantages/std": 0.282346248626709, |
| "advantages/var": 0.07971940411357537, |
| "completions/clipped_ratio": -2.8125, |
| "epoch": 2.286532951289398, |
| "grad_norm": 59.346861894412804, |
| "learning_rate": 7.906252369929154e-07, |
| "loss": -0.5438, |
| "num_tokens": 63322080.0, |
| "residual_var": 0.05480709299445152, |
| "reward": 0.85546875, |
| "reward_std": 0.1528160572052002, |
| "rewards/drgrpo_math_reward/mean": 0.85546875, |
| "rewards/drgrpo_math_reward/std": 0.35231640934944153, |
| "rho2": 0.3124999403953552, |
| "step": 398 |
| }, |
| { |
| "advantages/mean": 1.3969838619232178e-09, |
| "advantages/snr": 7.025359058465554e-09, |
| "advantages/std": 0.19884873926639557, |
| "advantages/var": 0.03954082110783497, |
| "completions/clipped_ratio": -2.9375, |
| "epoch": 2.292263610315186, |
| "grad_norm": 59.615400328722934, |
| "learning_rate": 7.862388450078854e-07, |
| "loss": -0.3341, |
| "num_tokens": 63458274.0, |
| "residual_var": 0.03089127317070961, |
| "reward": 0.828125, |
| "reward_std": 0.08588206768035889, |
| "rewards/drgrpo_math_reward/mean": 0.828125, |
| "rewards/drgrpo_math_reward/std": 0.3780108094215393, |
| "rho2": 0.21874994039535522, |
| "step": 399 |
| }, |
| { |
| "advantages/mean": -1.1641532182693481e-09, |
| "advantages/snr": 4.375439129655397e-09, |
| "advantages/std": 0.26606544852256775, |
| "advantages/var": 0.07079082289751515, |
| "completions/clipped_ratio": -2.9375, |
| "epoch": 2.297994269340974, |
| "grad_norm": 51.14379978645281, |
| "learning_rate": 7.818567586034576e-07, |
| "loss": -0.2393, |
| "num_tokens": 63616372.0, |
| "residual_var": 0.046456485986709595, |
| "reward": 0.87890625, |
| "reward_std": 0.1437433660030365, |
| "rewards/drgrpo_math_reward/mean": 0.87890625, |
| "rewards/drgrpo_math_reward/std": 0.3268752694129944, |
| "rho2": 0.3437499403953552, |
| "step": 400 |
| }, |
| { |
| "advantages/mean": -2.3283064365386963e-10, |
| "advantages/snr": 9.509314693234178e-10, |
| "advantages/std": 0.24484480917453766, |
| "advantages/var": 0.05994898057971576, |
| "completions/clipped_ratio": -2.90625, |
| "epoch": 2.303724928366762, |
| "grad_norm": 62.83649070850356, |
| "learning_rate": 7.774790660436857e-07, |
| "loss": -0.4879, |
| "num_tokens": 63750284.0, |
| "residual_var": 0.048708558082580566, |
| "reward": 0.9140625, |
| "reward_std": 0.11336849629878998, |
| "rewards/drgrpo_math_reward/mean": 0.9140625, |
| "rewards/drgrpo_math_reward/std": 0.28082075715065, |
| "rho2": 0.1874999701976776, |
| "step": 401 |
| }, |
| { |
| "advantages/mean": 6.984919309616089e-10, |
| "advantages/snr": 2.9653419053591074e-09, |
| "advantages/std": 0.2355518937110901, |
| "advantages/var": 0.05548469463088068, |
| "completions/clipped_ratio": -2.890625, |
| "epoch": 2.30945558739255, |
| "grad_norm": 66.03436103957057, |
| "learning_rate": 7.731058555041236e-07, |
| "loss": -0.0605, |
| "num_tokens": 63886228.0, |
| "residual_var": 0.039879634976387024, |
| "reward": 0.85546875, |
| "reward_std": 0.1148114949464798, |
| "rewards/drgrpo_math_reward/mean": 0.85546875, |
| "rewards/drgrpo_math_reward/std": 0.35231640934944153, |
| "rho2": 0.2812499403953552, |
| "step": 402 |
| }, |
| { |
| "advantages/mean": 2.3283064365386963e-10, |
| "advantages/snr": 9.664790625930143e-10, |
| "advantages/std": 0.2409060299396515, |
| "advantages/var": 0.05803571526128426, |
| "completions/clipped_ratio": -2.890625, |
| "epoch": 2.315186246418338, |
| "grad_norm": 85.42827831346187, |
| "learning_rate": 7.687372150700479e-07, |
| "loss": 0.1938, |
| "num_tokens": 64032369.0, |
| "residual_var": 0.04534041881561279, |
| "reward": 0.81640625, |
| "reward_std": 0.11112815141677856, |
| "rewards/drgrpo_math_reward/mean": 0.81640625, |
| "rewards/drgrpo_math_reward/std": 0.387910932302475, |
| "rho2": 0.21874995529651642, |
| "step": 403 |
| }, |
| { |
| "advantages/mean": 4.656612873077393e-10, |
| "advantages/snr": 2.1291806000454375e-09, |
| "advantages/std": 0.2187044471502304, |
| "advantages/var": 0.047831635203287926, |
| "completions/clipped_ratio": -3.0, |
| "epoch": 2.3209169054441263, |
| "grad_norm": 36.125332534122556, |
| "learning_rate": 7.643732327346841e-07, |
| "loss": -0.1364, |
| "num_tokens": 64160653.0, |
| "residual_var": 0.037368472665548325, |
| "reward": 0.93359375, |
| "reward_std": 0.10087842494249344, |
| "rewards/drgrpo_math_reward/mean": 0.93359375, |
| "rewards/drgrpo_math_reward/std": 0.24947863817214966, |
| "rho2": 0.2187499701976776, |
| "step": 404 |
| }, |
| { |
| "advantages/mean": 1.1641532182693481e-10, |
| "advantages/snr": 4.754657346617089e-10, |
| "advantages/std": 0.24484480917453766, |
| "advantages/var": 0.05994898057971576, |
| "completions/clipped_ratio": -2.703125, |
| "epoch": 2.326647564469914, |
| "grad_norm": 48.26815772105712, |
| "learning_rate": 7.60013996397434e-07, |
| "loss": -0.5838, |
| "num_tokens": 64339916.0, |
| "residual_var": 0.04308834299445152, |
| "reward": 0.71875, |
| "reward_std": 0.1263929009437561, |
| "rewards/drgrpo_math_reward/mean": 0.71875, |
| "rewards/drgrpo_math_reward/std": 0.45048993825912476, |
| "rho2": 0.2812499403953552, |
| "step": 405 |
| }, |
| { |
| "advantages/mean": 0.0, |
| "advantages/snr": 0.0, |
| "advantages/std": 0.22446081042289734, |
| "advantages/var": 0.05038265541570386, |
| "completions/clipped_ratio": -2.953125, |
| "epoch": 2.3323782234957022, |
| "grad_norm": 43.58466398770359, |
| "learning_rate": 7.556595938621058e-07, |
| "loss": -0.1253, |
| "num_tokens": 64484115.0, |
| "residual_var": 0.04093591868877411, |
| "reward": 0.85546875, |
| "reward_std": 0.10311631113290787, |
| "rewards/drgrpo_math_reward/mean": 0.85546875, |
| "rewards/drgrpo_math_reward/std": 0.35231640934944153, |
| "rho2": 0.18749995529651642, |
| "step": 406 |
| }, |
| { |
| "advantages/mean": 5.820766091346741e-10, |
| "advantages/snr": 2.069866432259457e-09, |
| "advantages/std": 0.28121456503868103, |
| "advantages/var": 0.07908163158989456, |
| "completions/clipped_ratio": -2.78125, |
| "epoch": 2.3381088825214897, |
| "grad_norm": 53.382125444657255, |
| "learning_rate": 7.513101128351453e-07, |
| "loss": 0.147, |
| "num_tokens": 64631940.0, |
| "residual_var": 0.05931123346090317, |
| "reward": 0.7578125, |
| "reward_std": 0.15216940641403198, |
| "rewards/drgrpo_math_reward/mean": 0.7578125, |
| "rewards/drgrpo_math_reward/std": 0.4292463958263397, |
| "rho2": 0.2499999701976776, |
| "step": 407 |
| }, |
| { |
| "advantages/mean": 4.656612873077393e-10, |
| "advantages/snr": 2.0362738671494303e-09, |
| "advantages/std": 0.22868302464485168, |
| "advantages/var": 0.05229592576071784, |
| "completions/clipped_ratio": -2.765625, |
| "epoch": 2.343839541547278, |
| "grad_norm": 60.18215125277809, |
| "learning_rate": 7.469656409238683e-07, |
| "loss": -0.3729, |
| "num_tokens": 64772308.0, |
| "residual_var": 0.039221953600645065, |
| "reward": 0.8359375, |
| "reward_std": 0.11192695796489716, |
| "rewards/drgrpo_math_reward/mean": 0.8359375, |
| "rewards/drgrpo_math_reward/std": 0.3710577189922333, |
| "rho2": 0.24999995529651642, |
| "step": 408 |
| }, |
| { |
| "advantages/mean": -6.984919309616089e-10, |
| "advantages/snr": 3.092355309744537e-09, |
| "advantages/std": 0.22587698698043823, |
| "advantages/var": 0.05102041324736106, |
| "completions/clipped_ratio": -2.953125, |
| "epoch": 2.349570200573066, |
| "grad_norm": 45.72333700789412, |
| "learning_rate": 7.426262656346978e-07, |
| "loss": -0.6002, |
| "num_tokens": 64911639.0, |
| "residual_var": 0.03985970839858055, |
| "reward": 0.890625, |
| "reward_std": 0.10429581999778748, |
| "rewards/drgrpo_math_reward/mean": 0.890625, |
| "rewards/drgrpo_math_reward/std": 0.31272050738334656, |
| "rho2": 0.21874995529651642, |
| "step": 409 |
| }, |
| { |
| "advantages/mean": 0.0, |
| "advantages/snr": 0.0, |
| "advantages/std": 0.0, |
| "advantages/var": 0.0, |
| "completions/clipped_ratio": -3.0, |
| "epoch": 2.355300859598854, |
| "grad_norm": 0.0, |
| "learning_rate": 7.382920743713998e-07, |
| "loss": 0.0, |
| "num_tokens": 65039171.0, |
| "residual_var": 9.99999993922529e-09, |
| "reward": 0.9375, |
| "reward_std": 0.0, |
| "rewards/drgrpo_math_reward/mean": 0.9375, |
| "rewards/drgrpo_math_reward/std": 0.24253563582897186, |
| "rho2": 0.0, |
| "step": 410 |
| }, |
| { |
| "advantages/mean": 2.3283064365386963e-10, |
| "advantages/snr": 8.830796510959388e-10, |
| "advantages/std": 0.2636575698852539, |
| "advantages/var": 0.06951531415779755, |
| "completions/clipped_ratio": -2.9375, |
| "epoch": 2.361031518624642, |
| "grad_norm": 93.30075449743337, |
| "learning_rate": 7.33963154433325e-07, |
| "loss": -0.0906, |
| "num_tokens": 65182661.0, |
| "residual_var": 0.04779178649187088, |
| "reward": 0.83984375, |
| "reward_std": 0.13611222803592682, |
| "rewards/drgrpo_math_reward/mean": 0.83984375, |
| "rewards/drgrpo_math_reward/std": 0.36746934056282043, |
| "rho2": 0.3124999403953552, |
| "step": 411 |
| }, |
| { |
| "advantages/mean": -1.6298145055770874e-09, |
| "advantages/snr": 6.1815575576715716e-09, |
| "advantages/std": 0.2636575698852539, |
| "advantages/var": 0.06951531415779755, |
| "completions/clipped_ratio": -2.96875, |
| "epoch": 2.36676217765043, |
| "grad_norm": 94.04588923049533, |
| "learning_rate": 7.29639593013647e-07, |
| "loss": 0.2124, |
| "num_tokens": 65331080.0, |
| "residual_var": 0.04779178649187088, |
| "reward": 0.89453125, |
| "reward_std": 0.14256632328033447, |
| "rewards/drgrpo_math_reward/mean": 0.89453125, |
| "rewards/drgrpo_math_reward/std": 0.3077581524848938, |
| "rho2": 0.3124999403953552, |
| "step": 412 |
| }, |
| { |
| "advantages/mean": -6.984919309616089e-10, |
| "advantages/snr": 2.6738836260536997e-09, |
| "advantages/std": 0.26122748851776123, |
| "advantages/var": 0.06823980075729708, |
| "completions/clipped_ratio": -2.90625, |
| "epoch": 2.372492836676218, |
| "grad_norm": 52.465569457913595, |
| "learning_rate": 7.2532147719761e-07, |
| "loss": -0.4931, |
| "num_tokens": 65476367.0, |
| "residual_var": 0.053312353789806366, |
| "reward": 0.87890625, |
| "reward_std": 0.12783199548721313, |
| "rewards/drgrpo_math_reward/mean": 0.87890625, |
| "rewards/drgrpo_math_reward/std": 0.3268752694129944, |
| "rho2": 0.21874995529651642, |
| "step": 413 |
| }, |
| { |
| "advantages/mean": 6.984919309616089e-10, |
| "advantages/snr": 3.329736467094508e-09, |
| "advantages/std": 0.2097739279270172, |
| "advantages/var": 0.04400510083792941, |
| "completions/clipped_ratio": -3.0, |
| "epoch": 2.378223495702006, |
| "grad_norm": 39.472581591236306, |
| "learning_rate": 7.210088939607708e-07, |
| "loss": 0.0679, |
| "num_tokens": 65615937.0, |
| "residual_var": 0.03300383687019348, |
| "reward": 0.92578125, |
| "reward_std": 0.0969306156039238, |
| "rewards/drgrpo_math_reward/mean": 0.92578125, |
| "rewards/drgrpo_math_reward/std": 0.2626400291919708, |
| "rho2": 0.24999994039535522, |
| "step": 414 |
| }, |
| { |
| "advantages/mean": 0.0, |
| "advantages/snr": 0.0, |
| "advantages/std": 0.13832083344459534, |
| "advantages/var": 0.019132652964807484, |
| "completions/clipped_ratio": -2.859375, |
| "epoch": 2.383954154727794, |
| "grad_norm": 32.19586387362773, |
| "learning_rate": 7.167019301672508e-07, |
| "loss": -0.2886, |
| "num_tokens": 65760842.0, |
| "residual_var": 0.017936870455741882, |
| "reward": 0.8359375, |
| "reward_std": 0.0388009138405323, |
| "rewards/drgrpo_math_reward/mean": 0.8359375, |
| "rewards/drgrpo_math_reward/std": 0.3710577189922333, |
| "rho2": 0.062499988824129105, |
| "step": 415 |
| }, |
| { |
| "advantages/mean": -1.1641532182693481e-10, |
| "advantages/snr": 5.186442510510295e-10, |
| "advantages/std": 0.22446082532405853, |
| "advantages/var": 0.050382662105157516, |
| "completions/clipped_ratio": -2.8125, |
| "epoch": 2.3896848137535818, |
| "grad_norm": 52.24445047939618, |
| "learning_rate": 7.124006725679828e-07, |
| "loss": -0.283, |
| "num_tokens": 65917872.0, |
| "residual_var": 0.037787001579999924, |
| "reward": 0.80078125, |
| "reward_std": 0.10376540571451187, |
| "rewards/drgrpo_math_reward/mean": 0.80078125, |
| "rewards/drgrpo_math_reward/std": 0.40019527077674866, |
| "rho2": 0.24999995529651642, |
| "step": 416 |
| }, |
| { |
| "advantages/mean": -4.656612873077393e-10, |
| "advantages/snr": 2.115126408796943e-09, |
| "advantages/std": 0.220157653093338, |
| "advantages/var": 0.048469392215566565, |
| "completions/clipped_ratio": -2.84375, |
| "epoch": 2.3954154727793697, |
| "grad_norm": 74.83747879148169, |
| "learning_rate": 7.081052077989667e-07, |
| "loss": -0.2753, |
| "num_tokens": 66061607.0, |
| "residual_var": 0.03635205700993538, |
| "reward": 0.90625, |
| "reward_std": 0.10205793380737305, |
| "rewards/drgrpo_math_reward/mean": 0.90625, |
| "rewards/drgrpo_math_reward/std": 0.2920515835285187, |
| "rho2": 0.24999995529651642, |
| "step": 417 |
| }, |
| { |
| "advantages/mean": 2.3283064365386963e-10, |
| "advantages/snr": 1.2910053593143641e-09, |
| "advantages/std": 0.18034830689430237, |
| "advantages/var": 0.03252551179964147, |
| "completions/clipped_ratio": -3.0, |
| "epoch": 2.4011461318051577, |
| "grad_norm": 31.091632611376465, |
| "learning_rate": 7.038156223795224e-07, |
| "loss": -0.1819, |
| "num_tokens": 66191762.0, |
| "residual_var": 0.027443408966064453, |
| "reward": 0.87109375, |
| "reward_std": 0.07194654643535614, |
| "rewards/drgrpo_math_reward/mean": 0.87109375, |
| "rewards/drgrpo_math_reward/std": 0.33575257658958435, |
| "rho2": 0.1562499701976776, |
| "step": 418 |
| }, |
| { |
| "advantages/mean": 3.4924596548080444e-10, |
| "advantages/snr": 1.3760799961512823e-09, |
| "advantages/std": 0.253797709941864, |
| "advantages/var": 0.06441327757173454, |
| "completions/clipped_ratio": -2.828125, |
| "epoch": 2.4068767908309456, |
| "grad_norm": 50.8649830535786, |
| "learning_rate": 6.995320027105481e-07, |
| "loss": -0.2243, |
| "num_tokens": 66344202.0, |
| "residual_var": 0.05032287538051605, |
| "reward": 0.75390625, |
| "reward_std": 0.12441704422235489, |
| "rewards/drgrpo_math_reward/mean": 0.75390625, |
| "rewards/drgrpo_math_reward/std": 0.43157756328582764, |
| "rho2": 0.2187499701976776, |
| "step": 419 |
| }, |
| { |
| "advantages/mean": 5.820766091346741e-10, |
| "advantages/snr": 2.5610062551388323e-09, |
| "advantages/std": 0.22728432714939117, |
| "advantages/var": 0.051658165367751474, |
| "completions/clipped_ratio": -2.765625, |
| "epoch": 2.4126074498567336, |
| "grad_norm": 55.606646356605715, |
| "learning_rate": 6.952544350727799e-07, |
| "loss": -0.2911, |
| "num_tokens": 66490917.0, |
| "residual_var": 0.041972268372774124, |
| "reward": 0.81640625, |
| "reward_std": 0.09837214648723602, |
| "rewards/drgrpo_math_reward/mean": 0.81640625, |
| "rewards/drgrpo_math_reward/std": 0.387910932302475, |
| "rho2": 0.1874999701976776, |
| "step": 420 |
| }, |
| { |
| "advantages/mean": -9.313225746154785e-10, |
| "advantages/snr": 4.0725477342988605e-09, |
| "advantages/std": 0.22868302464485168, |
| "advantages/var": 0.05229592576071784, |
| "completions/clipped_ratio": -2.90625, |
| "epoch": 2.4183381088825215, |
| "grad_norm": 62.95766779171665, |
| "learning_rate": 6.909830056250526e-07, |
| "loss": -0.2863, |
| "num_tokens": 66648538.0, |
| "residual_var": 0.03922194987535477, |
| "reward": 0.828125, |
| "reward_std": 0.1054728776216507, |
| "rewards/drgrpo_math_reward/mean": 0.828125, |
| "rewards/drgrpo_math_reward/std": 0.3780108094215393, |
| "rho2": 0.24999994039535522, |
| "step": 421 |
| }, |
| { |
| "advantages/mean": 1.1641532182693481e-09, |
| "advantages/snr": 5.7177670636746294e-09, |
| "advantages/std": 0.20360276103019714, |
| "advantages/var": 0.041454084299119565, |
| "completions/clipped_ratio": -2.96875, |
| "epoch": 2.4240687679083095, |
| "grad_norm": 32.371041420700465, |
| "learning_rate": 6.867178004025676e-07, |
| "loss": -0.348, |
| "num_tokens": 66792036.0, |
| "residual_var": 0.03238601237535477, |
| "reward": 0.83984375, |
| "reward_std": 0.08758954703807831, |
| "rewards/drgrpo_math_reward/mean": 0.83984375, |
| "rewards/drgrpo_math_reward/std": 0.36746934056282043, |
| "rho2": 0.21874994039535522, |
| "step": 422 |
| }, |
| { |
| "advantages/mean": -1.1641532182693481e-10, |
| "advantages/snr": 6.160119582101702e-10, |
| "advantages/std": 0.18898223340511322, |
| "advantages/var": 0.03571428454278469, |
| "completions/clipped_ratio": -2.828125, |
| "epoch": 2.4297994269340975, |
| "grad_norm": 39.984557826282405, |
| "learning_rate": 6.824589053151557e-07, |
| "loss": -0.333, |
| "num_tokens": 66929298.0, |
| "residual_var": 0.030133940279483795, |
| "reward": 0.8984375, |
| "reward_std": 0.07536394149065018, |
| "rewards/drgrpo_math_reward/mean": 0.8984375, |
| "rewards/drgrpo_math_reward/std": 0.3026638329029083, |
| "rho2": 0.1562499701976776, |
| "step": 423 |
| }, |
| { |
| "advantages/mean": -9.313225746154785e-10, |
| "advantages/snr": 3.725290149450308e-09, |
| "advantages/std": 0.25, |
| "advantages/var": 0.0625, |
| "completions/clipped_ratio": -2.921875, |
| "epoch": 2.4355300859598854, |
| "grad_norm": 50.055448927921034, |
| "learning_rate": 6.782064061455504e-07, |
| "loss": -0.4335, |
| "num_tokens": 67077150.0, |
| "residual_var": 0.046875014901161194, |
| "reward": 0.890625, |
| "reward_std": 0.12217670679092407, |
| "rewards/drgrpo_math_reward/mean": 0.890625, |
| "rewards/drgrpo_math_reward/std": 0.31272050738334656, |
| "rho2": 0.24999994039535522, |
| "step": 424 |
| }, |
| { |
| "advantages/mean": -1.280568540096283e-09, |
| "advantages/snr": 4.590879840937069e-09, |
| "advantages/std": 0.2789374887943268, |
| "advantages/var": 0.07780612265488518, |
| "completions/clipped_ratio": -2.9375, |
| "epoch": 2.4412607449856734, |
| "grad_norm": 57.2938109556653, |
| "learning_rate": 6.739603885476582e-07, |
| "loss": -0.111, |
| "num_tokens": 67227554.0, |
| "residual_var": 0.06564892828464508, |
| "reward": 0.8203125, |
| "reward_std": 0.13743507862091064, |
| "rewards/drgrpo_math_reward/mean": 0.8203125, |
| "rewards/drgrpo_math_reward/std": 0.38467901945114136, |
| "rho2": 0.1562499701976776, |
| "step": 425 |
| }, |
| { |
| "advantages/mean": -1.1641532182693481e-09, |
| "advantages/snr": 4.28010223419274e-09, |
| "advantages/std": 0.27199190855026245, |
| "advantages/var": 0.07397959831681433, |
| "completions/clipped_ratio": -2.75, |
| "epoch": 2.4469914040114613, |
| "grad_norm": 55.31860953422715, |
| "learning_rate": 6.697209380448332e-07, |
| "loss": -0.703, |
| "num_tokens": 67387084.0, |
| "residual_var": 0.05548471212387085, |
| "reward": 0.765625, |
| "reward_std": 0.14059044420719147, |
| "rewards/drgrpo_math_reward/mean": 0.765625, |
| "rewards/drgrpo_math_reward/std": 0.42443734407424927, |
| "rho2": 0.24999995529651642, |
| "step": 426 |
| }, |
| { |
| "advantages/mean": -6.984919309616089e-10, |
| "advantages/snr": 3.4306602382047774e-09, |
| "advantages/std": 0.20360276103019714, |
| "advantages/var": 0.041454084299119565, |
| "completions/clipped_ratio": -2.921875, |
| "epoch": 2.4527220630372493, |
| "grad_norm": 42.06385390657045, |
| "learning_rate": 6.654881400281547e-07, |
| "loss": -0.0225, |
| "num_tokens": 67524018.0, |
| "residual_var": 0.03368144854903221, |
| "reward": 0.92578125, |
| "reward_std": 0.08166831731796265, |
| "rewards/drgrpo_math_reward/mean": 0.92578125, |
| "rewards/drgrpo_math_reward/std": 0.2626400291919708, |
| "rho2": 0.1874999701976776, |
| "step": 427 |
| }, |
| { |
| "advantages/mean": 0.0, |
| "advantages/snr": 0.0, |
| "advantages/std": 0.24743583798408508, |
| "advantages/var": 0.0612244939188864, |
| "completions/clipped_ratio": -2.96875, |
| "epoch": 2.458452722063037, |
| "grad_norm": 49.93600952633924, |
| "learning_rate": 6.612620797547086e-07, |
| "loss": -0.0831, |
| "num_tokens": 67655964.0, |
| "residual_var": 0.04591837897896767, |
| "reward": 0.890625, |
| "reward_std": 0.1140126883983612, |
| "rewards/drgrpo_math_reward/mean": 0.890625, |
| "rewards/drgrpo_math_reward/std": 0.31272050738334656, |
| "rho2": 0.24999994039535522, |
| "step": 428 |
| }, |
| { |
| "advantages/mean": 0.0, |
| "advantages/snr": 0.0, |
| "advantages/std": 0.23007319867610931, |
| "advantages/var": 0.05293367674905647, |
| "completions/clipped_ratio": -2.828125, |
| "epoch": 2.464183381088825, |
| "grad_norm": 73.33296408165174, |
| "learning_rate": 6.570428423458686e-07, |
| "loss": -0.0866, |
| "num_tokens": 67818794.0, |
| "residual_var": 0.03804609179496765, |
| "reward": 0.80859375, |
| "reward_std": 0.11310647428035736, |
| "rewards/drgrpo_math_reward/mean": 0.80859375, |
| "rewards/drgrpo_math_reward/std": 0.39417871832847595, |
| "rho2": 0.2812499701976776, |
| "step": 429 |
| }, |
| { |
| "advantages/mean": -4.656612873077393e-10, |
| "advantages/snr": 2.061570206496358e-09, |
| "advantages/std": 0.22587698698043823, |
| "advantages/var": 0.05102041324736106, |
| "completions/clipped_ratio": -2.828125, |
| "epoch": 2.469914040114613, |
| "grad_norm": 41.3588146711098, |
| "learning_rate": 6.528305127855815e-07, |
| "loss": -0.1169, |
| "num_tokens": 67958774.0, |
| "residual_var": 0.03985970839858055, |
| "reward": 0.8125, |
| "reward_std": 0.09784172475337982, |
| "rewards/drgrpo_math_reward/mean": 0.8125, |
| "rewards/drgrpo_math_reward/std": 0.3910769522190094, |
| "rho2": 0.21874995529651642, |
| "step": 430 |
| }, |
| { |
| "advantages/mean": -6.984919309616089e-10, |
| "advantages/snr": 2.5792027536326915e-09, |
| "advantages/std": 0.27081698179244995, |
| "advantages/var": 0.07334183762717217, |
| "completions/clipped_ratio": -2.859375, |
| "epoch": 2.475644699140401, |
| "grad_norm": 47.53806778721076, |
| "learning_rate": 6.486251759186572e-07, |
| "loss": -0.0798, |
| "num_tokens": 68123504.0, |
| "residual_var": 0.05271446332335472, |
| "reward": 0.80078125, |
| "reward_std": 0.1336059421300888, |
| "rewards/drgrpo_math_reward/mean": 0.80078125, |
| "rewards/drgrpo_math_reward/std": 0.40019527077674866, |
| "rho2": 0.2812499403953552, |
| "step": 431 |
| }, |
| { |
| "advantages/mean": -4.656612873077393e-10, |
| "advantages/snr": 2.1291804549763264e-09, |
| "advantages/std": 0.2187044620513916, |
| "advantages/var": 0.04783164172118859, |
| "completions/clipped_ratio": -2.90625, |
| "epoch": 2.481375358166189, |
| "grad_norm": 54.79795531786569, |
| "learning_rate": 6.444269164490578e-07, |
| "loss": -0.4716, |
| "num_tokens": 68270822.0, |
| "residual_var": 0.04035795107483864, |
| "reward": 0.82421875, |
| "reward_std": 0.08732114732265472, |
| "rewards/drgrpo_math_reward/mean": 0.82421875, |
| "rewards/drgrpo_math_reward/std": 0.3813795745372772, |
| "rho2": 0.15624995529651642, |
| "step": 432 |
| }, |
| { |
| "advantages/mean": 1.0477378964424133e-09, |
| "advantages/snr": 6.117115762230131e-09, |
| "advantages/std": 0.17127971351146698, |
| "advantages/var": 0.029336740260570204, |
| "completions/clipped_ratio": -2.90625, |
| "epoch": 2.487106017191977, |
| "grad_norm": 31.992254782418268, |
| "learning_rate": 6.402358189381933e-07, |
| "loss": 0.0456, |
| "num_tokens": 68409422.0, |
| "residual_var": 0.025669652968645096, |
| "reward": 0.84375, |
| "reward_std": 0.056153833866119385, |
| "rewards/drgrpo_math_reward/mean": 0.84375, |
| "rewards/drgrpo_math_reward/std": 0.3638034462928772, |
| "rho2": 0.1249999850988388, |
| "step": 433 |
| }, |
| { |
| "advantages/mean": 0.0, |
| "advantages/snr": 0.0, |
| "advantages/std": 0.23282866179943085, |
| "advantages/var": 0.05420918575531375, |
| "completions/clipped_ratio": -2.859375, |
| "epoch": 2.492836676217765, |
| "grad_norm": 45.283505671775984, |
| "learning_rate": 6.360519678032157e-07, |
| "loss": -0.039, |
| "num_tokens": 68570279.0, |
| "residual_var": 0.045739009976387024, |
| "reward": 0.80859375, |
| "reward_std": 0.10061003267765045, |
| "rewards/drgrpo_math_reward/mean": 0.80859375, |
| "rewards/drgrpo_math_reward/std": 0.39417871832847595, |
| "rho2": 0.1562499701976776, |
| "step": 434 |
| }, |
| { |
| "advantages/mean": -4.656612873077393e-10, |
| "advantages/snr": 2.7487602347642132e-09, |
| "advantages/std": 0.16940774023532867, |
| "advantages/var": 0.028698982451640598, |
| "completions/clipped_ratio": -2.859375, |
| "epoch": 2.498567335243553, |
| "grad_norm": 41.997973250055495, |
| "learning_rate": 6.31875447315322e-07, |
| "loss": 0.147, |
| "num_tokens": 68717741.0, |
| "residual_var": 0.02421477437019348, |
| "reward": 0.85546875, |
| "reward_std": 0.062077511101961136, |
| "rewards/drgrpo_math_reward/mean": 0.85546875, |
| "rewards/drgrpo_math_reward/std": 0.35231640934944153, |
| "rho2": 0.1562499701976776, |
| "step": 435 |
| }, |
| { |
| "advantages/mean": -1.1641532182693481e-09, |
| "advantages/snr": 3.66736918388653e-09, |
| "advantages/std": 0.31743550300598145, |
| "advantages/var": 0.10076529856866046, |
| "completions/clipped_ratio": -2.890625, |
| "epoch": 2.504297994269341, |
| "grad_norm": 68.79884909207622, |
| "learning_rate": 6.277063415980548e-07, |
| "loss": -0.269, |
| "num_tokens": 68869663.0, |
| "residual_var": 0.07242507487535477, |
| "reward": 0.7890625, |
| "reward_std": 0.18675412237644196, |
| "rewards/drgrpo_math_reward/mean": 0.7890625, |
| "rewards/drgrpo_math_reward/std": 0.4087733030319214, |
| "rho2": 0.2812499403953552, |
| "step": 436 |
| }, |
| { |
| "advantages/mean": -2.3283064365386963e-10, |
| "advantages/snr": 9.219622438173128e-10, |
| "advantages/std": 0.25253814458847046, |
| "advantages/var": 0.06377551447218721, |
| "completions/clipped_ratio": -2.953125, |
| "epoch": 2.510028653295129, |
| "grad_norm": 59.63463907817795, |
| "learning_rate": 6.23544734625608e-07, |
| "loss": -0.1937, |
| "num_tokens": 69011738.0, |
| "residual_var": 0.04783164709806442, |
| "reward": 0.828125, |
| "reward_std": 0.12388662248849869, |
| "rewards/drgrpo_math_reward/mean": 0.828125, |
| "rewards/drgrpo_math_reward/std": 0.3780108094215393, |
| "rho2": 0.24999995529651642, |
| "step": 437 |
| }, |
| { |
| "advantages/mean": 3.4924596548080444e-10, |
| "advantages/snr": 1.4418180357225341e-09, |
| "advantages/std": 0.24222609400749207, |
| "advantages/var": 0.058673480618126383, |
| "completions/clipped_ratio": -2.78125, |
| "epoch": 2.5157593123209168, |
| "grad_norm": 40.77789706768845, |
| "learning_rate": 6.193907102211358e-07, |
| "loss": -0.2813, |
| "num_tokens": 69161665.0, |
| "residual_var": 0.04400511831045151, |
| "reward": 0.7265625, |
| "reward_std": 0.11230766773223877, |
| "rewards/drgrpo_math_reward/mean": 0.7265625, |
| "rewards/drgrpo_math_reward/std": 0.446596622467041, |
| "rho2": 0.24999995529651642, |
| "step": 438 |
| }, |
| { |
| "advantages/mean": 4.656612873077393e-10, |
| "advantages/snr": 1.8168728588206165e-09, |
| "advantages/std": 0.2562982141971588, |
| "advantages/var": 0.0656887746006527, |
| "completions/clipped_ratio": -2.90625, |
| "epoch": 2.5214899713467047, |
| "grad_norm": 50.03689448920175, |
| "learning_rate": 6.152443520550641e-07, |
| "loss": -0.9247, |
| "num_tokens": 69316042.0, |
| "residual_var": 0.04721382260322571, |
| "reward": 0.75390625, |
| "reward_std": 0.13151532411575317, |
| "rewards/drgrpo_math_reward/mean": 0.75390625, |
| "rewards/drgrpo_math_reward/std": 0.43157756328582764, |
| "rho2": 0.2812499403953552, |
| "step": 439 |
| }, |
| { |
| "advantages/mean": 1.1641532182693481e-09, |
| "advantages/snr": 4.914075334034159e-09, |
| "advantages/std": 0.23690177500247955, |
| "advantages/var": 0.056122450999325446, |
| "completions/clipped_ratio": -2.9375, |
| "epoch": 2.5272206303724927, |
| "grad_norm": 49.160447041501605, |
| "learning_rate": 6.111057436434055e-07, |
| "loss": -0.2751, |
| "num_tokens": 69455761.0, |
| "residual_var": 0.04209184646606445, |
| "reward": 0.9296875, |
| "reward_std": 0.10888782143592834, |
| "rewards/drgrpo_math_reward/mean": 0.9296875, |
| "rewards/drgrpo_math_reward/std": 0.2561737895011902, |
| "rho2": 0.24999994039535522, |
| "step": 440 |
| }, |
| { |
| "advantages/mean": 2.3283064365386963e-10, |
| "advantages/snr": 9.219622438173128e-10, |
| "advantages/std": 0.25253814458847046, |
| "advantages/var": 0.06377551447218721, |
| "completions/clipped_ratio": -2.96875, |
| "epoch": 2.532951289398281, |
| "grad_norm": 47.156529472775865, |
| "learning_rate": 6.069749683460764e-07, |
| "loss": -0.1707, |
| "num_tokens": 69599648.0, |
| "residual_var": 0.05181761458516121, |
| "reward": 0.890625, |
| "reward_std": 0.11678344011306763, |
| "rewards/drgrpo_math_reward/mean": 0.890625, |
| "rewards/drgrpo_math_reward/std": 0.31272050738334656, |
| "rho2": 0.1874999701976776, |
| "step": 441 |
| }, |
| { |
| "advantages/mean": 4.656612873077393e-10, |
| "advantages/snr": 1.9656301336136633e-09, |
| "advantages/std": 0.23690177500247955, |
| "advantages/var": 0.056122450999325446, |
| "completions/clipped_ratio": -2.8125, |
| "epoch": 2.5386819484240686, |
| "grad_norm": 46.898459689438916, |
| "learning_rate": 6.028521093652194e-07, |
| "loss": -0.7164, |
| "num_tokens": 69765995.0, |
| "residual_var": 0.043845679610967636, |
| "reward": 0.8203125, |
| "reward_std": 0.10942068696022034, |
| "rewards/drgrpo_math_reward/mean": 0.8203125, |
| "rewards/drgrpo_math_reward/std": 0.38467901945114136, |
| "rho2": 0.21874995529651642, |
| "step": 442 |
| }, |
| { |
| "advantages/mean": 6.984919309616089e-10, |
| "advantages/snr": 3.054410800724145e-09, |
| "advantages/std": 0.22868302464485168, |
| "advantages/var": 0.05229592576071784, |
| "completions/clipped_ratio": -2.71875, |
| "epoch": 2.544412607449857, |
| "grad_norm": 36.281922684341176, |
| "learning_rate": 5.987372497435258e-07, |
| "loss": -0.2242, |
| "num_tokens": 69918989.0, |
| "residual_var": 0.04739318788051605, |
| "reward": 0.7890625, |
| "reward_std": 0.09179937839508057, |
| "rewards/drgrpo_math_reward/mean": 0.7890625, |
| "rewards/drgrpo_math_reward/std": 0.4087733030319214, |
| "rho2": 0.0937499850988388, |
| "step": 443 |
| }, |
| { |
| "advantages/mean": -4.656612873077393e-10, |
| "advantages/snr": 2.1730857333241115e-09, |
| "advantages/std": 0.2142857313156128, |
| "advantages/var": 0.045918374645467, |
| "completions/clipped_ratio": -2.828125, |
| "epoch": 2.5501432664756445, |
| "grad_norm": 37.74002948472885, |
| "learning_rate": 5.946304723625639e-07, |
| "loss": -0.0797, |
| "num_tokens": 70061853.0, |
| "residual_var": 0.038743630051612854, |
| "reward": 0.7734375, |
| "reward_std": 0.07915958762168884, |
| "rewards/drgrpo_math_reward/mean": 0.7734375, |
| "rewards/drgrpo_math_reward/std": 0.41942715644836426, |
| "rho2": 0.1562499701976776, |
| "step": 444 |
| }, |
| { |
| "advantages/mean": 1.6298145055770874e-09, |
| "advantages/snr": 7.2154957227372524e-09, |
| "advantages/std": 0.22587698698043823, |
| "advantages/var": 0.05102041324736106, |
| "completions/clipped_ratio": -2.6875, |
| "epoch": 2.555873925501433, |
| "grad_norm": 35.717771683637935, |
| "learning_rate": 5.905318599411097e-07, |
| "loss": -0.262, |
| "num_tokens": 70237613.0, |
| "residual_var": 0.03985970839858055, |
| "reward": 0.7578125, |
| "reward_std": 0.09784172475337982, |
| "rewards/drgrpo_math_reward/mean": 0.7578125, |
| "rewards/drgrpo_math_reward/std": 0.4292463958263397, |
| "rho2": 0.21874995529651642, |
| "step": 445 |
| }, |
| { |
| "advantages/mean": 0.0, |
| "advantages/snr": 0.0, |
| "advantages/std": 0.18728730082511902, |
| "advantages/var": 0.03507653305035863, |
| "completions/clipped_ratio": -2.90625, |
| "epoch": 2.5616045845272204, |
| "grad_norm": 40.02898953814623, |
| "learning_rate": 5.864414950334795e-07, |
| "loss": -0.2901, |
| "num_tokens": 70384711.0, |
| "residual_var": 0.028499692678451538, |
| "reward": 0.91796875, |
| "reward_std": 0.07483351975679398, |
| "rewards/drgrpo_math_reward/mean": 0.91796875, |
| "rewards/drgrpo_math_reward/std": 0.2749498784542084, |
| "rho2": 0.1874999701976776, |
| "step": 446 |
| }, |
| { |
| "advantages/mean": 6.984919309616089e-10, |
| "advantages/snr": 3.911554501079231e-09, |
| "advantages/std": 0.1785714328289032, |
| "advantages/var": 0.03188775662256749, |
| "completions/clipped_ratio": -2.84375, |
| "epoch": 2.567335243553009, |
| "grad_norm": 36.74075622863581, |
| "learning_rate": 5.82359460027869e-07, |
| "loss": 0.021, |
| "num_tokens": 70512660.0, |
| "residual_var": 0.025908811017870903, |
| "reward": 0.890625, |
| "reward_std": 0.07141612470149994, |
| "rewards/drgrpo_math_reward/mean": 0.890625, |
| "rewards/drgrpo_math_reward/std": 0.31272050738334656, |
| "rho2": 0.18749994039535522, |
| "step": 447 |
| }, |
| { |
| "advantages/mean": 1.1641532182693481e-10, |
| "advantages/snr": 5.090684667873576e-10, |
| "advantages/std": 0.22868302464485168, |
| "advantages/var": 0.05229592576071784, |
| "completions/clipped_ratio": -2.890625, |
| "epoch": 2.5730659025787963, |
| "grad_norm": 42.09374187090726, |
| "learning_rate": 5.782858371446927e-07, |
| "loss": -0.4403, |
| "num_tokens": 70665873.0, |
| "residual_var": 0.03922194987535477, |
| "reward": 0.828125, |
| "reward_std": 0.11192697286605835, |
| "rewards/drgrpo_math_reward/mean": 0.828125, |
| "rewards/drgrpo_math_reward/std": 0.3780108094215393, |
| "rho2": 0.24999994039535522, |
| "step": 448 |
| }, |
| { |
| "advantages/mean": 6.984919309616089e-10, |
| "advantages/snr": 4.034460560317909e-09, |
| "advantages/std": 0.17313142120838165, |
| "advantages/var": 0.029974489009634064, |
| "completions/clipped_ratio": -2.828125, |
| "epoch": 2.5787965616045847, |
| "grad_norm": 26.676232917963386, |
| "learning_rate": 5.742207084349273e-07, |
| "loss": -0.0319, |
| "num_tokens": 70810642.0, |
| "residual_var": 0.02435428649187088, |
| "reward": 0.80078125, |
| "reward_std": 0.06970866024494171, |
| "rewards/drgrpo_math_reward/mean": 0.80078125, |
| "rewards/drgrpo_math_reward/std": 0.40019527077674866, |
| "rho2": 0.18749995529651642, |
| "step": 449 |
| }, |
| { |
| "advantages/mean": -4.656612873077393e-10, |
| "advantages/snr": 2.991240497915941e-09, |
| "advantages/std": 0.15567496418952942, |
| "advantages/var": 0.024234694475411267, |
| "completions/clipped_ratio": -2.875, |
| "epoch": 2.5845272206303727, |
| "grad_norm": 24.910936996773792, |
| "learning_rate": 5.701641557784609e-07, |
| "loss": 0.0803, |
| "num_tokens": 70940467.0, |
| "residual_var": 0.022720035165548325, |
| "reward": 0.84375, |
| "reward_std": 0.043925780802965164, |
| "rewards/drgrpo_math_reward/mean": 0.84375, |
| "rewards/drgrpo_math_reward/std": 0.3638034462928772, |
| "rho2": 0.062499985098838806, |
| "step": 450 |
| }, |
| { |
| "advantages/mean": 9.313225746154785e-10, |
| "advantages/snr": 3.9537892071454765e-09, |
| "advantages/std": 0.2355518937110901, |
| "advantages/var": 0.05548469463088068, |
| "completions/clipped_ratio": -2.859375, |
| "epoch": 2.5902578796561606, |
| "grad_norm": 88.14454543805262, |
| "learning_rate": 5.661162608824419e-07, |
| "loss": -0.3043, |
| "num_tokens": 71083256.0, |
| "residual_var": 0.04508132487535477, |
| "reward": 0.82421875, |
| "reward_std": 0.10824117809534073, |
| "rewards/drgrpo_math_reward/mean": 0.82421875, |
| "rewards/drgrpo_math_reward/std": 0.3813795745372772, |
| "rho2": 0.1874999701976776, |
| "step": 451 |
| }, |
| { |
| "advantages/mean": -9.313225746154785e-10, |
| "advantages/snr": 5.982480995831882e-09, |
| "advantages/std": 0.15567496418952942, |
| "advantages/var": 0.024234694475411267, |
| "completions/clipped_ratio": -2.828125, |
| "epoch": 2.5959885386819486, |
| "grad_norm": 24.812763493051456, |
| "learning_rate": 5.620771052796338e-07, |
| "loss": 0.1889, |
| "num_tokens": 71235989.0, |
| "residual_var": 0.021205367520451546, |
| "reward": 0.890625, |
| "reward_std": 0.05102896690368652, |
| "rewards/drgrpo_math_reward/mean": 0.890625, |
| "rewards/drgrpo_math_reward/std": 0.31272050738334656, |
| "rho2": 0.12499997764825821, |
| "step": 452 |
| }, |
| { |
| "advantages/mean": -1.1641532182693481e-09, |
| "advantages/snr": 5.000042345610262e-09, |
| "advantages/std": 0.23282866179943085, |
| "advantages/var": 0.05420918575531375, |
| "completions/clipped_ratio": -2.90625, |
| "epoch": 2.6017191977077365, |
| "grad_norm": 40.96379593667113, |
| "learning_rate": 5.580467703267735e-07, |
| "loss": -0.443, |
| "num_tokens": 71376750.0, |
| "residual_var": 0.045739009976387024, |
| "reward": 0.83203125, |
| "reward_std": 0.10061003267765045, |
| "rewards/drgrpo_math_reward/mean": 0.83203125, |
| "rewards/drgrpo_math_reward/std": 0.3745708465576172, |
| "rho2": 0.1562499701976776, |
| "step": 453 |
| }, |
| { |
| "advantages/mean": 5.820766091346741e-10, |
| "advantages/snr": 3.2596287508993594e-09, |
| "advantages/std": 0.1785714328289032, |
| "advantages/var": 0.03188775662256749, |
| "completions/clipped_ratio": -2.859375, |
| "epoch": 2.6074498567335245, |
| "grad_norm": 29.902235896857928, |
| "learning_rate": 5.540253372029314e-07, |
| "loss": -0.0184, |
| "num_tokens": 71512479.0, |
| "residual_var": 0.028898291289806366, |
| "reward": 0.8671875, |
| "reward_std": 0.058391720056533813, |
| "rewards/drgrpo_math_reward/mean": 0.8671875, |
| "rewards/drgrpo_math_reward/std": 0.3400367796421051, |
| "rho2": 0.09374997764825821, |
| "step": 454 |
| }, |
| { |
| "advantages/mean": -6.984919309616089e-10, |
| "advantages/snr": 3.305866683433113e-09, |
| "advantages/std": 0.21128858625888824, |
| "advantages/var": 0.04464286668327966, |
| "completions/clipped_ratio": -2.828125, |
| "epoch": 2.6131805157593124, |
| "grad_norm": 34.34699092506041, |
| "learning_rate": 5.500128869078788e-07, |
| "loss": 0.1558, |
| "num_tokens": 71659797.0, |
| "residual_var": 0.039062514901161194, |
| "reward": 0.8515625, |
| "reward_std": 0.08443661779165268, |
| "rewards/drgrpo_math_reward/mean": 0.8515625, |
| "rewards/drgrpo_math_reward/std": 0.3562295734882355, |
| "rho2": 0.12499997764825821, |
| "step": 455 |
| }, |
| { |
| "advantages/mean": -4.656612873077393e-10, |
| "advantages/snr": 2.380496292175044e-09, |
| "advantages/std": 0.19561520218849182, |
| "advantages/var": 0.038265307327244535, |
| "completions/clipped_ratio": -2.84375, |
| "epoch": 2.6189111747851004, |
| "grad_norm": 37.02449809638985, |
| "learning_rate": 5.460095002604532e-07, |
| "loss": -0.3228, |
| "num_tokens": 71800154.0, |
| "residual_var": 0.031090570613741875, |
| "reward": 0.875, |
| "reward_std": 0.07825092226266861, |
| "rewards/drgrpo_math_reward/mean": 0.875, |
| "rewards/drgrpo_math_reward/std": 0.33136674761772156, |
| "rho2": 0.1874999701976776, |
| "step": 456 |
| }, |
| { |
| "advantages/mean": -3.4924596548080444e-10, |
| "advantages/snr": 1.2369422001312725e-09, |
| "advantages/std": 0.2823462188243866, |
| "advantages/var": 0.0797193872844284, |
| "completions/clipped_ratio": -2.796875, |
| "epoch": 2.6246418338108883, |
| "grad_norm": 49.1134055528491, |
| "learning_rate": 5.420152578969325e-07, |
| "loss": -0.2936, |
| "num_tokens": 71964871.0, |
| "residual_var": 0.05480710044503212, |
| "reward": 0.82421875, |
| "reward_std": 0.15927013754844666, |
| "rewards/drgrpo_math_reward/mean": 0.82421875, |
| "rewards/drgrpo_math_reward/std": 0.3813795745372772, |
| "rho2": 0.3124999403953552, |
| "step": 457 |
| }, |
| { |
| "advantages/mean": 6.984919309616089e-10, |
| "advantages/snr": 2.8527944079702535e-09, |
| "advantages/std": 0.24484480917453766, |
| "advantages/var": 0.05994898057971576, |
| "completions/clipped_ratio": -2.96875, |
| "epoch": 2.6303724928366763, |
| "grad_norm": 46.372268356616715, |
| "learning_rate": 5.380302402694103e-07, |
| "loss": -0.2865, |
| "num_tokens": 72116926.0, |
| "residual_var": 0.04308834299445152, |
| "reward": 0.828125, |
| "reward_std": 0.11993882060050964, |
| "rewards/drgrpo_math_reward/mean": 0.828125, |
| "rewards/drgrpo_math_reward/std": 0.3780108094215393, |
| "rho2": 0.2812499403953552, |
| "step": 458 |
| }, |
| { |
| "advantages/mean": -4.656612873077393e-10, |
| "advantages/snr": 2.2198241537125106e-09, |
| "advantages/std": 0.2097739428281784, |
| "advantages/var": 0.04400510708967986, |
| "completions/clipped_ratio": -2.84375, |
| "epoch": 2.6361031518624642, |
| "grad_norm": 42.058817691725466, |
| "learning_rate": 5.340545276441754e-07, |
| "loss": -0.0683, |
| "num_tokens": 72260427.0, |
| "residual_var": 0.037129320204257965, |
| "reward": 0.89453125, |
| "reward_std": 0.08390620350837708, |
| "rewards/drgrpo_math_reward/mean": 0.89453125, |
| "rewards/drgrpo_math_reward/std": 0.3077581524848938, |
| "rho2": 0.1562499701976776, |
| "step": 459 |
| }, |
| { |
| "advantages/mean": -1.1641532182693481e-09, |
| "advantages/snr": 4.5643933797911646e-09, |
| "advantages/std": 0.25505101680755615, |
| "advantages/var": 0.06505102117456829, |
| "completions/clipped_ratio": -2.796875, |
| "epoch": 2.641833810888252, |
| "grad_norm": 37.5340582752598, |
| "learning_rate": 5.300882001000946e-07, |
| "loss": -0.4365, |
| "num_tokens": 72414368.0, |
| "residual_var": 0.046755433082580566, |
| "reward": 0.7578125, |
| "reward_std": 0.1250636875629425, |
| "rewards/drgrpo_math_reward/mean": 0.7578125, |
| "rewards/drgrpo_math_reward/std": 0.4292463958263397, |
| "rho2": 0.2812499403953552, |
| "step": 460 |
| }, |
| { |
| "advantages/mean": -9.313225746154785e-10, |
| "advantages/snr": 4.801170592896891e-09, |
| "advantages/std": 0.1939782202243805, |
| "advantages/var": 0.03762754992141826, |
| "completions/clipped_ratio": -2.875, |
| "epoch": 2.64756446991404, |
| "grad_norm": 28.066627315135047, |
| "learning_rate": 5.261313375270013e-07, |
| "loss": 0.2219, |
| "num_tokens": 72556336.0, |
| "residual_var": 0.0293965395539999, |
| "reward": 0.84765625, |
| "reward_std": 0.08417459577322006, |
| "rewards/drgrpo_math_reward/mean": 0.84765625, |
| "rewards/drgrpo_math_reward/std": 0.3600577116012573, |
| "rho2": 0.21874995529651642, |
| "step": 461 |
| }, |
| { |
| "advantages/mean": -4.656612873077393e-10, |
| "advantages/snr": 2.3049056547270335e-09, |
| "advantages/std": 0.2020305097103119, |
| "advantages/var": 0.04081632685380843, |
| "completions/clipped_ratio": -3.0, |
| "epoch": 2.653295128939828, |
| "grad_norm": 38.265928227983444, |
| "learning_rate": 5.221840196240848e-07, |
| "loss": -0.4421, |
| "num_tokens": 72694108.0, |
| "residual_var": 0.03188776224851608, |
| "reward": 0.8515625, |
| "reward_std": 0.0875919908285141, |
| "rewards/drgrpo_math_reward/mean": 0.8515625, |
| "rewards/drgrpo_math_reward/std": 0.3562295734882355, |
| "rho2": 0.2187499701976776, |
| "step": 462 |
| }, |
| { |
| "advantages/mean": -1.6298145055770874e-09, |
| "advantages/snr": 6.390150731707631e-09, |
| "advantages/std": 0.25505101680755615, |
| "advantages/var": 0.06505102117456829, |
| "completions/clipped_ratio": -2.921875, |
| "epoch": 2.659025787965616, |
| "grad_norm": 41.52771750903016, |
| "learning_rate": 5.182463258982846e-07, |
| "loss": -0.2373, |
| "num_tokens": 72839820.0, |
| "residual_var": 0.050821125507354736, |
| "reward": 0.8203125, |
| "reward_std": 0.1244145929813385, |
| "rewards/drgrpo_math_reward/mean": 0.8203125, |
| "rewards/drgrpo_math_reward/std": 0.38467901945114136, |
| "rho2": 0.21874994039535522, |
| "step": 463 |
| }, |
| { |
| "advantages/mean": 2.3283064365386963e-10, |
| "advantages/snr": 1.0059432951138956e-09, |
| "advantages/std": 0.2314550280570984, |
| "advantages/var": 0.0535714300129122, |
| "completions/clipped_ratio": -2.84375, |
| "epoch": 2.664756446991404, |
| "grad_norm": 44.56932098497142, |
| "learning_rate": 5.143183356626916e-07, |
| "loss": -0.3485, |
| "num_tokens": 72979719.0, |
| "residual_var": 0.040178585797548294, |
| "reward": 0.9296875, |
| "reward_std": 0.10718280076980591, |
| "rewards/drgrpo_math_reward/mean": 0.9296875, |
| "rewards/drgrpo_math_reward/std": 0.2561737895011902, |
| "rho2": 0.24999994039535522, |
| "step": 464 |
| }, |
| { |
| "advantages/mean": -1.1641532182693481e-09, |
| "advantages/snr": 4.5421821470515416e-09, |
| "advantages/std": 0.2562982141971588, |
| "advantages/var": 0.0656887746006527, |
| "completions/clipped_ratio": -2.953125, |
| "epoch": 2.670487106017192, |
| "grad_norm": 52.1727676862902, |
| "learning_rate": 5.104001280349479e-07, |
| "loss": -0.217, |
| "num_tokens": 73117461.0, |
| "residual_var": 0.049266595393419266, |
| "reward": 0.83984375, |
| "reward_std": 0.1255940943956375, |
| "rewards/drgrpo_math_reward/mean": 0.83984375, |
| "rewards/drgrpo_math_reward/std": 0.36746934056282043, |
| "rho2": 0.24999994039535522, |
| "step": 465 |
| }, |
| { |
| "advantages/mean": -6.984919309616089e-10, |
| "advantages/snr": 2.712175018057454e-09, |
| "advantages/std": 0.25753939151763916, |
| "advantages/var": 0.06632653818327583, |
| "completions/clipped_ratio": -2.96875, |
| "epoch": 2.67621776504298, |
| "grad_norm": 57.659842033208214, |
| "learning_rate": 5.064917819356531e-07, |
| "loss": -0.7347, |
| "num_tokens": 73275288.0, |
| "residual_var": 0.045599501579999924, |
| "reward": 0.7265625, |
| "reward_std": 0.13269482553005219, |
| "rewards/drgrpo_math_reward/mean": 0.7265625, |
| "rewards/drgrpo_math_reward/std": 0.446596622467041, |
| "rho2": 0.3124999403953552, |
| "step": 466 |
| }, |
| { |
| "advantages/mean": -2.3283064365386963e-10, |
| "advantages/snr": 8.750878259310795e-10, |
| "advantages/std": 0.26606544852256775, |
| "advantages/var": 0.07079082289751515, |
| "completions/clipped_ratio": -2.8125, |
| "epoch": 2.681948424068768, |
| "grad_norm": 44.43579130370154, |
| "learning_rate": 5.025933760867781e-07, |
| "loss": -0.656, |
| "num_tokens": 73421135.0, |
| "residual_var": 0.046456485986709595, |
| "reward": 0.74609375, |
| "reward_std": 0.1437433809041977, |
| "rewards/drgrpo_math_reward/mean": 0.74609375, |
| "rewards/drgrpo_math_reward/std": 0.4360972046852112, |
| "rho2": 0.3437499403953552, |
| "step": 467 |
| }, |
| { |
| "advantages/mean": -3.4924596548080444e-10, |
| "advantages/snr": 1.9961067217240515e-09, |
| "advantages/std": 0.1749635636806488, |
| "advantages/var": 0.03061224861583245, |
| "completions/clipped_ratio": -2.953125, |
| "epoch": 2.687679083094556, |
| "grad_norm": 43.168997925092384, |
| "learning_rate": 4.987049890100752e-07, |
| "loss": -0.3475, |
| "num_tokens": 73562177.0, |
| "residual_var": 0.025829093530774117, |
| "reward": 0.8359375, |
| "reward_std": 0.06378498673439026, |
| "rewards/drgrpo_math_reward/mean": 0.8359375, |
| "rewards/drgrpo_math_reward/std": 0.3710577189922333, |
| "rho2": 0.15624995529651642, |
| "step": 468 |
| }, |
| { |
| "advantages/mean": -6.984919309616089e-10, |
| "advantages/snr": 4.743456338717347e-09, |
| "advantages/std": 0.14725378155708313, |
| "advantages/var": 0.021683676182861156, |
| "completions/clipped_ratio": -2.921875, |
| "epoch": 2.693409742120344, |
| "grad_norm": 25.475930745385376, |
| "learning_rate": 4.948266990254988e-07, |
| "loss": -0.2273, |
| "num_tokens": 73691189.0, |
| "residual_var": 0.019650839269161224, |
| "reward": 0.875, |
| "reward_std": 0.041687894612550735, |
| "rewards/drgrpo_math_reward/mean": 0.875, |
| "rewards/drgrpo_math_reward/std": 0.33136674761772156, |
| "rho2": 0.09374997764825821, |
| "step": 469 |
| }, |
| { |
| "advantages/mean": 2.3283064365386963e-10, |
| "advantages/snr": 2.0118866328303662e-09, |
| "advantages/std": 0.1157275065779686, |
| "advantages/var": 0.013392855778753765, |
| "completions/clipped_ratio": -2.9375, |
| "epoch": 2.6991404011461317, |
| "grad_norm": 59.802744549134886, |
| "learning_rate": 4.909585842496287e-07, |
| "loss": -0.0798, |
| "num_tokens": 73844398.0, |
| "residual_var": 0.012137286365032196, |
| "reward": 0.90234375, |
| "reward_std": 0.03314562886953354, |
| "rewards/drgrpo_math_reward/mean": 0.90234375, |
| "rewards/drgrpo_math_reward/std": 0.29743078351020813, |
| "rho2": 0.09374997764825821, |
| "step": 470 |
| }, |
| { |
| "advantages/mean": -2.3283064365386963e-10, |
| "advantages/snr": 9.266069932627558e-10, |
| "advantages/std": 0.2512722611427307, |
| "advantages/var": 0.06313774921978066, |
| "completions/clipped_ratio": -2.875, |
| "epoch": 2.7048710601719197, |
| "grad_norm": 80.40343933087797, |
| "learning_rate": 4.871007225940939e-07, |
| "loss": -0.5229, |
| "num_tokens": 73990263.0, |
| "residual_var": 0.043407220393419266, |
| "reward": 0.84765625, |
| "reward_std": 0.12927743792533875, |
| "rewards/drgrpo_math_reward/mean": 0.84765625, |
| "rewards/drgrpo_math_reward/std": 0.3600577116012573, |
| "rho2": 0.3124999403953552, |
| "step": 471 |
| }, |
| { |
| "advantages/mean": -2.3283064365386963e-10, |
| "advantages/snr": 2.464047702455537e-09, |
| "advantages/std": 0.09449111670255661, |
| "advantages/var": 0.008928571135696173, |
| "completions/clipped_ratio": -3.0, |
| "epoch": 2.7106017191977076, |
| "grad_norm": 13.501865451703313, |
| "learning_rate": 4.832531917640057e-07, |
| "loss": 0.0409, |
| "num_tokens": 74126906.0, |
| "residual_var": 0.008370545692741871, |
| "reward": 0.84375, |
| "reward_std": 0.022097086533904076, |
| "rewards/drgrpo_math_reward/mean": 0.84375, |
| "rewards/drgrpo_math_reward/std": 0.3638034462928772, |
| "rho2": 0.062499985098838806, |
| "step": 472 |
| }, |
| { |
| "advantages/mean": 1.3969838619232178e-09, |
| "advantages/snr": 5.645842642869884e-09, |
| "advantages/std": 0.24743583798408508, |
| "advantages/var": 0.0612244939188864, |
| "completions/clipped_ratio": -2.75, |
| "epoch": 2.7163323782234956, |
| "grad_norm": 44.063649508463605, |
| "learning_rate": 4.794160692563917e-07, |
| "loss": -0.1131, |
| "num_tokens": 74277313.0, |
| "residual_var": 0.04783164709806442, |
| "reward": 0.71875, |
| "reward_std": 0.1145455539226532, |
| "rewards/drgrpo_math_reward/mean": 0.71875, |
| "rewards/drgrpo_math_reward/std": 0.45048993825912476, |
| "rho2": 0.21874995529651642, |
| "step": 473 |
| }, |
| { |
| "advantages/mean": 2.3283064365386963e-10, |
| "advantages/snr": 1.2431736171986422e-09, |
| "advantages/std": 0.18728730082511902, |
| "advantages/var": 0.03507653305035863, |
| "completions/clipped_ratio": -2.9375, |
| "epoch": 2.7220630372492836, |
| "grad_norm": 31.63012832023641, |
| "learning_rate": 4.755894323586341e-07, |
| "loss": -0.288, |
| "num_tokens": 74413591.0, |
| "residual_var": 0.028499694541096687, |
| "reward": 0.91015625, |
| "reward_std": 0.07483352720737457, |
| "rewards/drgrpo_math_reward/mean": 0.91015625, |
| "rewards/drgrpo_math_reward/std": 0.2865179479122162, |
| "rho2": 0.18749995529651642, |
| "step": 474 |
| }, |
| { |
| "advantages/mean": 1.1641532182693481e-10, |
| "advantages/snr": 5.854465882054629e-10, |
| "advantages/std": 0.19884873926639557, |
| "advantages/var": 0.03954082110783497, |
| "completions/clipped_ratio": -2.65625, |
| "epoch": 2.7277936962750715, |
| "grad_norm": 40.0832962857382, |
| "learning_rate": 4.7177335814691564e-07, |
| "loss": -0.1724, |
| "num_tokens": 74577378.0, |
| "residual_var": 0.03089127689599991, |
| "reward": 0.8046875, |
| "reward_std": 0.08588206768035889, |
| "rewards/drgrpo_math_reward/mean": 0.8046875, |
| "rewards/drgrpo_math_reward/std": 0.39721766114234924, |
| "rho2": 0.21874995529651642, |
| "step": 475 |
| }, |
| { |
| "advantages/mean": -2.3283064365386963e-10, |
| "advantages/snr": 1.2320239164203405e-09, |
| "advantages/std": 0.18898223340511322, |
| "advantages/var": 0.03571428454278469, |
| "completions/clipped_ratio": -2.921875, |
| "epoch": 2.7335243553008595, |
| "grad_norm": 91.24920309867713, |
| "learning_rate": 4.6796792348466353e-07, |
| "loss": -0.1096, |
| "num_tokens": 74713567.0, |
| "residual_var": 0.030133940279483795, |
| "reward": 0.875, |
| "reward_std": 0.07536394149065018, |
| "rewards/drgrpo_math_reward/mean": 0.875, |
| "rewards/drgrpo_math_reward/std": 0.33136674761772156, |
| "rho2": 0.1562499701976776, |
| "step": 476 |
| }, |
| { |
| "advantages/mean": 2.3283064365386963e-10, |
| "advantages/snr": 1.050673483071512e-09, |
| "advantages/std": 0.22160132229328156, |
| "advantages/var": 0.049107146042130845, |
| "completions/clipped_ratio": -2.890625, |
| "epoch": 2.7392550143266474, |
| "grad_norm": 40.786722179486674, |
| "learning_rate": 4.641732050210031e-07, |
| "loss": -0.4072, |
| "num_tokens": 74844079.0, |
| "residual_var": 0.03836497291922569, |
| "reward": 0.89453125, |
| "reward_std": 0.096134252846241, |
| "rewards/drgrpo_math_reward/mean": 0.89453125, |
| "rewards/drgrpo_math_reward/std": 0.3077581524848938, |
| "rho2": 0.21874994039535522, |
| "step": 477 |
| }, |
| { |
| "advantages/mean": 1.862645149230957e-09, |
| "advantages/snr": 8.09588026328236e-09, |
| "advantages/std": 0.23007319867610931, |
| "advantages/var": 0.05293367674905647, |
| "completions/clipped_ratio": -2.90625, |
| "epoch": 2.7449856733524354, |
| "grad_norm": 68.16071403640386, |
| "learning_rate": 4.6038927918921566e-07, |
| "loss": 0.1926, |
| "num_tokens": 74988949.0, |
| "residual_var": 0.04135444387793541, |
| "reward": 0.87890625, |
| "reward_std": 0.1060032844543457, |
| "rewards/drgrpo_math_reward/mean": 0.87890625, |
| "rewards/drgrpo_math_reward/std": 0.3268752694129944, |
| "rho2": 0.21874995529651642, |
| "step": 478 |
| }, |
| { |
| "advantages/mean": -6.984919309616089e-10, |
| "advantages/snr": 3.2825037950751864e-09, |
| "advantages/std": 0.21279241144657135, |
| "advantages/var": 0.04528061036924691, |
| "completions/clipped_ratio": -2.703125, |
| "epoch": 2.7507163323782233, |
| "grad_norm": 48.60041249893682, |
| "learning_rate": 4.5661622220519455e-07, |
| "loss": -0.1421, |
| "num_tokens": 75147052.0, |
| "residual_var": 0.03679051250219345, |
| "reward": 0.79296875, |
| "reward_std": 0.09153735637664795, |
| "rewards/drgrpo_math_reward/mean": 0.79296875, |
| "rewards/drgrpo_math_reward/std": 0.40597182512283325, |
| "rho2": 0.18749994039535522, |
| "step": 479 |
| }, |
| { |
| "advantages/mean": 6.984919309616089e-10, |
| "advantages/snr": 3.729520554863667e-09, |
| "advantages/std": 0.1872873157262802, |
| "advantages/var": 0.03507653863195537, |
| "completions/clipped_ratio": -2.796875, |
| "epoch": 2.7564469914040117, |
| "grad_norm": 30.727892281289908, |
| "learning_rate": 4.52854110065914e-07, |
| "loss": -0.3299, |
| "num_tokens": 75292654.0, |
| "residual_var": 0.031788118183612823, |
| "reward": 0.84765625, |
| "reward_std": 0.06180911511182785, |
| "rewards/drgrpo_math_reward/mean": 0.84765625, |
| "rewards/drgrpo_math_reward/std": 0.3600577116012573, |
| "rho2": 0.0937499925494194, |
| "step": 480 |
| }, |
| { |
| "advantages/mean": 6.984919309616089e-10, |
| "advantages/snr": 3.663506180869644e-09, |
| "advantages/std": 0.19066213071346283, |
| "advantages/var": 0.036352048088197586, |
| "completions/clipped_ratio": -3.0, |
| "epoch": 2.7621776504297992, |
| "grad_norm": 35.808697911905746, |
| "learning_rate": 4.4910301854789755e-07, |
| "loss": 0.0054, |
| "num_tokens": 75435968.0, |
| "residual_var": 0.031808048486709595, |
| "reward": 0.92578125, |
| "reward_std": 0.06944026798009872, |
| "rewards/drgrpo_math_reward/mean": 0.92578125, |
| "rewards/drgrpo_math_reward/std": 0.2626400291919708, |
| "rho2": 0.12499997019767761, |
| "step": 481 |
| }, |
| { |
| "advantages/mean": 4.656612873077393e-10, |
| "advantages/snr": 3.209860470786324e-09, |
| "advantages/std": 0.1450721174478531, |
| "advantages/var": 0.02104591926080368, |
| "completions/clipped_ratio": -2.890625, |
| "epoch": 2.7679083094555876, |
| "grad_norm": 41.99713741690543, |
| "learning_rate": 4.4536302320569006e-07, |
| "loss": 0.1462, |
| "num_tokens": 75594473.0, |
| "residual_var": 0.018415190279483795, |
| "reward": 0.82421875, |
| "reward_std": 0.04761157184839249, |
| "rewards/drgrpo_math_reward/mean": 0.82421875, |
| "rewards/drgrpo_math_reward/std": 0.3813795745372772, |
| "rho2": 0.1249999850988388, |
| "step": 482 |
| }, |
| { |
| "advantages/mean": 8.149072527885437e-10, |
| "advantages/snr": 3.6773571907502916e-09, |
| "advantages/std": 0.22160132229328156, |
| "advantages/var": 0.049107146042130845, |
| "completions/clipped_ratio": -2.90625, |
| "epoch": 2.773638968481375, |
| "grad_norm": 32.267390440275356, |
| "learning_rate": 4.416341993703373e-07, |
| "loss": -0.0421, |
| "num_tokens": 75741541.0, |
| "residual_var": 0.036830369383096695, |
| "reward": 0.78515625, |
| "reward_std": 0.10205549001693726, |
| "rewards/drgrpo_math_reward/mean": 0.78515625, |
| "rewards/drgrpo_math_reward/std": 0.4115184545516968, |
| "rho2": 0.24999994039535522, |
| "step": 483 |
| }, |
| { |
| "advantages/mean": -9.313225746154785e-10, |
| "advantages/snr": 5.7594526340187176e-09, |
| "advantages/std": 0.16170330345630646, |
| "advantages/var": 0.026147958348682332, |
| "completions/clipped_ratio": -2.90625, |
| "epoch": 2.7793696275071635, |
| "grad_norm": 26.778065667371298, |
| "learning_rate": 4.3791662214786963e-07, |
| "loss": -0.2001, |
| "num_tokens": 75879212.0, |
| "residual_var": 0.022879473865032196, |
| "reward": 0.83984375, |
| "reward_std": 0.05273643881082535, |
| "rewards/drgrpo_math_reward/mean": 0.83984375, |
| "rewards/drgrpo_math_reward/std": 0.36746934056282043, |
| "rho2": 0.12499997764825821, |
| "step": 484 |
| }, |
| { |
| "advantages/mean": 1.862645149230957e-09, |
| "advantages/snr": 7.488886640017008e-09, |
| "advantages/std": 0.24872122704982758, |
| "advantages/var": 0.06186224878517188, |
| "completions/clipped_ratio": -2.921875, |
| "epoch": 2.785100286532951, |
| "grad_norm": 46.19294691896084, |
| "learning_rate": 4.3421036641778553e-07, |
| "loss": -0.2724, |
| "num_tokens": 76021846.0, |
| "residual_var": 0.04446350410580635, |
| "reward": 0.78515625, |
| "reward_std": 0.12164628505706787, |
| "rewards/drgrpo_math_reward/mean": 0.78515625, |
| "rewards/drgrpo_math_reward/std": 0.4115184545516968, |
| "rho2": 0.2812499403953552, |
| "step": 485 |
| }, |
| { |
| "advantages/mean": 0.0, |
| "advantages/snr": 0.0, |
| "advantages/std": 0.13832083344459534, |
| "advantages/var": 0.019132652964807484, |
| "completions/clipped_ratio": -2.921875, |
| "epoch": 2.7908309455587395, |
| "grad_norm": 24.433410892525433, |
| "learning_rate": 4.3051550683154804e-07, |
| "loss": 0.0299, |
| "num_tokens": 76165491.0, |
| "residual_var": 0.017936870455741882, |
| "reward": 0.921875, |
| "reward_std": 0.0388009138405323, |
| "rewards/drgrpo_math_reward/mean": 0.921875, |
| "rewards/drgrpo_math_reward/std": 0.26889389753341675, |
| "rho2": 0.062499988824129105, |
| "step": 486 |
| }, |
| { |
| "advantages/mean": 6.984919309616089e-10, |
| "advantages/snr": 4.217937792220085e-09, |
| "advantages/std": 0.16560032963752747, |
| "advantages/var": 0.027423469176057758, |
| "completions/clipped_ratio": -2.9375, |
| "epoch": 2.796561604584527, |
| "grad_norm": 31.259156249194138, |
| "learning_rate": 4.2683211781107785e-07, |
| "loss": -0.0272, |
| "num_tokens": 76298358.0, |
| "residual_var": 0.023138564079999924, |
| "reward": 0.82421875, |
| "reward_std": 0.06036758795380592, |
| "rewards/drgrpo_math_reward/mean": 0.82421875, |
| "rewards/drgrpo_math_reward/std": 0.3813795745372772, |
| "rho2": 0.15624995529651642, |
| "step": 487 |
| }, |
| { |
| "advantages/mean": -1.862645149230957e-09, |
| "advantages/snr": 7.818224078022042e-09, |
| "advantages/std": 0.23824401199817657, |
| "advantages/var": 0.056760209252987304, |
| "completions/clipped_ratio": -2.921875, |
| "epoch": 2.8022922636103154, |
| "grad_norm": 69.13207204855937, |
| "learning_rate": 4.2316027354725515e-07, |
| "loss": -0.2976, |
| "num_tokens": 76438276.0, |
| "residual_var": 0.042570166289806366, |
| "reward": 0.85546875, |
| "reward_std": 0.11060018837451935, |
| "rewards/drgrpo_math_reward/mean": 0.85546875, |
| "rewards/drgrpo_math_reward/std": 0.35231640934944153, |
| "rho2": 0.24999995529651642, |
| "step": 488 |
| }, |
| { |
| "advantages/mean": 1.3969838619232178e-09, |
| "advantages/snr": 5.00823255374953e-09, |
| "advantages/std": 0.2789374887943268, |
| "advantages/var": 0.07780612265488518, |
| "completions/clipped_ratio": -2.84375, |
| "epoch": 2.8080229226361033, |
| "grad_norm": 45.43666043048814, |
| "learning_rate": 4.195000479984264e-07, |
| "loss": -0.6233, |
| "num_tokens": 76588502.0, |
| "residual_var": 0.058354608714580536, |
| "reward": 0.8515625, |
| "reward_std": 0.14400538802146912, |
| "rewards/drgrpo_math_reward/mean": 0.8515625, |
| "rewards/drgrpo_math_reward/std": 0.3562295734882355, |
| "rho2": 0.24999994039535522, |
| "step": 489 |
| }, |
| { |
| "advantages/mean": 4.656612873077393e-10, |
| "advantages/snr": 2.0745772796518186e-09, |
| "advantages/std": 0.22446079552173615, |
| "advantages/var": 0.050382648726250645, |
| "completions/clipped_ratio": -2.78125, |
| "epoch": 2.8137535816618913, |
| "grad_norm": 57.63830937677325, |
| "learning_rate": 4.15851514888913e-07, |
| "loss": -0.0804, |
| "num_tokens": 76732142.0, |
| "residual_var": 0.04093591868877411, |
| "reward": 0.86328125, |
| "reward_std": 0.09666222333908081, |
| "rewards/drgrpo_math_reward/mean": 0.86328125, |
| "rewards/drgrpo_math_reward/std": 0.34422317147254944, |
| "rho2": 0.18749995529651642, |
| "step": 490 |
| }, |
| { |
| "advantages/mean": -2.3283064365386963e-10, |
| "advantages/snr": 1.0575632043984715e-09, |
| "advantages/std": 0.220157653093338, |
| "advantages/var": 0.048469392215566565, |
| "completions/clipped_ratio": -2.96875, |
| "epoch": 2.819484240687679, |
| "grad_norm": 51.84633885141528, |
| "learning_rate": 4.1221474770752696e-07, |
| "loss": -0.2512, |
| "num_tokens": 76877475.0, |
| "residual_var": 0.03635205328464508, |
| "reward": 0.875, |
| "reward_std": 0.10205793380737305, |
| "rewards/drgrpo_math_reward/mean": 0.875, |
| "rewards/drgrpo_math_reward/std": 0.33136674761772156, |
| "rho2": 0.2499999701976776, |
| "step": 491 |
| }, |
| { |
| "advantages/mean": -8.149072527885437e-10, |
| "advantages/snr": 2.95806495593781e-09, |
| "advantages/std": 0.2754865884780884, |
| "advantages/var": 0.07589286043129562, |
| "completions/clipped_ratio": -2.625, |
| "epoch": 2.825214899713467, |
| "grad_norm": 57.20556558168534, |
| "learning_rate": 4.0858981970609107e-07, |
| "loss": -0.5989, |
| "num_tokens": 77042376.0, |
| "residual_var": 0.05217635631561279, |
| "reward": 0.74609375, |
| "reward_std": 0.14940109848976135, |
| "rewards/drgrpo_math_reward/mean": 0.74609375, |
| "rewards/drgrpo_math_reward/std": 0.4360972046852112, |
| "rho2": 0.3124999403953552, |
| "step": 492 |
| }, |
| { |
| "advantages/mean": -1.862645149230957e-09, |
| "advantages/snr": 6.233602299984905e-09, |
| "advantages/std": 0.29880717396736145, |
| "advantages/var": 0.08928572721436101, |
| "completions/clipped_ratio": -2.828125, |
| "epoch": 2.830945558739255, |
| "grad_norm": 59.43106284892127, |
| "learning_rate": 4.049768038979631e-07, |
| "loss": -0.5261, |
| "num_tokens": 77215285.0, |
| "residual_var": 0.061383944004774094, |
| "reward": 0.8515625, |
| "reward_std": 0.1630682349205017, |
| "rewards/drgrpo_math_reward/mean": 0.8515625, |
| "rewards/drgrpo_math_reward/std": 0.3562295734882355, |
| "rho2": 0.3124999403953552, |
| "step": 493 |
| }, |
| { |
| "advantages/mean": 2.3283064365386963e-10, |
| "advantages/snr": 1.030785103248179e-09, |
| "advantages/std": 0.22587698698043823, |
| "advantages/var": 0.05102041324736106, |
| "completions/clipped_ratio": -2.953125, |
| "epoch": 2.836676217765043, |
| "grad_norm": 45.3427622201107, |
| "learning_rate": 4.013757730565648e-07, |
| "loss": -0.6061, |
| "num_tokens": 77347136.0, |
| "residual_var": 0.04304848238825798, |
| "reward": 0.8515625, |
| "reward_std": 0.09719263762235641, |
| "rewards/drgrpo_math_reward/mean": 0.8515625, |
| "rewards/drgrpo_math_reward/std": 0.3562295734882355, |
| "rho2": 0.15624995529651642, |
| "step": 494 |
| }, |
| { |
| "advantages/mean": 2.3283064365386963e-10, |
| "advantages/snr": 8.347054256249217e-10, |
| "advantages/std": 0.2789374887943268, |
| "advantages/var": 0.07780612265488518, |
| "completions/clipped_ratio": -2.796875, |
| "epoch": 2.842406876790831, |
| "grad_norm": 84.63958616387762, |
| "learning_rate": 3.977867997139178e-07, |
| "loss": -0.4822, |
| "num_tokens": 77512853.0, |
| "residual_var": 0.05106028541922569, |
| "reward": 0.7578125, |
| "reward_std": 0.15702980756759644, |
| "rewards/drgrpo_math_reward/mean": 0.7578125, |
| "rewards/drgrpo_math_reward/std": 0.4292463958263397, |
| "rho2": 0.34374991059303284, |
| "step": 495 |
| }, |
| { |
| "advantages/mean": -1.1641532182693481e-09, |
| "advantages/snr": 5.3587966720360335e-09, |
| "advantages/std": 0.21724152565002441, |
| "advantages/var": 0.047193880466750215, |
| "completions/clipped_ratio": -2.875, |
| "epoch": 2.848137535816619, |
| "grad_norm": 51.67407403119023, |
| "learning_rate": 3.942099561591802e-07, |
| "loss": -0.3061, |
| "num_tokens": 77665722.0, |
| "residual_var": 0.03539542108774185, |
| "reward": 0.7890625, |
| "reward_std": 0.10034801065921783, |
| "rewards/drgrpo_math_reward/mean": 0.7890625, |
| "rewards/drgrpo_math_reward/std": 0.4087733030319214, |
| "rho2": 0.24999994039535522, |
| "step": 496 |
| }, |
| { |
| "advantages/mean": 4.656612873077393e-10, |
| "advantages/snr": 2.011886590227791e-09, |
| "advantages/std": 0.2314550280570984, |
| "advantages/var": 0.0535714300129122, |
| "completions/clipped_ratio": -2.984375, |
| "epoch": 2.853868194842407, |
| "grad_norm": 41.89118231868127, |
| "learning_rate": 3.9064531443719194e-07, |
| "loss": -0.1388, |
| "num_tokens": 77811032.0, |
| "residual_var": 0.043526798486709595, |
| "reward": 0.890625, |
| "reward_std": 0.1065337061882019, |
| "rewards/drgrpo_math_reward/mean": 0.890625, |
| "rewards/drgrpo_math_reward/std": 0.31272050738334656, |
| "rho2": 0.1874999701976776, |
| "step": 497 |
| }, |
| { |
| "advantages/mean": -9.313225746154785e-10, |
| "advantages/snr": 3.7836533251311736e-09, |
| "advantages/std": 0.24614372849464417, |
| "advantages/var": 0.0605867350772451, |
| "completions/clipped_ratio": -2.9375, |
| "epoch": 2.859598853868195, |
| "grad_norm": 38.561900143076926, |
| "learning_rate": 3.8709294634702373e-07, |
| "loss": -0.4367, |
| "num_tokens": 77951892.0, |
| "residual_var": 0.04922673851251602, |
| "reward": 0.89453125, |
| "reward_std": 0.11336605250835419, |
| "rewards/drgrpo_math_reward/mean": 0.89453125, |
| "rewards/drgrpo_math_reward/std": 0.3077581524848938, |
| "rho2": 0.18749995529651642, |
| "step": 498 |
| }, |
| { |
| "advantages/mean": 6.984919309616089e-10, |
| "advantages/snr": 3.035955098730885e-09, |
| "advantages/std": 0.23007319867610931, |
| "advantages/var": 0.05293367674905647, |
| "completions/clipped_ratio": -2.75, |
| "epoch": 2.865329512893983, |
| "grad_norm": 70.63712164551843, |
| "learning_rate": 3.835529234405303e-07, |
| "loss": -0.2097, |
| "num_tokens": 78096080.0, |
| "residual_var": 0.04135444387793541, |
| "reward": 0.83203125, |
| "reward_std": 0.09954920411109924, |
| "rewards/drgrpo_math_reward/mean": 0.83203125, |
| "rewards/drgrpo_math_reward/std": 0.3745708465576172, |
| "rho2": 0.21874995529651642, |
| "step": 499 |
| }, |
| { |
| "advantages/mean": -8.149072527885437e-10, |
| "advantages/snr": 5.167123471029963e-09, |
| "advantages/std": 0.1577100306749344, |
| "advantages/var": 0.024872453775488745, |
| "completions/clipped_ratio": -2.921875, |
| "epoch": 2.871060171919771, |
| "grad_norm": 24.234209829875365, |
| "learning_rate": 3.8002531702090933e-07, |
| "loss": -0.3787, |
| "num_tokens": 78230318.0, |
| "residual_var": 0.02254066802561283, |
| "reward": 0.94921875, |
| "reward_std": 0.04510528966784477, |
| "rewards/drgrpo_math_reward/mean": 0.94921875, |
| "rewards/drgrpo_math_reward/std": 0.21998079121112823, |
| "rho2": 0.0937499850988388, |
| "step": 500 |
| }, |
| { |
| "advantages/mean": 2.3283064365386963e-10, |
| "advantages/snr": 1.2211688223967514e-09, |
| "advantages/std": 0.19066211581230164, |
| "advantages/var": 0.03635204240602352, |
| "completions/clipped_ratio": -2.984375, |
| "epoch": 2.8767908309455588, |
| "grad_norm": 31.66254462537452, |
| "learning_rate": 3.765101981412665e-07, |
| "loss": -0.1826, |
| "num_tokens": 78372140.0, |
| "residual_var": 0.02840004302561283, |
| "reward": 0.81640625, |
| "reward_std": 0.08246467262506485, |
| "rewards/drgrpo_math_reward/mean": 0.81640625, |
| "rewards/drgrpo_math_reward/std": 0.387910932302475, |
| "rho2": 0.21874995529651642, |
| "step": 501 |
| }, |
| { |
| "advantages/mean": -3.4924596548080444e-10, |
| "advantages/snr": 1.2624487090453552e-09, |
| "advantages/std": 0.27664169669151306, |
| "advantages/var": 0.07653062834835911, |
| "completions/clipped_ratio": -2.875, |
| "epoch": 2.8825214899713467, |
| "grad_norm": 48.621615828233956, |
| "learning_rate": 3.730076376031821e-07, |
| "loss": -0.811, |
| "num_tokens": 78520915.0, |
| "residual_var": 0.05500639230012894, |
| "reward": 0.8125, |
| "reward_std": 0.1434774398803711, |
| "rewards/drgrpo_math_reward/mean": 0.8125, |
| "rewards/drgrpo_math_reward/std": 0.3910769522190094, |
| "rho2": 0.2812499403953552, |
| "step": 502 |
| }, |
| { |
| "advantages/mean": -1.862645149230957e-09, |
| "advantages/snr": 9.292506007152315e-09, |
| "advantages/std": 0.20044593513011932, |
| "advantages/var": 0.040178572910188004, |
| "completions/clipped_ratio": -2.8125, |
| "epoch": 2.8882521489971347, |
| "grad_norm": 50.217816467547244, |
| "learning_rate": 3.6951770595528606e-07, |
| "loss": 0.1648, |
| "num_tokens": 78670558.0, |
| "residual_var": 0.032645098865032196, |
| "reward": 0.89453125, |
| "reward_std": 0.07995839416980743, |
| "rewards/drgrpo_math_reward/mean": 0.89453125, |
| "rewards/drgrpo_math_reward/std": 0.3077581524848938, |
| "rho2": 0.18749995529651642, |
| "step": 503 |
| }, |
| { |
| "advantages/mean": 1.280568540096283e-09, |
| "advantages/snr": 4.535454255086579e-09, |
| "advantages/std": 0.282346248626709, |
| "advantages/var": 0.07971940411357537, |
| "completions/clipped_ratio": -2.890625, |
| "epoch": 2.8939828080229226, |
| "grad_norm": 45.149223842328865, |
| "learning_rate": 3.6604047349183866e-07, |
| "loss": 0.558, |
| "num_tokens": 78817902.0, |
| "residual_var": 0.062280792742967606, |
| "reward": 0.80859375, |
| "reward_std": 0.13979163765907288, |
| "rewards/drgrpo_math_reward/mean": 0.80859375, |
| "rewards/drgrpo_math_reward/std": 0.39417871832847595, |
| "rho2": 0.21874995529651642, |
| "step": 504 |
| }, |
| { |
| "advantages/mean": -1.1641532182693481e-09, |
| "advantages/snr": 5.000042345610262e-09, |
| "advantages/std": 0.23282866179943085, |
| "advantages/var": 0.05420918575531375, |
| "completions/clipped_ratio": -2.890625, |
| "epoch": 2.8997134670487106, |
| "grad_norm": 39.995422100809684, |
| "learning_rate": 3.625760102513102e-07, |
| "loss": -0.403, |
| "num_tokens": 78964706.0, |
| "residual_var": 0.03726882115006447, |
| "reward": 0.86328125, |
| "reward_std": 0.12073761969804764, |
| "rewards/drgrpo_math_reward/mean": 0.86328125, |
| "rewards/drgrpo_math_reward/std": 0.34422317147254944, |
| "rho2": 0.3124999701976776, |
| "step": 505 |
| }, |
| { |
| "advantages/mean": -3.4924596548080444e-10, |
| "advantages/snr": 1.5461776548722686e-09, |
| "advantages/std": 0.22587698698043823, |
| "advantages/var": 0.05102041324736106, |
| "completions/clipped_ratio": -2.96875, |
| "epoch": 2.9054441260744985, |
| "grad_norm": 31.787805292575744, |
| "learning_rate": 3.5912438601497584e-07, |
| "loss": -0.0309, |
| "num_tokens": 79114030.0, |
| "residual_var": 0.04304848238825798, |
| "reward": 0.8828125, |
| "reward_std": 0.09719263762235641, |
| "rewards/drgrpo_math_reward/mean": 0.8828125, |
| "rewards/drgrpo_math_reward/std": 0.3222736418247223, |
| "rho2": 0.15624995529651642, |
| "step": 506 |
| }, |
| { |
| "advantages/mean": -9.313225746154785e-10, |
| "advantages/snr": 4.407822555439857e-09, |
| "advantages/std": 0.21128857135772705, |
| "advantages/var": 0.044642860386389316, |
| "completions/clipped_ratio": -2.921875, |
| "epoch": 2.9111747851002865, |
| "grad_norm": 34.96829725190875, |
| "learning_rate": 3.5568567030550577e-07, |
| "loss": -0.0775, |
| "num_tokens": 79275874.0, |
| "residual_var": 0.034877244383096695, |
| "reward": 0.8125, |
| "reward_std": 0.09100693464279175, |
| "rewards/drgrpo_math_reward/mean": 0.8125, |
| "rewards/drgrpo_math_reward/std": 0.3910769522190094, |
| "rho2": 0.21874995529651642, |
| "step": 507 |
| }, |
| { |
| "advantages/mean": -1.1641532182693481e-09, |
| "advantages/snr": 5.090684667873576e-09, |
| "advantages/std": 0.22868302464485168, |
| "advantages/var": 0.05229592576071784, |
| "completions/clipped_ratio": -2.796875, |
| "epoch": 2.9169054441260744, |
| "grad_norm": 51.66520804751849, |
| "learning_rate": 3.522599323855664e-07, |
| "loss": -0.3839, |
| "num_tokens": 79426033.0, |
| "residual_var": 0.040856197476387024, |
| "reward": 0.859375, |
| "reward_std": 0.09955164790153503, |
| "rewards/drgrpo_math_reward/mean": 0.859375, |
| "rewards/drgrpo_math_reward/std": 0.3483152687549591, |
| "rho2": 0.2187499701976776, |
| "step": 508 |
| }, |
| { |
| "advantages/mean": 4.656612873077393e-10, |
| "advantages/snr": 2.115126408796943e-09, |
| "advantages/std": 0.220157653093338, |
| "advantages/var": 0.048469392215566565, |
| "completions/clipped_ratio": -2.921875, |
| "epoch": 2.9226361031518624, |
| "grad_norm": 48.69432010109758, |
| "learning_rate": 3.488472412564264e-07, |
| "loss": -0.2349, |
| "num_tokens": 79574622.0, |
| "residual_var": 0.03938138857483864, |
| "reward": 0.84375, |
| "reward_std": 0.09495475143194199, |
| "rewards/drgrpo_math_reward/mean": 0.84375, |
| "rewards/drgrpo_math_reward/std": 0.3638034462928772, |
| "rho2": 0.1874999701976776, |
| "step": 509 |
| }, |
| { |
| "advantages/mean": 4.656612873077393e-10, |
| "advantages/snr": 2.1291806000454375e-09, |
| "advantages/std": 0.2187044471502304, |
| "advantages/var": 0.047831635203287926, |
| "completions/clipped_ratio": -2.859375, |
| "epoch": 2.9283667621776504, |
| "grad_norm": 40.305077001653665, |
| "learning_rate": 3.45447665656565e-07, |
| "loss": -0.1031, |
| "num_tokens": 79727238.0, |
| "residual_var": 0.03736847639083862, |
| "reward": 0.83203125, |
| "reward_std": 0.09442433714866638, |
| "rewards/drgrpo_math_reward/mean": 0.83203125, |
| "rewards/drgrpo_math_reward/std": 0.3745708465576172, |
| "rho2": 0.21874995529651642, |
| "step": 510 |
| }, |
| { |
| "advantages/mean": -5.820766091346741e-10, |
| "advantages/snr": 2.5610062551388323e-09, |
| "advantages/std": 0.22728432714939117, |
| "advantages/var": 0.051658165367751474, |
| "completions/clipped_ratio": -2.921875, |
| "epoch": 2.9340974212034383, |
| "grad_norm": 51.197575059820984, |
| "learning_rate": 3.420612740602874e-07, |
| "loss": -0.2717, |
| "num_tokens": 79878658.0, |
| "residual_var": 0.03874363377690315, |
| "reward": 0.85546875, |
| "reward_std": 0.10547532886266708, |
| "rewards/drgrpo_math_reward/mean": 0.85546875, |
| "rewards/drgrpo_math_reward/std": 0.35231640934944153, |
| "rho2": 0.2499999701976776, |
| "step": 511 |
| }, |
| { |
| "advantages/mean": -1.3969838619232178e-09, |
| "advantages/snr": 5.158405507265383e-09, |
| "advantages/std": 0.27081698179244995, |
| "advantages/var": 0.07334183762717217, |
| "completions/clipped_ratio": -2.71875, |
| "epoch": 2.9398280802292263, |
| "grad_norm": 67.20237705888161, |
| "learning_rate": 3.3868813467634827e-07, |
| "loss": -0.4017, |
| "num_tokens": 80040195.0, |
| "residual_var": 0.05042252317070961, |
| "reward": 0.73046875, |
| "reward_std": 0.14598125219345093, |
| "rewards/drgrpo_math_reward/mean": 0.73046875, |
| "rewards/drgrpo_math_reward/std": 0.44458550214767456, |
| "rho2": 0.3124999403953552, |
| "step": 512 |
| }, |
| { |
| "advantages/mean": -4.656612873077393e-10, |
| "advantages/snr": 2.879726051639026e-09, |
| "advantages/std": 0.16170331835746765, |
| "advantages/var": 0.026147963167816535, |
| "completions/clipped_ratio": -2.890625, |
| "epoch": 2.945558739255014, |
| "grad_norm": 28.3295603420239, |
| "learning_rate": 3.3532831544657456e-07, |
| "loss": 0.2034, |
| "num_tokens": 80168350.0, |
| "residual_var": 0.022879473865032196, |
| "reward": 0.91796875, |
| "reward_std": 0.05273643881082535, |
| "rewards/drgrpo_math_reward/mean": 0.91796875, |
| "rewards/drgrpo_math_reward/std": 0.2749498784542084, |
| "rho2": 0.12499997764825821, |
| "step": 513 |
| }, |
| { |
| "advantages/mean": -9.313225746154785e-10, |
| "advantages/snr": 4.0725477342988605e-09, |
| "advantages/std": 0.22868302464485168, |
| "advantages/var": 0.05229592576071784, |
| "completions/clipped_ratio": -2.8125, |
| "epoch": 2.951289398280802, |
| "grad_norm": 37.95690908529424, |
| "learning_rate": 3.3198188404449865e-07, |
| "loss": -0.3579, |
| "num_tokens": 80315188.0, |
| "residual_var": 0.03922194987535477, |
| "reward": 0.84375, |
| "reward_std": 0.1054728701710701, |
| "rewards/drgrpo_math_reward/mean": 0.84375, |
| "rewards/drgrpo_math_reward/std": 0.3638034462928772, |
| "rho2": 0.24999994039535522, |
| "step": 514 |
| }, |
| { |
| "advantages/mean": -4.656612873077393e-10, |
| "advantages/snr": 2.3049056547270335e-09, |
| "advantages/std": 0.2020305097103119, |
| "advantages/var": 0.04081632685380843, |
| "completions/clipped_ratio": -2.78125, |
| "epoch": 2.95702005730659, |
| "grad_norm": 35.60499253279663, |
| "learning_rate": 3.28648907873996e-07, |
| "loss": -0.2166, |
| "num_tokens": 80472984.0, |
| "residual_var": 0.034438785165548325, |
| "reward": 0.8046875, |
| "reward_std": 0.08048880845308304, |
| "rewards/drgrpo_math_reward/mean": 0.8046875, |
| "rewards/drgrpo_math_reward/std": 0.39721766114234924, |
| "rho2": 0.1562499701976776, |
| "step": 515 |
| }, |
| { |
| "advantages/mean": -4.656612873077393e-10, |
| "advantages/snr": 2.061570206496358e-09, |
| "advantages/std": 0.22587698698043823, |
| "advantages/var": 0.05102041324736106, |
| "completions/clipped_ratio": -2.890625, |
| "epoch": 2.962750716332378, |
| "grad_norm": 54.49623163410567, |
| "learning_rate": 3.253294540679257e-07, |
| "loss": -0.3385, |
| "num_tokens": 80603270.0, |
| "residual_var": 0.03985970839858055, |
| "reward": 0.8828125, |
| "reward_std": 0.09784172475337982, |
| "rewards/drgrpo_math_reward/mean": 0.8828125, |
| "rewards/drgrpo_math_reward/std": 0.3222736418247223, |
| "rho2": 0.21874995529651642, |
| "step": 516 |
| }, |
| { |
| "advantages/mean": 1.3969838619232178e-09, |
| "advantages/snr": 5.7361820050862295e-09, |
| "advantages/std": 0.243538960814476, |
| "advantages/var": 0.05931122543459488, |
| "completions/clipped_ratio": -2.921875, |
| "epoch": 2.968481375358166, |
| "grad_norm": 40.92539135350785, |
| "learning_rate": 3.220235894867793e-07, |
| "loss": -0.3327, |
| "num_tokens": 80761049.0, |
| "residual_var": 0.044483426958322525, |
| "reward": 0.82421875, |
| "reward_std": 0.11875930428504944, |
| "rewards/drgrpo_math_reward/mean": 0.82421875, |
| "rewards/drgrpo_math_reward/std": 0.3813795745372772, |
| "rho2": 0.24999995529651642, |
| "step": 517 |
| }, |
| { |
| "advantages/mean": 4.656612873077393e-10, |
| "advantages/snr": 1.7909778497636661e-09, |
| "advantages/std": 0.260003924369812, |
| "advantages/var": 0.06760204068770292, |
| "completions/clipped_ratio": -2.90625, |
| "epoch": 2.974212034383954, |
| "grad_norm": 58.74183993632324, |
| "learning_rate": 3.1873138071733396e-07, |
| "loss": -0.4267, |
| "num_tokens": 80912014.0, |
| "residual_var": 0.05070154368877411, |
| "reward": 0.8046875, |
| "reward_std": 0.1337556689977646, |
| "rewards/drgrpo_math_reward/mean": 0.8046875, |
| "rewards/drgrpo_math_reward/std": 0.39721766114234924, |
| "rho2": 0.24999994039535522, |
| "step": 518 |
| }, |
| { |
| "advantages/mean": -5.820766091346741e-10, |
| "advantages/snr": 2.3773286733085444e-09, |
| "advantages/std": 0.24484480917453766, |
| "advantages/var": 0.05994898057971576, |
| "completions/clipped_ratio": -2.734375, |
| "epoch": 2.9799426934097424, |
| "grad_norm": 54.40628144346276, |
| "learning_rate": 3.154528940713113e-07, |
| "loss": -0.5217, |
| "num_tokens": 81056401.0, |
| "residual_var": 0.04308834299445152, |
| "reward": 0.703125, |
| "reward_std": 0.11993882060050964, |
| "rewards/drgrpo_math_reward/mean": 0.703125, |
| "rewards/drgrpo_math_reward/std": 0.45777595043182373, |
| "rho2": 0.2812499403953552, |
| "step": 519 |
| }, |
| { |
| "advantages/mean": -5.820766091346741e-10, |
| "advantages/snr": 2.1218375358741393e-09, |
| "advantages/std": 0.2743266522884369, |
| "advantages/var": 0.07525511215578096, |
| "completions/clipped_ratio": -2.765625, |
| "epoch": 2.98567335243553, |
| "grad_norm": 63.17104470170552, |
| "learning_rate": 3.121881955840421e-07, |
| "loss": -0.4794, |
| "num_tokens": 81215589.0, |
| "residual_var": 0.05408961698412895, |
| "reward": 0.796875, |
| "reward_std": 0.14822161197662354, |
| "rewards/drgrpo_math_reward/mean": 0.796875, |
| "rewards/drgrpo_math_reward/std": 0.40311288833618164, |
| "rho2": 0.2812499403953552, |
| "step": 520 |
| }, |
| { |
| "advantages/mean": -1.1641532182693481e-10, |
| "advantages/snr": 5.590217219463297e-10, |
| "advantages/std": 0.2082482874393463, |
| "advantages/var": 0.043367349221420604, |
| "completions/clipped_ratio": -2.796875, |
| "epoch": 2.9914040114613183, |
| "grad_norm": 37.01518983121177, |
| "learning_rate": 3.0893735101313535e-07, |
| "loss": -0.0663, |
| "num_tokens": 81356272.0, |
| "residual_var": 0.03523597866296768, |
| "reward": 0.8671875, |
| "reward_std": 0.08337578177452087, |
| "rewards/drgrpo_math_reward/mean": 0.8671875, |
| "rewards/drgrpo_math_reward/std": 0.3400367796421051, |
| "rho2": 0.1874999701976776, |
| "step": 521 |
| }, |
| { |
| "advantages/mean": -8.149072527885437e-10, |
| "advantages/snr": 4.7068702485910024e-09, |
| "advantages/std": 0.17313143610954285, |
| "advantages/var": 0.029974494169352717, |
| "completions/clipped_ratio": -2.96875, |
| "epoch": 2.997134670487106, |
| "grad_norm": 58.93078878918081, |
| "learning_rate": 3.0570042583715405e-07, |
| "loss": -0.1133, |
| "num_tokens": 81490470.0, |
| "residual_var": 0.027164394035935402, |
| "reward": 0.90234375, |
| "reward_std": 0.05668424814939499, |
| "rewards/drgrpo_math_reward/mean": 0.90234375, |
| "rewards/drgrpo_math_reward/std": 0.29743078351020813, |
| "rho2": 0.0937499850988388, |
| "step": 522 |
| }, |
| { |
| "advantages/mean": 1.1641532182693481e-10, |
| "advantages/snr": 4.970887406617712e-10, |
| "advantages/std": 0.23419423401355743, |
| "advantages/var": 0.0548469392451969, |
| "completions/clipped_ratio": -2.84375, |
| "epoch": 3.005730659025788, |
| "grad_norm": 58.40789905317615, |
| "learning_rate": 3.0247748525429785e-07, |
| "loss": -0.1845, |
| "num_tokens": 81625398.0, |
| "residual_var": 0.03942125290632248, |
| "reward": 0.8359375, |
| "reward_std": 0.11481395363807678, |
| "rewards/drgrpo_math_reward/mean": 0.8359375, |
| "rewards/drgrpo_math_reward/std": 0.3710577189922333, |
| "rho2": 0.2812499403953552, |
| "step": 523 |
| }, |
| { |
| "advantages/mean": 2.3283064365386963e-10, |
| "advantages/snr": 7.009548925799328e-10, |
| "advantages/std": 0.332162082195282, |
| "advantages/var": 0.11033164884830526, |
| "completions/clipped_ratio": -2.640625, |
| "epoch": 3.011461318051576, |
| "grad_norm": 70.75980746457977, |
| "learning_rate": 2.992685941810864e-07, |
| "loss": -0.3411, |
| "num_tokens": 81783785.0, |
| "residual_var": 0.06206155940890312, |
| "reward": 0.78515625, |
| "reward_std": 0.2106797993183136, |
| "rewards/drgrpo_math_reward/mean": 0.78515625, |
| "rewards/drgrpo_math_reward/std": 0.4115184545516968, |
| "rho2": 0.4374999403953552, |
| "step": 524 |
| }, |
| { |
| "advantages/mean": -4.656612873077393e-10, |
| "advantages/snr": 2.1435186688144133e-09, |
| "advantages/std": 0.21724152565002441, |
| "advantages/var": 0.047193880466750215, |
| "completions/clipped_ratio": -2.875, |
| "epoch": 3.017191977077364, |
| "grad_norm": 42.73496740391504, |
| "learning_rate": 2.9607381725105507e-07, |
| "loss": -0.1104, |
| "num_tokens": 81926608.0, |
| "residual_var": 0.03539542108774185, |
| "reward": 0.8515625, |
| "reward_std": 0.10034800320863724, |
| "rewards/drgrpo_math_reward/mean": 0.8515625, |
| "rewards/drgrpo_math_reward/std": 0.3562295734882355, |
| "rho2": 0.24999995529651642, |
| "step": 525 |
| }, |
| { |
| "advantages/mean": -6.984919309616089e-10, |
| "advantages/snr": 3.3541305716820545e-09, |
| "advantages/std": 0.20824827253818512, |
| "advantages/var": 0.04336734301513823, |
| "completions/clipped_ratio": -2.828125, |
| "epoch": 3.022922636103152, |
| "grad_norm": 33.651790231476845, |
| "learning_rate": 2.9289321881345254e-07, |
| "loss": -0.3786, |
| "num_tokens": 82065619.0, |
| "residual_var": 0.03523597866296768, |
| "reward": 0.8359375, |
| "reward_std": 0.08982987701892853, |
| "rewards/drgrpo_math_reward/mean": 0.8359375, |
| "rewards/drgrpo_math_reward/std": 0.3710577189922333, |
| "rho2": 0.1874999701976776, |
| "step": 526 |
| }, |
| { |
| "advantages/mean": 1.1641532182693481e-10, |
| "advantages/snr": 7.199315129097565e-10, |
| "advantages/std": 0.16170331835746765, |
| "advantages/var": 0.026147963167816535, |
| "completions/clipped_ratio": -3.0, |
| "epoch": 3.0286532951289398, |
| "grad_norm": 32.1041070083472, |
| "learning_rate": 2.8972686293194306e-07, |
| "loss": -0.3798, |
| "num_tokens": 82190899.0, |
| "residual_var": 0.022879473865032196, |
| "reward": 0.97265625, |
| "reward_std": 0.05273643881082535, |
| "rewards/drgrpo_math_reward/mean": 0.97265625, |
| "rewards/drgrpo_math_reward/std": 0.1634024828672409, |
| "rho2": 0.12499997764825821, |
| "step": 527 |
| }, |
| { |
| "advantages/mean": -4.656612873077393e-10, |
| "advantages/snr": 2.3049056547270335e-09, |
| "advantages/std": 0.2020305097103119, |
| "advantages/var": 0.04081632685380843, |
| "completions/clipped_ratio": -2.859375, |
| "epoch": 3.0343839541547277, |
| "grad_norm": 69.42900570188318, |
| "learning_rate": 2.8657481338331713e-07, |
| "loss": -0.0839, |
| "num_tokens": 82341140.0, |
| "residual_var": 0.03188776969909668, |
| "reward": 0.8984375, |
| "reward_std": 0.0875919908285141, |
| "rewards/drgrpo_math_reward/mean": 0.8984375, |
| "rewards/drgrpo_math_reward/std": 0.3026638329029083, |
| "rho2": 0.21874995529651642, |
| "step": 528 |
| }, |
| { |
| "advantages/mean": 4.656612873077393e-10, |
| "advantages/snr": 2.380496292175044e-09, |
| "advantages/std": 0.19561520218849182, |
| "advantages/var": 0.038265307327244535, |
| "completions/clipped_ratio": -2.984375, |
| "epoch": 3.0401146131805157, |
| "grad_norm": 29.96720122136539, |
| "learning_rate": 2.834371336562077e-07, |
| "loss": -0.0128, |
| "num_tokens": 82472668.0, |
| "residual_var": 0.031090570613741875, |
| "reward": 0.9140625, |
| "reward_std": 0.07825092226266861, |
| "rewards/drgrpo_math_reward/mean": 0.9140625, |
| "rewards/drgrpo_math_reward/std": 0.28082075715065, |
| "rho2": 0.1874999701976776, |
| "step": 529 |
| }, |
| { |
| "advantages/mean": 5.820766091346741e-10, |
| "advantages/snr": 2.3773286733085444e-09, |
| "advantages/std": 0.24484480917453766, |
| "advantages/var": 0.05994898057971576, |
| "completions/clipped_ratio": -2.890625, |
| "epoch": 3.0458452722063036, |
| "grad_norm": 44.458534391156505, |
| "learning_rate": 2.803138869498102e-07, |
| "loss": -0.0022, |
| "num_tokens": 82621094.0, |
| "residual_var": 0.05245537310838699, |
| "reward": 0.875, |
| "reward_std": 0.10626532137393951, |
| "rewards/drgrpo_math_reward/mean": 0.875, |
| "rewards/drgrpo_math_reward/std": 0.33136674761772156, |
| "rho2": 0.12499997764825821, |
| "step": 530 |
| }, |
| { |
| "advantages/mean": 0.0, |
| "advantages/snr": 0.0, |
| "advantages/std": 0.2142857164144516, |
| "advantages/var": 0.04591836825925477, |
| "completions/clipped_ratio": -2.9375, |
| "epoch": 3.0515759312320916, |
| "grad_norm": 50.51329825337595, |
| "learning_rate": 2.7720513617260855e-07, |
| "loss": -0.5093, |
| "num_tokens": 82765309.0, |
| "residual_var": 0.03874363377690315, |
| "reward": 0.890625, |
| "reward_std": 0.09206777065992355, |
| "rewards/drgrpo_math_reward/mean": 0.890625, |
| "rewards/drgrpo_math_reward/std": 0.31272050738334656, |
| "rho2": 0.1562499701976776, |
| "step": 531 |
| }, |
| { |
| "advantages/mean": 4.656612873077393e-10, |
| "advantages/snr": 2.1151265519572637e-09, |
| "advantages/std": 0.22015763819217682, |
| "advantages/var": 0.04846938565435743, |
| "completions/clipped_ratio": -2.90625, |
| "epoch": 3.0573065902578795, |
| "grad_norm": 35.10227440382755, |
| "learning_rate": 2.7411094394111167e-07, |
| "loss": 0.1374, |
| "num_tokens": 82918289.0, |
| "residual_var": 0.03938138484954834, |
| "reward": 0.84375, |
| "reward_std": 0.09495474398136139, |
| "rewards/drgrpo_math_reward/mean": 0.84375, |
| "rewards/drgrpo_math_reward/std": 0.3638034462928772, |
| "rho2": 0.1874999701976776, |
| "step": 532 |
| }, |
| { |
| "advantages/mean": 4.656612873077393e-10, |
| "advantages/snr": 2.9155006725188733e-09, |
| "advantages/std": 0.15971913933753967, |
| "advantages/var": 0.025510203470724413, |
| "completions/clipped_ratio": -2.8125, |
| "epoch": 3.0630372492836675, |
| "grad_norm": 35.13963081749907, |
| "learning_rate": 2.7103137257858863e-07, |
| "loss": -0.2506, |
| "num_tokens": 83050303.0, |
| "residual_var": 0.021524246782064438, |
| "reward": 0.8671875, |
| "reward_std": 0.0586601123213768, |
| "rewards/drgrpo_math_reward/mean": 0.8671875, |
| "rewards/drgrpo_math_reward/std": 0.3400367796421051, |
| "rho2": 0.1562499701976776, |
| "step": 533 |
| }, |
| { |
| "advantages/mean": 0.0, |
| "advantages/snr": 0.0, |
| "advantages/std": 0.2395787239074707, |
| "advantages/var": 0.057397964949132074, |
| "completions/clipped_ratio": -2.796875, |
| "epoch": 3.0687679083094554, |
| "grad_norm": 46.470983378748755, |
| "learning_rate": 2.6796648411381485e-07, |
| "loss": -0.1509, |
| "num_tokens": 83225273.0, |
| "residual_var": 0.04304848611354828, |
| "reward": 0.8515625, |
| "reward_std": 0.11705183237791061, |
| "rewards/drgrpo_math_reward/mean": 0.8515625, |
| "rewards/drgrpo_math_reward/std": 0.3562295734882355, |
| "rho2": 0.24999995529651642, |
| "step": 534 |
| }, |
| { |
| "advantages/mean": -1.7462298274040222e-09, |
| "advantages/snr": 5.242033628734544e-09, |
| "advantages/std": 0.3331206738948822, |
| "advantages/var": 0.11096938337618045, |
| "completions/clipped_ratio": -2.765625, |
| "epoch": 3.0744985673352434, |
| "grad_norm": 66.47161989496657, |
| "learning_rate": 2.6491634027982324e-07, |
| "loss": -0.9807, |
| "num_tokens": 83377762.0, |
| "residual_var": 0.06588809192180634, |
| "reward": 0.71875, |
| "reward_std": 0.2112102210521698, |
| "rewards/drgrpo_math_reward/mean": 0.71875, |
| "rewards/drgrpo_math_reward/std": 0.45048993825912476, |
| "rho2": 0.4062499403953552, |
| "step": 535 |
| }, |
| { |
| "advantages/mean": -1.1641532182693481e-10, |
| "advantages/snr": 3.896001826068581e-10, |
| "advantages/std": 0.29880714416503906, |
| "advantages/var": 0.08928570940406644, |
| "completions/clipped_ratio": -2.765625, |
| "epoch": 3.0802292263610314, |
| "grad_norm": 55.3104308735608, |
| "learning_rate": 2.6188100251265943e-07, |
| "loss": -0.5125, |
| "num_tokens": 83524281.0, |
| "residual_var": 0.058593761175870895, |
| "reward": 0.8203125, |
| "reward_std": 0.16898946464061737, |
| "rewards/drgrpo_math_reward/mean": 0.8203125, |
| "rewards/drgrpo_math_reward/std": 0.38467901945114136, |
| "rho2": 0.3437499403953552, |
| "step": 536 |
| }, |
| { |
| "advantages/mean": 2.3283064365386963e-10, |
| "advantages/snr": 9.084364294103083e-10, |
| "advantages/std": 0.2562982141971588, |
| "advantages/var": 0.0656887746006527, |
| "completions/clipped_ratio": -2.9375, |
| "epoch": 3.0859598853868193, |
| "grad_norm": 48.087935659741, |
| "learning_rate": 2.5886053195014534e-07, |
| "loss": -0.335, |
| "num_tokens": 83672810.0, |
| "residual_var": 0.049266595393419266, |
| "reward": 0.90234375, |
| "reward_std": 0.1255941092967987, |
| "rewards/drgrpo_math_reward/mean": 0.90234375, |
| "rewards/drgrpo_math_reward/std": 0.29743078351020813, |
| "rho2": 0.24999995529651642, |
| "step": 537 |
| }, |
| { |
| "advantages/mean": -3.4924596548080444e-10, |
| "advantages/snr": 1.7153301191023887e-09, |
| "advantages/std": 0.20360276103019714, |
| "advantages/var": 0.041454084299119565, |
| "completions/clipped_ratio": -2.890625, |
| "epoch": 3.0916905444126073, |
| "grad_norm": 36.909324026341544, |
| "learning_rate": 2.558549894306472e-07, |
| "loss": -0.2538, |
| "num_tokens": 83813642.0, |
| "residual_var": 0.03368145599961281, |
| "reward": 0.77734375, |
| "reward_std": 0.08166831731796265, |
| "rewards/drgrpo_math_reward/mean": 0.77734375, |
| "rewards/drgrpo_math_reward/std": 0.41684433817863464, |
| "rho2": 0.1874999701976776, |
| "step": 538 |
| }, |
| { |
| "advantages/mean": 2.0954757928848267e-09, |
| "advantages/snr": 7.840552332757146e-09, |
| "advantages/std": 0.26726123690605164, |
| "advantages/var": 0.07142856875255266, |
| "completions/clipped_ratio": -2.96875, |
| "epoch": 3.097421203438395, |
| "grad_norm": 45.36181805672189, |
| "learning_rate": 2.528644354918503e-07, |
| "loss": -0.2785, |
| "num_tokens": 83948051.0, |
| "residual_var": 0.04464287683367729, |
| "reward": 0.765625, |
| "reward_std": 0.15137697756290436, |
| "rewards/drgrpo_math_reward/mean": 0.765625, |
| "rewards/drgrpo_math_reward/std": 0.42443734407424927, |
| "rho2": 0.3749999403953552, |
| "step": 539 |
| }, |
| { |
| "advantages/mean": 1.0477378964424133e-09, |
| "advantages/snr": 4.1697309751285605e-09, |
| "advantages/std": 0.2512722909450531, |
| "advantages/var": 0.06313776419677541, |
| "completions/clipped_ratio": -2.765625, |
| "epoch": 3.103151862464183, |
| "grad_norm": 48.626496905887414, |
| "learning_rate": 2.498889303695404e-07, |
| "loss": -0.0594, |
| "num_tokens": 84100302.0, |
| "residual_var": 0.04538027569651604, |
| "reward": 0.79296875, |
| "reward_std": 0.12335620820522308, |
| "rewards/drgrpo_math_reward/mean": 0.79296875, |
| "rewards/drgrpo_math_reward/std": 0.40597182512283325, |
| "rho2": 0.2812499403953552, |
| "step": 540 |
| }, |
| { |
| "advantages/mean": 3.14321368932724e-09, |
| "advantages/snr": 1.0556975559982473e-08, |
| "advantages/std": 0.29773807525634766, |
| "advantages/var": 0.08864796145735454, |
| "completions/clipped_ratio": -2.890625, |
| "epoch": 3.1088825214899716, |
| "grad_norm": 56.91244977023089, |
| "learning_rate": 2.4692853399638913e-07, |
| "loss": -0.3349, |
| "num_tokens": 84266071.0, |
| "residual_var": 0.055404990911483765, |
| "reward": 0.80859375, |
| "reward_std": 0.17491313815116882, |
| "rewards/drgrpo_math_reward/mean": 0.80859375, |
| "rewards/drgrpo_math_reward/std": 0.39417871832847595, |
| "rho2": 0.3749999403953552, |
| "step": 541 |
| }, |
| { |
| "advantages/mean": 0.0, |
| "advantages/snr": 0.0, |
| "advantages/std": 0.19561520218849182, |
| "advantages/var": 0.038265307327244535, |
| "completions/clipped_ratio": -3.0, |
| "epoch": 3.1146131805157595, |
| "grad_norm": 42.631437764051206, |
| "learning_rate": 2.439833060007471e-07, |
| "loss": -0.2056, |
| "num_tokens": 84388596.0, |
| "residual_var": 0.031090570613741875, |
| "reward": 0.9296875, |
| "reward_std": 0.07825092226266861, |
| "rewards/drgrpo_math_reward/mean": 0.9296875, |
| "rewards/drgrpo_math_reward/std": 0.2561737895011902, |
| "rho2": 0.1874999701976776, |
| "step": 542 |
| }, |
| { |
| "advantages/mean": 2.3283064365386963e-10, |
| "advantages/snr": 1.8081166084975916e-09, |
| "advantages/std": 0.12876969575881958, |
| "advantages/var": 0.016581634545818957, |
| "completions/clipped_ratio": -2.921875, |
| "epoch": 3.1203438395415475, |
| "grad_norm": 27.399377039968435, |
| "learning_rate": 2.410533057054446e-07, |
| "loss": -0.1746, |
| "num_tokens": 84527155.0, |
| "residual_var": 0.015027116052806377, |
| "reward": 0.859375, |
| "reward_std": 0.036563027650117874, |
| "rewards/drgrpo_math_reward/mean": 0.859375, |
| "rewards/drgrpo_math_reward/std": 0.3483152687549591, |
| "rho2": 0.0937499850988388, |
| "step": 543 |
| }, |
| { |
| "advantages/mean": -3.4924596548080444e-10, |
| "advantages/snr": 1.6412517826060687e-09, |
| "advantages/std": 0.21279242634773254, |
| "advantages/var": 0.04528061671095518, |
| "completions/clipped_ratio": -2.796875, |
| "epoch": 3.1260744985673354, |
| "grad_norm": 41.00985971424138, |
| "learning_rate": 2.381385921265936e-07, |
| "loss": -0.3216, |
| "num_tokens": 84699776.0, |
| "residual_var": 0.03396047279238701, |
| "reward": 0.74609375, |
| "reward_std": 0.09864053130149841, |
| "rewards/drgrpo_math_reward/mean": 0.74609375, |
| "rewards/drgrpo_math_reward/std": 0.4360972046852112, |
| "rho2": 0.24999995529651642, |
| "step": 544 |
| }, |
| { |
| "advantages/mean": -6.984919309616089e-10, |
| "advantages/snr": 3.911554501079231e-09, |
| "advantages/std": 0.1785714328289032, |
| "advantages/var": 0.03188775662256749, |
| "completions/clipped_ratio": -2.828125, |
| "epoch": 3.1318051575931234, |
| "grad_norm": 30.980264195023306, |
| "learning_rate": 2.352392239724016e-07, |
| "loss": 0.1331, |
| "num_tokens": 84825150.0, |
| "residual_var": 0.025908809155225754, |
| "reward": 0.7890625, |
| "reward_std": 0.07141612470149994, |
| "rewards/drgrpo_math_reward/mean": 0.7890625, |
| "rewards/drgrpo_math_reward/std": 0.4087733030319214, |
| "rho2": 0.18749995529651642, |
| "step": 545 |
| }, |
| { |
| "advantages/mean": 9.313225746154785e-10, |
| "advantages/snr": 3.931260267227327e-09, |
| "advantages/std": 0.23690177500247955, |
| "advantages/var": 0.056122450999325446, |
| "completions/clipped_ratio": -2.84375, |
| "epoch": 3.1375358166189113, |
| "grad_norm": 46.5091780323008, |
| "learning_rate": 2.3235525964198888e-07, |
| "loss": -0.0935, |
| "num_tokens": 84974438.0, |
| "residual_var": 0.04209184646606445, |
| "reward": 0.796875, |
| "reward_std": 0.1153419017791748, |
| "rewards/drgrpo_math_reward/mean": 0.796875, |
| "rewards/drgrpo_math_reward/std": 0.40311288833618164, |
| "rho2": 0.24999994039535522, |
| "step": 546 |
| }, |
| { |
| "advantages/mean": -4.656612873077393e-10, |
| "advantages/snr": 1.9120605513708516e-09, |
| "advantages/std": 0.2435389757156372, |
| "advantages/var": 0.05931123269262173, |
| "completions/clipped_ratio": -2.921875, |
| "epoch": 3.1432664756446993, |
| "grad_norm": 44.11507983606423, |
| "learning_rate": 2.2948675722421085e-07, |
| "loss": -0.6253, |
| "num_tokens": 85131523.0, |
| "residual_var": 0.04077647998929024, |
| "reward": 0.86328125, |
| "reward_std": 0.1258624941110611, |
| "rewards/drgrpo_math_reward/mean": 0.86328125, |
| "rewards/drgrpo_math_reward/std": 0.34422317147254944, |
| "rho2": 0.3124999403953552, |
| "step": 547 |
| }, |
| { |
| "advantages/mean": 1.3969838619232178e-09, |
| "advantages/snr": 4.79665589833252e-09, |
| "advantages/std": 0.29124119877815247, |
| "advantages/var": 0.08482143586573532, |
| "completions/clipped_ratio": -2.640625, |
| "epoch": 3.1489971346704873, |
| "grad_norm": 53.1988713378116, |
| "learning_rate": 2.266337744964888e-07, |
| "loss": -0.678, |
| "num_tokens": 85310056.0, |
| "residual_var": 0.05831475183367729, |
| "reward": 0.64453125, |
| "reward_std": 0.15794092416763306, |
| "rewards/drgrpo_math_reward/mean": 0.64453125, |
| "rewards/drgrpo_math_reward/std": 0.4795927405357361, |
| "rho2": 0.3124999403953552, |
| "step": 548 |
| }, |
| { |
| "advantages/mean": 9.313225746154785e-10, |
| "advantages/snr": 4.376671726766915e-09, |
| "advantages/std": 0.21279241144657135, |
| "advantages/var": 0.04528061036924691, |
| "completions/clipped_ratio": -2.90625, |
| "epoch": 3.154727793696275, |
| "grad_norm": 43.52396169619168, |
| "learning_rate": 2.2379636892364717e-07, |
| "loss": -0.1809, |
| "num_tokens": 85450686.0, |
| "residual_var": 0.036790505051612854, |
| "reward": 0.83984375, |
| "reward_std": 0.0850832611322403, |
| "rewards/drgrpo_math_reward/mean": 0.83984375, |
| "rewards/drgrpo_math_reward/std": 0.36746934056282043, |
| "rho2": 0.1874999701976776, |
| "step": 549 |
| }, |
| { |
| "advantages/mean": 9.313225746154785e-10, |
| "advantages/snr": 5.215406001438975e-09, |
| "advantages/std": 0.1785714328289032, |
| "advantages/var": 0.03188775662256749, |
| "completions/clipped_ratio": -3.0, |
| "epoch": 3.160458452722063, |
| "grad_norm": 37.17154478236981, |
| "learning_rate": 2.2097459765675343e-07, |
| "loss": -0.1732, |
| "num_tokens": 85594559.0, |
| "residual_var": 0.025908811017870903, |
| "reward": 0.90625, |
| "reward_std": 0.07141612470149994, |
| "rewards/drgrpo_math_reward/mean": 0.90625, |
| "rewards/drgrpo_math_reward/std": 0.2920515835285187, |
| "rho2": 0.18749995529651642, |
| "step": 550 |
| }, |
| { |
| "advantages/mean": -1.3969838619232178e-09, |
| "advantages/snr": 6.108821999503798e-09, |
| "advantages/std": 0.2286830097436905, |
| "advantages/var": 0.05229591894543284, |
| "completions/clipped_ratio": -2.828125, |
| "epoch": 3.166189111747851, |
| "grad_norm": 41.61138123554162, |
| "learning_rate": 2.181685175319702e-07, |
| "loss": -0.3167, |
| "num_tokens": 85736708.0, |
| "residual_var": 0.04249044507741928, |
| "reward": 0.796875, |
| "reward_std": 0.09836968779563904, |
| "rewards/drgrpo_math_reward/mean": 0.796875, |
| "rewards/drgrpo_math_reward/std": 0.40311288833618164, |
| "rho2": 0.18749994039535522, |
| "step": 551 |
| }, |
| { |
| "advantages/mean": 9.313225746154785e-10, |
| "advantages/snr": 5.559641684095414e-09, |
| "advantages/std": 0.1675148457288742, |
| "advantages/var": 0.028061223539568525, |
| "completions/clipped_ratio": -2.984375, |
| "epoch": 3.171919770773639, |
| "grad_norm": 33.13704713458902, |
| "learning_rate": 2.153781850694082e-07, |
| "loss": -0.0867, |
| "num_tokens": 85880529.0, |
| "residual_var": 0.024553582072257996, |
| "reward": 0.9140625, |
| "reward_std": 0.05444391071796417, |
| "rewards/drgrpo_math_reward/mean": 0.9140625, |
| "rewards/drgrpo_math_reward/std": 0.28082075715065, |
| "rho2": 0.12499996274709702, |
| "step": 552 |
| }, |
| { |
| "advantages/mean": -1.7462298274040222e-09, |
| "advantages/snr": 8.781698823081943e-09, |
| "advantages/std": 0.19884873926639557, |
| "advantages/var": 0.03954082110783497, |
| "completions/clipped_ratio": -2.96875, |
| "epoch": 3.177650429799427, |
| "grad_norm": 50.867545017443405, |
| "learning_rate": 2.1260365647198797e-07, |
| "loss": -0.27, |
| "num_tokens": 86020634.0, |
| "residual_var": 0.03212692588567734, |
| "reward": 0.90625, |
| "reward_std": 0.07996084541082382, |
| "rewards/drgrpo_math_reward/mean": 0.90625, |
| "rewards/drgrpo_math_reward/std": 0.2920515835285187, |
| "rho2": 0.1874999701976776, |
| "step": 553 |
| }, |
| { |
| "advantages/mean": 4.656612873077393e-10, |
| "advantages/snr": 1.891826548037334e-09, |
| "advantages/std": 0.24614374339580536, |
| "advantages/var": 0.060586742412900074, |
| "completions/clipped_ratio": -2.921875, |
| "epoch": 3.183381088825215, |
| "grad_norm": 43.90013873622376, |
| "learning_rate": 2.0984498762430957e-07, |
| "loss": -0.3093, |
| "num_tokens": 86160983.0, |
| "residual_var": 0.04922673478722572, |
| "reward": 0.92578125, |
| "reward_std": 0.11336604505777359, |
| "rewards/drgrpo_math_reward/mean": 0.92578125, |
| "rewards/drgrpo_math_reward/std": 0.2626400291919708, |
| "rho2": 0.1874999701976776, |
| "step": 554 |
| }, |
| { |
| "advantages/mean": 1.862645149230957e-09, |
| "advantages/snr": 7.527790190493179e-09, |
| "advantages/std": 0.24743583798408508, |
| "advantages/var": 0.0612244939188864, |
| "completions/clipped_ratio": -2.828125, |
| "epoch": 3.189111747851003, |
| "grad_norm": 54.52746162819845, |
| "learning_rate": 2.0710223409152471e-07, |
| "loss": -0.3519, |
| "num_tokens": 86308593.0, |
| "residual_var": 0.047831643372774124, |
| "reward": 0.7734375, |
| "reward_std": 0.12099964171648026, |
| "rewards/drgrpo_math_reward/mean": 0.7734375, |
| "rewards/drgrpo_math_reward/std": 0.41942715644836426, |
| "rho2": 0.2187499701976776, |
| "step": 555 |
| }, |
| { |
| "advantages/mean": -2.3283064365386963e-10, |
| "advantages/snr": 7.527789707330408e-10, |
| "advantages/std": 0.30929481983184814, |
| "advantages/var": 0.0956632855748154, |
| "completions/clipped_ratio": -2.921875, |
| "epoch": 3.194842406876791, |
| "grad_norm": 65.55525445754603, |
| "learning_rate": 2.043754511182191e-07, |
| "loss": -0.1717, |
| "num_tokens": 86470234.0, |
| "residual_var": 0.06875799596309662, |
| "reward": 0.8046875, |
| "reward_std": 0.17517516016960144, |
| "rewards/drgrpo_math_reward/mean": 0.8046875, |
| "rewards/drgrpo_math_reward/std": 0.39721766114234924, |
| "rho2": 0.2812499403953552, |
| "step": 556 |
| }, |
| { |
| "advantages/mean": -2.3283064365386963e-10, |
| "advantages/snr": 1.7120408307325485e-09, |
| "advantages/std": 0.13599595427513123, |
| "advantages/var": 0.018494899579203583, |
| "completions/clipped_ratio": -2.984375, |
| "epoch": 3.200573065902579, |
| "grad_norm": 21.553191096455976, |
| "learning_rate": 2.0166469362729865e-07, |
| "loss": -0.1121, |
| "num_tokens": 86616135.0, |
| "residual_var": 0.016761012375354767, |
| "reward": 0.84765625, |
| "reward_std": 0.0382704958319664, |
| "rewards/drgrpo_math_reward/mean": 0.84765625, |
| "rewards/drgrpo_math_reward/std": 0.3600577116012573, |
| "rho2": 0.09374997019767761, |
| "step": 557 |
| }, |
| { |
| "advantages/mean": -1.3969838619232178e-09, |
| "advantages/snr": 5.965064887941254e-09, |
| "advantages/std": 0.23419423401355743, |
| "advantages/var": 0.0548469392451969, |
| "completions/clipped_ratio": -2.9375, |
| "epoch": 3.206303724928367, |
| "grad_norm": 53.09012235190345, |
| "learning_rate": 1.9897001621888432e-07, |
| "loss": -0.4491, |
| "num_tokens": 86763730.0, |
| "residual_var": 0.03942125290632248, |
| "reward": 0.8203125, |
| "reward_std": 0.11481394618749619, |
| "rewards/drgrpo_math_reward/mean": 0.8203125, |
| "rewards/drgrpo_math_reward/std": 0.38467901945114136, |
| "rho2": 0.2812499403953552, |
| "step": 558 |
| }, |
| { |
| "advantages/mean": 9.313225746154785e-10, |
| "advantages/snr": 5.215406001438975e-09, |
| "advantages/std": 0.1785714328289032, |
| "advantages/var": 0.03188775662256749, |
| "completions/clipped_ratio": -2.859375, |
| "epoch": 3.2120343839541547, |
| "grad_norm": 41.12764521799682, |
| "learning_rate": 1.9629147316921123e-07, |
| "loss": -0.7028, |
| "num_tokens": 86899036.0, |
| "residual_var": 0.025908807292580605, |
| "reward": 0.90625, |
| "reward_std": 0.07141612470149994, |
| "rewards/drgrpo_math_reward/mean": 0.90625, |
| "rewards/drgrpo_math_reward/std": 0.2920515835285187, |
| "rho2": 0.1874999701976776, |
| "step": 559 |
| }, |
| { |
| "advantages/mean": 2.3283064365386963e-10, |
| "advantages/snr": 1.0717593344072067e-09, |
| "advantages/std": 0.21724152565002441, |
| "advantages/var": 0.047193880466750215, |
| "completions/clipped_ratio": -2.90625, |
| "epoch": 3.2177650429799427, |
| "grad_norm": 41.15636930814853, |
| "learning_rate": 1.9362911842953678e-07, |
| "loss": -0.4087, |
| "num_tokens": 87036281.0, |
| "residual_var": 0.03834503889083862, |
| "reward": 0.875, |
| "reward_std": 0.09324482083320618, |
| "rewards/drgrpo_math_reward/mean": 0.875, |
| "rewards/drgrpo_math_reward/std": 0.33136674761772156, |
| "rho2": 0.18749994039535522, |
| "step": 560 |
| }, |
| { |
| "advantages/mean": 4.656612873077393e-10, |
| "advantages/snr": 2.4863472343972843e-09, |
| "advantages/std": 0.18728730082511902, |
| "advantages/var": 0.03507653305035863, |
| "completions/clipped_ratio": -2.84375, |
| "epoch": 3.2234957020057307, |
| "grad_norm": 34.18837003348517, |
| "learning_rate": 1.9098300562505264e-07, |
| "loss": -0.2386, |
| "num_tokens": 87184357.0, |
| "residual_var": 0.02849968895316124, |
| "reward": 0.78515625, |
| "reward_std": 0.07483352720737457, |
| "rewards/drgrpo_math_reward/mean": 0.78515625, |
| "rewards/drgrpo_math_reward/std": 0.4115184545516968, |
| "rho2": 0.1874999701976776, |
| "step": 561 |
| }, |
| { |
| "advantages/mean": -4.656612873077393e-10, |
| "advantages/snr": 1.9120605513708516e-09, |
| "advantages/std": 0.2435389757156372, |
| "advantages/var": 0.05931123269262173, |
| "completions/clipped_ratio": -2.890625, |
| "epoch": 3.2292263610315186, |
| "grad_norm": 47.01875974339951, |
| "learning_rate": 1.8835318805380508e-07, |
| "loss": -0.3076, |
| "num_tokens": 87319723.0, |
| "residual_var": 0.05004385486245155, |
| "reward": 0.90234375, |
| "reward_std": 0.10573489964008331, |
| "rewards/drgrpo_math_reward/mean": 0.90234375, |
| "rewards/drgrpo_math_reward/std": 0.29743078351020813, |
| "rho2": 0.1562499701976776, |
| "step": 562 |
| }, |
| { |
| "advantages/mean": 0.0, |
| "advantages/snr": 0.0, |
| "advantages/std": 0.22728432714939117, |
| "advantages/var": 0.051658165367751474, |
| "completions/clipped_ratio": -3.0, |
| "epoch": 3.2349570200573066, |
| "grad_norm": 35.16009147859087, |
| "learning_rate": 1.8573971868562156e-07, |
| "loss": 0.0347, |
| "num_tokens": 87450416.0, |
| "residual_var": 0.04197227209806442, |
| "reward": 0.89453125, |
| "reward_std": 0.09837214648723602, |
| "rewards/drgrpo_math_reward/mean": 0.89453125, |
| "rewards/drgrpo_math_reward/std": 0.3077581524848938, |
| "rho2": 0.1874999701976776, |
| "step": 563 |
| }, |
| { |
| "advantages/mean": -1.3969838619232178e-09, |
| "advantages/snr": 6.387541800136313e-09, |
| "advantages/std": 0.2187044471502304, |
| "advantages/var": 0.047831635203287926, |
| "completions/clipped_ratio": -2.96875, |
| "epoch": 3.2406876790830945, |
| "grad_norm": 70.44073367079997, |
| "learning_rate": 1.8314265016104414e-07, |
| "loss": -0.3285, |
| "num_tokens": 87582852.0, |
| "residual_var": 0.041852690279483795, |
| "reward": 0.94140625, |
| "reward_std": 0.08785402029752731, |
| "rewards/drgrpo_math_reward/mean": 0.94140625, |
| "rewards/drgrpo_math_reward/std": 0.23532284796237946, |
| "rho2": 0.1249999850988388, |
| "step": 564 |
| }, |
| { |
| "advantages/mean": 4.656612873077393e-10, |
| "advantages/snr": 2.464047638551681e-09, |
| "advantages/std": 0.18898224830627441, |
| "advantages/var": 0.03571429017489436, |
| "completions/clipped_ratio": -2.875, |
| "epoch": 3.2464183381088825, |
| "grad_norm": 36.19653827955635, |
| "learning_rate": 1.805620347902681e-07, |
| "loss": 0.2787, |
| "num_tokens": 87718694.0, |
| "residual_var": 0.030133940279483795, |
| "reward": 0.8515625, |
| "reward_std": 0.06890984624624252, |
| "rewards/drgrpo_math_reward/mean": 0.8515625, |
| "rewards/drgrpo_math_reward/std": 0.3562295734882355, |
| "rho2": 0.1562499701976776, |
| "step": 565 |
| }, |
| { |
| "advantages/mean": 1.862645149230957e-09, |
| "advantages/snr": 7.527790190493179e-09, |
| "advantages/std": 0.24743583798408508, |
| "advantages/var": 0.0612244939188864, |
| "completions/clipped_ratio": -2.875, |
| "epoch": 3.2521489971346704, |
| "grad_norm": 35.60382202111244, |
| "learning_rate": 1.7799792455209016e-07, |
| "loss": -0.4789, |
| "num_tokens": 87860362.0, |
| "residual_var": 0.05165817216038704, |
| "reward": 0.84375, |
| "reward_std": 0.10744237154722214, |
| "rewards/drgrpo_math_reward/mean": 0.84375, |
| "rewards/drgrpo_math_reward/std": 0.3638034462928772, |
| "rho2": 0.1562499701976776, |
| "step": 566 |
| }, |
| { |
| "advantages/mean": 2.3283064365386963e-10, |
| "advantages/snr": 1.030785103248179e-09, |
| "advantages/std": 0.22587698698043823, |
| "advantages/var": 0.05102041324736106, |
| "completions/clipped_ratio": -2.875, |
| "epoch": 3.2578796561604584, |
| "grad_norm": 41.956700507237855, |
| "learning_rate": 1.7545037109285942e-07, |
| "loss": -0.1415, |
| "num_tokens": 88014979.0, |
| "residual_var": 0.03826531767845154, |
| "reward": 0.84375, |
| "reward_std": 0.10376295447349548, |
| "rewards/drgrpo_math_reward/mean": 0.84375, |
| "rewards/drgrpo_math_reward/std": 0.3638034462928772, |
| "rho2": 0.24999994039535522, |
| "step": 567 |
| }, |
| { |
| "advantages/mean": 3.4924596548080444e-10, |
| "advantages/snr": 1.4577502760144356e-09, |
| "advantages/std": 0.2395787239074707, |
| "advantages/var": 0.057397964949132074, |
| "completions/clipped_ratio": -2.78125, |
| "epoch": 3.2636103151862463, |
| "grad_norm": 53.664794389081344, |
| "learning_rate": 1.7291942572543805e-07, |
| "loss": -1.3273, |
| "num_tokens": 88173410.0, |
| "residual_var": 0.04304848238825798, |
| "reward": 0.875, |
| "reward_std": 0.11705182492733002, |
| "rewards/drgrpo_math_reward/mean": 0.875, |
| "rewards/drgrpo_math_reward/std": 0.33136674761772156, |
| "rho2": 0.24999994039535522, |
| "step": 568 |
| }, |
| { |
| "advantages/mean": -2.3283064365386963e-10, |
| "advantages/snr": 1.1615632508940394e-09, |
| "advantages/std": 0.20044593513011932, |
| "advantages/var": 0.040178572910188004, |
| "completions/clipped_ratio": -2.859375, |
| "epoch": 3.2693409742120343, |
| "grad_norm": 47.15953812150556, |
| "learning_rate": 1.7040513942816904e-07, |
| "loss": -0.5937, |
| "num_tokens": 88309931.0, |
| "residual_var": 0.032645098865032196, |
| "reward": 0.91796875, |
| "reward_std": 0.07995839416980743, |
| "rewards/drgrpo_math_reward/mean": 0.91796875, |
| "rewards/drgrpo_math_reward/std": 0.2749498784542084, |
| "rho2": 0.18749995529651642, |
| "step": 569 |
| }, |
| { |
| "advantages/mean": 0.0, |
| "advantages/snr": 0.0, |
| "advantages/std": 0.22587698698043823, |
| "advantages/var": 0.05102041324736106, |
| "completions/clipped_ratio": -3.0, |
| "epoch": 3.2750716332378222, |
| "grad_norm": 41.7539818186476, |
| "learning_rate": 1.6790756284384611e-07, |
| "loss": 0.1196, |
| "num_tokens": 88455379.0, |
| "residual_var": 0.04304847866296768, |
| "reward": 0.9296875, |
| "reward_std": 0.09073854982852936, |
| "rewards/drgrpo_math_reward/mean": 0.9296875, |
| "rewards/drgrpo_math_reward/std": 0.2561737895011902, |
| "rho2": 0.15624995529651642, |
| "step": 570 |
| }, |
| { |
| "advantages/mean": 9.313225746154785e-10, |
| "advantages/snr": 3.616233357409939e-09, |
| "advantages/std": 0.25753939151763916, |
| "advantages/var": 0.06632653818327583, |
| "completions/clipped_ratio": -2.796875, |
| "epoch": 3.28080229226361, |
| "grad_norm": 54.99980080757648, |
| "learning_rate": 1.6542674627869734e-07, |
| "loss": -0.1985, |
| "num_tokens": 88607222.0, |
| "residual_var": 0.055963024497032166, |
| "reward": 0.890625, |
| "reward_std": 0.11902132630348206, |
| "rewards/drgrpo_math_reward/mean": 0.890625, |
| "rewards/drgrpo_math_reward/std": 0.31272050738334656, |
| "rho2": 0.15624995529651642, |
| "step": 571 |
| }, |
| { |
| "advantages/mean": -9.313225746154785e-10, |
| "advantages/snr": 4.760992584350088e-09, |
| "advantages/std": 0.19561520218849182, |
| "advantages/var": 0.038265307327244535, |
| "completions/clipped_ratio": -2.84375, |
| "epoch": 3.286532951289398, |
| "grad_norm": 74.1830205981033, |
| "learning_rate": 1.6296273970136976e-07, |
| "loss": -0.1486, |
| "num_tokens": 88759284.0, |
| "residual_var": 0.033482152968645096, |
| "reward": 0.84375, |
| "reward_std": 0.07114773988723755, |
| "rewards/drgrpo_math_reward/mean": 0.84375, |
| "rewards/drgrpo_math_reward/std": 0.3638034462928772, |
| "rho2": 0.12499997019767761, |
| "step": 572 |
| }, |
| { |
| "advantages/mean": 9.313225746154785e-10, |
| "advantages/snr": 4.31630082528225e-09, |
| "advantages/std": 0.21576867997646332, |
| "advantages/var": 0.04655612325878544, |
| "completions/clipped_ratio": -2.859375, |
| "epoch": 3.292263610315186, |
| "grad_norm": 36.86016378974931, |
| "learning_rate": 1.6051559274192273e-07, |
| "loss": -0.4809, |
| "num_tokens": 88909197.0, |
| "residual_var": 0.03637198358774185, |
| "reward": 0.79296875, |
| "reward_std": 0.09271440654993057, |
| "rewards/drgrpo_math_reward/mean": 0.79296875, |
| "rewards/drgrpo_math_reward/std": 0.40597182512283325, |
| "rho2": 0.21874992549419403, |
| "step": 573 |
| }, |
| { |
| "advantages/mean": 4.656612873077393e-10, |
| "advantages/snr": 2.442337644793503e-09, |
| "advantages/std": 0.19066211581230164, |
| "advantages/var": 0.03635204240602352, |
| "completions/clipped_ratio": -2.828125, |
| "epoch": 3.297994269340974, |
| "grad_norm": 36.068320712126784, |
| "learning_rate": 1.5808535469082995e-07, |
| "loss": -0.0395, |
| "num_tokens": 89058629.0, |
| "residual_var": 0.02840004302561283, |
| "reward": 0.67578125, |
| "reward_std": 0.08246467262506485, |
| "rewards/drgrpo_math_reward/mean": 0.67578125, |
| "rewards/drgrpo_math_reward/std": 0.46899911761283875, |
| "rho2": 0.21874994039535522, |
| "step": 574 |
| }, |
| { |
| "advantages/mean": 1.3969838619232178e-09, |
| "advantages/snr": 4.987832171158065e-09, |
| "advantages/std": 0.28007835149765015, |
| "advantages/var": 0.07844388297764127, |
| "completions/clipped_ratio": -2.796875, |
| "epoch": 3.303724928366762, |
| "grad_norm": 63.2889445553862, |
| "learning_rate": 1.5567207449798515e-07, |
| "loss": -0.6808, |
| "num_tokens": 89220273.0, |
| "residual_var": 0.05393018200993538, |
| "reward": 0.76171875, |
| "reward_std": 0.15756022930145264, |
| "rewards/drgrpo_math_reward/mean": 0.76171875, |
| "rewards/drgrpo_math_reward/std": 0.4268665909767151, |
| "rho2": 0.31249991059303284, |
| "step": 575 |
| }, |
| { |
| "advantages/mean": 5.820766091346741e-10, |
| "advantages/snr": 2.5932212552551478e-09, |
| "advantages/std": 0.22446082532405853, |
| "advantages/var": 0.050382662105157516, |
| "completions/clipped_ratio": -2.953125, |
| "epoch": 3.30945558739255, |
| "grad_norm": 48.82676725018094, |
| "learning_rate": 1.5327580077171588e-07, |
| "loss": -0.2314, |
| "num_tokens": 89355020.0, |
| "residual_var": 0.037787001579999924, |
| "reward": 0.90234375, |
| "reward_std": 0.10376540571451187, |
| "rewards/drgrpo_math_reward/mean": 0.90234375, |
| "rewards/drgrpo_math_reward/std": 0.29743078351020813, |
| "rho2": 0.2499999701976776, |
| "step": 576 |
| }, |
| { |
| "advantages/mean": -2.3283064365386963e-10, |
| "advantages/snr": 1.7120408307325485e-09, |
| "advantages/std": 0.13599595427513123, |
| "advantages/var": 0.018494899579203583, |
| "completions/clipped_ratio": -2.84375, |
| "epoch": 3.315186246418338, |
| "grad_norm": 44.58586769511727, |
| "learning_rate": 1.508965817778065e-07, |
| "loss": -0.6477, |
| "num_tokens": 89481662.0, |
| "residual_var": 0.016761010512709618, |
| "reward": 0.87890625, |
| "reward_std": 0.0382704995572567, |
| "rewards/drgrpo_math_reward/mean": 0.87890625, |
| "rewards/drgrpo_math_reward/std": 0.3268752694129944, |
| "rho2": 0.09374997764825821, |
| "step": 577 |
| }, |
| { |
| "advantages/mean": 1.3969838619232178e-09, |
| "advantages/snr": 5.675479644112002e-09, |
| "advantages/std": 0.24614374339580536, |
| "advantages/var": 0.060586742412900074, |
| "completions/clipped_ratio": -2.796875, |
| "epoch": 3.3209169054441263, |
| "grad_norm": 42.11671269533907, |
| "learning_rate": 1.4853446543852388e-07, |
| "loss": -0.1887, |
| "num_tokens": 89619119.0, |
| "residual_var": 0.04922673478722572, |
| "reward": 0.80078125, |
| "reward_std": 0.11336604505777359, |
| "rewards/drgrpo_math_reward/mean": 0.80078125, |
| "rewards/drgrpo_math_reward/std": 0.40019527077674866, |
| "rho2": 0.1874999701976776, |
| "step": 578 |
| }, |
| { |
| "advantages/mean": -6.984919309616089e-10, |
| "advantages/snr": 3.6008779446726684e-09, |
| "advantages/std": 0.1939782202243805, |
| "advantages/var": 0.03762754992141826, |
| "completions/clipped_ratio": -2.875, |
| "epoch": 3.326647564469914, |
| "grad_norm": 34.86806196473536, |
| "learning_rate": 1.461894993316527e-07, |
| "loss": -0.316, |
| "num_tokens": 89770285.0, |
| "residual_var": 0.029396535828709602, |
| "reward": 0.79296875, |
| "reward_std": 0.08417459577322006, |
| "rewards/drgrpo_math_reward/mean": 0.79296875, |
| "rewards/drgrpo_math_reward/std": 0.40597182512283325, |
| "rho2": 0.2187499701976776, |
| "step": 579 |
| }, |
| { |
| "advantages/mean": -9.313225746154785e-10, |
| "advantages/snr": 4.683572705643703e-09, |
| "advantages/std": 0.19884873926639557, |
| "advantages/var": 0.03954082110783497, |
| "completions/clipped_ratio": -2.9375, |
| "epoch": 3.3323782234957022, |
| "grad_norm": 40.50716015519298, |
| "learning_rate": 1.4386173068953844e-07, |
| "loss": 0.12, |
| "num_tokens": 89924251.0, |
| "residual_var": 0.03212692588567734, |
| "reward": 0.8515625, |
| "reward_std": 0.07996084541082382, |
| "rewards/drgrpo_math_reward/mean": 0.8515625, |
| "rewards/drgrpo_math_reward/std": 0.3562295734882355, |
| "rho2": 0.1874999701976776, |
| "step": 580 |
| }, |
| { |
| "advantages/mean": 4.656612873077393e-10, |
| "advantages/snr": 2.380496292175044e-09, |
| "advantages/std": 0.19561520218849182, |
| "advantages/var": 0.038265307327244535, |
| "completions/clipped_ratio": -2.875, |
| "epoch": 3.3381088825214897, |
| "grad_norm": 28.278917680568767, |
| "learning_rate": 1.415512063981339e-07, |
| "loss": 0.0335, |
| "num_tokens": 90071164.0, |
| "residual_var": 0.033482152968645096, |
| "reward": 0.859375, |
| "reward_std": 0.0776018276810646, |
| "rewards/drgrpo_math_reward/mean": 0.859375, |
| "rewards/drgrpo_math_reward/std": 0.3483152687549591, |
| "rho2": 0.12499997019767761, |
| "step": 581 |
| }, |
| { |
| "advantages/mean": 1.1641532182693481e-09, |
| "advantages/snr": 4.5869338591272445e-09, |
| "advantages/std": 0.2537976801395416, |
| "advantages/var": 0.06441326244421308, |
| "completions/clipped_ratio": -2.921875, |
| "epoch": 3.343839541547278, |
| "grad_norm": 60.39334512324598, |
| "learning_rate": 1.3925797299605645e-07, |
| "loss": -0.2582, |
| "num_tokens": 90224239.0, |
| "residual_var": 0.04830996319651604, |
| "reward": 0.85546875, |
| "reward_std": 0.13033825159072876, |
| "rewards/drgrpo_math_reward/mean": 0.85546875, |
| "rewards/drgrpo_math_reward/std": 0.35231640934944153, |
| "rho2": 0.24999994039535522, |
| "step": 582 |
| }, |
| { |
| "advantages/mean": 0.0, |
| "advantages/snr": 0.0, |
| "advantages/std": 0.2355518937110901, |
| "advantages/var": 0.05548469463088068, |
| "completions/clipped_ratio": -2.8125, |
| "epoch": 3.349570200573066, |
| "grad_norm": 45.31574691335599, |
| "learning_rate": 1.3698207667364982e-07, |
| "loss": -0.2152, |
| "num_tokens": 90374149.0, |
| "residual_var": 0.041613537818193436, |
| "reward": 0.69921875, |
| "reward_std": 0.11534436047077179, |
| "rewards/drgrpo_math_reward/mean": 0.69921875, |
| "rewards/drgrpo_math_reward/std": 0.45949608087539673, |
| "rho2": 0.24999995529651642, |
| "step": 583 |
| }, |
| { |
| "advantages/mean": 1.1641532182693481e-10, |
| "advantages/snr": 5.02971679938451e-10, |
| "advantages/std": 0.2314550131559372, |
| "advantages/var": 0.05357142311501506, |
| "completions/clipped_ratio": -2.96875, |
| "epoch": 3.355300859598854, |
| "grad_norm": 46.7865592883064, |
| "learning_rate": 1.3472356327205402e-07, |
| "loss": -0.0519, |
| "num_tokens": 90506453.0, |
| "residual_var": 0.038504477590322495, |
| "reward": 0.8828125, |
| "reward_std": 0.11310403048992157, |
| "rewards/drgrpo_math_reward/mean": 0.8828125, |
| "rewards/drgrpo_math_reward/std": 0.3222736418247223, |
| "rho2": 0.2812499403953552, |
| "step": 584 |
| }, |
| { |
| "advantages/mean": 0.0, |
| "advantages/snr": 0.0, |
| "advantages/std": 0.19884872436523438, |
| "advantages/var": 0.039540815181680955, |
| "completions/clipped_ratio": -2.84375, |
| "epoch": 3.361031518624642, |
| "grad_norm": 29.8521935954077, |
| "learning_rate": 1.3248247828228243e-07, |
| "loss": -0.004, |
| "num_tokens": 90664546.0, |
| "residual_var": 0.03089127317070961, |
| "reward": 0.875, |
| "reward_std": 0.08588206768035889, |
| "rewards/drgrpo_math_reward/mean": 0.875, |
| "rewards/drgrpo_math_reward/std": 0.33136674761772156, |
| "rho2": 0.21874994039535522, |
| "step": 585 |
| }, |
| { |
| "advantages/mean": 4.656612873077393e-10, |
| "advantages/snr": 2.3417865283086908e-09, |
| "advantages/std": 0.19884872436523438, |
| "advantages/var": 0.039540815181680955, |
| "completions/clipped_ratio": -2.921875, |
| "epoch": 3.36676217765043, |
| "grad_norm": 35.58514447675251, |
| "learning_rate": 1.3025886684430465e-07, |
| "loss": 0.032, |
| "num_tokens": 90800250.0, |
| "residual_var": 0.03089127317070961, |
| "reward": 0.8984375, |
| "reward_std": 0.08588206768035889, |
| "rewards/drgrpo_math_reward/mean": 0.8984375, |
| "rewards/drgrpo_math_reward/std": 0.3026638329029083, |
| "rho2": 0.21874995529651642, |
| "step": 586 |
| }, |
| { |
| "advantages/mean": 6.984919309616089e-10, |
| "advantages/snr": 3.6960714578275213e-09, |
| "advantages/std": 0.18898224830627441, |
| "advantages/var": 0.03571429017489436, |
| "completions/clipped_ratio": -2.890625, |
| "epoch": 3.372492836676218, |
| "grad_norm": 37.42055712162249, |
| "learning_rate": 1.2805277374613744e-07, |
| "loss": -0.1686, |
| "num_tokens": 90946273.0, |
| "residual_var": 0.030133938416838646, |
| "reward": 0.8984375, |
| "reward_std": 0.06890985369682312, |
| "rewards/drgrpo_math_reward/mean": 0.8984375, |
| "rewards/drgrpo_math_reward/std": 0.3026638329029083, |
| "rho2": 0.15624995529651642, |
| "step": 587 |
| }, |
| { |
| "advantages/mean": -2.3283064365386963e-10, |
| "advantages/snr": 1.2002926482242227e-09, |
| "advantages/std": 0.1939782202243805, |
| "advantages/var": 0.03762754992141826, |
| "completions/clipped_ratio": -2.96875, |
| "epoch": 3.378223495702006, |
| "grad_norm": 36.531684597985844, |
| "learning_rate": 1.258642434229441e-07, |
| "loss": 0.061, |
| "num_tokens": 91078924.0, |
| "residual_var": 0.03174825385212898, |
| "reward": 0.90234375, |
| "reward_std": 0.077071413397789, |
| "rewards/drgrpo_math_reward/mean": 0.90234375, |
| "rewards/drgrpo_math_reward/std": 0.29743078351020813, |
| "rho2": 0.1562499701976776, |
| "step": 588 |
| }, |
| { |
| "advantages/mean": -2.3283064365386963e-10, |
| "advantages/snr": 1.190248146087522e-09, |
| "advantages/std": 0.19561520218849182, |
| "advantages/var": 0.038265307327244535, |
| "completions/clipped_ratio": -3.0, |
| "epoch": 3.383954154727794, |
| "grad_norm": 40.661807421663504, |
| "learning_rate": 1.2369331995613663e-07, |
| "loss": -0.2936, |
| "num_tokens": 91224368.0, |
| "residual_var": 0.031090570613741875, |
| "reward": 0.9296875, |
| "reward_std": 0.07825092226266861, |
| "rewards/drgrpo_math_reward/mean": 0.9296875, |
| "rewards/drgrpo_math_reward/std": 0.2561737895011902, |
| "rho2": 0.1874999701976776, |
| "step": 589 |
| }, |
| { |
| "advantages/mean": -8.149072527885437e-10, |
| "advantages/snr": 3.942246463486898e-09, |
| "advantages/std": 0.20671138167381287, |
| "advantages/var": 0.04272959531349674, |
| "completions/clipped_ratio": -2.953125, |
| "epoch": 3.3896848137535818, |
| "grad_norm": 32.00222756493884, |
| "learning_rate": 1.215400470724901e-07, |
| "loss": -0.2501, |
| "num_tokens": 91371870.0, |
| "residual_var": 0.03338250517845154, |
| "reward": 0.90234375, |
| "reward_std": 0.08929945528507233, |
| "rewards/drgrpo_math_reward/mean": 0.90234375, |
| "rewards/drgrpo_math_reward/std": 0.29743078351020813, |
| "rho2": 0.21874995529651642, |
| "step": 590 |
| }, |
| { |
| "advantages/mean": -2.3283064365386963e-10, |
| "advantages/snr": 1.3743802382730425e-09, |
| "advantages/std": 0.16940772533416748, |
| "advantages/var": 0.02869897740289673, |
| "completions/clipped_ratio": -2.96875, |
| "epoch": 3.3954154727793697, |
| "grad_norm": 30.824827526887713, |
| "learning_rate": 1.19404468143262e-07, |
| "loss": -0.0275, |
| "num_tokens": 91514404.0, |
| "residual_var": 0.02421477437019348, |
| "reward": 0.97265625, |
| "reward_std": 0.06207750737667084, |
| "rewards/drgrpo_math_reward/mean": 0.97265625, |
| "rewards/drgrpo_math_reward/std": 0.1634024828672409, |
| "rho2": 0.1562499701976776, |
| "step": 591 |
| }, |
| { |
| "advantages/mean": 1.5133991837501526e-09, |
| "advantages/snr": 5.688070868552017e-09, |
| "advantages/std": 0.26606544852256775, |
| "advantages/var": 0.07079082289751515, |
| "completions/clipped_ratio": -2.84375, |
| "epoch": 3.4011461318051577, |
| "grad_norm": 48.81711650537176, |
| "learning_rate": 1.1728662618331698e-07, |
| "loss": -0.4048, |
| "num_tokens": 91680934.0, |
| "residual_var": 0.04645648971199989, |
| "reward": 0.80078125, |
| "reward_std": 0.1437433809041977, |
| "rewards/drgrpo_math_reward/mean": 0.80078125, |
| "rewards/drgrpo_math_reward/std": 0.40019527077674866, |
| "rho2": 0.34374991059303284, |
| "step": 592 |
| }, |
| { |
| "advantages/mean": 4.656612873077393e-10, |
| "advantages/snr": 2.380496292175044e-09, |
| "advantages/std": 0.19561520218849182, |
| "advantages/var": 0.038265307327244535, |
| "completions/clipped_ratio": -2.6875, |
| "epoch": 3.4068767908309456, |
| "grad_norm": 49.10110326954101, |
| "learning_rate": 1.1518656385026148e-07, |
| "loss": -0.53, |
| "num_tokens": 91830724.0, |
| "residual_var": 0.031090570613741875, |
| "reward": 0.8515625, |
| "reward_std": 0.07825092226266861, |
| "rewards/drgrpo_math_reward/mean": 0.8515625, |
| "rewards/drgrpo_math_reward/std": 0.3562295734882355, |
| "rho2": 0.1874999701976776, |
| "step": 593 |
| }, |
| { |
| "advantages/mean": -2.3283064365386963e-10, |
| "advantages/snr": 1.439863025819513e-09, |
| "advantages/std": 0.16170331835746765, |
| "advantages/var": 0.026147963167816535, |
| "completions/clipped_ratio": -2.96875, |
| "epoch": 3.4126074498567336, |
| "grad_norm": 27.394151022413606, |
| "learning_rate": 1.1310432344358489e-07, |
| "loss": -0.1922, |
| "num_tokens": 91979322.0, |
| "residual_var": 0.022879473865032196, |
| "reward": 0.88671875, |
| "reward_std": 0.05273643881082535, |
| "rewards/drgrpo_math_reward/mean": 0.88671875, |
| "rewards/drgrpo_math_reward/std": 0.31755712628364563, |
| "rho2": 0.12499997019767761, |
| "step": 594 |
| }, |
| { |
| "advantages/mean": 2.3283064365386963e-10, |
| "advantages/snr": 9.884473017863691e-10, |
| "advantages/std": 0.2355518937110901, |
| "advantages/var": 0.05548469463088068, |
| "completions/clipped_ratio": -2.796875, |
| "epoch": 3.4183381088825215, |
| "grad_norm": 42.73983938785139, |
| "learning_rate": 1.1103994690380681e-07, |
| "loss": -0.4317, |
| "num_tokens": 92123199.0, |
| "residual_var": 0.03987964242696762, |
| "reward": 0.84765625, |
| "reward_std": 0.1148114949464798, |
| "rewards/drgrpo_math_reward/mean": 0.84765625, |
| "rewards/drgrpo_math_reward/std": 0.3600577116012573, |
| "rho2": 0.28124991059303284, |
| "step": 595 |
| }, |
| { |
| "advantages/mean": 0.0, |
| "advantages/snr": 0.0, |
| "advantages/std": 0.13363061845302582, |
| "advantages/var": 0.017857142188138164, |
| "completions/clipped_ratio": -2.875, |
| "epoch": 3.4240687679083095, |
| "grad_norm": 23.86254859993742, |
| "learning_rate": 1.089934758116322e-07, |
| "loss": -0.097, |
| "num_tokens": 92270530.0, |
| "residual_var": 0.015625009313225746, |
| "reward": 0.8359375, |
| "reward_std": 0.04419417306780815, |
| "rewards/drgrpo_math_reward/mean": 0.8359375, |
| "rewards/drgrpo_math_reward/std": 0.3710577189922333, |
| "rho2": 0.12499997764825821, |
| "step": 596 |
| }, |
| { |
| "advantages/mean": -9.313225746154785e-10, |
| "advantages/snr": 5.437436706144895e-09, |
| "advantages/std": 0.1712796986103058, |
| "advantages/var": 0.029336735156037186, |
| "completions/clipped_ratio": -2.953125, |
| "epoch": 3.4297994269340975, |
| "grad_norm": 31.83756189804068, |
| "learning_rate": 1.069649513871147e-07, |
| "loss": 0.015, |
| "num_tokens": 92408037.0, |
| "residual_var": 0.025669652968645096, |
| "reward": 0.8671875, |
| "reward_std": 0.056153833866119385, |
| "rewards/drgrpo_math_reward/mean": 0.8671875, |
| "rewards/drgrpo_math_reward/std": 0.3400367796421051, |
| "rho2": 0.12499997764825821, |
| "step": 597 |
| }, |
| { |
| "advantages/mean": -3.4924596548080444e-10, |
| "advantages/snr": 2.407395353089743e-09, |
| "advantages/std": 0.1450721174478531, |
| "advantages/var": 0.02104591926080368, |
| "completions/clipped_ratio": -2.984375, |
| "epoch": 3.4355300859598854, |
| "grad_norm": 60.871403127702614, |
| "learning_rate": 1.049544144888257e-07, |
| "loss": -0.1135, |
| "num_tokens": 92547556.0, |
| "residual_var": 0.018415190279483795, |
| "reward": 0.86328125, |
| "reward_std": 0.04761157184839249, |
| "rewards/drgrpo_math_reward/mean": 0.86328125, |
| "rewards/drgrpo_math_reward/std": 0.34422317147254944, |
| "rho2": 0.12499997764825821, |
| "step": 598 |
| }, |
| { |
| "advantages/mean": 0.0, |
| "advantages/snr": 0.0, |
| "advantages/std": 0.23282866179943085, |
| "advantages/var": 0.05420918575531375, |
| "completions/clipped_ratio": -2.9375, |
| "epoch": 3.4412607449856734, |
| "grad_norm": 41.65630491047125, |
| "learning_rate": 1.0296190561303131e-07, |
| "loss": -0.0226, |
| "num_tokens": 92689649.0, |
| "residual_var": 0.04065689817070961, |
| "reward": 0.88671875, |
| "reward_std": 0.11363443732261658, |
| "rewards/drgrpo_math_reward/mean": 0.88671875, |
| "rewards/drgrpo_math_reward/std": 0.31755712628364563, |
| "rho2": 0.24999994039535522, |
| "step": 599 |
| }, |
| { |
| "advantages/mean": -6.984919309616089e-10, |
| "advantages/snr": 2.464047682013392e-09, |
| "advantages/std": 0.2834733724594116, |
| "advantages/var": 0.08035715289351231, |
| "completions/clipped_ratio": -2.90625, |
| "epoch": 3.4469914040114613, |
| "grad_norm": 53.78196015691887, |
| "learning_rate": 1.0098746489287758e-07, |
| "loss": 0.2443, |
| "num_tokens": 92845107.0, |
| "residual_var": 0.057756707072257996, |
| "reward": 0.8046875, |
| "reward_std": 0.1404382884502411, |
| "rewards/drgrpo_math_reward/mean": 0.8046875, |
| "rewards/drgrpo_math_reward/std": 0.39721766114234924, |
| "rho2": 0.2812499403953552, |
| "step": 600 |
| }, |
| { |
| "advantages/mean": 1.0477378964424133e-09, |
| "advantages/snr": 4.373250828043307e-09, |
| "advantages/std": 0.2395787239074707, |
| "advantages/var": 0.057397964949132074, |
| "completions/clipped_ratio": -3.0, |
| "epoch": 3.4527220630372493, |
| "grad_norm": 39.90121714734247, |
| "learning_rate": 9.903113209758096e-08, |
| "loss": -0.3874, |
| "num_tokens": 92989220.0, |
| "residual_var": 0.04304848238825798, |
| "reward": 0.90625, |
| "reward_std": 0.11705183237791061, |
| "rewards/drgrpo_math_reward/mean": 0.90625, |
| "rewards/drgrpo_math_reward/std": 0.2920515835285187, |
| "rho2": 0.24999995529651642, |
| "step": 601 |
| }, |
| { |
| "advantages/mean": -1.1641532182693481e-10, |
| "advantages/snr": 4.754657057250496e-10, |
| "advantages/std": 0.24484482407569885, |
| "advantages/var": 0.05994898787665992, |
| "completions/clipped_ratio": -2.921875, |
| "epoch": 3.458452722063037, |
| "grad_norm": 55.126160561348684, |
| "learning_rate": 9.70929466316277e-08, |
| "loss": -0.3398, |
| "num_tokens": 93155169.0, |
| "residual_var": 0.04496174678206444, |
| "reward": 0.8125, |
| "reward_std": 0.11401759088039398, |
| "rewards/drgrpo_math_reward/mean": 0.8125, |
| "rewards/drgrpo_math_reward/std": 0.3910769522190094, |
| "rho2": 0.2499999701976776, |
| "step": 602 |
| }, |
| { |
| "advantages/mean": -1.1641532182693481e-09, |
| "advantages/snr": 5.153925516240895e-09, |
| "advantages/std": 0.22587698698043823, |
| "advantages/var": 0.05102041324736106, |
| "completions/clipped_ratio": -2.921875, |
| "epoch": 3.464183381088825, |
| "grad_norm": 47.87950722821978, |
| "learning_rate": 9.517294753398064e-08, |
| "loss": -0.1339, |
| "num_tokens": 93315903.0, |
| "residual_var": 0.03985970839858055, |
| "reward": 0.734375, |
| "reward_std": 0.10429581999778748, |
| "rewards/drgrpo_math_reward/mean": 0.734375, |
| "rewards/drgrpo_math_reward/std": 0.4425306022167206, |
| "rho2": 0.2187499701976776, |
| "step": 603 |
| }, |
| { |
| "advantages/mean": -8.149072527885437e-10, |
| "advantages/snr": 3.8029002977661035e-09, |
| "advantages/std": 0.2142857164144516, |
| "advantages/var": 0.04591836825925477, |
| "completions/clipped_ratio": -2.90625, |
| "epoch": 3.469914040114613, |
| "grad_norm": 49.87684872523336, |
| "learning_rate": 9.327117347729197e-08, |
| "loss": -0.1311, |
| "num_tokens": 93471268.0, |
| "residual_var": 0.035873737186193466, |
| "reward": 0.8984375, |
| "reward_std": 0.09271685779094696, |
| "rewards/drgrpo_math_reward/mean": 0.8984375, |
| "rewards/drgrpo_math_reward/std": 0.3026638329029083, |
| "rho2": 0.2187499701976776, |
| "step": 604 |
| }, |
| { |
| "advantages/mean": 1.6298145055770874e-09, |
| "advantages/snr": 7.2154961987453165e-09, |
| "advantages/std": 0.22587697207927704, |
| "advantages/var": 0.0510204065157025, |
| "completions/clipped_ratio": -2.9375, |
| "epoch": 3.475644699140401, |
| "grad_norm": 38.908678706119375, |
| "learning_rate": 9.13876627671255e-08, |
| "loss": 0.0163, |
| "num_tokens": 93615423.0, |
| "residual_var": 0.03826531767845154, |
| "reward": 0.828125, |
| "reward_std": 0.10376295447349548, |
| "rewards/drgrpo_math_reward/mean": 0.828125, |
| "rewards/drgrpo_math_reward/std": 0.3780108094215393, |
| "rho2": 0.24999994039535522, |
| "step": 605 |
| }, |
| { |
| "advantages/mean": -3.4924596548080444e-10, |
| "advantages/snr": 1.2785315971737493e-09, |
| "advantages/std": 0.27316176891326904, |
| "advantages/var": 0.0746173519958262, |
| "completions/clipped_ratio": -2.890625, |
| "epoch": 3.481375358166189, |
| "grad_norm": 62.798294427898014, |
| "learning_rate": 8.952245334118413e-08, |
| "loss": -0.8204, |
| "num_tokens": 93761778.0, |
| "residual_var": 0.04663585498929024, |
| "reward": 0.80078125, |
| "reward_std": 0.1547943651676178, |
| "rewards/drgrpo_math_reward/mean": 0.80078125, |
| "rewards/drgrpo_math_reward/std": 0.40019527077674866, |
| "rho2": 0.3749999403953552, |
| "step": 606 |
| }, |
| { |
| "advantages/mean": 2.3283064365386963e-10, |
| "advantages/snr": 1.0941679316917287e-09, |
| "advantages/std": 0.21279241144657135, |
| "advantages/var": 0.04528061036924691, |
| "completions/clipped_ratio": -2.953125, |
| "epoch": 3.487106017191977, |
| "grad_norm": 88.73741860886084, |
| "learning_rate": 8.767558276854547e-08, |
| "loss": -0.4571, |
| "num_tokens": 93888659.0, |
| "residual_var": 0.036790505051612854, |
| "reward": 0.87890625, |
| "reward_std": 0.0850832611322403, |
| "rewards/drgrpo_math_reward/mean": 0.87890625, |
| "rewards/drgrpo_math_reward/std": 0.3268752694129944, |
| "rho2": 0.18749995529651642, |
| "step": 607 |
| }, |
| { |
| "advantages/mean": -1.1641532182693481e-10, |
| "advantages/snr": 9.040584088659493e-10, |
| "advantages/std": 0.1287696808576584, |
| "advantages/var": 0.016581630708183193, |
| "completions/clipped_ratio": -2.984375, |
| "epoch": 3.492836676217765, |
| "grad_norm": 23.220600936321258, |
| "learning_rate": 8.584708824890696e-08, |
| "loss": -0.0098, |
| "num_tokens": 94015866.0, |
| "residual_var": 0.015027116052806377, |
| "reward": 0.890625, |
| "reward_std": 0.036563027650117874, |
| "rewards/drgrpo_math_reward/mean": 0.890625, |
| "rewards/drgrpo_math_reward/std": 0.31272050738334656, |
| "rho2": 0.0937499850988388, |
| "step": 608 |
| }, |
| { |
| "advantages/mean": -4.656612873077393e-10, |
| "advantages/snr": 2.101346966143024e-09, |
| "advantages/std": 0.22160132229328156, |
| "advantages/var": 0.049107146042130845, |
| "completions/clipped_ratio": -2.953125, |
| "epoch": 3.498567335243553, |
| "grad_norm": 29.651912245039497, |
| "learning_rate": 8.403700661183355e-08, |
| "loss": 0.1099, |
| "num_tokens": 94150833.0, |
| "residual_var": 0.041434165090322495, |
| "reward": 0.83984375, |
| "reward_std": 0.09548516571521759, |
| "rewards/drgrpo_math_reward/mean": 0.83984375, |
| "rewards/drgrpo_math_reward/std": 0.36746934056282043, |
| "rho2": 0.1562499701976776, |
| "step": 609 |
| }, |
| { |
| "advantages/mean": 2.3283064365386963e-10, |
| "advantages/snr": 1.0181369335747151e-09, |
| "advantages/std": 0.22868302464485168, |
| "advantages/var": 0.05229592576071784, |
| "completions/clipped_ratio": -2.875, |
| "epoch": 3.504297994269341, |
| "grad_norm": 44.8975747932072, |
| "learning_rate": 8.224537431601886e-08, |
| "loss": -0.3263, |
| "num_tokens": 94306132.0, |
| "residual_var": 0.03922194987535477, |
| "reward": 0.7421875, |
| "reward_std": 0.1054728701710701, |
| "rewards/drgrpo_math_reward/mean": 0.7421875, |
| "rewards/drgrpo_math_reward/std": 0.4382871091365814, |
| "rho2": 0.24999994039535522, |
| "step": 610 |
| }, |
| { |
| "advantages/mean": 1.7462298274040222e-09, |
| "advantages/snr": 7.371113001051238e-09, |
| "advantages/std": 0.23690177500247955, |
| "advantages/var": 0.056122450999325446, |
| "completions/clipped_ratio": -2.90625, |
| "epoch": 3.510028653295129, |
| "grad_norm": 46.59898003698075, |
| "learning_rate": 8.047222744854942e-08, |
| "loss": -0.6579, |
| "num_tokens": 94445829.0, |
| "residual_var": 0.04735332727432251, |
| "reward": 0.8203125, |
| "reward_std": 0.10231750458478928, |
| "rewards/drgrpo_math_reward/mean": 0.8203125, |
| "rewards/drgrpo_math_reward/std": 0.38467901945114136, |
| "rho2": 0.15624995529651642, |
| "step": 611 |
| }, |
| { |
| "advantages/mean": 9.313225746154785e-10, |
| "advantages/snr": 3.7638950952465895e-09, |
| "advantages/std": 0.24743583798408508, |
| "advantages/var": 0.0612244939188864, |
| "completions/clipped_ratio": -2.9375, |
| "epoch": 3.5157593123209168, |
| "grad_norm": 39.87785063035209, |
| "learning_rate": 7.871760172417763e-08, |
| "loss": -0.5317, |
| "num_tokens": 94594088.0, |
| "residual_var": 0.04209185391664505, |
| "reward": 0.8515625, |
| "reward_std": 0.12756997346878052, |
| "rewards/drgrpo_math_reward/mean": 0.8515625, |
| "rewards/drgrpo_math_reward/std": 0.3562295734882355, |
| "rho2": 0.31249991059303284, |
| "step": 612 |
| }, |
| { |
| "advantages/mean": -4.656612873077393e-10, |
| "advantages/snr": 1.7120408936770962e-09, |
| "advantages/std": 0.27199190855026245, |
| "advantages/var": 0.07397959831681433, |
| "completions/clipped_ratio": -2.84375, |
| "epoch": 3.5214899713467047, |
| "grad_norm": 47.423187700952894, |
| "learning_rate": 7.698153248460271e-08, |
| "loss": -0.7605, |
| "num_tokens": 94757232.0, |
| "residual_var": 0.048549119383096695, |
| "reward": 0.8359375, |
| "reward_std": 0.14716076850891113, |
| "rewards/drgrpo_math_reward/mean": 0.8359375, |
| "rewards/drgrpo_math_reward/std": 0.3710577189922333, |
| "rho2": 0.3437499403953552, |
| "step": 613 |
| }, |
| { |
| "advantages/mean": -2.3283064365386963e-10, |
| "advantages/snr": 9.459133312827934e-10, |
| "advantages/std": 0.24614372849464417, |
| "advantages/var": 0.0605867350772451, |
| "completions/clipped_ratio": -2.90625, |
| "epoch": 3.5272206303724927, |
| "grad_norm": 50.92583879853945, |
| "learning_rate": 7.526405469775954e-08, |
| "loss": -0.1927, |
| "num_tokens": 94906140.0, |
| "residual_var": 0.04544006660580635, |
| "reward": 0.84765625, |
| "reward_std": 0.12046922743320465, |
| "rewards/drgrpo_math_reward/mean": 0.84765625, |
| "rewards/drgrpo_math_reward/std": 0.3600577116012573, |
| "rho2": 0.24999995529651642, |
| "step": 614 |
| }, |
| { |
| "advantages/mean": -6.984919309616089e-10, |
| "advantages/snr": 3.1937709000681565e-09, |
| "advantages/std": 0.2187044471502304, |
| "advantages/var": 0.047831635203287926, |
| "completions/clipped_ratio": -2.90625, |
| "epoch": 3.532951289398281, |
| "grad_norm": 44.685094883209764, |
| "learning_rate": 7.356520295711254e-08, |
| "loss": -0.294, |
| "num_tokens": 95067668.0, |
| "residual_var": 0.037368472665548325, |
| "reward": 0.85546875, |
| "reward_std": 0.10087842494249344, |
| "rewards/drgrpo_math_reward/mean": 0.85546875, |
| "rewards/drgrpo_math_reward/std": 0.35231640934944153, |
| "rho2": 0.2187499701976776, |
| "step": 615 |
| }, |
| { |
| "advantages/mean": 0.0, |
| "advantages/snr": 0.0, |
| "advantages/std": 0.1157275065779686, |
| "advantages/var": 0.013392855778753765, |
| "completions/clipped_ratio": -3.0, |
| "epoch": 3.5386819484240686, |
| "grad_norm": 28.964255935442797, |
| "learning_rate": 7.188501148096116e-08, |
| "loss": -0.0823, |
| "num_tokens": 95207002.0, |
| "residual_var": 0.012137286365032196, |
| "reward": 0.86328125, |
| "reward_std": 0.03314562886953354, |
| "rewards/drgrpo_math_reward/mean": 0.86328125, |
| "rewards/drgrpo_math_reward/std": 0.34422317147254944, |
| "rho2": 0.09374997764825821, |
| "step": 616 |
| }, |
| { |
| "advantages/mean": 6.984919309616089e-10, |
| "advantages/snr": 2.7658867314519385e-09, |
| "advantages/std": 0.25253814458847046, |
| "advantages/var": 0.06377551447218721, |
| "completions/clipped_ratio": -3.0, |
| "epoch": 3.544412607449857, |
| "grad_norm": 49.01721111929761, |
| "learning_rate": 7.022351411174865e-08, |
| "loss": -0.1572, |
| "num_tokens": 95352828.0, |
| "residual_var": 0.05181760713458061, |
| "reward": 0.8203125, |
| "reward_std": 0.11678344011306763, |
| "rewards/drgrpo_math_reward/mean": 0.8203125, |
| "rewards/drgrpo_math_reward/std": 0.38467901945114136, |
| "rho2": 0.1874999701976776, |
| "step": 617 |
| }, |
| { |
| "advantages/mean": 4.656612873077393e-10, |
| "advantages/snr": 3.209860470786324e-09, |
| "advantages/std": 0.1450721174478531, |
| "advantages/var": 0.02104591926080368, |
| "completions/clipped_ratio": -2.984375, |
| "epoch": 3.5501432664756445, |
| "grad_norm": 30.35392299977766, |
| "learning_rate": 6.858074431538164e-08, |
| "loss": -0.0558, |
| "num_tokens": 95497198.0, |
| "residual_var": 0.018415190279483795, |
| "reward": 0.96484375, |
| "reward_std": 0.04761157184839249, |
| "rewards/drgrpo_math_reward/mean": 0.96484375, |
| "rewards/drgrpo_math_reward/std": 0.18453538417816162, |
| "rho2": 0.12499997764825821, |
| "step": 618 |
| }, |
| { |
| "advantages/mean": 2.3283064365386963e-10, |
| "advantages/snr": 1.043916747715097e-09, |
| "advantages/std": 0.22303563356399536, |
| "advantages/var": 0.049744893839292814, |
| "completions/clipped_ratio": -2.75, |
| "epoch": 3.555873925501433, |
| "grad_norm": 41.92973970694584, |
| "learning_rate": 6.695673518055578e-08, |
| "loss": 0.1913, |
| "num_tokens": 95646256.0, |
| "residual_var": 0.03886321559548378, |
| "reward": 0.8203125, |
| "reward_std": 0.10258589684963226, |
| "rewards/drgrpo_math_reward/mean": 0.8203125, |
| "rewards/drgrpo_math_reward/std": 0.38467901945114136, |
| "rho2": 0.21874994039535522, |
| "step": 619 |
| }, |
| { |
| "advantages/mean": 1.1641532182693481e-09, |
| "advantages/snr": 4.375439129655397e-09, |
| "advantages/std": 0.26606544852256775, |
| "advantages/var": 0.07079082289751515, |
| "completions/clipped_ratio": -2.890625, |
| "epoch": 3.5616045845272204, |
| "grad_norm": 56.92800431775484, |
| "learning_rate": 6.535151941808914e-08, |
| "loss": -0.5503, |
| "num_tokens": 95790003.0, |
| "residual_var": 0.05751755088567734, |
| "reward": 0.86328125, |
| "reward_std": 0.13006988167762756, |
| "rewards/drgrpo_math_reward/mean": 0.86328125, |
| "rewards/drgrpo_math_reward/std": 0.34422317147254944, |
| "rho2": 0.18749995529651642, |
| "step": 620 |
| }, |
| { |
| "advantages/mean": -9.313225746154785e-10, |
| "advantages/snr": 4.230252817593886e-09, |
| "advantages/std": 0.220157653093338, |
| "advantages/var": 0.048469392215566565, |
| "completions/clipped_ratio": -3.0, |
| "epoch": 3.567335243553009, |
| "grad_norm": 60.286859613453636, |
| "learning_rate": 6.376512936026279e-08, |
| "loss": -0.0555, |
| "num_tokens": 95931649.0, |
| "residual_var": 0.03938138857483864, |
| "reward": 0.8828125, |
| "reward_std": 0.09495474398136139, |
| "rewards/drgrpo_math_reward/mean": 0.8828125, |
| "rewards/drgrpo_math_reward/std": 0.3222736418247223, |
| "rho2": 0.1874999701976776, |
| "step": 621 |
| }, |
| { |
| "advantages/mean": 1.7462298274040222e-09, |
| "advantages/snr": 8.781698823081943e-09, |
| "advantages/std": 0.19884873926639557, |
| "advantages/var": 0.03954082110783497, |
| "completions/clipped_ratio": -2.90625, |
| "epoch": 3.5730659025787963, |
| "grad_norm": 39.14535432506035, |
| "learning_rate": 6.219759696017113e-08, |
| "loss": -0.1751, |
| "num_tokens": 96075681.0, |
| "residual_var": 0.03089127317070961, |
| "reward": 0.8515625, |
| "reward_std": 0.08588206768035889, |
| "rewards/drgrpo_math_reward/mean": 0.8515625, |
| "rewards/drgrpo_math_reward/std": 0.3562295734882355, |
| "rho2": 0.21874995529651642, |
| "step": 622 |
| }, |
| { |
| "advantages/mean": -1.1641532182693481e-09, |
| "advantages/snr": 5.3229515001135936e-09, |
| "advantages/std": 0.2187044471502304, |
| "advantages/var": 0.047831635203287926, |
| "completions/clipped_ratio": -2.859375, |
| "epoch": 3.5787965616045847, |
| "grad_norm": 53.47147141809364, |
| "learning_rate": 6.064895379107659e-08, |
| "loss": -0.1134, |
| "num_tokens": 96225596.0, |
| "residual_var": 0.03736847639083862, |
| "reward": 0.84765625, |
| "reward_std": 0.09442433714866638, |
| "rewards/drgrpo_math_reward/mean": 0.84765625, |
| "rewards/drgrpo_math_reward/std": 0.3600577116012573, |
| "rho2": 0.21874994039535522, |
| "step": 623 |
| }, |
| { |
| "advantages/mean": 6.984919309616089e-10, |
| "advantages/snr": 2.6614758307843997e-09, |
| "advantages/std": 0.262445330619812, |
| "advantages/var": 0.06887755156414244, |
| "completions/clipped_ratio": -2.859375, |
| "epoch": 3.5845272206303727, |
| "grad_norm": 47.78001634418774, |
| "learning_rate": 5.911923104577454e-08, |
| "loss": -0.3104, |
| "num_tokens": 96380274.0, |
| "residual_var": 0.05165817588567734, |
| "reward": 0.78125, |
| "reward_std": 0.12901148200035095, |
| "rewards/drgrpo_math_reward/mean": 0.78125, |
| "rewards/drgrpo_math_reward/std": 0.41420844197273254, |
| "rho2": 0.24999995529651642, |
| "step": 624 |
| }, |
| { |
| "advantages/mean": -1.3969838619232178e-09, |
| "advantages/snr": 6.2634996493533425e-09, |
| "advantages/std": 0.22303566336631775, |
| "advantages/var": 0.04974490713325341, |
| "completions/clipped_ratio": -2.890625, |
| "epoch": 3.5902578796561606, |
| "grad_norm": 46.22086643750276, |
| "learning_rate": 5.760845953596527e-08, |
| "loss": -0.8019, |
| "num_tokens": 96521301.0, |
| "residual_var": 0.043526798486709595, |
| "reward": 0.84375, |
| "reward_std": 0.08956148475408554, |
| "rewards/drgrpo_math_reward/mean": 0.84375, |
| "rewards/drgrpo_math_reward/std": 0.3638034462928772, |
| "rho2": 0.12499997019767761, |
| "step": 625 |
| }, |
| { |
| "advantages/mean": 5.820766091346741e-10, |
| "advantages/snr": 2.7747801921406384e-09, |
| "advantages/std": 0.2097739428281784, |
| "advantages/var": 0.04400510708967986, |
| "completions/clipped_ratio": -2.96875, |
| "epoch": 3.5959885386819486, |
| "grad_norm": 46.5684109422701, |
| "learning_rate": 5.611666969163242e-08, |
| "loss": -0.289, |
| "num_tokens": 96657732.0, |
| "residual_var": 0.03437899798154831, |
| "reward": 0.86328125, |
| "reward_std": 0.09100939333438873, |
| "rewards/drgrpo_math_reward/mean": 0.86328125, |
| "rewards/drgrpo_math_reward/std": 0.34422317147254944, |
| "rho2": 0.2187499701976776, |
| "step": 626 |
| }, |
| { |
| "advantages/mean": 4.656612873077393e-10, |
| "advantages/snr": 2.557063287032119e-09, |
| "advantages/std": 0.1821078360080719, |
| "advantages/var": 0.03316326393554281, |
| "completions/clipped_ratio": -2.765625, |
| "epoch": 3.6017191977077365, |
| "grad_norm": 44.745888500064524, |
| "learning_rate": 5.464389156043114e-08, |
| "loss": -0.2185, |
| "num_tokens": 96799094.0, |
| "residual_var": 0.029017869383096695, |
| "reward": 0.84375, |
| "reward_std": 0.06602286547422409, |
| "rewards/drgrpo_math_reward/mean": 0.84375, |
| "rewards/drgrpo_math_reward/std": 0.3638034462928772, |
| "rho2": 0.12499997019767761, |
| "step": 627 |
| }, |
| { |
| "advantages/mean": -4.656612873077393e-10, |
| "advantages/snr": 3.1623042258115645e-09, |
| "advantages/std": 0.14725378155708313, |
| "advantages/var": 0.021683676182861156, |
| "completions/clipped_ratio": -2.921875, |
| "epoch": 3.6074498567335245, |
| "grad_norm": 29.061281994660337, |
| "learning_rate": 5.3190154807082e-08, |
| "loss": 0.137, |
| "num_tokens": 96935256.0, |
| "residual_var": 0.019650837406516075, |
| "reward": 0.890625, |
| "reward_std": 0.041687894612550735, |
| "rewards/drgrpo_math_reward/mean": 0.890625, |
| "rewards/drgrpo_math_reward/std": 0.31272050738334656, |
| "rho2": 0.09374997764825821, |
| "step": 628 |
| }, |
| { |
| "advantages/mean": 4.656612873077393e-10, |
| "advantages/snr": 2.236086887785319e-09, |
| "advantages/std": 0.2082482874393463, |
| "advantages/var": 0.043367349221420604, |
| "completions/clipped_ratio": -2.890625, |
| "epoch": 3.6131805157593124, |
| "grad_norm": 49.603404350651765, |
| "learning_rate": 5.175548871277358e-08, |
| "loss": -0.357, |
| "num_tokens": 97087405.0, |
| "residual_var": 0.03523598238825798, |
| "reward": 0.78125, |
| "reward_std": 0.08337578922510147, |
| "rewards/drgrpo_math_reward/mean": 0.78125, |
| "rewards/drgrpo_math_reward/std": 0.41420844197273254, |
| "rho2": 0.18749995529651642, |
| "step": 629 |
| }, |
| { |
| "advantages/mean": 3.4924596548080444e-10, |
| "advantages/snr": 1.4418181244196808e-09, |
| "advantages/std": 0.24222607910633087, |
| "advantages/var": 0.05867347339922646, |
| "completions/clipped_ratio": -2.84375, |
| "epoch": 3.6189111747851004, |
| "grad_norm": 42.7166520741745, |
| "learning_rate": 5.033992217457395e-08, |
| "loss": -0.0058, |
| "num_tokens": 97237459.0, |
| "residual_var": 0.04767220467329025, |
| "reward": 0.7890625, |
| "reward_std": 0.10520447790622711, |
| "rewards/drgrpo_math_reward/mean": 0.7890625, |
| "rewards/drgrpo_math_reward/std": 0.4087733030319214, |
| "rho2": 0.18749995529651642, |
| "step": 630 |
| }, |
| { |
| "advantages/mean": -4.656612873077393e-10, |
| "advantages/snr": 2.4005852964484455e-09, |
| "advantages/std": 0.1939782202243805, |
| "advantages/var": 0.03762754992141826, |
| "completions/clipped_ratio": -2.96875, |
| "epoch": 3.6246418338108883, |
| "grad_norm": 36.484359460099256, |
| "learning_rate": 4.8943483704846465e-08, |
| "loss": -0.2171, |
| "num_tokens": 97390087.0, |
| "residual_var": 0.03174825757741928, |
| "reward": 0.80078125, |
| "reward_std": 0.077071413397789, |
| "rewards/drgrpo_math_reward/mean": 0.80078125, |
| "rewards/drgrpo_math_reward/std": 0.40019527077674866, |
| "rho2": 0.1562499701976776, |
| "step": 631 |
| }, |
| { |
| "advantages/mean": -1.1641532182693481e-10, |
| "advantages/snr": 6.105844111983757e-10, |
| "advantages/std": 0.19066211581230164, |
| "advantages/var": 0.03635204240602352, |
| "completions/clipped_ratio": -2.96875, |
| "epoch": 3.6303724928366763, |
| "grad_norm": 39.535528476647805, |
| "learning_rate": 4.756620143067724e-08, |
| "loss": 0.0628, |
| "num_tokens": 97530545.0, |
| "residual_var": 0.031808048486709595, |
| "reward": 0.92578125, |
| "reward_std": 0.06944026052951813, |
| "rewards/drgrpo_math_reward/mean": 0.92578125, |
| "rewards/drgrpo_math_reward/std": 0.2626400291919708, |
| "rho2": 0.12499997764825821, |
| "step": 632 |
| }, |
| { |
| "advantages/mean": -1.1641532182693481e-09, |
| "advantages/snr": 5.470839275353562e-09, |
| "advantages/std": 0.21279242634773254, |
| "advantages/var": 0.04528061671095518, |
| "completions/clipped_ratio": -2.828125, |
| "epoch": 3.6361031518624642, |
| "grad_norm": 36.704938684625795, |
| "learning_rate": 4.620810309330803e-08, |
| "loss": -0.2712, |
| "num_tokens": 97668443.0, |
| "residual_var": 0.03396047279238701, |
| "reward": 0.87109375, |
| "reward_std": 0.09864053130149841, |
| "rewards/drgrpo_math_reward/mean": 0.87109375, |
| "rewards/drgrpo_math_reward/std": 0.33575257658958435, |
| "rho2": 0.24999995529651642, |
| "step": 633 |
| }, |
| { |
| "advantages/mean": -1.1641532182693481e-10, |
| "advantages/snr": 5.322951500113594e-10, |
| "advantages/std": 0.2187044471502304, |
| "advantages/var": 0.047831635203287926, |
| "completions/clipped_ratio": -2.9375, |
| "epoch": 3.641833810888252, |
| "grad_norm": 40.750492569142516, |
| "learning_rate": 4.4869216047576986e-08, |
| "loss": -0.4012, |
| "num_tokens": 97795803.0, |
| "residual_var": 0.037368472665548325, |
| "reward": 0.89453125, |
| "reward_std": 0.10087842494249344, |
| "rewards/drgrpo_math_reward/mean": 0.89453125, |
| "rewards/drgrpo_math_reward/std": 0.3077581524848938, |
| "rho2": 0.2187499701976776, |
| "step": 634 |
| }, |
| { |
| "advantages/mean": 0.0, |
| "advantages/snr": 0.0, |
| "advantages/std": 0.06681530922651291, |
| "advantages/var": 0.004464285547034541, |
| "completions/clipped_ratio": -2.859375, |
| "epoch": 3.64756446991404, |
| "grad_norm": 16.114132966316024, |
| "learning_rate": 4.3549567261368494e-08, |
| "loss": -0.0488, |
| "num_tokens": 97936947.0, |
| "residual_var": 0.004324787296354771, |
| "reward": 0.90234375, |
| "reward_std": 0.011048543266952038, |
| "rewards/drgrpo_math_reward/mean": 0.90234375, |
| "rewards/drgrpo_math_reward/std": 0.29743078351020813, |
| "rho2": 0.031249990686774254, |
| "step": 635 |
| }, |
| { |
| "advantages/mean": 0.0, |
| "advantages/snr": 0.0, |
| "advantages/std": 0.20671138167381287, |
| "advantages/var": 0.04272959531349674, |
| "completions/clipped_ratio": -2.953125, |
| "epoch": 3.653295128939828, |
| "grad_norm": 39.26219200768812, |
| "learning_rate": 4.224918331506955e-08, |
| "loss": -0.2505, |
| "num_tokens": 98077775.0, |
| "residual_var": 0.03605310246348381, |
| "reward": 0.90234375, |
| "reward_std": 0.07574218511581421, |
| "rewards/drgrpo_math_reward/mean": 0.90234375, |
| "rewards/drgrpo_math_reward/std": 0.29743078351020813, |
| "rho2": 0.15624995529651642, |
| "step": 636 |
| }, |
| { |
| "advantages/mean": -1.280568540096283e-09, |
| "advantages/snr": 4.728538381659934e-09, |
| "advantages/std": 0.27081698179244995, |
| "advantages/var": 0.07334183762717217, |
| "completions/clipped_ratio": -2.96875, |
| "epoch": 3.659025787965616, |
| "grad_norm": 79.3512309491782, |
| "learning_rate": 4.096809040103444e-08, |
| "loss": -0.52, |
| "num_tokens": 98222118.0, |
| "residual_var": 0.048130594193935394, |
| "reward": 0.89453125, |
| "reward_std": 0.14716322720050812, |
| "rewards/drgrpo_math_reward/mean": 0.89453125, |
| "rewards/drgrpo_math_reward/std": 0.3077581524848938, |
| "rho2": 0.3437499403953552, |
| "step": 637 |
| }, |
| { |
| "advantages/mean": 2.3283064365386963e-10, |
| "advantages/snr": 1.0372886398259093e-09, |
| "advantages/std": 0.22446079552173615, |
| "advantages/var": 0.050382648726250645, |
| "completions/clipped_ratio": -2.8125, |
| "epoch": 3.664756446991404, |
| "grad_norm": 40.029001858794814, |
| "learning_rate": 3.9706314323056936e-08, |
| "loss": -0.1787, |
| "num_tokens": 98386258.0, |
| "residual_var": 0.03621254488825798, |
| "reward": 0.75390625, |
| "reward_std": 0.10968662798404694, |
| "rewards/drgrpo_math_reward/mean": 0.75390625, |
| "rewards/drgrpo_math_reward/std": 0.43157756328582764, |
| "rho2": 0.2812499403953552, |
| "step": 638 |
| }, |
| { |
| "advantages/mean": -2.3283064365386963e-10, |
| "advantages/snr": 1.2320239164203405e-09, |
| "advantages/std": 0.18898223340511322, |
| "advantages/var": 0.03571428454278469, |
| "completions/clipped_ratio": -2.90625, |
| "epoch": 3.670487106017192, |
| "grad_norm": 73.46864887642852, |
| "learning_rate": 3.846388049585114e-08, |
| "loss": -0.1359, |
| "num_tokens": 98534613.0, |
| "residual_var": 0.030133940279483795, |
| "reward": 0.71875, |
| "reward_std": 0.07536394149065018, |
| "rewards/drgrpo_math_reward/mean": 0.71875, |
| "rewards/drgrpo_math_reward/std": 0.45048993825912476, |
| "rho2": 0.1562499701976776, |
| "step": 639 |
| }, |
| { |
| "advantages/mean": 1.1641532182693481e-09, |
| "advantages/snr": 5.287816379893159e-09, |
| "advantages/std": 0.22015763819217682, |
| "advantages/var": 0.04846938565435743, |
| "completions/clipped_ratio": -2.875, |
| "epoch": 3.67621776504298, |
| "grad_norm": 36.833960394703546, |
| "learning_rate": 3.724081394453915e-08, |
| "loss": -0.1712, |
| "num_tokens": 98679085.0, |
| "residual_var": 0.03938139230012894, |
| "reward": 0.828125, |
| "reward_std": 0.09495474398136139, |
| "rewards/drgrpo_math_reward/mean": 0.828125, |
| "rewards/drgrpo_math_reward/std": 0.3780108094215393, |
| "rho2": 0.18749995529651642, |
| "step": 640 |
| }, |
| { |
| "advantages/mean": -4.656612873077393e-10, |
| "advantages/snr": 2.158150412641125e-09, |
| "advantages/std": 0.21576867997646332, |
| "advantages/var": 0.04655612325878544, |
| "completions/clipped_ratio": -2.9375, |
| "epoch": 3.681948424068768, |
| "grad_norm": 39.81293821380969, |
| "learning_rate": 3.6037139304146756e-08, |
| "loss": -0.3951, |
| "num_tokens": 98822479.0, |
| "residual_var": 0.040736615657806396, |
| "reward": 0.85546875, |
| "reward_std": 0.0861440896987915, |
| "rewards/drgrpo_math_reward/mean": 0.85546875, |
| "rewards/drgrpo_math_reward/std": 0.35231640934944153, |
| "rho2": 0.12499997019767761, |
| "step": 641 |
| }, |
| { |
| "advantages/mean": -2.3283064365386963e-10, |
| "advantages/snr": 8.830796510959388e-10, |
| "advantages/std": 0.2636575698852539, |
| "advantages/var": 0.06951531415779755, |
| "completions/clipped_ratio": -2.875, |
| "epoch": 3.687679083094556, |
| "grad_norm": 49.9808279744873, |
| "learning_rate": 3.4852880819107974e-08, |
| "loss": -0.502, |
| "num_tokens": 98976832.0, |
| "residual_var": 0.04779178649187088, |
| "reward": 0.87890625, |
| "reward_std": 0.13611221313476562, |
| "rewards/drgrpo_math_reward/mean": 0.87890625, |
| "rewards/drgrpo_math_reward/std": 0.3268752694129944, |
| "rho2": 0.3124999403953552, |
| "step": 642 |
| }, |
| { |
| "advantages/mean": 1.0477378964424133e-09, |
| "advantages/snr": 3.8520920107734665e-09, |
| "advantages/std": 0.27199190855026245, |
| "advantages/var": 0.07397959831681433, |
| "completions/clipped_ratio": -2.71875, |
| "epoch": 3.693409742120344, |
| "grad_norm": 72.31122433456379, |
| "learning_rate": 3.3688062342776105e-08, |
| "loss": -0.4692, |
| "num_tokens": 99131413.0, |
| "residual_var": 0.05317284166812897, |
| "reward": 0.8203125, |
| "reward_std": 0.14005757868289948, |
| "rewards/drgrpo_math_reward/mean": 0.8203125, |
| "rewards/drgrpo_math_reward/std": 0.38467901945114136, |
| "rho2": 0.2812499403953552, |
| "step": 643 |
| }, |
| { |
| "advantages/mean": 6.984919309616089e-10, |
| "advantages/snr": 3.4045714528780252e-09, |
| "advantages/std": 0.20516294240951538, |
| "advantages/var": 0.042091832938130125, |
| "completions/clipped_ratio": -2.953125, |
| "epoch": 3.6991404011461317, |
| "grad_norm": 41.15042948921175, |
| "learning_rate": 3.254270733694331e-08, |
| "loss": -0.5067, |
| "num_tokens": 99265316.0, |
| "residual_var": 0.03419962897896767, |
| "reward": 0.921875, |
| "reward_std": 0.08166586607694626, |
| "rewards/drgrpo_math_reward/mean": 0.921875, |
| "rewards/drgrpo_math_reward/std": 0.26889389753341675, |
| "rho2": 0.18749994039535522, |
| "step": 644 |
| }, |
| { |
| "advantages/mean": -1.862645149230957e-09, |
| "advantages/snr": 6.395541197776692e-09, |
| "advantages/std": 0.29124119877815247, |
| "advantages/var": 0.08482143586573532, |
| "completions/clipped_ratio": -2.890625, |
| "epoch": 3.7048710601719197, |
| "grad_norm": 54.4289614540195, |
| "learning_rate": 3.141683887136892e-08, |
| "loss": -0.7299, |
| "num_tokens": 99423558.0, |
| "residual_var": 0.058314744383096695, |
| "reward": 0.80859375, |
| "reward_std": 0.1514868289232254, |
| "rewards/drgrpo_math_reward/mean": 0.80859375, |
| "rewards/drgrpo_math_reward/std": 0.39417871832847595, |
| "rho2": 0.3124999403953552, |
| "step": 645 |
| }, |
| { |
| "advantages/mean": -9.313225746154785e-10, |
| "advantages/snr": 4.972694468794569e-09, |
| "advantages/std": 0.18728730082511902, |
| "advantages/var": 0.03507653305035863, |
| "completions/clipped_ratio": -2.828125, |
| "epoch": 3.7106017191977076, |
| "grad_norm": 39.47961175206918, |
| "learning_rate": 3.0310479623313125e-08, |
| "loss": -0.3528, |
| "num_tokens": 99574657.0, |
| "residual_var": 0.02849969081580639, |
| "reward": 0.86328125, |
| "reward_std": 0.07483352720737457, |
| "rewards/drgrpo_math_reward/mean": 0.86328125, |
| "rewards/drgrpo_math_reward/std": 0.34422317147254944, |
| "rho2": 0.18749995529651642, |
| "step": 646 |
| }, |
| { |
| "advantages/mean": 9.313225746154785e-10, |
| "advantages/snr": 3.97670992529417e-09, |
| "advantages/std": 0.23419423401355743, |
| "advantages/var": 0.0548469392451969, |
| "completions/clipped_ratio": -2.90625, |
| "epoch": 3.7163323782234956, |
| "grad_norm": 54.18482780264222, |
| "learning_rate": 2.9223651877081867e-08, |
| "loss": 0.1581, |
| "num_tokens": 99720962.0, |
| "residual_var": 0.042849183082580566, |
| "reward": 0.875, |
| "reward_std": 0.10771076381206512, |
| "rewards/drgrpo_math_reward/mean": 0.875, |
| "rewards/drgrpo_math_reward/std": 0.33136674761772156, |
| "rho2": 0.21874994039535522, |
| "step": 647 |
| }, |
| { |
| "advantages/mean": 0.0, |
| "advantages/snr": 0.0, |
| "advantages/std": 0.22728431224822998, |
| "advantages/var": 0.051658158594150905, |
| "completions/clipped_ratio": -2.953125, |
| "epoch": 3.7220630372492836, |
| "grad_norm": 51.455630603359445, |
| "learning_rate": 2.8156377523576802e-08, |
| "loss": -0.4604, |
| "num_tokens": 99857133.0, |
| "residual_var": 0.03712931647896767, |
| "reward": 0.91015625, |
| "reward_std": 0.11139655113220215, |
| "rewards/drgrpo_math_reward/mean": 0.91015625, |
| "rewards/drgrpo_math_reward/std": 0.2865179479122162, |
| "rho2": 0.2812499403953552, |
| "step": 648 |
| }, |
| { |
| "advantages/mean": 0.0, |
| "advantages/snr": 0.0, |
| "advantages/std": 0.20516295731067657, |
| "advantages/var": 0.0420918390524625, |
| "completions/clipped_ratio": -2.796875, |
| "epoch": 3.7277936962750715, |
| "grad_norm": 39.31169214983867, |
| "learning_rate": 2.7108678059855062e-08, |
| "loss": -0.5246, |
| "num_tokens": 100002956.0, |
| "residual_var": 0.03551499918103218, |
| "reward": 0.8203125, |
| "reward_std": 0.08219873160123825, |
| "rewards/drgrpo_math_reward/mean": 0.8203125, |
| "rewards/drgrpo_math_reward/std": 0.38467901945114136, |
| "rho2": 0.1562499850988388, |
| "step": 649 |
| }, |
| { |
| "advantages/mean": 0.0, |
| "advantages/snr": 0.0, |
| "advantages/std": 0.1939782202243805, |
| "advantages/var": 0.03762754992141826, |
| "completions/clipped_ratio": -2.734375, |
| "epoch": 3.7335243553008595, |
| "grad_norm": 43.58478515285809, |
| "learning_rate": 2.6080574588696058e-08, |
| "loss": -0.1068, |
| "num_tokens": 100157687.0, |
| "residual_var": 0.029396533966064453, |
| "reward": 0.83203125, |
| "reward_std": 0.08417459577322006, |
| "rewards/drgrpo_math_reward/mean": 0.83203125, |
| "rewards/drgrpo_math_reward/std": 0.3745708465576172, |
| "rho2": 0.2187499701976776, |
| "step": 650 |
| }, |
| { |
| "advantages/mean": 4.656612873077393e-10, |
| "advantages/snr": 2.380496292175044e-09, |
| "advantages/std": 0.19561520218849182, |
| "advantages/var": 0.038265307327244535, |
| "completions/clipped_ratio": -2.90625, |
| "epoch": 3.7392550143266474, |
| "grad_norm": 37.90389101739278, |
| "learning_rate": 2.507208781817638e-08, |
| "loss": -0.3881, |
| "num_tokens": 100296628.0, |
| "residual_var": 0.033482152968645096, |
| "reward": 0.8828125, |
| "reward_std": 0.07114773988723755, |
| "rewards/drgrpo_math_reward/mean": 0.8828125, |
| "rewards/drgrpo_math_reward/std": 0.3222736418247223, |
| "rho2": 0.12499997019767761, |
| "step": 651 |
| }, |
| { |
| "advantages/mean": -6.984919309616089e-10, |
| "advantages/snr": 2.982532443970627e-09, |
| "advantages/std": 0.23419423401355743, |
| "advantages/var": 0.0548469392451969, |
| "completions/clipped_ratio": -2.8125, |
| "epoch": 3.7449856733524354, |
| "grad_norm": 53.56485887973218, |
| "learning_rate": 2.4083238061252563e-08, |
| "loss": -0.5428, |
| "num_tokens": 100457887.0, |
| "residual_var": 0.03942124918103218, |
| "reward": 0.8828125, |
| "reward_std": 0.11481393873691559, |
| "rewards/drgrpo_math_reward/mean": 0.8828125, |
| "rewards/drgrpo_math_reward/std": 0.3222736418247223, |
| "rho2": 0.2812499403953552, |
| "step": 652 |
| }, |
| { |
| "advantages/mean": 1.1641532182693481e-09, |
| "advantages/snr": 6.001463241121114e-09, |
| "advantages/std": 0.1939782202243805, |
| "advantages/var": 0.03762754992141826, |
| "completions/clipped_ratio": -2.796875, |
| "epoch": 3.7507163323782233, |
| "grad_norm": 36.834434673444534, |
| "learning_rate": 2.311404523535243e-08, |
| "loss": -0.2637, |
| "num_tokens": 100610163.0, |
| "residual_var": 0.03174825757741928, |
| "reward": 0.87109375, |
| "reward_std": 0.077071413397789, |
| "rewards/drgrpo_math_reward/mean": 0.87109375, |
| "rewards/drgrpo_math_reward/std": 0.33575257658958435, |
| "rho2": 0.15624995529651642, |
| "step": 653 |
| }, |
| { |
| "advantages/mean": 6.984919309616089e-10, |
| "advantages/snr": 4.6098110067255745e-09, |
| "advantages/std": 0.15152288973331451, |
| "advantages/var": 0.02295918611313419, |
| "completions/clipped_ratio": -2.96875, |
| "epoch": 3.7564469914040117, |
| "grad_norm": 29.425561410312323, |
| "learning_rate": 2.2164528861973065e-08, |
| "loss": -0.1512, |
| "num_tokens": 100739733.0, |
| "residual_var": 0.020089294761419296, |
| "reward": 0.8671875, |
| "reward_std": 0.04931904375553131, |
| "rewards/drgrpo_math_reward/mean": 0.8671875, |
| "rewards/drgrpo_math_reward/std": 0.3400367796421051, |
| "rho2": 0.12499997019767761, |
| "step": 654 |
| }, |
| { |
| "advantages/mean": -4.656612873077393e-10, |
| "advantages/snr": 1.9018629386468357e-09, |
| "advantages/std": 0.24484480917453766, |
| "advantages/var": 0.05994898057971576, |
| "completions/clipped_ratio": -2.953125, |
| "epoch": 3.7621776504297992, |
| "grad_norm": 50.246981983613296, |
| "learning_rate": 2.1234708066288574e-08, |
| "loss": -0.3237, |
| "num_tokens": 100876691.0, |
| "residual_var": 0.04683515802025795, |
| "reward": 0.8046875, |
| "reward_std": 0.11928971856832504, |
| "rewards/drgrpo_math_reward/mean": 0.8046875, |
| "rewards/drgrpo_math_reward/std": 0.39721766114234924, |
| "rho2": 0.21874995529651642, |
| "step": 655 |
| }, |
| { |
| "advantages/mean": -1.0477378964424133e-09, |
| "advantages/snr": 4.473798381299966e-09, |
| "advantages/std": 0.23419424891471863, |
| "advantages/var": 0.05484694622472919, |
| "completions/clipped_ratio": -2.90625, |
| "epoch": 3.7679083094555876, |
| "grad_norm": 55.94674225949123, |
| "learning_rate": 2.032460157676452e-08, |
| "loss": -0.2971, |
| "num_tokens": 101028763.0, |
| "residual_var": 0.04113522171974182, |
| "reward": 0.8828125, |
| "reward_std": 0.10889272391796112, |
| "rewards/drgrpo_math_reward/mean": 0.8828125, |
| "rewards/drgrpo_math_reward/std": 0.3222736418247223, |
| "rho2": 0.24999995529651642, |
| "step": 656 |
| }, |
| { |
| "advantages/mean": -4.656612873077393e-10, |
| "advantages/snr": 2.158150412641125e-09, |
| "advantages/std": 0.21576867997646332, |
| "advantages/var": 0.04655612325878544, |
| "completions/clipped_ratio": -2.90625, |
| "epoch": 3.773638968481375, |
| "grad_norm": 57.17036715686087, |
| "learning_rate": 1.9434227724779984e-08, |
| "loss": -0.639, |
| "num_tokens": 101164375.0, |
| "residual_var": 0.03637198358774185, |
| "reward": 0.87890625, |
| "reward_std": 0.09271440654993057, |
| "rewards/drgrpo_math_reward/mean": 0.87890625, |
| "rewards/drgrpo_math_reward/std": 0.3268752694129944, |
| "rho2": 0.21874995529651642, |
| "step": 657 |
| }, |
| { |
| "advantages/mean": -4.656612873077393e-10, |
| "advantages/snr": 1.8347735436508976e-09, |
| "advantages/std": 0.2537976801395416, |
| "advantages/var": 0.06441326244421308, |
| "completions/clipped_ratio": -2.921875, |
| "epoch": 3.7793696275071635, |
| "grad_norm": 50.445926119141305, |
| "learning_rate": 1.856360444425953e-08, |
| "loss": -0.1995, |
| "num_tokens": 101308179.0, |
| "residual_var": 0.04428413510322571, |
| "reward": 0.91015625, |
| "reward_std": 0.1374414563179016, |
| "rewards/drgrpo_math_reward/mean": 0.91015625, |
| "rewards/drgrpo_math_reward/std": 0.2865179479122162, |
| "rho2": 0.3124999403953552, |
| "step": 658 |
| }, |
| { |
| "advantages/mean": 0.0, |
| "advantages/snr": 0.0, |
| "advantages/std": 0.19884873926639557, |
| "advantages/var": 0.03954082110783497, |
| "completions/clipped_ratio": -2.90625, |
| "epoch": 3.785100286532951, |
| "grad_norm": 37.38180564757065, |
| "learning_rate": 1.771274927131139e-08, |
| "loss": -0.4388, |
| "num_tokens": 101452351.0, |
| "residual_var": 0.03089127317070961, |
| "reward": 0.84375, |
| "reward_std": 0.08588207513093948, |
| "rewards/drgrpo_math_reward/mean": 0.84375, |
| "rewards/drgrpo_math_reward/std": 0.3638034462928772, |
| "rho2": 0.21874995529651642, |
| "step": 659 |
| }, |
| { |
| "advantages/mean": -1.0477378964424133e-09, |
| "advantages/snr": 5.401316502086732e-09, |
| "advantages/std": 0.1939782351255417, |
| "advantages/var": 0.037627555702419935, |
| "completions/clipped_ratio": -2.90625, |
| "epoch": 3.7908309455587395, |
| "grad_norm": 28.992450313688646, |
| "learning_rate": 1.6881679343873634e-08, |
| "loss": 0.1372, |
| "num_tokens": 101597856.0, |
| "residual_var": 0.032924119383096695, |
| "reward": 0.92578125, |
| "reward_std": 0.07115019112825394, |
| "rewards/drgrpo_math_reward/mean": 0.92578125, |
| "rewards/drgrpo_math_reward/std": 0.2626400291919708, |
| "rho2": 0.1249999850988388, |
| "step": 660 |
| }, |
| { |
| "advantages/mean": -1.6298145055770874e-09, |
| "advantages/snr": 6.621393318979555e-09, |
| "advantages/std": 0.24614372849464417, |
| "advantages/var": 0.0605867350772451, |
| "completions/clipped_ratio": -2.875, |
| "epoch": 3.796561604584527, |
| "grad_norm": 60.214687351110435, |
| "learning_rate": 1.607041140137033e-08, |
| "loss": -0.1922, |
| "num_tokens": 101755824.0, |
| "residual_var": 0.04165339842438698, |
| "reward": 0.83203125, |
| "reward_std": 0.1275724172592163, |
| "rewards/drgrpo_math_reward/mean": 0.83203125, |
| "rewards/drgrpo_math_reward/std": 0.3745708465576172, |
| "rho2": 0.3124999403953552, |
| "step": 661 |
| }, |
| { |
| "advantages/mean": 6.984919309616089e-10, |
| "advantages/snr": 3.017830079630706e-09, |
| "advantages/std": 0.2314550131559372, |
| "advantages/var": 0.05357142311501506, |
| "completions/clipped_ratio": -2.921875, |
| "epoch": 3.8022922636103154, |
| "grad_norm": 39.232079845003454, |
| "learning_rate": 1.5278961784373266e-08, |
| "loss": -0.2005, |
| "num_tokens": 101911230.0, |
| "residual_var": 0.038504477590322495, |
| "reward": 0.78125, |
| "reward_std": 0.11310403048992157, |
| "rewards/drgrpo_math_reward/mean": 0.78125, |
| "rewards/drgrpo_math_reward/std": 0.41420844197273254, |
| "rho2": 0.2812499403953552, |
| "step": 662 |
| }, |
| { |
| "advantages/mean": 0.0, |
| "advantages/snr": 0.0, |
| "advantages/std": 0.15361295640468597, |
| "advantages/var": 0.023596940375387954, |
| "completions/clipped_ratio": -2.96875, |
| "epoch": 3.8080229226361033, |
| "grad_norm": 24.328039087105505, |
| "learning_rate": 1.4507346434273316e-08, |
| "loss": -0.0079, |
| "num_tokens": 102062868.0, |
| "residual_var": 0.02138473652303219, |
| "reward": 0.80859375, |
| "reward_std": 0.04339536651968956, |
| "rewards/drgrpo_math_reward/mean": 0.80859375, |
| "rewards/drgrpo_math_reward/std": 0.39417871832847595, |
| "rho2": 0.09374997019767761, |
| "step": 663 |
| }, |
| { |
| "advantages/mean": -4.656612873077393e-10, |
| "advantages/snr": 2.486347036575778e-09, |
| "advantages/std": 0.1872873157262802, |
| "advantages/var": 0.03507653863195537, |
| "completions/clipped_ratio": -2.9375, |
| "epoch": 3.8137535816618913, |
| "grad_norm": 40.12027079585644, |
| "learning_rate": 1.375558089295914e-08, |
| "loss": -0.3267, |
| "num_tokens": 102191204.0, |
| "residual_var": 0.029595833271741867, |
| "reward": 0.85546875, |
| "reward_std": 0.06891229748725891, |
| "rewards/drgrpo_math_reward/mean": 0.85546875, |
| "rewards/drgrpo_math_reward/std": 0.35231640934944153, |
| "rho2": 0.1562499850988388, |
| "step": 664 |
| }, |
| { |
| "advantages/mean": -2.3283064365386963e-10, |
| "advantages/snr": 9.36110830002126e-10, |
| "advantages/std": 0.24872122704982758, |
| "advantages/var": 0.06186224878517188, |
| "completions/clipped_ratio": -2.671875, |
| "epoch": 3.819484240687679, |
| "grad_norm": 54.17369996636717, |
| "learning_rate": 1.3023680302504336e-08, |
| "loss": -0.6099, |
| "num_tokens": 102345256.0, |
| "residual_var": 0.04832988977432251, |
| "reward": 0.80859375, |
| "reward_std": 0.11454310268163681, |
| "rewards/drgrpo_math_reward/mean": 0.80859375, |
| "rewards/drgrpo_math_reward/std": 0.39417871832847595, |
| "rho2": 0.21874994039535522, |
| "step": 665 |
| }, |
| { |
| "advantages/mean": 1.5133991837501526e-09, |
| "advantages/snr": 6.352307063392909e-09, |
| "advantages/std": 0.23824401199817657, |
| "advantages/var": 0.056760209252987304, |
| "completions/clipped_ratio": -2.84375, |
| "epoch": 3.825214899713467, |
| "grad_norm": 38.6683462023298, |
| "learning_rate": 1.231165940486234e-08, |
| "loss": 0.1883, |
| "num_tokens": 102510042.0, |
| "residual_var": 0.049665190279483795, |
| "reward": 0.75390625, |
| "reward_std": 0.10284791886806488, |
| "rewards/drgrpo_math_reward/mean": 0.75390625, |
| "rewards/drgrpo_math_reward/std": 0.43157756328582764, |
| "rho2": 0.12499997019767761, |
| "step": 666 |
| }, |
| { |
| "advantages/mean": 4.656612873077393e-10, |
| "advantages/snr": 2.1730858844377734e-09, |
| "advantages/std": 0.2142857164144516, |
| "advantages/var": 0.04591836825925477, |
| "completions/clipped_ratio": -2.9375, |
| "epoch": 3.830945558739255, |
| "grad_norm": 50.021929000970786, |
| "learning_rate": 1.1619532541569332e-08, |
| "loss": -0.4099, |
| "num_tokens": 102659424.0, |
| "residual_var": 0.038743630051612854, |
| "reward": 0.859375, |
| "reward_std": 0.0856136754155159, |
| "rewards/drgrpo_math_reward/mean": 0.859375, |
| "rewards/drgrpo_math_reward/std": 0.3483152687549591, |
| "rho2": 0.1562499701976776, |
| "step": 667 |
| }, |
| { |
| "advantages/mean": -1.6298145055770874e-09, |
| "advantages/snr": 6.7653530196824894e-09, |
| "advantages/std": 0.24090604484081268, |
| "advantages/var": 0.05803572244084365, |
| "completions/clipped_ratio": -2.84375, |
| "epoch": 3.836676217765043, |
| "grad_norm": 93.84246863269392, |
| "learning_rate": 1.0947313653455693e-08, |
| "loss": -0.3309, |
| "num_tokens": 102813762.0, |
| "residual_var": 0.043526798486709595, |
| "reward": 0.89453125, |
| "reward_std": 0.11231012642383575, |
| "rewards/drgrpo_math_reward/mean": 0.89453125, |
| "rewards/drgrpo_math_reward/std": 0.3077581524848938, |
| "rho2": 0.2499999701976776, |
| "step": 668 |
| }, |
| { |
| "advantages/mean": 4.656612873077393e-10, |
| "advantages/snr": 1.862645074725154e-09, |
| "advantages/std": 0.25, |
| "advantages/var": 0.0625, |
| "completions/clipped_ratio": -2.859375, |
| "epoch": 3.842406876790831, |
| "grad_norm": 44.3179794110122, |
| "learning_rate": 1.029501628036511e-08, |
| "loss": -0.1561, |
| "num_tokens": 102955237.0, |
| "residual_var": 0.046875011175870895, |
| "reward": 0.8515625, |
| "reward_std": 0.12863080203533173, |
| "rewards/drgrpo_math_reward/mean": 0.8515625, |
| "rewards/drgrpo_math_reward/std": 0.3562295734882355, |
| "rho2": 0.2499999701976776, |
| "step": 669 |
| }, |
| { |
| "advantages/mean": -2.3283064365386963e-09, |
| "advantages/snr": 8.523543981158329e-09, |
| "advantages/std": 0.27316176891326904, |
| "advantages/var": 0.0746173519958262, |
| "completions/clipped_ratio": -2.78125, |
| "epoch": 3.848137535816619, |
| "grad_norm": 41.95287015060324, |
| "learning_rate": 9.662653560881584e-09, |
| "loss": -0.4442, |
| "num_tokens": 103096599.0, |
| "residual_var": 0.05129943788051605, |
| "reward": 0.84765625, |
| "reward_std": 0.14769119024276733, |
| "rewards/drgrpo_math_reward/mean": 0.84765625, |
| "rewards/drgrpo_math_reward/std": 0.3600577116012573, |
| "rho2": 0.3124999403953552, |
| "step": 670 |
| }, |
| { |
| "advantages/mean": 1.1641532182693481e-10, |
| "advantages/snr": 4.2801022341927405e-10, |
| "advantages/std": 0.27199190855026245, |
| "advantages/var": 0.07397959831681433, |
| "completions/clipped_ratio": -2.9375, |
| "epoch": 3.853868194842407, |
| "grad_norm": 58.944559417389065, |
| "learning_rate": 9.050238232065299e-09, |
| "loss": -0.8938, |
| "num_tokens": 103250186.0, |
| "residual_var": 0.048549119383096695, |
| "reward": 0.8359375, |
| "reward_std": 0.14716076850891113, |
| "rewards/drgrpo_math_reward/mean": 0.8359375, |
| "rewards/drgrpo_math_reward/std": 0.3710577189922333, |
| "rho2": 0.3437499403953552, |
| "step": 671 |
| }, |
| { |
| "advantages/mean": -1.1641532182693481e-09, |
| "advantages/snr": 4.914075334034159e-09, |
| "advantages/std": 0.23690177500247955, |
| "advantages/var": 0.056122450999325446, |
| "completions/clipped_ratio": -2.875, |
| "epoch": 3.859598853868195, |
| "grad_norm": 49.36231287699574, |
| "learning_rate": 8.457782629195387e-09, |
| "loss": -0.6094, |
| "num_tokens": 103394094.0, |
| "residual_var": 0.03858419507741928, |
| "reward": 0.8203125, |
| "reward_std": 0.12244509160518646, |
| "rewards/drgrpo_math_reward/mean": 0.8203125, |
| "rewards/drgrpo_math_reward/std": 0.38467901945114136, |
| "rho2": 0.3124999403953552, |
| "step": 672 |
| }, |
| { |
| "advantages/mean": 3.4924596548080444e-10, |
| "advantages/snr": 1.4340454135281388e-09, |
| "advantages/std": 0.2435389757156372, |
| "advantages/var": 0.05931123269262173, |
| "completions/clipped_ratio": -2.828125, |
| "epoch": 3.865329512893983, |
| "grad_norm": 43.523584419930096, |
| "learning_rate": 7.885298685522235e-09, |
| "loss": -0.3124, |
| "num_tokens": 103533751.0, |
| "residual_var": 0.05004385486245155, |
| "reward": 0.86328125, |
| "reward_std": 0.10573489964008331, |
| "rewards/drgrpo_math_reward/mean": 0.86328125, |
| "rewards/drgrpo_math_reward/std": 0.34422317147254944, |
| "rho2": 0.1562499701976776, |
| "step": 673 |
| }, |
| { |
| "advantages/mean": -2.3283064365386963e-10, |
| "advantages/snr": 9.61212082946454e-10, |
| "advantages/std": 0.24222607910633087, |
| "advantages/var": 0.05867347339922646, |
| "completions/clipped_ratio": -2.953125, |
| "epoch": 3.871060171919771, |
| "grad_norm": 81.6292265086487, |
| "learning_rate": 7.3327979320264575e-09, |
| "loss": -0.2091, |
| "num_tokens": 103678840.0, |
| "residual_var": 0.04400511458516121, |
| "reward": 0.8828125, |
| "reward_std": 0.11876175552606583, |
| "rewards/drgrpo_math_reward/mean": 0.8828125, |
| "rewards/drgrpo_math_reward/std": 0.3222736418247223, |
| "rho2": 0.2499999701976776, |
| "step": 674 |
| }, |
| { |
| "advantages/mean": 4.656612873077393e-10, |
| "advantages/snr": 1.9018629386468357e-09, |
| "advantages/std": 0.24484480917453766, |
| "advantages/var": 0.05994898057971576, |
| "completions/clipped_ratio": -2.78125, |
| "epoch": 3.8767908309455588, |
| "grad_norm": 51.53624188456518, |
| "learning_rate": 6.800291497187083e-09, |
| "loss": -0.0777, |
| "num_tokens": 103838662.0, |
| "residual_var": 0.04683515802025795, |
| "reward": 0.7578125, |
| "reward_std": 0.11928972601890564, |
| "rewards/drgrpo_math_reward/mean": 0.7578125, |
| "rewards/drgrpo_math_reward/std": 0.4292463958263397, |
| "rho2": 0.21874995529651642, |
| "step": 675 |
| }, |
| { |
| "advantages/mean": 9.313225746154785e-10, |
| "advantages/snr": 5.215406001438975e-09, |
| "advantages/std": 0.1785714328289032, |
| "advantages/var": 0.03188775662256749, |
| "completions/clipped_ratio": -2.796875, |
| "epoch": 3.8825214899713467, |
| "grad_norm": 26.68293306608483, |
| "learning_rate": 6.2877901067573955e-09, |
| "loss": -0.1505, |
| "num_tokens": 103991156.0, |
| "residual_var": 0.0259088147431612, |
| "reward": 0.8515625, |
| "reward_std": 0.07141612470149994, |
| "rewards/drgrpo_math_reward/mean": 0.8515625, |
| "rewards/drgrpo_math_reward/std": 0.3562295734882355, |
| "rho2": 0.18749995529651642, |
| "step": 676 |
| }, |
| { |
| "advantages/mean": 4.656612873077393e-10, |
| "advantages/snr": 1.9545560195055106e-09, |
| "advantages/std": 0.23824401199817657, |
| "advantages/var": 0.056760209252987304, |
| "completions/clipped_ratio": -2.921875, |
| "epoch": 3.8882521489971347, |
| "grad_norm": 41.98491960912337, |
| "learning_rate": 5.795304083548558e-09, |
| "loss": -0.3763, |
| "num_tokens": 104128312.0, |
| "residual_var": 0.04611767828464508, |
| "reward": 0.86328125, |
| "reward_std": 0.10349701344966888, |
| "rewards/drgrpo_math_reward/mean": 0.86328125, |
| "rewards/drgrpo_math_reward/std": 0.34422317147254944, |
| "rho2": 0.1874999701976776, |
| "step": 677 |
| }, |
| { |
| "advantages/mean": 9.313225746154785e-10, |
| "advantages/snr": 5.215406001438975e-09, |
| "advantages/std": 0.1785714328289032, |
| "advantages/var": 0.03188775662256749, |
| "completions/clipped_ratio": -2.953125, |
| "epoch": 3.8939828080229226, |
| "grad_norm": 38.20002096525398, |
| "learning_rate": 5.322843347221661e-09, |
| "loss": -0.0537, |
| "num_tokens": 104261103.0, |
| "residual_var": 0.025908811017870903, |
| "reward": 0.9375, |
| "reward_std": 0.07141613215208054, |
| "rewards/drgrpo_math_reward/mean": 0.9375, |
| "rewards/drgrpo_math_reward/std": 0.24253563582897186, |
| "rho2": 0.18749994039535522, |
| "step": 678 |
| }, |
| { |
| "advantages/mean": -2.3283064365386963e-10, |
| "advantages/snr": 8.213493136884692e-10, |
| "advantages/std": 0.28347334265708923, |
| "advantages/var": 0.08035713599718353, |
| "completions/clipped_ratio": -2.90625, |
| "epoch": 3.8997134670487106, |
| "grad_norm": 54.09125544505304, |
| "learning_rate": 4.870417414088779e-09, |
| "loss": -0.4964, |
| "num_tokens": 104400662.0, |
| "residual_var": 0.057756710797548294, |
| "reward": 0.8671875, |
| "reward_std": 0.14689236879348755, |
| "rewards/drgrpo_math_reward/mean": 0.8671875, |
| "rewards/drgrpo_math_reward/std": 0.3400367796421051, |
| "rho2": 0.2812499403953552, |
| "step": 679 |
| }, |
| { |
| "advantages/mean": -4.656612873077393e-10, |
| "advantages/snr": 2.011886590227791e-09, |
| "advantages/std": 0.2314550280570984, |
| "advantages/var": 0.0535714300129122, |
| "completions/clipped_ratio": -2.765625, |
| "epoch": 3.9054441260744985, |
| "grad_norm": 41.67372257346115, |
| "learning_rate": 4.438035396920003e-09, |
| "loss": -0.0959, |
| "num_tokens": 104555103.0, |
| "residual_var": 0.043526794761419296, |
| "reward": 0.78125, |
| "reward_std": 0.10007961839437485, |
| "rewards/drgrpo_math_reward/mean": 0.78125, |
| "rewards/drgrpo_math_reward/std": 0.41420844197273254, |
| "rho2": 0.1874999701976776, |
| "step": 680 |
| }, |
| { |
| "advantages/mean": 9.313225746154785e-10, |
| "advantages/snr": 3.7064275334476093e-09, |
| "advantages/std": 0.2512722909450531, |
| "advantages/var": 0.06313776419677541, |
| "completions/clipped_ratio": -2.875, |
| "epoch": 3.9111747851002865, |
| "grad_norm": 89.25950778644456, |
| "learning_rate": 4.025706004760931e-09, |
| "loss": -0.6127, |
| "num_tokens": 104718658.0, |
| "residual_var": 0.04538027569651604, |
| "reward": 0.71484375, |
| "reward_std": 0.12335620820522308, |
| "rewards/drgrpo_math_reward/mean": 0.71484375, |
| "rewards/drgrpo_math_reward/std": 0.4523732364177704, |
| "rho2": 0.2812499403953552, |
| "step": 681 |
| }, |
| { |
| "advantages/mean": -1.862645149230957e-09, |
| "advantages/snr": 8.00006775297642e-09, |
| "advantages/std": 0.23282866179943085, |
| "advantages/var": 0.05420918575531375, |
| "completions/clipped_ratio": -2.9375, |
| "epoch": 3.9169054441260744, |
| "grad_norm": 44.862514699792975, |
| "learning_rate": 3.633437542756912e-09, |
| "loss": -0.063, |
| "num_tokens": 104862151.0, |
| "residual_var": 0.04065689817070961, |
| "reward": 0.91796875, |
| "reward_std": 0.10718034952878952, |
| "rewards/drgrpo_math_reward/mean": 0.91796875, |
| "rewards/drgrpo_math_reward/std": 0.2749498784542084, |
| "rho2": 0.24999994039535522, |
| "step": 682 |
| }, |
| { |
| "advantages/mean": 0.0, |
| "advantages/snr": 0.0, |
| "advantages/std": 0.220157653093338, |
| "advantages/var": 0.048469392215566565, |
| "completions/clipped_ratio": -2.875, |
| "epoch": 3.9226361031518624, |
| "grad_norm": 35.18229290368417, |
| "learning_rate": 3.261237911985404e-09, |
| "loss": -0.2328, |
| "num_tokens": 105011857.0, |
| "residual_var": 0.03635205328464508, |
| "reward": 0.8671875, |
| "reward_std": 0.10205793380737305, |
| "rewards/drgrpo_math_reward/mean": 0.8671875, |
| "rewards/drgrpo_math_reward/std": 0.3400367796421051, |
| "rho2": 0.2499999701976776, |
| "step": 683 |
| }, |
| { |
| "advantages/mean": 1.1641532182693481e-10, |
| "advantages/snr": 7.905760564528911e-10, |
| "advantages/std": 0.14725378155708313, |
| "advantages/var": 0.021683676182861156, |
| "completions/clipped_ratio": -2.96875, |
| "epoch": 3.9283667621776504, |
| "grad_norm": 27.054667621788635, |
| "learning_rate": 2.909114609297325e-09, |
| "loss": 0.0683, |
| "num_tokens": 105149957.0, |
| "residual_var": 0.019650837406516075, |
| "reward": 0.921875, |
| "reward_std": 0.041687894612550735, |
| "rewards/drgrpo_math_reward/mean": 0.921875, |
| "rewards/drgrpo_math_reward/std": 0.26889389753341675, |
| "rho2": 0.09374997764825821, |
| "step": 684 |
| }, |
| { |
| "advantages/mean": 4.656612873077393e-10, |
| "advantages/snr": 2.3609033788060646e-09, |
| "advantages/std": 0.19723859429359436, |
| "advantages/var": 0.038903063078913114, |
| "completions/clipped_ratio": -2.671875, |
| "epoch": 3.9340974212034383, |
| "grad_norm": 61.97921697441788, |
| "learning_rate": 2.577074727165951e-09, |
| "loss": 0.0996, |
| "num_tokens": 105303131.0, |
| "residual_var": 0.03525590896606445, |
| "reward": 0.76953125, |
| "reward_std": 0.07167815417051315, |
| "rewards/drgrpo_math_reward/mean": 0.76953125, |
| "rewards/drgrpo_math_reward/std": 0.4219578504562378, |
| "rho2": 0.09374997764825821, |
| "step": 685 |
| }, |
| { |
| "advantages/mean": 1.1641532182693481e-10, |
| "advantages/snr": 4.656612686812885e-10, |
| "advantages/std": 0.25, |
| "advantages/var": 0.0625, |
| "completions/clipped_ratio": -2.90625, |
| "epoch": 3.9398280802292263, |
| "grad_norm": 47.583896244179826, |
| "learning_rate": 2.2651249535439177e-09, |
| "loss": -0.2606, |
| "num_tokens": 105462806.0, |
| "residual_var": 0.046875014901161194, |
| "reward": 0.828125, |
| "reward_std": 0.12863078713417053, |
| "rewards/drgrpo_math_reward/mean": 0.828125, |
| "rewards/drgrpo_math_reward/std": 0.3780108094215393, |
| "rho2": 0.24999995529651642, |
| "step": 686 |
| }, |
| { |
| "advantages/mean": 2.3283064365386963e-10, |
| "advantages/snr": 9.772780097527553e-10, |
| "advantages/std": 0.23824401199817657, |
| "advantages/var": 0.056760209252987304, |
| "completions/clipped_ratio": -2.84375, |
| "epoch": 3.945558739255014, |
| "grad_norm": 42.244044782175166, |
| "learning_rate": 1.973271571728441e-09, |
| "loss": -0.0936, |
| "num_tokens": 105603670.0, |
| "residual_var": 0.04079641029238701, |
| "reward": 0.86328125, |
| "reward_std": 0.11652141809463501, |
| "rewards/drgrpo_math_reward/mean": 0.86328125, |
| "rewards/drgrpo_math_reward/std": 0.34422317147254944, |
| "rho2": 0.2812499403953552, |
| "step": 687 |
| }, |
| { |
| "advantages/mean": -3.4924596548080444e-10, |
| "advantages/snr": 2.1339279744244726e-09, |
| "advantages/std": 0.16366341710090637, |
| "advantages/var": 0.026785714097145252, |
| "completions/clipped_ratio": -2.8125, |
| "epoch": 3.951289398280802, |
| "grad_norm": 31.72833119113645, |
| "learning_rate": 1.701520460235084e-09, |
| "loss": -0.2415, |
| "num_tokens": 105741207.0, |
| "residual_var": 0.024274565279483795, |
| "reward": 0.8203125, |
| "reward_std": 0.05326685309410095, |
| "rewards/drgrpo_math_reward/mean": 0.8203125, |
| "rewards/drgrpo_math_reward/std": 0.38467901945114136, |
| "rho2": 0.09374997764825821, |
| "step": 688 |
| }, |
| { |
| "advantages/mean": 4.656612873077393e-10, |
| "advantages/snr": 2.3417865283086908e-09, |
| "advantages/std": 0.19884872436523438, |
| "advantages/var": 0.039540815181680955, |
| "completions/clipped_ratio": -2.890625, |
| "epoch": 3.95702005730659, |
| "grad_norm": 36.907782413793186, |
| "learning_rate": 1.4498770926790749e-09, |
| "loss": -0.2752, |
| "num_tokens": 105887946.0, |
| "residual_var": 0.03089127317070961, |
| "reward": 0.84375, |
| "reward_std": 0.08588206768035889, |
| "rewards/drgrpo_math_reward/mean": 0.84375, |
| "rewards/drgrpo_math_reward/std": 0.3638034462928772, |
| "rho2": 0.21874994039535522, |
| "step": 689 |
| }, |
| { |
| "advantages/mean": 6.984919309616089e-10, |
| "advantages/snr": 3.1726896131954143e-09, |
| "advantages/std": 0.220157653093338, |
| "advantages/var": 0.048469392215566565, |
| "completions/clipped_ratio": -2.90625, |
| "epoch": 3.962750716332378, |
| "grad_norm": 54.84225952553401, |
| "learning_rate": 1.2183465376650603e-09, |
| "loss": -0.6302, |
| "num_tokens": 106028876.0, |
| "residual_var": 0.03938138857483864, |
| "reward": 0.8515625, |
| "reward_std": 0.09495474398136139, |
| "rewards/drgrpo_math_reward/mean": 0.8515625, |
| "rewards/drgrpo_math_reward/std": 0.3562295734882355, |
| "rho2": 0.1874999701976776, |
| "step": 690 |
| }, |
| { |
| "advantages/mean": -4.656612873077393e-10, |
| "advantages/snr": 3.424081661465097e-09, |
| "advantages/std": 0.13599595427513123, |
| "advantages/var": 0.018494899579203583, |
| "completions/clipped_ratio": -2.875, |
| "epoch": 3.968481375358166, |
| "grad_norm": 32.456933445245255, |
| "learning_rate": 1.0069334586854105e-09, |
| "loss": -0.1288, |
| "num_tokens": 106162604.0, |
| "residual_var": 0.016761010512709618, |
| "reward": 0.88671875, |
| "reward_std": 0.0382704995572567, |
| "rewards/drgrpo_math_reward/mean": 0.88671875, |
| "rewards/drgrpo_math_reward/std": 0.31755712628364563, |
| "rho2": 0.09374997764825821, |
| "step": 691 |
| }, |
| { |
| "advantages/mean": 1.1641532182693481e-10, |
| "advantages/snr": 4.942236508931846e-10, |
| "advantages/std": 0.2355518937110901, |
| "advantages/var": 0.05548469463088068, |
| "completions/clipped_ratio": -2.734375, |
| "epoch": 3.974212034383954, |
| "grad_norm": 52.73347449536594, |
| "learning_rate": 8.156421140254055e-10, |
| "loss": 0.3587, |
| "num_tokens": 106323078.0, |
| "residual_var": 0.045081328600645065, |
| "reward": 0.84765625, |
| "reward_std": 0.10178709030151367, |
| "rewards/drgrpo_math_reward/mean": 0.84765625, |
| "rewards/drgrpo_math_reward/std": 0.3600577116012573, |
| "rho2": 0.18749995529651642, |
| "step": 692 |
| }, |
| { |
| "advantages/mean": -6.984919309616089e-10, |
| "advantages/snr": 4.967679260771851e-09, |
| "advantages/std": 0.14060728251934052, |
| "advantages/var": 0.01977040789747364, |
| "completions/clipped_ratio": -2.96875, |
| "epoch": 3.9799426934097424, |
| "grad_norm": 26.14018984078511, |
| "learning_rate": 6.44476356678636e-10, |
| "loss": -0.1711, |
| "num_tokens": 106443771.0, |
| "residual_var": 0.01791694387793541, |
| "reward": 0.94921875, |
| "reward_std": 0.03998042270541191, |
| "rewards/drgrpo_math_reward/mean": 0.94921875, |
| "rewards/drgrpo_math_reward/std": 0.21998079121112823, |
| "rho2": 0.0937499850988388, |
| "step": 693 |
| }, |
| { |
| "advantages/mean": -8.149072527885437e-10, |
| "advantages/snr": 4.810330410837374e-09, |
| "advantages/std": 0.16940774023532867, |
| "advantages/var": 0.028698982451640598, |
| "completions/clipped_ratio": -2.953125, |
| "epoch": 3.98567335243553, |
| "grad_norm": 31.191093269885084, |
| "learning_rate": 4.934396342683999e-10, |
| "loss": 0.0238, |
| "num_tokens": 106579713.0, |
| "residual_var": 0.02421477437019348, |
| "reward": 0.91796875, |
| "reward_std": 0.06207750737667084, |
| "rewards/drgrpo_math_reward/mean": 0.91796875, |
| "rewards/drgrpo_math_reward/std": 0.2749498784542084, |
| "rho2": 0.1562499701976776, |
| "step": 694 |
| }, |
| { |
| "advantages/mean": 6.984919309616089e-10, |
| "advantages/snr": 3.5707444382625662e-09, |
| "advantages/std": 0.19561520218849182, |
| "advantages/var": 0.038265307327244535, |
| "completions/clipped_ratio": -2.90625, |
| "epoch": 3.9914040114613183, |
| "grad_norm": 38.53702742178626, |
| "learning_rate": 3.625349889788687e-10, |
| "loss": -0.2754, |
| "num_tokens": 106715542.0, |
| "residual_var": 0.031090570613741875, |
| "reward": 0.8984375, |
| "reward_std": 0.07825091481208801, |
| "rewards/drgrpo_math_reward/mean": 0.8984375, |
| "rewards/drgrpo_math_reward/std": 0.3026638329029083, |
| "rho2": 0.1874999701976776, |
| "step": 695 |
| }, |
| { |
| "advantages/mean": 2.3283064365386963e-10, |
| "advantages/snr": 1.043916747715097e-09, |
| "advantages/std": 0.22303563356399536, |
| "advantages/var": 0.049744893839292814, |
| "completions/clipped_ratio": -2.953125, |
| "epoch": 3.997134670487106, |
| "grad_norm": 49.227715319987915, |
| "learning_rate": 2.517650574934693e-10, |
| "loss": -0.3185, |
| "num_tokens": 106860947.0, |
| "residual_var": 0.03886320814490318, |
| "reward": 0.890625, |
| "reward_std": 0.10258589684963226, |
| "rewards/drgrpo_math_reward/mean": 0.890625, |
| "rewards/drgrpo_math_reward/std": 0.31272050738334656, |
| "rho2": 0.2187499701976776, |
| "step": 696 |
| }, |
| { |
| "epoch": 3.997134670487106, |
| "step": 696, |
| "total_flos": 0.0, |
| "train_loss": -0.9753264665694629, |
| "train_runtime": 28353.7016, |
| "train_samples_per_second": 0.787, |
| "train_steps_per_second": 0.025 |
| } |
| ], |
| "logging_steps": 1, |
| "max_steps": 700, |
| "num_input_tokens_seen": 106860947, |
| "num_train_epochs": 4, |
| "save_steps": 500, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": false |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 0.0, |
| "train_batch_size": 16, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|