JH Na
Model save
b7a4d14 verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 3.997134670487106,
"eval_steps": 500,
"global_step": 696,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"advantages/mean": -1.862645149230957e-09,
"advantages/snr": 4.005929201951151e-09,
"advantages/std": 0.46497204899787903,
"advantages/var": 0.21619900634928602,
"completions/clipped_ratio": -1.890625,
"epoch": 0.0057306590257879654,
"grad_norm": 591.0469885189827,
"learning_rate": 2e-06,
"loss": -34.4309,
"num_tokens": 188198.0,
"residual_var": 0.060805998742580414,
"reward": 0.43359375,
"reward_std": 0.4025588035583496,
"rewards/drgrpo_math_reward/mean": 0.43359375,
"rewards/drgrpo_math_reward/std": 0.4965413510799408,
"rho2": 0.7187498807907104,
"step": 1
},
{
"advantages/mean": 4.889443516731262e-09,
"advantages/snr": 1.0609876673533912e-08,
"advantages/std": 0.46083885431289673,
"advantages/var": 0.21237244964442326,
"completions/clipped_ratio": -1.578125,
"epoch": 0.011461318051575931,
"grad_norm": 995.3395033100987,
"learning_rate": 1.9999899289920054e-06,
"loss": -38.2217,
"num_tokens": 388582.0,
"residual_var": 0.04645651578903198,
"reward": 0.37890625,
"reward_std": 0.3851699233055115,
"rewards/drgrpo_math_reward/mean": 0.37890625,
"rewards/drgrpo_math_reward/std": 0.4860650300979614,
"rho2": 0.7812498211860657,
"step": 2
},
{
"advantages/mean": -1.0477378964424133e-09,
"advantages/snr": 2.3413202815254037e-09,
"advantages/std": 0.44749873876571655,
"advantages/var": 0.20025512119690703,
"completions/clipped_ratio": -1.5625,
"epoch": 0.017191977077363897,
"grad_norm": 261.62193963816037,
"learning_rate": 1.999959716170871e-06,
"loss": -35.6328,
"num_tokens": 602957.0,
"residual_var": 0.056321777403354645,
"reward": 0.4375,
"reward_std": 0.3725636303424835,
"rewards/drgrpo_math_reward/mean": 0.4375,
"rewards/drgrpo_math_reward/std": 0.49705013632774353,
"rho2": 0.7187498807907104,
"step": 3
},
{
"advantages/mean": -1.3969838619232178e-09,
"advantages/snr": 3.0971986497173363e-09,
"advantages/std": 0.45104753971099854,
"advantages/var": 0.2034438830793448,
"completions/clipped_ratio": -1.8125,
"epoch": 0.022922636103151862,
"grad_norm": 388.3029257567336,
"learning_rate": 1.999909362145145e-06,
"loss": -26.3566,
"num_tokens": 816817.0,
"residual_var": 0.06993386149406433,
"reward": 0.58984375,
"reward_std": 0.37533193826675415,
"rewards/drgrpo_math_reward/mean": 0.58984375,
"rewards/drgrpo_math_reward/std": 0.49282538890838623,
"rho2": 0.6562498807907104,
"step": 4
},
{
"advantages/mean": -3.3760443329811096e-09,
"advantages/snr": 7.850225838136565e-09,
"advantages/std": 0.43005695939064026,
"advantages/var": 0.1849489883203228,
"completions/clipped_ratio": -1.421875,
"epoch": 0.02865329512893983,
"grad_norm": 179.31864687689426,
"learning_rate": 1.999838867929058e-06,
"loss": -18.2458,
"num_tokens": 1042032.0,
"residual_var": 0.07513555139303207,
"reward": 0.484375,
"reward_std": 0.33824339509010315,
"rewards/drgrpo_math_reward/mean": 0.484375,
"rewards/drgrpo_math_reward/std": 0.5007347464561462,
"rho2": 0.5937498807907104,
"step": 5
},
{
"advantages/mean": 1.862645149230957e-09,
"advantages/snr": 4.522330689365846e-09,
"advantages/std": 0.41187724471092224,
"advantages/var": 0.16964286471066092,
"completions/clipped_ratio": -2.03125,
"epoch": 0.034383954154727794,
"grad_norm": 233.82793486575466,
"learning_rate": 1.9997482349425066e-06,
"loss": -11.5251,
"num_tokens": 1238234.0,
"residual_var": 0.03180807828903198,
"reward": 0.703125,
"reward_std": 0.34193065762519836,
"rewards/drgrpo_math_reward/mean": 0.703125,
"rewards/drgrpo_math_reward/std": 0.45777595043182373,
"rho2": 0.8124997615814209,
"step": 6
},
{
"advantages/mean": 1.7462298274040222e-09,
"advantages/snr": 3.972400980534068e-09,
"advantages/std": 0.4395905137062073,
"advantages/var": 0.1932398197404872,
"completions/clipped_ratio": -1.859375,
"epoch": 0.04011461318051576,
"grad_norm": 229.54028598418301,
"learning_rate": 1.999637465011021e-06,
"loss": -23.6846,
"num_tokens": 1443163.0,
"residual_var": 0.03019375540316105,
"reward": 0.62890625,
"reward_std": 0.3734835982322693,
"rewards/drgrpo_math_reward/mean": 0.62890625,
"rewards/drgrpo_math_reward/std": 0.48404383659362793,
"rho2": 0.8437498211860657,
"step": 7
},
{
"advantages/mean": -3.4924596548080444e-10,
"advantages/snr": 8.945516905539432e-10,
"advantages/std": 0.3904145061969757,
"advantages/var": 0.15242348664902838,
"completions/clipped_ratio": -1.875,
"epoch": 0.045845272206303724,
"grad_norm": 186.0838781990229,
"learning_rate": 1.9995065603657316e-06,
"loss": -18.2365,
"num_tokens": 1656784.0,
"residual_var": 0.05715882405638695,
"reward": 0.64453125,
"reward_std": 0.2987997531890869,
"rewards/drgrpo_math_reward/mean": 0.64453125,
"rewards/drgrpo_math_reward/std": 0.4795927405357361,
"rho2": 0.6249998807907104,
"step": 8
},
{
"advantages/mean": 4.656612873077393e-10,
"advantages/snr": 1.2028437765240612e-09,
"advantages/std": 0.3871336281299591,
"advantages/var": 0.14987244602906546,
"completions/clipped_ratio": -1.921875,
"epoch": 0.05157593123209169,
"grad_norm": 101.5457998948026,
"learning_rate": 1.999355523643321e-06,
"loss": -10.5425,
"num_tokens": 1858387.0,
"residual_var": 0.0608857125043869,
"reward": 0.71484375,
"reward_std": 0.28300461173057556,
"rewards/drgrpo_math_reward/mean": 0.71484375,
"rewards/drgrpo_math_reward/std": 0.4523732364177704,
"rho2": 0.5937498807907104,
"step": 9
},
{
"advantages/mean": -2.9103830456733704e-09,
"advantages/snr": 8.10862952743916e-09,
"advantages/std": 0.35892415046691895,
"advantages/var": 0.12882654578839947,
"completions/clipped_ratio": -1.640625,
"epoch": 0.05730659025787966,
"grad_norm": 213.60109250542604,
"learning_rate": 1.9991843578859746e-06,
"loss": -19.1132,
"num_tokens": 2082512.0,
"residual_var": 0.05636163428425789,
"reward": 0.5,
"reward_std": 0.2613256275653839,
"rewards/drgrpo_math_reward/mean": 0.5,
"rewards/drgrpo_math_reward/std": 0.5009794235229492,
"rho2": 0.5624998807907104,
"step": 10
},
{
"advantages/mean": -1.5133991837501526e-09,
"advantages/snr": 4.125582353770003e-09,
"advantages/std": 0.3668328523635864,
"advantages/var": 0.1345663415732048,
"completions/clipped_ratio": -1.984375,
"epoch": 0.06303724928366762,
"grad_norm": 115.78635926271141,
"learning_rate": 1.9989930665413145e-06,
"loss": -11.9284,
"num_tokens": 2269571.0,
"residual_var": 0.06728318333625793,
"reward": 0.69921875,
"reward_std": 0.2610597014427185,
"rewards/drgrpo_math_reward/mean": 0.69921875,
"rewards/drgrpo_math_reward/std": 0.45949608087539673,
"rho2": 0.49999991059303284,
"step": 11
},
{
"advantages/mean": -9.313225746154785e-10,
"advantages/snr": 2.503474956315218e-09,
"advantages/std": 0.3720119297504425,
"advantages/var": 0.13839287587664817,
"completions/clipped_ratio": -1.46875,
"epoch": 0.06876790830945559,
"grad_norm": 86.54669725830207,
"learning_rate": 1.998781653462335e-06,
"loss": -5.6964,
"num_tokens": 2475242.0,
"residual_var": 0.06487166881561279,
"reward": 0.52734375,
"reward_std": 0.2703958749771118,
"rewards/drgrpo_math_reward/mean": 0.52734375,
"rewards/drgrpo_math_reward/std": 0.5002297759056091,
"rho2": 0.5312498807907104,
"step": 12
},
{
"advantages/mean": 1.6298145055770874e-09,
"advantages/snr": 4.4965306713937905e-09,
"advantages/std": 0.3624604344367981,
"advantages/var": 0.1313775665321124,
"completions/clipped_ratio": -2.03125,
"epoch": 0.07449856733524356,
"grad_norm": 248.72480558278457,
"learning_rate": 1.998550122907321e-06,
"loss": -6.6781,
"num_tokens": 2676337.0,
"residual_var": 0.04926660656929016,
"reward": 0.703125,
"reward_std": 0.27658790349960327,
"rewards/drgrpo_math_reward/mean": 0.703125,
"rewards/drgrpo_math_reward/std": 0.45777595043182373,
"rho2": 0.6249998807907104,
"step": 13
},
{
"advantages/mean": -9.313225746154785e-10,
"advantages/snr": 3.0520855732441985e-09,
"advantages/std": 0.30514299869537354,
"advantages/var": 0.09311224965280473,
"completions/clipped_ratio": -2.125,
"epoch": 0.08022922636103152,
"grad_norm": 167.19083795315274,
"learning_rate": 1.9982984795397646e-06,
"loss": -4.0752,
"num_tokens": 2865669.0,
"residual_var": 0.05528540536761284,
"reward": 0.734375,
"reward_std": 0.18596167862415314,
"rewards/drgrpo_math_reward/mean": 0.734375,
"rewards/drgrpo_math_reward/std": 0.4425306022167206,
"rho2": 0.4062499403953552,
"step": 14
},
{
"advantages/mean": 2.3283064365386963e-10,
"advantages/snr": 6.815341083570872e-10,
"advantages/std": 0.3416272699832916,
"advantages/var": 0.11670919159623683,
"completions/clipped_ratio": -1.71875,
"epoch": 0.08595988538681948,
"grad_norm": 88.29950202682701,
"learning_rate": 1.9980267284282714e-06,
"loss": -3.5993,
"num_tokens": 3093146.0,
"residual_var": 0.06929609924554825,
"reward": 0.58203125,
"reward_std": 0.21633264422416687,
"rewards/drgrpo_math_reward/mean": 0.58203125,
"rewards/drgrpo_math_reward/std": 0.49419113993644714,
"rho2": 0.40624991059303284,
"step": 15
},
{
"advantages/mean": 2.2118911147117615e-09,
"advantages/snr": 5.775276936058749e-09,
"advantages/std": 0.38299307227134705,
"advantages/var": 0.14668369340784526,
"completions/clipped_ratio": -1.609375,
"epoch": 0.09169054441260745,
"grad_norm": 127.5442814323443,
"learning_rate": 1.997734875046456e-06,
"loss": -5.593,
"num_tokens": 3312316.0,
"residual_var": 0.06417413055896759,
"reward": 0.6015625,
"reward_std": 0.28657418489456177,
"rewards/drgrpo_math_reward/mean": 0.6015625,
"rewards/drgrpo_math_reward/std": 0.4905354380607605,
"rho2": 0.5624998807907104,
"step": 16
},
{
"advantages/mean": 5.820766091346741e-10,
"advantages/snr": 1.550447277389203e-09,
"advantages/std": 0.37542495131492615,
"advantages/var": 0.14094389406981467,
"completions/clipped_ratio": -2.234375,
"epoch": 0.09742120343839542,
"grad_norm": 268.45005256485683,
"learning_rate": 1.997422925272834e-06,
"loss": -5.6212,
"num_tokens": 3497926.0,
"residual_var": 0.05285397917032242,
"reward": 0.73046875,
"reward_std": 0.2803860306739807,
"rewards/drgrpo_math_reward/mean": 0.73046875,
"rewards/drgrpo_math_reward/std": 0.44458550214767456,
"rho2": 0.6249998807907104,
"step": 17
},
{
"advantages/mean": 2.561137080192566e-09,
"advantages/snr": 7.559090868705042e-09,
"advantages/std": 0.33881548047065735,
"advantages/var": 0.11479592980656239,
"completions/clipped_ratio": -2.34375,
"epoch": 0.10315186246418338,
"grad_norm": 149.2003080189042,
"learning_rate": 1.9970908853907026e-06,
"loss": -3.6572,
"num_tokens": 3674901.0,
"residual_var": 0.05381061136722565,
"reward": 0.75,
"reward_std": 0.23357081413269043,
"rewards/drgrpo_math_reward/mean": 0.75,
"rewards/drgrpo_math_reward/std": 0.4338609278202057,
"rho2": 0.5312498807907104,
"step": 18
},
{
"advantages/mean": -2.444721758365631e-09,
"advantages/snr": 7.060306184456525e-09,
"advantages/std": 0.3462628424167633,
"advantages/var": 0.11989795603853626,
"completions/clipped_ratio": -2.140625,
"epoch": 0.10888252148997135,
"grad_norm": 77.94130251951155,
"learning_rate": 1.9967387620880144e-06,
"loss": -4.1568,
"num_tokens": 3868267.0,
"residual_var": 0.07118944078683853,
"reward": 0.7265625,
"reward_std": 0.22620412707328796,
"rewards/drgrpo_math_reward/mean": 0.7265625,
"rewards/drgrpo_math_reward/std": 0.446596622467041,
"rho2": 0.40624991059303284,
"step": 19
},
{
"advantages/mean": -6.984919309616089e-10,
"advantages/snr": 1.8042655258901198e-09,
"advantages/std": 0.3871336579322815,
"advantages/var": 0.14987246910402874,
"completions/clipped_ratio": -2.5,
"epoch": 0.11461318051575932,
"grad_norm": 95.62080008887207,
"learning_rate": 1.996366562457243e-06,
"loss": -4.9432,
"num_tokens": 4056543.0,
"residual_var": 0.0608857087790966,
"reward": 0.73828125,
"reward_std": 0.289458692073822,
"rewards/drgrpo_math_reward/mean": 0.73828125,
"rewards/drgrpo_math_reward/std": 0.4404313564300537,
"rho2": 0.5937498807907104,
"step": 20
},
{
"advantages/mean": 1.3969838619232178e-09,
"advantages/snr": 4.3869819386401945e-09,
"advantages/std": 0.31843847036361694,
"advantages/var": 0.10140305940752015,
"completions/clipped_ratio": -2.375,
"epoch": 0.12034383954154727,
"grad_norm": 94.44241989116894,
"learning_rate": 1.995974293995239e-06,
"loss": -3.5967,
"num_tokens": 4239399.0,
"residual_var": 0.06020808964967728,
"reward": 0.78515625,
"reward_std": 0.19450394809246063,
"rewards/drgrpo_math_reward/mean": 0.78515625,
"rewards/drgrpo_math_reward/std": 0.4115184545516968,
"rho2": 0.40624991059303284,
"step": 21
},
{
"advantages/mean": 2.6775524020195007e-09,
"advantages/snr": 7.052721734123111e-09,
"advantages/std": 0.3796480894088745,
"advantages/var": 0.14413267179180878,
"completions/clipped_ratio": -1.765625,
"epoch": 0.12607449856733524,
"grad_norm": 109.37752818286293,
"learning_rate": 1.99556196460308e-06,
"loss": -4.1523,
"num_tokens": 4456912.0,
"residual_var": 0.0540497712790966,
"reward": 0.5859375,
"reward_std": 0.2902575135231018,
"rewards/drgrpo_math_reward/mean": 0.5859375,
"rewards/drgrpo_math_reward/std": 0.4935242533683777,
"rho2": 0.6249998807907104,
"step": 22
},
{
"advantages/mean": -4.656612873077393e-10,
"advantages/snr": 1.3412577461348454e-09,
"advantages/std": 0.3471825420856476,
"advantages/var": 0.12053571752905246,
"completions/clipped_ratio": -2.40625,
"epoch": 0.1318051575931232,
"grad_norm": 103.74041156571671,
"learning_rate": 1.995129582585911e-06,
"loss": -3.7992,
"num_tokens": 4641229.0,
"residual_var": 0.05650113895535469,
"reward": 0.70703125,
"reward_std": 0.24686214327812195,
"rewards/drgrpo_math_reward/mean": 0.70703125,
"rewards/drgrpo_math_reward/std": 0.45601576566696167,
"rho2": 0.5312498807907104,
"step": 23
},
{
"advantages/mean": 1.6298145055770874e-09,
"advantages/snr": 4.732108640838661e-09,
"advantages/std": 0.3444161117076874,
"advantages/var": 0.11862245800384219,
"completions/clipped_ratio": -2.09375,
"epoch": 0.13753581661891118,
"grad_norm": 79.54553022424207,
"learning_rate": 1.994677156652778e-06,
"loss": -4.236,
"num_tokens": 4859649.0,
"residual_var": 0.05189733952283859,
"reward": 0.6953125,
"reward_std": 0.23816770315170288,
"rewards/drgrpo_math_reward/mean": 0.6953125,
"rewards/drgrpo_math_reward/std": 0.4611765742301941,
"rho2": 0.5624998807907104,
"step": 24
},
{
"advantages/mean": -1.0477378964424133e-09,
"advantages/snr": 2.784512456412912e-09,
"advantages/std": 0.37627336382865906,
"advantages/var": 0.14158164432693443,
"completions/clipped_ratio": -1.953125,
"epoch": 0.14326647564469913,
"grad_norm": 154.38060766574566,
"learning_rate": 1.994204695916451e-06,
"loss": -4.3948,
"num_tokens": 5063750.0,
"residual_var": 0.05751756578683853,
"reward": 0.65625,
"reward_std": 0.27446234226226807,
"rewards/drgrpo_math_reward/mean": 0.65625,
"rewards/drgrpo_math_reward/std": 0.47588926553726196,
"rho2": 0.5937498807907104,
"step": 25
},
{
"advantages/mean": -1.0477378964424133e-09,
"advantages/snr": 3.290236453980146e-09,
"advantages/std": 0.31843847036361694,
"advantages/var": 0.10140305940752015,
"completions/clipped_ratio": -2.359375,
"epoch": 0.1489971346704871,
"grad_norm": 90.32609776484996,
"learning_rate": 1.9937122098932426e-06,
"loss": -3.7122,
"num_tokens": 5244742.0,
"residual_var": 0.066545769572258,
"reward": 0.72265625,
"reward_std": 0.2003089338541031,
"rewards/drgrpo_math_reward/mean": 0.72265625,
"rewards/drgrpo_math_reward/std": 0.4485645890235901,
"rho2": 0.3437499403953552,
"step": 26
},
{
"advantages/mean": 3.026798367500305e-09,
"advantages/snr": 8.539322219708342e-09,
"advantages/std": 0.3544541597366333,
"advantages/var": 0.12563775135460276,
"completions/clipped_ratio": -2.0,
"epoch": 0.15472779369627507,
"grad_norm": 73.45824601580452,
"learning_rate": 1.9931997085028128e-06,
"loss": -4.1696,
"num_tokens": 5438925.0,
"residual_var": 0.0667450800538063,
"reward": 0.71484375,
"reward_std": 0.2384297400712967,
"rewards/drgrpo_math_reward/mean": 0.71484375,
"rewards/drgrpo_math_reward/std": 0.4523732364177704,
"rho2": 0.46874991059303284,
"step": 27
},
{
"advantages/mean": 0.0,
"advantages/snr": 0.0,
"advantages/std": 0.29558831453323364,
"advantages/var": 0.08737245168859786,
"completions/clipped_ratio": -2.3125,
"epoch": 0.16045845272206305,
"grad_norm": 72.70898121193805,
"learning_rate": 1.9926672020679734e-06,
"loss": -2.3686,
"num_tokens": 5615770.0,
"residual_var": 0.05733818560838699,
"reward": 0.66015625,
"reward_std": 0.1737360805273056,
"rewards/drgrpo_math_reward/mean": 0.66015625,
"rewards/drgrpo_math_reward/std": 0.47458380460739136,
"rho2": 0.3437499403953552,
"step": 28
},
{
"advantages/mean": -1.1641532182693481e-09,
"advantages/snr": 3.2760502000971653e-09,
"advantages/std": 0.3553526699542999,
"advantages/var": 0.12627552004364961,
"completions/clipped_ratio": -1.96875,
"epoch": 0.166189111747851,
"grad_norm": 79.31434150094832,
"learning_rate": 1.9921147013144777e-06,
"loss": -1.8823,
"num_tokens": 5833569.0,
"residual_var": 0.06708388775587082,
"reward": 0.640625,
"reward_std": 0.24488137662410736,
"rewards/drgrpo_math_reward/mean": 0.640625,
"rewards/drgrpo_math_reward/std": 0.4807571768760681,
"rho2": 0.46874988079071045,
"step": 29
},
{
"advantages/mean": -1.6298145055770874e-09,
"advantages/snr": 4.422026899202607e-09,
"advantages/std": 0.3685672879219055,
"advantages/var": 0.1358418457261088,
"completions/clipped_ratio": -2.296875,
"epoch": 0.17191977077363896,
"grad_norm": 75.57215984106453,
"learning_rate": 1.9915422173708044e-06,
"loss": -3.8191,
"num_tokens": 6026380.0,
"residual_var": 0.05943083018064499,
"reward": 0.72265625,
"reward_std": 0.26170387864112854,
"rewards/drgrpo_math_reward/mean": 0.72265625,
"rewards/drgrpo_math_reward/std": 0.4485645890235901,
"rho2": 0.5624998807907104,
"step": 30
},
{
"advantages/mean": -1.3969838619232178e-09,
"advantages/snr": 4.280614987975828e-09,
"advantages/std": 0.32635119557380676,
"advantages/var": 0.10650510285245307,
"completions/clipped_ratio": -2.21875,
"epoch": 0.17765042979942694,
"grad_norm": 60.21761972546305,
"learning_rate": 1.9909497617679347e-06,
"loss": -2.0778,
"num_tokens": 6220330.0,
"residual_var": 0.07322227954864502,
"reward": 0.64453125,
"reward_std": 0.19951260089874268,
"rewards/drgrpo_math_reward/mean": 0.64453125,
"rewards/drgrpo_math_reward/std": 0.4795927405357361,
"rho2": 0.3124999403953552,
"step": 31
},
{
"advantages/mean": 2.2118911147117615e-09,
"advantages/snr": 6.256172755954382e-09,
"advantages/std": 0.3535533845424652,
"advantages/var": 0.12499999572143228,
"completions/clipped_ratio": -2.671875,
"epoch": 0.1833810888252149,
"grad_norm": 75.90161991383617,
"learning_rate": 1.9903373464391184e-06,
"loss": -2.4459,
"num_tokens": 6388484.0,
"residual_var": 0.06250002980232239,
"reward": 0.8203125,
"reward_std": 0.24435339868068695,
"rewards/drgrpo_math_reward/mean": 0.8203125,
"rewards/drgrpo_math_reward/std": 0.38467901945114136,
"rho2": 0.49999991059303284,
"step": 32
},
{
"advantages/mean": -3.4924596548080444e-10,
"advantages/snr": 9.302683664335218e-10,
"advantages/std": 0.37542495131492615,
"advantages/var": 0.14094389406981467,
"completions/clipped_ratio": -1.953125,
"epoch": 0.18911174785100288,
"grad_norm": 88.08278791565704,
"learning_rate": 1.9897049837196347e-06,
"loss": -4.1498,
"num_tokens": 6608514.0,
"residual_var": 0.06606747210025787,
"reward": 0.58984375,
"reward_std": 0.2673616409301758,
"rewards/drgrpo_math_reward/mean": 0.58984375,
"rewards/drgrpo_math_reward/std": 0.49282538890838623,
"rho2": 0.5312498807907104,
"step": 33
},
{
"advantages/mean": -5.820766091346741e-10,
"advantages/snr": 1.9764402082422215e-09,
"advantages/std": 0.29450756311416626,
"advantages/var": 0.08673470473144462,
"completions/clipped_ratio": -2.640625,
"epoch": 0.19484240687679083,
"grad_norm": 158.9426980296641,
"learning_rate": 1.9890526863465443e-06,
"loss": -1.9826,
"num_tokens": 6787702.0,
"residual_var": 0.046077825129032135,
"reward": 0.8203125,
"reward_std": 0.18623007833957672,
"rewards/drgrpo_math_reward/mean": 0.8203125,
"rewards/drgrpo_math_reward/std": 0.38467901945114136,
"rho2": 0.46874988079071045,
"step": 34
},
{
"advantages/mean": 1.1641532182693481e-09,
"advantages/snr": 3.763894853665204e-09,
"advantages/std": 0.30929481983184814,
"advantages/var": 0.0956632855748154,
"completions/clipped_ratio": -2.5,
"epoch": 0.20057306590257878,
"grad_norm": 54.19371702022302,
"learning_rate": 1.9883804674584306e-06,
"loss": 0.1586,
"num_tokens": 6974380.0,
"residual_var": 0.05381060764193535,
"reward": 0.7578125,
"reward_std": 0.19530275464057922,
"rewards/drgrpo_math_reward/mean": 0.7578125,
"rewards/drgrpo_math_reward/std": 0.4292463958263397,
"rho2": 0.43749991059303284,
"step": 35
},
{
"advantages/mean": -2.444721758365631e-09,
"advantages/snr": 7.381391359252789e-09,
"advantages/std": 0.33120065927505493,
"advantages/var": 0.10969387670423103,
"completions/clipped_ratio": -2.359375,
"epoch": 0.20630372492836677,
"grad_norm": 93.4601212855401,
"learning_rate": 1.9876883405951377e-06,
"loss": -3.3799,
"num_tokens": 7168721.0,
"residual_var": 0.06513076275587082,
"reward": 0.765625,
"reward_std": 0.2095002979040146,
"rewards/drgrpo_math_reward/mean": 0.765625,
"rewards/drgrpo_math_reward/std": 0.42443734407424927,
"rho2": 0.4062499403953552,
"step": 36
},
{
"advantages/mean": 0.0,
"advantages/snr": 0.0,
"advantages/std": 0.3499271273612976,
"advantages/var": 0.1224489944633298,
"completions/clipped_ratio": -2.375,
"epoch": 0.21203438395415472,
"grad_norm": 153.14054844478008,
"learning_rate": 1.9869763196974956e-06,
"loss": -2.5922,
"num_tokens": 7366495.0,
"residual_var": 0.06887757778167725,
"reward": 0.6484375,
"reward_std": 0.2290911078453064,
"rewards/drgrpo_math_reward/mean": 0.6484375,
"rewards/drgrpo_math_reward/std": 0.47839346528053284,
"rho2": 0.43749988079071045,
"step": 37
},
{
"advantages/mean": 1.6298145055770874e-09,
"advantages/snr": 4.5749320956588125e-09,
"advantages/std": 0.3562488853931427,
"advantages/var": 0.12691326834385652,
"completions/clipped_ratio": -2.421875,
"epoch": 0.2177650429799427,
"grad_norm": 73.9873748933185,
"learning_rate": 1.986244419107041e-06,
"loss": -1.17,
"num_tokens": 7563588.0,
"residual_var": 0.06345665454864502,
"reward": 0.68359375,
"reward_std": 0.24606089293956757,
"rewards/drgrpo_math_reward/mean": 0.68359375,
"rewards/drgrpo_math_reward/std": 0.4659844934940338,
"rho2": 0.49999991059303284,
"step": 38
},
{
"advantages/mean": 1.1641532182693481e-09,
"advantages/snr": 4.1231406671042415e-09,
"advantages/std": 0.2823462188243866,
"advantages/var": 0.0797193872844284,
"completions/clipped_ratio": -2.171875,
"epoch": 0.22349570200573066,
"grad_norm": 77.56288158012738,
"learning_rate": 1.9854926535657268e-06,
"loss": -1.8093,
"num_tokens": 7755300.0,
"residual_var": 0.05978955700993538,
"reward": 0.59765625,
"reward_std": 0.1521669626235962,
"rewards/drgrpo_math_reward/mean": 0.59765625,
"rewards/drgrpo_math_reward/std": 0.4913311004638672,
"rho2": 0.24999994039535522,
"step": 39
},
{
"advantages/mean": -4.656612873077393e-10,
"advantages/snr": 1.4311629836649927e-09,
"advantages/std": 0.3253726363182068,
"advantages/var": 0.10586735246466006,
"completions/clipped_ratio": -2.671875,
"epoch": 0.22922636103151864,
"grad_norm": 87.24551220149847,
"learning_rate": 1.9847210382156263e-06,
"loss": -1.523,
"num_tokens": 7934446.0,
"residual_var": 0.06616710126399994,
"reward": 0.7109375,
"reward_std": 0.20490340888500214,
"rewards/drgrpo_math_reward/mean": 0.7109375,
"rewards/drgrpo_math_reward/std": 0.45421501994132996,
"rho2": 0.3749999403953552,
"step": 40
},
{
"advantages/mean": -1.862645149230957e-09,
"advantages/snr": 5.9243035139399444e-09,
"advantages/std": 0.31440743803977966,
"advantages/var": 0.09885203709473789,
"completions/clipped_ratio": -2.328125,
"epoch": 0.2349570200573066,
"grad_norm": 55.094477304288226,
"learning_rate": 1.9839295885986295e-06,
"loss": -0.5665,
"num_tokens": 8125659.0,
"residual_var": 0.05560428649187088,
"reward": 0.71484375,
"reward_std": 0.1922660768032074,
"rewards/drgrpo_math_reward/mean": 0.71484375,
"rewards/drgrpo_math_reward/std": 0.4523732364177704,
"rho2": 0.4374999403953552,
"step": 41
},
{
"advantages/mean": 1.1641532182693481e-10,
"advantages/snr": 3.417019487087845e-10,
"advantages/std": 0.34069257974624634,
"advantages/var": 0.11607143389415242,
"completions/clipped_ratio": -2.21875,
"epoch": 0.24068767908309455,
"grad_norm": 82.4610385566399,
"learning_rate": 1.983118320656126e-06,
"loss": -2.4495,
"num_tokens": 8321968.0,
"residual_var": 0.06529020518064499,
"reward": 0.65625,
"reward_std": 0.22225631773471832,
"rewards/drgrpo_math_reward/mean": 0.65625,
"rewards/drgrpo_math_reward/std": 0.47588926553726196,
"rho2": 0.43749991059303284,
"step": 42
},
{
"advantages/mean": 0.0,
"advantages/snr": 0.0,
"advantages/std": 0.3154200315475464,
"advantages/var": 0.09948979630145516,
"completions/clipped_ratio": -2.4375,
"epoch": 0.24641833810888253,
"grad_norm": 74.27035735922438,
"learning_rate": 1.9822872507286887e-06,
"loss": -3.659,
"num_tokens": 8503101.0,
"residual_var": 0.05907208472490311,
"reward": 0.78125,
"reward_std": 0.1927964836359024,
"rewards/drgrpo_math_reward/mean": 0.78125,
"rewards/drgrpo_math_reward/std": 0.41420844197273254,
"rho2": 0.4062499403953552,
"step": 43
},
{
"advantages/mean": -4.656612873077393e-10,
"advantages/snr": 1.5639963792566626e-09,
"advantages/std": 0.29773807525634766,
"advantages/var": 0.08864796145735454,
"completions/clipped_ratio": -2.546875,
"epoch": 0.2521489971346705,
"grad_norm": 124.26993483415545,
"learning_rate": 1.98143639555574e-06,
"loss": -4.6213,
"num_tokens": 8678007.0,
"residual_var": 0.055404990911483765,
"reward": 0.76171875,
"reward_std": 0.17491313815116882,
"rewards/drgrpo_math_reward/mean": 0.76171875,
"rewards/drgrpo_math_reward/std": 0.4268665909767151,
"rho2": 0.3749999403953552,
"step": 44
},
{
"advantages/mean": 3.4924596548080444e-10,
"advantages/snr": 1.2129215590738813e-09,
"advantages/std": 0.28793779015541077,
"advantages/var": 0.08290817099958137,
"completions/clipped_ratio": -2.421875,
"epoch": 0.25787965616045844,
"grad_norm": 54.50555477439546,
"learning_rate": 1.98056577227522e-06,
"loss": -2.9954,
"num_tokens": 8867225.0,
"residual_var": 0.05440850183367729,
"reward": 0.6796875,
"reward_std": 0.1621546745300293,
"rewards/drgrpo_math_reward/mean": 0.6796875,
"rewards/drgrpo_math_reward/std": 0.4675106406211853,
"rho2": 0.34374991059303284,
"step": 45
},
{
"advantages/mean": 1.6298145055770874e-09,
"advantages/snr": 5.47398732739832e-09,
"advantages/std": 0.29773807525634766,
"advantages/var": 0.08864796145735454,
"completions/clipped_ratio": -2.390625,
"epoch": 0.2636103151862464,
"grad_norm": 50.70189605191493,
"learning_rate": 1.9796753984232355e-06,
"loss": -3.1074,
"num_tokens": 9076637.0,
"residual_var": 0.06371574103832245,
"reward": 0.66015625,
"reward_std": 0.16834282875061035,
"rewards/drgrpo_math_reward/mean": 0.66015625,
"rewards/drgrpo_math_reward/std": 0.47458380460739136,
"rho2": 0.2812499403953552,
"step": 46
},
{
"advantages/mean": 2.3283064365386963e-10,
"advantages/snr": 9.509314693234178e-10,
"advantages/std": 0.24484480917453766,
"advantages/var": 0.05994898057971576,
"completions/clipped_ratio": -2.671875,
"epoch": 0.2693409742120344,
"grad_norm": 43.609323731133514,
"learning_rate": 1.9787652919337115e-06,
"loss": -1.4678,
"num_tokens": 9242656.0,
"residual_var": 0.04683515429496765,
"reward": 0.859375,
"reward_std": 0.11283563077449799,
"rewards/drgrpo_math_reward/mean": 0.859375,
"rewards/drgrpo_math_reward/std": 0.3483152687549591,
"rho2": 0.21874994039535522,
"step": 47
},
{
"advantages/mean": -1.3969838619232178e-09,
"advantages/snr": 4.5166742596055914e-09,
"advantages/std": 0.30929479002952576,
"advantages/var": 0.09566326713940843,
"completions/clipped_ratio": -2.421875,
"epoch": 0.27507163323782235,
"grad_norm": 138.18146615794635,
"learning_rate": 1.977835471138027e-06,
"loss": -1.2657,
"num_tokens": 9419647.0,
"residual_var": 0.05381060019135475,
"reward": 0.78125,
"reward_std": 0.19530275464057922,
"rewards/drgrpo_math_reward/mean": 0.78125,
"rewards/drgrpo_math_reward/std": 0.41420844197273254,
"rho2": 0.4374999403953552,
"step": 48
},
{
"advantages/mean": -2.3283064365386963e-10,
"advantages/snr": 8.830797509141478e-10,
"advantages/std": 0.2636575400829315,
"advantages/var": 0.06951529844258264,
"completions/clipped_ratio": -2.40625,
"epoch": 0.2808022922636103,
"grad_norm": 37.49999329733934,
"learning_rate": 1.9768859547646473e-06,
"loss": -0.3458,
"num_tokens": 9592645.0,
"residual_var": 0.04779179021716118,
"reward": 0.79296875,
"reward_std": 0.14256632328033447,
"rewards/drgrpo_math_reward/mean": 0.79296875,
"rewards/drgrpo_math_reward/std": 0.40597182512283325,
"rho2": 0.3124999403953552,
"step": 49
},
{
"advantages/mean": -2.7939677238464355e-09,
"advantages/snr": 8.510420944439486e-09,
"advantages/std": 0.3282995820045471,
"advantages/var": 0.10778061554436036,
"completions/clipped_ratio": -2.53125,
"epoch": 0.28653295128939826,
"grad_norm": 58.64737177653492,
"learning_rate": 1.9759167619387473e-06,
"loss": -2.9043,
"num_tokens": 9765343.0,
"residual_var": 0.06399475783109665,
"reward": 0.72265625,
"reward_std": 0.20779283344745636,
"rewards/drgrpo_math_reward/mean": 0.72265625,
"rewards/drgrpo_math_reward/std": 0.4485645890235901,
"rho2": 0.4062499403953552,
"step": 50
},
{
"advantages/mean": 8.149072527885437e-10,
"advantages/snr": 2.2320711875154694e-09,
"advantages/std": 0.36509016156196594,
"advantages/var": 0.1332908260693424,
"completions/clipped_ratio": -2.453125,
"epoch": 0.2922636103151863,
"grad_norm": 89.34276653392745,
"learning_rate": 1.9749279121818236e-06,
"loss": -1.9981,
"num_tokens": 9950969.0,
"residual_var": 0.05831475183367729,
"reward": 0.67578125,
"reward_std": 0.26645296812057495,
"rewards/drgrpo_math_reward/mean": 0.67578125,
"rewards/drgrpo_math_reward/std": 0.46899911761283875,
"rho2": 0.5624998807907104,
"step": 51
},
{
"advantages/mean": -1.1641532182693481e-10,
"advantages/snr": 3.9972132486104327e-10,
"advantages/std": 0.29124119877815247,
"advantages/var": 0.08482143586573532,
"completions/clipped_ratio": -2.6875,
"epoch": 0.2979942693409742,
"grad_norm": 54.58559855508494,
"learning_rate": 1.973919425411304e-06,
"loss": -1.13,
"num_tokens": 10121127.0,
"residual_var": 0.06361608952283859,
"reward": 0.77734375,
"reward_std": 0.15729182958602905,
"rewards/drgrpo_math_reward/mean": 0.77734375,
"rewards/drgrpo_math_reward/std": 0.41684433817863464,
"rho2": 0.24999994039535522,
"step": 52
},
{
"advantages/mean": 4.656612873077393e-10,
"advantages/snr": 1.8081168879392927e-09,
"advantages/std": 0.2575393617153168,
"advantages/var": 0.06632652283273277,
"completions/clipped_ratio": -2.703125,
"epoch": 0.3037249283667622,
"grad_norm": 39.35070508430935,
"learning_rate": 1.9728913219401447e-06,
"loss": -1.8596,
"num_tokens": 10285728.0,
"residual_var": 0.045599501579999924,
"reward": 0.78125,
"reward_std": 0.13914892077445984,
"rewards/drgrpo_math_reward/mean": 0.78125,
"rewards/drgrpo_math_reward/std": 0.41420844197273254,
"rho2": 0.3124999403953552,
"step": 53
},
{
"advantages/mean": 0.0,
"advantages/snr": 0.0,
"advantages/std": 0.253797709941864,
"advantages/var": 0.06441327757173454,
"completions/clipped_ratio": -2.59375,
"epoch": 0.30945558739255014,
"grad_norm": 136.99193873954198,
"learning_rate": 1.971843622476423e-06,
"loss": -1.4447,
"num_tokens": 10451827.0,
"residual_var": 0.04830996319651604,
"reward": 0.83203125,
"reward_std": 0.1238841786980629,
"rewards/drgrpo_math_reward/mean": 0.83203125,
"rewards/drgrpo_math_reward/std": 0.3745708465576172,
"rho2": 0.24999994039535522,
"step": 54
},
{
"advantages/mean": 1.280568540096283e-09,
"advantages/snr": 3.811440982290487e-09,
"advantages/std": 0.3359801471233368,
"advantages/var": 0.11288265926101904,
"completions/clipped_ratio": -2.46875,
"epoch": 0.3151862464183381,
"grad_norm": 87.22776016485538,
"learning_rate": 1.970776348122918e-06,
"loss": -2.9245,
"num_tokens": 10635142.0,
"residual_var": 0.07407926768064499,
"reward": 0.69140625,
"reward_std": 0.1993604302406311,
"rewards/drgrpo_math_reward/mean": 0.69140625,
"rewards/drgrpo_math_reward/std": 0.46281787753105164,
"rho2": 0.34374991059303284,
"step": 55
},
{
"advantages/mean": 6.984919309616089e-10,
"advantages/snr": 2.3376010956411484e-09,
"advantages/std": 0.29880714416503906,
"advantages/var": 0.08928570940406644,
"completions/clipped_ratio": -2.65625,
"epoch": 0.3209169054441261,
"grad_norm": 64.42194097152215,
"learning_rate": 1.9696895203766866e-06,
"loss": -2.6551,
"num_tokens": 10816137.0,
"residual_var": 0.055803585797548294,
"reward": 0.765625,
"reward_std": 0.18781886994838715,
"rewards/drgrpo_math_reward/mean": 0.765625,
"rewards/drgrpo_math_reward/std": 0.42443734407424927,
"rho2": 0.3749999403953552,
"step": 56
},
{
"advantages/mean": 1.1641532182693481e-10,
"advantages/snr": 3.136579162365215e-10,
"advantages/std": 0.3711537718772888,
"advantages/var": 0.13775512237873855,
"completions/clipped_ratio": -2.703125,
"epoch": 0.32664756446991405,
"grad_norm": 70.92817738441774,
"learning_rate": 1.968583161128631e-06,
"loss": -1.7118,
"num_tokens": 10987350.0,
"residual_var": 0.06457272171974182,
"reward": 0.6328125,
"reward_std": 0.25103604793548584,
"rewards/drgrpo_math_reward/mean": 0.6328125,
"rewards/drgrpo_math_reward/std": 0.48298248648643494,
"rho2": 0.5312498807907104,
"step": 57
},
{
"advantages/mean": -1.1641532182693481e-09,
"advantages/snr": 5.3587966720360335e-09,
"advantages/std": 0.21724152565002441,
"advantages/var": 0.047193880466750215,
"completions/clipped_ratio": -2.6875,
"epoch": 0.332378223495702,
"grad_norm": 47.37141263170206,
"learning_rate": 1.9674572926630564e-06,
"loss": -0.5569,
"num_tokens": 11138208.0,
"residual_var": 0.03539542108774185,
"reward": 0.859375,
"reward_std": 0.10034801065921783,
"rewards/drgrpo_math_reward/mean": 0.859375,
"rewards/drgrpo_math_reward/std": 0.3483152687549591,
"rho2": 0.24999994039535522,
"step": 58
},
{
"advantages/mean": 2.3283064365386963e-10,
"advantages/snr": 1.0059432951138956e-09,
"advantages/std": 0.2314550280570984,
"advantages/var": 0.0535714300129122,
"completions/clipped_ratio": -2.703125,
"epoch": 0.33810888252148996,
"grad_norm": 38.885603895596766,
"learning_rate": 1.966311937657224e-06,
"loss": -1.2637,
"num_tokens": 11306846.0,
"residual_var": 0.043526798486709595,
"reward": 0.8828125,
"reward_std": 0.10007961094379425,
"rewards/drgrpo_math_reward/mean": 0.8828125,
"rewards/drgrpo_math_reward/std": 0.3222736418247223,
"rho2": 0.1874999701976776,
"step": 59
},
{
"advantages/mean": -4.656612873077393e-10,
"advantages/snr": 1.2488633010539937e-09,
"advantages/std": 0.3728680908679962,
"advantages/var": 0.13903061318754428,
"completions/clipped_ratio": -2.546875,
"epoch": 0.3438395415472779,
"grad_norm": 156.58855155725036,
"learning_rate": 1.9651471191808923e-06,
"loss": -4.2495,
"num_tokens": 11494798.0,
"residual_var": 0.06082591786980629,
"reward": 0.703125,
"reward_std": 0.2780294418334961,
"rewards/drgrpo_math_reward/mean": 0.703125,
"rewards/drgrpo_math_reward/std": 0.45777595043182373,
"rho2": 0.5624998807907104,
"step": 60
},
{
"advantages/mean": -1.6298145055770874e-09,
"advantages/snr": 4.7448812031710745e-09,
"advantages/std": 0.34348899126052856,
"advantages/var": 0.11798468711717547,
"completions/clipped_ratio": -2.4375,
"epoch": 0.3495702005730659,
"grad_norm": 63.90428061896672,
"learning_rate": 1.9639628606958534e-06,
"loss": -3.3587,
"num_tokens": 11692303.0,
"residual_var": 0.07374045252799988,
"reward": 0.60546875,
"reward_std": 0.2168606072664261,
"rewards/drgrpo_math_reward/mean": 0.60546875,
"rewards/drgrpo_math_reward/std": 0.48970720171928406,
"rho2": 0.37499988079071045,
"step": 61
},
{
"advantages/mean": -1.5133991837501526e-09,
"advantages/snr": 4.959639056521822e-09,
"advantages/std": 0.30514299869537354,
"advantages/var": 0.09311224965280473,
"completions/clipped_ratio": -2.34375,
"epoch": 0.3553008595988539,
"grad_norm": 87.18799338980655,
"learning_rate": 1.962759186055461e-06,
"loss": -1.0981,
"num_tokens": 11866779.0,
"residual_var": 0.06110493093729019,
"reward": 0.6953125,
"reward_std": 0.17885848879814148,
"rewards/drgrpo_math_reward/mean": 0.6953125,
"rewards/drgrpo_math_reward/std": 0.4611765742301941,
"rho2": 0.34374991059303284,
"step": 62
},
{
"advantages/mean": 1.862645149230957e-09,
"advantages/snr": 6.104171146488397e-09,
"advantages/std": 0.30514299869537354,
"advantages/var": 0.09311224965280473,
"completions/clipped_ratio": -2.5,
"epoch": 0.36103151862464183,
"grad_norm": 76.80663904599737,
"learning_rate": 1.961536119504149e-06,
"loss": -4.3722,
"num_tokens": 12044597.0,
"residual_var": 0.04655614122748375,
"reward": 0.765625,
"reward_std": 0.19898608326911926,
"rewards/drgrpo_math_reward/mean": 0.765625,
"rewards/drgrpo_math_reward/std": 0.42443734407424927,
"rho2": 0.49999991059303284,
"step": 63
},
{
"advantages/mean": -6.984919309616089e-10,
"advantages/snr": 2.6864667746454993e-09,
"advantages/std": 0.260003924369812,
"advantages/var": 0.06760204068770292,
"completions/clipped_ratio": -2.40625,
"epoch": 0.3667621776504298,
"grad_norm": 50.8446775500655,
"learning_rate": 1.960293685676943e-06,
"loss": -2.5164,
"num_tokens": 12235035.0,
"residual_var": 0.05070154368877411,
"reward": 0.75,
"reward_std": 0.12730157375335693,
"rewards/drgrpo_math_reward/mean": 0.75,
"rewards/drgrpo_math_reward/std": 0.4338609278202057,
"rho2": 0.24999995529651642,
"step": 64
},
{
"advantages/mean": -1.7462298274040222e-09,
"advantages/snr": 5.907641413650253e-09,
"advantages/std": 0.29558831453323364,
"advantages/var": 0.08737245168859786,
"completions/clipped_ratio": -2.15625,
"epoch": 0.37249283667621774,
"grad_norm": 55.661046948184854,
"learning_rate": 1.9590319095989656e-06,
"loss": -2.5047,
"num_tokens": 12433204.0,
"residual_var": 0.05187740921974182,
"reward": 0.68359375,
"reward_std": 0.1743851751089096,
"rewards/drgrpo_math_reward/mean": 0.68359375,
"rewards/drgrpo_math_reward/std": 0.4659844934940338,
"rho2": 0.4062499403953552,
"step": 65
},
{
"advantages/mean": 2.0954757928848267e-09,
"advantages/snr": 7.875790433379715e-09,
"advantages/std": 0.26606544852256775,
"advantages/var": 0.07079082289751515,
"completions/clipped_ratio": -2.796875,
"epoch": 0.37822349570200575,
"grad_norm": 66.03759288988809,
"learning_rate": 1.9577508166849303e-06,
"loss": -2.0897,
"num_tokens": 12591123.0,
"residual_var": 0.04645648971199989,
"reward": 0.83203125,
"reward_std": 0.1437433809041977,
"rewards/drgrpo_math_reward/mean": 0.83203125,
"rewards/drgrpo_math_reward/std": 0.3745708465576172,
"rho2": 0.34374991059303284,
"step": 66
},
{
"advantages/mean": 5.820766091346741e-10,
"advantages/snr": 1.7998288379265802e-09,
"advantages/std": 0.3234066367149353,
"advantages/var": 0.10459185267126614,
"completions/clipped_ratio": -2.4375,
"epoch": 0.3839541547277937,
"grad_norm": 62.754101502938425,
"learning_rate": 1.9564504327386314e-06,
"loss": -1.9607,
"num_tokens": 12770302.0,
"residual_var": 0.062101420015096664,
"reward": 0.6953125,
"reward_std": 0.1914672553539276,
"rewards/drgrpo_math_reward/mean": 0.6953125,
"rewards/drgrpo_math_reward/std": 0.4611765742301941,
"rho2": 0.40624991059303284,
"step": 67
},
{
"advantages/mean": 2.3283064365386963e-10,
"advantages/snr": 7.905760832968887e-10,
"advantages/std": 0.29450756311416626,
"advantages/var": 0.08673470473144462,
"completions/clipped_ratio": -2.5625,
"epoch": 0.38968481375358166,
"grad_norm": 60.793677536940486,
"learning_rate": 1.955130783952423e-06,
"loss": -0.3413,
"num_tokens": 12959733.0,
"residual_var": 0.05420919880270958,
"reward": 0.7421875,
"reward_std": 0.1732056736946106,
"rewards/drgrpo_math_reward/mean": 0.7421875,
"rewards/drgrpo_math_reward/std": 0.4382871091365814,
"rho2": 0.3749999403953552,
"step": 68
},
{
"advantages/mean": -6.984919309616089e-10,
"advantages/snr": 2.6252634777932388e-09,
"advantages/std": 0.26606544852256775,
"advantages/var": 0.07079082289751515,
"completions/clipped_ratio": -2.65625,
"epoch": 0.3954154727793696,
"grad_norm": 48.873217225599404,
"learning_rate": 1.953791896906692e-06,
"loss": -1.5571,
"num_tokens": 13113127.0,
"residual_var": 0.05309312045574188,
"reward": 0.83984375,
"reward_std": 0.13717305660247803,
"rewards/drgrpo_math_reward/mean": 0.83984375,
"rewards/drgrpo_math_reward/std": 0.36746934056282043,
"rho2": 0.2499999701976776,
"step": 69
},
{
"advantages/mean": 0.0,
"advantages/snr": 0.0,
"advantages/std": 0.23007319867610931,
"advantages/var": 0.05293367674905647,
"completions/clipped_ratio": -2.78125,
"epoch": 0.40114613180515757,
"grad_norm": 49.36381459017057,
"learning_rate": 1.9524337985693227e-06,
"loss": -1.7489,
"num_tokens": 13276750.0,
"residual_var": 0.036391910165548325,
"reward": 0.80859375,
"reward_std": 0.11902770400047302,
"rewards/drgrpo_math_reward/mean": 0.80859375,
"rewards/drgrpo_math_reward/std": 0.39417871832847595,
"rho2": 0.3124999403953552,
"step": 70
},
{
"advantages/mean": 6.984919309616089e-10,
"advantages/snr": 2.899437187779043e-09,
"advantages/std": 0.2409060299396515,
"advantages/var": 0.05803571526128426,
"completions/clipped_ratio": -2.625,
"epoch": 0.4068767908309456,
"grad_norm": 36.88997932012926,
"learning_rate": 1.9510565162951534e-06,
"loss": -1.3753,
"num_tokens": 13442695.0,
"residual_var": 0.041713181883096695,
"reward": 0.83984375,
"reward_std": 0.11823134124279022,
"rewards/drgrpo_math_reward/mean": 0.83984375,
"rewards/drgrpo_math_reward/std": 0.36746934056282043,
"rho2": 0.2812499403953552,
"step": 71
},
{
"advantages/mean": -2.444721758365631e-09,
"advantages/snr": 8.331736741469324e-09,
"advantages/std": 0.2934228181838989,
"advantages/var": 0.0860969502309814,
"completions/clipped_ratio": -2.78125,
"epoch": 0.41260744985673353,
"grad_norm": 69.03881568139195,
"learning_rate": 1.949660077825426e-06,
"loss": 0.1049,
"num_tokens": 13623554.0,
"residual_var": 0.06457272171974182,
"reward": 0.85546875,
"reward_std": 0.15900176763534546,
"rewards/drgrpo_math_reward/mean": 0.85546875,
"rewards/drgrpo_math_reward/std": 0.35231640934944153,
"rho2": 0.24999995529651642,
"step": 72
},
{
"advantages/mean": -6.984919309616089e-10,
"advantages/snr": 2.2074180229024673e-09,
"advantages/std": 0.31642937660217285,
"advantages/var": 0.10012755037683974,
"completions/clipped_ratio": -2.71875,
"epoch": 0.4183381088825215,
"grad_norm": 75.3655355836356,
"learning_rate": 1.948244511287226e-06,
"loss": -0.4727,
"num_tokens": 13799558.0,
"residual_var": 0.0688377097249031,
"reward": 0.75390625,
"reward_std": 0.18622371554374695,
"rewards/drgrpo_math_reward/mean": 0.75390625,
"rewards/drgrpo_math_reward/std": 0.43157756328582764,
"rho2": 0.3124999403953552,
"step": 73
},
{
"advantages/mean": -9.313225746154785e-10,
"advantages/snr": 2.8623259673299855e-09,
"advantages/std": 0.3253726363182068,
"advantages/var": 0.10586735246466006,
"completions/clipped_ratio": -2.53125,
"epoch": 0.42406876790830944,
"grad_norm": 50.55349729785912,
"learning_rate": 1.946809845192918e-06,
"loss": -0.6276,
"num_tokens": 13979108.0,
"residual_var": 0.052933696657419205,
"reward": 0.7578125,
"reward_std": 0.21910977363586426,
"rewards/drgrpo_math_reward/mean": 0.7578125,
"rewards/drgrpo_math_reward/std": 0.4292463958263397,
"rho2": 0.49999991059303284,
"step": 74
},
{
"advantages/mean": 1.979060471057892e-09,
"advantages/snr": 7.124253269284449e-09,
"advantages/std": 0.27779197692871094,
"advantages/var": 0.07716838244596147,
"completions/clipped_ratio": -2.8125,
"epoch": 0.4297994269340974,
"grad_norm": 149.96300323315998,
"learning_rate": 1.9453561084395687e-06,
"loss": -3.9114,
"num_tokens": 14147162.0,
"residual_var": 0.04823024198412895,
"reward": 0.75390625,
"reward_std": 0.1629534810781479,
"rewards/drgrpo_math_reward/mean": 0.75390625,
"rewards/drgrpo_math_reward/std": 0.43157756328582764,
"rho2": 0.37499988079071045,
"step": 75
},
{
"advantages/mean": 0.0,
"advantages/snr": 0.0,
"advantages/std": 0.31743553280830383,
"advantages/var": 0.10076531748929174,
"completions/clipped_ratio": -2.453125,
"epoch": 0.4355300859598854,
"grad_norm": 63.727237228202135,
"learning_rate": 1.9438833303083674e-06,
"loss": -3.3417,
"num_tokens": 14317167.0,
"residual_var": 0.056680500507354736,
"reward": 0.65625,
"reward_std": 0.20688170194625854,
"rewards/drgrpo_math_reward/mean": 0.65625,
"rewards/drgrpo_math_reward/std": 0.47588926553726196,
"rho2": 0.4374999403953552,
"step": 76
},
{
"advantages/mean": -2.3283064365386963e-09,
"advantages/snr": 7.709835570839547e-09,
"advantages/std": 0.30199170112609863,
"advantages/var": 0.09119898754903488,
"completions/clipped_ratio": -2.578125,
"epoch": 0.44126074498567336,
"grad_norm": 56.35814893945447,
"learning_rate": 1.9423915404640348e-06,
"loss": -0.9106,
"num_tokens": 14492451.0,
"residual_var": 0.05984935164451599,
"reward": 0.66796875,
"reward_std": 0.1706969439983368,
"rewards/drgrpo_math_reward/mean": 0.66796875,
"rewards/drgrpo_math_reward/std": 0.4718646705150604,
"rho2": 0.34374988079071045,
"step": 77
},
{
"advantages/mean": 1.280568540096283e-09,
"advantages/snr": 5.1486095650116935e-09,
"advantages/std": 0.24872122704982758,
"advantages/var": 0.06186224878517188,
"completions/clipped_ratio": -2.640625,
"epoch": 0.4469914040114613,
"grad_norm": 36.08436726472045,
"learning_rate": 1.9408807689542254e-06,
"loss": -1.3103,
"num_tokens": 14657029.0,
"residual_var": 0.044463496655225754,
"reward": 0.69140625,
"reward_std": 0.12164628505706787,
"rewards/drgrpo_math_reward/mean": 0.69140625,
"rewards/drgrpo_math_reward/std": 0.46281787753105164,
"rho2": 0.2812499403953552,
"step": 78
},
{
"advantages/mean": 2.3283064365386963e-10,
"advantages/snr": 8.416325633651013e-10,
"advantages/std": 0.2766416668891907,
"advantages/var": 0.07653061185922994,
"completions/clipped_ratio": -2.78125,
"epoch": 0.45272206303724927,
"grad_norm": 58.90239262695802,
"learning_rate": 1.9393510462089237e-06,
"loss": -1.5106,
"num_tokens": 14822142.0,
"residual_var": 0.05739796906709671,
"reward": 0.8359375,
"reward_std": 0.14874956011772156,
"rewards/drgrpo_math_reward/mean": 0.8359375,
"rewards/drgrpo_math_reward/std": 0.3710577189922333,
"rho2": 0.24999994039535522,
"step": 79
},
{
"advantages/mean": 0.0,
"advantages/snr": 0.0,
"advantages/std": 0.3061862289905548,
"advantages/var": 0.09375000682345647,
"completions/clipped_ratio": -2.515625,
"epoch": 0.4584527220630373,
"grad_norm": 52.44279292628102,
"learning_rate": 1.937802403039829e-06,
"loss": -0.0152,
"num_tokens": 15004930.0,
"residual_var": 0.05859377235174179,
"reward": 0.67578125,
"reward_std": 0.18003800511360168,
"rewards/drgrpo_math_reward/mean": 0.67578125,
"rewards/drgrpo_math_reward/std": 0.46899911761283875,
"rho2": 0.37499991059303284,
"step": 80
},
{
"advantages/mean": 5.820766091346741e-10,
"advantages/snr": 1.7085097435439224e-09,
"advantages/std": 0.34069257974624634,
"advantages/var": 0.11607143389415242,
"completions/clipped_ratio": -2.5625,
"epoch": 0.46418338108882523,
"grad_norm": 69.59869023850752,
"learning_rate": 1.936234870639737e-06,
"loss": -2.8108,
"num_tokens": 15191177.0,
"residual_var": 0.07254466414451599,
"reward": 0.6640625,
"reward_std": 0.22160722315311432,
"rewards/drgrpo_math_reward/mean": 0.6640625,
"rewards/drgrpo_math_reward/std": 0.4732423722743988,
"rho2": 0.3749999403953552,
"step": 81
},
{
"advantages/mean": 1.6298145055770874e-09,
"advantages/snr": 5.575178862375007e-09,
"advantages/std": 0.29233402013778687,
"advantages/var": 0.08545917932991998,
"completions/clipped_ratio": -2.65625,
"epoch": 0.4699140401146132,
"grad_norm": 59.051506077119946,
"learning_rate": 1.934648480581911e-06,
"loss": -1.8163,
"num_tokens": 15362117.0,
"residual_var": 0.04807080700993538,
"reward": 0.734375,
"reward_std": 0.17859891057014465,
"rewards/drgrpo_math_reward/mean": 0.734375,
"rewards/drgrpo_math_reward/std": 0.4425306022167206,
"rho2": 0.43749991059303284,
"step": 82
},
{
"advantages/mean": 2.3283064365386963e-10,
"advantages/snr": 8.830796510959388e-10,
"advantages/std": 0.2636575698852539,
"advantages/var": 0.06951531415779755,
"completions/clipped_ratio": -2.359375,
"epoch": 0.47564469914040114,
"grad_norm": 39.306186581687186,
"learning_rate": 1.933043264819444e-06,
"loss": -0.6807,
"num_tokens": 15550143.0,
"residual_var": 0.05213649198412895,
"reward": 0.75390625,
"reward_std": 0.12900903820991516,
"rewards/drgrpo_math_reward/mean": 0.75390625,
"rewards/drgrpo_math_reward/std": 0.43157756328582764,
"rho2": 0.24999994039535522,
"step": 83
},
{
"advantages/mean": -6.984919309616089e-10,
"advantages/snr": 2.0968132639044306e-09,
"advantages/std": 0.3331207036972046,
"advantages/var": 0.11096940323172078,
"completions/clipped_ratio": -2.578125,
"epoch": 0.4813753581661891,
"grad_norm": 71.96477285108665,
"learning_rate": 1.931419255684618e-06,
"loss": -0.9075,
"num_tokens": 15727115.0,
"residual_var": 0.062420301139354706,
"reward": 0.7421875,
"reward_std": 0.21713145077228546,
"rewards/drgrpo_math_reward/mean": 0.7421875,
"rewards/drgrpo_math_reward/std": 0.4382871091365814,
"rho2": 0.43749991059303284,
"step": 84
},
{
"advantages/mean": 9.313225746154785e-10,
"advantages/snr": 3.06259193017924e-09,
"advantages/std": 0.30409619212150574,
"advantages/var": 0.09247449406279973,
"completions/clipped_ratio": -2.75,
"epoch": 0.4871060171919771,
"grad_norm": 47.61110860557143,
"learning_rate": 1.929776485888251e-06,
"loss": -1.8949,
"num_tokens": 15886432.0,
"residual_var": 0.057796575129032135,
"reward": 0.76953125,
"reward_std": 0.17832808196544647,
"rewards/drgrpo_math_reward/mean": 0.76953125,
"rewards/drgrpo_math_reward/std": 0.4219578504562378,
"rho2": 0.37499991059303284,
"step": 85
},
{
"advantages/mean": 2.3283064365386963e-09,
"advantages/snr": 7.819981896283313e-09,
"advantages/std": 0.29773807525634766,
"advantages/var": 0.08864796145735454,
"completions/clipped_ratio": -2.640625,
"epoch": 0.49283667621776506,
"grad_norm": 48.589294451635624,
"learning_rate": 1.928114988519039e-06,
"loss": -1.5481,
"num_tokens": 16054808.0,
"residual_var": 0.0692562386393547,
"reward": 0.80859375,
"reward_std": 0.1612396389245987,
"rewards/drgrpo_math_reward/mean": 0.80859375,
"rewards/drgrpo_math_reward/std": 0.39417871832847595,
"rho2": 0.21874994039535522,
"step": 86
},
{
"advantages/mean": 1.1641532182693481e-09,
"advantages/snr": 3.72681136151475e-09,
"advantages/std": 0.31237244606018066,
"advantages/var": 0.09757654505762048,
"completions/clipped_ratio": -2.5,
"epoch": 0.498567335243553,
"grad_norm": 77.61472809779868,
"learning_rate": 1.926434797042887e-06,
"loss": -1.1408,
"num_tokens": 16222572.0,
"residual_var": 0.06403461843729019,
"reward": 0.76953125,
"reward_std": 0.17753173410892487,
"rewards/drgrpo_math_reward/mean": 0.76953125,
"rewards/drgrpo_math_reward/std": 0.4219578504562378,
"rho2": 0.34374991059303284,
"step": 87
},
{
"advantages/mean": 2.3283064365386963e-10,
"advantages/snr": 6.706288730674227e-10,
"advantages/std": 0.3471825420856476,
"advantages/var": 0.12053571752905246,
"completions/clipped_ratio": -2.5625,
"epoch": 0.504297994269341,
"grad_norm": 58.93647688904723,
"learning_rate": 1.9247359453022406e-06,
"loss": -0.6762,
"num_tokens": 16402078.0,
"residual_var": 0.06780136376619339,
"reward": 0.76171875,
"reward_std": 0.22738362848758698,
"rewards/drgrpo_math_reward/mean": 0.76171875,
"rewards/drgrpo_math_reward/std": 0.4268665909767151,
"rho2": 0.4374999403953552,
"step": 88
},
{
"advantages/mean": -1.6298145055770874e-09,
"advantages/snr": 5.575178862375007e-09,
"advantages/std": 0.29233402013778687,
"advantages/var": 0.08545917932991998,
"completions/clipped_ratio": -2.65625,
"epoch": 0.5100286532951289,
"grad_norm": 60.11029385266424,
"learning_rate": 1.9230184675153973e-06,
"loss": -1.8015,
"num_tokens": 16564467.0,
"residual_var": 0.05341200530529022,
"reward": 0.84375,
"reward_std": 0.1714957356452942,
"rewards/drgrpo_math_reward/mean": 0.84375,
"rewards/drgrpo_math_reward/std": 0.3638034462928772,
"rho2": 0.37499991059303284,
"step": 89
},
{
"advantages/mean": 1.1641532182693481e-09,
"advantages/snr": 4.7295666564139676e-09,
"advantages/std": 0.24614372849464417,
"advantages/var": 0.0605867350772451,
"completions/clipped_ratio": -2.6875,
"epoch": 0.5157593123209169,
"grad_norm": 32.80821455872672,
"learning_rate": 1.9212823982758223e-06,
"loss": -0.388,
"num_tokens": 16735491.0,
"residual_var": 0.049226731061935425,
"reward": 0.74609375,
"reward_std": 0.11336605250835419,
"rewards/drgrpo_math_reward/mean": 0.74609375,
"rewards/drgrpo_math_reward/std": 0.4360972046852112,
"rho2": 0.1874999701976776,
"step": 90
},
{
"advantages/mean": 2.0954757928848267e-09,
"advantages/snr": 7.115184749671998e-09,
"advantages/std": 0.29450756311416626,
"advantages/var": 0.08673470473144462,
"completions/clipped_ratio": -2.515625,
"epoch": 0.5214899713467048,
"grad_norm": 126.43847336761601,
"learning_rate": 1.9195277725514506e-06,
"loss": -1.8816,
"num_tokens": 16921769.0,
"residual_var": 0.05149874463677406,
"reward": 0.6796875,
"reward_std": 0.17912688851356506,
"rewards/drgrpo_math_reward/mean": 0.6796875,
"rewards/drgrpo_math_reward/std": 0.4675106406211853,
"rho2": 0.40624988079071045,
"step": 91
},
{
"advantages/mean": 0.0,
"advantages/snr": 0.0,
"advantages/std": 0.2562982141971588,
"advantages/var": 0.0656887746006527,
"completions/clipped_ratio": -2.625,
"epoch": 0.5272206303724928,
"grad_norm": 44.145512065978785,
"learning_rate": 1.917754625683981e-06,
"loss": -1.4745,
"num_tokens": 17083984.0,
"residual_var": 0.043108273297548294,
"reward": 0.80078125,
"reward_std": 0.13861849904060364,
"rewards/drgrpo_math_reward/mean": 0.80078125,
"rewards/drgrpo_math_reward/std": 0.40019527077674866,
"rho2": 0.3437499403953552,
"step": 92
},
{
"advantages/mean": 4.656612873077393e-10,
"advantages/snr": 1.7194685024217942e-09,
"advantages/std": 0.27081698179244995,
"advantages/var": 0.07334183762717217,
"completions/clipped_ratio": -2.65625,
"epoch": 0.5329512893982808,
"grad_norm": 38.3146731634114,
"learning_rate": 1.9159629933881667e-06,
"loss": -0.3711,
"num_tokens": 17245004.0,
"residual_var": 0.05042252317070961,
"reward": 0.73828125,
"reward_std": 0.15243536233901978,
"rewards/drgrpo_math_reward/mean": 0.73828125,
"rewards/drgrpo_math_reward/std": 0.4404313564300537,
"rho2": 0.3124999403953552,
"step": 93
},
{
"advantages/mean": -2.3283064365386963e-10,
"advantages/snr": 7.709835570839548e-10,
"advantages/std": 0.30199170112609863,
"advantages/var": 0.09119898754903488,
"completions/clipped_ratio": -2.609375,
"epoch": 0.5386819484240688,
"grad_norm": 51.034802200564776,
"learning_rate": 1.914152911751093e-06,
"loss": -2.5489,
"num_tokens": 17418162.0,
"residual_var": 0.05984934791922569,
"reward": 0.80859375,
"reward_std": 0.17715103924274445,
"rewards/drgrpo_math_reward/mean": 0.80859375,
"rewards/drgrpo_math_reward/std": 0.39417871832847595,
"rho2": 0.3437499403953552,
"step": 94
},
{
"advantages/mean": 3.4924596548080444e-10,
"advantages/snr": 1.318584786741314e-09,
"advantages/std": 0.2648642361164093,
"advantages/var": 0.07015306357352902,
"completions/clipped_ratio": -2.328125,
"epoch": 0.5444126074498568,
"grad_norm": 39.90106380343523,
"learning_rate": 1.912324417231454e-06,
"loss": -1.5377,
"num_tokens": 17609294.0,
"residual_var": 0.05042252317070961,
"reward": 0.625,
"reward_std": 0.14309674501419067,
"rewards/drgrpo_math_reward/mean": 0.625,
"rewards/drgrpo_math_reward/std": 0.4850712716579437,
"rho2": 0.2812499403953552,
"step": 95
},
{
"advantages/mean": 2.0954757928848267e-09,
"advantages/snr": 8.513219981545141e-09,
"advantages/std": 0.24614372849464417,
"advantages/var": 0.0605867350772451,
"completions/clipped_ratio": -2.9375,
"epoch": 0.5501432664756447,
"grad_norm": 39.1391593416744,
"learning_rate": 1.9104775466588157e-06,
"loss": -1.0478,
"num_tokens": 17758949.0,
"residual_var": 0.04354672506451607,
"reward": 0.87109375,
"reward_std": 0.11993636190891266,
"rewards/drgrpo_math_reward/mean": 0.87109375,
"rewards/drgrpo_math_reward/std": 0.33575257658958435,
"rho2": 0.2812499403953552,
"step": 96
},
{
"advantages/mean": 2.3283064365386963e-10,
"advantages/snr": 7.155814918324964e-10,
"advantages/std": 0.3253726363182068,
"advantages/var": 0.10586735246466006,
"completions/clipped_ratio": -2.75,
"epoch": 0.5558739255014327,
"grad_norm": 52.65706152505506,
"learning_rate": 1.9086123372328743e-06,
"loss": -1.9894,
"num_tokens": 17931749.0,
"residual_var": 0.059550393372774124,
"reward": 0.7890625,
"reward_std": 0.1990984082221985,
"rewards/drgrpo_math_reward/mean": 0.7890625,
"rewards/drgrpo_math_reward/std": 0.4087733030319214,
"rho2": 0.4374999403953552,
"step": 97
},
{
"advantages/mean": -9.313225746154785e-10,
"advantages/snr": 3.352589773780917e-09,
"advantages/std": 0.27779197692871094,
"advantages/var": 0.07716838244596147,
"completions/clipped_ratio": -2.53125,
"epoch": 0.5616045845272206,
"grad_norm": 42.36508110920937,
"learning_rate": 1.906728826522708e-06,
"loss": -0.8938,
"num_tokens": 18100959.0,
"residual_var": 0.050641756504774094,
"reward": 0.69140625,
"reward_std": 0.15057817101478577,
"rewards/drgrpo_math_reward/mean": 0.69140625,
"rewards/drgrpo_math_reward/std": 0.46281787753105164,
"rho2": 0.3437499403953552,
"step": 98
},
{
"advantages/mean": -1.0477378964424133e-09,
"advantages/snr": 3.902891199784966e-09,
"advantages/std": 0.2684517204761505,
"advantages/var": 0.07206632622660525,
"completions/clipped_ratio": -2.765625,
"epoch": 0.5673352435530086,
"grad_norm": 49.233447686472736,
"learning_rate": 1.9048270524660196e-06,
"loss": -0.2706,
"num_tokens": 18256522.0,
"residual_var": 0.05404975265264511,
"reward": 0.86328125,
"reward_std": 0.13888299465179443,
"rewards/drgrpo_math_reward/mean": 0.86328125,
"rewards/drgrpo_math_reward/std": 0.34422317147254944,
"rho2": 0.2499999701976776,
"step": 99
},
{
"advantages/mean": 2.3283064365386963e-09,
"advantages/snr": 8.086143727159208e-09,
"advantages/std": 0.28793779015541077,
"advantages/var": 0.08290817099958137,
"completions/clipped_ratio": -2.75,
"epoch": 0.5730659025787965,
"grad_norm": 45.12521046013055,
"learning_rate": 1.9029070533683722e-06,
"loss": -1.2174,
"num_tokens": 18421106.0,
"residual_var": 0.04922673851251602,
"reward": 0.78125,
"reward_std": 0.16925784945487976,
"rewards/drgrpo_math_reward/mean": 0.78125,
"rewards/drgrpo_math_reward/std": 0.41420844197273254,
"rho2": 0.40624991059303284,
"step": 100
},
{
"advantages/mean": -9.313225746154785e-10,
"advantages/snr": 3.783653096074668e-09,
"advantages/std": 0.24614374339580536,
"advantages/var": 0.060586742412900074,
"completions/clipped_ratio": -2.5,
"epoch": 0.5787965616045845,
"grad_norm": 38.66496875469516,
"learning_rate": 1.9009688679024189e-06,
"loss": -0.7118,
"num_tokens": 18589613.0,
"residual_var": 0.04544006660580635,
"reward": 0.69140625,
"reward_std": 0.12046922743320465,
"rewards/drgrpo_math_reward/mean": 0.69140625,
"rewards/drgrpo_math_reward/std": 0.46281787753105164,
"rho2": 0.24999995529651642,
"step": 101
},
{
"advantages/mean": -1.1641532182693481e-09,
"advantages/snr": 4.5421821470515416e-09,
"advantages/std": 0.2562982141971588,
"advantages/var": 0.0656887746006527,
"completions/clipped_ratio": -2.859375,
"epoch": 0.5845272206303725,
"grad_norm": 60.63497224407915,
"learning_rate": 1.8990125351071223e-06,
"loss": -1.0793,
"num_tokens": 18733932.0,
"residual_var": 0.049266595393419266,
"reward": 0.90234375,
"reward_std": 0.13204818964004517,
"rewards/drgrpo_math_reward/mean": 0.90234375,
"rewards/drgrpo_math_reward/std": 0.29743078351020813,
"rho2": 0.24999995529651642,
"step": 102
},
{
"advantages/mean": 0.0,
"advantages/snr": 0.0,
"advantages/std": 0.24222607910633087,
"advantages/var": 0.05867347339922646,
"completions/clipped_ratio": -2.421875,
"epoch": 0.5902578796561605,
"grad_norm": 98.73285501942651,
"learning_rate": 1.8970380943869686e-06,
"loss": -1.4782,
"num_tokens": 18904374.0,
"residual_var": 0.04217156767845154,
"reward": 0.6953125,
"reward_std": 0.11822889000177383,
"rewards/drgrpo_math_reward/mean": 0.6953125,
"rewards/drgrpo_math_reward/std": 0.4611765742301941,
"rho2": 0.2812499403953552,
"step": 103
},
{
"advantages/mean": -1.3969838619232178e-09,
"advantages/snr": 4.255210472219743e-09,
"advantages/std": 0.3282995820045471,
"advantages/var": 0.10778061554436036,
"completions/clipped_ratio": -2.546875,
"epoch": 0.5959885386819485,
"grad_norm": 42.68933358778393,
"learning_rate": 1.8950455855111742e-06,
"loss": -1.2247,
"num_tokens": 19072246.0,
"residual_var": 0.07073105126619339,
"reward": 0.64453125,
"reward_std": 0.20714375376701355,
"rewards/drgrpo_math_reward/mean": 0.64453125,
"rewards/drgrpo_math_reward/std": 0.4795927405357361,
"rho2": 0.3437499403953552,
"step": 104
},
{
"advantages/mean": -2.3283064365386963e-10,
"advantages/snr": 7.630213187892461e-10,
"advantages/std": 0.3051430284976959,
"advantages/var": 0.09311226784074567,
"completions/clipped_ratio": -2.75,
"epoch": 0.6017191977077364,
"grad_norm": 41.03832126155318,
"learning_rate": 1.8930350486128855e-06,
"loss": -0.5834,
"num_tokens": 19240780.0,
"residual_var": 0.058195166289806366,
"reward": 0.75,
"reward_std": 0.18649454414844513,
"rewards/drgrpo_math_reward/mean": 0.75,
"rewards/drgrpo_math_reward/std": 0.4338609278202057,
"rho2": 0.3749999403953552,
"step": 105
},
{
"advantages/mean": -8.149072527885437e-10,
"advantages/snr": 3.802900033317195e-09,
"advantages/std": 0.2142857313156128,
"advantages/var": 0.045918374645467,
"completions/clipped_ratio": -2.890625,
"epoch": 0.6074498567335244,
"grad_norm": 36.27068845842204,
"learning_rate": 1.8910065241883678e-06,
"loss": -1.19,
"num_tokens": 19390230.0,
"residual_var": 0.035873737186193466,
"reward": 0.8984375,
"reward_std": 0.09271685779094696,
"rewards/drgrpo_math_reward/mean": 0.8984375,
"rewards/drgrpo_math_reward/std": 0.3026638329029083,
"rho2": 0.2187499701976776,
"step": 106
},
{
"advantages/mean": -1.6298145055770874e-09,
"advantages/snr": 6.840946068269286e-09,
"advantages/std": 0.23824401199817657,
"advantages/var": 0.056760209252987304,
"completions/clipped_ratio": -2.90625,
"epoch": 0.6131805157593123,
"grad_norm": 34.31275723668857,
"learning_rate": 1.8889600530961932e-06,
"loss": -0.404,
"num_tokens": 19542572.0,
"residual_var": 0.04079640656709671,
"reward": 0.74609375,
"reward_std": 0.11652141809463501,
"rewards/drgrpo_math_reward/mean": 0.74609375,
"rewards/drgrpo_math_reward/std": 0.4360972046852112,
"rho2": 0.2812499403953552,
"step": 107
},
{
"advantages/mean": -9.313225746154785e-10,
"advantages/snr": 2.627483538992809e-09,
"advantages/std": 0.3544541895389557,
"advantages/var": 0.12563777248171792,
"completions/clipped_ratio": -2.625,
"epoch": 0.6189111747851003,
"grad_norm": 62.78402318125484,
"learning_rate": 1.8868956765564148e-06,
"loss": -1.6898,
"num_tokens": 19717110.0,
"residual_var": 0.07852361351251602,
"reward": 0.69140625,
"reward_std": 0.23831351101398468,
"rewards/drgrpo_math_reward/mean": 0.69140625,
"rewards/drgrpo_math_reward/std": 0.46281787753105164,
"rho2": 0.3749999403953552,
"step": 108
},
{
"advantages/mean": -2.3283064365386963e-10,
"advantages/snr": 7.764322892357765e-10,
"advantages/std": 0.29987242817878723,
"advantages/var": 0.0899234731818419,
"completions/clipped_ratio": -2.640625,
"epoch": 0.6246418338108882,
"grad_norm": 47.45129141101553,
"learning_rate": 1.8848134361497382e-06,
"loss": -0.4754,
"num_tokens": 19873742.0,
"residual_var": 0.061822403222322464,
"reward": 0.81640625,
"reward_std": 0.17597398161888123,
"rewards/drgrpo_math_reward/mean": 0.81640625,
"rewards/drgrpo_math_reward/std": 0.387910932302475,
"rho2": 0.3124999403953552,
"step": 109
},
{
"advantages/mean": 1.862645149230957e-09,
"advantages/snr": 6.395541197776692e-09,
"advantages/std": 0.29124119877815247,
"advantages/var": 0.08482143586573532,
"completions/clipped_ratio": -2.796875,
"epoch": 0.6303724928366762,
"grad_norm": 45.06513373893126,
"learning_rate": 1.882713373816683e-06,
"loss": -0.4579,
"num_tokens": 20030959.0,
"residual_var": 0.055664073675870895,
"reward": 0.82421875,
"reward_std": 0.17031623423099518,
"rewards/drgrpo_math_reward/mean": 0.82421875,
"rewards/drgrpo_math_reward/std": 0.3813795745372772,
"rho2": 0.34374991059303284,
"step": 110
},
{
"advantages/mean": -9.313225746154785e-10,
"advantages/snr": 3.06259193017924e-09,
"advantages/std": 0.30409619212150574,
"advantages/var": 0.09247449406279973,
"completions/clipped_ratio": -2.859375,
"epoch": 0.6361031518624641,
"grad_norm": 46.493358792765605,
"learning_rate": 1.8805955318567379e-06,
"loss": -2.3481,
"num_tokens": 20194007.0,
"residual_var": 0.05201692134141922,
"reward": 0.86328125,
"reward_std": 0.18543127179145813,
"rewards/drgrpo_math_reward/mean": 0.86328125,
"rewards/drgrpo_math_reward/std": 0.34422317147254944,
"rho2": 0.43749991059303284,
"step": 111
},
{
"advantages/mean": 1.1641532182693481e-09,
"advantages/snr": 6.160119582101702e-09,
"advantages/std": 0.18898223340511322,
"advantages/var": 0.03571428454278469,
"completions/clipped_ratio": -2.953125,
"epoch": 0.6418338108882522,
"grad_norm": 32.10619667409936,
"learning_rate": 1.8784599529275099e-06,
"loss": -0.5848,
"num_tokens": 20331794.0,
"residual_var": 0.030133940279483795,
"reward": 0.9140625,
"reward_std": 0.07536393404006958,
"rewards/drgrpo_math_reward/mean": 0.9140625,
"rewards/drgrpo_math_reward/std": 0.28082075715065,
"rho2": 0.1562499701976776,
"step": 112
},
{
"advantages/mean": 0.0,
"advantages/snr": 0.0,
"advantages/std": 0.23555190861225128,
"advantages/var": 0.05548470165087438,
"completions/clipped_ratio": -2.78125,
"epoch": 0.6475644699140402,
"grad_norm": 39.33915422125142,
"learning_rate": 1.8763066800438634e-06,
"loss": -0.9856,
"num_tokens": 20487738.0,
"residual_var": 0.04161353409290314,
"reward": 0.83984375,
"reward_std": 0.11534436047077179,
"rewards/drgrpo_math_reward/mean": 0.83984375,
"rewards/drgrpo_math_reward/std": 0.36746934056282043,
"rho2": 0.24999995529651642,
"step": 113
},
{
"advantages/mean": 0.0,
"advantages/snr": 0.0,
"advantages/std": 0.2624453604221344,
"advantages/var": 0.06887756720710403,
"completions/clipped_ratio": -2.5625,
"epoch": 0.6532951289398281,
"grad_norm": 40.44236478817312,
"learning_rate": 1.874135756577056e-06,
"loss": -1.4464,
"num_tokens": 20659930.0,
"residual_var": 0.055963024497032166,
"reward": 0.7578125,
"reward_std": 0.12190830707550049,
"rewards/drgrpo_math_reward/mean": 0.7578125,
"rewards/drgrpo_math_reward/std": 0.4292463958263397,
"rho2": 0.18749995529651642,
"step": 114
},
{
"advantages/mean": -4.656612873077393e-10,
"advantages/snr": 1.7581130489884187e-09,
"advantages/std": 0.2648642361164093,
"advantages/var": 0.07015306357352902,
"completions/clipped_ratio": -2.875,
"epoch": 0.6590257879656161,
"grad_norm": 45.149902363692064,
"learning_rate": 1.8719472262538622e-06,
"loss": -3.3805,
"num_tokens": 20803640.0,
"residual_var": 0.043845679610967636,
"reward": 0.8671875,
"reward_std": 0.14966705441474915,
"rewards/drgrpo_math_reward/mean": 0.8671875,
"rewards/drgrpo_math_reward/std": 0.3400367796421051,
"rho2": 0.37499991059303284,
"step": 115
},
{
"advantages/mean": 9.313225746154785e-10,
"advantages/snr": 3.1507420872801353e-09,
"advantages/std": 0.29558831453323364,
"advantages/var": 0.08737245168859786,
"completions/clipped_ratio": -2.78125,
"epoch": 0.664756446991404,
"grad_norm": 56.38374910767192,
"learning_rate": 1.8697411331556953e-06,
"loss": -2.1825,
"num_tokens": 20956942.0,
"residual_var": 0.06279896944761276,
"reward": 0.79296875,
"reward_std": 0.16663289070129395,
"rewards/drgrpo_math_reward/mean": 0.79296875,
"rewards/drgrpo_math_reward/std": 0.40597182512283325,
"rho2": 0.2812499403953552,
"step": 116
},
{
"advantages/mean": 2.561137080192566e-09,
"advantages/snr": 8.633090704662692e-09,
"advantages/std": 0.29666513204574585,
"advantages/var": 0.08801020057171982,
"completions/clipped_ratio": -2.78125,
"epoch": 0.670487106017192,
"grad_norm": 272.8870600231476,
"learning_rate": 1.8675175217177175e-06,
"loss": -1.3051,
"num_tokens": 21116292.0,
"residual_var": 0.052256081253290176,
"reward": 0.7578125,
"reward_std": 0.17438271641731262,
"rewards/drgrpo_math_reward/mean": 0.7578125,
"rewards/drgrpo_math_reward/std": 0.4292463958263397,
"rho2": 0.40624991059303284,
"step": 117
},
{
"advantages/mean": 2.3283064365386963e-09,
"advantages/snr": 8.055221268003356e-09,
"advantages/std": 0.289043128490448,
"advantages/var": 0.08354593012754563,
"completions/clipped_ratio": -2.71875,
"epoch": 0.6762177650429799,
"grad_norm": 49.4537830676488,
"learning_rate": 1.8652764367279459e-06,
"loss": -1.0499,
"num_tokens": 21280178.0,
"residual_var": 0.05743783712387085,
"reward": 0.79296875,
"reward_std": 0.1626850813627243,
"rewards/drgrpo_math_reward/mean": 0.79296875,
"rewards/drgrpo_math_reward/std": 0.40597182512283325,
"rho2": 0.3124999403953552,
"step": 118
},
{
"advantages/mean": 1.7462298274040222e-09,
"advantages/snr": 7.0208312250159456e-09,
"advantages/std": 0.24872122704982758,
"advantages/var": 0.06186224878517188,
"completions/clipped_ratio": -2.71875,
"epoch": 0.6819484240687679,
"grad_norm": 37.031634565478385,
"learning_rate": 1.86301792332635e-06,
"loss": -0.4774,
"num_tokens": 21426617.0,
"residual_var": 0.04446350410580635,
"reward": 0.84765625,
"reward_std": 0.12810038030147552,
"rewards/drgrpo_math_reward/mean": 0.84765625,
"rewards/drgrpo_math_reward/std": 0.3600577116012573,
"rho2": 0.2812499403953552,
"step": 119
},
{
"advantages/mean": -2.3283064365386963e-10,
"advantages/snr": 1.0865429422188867e-09,
"advantages/std": 0.2142857164144516,
"advantages/var": 0.04591836825925477,
"completions/clipped_ratio": -2.84375,
"epoch": 0.6876790830945558,
"grad_norm": 57.8559807139232,
"learning_rate": 1.8607420270039435e-06,
"loss": -0.8314,
"num_tokens": 21575376.0,
"residual_var": 0.034438785165548325,
"reward": 0.875,
"reward_std": 0.09863808751106262,
"rewards/drgrpo_math_reward/mean": 0.875,
"rewards/drgrpo_math_reward/std": 0.33136674761772156,
"rho2": 0.24999994039535522,
"step": 120
},
{
"advantages/mean": -4.656612873077393e-10,
"advantages/snr": 1.4577503818942396e-09,
"advantages/std": 0.31943827867507935,
"advantages/var": 0.10204081388289765,
"completions/clipped_ratio": -2.578125,
"epoch": 0.6934097421203438,
"grad_norm": 47.86157687219279,
"learning_rate": 1.858448793601866e-06,
"loss": -0.0483,
"num_tokens": 21740794.0,
"residual_var": 0.06377552449703217,
"reward": 0.7265625,
"reward_std": 0.18858027458190918,
"rewards/drgrpo_math_reward/mean": 0.7265625,
"rewards/drgrpo_math_reward/std": 0.446596622467041,
"rho2": 0.3749999403953552,
"step": 121
},
{
"advantages/mean": 1.5133991837501526e-09,
"advantages/snr": 5.0290080186388524e-09,
"advantages/std": 0.30093392729759216,
"advantages/var": 0.09056122859875249,
"completions/clipped_ratio": -2.5625,
"epoch": 0.6991404011461319,
"grad_norm": 72.79748126385374,
"learning_rate": 1.8561382693104614e-06,
"loss": -1.7789,
"num_tokens": 21917447.0,
"residual_var": 0.06509089469909668,
"reward": 0.71875,
"reward_std": 0.17005029320716858,
"rewards/drgrpo_math_reward/mean": 0.71875,
"rewards/drgrpo_math_reward/std": 0.45048993825912476,
"rho2": 0.2812499403953552,
"step": 122
},
{
"advantages/mean": -5.820766091346741e-10,
"advantages/snr": 2.3165174831568894e-09,
"advantages/std": 0.2512722611427307,
"advantages/var": 0.06313774921978066,
"completions/clipped_ratio": -2.75,
"epoch": 0.7048710601719198,
"grad_norm": 44.88746539099347,
"learning_rate": 1.853810500668347e-06,
"loss": -0.7919,
"num_tokens": 22094461.0,
"residual_var": 0.04538027569651604,
"reward": 0.78515625,
"reward_std": 0.12981030344963074,
"rewards/drgrpo_math_reward/mean": 0.78515625,
"rewards/drgrpo_math_reward/std": 0.4115184545516968,
"rho2": 0.2812499403953552,
"step": 123
},
{
"advantages/mean": 3.4924596548080444e-10,
"advantages/snr": 1.2896013768163457e-09,
"advantages/std": 0.27081698179244995,
"advantages/var": 0.07334183762717217,
"completions/clipped_ratio": -2.53125,
"epoch": 0.7106017191977078,
"grad_norm": 48.80423262194112,
"learning_rate": 1.8514655345614762e-06,
"loss": -0.8237,
"num_tokens": 22268217.0,
"residual_var": 0.05271446332335472,
"reward": 0.75390625,
"reward_std": 0.14006003737449646,
"rewards/drgrpo_math_reward/mean": 0.75390625,
"rewards/drgrpo_math_reward/std": 0.43157756328582764,
"rho2": 0.2812499403953552,
"step": 124
},
{
"advantages/mean": 2.3283064365386963e-10,
"advantages/snr": 9.040583393524847e-10,
"advantages/std": 0.25753939151763916,
"advantages/var": 0.06632653818327583,
"completions/clipped_ratio": -2.8125,
"epoch": 0.7163323782234957,
"grad_norm": 46.02844560487019,
"learning_rate": 1.8491034182221936e-06,
"loss": -0.9501,
"num_tokens": 22430925.0,
"residual_var": 0.04767220839858055,
"reward": 0.7890625,
"reward_std": 0.12677361071109772,
"rewards/drgrpo_math_reward/mean": 0.7890625,
"rewards/drgrpo_math_reward/std": 0.4087733030319214,
"rho2": 0.2812499403953552,
"step": 125
},
{
"advantages/mean": 1.3969838619232178e-09,
"advantages/snr": 5.203854933046621e-09,
"advantages/std": 0.2684517204761505,
"advantages/var": 0.07206632622660525,
"completions/clipped_ratio": -2.84375,
"epoch": 0.7220630372492837,
"grad_norm": 60.67743324938953,
"learning_rate": 1.8467241992282841e-06,
"loss": -1.0718,
"num_tokens": 22583031.0,
"residual_var": 0.045041464269161224,
"reward": 0.87109375,
"reward_std": 0.15782861411571503,
"rewards/drgrpo_math_reward/mean": 0.87109375,
"rewards/drgrpo_math_reward/std": 0.33575257658958435,
"rho2": 0.3749999403953552,
"step": 126
},
{
"advantages/mean": -8.149072527885437e-10,
"advantages/snr": 2.3227503455075525e-09,
"advantages/std": 0.35083720088005066,
"advantages/var": 0.12308674152134902,
"completions/clipped_ratio": -2.65625,
"epoch": 0.7277936962750716,
"grad_norm": 56.280257142687866,
"learning_rate": 1.844327925502015e-06,
"loss": -1.0202,
"num_tokens": 22755738.0,
"residual_var": 0.06154339015483856,
"reward": 0.75390625,
"reward_std": 0.22973774373531342,
"rewards/drgrpo_math_reward/mean": 0.75390625,
"rewards/drgrpo_math_reward/std": 0.43157756328582764,
"rho2": 0.49999988079071045,
"step": 127
},
{
"advantages/mean": -1.1641532182693481e-10,
"advantages/snr": 4.395282127917263e-10,
"advantages/std": 0.2648642659187317,
"advantages/var": 0.07015307936066861,
"completions/clipped_ratio": -2.671875,
"epoch": 0.7335243553008596,
"grad_norm": 54.82075946899719,
"learning_rate": 1.8419146453091702e-06,
"loss": -1.0377,
"num_tokens": 22910639.0,
"residual_var": 0.052614808082580566,
"reward": 0.765625,
"reward_std": 0.13072142004966736,
"rewards/drgrpo_math_reward/mean": 0.765625,
"rewards/drgrpo_math_reward/std": 0.42443734407424927,
"rho2": 0.2499999701976776,
"step": 128
},
{
"advantages/mean": 1.280568540096283e-09,
"advantages/snr": 5.741541728836773e-09,
"advantages/std": 0.22303564846515656,
"advantages/var": 0.04974490048627289,
"completions/clipped_ratio": -2.890625,
"epoch": 0.7392550143266475,
"grad_norm": 38.05363017417547,
"learning_rate": 1.8394844072580772e-06,
"loss": -0.7448,
"num_tokens": 23065211.0,
"residual_var": 0.03886321187019348,
"reward": 0.8828125,
"reward_std": 0.10258589684963226,
"rewards/drgrpo_math_reward/mean": 0.8828125,
"rewards/drgrpo_math_reward/std": 0.3222736418247223,
"rho2": 0.2187499701976776,
"step": 129
},
{
"advantages/mean": -4.656612873077393e-10,
"advantages/snr": 1.774317220522933e-09,
"advantages/std": 0.262445330619812,
"advantages/var": 0.06887755156414244,
"completions/clipped_ratio": -2.921875,
"epoch": 0.7449856733524355,
"grad_norm": 51.16139356760249,
"learning_rate": 1.8370372602986302e-06,
"loss": -1.3307,
"num_tokens": 23214742.0,
"residual_var": 0.049505751579999924,
"reward": 0.7890625,
"reward_std": 0.1349327266216278,
"rewards/drgrpo_math_reward/mean": 0.7890625,
"rewards/drgrpo_math_reward/std": 0.4087733030319214,
"rho2": 0.2812499403953552,
"step": 130
},
{
"advantages/mean": 1.1641532182693481e-09,
"advantages/snr": 4.656612686812885e-09,
"advantages/std": 0.25,
"advantages/var": 0.0625,
"completions/clipped_ratio": -2.703125,
"epoch": 0.7507163323782235,
"grad_norm": 93.65015154291969,
"learning_rate": 1.8345732537213026e-06,
"loss": -0.0264,
"num_tokens": 23380824.0,
"residual_var": 0.042968764901161194,
"reward": 0.75,
"reward_std": 0.12927988171577454,
"rewards/drgrpo_math_reward/mean": 0.75,
"rewards/drgrpo_math_reward/std": 0.4338609278202057,
"rho2": 0.3124999403953552,
"step": 131
},
{
"advantages/mean": 2.444721758365631e-09,
"advantages/snr": 7.424683779959137e-09,
"advantages/std": 0.3292694687843323,
"advantages/var": 0.10841838307351637,
"completions/clipped_ratio": -2.828125,
"epoch": 0.7564469914040115,
"grad_norm": 61.13746296372967,
"learning_rate": 1.832092437156154e-06,
"loss": -0.4033,
"num_tokens": 23542656.0,
"residual_var": 0.0643734335899353,
"reward": 0.8515625,
"reward_std": 0.20133627951145172,
"rewards/drgrpo_math_reward/mean": 0.8515625,
"rewards/drgrpo_math_reward/std": 0.3562295734882355,
"rho2": 0.40624988079071045,
"step": 132
},
{
"advantages/mean": -3.4924596548080444e-10,
"advantages/snr": 1.152452789710391e-09,
"advantages/std": 0.30304577946662903,
"advantages/var": 0.09183674445253676,
"completions/clipped_ratio": -2.78125,
"epoch": 0.7621776504297995,
"grad_norm": 82.25821857438845,
"learning_rate": 1.8295948605718311e-06,
"loss": -0.9542,
"num_tokens": 23699606.0,
"residual_var": 0.05739797279238701,
"reward": 0.8515625,
"reward_std": 0.1718764305114746,
"rewards/drgrpo_math_reward/mean": 0.8515625,
"rewards/drgrpo_math_reward/std": 0.3562295734882355,
"rho2": 0.3749999403953552,
"step": 133
},
{
"advantages/mean": -4.656612873077393e-10,
"advantages/snr": 1.872221660004252e-09,
"advantages/std": 0.24872122704982758,
"advantages/var": 0.06186224878517188,
"completions/clipped_ratio": -2.65625,
"epoch": 0.7679083094555874,
"grad_norm": 61.86934388754127,
"learning_rate": 1.8270805742745616e-06,
"loss": -0.4728,
"num_tokens": 23857410.0,
"residual_var": 0.04446350410580635,
"reward": 0.80859375,
"reward_std": 0.12164628505706787,
"rewards/drgrpo_math_reward/mean": 0.80859375,
"rewards/drgrpo_math_reward/std": 0.39417871832847595,
"rho2": 0.2812499403953552,
"step": 134
},
{
"advantages/mean": -4.656612873077393e-10,
"advantages/snr": 1.6426984546755946e-09,
"advantages/std": 0.2834733724594116,
"advantages/var": 0.08035715289351231,
"completions/clipped_ratio": -2.859375,
"epoch": 0.7736389684813754,
"grad_norm": 71.10537320866999,
"learning_rate": 1.8245496289071406e-06,
"loss": -0.9753,
"num_tokens": 24019095.0,
"residual_var": 0.057756710797548294,
"reward": 0.8828125,
"reward_std": 0.1533464789390564,
"rewards/drgrpo_math_reward/mean": 0.8828125,
"rewards/drgrpo_math_reward/std": 0.3222736418247223,
"rho2": 0.2812499403953552,
"step": 135
},
{
"advantages/mean": 8.149072527885437e-10,
"advantages/snr": 3.0491036849611125e-09,
"advantages/std": 0.26726123690605164,
"advantages/var": 0.07142856875255266,
"completions/clipped_ratio": -2.75,
"epoch": 0.7793696275071633,
"grad_norm": 62.37237735361797,
"learning_rate": 1.82200207544791e-06,
"loss": -0.9455,
"num_tokens": 24175665.0,
"residual_var": 0.049107152968645096,
"reward": 0.828125,
"reward_std": 0.1442737877368927,
"rewards/drgrpo_math_reward/mean": 0.828125,
"rewards/drgrpo_math_reward/std": 0.3780108094215393,
"rho2": 0.3124999403953552,
"step": 136
},
{
"advantages/mean": -2.3283064365386963e-10,
"advantages/snr": 1.011985032910295e-09,
"advantages/std": 0.23007319867610931,
"advantages/var": 0.05293367674905647,
"completions/clipped_ratio": -2.796875,
"epoch": 0.7851002865329513,
"grad_norm": 54.84248499224107,
"learning_rate": 1.8194379652097318e-06,
"loss": -0.6355,
"num_tokens": 24321485.0,
"residual_var": 0.04135444760322571,
"reward": 0.83203125,
"reward_std": 0.1060032919049263,
"rewards/drgrpo_math_reward/mean": 0.83203125,
"rewards/drgrpo_math_reward/std": 0.3745708465576172,
"rho2": 0.21874994039535522,
"step": 137
},
{
"advantages/mean": -6.984919309616089e-10,
"advantages/snr": 2.2004213037461316e-09,
"advantages/std": 0.31743553280830383,
"advantages/var": 0.10076531748929174,
"completions/clipped_ratio": -2.671875,
"epoch": 0.7908309455587392,
"grad_norm": 113.83761802134484,
"learning_rate": 1.8168573498389562e-06,
"loss": -0.4863,
"num_tokens": 24494928.0,
"residual_var": 0.05668049678206444,
"reward": 0.6875,
"reward_std": 0.20042762160301208,
"rewards/drgrpo_math_reward/mean": 0.6875,
"rewards/drgrpo_math_reward/std": 0.4644203782081604,
"rho2": 0.4374999403953552,
"step": 138
},
{
"advantages/mean": 1.1641532182693481e-09,
"advantages/snr": 4.261771990579164e-09,
"advantages/std": 0.27316176891326904,
"advantages/var": 0.0746173519958262,
"completions/clipped_ratio": -2.546875,
"epoch": 0.7965616045845272,
"grad_norm": 58.86024131658044,
"learning_rate": 1.8142602813143784e-06,
"loss": -0.3733,
"num_tokens": 24664246.0,
"residual_var": 0.05596302077174187,
"reward": 0.76171875,
"reward_std": 0.14058800041675568,
"rewards/drgrpo_math_reward/mean": 0.76171875,
"rewards/drgrpo_math_reward/std": 0.4268665909767151,
"rho2": 0.24999995529651642,
"step": 139
},
{
"advantages/mean": 8.149072527885437e-10,
"advantages/snr": 2.151245094721418e-09,
"advantages/std": 0.3788072466850281,
"advantages/var": 0.14349493014109171,
"completions/clipped_ratio": -2.671875,
"epoch": 0.8022922636103151,
"grad_norm": 85.49564799929581,
"learning_rate": 1.8116468119461951e-06,
"loss": -1.2034,
"num_tokens": 24833688.0,
"residual_var": 0.07174746692180634,
"reward": 0.68359375,
"reward_std": 0.2695994973182678,
"rewards/drgrpo_math_reward/mean": 0.68359375,
"rewards/drgrpo_math_reward/std": 0.4659844934940338,
"rho2": 0.49999988079071045,
"step": 140
},
{
"advantages/mean": 3.259629011154175e-09,
"advantages/snr": 1.0756226037296983e-08,
"advantages/std": 0.30304577946662903,
"advantages/var": 0.09183674445253676,
"completions/clipped_ratio": -2.609375,
"epoch": 0.8080229226361032,
"grad_norm": 46.26865155218474,
"learning_rate": 1.8090169943749474e-06,
"loss": -0.5296,
"num_tokens": 24994931.0,
"residual_var": 0.06313777714967728,
"reward": 0.765625,
"reward_std": 0.16477325558662415,
"rewards/drgrpo_math_reward/mean": 0.765625,
"rewards/drgrpo_math_reward/std": 0.42443734407424927,
"rho2": 0.31249991059303284,
"step": 141
},
{
"advantages/mean": 5.820766091346741e-10,
"advantages/snr": 2.5299625822757374e-09,
"advantages/std": 0.23007319867610931,
"advantages/var": 0.05293367674905647,
"completions/clipped_ratio": -2.78125,
"epoch": 0.8137535816618912,
"grad_norm": 58.97870578609364,
"learning_rate": 1.806370881570463e-06,
"loss": 0.0156,
"num_tokens": 25160661.0,
"residual_var": 0.036391910165548325,
"reward": 0.80859375,
"reward_std": 0.11902769654989243,
"rewards/drgrpo_math_reward/mean": 0.80859375,
"rewards/drgrpo_math_reward/std": 0.39417871832847595,
"rho2": 0.3124999403953552,
"step": 142
},
{
"advantages/mean": 4.656612873077393e-10,
"advantages/snr": 1.7825890840358e-09,
"advantages/std": 0.26122748851776123,
"advantages/var": 0.06823980075729708,
"completions/clipped_ratio": -2.8125,
"epoch": 0.8194842406876791,
"grad_norm": 79.08516224088073,
"learning_rate": 1.8037085268307885e-06,
"loss": -0.8249,
"num_tokens": 25315126.0,
"residual_var": 0.04691487178206444,
"reward": 0.89453125,
"reward_std": 0.14085638523101807,
"rewards/drgrpo_math_reward/mean": 0.89453125,
"rewards/drgrpo_math_reward/std": 0.3077581524848938,
"rho2": 0.3124999403953552,
"step": 143
},
{
"advantages/mean": -3.4924596548080444e-10,
"advantages/snr": 1.4418181244196808e-09,
"advantages/std": 0.24222607910633087,
"advantages/var": 0.05867347339922646,
"completions/clipped_ratio": -2.734375,
"epoch": 0.8252148997134671,
"grad_norm": 41.798700207046984,
"learning_rate": 1.8010299837811158e-06,
"loss": -0.783,
"num_tokens": 25478148.0,
"residual_var": 0.04400511458516121,
"reward": 0.71875,
"reward_std": 0.11876175552606583,
"rewards/drgrpo_math_reward/mean": 0.71875,
"rewards/drgrpo_math_reward/std": 0.45048993825912476,
"rho2": 0.2499999701976776,
"step": 144
},
{
"advantages/mean": -2.3283064365386963e-10,
"advantages/snr": 9.509314693234178e-10,
"advantages/std": 0.24484480917453766,
"advantages/var": 0.05994898057971576,
"completions/clipped_ratio": -2.84375,
"epoch": 0.830945558739255,
"grad_norm": 73.56406215605747,
"learning_rate": 1.7983353063727014e-06,
"loss": -0.3336,
"num_tokens": 25630661.0,
"residual_var": 0.04121493920683861,
"reward": 0.765625,
"reward_std": 0.13231413066387177,
"rewards/drgrpo_math_reward/mean": 0.765625,
"rewards/drgrpo_math_reward/std": 0.42443734407424927,
"rho2": 0.3124999403953552,
"step": 145
},
{
"advantages/mean": 4.656612873077393e-10,
"advantages/snr": 1.7994859396734798e-09,
"advantages/std": 0.25877460837364197,
"advantages/var": 0.06696429793893177,
"completions/clipped_ratio": -2.921875,
"epoch": 0.836676217765043,
"grad_norm": 49.186265655217355,
"learning_rate": 1.795624548881781e-06,
"loss": -0.6319,
"num_tokens": 25776933.0,
"residual_var": 0.04813059791922569,
"reward": 0.83203125,
"reward_std": 0.12677115201950073,
"rewards/drgrpo_math_reward/mean": 0.83203125,
"rewards/drgrpo_math_reward/std": 0.3745708465576172,
"rho2": 0.28124991059303284,
"step": 146
},
{
"advantages/mean": -6.984919309616089e-10,
"advantages/snr": 2.022616832722391e-09,
"advantages/std": 0.34534069895744324,
"advantages/var": 0.11926019835641544,
"completions/clipped_ratio": -2.75,
"epoch": 0.8424068767908309,
"grad_norm": 110.13488773783708,
"learning_rate": 1.792897765908475e-06,
"loss": -1.0988,
"num_tokens": 25946196.0,
"residual_var": 0.06708388030529022,
"reward": 0.76953125,
"reward_std": 0.23212778568267822,
"rewards/drgrpo_math_reward/mean": 0.76953125,
"rewards/drgrpo_math_reward/std": 0.4219578504562378,
"rho2": 0.4374999403953552,
"step": 147
},
{
"advantages/mean": -2.561137080192566e-09,
"advantages/snr": 1.0041665435540562e-08,
"advantages/std": 0.25505101680755615,
"advantages/var": 0.06505102117456829,
"completions/clipped_ratio": -2.890625,
"epoch": 0.8481375358166189,
"grad_norm": 42.935601661756145,
"learning_rate": 1.7901550123756903e-06,
"loss": -0.9533,
"num_tokens": 26099008.0,
"residual_var": 0.046755433082580566,
"reward": 0.8203125,
"reward_std": 0.1250636875629425,
"rewards/drgrpo_math_reward/mean": 0.8203125,
"rewards/drgrpo_math_reward/std": 0.38467901945114136,
"rho2": 0.2812499403953552,
"step": 148
},
{
"advantages/mean": 1.3969838619232178e-09,
"advantages/snr": 4.578128359866298e-09,
"advantages/std": 0.30514299869537354,
"advantages/var": 0.09311224965280473,
"completions/clipped_ratio": -2.890625,
"epoch": 0.8538681948424068,
"grad_norm": 79.02183185379518,
"learning_rate": 1.787396343528012e-06,
"loss": -0.5308,
"num_tokens": 26245967.0,
"residual_var": 0.06401468068361282,
"reward": 0.875,
"reward_std": 0.172937273979187,
"rewards/drgrpo_math_reward/mean": 0.875,
"rewards/drgrpo_math_reward/std": 0.33136674761772156,
"rho2": 0.3124999403953552,
"step": 149
},
{
"advantages/mean": 1.6298145055770874e-09,
"advantages/snr": 7.71368947201975e-09,
"advantages/std": 0.21128857135772705,
"advantages/var": 0.044642860386389316,
"completions/clipped_ratio": -2.828125,
"epoch": 0.8595988538681948,
"grad_norm": 33.21391170215605,
"learning_rate": 1.7846218149305919e-06,
"loss": -0.7593,
"num_tokens": 26394941.0,
"residual_var": 0.034877244383096695,
"reward": 0.8203125,
"reward_std": 0.09100693464279175,
"rewards/drgrpo_math_reward/mean": 0.8203125,
"rewards/drgrpo_math_reward/std": 0.38467901945114136,
"rho2": 0.21874995529651642,
"step": 150
},
{
"advantages/mean": 0.0,
"advantages/snr": 0.0,
"advantages/std": 0.2187044471502304,
"advantages/var": 0.047831635203287926,
"completions/clipped_ratio": -2.8125,
"epoch": 0.8653295128939829,
"grad_norm": 52.69766460617733,
"learning_rate": 1.7818314824680298e-06,
"loss": -1.2605,
"num_tokens": 26563921.0,
"residual_var": 0.037368472665548325,
"reward": 0.77734375,
"reward_std": 0.10087841749191284,
"rewards/drgrpo_math_reward/mean": 0.77734375,
"rewards/drgrpo_math_reward/std": 0.41684433817863464,
"rho2": 0.2187499701976776,
"step": 151
},
{
"advantages/mean": -3.4924596548080444e-10,
"advantages/snr": 1.1688005478205742e-09,
"advantages/std": 0.29880714416503906,
"advantages/var": 0.08928570940406644,
"completions/clipped_ratio": -2.84375,
"epoch": 0.8710601719197708,
"grad_norm": 86.09764168588923,
"learning_rate": 1.7790254023432464e-06,
"loss": 0.0834,
"num_tokens": 26726648.0,
"residual_var": 0.05859377235174179,
"reward": 0.7734375,
"reward_std": 0.16898946464061737,
"rewards/drgrpo_math_reward/mean": 0.7734375,
"rewards/drgrpo_math_reward/std": 0.41942715644836426,
"rho2": 0.34374991059303284,
"step": 152
},
{
"advantages/mean": 6.984919309616089e-10,
"advantages/snr": 2.6864667746454993e-09,
"advantages/std": 0.260003924369812,
"advantages/var": 0.06760204068770292,
"completions/clipped_ratio": -2.765625,
"epoch": 0.8767908309455588,
"grad_norm": 36.9976857893275,
"learning_rate": 1.776203631076353e-06,
"loss": -0.3414,
"num_tokens": 26875356.0,
"residual_var": 0.044363852590322495,
"reward": 0.8203125,
"reward_std": 0.1467800736427307,
"rewards/drgrpo_math_reward/mean": 0.8203125,
"rewards/drgrpo_math_reward/std": 0.38467901945114136,
"rho2": 0.3437499403953552,
"step": 153
},
{
"advantages/mean": 2.3283064365386963e-10,
"advantages/snr": 8.086143727159208e-10,
"advantages/std": 0.28793779015541077,
"advantages/var": 0.08290817099958137,
"completions/clipped_ratio": -2.78125,
"epoch": 0.8825214899713467,
"grad_norm": 53.3898001272296,
"learning_rate": 1.7733662255035111e-06,
"loss": -0.6895,
"num_tokens": 27042351.0,
"residual_var": 0.04922674223780632,
"reward": 0.734375,
"reward_std": 0.16925786435604095,
"rewards/drgrpo_math_reward/mean": 0.734375,
"rewards/drgrpo_math_reward/std": 0.4425306022167206,
"rho2": 0.40624991059303284,
"step": 154
},
{
"advantages/mean": -1.1641532182693481e-10,
"advantages/snr": 4.68055415001063e-10,
"advantages/std": 0.24872122704982758,
"advantages/var": 0.06186224878517188,
"completions/clipped_ratio": -2.90625,
"epoch": 0.8882521489971347,
"grad_norm": 33.30659799512382,
"learning_rate": 1.7705132427757892e-06,
"loss": -1.0276,
"num_tokens": 27187580.0,
"residual_var": 0.04446350410580635,
"reward": 0.82421875,
"reward_std": 0.12164628505706787,
"rewards/drgrpo_math_reward/mean": 0.82421875,
"rewards/drgrpo_math_reward/std": 0.3813795745372772,
"rho2": 0.2812499403953552,
"step": 155
},
{
"advantages/mean": 0.0,
"advantages/snr": 0.0,
"advantages/std": 0.30409619212150574,
"advantages/var": 0.09247449406279973,
"completions/clipped_ratio": -2.90625,
"epoch": 0.8939828080229226,
"grad_norm": 53.760508839254754,
"learning_rate": 1.767644740358011e-06,
"loss": -0.8033,
"num_tokens": 27339935.0,
"residual_var": 0.05201692134141922,
"reward": 0.80859375,
"reward_std": 0.18543127179145813,
"rewards/drgrpo_math_reward/mean": 0.80859375,
"rewards/drgrpo_math_reward/std": 0.39417871832847595,
"rho2": 0.43749988079071045,
"step": 156
},
{
"advantages/mean": -1.1641532182693481e-10,
"advantages/snr": 3.5355637047424465e-10,
"advantages/std": 0.3292694687843323,
"advantages/var": 0.10841838307351637,
"completions/clipped_ratio": -2.78125,
"epoch": 0.8997134670487106,
"grad_norm": 110.32996992700565,
"learning_rate": 1.7647607760275985e-06,
"loss": -0.5867,
"num_tokens": 27499321.0,
"residual_var": 0.0643734335899353,
"reward": 0.6875,
"reward_std": 0.20779038965702057,
"rewards/drgrpo_math_reward/mean": 0.6875,
"rewards/drgrpo_math_reward/std": 0.4644203782081604,
"rho2": 0.40624988079071045,
"step": 157
},
{
"advantages/mean": -1.1641532182693481e-09,
"advantages/snr": 3.6908025963194066e-09,
"advantages/std": 0.3154200613498688,
"advantages/var": 0.09948981510195498,
"completions/clipped_ratio": -2.78125,
"epoch": 0.9054441260744985,
"grad_norm": 51.3684103060375,
"learning_rate": 1.7618614078734067e-06,
"loss": -0.1811,
"num_tokens": 27664718.0,
"residual_var": 0.0652901902794838,
"reward": 0.78125,
"reward_std": 0.18569329380989075,
"rewards/drgrpo_math_reward/mean": 0.78125,
"rewards/drgrpo_math_reward/std": 0.41420844197273254,
"rho2": 0.3437499403953552,
"step": 158
},
{
"advantages/mean": -1.1641532182693481e-09,
"advantages/snr": 4.173527128124609e-09,
"advantages/std": 0.2789374887943268,
"advantages/var": 0.07780612265488518,
"completions/clipped_ratio": -2.765625,
"epoch": 0.9111747851002865,
"grad_norm": 41.1640018865697,
"learning_rate": 1.7589466942945555e-06,
"loss": -0.4552,
"num_tokens": 27819289.0,
"residual_var": 0.051060281693935394,
"reward": 0.7890625,
"reward_std": 0.15702980756759644,
"rewards/drgrpo_math_reward/mean": 0.7890625,
"rewards/drgrpo_math_reward/std": 0.4087733030319214,
"rho2": 0.34374991059303284,
"step": 159
},
{
"advantages/mean": -4.656612873077393e-10,
"advantages/snr": 1.6694108512498434e-09,
"advantages/std": 0.2789374887943268,
"advantages/var": 0.07780612265488518,
"completions/clipped_ratio": -2.84375,
"epoch": 0.9169054441260746,
"grad_norm": 51.077607649603536,
"learning_rate": 1.7560166939992527e-06,
"loss": -0.892,
"num_tokens": 27963753.0,
"residual_var": 0.05835460126399994,
"reward": 0.828125,
"reward_std": 0.13755130767822266,
"rewards/drgrpo_math_reward/mean": 0.828125,
"rewards/drgrpo_math_reward/std": 0.3780108094215393,
"rho2": 0.24999994039535522,
"step": 160
},
{
"advantages/mean": 1.1641532182693481e-10,
"advantages/snr": 3.326844631279246e-10,
"advantages/std": 0.3499271273612976,
"advantages/var": 0.1224489944633298,
"completions/clipped_ratio": -2.890625,
"epoch": 0.9226361031518625,
"grad_norm": 48.10192863656198,
"learning_rate": 1.753071466003611e-06,
"loss": -1.1203,
"num_tokens": 28122012.0,
"residual_var": 0.06887757033109665,
"reward": 0.8046875,
"reward_std": 0.22263701260089874,
"rewards/drgrpo_math_reward/mean": 0.8046875,
"rewards/drgrpo_math_reward/std": 0.39721766114234924,
"rho2": 0.4374999403953552,
"step": 161
},
{
"advantages/mean": -1.1641532182693481e-09,
"advantages/snr": 4.520292219848232e-09,
"advantages/std": 0.2575393617153168,
"advantages/var": 0.06632652283273277,
"completions/clipped_ratio": -2.859375,
"epoch": 0.9283667621776505,
"grad_norm": 34.72294908019458,
"learning_rate": 1.7501110696304595e-06,
"loss": -0.1205,
"num_tokens": 28284283.0,
"residual_var": 0.045599501579999924,
"reward": 0.7734375,
"reward_std": 0.13269484043121338,
"rewards/drgrpo_math_reward/mean": 0.7734375,
"rewards/drgrpo_math_reward/std": 0.41942715644836426,
"rho2": 0.3124999403953552,
"step": 162
},
{
"advantages/mean": -6.984919309616089e-10,
"advantages/snr": 3.2372256189616873e-09,
"advantages/std": 0.21576867997646332,
"advantages/var": 0.04655612325878544,
"completions/clipped_ratio": -2.859375,
"epoch": 0.9340974212034384,
"grad_norm": 55.63584185341244,
"learning_rate": 1.7471355645081495e-06,
"loss": -0.2935,
"num_tokens": 28425609.0,
"residual_var": 0.037826862186193466,
"reward": 0.83984375,
"reward_std": 0.09324727952480316,
"rewards/drgrpo_math_reward/mean": 0.83984375,
"rewards/drgrpo_math_reward/std": 0.36746934056282043,
"rho2": 0.1874999701976776,
"step": 163
},
{
"advantages/mean": -6.984919309616089e-10,
"advantages/snr": 2.6614758307843997e-09,
"advantages/std": 0.262445330619812,
"advantages/var": 0.06887755156414244,
"completions/clipped_ratio": -2.921875,
"epoch": 0.9398280802292264,
"grad_norm": 47.70746569664927,
"learning_rate": 1.7441450105693529e-06,
"loss": 0.0682,
"num_tokens": 28568538.0,
"residual_var": 0.049505751579999924,
"reward": 0.8515625,
"reward_std": 0.1349327117204666,
"rewards/drgrpo_math_reward/mean": 0.8515625,
"rewards/drgrpo_math_reward/std": 0.3562295734882355,
"rho2": 0.2812499403953552,
"step": 164
},
{
"advantages/mean": -1.1641532182693481e-09,
"advantages/snr": 4.633034966313779e-09,
"advantages/std": 0.2512722611427307,
"advantages/var": 0.06313774921978066,
"completions/clipped_ratio": -2.703125,
"epoch": 0.9455587392550143,
"grad_norm": 42.28262395990937,
"learning_rate": 1.7411394680498548e-06,
"loss": -0.5688,
"num_tokens": 28720699.0,
"residual_var": 0.04932638630270958,
"reward": 0.78515625,
"reward_std": 0.12270711362361908,
"rewards/drgrpo_math_reward/mean": 0.78515625,
"rewards/drgrpo_math_reward/std": 0.4115184545516968,
"rho2": 0.21874995529651642,
"step": 165
},
{
"advantages/mean": -1.1641532182693481e-10,
"advantages/snr": 4.208162363484517e-10,
"advantages/std": 0.27664169669151306,
"advantages/var": 0.07653062834835911,
"completions/clipped_ratio": -2.921875,
"epoch": 0.9512893982808023,
"grad_norm": 41.67501812101098,
"learning_rate": 1.7381189974873407e-06,
"loss": -0.7094,
"num_tokens": 28860305.0,
"residual_var": 0.052614811807870865,
"reward": 0.8828125,
"reward_std": 0.14939865469932556,
"rewards/drgrpo_math_reward/mean": 0.8828125,
"rewards/drgrpo_math_reward/std": 0.3222736418247223,
"rho2": 0.3124999403953552,
"step": 166
},
{
"advantages/mean": -9.313225746154785e-10,
"advantages/snr": 5.322951711271804e-09,
"advantages/std": 0.1749635487794876,
"advantages/var": 0.03061224340151214,
"completions/clipped_ratio": -2.953125,
"epoch": 0.9570200573065902,
"grad_norm": 29.567632786804115,
"learning_rate": 1.7350836597201765e-06,
"loss": 0.1182,
"num_tokens": 28991869.0,
"residual_var": 0.025829091668128967,
"reward": 0.9609375,
"reward_std": 0.06378497928380966,
"rewards/drgrpo_math_reward/mean": 0.9609375,
"rewards/drgrpo_math_reward/std": 0.19412322342395782,
"rho2": 0.1562499701976776,
"step": 167
},
{
"advantages/mean": 1.1641532182693481e-09,
"advantages/snr": 3.407670541785436e-09,
"advantages/std": 0.3416272699832916,
"advantages/var": 0.11670919159623683,
"completions/clipped_ratio": -2.625,
"epoch": 0.9627507163323782,
"grad_norm": 58.85091702310497,
"learning_rate": 1.7320335158861852e-06,
"loss": -1.8019,
"num_tokens": 29158280.0,
"residual_var": 0.06564892828464508,
"reward": 0.75390625,
"reward_std": 0.21751460433006287,
"rewards/drgrpo_math_reward/mean": 0.75390625,
"rewards/drgrpo_math_reward/std": 0.43157756328582764,
"rho2": 0.4374999403953552,
"step": 168
},
{
"advantages/mean": -9.313225746154785e-10,
"advantages/snr": 4.175666432902228e-09,
"advantages/std": 0.22303566336631775,
"advantages/var": 0.04974490713325341,
"completions/clipped_ratio": -2.453125,
"epoch": 0.9684813753581661,
"grad_norm": 39.61307892613523,
"learning_rate": 1.7289686274214115e-06,
"loss": -0.2184,
"num_tokens": 29321178.0,
"residual_var": 0.04041774198412895,
"reward": 0.71875,
"reward_std": 0.0966646671295166,
"rewards/drgrpo_math_reward/mean": 0.71875,
"rewards/drgrpo_math_reward/std": 0.45048993825912476,
"rho2": 0.1874999701976776,
"step": 169
},
{
"advantages/mean": -6.984919309616089e-10,
"advantages/snr": 3.5707444382625662e-09,
"advantages/std": 0.19561520218849182,
"advantages/var": 0.038265307327244535,
"completions/clipped_ratio": -2.6875,
"epoch": 0.9742120343839542,
"grad_norm": 38.590165781283964,
"learning_rate": 1.7258890560588887e-06,
"loss": -0.2002,
"num_tokens": 29473074.0,
"residual_var": 0.031090570613741875,
"reward": 0.7265625,
"reward_std": 0.07825091481208801,
"rewards/drgrpo_math_reward/mean": 0.7265625,
"rewards/drgrpo_math_reward/std": 0.446596622467041,
"rho2": 0.1874999701976776,
"step": 170
},
{
"advantages/mean": -1.3969838619232178e-09,
"advantages/snr": 6.758136794548967e-09,
"advantages/std": 0.20671138167381287,
"advantages/var": 0.04272959531349674,
"completions/clipped_ratio": -2.9375,
"epoch": 0.9799426934097422,
"grad_norm": 30.278598266227252,
"learning_rate": 1.7227948638273915e-06,
"loss": -0.2199,
"num_tokens": 29611466.0,
"residual_var": 0.03338250517845154,
"reward": 0.91796875,
"reward_std": 0.08929946273565292,
"rewards/drgrpo_math_reward/mean": 0.91796875,
"rewards/drgrpo_math_reward/std": 0.2749498784542084,
"rho2": 0.21874995529651642,
"step": 171
},
{
"advantages/mean": 1.979060471057892e-09,
"advantages/snr": 6.377399533485238e-09,
"advantages/std": 0.3103240430355072,
"advantages/var": 0.09630101168590333,
"completions/clipped_ratio": -2.84375,
"epoch": 0.9856733524355301,
"grad_norm": 46.96748798734129,
"learning_rate": 1.7196861130501902e-06,
"loss": -0.2326,
"num_tokens": 29761366.0,
"residual_var": 0.06319756805896759,
"reward": 0.76953125,
"reward_std": 0.1822758913040161,
"rewards/drgrpo_math_reward/mean": 0.76953125,
"rewards/drgrpo_math_reward/std": 0.4219578504562378,
"rho2": 0.34374991059303284,
"step": 172
},
{
"advantages/mean": 1.0477378964424133e-09,
"advantages/snr": 4.638532964616805e-09,
"advantages/std": 0.22587698698043823,
"advantages/var": 0.05102041324736106,
"completions/clipped_ratio": -2.609375,
"epoch": 0.9914040114613181,
"grad_norm": 82.39543338312782,
"learning_rate": 1.716562866343792e-06,
"loss": -0.063,
"num_tokens": 29936989.0,
"residual_var": 0.03985970467329025,
"reward": 0.6171875,
"reward_std": 0.09784172475337982,
"rewards/drgrpo_math_reward/mean": 0.6171875,
"rewards/drgrpo_math_reward/std": 0.48702529072761536,
"rho2": 0.21874995529651642,
"step": 173
},
{
"advantages/mean": 4.656612873077393e-10,
"advantages/snr": 3.116801356546741e-09,
"advantages/std": 0.14940357208251953,
"advantages/var": 0.02232142735101661,
"completions/clipped_ratio": -2.78125,
"epoch": 0.997134670487106,
"grad_norm": 25.91850427167227,
"learning_rate": 1.7134251866166828e-06,
"loss": -0.1145,
"num_tokens": 30091098.0,
"residual_var": 0.018833715468645096,
"reward": 0.80078125,
"reward_std": 0.055242717266082764,
"rewards/drgrpo_math_reward/mean": 0.80078125,
"rewards/drgrpo_math_reward/std": 0.40019527077674866,
"rho2": 0.1562499701976776,
"step": 174
},
{
"advantages/mean": -6.984919309616089e-10,
"advantages/snr": 2.4447215635128062e-09,
"advantages/std": 0.2857142984867096,
"advantages/var": 0.08163266035975258,
"completions/clipped_ratio": -2.828125,
"epoch": 1.005730659025788,
"grad_norm": 52.003556279462764,
"learning_rate": 1.710273137068057e-06,
"loss": -0.5332,
"num_tokens": 30238663.0,
"residual_var": 0.05612246319651604,
"reward": 0.8125,
"reward_std": 0.15452352166175842,
"rewards/drgrpo_math_reward/mean": 0.8125,
"rewards/drgrpo_math_reward/std": 0.3910769522190094,
"rho2": 0.3124999403953552,
"step": 175
},
{
"advantages/mean": -3.4924596548080444e-10,
"advantages/snr": 1.3626546441154624e-09,
"advantages/std": 0.2562982141971588,
"advantages/var": 0.0656887746006527,
"completions/clipped_ratio": -2.9375,
"epoch": 1.011461318051576,
"grad_norm": 49.08232252807409,
"learning_rate": 1.7071067811865474e-06,
"loss": 0.2621,
"num_tokens": 30386461.0,
"residual_var": 0.05337214469909668,
"reward": 0.79296875,
"reward_std": 0.11849091202020645,
"rewards/drgrpo_math_reward/mean": 0.79296875,
"rewards/drgrpo_math_reward/std": 0.40597182512283325,
"rho2": 0.18749994039535522,
"step": 176
},
{
"advantages/mean": -2.3283064365386963e-09,
"advantages/snr": 8.181092293916676e-09,
"advantages/std": 0.2845960259437561,
"advantages/var": 0.0809948979829791,
"completions/clipped_ratio": -2.515625,
"epoch": 1.0171919770773639,
"grad_norm": 43.315763175106994,
"learning_rate": 1.7039261827489448e-06,
"loss": -0.7983,
"num_tokens": 30540223.0,
"residual_var": 0.05821509286761284,
"reward": 0.72265625,
"reward_std": 0.1533440202474594,
"rewards/drgrpo_math_reward/mean": 0.72265625,
"rewards/drgrpo_math_reward/std": 0.4485645890235901,
"rho2": 0.2812499403953552,
"step": 177
},
{
"advantages/mean": -1.6298145055770874e-09,
"advantages/snr": 5.638654887602349e-09,
"advantages/std": 0.289043128490448,
"advantages/var": 0.08354593012754563,
"completions/clipped_ratio": -2.828125,
"epoch": 1.0229226361031518,
"grad_norm": 47.223138020041155,
"learning_rate": 1.7007314058189138e-06,
"loss": -0.4208,
"num_tokens": 30701218.0,
"residual_var": 0.06004864722490311,
"reward": 0.74609375,
"reward_std": 0.15030977129936218,
"rewards/drgrpo_math_reward/mean": 0.74609375,
"rewards/drgrpo_math_reward/std": 0.4360972046852112,
"rho2": 0.2812499403953552,
"step": 178
},
{
"advantages/mean": -1.1641532182693481e-09,
"advantages/snr": 3.997213248610433e-09,
"advantages/std": 0.29124119877815247,
"advantages/var": 0.08482143586573532,
"completions/clipped_ratio": -2.953125,
"epoch": 1.0286532951289398,
"grad_norm": 50.17432938470871,
"learning_rate": 1.6975225147457024e-06,
"loss": -0.3989,
"num_tokens": 30849451.0,
"residual_var": 0.06361608952283859,
"reward": 0.82421875,
"reward_std": 0.15729182958602905,
"rewards/drgrpo_math_reward/mean": 0.82421875,
"rewards/drgrpo_math_reward/std": 0.3813795745372772,
"rho2": 0.24999994039535522,
"step": 179
},
{
"advantages/mean": -6.984919309616089e-10,
"advantages/snr": 3.017830079630706e-09,
"advantages/std": 0.2314550131559372,
"advantages/var": 0.05357142311501506,
"completions/clipped_ratio": -2.96875,
"epoch": 1.0343839541547277,
"grad_norm": 37.273134893984825,
"learning_rate": 1.6942995741628456e-06,
"loss": -0.4196,
"num_tokens": 30982416.0,
"residual_var": 0.043526798486709595,
"reward": 0.8984375,
"reward_std": 0.10007961094379425,
"rewards/drgrpo_math_reward/mean": 0.8984375,
"rewards/drgrpo_math_reward/std": 0.3026638329029083,
"rho2": 0.18749995529651642,
"step": 180
},
{
"advantages/mean": -1.5133991837501526e-09,
"advantages/snr": 5.8483293039388095e-09,
"advantages/std": 0.25877460837364197,
"advantages/var": 0.06696429793893177,
"completions/clipped_ratio": -2.84375,
"epoch": 1.0401146131805157,
"grad_norm": 61.99506107492152,
"learning_rate": 1.6910626489868648e-06,
"loss": -0.4872,
"num_tokens": 31136161.0,
"residual_var": 0.05022323131561279,
"reward": 0.76171875,
"reward_std": 0.12730401754379272,
"rewards/drgrpo_math_reward/mean": 0.76171875,
"rewards/drgrpo_math_reward/std": 0.4268665909767151,
"rho2": 0.2499999701976776,
"step": 181
},
{
"advantages/mean": 2.3283064365386963e-09,
"advantages/snr": 7.819981896283313e-09,
"advantages/std": 0.29773807525634766,
"advantages/var": 0.08864796145735454,
"completions/clipped_ratio": -2.65625,
"epoch": 1.0458452722063036,
"grad_norm": 113.48047316941651,
"learning_rate": 1.6878118044159578e-06,
"loss": -0.2918,
"num_tokens": 31287067.0,
"residual_var": 0.06371574103832245,
"reward": 0.77734375,
"reward_std": 0.15543463826179504,
"rewards/drgrpo_math_reward/mean": 0.77734375,
"rewards/drgrpo_math_reward/std": 0.41684433817863464,
"rho2": 0.2812499403953552,
"step": 182
},
{
"advantages/mean": -8.149072527885437e-10,
"advantages/snr": 3.062807733827676e-09,
"advantages/std": 0.26606541872024536,
"advantages/var": 0.07079080703877949,
"completions/clipped_ratio": -2.75,
"epoch": 1.0515759312320916,
"grad_norm": 40.02864895138621,
"learning_rate": 1.6845471059286886e-06,
"loss": -0.1182,
"num_tokens": 31439199.0,
"residual_var": 0.05309312418103218,
"reward": 0.77734375,
"reward_std": 0.13717305660247803,
"rewards/drgrpo_math_reward/mean": 0.77734375,
"rewards/drgrpo_math_reward/std": 0.41684433817863464,
"rho2": 0.24999995529651642,
"step": 183
},
{
"advantages/mean": 1.280568540096283e-09,
"advantages/snr": 4.856938081027663e-09,
"advantages/std": 0.2636575698852539,
"advantages/var": 0.06951531415779755,
"completions/clipped_ratio": -2.84375,
"epoch": 1.0573065902578798,
"grad_norm": 40.549121661009465,
"learning_rate": 1.6812686192826662e-06,
"loss": -0.7632,
"num_tokens": 31597985.0,
"residual_var": 0.04779178649187088,
"reward": 0.91015625,
"reward_std": 0.14256632328033447,
"rewards/drgrpo_math_reward/mean": 0.91015625,
"rewards/drgrpo_math_reward/std": 0.2865179479122162,
"rho2": 0.3124999403953552,
"step": 184
},
{
"advantages/mean": 1.1641532182693481e-10,
"advantages/snr": 5.186443199129546e-10,
"advantages/std": 0.22446079552173615,
"advantages/var": 0.050382648726250645,
"completions/clipped_ratio": -2.78125,
"epoch": 1.0630372492836677,
"grad_norm": 39.5470013856504,
"learning_rate": 1.677976410513221e-06,
"loss": -0.3753,
"num_tokens": 31745502.0,
"residual_var": 0.04093591496348381,
"reward": 0.83203125,
"reward_std": 0.10311630368232727,
"rewards/drgrpo_math_reward/mean": 0.83203125,
"rewards/drgrpo_math_reward/std": 0.3745708465576172,
"rho2": 0.1874999701976776,
"step": 185
},
{
"advantages/mean": 2.3283064365386963e-10,
"advantages/snr": 8.560204468385481e-10,
"advantages/std": 0.27199190855026245,
"advantages/var": 0.07397959831681433,
"completions/clipped_ratio": -2.828125,
"epoch": 1.0687679083094557,
"grad_norm": 44.057693695852585,
"learning_rate": 1.6746705459320744e-06,
"loss": 0.0624,
"num_tokens": 31905760.0,
"residual_var": 0.053172849118709564,
"reward": 0.796875,
"reward_std": 0.14005759358406067,
"rewards/drgrpo_math_reward/mean": 0.796875,
"rewards/drgrpo_math_reward/std": 0.40311288833618164,
"rho2": 0.2812499403953552,
"step": 186
},
{
"advantages/mean": -3.4924596548080444e-10,
"advantages/snr": 1.5559327531530887e-09,
"advantages/std": 0.22446082532405853,
"advantages/var": 0.050382662105157516,
"completions/clipped_ratio": -2.859375,
"epoch": 1.0744985673352436,
"grad_norm": 34.399246117778546,
"learning_rate": 1.6713510921260038e-06,
"loss": -0.4038,
"num_tokens": 32059438.0,
"residual_var": 0.037786997854709625,
"reward": 0.89453125,
"reward_std": 0.10376540571451187,
"rewards/drgrpo_math_reward/mean": 0.89453125,
"rewards/drgrpo_math_reward/std": 0.3077581524848938,
"rho2": 0.2499999701976776,
"step": 187
},
{
"advantages/mean": 2.3283064365386963e-09,
"advantages/snr": 8.024651453540351e-09,
"advantages/std": 0.2901442348957062,
"advantages/var": 0.08418367704321472,
"completions/clipped_ratio": -2.8125,
"epoch": 1.0802292263610316,
"grad_norm": 75.92610469530256,
"learning_rate": 1.6680181159555011e-06,
"loss": -0.3991,
"num_tokens": 32202032.0,
"residual_var": 0.06050703302025795,
"reward": 0.8046875,
"reward_std": 0.15676140785217285,
"rewards/drgrpo_math_reward/mean": 0.8046875,
"rewards/drgrpo_math_reward/std": 0.39721766114234924,
"rho2": 0.28124991059303284,
"step": 188
},
{
"advantages/mean": 4.656612873077393e-10,
"advantages/snr": 3.116801356546741e-09,
"advantages/std": 0.14940357208251953,
"advantages/var": 0.02232142735101661,
"completions/clipped_ratio": -2.96875,
"epoch": 1.0859598853868195,
"grad_norm": 29.435908660503742,
"learning_rate": 1.6646716845534256e-06,
"loss": -0.0744,
"num_tokens": 32334668.0,
"residual_var": 0.018833715468645096,
"reward": 0.86328125,
"reward_std": 0.055242717266082764,
"rewards/drgrpo_math_reward/mean": 0.86328125,
"rewards/drgrpo_math_reward/std": 0.34422317147254944,
"rho2": 0.1562499701976776,
"step": 189
},
{
"advantages/mean": 2.3283064365386963e-10,
"advantages/snr": 8.790565244942093e-10,
"advantages/std": 0.2648642361164093,
"advantages/var": 0.07015306357352902,
"completions/clipped_ratio": -2.859375,
"epoch": 1.0916905444126075,
"grad_norm": 89.04585523798126,
"learning_rate": 1.6613118653236517e-06,
"loss": -0.3858,
"num_tokens": 32483123.0,
"residual_var": 0.043845679610967636,
"reward": 0.75,
"reward_std": 0.14966705441474915,
"rewards/drgrpo_math_reward/mean": 0.75,
"rewards/drgrpo_math_reward/std": 0.4338609278202057,
"rho2": 0.3749999403953552,
"step": 190
},
{
"advantages/mean": 1.3969838619232178e-09,
"advantages/snr": 5.298477906575633e-09,
"advantages/std": 0.2636575698852539,
"advantages/var": 0.06951531415779755,
"completions/clipped_ratio": -2.953125,
"epoch": 1.0974212034383954,
"grad_norm": 45.88711988000871,
"learning_rate": 1.6579387259397126e-06,
"loss": 0.4409,
"num_tokens": 32628123.0,
"residual_var": 0.04779178649187088,
"reward": 0.77734375,
"reward_std": 0.13611222803592682,
"rewards/drgrpo_math_reward/mean": 0.77734375,
"rewards/drgrpo_math_reward/std": 0.41684433817863464,
"rho2": 0.3124999403953552,
"step": 191
},
{
"advantages/mean": -5.820766091346741e-10,
"advantages/snr": 2.3773286733085444e-09,
"advantages/std": 0.24484480917453766,
"advantages/var": 0.05994898057971576,
"completions/clipped_ratio": -2.796875,
"epoch": 1.1031518624641834,
"grad_norm": 40.04913866471509,
"learning_rate": 1.6545523343434353e-06,
"loss": -1.5379,
"num_tokens": 32774588.0,
"residual_var": 0.046835146844387054,
"reward": 0.828125,
"reward_std": 0.11928971856832504,
"rewards/drgrpo_math_reward/mean": 0.828125,
"rewards/drgrpo_math_reward/std": 0.3780108094215393,
"rho2": 0.2187499701976776,
"step": 192
},
{
"advantages/mean": -4.656612873077393e-10,
"advantages/snr": 1.8439244876346257e-09,
"advantages/std": 0.25253814458847046,
"advantages/var": 0.06377551447218721,
"completions/clipped_ratio": -2.84375,
"epoch": 1.1088825214899714,
"grad_norm": 49.81062329564691,
"learning_rate": 1.6511527587435735e-06,
"loss": -0.4636,
"num_tokens": 32922532.0,
"residual_var": 0.047831643372774124,
"reward": 0.8828125,
"reward_std": 0.12388662993907928,
"rewards/drgrpo_math_reward/mean": 0.8828125,
"rewards/drgrpo_math_reward/std": 0.3222736418247223,
"rho2": 0.2499999701976776,
"step": 193
},
{
"advantages/mean": 9.313225746154785e-10,
"advantages/snr": 3.259628751350408e-09,
"advantages/std": 0.2857142984867096,
"advantages/var": 0.08163266035975258,
"completions/clipped_ratio": -2.875,
"epoch": 1.1146131805157593,
"grad_norm": 45.511791262701145,
"learning_rate": 1.6477400676144333e-06,
"loss": 0.2143,
"num_tokens": 33077704.0,
"residual_var": 0.05612245947122574,
"reward": 0.8125,
"reward_std": 0.14806942641735077,
"rewards/drgrpo_math_reward/mean": 0.8125,
"rewards/drgrpo_math_reward/std": 0.3910769522190094,
"rho2": 0.3124999403953552,
"step": 194
},
{
"advantages/mean": 4.656612873077393e-10,
"advantages/snr": 2.1291806000454375e-09,
"advantages/std": 0.2187044471502304,
"advantages/var": 0.047831635203287926,
"completions/clipped_ratio": -2.859375,
"epoch": 1.1203438395415473,
"grad_norm": 39.835124741875426,
"learning_rate": 1.6443143296944945e-06,
"loss": -1.345,
"num_tokens": 33220605.0,
"residual_var": 0.03736847639083862,
"reward": 0.81640625,
"reward_std": 0.10087842494249344,
"rewards/drgrpo_math_reward/mean": 0.81640625,
"rewards/drgrpo_math_reward/std": 0.387910932302475,
"rho2": 0.21874995529651642,
"step": 195
},
{
"advantages/mean": -1.0477378964424133e-09,
"advantages/snr": 4.12824047321452e-09,
"advantages/std": 0.2537976801395416,
"advantages/var": 0.06441326244421308,
"completions/clipped_ratio": -2.890625,
"epoch": 1.1260744985673352,
"grad_norm": 36.741125549960515,
"learning_rate": 1.640875613985024e-06,
"loss": -0.4177,
"num_tokens": 33372791.0,
"residual_var": 0.044284138828516006,
"reward": 0.80859375,
"reward_std": 0.1374414563179016,
"rewards/drgrpo_math_reward/mean": 0.80859375,
"rewards/drgrpo_math_reward/std": 0.39417871832847595,
"rho2": 0.3124999403953552,
"step": 196
},
{
"advantages/mean": -1.280568540096283e-09,
"advantages/snr": 4.4473790499375645e-09,
"advantages/std": 0.28793779015541077,
"advantages/var": 0.08290817099958137,
"completions/clipped_ratio": -2.84375,
"epoch": 1.1318051575931232,
"grad_norm": 47.67398766795382,
"learning_rate": 1.6374239897486897e-06,
"loss": -0.1512,
"num_tokens": 33524940.0,
"residual_var": 0.06218114122748375,
"reward": 0.765625,
"reward_std": 0.15558436512947083,
"rewards/drgrpo_math_reward/mean": 0.765625,
"rewards/drgrpo_math_reward/std": 0.42443734407424927,
"rho2": 0.24999994039535522,
"step": 197
},
{
"advantages/mean": -9.313225746154785e-10,
"advantages/snr": 3.548634441045866e-09,
"advantages/std": 0.262445330619812,
"advantages/var": 0.06887755156414244,
"completions/clipped_ratio": -2.703125,
"epoch": 1.1375358166189111,
"grad_norm": 48.57034087855925,
"learning_rate": 1.6339595265081616e-06,
"loss": -0.6361,
"num_tokens": 33686111.0,
"residual_var": 0.055963024497032166,
"reward": 0.7890625,
"reward_std": 0.12836240231990814,
"rewards/drgrpo_math_reward/mean": 0.7890625,
"rewards/drgrpo_math_reward/std": 0.4087733030319214,
"rho2": 0.18749995529651642,
"step": 198
},
{
"advantages/mean": 1.1641532182693481e-09,
"advantages/snr": 6.653689639089756e-09,
"advantages/std": 0.1749635487794876,
"advantages/var": 0.03061224340151214,
"completions/clipped_ratio": -2.8125,
"epoch": 1.143266475644699,
"grad_norm": 25.524074525646057,
"learning_rate": 1.6304822940447136e-06,
"loss": -0.1862,
"num_tokens": 33837046.0,
"residual_var": 0.025829095393419266,
"reward": 0.7421875,
"reward_std": 0.06378497928380966,
"rewards/drgrpo_math_reward/mean": 0.7421875,
"rewards/drgrpo_math_reward/std": 0.4382871091365814,
"rho2": 0.15624995529651642,
"step": 199
},
{
"advantages/mean": 0.0,
"advantages/snr": 0.0,
"advantages/std": 0.29233402013778687,
"advantages/var": 0.08545917932991998,
"completions/clipped_ratio": -2.75,
"epoch": 1.148997134670487,
"grad_norm": 66.74123021504873,
"learning_rate": 1.6269923623968179e-06,
"loss": -0.1777,
"num_tokens": 33982336.0,
"residual_var": 0.04807081073522568,
"reward": 0.8359375,
"reward_std": 0.17859894037246704,
"rewards/drgrpo_math_reward/mean": 0.8359375,
"rewards/drgrpo_math_reward/std": 0.3710577189922333,
"rho2": 0.43749991059303284,
"step": 200
},
{
"advantages/mean": 8.149072527885437e-10,
"advantages/snr": 3.00906987923814e-09,
"advantages/std": 0.27081698179244995,
"advantages/var": 0.07334183762717217,
"completions/clipped_ratio": -2.5625,
"epoch": 1.154727793696275,
"grad_norm": 60.27379111957229,
"learning_rate": 1.6234898018587336e-06,
"loss": -2.2732,
"num_tokens": 34146966.0,
"residual_var": 0.04583865776658058,
"reward": 0.76171875,
"reward_std": 0.15308445692062378,
"rewards/drgrpo_math_reward/mean": 0.76171875,
"rewards/drgrpo_math_reward/std": 0.4268665909767151,
"rho2": 0.3749999403953552,
"step": 201
},
{
"advantages/mean": -1.6298145055770874e-09,
"advantages/snr": 6.552775417430944e-09,
"advantages/std": 0.24872124195098877,
"advantages/var": 0.061862256197642296,
"completions/clipped_ratio": -2.828125,
"epoch": 1.1604584527220632,
"grad_norm": 49.46313200862665,
"learning_rate": 1.6199746829790905e-06,
"loss": 0.0609,
"num_tokens": 34305411.0,
"residual_var": 0.04639669507741928,
"reward": 0.80859375,
"reward_std": 0.1157250627875328,
"rewards/drgrpo_math_reward/mean": 0.80859375,
"rewards/drgrpo_math_reward/std": 0.39417871832847595,
"rho2": 0.2499999701976776,
"step": 202
},
{
"advantages/mean": 4.656612873077393e-10,
"advantages/snr": 1.7994861469150812e-09,
"advantages/std": 0.2587745785713196,
"advantages/var": 0.06696428251476405,
"completions/clipped_ratio": -2.78125,
"epoch": 1.1661891117478511,
"grad_norm": 107.60304176172177,
"learning_rate": 1.6164470765594697e-06,
"loss": -0.9408,
"num_tokens": 34446676.0,
"residual_var": 0.048130594193935394,
"reward": 0.78515625,
"reward_std": 0.1332252472639084,
"rewards/drgrpo_math_reward/mean": 0.78515625,
"rewards/drgrpo_math_reward/std": 0.4115184545516968,
"rho2": 0.2812499403953552,
"step": 203
},
{
"advantages/mean": -1.6298145055770874e-09,
"advantages/snr": 8.331737022612655e-09,
"advantages/std": 0.19561520218849182,
"advantages/var": 0.038265307327244535,
"completions/clipped_ratio": -2.8125,
"epoch": 1.171919770773639,
"grad_norm": 44.088290605449814,
"learning_rate": 1.6129070536529765e-06,
"loss": -0.2678,
"num_tokens": 34593589.0,
"residual_var": 0.031090570613741875,
"reward": 0.84375,
"reward_std": 0.07825092226266861,
"rewards/drgrpo_math_reward/mean": 0.84375,
"rewards/drgrpo_math_reward/std": 0.3638034462928772,
"rho2": 0.1874999701976776,
"step": 204
},
{
"advantages/mean": 5.820766091346741e-10,
"advantages/snr": 2.3165172084047557e-09,
"advantages/std": 0.2512722909450531,
"advantages/var": 0.06313776419677541,
"completions/clipped_ratio": -2.90625,
"epoch": 1.177650429799427,
"grad_norm": 47.83310352768681,
"learning_rate": 1.6093546855628081e-06,
"loss": -0.3666,
"num_tokens": 34744310.0,
"residual_var": 0.04932638630270958,
"reward": 0.88671875,
"reward_std": 0.11625302582979202,
"rewards/drgrpo_math_reward/mean": 0.88671875,
"rewards/drgrpo_math_reward/std": 0.31755712628364563,
"rho2": 0.21874995529651642,
"step": 205
},
{
"advantages/mean": 1.1641532182693481e-09,
"advantages/snr": 5.549560778490846e-09,
"advantages/std": 0.2097739279270172,
"advantages/var": 0.04400510083792941,
"completions/clipped_ratio": -2.671875,
"epoch": 1.183381088825215,
"grad_norm": 35.1700095212735,
"learning_rate": 1.6057900438408199e-06,
"loss": -0.2259,
"num_tokens": 34898568.0,
"residual_var": 0.037129320204257965,
"reward": 0.76953125,
"reward_std": 0.08390620350837708,
"rewards/drgrpo_math_reward/mean": 0.76953125,
"rewards/drgrpo_math_reward/std": 0.4219578504562378,
"rho2": 0.1562499701976776,
"step": 206
},
{
"advantages/mean": 1.6298145055770874e-09,
"advantages/snr": 5.3781130186484915e-09,
"advantages/std": 0.30304577946662903,
"advantages/var": 0.09183674445253676,
"completions/clipped_ratio": -2.921875,
"epoch": 1.189111747851003,
"grad_norm": 63.771394008210834,
"learning_rate": 1.6022132002860821e-06,
"loss": -0.101,
"num_tokens": 35049921.0,
"residual_var": 0.05739797279238701,
"reward": 0.8359375,
"reward_std": 0.17833054065704346,
"rewards/drgrpo_math_reward/mean": 0.8359375,
"rewards/drgrpo_math_reward/std": 0.3710577189922333,
"rho2": 0.3749999403953552,
"step": 207
},
{
"advantages/mean": 6.984919309616089e-10,
"advantages/snr": 3.1118655063061774e-09,
"advantages/std": 0.22446082532405853,
"advantages/var": 0.050382662105157516,
"completions/clipped_ratio": -2.859375,
"epoch": 1.1948424068767909,
"grad_norm": 51.11574479359161,
"learning_rate": 1.598624226943435e-06,
"loss": 0.1908,
"num_tokens": 35184799.0,
"residual_var": 0.04093591868877411,
"reward": 0.85546875,
"reward_std": 0.09666222333908081,
"rewards/drgrpo_math_reward/mean": 0.85546875,
"rewards/drgrpo_math_reward/std": 0.35231640934944153,
"rho2": 0.18749994039535522,
"step": 208
},
{
"advantages/mean": -1.280568540096283e-09,
"advantages/snr": 4.6483877879022735e-09,
"advantages/std": 0.2754865884780884,
"advantages/var": 0.07589286043129562,
"completions/clipped_ratio": -2.796875,
"epoch": 1.2005730659025788,
"grad_norm": 54.22474322398488,
"learning_rate": 1.595023196102037e-06,
"loss": -0.0642,
"num_tokens": 35331541.0,
"residual_var": 0.05691966041922569,
"reward": 0.69921875,
"reward_std": 0.1422979235649109,
"rewards/drgrpo_math_reward/mean": 0.69921875,
"rewards/drgrpo_math_reward/std": 0.45949608087539673,
"rho2": 0.24999995529651642,
"step": 209
},
{
"advantages/mean": 4.656612873077393e-10,
"advantages/snr": 1.569652855393217e-09,
"advantages/std": 0.29666513204574585,
"advantages/var": 0.08801020057171982,
"completions/clipped_ratio": -2.859375,
"epoch": 1.2063037249283668,
"grad_norm": 52.08833318906159,
"learning_rate": 1.5914101802939088e-06,
"loss": -0.5718,
"num_tokens": 35486393.0,
"residual_var": 0.06600767374038696,
"reward": 0.765625,
"reward_std": 0.1607092320919037,
"rewards/drgrpo_math_reward/mean": 0.765625,
"rewards/drgrpo_math_reward/std": 0.42443734407424927,
"rho2": 0.24999994039535522,
"step": 210
},
{
"advantages/mean": 1.1641532182693481e-09,
"advantages/snr": 4.139732864518914e-09,
"advantages/std": 0.28121456503868103,
"advantages/var": 0.07908163158989456,
"completions/clipped_ratio": -2.90625,
"epoch": 1.2120343839541547,
"grad_norm": 48.84084160737796,
"learning_rate": 1.587785252292473e-06,
"loss": -0.0827,
"num_tokens": 35632888.0,
"residual_var": 0.051897335797548294,
"reward": 0.8359375,
"reward_std": 0.152285635471344,
"rewards/drgrpo_math_reward/mean": 0.8359375,
"rewards/drgrpo_math_reward/std": 0.3710577189922333,
"rho2": 0.34374991059303284,
"step": 211
},
{
"advantages/mean": 9.313225746154785e-10,
"advantages/snr": 4.31630082528225e-09,
"advantages/std": 0.21576867997646332,
"advantages/var": 0.04655612325878544,
"completions/clipped_ratio": -2.84375,
"epoch": 1.2177650429799427,
"grad_norm": 31.030637833339267,
"learning_rate": 1.584148485111087e-06,
"loss": -0.1616,
"num_tokens": 35768842.0,
"residual_var": 0.03637198358774185,
"reward": 0.85546875,
"reward_std": 0.09916850179433823,
"rewards/drgrpo_math_reward/mean": 0.85546875,
"rewards/drgrpo_math_reward/std": 0.35231640934944153,
"rho2": 0.21874994039535522,
"step": 212
},
{
"advantages/mean": 2.3283064365386963e-10,
"advantages/snr": 1.0244025692171684e-09,
"advantages/std": 0.22728431224822998,
"advantages/var": 0.051658158594150905,
"completions/clipped_ratio": -2.703125,
"epoch": 1.2234957020057307,
"grad_norm": 50.84114456820672,
"learning_rate": 1.5804999520015733e-06,
"loss": -0.6298,
"num_tokens": 35930457.0,
"residual_var": 0.04035795107483864,
"reward": 0.75390625,
"reward_std": 0.10429336875677109,
"rewards/drgrpo_math_reward/mean": 0.75390625,
"rewards/drgrpo_math_reward/std": 0.43157756328582764,
"rho2": 0.21874995529651642,
"step": 213
},
{
"advantages/mean": 1.280568540096283e-09,
"advantages/snr": 4.770199825726274e-09,
"advantages/std": 0.2684517502784729,
"advantages/var": 0.07206634222757557,
"completions/clipped_ratio": -2.828125,
"epoch": 1.2292263610315186,
"grad_norm": 43.68379202104907,
"learning_rate": 1.5768397264527446e-06,
"loss": 0.3753,
"num_tokens": 36081065.0,
"residual_var": 0.047293536365032196,
"reward": 0.78515625,
"reward_std": 0.1454533040523529,
"rewards/drgrpo_math_reward/mean": 0.78515625,
"rewards/drgrpo_math_reward/std": 0.4115184545516968,
"rho2": 0.3437499403953552,
"step": 214
},
{
"advantages/mean": 5.820766091346741e-10,
"advantages/snr": 2.5932214274099493e-09,
"advantages/std": 0.22446081042289734,
"advantages/var": 0.05038265541570386,
"completions/clipped_ratio": -2.78125,
"epoch": 1.2349570200573066,
"grad_norm": 31.309381344591714,
"learning_rate": 1.5731678821889222e-06,
"loss": 0.0674,
"num_tokens": 36228838.0,
"residual_var": 0.04093592241406441,
"reward": 0.83984375,
"reward_std": 0.10311631113290787,
"rewards/drgrpo_math_reward/mean": 0.83984375,
"rewards/drgrpo_math_reward/std": 0.36746934056282043,
"rho2": 0.18749995529651642,
"step": 215
},
{
"advantages/mean": -4.656612873077393e-10,
"advantages/snr": 2.252712264849656e-09,
"advantages/std": 0.20671138167381287,
"advantages/var": 0.04272959531349674,
"completions/clipped_ratio": -2.9375,
"epoch": 1.2406876790830945,
"grad_norm": 33.93200973085216,
"learning_rate": 1.569484493168452e-06,
"loss": -0.4791,
"num_tokens": 36370678.0,
"residual_var": 0.03338250517845154,
"reward": 0.80859375,
"reward_std": 0.08929947018623352,
"rewards/drgrpo_math_reward/mean": 0.80859375,
"rewards/drgrpo_math_reward/std": 0.39417871832847595,
"rho2": 0.2187499701976776,
"step": 216
},
{
"advantages/mean": 8.149072527885437e-10,
"advantages/snr": 3.1950753658538154e-09,
"advantages/std": 0.25505101680755615,
"advantages/var": 0.06505102117456829,
"completions/clipped_ratio": -2.890625,
"epoch": 1.2464183381088825,
"grad_norm": 43.01139157018457,
"learning_rate": 1.5657896335822145e-06,
"loss": -0.1037,
"num_tokens": 36514432.0,
"residual_var": 0.046755433082580566,
"reward": 0.796875,
"reward_std": 0.13151776790618896,
"rewards/drgrpo_math_reward/mean": 0.796875,
"rewards/drgrpo_math_reward/std": 0.40311288833618164,
"rho2": 0.2812499403953552,
"step": 217
},
{
"advantages/mean": -6.984919309616089e-10,
"advantages/snr": 2.822921321434942e-09,
"advantages/std": 0.24743583798408508,
"advantages/var": 0.0612244939188864,
"completions/clipped_ratio": -2.890625,
"epoch": 1.2521489971346704,
"grad_norm": 52.736686618922654,
"learning_rate": 1.5620833778521306e-06,
"loss": -0.3578,
"num_tokens": 36655270.0,
"residual_var": 0.04209185019135475,
"reward": 0.9140625,
"reward_std": 0.13402405381202698,
"rewards/drgrpo_math_reward/mean": 0.9140625,
"rewards/drgrpo_math_reward/std": 0.28082075715065,
"rho2": 0.3124999403953552,
"step": 218
},
{
"advantages/mean": 1.3969838619232178e-09,
"advantages/snr": 5.1361226810312885e-09,
"advantages/std": 0.27199190855026245,
"advantages/var": 0.07397959831681433,
"completions/clipped_ratio": -2.84375,
"epoch": 1.2578796561604584,
"grad_norm": 66.86254250830923,
"learning_rate": 1.5583658006296623e-06,
"loss": 0.1719,
"num_tokens": 36825958.0,
"residual_var": 0.053172849118709564,
"reward": 0.65625,
"reward_std": 0.14651167392730713,
"rewards/drgrpo_math_reward/mean": 0.65625,
"rewards/drgrpo_math_reward/std": 0.47588926553726196,
"rho2": 0.2812499403953552,
"step": 219
},
{
"advantages/mean": 8.149072527885437e-10,
"advantages/snr": 3.179527502936079e-09,
"advantages/std": 0.2562982141971588,
"advantages/var": 0.0656887746006527,
"completions/clipped_ratio": -2.8125,
"epoch": 1.2636103151862463,
"grad_norm": 54.50502110716351,
"learning_rate": 1.5546369767943102e-06,
"loss": -0.5218,
"num_tokens": 36985406.0,
"residual_var": 0.043108273297548294,
"reward": 0.84765625,
"reward_std": 0.13861849904060364,
"rewards/drgrpo_math_reward/mean": 0.84765625,
"rewards/drgrpo_math_reward/std": 0.3600577116012573,
"rho2": 0.3437499403953552,
"step": 220
},
{
"advantages/mean": -2.3283064365386963e-10,
"advantages/snr": 8.560204468385481e-10,
"advantages/std": 0.27199190855026245,
"advantages/var": 0.07397959831681433,
"completions/clipped_ratio": -2.890625,
"epoch": 1.2693409742120343,
"grad_norm": 40.739286290569964,
"learning_rate": 1.5508969814521024e-06,
"loss": 0.0063,
"num_tokens": 37116884.0,
"residual_var": 0.060108430683612823,
"reward": 0.8984375,
"reward_std": 0.1334872543811798,
"rewards/drgrpo_math_reward/mean": 0.8984375,
"rewards/drgrpo_math_reward/std": 0.3026638329029083,
"rho2": 0.18749995529651642,
"step": 221
},
{
"advantages/mean": -3.4924596548080444e-10,
"advantages/snr": 1.2731026598319146e-09,
"advantages/std": 0.2743266224861145,
"advantages/var": 0.07525509580463918,
"completions/clipped_ratio": -2.90625,
"epoch": 1.2750716332378222,
"grad_norm": 58.428618246393256,
"learning_rate": 1.5471458899340858e-06,
"loss": -1.2012,
"num_tokens": 37269959.0,
"residual_var": 0.05408962070941925,
"reward": 0.7734375,
"reward_std": 0.14822159707546234,
"rewards/drgrpo_math_reward/mean": 0.7734375,
"rewards/drgrpo_math_reward/std": 0.41942715644836426,
"rho2": 0.2812499403953552,
"step": 222
},
{
"advantages/mean": -1.1641532182693481e-09,
"advantages/snr": 4.656612686812885e-09,
"advantages/std": 0.25,
"advantages/var": 0.0625,
"completions/clipped_ratio": -2.65625,
"epoch": 1.2808022922636102,
"grad_norm": 48.94953357605194,
"learning_rate": 1.5433837777948058e-06,
"loss": -0.4937,
"num_tokens": 37434350.0,
"residual_var": 0.046875014901161194,
"reward": 0.765625,
"reward_std": 0.11572261154651642,
"rewards/drgrpo_math_reward/mean": 0.765625,
"rewards/drgrpo_math_reward/std": 0.42443734407424927,
"rho2": 0.24999994039535522,
"step": 223
},
{
"advantages/mean": -1.5133991837501526e-09,
"advantages/snr": 5.4479589668666796e-09,
"advantages/std": 0.27779194712638855,
"advantages/var": 0.07716836588827025,
"completions/clipped_ratio": -2.828125,
"epoch": 1.2865329512893982,
"grad_norm": 48.425063051432105,
"learning_rate": 1.5396107208107846e-06,
"loss": -0.4051,
"num_tokens": 37593087.0,
"residual_var": 0.05546478554606438,
"reward": 0.82421875,
"reward_std": 0.1434749811887741,
"rewards/drgrpo_math_reward/mean": 0.82421875,
"rewards/drgrpo_math_reward/std": 0.3813795745372772,
"rho2": 0.2812499403953552,
"step": 224
},
{
"advantages/mean": -1.7462298274040222e-09,
"advantages/snr": 8.781698823081943e-09,
"advantages/std": 0.19884873926639557,
"advantages/var": 0.03954082110783497,
"completions/clipped_ratio": -2.953125,
"epoch": 1.2922636103151863,
"grad_norm": 34.77580496494329,
"learning_rate": 1.5358267949789964e-06,
"loss": -0.0698,
"num_tokens": 37731294.0,
"residual_var": 0.032126929610967636,
"reward": 0.90625,
"reward_std": 0.07996084541082382,
"rewards/drgrpo_math_reward/mean": 0.90625,
"rewards/drgrpo_math_reward/std": 0.2920515835285187,
"rho2": 0.1874999701976776,
"step": 225
},
{
"advantages/mean": 0.0,
"advantages/snr": 0.0,
"advantages/std": 0.25253814458847046,
"advantages/var": 0.06377551447218721,
"completions/clipped_ratio": -2.765625,
"epoch": 1.2979942693409743,
"grad_norm": 41.08991075758347,
"learning_rate": 1.5320320765153365e-06,
"loss": -0.1379,
"num_tokens": 37881528.0,
"residual_var": 0.04583866521716118,
"reward": 0.828125,
"reward_std": 0.1233537495136261,
"rewards/drgrpo_math_reward/mean": 0.828125,
"rewards/drgrpo_math_reward/std": 0.3780108094215393,
"rho2": 0.28124991059303284,
"step": 226
},
{
"advantages/mean": -4.656612873077393e-10,
"advantages/snr": 1.7194685024217942e-09,
"advantages/std": 0.27081698179244995,
"advantages/var": 0.07334183762717217,
"completions/clipped_ratio": -2.890625,
"epoch": 1.3037249283667622,
"grad_norm": 53.21476933729365,
"learning_rate": 1.5282266418530844e-06,
"loss": 0.1193,
"num_tokens": 38027798.0,
"residual_var": 0.05042253062129021,
"reward": 0.82421875,
"reward_std": 0.15243536233901978,
"rewards/drgrpo_math_reward/mean": 0.82421875,
"rewards/drgrpo_math_reward/std": 0.3813795745372772,
"rho2": 0.3124999403953552,
"step": 227
},
{
"advantages/mean": 1.1641532182693481e-10,
"advantages/snr": 5.717766645206049e-10,
"advantages/std": 0.20360277593135834,
"advantages/var": 0.04145409036695491,
"completions/clipped_ratio": -2.796875,
"epoch": 1.3094555873925502,
"grad_norm": 32.614018221273795,
"learning_rate": 1.5244105676413656e-06,
"loss": -0.5882,
"num_tokens": 38181529.0,
"residual_var": 0.03368145227432251,
"reward": 0.84765625,
"reward_std": 0.08166831731796265,
"rewards/drgrpo_math_reward/mean": 0.84765625,
"rewards/drgrpo_math_reward/std": 0.3600577116012573,
"rho2": 0.1874999701976776,
"step": 228
},
{
"advantages/mean": 1.862645149230957e-09,
"advantages/snr": 6.877874009687177e-09,
"advantages/std": 0.27081698179244995,
"advantages/var": 0.07334183762717217,
"completions/clipped_ratio": -2.859375,
"epoch": 1.3151862464183381,
"grad_norm": 57.591177765790256,
"learning_rate": 1.5205839307436086e-06,
"loss": -1.1811,
"num_tokens": 38330487.0,
"residual_var": 0.05042252689599991,
"reward": 0.78515625,
"reward_std": 0.14598126709461212,
"rewards/drgrpo_math_reward/mean": 0.78515625,
"rewards/drgrpo_math_reward/std": 0.4115184545516968,
"rho2": 0.31249991059303284,
"step": 229
},
{
"advantages/mean": 1.1641532182693481e-10,
"advantages/snr": 4.058712560030977e-10,
"advantages/std": 0.2868281900882721,
"advantages/var": 0.08227041062931395,
"completions/clipped_ratio": -2.828125,
"epoch": 1.320916905444126,
"grad_norm": 63.47812761517252,
"learning_rate": 1.5167468082359944e-06,
"loss": -0.6156,
"num_tokens": 38482705.0,
"residual_var": 0.05398997291922569,
"reward": 0.80078125,
"reward_std": 0.16861121356487274,
"rewards/drgrpo_math_reward/mean": 0.80078125,
"rewards/drgrpo_math_reward/std": 0.40019527077674866,
"rho2": 0.3437499403953552,
"step": 230
},
{
"advantages/mean": 2.3283064365386963e-09,
"advantages/snr": 8.56020446838548e-09,
"advantages/std": 0.27199190855026245,
"advantages/var": 0.07397959831681433,
"completions/clipped_ratio": -2.796875,
"epoch": 1.326647564469914,
"grad_norm": 64.11487366448154,
"learning_rate": 1.5128992774059062e-06,
"loss": -1.467,
"num_tokens": 38625275.0,
"residual_var": 0.041613537818193436,
"reward": 0.8828125,
"reward_std": 0.1666392683982849,
"rewards/drgrpo_math_reward/mean": 0.8828125,
"rewards/drgrpo_math_reward/std": 0.3222736418247223,
"rho2": 0.43749991059303284,
"step": 231
},
{
"advantages/mean": -1.862645149230957e-09,
"advantages/snr": 6.25598551702665e-09,
"advantages/std": 0.29773807525634766,
"advantages/var": 0.08864796145735454,
"completions/clipped_ratio": -2.890625,
"epoch": 1.332378223495702,
"grad_norm": 49.53825848796451,
"learning_rate": 1.5090414157503713e-06,
"loss": -0.8094,
"num_tokens": 38774409.0,
"residual_var": 0.06371574103832245,
"reward": 0.80078125,
"reward_std": 0.1618887335062027,
"rewards/drgrpo_math_reward/mean": 0.80078125,
"rewards/drgrpo_math_reward/std": 0.40019527077674866,
"rho2": 0.2812499403953552,
"step": 232
},
{
"advantages/mean": 6.984919309616089e-10,
"advantages/snr": 3.1937709000681565e-09,
"advantages/std": 0.2187044471502304,
"advantages/var": 0.047831635203287926,
"completions/clipped_ratio": -2.90625,
"epoch": 1.33810888252149,
"grad_norm": 32.156107327334965,
"learning_rate": 1.5051733009745012e-06,
"loss": -0.1481,
"num_tokens": 38923354.0,
"residual_var": 0.037368472665548325,
"reward": 0.87890625,
"reward_std": 0.09442433714866638,
"rewards/drgrpo_math_reward/mean": 0.87890625,
"rewards/drgrpo_math_reward/std": 0.3268752694129944,
"rho2": 0.21874994039535522,
"step": 233
},
{
"advantages/mean": 4.656612873077393e-10,
"advantages/snr": 2.3417863528218515e-09,
"advantages/std": 0.19884873926639557,
"advantages/var": 0.03954082110783497,
"completions/clipped_ratio": -2.859375,
"epoch": 1.343839541547278,
"grad_norm": 53.25849122067783,
"learning_rate": 1.501295010989925e-06,
"loss": -0.1868,
"num_tokens": 39081935.0,
"residual_var": 0.034598227590322495,
"reward": 0.875,
"reward_std": 0.07285766303539276,
"rewards/drgrpo_math_reward/mean": 0.875,
"rewards/drgrpo_math_reward/std": 0.33136674761772156,
"rho2": 0.12499997019767761,
"step": 234
},
{
"advantages/mean": -9.313225746154785e-10,
"advantages/snr": 4.258361200090875e-09,
"advantages/std": 0.2187044471502304,
"advantages/var": 0.047831635203287926,
"completions/clipped_ratio": -2.953125,
"epoch": 1.3495702005730659,
"grad_norm": 41.785440666032066,
"learning_rate": 1.4974066239132218e-06,
"loss": -0.5715,
"num_tokens": 39224346.0,
"residual_var": 0.037368472665548325,
"reward": 0.88671875,
"reward_std": 0.09442433714866638,
"rewards/drgrpo_math_reward/mean": 0.88671875,
"rewards/drgrpo_math_reward/std": 0.31755712628364563,
"rho2": 0.2187499701976776,
"step": 235
},
{
"advantages/mean": -1.0477378964424133e-09,
"advantages/snr": 4.148830097177908e-09,
"advantages/std": 0.25253814458847046,
"advantages/var": 0.06377551447218721,
"completions/clipped_ratio": -2.921875,
"epoch": 1.3553008595988538,
"grad_norm": 53.85641702669578,
"learning_rate": 1.4935082180643467e-06,
"loss": -0.4736,
"num_tokens": 39366574.0,
"residual_var": 0.04783164709806442,
"reward": 0.875,
"reward_std": 0.12388662993907928,
"rewards/drgrpo_math_reward/mean": 0.875,
"rewards/drgrpo_math_reward/std": 0.33136674761772156,
"rho2": 0.2499999701976776,
"step": 236
},
{
"advantages/mean": -1.6298145055770874e-09,
"advantages/snr": 6.125614781517557e-09,
"advantages/std": 0.26606544852256775,
"advantages/var": 0.07079082289751515,
"completions/clipped_ratio": -2.609375,
"epoch": 1.3610315186246418,
"grad_norm": 56.90528111968534,
"learning_rate": 1.4895998719650523e-06,
"loss": -0.7502,
"num_tokens": 39526191.0,
"residual_var": 0.048668697476387024,
"reward": 0.72265625,
"reward_std": 0.14427624642848969,
"rewards/drgrpo_math_reward/mean": 0.72265625,
"rewards/drgrpo_math_reward/std": 0.4485645890235901,
"rho2": 0.3124999701976776,
"step": 237
},
{
"advantages/mean": -4.656612873077393e-10,
"advantages/snr": 1.690322831964463e-09,
"advantages/std": 0.2754865884780884,
"advantages/var": 0.07589286043129562,
"completions/clipped_ratio": -2.84375,
"epoch": 1.3667621776504297,
"grad_norm": 43.19070198809019,
"learning_rate": 1.4856816643373082e-06,
"loss": -0.9103,
"num_tokens": 39683090.0,
"residual_var": 0.04980470612645149,
"reward": 0.76953125,
"reward_std": 0.15532232820987701,
"rewards/drgrpo_math_reward/mean": 0.76953125,
"rewards/drgrpo_math_reward/std": 0.4219578504562378,
"rho2": 0.34374991059303284,
"step": 238
},
{
"advantages/mean": 1.280568540096283e-09,
"advantages/snr": 5.669318067864984e-09,
"advantages/std": 0.22587698698043823,
"advantages/var": 0.05102041324736106,
"completions/clipped_ratio": -2.75,
"epoch": 1.3724928366762177,
"grad_norm": 43.06595444024102,
"learning_rate": 1.4817536741017151e-06,
"loss": -0.0039,
"num_tokens": 39836659.0,
"residual_var": 0.03985970467329025,
"reward": 0.8125,
"reward_std": 0.09784172475337982,
"rewards/drgrpo_math_reward/mean": 0.8125,
"rewards/drgrpo_math_reward/std": 0.3910769522190094,
"rho2": 0.2187499701976776,
"step": 239
},
{
"advantages/mean": -9.313225746154785e-10,
"advantages/snr": 4.972694864437645e-09,
"advantages/std": 0.18728728592395782,
"advantages/var": 0.03507652746876233,
"completions/clipped_ratio": -2.96875,
"epoch": 1.3782234957020059,
"grad_norm": 48.141601296719074,
"learning_rate": 1.4778159803759156e-06,
"loss": -0.1759,
"num_tokens": 39974954.0,
"residual_var": 0.02849968895316124,
"reward": 0.87890625,
"reward_std": 0.07483352720737457,
"rewards/drgrpo_math_reward/mean": 0.87890625,
"rewards/drgrpo_math_reward/std": 0.3268752694129944,
"rho2": 0.1874999701976776,
"step": 240
},
{
"advantages/mean": 4.656612873077393e-10,
"advantages/snr": 1.9545560195055106e-09,
"advantages/std": 0.23824401199817657,
"advantages/var": 0.056760209252987304,
"completions/clipped_ratio": -2.75,
"epoch": 1.3839541547277938,
"grad_norm": 48.60932568929768,
"learning_rate": 1.4738686624729987e-06,
"loss": -0.2783,
"num_tokens": 40125785.0,
"residual_var": 0.04079641401767731,
"reward": 0.81640625,
"reward_std": 0.12297550588846207,
"rewards/drgrpo_math_reward/mean": 0.81640625,
"rewards/drgrpo_math_reward/std": 0.387910932302475,
"rho2": 0.2812499403953552,
"step": 241
},
{
"advantages/mean": -6.984919309616089e-10,
"advantages/snr": 3.6635067535109087e-09,
"advantages/std": 0.19066210091114044,
"advantages/var": 0.0363520367238499,
"completions/clipped_ratio": -2.84375,
"epoch": 1.3896848137535818,
"grad_norm": 62.29778777727231,
"learning_rate": 1.4699117998999054e-06,
"loss": -0.6888,
"num_tokens": 40284183.0,
"residual_var": 0.02840004302561283,
"reward": 0.76953125,
"reward_std": 0.08246467262506485,
"rewards/drgrpo_math_reward/mean": 0.76953125,
"rewards/drgrpo_math_reward/std": 0.4219578504562378,
"rho2": 0.21874994039535522,
"step": 242
},
{
"advantages/mean": 1.6298145055770874e-09,
"advantages/snr": 5.91612991187562e-09,
"advantages/std": 0.2754865884780884,
"advantages/var": 0.07589286043129562,
"completions/clipped_ratio": -2.859375,
"epoch": 1.3954154727793697,
"grad_norm": 64.60278694418795,
"learning_rate": 1.4659454723558246e-06,
"loss": -0.3987,
"num_tokens": 40437955.0,
"residual_var": 0.049804698675870895,
"reward": 0.79296875,
"reward_std": 0.15532232820987701,
"rewards/drgrpo_math_reward/mean": 0.79296875,
"rewards/drgrpo_math_reward/std": 0.40597182512283325,
"rho2": 0.3437499403953552,
"step": 243
},
{
"advantages/mean": 1.0477378964424133e-09,
"advantages/snr": 3.937895216689858e-09,
"advantages/std": 0.26606544852256775,
"advantages/var": 0.07079082289751515,
"completions/clipped_ratio": -2.828125,
"epoch": 1.4011461318051577,
"grad_norm": 96.87862275971443,
"learning_rate": 1.4619697597305899e-06,
"loss": -0.3258,
"num_tokens": 40588989.0,
"residual_var": 0.05309312418103218,
"reward": 0.86328125,
"reward_std": 0.13071896135807037,
"rewards/drgrpo_math_reward/mean": 0.86328125,
"rewards/drgrpo_math_reward/std": 0.34422317147254944,
"rho2": 0.24999994039535522,
"step": 244
},
{
"advantages/mean": -1.7462298274040222e-09,
"advantages/snr": 6.28610582583922e-09,
"advantages/std": 0.27779197692871094,
"advantages/var": 0.07716838244596147,
"completions/clipped_ratio": -2.90625,
"epoch": 1.4068767908309456,
"grad_norm": 53.57336970473912,
"learning_rate": 1.4579847421030676e-06,
"loss": -0.1779,
"num_tokens": 40729537.0,
"residual_var": 0.050641756504774094,
"reward": 0.87890625,
"reward_std": 0.15057817101478577,
"rewards/drgrpo_math_reward/mean": 0.87890625,
"rewards/drgrpo_math_reward/std": 0.3268752694129944,
"rho2": 0.3437499403953552,
"step": 245
},
{
"advantages/mean": 2.0954757928848267e-09,
"advantages/snr": 6.867192539799447e-09,
"advantages/std": 0.30514299869537354,
"advantages/var": 0.09311224965280473,
"completions/clipped_ratio": -2.75,
"epoch": 1.4126074498567336,
"grad_norm": 76.51726864020476,
"learning_rate": 1.4539904997395467e-06,
"loss": -0.1959,
"num_tokens": 40887865.0,
"residual_var": 0.055285416543483734,
"reward": 0.7421875,
"reward_std": 0.18596169352531433,
"rewards/drgrpo_math_reward/mean": 0.7421875,
"rewards/drgrpo_math_reward/std": 0.4382871091365814,
"rho2": 0.40624991059303284,
"step": 246
},
{
"advantages/mean": 4.656612873077393e-10,
"advantages/snr": 1.750175847901529e-09,
"advantages/std": 0.26606541872024536,
"advantages/var": 0.07079080703877949,
"completions/clipped_ratio": -2.8125,
"epoch": 1.4183381088825215,
"grad_norm": 36.59307165492083,
"learning_rate": 1.449987113092121e-06,
"loss": -0.3474,
"num_tokens": 41031181.0,
"residual_var": 0.05309312045574188,
"reward": 0.83984375,
"reward_std": 0.13717305660247803,
"rewards/drgrpo_math_reward/mean": 0.83984375,
"rewards/drgrpo_math_reward/std": 0.36746934056282043,
"rho2": 0.2499999701976776,
"step": 247
},
{
"advantages/mean": 1.3969838619232178e-09,
"advantages/snr": 5.675479987696761e-09,
"advantages/std": 0.24614372849464417,
"advantages/var": 0.0605867350772451,
"completions/clipped_ratio": -2.875,
"epoch": 1.4240687679083095,
"grad_norm": 68.05512469299691,
"learning_rate": 1.4459746627970684e-06,
"loss": -0.2955,
"num_tokens": 41184203.0,
"residual_var": 0.043546728789806366,
"reward": 0.81640625,
"reward_std": 0.1263904571533203,
"rewards/drgrpo_math_reward/mean": 0.81640625,
"rewards/drgrpo_math_reward/std": 0.387910932302475,
"rho2": 0.2812499403953552,
"step": 248
},
{
"advantages/mean": -9.313225746154785e-10,
"advantages/snr": 3.4539869611748964e-09,
"advantages/std": 0.26963695883750916,
"advantages/var": 0.0727040895711406,
"completions/clipped_ratio": -2.84375,
"epoch": 1.4297994269340975,
"grad_norm": 59.493421784362106,
"learning_rate": 1.4419532296732268e-06,
"loss": -1.7344,
"num_tokens": 41343611.0,
"residual_var": 0.047712069004774094,
"reward": 0.8125,
"reward_std": 0.14545084536075592,
"rewards/drgrpo_math_reward/mean": 0.8125,
"rewards/drgrpo_math_reward/std": 0.3910769522190094,
"rho2": 0.34374991059303284,
"step": 249
},
{
"advantages/mean": -2.3283064365386963e-10,
"advantages/snr": 8.997430734575406e-10,
"advantages/std": 0.2587745785713196,
"advantages/var": 0.06696428251476405,
"completions/clipped_ratio": -2.90625,
"epoch": 1.4355300859598854,
"grad_norm": 65.74542653499171,
"learning_rate": 1.4379228947203662e-06,
"loss": -0.2735,
"num_tokens": 41495636.0,
"residual_var": 0.048130594193935394,
"reward": 0.82421875,
"reward_std": 0.1332252472639084,
"rewards/drgrpo_math_reward/mean": 0.82421875,
"rewards/drgrpo_math_reward/std": 0.3813795745372772,
"rho2": 0.2812499403953552,
"step": 250
},
{
"advantages/mean": 1.1641532182693481e-09,
"advantages/snr": 4.28010223419274e-09,
"advantages/std": 0.27199190855026245,
"advantages/var": 0.07397959831681433,
"completions/clipped_ratio": -2.90625,
"epoch": 1.4412607449856734,
"grad_norm": 80.0417918069496,
"learning_rate": 1.433883739117558e-06,
"loss": -0.8315,
"num_tokens": 41634633.0,
"residual_var": 0.046237263828516006,
"reward": 0.828125,
"reward_std": 0.1530819833278656,
"rewards/drgrpo_math_reward/mean": 0.828125,
"rewards/drgrpo_math_reward/std": 0.3780108094215393,
"rho2": 0.37499991059303284,
"step": 251
},
{
"advantages/mean": 4.656612873077393e-10,
"advantages/snr": 2.158150412641125e-09,
"advantages/std": 0.21576867997646332,
"advantages/var": 0.04655612325878544,
"completions/clipped_ratio": -2.8125,
"epoch": 1.4469914040114613,
"grad_norm": 64.1441250560586,
"learning_rate": 1.4298358442215388e-06,
"loss": -0.2455,
"num_tokens": 41781254.0,
"residual_var": 0.03346222639083862,
"reward": 0.82421875,
"reward_std": 0.10627168416976929,
"rewards/drgrpo_math_reward/mean": 0.82421875,
"rewards/drgrpo_math_reward/std": 0.3813795745372772,
"rho2": 0.2812499403953552,
"step": 252
},
{
"advantages/mean": 2.3283064365386963e-09,
"advantages/snr": 7.33473836777306e-09,
"advantages/std": 0.31743550300598145,
"advantages/var": 0.10076529856866046,
"completions/clipped_ratio": -2.84375,
"epoch": 1.4527220630372493,
"grad_norm": 68.26001489650739,
"learning_rate": 1.4257792915650725e-06,
"loss": -0.7108,
"num_tokens": 41924107.0,
"residual_var": 0.06297832727432251,
"reward": 0.8828125,
"reward_std": 0.18687033653259277,
"rewards/drgrpo_math_reward/mean": 0.8828125,
"rewards/drgrpo_math_reward/std": 0.3222736418247223,
"rho2": 0.3749999403953552,
"step": 253
},
{
"advantages/mean": -8.149072527885437e-10,
"advantages/snr": 2.970572550223795e-09,
"advantages/std": 0.2743266522884369,
"advantages/var": 0.07525511215578096,
"completions/clipped_ratio": -2.890625,
"epoch": 1.4584527220630372,
"grad_norm": 46.95429888318211,
"learning_rate": 1.4217141628553076e-06,
"loss": -1.0476,
"num_tokens": 42074555.0,
"residual_var": 0.05408961698412895,
"reward": 0.8203125,
"reward_std": 0.1417675018310547,
"rewards/drgrpo_math_reward/mean": 0.8203125,
"rewards/drgrpo_math_reward/std": 0.38467901945114136,
"rho2": 0.2812499403953552,
"step": 254
},
{
"advantages/mean": 4.656612873077393e-10,
"advantages/snr": 3.616233635463797e-09,
"advantages/std": 0.1287696808576584,
"advantages/var": 0.016581630708183193,
"completions/clipped_ratio": -2.96875,
"epoch": 1.4641833810888252,
"grad_norm": 22.725984301890314,
"learning_rate": 1.417640539972131e-06,
"loss": 0.0385,
"num_tokens": 42202249.0,
"residual_var": 0.015027116052806377,
"reward": 0.90625,
"reward_std": 0.036563027650117874,
"rewards/drgrpo_math_reward/mean": 0.90625,
"rewards/drgrpo_math_reward/std": 0.2920515835285187,
"rho2": 0.0937499850988388,
"step": 255
},
{
"advantages/mean": 6.984919309616089e-10,
"advantages/snr": 2.982532443970627e-09,
"advantages/std": 0.23419423401355743,
"advantages/var": 0.0548469392451969,
"completions/clipped_ratio": -2.90625,
"epoch": 1.4699140401146131,
"grad_norm": 44.22865864057851,
"learning_rate": 1.4135585049665206e-06,
"loss": -0.1639,
"num_tokens": 42353064.0,
"residual_var": 0.042849186807870865,
"reward": 0.765625,
"reward_std": 0.10771076381206512,
"rewards/drgrpo_math_reward/mean": 0.765625,
"rewards/drgrpo_math_reward/std": 0.42443734407424927,
"rho2": 0.21874994039535522,
"step": 256
},
{
"advantages/mean": -1.0477378964424133e-09,
"advantages/snr": 4.0488433642653295e-09,
"advantages/std": 0.25877460837364197,
"advantages/var": 0.06696429793893177,
"completions/clipped_ratio": -2.90625,
"epoch": 1.475644699140401,
"grad_norm": 48.71604701518212,
"learning_rate": 1.4094681400588907e-06,
"loss": -0.6364,
"num_tokens": 42500406.0,
"residual_var": 0.050223227590322495,
"reward": 0.84765625,
"reward_std": 0.12730401754379272,
"rewards/drgrpo_math_reward/mean": 0.84765625,
"rewards/drgrpo_math_reward/std": 0.3600577116012573,
"rho2": 0.24999995529651642,
"step": 257
},
{
"advantages/mean": 8.149072527885437e-10,
"advantages/snr": 3.1050551359151326e-09,
"advantages/std": 0.262445330619812,
"advantages/var": 0.06887755156414244,
"completions/clipped_ratio": -2.765625,
"epoch": 1.481375358166189,
"grad_norm": 62.11118022578198,
"learning_rate": 1.405369527637436e-06,
"loss": 0.3678,
"num_tokens": 42648437.0,
"residual_var": 0.049505751579999924,
"reward": 0.765625,
"reward_std": 0.1349327266216278,
"rewards/drgrpo_math_reward/mean": 0.765625,
"rewards/drgrpo_math_reward/std": 0.42443734407424927,
"rho2": 0.2812499403953552,
"step": 258
},
{
"advantages/mean": -2.3283064365386963e-10,
"advantages/snr": 8.523543981158328e-10,
"advantages/std": 0.27316176891326904,
"advantages/var": 0.0746173519958262,
"completions/clipped_ratio": -2.78125,
"epoch": 1.487106017191977,
"grad_norm": 67.65573090786955,
"learning_rate": 1.4012627502564742e-06,
"loss": -1.1556,
"num_tokens": 42789425.0,
"residual_var": 0.05129944160580635,
"reward": 0.76953125,
"reward_std": 0.14769119024276733,
"rewards/drgrpo_math_reward/mean": 0.76953125,
"rewards/drgrpo_math_reward/std": 0.4219578504562378,
"rho2": 0.3124999403953552,
"step": 259
},
{
"advantages/mean": -1.0477378964424133e-09,
"advantages/snr": 3.696071523020088e-09,
"advantages/std": 0.2834733724594116,
"advantages/var": 0.08035715289351231,
"completions/clipped_ratio": -2.859375,
"epoch": 1.492836676217765,
"grad_norm": 62.192696949199856,
"learning_rate": 1.3971478906347805e-06,
"loss": -0.9984,
"num_tokens": 42957416.0,
"residual_var": 0.05022323131561279,
"reward": 0.75,
"reward_std": 0.16637086868286133,
"rewards/drgrpo_math_reward/mean": 0.75,
"rewards/drgrpo_math_reward/std": 0.4338609278202057,
"rho2": 0.37499988079071045,
"step": 260
},
{
"advantages/mean": 6.984919309616089e-10,
"advantages/snr": 2.9318340292582657e-09,
"advantages/std": 0.23824401199817657,
"advantages/var": 0.056760209252987304,
"completions/clipped_ratio": -2.890625,
"epoch": 1.498567335243553,
"grad_norm": 33.8876635168633,
"learning_rate": 1.3930250316539235e-06,
"loss": -0.0272,
"num_tokens": 43096508.0,
"residual_var": 0.04434392228722572,
"reward": 0.87890625,
"reward_std": 0.115872323513031,
"rewards/drgrpo_math_reward/mean": 0.87890625,
"rewards/drgrpo_math_reward/std": 0.3268752694129944,
"rho2": 0.21874995529651642,
"step": 261
},
{
"advantages/mean": 1.1641532182693481e-09,
"advantages/snr": 3.714691764834188e-09,
"advantages/std": 0.3133915960788727,
"advantages/var": 0.09821429249286329,
"completions/clipped_ratio": -2.734375,
"epoch": 1.5042979942693409,
"grad_norm": 56.81531691666041,
"learning_rate": 1.3888942563565948e-06,
"loss": 0.0313,
"num_tokens": 43261550.0,
"residual_var": 0.05524555593729019,
"reward": 0.796875,
"reward_std": 0.19700777530670166,
"rewards/drgrpo_math_reward/mean": 0.796875,
"rewards/drgrpo_math_reward/std": 0.40311288833618164,
"rho2": 0.43749988079071045,
"step": 262
},
{
"advantages/mean": 5.820766091346741e-10,
"advantages/snr": 1.7677820123739347e-09,
"advantages/std": 0.3292694389820099,
"advantages/var": 0.10841836344752753,
"completions/clipped_ratio": -2.828125,
"epoch": 1.5100286532951288,
"grad_norm": 49.90809038754078,
"learning_rate": 1.384755647944936e-06,
"loss": -0.3604,
"num_tokens": 43411190.0,
"residual_var": 0.0643734261393547,
"reward": 0.8046875,
"reward_std": 0.20779037475585938,
"rewards/drgrpo_math_reward/mean": 0.8046875,
"rewards/drgrpo_math_reward/std": 0.39721766114234924,
"rho2": 0.40624991059303284,
"step": 263
},
{
"advantages/mean": 1.0477378964424133e-09,
"advantages/snr": 5.64584279348239e-09,
"advantages/std": 0.18557687103748322,
"advantages/var": 0.03443877506406268,
"completions/clipped_ratio": -2.84375,
"epoch": 1.5157593123209168,
"grad_norm": 72.71898678230077,
"learning_rate": 1.3806092897788643e-06,
"loss": -0.2531,
"num_tokens": 43553476.0,
"residual_var": 0.026905305683612823,
"reward": 0.859375,
"reward_std": 0.08075720071792603,
"rewards/drgrpo_math_reward/mean": 0.859375,
"rewards/drgrpo_math_reward/std": 0.3483152687549591,
"rho2": 0.21874995529651642,
"step": 264
},
{
"advantages/mean": 2.3283064365386963e-10,
"advantages/snr": 1.2910053593143641e-09,
"advantages/std": 0.18034830689430237,
"advantages/var": 0.03252551179964147,
"completions/clipped_ratio": -2.953125,
"epoch": 1.5214899713467047,
"grad_norm": 29.822109608396552,
"learning_rate": 1.3764552653743919e-06,
"loss": -0.3537,
"num_tokens": 43701138.0,
"residual_var": 0.027443410828709602,
"reward": 0.85546875,
"reward_std": 0.07194654643535614,
"rewards/drgrpo_math_reward/mean": 0.85546875,
"rewards/drgrpo_math_reward/std": 0.35231640934944153,
"rho2": 0.1562499701976776,
"step": 265
},
{
"advantages/mean": 2.3283064365386963e-10,
"advantages/snr": 1.1348570685336994e-09,
"advantages/std": 0.20516295731067657,
"advantages/var": 0.0420918390524625,
"completions/clipped_ratio": -2.859375,
"epoch": 1.5272206303724927,
"grad_norm": 59.49985738560282,
"learning_rate": 1.3722936584019451e-06,
"loss": -0.2906,
"num_tokens": 43851563.0,
"residual_var": 0.03551499918103218,
"reward": 0.828125,
"reward_std": 0.08219873160123825,
"rewards/drgrpo_math_reward/mean": 0.828125,
"rewards/drgrpo_math_reward/std": 0.3780108094215393,
"rho2": 0.1562499701976776,
"step": 266
},
{
"advantages/mean": 2.3283064365386963e-10,
"advantages/snr": 8.560204468385481e-10,
"advantages/std": 0.27199190855026245,
"advantages/var": 0.07397959831681433,
"completions/clipped_ratio": -2.890625,
"epoch": 1.5329512893982808,
"grad_norm": 46.34469421593609,
"learning_rate": 1.3681245526846781e-06,
"loss": -0.4533,
"num_tokens": 43987630.0,
"residual_var": 0.060108426958322525,
"reward": 0.8125,
"reward_std": 0.133487269282341,
"rewards/drgrpo_math_reward/mean": 0.8125,
"rewards/drgrpo_math_reward/std": 0.3910769522190094,
"rho2": 0.1874999701976776,
"step": 267
},
{
"advantages/mean": 6.984919309616089e-10,
"advantages/snr": 3.1726896131954143e-09,
"advantages/std": 0.220157653093338,
"advantages/var": 0.048469392215566565,
"completions/clipped_ratio": -2.8125,
"epoch": 1.5386819484240688,
"grad_norm": 39.76207826773463,
"learning_rate": 1.3639480321967845e-06,
"loss": 0.1273,
"num_tokens": 44132623.0,
"residual_var": 0.03938139230012894,
"reward": 0.8359375,
"reward_std": 0.09495474398136139,
"rewards/drgrpo_math_reward/mean": 0.8359375,
"rewards/drgrpo_math_reward/std": 0.3710577189922333,
"rho2": 0.1874999701976776,
"step": 268
},
{
"advantages/mean": 9.313225746154785e-10,
"advantages/snr": 4.287037337628827e-09,
"advantages/std": 0.21724152565002441,
"advantages/var": 0.047193880466750215,
"completions/clipped_ratio": -2.953125,
"epoch": 1.5444126074498568,
"grad_norm": 35.89001157588973,
"learning_rate": 1.359764181061807e-06,
"loss": -0.2152,
"num_tokens": 44277413.0,
"residual_var": 0.03834503889083862,
"reward": 0.84375,
"reward_std": 0.08679073303937912,
"rewards/drgrpo_math_reward/mean": 0.84375,
"rewards/drgrpo_math_reward/std": 0.3638034462928772,
"rho2": 0.18749994039535522,
"step": 269
},
{
"advantages/mean": -6.984919309616089e-10,
"advantages/snr": 2.6992289095102196e-09,
"advantages/std": 0.25877460837364197,
"advantages/var": 0.06696429793893177,
"completions/clipped_ratio": -2.84375,
"epoch": 1.5501432664756447,
"grad_norm": 53.21944819552593,
"learning_rate": 1.3555730835509419e-06,
"loss": -0.1514,
"num_tokens": 44429618.0,
"residual_var": 0.05022323504090309,
"reward": 0.79296875,
"reward_std": 0.12730401754379272,
"rewards/drgrpo_math_reward/mean": 0.79296875,
"rewards/drgrpo_math_reward/std": 0.40597182512283325,
"rho2": 0.24999995529651642,
"step": 270
},
{
"advantages/mean": -1.3969838619232178e-09,
"advantages/snr": 4.79665589833252e-09,
"advantages/std": 0.29124119877815247,
"advantages/var": 0.08482143586573532,
"completions/clipped_ratio": -2.84375,
"epoch": 1.5558739255014327,
"grad_norm": 61.803069074199236,
"learning_rate": 1.3513748240813427e-06,
"loss": 0.1271,
"num_tokens": 44583681.0,
"residual_var": 0.05301341041922569,
"reward": 0.79296875,
"reward_std": 0.16504409909248352,
"rewards/drgrpo_math_reward/mean": 0.79296875,
"rewards/drgrpo_math_reward/std": 0.40597182512283325,
"rho2": 0.3749999403953552,
"step": 271
},
{
"advantages/mean": 0.0,
"advantages/snr": 0.0,
"advantages/std": 0.21576867997646332,
"advantages/var": 0.04655612325878544,
"completions/clipped_ratio": -2.921875,
"epoch": 1.5616045845272206,
"grad_norm": 34.187999091685725,
"learning_rate": 1.3471694872144185e-06,
"loss": -0.2637,
"num_tokens": 44727554.0,
"residual_var": 0.03346222639083862,
"reward": 0.86328125,
"reward_std": 0.10627167671918869,
"rewards/drgrpo_math_reward/mean": 0.86328125,
"rewards/drgrpo_math_reward/std": 0.34422317147254944,
"rho2": 0.2812499403953552,
"step": 272
},
{
"advantages/mean": 4.656612873077393e-10,
"advantages/snr": 2.1730858844377734e-09,
"advantages/std": 0.2142857164144516,
"advantages/var": 0.04591836825925477,
"completions/clipped_ratio": -2.9375,
"epoch": 1.5673352435530086,
"grad_norm": 48.52107737550291,
"learning_rate": 1.3429571576541314e-06,
"loss": -1.3899,
"num_tokens": 44876865.0,
"residual_var": 0.035873737186193466,
"reward": 0.875,
"reward_std": 0.09271685779094696,
"rewards/drgrpo_math_reward/mean": 0.875,
"rewards/drgrpo_math_reward/std": 0.33136674761772156,
"rho2": 0.2187499701976776,
"step": 273
},
{
"advantages/mean": 0.0,
"advantages/snr": 0.0,
"advantages/std": 0.22728431224822998,
"advantages/var": 0.051658158594150905,
"completions/clipped_ratio": -2.8125,
"epoch": 1.5730659025787965,
"grad_norm": 54.31265956931611,
"learning_rate": 1.3387379202452915e-06,
"loss": -0.7988,
"num_tokens": 45039696.0,
"residual_var": 0.037129323929548264,
"reward": 0.80078125,
"reward_std": 0.11139655113220215,
"rewards/drgrpo_math_reward/mean": 0.80078125,
"rewards/drgrpo_math_reward/std": 0.40019527077674866,
"rho2": 0.2812499403953552,
"step": 274
},
{
"advantages/mean": 0.0,
"advantages/snr": 0.0,
"advantages/std": 0.24222607910633087,
"advantages/var": 0.05867347339922646,
"completions/clipped_ratio": -2.75,
"epoch": 1.5787965616045845,
"grad_norm": 48.08288648494765,
"learning_rate": 1.3345118599718454e-06,
"loss": -0.2865,
"num_tokens": 45201572.0,
"residual_var": 0.04217156767845154,
"reward": 0.7578125,
"reward_std": 0.11822889000177383,
"rewards/drgrpo_math_reward/mean": 0.7578125,
"rewards/drgrpo_math_reward/std": 0.4292463958263397,
"rho2": 0.2812499403953552,
"step": 275
},
{
"advantages/mean": -9.313225746154785e-10,
"advantages/snr": 2.6545718234372027e-09,
"advantages/std": 0.35083720088005066,
"advantages/var": 0.12308674152134902,
"completions/clipped_ratio": -2.859375,
"epoch": 1.5845272206303727,
"grad_norm": 55.04932654737737,
"learning_rate": 1.3302790619551672e-06,
"loss": -0.1071,
"num_tokens": 45370041.0,
"residual_var": 0.06923630088567734,
"reward": 0.80859375,
"reward_std": 0.2290886491537094,
"rewards/drgrpo_math_reward/mean": 0.80859375,
"rewards/drgrpo_math_reward/std": 0.39417871832847595,
"rho2": 0.4374999403953552,
"step": 276
},
{
"advantages/mean": 0.0,
"advantages/snr": 0.0,
"advantages/std": 0.22587698698043823,
"advantages/var": 0.05102041324736106,
"completions/clipped_ratio": -2.84375,
"epoch": 1.5902578796561606,
"grad_norm": 42.25464681628162,
"learning_rate": 1.3260396114523417e-06,
"loss": -0.4081,
"num_tokens": 45526026.0,
"residual_var": 0.04304848238825798,
"reward": 0.75,
"reward_std": 0.09073854237794876,
"rewards/drgrpo_math_reward/mean": 0.75,
"rewards/drgrpo_math_reward/std": 0.4338609278202057,
"rho2": 0.15624995529651642,
"step": 277
},
{
"advantages/mean": -8.149072527885437e-10,
"advantages/snr": 2.983240393405415e-09,
"advantages/std": 0.27316176891326904,
"advantages/var": 0.0746173519958262,
"completions/clipped_ratio": -2.890625,
"epoch": 1.5959885386819486,
"grad_norm": 46.40080500539711,
"learning_rate": 1.3217935938544495e-06,
"loss": -0.1385,
"num_tokens": 45672381.0,
"residual_var": 0.058294814079999924,
"reward": 0.83984375,
"reward_std": 0.1346667855978012,
"rewards/drgrpo_math_reward/mean": 0.83984375,
"rewards/drgrpo_math_reward/std": 0.36746934056282043,
"rho2": 0.2187499701976776,
"step": 278
},
{
"advantages/mean": -6.984919309616089e-10,
"advantages/snr": 2.504116276874765e-09,
"advantages/std": 0.2789374887943268,
"advantages/var": 0.07780612265488518,
"completions/clipped_ratio": -2.859375,
"epoch": 1.6017191977077365,
"grad_norm": 52.843715111367665,
"learning_rate": 1.3175410946848444e-06,
"loss": -0.3251,
"num_tokens": 45821622.0,
"residual_var": 0.046197403222322464,
"reward": 0.8203125,
"reward_std": 0.1641329824924469,
"rewards/drgrpo_math_reward/mean": 0.8203125,
"rewards/drgrpo_math_reward/std": 0.38467901945114136,
"rho2": 0.40624991059303284,
"step": 279
},
{
"advantages/mean": -5.820766091346741e-10,
"advantages/snr": 2.3773285286252477e-09,
"advantages/std": 0.24484482407569885,
"advantages/var": 0.05994898787665992,
"completions/clipped_ratio": -2.875,
"epoch": 1.6074498567335245,
"grad_norm": 46.228824312012414,
"learning_rate": 1.3132821995974326e-06,
"loss": 0.2804,
"num_tokens": 45973378.0,
"residual_var": 0.048708558082580566,
"reward": 0.875,
"reward_std": 0.10691440105438232,
"rewards/drgrpo_math_reward/mean": 0.875,
"rewards/drgrpo_math_reward/std": 0.33136674761772156,
"rho2": 0.1874999701976776,
"step": 280
},
{
"advantages/mean": -2.3283064365386963e-10,
"advantages/snr": 8.523543981158328e-10,
"advantages/std": 0.27316176891326904,
"advantages/var": 0.0746173519958262,
"completions/clipped_ratio": -2.875,
"epoch": 1.6131805157593124,
"grad_norm": 55.11002748041064,
"learning_rate": 1.3090169943749473e-06,
"loss": -0.463,
"num_tokens": 46117341.0,
"residual_var": 0.05129943788051605,
"reward": 0.84765625,
"reward_std": 0.14123709499835968,
"rewards/drgrpo_math_reward/mean": 0.84765625,
"rewards/drgrpo_math_reward/std": 0.3600577116012573,
"rho2": 0.3124999403953552,
"step": 281
},
{
"advantages/mean": 4.656612873077393e-10,
"advantages/snr": 1.9329581251860285e-09,
"advantages/std": 0.2409060299396515,
"advantages/var": 0.05803571526128426,
"completions/clipped_ratio": -2.84375,
"epoch": 1.6189111747851004,
"grad_norm": 46.93471833357915,
"learning_rate": 1.3047455649272206e-06,
"loss": -0.504,
"num_tokens": 46278368.0,
"residual_var": 0.0453404076397419,
"reward": 0.82421875,
"reward_std": 0.11112816631793976,
"rewards/drgrpo_math_reward/mean": 0.82421875,
"rewards/drgrpo_math_reward/std": 0.3813795745372772,
"rho2": 0.2187499701976776,
"step": 282
},
{
"advantages/mean": 1.862645149230957e-09,
"advantages/snr": 7.303029407665864e-09,
"advantages/std": 0.25505101680755615,
"advantages/var": 0.06505102117456829,
"completions/clipped_ratio": -2.9375,
"epoch": 1.6246418338108883,
"grad_norm": 42.613912161685086,
"learning_rate": 1.3004679972894518e-06,
"loss": -0.4131,
"num_tokens": 46421038.0,
"residual_var": 0.050821125507354736,
"reward": 0.796875,
"reward_std": 0.1244145929813385,
"rewards/drgrpo_math_reward/mean": 0.796875,
"rewards/drgrpo_math_reward/std": 0.40311288833618164,
"rho2": 0.21874995529651642,
"step": 283
},
{
"advantages/mean": 1.5133991837501526e-09,
"advantages/snr": 5.766530966699532e-09,
"advantages/std": 0.262445330619812,
"advantages/var": 0.06887755156414244,
"completions/clipped_ratio": -2.8125,
"epoch": 1.6303724928366763,
"grad_norm": 55.21201691395928,
"learning_rate": 1.2961843776204776e-06,
"loss": -0.2435,
"num_tokens": 46580804.0,
"residual_var": 0.045200902968645096,
"reward": 0.8125,
"reward_std": 0.14203590154647827,
"rewards/drgrpo_math_reward/mean": 0.8125,
"rewards/drgrpo_math_reward/std": 0.3910769522190094,
"rho2": 0.3437499403953552,
"step": 284
},
{
"advantages/mean": -1.3969838619232178e-09,
"advantages/snr": 5.675479987696761e-09,
"advantages/std": 0.24614372849464417,
"advantages/var": 0.0605867350772451,
"completions/clipped_ratio": -2.875,
"epoch": 1.6361031518624642,
"grad_norm": 50.79647099010151,
"learning_rate": 1.2918947922010336e-06,
"loss": -0.1687,
"num_tokens": 46743701.0,
"residual_var": 0.045440055429935455,
"reward": 0.83984375,
"reward_std": 0.12046923488378525,
"rewards/drgrpo_math_reward/mean": 0.83984375,
"rewards/drgrpo_math_reward/std": 0.36746934056282043,
"rho2": 0.2499999701976776,
"step": 285
},
{
"advantages/mean": 6.984919309616089e-10,
"advantages/snr": 2.9155005520288713e-09,
"advantages/std": 0.2395787239074707,
"advantages/var": 0.057397964949132074,
"completions/clipped_ratio": -2.765625,
"epoch": 1.6418338108882522,
"grad_norm": 74.3888925274376,
"learning_rate": 1.2875993274320173e-06,
"loss": -0.9076,
"num_tokens": 46908808.0,
"residual_var": 0.04663585498929024,
"reward": 0.828125,
"reward_std": 0.10994865000247955,
"rewards/drgrpo_math_reward/mean": 0.828125,
"rewards/drgrpo_math_reward/std": 0.3780108094215393,
"rho2": 0.18749995529651642,
"step": 286
},
{
"advantages/mean": -2.3283064365386963e-10,
"advantages/snr": 1.126356132424828e-09,
"advantages/std": 0.20671138167381287,
"advantages/var": 0.04272959531349674,
"completions/clipped_ratio": -2.828125,
"epoch": 1.6475644699140402,
"grad_norm": 49.41706074564681,
"learning_rate": 1.2832980698327494e-06,
"loss": -0.1523,
"num_tokens": 47070033.0,
"residual_var": 0.03338250517845154,
"reward": 0.74609375,
"reward_std": 0.08929947018623352,
"rewards/drgrpo_math_reward/mean": 0.74609375,
"rewards/drgrpo_math_reward/std": 0.4360972046852112,
"rho2": 0.21874995529651642,
"step": 287
},
{
"advantages/mean": 0.0,
"advantages/snr": 0.0,
"advantages/std": 0.20044593513011932,
"advantages/var": 0.040178572910188004,
"completions/clipped_ratio": -2.984375,
"epoch": 1.653295128939828,
"grad_norm": 45.268262312968005,
"learning_rate": 1.2789911060392294e-06,
"loss": -0.4863,
"num_tokens": 47209548.0,
"residual_var": 0.032645102590322495,
"reward": 0.89453125,
"reward_std": 0.08641248196363449,
"rewards/drgrpo_math_reward/mean": 0.89453125,
"rewards/drgrpo_math_reward/std": 0.3077581524848938,
"rho2": 0.1874999701976776,
"step": 288
},
{
"advantages/mean": 0.0,
"advantages/snr": 0.0,
"advantages/std": 0.2409060299396515,
"advantages/var": 0.05803571526128426,
"completions/clipped_ratio": -2.875,
"epoch": 1.659025787965616,
"grad_norm": 72.12409051313657,
"learning_rate": 1.2746785228023901e-06,
"loss": -0.2533,
"num_tokens": 47355458.0,
"residual_var": 0.04534041881561279,
"reward": 0.84765625,
"reward_std": 0.11112815141677856,
"rewards/drgrpo_math_reward/mean": 0.84765625,
"rewards/drgrpo_math_reward/std": 0.3600577116012573,
"rho2": 0.21874995529651642,
"step": 289
},
{
"advantages/mean": -6.984919309616089e-10,
"advantages/snr": 2.4073954360621055e-09,
"advantages/std": 0.2901442348957062,
"advantages/var": 0.08418367704321472,
"completions/clipped_ratio": -2.828125,
"epoch": 1.664756446991404,
"grad_norm": 81.65094517437481,
"learning_rate": 1.2703604069863528e-06,
"loss": -0.5033,
"num_tokens": 47508301.0,
"residual_var": 0.06313777714967728,
"reward": 0.6796875,
"reward_std": 0.15729427337646484,
"rewards/drgrpo_math_reward/mean": 0.6796875,
"rewards/drgrpo_math_reward/std": 0.4675106406211853,
"rho2": 0.24999995529651642,
"step": 290
},
{
"advantages/mean": -2.3283064365386963e-10,
"advantages/snr": 9.828150668068317e-10,
"advantages/std": 0.23690177500247955,
"advantages/var": 0.056122450999325446,
"completions/clipped_ratio": -2.84375,
"epoch": 1.670487106017192,
"grad_norm": 40.0774805833318,
"learning_rate": 1.266036845566675e-06,
"loss": -0.1413,
"num_tokens": 47662898.0,
"residual_var": 0.04384567216038704,
"reward": 0.78125,
"reward_std": 0.10942068696022034,
"rewards/drgrpo_math_reward/mean": 0.78125,
"rewards/drgrpo_math_reward/std": 0.41420844197273254,
"rho2": 0.2187499701976776,
"step": 291
},
{
"advantages/mean": 1.1641532182693481e-10,
"advantages/snr": 5.807816254470197e-10,
"advantages/std": 0.20044593513011932,
"advantages/var": 0.040178572910188004,
"completions/clipped_ratio": -2.96875,
"epoch": 1.67621776504298,
"grad_norm": 61.83270346354945,
"learning_rate": 1.2617079256286e-06,
"loss": 0.3358,
"num_tokens": 47801263.0,
"residual_var": 0.032645102590322495,
"reward": 0.83984375,
"reward_std": 0.07995839416980743,
"rewards/drgrpo_math_reward/mean": 0.83984375,
"rewards/drgrpo_math_reward/std": 0.36746934056282043,
"rho2": 0.1874999701976776,
"step": 292
},
{
"advantages/mean": -9.313225746154785e-10,
"advantages/snr": 4.439648622792677e-09,
"advantages/std": 0.2097739279270172,
"advantages/var": 0.04400510083792941,
"completions/clipped_ratio": -2.890625,
"epoch": 1.6819484240687679,
"grad_norm": 115.93567638341011,
"learning_rate": 1.2573737343653023e-06,
"loss": -0.8953,
"num_tokens": 47949713.0,
"residual_var": 0.03300383687019348,
"reward": 0.86328125,
"reward_std": 0.0969306081533432,
"rewards/drgrpo_math_reward/mean": 0.86328125,
"rewards/drgrpo_math_reward/std": 0.34422317147254944,
"rho2": 0.24999994039535522,
"step": 293
},
{
"advantages/mean": 4.656612873077393e-10,
"advantages/snr": 1.9768946035727382e-09,
"advantages/std": 0.2355518937110901,
"advantages/var": 0.05548469463088068,
"completions/clipped_ratio": -2.9375,
"epoch": 1.6876790830945558,
"grad_norm": 62.65175714195031,
"learning_rate": 1.2530343590761317e-06,
"loss": -0.4164,
"num_tokens": 48098928.0,
"residual_var": 0.04508132487535477,
"reward": 0.83984375,
"reward_std": 0.10824117064476013,
"rewards/drgrpo_math_reward/mean": 0.83984375,
"rewards/drgrpo_math_reward/std": 0.36746934056282043,
"rho2": 0.1874999701976776,
"step": 294
},
{
"advantages/mean": -8.149072527885437e-10,
"advantages/snr": 2.8301503045057226e-09,
"advantages/std": 0.28793779015541077,
"advantages/var": 0.08290817099958137,
"completions/clipped_ratio": -2.953125,
"epoch": 1.6934097421203438,
"grad_norm": 75.52026681703008,
"learning_rate": 1.2486898871648551e-06,
"loss": -0.2693,
"num_tokens": 48232546.0,
"residual_var": 0.05699937418103218,
"reward": 0.8671875,
"reward_std": 0.15623344480991364,
"rewards/drgrpo_math_reward/mean": 0.8671875,
"rewards/drgrpo_math_reward/std": 0.3400367796421051,
"rho2": 0.3124999403953552,
"step": 295
},
{
"advantages/mean": -9.313225746154785e-10,
"advantages/snr": 3.6695466564034195e-09,
"advantages/std": 0.253797709941864,
"advantages/var": 0.06441327757173454,
"completions/clipped_ratio": -2.859375,
"epoch": 1.6991404011461317,
"grad_norm": 62.035253493008476,
"learning_rate": 1.244340406137894e-06,
"loss": -0.5796,
"num_tokens": 48387262.0,
"residual_var": 0.04428413510322571,
"reward": 0.80859375,
"reward_std": 0.1374414563179016,
"rewards/drgrpo_math_reward/mean": 0.80859375,
"rewards/drgrpo_math_reward/std": 0.39417871832847595,
"rho2": 0.3124999403953552,
"step": 296
},
{
"advantages/mean": 1.1641532182693481e-09,
"advantages/snr": 4.806060414732269e-09,
"advantages/std": 0.24222607910633087,
"advantages/var": 0.05867347339922646,
"completions/clipped_ratio": -2.90625,
"epoch": 1.7048710601719197,
"grad_norm": 46.16600507678362,
"learning_rate": 1.2399860036025658e-06,
"loss": 0.1704,
"num_tokens": 48517720.0,
"residual_var": 0.04217156767845154,
"reward": 0.8671875,
"reward_std": 0.12468297779560089,
"rewards/drgrpo_math_reward/mean": 0.8671875,
"rewards/drgrpo_math_reward/std": 0.3400367796421051,
"rho2": 0.2812499403953552,
"step": 297
},
{
"advantages/mean": -1.6298145055770874e-09,
"advantages/snr": 8.702216012765879e-09,
"advantages/std": 0.18728728592395782,
"advantages/var": 0.03507652746876233,
"completions/clipped_ratio": -2.84375,
"epoch": 1.7106017191977076,
"grad_norm": 43.965060130880126,
"learning_rate": 1.235626767265316e-06,
"loss": -0.1406,
"num_tokens": 48671527.0,
"residual_var": 0.028499694541096687,
"reward": 0.87109375,
"reward_std": 0.07483352720737457,
"rewards/drgrpo_math_reward/mean": 0.87109375,
"rewards/drgrpo_math_reward/std": 0.33575257658958435,
"rho2": 0.18749994039535522,
"step": 298
},
{
"advantages/mean": -1.6298145055770874e-09,
"advantages/snr": 7.769385089887185e-09,
"advantages/std": 0.2097739279270172,
"advantages/var": 0.04400510083792941,
"completions/clipped_ratio": -2.90625,
"epoch": 1.7163323782234956,
"grad_norm": 47.8375889530657,
"learning_rate": 1.2312627849299522e-06,
"loss": 0.0195,
"num_tokens": 48805831.0,
"residual_var": 0.03712931647896767,
"reward": 0.85546875,
"reward_std": 0.08390620350837708,
"rewards/drgrpo_math_reward/mean": 0.85546875,
"rewards/drgrpo_math_reward/std": 0.35231640934944153,
"rho2": 0.1562499701976776,
"step": 299
},
{
"advantages/mean": -2.3283064365386963e-10,
"advantages/snr": 9.36110830002126e-10,
"advantages/std": 0.24872122704982758,
"advantages/var": 0.06186224878517188,
"completions/clipped_ratio": -2.75,
"epoch": 1.7220630372492836,
"grad_norm": 99.10893071959802,
"learning_rate": 1.2268941444958764e-06,
"loss": -0.8685,
"num_tokens": 48949622.0,
"residual_var": 0.04446350038051605,
"reward": 0.79296875,
"reward_std": 0.12164628505706787,
"rewards/drgrpo_math_reward/mean": 0.79296875,
"rewards/drgrpo_math_reward/std": 0.40597182512283325,
"rho2": 0.2812499403953552,
"step": 300
},
{
"advantages/mean": 1.3969838619232178e-09,
"advantages/snr": 4.59388789526886e-09,
"advantages/std": 0.30409619212150574,
"advantages/var": 0.09247449406279973,
"completions/clipped_ratio": -2.9375,
"epoch": 1.7277936962750715,
"grad_norm": 44.15970985324081,
"learning_rate": 1.2225209339563143e-06,
"loss": -0.5217,
"num_tokens": 49094796.0,
"residual_var": 0.057796575129032135,
"reward": 0.85546875,
"reward_std": 0.17832809686660767,
"rewards/drgrpo_math_reward/mean": 0.85546875,
"rewards/drgrpo_math_reward/std": 0.35231640934944153,
"rho2": 0.37499991059303284,
"step": 301
},
{
"advantages/mean": 1.3969838619232178e-09,
"advantages/snr": 5.645842642869884e-09,
"advantages/std": 0.24743583798408508,
"advantages/var": 0.0612244939188864,
"completions/clipped_ratio": -2.875,
"epoch": 1.7335243553008595,
"grad_norm": 50.98950387379492,
"learning_rate": 1.2181432413965426e-06,
"loss": -0.2709,
"num_tokens": 49228796.0,
"residual_var": 0.045918382704257965,
"reward": 0.890625,
"reward_std": 0.12046678364276886,
"rewards/drgrpo_math_reward/mean": 0.890625,
"rewards/drgrpo_math_reward/std": 0.31272050738334656,
"rho2": 0.24999994039535522,
"step": 302
},
{
"advantages/mean": 6.984919309616089e-10,
"advantages/snr": 2.4838397187113485e-09,
"advantages/std": 0.28121456503868103,
"advantages/var": 0.07908163158989456,
"completions/clipped_ratio": -2.859375,
"epoch": 1.7392550143266474,
"grad_norm": 67.15690274376962,
"learning_rate": 1.2137611549921145e-06,
"loss": -0.1553,
"num_tokens": 49380594.0,
"residual_var": 0.051897335797548294,
"reward": 0.8046875,
"reward_std": 0.15873973071575165,
"rewards/drgrpo_math_reward/mean": 0.8046875,
"rewards/drgrpo_math_reward/std": 0.39721766114234924,
"rho2": 0.3437499403953552,
"step": 303
},
{
"advantages/mean": -2.3283064365386963e-10,
"advantages/snr": 9.084364294103083e-10,
"advantages/std": 0.2562982141971588,
"advantages/var": 0.0656887746006527,
"completions/clipped_ratio": -2.921875,
"epoch": 1.7449856733524354,
"grad_norm": 49.955242749828415,
"learning_rate": 1.2093747630070842e-06,
"loss": -0.7495,
"num_tokens": 49529014.0,
"residual_var": 0.04721382260322571,
"reward": 0.81640625,
"reward_std": 0.13151532411575317,
"rewards/drgrpo_math_reward/mean": 0.81640625,
"rewards/drgrpo_math_reward/std": 0.387910932302475,
"rho2": 0.2812499403953552,
"step": 304
},
{
"advantages/mean": -1.6298145055770874e-09,
"advantages/snr": 6.0181390962042404e-09,
"advantages/std": 0.27081701159477234,
"advantages/var": 0.07334185376912306,
"completions/clipped_ratio": -2.671875,
"epoch": 1.7507163323782235,
"grad_norm": 60.730178677804794,
"learning_rate": 1.2049841537922305e-06,
"loss": 0.0502,
"num_tokens": 49679831.0,
"residual_var": 0.05271446332335472,
"reward": 0.76171875,
"reward_std": 0.1465141326189041,
"rewards/drgrpo_math_reward/mean": 0.76171875,
"rewards/drgrpo_math_reward/std": 0.4268665909767151,
"rho2": 0.2812499403953552,
"step": 305
},
{
"advantages/mean": -4.656612873077393e-10,
"advantages/snr": 1.7194683132012115e-09,
"advantages/std": 0.27081701159477234,
"advantages/var": 0.07334185376912306,
"completions/clipped_ratio": -2.90625,
"epoch": 1.7564469914040115,
"grad_norm": 59.49589899349533,
"learning_rate": 1.2005894157832728e-06,
"loss": -0.8913,
"num_tokens": 49828981.0,
"residual_var": 0.048130594193935394,
"reward": 0.85546875,
"reward_std": 0.14716322720050812,
"rewards/drgrpo_math_reward/mean": 0.85546875,
"rewards/drgrpo_math_reward/std": 0.35231640934944153,
"rho2": 0.3437499403953552,
"step": 306
},
{
"advantages/mean": -2.3283064365386963e-10,
"advantages/snr": 9.219622438173128e-10,
"advantages/std": 0.25253814458847046,
"advantages/var": 0.06377551447218721,
"completions/clipped_ratio": -2.828125,
"epoch": 1.7621776504297995,
"grad_norm": 55.76930921253595,
"learning_rate": 1.196190637499095e-06,
"loss": -1.2713,
"num_tokens": 49971360.0,
"residual_var": 0.04583866149187088,
"reward": 0.890625,
"reward_std": 0.12980785965919495,
"rewards/drgrpo_math_reward/mean": 0.890625,
"rewards/drgrpo_math_reward/std": 0.31272050738334656,
"rho2": 0.2812499403953552,
"step": 307
},
{
"advantages/mean": -5.820766091346741e-10,
"advantages/snr": 1.8882523284691135e-09,
"advantages/std": 0.30826207995414734,
"advantages/var": 0.09502550993765713,
"completions/clipped_ratio": -2.890625,
"epoch": 1.7679083094555874,
"grad_norm": 55.11636955178904,
"learning_rate": 1.19178790753996e-06,
"loss": -2.6177,
"num_tokens": 50126326.0,
"residual_var": 0.05939095839858055,
"reward": 0.81640625,
"reward_std": 0.18820202350616455,
"rewards/drgrpo_math_reward/mean": 0.81640625,
"rewards/drgrpo_math_reward/std": 0.387910932302475,
"rho2": 0.3749999403953552,
"step": 308
},
{
"advantages/mean": 1.5133991837501526e-09,
"advantages/snr": 6.352307063392909e-09,
"advantages/std": 0.23824401199817657,
"advantages/var": 0.056760209252987304,
"completions/clipped_ratio": -2.921875,
"epoch": 1.7736389684813754,
"grad_norm": 76.34588377569375,
"learning_rate": 1.1873813145857248e-06,
"loss": -0.0001,
"num_tokens": 50260527.0,
"residual_var": 0.04079640656709671,
"reward": 0.91015625,
"reward_std": 0.1165214255452156,
"rewards/drgrpo_math_reward/mean": 0.91015625,
"rewards/drgrpo_math_reward/std": 0.2865179479122162,
"rho2": 0.2812499403953552,
"step": 309
},
{
"advantages/mean": 1.3969838619232178e-09,
"advantages/snr": 5.8310011040577425e-09,
"advantages/std": 0.2395787239074707,
"advantages/var": 0.057397964949132074,
"completions/clipped_ratio": -2.71875,
"epoch": 1.7793696275071633,
"grad_norm": 49.25562543554959,
"learning_rate": 1.1829709473940547e-06,
"loss": -0.1656,
"num_tokens": 50416781.0,
"residual_var": 0.04663585498929024,
"reward": 0.734375,
"reward_std": 0.1034945547580719,
"rewards/drgrpo_math_reward/mean": 0.734375,
"rewards/drgrpo_math_reward/std": 0.4425306022167206,
"rho2": 0.18749994039535522,
"step": 310
},
{
"advantages/mean": -6.984919309616089e-10,
"advantages/snr": 2.1213382228454678e-09,
"advantages/std": 0.3292694687843323,
"advantages/var": 0.10841838307351637,
"completions/clipped_ratio": -2.53125,
"epoch": 1.7851002865329513,
"grad_norm": 145.86003387133044,
"learning_rate": 1.1785568947986366e-06,
"loss": -0.9573,
"num_tokens": 50579721.0,
"residual_var": 0.06437341868877411,
"reward": 0.765625,
"reward_std": 0.20779037475585938,
"rewards/drgrpo_math_reward/mean": 0.765625,
"rewards/drgrpo_math_reward/std": 0.42443734407424927,
"rho2": 0.4062499403953552,
"step": 311
},
{
"advantages/mean": 0.0,
"advantages/snr": 0.0,
"advantages/std": 0.29450756311416626,
"advantages/var": 0.08673470473144462,
"completions/clipped_ratio": -2.78125,
"epoch": 1.7908309455587392,
"grad_norm": 56.44350194766821,
"learning_rate": 1.1741392457073886e-06,
"loss": -0.5022,
"num_tokens": 50736415.0,
"residual_var": 0.05963011458516121,
"reward": 0.7421875,
"reward_std": 0.15964838862419128,
"rewards/drgrpo_math_reward/mean": 0.7421875,
"rewards/drgrpo_math_reward/std": 0.4382871091365814,
"rho2": 0.3124999403953552,
"step": 312
},
{
"advantages/mean": 1.1641532182693481e-10,
"advantages/snr": 4.754657346617089e-10,
"advantages/std": 0.24484480917453766,
"advantages/var": 0.05994898057971576,
"completions/clipped_ratio": -2.984375,
"epoch": 1.7965616045845272,
"grad_norm": 51.79635444923773,
"learning_rate": 1.1697180891006689e-06,
"loss": -0.3783,
"num_tokens": 50883741.0,
"residual_var": 0.04683515429496765,
"reward": 0.8359375,
"reward_std": 0.11928971856832504,
"rewards/drgrpo_math_reward/mean": 0.8359375,
"rewards/drgrpo_math_reward/std": 0.3710577189922333,
"rho2": 0.21874995529651642,
"step": 313
},
{
"advantages/mean": 1.862645149230957e-09,
"advantages/snr": 6.597025067366786e-09,
"advantages/std": 0.2823462188243866,
"advantages/var": 0.0797193872844284,
"completions/clipped_ratio": -2.765625,
"epoch": 1.8022922636103151,
"grad_norm": 68.241969257841,
"learning_rate": 1.165293514029485e-06,
"loss": -0.6181,
"num_tokens": 51050995.0,
"residual_var": 0.05978954955935478,
"reward": 0.73046875,
"reward_std": 0.14571286737918854,
"rewards/drgrpo_math_reward/mean": 0.73046875,
"rewards/drgrpo_math_reward/std": 0.44458550214767456,
"rho2": 0.24999994039535522,
"step": 314
},
{
"advantages/mean": 0.0,
"advantages/snr": 0.0,
"advantages/std": 0.21128857135772705,
"advantages/var": 0.044642860386389316,
"completions/clipped_ratio": -2.953125,
"epoch": 1.8080229226361033,
"grad_norm": 48.580294172921796,
"learning_rate": 1.1608656096136983e-06,
"loss": -0.3171,
"num_tokens": 51192523.0,
"residual_var": 0.03487724810838699,
"reward": 0.8984375,
"reward_std": 0.0974610298871994,
"rewards/drgrpo_math_reward/mean": 0.8984375,
"rewards/drgrpo_math_reward/std": 0.3026638329029083,
"rho2": 0.21874995529651642,
"step": 315
},
{
"advantages/mean": -2.3283064365386963e-10,
"advantages/snr": 9.173867718254488e-10,
"advantages/std": 0.2537976801395416,
"advantages/var": 0.06441326244421308,
"completions/clipped_ratio": -2.84375,
"epoch": 1.8137535816618913,
"grad_norm": 47.621003392183994,
"learning_rate": 1.156434465040231e-06,
"loss": -0.1354,
"num_tokens": 51346027.0,
"residual_var": 0.04428413510322571,
"reward": 0.69921875,
"reward_std": 0.13098736107349396,
"rewards/drgrpo_math_reward/mean": 0.69921875,
"rewards/drgrpo_math_reward/std": 0.45949608087539673,
"rho2": 0.3124999403953552,
"step": 316
},
{
"advantages/mean": -2.3283064365386963e-10,
"advantages/snr": 9.941774813235425e-10,
"advantages/std": 0.23419423401355743,
"advantages/var": 0.0548469392451969,
"completions/clipped_ratio": -2.921875,
"epoch": 1.8194842406876792,
"grad_norm": 57.02857353398555,
"learning_rate": 1.1520001695612673e-06,
"loss": -0.8921,
"num_tokens": 51477023.0,
"residual_var": 0.04284917935729027,
"reward": 0.890625,
"reward_std": 0.10125666856765747,
"rewards/drgrpo_math_reward/mean": 0.890625,
"rewards/drgrpo_math_reward/std": 0.31272050738334656,
"rho2": 0.21874994039535522,
"step": 317
},
{
"advantages/mean": -6.984919309616089e-10,
"advantages/snr": 2.6992292203726215e-09,
"advantages/std": 0.2587745785713196,
"advantages/var": 0.06696428251476405,
"completions/clipped_ratio": -2.875,
"epoch": 1.8252148997134672,
"grad_norm": 59.283326258454025,
"learning_rate": 1.1475628124924578e-06,
"loss": -0.4742,
"num_tokens": 51629827.0,
"residual_var": 0.048130594193935394,
"reward": 0.83203125,
"reward_std": 0.1332252472639084,
"rewards/drgrpo_math_reward/mean": 0.83203125,
"rewards/drgrpo_math_reward/std": 0.3745708465576172,
"rho2": 0.2812499403953552,
"step": 318
},
{
"advantages/mean": 1.280568540096283e-09,
"advantages/snr": 4.300990042955823e-09,
"advantages/std": 0.29773807525634766,
"advantages/var": 0.08864796145735454,
"completions/clipped_ratio": -2.75,
"epoch": 1.8309455587392551,
"grad_norm": 74.49197012171977,
"learning_rate": 1.1431224832111194e-06,
"loss": -0.9767,
"num_tokens": 51782044.0,
"residual_var": 0.05540499463677406,
"reward": 0.76953125,
"reward_std": 0.17491313815116882,
"rewards/drgrpo_math_reward/mean": 0.76953125,
"rewards/drgrpo_math_reward/std": 0.4219578504562378,
"rho2": 0.37499991059303284,
"step": 319
},
{
"advantages/mean": -2.3283064365386963e-10,
"advantages/snr": 8.830796510959388e-10,
"advantages/std": 0.2636575698852539,
"advantages/var": 0.06951531415779755,
"completions/clipped_ratio": -2.75,
"epoch": 1.836676217765043,
"grad_norm": 95.72259182532687,
"learning_rate": 1.138679271154436e-06,
"loss": 0.1021,
"num_tokens": 51927336.0,
"residual_var": 0.05430884286761284,
"reward": 0.80078125,
"reward_std": 0.1230878233909607,
"rewards/drgrpo_math_reward/mean": 0.80078125,
"rewards/drgrpo_math_reward/std": 0.40019527077674866,
"rho2": 0.2187499701976776,
"step": 320
},
{
"advantages/mean": 2.3283064365386963e-10,
"advantages/snr": 9.12878675958233e-10,
"advantages/std": 0.25505101680755615,
"advantages/var": 0.06505102117456829,
"completions/clipped_ratio": -2.921875,
"epoch": 1.842406876790831,
"grad_norm": 59.64753369437884,
"learning_rate": 1.1342332658176555e-06,
"loss": -0.0949,
"num_tokens": 52069866.0,
"residual_var": 0.056919656693935394,
"reward": 0.8984375,
"reward_std": 0.11784427613019943,
"rewards/drgrpo_math_reward/mean": 0.8984375,
"rewards/drgrpo_math_reward/std": 0.3026638329029083,
"rho2": 0.1249999850988388,
"step": 321
},
{
"advantages/mean": -2.3283064365386963e-10,
"advantages/snr": 8.997430734575406e-10,
"advantages/std": 0.2587745785713196,
"advantages/var": 0.06696428251476405,
"completions/clipped_ratio": -2.859375,
"epoch": 1.848137535816619,
"grad_norm": 61.94070407305759,
"learning_rate": 1.1297845567522886e-06,
"loss": -0.5812,
"num_tokens": 52224671.0,
"residual_var": 0.043945323675870895,
"reward": 0.73828125,
"reward_std": 0.14032843708992004,
"rewards/drgrpo_math_reward/mean": 0.73828125,
"rewards/drgrpo_math_reward/std": 0.4404313564300537,
"rho2": 0.3437499403953552,
"step": 322
},
{
"advantages/mean": 4.656612873077393e-10,
"advantages/snr": 1.6558931458075657e-09,
"advantages/std": 0.28121456503868103,
"advantages/var": 0.07908163158989456,
"completions/clipped_ratio": -2.796875,
"epoch": 1.853868194842407,
"grad_norm": 55.218722365267084,
"learning_rate": 1.1253332335643042e-06,
"loss": -0.5945,
"num_tokens": 52372889.0,
"residual_var": 0.05683993920683861,
"reward": 0.765625,
"reward_std": 0.15163654088974,
"rewards/drgrpo_math_reward/mean": 0.765625,
"rewards/drgrpo_math_reward/std": 0.42443734407424927,
"rho2": 0.2812499403953552,
"step": 323
},
{
"advantages/mean": -8.149072527885437e-10,
"advantages/snr": 3.210853701389071e-09,
"advantages/std": 0.2537976801395416,
"advantages/var": 0.06441326244421308,
"completions/clipped_ratio": -2.875,
"epoch": 1.859598853868195,
"grad_norm": 69.06598539631601,
"learning_rate": 1.1208793859123256e-06,
"loss": -0.5763,
"num_tokens": 52514332.0,
"residual_var": 0.04830996319651604,
"reward": 0.85546875,
"reward_std": 0.1238841786980629,
"rewards/drgrpo_math_reward/mean": 0.85546875,
"rewards/drgrpo_math_reward/std": 0.35231640934944153,
"rho2": 0.24999994039535522,
"step": 324
},
{
"advantages/mean": -4.656612873077393e-10,
"advantages/snr": 1.6974700286993113e-09,
"advantages/std": 0.2743266522884369,
"advantages/var": 0.07525511215578096,
"completions/clipped_ratio": -2.875,
"epoch": 1.8653295128939829,
"grad_norm": 52.17395719662457,
"learning_rate": 1.1164231035058227e-06,
"loss": -0.5038,
"num_tokens": 52655435.0,
"residual_var": 0.05408961698412895,
"reward": 0.8671875,
"reward_std": 0.13531342148780823,
"rewards/drgrpo_math_reward/mean": 0.8671875,
"rewards/drgrpo_math_reward/std": 0.3400367796421051,
"rho2": 0.2812499403953552,
"step": 325
},
{
"advantages/mean": -4.656612873077393e-10,
"advantages/snr": 1.988354962647085e-09,
"advantages/std": 0.23419423401355743,
"advantages/var": 0.0548469392451969,
"completions/clipped_ratio": -2.859375,
"epoch": 1.8710601719197708,
"grad_norm": 54.14916950149179,
"learning_rate": 1.1119644761033077e-06,
"loss": -0.3666,
"num_tokens": 52817959.0,
"residual_var": 0.03770728409290314,
"reward": 0.828125,
"reward_std": 0.12073516845703125,
"rewards/drgrpo_math_reward/mean": 0.828125,
"rewards/drgrpo_math_reward/std": 0.3780108094215393,
"rho2": 0.31249991059303284,
"step": 326
},
{
"advantages/mean": -4.656612873077393e-10,
"advantages/snr": 1.922424165892908e-09,
"advantages/std": 0.24222607910633087,
"advantages/var": 0.05867347339922646,
"completions/clipped_ratio": -2.84375,
"epoch": 1.8767908309455588,
"grad_norm": 58.06906202221281,
"learning_rate": 1.107503593510525e-06,
"loss": -0.5308,
"num_tokens": 52960527.0,
"residual_var": 0.047672200947999954,
"reward": 0.8671875,
"reward_std": 0.10520448535680771,
"rewards/drgrpo_math_reward/mean": 0.8671875,
"rewards/drgrpo_math_reward/std": 0.3400367796421051,
"rho2": 0.1874999701976776,
"step": 327
},
{
"advantages/mean": 1.3969838619232178e-09,
"advantages/snr": 5.114126388694997e-09,
"advantages/std": 0.27316176891326904,
"advantages/var": 0.0746173519958262,
"completions/clipped_ratio": -2.9375,
"epoch": 1.8825214899713467,
"grad_norm": 55.880601623517,
"learning_rate": 1.1030405455786424e-06,
"loss": -0.406,
"num_tokens": 53097288.0,
"residual_var": 0.05129944160580635,
"reward": 0.85546875,
"reward_std": 0.14123709499835968,
"rewards/drgrpo_math_reward/mean": 0.85546875,
"rewards/drgrpo_math_reward/std": 0.35231640934944153,
"rho2": 0.3124999403953552,
"step": 328
},
{
"advantages/mean": -9.313225746154785e-10,
"advantages/snr": 3.865916250372057e-09,
"advantages/std": 0.2409060299396515,
"advantages/var": 0.05803571526128426,
"completions/clipped_ratio": -2.90625,
"epoch": 1.8882521489971347,
"grad_norm": 100.89430686833502,
"learning_rate": 1.0985754222024436e-06,
"loss": -0.1128,
"num_tokens": 53231633.0,
"residual_var": 0.041713181883096695,
"reward": 0.85546875,
"reward_std": 0.11823134869337082,
"rewards/drgrpo_math_reward/mean": 0.85546875,
"rewards/drgrpo_math_reward/std": 0.35231640934944153,
"rho2": 0.2812499403953552,
"step": 329
},
{
"advantages/mean": -6.984919309616089e-10,
"advantages/snr": 3.729520851595926e-09,
"advantages/std": 0.18728730082511902,
"advantages/var": 0.03507653305035863,
"completions/clipped_ratio": -2.984375,
"epoch": 1.8939828080229226,
"grad_norm": 39.60488962547746,
"learning_rate": 1.0941083133185145e-06,
"loss": -1.4171,
"num_tokens": 53357002.0,
"residual_var": 0.028499694541096687,
"reward": 0.92578125,
"reward_std": 0.07483352720737457,
"rewards/drgrpo_math_reward/mean": 0.92578125,
"rewards/drgrpo_math_reward/std": 0.2626400291919708,
"rho2": 0.18749995529651642,
"step": 330
},
{
"advantages/mean": 1.1641532182693481e-09,
"advantages/snr": 3.967493686413963e-09,
"advantages/std": 0.2934228181838989,
"advantages/var": 0.0860969502309814,
"completions/clipped_ratio": -2.734375,
"epoch": 1.8997134670487106,
"grad_norm": 49.7244276230108,
"learning_rate": 1.0896393089034335e-06,
"loss": -0.1195,
"num_tokens": 53534358.0,
"residual_var": 0.05650113523006439,
"reward": 0.69140625,
"reward_std": 0.16557207703590393,
"rewards/drgrpo_math_reward/mean": 0.69140625,
"rewards/drgrpo_math_reward/std": 0.46281787753105164,
"rho2": 0.34374991059303284,
"step": 331
},
{
"advantages/mean": -1.5133991837501526e-09,
"advantages/snr": 6.24787853915195e-09,
"advantages/std": 0.24222607910633087,
"advantages/var": 0.05867347339922646,
"completions/clipped_ratio": -2.8125,
"epoch": 1.9054441260744985,
"grad_norm": 57.8377677490507,
"learning_rate": 1.0851684989719594e-06,
"loss": -0.0041,
"num_tokens": 53691398.0,
"residual_var": 0.04400511458516121,
"reward": 0.8203125,
"reward_std": 0.11876175552606583,
"rewards/drgrpo_math_reward/mean": 0.8203125,
"rewards/drgrpo_math_reward/std": 0.38467901945114136,
"rho2": 0.2499999701976776,
"step": 332
},
{
"advantages/mean": 6.984919309616089e-10,
"advantages/snr": 3.799237303606776e-09,
"advantages/std": 0.18385055661201477,
"advantages/var": 0.03380102716654765,
"completions/clipped_ratio": -2.9375,
"epoch": 1.9111747851002865,
"grad_norm": 44.490624674036994,
"learning_rate": 1.0806959735752173e-06,
"loss": -0.0637,
"num_tokens": 53837722.0,
"residual_var": 0.02851962298154831,
"reward": 0.76171875,
"reward_std": 0.0672023743391037,
"rewards/drgrpo_math_reward/mean": 0.76171875,
"rewards/drgrpo_math_reward/std": 0.4268665909767151,
"rho2": 0.1562499701976776,
"step": 333
},
{
"advantages/mean": -9.313225746154785e-10,
"advantages/snr": 3.338821702499687e-09,
"advantages/std": 0.2789374887943268,
"advantages/var": 0.07780612265488518,
"completions/clipped_ratio": -2.65625,
"epoch": 1.9169054441260744,
"grad_norm": 49.97152951305063,
"learning_rate": 1.076221822798885e-06,
"loss": -0.7242,
"num_tokens": 54010772.0,
"residual_var": 0.053491730242967606,
"reward": 0.7578125,
"reward_std": 0.15110857784748077,
"rewards/drgrpo_math_reward/mean": 0.7578125,
"rewards/drgrpo_math_reward/std": 0.4292463958263397,
"rho2": 0.3124999403953552,
"step": 334
},
{
"advantages/mean": 0.0,
"advantages/snr": 0.0,
"advantages/std": 0.18385054171085358,
"advantages/var": 0.03380102168737431,
"completions/clipped_ratio": -2.78125,
"epoch": 1.9226361031518624,
"grad_norm": 36.61290451397358,
"learning_rate": 1.0717461367613792e-06,
"loss": 0.1276,
"num_tokens": 54156350.0,
"residual_var": 0.03063218668103218,
"reward": 0.89453125,
"reward_std": 0.0665532797574997,
"rewards/drgrpo_math_reward/mean": 0.89453125,
"rewards/drgrpo_math_reward/std": 0.3077581524848938,
"rho2": 0.09374997764825821,
"step": 335
},
{
"advantages/mean": -4.656612873077393e-10,
"advantages/snr": 1.598885299444173e-09,
"advantages/std": 0.29124119877815247,
"advantages/var": 0.08482143586573532,
"completions/clipped_ratio": -2.96875,
"epoch": 1.9283667621776504,
"grad_norm": 67.7518786607019,
"learning_rate": 1.0672690056120398e-06,
"loss": -0.6978,
"num_tokens": 54296370.0,
"residual_var": 0.05831475183367729,
"reward": 0.80078125,
"reward_std": 0.16439500451087952,
"rewards/drgrpo_math_reward/mean": 0.80078125,
"rewards/drgrpo_math_reward/std": 0.40019527077674866,
"rho2": 0.3124999403953552,
"step": 336
},
{
"advantages/mean": -1.6298145055770874e-09,
"advantages/snr": 6.210110271830265e-09,
"advantages/std": 0.262445330619812,
"advantages/var": 0.06887755156414244,
"completions/clipped_ratio": -2.828125,
"epoch": 1.9340974212034383,
"grad_norm": 47.49343597302498,
"learning_rate": 1.0627905195293135e-06,
"loss": -0.8715,
"num_tokens": 54450420.0,
"residual_var": 0.049505751579999924,
"reward": 0.78125,
"reward_std": 0.1349327266216278,
"rewards/drgrpo_math_reward/mean": 0.78125,
"rewards/drgrpo_math_reward/std": 0.41420844197273254,
"rho2": 0.2812499403953552,
"step": 337
},
{
"advantages/mean": -2.3283064365386963e-10,
"advantages/snr": 1.4577503362594367e-09,
"advantages/std": 0.15971913933753967,
"advantages/var": 0.025510203470724413,
"completions/clipped_ratio": -2.953125,
"epoch": 1.9398280802292263,
"grad_norm": 41.621221294628306,
"learning_rate": 1.0583107687189387e-06,
"loss": -1.5985,
"num_tokens": 54581042.0,
"residual_var": 0.02152424491941929,
"reward": 0.890625,
"reward_std": 0.0586601123213768,
"rewards/drgrpo_math_reward/mean": 0.890625,
"rewards/drgrpo_math_reward/std": 0.31272050738334656,
"rho2": 0.1562499701976776,
"step": 338
},
{
"advantages/mean": -9.313225746154785e-10,
"advantages/snr": 3.3117862916151314e-09,
"advantages/std": 0.28121456503868103,
"advantages/var": 0.07908163158989456,
"completions/clipped_ratio": -2.859375,
"epoch": 1.9455587392550142,
"grad_norm": 71.06222944609438,
"learning_rate": 1.0538298434121282e-06,
"loss": -1.9821,
"num_tokens": 54737524.0,
"residual_var": 0.05189734697341919,
"reward": 0.84375,
"reward_std": 0.15873973071575165,
"rewards/drgrpo_math_reward/mean": 0.84375,
"rewards/drgrpo_math_reward/std": 0.3638034462928772,
"rho2": 0.34374991059303284,
"step": 339
},
{
"advantages/mean": -1.1641532182693481e-10,
"advantages/snr": 4.1735271281246086e-10,
"advantages/std": 0.2789374887943268,
"advantages/var": 0.07780612265488518,
"completions/clipped_ratio": -2.953125,
"epoch": 1.9512893982808022,
"grad_norm": 94.66915132544855,
"learning_rate": 1.049347833863751e-06,
"loss": -0.3612,
"num_tokens": 54890793.0,
"residual_var": 0.058354608714580536,
"reward": 0.7578125,
"reward_std": 0.14400538802146912,
"rewards/drgrpo_math_reward/mean": 0.7578125,
"rewards/drgrpo_math_reward/std": 0.4292463958263397,
"rho2": 0.24999994039535522,
"step": 340
},
{
"advantages/mean": 0.0,
"advantages/snr": 0.0,
"advantages/std": 0.24222607910633087,
"advantages/var": 0.05867347339922646,
"completions/clipped_ratio": -2.890625,
"epoch": 1.9570200573065901,
"grad_norm": 38.99157475481538,
"learning_rate": 1.044864830350515e-06,
"loss": -0.1946,
"num_tokens": 55030026.0,
"residual_var": 0.04217156767845154,
"reward": 0.78125,
"reward_std": 0.12468298524618149,
"rewards/drgrpo_math_reward/mean": 0.78125,
"rewards/drgrpo_math_reward/std": 0.41420844197273254,
"rho2": 0.2812499403953552,
"step": 341
},
{
"advantages/mean": 2.3283064365386963e-10,
"advantages/snr": 8.790565244942093e-10,
"advantages/std": 0.2648642361164093,
"advantages/var": 0.07015306357352902,
"completions/clipped_ratio": -2.796875,
"epoch": 1.962750716332378,
"grad_norm": 68.6227790006013,
"learning_rate": 1.0403809231691484e-06,
"loss": -1.0464,
"num_tokens": 55180828.0,
"residual_var": 0.05042253062129021,
"reward": 0.8671875,
"reward_std": 0.13664263486862183,
"rewards/drgrpo_math_reward/mean": 0.8671875,
"rewards/drgrpo_math_reward/std": 0.3400367796421051,
"rho2": 0.2812499403953552,
"step": 342
},
{
"advantages/mean": 1.1641532182693481e-10,
"advantages/snr": 6.105843634782741e-10,
"advantages/std": 0.19066213071346283,
"advantages/var": 0.036352048088197586,
"completions/clipped_ratio": -2.90625,
"epoch": 1.968481375358166,
"grad_norm": 67.90463567427477,
"learning_rate": 1.0358962026345824e-06,
"loss": -0.154,
"num_tokens": 55332364.0,
"residual_var": 0.031808048486709595,
"reward": 0.87109375,
"reward_std": 0.06944026052951813,
"rewards/drgrpo_math_reward/mean": 0.87109375,
"rewards/drgrpo_math_reward/std": 0.33575257658958435,
"rho2": 0.12499997019767761,
"step": 343
},
{
"advantages/mean": -8.149072527885437e-10,
"advantages/snr": 3.3461059648989908e-09,
"advantages/std": 0.2435389757156372,
"advantages/var": 0.05931123269262173,
"completions/clipped_ratio": -2.828125,
"epoch": 1.9742120343839542,
"grad_norm": 62.318007127153344,
"learning_rate": 1.0314107590781281e-06,
"loss": -0.621,
"num_tokens": 55476400.0,
"residual_var": 0.05004385486245155,
"reward": 0.91796875,
"reward_std": 0.10573489964008331,
"rewards/drgrpo_math_reward/mean": 0.91796875,
"rewards/drgrpo_math_reward/std": 0.2749498784542084,
"rho2": 0.1562499701976776,
"step": 344
},
{
"advantages/mean": -4.656612873077393e-10,
"advantages/snr": 2.9155006725188733e-09,
"advantages/std": 0.15971913933753967,
"advantages/var": 0.025510203470724413,
"completions/clipped_ratio": -2.90625,
"epoch": 1.9799426934097422,
"grad_norm": 35.042547615301785,
"learning_rate": 1.026924682845663e-06,
"loss": -0.0466,
"num_tokens": 55603276.0,
"residual_var": 0.021524248644709587,
"reward": 0.859375,
"reward_std": 0.0586601160466671,
"rewards/drgrpo_math_reward/mean": 0.859375,
"rewards/drgrpo_math_reward/std": 0.3483152687549591,
"rho2": 0.15624995529651642,
"step": 345
},
{
"advantages/mean": -1.3969838619232178e-09,
"advantages/snr": 7.823109002158462e-09,
"advantages/std": 0.1785714328289032,
"advantages/var": 0.03188775662256749,
"completions/clipped_ratio": -2.78125,
"epoch": 1.9856733524355301,
"grad_norm": 39.869269652084334,
"learning_rate": 1.022438064295805e-06,
"loss": -0.2044,
"num_tokens": 55750009.0,
"residual_var": 0.026905305683612823,
"reward": 0.9140625,
"reward_std": 0.06549490988254547,
"rewards/drgrpo_math_reward/mean": 0.9140625,
"rewards/drgrpo_math_reward/std": 0.28082075715065,
"rho2": 0.1562499701976776,
"step": 346
},
{
"advantages/mean": -2.3283064365386963e-10,
"advantages/snr": 8.246281334208482e-10,
"advantages/std": 0.2823462188243866,
"advantages/var": 0.0797193872844284,
"completions/clipped_ratio": -2.890625,
"epoch": 1.991404011461318,
"grad_norm": 58.782506623819536,
"learning_rate": 1.0179509937980971e-06,
"loss": -0.0113,
"num_tokens": 55876382.0,
"residual_var": 0.05978954955935478,
"reward": 0.81640625,
"reward_std": 0.1521669626235962,
"rewards/drgrpo_math_reward/mean": 0.81640625,
"rewards/drgrpo_math_reward/std": 0.387910932302475,
"rho2": 0.24999994039535522,
"step": 347
},
{
"advantages/mean": 2.3283064365386963e-10,
"advantages/snr": 9.772780097527553e-10,
"advantages/std": 0.23824401199817657,
"advantages/var": 0.056760209252987304,
"completions/clipped_ratio": -2.921875,
"epoch": 1.997134670487106,
"grad_norm": 58.5206879967775,
"learning_rate": 1.0134635617311853e-06,
"loss": 0.0692,
"num_tokens": 56017420.0,
"residual_var": 0.04611767828464508,
"reward": 0.80859375,
"reward_std": 0.10995110124349594,
"rewards/drgrpo_math_reward/mean": 0.80859375,
"rewards/drgrpo_math_reward/std": 0.39417871832847595,
"rho2": 0.1874999701976776,
"step": 348
},
{
"advantages/mean": 1.0477378964424133e-09,
"advantages/snr": 4.638532964616805e-09,
"advantages/std": 0.22587698698043823,
"advantages/var": 0.05102041324736106,
"completions/clipped_ratio": -2.890625,
"epoch": 2.005730659025788,
"grad_norm": 40.990040303252684,
"learning_rate": 1.0089758584809977e-06,
"loss": -0.3943,
"num_tokens": 56163905.0,
"residual_var": 0.03985970839858055,
"reward": 0.90625,
"reward_std": 0.09784172475337982,
"rewards/drgrpo_math_reward/mean": 0.90625,
"rewards/drgrpo_math_reward/std": 0.2920515835285187,
"rho2": 0.21874995529651642,
"step": 349
},
{
"advantages/mean": 6.984919309616089e-10,
"advantages/snr": 3.092355513747993e-09,
"advantages/std": 0.22587697207927704,
"advantages/var": 0.0510204065157025,
"completions/clipped_ratio": -2.90625,
"epoch": 2.011461318051576,
"grad_norm": 45.769842359441355,
"learning_rate": 1.0044879744389256e-06,
"loss": -0.2645,
"num_tokens": 56300771.0,
"residual_var": 0.03507654368877411,
"reward": 0.859375,
"reward_std": 0.1173202320933342,
"rewards/drgrpo_math_reward/mean": 0.859375,
"rewards/drgrpo_math_reward/std": 0.3483152687549591,
"rho2": 0.3124999403953552,
"step": 350
},
{
"advantages/mean": -4.656612873077393e-10,
"advantages/snr": 2.0745771419279592e-09,
"advantages/std": 0.22446081042289734,
"advantages/var": 0.05038265541570386,
"completions/clipped_ratio": -2.921875,
"epoch": 2.017191977077364,
"grad_norm": 55.073488092850695,
"learning_rate": 1e-06,
"loss": 0.3208,
"num_tokens": 56434972.0,
"residual_var": 0.04093591496348381,
"reward": 0.82421875,
"reward_std": 0.09666221588850021,
"rewards/drgrpo_math_reward/mean": 0.82421875,
"rewards/drgrpo_math_reward/std": 0.3813795745372772,
"rho2": 0.1874999701976776,
"step": 351
},
{
"advantages/mean": 1.3969838619232178e-09,
"advantages/snr": 7.902533784003918e-09,
"advantages/std": 0.1767766922712326,
"advantages/var": 0.03124999893035807,
"completions/clipped_ratio": -2.859375,
"epoch": 2.022922636103152,
"grad_norm": 35.999997406726365,
"learning_rate": 9.955120255610746e-07,
"loss": -0.2176,
"num_tokens": 56580608.0,
"residual_var": 0.027343759313225746,
"reward": 0.80078125,
"reward_std": 0.05786130577325821,
"rewards/drgrpo_math_reward/mean": 0.80078125,
"rewards/drgrpo_math_reward/std": 0.40019527077674866,
"rho2": 0.12499997019767761,
"step": 352
},
{
"advantages/mean": 2.3283064365386963e-10,
"advantages/snr": 1.1708932641543454e-09,
"advantages/std": 0.19884872436523438,
"advantages/var": 0.039540815181680955,
"completions/clipped_ratio": -2.984375,
"epoch": 2.0286532951289398,
"grad_norm": 41.84499982504666,
"learning_rate": 9.91024141519002e-07,
"loss": -0.0258,
"num_tokens": 56727775.0,
"residual_var": 0.03089127317070961,
"reward": 0.78125,
"reward_std": 0.08588206768035889,
"rewards/drgrpo_math_reward/mean": 0.78125,
"rewards/drgrpo_math_reward/std": 0.41420844197273254,
"rho2": 0.21874995529651642,
"step": 353
},
{
"advantages/mean": -1.0477378964424133e-09,
"advantages/snr": 4.422667522444229e-09,
"advantages/std": 0.23690178990364075,
"advantages/var": 0.05612245805954874,
"completions/clipped_ratio": -2.90625,
"epoch": 2.0343839541547277,
"grad_norm": 40.89486934381943,
"learning_rate": 9.865364382688144e-07,
"loss": -0.2472,
"num_tokens": 56872331.0,
"residual_var": 0.04384567588567734,
"reward": 0.90625,
"reward_std": 0.10942068696022034,
"rewards/drgrpo_math_reward/mean": 0.90625,
"rewards/drgrpo_math_reward/std": 0.2920515835285187,
"rho2": 0.21874995529651642,
"step": 354
},
{
"advantages/mean": 1.1641532182693481e-09,
"advantages/snr": 3.66736918388653e-09,
"advantages/std": 0.31743550300598145,
"advantages/var": 0.10076529856866046,
"completions/clipped_ratio": -2.828125,
"epoch": 2.0401146131805157,
"grad_norm": 62.92783490689025,
"learning_rate": 9.82049006201903e-07,
"loss": -1.0709,
"num_tokens": 57029960.0,
"residual_var": 0.0629783347249031,
"reward": 0.703125,
"reward_std": 0.19977852702140808,
"rewards/drgrpo_math_reward/mean": 0.703125,
"rewards/drgrpo_math_reward/std": 0.45777595043182373,
"rho2": 0.37499991059303284,
"step": 355
},
{
"advantages/mean": 0.0,
"advantages/snr": 0.0,
"advantages/std": 0.25,
"advantages/var": 0.0625,
"completions/clipped_ratio": -2.859375,
"epoch": 2.0458452722063036,
"grad_norm": 41.13574343425845,
"learning_rate": 9.77561935704195e-07,
"loss": -0.4015,
"num_tokens": 57182250.0,
"residual_var": 0.050781264901161194,
"reward": 0.8046875,
"reward_std": 0.11507351696491241,
"rewards/drgrpo_math_reward/mean": 0.8046875,
"rewards/drgrpo_math_reward/std": 0.39721766114234924,
"rho2": 0.18749995529651642,
"step": 356
},
{
"advantages/mean": 9.313225746154785e-10,
"advantages/snr": 4.407822555439857e-09,
"advantages/std": 0.21128857135772705,
"advantages/var": 0.044642860386389316,
"completions/clipped_ratio": -2.84375,
"epoch": 2.0515759312320916,
"grad_norm": 47.1257480404097,
"learning_rate": 9.730753171543374e-07,
"loss": -0.7046,
"num_tokens": 57321794.0,
"residual_var": 0.034877244383096695,
"reward": 0.84375,
"reward_std": 0.09100693464279175,
"rewards/drgrpo_math_reward/mean": 0.84375,
"rewards/drgrpo_math_reward/std": 0.3638034462928772,
"rho2": 0.21874994039535522,
"step": 357
},
{
"advantages/mean": -1.280568540096283e-09,
"advantages/snr": 5.175355444292141e-09,
"advantages/std": 0.24743585288524628,
"advantages/var": 0.06122450129304924,
"completions/clipped_ratio": -2.90625,
"epoch": 2.0573065902578795,
"grad_norm": 39.598904124881315,
"learning_rate": 9.685892409218718e-07,
"loss": -0.4665,
"num_tokens": 57481414.0,
"residual_var": 0.047831643372774124,
"reward": 0.8125,
"reward_std": 0.1145455539226532,
"rewards/drgrpo_math_reward/mean": 0.8125,
"rewards/drgrpo_math_reward/std": 0.3910769522190094,
"rho2": 0.2187499701976776,
"step": 358
},
{
"advantages/mean": -4.656612873077393e-10,
"advantages/snr": 1.6172287454318415e-09,
"advantages/std": 0.28793779015541077,
"advantages/var": 0.08290817099958137,
"completions/clipped_ratio": -2.9375,
"epoch": 2.0630372492836675,
"grad_norm": 49.79710645310716,
"learning_rate": 9.641037973654178e-07,
"loss": -0.4794,
"num_tokens": 57643547.0,
"residual_var": 0.054408494383096695,
"reward": 0.890625,
"reward_std": 0.1621546745300293,
"rewards/drgrpo_math_reward/mean": 0.890625,
"rewards/drgrpo_math_reward/std": 0.31272050738334656,
"rho2": 0.3437499403953552,
"step": 359
},
{
"advantages/mean": 0.0,
"advantages/snr": 0.0,
"advantages/std": 0.22303566336631775,
"advantages/var": 0.04974490713325341,
"completions/clipped_ratio": -2.9375,
"epoch": 2.0687679083094554,
"grad_norm": 50.82435735502694,
"learning_rate": 9.596190768308513e-07,
"loss": -0.4087,
"num_tokens": 57784823.0,
"residual_var": 0.03575415909290314,
"reward": 0.8828125,
"reward_std": 0.10968907922506332,
"rewards/drgrpo_math_reward/mean": 0.8828125,
"rewards/drgrpo_math_reward/std": 0.3222736418247223,
"rho2": 0.2812499403953552,
"step": 360
},
{
"advantages/mean": 5.820766091346741e-10,
"advantages/snr": 2.3283063434064427e-09,
"advantages/std": 0.25,
"advantages/var": 0.0625,
"completions/clipped_ratio": -2.703125,
"epoch": 2.0744985673352434,
"grad_norm": 45.40379831308186,
"learning_rate": 9.551351696494853e-07,
"loss": -1.0983,
"num_tokens": 57940022.0,
"residual_var": 0.046875014901161194,
"reward": 0.7734375,
"reward_std": 0.12217670679092407,
"rewards/drgrpo_math_reward/mean": 0.7734375,
"rewards/drgrpo_math_reward/std": 0.41942715644836426,
"rho2": 0.24999995529651642,
"step": 361
},
{
"advantages/mean": 4.656612873077393e-10,
"advantages/snr": 3.3665301317676013e-09,
"advantages/std": 0.13832083344459534,
"advantages/var": 0.019132652964807484,
"completions/clipped_ratio": -2.984375,
"epoch": 2.0802292263610314,
"grad_norm": 29.094653129208076,
"learning_rate": 9.506521661362492e-07,
"loss": -0.188,
"num_tokens": 58079358.0,
"residual_var": 0.01793687231838703,
"reward": 0.8828125,
"reward_std": 0.0388009138405323,
"rewards/drgrpo_math_reward/mean": 0.8828125,
"rewards/drgrpo_math_reward/std": 0.3222736418247223,
"rho2": 0.062499985098838806,
"step": 362
},
{
"advantages/mean": -2.3283064365386963e-10,
"advantages/snr": 9.664790625930143e-10,
"advantages/std": 0.2409060299396515,
"advantages/var": 0.05803571526128426,
"completions/clipped_ratio": -2.921875,
"epoch": 2.0859598853868193,
"grad_norm": 83.4007947773079,
"learning_rate": 9.461701565878718e-07,
"loss": -0.2678,
"num_tokens": 58219170.0,
"residual_var": 0.045340411365032196,
"reward": 0.76171875,
"reward_std": 0.11112815886735916,
"rewards/drgrpo_math_reward/mean": 0.76171875,
"rewards/drgrpo_math_reward/std": 0.4268665909767151,
"rho2": 0.2187499701976776,
"step": 363
},
{
"advantages/mean": 4.656612873077393e-10,
"advantages/snr": 1.8439244876346257e-09,
"advantages/std": 0.25253814458847046,
"advantages/var": 0.06377551447218721,
"completions/clipped_ratio": -2.90625,
"epoch": 2.0916905444126073,
"grad_norm": 61.00925066972484,
"learning_rate": 9.416892312810613e-07,
"loss": -0.1656,
"num_tokens": 58371207.0,
"residual_var": 0.041852690279483795,
"reward": 0.8203125,
"reward_std": 0.1369110345840454,
"rewards/drgrpo_math_reward/mean": 0.8203125,
"rewards/drgrpo_math_reward/std": 0.38467901945114136,
"rho2": 0.3437499403953552,
"step": 364
},
{
"advantages/mean": 1.1641532182693481e-10,
"advantages/snr": 4.5202922198482317e-10,
"advantages/std": 0.2575393617153168,
"advantages/var": 0.06632652283273277,
"completions/clipped_ratio": -2.8125,
"epoch": 2.097421203438395,
"grad_norm": 50.15075310712616,
"learning_rate": 9.372094804706866e-07,
"loss": -1.0044,
"num_tokens": 58515929.0,
"residual_var": 0.045599501579999924,
"reward": 0.78125,
"reward_std": 0.13269482553005219,
"rewards/drgrpo_math_reward/mean": 0.78125,
"rewards/drgrpo_math_reward/std": 0.41420844197273254,
"rho2": 0.3124999403953552,
"step": 365
},
{
"advantages/mean": 4.656612873077393e-10,
"advantages/snr": 1.6558931458075657e-09,
"advantages/std": 0.28121456503868103,
"advantages/var": 0.07908163158989456,
"completions/clipped_ratio": -2.9375,
"epoch": 2.103151862464183,
"grad_norm": 111.89824195115196,
"learning_rate": 9.327309943879603e-07,
"loss": -0.8244,
"num_tokens": 58657277.0,
"residual_var": 0.05683993175625801,
"reward": 0.7890625,
"reward_std": 0.14518246054649353,
"rewards/drgrpo_math_reward/mean": 0.7890625,
"rewards/drgrpo_math_reward/std": 0.4087733030319214,
"rho2": 0.2812499403953552,
"step": 366
},
{
"advantages/mean": 2.0954757928848267e-09,
"advantages/snr": 8.947597331911881e-09,
"advantages/std": 0.23419423401355743,
"advantages/var": 0.0548469392451969,
"completions/clipped_ratio": -2.90625,
"epoch": 2.1088825214899716,
"grad_norm": 102.01837387407974,
"learning_rate": 9.282538632386206e-07,
"loss": -0.7782,
"num_tokens": 58803731.0,
"residual_var": 0.042849186807870865,
"reward": 0.890625,
"reward_std": 0.10771076381206512,
"rewards/drgrpo_math_reward/mean": 0.890625,
"rewards/drgrpo_math_reward/std": 0.31272050738334656,
"rho2": 0.21874995529651642,
"step": 367
},
{
"advantages/mean": -2.3283064365386963e-10,
"advantages/snr": 7.604216454059908e-10,
"advantages/std": 0.3061862289905548,
"advantages/var": 0.09375000682345647,
"completions/clipped_ratio": -2.71875,
"epoch": 2.1146131805157595,
"grad_norm": 55.21812239129645,
"learning_rate": 9.237781772011151e-07,
"loss": -0.2983,
"num_tokens": 58969432.0,
"residual_var": 0.0644531399011612,
"reward": 0.79296875,
"reward_std": 0.17938891053199768,
"rewards/drgrpo_math_reward/mean": 0.79296875,
"rewards/drgrpo_math_reward/std": 0.40597182512283325,
"rho2": 0.3124999403953552,
"step": 368
},
{
"advantages/mean": -1.1641532182693481e-10,
"advantages/snr": 5.717767063674629e-10,
"advantages/std": 0.20360276103019714,
"advantages/var": 0.041454084299119565,
"completions/clipped_ratio": -2.921875,
"epoch": 2.1203438395415475,
"grad_norm": 99.8146176812901,
"learning_rate": 9.193040264247828e-07,
"loss": -0.6521,
"num_tokens": 59107832.0,
"residual_var": 0.03368145227432251,
"reward": 0.92578125,
"reward_std": 0.08166831731796265,
"rewards/drgrpo_math_reward/mean": 0.92578125,
"rewards/drgrpo_math_reward/std": 0.2626400291919708,
"rho2": 0.1874999701976776,
"step": 369
},
{
"advantages/mean": -2.3283064365386963e-10,
"advantages/snr": 1.1708931764109258e-09,
"advantages/std": 0.19884873926639557,
"advantages/var": 0.03954082110783497,
"completions/clipped_ratio": -2.96875,
"epoch": 2.1260744985673354,
"grad_norm": 36.18387046219496,
"learning_rate": 9.148315010280407e-07,
"loss": -0.7438,
"num_tokens": 59241040.0,
"residual_var": 0.03089127317070961,
"reward": 0.921875,
"reward_std": 0.08588206768035889,
"rewards/drgrpo_math_reward/mean": 0.921875,
"rewards/drgrpo_math_reward/std": 0.26889389753341675,
"rho2": 0.21874994039535522,
"step": 370
},
{
"advantages/mean": -3.4924596548080444e-10,
"advantages/snr": 1.4188699969241902e-09,
"advantages/std": 0.24614372849464417,
"advantages/var": 0.0605867350772451,
"completions/clipped_ratio": -2.9375,
"epoch": 2.1318051575931234,
"grad_norm": 58.45422371851887,
"learning_rate": 9.103606910965665e-07,
"loss": -0.3998,
"num_tokens": 59396683.0,
"residual_var": 0.04544006660580635,
"reward": 0.75390625,
"reward_std": 0.12046922743320465,
"rewards/drgrpo_math_reward/mean": 0.75390625,
"rewards/drgrpo_math_reward/std": 0.43157756328582764,
"rho2": 0.24999995529651642,
"step": 371
},
{
"advantages/mean": 8.149072527885437e-10,
"advantages/snr": 3.70147121539465e-09,
"advantages/std": 0.220157653093338,
"advantages/var": 0.048469392215566565,
"completions/clipped_ratio": -2.96875,
"epoch": 2.1375358166189113,
"grad_norm": 43.47332393270564,
"learning_rate": 9.058916866814856e-07,
"loss": 0.0326,
"num_tokens": 59519814.0,
"residual_var": 0.03938138857483864,
"reward": 0.859375,
"reward_std": 0.09495474398136139,
"rewards/drgrpo_math_reward/mean": 0.859375,
"rewards/drgrpo_math_reward/std": 0.3483152687549591,
"rho2": 0.1874999701976776,
"step": 372
},
{
"advantages/mean": 2.561137080192566e-09,
"advantages/snr": 7.919246886876952e-09,
"advantages/std": 0.3234066367149353,
"advantages/var": 0.10459185267126614,
"completions/clipped_ratio": -2.890625,
"epoch": 2.1432664756446993,
"grad_norm": 68.3814059604225,
"learning_rate": 9.014245777975564e-07,
"loss": -0.8138,
"num_tokens": 59688958.0,
"residual_var": 0.05883292481303215,
"reward": 0.796875,
"reward_std": 0.20384256541728973,
"rewards/drgrpo_math_reward/mean": 0.796875,
"rewards/drgrpo_math_reward/std": 0.40311288833618164,
"rho2": 0.43749988079071045,
"step": 373
},
{
"advantages/mean": 9.313225746154785e-10,
"advantages/snr": 3.651514703832932e-09,
"advantages/std": 0.25505101680755615,
"advantages/var": 0.06505102117456829,
"completions/clipped_ratio": -2.828125,
"epoch": 2.1489971346704873,
"grad_norm": 55.37713615201918,
"learning_rate": 8.969594544213577e-07,
"loss": -0.2838,
"num_tokens": 59844959.0,
"residual_var": 0.050821125507354736,
"reward": 0.8046875,
"reward_std": 0.1244145929813385,
"rewards/drgrpo_math_reward/mean": 0.8046875,
"rewards/drgrpo_math_reward/std": 0.39721766114234924,
"rho2": 0.21874995529651642,
"step": 374
},
{
"advantages/mean": -6.984919309616089e-10,
"advantages/snr": 3.0178298853416863e-09,
"advantages/std": 0.2314550280570984,
"advantages/var": 0.0535714300129122,
"completions/clipped_ratio": -2.828125,
"epoch": 2.154727793696275,
"grad_norm": 75.81852215411352,
"learning_rate": 8.924964064894753e-07,
"loss": -0.397,
"num_tokens": 60014526.0,
"residual_var": 0.043526798486709595,
"reward": 0.8359375,
"reward_std": 0.10007961839437485,
"rewards/drgrpo_math_reward/mean": 0.8359375,
"rewards/drgrpo_math_reward/std": 0.3710577189922333,
"rho2": 0.18749995529651642,
"step": 375
},
{
"advantages/mean": 4.656612873077393e-10,
"advantages/snr": 2.158150412641125e-09,
"advantages/std": 0.21576867997646332,
"advantages/var": 0.04655612325878544,
"completions/clipped_ratio": -2.984375,
"epoch": 2.160458452722063,
"grad_norm": 36.36413325512361,
"learning_rate": 8.880355238966921e-07,
"loss": -0.1531,
"num_tokens": 60145596.0,
"residual_var": 0.03637198358774185,
"reward": 0.92578125,
"reward_std": 0.09916849434375763,
"rewards/drgrpo_math_reward/mean": 0.92578125,
"rewards/drgrpo_math_reward/std": 0.2626400291919708,
"rho2": 0.21874994039535522,
"step": 376
},
{
"advantages/mean": -8.149072527885437e-10,
"advantages/snr": 3.541947615186032e-09,
"advantages/std": 0.23007319867610931,
"advantages/var": 0.05293367674905647,
"completions/clipped_ratio": -2.921875,
"epoch": 2.166189111747851,
"grad_norm": 36.5323392935581,
"learning_rate": 8.835768964941772e-07,
"loss": -0.5071,
"num_tokens": 60302043.0,
"residual_var": 0.04135444387793541,
"reward": 0.87109375,
"reward_std": 0.1060032844543457,
"rewards/drgrpo_math_reward/mean": 0.87109375,
"rewards/drgrpo_math_reward/std": 0.33575257658958435,
"rho2": 0.21874994039535522,
"step": 377
},
{
"advantages/mean": 2.3283064365386963e-10,
"advantages/snr": 1.2785315388989055e-09,
"advantages/std": 0.1821078509092331,
"advantages/var": 0.03316326936277947,
"completions/clipped_ratio": -2.9375,
"epoch": 2.171919770773639,
"grad_norm": 34.55263411580291,
"learning_rate": 8.791206140876745e-07,
"loss": -0.0678,
"num_tokens": 60442295.0,
"residual_var": 0.026945164427161217,
"reward": 0.953125,
"reward_std": 0.07312604784965515,
"rewards/drgrpo_math_reward/mean": 0.953125,
"rewards/drgrpo_math_reward/std": 0.21178513765335083,
"rho2": 0.1874999701976776,
"step": 378
},
{
"advantages/mean": 2.3283064365386963e-10,
"advantages/snr": 8.416324726969034e-10,
"advantages/std": 0.27664169669151306,
"advantages/var": 0.07653062834835911,
"completions/clipped_ratio": -2.9375,
"epoch": 2.177650429799427,
"grad_norm": 49.5081416172195,
"learning_rate": 8.746667664356955e-07,
"loss": -0.5504,
"num_tokens": 60578216.0,
"residual_var": 0.05978955700993538,
"reward": 0.859375,
"reward_std": 0.1428283452987671,
"rewards/drgrpo_math_reward/mean": 0.859375,
"rewards/drgrpo_math_reward/std": 0.3483152687549591,
"rho2": 0.21874995529651642,
"step": 379
},
{
"advantages/mean": -2.3283064365386963e-10,
"advantages/snr": 1.1348570685336994e-09,
"advantages/std": 0.20516295731067657,
"advantages/var": 0.0420918390524625,
"completions/clipped_ratio": -2.953125,
"epoch": 2.183381088825215,
"grad_norm": 43.20035315786397,
"learning_rate": 8.702154432477115e-07,
"loss": -0.2085,
"num_tokens": 60712266.0,
"residual_var": 0.03551500290632248,
"reward": 0.90625,
"reward_std": 0.08219873160123825,
"rewards/drgrpo_math_reward/mean": 0.90625,
"rewards/drgrpo_math_reward/std": 0.2920515835285187,
"rho2": 0.1562499701976776,
"step": 380
},
{
"advantages/mean": 2.3283064365386963e-10,
"advantages/snr": 2.0118866328303662e-09,
"advantages/std": 0.1157275065779686,
"advantages/var": 0.013392855778753765,
"completions/clipped_ratio": -2.765625,
"epoch": 2.189111747851003,
"grad_norm": 18.632259572410177,
"learning_rate": 8.657667341823448e-07,
"loss": -0.1763,
"num_tokens": 60867807.0,
"residual_var": 0.012137286365032196,
"reward": 0.83984375,
"reward_std": 0.03314562886953354,
"rewards/drgrpo_math_reward/mean": 0.83984375,
"rewards/drgrpo_math_reward/std": 0.36746934056282043,
"rho2": 0.09374997764825821,
"step": 381
},
{
"advantages/mean": -1.7462298274040222e-09,
"advantages/snr": 7.094349984620951e-09,
"advantages/std": 0.24614372849464417,
"advantages/var": 0.0605867350772451,
"completions/clipped_ratio": -2.921875,
"epoch": 2.194842406876791,
"grad_norm": 47.23394786445544,
"learning_rate": 8.613207288455641e-07,
"loss": -0.4775,
"num_tokens": 61004135.0,
"residual_var": 0.04544006288051605,
"reward": 0.81640625,
"reward_std": 0.12046922743320465,
"rewards/drgrpo_math_reward/mean": 0.81640625,
"rewards/drgrpo_math_reward/std": 0.387910932302475,
"rho2": 0.2499999701976776,
"step": 382
},
{
"advantages/mean": 9.313225746154785e-10,
"advantages/snr": 3.5819556995273322e-09,
"advantages/std": 0.260003924369812,
"advantages/var": 0.06760204068770292,
"completions/clipped_ratio": -2.859375,
"epoch": 2.200573065902579,
"grad_norm": 54.11461392640346,
"learning_rate": 8.568775167888805e-07,
"loss": -0.2017,
"num_tokens": 61168740.0,
"residual_var": 0.05070154368877411,
"reward": 0.8515625,
"reward_std": 0.1337556540966034,
"rewards/drgrpo_math_reward/mean": 0.8515625,
"rewards/drgrpo_math_reward/std": 0.3562295734882355,
"rho2": 0.24999994039535522,
"step": 383
},
{
"advantages/mean": 1.1641532182693481e-10,
"advantages/snr": 5.000042345610263e-10,
"advantages/std": 0.23282866179943085,
"advantages/var": 0.05420918575531375,
"completions/clipped_ratio": -2.921875,
"epoch": 2.206303724928367,
"grad_norm": 43.4870952376505,
"learning_rate": 8.524371875075424e-07,
"loss": -0.3357,
"num_tokens": 61306482.0,
"residual_var": 0.04235094040632248,
"reward": 0.82421875,
"reward_std": 0.10125912725925446,
"rewards/drgrpo_math_reward/mean": 0.82421875,
"rewards/drgrpo_math_reward/std": 0.3813795745372772,
"rho2": 0.21874995529651642,
"step": 384
},
{
"advantages/mean": -6.984919309616089e-10,
"advantages/snr": 3.25962882665666e-09,
"advantages/std": 0.2142857164144516,
"advantages/var": 0.04591836825925477,
"completions/clipped_ratio": -2.890625,
"epoch": 2.2120343839541547,
"grad_norm": 42.735990238221774,
"learning_rate": 8.479998304387328e-07,
"loss": -0.162,
"num_tokens": 61459774.0,
"residual_var": 0.035873740911483765,
"reward": 0.8359375,
"reward_std": 0.09271685779094696,
"rewards/drgrpo_math_reward/mean": 0.8359375,
"rewards/drgrpo_math_reward/std": 0.3710577189922333,
"rho2": 0.21874995529651642,
"step": 385
},
{
"advantages/mean": -4.656612873077393e-10,
"advantages/snr": 2.464047832840681e-09,
"advantages/std": 0.18898223340511322,
"advantages/var": 0.03571428454278469,
"completions/clipped_ratio": -2.84375,
"epoch": 2.2177650429799427,
"grad_norm": 39.815277212712225,
"learning_rate": 8.435655349597689e-07,
"loss": -0.1584,
"num_tokens": 61591873.0,
"residual_var": 0.030133936554193497,
"reward": 0.859375,
"reward_std": 0.06890985369682312,
"rewards/drgrpo_math_reward/mean": 0.859375,
"rewards/drgrpo_math_reward/std": 0.3483152687549591,
"rho2": 0.1562499701976776,
"step": 386
},
{
"advantages/mean": 1.1641532182693481e-09,
"advantages/snr": 4.355861921364802e-09,
"advantages/std": 0.267261266708374,
"advantages/var": 0.07142858468256463,
"completions/clipped_ratio": -2.9375,
"epoch": 2.2234957020057307,
"grad_norm": 61.97166050903191,
"learning_rate": 8.391343903863017e-07,
"loss": -0.4498,
"num_tokens": 61739668.0,
"residual_var": 0.049107152968645096,
"reward": 0.8359375,
"reward_std": 0.1442738026380539,
"rewards/drgrpo_math_reward/mean": 0.8359375,
"rewards/drgrpo_math_reward/std": 0.3710577189922333,
"rho2": 0.3124999403953552,
"step": 387
},
{
"advantages/mean": 0.0,
"advantages/snr": 0.0,
"advantages/std": 0.2082482874393463,
"advantages/var": 0.043367349221420604,
"completions/clipped_ratio": -2.765625,
"epoch": 2.2292263610315186,
"grad_norm": 46.20210801677951,
"learning_rate": 8.347064859705152e-07,
"loss": -0.8056,
"num_tokens": 61898279.0,
"residual_var": 0.037946440279483795,
"reward": 0.8203125,
"reward_std": 0.08272669464349747,
"rewards/drgrpo_math_reward/mean": 0.8203125,
"rewards/drgrpo_math_reward/std": 0.38467901945114136,
"rho2": 0.12499997019767761,
"step": 388
},
{
"advantages/mean": -4.656612873077393e-10,
"advantages/snr": 3.209860470786324e-09,
"advantages/std": 0.1450721174478531,
"advantages/var": 0.02104591926080368,
"completions/clipped_ratio": -2.90625,
"epoch": 2.2349570200573066,
"grad_norm": 35.73079034504171,
"learning_rate": 8.302819108993311e-07,
"loss": -0.3412,
"num_tokens": 62034899.0,
"residual_var": 0.018415190279483795,
"reward": 0.89453125,
"reward_std": 0.04761157184839249,
"rewards/drgrpo_math_reward/mean": 0.89453125,
"rewards/drgrpo_math_reward/std": 0.3077581524848938,
"rho2": 0.12499997764825821,
"step": 389
},
{
"advantages/mean": -2.3283064365386963e-10,
"advantages/snr": 1.2664125371788293e-09,
"advantages/std": 0.18385054171085358,
"advantages/var": 0.03380102168737431,
"completions/clipped_ratio": -2.953125,
"epoch": 2.2406876790830945,
"grad_norm": 45.68840941118221,
"learning_rate": 8.258607542926114e-07,
"loss": -0.2069,
"num_tokens": 62159693.0,
"residual_var": 0.02851962298154831,
"reward": 0.96484375,
"reward_std": 0.0672023743391037,
"rewards/drgrpo_math_reward/mean": 0.96484375,
"rewards/drgrpo_math_reward/std": 0.18453538417816162,
"rho2": 0.1562499701976776,
"step": 390
},
{
"advantages/mean": 9.313225746154785e-10,
"advantages/snr": 3.116801460854865e-09,
"advantages/std": 0.29880714416503906,
"advantages/var": 0.08928570940406644,
"completions/clipped_ratio": -2.859375,
"epoch": 2.2464183381088825,
"grad_norm": 54.218033325007504,
"learning_rate": 8.214431052013634e-07,
"loss": -0.6774,
"num_tokens": 62285721.0,
"residual_var": 0.0641741156578064,
"reward": 0.8671875,
"reward_std": 0.16834035515785217,
"rewards/drgrpo_math_reward/mean": 0.8671875,
"rewards/drgrpo_math_reward/std": 0.3400367796421051,
"rho2": 0.2812499403953552,
"step": 391
},
{
"advantages/mean": -1.1641532182693481e-10,
"advantages/snr": 5.059925164551475e-10,
"advantages/std": 0.23007319867610931,
"advantages/var": 0.05293367674905647,
"completions/clipped_ratio": -2.859375,
"epoch": 2.2521489971346704,
"grad_norm": 79.24952958588219,
"learning_rate": 8.170290526059452e-07,
"loss": -0.2492,
"num_tokens": 62431924.0,
"residual_var": 0.03804609179496765,
"reward": 0.87109375,
"reward_std": 0.11310647428035736,
"rewards/drgrpo_math_reward/mean": 0.87109375,
"rewards/drgrpo_math_reward/std": 0.33575257658958435,
"rho2": 0.2812499403953552,
"step": 392
},
{
"advantages/mean": 6.984919309616089e-10,
"advantages/snr": 3.3790683972744836e-09,
"advantages/std": 0.20671138167381287,
"advantages/var": 0.04272959531349674,
"completions/clipped_ratio": -2.734375,
"epoch": 2.2578796561604584,
"grad_norm": 51.74546036901536,
"learning_rate": 8.126186854142751e-07,
"loss": 0.0164,
"num_tokens": 62567990.0,
"residual_var": 0.03338250517845154,
"reward": 0.74609375,
"reward_std": 0.08929947018623352,
"rewards/drgrpo_math_reward/mean": 0.74609375,
"rewards/drgrpo_math_reward/std": 0.4360972046852112,
"rho2": 0.2187499701976776,
"step": 393
},
{
"advantages/mean": -1.1641532182693481e-09,
"advantages/snr": 6.332062685894146e-09,
"advantages/std": 0.18385054171085358,
"advantages/var": 0.03380102168737431,
"completions/clipped_ratio": -2.9375,
"epoch": 2.2636103151862463,
"grad_norm": 103.33108063231454,
"learning_rate": 8.0821209246004e-07,
"loss": 0.045,
"num_tokens": 62709253.0,
"residual_var": 0.02851962298154831,
"reward": 0.89453125,
"reward_std": 0.0672023743391037,
"rewards/drgrpo_math_reward/mean": 0.89453125,
"rewards/drgrpo_math_reward/std": 0.3077581524848938,
"rho2": 0.1562499701976776,
"step": 394
},
{
"advantages/mean": -2.3283064365386963e-10,
"advantages/snr": 8.279465729037828e-10,
"advantages/std": 0.28121456503868103,
"advantages/var": 0.07908163158989456,
"completions/clipped_ratio": -2.796875,
"epoch": 2.2693409742120343,
"grad_norm": 49.10069181374784,
"learning_rate": 8.03809362500905e-07,
"loss": -0.7296,
"num_tokens": 62864754.0,
"residual_var": 0.05683993920683861,
"reward": 0.75,
"reward_std": 0.14518244564533234,
"rewards/drgrpo_math_reward/mean": 0.75,
"rewards/drgrpo_math_reward/std": 0.4338609278202057,
"rho2": 0.2812499403953552,
"step": 395
},
{
"advantages/mean": 6.984919309616089e-10,
"advantages/snr": 2.4258431181477625e-09,
"advantages/std": 0.28793779015541077,
"advantages/var": 0.08290817099958137,
"completions/clipped_ratio": -2.578125,
"epoch": 2.2750716332378222,
"grad_norm": 60.20448218570951,
"learning_rate": 7.994105842167272e-07,
"loss": -0.8891,
"num_tokens": 63030352.0,
"residual_var": 0.05699938163161278,
"reward": 0.75,
"reward_std": 0.1626875400543213,
"rewards/drgrpo_math_reward/mean": 0.75,
"rewards/drgrpo_math_reward/std": 0.4338609278202057,
"rho2": 0.3124999403953552,
"step": 396
},
{
"advantages/mean": 4.656612873077393e-10,
"advantages/snr": 2.5328250743576586e-09,
"advantages/std": 0.18385054171085358,
"advantages/var": 0.03380102168737431,
"completions/clipped_ratio": -2.859375,
"epoch": 2.28080229226361,
"grad_norm": 40.465213721376465,
"learning_rate": 7.950158462077697e-07,
"loss": -0.189,
"num_tokens": 63171710.0,
"residual_var": 0.02851962298154831,
"reward": 0.80078125,
"reward_std": 0.0672023743391037,
"rewards/drgrpo_math_reward/mean": 0.80078125,
"rewards/drgrpo_math_reward/std": 0.40019527077674866,
"rho2": 0.1562499701976776,
"step": 397
},
{
"advantages/mean": 0.0,
"advantages/snr": 0.0,
"advantages/std": 0.282346248626709,
"advantages/var": 0.07971940411357537,
"completions/clipped_ratio": -2.8125,
"epoch": 2.286532951289398,
"grad_norm": 59.346861894412804,
"learning_rate": 7.906252369929154e-07,
"loss": -0.5438,
"num_tokens": 63322080.0,
"residual_var": 0.05480709299445152,
"reward": 0.85546875,
"reward_std": 0.1528160572052002,
"rewards/drgrpo_math_reward/mean": 0.85546875,
"rewards/drgrpo_math_reward/std": 0.35231640934944153,
"rho2": 0.3124999403953552,
"step": 398
},
{
"advantages/mean": 1.3969838619232178e-09,
"advantages/snr": 7.025359058465554e-09,
"advantages/std": 0.19884873926639557,
"advantages/var": 0.03954082110783497,
"completions/clipped_ratio": -2.9375,
"epoch": 2.292263610315186,
"grad_norm": 59.615400328722934,
"learning_rate": 7.862388450078854e-07,
"loss": -0.3341,
"num_tokens": 63458274.0,
"residual_var": 0.03089127317070961,
"reward": 0.828125,
"reward_std": 0.08588206768035889,
"rewards/drgrpo_math_reward/mean": 0.828125,
"rewards/drgrpo_math_reward/std": 0.3780108094215393,
"rho2": 0.21874994039535522,
"step": 399
},
{
"advantages/mean": -1.1641532182693481e-09,
"advantages/snr": 4.375439129655397e-09,
"advantages/std": 0.26606544852256775,
"advantages/var": 0.07079082289751515,
"completions/clipped_ratio": -2.9375,
"epoch": 2.297994269340974,
"grad_norm": 51.14379978645281,
"learning_rate": 7.818567586034576e-07,
"loss": -0.2393,
"num_tokens": 63616372.0,
"residual_var": 0.046456485986709595,
"reward": 0.87890625,
"reward_std": 0.1437433660030365,
"rewards/drgrpo_math_reward/mean": 0.87890625,
"rewards/drgrpo_math_reward/std": 0.3268752694129944,
"rho2": 0.3437499403953552,
"step": 400
},
{
"advantages/mean": -2.3283064365386963e-10,
"advantages/snr": 9.509314693234178e-10,
"advantages/std": 0.24484480917453766,
"advantages/var": 0.05994898057971576,
"completions/clipped_ratio": -2.90625,
"epoch": 2.303724928366762,
"grad_norm": 62.83649070850356,
"learning_rate": 7.774790660436857e-07,
"loss": -0.4879,
"num_tokens": 63750284.0,
"residual_var": 0.048708558082580566,
"reward": 0.9140625,
"reward_std": 0.11336849629878998,
"rewards/drgrpo_math_reward/mean": 0.9140625,
"rewards/drgrpo_math_reward/std": 0.28082075715065,
"rho2": 0.1874999701976776,
"step": 401
},
{
"advantages/mean": 6.984919309616089e-10,
"advantages/snr": 2.9653419053591074e-09,
"advantages/std": 0.2355518937110901,
"advantages/var": 0.05548469463088068,
"completions/clipped_ratio": -2.890625,
"epoch": 2.30945558739255,
"grad_norm": 66.03436103957057,
"learning_rate": 7.731058555041236e-07,
"loss": -0.0605,
"num_tokens": 63886228.0,
"residual_var": 0.039879634976387024,
"reward": 0.85546875,
"reward_std": 0.1148114949464798,
"rewards/drgrpo_math_reward/mean": 0.85546875,
"rewards/drgrpo_math_reward/std": 0.35231640934944153,
"rho2": 0.2812499403953552,
"step": 402
},
{
"advantages/mean": 2.3283064365386963e-10,
"advantages/snr": 9.664790625930143e-10,
"advantages/std": 0.2409060299396515,
"advantages/var": 0.05803571526128426,
"completions/clipped_ratio": -2.890625,
"epoch": 2.315186246418338,
"grad_norm": 85.42827831346187,
"learning_rate": 7.687372150700479e-07,
"loss": 0.1938,
"num_tokens": 64032369.0,
"residual_var": 0.04534041881561279,
"reward": 0.81640625,
"reward_std": 0.11112815141677856,
"rewards/drgrpo_math_reward/mean": 0.81640625,
"rewards/drgrpo_math_reward/std": 0.387910932302475,
"rho2": 0.21874995529651642,
"step": 403
},
{
"advantages/mean": 4.656612873077393e-10,
"advantages/snr": 2.1291806000454375e-09,
"advantages/std": 0.2187044471502304,
"advantages/var": 0.047831635203287926,
"completions/clipped_ratio": -3.0,
"epoch": 2.3209169054441263,
"grad_norm": 36.125332534122556,
"learning_rate": 7.643732327346841e-07,
"loss": -0.1364,
"num_tokens": 64160653.0,
"residual_var": 0.037368472665548325,
"reward": 0.93359375,
"reward_std": 0.10087842494249344,
"rewards/drgrpo_math_reward/mean": 0.93359375,
"rewards/drgrpo_math_reward/std": 0.24947863817214966,
"rho2": 0.2187499701976776,
"step": 404
},
{
"advantages/mean": 1.1641532182693481e-10,
"advantages/snr": 4.754657346617089e-10,
"advantages/std": 0.24484480917453766,
"advantages/var": 0.05994898057971576,
"completions/clipped_ratio": -2.703125,
"epoch": 2.326647564469914,
"grad_norm": 48.26815772105712,
"learning_rate": 7.60013996397434e-07,
"loss": -0.5838,
"num_tokens": 64339916.0,
"residual_var": 0.04308834299445152,
"reward": 0.71875,
"reward_std": 0.1263929009437561,
"rewards/drgrpo_math_reward/mean": 0.71875,
"rewards/drgrpo_math_reward/std": 0.45048993825912476,
"rho2": 0.2812499403953552,
"step": 405
},
{
"advantages/mean": 0.0,
"advantages/snr": 0.0,
"advantages/std": 0.22446081042289734,
"advantages/var": 0.05038265541570386,
"completions/clipped_ratio": -2.953125,
"epoch": 2.3323782234957022,
"grad_norm": 43.58466398770359,
"learning_rate": 7.556595938621058e-07,
"loss": -0.1253,
"num_tokens": 64484115.0,
"residual_var": 0.04093591868877411,
"reward": 0.85546875,
"reward_std": 0.10311631113290787,
"rewards/drgrpo_math_reward/mean": 0.85546875,
"rewards/drgrpo_math_reward/std": 0.35231640934944153,
"rho2": 0.18749995529651642,
"step": 406
},
{
"advantages/mean": 5.820766091346741e-10,
"advantages/snr": 2.069866432259457e-09,
"advantages/std": 0.28121456503868103,
"advantages/var": 0.07908163158989456,
"completions/clipped_ratio": -2.78125,
"epoch": 2.3381088825214897,
"grad_norm": 53.382125444657255,
"learning_rate": 7.513101128351453e-07,
"loss": 0.147,
"num_tokens": 64631940.0,
"residual_var": 0.05931123346090317,
"reward": 0.7578125,
"reward_std": 0.15216940641403198,
"rewards/drgrpo_math_reward/mean": 0.7578125,
"rewards/drgrpo_math_reward/std": 0.4292463958263397,
"rho2": 0.2499999701976776,
"step": 407
},
{
"advantages/mean": 4.656612873077393e-10,
"advantages/snr": 2.0362738671494303e-09,
"advantages/std": 0.22868302464485168,
"advantages/var": 0.05229592576071784,
"completions/clipped_ratio": -2.765625,
"epoch": 2.343839541547278,
"grad_norm": 60.18215125277809,
"learning_rate": 7.469656409238683e-07,
"loss": -0.3729,
"num_tokens": 64772308.0,
"residual_var": 0.039221953600645065,
"reward": 0.8359375,
"reward_std": 0.11192695796489716,
"rewards/drgrpo_math_reward/mean": 0.8359375,
"rewards/drgrpo_math_reward/std": 0.3710577189922333,
"rho2": 0.24999995529651642,
"step": 408
},
{
"advantages/mean": -6.984919309616089e-10,
"advantages/snr": 3.092355309744537e-09,
"advantages/std": 0.22587698698043823,
"advantages/var": 0.05102041324736106,
"completions/clipped_ratio": -2.953125,
"epoch": 2.349570200573066,
"grad_norm": 45.72333700789412,
"learning_rate": 7.426262656346978e-07,
"loss": -0.6002,
"num_tokens": 64911639.0,
"residual_var": 0.03985970839858055,
"reward": 0.890625,
"reward_std": 0.10429581999778748,
"rewards/drgrpo_math_reward/mean": 0.890625,
"rewards/drgrpo_math_reward/std": 0.31272050738334656,
"rho2": 0.21874995529651642,
"step": 409
},
{
"advantages/mean": 0.0,
"advantages/snr": 0.0,
"advantages/std": 0.0,
"advantages/var": 0.0,
"completions/clipped_ratio": -3.0,
"epoch": 2.355300859598854,
"grad_norm": 0.0,
"learning_rate": 7.382920743713998e-07,
"loss": 0.0,
"num_tokens": 65039171.0,
"residual_var": 9.99999993922529e-09,
"reward": 0.9375,
"reward_std": 0.0,
"rewards/drgrpo_math_reward/mean": 0.9375,
"rewards/drgrpo_math_reward/std": 0.24253563582897186,
"rho2": 0.0,
"step": 410
},
{
"advantages/mean": 2.3283064365386963e-10,
"advantages/snr": 8.830796510959388e-10,
"advantages/std": 0.2636575698852539,
"advantages/var": 0.06951531415779755,
"completions/clipped_ratio": -2.9375,
"epoch": 2.361031518624642,
"grad_norm": 93.30075449743337,
"learning_rate": 7.33963154433325e-07,
"loss": -0.0906,
"num_tokens": 65182661.0,
"residual_var": 0.04779178649187088,
"reward": 0.83984375,
"reward_std": 0.13611222803592682,
"rewards/drgrpo_math_reward/mean": 0.83984375,
"rewards/drgrpo_math_reward/std": 0.36746934056282043,
"rho2": 0.3124999403953552,
"step": 411
},
{
"advantages/mean": -1.6298145055770874e-09,
"advantages/snr": 6.1815575576715716e-09,
"advantages/std": 0.2636575698852539,
"advantages/var": 0.06951531415779755,
"completions/clipped_ratio": -2.96875,
"epoch": 2.36676217765043,
"grad_norm": 94.04588923049533,
"learning_rate": 7.29639593013647e-07,
"loss": 0.2124,
"num_tokens": 65331080.0,
"residual_var": 0.04779178649187088,
"reward": 0.89453125,
"reward_std": 0.14256632328033447,
"rewards/drgrpo_math_reward/mean": 0.89453125,
"rewards/drgrpo_math_reward/std": 0.3077581524848938,
"rho2": 0.3124999403953552,
"step": 412
},
{
"advantages/mean": -6.984919309616089e-10,
"advantages/snr": 2.6738836260536997e-09,
"advantages/std": 0.26122748851776123,
"advantages/var": 0.06823980075729708,
"completions/clipped_ratio": -2.90625,
"epoch": 2.372492836676218,
"grad_norm": 52.465569457913595,
"learning_rate": 7.2532147719761e-07,
"loss": -0.4931,
"num_tokens": 65476367.0,
"residual_var": 0.053312353789806366,
"reward": 0.87890625,
"reward_std": 0.12783199548721313,
"rewards/drgrpo_math_reward/mean": 0.87890625,
"rewards/drgrpo_math_reward/std": 0.3268752694129944,
"rho2": 0.21874995529651642,
"step": 413
},
{
"advantages/mean": 6.984919309616089e-10,
"advantages/snr": 3.329736467094508e-09,
"advantages/std": 0.2097739279270172,
"advantages/var": 0.04400510083792941,
"completions/clipped_ratio": -3.0,
"epoch": 2.378223495702006,
"grad_norm": 39.472581591236306,
"learning_rate": 7.210088939607708e-07,
"loss": 0.0679,
"num_tokens": 65615937.0,
"residual_var": 0.03300383687019348,
"reward": 0.92578125,
"reward_std": 0.0969306156039238,
"rewards/drgrpo_math_reward/mean": 0.92578125,
"rewards/drgrpo_math_reward/std": 0.2626400291919708,
"rho2": 0.24999994039535522,
"step": 414
},
{
"advantages/mean": 0.0,
"advantages/snr": 0.0,
"advantages/std": 0.13832083344459534,
"advantages/var": 0.019132652964807484,
"completions/clipped_ratio": -2.859375,
"epoch": 2.383954154727794,
"grad_norm": 32.19586387362773,
"learning_rate": 7.167019301672508e-07,
"loss": -0.2886,
"num_tokens": 65760842.0,
"residual_var": 0.017936870455741882,
"reward": 0.8359375,
"reward_std": 0.0388009138405323,
"rewards/drgrpo_math_reward/mean": 0.8359375,
"rewards/drgrpo_math_reward/std": 0.3710577189922333,
"rho2": 0.062499988824129105,
"step": 415
},
{
"advantages/mean": -1.1641532182693481e-10,
"advantages/snr": 5.186442510510295e-10,
"advantages/std": 0.22446082532405853,
"advantages/var": 0.050382662105157516,
"completions/clipped_ratio": -2.8125,
"epoch": 2.3896848137535818,
"grad_norm": 52.24445047939618,
"learning_rate": 7.124006725679828e-07,
"loss": -0.283,
"num_tokens": 65917872.0,
"residual_var": 0.037787001579999924,
"reward": 0.80078125,
"reward_std": 0.10376540571451187,
"rewards/drgrpo_math_reward/mean": 0.80078125,
"rewards/drgrpo_math_reward/std": 0.40019527077674866,
"rho2": 0.24999995529651642,
"step": 416
},
{
"advantages/mean": -4.656612873077393e-10,
"advantages/snr": 2.115126408796943e-09,
"advantages/std": 0.220157653093338,
"advantages/var": 0.048469392215566565,
"completions/clipped_ratio": -2.84375,
"epoch": 2.3954154727793697,
"grad_norm": 74.83747879148169,
"learning_rate": 7.081052077989667e-07,
"loss": -0.2753,
"num_tokens": 66061607.0,
"residual_var": 0.03635205700993538,
"reward": 0.90625,
"reward_std": 0.10205793380737305,
"rewards/drgrpo_math_reward/mean": 0.90625,
"rewards/drgrpo_math_reward/std": 0.2920515835285187,
"rho2": 0.24999995529651642,
"step": 417
},
{
"advantages/mean": 2.3283064365386963e-10,
"advantages/snr": 1.2910053593143641e-09,
"advantages/std": 0.18034830689430237,
"advantages/var": 0.03252551179964147,
"completions/clipped_ratio": -3.0,
"epoch": 2.4011461318051577,
"grad_norm": 31.091632611376465,
"learning_rate": 7.038156223795224e-07,
"loss": -0.1819,
"num_tokens": 66191762.0,
"residual_var": 0.027443408966064453,
"reward": 0.87109375,
"reward_std": 0.07194654643535614,
"rewards/drgrpo_math_reward/mean": 0.87109375,
"rewards/drgrpo_math_reward/std": 0.33575257658958435,
"rho2": 0.1562499701976776,
"step": 418
},
{
"advantages/mean": 3.4924596548080444e-10,
"advantages/snr": 1.3760799961512823e-09,
"advantages/std": 0.253797709941864,
"advantages/var": 0.06441327757173454,
"completions/clipped_ratio": -2.828125,
"epoch": 2.4068767908309456,
"grad_norm": 50.8649830535786,
"learning_rate": 6.995320027105481e-07,
"loss": -0.2243,
"num_tokens": 66344202.0,
"residual_var": 0.05032287538051605,
"reward": 0.75390625,
"reward_std": 0.12441704422235489,
"rewards/drgrpo_math_reward/mean": 0.75390625,
"rewards/drgrpo_math_reward/std": 0.43157756328582764,
"rho2": 0.2187499701976776,
"step": 419
},
{
"advantages/mean": 5.820766091346741e-10,
"advantages/snr": 2.5610062551388323e-09,
"advantages/std": 0.22728432714939117,
"advantages/var": 0.051658165367751474,
"completions/clipped_ratio": -2.765625,
"epoch": 2.4126074498567336,
"grad_norm": 55.606646356605715,
"learning_rate": 6.952544350727799e-07,
"loss": -0.2911,
"num_tokens": 66490917.0,
"residual_var": 0.041972268372774124,
"reward": 0.81640625,
"reward_std": 0.09837214648723602,
"rewards/drgrpo_math_reward/mean": 0.81640625,
"rewards/drgrpo_math_reward/std": 0.387910932302475,
"rho2": 0.1874999701976776,
"step": 420
},
{
"advantages/mean": -9.313225746154785e-10,
"advantages/snr": 4.0725477342988605e-09,
"advantages/std": 0.22868302464485168,
"advantages/var": 0.05229592576071784,
"completions/clipped_ratio": -2.90625,
"epoch": 2.4183381088825215,
"grad_norm": 62.95766779171665,
"learning_rate": 6.909830056250526e-07,
"loss": -0.2863,
"num_tokens": 66648538.0,
"residual_var": 0.03922194987535477,
"reward": 0.828125,
"reward_std": 0.1054728776216507,
"rewards/drgrpo_math_reward/mean": 0.828125,
"rewards/drgrpo_math_reward/std": 0.3780108094215393,
"rho2": 0.24999994039535522,
"step": 421
},
{
"advantages/mean": 1.1641532182693481e-09,
"advantages/snr": 5.7177670636746294e-09,
"advantages/std": 0.20360276103019714,
"advantages/var": 0.041454084299119565,
"completions/clipped_ratio": -2.96875,
"epoch": 2.4240687679083095,
"grad_norm": 32.371041420700465,
"learning_rate": 6.867178004025676e-07,
"loss": -0.348,
"num_tokens": 66792036.0,
"residual_var": 0.03238601237535477,
"reward": 0.83984375,
"reward_std": 0.08758954703807831,
"rewards/drgrpo_math_reward/mean": 0.83984375,
"rewards/drgrpo_math_reward/std": 0.36746934056282043,
"rho2": 0.21874994039535522,
"step": 422
},
{
"advantages/mean": -1.1641532182693481e-10,
"advantages/snr": 6.160119582101702e-10,
"advantages/std": 0.18898223340511322,
"advantages/var": 0.03571428454278469,
"completions/clipped_ratio": -2.828125,
"epoch": 2.4297994269340975,
"grad_norm": 39.984557826282405,
"learning_rate": 6.824589053151557e-07,
"loss": -0.333,
"num_tokens": 66929298.0,
"residual_var": 0.030133940279483795,
"reward": 0.8984375,
"reward_std": 0.07536394149065018,
"rewards/drgrpo_math_reward/mean": 0.8984375,
"rewards/drgrpo_math_reward/std": 0.3026638329029083,
"rho2": 0.1562499701976776,
"step": 423
},
{
"advantages/mean": -9.313225746154785e-10,
"advantages/snr": 3.725290149450308e-09,
"advantages/std": 0.25,
"advantages/var": 0.0625,
"completions/clipped_ratio": -2.921875,
"epoch": 2.4355300859598854,
"grad_norm": 50.055448927921034,
"learning_rate": 6.782064061455504e-07,
"loss": -0.4335,
"num_tokens": 67077150.0,
"residual_var": 0.046875014901161194,
"reward": 0.890625,
"reward_std": 0.12217670679092407,
"rewards/drgrpo_math_reward/mean": 0.890625,
"rewards/drgrpo_math_reward/std": 0.31272050738334656,
"rho2": 0.24999994039535522,
"step": 424
},
{
"advantages/mean": -1.280568540096283e-09,
"advantages/snr": 4.590879840937069e-09,
"advantages/std": 0.2789374887943268,
"advantages/var": 0.07780612265488518,
"completions/clipped_ratio": -2.9375,
"epoch": 2.4412607449856734,
"grad_norm": 57.2938109556653,
"learning_rate": 6.739603885476582e-07,
"loss": -0.111,
"num_tokens": 67227554.0,
"residual_var": 0.06564892828464508,
"reward": 0.8203125,
"reward_std": 0.13743507862091064,
"rewards/drgrpo_math_reward/mean": 0.8203125,
"rewards/drgrpo_math_reward/std": 0.38467901945114136,
"rho2": 0.1562499701976776,
"step": 425
},
{
"advantages/mean": -1.1641532182693481e-09,
"advantages/snr": 4.28010223419274e-09,
"advantages/std": 0.27199190855026245,
"advantages/var": 0.07397959831681433,
"completions/clipped_ratio": -2.75,
"epoch": 2.4469914040114613,
"grad_norm": 55.31860953422715,
"learning_rate": 6.697209380448332e-07,
"loss": -0.703,
"num_tokens": 67387084.0,
"residual_var": 0.05548471212387085,
"reward": 0.765625,
"reward_std": 0.14059044420719147,
"rewards/drgrpo_math_reward/mean": 0.765625,
"rewards/drgrpo_math_reward/std": 0.42443734407424927,
"rho2": 0.24999995529651642,
"step": 426
},
{
"advantages/mean": -6.984919309616089e-10,
"advantages/snr": 3.4306602382047774e-09,
"advantages/std": 0.20360276103019714,
"advantages/var": 0.041454084299119565,
"completions/clipped_ratio": -2.921875,
"epoch": 2.4527220630372493,
"grad_norm": 42.06385390657045,
"learning_rate": 6.654881400281547e-07,
"loss": -0.0225,
"num_tokens": 67524018.0,
"residual_var": 0.03368144854903221,
"reward": 0.92578125,
"reward_std": 0.08166831731796265,
"rewards/drgrpo_math_reward/mean": 0.92578125,
"rewards/drgrpo_math_reward/std": 0.2626400291919708,
"rho2": 0.1874999701976776,
"step": 427
},
{
"advantages/mean": 0.0,
"advantages/snr": 0.0,
"advantages/std": 0.24743583798408508,
"advantages/var": 0.0612244939188864,
"completions/clipped_ratio": -2.96875,
"epoch": 2.458452722063037,
"grad_norm": 49.93600952633924,
"learning_rate": 6.612620797547086e-07,
"loss": -0.0831,
"num_tokens": 67655964.0,
"residual_var": 0.04591837897896767,
"reward": 0.890625,
"reward_std": 0.1140126883983612,
"rewards/drgrpo_math_reward/mean": 0.890625,
"rewards/drgrpo_math_reward/std": 0.31272050738334656,
"rho2": 0.24999994039535522,
"step": 428
},
{
"advantages/mean": 0.0,
"advantages/snr": 0.0,
"advantages/std": 0.23007319867610931,
"advantages/var": 0.05293367674905647,
"completions/clipped_ratio": -2.828125,
"epoch": 2.464183381088825,
"grad_norm": 73.33296408165174,
"learning_rate": 6.570428423458686e-07,
"loss": -0.0866,
"num_tokens": 67818794.0,
"residual_var": 0.03804609179496765,
"reward": 0.80859375,
"reward_std": 0.11310647428035736,
"rewards/drgrpo_math_reward/mean": 0.80859375,
"rewards/drgrpo_math_reward/std": 0.39417871832847595,
"rho2": 0.2812499701976776,
"step": 429
},
{
"advantages/mean": -4.656612873077393e-10,
"advantages/snr": 2.061570206496358e-09,
"advantages/std": 0.22587698698043823,
"advantages/var": 0.05102041324736106,
"completions/clipped_ratio": -2.828125,
"epoch": 2.469914040114613,
"grad_norm": 41.3588146711098,
"learning_rate": 6.528305127855815e-07,
"loss": -0.1169,
"num_tokens": 67958774.0,
"residual_var": 0.03985970839858055,
"reward": 0.8125,
"reward_std": 0.09784172475337982,
"rewards/drgrpo_math_reward/mean": 0.8125,
"rewards/drgrpo_math_reward/std": 0.3910769522190094,
"rho2": 0.21874995529651642,
"step": 430
},
{
"advantages/mean": -6.984919309616089e-10,
"advantages/snr": 2.5792027536326915e-09,
"advantages/std": 0.27081698179244995,
"advantages/var": 0.07334183762717217,
"completions/clipped_ratio": -2.859375,
"epoch": 2.475644699140401,
"grad_norm": 47.53806778721076,
"learning_rate": 6.486251759186572e-07,
"loss": -0.0798,
"num_tokens": 68123504.0,
"residual_var": 0.05271446332335472,
"reward": 0.80078125,
"reward_std": 0.1336059421300888,
"rewards/drgrpo_math_reward/mean": 0.80078125,
"rewards/drgrpo_math_reward/std": 0.40019527077674866,
"rho2": 0.2812499403953552,
"step": 431
},
{
"advantages/mean": -4.656612873077393e-10,
"advantages/snr": 2.1291804549763264e-09,
"advantages/std": 0.2187044620513916,
"advantages/var": 0.04783164172118859,
"completions/clipped_ratio": -2.90625,
"epoch": 2.481375358166189,
"grad_norm": 54.79795531786569,
"learning_rate": 6.444269164490578e-07,
"loss": -0.4716,
"num_tokens": 68270822.0,
"residual_var": 0.04035795107483864,
"reward": 0.82421875,
"reward_std": 0.08732114732265472,
"rewards/drgrpo_math_reward/mean": 0.82421875,
"rewards/drgrpo_math_reward/std": 0.3813795745372772,
"rho2": 0.15624995529651642,
"step": 432
},
{
"advantages/mean": 1.0477378964424133e-09,
"advantages/snr": 6.117115762230131e-09,
"advantages/std": 0.17127971351146698,
"advantages/var": 0.029336740260570204,
"completions/clipped_ratio": -2.90625,
"epoch": 2.487106017191977,
"grad_norm": 31.992254782418268,
"learning_rate": 6.402358189381933e-07,
"loss": 0.0456,
"num_tokens": 68409422.0,
"residual_var": 0.025669652968645096,
"reward": 0.84375,
"reward_std": 0.056153833866119385,
"rewards/drgrpo_math_reward/mean": 0.84375,
"rewards/drgrpo_math_reward/std": 0.3638034462928772,
"rho2": 0.1249999850988388,
"step": 433
},
{
"advantages/mean": 0.0,
"advantages/snr": 0.0,
"advantages/std": 0.23282866179943085,
"advantages/var": 0.05420918575531375,
"completions/clipped_ratio": -2.859375,
"epoch": 2.492836676217765,
"grad_norm": 45.283505671775984,
"learning_rate": 6.360519678032157e-07,
"loss": -0.039,
"num_tokens": 68570279.0,
"residual_var": 0.045739009976387024,
"reward": 0.80859375,
"reward_std": 0.10061003267765045,
"rewards/drgrpo_math_reward/mean": 0.80859375,
"rewards/drgrpo_math_reward/std": 0.39417871832847595,
"rho2": 0.1562499701976776,
"step": 434
},
{
"advantages/mean": -4.656612873077393e-10,
"advantages/snr": 2.7487602347642132e-09,
"advantages/std": 0.16940774023532867,
"advantages/var": 0.028698982451640598,
"completions/clipped_ratio": -2.859375,
"epoch": 2.498567335243553,
"grad_norm": 41.997973250055495,
"learning_rate": 6.31875447315322e-07,
"loss": 0.147,
"num_tokens": 68717741.0,
"residual_var": 0.02421477437019348,
"reward": 0.85546875,
"reward_std": 0.062077511101961136,
"rewards/drgrpo_math_reward/mean": 0.85546875,
"rewards/drgrpo_math_reward/std": 0.35231640934944153,
"rho2": 0.1562499701976776,
"step": 435
},
{
"advantages/mean": -1.1641532182693481e-09,
"advantages/snr": 3.66736918388653e-09,
"advantages/std": 0.31743550300598145,
"advantages/var": 0.10076529856866046,
"completions/clipped_ratio": -2.890625,
"epoch": 2.504297994269341,
"grad_norm": 68.79884909207622,
"learning_rate": 6.277063415980548e-07,
"loss": -0.269,
"num_tokens": 68869663.0,
"residual_var": 0.07242507487535477,
"reward": 0.7890625,
"reward_std": 0.18675412237644196,
"rewards/drgrpo_math_reward/mean": 0.7890625,
"rewards/drgrpo_math_reward/std": 0.4087733030319214,
"rho2": 0.2812499403953552,
"step": 436
},
{
"advantages/mean": -2.3283064365386963e-10,
"advantages/snr": 9.219622438173128e-10,
"advantages/std": 0.25253814458847046,
"advantages/var": 0.06377551447218721,
"completions/clipped_ratio": -2.953125,
"epoch": 2.510028653295129,
"grad_norm": 59.63463907817795,
"learning_rate": 6.23544734625608e-07,
"loss": -0.1937,
"num_tokens": 69011738.0,
"residual_var": 0.04783164709806442,
"reward": 0.828125,
"reward_std": 0.12388662248849869,
"rewards/drgrpo_math_reward/mean": 0.828125,
"rewards/drgrpo_math_reward/std": 0.3780108094215393,
"rho2": 0.24999995529651642,
"step": 437
},
{
"advantages/mean": 3.4924596548080444e-10,
"advantages/snr": 1.4418180357225341e-09,
"advantages/std": 0.24222609400749207,
"advantages/var": 0.058673480618126383,
"completions/clipped_ratio": -2.78125,
"epoch": 2.5157593123209168,
"grad_norm": 40.77789706768845,
"learning_rate": 6.193907102211358e-07,
"loss": -0.2813,
"num_tokens": 69161665.0,
"residual_var": 0.04400511831045151,
"reward": 0.7265625,
"reward_std": 0.11230766773223877,
"rewards/drgrpo_math_reward/mean": 0.7265625,
"rewards/drgrpo_math_reward/std": 0.446596622467041,
"rho2": 0.24999995529651642,
"step": 438
},
{
"advantages/mean": 4.656612873077393e-10,
"advantages/snr": 1.8168728588206165e-09,
"advantages/std": 0.2562982141971588,
"advantages/var": 0.0656887746006527,
"completions/clipped_ratio": -2.90625,
"epoch": 2.5214899713467047,
"grad_norm": 50.03689448920175,
"learning_rate": 6.152443520550641e-07,
"loss": -0.9247,
"num_tokens": 69316042.0,
"residual_var": 0.04721382260322571,
"reward": 0.75390625,
"reward_std": 0.13151532411575317,
"rewards/drgrpo_math_reward/mean": 0.75390625,
"rewards/drgrpo_math_reward/std": 0.43157756328582764,
"rho2": 0.2812499403953552,
"step": 439
},
{
"advantages/mean": 1.1641532182693481e-09,
"advantages/snr": 4.914075334034159e-09,
"advantages/std": 0.23690177500247955,
"advantages/var": 0.056122450999325446,
"completions/clipped_ratio": -2.9375,
"epoch": 2.5272206303724927,
"grad_norm": 49.160447041501605,
"learning_rate": 6.111057436434055e-07,
"loss": -0.2751,
"num_tokens": 69455761.0,
"residual_var": 0.04209184646606445,
"reward": 0.9296875,
"reward_std": 0.10888782143592834,
"rewards/drgrpo_math_reward/mean": 0.9296875,
"rewards/drgrpo_math_reward/std": 0.2561737895011902,
"rho2": 0.24999994039535522,
"step": 440
},
{
"advantages/mean": 2.3283064365386963e-10,
"advantages/snr": 9.219622438173128e-10,
"advantages/std": 0.25253814458847046,
"advantages/var": 0.06377551447218721,
"completions/clipped_ratio": -2.96875,
"epoch": 2.532951289398281,
"grad_norm": 47.156529472775865,
"learning_rate": 6.069749683460764e-07,
"loss": -0.1707,
"num_tokens": 69599648.0,
"residual_var": 0.05181761458516121,
"reward": 0.890625,
"reward_std": 0.11678344011306763,
"rewards/drgrpo_math_reward/mean": 0.890625,
"rewards/drgrpo_math_reward/std": 0.31272050738334656,
"rho2": 0.1874999701976776,
"step": 441
},
{
"advantages/mean": 4.656612873077393e-10,
"advantages/snr": 1.9656301336136633e-09,
"advantages/std": 0.23690177500247955,
"advantages/var": 0.056122450999325446,
"completions/clipped_ratio": -2.8125,
"epoch": 2.5386819484240686,
"grad_norm": 46.898459689438916,
"learning_rate": 6.028521093652194e-07,
"loss": -0.7164,
"num_tokens": 69765995.0,
"residual_var": 0.043845679610967636,
"reward": 0.8203125,
"reward_std": 0.10942068696022034,
"rewards/drgrpo_math_reward/mean": 0.8203125,
"rewards/drgrpo_math_reward/std": 0.38467901945114136,
"rho2": 0.21874995529651642,
"step": 442
},
{
"advantages/mean": 6.984919309616089e-10,
"advantages/snr": 3.054410800724145e-09,
"advantages/std": 0.22868302464485168,
"advantages/var": 0.05229592576071784,
"completions/clipped_ratio": -2.71875,
"epoch": 2.544412607449857,
"grad_norm": 36.281922684341176,
"learning_rate": 5.987372497435258e-07,
"loss": -0.2242,
"num_tokens": 69918989.0,
"residual_var": 0.04739318788051605,
"reward": 0.7890625,
"reward_std": 0.09179937839508057,
"rewards/drgrpo_math_reward/mean": 0.7890625,
"rewards/drgrpo_math_reward/std": 0.4087733030319214,
"rho2": 0.0937499850988388,
"step": 443
},
{
"advantages/mean": -4.656612873077393e-10,
"advantages/snr": 2.1730857333241115e-09,
"advantages/std": 0.2142857313156128,
"advantages/var": 0.045918374645467,
"completions/clipped_ratio": -2.828125,
"epoch": 2.5501432664756445,
"grad_norm": 37.74002948472885,
"learning_rate": 5.946304723625639e-07,
"loss": -0.0797,
"num_tokens": 70061853.0,
"residual_var": 0.038743630051612854,
"reward": 0.7734375,
"reward_std": 0.07915958762168884,
"rewards/drgrpo_math_reward/mean": 0.7734375,
"rewards/drgrpo_math_reward/std": 0.41942715644836426,
"rho2": 0.1562499701976776,
"step": 444
},
{
"advantages/mean": 1.6298145055770874e-09,
"advantages/snr": 7.2154957227372524e-09,
"advantages/std": 0.22587698698043823,
"advantages/var": 0.05102041324736106,
"completions/clipped_ratio": -2.6875,
"epoch": 2.555873925501433,
"grad_norm": 35.717771683637935,
"learning_rate": 5.905318599411097e-07,
"loss": -0.262,
"num_tokens": 70237613.0,
"residual_var": 0.03985970839858055,
"reward": 0.7578125,
"reward_std": 0.09784172475337982,
"rewards/drgrpo_math_reward/mean": 0.7578125,
"rewards/drgrpo_math_reward/std": 0.4292463958263397,
"rho2": 0.21874995529651642,
"step": 445
},
{
"advantages/mean": 0.0,
"advantages/snr": 0.0,
"advantages/std": 0.18728730082511902,
"advantages/var": 0.03507653305035863,
"completions/clipped_ratio": -2.90625,
"epoch": 2.5616045845272204,
"grad_norm": 40.02898953814623,
"learning_rate": 5.864414950334795e-07,
"loss": -0.2901,
"num_tokens": 70384711.0,
"residual_var": 0.028499692678451538,
"reward": 0.91796875,
"reward_std": 0.07483351975679398,
"rewards/drgrpo_math_reward/mean": 0.91796875,
"rewards/drgrpo_math_reward/std": 0.2749498784542084,
"rho2": 0.1874999701976776,
"step": 446
},
{
"advantages/mean": 6.984919309616089e-10,
"advantages/snr": 3.911554501079231e-09,
"advantages/std": 0.1785714328289032,
"advantages/var": 0.03188775662256749,
"completions/clipped_ratio": -2.84375,
"epoch": 2.567335243553009,
"grad_norm": 36.74075622863581,
"learning_rate": 5.82359460027869e-07,
"loss": 0.021,
"num_tokens": 70512660.0,
"residual_var": 0.025908811017870903,
"reward": 0.890625,
"reward_std": 0.07141612470149994,
"rewards/drgrpo_math_reward/mean": 0.890625,
"rewards/drgrpo_math_reward/std": 0.31272050738334656,
"rho2": 0.18749994039535522,
"step": 447
},
{
"advantages/mean": 1.1641532182693481e-10,
"advantages/snr": 5.090684667873576e-10,
"advantages/std": 0.22868302464485168,
"advantages/var": 0.05229592576071784,
"completions/clipped_ratio": -2.890625,
"epoch": 2.5730659025787963,
"grad_norm": 42.09374187090726,
"learning_rate": 5.782858371446927e-07,
"loss": -0.4403,
"num_tokens": 70665873.0,
"residual_var": 0.03922194987535477,
"reward": 0.828125,
"reward_std": 0.11192697286605835,
"rewards/drgrpo_math_reward/mean": 0.828125,
"rewards/drgrpo_math_reward/std": 0.3780108094215393,
"rho2": 0.24999994039535522,
"step": 448
},
{
"advantages/mean": 6.984919309616089e-10,
"advantages/snr": 4.034460560317909e-09,
"advantages/std": 0.17313142120838165,
"advantages/var": 0.029974489009634064,
"completions/clipped_ratio": -2.828125,
"epoch": 2.5787965616045847,
"grad_norm": 26.676232917963386,
"learning_rate": 5.742207084349273e-07,
"loss": -0.0319,
"num_tokens": 70810642.0,
"residual_var": 0.02435428649187088,
"reward": 0.80078125,
"reward_std": 0.06970866024494171,
"rewards/drgrpo_math_reward/mean": 0.80078125,
"rewards/drgrpo_math_reward/std": 0.40019527077674866,
"rho2": 0.18749995529651642,
"step": 449
},
{
"advantages/mean": -4.656612873077393e-10,
"advantages/snr": 2.991240497915941e-09,
"advantages/std": 0.15567496418952942,
"advantages/var": 0.024234694475411267,
"completions/clipped_ratio": -2.875,
"epoch": 2.5845272206303727,
"grad_norm": 24.910936996773792,
"learning_rate": 5.701641557784609e-07,
"loss": 0.0803,
"num_tokens": 70940467.0,
"residual_var": 0.022720035165548325,
"reward": 0.84375,
"reward_std": 0.043925780802965164,
"rewards/drgrpo_math_reward/mean": 0.84375,
"rewards/drgrpo_math_reward/std": 0.3638034462928772,
"rho2": 0.062499985098838806,
"step": 450
},
{
"advantages/mean": 9.313225746154785e-10,
"advantages/snr": 3.9537892071454765e-09,
"advantages/std": 0.2355518937110901,
"advantages/var": 0.05548469463088068,
"completions/clipped_ratio": -2.859375,
"epoch": 2.5902578796561606,
"grad_norm": 88.14454543805262,
"learning_rate": 5.661162608824419e-07,
"loss": -0.3043,
"num_tokens": 71083256.0,
"residual_var": 0.04508132487535477,
"reward": 0.82421875,
"reward_std": 0.10824117809534073,
"rewards/drgrpo_math_reward/mean": 0.82421875,
"rewards/drgrpo_math_reward/std": 0.3813795745372772,
"rho2": 0.1874999701976776,
"step": 451
},
{
"advantages/mean": -9.313225746154785e-10,
"advantages/snr": 5.982480995831882e-09,
"advantages/std": 0.15567496418952942,
"advantages/var": 0.024234694475411267,
"completions/clipped_ratio": -2.828125,
"epoch": 2.5959885386819486,
"grad_norm": 24.812763493051456,
"learning_rate": 5.620771052796338e-07,
"loss": 0.1889,
"num_tokens": 71235989.0,
"residual_var": 0.021205367520451546,
"reward": 0.890625,
"reward_std": 0.05102896690368652,
"rewards/drgrpo_math_reward/mean": 0.890625,
"rewards/drgrpo_math_reward/std": 0.31272050738334656,
"rho2": 0.12499997764825821,
"step": 452
},
{
"advantages/mean": -1.1641532182693481e-09,
"advantages/snr": 5.000042345610262e-09,
"advantages/std": 0.23282866179943085,
"advantages/var": 0.05420918575531375,
"completions/clipped_ratio": -2.90625,
"epoch": 2.6017191977077365,
"grad_norm": 40.96379593667113,
"learning_rate": 5.580467703267735e-07,
"loss": -0.443,
"num_tokens": 71376750.0,
"residual_var": 0.045739009976387024,
"reward": 0.83203125,
"reward_std": 0.10061003267765045,
"rewards/drgrpo_math_reward/mean": 0.83203125,
"rewards/drgrpo_math_reward/std": 0.3745708465576172,
"rho2": 0.1562499701976776,
"step": 453
},
{
"advantages/mean": 5.820766091346741e-10,
"advantages/snr": 3.2596287508993594e-09,
"advantages/std": 0.1785714328289032,
"advantages/var": 0.03188775662256749,
"completions/clipped_ratio": -2.859375,
"epoch": 2.6074498567335245,
"grad_norm": 29.902235896857928,
"learning_rate": 5.540253372029314e-07,
"loss": -0.0184,
"num_tokens": 71512479.0,
"residual_var": 0.028898291289806366,
"reward": 0.8671875,
"reward_std": 0.058391720056533813,
"rewards/drgrpo_math_reward/mean": 0.8671875,
"rewards/drgrpo_math_reward/std": 0.3400367796421051,
"rho2": 0.09374997764825821,
"step": 454
},
{
"advantages/mean": -6.984919309616089e-10,
"advantages/snr": 3.305866683433113e-09,
"advantages/std": 0.21128858625888824,
"advantages/var": 0.04464286668327966,
"completions/clipped_ratio": -2.828125,
"epoch": 2.6131805157593124,
"grad_norm": 34.34699092506041,
"learning_rate": 5.500128869078788e-07,
"loss": 0.1558,
"num_tokens": 71659797.0,
"residual_var": 0.039062514901161194,
"reward": 0.8515625,
"reward_std": 0.08443661779165268,
"rewards/drgrpo_math_reward/mean": 0.8515625,
"rewards/drgrpo_math_reward/std": 0.3562295734882355,
"rho2": 0.12499997764825821,
"step": 455
},
{
"advantages/mean": -4.656612873077393e-10,
"advantages/snr": 2.380496292175044e-09,
"advantages/std": 0.19561520218849182,
"advantages/var": 0.038265307327244535,
"completions/clipped_ratio": -2.84375,
"epoch": 2.6189111747851004,
"grad_norm": 37.02449809638985,
"learning_rate": 5.460095002604532e-07,
"loss": -0.3228,
"num_tokens": 71800154.0,
"residual_var": 0.031090570613741875,
"reward": 0.875,
"reward_std": 0.07825092226266861,
"rewards/drgrpo_math_reward/mean": 0.875,
"rewards/drgrpo_math_reward/std": 0.33136674761772156,
"rho2": 0.1874999701976776,
"step": 456
},
{
"advantages/mean": -3.4924596548080444e-10,
"advantages/snr": 1.2369422001312725e-09,
"advantages/std": 0.2823462188243866,
"advantages/var": 0.0797193872844284,
"completions/clipped_ratio": -2.796875,
"epoch": 2.6246418338108883,
"grad_norm": 49.1134055528491,
"learning_rate": 5.420152578969325e-07,
"loss": -0.2936,
"num_tokens": 71964871.0,
"residual_var": 0.05480710044503212,
"reward": 0.82421875,
"reward_std": 0.15927013754844666,
"rewards/drgrpo_math_reward/mean": 0.82421875,
"rewards/drgrpo_math_reward/std": 0.3813795745372772,
"rho2": 0.3124999403953552,
"step": 457
},
{
"advantages/mean": 6.984919309616089e-10,
"advantages/snr": 2.8527944079702535e-09,
"advantages/std": 0.24484480917453766,
"advantages/var": 0.05994898057971576,
"completions/clipped_ratio": -2.96875,
"epoch": 2.6303724928366763,
"grad_norm": 46.372268356616715,
"learning_rate": 5.380302402694103e-07,
"loss": -0.2865,
"num_tokens": 72116926.0,
"residual_var": 0.04308834299445152,
"reward": 0.828125,
"reward_std": 0.11993882060050964,
"rewards/drgrpo_math_reward/mean": 0.828125,
"rewards/drgrpo_math_reward/std": 0.3780108094215393,
"rho2": 0.2812499403953552,
"step": 458
},
{
"advantages/mean": -4.656612873077393e-10,
"advantages/snr": 2.2198241537125106e-09,
"advantages/std": 0.2097739428281784,
"advantages/var": 0.04400510708967986,
"completions/clipped_ratio": -2.84375,
"epoch": 2.6361031518624642,
"grad_norm": 42.058817691725466,
"learning_rate": 5.340545276441754e-07,
"loss": -0.0683,
"num_tokens": 72260427.0,
"residual_var": 0.037129320204257965,
"reward": 0.89453125,
"reward_std": 0.08390620350837708,
"rewards/drgrpo_math_reward/mean": 0.89453125,
"rewards/drgrpo_math_reward/std": 0.3077581524848938,
"rho2": 0.1562499701976776,
"step": 459
},
{
"advantages/mean": -1.1641532182693481e-09,
"advantages/snr": 4.5643933797911646e-09,
"advantages/std": 0.25505101680755615,
"advantages/var": 0.06505102117456829,
"completions/clipped_ratio": -2.796875,
"epoch": 2.641833810888252,
"grad_norm": 37.5340582752598,
"learning_rate": 5.300882001000946e-07,
"loss": -0.4365,
"num_tokens": 72414368.0,
"residual_var": 0.046755433082580566,
"reward": 0.7578125,
"reward_std": 0.1250636875629425,
"rewards/drgrpo_math_reward/mean": 0.7578125,
"rewards/drgrpo_math_reward/std": 0.4292463958263397,
"rho2": 0.2812499403953552,
"step": 460
},
{
"advantages/mean": -9.313225746154785e-10,
"advantages/snr": 4.801170592896891e-09,
"advantages/std": 0.1939782202243805,
"advantages/var": 0.03762754992141826,
"completions/clipped_ratio": -2.875,
"epoch": 2.64756446991404,
"grad_norm": 28.066627315135047,
"learning_rate": 5.261313375270013e-07,
"loss": 0.2219,
"num_tokens": 72556336.0,
"residual_var": 0.0293965395539999,
"reward": 0.84765625,
"reward_std": 0.08417459577322006,
"rewards/drgrpo_math_reward/mean": 0.84765625,
"rewards/drgrpo_math_reward/std": 0.3600577116012573,
"rho2": 0.21874995529651642,
"step": 461
},
{
"advantages/mean": -4.656612873077393e-10,
"advantages/snr": 2.3049056547270335e-09,
"advantages/std": 0.2020305097103119,
"advantages/var": 0.04081632685380843,
"completions/clipped_ratio": -3.0,
"epoch": 2.653295128939828,
"grad_norm": 38.265928227983444,
"learning_rate": 5.221840196240848e-07,
"loss": -0.4421,
"num_tokens": 72694108.0,
"residual_var": 0.03188776224851608,
"reward": 0.8515625,
"reward_std": 0.0875919908285141,
"rewards/drgrpo_math_reward/mean": 0.8515625,
"rewards/drgrpo_math_reward/std": 0.3562295734882355,
"rho2": 0.2187499701976776,
"step": 462
},
{
"advantages/mean": -1.6298145055770874e-09,
"advantages/snr": 6.390150731707631e-09,
"advantages/std": 0.25505101680755615,
"advantages/var": 0.06505102117456829,
"completions/clipped_ratio": -2.921875,
"epoch": 2.659025787965616,
"grad_norm": 41.52771750903016,
"learning_rate": 5.182463258982846e-07,
"loss": -0.2373,
"num_tokens": 72839820.0,
"residual_var": 0.050821125507354736,
"reward": 0.8203125,
"reward_std": 0.1244145929813385,
"rewards/drgrpo_math_reward/mean": 0.8203125,
"rewards/drgrpo_math_reward/std": 0.38467901945114136,
"rho2": 0.21874994039535522,
"step": 463
},
{
"advantages/mean": 2.3283064365386963e-10,
"advantages/snr": 1.0059432951138956e-09,
"advantages/std": 0.2314550280570984,
"advantages/var": 0.0535714300129122,
"completions/clipped_ratio": -2.84375,
"epoch": 2.664756446991404,
"grad_norm": 44.56932098497142,
"learning_rate": 5.143183356626916e-07,
"loss": -0.3485,
"num_tokens": 72979719.0,
"residual_var": 0.040178585797548294,
"reward": 0.9296875,
"reward_std": 0.10718280076980591,
"rewards/drgrpo_math_reward/mean": 0.9296875,
"rewards/drgrpo_math_reward/std": 0.2561737895011902,
"rho2": 0.24999994039535522,
"step": 464
},
{
"advantages/mean": -1.1641532182693481e-09,
"advantages/snr": 4.5421821470515416e-09,
"advantages/std": 0.2562982141971588,
"advantages/var": 0.0656887746006527,
"completions/clipped_ratio": -2.953125,
"epoch": 2.670487106017192,
"grad_norm": 52.1727676862902,
"learning_rate": 5.104001280349479e-07,
"loss": -0.217,
"num_tokens": 73117461.0,
"residual_var": 0.049266595393419266,
"reward": 0.83984375,
"reward_std": 0.1255940943956375,
"rewards/drgrpo_math_reward/mean": 0.83984375,
"rewards/drgrpo_math_reward/std": 0.36746934056282043,
"rho2": 0.24999994039535522,
"step": 465
},
{
"advantages/mean": -6.984919309616089e-10,
"advantages/snr": 2.712175018057454e-09,
"advantages/std": 0.25753939151763916,
"advantages/var": 0.06632653818327583,
"completions/clipped_ratio": -2.96875,
"epoch": 2.67621776504298,
"grad_norm": 57.659842033208214,
"learning_rate": 5.064917819356531e-07,
"loss": -0.7347,
"num_tokens": 73275288.0,
"residual_var": 0.045599501579999924,
"reward": 0.7265625,
"reward_std": 0.13269482553005219,
"rewards/drgrpo_math_reward/mean": 0.7265625,
"rewards/drgrpo_math_reward/std": 0.446596622467041,
"rho2": 0.3124999403953552,
"step": 466
},
{
"advantages/mean": -2.3283064365386963e-10,
"advantages/snr": 8.750878259310795e-10,
"advantages/std": 0.26606544852256775,
"advantages/var": 0.07079082289751515,
"completions/clipped_ratio": -2.8125,
"epoch": 2.681948424068768,
"grad_norm": 44.43579130370154,
"learning_rate": 5.025933760867781e-07,
"loss": -0.656,
"num_tokens": 73421135.0,
"residual_var": 0.046456485986709595,
"reward": 0.74609375,
"reward_std": 0.1437433809041977,
"rewards/drgrpo_math_reward/mean": 0.74609375,
"rewards/drgrpo_math_reward/std": 0.4360972046852112,
"rho2": 0.3437499403953552,
"step": 467
},
{
"advantages/mean": -3.4924596548080444e-10,
"advantages/snr": 1.9961067217240515e-09,
"advantages/std": 0.1749635636806488,
"advantages/var": 0.03061224861583245,
"completions/clipped_ratio": -2.953125,
"epoch": 2.687679083094556,
"grad_norm": 43.168997925092384,
"learning_rate": 4.987049890100752e-07,
"loss": -0.3475,
"num_tokens": 73562177.0,
"residual_var": 0.025829093530774117,
"reward": 0.8359375,
"reward_std": 0.06378498673439026,
"rewards/drgrpo_math_reward/mean": 0.8359375,
"rewards/drgrpo_math_reward/std": 0.3710577189922333,
"rho2": 0.15624995529651642,
"step": 468
},
{
"advantages/mean": -6.984919309616089e-10,
"advantages/snr": 4.743456338717347e-09,
"advantages/std": 0.14725378155708313,
"advantages/var": 0.021683676182861156,
"completions/clipped_ratio": -2.921875,
"epoch": 2.693409742120344,
"grad_norm": 25.475930745385376,
"learning_rate": 4.948266990254988e-07,
"loss": -0.2273,
"num_tokens": 73691189.0,
"residual_var": 0.019650839269161224,
"reward": 0.875,
"reward_std": 0.041687894612550735,
"rewards/drgrpo_math_reward/mean": 0.875,
"rewards/drgrpo_math_reward/std": 0.33136674761772156,
"rho2": 0.09374997764825821,
"step": 469
},
{
"advantages/mean": 2.3283064365386963e-10,
"advantages/snr": 2.0118866328303662e-09,
"advantages/std": 0.1157275065779686,
"advantages/var": 0.013392855778753765,
"completions/clipped_ratio": -2.9375,
"epoch": 2.6991404011461317,
"grad_norm": 59.802744549134886,
"learning_rate": 4.909585842496287e-07,
"loss": -0.0798,
"num_tokens": 73844398.0,
"residual_var": 0.012137286365032196,
"reward": 0.90234375,
"reward_std": 0.03314562886953354,
"rewards/drgrpo_math_reward/mean": 0.90234375,
"rewards/drgrpo_math_reward/std": 0.29743078351020813,
"rho2": 0.09374997764825821,
"step": 470
},
{
"advantages/mean": -2.3283064365386963e-10,
"advantages/snr": 9.266069932627558e-10,
"advantages/std": 0.2512722611427307,
"advantages/var": 0.06313774921978066,
"completions/clipped_ratio": -2.875,
"epoch": 2.7048710601719197,
"grad_norm": 80.40343933087797,
"learning_rate": 4.871007225940939e-07,
"loss": -0.5229,
"num_tokens": 73990263.0,
"residual_var": 0.043407220393419266,
"reward": 0.84765625,
"reward_std": 0.12927743792533875,
"rewards/drgrpo_math_reward/mean": 0.84765625,
"rewards/drgrpo_math_reward/std": 0.3600577116012573,
"rho2": 0.3124999403953552,
"step": 471
},
{
"advantages/mean": -2.3283064365386963e-10,
"advantages/snr": 2.464047702455537e-09,
"advantages/std": 0.09449111670255661,
"advantages/var": 0.008928571135696173,
"completions/clipped_ratio": -3.0,
"epoch": 2.7106017191977076,
"grad_norm": 13.501865451703313,
"learning_rate": 4.832531917640057e-07,
"loss": 0.0409,
"num_tokens": 74126906.0,
"residual_var": 0.008370545692741871,
"reward": 0.84375,
"reward_std": 0.022097086533904076,
"rewards/drgrpo_math_reward/mean": 0.84375,
"rewards/drgrpo_math_reward/std": 0.3638034462928772,
"rho2": 0.062499985098838806,
"step": 472
},
{
"advantages/mean": 1.3969838619232178e-09,
"advantages/snr": 5.645842642869884e-09,
"advantages/std": 0.24743583798408508,
"advantages/var": 0.0612244939188864,
"completions/clipped_ratio": -2.75,
"epoch": 2.7163323782234956,
"grad_norm": 44.063649508463605,
"learning_rate": 4.794160692563917e-07,
"loss": -0.1131,
"num_tokens": 74277313.0,
"residual_var": 0.04783164709806442,
"reward": 0.71875,
"reward_std": 0.1145455539226532,
"rewards/drgrpo_math_reward/mean": 0.71875,
"rewards/drgrpo_math_reward/std": 0.45048993825912476,
"rho2": 0.21874995529651642,
"step": 473
},
{
"advantages/mean": 2.3283064365386963e-10,
"advantages/snr": 1.2431736171986422e-09,
"advantages/std": 0.18728730082511902,
"advantages/var": 0.03507653305035863,
"completions/clipped_ratio": -2.9375,
"epoch": 2.7220630372492836,
"grad_norm": 31.63012832023641,
"learning_rate": 4.755894323586341e-07,
"loss": -0.288,
"num_tokens": 74413591.0,
"residual_var": 0.028499694541096687,
"reward": 0.91015625,
"reward_std": 0.07483352720737457,
"rewards/drgrpo_math_reward/mean": 0.91015625,
"rewards/drgrpo_math_reward/std": 0.2865179479122162,
"rho2": 0.18749995529651642,
"step": 474
},
{
"advantages/mean": 1.1641532182693481e-10,
"advantages/snr": 5.854465882054629e-10,
"advantages/std": 0.19884873926639557,
"advantages/var": 0.03954082110783497,
"completions/clipped_ratio": -2.65625,
"epoch": 2.7277936962750715,
"grad_norm": 40.0832962857382,
"learning_rate": 4.7177335814691564e-07,
"loss": -0.1724,
"num_tokens": 74577378.0,
"residual_var": 0.03089127689599991,
"reward": 0.8046875,
"reward_std": 0.08588206768035889,
"rewards/drgrpo_math_reward/mean": 0.8046875,
"rewards/drgrpo_math_reward/std": 0.39721766114234924,
"rho2": 0.21874995529651642,
"step": 475
},
{
"advantages/mean": -2.3283064365386963e-10,
"advantages/snr": 1.2320239164203405e-09,
"advantages/std": 0.18898223340511322,
"advantages/var": 0.03571428454278469,
"completions/clipped_ratio": -2.921875,
"epoch": 2.7335243553008595,
"grad_norm": 91.24920309867713,
"learning_rate": 4.6796792348466353e-07,
"loss": -0.1096,
"num_tokens": 74713567.0,
"residual_var": 0.030133940279483795,
"reward": 0.875,
"reward_std": 0.07536394149065018,
"rewards/drgrpo_math_reward/mean": 0.875,
"rewards/drgrpo_math_reward/std": 0.33136674761772156,
"rho2": 0.1562499701976776,
"step": 476
},
{
"advantages/mean": 2.3283064365386963e-10,
"advantages/snr": 1.050673483071512e-09,
"advantages/std": 0.22160132229328156,
"advantages/var": 0.049107146042130845,
"completions/clipped_ratio": -2.890625,
"epoch": 2.7392550143266474,
"grad_norm": 40.786722179486674,
"learning_rate": 4.641732050210031e-07,
"loss": -0.4072,
"num_tokens": 74844079.0,
"residual_var": 0.03836497291922569,
"reward": 0.89453125,
"reward_std": 0.096134252846241,
"rewards/drgrpo_math_reward/mean": 0.89453125,
"rewards/drgrpo_math_reward/std": 0.3077581524848938,
"rho2": 0.21874994039535522,
"step": 477
},
{
"advantages/mean": 1.862645149230957e-09,
"advantages/snr": 8.09588026328236e-09,
"advantages/std": 0.23007319867610931,
"advantages/var": 0.05293367674905647,
"completions/clipped_ratio": -2.90625,
"epoch": 2.7449856733524354,
"grad_norm": 68.16071403640386,
"learning_rate": 4.6038927918921566e-07,
"loss": 0.1926,
"num_tokens": 74988949.0,
"residual_var": 0.04135444387793541,
"reward": 0.87890625,
"reward_std": 0.1060032844543457,
"rewards/drgrpo_math_reward/mean": 0.87890625,
"rewards/drgrpo_math_reward/std": 0.3268752694129944,
"rho2": 0.21874995529651642,
"step": 478
},
{
"advantages/mean": -6.984919309616089e-10,
"advantages/snr": 3.2825037950751864e-09,
"advantages/std": 0.21279241144657135,
"advantages/var": 0.04528061036924691,
"completions/clipped_ratio": -2.703125,
"epoch": 2.7507163323782233,
"grad_norm": 48.60041249893682,
"learning_rate": 4.5661622220519455e-07,
"loss": -0.1421,
"num_tokens": 75147052.0,
"residual_var": 0.03679051250219345,
"reward": 0.79296875,
"reward_std": 0.09153735637664795,
"rewards/drgrpo_math_reward/mean": 0.79296875,
"rewards/drgrpo_math_reward/std": 0.40597182512283325,
"rho2": 0.18749994039535522,
"step": 479
},
{
"advantages/mean": 6.984919309616089e-10,
"advantages/snr": 3.729520554863667e-09,
"advantages/std": 0.1872873157262802,
"advantages/var": 0.03507653863195537,
"completions/clipped_ratio": -2.796875,
"epoch": 2.7564469914040117,
"grad_norm": 30.727892281289908,
"learning_rate": 4.52854110065914e-07,
"loss": -0.3299,
"num_tokens": 75292654.0,
"residual_var": 0.031788118183612823,
"reward": 0.84765625,
"reward_std": 0.06180911511182785,
"rewards/drgrpo_math_reward/mean": 0.84765625,
"rewards/drgrpo_math_reward/std": 0.3600577116012573,
"rho2": 0.0937499925494194,
"step": 480
},
{
"advantages/mean": 6.984919309616089e-10,
"advantages/snr": 3.663506180869644e-09,
"advantages/std": 0.19066213071346283,
"advantages/var": 0.036352048088197586,
"completions/clipped_ratio": -3.0,
"epoch": 2.7621776504297992,
"grad_norm": 35.808697911905746,
"learning_rate": 4.4910301854789755e-07,
"loss": 0.0054,
"num_tokens": 75435968.0,
"residual_var": 0.031808048486709595,
"reward": 0.92578125,
"reward_std": 0.06944026798009872,
"rewards/drgrpo_math_reward/mean": 0.92578125,
"rewards/drgrpo_math_reward/std": 0.2626400291919708,
"rho2": 0.12499997019767761,
"step": 481
},
{
"advantages/mean": 4.656612873077393e-10,
"advantages/snr": 3.209860470786324e-09,
"advantages/std": 0.1450721174478531,
"advantages/var": 0.02104591926080368,
"completions/clipped_ratio": -2.890625,
"epoch": 2.7679083094555876,
"grad_norm": 41.99713741690543,
"learning_rate": 4.4536302320569006e-07,
"loss": 0.1462,
"num_tokens": 75594473.0,
"residual_var": 0.018415190279483795,
"reward": 0.82421875,
"reward_std": 0.04761157184839249,
"rewards/drgrpo_math_reward/mean": 0.82421875,
"rewards/drgrpo_math_reward/std": 0.3813795745372772,
"rho2": 0.1249999850988388,
"step": 482
},
{
"advantages/mean": 8.149072527885437e-10,
"advantages/snr": 3.6773571907502916e-09,
"advantages/std": 0.22160132229328156,
"advantages/var": 0.049107146042130845,
"completions/clipped_ratio": -2.90625,
"epoch": 2.773638968481375,
"grad_norm": 32.267390440275356,
"learning_rate": 4.416341993703373e-07,
"loss": -0.0421,
"num_tokens": 75741541.0,
"residual_var": 0.036830369383096695,
"reward": 0.78515625,
"reward_std": 0.10205549001693726,
"rewards/drgrpo_math_reward/mean": 0.78515625,
"rewards/drgrpo_math_reward/std": 0.4115184545516968,
"rho2": 0.24999994039535522,
"step": 483
},
{
"advantages/mean": -9.313225746154785e-10,
"advantages/snr": 5.7594526340187176e-09,
"advantages/std": 0.16170330345630646,
"advantages/var": 0.026147958348682332,
"completions/clipped_ratio": -2.90625,
"epoch": 2.7793696275071635,
"grad_norm": 26.778065667371298,
"learning_rate": 4.3791662214786963e-07,
"loss": -0.2001,
"num_tokens": 75879212.0,
"residual_var": 0.022879473865032196,
"reward": 0.83984375,
"reward_std": 0.05273643881082535,
"rewards/drgrpo_math_reward/mean": 0.83984375,
"rewards/drgrpo_math_reward/std": 0.36746934056282043,
"rho2": 0.12499997764825821,
"step": 484
},
{
"advantages/mean": 1.862645149230957e-09,
"advantages/snr": 7.488886640017008e-09,
"advantages/std": 0.24872122704982758,
"advantages/var": 0.06186224878517188,
"completions/clipped_ratio": -2.921875,
"epoch": 2.785100286532951,
"grad_norm": 46.19294691896084,
"learning_rate": 4.3421036641778553e-07,
"loss": -0.2724,
"num_tokens": 76021846.0,
"residual_var": 0.04446350410580635,
"reward": 0.78515625,
"reward_std": 0.12164628505706787,
"rewards/drgrpo_math_reward/mean": 0.78515625,
"rewards/drgrpo_math_reward/std": 0.4115184545516968,
"rho2": 0.2812499403953552,
"step": 485
},
{
"advantages/mean": 0.0,
"advantages/snr": 0.0,
"advantages/std": 0.13832083344459534,
"advantages/var": 0.019132652964807484,
"completions/clipped_ratio": -2.921875,
"epoch": 2.7908309455587395,
"grad_norm": 24.433410892525433,
"learning_rate": 4.3051550683154804e-07,
"loss": 0.0299,
"num_tokens": 76165491.0,
"residual_var": 0.017936870455741882,
"reward": 0.921875,
"reward_std": 0.0388009138405323,
"rewards/drgrpo_math_reward/mean": 0.921875,
"rewards/drgrpo_math_reward/std": 0.26889389753341675,
"rho2": 0.062499988824129105,
"step": 486
},
{
"advantages/mean": 6.984919309616089e-10,
"advantages/snr": 4.217937792220085e-09,
"advantages/std": 0.16560032963752747,
"advantages/var": 0.027423469176057758,
"completions/clipped_ratio": -2.9375,
"epoch": 2.796561604584527,
"grad_norm": 31.259156249194138,
"learning_rate": 4.2683211781107785e-07,
"loss": -0.0272,
"num_tokens": 76298358.0,
"residual_var": 0.023138564079999924,
"reward": 0.82421875,
"reward_std": 0.06036758795380592,
"rewards/drgrpo_math_reward/mean": 0.82421875,
"rewards/drgrpo_math_reward/std": 0.3813795745372772,
"rho2": 0.15624995529651642,
"step": 487
},
{
"advantages/mean": -1.862645149230957e-09,
"advantages/snr": 7.818224078022042e-09,
"advantages/std": 0.23824401199817657,
"advantages/var": 0.056760209252987304,
"completions/clipped_ratio": -2.921875,
"epoch": 2.8022922636103154,
"grad_norm": 69.13207204855937,
"learning_rate": 4.2316027354725515e-07,
"loss": -0.2976,
"num_tokens": 76438276.0,
"residual_var": 0.042570166289806366,
"reward": 0.85546875,
"reward_std": 0.11060018837451935,
"rewards/drgrpo_math_reward/mean": 0.85546875,
"rewards/drgrpo_math_reward/std": 0.35231640934944153,
"rho2": 0.24999995529651642,
"step": 488
},
{
"advantages/mean": 1.3969838619232178e-09,
"advantages/snr": 5.00823255374953e-09,
"advantages/std": 0.2789374887943268,
"advantages/var": 0.07780612265488518,
"completions/clipped_ratio": -2.84375,
"epoch": 2.8080229226361033,
"grad_norm": 45.43666043048814,
"learning_rate": 4.195000479984264e-07,
"loss": -0.6233,
"num_tokens": 76588502.0,
"residual_var": 0.058354608714580536,
"reward": 0.8515625,
"reward_std": 0.14400538802146912,
"rewards/drgrpo_math_reward/mean": 0.8515625,
"rewards/drgrpo_math_reward/std": 0.3562295734882355,
"rho2": 0.24999994039535522,
"step": 489
},
{
"advantages/mean": 4.656612873077393e-10,
"advantages/snr": 2.0745772796518186e-09,
"advantages/std": 0.22446079552173615,
"advantages/var": 0.050382648726250645,
"completions/clipped_ratio": -2.78125,
"epoch": 2.8137535816618913,
"grad_norm": 57.63830937677325,
"learning_rate": 4.15851514888913e-07,
"loss": -0.0804,
"num_tokens": 76732142.0,
"residual_var": 0.04093591868877411,
"reward": 0.86328125,
"reward_std": 0.09666222333908081,
"rewards/drgrpo_math_reward/mean": 0.86328125,
"rewards/drgrpo_math_reward/std": 0.34422317147254944,
"rho2": 0.18749995529651642,
"step": 490
},
{
"advantages/mean": -2.3283064365386963e-10,
"advantages/snr": 1.0575632043984715e-09,
"advantages/std": 0.220157653093338,
"advantages/var": 0.048469392215566565,
"completions/clipped_ratio": -2.96875,
"epoch": 2.819484240687679,
"grad_norm": 51.84633885141528,
"learning_rate": 4.1221474770752696e-07,
"loss": -0.2512,
"num_tokens": 76877475.0,
"residual_var": 0.03635205328464508,
"reward": 0.875,
"reward_std": 0.10205793380737305,
"rewards/drgrpo_math_reward/mean": 0.875,
"rewards/drgrpo_math_reward/std": 0.33136674761772156,
"rho2": 0.2499999701976776,
"step": 491
},
{
"advantages/mean": -8.149072527885437e-10,
"advantages/snr": 2.95806495593781e-09,
"advantages/std": 0.2754865884780884,
"advantages/var": 0.07589286043129562,
"completions/clipped_ratio": -2.625,
"epoch": 2.825214899713467,
"grad_norm": 57.20556558168534,
"learning_rate": 4.0858981970609107e-07,
"loss": -0.5989,
"num_tokens": 77042376.0,
"residual_var": 0.05217635631561279,
"reward": 0.74609375,
"reward_std": 0.14940109848976135,
"rewards/drgrpo_math_reward/mean": 0.74609375,
"rewards/drgrpo_math_reward/std": 0.4360972046852112,
"rho2": 0.3124999403953552,
"step": 492
},
{
"advantages/mean": -1.862645149230957e-09,
"advantages/snr": 6.233602299984905e-09,
"advantages/std": 0.29880717396736145,
"advantages/var": 0.08928572721436101,
"completions/clipped_ratio": -2.828125,
"epoch": 2.830945558739255,
"grad_norm": 59.43106284892127,
"learning_rate": 4.049768038979631e-07,
"loss": -0.5261,
"num_tokens": 77215285.0,
"residual_var": 0.061383944004774094,
"reward": 0.8515625,
"reward_std": 0.1630682349205017,
"rewards/drgrpo_math_reward/mean": 0.8515625,
"rewards/drgrpo_math_reward/std": 0.3562295734882355,
"rho2": 0.3124999403953552,
"step": 493
},
{
"advantages/mean": 2.3283064365386963e-10,
"advantages/snr": 1.030785103248179e-09,
"advantages/std": 0.22587698698043823,
"advantages/var": 0.05102041324736106,
"completions/clipped_ratio": -2.953125,
"epoch": 2.836676217765043,
"grad_norm": 45.3427622201107,
"learning_rate": 4.013757730565648e-07,
"loss": -0.6061,
"num_tokens": 77347136.0,
"residual_var": 0.04304848238825798,
"reward": 0.8515625,
"reward_std": 0.09719263762235641,
"rewards/drgrpo_math_reward/mean": 0.8515625,
"rewards/drgrpo_math_reward/std": 0.3562295734882355,
"rho2": 0.15624995529651642,
"step": 494
},
{
"advantages/mean": 2.3283064365386963e-10,
"advantages/snr": 8.347054256249217e-10,
"advantages/std": 0.2789374887943268,
"advantages/var": 0.07780612265488518,
"completions/clipped_ratio": -2.796875,
"epoch": 2.842406876790831,
"grad_norm": 84.63958616387762,
"learning_rate": 3.977867997139178e-07,
"loss": -0.4822,
"num_tokens": 77512853.0,
"residual_var": 0.05106028541922569,
"reward": 0.7578125,
"reward_std": 0.15702980756759644,
"rewards/drgrpo_math_reward/mean": 0.7578125,
"rewards/drgrpo_math_reward/std": 0.4292463958263397,
"rho2": 0.34374991059303284,
"step": 495
},
{
"advantages/mean": -1.1641532182693481e-09,
"advantages/snr": 5.3587966720360335e-09,
"advantages/std": 0.21724152565002441,
"advantages/var": 0.047193880466750215,
"completions/clipped_ratio": -2.875,
"epoch": 2.848137535816619,
"grad_norm": 51.67407403119023,
"learning_rate": 3.942099561591802e-07,
"loss": -0.3061,
"num_tokens": 77665722.0,
"residual_var": 0.03539542108774185,
"reward": 0.7890625,
"reward_std": 0.10034801065921783,
"rewards/drgrpo_math_reward/mean": 0.7890625,
"rewards/drgrpo_math_reward/std": 0.4087733030319214,
"rho2": 0.24999994039535522,
"step": 496
},
{
"advantages/mean": 4.656612873077393e-10,
"advantages/snr": 2.011886590227791e-09,
"advantages/std": 0.2314550280570984,
"advantages/var": 0.0535714300129122,
"completions/clipped_ratio": -2.984375,
"epoch": 2.853868194842407,
"grad_norm": 41.89118231868127,
"learning_rate": 3.9064531443719194e-07,
"loss": -0.1388,
"num_tokens": 77811032.0,
"residual_var": 0.043526798486709595,
"reward": 0.890625,
"reward_std": 0.1065337061882019,
"rewards/drgrpo_math_reward/mean": 0.890625,
"rewards/drgrpo_math_reward/std": 0.31272050738334656,
"rho2": 0.1874999701976776,
"step": 497
},
{
"advantages/mean": -9.313225746154785e-10,
"advantages/snr": 3.7836533251311736e-09,
"advantages/std": 0.24614372849464417,
"advantages/var": 0.0605867350772451,
"completions/clipped_ratio": -2.9375,
"epoch": 2.859598853868195,
"grad_norm": 38.561900143076926,
"learning_rate": 3.8709294634702373e-07,
"loss": -0.4367,
"num_tokens": 77951892.0,
"residual_var": 0.04922673851251602,
"reward": 0.89453125,
"reward_std": 0.11336605250835419,
"rewards/drgrpo_math_reward/mean": 0.89453125,
"rewards/drgrpo_math_reward/std": 0.3077581524848938,
"rho2": 0.18749995529651642,
"step": 498
},
{
"advantages/mean": 6.984919309616089e-10,
"advantages/snr": 3.035955098730885e-09,
"advantages/std": 0.23007319867610931,
"advantages/var": 0.05293367674905647,
"completions/clipped_ratio": -2.75,
"epoch": 2.865329512893983,
"grad_norm": 70.63712164551843,
"learning_rate": 3.835529234405303e-07,
"loss": -0.2097,
"num_tokens": 78096080.0,
"residual_var": 0.04135444387793541,
"reward": 0.83203125,
"reward_std": 0.09954920411109924,
"rewards/drgrpo_math_reward/mean": 0.83203125,
"rewards/drgrpo_math_reward/std": 0.3745708465576172,
"rho2": 0.21874995529651642,
"step": 499
},
{
"advantages/mean": -8.149072527885437e-10,
"advantages/snr": 5.167123471029963e-09,
"advantages/std": 0.1577100306749344,
"advantages/var": 0.024872453775488745,
"completions/clipped_ratio": -2.921875,
"epoch": 2.871060171919771,
"grad_norm": 24.234209829875365,
"learning_rate": 3.8002531702090933e-07,
"loss": -0.3787,
"num_tokens": 78230318.0,
"residual_var": 0.02254066802561283,
"reward": 0.94921875,
"reward_std": 0.04510528966784477,
"rewards/drgrpo_math_reward/mean": 0.94921875,
"rewards/drgrpo_math_reward/std": 0.21998079121112823,
"rho2": 0.0937499850988388,
"step": 500
},
{
"advantages/mean": 2.3283064365386963e-10,
"advantages/snr": 1.2211688223967514e-09,
"advantages/std": 0.19066211581230164,
"advantages/var": 0.03635204240602352,
"completions/clipped_ratio": -2.984375,
"epoch": 2.8767908309455588,
"grad_norm": 31.66254462537452,
"learning_rate": 3.765101981412665e-07,
"loss": -0.1826,
"num_tokens": 78372140.0,
"residual_var": 0.02840004302561283,
"reward": 0.81640625,
"reward_std": 0.08246467262506485,
"rewards/drgrpo_math_reward/mean": 0.81640625,
"rewards/drgrpo_math_reward/std": 0.387910932302475,
"rho2": 0.21874995529651642,
"step": 501
},
{
"advantages/mean": -3.4924596548080444e-10,
"advantages/snr": 1.2624487090453552e-09,
"advantages/std": 0.27664169669151306,
"advantages/var": 0.07653062834835911,
"completions/clipped_ratio": -2.875,
"epoch": 2.8825214899713467,
"grad_norm": 48.621615828233956,
"learning_rate": 3.730076376031821e-07,
"loss": -0.811,
"num_tokens": 78520915.0,
"residual_var": 0.05500639230012894,
"reward": 0.8125,
"reward_std": 0.1434774398803711,
"rewards/drgrpo_math_reward/mean": 0.8125,
"rewards/drgrpo_math_reward/std": 0.3910769522190094,
"rho2": 0.2812499403953552,
"step": 502
},
{
"advantages/mean": -1.862645149230957e-09,
"advantages/snr": 9.292506007152315e-09,
"advantages/std": 0.20044593513011932,
"advantages/var": 0.040178572910188004,
"completions/clipped_ratio": -2.8125,
"epoch": 2.8882521489971347,
"grad_norm": 50.217816467547244,
"learning_rate": 3.6951770595528606e-07,
"loss": 0.1648,
"num_tokens": 78670558.0,
"residual_var": 0.032645098865032196,
"reward": 0.89453125,
"reward_std": 0.07995839416980743,
"rewards/drgrpo_math_reward/mean": 0.89453125,
"rewards/drgrpo_math_reward/std": 0.3077581524848938,
"rho2": 0.18749995529651642,
"step": 503
},
{
"advantages/mean": 1.280568540096283e-09,
"advantages/snr": 4.535454255086579e-09,
"advantages/std": 0.282346248626709,
"advantages/var": 0.07971940411357537,
"completions/clipped_ratio": -2.890625,
"epoch": 2.8939828080229226,
"grad_norm": 45.149223842328865,
"learning_rate": 3.6604047349183866e-07,
"loss": 0.558,
"num_tokens": 78817902.0,
"residual_var": 0.062280792742967606,
"reward": 0.80859375,
"reward_std": 0.13979163765907288,
"rewards/drgrpo_math_reward/mean": 0.80859375,
"rewards/drgrpo_math_reward/std": 0.39417871832847595,
"rho2": 0.21874995529651642,
"step": 504
},
{
"advantages/mean": -1.1641532182693481e-09,
"advantages/snr": 5.000042345610262e-09,
"advantages/std": 0.23282866179943085,
"advantages/var": 0.05420918575531375,
"completions/clipped_ratio": -2.890625,
"epoch": 2.8997134670487106,
"grad_norm": 39.995422100809684,
"learning_rate": 3.625760102513102e-07,
"loss": -0.403,
"num_tokens": 78964706.0,
"residual_var": 0.03726882115006447,
"reward": 0.86328125,
"reward_std": 0.12073761969804764,
"rewards/drgrpo_math_reward/mean": 0.86328125,
"rewards/drgrpo_math_reward/std": 0.34422317147254944,
"rho2": 0.3124999701976776,
"step": 505
},
{
"advantages/mean": -3.4924596548080444e-10,
"advantages/snr": 1.5461776548722686e-09,
"advantages/std": 0.22587698698043823,
"advantages/var": 0.05102041324736106,
"completions/clipped_ratio": -2.96875,
"epoch": 2.9054441260744985,
"grad_norm": 31.787805292575744,
"learning_rate": 3.5912438601497584e-07,
"loss": -0.0309,
"num_tokens": 79114030.0,
"residual_var": 0.04304848238825798,
"reward": 0.8828125,
"reward_std": 0.09719263762235641,
"rewards/drgrpo_math_reward/mean": 0.8828125,
"rewards/drgrpo_math_reward/std": 0.3222736418247223,
"rho2": 0.15624995529651642,
"step": 506
},
{
"advantages/mean": -9.313225746154785e-10,
"advantages/snr": 4.407822555439857e-09,
"advantages/std": 0.21128857135772705,
"advantages/var": 0.044642860386389316,
"completions/clipped_ratio": -2.921875,
"epoch": 2.9111747851002865,
"grad_norm": 34.96829725190875,
"learning_rate": 3.5568567030550577e-07,
"loss": -0.0775,
"num_tokens": 79275874.0,
"residual_var": 0.034877244383096695,
"reward": 0.8125,
"reward_std": 0.09100693464279175,
"rewards/drgrpo_math_reward/mean": 0.8125,
"rewards/drgrpo_math_reward/std": 0.3910769522190094,
"rho2": 0.21874995529651642,
"step": 507
},
{
"advantages/mean": -1.1641532182693481e-09,
"advantages/snr": 5.090684667873576e-09,
"advantages/std": 0.22868302464485168,
"advantages/var": 0.05229592576071784,
"completions/clipped_ratio": -2.796875,
"epoch": 2.9169054441260744,
"grad_norm": 51.66520804751849,
"learning_rate": 3.522599323855664e-07,
"loss": -0.3839,
"num_tokens": 79426033.0,
"residual_var": 0.040856197476387024,
"reward": 0.859375,
"reward_std": 0.09955164790153503,
"rewards/drgrpo_math_reward/mean": 0.859375,
"rewards/drgrpo_math_reward/std": 0.3483152687549591,
"rho2": 0.2187499701976776,
"step": 508
},
{
"advantages/mean": 4.656612873077393e-10,
"advantages/snr": 2.115126408796943e-09,
"advantages/std": 0.220157653093338,
"advantages/var": 0.048469392215566565,
"completions/clipped_ratio": -2.921875,
"epoch": 2.9226361031518624,
"grad_norm": 48.69432010109758,
"learning_rate": 3.488472412564264e-07,
"loss": -0.2349,
"num_tokens": 79574622.0,
"residual_var": 0.03938138857483864,
"reward": 0.84375,
"reward_std": 0.09495475143194199,
"rewards/drgrpo_math_reward/mean": 0.84375,
"rewards/drgrpo_math_reward/std": 0.3638034462928772,
"rho2": 0.1874999701976776,
"step": 509
},
{
"advantages/mean": 4.656612873077393e-10,
"advantages/snr": 2.1291806000454375e-09,
"advantages/std": 0.2187044471502304,
"advantages/var": 0.047831635203287926,
"completions/clipped_ratio": -2.859375,
"epoch": 2.9283667621776504,
"grad_norm": 40.305077001653665,
"learning_rate": 3.45447665656565e-07,
"loss": -0.1031,
"num_tokens": 79727238.0,
"residual_var": 0.03736847639083862,
"reward": 0.83203125,
"reward_std": 0.09442433714866638,
"rewards/drgrpo_math_reward/mean": 0.83203125,
"rewards/drgrpo_math_reward/std": 0.3745708465576172,
"rho2": 0.21874995529651642,
"step": 510
},
{
"advantages/mean": -5.820766091346741e-10,
"advantages/snr": 2.5610062551388323e-09,
"advantages/std": 0.22728432714939117,
"advantages/var": 0.051658165367751474,
"completions/clipped_ratio": -2.921875,
"epoch": 2.9340974212034383,
"grad_norm": 51.197575059820984,
"learning_rate": 3.420612740602874e-07,
"loss": -0.2717,
"num_tokens": 79878658.0,
"residual_var": 0.03874363377690315,
"reward": 0.85546875,
"reward_std": 0.10547532886266708,
"rewards/drgrpo_math_reward/mean": 0.85546875,
"rewards/drgrpo_math_reward/std": 0.35231640934944153,
"rho2": 0.2499999701976776,
"step": 511
},
{
"advantages/mean": -1.3969838619232178e-09,
"advantages/snr": 5.158405507265383e-09,
"advantages/std": 0.27081698179244995,
"advantages/var": 0.07334183762717217,
"completions/clipped_ratio": -2.71875,
"epoch": 2.9398280802292263,
"grad_norm": 67.20237705888161,
"learning_rate": 3.3868813467634827e-07,
"loss": -0.4017,
"num_tokens": 80040195.0,
"residual_var": 0.05042252317070961,
"reward": 0.73046875,
"reward_std": 0.14598125219345093,
"rewards/drgrpo_math_reward/mean": 0.73046875,
"rewards/drgrpo_math_reward/std": 0.44458550214767456,
"rho2": 0.3124999403953552,
"step": 512
},
{
"advantages/mean": -4.656612873077393e-10,
"advantages/snr": 2.879726051639026e-09,
"advantages/std": 0.16170331835746765,
"advantages/var": 0.026147963167816535,
"completions/clipped_ratio": -2.890625,
"epoch": 2.945558739255014,
"grad_norm": 28.3295603420239,
"learning_rate": 3.3532831544657456e-07,
"loss": 0.2034,
"num_tokens": 80168350.0,
"residual_var": 0.022879473865032196,
"reward": 0.91796875,
"reward_std": 0.05273643881082535,
"rewards/drgrpo_math_reward/mean": 0.91796875,
"rewards/drgrpo_math_reward/std": 0.2749498784542084,
"rho2": 0.12499997764825821,
"step": 513
},
{
"advantages/mean": -9.313225746154785e-10,
"advantages/snr": 4.0725477342988605e-09,
"advantages/std": 0.22868302464485168,
"advantages/var": 0.05229592576071784,
"completions/clipped_ratio": -2.8125,
"epoch": 2.951289398280802,
"grad_norm": 37.95690908529424,
"learning_rate": 3.3198188404449865e-07,
"loss": -0.3579,
"num_tokens": 80315188.0,
"residual_var": 0.03922194987535477,
"reward": 0.84375,
"reward_std": 0.1054728701710701,
"rewards/drgrpo_math_reward/mean": 0.84375,
"rewards/drgrpo_math_reward/std": 0.3638034462928772,
"rho2": 0.24999994039535522,
"step": 514
},
{
"advantages/mean": -4.656612873077393e-10,
"advantages/snr": 2.3049056547270335e-09,
"advantages/std": 0.2020305097103119,
"advantages/var": 0.04081632685380843,
"completions/clipped_ratio": -2.78125,
"epoch": 2.95702005730659,
"grad_norm": 35.60499253279663,
"learning_rate": 3.28648907873996e-07,
"loss": -0.2166,
"num_tokens": 80472984.0,
"residual_var": 0.034438785165548325,
"reward": 0.8046875,
"reward_std": 0.08048880845308304,
"rewards/drgrpo_math_reward/mean": 0.8046875,
"rewards/drgrpo_math_reward/std": 0.39721766114234924,
"rho2": 0.1562499701976776,
"step": 515
},
{
"advantages/mean": -4.656612873077393e-10,
"advantages/snr": 2.061570206496358e-09,
"advantages/std": 0.22587698698043823,
"advantages/var": 0.05102041324736106,
"completions/clipped_ratio": -2.890625,
"epoch": 2.962750716332378,
"grad_norm": 54.49623163410567,
"learning_rate": 3.253294540679257e-07,
"loss": -0.3385,
"num_tokens": 80603270.0,
"residual_var": 0.03985970839858055,
"reward": 0.8828125,
"reward_std": 0.09784172475337982,
"rewards/drgrpo_math_reward/mean": 0.8828125,
"rewards/drgrpo_math_reward/std": 0.3222736418247223,
"rho2": 0.21874995529651642,
"step": 516
},
{
"advantages/mean": 1.3969838619232178e-09,
"advantages/snr": 5.7361820050862295e-09,
"advantages/std": 0.243538960814476,
"advantages/var": 0.05931122543459488,
"completions/clipped_ratio": -2.921875,
"epoch": 2.968481375358166,
"grad_norm": 40.92539135350785,
"learning_rate": 3.220235894867793e-07,
"loss": -0.3327,
"num_tokens": 80761049.0,
"residual_var": 0.044483426958322525,
"reward": 0.82421875,
"reward_std": 0.11875930428504944,
"rewards/drgrpo_math_reward/mean": 0.82421875,
"rewards/drgrpo_math_reward/std": 0.3813795745372772,
"rho2": 0.24999995529651642,
"step": 517
},
{
"advantages/mean": 4.656612873077393e-10,
"advantages/snr": 1.7909778497636661e-09,
"advantages/std": 0.260003924369812,
"advantages/var": 0.06760204068770292,
"completions/clipped_ratio": -2.90625,
"epoch": 2.974212034383954,
"grad_norm": 58.74183993632324,
"learning_rate": 3.1873138071733396e-07,
"loss": -0.4267,
"num_tokens": 80912014.0,
"residual_var": 0.05070154368877411,
"reward": 0.8046875,
"reward_std": 0.1337556689977646,
"rewards/drgrpo_math_reward/mean": 0.8046875,
"rewards/drgrpo_math_reward/std": 0.39721766114234924,
"rho2": 0.24999994039535522,
"step": 518
},
{
"advantages/mean": -5.820766091346741e-10,
"advantages/snr": 2.3773286733085444e-09,
"advantages/std": 0.24484480917453766,
"advantages/var": 0.05994898057971576,
"completions/clipped_ratio": -2.734375,
"epoch": 2.9799426934097424,
"grad_norm": 54.40628144346276,
"learning_rate": 3.154528940713113e-07,
"loss": -0.5217,
"num_tokens": 81056401.0,
"residual_var": 0.04308834299445152,
"reward": 0.703125,
"reward_std": 0.11993882060050964,
"rewards/drgrpo_math_reward/mean": 0.703125,
"rewards/drgrpo_math_reward/std": 0.45777595043182373,
"rho2": 0.2812499403953552,
"step": 519
},
{
"advantages/mean": -5.820766091346741e-10,
"advantages/snr": 2.1218375358741393e-09,
"advantages/std": 0.2743266522884369,
"advantages/var": 0.07525511215578096,
"completions/clipped_ratio": -2.765625,
"epoch": 2.98567335243553,
"grad_norm": 63.17104470170552,
"learning_rate": 3.121881955840421e-07,
"loss": -0.4794,
"num_tokens": 81215589.0,
"residual_var": 0.05408961698412895,
"reward": 0.796875,
"reward_std": 0.14822161197662354,
"rewards/drgrpo_math_reward/mean": 0.796875,
"rewards/drgrpo_math_reward/std": 0.40311288833618164,
"rho2": 0.2812499403953552,
"step": 520
},
{
"advantages/mean": -1.1641532182693481e-10,
"advantages/snr": 5.590217219463297e-10,
"advantages/std": 0.2082482874393463,
"advantages/var": 0.043367349221420604,
"completions/clipped_ratio": -2.796875,
"epoch": 2.9914040114613183,
"grad_norm": 37.01518983121177,
"learning_rate": 3.0893735101313535e-07,
"loss": -0.0663,
"num_tokens": 81356272.0,
"residual_var": 0.03523597866296768,
"reward": 0.8671875,
"reward_std": 0.08337578177452087,
"rewards/drgrpo_math_reward/mean": 0.8671875,
"rewards/drgrpo_math_reward/std": 0.3400367796421051,
"rho2": 0.1874999701976776,
"step": 521
},
{
"advantages/mean": -8.149072527885437e-10,
"advantages/snr": 4.7068702485910024e-09,
"advantages/std": 0.17313143610954285,
"advantages/var": 0.029974494169352717,
"completions/clipped_ratio": -2.96875,
"epoch": 2.997134670487106,
"grad_norm": 58.93078878918081,
"learning_rate": 3.0570042583715405e-07,
"loss": -0.1133,
"num_tokens": 81490470.0,
"residual_var": 0.027164394035935402,
"reward": 0.90234375,
"reward_std": 0.05668424814939499,
"rewards/drgrpo_math_reward/mean": 0.90234375,
"rewards/drgrpo_math_reward/std": 0.29743078351020813,
"rho2": 0.0937499850988388,
"step": 522
},
{
"advantages/mean": 1.1641532182693481e-10,
"advantages/snr": 4.970887406617712e-10,
"advantages/std": 0.23419423401355743,
"advantages/var": 0.0548469392451969,
"completions/clipped_ratio": -2.84375,
"epoch": 3.005730659025788,
"grad_norm": 58.40789905317615,
"learning_rate": 3.0247748525429785e-07,
"loss": -0.1845,
"num_tokens": 81625398.0,
"residual_var": 0.03942125290632248,
"reward": 0.8359375,
"reward_std": 0.11481395363807678,
"rewards/drgrpo_math_reward/mean": 0.8359375,
"rewards/drgrpo_math_reward/std": 0.3710577189922333,
"rho2": 0.2812499403953552,
"step": 523
},
{
"advantages/mean": 2.3283064365386963e-10,
"advantages/snr": 7.009548925799328e-10,
"advantages/std": 0.332162082195282,
"advantages/var": 0.11033164884830526,
"completions/clipped_ratio": -2.640625,
"epoch": 3.011461318051576,
"grad_norm": 70.75980746457977,
"learning_rate": 2.992685941810864e-07,
"loss": -0.3411,
"num_tokens": 81783785.0,
"residual_var": 0.06206155940890312,
"reward": 0.78515625,
"reward_std": 0.2106797993183136,
"rewards/drgrpo_math_reward/mean": 0.78515625,
"rewards/drgrpo_math_reward/std": 0.4115184545516968,
"rho2": 0.4374999403953552,
"step": 524
},
{
"advantages/mean": -4.656612873077393e-10,
"advantages/snr": 2.1435186688144133e-09,
"advantages/std": 0.21724152565002441,
"advantages/var": 0.047193880466750215,
"completions/clipped_ratio": -2.875,
"epoch": 3.017191977077364,
"grad_norm": 42.73496740391504,
"learning_rate": 2.9607381725105507e-07,
"loss": -0.1104,
"num_tokens": 81926608.0,
"residual_var": 0.03539542108774185,
"reward": 0.8515625,
"reward_std": 0.10034800320863724,
"rewards/drgrpo_math_reward/mean": 0.8515625,
"rewards/drgrpo_math_reward/std": 0.3562295734882355,
"rho2": 0.24999995529651642,
"step": 525
},
{
"advantages/mean": -6.984919309616089e-10,
"advantages/snr": 3.3541305716820545e-09,
"advantages/std": 0.20824827253818512,
"advantages/var": 0.04336734301513823,
"completions/clipped_ratio": -2.828125,
"epoch": 3.022922636103152,
"grad_norm": 33.651790231476845,
"learning_rate": 2.9289321881345254e-07,
"loss": -0.3786,
"num_tokens": 82065619.0,
"residual_var": 0.03523597866296768,
"reward": 0.8359375,
"reward_std": 0.08982987701892853,
"rewards/drgrpo_math_reward/mean": 0.8359375,
"rewards/drgrpo_math_reward/std": 0.3710577189922333,
"rho2": 0.1874999701976776,
"step": 526
},
{
"advantages/mean": 1.1641532182693481e-10,
"advantages/snr": 7.199315129097565e-10,
"advantages/std": 0.16170331835746765,
"advantages/var": 0.026147963167816535,
"completions/clipped_ratio": -3.0,
"epoch": 3.0286532951289398,
"grad_norm": 32.1041070083472,
"learning_rate": 2.8972686293194306e-07,
"loss": -0.3798,
"num_tokens": 82190899.0,
"residual_var": 0.022879473865032196,
"reward": 0.97265625,
"reward_std": 0.05273643881082535,
"rewards/drgrpo_math_reward/mean": 0.97265625,
"rewards/drgrpo_math_reward/std": 0.1634024828672409,
"rho2": 0.12499997764825821,
"step": 527
},
{
"advantages/mean": -4.656612873077393e-10,
"advantages/snr": 2.3049056547270335e-09,
"advantages/std": 0.2020305097103119,
"advantages/var": 0.04081632685380843,
"completions/clipped_ratio": -2.859375,
"epoch": 3.0343839541547277,
"grad_norm": 69.42900570188318,
"learning_rate": 2.8657481338331713e-07,
"loss": -0.0839,
"num_tokens": 82341140.0,
"residual_var": 0.03188776969909668,
"reward": 0.8984375,
"reward_std": 0.0875919908285141,
"rewards/drgrpo_math_reward/mean": 0.8984375,
"rewards/drgrpo_math_reward/std": 0.3026638329029083,
"rho2": 0.21874995529651642,
"step": 528
},
{
"advantages/mean": 4.656612873077393e-10,
"advantages/snr": 2.380496292175044e-09,
"advantages/std": 0.19561520218849182,
"advantages/var": 0.038265307327244535,
"completions/clipped_ratio": -2.984375,
"epoch": 3.0401146131805157,
"grad_norm": 29.96720122136539,
"learning_rate": 2.834371336562077e-07,
"loss": -0.0128,
"num_tokens": 82472668.0,
"residual_var": 0.031090570613741875,
"reward": 0.9140625,
"reward_std": 0.07825092226266861,
"rewards/drgrpo_math_reward/mean": 0.9140625,
"rewards/drgrpo_math_reward/std": 0.28082075715065,
"rho2": 0.1874999701976776,
"step": 529
},
{
"advantages/mean": 5.820766091346741e-10,
"advantages/snr": 2.3773286733085444e-09,
"advantages/std": 0.24484480917453766,
"advantages/var": 0.05994898057971576,
"completions/clipped_ratio": -2.890625,
"epoch": 3.0458452722063036,
"grad_norm": 44.458534391156505,
"learning_rate": 2.803138869498102e-07,
"loss": -0.0022,
"num_tokens": 82621094.0,
"residual_var": 0.05245537310838699,
"reward": 0.875,
"reward_std": 0.10626532137393951,
"rewards/drgrpo_math_reward/mean": 0.875,
"rewards/drgrpo_math_reward/std": 0.33136674761772156,
"rho2": 0.12499997764825821,
"step": 530
},
{
"advantages/mean": 0.0,
"advantages/snr": 0.0,
"advantages/std": 0.2142857164144516,
"advantages/var": 0.04591836825925477,
"completions/clipped_ratio": -2.9375,
"epoch": 3.0515759312320916,
"grad_norm": 50.51329825337595,
"learning_rate": 2.7720513617260855e-07,
"loss": -0.5093,
"num_tokens": 82765309.0,
"residual_var": 0.03874363377690315,
"reward": 0.890625,
"reward_std": 0.09206777065992355,
"rewards/drgrpo_math_reward/mean": 0.890625,
"rewards/drgrpo_math_reward/std": 0.31272050738334656,
"rho2": 0.1562499701976776,
"step": 531
},
{
"advantages/mean": 4.656612873077393e-10,
"advantages/snr": 2.1151265519572637e-09,
"advantages/std": 0.22015763819217682,
"advantages/var": 0.04846938565435743,
"completions/clipped_ratio": -2.90625,
"epoch": 3.0573065902578795,
"grad_norm": 35.10227440382755,
"learning_rate": 2.7411094394111167e-07,
"loss": 0.1374,
"num_tokens": 82918289.0,
"residual_var": 0.03938138484954834,
"reward": 0.84375,
"reward_std": 0.09495474398136139,
"rewards/drgrpo_math_reward/mean": 0.84375,
"rewards/drgrpo_math_reward/std": 0.3638034462928772,
"rho2": 0.1874999701976776,
"step": 532
},
{
"advantages/mean": 4.656612873077393e-10,
"advantages/snr": 2.9155006725188733e-09,
"advantages/std": 0.15971913933753967,
"advantages/var": 0.025510203470724413,
"completions/clipped_ratio": -2.8125,
"epoch": 3.0630372492836675,
"grad_norm": 35.13963081749907,
"learning_rate": 2.7103137257858863e-07,
"loss": -0.2506,
"num_tokens": 83050303.0,
"residual_var": 0.021524246782064438,
"reward": 0.8671875,
"reward_std": 0.0586601123213768,
"rewards/drgrpo_math_reward/mean": 0.8671875,
"rewards/drgrpo_math_reward/std": 0.3400367796421051,
"rho2": 0.1562499701976776,
"step": 533
},
{
"advantages/mean": 0.0,
"advantages/snr": 0.0,
"advantages/std": 0.2395787239074707,
"advantages/var": 0.057397964949132074,
"completions/clipped_ratio": -2.796875,
"epoch": 3.0687679083094554,
"grad_norm": 46.470983378748755,
"learning_rate": 2.6796648411381485e-07,
"loss": -0.1509,
"num_tokens": 83225273.0,
"residual_var": 0.04304848611354828,
"reward": 0.8515625,
"reward_std": 0.11705183237791061,
"rewards/drgrpo_math_reward/mean": 0.8515625,
"rewards/drgrpo_math_reward/std": 0.3562295734882355,
"rho2": 0.24999995529651642,
"step": 534
},
{
"advantages/mean": -1.7462298274040222e-09,
"advantages/snr": 5.242033628734544e-09,
"advantages/std": 0.3331206738948822,
"advantages/var": 0.11096938337618045,
"completions/clipped_ratio": -2.765625,
"epoch": 3.0744985673352434,
"grad_norm": 66.47161989496657,
"learning_rate": 2.6491634027982324e-07,
"loss": -0.9807,
"num_tokens": 83377762.0,
"residual_var": 0.06588809192180634,
"reward": 0.71875,
"reward_std": 0.2112102210521698,
"rewards/drgrpo_math_reward/mean": 0.71875,
"rewards/drgrpo_math_reward/std": 0.45048993825912476,
"rho2": 0.4062499403953552,
"step": 535
},
{
"advantages/mean": -1.1641532182693481e-10,
"advantages/snr": 3.896001826068581e-10,
"advantages/std": 0.29880714416503906,
"advantages/var": 0.08928570940406644,
"completions/clipped_ratio": -2.765625,
"epoch": 3.0802292263610314,
"grad_norm": 55.3104308735608,
"learning_rate": 2.6188100251265943e-07,
"loss": -0.5125,
"num_tokens": 83524281.0,
"residual_var": 0.058593761175870895,
"reward": 0.8203125,
"reward_std": 0.16898946464061737,
"rewards/drgrpo_math_reward/mean": 0.8203125,
"rewards/drgrpo_math_reward/std": 0.38467901945114136,
"rho2": 0.3437499403953552,
"step": 536
},
{
"advantages/mean": 2.3283064365386963e-10,
"advantages/snr": 9.084364294103083e-10,
"advantages/std": 0.2562982141971588,
"advantages/var": 0.0656887746006527,
"completions/clipped_ratio": -2.9375,
"epoch": 3.0859598853868193,
"grad_norm": 48.087935659741,
"learning_rate": 2.5886053195014534e-07,
"loss": -0.335,
"num_tokens": 83672810.0,
"residual_var": 0.049266595393419266,
"reward": 0.90234375,
"reward_std": 0.1255941092967987,
"rewards/drgrpo_math_reward/mean": 0.90234375,
"rewards/drgrpo_math_reward/std": 0.29743078351020813,
"rho2": 0.24999995529651642,
"step": 537
},
{
"advantages/mean": -3.4924596548080444e-10,
"advantages/snr": 1.7153301191023887e-09,
"advantages/std": 0.20360276103019714,
"advantages/var": 0.041454084299119565,
"completions/clipped_ratio": -2.890625,
"epoch": 3.0916905444126073,
"grad_norm": 36.909324026341544,
"learning_rate": 2.558549894306472e-07,
"loss": -0.2538,
"num_tokens": 83813642.0,
"residual_var": 0.03368145599961281,
"reward": 0.77734375,
"reward_std": 0.08166831731796265,
"rewards/drgrpo_math_reward/mean": 0.77734375,
"rewards/drgrpo_math_reward/std": 0.41684433817863464,
"rho2": 0.1874999701976776,
"step": 538
},
{
"advantages/mean": 2.0954757928848267e-09,
"advantages/snr": 7.840552332757146e-09,
"advantages/std": 0.26726123690605164,
"advantages/var": 0.07142856875255266,
"completions/clipped_ratio": -2.96875,
"epoch": 3.097421203438395,
"grad_norm": 45.36181805672189,
"learning_rate": 2.528644354918503e-07,
"loss": -0.2785,
"num_tokens": 83948051.0,
"residual_var": 0.04464287683367729,
"reward": 0.765625,
"reward_std": 0.15137697756290436,
"rewards/drgrpo_math_reward/mean": 0.765625,
"rewards/drgrpo_math_reward/std": 0.42443734407424927,
"rho2": 0.3749999403953552,
"step": 539
},
{
"advantages/mean": 1.0477378964424133e-09,
"advantages/snr": 4.1697309751285605e-09,
"advantages/std": 0.2512722909450531,
"advantages/var": 0.06313776419677541,
"completions/clipped_ratio": -2.765625,
"epoch": 3.103151862464183,
"grad_norm": 48.626496905887414,
"learning_rate": 2.498889303695404e-07,
"loss": -0.0594,
"num_tokens": 84100302.0,
"residual_var": 0.04538027569651604,
"reward": 0.79296875,
"reward_std": 0.12335620820522308,
"rewards/drgrpo_math_reward/mean": 0.79296875,
"rewards/drgrpo_math_reward/std": 0.40597182512283325,
"rho2": 0.2812499403953552,
"step": 540
},
{
"advantages/mean": 3.14321368932724e-09,
"advantages/snr": 1.0556975559982473e-08,
"advantages/std": 0.29773807525634766,
"advantages/var": 0.08864796145735454,
"completions/clipped_ratio": -2.890625,
"epoch": 3.1088825214899716,
"grad_norm": 56.91244977023089,
"learning_rate": 2.4692853399638913e-07,
"loss": -0.3349,
"num_tokens": 84266071.0,
"residual_var": 0.055404990911483765,
"reward": 0.80859375,
"reward_std": 0.17491313815116882,
"rewards/drgrpo_math_reward/mean": 0.80859375,
"rewards/drgrpo_math_reward/std": 0.39417871832847595,
"rho2": 0.3749999403953552,
"step": 541
},
{
"advantages/mean": 0.0,
"advantages/snr": 0.0,
"advantages/std": 0.19561520218849182,
"advantages/var": 0.038265307327244535,
"completions/clipped_ratio": -3.0,
"epoch": 3.1146131805157595,
"grad_norm": 42.631437764051206,
"learning_rate": 2.439833060007471e-07,
"loss": -0.2056,
"num_tokens": 84388596.0,
"residual_var": 0.031090570613741875,
"reward": 0.9296875,
"reward_std": 0.07825092226266861,
"rewards/drgrpo_math_reward/mean": 0.9296875,
"rewards/drgrpo_math_reward/std": 0.2561737895011902,
"rho2": 0.1874999701976776,
"step": 542
},
{
"advantages/mean": 2.3283064365386963e-10,
"advantages/snr": 1.8081166084975916e-09,
"advantages/std": 0.12876969575881958,
"advantages/var": 0.016581634545818957,
"completions/clipped_ratio": -2.921875,
"epoch": 3.1203438395415475,
"grad_norm": 27.399377039968435,
"learning_rate": 2.410533057054446e-07,
"loss": -0.1746,
"num_tokens": 84527155.0,
"residual_var": 0.015027116052806377,
"reward": 0.859375,
"reward_std": 0.036563027650117874,
"rewards/drgrpo_math_reward/mean": 0.859375,
"rewards/drgrpo_math_reward/std": 0.3483152687549591,
"rho2": 0.0937499850988388,
"step": 543
},
{
"advantages/mean": -3.4924596548080444e-10,
"advantages/snr": 1.6412517826060687e-09,
"advantages/std": 0.21279242634773254,
"advantages/var": 0.04528061671095518,
"completions/clipped_ratio": -2.796875,
"epoch": 3.1260744985673354,
"grad_norm": 41.00985971424138,
"learning_rate": 2.381385921265936e-07,
"loss": -0.3216,
"num_tokens": 84699776.0,
"residual_var": 0.03396047279238701,
"reward": 0.74609375,
"reward_std": 0.09864053130149841,
"rewards/drgrpo_math_reward/mean": 0.74609375,
"rewards/drgrpo_math_reward/std": 0.4360972046852112,
"rho2": 0.24999995529651642,
"step": 544
},
{
"advantages/mean": -6.984919309616089e-10,
"advantages/snr": 3.911554501079231e-09,
"advantages/std": 0.1785714328289032,
"advantages/var": 0.03188775662256749,
"completions/clipped_ratio": -2.828125,
"epoch": 3.1318051575931234,
"grad_norm": 30.980264195023306,
"learning_rate": 2.352392239724016e-07,
"loss": 0.1331,
"num_tokens": 84825150.0,
"residual_var": 0.025908809155225754,
"reward": 0.7890625,
"reward_std": 0.07141612470149994,
"rewards/drgrpo_math_reward/mean": 0.7890625,
"rewards/drgrpo_math_reward/std": 0.4087733030319214,
"rho2": 0.18749995529651642,
"step": 545
},
{
"advantages/mean": 9.313225746154785e-10,
"advantages/snr": 3.931260267227327e-09,
"advantages/std": 0.23690177500247955,
"advantages/var": 0.056122450999325446,
"completions/clipped_ratio": -2.84375,
"epoch": 3.1375358166189113,
"grad_norm": 46.5091780323008,
"learning_rate": 2.3235525964198888e-07,
"loss": -0.0935,
"num_tokens": 84974438.0,
"residual_var": 0.04209184646606445,
"reward": 0.796875,
"reward_std": 0.1153419017791748,
"rewards/drgrpo_math_reward/mean": 0.796875,
"rewards/drgrpo_math_reward/std": 0.40311288833618164,
"rho2": 0.24999994039535522,
"step": 546
},
{
"advantages/mean": -4.656612873077393e-10,
"advantages/snr": 1.9120605513708516e-09,
"advantages/std": 0.2435389757156372,
"advantages/var": 0.05931123269262173,
"completions/clipped_ratio": -2.921875,
"epoch": 3.1432664756446993,
"grad_norm": 44.11507983606423,
"learning_rate": 2.2948675722421085e-07,
"loss": -0.6253,
"num_tokens": 85131523.0,
"residual_var": 0.04077647998929024,
"reward": 0.86328125,
"reward_std": 0.1258624941110611,
"rewards/drgrpo_math_reward/mean": 0.86328125,
"rewards/drgrpo_math_reward/std": 0.34422317147254944,
"rho2": 0.3124999403953552,
"step": 547
},
{
"advantages/mean": 1.3969838619232178e-09,
"advantages/snr": 4.79665589833252e-09,
"advantages/std": 0.29124119877815247,
"advantages/var": 0.08482143586573532,
"completions/clipped_ratio": -2.640625,
"epoch": 3.1489971346704873,
"grad_norm": 53.1988713378116,
"learning_rate": 2.266337744964888e-07,
"loss": -0.678,
"num_tokens": 85310056.0,
"residual_var": 0.05831475183367729,
"reward": 0.64453125,
"reward_std": 0.15794092416763306,
"rewards/drgrpo_math_reward/mean": 0.64453125,
"rewards/drgrpo_math_reward/std": 0.4795927405357361,
"rho2": 0.3124999403953552,
"step": 548
},
{
"advantages/mean": 9.313225746154785e-10,
"advantages/snr": 4.376671726766915e-09,
"advantages/std": 0.21279241144657135,
"advantages/var": 0.04528061036924691,
"completions/clipped_ratio": -2.90625,
"epoch": 3.154727793696275,
"grad_norm": 43.52396169619168,
"learning_rate": 2.2379636892364717e-07,
"loss": -0.1809,
"num_tokens": 85450686.0,
"residual_var": 0.036790505051612854,
"reward": 0.83984375,
"reward_std": 0.0850832611322403,
"rewards/drgrpo_math_reward/mean": 0.83984375,
"rewards/drgrpo_math_reward/std": 0.36746934056282043,
"rho2": 0.1874999701976776,
"step": 549
},
{
"advantages/mean": 9.313225746154785e-10,
"advantages/snr": 5.215406001438975e-09,
"advantages/std": 0.1785714328289032,
"advantages/var": 0.03188775662256749,
"completions/clipped_ratio": -3.0,
"epoch": 3.160458452722063,
"grad_norm": 37.17154478236981,
"learning_rate": 2.2097459765675343e-07,
"loss": -0.1732,
"num_tokens": 85594559.0,
"residual_var": 0.025908811017870903,
"reward": 0.90625,
"reward_std": 0.07141612470149994,
"rewards/drgrpo_math_reward/mean": 0.90625,
"rewards/drgrpo_math_reward/std": 0.2920515835285187,
"rho2": 0.18749995529651642,
"step": 550
},
{
"advantages/mean": -1.3969838619232178e-09,
"advantages/snr": 6.108821999503798e-09,
"advantages/std": 0.2286830097436905,
"advantages/var": 0.05229591894543284,
"completions/clipped_ratio": -2.828125,
"epoch": 3.166189111747851,
"grad_norm": 41.61138123554162,
"learning_rate": 2.181685175319702e-07,
"loss": -0.3167,
"num_tokens": 85736708.0,
"residual_var": 0.04249044507741928,
"reward": 0.796875,
"reward_std": 0.09836968779563904,
"rewards/drgrpo_math_reward/mean": 0.796875,
"rewards/drgrpo_math_reward/std": 0.40311288833618164,
"rho2": 0.18749994039535522,
"step": 551
},
{
"advantages/mean": 9.313225746154785e-10,
"advantages/snr": 5.559641684095414e-09,
"advantages/std": 0.1675148457288742,
"advantages/var": 0.028061223539568525,
"completions/clipped_ratio": -2.984375,
"epoch": 3.171919770773639,
"grad_norm": 33.13704713458902,
"learning_rate": 2.153781850694082e-07,
"loss": -0.0867,
"num_tokens": 85880529.0,
"residual_var": 0.024553582072257996,
"reward": 0.9140625,
"reward_std": 0.05444391071796417,
"rewards/drgrpo_math_reward/mean": 0.9140625,
"rewards/drgrpo_math_reward/std": 0.28082075715065,
"rho2": 0.12499996274709702,
"step": 552
},
{
"advantages/mean": -1.7462298274040222e-09,
"advantages/snr": 8.781698823081943e-09,
"advantages/std": 0.19884873926639557,
"advantages/var": 0.03954082110783497,
"completions/clipped_ratio": -2.96875,
"epoch": 3.177650429799427,
"grad_norm": 50.867545017443405,
"learning_rate": 2.1260365647198797e-07,
"loss": -0.27,
"num_tokens": 86020634.0,
"residual_var": 0.03212692588567734,
"reward": 0.90625,
"reward_std": 0.07996084541082382,
"rewards/drgrpo_math_reward/mean": 0.90625,
"rewards/drgrpo_math_reward/std": 0.2920515835285187,
"rho2": 0.1874999701976776,
"step": 553
},
{
"advantages/mean": 4.656612873077393e-10,
"advantages/snr": 1.891826548037334e-09,
"advantages/std": 0.24614374339580536,
"advantages/var": 0.060586742412900074,
"completions/clipped_ratio": -2.921875,
"epoch": 3.183381088825215,
"grad_norm": 43.90013873622376,
"learning_rate": 2.0984498762430957e-07,
"loss": -0.3093,
"num_tokens": 86160983.0,
"residual_var": 0.04922673478722572,
"reward": 0.92578125,
"reward_std": 0.11336604505777359,
"rewards/drgrpo_math_reward/mean": 0.92578125,
"rewards/drgrpo_math_reward/std": 0.2626400291919708,
"rho2": 0.1874999701976776,
"step": 554
},
{
"advantages/mean": 1.862645149230957e-09,
"advantages/snr": 7.527790190493179e-09,
"advantages/std": 0.24743583798408508,
"advantages/var": 0.0612244939188864,
"completions/clipped_ratio": -2.828125,
"epoch": 3.189111747851003,
"grad_norm": 54.52746162819845,
"learning_rate": 2.0710223409152471e-07,
"loss": -0.3519,
"num_tokens": 86308593.0,
"residual_var": 0.047831643372774124,
"reward": 0.7734375,
"reward_std": 0.12099964171648026,
"rewards/drgrpo_math_reward/mean": 0.7734375,
"rewards/drgrpo_math_reward/std": 0.41942715644836426,
"rho2": 0.2187499701976776,
"step": 555
},
{
"advantages/mean": -2.3283064365386963e-10,
"advantages/snr": 7.527789707330408e-10,
"advantages/std": 0.30929481983184814,
"advantages/var": 0.0956632855748154,
"completions/clipped_ratio": -2.921875,
"epoch": 3.194842406876791,
"grad_norm": 65.55525445754603,
"learning_rate": 2.043754511182191e-07,
"loss": -0.1717,
"num_tokens": 86470234.0,
"residual_var": 0.06875799596309662,
"reward": 0.8046875,
"reward_std": 0.17517516016960144,
"rewards/drgrpo_math_reward/mean": 0.8046875,
"rewards/drgrpo_math_reward/std": 0.39721766114234924,
"rho2": 0.2812499403953552,
"step": 556
},
{
"advantages/mean": -2.3283064365386963e-10,
"advantages/snr": 1.7120408307325485e-09,
"advantages/std": 0.13599595427513123,
"advantages/var": 0.018494899579203583,
"completions/clipped_ratio": -2.984375,
"epoch": 3.200573065902579,
"grad_norm": 21.553191096455976,
"learning_rate": 2.0166469362729865e-07,
"loss": -0.1121,
"num_tokens": 86616135.0,
"residual_var": 0.016761012375354767,
"reward": 0.84765625,
"reward_std": 0.0382704958319664,
"rewards/drgrpo_math_reward/mean": 0.84765625,
"rewards/drgrpo_math_reward/std": 0.3600577116012573,
"rho2": 0.09374997019767761,
"step": 557
},
{
"advantages/mean": -1.3969838619232178e-09,
"advantages/snr": 5.965064887941254e-09,
"advantages/std": 0.23419423401355743,
"advantages/var": 0.0548469392451969,
"completions/clipped_ratio": -2.9375,
"epoch": 3.206303724928367,
"grad_norm": 53.09012235190345,
"learning_rate": 1.9897001621888432e-07,
"loss": -0.4491,
"num_tokens": 86763730.0,
"residual_var": 0.03942125290632248,
"reward": 0.8203125,
"reward_std": 0.11481394618749619,
"rewards/drgrpo_math_reward/mean": 0.8203125,
"rewards/drgrpo_math_reward/std": 0.38467901945114136,
"rho2": 0.2812499403953552,
"step": 558
},
{
"advantages/mean": 9.313225746154785e-10,
"advantages/snr": 5.215406001438975e-09,
"advantages/std": 0.1785714328289032,
"advantages/var": 0.03188775662256749,
"completions/clipped_ratio": -2.859375,
"epoch": 3.2120343839541547,
"grad_norm": 41.12764521799682,
"learning_rate": 1.9629147316921123e-07,
"loss": -0.7028,
"num_tokens": 86899036.0,
"residual_var": 0.025908807292580605,
"reward": 0.90625,
"reward_std": 0.07141612470149994,
"rewards/drgrpo_math_reward/mean": 0.90625,
"rewards/drgrpo_math_reward/std": 0.2920515835285187,
"rho2": 0.1874999701976776,
"step": 559
},
{
"advantages/mean": 2.3283064365386963e-10,
"advantages/snr": 1.0717593344072067e-09,
"advantages/std": 0.21724152565002441,
"advantages/var": 0.047193880466750215,
"completions/clipped_ratio": -2.90625,
"epoch": 3.2177650429799427,
"grad_norm": 41.15636930814853,
"learning_rate": 1.9362911842953678e-07,
"loss": -0.4087,
"num_tokens": 87036281.0,
"residual_var": 0.03834503889083862,
"reward": 0.875,
"reward_std": 0.09324482083320618,
"rewards/drgrpo_math_reward/mean": 0.875,
"rewards/drgrpo_math_reward/std": 0.33136674761772156,
"rho2": 0.18749994039535522,
"step": 560
},
{
"advantages/mean": 4.656612873077393e-10,
"advantages/snr": 2.4863472343972843e-09,
"advantages/std": 0.18728730082511902,
"advantages/var": 0.03507653305035863,
"completions/clipped_ratio": -2.84375,
"epoch": 3.2234957020057307,
"grad_norm": 34.18837003348517,
"learning_rate": 1.9098300562505264e-07,
"loss": -0.2386,
"num_tokens": 87184357.0,
"residual_var": 0.02849968895316124,
"reward": 0.78515625,
"reward_std": 0.07483352720737457,
"rewards/drgrpo_math_reward/mean": 0.78515625,
"rewards/drgrpo_math_reward/std": 0.4115184545516968,
"rho2": 0.1874999701976776,
"step": 561
},
{
"advantages/mean": -4.656612873077393e-10,
"advantages/snr": 1.9120605513708516e-09,
"advantages/std": 0.2435389757156372,
"advantages/var": 0.05931123269262173,
"completions/clipped_ratio": -2.890625,
"epoch": 3.2292263610315186,
"grad_norm": 47.01875974339951,
"learning_rate": 1.8835318805380508e-07,
"loss": -0.3076,
"num_tokens": 87319723.0,
"residual_var": 0.05004385486245155,
"reward": 0.90234375,
"reward_std": 0.10573489964008331,
"rewards/drgrpo_math_reward/mean": 0.90234375,
"rewards/drgrpo_math_reward/std": 0.29743078351020813,
"rho2": 0.1562499701976776,
"step": 562
},
{
"advantages/mean": 0.0,
"advantages/snr": 0.0,
"advantages/std": 0.22728432714939117,
"advantages/var": 0.051658165367751474,
"completions/clipped_ratio": -3.0,
"epoch": 3.2349570200573066,
"grad_norm": 35.16009147859087,
"learning_rate": 1.8573971868562156e-07,
"loss": 0.0347,
"num_tokens": 87450416.0,
"residual_var": 0.04197227209806442,
"reward": 0.89453125,
"reward_std": 0.09837214648723602,
"rewards/drgrpo_math_reward/mean": 0.89453125,
"rewards/drgrpo_math_reward/std": 0.3077581524848938,
"rho2": 0.1874999701976776,
"step": 563
},
{
"advantages/mean": -1.3969838619232178e-09,
"advantages/snr": 6.387541800136313e-09,
"advantages/std": 0.2187044471502304,
"advantages/var": 0.047831635203287926,
"completions/clipped_ratio": -2.96875,
"epoch": 3.2406876790830945,
"grad_norm": 70.44073367079997,
"learning_rate": 1.8314265016104414e-07,
"loss": -0.3285,
"num_tokens": 87582852.0,
"residual_var": 0.041852690279483795,
"reward": 0.94140625,
"reward_std": 0.08785402029752731,
"rewards/drgrpo_math_reward/mean": 0.94140625,
"rewards/drgrpo_math_reward/std": 0.23532284796237946,
"rho2": 0.1249999850988388,
"step": 564
},
{
"advantages/mean": 4.656612873077393e-10,
"advantages/snr": 2.464047638551681e-09,
"advantages/std": 0.18898224830627441,
"advantages/var": 0.03571429017489436,
"completions/clipped_ratio": -2.875,
"epoch": 3.2464183381088825,
"grad_norm": 36.19653827955635,
"learning_rate": 1.805620347902681e-07,
"loss": 0.2787,
"num_tokens": 87718694.0,
"residual_var": 0.030133940279483795,
"reward": 0.8515625,
"reward_std": 0.06890984624624252,
"rewards/drgrpo_math_reward/mean": 0.8515625,
"rewards/drgrpo_math_reward/std": 0.3562295734882355,
"rho2": 0.1562499701976776,
"step": 565
},
{
"advantages/mean": 1.862645149230957e-09,
"advantages/snr": 7.527790190493179e-09,
"advantages/std": 0.24743583798408508,
"advantages/var": 0.0612244939188864,
"completions/clipped_ratio": -2.875,
"epoch": 3.2521489971346704,
"grad_norm": 35.60382202111244,
"learning_rate": 1.7799792455209016e-07,
"loss": -0.4789,
"num_tokens": 87860362.0,
"residual_var": 0.05165817216038704,
"reward": 0.84375,
"reward_std": 0.10744237154722214,
"rewards/drgrpo_math_reward/mean": 0.84375,
"rewards/drgrpo_math_reward/std": 0.3638034462928772,
"rho2": 0.1562499701976776,
"step": 566
},
{
"advantages/mean": 2.3283064365386963e-10,
"advantages/snr": 1.030785103248179e-09,
"advantages/std": 0.22587698698043823,
"advantages/var": 0.05102041324736106,
"completions/clipped_ratio": -2.875,
"epoch": 3.2578796561604584,
"grad_norm": 41.956700507237855,
"learning_rate": 1.7545037109285942e-07,
"loss": -0.1415,
"num_tokens": 88014979.0,
"residual_var": 0.03826531767845154,
"reward": 0.84375,
"reward_std": 0.10376295447349548,
"rewards/drgrpo_math_reward/mean": 0.84375,
"rewards/drgrpo_math_reward/std": 0.3638034462928772,
"rho2": 0.24999994039535522,
"step": 567
},
{
"advantages/mean": 3.4924596548080444e-10,
"advantages/snr": 1.4577502760144356e-09,
"advantages/std": 0.2395787239074707,
"advantages/var": 0.057397964949132074,
"completions/clipped_ratio": -2.78125,
"epoch": 3.2636103151862463,
"grad_norm": 53.664794389081344,
"learning_rate": 1.7291942572543805e-07,
"loss": -1.3273,
"num_tokens": 88173410.0,
"residual_var": 0.04304848238825798,
"reward": 0.875,
"reward_std": 0.11705182492733002,
"rewards/drgrpo_math_reward/mean": 0.875,
"rewards/drgrpo_math_reward/std": 0.33136674761772156,
"rho2": 0.24999994039535522,
"step": 568
},
{
"advantages/mean": -2.3283064365386963e-10,
"advantages/snr": 1.1615632508940394e-09,
"advantages/std": 0.20044593513011932,
"advantages/var": 0.040178572910188004,
"completions/clipped_ratio": -2.859375,
"epoch": 3.2693409742120343,
"grad_norm": 47.15953812150556,
"learning_rate": 1.7040513942816904e-07,
"loss": -0.5937,
"num_tokens": 88309931.0,
"residual_var": 0.032645098865032196,
"reward": 0.91796875,
"reward_std": 0.07995839416980743,
"rewards/drgrpo_math_reward/mean": 0.91796875,
"rewards/drgrpo_math_reward/std": 0.2749498784542084,
"rho2": 0.18749995529651642,
"step": 569
},
{
"advantages/mean": 0.0,
"advantages/snr": 0.0,
"advantages/std": 0.22587698698043823,
"advantages/var": 0.05102041324736106,
"completions/clipped_ratio": -3.0,
"epoch": 3.2750716332378222,
"grad_norm": 41.7539818186476,
"learning_rate": 1.6790756284384611e-07,
"loss": 0.1196,
"num_tokens": 88455379.0,
"residual_var": 0.04304847866296768,
"reward": 0.9296875,
"reward_std": 0.09073854982852936,
"rewards/drgrpo_math_reward/mean": 0.9296875,
"rewards/drgrpo_math_reward/std": 0.2561737895011902,
"rho2": 0.15624995529651642,
"step": 570
},
{
"advantages/mean": 9.313225746154785e-10,
"advantages/snr": 3.616233357409939e-09,
"advantages/std": 0.25753939151763916,
"advantages/var": 0.06632653818327583,
"completions/clipped_ratio": -2.796875,
"epoch": 3.28080229226361,
"grad_norm": 54.99980080757648,
"learning_rate": 1.6542674627869734e-07,
"loss": -0.1985,
"num_tokens": 88607222.0,
"residual_var": 0.055963024497032166,
"reward": 0.890625,
"reward_std": 0.11902132630348206,
"rewards/drgrpo_math_reward/mean": 0.890625,
"rewards/drgrpo_math_reward/std": 0.31272050738334656,
"rho2": 0.15624995529651642,
"step": 571
},
{
"advantages/mean": -9.313225746154785e-10,
"advantages/snr": 4.760992584350088e-09,
"advantages/std": 0.19561520218849182,
"advantages/var": 0.038265307327244535,
"completions/clipped_ratio": -2.84375,
"epoch": 3.286532951289398,
"grad_norm": 74.1830205981033,
"learning_rate": 1.6296273970136976e-07,
"loss": -0.1486,
"num_tokens": 88759284.0,
"residual_var": 0.033482152968645096,
"reward": 0.84375,
"reward_std": 0.07114773988723755,
"rewards/drgrpo_math_reward/mean": 0.84375,
"rewards/drgrpo_math_reward/std": 0.3638034462928772,
"rho2": 0.12499997019767761,
"step": 572
},
{
"advantages/mean": 9.313225746154785e-10,
"advantages/snr": 4.31630082528225e-09,
"advantages/std": 0.21576867997646332,
"advantages/var": 0.04655612325878544,
"completions/clipped_ratio": -2.859375,
"epoch": 3.292263610315186,
"grad_norm": 36.86016378974931,
"learning_rate": 1.6051559274192273e-07,
"loss": -0.4809,
"num_tokens": 88909197.0,
"residual_var": 0.03637198358774185,
"reward": 0.79296875,
"reward_std": 0.09271440654993057,
"rewards/drgrpo_math_reward/mean": 0.79296875,
"rewards/drgrpo_math_reward/std": 0.40597182512283325,
"rho2": 0.21874992549419403,
"step": 573
},
{
"advantages/mean": 4.656612873077393e-10,
"advantages/snr": 2.442337644793503e-09,
"advantages/std": 0.19066211581230164,
"advantages/var": 0.03635204240602352,
"completions/clipped_ratio": -2.828125,
"epoch": 3.297994269340974,
"grad_norm": 36.068320712126784,
"learning_rate": 1.5808535469082995e-07,
"loss": -0.0395,
"num_tokens": 89058629.0,
"residual_var": 0.02840004302561283,
"reward": 0.67578125,
"reward_std": 0.08246467262506485,
"rewards/drgrpo_math_reward/mean": 0.67578125,
"rewards/drgrpo_math_reward/std": 0.46899911761283875,
"rho2": 0.21874994039535522,
"step": 574
},
{
"advantages/mean": 1.3969838619232178e-09,
"advantages/snr": 4.987832171158065e-09,
"advantages/std": 0.28007835149765015,
"advantages/var": 0.07844388297764127,
"completions/clipped_ratio": -2.796875,
"epoch": 3.303724928366762,
"grad_norm": 63.2889445553862,
"learning_rate": 1.5567207449798515e-07,
"loss": -0.6808,
"num_tokens": 89220273.0,
"residual_var": 0.05393018200993538,
"reward": 0.76171875,
"reward_std": 0.15756022930145264,
"rewards/drgrpo_math_reward/mean": 0.76171875,
"rewards/drgrpo_math_reward/std": 0.4268665909767151,
"rho2": 0.31249991059303284,
"step": 575
},
{
"advantages/mean": 5.820766091346741e-10,
"advantages/snr": 2.5932212552551478e-09,
"advantages/std": 0.22446082532405853,
"advantages/var": 0.050382662105157516,
"completions/clipped_ratio": -2.953125,
"epoch": 3.30945558739255,
"grad_norm": 48.82676725018094,
"learning_rate": 1.5327580077171588e-07,
"loss": -0.2314,
"num_tokens": 89355020.0,
"residual_var": 0.037787001579999924,
"reward": 0.90234375,
"reward_std": 0.10376540571451187,
"rewards/drgrpo_math_reward/mean": 0.90234375,
"rewards/drgrpo_math_reward/std": 0.29743078351020813,
"rho2": 0.2499999701976776,
"step": 576
},
{
"advantages/mean": -2.3283064365386963e-10,
"advantages/snr": 1.7120408307325485e-09,
"advantages/std": 0.13599595427513123,
"advantages/var": 0.018494899579203583,
"completions/clipped_ratio": -2.84375,
"epoch": 3.315186246418338,
"grad_norm": 44.58586769511727,
"learning_rate": 1.508965817778065e-07,
"loss": -0.6477,
"num_tokens": 89481662.0,
"residual_var": 0.016761010512709618,
"reward": 0.87890625,
"reward_std": 0.0382704995572567,
"rewards/drgrpo_math_reward/mean": 0.87890625,
"rewards/drgrpo_math_reward/std": 0.3268752694129944,
"rho2": 0.09374997764825821,
"step": 577
},
{
"advantages/mean": 1.3969838619232178e-09,
"advantages/snr": 5.675479644112002e-09,
"advantages/std": 0.24614374339580536,
"advantages/var": 0.060586742412900074,
"completions/clipped_ratio": -2.796875,
"epoch": 3.3209169054441263,
"grad_norm": 42.11671269533907,
"learning_rate": 1.4853446543852388e-07,
"loss": -0.1887,
"num_tokens": 89619119.0,
"residual_var": 0.04922673478722572,
"reward": 0.80078125,
"reward_std": 0.11336604505777359,
"rewards/drgrpo_math_reward/mean": 0.80078125,
"rewards/drgrpo_math_reward/std": 0.40019527077674866,
"rho2": 0.1874999701976776,
"step": 578
},
{
"advantages/mean": -6.984919309616089e-10,
"advantages/snr": 3.6008779446726684e-09,
"advantages/std": 0.1939782202243805,
"advantages/var": 0.03762754992141826,
"completions/clipped_ratio": -2.875,
"epoch": 3.326647564469914,
"grad_norm": 34.86806196473536,
"learning_rate": 1.461894993316527e-07,
"loss": -0.316,
"num_tokens": 89770285.0,
"residual_var": 0.029396535828709602,
"reward": 0.79296875,
"reward_std": 0.08417459577322006,
"rewards/drgrpo_math_reward/mean": 0.79296875,
"rewards/drgrpo_math_reward/std": 0.40597182512283325,
"rho2": 0.2187499701976776,
"step": 579
},
{
"advantages/mean": -9.313225746154785e-10,
"advantages/snr": 4.683572705643703e-09,
"advantages/std": 0.19884873926639557,
"advantages/var": 0.03954082110783497,
"completions/clipped_ratio": -2.9375,
"epoch": 3.3323782234957022,
"grad_norm": 40.50716015519298,
"learning_rate": 1.4386173068953844e-07,
"loss": 0.12,
"num_tokens": 89924251.0,
"residual_var": 0.03212692588567734,
"reward": 0.8515625,
"reward_std": 0.07996084541082382,
"rewards/drgrpo_math_reward/mean": 0.8515625,
"rewards/drgrpo_math_reward/std": 0.3562295734882355,
"rho2": 0.1874999701976776,
"step": 580
},
{
"advantages/mean": 4.656612873077393e-10,
"advantages/snr": 2.380496292175044e-09,
"advantages/std": 0.19561520218849182,
"advantages/var": 0.038265307327244535,
"completions/clipped_ratio": -2.875,
"epoch": 3.3381088825214897,
"grad_norm": 28.278917680568767,
"learning_rate": 1.415512063981339e-07,
"loss": 0.0335,
"num_tokens": 90071164.0,
"residual_var": 0.033482152968645096,
"reward": 0.859375,
"reward_std": 0.0776018276810646,
"rewards/drgrpo_math_reward/mean": 0.859375,
"rewards/drgrpo_math_reward/std": 0.3483152687549591,
"rho2": 0.12499997019767761,
"step": 581
},
{
"advantages/mean": 1.1641532182693481e-09,
"advantages/snr": 4.5869338591272445e-09,
"advantages/std": 0.2537976801395416,
"advantages/var": 0.06441326244421308,
"completions/clipped_ratio": -2.921875,
"epoch": 3.343839541547278,
"grad_norm": 60.39334512324598,
"learning_rate": 1.3925797299605645e-07,
"loss": -0.2582,
"num_tokens": 90224239.0,
"residual_var": 0.04830996319651604,
"reward": 0.85546875,
"reward_std": 0.13033825159072876,
"rewards/drgrpo_math_reward/mean": 0.85546875,
"rewards/drgrpo_math_reward/std": 0.35231640934944153,
"rho2": 0.24999994039535522,
"step": 582
},
{
"advantages/mean": 0.0,
"advantages/snr": 0.0,
"advantages/std": 0.2355518937110901,
"advantages/var": 0.05548469463088068,
"completions/clipped_ratio": -2.8125,
"epoch": 3.349570200573066,
"grad_norm": 45.31574691335599,
"learning_rate": 1.3698207667364982e-07,
"loss": -0.2152,
"num_tokens": 90374149.0,
"residual_var": 0.041613537818193436,
"reward": 0.69921875,
"reward_std": 0.11534436047077179,
"rewards/drgrpo_math_reward/mean": 0.69921875,
"rewards/drgrpo_math_reward/std": 0.45949608087539673,
"rho2": 0.24999995529651642,
"step": 583
},
{
"advantages/mean": 1.1641532182693481e-10,
"advantages/snr": 5.02971679938451e-10,
"advantages/std": 0.2314550131559372,
"advantages/var": 0.05357142311501506,
"completions/clipped_ratio": -2.96875,
"epoch": 3.355300859598854,
"grad_norm": 46.7865592883064,
"learning_rate": 1.3472356327205402e-07,
"loss": -0.0519,
"num_tokens": 90506453.0,
"residual_var": 0.038504477590322495,
"reward": 0.8828125,
"reward_std": 0.11310403048992157,
"rewards/drgrpo_math_reward/mean": 0.8828125,
"rewards/drgrpo_math_reward/std": 0.3222736418247223,
"rho2": 0.2812499403953552,
"step": 584
},
{
"advantages/mean": 0.0,
"advantages/snr": 0.0,
"advantages/std": 0.19884872436523438,
"advantages/var": 0.039540815181680955,
"completions/clipped_ratio": -2.84375,
"epoch": 3.361031518624642,
"grad_norm": 29.8521935954077,
"learning_rate": 1.3248247828228243e-07,
"loss": -0.004,
"num_tokens": 90664546.0,
"residual_var": 0.03089127317070961,
"reward": 0.875,
"reward_std": 0.08588206768035889,
"rewards/drgrpo_math_reward/mean": 0.875,
"rewards/drgrpo_math_reward/std": 0.33136674761772156,
"rho2": 0.21874994039535522,
"step": 585
},
{
"advantages/mean": 4.656612873077393e-10,
"advantages/snr": 2.3417865283086908e-09,
"advantages/std": 0.19884872436523438,
"advantages/var": 0.039540815181680955,
"completions/clipped_ratio": -2.921875,
"epoch": 3.36676217765043,
"grad_norm": 35.58514447675251,
"learning_rate": 1.3025886684430465e-07,
"loss": 0.032,
"num_tokens": 90800250.0,
"residual_var": 0.03089127317070961,
"reward": 0.8984375,
"reward_std": 0.08588206768035889,
"rewards/drgrpo_math_reward/mean": 0.8984375,
"rewards/drgrpo_math_reward/std": 0.3026638329029083,
"rho2": 0.21874995529651642,
"step": 586
},
{
"advantages/mean": 6.984919309616089e-10,
"advantages/snr": 3.6960714578275213e-09,
"advantages/std": 0.18898224830627441,
"advantages/var": 0.03571429017489436,
"completions/clipped_ratio": -2.890625,
"epoch": 3.372492836676218,
"grad_norm": 37.42055712162249,
"learning_rate": 1.2805277374613744e-07,
"loss": -0.1686,
"num_tokens": 90946273.0,
"residual_var": 0.030133938416838646,
"reward": 0.8984375,
"reward_std": 0.06890985369682312,
"rewards/drgrpo_math_reward/mean": 0.8984375,
"rewards/drgrpo_math_reward/std": 0.3026638329029083,
"rho2": 0.15624995529651642,
"step": 587
},
{
"advantages/mean": -2.3283064365386963e-10,
"advantages/snr": 1.2002926482242227e-09,
"advantages/std": 0.1939782202243805,
"advantages/var": 0.03762754992141826,
"completions/clipped_ratio": -2.96875,
"epoch": 3.378223495702006,
"grad_norm": 36.531684597985844,
"learning_rate": 1.258642434229441e-07,
"loss": 0.061,
"num_tokens": 91078924.0,
"residual_var": 0.03174825385212898,
"reward": 0.90234375,
"reward_std": 0.077071413397789,
"rewards/drgrpo_math_reward/mean": 0.90234375,
"rewards/drgrpo_math_reward/std": 0.29743078351020813,
"rho2": 0.1562499701976776,
"step": 588
},
{
"advantages/mean": -2.3283064365386963e-10,
"advantages/snr": 1.190248146087522e-09,
"advantages/std": 0.19561520218849182,
"advantages/var": 0.038265307327244535,
"completions/clipped_ratio": -3.0,
"epoch": 3.383954154727794,
"grad_norm": 40.661807421663504,
"learning_rate": 1.2369331995613663e-07,
"loss": -0.2936,
"num_tokens": 91224368.0,
"residual_var": 0.031090570613741875,
"reward": 0.9296875,
"reward_std": 0.07825092226266861,
"rewards/drgrpo_math_reward/mean": 0.9296875,
"rewards/drgrpo_math_reward/std": 0.2561737895011902,
"rho2": 0.1874999701976776,
"step": 589
},
{
"advantages/mean": -8.149072527885437e-10,
"advantages/snr": 3.942246463486898e-09,
"advantages/std": 0.20671138167381287,
"advantages/var": 0.04272959531349674,
"completions/clipped_ratio": -2.953125,
"epoch": 3.3896848137535818,
"grad_norm": 32.00222756493884,
"learning_rate": 1.215400470724901e-07,
"loss": -0.2501,
"num_tokens": 91371870.0,
"residual_var": 0.03338250517845154,
"reward": 0.90234375,
"reward_std": 0.08929945528507233,
"rewards/drgrpo_math_reward/mean": 0.90234375,
"rewards/drgrpo_math_reward/std": 0.29743078351020813,
"rho2": 0.21874995529651642,
"step": 590
},
{
"advantages/mean": -2.3283064365386963e-10,
"advantages/snr": 1.3743802382730425e-09,
"advantages/std": 0.16940772533416748,
"advantages/var": 0.02869897740289673,
"completions/clipped_ratio": -2.96875,
"epoch": 3.3954154727793697,
"grad_norm": 30.824827526887713,
"learning_rate": 1.19404468143262e-07,
"loss": -0.0275,
"num_tokens": 91514404.0,
"residual_var": 0.02421477437019348,
"reward": 0.97265625,
"reward_std": 0.06207750737667084,
"rewards/drgrpo_math_reward/mean": 0.97265625,
"rewards/drgrpo_math_reward/std": 0.1634024828672409,
"rho2": 0.1562499701976776,
"step": 591
},
{
"advantages/mean": 1.5133991837501526e-09,
"advantages/snr": 5.688070868552017e-09,
"advantages/std": 0.26606544852256775,
"advantages/var": 0.07079082289751515,
"completions/clipped_ratio": -2.84375,
"epoch": 3.4011461318051577,
"grad_norm": 48.81711650537176,
"learning_rate": 1.1728662618331698e-07,
"loss": -0.4048,
"num_tokens": 91680934.0,
"residual_var": 0.04645648971199989,
"reward": 0.80078125,
"reward_std": 0.1437433809041977,
"rewards/drgrpo_math_reward/mean": 0.80078125,
"rewards/drgrpo_math_reward/std": 0.40019527077674866,
"rho2": 0.34374991059303284,
"step": 592
},
{
"advantages/mean": 4.656612873077393e-10,
"advantages/snr": 2.380496292175044e-09,
"advantages/std": 0.19561520218849182,
"advantages/var": 0.038265307327244535,
"completions/clipped_ratio": -2.6875,
"epoch": 3.4068767908309456,
"grad_norm": 49.10110326954101,
"learning_rate": 1.1518656385026148e-07,
"loss": -0.53,
"num_tokens": 91830724.0,
"residual_var": 0.031090570613741875,
"reward": 0.8515625,
"reward_std": 0.07825092226266861,
"rewards/drgrpo_math_reward/mean": 0.8515625,
"rewards/drgrpo_math_reward/std": 0.3562295734882355,
"rho2": 0.1874999701976776,
"step": 593
},
{
"advantages/mean": -2.3283064365386963e-10,
"advantages/snr": 1.439863025819513e-09,
"advantages/std": 0.16170331835746765,
"advantages/var": 0.026147963167816535,
"completions/clipped_ratio": -2.96875,
"epoch": 3.4126074498567336,
"grad_norm": 27.394151022413606,
"learning_rate": 1.1310432344358489e-07,
"loss": -0.1922,
"num_tokens": 91979322.0,
"residual_var": 0.022879473865032196,
"reward": 0.88671875,
"reward_std": 0.05273643881082535,
"rewards/drgrpo_math_reward/mean": 0.88671875,
"rewards/drgrpo_math_reward/std": 0.31755712628364563,
"rho2": 0.12499997019767761,
"step": 594
},
{
"advantages/mean": 2.3283064365386963e-10,
"advantages/snr": 9.884473017863691e-10,
"advantages/std": 0.2355518937110901,
"advantages/var": 0.05548469463088068,
"completions/clipped_ratio": -2.796875,
"epoch": 3.4183381088825215,
"grad_norm": 42.73983938785139,
"learning_rate": 1.1103994690380681e-07,
"loss": -0.4317,
"num_tokens": 92123199.0,
"residual_var": 0.03987964242696762,
"reward": 0.84765625,
"reward_std": 0.1148114949464798,
"rewards/drgrpo_math_reward/mean": 0.84765625,
"rewards/drgrpo_math_reward/std": 0.3600577116012573,
"rho2": 0.28124991059303284,
"step": 595
},
{
"advantages/mean": 0.0,
"advantages/snr": 0.0,
"advantages/std": 0.13363061845302582,
"advantages/var": 0.017857142188138164,
"completions/clipped_ratio": -2.875,
"epoch": 3.4240687679083095,
"grad_norm": 23.86254859993742,
"learning_rate": 1.089934758116322e-07,
"loss": -0.097,
"num_tokens": 92270530.0,
"residual_var": 0.015625009313225746,
"reward": 0.8359375,
"reward_std": 0.04419417306780815,
"rewards/drgrpo_math_reward/mean": 0.8359375,
"rewards/drgrpo_math_reward/std": 0.3710577189922333,
"rho2": 0.12499997764825821,
"step": 596
},
{
"advantages/mean": -9.313225746154785e-10,
"advantages/snr": 5.437436706144895e-09,
"advantages/std": 0.1712796986103058,
"advantages/var": 0.029336735156037186,
"completions/clipped_ratio": -2.953125,
"epoch": 3.4297994269340975,
"grad_norm": 31.83756189804068,
"learning_rate": 1.069649513871147e-07,
"loss": 0.015,
"num_tokens": 92408037.0,
"residual_var": 0.025669652968645096,
"reward": 0.8671875,
"reward_std": 0.056153833866119385,
"rewards/drgrpo_math_reward/mean": 0.8671875,
"rewards/drgrpo_math_reward/std": 0.3400367796421051,
"rho2": 0.12499997764825821,
"step": 597
},
{
"advantages/mean": -3.4924596548080444e-10,
"advantages/snr": 2.407395353089743e-09,
"advantages/std": 0.1450721174478531,
"advantages/var": 0.02104591926080368,
"completions/clipped_ratio": -2.984375,
"epoch": 3.4355300859598854,
"grad_norm": 60.871403127702614,
"learning_rate": 1.049544144888257e-07,
"loss": -0.1135,
"num_tokens": 92547556.0,
"residual_var": 0.018415190279483795,
"reward": 0.86328125,
"reward_std": 0.04761157184839249,
"rewards/drgrpo_math_reward/mean": 0.86328125,
"rewards/drgrpo_math_reward/std": 0.34422317147254944,
"rho2": 0.12499997764825821,
"step": 598
},
{
"advantages/mean": 0.0,
"advantages/snr": 0.0,
"advantages/std": 0.23282866179943085,
"advantages/var": 0.05420918575531375,
"completions/clipped_ratio": -2.9375,
"epoch": 3.4412607449856734,
"grad_norm": 41.65630491047125,
"learning_rate": 1.0296190561303131e-07,
"loss": -0.0226,
"num_tokens": 92689649.0,
"residual_var": 0.04065689817070961,
"reward": 0.88671875,
"reward_std": 0.11363443732261658,
"rewards/drgrpo_math_reward/mean": 0.88671875,
"rewards/drgrpo_math_reward/std": 0.31755712628364563,
"rho2": 0.24999994039535522,
"step": 599
},
{
"advantages/mean": -6.984919309616089e-10,
"advantages/snr": 2.464047682013392e-09,
"advantages/std": 0.2834733724594116,
"advantages/var": 0.08035715289351231,
"completions/clipped_ratio": -2.90625,
"epoch": 3.4469914040114613,
"grad_norm": 53.78196015691887,
"learning_rate": 1.0098746489287758e-07,
"loss": 0.2443,
"num_tokens": 92845107.0,
"residual_var": 0.057756707072257996,
"reward": 0.8046875,
"reward_std": 0.1404382884502411,
"rewards/drgrpo_math_reward/mean": 0.8046875,
"rewards/drgrpo_math_reward/std": 0.39721766114234924,
"rho2": 0.2812499403953552,
"step": 600
},
{
"advantages/mean": 1.0477378964424133e-09,
"advantages/snr": 4.373250828043307e-09,
"advantages/std": 0.2395787239074707,
"advantages/var": 0.057397964949132074,
"completions/clipped_ratio": -3.0,
"epoch": 3.4527220630372493,
"grad_norm": 39.90121714734247,
"learning_rate": 9.903113209758096e-08,
"loss": -0.3874,
"num_tokens": 92989220.0,
"residual_var": 0.04304848238825798,
"reward": 0.90625,
"reward_std": 0.11705183237791061,
"rewards/drgrpo_math_reward/mean": 0.90625,
"rewards/drgrpo_math_reward/std": 0.2920515835285187,
"rho2": 0.24999995529651642,
"step": 601
},
{
"advantages/mean": -1.1641532182693481e-10,
"advantages/snr": 4.754657057250496e-10,
"advantages/std": 0.24484482407569885,
"advantages/var": 0.05994898787665992,
"completions/clipped_ratio": -2.921875,
"epoch": 3.458452722063037,
"grad_norm": 55.126160561348684,
"learning_rate": 9.70929466316277e-08,
"loss": -0.3398,
"num_tokens": 93155169.0,
"residual_var": 0.04496174678206444,
"reward": 0.8125,
"reward_std": 0.11401759088039398,
"rewards/drgrpo_math_reward/mean": 0.8125,
"rewards/drgrpo_math_reward/std": 0.3910769522190094,
"rho2": 0.2499999701976776,
"step": 602
},
{
"advantages/mean": -1.1641532182693481e-09,
"advantages/snr": 5.153925516240895e-09,
"advantages/std": 0.22587698698043823,
"advantages/var": 0.05102041324736106,
"completions/clipped_ratio": -2.921875,
"epoch": 3.464183381088825,
"grad_norm": 47.87950722821978,
"learning_rate": 9.517294753398064e-08,
"loss": -0.1339,
"num_tokens": 93315903.0,
"residual_var": 0.03985970839858055,
"reward": 0.734375,
"reward_std": 0.10429581999778748,
"rewards/drgrpo_math_reward/mean": 0.734375,
"rewards/drgrpo_math_reward/std": 0.4425306022167206,
"rho2": 0.2187499701976776,
"step": 603
},
{
"advantages/mean": -8.149072527885437e-10,
"advantages/snr": 3.8029002977661035e-09,
"advantages/std": 0.2142857164144516,
"advantages/var": 0.04591836825925477,
"completions/clipped_ratio": -2.90625,
"epoch": 3.469914040114613,
"grad_norm": 49.87684872523336,
"learning_rate": 9.327117347729197e-08,
"loss": -0.1311,
"num_tokens": 93471268.0,
"residual_var": 0.035873737186193466,
"reward": 0.8984375,
"reward_std": 0.09271685779094696,
"rewards/drgrpo_math_reward/mean": 0.8984375,
"rewards/drgrpo_math_reward/std": 0.3026638329029083,
"rho2": 0.2187499701976776,
"step": 604
},
{
"advantages/mean": 1.6298145055770874e-09,
"advantages/snr": 7.2154961987453165e-09,
"advantages/std": 0.22587697207927704,
"advantages/var": 0.0510204065157025,
"completions/clipped_ratio": -2.9375,
"epoch": 3.475644699140401,
"grad_norm": 38.908678706119375,
"learning_rate": 9.13876627671255e-08,
"loss": 0.0163,
"num_tokens": 93615423.0,
"residual_var": 0.03826531767845154,
"reward": 0.828125,
"reward_std": 0.10376295447349548,
"rewards/drgrpo_math_reward/mean": 0.828125,
"rewards/drgrpo_math_reward/std": 0.3780108094215393,
"rho2": 0.24999994039535522,
"step": 605
},
{
"advantages/mean": -3.4924596548080444e-10,
"advantages/snr": 1.2785315971737493e-09,
"advantages/std": 0.27316176891326904,
"advantages/var": 0.0746173519958262,
"completions/clipped_ratio": -2.890625,
"epoch": 3.481375358166189,
"grad_norm": 62.798294427898014,
"learning_rate": 8.952245334118413e-08,
"loss": -0.8204,
"num_tokens": 93761778.0,
"residual_var": 0.04663585498929024,
"reward": 0.80078125,
"reward_std": 0.1547943651676178,
"rewards/drgrpo_math_reward/mean": 0.80078125,
"rewards/drgrpo_math_reward/std": 0.40019527077674866,
"rho2": 0.3749999403953552,
"step": 606
},
{
"advantages/mean": 2.3283064365386963e-10,
"advantages/snr": 1.0941679316917287e-09,
"advantages/std": 0.21279241144657135,
"advantages/var": 0.04528061036924691,
"completions/clipped_ratio": -2.953125,
"epoch": 3.487106017191977,
"grad_norm": 88.73741860886084,
"learning_rate": 8.767558276854547e-08,
"loss": -0.4571,
"num_tokens": 93888659.0,
"residual_var": 0.036790505051612854,
"reward": 0.87890625,
"reward_std": 0.0850832611322403,
"rewards/drgrpo_math_reward/mean": 0.87890625,
"rewards/drgrpo_math_reward/std": 0.3268752694129944,
"rho2": 0.18749995529651642,
"step": 607
},
{
"advantages/mean": -1.1641532182693481e-10,
"advantages/snr": 9.040584088659493e-10,
"advantages/std": 0.1287696808576584,
"advantages/var": 0.016581630708183193,
"completions/clipped_ratio": -2.984375,
"epoch": 3.492836676217765,
"grad_norm": 23.220600936321258,
"learning_rate": 8.584708824890696e-08,
"loss": -0.0098,
"num_tokens": 94015866.0,
"residual_var": 0.015027116052806377,
"reward": 0.890625,
"reward_std": 0.036563027650117874,
"rewards/drgrpo_math_reward/mean": 0.890625,
"rewards/drgrpo_math_reward/std": 0.31272050738334656,
"rho2": 0.0937499850988388,
"step": 608
},
{
"advantages/mean": -4.656612873077393e-10,
"advantages/snr": 2.101346966143024e-09,
"advantages/std": 0.22160132229328156,
"advantages/var": 0.049107146042130845,
"completions/clipped_ratio": -2.953125,
"epoch": 3.498567335243553,
"grad_norm": 29.651912245039497,
"learning_rate": 8.403700661183355e-08,
"loss": 0.1099,
"num_tokens": 94150833.0,
"residual_var": 0.041434165090322495,
"reward": 0.83984375,
"reward_std": 0.09548516571521759,
"rewards/drgrpo_math_reward/mean": 0.83984375,
"rewards/drgrpo_math_reward/std": 0.36746934056282043,
"rho2": 0.1562499701976776,
"step": 609
},
{
"advantages/mean": 2.3283064365386963e-10,
"advantages/snr": 1.0181369335747151e-09,
"advantages/std": 0.22868302464485168,
"advantages/var": 0.05229592576071784,
"completions/clipped_ratio": -2.875,
"epoch": 3.504297994269341,
"grad_norm": 44.8975747932072,
"learning_rate": 8.224537431601886e-08,
"loss": -0.3263,
"num_tokens": 94306132.0,
"residual_var": 0.03922194987535477,
"reward": 0.7421875,
"reward_std": 0.1054728701710701,
"rewards/drgrpo_math_reward/mean": 0.7421875,
"rewards/drgrpo_math_reward/std": 0.4382871091365814,
"rho2": 0.24999994039535522,
"step": 610
},
{
"advantages/mean": 1.7462298274040222e-09,
"advantages/snr": 7.371113001051238e-09,
"advantages/std": 0.23690177500247955,
"advantages/var": 0.056122450999325446,
"completions/clipped_ratio": -2.90625,
"epoch": 3.510028653295129,
"grad_norm": 46.59898003698075,
"learning_rate": 8.047222744854942e-08,
"loss": -0.6579,
"num_tokens": 94445829.0,
"residual_var": 0.04735332727432251,
"reward": 0.8203125,
"reward_std": 0.10231750458478928,
"rewards/drgrpo_math_reward/mean": 0.8203125,
"rewards/drgrpo_math_reward/std": 0.38467901945114136,
"rho2": 0.15624995529651642,
"step": 611
},
{
"advantages/mean": 9.313225746154785e-10,
"advantages/snr": 3.7638950952465895e-09,
"advantages/std": 0.24743583798408508,
"advantages/var": 0.0612244939188864,
"completions/clipped_ratio": -2.9375,
"epoch": 3.5157593123209168,
"grad_norm": 39.87785063035209,
"learning_rate": 7.871760172417763e-08,
"loss": -0.5317,
"num_tokens": 94594088.0,
"residual_var": 0.04209185391664505,
"reward": 0.8515625,
"reward_std": 0.12756997346878052,
"rewards/drgrpo_math_reward/mean": 0.8515625,
"rewards/drgrpo_math_reward/std": 0.3562295734882355,
"rho2": 0.31249991059303284,
"step": 612
},
{
"advantages/mean": -4.656612873077393e-10,
"advantages/snr": 1.7120408936770962e-09,
"advantages/std": 0.27199190855026245,
"advantages/var": 0.07397959831681433,
"completions/clipped_ratio": -2.84375,
"epoch": 3.5214899713467047,
"grad_norm": 47.423187700952894,
"learning_rate": 7.698153248460271e-08,
"loss": -0.7605,
"num_tokens": 94757232.0,
"residual_var": 0.048549119383096695,
"reward": 0.8359375,
"reward_std": 0.14716076850891113,
"rewards/drgrpo_math_reward/mean": 0.8359375,
"rewards/drgrpo_math_reward/std": 0.3710577189922333,
"rho2": 0.3437499403953552,
"step": 613
},
{
"advantages/mean": -2.3283064365386963e-10,
"advantages/snr": 9.459133312827934e-10,
"advantages/std": 0.24614372849464417,
"advantages/var": 0.0605867350772451,
"completions/clipped_ratio": -2.90625,
"epoch": 3.5272206303724927,
"grad_norm": 50.92583879853945,
"learning_rate": 7.526405469775954e-08,
"loss": -0.1927,
"num_tokens": 94906140.0,
"residual_var": 0.04544006660580635,
"reward": 0.84765625,
"reward_std": 0.12046922743320465,
"rewards/drgrpo_math_reward/mean": 0.84765625,
"rewards/drgrpo_math_reward/std": 0.3600577116012573,
"rho2": 0.24999995529651642,
"step": 614
},
{
"advantages/mean": -6.984919309616089e-10,
"advantages/snr": 3.1937709000681565e-09,
"advantages/std": 0.2187044471502304,
"advantages/var": 0.047831635203287926,
"completions/clipped_ratio": -2.90625,
"epoch": 3.532951289398281,
"grad_norm": 44.685094883209764,
"learning_rate": 7.356520295711254e-08,
"loss": -0.294,
"num_tokens": 95067668.0,
"residual_var": 0.037368472665548325,
"reward": 0.85546875,
"reward_std": 0.10087842494249344,
"rewards/drgrpo_math_reward/mean": 0.85546875,
"rewards/drgrpo_math_reward/std": 0.35231640934944153,
"rho2": 0.2187499701976776,
"step": 615
},
{
"advantages/mean": 0.0,
"advantages/snr": 0.0,
"advantages/std": 0.1157275065779686,
"advantages/var": 0.013392855778753765,
"completions/clipped_ratio": -3.0,
"epoch": 3.5386819484240686,
"grad_norm": 28.964255935442797,
"learning_rate": 7.188501148096116e-08,
"loss": -0.0823,
"num_tokens": 95207002.0,
"residual_var": 0.012137286365032196,
"reward": 0.86328125,
"reward_std": 0.03314562886953354,
"rewards/drgrpo_math_reward/mean": 0.86328125,
"rewards/drgrpo_math_reward/std": 0.34422317147254944,
"rho2": 0.09374997764825821,
"step": 616
},
{
"advantages/mean": 6.984919309616089e-10,
"advantages/snr": 2.7658867314519385e-09,
"advantages/std": 0.25253814458847046,
"advantages/var": 0.06377551447218721,
"completions/clipped_ratio": -3.0,
"epoch": 3.544412607449857,
"grad_norm": 49.01721111929761,
"learning_rate": 7.022351411174865e-08,
"loss": -0.1572,
"num_tokens": 95352828.0,
"residual_var": 0.05181760713458061,
"reward": 0.8203125,
"reward_std": 0.11678344011306763,
"rewards/drgrpo_math_reward/mean": 0.8203125,
"rewards/drgrpo_math_reward/std": 0.38467901945114136,
"rho2": 0.1874999701976776,
"step": 617
},
{
"advantages/mean": 4.656612873077393e-10,
"advantages/snr": 3.209860470786324e-09,
"advantages/std": 0.1450721174478531,
"advantages/var": 0.02104591926080368,
"completions/clipped_ratio": -2.984375,
"epoch": 3.5501432664756445,
"grad_norm": 30.35392299977766,
"learning_rate": 6.858074431538164e-08,
"loss": -0.0558,
"num_tokens": 95497198.0,
"residual_var": 0.018415190279483795,
"reward": 0.96484375,
"reward_std": 0.04761157184839249,
"rewards/drgrpo_math_reward/mean": 0.96484375,
"rewards/drgrpo_math_reward/std": 0.18453538417816162,
"rho2": 0.12499997764825821,
"step": 618
},
{
"advantages/mean": 2.3283064365386963e-10,
"advantages/snr": 1.043916747715097e-09,
"advantages/std": 0.22303563356399536,
"advantages/var": 0.049744893839292814,
"completions/clipped_ratio": -2.75,
"epoch": 3.555873925501433,
"grad_norm": 41.92973970694584,
"learning_rate": 6.695673518055578e-08,
"loss": 0.1913,
"num_tokens": 95646256.0,
"residual_var": 0.03886321559548378,
"reward": 0.8203125,
"reward_std": 0.10258589684963226,
"rewards/drgrpo_math_reward/mean": 0.8203125,
"rewards/drgrpo_math_reward/std": 0.38467901945114136,
"rho2": 0.21874994039535522,
"step": 619
},
{
"advantages/mean": 1.1641532182693481e-09,
"advantages/snr": 4.375439129655397e-09,
"advantages/std": 0.26606544852256775,
"advantages/var": 0.07079082289751515,
"completions/clipped_ratio": -2.890625,
"epoch": 3.5616045845272204,
"grad_norm": 56.92800431775484,
"learning_rate": 6.535151941808914e-08,
"loss": -0.5503,
"num_tokens": 95790003.0,
"residual_var": 0.05751755088567734,
"reward": 0.86328125,
"reward_std": 0.13006988167762756,
"rewards/drgrpo_math_reward/mean": 0.86328125,
"rewards/drgrpo_math_reward/std": 0.34422317147254944,
"rho2": 0.18749995529651642,
"step": 620
},
{
"advantages/mean": -9.313225746154785e-10,
"advantages/snr": 4.230252817593886e-09,
"advantages/std": 0.220157653093338,
"advantages/var": 0.048469392215566565,
"completions/clipped_ratio": -3.0,
"epoch": 3.567335243553009,
"grad_norm": 60.286859613453636,
"learning_rate": 6.376512936026279e-08,
"loss": -0.0555,
"num_tokens": 95931649.0,
"residual_var": 0.03938138857483864,
"reward": 0.8828125,
"reward_std": 0.09495474398136139,
"rewards/drgrpo_math_reward/mean": 0.8828125,
"rewards/drgrpo_math_reward/std": 0.3222736418247223,
"rho2": 0.1874999701976776,
"step": 621
},
{
"advantages/mean": 1.7462298274040222e-09,
"advantages/snr": 8.781698823081943e-09,
"advantages/std": 0.19884873926639557,
"advantages/var": 0.03954082110783497,
"completions/clipped_ratio": -2.90625,
"epoch": 3.5730659025787963,
"grad_norm": 39.14535432506035,
"learning_rate": 6.219759696017113e-08,
"loss": -0.1751,
"num_tokens": 96075681.0,
"residual_var": 0.03089127317070961,
"reward": 0.8515625,
"reward_std": 0.08588206768035889,
"rewards/drgrpo_math_reward/mean": 0.8515625,
"rewards/drgrpo_math_reward/std": 0.3562295734882355,
"rho2": 0.21874995529651642,
"step": 622
},
{
"advantages/mean": -1.1641532182693481e-09,
"advantages/snr": 5.3229515001135936e-09,
"advantages/std": 0.2187044471502304,
"advantages/var": 0.047831635203287926,
"completions/clipped_ratio": -2.859375,
"epoch": 3.5787965616045847,
"grad_norm": 53.47147141809364,
"learning_rate": 6.064895379107659e-08,
"loss": -0.1134,
"num_tokens": 96225596.0,
"residual_var": 0.03736847639083862,
"reward": 0.84765625,
"reward_std": 0.09442433714866638,
"rewards/drgrpo_math_reward/mean": 0.84765625,
"rewards/drgrpo_math_reward/std": 0.3600577116012573,
"rho2": 0.21874994039535522,
"step": 623
},
{
"advantages/mean": 6.984919309616089e-10,
"advantages/snr": 2.6614758307843997e-09,
"advantages/std": 0.262445330619812,
"advantages/var": 0.06887755156414244,
"completions/clipped_ratio": -2.859375,
"epoch": 3.5845272206303727,
"grad_norm": 47.78001634418774,
"learning_rate": 5.911923104577454e-08,
"loss": -0.3104,
"num_tokens": 96380274.0,
"residual_var": 0.05165817588567734,
"reward": 0.78125,
"reward_std": 0.12901148200035095,
"rewards/drgrpo_math_reward/mean": 0.78125,
"rewards/drgrpo_math_reward/std": 0.41420844197273254,
"rho2": 0.24999995529651642,
"step": 624
},
{
"advantages/mean": -1.3969838619232178e-09,
"advantages/snr": 6.2634996493533425e-09,
"advantages/std": 0.22303566336631775,
"advantages/var": 0.04974490713325341,
"completions/clipped_ratio": -2.890625,
"epoch": 3.5902578796561606,
"grad_norm": 46.22086643750276,
"learning_rate": 5.760845953596527e-08,
"loss": -0.8019,
"num_tokens": 96521301.0,
"residual_var": 0.043526798486709595,
"reward": 0.84375,
"reward_std": 0.08956148475408554,
"rewards/drgrpo_math_reward/mean": 0.84375,
"rewards/drgrpo_math_reward/std": 0.3638034462928772,
"rho2": 0.12499997019767761,
"step": 625
},
{
"advantages/mean": 5.820766091346741e-10,
"advantages/snr": 2.7747801921406384e-09,
"advantages/std": 0.2097739428281784,
"advantages/var": 0.04400510708967986,
"completions/clipped_ratio": -2.96875,
"epoch": 3.5959885386819486,
"grad_norm": 46.5684109422701,
"learning_rate": 5.611666969163242e-08,
"loss": -0.289,
"num_tokens": 96657732.0,
"residual_var": 0.03437899798154831,
"reward": 0.86328125,
"reward_std": 0.09100939333438873,
"rewards/drgrpo_math_reward/mean": 0.86328125,
"rewards/drgrpo_math_reward/std": 0.34422317147254944,
"rho2": 0.2187499701976776,
"step": 626
},
{
"advantages/mean": 4.656612873077393e-10,
"advantages/snr": 2.557063287032119e-09,
"advantages/std": 0.1821078360080719,
"advantages/var": 0.03316326393554281,
"completions/clipped_ratio": -2.765625,
"epoch": 3.6017191977077365,
"grad_norm": 44.745888500064524,
"learning_rate": 5.464389156043114e-08,
"loss": -0.2185,
"num_tokens": 96799094.0,
"residual_var": 0.029017869383096695,
"reward": 0.84375,
"reward_std": 0.06602286547422409,
"rewards/drgrpo_math_reward/mean": 0.84375,
"rewards/drgrpo_math_reward/std": 0.3638034462928772,
"rho2": 0.12499997019767761,
"step": 627
},
{
"advantages/mean": -4.656612873077393e-10,
"advantages/snr": 3.1623042258115645e-09,
"advantages/std": 0.14725378155708313,
"advantages/var": 0.021683676182861156,
"completions/clipped_ratio": -2.921875,
"epoch": 3.6074498567335245,
"grad_norm": 29.061281994660337,
"learning_rate": 5.3190154807082e-08,
"loss": 0.137,
"num_tokens": 96935256.0,
"residual_var": 0.019650837406516075,
"reward": 0.890625,
"reward_std": 0.041687894612550735,
"rewards/drgrpo_math_reward/mean": 0.890625,
"rewards/drgrpo_math_reward/std": 0.31272050738334656,
"rho2": 0.09374997764825821,
"step": 628
},
{
"advantages/mean": 4.656612873077393e-10,
"advantages/snr": 2.236086887785319e-09,
"advantages/std": 0.2082482874393463,
"advantages/var": 0.043367349221420604,
"completions/clipped_ratio": -2.890625,
"epoch": 3.6131805157593124,
"grad_norm": 49.603404350651765,
"learning_rate": 5.175548871277358e-08,
"loss": -0.357,
"num_tokens": 97087405.0,
"residual_var": 0.03523598238825798,
"reward": 0.78125,
"reward_std": 0.08337578922510147,
"rewards/drgrpo_math_reward/mean": 0.78125,
"rewards/drgrpo_math_reward/std": 0.41420844197273254,
"rho2": 0.18749995529651642,
"step": 629
},
{
"advantages/mean": 3.4924596548080444e-10,
"advantages/snr": 1.4418181244196808e-09,
"advantages/std": 0.24222607910633087,
"advantages/var": 0.05867347339922646,
"completions/clipped_ratio": -2.84375,
"epoch": 3.6189111747851004,
"grad_norm": 42.7166520741745,
"learning_rate": 5.033992217457395e-08,
"loss": -0.0058,
"num_tokens": 97237459.0,
"residual_var": 0.04767220467329025,
"reward": 0.7890625,
"reward_std": 0.10520447790622711,
"rewards/drgrpo_math_reward/mean": 0.7890625,
"rewards/drgrpo_math_reward/std": 0.4087733030319214,
"rho2": 0.18749995529651642,
"step": 630
},
{
"advantages/mean": -4.656612873077393e-10,
"advantages/snr": 2.4005852964484455e-09,
"advantages/std": 0.1939782202243805,
"advantages/var": 0.03762754992141826,
"completions/clipped_ratio": -2.96875,
"epoch": 3.6246418338108883,
"grad_norm": 36.484359460099256,
"learning_rate": 4.8943483704846465e-08,
"loss": -0.2171,
"num_tokens": 97390087.0,
"residual_var": 0.03174825757741928,
"reward": 0.80078125,
"reward_std": 0.077071413397789,
"rewards/drgrpo_math_reward/mean": 0.80078125,
"rewards/drgrpo_math_reward/std": 0.40019527077674866,
"rho2": 0.1562499701976776,
"step": 631
},
{
"advantages/mean": -1.1641532182693481e-10,
"advantages/snr": 6.105844111983757e-10,
"advantages/std": 0.19066211581230164,
"advantages/var": 0.03635204240602352,
"completions/clipped_ratio": -2.96875,
"epoch": 3.6303724928366763,
"grad_norm": 39.535528476647805,
"learning_rate": 4.756620143067724e-08,
"loss": 0.0628,
"num_tokens": 97530545.0,
"residual_var": 0.031808048486709595,
"reward": 0.92578125,
"reward_std": 0.06944026052951813,
"rewards/drgrpo_math_reward/mean": 0.92578125,
"rewards/drgrpo_math_reward/std": 0.2626400291919708,
"rho2": 0.12499997764825821,
"step": 632
},
{
"advantages/mean": -1.1641532182693481e-09,
"advantages/snr": 5.470839275353562e-09,
"advantages/std": 0.21279242634773254,
"advantages/var": 0.04528061671095518,
"completions/clipped_ratio": -2.828125,
"epoch": 3.6361031518624642,
"grad_norm": 36.704938684625795,
"learning_rate": 4.620810309330803e-08,
"loss": -0.2712,
"num_tokens": 97668443.0,
"residual_var": 0.03396047279238701,
"reward": 0.87109375,
"reward_std": 0.09864053130149841,
"rewards/drgrpo_math_reward/mean": 0.87109375,
"rewards/drgrpo_math_reward/std": 0.33575257658958435,
"rho2": 0.24999995529651642,
"step": 633
},
{
"advantages/mean": -1.1641532182693481e-10,
"advantages/snr": 5.322951500113594e-10,
"advantages/std": 0.2187044471502304,
"advantages/var": 0.047831635203287926,
"completions/clipped_ratio": -2.9375,
"epoch": 3.641833810888252,
"grad_norm": 40.750492569142516,
"learning_rate": 4.4869216047576986e-08,
"loss": -0.4012,
"num_tokens": 97795803.0,
"residual_var": 0.037368472665548325,
"reward": 0.89453125,
"reward_std": 0.10087842494249344,
"rewards/drgrpo_math_reward/mean": 0.89453125,
"rewards/drgrpo_math_reward/std": 0.3077581524848938,
"rho2": 0.2187499701976776,
"step": 634
},
{
"advantages/mean": 0.0,
"advantages/snr": 0.0,
"advantages/std": 0.06681530922651291,
"advantages/var": 0.004464285547034541,
"completions/clipped_ratio": -2.859375,
"epoch": 3.64756446991404,
"grad_norm": 16.114132966316024,
"learning_rate": 4.3549567261368494e-08,
"loss": -0.0488,
"num_tokens": 97936947.0,
"residual_var": 0.004324787296354771,
"reward": 0.90234375,
"reward_std": 0.011048543266952038,
"rewards/drgrpo_math_reward/mean": 0.90234375,
"rewards/drgrpo_math_reward/std": 0.29743078351020813,
"rho2": 0.031249990686774254,
"step": 635
},
{
"advantages/mean": 0.0,
"advantages/snr": 0.0,
"advantages/std": 0.20671138167381287,
"advantages/var": 0.04272959531349674,
"completions/clipped_ratio": -2.953125,
"epoch": 3.653295128939828,
"grad_norm": 39.26219200768812,
"learning_rate": 4.224918331506955e-08,
"loss": -0.2505,
"num_tokens": 98077775.0,
"residual_var": 0.03605310246348381,
"reward": 0.90234375,
"reward_std": 0.07574218511581421,
"rewards/drgrpo_math_reward/mean": 0.90234375,
"rewards/drgrpo_math_reward/std": 0.29743078351020813,
"rho2": 0.15624995529651642,
"step": 636
},
{
"advantages/mean": -1.280568540096283e-09,
"advantages/snr": 4.728538381659934e-09,
"advantages/std": 0.27081698179244995,
"advantages/var": 0.07334183762717217,
"completions/clipped_ratio": -2.96875,
"epoch": 3.659025787965616,
"grad_norm": 79.3512309491782,
"learning_rate": 4.096809040103444e-08,
"loss": -0.52,
"num_tokens": 98222118.0,
"residual_var": 0.048130594193935394,
"reward": 0.89453125,
"reward_std": 0.14716322720050812,
"rewards/drgrpo_math_reward/mean": 0.89453125,
"rewards/drgrpo_math_reward/std": 0.3077581524848938,
"rho2": 0.3437499403953552,
"step": 637
},
{
"advantages/mean": 2.3283064365386963e-10,
"advantages/snr": 1.0372886398259093e-09,
"advantages/std": 0.22446079552173615,
"advantages/var": 0.050382648726250645,
"completions/clipped_ratio": -2.8125,
"epoch": 3.664756446991404,
"grad_norm": 40.029001858794814,
"learning_rate": 3.9706314323056936e-08,
"loss": -0.1787,
"num_tokens": 98386258.0,
"residual_var": 0.03621254488825798,
"reward": 0.75390625,
"reward_std": 0.10968662798404694,
"rewards/drgrpo_math_reward/mean": 0.75390625,
"rewards/drgrpo_math_reward/std": 0.43157756328582764,
"rho2": 0.2812499403953552,
"step": 638
},
{
"advantages/mean": -2.3283064365386963e-10,
"advantages/snr": 1.2320239164203405e-09,
"advantages/std": 0.18898223340511322,
"advantages/var": 0.03571428454278469,
"completions/clipped_ratio": -2.90625,
"epoch": 3.670487106017192,
"grad_norm": 73.46864887642852,
"learning_rate": 3.846388049585114e-08,
"loss": -0.1359,
"num_tokens": 98534613.0,
"residual_var": 0.030133940279483795,
"reward": 0.71875,
"reward_std": 0.07536394149065018,
"rewards/drgrpo_math_reward/mean": 0.71875,
"rewards/drgrpo_math_reward/std": 0.45048993825912476,
"rho2": 0.1562499701976776,
"step": 639
},
{
"advantages/mean": 1.1641532182693481e-09,
"advantages/snr": 5.287816379893159e-09,
"advantages/std": 0.22015763819217682,
"advantages/var": 0.04846938565435743,
"completions/clipped_ratio": -2.875,
"epoch": 3.67621776504298,
"grad_norm": 36.833960394703546,
"learning_rate": 3.724081394453915e-08,
"loss": -0.1712,
"num_tokens": 98679085.0,
"residual_var": 0.03938139230012894,
"reward": 0.828125,
"reward_std": 0.09495474398136139,
"rewards/drgrpo_math_reward/mean": 0.828125,
"rewards/drgrpo_math_reward/std": 0.3780108094215393,
"rho2": 0.18749995529651642,
"step": 640
},
{
"advantages/mean": -4.656612873077393e-10,
"advantages/snr": 2.158150412641125e-09,
"advantages/std": 0.21576867997646332,
"advantages/var": 0.04655612325878544,
"completions/clipped_ratio": -2.9375,
"epoch": 3.681948424068768,
"grad_norm": 39.81293821380969,
"learning_rate": 3.6037139304146756e-08,
"loss": -0.3951,
"num_tokens": 98822479.0,
"residual_var": 0.040736615657806396,
"reward": 0.85546875,
"reward_std": 0.0861440896987915,
"rewards/drgrpo_math_reward/mean": 0.85546875,
"rewards/drgrpo_math_reward/std": 0.35231640934944153,
"rho2": 0.12499997019767761,
"step": 641
},
{
"advantages/mean": -2.3283064365386963e-10,
"advantages/snr": 8.830796510959388e-10,
"advantages/std": 0.2636575698852539,
"advantages/var": 0.06951531415779755,
"completions/clipped_ratio": -2.875,
"epoch": 3.687679083094556,
"grad_norm": 49.9808279744873,
"learning_rate": 3.4852880819107974e-08,
"loss": -0.502,
"num_tokens": 98976832.0,
"residual_var": 0.04779178649187088,
"reward": 0.87890625,
"reward_std": 0.13611221313476562,
"rewards/drgrpo_math_reward/mean": 0.87890625,
"rewards/drgrpo_math_reward/std": 0.3268752694129944,
"rho2": 0.3124999403953552,
"step": 642
},
{
"advantages/mean": 1.0477378964424133e-09,
"advantages/snr": 3.8520920107734665e-09,
"advantages/std": 0.27199190855026245,
"advantages/var": 0.07397959831681433,
"completions/clipped_ratio": -2.71875,
"epoch": 3.693409742120344,
"grad_norm": 72.31122433456379,
"learning_rate": 3.3688062342776105e-08,
"loss": -0.4692,
"num_tokens": 99131413.0,
"residual_var": 0.05317284166812897,
"reward": 0.8203125,
"reward_std": 0.14005757868289948,
"rewards/drgrpo_math_reward/mean": 0.8203125,
"rewards/drgrpo_math_reward/std": 0.38467901945114136,
"rho2": 0.2812499403953552,
"step": 643
},
{
"advantages/mean": 6.984919309616089e-10,
"advantages/snr": 3.4045714528780252e-09,
"advantages/std": 0.20516294240951538,
"advantages/var": 0.042091832938130125,
"completions/clipped_ratio": -2.953125,
"epoch": 3.6991404011461317,
"grad_norm": 41.15042948921175,
"learning_rate": 3.254270733694331e-08,
"loss": -0.5067,
"num_tokens": 99265316.0,
"residual_var": 0.03419962897896767,
"reward": 0.921875,
"reward_std": 0.08166586607694626,
"rewards/drgrpo_math_reward/mean": 0.921875,
"rewards/drgrpo_math_reward/std": 0.26889389753341675,
"rho2": 0.18749994039535522,
"step": 644
},
{
"advantages/mean": -1.862645149230957e-09,
"advantages/snr": 6.395541197776692e-09,
"advantages/std": 0.29124119877815247,
"advantages/var": 0.08482143586573532,
"completions/clipped_ratio": -2.890625,
"epoch": 3.7048710601719197,
"grad_norm": 54.4289614540195,
"learning_rate": 3.141683887136892e-08,
"loss": -0.7299,
"num_tokens": 99423558.0,
"residual_var": 0.058314744383096695,
"reward": 0.80859375,
"reward_std": 0.1514868289232254,
"rewards/drgrpo_math_reward/mean": 0.80859375,
"rewards/drgrpo_math_reward/std": 0.39417871832847595,
"rho2": 0.3124999403953552,
"step": 645
},
{
"advantages/mean": -9.313225746154785e-10,
"advantages/snr": 4.972694468794569e-09,
"advantages/std": 0.18728730082511902,
"advantages/var": 0.03507653305035863,
"completions/clipped_ratio": -2.828125,
"epoch": 3.7106017191977076,
"grad_norm": 39.47961175206918,
"learning_rate": 3.0310479623313125e-08,
"loss": -0.3528,
"num_tokens": 99574657.0,
"residual_var": 0.02849969081580639,
"reward": 0.86328125,
"reward_std": 0.07483352720737457,
"rewards/drgrpo_math_reward/mean": 0.86328125,
"rewards/drgrpo_math_reward/std": 0.34422317147254944,
"rho2": 0.18749995529651642,
"step": 646
},
{
"advantages/mean": 9.313225746154785e-10,
"advantages/snr": 3.97670992529417e-09,
"advantages/std": 0.23419423401355743,
"advantages/var": 0.0548469392451969,
"completions/clipped_ratio": -2.90625,
"epoch": 3.7163323782234956,
"grad_norm": 54.18482780264222,
"learning_rate": 2.9223651877081867e-08,
"loss": 0.1581,
"num_tokens": 99720962.0,
"residual_var": 0.042849183082580566,
"reward": 0.875,
"reward_std": 0.10771076381206512,
"rewards/drgrpo_math_reward/mean": 0.875,
"rewards/drgrpo_math_reward/std": 0.33136674761772156,
"rho2": 0.21874994039535522,
"step": 647
},
{
"advantages/mean": 0.0,
"advantages/snr": 0.0,
"advantages/std": 0.22728431224822998,
"advantages/var": 0.051658158594150905,
"completions/clipped_ratio": -2.953125,
"epoch": 3.7220630372492836,
"grad_norm": 51.455630603359445,
"learning_rate": 2.8156377523576802e-08,
"loss": -0.4604,
"num_tokens": 99857133.0,
"residual_var": 0.03712931647896767,
"reward": 0.91015625,
"reward_std": 0.11139655113220215,
"rewards/drgrpo_math_reward/mean": 0.91015625,
"rewards/drgrpo_math_reward/std": 0.2865179479122162,
"rho2": 0.2812499403953552,
"step": 648
},
{
"advantages/mean": 0.0,
"advantages/snr": 0.0,
"advantages/std": 0.20516295731067657,
"advantages/var": 0.0420918390524625,
"completions/clipped_ratio": -2.796875,
"epoch": 3.7277936962750715,
"grad_norm": 39.31169214983867,
"learning_rate": 2.7108678059855062e-08,
"loss": -0.5246,
"num_tokens": 100002956.0,
"residual_var": 0.03551499918103218,
"reward": 0.8203125,
"reward_std": 0.08219873160123825,
"rewards/drgrpo_math_reward/mean": 0.8203125,
"rewards/drgrpo_math_reward/std": 0.38467901945114136,
"rho2": 0.1562499850988388,
"step": 649
},
{
"advantages/mean": 0.0,
"advantages/snr": 0.0,
"advantages/std": 0.1939782202243805,
"advantages/var": 0.03762754992141826,
"completions/clipped_ratio": -2.734375,
"epoch": 3.7335243553008595,
"grad_norm": 43.58478515285809,
"learning_rate": 2.6080574588696058e-08,
"loss": -0.1068,
"num_tokens": 100157687.0,
"residual_var": 0.029396533966064453,
"reward": 0.83203125,
"reward_std": 0.08417459577322006,
"rewards/drgrpo_math_reward/mean": 0.83203125,
"rewards/drgrpo_math_reward/std": 0.3745708465576172,
"rho2": 0.2187499701976776,
"step": 650
},
{
"advantages/mean": 4.656612873077393e-10,
"advantages/snr": 2.380496292175044e-09,
"advantages/std": 0.19561520218849182,
"advantages/var": 0.038265307327244535,
"completions/clipped_ratio": -2.90625,
"epoch": 3.7392550143266474,
"grad_norm": 37.90389101739278,
"learning_rate": 2.507208781817638e-08,
"loss": -0.3881,
"num_tokens": 100296628.0,
"residual_var": 0.033482152968645096,
"reward": 0.8828125,
"reward_std": 0.07114773988723755,
"rewards/drgrpo_math_reward/mean": 0.8828125,
"rewards/drgrpo_math_reward/std": 0.3222736418247223,
"rho2": 0.12499997019767761,
"step": 651
},
{
"advantages/mean": -6.984919309616089e-10,
"advantages/snr": 2.982532443970627e-09,
"advantages/std": 0.23419423401355743,
"advantages/var": 0.0548469392451969,
"completions/clipped_ratio": -2.8125,
"epoch": 3.7449856733524354,
"grad_norm": 53.56485887973218,
"learning_rate": 2.4083238061252563e-08,
"loss": -0.5428,
"num_tokens": 100457887.0,
"residual_var": 0.03942124918103218,
"reward": 0.8828125,
"reward_std": 0.11481393873691559,
"rewards/drgrpo_math_reward/mean": 0.8828125,
"rewards/drgrpo_math_reward/std": 0.3222736418247223,
"rho2": 0.2812499403953552,
"step": 652
},
{
"advantages/mean": 1.1641532182693481e-09,
"advantages/snr": 6.001463241121114e-09,
"advantages/std": 0.1939782202243805,
"advantages/var": 0.03762754992141826,
"completions/clipped_ratio": -2.796875,
"epoch": 3.7507163323782233,
"grad_norm": 36.834434673444534,
"learning_rate": 2.311404523535243e-08,
"loss": -0.2637,
"num_tokens": 100610163.0,
"residual_var": 0.03174825757741928,
"reward": 0.87109375,
"reward_std": 0.077071413397789,
"rewards/drgrpo_math_reward/mean": 0.87109375,
"rewards/drgrpo_math_reward/std": 0.33575257658958435,
"rho2": 0.15624995529651642,
"step": 653
},
{
"advantages/mean": 6.984919309616089e-10,
"advantages/snr": 4.6098110067255745e-09,
"advantages/std": 0.15152288973331451,
"advantages/var": 0.02295918611313419,
"completions/clipped_ratio": -2.96875,
"epoch": 3.7564469914040117,
"grad_norm": 29.425561410312323,
"learning_rate": 2.2164528861973065e-08,
"loss": -0.1512,
"num_tokens": 100739733.0,
"residual_var": 0.020089294761419296,
"reward": 0.8671875,
"reward_std": 0.04931904375553131,
"rewards/drgrpo_math_reward/mean": 0.8671875,
"rewards/drgrpo_math_reward/std": 0.3400367796421051,
"rho2": 0.12499997019767761,
"step": 654
},
{
"advantages/mean": -4.656612873077393e-10,
"advantages/snr": 1.9018629386468357e-09,
"advantages/std": 0.24484480917453766,
"advantages/var": 0.05994898057971576,
"completions/clipped_ratio": -2.953125,
"epoch": 3.7621776504297992,
"grad_norm": 50.246981983613296,
"learning_rate": 2.1234708066288574e-08,
"loss": -0.3237,
"num_tokens": 100876691.0,
"residual_var": 0.04683515802025795,
"reward": 0.8046875,
"reward_std": 0.11928971856832504,
"rewards/drgrpo_math_reward/mean": 0.8046875,
"rewards/drgrpo_math_reward/std": 0.39721766114234924,
"rho2": 0.21874995529651642,
"step": 655
},
{
"advantages/mean": -1.0477378964424133e-09,
"advantages/snr": 4.473798381299966e-09,
"advantages/std": 0.23419424891471863,
"advantages/var": 0.05484694622472919,
"completions/clipped_ratio": -2.90625,
"epoch": 3.7679083094555876,
"grad_norm": 55.94674225949123,
"learning_rate": 2.032460157676452e-08,
"loss": -0.2971,
"num_tokens": 101028763.0,
"residual_var": 0.04113522171974182,
"reward": 0.8828125,
"reward_std": 0.10889272391796112,
"rewards/drgrpo_math_reward/mean": 0.8828125,
"rewards/drgrpo_math_reward/std": 0.3222736418247223,
"rho2": 0.24999995529651642,
"step": 656
},
{
"advantages/mean": -4.656612873077393e-10,
"advantages/snr": 2.158150412641125e-09,
"advantages/std": 0.21576867997646332,
"advantages/var": 0.04655612325878544,
"completions/clipped_ratio": -2.90625,
"epoch": 3.773638968481375,
"grad_norm": 57.17036715686087,
"learning_rate": 1.9434227724779984e-08,
"loss": -0.639,
"num_tokens": 101164375.0,
"residual_var": 0.03637198358774185,
"reward": 0.87890625,
"reward_std": 0.09271440654993057,
"rewards/drgrpo_math_reward/mean": 0.87890625,
"rewards/drgrpo_math_reward/std": 0.3268752694129944,
"rho2": 0.21874995529651642,
"step": 657
},
{
"advantages/mean": -4.656612873077393e-10,
"advantages/snr": 1.8347735436508976e-09,
"advantages/std": 0.2537976801395416,
"advantages/var": 0.06441326244421308,
"completions/clipped_ratio": -2.921875,
"epoch": 3.7793696275071635,
"grad_norm": 50.445926119141305,
"learning_rate": 1.856360444425953e-08,
"loss": -0.1995,
"num_tokens": 101308179.0,
"residual_var": 0.04428413510322571,
"reward": 0.91015625,
"reward_std": 0.1374414563179016,
"rewards/drgrpo_math_reward/mean": 0.91015625,
"rewards/drgrpo_math_reward/std": 0.2865179479122162,
"rho2": 0.3124999403953552,
"step": 658
},
{
"advantages/mean": 0.0,
"advantages/snr": 0.0,
"advantages/std": 0.19884873926639557,
"advantages/var": 0.03954082110783497,
"completions/clipped_ratio": -2.90625,
"epoch": 3.785100286532951,
"grad_norm": 37.38180564757065,
"learning_rate": 1.771274927131139e-08,
"loss": -0.4388,
"num_tokens": 101452351.0,
"residual_var": 0.03089127317070961,
"reward": 0.84375,
"reward_std": 0.08588207513093948,
"rewards/drgrpo_math_reward/mean": 0.84375,
"rewards/drgrpo_math_reward/std": 0.3638034462928772,
"rho2": 0.21874995529651642,
"step": 659
},
{
"advantages/mean": -1.0477378964424133e-09,
"advantages/snr": 5.401316502086732e-09,
"advantages/std": 0.1939782351255417,
"advantages/var": 0.037627555702419935,
"completions/clipped_ratio": -2.90625,
"epoch": 3.7908309455587395,
"grad_norm": 28.992450313688646,
"learning_rate": 1.6881679343873634e-08,
"loss": 0.1372,
"num_tokens": 101597856.0,
"residual_var": 0.032924119383096695,
"reward": 0.92578125,
"reward_std": 0.07115019112825394,
"rewards/drgrpo_math_reward/mean": 0.92578125,
"rewards/drgrpo_math_reward/std": 0.2626400291919708,
"rho2": 0.1249999850988388,
"step": 660
},
{
"advantages/mean": -1.6298145055770874e-09,
"advantages/snr": 6.621393318979555e-09,
"advantages/std": 0.24614372849464417,
"advantages/var": 0.0605867350772451,
"completions/clipped_ratio": -2.875,
"epoch": 3.796561604584527,
"grad_norm": 60.214687351110435,
"learning_rate": 1.607041140137033e-08,
"loss": -0.1922,
"num_tokens": 101755824.0,
"residual_var": 0.04165339842438698,
"reward": 0.83203125,
"reward_std": 0.1275724172592163,
"rewards/drgrpo_math_reward/mean": 0.83203125,
"rewards/drgrpo_math_reward/std": 0.3745708465576172,
"rho2": 0.3124999403953552,
"step": 661
},
{
"advantages/mean": 6.984919309616089e-10,
"advantages/snr": 3.017830079630706e-09,
"advantages/std": 0.2314550131559372,
"advantages/var": 0.05357142311501506,
"completions/clipped_ratio": -2.921875,
"epoch": 3.8022922636103154,
"grad_norm": 39.232079845003454,
"learning_rate": 1.5278961784373266e-08,
"loss": -0.2005,
"num_tokens": 101911230.0,
"residual_var": 0.038504477590322495,
"reward": 0.78125,
"reward_std": 0.11310403048992157,
"rewards/drgrpo_math_reward/mean": 0.78125,
"rewards/drgrpo_math_reward/std": 0.41420844197273254,
"rho2": 0.2812499403953552,
"step": 662
},
{
"advantages/mean": 0.0,
"advantages/snr": 0.0,
"advantages/std": 0.15361295640468597,
"advantages/var": 0.023596940375387954,
"completions/clipped_ratio": -2.96875,
"epoch": 3.8080229226361033,
"grad_norm": 24.328039087105505,
"learning_rate": 1.4507346434273316e-08,
"loss": -0.0079,
"num_tokens": 102062868.0,
"residual_var": 0.02138473652303219,
"reward": 0.80859375,
"reward_std": 0.04339536651968956,
"rewards/drgrpo_math_reward/mean": 0.80859375,
"rewards/drgrpo_math_reward/std": 0.39417871832847595,
"rho2": 0.09374997019767761,
"step": 663
},
{
"advantages/mean": -4.656612873077393e-10,
"advantages/snr": 2.486347036575778e-09,
"advantages/std": 0.1872873157262802,
"advantages/var": 0.03507653863195537,
"completions/clipped_ratio": -2.9375,
"epoch": 3.8137535816618913,
"grad_norm": 40.12027079585644,
"learning_rate": 1.375558089295914e-08,
"loss": -0.3267,
"num_tokens": 102191204.0,
"residual_var": 0.029595833271741867,
"reward": 0.85546875,
"reward_std": 0.06891229748725891,
"rewards/drgrpo_math_reward/mean": 0.85546875,
"rewards/drgrpo_math_reward/std": 0.35231640934944153,
"rho2": 0.1562499850988388,
"step": 664
},
{
"advantages/mean": -2.3283064365386963e-10,
"advantages/snr": 9.36110830002126e-10,
"advantages/std": 0.24872122704982758,
"advantages/var": 0.06186224878517188,
"completions/clipped_ratio": -2.671875,
"epoch": 3.819484240687679,
"grad_norm": 54.17369996636717,
"learning_rate": 1.3023680302504336e-08,
"loss": -0.6099,
"num_tokens": 102345256.0,
"residual_var": 0.04832988977432251,
"reward": 0.80859375,
"reward_std": 0.11454310268163681,
"rewards/drgrpo_math_reward/mean": 0.80859375,
"rewards/drgrpo_math_reward/std": 0.39417871832847595,
"rho2": 0.21874994039535522,
"step": 665
},
{
"advantages/mean": 1.5133991837501526e-09,
"advantages/snr": 6.352307063392909e-09,
"advantages/std": 0.23824401199817657,
"advantages/var": 0.056760209252987304,
"completions/clipped_ratio": -2.84375,
"epoch": 3.825214899713467,
"grad_norm": 38.6683462023298,
"learning_rate": 1.231165940486234e-08,
"loss": 0.1883,
"num_tokens": 102510042.0,
"residual_var": 0.049665190279483795,
"reward": 0.75390625,
"reward_std": 0.10284791886806488,
"rewards/drgrpo_math_reward/mean": 0.75390625,
"rewards/drgrpo_math_reward/std": 0.43157756328582764,
"rho2": 0.12499997019767761,
"step": 666
},
{
"advantages/mean": 4.656612873077393e-10,
"advantages/snr": 2.1730858844377734e-09,
"advantages/std": 0.2142857164144516,
"advantages/var": 0.04591836825925477,
"completions/clipped_ratio": -2.9375,
"epoch": 3.830945558739255,
"grad_norm": 50.021929000970786,
"learning_rate": 1.1619532541569332e-08,
"loss": -0.4099,
"num_tokens": 102659424.0,
"residual_var": 0.038743630051612854,
"reward": 0.859375,
"reward_std": 0.0856136754155159,
"rewards/drgrpo_math_reward/mean": 0.859375,
"rewards/drgrpo_math_reward/std": 0.3483152687549591,
"rho2": 0.1562499701976776,
"step": 667
},
{
"advantages/mean": -1.6298145055770874e-09,
"advantages/snr": 6.7653530196824894e-09,
"advantages/std": 0.24090604484081268,
"advantages/var": 0.05803572244084365,
"completions/clipped_ratio": -2.84375,
"epoch": 3.836676217765043,
"grad_norm": 93.84246863269392,
"learning_rate": 1.0947313653455693e-08,
"loss": -0.3309,
"num_tokens": 102813762.0,
"residual_var": 0.043526798486709595,
"reward": 0.89453125,
"reward_std": 0.11231012642383575,
"rewards/drgrpo_math_reward/mean": 0.89453125,
"rewards/drgrpo_math_reward/std": 0.3077581524848938,
"rho2": 0.2499999701976776,
"step": 668
},
{
"advantages/mean": 4.656612873077393e-10,
"advantages/snr": 1.862645074725154e-09,
"advantages/std": 0.25,
"advantages/var": 0.0625,
"completions/clipped_ratio": -2.859375,
"epoch": 3.842406876790831,
"grad_norm": 44.3179794110122,
"learning_rate": 1.029501628036511e-08,
"loss": -0.1561,
"num_tokens": 102955237.0,
"residual_var": 0.046875011175870895,
"reward": 0.8515625,
"reward_std": 0.12863080203533173,
"rewards/drgrpo_math_reward/mean": 0.8515625,
"rewards/drgrpo_math_reward/std": 0.3562295734882355,
"rho2": 0.2499999701976776,
"step": 669
},
{
"advantages/mean": -2.3283064365386963e-09,
"advantages/snr": 8.523543981158329e-09,
"advantages/std": 0.27316176891326904,
"advantages/var": 0.0746173519958262,
"completions/clipped_ratio": -2.78125,
"epoch": 3.848137535816619,
"grad_norm": 41.95287015060324,
"learning_rate": 9.662653560881584e-09,
"loss": -0.4442,
"num_tokens": 103096599.0,
"residual_var": 0.05129943788051605,
"reward": 0.84765625,
"reward_std": 0.14769119024276733,
"rewards/drgrpo_math_reward/mean": 0.84765625,
"rewards/drgrpo_math_reward/std": 0.3600577116012573,
"rho2": 0.3124999403953552,
"step": 670
},
{
"advantages/mean": 1.1641532182693481e-10,
"advantages/snr": 4.2801022341927405e-10,
"advantages/std": 0.27199190855026245,
"advantages/var": 0.07397959831681433,
"completions/clipped_ratio": -2.9375,
"epoch": 3.853868194842407,
"grad_norm": 58.944559417389065,
"learning_rate": 9.050238232065299e-09,
"loss": -0.8938,
"num_tokens": 103250186.0,
"residual_var": 0.048549119383096695,
"reward": 0.8359375,
"reward_std": 0.14716076850891113,
"rewards/drgrpo_math_reward/mean": 0.8359375,
"rewards/drgrpo_math_reward/std": 0.3710577189922333,
"rho2": 0.3437499403953552,
"step": 671
},
{
"advantages/mean": -1.1641532182693481e-09,
"advantages/snr": 4.914075334034159e-09,
"advantages/std": 0.23690177500247955,
"advantages/var": 0.056122450999325446,
"completions/clipped_ratio": -2.875,
"epoch": 3.859598853868195,
"grad_norm": 49.36231287699574,
"learning_rate": 8.457782629195387e-09,
"loss": -0.6094,
"num_tokens": 103394094.0,
"residual_var": 0.03858419507741928,
"reward": 0.8203125,
"reward_std": 0.12244509160518646,
"rewards/drgrpo_math_reward/mean": 0.8203125,
"rewards/drgrpo_math_reward/std": 0.38467901945114136,
"rho2": 0.3124999403953552,
"step": 672
},
{
"advantages/mean": 3.4924596548080444e-10,
"advantages/snr": 1.4340454135281388e-09,
"advantages/std": 0.2435389757156372,
"advantages/var": 0.05931123269262173,
"completions/clipped_ratio": -2.828125,
"epoch": 3.865329512893983,
"grad_norm": 43.523584419930096,
"learning_rate": 7.885298685522235e-09,
"loss": -0.3124,
"num_tokens": 103533751.0,
"residual_var": 0.05004385486245155,
"reward": 0.86328125,
"reward_std": 0.10573489964008331,
"rewards/drgrpo_math_reward/mean": 0.86328125,
"rewards/drgrpo_math_reward/std": 0.34422317147254944,
"rho2": 0.1562499701976776,
"step": 673
},
{
"advantages/mean": -2.3283064365386963e-10,
"advantages/snr": 9.61212082946454e-10,
"advantages/std": 0.24222607910633087,
"advantages/var": 0.05867347339922646,
"completions/clipped_ratio": -2.953125,
"epoch": 3.871060171919771,
"grad_norm": 81.6292265086487,
"learning_rate": 7.3327979320264575e-09,
"loss": -0.2091,
"num_tokens": 103678840.0,
"residual_var": 0.04400511458516121,
"reward": 0.8828125,
"reward_std": 0.11876175552606583,
"rewards/drgrpo_math_reward/mean": 0.8828125,
"rewards/drgrpo_math_reward/std": 0.3222736418247223,
"rho2": 0.2499999701976776,
"step": 674
},
{
"advantages/mean": 4.656612873077393e-10,
"advantages/snr": 1.9018629386468357e-09,
"advantages/std": 0.24484480917453766,
"advantages/var": 0.05994898057971576,
"completions/clipped_ratio": -2.78125,
"epoch": 3.8767908309455588,
"grad_norm": 51.53624188456518,
"learning_rate": 6.800291497187083e-09,
"loss": -0.0777,
"num_tokens": 103838662.0,
"residual_var": 0.04683515802025795,
"reward": 0.7578125,
"reward_std": 0.11928972601890564,
"rewards/drgrpo_math_reward/mean": 0.7578125,
"rewards/drgrpo_math_reward/std": 0.4292463958263397,
"rho2": 0.21874995529651642,
"step": 675
},
{
"advantages/mean": 9.313225746154785e-10,
"advantages/snr": 5.215406001438975e-09,
"advantages/std": 0.1785714328289032,
"advantages/var": 0.03188775662256749,
"completions/clipped_ratio": -2.796875,
"epoch": 3.8825214899713467,
"grad_norm": 26.68293306608483,
"learning_rate": 6.2877901067573955e-09,
"loss": -0.1505,
"num_tokens": 103991156.0,
"residual_var": 0.0259088147431612,
"reward": 0.8515625,
"reward_std": 0.07141612470149994,
"rewards/drgrpo_math_reward/mean": 0.8515625,
"rewards/drgrpo_math_reward/std": 0.3562295734882355,
"rho2": 0.18749995529651642,
"step": 676
},
{
"advantages/mean": 4.656612873077393e-10,
"advantages/snr": 1.9545560195055106e-09,
"advantages/std": 0.23824401199817657,
"advantages/var": 0.056760209252987304,
"completions/clipped_ratio": -2.921875,
"epoch": 3.8882521489971347,
"grad_norm": 41.98491960912337,
"learning_rate": 5.795304083548558e-09,
"loss": -0.3763,
"num_tokens": 104128312.0,
"residual_var": 0.04611767828464508,
"reward": 0.86328125,
"reward_std": 0.10349701344966888,
"rewards/drgrpo_math_reward/mean": 0.86328125,
"rewards/drgrpo_math_reward/std": 0.34422317147254944,
"rho2": 0.1874999701976776,
"step": 677
},
{
"advantages/mean": 9.313225746154785e-10,
"advantages/snr": 5.215406001438975e-09,
"advantages/std": 0.1785714328289032,
"advantages/var": 0.03188775662256749,
"completions/clipped_ratio": -2.953125,
"epoch": 3.8939828080229226,
"grad_norm": 38.20002096525398,
"learning_rate": 5.322843347221661e-09,
"loss": -0.0537,
"num_tokens": 104261103.0,
"residual_var": 0.025908811017870903,
"reward": 0.9375,
"reward_std": 0.07141613215208054,
"rewards/drgrpo_math_reward/mean": 0.9375,
"rewards/drgrpo_math_reward/std": 0.24253563582897186,
"rho2": 0.18749994039535522,
"step": 678
},
{
"advantages/mean": -2.3283064365386963e-10,
"advantages/snr": 8.213493136884692e-10,
"advantages/std": 0.28347334265708923,
"advantages/var": 0.08035713599718353,
"completions/clipped_ratio": -2.90625,
"epoch": 3.8997134670487106,
"grad_norm": 54.09125544505304,
"learning_rate": 4.870417414088779e-09,
"loss": -0.4964,
"num_tokens": 104400662.0,
"residual_var": 0.057756710797548294,
"reward": 0.8671875,
"reward_std": 0.14689236879348755,
"rewards/drgrpo_math_reward/mean": 0.8671875,
"rewards/drgrpo_math_reward/std": 0.3400367796421051,
"rho2": 0.2812499403953552,
"step": 679
},
{
"advantages/mean": -4.656612873077393e-10,
"advantages/snr": 2.011886590227791e-09,
"advantages/std": 0.2314550280570984,
"advantages/var": 0.0535714300129122,
"completions/clipped_ratio": -2.765625,
"epoch": 3.9054441260744985,
"grad_norm": 41.67372257346115,
"learning_rate": 4.438035396920003e-09,
"loss": -0.0959,
"num_tokens": 104555103.0,
"residual_var": 0.043526794761419296,
"reward": 0.78125,
"reward_std": 0.10007961839437485,
"rewards/drgrpo_math_reward/mean": 0.78125,
"rewards/drgrpo_math_reward/std": 0.41420844197273254,
"rho2": 0.1874999701976776,
"step": 680
},
{
"advantages/mean": 9.313225746154785e-10,
"advantages/snr": 3.7064275334476093e-09,
"advantages/std": 0.2512722909450531,
"advantages/var": 0.06313776419677541,
"completions/clipped_ratio": -2.875,
"epoch": 3.9111747851002865,
"grad_norm": 89.25950778644456,
"learning_rate": 4.025706004760931e-09,
"loss": -0.6127,
"num_tokens": 104718658.0,
"residual_var": 0.04538027569651604,
"reward": 0.71484375,
"reward_std": 0.12335620820522308,
"rewards/drgrpo_math_reward/mean": 0.71484375,
"rewards/drgrpo_math_reward/std": 0.4523732364177704,
"rho2": 0.2812499403953552,
"step": 681
},
{
"advantages/mean": -1.862645149230957e-09,
"advantages/snr": 8.00006775297642e-09,
"advantages/std": 0.23282866179943085,
"advantages/var": 0.05420918575531375,
"completions/clipped_ratio": -2.9375,
"epoch": 3.9169054441260744,
"grad_norm": 44.862514699792975,
"learning_rate": 3.633437542756912e-09,
"loss": -0.063,
"num_tokens": 104862151.0,
"residual_var": 0.04065689817070961,
"reward": 0.91796875,
"reward_std": 0.10718034952878952,
"rewards/drgrpo_math_reward/mean": 0.91796875,
"rewards/drgrpo_math_reward/std": 0.2749498784542084,
"rho2": 0.24999994039535522,
"step": 682
},
{
"advantages/mean": 0.0,
"advantages/snr": 0.0,
"advantages/std": 0.220157653093338,
"advantages/var": 0.048469392215566565,
"completions/clipped_ratio": -2.875,
"epoch": 3.9226361031518624,
"grad_norm": 35.18229290368417,
"learning_rate": 3.261237911985404e-09,
"loss": -0.2328,
"num_tokens": 105011857.0,
"residual_var": 0.03635205328464508,
"reward": 0.8671875,
"reward_std": 0.10205793380737305,
"rewards/drgrpo_math_reward/mean": 0.8671875,
"rewards/drgrpo_math_reward/std": 0.3400367796421051,
"rho2": 0.2499999701976776,
"step": 683
},
{
"advantages/mean": 1.1641532182693481e-10,
"advantages/snr": 7.905760564528911e-10,
"advantages/std": 0.14725378155708313,
"advantages/var": 0.021683676182861156,
"completions/clipped_ratio": -2.96875,
"epoch": 3.9283667621776504,
"grad_norm": 27.054667621788635,
"learning_rate": 2.909114609297325e-09,
"loss": 0.0683,
"num_tokens": 105149957.0,
"residual_var": 0.019650837406516075,
"reward": 0.921875,
"reward_std": 0.041687894612550735,
"rewards/drgrpo_math_reward/mean": 0.921875,
"rewards/drgrpo_math_reward/std": 0.26889389753341675,
"rho2": 0.09374997764825821,
"step": 684
},
{
"advantages/mean": 4.656612873077393e-10,
"advantages/snr": 2.3609033788060646e-09,
"advantages/std": 0.19723859429359436,
"advantages/var": 0.038903063078913114,
"completions/clipped_ratio": -2.671875,
"epoch": 3.9340974212034383,
"grad_norm": 61.97921697441788,
"learning_rate": 2.577074727165951e-09,
"loss": 0.0996,
"num_tokens": 105303131.0,
"residual_var": 0.03525590896606445,
"reward": 0.76953125,
"reward_std": 0.07167815417051315,
"rewards/drgrpo_math_reward/mean": 0.76953125,
"rewards/drgrpo_math_reward/std": 0.4219578504562378,
"rho2": 0.09374997764825821,
"step": 685
},
{
"advantages/mean": 1.1641532182693481e-10,
"advantages/snr": 4.656612686812885e-10,
"advantages/std": 0.25,
"advantages/var": 0.0625,
"completions/clipped_ratio": -2.90625,
"epoch": 3.9398280802292263,
"grad_norm": 47.583896244179826,
"learning_rate": 2.2651249535439177e-09,
"loss": -0.2606,
"num_tokens": 105462806.0,
"residual_var": 0.046875014901161194,
"reward": 0.828125,
"reward_std": 0.12863078713417053,
"rewards/drgrpo_math_reward/mean": 0.828125,
"rewards/drgrpo_math_reward/std": 0.3780108094215393,
"rho2": 0.24999995529651642,
"step": 686
},
{
"advantages/mean": 2.3283064365386963e-10,
"advantages/snr": 9.772780097527553e-10,
"advantages/std": 0.23824401199817657,
"advantages/var": 0.056760209252987304,
"completions/clipped_ratio": -2.84375,
"epoch": 3.945558739255014,
"grad_norm": 42.244044782175166,
"learning_rate": 1.973271571728441e-09,
"loss": -0.0936,
"num_tokens": 105603670.0,
"residual_var": 0.04079641029238701,
"reward": 0.86328125,
"reward_std": 0.11652141809463501,
"rewards/drgrpo_math_reward/mean": 0.86328125,
"rewards/drgrpo_math_reward/std": 0.34422317147254944,
"rho2": 0.2812499403953552,
"step": 687
},
{
"advantages/mean": -3.4924596548080444e-10,
"advantages/snr": 2.1339279744244726e-09,
"advantages/std": 0.16366341710090637,
"advantages/var": 0.026785714097145252,
"completions/clipped_ratio": -2.8125,
"epoch": 3.951289398280802,
"grad_norm": 31.72833119113645,
"learning_rate": 1.701520460235084e-09,
"loss": -0.2415,
"num_tokens": 105741207.0,
"residual_var": 0.024274565279483795,
"reward": 0.8203125,
"reward_std": 0.05326685309410095,
"rewards/drgrpo_math_reward/mean": 0.8203125,
"rewards/drgrpo_math_reward/std": 0.38467901945114136,
"rho2": 0.09374997764825821,
"step": 688
},
{
"advantages/mean": 4.656612873077393e-10,
"advantages/snr": 2.3417865283086908e-09,
"advantages/std": 0.19884872436523438,
"advantages/var": 0.039540815181680955,
"completions/clipped_ratio": -2.890625,
"epoch": 3.95702005730659,
"grad_norm": 36.907782413793186,
"learning_rate": 1.4498770926790749e-09,
"loss": -0.2752,
"num_tokens": 105887946.0,
"residual_var": 0.03089127317070961,
"reward": 0.84375,
"reward_std": 0.08588206768035889,
"rewards/drgrpo_math_reward/mean": 0.84375,
"rewards/drgrpo_math_reward/std": 0.3638034462928772,
"rho2": 0.21874994039535522,
"step": 689
},
{
"advantages/mean": 6.984919309616089e-10,
"advantages/snr": 3.1726896131954143e-09,
"advantages/std": 0.220157653093338,
"advantages/var": 0.048469392215566565,
"completions/clipped_ratio": -2.90625,
"epoch": 3.962750716332378,
"grad_norm": 54.84225952553401,
"learning_rate": 1.2183465376650603e-09,
"loss": -0.6302,
"num_tokens": 106028876.0,
"residual_var": 0.03938138857483864,
"reward": 0.8515625,
"reward_std": 0.09495474398136139,
"rewards/drgrpo_math_reward/mean": 0.8515625,
"rewards/drgrpo_math_reward/std": 0.3562295734882355,
"rho2": 0.1874999701976776,
"step": 690
},
{
"advantages/mean": -4.656612873077393e-10,
"advantages/snr": 3.424081661465097e-09,
"advantages/std": 0.13599595427513123,
"advantages/var": 0.018494899579203583,
"completions/clipped_ratio": -2.875,
"epoch": 3.968481375358166,
"grad_norm": 32.456933445245255,
"learning_rate": 1.0069334586854105e-09,
"loss": -0.1288,
"num_tokens": 106162604.0,
"residual_var": 0.016761010512709618,
"reward": 0.88671875,
"reward_std": 0.0382704995572567,
"rewards/drgrpo_math_reward/mean": 0.88671875,
"rewards/drgrpo_math_reward/std": 0.31755712628364563,
"rho2": 0.09374997764825821,
"step": 691
},
{
"advantages/mean": 1.1641532182693481e-10,
"advantages/snr": 4.942236508931846e-10,
"advantages/std": 0.2355518937110901,
"advantages/var": 0.05548469463088068,
"completions/clipped_ratio": -2.734375,
"epoch": 3.974212034383954,
"grad_norm": 52.73347449536594,
"learning_rate": 8.156421140254055e-10,
"loss": 0.3587,
"num_tokens": 106323078.0,
"residual_var": 0.045081328600645065,
"reward": 0.84765625,
"reward_std": 0.10178709030151367,
"rewards/drgrpo_math_reward/mean": 0.84765625,
"rewards/drgrpo_math_reward/std": 0.3600577116012573,
"rho2": 0.18749995529651642,
"step": 692
},
{
"advantages/mean": -6.984919309616089e-10,
"advantages/snr": 4.967679260771851e-09,
"advantages/std": 0.14060728251934052,
"advantages/var": 0.01977040789747364,
"completions/clipped_ratio": -2.96875,
"epoch": 3.9799426934097424,
"grad_norm": 26.14018984078511,
"learning_rate": 6.44476356678636e-10,
"loss": -0.1711,
"num_tokens": 106443771.0,
"residual_var": 0.01791694387793541,
"reward": 0.94921875,
"reward_std": 0.03998042270541191,
"rewards/drgrpo_math_reward/mean": 0.94921875,
"rewards/drgrpo_math_reward/std": 0.21998079121112823,
"rho2": 0.0937499850988388,
"step": 693
},
{
"advantages/mean": -8.149072527885437e-10,
"advantages/snr": 4.810330410837374e-09,
"advantages/std": 0.16940774023532867,
"advantages/var": 0.028698982451640598,
"completions/clipped_ratio": -2.953125,
"epoch": 3.98567335243553,
"grad_norm": 31.191093269885084,
"learning_rate": 4.934396342683999e-10,
"loss": 0.0238,
"num_tokens": 106579713.0,
"residual_var": 0.02421477437019348,
"reward": 0.91796875,
"reward_std": 0.06207750737667084,
"rewards/drgrpo_math_reward/mean": 0.91796875,
"rewards/drgrpo_math_reward/std": 0.2749498784542084,
"rho2": 0.1562499701976776,
"step": 694
},
{
"advantages/mean": 6.984919309616089e-10,
"advantages/snr": 3.5707444382625662e-09,
"advantages/std": 0.19561520218849182,
"advantages/var": 0.038265307327244535,
"completions/clipped_ratio": -2.90625,
"epoch": 3.9914040114613183,
"grad_norm": 38.53702742178626,
"learning_rate": 3.625349889788687e-10,
"loss": -0.2754,
"num_tokens": 106715542.0,
"residual_var": 0.031090570613741875,
"reward": 0.8984375,
"reward_std": 0.07825091481208801,
"rewards/drgrpo_math_reward/mean": 0.8984375,
"rewards/drgrpo_math_reward/std": 0.3026638329029083,
"rho2": 0.1874999701976776,
"step": 695
},
{
"advantages/mean": 2.3283064365386963e-10,
"advantages/snr": 1.043916747715097e-09,
"advantages/std": 0.22303563356399536,
"advantages/var": 0.049744893839292814,
"completions/clipped_ratio": -2.953125,
"epoch": 3.997134670487106,
"grad_norm": 49.227715319987915,
"learning_rate": 2.517650574934693e-10,
"loss": -0.3185,
"num_tokens": 106860947.0,
"residual_var": 0.03886320814490318,
"reward": 0.890625,
"reward_std": 0.10258589684963226,
"rewards/drgrpo_math_reward/mean": 0.890625,
"rewards/drgrpo_math_reward/std": 0.31272050738334656,
"rho2": 0.2187499701976776,
"step": 696
},
{
"epoch": 3.997134670487106,
"step": 696,
"total_flos": 0.0,
"train_loss": -0.9753264665694629,
"train_runtime": 28353.7016,
"train_samples_per_second": 0.787,
"train_steps_per_second": 0.025
}
],
"logging_steps": 1,
"max_steps": 700,
"num_input_tokens_seen": 106860947,
"num_train_epochs": 4,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 16,
"trial_name": null,
"trial_params": null
}