math_model / trainer_state.json
jdecim's picture
Push exp8 GRPO best (step 750), gen temp=0.7 for pass@8
ab2047d verified
raw
history blame
291 kB
{
"best_global_step": 750,
"best_metric": 1.6413334274291993,
"best_model_checkpoint": "/scratch/checkpoints/exp8_grpo_fast/checkpoint-750",
"epoch": 0.273224043715847,
"eval_steps": 150,
"global_step": 750,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.13333333432674407,
"completions/max_length": 1388.2,
"completions/max_terminated_length": 1149.4,
"completions/mean_length": 961.8666870117188,
"completions/mean_terminated_length": 813.0800170898438,
"completions/min_length": 593.6,
"completions/min_terminated_length": 593.6,
"entropy": 0.15543196325500805,
"epoch": 0.0018214936247723133,
"frac_reward_zero_std": 0.4,
"grad_norm": 0.04320038482546806,
"kl": 0.0001638038011151366,
"learning_rate": 1.45985401459854e-08,
"loss": -0.002662058547139168,
"num_tokens": 30920.0,
"reward": 2.02333345413208,
"reward_std": 0.5894708633422852,
"rewards/boxed_rate/mean": 0.8666666626930237,
"rewards/boxed_rate/std": 0.19119417667388916,
"rewards/correctness/mean": 0.46666666567325593,
"rewards/correctness/std": 0.2665788769721985,
"rewards/multi_component_reward/mean": 0.5233333643525839,
"rewards/multi_component_reward/std": 0.4098228693008423,
"rewards/no_answer_rate/mean": 0.13333333432674407,
"rewards/no_answer_rate/std": 0.19119417667388916,
"rewards/repetition_rate/mean": 0.03333333432674408,
"rewards/repetition_rate/std": 0.08164966106414795,
"sampling/importance_sampling_ratio/max": 0.12532416619360448,
"sampling/importance_sampling_ratio/mean": 0.04354772977530956,
"sampling/importance_sampling_ratio/min": 0.006865633289925732,
"sampling/sampling_logp_difference/max": 0.305552613735199,
"sampling/sampling_logp_difference/mean": 0.010744557529687882,
"step": 5,
"step_time": 32.99731477890164
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.10000000298023223,
"completions/max_length": 1657.0,
"completions/max_terminated_length": 1145.8,
"completions/mean_length": 919.3000366210938,
"completions/mean_terminated_length": 807.0200317382812,
"completions/min_length": 442.0,
"completions/min_terminated_length": 442.0,
"entropy": 0.21022272085150082,
"epoch": 0.0036429872495446266,
"frac_reward_zero_std": 0.4,
"grad_norm": 0.014283710159361362,
"kl": 0.00032655518104244646,
"learning_rate": 3.2846715328467156e-08,
"loss": -0.0015700791031122209,
"num_tokens": 61163.0,
"reward": 1.180000114440918,
"reward_std": 0.4240097999572754,
"rewards/boxed_rate/mean": 0.899999988079071,
"rewards/boxed_rate/std": 0.24494898319244385,
"rewards/correctness/mean": 0.06666666865348816,
"rewards/correctness/std": 0.1632993221282959,
"rewards/multi_component_reward/mean": 0.11333334147930145,
"rewards/multi_component_reward/std": 0.27013830840587616,
"rewards/no_answer_rate/mean": 0.10000000298023223,
"rewards/no_answer_rate/std": 0.24494898319244385,
"rewards/repetition_rate/mean": 0.0,
"rewards/repetition_rate/std": 0.0,
"sampling/importance_sampling_ratio/max": 0.06362589932978154,
"sampling/importance_sampling_ratio/mean": 0.01664090696722269,
"sampling/importance_sampling_ratio/min": 0.0005761486502072345,
"sampling/sampling_logp_difference/max": 0.49286861419677735,
"sampling/sampling_logp_difference/mean": 0.01417195163667202,
"step": 10,
"step_time": 37.58825172036886
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.20000000298023224,
"completions/max_length": 1657.4,
"completions/max_terminated_length": 899.0,
"completions/mean_length": 897.2000244140625,
"completions/mean_terminated_length": 615.4466735839844,
"completions/min_length": 296.4,
"completions/min_terminated_length": 296.4,
"entropy": 0.20501985649267832,
"epoch": 0.00546448087431694,
"frac_reward_zero_std": 0.2,
"grad_norm": 0.0192733071744442,
"kl": 0.0003123952769480335,
"learning_rate": 5.10948905109489e-08,
"loss": -0.004246947914361953,
"num_tokens": 93923.0,
"reward": 1.8200001120567322,
"reward_std": 0.5815625011920929,
"rewards/boxed_rate/mean": 0.8,
"rewards/boxed_rate/std": 0.29447373151779177,
"rewards/correctness/mean": 0.4000000059604645,
"rewards/correctness/std": 0.20655910968780516,
"rewards/multi_component_reward/mean": 0.42000001221895217,
"rewards/multi_component_reward/std": 0.3785122692584991,
"rewards/no_answer_rate/mean": 0.20000000298023224,
"rewards/no_answer_rate/std": 0.29447373151779177,
"rewards/repetition_rate/mean": 0.0,
"rewards/repetition_rate/std": 0.0,
"sampling/importance_sampling_ratio/max": 0.18622441291809083,
"sampling/importance_sampling_ratio/mean": 0.04659921806305647,
"sampling/importance_sampling_ratio/min": 0.0010122977004923522,
"sampling/sampling_logp_difference/max": 0.39253207445144656,
"sampling/sampling_logp_difference/mean": 0.01371441949158907,
"step": 15,
"step_time": 38.31680890209973
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.1666666716337204,
"completions/max_length": 2023.8,
"completions/max_terminated_length": 1348.6,
"completions/mean_length": 1195.0000610351562,
"completions/mean_terminated_length": 1013.5533630371094,
"completions/min_length": 736.4,
"completions/min_terminated_length": 736.4,
"entropy": 0.15656836417814096,
"epoch": 0.007285974499089253,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.02143455669283867,
"kl": 0.00023284607256452244,
"learning_rate": 6.934306569343065e-08,
"loss": -0.006335017085075378,
"num_tokens": 131981.0,
"reward": 1.3033334732055664,
"reward_std": 0.7852452993392944,
"rewards/boxed_rate/mean": 0.8333333253860473,
"rewards/boxed_rate/std": 0.34822853803634646,
"rewards/correctness/mean": 0.13333333730697633,
"rewards/correctness/std": 0.3265986442565918,
"rewards/multi_component_reward/mean": 0.1366666793823242,
"rewards/multi_component_reward/std": 0.5012870132923126,
"rewards/no_answer_rate/mean": 0.1666666716337204,
"rewards/no_answer_rate/std": 0.34822853803634646,
"rewards/repetition_rate/mean": 0.03333333432674408,
"rewards/repetition_rate/std": 0.08164966106414795,
"sampling/importance_sampling_ratio/max": 0.05830998048186302,
"sampling/importance_sampling_ratio/mean": 0.016274437960237264,
"sampling/importance_sampling_ratio/min": 0.00017535050738298644,
"sampling/sampling_logp_difference/max": 0.3216116189956665,
"sampling/sampling_logp_difference/mean": 0.011110224667936564,
"step": 20,
"step_time": 45.75825558342039
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.13333333730697633,
"completions/max_length": 1375.4,
"completions/max_terminated_length": 1177.4,
"completions/mean_length": 1042.1333740234375,
"completions/mean_terminated_length": 911.9000244140625,
"completions/min_length": 680.6,
"completions/min_terminated_length": 680.6,
"entropy": 0.1741524614393711,
"epoch": 0.009107468123861567,
"frac_reward_zero_std": 0.2,
"grad_norm": 0.01335141435265541,
"kl": 0.0002629145164974034,
"learning_rate": 8.759124087591241e-08,
"loss": 0.0026412704959511758,
"num_tokens": 166221.0,
"reward": 1.6033333897590638,
"reward_std": 0.5469928443431854,
"rewards/boxed_rate/mean": 0.8666666746139526,
"rewards/boxed_rate/std": 0.20655910968780516,
"rewards/correctness/mean": 0.26666667461395266,
"rewards/correctness/std": 0.20655910968780516,
"rewards/multi_component_reward/mean": 0.3033333241939545,
"rewards/multi_component_reward/std": 0.36893237233161924,
"rewards/no_answer_rate/mean": 0.13333333730697633,
"rewards/no_answer_rate/std": 0.20655910968780516,
"rewards/repetition_rate/mean": 0.03333333432674408,
"rewards/repetition_rate/std": 0.08164966106414795,
"sampling/importance_sampling_ratio/max": 0.04747706279158592,
"sampling/importance_sampling_ratio/mean": 0.016832177247852086,
"sampling/importance_sampling_ratio/min": 0.00119685190729814,
"sampling/sampling_logp_difference/max": 0.3412928104400635,
"sampling/sampling_logp_difference/mean": 0.011194901075214148,
"step": 25,
"step_time": 33.345185896754266
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.2666666716337204,
"completions/max_length": 1851.2,
"completions/max_terminated_length": 1397.6,
"completions/mean_length": 1248.2000366210937,
"completions/mean_terminated_length": 1000.1933471679688,
"completions/min_length": 643.4,
"completions/min_terminated_length": 643.4,
"entropy": 0.1906235640247663,
"epoch": 0.01092896174863388,
"frac_reward_zero_std": 0.2,
"grad_norm": 0.02501230686903,
"kl": 0.0002681747629443028,
"learning_rate": 1.0583941605839415e-07,
"loss": -0.0029103375971317293,
"num_tokens": 207351.0,
"reward": 1.3366667747497558,
"reward_std": 0.7248244881629944,
"rewards/boxed_rate/mean": 0.7333333313465118,
"rewards/boxed_rate/std": 0.29447373151779177,
"rewards/correctness/mean": 0.1666666716337204,
"rewards/correctness/std": 0.2882087707519531,
"rewards/multi_component_reward/mean": 0.10333333685994148,
"rewards/multi_component_reward/std": 0.48569379448890687,
"rewards/no_answer_rate/mean": 0.2666666716337204,
"rewards/no_answer_rate/std": 0.29447373151779177,
"rewards/repetition_rate/mean": 0.06666666865348816,
"rewards/repetition_rate/std": 0.10327955484390258,
"sampling/importance_sampling_ratio/max": 0.05718064345419407,
"sampling/importance_sampling_ratio/mean": 0.01394880290608853,
"sampling/importance_sampling_ratio/min": 3.71510857371279e-06,
"sampling/sampling_logp_difference/max": 0.39807581901550293,
"sampling/sampling_logp_difference/mean": 0.011931476183235645,
"step": 30,
"step_time": 42.76963838078082
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.06666666865348816,
"completions/max_length": 1265.8,
"completions/max_terminated_length": 1010.8,
"completions/mean_length": 801.2000183105469,
"completions/mean_terminated_length": 710.2333435058594,
"completions/min_length": 515.6,
"completions/min_terminated_length": 515.6,
"entropy": 0.18569878712296486,
"epoch": 0.012750455373406194,
"frac_reward_zero_std": 0.4,
"grad_norm": 0.013473724946379662,
"kl": 0.00025912571412239536,
"learning_rate": 1.240875912408759e-07,
"loss": -0.007362464070320129,
"num_tokens": 234243.0,
"reward": 1.5500000715255737,
"reward_std": 0.6336740732192994,
"rewards/boxed_rate/mean": 0.9333333373069763,
"rewards/boxed_rate/std": 0.10327955484390258,
"rewards/correctness/mean": 0.23333334028720856,
"rewards/correctness/std": 0.2882087707519531,
"rewards/multi_component_reward/mean": 0.3166666716337204,
"rewards/multi_component_reward/std": 0.35141916275024415,
"rewards/no_answer_rate/mean": 0.06666666865348816,
"rewards/no_answer_rate/std": 0.10327955484390258,
"rewards/repetition_rate/mean": 0.0,
"rewards/repetition_rate/std": 0.0,
"sampling/importance_sampling_ratio/max": 0.10159578723832965,
"sampling/importance_sampling_ratio/mean": 0.03656362611800432,
"sampling/importance_sampling_ratio/min": 0.002998881617435645,
"sampling/sampling_logp_difference/max": 0.3329684495925903,
"sampling/sampling_logp_difference/mean": 0.011683504190295934,
"step": 35,
"step_time": 29.749523213878273
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.23333334028720856,
"completions/max_length": 1788.4,
"completions/max_terminated_length": 1253.4,
"completions/mean_length": 1151.96669921875,
"completions/mean_terminated_length": 852.2566772460938,
"completions/min_length": 632.2,
"completions/min_terminated_length": 632.2,
"entropy": 0.19733075598875682,
"epoch": 0.014571948998178506,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.017843080684542656,
"kl": 0.00030036380388385927,
"learning_rate": 1.4233576642335764e-07,
"loss": 0.0052158843725919725,
"num_tokens": 271334.0,
"reward": 1.3800000905990601,
"reward_std": 0.8547543048858642,
"rewards/boxed_rate/mean": 0.7666666686534882,
"rewards/boxed_rate/std": 0.2128240704536438,
"rewards/correctness/mean": 0.20000000298023224,
"rewards/correctness/std": 0.35449349880218506,
"rewards/multi_component_reward/mean": 0.1800000086426735,
"rewards/multi_component_reward/std": 0.5035018444061279,
"rewards/no_answer_rate/mean": 0.2333333373069763,
"rewards/no_answer_rate/std": 0.2128240704536438,
"rewards/repetition_rate/mean": 0.0,
"rewards/repetition_rate/std": 0.0,
"sampling/importance_sampling_ratio/max": 0.05587816089391708,
"sampling/importance_sampling_ratio/mean": 0.020892591564916074,
"sampling/importance_sampling_ratio/min": 2.3939082747448788e-05,
"sampling/sampling_logp_difference/max": 0.42613508701324465,
"sampling/sampling_logp_difference/mean": 0.012935483455657959,
"step": 40,
"step_time": 40.51654889807105
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.06666666865348816,
"completions/max_length": 1250.0,
"completions/max_terminated_length": 700.2,
"completions/mean_length": 596.7666748046875,
"completions/mean_terminated_length": 492.7933410644531,
"completions/min_length": 341.8,
"completions/min_terminated_length": 341.8,
"entropy": 0.22809130648771922,
"epoch": 0.01639344262295082,
"frac_reward_zero_std": 0.4,
"grad_norm": 0.06362088024616241,
"kl": 0.00031622202028908455,
"learning_rate": 1.6058394160583942e-07,
"loss": -0.0004649309907108545,
"num_tokens": 293263.0,
"reward": 1.6900001049041748,
"reward_std": 0.6551559090614318,
"rewards/boxed_rate/mean": 0.9333333253860474,
"rewards/boxed_rate/std": 0.1632993221282959,
"rewards/correctness/mean": 0.3000000089406967,
"rewards/correctness/std": 0.2882087707519531,
"rewards/multi_component_reward/mean": 0.3900000035762787,
"rewards/multi_component_reward/std": 0.37366979122161864,
"rewards/no_answer_rate/mean": 0.06666666865348816,
"rewards/no_answer_rate/std": 0.1632993221282959,
"rewards/repetition_rate/mean": 0.0,
"rewards/repetition_rate/std": 0.0,
"sampling/importance_sampling_ratio/max": 0.06921428106725216,
"sampling/importance_sampling_ratio/mean": 0.029317377135157585,
"sampling/importance_sampling_ratio/min": 0.00602934339824146,
"sampling/sampling_logp_difference/max": 0.35652687549591067,
"sampling/sampling_logp_difference/mean": 0.015356982313096523,
"step": 45,
"step_time": 28.76009795963764
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.10000000298023223,
"completions/max_length": 1886.2,
"completions/max_terminated_length": 1599.0,
"completions/mean_length": 1163.9333862304688,
"completions/mean_terminated_length": 1057.6466918945312,
"completions/min_length": 663.8,
"completions/min_terminated_length": 663.8,
"entropy": 0.17236283471186956,
"epoch": 0.018214936247723135,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.01997266709804535,
"kl": 0.0002550489671799975,
"learning_rate": 1.7883211678832117e-07,
"loss": -0.005824955180287361,
"num_tokens": 331739.0,
"reward": 1.8566666960716247,
"reward_std": 0.7818804442882538,
"rewards/boxed_rate/mean": 0.9,
"rewards/boxed_rate/std": 0.18492921590805053,
"rewards/correctness/mean": 0.36666667461395264,
"rewards/correctness/std": 0.3161036252975464,
"rewards/multi_component_reward/mean": 0.4233333319425583,
"rewards/multi_component_reward/std": 0.4823453426361084,
"rewards/no_answer_rate/mean": 0.10000000298023223,
"rewards/no_answer_rate/std": 0.18492921590805053,
"rewards/repetition_rate/mean": 0.06666666865348816,
"rewards/repetition_rate/std": 0.10327955484390258,
"sampling/importance_sampling_ratio/max": 0.054672015644609925,
"sampling/importance_sampling_ratio/mean": 0.01720548253506422,
"sampling/importance_sampling_ratio/min": 0.00035658250299805546,
"sampling/sampling_logp_difference/max": 0.3509830951690674,
"sampling/sampling_logp_difference/mean": 0.011117835808545352,
"step": 50,
"step_time": 43.21503445059061
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.3333333432674408,
"completions/max_length": 1744.8,
"completions/max_terminated_length": 1236.2,
"completions/mean_length": 1297.1666931152345,
"completions/mean_terminated_length": 1065.0066833496094,
"completions/min_length": 936.6,
"completions/min_terminated_length": 936.6,
"entropy": 0.12224519066512585,
"epoch": 0.020036429872495445,
"frac_reward_zero_std": 0.2,
"grad_norm": 0.021082526072859764,
"kl": 0.00019305126891898302,
"learning_rate": 1.9708029197080292e-07,
"loss": 0.0025430308654904366,
"num_tokens": 373144.0,
"reward": 1.4600000858306885,
"reward_std": 0.7388215482234954,
"rewards/boxed_rate/mean": 0.6666666597127915,
"rewards/boxed_rate/std": 0.35449349880218506,
"rewards/correctness/mean": 0.23333334028720856,
"rewards/correctness/std": 0.2882087707519531,
"rewards/multi_component_reward/mean": 0.12666668593883515,
"rewards/multi_component_reward/std": 0.506528252363205,
"rewards/no_answer_rate/mean": 0.33333333134651183,
"rewards/no_answer_rate/std": 0.35449349880218506,
"rewards/repetition_rate/mean": 0.10000000298023223,
"rewards/repetition_rate/std": 0.18492921590805053,
"sampling/importance_sampling_ratio/max": 0.10442680462729186,
"sampling/importance_sampling_ratio/mean": 0.04177726461493876,
"sampling/importance_sampling_ratio/min": 0.011532440476071316,
"sampling/sampling_logp_difference/max": 0.32466731071472166,
"sampling/sampling_logp_difference/mean": 0.007843528036028146,
"step": 55,
"step_time": 40.733187234401704
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.13333333432674407,
"completions/max_length": 1700.0,
"completions/max_terminated_length": 1413.4,
"completions/mean_length": 923.9000366210937,
"completions/mean_terminated_length": 779.9666870117187,
"completions/min_length": 489.0,
"completions/min_terminated_length": 489.0,
"entropy": 0.18838655066986879,
"epoch": 0.02185792349726776,
"frac_reward_zero_std": 0.4,
"grad_norm": 0.016334548592567444,
"kl": 0.0002971158295016115,
"learning_rate": 2.1532846715328465e-07,
"loss": 0.0031619951128959655,
"num_tokens": 403573.0,
"reward": 1.393333411216736,
"reward_std": 0.45263335704803465,
"rewards/boxed_rate/mean": 0.8666666626930237,
"rewards/boxed_rate/std": 0.19119417667388916,
"rewards/correctness/mean": 0.1666666716337204,
"rewards/correctness/std": 0.18492921590805053,
"rewards/multi_component_reward/mean": 0.19333333075046538,
"rewards/multi_component_reward/std": 0.30624626874923705,
"rewards/no_answer_rate/mean": 0.13333333432674407,
"rewards/no_answer_rate/std": 0.19119417667388916,
"rewards/repetition_rate/mean": 0.03333333432674408,
"rewards/repetition_rate/std": 0.08164966106414795,
"sampling/importance_sampling_ratio/max": 0.14395143389701842,
"sampling/importance_sampling_ratio/mean": 0.04990396222565323,
"sampling/importance_sampling_ratio/min": 0.00032090346864226403,
"sampling/sampling_logp_difference/max": 0.40355303287506106,
"sampling/sampling_logp_difference/mean": 0.012135114334523679,
"step": 60,
"step_time": 38.38553868718445
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.16666666865348817,
"completions/max_length": 1554.4,
"completions/max_terminated_length": 1147.6,
"completions/mean_length": 972.8000244140625,
"completions/mean_terminated_length": 804.5466735839843,
"completions/min_length": 498.2,
"completions/min_terminated_length": 498.2,
"entropy": 0.1604183206955592,
"epoch": 0.023679417122040074,
"frac_reward_zero_std": 0.2,
"grad_norm": 0.00623254245147109,
"kl": 0.00027490937888311845,
"learning_rate": 2.335766423357664e-07,
"loss": -0.005589158460497856,
"num_tokens": 436225.0,
"reward": 1.490000057220459,
"reward_std": 0.6202400684356689,
"rewards/boxed_rate/mean": 0.8333333253860473,
"rewards/boxed_rate/std": 0.2728438377380371,
"rewards/correctness/mean": 0.20000000596046447,
"rewards/correctness/std": 0.2665788769721985,
"rewards/multi_component_reward/mean": 0.19000000208616258,
"rewards/multi_component_reward/std": 0.4335045665502548,
"rewards/no_answer_rate/mean": 0.16666666865348817,
"rewards/no_answer_rate/std": 0.2728438377380371,
"rewards/repetition_rate/mean": 0.1,
"rewards/repetition_rate/std": 0.10954451560974121,
"sampling/importance_sampling_ratio/max": 0.0990872667171061,
"sampling/importance_sampling_ratio/mean": 0.04436926119960845,
"sampling/importance_sampling_ratio/min": 0.004196830893285148,
"sampling/sampling_logp_difference/max": 0.297141432762146,
"sampling/sampling_logp_difference/mean": 0.010508796572685242,
"step": 65,
"step_time": 36.07466428950429
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.10000000298023223,
"completions/max_length": 1634.0,
"completions/max_terminated_length": 994.8,
"completions/mean_length": 744.2666870117188,
"completions/mean_terminated_length": 601.7000122070312,
"completions/min_length": 374.2,
"completions/min_terminated_length": 374.2,
"entropy": 0.2318819284439087,
"epoch": 0.025500910746812388,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.05441533029079437,
"kl": 0.0003520799073157832,
"learning_rate": 2.518248175182482e-07,
"loss": -0.003310435265302658,
"num_tokens": 460845.0,
"reward": 1.6933334827423097,
"reward_std": 0.7755845502018929,
"rewards/boxed_rate/mean": 0.899999988079071,
"rewards/boxed_rate/std": 0.24494898319244385,
"rewards/correctness/mean": 0.3,
"rewards/correctness/std": 0.34822853803634646,
"rewards/multi_component_reward/mean": 0.36000003293156624,
"rewards/multi_component_reward/std": 0.5008985161781311,
"rewards/no_answer_rate/mean": 0.10000000298023223,
"rewards/no_answer_rate/std": 0.24494898319244385,
"rewards/repetition_rate/mean": 0.03333333432674408,
"rewards/repetition_rate/std": 0.08164966106414795,
"sampling/importance_sampling_ratio/max": 0.056702688336372375,
"sampling/importance_sampling_ratio/mean": 0.019475685060024263,
"sampling/importance_sampling_ratio/min": 0.001143362923176032,
"sampling/sampling_logp_difference/max": 0.3391279220581055,
"sampling/sampling_logp_difference/mean": 0.014528140239417554,
"step": 70,
"step_time": 36.275424292869864
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.20000000298023224,
"completions/max_length": 1515.2,
"completions/max_terminated_length": 845.0,
"completions/mean_length": 917.0000244140625,
"completions/mean_terminated_length": 678.5633483886719,
"completions/min_length": 475.4,
"completions/min_terminated_length": 475.4,
"entropy": 0.16173700019717216,
"epoch": 0.0273224043715847,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.018244028091430664,
"kl": 0.00024622106284368785,
"learning_rate": 2.700729927007299e-07,
"loss": -0.014353486895561218,
"num_tokens": 491205.0,
"reward": 1.633333432674408,
"reward_std": 0.8448223888874054,
"rewards/boxed_rate/mean": 0.8,
"rewards/boxed_rate/std": 0.29447373151779177,
"rewards/correctness/mean": 0.3,
"rewards/correctness/std": 0.34822853803634646,
"rewards/multi_component_reward/mean": 0.3000000298023224,
"rewards/multi_component_reward/std": 0.5207933127880097,
"rewards/no_answer_rate/mean": 0.20000000298023224,
"rewards/no_answer_rate/std": 0.29447373151779177,
"rewards/repetition_rate/mean": 0.03333333432674408,
"rewards/repetition_rate/std": 0.08164966106414795,
"sampling/importance_sampling_ratio/max": 0.16285581663250923,
"sampling/importance_sampling_ratio/mean": 0.05478177797049284,
"sampling/importance_sampling_ratio/min": 0.0013068500558716862,
"sampling/sampling_logp_difference/max": 0.3721138000488281,
"sampling/sampling_logp_difference/mean": 0.011166188679635525,
"step": 75,
"step_time": 35.12463851571083
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03333333432674408,
"completions/max_length": 1582.6,
"completions/max_terminated_length": 1313.0,
"completions/mean_length": 985.9666870117187,
"completions/mean_terminated_length": 935.2533569335938,
"completions/min_length": 670.8,
"completions/min_terminated_length": 670.8,
"entropy": 0.17678433768451213,
"epoch": 0.029143897996357013,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.009146427735686302,
"kl": 0.0002533862963900901,
"learning_rate": 2.8832116788321166e-07,
"loss": 0.002100883610546589,
"num_tokens": 524180.0,
"reward": 1.6633334636688233,
"reward_std": 0.8858610212802887,
"rewards/boxed_rate/mean": 0.9666666626930237,
"rewards/boxed_rate/std": 0.08164966106414795,
"rewards/correctness/mean": 0.26666666865348815,
"rewards/correctness/std": 0.3823883533477783,
"rewards/multi_component_reward/mean": 0.36333334222435953,
"rewards/multi_component_reward/std": 0.473619781434536,
"rewards/no_answer_rate/mean": 0.03333333432674408,
"rewards/no_answer_rate/std": 0.08164966106414795,
"rewards/repetition_rate/mean": 0.03333333432674408,
"rewards/repetition_rate/std": 0.08164966106414795,
"sampling/importance_sampling_ratio/max": 0.044011454284191134,
"sampling/importance_sampling_ratio/mean": 0.013894069381058216,
"sampling/importance_sampling_ratio/min": 0.001124254113574473,
"sampling/sampling_logp_difference/max": 0.32026147842407227,
"sampling/sampling_logp_difference/mean": 0.01106615299358964,
"step": 80,
"step_time": 36.22448882814497
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.10000000298023223,
"completions/max_length": 1693.6,
"completions/max_terminated_length": 1402.8,
"completions/mean_length": 1091.4666870117187,
"completions/mean_terminated_length": 998.7600219726562,
"completions/min_length": 584.2,
"completions/min_terminated_length": 584.2,
"entropy": 0.1462914695342382,
"epoch": 0.030965391621129327,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.018090059980750084,
"kl": 0.00022486963668294873,
"learning_rate": 3.065693430656934e-07,
"loss": -0.00974438264966011,
"num_tokens": 560254.0,
"reward": 1.8100001335144043,
"reward_std": 1.0823543906211852,
"rewards/boxed_rate/mean": 0.899999988079071,
"rewards/boxed_rate/std": 0.24494898319244385,
"rewards/correctness/mean": 0.36666667461395264,
"rewards/correctness/std": 0.4794029474258423,
"rewards/multi_component_reward/mean": 0.44333334267139435,
"rewards/multi_component_reward/std": 0.6124810755252839,
"rewards/no_answer_rate/mean": 0.10000000298023223,
"rewards/no_answer_rate/std": 0.24494898319244385,
"rewards/repetition_rate/mean": 0.0,
"rewards/repetition_rate/std": 0.0,
"sampling/importance_sampling_ratio/max": 0.1276155561208725,
"sampling/importance_sampling_ratio/mean": 0.03584536015987396,
"sampling/importance_sampling_ratio/min": 0.002319600686418858,
"sampling/sampling_logp_difference/max": 0.41160542964935304,
"sampling/sampling_logp_difference/mean": 0.010097185987979174,
"step": 85,
"step_time": 39.475563449971375
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.3333333432674408,
"completions/max_length": 1841.2,
"completions/max_terminated_length": 875.0,
"completions/mean_length": 1059.2000122070312,
"completions/mean_terminated_length": 630.0533386230469,
"completions/min_length": 458.8,
"completions/min_terminated_length": 458.8,
"entropy": 0.2378736046453317,
"epoch": 0.03278688524590164,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.00910874642431736,
"kl": 0.0003503820747331095,
"learning_rate": 3.2481751824817516e-07,
"loss": 0.0034383241087198257,
"num_tokens": 593668.0,
"reward": 1.4166667222976685,
"reward_std": 0.8338384032249451,
"rewards/boxed_rate/mean": 0.6333333253860474,
"rewards/boxed_rate/std": 0.451508092880249,
"rewards/correctness/mean": 0.23333334028720856,
"rewards/correctness/std": 0.2882087707519531,
"rewards/multi_component_reward/mean": 0.11666668206453323,
"rewards/multi_component_reward/std": 0.566602936387062,
"rewards/no_answer_rate/mean": 0.36666667759418486,
"rewards/no_answer_rate/std": 0.451508092880249,
"rewards/repetition_rate/mean": 0.06666666865348816,
"rewards/repetition_rate/std": 0.10327955484390258,
"sampling/importance_sampling_ratio/max": 0.05201358497142792,
"sampling/importance_sampling_ratio/mean": 0.017090958543121815,
"sampling/importance_sampling_ratio/min": 0.0002041866974779629,
"sampling/sampling_logp_difference/max": 0.3399634838104248,
"sampling/sampling_logp_difference/mean": 0.015104132890701293,
"step": 90,
"step_time": 41.00809515919536
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.06666666865348816,
"completions/max_length": 1441.6,
"completions/max_terminated_length": 960.8,
"completions/mean_length": 802.6000366210938,
"completions/mean_terminated_length": 711.3800231933594,
"completions/min_length": 525.6,
"completions/min_terminated_length": 525.6,
"entropy": 0.2225553606947263,
"epoch": 0.03460837887067395,
"frac_reward_zero_std": 0.4,
"grad_norm": 0.01798272132873535,
"kl": 0.00032260040970868433,
"learning_rate": 3.4306569343065697e-07,
"loss": 0.007113450020551681,
"num_tokens": 621316.0,
"reward": 1.223333477973938,
"reward_std": 0.36713925525546076,
"rewards/boxed_rate/mean": 0.9333333253860474,
"rewards/boxed_rate/std": 0.1632993221282959,
"rewards/correctness/mean": 0.06666666865348816,
"rewards/correctness/std": 0.1632993221282959,
"rewards/multi_component_reward/mean": 0.12333333715796471,
"rewards/multi_component_reward/std": 0.2738735854625702,
"rewards/no_answer_rate/mean": 0.06666666865348816,
"rewards/no_answer_rate/std": 0.1632993221282959,
"rewards/repetition_rate/mean": 0.03333333432674408,
"rewards/repetition_rate/std": 0.08164966106414795,
"sampling/importance_sampling_ratio/max": 0.15052475407719612,
"sampling/importance_sampling_ratio/mean": 0.046577111538499597,
"sampling/importance_sampling_ratio/min": 0.012816431716431823,
"sampling/sampling_logp_difference/max": 0.34408046007156373,
"sampling/sampling_logp_difference/mean": 0.014901423826813698,
"step": 95,
"step_time": 32.91087112892419
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.10000000298023223,
"completions/max_length": 1537.8,
"completions/max_terminated_length": 1380.2,
"completions/mean_length": 972.9000366210937,
"completions/mean_terminated_length": 902.8400146484375,
"completions/min_length": 618.0,
"completions/min_terminated_length": 618.0,
"entropy": 0.2104227361579736,
"epoch": 0.03642987249544627,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.006778502371162176,
"kl": 0.00031352789907638606,
"learning_rate": 3.6131386861313867e-07,
"loss": -0.0013602237217128278,
"num_tokens": 653443.0,
"reward": 1.880000078678131,
"reward_std": 0.9815195143222809,
"rewards/boxed_rate/mean": 0.9,
"rewards/boxed_rate/std": 0.18492921590805053,
"rewards/correctness/mean": 0.4000000059604645,
"rewards/correctness/std": 0.4256481409072876,
"rewards/multi_component_reward/mean": 0.48000000715255736,
"rewards/multi_component_reward/std": 0.5586783826351166,
"rewards/no_answer_rate/mean": 0.10000000298023223,
"rewards/no_answer_rate/std": 0.18492921590805053,
"rewards/repetition_rate/mean": 0.0,
"rewards/repetition_rate/std": 0.0,
"sampling/importance_sampling_ratio/max": 0.0448111507575959,
"sampling/importance_sampling_ratio/mean": 0.01701541549991816,
"sampling/importance_sampling_ratio/min": 0.0003750977984182488,
"sampling/sampling_logp_difference/max": 0.3470792770385742,
"sampling/sampling_logp_difference/mean": 0.013565080799162388,
"step": 100,
"step_time": 35.31437961217016
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.1666666716337204,
"completions/max_length": 1421.4,
"completions/max_terminated_length": 1072.0,
"completions/mean_length": 947.7000366210938,
"completions/mean_terminated_length": 787.706689453125,
"completions/min_length": 551.6,
"completions/min_terminated_length": 551.6,
"entropy": 0.2239283885806799,
"epoch": 0.03825136612021858,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.011714774183928967,
"kl": 0.0003353157648234628,
"learning_rate": 3.795620437956204e-07,
"loss": 0.009437119215726852,
"num_tokens": 683674.0,
"reward": 1.4433334589004516,
"reward_std": 0.8665581226348877,
"rewards/boxed_rate/mean": 0.8333333313465119,
"rewards/boxed_rate/std": 0.18492921590805053,
"rewards/correctness/mean": 0.20000000596046447,
"rewards/correctness/std": 0.36985843181610106,
"rewards/multi_component_reward/mean": 0.21000000834465027,
"rewards/multi_component_reward/std": 0.5039317846298218,
"rewards/no_answer_rate/mean": 0.1666666716337204,
"rewards/no_answer_rate/std": 0.18492921590805053,
"rewards/repetition_rate/mean": 0.03333333432674408,
"rewards/repetition_rate/std": 0.08164966106414795,
"sampling/importance_sampling_ratio/max": 0.1078285550349392,
"sampling/importance_sampling_ratio/mean": 0.03690805228543468,
"sampling/importance_sampling_ratio/min": 0.0026856323238462275,
"sampling/sampling_logp_difference/max": 0.41998746395111086,
"sampling/sampling_logp_difference/mean": 0.014628523960709572,
"step": 105,
"step_time": 32.93989271316677
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.10000000298023223,
"completions/max_length": 1471.4,
"completions/max_terminated_length": 979.4,
"completions/mean_length": 852.2666748046875,
"completions/mean_terminated_length": 721.6166748046875,
"completions/min_length": 488.4,
"completions/min_terminated_length": 488.4,
"entropy": 0.14421232057114441,
"epoch": 0.04007285974499089,
"frac_reward_zero_std": 0.4,
"grad_norm": 0.07266412675380707,
"kl": 0.00022867745647090488,
"learning_rate": 3.978102189781022e-07,
"loss": 0.0018386229872703551,
"num_tokens": 713286.0,
"reward": 1.6733334064483643,
"reward_std": 0.7425484180450439,
"rewards/boxed_rate/mean": 0.8666666626930237,
"rewards/boxed_rate/std": 0.2665788769721985,
"rewards/correctness/mean": 0.3000000059604645,
"rewards/correctness/std": 0.3161036252975464,
"rewards/multi_component_reward/mean": 0.3400000125169754,
"rewards/multi_component_reward/std": 0.47312636375427247,
"rewards/no_answer_rate/mean": 0.13333333730697633,
"rewards/no_answer_rate/std": 0.2665788769721985,
"rewards/repetition_rate/mean": 0.03333333432674408,
"rewards/repetition_rate/std": 0.08164966106414795,
"sampling/importance_sampling_ratio/max": 0.13587955459952356,
"sampling/importance_sampling_ratio/mean": 0.050121305510401726,
"sampling/importance_sampling_ratio/min": 0.0025831204319047172,
"sampling/sampling_logp_difference/max": 0.345587694644928,
"sampling/sampling_logp_difference/mean": 0.00959718506783247,
"step": 110,
"step_time": 33.89908826816827
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.2333333373069763,
"completions/max_length": 1749.4,
"completions/max_terminated_length": 884.2,
"completions/mean_length": 869.533349609375,
"completions/mean_terminated_length": 539.5833374023438,
"completions/min_length": 307.8,
"completions/min_terminated_length": 307.8,
"entropy": 0.2536199669043223,
"epoch": 0.04189435336976321,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.011132585816085339,
"kl": 0.0003656693840942656,
"learning_rate": 4.160583941605839e-07,
"loss": 0.0021231153979897497,
"num_tokens": 741952.0,
"reward": 1.7500001192092896,
"reward_std": 1.106396782398224,
"rewards/boxed_rate/mean": 0.799999988079071,
"rewards/boxed_rate/std": 0.35449349880218506,
"rewards/correctness/mean": 0.36666667461395264,
"rewards/correctness/std": 0.4794029474258423,
"rewards/multi_component_reward/mean": 0.383333346247673,
"rewards/multi_component_reward/std": 0.6433047533035279,
"rewards/no_answer_rate/mean": 0.20000000298023224,
"rewards/no_answer_rate/std": 0.35449349880218506,
"rewards/repetition_rate/mean": 0.0,
"rewards/repetition_rate/std": 0.0,
"sampling/importance_sampling_ratio/max": 0.25683521628379824,
"sampling/importance_sampling_ratio/mean": 0.065315605327487,
"sampling/importance_sampling_ratio/min": 0.00023330498968689221,
"sampling/sampling_logp_difference/max": 0.35823286771774293,
"sampling/sampling_logp_difference/mean": 0.01769500244408846,
"step": 115,
"step_time": 39.532602636888626
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.06666666865348816,
"completions/max_length": 1573.0,
"completions/max_terminated_length": 1324.8,
"completions/mean_length": 1093.2000366210937,
"completions/mean_terminated_length": 1037.0133666992188,
"completions/min_length": 766.8,
"completions/min_terminated_length": 766.8,
"entropy": 0.15541148359576862,
"epoch": 0.04371584699453552,
"frac_reward_zero_std": 0.4,
"grad_norm": 0.01879267208278179,
"kl": 0.00025834220529456314,
"learning_rate": 4.343065693430657e-07,
"loss": 0.0006856221705675125,
"num_tokens": 777766.0,
"reward": 1.433333396911621,
"reward_std": 0.4934782743453979,
"rewards/boxed_rate/mean": 0.9333333253860474,
"rewards/boxed_rate/std": 0.1632993221282959,
"rewards/correctness/mean": 0.16666666865348817,
"rewards/correctness/std": 0.2128240704536438,
"rewards/multi_component_reward/mean": 0.23333334028720856,
"rewards/multi_component_reward/std": 0.32306827008724215,
"rewards/no_answer_rate/mean": 0.06666666865348816,
"rewards/no_answer_rate/std": 0.1632993221282959,
"rewards/repetition_rate/mean": 0.03333333432674408,
"rewards/repetition_rate/std": 0.08164966106414795,
"sampling/importance_sampling_ratio/max": 0.027404608484357596,
"sampling/importance_sampling_ratio/mean": 0.012392069399356841,
"sampling/importance_sampling_ratio/min": 0.0004944111146869545,
"sampling/sampling_logp_difference/max": 0.37873687744140627,
"sampling/sampling_logp_difference/mean": 0.010291843488812447,
"step": 120,
"step_time": 36.481429217197004
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.10000000298023223,
"completions/max_length": 1672.2,
"completions/max_terminated_length": 1231.2,
"completions/mean_length": 1009.0000122070312,
"completions/mean_terminated_length": 875.0133422851562,
"completions/min_length": 573.0,
"completions/min_terminated_length": 573.0,
"entropy": 0.16983376170198122,
"epoch": 0.04553734061930783,
"frac_reward_zero_std": 0.2,
"grad_norm": 0.026680519804358482,
"kl": 0.00028378382169951997,
"learning_rate": 4.5255474452554743e-07,
"loss": 0.002333539165556431,
"num_tokens": 811096.0,
"reward": 1.3900000691413879,
"reward_std": 0.4993088662624359,
"rewards/boxed_rate/mean": 0.9,
"rewards/boxed_rate/std": 0.18492921590805053,
"rewards/correctness/mean": 0.1666666716337204,
"rewards/correctness/std": 0.18492921590805053,
"rewards/multi_component_reward/mean": 0.22333332896232605,
"rewards/multi_component_reward/std": 0.31437968015670775,
"rewards/no_answer_rate/mean": 0.10000000298023223,
"rewards/no_answer_rate/std": 0.18492921590805053,
"rewards/repetition_rate/mean": 0.0,
"rewards/repetition_rate/std": 0.0,
"sampling/importance_sampling_ratio/max": 0.1588123269379139,
"sampling/importance_sampling_ratio/mean": 0.04118972420692444,
"sampling/importance_sampling_ratio/min": 0.002068098068754896,
"sampling/sampling_logp_difference/max": 0.42397408485412597,
"sampling/sampling_logp_difference/mean": 0.01178776090964675,
"step": 125,
"step_time": 38.221346308663485
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.20000000298023224,
"completions/max_length": 1774.8,
"completions/max_terminated_length": 1017.6,
"completions/mean_length": 970.7666870117188,
"completions/mean_terminated_length": 722.2733520507812,
"completions/min_length": 472.0,
"completions/min_terminated_length": 472.0,
"entropy": 0.20842353130380312,
"epoch": 0.04735883424408015,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.017246264964342117,
"kl": 0.00032587918249191714,
"learning_rate": 4.7080291970802913e-07,
"loss": -0.004489587619900703,
"num_tokens": 843681.0,
"reward": 1.3233334302902222,
"reward_std": 0.6886623933911323,
"rewards/boxed_rate/mean": 0.8333333253860473,
"rewards/boxed_rate/std": 0.2728438377380371,
"rewards/correctness/mean": 0.13333333730697633,
"rewards/correctness/std": 0.2665788769721985,
"rewards/multi_component_reward/mean": 0.12333333268761634,
"rewards/multi_component_reward/std": 0.4415378957986832,
"rewards/no_answer_rate/mean": 0.16666666865348817,
"rewards/no_answer_rate/std": 0.2728438377380371,
"rewards/repetition_rate/mean": 0.06666666865348816,
"rewards/repetition_rate/std": 0.1632993221282959,
"sampling/importance_sampling_ratio/max": 0.057003194093704225,
"sampling/importance_sampling_ratio/mean": 0.015649724379181863,
"sampling/importance_sampling_ratio/min": 0.0008366983823620787,
"sampling/sampling_logp_difference/max": 0.42825262546539306,
"sampling/sampling_logp_difference/mean": 0.013275405205786229,
"step": 130,
"step_time": 40.03070425353944
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.13333333730697633,
"completions/max_length": 1638.0,
"completions/max_terminated_length": 1082.6,
"completions/mean_length": 965.3000427246094,
"completions/mean_terminated_length": 805.8833435058593,
"completions/min_length": 475.4,
"completions/min_terminated_length": 475.4,
"entropy": 0.20910785247882208,
"epoch": 0.04918032786885246,
"frac_reward_zero_std": 0.2,
"grad_norm": 0.014268649742007256,
"kl": 0.00030581061485766744,
"learning_rate": 4.89051094890511e-07,
"loss": -0.0031251441687345505,
"num_tokens": 875316.0,
"reward": 1.3700001120567322,
"reward_std": 0.6670311987400055,
"rewards/boxed_rate/mean": 0.8666666626930237,
"rewards/boxed_rate/std": 0.2665788769721985,
"rewards/correctness/mean": 0.16666666865348817,
"rewards/correctness/std": 0.2728438377380371,
"rewards/multi_component_reward/mean": 0.20333334803581238,
"rewards/multi_component_reward/std": 0.4036152184009552,
"rewards/no_answer_rate/mean": 0.13333333730697633,
"rewards/no_answer_rate/std": 0.2665788769721985,
"rewards/repetition_rate/mean": 0.0,
"rewards/repetition_rate/std": 0.0,
"sampling/importance_sampling_ratio/max": 0.051390893710777166,
"sampling/importance_sampling_ratio/mean": 0.015738325094571338,
"sampling/importance_sampling_ratio/min": 0.0017150626291213885,
"sampling/sampling_logp_difference/max": 0.3242307841777802,
"sampling/sampling_logp_difference/mean": 0.01380961835384369,
"step": 135,
"step_time": 37.537090591713785
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.23333334028720856,
"completions/max_length": 1763.0,
"completions/max_terminated_length": 775.6,
"completions/mean_length": 960.7000122070312,
"completions/mean_terminated_length": 604.2866760253906,
"completions/min_length": 455.4,
"completions/min_terminated_length": 455.4,
"entropy": 0.23928763965765634,
"epoch": 0.051001821493624776,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.037585072219371796,
"kl": 0.0004156571065929408,
"learning_rate": 4.996165644171779e-07,
"loss": -0.003277498111128807,
"num_tokens": 907599.0,
"reward": 1.8700001239776611,
"reward_std": 1.213301706314087,
"rewards/boxed_rate/mean": 0.7666666567325592,
"rewards/boxed_rate/std": 0.34822853803634646,
"rewards/correctness/mean": 0.4333333343267441,
"rewards/correctness/std": 0.5198277235031128,
"rewards/multi_component_reward/mean": 0.4366666838526726,
"rewards/multi_component_reward/std": 0.7051358222961426,
"rewards/no_answer_rate/mean": 0.23333334028720856,
"rewards/no_answer_rate/std": 0.34822853803634646,
"rewards/repetition_rate/mean": 0.0,
"rewards/repetition_rate/std": 0.0,
"sampling/importance_sampling_ratio/max": 0.03320965538732708,
"sampling/importance_sampling_ratio/mean": 0.015124964842107147,
"sampling/importance_sampling_ratio/min": 0.0001439221122021143,
"sampling/sampling_logp_difference/max": 0.3350650787353516,
"sampling/sampling_logp_difference/mean": 0.015135842747986317,
"step": 140,
"step_time": 39.69141132887453
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.13333333432674407,
"completions/max_length": 1378.6,
"completions/max_terminated_length": 1035.8,
"completions/mean_length": 898.6333557128906,
"completions/mean_terminated_length": 752.8266784667969,
"completions/min_length": 510.2,
"completions/min_terminated_length": 510.2,
"entropy": 0.15787406551341215,
"epoch": 0.052823315118397086,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.19890737533569336,
"kl": 0.0003336386442242656,
"learning_rate": 4.986579754601227e-07,
"loss": 0.002312604896724224,
"num_tokens": 936976.0,
"reward": 1.6733333706855773,
"reward_std": 0.7778201699256897,
"rewards/boxed_rate/mean": 0.8666666626930237,
"rewards/boxed_rate/std": 0.19119417667388916,
"rewards/correctness/mean": 0.3000000059604645,
"rewards/correctness/std": 0.3161036252975464,
"rewards/multi_component_reward/mean": 0.34000000953674314,
"rewards/multi_component_reward/std": 0.4764533966779709,
"rewards/no_answer_rate/mean": 0.13333333432674407,
"rewards/no_answer_rate/std": 0.19119417667388916,
"rewards/repetition_rate/mean": 0.03333333432674408,
"rewards/repetition_rate/std": 0.08164966106414795,
"sampling/importance_sampling_ratio/max": 0.17277967557311058,
"sampling/importance_sampling_ratio/mean": 0.0497883933596313,
"sampling/importance_sampling_ratio/min": 0.0009090198537052175,
"sampling/sampling_logp_difference/max": 0.5492818593978882,
"sampling/sampling_logp_difference/mean": 0.010555424820631742,
"step": 145,
"step_time": 32.12684296686202
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.13333333730697633,
"completions/max_length": 1774.2,
"completions/max_terminated_length": 1617.8,
"completions/mean_length": 1212.6333618164062,
"completions/mean_terminated_length": 1111.5633544921875,
"completions/min_length": 845.2,
"completions/min_terminated_length": 845.2,
"entropy": 0.17395392432808876,
"epoch": 0.0546448087431694,
"frac_reward_zero_std": 0.4,
"grad_norm": 0.021323509514331818,
"kl": 0.00030080197708836444,
"learning_rate": 4.976993865030675e-07,
"loss": -0.005566118285059929,
"num_tokens": 976577.0,
"reward": 1.4133334517478944,
"reward_std": 0.5511920034885407,
"rewards/boxed_rate/mean": 0.8666666626930237,
"rewards/boxed_rate/std": 0.2665788769721985,
"rewards/correctness/mean": 0.16666666865348817,
"rewards/correctness/std": 0.2128240704536438,
"rewards/multi_component_reward/mean": 0.17999999672174455,
"rewards/multi_component_reward/std": 0.3554683744907379,
"rewards/no_answer_rate/mean": 0.13333333730697633,
"rewards/no_answer_rate/std": 0.2665788769721985,
"rewards/repetition_rate/mean": 0.06666666865348816,
"rewards/repetition_rate/std": 0.10327955484390258,
"sampling/importance_sampling_ratio/max": 0.05498660355806351,
"sampling/importance_sampling_ratio/mean": 0.01640337195713073,
"sampling/importance_sampling_ratio/min": 1.4973960100639972e-05,
"sampling/sampling_logp_difference/max": 0.47727389335632325,
"sampling/sampling_logp_difference/mean": 0.011420094780623913,
"step": 150,
"step_time": 41.61420878618956
},
{
"epoch": 0.0546448087431694,
"eval_clip_ratio/high_max": 0.0,
"eval_clip_ratio/high_mean": 0.0,
"eval_clip_ratio/low_mean": 0.0,
"eval_clip_ratio/low_min": 0.0,
"eval_clip_ratio/region_mean": 0.0,
"eval_completions/clipped_ratio": 0.21666667133569717,
"eval_completions/max_length": 1789.4,
"eval_completions/max_terminated_length": 1216.88,
"eval_completions/mean_length": 1121.4300323486327,
"eval_completions/mean_terminated_length": 886.213013305664,
"eval_completions/min_length": 641.78,
"eval_completions/min_terminated_length": 641.78,
"eval_entropy": 0.17698929071426392,
"eval_frac_reward_zero_std": 0.08,
"eval_kl": 0.00028998344379942864,
"eval_loss": -0.0026245773769915104,
"eval_num_tokens": 976577.0,
"eval_reward": 1.5393334233760834,
"eval_reward_std": 0.7298788775503635,
"eval_rewards/boxed_rate/mean": 0.7833333307504654,
"eval_rewards/boxed_rate/std": 0.32846659898757935,
"eval_rewards/correctness/mean": 0.24666666924953462,
"eval_rewards/correctness/std": 0.27019834876060483,
"eval_rewards/multi_component_reward/mean": 0.21933334972709417,
"eval_rewards/multi_component_reward/std": 0.48081229120492935,
"eval_rewards/no_answer_rate/mean": 0.21666667133569717,
"eval_rewards/no_answer_rate/std": 0.32846659898757935,
"eval_rewards/repetition_rate/mean": 0.07333333462476731,
"eval_rewards/repetition_rate/std": 0.13300593733787536,
"eval_runtime": 1782.1981,
"eval_samples_per_second": 0.028,
"eval_sampling/importance_sampling_ratio/max": 0.07466922109248117,
"eval_sampling/importance_sampling_ratio/mean": 0.026714042400126346,
"eval_sampling/importance_sampling_ratio/min": 0.0011027993811156295,
"eval_sampling/sampling_logp_difference/max": 0.4103265881538391,
"eval_sampling/sampling_logp_difference/mean": 0.011447538509964944,
"eval_steps_per_second": 0.005,
"step": 150
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1207.0,
"completions/max_terminated_length": 1207.0,
"completions/mean_length": 770.1333679199219,
"completions/mean_terminated_length": 770.1333679199219,
"completions/min_length": 500.8,
"completions/min_terminated_length": 500.8,
"entropy": 0.19859646161397299,
"epoch": 0.056466302367941715,
"frac_reward_zero_std": 0.4,
"grad_norm": 0.05886993184685707,
"kl": 0.0003771214178414084,
"learning_rate": 4.967407975460122e-07,
"loss": -0.004334203898906708,
"num_tokens": 1003269.0,
"reward": 1.4733334302902221,
"reward_std": 0.504085260629654,
"rewards/boxed_rate/mean": 1.0,
"rewards/boxed_rate/std": 0.0,
"rewards/correctness/mean": 0.16666666865348817,
"rewards/correctness/std": 0.2128240704536438,
"rewards/multi_component_reward/mean": 0.27333334758877753,
"rewards/multi_component_reward/std": 0.2586013779044151,
"rewards/no_answer_rate/mean": 0.0,
"rewards/no_answer_rate/std": 0.0,
"rewards/repetition_rate/mean": 0.03333333432674408,
"rewards/repetition_rate/std": 0.08164966106414795,
"sampling/importance_sampling_ratio/max": 0.10831067697145044,
"sampling/importance_sampling_ratio/mean": 0.040180114656686784,
"sampling/importance_sampling_ratio/min": 0.0008547768986318261,
"sampling/sampling_logp_difference/max": 0.3474395990371704,
"sampling/sampling_logp_difference/mean": 0.013024460710585117,
"step": 155,
"step_time": 28.411310048028827
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.1666666716337204,
"completions/max_length": 1885.8,
"completions/max_terminated_length": 1037.6,
"completions/mean_length": 888.3666748046875,
"completions/mean_terminated_length": 646.7700012207031,
"completions/min_length": 398.6,
"completions/min_terminated_length": 398.6,
"entropy": 0.21153039361039797,
"epoch": 0.058287795992714025,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.030352359637618065,
"kl": 0.00036930939143834014,
"learning_rate": 4.957822085889571e-07,
"loss": -0.015004897117614746,
"num_tokens": 1032692.0,
"reward": 1.6533334136009217,
"reward_std": 0.7879160098731518,
"rewards/boxed_rate/mean": 0.8333333253860473,
"rewards/boxed_rate/std": 0.34822853803634646,
"rewards/correctness/mean": 0.3000000059604645,
"rewards/correctness/std": 0.3161036252975464,
"rewards/multi_component_reward/mean": 0.3200000114738941,
"rewards/multi_component_reward/std": 0.5426497280597686,
"rewards/no_answer_rate/mean": 0.1666666716337204,
"rewards/no_answer_rate/std": 0.34822853803634646,
"rewards/repetition_rate/mean": 0.03333333432674408,
"rewards/repetition_rate/std": 0.08164966106414795,
"sampling/importance_sampling_ratio/max": 0.15256396904587746,
"sampling/importance_sampling_ratio/mean": 0.05275641949847341,
"sampling/importance_sampling_ratio/min": 0.0003334702798868202,
"sampling/sampling_logp_difference/max": 0.3136852741241455,
"sampling/sampling_logp_difference/mean": 0.014769792556762695,
"step": 160,
"step_time": 41.816410617157814
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.2666666716337204,
"completions/max_length": 1347.0,
"completions/max_terminated_length": 859.2,
"completions/mean_length": 907.3000244140625,
"completions/mean_terminated_length": 642.22001953125,
"completions/min_length": 436.6,
"completions/min_terminated_length": 436.6,
"entropy": 0.2138912024597327,
"epoch": 0.060109289617486336,
"frac_reward_zero_std": 0.2,
"grad_norm": 0.0335945226252079,
"kl": 0.00036641353217419236,
"learning_rate": 4.948236196319019e-07,
"loss": -0.001109707448631525,
"num_tokens": 1062455.0,
"reward": 1.733333444595337,
"reward_std": 0.6965994656085968,
"rewards/boxed_rate/mean": 0.7333333313465118,
"rewards/boxed_rate/std": 0.29447373151779177,
"rewards/correctness/mean": 0.3666666686534882,
"rewards/correctness/std": 0.2728438377380371,
"rewards/multi_component_reward/mean": 0.33333335630595684,
"rewards/multi_component_reward/std": 0.4374739408493042,
"rewards/no_answer_rate/mean": 0.2666666716337204,
"rewards/no_answer_rate/std": 0.29447373151779177,
"rewards/repetition_rate/mean": 0.03333333432674408,
"rewards/repetition_rate/std": 0.08164966106414795,
"sampling/importance_sampling_ratio/max": 0.17867551841773094,
"sampling/importance_sampling_ratio/mean": 0.07929584998637437,
"sampling/importance_sampling_ratio/min": 0.007277285696347713,
"sampling/sampling_logp_difference/max": 0.3139556646347046,
"sampling/sampling_logp_difference/mean": 0.014788956940174102,
"step": 165,
"step_time": 31.849167810566723
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.16666666865348817,
"completions/max_length": 1693.4,
"completions/max_terminated_length": 1086.8,
"completions/mean_length": 985.8000244140625,
"completions/mean_terminated_length": 765.3133666992187,
"completions/min_length": 539.0,
"completions/min_terminated_length": 539.0,
"entropy": 0.22184887006878853,
"epoch": 0.061930783242258654,
"frac_reward_zero_std": 0.2,
"grad_norm": 0.010766901075839996,
"kl": 0.0003837254077855808,
"learning_rate": 4.938650306748465e-07,
"loss": -0.005718140304088593,
"num_tokens": 1094465.0,
"reward": 1.5600001335144043,
"reward_std": 0.6616819977760315,
"rewards/boxed_rate/mean": 0.8333333253860473,
"rewards/boxed_rate/std": 0.2728438377380371,
"rewards/correctness/mean": 0.2666666656732559,
"rewards/correctness/std": 0.2665788769721985,
"rewards/multi_component_reward/mean": 0.2933333650231361,
"rewards/multi_component_reward/std": 0.4033259034156799,
"rewards/no_answer_rate/mean": 0.16666666865348817,
"rewards/no_answer_rate/std": 0.2728438377380371,
"rewards/repetition_rate/mean": 0.0,
"rewards/repetition_rate/std": 0.0,
"sampling/importance_sampling_ratio/max": 0.04150749985128641,
"sampling/importance_sampling_ratio/mean": 0.017618318973109125,
"sampling/importance_sampling_ratio/min": 0.0001386483045517349,
"sampling/sampling_logp_difference/max": 0.41498665809631347,
"sampling/sampling_logp_difference/mean": 0.015049760602414608,
"step": 170,
"step_time": 38.327568624168634
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.33333334028720857,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1280.8,
"completions/mean_length": 1264.76669921875,
"completions/mean_terminated_length": 950.460009765625,
"completions/min_length": 712.6,
"completions/min_terminated_length": 712.6,
"entropy": 0.20655053692559402,
"epoch": 0.06375227686703097,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.1300213485956192,
"kl": 0.00040407002428158496,
"learning_rate": 4.929064417177914e-07,
"loss": -0.002616160549223423,
"num_tokens": 1134790.0,
"reward": 1.2733334183692933,
"reward_std": 0.7226614370942116,
"rewards/boxed_rate/mean": 0.6666666567325592,
"rewards/boxed_rate/std": 0.45777305364608767,
"rewards/correctness/mean": 0.16666666865348817,
"rewards/correctness/std": 0.2728438377380371,
"rewards/multi_component_reward/mean": 0.0733333457261324,
"rewards/multi_component_reward/std": 0.5279393553733825,
"rewards/no_answer_rate/mean": 0.33333334028720857,
"rewards/no_answer_rate/std": 0.45777305364608767,
"rewards/repetition_rate/mean": 0.03333333432674408,
"rewards/repetition_rate/std": 0.08164966106414795,
"sampling/importance_sampling_ratio/max": 0.05303769689053297,
"sampling/importance_sampling_ratio/mean": 0.015054715669248254,
"sampling/importance_sampling_ratio/min": 8.732383148210488e-05,
"sampling/sampling_logp_difference/max": 0.29345667362213135,
"sampling/sampling_logp_difference/mean": 0.013776615634560585,
"step": 175,
"step_time": 46.28360453415662
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.2333333373069763,
"completions/max_length": 1848.8,
"completions/max_terminated_length": 1419.8,
"completions/mean_length": 1218.3667114257812,
"completions/mean_terminated_length": 983.3333618164063,
"completions/min_length": 721.0,
"completions/min_terminated_length": 721.0,
"entropy": 0.14731517334779104,
"epoch": 0.06557377049180328,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.03868393227458,
"kl": 0.00034544099568544574,
"learning_rate": 4.919478527607362e-07,
"loss": -0.007346557080745697,
"num_tokens": 1175133.0,
"reward": 1.7500000834465026,
"reward_std": 0.9921784222126007,
"rewards/boxed_rate/mean": 0.8,
"rewards/boxed_rate/std": 0.29447373151779177,
"rewards/correctness/mean": 0.3666666716337204,
"rewards/correctness/std": 0.404018247127533,
"rewards/multi_component_reward/mean": 0.383333346247673,
"rewards/multi_component_reward/std": 0.5928741514682769,
"rewards/no_answer_rate/mean": 0.20000000298023224,
"rewards/no_answer_rate/std": 0.29447373151779177,
"rewards/repetition_rate/mean": 0.0,
"rewards/repetition_rate/std": 0.0,
"sampling/importance_sampling_ratio/max": 0.08876338973641396,
"sampling/importance_sampling_ratio/mean": 0.02539586406201124,
"sampling/importance_sampling_ratio/min": 0.0011980376816850713,
"sampling/sampling_logp_difference/max": 0.3423057317733765,
"sampling/sampling_logp_difference/mean": 0.010648915357887746,
"step": 180,
"step_time": 42.86893743276596
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.20000000596046447,
"completions/max_length": 1706.6,
"completions/max_terminated_length": 1151.0,
"completions/mean_length": 1097.7666870117187,
"completions/mean_terminated_length": 885.9666931152344,
"completions/min_length": 678.4,
"completions/min_terminated_length": 678.4,
"entropy": 0.13618133092919985,
"epoch": 0.06739526411657559,
"frac_reward_zero_std": 0.4,
"grad_norm": 0.04725215584039688,
"kl": 0.0003020782351086382,
"learning_rate": 4.909892638036809e-07,
"loss": -0.0009320625104010105,
"num_tokens": 1210850.0,
"reward": 1.5866667985916139,
"reward_std": 0.3331963121891022,
"rewards/boxed_rate/mean": 0.7999999940395355,
"rewards/boxed_rate/std": 0.2665788769721985,
"rewards/correctness/mean": 0.26666666865348815,
"rewards/correctness/std": 0.10327955484390258,
"rewards/multi_component_reward/mean": 0.2533333495259285,
"rewards/multi_component_reward/std": 0.27605399787425994,
"rewards/no_answer_rate/mean": 0.20000000596046447,
"rewards/no_answer_rate/std": 0.2665788769721985,
"rewards/repetition_rate/mean": 0.06666666865348816,
"rewards/repetition_rate/std": 0.1632993221282959,
"sampling/importance_sampling_ratio/max": 0.0795806871727109,
"sampling/importance_sampling_ratio/mean": 0.02610416170209646,
"sampling/importance_sampling_ratio/min": 0.002741426149717241,
"sampling/sampling_logp_difference/max": 0.3459432005882263,
"sampling/sampling_logp_difference/mean": 0.008953130897134542,
"step": 185,
"step_time": 39.274089214019476
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.1666666716337204,
"completions/max_length": 1897.2,
"completions/max_terminated_length": 1270.2,
"completions/mean_length": 1048.533349609375,
"completions/mean_terminated_length": 838.5800048828125,
"completions/min_length": 490.8,
"completions/min_terminated_length": 490.8,
"entropy": 0.18140921418865522,
"epoch": 0.0692167577413479,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.05130758509039879,
"kl": 0.00038463542005047203,
"learning_rate": 4.900306748466257e-07,
"loss": -0.00950215756893158,
"num_tokens": 1245174.0,
"reward": 1.2133333802223205,
"reward_std": 0.6049932837486267,
"rewards/boxed_rate/mean": 0.8,
"rewards/boxed_rate/std": 0.36985843181610106,
"rewards/correctness/mean": 0.10000000298023223,
"rewards/correctness/std": 0.18492921590805053,
"rewards/multi_component_reward/mean": 0.08000001087784767,
"rewards/multi_component_reward/std": 0.39686688780784607,
"rewards/no_answer_rate/mean": 0.20000000596046447,
"rewards/no_answer_rate/std": 0.36985843181610106,
"rewards/repetition_rate/mean": 0.03333333432674408,
"rewards/repetition_rate/std": 0.08164966106414795,
"sampling/importance_sampling_ratio/max": 0.0888688538223505,
"sampling/importance_sampling_ratio/mean": 0.03125911168754101,
"sampling/importance_sampling_ratio/min": 0.00010974614788835727,
"sampling/sampling_logp_difference/max": 0.3610305070877075,
"sampling/sampling_logp_difference/mean": 0.01223631165921688,
"step": 190,
"step_time": 42.93450187277049
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.2333333373069763,
"completions/max_length": 1931.4,
"completions/max_terminated_length": 1245.0,
"completions/mean_length": 1162.5333740234375,
"completions/mean_terminated_length": 899.483349609375,
"completions/min_length": 618.2,
"completions/min_terminated_length": 618.2,
"entropy": 0.1432020000492533,
"epoch": 0.07103825136612021,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.06832960247993469,
"kl": 0.00033801408014066204,
"learning_rate": 4.890720858895706e-07,
"loss": -0.00368819460272789,
"num_tokens": 1282900.0,
"reward": 1.6600000619888307,
"reward_std": 1.03403559923172,
"rewards/boxed_rate/mean": 0.7666666626930236,
"rewards/boxed_rate/std": 0.3761233925819397,
"rewards/correctness/mean": 0.33333333432674406,
"rewards/correctness/std": 0.4298781991004944,
"rewards/multi_component_reward/mean": 0.3266666799783707,
"rewards/multi_component_reward/std": 0.6201063513755798,
"rewards/no_answer_rate/mean": 0.2333333373069763,
"rewards/no_answer_rate/std": 0.3761233925819397,
"rewards/repetition_rate/mean": 0.0,
"rewards/repetition_rate/std": 0.0,
"sampling/importance_sampling_ratio/max": 0.24176665293052793,
"sampling/importance_sampling_ratio/mean": 0.05982898166403174,
"sampling/importance_sampling_ratio/min": 0.010627524812724437,
"sampling/sampling_logp_difference/max": 0.3079464197158813,
"sampling/sampling_logp_difference/mean": 0.009482560632750391,
"step": 195,
"step_time": 43.80696959905326
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.13333333730697633,
"completions/max_length": 1874.4,
"completions/max_terminated_length": 1207.6,
"completions/mean_length": 1102.2666748046875,
"completions/mean_terminated_length": 956.0200134277344,
"completions/min_length": 668.8,
"completions/min_terminated_length": 668.8,
"entropy": 0.13664823652555544,
"epoch": 0.07285974499089254,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.01865367591381073,
"kl": 0.00029515944139954324,
"learning_rate": 4.881134969325153e-07,
"loss": 0.006076012551784515,
"num_tokens": 1318920.0,
"reward": 1.9533334493637085,
"reward_std": 1.1985347509384154,
"rewards/boxed_rate/mean": 0.8666666507720947,
"rewards/boxed_rate/std": 0.3265986442565918,
"rewards/correctness/mean": 0.4333333373069763,
"rewards/correctness/std": 0.5351926565170289,
"rewards/multi_component_reward/mean": 0.4866666853427887,
"rewards/multi_component_reward/std": 0.7141769766807556,
"rewards/no_answer_rate/mean": 0.13333333730697633,
"rewards/no_answer_rate/std": 0.3265986442565918,
"rewards/repetition_rate/mean": 0.03333333432674408,
"rewards/repetition_rate/std": 0.08164966106414795,
"sampling/importance_sampling_ratio/max": 0.08207609243690968,
"sampling/importance_sampling_ratio/mean": 0.03297081319615245,
"sampling/importance_sampling_ratio/min": 0.003100475773135258,
"sampling/sampling_logp_difference/max": 0.40416555404663085,
"sampling/sampling_logp_difference/mean": 0.008233289048075677,
"step": 200,
"step_time": 42.36675942577422
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.13333333730697633,
"completions/max_length": 1600.0,
"completions/max_terminated_length": 1066.6,
"completions/mean_length": 959.2000122070312,
"completions/mean_terminated_length": 798.0866943359375,
"completions/min_length": 461.0,
"completions/min_terminated_length": 461.0,
"entropy": 0.14408907170097032,
"epoch": 0.07468123861566485,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.032731793820858,
"kl": 0.0003691373499653613,
"learning_rate": 4.871549079754601e-07,
"loss": -0.001960751600563526,
"num_tokens": 1352016.0,
"reward": 1.673333430290222,
"reward_std": 0.9306211471557617,
"rewards/boxed_rate/mean": 0.8666666626930237,
"rewards/boxed_rate/std": 0.2665788769721985,
"rewards/correctness/mean": 0.3000000029802322,
"rewards/correctness/std": 0.404018247127533,
"rewards/multi_component_reward/mean": 0.3400000214576721,
"rewards/multi_component_reward/std": 0.5803858906030654,
"rewards/no_answer_rate/mean": 0.13333333730697633,
"rewards/no_answer_rate/std": 0.2665788769721985,
"rewards/repetition_rate/mean": 0.03333333432674408,
"rewards/repetition_rate/std": 0.08164966106414795,
"sampling/importance_sampling_ratio/max": 0.10455413907766342,
"sampling/importance_sampling_ratio/mean": 0.03612115085124969,
"sampling/importance_sampling_ratio/min": 0.0026871272621811215,
"sampling/sampling_logp_difference/max": 0.3213069200515747,
"sampling/sampling_logp_difference/mean": 0.00922326734289527,
"step": 205,
"step_time": 37.272668573819104
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.10000000298023223,
"completions/max_length": 1572.2,
"completions/max_terminated_length": 1133.2,
"completions/mean_length": 879.3333618164063,
"completions/mean_terminated_length": 743.8000244140625,
"completions/min_length": 459.6,
"completions/min_terminated_length": 459.6,
"entropy": 0.19108737086256344,
"epoch": 0.07650273224043716,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.018986158072948456,
"kl": 0.0004163069315836765,
"learning_rate": 4.86196319018405e-07,
"loss": 0.0035621844232082366,
"num_tokens": 1380880.0,
"reward": 1.740000104904175,
"reward_std": 0.947225558757782,
"rewards/boxed_rate/mean": 0.9,
"rewards/boxed_rate/std": 0.18492921590805053,
"rewards/correctness/mean": 0.33333333432674406,
"rewards/correctness/std": 0.4298781991004944,
"rewards/multi_component_reward/mean": 0.40666669309139253,
"rewards/multi_component_reward/std": 0.5280151665210724,
"rewards/no_answer_rate/mean": 0.10000000298023223,
"rewards/no_answer_rate/std": 0.18492921590805053,
"rewards/repetition_rate/mean": 0.0,
"rewards/repetition_rate/std": 0.0,
"sampling/importance_sampling_ratio/max": 0.07124827802181244,
"sampling/importance_sampling_ratio/mean": 0.02541533587500453,
"sampling/importance_sampling_ratio/min": 0.002891992630936181,
"sampling/sampling_logp_difference/max": 0.3407265663146973,
"sampling/sampling_logp_difference/mean": 0.012478712201118469,
"step": 210,
"step_time": 35.72363304812461
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.1666666716337204,
"completions/max_length": 1773.4,
"completions/max_terminated_length": 1327.6,
"completions/mean_length": 1154.46669921875,
"completions/mean_terminated_length": 950.5800048828125,
"completions/min_length": 652.4,
"completions/min_terminated_length": 652.4,
"entropy": 0.18063361259798208,
"epoch": 0.07832422586520947,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.02516000345349312,
"kl": 0.0003309295129535409,
"learning_rate": 4.852377300613496e-07,
"loss": -0.003862759843468666,
"num_tokens": 1418892.0,
"reward": 1.766666793823242,
"reward_std": 0.897074180841446,
"rewards/boxed_rate/mean": 0.8333333373069763,
"rewards/boxed_rate/std": 0.2882087707519531,
"rewards/correctness/mean": 0.33333333432674406,
"rewards/correctness/std": 0.36985843181610106,
"rewards/multi_component_reward/mean": 0.3333333648741245,
"rewards/multi_component_reward/std": 0.5403176128864289,
"rewards/no_answer_rate/mean": 0.1666666716337204,
"rewards/no_answer_rate/std": 0.2882087707519531,
"rewards/repetition_rate/mean": 0.10000000298023223,
"rewards/repetition_rate/std": 0.24494898319244385,
"sampling/importance_sampling_ratio/max": 0.04162683514878154,
"sampling/importance_sampling_ratio/mean": 0.011416873242706061,
"sampling/importance_sampling_ratio/min": 0.001456704799248534,
"sampling/sampling_logp_difference/max": 0.3141038179397583,
"sampling/sampling_logp_difference/mean": 0.011231625638902187,
"step": 215,
"step_time": 40.99782377649099
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03333333432674408,
"completions/max_length": 1087.4,
"completions/max_terminated_length": 1005.0,
"completions/mean_length": 687.36669921875,
"completions/mean_terminated_length": 652.033349609375,
"completions/min_length": 441.2,
"completions/min_terminated_length": 441.2,
"entropy": 0.20975149522225062,
"epoch": 0.08014571948998178,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.026713233441114426,
"kl": 0.0004683414850054154,
"learning_rate": 4.842791411042944e-07,
"loss": -0.0019101161509752274,
"num_tokens": 1442585.0,
"reward": 1.9400001287460327,
"reward_std": 1.0250610709190369,
"rewards/boxed_rate/mean": 0.9666666626930237,
"rewards/boxed_rate/std": 0.08164966106414795,
"rewards/correctness/mean": 0.4,
"rewards/correctness/std": 0.45777305364608767,
"rewards/multi_component_reward/mean": 0.5066666960716247,
"rewards/multi_component_reward/std": 0.49611697196960447,
"rewards/no_answer_rate/mean": 0.03333333432674408,
"rewards/no_answer_rate/std": 0.08164966106414795,
"rewards/repetition_rate/mean": 0.03333333432674408,
"rewards/repetition_rate/std": 0.08164966106414795,
"sampling/importance_sampling_ratio/max": 0.11259435554966331,
"sampling/importance_sampling_ratio/mean": 0.04658988020382822,
"sampling/importance_sampling_ratio/min": 0.0064018826728026255,
"sampling/sampling_logp_difference/max": 0.35799312591552734,
"sampling/sampling_logp_difference/mean": 0.013789117708802224,
"step": 220,
"step_time": 25.938872035220264
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.10000000298023223,
"completions/max_length": 1644.6,
"completions/max_terminated_length": 1458.8,
"completions/mean_length": 1090.4333618164062,
"completions/mean_terminated_length": 1024.5933349609375,
"completions/min_length": 586.4,
"completions/min_terminated_length": 586.4,
"entropy": 0.15132917563120524,
"epoch": 0.08196721311475409,
"frac_reward_zero_std": 0.2,
"grad_norm": 0.015617283061146736,
"kl": 0.00032217044063145297,
"learning_rate": 4.833205521472393e-07,
"loss": 0.005397457629442215,
"num_tokens": 1478730.0,
"reward": 1.2733334183692933,
"reward_std": 0.49739299416542054,
"rewards/boxed_rate/mean": 0.9,
"rewards/boxed_rate/std": 0.18492921590805053,
"rewards/correctness/mean": 0.10000000298023223,
"rewards/correctness/std": 0.18492921590805053,
"rewards/multi_component_reward/mean": 0.14000000953674316,
"rewards/multi_component_reward/std": 0.3222720980644226,
"rewards/no_answer_rate/mean": 0.10000000298023223,
"rewards/no_answer_rate/std": 0.18492921590805053,
"rewards/repetition_rate/mean": 0.03333333432674408,
"rewards/repetition_rate/std": 0.08164966106414795,
"sampling/importance_sampling_ratio/max": 0.09620626708492637,
"sampling/importance_sampling_ratio/mean": 0.030177546793129294,
"sampling/importance_sampling_ratio/min": 0.0006849504668480222,
"sampling/sampling_logp_difference/max": 0.35564944744110105,
"sampling/sampling_logp_difference/mean": 0.009922309406101703,
"step": 225,
"step_time": 38.19373752269894
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.26666667461395266,
"completions/max_length": 1435.0,
"completions/max_terminated_length": 1110.6,
"completions/mean_length": 1050.7667114257813,
"completions/mean_terminated_length": 875.8666809082031,
"completions/min_length": 714.8,
"completions/min_terminated_length": 714.8,
"entropy": 0.20316853250066438,
"epoch": 0.08378870673952642,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.021429063752293587,
"kl": 0.00040220142885421715,
"learning_rate": 4.82361963190184e-07,
"loss": -0.00029108244925737383,
"num_tokens": 1513091.0,
"reward": 1.3133334159851073,
"reward_std": 0.7383940577507019,
"rewards/boxed_rate/mean": 0.7333333343267441,
"rewards/boxed_rate/std": 0.19119417667388916,
"rewards/correctness/mean": 0.1666666716337204,
"rewards/correctness/std": 0.2882087707519531,
"rewards/multi_component_reward/mean": 0.11333334147930145,
"rewards/multi_component_reward/std": 0.4417478919029236,
"rewards/no_answer_rate/mean": 0.2666666626930237,
"rewards/no_answer_rate/std": 0.19119417667388916,
"rewards/repetition_rate/mean": 0.03333333432674408,
"rewards/repetition_rate/std": 0.08164966106414795,
"sampling/importance_sampling_ratio/max": 0.14741267394274474,
"sampling/importance_sampling_ratio/mean": 0.06669870759360493,
"sampling/importance_sampling_ratio/min": 0.0190697070662623,
"sampling/sampling_logp_difference/max": 0.34659633636474607,
"sampling/sampling_logp_difference/mean": 0.013104908354580402,
"step": 230,
"step_time": 33.9486582595855
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03333333432674408,
"completions/max_length": 1370.2,
"completions/max_terminated_length": 1313.4,
"completions/mean_length": 1031.9666870117187,
"completions/mean_terminated_length": 1006.8400268554688,
"completions/min_length": 716.6,
"completions/min_terminated_length": 716.6,
"entropy": 0.14171662541727226,
"epoch": 0.08561020036429873,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.031329505145549774,
"kl": 0.0002987771406575727,
"learning_rate": 4.814033742331288e-07,
"loss": -0.0023104714229702948,
"num_tokens": 1546912.0,
"reward": 1.7100001096725463,
"reward_std": 0.7802696824073792,
"rewards/boxed_rate/mean": 0.9666666626930237,
"rewards/boxed_rate/std": 0.08164966106414795,
"rewards/correctness/mean": 0.3,
"rewards/correctness/std": 0.34822853803634646,
"rewards/multi_component_reward/mean": 0.41000002026557925,
"rewards/multi_component_reward/std": 0.43204120099544524,
"rewards/no_answer_rate/mean": 0.03333333432674408,
"rewards/no_answer_rate/std": 0.08164966106414795,
"rewards/repetition_rate/mean": 0.0,
"rewards/repetition_rate/std": 0.0,
"sampling/importance_sampling_ratio/max": 0.06215879218652844,
"sampling/importance_sampling_ratio/mean": 0.030630824994295834,
"sampling/importance_sampling_ratio/min": 0.007751126746109232,
"sampling/sampling_logp_difference/max": 0.3309502720832825,
"sampling/sampling_logp_difference/mean": 0.009595037158578634,
"step": 235,
"step_time": 33.03153369966894
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.3000000029802322,
"completions/max_length": 1976.2,
"completions/max_terminated_length": 1429.4,
"completions/mean_length": 1217.8667114257812,
"completions/mean_terminated_length": 870.3533569335938,
"completions/min_length": 525.4,
"completions/min_terminated_length": 525.4,
"entropy": 0.1393390517681837,
"epoch": 0.08743169398907104,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.05459148809313774,
"kl": 0.0003685045870952308,
"learning_rate": 4.804447852760736e-07,
"loss": -0.0035786613821983337,
"num_tokens": 1587708.0,
"reward": 1.52666677236557,
"reward_std": 0.9796114563941956,
"rewards/boxed_rate/mean": 0.7,
"rewards/boxed_rate/std": 0.404018247127533,
"rewards/correctness/mean": 0.26666666865348815,
"rewards/correctness/std": 0.3823883533477783,
"rewards/multi_component_reward/mean": 0.19333335161209106,
"rewards/multi_component_reward/std": 0.6527607858180999,
"rewards/no_answer_rate/mean": 0.3000000029802322,
"rewards/no_answer_rate/std": 0.404018247127533,
"rewards/repetition_rate/mean": 0.06666666865348816,
"rewards/repetition_rate/std": 0.1632993221282959,
"sampling/importance_sampling_ratio/max": 0.08990185633301735,
"sampling/importance_sampling_ratio/mean": 0.030038297548890112,
"sampling/importance_sampling_ratio/min": 0.00043211893236647645,
"sampling/sampling_logp_difference/max": 0.38284850120544434,
"sampling/sampling_logp_difference/mean": 0.008784445654600858,
"step": 240,
"step_time": 45.60475273691118
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.13333333730697633,
"completions/max_length": 1608.4,
"completions/max_terminated_length": 1074.2,
"completions/mean_length": 920.5666870117187,
"completions/mean_terminated_length": 754.0433471679687,
"completions/min_length": 548.6,
"completions/min_terminated_length": 548.6,
"entropy": 0.13041699826717376,
"epoch": 0.08925318761384335,
"frac_reward_zero_std": 0.2,
"grad_norm": 0.011656666174530983,
"kl": 0.00033540410368004814,
"learning_rate": 4.794861963190184e-07,
"loss": -0.004278642311692238,
"num_tokens": 1618097.0,
"reward": 1.3933334350585938,
"reward_std": 0.31584063470363616,
"rewards/boxed_rate/mean": 0.8666666626930237,
"rewards/boxed_rate/std": 0.2665788769721985,
"rewards/correctness/mean": 0.13333333730697633,
"rewards/correctness/std": 0.10327955484390258,
"rewards/multi_component_reward/mean": 0.1266666680574417,
"rewards/multi_component_reward/std": 0.3364715576171875,
"rewards/no_answer_rate/mean": 0.13333333730697633,
"rewards/no_answer_rate/std": 0.2665788769721985,
"rewards/repetition_rate/mean": 0.13333333730697633,
"rewards/repetition_rate/std": 0.2665788769721985,
"sampling/importance_sampling_ratio/max": 0.13740080818533898,
"sampling/importance_sampling_ratio/mean": 0.05796990524977445,
"sampling/importance_sampling_ratio/min": 0.006626390311389762,
"sampling/sampling_logp_difference/max": 0.3156670331954956,
"sampling/sampling_logp_difference/mean": 0.008615392539650202,
"step": 245,
"step_time": 36.637498759664595
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03333333432674408,
"completions/max_length": 1533.4,
"completions/max_terminated_length": 1395.2,
"completions/mean_length": 973.1333923339844,
"completions/mean_terminated_length": 939.9467224121094,
"completions/min_length": 659.4,
"completions/min_terminated_length": 659.4,
"entropy": 0.14471415306131044,
"epoch": 0.09107468123861566,
"frac_reward_zero_std": 0.2,
"grad_norm": 0.012578152120113373,
"kl": 0.00044469132699305193,
"learning_rate": 4.785276073619632e-07,
"loss": 0.00015797601081430913,
"num_tokens": 1651371.0,
"reward": 2.2233334302902223,
"reward_std": 0.6588261000812053,
"rewards/boxed_rate/mean": 0.9666666626930237,
"rewards/boxed_rate/std": 0.08164966106414795,
"rewards/correctness/mean": 0.5333333432674408,
"rewards/correctness/std": 0.3098386645317078,
"rewards/multi_component_reward/mean": 0.656666674464941,
"rewards/multi_component_reward/std": 0.41430723667144775,
"rewards/no_answer_rate/mean": 0.03333333432674408,
"rewards/no_answer_rate/std": 0.08164966106414795,
"rewards/repetition_rate/mean": 0.03333333432674408,
"rewards/repetition_rate/std": 0.08164966106414795,
"sampling/importance_sampling_ratio/max": 0.18325970163568855,
"sampling/importance_sampling_ratio/mean": 0.10900020101107658,
"sampling/importance_sampling_ratio/min": 0.058780076946641203,
"sampling/sampling_logp_difference/max": 0.28016397953033445,
"sampling/sampling_logp_difference/mean": 0.009201886225491762,
"step": 250,
"step_time": 36.18357318136841
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.10000000298023223,
"completions/max_length": 1565.8,
"completions/max_terminated_length": 1137.4,
"completions/mean_length": 961.133349609375,
"completions/mean_terminated_length": 823.0366821289062,
"completions/min_length": 616.4,
"completions/min_terminated_length": 616.4,
"entropy": 0.14495401444534461,
"epoch": 0.09289617486338798,
"frac_reward_zero_std": 0.2,
"grad_norm": 0.010682695545256138,
"kl": 0.00035484658534793806,
"learning_rate": 4.775690184049079e-07,
"loss": 0.0014964478090405465,
"num_tokens": 1682425.0,
"reward": 1.8333334445953369,
"reward_std": 0.6724297642707825,
"rewards/boxed_rate/mean": 0.9,
"rewards/boxed_rate/std": 0.18492921590805053,
"rewards/correctness/mean": 0.3666666716337204,
"rewards/correctness/std": 0.2882087707519531,
"rewards/multi_component_reward/mean": 0.4333333522081375,
"rewards/multi_component_reward/std": 0.4199430406093597,
"rewards/no_answer_rate/mean": 0.10000000298023223,
"rewards/no_answer_rate/std": 0.18492921590805053,
"rewards/repetition_rate/mean": 0.03333333432674408,
"rewards/repetition_rate/std": 0.08164966106414795,
"sampling/importance_sampling_ratio/max": 0.10162204280495643,
"sampling/importance_sampling_ratio/mean": 0.033536690101027486,
"sampling/importance_sampling_ratio/min": 0.00426993980855085,
"sampling/sampling_logp_difference/max": 0.33833720684051516,
"sampling/sampling_logp_difference/mean": 0.009549971111118794,
"step": 255,
"step_time": 35.81306515969336
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.4000000089406967,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1168.6,
"completions/mean_length": 1366.4000366210937,
"completions/mean_terminated_length": 998.9466796875,
"completions/min_length": 838.8,
"completions/min_terminated_length": 838.8,
"entropy": 0.14179731508096058,
"epoch": 0.0947176684881603,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.05714869126677513,
"kl": 0.00032895591721171513,
"learning_rate": 4.7661042944785273e-07,
"loss": -0.004640129953622818,
"num_tokens": 1726081.0,
"reward": 1.4900000929832458,
"reward_std": 0.996655923128128,
"rewards/boxed_rate/mean": 0.6000000059604644,
"rewards/boxed_rate/std": 0.5010328412055969,
"rewards/correctness/mean": 0.2666666716337204,
"rewards/correctness/std": 0.3977532863616943,
"rewards/multi_component_reward/mean": 0.12333334572613239,
"rewards/multi_component_reward/std": 0.6989697873592376,
"rewards/no_answer_rate/mean": 0.4000000089406967,
"rewards/no_answer_rate/std": 0.5010328412055969,
"rewards/repetition_rate/mean": 0.1,
"rewards/repetition_rate/std": 0.10954451560974121,
"sampling/importance_sampling_ratio/max": 0.043305744789540766,
"sampling/importance_sampling_ratio/mean": 0.01504859896376729,
"sampling/importance_sampling_ratio/min": 4.04306270474383e-05,
"sampling/sampling_logp_difference/max": 0.3569210052490234,
"sampling/sampling_logp_difference/mean": 0.009369691275060177,
"step": 260,
"step_time": 46.90199479162693
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.2666666716337204,
"completions/max_length": 1583.0,
"completions/max_terminated_length": 934.6,
"completions/mean_length": 1023.133349609375,
"completions/mean_terminated_length": 685.6266723632813,
"completions/min_length": 537.4,
"completions/min_terminated_length": 537.4,
"entropy": 0.17780156135559083,
"epoch": 0.0965391621129326,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.018404532223939896,
"kl": 0.0003682690895705794,
"learning_rate": 4.756518404907975e-07,
"loss": 0.012411002814769746,
"num_tokens": 1759811.0,
"reward": 1.3833333849906921,
"reward_std": 0.7943651616573334,
"rewards/boxed_rate/mean": 0.7333333313465118,
"rewards/boxed_rate/std": 0.29447373151779177,
"rewards/correctness/mean": 0.20000000298023224,
"rewards/correctness/std": 0.29447373151779177,
"rewards/multi_component_reward/mean": 0.15000001043081285,
"rewards/multi_component_reward/std": 0.5146282583475112,
"rewards/no_answer_rate/mean": 0.2666666716337204,
"rewards/no_answer_rate/std": 0.29447373151779177,
"rewards/repetition_rate/mean": 0.03333333432674408,
"rewards/repetition_rate/std": 0.08164966106414795,
"sampling/importance_sampling_ratio/max": 0.1577057947986759,
"sampling/importance_sampling_ratio/mean": 0.09503802299732342,
"sampling/importance_sampling_ratio/min": 0.0009112325180752826,
"sampling/sampling_logp_difference/max": 0.3926606416702271,
"sampling/sampling_logp_difference/mean": 0.012097407877445222,
"step": 265,
"step_time": 36.53071039505303
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.13333333730697633,
"completions/max_length": 1533.2,
"completions/max_terminated_length": 932.6,
"completions/mean_length": 900.7333618164063,
"completions/mean_terminated_length": 726.6700073242188,
"completions/min_length": 482.4,
"completions/min_terminated_length": 482.4,
"entropy": 0.18941813930869103,
"epoch": 0.09836065573770492,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.1255546510219574,
"kl": 0.0004568617567808057,
"learning_rate": 4.746932515337423e-07,
"loss": -0.006510256975889206,
"num_tokens": 1790271.0,
"reward": 1.7666667699813843,
"reward_std": 1.1080477476119994,
"rewards/boxed_rate/mean": 0.8333333253860473,
"rewards/boxed_rate/std": 0.34822853803634646,
"rewards/correctness/mean": 0.33333333730697634,
"rewards/correctness/std": 0.48566790819168093,
"rewards/multi_component_reward/mean": 0.33333334922790525,
"rewards/multi_component_reward/std": 0.6733725190162658,
"rewards/no_answer_rate/mean": 0.1666666716337204,
"rewards/no_answer_rate/std": 0.34822853803634646,
"rewards/repetition_rate/mean": 0.10000000298023223,
"rewards/repetition_rate/std": 0.18492921590805053,
"sampling/importance_sampling_ratio/max": 0.0878407845273614,
"sampling/importance_sampling_ratio/mean": 0.026912105677183717,
"sampling/importance_sampling_ratio/min": 0.0006732676178655298,
"sampling/sampling_logp_difference/max": 0.33626770973205566,
"sampling/sampling_logp_difference/mean": 0.012423686124384404,
"step": 270,
"step_time": 35.3790395591408
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.10000000298023223,
"completions/max_length": 1538.0,
"completions/max_terminated_length": 1314.2,
"completions/mean_length": 987.2333618164063,
"completions/mean_terminated_length": 886.1933471679688,
"completions/min_length": 572.4,
"completions/min_terminated_length": 572.4,
"entropy": 0.17079306667049726,
"epoch": 0.10018214936247723,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.0078973937779665,
"kl": 0.00039174315752461554,
"learning_rate": 4.737346625766871e-07,
"loss": -0.0023402150720357893,
"num_tokens": 1822306.0,
"reward": 1.833333396911621,
"reward_std": 0.9216741323471069,
"rewards/boxed_rate/mean": 0.899999988079071,
"rewards/boxed_rate/std": 0.24494898319244385,
"rewards/correctness/mean": 0.3666666716337204,
"rewards/correctness/std": 0.404018247127533,
"rewards/multi_component_reward/mean": 0.43333334028720855,
"rewards/multi_component_reward/std": 0.5590049713850022,
"rewards/no_answer_rate/mean": 0.10000000298023223,
"rewards/no_answer_rate/std": 0.24494898319244385,
"rewards/repetition_rate/mean": 0.03333333432674408,
"rewards/repetition_rate/std": 0.08164966106414795,
"sampling/importance_sampling_ratio/max": 0.05113908690400422,
"sampling/importance_sampling_ratio/mean": 0.021760703669860958,
"sampling/importance_sampling_ratio/min": 0.008651768978314297,
"sampling/sampling_logp_difference/max": 0.4047112584114075,
"sampling/sampling_logp_difference/mean": 0.01078464426100254,
"step": 275,
"step_time": 35.81637797001749
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.20000000298023224,
"completions/max_length": 1824.0,
"completions/max_terminated_length": 1109.4,
"completions/mean_length": 981.833349609375,
"completions/mean_terminated_length": 751.3400024414062,
"completions/min_length": 517.2,
"completions/min_terminated_length": 517.2,
"entropy": 0.15676350990931193,
"epoch": 0.10200364298724955,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.01452692411839962,
"kl": 0.0003485525502280022,
"learning_rate": 4.727760736196319e-07,
"loss": -0.004090853407979012,
"num_tokens": 1855067.0,
"reward": 1.4933334350585938,
"reward_std": 0.7522169463336468,
"rewards/boxed_rate/mean": 0.799999988079071,
"rewards/boxed_rate/std": 0.35449349880218506,
"rewards/correctness/mean": 0.23333333432674408,
"rewards/correctness/std": 0.30073869228363037,
"rewards/multi_component_reward/mean": 0.2266666792333126,
"rewards/multi_component_reward/std": 0.5248861283063888,
"rewards/no_answer_rate/mean": 0.20000000298023224,
"rewards/no_answer_rate/std": 0.35449349880218506,
"rewards/repetition_rate/mean": 0.03333333432674408,
"rewards/repetition_rate/std": 0.08164966106414795,
"sampling/importance_sampling_ratio/max": 0.12218071203678846,
"sampling/importance_sampling_ratio/mean": 0.04405408292077482,
"sampling/importance_sampling_ratio/min": 0.00501076535471764,
"sampling/sampling_logp_difference/max": 0.37753498554229736,
"sampling/sampling_logp_difference/mean": 0.009699719026684761,
"step": 280,
"step_time": 40.97467570956796
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.1666666716337204,
"completions/max_length": 1744.4,
"completions/max_terminated_length": 1173.6,
"completions/mean_length": 970.86669921875,
"completions/mean_terminated_length": 758.7533447265625,
"completions/min_length": 479.4,
"completions/min_terminated_length": 479.4,
"entropy": 0.15489849572380385,
"epoch": 0.10382513661202186,
"frac_reward_zero_std": 0.2,
"grad_norm": 0.09483744204044342,
"kl": 0.000336567038417949,
"learning_rate": 4.718174846625766e-07,
"loss": -0.004700837284326553,
"num_tokens": 1886869.0,
"reward": 1.7000001311302184,
"reward_std": 0.8241950333118438,
"rewards/boxed_rate/mean": 0.8333333253860473,
"rewards/boxed_rate/std": 0.34822853803634646,
"rewards/correctness/mean": 0.33333333730697634,
"rewards/correctness/std": 0.32236858606338503,
"rewards/multi_component_reward/mean": 0.36666667461395264,
"rewards/multi_component_reward/std": 0.5094490587711334,
"rewards/no_answer_rate/mean": 0.1666666716337204,
"rewards/no_answer_rate/std": 0.34822853803634646,
"rewards/repetition_rate/mean": 0.0,
"rewards/repetition_rate/std": 0.0,
"sampling/importance_sampling_ratio/max": 0.13168737590312957,
"sampling/importance_sampling_ratio/mean": 0.06171476859599352,
"sampling/importance_sampling_ratio/min": 0.010470659041346208,
"sampling/sampling_logp_difference/max": 0.344563889503479,
"sampling/sampling_logp_difference/mean": 0.010558286216109991,
"step": 285,
"step_time": 39.89412596244365
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.23333333432674408,
"completions/max_length": 1809.2,
"completions/max_terminated_length": 1132.6,
"completions/mean_length": 1119.7000366210937,
"completions/mean_terminated_length": 832.9400268554688,
"completions/min_length": 590.4,
"completions/min_terminated_length": 590.4,
"entropy": 0.15704507902264594,
"epoch": 0.10564663023679417,
"frac_reward_zero_std": 0.2,
"grad_norm": 0.023737642914056778,
"kl": 0.0003448050100511561,
"learning_rate": 4.7085889570552147e-07,
"loss": 0.0019955366849899294,
"num_tokens": 1923040.0,
"reward": 1.380000114440918,
"reward_std": 0.5739886239171028,
"rewards/boxed_rate/mean": 0.7666666626930236,
"rewards/boxed_rate/std": 0.30073869228363037,
"rewards/correctness/mean": 0.16666666865348817,
"rewards/correctness/std": 0.2128240704536438,
"rewards/multi_component_reward/mean": 0.11333334371447563,
"rewards/multi_component_reward/std": 0.46365060210227965,
"rewards/no_answer_rate/mean": 0.23333333432674408,
"rewards/no_answer_rate/std": 0.30073869228363037,
"rewards/repetition_rate/mean": 0.10000000298023223,
"rewards/repetition_rate/std": 0.18492921590805053,
"sampling/importance_sampling_ratio/max": 0.04411560408771038,
"sampling/importance_sampling_ratio/mean": 0.01753278383985162,
"sampling/importance_sampling_ratio/min": 0.0009861058487468544,
"sampling/sampling_logp_difference/max": 0.3644953012466431,
"sampling/sampling_logp_difference/mean": 0.010005151480436325,
"step": 290,
"step_time": 41.05979204457253
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.13333333730697633,
"completions/max_length": 1592.8,
"completions/max_terminated_length": 1143.4,
"completions/mean_length": 987.1000366210938,
"completions/mean_terminated_length": 825.9633544921875,
"completions/min_length": 591.8,
"completions/min_terminated_length": 591.8,
"entropy": 0.16288827657699584,
"epoch": 0.10746812386156648,
"frac_reward_zero_std": 0.4,
"grad_norm": 0.026686564087867737,
"kl": 0.0003618090704549104,
"learning_rate": 4.6990030674846625e-07,
"loss": -0.0018155150115489959,
"num_tokens": 1955647.0,
"reward": 1.7666668176651001,
"reward_std": 0.4796482801437378,
"rewards/boxed_rate/mean": 0.8666666626930237,
"rewards/boxed_rate/std": 0.2665788769721985,
"rewards/correctness/mean": 0.33333333432674406,
"rewards/correctness/std": 0.19119417667388916,
"rewards/multi_component_reward/mean": 0.36666668206453323,
"rewards/multi_component_reward/std": 0.34831122159957884,
"rewards/no_answer_rate/mean": 0.13333333730697633,
"rewards/no_answer_rate/std": 0.2665788769721985,
"rewards/repetition_rate/mean": 0.06666666865348816,
"rewards/repetition_rate/std": 0.1632993221282959,
"sampling/importance_sampling_ratio/max": 0.17200291641056537,
"sampling/importance_sampling_ratio/mean": 0.09221775899641216,
"sampling/importance_sampling_ratio/min": 0.053504807552870126,
"sampling/sampling_logp_difference/max": 0.31802849769592284,
"sampling/sampling_logp_difference/mean": 0.010488973837345839,
"step": 295,
"step_time": 36.921972643770275
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03333333432674408,
"completions/max_length": 1181.2,
"completions/max_terminated_length": 1068.4,
"completions/mean_length": 714.5666870117187,
"completions/mean_terminated_length": 675.57333984375,
"completions/min_length": 433.8,
"completions/min_terminated_length": 433.8,
"entropy": 0.20909111325939497,
"epoch": 0.1092896174863388,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.9235589504241943,
"kl": 0.0007714893268712331,
"learning_rate": 4.68941717791411e-07,
"loss": 0.0028855174779891966,
"num_tokens": 1980240.0,
"reward": 1.8966667413711549,
"reward_std": 1.0613920927047729,
"rewards/boxed_rate/mean": 0.9666666626930237,
"rewards/boxed_rate/std": 0.08164966106414795,
"rewards/correctness/mean": 0.3666666716337204,
"rewards/correctness/std": 0.5072978019714356,
"rewards/multi_component_reward/mean": 0.4633333578705788,
"rewards/multi_component_reward/std": 0.5980847001075744,
"rewards/no_answer_rate/mean": 0.03333333432674408,
"rewards/no_answer_rate/std": 0.08164966106414795,
"rewards/repetition_rate/mean": 0.06666666865348816,
"rewards/repetition_rate/std": 0.10327955484390258,
"sampling/importance_sampling_ratio/max": 0.11060594655573368,
"sampling/importance_sampling_ratio/mean": 0.05734073922503739,
"sampling/importance_sampling_ratio/min": 0.024227803308775008,
"sampling/sampling_logp_difference/max": 0.31456995010375977,
"sampling/sampling_logp_difference/mean": 0.013103757984936237,
"step": 300,
"step_time": 27.952531585469842
},
{
"epoch": 0.1092896174863388,
"eval_clip_ratio/high_max": 0.0,
"eval_clip_ratio/high_mean": 0.0,
"eval_clip_ratio/low_mean": 0.0,
"eval_clip_ratio/low_min": 0.0,
"eval_clip_ratio/region_mean": 0.0,
"eval_completions/clipped_ratio": 0.22666667222976686,
"eval_completions/max_length": 1758.54,
"eval_completions/max_terminated_length": 1125.4,
"eval_completions/mean_length": 1107.770029296875,
"eval_completions/mean_terminated_length": 846.1800146484375,
"eval_completions/min_length": 606.8,
"eval_completions/min_terminated_length": 606.8,
"eval_entropy": 0.17873862124979495,
"eval_frac_reward_zero_std": 0.08,
"eval_kl": 0.0003903971218096558,
"eval_loss": -0.0051070391200482845,
"eval_num_tokens": 1980240.0,
"eval_reward": 1.5980000698566437,
"eval_reward_std": 0.782314371317625,
"eval_rewards/boxed_rate/mean": 0.7766666629910469,
"eval_rewards/boxed_rate/std": 0.33167909026145936,
"eval_rewards/correctness/mean": 0.28666666984558103,
"eval_rewards/correctness/std": 0.30271870851516725,
"eval_rewards/multi_component_reward/mean": 0.26800001293420794,
"eval_rewards/multi_component_reward/std": 0.5066750717163085,
"eval_rewards/no_answer_rate/mean": 0.22333333760499954,
"eval_rewards/no_answer_rate/std": 0.33167909026145936,
"eval_rewards/repetition_rate/mean": 0.04333333432674408,
"eval_rewards/repetition_rate/std": 0.08060015916824341,
"eval_runtime": 1753.4069,
"eval_samples_per_second": 0.029,
"eval_sampling/importance_sampling_ratio/max": 0.0897229278186569,
"eval_sampling/importance_sampling_ratio/mean": 0.035299012192626836,
"eval_sampling/importance_sampling_ratio/min": 0.0025020323396462286,
"eval_sampling/sampling_logp_difference/max": 0.5019185048341751,
"eval_sampling/sampling_logp_difference/mean": 0.012026464603841304,
"eval_steps_per_second": 0.005,
"step": 300
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.1,
"completions/max_length": 1605.6,
"completions/max_terminated_length": 1384.6,
"completions/mean_length": 1010.7000366210938,
"completions/mean_terminated_length": 876.26669921875,
"completions/min_length": 573.8,
"completions/min_terminated_length": 573.8,
"entropy": 0.1340769293407599,
"epoch": 0.1111111111111111,
"frac_reward_zero_std": 0.2,
"grad_norm": 0.034374091774225235,
"kl": 0.000321884588629473,
"learning_rate": 4.6798312883435583e-07,
"loss": -0.014604152739048004,
"num_tokens": 2014467.0,
"reward": 2.043333411216736,
"reward_std": 0.8047043204307556,
"rewards/boxed_rate/mean": 0.9,
"rewards/boxed_rate/std": 0.10954451560974121,
"rewards/correctness/mean": 0.4666666716337204,
"rewards/correctness/std": 0.36985843181610106,
"rewards/multi_component_reward/mean": 0.5433333471417428,
"rewards/multi_component_reward/std": 0.4605102062225342,
"rewards/no_answer_rate/mean": 0.1,
"rewards/no_answer_rate/std": 0.10954451560974121,
"rewards/repetition_rate/mean": 0.03333333432674408,
"rewards/repetition_rate/std": 0.08164966106414795,
"sampling/importance_sampling_ratio/max": 0.12428842782974243,
"sampling/importance_sampling_ratio/mean": 0.046624291501939294,
"sampling/importance_sampling_ratio/min": 0.0006754371514054114,
"sampling/sampling_logp_difference/max": 0.32104220390319826,
"sampling/sampling_logp_difference/mean": 0.008986939676105976,
"step": 305,
"step_time": 36.810268136300145
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.16666666865348817,
"completions/max_length": 1557.0,
"completions/max_terminated_length": 1291.8,
"completions/mean_length": 1068.033349609375,
"completions/mean_terminated_length": 916.2,
"completions/min_length": 631.6,
"completions/min_terminated_length": 631.6,
"entropy": 0.15544118496278922,
"epoch": 0.11293260473588343,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.01212399173527956,
"kl": 0.00047503276291536165,
"learning_rate": 4.670245398773006e-07,
"loss": -8.383383974432945e-05,
"num_tokens": 2049616.0,
"reward": 1.6533334255218506,
"reward_std": 1.070458745956421,
"rewards/boxed_rate/mean": 0.8333333373069763,
"rewards/boxed_rate/std": 0.2128240704536438,
"rewards/correctness/mean": 0.3000000059604645,
"rewards/correctness/std": 0.4794029474258423,
"rewards/multi_component_reward/mean": 0.3200000122189522,
"rewards/multi_component_reward/std": 0.6258516371250152,
"rewards/no_answer_rate/mean": 0.16666666865348817,
"rewards/no_answer_rate/std": 0.2128240704536438,
"rewards/repetition_rate/mean": 0.03333333432674408,
"rewards/repetition_rate/std": 0.08164966106414795,
"sampling/importance_sampling_ratio/max": 0.08939264337532223,
"sampling/importance_sampling_ratio/mean": 0.030286337621510027,
"sampling/importance_sampling_ratio/min": 0.0005470938450967466,
"sampling/sampling_logp_difference/max": 0.34738160371780397,
"sampling/sampling_logp_difference/mean": 0.010354180075228214,
"step": 310,
"step_time": 36.12143337074667
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.3333333432674408,
"completions/max_length": 1634.2,
"completions/max_terminated_length": 1091.4,
"completions/mean_length": 1218.6333618164062,
"completions/mean_terminated_length": 858.4666870117187,
"completions/min_length": 677.6,
"completions/min_terminated_length": 677.6,
"entropy": 0.17260866140325865,
"epoch": 0.11475409836065574,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.05682944133877754,
"kl": 0.0005160604792763479,
"learning_rate": 4.6606595092024536e-07,
"loss": -0.006619427353143692,
"num_tokens": 2088659.0,
"reward": 2.090000092983246,
"reward_std": 1.1096696853637695,
"rewards/boxed_rate/mean": 0.6666666716337204,
"rewards/boxed_rate/std": 0.29447373151779177,
"rewards/correctness/mean": 0.566666665673256,
"rewards/correctness/std": 0.451508092880249,
"rewards/multi_component_reward/mean": 0.523333391547203,
"rewards/multi_component_reward/std": 0.6607022762298584,
"rewards/no_answer_rate/mean": 0.33333333134651183,
"rewards/no_answer_rate/std": 0.29447373151779177,
"rewards/repetition_rate/mean": 0.0,
"rewards/repetition_rate/std": 0.0,
"sampling/importance_sampling_ratio/max": 0.05110827875323594,
"sampling/importance_sampling_ratio/mean": 0.018865692079998553,
"sampling/importance_sampling_ratio/min": 0.0009169460441439363,
"sampling/sampling_logp_difference/max": 0.3731674671173096,
"sampling/sampling_logp_difference/mean": 0.01166527420282364,
"step": 315,
"step_time": 38.15295873656869
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.10000000298023223,
"completions/max_length": 1484.2,
"completions/max_terminated_length": 1039.2,
"completions/mean_length": 931.7000244140625,
"completions/mean_terminated_length": 790.5233520507812,
"completions/min_length": 563.4,
"completions/min_terminated_length": 563.4,
"entropy": 0.143128818521897,
"epoch": 0.11657559198542805,
"frac_reward_zero_std": 0.6,
"grad_norm": 0.005422326270490885,
"kl": 0.0004371852917150439,
"learning_rate": 4.6510736196319015e-07,
"loss": 0.004143273830413819,
"num_tokens": 2119250.0,
"reward": 1.5300001502037048,
"reward_std": 0.24947773814201354,
"rewards/boxed_rate/mean": 0.9,
"rewards/boxed_rate/std": 0.18492921590805053,
"rewards/correctness/mean": 0.23333333432674408,
"rewards/correctness/std": 0.08164966106414795,
"rewards/multi_component_reward/mean": 0.29666668176651,
"rewards/multi_component_reward/std": 0.17254199385643004,
"rewards/no_answer_rate/mean": 0.10000000298023223,
"rewards/no_answer_rate/std": 0.18492921590805053,
"rewards/repetition_rate/mean": 0.0,
"rewards/repetition_rate/std": 0.0,
"sampling/importance_sampling_ratio/max": 0.12071083066985011,
"sampling/importance_sampling_ratio/mean": 0.05540300235152244,
"sampling/importance_sampling_ratio/min": 0.013269457093102949,
"sampling/sampling_logp_difference/max": 0.36359539031982424,
"sampling/sampling_logp_difference/mean": 0.009230617992579937,
"step": 320,
"step_time": 34.49489349294454
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.16666666865348817,
"completions/max_length": 1798.0,
"completions/max_terminated_length": 1537.6,
"completions/mean_length": 1144.8333618164063,
"completions/mean_terminated_length": 965.6733520507812,
"completions/min_length": 634.0,
"completions/min_terminated_length": 634.0,
"entropy": 0.18388984675208728,
"epoch": 0.11839708561020036,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.14283621311187744,
"kl": 0.0006492346903542057,
"learning_rate": 4.64148773006135e-07,
"loss": -0.0025676295161247255,
"num_tokens": 2157267.0,
"reward": 1.630000066757202,
"reward_std": 0.89271479845047,
"rewards/boxed_rate/mean": 0.8333333253860473,
"rewards/boxed_rate/std": 0.2728438377380371,
"rewards/correctness/mean": 0.3000000059604645,
"rewards/correctness/std": 0.3761233925819397,
"rewards/multi_component_reward/mean": 0.3300000101327896,
"rewards/multi_component_reward/std": 0.5265865564346314,
"rewards/no_answer_rate/mean": 0.16666666865348817,
"rewards/no_answer_rate/std": 0.2728438377380371,
"rewards/repetition_rate/mean": 0.0,
"rewards/repetition_rate/std": 0.0,
"sampling/importance_sampling_ratio/max": 0.04229360092431307,
"sampling/importance_sampling_ratio/mean": 0.019075372768566013,
"sampling/importance_sampling_ratio/min": 8.805350876674933e-05,
"sampling/sampling_logp_difference/max": 0.3247460603713989,
"sampling/sampling_logp_difference/mean": 0.01177409002557397,
"step": 325,
"step_time": 41.04225417692214
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 823.4,
"completions/max_terminated_length": 823.4,
"completions/mean_length": 544.1333435058593,
"completions/mean_terminated_length": 544.1333435058593,
"completions/min_length": 366.0,
"completions/min_terminated_length": 366.0,
"entropy": 0.22606497158606847,
"epoch": 0.12021857923497267,
"frac_reward_zero_std": 0.2,
"grad_norm": 0.9119893908500671,
"kl": 0.0007021349136872838,
"learning_rate": 4.631901840490797e-07,
"loss": -0.006102682277560234,
"num_tokens": 2175259.0,
"reward": 1.8700001001358033,
"reward_std": 0.8484382510185242,
"rewards/boxed_rate/mean": 1.0,
"rewards/boxed_rate/std": 0.0,
"rewards/correctness/mean": 0.3666666716337204,
"rewards/correctness/std": 0.404018247127533,
"rewards/multi_component_reward/mean": 0.503333343565464,
"rewards/multi_component_reward/std": 0.44442008137702943,
"rewards/no_answer_rate/mean": 0.0,
"rewards/no_answer_rate/std": 0.0,
"rewards/repetition_rate/mean": 0.0,
"rewards/repetition_rate/std": 0.0,
"sampling/importance_sampling_ratio/max": 0.13480582600459456,
"sampling/importance_sampling_ratio/mean": 0.07863845487590879,
"sampling/importance_sampling_ratio/min": 0.01574344330747408,
"sampling/sampling_logp_difference/max": 0.3204054594039917,
"sampling/sampling_logp_difference/mean": 0.014066471531987191,
"step": 330,
"step_time": 20.059878361225127
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.06666666865348816,
"completions/max_length": 1395.2,
"completions/max_terminated_length": 910.6,
"completions/mean_length": 742.1333679199219,
"completions/mean_terminated_length": 640.480029296875,
"completions/min_length": 509.6,
"completions/min_terminated_length": 509.6,
"entropy": 0.2093804210424423,
"epoch": 0.122040072859745,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.028569765388965607,
"kl": 0.0006724427085525046,
"learning_rate": 4.622315950920245e-07,
"loss": -0.0013746816664934158,
"num_tokens": 2200067.0,
"reward": 1.6200001239776611,
"reward_std": 0.8776960492134094,
"rewards/boxed_rate/mean": 0.9333333253860474,
"rewards/boxed_rate/std": 0.1632993221282959,
"rewards/correctness/mean": 0.26666666865348815,
"rewards/correctness/std": 0.3823883533477783,
"rewards/multi_component_reward/mean": 0.3533333420753479,
"rewards/multi_component_reward/std": 0.49811467826366423,
"rewards/no_answer_rate/mean": 0.06666666865348816,
"rewards/no_answer_rate/std": 0.1632993221282959,
"rewards/repetition_rate/mean": 0.0,
"rewards/repetition_rate/std": 0.0,
"sampling/importance_sampling_ratio/max": 0.07016745954751968,
"sampling/importance_sampling_ratio/mean": 0.027400266751646994,
"sampling/importance_sampling_ratio/min": 0.0014501588469148616,
"sampling/sampling_logp_difference/max": 0.36046857833862306,
"sampling/sampling_logp_difference/mean": 0.014280413649976254,
"step": 335,
"step_time": 31.59101020414382
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.20000000596046447,
"completions/max_length": 1939.2,
"completions/max_terminated_length": 988.8,
"completions/mean_length": 976.86669921875,
"completions/mean_terminated_length": 681.2466857910156,
"completions/min_length": 446.6,
"completions/min_terminated_length": 446.6,
"entropy": 0.2083836982647578,
"epoch": 0.12386156648451731,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.06303755939006805,
"kl": 0.0005712434819239812,
"learning_rate": 4.612730061349693e-07,
"loss": -0.0044234395027160645,
"num_tokens": 2232145.0,
"reward": 1.74333336353302,
"reward_std": 0.9833520889282227,
"rewards/boxed_rate/mean": 0.8333333253860473,
"rewards/boxed_rate/std": 0.34822853803634646,
"rewards/correctness/mean": 0.33333334028720857,
"rewards/correctness/std": 0.3977532863616943,
"rewards/multi_component_reward/mean": 0.3433333486318588,
"rewards/multi_component_reward/std": 0.6024900764226914,
"rewards/no_answer_rate/mean": 0.1666666716337204,
"rewards/no_answer_rate/std": 0.34822853803634646,
"rewards/repetition_rate/mean": 0.06666666865348816,
"rewards/repetition_rate/std": 0.1632993221282959,
"sampling/importance_sampling_ratio/max": 0.1221102561801672,
"sampling/importance_sampling_ratio/mean": 0.029469438176602126,
"sampling/importance_sampling_ratio/min": 5.1707089343915565e-05,
"sampling/sampling_logp_difference/max": 0.3329290866851807,
"sampling/sampling_logp_difference/mean": 0.012941631115972995,
"step": 340,
"step_time": 43.22656154055149
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.1,
"completions/max_length": 1163.2,
"completions/max_terminated_length": 1144.4,
"completions/mean_length": 778.86669921875,
"completions/mean_terminated_length": 688.9000244140625,
"completions/min_length": 431.8,
"completions/min_terminated_length": 431.8,
"entropy": 0.1657411351799965,
"epoch": 0.12568306010928962,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.1018831878900528,
"kl": 0.0005899470415897667,
"learning_rate": 4.603144171779141e-07,
"loss": -0.006323814392089844,
"num_tokens": 2259027.0,
"reward": 1.5300001144409179,
"reward_std": 0.9171831011772156,
"rewards/boxed_rate/mean": 0.9,
"rewards/boxed_rate/std": 0.10954451560974121,
"rewards/correctness/mean": 0.20000000596046447,
"rewards/correctness/std": 0.4298781991004944,
"rewards/multi_component_reward/mean": 0.23000000715255736,
"rewards/multi_component_reward/std": 0.5341363847255707,
"rewards/no_answer_rate/mean": 0.1,
"rewards/no_answer_rate/std": 0.10954451560974121,
"rewards/repetition_rate/mean": 0.1,
"rewards/repetition_rate/std": 0.10954451560974121,
"sampling/importance_sampling_ratio/max": 0.12837026529014112,
"sampling/importance_sampling_ratio/mean": 0.05263339746743441,
"sampling/importance_sampling_ratio/min": 0.0009968833521444183,
"sampling/sampling_logp_difference/max": 0.31432443857192993,
"sampling/sampling_logp_difference/mean": 0.011136513762176036,
"step": 345,
"step_time": 27.575114846229553
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.1666666716337204,
"completions/max_length": 1772.4,
"completions/max_terminated_length": 1112.4,
"completions/mean_length": 983.0333618164062,
"completions/mean_terminated_length": 776.57333984375,
"completions/min_length": 516.6,
"completions/min_terminated_length": 516.6,
"entropy": 0.18533113847176233,
"epoch": 0.12750455373406194,
"frac_reward_zero_std": 0.2,
"grad_norm": 0.1743193119764328,
"kl": 0.0007398304010469777,
"learning_rate": 4.593558282208589e-07,
"loss": 0.0035149652510881426,
"num_tokens": 2292724.0,
"reward": 1.5366667270660401,
"reward_std": 0.7906980156898499,
"rewards/boxed_rate/mean": 0.8333333253860473,
"rewards/boxed_rate/std": 0.34822853803634646,
"rewards/correctness/mean": 0.23333334028720856,
"rewards/correctness/std": 0.34822853803634646,
"rewards/multi_component_reward/mean": 0.2366666778922081,
"rewards/multi_component_reward/std": 0.52751624584198,
"rewards/no_answer_rate/mean": 0.1666666716337204,
"rewards/no_answer_rate/std": 0.34822853803634646,
"rewards/repetition_rate/mean": 0.06666666865348816,
"rewards/repetition_rate/std": 0.1632993221282959,
"sampling/importance_sampling_ratio/max": 0.16946832239627838,
"sampling/importance_sampling_ratio/mean": 0.04226875379681587,
"sampling/importance_sampling_ratio/min": 0.00029637051690457807,
"sampling/sampling_logp_difference/max": 0.34900493621826173,
"sampling/sampling_logp_difference/mean": 0.011652881279587746,
"step": 350,
"step_time": 40.440953285992144
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03333333432674408,
"completions/max_length": 1092.8,
"completions/max_terminated_length": 992.2,
"completions/mean_length": 720.4666931152344,
"completions/mean_terminated_length": 686.5133483886718,
"completions/min_length": 467.2,
"completions/min_terminated_length": 467.2,
"entropy": 0.1794678675631682,
"epoch": 0.12932604735883424,
"frac_reward_zero_std": 0.2,
"grad_norm": 0.020413219928741455,
"kl": 0.0006120980954923046,
"learning_rate": 4.583972392638036e-07,
"loss": -0.0014667985960841179,
"num_tokens": 2316852.0,
"reward": 2.060000109672546,
"reward_std": 0.7259637594223023,
"rewards/boxed_rate/mean": 0.9666666626930237,
"rewards/boxed_rate/std": 0.08164966106414795,
"rewards/correctness/mean": 0.46666666865348816,
"rewards/correctness/std": 0.32236858606338503,
"rewards/multi_component_reward/mean": 0.59333336353302,
"rewards/multi_component_reward/std": 0.4035952419042587,
"rewards/no_answer_rate/mean": 0.03333333432674408,
"rewards/no_answer_rate/std": 0.08164966106414795,
"rewards/repetition_rate/mean": 0.0,
"rewards/repetition_rate/std": 0.0,
"sampling/importance_sampling_ratio/max": 0.14755254588089883,
"sampling/importance_sampling_ratio/mean": 0.07640535607933999,
"sampling/importance_sampling_ratio/min": 0.03205070712427443,
"sampling/sampling_logp_difference/max": 0.3078671216964722,
"sampling/sampling_logp_difference/mean": 0.011428364552557468,
"step": 355,
"step_time": 26.221613752283154
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.13333333730697633,
"completions/max_length": 1573.4,
"completions/max_terminated_length": 1007.8,
"completions/mean_length": 839.6000122070312,
"completions/mean_terminated_length": 670.27333984375,
"completions/min_length": 423.0,
"completions/min_terminated_length": 423.0,
"entropy": 0.2592687545965115,
"epoch": 0.13114754098360656,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.21311414241790771,
"kl": 0.0006711701275586772,
"learning_rate": 4.5743865030674845e-07,
"loss": -0.0009625215083360672,
"num_tokens": 2345424.0,
"reward": 1.6000000953674316,
"reward_std": 0.7632826209068299,
"rewards/boxed_rate/mean": 0.9,
"rewards/boxed_rate/std": 0.18492921590805053,
"rewards/correctness/mean": 0.2666666656732559,
"rewards/correctness/std": 0.3265986442565918,
"rewards/multi_component_reward/mean": 0.33333335518836976,
"rewards/multi_component_reward/std": 0.44263782203197477,
"rewards/no_answer_rate/mean": 0.10000000298023223,
"rewards/no_answer_rate/std": 0.18492921590805053,
"rewards/repetition_rate/mean": 0.0,
"rewards/repetition_rate/std": 0.0,
"sampling/importance_sampling_ratio/max": 0.06810822673141956,
"sampling/importance_sampling_ratio/mean": 0.026497452543117105,
"sampling/importance_sampling_ratio/min": 0.0005043454068991798,
"sampling/sampling_logp_difference/max": 0.46866347789764407,
"sampling/sampling_logp_difference/mean": 0.01661387998610735,
"step": 360,
"step_time": 35.86930139716715
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.23333334028720856,
"completions/max_length": 1906.6,
"completions/max_terminated_length": 1405.8,
"completions/mean_length": 1317.56669921875,
"completions/mean_terminated_length": 1097.8466796875,
"completions/min_length": 801.0,
"completions/min_terminated_length": 801.0,
"entropy": 0.11761416780451933,
"epoch": 0.13296903460837886,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.015462023206055164,
"kl": 0.0003710240945414019,
"learning_rate": 4.5648006134969324e-07,
"loss": -0.007586926966905594,
"num_tokens": 2388119.0,
"reward": 1.7300000667572022,
"reward_std": 1.0704036235809327,
"rewards/boxed_rate/mean": 0.7666666686534882,
"rewards/boxed_rate/std": 0.2882087707519531,
"rewards/correctness/mean": 0.33333333730697634,
"rewards/correctness/std": 0.48566790819168093,
"rewards/multi_component_reward/mean": 0.2966666899621487,
"rewards/multi_component_reward/std": 0.6780295848846436,
"rewards/no_answer_rate/mean": 0.23333334028720856,
"rewards/no_answer_rate/std": 0.2882087707519531,
"rewards/repetition_rate/mean": 0.10000000298023223,
"rewards/repetition_rate/std": 0.24494898319244385,
"sampling/importance_sampling_ratio/max": 0.04545931427273899,
"sampling/importance_sampling_ratio/mean": 0.02320328297209926,
"sampling/importance_sampling_ratio/min": 0.001607318179527617,
"sampling/sampling_logp_difference/max": 0.35903769731521606,
"sampling/sampling_logp_difference/mean": 0.007444971334189176,
"step": 365,
"step_time": 43.916714335232975
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.13333333730697633,
"completions/max_length": 1418.0,
"completions/max_terminated_length": 1135.4,
"completions/mean_length": 861.5666931152343,
"completions/mean_terminated_length": 710.4833435058594,
"completions/min_length": 346.6,
"completions/min_terminated_length": 346.6,
"entropy": 0.22315610001484554,
"epoch": 0.13479052823315119,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.7706270217895508,
"kl": 0.0008742218895349651,
"learning_rate": 4.55521472392638e-07,
"loss": 0.003686445206403732,
"num_tokens": 2416660.0,
"reward": 1.5800000548362731,
"reward_std": 0.8671060860157013,
"rewards/boxed_rate/mean": 0.8666666746139526,
"rewards/boxed_rate/std": 0.20655910968780516,
"rewards/correctness/mean": 0.26666667461395266,
"rewards/correctness/std": 0.36985843181610106,
"rewards/multi_component_reward/mean": 0.31333333253860474,
"rewards/multi_component_reward/std": 0.5032015323638916,
"rewards/no_answer_rate/mean": 0.13333333730697633,
"rewards/no_answer_rate/std": 0.20655910968780516,
"rewards/repetition_rate/mean": 0.0,
"rewards/repetition_rate/std": 0.0,
"sampling/importance_sampling_ratio/max": 0.20161812007427216,
"sampling/importance_sampling_ratio/mean": 0.09829787239432335,
"sampling/importance_sampling_ratio/min": 0.056043115374653285,
"sampling/sampling_logp_difference/max": 0.35296247005462644,
"sampling/sampling_logp_difference/mean": 0.014671148918569087,
"step": 370,
"step_time": 33.350730242021385
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.13333333432674407,
"completions/max_length": 1741.8,
"completions/max_terminated_length": 1278.2,
"completions/mean_length": 958.4000366210937,
"completions/mean_terminated_length": 774.8600219726562,
"completions/min_length": 536.0,
"completions/min_terminated_length": 536.0,
"entropy": 0.19409313425421715,
"epoch": 0.1366120218579235,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.10258854180574417,
"kl": 0.0005339390016160905,
"learning_rate": 4.545628834355828e-07,
"loss": 0.0006846859585493803,
"num_tokens": 2448526.0,
"reward": 1.6966667890548706,
"reward_std": 1.038489294052124,
"rewards/boxed_rate/mean": 0.8666666626930237,
"rewards/boxed_rate/std": 0.19119417667388916,
"rewards/correctness/mean": 0.3000000059604645,
"rewards/correctness/std": 0.4794029474258423,
"rewards/multi_component_reward/mean": 0.3300000175833702,
"rewards/multi_component_reward/std": 0.6378163933753968,
"rewards/no_answer_rate/mean": 0.13333333432674407,
"rewards/no_answer_rate/std": 0.19119417667388916,
"rewards/repetition_rate/mean": 0.06666666865348816,
"rewards/repetition_rate/std": 0.1632993221282959,
"sampling/importance_sampling_ratio/max": 0.023186815530061723,
"sampling/importance_sampling_ratio/mean": 0.013102744333446026,
"sampling/importance_sampling_ratio/min": 0.00010366742491344506,
"sampling/sampling_logp_difference/max": 0.3774606704711914,
"sampling/sampling_logp_difference/mean": 0.011916628293693065,
"step": 375,
"step_time": 30.433830817975103
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.23333334028720856,
"completions/max_length": 1764.6,
"completions/max_terminated_length": 1185.0,
"completions/mean_length": 1108.233349609375,
"completions/mean_terminated_length": 801.3400024414062,
"completions/min_length": 541.8,
"completions/min_terminated_length": 541.8,
"entropy": 0.25079266702135405,
"epoch": 0.1384335154826958,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.015607405453920364,
"kl": 0.000638273500953801,
"learning_rate": 4.536042944785276e-07,
"loss": -0.004855351522564888,
"num_tokens": 2484623.0,
"reward": 1.6300000548362732,
"reward_std": 0.6760274231433868,
"rewards/boxed_rate/mean": 0.799999988079071,
"rewards/boxed_rate/std": 0.35449349880218506,
"rewards/correctness/mean": 0.2666666656732559,
"rewards/correctness/std": 0.2665788769721985,
"rewards/multi_component_reward/mean": 0.23000002317130566,
"rewards/multi_component_reward/std": 0.4806109189987183,
"rewards/no_answer_rate/mean": 0.20000000298023224,
"rewards/no_answer_rate/std": 0.35449349880218506,
"rewards/repetition_rate/mean": 0.13333333432674407,
"rewards/repetition_rate/std": 0.19119417667388916,
"sampling/importance_sampling_ratio/max": 0.030251298751682042,
"sampling/importance_sampling_ratio/mean": 0.01690196809358895,
"sampling/importance_sampling_ratio/min": 0.0027151843596084375,
"sampling/sampling_logp_difference/max": 0.3498866558074951,
"sampling/sampling_logp_difference/mean": 0.016159016452729703,
"step": 380,
"step_time": 15.216538714803756
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.20000000596046447,
"completions/max_length": 1980.6,
"completions/max_terminated_length": 1102.2,
"completions/mean_length": 1102.2000122070312,
"completions/mean_terminated_length": 820.3700073242187,
"completions/min_length": 554.4,
"completions/min_terminated_length": 554.4,
"entropy": 0.21501451556881268,
"epoch": 0.14025500910746813,
"frac_reward_zero_std": 0.2,
"grad_norm": 0.039810437709093094,
"kl": 0.0005428103109200795,
"learning_rate": 4.5264570552147234e-07,
"loss": -0.001014799065887928,
"num_tokens": 2521715.0,
"reward": 1.33000009059906,
"reward_std": 0.713744330406189,
"rewards/boxed_rate/mean": 0.8,
"rewards/boxed_rate/std": 0.36985843181610106,
"rewards/correctness/mean": 0.16666666865348817,
"rewards/correctness/std": 0.2728438377380371,
"rewards/multi_component_reward/mean": 0.16333335041999816,
"rewards/multi_component_reward/std": 0.4538270115852356,
"rewards/no_answer_rate/mean": 0.20000000596046447,
"rewards/no_answer_rate/std": 0.36985843181610106,
"rewards/repetition_rate/mean": 0.0,
"rewards/repetition_rate/std": 0.0,
"sampling/importance_sampling_ratio/max": 0.050251276697963476,
"sampling/importance_sampling_ratio/mean": 0.014141232566908002,
"sampling/importance_sampling_ratio/min": 1.2361302467010191e-05,
"sampling/sampling_logp_difference/max": 0.30763993263244627,
"sampling/sampling_logp_difference/mean": 0.014410833641886712,
"step": 385,
"step_time": 16.672282354161144
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.33333333730697634,
"completions/max_length": 1734.4,
"completions/max_terminated_length": 1527.6,
"completions/mean_length": 1290.500048828125,
"completions/mean_terminated_length": 1057.36669921875,
"completions/min_length": 721.2,
"completions/min_terminated_length": 721.2,
"entropy": 0.17866969083746273,
"epoch": 0.14207650273224043,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.009094689041376114,
"kl": 0.0004922606613642226,
"learning_rate": 4.5168711656441713e-07,
"loss": -0.0016124360263347626,
"num_tokens": 2563196.0,
"reward": 1.5066667079925538,
"reward_std": 0.9714008927345276,
"rewards/boxed_rate/mean": 0.6666666686534881,
"rewards/boxed_rate/std": 0.32236858606338503,
"rewards/correctness/mean": 0.26666667461395266,
"rewards/correctness/std": 0.36985843181610106,
"rewards/multi_component_reward/mean": 0.17333333306014537,
"rewards/multi_component_reward/std": 0.564999234676361,
"rewards/no_answer_rate/mean": 0.33333333730697634,
"rewards/no_answer_rate/std": 0.32236858606338503,
"rewards/repetition_rate/mean": 0.06666666865348816,
"rewards/repetition_rate/std": 0.10327955484390258,
"sampling/importance_sampling_ratio/max": 0.09146881643682718,
"sampling/importance_sampling_ratio/mean": 0.027744953380897643,
"sampling/importance_sampling_ratio/min": 0.0008299978789871564,
"sampling/sampling_logp_difference/max": 0.44667046070098876,
"sampling/sampling_logp_difference/mean": 0.011561266146600246,
"step": 390,
"step_time": 15.62863542791456
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.16666666865348817,
"completions/max_length": 1713.0,
"completions/max_terminated_length": 1077.4,
"completions/mean_length": 995.9333801269531,
"completions/mean_terminated_length": 786.106689453125,
"completions/min_length": 546.4,
"completions/min_terminated_length": 546.4,
"entropy": 0.1970411961277326,
"epoch": 0.14389799635701275,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.018740270286798477,
"kl": 0.0007653967642302935,
"learning_rate": 4.50728527607362e-07,
"loss": 0.005758083239197731,
"num_tokens": 2595858.0,
"reward": 1.4666666984558105,
"reward_std": 0.6528747193515301,
"rewards/boxed_rate/mean": 0.8333333253860473,
"rewards/boxed_rate/std": 0.2728438377380371,
"rewards/correctness/mean": 0.20000000596046447,
"rewards/correctness/std": 0.2665788769721985,
"rewards/multi_component_reward/mean": 0.20000000223517417,
"rewards/multi_component_reward/std": 0.46670507490634916,
"rewards/no_answer_rate/mean": 0.16666666865348817,
"rewards/no_answer_rate/std": 0.2728438377380371,
"rewards/repetition_rate/mean": 0.06666666865348816,
"rewards/repetition_rate/std": 0.1632993221282959,
"sampling/importance_sampling_ratio/max": 0.09698612228967249,
"sampling/importance_sampling_ratio/mean": 0.030241910181939603,
"sampling/importance_sampling_ratio/min": 0.0021489314406773754,
"sampling/sampling_logp_difference/max": 0.34886856079101564,
"sampling/sampling_logp_difference/mean": 0.012738966010510921,
"step": 395,
"step_time": 14.849660068005324
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.16666666865348817,
"completions/max_length": 1575.0,
"completions/max_terminated_length": 948.8,
"completions/mean_length": 935.7000366210938,
"completions/mean_terminated_length": 702.1866821289062,
"completions/min_length": 482.0,
"completions/min_terminated_length": 482.0,
"entropy": 0.2041997093707323,
"epoch": 0.14571948998178508,
"frac_reward_zero_std": 0.2,
"grad_norm": 0.10930619388818741,
"kl": 0.0005768983881959381,
"learning_rate": 4.497699386503067e-07,
"loss": -0.004532752931118012,
"num_tokens": 2626437.0,
"reward": 1.7466667890548706,
"reward_std": 0.8040140867233276,
"rewards/boxed_rate/mean": 0.8333333253860473,
"rewards/boxed_rate/std": 0.2728438377380371,
"rewards/correctness/mean": 0.33333333134651183,
"rewards/correctness/std": 0.35449349880218506,
"rewards/multi_component_reward/mean": 0.34666669368743896,
"rewards/multi_component_reward/std": 0.5015990376472473,
"rewards/no_answer_rate/mean": 0.16666666865348817,
"rewards/no_answer_rate/std": 0.2728438377380371,
"rewards/repetition_rate/mean": 0.06666666865348816,
"rewards/repetition_rate/std": 0.10327955484390258,
"sampling/importance_sampling_ratio/max": 0.09340296909213067,
"sampling/importance_sampling_ratio/mean": 0.03166428431868553,
"sampling/importance_sampling_ratio/min": 0.007079391362589009,
"sampling/sampling_logp_difference/max": 0.4040722370147705,
"sampling/sampling_logp_difference/mean": 0.013007087912410498,
"step": 400,
"step_time": 13.86912056710571
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.13333333730697633,
"completions/max_length": 1673.6,
"completions/max_terminated_length": 1493.4,
"completions/mean_length": 1166.2000366210937,
"completions/mean_terminated_length": 1052.9500122070312,
"completions/min_length": 725.0,
"completions/min_terminated_length": 725.0,
"entropy": 0.1420179379483064,
"epoch": 0.14754098360655737,
"frac_reward_zero_std": 0.4,
"grad_norm": 0.021460620686411858,
"kl": 0.0005037313070109425,
"learning_rate": 4.488113496932515e-07,
"loss": -0.0034239646047353745,
"num_tokens": 2665191.0,
"reward": 1.6500000953674316,
"reward_std": 0.5699952363967895,
"rewards/boxed_rate/mean": 0.8666666746139526,
"rewards/boxed_rate/std": 0.20655910968780516,
"rewards/correctness/mean": 0.2666666656732559,
"rewards/correctness/std": 0.2665788769721985,
"rewards/multi_component_reward/mean": 0.2833333633840084,
"rewards/multi_component_reward/std": 0.40620826482772826,
"rewards/no_answer_rate/mean": 0.13333333730697633,
"rewards/no_answer_rate/std": 0.20655910968780516,
"rewards/repetition_rate/mean": 0.10000000298023223,
"rewards/repetition_rate/std": 0.18492921590805053,
"sampling/importance_sampling_ratio/max": 0.047187263565137985,
"sampling/importance_sampling_ratio/mean": 0.016343344282358886,
"sampling/importance_sampling_ratio/min": 0.0005224805446630398,
"sampling/sampling_logp_difference/max": 0.33698827028274536,
"sampling/sampling_logp_difference/mean": 0.009091775678098202,
"step": 405,
"step_time": 14.932769652456045
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.2333333373069763,
"completions/max_length": 1674.6,
"completions/max_terminated_length": 1293.0,
"completions/mean_length": 1194.5333679199218,
"completions/mean_terminated_length": 945.5333374023437,
"completions/min_length": 665.2,
"completions/min_terminated_length": 665.2,
"entropy": 0.19754986638824146,
"epoch": 0.1493624772313297,
"frac_reward_zero_std": 0.4,
"grad_norm": 0.12479820102453232,
"kl": 0.000494376394393233,
"learning_rate": 4.4785276073619634e-07,
"loss": -0.0048382353037595745,
"num_tokens": 2707231.0,
"reward": 1.240000104904175,
"reward_std": 0.5429712414741517,
"rewards/boxed_rate/mean": 0.7666666746139527,
"rewards/boxed_rate/std": 0.3161036252975464,
"rewards/correctness/mean": 0.13333333432674407,
"rewards/correctness/std": 0.19119417667388916,
"rewards/multi_component_reward/mean": 0.10666667819023132,
"rewards/multi_component_reward/std": 0.35998966693878176,
"rewards/no_answer_rate/mean": 0.2333333373069763,
"rewards/no_answer_rate/std": 0.3161036252975464,
"rewards/repetition_rate/mean": 0.0,
"rewards/repetition_rate/std": 0.0,
"sampling/importance_sampling_ratio/max": 0.10651108641177416,
"sampling/importance_sampling_ratio/mean": 0.03249359540641308,
"sampling/importance_sampling_ratio/min": 2.793159140067336e-05,
"sampling/sampling_logp_difference/max": 0.34277451038360596,
"sampling/sampling_logp_difference/mean": 0.01319133285433054,
"step": 410,
"step_time": 15.258313446864486
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.20000000596046447,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1373.4,
"completions/mean_length": 1174.6666870117188,
"completions/mean_terminated_length": 982.5400207519531,
"completions/min_length": 717.8,
"completions/min_terminated_length": 717.8,
"entropy": 0.17395039709905782,
"epoch": 0.151183970856102,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.1571768969297409,
"kl": 0.0006294176423883376,
"learning_rate": 4.468941717791411e-07,
"loss": -0.013487279415130615,
"num_tokens": 2744805.0,
"reward": 1.6100000858306884,
"reward_std": 0.9736106634140015,
"rewards/boxed_rate/mean": 0.799999988079071,
"rewards/boxed_rate/std": 0.4298781991004944,
"rewards/correctness/mean": 0.3000000089406967,
"rewards/correctness/std": 0.3914883255958557,
"rewards/multi_component_reward/mean": 0.31000002175569535,
"rewards/multi_component_reward/std": 0.5971025705337525,
"rewards/no_answer_rate/mean": 0.20000000596046447,
"rewards/no_answer_rate/std": 0.4298781991004944,
"rewards/repetition_rate/mean": 0.0,
"rewards/repetition_rate/std": 0.0,
"sampling/importance_sampling_ratio/max": 0.09501664619892836,
"sampling/importance_sampling_ratio/mean": 0.02351754056289792,
"sampling/importance_sampling_ratio/min": 1.3345446579433804e-05,
"sampling/sampling_logp_difference/max": 0.35430474281311036,
"sampling/sampling_logp_difference/mean": 0.012131126876920462,
"step": 415,
"step_time": 17.18821715861559
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.10000000298023223,
"completions/max_length": 1233.4,
"completions/max_terminated_length": 934.0,
"completions/mean_length": 838.5667053222656,
"completions/mean_terminated_length": 744.7300109863281,
"completions/min_length": 594.4,
"completions/min_terminated_length": 594.4,
"entropy": 0.17896714533368746,
"epoch": 0.15300546448087432,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.08533581346273422,
"kl": 0.00047911926230881363,
"learning_rate": 4.4593558282208586e-07,
"loss": -0.00437382236123085,
"num_tokens": 2771858.0,
"reward": 1.5300000667572022,
"reward_std": 0.8672845721244812,
"rewards/boxed_rate/mean": 0.9,
"rewards/boxed_rate/std": 0.18492921590805053,
"rewards/correctness/mean": 0.2333333373069763,
"rewards/correctness/std": 0.3761233925819397,
"rewards/multi_component_reward/mean": 0.29666668176651,
"rewards/multi_component_reward/std": 0.49711505472660067,
"rewards/no_answer_rate/mean": 0.10000000298023223,
"rewards/no_answer_rate/std": 0.18492921590805053,
"rewards/repetition_rate/mean": 0.0,
"rewards/repetition_rate/std": 0.0,
"sampling/importance_sampling_ratio/max": 0.07356768772006035,
"sampling/importance_sampling_ratio/mean": 0.026540174335241317,
"sampling/importance_sampling_ratio/min": 0.0027435134909879364,
"sampling/sampling_logp_difference/max": 0.2972338914871216,
"sampling/sampling_logp_difference/mean": 0.011641639284789562,
"step": 420,
"step_time": 11.845613337121904
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.13333333730697633,
"completions/max_length": 1578.6,
"completions/max_terminated_length": 1112.6,
"completions/mean_length": 957.2333740234375,
"completions/mean_terminated_length": 789.5533447265625,
"completions/min_length": 506.8,
"completions/min_terminated_length": 506.8,
"entropy": 0.22455186421672504,
"epoch": 0.15482695810564662,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.011058821342885494,
"kl": 0.0005187941482290625,
"learning_rate": 4.449769938650307e-07,
"loss": -0.0018246347084641457,
"num_tokens": 2803527.0,
"reward": 1.6033333539962769,
"reward_std": 0.8526834607124328,
"rewards/boxed_rate/mean": 0.8666666626930237,
"rewards/boxed_rate/std": 0.2665788769721985,
"rewards/correctness/mean": 0.26666667461395266,
"rewards/correctness/std": 0.36985843181610106,
"rewards/multi_component_reward/mean": 0.3033333346247673,
"rewards/multi_component_reward/std": 0.5273210048675537,
"rewards/no_answer_rate/mean": 0.13333333730697633,
"rewards/no_answer_rate/std": 0.2665788769721985,
"rewards/repetition_rate/mean": 0.03333333432674408,
"rewards/repetition_rate/std": 0.08164966106414795,
"sampling/importance_sampling_ratio/max": 0.040888981963507834,
"sampling/importance_sampling_ratio/mean": 0.012350313179194928,
"sampling/importance_sampling_ratio/min": 4.1122279969854155e-05,
"sampling/sampling_logp_difference/max": 0.3840524673461914,
"sampling/sampling_logp_difference/mean": 0.014351606741547585,
"step": 425,
"step_time": 13.965443047881127
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.16666666865348817,
"completions/max_length": 1702.0,
"completions/max_terminated_length": 1172.4,
"completions/mean_length": 1089.1000244140625,
"completions/mean_terminated_length": 908.7400146484375,
"completions/min_length": 716.8,
"completions/min_terminated_length": 716.8,
"entropy": 0.1683452881872654,
"epoch": 0.15664845173041894,
"frac_reward_zero_std": 0.2,
"grad_norm": 0.014243739657104015,
"kl": 0.0004299223937171822,
"learning_rate": 4.4401840490797544e-07,
"loss": -5.882064579054713e-05,
"num_tokens": 2838744.0,
"reward": 1.5133334517478942,
"reward_std": 0.6114970922470093,
"rewards/boxed_rate/mean": 0.8333333253860473,
"rewards/boxed_rate/std": 0.2728438377380371,
"rewards/correctness/mean": 0.23333333134651185,
"rewards/correctness/std": 0.24494898319244385,
"rewards/multi_component_reward/mean": 0.24666669517755507,
"rewards/multi_component_reward/std": 0.39071274995803834,
"rewards/no_answer_rate/mean": 0.16666666865348817,
"rewards/no_answer_rate/std": 0.2728438377380371,
"rewards/repetition_rate/mean": 0.03333333432674408,
"rewards/repetition_rate/std": 0.08164966106414795,
"sampling/importance_sampling_ratio/max": 0.08568799644708633,
"sampling/importance_sampling_ratio/mean": 0.032788149639964105,
"sampling/importance_sampling_ratio/min": 0.0002142171015298561,
"sampling/sampling_logp_difference/max": 0.3399065971374512,
"sampling/sampling_logp_difference/mean": 0.011240929551422595,
"step": 430,
"step_time": 14.933206058479845
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.2666666716337204,
"completions/max_length": 1818.2,
"completions/max_terminated_length": 1232.2,
"completions/mean_length": 1174.3333618164063,
"completions/mean_terminated_length": 910.4933471679688,
"completions/min_length": 678.2,
"completions/min_terminated_length": 678.2,
"entropy": 0.1602409650882085,
"epoch": 0.15846994535519127,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.07029227167367935,
"kl": 0.00047036608011694623,
"learning_rate": 4.4305981595092023e-07,
"loss": -0.004705151170492172,
"num_tokens": 2877094.0,
"reward": 1.4300000429153443,
"reward_std": 0.7005062401294708,
"rewards/boxed_rate/mean": 0.7333333373069764,
"rewards/boxed_rate/std": 0.3977532863616943,
"rewards/correctness/mean": 0.20000000596046447,
"rewards/correctness/std": 0.2665788769721985,
"rewards/multi_component_reward/mean": 0.13000000715255738,
"rewards/multi_component_reward/std": 0.5194472134113312,
"rewards/no_answer_rate/mean": 0.2666666716337204,
"rewards/no_answer_rate/std": 0.3977532863616943,
"rewards/repetition_rate/mean": 0.10000000298023223,
"rewards/repetition_rate/std": 0.24494898319244385,
"sampling/importance_sampling_ratio/max": 0.13022951409220695,
"sampling/importance_sampling_ratio/mean": 0.05298698227852583,
"sampling/importance_sampling_ratio/min": 9.12893192064404e-05,
"sampling/sampling_logp_difference/max": 0.3845913648605347,
"sampling/sampling_logp_difference/mean": 0.01090236771851778,
"step": 435,
"step_time": 15.83292675986886
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.06666666865348816,
"completions/max_length": 1450.6,
"completions/max_terminated_length": 1089.0,
"completions/mean_length": 925.2000366210938,
"completions/mean_terminated_length": 851.3733581542969,
"completions/min_length": 552.8,
"completions/min_terminated_length": 552.8,
"entropy": 0.19520388320088386,
"epoch": 0.16029143897996356,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.11756877601146698,
"kl": 0.0004913257944281213,
"learning_rate": 4.4210122699386497e-07,
"loss": -0.009752783924341202,
"num_tokens": 2907568.0,
"reward": 1.783333420753479,
"reward_std": 0.8172009825706482,
"rewards/boxed_rate/mean": 0.9333333253860474,
"rewards/boxed_rate/std": 0.1632993221282959,
"rewards/correctness/mean": 0.33333333134651183,
"rewards/correctness/std": 0.35449349880218506,
"rewards/multi_component_reward/mean": 0.41666669249534605,
"rewards/multi_component_reward/std": 0.4753227561712265,
"rewards/no_answer_rate/mean": 0.06666666865348816,
"rewards/no_answer_rate/std": 0.1632993221282959,
"rewards/repetition_rate/mean": 0.03333333432674408,
"rewards/repetition_rate/std": 0.08164966106414795,
"sampling/importance_sampling_ratio/max": 0.07866446217522025,
"sampling/importance_sampling_ratio/mean": 0.03093269495293498,
"sampling/importance_sampling_ratio/min": 0.002900865976889124,
"sampling/sampling_logp_difference/max": 0.3607002258300781,
"sampling/sampling_logp_difference/mean": 0.01270847897976637,
"step": 440,
"step_time": 13.279750570654869
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03333333432674408,
"completions/max_length": 1239.0,
"completions/max_terminated_length": 991.2,
"completions/mean_length": 793.8333618164063,
"completions/mean_terminated_length": 740.3466857910156,
"completions/min_length": 537.4,
"completions/min_terminated_length": 537.4,
"entropy": 0.172708131869634,
"epoch": 0.1621129326047359,
"frac_reward_zero_std": 0.4,
"grad_norm": 0.1338849663734436,
"kl": 0.0005728138513707866,
"learning_rate": 4.411426380368098e-07,
"loss": -0.0031532272696495056,
"num_tokens": 2937617.0,
"reward": 1.6400001287460326,
"reward_std": 0.6522666096687317,
"rewards/boxed_rate/mean": 0.9666666626930237,
"rewards/boxed_rate/std": 0.08164966106414795,
"rewards/correctness/mean": 0.2666666716337204,
"rewards/correctness/std": 0.29447373151779177,
"rewards/multi_component_reward/mean": 0.37333334386348727,
"rewards/multi_component_reward/std": 0.35980162024497986,
"rewards/no_answer_rate/mean": 0.03333333432674408,
"rewards/no_answer_rate/std": 0.08164966106414795,
"rewards/repetition_rate/mean": 0.0,
"rewards/repetition_rate/std": 0.0,
"sampling/importance_sampling_ratio/max": 0.07935073487460613,
"sampling/importance_sampling_ratio/mean": 0.024918343313038348,
"sampling/importance_sampling_ratio/min": 0.001050433111959137,
"sampling/sampling_logp_difference/max": 0.37157559394836426,
"sampling/sampling_logp_difference/mean": 0.011105079017579556,
"step": 445,
"step_time": 11.855768189579248
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.13333333730697633,
"completions/max_length": 1251.4,
"completions/max_terminated_length": 829.6,
"completions/mean_length": 769.7333618164063,
"completions/mean_terminated_length": 598.9666748046875,
"completions/min_length": 440.8,
"completions/min_terminated_length": 440.8,
"entropy": 0.16670421635111174,
"epoch": 0.16393442622950818,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.47747763991355896,
"kl": 0.00048284496394141265,
"learning_rate": 4.401840490797546e-07,
"loss": -0.013033266365528106,
"num_tokens": 2963841.0,
"reward": 1.9300000429153443,
"reward_std": 0.8960777290165425,
"rewards/boxed_rate/mean": 0.8666666746139526,
"rewards/boxed_rate/std": 0.20655910968780516,
"rewards/correctness/mean": 0.40000001192092893,
"rewards/correctness/std": 0.41311821937561033,
"rewards/multi_component_reward/mean": 0.42999999821186063,
"rewards/multi_component_reward/std": 0.601305240392685,
"rewards/no_answer_rate/mean": 0.13333333730697633,
"rewards/no_answer_rate/std": 0.20655910968780516,
"rewards/repetition_rate/mean": 0.10000000298023223,
"rewards/repetition_rate/std": 0.18492921590805053,
"sampling/importance_sampling_ratio/max": 0.14179300479590892,
"sampling/importance_sampling_ratio/mean": 0.06188145540654659,
"sampling/importance_sampling_ratio/min": 0.017718057095797223,
"sampling/sampling_logp_difference/max": 0.3239980101585388,
"sampling/sampling_logp_difference/mean": 0.010319609567523003,
"step": 450,
"step_time": 11.826861386746168
},
{
"epoch": 0.16393442622950818,
"eval_clip_ratio/high_max": 0.0,
"eval_clip_ratio/high_mean": 0.0,
"eval_clip_ratio/low_mean": 0.0,
"eval_clip_ratio/low_min": 0.0,
"eval_clip_ratio/region_mean": 0.0,
"eval_completions/clipped_ratio": 0.20666667073965073,
"eval_completions/max_length": 1688.02,
"eval_completions/max_terminated_length": 1185.42,
"eval_completions/mean_length": 1123.9933685302735,
"eval_completions/mean_terminated_length": 846.4790167236329,
"eval_completions/min_length": 676.62,
"eval_completions/min_terminated_length": 594.7,
"eval_entropy": 0.18630945876240732,
"eval_frac_reward_zero_std": 0.16,
"eval_kl": 0.0004587844273191877,
"eval_loss": -0.0019339508144184947,
"eval_num_tokens": 2963841.0,
"eval_reward": 1.5870000886917115,
"eval_reward_std": 0.7317568999528885,
"eval_rewards/boxed_rate/mean": 0.7933333313465118,
"eval_rewards/boxed_rate/std": 0.2689453566074371,
"eval_rewards/correctness/mean": 0.2733333346247673,
"eval_rewards/correctness/std": 0.2873582673072815,
"eval_rewards/multi_component_reward/mean": 0.2603333519771695,
"eval_rewards/multi_component_reward/std": 0.4510752035677433,
"eval_rewards/no_answer_rate/mean": 0.20666667073965073,
"eval_rewards/no_answer_rate/std": 0.2689453566074371,
"eval_rewards/repetition_rate/mean": 0.05333333492279053,
"eval_rewards/repetition_rate/std": 0.09630359530448913,
"eval_runtime": 669.3873,
"eval_samples_per_second": 0.075,
"eval_sampling/importance_sampling_ratio/max": 0.10398212056839838,
"eval_sampling/importance_sampling_ratio/mean": 0.035226596340653483,
"eval_sampling/importance_sampling_ratio/min": 0.0024896363094269146,
"eval_sampling/sampling_logp_difference/max": 0.5326641929149628,
"eval_sampling/sampling_logp_difference/mean": 0.012828467870131135,
"eval_steps_per_second": 0.013,
"step": 450
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.13333333730697633,
"completions/max_length": 1987.6,
"completions/max_terminated_length": 1503.0,
"completions/mean_length": 1150.6666870117188,
"completions/mean_terminated_length": 990.6999938964843,
"completions/min_length": 602.2,
"completions/min_terminated_length": 602.2,
"entropy": 0.2162185865143935,
"epoch": 0.1657559198542805,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.01271678227931261,
"kl": 0.0005497580714290962,
"learning_rate": 4.3922546012269933e-07,
"loss": 0.0011618516407907009,
"num_tokens": 3000941.0,
"reward": 1.6966667413711547,
"reward_std": 0.9000118732452392,
"rewards/boxed_rate/mean": 0.8666666626930237,
"rewards/boxed_rate/std": 0.2665788769721985,
"rewards/correctness/mean": 0.3000000089406967,
"rewards/correctness/std": 0.3914883255958557,
"rewards/multi_component_reward/mean": 0.3300000101327896,
"rewards/multi_component_reward/std": 0.5810832887887954,
"rewards/no_answer_rate/mean": 0.13333333730697633,
"rewards/no_answer_rate/std": 0.2665788769721985,
"rewards/repetition_rate/mean": 0.06666666865348816,
"rewards/repetition_rate/std": 0.10327955484390258,
"sampling/importance_sampling_ratio/max": 0.11817909283563495,
"sampling/importance_sampling_ratio/mean": 0.02859312845394015,
"sampling/importance_sampling_ratio/min": 0.00019200471108481247,
"sampling/sampling_logp_difference/max": 0.40274404287338256,
"sampling/sampling_logp_difference/mean": 0.014035884011536836,
"step": 455,
"step_time": 16.644585143402217
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.1666666716337204,
"completions/max_length": 1850.4,
"completions/max_terminated_length": 1004.8,
"completions/mean_length": 914.5666870117187,
"completions/mean_terminated_length": 717.0500061035157,
"completions/min_length": 396.2,
"completions/min_terminated_length": 396.2,
"entropy": 0.18004637969036896,
"epoch": 0.16757741347905283,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.051200754940509796,
"kl": 0.0005335340790528183,
"learning_rate": 4.3826687116564417e-07,
"loss": -0.01714204400777817,
"num_tokens": 3030898.0,
"reward": 1.5366667270660401,
"reward_std": 0.7392153739929199,
"rewards/boxed_rate/mean": 0.8333333253860473,
"rewards/boxed_rate/std": 0.34822853803634646,
"rewards/correctness/mean": 0.23333334028720856,
"rewards/correctness/std": 0.2882087707519531,
"rewards/multi_component_reward/mean": 0.23666667342185974,
"rewards/multi_component_reward/std": 0.4870242774486542,
"rewards/no_answer_rate/mean": 0.1666666716337204,
"rewards/no_answer_rate/std": 0.34822853803634646,
"rewards/repetition_rate/mean": 0.06666666865348816,
"rewards/repetition_rate/std": 0.10327955484390258,
"sampling/importance_sampling_ratio/max": 0.1253473086282611,
"sampling/importance_sampling_ratio/mean": 0.06275567463599145,
"sampling/importance_sampling_ratio/min": 0.0048506295410769285,
"sampling/sampling_logp_difference/max": 0.35658023357391355,
"sampling/sampling_logp_difference/mean": 0.012955071125179528,
"step": 460,
"step_time": 15.270139714889229
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.06666666865348816,
"completions/max_length": 1409.8,
"completions/max_terminated_length": 850.4,
"completions/mean_length": 728.4666931152344,
"completions/mean_terminated_length": 629.5000183105469,
"completions/min_length": 469.8,
"completions/min_terminated_length": 469.8,
"entropy": 0.16985683316985767,
"epoch": 0.16939890710382513,
"frac_reward_zero_std": 0.4,
"grad_norm": 0.02098464034497738,
"kl": 0.0005443188643160586,
"learning_rate": 4.3730828220858896e-07,
"loss": -0.007486963272094726,
"num_tokens": 3056190.0,
"reward": 1.9700001001358032,
"reward_std": 0.4863309383392334,
"rewards/boxed_rate/mean": 0.9333333253860474,
"rewards/boxed_rate/std": 0.1632993221282959,
"rewards/correctness/mean": 0.43333333134651186,
"rewards/correctness/std": 0.18492921590805053,
"rewards/multi_component_reward/mean": 0.5366666868329049,
"rewards/multi_component_reward/std": 0.3014017343521118,
"rewards/no_answer_rate/mean": 0.06666666865348816,
"rewards/no_answer_rate/std": 0.1632993221282959,
"rewards/repetition_rate/mean": 0.0,
"rewards/repetition_rate/std": 0.0,
"sampling/importance_sampling_ratio/max": 0.12007005885243416,
"sampling/importance_sampling_ratio/mean": 0.05674390830099583,
"sampling/importance_sampling_ratio/min": 0.0004069998664532642,
"sampling/sampling_logp_difference/max": 0.5247022390365601,
"sampling/sampling_logp_difference/mean": 0.012214668095111847,
"step": 465,
"step_time": 12.395325169526041
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.13333333730697633,
"completions/max_length": 1901.0,
"completions/max_terminated_length": 1594.8,
"completions/mean_length": 1356.2000244140625,
"completions/mean_terminated_length": 1266.056689453125,
"completions/min_length": 993.0,
"completions/min_terminated_length": 993.0,
"entropy": 0.10780975048740706,
"epoch": 0.17122040072859745,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.05852176249027252,
"kl": 0.0004035008855377479,
"learning_rate": 4.363496932515337e-07,
"loss": -0.0037086054682731627,
"num_tokens": 3100044.0,
"reward": 1.6266667366027832,
"reward_std": 0.9011179387569428,
"rewards/boxed_rate/mean": 0.8666666626930237,
"rewards/boxed_rate/std": 0.2665788769721985,
"rewards/correctness/mean": 0.26666666865348815,
"rewards/correctness/std": 0.3823883533477783,
"rewards/multi_component_reward/mean": 0.2933333463966846,
"rewards/multi_component_reward/std": 0.5466254830360413,
"rewards/no_answer_rate/mean": 0.13333333730697633,
"rewards/no_answer_rate/std": 0.2665788769721985,
"rewards/repetition_rate/mean": 0.06666666865348816,
"rewards/repetition_rate/std": 0.1632993221282959,
"sampling/importance_sampling_ratio/max": 0.062121391948312524,
"sampling/importance_sampling_ratio/mean": 0.0237305941991508,
"sampling/importance_sampling_ratio/min": 0.004560414611228225,
"sampling/sampling_logp_difference/max": 0.33158082962036134,
"sampling/sampling_logp_difference/mean": 0.007295731455087662,
"step": 470,
"step_time": 16.880542963929475
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.10000000298023223,
"completions/max_length": 1782.4,
"completions/max_terminated_length": 1526.4,
"completions/mean_length": 1103.5000427246093,
"completions/mean_terminated_length": 1016.5600524902344,
"completions/min_length": 741.8,
"completions/min_terminated_length": 741.8,
"entropy": 0.1433272872120142,
"epoch": 0.17304189435336975,
"frac_reward_zero_std": 0.2,
"grad_norm": 0.03871724009513855,
"kl": 0.0003871701846946962,
"learning_rate": 4.353911042944785e-07,
"loss": -0.0024786345660686494,
"num_tokens": 3136959.0,
"reward": 1.1100000619888306,
"reward_std": 0.318433678150177,
"rewards/boxed_rate/mean": 0.899999988079071,
"rewards/boxed_rate/std": 0.24494898319244385,
"rewards/correctness/mean": 0.03333333432674408,
"rewards/correctness/std": 0.08164966106414795,
"rewards/multi_component_reward/mean": 0.0766666665673256,
"rewards/multi_component_reward/std": 0.23678402006626129,
"rewards/no_answer_rate/mean": 0.10000000298023223,
"rewards/no_answer_rate/std": 0.24494898319244385,
"rewards/repetition_rate/mean": 0.0,
"rewards/repetition_rate/std": 0.0,
"sampling/importance_sampling_ratio/max": 0.0617107892408967,
"sampling/importance_sampling_ratio/mean": 0.02746242703869939,
"sampling/importance_sampling_ratio/min": 8.897909594907105e-05,
"sampling/sampling_logp_difference/max": 0.3906729698181152,
"sampling/sampling_logp_difference/mean": 0.009637977462261915,
"step": 475,
"step_time": 15.594477642700076
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.20000000298023224,
"completions/max_length": 1867.2,
"completions/max_terminated_length": 1634.6,
"completions/mean_length": 1205.7000244140625,
"completions/mean_terminated_length": 989.493359375,
"completions/min_length": 550.8,
"completions/min_terminated_length": 550.8,
"entropy": 0.16859658484657605,
"epoch": 0.17486338797814208,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.0642157569527626,
"kl": 0.0005781898080992202,
"learning_rate": 4.3443251533742333e-07,
"loss": -0.0028929352760314942,
"num_tokens": 3175776.0,
"reward": 1.3066667556762694,
"reward_std": 0.6569535873830319,
"rewards/boxed_rate/mean": 0.8,
"rewards/boxed_rate/std": 0.29447373151779177,
"rewards/correctness/mean": 0.13333333730697633,
"rewards/correctness/std": 0.2665788769721985,
"rewards/multi_component_reward/mean": 0.1066666692495346,
"rewards/multi_component_reward/std": 0.4782795637845993,
"rewards/no_answer_rate/mean": 0.20000000298023224,
"rewards/no_answer_rate/std": 0.29447373151779177,
"rewards/repetition_rate/mean": 0.06666666865348816,
"rewards/repetition_rate/std": 0.10327955484390258,
"sampling/importance_sampling_ratio/max": 0.06810643598437309,
"sampling/importance_sampling_ratio/mean": 0.01840525190345943,
"sampling/importance_sampling_ratio/min": 0.0006505777751514535,
"sampling/sampling_logp_difference/max": 0.43449815511703493,
"sampling/sampling_logp_difference/mean": 0.011115654464811086,
"step": 480,
"step_time": 15.972666030749679
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.16666666865348817,
"completions/max_length": 1663.0,
"completions/max_terminated_length": 1303.0,
"completions/mean_length": 1173.3333740234375,
"completions/mean_terminated_length": 989.41669921875,
"completions/min_length": 683.0,
"completions/min_terminated_length": 683.0,
"entropy": 0.14007456911106905,
"epoch": 0.1766848816029144,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.010275622829794884,
"kl": 0.00036965057806810365,
"learning_rate": 4.3347392638036806e-07,
"loss": 0.0001720041735097766,
"num_tokens": 3214522.0,
"reward": 1.9766667842864991,
"reward_std": 0.9272954225540161,
"rewards/boxed_rate/mean": 0.8666666626930237,
"rewards/boxed_rate/std": 0.19119417667388916,
"rewards/correctness/mean": 0.43333333134651186,
"rewards/correctness/std": 0.404018247127533,
"rewards/multi_component_reward/mean": 0.47666670083999635,
"rewards/multi_component_reward/std": 0.5632504820823669,
"rewards/no_answer_rate/mean": 0.13333333432674407,
"rewards/no_answer_rate/std": 0.19119417667388916,
"rewards/repetition_rate/mean": 0.06666666865348816,
"rewards/repetition_rate/std": 0.10327955484390258,
"sampling/importance_sampling_ratio/max": 0.05611779941245913,
"sampling/importance_sampling_ratio/mean": 0.021316760312765838,
"sampling/importance_sampling_ratio/min": 0.0011236143288555823,
"sampling/sampling_logp_difference/max": 0.3524420738220215,
"sampling/sampling_logp_difference/mean": 0.00864975331351161,
"step": 485,
"step_time": 15.02983694653958
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.10000000298023223,
"completions/max_length": 1206.8,
"completions/max_terminated_length": 934.0,
"completions/mean_length": 728.1666748046875,
"completions/mean_terminated_length": 595.9966796875,
"completions/min_length": 426.2,
"completions/min_terminated_length": 426.2,
"entropy": 0.23062201142311095,
"epoch": 0.1785063752276867,
"frac_reward_zero_std": 0.2,
"grad_norm": 0.018157748505473137,
"kl": 0.001033562931115739,
"learning_rate": 4.3251533742331285e-07,
"loss": 0.014268814027309418,
"num_tokens": 3238755.0,
"reward": 1.833333384990692,
"reward_std": 0.6647640764713287,
"rewards/boxed_rate/mean": 0.9,
"rewards/boxed_rate/std": 0.18492921590805053,
"rewards/correctness/mean": 0.3666666686534882,
"rewards/correctness/std": 0.2882087707519531,
"rewards/multi_component_reward/mean": 0.43333335071802137,
"rewards/multi_component_reward/std": 0.4189693987369537,
"rewards/no_answer_rate/mean": 0.10000000298023223,
"rewards/no_answer_rate/std": 0.18492921590805053,
"rewards/repetition_rate/mean": 0.03333333432674408,
"rewards/repetition_rate/std": 0.08164966106414795,
"sampling/importance_sampling_ratio/max": 0.12710050344467164,
"sampling/importance_sampling_ratio/mean": 0.05628317659720779,
"sampling/importance_sampling_ratio/min": 0.015683322838595436,
"sampling/sampling_logp_difference/max": 0.36403354406356814,
"sampling/sampling_logp_difference/mean": 0.015313766896724701,
"step": 490,
"step_time": 11.357261310145258
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.1666666716337204,
"completions/max_length": 1498.2,
"completions/max_terminated_length": 1076.4,
"completions/mean_length": 1013.5000305175781,
"completions/mean_terminated_length": 849.3533508300782,
"completions/min_length": 602.6,
"completions/min_terminated_length": 602.6,
"entropy": 0.14790264442563056,
"epoch": 0.18032786885245902,
"frac_reward_zero_std": 0.4,
"grad_norm": 0.034156352281570435,
"kl": 0.0005029304617589029,
"learning_rate": 4.315567484662577e-07,
"loss": -0.010992055386304855,
"num_tokens": 3274110.0,
"reward": 1.4433334589004516,
"reward_std": 0.6818739175796509,
"rewards/boxed_rate/mean": 0.8333333313465119,
"rewards/boxed_rate/std": 0.18492921590805053,
"rewards/correctness/mean": 0.20000000298023224,
"rewards/correctness/std": 0.29447373151779177,
"rewards/multi_component_reward/mean": 0.21000000536441804,
"rewards/multi_component_reward/std": 0.4088459849357605,
"rewards/no_answer_rate/mean": 0.1666666716337204,
"rewards/no_answer_rate/std": 0.18492921590805053,
"rewards/repetition_rate/mean": 0.03333333432674408,
"rewards/repetition_rate/std": 0.08164966106414795,
"sampling/importance_sampling_ratio/max": 0.10836826749145985,
"sampling/importance_sampling_ratio/mean": 0.04134368039667606,
"sampling/importance_sampling_ratio/min": 0.0007355699201900378,
"sampling/sampling_logp_difference/max": 0.5285765409469605,
"sampling/sampling_logp_difference/mean": 0.00972840515896678,
"step": 495,
"step_time": 13.717384218610823
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.1666666716337204,
"completions/max_length": 1768.6,
"completions/max_terminated_length": 1172.4,
"completions/mean_length": 1021.0333618164062,
"completions/mean_terminated_length": 829.9766723632813,
"completions/min_length": 574.6,
"completions/min_terminated_length": 574.6,
"entropy": 0.1277496966222922,
"epoch": 0.18214936247723132,
"frac_reward_zero_std": 0.2,
"grad_norm": 0.036998555064201355,
"kl": 0.00041782591021425713,
"learning_rate": 4.3059815950920243e-07,
"loss": -0.014814507961273194,
"num_tokens": 3306649.0,
"reward": 1.9100001335144043,
"reward_std": 0.8856898427009583,
"rewards/boxed_rate/mean": 0.8333333373069763,
"rewards/boxed_rate/std": 0.2882087707519531,
"rewards/correctness/mean": 0.4333333373069763,
"rewards/correctness/std": 0.3761233925819397,
"rewards/multi_component_reward/mean": 0.4766666904091835,
"rewards/multi_component_reward/std": 0.5212879002094268,
"rewards/no_answer_rate/mean": 0.1666666716337204,
"rewards/no_answer_rate/std": 0.2882087707519531,
"rewards/repetition_rate/mean": 0.0,
"rewards/repetition_rate/std": 0.0,
"sampling/importance_sampling_ratio/max": 0.2542514935135841,
"sampling/importance_sampling_ratio/mean": 0.09570314064621925,
"sampling/importance_sampling_ratio/min": 0.0020768470163739948,
"sampling/sampling_logp_difference/max": 0.34144341945648193,
"sampling/sampling_logp_difference/mean": 0.010077364556491376,
"step": 500,
"step_time": 15.174134342558682
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1232.6,
"completions/max_terminated_length": 1232.6,
"completions/mean_length": 928.7333618164063,
"completions/mean_terminated_length": 928.7333618164063,
"completions/min_length": 612.8,
"completions/min_terminated_length": 612.8,
"entropy": 0.15919652469456197,
"epoch": 0.18397085610200364,
"frac_reward_zero_std": 0.4,
"grad_norm": 0.03940599039196968,
"kl": 0.0004976143633636335,
"learning_rate": 4.296395705521472e-07,
"loss": 8.051458280533552e-05,
"num_tokens": 3338765.0,
"reward": 1.6133334398269654,
"reward_std": 0.5944257378578186,
"rewards/boxed_rate/mean": 1.0,
"rewards/boxed_rate/std": 0.0,
"rewards/correctness/mean": 0.23333334028720856,
"rewards/correctness/std": 0.2882087707519531,
"rewards/multi_component_reward/mean": 0.34666666090488435,
"rewards/multi_component_reward/std": 0.3269805133342743,
"rewards/no_answer_rate/mean": 0.0,
"rewards/no_answer_rate/std": 0.0,
"rewards/repetition_rate/mean": 0.03333333432674408,
"rewards/repetition_rate/std": 0.08164966106414795,
"sampling/importance_sampling_ratio/max": 0.12865251041948794,
"sampling/importance_sampling_ratio/mean": 0.046758548077195886,
"sampling/importance_sampling_ratio/min": 0.001370607664284762,
"sampling/sampling_logp_difference/max": 0.37642626762390136,
"sampling/sampling_logp_difference/mean": 0.010083456197753549,
"step": 505,
"step_time": 12.115066050365566
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03333333432674408,
"completions/max_length": 1233.6,
"completions/max_terminated_length": 1011.6,
"completions/mean_length": 741.0666870117187,
"completions/mean_terminated_length": 699.6933471679688,
"completions/min_length": 502.8,
"completions/min_terminated_length": 502.8,
"entropy": 0.14653264259298643,
"epoch": 0.18579234972677597,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.022397559136152267,
"kl": 0.0004972440418593275,
"learning_rate": 4.28680981595092e-07,
"loss": -0.010462169349193574,
"num_tokens": 3363841.0,
"reward": 2.0600001335144045,
"reward_std": 0.9187898755073547,
"rewards/boxed_rate/mean": 0.9666666626930237,
"rewards/boxed_rate/std": 0.08164966106414795,
"rewards/correctness/mean": 0.4666666626930237,
"rewards/correctness/std": 0.4298781991004944,
"rewards/multi_component_reward/mean": 0.5933333843946457,
"rewards/multi_component_reward/std": 0.49362565875053405,
"rewards/no_answer_rate/mean": 0.03333333432674408,
"rewards/no_answer_rate/std": 0.08164966106414795,
"rewards/repetition_rate/mean": 0.0,
"rewards/repetition_rate/std": 0.0,
"sampling/importance_sampling_ratio/max": 0.16412589382380247,
"sampling/importance_sampling_ratio/mean": 0.0629353643860668,
"sampling/importance_sampling_ratio/min": 0.011790907313888965,
"sampling/sampling_logp_difference/max": 0.3437652587890625,
"sampling/sampling_logp_difference/mean": 0.009886026009917259,
"step": 510,
"step_time": 11.563566098548472
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.13333333730697633,
"completions/max_length": 1427.2,
"completions/max_terminated_length": 666.4,
"completions/mean_length": 705.2000061035156,
"completions/mean_terminated_length": 511.7700134277344,
"completions/min_length": 344.2,
"completions/min_terminated_length": 344.2,
"entropy": 0.26261382003625233,
"epoch": 0.18761384335154827,
"frac_reward_zero_std": 0.2,
"grad_norm": 0.10859289020299911,
"kl": 0.0008684398761639992,
"learning_rate": 4.277223926380368e-07,
"loss": -0.002386706694960594,
"num_tokens": 3386899.0,
"reward": 1.3700001001358033,
"reward_std": 0.7070007205009461,
"rewards/boxed_rate/mean": 0.8666666626930237,
"rewards/boxed_rate/std": 0.2665788769721985,
"rewards/correctness/mean": 0.1666666716337204,
"rewards/correctness/std": 0.2882087707519531,
"rewards/multi_component_reward/mean": 0.20333334654569626,
"rewards/multi_component_reward/std": 0.4274008393287659,
"rewards/no_answer_rate/mean": 0.13333333730697633,
"rewards/no_answer_rate/std": 0.2665788769721985,
"rewards/repetition_rate/mean": 0.0,
"rewards/repetition_rate/std": 0.0,
"sampling/importance_sampling_ratio/max": 0.07567991334944964,
"sampling/importance_sampling_ratio/mean": 0.024421159457415343,
"sampling/importance_sampling_ratio/min": 0.001837286539412988,
"sampling/sampling_logp_difference/max": 0.34601569175720215,
"sampling/sampling_logp_difference/mean": 0.017737336456775665,
"step": 515,
"step_time": 12.585359536856412
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.13333333730697633,
"completions/max_length": 1618.2,
"completions/max_terminated_length": 1045.2,
"completions/mean_length": 936.9666870117187,
"completions/mean_terminated_length": 758.3166870117187,
"completions/min_length": 469.8,
"completions/min_terminated_length": 469.8,
"entropy": 0.1936116027335326,
"epoch": 0.1894353369763206,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.0316350944340229,
"kl": 0.0006832293069843824,
"learning_rate": 4.267638036809816e-07,
"loss": -0.008644417673349381,
"num_tokens": 3417600.0,
"reward": 1.5800000786781312,
"reward_std": 0.7550430953502655,
"rewards/boxed_rate/mean": 0.8666666626930237,
"rewards/boxed_rate/std": 0.2665788769721985,
"rewards/correctness/mean": 0.2666666716337204,
"rewards/correctness/std": 0.29447373151779177,
"rewards/multi_component_reward/mean": 0.3133333310484886,
"rewards/multi_component_reward/std": 0.46337632834911346,
"rewards/no_answer_rate/mean": 0.13333333730697633,
"rewards/no_answer_rate/std": 0.2665788769721985,
"rewards/repetition_rate/mean": 0.0,
"rewards/repetition_rate/std": 0.0,
"sampling/importance_sampling_ratio/max": 0.0883142702281475,
"sampling/importance_sampling_ratio/mean": 0.030587387550622226,
"sampling/importance_sampling_ratio/min": 0.0063572791577344466,
"sampling/sampling_logp_difference/max": 0.3835047721862793,
"sampling/sampling_logp_difference/mean": 0.013410749472677707,
"step": 520,
"step_time": 13.994114908203482
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.06666666865348816,
"completions/max_length": 1382.6,
"completions/max_terminated_length": 1288.6,
"completions/mean_length": 903.3000122070313,
"completions/mean_terminated_length": 852.4833374023438,
"completions/min_length": 593.0,
"completions/min_terminated_length": 593.0,
"entropy": 0.18077843909462293,
"epoch": 0.1912568306010929,
"frac_reward_zero_std": 0.2,
"grad_norm": 0.024261396378278732,
"kl": 0.0005541147484715718,
"learning_rate": 4.2580521472392637e-07,
"loss": -0.0031598765403032304,
"num_tokens": 3448035.0,
"reward": 1.8300000786781312,
"reward_std": 0.6672061026096344,
"rewards/boxed_rate/mean": 0.9333333373069763,
"rewards/boxed_rate/std": 0.10327955484390258,
"rewards/correctness/mean": 0.3666666686534882,
"rewards/correctness/std": 0.2882087707519531,
"rewards/multi_component_reward/mean": 0.4633333504199982,
"rewards/multi_component_reward/std": 0.378997391462326,
"rewards/no_answer_rate/mean": 0.06666666865348816,
"rewards/no_answer_rate/std": 0.10327955484390258,
"rewards/repetition_rate/mean": 0.0,
"rewards/repetition_rate/std": 0.0,
"sampling/importance_sampling_ratio/max": 0.10047482601366937,
"sampling/importance_sampling_ratio/mean": 0.02871947007952258,
"sampling/importance_sampling_ratio/min": 0.0007279034301063802,
"sampling/sampling_logp_difference/max": 0.41403778791427615,
"sampling/sampling_logp_difference/mean": 0.011369843780994416,
"step": 525,
"step_time": 12.759401306509972
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.20000000596046447,
"completions/max_length": 1729.4,
"completions/max_terminated_length": 1090.4,
"completions/mean_length": 1117.200048828125,
"completions/mean_terminated_length": 875.8833618164062,
"completions/min_length": 682.2,
"completions/min_terminated_length": 682.2,
"entropy": 0.18201239357391993,
"epoch": 0.1930783242258652,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.00892422441393137,
"kl": 0.000488846098111632,
"learning_rate": 4.2484662576687116e-07,
"loss": -0.004933690279722213,
"num_tokens": 3483861.0,
"reward": 1.3033334016799927,
"reward_std": 0.6930439114570618,
"rewards/boxed_rate/mean": 0.8333333253860473,
"rewards/boxed_rate/std": 0.34822853803634646,
"rewards/correctness/mean": 0.13333333730697633,
"rewards/correctness/std": 0.2665788769721985,
"rewards/multi_component_reward/mean": 0.13666666969656943,
"rewards/multi_component_reward/std": 0.4622148424386978,
"rewards/no_answer_rate/mean": 0.1666666716337204,
"rewards/no_answer_rate/std": 0.34822853803634646,
"rewards/repetition_rate/mean": 0.03333333432674408,
"rewards/repetition_rate/std": 0.08164966106414795,
"sampling/importance_sampling_ratio/max": 0.06577782724052668,
"sampling/importance_sampling_ratio/mean": 0.02595797423273325,
"sampling/importance_sampling_ratio/min": 0.0002929158277353877,
"sampling/sampling_logp_difference/max": 0.355899715423584,
"sampling/sampling_logp_difference/mean": 0.012041485961526632,
"step": 530,
"step_time": 15.400034189224243
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.1666666716337204,
"completions/max_length": 1707.4,
"completions/max_terminated_length": 958.8,
"completions/mean_length": 998.1000122070312,
"completions/mean_terminated_length": 766.7933349609375,
"completions/min_length": 525.4,
"completions/min_terminated_length": 525.4,
"entropy": 0.22935964514811832,
"epoch": 0.19489981785063754,
"frac_reward_zero_std": 0.2,
"grad_norm": 0.020105650648474693,
"kl": 0.000557902626072367,
"learning_rate": 4.2388803680981595e-07,
"loss": -0.0007442331407219172,
"num_tokens": 3518562.0,
"reward": 1.2100000977516174,
"reward_std": 0.6208420097827911,
"rewards/boxed_rate/mean": 0.8333333373069763,
"rewards/boxed_rate/std": 0.2882087707519531,
"rewards/correctness/mean": 0.10000000298023223,
"rewards/correctness/std": 0.24494898319244385,
"rewards/multi_component_reward/mean": 0.11000000834465026,
"rewards/multi_component_reward/std": 0.38656076192855837,
"rewards/no_answer_rate/mean": 0.1666666716337204,
"rewards/no_answer_rate/std": 0.2882087707519531,
"rewards/repetition_rate/mean": 0.0,
"rewards/repetition_rate/std": 0.0,
"sampling/importance_sampling_ratio/max": 0.015947306994348764,
"sampling/importance_sampling_ratio/mean": 0.004562706989236176,
"sampling/importance_sampling_ratio/min": 2.5106548244674746e-05,
"sampling/sampling_logp_difference/max": 0.36506946086883546,
"sampling/sampling_logp_difference/mean": 0.014524615183472634,
"step": 535,
"step_time": 14.785470444336534
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.13333333730697633,
"completions/max_length": 2011.6,
"completions/max_terminated_length": 1252.0,
"completions/mean_length": 974.2000366210938,
"completions/mean_terminated_length": 799.5733703613281,
"completions/min_length": 499.0,
"completions/min_terminated_length": 499.0,
"entropy": 0.20504479110240936,
"epoch": 0.19672131147540983,
"frac_reward_zero_std": 0.2,
"grad_norm": 0.016418447718024254,
"kl": 0.0007665569457458333,
"learning_rate": 4.229294478527607e-07,
"loss": -0.004444469138979912,
"num_tokens": 3550332.0,
"reward": 1.5333333969116212,
"reward_std": 0.6269734501838684,
"rewards/boxed_rate/mean": 0.8666666507720947,
"rewards/boxed_rate/std": 0.3265986442565918,
"rewards/correctness/mean": 0.23333333134651185,
"rewards/correctness/std": 0.24494898319244385,
"rewards/multi_component_reward/mean": 0.266666679084301,
"rewards/multi_component_reward/std": 0.4252805233001709,
"rewards/no_answer_rate/mean": 0.13333333730697633,
"rewards/no_answer_rate/std": 0.3265986442565918,
"rewards/repetition_rate/mean": 0.03333333432674408,
"rewards/repetition_rate/std": 0.08164966106414795,
"sampling/importance_sampling_ratio/max": 0.0710077840834856,
"sampling/importance_sampling_ratio/mean": 0.0242210254073143,
"sampling/importance_sampling_ratio/min": 2.3673296637852498e-05,
"sampling/sampling_logp_difference/max": 0.37494783401489257,
"sampling/sampling_logp_difference/mean": 0.013403672352433204,
"step": 540,
"step_time": 16.360854005627335
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.10000000298023223,
"completions/max_length": 1755.4,
"completions/max_terminated_length": 1206.4,
"completions/mean_length": 827.3000122070313,
"completions/mean_terminated_length": 693.3333374023438,
"completions/min_length": 476.2,
"completions/min_terminated_length": 476.2,
"entropy": 0.20222537256777287,
"epoch": 0.19854280510018216,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.07017825543880463,
"kl": 0.0005556545128153327,
"learning_rate": 4.219708588957055e-07,
"loss": -0.0067356817424297334,
"num_tokens": 3577779.0,
"reward": 1.8100000619888306,
"reward_std": 0.7233119249343872,
"rewards/boxed_rate/mean": 0.899999988079071,
"rewards/boxed_rate/std": 0.24494898319244385,
"rewards/correctness/mean": 0.3666666686534882,
"rewards/correctness/std": 0.2882087707519531,
"rewards/multi_component_reward/mean": 0.44333335757255554,
"rewards/multi_component_reward/std": 0.43861203789711,
"rewards/no_answer_rate/mean": 0.10000000298023223,
"rewards/no_answer_rate/std": 0.24494898319244385,
"rewards/repetition_rate/mean": 0.0,
"rewards/repetition_rate/std": 0.0,
"sampling/importance_sampling_ratio/max": 0.07237468343228101,
"sampling/importance_sampling_ratio/mean": 0.02907364722341299,
"sampling/importance_sampling_ratio/min": 0.003791734886874308,
"sampling/sampling_logp_difference/max": 0.3440469026565552,
"sampling/sampling_logp_difference/mean": 0.013698535226285457,
"step": 545,
"step_time": 14.568762712925672
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.13333333730697633,
"completions/max_length": 1740.6,
"completions/max_terminated_length": 1098.0,
"completions/mean_length": 972.5000366210937,
"completions/mean_terminated_length": 813.2100280761719,
"completions/min_length": 548.8,
"completions/min_terminated_length": 548.8,
"entropy": 0.15388319492340088,
"epoch": 0.20036429872495445,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.09412358701229095,
"kl": 0.00047789589831760774,
"learning_rate": 4.210122699386503e-07,
"loss": 0.0004236143082380295,
"num_tokens": 3610650.0,
"reward": 1.7666667222976684,
"reward_std": 0.748459929227829,
"rewards/boxed_rate/mean": 0.8666666626930237,
"rewards/boxed_rate/std": 0.2665788769721985,
"rewards/correctness/mean": 0.3333333432674408,
"rewards/correctness/std": 0.3098386645317078,
"rewards/multi_component_reward/mean": 0.3666666656732559,
"rewards/multi_component_reward/std": 0.5095339387655258,
"rewards/no_answer_rate/mean": 0.13333333730697633,
"rewards/no_answer_rate/std": 0.2665788769721985,
"rewards/repetition_rate/mean": 0.06666666865348816,
"rewards/repetition_rate/std": 0.1632993221282959,
"sampling/importance_sampling_ratio/max": 0.07994540650397539,
"sampling/importance_sampling_ratio/mean": 0.0390994424931705,
"sampling/importance_sampling_ratio/min": 0.0009056368981219976,
"sampling/sampling_logp_difference/max": 0.3753530740737915,
"sampling/sampling_logp_difference/mean": 0.009989727940410375,
"step": 550,
"step_time": 15.042225486785174
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.13333333730697633,
"completions/max_length": 1604.4,
"completions/max_terminated_length": 979.4,
"completions/mean_length": 832.6666870117188,
"completions/mean_terminated_length": 665.1333374023437,
"completions/min_length": 452.0,
"completions/min_terminated_length": 452.0,
"entropy": 0.2048481193681558,
"epoch": 0.20218579234972678,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.08716036379337311,
"kl": 0.000548629568462881,
"learning_rate": 4.2005368098159505e-07,
"loss": -0.005523241683840751,
"num_tokens": 3638024.0,
"reward": 1.6033334016799927,
"reward_std": 0.797308748960495,
"rewards/boxed_rate/mean": 0.8666666626930237,
"rewards/boxed_rate/std": 0.2665788769721985,
"rewards/correctness/mean": 0.26666666865348815,
"rewards/correctness/std": 0.32236858606338503,
"rewards/multi_component_reward/mean": 0.30333334803581236,
"rewards/multi_component_reward/std": 0.5069476455450058,
"rewards/no_answer_rate/mean": 0.13333333730697633,
"rewards/no_answer_rate/std": 0.2665788769721985,
"rewards/repetition_rate/mean": 0.03333333432674408,
"rewards/repetition_rate/std": 0.08164966106414795,
"sampling/importance_sampling_ratio/max": 0.07022089585661888,
"sampling/importance_sampling_ratio/mean": 0.022322331066243352,
"sampling/importance_sampling_ratio/min": 0.000656013962759859,
"sampling/sampling_logp_difference/max": 0.3337650179862976,
"sampling/sampling_logp_difference/mean": 0.013166856206953525,
"step": 555,
"step_time": 13.850998834148049
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.06666666865348816,
"completions/max_length": 1198.2,
"completions/max_terminated_length": 1077.2,
"completions/mean_length": 757.1333618164062,
"completions/mean_terminated_length": 690.2466918945313,
"completions/min_length": 448.2,
"completions/min_terminated_length": 448.2,
"entropy": 0.18011298303802808,
"epoch": 0.2040072859744991,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.01096227765083313,
"kl": 0.0006569843031077956,
"learning_rate": 4.1909509202453984e-07,
"loss": -0.007073364406824112,
"num_tokens": 3663726.0,
"reward": 1.9000000715255738,
"reward_std": 0.8417381644248962,
"rewards/boxed_rate/mean": 0.9333333253860474,
"rewards/boxed_rate/std": 0.1632993221282959,
"rewards/correctness/mean": 0.40000000298023225,
"rewards/correctness/std": 0.36985843181610106,
"rewards/multi_component_reward/mean": 0.5000000268220901,
"rewards/multi_component_reward/std": 0.4765937089920044,
"rewards/no_answer_rate/mean": 0.06666666865348816,
"rewards/no_answer_rate/std": 0.1632993221282959,
"rewards/repetition_rate/mean": 0.0,
"rewards/repetition_rate/std": 0.0,
"sampling/importance_sampling_ratio/max": 0.09752159416675568,
"sampling/importance_sampling_ratio/mean": 0.05281178932636976,
"sampling/importance_sampling_ratio/min": 0.003194372950383695,
"sampling/sampling_logp_difference/max": 0.3607391357421875,
"sampling/sampling_logp_difference/mean": 0.011687950044870377,
"step": 560,
"step_time": 11.587789290212095
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.20000000298023224,
"completions/max_length": 1599.4,
"completions/max_terminated_length": 1067.8,
"completions/mean_length": 1085.8000244140626,
"completions/mean_terminated_length": 867.1133544921875,
"completions/min_length": 651.6,
"completions/min_terminated_length": 651.6,
"entropy": 0.12313016926248868,
"epoch": 0.2058287795992714,
"frac_reward_zero_std": 0.2,
"grad_norm": 0.05608590692281723,
"kl": 0.00031730876992999886,
"learning_rate": 4.181365030674847e-07,
"loss": 0.0087483711540699,
"num_tokens": 3699090.0,
"reward": 1.7966667652130126,
"reward_std": 0.8426974892616272,
"rewards/boxed_rate/mean": 0.8,
"rewards/boxed_rate/std": 0.29447373151779177,
"rewards/correctness/mean": 0.36666667461395264,
"rewards/correctness/std": 0.3161036252975464,
"rewards/multi_component_reward/mean": 0.36333334594964983,
"rewards/multi_component_reward/std": 0.5058179676532746,
"rewards/no_answer_rate/mean": 0.20000000298023224,
"rewards/no_answer_rate/std": 0.29447373151779177,
"rewards/repetition_rate/mean": 0.06666666865348816,
"rewards/repetition_rate/std": 0.10327955484390258,
"sampling/importance_sampling_ratio/max": 0.10868240725831128,
"sampling/importance_sampling_ratio/mean": 0.04146801241877256,
"sampling/importance_sampling_ratio/min": 0.008836518857719966,
"sampling/sampling_logp_difference/max": 0.3564323544502258,
"sampling/sampling_logp_difference/mean": 0.007883247546851635,
"step": 565,
"step_time": 14.454759103059768
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.33333333432674406,
"completions/max_length": 1746.6,
"completions/max_terminated_length": 973.4,
"completions/mean_length": 1263.9000244140625,
"completions/mean_terminated_length": 730.0800231933594,
"completions/min_length": 952.8,
"completions/min_terminated_length": 543.2,
"entropy": 0.14238746762275695,
"epoch": 0.20765027322404372,
"frac_reward_zero_std": 0.4,
"grad_norm": 0.01621309295296669,
"kl": 0.0003668556613168524,
"learning_rate": 4.171779141104294e-07,
"loss": 0.00620177686214447,
"num_tokens": 3740091.0,
"reward": 1.4800001382827759,
"reward_std": 0.5089709401130676,
"rewards/boxed_rate/mean": 0.6666666626930237,
"rewards/boxed_rate/std": 0.19119417667388916,
"rewards/correctness/mean": 0.2666666626930237,
"rewards/correctness/std": 0.19119417667388916,
"rewards/multi_component_reward/mean": 0.17999999970197678,
"rewards/multi_component_reward/std": 0.30394219756126406,
"rewards/no_answer_rate/mean": 0.33333333432674406,
"rewards/no_answer_rate/std": 0.19119417667388916,
"rewards/repetition_rate/mean": 0.03333333432674408,
"rewards/repetition_rate/std": 0.08164966106414795,
"sampling/importance_sampling_ratio/max": 0.09319847710430622,
"sampling/importance_sampling_ratio/mean": 0.04575341045856476,
"sampling/importance_sampling_ratio/min": 0.004630656010958226,
"sampling/sampling_logp_difference/max": 0.3321978569030762,
"sampling/sampling_logp_difference/mean": 0.009725131979212164,
"step": 570,
"step_time": 15.637936590984463
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.13333333432674407,
"completions/max_length": 1493.2,
"completions/max_terminated_length": 1230.4,
"completions/mean_length": 1086.3333618164063,
"completions/mean_terminated_length": 990.8466918945312,
"completions/min_length": 756.6,
"completions/min_terminated_length": 756.6,
"entropy": 0.1577371135354042,
"epoch": 0.20947176684881602,
"frac_reward_zero_std": 0.2,
"grad_norm": 0.017710594460368156,
"kl": 0.0003999709859878446,
"learning_rate": 4.162193251533742e-07,
"loss": -0.0008659601211547851,
"num_tokens": 3778573.0,
"reward": 1.2533334612846374,
"reward_std": 0.6350648760795593,
"rewards/boxed_rate/mean": 0.8666666626930237,
"rewards/boxed_rate/std": 0.19119417667388916,
"rewards/correctness/mean": 0.10000000298023223,
"rewards/correctness/std": 0.24494898319244385,
"rewards/multi_component_reward/mean": 0.12000000327825547,
"rewards/multi_component_reward/std": 0.34919504523277284,
"rewards/no_answer_rate/mean": 0.13333333432674407,
"rewards/no_answer_rate/std": 0.19119417667388916,
"rewards/repetition_rate/mean": 0.03333333432674408,
"rewards/repetition_rate/std": 0.08164966106414795,
"sampling/importance_sampling_ratio/max": 0.021646471600979565,
"sampling/importance_sampling_ratio/mean": 0.009971876302734018,
"sampling/importance_sampling_ratio/min": 0.0019394792594557403,
"sampling/sampling_logp_difference/max": 0.331281840801239,
"sampling/sampling_logp_difference/mean": 0.01047133533284068,
"step": 575,
"step_time": 13.984978386759758
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03333333432674408,
"completions/max_length": 1077.0,
"completions/max_terminated_length": 843.4,
"completions/mean_length": 618.633349609375,
"completions/mean_terminated_length": 570.0800170898438,
"completions/min_length": 312.4,
"completions/min_terminated_length": 312.4,
"entropy": 0.18354702678819498,
"epoch": 0.21129326047358835,
"frac_reward_zero_std": 0.2,
"grad_norm": 0.01734095625579357,
"kl": 0.0007040846471985181,
"learning_rate": 4.15260736196319e-07,
"loss": -0.034613341093063354,
"num_tokens": 3799994.0,
"reward": 1.733333420753479,
"reward_std": 0.8469935655593872,
"rewards/boxed_rate/mean": 0.9666666626930237,
"rewards/boxed_rate/std": 0.08164966106414795,
"rewards/correctness/mean": 0.3000000029802322,
"rewards/correctness/std": 0.404018247127533,
"rewards/multi_component_reward/mean": 0.4000000163912773,
"rewards/multi_component_reward/std": 0.481517493724823,
"rewards/no_answer_rate/mean": 0.03333333432674408,
"rewards/no_answer_rate/std": 0.08164966106414795,
"rewards/repetition_rate/mean": 0.03333333432674408,
"rewards/repetition_rate/std": 0.08164966106414795,
"sampling/importance_sampling_ratio/max": 0.2370181828737259,
"sampling/importance_sampling_ratio/mean": 0.07491586618125438,
"sampling/importance_sampling_ratio/min": 0.017679853770823685,
"sampling/sampling_logp_difference/max": 0.2730631709098816,
"sampling/sampling_logp_difference/mean": 0.011196557991206646,
"step": 580,
"step_time": 10.380646949820221
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.10000000298023223,
"completions/max_length": 1423.0,
"completions/max_terminated_length": 1241.8,
"completions/mean_length": 999.9000366210937,
"completions/mean_terminated_length": 926.8766967773438,
"completions/min_length": 742.8,
"completions/min_terminated_length": 742.8,
"entropy": 0.17347905735174815,
"epoch": 0.21311475409836064,
"frac_reward_zero_std": 0.2,
"grad_norm": 0.019545914605259895,
"kl": 0.0004457860350764046,
"learning_rate": 4.143021472392638e-07,
"loss": -0.0001349434838630259,
"num_tokens": 3834629.0,
"reward": 1.4100001215934754,
"reward_std": 0.6349397718906402,
"rewards/boxed_rate/mean": 0.9333333373069763,
"rewards/boxed_rate/std": 0.10327955484390258,
"rewards/correctness/mean": 0.16666666865348817,
"rewards/correctness/std": 0.2728438377380371,
"rewards/multi_component_reward/mean": 0.2433333396911621,
"rewards/multi_component_reward/std": 0.36209596991539,
"rewards/no_answer_rate/mean": 0.06666666865348816,
"rewards/no_answer_rate/std": 0.10327955484390258,
"rewards/repetition_rate/mean": 0.0,
"rewards/repetition_rate/std": 0.0,
"sampling/importance_sampling_ratio/max": 0.04343400467187166,
"sampling/importance_sampling_ratio/mean": 0.013024529488757253,
"sampling/importance_sampling_ratio/min": 0.001408253621775657,
"sampling/sampling_logp_difference/max": 0.5604913473129273,
"sampling/sampling_logp_difference/mean": 0.010900301579385995,
"step": 585,
"step_time": 13.423828819952906
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.10000000298023223,
"completions/max_length": 1522.6,
"completions/max_terminated_length": 1304.6,
"completions/mean_length": 1048.300048828125,
"completions/mean_terminated_length": 953.2133544921875,
"completions/min_length": 677.2,
"completions/min_terminated_length": 677.2,
"entropy": 0.13777906013031802,
"epoch": 0.21493624772313297,
"frac_reward_zero_std": 0.2,
"grad_norm": 0.020055213943123817,
"kl": 0.00046802494471194224,
"learning_rate": 4.1334355828220857e-07,
"loss": 0.0007817570120096206,
"num_tokens": 3868946.0,
"reward": 1.3900001525878907,
"reward_std": 0.7840524554252625,
"rewards/boxed_rate/mean": 0.9,
"rewards/boxed_rate/std": 0.18492921590805053,
"rewards/correctness/mean": 0.1666666716337204,
"rewards/correctness/std": 0.34822853803634646,
"rewards/multi_component_reward/mean": 0.22333333939313887,
"rewards/multi_component_reward/std": 0.44443279504776,
"rewards/no_answer_rate/mean": 0.10000000298023223,
"rewards/no_answer_rate/std": 0.18492921590805053,
"rewards/repetition_rate/mean": 0.0,
"rewards/repetition_rate/std": 0.0,
"sampling/importance_sampling_ratio/max": 0.05379525385797024,
"sampling/importance_sampling_ratio/mean": 0.019492477364838123,
"sampling/importance_sampling_ratio/min": 0.0019421720592784908,
"sampling/sampling_logp_difference/max": 0.36746039390563967,
"sampling/sampling_logp_difference/mean": 0.009275554586201907,
"step": 590,
"step_time": 13.933117091469466
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.10000000298023223,
"completions/max_length": 1637.0,
"completions/max_terminated_length": 1173.6,
"completions/mean_length": 936.9000122070313,
"completions/mean_terminated_length": 814.3366760253906,
"completions/min_length": 551.2,
"completions/min_terminated_length": 551.2,
"entropy": 0.1666003334025542,
"epoch": 0.2167577413479053,
"frac_reward_zero_std": 0.2,
"grad_norm": 0.011394626460969448,
"kl": 0.00048624937092730153,
"learning_rate": 4.1238496932515336e-07,
"loss": -6.75798102747649e-05,
"num_tokens": 3899387.0,
"reward": 1.483333444595337,
"reward_std": 0.6897397696971893,
"rewards/boxed_rate/mean": 0.9,
"rewards/boxed_rate/std": 0.18492921590805053,
"rewards/correctness/mean": 0.20000000298023224,
"rewards/correctness/std": 0.29447373151779177,
"rewards/multi_component_reward/mean": 0.2500000074505806,
"rewards/multi_component_reward/std": 0.42727351784706114,
"rewards/no_answer_rate/mean": 0.10000000298023223,
"rewards/no_answer_rate/std": 0.18492921590805053,
"rewards/repetition_rate/mean": 0.03333333432674408,
"rewards/repetition_rate/std": 0.08164966106414795,
"sampling/importance_sampling_ratio/max": 0.03627656940370798,
"sampling/importance_sampling_ratio/mean": 0.014781518559902906,
"sampling/importance_sampling_ratio/min": 0.0008682889689286911,
"sampling/sampling_logp_difference/max": 0.3049093961715698,
"sampling/sampling_logp_difference/mean": 0.010832818690687418,
"step": 595,
"step_time": 14.285567329451442
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.1666666716337204,
"completions/max_length": 1661.6,
"completions/max_terminated_length": 1382.2,
"completions/mean_length": 1118.2000244140625,
"completions/mean_terminated_length": 938.136669921875,
"completions/min_length": 657.6,
"completions/min_terminated_length": 657.6,
"entropy": 0.17573260565598806,
"epoch": 0.2185792349726776,
"frac_reward_zero_std": 0.2,
"grad_norm": 0.020734934136271477,
"kl": 0.0004598392901243642,
"learning_rate": 4.1142638036809815e-07,
"loss": -0.002467634528875351,
"num_tokens": 3936269.0,
"reward": 1.0933334231376648,
"reward_std": 0.33367283940315245,
"rewards/boxed_rate/mean": 0.8333333373069763,
"rewards/boxed_rate/std": 0.2882087707519531,
"rewards/correctness/mean": 0.03333333432674408,
"rewards/correctness/std": 0.08164966106414795,
"rewards/multi_component_reward/mean": 0.02666666656732559,
"rewards/multi_component_reward/std": 0.2805217713117599,
"rewards/no_answer_rate/mean": 0.1666666716337204,
"rewards/no_answer_rate/std": 0.2882087707519531,
"rewards/repetition_rate/mean": 0.03333333432674408,
"rewards/repetition_rate/std": 0.08164966106414795,
"sampling/importance_sampling_ratio/max": 0.044962253980338575,
"sampling/importance_sampling_ratio/mean": 0.011017974140122533,
"sampling/importance_sampling_ratio/min": 0.0003941454909745216,
"sampling/sampling_logp_difference/max": 0.38812339305877686,
"sampling/sampling_logp_difference/mean": 0.01122636515647173,
"step": 600,
"step_time": 14.878706483915447
},
{
"epoch": 0.2185792349726776,
"eval_clip_ratio/high_max": 0.0,
"eval_clip_ratio/high_mean": 0.0,
"eval_clip_ratio/low_mean": 0.0,
"eval_clip_ratio/low_min": 0.0,
"eval_clip_ratio/region_mean": 0.0,
"eval_completions/clipped_ratio": 0.18333333790302275,
"eval_completions/max_length": 1633.0,
"eval_completions/max_terminated_length": 1261.48,
"eval_completions/mean_length": 1074.5033642578126,
"eval_completions/mean_terminated_length": 884.9103503417969,
"eval_completions/min_length": 608.76,
"eval_completions/min_terminated_length": 608.76,
"eval_entropy": 0.1780784809589386,
"eval_frac_reward_zero_std": 0.1,
"eval_kl": 0.000503081139177084,
"eval_loss": -0.002404627623036504,
"eval_num_tokens": 3936269.0,
"eval_reward": 1.5616667437553406,
"eval_reward_std": 0.7698906905949116,
"eval_rewards/boxed_rate/mean": 0.816666665673256,
"eval_rewards/boxed_rate/std": 0.28143630146980286,
"eval_rewards/correctness/mean": 0.260000002682209,
"eval_rewards/correctness/std": 0.3175121474266052,
"eval_rewards/multi_component_reward/mean": 0.26500001683831215,
"eval_rewards/multi_component_reward/std": 0.49374800592660906,
"eval_rewards/no_answer_rate/mean": 0.18333333790302275,
"eval_rewards/no_answer_rate/std": 0.28143630146980286,
"eval_rewards/repetition_rate/mean": 0.036666667759418486,
"eval_rewards/repetition_rate/std": 0.08381265044212341,
"eval_runtime": 648.1074,
"eval_samples_per_second": 0.077,
"eval_sampling/importance_sampling_ratio/max": 0.08480258989613504,
"eval_sampling/importance_sampling_ratio/mean": 0.03125391862238757,
"eval_sampling/importance_sampling_ratio/min": 0.0025257051857090485,
"eval_sampling/sampling_logp_difference/max": 0.5640374529361725,
"eval_sampling/sampling_logp_difference/mean": 0.01284005825407803,
"eval_steps_per_second": 0.014,
"step": 600
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.10000000298023223,
"completions/max_length": 1519.2,
"completions/max_terminated_length": 1256.6,
"completions/mean_length": 946.06669921875,
"completions/mean_terminated_length": 839.1633544921875,
"completions/min_length": 563.2,
"completions/min_terminated_length": 563.2,
"entropy": 0.14678182030717532,
"epoch": 0.2204007285974499,
"frac_reward_zero_std": 0.4,
"grad_norm": 0.023046690970659256,
"kl": 0.0005303417919397664,
"learning_rate": 4.1046779141104294e-07,
"loss": -0.0009497999213635922,
"num_tokens": 3967489.0,
"reward": 1.2500001072883606,
"reward_std": 0.46636478304862977,
"rewards/boxed_rate/mean": 0.9,
"rewards/boxed_rate/std": 0.18492921590805053,
"rewards/correctness/mean": 0.10000000298023223,
"rewards/correctness/std": 0.18492921590805053,
"rewards/multi_component_reward/mean": 0.15000001192092896,
"rewards/multi_component_reward/std": 0.28614950776100156,
"rewards/no_answer_rate/mean": 0.10000000298023223,
"rewards/no_answer_rate/std": 0.18492921590805053,
"rewards/repetition_rate/mean": 0.0,
"rewards/repetition_rate/std": 0.0,
"sampling/importance_sampling_ratio/max": 0.15675322916358708,
"sampling/importance_sampling_ratio/mean": 0.057061844225972894,
"sampling/importance_sampling_ratio/min": 0.0010045379561626475,
"sampling/sampling_logp_difference/max": 0.3354309320449829,
"sampling/sampling_logp_difference/mean": 0.010376049391925334,
"step": 605,
"step_time": 13.543810763396323
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03333333432674408,
"completions/max_length": 1038.6,
"completions/max_terminated_length": 971.6,
"completions/mean_length": 741.6333618164062,
"completions/mean_terminated_length": 710.4466918945312,
"completions/min_length": 523.2,
"completions/min_terminated_length": 523.2,
"entropy": 0.17811203648646673,
"epoch": 0.2222222222222222,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.1156023740768433,
"kl": 0.0007145934030025577,
"learning_rate": 4.095092024539877e-07,
"loss": 0.001612265408039093,
"num_tokens": 3992006.0,
"reward": 1.8733334302902223,
"reward_std": 0.8630595266819,
"rewards/boxed_rate/mean": 0.9666666626930237,
"rewards/boxed_rate/std": 0.08164966106414795,
"rewards/correctness/mean": 0.3666666656732559,
"rewards/correctness/std": 0.3761233925819397,
"rewards/multi_component_reward/mean": 0.4733333714306355,
"rewards/multi_component_reward/std": 0.4589902624487877,
"rewards/no_answer_rate/mean": 0.03333333432674408,
"rewards/no_answer_rate/std": 0.08164966106414795,
"rewards/repetition_rate/mean": 0.03333333432674408,
"rewards/repetition_rate/std": 0.08164966106414795,
"sampling/importance_sampling_ratio/max": 0.12649268209934234,
"sampling/importance_sampling_ratio/mean": 0.06289250953122974,
"sampling/importance_sampling_ratio/min": 0.03086609955687436,
"sampling/sampling_logp_difference/max": 0.3575249195098877,
"sampling/sampling_logp_difference/mean": 0.011591816507279872,
"step": 610,
"step_time": 10.539130839891731
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.10000000298023223,
"completions/max_length": 1739.8,
"completions/max_terminated_length": 1545.4,
"completions/mean_length": 1192.700048828125,
"completions/mean_terminated_length": 1113.4033569335938,
"completions/min_length": 698.0,
"completions/min_terminated_length": 698.0,
"entropy": 0.1430244820813338,
"epoch": 0.22404371584699453,
"frac_reward_zero_std": 0.2,
"grad_norm": 0.008428337052464485,
"kl": 0.00040708660574940346,
"learning_rate": 4.0855061349693246e-07,
"loss": 0.0004372816067188978,
"num_tokens": 4030967.0,
"reward": 2.020000123977661,
"reward_std": 0.8252323269844055,
"rewards/boxed_rate/mean": 0.9,
"rewards/boxed_rate/std": 0.18492921590805053,
"rewards/correctness/mean": 0.4666666746139526,
"rewards/correctness/std": 0.36985843181610106,
"rewards/multi_component_reward/mean": 0.5533333420753479,
"rewards/multi_component_reward/std": 0.4648365914821625,
"rewards/no_answer_rate/mean": 0.10000000298023223,
"rewards/no_answer_rate/std": 0.18492921590805053,
"rewards/repetition_rate/mean": 0.0,
"rewards/repetition_rate/std": 0.0,
"sampling/importance_sampling_ratio/max": 0.06130031887441874,
"sampling/importance_sampling_ratio/mean": 0.01900877393782139,
"sampling/importance_sampling_ratio/min": 0.00034280085452338406,
"sampling/sampling_logp_difference/max": 0.37778170108795167,
"sampling/sampling_logp_difference/mean": 0.009492585808038712,
"step": 615,
"step_time": 15.555601417645812
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.10000000298023223,
"completions/max_length": 1618.8,
"completions/max_terminated_length": 932.0,
"completions/mean_length": 796.36669921875,
"completions/mean_terminated_length": 658.6666809082031,
"completions/min_length": 447.4,
"completions/min_terminated_length": 447.4,
"entropy": 0.24623483493924142,
"epoch": 0.22586520947176686,
"frac_reward_zero_std": 0.4,
"grad_norm": 0.031970929354429245,
"kl": 0.0006379926751833409,
"learning_rate": 4.075920245398773e-07,
"loss": -0.0030921820551157,
"num_tokens": 4057612.0,
"reward": 1.3433334350585937,
"reward_std": 0.5004158258438111,
"rewards/boxed_rate/mean": 0.899999988079071,
"rewards/boxed_rate/std": 0.24494898319244385,
"rewards/correctness/mean": 0.13333333730697633,
"rewards/correctness/std": 0.20655910968780516,
"rewards/multi_component_reward/mean": 0.1766666740179062,
"rewards/multi_component_reward/std": 0.33977961242198945,
"rewards/no_answer_rate/mean": 0.10000000298023223,
"rewards/no_answer_rate/std": 0.24494898319244385,
"rewards/repetition_rate/mean": 0.03333333432674408,
"rewards/repetition_rate/std": 0.08164966106414795,
"sampling/importance_sampling_ratio/max": 0.06918634744361044,
"sampling/importance_sampling_ratio/mean": 0.02087077551987022,
"sampling/importance_sampling_ratio/min": 0.0005975837658579486,
"sampling/sampling_logp_difference/max": 0.34571409225463867,
"sampling/sampling_logp_difference/mean": 0.01637477772310376,
"step": 620,
"step_time": 13.71192037537694
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.16666666865348817,
"completions/max_length": 1644.8,
"completions/max_terminated_length": 1458.6,
"completions/mean_length": 1135.6000244140625,
"completions/mean_terminated_length": 976.4267211914063,
"completions/min_length": 573.8,
"completions/min_terminated_length": 573.8,
"entropy": 0.15917381308972836,
"epoch": 0.22768670309653916,
"frac_reward_zero_std": 0.2,
"grad_norm": 0.03386072441935539,
"kl": 0.0005424051094450987,
"learning_rate": 4.066334355828221e-07,
"loss": 0.0008449406363070011,
"num_tokens": 4094980.0,
"reward": 1.4666667938232423,
"reward_std": 0.5768125057220459,
"rewards/boxed_rate/mean": 0.8333333253860473,
"rewards/boxed_rate/std": 0.2728438377380371,
"rewards/correctness/mean": 0.20000000596046447,
"rewards/correctness/std": 0.20655910968780516,
"rewards/multi_component_reward/mean": 0.20000001415610313,
"rewards/multi_component_reward/std": 0.3994357705116272,
"rewards/no_answer_rate/mean": 0.16666666865348817,
"rewards/no_answer_rate/std": 0.2728438377380371,
"rewards/repetition_rate/mean": 0.06666666865348816,
"rewards/repetition_rate/std": 0.1632993221282959,
"sampling/importance_sampling_ratio/max": 0.054127467796206474,
"sampling/importance_sampling_ratio/mean": 0.016182091273367404,
"sampling/importance_sampling_ratio/min": 0.002016719186418556,
"sampling/sampling_logp_difference/max": 0.3768587350845337,
"sampling/sampling_logp_difference/mean": 0.009851652942597867,
"step": 625,
"step_time": 14.74142508301884
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03333333432674408,
"completions/max_length": 1218.6,
"completions/max_terminated_length": 1002.4,
"completions/mean_length": 759.3666870117188,
"completions/mean_terminated_length": 716.2800170898438,
"completions/min_length": 491.2,
"completions/min_terminated_length": 491.2,
"entropy": 0.19249504134058953,
"epoch": 0.22950819672131148,
"frac_reward_zero_std": 0.2,
"grad_norm": 0.040883224457502365,
"kl": 0.0008160913091463347,
"learning_rate": 4.0567484662576683e-07,
"loss": -0.02089705318212509,
"num_tokens": 4121865.0,
"reward": 1.6600001096725463,
"reward_std": 0.8352818369865418,
"rewards/boxed_rate/mean": 1.0,
"rewards/boxed_rate/std": 0.0,
"rewards/correctness/mean": 0.2666666716337204,
"rewards/correctness/std": 0.3977532863616943,
"rewards/multi_component_reward/mean": 0.3933333531022072,
"rewards/multi_component_reward/std": 0.4375286281108856,
"rewards/no_answer_rate/mean": 0.0,
"rewards/no_answer_rate/std": 0.0,
"rewards/repetition_rate/mean": 0.0,
"rewards/repetition_rate/std": 0.0,
"sampling/importance_sampling_ratio/max": 0.12170692235231399,
"sampling/importance_sampling_ratio/mean": 0.031014200672507285,
"sampling/importance_sampling_ratio/min": 0.001588601156254299,
"sampling/sampling_logp_difference/max": 0.3645863771438599,
"sampling/sampling_logp_difference/mean": 0.01216810904443264,
"step": 630,
"step_time": 11.676486384868621
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1104.8,
"completions/max_terminated_length": 1104.8,
"completions/mean_length": 809.733349609375,
"completions/mean_terminated_length": 809.733349609375,
"completions/min_length": 576.0,
"completions/min_terminated_length": 576.0,
"entropy": 0.17598983322580655,
"epoch": 0.23132969034608378,
"frac_reward_zero_std": 0.4,
"grad_norm": 0.016296500340104103,
"kl": 0.0005092226871056482,
"learning_rate": 4.0471625766871167e-07,
"loss": -0.004767249897122383,
"num_tokens": 4148941.0,
"reward": 1.520000123977661,
"reward_std": 0.6183947920799255,
"rewards/boxed_rate/mean": 1.0,
"rewards/boxed_rate/std": 0.0,
"rewards/correctness/mean": 0.20000000298023224,
"rewards/correctness/std": 0.29447373151779177,
"rewards/multi_component_reward/mean": 0.3200000077486038,
"rewards/multi_component_reward/std": 0.3239211142063141,
"rewards/no_answer_rate/mean": 0.0,
"rewards/no_answer_rate/std": 0.0,
"rewards/repetition_rate/mean": 0.0,
"rewards/repetition_rate/std": 0.0,
"sampling/importance_sampling_ratio/max": 0.17697620438411832,
"sampling/importance_sampling_ratio/mean": 0.06787280989810825,
"sampling/importance_sampling_ratio/min": 0.03228051500591391,
"sampling/sampling_logp_difference/max": 0.34911930561065674,
"sampling/sampling_logp_difference/mean": 0.011234280280768871,
"step": 635,
"step_time": 10.972807624004782
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03333333432674408,
"completions/max_length": 1261.6,
"completions/max_terminated_length": 1218.8,
"completions/mean_length": 815.0000244140625,
"completions/mean_terminated_length": 784.780029296875,
"completions/min_length": 532.2,
"completions/min_terminated_length": 532.2,
"entropy": 0.14789476320147515,
"epoch": 0.2331511839708561,
"frac_reward_zero_std": 0.2,
"grad_norm": 0.08125295490026474,
"kl": 0.0006361469063752641,
"learning_rate": 4.037576687116564e-07,
"loss": -0.008612464368343353,
"num_tokens": 4176637.0,
"reward": 1.7100001096725463,
"reward_std": 0.8422193884849548,
"rewards/boxed_rate/mean": 0.9666666626930237,
"rewards/boxed_rate/std": 0.08164966106414795,
"rewards/correctness/mean": 0.3000000089406967,
"rewards/correctness/std": 0.3914883255958557,
"rewards/multi_component_reward/mean": 0.41000000983476637,
"rewards/multi_component_reward/std": 0.4542399704456329,
"rewards/no_answer_rate/mean": 0.03333333432674408,
"rewards/no_answer_rate/std": 0.08164966106414795,
"rewards/repetition_rate/mean": 0.0,
"rewards/repetition_rate/std": 0.0,
"sampling/importance_sampling_ratio/max": 0.14553753491491078,
"sampling/importance_sampling_ratio/mean": 0.0640607855282724,
"sampling/importance_sampling_ratio/min": 0.0038686898650384326,
"sampling/sampling_logp_difference/max": 0.3177733182907104,
"sampling/sampling_logp_difference/mean": 0.00996503084897995,
"step": 640,
"step_time": 11.762126618809997
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.1666666716337204,
"completions/max_length": 1707.4,
"completions/max_terminated_length": 1151.8,
"completions/mean_length": 1026.000048828125,
"completions/mean_terminated_length": 833.5666870117187,
"completions/min_length": 601.6,
"completions/min_terminated_length": 601.6,
"entropy": 0.15211746419469516,
"epoch": 0.23497267759562843,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.04184797778725624,
"kl": 0.0004896697394239406,
"learning_rate": 4.027990797546012e-07,
"loss": -0.0011240935884416104,
"num_tokens": 4210123.0,
"reward": 1.7233334302902221,
"reward_std": 0.8806208968162537,
"rewards/boxed_rate/mean": 0.8333333373069763,
"rewards/boxed_rate/std": 0.2882087707519531,
"rewards/correctness/mean": 0.33333333432674406,
"rewards/correctness/std": 0.36985843181610106,
"rewards/multi_component_reward/mean": 0.35666669607162477,
"rewards/multi_component_reward/std": 0.5503794223070144,
"rewards/no_answer_rate/mean": 0.1666666716337204,
"rewards/no_answer_rate/std": 0.2882087707519531,
"rewards/repetition_rate/mean": 0.03333333432674408,
"rewards/repetition_rate/std": 0.08164966106414795,
"sampling/importance_sampling_ratio/max": 0.10542454989627004,
"sampling/importance_sampling_ratio/mean": 0.03725480753928423,
"sampling/importance_sampling_ratio/min": 0.0006887080645215871,
"sampling/sampling_logp_difference/max": 0.3569879293441772,
"sampling/sampling_logp_difference/mean": 0.011003307159990072,
"step": 645,
"step_time": 14.911480633541942
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.06666666865348816,
"completions/max_length": 1574.6,
"completions/max_terminated_length": 1237.0,
"completions/mean_length": 920.3000244140625,
"completions/mean_terminated_length": 845.8533569335938,
"completions/min_length": 600.0,
"completions/min_terminated_length": 600.0,
"entropy": 0.2023964905490478,
"epoch": 0.23679417122040072,
"frac_reward_zero_std": 0.4,
"grad_norm": 0.023950839415192604,
"kl": 0.0005954526874120347,
"learning_rate": 4.01840490797546e-07,
"loss": 0.0016510456800460816,
"num_tokens": 4241086.0,
"reward": 1.4766667366027832,
"reward_std": 0.5016432344913483,
"rewards/boxed_rate/mean": 0.9666666626930237,
"rewards/boxed_rate/std": 0.08164966106414795,
"rewards/correctness/mean": 0.16666666865348817,
"rewards/correctness/std": 0.2128240704536438,
"rewards/multi_component_reward/mean": 0.24333334639668464,
"rewards/multi_component_reward/std": 0.29857338666915895,
"rewards/no_answer_rate/mean": 0.03333333432674408,
"rewards/no_answer_rate/std": 0.08164966106414795,
"rewards/repetition_rate/mean": 0.06666666865348816,
"rewards/repetition_rate/std": 0.1632993221282959,
"sampling/importance_sampling_ratio/max": 0.02175317257642746,
"sampling/importance_sampling_ratio/mean": 0.007160785421729088,
"sampling/importance_sampling_ratio/min": 0.00033338346147502305,
"sampling/sampling_logp_difference/max": 0.387990665435791,
"sampling/sampling_logp_difference/mean": 0.012314715608954429,
"step": 650,
"step_time": 13.970132818445563
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03333333432674408,
"completions/max_length": 1193.4,
"completions/max_terminated_length": 1015.4,
"completions/mean_length": 716.5000244140625,
"completions/mean_terminated_length": 674.5733520507813,
"completions/min_length": 464.6,
"completions/min_terminated_length": 464.6,
"entropy": 0.16268871029218038,
"epoch": 0.23861566484517305,
"frac_reward_zero_std": 0.2,
"grad_norm": 0.06329948455095291,
"kl": 0.0005586101295193657,
"learning_rate": 4.0088190184049077e-07,
"loss": -0.0020235760137438776,
"num_tokens": 4265005.0,
"reward": 1.5000001192092896,
"reward_std": 0.7927483677864074,
"rewards/boxed_rate/mean": 0.9666666626930237,
"rewards/boxed_rate/std": 0.08164966106414795,
"rewards/correctness/mean": 0.20000000596046447,
"rewards/correctness/std": 0.36985843181610106,
"rewards/multi_component_reward/mean": 0.30000000447034836,
"rewards/multi_component_reward/std": 0.4276039183139801,
"rewards/no_answer_rate/mean": 0.03333333432674408,
"rewards/no_answer_rate/std": 0.08164966106414795,
"rewards/repetition_rate/mean": 0.0,
"rewards/repetition_rate/std": 0.0,
"sampling/importance_sampling_ratio/max": 0.08887637332081795,
"sampling/importance_sampling_ratio/mean": 0.04327146988362074,
"sampling/importance_sampling_ratio/min": 0.01085926350569757,
"sampling/sampling_logp_difference/max": 0.34261035919189453,
"sampling/sampling_logp_difference/mean": 0.010840693116188049,
"step": 655,
"step_time": 11.254522551037372
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.10000000298023223,
"completions/max_length": 1448.4,
"completions/max_terminated_length": 1037.6,
"completions/mean_length": 916.8000366210938,
"completions/mean_terminated_length": 796.9666870117187,
"completions/min_length": 576.2,
"completions/min_terminated_length": 576.2,
"entropy": 0.1381769967575868,
"epoch": 0.24043715846994534,
"frac_reward_zero_std": 0.2,
"grad_norm": 0.018603617325425148,
"kl": 0.0004810667795633587,
"learning_rate": 3.9992331288343556e-07,
"loss": -0.00833008587360382,
"num_tokens": 4294873.0,
"reward": 1.530000126361847,
"reward_std": 0.5924062907695771,
"rewards/boxed_rate/mean": 0.9,
"rewards/boxed_rate/std": 0.18492921590805053,
"rewards/correctness/mean": 0.23333333134651185,
"rewards/correctness/std": 0.24494898319244385,
"rewards/multi_component_reward/mean": 0.29666669070720675,
"rewards/multi_component_reward/std": 0.35217125415802003,
"rewards/no_answer_rate/mean": 0.10000000298023223,
"rewards/no_answer_rate/std": 0.18492921590805053,
"rewards/repetition_rate/mean": 0.0,
"rewards/repetition_rate/std": 0.0,
"sampling/importance_sampling_ratio/max": 0.19538256973028184,
"sampling/importance_sampling_ratio/mean": 0.0801917064934969,
"sampling/importance_sampling_ratio/min": 0.002801872471235356,
"sampling/sampling_logp_difference/max": 0.36407305002212526,
"sampling/sampling_logp_difference/mean": 0.009397336840629577,
"step": 660,
"step_time": 13.188869021646678
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 915.0,
"completions/max_terminated_length": 915.0,
"completions/mean_length": 611.6000183105468,
"completions/mean_terminated_length": 611.6000183105468,
"completions/min_length": 423.0,
"completions/min_terminated_length": 423.0,
"entropy": 0.21465270320574442,
"epoch": 0.24225865209471767,
"frac_reward_zero_std": 0.2,
"grad_norm": 0.021906884387135506,
"kl": 0.0005874778813449666,
"learning_rate": 3.9896472392638035e-07,
"loss": 0.0034822095185518265,
"num_tokens": 4315237.0,
"reward": 1.9400000810623168,
"reward_std": 0.8352818369865418,
"rewards/boxed_rate/mean": 1.0,
"rewards/boxed_rate/std": 0.0,
"rewards/correctness/mean": 0.4000000089406967,
"rewards/correctness/std": 0.3977532863616943,
"rewards/multi_component_reward/mean": 0.5399999991059303,
"rewards/multi_component_reward/std": 0.4375286281108856,
"rewards/no_answer_rate/mean": 0.0,
"rewards/no_answer_rate/std": 0.0,
"rewards/repetition_rate/mean": 0.0,
"rewards/repetition_rate/std": 0.0,
"sampling/importance_sampling_ratio/max": 0.08231882937252522,
"sampling/importance_sampling_ratio/mean": 0.03466434141155332,
"sampling/importance_sampling_ratio/min": 0.0024839662786689588,
"sampling/sampling_logp_difference/max": 0.35751450061798096,
"sampling/sampling_logp_difference/mean": 0.013147631753236055,
"step": 665,
"step_time": 9.326043193787337
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.10000000298023223,
"completions/max_length": 1493.4,
"completions/max_terminated_length": 1139.0,
"completions/mean_length": 914.633349609375,
"completions/mean_terminated_length": 779.0366821289062,
"completions/min_length": 560.8,
"completions/min_terminated_length": 560.8,
"entropy": 0.15737735082705814,
"epoch": 0.24408014571949,
"frac_reward_zero_std": 0.2,
"grad_norm": 0.01330585964024067,
"kl": 0.0005065599815376724,
"learning_rate": 3.9800613496932514e-07,
"loss": -0.006443501263856888,
"num_tokens": 4345802.0,
"reward": 1.32000013589859,
"reward_std": 0.6418773233890533,
"rewards/boxed_rate/mean": 0.9,
"rewards/boxed_rate/std": 0.18492921590805053,
"rewards/correctness/mean": 0.13333333730697633,
"rewards/correctness/std": 0.2665788769721985,
"rewards/multi_component_reward/mean": 0.18666667491197586,
"rewards/multi_component_reward/std": 0.3788073122501373,
"rewards/no_answer_rate/mean": 0.10000000298023223,
"rewards/no_answer_rate/std": 0.18492921590805053,
"rewards/repetition_rate/mean": 0.0,
"rewards/repetition_rate/std": 0.0,
"sampling/importance_sampling_ratio/max": 0.10360531127080322,
"sampling/importance_sampling_ratio/mean": 0.05636316137388349,
"sampling/importance_sampling_ratio/min": 0.022174429874797168,
"sampling/sampling_logp_difference/max": 0.37078907489776614,
"sampling/sampling_logp_difference/mean": 0.010791171807795763,
"step": 670,
"step_time": 13.329693194106222
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.06666666865348816,
"completions/max_length": 1439.8,
"completions/max_terminated_length": 1202.4,
"completions/mean_length": 803.2666870117188,
"completions/mean_terminated_length": 723.4000122070313,
"completions/min_length": 409.0,
"completions/min_terminated_length": 409.0,
"entropy": 0.17263933196663855,
"epoch": 0.2459016393442623,
"frac_reward_zero_std": 0.2,
"grad_norm": 0.1460752934217453,
"kl": 0.0005690245316751923,
"learning_rate": 3.970475460122699e-07,
"loss": -0.012128566205501557,
"num_tokens": 4373710.0,
"reward": 1.6166667461395263,
"reward_std": 0.8088707447052002,
"rewards/boxed_rate/mean": 0.9666666626930237,
"rewards/boxed_rate/std": 0.08164966106414795,
"rewards/correctness/mean": 0.23333334028720856,
"rewards/correctness/std": 0.3914883255958557,
"rewards/multi_component_reward/mean": 0.31666667610406873,
"rewards/multi_component_reward/std": 0.4805600345134735,
"rewards/no_answer_rate/mean": 0.03333333432674408,
"rewards/no_answer_rate/std": 0.08164966106414795,
"rewards/repetition_rate/mean": 0.06666666865348816,
"rewards/repetition_rate/std": 0.1632993221282959,
"sampling/importance_sampling_ratio/max": 0.10961102806031704,
"sampling/importance_sampling_ratio/mean": 0.043116622418165204,
"sampling/importance_sampling_ratio/min": 0.001660405399086784,
"sampling/sampling_logp_difference/max": 0.3272708892822266,
"sampling/sampling_logp_difference/mean": 0.011050505377352238,
"step": 675,
"step_time": 12.905224220454693
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.23333333432674408,
"completions/max_length": 1583.4,
"completions/max_terminated_length": 1101.4,
"completions/mean_length": 1081.7000366210937,
"completions/mean_terminated_length": 832.26669921875,
"completions/min_length": 648.0,
"completions/min_terminated_length": 648.0,
"entropy": 0.17594740043083826,
"epoch": 0.24772313296903462,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.07136747241020203,
"kl": 0.0005246107791511653,
"learning_rate": 3.960889570552147e-07,
"loss": 0.003443557769060135,
"num_tokens": 4409875.0,
"reward": 1.4266667485237121,
"reward_std": 0.781731303036213,
"rewards/boxed_rate/mean": 0.7666666626930236,
"rewards/boxed_rate/std": 0.30073869228363037,
"rewards/correctness/mean": 0.20000000298023224,
"rewards/correctness/std": 0.29447373151779177,
"rewards/multi_component_reward/mean": 0.16000001206994058,
"rewards/multi_component_reward/std": 0.509483140707016,
"rewards/no_answer_rate/mean": 0.23333333432674408,
"rewards/no_answer_rate/std": 0.30073869228363037,
"rewards/repetition_rate/mean": 0.06666666865348816,
"rewards/repetition_rate/std": 0.1632993221282959,
"sampling/importance_sampling_ratio/max": 0.07207991853356362,
"sampling/importance_sampling_ratio/mean": 0.018585072737187146,
"sampling/importance_sampling_ratio/min": 0.00033181315377639464,
"sampling/sampling_logp_difference/max": 0.35654211044311523,
"sampling/sampling_logp_difference/mean": 0.011251153517514467,
"step": 680,
"step_time": 14.236122718639672
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.23333334028720856,
"completions/max_length": 1406.2,
"completions/max_terminated_length": 905.2,
"completions/mean_length": 937.7333770751953,
"completions/mean_terminated_length": 666.7800079345703,
"completions/min_length": 499.4,
"completions/min_terminated_length": 499.4,
"entropy": 0.15866130826373895,
"epoch": 0.2495446265938069,
"frac_reward_zero_std": 0.2,
"grad_norm": 0.019822267815470695,
"kl": 0.0025144809878838714,
"learning_rate": 3.9513036809815945e-07,
"loss": 0.0014299599453806878,
"num_tokens": 4441367.0,
"reward": 1.5433334589004517,
"reward_std": 0.8197273433208465,
"rewards/boxed_rate/mean": 0.7666666686534882,
"rewards/boxed_rate/std": 0.2882087707519531,
"rewards/correctness/mean": 0.26666666865348815,
"rewards/correctness/std": 0.32236858606338503,
"rewards/multi_component_reward/mean": 0.2433333471417427,
"rewards/multi_component_reward/std": 0.5061476826667786,
"rewards/no_answer_rate/mean": 0.23333334028720856,
"rewards/no_answer_rate/std": 0.2882087707519531,
"rewards/repetition_rate/mean": 0.03333333432674408,
"rewards/repetition_rate/std": 0.08164966106414795,
"sampling/importance_sampling_ratio/max": 0.19941046759486197,
"sampling/importance_sampling_ratio/mean": 0.10512657705694436,
"sampling/importance_sampling_ratio/min": 0.052634853626081814,
"sampling/sampling_logp_difference/max": 0.39468674659729003,
"sampling/sampling_logp_difference/mean": 0.010894744656980038,
"step": 685,
"step_time": 13.195306334272027
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.06666666865348816,
"completions/max_length": 1319.4,
"completions/max_terminated_length": 1112.6,
"completions/mean_length": 827.1333801269532,
"completions/mean_terminated_length": 760.5666931152343,
"completions/min_length": 536.4,
"completions/min_terminated_length": 536.4,
"entropy": 0.18860391999284426,
"epoch": 0.25136612021857924,
"frac_reward_zero_std": 0.2,
"grad_norm": 0.028540268540382385,
"kl": 0.0006184172428523501,
"learning_rate": 3.941717791411043e-07,
"loss": -0.008557839691638947,
"num_tokens": 4468635.0,
"reward": 2.013333463668823,
"reward_std": 0.8281045794487,
"rewards/boxed_rate/mean": 0.9333333253860474,
"rewards/boxed_rate/std": 0.1632993221282959,
"rewards/correctness/mean": 0.4333333343267441,
"rewards/correctness/std": 0.3761233925819397,
"rewards/multi_component_reward/mean": 0.513333360850811,
"rewards/multi_component_reward/std": 0.4702867269515991,
"rewards/no_answer_rate/mean": 0.06666666865348816,
"rewards/no_answer_rate/std": 0.1632993221282959,
"rewards/repetition_rate/mean": 0.06666666865348816,
"rewards/repetition_rate/std": 0.10327955484390258,
"sampling/importance_sampling_ratio/max": 0.06727873869240283,
"sampling/importance_sampling_ratio/mean": 0.03433487480506301,
"sampling/importance_sampling_ratio/min": 0.00048347263269847286,
"sampling/sampling_logp_difference/max": 0.42853312492370604,
"sampling/sampling_logp_difference/mean": 0.012394568137824535,
"step": 690,
"step_time": 12.268561779148877
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.10000000298023223,
"completions/max_length": 1604.2,
"completions/max_terminated_length": 1084.2,
"completions/mean_length": 907.400048828125,
"completions/mean_terminated_length": 778.053369140625,
"completions/min_length": 563.2,
"completions/min_terminated_length": 563.2,
"entropy": 0.14893935720125834,
"epoch": 0.25318761384335153,
"frac_reward_zero_std": 0.2,
"grad_norm": 0.018119433894753456,
"kl": 0.00045477717115621394,
"learning_rate": 3.932131901840491e-07,
"loss": -0.002763097546994686,
"num_tokens": 4499319.0,
"reward": 1.3900001049041748,
"reward_std": 0.6540532588958741,
"rewards/boxed_rate/mean": 0.899999988079071,
"rewards/boxed_rate/std": 0.24494898319244385,
"rewards/correctness/mean": 0.16666666865348817,
"rewards/correctness/std": 0.2728438377380371,
"rewards/multi_component_reward/mean": 0.2233333483338356,
"rewards/multi_component_reward/std": 0.3906372755765915,
"rewards/no_answer_rate/mean": 0.10000000298023223,
"rewards/no_answer_rate/std": 0.24494898319244385,
"rewards/repetition_rate/mean": 0.0,
"rewards/repetition_rate/std": 0.0,
"sampling/importance_sampling_ratio/max": 0.09045649766921997,
"sampling/importance_sampling_ratio/mean": 0.034694204293191436,
"sampling/importance_sampling_ratio/min": 0.0024610218745878347,
"sampling/sampling_logp_difference/max": 0.3914134740829468,
"sampling/sampling_logp_difference/mean": 0.010633220244199038,
"step": 695,
"step_time": 13.952417024224996
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.10000000298023223,
"completions/max_length": 1497.8,
"completions/max_terminated_length": 771.4,
"completions/mean_length": 722.333349609375,
"completions/mean_terminated_length": 584.6333435058593,
"completions/min_length": 428.8,
"completions/min_terminated_length": 428.8,
"entropy": 0.20499998504916828,
"epoch": 0.2550091074681239,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.06937753409147263,
"kl": 0.0006178093899507076,
"learning_rate": 3.922546012269938e-07,
"loss": -0.01590590924024582,
"num_tokens": 4523047.0,
"reward": 1.4600000619888305,
"reward_std": 0.7687346935272217,
"rewards/boxed_rate/mean": 0.899999988079071,
"rewards/boxed_rate/std": 0.24494898319244385,
"rewards/correctness/mean": 0.20000000596046447,
"rewards/correctness/std": 0.3098386645317078,
"rewards/multi_component_reward/mean": 0.2600000202655792,
"rewards/multi_component_reward/std": 0.4624049305915833,
"rewards/no_answer_rate/mean": 0.10000000298023223,
"rewards/no_answer_rate/std": 0.24494898319244385,
"rewards/repetition_rate/mean": 0.0,
"rewards/repetition_rate/std": 0.0,
"sampling/importance_sampling_ratio/max": 0.16114517226815223,
"sampling/importance_sampling_ratio/mean": 0.06391621977090836,
"sampling/importance_sampling_ratio/min": 0.010175443676988779,
"sampling/sampling_logp_difference/max": 0.3058194637298584,
"sampling/sampling_logp_difference/mean": 0.013935025222599507,
"step": 700,
"step_time": 13.013086147420108
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.16666666865348817,
"completions/max_length": 1429.0,
"completions/max_terminated_length": 942.0,
"completions/mean_length": 947.1667053222657,
"completions/mean_terminated_length": 811.6733581542969,
"completions/min_length": 627.2,
"completions/min_terminated_length": 627.2,
"entropy": 0.17254207134246827,
"epoch": 0.2568306010928962,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.012989591807126999,
"kl": 0.0013695822664885782,
"learning_rate": 3.9129601226993866e-07,
"loss": 0.02462577521800995,
"num_tokens": 4554762.0,
"reward": 2.000000071525574,
"reward_std": 1.0529863953590393,
"rewards/boxed_rate/mean": 0.8333333253860473,
"rewards/boxed_rate/std": 0.2728438377380371,
"rewards/correctness/mean": 0.4666666626930237,
"rewards/correctness/std": 0.4298781991004944,
"rewards/multi_component_reward/mean": 0.5000000357627868,
"rewards/multi_component_reward/std": 0.5574271440505981,
"rewards/no_answer_rate/mean": 0.16666666865348817,
"rewards/no_answer_rate/std": 0.2728438377380371,
"rewards/repetition_rate/mean": 0.03333333432674408,
"rewards/repetition_rate/std": 0.08164966106414795,
"sampling/importance_sampling_ratio/max": 0.1705483404919505,
"sampling/importance_sampling_ratio/mean": 0.062803098349832,
"sampling/importance_sampling_ratio/min": 0.011600531370969901,
"sampling/sampling_logp_difference/max": 0.36405808925628663,
"sampling/sampling_logp_difference/mean": 0.01186208426952362,
"step": 705,
"step_time": 13.669213125482202
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.1666666716337204,
"completions/max_length": 1826.4,
"completions/max_terminated_length": 1155.4,
"completions/mean_length": 1059.6667114257812,
"completions/mean_terminated_length": 841.7533569335938,
"completions/min_length": 515.2,
"completions/min_terminated_length": 515.2,
"entropy": 0.23051666493217152,
"epoch": 0.2586520947176685,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.04497559741139412,
"kl": 0.000664207028845946,
"learning_rate": 3.9033742331288344e-07,
"loss": -0.00020618487615138293,
"num_tokens": 4589378.0,
"reward": 1.8600000977516173,
"reward_std": 0.9444810330867768,
"rewards/boxed_rate/mean": 0.8333333373069763,
"rewards/boxed_rate/std": 0.2882087707519531,
"rewards/correctness/mean": 0.40000000298023225,
"rewards/correctness/std": 0.36985843181610106,
"rewards/multi_component_reward/mean": 0.4266666829586029,
"rewards/multi_component_reward/std": 0.552643096446991,
"rewards/no_answer_rate/mean": 0.1666666716337204,
"rewards/no_answer_rate/std": 0.2882087707519531,
"rewards/repetition_rate/mean": 0.03333333432674408,
"rewards/repetition_rate/std": 0.08164966106414795,
"sampling/importance_sampling_ratio/max": 0.04061034563928843,
"sampling/importance_sampling_ratio/mean": 0.008759467327035964,
"sampling/importance_sampling_ratio/min": 0.0002094253270666409,
"sampling/sampling_logp_difference/max": 0.3215909957885742,
"sampling/sampling_logp_difference/mean": 0.014831000193953513,
"step": 710,
"step_time": 15.66096483822912
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.06666666865348816,
"completions/max_length": 1843.8,
"completions/max_terminated_length": 1615.2,
"completions/mean_length": 1085.3667114257812,
"completions/mean_terminated_length": 1008.9400390625,
"completions/min_length": 637.2,
"completions/min_terminated_length": 637.2,
"entropy": 0.19345977554718655,
"epoch": 0.2604735883424408,
"frac_reward_zero_std": 0.2,
"grad_norm": 0.04418517276644707,
"kl": 0.0008112916897516697,
"learning_rate": 3.893788343558282e-07,
"loss": -0.0014110441319644452,
"num_tokens": 4626127.0,
"reward": 1.5000000953674317,
"reward_std": 0.7173020958900451,
"rewards/boxed_rate/mean": 0.9333333253860474,
"rewards/boxed_rate/std": 0.1632993221282959,
"rewards/correctness/mean": 0.20000000298023224,
"rewards/correctness/std": 0.29447373151779177,
"rewards/multi_component_reward/mean": 0.266666679084301,
"rewards/multi_component_reward/std": 0.3763957768678665,
"rewards/no_answer_rate/mean": 0.06666666865348816,
"rewards/no_answer_rate/std": 0.1632993221282959,
"rewards/repetition_rate/mean": 0.03333333432674408,
"rewards/repetition_rate/std": 0.08164966106414795,
"sampling/importance_sampling_ratio/max": 0.04309340715408325,
"sampling/importance_sampling_ratio/mean": 0.012624496826902032,
"sampling/importance_sampling_ratio/min": 0.00014569541402083248,
"sampling/sampling_logp_difference/max": 0.3676969051361084,
"sampling/sampling_logp_difference/mean": 0.01275388365611434,
"step": 715,
"step_time": 15.848505285196007
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.1666666716337204,
"completions/max_length": 1794.8,
"completions/max_terminated_length": 1234.6,
"completions/mean_length": 973.4000183105469,
"completions/mean_terminated_length": 763.0933410644532,
"completions/min_length": 480.8,
"completions/min_terminated_length": 480.8,
"entropy": 0.19203649039069812,
"epoch": 0.26229508196721313,
"frac_reward_zero_std": 0.2,
"grad_norm": 0.01788559928536415,
"kl": 0.0007585312666681906,
"learning_rate": 3.8842024539877297e-07,
"loss": -0.0016064226627349853,
"num_tokens": 4659067.0,
"reward": 1.3966667771339416,
"reward_std": 0.3670642614364624,
"rewards/boxed_rate/mean": 0.8333333373069763,
"rewards/boxed_rate/std": 0.2882087707519531,
"rewards/correctness/mean": 0.1666666626930237,
"rewards/correctness/std": 0.08164966106414795,
"rewards/multi_component_reward/mean": 0.16333335414528846,
"rewards/multi_component_reward/std": 0.28173157572746277,
"rewards/no_answer_rate/mean": 0.1666666716337204,
"rewards/no_answer_rate/std": 0.2882087707519531,
"rewards/repetition_rate/mean": 0.06666666865348816,
"rewards/repetition_rate/std": 0.1632993221282959,
"sampling/importance_sampling_ratio/max": 0.2071637876331806,
"sampling/importance_sampling_ratio/mean": 0.051915486436337234,
"sampling/importance_sampling_ratio/min": 3.873565276598517e-06,
"sampling/sampling_logp_difference/max": 0.34898459911346436,
"sampling/sampling_logp_difference/mean": 0.013108029030263424,
"step": 720,
"step_time": 15.256685120798647
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.20000000596046447,
"completions/max_length": 1793.6,
"completions/max_terminated_length": 1117.6,
"completions/mean_length": 1055.9667114257813,
"completions/mean_terminated_length": 758.4666870117187,
"completions/min_length": 497.0,
"completions/min_terminated_length": 497.0,
"entropy": 0.20671888664364815,
"epoch": 0.2641165755919854,
"frac_reward_zero_std": 0.4,
"grad_norm": 0.023016415536403656,
"kl": 0.0006531716018798762,
"learning_rate": 3.874616564417178e-07,
"loss": 0.0004923004657030105,
"num_tokens": 4692996.0,
"reward": 1.6100001096725465,
"reward_std": 0.5153546214103699,
"rewards/boxed_rate/mean": 0.7999999940395355,
"rewards/boxed_rate/std": 0.2665788769721985,
"rewards/correctness/mean": 0.3000000029802322,
"rewards/correctness/std": 0.18492921590805053,
"rewards/multi_component_reward/mean": 0.310000017285347,
"rewards/multi_component_reward/std": 0.3351393073797226,
"rewards/no_answer_rate/mean": 0.20000000596046447,
"rewards/no_answer_rate/std": 0.2665788769721985,
"rewards/repetition_rate/mean": 0.0,
"rewards/repetition_rate/std": 0.0,
"sampling/importance_sampling_ratio/max": 0.050453129876405,
"sampling/importance_sampling_ratio/mean": 0.014345021429471671,
"sampling/importance_sampling_ratio/min": 0.0007461899496814892,
"sampling/sampling_logp_difference/max": 0.4133040189743042,
"sampling/sampling_logp_difference/mean": 0.012775494437664747,
"step": 725,
"step_time": 15.257122083753348
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.20000000298023224,
"completions/max_length": 1528.6,
"completions/max_terminated_length": 1156.6,
"completions/mean_length": 1054.86669921875,
"completions/mean_terminated_length": 859.533349609375,
"completions/min_length": 625.0,
"completions/min_terminated_length": 625.0,
"entropy": 0.21140047113100688,
"epoch": 0.2659380692167577,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.018997710198163986,
"kl": 0.0006346999696688726,
"learning_rate": 3.8650306748466255e-07,
"loss": 0.0016422737389802933,
"num_tokens": 4727378.0,
"reward": 1.5333333969116212,
"reward_std": 0.7559072077274323,
"rewards/boxed_rate/mean": 0.8333333253860473,
"rewards/boxed_rate/std": 0.2728438377380371,
"rewards/correctness/mean": 0.23333333432674408,
"rewards/correctness/std": 0.30073869228363037,
"rewards/multi_component_reward/mean": 0.23333334922790527,
"rewards/multi_component_reward/std": 0.4953298598527908,
"rewards/no_answer_rate/mean": 0.16666666865348817,
"rewards/no_answer_rate/std": 0.2728438377380371,
"rewards/repetition_rate/mean": 0.06666666865348816,
"rewards/repetition_rate/std": 0.10327955484390258,
"sampling/importance_sampling_ratio/max": 0.08731604730710388,
"sampling/importance_sampling_ratio/mean": 0.020629775966517626,
"sampling/importance_sampling_ratio/min": 0.0005315004007400171,
"sampling/sampling_logp_difference/max": 0.3350215911865234,
"sampling/sampling_logp_difference/mean": 0.013690494932234287,
"step": 730,
"step_time": 13.985034779645503
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.13333333730697633,
"completions/max_length": 1815.2,
"completions/max_terminated_length": 1340.4,
"completions/mean_length": 1031.46669921875,
"completions/mean_terminated_length": 883.0133483886718,
"completions/min_length": 518.6,
"completions/min_terminated_length": 518.6,
"entropy": 0.18139945759127538,
"epoch": 0.2677595628415301,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.021211547777056694,
"kl": 0.000629005294952852,
"learning_rate": 3.8554447852760733e-07,
"loss": 0.0014220052398741245,
"num_tokens": 4761916.0,
"reward": 1.4866667628288268,
"reward_std": 0.8185165703296662,
"rewards/boxed_rate/mean": 0.8666666626930237,
"rewards/boxed_rate/std": 0.2665788769721985,
"rewards/correctness/mean": 0.20000000298023224,
"rewards/correctness/std": 0.35449349880218506,
"rewards/multi_component_reward/mean": 0.2200000137090683,
"rewards/multi_component_reward/std": 0.5127273321151733,
"rewards/no_answer_rate/mean": 0.13333333730697633,
"rewards/no_answer_rate/std": 0.2665788769721985,
"rewards/repetition_rate/mean": 0.06666666865348816,
"rewards/repetition_rate/std": 0.10327955484390258,
"sampling/importance_sampling_ratio/max": 0.06554706878960133,
"sampling/importance_sampling_ratio/mean": 0.02508344785310328,
"sampling/importance_sampling_ratio/min": 0.0035423538342001966,
"sampling/sampling_logp_difference/max": 0.31146485805511476,
"sampling/sampling_logp_difference/mean": 0.0113022543489933,
"step": 735,
"step_time": 15.48011973593384
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 883.8,
"completions/max_terminated_length": 883.8,
"completions/mean_length": 542.2000122070312,
"completions/mean_terminated_length": 542.2000122070312,
"completions/min_length": 352.4,
"completions/min_terminated_length": 352.4,
"entropy": 0.19473386829098066,
"epoch": 0.26958105646630237,
"frac_reward_zero_std": 0.4,
"grad_norm": 2.118746042251587,
"kl": 0.0021321156071887042,
"learning_rate": 3.845858895705521e-07,
"loss": -0.0064903564751148226,
"num_tokens": 4781002.0,
"reward": 1.7300001382827759,
"reward_std": 0.5729720234870911,
"rewards/boxed_rate/mean": 1.0,
"rewards/boxed_rate/std": 0.0,
"rewards/correctness/mean": 0.29999999701976776,
"rewards/correctness/std": 0.2728438377380371,
"rewards/multi_component_reward/mean": 0.430000028014183,
"rewards/multi_component_reward/std": 0.3001282274723053,
"rewards/no_answer_rate/mean": 0.0,
"rewards/no_answer_rate/std": 0.0,
"rewards/repetition_rate/mean": 0.0,
"rewards/repetition_rate/std": 0.0,
"sampling/importance_sampling_ratio/max": 0.28090705554932355,
"sampling/importance_sampling_ratio/mean": 0.15755705069750547,
"sampling/importance_sampling_ratio/min": 0.0855626948821282,
"sampling/sampling_logp_difference/max": 0.3150986671447754,
"sampling/sampling_logp_difference/mean": 0.01262940764427185,
"step": 740,
"step_time": 9.168580058775841
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.10000000298023223,
"completions/max_length": 1972.8,
"completions/max_terminated_length": 1486.4,
"completions/mean_length": 1172.000048828125,
"completions/mean_terminated_length": 1050.6600280761718,
"completions/min_length": 720.8,
"completions/min_terminated_length": 720.8,
"entropy": 0.11767480584482352,
"epoch": 0.27140255009107467,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.08145014941692352,
"kl": 0.0006028921416145749,
"learning_rate": 3.836273006134969e-07,
"loss": 0.0025216279551386832,
"num_tokens": 4818496.0,
"reward": 1.6700001001358031,
"reward_std": 0.8164093613624572,
"rewards/boxed_rate/mean": 0.899999988079071,
"rewards/boxed_rate/std": 0.24494898319244385,
"rewards/correctness/mean": 0.3,
"rewards/correctness/std": 0.34822853803634646,
"rewards/multi_component_reward/mean": 0.37000002562999723,
"rewards/multi_component_reward/std": 0.47640362083911897,
"rewards/no_answer_rate/mean": 0.10000000298023223,
"rewards/no_answer_rate/std": 0.24494898319244385,
"rewards/repetition_rate/mean": 0.0,
"rewards/repetition_rate/std": 0.0,
"sampling/importance_sampling_ratio/max": 0.14499615281820297,
"sampling/importance_sampling_ratio/mean": 0.04840177483856678,
"sampling/importance_sampling_ratio/min": 0.0013758533679049771,
"sampling/sampling_logp_difference/max": 0.32971014976501467,
"sampling/sampling_logp_difference/mean": 0.008220831863582134,
"step": 745,
"step_time": 16.791488209925593
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.36666667759418486,
"completions/max_length": 1719.0,
"completions/max_terminated_length": 1232.4,
"completions/mean_length": 1217.600048828125,
"completions/mean_terminated_length": 822.3233520507813,
"completions/min_length": 564.6,
"completions/min_terminated_length": 564.6,
"entropy": 0.19243225703636804,
"epoch": 0.273224043715847,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.03210171312093735,
"kl": 0.0007616788071269791,
"learning_rate": 3.826687116564417e-07,
"loss": -0.005124400556087494,
"num_tokens": 4864732.0,
"reward": 1.7300000667572022,
"reward_std": 0.9617222905158996,
"rewards/boxed_rate/mean": 0.7666666626930236,
"rewards/boxed_rate/std": 0.3761233925819397,
"rewards/correctness/mean": 0.33333334028720857,
"rewards/correctness/std": 0.3977532863616943,
"rewards/multi_component_reward/mean": 0.29666668772697447,
"rewards/multi_component_reward/std": 0.6426073163747787,
"rewards/no_answer_rate/mean": 0.2333333373069763,
"rewards/no_answer_rate/std": 0.3761233925819397,
"rewards/repetition_rate/mean": 0.1,
"rewards/repetition_rate/std": 0.10954451560974121,
"sampling/importance_sampling_ratio/max": 0.11619223132729531,
"sampling/importance_sampling_ratio/mean": 0.03387247350765392,
"sampling/importance_sampling_ratio/min": 0.0005780367053725346,
"sampling/sampling_logp_difference/max": 0.3284996747970581,
"sampling/sampling_logp_difference/mean": 0.012111451011151075,
"step": 750,
"step_time": 15.568404103443026
},
{
"epoch": 0.273224043715847,
"eval_clip_ratio/high_max": 0.0,
"eval_clip_ratio/high_mean": 0.0,
"eval_clip_ratio/low_mean": 0.0,
"eval_clip_ratio/low_min": 0.0,
"eval_clip_ratio/region_mean": 0.0,
"eval_completions/clipped_ratio": 0.17000000417232514,
"eval_completions/max_length": 1756.9,
"eval_completions/max_terminated_length": 1208.56,
"eval_completions/mean_length": 1074.366694946289,
"eval_completions/mean_terminated_length": 874.3350158691406,
"eval_completions/min_length": 579.92,
"eval_completions/min_terminated_length": 579.92,
"eval_entropy": 0.18485607638955115,
"eval_frac_reward_zero_std": 0.1,
"eval_kl": 0.0007376758259488269,
"eval_loss": -0.0014031080063432455,
"eval_num_tokens": 4864732.0,
"eval_reward": 1.6413334274291993,
"eval_reward_std": 0.7854073830693961,
"eval_rewards/boxed_rate/mean": 0.8366666615009308,
"eval_rewards/boxed_rate/std": 0.27878632068634035,
"eval_rewards/correctness/mean": 0.2933333370089531,
"eval_rewards/correctness/std": 0.32616410493850706,
"eval_rewards/multi_component_reward/mean": 0.3146666841953993,
"eval_rewards/multi_component_reward/std": 0.4994544792175293,
"eval_rewards/no_answer_rate/mean": 0.16333333730697633,
"eval_rewards/no_answer_rate/std": 0.27878632068634035,
"eval_rewards/repetition_rate/mean": 0.03333333432674408,
"eval_rewards/repetition_rate/std": 0.06964570760726929,
"eval_runtime": 681.9794,
"eval_samples_per_second": 0.073,
"eval_sampling/importance_sampling_ratio/max": 0.09388977923779748,
"eval_sampling/importance_sampling_ratio/mean": 0.029817550851148553,
"eval_sampling/importance_sampling_ratio/min": 0.003447885300730625,
"eval_sampling/sampling_logp_difference/max": 0.7785206460952758,
"eval_sampling/sampling_logp_difference/mean": 0.014042644733563066,
"eval_steps_per_second": 0.013,
"step": 750
}
],
"logging_steps": 5,
"max_steps": 2745,
"num_input_tokens_seen": 4864732,
"num_train_epochs": 1,
"save_steps": 150,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}