GRPO-Model / checkpoint-730 /trainer_state.json
mjf-su's picture
Add files using upload-large-folder tool
b228ef5 verified
Raw
History Blame Contribute Delete
106 kB
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.1251500085719184,
"eval_steps": 500,
"global_step": 730,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.009895833674818277,
"completions/max_length": 1024.0,
"completions/max_terminated_length": 921.2,
"completions/mean_length": 747.1151245117187,
"completions/mean_terminated_length": 744.3689392089843,
"completions/min_length": 457.8,
"completions/min_terminated_length": 457.8,
"entropy": 0.7938701828320821,
"epoch": 0.0017143836790673753,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.294921875,
"learning_rate": 1e-06,
"loss": 0.0024,
"num_tokens": 4482493.0,
"reward": -11.88613681793213,
"reward_std": 9.975114345550537,
"rewards/ADERawReward/mean": -11.914521789550781,
"rewards/ADERawReward/std": 9.975466585159301,
"rewards/StrictFormatReward/mean": 0.28385417386889455,
"rewards/StrictFormatReward/std": 2.689612889289856,
"sampling/importance_sampling_ratio/max": 2.7710394859313965,
"sampling/importance_sampling_ratio/mean": 0.38329996466636657,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 4.348051142692566,
"sampling/sampling_logp_difference/mean": 0.02634742669761181,
"step": 10,
"step_time": 31.851699317700696
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.014062500465661288,
"completions/max_length": 1024.0,
"completions/max_terminated_length": 901.0,
"completions/mean_length": 746.5093994140625,
"completions/mean_terminated_length": 742.5637084960938,
"completions/min_length": 319.8,
"completions/min_terminated_length": 319.8,
"entropy": 0.783385141690572,
"epoch": 0.0034287673581347507,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.341796875,
"learning_rate": 1e-06,
"loss": -0.0167,
"num_tokens": 8963695.0,
"reward": -10.739660549163819,
"reward_std": 7.596912527084351,
"rewards/ADERawReward/mean": -10.76804552078247,
"rewards/ADERawReward/std": 7.586498594284057,
"rewards/StrictFormatReward/mean": 0.2838541708886623,
"rewards/StrictFormatReward/std": 2.69962215423584,
"sampling/importance_sampling_ratio/max": 2.778630328178406,
"sampling/importance_sampling_ratio/mean": 0.38422776758670807,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 4.3024064064025875,
"sampling/sampling_logp_difference/mean": 0.026079285889863968,
"step": 20,
"step_time": 31.08811371029442
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01302083358168602,
"completions/max_length": 1024.0,
"completions/max_terminated_length": 916.5,
"completions/mean_length": 745.9713745117188,
"completions/mean_terminated_length": 742.3107482910157,
"completions/min_length": 344.2,
"completions/min_terminated_length": 344.2,
"entropy": 0.777154250939687,
"epoch": 0.005143151037202126,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.240234375,
"learning_rate": 1e-06,
"loss": -0.0033,
"num_tokens": 13443080.0,
"reward": -10.726643276214599,
"reward_std": 6.941620349884033,
"rewards/ADERawReward/mean": -10.759612274169921,
"rewards/ADERawReward/std": 6.939747667312622,
"rewards/StrictFormatReward/mean": 0.32968750670552255,
"rewards/StrictFormatReward/std": 2.6174988746643066,
"sampling/importance_sampling_ratio/max": 2.820967411994934,
"sampling/importance_sampling_ratio/mean": 0.3970964789390564,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 4.230282878875732,
"sampling/sampling_logp_difference/mean": 0.025640238262712955,
"step": 30,
"step_time": 31.281337887697738
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.006250000186264515,
"completions/max_length": 1006.1,
"completions/max_terminated_length": 952.6,
"completions/mean_length": 746.8578247070312,
"completions/mean_terminated_length": 745.1074523925781,
"completions/min_length": 574.6,
"completions/min_terminated_length": 574.6,
"entropy": 0.7786477545897166,
"epoch": 0.006857534716269501,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.353515625,
"learning_rate": 1e-06,
"loss": 0.0052,
"num_tokens": 17925079.0,
"reward": -10.400494289398193,
"reward_std": 8.363166570663452,
"rewards/ADERawReward/mean": -10.453514766693115,
"rewards/ADERawReward/std": 8.35603289604187,
"rewards/StrictFormatReward/mean": 0.5302083313465118,
"rewards/StrictFormatReward/std": 2.1997122406959533,
"sampling/importance_sampling_ratio/max": 2.7520124673843385,
"sampling/importance_sampling_ratio/mean": 0.4182109236717224,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 4.882514882087707,
"sampling/sampling_logp_difference/mean": 0.025688859820365905,
"step": 40,
"step_time": 30.94830545460136
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.010416666883975268,
"completions/max_length": 1008.7,
"completions/max_terminated_length": 908.0,
"completions/mean_length": 744.6708557128907,
"completions/mean_terminated_length": 741.7320251464844,
"completions/min_length": 318.0,
"completions/min_terminated_length": 318.0,
"entropy": 0.7797636012236278,
"epoch": 0.008571918395336876,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.296875,
"learning_rate": 1e-06,
"loss": 0.0059,
"num_tokens": 22402927.0,
"reward": -11.912008953094482,
"reward_std": 25.465690279006957,
"rewards/ADERawReward/mean": -11.948414325714111,
"rewards/ADERawReward/std": 25.452851724624633,
"rewards/StrictFormatReward/mean": 0.3640625074505806,
"rewards/StrictFormatReward/std": 2.5515830755233764,
"sampling/importance_sampling_ratio/max": 2.7972999811172485,
"sampling/importance_sampling_ratio/mean": 0.41742126941680907,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 4.1138537406921385,
"sampling/sampling_logp_difference/mean": 0.025952593609690666,
"step": 50,
"step_time": 30.99103705089656
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00572916679084301,
"completions/max_length": 982.4,
"completions/max_terminated_length": 894.4,
"completions/mean_length": 743.1708618164063,
"completions/mean_terminated_length": 741.5553588867188,
"completions/min_length": 389.5,
"completions/min_terminated_length": 389.5,
"entropy": 0.7705018659432729,
"epoch": 0.010286302074404252,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.294921875,
"learning_rate": 1e-06,
"loss": -0.001,
"num_tokens": 26877415.0,
"reward": -10.354321956634521,
"reward_std": 7.085966682434082,
"rewards/ADERawReward/mean": -10.3998948097229,
"rewards/ADERawReward/std": 7.078661155700684,
"rewards/StrictFormatReward/mean": 0.45572916492819787,
"rewards/StrictFormatReward/std": 2.357165718078613,
"sampling/importance_sampling_ratio/max": 2.8156795501708984,
"sampling/importance_sampling_ratio/mean": 0.4263548344373703,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 4.213912606239319,
"sampling/sampling_logp_difference/mean": 0.025748718157410623,
"step": 60,
"step_time": 30.629806772600567
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.007812500139698387,
"completions/max_length": 997.5,
"completions/max_terminated_length": 901.5,
"completions/mean_length": 745.1635559082031,
"completions/mean_terminated_length": 742.9772766113281,
"completions/min_length": 448.0,
"completions/min_terminated_length": 448.0,
"entropy": 0.7808759689331055,
"epoch": 0.012000685753471628,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.416015625,
"learning_rate": 1e-06,
"loss": -0.0022,
"num_tokens": 31355793.0,
"reward": -10.355865859985352,
"reward_std": 7.629749822616577,
"rewards/ADERawReward/mean": -10.399147129058838,
"rewards/ADERawReward/std": 7.623885774612427,
"rewards/StrictFormatReward/mean": 0.43281250447034836,
"rewards/StrictFormatReward/std": 2.4088792324066164,
"sampling/importance_sampling_ratio/max": 2.7858126163482666,
"sampling/importance_sampling_ratio/mean": 0.4104765444993973,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 4.654694700241089,
"sampling/sampling_logp_difference/mean": 0.02596179526299238,
"step": 70,
"step_time": 30.90157012869895
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.006250000139698386,
"completions/max_length": 997.0,
"completions/max_terminated_length": 911.7,
"completions/mean_length": 743.9010620117188,
"completions/mean_terminated_length": 742.1458801269531,
"completions/min_length": 506.5,
"completions/min_terminated_length": 506.5,
"entropy": 0.7690683722496032,
"epoch": 0.013715069432539003,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.259765625,
"learning_rate": 1e-06,
"loss": -0.0147,
"num_tokens": 35831699.0,
"reward": -9.603075504302979,
"reward_std": 7.397923803329467,
"rewards/ADERawReward/mean": -9.64406452178955,
"rewards/ADERawReward/std": 7.394052314758301,
"rewards/StrictFormatReward/mean": 0.40989583656191825,
"rewards/StrictFormatReward/std": 2.446663808822632,
"sampling/importance_sampling_ratio/max": 2.8509432315826415,
"sampling/importance_sampling_ratio/mean": 0.42068196535110475,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 3.871529459953308,
"sampling/sampling_logp_difference/mean": 0.025504560954868794,
"step": 80,
"step_time": 30.775000178898335
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.006770833488553762,
"completions/max_length": 1018.6,
"completions/max_terminated_length": 921.4,
"completions/mean_length": 743.0687744140625,
"completions/mean_terminated_length": 741.1454040527344,
"completions/min_length": 413.2,
"completions/min_terminated_length": 413.2,
"entropy": 0.7730992416540782,
"epoch": 0.015429453111606378,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.330078125,
"learning_rate": 1e-06,
"loss": -0.0209,
"num_tokens": 40306439.0,
"reward": -9.999512577056885,
"reward_std": 6.795107650756836,
"rewards/ADERawReward/mean": -10.04222068786621,
"rewards/ADERawReward/std": 6.785987043380738,
"rewards/StrictFormatReward/mean": 0.4270833358168602,
"rewards/StrictFormatReward/std": 2.427087187767029,
"sampling/importance_sampling_ratio/max": 2.806507611274719,
"sampling/importance_sampling_ratio/mean": 0.41283826231956483,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 4.893343424797058,
"sampling/sampling_logp_difference/mean": 0.02597447969019413,
"step": 90,
"step_time": 31.03075063330034
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.006250000139698386,
"completions/max_length": 989.1,
"completions/max_terminated_length": 911.9,
"completions/mean_length": 744.7958557128907,
"completions/mean_terminated_length": 743.0538269042969,
"completions/min_length": 479.0,
"completions/min_terminated_length": 479.0,
"entropy": 0.7617896695931753,
"epoch": 0.01714383679067375,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.26953125,
"learning_rate": 1e-06,
"loss": 0.0056,
"num_tokens": 44785151.0,
"reward": -10.150122165679932,
"reward_std": 6.947455310821534,
"rewards/ADERawReward/mean": -10.1985595703125,
"rewards/ADERawReward/std": 6.9321370124816895,
"rewards/StrictFormatReward/mean": 0.48437499925494193,
"rewards/StrictFormatReward/std": 2.2780819296836854,
"sampling/importance_sampling_ratio/max": 2.8471882343292236,
"sampling/importance_sampling_ratio/mean": 0.4186240643262863,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 5.192489242553711,
"sampling/sampling_logp_difference/mean": 0.025915385968983174,
"step": 100,
"step_time": 30.777787425796852
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.008854166837409139,
"completions/max_length": 1016.1,
"completions/max_terminated_length": 927.6,
"completions/mean_length": 747.430224609375,
"completions/mean_terminated_length": 744.9568664550782,
"completions/min_length": 542.5,
"completions/min_terminated_length": 542.5,
"entropy": 0.7839122792085012,
"epoch": 0.018858220469741126,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.369140625,
"learning_rate": 1e-06,
"loss": 0.0095,
"num_tokens": 49267497.0,
"reward": -10.34236192703247,
"reward_std": 6.522932863235473,
"rewards/ADERawReward/mean": -10.377049255371094,
"rewards/ADERawReward/std": 6.512392950057984,
"rewards/StrictFormatReward/mean": 0.3468750029802322,
"rewards/StrictFormatReward/std": 2.5984415769577027,
"sampling/importance_sampling_ratio/max": 2.919663596153259,
"sampling/importance_sampling_ratio/mean": 0.42760642170906066,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 4.582067346572876,
"sampling/sampling_logp_difference/mean": 0.025711694732308388,
"step": 110,
"step_time": 31.13533423260669
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.006250000139698386,
"completions/max_length": 981.8,
"completions/max_terminated_length": 905.6,
"completions/mean_length": 741.3906433105469,
"completions/mean_terminated_length": 739.6170837402344,
"completions/min_length": 322.0,
"completions/min_terminated_length": 322.0,
"entropy": 0.7655617932478587,
"epoch": 0.020572604148808505,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.2255859375,
"learning_rate": 1e-06,
"loss": -0.0003,
"num_tokens": 53739095.0,
"reward": -9.396285820007325,
"reward_std": 6.019436550140381,
"rewards/ADERawReward/mean": -9.43211908340454,
"rewards/ADERawReward/std": 5.99921350479126,
"rewards/StrictFormatReward/mean": 0.35833333134651185,
"rewards/StrictFormatReward/std": 2.565337634086609,
"sampling/importance_sampling_ratio/max": 2.7736236810684205,
"sampling/importance_sampling_ratio/mean": 0.4329004347324371,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 5.103514838218689,
"sampling/sampling_logp_difference/mean": 0.02581946086138487,
"step": 120,
"step_time": 30.675611695800036
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.006250000139698386,
"completions/max_length": 993.0,
"completions/max_terminated_length": 917.3,
"completions/mean_length": 743.2172119140625,
"completions/mean_terminated_length": 741.4531127929688,
"completions/min_length": 448.6,
"completions/min_terminated_length": 448.6,
"entropy": 0.7609198371569316,
"epoch": 0.02228698782787588,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.306640625,
"learning_rate": 1e-06,
"loss": -0.0186,
"num_tokens": 58213992.0,
"reward": -10.035307788848877,
"reward_std": 7.6791582107543945,
"rewards/ADERawReward/mean": -10.080307579040527,
"rewards/ADERawReward/std": 7.661759281158448,
"rewards/StrictFormatReward/mean": 0.45,
"rewards/StrictFormatReward/std": 2.383046817779541,
"sampling/importance_sampling_ratio/max": 2.7497065782547,
"sampling/importance_sampling_ratio/mean": 0.38673609495162964,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 4.264748442173004,
"sampling/sampling_logp_difference/mean": 0.02555535715073347,
"step": 130,
"step_time": 30.85125565490016
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.008333333535119891,
"completions/max_length": 995.9,
"completions/max_terminated_length": 936.1,
"completions/mean_length": 747.1015808105469,
"completions/mean_terminated_length": 744.7781860351563,
"completions/min_length": 504.1,
"completions/min_terminated_length": 504.1,
"entropy": 0.7680625875790914,
"epoch": 0.024001371506943255,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.283203125,
"learning_rate": 1e-06,
"loss": -0.014,
"num_tokens": 62696459.0,
"reward": -9.6462965965271,
"reward_std": 6.933882856369019,
"rewards/ADERawReward/mean": -9.687859153747558,
"rewards/ADERawReward/std": 6.932486724853516,
"rewards/StrictFormatReward/mean": 0.41562500298023225,
"rewards/StrictFormatReward/std": 2.435987985134125,
"sampling/importance_sampling_ratio/max": 2.7624801874160765,
"sampling/importance_sampling_ratio/mean": 0.39736475646495817,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 4.469653701782226,
"sampling/sampling_logp_difference/mean": 0.025803259573876858,
"step": 140,
"step_time": 30.751169363802184
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0067708335351198915,
"completions/max_length": 996.8,
"completions/max_terminated_length": 927.2,
"completions/mean_length": 743.0797119140625,
"completions/mean_terminated_length": 741.1633605957031,
"completions/min_length": 364.4,
"completions/min_terminated_length": 364.4,
"entropy": 0.771180788675944,
"epoch": 0.02571575518601063,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.322265625,
"learning_rate": 1e-06,
"loss": -0.0135,
"num_tokens": 67171268.0,
"reward": -9.683474731445312,
"reward_std": 6.344115209579468,
"rewards/ADERawReward/mean": -9.721026802062989,
"rewards/ADERawReward/std": 6.331903743743896,
"rewards/StrictFormatReward/mean": 0.3755208313465118,
"rewards/StrictFormatReward/std": 2.536852979660034,
"sampling/importance_sampling_ratio/max": 2.6906844854354857,
"sampling/importance_sampling_ratio/mean": 0.3939074516296387,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 4.591675233840943,
"sampling/sampling_logp_difference/mean": 0.026031140238046646,
"step": 150,
"step_time": 30.759532438102177
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.009375000186264516,
"completions/max_length": 1001.1,
"completions/max_terminated_length": 928.5,
"completions/mean_length": 743.9906616210938,
"completions/mean_terminated_length": 741.3461547851563,
"completions/min_length": 396.0,
"completions/min_terminated_length": 396.0,
"entropy": 0.772610588868459,
"epoch": 0.027430138865078006,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.291015625,
"learning_rate": 1e-06,
"loss": -0.006,
"num_tokens": 71648274.0,
"reward": -9.378812217712403,
"reward_std": 6.656644868850708,
"rewards/ADERawReward/mean": -9.419801712036133,
"rewards/ADERawReward/std": 6.638844442367554,
"rewards/StrictFormatReward/mean": 0.4098958343267441,
"rewards/StrictFormatReward/std": 2.457722854614258,
"sampling/importance_sampling_ratio/max": 2.8564789295196533,
"sampling/importance_sampling_ratio/mean": 0.3868180692195892,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 5.245336532592773,
"sampling/sampling_logp_difference/mean": 0.026049494743347168,
"step": 160,
"step_time": 31.00511598830053
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00572916679084301,
"completions/max_length": 990.5,
"completions/max_terminated_length": 943.8,
"completions/mean_length": 742.833349609375,
"completions/mean_terminated_length": 741.2216247558594,
"completions/min_length": 507.2,
"completions/min_terminated_length": 507.2,
"entropy": 0.7669690032800038,
"epoch": 0.02914452254414538,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.2421875,
"learning_rate": 1e-06,
"loss": 0.0046,
"num_tokens": 76122162.0,
"reward": -9.605478668212891,
"reward_std": 5.803140306472779,
"rewards/ADERawReward/mean": -9.645895195007324,
"rewards/ADERawReward/std": 5.7928056716918945,
"rewards/StrictFormatReward/mean": 0.4041666634380817,
"rewards/StrictFormatReward/std": 2.4726577758789063,
"sampling/importance_sampling_ratio/max": 2.71953125,
"sampling/importance_sampling_ratio/mean": 0.4068588227033615,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 4.668109703063965,
"sampling/sampling_logp_difference/mean": 0.025866417028009892,
"step": 170,
"step_time": 30.801190392104036
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0067708335351198915,
"completions/max_length": 992.8,
"completions/max_terminated_length": 902.9,
"completions/mean_length": 745.7635559082031,
"completions/mean_terminated_length": 743.8595642089844,
"completions/min_length": 539.9,
"completions/min_terminated_length": 539.9,
"entropy": 0.7681126793225607,
"epoch": 0.030858906223212756,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.330078125,
"learning_rate": 1e-06,
"loss": -0.0009,
"num_tokens": 80601532.0,
"reward": -10.118904399871827,
"reward_std": 6.3121555805206295,
"rewards/ADERawReward/mean": -10.168487930297852,
"rewards/ADERawReward/std": 6.31596007347107,
"rewards/StrictFormatReward/mean": 0.49583332538604735,
"rewards/StrictFormatReward/std": 2.2922281503677366,
"sampling/importance_sampling_ratio/max": 2.808049750328064,
"sampling/importance_sampling_ratio/mean": 0.40117439031600954,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 5.4242959260940555,
"sampling/sampling_logp_difference/mean": 0.026261823065578938,
"step": 180,
"step_time": 30.95573297930241
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.009375000186264516,
"completions/max_length": 1006.3,
"completions/max_terminated_length": 940.1,
"completions/mean_length": 745.2932495117187,
"completions/mean_terminated_length": 742.6480163574219,
"completions/min_length": 493.3,
"completions/min_terminated_length": 493.3,
"entropy": 0.7715065141518911,
"epoch": 0.03257328990228013,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.349609375,
"learning_rate": 1e-06,
"loss": -0.0167,
"num_tokens": 85080239.0,
"reward": -9.032559633255005,
"reward_std": 5.6337450504302975,
"rewards/ADERawReward/mean": -9.070111560821534,
"rewards/ADERawReward/std": 5.615639925003052,
"rewards/StrictFormatReward/mean": 0.3755208343267441,
"rewards/StrictFormatReward/std": 2.532916522026062,
"sampling/importance_sampling_ratio/max": 2.906028723716736,
"sampling/importance_sampling_ratio/mean": 0.43047145903110506,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 4.560648036003113,
"sampling/sampling_logp_difference/mean": 0.02589159458875656,
"step": 190,
"step_time": 31.04905029379588
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.004687500139698386,
"completions/max_length": 967.5,
"completions/max_terminated_length": 901.5,
"completions/mean_length": 743.9588745117187,
"completions/mean_terminated_length": 742.6530639648438,
"completions/min_length": 482.7,
"completions/min_terminated_length": 482.7,
"entropy": 0.7573511083920796,
"epoch": 0.0342876735813475,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.373046875,
"learning_rate": 1e-06,
"loss": -0.0071,
"num_tokens": 89554992.0,
"reward": -9.594873905181885,
"reward_std": 7.4608996391296385,
"rewards/ADERawReward/mean": -9.645030212402343,
"rewards/ADERawReward/std": 7.4621100425720215,
"rewards/StrictFormatReward/mean": 0.5015625014901162,
"rewards/StrictFormatReward/std": 2.2230212569236754,
"sampling/importance_sampling_ratio/max": 2.762194800376892,
"sampling/importance_sampling_ratio/mean": 0.3942416876554489,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 4.715522122383118,
"sampling/sampling_logp_difference/mean": 0.02548809293657541,
"step": 200,
"step_time": 30.452962850301994
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.008854166883975268,
"completions/max_length": 1002.2,
"completions/max_terminated_length": 908.9,
"completions/mean_length": 749.1671997070313,
"completions/mean_terminated_length": 746.7061828613281,
"completions/min_length": 479.3,
"completions/min_terminated_length": 479.3,
"entropy": 0.7769242246945699,
"epoch": 0.03600205726041488,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.349609375,
"learning_rate": 1e-06,
"loss": -0.0264,
"num_tokens": 94041409.0,
"reward": -9.6312180519104,
"reward_std": 5.488570928573608,
"rewards/ADERawReward/mean": -9.677363443374634,
"rewards/ADERawReward/std": 5.482590246200561,
"rewards/StrictFormatReward/mean": 0.46145834028720856,
"rewards/StrictFormatReward/std": 2.369260883331299,
"sampling/importance_sampling_ratio/max": 2.827503228187561,
"sampling/importance_sampling_ratio/mean": 0.39889043271541597,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 4.615737795829773,
"sampling/sampling_logp_difference/mean": 0.02642081268131733,
"step": 210,
"step_time": 31.002005700898007
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.007291666883975268,
"completions/max_length": 1011.2,
"completions/max_terminated_length": 917.8,
"completions/mean_length": 744.7901306152344,
"completions/mean_terminated_length": 742.7363525390625,
"completions/min_length": 326.2,
"completions/min_terminated_length": 326.2,
"entropy": 0.7611714641253153,
"epoch": 0.03771644093948225,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.2470703125,
"learning_rate": 1e-06,
"loss": -0.0072,
"num_tokens": 98520158.0,
"reward": -9.730458450317382,
"reward_std": 6.581094980239868,
"rewards/ADERawReward/mean": -9.778323078155518,
"rewards/ADERawReward/std": 6.570281600952148,
"rewards/StrictFormatReward/mean": 0.478645833581686,
"rewards/StrictFormatReward/std": 2.319968819618225,
"sampling/importance_sampling_ratio/max": 2.9045305013656617,
"sampling/importance_sampling_ratio/mean": 0.4044792056083679,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 4.901871252059936,
"sampling/sampling_logp_difference/mean": 0.026320008002221583,
"step": 220,
"step_time": 31.057802741799968
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00833333358168602,
"completions/max_length": 1006.6,
"completions/max_terminated_length": 907.8,
"completions/mean_length": 744.116162109375,
"completions/mean_terminated_length": 741.7729553222656,
"completions/min_length": 364.6,
"completions/min_terminated_length": 364.6,
"entropy": 0.7634056508541107,
"epoch": 0.03943082461854963,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.35546875,
"learning_rate": 1e-06,
"loss": 0.0058,
"num_tokens": 102995469.0,
"reward": -9.594787311553954,
"reward_std": 6.4190247535705565,
"rewards/ADERawReward/mean": -9.632339668273925,
"rewards/ADERawReward/std": 6.41018123626709,
"rewards/StrictFormatReward/mean": 0.37552083283662796,
"rewards/StrictFormatReward/std": 2.4954795360565187,
"sampling/importance_sampling_ratio/max": 2.7927833795547485,
"sampling/importance_sampling_ratio/mean": 0.38037638664245604,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 5.303525161743164,
"sampling/sampling_logp_difference/mean": 0.026079374738037585,
"step": 230,
"step_time": 30.925862592204066
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625000232830644,
"completions/max_length": 1024.0,
"completions/max_terminated_length": 922.7,
"completions/mean_length": 747.7349060058593,
"completions/mean_terminated_length": 743.3495300292968,
"completions/min_length": 425.3,
"completions/min_terminated_length": 425.3,
"entropy": 0.7614292343457539,
"epoch": 0.04114520829761701,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.3984375,
"learning_rate": 1e-06,
"loss": 0.0014,
"num_tokens": 107478240.0,
"reward": -9.159432983398437,
"reward_std": 5.858185243606568,
"rewards/ADERawReward/mean": -9.192401695251466,
"rewards/ADERawReward/std": 5.8467125415802,
"rewards/StrictFormatReward/mean": 0.3296875016763806,
"rewards/StrictFormatReward/std": 2.6086880922317506,
"sampling/importance_sampling_ratio/max": 2.8600486755371093,
"sampling/importance_sampling_ratio/mean": 0.38983087837696073,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 5.5773406505584715,
"sampling/sampling_logp_difference/mean": 0.026277855411171912,
"step": 240,
"step_time": 31.37704919550306
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.011458333488553763,
"completions/max_length": 1012.8,
"completions/max_terminated_length": 905.7,
"completions/mean_length": 743.3661682128907,
"completions/mean_terminated_length": 740.1103088378907,
"completions/min_length": 399.4,
"completions/min_terminated_length": 399.4,
"entropy": 0.753006378809611,
"epoch": 0.04285959197668438,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.35546875,
"learning_rate": 1e-06,
"loss": -0.0194,
"num_tokens": 111952815.0,
"reward": -9.348959684371948,
"reward_std": 5.709445333480835,
"rewards/ADERawReward/mean": -9.387657499313354,
"rewards/ADERawReward/std": 5.683103704452515,
"rewards/StrictFormatReward/mean": 0.38697916716337205,
"rewards/StrictFormatReward/std": 2.5156437873840334,
"sampling/importance_sampling_ratio/max": 2.7942948818206785,
"sampling/importance_sampling_ratio/mean": 0.3898850232362747,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 4.9041135787963865,
"sampling/sampling_logp_difference/mean": 0.026003769040107726,
"step": 250,
"step_time": 31.078463642198766
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.016145833814516664,
"completions/max_length": 1017.2,
"completions/max_terminated_length": 912.4,
"completions/mean_length": 749.7567993164063,
"completions/mean_terminated_length": 745.2748840332031,
"completions/min_length": 587.3,
"completions/min_terminated_length": 587.3,
"entropy": 0.7615062018235524,
"epoch": 0.04457397565575176,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.3984375,
"learning_rate": 1e-06,
"loss": -0.0051,
"num_tokens": 116440044.0,
"reward": -9.92240343093872,
"reward_std": 5.772085857391358,
"rewards/ADERawReward/mean": -9.958809566497802,
"rewards/ADERawReward/std": 5.756062030792236,
"rewards/StrictFormatReward/mean": 0.36406250670552254,
"rewards/StrictFormatReward/std": 2.54167058467865,
"sampling/importance_sampling_ratio/max": 2.7225013256072996,
"sampling/importance_sampling_ratio/mean": 0.3842493683099747,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 4.781629824638367,
"sampling/sampling_logp_difference/mean": 0.02624006476253271,
"step": 260,
"step_time": 31.103271031705663
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.008333333535119891,
"completions/max_length": 998.2,
"completions/max_terminated_length": 911.1,
"completions/mean_length": 743.7968994140625,
"completions/mean_terminated_length": 741.4540466308594,
"completions/min_length": 510.3,
"completions/min_terminated_length": 510.3,
"entropy": 0.7562108953793844,
"epoch": 0.04628835933481913,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.1953125,
"learning_rate": 1e-06,
"loss": -0.0247,
"num_tokens": 120916390.0,
"reward": -9.01723370552063,
"reward_std": 5.40621428489685,
"rewards/ADERawReward/mean": -9.065098094940186,
"rewards/ADERawReward/std": 5.384741115570068,
"rewards/StrictFormatReward/mean": 0.4786458298563957,
"rewards/StrictFormatReward/std": 2.2954741954803466,
"sampling/importance_sampling_ratio/max": 2.789908194541931,
"sampling/importance_sampling_ratio/mean": 0.40749328434467313,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 4.034530830383301,
"sampling/sampling_logp_difference/mean": 0.026305202022194864,
"step": 270,
"step_time": 30.820387327201026
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.009375000139698386,
"completions/max_length": 1011.8,
"completions/max_terminated_length": 914.4,
"completions/mean_length": 745.3156433105469,
"completions/mean_terminated_length": 742.68203125,
"completions/min_length": 492.1,
"completions/min_terminated_length": 492.1,
"entropy": 0.757685911655426,
"epoch": 0.04800274301388651,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.349609375,
"learning_rate": 1e-06,
"loss": 0.002,
"num_tokens": 125395556.0,
"reward": -9.0065682888031,
"reward_std": 5.3750749111175535,
"rewards/ADERawReward/mean": -9.06875557899475,
"rewards/ADERawReward/std": 5.365862655639648,
"rewards/StrictFormatReward/mean": 0.6218749970197678,
"rewards/StrictFormatReward/std": 1.9849600195884705,
"sampling/importance_sampling_ratio/max": 2.791543960571289,
"sampling/importance_sampling_ratio/mean": 0.36947061419487,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 5.032503914833069,
"sampling/sampling_logp_difference/mean": 0.02653008736670017,
"step": 280,
"step_time": 31.023602066392776
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.009375000139698386,
"completions/max_length": 1011.5,
"completions/max_terminated_length": 906.5,
"completions/mean_length": 745.1588745117188,
"completions/mean_terminated_length": 742.5252624511719,
"completions/min_length": 384.9,
"completions/min_terminated_length": 384.9,
"entropy": 0.7569859484831493,
"epoch": 0.04971712669295388,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.275390625,
"learning_rate": 1e-06,
"loss": -0.0025,
"num_tokens": 129874581.0,
"reward": -9.8903018951416,
"reward_std": 5.79907751083374,
"rewards/ADERawReward/mean": -9.936447715759277,
"rewards/ADERawReward/std": 5.782765436172485,
"rewards/StrictFormatReward/mean": 0.4614583358168602,
"rewards/StrictFormatReward/std": 2.3571563005447387,
"sampling/importance_sampling_ratio/max": 2.780865716934204,
"sampling/importance_sampling_ratio/mean": 0.3752612203359604,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 5.010831832885742,
"sampling/sampling_logp_difference/mean": 0.026486290618777275,
"step": 290,
"step_time": 31.036961890700333
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.010937500186264515,
"completions/max_length": 1024.0,
"completions/max_terminated_length": 909.3,
"completions/mean_length": 743.6302307128906,
"completions/mean_terminated_length": 740.5345153808594,
"completions/min_length": 545.1,
"completions/min_terminated_length": 545.1,
"entropy": 0.7479812840620677,
"epoch": 0.05143151037202126,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.296875,
"learning_rate": 1e-06,
"loss": 0.0075,
"num_tokens": 134349103.0,
"reward": -8.726549053192139,
"reward_std": 5.281997394561768,
"rewards/ADERawReward/mean": -8.780142164230346,
"rewards/ADERawReward/std": 5.273246812820434,
"rewards/StrictFormatReward/mean": 0.5359374985098839,
"rewards/StrictFormatReward/std": 2.176564943790436,
"sampling/importance_sampling_ratio/max": 2.9104627132415772,
"sampling/importance_sampling_ratio/mean": 0.40789560675621034,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 4.580249309539795,
"sampling/sampling_logp_difference/mean": 0.025899984315037726,
"step": 300,
"step_time": 31.32498294780089
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.007812500186264514,
"completions/max_length": 998.0,
"completions/max_terminated_length": 917.8,
"completions/mean_length": 745.6083557128907,
"completions/mean_terminated_length": 743.4208374023438,
"completions/min_length": 550.1,
"completions/min_terminated_length": 550.1,
"entropy": 0.7465774556001027,
"epoch": 0.05314589405108863,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.353515625,
"learning_rate": 1e-06,
"loss": -0.0123,
"num_tokens": 138828591.0,
"reward": -9.077164840698241,
"reward_std": 5.992494773864746,
"rewards/ADERawReward/mean": -9.131331491470338,
"rewards/ADERawReward/std": 5.984670972824096,
"rewards/StrictFormatReward/mean": 0.5416666656732559,
"rewards/StrictFormatReward/std": 2.169144892692566,
"sampling/importance_sampling_ratio/max": 2.749341917037964,
"sampling/importance_sampling_ratio/mean": 0.3683441460132599,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 5.0134562969207765,
"sampling/sampling_logp_difference/mean": 0.026201115176081657,
"step": 310,
"step_time": 30.797605072900478
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.016145833814516664,
"completions/max_length": 1024.0,
"completions/max_terminated_length": 899.7,
"completions/mean_length": 747.6823181152344,
"completions/mean_terminated_length": 743.1548828125,
"completions/min_length": 458.0,
"completions/min_terminated_length": 458.0,
"entropy": 0.7556955059369405,
"epoch": 0.05486027773015601,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.3671875,
"learning_rate": 1e-06,
"loss": 0.006,
"num_tokens": 143312285.0,
"reward": -9.226322269439697,
"reward_std": 6.171311950683593,
"rewards/ADERawReward/mean": -9.265593338012696,
"rewards/ADERawReward/std": 6.160300636291504,
"rewards/StrictFormatReward/mean": 0.39270834550261496,
"rewards/StrictFormatReward/std": 2.4976075172424315,
"sampling/importance_sampling_ratio/max": 2.7495847940444946,
"sampling/importance_sampling_ratio/mean": 0.384751632809639,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 4.004838466644287,
"sampling/sampling_logp_difference/mean": 0.026612747460603714,
"step": 320,
"step_time": 31.281772803403147
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.007812500093132257,
"completions/max_length": 1001.5,
"completions/max_terminated_length": 901.0,
"completions/mean_length": 745.114599609375,
"completions/mean_terminated_length": 742.9226379394531,
"completions/min_length": 415.5,
"completions/min_terminated_length": 415.5,
"entropy": 0.7487649619579315,
"epoch": 0.05657466140922338,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.2099609375,
"learning_rate": 1e-06,
"loss": 0.0105,
"num_tokens": 147791241.0,
"reward": -8.852140951156617,
"reward_std": 6.015402030944824,
"rewards/ADERawReward/mean": -8.904588794708252,
"rewards/ADERawReward/std": 6.00461745262146,
"rewards/StrictFormatReward/mean": 0.5244791731238365,
"rewards/StrictFormatReward/std": 2.198140048980713,
"sampling/importance_sampling_ratio/max": 2.8983518362045286,
"sampling/importance_sampling_ratio/mean": 0.38104148209095,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 4.405174565315247,
"sampling/sampling_logp_difference/mean": 0.026203346997499467,
"step": 330,
"step_time": 30.943252457803464
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.013541666883975267,
"completions/max_length": 1024.0,
"completions/max_terminated_length": 910.4,
"completions/mean_length": 746.2218994140625,
"completions/mean_terminated_length": 742.4164428710938,
"completions/min_length": 451.8,
"completions/min_terminated_length": 451.8,
"entropy": 0.7482379992802938,
"epoch": 0.05828904508829076,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.2490234375,
"learning_rate": 1e-06,
"loss": -0.0044,
"num_tokens": 152271555.0,
"reward": -9.297206592559814,
"reward_std": 5.522917318344116,
"rewards/ADERawReward/mean": -9.338768911361694,
"rewards/ADERawReward/std": 5.507352113723755,
"rewards/StrictFormatReward/mean": 0.4156250014901161,
"rewards/StrictFormatReward/std": 2.437586045265198,
"sampling/importance_sampling_ratio/max": 2.782149076461792,
"sampling/importance_sampling_ratio/mean": 0.37727462947368623,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 4.424329519271851,
"sampling/sampling_logp_difference/mean": 0.0264679953455925,
"step": 340,
"step_time": 31.394037494297663
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.012500000232830644,
"completions/max_length": 1024.0,
"completions/max_terminated_length": 929.2,
"completions/mean_length": 747.1989868164062,
"completions/mean_terminated_length": 743.6892578125,
"completions/min_length": 461.7,
"completions/min_terminated_length": 461.7,
"entropy": 0.7526971201101939,
"epoch": 0.06000342876735813,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.373046875,
"learning_rate": 1e-06,
"loss": -0.0279,
"num_tokens": 156753761.0,
"reward": -8.87293882369995,
"reward_std": 5.407143402099609,
"rewards/ADERawReward/mean": -8.912782192230225,
"rewards/ADERawReward/std": 5.394438552856445,
"rewards/StrictFormatReward/mean": 0.3984375029802322,
"rewards/StrictFormatReward/std": 2.4990755558013915,
"sampling/importance_sampling_ratio/max": 2.7867547035217286,
"sampling/importance_sampling_ratio/mean": 0.3730757534503937,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 4.122068333625793,
"sampling/sampling_logp_difference/mean": 0.02654006313532591,
"step": 350,
"step_time": 31.515721038298217
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.012500000325962901,
"completions/max_length": 1007.6,
"completions/max_terminated_length": 894.1,
"completions/mean_length": 747.2932495117187,
"completions/mean_terminated_length": 743.7852661132813,
"completions/min_length": 591.8,
"completions/min_terminated_length": 591.8,
"entropy": 0.7486960550149282,
"epoch": 0.06171781244642551,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.2373046875,
"learning_rate": 1e-06,
"loss": -0.0224,
"num_tokens": 161235908.0,
"reward": -9.346630477905274,
"reward_std": 5.943923234939575,
"rewards/ADERawReward/mean": -9.397359657287598,
"rewards/ADERawReward/std": 5.933114051818848,
"rewards/StrictFormatReward/mean": 0.5072916746139526,
"rewards/StrictFormatReward/std": 2.258626651763916,
"sampling/importance_sampling_ratio/max": 2.783932328224182,
"sampling/importance_sampling_ratio/mean": 0.3899604082107544,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 4.8976672172546385,
"sampling/sampling_logp_difference/mean": 0.02663377095013857,
"step": 360,
"step_time": 31.072972166599357
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.010937500232830643,
"completions/max_length": 1024.0,
"completions/max_terminated_length": 907.6,
"completions/mean_length": 743.9771057128906,
"completions/mean_terminated_length": 740.8768371582031,
"completions/min_length": 373.6,
"completions/min_terminated_length": 373.6,
"entropy": 0.7463085273901622,
"epoch": 0.06343219612549289,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.43359375,
"learning_rate": 1e-06,
"loss": -0.0195,
"num_tokens": 165711768.0,
"reward": -9.58823890686035,
"reward_std": 7.584380769729615,
"rewards/ADERawReward/mean": -9.62750973701477,
"rewards/ADERawReward/std": 7.566325521469116,
"rewards/StrictFormatReward/mean": 0.3927083402872086,
"rewards/StrictFormatReward/std": 2.502521538734436,
"sampling/importance_sampling_ratio/max": 2.824156880378723,
"sampling/importance_sampling_ratio/mean": 0.38374905586242675,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 4.386010980606079,
"sampling/sampling_logp_difference/mean": 0.02665374930948019,
"step": 370,
"step_time": 31.397882584496983
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.009895833674818277,
"completions/max_length": 1000.8,
"completions/max_terminated_length": 913.6,
"completions/mean_length": 746.1416870117188,
"completions/mean_terminated_length": 743.3394592285156,
"completions/min_length": 440.4,
"completions/min_terminated_length": 440.4,
"entropy": 0.7471979439258576,
"epoch": 0.06514657980456026,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.33203125,
"learning_rate": 1e-06,
"loss": 0.0032,
"num_tokens": 170192072.0,
"reward": -9.293725442886352,
"reward_std": 5.743723678588867,
"rewards/ADERawReward/mean": -9.341016817092896,
"rewards/ADERawReward/std": 5.729398393630982,
"rewards/StrictFormatReward/mean": 0.4729166701436043,
"rewards/StrictFormatReward/std": 2.320163094997406,
"sampling/importance_sampling_ratio/max": 2.869658589363098,
"sampling/importance_sampling_ratio/mean": 0.3773952782154083,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 4.138100743293762,
"sampling/sampling_logp_difference/mean": 0.026742291823029517,
"step": 380,
"step_time": 30.96988229679846
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.012500000325962901,
"completions/max_length": 1016.4,
"completions/max_terminated_length": 914.7,
"completions/mean_length": 748.4885559082031,
"completions/mean_terminated_length": 744.993017578125,
"completions/min_length": 477.5,
"completions/min_terminated_length": 477.5,
"entropy": 0.7559548437595367,
"epoch": 0.06686096348362763,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.24609375,
"learning_rate": 1e-06,
"loss": -0.0181,
"num_tokens": 174677202.0,
"reward": -9.28088207244873,
"reward_std": 5.592328834533691,
"rewards/ADERawReward/mean": -9.324163150787353,
"rewards/ADERawReward/std": 5.577477645874024,
"rewards/StrictFormatReward/mean": 0.43281249795109034,
"rewards/StrictFormatReward/std": 2.4112179040908814,
"sampling/importance_sampling_ratio/max": 2.825889301300049,
"sampling/importance_sampling_ratio/mean": 0.3906464457511902,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 3.902947115898132,
"sampling/sampling_logp_difference/mean": 0.026762423664331438,
"step": 390,
"step_time": 31.20840191419411
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.007812500186264514,
"completions/max_length": 992.4,
"completions/max_terminated_length": 934.3,
"completions/mean_length": 745.6031433105469,
"completions/mean_terminated_length": 743.4128295898438,
"completions/min_length": 437.2,
"completions/min_terminated_length": 437.2,
"entropy": 0.7531329174836476,
"epoch": 0.068575347162695,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.376953125,
"learning_rate": 1e-06,
"loss": 0.008,
"num_tokens": 179156696.0,
"reward": -9.488264322280884,
"reward_std": 5.980498790740967,
"rewards/ADERawReward/mean": -9.5326913356781,
"rewards/ADERawReward/std": 5.96862416267395,
"rewards/StrictFormatReward/mean": 0.44427084028720853,
"rewards/StrictFormatReward/std": 2.4021025896072388,
"sampling/importance_sampling_ratio/max": 2.796902632713318,
"sampling/importance_sampling_ratio/mean": 0.38265294432640073,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 4.037104201316834,
"sampling/sampling_logp_difference/mean": 0.026842619478702544,
"step": 400,
"step_time": 30.975298491402647
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.008854166977107525,
"completions/max_length": 1020.0,
"completions/max_terminated_length": 935.6,
"completions/mean_length": 746.836474609375,
"completions/mean_terminated_length": 744.3670227050782,
"completions/min_length": 503.7,
"completions/min_terminated_length": 503.7,
"entropy": 0.7571933090686798,
"epoch": 0.07028973084176239,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.30859375,
"learning_rate": 1e-06,
"loss": -0.0061,
"num_tokens": 183639214.0,
"reward": -8.974328899383545,
"reward_std": 6.1143780708312985,
"rewards/ADERawReward/mean": -9.019901895523072,
"rewards/ADERawReward/std": 6.102041292190552,
"rewards/StrictFormatReward/mean": 0.45572916865348817,
"rewards/StrictFormatReward/std": 2.3774968147277833,
"sampling/importance_sampling_ratio/max": 2.655148983001709,
"sampling/importance_sampling_ratio/mean": 0.36126451194286346,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 3.974502754211426,
"sampling/sampling_logp_difference/mean": 0.027187051996588708,
"step": 410,
"step_time": 31.259745221505
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.010416666930541395,
"completions/max_length": 1017.3,
"completions/max_terminated_length": 918.2,
"completions/mean_length": 747.1531433105469,
"completions/mean_terminated_length": 744.2319213867188,
"completions/min_length": 565.9,
"completions/min_terminated_length": 565.9,
"entropy": 0.7566624025503794,
"epoch": 0.07200411452082976,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.3359375,
"learning_rate": 1e-06,
"loss": -0.0087,
"num_tokens": 188121876.0,
"reward": -8.923426342010497,
"reward_std": 6.084955930709839,
"rewards/ADERawReward/mean": -8.9724365234375,
"rewards/ADERawReward/std": 6.07550630569458,
"rewards/StrictFormatReward/mean": 0.4901041708886623,
"rewards/StrictFormatReward/std": 2.2838862776756286,
"sampling/importance_sampling_ratio/max": 2.812648606300354,
"sampling/importance_sampling_ratio/mean": 0.3750308662652969,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 4.534145450592041,
"sampling/sampling_logp_difference/mean": 0.026898518949747086,
"step": 420,
"step_time": 31.253269567198004
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.010416666837409138,
"completions/max_length": 1000.9,
"completions/max_terminated_length": 899.6,
"completions/mean_length": 744.9640930175781,
"completions/mean_terminated_length": 742.0363525390625,
"completions/min_length": 414.5,
"completions/min_terminated_length": 414.5,
"entropy": 0.7489292581876119,
"epoch": 0.07371849819989713,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.58203125,
"learning_rate": 1e-06,
"loss": -0.0073,
"num_tokens": 192599695.0,
"reward": -9.504335880279541,
"reward_std": 5.772246503829956,
"rewards/ADERawReward/mean": -9.548762798309326,
"rewards/ADERawReward/std": 5.758863925933838,
"rewards/StrictFormatReward/mean": 0.4442708283662796,
"rewards/StrictFormatReward/std": 2.37346408367157,
"sampling/importance_sampling_ratio/max": 2.820726418495178,
"sampling/importance_sampling_ratio/mean": 0.36917952001094817,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 4.74177827835083,
"sampling/sampling_logp_difference/mean": 0.026925336755812167,
"step": 430,
"step_time": 31.05574051309668
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.013541666977107524,
"completions/max_length": 1003.0,
"completions/max_terminated_length": 914.5,
"completions/mean_length": 746.8135620117188,
"completions/mean_terminated_length": 743.0150817871094,
"completions/min_length": 349.1,
"completions/min_terminated_length": 349.1,
"entropy": 0.7525838236014049,
"epoch": 0.0754328818789645,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.3359375,
"learning_rate": 1e-06,
"loss": -0.0216,
"num_tokens": 197081145.0,
"reward": -8.942778491973877,
"reward_std": 5.134439134597779,
"rewards/ADERawReward/mean": -8.988924121856689,
"rewards/ADERawReward/std": 5.119956731796265,
"rewards/StrictFormatReward/mean": 0.4614583387970924,
"rewards/StrictFormatReward/std": 2.3441874384880066,
"sampling/importance_sampling_ratio/max": 2.7575156688690186,
"sampling/importance_sampling_ratio/mean": 0.3831501841545105,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 6.117226243019104,
"sampling/sampling_logp_difference/mean": 0.02673897407948971,
"step": 440,
"step_time": 31.136345591007558
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00989583358168602,
"completions/max_length": 1024.0,
"completions/max_terminated_length": 962.1,
"completions/mean_length": 746.3265747070312,
"completions/mean_terminated_length": 743.5626037597656,
"completions/min_length": 502.5,
"completions/min_terminated_length": 502.5,
"entropy": 0.7494863112767537,
"epoch": 0.07714726555803189,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.396484375,
"learning_rate": 1e-06,
"loss": 0.0015,
"num_tokens": 201562060.0,
"reward": -8.833820581436157,
"reward_std": 6.585151100158692,
"rewards/ADERawReward/mean": -8.883976650238036,
"rewards/ADERawReward/std": 6.576332855224609,
"rewards/StrictFormatReward/mean": 0.5015625029802322,
"rewards/StrictFormatReward/std": 2.246139848232269,
"sampling/importance_sampling_ratio/max": 2.858155703544617,
"sampling/importance_sampling_ratio/mean": 0.38457230627536776,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 4.555598568916321,
"sampling/sampling_logp_difference/mean": 0.026770622283220292,
"step": 450,
"step_time": 31.314683586404136
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.010937500325962902,
"completions/max_length": 1012.7,
"completions/max_terminated_length": 908.7,
"completions/mean_length": 747.1239807128907,
"completions/mean_terminated_length": 744.033935546875,
"completions/min_length": 452.2,
"completions/min_terminated_length": 452.2,
"entropy": 0.7497527619202932,
"epoch": 0.07886164923709926,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.2392578125,
"learning_rate": 1e-06,
"loss": -0.0312,
"num_tokens": 206044186.0,
"reward": -8.526823663711548,
"reward_std": 4.98410964012146,
"rewards/ADERawReward/mean": -8.568385744094849,
"rewards/ADERawReward/std": 4.961003923416138,
"rewards/StrictFormatReward/mean": 0.415625,
"rewards/StrictFormatReward/std": 2.4348717212677,
"sampling/importance_sampling_ratio/max": 2.7420751094818114,
"sampling/importance_sampling_ratio/mean": 0.3720589101314545,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 4.188339471817017,
"sampling/sampling_logp_difference/mean": 0.026370251737535,
"step": 460,
"step_time": 31.211517236700455
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.010416666837409138,
"completions/max_length": 1013.2,
"completions/max_terminated_length": 913.8,
"completions/mean_length": 747.784912109375,
"completions/mean_terminated_length": 744.8851257324219,
"completions/min_length": 416.8,
"completions/min_terminated_length": 416.8,
"entropy": 0.7524498959382375,
"epoch": 0.08057603291616663,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.349609375,
"learning_rate": 1e-06,
"loss": -0.0088,
"num_tokens": 210527581.0,
"reward": -9.329790830612183,
"reward_std": 5.533093023300171,
"rewards/ADERawReward/mean": -9.378228092193604,
"rewards/ADERawReward/std": 5.5270590543746945,
"rewards/StrictFormatReward/mean": 0.4843750029802322,
"rewards/StrictFormatReward/std": 2.30391206741333,
"sampling/importance_sampling_ratio/max": 2.7844461679458616,
"sampling/importance_sampling_ratio/mean": 0.36996753215789796,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 4.251289129257202,
"sampling/sampling_logp_difference/mean": 0.026920023374259472,
"step": 470,
"step_time": 31.291990611601797
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.008333333535119891,
"completions/max_length": 1006.7,
"completions/max_terminated_length": 924.5,
"completions/mean_length": 744.3693054199218,
"completions/mean_terminated_length": 742.0288696289062,
"completions/min_length": 345.2,
"completions/min_terminated_length": 345.2,
"entropy": 0.7420653978983561,
"epoch": 0.08229041659523402,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.25390625,
"learning_rate": 1e-06,
"loss": -0.0135,
"num_tokens": 215005026.0,
"reward": -8.811880826950073,
"reward_std": 5.741937494277954,
"rewards/ADERawReward/mean": -8.863182735443115,
"rewards/ADERawReward/std": 5.721714019775391,
"rewards/StrictFormatReward/mean": 0.5130208387970925,
"rewards/StrictFormatReward/std": 2.2228100419044496,
"sampling/importance_sampling_ratio/max": 2.887178134918213,
"sampling/importance_sampling_ratio/mean": 0.3799948424100876,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 4.423047304153442,
"sampling/sampling_logp_difference/mean": 0.026753966324031354,
"step": 480,
"step_time": 31.235763939598108
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.007812500232830644,
"completions/max_length": 1007.4,
"completions/max_terminated_length": 921.0,
"completions/mean_length": 745.5041809082031,
"completions/mean_terminated_length": 743.31552734375,
"completions/min_length": 503.1,
"completions/min_terminated_length": 503.1,
"entropy": 0.7548379202683767,
"epoch": 0.08400480027430139,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.302734375,
"learning_rate": 1e-06,
"loss": 0.0051,
"num_tokens": 219484362.0,
"reward": -8.726992988586426,
"reward_std": 5.2785539627075195,
"rewards/ADERawReward/mean": -8.783451509475707,
"rewards/ADERawReward/std": 5.263514995574951,
"rewards/StrictFormatReward/mean": 0.5645833373069763,
"rewards/StrictFormatReward/std": 2.121866428852081,
"sampling/importance_sampling_ratio/max": 2.8357564210891724,
"sampling/importance_sampling_ratio/mean": 0.36869050562381744,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 4.648310852050781,
"sampling/sampling_logp_difference/mean": 0.026755044236779213,
"step": 490,
"step_time": 31.2714247025011
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.010416666930541395,
"completions/max_length": 1011.9,
"completions/max_terminated_length": 899.3,
"completions/mean_length": 745.266162109375,
"completions/mean_terminated_length": 742.3389892578125,
"completions/min_length": 504.1,
"completions/min_terminated_length": 504.1,
"entropy": 0.7439370552698771,
"epoch": 0.08571918395336876,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.337890625,
"learning_rate": 1e-06,
"loss": -0.0111,
"num_tokens": 223962569.0,
"reward": -8.893217849731446,
"reward_std": 6.001188564300537,
"rewards/ADERawReward/mean": -8.93993616104126,
"rewards/ADERawReward/std": 5.9909703731536865,
"rewards/StrictFormatReward/mean": 0.46718750298023226,
"rewards/StrictFormatReward/std": 2.34447557926178,
"sampling/importance_sampling_ratio/max": 2.847609305381775,
"sampling/importance_sampling_ratio/mean": 0.4031797587871552,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 4.063726282119751,
"sampling/sampling_logp_difference/mean": 0.02628465835005045,
"step": 500,
"step_time": 31.151053833100015
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.008854166837409139,
"completions/max_length": 1009.8,
"completions/max_terminated_length": 904.5,
"completions/mean_length": 745.569287109375,
"completions/mean_terminated_length": 743.0883544921875,
"completions/min_length": 499.8,
"completions/min_terminated_length": 499.8,
"entropy": 0.7393844624360403,
"epoch": 0.08743356763243613,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.345703125,
"learning_rate": 1e-06,
"loss": -0.003,
"num_tokens": 228442430.0,
"reward": -8.746051263809203,
"reward_std": 5.200268793106079,
"rewards/ADERawReward/mean": -8.796780586242676,
"rewards/ADERawReward/std": 5.189048337936401,
"rewards/StrictFormatReward/mean": 0.5072916716337204,
"rewards/StrictFormatReward/std": 2.2304036617279053,
"sampling/importance_sampling_ratio/max": 2.8343311309814454,
"sampling/importance_sampling_ratio/mean": 0.3753476530313492,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 3.841508960723877,
"sampling/sampling_logp_difference/mean": 0.026619693823158742,
"step": 510,
"step_time": 31.27263153729873
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.007291666837409139,
"completions/max_length": 987.4,
"completions/max_terminated_length": 894.7,
"completions/mean_length": 743.181787109375,
"completions/mean_terminated_length": 741.12255859375,
"completions/min_length": 500.0,
"completions/min_terminated_length": 500.0,
"entropy": 0.7444122533003489,
"epoch": 0.08914795131150352,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.361328125,
"learning_rate": 1e-06,
"loss": 0.0089,
"num_tokens": 232916635.0,
"reward": -8.511340045928955,
"reward_std": 4.909872698783874,
"rewards/ADERawReward/mean": -8.566652154922485,
"rewards/ADERawReward/std": 4.893595862388611,
"rewards/StrictFormatReward/mean": 0.5531250022351741,
"rewards/StrictFormatReward/std": 2.0853070855140685,
"sampling/importance_sampling_ratio/max": 2.862236833572388,
"sampling/importance_sampling_ratio/mean": 0.38686116933822634,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 4.322235846519471,
"sampling/sampling_logp_difference/mean": 0.026448269747197627,
"step": 520,
"step_time": 30.78332566640311
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00677083358168602,
"completions/max_length": 991.6,
"completions/max_terminated_length": 919.4,
"completions/mean_length": 747.3963806152344,
"completions/mean_terminated_length": 745.5401245117188,
"completions/min_length": 380.7,
"completions/min_terminated_length": 380.7,
"entropy": 0.7540235598882039,
"epoch": 0.09086233499057089,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.322265625,
"learning_rate": 1e-06,
"loss": -0.0113,
"num_tokens": 237399348.0,
"reward": -8.865944004058838,
"reward_std": 5.564558506011963,
"rewards/ADERawReward/mean": -8.92240219116211,
"rewards/ADERawReward/std": 5.551964378356933,
"rewards/StrictFormatReward/mean": 0.5645833313465118,
"rewards/StrictFormatReward/std": 2.112958538532257,
"sampling/importance_sampling_ratio/max": 2.7960949182510375,
"sampling/importance_sampling_ratio/mean": 0.36649490892887115,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 3.830978512763977,
"sampling/sampling_logp_difference/mean": 0.02695600502192974,
"step": 530,
"step_time": 30.918909230499413
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.006250000186264515,
"completions/max_length": 1024.0,
"completions/max_terminated_length": 911.9,
"completions/mean_length": 744.5010681152344,
"completions/mean_terminated_length": 742.7408508300781,
"completions/min_length": 418.2,
"completions/min_terminated_length": 418.2,
"entropy": 0.7437257766723633,
"epoch": 0.09257671866963826,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.28515625,
"learning_rate": 1e-06,
"loss": -0.0034,
"num_tokens": 241877222.0,
"reward": -8.705044603347778,
"reward_std": 4.9693234920501705,
"rewards/ADERawReward/mean": -8.75405502319336,
"rewards/ADERawReward/std": 4.961550354957581,
"rewards/StrictFormatReward/mean": 0.490104167163372,
"rewards/StrictFormatReward/std": 2.2909578323364257,
"sampling/importance_sampling_ratio/max": 2.806812071800232,
"sampling/importance_sampling_ratio/mean": 0.38454858362674715,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 4.238021278381348,
"sampling/sampling_logp_difference/mean": 0.026717523485422133,
"step": 540,
"step_time": 31.30855839920405
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01250000037252903,
"completions/max_length": 1024.0,
"completions/max_terminated_length": 931.3,
"completions/mean_length": 749.8541870117188,
"completions/mean_terminated_length": 746.3853088378906,
"completions/min_length": 535.8,
"completions/min_terminated_length": 535.8,
"entropy": 0.7400518854459127,
"epoch": 0.09429110234870564,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.4609375,
"learning_rate": 1e-06,
"loss": 0.0011,
"num_tokens": 246365646.0,
"reward": -8.35037841796875,
"reward_std": 5.094221115112305,
"rewards/ADERawReward/mean": -8.400534439086915,
"rewards/ADERawReward/std": 5.078604912757873,
"rewards/StrictFormatReward/mean": 0.5015625029802322,
"rewards/StrictFormatReward/std": 2.2548654079437256,
"sampling/importance_sampling_ratio/max": 2.729209136962891,
"sampling/importance_sampling_ratio/mean": 0.3943050533533096,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 4.494288897514343,
"sampling/sampling_logp_difference/mean": 0.026609367132186888,
"step": 550,
"step_time": 31.5098136007
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.011979166930541397,
"completions/max_length": 1011.1,
"completions/max_terminated_length": 901.4,
"completions/mean_length": 746.1073059082031,
"completions/mean_terminated_length": 742.7442687988281,
"completions/min_length": 385.7,
"completions/min_terminated_length": 385.7,
"entropy": 0.7511671622594197,
"epoch": 0.09600548602777302,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.421875,
"learning_rate": 1e-06,
"loss": -0.0039,
"num_tokens": 250846300.0,
"reward": -8.354753160476685,
"reward_std": 4.998448300361633,
"rewards/ADERawReward/mean": -8.403190279006958,
"rewards/ADERawReward/std": 4.983683443069458,
"rewards/StrictFormatReward/mean": 0.484375,
"rewards/StrictFormatReward/std": 2.294073450565338,
"sampling/importance_sampling_ratio/max": 2.7630903482437135,
"sampling/importance_sampling_ratio/mean": 0.371540492773056,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 4.54563274383545,
"sampling/sampling_logp_difference/mean": 0.026747452840209008,
"step": 560,
"step_time": 31.339443079804187
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.006770833395421505,
"completions/max_length": 1006.3,
"completions/max_terminated_length": 919.6,
"completions/mean_length": 746.6364807128906,
"completions/mean_terminated_length": 744.7229919433594,
"completions/min_length": 485.7,
"completions/min_terminated_length": 485.7,
"entropy": 0.7559505820274353,
"epoch": 0.09771986970684039,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.298828125,
"learning_rate": 1e-06,
"loss": -0.022,
"num_tokens": 255327154.0,
"reward": -9.671938514709472,
"reward_std": 6.359064817428589,
"rewards/ADERawReward/mean": -9.72152156829834,
"rewards/ADERawReward/std": 6.352941226959229,
"rewards/StrictFormatReward/mean": 0.49583333283662795,
"rewards/StrictFormatReward/std": 2.2696222186088564,
"sampling/importance_sampling_ratio/max": 2.8152915716171263,
"sampling/importance_sampling_ratio/mean": 0.37120532989501953,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 3.6147093534469605,
"sampling/sampling_logp_difference/mean": 0.027104974165558814,
"step": 570,
"step_time": 31.2074279195047
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.009375000093132257,
"completions/max_length": 989.5,
"completions/max_terminated_length": 893.7,
"completions/mean_length": 747.6479309082031,
"completions/mean_terminated_length": 745.0316833496094,
"completions/min_length": 484.2,
"completions/min_terminated_length": 484.2,
"entropy": 0.7506202618281047,
"epoch": 0.09943425338590776,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.34765625,
"learning_rate": 1e-06,
"loss": -0.0002,
"num_tokens": 259810670.0,
"reward": -8.481088829040527,
"reward_std": 5.200103354454041,
"rewards/ADERawReward/mean": -8.532390689849853,
"rewards/ADERawReward/std": 5.18415892124176,
"rewards/StrictFormatReward/mean": 0.5130208402872085,
"rewards/StrictFormatReward/std": 2.2208899974823,
"sampling/importance_sampling_ratio/max": 2.803302502632141,
"sampling/importance_sampling_ratio/mean": 0.4020031362771988,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 5.206298041343689,
"sampling/sampling_logp_difference/mean": 0.02699448149651289,
"step": 580,
"step_time": 31.03782037871715
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.007812500093132257,
"completions/max_length": 998.4,
"completions/max_terminated_length": 910.7,
"completions/mean_length": 747.5250183105469,
"completions/mean_terminated_length": 745.33720703125,
"completions/min_length": 399.1,
"completions/min_terminated_length": 399.1,
"entropy": 0.7567769944667816,
"epoch": 0.10114863706497514,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.28125,
"learning_rate": 1e-06,
"loss": -0.0121,
"num_tokens": 264294318.0,
"reward": -8.613714408874511,
"reward_std": 5.104084444046021,
"rewards/ADERawReward/mean": -8.672464275360108,
"rewards/ADERawReward/std": 5.086045837402343,
"rewards/StrictFormatReward/mean": 0.5874999970197677,
"rewards/StrictFormatReward/std": 2.0732890486717226,
"sampling/importance_sampling_ratio/max": 2.7467212677001953,
"sampling/importance_sampling_ratio/mean": 0.36461472511291504,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 4.379195928573608,
"sampling/sampling_logp_difference/mean": 0.027049308083951472,
"step": 590,
"step_time": 31.140472374291857
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0062500000931322575,
"completions/max_length": 992.1,
"completions/max_terminated_length": 916.2,
"completions/mean_length": 745.9307434082032,
"completions/mean_terminated_length": 744.1726928710938,
"completions/min_length": 395.7,
"completions/min_terminated_length": 395.7,
"entropy": 0.7494464596112569,
"epoch": 0.10286302074404252,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.283203125,
"learning_rate": 1e-06,
"loss": 0.0102,
"num_tokens": 268774665.0,
"reward": -8.773472213745118,
"reward_std": 5.6750462532043455,
"rewards/ADERawReward/mean": -8.832221984863281,
"rewards/ADERawReward/std": 5.654839897155762,
"rewards/StrictFormatReward/mean": 0.5875000059604645,
"rewards/StrictFormatReward/std": 2.0246779561042785,
"sampling/importance_sampling_ratio/max": 2.8275955438613893,
"sampling/importance_sampling_ratio/mean": 0.3867086052894592,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 4.084816575050354,
"sampling/sampling_logp_difference/mean": 0.027156472206115723,
"step": 600,
"step_time": 31.06699350780109
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0057291668374091385,
"completions/max_length": 1017.1,
"completions/max_terminated_length": 910.3,
"completions/mean_length": 747.0932495117188,
"completions/mean_terminated_length": 745.496240234375,
"completions/min_length": 485.1,
"completions/min_terminated_length": 485.1,
"entropy": 0.7526243805885315,
"epoch": 0.1045774044231099,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.396484375,
"learning_rate": 1e-06,
"loss": -0.0151,
"num_tokens": 273256972.0,
"reward": -8.731144094467163,
"reward_std": 5.71333794593811,
"rewards/ADERawReward/mean": -8.782446098327636,
"rewards/ADERawReward/std": 5.699142122268677,
"rewards/StrictFormatReward/mean": 0.5130208373069763,
"rewards/StrictFormatReward/std": 2.253889012336731,
"sampling/importance_sampling_ratio/max": 2.8028954029083253,
"sampling/importance_sampling_ratio/mean": 0.3589755445718765,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 4.905529975891113,
"sampling/sampling_logp_difference/mean": 0.02720630671828985,
"step": 610,
"step_time": 31.30127082870167
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.007291666837409139,
"completions/max_length": 986.2,
"completions/max_terminated_length": 908.3,
"completions/mean_length": 746.2526245117188,
"completions/mean_terminated_length": 744.2300537109375,
"completions/min_length": 523.4,
"completions/min_terminated_length": 523.4,
"entropy": 0.7530896683533986,
"epoch": 0.10629178810217726,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.46875,
"learning_rate": 1e-06,
"loss": -0.0115,
"num_tokens": 277737025.0,
"reward": -8.564061069488526,
"reward_std": 4.777447652816773,
"rewards/ADERawReward/mean": -8.621665000915527,
"rewards/ADERawReward/std": 4.770789432525635,
"rewards/StrictFormatReward/mean": 0.5760416656732559,
"rewards/StrictFormatReward/std": 2.0861072897911073,
"sampling/importance_sampling_ratio/max": 2.810882544517517,
"sampling/importance_sampling_ratio/mean": 0.39588436484336853,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 4.1026554822921755,
"sampling/sampling_logp_difference/mean": 0.02695068046450615,
"step": 620,
"step_time": 30.90586839320604
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.007291666837409139,
"completions/max_length": 988.7,
"completions/max_terminated_length": 898.0,
"completions/mean_length": 748.3562683105469,
"completions/mean_terminated_length": 746.3339233398438,
"completions/min_length": 629.4,
"completions/min_terminated_length": 629.4,
"entropy": 0.7435857594013214,
"epoch": 0.10800617178124464,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.25390625,
"learning_rate": 1e-06,
"loss": -0.0193,
"num_tokens": 282222253.0,
"reward": -9.023520565032959,
"reward_std": 5.119320058822632,
"rewards/ADERawReward/mean": -9.07825984954834,
"rewards/ADERawReward/std": 5.104773378372192,
"rewards/StrictFormatReward/mean": 0.5473958343267441,
"rewards/StrictFormatReward/std": 2.160345029830933,
"sampling/importance_sampling_ratio/max": 2.8361830711364746,
"sampling/importance_sampling_ratio/mean": 0.37177495658397675,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 4.693781447410584,
"sampling/sampling_logp_difference/mean": 0.027231728471815587,
"step": 630,
"step_time": 30.870413991407258
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.004687500139698386,
"completions/max_length": 971.6,
"completions/max_terminated_length": 889.4,
"completions/mean_length": 747.0187744140625,
"completions/mean_terminated_length": 745.7044677734375,
"completions/min_length": 558.4,
"completions/min_terminated_length": 558.4,
"entropy": 0.7533182700475057,
"epoch": 0.10972055546031202,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.34375,
"learning_rate": 1e-06,
"loss": 0.0057,
"num_tokens": 286704369.0,
"reward": -9.217678833007813,
"reward_std": 5.054318857192993,
"rewards/ADERawReward/mean": -9.277001667022706,
"rewards/ADERawReward/std": 5.038924503326416,
"rewards/StrictFormatReward/mean": 0.5932291626930237,
"rewards/StrictFormatReward/std": 2.0508111119270325,
"sampling/importance_sampling_ratio/max": 2.866812252998352,
"sampling/importance_sampling_ratio/mean": 0.3646788477897644,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 3.960925984382629,
"sampling/sampling_logp_difference/mean": 0.027474281564354896,
"step": 640,
"step_time": 30.600575475511143
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.003645833395421505,
"completions/max_length": 960.3,
"completions/max_terminated_length": 891.2,
"completions/mean_length": 742.1609497070312,
"completions/mean_terminated_length": 741.1292663574219,
"completions/min_length": 366.8,
"completions/min_terminated_length": 366.8,
"entropy": 0.7416558722654979,
"epoch": 0.1114349391393794,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.31640625,
"learning_rate": 1e-06,
"loss": -0.0118,
"num_tokens": 291177734.0,
"reward": -8.407093048095703,
"reward_std": 5.326395010948181,
"rewards/ADERawReward/mean": -8.467561674118041,
"rewards/ADERawReward/std": 5.315518450737,
"rewards/StrictFormatReward/mean": 0.6046874970197678,
"rewards/StrictFormatReward/std": 2.026141095161438,
"sampling/importance_sampling_ratio/max": 2.862113666534424,
"sampling/importance_sampling_ratio/mean": 0.3777022361755371,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 4.170439219474792,
"sampling/sampling_logp_difference/mean": 0.027010084502398966,
"step": 650,
"step_time": 30.520393895095914
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.010937500279396772,
"completions/max_length": 1009.0,
"completions/max_terminated_length": 908.3,
"completions/mean_length": 746.7343872070312,
"completions/mean_terminated_length": 743.6657897949219,
"completions/min_length": 455.1,
"completions/min_terminated_length": 455.1,
"entropy": 0.748453684647878,
"epoch": 0.11314932281844677,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.220703125,
"learning_rate": 1e-06,
"loss": -0.0029,
"num_tokens": 295659304.0,
"reward": -8.47295446395874,
"reward_std": 5.315748786926269,
"rewards/ADERawReward/mean": -8.525974988937378,
"rewards/ADERawReward/std": 5.30073721408844,
"rewards/StrictFormatReward/mean": 0.5302083283662796,
"rewards/StrictFormatReward/std": 2.2150806427001952,
"sampling/importance_sampling_ratio/max": 2.8308970451354982,
"sampling/importance_sampling_ratio/mean": 0.36592633128166197,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 3.81339545249939,
"sampling/sampling_logp_difference/mean": 0.027280481532216072,
"step": 660,
"step_time": 31.34898143170285
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0031250000931322573,
"completions/max_length": 961.7,
"completions/max_terminated_length": 898.4,
"completions/mean_length": 745.1661743164062,
"completions/mean_terminated_length": 744.2929931640625,
"completions/min_length": 464.4,
"completions/min_terminated_length": 464.4,
"entropy": 0.7453262567520141,
"epoch": 0.11486370649751414,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.228515625,
"learning_rate": 1e-06,
"loss": 0.0075,
"num_tokens": 300137783.0,
"reward": -8.66427845954895,
"reward_std": 4.92835762500763,
"rewards/ADERawReward/mean": -8.727611589431763,
"rewards/ADERawReward/std": 4.9187664270401,
"rewards/StrictFormatReward/mean": 0.6333333373069763,
"rewards/StrictFormatReward/std": 1.9716094613075257,
"sampling/importance_sampling_ratio/max": 2.7208317041397097,
"sampling/importance_sampling_ratio/mean": 0.3644146382808685,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 4.467776584625244,
"sampling/sampling_logp_difference/mean": 0.027055931463837623,
"step": 670,
"step_time": 30.402021049396716
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.007812500139698387,
"completions/max_length": 1009.0,
"completions/max_terminated_length": 900.7,
"completions/mean_length": 744.5047119140625,
"completions/mean_terminated_length": 742.3053039550781,
"completions/min_length": 501.0,
"completions/min_terminated_length": 501.0,
"entropy": 0.7389722168445587,
"epoch": 0.11657809017658152,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.310546875,
"learning_rate": 1e-06,
"loss": 0.0105,
"num_tokens": 304615648.0,
"reward": -8.14780044555664,
"reward_std": 5.765257406234741,
"rewards/ADERawReward/mean": -8.198529577255249,
"rewards/ADERawReward/std": 5.752817440032959,
"rewards/StrictFormatReward/mean": 0.507291667163372,
"rewards/StrictFormatReward/std": 2.2412220239639282,
"sampling/importance_sampling_ratio/max": 2.838437080383301,
"sampling/importance_sampling_ratio/mean": 0.3927165180444717,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 3.8545892953872682,
"sampling/sampling_logp_difference/mean": 0.026737906597554685,
"step": 680,
"step_time": 31.265542278197245
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.007291666837409139,
"completions/max_length": 1011.0,
"completions/max_terminated_length": 918.1,
"completions/mean_length": 745.039599609375,
"completions/mean_terminated_length": 743.0080322265625,
"completions/min_length": 483.0,
"completions/min_terminated_length": 483.0,
"entropy": 0.7438246866067251,
"epoch": 0.1182924738556489,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.326171875,
"learning_rate": 1e-06,
"loss": -0.0139,
"num_tokens": 309093692.0,
"reward": -8.50984206199646,
"reward_std": 4.543536067008972,
"rewards/ADERawReward/mean": -8.562862634658813,
"rewards/ADERawReward/std": 4.5306751251220705,
"rewards/StrictFormatReward/mean": 0.5302083313465118,
"rewards/StrictFormatReward/std": 2.1870830774307253,
"sampling/importance_sampling_ratio/max": 2.8570920705795286,
"sampling/importance_sampling_ratio/mean": 0.3624374896287918,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 4.160294127464295,
"sampling/sampling_logp_difference/mean": 0.02724489979445934,
"step": 690,
"step_time": 31.26219962689502
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.005208333441987634,
"completions/max_length": 996.0,
"completions/max_terminated_length": 924.7,
"completions/mean_length": 745.0484558105469,
"completions/mean_terminated_length": 743.5991821289062,
"completions/min_length": 539.6,
"completions/min_terminated_length": 539.6,
"entropy": 0.7445198019345601,
"epoch": 0.12000685753471627,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.390625,
"learning_rate": 1e-06,
"loss": -0.0064,
"num_tokens": 313571945.0,
"reward": -8.856493520736695,
"reward_std": 4.863119053840637,
"rewards/ADERawReward/mean": -8.914670515060426,
"rewards/ADERawReward/std": 4.857007074356079,
"rewards/StrictFormatReward/mean": 0.5817708313465119,
"rewards/StrictFormatReward/std": 2.0837946057319643,
"sampling/importance_sampling_ratio/max": 2.8482714176177977,
"sampling/importance_sampling_ratio/mean": 0.3727655470371246,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 4.315770363807678,
"sampling/sampling_logp_difference/mean": 0.02694130353629589,
"step": 700,
"step_time": 31.00740701830364
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00885416679084301,
"completions/max_length": 980.0,
"completions/max_terminated_length": 904.4,
"completions/mean_length": 745.5870056152344,
"completions/mean_terminated_length": 743.1242919921875,
"completions/min_length": 423.9,
"completions/min_terminated_length": 423.9,
"entropy": 0.7471017857392629,
"epoch": 0.12172124121378365,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.3125,
"learning_rate": 1e-06,
"loss": 0.0164,
"num_tokens": 318051584.0,
"reward": -8.626063299179076,
"reward_std": 5.039473915100098,
"rewards/ADERawReward/mean": -8.679083728790284,
"rewards/ADERawReward/std": 5.022432565689087,
"rewards/StrictFormatReward/mean": 0.5302083313465118,
"rewards/StrictFormatReward/std": 2.2059712767601014,
"sampling/importance_sampling_ratio/max": 2.6881606578826904,
"sampling/importance_sampling_ratio/mean": 0.35196583569049833,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 3.9412872314453127,
"sampling/sampling_logp_difference/mean": 0.02722361944615841,
"step": 710,
"step_time": 30.79243855099194
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.007812500093132257,
"completions/max_length": 1011.8,
"completions/max_terminated_length": 889.3,
"completions/mean_length": 745.9838806152344,
"completions/mean_terminated_length": 743.8016540527344,
"completions/min_length": 430.3,
"completions/min_terminated_length": 430.3,
"entropy": 0.737281721830368,
"epoch": 0.12343562489285102,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.333984375,
"learning_rate": 1e-06,
"loss": -0.0019,
"num_tokens": 322532273.0,
"reward": -8.676169490814209,
"reward_std": 6.3611784934997555,
"rewards/ADERawReward/mean": -8.736637830734253,
"rewards/ADERawReward/std": 6.346572685241699,
"rewards/StrictFormatReward/mean": 0.6046874940395355,
"rewards/StrictFormatReward/std": 2.027357840538025,
"sampling/importance_sampling_ratio/max": 2.7995285272598265,
"sampling/importance_sampling_ratio/mean": 0.35478300750255587,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 4.93809871673584,
"sampling/sampling_logp_difference/mean": 0.027037655375897883,
"step": 720,
"step_time": 31.32133076491009
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.006250000139698386,
"completions/max_length": 1008.3,
"completions/max_terminated_length": 902.0,
"completions/mean_length": 744.9453369140625,
"completions/mean_terminated_length": 743.1854309082031,
"completions/min_length": 398.3,
"completions/min_terminated_length": 398.3,
"entropy": 0.7388640781243642,
"epoch": 0.1251500085719184,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.388671875,
"learning_rate": 1e-06,
"loss": -0.0171,
"num_tokens": 327010056.0,
"reward": -8.387274026870728,
"reward_std": 4.668637943267822,
"rewards/ADERawReward/mean": -8.443159294128417,
"rewards/ADERawReward/std": 4.659743785858154,
"rewards/StrictFormatReward/mean": 0.5588541686534881,
"rewards/StrictFormatReward/std": 2.136683487892151,
"sampling/importance_sampling_ratio/max": 2.7782902002334593,
"sampling/importance_sampling_ratio/mean": 0.3791107714176178,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 4.789157485961914,
"sampling/sampling_logp_difference/mean": 0.02694005910307169,
"step": 730,
"step_time": 31.208155655500015
}
],
"logging_steps": 10,
"max_steps": 730,
"num_input_tokens_seen": 327010056,
"num_train_epochs": 1,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 16,
"trial_name": null,
"trial_params": null
}