{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.1251500085719184, "eval_steps": 500, "global_step": 730, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.009895833674818277, "completions/max_length": 1024.0, "completions/max_terminated_length": 921.2, "completions/mean_length": 747.1151245117187, "completions/mean_terminated_length": 744.3689392089843, "completions/min_length": 457.8, "completions/min_terminated_length": 457.8, "entropy": 0.7938701828320821, "epoch": 0.0017143836790673753, "frac_reward_zero_std": 0.0, "grad_norm": 0.294921875, "learning_rate": 1e-06, "loss": 0.0024, "num_tokens": 4482493.0, "reward": -11.88613681793213, "reward_std": 9.975114345550537, "rewards/ADERawReward/mean": -11.914521789550781, "rewards/ADERawReward/std": 9.975466585159301, "rewards/StrictFormatReward/mean": 0.28385417386889455, "rewards/StrictFormatReward/std": 2.689612889289856, "sampling/importance_sampling_ratio/max": 2.7710394859313965, "sampling/importance_sampling_ratio/mean": 0.38329996466636657, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 4.348051142692566, "sampling/sampling_logp_difference/mean": 0.02634742669761181, "step": 10, "step_time": 31.851699317700696 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.014062500465661288, "completions/max_length": 1024.0, "completions/max_terminated_length": 901.0, "completions/mean_length": 746.5093994140625, "completions/mean_terminated_length": 742.5637084960938, "completions/min_length": 319.8, "completions/min_terminated_length": 319.8, "entropy": 0.783385141690572, "epoch": 0.0034287673581347507, "frac_reward_zero_std": 0.0, "grad_norm": 0.341796875, "learning_rate": 1e-06, "loss": -0.0167, "num_tokens": 8963695.0, "reward": -10.739660549163819, "reward_std": 7.596912527084351, "rewards/ADERawReward/mean": -10.76804552078247, "rewards/ADERawReward/std": 7.586498594284057, "rewards/StrictFormatReward/mean": 0.2838541708886623, "rewards/StrictFormatReward/std": 2.69962215423584, "sampling/importance_sampling_ratio/max": 2.778630328178406, "sampling/importance_sampling_ratio/mean": 0.38422776758670807, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 4.3024064064025875, "sampling/sampling_logp_difference/mean": 0.026079285889863968, "step": 20, "step_time": 31.08811371029442 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01302083358168602, "completions/max_length": 1024.0, "completions/max_terminated_length": 916.5, "completions/mean_length": 745.9713745117188, "completions/mean_terminated_length": 742.3107482910157, "completions/min_length": 344.2, "completions/min_terminated_length": 344.2, "entropy": 0.777154250939687, "epoch": 0.005143151037202126, "frac_reward_zero_std": 0.0, "grad_norm": 0.240234375, "learning_rate": 1e-06, "loss": -0.0033, "num_tokens": 13443080.0, "reward": -10.726643276214599, "reward_std": 6.941620349884033, "rewards/ADERawReward/mean": -10.759612274169921, "rewards/ADERawReward/std": 6.939747667312622, "rewards/StrictFormatReward/mean": 0.32968750670552255, "rewards/StrictFormatReward/std": 2.6174988746643066, "sampling/importance_sampling_ratio/max": 2.820967411994934, "sampling/importance_sampling_ratio/mean": 0.3970964789390564, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 4.230282878875732, "sampling/sampling_logp_difference/mean": 0.025640238262712955, "step": 30, "step_time": 31.281337887697738 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.006250000186264515, "completions/max_length": 1006.1, "completions/max_terminated_length": 952.6, "completions/mean_length": 746.8578247070312, "completions/mean_terminated_length": 745.1074523925781, "completions/min_length": 574.6, "completions/min_terminated_length": 574.6, "entropy": 0.7786477545897166, "epoch": 0.006857534716269501, "frac_reward_zero_std": 0.0, "grad_norm": 0.353515625, "learning_rate": 1e-06, "loss": 0.0052, "num_tokens": 17925079.0, "reward": -10.400494289398193, "reward_std": 8.363166570663452, "rewards/ADERawReward/mean": -10.453514766693115, "rewards/ADERawReward/std": 8.35603289604187, "rewards/StrictFormatReward/mean": 0.5302083313465118, "rewards/StrictFormatReward/std": 2.1997122406959533, "sampling/importance_sampling_ratio/max": 2.7520124673843385, "sampling/importance_sampling_ratio/mean": 0.4182109236717224, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 4.882514882087707, "sampling/sampling_logp_difference/mean": 0.025688859820365905, "step": 40, "step_time": 30.94830545460136 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.010416666883975268, "completions/max_length": 1008.7, "completions/max_terminated_length": 908.0, "completions/mean_length": 744.6708557128907, "completions/mean_terminated_length": 741.7320251464844, "completions/min_length": 318.0, "completions/min_terminated_length": 318.0, "entropy": 0.7797636012236278, "epoch": 0.008571918395336876, "frac_reward_zero_std": 0.0, "grad_norm": 0.296875, "learning_rate": 1e-06, "loss": 0.0059, "num_tokens": 22402927.0, "reward": -11.912008953094482, "reward_std": 25.465690279006957, "rewards/ADERawReward/mean": -11.948414325714111, "rewards/ADERawReward/std": 25.452851724624633, "rewards/StrictFormatReward/mean": 0.3640625074505806, "rewards/StrictFormatReward/std": 2.5515830755233764, "sampling/importance_sampling_ratio/max": 2.7972999811172485, "sampling/importance_sampling_ratio/mean": 0.41742126941680907, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 4.1138537406921385, "sampling/sampling_logp_difference/mean": 0.025952593609690666, "step": 50, "step_time": 30.99103705089656 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00572916679084301, "completions/max_length": 982.4, "completions/max_terminated_length": 894.4, "completions/mean_length": 743.1708618164063, "completions/mean_terminated_length": 741.5553588867188, "completions/min_length": 389.5, "completions/min_terminated_length": 389.5, "entropy": 0.7705018659432729, "epoch": 0.010286302074404252, "frac_reward_zero_std": 0.0, "grad_norm": 0.294921875, "learning_rate": 1e-06, "loss": -0.001, "num_tokens": 26877415.0, "reward": -10.354321956634521, "reward_std": 7.085966682434082, "rewards/ADERawReward/mean": -10.3998948097229, "rewards/ADERawReward/std": 7.078661155700684, "rewards/StrictFormatReward/mean": 0.45572916492819787, "rewards/StrictFormatReward/std": 2.357165718078613, "sampling/importance_sampling_ratio/max": 2.8156795501708984, "sampling/importance_sampling_ratio/mean": 0.4263548344373703, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 4.213912606239319, "sampling/sampling_logp_difference/mean": 0.025748718157410623, "step": 60, "step_time": 30.629806772600567 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.007812500139698387, "completions/max_length": 997.5, "completions/max_terminated_length": 901.5, "completions/mean_length": 745.1635559082031, "completions/mean_terminated_length": 742.9772766113281, "completions/min_length": 448.0, "completions/min_terminated_length": 448.0, "entropy": 0.7808759689331055, "epoch": 0.012000685753471628, "frac_reward_zero_std": 0.0, "grad_norm": 0.416015625, "learning_rate": 1e-06, "loss": -0.0022, "num_tokens": 31355793.0, "reward": -10.355865859985352, "reward_std": 7.629749822616577, "rewards/ADERawReward/mean": -10.399147129058838, "rewards/ADERawReward/std": 7.623885774612427, "rewards/StrictFormatReward/mean": 0.43281250447034836, "rewards/StrictFormatReward/std": 2.4088792324066164, "sampling/importance_sampling_ratio/max": 2.7858126163482666, "sampling/importance_sampling_ratio/mean": 0.4104765444993973, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 4.654694700241089, "sampling/sampling_logp_difference/mean": 0.02596179526299238, "step": 70, "step_time": 30.90157012869895 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.006250000139698386, "completions/max_length": 997.0, "completions/max_terminated_length": 911.7, "completions/mean_length": 743.9010620117188, "completions/mean_terminated_length": 742.1458801269531, "completions/min_length": 506.5, "completions/min_terminated_length": 506.5, "entropy": 0.7690683722496032, "epoch": 0.013715069432539003, "frac_reward_zero_std": 0.0, "grad_norm": 0.259765625, "learning_rate": 1e-06, "loss": -0.0147, "num_tokens": 35831699.0, "reward": -9.603075504302979, "reward_std": 7.397923803329467, "rewards/ADERawReward/mean": -9.64406452178955, "rewards/ADERawReward/std": 7.394052314758301, "rewards/StrictFormatReward/mean": 0.40989583656191825, "rewards/StrictFormatReward/std": 2.446663808822632, "sampling/importance_sampling_ratio/max": 2.8509432315826415, "sampling/importance_sampling_ratio/mean": 0.42068196535110475, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 3.871529459953308, "sampling/sampling_logp_difference/mean": 0.025504560954868794, "step": 80, "step_time": 30.775000178898335 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.006770833488553762, "completions/max_length": 1018.6, "completions/max_terminated_length": 921.4, "completions/mean_length": 743.0687744140625, "completions/mean_terminated_length": 741.1454040527344, "completions/min_length": 413.2, "completions/min_terminated_length": 413.2, "entropy": 0.7730992416540782, "epoch": 0.015429453111606378, "frac_reward_zero_std": 0.0, "grad_norm": 0.330078125, "learning_rate": 1e-06, "loss": -0.0209, "num_tokens": 40306439.0, "reward": -9.999512577056885, "reward_std": 6.795107650756836, "rewards/ADERawReward/mean": -10.04222068786621, "rewards/ADERawReward/std": 6.785987043380738, "rewards/StrictFormatReward/mean": 0.4270833358168602, "rewards/StrictFormatReward/std": 2.427087187767029, "sampling/importance_sampling_ratio/max": 2.806507611274719, "sampling/importance_sampling_ratio/mean": 0.41283826231956483, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 4.893343424797058, "sampling/sampling_logp_difference/mean": 0.02597447969019413, "step": 90, "step_time": 31.03075063330034 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.006250000139698386, "completions/max_length": 989.1, "completions/max_terminated_length": 911.9, "completions/mean_length": 744.7958557128907, "completions/mean_terminated_length": 743.0538269042969, "completions/min_length": 479.0, "completions/min_terminated_length": 479.0, "entropy": 0.7617896695931753, "epoch": 0.01714383679067375, "frac_reward_zero_std": 0.0, "grad_norm": 0.26953125, "learning_rate": 1e-06, "loss": 0.0056, "num_tokens": 44785151.0, "reward": -10.150122165679932, "reward_std": 6.947455310821534, "rewards/ADERawReward/mean": -10.1985595703125, "rewards/ADERawReward/std": 6.9321370124816895, "rewards/StrictFormatReward/mean": 0.48437499925494193, "rewards/StrictFormatReward/std": 2.2780819296836854, "sampling/importance_sampling_ratio/max": 2.8471882343292236, "sampling/importance_sampling_ratio/mean": 0.4186240643262863, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 5.192489242553711, "sampling/sampling_logp_difference/mean": 0.025915385968983174, "step": 100, "step_time": 30.777787425796852 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008854166837409139, "completions/max_length": 1016.1, "completions/max_terminated_length": 927.6, "completions/mean_length": 747.430224609375, "completions/mean_terminated_length": 744.9568664550782, "completions/min_length": 542.5, "completions/min_terminated_length": 542.5, "entropy": 0.7839122792085012, "epoch": 0.018858220469741126, "frac_reward_zero_std": 0.0, "grad_norm": 0.369140625, "learning_rate": 1e-06, "loss": 0.0095, "num_tokens": 49267497.0, "reward": -10.34236192703247, "reward_std": 6.522932863235473, "rewards/ADERawReward/mean": -10.377049255371094, "rewards/ADERawReward/std": 6.512392950057984, "rewards/StrictFormatReward/mean": 0.3468750029802322, "rewards/StrictFormatReward/std": 2.5984415769577027, "sampling/importance_sampling_ratio/max": 2.919663596153259, "sampling/importance_sampling_ratio/mean": 0.42760642170906066, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 4.582067346572876, "sampling/sampling_logp_difference/mean": 0.025711694732308388, "step": 110, "step_time": 31.13533423260669 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.006250000139698386, "completions/max_length": 981.8, "completions/max_terminated_length": 905.6, "completions/mean_length": 741.3906433105469, "completions/mean_terminated_length": 739.6170837402344, "completions/min_length": 322.0, "completions/min_terminated_length": 322.0, "entropy": 0.7655617932478587, "epoch": 0.020572604148808505, "frac_reward_zero_std": 0.0, "grad_norm": 0.2255859375, "learning_rate": 1e-06, "loss": -0.0003, "num_tokens": 53739095.0, "reward": -9.396285820007325, "reward_std": 6.019436550140381, "rewards/ADERawReward/mean": -9.43211908340454, "rewards/ADERawReward/std": 5.99921350479126, "rewards/StrictFormatReward/mean": 0.35833333134651185, "rewards/StrictFormatReward/std": 2.565337634086609, "sampling/importance_sampling_ratio/max": 2.7736236810684205, "sampling/importance_sampling_ratio/mean": 0.4329004347324371, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 5.103514838218689, "sampling/sampling_logp_difference/mean": 0.02581946086138487, "step": 120, "step_time": 30.675611695800036 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.006250000139698386, "completions/max_length": 993.0, "completions/max_terminated_length": 917.3, "completions/mean_length": 743.2172119140625, "completions/mean_terminated_length": 741.4531127929688, "completions/min_length": 448.6, "completions/min_terminated_length": 448.6, "entropy": 0.7609198371569316, "epoch": 0.02228698782787588, "frac_reward_zero_std": 0.0, "grad_norm": 0.306640625, "learning_rate": 1e-06, "loss": -0.0186, "num_tokens": 58213992.0, "reward": -10.035307788848877, "reward_std": 7.6791582107543945, "rewards/ADERawReward/mean": -10.080307579040527, "rewards/ADERawReward/std": 7.661759281158448, "rewards/StrictFormatReward/mean": 0.45, "rewards/StrictFormatReward/std": 2.383046817779541, "sampling/importance_sampling_ratio/max": 2.7497065782547, "sampling/importance_sampling_ratio/mean": 0.38673609495162964, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 4.264748442173004, "sampling/sampling_logp_difference/mean": 0.02555535715073347, "step": 130, "step_time": 30.85125565490016 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008333333535119891, "completions/max_length": 995.9, "completions/max_terminated_length": 936.1, "completions/mean_length": 747.1015808105469, "completions/mean_terminated_length": 744.7781860351563, "completions/min_length": 504.1, "completions/min_terminated_length": 504.1, "entropy": 0.7680625875790914, "epoch": 0.024001371506943255, "frac_reward_zero_std": 0.0, "grad_norm": 0.283203125, "learning_rate": 1e-06, "loss": -0.014, "num_tokens": 62696459.0, "reward": -9.6462965965271, "reward_std": 6.933882856369019, "rewards/ADERawReward/mean": -9.687859153747558, "rewards/ADERawReward/std": 6.932486724853516, "rewards/StrictFormatReward/mean": 0.41562500298023225, "rewards/StrictFormatReward/std": 2.435987985134125, "sampling/importance_sampling_ratio/max": 2.7624801874160765, "sampling/importance_sampling_ratio/mean": 0.39736475646495817, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 4.469653701782226, "sampling/sampling_logp_difference/mean": 0.025803259573876858, "step": 140, "step_time": 30.751169363802184 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0067708335351198915, "completions/max_length": 996.8, "completions/max_terminated_length": 927.2, "completions/mean_length": 743.0797119140625, "completions/mean_terminated_length": 741.1633605957031, "completions/min_length": 364.4, "completions/min_terminated_length": 364.4, "entropy": 0.771180788675944, "epoch": 0.02571575518601063, "frac_reward_zero_std": 0.0, "grad_norm": 0.322265625, "learning_rate": 1e-06, "loss": -0.0135, "num_tokens": 67171268.0, "reward": -9.683474731445312, "reward_std": 6.344115209579468, "rewards/ADERawReward/mean": -9.721026802062989, "rewards/ADERawReward/std": 6.331903743743896, "rewards/StrictFormatReward/mean": 0.3755208313465118, "rewards/StrictFormatReward/std": 2.536852979660034, "sampling/importance_sampling_ratio/max": 2.6906844854354857, "sampling/importance_sampling_ratio/mean": 0.3939074516296387, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 4.591675233840943, "sampling/sampling_logp_difference/mean": 0.026031140238046646, "step": 150, "step_time": 30.759532438102177 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.009375000186264516, "completions/max_length": 1001.1, "completions/max_terminated_length": 928.5, "completions/mean_length": 743.9906616210938, "completions/mean_terminated_length": 741.3461547851563, "completions/min_length": 396.0, "completions/min_terminated_length": 396.0, "entropy": 0.772610588868459, "epoch": 0.027430138865078006, "frac_reward_zero_std": 0.0, "grad_norm": 0.291015625, "learning_rate": 1e-06, "loss": -0.006, "num_tokens": 71648274.0, "reward": -9.378812217712403, "reward_std": 6.656644868850708, "rewards/ADERawReward/mean": -9.419801712036133, "rewards/ADERawReward/std": 6.638844442367554, "rewards/StrictFormatReward/mean": 0.4098958343267441, "rewards/StrictFormatReward/std": 2.457722854614258, "sampling/importance_sampling_ratio/max": 2.8564789295196533, "sampling/importance_sampling_ratio/mean": 0.3868180692195892, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 5.245336532592773, "sampling/sampling_logp_difference/mean": 0.026049494743347168, "step": 160, "step_time": 31.00511598830053 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00572916679084301, "completions/max_length": 990.5, "completions/max_terminated_length": 943.8, "completions/mean_length": 742.833349609375, "completions/mean_terminated_length": 741.2216247558594, "completions/min_length": 507.2, "completions/min_terminated_length": 507.2, "entropy": 0.7669690032800038, "epoch": 0.02914452254414538, "frac_reward_zero_std": 0.0, "grad_norm": 0.2421875, "learning_rate": 1e-06, "loss": 0.0046, "num_tokens": 76122162.0, "reward": -9.605478668212891, "reward_std": 5.803140306472779, "rewards/ADERawReward/mean": -9.645895195007324, "rewards/ADERawReward/std": 5.7928056716918945, "rewards/StrictFormatReward/mean": 0.4041666634380817, "rewards/StrictFormatReward/std": 2.4726577758789063, "sampling/importance_sampling_ratio/max": 2.71953125, "sampling/importance_sampling_ratio/mean": 0.4068588227033615, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 4.668109703063965, "sampling/sampling_logp_difference/mean": 0.025866417028009892, "step": 170, "step_time": 30.801190392104036 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0067708335351198915, "completions/max_length": 992.8, "completions/max_terminated_length": 902.9, "completions/mean_length": 745.7635559082031, "completions/mean_terminated_length": 743.8595642089844, "completions/min_length": 539.9, "completions/min_terminated_length": 539.9, "entropy": 0.7681126793225607, "epoch": 0.030858906223212756, "frac_reward_zero_std": 0.0, "grad_norm": 0.330078125, "learning_rate": 1e-06, "loss": -0.0009, "num_tokens": 80601532.0, "reward": -10.118904399871827, "reward_std": 6.3121555805206295, "rewards/ADERawReward/mean": -10.168487930297852, "rewards/ADERawReward/std": 6.31596007347107, "rewards/StrictFormatReward/mean": 0.49583332538604735, "rewards/StrictFormatReward/std": 2.2922281503677366, "sampling/importance_sampling_ratio/max": 2.808049750328064, "sampling/importance_sampling_ratio/mean": 0.40117439031600954, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 5.4242959260940555, "sampling/sampling_logp_difference/mean": 0.026261823065578938, "step": 180, "step_time": 30.95573297930241 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.009375000186264516, "completions/max_length": 1006.3, "completions/max_terminated_length": 940.1, "completions/mean_length": 745.2932495117187, "completions/mean_terminated_length": 742.6480163574219, "completions/min_length": 493.3, "completions/min_terminated_length": 493.3, "entropy": 0.7715065141518911, "epoch": 0.03257328990228013, "frac_reward_zero_std": 0.0, "grad_norm": 0.349609375, "learning_rate": 1e-06, "loss": -0.0167, "num_tokens": 85080239.0, "reward": -9.032559633255005, "reward_std": 5.6337450504302975, "rewards/ADERawReward/mean": -9.070111560821534, "rewards/ADERawReward/std": 5.615639925003052, "rewards/StrictFormatReward/mean": 0.3755208343267441, "rewards/StrictFormatReward/std": 2.532916522026062, "sampling/importance_sampling_ratio/max": 2.906028723716736, "sampling/importance_sampling_ratio/mean": 0.43047145903110506, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 4.560648036003113, "sampling/sampling_logp_difference/mean": 0.02589159458875656, "step": 190, "step_time": 31.04905029379588 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004687500139698386, "completions/max_length": 967.5, "completions/max_terminated_length": 901.5, "completions/mean_length": 743.9588745117187, "completions/mean_terminated_length": 742.6530639648438, "completions/min_length": 482.7, "completions/min_terminated_length": 482.7, "entropy": 0.7573511083920796, "epoch": 0.0342876735813475, "frac_reward_zero_std": 0.0, "grad_norm": 0.373046875, "learning_rate": 1e-06, "loss": -0.0071, "num_tokens": 89554992.0, "reward": -9.594873905181885, "reward_std": 7.4608996391296385, "rewards/ADERawReward/mean": -9.645030212402343, "rewards/ADERawReward/std": 7.4621100425720215, "rewards/StrictFormatReward/mean": 0.5015625014901162, "rewards/StrictFormatReward/std": 2.2230212569236754, "sampling/importance_sampling_ratio/max": 2.762194800376892, "sampling/importance_sampling_ratio/mean": 0.3942416876554489, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 4.715522122383118, "sampling/sampling_logp_difference/mean": 0.02548809293657541, "step": 200, "step_time": 30.452962850301994 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008854166883975268, "completions/max_length": 1002.2, "completions/max_terminated_length": 908.9, "completions/mean_length": 749.1671997070313, "completions/mean_terminated_length": 746.7061828613281, "completions/min_length": 479.3, "completions/min_terminated_length": 479.3, "entropy": 0.7769242246945699, "epoch": 0.03600205726041488, "frac_reward_zero_std": 0.0, "grad_norm": 0.349609375, "learning_rate": 1e-06, "loss": -0.0264, "num_tokens": 94041409.0, "reward": -9.6312180519104, "reward_std": 5.488570928573608, "rewards/ADERawReward/mean": -9.677363443374634, "rewards/ADERawReward/std": 5.482590246200561, "rewards/StrictFormatReward/mean": 0.46145834028720856, "rewards/StrictFormatReward/std": 2.369260883331299, "sampling/importance_sampling_ratio/max": 2.827503228187561, "sampling/importance_sampling_ratio/mean": 0.39889043271541597, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 4.615737795829773, "sampling/sampling_logp_difference/mean": 0.02642081268131733, "step": 210, "step_time": 31.002005700898007 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.007291666883975268, "completions/max_length": 1011.2, "completions/max_terminated_length": 917.8, "completions/mean_length": 744.7901306152344, "completions/mean_terminated_length": 742.7363525390625, "completions/min_length": 326.2, "completions/min_terminated_length": 326.2, "entropy": 0.7611714641253153, "epoch": 0.03771644093948225, "frac_reward_zero_std": 0.0, "grad_norm": 0.2470703125, "learning_rate": 1e-06, "loss": -0.0072, "num_tokens": 98520158.0, "reward": -9.730458450317382, "reward_std": 6.581094980239868, "rewards/ADERawReward/mean": -9.778323078155518, "rewards/ADERawReward/std": 6.570281600952148, "rewards/StrictFormatReward/mean": 0.478645833581686, "rewards/StrictFormatReward/std": 2.319968819618225, "sampling/importance_sampling_ratio/max": 2.9045305013656617, "sampling/importance_sampling_ratio/mean": 0.4044792056083679, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 4.901871252059936, "sampling/sampling_logp_difference/mean": 0.026320008002221583, "step": 220, "step_time": 31.057802741799968 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00833333358168602, "completions/max_length": 1006.6, "completions/max_terminated_length": 907.8, "completions/mean_length": 744.116162109375, "completions/mean_terminated_length": 741.7729553222656, "completions/min_length": 364.6, "completions/min_terminated_length": 364.6, "entropy": 0.7634056508541107, "epoch": 0.03943082461854963, "frac_reward_zero_std": 0.0, "grad_norm": 0.35546875, "learning_rate": 1e-06, "loss": 0.0058, "num_tokens": 102995469.0, "reward": -9.594787311553954, "reward_std": 6.4190247535705565, "rewards/ADERawReward/mean": -9.632339668273925, "rewards/ADERawReward/std": 6.41018123626709, "rewards/StrictFormatReward/mean": 0.37552083283662796, "rewards/StrictFormatReward/std": 2.4954795360565187, "sampling/importance_sampling_ratio/max": 2.7927833795547485, "sampling/importance_sampling_ratio/mean": 0.38037638664245604, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 5.303525161743164, "sampling/sampling_logp_difference/mean": 0.026079374738037585, "step": 230, "step_time": 30.925862592204066 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625000232830644, "completions/max_length": 1024.0, "completions/max_terminated_length": 922.7, "completions/mean_length": 747.7349060058593, "completions/mean_terminated_length": 743.3495300292968, "completions/min_length": 425.3, "completions/min_terminated_length": 425.3, "entropy": 0.7614292343457539, "epoch": 0.04114520829761701, "frac_reward_zero_std": 0.0, "grad_norm": 0.3984375, "learning_rate": 1e-06, "loss": 0.0014, "num_tokens": 107478240.0, "reward": -9.159432983398437, "reward_std": 5.858185243606568, "rewards/ADERawReward/mean": -9.192401695251466, "rewards/ADERawReward/std": 5.8467125415802, "rewards/StrictFormatReward/mean": 0.3296875016763806, "rewards/StrictFormatReward/std": 2.6086880922317506, "sampling/importance_sampling_ratio/max": 2.8600486755371093, "sampling/importance_sampling_ratio/mean": 0.38983087837696073, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 5.5773406505584715, "sampling/sampling_logp_difference/mean": 0.026277855411171912, "step": 240, "step_time": 31.37704919550306 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.011458333488553763, "completions/max_length": 1012.8, "completions/max_terminated_length": 905.7, "completions/mean_length": 743.3661682128907, "completions/mean_terminated_length": 740.1103088378907, "completions/min_length": 399.4, "completions/min_terminated_length": 399.4, "entropy": 0.753006378809611, "epoch": 0.04285959197668438, "frac_reward_zero_std": 0.0, "grad_norm": 0.35546875, "learning_rate": 1e-06, "loss": -0.0194, "num_tokens": 111952815.0, "reward": -9.348959684371948, "reward_std": 5.709445333480835, "rewards/ADERawReward/mean": -9.387657499313354, "rewards/ADERawReward/std": 5.683103704452515, "rewards/StrictFormatReward/mean": 0.38697916716337205, "rewards/StrictFormatReward/std": 2.5156437873840334, "sampling/importance_sampling_ratio/max": 2.7942948818206785, "sampling/importance_sampling_ratio/mean": 0.3898850232362747, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 4.9041135787963865, "sampling/sampling_logp_difference/mean": 0.026003769040107726, "step": 250, "step_time": 31.078463642198766 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.016145833814516664, "completions/max_length": 1017.2, "completions/max_terminated_length": 912.4, "completions/mean_length": 749.7567993164063, "completions/mean_terminated_length": 745.2748840332031, "completions/min_length": 587.3, "completions/min_terminated_length": 587.3, "entropy": 0.7615062018235524, "epoch": 0.04457397565575176, "frac_reward_zero_std": 0.0, "grad_norm": 0.3984375, "learning_rate": 1e-06, "loss": -0.0051, "num_tokens": 116440044.0, "reward": -9.92240343093872, "reward_std": 5.772085857391358, "rewards/ADERawReward/mean": -9.958809566497802, "rewards/ADERawReward/std": 5.756062030792236, "rewards/StrictFormatReward/mean": 0.36406250670552254, "rewards/StrictFormatReward/std": 2.54167058467865, "sampling/importance_sampling_ratio/max": 2.7225013256072996, "sampling/importance_sampling_ratio/mean": 0.3842493683099747, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 4.781629824638367, "sampling/sampling_logp_difference/mean": 0.02624006476253271, "step": 260, "step_time": 31.103271031705663 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008333333535119891, "completions/max_length": 998.2, "completions/max_terminated_length": 911.1, "completions/mean_length": 743.7968994140625, "completions/mean_terminated_length": 741.4540466308594, "completions/min_length": 510.3, "completions/min_terminated_length": 510.3, "entropy": 0.7562108953793844, "epoch": 0.04628835933481913, "frac_reward_zero_std": 0.0, "grad_norm": 0.1953125, "learning_rate": 1e-06, "loss": -0.0247, "num_tokens": 120916390.0, "reward": -9.01723370552063, "reward_std": 5.40621428489685, "rewards/ADERawReward/mean": -9.065098094940186, "rewards/ADERawReward/std": 5.384741115570068, "rewards/StrictFormatReward/mean": 0.4786458298563957, "rewards/StrictFormatReward/std": 2.2954741954803466, "sampling/importance_sampling_ratio/max": 2.789908194541931, "sampling/importance_sampling_ratio/mean": 0.40749328434467313, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 4.034530830383301, "sampling/sampling_logp_difference/mean": 0.026305202022194864, "step": 270, "step_time": 30.820387327201026 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.009375000139698386, "completions/max_length": 1011.8, "completions/max_terminated_length": 914.4, "completions/mean_length": 745.3156433105469, "completions/mean_terminated_length": 742.68203125, "completions/min_length": 492.1, "completions/min_terminated_length": 492.1, "entropy": 0.757685911655426, "epoch": 0.04800274301388651, "frac_reward_zero_std": 0.0, "grad_norm": 0.349609375, "learning_rate": 1e-06, "loss": 0.002, "num_tokens": 125395556.0, "reward": -9.0065682888031, "reward_std": 5.3750749111175535, "rewards/ADERawReward/mean": -9.06875557899475, "rewards/ADERawReward/std": 5.365862655639648, "rewards/StrictFormatReward/mean": 0.6218749970197678, "rewards/StrictFormatReward/std": 1.9849600195884705, "sampling/importance_sampling_ratio/max": 2.791543960571289, "sampling/importance_sampling_ratio/mean": 0.36947061419487, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 5.032503914833069, "sampling/sampling_logp_difference/mean": 0.02653008736670017, "step": 280, "step_time": 31.023602066392776 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.009375000139698386, "completions/max_length": 1011.5, "completions/max_terminated_length": 906.5, "completions/mean_length": 745.1588745117188, "completions/mean_terminated_length": 742.5252624511719, "completions/min_length": 384.9, "completions/min_terminated_length": 384.9, "entropy": 0.7569859484831493, "epoch": 0.04971712669295388, "frac_reward_zero_std": 0.0, "grad_norm": 0.275390625, "learning_rate": 1e-06, "loss": -0.0025, "num_tokens": 129874581.0, "reward": -9.8903018951416, "reward_std": 5.79907751083374, "rewards/ADERawReward/mean": -9.936447715759277, "rewards/ADERawReward/std": 5.782765436172485, "rewards/StrictFormatReward/mean": 0.4614583358168602, "rewards/StrictFormatReward/std": 2.3571563005447387, "sampling/importance_sampling_ratio/max": 2.780865716934204, "sampling/importance_sampling_ratio/mean": 0.3752612203359604, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 5.010831832885742, "sampling/sampling_logp_difference/mean": 0.026486290618777275, "step": 290, "step_time": 31.036961890700333 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.010937500186264515, "completions/max_length": 1024.0, "completions/max_terminated_length": 909.3, "completions/mean_length": 743.6302307128906, "completions/mean_terminated_length": 740.5345153808594, "completions/min_length": 545.1, "completions/min_terminated_length": 545.1, "entropy": 0.7479812840620677, "epoch": 0.05143151037202126, "frac_reward_zero_std": 0.0, "grad_norm": 0.296875, "learning_rate": 1e-06, "loss": 0.0075, "num_tokens": 134349103.0, "reward": -8.726549053192139, "reward_std": 5.281997394561768, "rewards/ADERawReward/mean": -8.780142164230346, "rewards/ADERawReward/std": 5.273246812820434, "rewards/StrictFormatReward/mean": 0.5359374985098839, "rewards/StrictFormatReward/std": 2.176564943790436, "sampling/importance_sampling_ratio/max": 2.9104627132415772, "sampling/importance_sampling_ratio/mean": 0.40789560675621034, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 4.580249309539795, "sampling/sampling_logp_difference/mean": 0.025899984315037726, "step": 300, "step_time": 31.32498294780089 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.007812500186264514, "completions/max_length": 998.0, "completions/max_terminated_length": 917.8, "completions/mean_length": 745.6083557128907, "completions/mean_terminated_length": 743.4208374023438, "completions/min_length": 550.1, "completions/min_terminated_length": 550.1, "entropy": 0.7465774556001027, "epoch": 0.05314589405108863, "frac_reward_zero_std": 0.0, "grad_norm": 0.353515625, "learning_rate": 1e-06, "loss": -0.0123, "num_tokens": 138828591.0, "reward": -9.077164840698241, "reward_std": 5.992494773864746, "rewards/ADERawReward/mean": -9.131331491470338, "rewards/ADERawReward/std": 5.984670972824096, "rewards/StrictFormatReward/mean": 0.5416666656732559, "rewards/StrictFormatReward/std": 2.169144892692566, "sampling/importance_sampling_ratio/max": 2.749341917037964, "sampling/importance_sampling_ratio/mean": 0.3683441460132599, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 5.0134562969207765, "sampling/sampling_logp_difference/mean": 0.026201115176081657, "step": 310, "step_time": 30.797605072900478 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.016145833814516664, "completions/max_length": 1024.0, "completions/max_terminated_length": 899.7, "completions/mean_length": 747.6823181152344, "completions/mean_terminated_length": 743.1548828125, "completions/min_length": 458.0, "completions/min_terminated_length": 458.0, "entropy": 0.7556955059369405, "epoch": 0.05486027773015601, "frac_reward_zero_std": 0.0, "grad_norm": 0.3671875, "learning_rate": 1e-06, "loss": 0.006, "num_tokens": 143312285.0, "reward": -9.226322269439697, "reward_std": 6.171311950683593, "rewards/ADERawReward/mean": -9.265593338012696, "rewards/ADERawReward/std": 6.160300636291504, "rewards/StrictFormatReward/mean": 0.39270834550261496, "rewards/StrictFormatReward/std": 2.4976075172424315, "sampling/importance_sampling_ratio/max": 2.7495847940444946, "sampling/importance_sampling_ratio/mean": 0.384751632809639, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 4.004838466644287, "sampling/sampling_logp_difference/mean": 0.026612747460603714, "step": 320, "step_time": 31.281772803403147 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.007812500093132257, "completions/max_length": 1001.5, "completions/max_terminated_length": 901.0, "completions/mean_length": 745.114599609375, "completions/mean_terminated_length": 742.9226379394531, "completions/min_length": 415.5, "completions/min_terminated_length": 415.5, "entropy": 0.7487649619579315, "epoch": 0.05657466140922338, "frac_reward_zero_std": 0.0, "grad_norm": 0.2099609375, "learning_rate": 1e-06, "loss": 0.0105, "num_tokens": 147791241.0, "reward": -8.852140951156617, "reward_std": 6.015402030944824, "rewards/ADERawReward/mean": -8.904588794708252, "rewards/ADERawReward/std": 6.00461745262146, "rewards/StrictFormatReward/mean": 0.5244791731238365, "rewards/StrictFormatReward/std": 2.198140048980713, "sampling/importance_sampling_ratio/max": 2.8983518362045286, "sampling/importance_sampling_ratio/mean": 0.38104148209095, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 4.405174565315247, "sampling/sampling_logp_difference/mean": 0.026203346997499467, "step": 330, "step_time": 30.943252457803464 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013541666883975267, "completions/max_length": 1024.0, "completions/max_terminated_length": 910.4, "completions/mean_length": 746.2218994140625, "completions/mean_terminated_length": 742.4164428710938, "completions/min_length": 451.8, "completions/min_terminated_length": 451.8, "entropy": 0.7482379992802938, "epoch": 0.05828904508829076, "frac_reward_zero_std": 0.0, "grad_norm": 0.2490234375, "learning_rate": 1e-06, "loss": -0.0044, "num_tokens": 152271555.0, "reward": -9.297206592559814, "reward_std": 5.522917318344116, "rewards/ADERawReward/mean": -9.338768911361694, "rewards/ADERawReward/std": 5.507352113723755, "rewards/StrictFormatReward/mean": 0.4156250014901161, "rewards/StrictFormatReward/std": 2.437586045265198, "sampling/importance_sampling_ratio/max": 2.782149076461792, "sampling/importance_sampling_ratio/mean": 0.37727462947368623, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 4.424329519271851, "sampling/sampling_logp_difference/mean": 0.0264679953455925, "step": 340, "step_time": 31.394037494297663 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.012500000232830644, "completions/max_length": 1024.0, "completions/max_terminated_length": 929.2, "completions/mean_length": 747.1989868164062, "completions/mean_terminated_length": 743.6892578125, "completions/min_length": 461.7, "completions/min_terminated_length": 461.7, "entropy": 0.7526971201101939, "epoch": 0.06000342876735813, "frac_reward_zero_std": 0.0, "grad_norm": 0.373046875, "learning_rate": 1e-06, "loss": -0.0279, "num_tokens": 156753761.0, "reward": -8.87293882369995, "reward_std": 5.407143402099609, "rewards/ADERawReward/mean": -8.912782192230225, "rewards/ADERawReward/std": 5.394438552856445, "rewards/StrictFormatReward/mean": 0.3984375029802322, "rewards/StrictFormatReward/std": 2.4990755558013915, "sampling/importance_sampling_ratio/max": 2.7867547035217286, "sampling/importance_sampling_ratio/mean": 0.3730757534503937, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 4.122068333625793, "sampling/sampling_logp_difference/mean": 0.02654006313532591, "step": 350, "step_time": 31.515721038298217 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.012500000325962901, "completions/max_length": 1007.6, "completions/max_terminated_length": 894.1, "completions/mean_length": 747.2932495117187, "completions/mean_terminated_length": 743.7852661132813, "completions/min_length": 591.8, "completions/min_terminated_length": 591.8, "entropy": 0.7486960550149282, "epoch": 0.06171781244642551, "frac_reward_zero_std": 0.0, "grad_norm": 0.2373046875, "learning_rate": 1e-06, "loss": -0.0224, "num_tokens": 161235908.0, "reward": -9.346630477905274, "reward_std": 5.943923234939575, "rewards/ADERawReward/mean": -9.397359657287598, "rewards/ADERawReward/std": 5.933114051818848, "rewards/StrictFormatReward/mean": 0.5072916746139526, "rewards/StrictFormatReward/std": 2.258626651763916, "sampling/importance_sampling_ratio/max": 2.783932328224182, "sampling/importance_sampling_ratio/mean": 0.3899604082107544, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 4.8976672172546385, "sampling/sampling_logp_difference/mean": 0.02663377095013857, "step": 360, "step_time": 31.072972166599357 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.010937500232830643, "completions/max_length": 1024.0, "completions/max_terminated_length": 907.6, "completions/mean_length": 743.9771057128906, "completions/mean_terminated_length": 740.8768371582031, "completions/min_length": 373.6, "completions/min_terminated_length": 373.6, "entropy": 0.7463085273901622, "epoch": 0.06343219612549289, "frac_reward_zero_std": 0.0, "grad_norm": 0.43359375, "learning_rate": 1e-06, "loss": -0.0195, "num_tokens": 165711768.0, "reward": -9.58823890686035, "reward_std": 7.584380769729615, "rewards/ADERawReward/mean": -9.62750973701477, "rewards/ADERawReward/std": 7.566325521469116, "rewards/StrictFormatReward/mean": 0.3927083402872086, "rewards/StrictFormatReward/std": 2.502521538734436, "sampling/importance_sampling_ratio/max": 2.824156880378723, "sampling/importance_sampling_ratio/mean": 0.38374905586242675, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 4.386010980606079, "sampling/sampling_logp_difference/mean": 0.02665374930948019, "step": 370, "step_time": 31.397882584496983 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.009895833674818277, "completions/max_length": 1000.8, "completions/max_terminated_length": 913.6, "completions/mean_length": 746.1416870117188, "completions/mean_terminated_length": 743.3394592285156, "completions/min_length": 440.4, "completions/min_terminated_length": 440.4, "entropy": 0.7471979439258576, "epoch": 0.06514657980456026, "frac_reward_zero_std": 0.0, "grad_norm": 0.33203125, "learning_rate": 1e-06, "loss": 0.0032, "num_tokens": 170192072.0, "reward": -9.293725442886352, "reward_std": 5.743723678588867, "rewards/ADERawReward/mean": -9.341016817092896, "rewards/ADERawReward/std": 5.729398393630982, "rewards/StrictFormatReward/mean": 0.4729166701436043, "rewards/StrictFormatReward/std": 2.320163094997406, "sampling/importance_sampling_ratio/max": 2.869658589363098, "sampling/importance_sampling_ratio/mean": 0.3773952782154083, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 4.138100743293762, "sampling/sampling_logp_difference/mean": 0.026742291823029517, "step": 380, "step_time": 30.96988229679846 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.012500000325962901, "completions/max_length": 1016.4, "completions/max_terminated_length": 914.7, "completions/mean_length": 748.4885559082031, "completions/mean_terminated_length": 744.993017578125, "completions/min_length": 477.5, "completions/min_terminated_length": 477.5, "entropy": 0.7559548437595367, "epoch": 0.06686096348362763, "frac_reward_zero_std": 0.0, "grad_norm": 0.24609375, "learning_rate": 1e-06, "loss": -0.0181, "num_tokens": 174677202.0, "reward": -9.28088207244873, "reward_std": 5.592328834533691, "rewards/ADERawReward/mean": -9.324163150787353, "rewards/ADERawReward/std": 5.577477645874024, "rewards/StrictFormatReward/mean": 0.43281249795109034, "rewards/StrictFormatReward/std": 2.4112179040908814, "sampling/importance_sampling_ratio/max": 2.825889301300049, "sampling/importance_sampling_ratio/mean": 0.3906464457511902, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 3.902947115898132, "sampling/sampling_logp_difference/mean": 0.026762423664331438, "step": 390, "step_time": 31.20840191419411 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.007812500186264514, "completions/max_length": 992.4, "completions/max_terminated_length": 934.3, "completions/mean_length": 745.6031433105469, "completions/mean_terminated_length": 743.4128295898438, "completions/min_length": 437.2, "completions/min_terminated_length": 437.2, "entropy": 0.7531329174836476, "epoch": 0.068575347162695, "frac_reward_zero_std": 0.0, "grad_norm": 0.376953125, "learning_rate": 1e-06, "loss": 0.008, "num_tokens": 179156696.0, "reward": -9.488264322280884, "reward_std": 5.980498790740967, "rewards/ADERawReward/mean": -9.5326913356781, "rewards/ADERawReward/std": 5.96862416267395, "rewards/StrictFormatReward/mean": 0.44427084028720853, "rewards/StrictFormatReward/std": 2.4021025896072388, "sampling/importance_sampling_ratio/max": 2.796902632713318, "sampling/importance_sampling_ratio/mean": 0.38265294432640073, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 4.037104201316834, "sampling/sampling_logp_difference/mean": 0.026842619478702544, "step": 400, "step_time": 30.975298491402647 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008854166977107525, "completions/max_length": 1020.0, "completions/max_terminated_length": 935.6, "completions/mean_length": 746.836474609375, "completions/mean_terminated_length": 744.3670227050782, "completions/min_length": 503.7, "completions/min_terminated_length": 503.7, "entropy": 0.7571933090686798, "epoch": 0.07028973084176239, "frac_reward_zero_std": 0.0, "grad_norm": 0.30859375, "learning_rate": 1e-06, "loss": -0.0061, "num_tokens": 183639214.0, "reward": -8.974328899383545, "reward_std": 6.1143780708312985, "rewards/ADERawReward/mean": -9.019901895523072, "rewards/ADERawReward/std": 6.102041292190552, "rewards/StrictFormatReward/mean": 0.45572916865348817, "rewards/StrictFormatReward/std": 2.3774968147277833, "sampling/importance_sampling_ratio/max": 2.655148983001709, "sampling/importance_sampling_ratio/mean": 0.36126451194286346, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 3.974502754211426, "sampling/sampling_logp_difference/mean": 0.027187051996588708, "step": 410, "step_time": 31.259745221505 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.010416666930541395, "completions/max_length": 1017.3, "completions/max_terminated_length": 918.2, "completions/mean_length": 747.1531433105469, "completions/mean_terminated_length": 744.2319213867188, "completions/min_length": 565.9, "completions/min_terminated_length": 565.9, "entropy": 0.7566624025503794, "epoch": 0.07200411452082976, "frac_reward_zero_std": 0.0, "grad_norm": 0.3359375, "learning_rate": 1e-06, "loss": -0.0087, "num_tokens": 188121876.0, "reward": -8.923426342010497, "reward_std": 6.084955930709839, "rewards/ADERawReward/mean": -8.9724365234375, "rewards/ADERawReward/std": 6.07550630569458, "rewards/StrictFormatReward/mean": 0.4901041708886623, "rewards/StrictFormatReward/std": 2.2838862776756286, "sampling/importance_sampling_ratio/max": 2.812648606300354, "sampling/importance_sampling_ratio/mean": 0.3750308662652969, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 4.534145450592041, "sampling/sampling_logp_difference/mean": 0.026898518949747086, "step": 420, "step_time": 31.253269567198004 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.010416666837409138, "completions/max_length": 1000.9, "completions/max_terminated_length": 899.6, "completions/mean_length": 744.9640930175781, "completions/mean_terminated_length": 742.0363525390625, "completions/min_length": 414.5, "completions/min_terminated_length": 414.5, "entropy": 0.7489292581876119, "epoch": 0.07371849819989713, "frac_reward_zero_std": 0.0, "grad_norm": 0.58203125, "learning_rate": 1e-06, "loss": -0.0073, "num_tokens": 192599695.0, "reward": -9.504335880279541, "reward_std": 5.772246503829956, "rewards/ADERawReward/mean": -9.548762798309326, "rewards/ADERawReward/std": 5.758863925933838, "rewards/StrictFormatReward/mean": 0.4442708283662796, "rewards/StrictFormatReward/std": 2.37346408367157, "sampling/importance_sampling_ratio/max": 2.820726418495178, "sampling/importance_sampling_ratio/mean": 0.36917952001094817, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 4.74177827835083, "sampling/sampling_logp_difference/mean": 0.026925336755812167, "step": 430, "step_time": 31.05574051309668 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013541666977107524, "completions/max_length": 1003.0, "completions/max_terminated_length": 914.5, "completions/mean_length": 746.8135620117188, "completions/mean_terminated_length": 743.0150817871094, "completions/min_length": 349.1, "completions/min_terminated_length": 349.1, "entropy": 0.7525838236014049, "epoch": 0.0754328818789645, "frac_reward_zero_std": 0.0, "grad_norm": 0.3359375, "learning_rate": 1e-06, "loss": -0.0216, "num_tokens": 197081145.0, "reward": -8.942778491973877, "reward_std": 5.134439134597779, "rewards/ADERawReward/mean": -8.988924121856689, "rewards/ADERawReward/std": 5.119956731796265, "rewards/StrictFormatReward/mean": 0.4614583387970924, "rewards/StrictFormatReward/std": 2.3441874384880066, "sampling/importance_sampling_ratio/max": 2.7575156688690186, "sampling/importance_sampling_ratio/mean": 0.3831501841545105, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 6.117226243019104, "sampling/sampling_logp_difference/mean": 0.02673897407948971, "step": 440, "step_time": 31.136345591007558 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00989583358168602, "completions/max_length": 1024.0, "completions/max_terminated_length": 962.1, "completions/mean_length": 746.3265747070312, "completions/mean_terminated_length": 743.5626037597656, "completions/min_length": 502.5, "completions/min_terminated_length": 502.5, "entropy": 0.7494863112767537, "epoch": 0.07714726555803189, "frac_reward_zero_std": 0.0, "grad_norm": 0.396484375, "learning_rate": 1e-06, "loss": 0.0015, "num_tokens": 201562060.0, "reward": -8.833820581436157, "reward_std": 6.585151100158692, "rewards/ADERawReward/mean": -8.883976650238036, "rewards/ADERawReward/std": 6.576332855224609, "rewards/StrictFormatReward/mean": 0.5015625029802322, "rewards/StrictFormatReward/std": 2.246139848232269, "sampling/importance_sampling_ratio/max": 2.858155703544617, "sampling/importance_sampling_ratio/mean": 0.38457230627536776, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 4.555598568916321, "sampling/sampling_logp_difference/mean": 0.026770622283220292, "step": 450, "step_time": 31.314683586404136 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.010937500325962902, "completions/max_length": 1012.7, "completions/max_terminated_length": 908.7, "completions/mean_length": 747.1239807128907, "completions/mean_terminated_length": 744.033935546875, "completions/min_length": 452.2, "completions/min_terminated_length": 452.2, "entropy": 0.7497527619202932, "epoch": 0.07886164923709926, "frac_reward_zero_std": 0.0, "grad_norm": 0.2392578125, "learning_rate": 1e-06, "loss": -0.0312, "num_tokens": 206044186.0, "reward": -8.526823663711548, "reward_std": 4.98410964012146, "rewards/ADERawReward/mean": -8.568385744094849, "rewards/ADERawReward/std": 4.961003923416138, "rewards/StrictFormatReward/mean": 0.415625, "rewards/StrictFormatReward/std": 2.4348717212677, "sampling/importance_sampling_ratio/max": 2.7420751094818114, "sampling/importance_sampling_ratio/mean": 0.3720589101314545, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 4.188339471817017, "sampling/sampling_logp_difference/mean": 0.026370251737535, "step": 460, "step_time": 31.211517236700455 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.010416666837409138, "completions/max_length": 1013.2, "completions/max_terminated_length": 913.8, "completions/mean_length": 747.784912109375, "completions/mean_terminated_length": 744.8851257324219, "completions/min_length": 416.8, "completions/min_terminated_length": 416.8, "entropy": 0.7524498959382375, "epoch": 0.08057603291616663, "frac_reward_zero_std": 0.0, "grad_norm": 0.349609375, "learning_rate": 1e-06, "loss": -0.0088, "num_tokens": 210527581.0, "reward": -9.329790830612183, "reward_std": 5.533093023300171, "rewards/ADERawReward/mean": -9.378228092193604, "rewards/ADERawReward/std": 5.5270590543746945, "rewards/StrictFormatReward/mean": 0.4843750029802322, "rewards/StrictFormatReward/std": 2.30391206741333, "sampling/importance_sampling_ratio/max": 2.7844461679458616, "sampling/importance_sampling_ratio/mean": 0.36996753215789796, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 4.251289129257202, "sampling/sampling_logp_difference/mean": 0.026920023374259472, "step": 470, "step_time": 31.291990611601797 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008333333535119891, "completions/max_length": 1006.7, "completions/max_terminated_length": 924.5, "completions/mean_length": 744.3693054199218, "completions/mean_terminated_length": 742.0288696289062, "completions/min_length": 345.2, "completions/min_terminated_length": 345.2, "entropy": 0.7420653978983561, "epoch": 0.08229041659523402, "frac_reward_zero_std": 0.0, "grad_norm": 0.25390625, "learning_rate": 1e-06, "loss": -0.0135, "num_tokens": 215005026.0, "reward": -8.811880826950073, "reward_std": 5.741937494277954, "rewards/ADERawReward/mean": -8.863182735443115, "rewards/ADERawReward/std": 5.721714019775391, "rewards/StrictFormatReward/mean": 0.5130208387970925, "rewards/StrictFormatReward/std": 2.2228100419044496, "sampling/importance_sampling_ratio/max": 2.887178134918213, "sampling/importance_sampling_ratio/mean": 0.3799948424100876, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 4.423047304153442, "sampling/sampling_logp_difference/mean": 0.026753966324031354, "step": 480, "step_time": 31.235763939598108 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.007812500232830644, "completions/max_length": 1007.4, "completions/max_terminated_length": 921.0, "completions/mean_length": 745.5041809082031, "completions/mean_terminated_length": 743.31552734375, "completions/min_length": 503.1, "completions/min_terminated_length": 503.1, "entropy": 0.7548379202683767, "epoch": 0.08400480027430139, "frac_reward_zero_std": 0.0, "grad_norm": 0.302734375, "learning_rate": 1e-06, "loss": 0.0051, "num_tokens": 219484362.0, "reward": -8.726992988586426, "reward_std": 5.2785539627075195, "rewards/ADERawReward/mean": -8.783451509475707, "rewards/ADERawReward/std": 5.263514995574951, "rewards/StrictFormatReward/mean": 0.5645833373069763, "rewards/StrictFormatReward/std": 2.121866428852081, "sampling/importance_sampling_ratio/max": 2.8357564210891724, "sampling/importance_sampling_ratio/mean": 0.36869050562381744, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 4.648310852050781, "sampling/sampling_logp_difference/mean": 0.026755044236779213, "step": 490, "step_time": 31.2714247025011 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.010416666930541395, "completions/max_length": 1011.9, "completions/max_terminated_length": 899.3, "completions/mean_length": 745.266162109375, "completions/mean_terminated_length": 742.3389892578125, "completions/min_length": 504.1, "completions/min_terminated_length": 504.1, "entropy": 0.7439370552698771, "epoch": 0.08571918395336876, "frac_reward_zero_std": 0.0, "grad_norm": 0.337890625, "learning_rate": 1e-06, "loss": -0.0111, "num_tokens": 223962569.0, "reward": -8.893217849731446, "reward_std": 6.001188564300537, "rewards/ADERawReward/mean": -8.93993616104126, "rewards/ADERawReward/std": 5.9909703731536865, "rewards/StrictFormatReward/mean": 0.46718750298023226, "rewards/StrictFormatReward/std": 2.34447557926178, "sampling/importance_sampling_ratio/max": 2.847609305381775, "sampling/importance_sampling_ratio/mean": 0.4031797587871552, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 4.063726282119751, "sampling/sampling_logp_difference/mean": 0.02628465835005045, "step": 500, "step_time": 31.151053833100015 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008854166837409139, "completions/max_length": 1009.8, "completions/max_terminated_length": 904.5, "completions/mean_length": 745.569287109375, "completions/mean_terminated_length": 743.0883544921875, "completions/min_length": 499.8, "completions/min_terminated_length": 499.8, "entropy": 0.7393844624360403, "epoch": 0.08743356763243613, "frac_reward_zero_std": 0.0, "grad_norm": 0.345703125, "learning_rate": 1e-06, "loss": -0.003, "num_tokens": 228442430.0, "reward": -8.746051263809203, "reward_std": 5.200268793106079, "rewards/ADERawReward/mean": -8.796780586242676, "rewards/ADERawReward/std": 5.189048337936401, "rewards/StrictFormatReward/mean": 0.5072916716337204, "rewards/StrictFormatReward/std": 2.2304036617279053, "sampling/importance_sampling_ratio/max": 2.8343311309814454, "sampling/importance_sampling_ratio/mean": 0.3753476530313492, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 3.841508960723877, "sampling/sampling_logp_difference/mean": 0.026619693823158742, "step": 510, "step_time": 31.27263153729873 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.007291666837409139, "completions/max_length": 987.4, "completions/max_terminated_length": 894.7, "completions/mean_length": 743.181787109375, "completions/mean_terminated_length": 741.12255859375, "completions/min_length": 500.0, "completions/min_terminated_length": 500.0, "entropy": 0.7444122533003489, "epoch": 0.08914795131150352, "frac_reward_zero_std": 0.0, "grad_norm": 0.361328125, "learning_rate": 1e-06, "loss": 0.0089, "num_tokens": 232916635.0, "reward": -8.511340045928955, "reward_std": 4.909872698783874, "rewards/ADERawReward/mean": -8.566652154922485, "rewards/ADERawReward/std": 4.893595862388611, "rewards/StrictFormatReward/mean": 0.5531250022351741, "rewards/StrictFormatReward/std": 2.0853070855140685, "sampling/importance_sampling_ratio/max": 2.862236833572388, "sampling/importance_sampling_ratio/mean": 0.38686116933822634, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 4.322235846519471, "sampling/sampling_logp_difference/mean": 0.026448269747197627, "step": 520, "step_time": 30.78332566640311 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00677083358168602, "completions/max_length": 991.6, "completions/max_terminated_length": 919.4, "completions/mean_length": 747.3963806152344, "completions/mean_terminated_length": 745.5401245117188, "completions/min_length": 380.7, "completions/min_terminated_length": 380.7, "entropy": 0.7540235598882039, "epoch": 0.09086233499057089, "frac_reward_zero_std": 0.0, "grad_norm": 0.322265625, "learning_rate": 1e-06, "loss": -0.0113, "num_tokens": 237399348.0, "reward": -8.865944004058838, "reward_std": 5.564558506011963, "rewards/ADERawReward/mean": -8.92240219116211, "rewards/ADERawReward/std": 5.551964378356933, "rewards/StrictFormatReward/mean": 0.5645833313465118, "rewards/StrictFormatReward/std": 2.112958538532257, "sampling/importance_sampling_ratio/max": 2.7960949182510375, "sampling/importance_sampling_ratio/mean": 0.36649490892887115, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 3.830978512763977, "sampling/sampling_logp_difference/mean": 0.02695600502192974, "step": 530, "step_time": 30.918909230499413 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.006250000186264515, "completions/max_length": 1024.0, "completions/max_terminated_length": 911.9, "completions/mean_length": 744.5010681152344, "completions/mean_terminated_length": 742.7408508300781, "completions/min_length": 418.2, "completions/min_terminated_length": 418.2, "entropy": 0.7437257766723633, "epoch": 0.09257671866963826, "frac_reward_zero_std": 0.0, "grad_norm": 0.28515625, "learning_rate": 1e-06, "loss": -0.0034, "num_tokens": 241877222.0, "reward": -8.705044603347778, "reward_std": 4.9693234920501705, "rewards/ADERawReward/mean": -8.75405502319336, "rewards/ADERawReward/std": 4.961550354957581, "rewards/StrictFormatReward/mean": 0.490104167163372, "rewards/StrictFormatReward/std": 2.2909578323364257, "sampling/importance_sampling_ratio/max": 2.806812071800232, "sampling/importance_sampling_ratio/mean": 0.38454858362674715, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 4.238021278381348, "sampling/sampling_logp_difference/mean": 0.026717523485422133, "step": 540, "step_time": 31.30855839920405 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01250000037252903, "completions/max_length": 1024.0, "completions/max_terminated_length": 931.3, "completions/mean_length": 749.8541870117188, "completions/mean_terminated_length": 746.3853088378906, "completions/min_length": 535.8, "completions/min_terminated_length": 535.8, "entropy": 0.7400518854459127, "epoch": 0.09429110234870564, "frac_reward_zero_std": 0.0, "grad_norm": 0.4609375, "learning_rate": 1e-06, "loss": 0.0011, "num_tokens": 246365646.0, "reward": -8.35037841796875, "reward_std": 5.094221115112305, "rewards/ADERawReward/mean": -8.400534439086915, "rewards/ADERawReward/std": 5.078604912757873, "rewards/StrictFormatReward/mean": 0.5015625029802322, "rewards/StrictFormatReward/std": 2.2548654079437256, "sampling/importance_sampling_ratio/max": 2.729209136962891, "sampling/importance_sampling_ratio/mean": 0.3943050533533096, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 4.494288897514343, "sampling/sampling_logp_difference/mean": 0.026609367132186888, "step": 550, "step_time": 31.5098136007 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.011979166930541397, "completions/max_length": 1011.1, "completions/max_terminated_length": 901.4, "completions/mean_length": 746.1073059082031, "completions/mean_terminated_length": 742.7442687988281, "completions/min_length": 385.7, "completions/min_terminated_length": 385.7, "entropy": 0.7511671622594197, "epoch": 0.09600548602777302, "frac_reward_zero_std": 0.0, "grad_norm": 0.421875, "learning_rate": 1e-06, "loss": -0.0039, "num_tokens": 250846300.0, "reward": -8.354753160476685, "reward_std": 4.998448300361633, "rewards/ADERawReward/mean": -8.403190279006958, "rewards/ADERawReward/std": 4.983683443069458, "rewards/StrictFormatReward/mean": 0.484375, "rewards/StrictFormatReward/std": 2.294073450565338, "sampling/importance_sampling_ratio/max": 2.7630903482437135, "sampling/importance_sampling_ratio/mean": 0.371540492773056, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 4.54563274383545, "sampling/sampling_logp_difference/mean": 0.026747452840209008, "step": 560, "step_time": 31.339443079804187 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.006770833395421505, "completions/max_length": 1006.3, "completions/max_terminated_length": 919.6, "completions/mean_length": 746.6364807128906, "completions/mean_terminated_length": 744.7229919433594, "completions/min_length": 485.7, "completions/min_terminated_length": 485.7, "entropy": 0.7559505820274353, "epoch": 0.09771986970684039, "frac_reward_zero_std": 0.0, "grad_norm": 0.298828125, "learning_rate": 1e-06, "loss": -0.022, "num_tokens": 255327154.0, "reward": -9.671938514709472, "reward_std": 6.359064817428589, "rewards/ADERawReward/mean": -9.72152156829834, "rewards/ADERawReward/std": 6.352941226959229, "rewards/StrictFormatReward/mean": 0.49583333283662795, "rewards/StrictFormatReward/std": 2.2696222186088564, "sampling/importance_sampling_ratio/max": 2.8152915716171263, "sampling/importance_sampling_ratio/mean": 0.37120532989501953, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 3.6147093534469605, "sampling/sampling_logp_difference/mean": 0.027104974165558814, "step": 570, "step_time": 31.2074279195047 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.009375000093132257, "completions/max_length": 989.5, "completions/max_terminated_length": 893.7, "completions/mean_length": 747.6479309082031, "completions/mean_terminated_length": 745.0316833496094, "completions/min_length": 484.2, "completions/min_terminated_length": 484.2, "entropy": 0.7506202618281047, "epoch": 0.09943425338590776, "frac_reward_zero_std": 0.0, "grad_norm": 0.34765625, "learning_rate": 1e-06, "loss": -0.0002, "num_tokens": 259810670.0, "reward": -8.481088829040527, "reward_std": 5.200103354454041, "rewards/ADERawReward/mean": -8.532390689849853, "rewards/ADERawReward/std": 5.18415892124176, "rewards/StrictFormatReward/mean": 0.5130208402872085, "rewards/StrictFormatReward/std": 2.2208899974823, "sampling/importance_sampling_ratio/max": 2.803302502632141, "sampling/importance_sampling_ratio/mean": 0.4020031362771988, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 5.206298041343689, "sampling/sampling_logp_difference/mean": 0.02699448149651289, "step": 580, "step_time": 31.03782037871715 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.007812500093132257, "completions/max_length": 998.4, "completions/max_terminated_length": 910.7, "completions/mean_length": 747.5250183105469, "completions/mean_terminated_length": 745.33720703125, "completions/min_length": 399.1, "completions/min_terminated_length": 399.1, "entropy": 0.7567769944667816, "epoch": 0.10114863706497514, "frac_reward_zero_std": 0.0, "grad_norm": 0.28125, "learning_rate": 1e-06, "loss": -0.0121, "num_tokens": 264294318.0, "reward": -8.613714408874511, "reward_std": 5.104084444046021, "rewards/ADERawReward/mean": -8.672464275360108, "rewards/ADERawReward/std": 5.086045837402343, "rewards/StrictFormatReward/mean": 0.5874999970197677, "rewards/StrictFormatReward/std": 2.0732890486717226, "sampling/importance_sampling_ratio/max": 2.7467212677001953, "sampling/importance_sampling_ratio/mean": 0.36461472511291504, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 4.379195928573608, "sampling/sampling_logp_difference/mean": 0.027049308083951472, "step": 590, "step_time": 31.140472374291857 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0062500000931322575, "completions/max_length": 992.1, "completions/max_terminated_length": 916.2, "completions/mean_length": 745.9307434082032, "completions/mean_terminated_length": 744.1726928710938, "completions/min_length": 395.7, "completions/min_terminated_length": 395.7, "entropy": 0.7494464596112569, "epoch": 0.10286302074404252, "frac_reward_zero_std": 0.0, "grad_norm": 0.283203125, "learning_rate": 1e-06, "loss": 0.0102, "num_tokens": 268774665.0, "reward": -8.773472213745118, "reward_std": 5.6750462532043455, "rewards/ADERawReward/mean": -8.832221984863281, "rewards/ADERawReward/std": 5.654839897155762, "rewards/StrictFormatReward/mean": 0.5875000059604645, "rewards/StrictFormatReward/std": 2.0246779561042785, "sampling/importance_sampling_ratio/max": 2.8275955438613893, "sampling/importance_sampling_ratio/mean": 0.3867086052894592, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 4.084816575050354, "sampling/sampling_logp_difference/mean": 0.027156472206115723, "step": 600, "step_time": 31.06699350780109 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0057291668374091385, "completions/max_length": 1017.1, "completions/max_terminated_length": 910.3, "completions/mean_length": 747.0932495117188, "completions/mean_terminated_length": 745.496240234375, "completions/min_length": 485.1, "completions/min_terminated_length": 485.1, "entropy": 0.7526243805885315, "epoch": 0.1045774044231099, "frac_reward_zero_std": 0.0, "grad_norm": 0.396484375, "learning_rate": 1e-06, "loss": -0.0151, "num_tokens": 273256972.0, "reward": -8.731144094467163, "reward_std": 5.71333794593811, "rewards/ADERawReward/mean": -8.782446098327636, "rewards/ADERawReward/std": 5.699142122268677, "rewards/StrictFormatReward/mean": 0.5130208373069763, "rewards/StrictFormatReward/std": 2.253889012336731, "sampling/importance_sampling_ratio/max": 2.8028954029083253, "sampling/importance_sampling_ratio/mean": 0.3589755445718765, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 4.905529975891113, "sampling/sampling_logp_difference/mean": 0.02720630671828985, "step": 610, "step_time": 31.30127082870167 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.007291666837409139, "completions/max_length": 986.2, "completions/max_terminated_length": 908.3, "completions/mean_length": 746.2526245117188, "completions/mean_terminated_length": 744.2300537109375, "completions/min_length": 523.4, "completions/min_terminated_length": 523.4, "entropy": 0.7530896683533986, "epoch": 0.10629178810217726, "frac_reward_zero_std": 0.0, "grad_norm": 0.46875, "learning_rate": 1e-06, "loss": -0.0115, "num_tokens": 277737025.0, "reward": -8.564061069488526, "reward_std": 4.777447652816773, "rewards/ADERawReward/mean": -8.621665000915527, "rewards/ADERawReward/std": 4.770789432525635, "rewards/StrictFormatReward/mean": 0.5760416656732559, "rewards/StrictFormatReward/std": 2.0861072897911073, "sampling/importance_sampling_ratio/max": 2.810882544517517, "sampling/importance_sampling_ratio/mean": 0.39588436484336853, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 4.1026554822921755, "sampling/sampling_logp_difference/mean": 0.02695068046450615, "step": 620, "step_time": 30.90586839320604 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.007291666837409139, "completions/max_length": 988.7, "completions/max_terminated_length": 898.0, "completions/mean_length": 748.3562683105469, "completions/mean_terminated_length": 746.3339233398438, "completions/min_length": 629.4, "completions/min_terminated_length": 629.4, "entropy": 0.7435857594013214, "epoch": 0.10800617178124464, "frac_reward_zero_std": 0.0, "grad_norm": 0.25390625, "learning_rate": 1e-06, "loss": -0.0193, "num_tokens": 282222253.0, "reward": -9.023520565032959, "reward_std": 5.119320058822632, "rewards/ADERawReward/mean": -9.07825984954834, "rewards/ADERawReward/std": 5.104773378372192, "rewards/StrictFormatReward/mean": 0.5473958343267441, "rewards/StrictFormatReward/std": 2.160345029830933, "sampling/importance_sampling_ratio/max": 2.8361830711364746, "sampling/importance_sampling_ratio/mean": 0.37177495658397675, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 4.693781447410584, "sampling/sampling_logp_difference/mean": 0.027231728471815587, "step": 630, "step_time": 30.870413991407258 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004687500139698386, "completions/max_length": 971.6, "completions/max_terminated_length": 889.4, "completions/mean_length": 747.0187744140625, "completions/mean_terminated_length": 745.7044677734375, "completions/min_length": 558.4, "completions/min_terminated_length": 558.4, "entropy": 0.7533182700475057, "epoch": 0.10972055546031202, "frac_reward_zero_std": 0.0, "grad_norm": 0.34375, "learning_rate": 1e-06, "loss": 0.0057, "num_tokens": 286704369.0, "reward": -9.217678833007813, "reward_std": 5.054318857192993, "rewards/ADERawReward/mean": -9.277001667022706, "rewards/ADERawReward/std": 5.038924503326416, "rewards/StrictFormatReward/mean": 0.5932291626930237, "rewards/StrictFormatReward/std": 2.0508111119270325, "sampling/importance_sampling_ratio/max": 2.866812252998352, "sampling/importance_sampling_ratio/mean": 0.3646788477897644, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 3.960925984382629, "sampling/sampling_logp_difference/mean": 0.027474281564354896, "step": 640, "step_time": 30.600575475511143 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.003645833395421505, "completions/max_length": 960.3, "completions/max_terminated_length": 891.2, "completions/mean_length": 742.1609497070312, "completions/mean_terminated_length": 741.1292663574219, "completions/min_length": 366.8, "completions/min_terminated_length": 366.8, "entropy": 0.7416558722654979, "epoch": 0.1114349391393794, "frac_reward_zero_std": 0.0, "grad_norm": 0.31640625, "learning_rate": 1e-06, "loss": -0.0118, "num_tokens": 291177734.0, "reward": -8.407093048095703, "reward_std": 5.326395010948181, "rewards/ADERawReward/mean": -8.467561674118041, "rewards/ADERawReward/std": 5.315518450737, "rewards/StrictFormatReward/mean": 0.6046874970197678, "rewards/StrictFormatReward/std": 2.026141095161438, "sampling/importance_sampling_ratio/max": 2.862113666534424, "sampling/importance_sampling_ratio/mean": 0.3777022361755371, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 4.170439219474792, "sampling/sampling_logp_difference/mean": 0.027010084502398966, "step": 650, "step_time": 30.520393895095914 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.010937500279396772, "completions/max_length": 1009.0, "completions/max_terminated_length": 908.3, "completions/mean_length": 746.7343872070312, "completions/mean_terminated_length": 743.6657897949219, "completions/min_length": 455.1, "completions/min_terminated_length": 455.1, "entropy": 0.748453684647878, "epoch": 0.11314932281844677, "frac_reward_zero_std": 0.0, "grad_norm": 0.220703125, "learning_rate": 1e-06, "loss": -0.0029, "num_tokens": 295659304.0, "reward": -8.47295446395874, "reward_std": 5.315748786926269, "rewards/ADERawReward/mean": -8.525974988937378, "rewards/ADERawReward/std": 5.30073721408844, "rewards/StrictFormatReward/mean": 0.5302083283662796, "rewards/StrictFormatReward/std": 2.2150806427001952, "sampling/importance_sampling_ratio/max": 2.8308970451354982, "sampling/importance_sampling_ratio/mean": 0.36592633128166197, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 3.81339545249939, "sampling/sampling_logp_difference/mean": 0.027280481532216072, "step": 660, "step_time": 31.34898143170285 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0031250000931322573, "completions/max_length": 961.7, "completions/max_terminated_length": 898.4, "completions/mean_length": 745.1661743164062, "completions/mean_terminated_length": 744.2929931640625, "completions/min_length": 464.4, "completions/min_terminated_length": 464.4, "entropy": 0.7453262567520141, "epoch": 0.11486370649751414, "frac_reward_zero_std": 0.0, "grad_norm": 0.228515625, "learning_rate": 1e-06, "loss": 0.0075, "num_tokens": 300137783.0, "reward": -8.66427845954895, "reward_std": 4.92835762500763, "rewards/ADERawReward/mean": -8.727611589431763, "rewards/ADERawReward/std": 4.9187664270401, "rewards/StrictFormatReward/mean": 0.6333333373069763, "rewards/StrictFormatReward/std": 1.9716094613075257, "sampling/importance_sampling_ratio/max": 2.7208317041397097, "sampling/importance_sampling_ratio/mean": 0.3644146382808685, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 4.467776584625244, "sampling/sampling_logp_difference/mean": 0.027055931463837623, "step": 670, "step_time": 30.402021049396716 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.007812500139698387, "completions/max_length": 1009.0, "completions/max_terminated_length": 900.7, "completions/mean_length": 744.5047119140625, "completions/mean_terminated_length": 742.3053039550781, "completions/min_length": 501.0, "completions/min_terminated_length": 501.0, "entropy": 0.7389722168445587, "epoch": 0.11657809017658152, "frac_reward_zero_std": 0.0, "grad_norm": 0.310546875, "learning_rate": 1e-06, "loss": 0.0105, "num_tokens": 304615648.0, "reward": -8.14780044555664, "reward_std": 5.765257406234741, "rewards/ADERawReward/mean": -8.198529577255249, "rewards/ADERawReward/std": 5.752817440032959, "rewards/StrictFormatReward/mean": 0.507291667163372, "rewards/StrictFormatReward/std": 2.2412220239639282, "sampling/importance_sampling_ratio/max": 2.838437080383301, "sampling/importance_sampling_ratio/mean": 0.3927165180444717, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 3.8545892953872682, "sampling/sampling_logp_difference/mean": 0.026737906597554685, "step": 680, "step_time": 31.265542278197245 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.007291666837409139, "completions/max_length": 1011.0, "completions/max_terminated_length": 918.1, "completions/mean_length": 745.039599609375, "completions/mean_terminated_length": 743.0080322265625, "completions/min_length": 483.0, "completions/min_terminated_length": 483.0, "entropy": 0.7438246866067251, "epoch": 0.1182924738556489, "frac_reward_zero_std": 0.0, "grad_norm": 0.326171875, "learning_rate": 1e-06, "loss": -0.0139, "num_tokens": 309093692.0, "reward": -8.50984206199646, "reward_std": 4.543536067008972, "rewards/ADERawReward/mean": -8.562862634658813, "rewards/ADERawReward/std": 4.5306751251220705, "rewards/StrictFormatReward/mean": 0.5302083313465118, "rewards/StrictFormatReward/std": 2.1870830774307253, "sampling/importance_sampling_ratio/max": 2.8570920705795286, "sampling/importance_sampling_ratio/mean": 0.3624374896287918, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 4.160294127464295, "sampling/sampling_logp_difference/mean": 0.02724489979445934, "step": 690, "step_time": 31.26219962689502 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005208333441987634, "completions/max_length": 996.0, "completions/max_terminated_length": 924.7, "completions/mean_length": 745.0484558105469, "completions/mean_terminated_length": 743.5991821289062, "completions/min_length": 539.6, "completions/min_terminated_length": 539.6, "entropy": 0.7445198019345601, "epoch": 0.12000685753471627, "frac_reward_zero_std": 0.0, "grad_norm": 0.390625, "learning_rate": 1e-06, "loss": -0.0064, "num_tokens": 313571945.0, "reward": -8.856493520736695, "reward_std": 4.863119053840637, "rewards/ADERawReward/mean": -8.914670515060426, "rewards/ADERawReward/std": 4.857007074356079, "rewards/StrictFormatReward/mean": 0.5817708313465119, "rewards/StrictFormatReward/std": 2.0837946057319643, "sampling/importance_sampling_ratio/max": 2.8482714176177977, "sampling/importance_sampling_ratio/mean": 0.3727655470371246, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 4.315770363807678, "sampling/sampling_logp_difference/mean": 0.02694130353629589, "step": 700, "step_time": 31.00740701830364 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00885416679084301, "completions/max_length": 980.0, "completions/max_terminated_length": 904.4, "completions/mean_length": 745.5870056152344, "completions/mean_terminated_length": 743.1242919921875, "completions/min_length": 423.9, "completions/min_terminated_length": 423.9, "entropy": 0.7471017857392629, "epoch": 0.12172124121378365, "frac_reward_zero_std": 0.0, "grad_norm": 0.3125, "learning_rate": 1e-06, "loss": 0.0164, "num_tokens": 318051584.0, "reward": -8.626063299179076, "reward_std": 5.039473915100098, "rewards/ADERawReward/mean": -8.679083728790284, "rewards/ADERawReward/std": 5.022432565689087, "rewards/StrictFormatReward/mean": 0.5302083313465118, "rewards/StrictFormatReward/std": 2.2059712767601014, "sampling/importance_sampling_ratio/max": 2.6881606578826904, "sampling/importance_sampling_ratio/mean": 0.35196583569049833, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 3.9412872314453127, "sampling/sampling_logp_difference/mean": 0.02722361944615841, "step": 710, "step_time": 30.79243855099194 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.007812500093132257, "completions/max_length": 1011.8, "completions/max_terminated_length": 889.3, "completions/mean_length": 745.9838806152344, "completions/mean_terminated_length": 743.8016540527344, "completions/min_length": 430.3, "completions/min_terminated_length": 430.3, "entropy": 0.737281721830368, "epoch": 0.12343562489285102, "frac_reward_zero_std": 0.0, "grad_norm": 0.333984375, "learning_rate": 1e-06, "loss": -0.0019, "num_tokens": 322532273.0, "reward": -8.676169490814209, "reward_std": 6.3611784934997555, "rewards/ADERawReward/mean": -8.736637830734253, "rewards/ADERawReward/std": 6.346572685241699, "rewards/StrictFormatReward/mean": 0.6046874940395355, "rewards/StrictFormatReward/std": 2.027357840538025, "sampling/importance_sampling_ratio/max": 2.7995285272598265, "sampling/importance_sampling_ratio/mean": 0.35478300750255587, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 4.93809871673584, "sampling/sampling_logp_difference/mean": 0.027037655375897883, "step": 720, "step_time": 31.32133076491009 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.006250000139698386, "completions/max_length": 1008.3, "completions/max_terminated_length": 902.0, "completions/mean_length": 744.9453369140625, "completions/mean_terminated_length": 743.1854309082031, "completions/min_length": 398.3, "completions/min_terminated_length": 398.3, "entropy": 0.7388640781243642, "epoch": 0.1251500085719184, "frac_reward_zero_std": 0.0, "grad_norm": 0.388671875, "learning_rate": 1e-06, "loss": -0.0171, "num_tokens": 327010056.0, "reward": -8.387274026870728, "reward_std": 4.668637943267822, "rewards/ADERawReward/mean": -8.443159294128417, "rewards/ADERawReward/std": 4.659743785858154, "rewards/StrictFormatReward/mean": 0.5588541686534881, "rewards/StrictFormatReward/std": 2.136683487892151, "sampling/importance_sampling_ratio/max": 2.7782902002334593, "sampling/importance_sampling_ratio/mean": 0.3791107714176178, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 4.789157485961914, "sampling/sampling_logp_difference/mean": 0.02694005910307169, "step": 730, "step_time": 31.208155655500015 } ], "logging_steps": 10, "max_steps": 730, "num_input_tokens_seen": 327010056, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 16, "trial_name": null, "trial_params": null }