{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.8565310492505354, "eval_steps": 500, "global_step": 100, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 425.75, "completions/max_terminated_length": 407.25, "completions/mean_length": 224.9140625, "completions/mean_terminated_length": 222.6020851135254, "completions/min_length": 100.75, "completions/min_terminated_length": 100.75, "entropy": 0.3943025507032871, "epoch": 0.008565310492505354, "frac_reward_zero_std": 0.859375, "grad_norm": 2.234375, "learning_rate": 0.0, "loss": -0.0682, "num_tokens": 40707.0, "reward": 0.109375, "reward_std": 0.13258252362720668, "rewards/correctness_reward_func/mean": 0.046875, "rewards/correctness_reward_func/std": 0.1875, "rewards/int_reward_func/mean": 0.015625, "rewards/int_reward_func/std": 0.0625, "rewards/soft_format_reward_func/mean": 0.015625, "rewards/soft_format_reward_func/std": 0.05259781517088413, "rewards/strict_format_reward_func/mean": 0.0, "rewards/strict_format_reward_func/std": 0.0, "rewards/xmlcount_reward_func/mean": 0.03125, "rewards/xmlcount_reward_func/std": 0.09922334365546703, "step": 1 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 454.625, "completions/max_terminated_length": 431.875, "completions/mean_length": 240.2421875, "completions/mean_terminated_length": 236.11719131469727, "completions/min_length": 120.125, "completions/min_terminated_length": 120.125, "entropy": 0.4173264354467392, "epoch": 0.017130620985010708, "frac_reward_zero_std": 0.625, "grad_norm": 4.46875, "learning_rate": 1.6666666666666667e-06, "loss": 0.0373, "num_tokens": 84076.0, "reward": 0.2431640625, "reward_std": 0.30797815788537264, "rewards/correctness_reward_func/mean": 0.078125, "rewards/correctness_reward_func/std": 0.3125, "rewards/int_reward_func/mean": 0.0390625, "rewards/int_reward_func/std": 0.13456955552101135, "rewards/soft_format_reward_func/mean": 0.046875, "rewards/soft_format_reward_func/std": 0.12433474138379097, "rewards/strict_format_reward_func/mean": 0.0, "rewards/strict_format_reward_func/std": 0.0, "rewards/xmlcount_reward_func/mean": 0.0791015625, "rewards/xmlcount_reward_func/std": 0.1614172589033842, "step": 2 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 379.375, "completions/max_terminated_length": 372.125, "completions/mean_length": 209.1484375, "completions/mean_terminated_length": 207.07291793823242, "completions/min_length": 90.5, "completions/min_terminated_length": 90.5, "entropy": 0.39367850497365, "epoch": 0.02569593147751606, "frac_reward_zero_std": 0.796875, "grad_norm": 3.0, "learning_rate": 3.3333333333333333e-06, "loss": 0.0145, "num_tokens": 122753.0, "reward": 0.1796875, "reward_std": 0.17677669739350677, "rewards/correctness_reward_func/mean": 0.078125, "rewards/correctness_reward_func/std": 0.2257782220840454, "rewards/int_reward_func/mean": 0.0234375, "rewards/int_reward_func/std": 0.07206955552101135, "rewards/soft_format_reward_func/mean": 0.02734375, "rewards/soft_format_reward_func/std": 0.07779237069189548, "rewards/strict_format_reward_func/mean": 0.0, "rewards/strict_format_reward_func/std": 0.0, "rewards/xmlcount_reward_func/mean": 0.05078125, "rewards/xmlcount_reward_func/std": 0.11597390845417976, "step": 3 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 381.75, "completions/max_terminated_length": 381.75, "completions/mean_length": 205.140625, "completions/mean_terminated_length": 205.140625, "completions/min_length": 82.5, "completions/min_terminated_length": 82.5, "entropy": 0.42507942393422127, "epoch": 0.034261241970021415, "frac_reward_zero_std": 0.609375, "grad_norm": 4.5, "learning_rate": 5e-06, "loss": -0.1276, "num_tokens": 161051.0, "reward": 0.2958984375, "reward_std": 0.3052160106599331, "rewards/correctness_reward_func/mean": 0.078125, "rewards/correctness_reward_func/std": 0.2257782220840454, "rewards/int_reward_func/mean": 0.03125, "rewards/int_reward_func/std": 0.09341737069189548, "rewards/soft_format_reward_func/mean": 0.07421875, "rewards/soft_format_reward_func/std": 0.17638970352709293, "rewards/strict_format_reward_func/mean": 0.0, "rewards/strict_format_reward_func/std": 0.0, "rewards/xmlcount_reward_func/mean": 0.1123046875, "rewards/xmlcount_reward_func/std": 0.1925698984414339, "step": 4 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 383.25, "completions/max_terminated_length": 383.25, "completions/mean_length": 207.828125, "completions/mean_terminated_length": 207.828125, "completions/min_length": 74.5, "completions/min_terminated_length": 74.5, "entropy": 0.4761287160217762, "epoch": 0.042826552462526764, "frac_reward_zero_std": 0.34375, "grad_norm": 6.28125, "learning_rate": 6.666666666666667e-06, "loss": -0.0135, "num_tokens": 199521.0, "reward": 1.1474609375, "reward_std": 0.5952402763068676, "rewards/correctness_reward_func/mean": 0.359375, "rewards/correctness_reward_func/std": 0.7196519374847412, "rewards/int_reward_func/mean": 0.12109375, "rewards/int_reward_func/std": 0.20104984939098358, "rewards/soft_format_reward_func/mean": 0.3203125, "rewards/soft_format_reward_func/std": 0.23448428697884083, "rewards/strict_format_reward_func/mean": 0.0, "rewards/strict_format_reward_func/std": 0.0, "rewards/xmlcount_reward_func/mean": 0.3466796875, "rewards/xmlcount_reward_func/std": 0.20816493593156338, "step": 5 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 434.5, "completions/max_terminated_length": 387.375, "completions/mean_length": 231.28125, "completions/mean_terminated_length": 226.61354446411133, "completions/min_length": 118.25, "completions/min_terminated_length": 118.25, "entropy": 0.42499926686286926, "epoch": 0.05139186295503212, "frac_reward_zero_std": 0.578125, "grad_norm": 3.265625, "learning_rate": 8.333333333333334e-06, "loss": 0.1364, "num_tokens": 241131.0, "reward": 1.4375, "reward_std": 0.4833737723529339, "rewards/correctness_reward_func/mean": 0.375, "rewards/correctness_reward_func/std": 0.741176463663578, "rewards/int_reward_func/mean": 0.13671875, "rewards/int_reward_func/std": 0.2227986976504326, "rewards/soft_format_reward_func/mean": 0.44921875, "rewards/soft_format_reward_func/std": 0.1270910371094942, "rewards/strict_format_reward_func/mean": 0.0, "rewards/strict_format_reward_func/std": 0.0, "rewards/xmlcount_reward_func/mean": 0.4765625, "rewards/xmlcount_reward_func/std": 0.07140547037124634, "step": 6 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 453.0, "completions/max_terminated_length": 433.125, "completions/mean_length": 238.6796875, "completions/mean_terminated_length": 234.18125343322754, "completions/min_length": 96.375, "completions/min_terminated_length": 96.375, "entropy": 0.4198453910648823, "epoch": 0.059957173447537475, "frac_reward_zero_std": 0.421875, "grad_norm": 4.15625, "learning_rate": 1e-05, "loss": 0.1899, "num_tokens": 283686.0, "reward": 1.7509765625, "reward_std": 0.7305849269032478, "rewards/correctness_reward_func/mean": 0.609375, "rewards/correctness_reward_func/std": 0.9355916231870651, "rewards/int_reward_func/mean": 0.1953125, "rewards/int_reward_func/std": 0.2459700107574463, "rewards/soft_format_reward_func/mean": 0.4609375, "rewards/soft_format_reward_func/std": 0.1128891110420227, "rewards/strict_format_reward_func/mean": 0.00390625, "rewards/strict_format_reward_func/std": 0.015625, "rewards/xmlcount_reward_func/mean": 0.4814453125, "rewards/xmlcount_reward_func/std": 0.046708236914128065, "step": 7 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 364.375, "completions/max_terminated_length": 364.375, "completions/mean_length": 220.34375, "completions/mean_terminated_length": 220.34375, "completions/min_length": 98.875, "completions/min_terminated_length": 98.875, "entropy": 0.42923642322421074, "epoch": 0.06852248394004283, "frac_reward_zero_std": 0.46875, "grad_norm": 4.59375, "learning_rate": 1.1666666666666668e-05, "loss": -0.0768, "num_tokens": 324156.0, "reward": 2.1044921875, "reward_std": 0.6339101828634739, "rewards/correctness_reward_func/mean": 0.859375, "rewards/correctness_reward_func/std": 0.9749292060732841, "rewards/int_reward_func/mean": 0.3046875, "rewards/int_reward_func/std": 0.23351078107953072, "rewards/soft_format_reward_func/mean": 0.46484375, "rewards/soft_format_reward_func/std": 0.08251741342246532, "rewards/strict_format_reward_func/mean": 0.00390625, "rewards/strict_format_reward_func/std": 0.015625, "rewards/xmlcount_reward_func/mean": 0.4716796875, "rewards/xmlcount_reward_func/std": 0.06621513469144702, "step": 8 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 376.5, "completions/max_terminated_length": 361.25, "completions/mean_length": 216.578125, "completions/mean_terminated_length": 212.43080520629883, "completions/min_length": 96.875, "completions/min_terminated_length": 96.875, "entropy": 0.3928321301937103, "epoch": 0.07708779443254818, "frac_reward_zero_std": 0.59375, "grad_norm": 3.5, "learning_rate": 1.3333333333333333e-05, "loss": 0.0392, "num_tokens": 363864.0, "reward": 2.5830078125, "reward_std": 0.40741503052413464, "rewards/correctness_reward_func/mean": 1.203125, "rewards/correctness_reward_func/std": 0.9139788597822189, "rewards/int_reward_func/mean": 0.42578125, "rewards/int_reward_func/std": 0.1317095011472702, "rewards/soft_format_reward_func/mean": 0.46875, "rewards/soft_format_reward_func/std": 0.09341737069189548, "rewards/strict_format_reward_func/mean": 0.0078125, "rewards/strict_format_reward_func/std": 0.03125, "rewards/xmlcount_reward_func/mean": 0.4775390625, "rewards/xmlcount_reward_func/std": 0.06064485618844628, "step": 9 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 403.75, "completions/max_terminated_length": 386.625, "completions/mean_length": 210.6640625, "completions/mean_terminated_length": 203.55908012390137, "completions/min_length": 75.875, "completions/min_terminated_length": 75.875, "entropy": 0.3184557221829891, "epoch": 0.08565310492505353, "frac_reward_zero_std": 0.421875, "grad_norm": 5.125, "learning_rate": 1.5000000000000002e-05, "loss": -0.0025, "num_tokens": 402647.0, "reward": 2.7490234375, "reward_std": 0.7140121199190617, "rewards/correctness_reward_func/mean": 1.375, "rewards/correctness_reward_func/std": 0.9443820938467979, "rewards/int_reward_func/mean": 0.453125, "rewards/int_reward_func/std": 0.12136822193861008, "rewards/soft_format_reward_func/mean": 0.4609375, "rewards/soft_format_reward_func/std": 0.10298692621290684, "rewards/strict_format_reward_func/mean": 0.03125, "rewards/strict_format_reward_func/std": 0.10519563034176826, "rewards/xmlcount_reward_func/mean": 0.4287109375, "rewards/xmlcount_reward_func/std": 0.144282141700387, "step": 10 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 434.75, "completions/max_terminated_length": 372.875, "completions/mean_length": 238.5390625, "completions/mean_terminated_length": 229.2265682220459, "completions/min_length": 88.375, "completions/min_terminated_length": 88.375, "entropy": 0.3128885291516781, "epoch": 0.09421841541755889, "frac_reward_zero_std": 0.359375, "grad_norm": 5.78125, "learning_rate": 1.6666666666666667e-05, "loss": 0.2439, "num_tokens": 445266.0, "reward": 3.0341796875, "reward_std": 0.5344732906669378, "rewards/correctness_reward_func/mean": 1.421875, "rewards/correctness_reward_func/std": 0.8460541293025017, "rewards/int_reward_func/mean": 0.46484375, "rewards/int_reward_func/std": 0.09914018586277962, "rewards/soft_format_reward_func/mean": 0.484375, "rewards/soft_format_reward_func/std": 0.05259781517088413, "rewards/strict_format_reward_func/mean": 0.203125, "rewards/strict_format_reward_func/std": 0.2389280553907156, "rewards/xmlcount_reward_func/mean": 0.4599609375, "rewards/xmlcount_reward_func/std": 0.10840688459575176, "step": 11 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 373.125, "completions/max_terminated_length": 353.5, "completions/mean_length": 218.9140625, "completions/mean_terminated_length": 216.4234390258789, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "entropy": 0.2998475953936577, "epoch": 0.10278372591006424, "frac_reward_zero_std": 0.71875, "grad_norm": 4.1875, "learning_rate": 1.8333333333333333e-05, "loss": -0.1385, "num_tokens": 485929.0, "reward": 3.484375, "reward_std": 0.38393688201904297, "rewards/correctness_reward_func/mean": 1.59375, "rewards/correctness_reward_func/std": 0.7323416471481323, "rewards/int_reward_func/mean": 0.46875, "rewards/int_reward_func/std": 0.06877040676772594, "rewards/soft_format_reward_func/mean": 0.48828125, "rewards/soft_format_reward_func/std": 0.025194555521011353, "rewards/strict_format_reward_func/mean": 0.46875, "rewards/strict_format_reward_func/std": 0.09341737069189548, "rewards/xmlcount_reward_func/mean": 0.46484375, "rewards/xmlcount_reward_func/std": 0.09572842810302973, "step": 12 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 370.125, "completions/max_terminated_length": 333.5, "completions/mean_length": 204.46875, "completions/mean_terminated_length": 201.68333435058594, "completions/min_length": 95.875, "completions/min_terminated_length": 95.875, "entropy": 0.1816732920706272, "epoch": 0.11134903640256959, "frac_reward_zero_std": 0.703125, "grad_norm": 3.96875, "learning_rate": 2e-05, "loss": 0.2391, "num_tokens": 523887.0, "reward": 3.525390625, "reward_std": 0.30659707519225776, "rewards/correctness_reward_func/mean": 1.5625, "rewards/correctness_reward_func/std": 0.7897166311740875, "rewards/int_reward_func/mean": 0.4921875, "rewards/int_reward_func/std": 0.021347815170884132, "rewards/soft_format_reward_func/mean": 0.5, "rewards/soft_format_reward_func/std": 0.0, "rewards/strict_format_reward_func/mean": 0.4765625, "rewards/strict_format_reward_func/std": 0.08384781517088413, "rewards/xmlcount_reward_func/mean": 0.494140625, "rewards/xmlcount_reward_func/std": 0.0234375, "step": 13 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 404.0, "completions/max_terminated_length": 354.75, "completions/mean_length": 187.515625, "completions/mean_terminated_length": 179.75893211364746, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "entropy": 0.15807979460805655, "epoch": 0.11991434689507495, "frac_reward_zero_std": 0.75, "grad_norm": 4.46875, "learning_rate": 1.9995524322835035e-05, "loss": -0.0269, "num_tokens": 559761.0, "reward": 3.4208984375, "reward_std": 0.32731308368965983, "rewards/correctness_reward_func/mean": 1.53125, "rewards/correctness_reward_func/std": 0.8291211053729057, "rewards/int_reward_func/mean": 0.46875, "rewards/int_reward_func/std": 0.09341737069189548, "rewards/soft_format_reward_func/mean": 0.47265625, "rewards/soft_format_reward_func/std": 0.0660141110420227, "rewards/strict_format_reward_func/mean": 0.46875, "rewards/strict_format_reward_func/std": 0.07173692621290684, "rewards/xmlcount_reward_func/mean": 0.4794921875, "rewards/xmlcount_reward_func/std": 0.05074238684028387, "step": 14 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 349.625, "completions/max_terminated_length": 338.5, "completions/mean_length": 187.828125, "completions/mean_terminated_length": 185.50208473205566, "completions/min_length": 92.625, "completions/min_terminated_length": 92.625, "entropy": 0.17131240852177143, "epoch": 0.1284796573875803, "frac_reward_zero_std": 0.765625, "grad_norm": 3.765625, "learning_rate": 1.998210129767735e-05, "loss": 0.2086, "num_tokens": 596775.0, "reward": 3.4423828125, "reward_std": 0.3024538792669773, "rewards/correctness_reward_func/mean": 1.484375, "rewards/correctness_reward_func/std": 0.8679328411817551, "rewards/int_reward_func/mean": 0.484375, "rewards/int_reward_func/std": 0.05259781517088413, "rewards/soft_format_reward_func/mean": 0.49609375, "rewards/soft_format_reward_func/std": 0.015625, "rewards/strict_format_reward_func/mean": 0.484375, "rewards/strict_format_reward_func/std": 0.04081955552101135, "rewards/xmlcount_reward_func/mean": 0.4931640625, "rewards/xmlcount_reward_func/std": 0.02734375, "step": 15 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 401.375, "completions/max_terminated_length": 382.125, "completions/mean_length": 206.8984375, "completions/mean_terminated_length": 202.10052299499512, "completions/min_length": 102.25, "completions/min_terminated_length": 102.25, "entropy": 0.15633743070065975, "epoch": 0.13704496788008566, "frac_reward_zero_std": 0.765625, "grad_norm": 3.4375, "learning_rate": 1.9959742939952393e-05, "loss": 0.0938, "num_tokens": 635224.0, "reward": 3.591796875, "reward_std": 0.35631553269922733, "rewards/correctness_reward_func/mean": 1.625, "rewards/correctness_reward_func/std": 0.7407501488924026, "rewards/int_reward_func/mean": 0.48828125, "rewards/int_reward_func/std": 0.046875, "rewards/soft_format_reward_func/mean": 0.4921875, "rewards/soft_format_reward_func/std": 0.03125, "rewards/strict_format_reward_func/mean": 0.4921875, "rewards/strict_format_reward_func/std": 0.03125, "rewards/xmlcount_reward_func/mean": 0.494140625, "rewards/xmlcount_reward_func/std": 0.0234375, "step": 16 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 360.0, "completions/max_terminated_length": 330.125, "completions/mean_length": 199.09375, "completions/mean_terminated_length": 194.2984390258789, "completions/min_length": 104.625, "completions/min_terminated_length": 104.625, "entropy": 0.19571769889444113, "epoch": 0.145610278372591, "frac_reward_zero_std": 0.8125, "grad_norm": 2.9375, "learning_rate": 1.9928469263418376e-05, "loss": 0.0169, "num_tokens": 671932.0, "reward": 3.57421875, "reward_std": 0.23754368349909782, "rewards/correctness_reward_func/mean": 1.609375, "rewards/correctness_reward_func/std": 0.6196783930063248, "rewards/int_reward_func/mean": 0.484375, "rewards/int_reward_func/std": 0.04081955552101135, "rewards/soft_format_reward_func/mean": 0.4921875, "rewards/soft_format_reward_func/std": 0.03125, "rewards/strict_format_reward_func/mean": 0.4921875, "rewards/strict_format_reward_func/std": 0.03125, "rewards/xmlcount_reward_func/mean": 0.49609375, "rewards/xmlcount_reward_func/std": 0.015625, "step": 17 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 363.5, "completions/max_terminated_length": 352.125, "completions/mean_length": 207.1640625, "completions/mean_terminated_length": 204.95573043823242, "completions/min_length": 96.75, "completions/min_terminated_length": 96.75, "entropy": 0.20056522078812122, "epoch": 0.15417558886509636, "frac_reward_zero_std": 0.765625, "grad_norm": 5.84375, "learning_rate": 1.9888308262251286e-05, "loss": 0.0571, "num_tokens": 710529.0, "reward": 3.4853515625, "reward_std": 0.3079781490378082, "rewards/correctness_reward_func/mean": 1.578125, "rewards/correctness_reward_func/std": 0.7429328411817551, "rewards/int_reward_func/mean": 0.46484375, "rewards/int_reward_func/std": 0.08923800103366375, "rewards/soft_format_reward_func/mean": 0.484375, "rewards/soft_format_reward_func/std": 0.04081955552101135, "rewards/strict_format_reward_func/mean": 0.47265625, "rewards/strict_format_reward_func/std": 0.07779237069189548, "rewards/xmlcount_reward_func/mean": 0.4853515625, "rewards/xmlcount_reward_func/std": 0.03691330552101135, "step": 18 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 404.0, "completions/max_terminated_length": 404.0, "completions/mean_length": 242.7109375, "completions/mean_terminated_length": 242.7109375, "completions/min_length": 131.125, "completions/min_terminated_length": 131.125, "entropy": 0.18245846219360828, "epoch": 0.16274089935760172, "frac_reward_zero_std": 0.71875, "grad_norm": 2.609375, "learning_rate": 1.98392958859863e-05, "loss": -0.0092, "num_tokens": 753458.0, "reward": 3.4609375, "reward_std": 0.39774756878614426, "rewards/correctness_reward_func/mean": 1.484375, "rewards/correctness_reward_func/std": 0.8802329078316689, "rewards/int_reward_func/mean": 0.4765625, "rewards/int_reward_func/std": 0.07206955552101135, "rewards/soft_format_reward_func/mean": 0.5, "rewards/soft_format_reward_func/std": 0.0, "rewards/strict_format_reward_func/mean": 0.5, "rewards/strict_format_reward_func/std": 0.0, "rewards/xmlcount_reward_func/mean": 0.5, "rewards/xmlcount_reward_func/std": 0.0, "step": 19 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 453.25, "completions/max_terminated_length": 416.25, "completions/mean_length": 240.1328125, "completions/mean_terminated_length": 232.0755271911621, "completions/min_length": 136.875, "completions/min_terminated_length": 136.875, "entropy": 0.1867619026452303, "epoch": 0.17130620985010706, "frac_reward_zero_std": 0.765625, "grad_norm": 2.640625, "learning_rate": 1.9781476007338058e-05, "loss": 0.1299, "num_tokens": 796607.0, "reward": 3.5458984375, "reward_std": 0.33283737674355507, "rewards/correctness_reward_func/mean": 1.59375, "rewards/correctness_reward_func/std": 0.7794546857476234, "rewards/int_reward_func/mean": 0.48828125, "rewards/int_reward_func/std": 0.046875, "rewards/soft_format_reward_func/mean": 0.48828125, "rewards/soft_format_reward_func/std": 0.046875, "rewards/strict_format_reward_func/mean": 0.484375, "rewards/strict_format_reward_func/std": 0.0625, "rewards/xmlcount_reward_func/mean": 0.4912109375, "rewards/xmlcount_reward_func/std": 0.03515625, "step": 20 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 364.625, "completions/max_terminated_length": 364.625, "completions/mean_length": 236.8828125, "completions/mean_terminated_length": 236.8828125, "completions/min_length": 135.25, "completions/min_terminated_length": 135.25, "entropy": 0.17888653837144375, "epoch": 0.17987152034261242, "frac_reward_zero_std": 0.8125, "grad_norm": 2.40625, "learning_rate": 1.9714900382928674e-05, "loss": -0.0113, "num_tokens": 839112.0, "reward": 3.537109375, "reward_std": 0.2568786293268204, "rewards/correctness_reward_func/mean": 1.546875, "rewards/correctness_reward_func/std": 0.7888757362961769, "rewards/int_reward_func/mean": 0.49609375, "rewards/int_reward_func/std": 0.015625, "rewards/soft_format_reward_func/mean": 0.5, "rewards/soft_format_reward_func/std": 0.0, "rewards/strict_format_reward_func/mean": 0.49609375, "rewards/strict_format_reward_func/std": 0.015625, "rewards/xmlcount_reward_func/mean": 0.498046875, "rewards/xmlcount_reward_func/std": 0.0078125, "step": 21 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 402.25, "completions/max_terminated_length": 371.75, "completions/mean_length": 232.953125, "completions/mean_terminated_length": 226.11860275268555, "completions/min_length": 142.125, "completions/min_terminated_length": 142.125, "entropy": 0.1563574131578207, "epoch": 0.18843683083511778, "frac_reward_zero_std": 0.78125, "grad_norm": 2.328125, "learning_rate": 1.9639628606958535e-05, "loss": 0.0542, "num_tokens": 881088.0, "reward": 3.5693359375, "reward_std": 0.30245387367904186, "rewards/correctness_reward_func/mean": 1.625, "rewards/correctness_reward_func/std": 0.7605545148253441, "rewards/int_reward_func/mean": 0.4765625, "rewards/int_reward_func/std": 0.08384781517088413, "rewards/soft_format_reward_func/mean": 0.48828125, "rewards/soft_format_reward_func/std": 0.03697281517088413, "rewards/strict_format_reward_func/mean": 0.48828125, "rewards/strict_format_reward_func/std": 0.03697281517088413, "rewards/xmlcount_reward_func/mean": 0.4912109375, "rewards/xmlcount_reward_func/std": 0.027729611843824387, "step": 22 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 446.75, "completions/max_terminated_length": 402.875, "completions/mean_length": 254.4765625, "completions/mean_terminated_length": 242.60871124267578, "completions/min_length": 135.75, "completions/min_terminated_length": 135.75, "entropy": 0.16357604414224625, "epoch": 0.19700214132762311, "frac_reward_zero_std": 0.6875, "grad_norm": 3.65625, "learning_rate": 1.955572805786141e-05, "loss": 0.0195, "num_tokens": 926107.0, "reward": 3.341796875, "reward_std": 0.4087961111217737, "rewards/correctness_reward_func/mean": 1.46875, "rewards/correctness_reward_func/std": 0.8502998873591423, "rewards/int_reward_func/mean": 0.46484375, "rewards/int_reward_func/std": 0.08439540676772594, "rewards/soft_format_reward_func/mean": 0.46875, "rewards/soft_format_reward_func/std": 0.08054866641759872, "rewards/strict_format_reward_func/mean": 0.4609375, "rewards/strict_format_reward_func/std": 0.10189648158848286, "rewards/xmlcount_reward_func/mean": 0.478515625, "rewards/xmlcount_reward_func/std": 0.04778209747746587, "step": 23 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 403.5, "completions/max_terminated_length": 380.875, "completions/mean_length": 241.0703125, "completions/mean_terminated_length": 239.03854370117188, "completions/min_length": 119.25, "completions/min_terminated_length": 119.25, "entropy": 0.1647733524441719, "epoch": 0.20556745182012848, "frac_reward_zero_std": 0.6875, "grad_norm": 7.75, "learning_rate": 1.9463273837991643e-05, "loss": -0.128, "num_tokens": 968962.0, "reward": 3.3740234375, "reward_std": 0.48751697689294815, "rewards/correctness_reward_func/mean": 1.484375, "rewards/correctness_reward_func/std": 0.8504082337021828, "rewards/int_reward_func/mean": 0.46484375, "rewards/int_reward_func/std": 0.0972641110420227, "rewards/soft_format_reward_func/mean": 0.4765625, "rewards/soft_format_reward_func/std": 0.050389111042022705, "rewards/strict_format_reward_func/mean": 0.47265625, "rewards/strict_format_reward_func/std": 0.0660141110420227, "rewards/xmlcount_reward_func/mean": 0.4755859375, "rewards/xmlcount_reward_func/std": 0.054295361042022705, "step": 24 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 408.75, "completions/max_terminated_length": 399.0, "completions/mean_length": 255.03125, "completions/mean_terminated_length": 253.1906280517578, "completions/min_length": 156.5, "completions/min_terminated_length": 156.5, "entropy": 0.125497592613101, "epoch": 0.21413276231263384, "frac_reward_zero_std": 0.8125, "grad_norm": 2.53125, "learning_rate": 1.9362348706397374e-05, "loss": -0.0327, "num_tokens": 1013778.0, "reward": 3.583984375, "reward_std": 0.22373299859464169, "rewards/correctness_reward_func/mean": 1.609375, "rewards/correctness_reward_func/std": 0.7345243394374847, "rewards/int_reward_func/mean": 0.48828125, "rewards/int_reward_func/std": 0.03697281517088413, "rewards/soft_format_reward_func/mean": 0.49609375, "rewards/soft_format_reward_func/std": 0.015625, "rewards/strict_format_reward_func/mean": 0.4921875, "rewards/strict_format_reward_func/std": 0.021347815170884132, "rewards/xmlcount_reward_func/mean": 0.498046875, "rewards/xmlcount_reward_func/std": 0.005336953792721033, "step": 25 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 403.125, "completions/max_terminated_length": 384.875, "completions/mean_length": 262.71875, "completions/mean_terminated_length": 260.7333335876465, "completions/min_length": 167.5, "completions/min_terminated_length": 167.5, "entropy": 0.18168668448925018, "epoch": 0.22269807280513917, "frac_reward_zero_std": 0.75, "grad_norm": 13.0, "learning_rate": 1.9253043004739967e-05, "loss": 0.054, "num_tokens": 1059274.0, "reward": 3.5341796875, "reward_std": 0.39360435120761395, "rewards/correctness_reward_func/mean": 1.625, "rewards/correctness_reward_func/std": 0.7649086192250252, "rewards/int_reward_func/mean": 0.47265625, "rewards/int_reward_func/std": 0.06116959825158119, "rewards/soft_format_reward_func/mean": 0.4765625, "rewards/soft_format_reward_func/std": 0.049298666417598724, "rewards/strict_format_reward_func/mean": 0.4765625, "rewards/strict_format_reward_func/std": 0.049298666417598724, "rewards/xmlcount_reward_func/mean": 0.4833984375, "rewards/xmlcount_reward_func/std": 0.03711527772247791, "step": 26 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 467.5, "completions/max_terminated_length": 456.625, "completions/mean_length": 268.328125, "completions/mean_terminated_length": 263.0275344848633, "completions/min_length": 158.375, "completions/min_terminated_length": 158.375, "entropy": 0.13837805949151516, "epoch": 0.23126338329764454, "frac_reward_zero_std": 0.6875, "grad_norm": 10.1875, "learning_rate": 1.913545457642601e-05, "loss": -0.0646, "num_tokens": 1105884.0, "reward": 3.2080078125, "reward_std": 0.4985655229538679, "rewards/correctness_reward_func/mean": 1.359375, "rewards/correctness_reward_func/std": 0.91850346326828, "rewards/int_reward_func/mean": 0.453125, "rewards/int_reward_func/std": 0.11255648173391819, "rewards/soft_format_reward_func/mean": 0.4609375, "rewards/soft_format_reward_func/std": 0.09308474138379097, "rewards/strict_format_reward_func/mean": 0.45703125, "rewards/strict_format_reward_func/std": 0.09693148173391819, "rewards/xmlcount_reward_func/mean": 0.4775390625, "rewards/xmlcount_reward_func/std": 0.05502833751961589, "step": 27 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 449.5, "completions/max_terminated_length": 401.75, "completions/mean_length": 268.9375, "completions/mean_terminated_length": 260.4345283508301, "completions/min_length": 159.25, "completions/min_terminated_length": 159.25, "entropy": 0.08630623016506433, "epoch": 0.2398286937901499, "frac_reward_zero_std": 0.734375, "grad_norm": 2.578125, "learning_rate": 1.900968867902419e-05, "loss": 0.0809, "num_tokens": 1152072.0, "reward": 3.59375, "reward_std": 0.34802911058068275, "rewards/correctness_reward_func/mean": 1.671875, "rewards/correctness_reward_func/std": 0.5796433389186859, "rewards/int_reward_func/mean": 0.4921875, "rewards/int_reward_func/std": 0.03125, "rewards/soft_format_reward_func/mean": 0.47265625, "rewards/soft_format_reward_func/std": 0.0796684455126524, "rewards/strict_format_reward_func/mean": 0.46875, "rewards/strict_format_reward_func/std": 0.0952934455126524, "rewards/xmlcount_reward_func/mean": 0.48828125, "rewards/xmlcount_reward_func/std": 0.03735560039058328, "step": 28 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 414.125, "completions/max_terminated_length": 404.75, "completions/mean_length": 265.9921875, "completions/mean_terminated_length": 261.0090808868408, "completions/min_length": 149.125, "completions/min_terminated_length": 149.125, "entropy": 0.08809430431574583, "epoch": 0.24839400428265523, "frac_reward_zero_std": 0.734375, "grad_norm": 7.03125, "learning_rate": 1.8875857890045544e-05, "loss": -0.0093, "num_tokens": 1198043.0, "reward": 3.376953125, "reward_std": 0.42813105694949627, "rewards/correctness_reward_func/mean": 1.5, "rewards/correctness_reward_func/std": 0.8573416471481323, "rewards/int_reward_func/mean": 0.4609375, "rewards/int_reward_func/std": 0.10298692621290684, "rewards/soft_format_reward_func/mean": 0.46875, "rewards/soft_format_reward_func/std": 0.09341737069189548, "rewards/strict_format_reward_func/mean": 0.46875, "rewards/strict_format_reward_func/std": 0.09341737069189548, "rewards/xmlcount_reward_func/mean": 0.478515625, "rewards/xmlcount_reward_func/std": 0.060735128819942474, "step": 29 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 468.625, "completions/max_terminated_length": 436.25, "completions/mean_length": 267.0859375, "completions/mean_terminated_length": 261.05000495910645, "completions/min_length": 156.5, "completions/min_terminated_length": 156.5, "entropy": 0.1107498500496149, "epoch": 0.2569593147751606, "frac_reward_zero_std": 0.828125, "grad_norm": 22.375, "learning_rate": 1.87340820061713e-05, "loss": 0.0968, "num_tokens": 1244242.0, "reward": 3.5576171875, "reward_std": 0.2610218357294798, "rewards/correctness_reward_func/mean": 1.671875, "rewards/correctness_reward_func/std": 0.6637040823698044, "rewards/int_reward_func/mean": 0.4609375, "rewards/int_reward_func/std": 0.11179866641759872, "rewards/soft_format_reward_func/mean": 0.4765625, "rewards/soft_format_reward_func/std": 0.05920085124671459, "rewards/strict_format_reward_func/mean": 0.47265625, "rewards/strict_format_reward_func/std": 0.07482585124671459, "rewards/xmlcount_reward_func/mean": 0.4755859375, "rewards/xmlcount_reward_func/std": 0.06549832038581371, "step": 30 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0546875, "completions/max_length": 465.625, "completions/max_terminated_length": 430.875, "completions/mean_length": 279.1953125, "completions/mean_terminated_length": 266.19639587402344, "completions/min_length": 156.5, "completions/min_terminated_length": 156.5, "entropy": 0.10201808018609881, "epoch": 0.26552462526766596, "frac_reward_zero_std": 0.734375, "grad_norm": 20.375, "learning_rate": 1.8584487936018663e-05, "loss": 0.1084, "num_tokens": 1292255.0, "reward": 3.431640625, "reward_std": 0.3204077649861574, "rewards/correctness_reward_func/mean": 1.5625, "rewards/correctness_reward_func/std": 0.7631078958511353, "rewards/int_reward_func/mean": 0.47265625, "rewards/int_reward_func/std": 0.07779237069189548, "rewards/soft_format_reward_func/mean": 0.46484375, "rewards/soft_format_reward_func/std": 0.09617366641759872, "rewards/strict_format_reward_func/mean": 0.4609375, "rewards/strict_format_reward_func/std": 0.10002040676772594, "rewards/xmlcount_reward_func/mean": 0.470703125, "rewards/xmlcount_reward_func/std": 0.0801805853843689, "step": 31 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 456.0, "completions/max_terminated_length": 420.5, "completions/mean_length": 273.09375, "completions/mean_terminated_length": 265.13988304138184, "completions/min_length": 162.625, "completions/min_terminated_length": 162.625, "entropy": 0.0854261638596654, "epoch": 0.2740899357601713, "frac_reward_zero_std": 0.640625, "grad_norm": 6.15625, "learning_rate": 1.8427209586540392e-05, "loss": 0.1575, "num_tokens": 1339511.0, "reward": 3.3798828125, "reward_std": 0.4875169713050127, "rewards/correctness_reward_func/mean": 1.578125, "rewards/correctness_reward_func/std": 0.80234594643116, "rewards/int_reward_func/mean": 0.46875, "rewards/int_reward_func/std": 0.09341737069189548, "rewards/soft_format_reward_func/mean": 0.4375, "rewards/soft_format_reward_func/std": 0.15228559263050556, "rewards/strict_format_reward_func/mean": 0.43359375, "rewards/strict_format_reward_func/std": 0.15613233298063278, "rewards/xmlcount_reward_func/mean": 0.4619140625, "rewards/xmlcount_reward_func/std": 0.10068135987967253, "step": 32 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0390625, "completions/max_length": 469.625, "completions/max_terminated_length": 423.625, "completions/mean_length": 274.953125, "completions/mean_terminated_length": 265.3171920776367, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "entropy": 0.08486761944368482, "epoch": 0.2826552462526767, "frac_reward_zero_std": 0.65625, "grad_norm": 7.25, "learning_rate": 1.826238774315995e-05, "loss": 0.124, "num_tokens": 1386753.0, "reward": 3.3203125, "reward_std": 0.5109951309859753, "rewards/correctness_reward_func/mean": 1.59375, "rewards/correctness_reward_func/std": 0.6808668300509453, "rewards/int_reward_func/mean": 0.46484375, "rewards/int_reward_func/std": 0.08736192621290684, "rewards/soft_format_reward_func/mean": 0.41796875, "rewards/soft_format_reward_func/std": 0.17672233283519745, "rewards/strict_format_reward_func/mean": 0.3984375, "rewards/strict_format_reward_func/std": 0.19531385228037834, "rewards/xmlcount_reward_func/mean": 0.4453125, "rewards/xmlcount_reward_func/std": 0.11597495479509234, "step": 33 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 500.375, "completions/max_terminated_length": 410.125, "completions/mean_length": 295.1015625, "completions/mean_terminated_length": 264.6992950439453, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "entropy": 0.1147261718288064, "epoch": 0.291220556745182, "frac_reward_zero_std": 0.625, "grad_norm": 10.4375, "learning_rate": 1.8090169943749477e-05, "loss": 0.2837, "num_tokens": 1436884.0, "reward": 3.1953125, "reward_std": 0.5910970717668533, "rewards/correctness_reward_func/mean": 1.453125, "rewards/correctness_reward_func/std": 0.9149020090699196, "rewards/int_reward_func/mean": 0.46484375, "rewards/int_reward_func/std": 0.1109184455126524, "rewards/soft_format_reward_func/mean": 0.453125, "rewards/soft_format_reward_func/std": 0.13611300103366375, "rewards/strict_format_reward_func/mean": 0.390625, "rewards/strict_format_reward_func/std": 0.21258162707090378, "rewards/xmlcount_reward_func/mean": 0.43359375, "rewards/xmlcount_reward_func/std": 0.147721191868186, "step": 34 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0546875, "completions/max_length": 471.0, "completions/max_terminated_length": 403.25, "completions/mean_length": 256.2578125, "completions/mean_terminated_length": 241.55090141296387, "completions/min_length": 135.5, "completions/min_terminated_length": 135.5, "entropy": 0.06437111645936966, "epoch": 0.29978586723768735, "frac_reward_zero_std": 0.671875, "grad_norm": 10.0625, "learning_rate": 1.7910710346563417e-05, "loss": 0.0329, "num_tokens": 1481663.0, "reward": 3.4833984375, "reward_std": 0.53999756090343, "rewards/correctness_reward_func/mean": 1.640625, "rewards/correctness_reward_func/std": 0.778965063393116, "rewards/int_reward_func/mean": 0.46484375, "rewards/int_reward_func/std": 0.12082063034176826, "rewards/soft_format_reward_func/mean": 0.46484375, "rewards/soft_format_reward_func/std": 0.12082063034176826, "rewards/strict_format_reward_func/mean": 0.44921875, "rewards/strict_format_reward_func/std": 0.14689540676772594, "rewards/xmlcount_reward_func/mean": 0.4638671875, "rewards/xmlcount_reward_func/std": 0.10555252991616726, "step": 35 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 457.875, "completions/max_terminated_length": 417.625, "completions/mean_length": 246.2265625, "completions/mean_terminated_length": 232.80781745910645, "completions/min_length": 77.0, "completions/min_terminated_length": 77.0, "entropy": 0.09419436752796173, "epoch": 0.3083511777301927, "frac_reward_zero_std": 0.65625, "grad_norm": 16.25, "learning_rate": 1.7724169592245996e-05, "loss": -0.0012, "num_tokens": 1524892.0, "reward": 3.3369140625, "reward_std": 0.34941017907112837, "rewards/correctness_reward_func/mean": 1.546875, "rewards/correctness_reward_func/std": 0.7342507243156433, "rewards/int_reward_func/mean": 0.46875, "rewards/int_reward_func/std": 0.09341737069189548, "rewards/soft_format_reward_func/mean": 0.453125, "rewards/soft_format_reward_func/std": 0.1095899622887373, "rewards/strict_format_reward_func/mean": 0.43359375, "rewards/strict_format_reward_func/std": 0.14689540676772594, "rewards/xmlcount_reward_func/mean": 0.4345703125, "rewards/xmlcount_reward_func/std": 0.12757759355008602, "step": 36 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 466.875, "completions/max_terminated_length": 341.625, "completions/mean_length": 237.1953125, "completions/mean_terminated_length": 214.32523155212402, "completions/min_length": 102.5, "completions/min_terminated_length": 102.5, "entropy": 0.08123180363327265, "epoch": 0.3169164882226981, "frac_reward_zero_std": 0.75, "grad_norm": 12.0625, "learning_rate": 1.7530714660036112e-05, "loss": 0.4256, "num_tokens": 1566655.0, "reward": 3.6396484375, "reward_std": 0.23616261687129736, "rewards/correctness_reward_func/mean": 1.765625, "rewards/correctness_reward_func/std": 0.45028156042099, "rewards/int_reward_func/mean": 0.484375, "rewards/int_reward_func/std": 0.05259781517088413, "rewards/soft_format_reward_func/mean": 0.484375, "rewards/soft_format_reward_func/std": 0.0625, "rewards/strict_format_reward_func/mean": 0.44140625, "rewards/strict_format_reward_func/std": 0.15537451766431332, "rewards/xmlcount_reward_func/mean": 0.4638671875, "rewards/xmlcount_reward_func/std": 0.10096731083467603, "step": 37 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 446.875, "completions/max_terminated_length": 403.125, "completions/mean_length": 237.4453125, "completions/mean_terminated_length": 232.93490028381348, "completions/min_length": 120.625, "completions/min_terminated_length": 120.625, "entropy": 0.07567687798291445, "epoch": 0.32548179871520344, "frac_reward_zero_std": 0.75, "grad_norm": 8.8125, "learning_rate": 1.7330518718298263e-05, "loss": 0.1448, "num_tokens": 1609482.0, "reward": 3.5947265625, "reward_std": 0.3079781401902437, "rewards/correctness_reward_func/mean": 1.625, "rewards/correctness_reward_func/std": 0.7266493514180183, "rewards/int_reward_func/mean": 0.5, "rewards/int_reward_func/std": 0.0, "rewards/soft_format_reward_func/mean": 0.4921875, "rewards/soft_format_reward_func/std": 0.021347815170884132, "rewards/strict_format_reward_func/mean": 0.484375, "rewards/strict_format_reward_func/std": 0.05259781517088413, "rewards/xmlcount_reward_func/mean": 0.4931640625, "rewards/xmlcount_reward_func/std": 0.023663727566599846, "step": 38 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 412.75, "completions/max_terminated_length": 407.25, "completions/mean_length": 225.1953125, "completions/mean_terminated_length": 223.21041870117188, "completions/min_length": 123.125, "completions/min_terminated_length": 123.125, "entropy": 0.061285244300961494, "epoch": 0.3340471092077088, "frac_reward_zero_std": 0.84375, "grad_norm": 6.0, "learning_rate": 1.712376096951345e-05, "loss": 0.1485, "num_tokens": 1649845.0, "reward": 3.748046875, "reward_std": 0.17953883367590606, "rewards/correctness_reward_func/mean": 1.765625, "rewards/correctness_reward_func/std": 0.601259708404541, "rewards/int_reward_func/mean": 0.5, "rewards/int_reward_func/std": 0.0, "rewards/soft_format_reward_func/mean": 0.5, "rewards/soft_format_reward_func/std": 0.0, "rewards/strict_format_reward_func/mean": 0.49609375, "rewards/strict_format_reward_func/std": 0.015625, "rewards/xmlcount_reward_func/mean": 0.486328125, "rewards/xmlcount_reward_func/std": 0.04478531517088413, "step": 39 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 397.125, "completions/max_terminated_length": 352.75, "completions/mean_length": 211.8359375, "completions/mean_terminated_length": 204.96146202087402, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "entropy": 0.08425743412226439, "epoch": 0.3426124197002141, "frac_reward_zero_std": 0.796875, "grad_norm": 9.6875, "learning_rate": 1.691062648986865e-05, "loss": 0.0101, "num_tokens": 1689182.0, "reward": 3.4814453125, "reward_std": 0.3245509583503008, "rewards/correctness_reward_func/mean": 1.59375, "rewards/correctness_reward_func/std": 0.7279798686504364, "rewards/int_reward_func/mean": 0.46484375, "rewards/int_reward_func/std": 0.09617366641759872, "rewards/soft_format_reward_func/mean": 0.4765625, "rewards/soft_format_reward_func/std": 0.07206955552101135, "rewards/strict_format_reward_func/mean": 0.46875, "rewards/strict_format_reward_func/std": 0.10331955552101135, "rewards/xmlcount_reward_func/mean": 0.4775390625, "rewards/xmlcount_reward_func/std": 0.06816330552101135, "step": 40 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 415.625, "completions/max_terminated_length": 373.375, "completions/mean_length": 202.0078125, "completions/mean_terminated_length": 192.4726963043213, "completions/min_length": 73.75, "completions/min_terminated_length": 73.75, "entropy": 0.08351494651287794, "epoch": 0.3511777301927195, "frac_reward_zero_std": 0.78125, "grad_norm": 7.1875, "learning_rate": 1.6691306063588583e-05, "loss": 0.0866, "num_tokens": 1726993.0, "reward": 3.4150390625, "reward_std": 0.23063834570348263, "rewards/correctness_reward_func/mean": 1.546875, "rewards/correctness_reward_func/std": 0.7765756696462631, "rewards/int_reward_func/mean": 0.453125, "rewards/int_reward_func/std": 0.11146603710949421, "rewards/soft_format_reward_func/mean": 0.48046875, "rewards/soft_format_reward_func/std": 0.046542370691895485, "rewards/strict_format_reward_func/mean": 0.4609375, "rewards/strict_format_reward_func/std": 0.10189648158848286, "rewards/xmlcount_reward_func/mean": 0.4736328125, "rewards/xmlcount_reward_func/std": 0.06877632485702634, "step": 41 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 393.0, "completions/max_terminated_length": 353.875, "completions/mean_length": 199.8359375, "completions/mean_terminated_length": 192.48474884033203, "completions/min_length": 87.5, "completions/min_terminated_length": 87.5, "entropy": 0.07358774542808533, "epoch": 0.35974304068522484, "frac_reward_zero_std": 0.765625, "grad_norm": 6.28125, "learning_rate": 1.6465996012157996e-05, "loss": 0.2018, "num_tokens": 1765048.0, "reward": 3.466796875, "reward_std": 0.3204077500849962, "rewards/correctness_reward_func/mean": 1.53125, "rewards/correctness_reward_func/std": 0.8384338021278381, "rewards/int_reward_func/mean": 0.48828125, "rewards/int_reward_func/std": 0.03697281517088413, "rewards/soft_format_reward_func/mean": 0.48828125, "rewards/soft_format_reward_func/std": 0.03697281517088413, "rewards/strict_format_reward_func/mean": 0.47265625, "rewards/strict_format_reward_func/std": 0.08957063034176826, "rewards/xmlcount_reward_func/mean": 0.486328125, "rewards/xmlcount_reward_func/std": 0.045771504286676645, "step": 42 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 463.75, "completions/max_terminated_length": 342.0, "completions/mean_length": 203.1796875, "completions/mean_terminated_length": 177.74688339233398, "completions/min_length": 76.625, "completions/min_terminated_length": 76.625, "entropy": 0.09338215924799442, "epoch": 0.3683083511777302, "frac_reward_zero_std": 0.671875, "grad_norm": 14.1875, "learning_rate": 1.6234898018587336e-05, "loss": 0.4962, "num_tokens": 1802763.0, "reward": 3.421875, "reward_std": 0.39774755109101534, "rewards/correctness_reward_func/mean": 1.53125, "rewards/correctness_reward_func/std": 0.8366330787539482, "rewards/int_reward_func/mean": 0.4921875, "rewards/int_reward_func/std": 0.03125, "rewards/soft_format_reward_func/mean": 0.5, "rewards/soft_format_reward_func/std": 0.0, "rewards/strict_format_reward_func/mean": 0.4453125, "rewards/strict_format_reward_func/std": 0.1271837092936039, "rewards/xmlcount_reward_func/mean": 0.453125, "rewards/xmlcount_reward_func/std": 0.1128259189426899, "step": 43 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0546875, "completions/max_length": 434.5, "completions/max_terminated_length": 338.5, "completions/mean_length": 200.421875, "completions/mean_terminated_length": 181.6392650604248, "completions/min_length": 95.125, "completions/min_terminated_length": 95.125, "entropy": 0.09290066035464406, "epoch": 0.37687366167023556, "frac_reward_zero_std": 0.78125, "grad_norm": 129.0, "learning_rate": 1.599821894687914e-05, "loss": 0.2028, "num_tokens": 1840283.0, "reward": 3.3876953125, "reward_std": 0.27759466134011745, "rewards/correctness_reward_func/mean": 1.515625, "rewards/correctness_reward_func/std": 0.7650169730186462, "rewards/int_reward_func/mean": 0.48828125, "rewards/int_reward_func/std": 0.025194555521011353, "rewards/soft_format_reward_func/mean": 0.48046875, "rewards/soft_format_reward_func/std": 0.05644455552101135, "rewards/strict_format_reward_func/mean": 0.4453125, "rewards/strict_format_reward_func/std": 0.11806907318532467, "rewards/xmlcount_reward_func/mean": 0.4580078125, "rewards/xmlcount_reward_func/std": 0.10733805038034916, "step": 44 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 302.125, "completions/max_terminated_length": 273.5, "completions/mean_length": 161.5859375, "completions/mean_terminated_length": 158.94635581970215, "completions/min_length": 83.75, "completions/min_terminated_length": 83.75, "entropy": 0.07536831498146057, "epoch": 0.3854389721627409, "frac_reward_zero_std": 0.75, "grad_norm": 52.5, "learning_rate": 1.575617065685674e-05, "loss": 0.0339, "num_tokens": 1873008.0, "reward": 3.439453125, "reward_std": 0.4253689181059599, "rewards/correctness_reward_func/mean": 1.5625, "rewards/correctness_reward_func/std": 0.8102209344506264, "rewards/int_reward_func/mean": 0.45703125, "rewards/int_reward_func/std": 0.11861192621290684, "rewards/soft_format_reward_func/mean": 0.4765625, "rewards/soft_format_reward_func/std": 0.050389111042022705, "rewards/strict_format_reward_func/mean": 0.47265625, "rewards/strict_format_reward_func/std": 0.053145406767725945, "rewards/xmlcount_reward_func/mean": 0.470703125, "rewards/xmlcount_reward_func/std": 0.060957906767725945, "step": 45 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 444.5, "completions/max_terminated_length": 266.75, "completions/mean_length": 179.7578125, "completions/mean_terminated_length": 152.03579235076904, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "entropy": 0.06918492680415511, "epoch": 0.39400428265524623, "frac_reward_zero_std": 0.625, "grad_norm": 24.125, "learning_rate": 1.5508969814521026e-05, "loss": 0.1968, "num_tokens": 1907887.0, "reward": 3.259765625, "reward_std": 0.4695630930364132, "rewards/correctness_reward_func/mean": 1.5, "rewards/correctness_reward_func/std": 0.8375296071171761, "rewards/int_reward_func/mean": 0.4765625, "rewards/int_reward_func/std": 0.07206955552101135, "rewards/soft_format_reward_func/mean": 0.4609375, "rewards/soft_format_reward_func/std": 0.08824022859334946, "rewards/strict_format_reward_func/mean": 0.38671875, "rewards/strict_format_reward_func/std": 0.20270179212093353, "rewards/xmlcount_reward_func/mean": 0.435546875, "rewards/xmlcount_reward_func/std": 0.12772688083350658, "step": 46 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5859375, "completions/max_length": 512.0, "completions/max_terminated_length": 234.25, "completions/mean_length": 361.9609375, "completions/mean_terminated_length": 144.42629528045654, "completions/min_length": 80.375, "completions/min_terminated_length": 80.375, "entropy": 0.06124910665675998, "epoch": 0.4025695931477516, "frac_reward_zero_std": 0.25, "grad_norm": 20.0, "learning_rate": 1.5256837698105047e-05, "loss": 0.4458, "num_tokens": 1965858.0, "reward": 2.03125, "reward_std": 0.770635899156332, "rewards/correctness_reward_func/mean": 1.0625, "rewards/correctness_reward_func/std": 0.978024922311306, "rewards/int_reward_func/mean": 0.41015625, "rewards/int_reward_func/std": 0.14781177043914795, "rewards/soft_format_reward_func/mean": 0.34765625, "rewards/soft_format_reward_func/std": 0.221479382365942, "rewards/strict_format_reward_func/mean": 0.046875, "rewards/strict_format_reward_func/std": 0.1095899622887373, "rewards/xmlcount_reward_func/mean": 0.1640625, "rewards/xmlcount_reward_func/std": 0.17734735272824764, "step": 47 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.78125, "completions/max_length": 512.0, "completions/max_terminated_length": 174.375, "completions/mean_length": 431.34375, "completions/mean_terminated_length": 122.72916984558105, "completions/min_length": 152.25, "completions/min_terminated_length": 88.25, "entropy": 0.06367563363164663, "epoch": 0.41113490364025695, "frac_reward_zero_std": 0.25, "grad_norm": 20.75, "learning_rate": 1.5000000000000002e-05, "loss": 0.5487, "num_tokens": 2033422.0, "reward": 1.4404296875, "reward_std": 0.7416334673762321, "rewards/correctness_reward_func/mean": 0.71875, "rewards/correctness_reward_func/std": 0.9663743898272514, "rewards/int_reward_func/mean": 0.30078125, "rewards/int_reward_func/std": 0.23859525099396706, "rewards/soft_format_reward_func/mean": 0.29296875, "rewards/soft_format_reward_func/std": 0.23586604371666908, "rewards/strict_format_reward_func/mean": 0.015625, "rewards/strict_format_reward_func/std": 0.0625, "rewards/xmlcount_reward_func/mean": 0.1123046875, "rewards/xmlcount_reward_func/std": 0.16386567754670978, "step": 48 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.6328125, "completions/max_length": 512.0, "completions/max_terminated_length": 230.375, "completions/mean_length": 376.578125, "completions/mean_terminated_length": 143.8458366394043, "completions/min_length": 79.75, "completions/min_terminated_length": 79.75, "entropy": 0.06622787471860647, "epoch": 0.4197002141327623, "frac_reward_zero_std": 0.234375, "grad_norm": 69.5, "learning_rate": 1.4738686624729987e-05, "loss": 0.9777, "num_tokens": 2093936.0, "reward": 1.595703125, "reward_std": 0.7098689079284668, "rewards/correctness_reward_func/mean": 0.796875, "rewards/correctness_reward_func/std": 0.9754082411527634, "rewards/int_reward_func/mean": 0.33984375, "rewards/int_reward_func/std": 0.21231234446167946, "rewards/soft_format_reward_func/mean": 0.28125, "rewards/soft_format_reward_func/std": 0.23983315750956535, "rewards/strict_format_reward_func/mean": 0.015625, "rewards/strict_format_reward_func/std": 0.05259781517088413, "rewards/xmlcount_reward_func/mean": 0.162109375, "rewards/xmlcount_reward_func/std": 0.1913837492465973, "step": 49 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.640625, "completions/max_length": 512.0, "completions/max_terminated_length": 213.5, "completions/mean_length": 382.2734375, "completions/mean_terminated_length": 144.93779945373535, "completions/min_length": 91.75, "completions/min_terminated_length": 91.75, "entropy": 0.055487995967268944, "epoch": 0.4282655246252677, "frac_reward_zero_std": 0.265625, "grad_norm": 21.75, "learning_rate": 1.4473131483156326e-05, "loss": 0.6731, "num_tokens": 2154945.0, "reward": 1.291015625, "reward_std": 0.7789223082363605, "rewards/correctness_reward_func/mean": 0.609375, "rewards/correctness_reward_func/std": 0.9308035299181938, "rewards/int_reward_func/mean": 0.25, "rewards/int_reward_func/std": 0.2540716640651226, "rewards/soft_format_reward_func/mean": 0.25, "rewards/soft_format_reward_func/std": 0.24922906793653965, "rewards/strict_format_reward_func/mean": 0.0, "rewards/strict_format_reward_func/std": 0.0, "rewards/xmlcount_reward_func/mean": 0.181640625, "rewards/xmlcount_reward_func/std": 0.20568038523197174, "step": 50 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5859375, "completions/max_length": 512.0, "completions/max_terminated_length": 302.25, "completions/mean_length": 374.421875, "completions/mean_terminated_length": 177.11860466003418, "completions/min_length": 93.125, "completions/min_terminated_length": 93.125, "entropy": 0.05054905638098717, "epoch": 0.43683083511777304, "frac_reward_zero_std": 0.1875, "grad_norm": 24.125, "learning_rate": 1.4203572283095657e-05, "loss": 1.1031, "num_tokens": 2214589.0, "reward": 1.5400390625, "reward_std": 1.028895616531372, "rewards/correctness_reward_func/mean": 0.765625, "rewards/correctness_reward_func/std": 0.9744589924812317, "rewards/int_reward_func/mean": 0.2734375, "rewards/int_reward_func/std": 0.2540593519806862, "rewards/soft_format_reward_func/mean": 0.2890625, "rewards/soft_format_reward_func/std": 0.25303449109196663, "rewards/strict_format_reward_func/mean": 0.0, "rewards/strict_format_reward_func/std": 0.0, "rewards/xmlcount_reward_func/mean": 0.2119140625, "rewards/xmlcount_reward_func/std": 0.21405917219817638, "step": 51 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.7109375, "completions/max_length": 512.0, "completions/max_terminated_length": 252.25, "completions/mean_length": 412.453125, "completions/mean_terminated_length": 173.9702386856079, "completions/min_length": 113.25, "completions/min_terminated_length": 113.25, "entropy": 0.051852176897227764, "epoch": 0.44539614561027835, "frac_reward_zero_std": 0.1875, "grad_norm": 17.75, "learning_rate": 1.3930250316539237e-05, "loss": 1.013, "num_tokens": 2280025.0, "reward": 0.9990234375, "reward_std": 0.885264553129673, "rewards/correctness_reward_func/mean": 0.4375, "rewards/correctness_reward_func/std": 0.811570405960083, "rewards/int_reward_func/mean": 0.15625, "rewards/int_reward_func/std": 0.22798974812030792, "rewards/soft_format_reward_func/mean": 0.22265625, "rewards/soft_format_reward_func/std": 0.25226276740431786, "rewards/strict_format_reward_func/mean": 0.0, "rewards/strict_format_reward_func/std": 0.0, "rewards/xmlcount_reward_func/mean": 0.1826171875, "rewards/xmlcount_reward_func/std": 0.18229194171726704, "step": 52 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.515625, "completions/max_length": 512.0, "completions/max_terminated_length": 285.875, "completions/mean_length": 351.171875, "completions/mean_terminated_length": 178.26887321472168, "completions/min_length": 108.875, "completions/min_terminated_length": 108.875, "entropy": 0.0576583961956203, "epoch": 0.4539614561027837, "frac_reward_zero_std": 0.359375, "grad_norm": 21.125, "learning_rate": 1.3653410243663953e-05, "loss": 0.8749, "num_tokens": 2337543.0, "reward": 1.5859375, "reward_std": 0.7292038351297379, "rewards/correctness_reward_func/mean": 0.75, "rewards/correctness_reward_func/std": 0.9716326966881752, "rewards/int_reward_func/mean": 0.26953125, "rewards/int_reward_func/std": 0.2501082383096218, "rewards/soft_format_reward_func/mean": 0.29296875, "rewards/soft_format_reward_func/std": 0.24548756889998913, "rewards/strict_format_reward_func/mean": 0.0, "rewards/strict_format_reward_func/std": 0.0, "rewards/xmlcount_reward_func/mean": 0.2734375, "rewards/xmlcount_reward_func/std": 0.20937122404575348, "step": 53 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2109375, "completions/max_length": 512.0, "completions/max_terminated_length": 285.125, "completions/mean_length": 242.625, "completions/mean_terminated_length": 169.74182319641113, "completions/min_length": 97.375, "completions/min_terminated_length": 97.375, "entropy": 0.06909441482275724, "epoch": 0.4625267665952891, "frac_reward_zero_std": 0.46875, "grad_norm": 131.0, "learning_rate": 1.3373299873828303e-05, "loss": 0.9203, "num_tokens": 2379831.0, "reward": 2.27734375, "reward_std": 0.7844465803354979, "rewards/correctness_reward_func/mean": 1.125, "rewards/correctness_reward_func/std": 0.9970766380429268, "rewards/int_reward_func/mean": 0.37890625, "rewards/int_reward_func/std": 0.2144309040158987, "rewards/soft_format_reward_func/mean": 0.37890625, "rewards/soft_format_reward_func/std": 0.21300501003861427, "rewards/strict_format_reward_func/mean": 0.0078125, "rewards/strict_format_reward_func/std": 0.03125, "rewards/xmlcount_reward_func/mean": 0.38671875, "rewards/xmlcount_reward_func/std": 0.18055572640150785, "step": 54 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1484375, "completions/max_length": 512.0, "completions/max_terminated_length": 287.125, "completions/mean_length": 227.8671875, "completions/mean_terminated_length": 178.1862964630127, "completions/min_length": 82.875, "completions/min_terminated_length": 82.875, "entropy": 0.0683035789988935, "epoch": 0.47109207708779444, "frac_reward_zero_std": 0.53125, "grad_norm": 75.5, "learning_rate": 1.3090169943749475e-05, "loss": 0.7331, "num_tokens": 2421244.0, "reward": 2.6162109375, "reward_std": 0.5620946288108826, "rewards/correctness_reward_func/mean": 1.375, "rewards/correctness_reward_func/std": 0.9013157784938812, "rewards/int_reward_func/mean": 0.41015625, "rewards/int_reward_func/std": 0.18550433963537216, "rewards/soft_format_reward_func/mean": 0.4140625, "rewards/soft_format_reward_func/std": 0.17175541445612907, "rewards/strict_format_reward_func/mean": 0.0078125, "rewards/strict_format_reward_func/std": 0.03125, "rewards/xmlcount_reward_func/mean": 0.4091796875, "rewards/xmlcount_reward_func/std": 0.15384871885180473, "step": 55 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.171875, "completions/max_length": 512.0, "completions/max_terminated_length": 274.875, "completions/mean_length": 230.6484375, "completions/mean_terminated_length": 173.36169624328613, "completions/min_length": 101.5, "completions/min_terminated_length": 101.5, "entropy": 0.06412349035963416, "epoch": 0.4796573875802998, "frac_reward_zero_std": 0.609375, "grad_norm": 17.75, "learning_rate": 1.2804273893060028e-05, "loss": 0.8135, "num_tokens": 2462929.0, "reward": 2.8583984375, "reward_std": 0.5731431804597378, "rewards/correctness_reward_func/mean": 1.546875, "rewards/correctness_reward_func/std": 0.8076042532920837, "rewards/int_reward_func/mean": 0.43359375, "rewards/int_reward_func/std": 0.15128782019019127, "rewards/soft_format_reward_func/mean": 0.4453125, "rewards/soft_format_reward_func/std": 0.12609326466917992, "rewards/strict_format_reward_func/mean": 0.01953125, "rewards/strict_format_reward_func/std": 0.06822281517088413, "rewards/xmlcount_reward_func/mean": 0.4130859375, "rewards/xmlcount_reward_func/std": 0.14727921038866043, "step": 56 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 481.625, "completions/max_terminated_length": 265.5, "completions/mean_length": 213.640625, "completions/mean_terminated_length": 158.69331169128418, "completions/min_length": 87.125, "completions/min_terminated_length": 87.125, "entropy": 0.06292425934225321, "epoch": 0.48822269807280516, "frac_reward_zero_std": 0.46875, "grad_norm": 15.0, "learning_rate": 1.2515867637445088e-05, "loss": 0.5508, "num_tokens": 2502783.0, "reward": 2.8046875, "reward_std": 0.6159562915563583, "rewards/correctness_reward_func/mean": 1.34375, "rewards/correctness_reward_func/std": 0.9502372145652771, "rewards/int_reward_func/mean": 0.41015625, "rewards/int_reward_func/std": 0.17638970352709293, "rewards/soft_format_reward_func/mean": 0.4296875, "rewards/soft_format_reward_func/std": 0.15997907333076, "rewards/strict_format_reward_func/mean": 0.25, "rewards/strict_format_reward_func/std": 0.24410519748926163, "rewards/xmlcount_reward_func/mean": 0.37109375, "rewards/xmlcount_reward_func/std": 0.1568639986217022, "step": 57 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.109375, "completions/max_length": 439.75, "completions/max_terminated_length": 258.375, "completions/mean_length": 189.484375, "completions/mean_terminated_length": 149.9307737350464, "completions/min_length": 81.625, "completions/min_terminated_length": 81.625, "entropy": 0.07262948993593454, "epoch": 0.49678800856531047, "frac_reward_zero_std": 0.671875, "grad_norm": 25.375, "learning_rate": 1.2225209339563144e-05, "loss": 0.5996, "num_tokens": 2539729.0, "reward": 3.1650390625, "reward_std": 0.42951212264597416, "rewards/correctness_reward_func/mean": 1.40625, "rewards/correctness_reward_func/std": 0.9186194837093353, "rewards/int_reward_func/mean": 0.44921875, "rewards/int_reward_func/std": 0.11343478411436081, "rewards/soft_format_reward_func/mean": 0.44921875, "rewards/soft_format_reward_func/std": 0.1270910371094942, "rewards/strict_format_reward_func/mean": 0.421875, "rewards/strict_format_reward_func/std": 0.15183541178703308, "rewards/xmlcount_reward_func/mean": 0.4384765625, "rewards/xmlcount_reward_func/std": 0.1242168415337801, "step": 58 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0390625, "completions/max_length": 428.125, "completions/max_terminated_length": 271.75, "completions/mean_length": 168.0390625, "completions/mean_terminated_length": 153.7322940826416, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "entropy": 0.06558680208399892, "epoch": 0.5053533190578159, "frac_reward_zero_std": 0.71875, "grad_norm": 13.6875, "learning_rate": 1.1932559177955533e-05, "loss": 0.6108, "num_tokens": 2573054.0, "reward": 3.6171875, "reward_std": 0.38669902086257935, "rewards/correctness_reward_func/mean": 1.703125, "rewards/correctness_reward_func/std": 0.7130631133913994, "rewards/int_reward_func/mean": 0.46484375, "rewards/int_reward_func/std": 0.09914018586277962, "rewards/soft_format_reward_func/mean": 0.48828125, "rewards/soft_format_reward_func/std": 0.046875, "rewards/strict_format_reward_func/mean": 0.47265625, "rewards/strict_format_reward_func/std": 0.09947281517088413, "rewards/xmlcount_reward_func/mean": 0.48828125, "rewards/xmlcount_reward_func/std": 0.046875, "step": 59 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 354.25, "completions/max_terminated_length": 267.375, "completions/mean_length": 166.265625, "completions/mean_terminated_length": 155.07939338684082, "completions/min_length": 84.25, "completions/min_terminated_length": 84.25, "entropy": 0.0687381848692894, "epoch": 0.5139186295503212, "frac_reward_zero_std": 0.6875, "grad_norm": 26.25, "learning_rate": 1.1638179114151378e-05, "loss": 0.2988, "num_tokens": 2606004.0, "reward": 3.314453125, "reward_std": 0.43365532672032714, "rewards/correctness_reward_func/mean": 1.4375, "rewards/correctness_reward_func/std": 0.8874192461371422, "rewards/int_reward_func/mean": 0.46484375, "rewards/int_reward_func/std": 0.10904237069189548, "rewards/soft_format_reward_func/mean": 0.4765625, "rewards/soft_format_reward_func/std": 0.062167370691895485, "rewards/strict_format_reward_func/mean": 0.46484375, "rewards/strict_format_reward_func/std": 0.08923800103366375, "rewards/xmlcount_reward_func/mean": 0.470703125, "rewards/xmlcount_reward_func/std": 0.07348538748919964, "step": 60 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 394.875, "completions/max_terminated_length": 299.5, "completions/mean_length": 167.234375, "completions/mean_terminated_length": 155.96860313415527, "completions/min_length": 81.875, "completions/min_terminated_length": 81.875, "entropy": 0.06962216552346945, "epoch": 0.5224839400428265, "frac_reward_zero_std": 0.796875, "grad_norm": 13.0625, "learning_rate": 1.1342332658176556e-05, "loss": 0.2338, "num_tokens": 2639860.0, "reward": 3.4140625, "reward_std": 0.19334950856864452, "rewards/correctness_reward_func/mean": 1.515625, "rewards/correctness_reward_func/std": 0.8358171209692955, "rewards/int_reward_func/mean": 0.4765625, "rewards/int_reward_func/std": 0.07394563034176826, "rewards/soft_format_reward_func/mean": 0.4765625, "rewards/soft_format_reward_func/std": 0.07394563034176826, "rewards/strict_format_reward_func/mean": 0.46875, "rewards/strict_format_reward_func/std": 0.08351518586277962, "rewards/xmlcount_reward_func/mean": 0.4765625, "rewards/xmlcount_reward_func/std": 0.06315224710851908, "step": 61 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 390.375, "completions/max_terminated_length": 261.25, "completions/mean_length": 174.2734375, "completions/mean_terminated_length": 157.4187536239624, "completions/min_length": 87.5, "completions/min_terminated_length": 87.5, "entropy": 0.06758250948041677, "epoch": 0.5310492505353319, "frac_reward_zero_std": 0.671875, "grad_norm": 11.0, "learning_rate": 1.1045284632676535e-05, "loss": 0.1924, "num_tokens": 2674451.0, "reward": 3.5439453125, "reward_std": 0.4018907658755779, "rewards/correctness_reward_func/mean": 1.609375, "rewards/correctness_reward_func/std": 0.7834457755088806, "rewards/int_reward_func/mean": 0.48828125, "rewards/int_reward_func/std": 0.03697281517088413, "rewards/soft_format_reward_func/mean": 0.4921875, "rewards/soft_format_reward_func/std": 0.03125, "rewards/strict_format_reward_func/mean": 0.46875, "rewards/strict_format_reward_func/std": 0.0952934455126524, "rewards/xmlcount_reward_func/mean": 0.4853515625, "rewards/xmlcount_reward_func/std": 0.04692569188773632, "step": 62 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 269.5, "completions/max_terminated_length": 241.5, "completions/mean_length": 148.2890625, "completions/mean_terminated_length": 145.62083435058594, "completions/min_length": 89.625, "completions/min_terminated_length": 89.625, "entropy": 0.0636401055380702, "epoch": 0.5396145610278372, "frac_reward_zero_std": 0.8125, "grad_norm": 8.125, "learning_rate": 1.0747300935864245e-05, "loss": 0.0794, "num_tokens": 2705286.0, "reward": 3.6806640625, "reward_std": 0.2637839764356613, "rewards/correctness_reward_func/mean": 1.703125, "rewards/correctness_reward_func/std": 0.5994666591286659, "rewards/int_reward_func/mean": 0.484375, "rewards/int_reward_func/std": 0.042695630341768265, "rewards/soft_format_reward_func/mean": 0.5, "rewards/soft_format_reward_func/std": 0.0, "rewards/strict_format_reward_func/mean": 0.49609375, "rewards/strict_format_reward_func/std": 0.015625, "rewards/xmlcount_reward_func/mean": 0.4970703125, "rewards/xmlcount_reward_func/std": 0.01171875, "step": 63 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 282.875, "completions/max_terminated_length": 242.0, "completions/mean_length": 154.0546875, "completions/mean_terminated_length": 151.16770935058594, "completions/min_length": 82.75, "completions/min_terminated_length": 82.75, "entropy": 0.06001953314989805, "epoch": 0.5481798715203426, "frac_reward_zero_std": 0.84375, "grad_norm": 7.78125, "learning_rate": 1.044864830350515e-05, "loss": 0.045, "num_tokens": 2737255.0, "reward": 3.6259765625, "reward_std": 0.1643470786511898, "rewards/correctness_reward_func/mean": 1.65625, "rewards/correctness_reward_func/std": 0.6852209344506264, "rewards/int_reward_func/mean": 0.4921875, "rewards/int_reward_func/std": 0.021347815170884132, "rewards/soft_format_reward_func/mean": 0.5, "rewards/soft_format_reward_func/std": 0.0, "rewards/strict_format_reward_func/mean": 0.484375, "rewards/strict_format_reward_func/std": 0.0625, "rewards/xmlcount_reward_func/mean": 0.4931640625, "rewards/xmlcount_reward_func/std": 0.02734375, "step": 64 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 323.5, "completions/max_terminated_length": 268.75, "completions/mean_length": 161.921875, "completions/mean_terminated_length": 154.10967445373535, "completions/min_length": 83.125, "completions/min_terminated_length": 83.125, "entropy": 0.06517814612016082, "epoch": 0.556745182012848, "frac_reward_zero_std": 0.796875, "grad_norm": 7.8125, "learning_rate": 1.0149594070152638e-05, "loss": 0.0963, "num_tokens": 2770597.0, "reward": 3.3662109375, "reward_std": 0.29969173669815063, "rewards/correctness_reward_func/mean": 1.4375, "rewards/correctness_reward_func/std": 0.8671257123351097, "rewards/int_reward_func/mean": 0.46875, "rewards/int_reward_func/std": 0.08054866641759872, "rewards/soft_format_reward_func/mean": 0.48828125, "rewards/soft_format_reward_func/std": 0.03697281517088413, "rewards/strict_format_reward_func/mean": 0.484375, "rewards/strict_format_reward_func/std": 0.05259781517088413, "rewards/xmlcount_reward_func/mean": 0.4873046875, "rewards/xmlcount_reward_func/std": 0.04087906517088413, "step": 65 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1171875, "completions/max_length": 453.75, "completions/max_terminated_length": 281.875, "completions/mean_length": 192.828125, "completions/mean_terminated_length": 151.42723655700684, "completions/min_length": 82.125, "completions/min_terminated_length": 82.125, "entropy": 0.05498858401551843, "epoch": 0.5653104925053534, "frac_reward_zero_std": 0.796875, "grad_norm": 10.5, "learning_rate": 9.850405929847367e-06, "loss": 0.2824, "num_tokens": 2807611.0, "reward": 3.3330078125, "reward_std": 0.3135024200892076, "rewards/correctness_reward_func/mean": 1.578125, "rewards/correctness_reward_func/std": 0.8142120242118835, "rewards/int_reward_func/mean": 0.4375, "rewards/int_reward_func/std": 0.15119514800608158, "rewards/soft_format_reward_func/mean": 0.4453125, "rewards/soft_format_reward_func/std": 0.11994514800608158, "rewards/strict_format_reward_func/mean": 0.43359375, "rewards/strict_format_reward_func/std": 0.1569179631769657, "rewards/xmlcount_reward_func/mean": 0.4384765625, "rewards/xmlcount_reward_func/std": 0.15258620493113995, "step": 66 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0390625, "completions/max_length": 377.625, "completions/max_terminated_length": 273.0, "completions/mean_length": 154.65625, "completions/mean_terminated_length": 139.93846988677979, "completions/min_length": 75.75, "completions/min_terminated_length": 75.75, "entropy": 0.0641864649951458, "epoch": 0.5738758029978587, "frac_reward_zero_std": 0.796875, "grad_norm": 14.25, "learning_rate": 9.551351696494854e-06, "loss": 0.3309, "num_tokens": 2839197.0, "reward": 3.3779296875, "reward_std": 0.321788830216974, "rewards/correctness_reward_func/mean": 1.484375, "rewards/correctness_reward_func/std": 0.8679328411817551, "rewards/int_reward_func/mean": 0.4609375, "rewards/int_reward_func/std": 0.11664126068353653, "rewards/soft_format_reward_func/mean": 0.48046875, "rewards/soft_format_reward_func/std": 0.058320630341768265, "rewards/strict_format_reward_func/mean": 0.4765625, "rewards/strict_format_reward_func/std": 0.0640434455126524, "rewards/xmlcount_reward_func/mean": 0.4755859375, "rewards/xmlcount_reward_func/std": 0.07196457590907812, "step": 67 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0390625, "completions/max_length": 324.125, "completions/max_terminated_length": 250.75, "completions/mean_length": 156.265625, "completions/mean_terminated_length": 141.62864875793457, "completions/min_length": 72.625, "completions/min_terminated_length": 72.625, "entropy": 0.05262026563286781, "epoch": 0.582441113490364, "frac_reward_zero_std": 0.875, "grad_norm": 42.5, "learning_rate": 9.252699064135759e-06, "loss": 0.2027, "num_tokens": 2870879.0, "reward": 3.443359375, "reward_std": 0.1905873753130436, "rewards/correctness_reward_func/mean": 1.546875, "rewards/correctness_reward_func/std": 0.697150319814682, "rewards/int_reward_func/mean": 0.46484375, "rewards/int_reward_func/std": 0.09914018586277962, "rewards/soft_format_reward_func/mean": 0.48046875, "rewards/soft_format_reward_func/std": 0.05644455552101135, "rewards/strict_format_reward_func/mean": 0.47265625, "rewards/strict_format_reward_func/std": 0.07482585124671459, "rewards/xmlcount_reward_func/mean": 0.478515625, "rewards/xmlcount_reward_func/std": 0.05138835124671459, "step": 68 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 314.25, "completions/max_terminated_length": 225.875, "completions/mean_length": 140.921875, "completions/mean_terminated_length": 131.4558048248291, "completions/min_length": 65.25, "completions/min_terminated_length": 65.25, "entropy": 0.05961746862158179, "epoch": 0.5910064239828694, "frac_reward_zero_std": 0.734375, "grad_norm": 29.5, "learning_rate": 8.954715367323468e-06, "loss": 0.2871, "num_tokens": 2900613.0, "reward": 3.34375, "reward_std": 0.44194173626601696, "rewards/correctness_reward_func/mean": 1.515625, "rewards/correctness_reward_func/std": 0.8490326702594757, "rewards/int_reward_func/mean": 0.4609375, "rewards/int_reward_func/std": 0.0783399622887373, "rewards/soft_format_reward_func/mean": 0.4609375, "rewards/soft_format_reward_func/std": 0.09120866656303406, "rewards/strict_format_reward_func/mean": 0.4453125, "rewards/strict_format_reward_func/std": 0.13093777745962143, "rewards/xmlcount_reward_func/mean": 0.4609375, "rewards/xmlcount_reward_func/std": 0.0871200654655695, "step": 69 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 289.0, "completions/max_terminated_length": 257.0, "completions/mean_length": 149.90625, "completions/mean_terminated_length": 147.21406364440918, "completions/min_length": 84.25, "completions/min_terminated_length": 84.25, "entropy": 0.06772361230105162, "epoch": 0.5995717344753747, "frac_reward_zero_std": 0.84375, "grad_norm": 6.15625, "learning_rate": 8.657667341823449e-06, "loss": 0.0863, "num_tokens": 2931599.0, "reward": 3.6689453125, "reward_std": 0.20301698334515095, "rewards/correctness_reward_func/mean": 1.6875, "rewards/correctness_reward_func/std": 0.5998296737670898, "rewards/int_reward_func/mean": 0.4921875, "rewards/int_reward_func/std": 0.03125, "rewards/soft_format_reward_func/mean": 0.49609375, "rewards/soft_format_reward_func/std": 0.015625, "rewards/strict_format_reward_func/mean": 0.49609375, "rewards/strict_format_reward_func/std": 0.015625, "rewards/xmlcount_reward_func/mean": 0.4970703125, "rewards/xmlcount_reward_func/std": 0.01171875, "step": 70 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 268.0, "completions/max_terminated_length": 268.0, "completions/mean_length": 149.4375, "completions/mean_terminated_length": 149.4375, "completions/min_length": 71.75, "completions/min_terminated_length": 71.75, "entropy": 0.06804852467030287, "epoch": 0.6081370449678801, "frac_reward_zero_std": 0.671875, "grad_norm": 14.75, "learning_rate": 8.361820885848623e-06, "loss": -0.1228, "num_tokens": 2963581.0, "reward": 3.4091796875, "reward_std": 0.4819926954805851, "rewards/correctness_reward_func/mean": 1.46875, "rewards/correctness_reward_func/std": 0.85780418664217, "rewards/int_reward_func/mean": 0.48046875, "rewards/int_reward_func/std": 0.05644455552101135, "rewards/soft_format_reward_func/mean": 0.48828125, "rewards/soft_format_reward_func/std": 0.025194555521011353, "rewards/strict_format_reward_func/mean": 0.484375, "rewards/strict_format_reward_func/std": 0.04081955552101135, "rewards/xmlcount_reward_func/mean": 0.4873046875, "rewards/xmlcount_reward_func/std": 0.029100805521011353, "step": 71 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 276.25, "completions/max_terminated_length": 237.375, "completions/mean_length": 143.1640625, "completions/mean_terminated_length": 140.2427101135254, "completions/min_length": 73.125, "completions/min_terminated_length": 73.125, "entropy": 0.06070453533902764, "epoch": 0.6167023554603854, "frac_reward_zero_std": 0.78125, "grad_norm": 8.9375, "learning_rate": 8.06744082204447e-06, "loss": 0.1705, "num_tokens": 2993796.0, "reward": 3.6064453125, "reward_std": 0.3135024197399616, "rewards/correctness_reward_func/mean": 1.640625, "rewards/correctness_reward_func/std": 0.7274979203939438, "rewards/int_reward_func/mean": 0.4765625, "rewards/int_reward_func/std": 0.07394563034176826, "rewards/soft_format_reward_func/mean": 0.5, "rewards/soft_format_reward_func/std": 0.0, "rewards/strict_format_reward_func/mean": 0.4921875, "rewards/strict_format_reward_func/std": 0.03125, "rewards/xmlcount_reward_func/mean": 0.4970703125, "rewards/xmlcount_reward_func/std": 0.01171875, "step": 72 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 314.5, "completions/max_terminated_length": 300.125, "completions/mean_length": 152.578125, "completions/mean_terminated_length": 147.3067741394043, "completions/min_length": 77.5, "completions/min_terminated_length": 77.5, "entropy": 0.07308987434953451, "epoch": 0.6252676659528907, "frac_reward_zero_std": 0.78125, "grad_norm": 7.65625, "learning_rate": 7.774790660436857e-06, "loss": 0.0711, "num_tokens": 3025438.0, "reward": 3.4619140625, "reward_std": 0.2637839764356613, "rewards/correctness_reward_func/mean": 1.515625, "rewards/correctness_reward_func/std": 0.8590980246663094, "rewards/int_reward_func/mean": 0.47265625, "rewards/int_reward_func/std": 0.08957063034176826, "rewards/soft_format_reward_func/mean": 0.4921875, "rewards/soft_format_reward_func/std": 0.03125, "rewards/strict_format_reward_func/mean": 0.48828125, "rewards/strict_format_reward_func/std": 0.03697281517088413, "rewards/xmlcount_reward_func/mean": 0.4931640625, "rewards/xmlcount_reward_func/std": 0.023821823298931122, "step": 73 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 305.0, "completions/max_terminated_length": 273.75, "completions/mean_length": 165.515625, "completions/mean_terminated_length": 162.84323120117188, "completions/min_length": 89.625, "completions/min_terminated_length": 89.625, "entropy": 0.06814676942303777, "epoch": 0.6338329764453962, "frac_reward_zero_std": 0.8125, "grad_norm": 6.625, "learning_rate": 7.484132362554915e-06, "loss": 0.163, "num_tokens": 3059668.0, "reward": 3.490234375, "reward_std": 0.22097087278962135, "rewards/correctness_reward_func/mean": 1.53125, "rewards/correctness_reward_func/std": 0.8542027324438095, "rewards/int_reward_func/mean": 0.484375, "rewards/int_reward_func/std": 0.0625, "rewards/soft_format_reward_func/mean": 0.5, "rewards/soft_format_reward_func/std": 0.0, "rewards/strict_format_reward_func/mean": 0.48046875, "rewards/strict_format_reward_func/std": 0.06822281517088413, "rewards/xmlcount_reward_func/mean": 0.494140625, "rewards/xmlcount_reward_func/std": 0.020961953792721033, "step": 74 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 289.625, "completions/max_terminated_length": 228.75, "completions/mean_length": 145.8671875, "completions/mean_terminated_length": 140.4114589691162, "completions/min_length": 80.75, "completions/min_terminated_length": 80.75, "entropy": 0.06801354000344872, "epoch": 0.6423982869379015, "frac_reward_zero_std": 0.8125, "grad_norm": 24.875, "learning_rate": 7.1957261069399745e-06, "loss": 0.1365, "num_tokens": 3089965.0, "reward": 3.5849609375, "reward_std": 0.26378397084772587, "rewards/correctness_reward_func/mean": 1.671875, "rewards/correctness_reward_func/std": 0.7019384130835533, "rewards/int_reward_func/mean": 0.47265625, "rewards/int_reward_func/std": 0.08769455552101135, "rewards/soft_format_reward_func/mean": 0.48828125, "rewards/soft_format_reward_func/std": 0.025194555521011353, "rewards/strict_format_reward_func/mean": 0.47265625, "rewards/strict_format_reward_func/std": 0.06492366641759872, "rewards/xmlcount_reward_func/mean": 0.4794921875, "rewards/xmlcount_reward_func/std": 0.05274027772247791, "step": 75 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 276.625, "completions/max_terminated_length": 276.625, "completions/mean_length": 151.796875, "completions/mean_terminated_length": 151.796875, "completions/min_length": 80.125, "completions/min_terminated_length": 80.125, "entropy": 0.06695270165801048, "epoch": 0.6509635974304069, "frac_reward_zero_std": 0.90625, "grad_norm": 4.125, "learning_rate": 6.909830056250527e-06, "loss": -0.0215, "num_tokens": 3121975.0, "reward": 3.591796875, "reward_std": 0.12153397500514984, "rewards/correctness_reward_func/mean": 1.609375, "rewards/correctness_reward_func/std": 0.6680332496762276, "rewards/int_reward_func/mean": 0.4921875, "rewards/int_reward_func/std": 0.03125, "rewards/soft_format_reward_func/mean": 0.5, "rewards/soft_format_reward_func/std": 0.0, "rewards/strict_format_reward_func/mean": 0.4921875, "rewards/strict_format_reward_func/std": 0.021347815170884132, "rewards/xmlcount_reward_func/mean": 0.498046875, "rewards/xmlcount_reward_func/std": 0.005336953792721033, "step": 76 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 334.375, "completions/max_terminated_length": 308.625, "completions/mean_length": 162.4765625, "completions/mean_terminated_length": 157.2375030517578, "completions/min_length": 79.25, "completions/min_terminated_length": 79.25, "entropy": 0.06329814763739705, "epoch": 0.6595289079229122, "frac_reward_zero_std": 0.796875, "grad_norm": 19.25, "learning_rate": 6.6267001261717015e-06, "loss": 0.1003, "num_tokens": 3154864.0, "reward": 3.5283203125, "reward_std": 0.3135024178773165, "rewards/correctness_reward_func/mean": 1.59375, "rewards/correctness_reward_func/std": 0.6435378566384315, "rewards/int_reward_func/mean": 0.48046875, "rewards/int_reward_func/std": 0.05644455552101135, "rewards/soft_format_reward_func/mean": 0.48828125, "rewards/soft_format_reward_func/std": 0.025194555521011353, "rewards/strict_format_reward_func/mean": 0.48046875, "rewards/strict_format_reward_func/std": 0.05644455552101135, "rewards/xmlcount_reward_func/mean": 0.4853515625, "rewards/xmlcount_reward_func/std": 0.03691330552101135, "step": 77 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 296.875, "completions/max_terminated_length": 238.875, "completions/mean_length": 151.8203125, "completions/mean_terminated_length": 146.38125228881836, "completions/min_length": 76.5, "completions/min_terminated_length": 76.5, "entropy": 0.05917734792456031, "epoch": 0.6680942184154176, "frac_reward_zero_std": 0.796875, "grad_norm": 10.8125, "learning_rate": 6.34658975633605e-06, "loss": 0.1964, "num_tokens": 3186189.0, "reward": 3.5595703125, "reward_std": 0.22511406615376472, "rewards/correctness_reward_func/mean": 1.609375, "rewards/correctness_reward_func/std": 0.8085274025797844, "rewards/int_reward_func/mean": 0.48828125, "rewards/int_reward_func/std": 0.046875, "rewards/soft_format_reward_func/mean": 0.4921875, "rewards/soft_format_reward_func/std": 0.03125, "rewards/strict_format_reward_func/mean": 0.48046875, "rewards/strict_format_reward_func/std": 0.078125, "rewards/xmlcount_reward_func/mean": 0.4892578125, "rewards/xmlcount_reward_func/std": 0.04296875, "step": 78 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 340.0, "completions/max_terminated_length": 328.75, "completions/mean_length": 157.328125, "completions/mean_terminated_length": 154.62604331970215, "completions/min_length": 82.5, "completions/min_terminated_length": 82.5, "entropy": 0.07690681796520948, "epoch": 0.6766595289079229, "frac_reward_zero_std": 0.875, "grad_norm": 38.0, "learning_rate": 6.069749683460765e-06, "loss": 0.0732, "num_tokens": 3218069.0, "reward": 3.439453125, "reward_std": 0.21820873208343983, "rewards/correctness_reward_func/mean": 1.5625, "rewards/correctness_reward_func/std": 0.8027089610695839, "rewards/int_reward_func/mean": 0.46484375, "rewards/int_reward_func/std": 0.07073915377259254, "rewards/soft_format_reward_func/mean": 0.47265625, "rewards/soft_format_reward_func/std": 0.0660141110420227, "rewards/strict_format_reward_func/mean": 0.46875, "rewards/strict_format_reward_func/std": 0.07173692621290684, "rewards/xmlcount_reward_func/mean": 0.470703125, "rewards/xmlcount_reward_func/std": 0.06738616153597832, "step": 79 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 296.25, "completions/max_terminated_length": 270.25, "completions/mean_length": 157.2421875, "completions/mean_terminated_length": 148.20973587036133, "completions/min_length": 79.875, "completions/min_terminated_length": 79.875, "entropy": 0.06810100981965661, "epoch": 0.6852248394004282, "frac_reward_zero_std": 0.859375, "grad_norm": 13.0, "learning_rate": 5.796427716904347e-06, "loss": -0.0428, "num_tokens": 3250002.0, "reward": 3.4755859375, "reward_std": 0.18920630402863026, "rewards/correctness_reward_func/mean": 1.546875, "rewards/correctness_reward_func/std": 0.8281612768769264, "rewards/int_reward_func/mean": 0.46875, "rewards/int_reward_func/std": 0.08351518586277962, "rewards/soft_format_reward_func/mean": 0.48828125, "rewards/soft_format_reward_func/std": 0.025194555521011353, "rewards/strict_format_reward_func/mean": 0.484375, "rewards/strict_format_reward_func/std": 0.04081955552101135, "rewards/xmlcount_reward_func/mean": 0.4873046875, "rewards/xmlcount_reward_func/std": 0.029100805521011353, "step": 80 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 295.0, "completions/max_terminated_length": 241.75, "completions/mean_length": 148.1953125, "completions/mean_terminated_length": 139.70900535583496, "completions/min_length": 78.5, "completions/min_terminated_length": 78.5, "entropy": 0.06864482956007123, "epoch": 0.6937901498929336, "frac_reward_zero_std": 0.8125, "grad_norm": 9.4375, "learning_rate": 5.526868516843673e-06, "loss": 0.3268, "num_tokens": 3280593.0, "reward": 3.451171875, "reward_std": 0.2458300832659006, "rewards/correctness_reward_func/mean": 1.484375, "rewards/correctness_reward_func/std": 0.8385421559214592, "rewards/int_reward_func/mean": 0.4921875, "rewards/int_reward_func/std": 0.021347815170884132, "rewards/soft_format_reward_func/mean": 0.49609375, "rewards/soft_format_reward_func/std": 0.015625, "rewards/strict_format_reward_func/mean": 0.48828125, "rewards/strict_format_reward_func/std": 0.03697281517088413, "rewards/xmlcount_reward_func/mean": 0.490234375, "rewards/xmlcount_reward_func/std": 0.030614666640758514, "step": 81 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0390625, "completions/max_length": 376.625, "completions/max_terminated_length": 287.0, "completions/mean_length": 170.5625, "completions/mean_terminated_length": 156.5812530517578, "completions/min_length": 78.375, "completions/min_terminated_length": 78.375, "entropy": 0.05719508696347475, "epoch": 0.702355460385439, "frac_reward_zero_std": 0.796875, "grad_norm": 21.75, "learning_rate": 5.2613133752700145e-06, "loss": 0.3157, "num_tokens": 3314333.0, "reward": 3.5078125, "reward_std": 0.27345145121216774, "rewards/correctness_reward_func/mean": 1.609375, "rewards/correctness_reward_func/std": 0.6161131635308266, "rewards/int_reward_func/mean": 0.46484375, "rewards/int_reward_func/std": 0.08923800103366375, "rewards/soft_format_reward_func/mean": 0.48046875, "rewards/soft_format_reward_func/std": 0.05644455552101135, "rewards/strict_format_reward_func/mean": 0.47265625, "rewards/strict_format_reward_func/std": 0.06789018586277962, "rewards/xmlcount_reward_func/mean": 0.48046875, "rewards/xmlcount_reward_func/std": 0.049400702118873596, "step": 82 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 353.0, "completions/max_terminated_length": 253.125, "completions/mean_length": 157.8515625, "completions/mean_terminated_length": 149.17083740234375, "completions/min_length": 74.875, "completions/min_terminated_length": 74.875, "entropy": 0.06622256385162473, "epoch": 0.7109207708779444, "frac_reward_zero_std": 0.8125, "grad_norm": 15.875, "learning_rate": 5.000000000000003e-06, "loss": 0.292, "num_tokens": 3346592.0, "reward": 3.5224609375, "reward_std": 0.3217888306826353, "rewards/correctness_reward_func/mean": 1.625, "rewards/correctness_reward_func/std": 0.654181070625782, "rewards/int_reward_func/mean": 0.48046875, "rewards/int_reward_func/std": 0.05644455552101135, "rewards/soft_format_reward_func/mean": 0.4765625, "rewards/soft_format_reward_func/std": 0.05920085124671459, "rewards/strict_format_reward_func/mean": 0.46484375, "rewards/strict_format_reward_func/std": 0.10607585124671459, "rewards/xmlcount_reward_func/mean": 0.4755859375, "rewards/xmlcount_reward_func/std": 0.06840440817177296, "step": 83 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0546875, "completions/max_length": 409.75, "completions/max_terminated_length": 250.875, "completions/mean_length": 167.9375, "completions/mean_terminated_length": 148.38951301574707, "completions/min_length": 73.125, "completions/min_terminated_length": 73.125, "entropy": 0.05832461267709732, "epoch": 0.7194860813704497, "frac_reward_zero_std": 0.828125, "grad_norm": 9.875, "learning_rate": 4.743162301894952e-06, "loss": 0.4018, "num_tokens": 3380156.0, "reward": 3.4765625, "reward_std": 0.23201941419392824, "rewards/correctness_reward_func/mean": 1.5625, "rewards/correctness_reward_func/std": 0.8384527564048767, "rewards/int_reward_func/mean": 0.4765625, "rewards/int_reward_func/std": 0.07394563034176826, "rewards/soft_format_reward_func/mean": 0.484375, "rewards/soft_format_reward_func/std": 0.05259781517088413, "rewards/strict_format_reward_func/mean": 0.46875, "rewards/strict_format_reward_func/std": 0.0952934455126524, "rewards/xmlcount_reward_func/mean": 0.484375, "rewards/xmlcount_reward_func/std": 0.05507336184382439, "step": 84 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 318.375, "completions/max_terminated_length": 290.375, "completions/mean_length": 158.484375, "completions/mean_terminated_length": 152.8973217010498, "completions/min_length": 78.0, "completions/min_terminated_length": 78.0, "entropy": 0.07197121158242226, "epoch": 0.728051391862955, "frac_reward_zero_std": 0.78125, "grad_norm": 6.625, "learning_rate": 4.491030185478976e-06, "loss": 0.2278, "num_tokens": 3413046.0, "reward": 3.6376953125, "reward_std": 0.2803567871451378, "rewards/correctness_reward_func/mean": 1.671875, "rewards/correctness_reward_func/std": 0.6848579198122025, "rewards/int_reward_func/mean": 0.484375, "rewards/int_reward_func/std": 0.04081955552101135, "rewards/soft_format_reward_func/mean": 0.5, "rewards/soft_format_reward_func/std": 0.0, "rewards/strict_format_reward_func/mean": 0.484375, "rewards/strict_format_reward_func/std": 0.04081955552101135, "rewards/xmlcount_reward_func/mean": 0.4970703125, "rewards/xmlcount_reward_func/std": 0.01171875, "step": 85 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0390625, "completions/max_length": 370.25, "completions/max_terminated_length": 291.125, "completions/mean_length": 175.3828125, "completions/mean_terminated_length": 161.8246307373047, "completions/min_length": 83.75, "completions/min_terminated_length": 83.75, "entropy": 0.06557085691019893, "epoch": 0.7366167023554604, "frac_reward_zero_std": 0.734375, "grad_norm": 13.4375, "learning_rate": 4.2438293431432665e-06, "loss": 0.3556, "num_tokens": 3447325.0, "reward": 3.4638671875, "reward_std": 0.30521600786596537, "rewards/correctness_reward_func/mean": 1.5625, "rewards/correctness_reward_func/std": 0.7508078292012215, "rewards/int_reward_func/mean": 0.46875, "rewards/int_reward_func/std": 0.06689241342246532, "rewards/soft_format_reward_func/mean": 0.484375, "rewards/soft_format_reward_func/std": 0.04081955552101135, "rewards/strict_format_reward_func/mean": 0.46484375, "rewards/strict_format_reward_func/std": 0.08627148158848286, "rewards/xmlcount_reward_func/mean": 0.4833984375, "rewards/xmlcount_reward_func/std": 0.03715440817177296, "step": 86 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0390625, "completions/max_length": 423.75, "completions/max_terminated_length": 293.125, "completions/mean_length": 168.0390625, "completions/mean_terminated_length": 154.1963596343994, "completions/min_length": 77.625, "completions/min_terminated_length": 77.625, "entropy": 0.06665381882339716, "epoch": 0.7451820128479657, "frac_reward_zero_std": 0.796875, "grad_norm": 8.625, "learning_rate": 4.001781053120863e-06, "loss": 0.2807, "num_tokens": 3481046.0, "reward": 3.435546875, "reward_std": 0.2762135900557041, "rewards/correctness_reward_func/mean": 1.515625, "rewards/correctness_reward_func/std": 0.7195080667734146, "rewards/int_reward_func/mean": 0.4609375, "rewards/int_reward_func/std": 0.09011822193861008, "rewards/soft_format_reward_func/mean": 0.48828125, "rewards/soft_format_reward_func/std": 0.046875, "rewards/strict_format_reward_func/mean": 0.48046875, "rewards/strict_format_reward_func/std": 0.078125, "rewards/xmlcount_reward_func/mean": 0.490234375, "rewards/xmlcount_reward_func/std": 0.0390625, "step": 87 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 390.75, "completions/max_terminated_length": 273.625, "completions/mean_length": 171.125, "completions/mean_terminated_length": 160.2234401702881, "completions/min_length": 88.25, "completions/min_terminated_length": 88.25, "entropy": 0.060074358712881804, "epoch": 0.7537473233404711, "frac_reward_zero_std": 0.796875, "grad_norm": 6.71875, "learning_rate": 3.7651019814126656e-06, "loss": 0.3052, "num_tokens": 3515172.0, "reward": 3.5400390625, "reward_std": 0.2748325187712908, "rewards/correctness_reward_func/mean": 1.59375, "rewards/correctness_reward_func/std": 0.7068260312080383, "rewards/int_reward_func/mean": 0.4765625, "rewards/int_reward_func/std": 0.062167370691895485, "rewards/soft_format_reward_func/mean": 0.49609375, "rewards/soft_format_reward_func/std": 0.015625, "rewards/strict_format_reward_func/mean": 0.48046875, "rewards/strict_format_reward_func/std": 0.078125, "rewards/xmlcount_reward_func/mean": 0.4931640625, "rewards/xmlcount_reward_func/std": 0.02734375, "step": 88 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 339.25, "completions/max_terminated_length": 269.0, "completions/mean_length": 155.5546875, "completions/mean_terminated_length": 143.96582794189453, "completions/min_length": 78.0, "completions/min_terminated_length": 78.0, "entropy": 0.06739555345848203, "epoch": 0.7623126338329764, "frac_reward_zero_std": 0.84375, "grad_norm": 7.3125, "learning_rate": 3.534003987842005e-06, "loss": 0.0879, "num_tokens": 3546821.0, "reward": 3.49609375, "reward_std": 0.14915533305611461, "rewards/correctness_reward_func/mean": 1.578125, "rewards/correctness_reward_func/std": 0.8041466698050499, "rewards/int_reward_func/mean": 0.4609375, "rewards/int_reward_func/std": 0.11476518586277962, "rewards/soft_format_reward_func/mean": 0.48828125, "rewards/soft_format_reward_func/std": 0.03697281517088413, "rewards/strict_format_reward_func/mean": 0.48046875, "rewards/strict_format_reward_func/std": 0.05644455552101135, "rewards/xmlcount_reward_func/mean": 0.48828125, "rewards/xmlcount_reward_func/std": 0.03697281517088413, "step": 89 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0390625, "completions/max_length": 367.0, "completions/max_terminated_length": 264.625, "completions/mean_length": 164.6796875, "completions/mean_terminated_length": 150.61161041259766, "completions/min_length": 72.5, "completions/min_terminated_length": 72.5, "entropy": 0.06468326412141323, "epoch": 0.7708779443254818, "frac_reward_zero_std": 0.78125, "grad_norm": 17.75, "learning_rate": 3.308693936411421e-06, "loss": 0.1794, "num_tokens": 3580028.0, "reward": 3.33203125, "reward_std": 0.27621358446776867, "rewards/correctness_reward_func/mean": 1.453125, "rewards/correctness_reward_func/std": 0.8369380235671997, "rewards/int_reward_func/mean": 0.45703125, "rewards/int_reward_func/std": 0.11861192621290684, "rewards/soft_format_reward_func/mean": 0.484375, "rewards/soft_format_reward_func/std": 0.04081955552101135, "rewards/strict_format_reward_func/mean": 0.4609375, "rewards/strict_format_reward_func/std": 0.10298692621290684, "rewards/xmlcount_reward_func/mean": 0.4765625, "rewards/xmlcount_reward_func/std": 0.061410133726894855, "step": 90 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 376.125, "completions/max_terminated_length": 301.625, "completions/mean_length": 162.484375, "completions/mean_terminated_length": 153.9395866394043, "completions/min_length": 76.375, "completions/min_terminated_length": 76.375, "entropy": 0.062019561883062124, "epoch": 0.7794432548179872, "frac_reward_zero_std": 0.765625, "grad_norm": 9.0, "learning_rate": 3.089373510131354e-06, "loss": 0.2742, "num_tokens": 3613084.0, "reward": 3.546875, "reward_std": 0.2872621323913336, "rewards/correctness_reward_func/mean": 1.578125, "rewards/correctness_reward_func/std": 0.8067077249288559, "rewards/int_reward_func/mean": 0.4921875, "rewards/int_reward_func/std": 0.03125, "rewards/soft_format_reward_func/mean": 0.49609375, "rewards/soft_format_reward_func/std": 0.015625, "rewards/strict_format_reward_func/mean": 0.484375, "rewards/strict_format_reward_func/std": 0.05259781517088413, "rewards/xmlcount_reward_func/mean": 0.49609375, "rewards/xmlcount_reward_func/std": 0.013149453792721033, "step": 91 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0546875, "completions/max_length": 385.625, "completions/max_terminated_length": 279.125, "completions/mean_length": 182.7734375, "completions/mean_terminated_length": 163.5002956390381, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.060410378966480494, "epoch": 0.7880085653104925, "frac_reward_zero_std": 0.796875, "grad_norm": 15.125, "learning_rate": 2.876239030486554e-06, "loss": 0.2067, "num_tokens": 3649339.0, "reward": 3.423828125, "reward_std": 0.31488349102437496, "rewards/correctness_reward_func/mean": 1.53125, "rewards/correctness_reward_func/std": 0.8038287088274956, "rewards/int_reward_func/mean": 0.46484375, "rewards/int_reward_func/std": 0.10904237069189548, "rewards/soft_format_reward_func/mean": 0.4765625, "rewards/soft_format_reward_func/std": 0.07206955552101135, "rewards/strict_format_reward_func/mean": 0.47265625, "rewards/strict_format_reward_func/std": 0.07779237069189548, "rewards/xmlcount_reward_func/mean": 0.478515625, "rewards/xmlcount_reward_func/std": 0.060735128819942474, "step": 92 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0390625, "completions/max_length": 425.5, "completions/max_terminated_length": 323.0, "completions/mean_length": 188.359375, "completions/mean_terminated_length": 175.17136001586914, "completions/min_length": 89.5, "completions/min_terminated_length": 89.5, "entropy": 0.0746710579842329, "epoch": 0.7965738758029979, "frac_reward_zero_std": 0.78125, "grad_norm": 8.125, "learning_rate": 2.669481281701739e-06, "loss": 0.3599, "num_tokens": 3686281.0, "reward": 3.3671875, "reward_std": 0.3425048552453518, "rewards/correctness_reward_func/mean": 1.453125, "rewards/correctness_reward_func/std": 0.8768203780055046, "rewards/int_reward_func/mean": 0.46484375, "rewards/int_reward_func/std": 0.10904237069189548, "rewards/soft_format_reward_func/mean": 0.484375, "rewards/soft_format_reward_func/std": 0.0625, "rewards/strict_format_reward_func/mean": 0.4765625, "rewards/strict_format_reward_func/std": 0.08384781517088413, "rewards/xmlcount_reward_func/mean": 0.48828125, "rewards/xmlcount_reward_func/std": 0.04335307329893112, "step": 93 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 304.0, "completions/max_terminated_length": 273.125, "completions/mean_length": 151.4765625, "completions/mean_terminated_length": 148.6171875, "completions/min_length": 74.375, "completions/min_terminated_length": 74.375, "entropy": 0.06636462640017271, "epoch": 0.8051391862955032, "frac_reward_zero_std": 0.78125, "grad_norm": 6.34375, "learning_rate": 2.469285339963892e-06, "loss": 0.0483, "num_tokens": 3717830.0, "reward": 3.517578125, "reward_std": 0.32869415916502476, "rewards/correctness_reward_func/mean": 1.53125, "rewards/correctness_reward_func/std": 0.809794619679451, "rewards/int_reward_func/mean": 0.49609375, "rewards/int_reward_func/std": 0.015625, "rewards/soft_format_reward_func/mean": 0.49609375, "rewards/soft_format_reward_func/std": 0.015625, "rewards/strict_format_reward_func/mean": 0.49609375, "rewards/strict_format_reward_func/std": 0.015625, "rewards/xmlcount_reward_func/mean": 0.498046875, "rewards/xmlcount_reward_func/std": 0.0078125, "step": 94 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 319.125, "completions/max_terminated_length": 247.375, "completions/mean_length": 158.1015625, "completions/mean_terminated_length": 141.4641580581665, "completions/min_length": 79.625, "completions/min_terminated_length": 79.625, "entropy": 0.06498824106529355, "epoch": 0.8137044967880086, "frac_reward_zero_std": 0.828125, "grad_norm": 15.0625, "learning_rate": 2.275830407754006e-06, "loss": 0.2777, "num_tokens": 3750065.0, "reward": 3.41796875, "reward_std": 0.2485922183841467, "rewards/correctness_reward_func/mean": 1.5, "rewards/correctness_reward_func/std": 0.842106930911541, "rewards/int_reward_func/mean": 0.4765625, "rewards/int_reward_func/std": 0.04554459825158119, "rewards/soft_format_reward_func/mean": 0.48046875, "rewards/soft_format_reward_func/std": 0.04357585124671459, "rewards/strict_format_reward_func/mean": 0.4765625, "rewards/strict_format_reward_func/std": 0.04554459825158119, "rewards/xmlcount_reward_func/mean": 0.484375, "rewards/xmlcount_reward_func/std": 0.03324815817177296, "step": 95 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 324.25, "completions/max_terminated_length": 257.375, "completions/mean_length": 161.9140625, "completions/mean_terminated_length": 153.45208549499512, "completions/min_length": 83.75, "completions/min_terminated_length": 83.75, "entropy": 0.06923280376940966, "epoch": 0.8222698072805139, "frac_reward_zero_std": 0.84375, "grad_norm": 7.21875, "learning_rate": 2.08928965343659e-06, "loss": 0.0191, "num_tokens": 3783210.0, "reward": 3.6064453125, "reward_std": 0.2085412573069334, "rewards/correctness_reward_func/mean": 1.65625, "rewards/correctness_reward_func/std": 0.7442077249288559, "rewards/int_reward_func/mean": 0.48046875, "rewards/int_reward_func/std": 0.06822281517088413, "rewards/soft_format_reward_func/mean": 0.48828125, "rewards/soft_format_reward_func/std": 0.03697281517088413, "rewards/strict_format_reward_func/mean": 0.48828125, "rewards/strict_format_reward_func/std": 0.03697281517088413, "rewards/xmlcount_reward_func/mean": 0.4931640625, "rewards/xmlcount_reward_func/std": 0.021456445567309856, "step": 96 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 294.0, "completions/max_terminated_length": 224.5, "completions/mean_length": 143.0703125, "completions/mean_terminated_length": 137.4109401702881, "completions/min_length": 66.125, "completions/min_terminated_length": 66.125, "entropy": 0.06562586454674602, "epoch": 0.8308351177730193, "frac_reward_zero_std": 0.90625, "grad_norm": 4.4375, "learning_rate": 1.9098300562505266e-06, "loss": -0.0738, "num_tokens": 3813537.0, "reward": 3.6640625, "reward_std": 0.14086892642080784, "rewards/correctness_reward_func/mean": 1.734375, "rewards/correctness_reward_func/std": 0.5065634250640869, "rewards/int_reward_func/mean": 0.48046875, "rewards/int_reward_func/std": 0.046542370691895485, "rewards/soft_format_reward_func/mean": 0.484375, "rewards/soft_format_reward_func/std": 0.04081955552101135, "rewards/strict_format_reward_func/mean": 0.48046875, "rewards/strict_format_reward_func/std": 0.05644455552101135, "rewards/xmlcount_reward_func/mean": 0.484375, "rewards/xmlcount_reward_func/std": 0.04081955552101135, "step": 97 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 262.75, "completions/max_terminated_length": 262.75, "completions/mean_length": 146.15625, "completions/mean_terminated_length": 146.15625, "completions/min_length": 77.625, "completions/min_terminated_length": 77.625, "entropy": 0.05933028785511851, "epoch": 0.8394004282655246, "frac_reward_zero_std": 0.828125, "grad_norm": 8.5, "learning_rate": 1.7376122568400533e-06, "loss": -0.0062, "num_tokens": 3844527.0, "reward": 3.7138671875, "reward_std": 0.21682766266167164, "rewards/correctness_reward_func/mean": 1.734375, "rewards/correctness_reward_func/std": 0.5849205926060677, "rewards/int_reward_func/mean": 0.484375, "rewards/int_reward_func/std": 0.05259781517088413, "rewards/soft_format_reward_func/mean": 0.5, "rewards/soft_format_reward_func/std": 0.0, "rewards/strict_format_reward_func/mean": 0.49609375, "rewards/strict_format_reward_func/std": 0.015625, "rewards/xmlcount_reward_func/mean": 0.4990234375, "rewards/xmlcount_reward_func/std": 0.00390625, "step": 98 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 298.125, "completions/max_terminated_length": 269.0, "completions/mean_length": 152.359375, "completions/mean_terminated_length": 149.32135486602783, "completions/min_length": 76.125, "completions/min_terminated_length": 76.125, "entropy": 0.07287971116602421, "epoch": 0.8479657387580299, "frac_reward_zero_std": 0.78125, "grad_norm": 35.5, "learning_rate": 1.5727904134596084e-06, "loss": 0.0803, "num_tokens": 3875861.0, "reward": 3.2978515625, "reward_std": 0.3632208569906652, "rewards/correctness_reward_func/mean": 1.421875, "rewards/correctness_reward_func/std": 0.9180332496762276, "rewards/int_reward_func/mean": 0.4609375, "rewards/int_reward_func/std": 0.10298692621290684, "rewards/soft_format_reward_func/mean": 0.47265625, "rewards/soft_format_reward_func/std": 0.0660141110420227, "rewards/strict_format_reward_func/mean": 0.47265625, "rewards/strict_format_reward_func/std": 0.0660141110420227, "rewards/xmlcount_reward_func/mean": 0.4697265625, "rewards/xmlcount_reward_func/std": 0.06928502768278122, "step": 99 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 399.5, "completions/max_terminated_length": 313.125, "completions/mean_length": 182.84375, "completions/mean_terminated_length": 160.92637634277344, "completions/min_length": 84.375, "completions/min_terminated_length": 84.375, "entropy": 0.07499845931306481, "epoch": 0.8565310492505354, "frac_reward_zero_std": 0.734375, "grad_norm": 15.375, "learning_rate": 1.4155120639813392e-06, "loss": 0.1946, "num_tokens": 3911453.0, "reward": 3.2890625, "reward_std": 0.40603396110236645, "rewards/correctness_reward_func/mean": 1.421875, "rewards/correctness_reward_func/std": 0.9128188416361809, "rewards/int_reward_func/mean": 0.44921875, "rewards/int_reward_func/std": 0.10904237069189548, "rewards/soft_format_reward_func/mean": 0.47265625, "rewards/soft_format_reward_func/std": 0.06116959825158119, "rewards/strict_format_reward_func/mean": 0.46875, "rewards/strict_format_reward_func/std": 0.07679459825158119, "rewards/xmlcount_reward_func/mean": 0.4765625, "rewards/xmlcount_reward_func/std": 0.05072539113461971, "step": 100 } ], "logging_steps": 1, "max_steps": 117, "num_input_tokens_seen": 3911453, "num_train_epochs": 1, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }