| { | |
| "best_global_step": null, | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 0.8565310492505354, | |
| "eval_steps": 500, | |
| "global_step": 100, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0078125, | |
| "completions/max_length": 425.75, | |
| "completions/max_terminated_length": 407.25, | |
| "completions/mean_length": 224.9140625, | |
| "completions/mean_terminated_length": 222.6020851135254, | |
| "completions/min_length": 100.75, | |
| "completions/min_terminated_length": 100.75, | |
| "entropy": 0.3943025507032871, | |
| "epoch": 0.008565310492505354, | |
| "frac_reward_zero_std": 0.859375, | |
| "grad_norm": 2.234375, | |
| "learning_rate": 0.0, | |
| "loss": -0.0682, | |
| "num_tokens": 40707.0, | |
| "reward": 0.109375, | |
| "reward_std": 0.13258252362720668, | |
| "rewards/correctness_reward_func/mean": 0.046875, | |
| "rewards/correctness_reward_func/std": 0.1875, | |
| "rewards/int_reward_func/mean": 0.015625, | |
| "rewards/int_reward_func/std": 0.0625, | |
| "rewards/soft_format_reward_func/mean": 0.015625, | |
| "rewards/soft_format_reward_func/std": 0.05259781517088413, | |
| "rewards/strict_format_reward_func/mean": 0.0, | |
| "rewards/strict_format_reward_func/std": 0.0, | |
| "rewards/xmlcount_reward_func/mean": 0.03125, | |
| "rewards/xmlcount_reward_func/std": 0.09922334365546703, | |
| "step": 1 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.015625, | |
| "completions/max_length": 454.625, | |
| "completions/max_terminated_length": 431.875, | |
| "completions/mean_length": 240.2421875, | |
| "completions/mean_terminated_length": 236.11719131469727, | |
| "completions/min_length": 120.125, | |
| "completions/min_terminated_length": 120.125, | |
| "entropy": 0.4173264354467392, | |
| "epoch": 0.017130620985010708, | |
| "frac_reward_zero_std": 0.625, | |
| "grad_norm": 4.46875, | |
| "learning_rate": 1.6666666666666667e-06, | |
| "loss": 0.0373, | |
| "num_tokens": 84076.0, | |
| "reward": 0.2431640625, | |
| "reward_std": 0.30797815788537264, | |
| "rewards/correctness_reward_func/mean": 0.078125, | |
| "rewards/correctness_reward_func/std": 0.3125, | |
| "rewards/int_reward_func/mean": 0.0390625, | |
| "rewards/int_reward_func/std": 0.13456955552101135, | |
| "rewards/soft_format_reward_func/mean": 0.046875, | |
| "rewards/soft_format_reward_func/std": 0.12433474138379097, | |
| "rewards/strict_format_reward_func/mean": 0.0, | |
| "rewards/strict_format_reward_func/std": 0.0, | |
| "rewards/xmlcount_reward_func/mean": 0.0791015625, | |
| "rewards/xmlcount_reward_func/std": 0.1614172589033842, | |
| "step": 2 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0078125, | |
| "completions/max_length": 379.375, | |
| "completions/max_terminated_length": 372.125, | |
| "completions/mean_length": 209.1484375, | |
| "completions/mean_terminated_length": 207.07291793823242, | |
| "completions/min_length": 90.5, | |
| "completions/min_terminated_length": 90.5, | |
| "entropy": 0.39367850497365, | |
| "epoch": 0.02569593147751606, | |
| "frac_reward_zero_std": 0.796875, | |
| "grad_norm": 3.0, | |
| "learning_rate": 3.3333333333333333e-06, | |
| "loss": 0.0145, | |
| "num_tokens": 122753.0, | |
| "reward": 0.1796875, | |
| "reward_std": 0.17677669739350677, | |
| "rewards/correctness_reward_func/mean": 0.078125, | |
| "rewards/correctness_reward_func/std": 0.2257782220840454, | |
| "rewards/int_reward_func/mean": 0.0234375, | |
| "rewards/int_reward_func/std": 0.07206955552101135, | |
| "rewards/soft_format_reward_func/mean": 0.02734375, | |
| "rewards/soft_format_reward_func/std": 0.07779237069189548, | |
| "rewards/strict_format_reward_func/mean": 0.0, | |
| "rewards/strict_format_reward_func/std": 0.0, | |
| "rewards/xmlcount_reward_func/mean": 0.05078125, | |
| "rewards/xmlcount_reward_func/std": 0.11597390845417976, | |
| "step": 3 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 381.75, | |
| "completions/max_terminated_length": 381.75, | |
| "completions/mean_length": 205.140625, | |
| "completions/mean_terminated_length": 205.140625, | |
| "completions/min_length": 82.5, | |
| "completions/min_terminated_length": 82.5, | |
| "entropy": 0.42507942393422127, | |
| "epoch": 0.034261241970021415, | |
| "frac_reward_zero_std": 0.609375, | |
| "grad_norm": 4.5, | |
| "learning_rate": 5e-06, | |
| "loss": -0.1276, | |
| "num_tokens": 161051.0, | |
| "reward": 0.2958984375, | |
| "reward_std": 0.3052160106599331, | |
| "rewards/correctness_reward_func/mean": 0.078125, | |
| "rewards/correctness_reward_func/std": 0.2257782220840454, | |
| "rewards/int_reward_func/mean": 0.03125, | |
| "rewards/int_reward_func/std": 0.09341737069189548, | |
| "rewards/soft_format_reward_func/mean": 0.07421875, | |
| "rewards/soft_format_reward_func/std": 0.17638970352709293, | |
| "rewards/strict_format_reward_func/mean": 0.0, | |
| "rewards/strict_format_reward_func/std": 0.0, | |
| "rewards/xmlcount_reward_func/mean": 0.1123046875, | |
| "rewards/xmlcount_reward_func/std": 0.1925698984414339, | |
| "step": 4 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 383.25, | |
| "completions/max_terminated_length": 383.25, | |
| "completions/mean_length": 207.828125, | |
| "completions/mean_terminated_length": 207.828125, | |
| "completions/min_length": 74.5, | |
| "completions/min_terminated_length": 74.5, | |
| "entropy": 0.4761287160217762, | |
| "epoch": 0.042826552462526764, | |
| "frac_reward_zero_std": 0.34375, | |
| "grad_norm": 6.28125, | |
| "learning_rate": 6.666666666666667e-06, | |
| "loss": -0.0135, | |
| "num_tokens": 199521.0, | |
| "reward": 1.1474609375, | |
| "reward_std": 0.5952402763068676, | |
| "rewards/correctness_reward_func/mean": 0.359375, | |
| "rewards/correctness_reward_func/std": 0.7196519374847412, | |
| "rewards/int_reward_func/mean": 0.12109375, | |
| "rewards/int_reward_func/std": 0.20104984939098358, | |
| "rewards/soft_format_reward_func/mean": 0.3203125, | |
| "rewards/soft_format_reward_func/std": 0.23448428697884083, | |
| "rewards/strict_format_reward_func/mean": 0.0, | |
| "rewards/strict_format_reward_func/std": 0.0, | |
| "rewards/xmlcount_reward_func/mean": 0.3466796875, | |
| "rewards/xmlcount_reward_func/std": 0.20816493593156338, | |
| "step": 5 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.015625, | |
| "completions/max_length": 434.5, | |
| "completions/max_terminated_length": 387.375, | |
| "completions/mean_length": 231.28125, | |
| "completions/mean_terminated_length": 226.61354446411133, | |
| "completions/min_length": 118.25, | |
| "completions/min_terminated_length": 118.25, | |
| "entropy": 0.42499926686286926, | |
| "epoch": 0.05139186295503212, | |
| "frac_reward_zero_std": 0.578125, | |
| "grad_norm": 3.265625, | |
| "learning_rate": 8.333333333333334e-06, | |
| "loss": 0.1364, | |
| "num_tokens": 241131.0, | |
| "reward": 1.4375, | |
| "reward_std": 0.4833737723529339, | |
| "rewards/correctness_reward_func/mean": 0.375, | |
| "rewards/correctness_reward_func/std": 0.741176463663578, | |
| "rewards/int_reward_func/mean": 0.13671875, | |
| "rewards/int_reward_func/std": 0.2227986976504326, | |
| "rewards/soft_format_reward_func/mean": 0.44921875, | |
| "rewards/soft_format_reward_func/std": 0.1270910371094942, | |
| "rewards/strict_format_reward_func/mean": 0.0, | |
| "rewards/strict_format_reward_func/std": 0.0, | |
| "rewards/xmlcount_reward_func/mean": 0.4765625, | |
| "rewards/xmlcount_reward_func/std": 0.07140547037124634, | |
| "step": 6 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.015625, | |
| "completions/max_length": 453.0, | |
| "completions/max_terminated_length": 433.125, | |
| "completions/mean_length": 238.6796875, | |
| "completions/mean_terminated_length": 234.18125343322754, | |
| "completions/min_length": 96.375, | |
| "completions/min_terminated_length": 96.375, | |
| "entropy": 0.4198453910648823, | |
| "epoch": 0.059957173447537475, | |
| "frac_reward_zero_std": 0.421875, | |
| "grad_norm": 4.15625, | |
| "learning_rate": 1e-05, | |
| "loss": 0.1899, | |
| "num_tokens": 283686.0, | |
| "reward": 1.7509765625, | |
| "reward_std": 0.7305849269032478, | |
| "rewards/correctness_reward_func/mean": 0.609375, | |
| "rewards/correctness_reward_func/std": 0.9355916231870651, | |
| "rewards/int_reward_func/mean": 0.1953125, | |
| "rewards/int_reward_func/std": 0.2459700107574463, | |
| "rewards/soft_format_reward_func/mean": 0.4609375, | |
| "rewards/soft_format_reward_func/std": 0.1128891110420227, | |
| "rewards/strict_format_reward_func/mean": 0.00390625, | |
| "rewards/strict_format_reward_func/std": 0.015625, | |
| "rewards/xmlcount_reward_func/mean": 0.4814453125, | |
| "rewards/xmlcount_reward_func/std": 0.046708236914128065, | |
| "step": 7 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 364.375, | |
| "completions/max_terminated_length": 364.375, | |
| "completions/mean_length": 220.34375, | |
| "completions/mean_terminated_length": 220.34375, | |
| "completions/min_length": 98.875, | |
| "completions/min_terminated_length": 98.875, | |
| "entropy": 0.42923642322421074, | |
| "epoch": 0.06852248394004283, | |
| "frac_reward_zero_std": 0.46875, | |
| "grad_norm": 4.59375, | |
| "learning_rate": 1.1666666666666668e-05, | |
| "loss": -0.0768, | |
| "num_tokens": 324156.0, | |
| "reward": 2.1044921875, | |
| "reward_std": 0.6339101828634739, | |
| "rewards/correctness_reward_func/mean": 0.859375, | |
| "rewards/correctness_reward_func/std": 0.9749292060732841, | |
| "rewards/int_reward_func/mean": 0.3046875, | |
| "rewards/int_reward_func/std": 0.23351078107953072, | |
| "rewards/soft_format_reward_func/mean": 0.46484375, | |
| "rewards/soft_format_reward_func/std": 0.08251741342246532, | |
| "rewards/strict_format_reward_func/mean": 0.00390625, | |
| "rewards/strict_format_reward_func/std": 0.015625, | |
| "rewards/xmlcount_reward_func/mean": 0.4716796875, | |
| "rewards/xmlcount_reward_func/std": 0.06621513469144702, | |
| "step": 8 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.015625, | |
| "completions/max_length": 376.5, | |
| "completions/max_terminated_length": 361.25, | |
| "completions/mean_length": 216.578125, | |
| "completions/mean_terminated_length": 212.43080520629883, | |
| "completions/min_length": 96.875, | |
| "completions/min_terminated_length": 96.875, | |
| "entropy": 0.3928321301937103, | |
| "epoch": 0.07708779443254818, | |
| "frac_reward_zero_std": 0.59375, | |
| "grad_norm": 3.5, | |
| "learning_rate": 1.3333333333333333e-05, | |
| "loss": 0.0392, | |
| "num_tokens": 363864.0, | |
| "reward": 2.5830078125, | |
| "reward_std": 0.40741503052413464, | |
| "rewards/correctness_reward_func/mean": 1.203125, | |
| "rewards/correctness_reward_func/std": 0.9139788597822189, | |
| "rewards/int_reward_func/mean": 0.42578125, | |
| "rewards/int_reward_func/std": 0.1317095011472702, | |
| "rewards/soft_format_reward_func/mean": 0.46875, | |
| "rewards/soft_format_reward_func/std": 0.09341737069189548, | |
| "rewards/strict_format_reward_func/mean": 0.0078125, | |
| "rewards/strict_format_reward_func/std": 0.03125, | |
| "rewards/xmlcount_reward_func/mean": 0.4775390625, | |
| "rewards/xmlcount_reward_func/std": 0.06064485618844628, | |
| "step": 9 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0234375, | |
| "completions/max_length": 403.75, | |
| "completions/max_terminated_length": 386.625, | |
| "completions/mean_length": 210.6640625, | |
| "completions/mean_terminated_length": 203.55908012390137, | |
| "completions/min_length": 75.875, | |
| "completions/min_terminated_length": 75.875, | |
| "entropy": 0.3184557221829891, | |
| "epoch": 0.08565310492505353, | |
| "frac_reward_zero_std": 0.421875, | |
| "grad_norm": 5.125, | |
| "learning_rate": 1.5000000000000002e-05, | |
| "loss": -0.0025, | |
| "num_tokens": 402647.0, | |
| "reward": 2.7490234375, | |
| "reward_std": 0.7140121199190617, | |
| "rewards/correctness_reward_func/mean": 1.375, | |
| "rewards/correctness_reward_func/std": 0.9443820938467979, | |
| "rewards/int_reward_func/mean": 0.453125, | |
| "rewards/int_reward_func/std": 0.12136822193861008, | |
| "rewards/soft_format_reward_func/mean": 0.4609375, | |
| "rewards/soft_format_reward_func/std": 0.10298692621290684, | |
| "rewards/strict_format_reward_func/mean": 0.03125, | |
| "rewards/strict_format_reward_func/std": 0.10519563034176826, | |
| "rewards/xmlcount_reward_func/mean": 0.4287109375, | |
| "rewards/xmlcount_reward_func/std": 0.144282141700387, | |
| "step": 10 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 434.75, | |
| "completions/max_terminated_length": 372.875, | |
| "completions/mean_length": 238.5390625, | |
| "completions/mean_terminated_length": 229.2265682220459, | |
| "completions/min_length": 88.375, | |
| "completions/min_terminated_length": 88.375, | |
| "entropy": 0.3128885291516781, | |
| "epoch": 0.09421841541755889, | |
| "frac_reward_zero_std": 0.359375, | |
| "grad_norm": 5.78125, | |
| "learning_rate": 1.6666666666666667e-05, | |
| "loss": 0.2439, | |
| "num_tokens": 445266.0, | |
| "reward": 3.0341796875, | |
| "reward_std": 0.5344732906669378, | |
| "rewards/correctness_reward_func/mean": 1.421875, | |
| "rewards/correctness_reward_func/std": 0.8460541293025017, | |
| "rewards/int_reward_func/mean": 0.46484375, | |
| "rewards/int_reward_func/std": 0.09914018586277962, | |
| "rewards/soft_format_reward_func/mean": 0.484375, | |
| "rewards/soft_format_reward_func/std": 0.05259781517088413, | |
| "rewards/strict_format_reward_func/mean": 0.203125, | |
| "rewards/strict_format_reward_func/std": 0.2389280553907156, | |
| "rewards/xmlcount_reward_func/mean": 0.4599609375, | |
| "rewards/xmlcount_reward_func/std": 0.10840688459575176, | |
| "step": 11 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0078125, | |
| "completions/max_length": 373.125, | |
| "completions/max_terminated_length": 353.5, | |
| "completions/mean_length": 218.9140625, | |
| "completions/mean_terminated_length": 216.4234390258789, | |
| "completions/min_length": 88.0, | |
| "completions/min_terminated_length": 88.0, | |
| "entropy": 0.2998475953936577, | |
| "epoch": 0.10278372591006424, | |
| "frac_reward_zero_std": 0.71875, | |
| "grad_norm": 4.1875, | |
| "learning_rate": 1.8333333333333333e-05, | |
| "loss": -0.1385, | |
| "num_tokens": 485929.0, | |
| "reward": 3.484375, | |
| "reward_std": 0.38393688201904297, | |
| "rewards/correctness_reward_func/mean": 1.59375, | |
| "rewards/correctness_reward_func/std": 0.7323416471481323, | |
| "rewards/int_reward_func/mean": 0.46875, | |
| "rewards/int_reward_func/std": 0.06877040676772594, | |
| "rewards/soft_format_reward_func/mean": 0.48828125, | |
| "rewards/soft_format_reward_func/std": 0.025194555521011353, | |
| "rewards/strict_format_reward_func/mean": 0.46875, | |
| "rewards/strict_format_reward_func/std": 0.09341737069189548, | |
| "rewards/xmlcount_reward_func/mean": 0.46484375, | |
| "rewards/xmlcount_reward_func/std": 0.09572842810302973, | |
| "step": 12 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0078125, | |
| "completions/max_length": 370.125, | |
| "completions/max_terminated_length": 333.5, | |
| "completions/mean_length": 204.46875, | |
| "completions/mean_terminated_length": 201.68333435058594, | |
| "completions/min_length": 95.875, | |
| "completions/min_terminated_length": 95.875, | |
| "entropy": 0.1816732920706272, | |
| "epoch": 0.11134903640256959, | |
| "frac_reward_zero_std": 0.703125, | |
| "grad_norm": 3.96875, | |
| "learning_rate": 2e-05, | |
| "loss": 0.2391, | |
| "num_tokens": 523887.0, | |
| "reward": 3.525390625, | |
| "reward_std": 0.30659707519225776, | |
| "rewards/correctness_reward_func/mean": 1.5625, | |
| "rewards/correctness_reward_func/std": 0.7897166311740875, | |
| "rewards/int_reward_func/mean": 0.4921875, | |
| "rewards/int_reward_func/std": 0.021347815170884132, | |
| "rewards/soft_format_reward_func/mean": 0.5, | |
| "rewards/soft_format_reward_func/std": 0.0, | |
| "rewards/strict_format_reward_func/mean": 0.4765625, | |
| "rewards/strict_format_reward_func/std": 0.08384781517088413, | |
| "rewards/xmlcount_reward_func/mean": 0.494140625, | |
| "rewards/xmlcount_reward_func/std": 0.0234375, | |
| "step": 13 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0234375, | |
| "completions/max_length": 404.0, | |
| "completions/max_terminated_length": 354.75, | |
| "completions/mean_length": 187.515625, | |
| "completions/mean_terminated_length": 179.75893211364746, | |
| "completions/min_length": 81.0, | |
| "completions/min_terminated_length": 81.0, | |
| "entropy": 0.15807979460805655, | |
| "epoch": 0.11991434689507495, | |
| "frac_reward_zero_std": 0.75, | |
| "grad_norm": 4.46875, | |
| "learning_rate": 1.9995524322835035e-05, | |
| "loss": -0.0269, | |
| "num_tokens": 559761.0, | |
| "reward": 3.4208984375, | |
| "reward_std": 0.32731308368965983, | |
| "rewards/correctness_reward_func/mean": 1.53125, | |
| "rewards/correctness_reward_func/std": 0.8291211053729057, | |
| "rewards/int_reward_func/mean": 0.46875, | |
| "rewards/int_reward_func/std": 0.09341737069189548, | |
| "rewards/soft_format_reward_func/mean": 0.47265625, | |
| "rewards/soft_format_reward_func/std": 0.0660141110420227, | |
| "rewards/strict_format_reward_func/mean": 0.46875, | |
| "rewards/strict_format_reward_func/std": 0.07173692621290684, | |
| "rewards/xmlcount_reward_func/mean": 0.4794921875, | |
| "rewards/xmlcount_reward_func/std": 0.05074238684028387, | |
| "step": 14 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0078125, | |
| "completions/max_length": 349.625, | |
| "completions/max_terminated_length": 338.5, | |
| "completions/mean_length": 187.828125, | |
| "completions/mean_terminated_length": 185.50208473205566, | |
| "completions/min_length": 92.625, | |
| "completions/min_terminated_length": 92.625, | |
| "entropy": 0.17131240852177143, | |
| "epoch": 0.1284796573875803, | |
| "frac_reward_zero_std": 0.765625, | |
| "grad_norm": 3.765625, | |
| "learning_rate": 1.998210129767735e-05, | |
| "loss": 0.2086, | |
| "num_tokens": 596775.0, | |
| "reward": 3.4423828125, | |
| "reward_std": 0.3024538792669773, | |
| "rewards/correctness_reward_func/mean": 1.484375, | |
| "rewards/correctness_reward_func/std": 0.8679328411817551, | |
| "rewards/int_reward_func/mean": 0.484375, | |
| "rewards/int_reward_func/std": 0.05259781517088413, | |
| "rewards/soft_format_reward_func/mean": 0.49609375, | |
| "rewards/soft_format_reward_func/std": 0.015625, | |
| "rewards/strict_format_reward_func/mean": 0.484375, | |
| "rewards/strict_format_reward_func/std": 0.04081955552101135, | |
| "rewards/xmlcount_reward_func/mean": 0.4931640625, | |
| "rewards/xmlcount_reward_func/std": 0.02734375, | |
| "step": 15 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.015625, | |
| "completions/max_length": 401.375, | |
| "completions/max_terminated_length": 382.125, | |
| "completions/mean_length": 206.8984375, | |
| "completions/mean_terminated_length": 202.10052299499512, | |
| "completions/min_length": 102.25, | |
| "completions/min_terminated_length": 102.25, | |
| "entropy": 0.15633743070065975, | |
| "epoch": 0.13704496788008566, | |
| "frac_reward_zero_std": 0.765625, | |
| "grad_norm": 3.4375, | |
| "learning_rate": 1.9959742939952393e-05, | |
| "loss": 0.0938, | |
| "num_tokens": 635224.0, | |
| "reward": 3.591796875, | |
| "reward_std": 0.35631553269922733, | |
| "rewards/correctness_reward_func/mean": 1.625, | |
| "rewards/correctness_reward_func/std": 0.7407501488924026, | |
| "rewards/int_reward_func/mean": 0.48828125, | |
| "rewards/int_reward_func/std": 0.046875, | |
| "rewards/soft_format_reward_func/mean": 0.4921875, | |
| "rewards/soft_format_reward_func/std": 0.03125, | |
| "rewards/strict_format_reward_func/mean": 0.4921875, | |
| "rewards/strict_format_reward_func/std": 0.03125, | |
| "rewards/xmlcount_reward_func/mean": 0.494140625, | |
| "rewards/xmlcount_reward_func/std": 0.0234375, | |
| "step": 16 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.015625, | |
| "completions/max_length": 360.0, | |
| "completions/max_terminated_length": 330.125, | |
| "completions/mean_length": 199.09375, | |
| "completions/mean_terminated_length": 194.2984390258789, | |
| "completions/min_length": 104.625, | |
| "completions/min_terminated_length": 104.625, | |
| "entropy": 0.19571769889444113, | |
| "epoch": 0.145610278372591, | |
| "frac_reward_zero_std": 0.8125, | |
| "grad_norm": 2.9375, | |
| "learning_rate": 1.9928469263418376e-05, | |
| "loss": 0.0169, | |
| "num_tokens": 671932.0, | |
| "reward": 3.57421875, | |
| "reward_std": 0.23754368349909782, | |
| "rewards/correctness_reward_func/mean": 1.609375, | |
| "rewards/correctness_reward_func/std": 0.6196783930063248, | |
| "rewards/int_reward_func/mean": 0.484375, | |
| "rewards/int_reward_func/std": 0.04081955552101135, | |
| "rewards/soft_format_reward_func/mean": 0.4921875, | |
| "rewards/soft_format_reward_func/std": 0.03125, | |
| "rewards/strict_format_reward_func/mean": 0.4921875, | |
| "rewards/strict_format_reward_func/std": 0.03125, | |
| "rewards/xmlcount_reward_func/mean": 0.49609375, | |
| "rewards/xmlcount_reward_func/std": 0.015625, | |
| "step": 17 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0078125, | |
| "completions/max_length": 363.5, | |
| "completions/max_terminated_length": 352.125, | |
| "completions/mean_length": 207.1640625, | |
| "completions/mean_terminated_length": 204.95573043823242, | |
| "completions/min_length": 96.75, | |
| "completions/min_terminated_length": 96.75, | |
| "entropy": 0.20056522078812122, | |
| "epoch": 0.15417558886509636, | |
| "frac_reward_zero_std": 0.765625, | |
| "grad_norm": 5.84375, | |
| "learning_rate": 1.9888308262251286e-05, | |
| "loss": 0.0571, | |
| "num_tokens": 710529.0, | |
| "reward": 3.4853515625, | |
| "reward_std": 0.3079781490378082, | |
| "rewards/correctness_reward_func/mean": 1.578125, | |
| "rewards/correctness_reward_func/std": 0.7429328411817551, | |
| "rewards/int_reward_func/mean": 0.46484375, | |
| "rewards/int_reward_func/std": 0.08923800103366375, | |
| "rewards/soft_format_reward_func/mean": 0.484375, | |
| "rewards/soft_format_reward_func/std": 0.04081955552101135, | |
| "rewards/strict_format_reward_func/mean": 0.47265625, | |
| "rewards/strict_format_reward_func/std": 0.07779237069189548, | |
| "rewards/xmlcount_reward_func/mean": 0.4853515625, | |
| "rewards/xmlcount_reward_func/std": 0.03691330552101135, | |
| "step": 18 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 404.0, | |
| "completions/max_terminated_length": 404.0, | |
| "completions/mean_length": 242.7109375, | |
| "completions/mean_terminated_length": 242.7109375, | |
| "completions/min_length": 131.125, | |
| "completions/min_terminated_length": 131.125, | |
| "entropy": 0.18245846219360828, | |
| "epoch": 0.16274089935760172, | |
| "frac_reward_zero_std": 0.71875, | |
| "grad_norm": 2.609375, | |
| "learning_rate": 1.98392958859863e-05, | |
| "loss": -0.0092, | |
| "num_tokens": 753458.0, | |
| "reward": 3.4609375, | |
| "reward_std": 0.39774756878614426, | |
| "rewards/correctness_reward_func/mean": 1.484375, | |
| "rewards/correctness_reward_func/std": 0.8802329078316689, | |
| "rewards/int_reward_func/mean": 0.4765625, | |
| "rewards/int_reward_func/std": 0.07206955552101135, | |
| "rewards/soft_format_reward_func/mean": 0.5, | |
| "rewards/soft_format_reward_func/std": 0.0, | |
| "rewards/strict_format_reward_func/mean": 0.5, | |
| "rewards/strict_format_reward_func/std": 0.0, | |
| "rewards/xmlcount_reward_func/mean": 0.5, | |
| "rewards/xmlcount_reward_func/std": 0.0, | |
| "step": 19 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 453.25, | |
| "completions/max_terminated_length": 416.25, | |
| "completions/mean_length": 240.1328125, | |
| "completions/mean_terminated_length": 232.0755271911621, | |
| "completions/min_length": 136.875, | |
| "completions/min_terminated_length": 136.875, | |
| "entropy": 0.1867619026452303, | |
| "epoch": 0.17130620985010706, | |
| "frac_reward_zero_std": 0.765625, | |
| "grad_norm": 2.640625, | |
| "learning_rate": 1.9781476007338058e-05, | |
| "loss": 0.1299, | |
| "num_tokens": 796607.0, | |
| "reward": 3.5458984375, | |
| "reward_std": 0.33283737674355507, | |
| "rewards/correctness_reward_func/mean": 1.59375, | |
| "rewards/correctness_reward_func/std": 0.7794546857476234, | |
| "rewards/int_reward_func/mean": 0.48828125, | |
| "rewards/int_reward_func/std": 0.046875, | |
| "rewards/soft_format_reward_func/mean": 0.48828125, | |
| "rewards/soft_format_reward_func/std": 0.046875, | |
| "rewards/strict_format_reward_func/mean": 0.484375, | |
| "rewards/strict_format_reward_func/std": 0.0625, | |
| "rewards/xmlcount_reward_func/mean": 0.4912109375, | |
| "rewards/xmlcount_reward_func/std": 0.03515625, | |
| "step": 20 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 364.625, | |
| "completions/max_terminated_length": 364.625, | |
| "completions/mean_length": 236.8828125, | |
| "completions/mean_terminated_length": 236.8828125, | |
| "completions/min_length": 135.25, | |
| "completions/min_terminated_length": 135.25, | |
| "entropy": 0.17888653837144375, | |
| "epoch": 0.17987152034261242, | |
| "frac_reward_zero_std": 0.8125, | |
| "grad_norm": 2.40625, | |
| "learning_rate": 1.9714900382928674e-05, | |
| "loss": -0.0113, | |
| "num_tokens": 839112.0, | |
| "reward": 3.537109375, | |
| "reward_std": 0.2568786293268204, | |
| "rewards/correctness_reward_func/mean": 1.546875, | |
| "rewards/correctness_reward_func/std": 0.7888757362961769, | |
| "rewards/int_reward_func/mean": 0.49609375, | |
| "rewards/int_reward_func/std": 0.015625, | |
| "rewards/soft_format_reward_func/mean": 0.5, | |
| "rewards/soft_format_reward_func/std": 0.0, | |
| "rewards/strict_format_reward_func/mean": 0.49609375, | |
| "rewards/strict_format_reward_func/std": 0.015625, | |
| "rewards/xmlcount_reward_func/mean": 0.498046875, | |
| "rewards/xmlcount_reward_func/std": 0.0078125, | |
| "step": 21 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0234375, | |
| "completions/max_length": 402.25, | |
| "completions/max_terminated_length": 371.75, | |
| "completions/mean_length": 232.953125, | |
| "completions/mean_terminated_length": 226.11860275268555, | |
| "completions/min_length": 142.125, | |
| "completions/min_terminated_length": 142.125, | |
| "entropy": 0.1563574131578207, | |
| "epoch": 0.18843683083511778, | |
| "frac_reward_zero_std": 0.78125, | |
| "grad_norm": 2.328125, | |
| "learning_rate": 1.9639628606958535e-05, | |
| "loss": 0.0542, | |
| "num_tokens": 881088.0, | |
| "reward": 3.5693359375, | |
| "reward_std": 0.30245387367904186, | |
| "rewards/correctness_reward_func/mean": 1.625, | |
| "rewards/correctness_reward_func/std": 0.7605545148253441, | |
| "rewards/int_reward_func/mean": 0.4765625, | |
| "rewards/int_reward_func/std": 0.08384781517088413, | |
| "rewards/soft_format_reward_func/mean": 0.48828125, | |
| "rewards/soft_format_reward_func/std": 0.03697281517088413, | |
| "rewards/strict_format_reward_func/mean": 0.48828125, | |
| "rewards/strict_format_reward_func/std": 0.03697281517088413, | |
| "rewards/xmlcount_reward_func/mean": 0.4912109375, | |
| "rewards/xmlcount_reward_func/std": 0.027729611843824387, | |
| "step": 22 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.046875, | |
| "completions/max_length": 446.75, | |
| "completions/max_terminated_length": 402.875, | |
| "completions/mean_length": 254.4765625, | |
| "completions/mean_terminated_length": 242.60871124267578, | |
| "completions/min_length": 135.75, | |
| "completions/min_terminated_length": 135.75, | |
| "entropy": 0.16357604414224625, | |
| "epoch": 0.19700214132762311, | |
| "frac_reward_zero_std": 0.6875, | |
| "grad_norm": 3.65625, | |
| "learning_rate": 1.955572805786141e-05, | |
| "loss": 0.0195, | |
| "num_tokens": 926107.0, | |
| "reward": 3.341796875, | |
| "reward_std": 0.4087961111217737, | |
| "rewards/correctness_reward_func/mean": 1.46875, | |
| "rewards/correctness_reward_func/std": 0.8502998873591423, | |
| "rewards/int_reward_func/mean": 0.46484375, | |
| "rewards/int_reward_func/std": 0.08439540676772594, | |
| "rewards/soft_format_reward_func/mean": 0.46875, | |
| "rewards/soft_format_reward_func/std": 0.08054866641759872, | |
| "rewards/strict_format_reward_func/mean": 0.4609375, | |
| "rewards/strict_format_reward_func/std": 0.10189648158848286, | |
| "rewards/xmlcount_reward_func/mean": 0.478515625, | |
| "rewards/xmlcount_reward_func/std": 0.04778209747746587, | |
| "step": 23 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0078125, | |
| "completions/max_length": 403.5, | |
| "completions/max_terminated_length": 380.875, | |
| "completions/mean_length": 241.0703125, | |
| "completions/mean_terminated_length": 239.03854370117188, | |
| "completions/min_length": 119.25, | |
| "completions/min_terminated_length": 119.25, | |
| "entropy": 0.1647733524441719, | |
| "epoch": 0.20556745182012848, | |
| "frac_reward_zero_std": 0.6875, | |
| "grad_norm": 7.75, | |
| "learning_rate": 1.9463273837991643e-05, | |
| "loss": -0.128, | |
| "num_tokens": 968962.0, | |
| "reward": 3.3740234375, | |
| "reward_std": 0.48751697689294815, | |
| "rewards/correctness_reward_func/mean": 1.484375, | |
| "rewards/correctness_reward_func/std": 0.8504082337021828, | |
| "rewards/int_reward_func/mean": 0.46484375, | |
| "rewards/int_reward_func/std": 0.0972641110420227, | |
| "rewards/soft_format_reward_func/mean": 0.4765625, | |
| "rewards/soft_format_reward_func/std": 0.050389111042022705, | |
| "rewards/strict_format_reward_func/mean": 0.47265625, | |
| "rewards/strict_format_reward_func/std": 0.0660141110420227, | |
| "rewards/xmlcount_reward_func/mean": 0.4755859375, | |
| "rewards/xmlcount_reward_func/std": 0.054295361042022705, | |
| "step": 24 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0078125, | |
| "completions/max_length": 408.75, | |
| "completions/max_terminated_length": 399.0, | |
| "completions/mean_length": 255.03125, | |
| "completions/mean_terminated_length": 253.1906280517578, | |
| "completions/min_length": 156.5, | |
| "completions/min_terminated_length": 156.5, | |
| "entropy": 0.125497592613101, | |
| "epoch": 0.21413276231263384, | |
| "frac_reward_zero_std": 0.8125, | |
| "grad_norm": 2.53125, | |
| "learning_rate": 1.9362348706397374e-05, | |
| "loss": -0.0327, | |
| "num_tokens": 1013778.0, | |
| "reward": 3.583984375, | |
| "reward_std": 0.22373299859464169, | |
| "rewards/correctness_reward_func/mean": 1.609375, | |
| "rewards/correctness_reward_func/std": 0.7345243394374847, | |
| "rewards/int_reward_func/mean": 0.48828125, | |
| "rewards/int_reward_func/std": 0.03697281517088413, | |
| "rewards/soft_format_reward_func/mean": 0.49609375, | |
| "rewards/soft_format_reward_func/std": 0.015625, | |
| "rewards/strict_format_reward_func/mean": 0.4921875, | |
| "rewards/strict_format_reward_func/std": 0.021347815170884132, | |
| "rewards/xmlcount_reward_func/mean": 0.498046875, | |
| "rewards/xmlcount_reward_func/std": 0.005336953792721033, | |
| "step": 25 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0078125, | |
| "completions/max_length": 403.125, | |
| "completions/max_terminated_length": 384.875, | |
| "completions/mean_length": 262.71875, | |
| "completions/mean_terminated_length": 260.7333335876465, | |
| "completions/min_length": 167.5, | |
| "completions/min_terminated_length": 167.5, | |
| "entropy": 0.18168668448925018, | |
| "epoch": 0.22269807280513917, | |
| "frac_reward_zero_std": 0.75, | |
| "grad_norm": 13.0, | |
| "learning_rate": 1.9253043004739967e-05, | |
| "loss": 0.054, | |
| "num_tokens": 1059274.0, | |
| "reward": 3.5341796875, | |
| "reward_std": 0.39360435120761395, | |
| "rewards/correctness_reward_func/mean": 1.625, | |
| "rewards/correctness_reward_func/std": 0.7649086192250252, | |
| "rewards/int_reward_func/mean": 0.47265625, | |
| "rewards/int_reward_func/std": 0.06116959825158119, | |
| "rewards/soft_format_reward_func/mean": 0.4765625, | |
| "rewards/soft_format_reward_func/std": 0.049298666417598724, | |
| "rewards/strict_format_reward_func/mean": 0.4765625, | |
| "rewards/strict_format_reward_func/std": 0.049298666417598724, | |
| "rewards/xmlcount_reward_func/mean": 0.4833984375, | |
| "rewards/xmlcount_reward_func/std": 0.03711527772247791, | |
| "step": 26 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0234375, | |
| "completions/max_length": 467.5, | |
| "completions/max_terminated_length": 456.625, | |
| "completions/mean_length": 268.328125, | |
| "completions/mean_terminated_length": 263.0275344848633, | |
| "completions/min_length": 158.375, | |
| "completions/min_terminated_length": 158.375, | |
| "entropy": 0.13837805949151516, | |
| "epoch": 0.23126338329764454, | |
| "frac_reward_zero_std": 0.6875, | |
| "grad_norm": 10.1875, | |
| "learning_rate": 1.913545457642601e-05, | |
| "loss": -0.0646, | |
| "num_tokens": 1105884.0, | |
| "reward": 3.2080078125, | |
| "reward_std": 0.4985655229538679, | |
| "rewards/correctness_reward_func/mean": 1.359375, | |
| "rewards/correctness_reward_func/std": 0.91850346326828, | |
| "rewards/int_reward_func/mean": 0.453125, | |
| "rewards/int_reward_func/std": 0.11255648173391819, | |
| "rewards/soft_format_reward_func/mean": 0.4609375, | |
| "rewards/soft_format_reward_func/std": 0.09308474138379097, | |
| "rewards/strict_format_reward_func/mean": 0.45703125, | |
| "rewards/strict_format_reward_func/std": 0.09693148173391819, | |
| "rewards/xmlcount_reward_func/mean": 0.4775390625, | |
| "rewards/xmlcount_reward_func/std": 0.05502833751961589, | |
| "step": 27 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 449.5, | |
| "completions/max_terminated_length": 401.75, | |
| "completions/mean_length": 268.9375, | |
| "completions/mean_terminated_length": 260.4345283508301, | |
| "completions/min_length": 159.25, | |
| "completions/min_terminated_length": 159.25, | |
| "entropy": 0.08630623016506433, | |
| "epoch": 0.2398286937901499, | |
| "frac_reward_zero_std": 0.734375, | |
| "grad_norm": 2.578125, | |
| "learning_rate": 1.900968867902419e-05, | |
| "loss": 0.0809, | |
| "num_tokens": 1152072.0, | |
| "reward": 3.59375, | |
| "reward_std": 0.34802911058068275, | |
| "rewards/correctness_reward_func/mean": 1.671875, | |
| "rewards/correctness_reward_func/std": 0.5796433389186859, | |
| "rewards/int_reward_func/mean": 0.4921875, | |
| "rewards/int_reward_func/std": 0.03125, | |
| "rewards/soft_format_reward_func/mean": 0.47265625, | |
| "rewards/soft_format_reward_func/std": 0.0796684455126524, | |
| "rewards/strict_format_reward_func/mean": 0.46875, | |
| "rewards/strict_format_reward_func/std": 0.0952934455126524, | |
| "rewards/xmlcount_reward_func/mean": 0.48828125, | |
| "rewards/xmlcount_reward_func/std": 0.03735560039058328, | |
| "step": 28 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0234375, | |
| "completions/max_length": 414.125, | |
| "completions/max_terminated_length": 404.75, | |
| "completions/mean_length": 265.9921875, | |
| "completions/mean_terminated_length": 261.0090808868408, | |
| "completions/min_length": 149.125, | |
| "completions/min_terminated_length": 149.125, | |
| "entropy": 0.08809430431574583, | |
| "epoch": 0.24839400428265523, | |
| "frac_reward_zero_std": 0.734375, | |
| "grad_norm": 7.03125, | |
| "learning_rate": 1.8875857890045544e-05, | |
| "loss": -0.0093, | |
| "num_tokens": 1198043.0, | |
| "reward": 3.376953125, | |
| "reward_std": 0.42813105694949627, | |
| "rewards/correctness_reward_func/mean": 1.5, | |
| "rewards/correctness_reward_func/std": 0.8573416471481323, | |
| "rewards/int_reward_func/mean": 0.4609375, | |
| "rewards/int_reward_func/std": 0.10298692621290684, | |
| "rewards/soft_format_reward_func/mean": 0.46875, | |
| "rewards/soft_format_reward_func/std": 0.09341737069189548, | |
| "rewards/strict_format_reward_func/mean": 0.46875, | |
| "rewards/strict_format_reward_func/std": 0.09341737069189548, | |
| "rewards/xmlcount_reward_func/mean": 0.478515625, | |
| "rewards/xmlcount_reward_func/std": 0.060735128819942474, | |
| "step": 29 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0234375, | |
| "completions/max_length": 468.625, | |
| "completions/max_terminated_length": 436.25, | |
| "completions/mean_length": 267.0859375, | |
| "completions/mean_terminated_length": 261.05000495910645, | |
| "completions/min_length": 156.5, | |
| "completions/min_terminated_length": 156.5, | |
| "entropy": 0.1107498500496149, | |
| "epoch": 0.2569593147751606, | |
| "frac_reward_zero_std": 0.828125, | |
| "grad_norm": 22.375, | |
| "learning_rate": 1.87340820061713e-05, | |
| "loss": 0.0968, | |
| "num_tokens": 1244242.0, | |
| "reward": 3.5576171875, | |
| "reward_std": 0.2610218357294798, | |
| "rewards/correctness_reward_func/mean": 1.671875, | |
| "rewards/correctness_reward_func/std": 0.6637040823698044, | |
| "rewards/int_reward_func/mean": 0.4609375, | |
| "rewards/int_reward_func/std": 0.11179866641759872, | |
| "rewards/soft_format_reward_func/mean": 0.4765625, | |
| "rewards/soft_format_reward_func/std": 0.05920085124671459, | |
| "rewards/strict_format_reward_func/mean": 0.47265625, | |
| "rewards/strict_format_reward_func/std": 0.07482585124671459, | |
| "rewards/xmlcount_reward_func/mean": 0.4755859375, | |
| "rewards/xmlcount_reward_func/std": 0.06549832038581371, | |
| "step": 30 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0546875, | |
| "completions/max_length": 465.625, | |
| "completions/max_terminated_length": 430.875, | |
| "completions/mean_length": 279.1953125, | |
| "completions/mean_terminated_length": 266.19639587402344, | |
| "completions/min_length": 156.5, | |
| "completions/min_terminated_length": 156.5, | |
| "entropy": 0.10201808018609881, | |
| "epoch": 0.26552462526766596, | |
| "frac_reward_zero_std": 0.734375, | |
| "grad_norm": 20.375, | |
| "learning_rate": 1.8584487936018663e-05, | |
| "loss": 0.1084, | |
| "num_tokens": 1292255.0, | |
| "reward": 3.431640625, | |
| "reward_std": 0.3204077649861574, | |
| "rewards/correctness_reward_func/mean": 1.5625, | |
| "rewards/correctness_reward_func/std": 0.7631078958511353, | |
| "rewards/int_reward_func/mean": 0.47265625, | |
| "rewards/int_reward_func/std": 0.07779237069189548, | |
| "rewards/soft_format_reward_func/mean": 0.46484375, | |
| "rewards/soft_format_reward_func/std": 0.09617366641759872, | |
| "rewards/strict_format_reward_func/mean": 0.4609375, | |
| "rewards/strict_format_reward_func/std": 0.10002040676772594, | |
| "rewards/xmlcount_reward_func/mean": 0.470703125, | |
| "rewards/xmlcount_reward_func/std": 0.0801805853843689, | |
| "step": 31 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 456.0, | |
| "completions/max_terminated_length": 420.5, | |
| "completions/mean_length": 273.09375, | |
| "completions/mean_terminated_length": 265.13988304138184, | |
| "completions/min_length": 162.625, | |
| "completions/min_terminated_length": 162.625, | |
| "entropy": 0.0854261638596654, | |
| "epoch": 0.2740899357601713, | |
| "frac_reward_zero_std": 0.640625, | |
| "grad_norm": 6.15625, | |
| "learning_rate": 1.8427209586540392e-05, | |
| "loss": 0.1575, | |
| "num_tokens": 1339511.0, | |
| "reward": 3.3798828125, | |
| "reward_std": 0.4875169713050127, | |
| "rewards/correctness_reward_func/mean": 1.578125, | |
| "rewards/correctness_reward_func/std": 0.80234594643116, | |
| "rewards/int_reward_func/mean": 0.46875, | |
| "rewards/int_reward_func/std": 0.09341737069189548, | |
| "rewards/soft_format_reward_func/mean": 0.4375, | |
| "rewards/soft_format_reward_func/std": 0.15228559263050556, | |
| "rewards/strict_format_reward_func/mean": 0.43359375, | |
| "rewards/strict_format_reward_func/std": 0.15613233298063278, | |
| "rewards/xmlcount_reward_func/mean": 0.4619140625, | |
| "rewards/xmlcount_reward_func/std": 0.10068135987967253, | |
| "step": 32 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0390625, | |
| "completions/max_length": 469.625, | |
| "completions/max_terminated_length": 423.625, | |
| "completions/mean_length": 274.953125, | |
| "completions/mean_terminated_length": 265.3171920776367, | |
| "completions/min_length": 137.0, | |
| "completions/min_terminated_length": 137.0, | |
| "entropy": 0.08486761944368482, | |
| "epoch": 0.2826552462526767, | |
| "frac_reward_zero_std": 0.65625, | |
| "grad_norm": 7.25, | |
| "learning_rate": 1.826238774315995e-05, | |
| "loss": 0.124, | |
| "num_tokens": 1386753.0, | |
| "reward": 3.3203125, | |
| "reward_std": 0.5109951309859753, | |
| "rewards/correctness_reward_func/mean": 1.59375, | |
| "rewards/correctness_reward_func/std": 0.6808668300509453, | |
| "rewards/int_reward_func/mean": 0.46484375, | |
| "rewards/int_reward_func/std": 0.08736192621290684, | |
| "rewards/soft_format_reward_func/mean": 0.41796875, | |
| "rewards/soft_format_reward_func/std": 0.17672233283519745, | |
| "rewards/strict_format_reward_func/mean": 0.3984375, | |
| "rewards/strict_format_reward_func/std": 0.19531385228037834, | |
| "rewards/xmlcount_reward_func/mean": 0.4453125, | |
| "rewards/xmlcount_reward_func/std": 0.11597495479509234, | |
| "step": 33 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.125, | |
| "completions/max_length": 500.375, | |
| "completions/max_terminated_length": 410.125, | |
| "completions/mean_length": 295.1015625, | |
| "completions/mean_terminated_length": 264.6992950439453, | |
| "completions/min_length": 134.0, | |
| "completions/min_terminated_length": 134.0, | |
| "entropy": 0.1147261718288064, | |
| "epoch": 0.291220556745182, | |
| "frac_reward_zero_std": 0.625, | |
| "grad_norm": 10.4375, | |
| "learning_rate": 1.8090169943749477e-05, | |
| "loss": 0.2837, | |
| "num_tokens": 1436884.0, | |
| "reward": 3.1953125, | |
| "reward_std": 0.5910970717668533, | |
| "rewards/correctness_reward_func/mean": 1.453125, | |
| "rewards/correctness_reward_func/std": 0.9149020090699196, | |
| "rewards/int_reward_func/mean": 0.46484375, | |
| "rewards/int_reward_func/std": 0.1109184455126524, | |
| "rewards/soft_format_reward_func/mean": 0.453125, | |
| "rewards/soft_format_reward_func/std": 0.13611300103366375, | |
| "rewards/strict_format_reward_func/mean": 0.390625, | |
| "rewards/strict_format_reward_func/std": 0.21258162707090378, | |
| "rewards/xmlcount_reward_func/mean": 0.43359375, | |
| "rewards/xmlcount_reward_func/std": 0.147721191868186, | |
| "step": 34 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0546875, | |
| "completions/max_length": 471.0, | |
| "completions/max_terminated_length": 403.25, | |
| "completions/mean_length": 256.2578125, | |
| "completions/mean_terminated_length": 241.55090141296387, | |
| "completions/min_length": 135.5, | |
| "completions/min_terminated_length": 135.5, | |
| "entropy": 0.06437111645936966, | |
| "epoch": 0.29978586723768735, | |
| "frac_reward_zero_std": 0.671875, | |
| "grad_norm": 10.0625, | |
| "learning_rate": 1.7910710346563417e-05, | |
| "loss": 0.0329, | |
| "num_tokens": 1481663.0, | |
| "reward": 3.4833984375, | |
| "reward_std": 0.53999756090343, | |
| "rewards/correctness_reward_func/mean": 1.640625, | |
| "rewards/correctness_reward_func/std": 0.778965063393116, | |
| "rewards/int_reward_func/mean": 0.46484375, | |
| "rewards/int_reward_func/std": 0.12082063034176826, | |
| "rewards/soft_format_reward_func/mean": 0.46484375, | |
| "rewards/soft_format_reward_func/std": 0.12082063034176826, | |
| "rewards/strict_format_reward_func/mean": 0.44921875, | |
| "rewards/strict_format_reward_func/std": 0.14689540676772594, | |
| "rewards/xmlcount_reward_func/mean": 0.4638671875, | |
| "rewards/xmlcount_reward_func/std": 0.10555252991616726, | |
| "step": 35 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.046875, | |
| "completions/max_length": 457.875, | |
| "completions/max_terminated_length": 417.625, | |
| "completions/mean_length": 246.2265625, | |
| "completions/mean_terminated_length": 232.80781745910645, | |
| "completions/min_length": 77.0, | |
| "completions/min_terminated_length": 77.0, | |
| "entropy": 0.09419436752796173, | |
| "epoch": 0.3083511777301927, | |
| "frac_reward_zero_std": 0.65625, | |
| "grad_norm": 16.25, | |
| "learning_rate": 1.7724169592245996e-05, | |
| "loss": -0.0012, | |
| "num_tokens": 1524892.0, | |
| "reward": 3.3369140625, | |
| "reward_std": 0.34941017907112837, | |
| "rewards/correctness_reward_func/mean": 1.546875, | |
| "rewards/correctness_reward_func/std": 0.7342507243156433, | |
| "rewards/int_reward_func/mean": 0.46875, | |
| "rewards/int_reward_func/std": 0.09341737069189548, | |
| "rewards/soft_format_reward_func/mean": 0.453125, | |
| "rewards/soft_format_reward_func/std": 0.1095899622887373, | |
| "rewards/strict_format_reward_func/mean": 0.43359375, | |
| "rewards/strict_format_reward_func/std": 0.14689540676772594, | |
| "rewards/xmlcount_reward_func/mean": 0.4345703125, | |
| "rewards/xmlcount_reward_func/std": 0.12757759355008602, | |
| "step": 36 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.078125, | |
| "completions/max_length": 466.875, | |
| "completions/max_terminated_length": 341.625, | |
| "completions/mean_length": 237.1953125, | |
| "completions/mean_terminated_length": 214.32523155212402, | |
| "completions/min_length": 102.5, | |
| "completions/min_terminated_length": 102.5, | |
| "entropy": 0.08123180363327265, | |
| "epoch": 0.3169164882226981, | |
| "frac_reward_zero_std": 0.75, | |
| "grad_norm": 12.0625, | |
| "learning_rate": 1.7530714660036112e-05, | |
| "loss": 0.4256, | |
| "num_tokens": 1566655.0, | |
| "reward": 3.6396484375, | |
| "reward_std": 0.23616261687129736, | |
| "rewards/correctness_reward_func/mean": 1.765625, | |
| "rewards/correctness_reward_func/std": 0.45028156042099, | |
| "rewards/int_reward_func/mean": 0.484375, | |
| "rewards/int_reward_func/std": 0.05259781517088413, | |
| "rewards/soft_format_reward_func/mean": 0.484375, | |
| "rewards/soft_format_reward_func/std": 0.0625, | |
| "rewards/strict_format_reward_func/mean": 0.44140625, | |
| "rewards/strict_format_reward_func/std": 0.15537451766431332, | |
| "rewards/xmlcount_reward_func/mean": 0.4638671875, | |
| "rewards/xmlcount_reward_func/std": 0.10096731083467603, | |
| "step": 37 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.015625, | |
| "completions/max_length": 446.875, | |
| "completions/max_terminated_length": 403.125, | |
| "completions/mean_length": 237.4453125, | |
| "completions/mean_terminated_length": 232.93490028381348, | |
| "completions/min_length": 120.625, | |
| "completions/min_terminated_length": 120.625, | |
| "entropy": 0.07567687798291445, | |
| "epoch": 0.32548179871520344, | |
| "frac_reward_zero_std": 0.75, | |
| "grad_norm": 8.8125, | |
| "learning_rate": 1.7330518718298263e-05, | |
| "loss": 0.1448, | |
| "num_tokens": 1609482.0, | |
| "reward": 3.5947265625, | |
| "reward_std": 0.3079781401902437, | |
| "rewards/correctness_reward_func/mean": 1.625, | |
| "rewards/correctness_reward_func/std": 0.7266493514180183, | |
| "rewards/int_reward_func/mean": 0.5, | |
| "rewards/int_reward_func/std": 0.0, | |
| "rewards/soft_format_reward_func/mean": 0.4921875, | |
| "rewards/soft_format_reward_func/std": 0.021347815170884132, | |
| "rewards/strict_format_reward_func/mean": 0.484375, | |
| "rewards/strict_format_reward_func/std": 0.05259781517088413, | |
| "rewards/xmlcount_reward_func/mean": 0.4931640625, | |
| "rewards/xmlcount_reward_func/std": 0.023663727566599846, | |
| "step": 38 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0078125, | |
| "completions/max_length": 412.75, | |
| "completions/max_terminated_length": 407.25, | |
| "completions/mean_length": 225.1953125, | |
| "completions/mean_terminated_length": 223.21041870117188, | |
| "completions/min_length": 123.125, | |
| "completions/min_terminated_length": 123.125, | |
| "entropy": 0.061285244300961494, | |
| "epoch": 0.3340471092077088, | |
| "frac_reward_zero_std": 0.84375, | |
| "grad_norm": 6.0, | |
| "learning_rate": 1.712376096951345e-05, | |
| "loss": 0.1485, | |
| "num_tokens": 1649845.0, | |
| "reward": 3.748046875, | |
| "reward_std": 0.17953883367590606, | |
| "rewards/correctness_reward_func/mean": 1.765625, | |
| "rewards/correctness_reward_func/std": 0.601259708404541, | |
| "rewards/int_reward_func/mean": 0.5, | |
| "rewards/int_reward_func/std": 0.0, | |
| "rewards/soft_format_reward_func/mean": 0.5, | |
| "rewards/soft_format_reward_func/std": 0.0, | |
| "rewards/strict_format_reward_func/mean": 0.49609375, | |
| "rewards/strict_format_reward_func/std": 0.015625, | |
| "rewards/xmlcount_reward_func/mean": 0.486328125, | |
| "rewards/xmlcount_reward_func/std": 0.04478531517088413, | |
| "step": 39 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0234375, | |
| "completions/max_length": 397.125, | |
| "completions/max_terminated_length": 352.75, | |
| "completions/mean_length": 211.8359375, | |
| "completions/mean_terminated_length": 204.96146202087402, | |
| "completions/min_length": 96.0, | |
| "completions/min_terminated_length": 96.0, | |
| "entropy": 0.08425743412226439, | |
| "epoch": 0.3426124197002141, | |
| "frac_reward_zero_std": 0.796875, | |
| "grad_norm": 9.6875, | |
| "learning_rate": 1.691062648986865e-05, | |
| "loss": 0.0101, | |
| "num_tokens": 1689182.0, | |
| "reward": 3.4814453125, | |
| "reward_std": 0.3245509583503008, | |
| "rewards/correctness_reward_func/mean": 1.59375, | |
| "rewards/correctness_reward_func/std": 0.7279798686504364, | |
| "rewards/int_reward_func/mean": 0.46484375, | |
| "rewards/int_reward_func/std": 0.09617366641759872, | |
| "rewards/soft_format_reward_func/mean": 0.4765625, | |
| "rewards/soft_format_reward_func/std": 0.07206955552101135, | |
| "rewards/strict_format_reward_func/mean": 0.46875, | |
| "rewards/strict_format_reward_func/std": 0.10331955552101135, | |
| "rewards/xmlcount_reward_func/mean": 0.4775390625, | |
| "rewards/xmlcount_reward_func/std": 0.06816330552101135, | |
| "step": 40 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 415.625, | |
| "completions/max_terminated_length": 373.375, | |
| "completions/mean_length": 202.0078125, | |
| "completions/mean_terminated_length": 192.4726963043213, | |
| "completions/min_length": 73.75, | |
| "completions/min_terminated_length": 73.75, | |
| "entropy": 0.08351494651287794, | |
| "epoch": 0.3511777301927195, | |
| "frac_reward_zero_std": 0.78125, | |
| "grad_norm": 7.1875, | |
| "learning_rate": 1.6691306063588583e-05, | |
| "loss": 0.0866, | |
| "num_tokens": 1726993.0, | |
| "reward": 3.4150390625, | |
| "reward_std": 0.23063834570348263, | |
| "rewards/correctness_reward_func/mean": 1.546875, | |
| "rewards/correctness_reward_func/std": 0.7765756696462631, | |
| "rewards/int_reward_func/mean": 0.453125, | |
| "rewards/int_reward_func/std": 0.11146603710949421, | |
| "rewards/soft_format_reward_func/mean": 0.48046875, | |
| "rewards/soft_format_reward_func/std": 0.046542370691895485, | |
| "rewards/strict_format_reward_func/mean": 0.4609375, | |
| "rewards/strict_format_reward_func/std": 0.10189648158848286, | |
| "rewards/xmlcount_reward_func/mean": 0.4736328125, | |
| "rewards/xmlcount_reward_func/std": 0.06877632485702634, | |
| "step": 41 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0234375, | |
| "completions/max_length": 393.0, | |
| "completions/max_terminated_length": 353.875, | |
| "completions/mean_length": 199.8359375, | |
| "completions/mean_terminated_length": 192.48474884033203, | |
| "completions/min_length": 87.5, | |
| "completions/min_terminated_length": 87.5, | |
| "entropy": 0.07358774542808533, | |
| "epoch": 0.35974304068522484, | |
| "frac_reward_zero_std": 0.765625, | |
| "grad_norm": 6.28125, | |
| "learning_rate": 1.6465996012157996e-05, | |
| "loss": 0.2018, | |
| "num_tokens": 1765048.0, | |
| "reward": 3.466796875, | |
| "reward_std": 0.3204077500849962, | |
| "rewards/correctness_reward_func/mean": 1.53125, | |
| "rewards/correctness_reward_func/std": 0.8384338021278381, | |
| "rewards/int_reward_func/mean": 0.48828125, | |
| "rewards/int_reward_func/std": 0.03697281517088413, | |
| "rewards/soft_format_reward_func/mean": 0.48828125, | |
| "rewards/soft_format_reward_func/std": 0.03697281517088413, | |
| "rewards/strict_format_reward_func/mean": 0.47265625, | |
| "rewards/strict_format_reward_func/std": 0.08957063034176826, | |
| "rewards/xmlcount_reward_func/mean": 0.486328125, | |
| "rewards/xmlcount_reward_func/std": 0.045771504286676645, | |
| "step": 42 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.078125, | |
| "completions/max_length": 463.75, | |
| "completions/max_terminated_length": 342.0, | |
| "completions/mean_length": 203.1796875, | |
| "completions/mean_terminated_length": 177.74688339233398, | |
| "completions/min_length": 76.625, | |
| "completions/min_terminated_length": 76.625, | |
| "entropy": 0.09338215924799442, | |
| "epoch": 0.3683083511777302, | |
| "frac_reward_zero_std": 0.671875, | |
| "grad_norm": 14.1875, | |
| "learning_rate": 1.6234898018587336e-05, | |
| "loss": 0.4962, | |
| "num_tokens": 1802763.0, | |
| "reward": 3.421875, | |
| "reward_std": 0.39774755109101534, | |
| "rewards/correctness_reward_func/mean": 1.53125, | |
| "rewards/correctness_reward_func/std": 0.8366330787539482, | |
| "rewards/int_reward_func/mean": 0.4921875, | |
| "rewards/int_reward_func/std": 0.03125, | |
| "rewards/soft_format_reward_func/mean": 0.5, | |
| "rewards/soft_format_reward_func/std": 0.0, | |
| "rewards/strict_format_reward_func/mean": 0.4453125, | |
| "rewards/strict_format_reward_func/std": 0.1271837092936039, | |
| "rewards/xmlcount_reward_func/mean": 0.453125, | |
| "rewards/xmlcount_reward_func/std": 0.1128259189426899, | |
| "step": 43 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0546875, | |
| "completions/max_length": 434.5, | |
| "completions/max_terminated_length": 338.5, | |
| "completions/mean_length": 200.421875, | |
| "completions/mean_terminated_length": 181.6392650604248, | |
| "completions/min_length": 95.125, | |
| "completions/min_terminated_length": 95.125, | |
| "entropy": 0.09290066035464406, | |
| "epoch": 0.37687366167023556, | |
| "frac_reward_zero_std": 0.78125, | |
| "grad_norm": 129.0, | |
| "learning_rate": 1.599821894687914e-05, | |
| "loss": 0.2028, | |
| "num_tokens": 1840283.0, | |
| "reward": 3.3876953125, | |
| "reward_std": 0.27759466134011745, | |
| "rewards/correctness_reward_func/mean": 1.515625, | |
| "rewards/correctness_reward_func/std": 0.7650169730186462, | |
| "rewards/int_reward_func/mean": 0.48828125, | |
| "rewards/int_reward_func/std": 0.025194555521011353, | |
| "rewards/soft_format_reward_func/mean": 0.48046875, | |
| "rewards/soft_format_reward_func/std": 0.05644455552101135, | |
| "rewards/strict_format_reward_func/mean": 0.4453125, | |
| "rewards/strict_format_reward_func/std": 0.11806907318532467, | |
| "rewards/xmlcount_reward_func/mean": 0.4580078125, | |
| "rewards/xmlcount_reward_func/std": 0.10733805038034916, | |
| "step": 44 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0078125, | |
| "completions/max_length": 302.125, | |
| "completions/max_terminated_length": 273.5, | |
| "completions/mean_length": 161.5859375, | |
| "completions/mean_terminated_length": 158.94635581970215, | |
| "completions/min_length": 83.75, | |
| "completions/min_terminated_length": 83.75, | |
| "entropy": 0.07536831498146057, | |
| "epoch": 0.3854389721627409, | |
| "frac_reward_zero_std": 0.75, | |
| "grad_norm": 52.5, | |
| "learning_rate": 1.575617065685674e-05, | |
| "loss": 0.0339, | |
| "num_tokens": 1873008.0, | |
| "reward": 3.439453125, | |
| "reward_std": 0.4253689181059599, | |
| "rewards/correctness_reward_func/mean": 1.5625, | |
| "rewards/correctness_reward_func/std": 0.8102209344506264, | |
| "rewards/int_reward_func/mean": 0.45703125, | |
| "rewards/int_reward_func/std": 0.11861192621290684, | |
| "rewards/soft_format_reward_func/mean": 0.4765625, | |
| "rewards/soft_format_reward_func/std": 0.050389111042022705, | |
| "rewards/strict_format_reward_func/mean": 0.47265625, | |
| "rewards/strict_format_reward_func/std": 0.053145406767725945, | |
| "rewards/xmlcount_reward_func/mean": 0.470703125, | |
| "rewards/xmlcount_reward_func/std": 0.060957906767725945, | |
| "step": 45 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.078125, | |
| "completions/max_length": 444.5, | |
| "completions/max_terminated_length": 266.75, | |
| "completions/mean_length": 179.7578125, | |
| "completions/mean_terminated_length": 152.03579235076904, | |
| "completions/min_length": 82.0, | |
| "completions/min_terminated_length": 82.0, | |
| "entropy": 0.06918492680415511, | |
| "epoch": 0.39400428265524623, | |
| "frac_reward_zero_std": 0.625, | |
| "grad_norm": 24.125, | |
| "learning_rate": 1.5508969814521026e-05, | |
| "loss": 0.1968, | |
| "num_tokens": 1907887.0, | |
| "reward": 3.259765625, | |
| "reward_std": 0.4695630930364132, | |
| "rewards/correctness_reward_func/mean": 1.5, | |
| "rewards/correctness_reward_func/std": 0.8375296071171761, | |
| "rewards/int_reward_func/mean": 0.4765625, | |
| "rewards/int_reward_func/std": 0.07206955552101135, | |
| "rewards/soft_format_reward_func/mean": 0.4609375, | |
| "rewards/soft_format_reward_func/std": 0.08824022859334946, | |
| "rewards/strict_format_reward_func/mean": 0.38671875, | |
| "rewards/strict_format_reward_func/std": 0.20270179212093353, | |
| "rewards/xmlcount_reward_func/mean": 0.435546875, | |
| "rewards/xmlcount_reward_func/std": 0.12772688083350658, | |
| "step": 46 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.5859375, | |
| "completions/max_length": 512.0, | |
| "completions/max_terminated_length": 234.25, | |
| "completions/mean_length": 361.9609375, | |
| "completions/mean_terminated_length": 144.42629528045654, | |
| "completions/min_length": 80.375, | |
| "completions/min_terminated_length": 80.375, | |
| "entropy": 0.06124910665675998, | |
| "epoch": 0.4025695931477516, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 20.0, | |
| "learning_rate": 1.5256837698105047e-05, | |
| "loss": 0.4458, | |
| "num_tokens": 1965858.0, | |
| "reward": 2.03125, | |
| "reward_std": 0.770635899156332, | |
| "rewards/correctness_reward_func/mean": 1.0625, | |
| "rewards/correctness_reward_func/std": 0.978024922311306, | |
| "rewards/int_reward_func/mean": 0.41015625, | |
| "rewards/int_reward_func/std": 0.14781177043914795, | |
| "rewards/soft_format_reward_func/mean": 0.34765625, | |
| "rewards/soft_format_reward_func/std": 0.221479382365942, | |
| "rewards/strict_format_reward_func/mean": 0.046875, | |
| "rewards/strict_format_reward_func/std": 0.1095899622887373, | |
| "rewards/xmlcount_reward_func/mean": 0.1640625, | |
| "rewards/xmlcount_reward_func/std": 0.17734735272824764, | |
| "step": 47 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.78125, | |
| "completions/max_length": 512.0, | |
| "completions/max_terminated_length": 174.375, | |
| "completions/mean_length": 431.34375, | |
| "completions/mean_terminated_length": 122.72916984558105, | |
| "completions/min_length": 152.25, | |
| "completions/min_terminated_length": 88.25, | |
| "entropy": 0.06367563363164663, | |
| "epoch": 0.41113490364025695, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 20.75, | |
| "learning_rate": 1.5000000000000002e-05, | |
| "loss": 0.5487, | |
| "num_tokens": 2033422.0, | |
| "reward": 1.4404296875, | |
| "reward_std": 0.7416334673762321, | |
| "rewards/correctness_reward_func/mean": 0.71875, | |
| "rewards/correctness_reward_func/std": 0.9663743898272514, | |
| "rewards/int_reward_func/mean": 0.30078125, | |
| "rewards/int_reward_func/std": 0.23859525099396706, | |
| "rewards/soft_format_reward_func/mean": 0.29296875, | |
| "rewards/soft_format_reward_func/std": 0.23586604371666908, | |
| "rewards/strict_format_reward_func/mean": 0.015625, | |
| "rewards/strict_format_reward_func/std": 0.0625, | |
| "rewards/xmlcount_reward_func/mean": 0.1123046875, | |
| "rewards/xmlcount_reward_func/std": 0.16386567754670978, | |
| "step": 48 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.6328125, | |
| "completions/max_length": 512.0, | |
| "completions/max_terminated_length": 230.375, | |
| "completions/mean_length": 376.578125, | |
| "completions/mean_terminated_length": 143.8458366394043, | |
| "completions/min_length": 79.75, | |
| "completions/min_terminated_length": 79.75, | |
| "entropy": 0.06622787471860647, | |
| "epoch": 0.4197002141327623, | |
| "frac_reward_zero_std": 0.234375, | |
| "grad_norm": 69.5, | |
| "learning_rate": 1.4738686624729987e-05, | |
| "loss": 0.9777, | |
| "num_tokens": 2093936.0, | |
| "reward": 1.595703125, | |
| "reward_std": 0.7098689079284668, | |
| "rewards/correctness_reward_func/mean": 0.796875, | |
| "rewards/correctness_reward_func/std": 0.9754082411527634, | |
| "rewards/int_reward_func/mean": 0.33984375, | |
| "rewards/int_reward_func/std": 0.21231234446167946, | |
| "rewards/soft_format_reward_func/mean": 0.28125, | |
| "rewards/soft_format_reward_func/std": 0.23983315750956535, | |
| "rewards/strict_format_reward_func/mean": 0.015625, | |
| "rewards/strict_format_reward_func/std": 0.05259781517088413, | |
| "rewards/xmlcount_reward_func/mean": 0.162109375, | |
| "rewards/xmlcount_reward_func/std": 0.1913837492465973, | |
| "step": 49 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.640625, | |
| "completions/max_length": 512.0, | |
| "completions/max_terminated_length": 213.5, | |
| "completions/mean_length": 382.2734375, | |
| "completions/mean_terminated_length": 144.93779945373535, | |
| "completions/min_length": 91.75, | |
| "completions/min_terminated_length": 91.75, | |
| "entropy": 0.055487995967268944, | |
| "epoch": 0.4282655246252677, | |
| "frac_reward_zero_std": 0.265625, | |
| "grad_norm": 21.75, | |
| "learning_rate": 1.4473131483156326e-05, | |
| "loss": 0.6731, | |
| "num_tokens": 2154945.0, | |
| "reward": 1.291015625, | |
| "reward_std": 0.7789223082363605, | |
| "rewards/correctness_reward_func/mean": 0.609375, | |
| "rewards/correctness_reward_func/std": 0.9308035299181938, | |
| "rewards/int_reward_func/mean": 0.25, | |
| "rewards/int_reward_func/std": 0.2540716640651226, | |
| "rewards/soft_format_reward_func/mean": 0.25, | |
| "rewards/soft_format_reward_func/std": 0.24922906793653965, | |
| "rewards/strict_format_reward_func/mean": 0.0, | |
| "rewards/strict_format_reward_func/std": 0.0, | |
| "rewards/xmlcount_reward_func/mean": 0.181640625, | |
| "rewards/xmlcount_reward_func/std": 0.20568038523197174, | |
| "step": 50 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.5859375, | |
| "completions/max_length": 512.0, | |
| "completions/max_terminated_length": 302.25, | |
| "completions/mean_length": 374.421875, | |
| "completions/mean_terminated_length": 177.11860466003418, | |
| "completions/min_length": 93.125, | |
| "completions/min_terminated_length": 93.125, | |
| "entropy": 0.05054905638098717, | |
| "epoch": 0.43683083511777304, | |
| "frac_reward_zero_std": 0.1875, | |
| "grad_norm": 24.125, | |
| "learning_rate": 1.4203572283095657e-05, | |
| "loss": 1.1031, | |
| "num_tokens": 2214589.0, | |
| "reward": 1.5400390625, | |
| "reward_std": 1.028895616531372, | |
| "rewards/correctness_reward_func/mean": 0.765625, | |
| "rewards/correctness_reward_func/std": 0.9744589924812317, | |
| "rewards/int_reward_func/mean": 0.2734375, | |
| "rewards/int_reward_func/std": 0.2540593519806862, | |
| "rewards/soft_format_reward_func/mean": 0.2890625, | |
| "rewards/soft_format_reward_func/std": 0.25303449109196663, | |
| "rewards/strict_format_reward_func/mean": 0.0, | |
| "rewards/strict_format_reward_func/std": 0.0, | |
| "rewards/xmlcount_reward_func/mean": 0.2119140625, | |
| "rewards/xmlcount_reward_func/std": 0.21405917219817638, | |
| "step": 51 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.7109375, | |
| "completions/max_length": 512.0, | |
| "completions/max_terminated_length": 252.25, | |
| "completions/mean_length": 412.453125, | |
| "completions/mean_terminated_length": 173.9702386856079, | |
| "completions/min_length": 113.25, | |
| "completions/min_terminated_length": 113.25, | |
| "entropy": 0.051852176897227764, | |
| "epoch": 0.44539614561027835, | |
| "frac_reward_zero_std": 0.1875, | |
| "grad_norm": 17.75, | |
| "learning_rate": 1.3930250316539237e-05, | |
| "loss": 1.013, | |
| "num_tokens": 2280025.0, | |
| "reward": 0.9990234375, | |
| "reward_std": 0.885264553129673, | |
| "rewards/correctness_reward_func/mean": 0.4375, | |
| "rewards/correctness_reward_func/std": 0.811570405960083, | |
| "rewards/int_reward_func/mean": 0.15625, | |
| "rewards/int_reward_func/std": 0.22798974812030792, | |
| "rewards/soft_format_reward_func/mean": 0.22265625, | |
| "rewards/soft_format_reward_func/std": 0.25226276740431786, | |
| "rewards/strict_format_reward_func/mean": 0.0, | |
| "rewards/strict_format_reward_func/std": 0.0, | |
| "rewards/xmlcount_reward_func/mean": 0.1826171875, | |
| "rewards/xmlcount_reward_func/std": 0.18229194171726704, | |
| "step": 52 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.515625, | |
| "completions/max_length": 512.0, | |
| "completions/max_terminated_length": 285.875, | |
| "completions/mean_length": 351.171875, | |
| "completions/mean_terminated_length": 178.26887321472168, | |
| "completions/min_length": 108.875, | |
| "completions/min_terminated_length": 108.875, | |
| "entropy": 0.0576583961956203, | |
| "epoch": 0.4539614561027837, | |
| "frac_reward_zero_std": 0.359375, | |
| "grad_norm": 21.125, | |
| "learning_rate": 1.3653410243663953e-05, | |
| "loss": 0.8749, | |
| "num_tokens": 2337543.0, | |
| "reward": 1.5859375, | |
| "reward_std": 0.7292038351297379, | |
| "rewards/correctness_reward_func/mean": 0.75, | |
| "rewards/correctness_reward_func/std": 0.9716326966881752, | |
| "rewards/int_reward_func/mean": 0.26953125, | |
| "rewards/int_reward_func/std": 0.2501082383096218, | |
| "rewards/soft_format_reward_func/mean": 0.29296875, | |
| "rewards/soft_format_reward_func/std": 0.24548756889998913, | |
| "rewards/strict_format_reward_func/mean": 0.0, | |
| "rewards/strict_format_reward_func/std": 0.0, | |
| "rewards/xmlcount_reward_func/mean": 0.2734375, | |
| "rewards/xmlcount_reward_func/std": 0.20937122404575348, | |
| "step": 53 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.2109375, | |
| "completions/max_length": 512.0, | |
| "completions/max_terminated_length": 285.125, | |
| "completions/mean_length": 242.625, | |
| "completions/mean_terminated_length": 169.74182319641113, | |
| "completions/min_length": 97.375, | |
| "completions/min_terminated_length": 97.375, | |
| "entropy": 0.06909441482275724, | |
| "epoch": 0.4625267665952891, | |
| "frac_reward_zero_std": 0.46875, | |
| "grad_norm": 131.0, | |
| "learning_rate": 1.3373299873828303e-05, | |
| "loss": 0.9203, | |
| "num_tokens": 2379831.0, | |
| "reward": 2.27734375, | |
| "reward_std": 0.7844465803354979, | |
| "rewards/correctness_reward_func/mean": 1.125, | |
| "rewards/correctness_reward_func/std": 0.9970766380429268, | |
| "rewards/int_reward_func/mean": 0.37890625, | |
| "rewards/int_reward_func/std": 0.2144309040158987, | |
| "rewards/soft_format_reward_func/mean": 0.37890625, | |
| "rewards/soft_format_reward_func/std": 0.21300501003861427, | |
| "rewards/strict_format_reward_func/mean": 0.0078125, | |
| "rewards/strict_format_reward_func/std": 0.03125, | |
| "rewards/xmlcount_reward_func/mean": 0.38671875, | |
| "rewards/xmlcount_reward_func/std": 0.18055572640150785, | |
| "step": 54 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.1484375, | |
| "completions/max_length": 512.0, | |
| "completions/max_terminated_length": 287.125, | |
| "completions/mean_length": 227.8671875, | |
| "completions/mean_terminated_length": 178.1862964630127, | |
| "completions/min_length": 82.875, | |
| "completions/min_terminated_length": 82.875, | |
| "entropy": 0.0683035789988935, | |
| "epoch": 0.47109207708779444, | |
| "frac_reward_zero_std": 0.53125, | |
| "grad_norm": 75.5, | |
| "learning_rate": 1.3090169943749475e-05, | |
| "loss": 0.7331, | |
| "num_tokens": 2421244.0, | |
| "reward": 2.6162109375, | |
| "reward_std": 0.5620946288108826, | |
| "rewards/correctness_reward_func/mean": 1.375, | |
| "rewards/correctness_reward_func/std": 0.9013157784938812, | |
| "rewards/int_reward_func/mean": 0.41015625, | |
| "rewards/int_reward_func/std": 0.18550433963537216, | |
| "rewards/soft_format_reward_func/mean": 0.4140625, | |
| "rewards/soft_format_reward_func/std": 0.17175541445612907, | |
| "rewards/strict_format_reward_func/mean": 0.0078125, | |
| "rewards/strict_format_reward_func/std": 0.03125, | |
| "rewards/xmlcount_reward_func/mean": 0.4091796875, | |
| "rewards/xmlcount_reward_func/std": 0.15384871885180473, | |
| "step": 55 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.171875, | |
| "completions/max_length": 512.0, | |
| "completions/max_terminated_length": 274.875, | |
| "completions/mean_length": 230.6484375, | |
| "completions/mean_terminated_length": 173.36169624328613, | |
| "completions/min_length": 101.5, | |
| "completions/min_terminated_length": 101.5, | |
| "entropy": 0.06412349035963416, | |
| "epoch": 0.4796573875802998, | |
| "frac_reward_zero_std": 0.609375, | |
| "grad_norm": 17.75, | |
| "learning_rate": 1.2804273893060028e-05, | |
| "loss": 0.8135, | |
| "num_tokens": 2462929.0, | |
| "reward": 2.8583984375, | |
| "reward_std": 0.5731431804597378, | |
| "rewards/correctness_reward_func/mean": 1.546875, | |
| "rewards/correctness_reward_func/std": 0.8076042532920837, | |
| "rewards/int_reward_func/mean": 0.43359375, | |
| "rewards/int_reward_func/std": 0.15128782019019127, | |
| "rewards/soft_format_reward_func/mean": 0.4453125, | |
| "rewards/soft_format_reward_func/std": 0.12609326466917992, | |
| "rewards/strict_format_reward_func/mean": 0.01953125, | |
| "rewards/strict_format_reward_func/std": 0.06822281517088413, | |
| "rewards/xmlcount_reward_func/mean": 0.4130859375, | |
| "rewards/xmlcount_reward_func/std": 0.14727921038866043, | |
| "step": 56 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.15625, | |
| "completions/max_length": 481.625, | |
| "completions/max_terminated_length": 265.5, | |
| "completions/mean_length": 213.640625, | |
| "completions/mean_terminated_length": 158.69331169128418, | |
| "completions/min_length": 87.125, | |
| "completions/min_terminated_length": 87.125, | |
| "entropy": 0.06292425934225321, | |
| "epoch": 0.48822269807280516, | |
| "frac_reward_zero_std": 0.46875, | |
| "grad_norm": 15.0, | |
| "learning_rate": 1.2515867637445088e-05, | |
| "loss": 0.5508, | |
| "num_tokens": 2502783.0, | |
| "reward": 2.8046875, | |
| "reward_std": 0.6159562915563583, | |
| "rewards/correctness_reward_func/mean": 1.34375, | |
| "rewards/correctness_reward_func/std": 0.9502372145652771, | |
| "rewards/int_reward_func/mean": 0.41015625, | |
| "rewards/int_reward_func/std": 0.17638970352709293, | |
| "rewards/soft_format_reward_func/mean": 0.4296875, | |
| "rewards/soft_format_reward_func/std": 0.15997907333076, | |
| "rewards/strict_format_reward_func/mean": 0.25, | |
| "rewards/strict_format_reward_func/std": 0.24410519748926163, | |
| "rewards/xmlcount_reward_func/mean": 0.37109375, | |
| "rewards/xmlcount_reward_func/std": 0.1568639986217022, | |
| "step": 57 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.109375, | |
| "completions/max_length": 439.75, | |
| "completions/max_terminated_length": 258.375, | |
| "completions/mean_length": 189.484375, | |
| "completions/mean_terminated_length": 149.9307737350464, | |
| "completions/min_length": 81.625, | |
| "completions/min_terminated_length": 81.625, | |
| "entropy": 0.07262948993593454, | |
| "epoch": 0.49678800856531047, | |
| "frac_reward_zero_std": 0.671875, | |
| "grad_norm": 25.375, | |
| "learning_rate": 1.2225209339563144e-05, | |
| "loss": 0.5996, | |
| "num_tokens": 2539729.0, | |
| "reward": 3.1650390625, | |
| "reward_std": 0.42951212264597416, | |
| "rewards/correctness_reward_func/mean": 1.40625, | |
| "rewards/correctness_reward_func/std": 0.9186194837093353, | |
| "rewards/int_reward_func/mean": 0.44921875, | |
| "rewards/int_reward_func/std": 0.11343478411436081, | |
| "rewards/soft_format_reward_func/mean": 0.44921875, | |
| "rewards/soft_format_reward_func/std": 0.1270910371094942, | |
| "rewards/strict_format_reward_func/mean": 0.421875, | |
| "rewards/strict_format_reward_func/std": 0.15183541178703308, | |
| "rewards/xmlcount_reward_func/mean": 0.4384765625, | |
| "rewards/xmlcount_reward_func/std": 0.1242168415337801, | |
| "step": 58 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0390625, | |
| "completions/max_length": 428.125, | |
| "completions/max_terminated_length": 271.75, | |
| "completions/mean_length": 168.0390625, | |
| "completions/mean_terminated_length": 153.7322940826416, | |
| "completions/min_length": 83.0, | |
| "completions/min_terminated_length": 83.0, | |
| "entropy": 0.06558680208399892, | |
| "epoch": 0.5053533190578159, | |
| "frac_reward_zero_std": 0.71875, | |
| "grad_norm": 13.6875, | |
| "learning_rate": 1.1932559177955533e-05, | |
| "loss": 0.6108, | |
| "num_tokens": 2573054.0, | |
| "reward": 3.6171875, | |
| "reward_std": 0.38669902086257935, | |
| "rewards/correctness_reward_func/mean": 1.703125, | |
| "rewards/correctness_reward_func/std": 0.7130631133913994, | |
| "rewards/int_reward_func/mean": 0.46484375, | |
| "rewards/int_reward_func/std": 0.09914018586277962, | |
| "rewards/soft_format_reward_func/mean": 0.48828125, | |
| "rewards/soft_format_reward_func/std": 0.046875, | |
| "rewards/strict_format_reward_func/mean": 0.47265625, | |
| "rewards/strict_format_reward_func/std": 0.09947281517088413, | |
| "rewards/xmlcount_reward_func/mean": 0.48828125, | |
| "rewards/xmlcount_reward_func/std": 0.046875, | |
| "step": 59 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 354.25, | |
| "completions/max_terminated_length": 267.375, | |
| "completions/mean_length": 166.265625, | |
| "completions/mean_terminated_length": 155.07939338684082, | |
| "completions/min_length": 84.25, | |
| "completions/min_terminated_length": 84.25, | |
| "entropy": 0.0687381848692894, | |
| "epoch": 0.5139186295503212, | |
| "frac_reward_zero_std": 0.6875, | |
| "grad_norm": 26.25, | |
| "learning_rate": 1.1638179114151378e-05, | |
| "loss": 0.2988, | |
| "num_tokens": 2606004.0, | |
| "reward": 3.314453125, | |
| "reward_std": 0.43365532672032714, | |
| "rewards/correctness_reward_func/mean": 1.4375, | |
| "rewards/correctness_reward_func/std": 0.8874192461371422, | |
| "rewards/int_reward_func/mean": 0.46484375, | |
| "rewards/int_reward_func/std": 0.10904237069189548, | |
| "rewards/soft_format_reward_func/mean": 0.4765625, | |
| "rewards/soft_format_reward_func/std": 0.062167370691895485, | |
| "rewards/strict_format_reward_func/mean": 0.46484375, | |
| "rewards/strict_format_reward_func/std": 0.08923800103366375, | |
| "rewards/xmlcount_reward_func/mean": 0.470703125, | |
| "rewards/xmlcount_reward_func/std": 0.07348538748919964, | |
| "step": 60 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 394.875, | |
| "completions/max_terminated_length": 299.5, | |
| "completions/mean_length": 167.234375, | |
| "completions/mean_terminated_length": 155.96860313415527, | |
| "completions/min_length": 81.875, | |
| "completions/min_terminated_length": 81.875, | |
| "entropy": 0.06962216552346945, | |
| "epoch": 0.5224839400428265, | |
| "frac_reward_zero_std": 0.796875, | |
| "grad_norm": 13.0625, | |
| "learning_rate": 1.1342332658176556e-05, | |
| "loss": 0.2338, | |
| "num_tokens": 2639860.0, | |
| "reward": 3.4140625, | |
| "reward_std": 0.19334950856864452, | |
| "rewards/correctness_reward_func/mean": 1.515625, | |
| "rewards/correctness_reward_func/std": 0.8358171209692955, | |
| "rewards/int_reward_func/mean": 0.4765625, | |
| "rewards/int_reward_func/std": 0.07394563034176826, | |
| "rewards/soft_format_reward_func/mean": 0.4765625, | |
| "rewards/soft_format_reward_func/std": 0.07394563034176826, | |
| "rewards/strict_format_reward_func/mean": 0.46875, | |
| "rewards/strict_format_reward_func/std": 0.08351518586277962, | |
| "rewards/xmlcount_reward_func/mean": 0.4765625, | |
| "rewards/xmlcount_reward_func/std": 0.06315224710851908, | |
| "step": 61 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.046875, | |
| "completions/max_length": 390.375, | |
| "completions/max_terminated_length": 261.25, | |
| "completions/mean_length": 174.2734375, | |
| "completions/mean_terminated_length": 157.4187536239624, | |
| "completions/min_length": 87.5, | |
| "completions/min_terminated_length": 87.5, | |
| "entropy": 0.06758250948041677, | |
| "epoch": 0.5310492505353319, | |
| "frac_reward_zero_std": 0.671875, | |
| "grad_norm": 11.0, | |
| "learning_rate": 1.1045284632676535e-05, | |
| "loss": 0.1924, | |
| "num_tokens": 2674451.0, | |
| "reward": 3.5439453125, | |
| "reward_std": 0.4018907658755779, | |
| "rewards/correctness_reward_func/mean": 1.609375, | |
| "rewards/correctness_reward_func/std": 0.7834457755088806, | |
| "rewards/int_reward_func/mean": 0.48828125, | |
| "rewards/int_reward_func/std": 0.03697281517088413, | |
| "rewards/soft_format_reward_func/mean": 0.4921875, | |
| "rewards/soft_format_reward_func/std": 0.03125, | |
| "rewards/strict_format_reward_func/mean": 0.46875, | |
| "rewards/strict_format_reward_func/std": 0.0952934455126524, | |
| "rewards/xmlcount_reward_func/mean": 0.4853515625, | |
| "rewards/xmlcount_reward_func/std": 0.04692569188773632, | |
| "step": 62 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0078125, | |
| "completions/max_length": 269.5, | |
| "completions/max_terminated_length": 241.5, | |
| "completions/mean_length": 148.2890625, | |
| "completions/mean_terminated_length": 145.62083435058594, | |
| "completions/min_length": 89.625, | |
| "completions/min_terminated_length": 89.625, | |
| "entropy": 0.0636401055380702, | |
| "epoch": 0.5396145610278372, | |
| "frac_reward_zero_std": 0.8125, | |
| "grad_norm": 8.125, | |
| "learning_rate": 1.0747300935864245e-05, | |
| "loss": 0.0794, | |
| "num_tokens": 2705286.0, | |
| "reward": 3.6806640625, | |
| "reward_std": 0.2637839764356613, | |
| "rewards/correctness_reward_func/mean": 1.703125, | |
| "rewards/correctness_reward_func/std": 0.5994666591286659, | |
| "rewards/int_reward_func/mean": 0.484375, | |
| "rewards/int_reward_func/std": 0.042695630341768265, | |
| "rewards/soft_format_reward_func/mean": 0.5, | |
| "rewards/soft_format_reward_func/std": 0.0, | |
| "rewards/strict_format_reward_func/mean": 0.49609375, | |
| "rewards/strict_format_reward_func/std": 0.015625, | |
| "rewards/xmlcount_reward_func/mean": 0.4970703125, | |
| "rewards/xmlcount_reward_func/std": 0.01171875, | |
| "step": 63 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0078125, | |
| "completions/max_length": 282.875, | |
| "completions/max_terminated_length": 242.0, | |
| "completions/mean_length": 154.0546875, | |
| "completions/mean_terminated_length": 151.16770935058594, | |
| "completions/min_length": 82.75, | |
| "completions/min_terminated_length": 82.75, | |
| "entropy": 0.06001953314989805, | |
| "epoch": 0.5481798715203426, | |
| "frac_reward_zero_std": 0.84375, | |
| "grad_norm": 7.78125, | |
| "learning_rate": 1.044864830350515e-05, | |
| "loss": 0.045, | |
| "num_tokens": 2737255.0, | |
| "reward": 3.6259765625, | |
| "reward_std": 0.1643470786511898, | |
| "rewards/correctness_reward_func/mean": 1.65625, | |
| "rewards/correctness_reward_func/std": 0.6852209344506264, | |
| "rewards/int_reward_func/mean": 0.4921875, | |
| "rewards/int_reward_func/std": 0.021347815170884132, | |
| "rewards/soft_format_reward_func/mean": 0.5, | |
| "rewards/soft_format_reward_func/std": 0.0, | |
| "rewards/strict_format_reward_func/mean": 0.484375, | |
| "rewards/strict_format_reward_func/std": 0.0625, | |
| "rewards/xmlcount_reward_func/mean": 0.4931640625, | |
| "rewards/xmlcount_reward_func/std": 0.02734375, | |
| "step": 64 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0234375, | |
| "completions/max_length": 323.5, | |
| "completions/max_terminated_length": 268.75, | |
| "completions/mean_length": 161.921875, | |
| "completions/mean_terminated_length": 154.10967445373535, | |
| "completions/min_length": 83.125, | |
| "completions/min_terminated_length": 83.125, | |
| "entropy": 0.06517814612016082, | |
| "epoch": 0.556745182012848, | |
| "frac_reward_zero_std": 0.796875, | |
| "grad_norm": 7.8125, | |
| "learning_rate": 1.0149594070152638e-05, | |
| "loss": 0.0963, | |
| "num_tokens": 2770597.0, | |
| "reward": 3.3662109375, | |
| "reward_std": 0.29969173669815063, | |
| "rewards/correctness_reward_func/mean": 1.4375, | |
| "rewards/correctness_reward_func/std": 0.8671257123351097, | |
| "rewards/int_reward_func/mean": 0.46875, | |
| "rewards/int_reward_func/std": 0.08054866641759872, | |
| "rewards/soft_format_reward_func/mean": 0.48828125, | |
| "rewards/soft_format_reward_func/std": 0.03697281517088413, | |
| "rewards/strict_format_reward_func/mean": 0.484375, | |
| "rewards/strict_format_reward_func/std": 0.05259781517088413, | |
| "rewards/xmlcount_reward_func/mean": 0.4873046875, | |
| "rewards/xmlcount_reward_func/std": 0.04087906517088413, | |
| "step": 65 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.1171875, | |
| "completions/max_length": 453.75, | |
| "completions/max_terminated_length": 281.875, | |
| "completions/mean_length": 192.828125, | |
| "completions/mean_terminated_length": 151.42723655700684, | |
| "completions/min_length": 82.125, | |
| "completions/min_terminated_length": 82.125, | |
| "entropy": 0.05498858401551843, | |
| "epoch": 0.5653104925053534, | |
| "frac_reward_zero_std": 0.796875, | |
| "grad_norm": 10.5, | |
| "learning_rate": 9.850405929847367e-06, | |
| "loss": 0.2824, | |
| "num_tokens": 2807611.0, | |
| "reward": 3.3330078125, | |
| "reward_std": 0.3135024200892076, | |
| "rewards/correctness_reward_func/mean": 1.578125, | |
| "rewards/correctness_reward_func/std": 0.8142120242118835, | |
| "rewards/int_reward_func/mean": 0.4375, | |
| "rewards/int_reward_func/std": 0.15119514800608158, | |
| "rewards/soft_format_reward_func/mean": 0.4453125, | |
| "rewards/soft_format_reward_func/std": 0.11994514800608158, | |
| "rewards/strict_format_reward_func/mean": 0.43359375, | |
| "rewards/strict_format_reward_func/std": 0.1569179631769657, | |
| "rewards/xmlcount_reward_func/mean": 0.4384765625, | |
| "rewards/xmlcount_reward_func/std": 0.15258620493113995, | |
| "step": 66 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0390625, | |
| "completions/max_length": 377.625, | |
| "completions/max_terminated_length": 273.0, | |
| "completions/mean_length": 154.65625, | |
| "completions/mean_terminated_length": 139.93846988677979, | |
| "completions/min_length": 75.75, | |
| "completions/min_terminated_length": 75.75, | |
| "entropy": 0.0641864649951458, | |
| "epoch": 0.5738758029978587, | |
| "frac_reward_zero_std": 0.796875, | |
| "grad_norm": 14.25, | |
| "learning_rate": 9.551351696494854e-06, | |
| "loss": 0.3309, | |
| "num_tokens": 2839197.0, | |
| "reward": 3.3779296875, | |
| "reward_std": 0.321788830216974, | |
| "rewards/correctness_reward_func/mean": 1.484375, | |
| "rewards/correctness_reward_func/std": 0.8679328411817551, | |
| "rewards/int_reward_func/mean": 0.4609375, | |
| "rewards/int_reward_func/std": 0.11664126068353653, | |
| "rewards/soft_format_reward_func/mean": 0.48046875, | |
| "rewards/soft_format_reward_func/std": 0.058320630341768265, | |
| "rewards/strict_format_reward_func/mean": 0.4765625, | |
| "rewards/strict_format_reward_func/std": 0.0640434455126524, | |
| "rewards/xmlcount_reward_func/mean": 0.4755859375, | |
| "rewards/xmlcount_reward_func/std": 0.07196457590907812, | |
| "step": 67 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0390625, | |
| "completions/max_length": 324.125, | |
| "completions/max_terminated_length": 250.75, | |
| "completions/mean_length": 156.265625, | |
| "completions/mean_terminated_length": 141.62864875793457, | |
| "completions/min_length": 72.625, | |
| "completions/min_terminated_length": 72.625, | |
| "entropy": 0.05262026563286781, | |
| "epoch": 0.582441113490364, | |
| "frac_reward_zero_std": 0.875, | |
| "grad_norm": 42.5, | |
| "learning_rate": 9.252699064135759e-06, | |
| "loss": 0.2027, | |
| "num_tokens": 2870879.0, | |
| "reward": 3.443359375, | |
| "reward_std": 0.1905873753130436, | |
| "rewards/correctness_reward_func/mean": 1.546875, | |
| "rewards/correctness_reward_func/std": 0.697150319814682, | |
| "rewards/int_reward_func/mean": 0.46484375, | |
| "rewards/int_reward_func/std": 0.09914018586277962, | |
| "rewards/soft_format_reward_func/mean": 0.48046875, | |
| "rewards/soft_format_reward_func/std": 0.05644455552101135, | |
| "rewards/strict_format_reward_func/mean": 0.47265625, | |
| "rewards/strict_format_reward_func/std": 0.07482585124671459, | |
| "rewards/xmlcount_reward_func/mean": 0.478515625, | |
| "rewards/xmlcount_reward_func/std": 0.05138835124671459, | |
| "step": 68 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0234375, | |
| "completions/max_length": 314.25, | |
| "completions/max_terminated_length": 225.875, | |
| "completions/mean_length": 140.921875, | |
| "completions/mean_terminated_length": 131.4558048248291, | |
| "completions/min_length": 65.25, | |
| "completions/min_terminated_length": 65.25, | |
| "entropy": 0.05961746862158179, | |
| "epoch": 0.5910064239828694, | |
| "frac_reward_zero_std": 0.734375, | |
| "grad_norm": 29.5, | |
| "learning_rate": 8.954715367323468e-06, | |
| "loss": 0.2871, | |
| "num_tokens": 2900613.0, | |
| "reward": 3.34375, | |
| "reward_std": 0.44194173626601696, | |
| "rewards/correctness_reward_func/mean": 1.515625, | |
| "rewards/correctness_reward_func/std": 0.8490326702594757, | |
| "rewards/int_reward_func/mean": 0.4609375, | |
| "rewards/int_reward_func/std": 0.0783399622887373, | |
| "rewards/soft_format_reward_func/mean": 0.4609375, | |
| "rewards/soft_format_reward_func/std": 0.09120866656303406, | |
| "rewards/strict_format_reward_func/mean": 0.4453125, | |
| "rewards/strict_format_reward_func/std": 0.13093777745962143, | |
| "rewards/xmlcount_reward_func/mean": 0.4609375, | |
| "rewards/xmlcount_reward_func/std": 0.0871200654655695, | |
| "step": 69 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0078125, | |
| "completions/max_length": 289.0, | |
| "completions/max_terminated_length": 257.0, | |
| "completions/mean_length": 149.90625, | |
| "completions/mean_terminated_length": 147.21406364440918, | |
| "completions/min_length": 84.25, | |
| "completions/min_terminated_length": 84.25, | |
| "entropy": 0.06772361230105162, | |
| "epoch": 0.5995717344753747, | |
| "frac_reward_zero_std": 0.84375, | |
| "grad_norm": 6.15625, | |
| "learning_rate": 8.657667341823449e-06, | |
| "loss": 0.0863, | |
| "num_tokens": 2931599.0, | |
| "reward": 3.6689453125, | |
| "reward_std": 0.20301698334515095, | |
| "rewards/correctness_reward_func/mean": 1.6875, | |
| "rewards/correctness_reward_func/std": 0.5998296737670898, | |
| "rewards/int_reward_func/mean": 0.4921875, | |
| "rewards/int_reward_func/std": 0.03125, | |
| "rewards/soft_format_reward_func/mean": 0.49609375, | |
| "rewards/soft_format_reward_func/std": 0.015625, | |
| "rewards/strict_format_reward_func/mean": 0.49609375, | |
| "rewards/strict_format_reward_func/std": 0.015625, | |
| "rewards/xmlcount_reward_func/mean": 0.4970703125, | |
| "rewards/xmlcount_reward_func/std": 0.01171875, | |
| "step": 70 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 268.0, | |
| "completions/max_terminated_length": 268.0, | |
| "completions/mean_length": 149.4375, | |
| "completions/mean_terminated_length": 149.4375, | |
| "completions/min_length": 71.75, | |
| "completions/min_terminated_length": 71.75, | |
| "entropy": 0.06804852467030287, | |
| "epoch": 0.6081370449678801, | |
| "frac_reward_zero_std": 0.671875, | |
| "grad_norm": 14.75, | |
| "learning_rate": 8.361820885848623e-06, | |
| "loss": -0.1228, | |
| "num_tokens": 2963581.0, | |
| "reward": 3.4091796875, | |
| "reward_std": 0.4819926954805851, | |
| "rewards/correctness_reward_func/mean": 1.46875, | |
| "rewards/correctness_reward_func/std": 0.85780418664217, | |
| "rewards/int_reward_func/mean": 0.48046875, | |
| "rewards/int_reward_func/std": 0.05644455552101135, | |
| "rewards/soft_format_reward_func/mean": 0.48828125, | |
| "rewards/soft_format_reward_func/std": 0.025194555521011353, | |
| "rewards/strict_format_reward_func/mean": 0.484375, | |
| "rewards/strict_format_reward_func/std": 0.04081955552101135, | |
| "rewards/xmlcount_reward_func/mean": 0.4873046875, | |
| "rewards/xmlcount_reward_func/std": 0.029100805521011353, | |
| "step": 71 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0078125, | |
| "completions/max_length": 276.25, | |
| "completions/max_terminated_length": 237.375, | |
| "completions/mean_length": 143.1640625, | |
| "completions/mean_terminated_length": 140.2427101135254, | |
| "completions/min_length": 73.125, | |
| "completions/min_terminated_length": 73.125, | |
| "entropy": 0.06070453533902764, | |
| "epoch": 0.6167023554603854, | |
| "frac_reward_zero_std": 0.78125, | |
| "grad_norm": 8.9375, | |
| "learning_rate": 8.06744082204447e-06, | |
| "loss": 0.1705, | |
| "num_tokens": 2993796.0, | |
| "reward": 3.6064453125, | |
| "reward_std": 0.3135024197399616, | |
| "rewards/correctness_reward_func/mean": 1.640625, | |
| "rewards/correctness_reward_func/std": 0.7274979203939438, | |
| "rewards/int_reward_func/mean": 0.4765625, | |
| "rewards/int_reward_func/std": 0.07394563034176826, | |
| "rewards/soft_format_reward_func/mean": 0.5, | |
| "rewards/soft_format_reward_func/std": 0.0, | |
| "rewards/strict_format_reward_func/mean": 0.4921875, | |
| "rewards/strict_format_reward_func/std": 0.03125, | |
| "rewards/xmlcount_reward_func/mean": 0.4970703125, | |
| "rewards/xmlcount_reward_func/std": 0.01171875, | |
| "step": 72 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.015625, | |
| "completions/max_length": 314.5, | |
| "completions/max_terminated_length": 300.125, | |
| "completions/mean_length": 152.578125, | |
| "completions/mean_terminated_length": 147.3067741394043, | |
| "completions/min_length": 77.5, | |
| "completions/min_terminated_length": 77.5, | |
| "entropy": 0.07308987434953451, | |
| "epoch": 0.6252676659528907, | |
| "frac_reward_zero_std": 0.78125, | |
| "grad_norm": 7.65625, | |
| "learning_rate": 7.774790660436857e-06, | |
| "loss": 0.0711, | |
| "num_tokens": 3025438.0, | |
| "reward": 3.4619140625, | |
| "reward_std": 0.2637839764356613, | |
| "rewards/correctness_reward_func/mean": 1.515625, | |
| "rewards/correctness_reward_func/std": 0.8590980246663094, | |
| "rewards/int_reward_func/mean": 0.47265625, | |
| "rewards/int_reward_func/std": 0.08957063034176826, | |
| "rewards/soft_format_reward_func/mean": 0.4921875, | |
| "rewards/soft_format_reward_func/std": 0.03125, | |
| "rewards/strict_format_reward_func/mean": 0.48828125, | |
| "rewards/strict_format_reward_func/std": 0.03697281517088413, | |
| "rewards/xmlcount_reward_func/mean": 0.4931640625, | |
| "rewards/xmlcount_reward_func/std": 0.023821823298931122, | |
| "step": 73 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0078125, | |
| "completions/max_length": 305.0, | |
| "completions/max_terminated_length": 273.75, | |
| "completions/mean_length": 165.515625, | |
| "completions/mean_terminated_length": 162.84323120117188, | |
| "completions/min_length": 89.625, | |
| "completions/min_terminated_length": 89.625, | |
| "entropy": 0.06814676942303777, | |
| "epoch": 0.6338329764453962, | |
| "frac_reward_zero_std": 0.8125, | |
| "grad_norm": 6.625, | |
| "learning_rate": 7.484132362554915e-06, | |
| "loss": 0.163, | |
| "num_tokens": 3059668.0, | |
| "reward": 3.490234375, | |
| "reward_std": 0.22097087278962135, | |
| "rewards/correctness_reward_func/mean": 1.53125, | |
| "rewards/correctness_reward_func/std": 0.8542027324438095, | |
| "rewards/int_reward_func/mean": 0.484375, | |
| "rewards/int_reward_func/std": 0.0625, | |
| "rewards/soft_format_reward_func/mean": 0.5, | |
| "rewards/soft_format_reward_func/std": 0.0, | |
| "rewards/strict_format_reward_func/mean": 0.48046875, | |
| "rewards/strict_format_reward_func/std": 0.06822281517088413, | |
| "rewards/xmlcount_reward_func/mean": 0.494140625, | |
| "rewards/xmlcount_reward_func/std": 0.020961953792721033, | |
| "step": 74 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.015625, | |
| "completions/max_length": 289.625, | |
| "completions/max_terminated_length": 228.75, | |
| "completions/mean_length": 145.8671875, | |
| "completions/mean_terminated_length": 140.4114589691162, | |
| "completions/min_length": 80.75, | |
| "completions/min_terminated_length": 80.75, | |
| "entropy": 0.06801354000344872, | |
| "epoch": 0.6423982869379015, | |
| "frac_reward_zero_std": 0.8125, | |
| "grad_norm": 24.875, | |
| "learning_rate": 7.1957261069399745e-06, | |
| "loss": 0.1365, | |
| "num_tokens": 3089965.0, | |
| "reward": 3.5849609375, | |
| "reward_std": 0.26378397084772587, | |
| "rewards/correctness_reward_func/mean": 1.671875, | |
| "rewards/correctness_reward_func/std": 0.7019384130835533, | |
| "rewards/int_reward_func/mean": 0.47265625, | |
| "rewards/int_reward_func/std": 0.08769455552101135, | |
| "rewards/soft_format_reward_func/mean": 0.48828125, | |
| "rewards/soft_format_reward_func/std": 0.025194555521011353, | |
| "rewards/strict_format_reward_func/mean": 0.47265625, | |
| "rewards/strict_format_reward_func/std": 0.06492366641759872, | |
| "rewards/xmlcount_reward_func/mean": 0.4794921875, | |
| "rewards/xmlcount_reward_func/std": 0.05274027772247791, | |
| "step": 75 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 276.625, | |
| "completions/max_terminated_length": 276.625, | |
| "completions/mean_length": 151.796875, | |
| "completions/mean_terminated_length": 151.796875, | |
| "completions/min_length": 80.125, | |
| "completions/min_terminated_length": 80.125, | |
| "entropy": 0.06695270165801048, | |
| "epoch": 0.6509635974304069, | |
| "frac_reward_zero_std": 0.90625, | |
| "grad_norm": 4.125, | |
| "learning_rate": 6.909830056250527e-06, | |
| "loss": -0.0215, | |
| "num_tokens": 3121975.0, | |
| "reward": 3.591796875, | |
| "reward_std": 0.12153397500514984, | |
| "rewards/correctness_reward_func/mean": 1.609375, | |
| "rewards/correctness_reward_func/std": 0.6680332496762276, | |
| "rewards/int_reward_func/mean": 0.4921875, | |
| "rewards/int_reward_func/std": 0.03125, | |
| "rewards/soft_format_reward_func/mean": 0.5, | |
| "rewards/soft_format_reward_func/std": 0.0, | |
| "rewards/strict_format_reward_func/mean": 0.4921875, | |
| "rewards/strict_format_reward_func/std": 0.021347815170884132, | |
| "rewards/xmlcount_reward_func/mean": 0.498046875, | |
| "rewards/xmlcount_reward_func/std": 0.005336953792721033, | |
| "step": 76 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.015625, | |
| "completions/max_length": 334.375, | |
| "completions/max_terminated_length": 308.625, | |
| "completions/mean_length": 162.4765625, | |
| "completions/mean_terminated_length": 157.2375030517578, | |
| "completions/min_length": 79.25, | |
| "completions/min_terminated_length": 79.25, | |
| "entropy": 0.06329814763739705, | |
| "epoch": 0.6595289079229122, | |
| "frac_reward_zero_std": 0.796875, | |
| "grad_norm": 19.25, | |
| "learning_rate": 6.6267001261717015e-06, | |
| "loss": 0.1003, | |
| "num_tokens": 3154864.0, | |
| "reward": 3.5283203125, | |
| "reward_std": 0.3135024178773165, | |
| "rewards/correctness_reward_func/mean": 1.59375, | |
| "rewards/correctness_reward_func/std": 0.6435378566384315, | |
| "rewards/int_reward_func/mean": 0.48046875, | |
| "rewards/int_reward_func/std": 0.05644455552101135, | |
| "rewards/soft_format_reward_func/mean": 0.48828125, | |
| "rewards/soft_format_reward_func/std": 0.025194555521011353, | |
| "rewards/strict_format_reward_func/mean": 0.48046875, | |
| "rewards/strict_format_reward_func/std": 0.05644455552101135, | |
| "rewards/xmlcount_reward_func/mean": 0.4853515625, | |
| "rewards/xmlcount_reward_func/std": 0.03691330552101135, | |
| "step": 77 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.015625, | |
| "completions/max_length": 296.875, | |
| "completions/max_terminated_length": 238.875, | |
| "completions/mean_length": 151.8203125, | |
| "completions/mean_terminated_length": 146.38125228881836, | |
| "completions/min_length": 76.5, | |
| "completions/min_terminated_length": 76.5, | |
| "entropy": 0.05917734792456031, | |
| "epoch": 0.6680942184154176, | |
| "frac_reward_zero_std": 0.796875, | |
| "grad_norm": 10.8125, | |
| "learning_rate": 6.34658975633605e-06, | |
| "loss": 0.1964, | |
| "num_tokens": 3186189.0, | |
| "reward": 3.5595703125, | |
| "reward_std": 0.22511406615376472, | |
| "rewards/correctness_reward_func/mean": 1.609375, | |
| "rewards/correctness_reward_func/std": 0.8085274025797844, | |
| "rewards/int_reward_func/mean": 0.48828125, | |
| "rewards/int_reward_func/std": 0.046875, | |
| "rewards/soft_format_reward_func/mean": 0.4921875, | |
| "rewards/soft_format_reward_func/std": 0.03125, | |
| "rewards/strict_format_reward_func/mean": 0.48046875, | |
| "rewards/strict_format_reward_func/std": 0.078125, | |
| "rewards/xmlcount_reward_func/mean": 0.4892578125, | |
| "rewards/xmlcount_reward_func/std": 0.04296875, | |
| "step": 78 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0078125, | |
| "completions/max_length": 340.0, | |
| "completions/max_terminated_length": 328.75, | |
| "completions/mean_length": 157.328125, | |
| "completions/mean_terminated_length": 154.62604331970215, | |
| "completions/min_length": 82.5, | |
| "completions/min_terminated_length": 82.5, | |
| "entropy": 0.07690681796520948, | |
| "epoch": 0.6766595289079229, | |
| "frac_reward_zero_std": 0.875, | |
| "grad_norm": 38.0, | |
| "learning_rate": 6.069749683460765e-06, | |
| "loss": 0.0732, | |
| "num_tokens": 3218069.0, | |
| "reward": 3.439453125, | |
| "reward_std": 0.21820873208343983, | |
| "rewards/correctness_reward_func/mean": 1.5625, | |
| "rewards/correctness_reward_func/std": 0.8027089610695839, | |
| "rewards/int_reward_func/mean": 0.46484375, | |
| "rewards/int_reward_func/std": 0.07073915377259254, | |
| "rewards/soft_format_reward_func/mean": 0.47265625, | |
| "rewards/soft_format_reward_func/std": 0.0660141110420227, | |
| "rewards/strict_format_reward_func/mean": 0.46875, | |
| "rewards/strict_format_reward_func/std": 0.07173692621290684, | |
| "rewards/xmlcount_reward_func/mean": 0.470703125, | |
| "rewards/xmlcount_reward_func/std": 0.06738616153597832, | |
| "step": 79 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0234375, | |
| "completions/max_length": 296.25, | |
| "completions/max_terminated_length": 270.25, | |
| "completions/mean_length": 157.2421875, | |
| "completions/mean_terminated_length": 148.20973587036133, | |
| "completions/min_length": 79.875, | |
| "completions/min_terminated_length": 79.875, | |
| "entropy": 0.06810100981965661, | |
| "epoch": 0.6852248394004282, | |
| "frac_reward_zero_std": 0.859375, | |
| "grad_norm": 13.0, | |
| "learning_rate": 5.796427716904347e-06, | |
| "loss": -0.0428, | |
| "num_tokens": 3250002.0, | |
| "reward": 3.4755859375, | |
| "reward_std": 0.18920630402863026, | |
| "rewards/correctness_reward_func/mean": 1.546875, | |
| "rewards/correctness_reward_func/std": 0.8281612768769264, | |
| "rewards/int_reward_func/mean": 0.46875, | |
| "rewards/int_reward_func/std": 0.08351518586277962, | |
| "rewards/soft_format_reward_func/mean": 0.48828125, | |
| "rewards/soft_format_reward_func/std": 0.025194555521011353, | |
| "rewards/strict_format_reward_func/mean": 0.484375, | |
| "rewards/strict_format_reward_func/std": 0.04081955552101135, | |
| "rewards/xmlcount_reward_func/mean": 0.4873046875, | |
| "rewards/xmlcount_reward_func/std": 0.029100805521011353, | |
| "step": 80 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0234375, | |
| "completions/max_length": 295.0, | |
| "completions/max_terminated_length": 241.75, | |
| "completions/mean_length": 148.1953125, | |
| "completions/mean_terminated_length": 139.70900535583496, | |
| "completions/min_length": 78.5, | |
| "completions/min_terminated_length": 78.5, | |
| "entropy": 0.06864482956007123, | |
| "epoch": 0.6937901498929336, | |
| "frac_reward_zero_std": 0.8125, | |
| "grad_norm": 9.4375, | |
| "learning_rate": 5.526868516843673e-06, | |
| "loss": 0.3268, | |
| "num_tokens": 3280593.0, | |
| "reward": 3.451171875, | |
| "reward_std": 0.2458300832659006, | |
| "rewards/correctness_reward_func/mean": 1.484375, | |
| "rewards/correctness_reward_func/std": 0.8385421559214592, | |
| "rewards/int_reward_func/mean": 0.4921875, | |
| "rewards/int_reward_func/std": 0.021347815170884132, | |
| "rewards/soft_format_reward_func/mean": 0.49609375, | |
| "rewards/soft_format_reward_func/std": 0.015625, | |
| "rewards/strict_format_reward_func/mean": 0.48828125, | |
| "rewards/strict_format_reward_func/std": 0.03697281517088413, | |
| "rewards/xmlcount_reward_func/mean": 0.490234375, | |
| "rewards/xmlcount_reward_func/std": 0.030614666640758514, | |
| "step": 81 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0390625, | |
| "completions/max_length": 376.625, | |
| "completions/max_terminated_length": 287.0, | |
| "completions/mean_length": 170.5625, | |
| "completions/mean_terminated_length": 156.5812530517578, | |
| "completions/min_length": 78.375, | |
| "completions/min_terminated_length": 78.375, | |
| "entropy": 0.05719508696347475, | |
| "epoch": 0.702355460385439, | |
| "frac_reward_zero_std": 0.796875, | |
| "grad_norm": 21.75, | |
| "learning_rate": 5.2613133752700145e-06, | |
| "loss": 0.3157, | |
| "num_tokens": 3314333.0, | |
| "reward": 3.5078125, | |
| "reward_std": 0.27345145121216774, | |
| "rewards/correctness_reward_func/mean": 1.609375, | |
| "rewards/correctness_reward_func/std": 0.6161131635308266, | |
| "rewards/int_reward_func/mean": 0.46484375, | |
| "rewards/int_reward_func/std": 0.08923800103366375, | |
| "rewards/soft_format_reward_func/mean": 0.48046875, | |
| "rewards/soft_format_reward_func/std": 0.05644455552101135, | |
| "rewards/strict_format_reward_func/mean": 0.47265625, | |
| "rewards/strict_format_reward_func/std": 0.06789018586277962, | |
| "rewards/xmlcount_reward_func/mean": 0.48046875, | |
| "rewards/xmlcount_reward_func/std": 0.049400702118873596, | |
| "step": 82 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0234375, | |
| "completions/max_length": 353.0, | |
| "completions/max_terminated_length": 253.125, | |
| "completions/mean_length": 157.8515625, | |
| "completions/mean_terminated_length": 149.17083740234375, | |
| "completions/min_length": 74.875, | |
| "completions/min_terminated_length": 74.875, | |
| "entropy": 0.06622256385162473, | |
| "epoch": 0.7109207708779444, | |
| "frac_reward_zero_std": 0.8125, | |
| "grad_norm": 15.875, | |
| "learning_rate": 5.000000000000003e-06, | |
| "loss": 0.292, | |
| "num_tokens": 3346592.0, | |
| "reward": 3.5224609375, | |
| "reward_std": 0.3217888306826353, | |
| "rewards/correctness_reward_func/mean": 1.625, | |
| "rewards/correctness_reward_func/std": 0.654181070625782, | |
| "rewards/int_reward_func/mean": 0.48046875, | |
| "rewards/int_reward_func/std": 0.05644455552101135, | |
| "rewards/soft_format_reward_func/mean": 0.4765625, | |
| "rewards/soft_format_reward_func/std": 0.05920085124671459, | |
| "rewards/strict_format_reward_func/mean": 0.46484375, | |
| "rewards/strict_format_reward_func/std": 0.10607585124671459, | |
| "rewards/xmlcount_reward_func/mean": 0.4755859375, | |
| "rewards/xmlcount_reward_func/std": 0.06840440817177296, | |
| "step": 83 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0546875, | |
| "completions/max_length": 409.75, | |
| "completions/max_terminated_length": 250.875, | |
| "completions/mean_length": 167.9375, | |
| "completions/mean_terminated_length": 148.38951301574707, | |
| "completions/min_length": 73.125, | |
| "completions/min_terminated_length": 73.125, | |
| "entropy": 0.05832461267709732, | |
| "epoch": 0.7194860813704497, | |
| "frac_reward_zero_std": 0.828125, | |
| "grad_norm": 9.875, | |
| "learning_rate": 4.743162301894952e-06, | |
| "loss": 0.4018, | |
| "num_tokens": 3380156.0, | |
| "reward": 3.4765625, | |
| "reward_std": 0.23201941419392824, | |
| "rewards/correctness_reward_func/mean": 1.5625, | |
| "rewards/correctness_reward_func/std": 0.8384527564048767, | |
| "rewards/int_reward_func/mean": 0.4765625, | |
| "rewards/int_reward_func/std": 0.07394563034176826, | |
| "rewards/soft_format_reward_func/mean": 0.484375, | |
| "rewards/soft_format_reward_func/std": 0.05259781517088413, | |
| "rewards/strict_format_reward_func/mean": 0.46875, | |
| "rewards/strict_format_reward_func/std": 0.0952934455126524, | |
| "rewards/xmlcount_reward_func/mean": 0.484375, | |
| "rewards/xmlcount_reward_func/std": 0.05507336184382439, | |
| "step": 84 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.015625, | |
| "completions/max_length": 318.375, | |
| "completions/max_terminated_length": 290.375, | |
| "completions/mean_length": 158.484375, | |
| "completions/mean_terminated_length": 152.8973217010498, | |
| "completions/min_length": 78.0, | |
| "completions/min_terminated_length": 78.0, | |
| "entropy": 0.07197121158242226, | |
| "epoch": 0.728051391862955, | |
| "frac_reward_zero_std": 0.78125, | |
| "grad_norm": 6.625, | |
| "learning_rate": 4.491030185478976e-06, | |
| "loss": 0.2278, | |
| "num_tokens": 3413046.0, | |
| "reward": 3.6376953125, | |
| "reward_std": 0.2803567871451378, | |
| "rewards/correctness_reward_func/mean": 1.671875, | |
| "rewards/correctness_reward_func/std": 0.6848579198122025, | |
| "rewards/int_reward_func/mean": 0.484375, | |
| "rewards/int_reward_func/std": 0.04081955552101135, | |
| "rewards/soft_format_reward_func/mean": 0.5, | |
| "rewards/soft_format_reward_func/std": 0.0, | |
| "rewards/strict_format_reward_func/mean": 0.484375, | |
| "rewards/strict_format_reward_func/std": 0.04081955552101135, | |
| "rewards/xmlcount_reward_func/mean": 0.4970703125, | |
| "rewards/xmlcount_reward_func/std": 0.01171875, | |
| "step": 85 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0390625, | |
| "completions/max_length": 370.25, | |
| "completions/max_terminated_length": 291.125, | |
| "completions/mean_length": 175.3828125, | |
| "completions/mean_terminated_length": 161.8246307373047, | |
| "completions/min_length": 83.75, | |
| "completions/min_terminated_length": 83.75, | |
| "entropy": 0.06557085691019893, | |
| "epoch": 0.7366167023554604, | |
| "frac_reward_zero_std": 0.734375, | |
| "grad_norm": 13.4375, | |
| "learning_rate": 4.2438293431432665e-06, | |
| "loss": 0.3556, | |
| "num_tokens": 3447325.0, | |
| "reward": 3.4638671875, | |
| "reward_std": 0.30521600786596537, | |
| "rewards/correctness_reward_func/mean": 1.5625, | |
| "rewards/correctness_reward_func/std": 0.7508078292012215, | |
| "rewards/int_reward_func/mean": 0.46875, | |
| "rewards/int_reward_func/std": 0.06689241342246532, | |
| "rewards/soft_format_reward_func/mean": 0.484375, | |
| "rewards/soft_format_reward_func/std": 0.04081955552101135, | |
| "rewards/strict_format_reward_func/mean": 0.46484375, | |
| "rewards/strict_format_reward_func/std": 0.08627148158848286, | |
| "rewards/xmlcount_reward_func/mean": 0.4833984375, | |
| "rewards/xmlcount_reward_func/std": 0.03715440817177296, | |
| "step": 86 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0390625, | |
| "completions/max_length": 423.75, | |
| "completions/max_terminated_length": 293.125, | |
| "completions/mean_length": 168.0390625, | |
| "completions/mean_terminated_length": 154.1963596343994, | |
| "completions/min_length": 77.625, | |
| "completions/min_terminated_length": 77.625, | |
| "entropy": 0.06665381882339716, | |
| "epoch": 0.7451820128479657, | |
| "frac_reward_zero_std": 0.796875, | |
| "grad_norm": 8.625, | |
| "learning_rate": 4.001781053120863e-06, | |
| "loss": 0.2807, | |
| "num_tokens": 3481046.0, | |
| "reward": 3.435546875, | |
| "reward_std": 0.2762135900557041, | |
| "rewards/correctness_reward_func/mean": 1.515625, | |
| "rewards/correctness_reward_func/std": 0.7195080667734146, | |
| "rewards/int_reward_func/mean": 0.4609375, | |
| "rewards/int_reward_func/std": 0.09011822193861008, | |
| "rewards/soft_format_reward_func/mean": 0.48828125, | |
| "rewards/soft_format_reward_func/std": 0.046875, | |
| "rewards/strict_format_reward_func/mean": 0.48046875, | |
| "rewards/strict_format_reward_func/std": 0.078125, | |
| "rewards/xmlcount_reward_func/mean": 0.490234375, | |
| "rewards/xmlcount_reward_func/std": 0.0390625, | |
| "step": 87 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 390.75, | |
| "completions/max_terminated_length": 273.625, | |
| "completions/mean_length": 171.125, | |
| "completions/mean_terminated_length": 160.2234401702881, | |
| "completions/min_length": 88.25, | |
| "completions/min_terminated_length": 88.25, | |
| "entropy": 0.060074358712881804, | |
| "epoch": 0.7537473233404711, | |
| "frac_reward_zero_std": 0.796875, | |
| "grad_norm": 6.71875, | |
| "learning_rate": 3.7651019814126656e-06, | |
| "loss": 0.3052, | |
| "num_tokens": 3515172.0, | |
| "reward": 3.5400390625, | |
| "reward_std": 0.2748325187712908, | |
| "rewards/correctness_reward_func/mean": 1.59375, | |
| "rewards/correctness_reward_func/std": 0.7068260312080383, | |
| "rewards/int_reward_func/mean": 0.4765625, | |
| "rewards/int_reward_func/std": 0.062167370691895485, | |
| "rewards/soft_format_reward_func/mean": 0.49609375, | |
| "rewards/soft_format_reward_func/std": 0.015625, | |
| "rewards/strict_format_reward_func/mean": 0.48046875, | |
| "rewards/strict_format_reward_func/std": 0.078125, | |
| "rewards/xmlcount_reward_func/mean": 0.4931640625, | |
| "rewards/xmlcount_reward_func/std": 0.02734375, | |
| "step": 88 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 339.25, | |
| "completions/max_terminated_length": 269.0, | |
| "completions/mean_length": 155.5546875, | |
| "completions/mean_terminated_length": 143.96582794189453, | |
| "completions/min_length": 78.0, | |
| "completions/min_terminated_length": 78.0, | |
| "entropy": 0.06739555345848203, | |
| "epoch": 0.7623126338329764, | |
| "frac_reward_zero_std": 0.84375, | |
| "grad_norm": 7.3125, | |
| "learning_rate": 3.534003987842005e-06, | |
| "loss": 0.0879, | |
| "num_tokens": 3546821.0, | |
| "reward": 3.49609375, | |
| "reward_std": 0.14915533305611461, | |
| "rewards/correctness_reward_func/mean": 1.578125, | |
| "rewards/correctness_reward_func/std": 0.8041466698050499, | |
| "rewards/int_reward_func/mean": 0.4609375, | |
| "rewards/int_reward_func/std": 0.11476518586277962, | |
| "rewards/soft_format_reward_func/mean": 0.48828125, | |
| "rewards/soft_format_reward_func/std": 0.03697281517088413, | |
| "rewards/strict_format_reward_func/mean": 0.48046875, | |
| "rewards/strict_format_reward_func/std": 0.05644455552101135, | |
| "rewards/xmlcount_reward_func/mean": 0.48828125, | |
| "rewards/xmlcount_reward_func/std": 0.03697281517088413, | |
| "step": 89 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0390625, | |
| "completions/max_length": 367.0, | |
| "completions/max_terminated_length": 264.625, | |
| "completions/mean_length": 164.6796875, | |
| "completions/mean_terminated_length": 150.61161041259766, | |
| "completions/min_length": 72.5, | |
| "completions/min_terminated_length": 72.5, | |
| "entropy": 0.06468326412141323, | |
| "epoch": 0.7708779443254818, | |
| "frac_reward_zero_std": 0.78125, | |
| "grad_norm": 17.75, | |
| "learning_rate": 3.308693936411421e-06, | |
| "loss": 0.1794, | |
| "num_tokens": 3580028.0, | |
| "reward": 3.33203125, | |
| "reward_std": 0.27621358446776867, | |
| "rewards/correctness_reward_func/mean": 1.453125, | |
| "rewards/correctness_reward_func/std": 0.8369380235671997, | |
| "rewards/int_reward_func/mean": 0.45703125, | |
| "rewards/int_reward_func/std": 0.11861192621290684, | |
| "rewards/soft_format_reward_func/mean": 0.484375, | |
| "rewards/soft_format_reward_func/std": 0.04081955552101135, | |
| "rewards/strict_format_reward_func/mean": 0.4609375, | |
| "rewards/strict_format_reward_func/std": 0.10298692621290684, | |
| "rewards/xmlcount_reward_func/mean": 0.4765625, | |
| "rewards/xmlcount_reward_func/std": 0.061410133726894855, | |
| "step": 90 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0234375, | |
| "completions/max_length": 376.125, | |
| "completions/max_terminated_length": 301.625, | |
| "completions/mean_length": 162.484375, | |
| "completions/mean_terminated_length": 153.9395866394043, | |
| "completions/min_length": 76.375, | |
| "completions/min_terminated_length": 76.375, | |
| "entropy": 0.062019561883062124, | |
| "epoch": 0.7794432548179872, | |
| "frac_reward_zero_std": 0.765625, | |
| "grad_norm": 9.0, | |
| "learning_rate": 3.089373510131354e-06, | |
| "loss": 0.2742, | |
| "num_tokens": 3613084.0, | |
| "reward": 3.546875, | |
| "reward_std": 0.2872621323913336, | |
| "rewards/correctness_reward_func/mean": 1.578125, | |
| "rewards/correctness_reward_func/std": 0.8067077249288559, | |
| "rewards/int_reward_func/mean": 0.4921875, | |
| "rewards/int_reward_func/std": 0.03125, | |
| "rewards/soft_format_reward_func/mean": 0.49609375, | |
| "rewards/soft_format_reward_func/std": 0.015625, | |
| "rewards/strict_format_reward_func/mean": 0.484375, | |
| "rewards/strict_format_reward_func/std": 0.05259781517088413, | |
| "rewards/xmlcount_reward_func/mean": 0.49609375, | |
| "rewards/xmlcount_reward_func/std": 0.013149453792721033, | |
| "step": 91 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0546875, | |
| "completions/max_length": 385.625, | |
| "completions/max_terminated_length": 279.125, | |
| "completions/mean_length": 182.7734375, | |
| "completions/mean_terminated_length": 163.5002956390381, | |
| "completions/min_length": 93.0, | |
| "completions/min_terminated_length": 93.0, | |
| "entropy": 0.060410378966480494, | |
| "epoch": 0.7880085653104925, | |
| "frac_reward_zero_std": 0.796875, | |
| "grad_norm": 15.125, | |
| "learning_rate": 2.876239030486554e-06, | |
| "loss": 0.2067, | |
| "num_tokens": 3649339.0, | |
| "reward": 3.423828125, | |
| "reward_std": 0.31488349102437496, | |
| "rewards/correctness_reward_func/mean": 1.53125, | |
| "rewards/correctness_reward_func/std": 0.8038287088274956, | |
| "rewards/int_reward_func/mean": 0.46484375, | |
| "rewards/int_reward_func/std": 0.10904237069189548, | |
| "rewards/soft_format_reward_func/mean": 0.4765625, | |
| "rewards/soft_format_reward_func/std": 0.07206955552101135, | |
| "rewards/strict_format_reward_func/mean": 0.47265625, | |
| "rewards/strict_format_reward_func/std": 0.07779237069189548, | |
| "rewards/xmlcount_reward_func/mean": 0.478515625, | |
| "rewards/xmlcount_reward_func/std": 0.060735128819942474, | |
| "step": 92 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0390625, | |
| "completions/max_length": 425.5, | |
| "completions/max_terminated_length": 323.0, | |
| "completions/mean_length": 188.359375, | |
| "completions/mean_terminated_length": 175.17136001586914, | |
| "completions/min_length": 89.5, | |
| "completions/min_terminated_length": 89.5, | |
| "entropy": 0.0746710579842329, | |
| "epoch": 0.7965738758029979, | |
| "frac_reward_zero_std": 0.78125, | |
| "grad_norm": 8.125, | |
| "learning_rate": 2.669481281701739e-06, | |
| "loss": 0.3599, | |
| "num_tokens": 3686281.0, | |
| "reward": 3.3671875, | |
| "reward_std": 0.3425048552453518, | |
| "rewards/correctness_reward_func/mean": 1.453125, | |
| "rewards/correctness_reward_func/std": 0.8768203780055046, | |
| "rewards/int_reward_func/mean": 0.46484375, | |
| "rewards/int_reward_func/std": 0.10904237069189548, | |
| "rewards/soft_format_reward_func/mean": 0.484375, | |
| "rewards/soft_format_reward_func/std": 0.0625, | |
| "rewards/strict_format_reward_func/mean": 0.4765625, | |
| "rewards/strict_format_reward_func/std": 0.08384781517088413, | |
| "rewards/xmlcount_reward_func/mean": 0.48828125, | |
| "rewards/xmlcount_reward_func/std": 0.04335307329893112, | |
| "step": 93 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0078125, | |
| "completions/max_length": 304.0, | |
| "completions/max_terminated_length": 273.125, | |
| "completions/mean_length": 151.4765625, | |
| "completions/mean_terminated_length": 148.6171875, | |
| "completions/min_length": 74.375, | |
| "completions/min_terminated_length": 74.375, | |
| "entropy": 0.06636462640017271, | |
| "epoch": 0.8051391862955032, | |
| "frac_reward_zero_std": 0.78125, | |
| "grad_norm": 6.34375, | |
| "learning_rate": 2.469285339963892e-06, | |
| "loss": 0.0483, | |
| "num_tokens": 3717830.0, | |
| "reward": 3.517578125, | |
| "reward_std": 0.32869415916502476, | |
| "rewards/correctness_reward_func/mean": 1.53125, | |
| "rewards/correctness_reward_func/std": 0.809794619679451, | |
| "rewards/int_reward_func/mean": 0.49609375, | |
| "rewards/int_reward_func/std": 0.015625, | |
| "rewards/soft_format_reward_func/mean": 0.49609375, | |
| "rewards/soft_format_reward_func/std": 0.015625, | |
| "rewards/strict_format_reward_func/mean": 0.49609375, | |
| "rewards/strict_format_reward_func/std": 0.015625, | |
| "rewards/xmlcount_reward_func/mean": 0.498046875, | |
| "rewards/xmlcount_reward_func/std": 0.0078125, | |
| "step": 94 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.046875, | |
| "completions/max_length": 319.125, | |
| "completions/max_terminated_length": 247.375, | |
| "completions/mean_length": 158.1015625, | |
| "completions/mean_terminated_length": 141.4641580581665, | |
| "completions/min_length": 79.625, | |
| "completions/min_terminated_length": 79.625, | |
| "entropy": 0.06498824106529355, | |
| "epoch": 0.8137044967880086, | |
| "frac_reward_zero_std": 0.828125, | |
| "grad_norm": 15.0625, | |
| "learning_rate": 2.275830407754006e-06, | |
| "loss": 0.2777, | |
| "num_tokens": 3750065.0, | |
| "reward": 3.41796875, | |
| "reward_std": 0.2485922183841467, | |
| "rewards/correctness_reward_func/mean": 1.5, | |
| "rewards/correctness_reward_func/std": 0.842106930911541, | |
| "rewards/int_reward_func/mean": 0.4765625, | |
| "rewards/int_reward_func/std": 0.04554459825158119, | |
| "rewards/soft_format_reward_func/mean": 0.48046875, | |
| "rewards/soft_format_reward_func/std": 0.04357585124671459, | |
| "rewards/strict_format_reward_func/mean": 0.4765625, | |
| "rewards/strict_format_reward_func/std": 0.04554459825158119, | |
| "rewards/xmlcount_reward_func/mean": 0.484375, | |
| "rewards/xmlcount_reward_func/std": 0.03324815817177296, | |
| "step": 95 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0234375, | |
| "completions/max_length": 324.25, | |
| "completions/max_terminated_length": 257.375, | |
| "completions/mean_length": 161.9140625, | |
| "completions/mean_terminated_length": 153.45208549499512, | |
| "completions/min_length": 83.75, | |
| "completions/min_terminated_length": 83.75, | |
| "entropy": 0.06923280376940966, | |
| "epoch": 0.8222698072805139, | |
| "frac_reward_zero_std": 0.84375, | |
| "grad_norm": 7.21875, | |
| "learning_rate": 2.08928965343659e-06, | |
| "loss": 0.0191, | |
| "num_tokens": 3783210.0, | |
| "reward": 3.6064453125, | |
| "reward_std": 0.2085412573069334, | |
| "rewards/correctness_reward_func/mean": 1.65625, | |
| "rewards/correctness_reward_func/std": 0.7442077249288559, | |
| "rewards/int_reward_func/mean": 0.48046875, | |
| "rewards/int_reward_func/std": 0.06822281517088413, | |
| "rewards/soft_format_reward_func/mean": 0.48828125, | |
| "rewards/soft_format_reward_func/std": 0.03697281517088413, | |
| "rewards/strict_format_reward_func/mean": 0.48828125, | |
| "rewards/strict_format_reward_func/std": 0.03697281517088413, | |
| "rewards/xmlcount_reward_func/mean": 0.4931640625, | |
| "rewards/xmlcount_reward_func/std": 0.021456445567309856, | |
| "step": 96 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.015625, | |
| "completions/max_length": 294.0, | |
| "completions/max_terminated_length": 224.5, | |
| "completions/mean_length": 143.0703125, | |
| "completions/mean_terminated_length": 137.4109401702881, | |
| "completions/min_length": 66.125, | |
| "completions/min_terminated_length": 66.125, | |
| "entropy": 0.06562586454674602, | |
| "epoch": 0.8308351177730193, | |
| "frac_reward_zero_std": 0.90625, | |
| "grad_norm": 4.4375, | |
| "learning_rate": 1.9098300562505266e-06, | |
| "loss": -0.0738, | |
| "num_tokens": 3813537.0, | |
| "reward": 3.6640625, | |
| "reward_std": 0.14086892642080784, | |
| "rewards/correctness_reward_func/mean": 1.734375, | |
| "rewards/correctness_reward_func/std": 0.5065634250640869, | |
| "rewards/int_reward_func/mean": 0.48046875, | |
| "rewards/int_reward_func/std": 0.046542370691895485, | |
| "rewards/soft_format_reward_func/mean": 0.484375, | |
| "rewards/soft_format_reward_func/std": 0.04081955552101135, | |
| "rewards/strict_format_reward_func/mean": 0.48046875, | |
| "rewards/strict_format_reward_func/std": 0.05644455552101135, | |
| "rewards/xmlcount_reward_func/mean": 0.484375, | |
| "rewards/xmlcount_reward_func/std": 0.04081955552101135, | |
| "step": 97 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 262.75, | |
| "completions/max_terminated_length": 262.75, | |
| "completions/mean_length": 146.15625, | |
| "completions/mean_terminated_length": 146.15625, | |
| "completions/min_length": 77.625, | |
| "completions/min_terminated_length": 77.625, | |
| "entropy": 0.05933028785511851, | |
| "epoch": 0.8394004282655246, | |
| "frac_reward_zero_std": 0.828125, | |
| "grad_norm": 8.5, | |
| "learning_rate": 1.7376122568400533e-06, | |
| "loss": -0.0062, | |
| "num_tokens": 3844527.0, | |
| "reward": 3.7138671875, | |
| "reward_std": 0.21682766266167164, | |
| "rewards/correctness_reward_func/mean": 1.734375, | |
| "rewards/correctness_reward_func/std": 0.5849205926060677, | |
| "rewards/int_reward_func/mean": 0.484375, | |
| "rewards/int_reward_func/std": 0.05259781517088413, | |
| "rewards/soft_format_reward_func/mean": 0.5, | |
| "rewards/soft_format_reward_func/std": 0.0, | |
| "rewards/strict_format_reward_func/mean": 0.49609375, | |
| "rewards/strict_format_reward_func/std": 0.015625, | |
| "rewards/xmlcount_reward_func/mean": 0.4990234375, | |
| "rewards/xmlcount_reward_func/std": 0.00390625, | |
| "step": 98 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0078125, | |
| "completions/max_length": 298.125, | |
| "completions/max_terminated_length": 269.0, | |
| "completions/mean_length": 152.359375, | |
| "completions/mean_terminated_length": 149.32135486602783, | |
| "completions/min_length": 76.125, | |
| "completions/min_terminated_length": 76.125, | |
| "entropy": 0.07287971116602421, | |
| "epoch": 0.8479657387580299, | |
| "frac_reward_zero_std": 0.78125, | |
| "grad_norm": 35.5, | |
| "learning_rate": 1.5727904134596084e-06, | |
| "loss": 0.0803, | |
| "num_tokens": 3875861.0, | |
| "reward": 3.2978515625, | |
| "reward_std": 0.3632208569906652, | |
| "rewards/correctness_reward_func/mean": 1.421875, | |
| "rewards/correctness_reward_func/std": 0.9180332496762276, | |
| "rewards/int_reward_func/mean": 0.4609375, | |
| "rewards/int_reward_func/std": 0.10298692621290684, | |
| "rewards/soft_format_reward_func/mean": 0.47265625, | |
| "rewards/soft_format_reward_func/std": 0.0660141110420227, | |
| "rewards/strict_format_reward_func/mean": 0.47265625, | |
| "rewards/strict_format_reward_func/std": 0.0660141110420227, | |
| "rewards/xmlcount_reward_func/mean": 0.4697265625, | |
| "rewards/xmlcount_reward_func/std": 0.06928502768278122, | |
| "step": 99 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0625, | |
| "completions/max_length": 399.5, | |
| "completions/max_terminated_length": 313.125, | |
| "completions/mean_length": 182.84375, | |
| "completions/mean_terminated_length": 160.92637634277344, | |
| "completions/min_length": 84.375, | |
| "completions/min_terminated_length": 84.375, | |
| "entropy": 0.07499845931306481, | |
| "epoch": 0.8565310492505354, | |
| "frac_reward_zero_std": 0.734375, | |
| "grad_norm": 15.375, | |
| "learning_rate": 1.4155120639813392e-06, | |
| "loss": 0.1946, | |
| "num_tokens": 3911453.0, | |
| "reward": 3.2890625, | |
| "reward_std": 0.40603396110236645, | |
| "rewards/correctness_reward_func/mean": 1.421875, | |
| "rewards/correctness_reward_func/std": 0.9128188416361809, | |
| "rewards/int_reward_func/mean": 0.44921875, | |
| "rewards/int_reward_func/std": 0.10904237069189548, | |
| "rewards/soft_format_reward_func/mean": 0.47265625, | |
| "rewards/soft_format_reward_func/std": 0.06116959825158119, | |
| "rewards/strict_format_reward_func/mean": 0.46875, | |
| "rewards/strict_format_reward_func/std": 0.07679459825158119, | |
| "rewards/xmlcount_reward_func/mean": 0.4765625, | |
| "rewards/xmlcount_reward_func/std": 0.05072539113461971, | |
| "step": 100 | |
| } | |
| ], | |
| "logging_steps": 1, | |
| "max_steps": 117, | |
| "num_input_tokens_seen": 3911453, | |
| "num_train_epochs": 1, | |
| "save_steps": 100, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": false | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 0.0, | |
| "train_batch_size": 4, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |