{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.00068, "eval_steps": 500, "global_step": 68, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 309.0, "completions/max_terminated_length": 309.0, "completions/mean_length": 160.90625, "completions/mean_terminated_length": 160.90625, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 8.38103711605072, "epoch": 1e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.0028616045601665974, "kl": 0.0, "learning_rate": 0.0, "loss": -0.0002, "num_tokens": 18085.0, "reward": 0.03125002980232239, "reward_std": 0.6862481236457825, "rewards/rollout_reward_func/mean": 0.03125002980232239, "rewards/rollout_reward_func/std": 1.011366844177246, "sampling/importance_sampling_ratio/max": 0.010557955130934715, "sampling/importance_sampling_ratio/mean": 0.003285687882453203, "sampling/importance_sampling_ratio/min": 7.365059625542847e-13, "sampling/sampling_logp_difference/max": 10.706700325012207, "sampling/sampling_logp_difference/mean": 1.4828054904937744, "step": 1, "step_time": 5.417499241000769 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 8.38103711605072, "epoch": 2e-05, "grad_norm": 0.002821348374709487, "kl": 0.0, "learning_rate": 2.8571428571428575e-07, "loss": -0.0002, "step": 2, "step_time": 2.1234856260016386 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/max_terminated_length": 297.0, "completions/mean_length": 80.875, "completions/mean_terminated_length": 80.875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.941936016082764, "epoch": 3e-05, "frac_reward_zero_std": 0.25, "grad_norm": 0.041607797145843506, "kl": 0.0004363941061455989, "learning_rate": 5.714285714285715e-07, "loss": 0.0023, "num_tokens": 33737.0, "reward": -0.13124999403953552, "reward_std": 0.7598094344139099, "rewards/rollout_reward_func/mean": -0.13124999403953552, "rewards/rollout_reward_func/std": 1.0014303922653198, "sampling/importance_sampling_ratio/max": 0.11680073291063309, "sampling/importance_sampling_ratio/mean": 0.033671747893095016, "sampling/importance_sampling_ratio/min": 1.0370337122367346e-06, "sampling/sampling_logp_difference/max": 4.6255574226379395, "sampling/sampling_logp_difference/mean": 1.339035987854004, "step": 3, "step_time": 3.9910208220007917 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 7.934386253356934, "epoch": 4e-05, "grad_norm": 0.04307129234075546, "kl": 0.0004773353211930953, "learning_rate": 8.571428571428572e-07, "loss": 0.0023, "step": 4, "step_time": 2.6708831029991416 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/max_terminated_length": 297.0, "completions/mean_length": 129.0625, "completions/mean_terminated_length": 129.0625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 8.268575668334961, "epoch": 5e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.03974386304616928, "kl": 0.0007551452181360219, "learning_rate": 1.142857142857143e-06, "loss": 0.0022, "num_tokens": 51251.0, "reward": -0.53125, "reward_std": 0.5484436750411987, "rewards/rollout_reward_func/mean": -0.53125, "rewards/rollout_reward_func/std": 0.8789427280426025, "sampling/importance_sampling_ratio/max": 0.11807744950056076, "sampling/importance_sampling_ratio/mean": 0.024541109800338745, "sampling/importance_sampling_ratio/min": 1.4743216955430405e-13, "sampling/sampling_logp_difference/max": 10.85576057434082, "sampling/sampling_logp_difference/mean": 1.503469467163086, "step": 5, "step_time": 4.814693420999902 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 8.273618221282959, "epoch": 6e-05, "grad_norm": 0.04039419814944267, "kl": 0.0003924804532289272, "learning_rate": 1.4285714285714286e-06, "loss": 0.0022, "step": 6, "step_time": 2.0722021359997598 }, { "clip_ratio/high_max": 0.010416666977107525, "clip_ratio/high_mean": 0.0052083334885537624, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0052083334885537624, "completions/clipped_ratio": 0.03125, "completions/max_length": 369.0, "completions/max_terminated_length": 369.0, "completions/mean_length": 159.3125, "completions/mean_terminated_length": 157.77418518066406, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 8.400971293449402, "epoch": 7e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.02198818139731884, "kl": 0.0007719468412688002, "learning_rate": 1.7142857142857145e-06, "loss": 0.0013, "num_tokens": 69453.0, "reward": -0.6624999642372131, "reward_std": 0.48922622203826904, "rewards/rollout_reward_func/mean": -0.6624999642372131, "rewards/rollout_reward_func/std": 0.8071255087852478, "sampling/importance_sampling_ratio/max": 0.10809381306171417, "sampling/importance_sampling_ratio/mean": 0.011609362438321114, "sampling/importance_sampling_ratio/min": 1.4364835228825295e-22, "sampling/sampling_logp_difference/max": 11.3631591796875, "sampling/sampling_logp_difference/mean": 1.7411997318267822, "step": 7, "step_time": 4.814280139999937 }, { "clip_ratio/high_max": 0.014583333861082792, "clip_ratio/high_mean": 0.007291666930541396, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.007291666930541396, "entropy": 8.400816917419434, "epoch": 8e-05, "grad_norm": 0.021912436932325363, "kl": 0.000808713368314784, "learning_rate": 2.0000000000000003e-06, "loss": 0.0013, "step": 8, "step_time": 2.084449717998723 }, { "clip_ratio/high_max": 0.0052083334885537624, "clip_ratio/high_mean": 0.0026041667442768812, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0026041667442768812, "completions/clipped_ratio": 0.0625, "completions/max_length": 394.0, "completions/max_terminated_length": 394.0, "completions/mean_length": 160.40625, "completions/mean_terminated_length": 156.1666717529297, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 8.220785737037659, "epoch": 9e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.00841094646602869, "kl": 0.0007819603888492566, "learning_rate": 2.285714285714286e-06, "loss": -0.0006, "num_tokens": 88018.0, "reward": -0.6656249761581421, "reward_std": 0.30112773180007935, "rewards/rollout_reward_func/mean": -0.6656249761581421, "rewards/rollout_reward_func/std": 0.8090652823448181, "sampling/importance_sampling_ratio/max": 0.10453330725431442, "sampling/importance_sampling_ratio/mean": 0.009016389958560467, "sampling/importance_sampling_ratio/min": 3.77039369235492e-14, "sampling/sampling_logp_difference/max": 11.177996635437012, "sampling/sampling_logp_difference/mean": 1.46906578540802, "step": 9, "step_time": 5.767028449001373 }, { "clip_ratio/high_max": 0.010416666977107525, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 8.221423089504242, "epoch": 0.0001, "grad_norm": 0.00875311903655529, "kl": 0.0009726146636239719, "learning_rate": 2.571428571428571e-06, "loss": -0.0006, "step": 10, "step_time": 3.8331103400005304 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 476.0, "completions/max_terminated_length": 476.0, "completions/mean_length": 146.875, "completions/mean_terminated_length": 146.875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 8.224214434623718, "epoch": 0.00011, "frac_reward_zero_std": 0.0, "grad_norm": 0.005308263469487429, "kl": 0.0006370486844389234, "learning_rate": 2.8571428571428573e-06, "loss": -0.0005, "num_tokens": 105118.0, "reward": -0.6531249284744263, "reward_std": 0.27538806200027466, "rewards/rollout_reward_func/mean": -0.6531249284744263, "rewards/rollout_reward_func/std": 0.7935158014297485, "sampling/importance_sampling_ratio/max": 0.09377396106719971, "sampling/importance_sampling_ratio/mean": 0.007022843696177006, "sampling/importance_sampling_ratio/min": 1.1420680792263728e-27, "sampling/sampling_logp_difference/max": 4.9811320304870605, "sampling/sampling_logp_difference/mean": 1.3924936056137085, "step": 11, "step_time": 5.839474645000337 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 8.226899027824402, "epoch": 0.00012, "grad_norm": 0.005312995053827763, "kl": 0.0006140958357718773, "learning_rate": 3.142857142857143e-06, "loss": -0.0005, "step": 12, "step_time": 2.6753236439999455 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 294.0, "completions/max_terminated_length": 294.0, "completions/mean_length": 188.25, "completions/mean_terminated_length": 188.25, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "entropy": 8.331215858459473, "epoch": 0.00013, "frac_reward_zero_std": 0.5, "grad_norm": 0.0016523015219718218, "kl": 0.0006200866155268159, "learning_rate": 3.428571428571429e-06, "loss": -0.0001, "num_tokens": 124622.0, "reward": -1.009374976158142, "reward_std": 0.0265165027230978, "rewards/rollout_reward_func/mean": -1.009374976158142, "rewards/rollout_reward_func/std": 0.03901509940624237, "sampling/importance_sampling_ratio/max": 0.0077222432009875774, "sampling/importance_sampling_ratio/mean": 0.0027147922664880753, "sampling/importance_sampling_ratio/min": 3.714021278022894e-14, "sampling/sampling_logp_difference/max": 11.583379745483398, "sampling/sampling_logp_difference/mean": 1.4247815608978271, "step": 13, "step_time": 4.811962196999048 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 8.332964897155762, "epoch": 0.00014, "grad_norm": 0.0017943360144272447, "kl": 0.0005818934332637582, "learning_rate": 3.7142857142857146e-06, "loss": -0.0001, "step": 14, "step_time": 2.0559995119983796 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 314.0, "completions/max_terminated_length": 314.0, "completions/mean_length": 134.34375, "completions/mean_terminated_length": 134.34375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 8.289089798927307, "epoch": 0.00015, "frac_reward_zero_std": 0.25, "grad_norm": 0.006275709718465805, "kl": 0.0006801459057896864, "learning_rate": 4.000000000000001e-06, "loss": -0.0006, "num_tokens": 142137.0, "reward": -1.0187499523162842, "reward_std": 0.04082316905260086, "rewards/rollout_reward_func/mean": -1.0187499523162842, "rewards/rollout_reward_func/std": 0.04709291458129883, "sampling/importance_sampling_ratio/max": 0.1032625362277031, "sampling/importance_sampling_ratio/mean": 0.010007976554334164, "sampling/importance_sampling_ratio/min": 1.6740179376029118e-07, "sampling/sampling_logp_difference/max": 4.918362617492676, "sampling/sampling_logp_difference/mean": 1.347560167312622, "step": 15, "step_time": 4.26574305899976 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 8.295971155166626, "epoch": 0.00016, "grad_norm": 0.006583907175809145, "kl": 0.0010946946458716411, "learning_rate": 4.2857142857142855e-06, "loss": -0.0006, "step": 16, "step_time": 3.0452185289996123 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 342.0, "completions/max_terminated_length": 342.0, "completions/mean_length": 176.59375, "completions/mean_terminated_length": 176.59375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 8.386258959770203, "epoch": 0.00017, "frac_reward_zero_std": 0.0, "grad_norm": 0.019049938768148422, "kl": 0.0009286534441343974, "learning_rate": 4.571428571428572e-06, "loss": 0.0008, "num_tokens": 161764.0, "reward": 0.34687501192092896, "reward_std": 0.3751429617404938, "rewards/rollout_reward_func/mean": 0.34687501192092896, "rewards/rollout_reward_func/std": 0.9510552287101746, "sampling/importance_sampling_ratio/max": 0.10295701771974564, "sampling/importance_sampling_ratio/mean": 0.00530514121055603, "sampling/importance_sampling_ratio/min": 1.1150266265858022e-09, "sampling/sampling_logp_difference/max": 8.94926643371582, "sampling/sampling_logp_difference/mean": 1.3967115879058838, "step": 17, "step_time": 4.39325438600099 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 8.376275658607483, "epoch": 0.00018, "grad_norm": 0.017463266849517822, "kl": 0.0013417988302535377, "learning_rate": 4.857142857142858e-06, "loss": 0.0007, "step": 18, "step_time": 2.0553056610006024 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 405.0, "completions/max_terminated_length": 405.0, "completions/mean_length": 163.53125, "completions/mean_terminated_length": 163.53125, "completions/min_length": 62.0, "completions/min_terminated_length": 62.0, "entropy": 8.383512496948242, "epoch": 0.00019, "frac_reward_zero_std": 0.0, "grad_norm": 0.0029027618002146482, "kl": 0.0011069331085309386, "learning_rate": 5.142857142857142e-06, "loss": -0.0001, "num_tokens": 180341.0, "reward": -0.840624988079071, "reward_std": 0.2795426845550537, "rewards/rollout_reward_func/mean": -0.840624988079071, "rewards/rollout_reward_func/std": 0.570715069770813, "sampling/importance_sampling_ratio/max": 0.009223885834217072, "sampling/importance_sampling_ratio/mean": 0.002876629587262869, "sampling/importance_sampling_ratio/min": 3.4141774769962514e-21, "sampling/sampling_logp_difference/max": 11.647392272949219, "sampling/sampling_logp_difference/mean": 1.74857759475708, "step": 19, "step_time": 5.028274022000915 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 8.370469331741333, "epoch": 0.0002, "grad_norm": 0.002040296094492078, "kl": 0.0010489341875654645, "learning_rate": 5.428571428571429e-06, "loss": -0.0001, "step": 20, "step_time": 2.1169578549997823 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 373.0, "completions/max_terminated_length": 373.0, "completions/mean_length": 155.15625, "completions/mean_terminated_length": 155.15625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 8.354791045188904, "epoch": 0.00021, "frac_reward_zero_std": 0.25, "grad_norm": 0.014582234434783459, "kl": 0.0016839846794027835, "learning_rate": 5.7142857142857145e-06, "loss": 0.0006, "num_tokens": 198770.0, "reward": 0.46875, "reward_std": 0.6740255355834961, "rewards/rollout_reward_func/mean": 0.46875, "rewards/rollout_reward_func/std": 0.9006941318511963, "sampling/importance_sampling_ratio/max": 0.08340345323085785, "sampling/importance_sampling_ratio/mean": 0.006231832783669233, "sampling/importance_sampling_ratio/min": 2.269645449359814e-08, "sampling/sampling_logp_difference/max": 2.1503682136535645, "sampling/sampling_logp_difference/mean": 1.321890115737915, "step": 21, "step_time": 4.95798639899931 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 8.337073802947998, "epoch": 0.00022, "grad_norm": 0.014086912386119366, "kl": 0.0020258916774764657, "learning_rate": 6e-06, "loss": 0.0006, "step": 22, "step_time": 2.598393530000976 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0020833334419876337, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0020833334419876337, "completions/clipped_ratio": 0.0, "completions/max_length": 417.0, "completions/max_terminated_length": 417.0, "completions/mean_length": 152.59375, "completions/mean_terminated_length": 152.59375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 8.294317483901978, "epoch": 0.00023, "frac_reward_zero_std": 0.25, "grad_norm": 0.019008534029126167, "kl": 0.004680573721998371, "learning_rate": 6.285714285714286e-06, "loss": 0.0008, "num_tokens": 217253.0, "reward": -0.53125, "reward_std": 0.3699263334274292, "rewards/rollout_reward_func/mean": -0.53125, "rewards/rollout_reward_func/std": 0.8785756230354309, "sampling/importance_sampling_ratio/max": 0.09325665980577469, "sampling/importance_sampling_ratio/mean": 0.009642375633120537, "sampling/importance_sampling_ratio/min": 1.0861621381728576e-20, "sampling/sampling_logp_difference/max": 10.062246322631836, "sampling/sampling_logp_difference/mean": 1.4383571147918701, "step": 23, "step_time": 4.488339367000663 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0020833334419876337, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0020833334419876337, "entropy": 8.280492901802063, "epoch": 0.00024, "grad_norm": 0.01694045588374138, "kl": 0.007116928434697911, "learning_rate": 6.571428571428572e-06, "loss": 0.0007, "step": 24, "step_time": 2.144616393000433 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 262.0, "completions/max_terminated_length": 262.0, "completions/mean_length": 141.1875, "completions/mean_terminated_length": 141.1875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 8.207048416137695, "epoch": 0.00025, "frac_reward_zero_std": 0.25, "grad_norm": 0.008826951496303082, "kl": 0.011807448376202956, "learning_rate": 6.857142857142858e-06, "loss": -0.0003, "num_tokens": 235259.0, "reward": -0.7625000476837158, "reward_std": 0.2917833626270294, "rewards/rollout_reward_func/mean": -0.7625000476837158, "rewards/rollout_reward_func/std": 0.6776382327079773, "sampling/importance_sampling_ratio/max": 0.06867893040180206, "sampling/importance_sampling_ratio/mean": 0.007195095531642437, "sampling/importance_sampling_ratio/min": 8.830933421045788e-15, "sampling/sampling_logp_difference/max": 4.469038486480713, "sampling/sampling_logp_difference/mean": 1.387784719467163, "step": 25, "step_time": 5.051108140999531 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 8.201621413230896, "epoch": 0.00026, "grad_norm": 0.009136058390140533, "kl": 0.014795138500630856, "learning_rate": 7.1428571428571436e-06, "loss": -0.0003, "step": 26, "step_time": 2.5399822110020978 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 290.0, "completions/max_terminated_length": 290.0, "completions/mean_length": 136.9375, "completions/mean_terminated_length": 136.9375, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "entropy": 8.213225603103638, "epoch": 0.00027, "frac_reward_zero_std": 0.5, "grad_norm": 0.002163404831662774, "kl": 0.0031390516232931986, "learning_rate": 7.428571428571429e-06, "loss": -0.0001, "num_tokens": 252577.0, "reward": -0.3937499523162842, "reward_std": 0.2508378326892853, "rewards/rollout_reward_func/mean": -0.3937499523162842, "rewards/rollout_reward_func/std": 0.9557761549949646, "sampling/importance_sampling_ratio/max": 0.010245459154248238, "sampling/importance_sampling_ratio/mean": 0.004281196743249893, "sampling/importance_sampling_ratio/min": 9.54090864979662e-07, "sampling/sampling_logp_difference/max": 3.967635154724121, "sampling/sampling_logp_difference/mean": 1.3100483417510986, "step": 27, "step_time": 6.014637065999523 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 8.203812718391418, "epoch": 0.00028, "grad_norm": 0.002143328543752432, "kl": 0.003412064048461616, "learning_rate": 7.714285714285716e-06, "loss": -0.0001, "step": 28, "step_time": 2.5550204580013087 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 402.0, "completions/max_terminated_length": 402.0, "completions/mean_length": 146.15625, "completions/mean_terminated_length": 146.15625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 8.294090986251831, "epoch": 0.00029, "frac_reward_zero_std": 0.25, "grad_norm": 0.014697319827973843, "kl": 0.025721593061462045, "learning_rate": 8.000000000000001e-06, "loss": -0.0002, "num_tokens": 270222.0, "reward": -0.703125, "reward_std": 0.2893909811973572, "rewards/rollout_reward_func/mean": -0.703125, "rewards/rollout_reward_func/std": 0.7459500432014465, "sampling/importance_sampling_ratio/max": 0.05455424264073372, "sampling/importance_sampling_ratio/mean": 0.006639046128839254, "sampling/importance_sampling_ratio/min": 5.018679106327676e-15, "sampling/sampling_logp_difference/max": 9.214568138122559, "sampling/sampling_logp_difference/mean": 1.469120979309082, "step": 29, "step_time": 5.175338321999334 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 8.281907558441162, "epoch": 0.0003, "grad_norm": 0.01356051117181778, "kl": 0.02538611611817032, "learning_rate": 8.285714285714287e-06, "loss": -0.0002, "step": 30, "step_time": 2.570307294000486 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 217.0, "completions/max_terminated_length": 217.0, "completions/mean_length": 122.40625, "completions/mean_terminated_length": 122.40625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.991218149662018, "epoch": 0.00031, "frac_reward_zero_std": 1.0, "grad_norm": 0.0034507170785218477, "kl": 0.012121076317271218, "learning_rate": 8.571428571428571e-06, "loss": 0.0, "num_tokens": 287003.0, "reward": -1.0, "reward_std": 0.0, "rewards/rollout_reward_func/mean": -1.0, "rewards/rollout_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 0.06423835456371307, "sampling/importance_sampling_ratio/mean": 0.006418607663363218, "sampling/importance_sampling_ratio/min": 2.267779519726787e-09, "sampling/sampling_logp_difference/max": 7.289473533630371, "sampling/sampling_logp_difference/mean": 1.3377501964569092, "step": 31, "step_time": 4.991982824997649 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 7.985842347145081, "epoch": 0.00032, "grad_norm": 0.0031677871011197567, "kl": 0.01152854437532369, "learning_rate": 8.857142857142858e-06, "loss": 0.0, "step": 32, "step_time": 2.551680210001905 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 290.0, "completions/max_terminated_length": 290.0, "completions/mean_length": 141.4375, "completions/mean_terminated_length": 141.4375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 8.251873970031738, "epoch": 0.00033, "frac_reward_zero_std": 0.0, "grad_norm": 0.011402878910303116, "kl": 0.015730057610198855, "learning_rate": 9.142857142857144e-06, "loss": -0.0004, "num_tokens": 305185.0, "reward": -0.3999999761581421, "reward_std": 0.5431233644485474, "rewards/rollout_reward_func/mean": -0.3999999761581421, "rewards/rollout_reward_func/std": 0.9466408491134644, "sampling/importance_sampling_ratio/max": 0.05344817042350769, "sampling/importance_sampling_ratio/mean": 0.005628373473882675, "sampling/importance_sampling_ratio/min": 1.8508519615559533e-23, "sampling/sampling_logp_difference/max": 13.587614059448242, "sampling/sampling_logp_difference/mean": 1.4650746583938599, "step": 33, "step_time": 6.090412677998756 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 8.246023058891296, "epoch": 0.00034, "grad_norm": 0.008096246048808098, "kl": 0.014423061889829114, "learning_rate": 9.42857142857143e-06, "loss": -0.0004, "step": 34, "step_time": 2.5453261089996886 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 132.625, "completions/mean_terminated_length": 132.625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 8.162899851799011, "epoch": 0.00035, "frac_reward_zero_std": 0.75, "grad_norm": 0.004175754263997078, "kl": 0.018283673271071166, "learning_rate": 9.714285714285715e-06, "loss": -0.0001, "num_tokens": 323325.0, "reward": -0.5718749761581421, "reward_std": 0.18343815207481384, "rewards/rollout_reward_func/mean": -0.5718749761581421, "rewards/rollout_reward_func/std": 0.8301706910133362, "sampling/importance_sampling_ratio/max": 0.062451351433992386, "sampling/importance_sampling_ratio/mean": 0.007372735999524593, "sampling/importance_sampling_ratio/min": 1.2278145562505762e-20, "sampling/sampling_logp_difference/max": 9.404362678527832, "sampling/sampling_logp_difference/mean": 1.5296763181686401, "step": 35, "step_time": 5.175936893000653 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 8.150159418582916, "epoch": 0.00036, "grad_norm": 0.004381407983601093, "kl": 0.017532640282297507, "learning_rate": 1e-05, "loss": -0.0001, "step": 36, "step_time": 2.551337270998374 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 350.0, "completions/max_terminated_length": 350.0, "completions/mean_length": 193.5625, "completions/mean_terminated_length": 193.5625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 8.269986033439636, "epoch": 0.00037, "frac_reward_zero_std": 0.0, "grad_norm": 0.008302879519760609, "kl": 0.009193298814352602, "learning_rate": 9.9999999995372e-06, "loss": 0.0005, "num_tokens": 342711.0, "reward": -0.03749999403953552, "reward_std": 0.7515531778335571, "rewards/rollout_reward_func/mean": -0.03749999403953552, "rewards/rollout_reward_func/std": 1.0247737169265747, "sampling/importance_sampling_ratio/max": 0.06419403851032257, "sampling/importance_sampling_ratio/mean": 0.003890752326697111, "sampling/importance_sampling_ratio/min": 1.164774526829504e-10, "sampling/sampling_logp_difference/max": 3.794468402862549, "sampling/sampling_logp_difference/mean": 1.2929356098175049, "step": 37, "step_time": 4.535253317000752 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 8.270979046821594, "epoch": 0.00038, "grad_norm": 0.00880998931825161, "kl": 0.007889840024290606, "learning_rate": 9.999999998148802e-06, "loss": 0.0005, "step": 38, "step_time": 2.492936824000026 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 347.0, "completions/max_terminated_length": 347.0, "completions/mean_length": 171.09375, "completions/mean_terminated_length": 172.93548583984375, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "entropy": 8.237572193145752, "epoch": 0.00039, "frac_reward_zero_std": 0.25, "grad_norm": 0.003999819979071617, "kl": 0.003984437556937337, "learning_rate": 9.999999995834804e-06, "loss": -0.0003, "num_tokens": 361754.0, "reward": -0.2750000059604645, "reward_std": 0.29003918170928955, "rewards/rollout_reward_func/mean": -0.2750000059604645, "rewards/rollout_reward_func/std": 0.9721940755844116, "sampling/importance_sampling_ratio/max": 0.00961074884980917, "sampling/importance_sampling_ratio/mean": 0.0027457564137876034, "sampling/importance_sampling_ratio/min": 1.3412947045145382e-17, "sampling/sampling_logp_difference/max": 10.32085132598877, "sampling/sampling_logp_difference/mean": 1.3388471603393555, "step": 39, "step_time": 4.8369908450004 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 8.241283535957336, "epoch": 0.0004, "grad_norm": 0.003973928280174732, "kl": 0.004160504468018189, "learning_rate": 9.999999992595207e-06, "loss": -0.0003, "step": 40, "step_time": 2.064312154999243 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 408.0, "completions/max_terminated_length": 408.0, "completions/mean_length": 149.6875, "completions/mean_terminated_length": 150.7096710205078, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "entropy": 8.331173419952393, "epoch": 0.00041, "frac_reward_zero_std": 0.25, "grad_norm": 0.0036302730441093445, "kl": 0.002889991897973232, "learning_rate": 9.999999988430008e-06, "loss": -0.0, "num_tokens": 380000.0, "reward": -0.40312498807907104, "reward_std": 0.5359131097793579, "rewards/rollout_reward_func/mean": -0.40312498807907104, "rewards/rollout_reward_func/std": 0.9361845850944519, "sampling/importance_sampling_ratio/max": 0.009129444137215614, "sampling/importance_sampling_ratio/mean": 0.003908202983438969, "sampling/importance_sampling_ratio/min": 3.258544557229945e-14, "sampling/sampling_logp_difference/max": 4.673151016235352, "sampling/sampling_logp_difference/mean": 1.426315426826477, "step": 41, "step_time": 4.397045022999009 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 8.317348182201385, "epoch": 0.00042, "grad_norm": 0.002291926182806492, "kl": 0.002586090617114678, "learning_rate": 9.999999983339212e-06, "loss": -0.0, "step": 42, "step_time": 2.1006949159991564 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 408.0, "completions/max_terminated_length": 408.0, "completions/mean_length": 168.1875, "completions/mean_terminated_length": 168.1875, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "entropy": 8.245356917381287, "epoch": 0.00043, "frac_reward_zero_std": 0.0, "grad_norm": 0.004080729093402624, "kl": 0.003670690639410168, "learning_rate": 9.999999977322818e-06, "loss": -0.0004, "num_tokens": 398190.0, "reward": -0.5843750238418579, "reward_std": 0.22265997529029846, "rewards/rollout_reward_func/mean": -0.5843750238418579, "rewards/rollout_reward_func/std": 0.8462857007980347, "sampling/importance_sampling_ratio/max": 0.010589290410280228, "sampling/importance_sampling_ratio/mean": 0.003948138561099768, "sampling/importance_sampling_ratio/min": 1.0654254561925924e-10, "sampling/sampling_logp_difference/max": 9.166659355163574, "sampling/sampling_logp_difference/mean": 1.3605971336364746, "step": 43, "step_time": 4.460575121997863 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 8.227819621562958, "epoch": 0.00044, "grad_norm": 0.004100060556083918, "kl": 0.003359506925335154, "learning_rate": 9.999999970380822e-06, "loss": -0.0004, "step": 44, "step_time": 3.0909940020010254 }, { "clip_ratio/high_max": 0.01785714365541935, "clip_ratio/high_mean": 0.008928571827709675, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.008928571827709675, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/max_terminated_length": 297.0, "completions/mean_length": 160.40625, "completions/mean_terminated_length": 160.40625, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "entropy": 8.268202066421509, "epoch": 0.00045, "frac_reward_zero_std": 0.25, "grad_norm": 0.0023337905295193195, "kl": 0.0028031190449837595, "learning_rate": 9.999999962513228e-06, "loss": -0.0001, "num_tokens": 416179.0, "reward": -0.5843750238418579, "reward_std": 0.20319493114948273, "rewards/rollout_reward_func/mean": -0.5843750238418579, "rewards/rollout_reward_func/std": 0.830510675907135, "sampling/importance_sampling_ratio/max": 0.010615027509629726, "sampling/importance_sampling_ratio/mean": 0.003981100395321846, "sampling/importance_sampling_ratio/min": 1.0911494689562484e-13, "sampling/sampling_logp_difference/max": 11.518604278564453, "sampling/sampling_logp_difference/mean": 1.4530669450759888, "step": 45, "step_time": 4.283326912999655 }, { "clip_ratio/high_max": 0.01785714365541935, "clip_ratio/high_mean": 0.008928571827709675, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.008928571827709675, "entropy": 8.239543914794922, "epoch": 0.00046, "grad_norm": 0.0035347214434295893, "kl": 0.0033496549731353298, "learning_rate": 9.999999953720035e-06, "loss": -0.0001, "step": 46, "step_time": 2.0328167930001655 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 346.0, "completions/max_terminated_length": 346.0, "completions/mean_length": 154.96875, "completions/mean_terminated_length": 154.96875, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 8.14276933670044, "epoch": 0.00047, "frac_reward_zero_std": 0.0, "grad_norm": 0.003259505843743682, "kl": 0.005610214080661535, "learning_rate": 9.99999994400124e-06, "loss": -0.0004, "num_tokens": 434762.0, "reward": -1.0218749046325684, "reward_std": 0.04966200143098831, "rewards/rollout_reward_func/mean": -1.0218749046325684, "rewards/rollout_reward_func/std": 0.0490843690931797, "sampling/importance_sampling_ratio/max": 0.011610783636569977, "sampling/importance_sampling_ratio/mean": 0.003400737652555108, "sampling/importance_sampling_ratio/min": 9.674043361674659e-17, "sampling/sampling_logp_difference/max": 10.514336585998535, "sampling/sampling_logp_difference/mean": 1.5065617561340332, "step": 47, "step_time": 4.387635872000828 }, { "clip_ratio/high_max": 0.004807692486792803, "clip_ratio/high_mean": 0.0024038462433964014, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0024038462433964014, "entropy": 8.125671744346619, "epoch": 0.00048, "grad_norm": 0.0028330760542303324, "kl": 0.005603832833003253, "learning_rate": 9.999999933356848e-06, "loss": -0.0004, "step": 48, "step_time": 2.031070074998752 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 265.0, "completions/max_terminated_length": 265.0, "completions/mean_length": 153.46875, "completions/mean_terminated_length": 154.61289978027344, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 8.048729181289673, "epoch": 0.00049, "frac_reward_zero_std": 0.25, "grad_norm": 0.005494742188602686, "kl": 0.03256297600455582, "learning_rate": 9.999999921786855e-06, "loss": -0.0001, "num_tokens": 453105.0, "reward": -0.078125, "reward_std": 0.5431658029556274, "rewards/rollout_reward_func/mean": -0.078125, "rewards/rollout_reward_func/std": 1.0044207572937012, "sampling/importance_sampling_ratio/max": 0.05582950636744499, "sampling/importance_sampling_ratio/mean": 0.004962150938808918, "sampling/importance_sampling_ratio/min": 2.203801160657881e-12, "sampling/sampling_logp_difference/max": 9.083192825317383, "sampling/sampling_logp_difference/mean": 1.358087182044983, "step": 49, "step_time": 4.150341610999931 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 8.026359498500824, "epoch": 0.0005, "grad_norm": 0.005025926977396011, "kl": 0.02888420899398625, "learning_rate": 9.999999909291265e-06, "loss": -0.0001, "step": 50, "step_time": 2.9815445680005723 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 290.0, "completions/max_terminated_length": 290.0, "completions/mean_length": 143.65625, "completions/mean_terminated_length": 143.65625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 8.10213428735733, "epoch": 0.00051, "frac_reward_zero_std": 0.5, "grad_norm": 0.001717552193440497, "kl": 0.011197627696674317, "learning_rate": 9.999999895870075e-06, "loss": -0.0002, "num_tokens": 470670.0, "reward": -1.015625, "reward_std": 0.036278266459703445, "rewards/rollout_reward_func/mean": -1.015625, "rewards/rollout_reward_func/std": 0.05148990824818611, "sampling/importance_sampling_ratio/max": 0.06585416942834854, "sampling/importance_sampling_ratio/mean": 0.006765150930732489, "sampling/importance_sampling_ratio/min": 1.496036702519632e-06, "sampling/sampling_logp_difference/max": 4.3300042152404785, "sampling/sampling_logp_difference/mean": 1.3094828128814697, "step": 51, "step_time": 4.126140448999649 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 8.08252465724945, "epoch": 0.00052, "grad_norm": 0.0015695166075602174, "kl": 0.0106147377518937, "learning_rate": 9.999999881523285e-06, "loss": -0.0002, "step": 52, "step_time": 2.0502745620005953 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 314.0, "completions/max_terminated_length": 314.0, "completions/mean_length": 173.5, "completions/mean_terminated_length": 175.29031372070312, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 7.971049547195435, "epoch": 0.00053, "frac_reward_zero_std": 0.25, "grad_norm": 0.002233212348073721, "kl": 0.004628648399375379, "learning_rate": 9.999999866250896e-06, "loss": -0.0001, "num_tokens": 489622.0, "reward": 0.04375000298023224, "reward_std": 0.6950876712799072, "rewards/rollout_reward_func/mean": 0.04375000298023224, "rewards/rollout_reward_func/std": 1.0162986516952515, "sampling/importance_sampling_ratio/max": 0.014256482943892479, "sampling/importance_sampling_ratio/mean": 0.0037497361190617085, "sampling/importance_sampling_ratio/min": 3.703341886623912e-13, "sampling/sampling_logp_difference/max": 10.441887855529785, "sampling/sampling_logp_difference/mean": 1.3701958656311035, "step": 53, "step_time": 4.342945265999333 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 7.953936517238617, "epoch": 0.00054, "grad_norm": 0.0023060261737555265, "kl": 0.004763354663737118, "learning_rate": 9.999999850052909e-06, "loss": -0.0001, "step": 54, "step_time": 2.074805829001889 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 469.0, "completions/max_terminated_length": 469.0, "completions/mean_length": 133.125, "completions/mean_terminated_length": 133.61289978027344, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.858851313591003, "epoch": 0.00055, "frac_reward_zero_std": 0.25, "grad_norm": 0.010865299962460995, "kl": 0.05283547495491803, "learning_rate": 9.99999983292932e-06, "loss": 0.0005, "num_tokens": 507130.0, "reward": -0.08750000596046448, "reward_std": 0.2218937873840332, "rewards/rollout_reward_func/mean": -0.08750000596046448, "rewards/rollout_reward_func/std": 1.0082721710205078, "sampling/importance_sampling_ratio/max": 0.0794573649764061, "sampling/importance_sampling_ratio/mean": 0.01186932623386383, "sampling/importance_sampling_ratio/min": 2.8194972302655857e-18, "sampling/sampling_logp_difference/max": 11.174360275268555, "sampling/sampling_logp_difference/mean": 1.4186177253723145, "step": 55, "step_time": 4.7503355889994054 }, { "clip_ratio/high_max": 0.005681818351149559, "clip_ratio/high_mean": 0.0028409091755747795, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0028409091755747795, "entropy": 7.84703141450882, "epoch": 0.00056, "grad_norm": 0.009876980446279049, "kl": 0.04835269978502765, "learning_rate": 9.999999814880132e-06, "loss": 0.0005, "step": 56, "step_time": 2.697479030000977 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 309.0, "completions/max_terminated_length": 309.0, "completions/mean_length": 161.03125, "completions/mean_terminated_length": 161.03125, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "entropy": 7.946120798587799, "epoch": 0.00057, "frac_reward_zero_std": 0.25, "grad_norm": 0.00416550925001502, "kl": 0.005225592787610367, "learning_rate": 9.999999795905347e-06, "loss": 0.0003, "num_tokens": 524811.0, "reward": -0.328125, "reward_std": 0.7972557544708252, "rewards/rollout_reward_func/mean": -0.328125, "rewards/rollout_reward_func/std": 0.9642762541770935, "sampling/importance_sampling_ratio/max": 0.011825657449662685, "sampling/importance_sampling_ratio/mean": 0.004940683953464031, "sampling/importance_sampling_ratio/min": 7.442185295759504e-14, "sampling/sampling_logp_difference/max": 10.764989852905273, "sampling/sampling_logp_difference/mean": 1.3635454177856445, "step": 57, "step_time": 4.399483632999363 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 7.954219579696655, "epoch": 0.00058, "grad_norm": 0.004001974128186703, "kl": 0.005270991328870878, "learning_rate": 9.999999776004962e-06, "loss": 0.0003, "step": 58, "step_time": 2.068970861998423 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 305.0, "completions/max_terminated_length": 305.0, "completions/mean_length": 137.78125, "completions/mean_terminated_length": 137.78125, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 7.7305819392204285, "epoch": 0.00059, "frac_reward_zero_std": 0.5, "grad_norm": 0.002240613801404834, "kl": 0.007570188608951867, "learning_rate": 9.999999755178978e-06, "loss": -0.0002, "num_tokens": 542324.0, "reward": -0.565625011920929, "reward_std": 0.18561552464962006, "rewards/rollout_reward_func/mean": -0.565625011920929, "rewards/rollout_reward_func/std": 0.841890811920166, "sampling/importance_sampling_ratio/max": 0.014670869335532188, "sampling/importance_sampling_ratio/mean": 0.006734498776495457, "sampling/importance_sampling_ratio/min": 9.527041698945138e-13, "sampling/sampling_logp_difference/max": 10.319395065307617, "sampling/sampling_logp_difference/mean": 1.346876621246338, "step": 59, "step_time": 4.076921171999857 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 7.7493292689323425, "epoch": 0.0006, "grad_norm": 0.0021323147229850292, "kl": 0.007373731641564518, "learning_rate": 9.999999733427394e-06, "loss": -0.0002, "step": 60, "step_time": 2.065178036998077 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 305.0, "completions/max_terminated_length": 305.0, "completions/mean_length": 131.15625, "completions/mean_terminated_length": 132.53334045410156, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 7.80867725610733, "epoch": 0.00061, "frac_reward_zero_std": 0.5, "grad_norm": 0.002022077329456806, "kl": 0.004840011039050296, "learning_rate": 9.99999971075021e-06, "loss": -0.0003, "num_tokens": 559385.0, "reward": -0.25312501192092896, "reward_std": 0.2761000692844391, "rewards/rollout_reward_func/mean": -0.25312501192092896, "rewards/rollout_reward_func/std": 0.9863533973693848, "sampling/importance_sampling_ratio/max": 0.011284389533102512, "sampling/importance_sampling_ratio/mean": 0.006653377786278725, "sampling/importance_sampling_ratio/min": 2.8632414147966578e-11, "sampling/sampling_logp_difference/max": 4.169203758239746, "sampling/sampling_logp_difference/mean": 1.2035651206970215, "step": 61, "step_time": 4.5231009249973795 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 7.829549431800842, "epoch": 0.00062, "grad_norm": 0.0023180257994681597, "kl": 0.004549442324787378, "learning_rate": 9.999999687147426e-06, "loss": -0.0003, "step": 62, "step_time": 2.04733120399942 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 212.0, "completions/max_terminated_length": 212.0, "completions/mean_length": 123.1875, "completions/mean_terminated_length": 123.1875, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 8.036745607852936, "epoch": 0.00063, "frac_reward_zero_std": 0.25, "grad_norm": 0.0024225530214607716, "kl": 0.007575233932584524, "learning_rate": 9.999999662619046e-06, "loss": -0.0004, "num_tokens": 577175.0, "reward": -0.6437500715255737, "reward_std": 0.2552982568740845, "rewards/rollout_reward_func/mean": -0.6437500715255737, "rewards/rollout_reward_func/std": 0.7873751521110535, "sampling/importance_sampling_ratio/max": 0.010909978300333023, "sampling/importance_sampling_ratio/mean": 0.005395432468503714, "sampling/importance_sampling_ratio/min": 2.1033281019655625e-11, "sampling/sampling_logp_difference/max": 10.216733932495117, "sampling/sampling_logp_difference/mean": 1.502685546875, "step": 63, "step_time": 3.9012105610008803 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 8.038543045520782, "epoch": 0.00064, "grad_norm": 0.0024051358923316, "kl": 0.007686805154662579, "learning_rate": 9.999999637165062e-06, "loss": -0.0004, "step": 64, "step_time": 2.0093538759983858 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 141.65625, "completions/mean_terminated_length": 141.65625, "completions/min_length": 62.0, "completions/min_terminated_length": 62.0, "entropy": 8.191402852535248, "epoch": 0.00065, "frac_reward_zero_std": 0.0, "grad_norm": 0.003754268866032362, "kl": 0.005535919044632465, "learning_rate": 9.999999610785483e-06, "loss": -0.0, "num_tokens": 594644.0, "reward": -0.33125001192092896, "reward_std": 0.4657542407512665, "rewards/rollout_reward_func/mean": -0.33125001192092896, "rewards/rollout_reward_func/std": 0.9663491249084473, "sampling/importance_sampling_ratio/max": 0.010239495895802975, "sampling/importance_sampling_ratio/mean": 0.004091148264706135, "sampling/importance_sampling_ratio/min": 1.6428119986328787e-13, "sampling/sampling_logp_difference/max": 11.840818405151367, "sampling/sampling_logp_difference/mean": 1.458545446395874, "step": 65, "step_time": 4.098211152998374 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 8.207817077636719, "epoch": 0.00066, "grad_norm": 0.0028874659910798073, "kl": 0.005369087448343635, "learning_rate": 9.999999583480304e-06, "loss": -0.0, "step": 66, "step_time": 2.03046784199978 }, { "clip_ratio/high_max": 0.005434782709926367, "clip_ratio/high_mean": 0.0027173913549631834, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0027173913549631834, "completions/clipped_ratio": 0.0, "completions/max_length": 286.0, "completions/max_terminated_length": 286.0, "completions/mean_length": 131.0625, "completions/mean_terminated_length": 131.0625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 8.138030529022217, "epoch": 0.00067, "frac_reward_zero_std": 0.25, "grad_norm": 0.008762937970459461, "kl": 0.03389387877541594, "learning_rate": 9.999999555249524e-06, "loss": 0.0004, "num_tokens": 612302.0, "reward": -0.32500001788139343, "reward_std": 0.44983184337615967, "rewards/rollout_reward_func/mean": -0.32500001788139343, "rewards/rollout_reward_func/std": 0.9615175724029541, "sampling/importance_sampling_ratio/max": 0.06631788611412048, "sampling/importance_sampling_ratio/mean": 0.009149353951215744, "sampling/importance_sampling_ratio/min": 2.947741995564272e-15, "sampling/sampling_logp_difference/max": 4.271122455596924, "sampling/sampling_logp_difference/mean": 1.307502269744873, "step": 67, "step_time": 4.921273821997602 }, { "clip_ratio/high_max": 0.005434782709926367, "clip_ratio/high_mean": 0.0027173913549631834, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0027173913549631834, "entropy": 8.132037341594696, "epoch": 0.00068, "grad_norm": 0.009227960370481014, "kl": 0.03418682742631063, "learning_rate": 9.999999526093148e-06, "loss": 0.0004, "step": 68, "step_time": 2.032772711999314 } ], "logging_steps": 1.0, "max_steps": 200000, "num_input_tokens_seen": 612302, "num_train_epochs": 2, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }